| ;; x86-64 instruction selection and CLIF-to-MachInst lowering. |
| |
| ;; The main lowering constructor term: takes a clif `Inst` and returns the |
| ;; register(s) within which the lowered instruction's result values live. |
| (decl partial lower (Inst) InstOutput) |
| |
| ;; A variant of the main lowering constructor term, used for branches. |
| ;; The only difference is that it gets an extra argument holding a vector |
| ;; of branch targets to be used. |
| (decl partial lower_branch (Inst MachLabelSlice) Unit) |
| |
| ;;;; Rules for `iconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; `i64` and smaller. |
| (rule (lower (has_type (fits_in_64 ty) |
| (iconst (u64_from_imm64 x)))) |
| (imm ty x)) |
| |
| ;; `i128` |
| (rule 1 (lower (has_type $I128 |
| (iconst (u64_from_imm64 x)))) |
| (value_regs (imm $I64 x) |
| (imm $I64 0))) |
| |
| ;;;; Rules for `f32const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (f32const (u32_from_ieee32 x))) |
| (imm $F32 x)) |
| |
| ;;;; Rules for `f64const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (f64const (u64_from_ieee64 x))) |
| (imm $F64 x)) |
| |
| ;;;; Rules for `null` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type ty (null))) |
| (imm ty 0)) |
| |
| ;;;; Rules for `iadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; `i64` and smaller. |
| |
| ;; Base case for 8 and 16-bit types |
| (rule -6 (lower (has_type (fits_in_16 ty) |
| (iadd x y))) |
| (x64_add ty x y)) |
| |
| ;; Base case for 32 and 64-bit types which might end up using the `lea` |
| ;; instruction to fold multiple operations into one. |
| ;; |
| ;; Note that at this time this always generates a `lea` pseudo-instruction, |
| ;; but the actual instruction emitted might be an `add` if it's equivalent. |
| ;; For more details on this see the `emit.rs` logic to emit |
| ;; `LoadEffectiveAddress`. |
| (rule -5 (lower (has_type (ty_32_or_64 ty) (iadd x y))) |
| (x64_lea ty (to_amode_add (mem_flags_trusted) x y (zero_offset)))) |
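| ;; As an illustrative sketch (register names are arbitrary, not emitted from |
| ;; this file literally): `(iadd x y)` at 32 or 64 bits can come out as |
| ;; `lea rax, [rdi + rsi]`, and a chain such as `(iadd (iadd x y) (iconst 8))` |
| ;; may fold into `lea rax, [rdi + rsi + 8]` via `to_amode_add`, or be emitted |
| ;; as a plain `add` when that is equivalent. |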
| |
| ;; Higher-priority cases than the previous two where a load can be sunk into |
| ;; the add instruction itself. Note that both operands are tested for |
| ;; sink-ability since addition is commutative. |

| (rule -4 (lower (has_type (fits_in_64 ty) |
| (iadd x (sinkable_load y)))) |
| (x64_add ty x y)) |
| (rule -3 (lower (has_type (fits_in_64 ty) |
| (iadd (sinkable_load x) y))) |
| (x64_add ty y x)) |
| |
| ;; SSE. |
| |
| (rule (lower (has_type (multi_lane 8 16) |
| (iadd x y))) |
| (x64_paddb x y)) |
| |
| (rule (lower (has_type (multi_lane 16 8) |
| (iadd x y))) |
| (x64_paddw x y)) |
| |
| (rule (lower (has_type (multi_lane 32 4) |
| (iadd x y))) |
| (x64_paddd x y)) |
| |
| (rule (lower (has_type (multi_lane 64 2) |
| (iadd x y))) |
| (x64_paddq x y)) |
| |
| ;; `i128` |
| (rule 1 (lower (has_type $I128 (iadd x y))) |
| ;; Get the high/low registers for `x`. |
| (let ((x_regs ValueRegs x) |
| (x_lo Gpr (value_regs_get_gpr x_regs 0)) |
| (x_hi Gpr (value_regs_get_gpr x_regs 1))) |
| ;; Get the high/low registers for `y`. |
| (let ((y_regs ValueRegs y) |
| (y_lo Gpr (value_regs_get_gpr y_regs 0)) |
| (y_hi Gpr (value_regs_get_gpr y_regs 1))) |
| ;; Do an add followed by an add-with-carry. |
| (with_flags (x64_add_with_flags_paired $I64 x_lo y_lo) |
| (x64_adc_paired $I64 x_hi y_hi))))) |
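| ;; A rough sketch of the sequence this produces (illustrative registers): |
| ;; |
| ;;   add  lo_dst, lo_src   ; low halves; sets CF on unsigned overflow |
| ;;   adc  hi_dst, hi_src   ; high halves plus the carry from the low add |
| ;; |
| ;; e.g. adding 1 to 0x0000000000000000_ffffffffffffffff: the low `add` wraps |
| ;; to 0 and sets CF, and the `adc` propagates that carry into the high half, |
| ;; giving 0x0000000000000001_0000000000000000. |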
| |
| ;;;; Helpers for `*_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (decl construct_overflow_op (CC ProducesFlags) InstOutput) |
| (rule (construct_overflow_op cc inst) |
| (let ((results ValueRegs (with_flags inst |
| (x64_setcc_paired cc)))) |
| (output_pair (value_regs_get results 0) |
| (value_regs_get results 1)))) |
| |
| (decl construct_overflow_op_alu (Type CC AluRmiROpcode Gpr GprMemImm) InstOutput) |
| (rule (construct_overflow_op_alu ty cc alu_op src1 src2) |
| (construct_overflow_op cc (x64_alurmi_with_flags_paired alu_op ty src1 src2))) |
| |
| ;; This essentially creates |
| ;; alu_<op1> x_lo, y_lo |
| ;; alu_<op2> x_hi, y_hi |
| ;; set<cc> r8 |
| (decl construct_overflow_op_alu_128 (CC AluRmiROpcode AluRmiROpcode Value Value) InstOutput) |
| (rule (construct_overflow_op_alu_128 cc op1 op2 x y) |
| ;; Get the high/low registers for `x`. |
| (let ((x_regs ValueRegs x) |
| (x_lo Gpr (value_regs_get_gpr x_regs 0)) |
| (x_hi Gpr (value_regs_get_gpr x_regs 1))) |
| ;; Get the high/low registers for `y`. |
| (let ((y_regs ValueRegs y) |
| (y_lo Gpr (value_regs_get_gpr y_regs 0)) |
| (y_hi Gpr (value_regs_get_gpr y_regs 1))) |
| (let ((lo_inst ProducesFlags (x64_alurmi_with_flags_paired op1 $I64 x_lo y_lo)) |
| (hi_inst ConsumesAndProducesFlags (x64_alurmi_with_flags_chained op2 $I64 x_hi y_hi)) |
| (of_inst ConsumesFlags (x64_setcc_paired cc)) |
| |
| (result MultiReg (with_flags_chained lo_inst hi_inst of_inst))) |
| (multi_reg_to_pair_and_single result))))) |
| |
| ;;;; Rules for `uadd_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule 1 (lower (uadd_overflow x y @ (value_type (fits_in_64 ty)))) |
| (construct_overflow_op_alu ty (CC.B) (AluRmiROpcode.Add) x y)) |
| |
| ;; The i128 case lowers to an `add` followed by an `adc`. |
| (rule 0 (lower (uadd_overflow x y @ (value_type $I128))) |
| (construct_overflow_op_alu_128 (CC.B) (AluRmiROpcode.Add) (AluRmiROpcode.Adc) x y)) |
| |
| ;;;; Rules for `sadd_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule 1 (lower (sadd_overflow x y @ (value_type (fits_in_64 ty)))) |
| (construct_overflow_op_alu ty (CC.O) (AluRmiROpcode.Add) x y)) |
| |
| (rule 0 (lower (sadd_overflow x y @ (value_type $I128))) |
| (construct_overflow_op_alu_128 (CC.O) (AluRmiROpcode.Add) (AluRmiROpcode.Adc) x y)) |
| |
| ;;;; Rules for `usub_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule 1 (lower (usub_overflow x y @ (value_type (fits_in_64 ty)))) |
| (construct_overflow_op_alu ty (CC.B) (AluRmiROpcode.Sub) x y)) |
| |
| (rule 0 (lower (usub_overflow x y @ (value_type $I128))) |
| (construct_overflow_op_alu_128 (CC.B) (AluRmiROpcode.Sub) (AluRmiROpcode.Sbb) x y)) |
| |
| ;;;; Rules for `ssub_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule 1 (lower (ssub_overflow x y @ (value_type (fits_in_64 ty)))) |
| (construct_overflow_op_alu ty (CC.O) (AluRmiROpcode.Sub) x y)) |
| |
| (rule 0 (lower (ssub_overflow x y @ (value_type $I128))) |
| (construct_overflow_op_alu_128 (CC.O) (AluRmiROpcode.Sub) (AluRmiROpcode.Sbb) x y)) |
| |
| ;;;; Rules for `umul_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule 2 (lower (umul_overflow x y @ (value_type (fits_in_64 ty)))) |
| (construct_overflow_op (CC.O) (x64_umullo_with_flags_paired ty x y))) |
| |
| ;;;; Rules for `smul_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule 2 (lower (smul_overflow x y @ (value_type (ty_int_ref_16_to_64 ty)))) |
| (construct_overflow_op_alu ty (CC.O) (AluRmiROpcode.Mul) x y)) |
| |
| ;; There is no 8-bit `imul` with an immediate operand, so the rhs must be put in a register or memory. |
| (rule 1 (lower (smul_overflow x y @ (value_type $I8))) |
| (construct_overflow_op (CC.O) (x64_alurmi_with_flags_paired (AluRmiROpcode.Mul) $I8 x (reg_mem_to_reg_mem_imm (put_in_reg_mem y))))) |
| |
| ;;;; Rules for `sadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type (multi_lane 8 16) |
| (sadd_sat x y))) |
| (x64_paddsb x y)) |
| |
| (rule (lower (has_type (multi_lane 16 8) |
| (sadd_sat x y))) |
| (x64_paddsw x y)) |
| |
| ;;;; Rules for `uadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type (multi_lane 8 16) |
| (uadd_sat x y))) |
| (x64_paddusb x y)) |
| |
| (rule (lower (has_type (multi_lane 16 8) |
| (uadd_sat x y))) |
| (x64_paddusw x y)) |
| |
| ;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; `i64` and smaller. |
| |
| ;; Sub two registers. |
| (rule -3 (lower (has_type (fits_in_64 ty) |
| (isub x y))) |
| (x64_sub ty x y)) |
| |
| ;; SSE. |
| |
| (rule (lower (has_type (multi_lane 8 16) |
| (isub x y))) |
| (x64_psubb x y)) |
| |
| (rule (lower (has_type (multi_lane 16 8) |
| (isub x y))) |
| (x64_psubw x y)) |
| |
| (rule (lower (has_type (multi_lane 32 4) |
| (isub x y))) |
| (x64_psubd x y)) |
| |
| (rule (lower (has_type (multi_lane 64 2) |
| (isub x y))) |
| (x64_psubq x y)) |
| |
| ;; `i128` |
| (rule 1 (lower (has_type $I128 (isub x y))) |
| ;; Get the high/low registers for `x`. |
| (let ((x_regs ValueRegs x) |
| (x_lo Gpr (value_regs_get_gpr x_regs 0)) |
| (x_hi Gpr (value_regs_get_gpr x_regs 1))) |
| ;; Get the high/low registers for `y`. |
| (let ((y_regs ValueRegs y) |
| (y_lo Gpr (value_regs_get_gpr y_regs 0)) |
| (y_hi Gpr (value_regs_get_gpr y_regs 1))) |
| ;; Do a sub followed by a sub-with-borrow. |
| (with_flags (x64_sub_with_flags_paired $I64 x_lo y_lo) |
| (x64_sbb_paired $I64 x_hi y_hi))))) |
| |
| ;;;; Rules for `ssub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type (multi_lane 8 16) |
| (ssub_sat x y))) |
| (x64_psubsb x y)) |
| |
| (rule (lower (has_type (multi_lane 16 8) |
| (ssub_sat x y))) |
| (x64_psubsw x y)) |
| |
| ;;;; Rules for `usub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type (multi_lane 8 16) |
| (usub_sat x y))) |
| (x64_psubusb x y)) |
| |
| (rule (lower (has_type (multi_lane 16 8) |
| (usub_sat x y))) |
| (x64_psubusw x y)) |
| |
| ;;;; Rules for `band` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; `{i,b}64` and smaller. |
| |
| ;; And two registers. |
| (rule 0 (lower (has_type ty (band x y))) |
| (if (ty_int_ref_scalar_64 ty)) |
| (x64_and ty x y)) |
| |
| ;; The above case automatically handles the rhs being an immediate or a |
| ;; sinkable load, but the lhs is additionally handled here. |
| |
| (rule 1 (lower (has_type ty (band (sinkable_load x) y))) |
| (if (ty_int_ref_scalar_64 ty)) |
| (x64_and ty y x)) |
| |
| (rule 2 (lower (has_type ty (band (simm32_from_value x) y))) |
| (if (ty_int_ref_scalar_64 ty)) |
| (x64_and ty y x)) |
| |
| ;; f32 and f64 |
| |
| (rule 5 (lower (has_type (ty_scalar_float ty) (band x y))) |
| (sse_and ty x y)) |
| |
| ;; SSE. |
| |
| (decl sse_and (Type Xmm XmmMem) Xmm) |
| (rule (sse_and $F32X4 x y) (x64_andps x y)) |
| (rule (sse_and $F64X2 x y) (x64_andpd x y)) |
| (rule (sse_and $F32 x y) (x64_andps x y)) |
| (rule (sse_and $F64 x y) (x64_andpd x y)) |
| (rule -1 (sse_and (multi_lane _bits _lanes) x y) (x64_pand x y)) |
| |
| (rule 6 (lower (has_type ty @ (multi_lane _bits _lanes) |
| (band x y))) |
| (sse_and ty x y)) |
| |
| ;; `i128`. |
| |
| (decl and_i128 (ValueRegs ValueRegs) ValueRegs) |
| (rule (and_i128 x y) |
| (let ((x_regs ValueRegs x) |
| (x_lo Gpr (value_regs_get_gpr x_regs 0)) |
| (x_hi Gpr (value_regs_get_gpr x_regs 1)) |
| (y_regs ValueRegs y) |
| (y_lo Gpr (value_regs_get_gpr y_regs 0)) |
| (y_hi Gpr (value_regs_get_gpr y_regs 1))) |
| (value_gprs (x64_and $I64 x_lo y_lo) |
| (x64_and $I64 x_hi y_hi)))) |
| |
| (rule 7 (lower (has_type $I128 (band x y))) |
| (and_i128 x y)) |
| |
| ;; Specialized lowerings for `(band x (bnot y))`, which is also what |
| ;; Cranelift's `band_not` instruction produces once it is legalized into the |
| ;; simpler forms early on. |
| |
| (decl sse_and_not (Type Xmm XmmMem) Xmm) |
| (rule (sse_and_not $F32X4 x y) (x64_andnps x y)) |
| (rule (sse_and_not $F64X2 x y) (x64_andnpd x y)) |
| (rule -1 (sse_and_not (multi_lane _bits _lanes) x y) (x64_pandn x y)) |
| |
| ;; Note the flipping of operands below as we're matching |
| ;; |
| ;; (band x (bnot y)) |
| ;; |
| ;; while x86 does |
| ;; |
| ;; pandn(x, y) = and(not(x), y) |
| (rule 8 (lower (has_type ty @ (multi_lane _bits _lane) (band x (bnot y)))) |
| (sse_and_not ty y x)) |
| (rule 9 (lower (has_type ty @ (multi_lane _bits _lane) (band (bnot y) x))) |
| (sse_and_not ty y x)) |
| |
| (rule 10 (lower (has_type ty (band x (bnot y)))) |
| (if (ty_int_ref_scalar_64 ty)) |
| (if-let $true (use_bmi1)) |
| ;; the first argument is the one that gets inverted with andn |
| (x64_andn ty y x)) |
| (rule 11 (lower (has_type ty (band (bnot y) x))) |
| (if (ty_int_ref_scalar_64 ty)) |
| (if-let $true (use_bmi1)) |
| (x64_andn ty y x)) |
| |
| ;; Specialization of `blsr` for BMI1 |
| |
| (decl pure partial val_minus_one (Value) Value) |
| (rule 0 (val_minus_one (isub x (u64_from_iconst 1))) x) |
| (rule 0 (val_minus_one (iadd x (i64_from_iconst -1))) x) |
| (rule 1 (val_minus_one (iadd (i64_from_iconst -1) x)) x) |
| |
| (rule 12 (lower (has_type (ty_32_or_64 ty) (band x y))) |
| (if-let $true (use_bmi1)) |
| (if-let x (val_minus_one y)) |
| (x64_blsr ty x)) |
| (rule 13 (lower (has_type (ty_32_or_64 ty) (band y x))) |
| (if-let $true (use_bmi1)) |
| (if-let x (val_minus_one y)) |
| (x64_blsr ty x)) |
| |
| ;; Specialization of `blsi` for BMI1 |
| |
| (rule 14 (lower (has_type (ty_32_or_64 ty) (band (ineg x) x))) |
| (if-let $true (use_bmi1)) |
| (x64_blsi ty x)) |
| (rule 15 (lower (has_type (ty_32_or_64 ty) (band x (ineg x)))) |
| (if-let $true (use_bmi1)) |
| (x64_blsi ty x)) |
| |
| ;; Specialization of `bzhi` for BMI2 |
| ;; |
| ;; The `bzhi` instruction zeroes all bits of the first operand from the bit |
| ;; position given by the second operand upwards. This is pattern-matched |
| ;; here with a `band` against a mask |
| ;; which is generated to be N bits large. Note that if the index is larger than |
| ;; the bit-width of the type then `bzhi` doesn't have the same semantics as |
| ;; `ishl`, so an `and` instruction is required to mask the index to match the |
| ;; semantics of Cranelift's `ishl`. |
| |
| (rule 16 (lower (has_type (ty_32_or_64 ty) (band x y))) |
| (if-let $true (use_bmi2)) |
| (if-let (ishl (u64_from_iconst 1) index) (val_minus_one y)) |
| (x64_bzhi ty x (x64_and ty index (RegMemImm.Imm (u32_sub (ty_bits ty) 1))))) |
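| ;; As a concrete (illustrative) example: `x & ((1 << n) - 1)` shows up in |
| ;; CLIF as `(band x (isub (ishl (iconst 1) n) (iconst 1)))`; `val_minus_one` |
| ;; strips the `- 1`, leaving the `(ishl (iconst 1) n)` matched above. With |
| ;; n = 8 this keeps the low 8 bits of `x`, while the extra `and` on the index |
| ;; makes an out-of-range n such as 40 on an `i32` behave like the masked |
| ;; `ishl` (40 & 31 = 8) rather than like `bzhi`'s copy-everything behaviour |
| ;; for over-large indices. |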
| |
| ;;;; Rules for `bor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; `{i,b}64` and smaller. |
| |
| ;; Or two registers. |
| (rule 0 (lower (has_type ty (bor x y))) |
| (if (ty_int_ref_scalar_64 ty)) |
| (x64_or ty x y)) |
| |
| ;; Handle immediates/sinkable loads on the lhs in addition to the automatic |
| ;; handling of the rhs above |
| |
| (rule 1 (lower (has_type ty (bor (sinkable_load x) y))) |
| (if (ty_int_ref_scalar_64 ty)) |
| (x64_or ty y x)) |
| |
| (rule 2 (lower (has_type ty (bor (simm32_from_value x) y))) |
| (if (ty_int_ref_scalar_64 ty)) |
| (x64_or ty y x)) |
| |
| ;; f32 and f64 |
| |
| (rule 5 (lower (has_type (ty_scalar_float ty) (bor x y))) |
| (sse_or ty x y)) |
| |
| ;; SSE. |
| |
| (decl sse_or (Type Xmm XmmMem) Xmm) |
| (rule (sse_or $F32X4 x y) (x64_orps x y)) |
| (rule (sse_or $F64X2 x y) (x64_orpd x y)) |
| (rule (sse_or $F32 x y) (x64_orps x y)) |
| (rule (sse_or $F64 x y) (x64_orpd x y)) |
| (rule -1 (sse_or (multi_lane _bits _lanes) x y) (x64_por x y)) |
| |
| (rule 6 (lower (has_type ty @ (multi_lane _bits _lanes) |
| (bor x y))) |
| (sse_or ty x y)) |
| |
| ;; `{i,b}128`. |
| |
| (decl or_i128 (ValueRegs ValueRegs) ValueRegs) |
| (rule (or_i128 x y) |
| (let ((x_lo Gpr (value_regs_get_gpr x 0)) |
| (x_hi Gpr (value_regs_get_gpr x 1)) |
| (y_lo Gpr (value_regs_get_gpr y 0)) |
| (y_hi Gpr (value_regs_get_gpr y 1))) |
| (value_gprs (x64_or $I64 x_lo y_lo) |
| (x64_or $I64 x_hi y_hi)))) |
| |
| (rule 7 (lower (has_type $I128 (bor x y))) |
| (or_i128 x y)) |
| |
| ;;;; Rules for `bxor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; `{i,b}64` and smaller. |
| |
| ;; Xor two registers. |
| (rule 0 (lower (has_type ty (bxor x y))) |
| (if (ty_int_ref_scalar_64 ty)) |
| (x64_xor ty x y)) |
| |
| ;; Handle xor with lhs immediates/sinkable loads in addition to the automatic |
| ;; handling of the rhs above. |
| |
| (rule 1 (lower (has_type ty (bxor (sinkable_load x) y))) |
| (if (ty_int_ref_scalar_64 ty)) |
| (x64_xor ty y x)) |
| |
| (rule 4 (lower (has_type ty (bxor (simm32_from_value x) y))) |
| (if (ty_int_ref_scalar_64 ty)) |
| (x64_xor ty y x)) |
| |
| ;; f32 and f64 |
| |
| (rule 5 (lower (has_type (ty_scalar_float ty) (bxor x y))) |
| (x64_xor_vector ty x y)) |
| |
| ;; SSE. |
| |
| (rule 6 (lower (has_type ty @ (multi_lane _bits _lanes) (bxor x y))) |
| (x64_xor_vector ty x y)) |
| |
| ;; `{i,b}128`. |
| |
| (rule 7 (lower (has_type $I128 (bxor x y))) |
| (let ((x_regs ValueRegs x) |
| (x_lo Gpr (value_regs_get_gpr x_regs 0)) |
| (x_hi Gpr (value_regs_get_gpr x_regs 1)) |
| (y_regs ValueRegs y) |
| (y_lo Gpr (value_regs_get_gpr y_regs 0)) |
| (y_hi Gpr (value_regs_get_gpr y_regs 1))) |
| (value_gprs (x64_xor $I64 x_lo y_lo) |
| (x64_xor $I64 x_hi y_hi)))) |
| |
| ;; Specialization of `blsmsk` for BMI1 |
| |
| (rule 8 (lower (has_type (ty_32_or_64 ty) (bxor x y))) |
| (if-let $true (use_bmi1)) |
| (if-let x (val_minus_one y)) |
| (x64_blsmsk ty x)) |
| (rule 9 (lower (has_type (ty_32_or_64 ty) (bxor y x))) |
| (if-let $true (use_bmi1)) |
| (if-let x (val_minus_one y)) |
| (x64_blsmsk ty x)) |
| |
| ;;;; Rules for `ishl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; `i64` and smaller. |
| |
| (rule -1 (lower (has_type (fits_in_64 ty) (ishl src amt))) |
| (x64_shl ty src (put_masked_in_imm8_gpr amt ty))) |
| |
| ;; `i128`. |
| |
| (decl shl_i128 (ValueRegs Gpr) ValueRegs) |
| (rule (shl_i128 src amt) |
| ;; Unpack the registers that make up the 128-bit value being shifted. |
| (let ((src_lo Gpr (value_regs_get_gpr src 0)) |
| (src_hi Gpr (value_regs_get_gpr src 1)) |
| ;; Do two 64-bit shifts. |
| (lo_shifted Gpr (x64_shl $I64 src_lo amt)) |
| (hi_shifted Gpr (x64_shl $I64 src_hi amt)) |
| ;; `src_lo >> (64 - amt)` are the bits to carry over from the lo |
| ;; into the hi. |
| (carry Gpr (x64_shr $I64 |
| src_lo |
| (x64_sub $I64 |
| (imm $I64 64) |
| amt))) |
| (zero Gpr (imm $I64 0)) |
| ;; Nullify the carry if we are shifting by a multiple of 128. |
| (carry_ Gpr (with_flags_reg (x64_test (OperandSize.Size64) |
| (RegMemImm.Imm 127) |
| amt) |
| (cmove $I64 |
| (CC.Z) |
| zero |
| carry))) |
| ;; Add the carry into the high half. |
| (hi_shifted_ Gpr (x64_or $I64 carry_ hi_shifted))) |
| ;; Combine the two shifted halves. However, if we are shifting by >= 64 |
| ;; (modulo 128), then the low bits are zero and the high bits are our |
| ;; shifted low bits. |
| (with_flags (x64_test (OperandSize.Size64) (RegMemImm.Imm 64) amt) |
| (consumes_flags_concat |
| (cmove $I64 (CC.Z) lo_shifted zero) |
| (cmove $I64 (CC.Z) hi_shifted_ lo_shifted))))) |
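| ;; A worked example of the cmove dance (purely illustrative): for amt = 70 |
| ;; the `test amt, 64` is non-zero, so the final cmoves pick lo = 0 and |
| ;; hi = lo_shifted = src_lo << 6, matching a 128-bit shift by 70. For |
| ;; amt = 0 the `test amt, 127` is zero, which nullifies the bogus carry |
| ;; (an x86 `shr` by 64 actually shifts by 0), and the cmoves keep |
| ;; lo_shifted/hi_shifted_, leaving the value unchanged. |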
| |
| (rule (lower (has_type $I128 (ishl src amt))) |
| ;; NB: Only the low bits of `amt` matter since we logically mask the shift |
| ;; amount to the value's bit width. |
| (let ((amt_ Gpr (lo_gpr amt))) |
| (shl_i128 src amt_))) |
| |
| ;; SSE. |
| |
| ;; Since the x86 instruction set does not have any 8x16 shift instructions (even |
| ;; in higher feature sets like AVX), we lower the `ishl.i8x16` to a sequence of |
| ;; instructions. The basic idea, whether the amount to shift by is an immediate |
| ;; or not, is to use a 16x8 shift and then mask off the incorrect bits to 0s. |
| (rule (lower (has_type ty @ $I8X16 (ishl src amt))) |
| (let ( |
| ;; Mask the amount to ensure wrapping behaviour |
| (masked_amt RegMemImm (mask_xmm_shift ty amt)) |
| ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be |
| ;; correct for half of the lanes; the others must be fixed up with |
| ;; the mask below. |
| (unmasked Xmm (x64_psllw src (mov_rmi_to_xmm masked_amt))) |
| (mask_addr SyntheticAmode (ishl_i8x16_mask masked_amt)) |
| (mask Reg (x64_load $I8X16 mask_addr (ExtKind.None)))) |
| (sse_and $I8X16 unmasked (RegMem.Reg mask)))) |
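| ;; For example, for a shift amount of 1 the 16x8 shift lets the top bit of |
| ;; each low byte spill into the bottom bit of its neighbouring high byte. |
| ;; Masking every byte with 0xFE then clears those spilled bits and turns the |
| ;; result into a true per-byte shift (the concrete mask contents come from |
| ;; the externally-defined helpers below; `0xFF << amt` per byte is the idea). |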
| |
| ;; Get the address of the mask to use when fixing up the lanes that weren't |
| ;; correctly generated by the 16x8 shift. |
| (decl ishl_i8x16_mask (RegMemImm) SyntheticAmode) |
| |
| ;; When the shift amount is known, we can statically (i.e. at compile time) |
| ;; determine the mask to use and only emit that. |
| (decl ishl_i8x16_mask_for_const (u32) SyntheticAmode) |
| (extern constructor ishl_i8x16_mask_for_const ishl_i8x16_mask_for_const) |
| (rule (ishl_i8x16_mask (RegMemImm.Imm amt)) |
| (ishl_i8x16_mask_for_const amt)) |
| |
| ;; Otherwise, we must emit the entire mask table and dynamically (i.e. at run |
| ;; time) find the correct mask offset in the table. We use `lea` to find the |
| ;; base address of the mask table and then complex addressing to offset to the |
| ;; right mask: `base_address + amt << 4` |
| (decl ishl_i8x16_mask_table () SyntheticAmode) |
| (extern constructor ishl_i8x16_mask_table ishl_i8x16_mask_table) |
| (rule (ishl_i8x16_mask (RegMemImm.Reg amt)) |
| (let ((mask_table SyntheticAmode (ishl_i8x16_mask_table)) |
| (base_mask_addr Gpr (x64_lea $I64 mask_table)) |
| (mask_offset Gpr (x64_shl $I64 amt |
| (imm8_to_imm8_gpr 4)))) |
| (Amode.ImmRegRegShift 0 |
| base_mask_addr |
| mask_offset |
| 0 |
| (mem_flags_trusted)))) |
| |
| (rule (ishl_i8x16_mask (RegMemImm.Mem amt)) |
| (ishl_i8x16_mask (RegMemImm.Reg (x64_load $I64 amt (ExtKind.None))))) |
| |
| ;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the shift amount is masked. |
| |
| (rule (lower (has_type ty @ $I16X8 (ishl src amt))) |
| (x64_psllw src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) |
| |
| (rule (lower (has_type ty @ $I32X4 (ishl src amt))) |
| (x64_pslld src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) |
| |
| (rule (lower (has_type ty @ $I64X2 (ishl src amt))) |
| (x64_psllq src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) |
| |
| ;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; `i64` and smaller. |
| |
| (rule -1 (lower (has_type (fits_in_64 ty) (ushr src amt))) |
| (let ((src_ Gpr (extend_to_gpr src ty (ExtendKind.Zero)))) |
| (x64_shr ty src_ (put_masked_in_imm8_gpr amt ty)))) |
| |
| ;; `i128`. |
| |
| (decl shr_i128 (ValueRegs Gpr) ValueRegs) |
| (rule (shr_i128 src amt) |
| ;; Unpack the lo/hi halves of `src`. |
| (let ((src_lo Gpr (value_regs_get_gpr src 0)) |
| (src_hi Gpr (value_regs_get_gpr src 1)) |
| ;; Do a shift on each half. |
| (lo_shifted Gpr (x64_shr $I64 src_lo amt)) |
| (hi_shifted Gpr (x64_shr $I64 src_hi amt)) |
| ;; `src_hi << (64 - amt)` are the bits to carry over from the hi |
| ;; into the lo. |
| (carry Gpr (x64_shl $I64 |
| src_hi |
| (x64_sub $I64 |
| (imm $I64 64) |
| amt))) |
| ;; Share the zero value to reduce register pressure |
| (zero Gpr (imm $I64 0)) |
| |
| ;; Nullify the carry if we are shifting by a multiple of 128. |
| (carry_ Gpr (with_flags_reg (x64_test (OperandSize.Size64) (RegMemImm.Imm 127) amt) |
| (cmove $I64 (CC.Z) zero carry))) |
| ;; Add the carry bits into the lo. |
| (lo_shifted_ Gpr (x64_or $I64 carry_ lo_shifted))) |
| ;; Combine the two shifted halves. However, if we are shifting by >= 64 |
| ;; (modulo 128), then the hi bits are zero and the lo bits are what |
| ;; would otherwise be our hi bits. |
| (with_flags (x64_test (OperandSize.Size64) (RegMemImm.Imm 64) amt) |
| (consumes_flags_concat |
| (cmove $I64 (CC.Z) lo_shifted_ hi_shifted) |
| (cmove $I64 (CC.Z) hi_shifted zero))))) |
| |
| (rule (lower (has_type $I128 (ushr src amt))) |
| ;; NB: Only the low bits of `amt` matter since we logically mask the shift |
| ;; amount to the value's bit width. |
| (let ((amt_ Gpr (lo_gpr amt))) |
| (shr_i128 src amt_))) |
| |
| ;; SSE. |
| |
| ;; There are no 8x16 shifts in x64. Do the same 16x8-shift-and-mask thing we do |
| ;; with 8x16 `ishl`. |
| (rule (lower (has_type ty @ $I8X16 (ushr src amt))) |
| (let ( |
| ;; Mask the amount to ensure wrapping behaviour |
| (masked_amt RegMemImm (mask_xmm_shift ty amt)) |
| ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be |
| ;; correct for half of the lanes; the others must be fixed up with |
| ;; the mask below. |
| (unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm masked_amt)))) |
| (sse_and $I8X16 |
| unmasked |
| (ushr_i8x16_mask masked_amt)))) |
| |
| ;; Get the address of the mask to use when fixing up the lanes that weren't |
| ;; correctly generated by the 16x8 shift. |
| (decl ushr_i8x16_mask (RegMemImm) SyntheticAmode) |
| |
| ;; When the shift amount is known, we can statically (i.e. at compile time) |
| ;; determine the mask to use and only emit that. |
| (decl ushr_i8x16_mask_for_const (u32) SyntheticAmode) |
| (extern constructor ushr_i8x16_mask_for_const ushr_i8x16_mask_for_const) |
| (rule (ushr_i8x16_mask (RegMemImm.Imm amt)) |
| (ushr_i8x16_mask_for_const amt)) |
| |
| ;; Otherwise, we must emit the entire mask table and dynamically (i.e. at run |
| ;; time) find the correct mask offset in the table. We use `lea` to find the |
| ;; base address of the mask table and then complex addressing to offset to the |
| ;; right mask: `base_address + amt << 4` |
| (decl ushr_i8x16_mask_table () SyntheticAmode) |
| (extern constructor ushr_i8x16_mask_table ushr_i8x16_mask_table) |
| (rule (ushr_i8x16_mask (RegMemImm.Reg amt)) |
| (let ((mask_table SyntheticAmode (ushr_i8x16_mask_table)) |
| (base_mask_addr Gpr (x64_lea $I64 mask_table)) |
| (mask_offset Gpr (x64_shl $I64 |
| amt |
| (imm8_to_imm8_gpr 4)))) |
| (Amode.ImmRegRegShift 0 |
| base_mask_addr |
| mask_offset |
| 0 |
| (mem_flags_trusted)))) |
| |
| (rule (ushr_i8x16_mask (RegMemImm.Mem amt)) |
| (ushr_i8x16_mask (RegMemImm.Reg (x64_load $I64 amt (ExtKind.None))))) |
| |
| ;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the shift amount is masked. |
| |
| (rule (lower (has_type ty @ $I16X8 (ushr src amt))) |
| (x64_psrlw src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) |
| |
| (rule (lower (has_type ty @ $I32X4 (ushr src amt))) |
| (x64_psrld src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) |
| |
| (rule (lower (has_type ty @ $I64X2 (ushr src amt))) |
| (x64_psrlq src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) |
| |
| (decl mask_xmm_shift (Type Value) RegMemImm) |
| (rule (mask_xmm_shift ty amt) |
| (gpr_to_reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) |
| (rule 1 (mask_xmm_shift ty (iconst n)) |
| (RegMemImm.Imm (shift_amount_masked ty n))) |
| |
| ;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; `i64` and smaller. |
| |
| (rule -1 (lower (has_type (fits_in_64 ty) (sshr src amt))) |
| (let ((src_ Gpr (extend_to_gpr src ty (ExtendKind.Sign)))) |
| (x64_sar ty src_ (put_masked_in_imm8_gpr amt ty)))) |
| |
| ;; `i128`. |
| |
| (decl sar_i128 (ValueRegs Gpr) ValueRegs) |
| (rule (sar_i128 src amt) |
| ;; Unpack the low/high halves of `src`. |
| (let ((src_lo Gpr (value_regs_get_gpr src 0)) |
| (src_hi Gpr (value_regs_get_gpr src 1)) |
| ;; Do a shift of each half. NB: the low half uses an unsigned shift |
| ;; because its MSB is not a sign bit. |
| (lo_shifted Gpr (x64_shr $I64 src_lo amt)) |
| (hi_shifted Gpr (x64_sar $I64 src_hi amt)) |
| ;; `src_hi << (64 - amt)` are the bits to carry over from the high |
| ;; half to the low half. |
| (carry Gpr (x64_shl $I64 |
| src_hi |
| (x64_sub $I64 |
| (imm $I64 64) |
| amt))) |
| ;; Nullify the carry if we are shifting by a multiple of 128. |
| (carry_ Gpr (with_flags_reg (x64_test (OperandSize.Size64) (RegMemImm.Imm 127) amt) |
| (cmove $I64 (CC.Z) (imm $I64 0) carry))) |
| ;; Add the carry into the low half. |
| (lo_shifted_ Gpr (x64_or $I64 lo_shifted carry_)) |
| ;; Get all sign bits. |
| (sign_bits Gpr (x64_sar $I64 src_hi (imm8_to_imm8_gpr 63)))) |
| ;; Combine the two shifted halves. However, if we are shifting by >= 64 |
| ;; (modulo 128), then the hi bits are all sign bits and the lo bits are |
| ;; what would otherwise be our hi bits. |
| (with_flags (x64_test (OperandSize.Size64) (RegMemImm.Imm 64) amt) |
| (consumes_flags_concat |
| (cmove $I64 (CC.Z) lo_shifted_ hi_shifted) |
| (cmove $I64 (CC.Z) hi_shifted sign_bits))))) |
| |
| (rule (lower (has_type $I128 (sshr src amt))) |
| ;; NB: Only the low bits of `amt` matter since we logically mask the shift |
| ;; amount to the value's bit width. |
| (let ((amt_ Gpr (lo_gpr amt))) |
| (sar_i128 src amt_))) |
| |
| ;; SSE. |
| |
| ;; Since the x86 instruction set does not have an 8x16 shift instruction and the |
| ;; approach used for `ishl` and `ushr` cannot be easily used (the masks do not |
| ;; preserve the sign), we use a different approach here: separate the low and |
| ;; high lanes, shift them separately, and merge them into the final result. |
| ;; |
| ;; Visually, this looks like the following, where |
| ;; `src.i8x16 = [s0, s1, ..., s15]`: |
| ;; |
| ;; lo.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)] |
| ;; shifted_lo.i16x8 = shift each lane of `low` |
| ;; hi.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)] |
| ;; shifted_hi.i16x8 = shift each lane of `high` |
| ;; result = [s0'', s1'', ..., s15''] |
| (rule (lower (has_type ty @ $I8X16 (sshr src amt @ (value_type amt_ty)))) |
| (let ((src_ Xmm (put_in_xmm src)) |
| ;; Mask the amount to ensure wrapping behaviour |
| (masked_amt RegMemImm (mask_xmm_shift ty amt)) |
| ;; In order for `packsswb` later to only use the high byte of each |
| ;; 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to |
| ;; fill in the upper bits appropriately. |
| (lo Xmm (x64_punpcklbw src_ src_)) |
| (hi Xmm (x64_punpckhbw src_ src_)) |
| (amt_ XmmMemImm (sshr_i8x16_bigger_shift amt_ty masked_amt)) |
| (shifted_lo Xmm (x64_psraw lo amt_)) |
| (shifted_hi Xmm (x64_psraw hi amt_))) |
| (x64_packsswb shifted_lo shifted_hi))) |
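| ;; A worked example for one byte (illustrative): with shift amount 2 the |
| ;; "bigger" shift is 10. A source byte 0x80 (-128) is duplicated by the |
| ;; unpack into the 16-bit lane 0x8080 (-32640); an arithmetic shift right by |
| ;; 10 gives 0xFFE0 (-32); `packsswb` then narrows that word back to the byte |
| ;; 0xE0 (-32), which is indeed -128 >> 2. |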
| |
| (decl sshr_i8x16_bigger_shift (Type RegMemImm) XmmMemImm) |
| (rule (sshr_i8x16_bigger_shift _ty (RegMemImm.Imm i)) |
| (xmm_mem_imm_new (RegMemImm.Imm (u32_add i 8)))) |
| (rule (sshr_i8x16_bigger_shift ty (RegMemImm.Reg r)) |
| (mov_rmi_to_xmm (RegMemImm.Reg (x64_add ty |
| r |
| (RegMemImm.Imm 8))))) |
| (rule (sshr_i8x16_bigger_shift ty rmi @ (RegMemImm.Mem _m)) |
| (mov_rmi_to_xmm (RegMemImm.Reg (x64_add ty |
| (imm ty 8) |
| rmi)))) |
| |
| ;; `sshr.{i16x8,i32x4}` can be a simple `psra{w,d}`; we just have to make sure |
| ;; that if the shift amount is in a register, it is in an XMM register. |
| |
| (rule (lower (has_type ty @ $I16X8 (sshr src amt))) |
| (x64_psraw src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) |
| |
| (rule (lower (has_type ty @ $I32X4 (sshr src amt))) |
| (x64_psrad src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) |
| |
| ;; The `sshr.i64x2` CLIF instruction has no single x86 instruction in the older |
| ;; feature sets. To remedy this, a small dance is done with an unsigned right |
| ;; shift plus some extra ops. |
| (rule 3 (lower (has_type ty @ $I64X2 (sshr src (iconst n)))) |
| (if-let $true (use_avx512vl)) |
| (if-let $true (use_avx512f)) |
| (x64_vpsraq_imm src (shift_amount_masked ty n))) |
| |
| (rule 2 (lower (has_type ty @ $I64X2 (sshr src amt))) |
| (if-let $true (use_avx512vl)) |
| (if-let $true (use_avx512f)) |
| (let ((masked Gpr (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) |
| (x64_vpsraq src (x64_movd_to_xmm masked)))) |
| |
| (rule 1 (lower (has_type $I64X2 (sshr src (iconst (u64_from_imm64 (u64_as_u32 amt)))))) |
| (lower_i64x2_sshr_imm src (u32_and amt 63))) |
| |
| (rule (lower (has_type $I64X2 (sshr src amt))) |
| (lower_i64x2_sshr_gpr src (x64_and $I64 amt (RegMemImm.Imm 63)))) |
| |
| (decl lower_i64x2_sshr_imm (Xmm u32) Xmm) |
| |
| ;; If the shift amount is less than 32 then do an sshr with 32-bit lanes to |
| ;; produce the upper halves of each result, followed by a ushr of 64-bit lanes |
| ;; to produce the lower halves of each result. Interleave results at the end. |
| (rule 2 (lower_i64x2_sshr_imm vec imm) |
| (if-let $true (u64_lt imm 32)) |
| (let ( |
| (high32 Xmm (x64_psrad vec (xmi_imm imm))) |
| (high32 Xmm (x64_pshufd high32 0b11_10_11_01)) |
| (low32 Xmm (x64_psrlq vec (xmi_imm imm))) |
| (low32 Xmm (x64_pshufd low32 0b11_10_10_00)) |
| ) |
| (x64_punpckldq low32 high32))) |
| |
| ;; If the shift amount is 32 then the `psrlq` from the above rule can be avoided |
| (rule 1 (lower_i64x2_sshr_imm vec 32) |
| (let ( |
| (low32 Xmm (x64_pshufd vec 0b11_10_11_01)) |
| (high32 Xmm (x64_psrad vec (xmi_imm 31))) |
| (high32 Xmm (x64_pshufd high32 0b11_10_11_01)) |
| ) |
| (x64_punpckldq low32 high32))) |
| |
| ;; Shifts greater than 32 use one `psrad` to generate the upper bits and a |
| ;; second `psrad` to generate the lower bits. Everything is then woven back |
| ;; together with shuffles. |
| (rule (lower_i64x2_sshr_imm vec imm) |
| (if-let $true (u64_lt 32 imm)) |
| (let ( |
| (high32 Xmm (x64_psrad vec (xmi_imm 31))) |
| (high32 Xmm (x64_pshufd high32 0b11_10_11_01)) |
| (low32 Xmm (x64_psrad vec (xmi_imm (u32_sub imm 32)))) |
| (low32 Xmm (x64_pshufd low32 0b11_10_11_01)) |
| ) |
| (x64_punpckldq low32 high32))) |
| |
| ;; A variable shift amount is slightly more complicated than the immediate |
| ;; shift amounts from above. The `Gpr` argument is guaranteed to be <= 63 by |
| ;; earlier masking. A `ushr` operation is used with some xor/sub math to |
| ;; generate the sign bits. |
| (decl lower_i64x2_sshr_gpr (Xmm Gpr) Xmm) |
| (rule (lower_i64x2_sshr_gpr vec val) |
| (let ( |
| (val Xmm (x64_movq_to_xmm val)) |
| (mask Xmm (flip_high_bit_mask $I64X2)) |
| (sign_bit_loc Xmm (x64_psrlq mask val)) |
| (ushr Xmm (x64_psrlq vec val)) |
| (ushr_sign_bit_flip Xmm (x64_pxor sign_bit_loc ushr)) |
| ) |
| (x64_psubq ushr_sign_bit_flip sign_bit_loc))) |
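| ;; For reference, the identity this implements (a standard sign-extension |
| ;; trick) is, with `m = 0x8000_0000_0000_0000 >> n` using a logical shift: |
| ;; |
| ;;   sshr(x, n) == ((x >> n) ^ m) - m      ;; `>>` logical here |
| ;; |
| ;; For non-negative `x` the xor sets bit `63 - n` and the subtraction clears |
| ;; it again; for negative `x` the xor clears it and the subtraction borrows |
| ;; through all higher bits, filling them with ones. |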
| |
| ;;;; Rules for `rotl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; `i64` and smaller: we can rely on x86's rotate-amount masking since we |
| ;; operate on the whole register. For constant amounts we mask the constant. |
| |
| (rule -1 (lower (has_type (fits_in_64 ty) (rotl src amt))) |
| (x64_rotl ty src (put_masked_in_imm8_gpr amt ty))) |
| |
| |
| ;; `i128`. |
| |
| (rule (lower (has_type $I128 (rotl src amt))) |
| (let ((src_ ValueRegs src) |
| ;; NB: Only the low bits of `amt` matter since we logically mask the |
| ;; rotation amount to the value's bit width. |
| (amt_ Gpr (lo_gpr amt))) |
| (or_i128 (shl_i128 src_ amt_) |
| (shr_i128 src_ (x64_sub $I64 |
| (imm $I64 128) |
| amt_))))) |
| |
| ;;;; Rules for `rotr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; `i64` and smaller: we can rely on x86's rotate-amount masking since we |
| ;; operate on the whole register. For constant amounts we mask the constant. |
| |
| (rule -1 (lower (has_type (fits_in_64 ty) (rotr src amt))) |
| (x64_rotr ty src (put_masked_in_imm8_gpr amt ty))) |
| |
| |
| ;; `i128`. |
| |
| (rule (lower (has_type $I128 (rotr src amt))) |
| (let ((src_ ValueRegs src) |
| ;; NB: Only the low bits of `amt` matter since we logically mask the |
| ;; rotation amount to the value's bit width. |
| (amt_ Gpr (lo_gpr amt))) |
| (or_i128 (shr_i128 src_ amt_) |
| (shl_i128 src_ (x64_sub $I64 |
| (imm $I64 128) |
| amt_))))) |
| |
| ;;;; Rules for `ineg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; `i64` and smaller. |
| |
| (rule -1 (lower (has_type (fits_in_64 ty) (ineg x))) |
| (x64_neg ty x)) |
| |
| (rule -2 (lower (has_type $I128 (ineg x))) |
| ;; Get the high/low registers for `x`. |
| (let ((regs ValueRegs x) |
| (lo Gpr (value_regs_get_gpr regs 0)) |
| (hi Gpr (value_regs_get_gpr regs 1))) |
| ;; Do a neg followed by a sub-with-borrow. |
| (with_flags (x64_neg_paired $I64 lo) |
| (x64_sbb_paired $I64 (imm $I64 0) hi)))) |
| |
| ;; SSE. |
| |
| (rule (lower (has_type $I8X16 (ineg x))) |
| (x64_psubb (imm $I8X16 0) x)) |
| |
| (rule (lower (has_type $I16X8 (ineg x))) |
| (x64_psubw (imm $I16X8 0) x)) |
| |
| (rule (lower (has_type $I32X4 (ineg x))) |
| (x64_psubd (imm $I32X4 0) x)) |
| |
| (rule (lower (has_type $I64X2 (ineg x))) |
| (x64_psubq (imm $I64X2 0) x)) |
| |
| ;;;; Rules for `avg_round` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type (multi_lane 8 16) |
| (avg_round x y))) |
| (x64_pavgb x y)) |
| |
| (rule (lower (has_type (multi_lane 16 8) |
| (avg_round x y))) |
| (x64_pavgw x y)) |
| |
| ;;;; Rules for `imul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; `i64` and smaller. |
| |
| ;; Multiply two registers. |
| (rule -5 (lower (has_type (fits_in_64 ty) (imul x y))) |
| (x64_mul ty x y)) |
| |
| ;; Handle multiplication where the lhs is an immediate or sinkable load in |
| ;; addition to the automatic rhs handling above. |
| |
| (rule -4 (lower (has_type (fits_in_64 ty) |
| (imul (simm32_from_value x) y))) |
| (x64_mul ty y x)) |
| (rule -3 (lower (has_type (fits_in_64 ty) |
| (imul (sinkable_load x) y))) |
| (x64_mul ty y x)) |
| |
| ;; `i128`. |
| |
| ;; mul: |
| ;; dst_lo = lhs_lo * rhs_lo |
| ;; dst_hi = umulhi(lhs_lo, rhs_lo) + |
| ;; lhs_lo * rhs_hi + |
| ;; lhs_hi * rhs_lo |
| ;; |
| ;; so we emit: |
| ;; lo_hi = mul x_lo, y_hi |
| ;; hi_lo = mul x_hi, y_lo |
| ;; hilo_hilo = add lo_hi, hi_lo |
| ;; dst_lo:hi_lolo = mulhi_u x_lo, y_lo |
| ;; dst_hi = add hilo_hilo, hi_lolo |
| ;; return (dst_lo, dst_hi) |
| (rule 2 (lower (has_type $I128 (imul x y))) |
| ;; Put `x` into registers and unpack its hi/lo halves. |
| (let ((x_regs ValueRegs x) |
| (x_lo Gpr (value_regs_get_gpr x_regs 0)) |
| (x_hi Gpr (value_regs_get_gpr x_regs 1)) |
| ;; Put `y` into registers and unpack its hi/lo halves. |
| (y_regs ValueRegs y) |
| (y_lo Gpr (value_regs_get_gpr y_regs 0)) |
| (y_hi Gpr (value_regs_get_gpr y_regs 1)) |
| ;; lo_hi = mul x_lo, y_hi |
| (lo_hi Gpr (x64_mul $I64 x_lo y_hi)) |
| ;; hi_lo = mul x_hi, y_lo |
| (hi_lo Gpr (x64_mul $I64 x_hi y_lo)) |
| ;; hilo_hilo = add lo_hi, hi_lo |
| (hilo_hilo Gpr (x64_add $I64 lo_hi hi_lo)) |
| ;; dst_lo:hi_lolo = mulhi_u x_lo, y_lo |
| (mul_regs ValueRegs (mulhi_u $I64 x_lo y_lo)) |
| (dst_lo Gpr (value_regs_get_gpr mul_regs 0)) |
| (hi_lolo Gpr (value_regs_get_gpr mul_regs 1)) |
| ;; dst_hi = add hilo_hilo, hi_lolo |
| (dst_hi Gpr (x64_add $I64 hilo_hilo hi_lolo))) |
| (value_gprs dst_lo dst_hi))) |
| |
| ;; SSE. |
| |
| ;; (No i8x16 multiply.) |
| |
| (rule (lower (has_type (multi_lane 16 8) (imul x y))) |
| (x64_pmullw x y)) |
| |
| (rule (lower (has_type (multi_lane 32 4) (imul x y))) |
| (if-let $true (use_sse41)) |
| (x64_pmulld x y)) |
| |
| ;; Without `pmulld`, the `pmuludq` instruction is used instead, which |
| ;; performs a 32-bit multiplication and stores the full 64-bit result. Each |
| ;; 64-bit result is truncated to 32 bits and everything is then woven into |
| ;; place. |
| (rule -1 (lower (has_type (multi_lane 32 4) (imul x y))) |
| (let ( |
| (x Xmm x) |
| (y Xmm y) |
| (x_hi Xmm (x64_pshufd x 0b00_11_00_01)) |
| (y_hi Xmm (x64_pshufd y 0b00_11_00_01)) |
| (mul_lo Xmm (x64_pshufd (x64_pmuludq x y) 0b00_00_10_00)) |
| (mul_hi Xmm (x64_pshufd (x64_pmuludq x_hi y_hi) 0b00_00_10_00)) |
| ) |
| (x64_punpckldq mul_lo mul_hi))) |
| |
| ;; With AVX-512 we can implement `i64x2` multiplication with a single |
| ;; instruction. |
| (rule 3 (lower (has_type (multi_lane 64 2) (imul x y))) |
| (if-let $true (use_avx512vl)) |
| (if-let $true (use_avx512dq)) |
| (x64_vpmullq x y)) |
| |
| ;; Otherwise, for i64x2 multiplication we describe a lane A as being composed of |
| ;; a 32-bit upper half "Ah" and a 32-bit lower half "Al". The 32-bit long hand |
| ;; multiplication can then be written as: |
| ;; |
| ;; Ah Al |
| ;; * Bh Bl |
| ;; ----- |
| ;; Al * Bl |
| ;; + (Ah * Bl) << 32 |
| ;; + (Al * Bh) << 32 |
| ;; |
| ;; So for each lane we will compute: |
| ;; |
| ;; A * B = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32 |
| ;; |
| ;; Note, the algorithm will use `pmuludq` which operates directly on the lower |
| ;; 32-bit (`Al` or `Bl`) of a lane and writes the result to the full 64-bits of |
| ;; the lane of the destination. For this reason we don't need shifts to isolate |
| ;; the lower 32-bits, however, we will need to use shifts to isolate the high |
| ;; 32-bits when doing calculations, i.e., `Ah == A >> 32`. |
| (rule (lower (has_type (multi_lane 64 2) |
| (imul a b))) |
| (let ((a0 Xmm a) |
| (b0 Xmm b) |
| ;; a_hi = A >> 32 |
| (a_hi Xmm (x64_psrlq a0 (xmi_imm 32))) |
| ;; ah_bl = Ah * Bl |
| (ah_bl Xmm (x64_pmuludq a_hi b0)) |
| ;; b_hi = B >> 32 |
| (b_hi Xmm (x64_psrlq b0 (xmi_imm 32))) |
| ;; al_bh = Al * Bh |
| (al_bh Xmm (x64_pmuludq a0 b_hi)) |
| ;; aa_bb = ah_bl + al_bh |
| (aa_bb Xmm (x64_paddq ah_bl al_bh)) |
| ;; aa_bb_shifted = aa_bb << 32 |
| (aa_bb_shifted Xmm (x64_psllq aa_bb (xmi_imm 32))) |
| ;; al_bl = Al * Bl |
| (al_bl Xmm (x64_pmuludq a0 b0))) |
| ;; al_bl + aa_bb_shifted |
| (x64_paddq al_bl aa_bb_shifted))) |
| |
| ;; Special case for `i32x4.extmul_high_i16x8_s`. |
| (rule 1 (lower (has_type (multi_lane 32 4) |
| (imul (swiden_high (and (value_type (multi_lane 16 8)) |
| x)) |
| (swiden_high (and (value_type (multi_lane 16 8)) |
| y))))) |
| (let ((x2 Xmm x) |
| (y2 Xmm y) |
| (lo Xmm (x64_pmullw x2 y2)) |
| (hi Xmm (x64_pmulhw x2 y2))) |
| (x64_punpckhwd lo hi))) |
| |
| ;; Special case for `i64x2.extmul_high_i32x4_s`. |
| (rule 1 (lower (has_type (multi_lane 64 2) |
| (imul (swiden_high (and (value_type (multi_lane 32 4)) |
| x)) |
| (swiden_high (and (value_type (multi_lane 32 4)) |
| y))))) |
| (if-let $true (use_sse41)) |
| (let ((x2 Xmm (x64_pshufd x 0xFA)) |
| (y2 Xmm (x64_pshufd y 0xFA))) |
| (x64_pmuldq x2 y2))) |
| |
| ;; Special case for `i32x4.extmul_low_i16x8_s`. |
| (rule 1 (lower (has_type (multi_lane 32 4) |
| (imul (swiden_low (and (value_type (multi_lane 16 8)) |
| x)) |
| (swiden_low (and (value_type (multi_lane 16 8)) |
| y))))) |
| (let ((x2 Xmm x) |
| (y2 Xmm y) |
| (lo Xmm (x64_pmullw x2 y2)) |
| (hi Xmm (x64_pmulhw x2 y2))) |
| (x64_punpcklwd lo hi))) |
| |
| ;; Special case for `i64x2.extmul_low_i32x4_s`. |
| (rule 1 (lower (has_type (multi_lane 64 2) |
| (imul (swiden_low (and (value_type (multi_lane 32 4)) |
| x)) |
| (swiden_low (and (value_type (multi_lane 32 4)) |
| y))))) |
| (if-let $true (use_sse41)) |
| (let ((x2 Xmm (x64_pshufd x 0x50)) |
| (y2 Xmm (x64_pshufd y 0x50))) |
| (x64_pmuldq x2 y2))) |
| |
| ;; Special case for `i32x4.extmul_high_i16x8_u`. |
| (rule 1 (lower (has_type (multi_lane 32 4) |
| (imul (uwiden_high (and (value_type (multi_lane 16 8)) |
| x)) |
| (uwiden_high (and (value_type (multi_lane 16 8)) |
| y))))) |
| (let ((x2 Xmm x) |
| (y2 Xmm y) |
| (lo Xmm (x64_pmullw x2 y2)) |
| (hi Xmm (x64_pmulhuw x2 y2))) |
| (x64_punpckhwd lo hi))) |
| |
| ;; Special case for `i64x2.extmul_high_i32x4_u`. |
| (rule 1 (lower (has_type (multi_lane 64 2) |
| (imul (uwiden_high (and (value_type (multi_lane 32 4)) |
| x)) |
| (uwiden_high (and (value_type (multi_lane 32 4)) |
| y))))) |
| (let ((x2 Xmm (x64_pshufd x 0xFA)) |
| (y2 Xmm (x64_pshufd y 0xFA))) |
| (x64_pmuludq x2 y2))) |
| |
| ;; Special case for `i32x4.extmul_low_i16x8_u`. |
| (rule 1 (lower (has_type (multi_lane 32 4) |
| (imul (uwiden_low (and (value_type (multi_lane 16 8)) |
| x)) |
| (uwiden_low (and (value_type (multi_lane 16 8)) |
| y))))) |
| (let ((x2 Xmm x) |
| (y2 Xmm y) |
| (lo Xmm (x64_pmullw x2 y2)) |
| (hi Xmm (x64_pmulhuw x2 y2))) |
| (x64_punpcklwd lo hi))) |
| |
| ;; Special case for `i64x2.extmul_low_i32x4_u`. |
| (rule 1 (lower (has_type (multi_lane 64 2) |
| (imul (uwiden_low (and (value_type (multi_lane 32 4)) |
| x)) |
| (uwiden_low (and (value_type (multi_lane 32 4)) |
| y))))) |
| (let ((x2 Xmm (x64_pshufd x 0x50)) |
| (y2 Xmm (x64_pshufd y 0x50))) |
| (x64_pmuludq x2 y2))) |
| |
| ;;;; Rules for `iabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule 1 (lower (has_type $I8X16 (iabs x))) |
| (if-let $true (use_ssse3)) |
| (x64_pabsb x)) |
| |
| ;; Note that using `pminub` with signed inputs produces the positive signed |
| ;; result, which is what is desired here. The single-instruction `pabsb` |
| ;; lowering above requires SSSE3, so this is the fallback for older CPUs. |
| (rule (lower (has_type $I8X16 (iabs x))) |
| (let ( |
| (x Xmm x) |
| (negated Xmm (x64_psubb (xmm_zero $I8X16) x)) |
| ) |
| (x64_pminub x negated))) |
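| ;; For example, a lane holding -3 is 0xFD (253 unsigned); its negation is |
| ;; 0x03, and the unsigned minimum of 253 and 3 is 3 = |-3|. A lane holding 5 |
| ;; stays 5 since 5 < 251 (the unsigned view of -5), and -128 negates back to |
| ;; itself, giving the usual wrapping result for `iabs` of the minimum value. |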
| |
| (rule 1 (lower (has_type $I16X8 (iabs x))) |
| (if-let $true (use_ssse3)) |
| (x64_pabsw x)) |
| |
| (rule (lower (has_type $I16X8 (iabs x))) |
| (let ( |
| (x Xmm x) |
| (negated Xmm (x64_psubw (xmm_zero $I16X8) x)) |
| ) |
| (x64_pmaxsw x negated))) |
| |
| (rule 1 (lower (has_type $I32X4 (iabs x))) |
| (if-let $true (use_ssse3)) |
| (x64_pabsd x)) |
| |
| ;; Generate a `negative_mask` which is numerically either -1 or 0 depending |
| ;; on whether the lane is negative. If the lane is positive then the xor |
| ;; operation won't change the lane, but otherwise it bit-flips everything. |
| ;; Subtracting the mask then subtracts 0 for positive lanes (doing nothing) |
| ;; or effectively adds one for negative lanes. This means that for a negative |
| ;; lane `x` the result is `!x + 1`, which is the negation of `x`. |
| (rule (lower (has_type $I32X4 (iabs x))) |
| (let ( |
| (x Xmm x) |
| (negative_mask Xmm (x64_psrad x (xmi_imm 31))) |
| (flipped_if_negative Xmm (x64_pxor x negative_mask)) |
| ) |
| (x64_psubd flipped_if_negative negative_mask))) |
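| ;; Worked example: for a lane x = -5 the `psrad` by 31 gives a mask of -1, |
| ;; x ^ -1 = 4, and 4 - (-1) = 5. For x = 5 the mask is 0, so the xor and |
| ;; subtract leave the lane untouched. |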
| |
| ;; When AVX512 is available, we can use a single `vpabsq` instruction. |
| (rule 2 (lower (has_type $I64X2 (iabs x))) |
| (if-let $true (use_avx512vl)) |
| (if-let $true (use_avx512f)) |
| (x64_vpabsq x)) |
| |
| ;; Otherwise, we use a separate register, `neg`, to contain the result of |
| ;; `0 - x` and then use `blendvpd` with `neg` as the mask: lanes where the |
| ;; MSB of `neg` is set (i.e. `neg` is negative, meaning `x` was originally |
| ;; positive) keep the original `x`, while the remaining lanes take `neg`. |
| (rule 1 (lower (has_type $I64X2 (iabs x))) |
| (if-let $true (use_sse41)) |
| (let ((rx Xmm x) |
| (neg Xmm (x64_psubq (imm $I64X2 0) rx))) |
| (x64_blendvpd neg rx neg))) |
| |
| ;; And if `blendvpd` isn't available then perform a shift/shuffle to generate |
| ;; a mask of which lanes are negative, followed by an xor/sub with that mask |
| ;; to make the negative lanes positive. |
| (rule (lower (has_type $I64X2 (iabs x))) |
| (let ((x Xmm x) |
| (signs Xmm (x64_psrad x (RegMemImm.Imm 31))) |
| (signs Xmm (x64_pshufd signs 0b11_11_01_01)) |
| (xor_if_negative Xmm (x64_pxor x signs))) |
| (x64_psubq xor_if_negative signs))) |
| |
| ;; `i64` and smaller. |
| |
| (rule -1 (lower (has_type (fits_in_64 ty) (iabs x))) |
| (let ((src Gpr x) |
| (neg ProducesFlags (x64_neg_paired ty src)) |
| ;; Manually extract the result from the neg, then ignore |
| ;; it below, since we need to pass it into the cmove |
| ;; before we pass the cmove to with_flags_reg. |
| (neg_result Gpr (produces_flags_get_reg neg)) |
| ;; When the neg instruction sets the sign flag, |
| ;; take the original (non-negative) value. |
| (cmove ConsumesFlags (cmove ty (CC.S) src neg_result))) |
| (with_flags_reg (produces_flags_ignore neg) cmove))) |
| |
| ;;;; Rules for `fabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type $F32 (fabs x))) |
| (x64_andps x (imm $F32 0x7fffffff))) |
| |
| (rule (lower (has_type $F64 (fabs x))) |
| (x64_andpd x (imm $F64 0x7fffffffffffffff))) |
| |
| ;; Special case for `f32x4.abs`. |
| (rule (lower (has_type $F32X4 (fabs x))) |
| (x64_andps x |
| (x64_psrld (vector_all_ones) (xmi_imm 1)))) |
| |
| ;; Special case for `f64x2.abs`. |
| (rule (lower (has_type $F64X2 (fabs x))) |
| (x64_andpd x |
| (x64_psrlq (vector_all_ones) (xmi_imm 1)))) |
| |
| ;;;; Rules for `fneg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type $F32 (fneg x))) |
| (x64_xorps x (imm $F32 0x80000000))) |
| |
| (rule (lower (has_type $F64 (fneg x))) |
| (x64_xorpd x (imm $F64 0x8000000000000000))) |
| |
| (rule (lower (has_type $F32X4 (fneg x))) |
| (x64_xorps x |
| (x64_pslld (vector_all_ones) (xmi_imm 31)))) |
| |
| (rule (lower (has_type $F64X2 (fneg x))) |
| (x64_xorpd x |
| (x64_psllq (vector_all_ones) (xmi_imm 63)))) |
| |
| ;;;; Rules for `bmask` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (decl lower_bmask (Type Type ValueRegs) ValueRegs) |
| |
| ;; Values that fit in a register |
| ;; |
| ;; Use the neg instruction on the input, which sets the CF (carry) flag |
| ;; to 0 if the input is zero, or to 1 otherwise. |
| ;; We then subtract the output register from itself, which always gives 0 on |
| ;; its own, but borrowing the carry flag from the previous negate produces -1 |
| ;; if the input was nonzero. |
| ;; |
| ;; neg in_reg |
| ;; sbb out_reg, out_reg |
| (rule 0 |
| (lower_bmask (fits_in_64 out_ty) (fits_in_64 in_ty) val) |
| (let ((reg Gpr (value_regs_get_gpr val 0)) |
| (out ValueRegs (with_flags |
| (x64_neg_paired in_ty reg) |
| (x64_sbb_paired out_ty reg reg)))) |
| ;; Extract only the output of the sbb instruction |
| (value_reg (value_regs_get out 1)))) |
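| ;; For example (with an arbitrary register r): an input of 0 leaves CF clear |
| ;; after the `neg`, so `sbb r, r` computes 0 - 0 - 0 = 0; a nonzero input |
| ;; such as 7 sets CF, so `sbb r, r` computes r - r - 1 = -1, i.e. all ones. |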
| |
| |
| ;; If the input type is I128 we can `or` the registers, and recurse to the general case. |
| (rule 1 |
| (lower_bmask (fits_in_64 out_ty) $I128 val) |
| (let ((lo Gpr (value_regs_get_gpr val 0)) |
| (hi Gpr (value_regs_get_gpr val 1)) |
| (mixed Gpr (x64_or $I64 lo hi))) |
| (lower_bmask out_ty $I64 (value_reg mixed)))) |
| |
| ;; If the output type is I128 we just duplicate the result of the I64 lowering |
| (rule 2 |
| (lower_bmask $I128 in_ty val) |
| (let ((res ValueRegs (lower_bmask $I64 in_ty val)) |
| (res Gpr (value_regs_get_gpr res 0))) |
| (value_regs res res))) |
| |
| |
| ;; Call the lower_bmask rule that does all the processing. |
| (rule (lower (has_type out_ty (bmask x @ (value_type in_ty)))) |
| (lower_bmask out_ty in_ty x)) |
| |
| ;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; `i64` and smaller. |
| |
| (rule -2 (lower (has_type ty (bnot x))) |
| (if (ty_int_ref_scalar_64 ty)) |
| (x64_not ty x)) |
| |
| |
| ;; `i128`. |
| |
| (decl i128_not (Value) ValueRegs) |
| (rule (i128_not x) |
| (let ((x_regs ValueRegs x) |
| (x_lo Gpr (value_regs_get_gpr x_regs 0)) |
| (x_hi Gpr (value_regs_get_gpr x_regs 1))) |
| (value_gprs (x64_not $I64 x_lo) |
| (x64_not $I64 x_hi)))) |
| |
| (rule (lower (has_type $I128 (bnot x))) |
| (i128_not x)) |
| |
| ;; f32 and f64 |
| |
| (rule -3 (lower (has_type (ty_scalar_float ty) (bnot x))) |
| (x64_xor_vector ty x (vector_all_ones))) |
| |
| ;; Special case for vector-types where bit-negation is an xor against an |
| ;; all-one value |
| (rule -1 (lower (has_type ty @ (multi_lane _bits _lanes) (bnot x))) |
| (x64_xor_vector ty x (vector_all_ones))) |
| |
| ;;;; Rules for `bitselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type ty @ (multi_lane _bits _lanes) |
| (bitselect condition |
| if_true |
| if_false))) |
| ;; a = and if_true, condition |
| ;; b = and_not condition, if_false |
| ;; or b, a |
| (let ((cond_xmm Xmm condition) |
| (a Xmm (sse_and ty if_true cond_xmm)) |
| (b Xmm (sse_and_not ty cond_xmm if_false))) |
| (sse_or ty b a))) |
| |
| ;; If every byte of the condition is guaranteed to be all ones or all zeroes, |
| ;; we can use x64_blend. |
| (rule 1 (lower (has_type ty @ (multi_lane _bits _lanes) |
| (bitselect condition |
| if_true |
| if_false))) |
| (if-let $true (use_sse41)) |
| (if (all_ones_or_all_zeros condition)) |
| (x64_pblendvb if_false if_true condition)) |
| |
| (decl pure partial all_ones_or_all_zeros (Value) bool) |
| (rule (all_ones_or_all_zeros (and (icmp _ _ _) (value_type (multi_lane _ _)))) $true) |
| (rule (all_ones_or_all_zeros (and (fcmp _ _ _) (value_type (multi_lane _ _)))) $true) |
| (rule (all_ones_or_all_zeros (vconst (vconst_all_ones_or_all_zeros))) $true) |
| |
| (decl pure vconst_all_ones_or_all_zeros () Constant) |
| (extern extractor vconst_all_ones_or_all_zeros vconst_all_ones_or_all_zeros) |
| |
| ;; Specializations for floating-point compares to generate a `minp*` or a |
| ;; `maxp*` instruction. These are equivalent to the wasm `f32x4.{pmin,pmax}` |
| ;; instructions and how they're lowered into CLIF. Note the careful ordering |
| ;; of all the operands here to ensure that the input CLIF matched is implemented |
| ;; by the corresponding x64 instruction. |
| (rule 2 (lower (has_type $F32X4 (bitselect (bitcast _ (fcmp (FloatCC.LessThan) x y)) x y))) |
| (x64_minps x y)) |
| (rule 2 (lower (has_type $F64X2 (bitselect (bitcast _ (fcmp (FloatCC.LessThan) x y)) x y))) |
| (x64_minpd x y)) |
| |
| (rule 3 (lower (has_type $F32X4 (bitselect (bitcast _ (fcmp (FloatCC.LessThan) y x)) x y))) |
| (x64_maxps x y)) |
| (rule 3 (lower (has_type $F64X2 (bitselect (bitcast _ (fcmp (FloatCC.LessThan) y x)) x y))) |
| (x64_maxpd x y)) |
| |
| ;; Scalar rules |
| |
| (rule 3 (lower (has_type $I128 (bitselect c t f))) |
| (let ((a ValueRegs (and_i128 c t)) |
| (b ValueRegs (and_i128 (i128_not c) f))) |
| (or_i128 a b))) |
| |
| (rule 4 (lower (has_type (ty_int_ref_scalar_64 ty) (bitselect c t f))) |
| (let ((a Gpr (x64_and ty c t)) |
| (b Gpr (x64_and ty (x64_not ty c) f))) |
| (x64_or ty a b))) |
| |
| (rule 5 (lower (has_type (ty_scalar_float ty) (bitselect c t f))) |
| (let ((a Xmm (sse_and ty c t)) |
| (c_neg Xmm (x64_xor_vector ty c (vector_all_ones))) |
| (b Xmm (sse_and ty c_neg f))) |
| (sse_or ty a b))) |
| |
| ;;;; Rules for `x86_blendv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type $I8X16 |
| (x86_blendv condition if_true if_false))) |
| (if-let $true (use_sse41)) |
| (x64_pblendvb if_false if_true condition)) |
| |
| (rule (lower (has_type $I32X4 |
| (x86_blendv condition if_true if_false))) |
| (if-let $true (use_sse41)) |
| (x64_blendvps if_false if_true condition)) |
| |
| (rule (lower (has_type $I64X2 |
| (x86_blendv condition if_true if_false))) |
| (if-let $true (use_sse41)) |
| (x64_blendvpd if_false if_true condition)) |
| |
| ;;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (insertlane vec @ (value_type ty) val (u8_from_uimm8 idx))) |
| (vec_insert_lane ty vec val idx)) |
| |
| ;; Helper used by the `insertlane` lowering above and by other lowerings |
| ;; elsewhere in this file. |
| ;; |
| ;; Note that the `Type` used here is the type of vector the insertion is |
| ;; happening into, or the type of the first `Reg` argument. |
| (decl vec_insert_lane (Type Xmm RegMem u8) Xmm) |
| |
| ;; i8x16.replace_lane |
| (rule 1 (vec_insert_lane $I8X16 vec val idx) |
| (if-let $true (use_sse41)) |
| (x64_pinsrb vec val idx)) |
| |
| ;; This lowering is particularly unoptimized and is mostly here to work |
| ;; rather than to be fast; requiring SSE 4.1 for the lowering above isn't |
| ;; much of a restriction given how old that instruction set is, so this is |
| ;; the "simplest" version that works on plain SSE2. |
| ;; |
| ;; This lowering masks the original vector with a constant with all 1s except |
| ;; for the "hole" where this value will get placed into, meaning the desired |
| ;; lane is guaranteed as all 0s. Next the `val` is shuffled into this hole with |
| ;; a few operations: |
| ;; |
| ;; 1. The `val` is zero-extended to 32-bits to guarantee the lower 32-bits |
| ;; are all defined. |
| ;; 2. A left shift by `(n & 3) * 8` bits, derived from the low two bits of |
| ;; the desired lane `n`, moves the value into the right byte position within |
| ;; the 32-bit register value. |
| ;; 3. The 32-bit register is moved with `movd` into an XMM register |
| ;; 4. The XMM register, where all lanes are 0 except for the first lane which |
| ;; has the shifted value, is then shuffled with `pshufd` to move the |
| ;; shifted value to the correct and final lane. This uses the upper two |
| ;; bits of `n` to index the i32x4 lane that we're targeting. |
| ;; |
| ;; This all, laboriously, gets the `val` into the desired lane so it's then |
| ;; `por`'d with the original vec-with-a-hole to produce the final result of the |
| ;; insertion. |
| (rule (vec_insert_lane $I8X16 vec val n) |
| (let ((vec_with_hole Xmm (x64_pand vec (insert_i8x16_lane_hole n))) |
| (val Gpr (x64_movzx (ExtMode.BL) val)) |
| (val Gpr (x64_shl $I32 val (Imm8Reg.Imm8 (u8_shl (u8_and n 3) 3)))) |
| (val Xmm (x64_movd_to_xmm val)) |
| (val_at_hole Xmm (x64_pshufd val (insert_i8x16_lane_pshufd_imm (u8_shr n 2))))) |
| (x64_por vec_with_hole val_at_hole))) |
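| ;; A worked example for lane n = 6 (purely illustrative): the hole mask |
| ;; zeroes byte 6 of `vec`; `val` is zero-extended and shifted left by |
| ;; (6 & 3) * 8 = 16 bits so it sits in byte 2 of the 32-bit value; `movd` |
| ;; places that value in the low dword of an XMM register; and the `pshufd` |
| ;; immediate for 6 >> 2 = 1 routes that dword into dword 1 (zeroes |
| ;; elsewhere), landing the byte at position 6, ready to be `por`'d in. |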
| |
| (decl insert_i8x16_lane_hole (u8) VCodeConstant) |
| (extern constructor insert_i8x16_lane_hole insert_i8x16_lane_hole) |
| (decl insert_i8x16_lane_pshufd_imm (u8) u8) |
| (rule (insert_i8x16_lane_pshufd_imm 0) 0b01_01_01_00) |
| (rule (insert_i8x16_lane_pshufd_imm 1) 0b01_01_00_01) |
| (rule (insert_i8x16_lane_pshufd_imm 2) 0b01_00_01_01) |
| (rule (insert_i8x16_lane_pshufd_imm 3) 0b00_01_01_01) |
| |
| ;; i16x8.replace_lane |
| (rule (vec_insert_lane $I16X8 vec val idx) |
| (x64_pinsrw vec val idx)) |
| |
| ;; i32x4.replace_lane |
| (rule 1 (vec_insert_lane $I32X4 vec val idx) |
| (if-let $true (use_sse41)) |
| (x64_pinsrd vec val idx)) |
| |
| (rule (vec_insert_lane $I32X4 vec val 0) |
| (x64_movss_regmove vec (x64_movd_to_xmm val))) |
| |
| ;; tmp = [ vec[1] vec[0] val[1] val[0] ] |
| ;; result = [ vec[3] vec[2] tmp[0] tmp[2] ] |
| (rule (vec_insert_lane $I32X4 vec val 1) |
| (let ((val Xmm (x64_movd_to_xmm val)) |
| (vec Xmm vec)) |
| (x64_shufps (x64_punpcklqdq val vec) vec 0b11_10_00_10))) |
| |
| ;; tmp = [ vec[0] vec[3] val[0] val[0] ] |
| ;; result = [ tmp[2] tmp[0] vec[1] vec[0] ] |
| (rule (vec_insert_lane $I32X4 vec val 2) |
| (let ((val Xmm (x64_movd_to_xmm val)) |
| (vec Xmm vec)) |
| (x64_shufps vec (x64_shufps val vec 0b00_11_00_00) 0b10_00_01_00))) |
| |
| ;; tmp = [ vec[3] vec[2] val[1] val[0] ] |
| ;; result = [ tmp[0] tmp[2] vec[1] vec[0] ] |
| (rule (vec_insert_lane $I32X4 vec val 3) |
| (let ((val Xmm (x64_movd_to_xmm val)) |
| (vec Xmm vec)) |
| (x64_shufps vec (x64_shufps val vec 0b11_10_01_00) 0b00_10_01_00))) |
| |
| ;; i64x2.replace_lane |
| (rule 1 (vec_insert_lane $I64X2 vec val idx) |
| (if-let $true (use_sse41)) |
| (x64_pinsrq vec val idx)) |
| (rule (vec_insert_lane $I64X2 vec val 0) |
| (x64_movsd_regmove vec (x64_movq_to_xmm val))) |
| (rule (vec_insert_lane $I64X2 vec val 1) |
| (x64_punpcklqdq vec (x64_movq_to_xmm val))) |
| |
| ;; f32x4.replace_lane |
| (rule 1 (vec_insert_lane $F32X4 vec val idx) |
| (if-let $true (use_sse41)) |
| (x64_insertps vec val (sse_insertps_lane_imm idx))) |
| |
| ;; f32x4.replace_lane 0 - without insertps |
| (rule (vec_insert_lane $F32X4 vec (RegMem.Reg val) 0) |
| (x64_movss_regmove vec val)) |
| |
| ;; f32x4.replace_lane 1 - without insertps |
| ;; tmp = [ vec[1] vec[0] val[1] val[0] ] |
| ;; result = [ vec[3] vec[2] tmp[0] tmp[2] ] |
| (rule (vec_insert_lane $F32X4 vec (RegMem.Reg val) 1) |
| (let ((tmp Xmm (x64_movlhps val vec))) |
| (x64_shufps tmp vec 0b11_10_00_10))) |
| |
| ;; f32x4.replace_lane 2 - without insertps |
| ;; tmp = [ vec[0] vec[3] val[0] val[0] ] |
| ;; result = [ tmp[2] tmp[0] vec[1] vec[0] ] |
| (rule (vec_insert_lane $F32X4 vec (RegMem.Reg val) 2) |
| (let ((tmp Xmm (x64_shufps val vec 0b00_11_00_00))) |
| (x64_shufps vec tmp 0b10_00_01_00))) |
| |
| ;; f32x4.replace_lane 3 - without insertps |
| ;; tmp = [ vec[3] vec[2] val[1] val[0] ] |
| ;; result = [ tmp[0] tmp[2] vec[1] vec[0] ] |
| (rule (vec_insert_lane $F32X4 vec (RegMem.Reg val) 3) |
| (let ((tmp Xmm (x64_shufps val vec 0b11_10_01_00))) |
| (x64_shufps vec tmp 0b00_10_01_00))) |
| |
| ;; Recursively delegate to the above rules by loading from memory first. |
| (rule (vec_insert_lane $F32X4 vec (RegMem.Mem addr) idx) |
| (vec_insert_lane $F32X4 vec (x64_movss_load addr) idx)) |
| |
| ;; External rust code used to calculate the immediate value to `insertps`. |
| (decl sse_insertps_lane_imm (u8) u8) |
| (extern constructor sse_insertps_lane_imm sse_insertps_lane_imm) |
| |
| ;; f64x2.replace_lane 0 |
| ;; |
;; Here the `movsd` instruction is used specifically to specialize moving
;; into the first lane; unlike the cases above, the lane index is not used
;; as an immediate to the instruction itself.
| (rule (vec_insert_lane $F64X2 vec (RegMem.Reg val) 0) |
| (x64_movsd_regmove vec val)) |
| (rule (vec_insert_lane $F64X2 vec (RegMem.Mem val) 0) |
| (x64_movsd_regmove vec (x64_movsd_load val))) |
| |
| ;; f64x2.replace_lane 1 |
| ;; |
;; Here the `movlhps` instruction is used specifically to specialize moving
;; into the second lane; unlike the cases above, the lane index is not used
;; as an immediate to the instruction itself.
| (rule (vec_insert_lane $F64X2 vec val 1) |
| (x64_movlhps vec val)) |
| |
| ;;;; Rules for `smin`, `smax`, `umin`, `umax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; `i64` and smaller. |
| |
| (decl cmp_and_choose (Type CC Value Value) ValueRegs) |
| (rule (cmp_and_choose (fits_in_64 ty) cc x y) |
| (let ((size OperandSize (raw_operand_size_of_type ty)) |
| ;; We need to put x and y in registers explicitly because |
| ;; we use the values more than once. Hence, even if these |
| ;; are "unique uses" at the CLIF level and would otherwise |
| ;; allow for load-op merging, here we cannot do that. |
| (x_reg Reg x) |
| (y_reg Reg y)) |
| (with_flags_reg (x64_cmp size x_reg y_reg) |
| (cmove ty cc y_reg x_reg)))) |
| |
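;; For example, the `umin` lowering below becomes a `cmp` of the two operands
;; followed by a `cmovb`-style conditional move that picks whichever operand
;; is unsigned-less, with no branches emitted.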
| (rule -1 (lower (has_type (fits_in_64 ty) (umin x y))) |
| (cmp_and_choose ty (CC.B) x y)) |
| |
| (rule -1 (lower (has_type (fits_in_64 ty) (umax x y))) |
| (cmp_and_choose ty (CC.NB) x y)) |
| |
| (rule -1 (lower (has_type (fits_in_64 ty) (smin x y))) |
| (cmp_and_choose ty (CC.L) x y)) |
| |
| (rule -1 (lower (has_type (fits_in_64 ty) (smax x y))) |
| (cmp_and_choose ty (CC.NL) x y)) |
| |
| ;; SSE helpers for determining if single-instruction lowerings are available. |
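;;
;; SSE2 already provides `pminsw`/`pmaxsw` (signed i16x8 min/max) and
;; `pminub`/`pmaxub` (unsigned i8x16 min/max); the remaining 8/16/32-bit lane
;; combinations require SSE4.1, and there are no 64x2-lane variants in either
;; instruction set.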
| |
| (decl pure has_pmins (Type) bool) |
| (rule 1 (has_pmins $I16X8) $true) |
| (rule 1 (has_pmins $I64X2) $false) |
| (rule (has_pmins _) (use_sse41)) |
| |
| (decl pure has_pmaxs (Type) bool) |
| (rule 1 (has_pmaxs $I16X8) $true) |
| (rule 1 (has_pmaxs $I64X2) $false) |
| (rule (has_pmaxs _) (use_sse41)) |
| |
| (decl pure has_pmaxu (Type) bool) |
| (rule 1 (has_pmaxu $I8X16) $true) |
| (rule 1 (has_pmaxu $I64X2) $false) |
| (rule (has_pmaxu _) (use_sse41)) |
| |
| (decl pure has_pminu (Type) bool) |
| (rule 1 (has_pminu $I8X16) $true) |
| (rule 1 (has_pminu $I64X2) $false) |
| (rule (has_pminu _) (use_sse41)) |
| |
| ;; SSE `smax`. |
| |
| (rule (lower (has_type (ty_vec128 ty) (smax x y))) |
| (lower_vec_smax ty x y)) |
| |
| (decl lower_vec_smax (Type Xmm Xmm) Xmm) |
| (rule 1 (lower_vec_smax ty x y) |
| (if-let $true (has_pmaxs ty)) |
| (x64_pmaxs ty x y)) |
| |
| (rule (lower_vec_smax ty x y) |
| (let ( |
| (x Xmm x) |
| (y Xmm y) |
| (cmp Xmm (x64_pcmpgt ty x y)) |
| (x_is_max Xmm (x64_pand cmp x)) |
| (y_is_max Xmm (x64_pandn cmp y)) |
| ) |
| (x64_por x_is_max y_is_max))) |
| |
| ;; SSE `smin`. |
| |
| (rule 1 (lower (has_type (ty_vec128 ty) (smin x y))) |
| (if-let $true (has_pmins ty)) |
| (x64_pmins ty x y)) |
| |
| (rule (lower (has_type (ty_vec128 ty) (smin x y))) |
| (let ( |
| (x Xmm x) |
| (y Xmm y) |
| (cmp Xmm (x64_pcmpgt ty y x)) |
| (x_is_min Xmm (x64_pand cmp x)) |
| (y_is_min Xmm (x64_pandn cmp y)) |
| ) |
| (x64_por x_is_min y_is_min))) |
| |
| ;; SSE `umax`. |
| |
| (rule 2 (lower (has_type (ty_vec128 ty) (umax x y))) |
| (if-let $true (has_pmaxu ty)) |
| (x64_pmaxu ty x y)) |
| |
;; If `y <= x` then the saturating subtraction `y - x` is zero, so the sum is
;; `x`; otherwise adding `y - x` back to `x` yields `y`. Either way this is
;; the unsigned maximum.
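;;
;; For example, with 16-bit lanes: umax(3, 7) = 3 + sat_sub(7, 3) = 3 + 4 = 7
;; and umax(7, 3) = 7 + sat_sub(3, 7) = 7 + 0 = 7.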
| (rule 1 (lower (has_type $I16X8 (umax x y))) |
| (let ((x Xmm x)) |
| (x64_paddw x (x64_psubusw y x)))) |
| |
;; Flip the upper bit of each lane so the signed comparison has the same
;; result as an unsigned comparison would, and then select the results with
;; the output mask. See the `pcmpgt` lowering for info on flipping the upper
;; bit.
| (rule (lower (has_type (ty_vec128 ty) (umax x y))) |
| (let ( |
| (x Xmm x) |
| (y Xmm y) |
| (mask Xmm (flip_high_bit_mask ty)) |
| (x_masked Xmm (x64_pxor x mask)) |
| (y_masked Xmm (x64_pxor y mask)) |
| (cmp Xmm (x64_pcmpgt ty x_masked y_masked)) |
| (x_is_max Xmm (x64_pand cmp x)) |
| (y_is_max Xmm (x64_pandn cmp y)) |
| ) |
| (x64_por x_is_max y_is_max))) |
| |
| (decl flip_high_bit_mask (Type) Xmm) |
| (rule (flip_high_bit_mask $I16X8) |
| (x64_movdqu_load (emit_u128_le_const 0x8000_8000_8000_8000_8000_8000_8000_8000))) |
| (rule (flip_high_bit_mask $I32X4) |
| (x64_movdqu_load (emit_u128_le_const 0x80000000_80000000_80000000_80000000))) |
| (rule (flip_high_bit_mask $I64X2) |
| (x64_movdqu_load (emit_u128_le_const 0x8000000000000000_8000000000000000))) |
| |
| ;; SSE `umin`. |
| |
| (rule 2 (lower (has_type (ty_vec128 ty) (umin x y))) |
| (if-let $true (has_pminu ty)) |
| (x64_pminu ty x y)) |
| |
;; If `x <= y` then the saturating subtraction `x - y` is zero and the result
;; is `x`; otherwise subtracting `x - y` from `x` again yields `y`. Either way
;; this is the unsigned minimum.
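;;
;; For example, with 16-bit lanes: umin(3, 7) = 3 - sat_sub(3, 7) = 3 - 0 = 3
;; and umin(7, 3) = 7 - sat_sub(7, 3) = 7 - 4 = 3.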
| (rule 1 (lower (has_type $I16X8 (umin x y))) |
| (let ((x Xmm x)) |
| (x64_psubw x (x64_psubusw x y)))) |
| |
| ;; Same as `umax`, and see `pcmpgt` for docs on flipping the upper bit. |
| (rule (lower (has_type (ty_vec128 ty) (umin x y))) |
| (let ( |
| (x Xmm x) |
| (y Xmm y) |
| (mask Xmm (flip_high_bit_mask ty)) |
| (x_masked Xmm (x64_pxor x mask)) |
| (y_masked Xmm (x64_pxor y mask)) |
| (cmp Xmm (x64_pcmpgt ty y_masked x_masked)) |
(x_is_min Xmm (x64_pand cmp x))
(y_is_min Xmm (x64_pandn cmp y))
)
(x64_por x_is_min y_is_min)))
| |
| ;;;; Rules for `trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (trap code)) |
| (side_effect (x64_ud2 code))) |
| |
| ;;;; Rules for `uadd_overflow_trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type (fits_in_64 ty) (uadd_overflow_trap a b tc))) |
| (with_flags |
| (x64_add_with_flags_paired ty a b) |
| (trap_if (CC.B) tc))) |
| |
;; Handle immediates and sinkable loads on the lhs in addition to the
;; automatic rhs handling above.
| |
| (rule 1 (lower (has_type (fits_in_64 ty) |
| (uadd_overflow_trap (simm32_from_value a) b tc))) |
| (with_flags |
| (x64_add_with_flags_paired ty b a) |
| (trap_if (CC.B) tc))) |
| |
| (rule 2 (lower (has_type (fits_in_64 ty) |
| (uadd_overflow_trap (sinkable_load a) b tc))) |
| (with_flags |
| (x64_add_with_flags_paired ty b a) |
| (trap_if (CC.B) tc))) |
| |
| ;;;; Rules for `resumable_trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (resumable_trap code)) |
| (side_effect (x64_ud2 code))) |
| |
| ;;;; Rules for `return` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; N.B.: the Ret itself is generated by the ABI. |
| (rule (lower (return args)) |
| (lower_return args)) |
| |
| ;;;; Rules for `icmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule -2 (lower (icmp cc a @ (value_type (fits_in_64 ty)) b)) |
| (lower_icmp_bool (emit_cmp cc a b))) |
| |
| (rule -1 (lower (icmp cc a @ (value_type $I128) b)) |
| (lower_icmp_bool (emit_cmp cc a b))) |
| |
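;; The following peephole optimizations all exploit the same fact: for a
;; signed comparison against zero the answer is just the sign bit, so a
;; logical right shift by `bits - 1` produces the 0-or-1 result directly;
;; the `>=`/`<=` forms first invert the value so the shifted-out bit is the
;; complement of the sign bit. A sketch of the identities used, for the
;; 64-bit case:
;;
;;    (x <  0) as u8  ==  ((x as u64) >> 63) as u8
;;    (x >= 0) as u8  ==  ((!x as u64) >> 63) as u8
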
| ;; Peephole optimization for `x < 0`, when x is a signed 64 bit value |
| (rule 2 (lower (has_type $I8 (icmp (IntCC.SignedLessThan) x @ (value_type $I64) (u64_from_iconst 0)))) |
| (x64_shr $I64 x (Imm8Reg.Imm8 63))) |
| |
| ;; Peephole optimization for `0 > x`, when x is a signed 64 bit value |
| (rule 2 (lower (has_type $I8 (icmp (IntCC.SignedGreaterThan) (u64_from_iconst 0) x @ (value_type $I64)))) |
| (x64_shr $I64 x (Imm8Reg.Imm8 63))) |
| |
| ;; Peephole optimization for `0 <= x`, when x is a signed 64 bit value |
| (rule 2 (lower (has_type $I8 (icmp (IntCC.SignedLessThanOrEqual) (u64_from_iconst 0) x @ (value_type $I64)))) |
| (x64_shr $I64 (x64_not $I64 x) (Imm8Reg.Imm8 63))) |
| |
| ;; Peephole optimization for `x >= 0`, when x is a signed 64 bit value |
| (rule 2 (lower (has_type $I8 (icmp (IntCC.SignedGreaterThanOrEqual) x @ (value_type $I64) (u64_from_iconst 0)))) |
| (x64_shr $I64 (x64_not $I64 x) (Imm8Reg.Imm8 63))) |
| |
| ;; Peephole optimization for `x < 0`, when x is a signed 32 bit value |
| (rule 2 (lower (has_type $I8 (icmp (IntCC.SignedLessThan) x @ (value_type $I32) (u64_from_iconst 0)))) |
| (x64_shr $I32 x (Imm8Reg.Imm8 31))) |
| |
| ;; Peephole optimization for `0 > x`, when x is a signed 32 bit value |
| (rule 2 (lower (has_type $I8 (icmp (IntCC.SignedGreaterThan) (u64_from_iconst 0) x @ (value_type $I32)))) |
| (x64_shr $I32 x (Imm8Reg.Imm8 31))) |
| |
| ;; Peephole optimization for `0 <= x`, when x is a signed 32 bit value |
| (rule 2 (lower (has_type $I8 (icmp (IntCC.SignedLessThanOrEqual) (u64_from_iconst 0) x @ (value_type $I32)))) |
| (x64_shr $I32 (x64_not $I64 x) (Imm8Reg.Imm8 31))) |
| |
| ;; Peephole optimization for `x >= 0`, when x is a signed 32 bit value |
| (rule 2 (lower (has_type $I8 (icmp (IntCC.SignedGreaterThanOrEqual) x @ (value_type $I32) (u64_from_iconst 0)))) |
| (x64_shr $I32 (x64_not $I64 x) (Imm8Reg.Imm8 31))) |
| |
;; For XMM-held values, we lower to `PCMP*` instructions, sometimes more than
;; one. Note that the output differs from the GPR case: each lane is filled
;; with all 1s or all 0s according to the comparison, whereas for GPR-held
;; values the result is simply 0 or 1 (upper bits unset).
| (rule (lower (icmp (IntCC.Equal) a @ (value_type (ty_vec128 ty)) b)) |
| (x64_pcmpeq ty a b)) |
| |
| ;; To lower a not-equals comparison, we perform an equality comparison |
| ;; (PCMPEQ*) and then invert the bits (PXOR with all 1s). |
| (rule (lower (icmp (IntCC.NotEqual) a @ (value_type (ty_vec128 ty)) b)) |
| (let ((checked Xmm (x64_pcmpeq ty a b)) |
| (all_ones Xmm (vector_all_ones))) |
| (x64_pxor checked all_ones))) |
| |
| ;; SSE `sgt` |
| |
| (rule (lower (icmp (IntCC.SignedGreaterThan) a @ (value_type (ty_vec128 ty)) b)) |
| (x64_pcmpgt ty a b)) |
| |
| ;; SSE `slt` |
| |
| (rule (lower (icmp (IntCC.SignedLessThan) a @ (value_type (ty_vec128 ty)) b)) |
| (x64_pcmpgt ty b a)) |
| |
| ;; SSE `ugt` |
| |
;; N.B.: we must manually prevent load coalescing of the operands; the
;; register allocator gets confused otherwise.
| (rule 1 (lower (icmp (IntCC.UnsignedGreaterThan) a @ (value_type (ty_vec128 ty)) b)) |
| (if-let $true (has_pmaxu ty)) |
| (let ((a Xmm a) |
| (b Xmm b) |
| (max Xmm (x64_pmaxu ty a b)) |
| (eq Xmm (x64_pcmpeq ty max b))) |
| (x64_pxor eq (vector_all_ones)))) |
| |
| ;; Flip the upper bit of each lane so the result of a signed comparison is the |
| ;; same as the result of an unsigned comparison (see docs on `pcmpgt` for more) |
| (rule (lower (icmp (IntCC.UnsignedGreaterThan) a @ (value_type (ty_vec128 ty)) b)) |
| (let ((mask Xmm (flip_high_bit_mask ty)) |
| (a_masked Xmm (x64_pxor a mask)) |
| (b_masked Xmm (x64_pxor b mask))) |
| (x64_pcmpgt ty a_masked b_masked))) |
| |
| ;; SSE `ult` |
| |
| (rule 1 (lower (icmp (IntCC.UnsignedLessThan) a @ (value_type (ty_vec128 ty)) b)) |
| (if-let $true (has_pminu ty)) |
| ;; N.B.: see note above. |
| (let ((a Xmm a) |
| (b Xmm b) |
| (min Xmm (x64_pminu ty a b)) |
| (eq Xmm (x64_pcmpeq ty min b))) |
| (x64_pxor eq (vector_all_ones)))) |
| |
| ;; Flip the upper bit of `a` and `b` so the signed comparison result will |
| ;; be the same as the unsigned comparison result (see docs on `pcmpgt` for more). |
| (rule (lower (icmp (IntCC.UnsignedLessThan) a @ (value_type (ty_vec128 ty)) b)) |
| (let ((mask Xmm (flip_high_bit_mask ty)) |
| (a_masked Xmm (x64_pxor a mask)) |
| (b_masked Xmm (x64_pxor b mask))) |
| (x64_pcmpgt ty b_masked a_masked))) |
| |
| ;; SSE `sge` |
| |
| ;; Use `pmaxs*` and compare the result to `a` to see if it's `>= b`. |
| (rule 1 (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) |
| (if-let $true (has_pmaxs ty)) |
| (x64_pcmpeq ty a (x64_pmaxs ty a b))) |
| |
| ;; Without `pmaxs*` use a `pcmpgt*` with reversed operands and invert the |
| ;; result. |
| (rule (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) |
| (x64_pxor (x64_pcmpgt ty b a) (vector_all_ones))) |
| |
| ;; SSE `sle` |
| |
| ;; With `pmins*` use that and compare the result to `a`. |
| (rule 1 (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) |
| (if-let $true (has_pmins ty)) |
| (x64_pcmpeq ty a (x64_pmins ty a b))) |
| |
| ;; Without `pmins*` perform a greater-than test and invert the result. |
| (rule (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) |
| (x64_pxor (x64_pcmpgt ty a b) (vector_all_ones))) |
| |
| ;; SSE `uge` |
| |
| (rule 2 (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) |
| (if-let $true (has_pmaxu ty)) |
| (x64_pcmpeq ty a (x64_pmaxu ty a b))) |
| |
;; Perform a saturating subtract of `a` from `b`; if the result is zero then
;; `a` is greater than or equal to `b`.
| (rule 1 (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type $I16X8) b)) |
| (x64_pcmpeqw (x64_psubusw b a) (xmm_zero $I16X8))) |
| |
| ;; Flip the upper bit of each lane so the signed comparison is the same as |
| ;; an unsigned one and then invert the result. See docs on `pcmpgt` for why |
| ;; flipping the upper bit works. |
| (rule (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) |
| (let ( |
| (mask Xmm (flip_high_bit_mask ty)) |
| (a_masked Xmm (x64_pxor a mask)) |
| (b_masked Xmm (x64_pxor b mask)) |
| (cmp Xmm (x64_pcmpgt ty b_masked a_masked)) |
| ) |
| (x64_pxor cmp (vector_all_ones)))) |
| |
| ;; SSE `ule` |
| |
| (rule 2 (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) |
| (if-let $true (has_pminu ty)) |
| (x64_pcmpeq ty a (x64_pminu ty a b))) |
| |
;; A saturating subtraction will produce zeros if `a` is less than or equal
;; to `b`, so compare that result against zero to find the lanes of `a` that
;; are <= the corresponding lanes in `b`.
| (rule 1 (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type $I16X8) b)) |
| (let ((zeros_if_a_is_min Xmm (x64_psubusw a b))) |
| (x64_pcmpeqw zeros_if_a_is_min (xmm_zero $I8X16)))) |
| |
;; Flip the upper bit of each lane in `a` and `b` so a signed comparison
;; produces the same result as an unsigned comparison. Then test for `gt`
;; and invert the result to get the `le` that is desired here. See docs on
;; `pcmpgt` for why flipping the upper bit works.
| (rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) |
| (let ( |
| (mask Xmm (flip_high_bit_mask ty)) |
| (a_masked Xmm (x64_pxor a mask)) |
| (b_masked Xmm (x64_pxor b mask)) |
| (cmp Xmm (x64_pcmpgt ty a_masked b_masked)) |
| ) |
| (x64_pxor cmp (vector_all_ones)))) |
| |
| ;;;; Rules for `fcmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; CLIF's `fcmp` instruction always operates on XMM registers--both scalar and |
| ;; vector. For the scalar versions, we use the flag-setting behavior of the |
| ;; `UCOMIS*` instruction to `SETcc` a 0 or 1 in a GPR register. Note that CLIF's |
| ;; `select` uses the same kind of flag-setting behavior but chooses values other |
| ;; than 0 or 1. |
| ;; |
| ;; Checking the result of `UCOMIS*` is unfortunately difficult in some cases |
| ;; because we do not have `SETcc` instructions that explicitly check |
| ;; simultaneously for the condition (i.e., `eq`, `le`, `gt`, etc.) *and* |
| ;; orderedness. Instead, we must check the flags multiple times. The UCOMIS* |
| ;; documentation (see Intel's Software Developer's Manual, volume 2, chapter 4) |
| ;; is helpful: |
| ;; - unordered assigns Z = 1, P = 1, C = 1 |
| ;; - greater than assigns Z = 0, P = 0, C = 0 |
| ;; - less than assigns Z = 0, P = 0, C = 1 |
| ;; - equal assigns Z = 1, P = 0, C = 0 |
| |
| (rule -1 (lower (fcmp cc a @ (value_type (ty_scalar_float ty)) b)) |
| (lower_fcmp_bool (emit_fcmp cc a b))) |
| |
| ;; For vector lowerings, we use `CMPP*` instructions with a 3-bit operand that |
| ;; determines the comparison to make. Note that comparisons that succeed will |
| ;; fill the lane with 1s; comparisons that do not will fill the lane with 0s. |
| |
| (rule (lower (fcmp (FloatCC.Equal) a @ (value_type (ty_vec128 ty)) b)) |
| (x64_cmpp ty a b (FcmpImm.Equal))) |
| (rule (lower (fcmp (FloatCC.NotEqual) a @ (value_type (ty_vec128 ty)) b)) |
| (x64_cmpp ty a b (FcmpImm.NotEqual))) |
| (rule (lower (fcmp (FloatCC.LessThan) a @ (value_type (ty_vec128 ty)) b)) |
| (x64_cmpp ty a b (FcmpImm.LessThan))) |
| (rule (lower (fcmp (FloatCC.LessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) |
| (x64_cmpp ty a b (FcmpImm.LessThanOrEqual))) |
| (rule (lower (fcmp (FloatCC.Ordered) a @ (value_type (ty_vec128 ty)) b)) |
| (x64_cmpp ty a b (FcmpImm.Ordered))) |
| (rule (lower (fcmp (FloatCC.Unordered) a @ (value_type (ty_vec128 ty)) b)) |
| (x64_cmpp ty a b (FcmpImm.Unordered))) |
| (rule (lower (fcmp (FloatCC.UnorderedOrGreaterThan) a @ (value_type (ty_vec128 ty)) b)) |
| (x64_cmpp ty a b (FcmpImm.UnorderedOrGreaterThan))) |
| (rule (lower (fcmp (FloatCC.UnorderedOrGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) |
| (x64_cmpp ty a b (FcmpImm.UnorderedOrGreaterThanOrEqual))) |
| |
| ;; Some vector lowerings rely on flipping the operands and using a reversed |
| ;; comparison code. |
| |
| (rule (lower (fcmp (FloatCC.GreaterThan) a @ (value_type (ty_vec128 ty)) b)) |
| (x64_cmpp ty b a (FcmpImm.LessThan))) |
| (rule (lower (fcmp (FloatCC.GreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) |
| (x64_cmpp ty b a (FcmpImm.LessThanOrEqual))) |
| (rule (lower (fcmp (FloatCC.UnorderedOrLessThan) a @ (value_type (ty_vec128 ty)) b)) |
| (x64_cmpp ty b a (FcmpImm.UnorderedOrGreaterThan))) |
| (rule (lower (fcmp (FloatCC.UnorderedOrLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) |
| (x64_cmpp ty b a (FcmpImm.UnorderedOrGreaterThanOrEqual))) |
| |
| ;; Some vector lowerings are simply not supported for certain codes: |
| ;; - FloatCC::OrderedNotEqual |
| ;; - FloatCC::UnorderedOrEqual |
| |
| ;;;; Rules for `select` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; When a `select` has an `fcmp` as a condition then rely on `emit_fcmp` to |
| ;; figure out how to perform the comparison. |
| ;; |
;; Note, though, that `FloatCC.Equal` requires and-ing two condition codes,
;; which isn't the easiest thing to lower to a `cmove` instruction. For this
;; reason a `select (fcmp eq ..) ..` is instead flipped around to be
;; `select (fcmp ne ..) ..` with the two selected values swapped. This
;; produces an `FcmpCondResult.OrCondition`, which is easier to generate
;; code for.
| (rule (lower (has_type ty (select (maybe_uextend (fcmp cc a b)) x y))) |
| (lower_select_fcmp ty (emit_fcmp cc a b) x y)) |
| (rule 1 (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.Equal) a b)) x y))) |
| (lower_select_fcmp ty (emit_fcmp (FloatCC.NotEqual) a b) y x)) |
| |
| (decl lower_select_fcmp (Type FcmpCondResult Value Value) InstOutput) |
| (rule (lower_select_fcmp ty (FcmpCondResult.Condition flags cc) x y) |
| (with_flags flags (cmove_from_values ty cc x y))) |
| (rule (lower_select_fcmp ty (FcmpCondResult.OrCondition flags cc1 cc2) x y) |
| (with_flags flags (cmove_or_from_values ty cc1 cc2 x y))) |
| |
| ;; We also can lower `select`s that depend on an `icmp` test, but more simply |
| ;; than the `fcmp` variants above. In these cases, we lower to a `CMP` |
| ;; instruction plus a `CMOV`; recall that `cmove_from_values` here may emit more |
| ;; than one instruction for certain types (e.g., XMM-held, I128). |
| |
| (rule (lower (has_type ty (select (maybe_uextend (icmp cc a @ (value_type (fits_in_64 a_ty)) b)) x y))) |
| (let ((size OperandSize (raw_operand_size_of_type a_ty))) |
| (with_flags (x64_cmp size b a) (cmove_from_values ty cc x y)))) |
| |
| ;; Finally, we lower `select` from a condition value `c`. These rules are meant |
| ;; to be the final, default lowerings if no other patterns matched above. |
| |
| (rule -1 (lower (has_type ty (select c @ (value_type (fits_in_64 a_ty)) x y))) |
| (let ((size OperandSize (raw_operand_size_of_type a_ty)) |
| ;; N.B.: disallow load-op fusion, see above. TODO: |
| ;; https://github.com/bytecodealliance/wasmtime/issues/3953. |
| (gpr_c Gpr (put_in_gpr c))) |
| (with_flags (x64_test size gpr_c gpr_c) (cmove_from_values ty (CC.NZ) x y)))) |
| |
| (rule -2 (lower (has_type ty (select c @ (value_type $I128) x y))) |
| (let ((cond_result IcmpCondResult (cmp_zero_i128 (CC.Z) c))) |
| (select_icmp cond_result x y))) |
| |
| ;; Specializations for floating-point compares to generate a `mins*` or a |
| ;; `maxs*` instruction. These are equivalent to the "pseudo-m{in,ax}" |
| ;; specializations for vectors. |
| (rule 2 (lower (has_type $F32 (select (maybe_uextend (fcmp (FloatCC.LessThan) x y)) x y))) |
| (x64_minss x y)) |
| (rule 2 (lower (has_type $F64 (select (maybe_uextend (fcmp (FloatCC.LessThan) x y)) x y))) |
| (x64_minsd x y)) |
| (rule 3 (lower (has_type $F32 (select (maybe_uextend (fcmp (FloatCC.LessThan) y x)) x y))) |
| (x64_maxss x y)) |
| (rule 3 (lower (has_type $F64 (select (maybe_uextend (fcmp (FloatCC.LessThan) y x)) x y))) |
| (x64_maxsd x y)) |
| |
| ;; Rules for `clz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
;; If available, we can use a plain lzcnt instruction here. Note no
;; special handling is required for zero inputs, because the machine
;; instruction does what the CLIF expects for zero, i.e. it returns
;; the operand width in bits.
| (rule 3 (lower (has_type (ty_32_or_64 ty) (clz src))) |
| (if-let $true (use_lzcnt)) |
| (x64_lzcnt ty src)) |
| |
| (rule 2 (lower (has_type (ty_32_or_64 ty) (clz src))) |
| (do_clz ty ty src)) |
| |
| (rule 1 (lower |
| (has_type (ty_8_or_16 ty) |
| (clz src))) |
| (do_clz $I32 ty (extend_to_gpr src $I32 (ExtendKind.Zero)))) |
| |
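;; For `i128` the count is `clz(hi)` when the upper 64 bits are nonzero, and
;; `64 + clz(lo)` otherwise; the upper half of the result is always zero.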
| (rule 0 (lower |
| (has_type $I128 |
| (clz src))) |
| (let ((upper Gpr (do_clz $I64 $I64 (value_regs_get_gpr src 1))) |
| (lower Gpr (x64_add $I64 |
| (do_clz $I64 $I64 (value_regs_get_gpr src 0)) |
| (RegMemImm.Imm 64))) |
| (result_lo Gpr |
| (with_flags_reg |
| (x64_cmp_imm (OperandSize.Size64) 64 upper) |
| (cmove $I64 (CC.NZ) upper lower)))) |
| (value_regs result_lo (imm $I64 0)))) |
| |
| ;; Implementation helper for clz; operates on 32 or 64-bit units. |
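;;
;; A sketch of the identity this implements (`bsr` returns the index of the
;; highest set bit, and the `-1` fallback handles a zero input):
;;
;;    clz(x) = (bits - 1) - bsr(x)            for x != 0
;;    clz(0) = (bits - 1) - (-1) = bits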
| (decl do_clz (Type Type Gpr) Gpr) |
| (rule (do_clz ty orig_ty src) |
| (let ((highest_bit_index Reg (bsr_or_else ty src (imm_i64 $I64 -1))) |
| (bits_minus_1 Reg (imm ty (u64_sub (ty_bits_u64 orig_ty) 1)))) |
| (x64_sub ty bits_minus_1 highest_bit_index))) |
| |
| ;; Rules for `ctz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; Analogous to `clz` cases above, but using mirror instructions |
| ;; (tzcnt vs lzcnt, bsf vs bsr). |
| |
| (rule 3 (lower (has_type (ty_32_or_64 ty) (ctz src))) |
| (if-let $true (use_bmi1)) |
| (x64_tzcnt ty src)) |
| |
| (rule 2 (lower (has_type (ty_32_or_64 ty) (ctz src))) |
| (do_ctz ty ty src)) |
| |
| (rule 1 (lower |
| (has_type (ty_8_or_16 ty) |
| (ctz src))) |
| (do_ctz $I32 ty (extend_to_gpr src $I32 (ExtendKind.Zero)))) |
| |
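;; For `i128` the count is `ctz(lo)` when the lower 64 bits are nonzero, and
;; `64 + ctz(hi)` otherwise; the upper half of the result is always zero.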
| (rule 0 (lower |
| (has_type $I128 |
| (ctz src))) |
| (let ((lower Gpr (do_ctz $I64 $I64 (value_regs_get_gpr src 0))) |
| (upper Gpr (x64_add $I64 |
| (do_ctz $I64 $I64 (value_regs_get_gpr src 1)) |
| (RegMemImm.Imm 64))) |
| (result_lo Gpr |
| (with_flags_reg |
| (x64_cmp_imm (OperandSize.Size64) 64 lower) |
| (cmove $I64 (CC.Z) upper lower)))) |
| (value_regs result_lo (imm $I64 0)))) |
| |
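;; This implements `ctz(x) = bsf(x)`, with `bsf`'s undefined zero-input case
;; replaced by the fallback value `bits`, which is what CLIF expects for
;; `ctz(0)`.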
| (decl do_ctz (Type Type Gpr) Gpr) |
| (rule (do_ctz ty orig_ty src) |
| (bsf_or_else ty src (imm $I64 (ty_bits_u64 orig_ty)))) |
| |
| ;; Rules for `popcnt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule 4 (lower (has_type (ty_32_or_64 ty) (popcnt src))) |
| (if-let $true (use_popcnt)) |
| (x64_popcnt ty src)) |
| |
| (rule 3 (lower (has_type (ty_8_or_16 ty) (popcnt src))) |
| (if-let $true (use_popcnt)) |
| (x64_popcnt $I32 (extend_to_gpr src $I32 (ExtendKind.Zero)))) |
| |
| (rule 1 (lower (has_type $I128 (popcnt src))) |
| (if-let $true (use_popcnt)) |
| (let ((lo_count Gpr (x64_popcnt $I64 (value_regs_get_gpr src 0))) |
| (hi_count Gpr (x64_popcnt $I64 (value_regs_get_gpr src 1)))) |
| (value_regs (x64_add $I64 lo_count hi_count) (imm $I64 0)))) |
| |
| (rule -1 (lower |
| (has_type (ty_32_or_64 ty) |
| (popcnt src))) |
| (do_popcnt ty src)) |
| |
| (rule -2 (lower |
| (has_type (ty_8_or_16 ty) |
| (popcnt src))) |
| (do_popcnt $I32 (extend_to_gpr src $I32 (ExtendKind.Zero)))) |
| |
| (rule (lower |
| (has_type $I128 |
| (popcnt src))) |
| (let ((lo_count Gpr (do_popcnt $I64 (value_regs_get_gpr src 0))) |
| (hi_count Gpr (do_popcnt $I64 (value_regs_get_gpr src 1)))) |
| (value_regs (x64_add $I64 lo_count hi_count) (imm $I64 0)))) |
| |
;; Implementation of popcount when we don't have a native popcount
;; instruction.
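;;
;; For reference, a scalar Rust sketch of the same algorithm (nibble-wise
;; popcount followed by a horizontal byte sum via multiplication); this is
;; only an illustration of the steps below, not what is emitted:
;;
;;    fn popcnt64(x: u64) -> u64 {
;;        let m = 0x7777_7777_7777_7777u64;
;;        let a = (x >> 1) & m;              // floor(nibble / 2)
;;        let b = (a >> 1) & m;              // floor(nibble / 4)
;;        let c = (b >> 1) & m;              // floor(nibble / 8)
;;        let nibbles = x - a - b - c;       // popcount of each nibble
;;        let bytes = (nibbles + (nibbles >> 4)) & 0x0f0f_0f0f_0f0f_0f0fu64;
;;        bytes.wrapping_mul(0x0101_0101_0101_0101u64) >> 56
;;    }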
| (decl do_popcnt (Type Gpr) Gpr) |
| (rule (do_popcnt $I64 src) |
| (let ((shifted1 Gpr (x64_shr $I64 src (Imm8Reg.Imm8 1))) |
| (sevens Gpr (imm $I64 0x7777777777777777)) |
| (masked1 Gpr (x64_and $I64 shifted1 sevens)) |
| ;; diff1 := src - ((src >> 1) & 0b0111_0111_0111...) |
| (diff1 Gpr (x64_sub $I64 src masked1)) |
| (shifted2 Gpr (x64_shr $I64 masked1 (Imm8Reg.Imm8 1))) |
| (masked2 Gpr (x64_and $I64 shifted2 sevens)) |
;; diff2 := diff1 - ((src >> 2) & 0b0011_0011_0011...)
| (diff2 Gpr (x64_sub $I64 diff1 masked2)) |
| (shifted3 Gpr (x64_shr $I64 masked2 (Imm8Reg.Imm8 1))) |
| (masked3 Gpr (x64_and $I64 shifted3 sevens)) |
;; diff3 := diff2 - ((src >> 3) & 0b0001_0001_0001...)
;;
;; At this point, each nibble of diff3 is the popcount of
;; that nibble: for a nibble value n we have computed
;; n - floor(n/2) - floor(n/4) - floor(n/8), which is
;; exactly popcount(n). (A set bit at position i
;; contributes 2^i to n and 2^(i-1) + ... + 1 = 2^i - 1
;; to the subtracted terms, leaving exactly 1.)
| (diff3 Gpr (x64_sub $I64 diff2 masked3)) |
| ;; Add the two nibbles of each byte together. |
| (sum1 Gpr (x64_add $I64 |
| (x64_shr $I64 diff3 (Imm8Reg.Imm8 4)) |
| diff3)) |
| ;; Mask the above sum to have the popcount for each byte |
| ;; in the lower nibble of that byte. |
| (ofof Gpr (imm $I64 0x0f0f0f0f0f0f0f0f)) |
| (masked4 Gpr (x64_and $I64 sum1 ofof)) |
| (ones Gpr (imm $I64 0x0101010101010101)) |
;; Use a multiply to sum all of the bytes' popcounts into
;; the top byte: in the long multiplication by
;; 0x0101010101010101, the top byte of the product is the
;; sum of all eight bytes of `masked4`, and no partial sum
;; exceeds 64 so no carries disturb it.
| (mul Gpr (x64_mul $I64 masked4 ones)) |
| ;; Now take that top byte and return it as the popcount. |
| (final Gpr (x64_shr $I64 mul (Imm8Reg.Imm8 56)))) |
| final)) |
| |
| ;; This is the 32-bit version of the above; the steps for each nibble |
| ;; are the same, we just use constants half as wide. |
| (rule (do_popcnt $I32 src) |
| (let ((shifted1 Gpr (x64_shr $I32 src (Imm8Reg.Imm8 1))) |
| (sevens Gpr (imm $I32 0x77777777)) |
| (masked1 Gpr (x64_and $I32 shifted1 sevens)) |
| (diff1 Gpr (x64_sub $I32 src masked1)) |
| (shifted2 Gpr (x64_shr $I32 masked1 (Imm8Reg.Imm8 1))) |
| (masked2 Gpr (x64_and $I32 shifted2 sevens)) |
| (diff2 Gpr (x64_sub $I32 diff1 masked2)) |
| (shifted3 Gpr (x64_shr $I32 masked2 (Imm8Reg.Imm8 1))) |
| (masked3 Gpr (x64_and $I32 shifted3 sevens)) |
| (diff3 Gpr (x64_sub $I32 diff2 masked3)) |
| (sum1 Gpr (x64_add $I32 |
| (x64_shr $I32 diff3 (Imm8Reg.Imm8 4)) |
| diff3)) |
| (masked4 Gpr (x64_and $I32 sum1 (RegMemImm.Imm 0x0f0f0f0f))) |
| (mul Gpr (x64_mul $I32 masked4 (RegMemImm.Imm 0x01010101))) |
| (final Gpr (x64_shr $I32 mul (Imm8Reg.Imm8 24)))) |
| final)) |
| |
| |
| (rule 2 (lower (has_type $I8X16 (popcnt src))) |
| (if-let $true (use_avx512vl)) |
| (if-let $true (use_avx512bitalg)) |
| (x64_vpopcntb src)) |
| |
| |
;; When SSSE3 is available we use Mula's algorithm (https://arxiv.org/pdf/1611.07612.pdf):
| ;; |
| ;; __m128i count_bytes ( __m128i v) { |
| ;; __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); |
| ;; __m128i low_mask = _mm_set1_epi8 (0x0f); |
| ;; __m128i lo = _mm_and_si128 (v, low_mask); |
| ;; __m128i hi = _mm_and_si128 (_mm_srli_epi16 (v, 4), low_mask); |
| ;; __m128i cnt1 = _mm_shuffle_epi8 (lookup, lo); |
| ;; __m128i cnt2 = _mm_shuffle_epi8 (lookup, hi); |
| ;; return _mm_add_epi8 (cnt1, cnt2); |
| ;; } |
| ;; |
;; Details of the above algorithm can be found in the reference noted above,
;; but the basics are to create a lookup table that pre-populates the popcnt
;; values for each nibble value [0,15]. The algorithm uses shifts to isolate
;; 4-bit sections of the vector, `pshufb` to perform the table lookup, and
;; adds together the results. The `lookup` constant above is exactly that
;; table: entry `i` holds `popcnt(i)`.
| |
| |
| (rule 1 (lower (has_type $I8X16 (popcnt src))) |
| (if-let $true (use_ssse3)) |
| (let ((low_mask XmmMem (emit_u128_le_const 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f)) |
| (low_nibbles Xmm (sse_and $I8X16 src low_mask)) |
| ;; Note that this is a 16x8 shift, but that's OK; we mask |
| ;; off anything that traverses from one byte to the next |
| ;; with the low_mask below. |
| (shifted_src Xmm (x64_psrlw src (xmi_imm 4))) |
| (high_nibbles Xmm (sse_and $I8X16 shifted_src low_mask)) |
| (lookup Xmm (x64_xmm_load_const $I8X16 |
| (emit_u128_le_const 0x04030302_03020201_03020201_02010100))) |
| (bit_counts_low Xmm (x64_pshufb lookup low_nibbles)) |
| (bit_counts_high Xmm (x64_pshufb lookup high_nibbles))) |
| (x64_paddb bit_counts_low bit_counts_high))) |
| |
| ;; A modified version of the popcnt method from Hacker's Delight. |
| (rule (lower (has_type $I8X16 (popcnt src))) |
| (let ((mask1 XmmMem (emit_u128_le_const 0x77777777777777777777777777777777)) |
| (src Xmm src) |
| (shifted Xmm (x64_pand (x64_psrlq src (xmi_imm 1)) mask1)) |
| (src Xmm (x64_psubb src shifted)) |
| (shifted Xmm (x64_pand (x64_psrlq shifted (xmi_imm 1)) mask1)) |
| (src Xmm (x64_psubb src shifted)) |
| (shifted Xmm (x64_pand (x64_psrlq shifted (xmi_imm 1)) mask1)) |
| (src Xmm (x64_psubb src shifted)) |
| (src Xmm (x64_paddb src (x64_psrlw src (xmi_imm 4))))) |
| (x64_pand src (emit_u128_le_const 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f)))) |
| |
| ;; Rules for `bitrev` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type $I8 (bitrev src))) |
| (do_bitrev8 $I32 src)) |
| |
| (rule (lower (has_type $I16 (bitrev src))) |
| (do_bitrev16 $I32 src)) |
| |
| (rule (lower (has_type $I32 (bitrev src))) |
| (do_bitrev32 $I32 src)) |
| |
| (rule (lower (has_type $I64 (bitrev src))) |
| (do_bitrev64 $I64 src)) |
| |
| (rule (lower (has_type $I128 (bitrev src))) |
| (value_regs |
| (do_bitrev64 $I64 (value_regs_get_gpr src 1)) |
| (do_bitrev64 $I64 (value_regs_get_gpr src 0)))) |
| |
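;; Helpers implementing bit reversal by swapping progressively larger groups
;; of bits: adjacent bits, 2-bit pairs, and nibbles (`do_bitrev8`), then bytes
;; (`do_bitrev16`), halfwords (`do_bitrev32`), and 32-bit halves
;; (`do_bitrev64`). A scalar sketch of the first step (the adjacent-bit swap
;; performed by `do_bitrev8`), assuming a 32-bit working type:
;;
;;    let lo = x & 0x55555555;           // even-indexed bits
;;    let hi = (x >> 1) & 0x55555555;    // odd-indexed bits
;;    let x = (lo << 1) | hi;            // swap each adjacent pair of bits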
| (decl do_bitrev8 (Type Gpr) Gpr) |
| (rule (do_bitrev8 ty src) |
| (let ((tymask u64 (ty_mask ty)) |
| (mask1 Gpr (imm ty (u64_and tymask 0x5555555555555555))) |
| (lo1 Gpr (x64_and ty src mask1)) |
| (hi1 Gpr (x64_and ty (x64_shr ty src (Imm8Reg.Imm8 1)) mask1)) |
| (swap1 Gpr (x64_or ty |
| (x64_shl ty lo1 (Imm8Reg.Imm8 1)) |
| hi1)) |
| (mask2 Gpr (imm ty (u64_and tymask 0x3333333333333333))) |
| (lo2 Gpr (x64_and ty swap1 mask2)) |
| (hi2 Gpr (x64_and ty (x64_shr ty swap1 (Imm8Reg.Imm8 2)) mask2)) |
| (swap2 Gpr (x64_or ty |
| (x64_shl ty lo2 (Imm8Reg.Imm8 2)) |
| hi2)) |
| (mask4 Gpr (imm ty (u64_and tymask 0x0f0f0f0f0f0f0f0f))) |
| (lo4 Gpr (x64_and ty swap2 mask4)) |
| (hi4 Gpr (x64_and ty (x64_shr ty swap2 (Imm8Reg.Imm8 4)) mask4)) |
| (swap4 Gpr (x64_or ty |
| (x64_shl ty lo4 (Imm8Reg.Imm8 4)) |
| hi4))) |
| swap4)) |
| |
| (decl do_bitrev16 (Type Gpr) Gpr) |
| (rule (do_bitrev16 ty src) |
| (let ((src_ Gpr (do_bitrev8 ty src)) |
| (tymask u64 (ty_mask ty)) |
| (mask8 Gpr (imm ty (u64_and tymask 0x00ff00ff00ff00ff))) |
| (lo8 Gpr (x64_and ty src_ mask8)) |
| (hi8 Gpr (x64_and ty (x64_shr ty src_ (Imm8Reg.Imm8 8)) mask8)) |
| (swap8 Gpr (x64_or ty |
| (x64_shl ty lo8 (Imm8Reg.Imm8 8)) |
| hi8))) |
| swap8)) |
| |
| (decl do_bitrev32 (Type Gpr) Gpr) |
| (rule (do_bitrev32 ty src) |
| (let ((src_ Gpr (do_bitrev16 ty src)) |
| (tymask u64 (ty_mask ty)) |
| (mask16 Gpr (imm ty (u64_and tymask 0x0000ffff0000ffff))) |
| (lo16 Gpr (x64_and ty src_ mask16)) |
| (hi16 Gpr (x64_and ty (x64_shr ty src_ (Imm8Reg.Imm8 16)) mask16)) |
| (swap16 Gpr (x64_or ty |
| (x64_shl ty lo16 (Imm8Reg.Imm8 16)) |
| hi16))) |
| swap16)) |
| |
| (decl do_bitrev64 (Type Gpr) Gpr) |
| (rule (do_bitrev64 ty @ $I64 src) |
| (let ((src_ Gpr (do_bitrev32 ty src)) |
| (mask32 Gpr (imm ty 0xffffffff)) |
| (lo32 Gpr (x64_and ty src_ mask32)) |
| (hi32 Gpr (x64_shr ty src_ (Imm8Reg.Imm8 32))) |
| (swap32 Gpr (x64_or ty |
| (x64_shl ty lo32 (Imm8Reg.Imm8 32)) |
| hi32))) |
| swap32)) |
| |
| ;; Rules for `bswap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
;; The x64 `bswap` instruction only supports 32- and 64-bit swaps, so
;; implement the 16-bit swap as a rotate-left by 8 instead.
| (rule (lower (has_type $I16 (bswap src))) |
| (x64_rotl $I16 src (Imm8Reg.Imm8 8))) |
| |
| (rule (lower (has_type $I32 (bswap src))) |
| (x64_bswap $I32 src)) |
| |
| (rule (lower (has_type $I64 (bswap src))) |
| (x64_bswap $I64 src)) |
| |
| (rule (lower (has_type $I128 (bswap src))) |
| (value_regs |
| (x64_bswap $I64 (value_regs_get_gpr src 1)) |
| (x64_bswap $I64 (value_regs_get_gpr src 0)))) |
| |
| ;; Rules for `is_null` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; Null references are represented by the constant value `0`. |
| (rule (lower (is_null src @ (value_type $R64))) |
| (with_flags |
| (x64_cmp_imm (OperandSize.Size64) 0 src) |
| (x64_setcc (CC.Z)))) |
| |
| ;; Rules for `is_invalid` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; Invalid references are represented by the constant value `-1`. |
| (rule (lower (is_invalid src @ (value_type $R64))) |
| (with_flags |
| (x64_cmp_imm (OperandSize.Size64) 0xffffffff src) ;; simm32 0xffff_ffff is sign-extended to -1. |
| (x64_setcc (CC.Z)))) |
| |
| |
| ;; Rules for `uextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; I{8,16,32,64} -> I128. |
| (rule (lower (has_type $I128 (uextend src))) |
| (value_regs (extend_to_gpr src $I64 (ExtendKind.Zero)) (imm $I64 0))) |
| |
| ;; I{8,16,32} -> I64. |
| (rule (lower (has_type $I64 (uextend src))) |
| (extend_to_gpr src $I64 (ExtendKind.Zero))) |
| |
| ;; I{8,16} -> I32 |
| ;; I8 -> I16 |
| (rule -1 (lower (has_type (fits_in_32 _) (uextend src))) |
| (extend_to_gpr src $I32 (ExtendKind.Zero))) |
| |
| ;; Rules for `sextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; I{8,16,32} -> I128. |
| ;; |
;; Produce the upper 64 bits sign-extended from the lower 64: arithmetically
;; shift right by 63 bits to spread the sign bit across the result.
| (rule (lower (has_type $I128 (sextend src))) |
| (let ((lo Gpr (extend_to_gpr src $I64 (ExtendKind.Sign))) |
| (hi Gpr (x64_sar $I64 lo (Imm8Reg.Imm8 63)))) |
| (value_regs lo hi))) |
| |
| ;; I{8,16,32} -> I64. |
| (rule (lower (has_type $I64 (sextend src))) |
| (extend_to_gpr src $I64 (ExtendKind.Sign))) |
| |
| ;; I{8,16} -> I32 |
| ;; I8 -> I16 |
| (rule -1 (lower (has_type (fits_in_32 _) (sextend src))) |
| (extend_to_gpr src $I32 (ExtendKind.Sign))) |
| |
| ;; Rules for `ireduce` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; T -> T is always a no-op, even I128 -> I128. |
| (rule (lower (has_type ty (ireduce src @ (value_type ty)))) |
| src) |
| |
| ;; T -> I{64,32,16,8}: We can simply pass through the value: values |
| ;; are always stored with high bits undefined, so we can just leave |
| ;; them be. |
| (rule 1 (lower (has_type (fits_in_64 ty) (ireduce src))) |
| (value_regs_get_gpr src 0)) |
| |
| ;; Rules for `debugtrap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (debugtrap)) |
| (side_effect (x64_hlt))) |
| |
| ;; Rules for `x86_pmaddubsw` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type $I16X8 (x86_pmaddubsw x y))) |
| (if-let $true (use_ssse3)) |
| (x64_pmaddubsw y x)) |
| |
| ;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type $F32 (fadd x y))) |
| (x64_addss x y)) |
| (rule (lower (has_type $F64 (fadd x y))) |
| (x64_addsd x y)) |
| (rule (lower (has_type $F32X4 (fadd x y))) |
| (x64_addps x y)) |
| (rule (lower (has_type $F64X2 (fadd x y))) |
| (x64_addpd x y)) |
| |
| ;; The above rules automatically sink loads for rhs operands, so additionally |
| ;; add rules for sinking loads with lhs operands. |
| (rule 1 (lower (has_type $F32 (fadd (sinkable_load x) y))) |
| (x64_addss y x)) |
| (rule 1 (lower (has_type $F64 (fadd (sinkable_load x) y))) |
| (x64_addsd y x)) |
| (rule 1 (lower (has_type $F32X4 (fadd (sinkable_load x) y))) |
| (x64_addps y x)) |
| (rule 1 (lower (has_type $F64X2 (fadd (sinkable_load x) y))) |
| (x64_addpd y x)) |
| |
| ;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type $F32 (fsub x y))) |
| (x64_subss x y)) |
| (rule (lower (has_type $F64 (fsub x y))) |
| (x64_subsd x y)) |
| (rule (lower (has_type $F32X4 (fsub x y))) |
| (x64_subps x y)) |
| (rule (lower (has_type $F64X2 (fsub x y))) |
| (x64_subpd x y)) |
| |
| ;; Rules for `fmul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type $F32 (fmul x y))) |
| (x64_mulss x y)) |
| (rule (lower (has_type $F64 (fmul x y))) |
| (x64_mulsd x y)) |
| (rule (lower (has_type $F32X4 (fmul x y))) |
| (x64_mulps x y)) |
| (rule (lower (has_type $F64X2 (fmul x y))) |
| (x64_mulpd x y)) |
| |
| ;; The above rules automatically sink loads for rhs operands, so additionally |
| ;; add rules for sinking loads with lhs operands. |
| (rule 1 (lower (has_type $F32 (fmul (sinkable_load x) y))) |
| (x64_mulss y x)) |
| (rule 1 (lower (has_type $F64 (fmul (sinkable_load x) y))) |
| (x64_mulsd y x)) |
| (rule 1 (lower (has_type $F32X4 (fmul (sinkable_load x) y))) |
| (x64_mulps y x)) |
| (rule 1 (lower (has_type $F64X2 (fmul (sinkable_load x) y))) |
| (x64_mulpd y x)) |
| |
| ;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type $F32 (fdiv x y))) |
| (x64_divss x y)) |
| (rule (lower (has_type $F64 (fdiv x y))) |
| (x64_divsd x y)) |
| (rule (lower (has_type $F32X4 (fdiv x y))) |
| (x64_divps x y)) |
| (rule (lower (has_type $F64X2 (fdiv x y))) |
| (x64_divpd x y)) |
| |
| ;; Rules for `sqrt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| (rule (lower (has_type $F32 (sqrt x))) |
| (x64_sqrtss x)) |
| (rule (lower (has_type $F64 (sqrt x))) |
| (x64_sqrtsd x)) |
| (rule (lower (has_type $F32X4 (sqrt x))) |
| (x64_sqrtps x)) |
| (rule (lower (has_type $F64X2 (sqrt x))) |
| (x64_sqrtpd x)) |
| |
| ;; Rules for `fpromote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| (rule (lower (has_type $F64 (fpromote x))) |
| (x64_cvtss2sd x)) |
| |
| ;; Rules for `fvpromote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| (rule (lower (has_type $F64X2 (fvpromote_low x))) |
| (x64_cvtps2pd (put_in_xmm x))) |
| |
| ;; Rules for `fdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| (rule (lower (has_type $F32 (fdemote x))) |
| (x64_cvtsd2ss x)) |
| |
| ;; Rules for `fvdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| (rule (lower (has_type $F32X4 (fvdemote x))) |
| (x64_cvtpd2ps x)) |
| |
| ;; Rules for `fmin` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type $F32 (fmin x y))) |
| (xmm_min_max_seq $F32 $true x y)) |
| (rule (lower (has_type $F64 (fmin x y))) |
| (xmm_min_max_seq $F64 $true x y)) |
| |
;; Vector-typed version. We don't use a single pseudoinstruction as
;; above, because we don't need to generate a mini-CFG here. Instead, we
;; perform a branchless series of operations.
| ;; |
| ;; We cannot simply use native min instructions (minps, minpd) because |
| ;; NaN handling is different per CLIF semantics than on |
| ;; x86. Specifically, if an argument is NaN, or the arguments are both |
| ;; zero but of opposite signs, then the x86 instruction always |
| ;; produces the second argument. However, per CLIF semantics, we |
| ;; require that fmin(NaN, _) = fmin(_, NaN) = NaN, and fmin(+0, -0) = |
| ;; fmin(-0, +0) = -0. |
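;;
;; As a reference, a scalar Rust sketch of the per-lane semantics described
;; above (the vector sequence below computes this branchlessly):
;;
;;    fn fmin(a: f32, b: f32) -> f32 {
;;        if a.is_nan() || b.is_nan() {
;;            f32::NAN                                 // a quiet NaN
;;        } else if a == 0.0 && b == 0.0 {
;;            // +0 and -0 compare equal; prefer -0 if either input is -0.
;;            if a.is_sign_negative() { a } else { b }
;;        } else if a < b {
;;            a
;;        } else {
;;            b
;;        }
;;    }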
| |
| (rule (lower (has_type $F32X4 (fmin x y))) |
| ;; Compute min(x, y) and min(y, x) with native |
| ;; instructions. These will differ in one of the edge cases |
| ;; above that we have to handle properly. (Conversely, if they |
| ;; don't differ, then the native instruction's answer is the |
| ;; right one per CLIF semantics.) |
| (let ((min1 Xmm (x64_minps x y)) |
| (min2 Xmm (x64_minps y x)) |
| ;; Compute the OR of the two. Note that NaNs have an |
| ;; exponent field of all-ones (0xFF for F32), so if either |
| ;; result is a NaN, this OR will be. And if either is a |
| ;; zero (which has an exponent of 0 and mantissa of 0), |
| ;; this captures a sign-bit of 1 (negative) if either |
| ;; input is negative. |
| ;; |
| ;; In the case where we don't have a +/-0 mismatch or |
| ;; NaNs, then `min1` and `min2` are equal and `min_or` is |
| ;; the correct minimum. |
| (min_or Xmm (x64_orps min1 min2)) |
| ;; "compare unordered" produces a true mask (all ones) in |
| ;; a given lane if the min is a NaN. We use this to |
| ;; generate a mask to ensure quiet NaNs. |
| (is_nan_mask Xmm (x64_cmpps min_or min2 (FcmpImm.Unordered))) |
| ;; OR in the NaN mask. |
| (min_or_2 Xmm (x64_orps min_or is_nan_mask)) |
| ;; Shift the NaN mask down so that it covers just the |
| ;; fraction below the NaN signalling bit; we'll use this |
| ;; to mask off non-canonical NaN payloads. |
| ;; |
| ;; All-ones for NaN, shifted down to leave 10 top bits (1 |
| ;; sign, 8 exponent, 1 QNaN bit that must remain set) |
| ;; cleared. |
| (nan_fraction_mask Xmm (x64_psrld is_nan_mask (xmi_imm 10))) |
;; Do an AND-NOT (`andnps` computes `!mask & value`), so
;; that we retain every bit not set in
;; `nan_fraction_mask`. This mask will be all zeroes (so
;; we retain every bit) in non-NaN cases, and will have
;; ones (so we clear those bits) in NaN-payload bits
;; otherwise.
| (final Xmm (x64_andnps nan_fraction_mask min_or_2))) |
| final)) |
| |
| ;; Likewise for F64 lanes, except that the right-shift is by 13 bits |
| ;; (1 sign, 11 exponent, 1 QNaN bit). |
| (rule (lower (has_type $F64X2 (fmin x y))) |
| (let ((min1 Xmm (x64_minpd x y)) |
| (min2 Xmm (x64_minpd y x)) |
| (min_or Xmm (x64_orpd min1 min2)) |
| (is_nan_mask Xmm (x64_cmppd min1 min2 (FcmpImm.Unordered))) |
| (min_or_2 Xmm (x64_orpd min_or is_nan_mask)) |
| (nan_fraction_mask Xmm (x64_psrlq is_nan_mask (xmi_imm 13))) |
| (final Xmm (x64_andnpd nan_fraction_mask min_or_2))) |
| final)) |
| |
| ;; Rules for `fmax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type $F32 (fmax x y))) |
| (xmm_min_max_seq $F32 $false x y)) |
| (rule (lower (has_type $F64 (fmax x y))) |
| (xmm_min_max_seq $F64 $false x y)) |
| |
| ;; The vector version of fmax here is a dual to the fmin sequence |
| ;; above, almost, with a few differences. |
| |
| (rule (lower (has_type $F32X4 (fmax x y))) |
| ;; Compute max(x, y) and max(y, x) with native |
| ;; instructions. These will differ in one of the edge cases |
| ;; above that we have to handle properly. (Conversely, if they |
| ;; don't differ, then the native instruction's answer is the |
| ;; right one per CLIF semantics.) |
| (let ((max1 Xmm (x64_maxps x y)) |
| (max2 Xmm (x64_maxps y x)) |
;; Compute the XOR of the two maxima. In the case
;; where we don't have a +/-0 mismatch or NaNs, then
;; `max1` and `max2` are equal and this XOR is zero.
| (max_xor Xmm (x64_xorps max1 max2)) |
| ;; OR the XOR into one of the original maxima. If they are |
| ;; equal, this does nothing. If max2 was NaN, its exponent |
| ;; bits were all-ones, so the xor's exponent bits were the |
| ;; complement of max1, and the OR of max1 and max_xor has |
| ;; an all-ones exponent (is a NaN). If max1 was NaN, then |
| ;; its exponent bits were already all-ones, so the OR will |
| ;; be a NaN as well. |
| (max_blended_nan Xmm (x64_orps max1 max_xor)) |
| ;; Subtract the XOR. This ensures that if we had +0 and |
| ;; -0, we end up with +0. |
| (max_blended_nan_positive Xmm (x64_subps max_blended_nan max_xor)) |
| ;; "compare unordered" produces a true mask (all ones) in |
| ;; a given lane if the min is a NaN. We use this to |
| ;; generate a mask to ensure quiet NaNs. |
| (is_nan_mask Xmm (x64_cmpps max_blended_nan max_blended_nan (FcmpImm.Unordered))) |
| ;; Shift the NaN mask down so that it covers just the |
| ;; fraction below the NaN signalling bit; we'll use this |
| ;; to mask off non-canonical NaN payloads. |
| ;; |
| ;; All-ones for NaN, shifted down to leave 10 top bits (1 |
| ;; sign, 8 exponent, 1 QNaN bit that must remain set) |
| ;; cleared. |
| (nan_fraction_mask Xmm (x64_psrld is_nan_mask (xmi_imm 10))) |
;; Do an AND-NOT (`andnps` computes `!mask & value`), so
;; that we retain every bit not set in
;; `nan_fraction_mask`. This mask will be all zeroes (so
;; we retain every bit) in non-NaN cases, and will have
;; ones (so we clear those bits) in NaN-payload bits
;; otherwise.
| (final Xmm (x64_andnps nan_fraction_mask max_blended_nan_positive))) |
| final)) |
| |
| (rule (lower (has_type $F64X2 (fmax x y))) |
| ;; Compute max(x, y) and max(y, x) with native |
| ;; instructions. These will differ in one of the edge cases |
| ;; above that we have to handle properly. (Conversely, if they |
| ;; don't differ, then the native instruction's answer is the |
| ;; right one per CLIF semantics.) |
| (let ((max1 Xmm (x64_maxpd x y)) |
| (max2 Xmm (x64_maxpd y x)) |
;; Compute the XOR of the two maxima. In the case
;; where we don't have a +/-0 mismatch or NaNs, then
;; `max1` and `max2` are equal and this XOR is zero.
| (max_xor Xmm (x64_xorpd max1 max2)) |
| ;; OR the XOR into one of the original maxima. If they are |
| ;; equal, this does nothing. If max2 was NaN, its exponent |
| ;; bits were all-ones, so the xor's exponent bits were the |
| ;; complement of max1, and the OR of max1 and max_xor has |
| ;; an all-ones exponent (is a NaN). If max1 was NaN, then |
| ;; its exponent bits were already all-ones, so the OR will |
| ;; be a NaN as well. |
| (max_blended_nan Xmm (x64_orpd max1 max_xor)) |
| ;; Subtract the XOR. This ensures that if we had +0 and |
| ;; -0, we end up with +0. |
| (max_blended_nan_positive Xmm (x64_subpd max_blended_nan max_xor)) |
;; `cmppd` with predicate index `3` is `cmpunordpd`, or
;; "compare unordered": it produces a true mask (all ones)
;; in a given lane if the max is a NaN. We use this to
;; generate a mask to ensure quiet NaNs.
| (is_nan_mask Xmm (x64_cmppd max_blended_nan max_blended_nan (FcmpImm.Unordered))) |
| ;; Shift the NaN mask down so that it covers just the |
| ;; fraction below the NaN signalling bit; we'll use this |
| ;; to mask off non-canonical NaN payloads. |
| ;; |
| ;; All-ones for NaN, shifted down to leave 13 top bits (1 |
| ;; sign, 11 exponent, 1 QNaN bit that must remain set) |
| ;; cleared. |
| (nan_fraction_mask Xmm (x64_psrlq is_nan_mask (xmi_imm 13))) |
;; Do an AND-NOT (`andnpd` computes `!mask & value`), so
;; that we retain every bit not set in
;; `nan_fraction_mask`. This mask will be all zeroes (so
;; we retain every bit) in non-NaN cases, and will have
;; ones (so we clear those bits) in NaN-payload bits
;; otherwise.
| (final Xmm (x64_andnpd nan_fraction_mask max_blended_nan_positive))) |
| final)) |
| |
| ;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
;; The base case for `fma` is to call out to one of two libcalls. Vectors are
;; decomposed, each element is handled individually, and the results are then
;; recomposed.
| |
| (rule (lower (has_type $F32 (fma x y z))) |
| (libcall_3 (LibCall.FmaF32) x y z)) |
| (rule (lower (has_type $F64 (fma x y z))) |
| (libcall_3 (LibCall.FmaF64) x y z)) |
| |
| (rule (lower (has_type $F32X4 (fma x y z))) |
| (let ( |
| (x Xmm (put_in_xmm x)) |
| (y Xmm (put_in_xmm y)) |
| (z Xmm (put_in_xmm z)) |
| (x0 Xmm (libcall_3 (LibCall.FmaF32) x y z)) |
| (x1 Xmm (libcall_3 (LibCall.FmaF32) |
| (x64_pshufd x 1) |
| (x64_pshufd y 1) |
| (x64_pshufd z 1))) |
| (x2 Xmm (libcall_3 (LibCall.FmaF32) |
| (x64_pshufd x 2) |
| (x64_pshufd y 2) |
| (x64_pshufd z 2))) |
| (x3 Xmm (libcall_3 (LibCall.FmaF32) |
| (x64_pshufd x 3) |
| (x64_pshufd y 3) |
| (x64_pshufd z 3))) |
| |
| (tmp Xmm (vec_insert_lane $F32X4 x0 x1 1)) |
| (tmp Xmm (vec_insert_lane $F32X4 tmp x2 2)) |
| (tmp Xmm (vec_insert_lane $F32X4 tmp x3 3)) |
| ) |
| tmp)) |
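;; Same idea for `f64x2`: the second element of each operand is brought down
;; to the low lane with `pshufd` and immediate `0xee` (i32 lane selectors
;; [2, 3, 2, 3]), which places each source's upper 64 bits in the lower half.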
| (rule (lower (has_type $F64X2 (fma x y z))) |
| (let ( |
| (x Xmm (put_in_xmm x)) |
| (y Xmm (put_in_xmm y)) |
| (z Xmm (put_in_xmm z)) |
| (x0 Xmm (libcall_3 (LibCall.FmaF64) x y z)) |
| (x1 Xmm (libcall_3 (LibCall.FmaF64) |
| (x64_pshufd x 0xee) |
| (x64_pshufd y 0xee) |
| (x64_pshufd z 0xee))) |
| ) |
| (vec_insert_lane $F64X2 x0 x1 1))) |
| |
| |
| ;; Special case for when the `fma` feature is active and a native instruction |
| ;; can be used. |
| (rule 1 (lower (has_type ty (fma x y z))) |
| (if-let $true (use_fma)) |
| (fmadd ty x y z)) |
| |
| (decl fmadd (Type Value Value Value) Xmm) |
| (decl fnmadd (Type Value Value Value) Xmm) |
| |
| ;; Base case. Note that this will automatically sink a load with `z`, the value |
| ;; to add. |
| (rule (fmadd ty x y z) (x64_vfmadd213 ty x y z)) |
| |
| ;; Allow sinking loads with one of the two values being multiplied in addition |
| ;; to the value being added. Note that both x and y can be sunk here due to |
| ;; multiplication being commutative. |
| (rule 1 (fmadd ty (sinkable_load x) y z) (x64_vfmadd132 ty y z x)) |
| (rule 2 (fmadd ty x (sinkable_load y) z) (x64_vfmadd132 ty x z y)) |
| |
| ;; If one of the values being multiplied is negated then use a `vfnmadd*` |
| ;; instruction instead |
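;; (`vfnmadd*` computes `-(x * y) + z`, so folding the negation of a
;; multiplicand into the instruction is exact.)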
| (rule 3 (fmadd ty (fneg x) y z) (fnmadd ty x y z)) |
| (rule 4 (fmadd ty x (fneg y) z) (fnmadd ty x y z)) |
| |
| (rule (fnmadd ty x y z) (x64_vfnmadd213 ty x y z)) |
| (rule 1 (fnmadd ty (sinkable_load x) y z) (x64_vfnmadd132 ty y z x)) |
| (rule 2 (fnmadd ty x (sinkable_load y) z) (x64_vfnmadd132 ty x z y)) |
| |
;; Like `fmadd`: if one of the multiplicands is negated, switch back to
;; `fmadd`, since the two negations cancel.
| (rule 3 (fnmadd ty (fneg x) y z) (fmadd ty x y z)) |
| (rule 4 (fnmadd ty x (fneg y) z) (fmadd ty x y z)) |
| |
| ;; Rules for `load*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; In order to load a value from memory to a GPR register, we may need to extend |
| ;; the loaded value from 8-, 16-, or 32-bits to this backend's expected GPR |
| ;; width: 64 bits. Note that `ext_mode` will load 1-bit types (booleans) as |
| ;; 8-bit loads. |
| ;; |
| ;; By default, we zero-extend all sub-64-bit loads to a GPR. |
| (rule -4 (lower (has_type (and (fits_in_32 ty) (is_gpr_type _)) (load flags address offset))) |
| (x64_movzx (ext_mode (ty_bits_u16 ty) 64) (to_amode flags address offset))) |
| ;; But if we know that both the `from` and `to` are 64 bits, we simply load with |
| ;; no extension. |
| (rule -1 (lower (has_type (ty_int_ref_64 ty) (load flags address offset))) |
| (x64_mov (to_amode flags address offset))) |
| ;; Also, certain scalar loads have a specific `from` width and extension kind |
| ;; (signed -> `sx`, zeroed -> `zx`). We overwrite the high bits of the 64-bit |
| ;; GPR even if the `to` type is smaller (e.g., 16-bits). |
| (rule (lower (has_type (is_gpr_type ty) (uload8 flags address offset))) |
| (x64_movzx (ExtMode.BQ) (to_amode flags address offset))) |
| (rule (lower (has_type (is_gpr_type ty) (sload8 flags address offset))) |
| (x64_movsx (ExtMode.BQ) (to_amode flags address offset))) |
| (rule (lower (has_type (is_gpr_type ty) (uload16 flags address offset))) |
| (x64_movzx (ExtMode.WQ) (to_amode flags address offset))) |
| (rule (lower (has_type (is_gpr_type ty) (sload16 flags address offset))) |
| (x64_movsx (ExtMode.WQ) (to_amode flags address offset))) |
| (rule (lower (has_type (is_gpr_type ty) (uload32 flags address offset))) |
| (x64_movzx (ExtMode.LQ) (to_amode flags address offset))) |
| (rule (lower (has_type (is_gpr_type ty) (sload32 flags address offset))) |
| (x64_movsx (ExtMode.LQ) (to_amode flags address offset))) |
| |
| ;; To load to XMM registers, we use the x64-specific instructions for each type. |
| ;; For `$F32` and `$F64` this is important--we only want to load 32 or 64 bits. |
| ;; But for the 128-bit types, this is not strictly necessary for performance but |
| ;; might help with clarity during disassembly. |
| (rule (lower (has_type $F32 (load flags address offset))) |
| (x64_movss_load (to_amode flags address offset))) |
| (rule (lower (has_type $F64 (load flags address offset))) |
| (x64_movsd_load (to_amode flags address offset))) |
| (rule (lower (has_type $F32X4 (load flags address offset))) |
| (x64_movups_load (to_amode flags address offset))) |
| (rule (lower (has_type $F64X2 (load flags address offset))) |
| (x64_movupd_load (to_amode flags address offset))) |
| (rule -2 (lower (has_type (ty_vec128 ty) (load flags address offset))) |
| (x64_movdqu_load (to_amode flags address offset))) |
| |
| ;; We can load an I128 by doing two 64-bit loads. |
| (rule -3 (lower (has_type $I128 |
| (load flags address offset))) |
| (let ((addr_lo Amode (to_amode flags address offset)) |
| (addr_hi Amode (amode_offset addr_lo 8)) |
| (value_lo Reg (x64_mov addr_lo)) |
| (value_hi Reg (x64_mov addr_hi))) |
| (value_regs value_lo value_hi))) |
| |
| ;; We also include widening vector loads; these sign- or zero-extend each lane |
| ;; to the next wider width (e.g., 16x4 -> 32x4). |
| (rule 1 (lower (has_type $I16X8 (sload8x8 flags address offset))) |
| (if-let $true (use_sse41)) |
| (x64_pmovsxbw (to_amode flags address offset))) |
| (rule 1 (lower (has_type $I16X8 (uload8x8 flags address offset))) |
| (if-let $true (use_sse41)) |
| (x64_pmovzxbw (to_amode flags address offset))) |
| (rule 1 (lower (has_type $I32X4 (sload16x4 flags address offset))) |
| (if-let $true (use_sse41)) |
| (x64_pmovsxwd (to_amode flags address offset))) |
| (rule 1 (lower (has_type $I32X4 (uload16x4 flags address offset))) |
| (if-let $true (use_sse41)) |
| (x64_pmovzxwd (to_amode flags address offset))) |
| (rule 1 (lower (has_type $I64X2 (sload32x2 flags address offset))) |
| (if-let $true (use_sse41)) |
| (x64_pmovsxdq (to_amode flags address offset))) |
| (rule 1 (lower (has_type $I64X2 (uload32x2 flags address offset))) |
| (if-let $true (use_sse41)) |
| (x64_pmovzxdq (to_amode flags address offset))) |
| |
| (rule (lower (has_type $I16X8 (sload8x8 flags address offset))) |
| (lower_swiden_low $I16X8 (x64_movq_to_xmm (to_amode flags address offset)))) |
| (rule (lower (has_type $I16X8 (uload8x8 flags address offset))) |
| (lower_uwiden_low $I16X8 (x64_movq_to_xmm (to_amode flags address offset)))) |
| (rule (lower (has_type $I32X4 (sload16x4 flags address offset))) |
| (lower_swiden_low $I32X4 (x64_movq_to_xmm (to_amode flags address offset)))) |
| (rule (lower (has_type $I32X4 (uload16x4 flags address offset))) |
| (lower_uwiden_low $I32X4 (x64_movq_to_xmm (to_amode flags address offset)))) |
| (rule (lower (has_type $I64X2 (sload32x2 flags address offset))) |
| (lower_swiden_low $I64X2 (x64_movq_to_xmm (to_amode flags address offset)))) |
| (rule (lower (has_type $I64X2 (uload32x2 flags address offset))) |
| (lower_uwiden_low $I64X2 (x64_movq_to_xmm (to_amode flags address offset)))) |
| |
| ;; Rules for `store*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; 8-, 16-, 32- and 64-bit GPR stores. |
| (rule -2 (lower (store flags |
| value @ (value_type (is_gpr_type ty)) |
| address |
| offset)) |
| (side_effect |
| (x64_movrm ty (to_amode flags address offset) value))) |
| |
| ;; Explicit 8/16/32-bit opcodes. |
| (rule (lower (istore8 flags value address offset)) |
| (side_effect |
| (x64_movrm $I8 (to_amode flags address offset) value))) |
| (rule (lower (istore16 flags value address offset)) |
| (side_effect |
| (x64_movrm $I16 (to_amode flags address offset) value))) |
| (rule (lower (istore32 flags value address offset)) |
| (side_effect |
| (x64_movrm $I32 (to_amode flags address offset) value))) |
| |
| ;; IMM stores |
| (rule 2 (lower (store flags (has_type (fits_in_64 ty) (iconst (simm32 value))) address offset)) |
| (side_effect |
| (x64_movimm_m ty (to_amode flags address offset) value))) |
| |
| ;; F32 stores of values in XMM registers. |
| (rule 1 (lower (store flags |
| value @ (value_type $F32) |
| address |
| offset)) |
| (side_effect |
| (x64_movss_store (to_amode flags address offset) value))) |
| |
| ;; F64 stores of values in XMM registers. |
| (rule 1 (lower (store flags |
| value @ (value_type $F64) |
| address |
| offset)) |
| (side_effect |
| (x64_movsd_store (to_amode flags address offset) value))) |
| |
| ;; Stores of F32X4 vectors. |
| (rule 1 (lower (store flags |
| value @ (value_type $F32X4) |
| address |
| offset)) |
| (side_effect |
| (x64_movups_store (to_amode flags address offset) value))) |
| |
| ;; Stores of F64X2 vectors. |
| (rule 1 (lower (store flags |
| value @ (value_type $F64X2) |
| address |
| offset)) |
| (side_effect |
| (x64_movupd_store (to_amode flags address offset) value))) |
| |
| ;; Stores of all other 128-bit vector types with integer lanes. |
| (rule -1 (lower (store flags |
| value @ (value_type (ty_vec128_int _)) |
| address |
| offset)) |
| (side_effect |
| (x64_movdqu_store (to_amode flags address offset) value))) |
| |
| ;; Stores of I128 values: store the two 64-bit halves separately. |
| (rule 0 (lower (store flags |
| value @ (value_type $I128) |
| address |
| offset)) |
| (let ((value_reg ValueRegs value) |
| (value_lo Gpr (value_regs_get_gpr value_reg 0)) |
| (value_hi Gpr (value_regs_get_gpr value_reg 1)) |
| (addr_lo Amode (to_amode flags address offset)) |
| (addr_hi Amode (amode_offset addr_lo 8))) |
| (side_effect |
| (side_effect_concat |
| (x64_movrm $I64 addr_lo value_lo) |
| (x64_movrm $I64 addr_hi value_hi))))) |
| |
| ;; Slightly optimize storing the first lane extracted from a vector. When lane |
| ;; 0 specifically is selected, the standard `movss` and `movsd` instructions |
| ;; can be used as if we were storing an f32 or f64, even though the source may |
| ;; be an integer vector, since the stored bits are the same either way. |
| (rule 2 (lower (store flags |
| (has_type $F32 (extractlane value (u8_from_uimm8 0))) |
| address |
| offset)) |
| (side_effect |
| (x64_movss_store (to_amode flags address offset) value))) |
| (rule 2 (lower (store flags |
| (has_type $F64 (extractlane value (u8_from_uimm8 0))) |
| address |
| offset)) |
| (side_effect |
| (x64_movsd_store (to_amode flags address offset) value))) |
| (rule 2 (lower (store flags |
| (has_type $I8 (extractlane value (u8_from_uimm8 n))) |
| address |
| offset)) |
| (if-let $true (use_sse41)) |
| (side_effect |
| (x64_pextrb_store (to_amode flags address offset) value n))) |
| (rule 2 (lower (store flags |
| (has_type $I16 (extractlane value (u8_from_uimm8 n))) |
| address |
| offset)) |
| (if-let $true (use_sse41)) |
| (side_effect |
| (x64_pextrw_store (to_amode flags address offset) value n))) |
| (rule 2 (lower (store flags |
| (has_type $I32 (extractlane value (u8_from_uimm8 n))) |
| address |
| offset)) |
| (if-let $true (use_sse41)) |
| (side_effect |
| (x64_pextrd_store (to_amode flags address offset) value n))) |
| (rule 2 (lower (store flags |
| (has_type $I64 (extractlane value (u8_from_uimm8 n))) |
| address |
| offset)) |
| (if-let $true (use_sse41)) |
| (side_effect |
| (x64_pextrq_store (to_amode flags address offset) value n))) |
| |
| ;; Rules for `load*` + ALU op + `store*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; Add mem, reg |
| (rule 3 (lower |
| (store flags |
| (has_type (ty_32_or_64 ty) |
| (iadd (and |
| (sinkable_load sink) |
| (load flags addr offset)) |
| src2)) |
| addr |
| offset)) |
| (let ((_ RegMemImm sink)) |
| (side_effect |
| (x64_add_mem ty (to_amode flags addr offset) src2)))) |
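| ;; As an illustrative note: for a 32-bit `ty` the rule above folds the load, |
| ;; add, and store into a single read-modify-write instruction along the lines |
| ;; of `add dword ptr [addr], src2`. The same shape applies to the swapped and |
| ;; `sub`/`and`/`or`/`xor` variants below. |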
| |
| ;; Add mem, reg with args swapped |
| (rule 2 (lower |
| (store flags |
| (has_type (ty_32_or_64 ty) |
| (iadd src2 |
| (and |
| (sinkable_load sink) |
| (load flags addr offset)))) |
| addr |
| offset)) |
| (let ((_ RegMemImm sink)) |
| (side_effect |
| (x64_add_mem ty (to_amode flags addr offset) src2)))) |
| |
| ;; Sub mem, reg |
| (rule 2 (lower |
| (store flags |
| (has_type (ty_32_or_64 ty) |
| (isub (and |
| (sinkable_load sink) |
| (load flags addr offset)) |
| src2)) |
| addr |
| offset)) |
| (let ((_ RegMemImm sink)) |
| (side_effect |
| (x64_sub_mem ty (to_amode flags addr offset) src2)))) |
| |
| ;; And mem, reg |
| (rule 3 (lower |
| (store flags |
| (has_type (ty_32_or_64 ty) |
| (band (and |
| (sinkable_load sink) |
| (load flags addr offset)) |
| src2)) |
| addr |
| offset)) |
| (let ((_ RegMemImm sink)) |
| (side_effect |
| (x64_and_mem ty (to_amode flags addr offset) src2)))) |
| |
| ;; And mem, reg with args swapped |
| (rule 2 (lower |
| (store flags |
| (has_type (ty_32_or_64 ty) |
| (band src2 |
| (and |
| (sinkable_load sink) |
| (load flags addr offset)))) |
| addr |
| offset)) |
| (let ((_ RegMemImm sink)) |
| (side_effect |
| (x64_and_mem ty (to_amode flags addr offset) src2)))) |
| |
| ;; Or mem, reg |
| (rule 3 (lower |
| (store flags |
| (has_type (ty_32_or_64 ty) |
| (bor (and |
| (sinkable_load sink) |
| (load flags addr offset)) |
| src2)) |
| addr |
| offset)) |
| (let ((_ RegMemImm sink)) |
| (side_effect |
| (x64_or_mem ty (to_amode flags addr offset) src2)))) |
| |
| ;; Or mem, reg with args swapped |
| (rule 2 (lower |
| (store flags |
| (has_type (ty_32_or_64 ty) |
| (bor src2 |
| (and |
| (sinkable_load sink) |
| (load flags addr offset)))) |
| addr |
| offset)) |
| (let ((_ RegMemImm sink)) |
| (side_effect |
| (x64_or_mem ty (to_amode flags addr offset) src2)))) |
| |
| ;; Xor mem, reg |
| (rule 3 (lower |
| (store flags |
| (has_type (ty_32_or_64 ty) |
| (bxor (and |
| (sinkable_load sink) |
| (load flags addr offset)) |
| src2)) |
| addr |
| offset)) |
| (let ((_ RegMemImm sink)) |
| (side_effect |
| (x64_xor_mem ty (to_amode flags addr offset) src2)))) |
| |
| ;; Xor mem, reg with args swapped |
| (rule 2 (lower |
| (store flags |
| (has_type (ty_32_or_64 ty) |
| (bxor src2 |
| (and |
| (sinkable_load sink) |
| (load flags addr offset)))) |
| addr |
| offset)) |
| (let ((_ RegMemImm sink)) |
| (side_effect |
| (x64_xor_mem ty (to_amode flags addr offset) src2)))) |
| |
| ;; Rules for `fence` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (fence)) |
| (side_effect (x64_mfence))) |
| |
| ;; Rules for `func_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (func_addr (func_ref_data _ extname dist))) |
| (load_ext_name extname 0 dist)) |
| |
| ;; Rules for `symbol_value` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (symbol_value (symbol_value_data extname dist offset))) |
| (load_ext_name extname offset dist)) |
| |
| ;; Rules for `atomic_load` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; This is a normal load. The x86-TSO memory model provides sufficient |
| ;; sequencing to satisfy the CLIF synchronisation requirements for `AtomicLoad` |
| ;; without the need for any fence instructions. |
| ;; |
| ;; As described in the `atomic_load` documentation, this lowering is only valid |
| ;; for I8, I16, I32, and I64. The sub-64-bit types are zero extended, as with a |
| ;; normal load. |
| (rule 1 (lower (has_type $I64 (atomic_load flags address))) |
| (x64_mov (to_amode flags address (zero_offset)))) |
| (rule (lower (has_type (and (fits_in_32 ty) (ty_int _)) (atomic_load flags address))) |
| (x64_movzx (ext_mode (ty_bits_u16 ty) 64) (to_amode flags address (zero_offset)))) |
| |
| ;; Rules for `atomic_store` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; This is a normal store followed by an `mfence` instruction. As described in |
| ;; the `atomic_load` documentation, this lowering is only valid for I8, I16, |
| ;; I32, and I64. |
| (rule (lower (atomic_store flags |
| value @ (value_type (and (fits_in_64 ty) (ty_int _))) |
| address)) |
| (side_effect (side_effect_concat |
| (x64_movrm ty (to_amode flags address (zero_offset)) value) |
| (x64_mfence)))) |
| |
| ;; Rules for `atomic_cas` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type (and (fits_in_64 ty) (ty_int _)) |
| (atomic_cas flags address expected replacement))) |
| (x64_cmpxchg ty expected replacement (to_amode flags address (zero_offset)))) |
| |
| ;; Rules for `atomic_rmw` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; This is a simple, general-case atomic update, based on a loop involving |
| ;; `cmpxchg`. Note that we could do much better than this in the case where the |
| ;; old value at the location (that is to say, the SSA `Value` computed by this |
| ;; CLIF instruction) is not required. In that case, we could instead implement |
| ;; this using a single `lock`-prefixed x64 read-modify-write instruction. Also, |
| ;; even in the case where the old value is required, for the `add` and `sub` |
| ;; cases, we can use the single instruction `lock xadd`. However, those |
| ;; improvements have been left for another day. TODO: filed as |
| ;; https://github.com/bytecodealliance/wasmtime/issues/2153. |
| |
| (rule (lower (has_type (and (fits_in_64 ty) (ty_int _)) |
| (atomic_rmw flags op address input))) |
| (x64_atomic_rmw_seq ty op (to_amode flags address (zero_offset)) input)) |
| |
| ;; Rules for `call` and `call_indirect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (call (func_ref_data sig_ref extname dist) inputs)) |
| (gen_call sig_ref extname dist inputs)) |
| |
| (rule (lower (call_indirect sig_ref val inputs)) |
| (gen_call_indirect sig_ref val inputs)) |
| |
| ;;;; Rules for `return_call` and `return_call_indirect` ;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (return_call (func_ref_data sig_ref extname dist) args)) |
| (gen_return_call sig_ref extname dist args)) |
| |
| (rule (lower (return_call_indirect sig_ref callee args)) |
| (gen_return_call_indirect sig_ref callee args)) |
| |
| ;;;; Rules for `get_{frame,stack}_pointer` and `get_return_address` ;;;;;;;;;;;; |
| |
| (rule (lower (get_frame_pointer)) |
| (x64_rbp)) |
| |
| (rule (lower (get_stack_pointer)) |
| (x64_rsp)) |
| |
| (rule (lower (get_return_address)) |
| (x64_load $I64 |
| (Amode.ImmReg 8 (x64_rbp) (mem_flags_trusted)) |
| (ExtKind.None))) |
| |
| ;; Rules for `jump` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower_branch (jump _) (single_target target)) |
| (emit_side_effect (jmp_known target))) |
| |
| ;; Rules for `brif` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule 2 (lower_branch (brif (maybe_uextend (icmp cc a b)) _ _) (two_targets then else)) |
| (emit_side_effect (jmp_cond_icmp (emit_cmp cc a b) then else))) |
| |
| (rule 2 (lower_branch (brif (maybe_uextend (fcmp cc a b)) _ _) (two_targets then else)) |
| (emit_side_effect (jmp_cond_fcmp (emit_fcmp cc a b) then else))) |
| |
| (rule 1 (lower_branch (brif val @ (value_type $I128) _ _) |
| (two_targets then else)) |
| (emit_side_effect (jmp_cond_icmp (cmp_zero_i128 (CC.Z) val) then else))) |
| |
| (rule (lower_branch (brif val @ (value_type (ty_int_bool_or_ref)) _ _) |
| (two_targets then else)) |
| (emit_side_effect (with_flags_side_effect |
| (cmp_zero_int_bool_ref val) |
| (jmp_cond (CC.NZ) then else)))) |
| |
| |
| ;; Compare an I128 value to zero, returning a flags result suitable for making a |
| ;; jump decision. The comparison is implemented as `(hi == 0) && (low == 0)`, |
| ;; and the resulting flags carry the opposite of their usual meaning: |
| ;; * CC.Z indicates that the value was non-zero, as one or both of the halves of |
| ;; the value were non-zero |
| ;; * CC.NZ indicates that both halves of the value were 0 |
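| ;; |
| ;; As a sketch of the sequence below (illustrative only): |
| ;;   lo_z = (lo == 0) ? 1 : 0   ;; setcc of CC.Z after comparing lo with 0 |
| ;;   hi_z = (hi == 0) ? 1 : 0   ;; setcc of CC.Z after comparing hi with 0 |
| ;;   test lo_z, hi_z            ;; ZF set <=> (lo_z & hi_z) == 0 <=> value != 0 |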
| (decl cmp_zero_i128 (CC ValueRegs) IcmpCondResult) |
| (rule (cmp_zero_i128 (cc_nz_or_z cc) val) |
| (let ((lo Gpr (value_regs_get_gpr val 0)) |
| (hi Gpr (value_regs_get_gpr val 1)) |
| (lo_z Gpr (with_flags_reg (x64_cmp (OperandSize.Size64) (RegMemImm.Imm 0) lo) |
| (x64_setcc (CC.Z)))) |
| (hi_z Gpr (with_flags_reg (x64_cmp (OperandSize.Size64) (RegMemImm.Imm 0) hi) |
| (x64_setcc (CC.Z))))) |
| (icmp_cond_result (x64_test (OperandSize.Size8) lo_z hi_z) cc))) |
| |
| |
| (decl cmp_zero_int_bool_ref (Value) ProducesFlags) |
| (rule (cmp_zero_int_bool_ref val @ (value_type ty)) |
| (let ((size OperandSize (raw_operand_size_of_type ty)) |
| (src Gpr val)) |
| (x64_test size src src))) |
| |
| ;; Rules for `br_table` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower_branch (br_table idx @ (value_type ty) _) (jump_table_targets default_target jt_targets)) |
| (let ((size OperandSize (raw_operand_size_of_type ty)) |
| (jt_size u32 (jump_table_size jt_targets)) |
| (size_reg Reg (imm ty (u32_as_u64 jt_size))) |
| (idx_reg Gpr (extend_to_gpr idx $I64 (ExtendKind.Zero))) |
| (clamped_idx Reg (with_flags_reg |
| (x64_cmp size size_reg idx_reg) |
| (cmove ty (CC.B) idx_reg size_reg)))) |
| (emit_side_effect (jmp_table_seq ty clamped_idx default_target jt_targets)))) |
| |
| ;; Rules for `select_spectre_guard` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (select_spectre_guard (icmp cc a b) x y)) |
| (select_icmp (emit_cmp cc a b) x y)) |
| |
| (rule -1 (lower (has_type ty (select_spectre_guard c @ (value_type (fits_in_64 a_ty)) x y))) |
| (let ((size OperandSize (raw_operand_size_of_type a_ty)) |
| (gpr_c Gpr (put_in_gpr c))) |
| (with_flags (x64_test size gpr_c gpr_c) (cmove_from_values ty (CC.NZ) x y)))) |
| |
| (rule -2 (lower (has_type ty (select_spectre_guard c @ (value_type $I128) x y))) |
| (let ((cond_result IcmpCondResult (cmp_zero_i128 (CC.Z) c))) |
| (select_icmp cond_result x y))) |
| |
| ;; Rules for `fcvt_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; Note that the `cvtsi2s{s,d}` instruction is not just an int-to-float |
| ;; conversion in isolation: it also copies the upper bits of an xmm register |
| ;; into its destination. We don't actually want that to happen, as it can |
| ;; accidentally create a false dependency on whichever previous instruction |
| ;; last defined the register's upper bits. See #7085 for an instance of this. |
| ;; |
| ;; This means that the first operand to all of the int-to-float conversions here |
| ;; are `(xmm_zero)` operands which is a guaranteed zero register that has no |
| ;; dependencies on other instructions. |
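| ;; |
| ;; As a hypothetical example of the problem: without the zeroing operand the |
| ;; emitted code would look like `cvtsi2ss xmm0, rax`, which merges the result |
| ;; into whatever previously lived in `xmm0`; emitting `xorps xmm0, xmm0` first |
| ;; (the `(xmm_zero ...)` operand below) breaks that dependency before the |
| ;; convert. |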
| ;; |
| ;; Ideally the zeroing would be lifted out to a higher level so that it could |
| ;; be deduplicated between consecutive int-to-float operations, but that's not |
| ;; easy to do at this time. One possibility would be a mid-end rule which |
| ;; rewrites `fcvt_from_sint` to an x86-specific opcode using a zero constant, |
| ;; which would then be subject to normal LICM, but that's not feasible today. |
| |
| (rule 2 (lower (has_type $F32 (fcvt_from_sint a @ (value_type $I8)))) |
| (x64_cvtsi2ss $I32 (xmm_zero $F32X4) (extend_to_gpr a $I32 (ExtendKind.Sign)))) |
| |
| (rule 2 (lower (has_type $F32 (fcvt_from_sint a @ (value_type $I16)))) |
| (x64_cvtsi2ss $I32 (xmm_zero $F32X4) (extend_to_gpr a $I32 (ExtendKind.Sign)))) |
| |
| (rule 1 (lower (has_type $F32 (fcvt_from_sint a @ (value_type (ty_int (fits_in_64 ty)))))) |
| (x64_cvtsi2ss ty (xmm_zero $F32X4) a)) |
| |
| (rule 2 (lower (has_type $F64 (fcvt_from_sint a @ (value_type $I8)))) |
| (x64_cvtsi2sd $I32 (xmm_zero $F64X2) (extend_to_gpr a $I32 (ExtendKind.Sign)))) |
| |
| (rule 2 (lower (has_type $F64 (fcvt_from_sint a @ (value_type $I16)))) |
| (x64_cvtsi2sd $I32 (xmm_zero $F64X2) (extend_to_gpr a $I32 (ExtendKind.Sign)))) |
| |
| (rule 1 (lower (has_type $F64 (fcvt_from_sint a @ (value_type (ty_int (fits_in_64 ty)))))) |
| (x64_cvtsi2sd ty (xmm_zero $F64X2) a)) |
| |
| (rule 0 (lower (fcvt_from_sint a @ (value_type $I32X4))) |
| (x64_cvtdq2ps a)) |
| |
| (rule 1 (lower (has_type $F64X2 (fcvt_from_sint (swiden_low a @ (value_type $I32X4))))) |
| (x64_cvtdq2pd a)) |
| |
| ;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule 1 (lower (has_type $F32 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty)))))) |
| (x64_cvtsi2ss $I64 (xmm_zero $F32X4) (extend_to_gpr val $I64 (ExtendKind.Zero)))) |
| |
| (rule 1 (lower (has_type $F64 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty)))))) |
| (x64_cvtsi2sd $I64 (xmm_zero $F64X2) (extend_to_gpr val $I64 (ExtendKind.Zero)))) |
| |
| (rule (lower (has_type ty (fcvt_from_uint val @ (value_type $I64)))) |
| (cvt_u64_to_float_seq ty val)) |
| |
| ;; The algorithm uses `unpcklps` to construct a double that is equal to |
| ;; 0x1.0p52 + double(src). 0x1.0p52 is special because at this exponent |
| ;; every value of the mantissa represents a corresponding uint32 number. |
| ;; Subtracting 0x1.0p52 then leaves exactly double(src). |
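| ;; For example (illustrative arithmetic only): for a lane holding the u32 |
| ;; value 7, interleaving with 0x43300000 produces the f64 bit pattern |
| ;; 0x4330000000000007, which is the double 2^52 + 7; subtracting 0x1.0p52 |
| ;; (bit pattern 0x4330000000000000) then yields exactly 7.0. |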
| (rule 1 (lower (has_type $F64X2 (fcvt_from_uint (uwiden_low val @ (value_type $I32X4))))) |
| (let ((uint_mask XmmMem (emit_u128_le_const 0x43300000_43300000)) |
| (res Xmm (x64_unpcklps val uint_mask)) |
| (uint_mask_high XmmMem (emit_u128_le_const 0x4330000000000000_4330000000000000))) |
| (x64_subpd res uint_mask_high))) |
| |
| ;; When AVX512VL and AVX512F are available, |
| ;; `fcvt_from_uint` can be lowered to a single instruction. |
| (rule 2 (lower (has_type $F32X4 (fcvt_from_uint src))) |
| (if-let $true (use_avx512vl)) |
| (if-let $true (use_avx512f)) |
| (x64_vcvtudq2ps src)) |
| |
| ;; Converting packed unsigned integers to packed floats |
| ;; requires a few steps. There is no single-instruction |
| ;; lowering for converting unsigned integers, but there is |
| ;; one for converting packed signed integers to float |
| ;; (cvtdq2ps). In the steps below we isolate the upper half |
| ;; (16 bits) and lower half (16 bits) of each lane and then |
| ;; convert each half separately using cvtdq2ps, which is |
| ;; meant for signed integers. For this to work on the upper |
| ;; half we must first shift those bits right by 1 (divide by |
| ;; 2) so that the most significant bit is 0 and the value is |
| ;; not treated as negative, and then double the value after |
| ;; the conversion. Finally we add the two converted halves, |
| ;; and the addition rounds the result correctly. |
| ;; |
| ;; Sequence: |
| ;; -> A = 0xffffffff |
| ;; -> Ah = 0xffff0000 |
| ;; -> Al = 0x0000ffff |
| ;; -> Convert(Al) // Convert int to float |
| ;; -> Ah = Ah >> 1 // Shift right 1 to ensure Ah's conversion isn't treated as signed |
| ;; -> Convert(Ah) // Convert Ah, with no loss of significant digits from the previous shift |
| ;; -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion. |
| ;; -> dst = Ah + Al // Add the two floats together |
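| ;; |
| ;; As a worked example (illustrative only), for a lane holding 0xFFFFFFFF: |
| ;; Al = 0x0000FFFF -> 65535.0; Ah = 0xFFFF0000, shifted to 0x7FFF8000 -> |
| ;; 2147450880.0, doubled to 4294901760.0; the final addition |
| ;; 4294901760.0 + 65535.0 rounds to 4294967296.0, the nearest f32 to |
| ;; 4294967295. |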
| (rule 1 (lower (has_type $F32X4 (fcvt_from_uint val))) |
| (let ((a Xmm val) |
| |
| ;; get the low 16 bits |
| (a_lo Xmm (x64_pslld a (xmi_imm 16))) |
| (a_lo Xmm (x64_psrld a_lo (xmi_imm 16))) |
| |
| ;; get the high 16 bits |
| (a_hi Xmm (x64_psubd a a_lo)) |
| |
| ;; convert the low 16 bits |
| (a_lo Xmm (x64_cvtdq2ps a_lo)) |
| |
| ;; shift the high bits by 1, convert, and double to get the correct |
| ;; value |
| (a_hi Xmm (x64_psrld a_hi (xmi_imm 1))) |
| (a_hi Xmm (x64_cvtdq2ps a_hi)) |
| (a_hi Xmm (x64_addps a_hi a_hi))) |
| |
| ;; add together the two converted values |
| (x64_addps a_hi a_lo))) |
| |
| ;; Rules for `fcvt_to_uint` and `fcvt_to_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type out_ty (fcvt_to_uint val @ (value_type (ty_scalar_float _))))) |
| (cvt_float_to_uint_seq out_ty val $false)) |
| |
| (rule (lower (has_type out_ty (fcvt_to_uint_sat val @ (value_type (ty_scalar_float _))))) |
| (cvt_float_to_uint_seq out_ty val $true)) |
| |
| (rule (lower (has_type out_ty (fcvt_to_sint val @ (value_type (ty_scalar_float _))))) |
| (cvt_float_to_sint_seq out_ty val $false)) |
| |
| (rule (lower (has_type out_ty (fcvt_to_sint_sat val @ (value_type (ty_scalar_float _))))) |
| (cvt_float_to_sint_seq out_ty val $true)) |
| |
| ;; The x64 backend currently only supports these two type combinations. |
| (rule 1 (lower (has_type $I32X4 (fcvt_to_sint_sat val @ (value_type $F32X4)))) |
| (let ((src Xmm val) |
| |
| ;; Sets tmp to zero if float is NaN |
| (tmp Xmm (x64_cmpps src src (FcmpImm.Equal))) |
| (dst Xmm (x64_andps src tmp)) |
| |
| ;; Sets top bit of tmp if float is positive |
| ;; Setting up to set top bit on negative float values |
| (tmp Xmm (x64_pxor tmp dst)) |
| |
| ;; Convert the packed float to packed doubleword. |
| (dst Xmm (x64_cvttps2dq dst)) |
| |
| ;; Set top bit only if < 0 |
| (tmp Xmm (x64_pand dst tmp)) |
| (tmp Xmm (x64_psrad tmp (xmi_imm 31)))) |
| |
| ;; On overflow 0x80000000 is returned to a lane. |
| ;; Below sets positive overflow lanes to 0x7FFFFFFF |
| ;; Keeps negative overflow lanes as is. |
| (x64_pxor tmp dst))) |
| |
| ;; The algorithm for converting floats to unsigned ints is a little tricky. The |
| ;; complication arises because our conversion instruction (cvttps2dq) produces a signed |
| ;; 32-bit int, whose positive range is 0..INT_MAX (0x0..0x7FFFFFFF), while the unsigned |
| ;; result we want also covers the extended range (INT_MAX+1)..UINT_MAX. It's this range from |
| ;; (INT_MAX+1)..UINT_MAX (0x80000000..0xFFFFFFFF) that needs to be accounted for as a special |
| ;; case since cvttps2dq only converts values as high as INT_MAX (0x7FFFFFFF), but |
| ;; it conveniently sets underflows and overflows (smaller than MIN_INT or larger than |
| ;; MAX_INT) to INT_MAX+1 (0x80000000). Noting that the range (INT_MAX+1)..UINT_MAX includes |
| ;; precisely INT_MAX values, we can correctly account for and convert every value in this range |
| ;; if we simply subtract INT_MAX+1 before doing the cvttps2dq conversion. After the subtraction, |
| ;; every value originally in (INT_MAX+1)..UINT_MAX is now in the range (0..INT_MAX). |
| ;; After the conversion we add INT_MAX+1 back to this converted value, noting again that the |
| ;; values we are trying to account for were already set to INT_MAX+1 during the original conversion. |
| ;; We simply have to create a mask and make sure we are adding together only the lanes that need |
| ;; to be accounted for. Digesting it all, the steps are: |
| ;; |
| ;; Step 1 - Account for NaN and negative floats by setting these src values to zero. |
| ;; Step 2 - Make a copy (tmp1) of the src value since we need to convert twice for |
| ;; reasons described above. |
| ;; Step 3 - Convert the original src values. This will properly convert all floats up to INT_MAX. |
| ;; Step 4 - Subtract INT_MAX+1 from the copy set (tmp1). Note, all zero and negative values are those |
| ;; values that were originally in the range (0..INT_MAX). This will come in handy during |
| ;; step 7 when we zero negative lanes. |
| ;; Step 5 - Create a bit mask for tmp1 that will correspond to all lanes originally less than |
| ;; UINT_MAX that are now less than INT_MAX thanks to the subtraction. |
| ;; Step 6 - Convert the second set of values (tmp1). |
| ;; Step 7 - Prep the converted second set by zeroing out negative lanes (these have already been |
| ;; converted correctly with the first set) and by setting overflow lanes to 0x7FFFFFFF, |
| ;; as this will allow us to properly saturate overflow lanes when adding to 0x80000000. |
| ;; Step 8 - Add the original converted src and the converted tmp1: float values originally less |
| ;; than or equal to INT_MAX are unchanged, float values originally between INT_MAX+1 and |
| ;; UINT_MAX add together as (INT_MAX+1) + (SRC - (INT_MAX+1)), and float values originally |
| ;; greater than UINT_MAX saturate to UINT_MAX (0xFFFFFFFF) after adding (0x80000000 + 0x7FFFFFFF). |
| ;; |
| ;; |
| ;; The table below illustrates the result after each step where it matters for the converted set. |
| ;; Note the original value range (original src set) is the final dst in Step 8: |
| ;; |
| ;; Original src set: |
| ;; | Original Value Range | Step 1 | Step 3 | Step 8 | |
| ;; | -FLT_MIN..FLT_MAX | 0.0..FLT_MAX | 0..INT_MAX(w/overflow) | 0..UINT_MAX(w/saturation) | |
| ;; |
| ;; Copied src set (tmp1): |
| ;; | Step 2 | Step 4 | |
| ;; | 0.0..FLT_MAX | (0.0-(INT_MAX+1))..(FLT_MAX-(INT_MAX+1)) | |
| ;; |
| ;; | Step 6 | Step 7 | |
| ;; | (0-(INT_MAX+1))..(UINT_MAX-(INT_MAX+1))(w/overflow) | ((INT_MAX+1)-(INT_MAX+1))..(INT_MAX+1) | |
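| ;; |
| ;; As a worked example (illustrative only), take a lane holding 3000000000.0, |
| ;; which is exactly representable as an f32 and lies in (INT_MAX+1)..UINT_MAX: |
| ;; Step 3 overflows and produces 0x80000000; Step 4 leaves 852516352.0; Step 6 |
| ;; converts that to 852516352; Steps 5 and 7 leave it unchanged since it |
| ;; neither overflowed nor went negative; Step 8 computes |
| ;; 0x80000000 + 852516352 = 3000000000 (0xB2D05E00), the expected unsigned |
| ;; result. |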
| (rule 1 (lower (has_type $I32X4 (fcvt_to_uint_sat val @ (value_type $F32X4)))) |
| (let ((src Xmm val) |
| |
| ;; Converting to unsigned int so if float src is negative or NaN |
| ;; will first set to zero. |
| (tmp2 Xmm (xmm_zero $F32X4)) |
| (dst Xmm (x64_maxps src tmp2)) |
| |
| ;; Set tmp2 to INT_MAX+1. It is important to note here that it looks like |
| ;; we are only converting INT_MAX (0x7FFFFFFF), but in fact single-precision |
| ;; IEEE-754 floats can only exactly represent contiguous integers up to 2^24; |
| ;; outside of this range a value rounds to the closest integer the format can |
| ;; represent. INT_MAX therefore gets represented as 0x4f000000, which is the |
| ;; integer value (INT_MAX+1). |
| (tmp2 Xmm (x64_pcmpeqd tmp2 tmp2)) |
| (tmp2 Xmm (x64_psrld tmp2 (xmi_imm 1))) |
| (tmp2 Xmm (x64_cvtdq2ps tmp2)) |
| |
| ;; Make a copy of these lanes and then do the first conversion. |
| ;; Overflow lanes greater than the maximum allowed signed value will |
| ;; set to 0x80000000. Negative and NaN lanes will be 0x0 |
| (tmp1 Xmm dst) |
| (dst Xmm (x64_cvttps2dq dst)) |
| |
| ;; Set lanes to src - max_signed_int |
| (tmp1 Xmm (x64_subps tmp1 tmp2)) |
| |
| ;; Create mask for all positive lanes to saturate (i.e. greater than |
| ;; or equal to the maximum allowable unsigned int). |
| (tmp2 Xmm (x64_cmpps tmp2 tmp1 (FcmpImm.LessThanOrEqual))) |
| |
| ;; Convert those set of lanes that have the max_signed_int factored out. |
| (tmp1 Xmm (x64_cvttps2dq tmp1)) |
| |
| ;; Prepare converted lanes by zeroing negative lanes and prepping lanes |
| ;; that have positive overflow (based on the mask) by setting these lanes |
| ;; to 0x7FFFFFFF |
| (tmp1 Xmm (x64_pxor tmp1 tmp2)) |
| (tmp2 Xmm (xmm_zero $I32X4)) |
| (tmp1 Xmm (lower_vec_smax $I32X4 tmp1 tmp2))) |
| |
| ;; Add this second set of converted lanes to the original to properly handle |
| ;; values greater than max signed int. |
| (x64_paddd tmp1 dst))) |
| |
| ;; Rules for `x86_cvtt2dq` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type $I32X4 (x86_cvtt2dq val @ (value_type $F32X4)))) |
| (x64_cvttps2dq val)) |
| |
| ;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type $I8X16 (iadd_pairwise x y))) |
| (let ( |
| ;; Shuffle all the even lanes of `x` and `y` into one register |
| (even_lane_mask Xmm (x64_movdqu_load (emit_u128_le_const 0x00ff_00ff_00ff_00ff_00ff_00ff_00ff_00ff))) |
| (x_evens Xmm (x64_pand x even_lane_mask)) |
| (y_evens Xmm (x64_pand y even_lane_mask)) |
| (evens Xmm (x64_packuswb x_evens y_evens)) |
| |
| ;; Shuffle all the odd lanes of `x` and `y` into one register |
| (x_odds Xmm (x64_psrlw x (xmi_imm 8))) |
| (y_odds Xmm (x64_psrlw y (xmi_imm 8))) |
| (odds Xmm (x64_packuswb x_odds y_odds)) |
| ) |
| (x64_paddb evens odds))) |
| |
| |
| (rule 1 (lower (has_type $I16X8 (iadd_pairwise x y))) |
| (if-let $true (use_ssse3)) |
| (x64_phaddw x y)) |
| |
| (rule (lower (has_type $I16X8 (iadd_pairwise x y))) |
| (let ( |
| (x Xmm x) |
| (y Xmm y) |
| |
| ;; Shuffle the even-numbered 16-bit lanes into low four lanes of each |
| ;; vector by shuffling 16-bit lanes then shuffling 32-bit lanes. |
| ;; With these in place generate a new vector from the two low 64-bits |
| ;; of each vector (the low four 16-bit lanes). |
| ;; |
| ;; 0xe8 == 0b11_10_10_00 |
| (x_evens Xmm (x64_pshufd (x64_pshufhw (x64_pshuflw x 0xe8) 0xe8) 0xe8)) |
| (y_evens Xmm (x64_pshufd (x64_pshufhw (x64_pshuflw y 0xe8) 0xe8) 0xe8)) |
| (evens Xmm (x64_punpcklqdq x_evens y_evens)) |
| |
| ;; Shuffle the odd-numbered 16-bit lanes down into the even positions by |
| ;; performing an `sshr` (arithmetic shift right, `psrad`) on the 32-bit |
| ;; lanes, effectively moving the odd lanes into even lanes while leaving |
| ;; their sign bits in the odd lanes. The `packssdw` instruction then |
| ;; conveniently puts everything into one vector for us. |
| (x_shifted Xmm (x64_psrad x (xmi_imm 16))) |
| (y_shifted Xmm (x64_psrad y (xmi_imm 16))) |
| (odds Xmm (x64_packssdw x_shifted y_shifted)) |
| ) |
| (x64_paddw evens odds))) |
| |
| (rule 1 (lower (has_type $I32X4 (iadd_pairwise x y))) |
| (if-let $true (use_ssse3)) |
| (x64_phaddd x y)) |
| |
| (rule (lower (has_type $I32X4 (iadd_pairwise x y))) |
| (let ( |
| (x Xmm x) |
| (y Xmm y) |
| ;; evens = [ x[0] x[2] y[0] y[2] ] |
| (evens Xmm (x64_shufps x y 0b10_00_10_00)) |
| ;; odds = [ x[1] x[3] y[1] y[3] ] |
| (odds Xmm (x64_shufps x y 0b11_01_11_01)) |
| ) |
| (x64_paddd evens odds))) |
| |
| ;; special case for the `i16x8.extadd_pairwise_i8x16_s` wasm instruction |
| (rule 2 (lower |
| (has_type $I16X8 (iadd_pairwise |
| (swiden_low val @ (value_type $I8X16)) |
| (swiden_high val)))) |
| (if-let $true (use_ssse3)) |
| (let ((mul_const Xmm (x64_xmm_load_const $I8X16 |
| (emit_u128_le_const 0x01010101010101010101010101010101)))) |
| (x64_pmaddubsw mul_const val))) |
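| ;; Note on operand order (illustrative, assuming `x64_pmaddubsw`'s arguments |
| ;; map to the instruction's operands in order): `pmaddubsw` multiplies |
| ;; unsigned bytes from its first operand by signed bytes from its second and |
| ;; sums adjacent pairs, so with the all-ones constant first this computes |
| ;; `1*val[2i] + 1*val[2i+1]` with `val`'s bytes treated as signed, i.e. the |
| ;; signed pairwise sum. The unsigned variant below swaps the operand order |
| ;; for the same reason. |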
| |
| ;; special case for the `i32x4.extadd_pairwise_i16x8_s` wasm instruction |
| (rule 2 (lower |
| (has_type $I32X4 (iadd_pairwise |
| (swiden_low val @ (value_type $I16X8)) |
| (swiden_high val)))) |
| (let ((mul_const XmmMem (emit_u128_le_const 0x0001_0001_0001_0001_0001_0001_0001_0001))) |
| (x64_pmaddwd val mul_const))) |
| |
| ;; special case for the `i16x8.extadd_pairwise_i8x16_u` wasm instruction |
| (rule 2 (lower |
| (has_type $I16X8 (iadd_pairwise |
| (uwiden_low val @ (value_type $I8X16)) |
| (uwiden_high val)))) |
| (if-let $true (use_ssse3)) |
| (let ((mul_const XmmMem (emit_u128_le_const 0x01010101010101010101010101010101))) |
| (x64_pmaddubsw val mul_const))) |
| |
| ;; special case for the `i32x4.extadd_pairwise_i16x8_u` wasm instruction |
| (rule 2 (lower |
| (has_type $I32X4 (iadd_pairwise |
| (uwiden_low val @ (value_type $I16X8)) |
| (uwiden_high val)))) |
| (let ((xor_const XmmMem (emit_u128_le_const 0x8000_8000_8000_8000_8000_8000_8000_8000)) |
| (dst Xmm (x64_pxor val xor_const)) |
| |
| (madd_const XmmMem (emit_u128_le_const 0x0001_0001_0001_0001_0001_0001_0001_0001)) |
| (dst Xmm (x64_pmaddwd dst madd_const)) |
| |
| (addd_const XmmMem (emit_u128_le_const 0x00010000_00010000_00010000_00010000))) |
| (x64_paddd dst addd_const))) |
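| ;; The arithmetic behind the constants above (illustrative): XOR-ing a 16-bit |
| ;; lane holding the unsigned value `u` with 0x8000 yields the signed value |
| ;; `u - 0x8000`, so the signed `pmaddwd` against 1s computes |
| ;; `(a - 0x8000) + (b - 0x8000) = a + b - 0x10000`; adding 0x10000 to each |
| ;; 32-bit lane then restores the unsigned pairwise sum `a + b`. |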
| |
| ;; special case for the `i32x4.dot_i16x8_s` wasm instruction |
| (rule 2 (lower |
| (has_type $I32X4 (iadd_pairwise |
| (imul (swiden_low x) (swiden_low y)) |
| (imul (swiden_high x) (swiden_high y))))) |
| (x64_pmaddwd x y)) |
| |
| ;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; With SSE4.1 use the `pmovsx*` instructions for this |
| (rule 1 (lower (has_type $I16X8 (swiden_low val @ (value_type $I8X16)))) |
| (if-let $true (use_sse41)) |
| (x64_pmovsxbw val)) |
| (rule 1 (lower (has_type $I32X4 (swiden_low val @ (value_type $I16X8)))) |
| (if-let $true (use_sse41)) |
| (x64_pmovsxwd val)) |
| (rule 1 (lower (has_type $I64X2 (swiden_low val @ (value_type $I32X4)))) |
| (if-let $true (use_sse41)) |
| (x64_pmovsxdq val)) |
| |
| (rule (lower (has_type ty (swiden_low val))) (lower_swiden_low ty val)) |
| |
| (decl lower_swiden_low (Type Xmm) Xmm) |
| |
| ;; Duplicate each low lane next to itself, then perform an arithmetic |
| ;; shift-right of the wider lanes by the narrow lane width; this moves the |
| ;; upper copy of each pair back into the lower position while sign-extending |
| ;; it, achieving the widening of the lower lanes. |
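| ;; For example (illustrative): widening the i8 lane 0xFE with the $I16X8 rule |
| ;; below, `punpcklbw` pairs the byte with itself giving the 16-bit lane 0xFEFE, |
| ;; and the arithmetic shift right by 8 turns that into 0xFFFE, i.e. -2, the |
| ;; correct sign extension of 0xFE. |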
| (rule (lower_swiden_low $I16X8 val) |
| (x64_psraw (x64_punpcklbw val val) (xmi_imm 8))) |
| (rule (lower_swiden_low $I32X4 val) |
| (x64_psrad (x64_punpcklwd val val) (xmi_imm 16))) |
| |
| ;; Generate the sign-extended halves with a `val < 0` comparison (expressed |
| ;; reversed here), then interleave the low 32-bit halves to create the full |
| ;; 64-bit results. |
| (rule (lower_swiden_low $I64X2 val) |
| (let ((tmp Xmm (x64_pcmpgtd (xmm_zero $I32X4) val))) |
| (x64_punpckldq val tmp))) |
| |
| ;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; Similar to `swiden_low` with SSE4.1 except that the upper lanes are moved |
| ;; to the lower lanes first. |
| (rule 1 (lower (has_type $I16X8 (swiden_high val @ (value_type $I8X16)))) |
| (if-let $true (use_sse41)) |
| (if-let $true (use_ssse3)) |
| (let ((x Xmm val)) |
| (x64_pmovsxbw (x64_palignr x x 8)))) |
| (rule 1 (lower (has_type $I32X4 (swiden_high val @ (value_type $I16X8)))) |
| (if-let $true (use_sse41)) |
| (if-let $true (use_ssse3)) |
| (let ((x Xmm val)) |
| (x64_pmovsxwd (x64_palignr x x 8)))) |
| (rule 1 (lower (has_type $I64X2 (swiden_high val @ (value_type $I32X4)))) |
| (if-let $true (use_sse41)) |
| (x64_pmovsxdq (x64_pshufd val 0b11_10_11_10))) |
| |
| ;; Similar to `swiden_low` versions but using `punpckh*` instructions to |
| ;; pair the high lanes next to each other. |
| (rule (lower (has_type $I16X8 (swiden_high val @ (value_type $I8X16)))) |
| (let ((val Xmm val)) |
| (x64_psraw (x64_punpckhbw val val) (xmi_imm 8)))) |
| (rule (lower (has_type $I32X4 (swiden_high val @ (value_type $I16X8)))) |
| (let ((val Xmm val)) |
| (x64_psrad (x64_punpckhwd val val) (xmi_imm 16)))) |
| |
| ;; Same as `swiden_low`, but `val` has its high lanes moved down. |
| (rule (lower (has_type $I64X2 (swiden_high val @ (value_type $I32X4)))) |
| (let ((val Xmm (x64_pshufd val 0b00_00_11_10)) |
| (tmp Xmm (x64_pcmpgtd (xmm_zero $I32X4) val))) |
| (x64_punpckldq val tmp))) |
| |
| ;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; With SSE4.1 use the `pmovzx*` instructions for this |
| (rule 1 (lower (has_type $I16X8 (uwiden_low val @ (value_type $I8X16)))) |
| (if-let $true (use_sse41)) |
| (x64_pmovzxbw val)) |
| (rule 1 (lower (has_type $I32X4 (uwiden_low val @ (value_type $I16X8)))) |
| (if-let $true (use_sse41)) |
| (x64_pmovzxwd val)) |
| (rule 1 (lower (has_type $I64X2 (uwiden_low val @ (value_type $I32X4)))) |
| (if-let $true (use_sse41)) |
| (x64_pmovzxdq val)) |
| |
| (rule (lower (has_type ty (uwiden_low val))) (lower_uwiden_low ty val)) |
| |
| ;; Interleave an all-zero register with the low lanes to produce zero-extended |
| ;; results. |
| (decl lower_uwiden_low (Type Xmm) Xmm) |
| (rule (lower_uwiden_low $I16X8 val) (x64_punpcklbw val (xmm_zero $I8X16))) |
| (rule (lower_uwiden_low $I32X4 val) (x64_punpcklwd val (xmm_zero $I8X16))) |
| (rule (lower_uwiden_low $I64X2 val) (x64_unpcklps val (xmm_zero $F32X4))) |
| |
| ;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; Same as `uwiden_low`, but interleaving the high lanes instead. |
| ;; |
| ;; Note that according to `llvm-mca` at least these instructions are faster |
| ;; than using `pmovzx*` in terms of cycles, even if SSE4.1 is available. |
| (rule (lower (has_type $I16X8 (uwiden_high val @ (value_type $I8X16)))) |
| (x64_punpckhbw val (xmm_zero $I8X16))) |
| (rule (lower (has_type $I32X4 (uwiden_high val @ (value_type $I16X8)))) |
| (x64_punpckhwd val (xmm_zero $I8X16))) |
| (rule (lower (has_type $I64X2 (uwiden_high val @ (value_type $I32X4)))) |
| (x64_unpckhps val (xmm_zero $F32X4))) |
| |
| ;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type $I8X16 (snarrow a @ (value_type $I16X8) b))) |
| (x64_packsswb a b)) |
| |
| (rule (lower (has_type $I16X8 (snarrow a @ (value_type $I32X4) b))) |
| (x64_packssdw a b)) |
| |
| ;; We're missing a `snarrow` case for $I64X2 |
| ;; https://github.com/bytecodealliance/wasmtime/issues/4734 |
| |
| ;; This rule is a special case for handling the translation of the wasm op |
| ;; `i32x4.trunc_sat_f64x2_s_zero`. It can be removed once we have an |
| ;; implementation of `snarrow` for `I64X2`. |
| (rule (lower (has_type $I32X4 (snarrow (has_type $I64X2 (fcvt_to_sint_sat val)) |
| (vconst (u128_from_constant 0))))) |
| (let ((a Xmm val) |
| |
| ;; y = i32x4.trunc_sat_f64x2_s_zero(x) is lowered to: |
| ;; MOVE xmm_tmp, xmm_x |
| ;; CMPEQPD xmm_tmp, xmm_x |
| ;; MOVE xmm_y, xmm_x |
| ;; ANDPS xmm_tmp, [wasm_f64x2_splat(2147483647.0)] |
| ;; MINPD xmm_y, xmm_tmp |
| ;; CVTTPD2DQ xmm_y, xmm_y |
| |
| (tmp1 Xmm (x64_cmppd a a (FcmpImm.Equal))) |
| |
| ;; 2147483647.0 is equivalent to 0x41DFFFFFFFC00000 |
| (umax_mask XmmMem (emit_u128_le_const 0x41DFFFFFFFC00000_41DFFFFFFFC00000)) |
| |
| ;; ANDPS xmm_tmp, [wasm_f64x2_splat(2147483647.0)] |
| (tmp1 Xmm (x64_andps tmp1 umax_mask)) |
| (dst Xmm (x64_minpd a tmp1))) |
| (x64_cvttpd2dq dst))) |
| |
| ;; This rule is a special case for handling the translation of the wasm op |
| ;; `i32x4.relaxed_trunc_f64x2_s_zero`. |
| (rule (lower (has_type $I32X4 (snarrow (has_type $I64X2 (x86_cvtt2dq val)) |
| (vconst (u128_from_constant 0))))) |
| (x64_cvttpd2dq val)) |
| |
| ;; Rules for `unarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type $I8X16 (unarrow a @ (value_type $I16X8) b))) |
| (x64_packuswb a b)) |
| |
| (rule 1 (lower (has_type $I16X8 (unarrow a @ (value_type $I32X4) b))) |
| (if-let $true (use_sse41)) |
| (x64_packusdw a b)) |
| |
| ;; For each input `a` and `b` take the four 32-bit lanes and compress them to |
| ;; the low 64-bits of the vector as four 16-bit lanes. Then these are woven |
| ;; into one final vector with a `punpcklqdq`. |
| ;; |
| ;; If this is performance sensitive then it's probably best to upgrade the CPU |
| ;; to get the above single-instruction lowering. |
| (rule (lower (has_type $I16X8 (unarrow a @ (value_type $I32X4) b))) |
| (let ( |
| (a Xmm (unarrow_i32x4_lanes_to_low_u16_lanes a)) |
| (b Xmm (unarrow_i32x4_lanes_to_low_u16_lanes b)) |
| ) |
| (x64_punpcklqdq a b))) |
| |
| (decl unarrow_i32x4_lanes_to_low_u16_lanes (Xmm) Xmm) |
| (rule (unarrow_i32x4_lanes_to_low_u16_lanes val) |
| (let ( |
| ;; First convert all negative values in `val` to zero lanes. |
| (val_gt_zero Xmm (x64_pcmpgtd val (xmm_zero $I32X4))) |
| (val Xmm (x64_pand val val_gt_zero)) |
| |
| ;; Next clamp all larger-than-u16-max lanes to u16::MAX. |
| (max Xmm (x64_movdqu_load (emit_u128_le_const 0x0000ffff_0000ffff_0000ffff_0000ffff))) |
| (cmp Xmm (x64_pcmpgtd max val)) |
| (valid_lanes Xmm (x64_pand val cmp)) |
| (clamped_lanes Xmm (x64_pandn cmp max)) |
| (val Xmm (x64_por valid_lanes clamped_lanes)) |
| |
| ;; Within each 64-bit half of the 32x4 vector move the first 16 bits |
| ;; and the third 16 bits to the bottom of the half. Afterwards |
| ;; for the 32x4 vector move the first and third lanes to the bottom |
| ;; lanes, which finishes up the conversion here as all the lanes |
| ;; are now converted to 16-bit values in the low 4 lanes. |
| (val Xmm (x64_pshuflw val 0b00_00_10_00)) |
| (val Xmm (x64_pshufhw val 0b00_00_10_00)) |
| ) |
| (x64_pshufd val 0b00_00_10_00))) |
| |
| |
| ;; We're missing a `unarrow` case for $I64X2 |
| ;; https://github.com/bytecodealliance/wasmtime/issues/4734 |
| |
| ;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type $I32 (bitcast _ src @ (value_type $F32)))) |
| (bitcast_xmm_to_gpr $F32 src)) |
| |
| (rule (lower (has_type $F32 (bitcast _ src @ (value_type $I32)))) |
| (bitcast_gpr_to_xmm $I32 src)) |
| |
| (rule (lower (has_type $I64 (bitcast _ src @ (value_type $F64)))) |
| (bitcast_xmm_to_gpr $F64 src)) |
| |
| (rule (lower (has_type $F64 (bitcast _ src @ (value_type $I64)))) |
| (bitcast_gpr_to_xmm $I64 src)) |
| |
| ;; Bitcast between types residing in GPR registers is a no-op. |
| (rule 1 (lower (has_type (is_gpr_type _) |
| (bitcast _ x @ (value_type (is_gpr_type _))))) x) |
| |
| ;; Bitcast between types residing in XMM registers is a no-op. |
| (rule 2 (lower (has_type (is_xmm_type _) |
| (bitcast _ x @ (value_type (is_xmm_type _))))) x) |
| |
| ;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type $F32 (fcopysign a @ (value_type $F32) b))) |
| (let ((sign_bit Xmm (imm $F32 0x80000000))) |
| (x64_orps |
| (x64_andnps sign_bit a) |
| (x64_andps sign_bit b)))) |
| |
| (rule (lower (has_type $F64 (fcopysign a @ (value_type $F64) b))) |
| (let ((sign_bit Xmm (imm $F64 0x8000000000000000))) |
| (x64_orpd |
| (x64_andnpd sign_bit a) |
| (x64_andpd sign_bit b)))) |
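| ;; As an illustrative check of the masking above: for `fcopysign` with |
| ;; a = -1.5 and b = 2.0, the `andn` keeps only the magnitude bits of `a` (1.5), |
| ;; the `and` keeps only the sign bit of `b` (positive), and OR-ing the two |
| ;; yields 1.5: the magnitude of `a` with the sign of `b`. |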
| |
| ;; Helper for the `ceil`/`floor`/`nearest`/`trunc` instructions ;;;;;;;;;;;;;;;; |
| |
| ;; Emits either a `round{ss,sd,ps,pd}` instruction, as appropriate, or generates |
| ;; the appropriate libcall and sequence to call that. |
| (decl x64_round (Type RegMem RoundImm) Xmm) |
| (rule 1 (x64_round $F32 a imm) |
| (if-let $true (use_sse41)) |
| (x64_roundss a imm)) |
| (rule 1 (x64_round $F64 a imm) |
| (if-let $true (use_sse41)) |
| (x64_roundsd a imm)) |
| (rule 1 (x64_round $F32X4 a imm) |
| (if-let $true (use_sse41)) |
| (x64_roundps a imm)) |
| (rule 1 (x64_round $F64X2 a imm) |
| (if-let $true (use_sse41)) |
| (x64_roundpd a imm)) |
| |
| (rule (x64_round $F32 (RegMem.Reg a) imm) (libcall_1 (round_libcall $F32 imm) a)) |
| (rule (x64_round $F64 (RegMem.Reg a) imm) (libcall_1 (round_libcall $F64 imm) a)) |
| (rule (x64_round $F32X4 (RegMem.Reg a) imm) |
| (let ( |
| (libcall LibCall (round_libcall $F32 imm)) |
| (result Xmm (libcall_1 libcall a)) |
| (a1 Xmm (libcall_1 libcall (x64_pshufd a 1))) |
| (result Xmm (vec_insert_lane $F32X4 result a1 1)) |
| (a2 Xmm (libcall_1 libcall (x64_pshufd a 2))) |
| (result Xmm (vec_insert_lane $F32X4 result a2 2)) |
| (a3 Xmm (libcall_1 libcall (x64_pshufd a 3))) |
| (result Xmm (vec_insert_lane $F32X4 result a3 3)) |
| ) |
| result)) |
| (rule (x64_round $F64X2 (RegMem.Reg a) imm) |
| (let ( |
| (libcall LibCall (round_libcall $F64 imm)) |
| (result Xmm (libcall_1 libcall a)) |
| (a1 Xmm (libcall_1 libcall (x64_pshufd a 0b00_00_11_10))) |
| (result Xmm (vec_insert_lane $F64X2 result a1 1)) |
| ) |
| result)) |
| (rule (x64_round ty (RegMem.Mem addr) imm) |
| (x64_round ty (RegMem.Reg (x64_load ty addr (ExtKind.ZeroExtend))) imm)) |
| |
| (decl round_libcall (Type RoundImm) LibCall) |
| (rule (round_libcall $F32 (RoundImm.RoundUp)) (LibCall.CeilF32)) |
| (rule (round_libcall $F64 (RoundImm.RoundUp)) (LibCall.CeilF64)) |
| (rule (round_libcall $F32 (RoundImm.RoundDown)) (LibCall.FloorF32)) |
| (rule (round_libcall $F64 (RoundImm.RoundDown)) (LibCall.FloorF64)) |
| (rule (round_libcall $F32 (RoundImm.RoundNearest)) (LibCall.NearestF32)) |
| (rule (round_libcall $F64 (RoundImm.RoundNearest)) (LibCall.NearestF64)) |
| (rule (round_libcall $F32 (RoundImm.RoundZero)) (LibCall.TruncF32)) |
| (rule (round_libcall $F64 (RoundImm.RoundZero)) (LibCall.TruncF64)) |
| |
| ;; Rules for `ceil` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (ceil a @ (value_type ty))) |
| (x64_round ty a (RoundImm.RoundUp))) |
| |
| ;; Rules for `floor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (floor a @ (value_type ty))) |
| (x64_round ty a (RoundImm.RoundDown))) |
| |
| ;; Rules for `nearest` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (nearest a @ (value_type ty))) |
| (x64_round ty a (RoundImm.RoundNearest))) |
| |
| ;; Rules for `trunc` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (trunc a @ (value_type ty))) |
| (x64_round ty a (RoundImm.RoundZero))) |
| |
| ;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (stack_addr stack_slot offset)) |
| (stack_addr_impl stack_slot offset)) |
| |
| ;; Rules for `udiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; NB: a `RegMem` divisor, while allowed in the instruction encoding, isn't |
| ;; used right now to prevent a possibly-trapping load getting folded into the |
| ;; `div` instruction. Ideally non-trapping loads would get folded, however, or |
| ;; alternatively Wasmtime/Cranelift would grow support for multiple traps on |
| ;; a single opcode and the signal kind would differentiate at runtime. |
| |
| ;; The inputs to the `div` instruction are different for 8-bit division so |
| ;; it needs a special case here since the instruction being crafted has a |
| ;; different shape. |
| (rule 2 (lower (udiv a @ (value_type $I8) b)) |
| (x64_div8 (extend_to_gpr a $I32 (ExtendKind.Zero)) |
| (put_in_gpr b) |
| (DivSignedness.Unsigned) |
| (TrapCode.IntegerDivisionByZero))) |
| |
| ;; 16-to-64-bit division is all done with a similar instruction, and the only |
| ;; tricky requirement here is that, when div traps are disallowed, the divisor |
| ;; must not be zero. |
| (rule 1 (lower (udiv a @ (value_type (fits_in_64 ty)) b)) |
| (x64_div_quotient a |
| (imm $I64 0) |
| (put_in_gpr b) |
| (raw_operand_size_of_type ty) |
| (DivSignedness.Unsigned) |
| (TrapCode.IntegerDivisionByZero))) |
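| ;; As background (illustrative): the hardware `div` instruction divides the |
| ;; double-wide dividend held in `rdx:rax` (or `edx:eax`, etc., for narrower |
| ;; widths) by its operand, leaving the quotient in `rax` and the remainder in |
| ;; `rdx`, which is why the rule above passes zero as the high half of the |
| ;; dividend for unsigned division. |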
| |
| ;; Rules for `sdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule 2 (lower (sdiv a @ (value_type $I8) b)) |
| (x64_div8 (x64_sign_extend_data a (OperandSize.Size8)) |
| (nonzero_sdiv_divisor $I8 b) |
| (DivSignedness.Signed) |
| (TrapCode.IntegerOverflow))) |
| |
| (rule 1 (lower (sdiv a @ (value_type (fits_in_64 ty)) b)) |
| (let ( |
| (a Gpr a) |
| (size OperandSize (raw_operand_size_of_type ty)) |
| ) |
| (x64_div_quotient a |
| (x64_sign_extend_data a size) |
| (nonzero_sdiv_divisor ty b) |
| size |
| (DivSignedness.Signed) |
| (TrapCode.IntegerOverflow)))) |
| |
| ;; Checks to make sure that the input `Value` is a non-zero value for `sdiv`. |
| ;; |
| ;; This is required to differentiate the divide-by-zero trap from the |
| ;; integer-overflow trap, the two trapping conditions of signed division. |
| (decl nonzero_sdiv_divisor (Type Value) Reg) |
| (rule 1 (nonzero_sdiv_divisor ty (iconst imm)) |
| (if-let n (safe_divisor_from_imm64 ty imm)) |
| (imm ty n)) |
| (rule 0 (nonzero_sdiv_divisor ty val) |
| (let ( |
| (val Reg val) |
| (_ InstOutput (side_effect (with_flags_side_effect |
| (x64_test (raw_operand_size_of_type ty) val val) |
| (trap_if (CC.Z) (TrapCode.IntegerDivisionByZero))))) |
| ) |
| val)) |
| |
| ;; Rules for `urem` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; The remainder is in AH, so take the result of the division and right-shift |
| ;; by 8. |
| (rule 2 (lower (urem a @ (value_type $I8) b)) |
| (let ( |
| (result Gpr (x64_div8 (extend_to_gpr a $I32 (ExtendKind.Zero)) |
| (put_in_gpr b) ;; see `udiv` for why not `gpr_mem` |
| (DivSignedness.Unsigned) |
| (TrapCode.IntegerDivisionByZero))) |
| ) |
| (x64_shr $I64 result (Imm8Reg.Imm8 8)))) |
| |
| (rule 1 (lower (urem a @ (value_type (fits_in_64 ty)) b)) |
| (x64_div_remainder a |
| (imm $I64 0) |
| (put_in_gpr b) ;; see `udiv` for why not `gpr_mem` |
| (raw_operand_size_of_type ty) |
| (DivSignedness.Unsigned) |
| (TrapCode.IntegerDivisionByZero))) |
| |
| ;; Rules for `srem` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; Special-cases first for constant `srem` where the checks for 0 and -1 aren't |
| ;; applicable. |
| ;; |
| ;; Note that like `urem` for i8 types the result is in AH so to get the result |
| ;; it's right-shifted down. |
| (rule 3 (lower (srem a @ (value_type $I8) (iconst imm))) |
| (if-let n (safe_divisor_from_imm64 $I8 imm)) |
| (let ( |
| (a Gpr (x64_sign_extend_data a (OperandSize.Size8))) |
| (result Gpr (x64_div8 a (imm $I8 n) (DivSignedness.Signed) (TrapCode.IntegerDivisionByZero))) |
| ) |
| (x64_shr $I64 result (Imm8Reg.Imm8 8)))) |
| |
| ;; Same as the above rule but for 16-to-64 bit types. |
| (rule 2 (lower (srem a @ (value_type ty) (iconst imm))) |
| (if-let n (safe_divisor_from_imm64 ty imm)) |
| (let ( |
| (a Gpr a) |
| (size OperandSize (raw_operand_size_of_type ty)) |
| ) |
| (x64_div_remainder a |
| (x64_sign_extend_data a size) |
| (imm ty n) |
| size |
| (DivSignedness.Signed) |
| (TrapCode.IntegerDivisionByZero)))) |
| |
| (rule 1 (lower (srem a @ (value_type $I8) b)) |
| (let ( |
| (a Gpr (x64_sign_extend_data a (OperandSize.Size8))) |
| ) |
| (x64_shr $I64 (x64_checked_srem_seq8 a b) (Imm8Reg.Imm8 8)))) |
| |
| (rule (lower (srem a @ (value_type ty) b)) |
| (let ( |
| (a Gpr a) |
| (size OperandSize (raw_operand_size_of_type ty)) |
| (hi Gpr (x64_sign_extend_data a size)) |
| (tmp ValueRegs (x64_checked_srem_seq size a hi b)) |
| ) |
| (value_regs_get tmp 1))) |
| |
| ;; Rules for `umulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (umulhi a @ (value_type $I16) b)) |
| (let ((res ValueRegs (mul_hi $I16 $false a b)) |
| (hi Gpr (value_regs_get_gpr res 1))) |
| hi)) |
| |
| (rule (lower (umulhi a @ (value_type $I32) b)) |
| (let ((res ValueRegs (mul_hi $I32 $false a b)) |
| (hi Gpr (value_regs_get_gpr res 1))) |
| hi)) |
| |
| (rule (lower (umulhi a @ (value_type $I64) b)) |
| (let ((res ValueRegs (mul_hi $I64 $false a b)) |
| (hi Gpr (value_regs_get_gpr res 1))) |
| hi)) |
| |
| ;; Rules for `smulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (smulhi a @ (value_type $I16) b)) |
| (let ((res ValueRegs (mul_hi $I16 $true a b)) |
| (hi Gpr (value_regs_get_gpr res 1))) |
| hi)) |
| |
| (rule (lower (smulhi a @ (value_type $I32) b)) |
| (let ((res ValueRegs (mul_hi $I32 $true a b)) |
| (hi Gpr (value_regs_get_gpr res 1))) |
| hi)) |
| |
| (rule (lower (smulhi a @ (value_type $I64) b)) |
| (let ((res ValueRegs (mul_hi $I64 $true a b)) |
| (hi Gpr (value_regs_get_gpr res 1))) |
| hi)) |
| |
| ;; Rules for `get_pinned_reg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (get_pinned_reg)) |
| (read_pinned_gpr)) |
| |
| ;; Rules for `set_pinned_reg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (set_pinned_reg a @ (value_type ty))) |
| (side_effect (write_pinned_gpr a))) |
| |
| ;; Rules for `vconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type ty (vconst const))) |
| ;; TODO use Inst::gen_constant() instead. |
| (x64_xmm_load_const ty (const_to_vconst const))) |
| |
| ;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; Special case for `pblendw` which takes an 8-bit immediate where each bit |
| ;; indicates which lane of the two operands is chosen for the output. A bit of |
| ;; 0 chooses the corresponding 16-bit lane from `a` and a bit of 1 chooses the |
| ;; corresponding 16-bit lane from `b`. |
| (rule 14 (lower (shuffle a b (pblendw_imm n))) |
| (if-let $true (use_sse41)) |
| (x64_pblendw a b n)) |
| (decl pblendw_imm (u8) Immediate) |
| (extern extractor pblendw_imm pblendw_imm) |
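| ;; For example (illustrative): an immediate of 0b1111_0000 would take the four |
| ;; low 16-bit lanes from `a` and the four high lanes from `b`. |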
| |
| ;; When the shuffle looks like "concatenate `a` and `b` and shift right by n*8 |
| ;; bytes", that's a `palignr` instruction. Note that the order of operands are |
| ;; swapped in the instruction here. The `palignr` instruction uses the second |
| ;; operand as the low-order bytes and the first operand as high-order bytes, |
| ;; so put `a` second. |
| (rule 13 (lower (shuffle a b (palignr_imm_from_immediate n))) |
| (if-let $true (use_ssse3)) |
| (x64_palignr b a n)) |
| (decl palignr_imm_from_immediate (u8) Immediate) |
| (extern extractor palignr_imm_from_immediate palignr_imm_from_immediate) |
| |
| ;; Special case the `pshuf{l,h}w` instruction which shuffles four 16-bit |
| ;; integers within one value, preserving the other four 16-bit integers in that |
| ;; value (either the high or low half). The complicated logic lives in the |
| ;; extractors here, implemented in Rust; note that there are two cases for each |
| ;; instruction to match when either the first or the second shuffle operand is |
| ;; used. |
| (rule 12 (lower (shuffle x y (pshuflw_lhs_imm imm))) |
| (x64_pshuflw x imm)) |
| (rule 11 (lower (shuffle x y (pshuflw_rhs_imm imm))) |
| (x64_pshuflw y imm)) |
| (rule 10 (lower (shuffle x y (pshufhw_lhs_imm imm))) |
| (x64_pshufhw x imm)) |
| (rule 9 (lower (shuffle x y (pshufhw_rhs_imm imm))) |
| (x64_pshufhw y imm)) |
| |
| (decl pshuflw_lhs_imm (u8) Immediate) |
| (extern extractor pshuflw_lhs_imm pshuflw_lhs_imm) |
| (decl pshuflw_rhs_imm (u8) Immediate) |
| (extern extractor pshuflw_rhs_imm pshuflw_rhs_imm) |
| (decl pshufhw_lhs_imm (u8) Immediate) |
| (extern extractor pshufhw_lhs_imm pshufhw_lhs_imm) |
| (decl pshufhw_rhs_imm (u8) Immediate) |
| (extern extractor pshufhw_rhs_imm pshufhw_rhs_imm) |
| |
| ;; Special case for the `pshufd` instruction which will permute 32-bit values |
| ;; within a single register. This is only applicable if the `imm` specified |
| ;; selects 32-bit values from either `x` or `y`, but not both. This means |
| ;; there's one rule for selecting from `x` and another rule for selecting from |
| ;; `y`. |
| (rule 8 (lower (shuffle x y (pshufd_lhs_imm imm))) |
| (x64_pshufd x imm)) |
| (rule 7 (lower (shuffle x y (pshufd_rhs_imm imm))) |
| (x64_pshufd y imm)) |
| |
| (decl pshufd_lhs_imm (u8) Immediate) |
| (extern extractor pshufd_lhs_imm pshufd_lhs_imm) |
| (decl pshufd_rhs_imm (u8) Immediate) |
| (extern extractor pshufd_rhs_imm pshufd_rhs_imm) |
| |
| ;; Special case for i8-level interleaving of upper/low bytes. |
| (rule 6 (lower (shuffle a b (u128_from_immediate 0x1f0f_1e0e_1d0d_1c0c_1b0b_1a0a_1909_1808))) |
| (x64_punpckhbw a b)) |
| (rule 6 (lower (shuffle a b (u128_from_immediate 0x1707_1606_1505_1404_1303_1202_1101_1000))) |
| (x64_punpcklbw a b)) |
| |
| ;; Special case for i16-level interleaving of upper/low bytes. |
| (rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e_0f0e_1d1c_0d0c_1b1a_0b0a_1918_0908))) |
| (x64_punpckhwd a b)) |
| (rule 6 (lower (shuffle a b (u128_from_immediate 0x1716_0706_1514_0504_1312_0302_1110_0100))) |
| (x64_punpcklwd a b)) |
| |
| ;; Special case for i32-level interleaving of upper/low bytes. |
| (rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c_0f0e0d0c_1b1a1918_0b0a0908))) |
| (x64_punpckhdq a b)) |
| (rule 6 (lower (shuffle a b (u128_from_immediate 0x17161514_07060504_13121110_03020100))) |
| (x64_punpckldq a b)) |
| |
| ;; Special case for i64-level interleaving of upper/low bytes. |
| (rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c1b1a1918_0f0e0d0c0b0a0908))) |
| (x64_punpckhqdq a b)) |
| (rule 6 (lower (shuffle a b (u128_from_immediate 0x1716151413121110_0706050403020100))) |
| (x64_punpcklqdq a b)) |
| |
| ;; If the shuffle mask is all 0s then the first byte of the first operand is |
| ;; broadcast to all bytes. Falling through to the general case would load an |
| ;; all-zeros constant from a rip-relative location, but it should be slightly |
| ;; more efficient to execute the `pshufb` here-and-now with a register zeroed |
| ;; by xor'ing it with itself. |
| (rule 6 (lower (shuffle a _ (u128_from_immediate 0))) |
| (if-let $true (use_ssse3)) |
| (x64_pshufb a (xmm_zero $I8X16))) |
| |
| ;; Special case for the `shufps` instruction which selects two 32-bit values |
| ;; from the first operand and two 32-bit values from the second operand. Note |
| ;; that there is a second rule as well for when the operands can be swapped. |
| ;; |
| ;; These rules have lower priority than the special cases above since `shufps` |
| ;; could handle many of them, and for now it's hypothesized that the dedicated |
| ;; instructions are better than `shufps`. Someone with more knowledge about |
| ;; x86 timings should perhaps reorder the rules here eventually. |
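| ;; |
| ;; As a reminder of the instruction's shape: the low two 32-bit lanes of the |
| ;; result come from the first operand and the high two lanes come from the |
| ;; second operand, with each 2-bit immediate field naming a source lane; for |
| ;; example `(x64_shufps x y 0b11_10_01_00)` yields [x0, x1, y2, y3]. |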
| (rule 5 (lower (shuffle x y (shufps_imm imm))) |
| (x64_shufps x y imm)) |
| (rule 4 (lower (shuffle x y (shufps_rev_imm imm))) |
| (x64_shufps y x imm)) |
| |
| (decl shufps_imm (u8) Immediate) |
| (extern extractor shufps_imm shufps_imm) |
| (decl shufps_rev_imm (u8) Immediate) |
| (extern extractor shufps_rev_imm shufps_rev_imm) |
| |
| |
| ;; If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the |
| ;; XMM register. We statically build the mask to zero out any out-of-range |
| ;; lane indices (this may not be strictly necessary: verification should |
| ;; reject incorrect mask values) and remap the indices so they all point into |
| ;; the single source vector. |
| (rule 3 (lower (shuffle a a (vec_mask_from_immediate mask))) |
| (if-let $true (use_ssse3)) |
| (x64_pshufb a (shuffle_0_31_mask mask))) |
| |
| ;; For the case where the shuffle mask contains out-of-bounds values (values |
| ;; greater than 31) we must mask off those resulting values in the result of |
| ;; `vpermi2b`. |
| (rule 2 (lower (shuffle a b (vec_mask_from_immediate (perm_from_mask_with_zeros mask zeros)))) |
| (if-let $true (use_avx512vl)) |
| (if-let $true (use_avx512vbmi)) |
| (x64_andps (x64_vpermi2b (x64_xmm_load_const $I8X16 mask) a b) zeros)) |
| |
| ;; However, if the shuffle mask contains no out-of-bounds values, we can use |
| ;; `vpermi2b` without any masking. |
| (rule 1 (lower (shuffle a b (vec_mask_from_immediate mask))) |
| (if-let $true (use_avx512vl)) |
| (if-let $true (use_avx512vbmi)) |
| (x64_vpermi2b (x64_xmm_load_const $I8X16 (perm_from_mask mask)) a b)) |
| |
| ;; If `lhs` and `rhs` are different, we must shuffle each separately and then |
| ;; OR the results together. This is necessary because PSHUFB can only select |
| ;; bytes from a single source register: each per-operand mask below keeps the |
| ;; indices that refer to that operand and zeroes out the lanes belonging to |
| ;; the other operand, so the OR combines the two halves. As in the case above, |
| ;; the masks are built statically. |
| (rule (lower (shuffle a b (vec_mask_from_immediate mask))) |
| (x64_por |
| (lower_pshufb a (shuffle_0_15_mask mask)) |
| (lower_pshufb b (shuffle_16_31_mask mask)))) |
| |
| ;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; SIMD swizzle; the following inefficient implementation is due to the Wasm |
| ;; SIMD spec requiring mask indexes greater than 15 to yield a zero lane in |
| ;; the result. For the spec discussion, see |
| ;; https://github.com/WebAssembly/simd/issues/93. The CLIF semantics match the |
| ;; Wasm SIMD semantics for this instruction. The instruction format maps to |
| ;; variables like: %dst = swizzle %src, %mask |
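| ;; |
| ;; The `paddusb` against the 0x70 constant implements that zeroing: adding |
| ;; 0x70 with unsigned saturation leaves in-range indices 0-15 at 0x70-0x7f |
| ;; (whose low four bits still select the same lane), while any index of 16 or |
| ;; more ends up at 0x80 or above, and `pshufb` writes a zero byte whenever the |
| ;; mask byte has its top bit set. |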
| (rule (lower (swizzle src mask)) |
| (let ((mask Xmm (x64_paddusb mask (emit_u128_le_const 0x70707070707070707070707070707070)))) |
| (lower_pshufb src mask))) |
| |
| ;; Rules for `x86_pshufb` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (x86_pshufb src mask)) |
| (if-let $true (use_ssse3)) |
| (x64_pshufb src mask)) |
| |
| ;; A helper to generate either the `pshufb` instruction or a call to the |
| ;; `X86Pshufb` libcall. Note that the libcall is not exactly the most |
| ;; performant option, so it exists primarily for completeness of lowerings on |
| ;; all x86 CPUs; where possible, rules should ideally be gated on the presence |
| ;; of SSSE3 so the `pshufb` instruction itself is used. |
| (decl lower_pshufb (Xmm RegMem) Xmm) |
| (rule 1 (lower_pshufb src mask) |
| (if-let $true (use_ssse3)) |
| (x64_pshufb src mask)) |
| (rule (lower_pshufb src (RegMem.Reg mask)) |
| (libcall_2 (LibCall.X86Pshufb) src mask)) |
| (rule (lower_pshufb src (RegMem.Mem addr)) |
| (lower_pshufb src (x64_movdqu_load addr))) |
| |
| ;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; Remove the extractlane instruction, leaving the float where it is. The upper |
| ;; bits will remain unchanged; for correctness, this relies on Cranelift type |
| ;; checking to avoid using those bits. |
| (rule 3 (lower (has_type (ty_scalar_float _) (extractlane val 0))) |
| val) |
| |
| ;; `f32x4.extract_lane N` where `N != 0` |
| (rule 1 (lower (extractlane val @ (value_type $F32X4) (u8_from_uimm8 lane))) |
| (x64_pshufd val lane)) |
| |
| ;; `f64x2.extract_lane N` where `N != 0` (aka N == 1) |
| (rule (lower (extractlane val @ (value_type $F64X2) 1)) |
| (x64_pshufd val 0b11_10_11_10)) |
| |
| ;; `i8x16.extract_lane N` |
| ;; |
| ;; Note that without SSE4.1 a 16-bit lane extraction is performed instead and |
| ;; the result is then adjusted depending on whether the desired index is odd |
| ;; or even. |
| (rule 2 (lower (extractlane val @ (value_type ty @ $I8X16) (u8_from_uimm8 lane))) |
| (if-let $true (use_sse41)) |
| (x64_pextrb val lane)) |
| ;; extracting an odd lane has an extra shift-right |
| (rule 1 (lower (extractlane val @ (value_type ty @ $I8X16) (u8_from_uimm8 lane))) |
| (if-let 1 (u8_and lane 1)) |
| (x64_shr $I16 (x64_pextrw val (u8_shr lane 1)) (Imm8Reg.Imm8 8))) |
| ;; Extracting an even lane already has the desired lane in the lower bits. Note |
| ;; that having arbitrary upper bits in the returned register should be ok since |
| ;; all operators on the resulting `i8` type should work correctly regardless of |
| ;; the bits in the rest of the register. |
| (rule (lower (extractlane val @ (value_type ty @ $I8X16) (u8_from_uimm8 lane))) |
| (if-let 0 (u8_and lane 1)) |
| (x64_pextrw val (u8_shr lane 1))) |
| |
| ;; `i16x8.extract_lane N` |
| (rule (lower (extractlane val @ (value_type ty @ $I16X8) (u8_from_uimm8 lane))) |
| (x64_pextrw val lane)) |
| |
| ;; `i32x4.extract_lane N` |
| (rule 2 (lower (extractlane val @ (value_type ty @ $I32X4) (u8_from_uimm8 lane))) |
| (if-let $true (use_sse41)) |
| (x64_pextrd val lane)) |
| (rule 1 (lower (extractlane val @ (value_type $I32X4) 0)) |
| (x64_movd_to_gpr val)) |
| (rule (lower (extractlane val @ (value_type $I32X4) (u8_from_uimm8 n))) |
| (x64_movd_to_gpr (x64_pshufd val n))) |
| |
| ;; `i64x2.extract_lane N` |
| (rule 1 (lower (extractlane val @ (value_type $I64X2) (u8_from_uimm8 lane))) |
| (if-let $true (use_sse41)) |
| (x64_pextrq val lane)) |
| (rule (lower (extractlane val @ (value_type $I64X2) 0)) |
| (x64_movq_to_gpr val)) |
| (rule (lower (extractlane val @ (value_type $I64X2) 1)) |
| (x64_movq_to_gpr (x64_pshufd val 0b00_00_11_10))) |
| |
| ;; Rules for `scalar_to_vector` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; Case 1: when moving a scalar float, we simply move from one XMM register |
| ;; to another, expecting the register allocator to elide this. Here we |
| ;; assume that the upper bits of a scalar float have not been munged |
| ;; (the same assumption the old backend makes). |
| (rule 1 (lower (scalar_to_vector src @ (value_type (ty_scalar_float _)))) |
| src) |
| |
| ;; Case 2: when moving a scalar value of any other type, use MOVD to zero |
| ;; the upper lanes. |
| (rule (lower (scalar_to_vector src @ (value_type ty))) |
| (bitcast_gpr_to_xmm ty src)) |
| |
| ;; Case 3: when presented with `load + scalar_to_vector`, coalesce into a single |
| ;; MOVSS/MOVSD instruction. |
| (rule 2 (lower (scalar_to_vector (and (sinkable_load src) (value_type (ty_32 _))))) |
| (x64_movss_load src)) |
| (rule 3 (lower (scalar_to_vector (and (sinkable_load src) (value_type (ty_64 _))))) |
| (x64_movsd_load src)) |
| |
| ;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; For all the splat rules below, one of the goals is that splatting a value |
| ;; doesn't end up accidentally depending on the previous value in a register. |
| ;; This means that instructions are chosen to avoid false dependencies, where |
| ;; new values are created fresh or otherwise overwrite previous register |
| ;; contents where possible. |
| ;; |
| ;; Additionally, splats of loads are special-cased, since a load-and-splat has |
| ;; a number of micro-optimizations available. |
| |
| ;; i8x16 splats: use `vpbroadcastb` on AVX2; otherwise, with SSSE3, `pshufb` |
| ;; with an all-zeros mask (computed by xor'ing a register with itself) |
| ;; broadcasts the low byte to every lane. |
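| ;; |
| ;; Without SSSE3 the base case below builds the broadcast out of SSE2 |
| ;; shuffles: `punpcklbw src src` duplicates the low byte into the low 16-bit |
| ;; lane, `pshuflw ... 0` copies that lane across the low four 16-bit lanes, |
| ;; and `pshufd ... 0` then copies the low 32-bit lane across the whole |
| ;; register. |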
| (rule 0 (lower (has_type $I8X16 (splat src))) |
| (let ((src Xmm (x64_movd_to_xmm src))) |
| (x64_pshufd (x64_pshuflw (x64_punpcklbw src src) 0) 0))) |
| (rule 1 (lower (has_type $I8X16 (splat src))) |
| (if-let $true (use_ssse3)) |
| (x64_pshufb (bitcast_gpr_to_xmm $I32 src) (xmm_zero $I8X16))) |
| (rule 2 (lower (has_type $I8X16 (splat src))) |
| (if-let $true (use_avx2)) |
| (x64_vpbroadcastb (bitcast_gpr_to_xmm $I32 src))) |
| (rule 3 (lower (has_type $I8X16 (splat (sinkable_load_exact addr)))) |
| (if-let $true (use_sse41)) |
| (if-let $true (use_ssse3)) |
| (x64_pshufb (x64_pinsrb (xmm_uninit_value) addr 0) (xmm_zero $I8X16))) |
| (rule 4 (lower (has_type $I8X16 (splat (sinkable_load_exact addr)))) |
| (if-let $true (use_avx2)) |
| (x64_vpbroadcastb addr)) |
| |
| ;; i16x8 splats: use `vpbroadcastw` on AVX2 and otherwise a 16-bit value is |
| ;; loaded into an xmm register, `pshuflw` broadcasts the low 16-bit lane |
| ;; to the low four lanes, and `pshufd` broadcasts the low 32-bit lane (which |
| ;; at that point is two of the 16-bit values we want to broadcast) to all the |
| ;; lanes. |
| (rule 0 (lower (has_type $I16X8 (splat src))) |
| (x64_pshufd (x64_pshuflw (bitcast_gpr_to_xmm $I32 src) 0) 0)) |
| (rule 1 (lower (has_type $I16X8 (splat src))) |
| (if-let $true (use_avx2)) |
| (x64_vpbroadcastw (bitcast_gpr_to_xmm $I32 src))) |
| (rule 2 (lower (has_type $I16X8 (splat (sinkable_load_exact addr)))) |
| (x64_pshufd (x64_pshuflw (x64_pinsrw (xmm_uninit_value) addr 0) 0) 0)) |
| (rule 3 (lower (has_type $I16X8 (splat (sinkable_load_exact addr)))) |
| (if-let $true (use_avx2)) |
| (x64_vpbroadcastw addr)) |
| |
| ;; i32x4.splat - use `vpbroadcastd` on AVX2 and otherwise `pshufd` can be |
| ;; used to broadcast the low lane to all other lanes. |
| ;; |
| ;; Note that sinkable-load cases come later |
| (rule 0 (lower (has_type $I32X4 (splat src))) |
| (x64_pshufd (bitcast_gpr_to_xmm $I32 src) 0)) |
| (rule 1 (lower (has_type $I32X4 (splat src))) |
| (if-let $true (use_avx2)) |
| (x64_vpbroadcastd (bitcast_gpr_to_xmm $I32 src))) |
| |
| ;; f32x4.splat - the source is already in an xmm register so `shufps` is all |
| ;; that's necessary to complete the splat. This is specialized to `vbroadcastss` |
| ;; on AVX2 to leverage that specific instruction for this operation. |
| (rule 0 (lower (has_type $F32X4 (splat src))) |
| (let ((tmp Xmm src)) |
| (x64_shufps tmp tmp 0))) |
| (rule 1 (lower (has_type $F32X4 (splat src))) |
| (if-let $true (use_avx2)) |
| (x64_vbroadcastss src)) |
| |
| ;; t32x4.splat of a load - use a `movss` to load into an xmm register and then |
| ;; `shufps` broadcasts to the other lanes. Note that this is used for both i32 |
| ;; and f32 splats. |
| ;; |
| ;; With AVX the `vbroadcastss` instruction suits this purpose precisely. Note |
| ;; that the memory-operand encoding of `vbroadcastss` is usable with AVX, but |
| ;; the register-based encoding is only available with AVX2. With the |
| ;; `sinkable_load` extractor this should be guaranteed to use the memory-based |
| ;; encoding hence the `use_avx` test. |
| (rule 5 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr)))) |
| (let ((tmp Xmm (x64_movss_load addr))) |
| (x64_shufps tmp tmp 0))) |
| (rule 6 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr)))) |
| (if-let $true (use_avx)) |
| (x64_vbroadcastss addr)) |
| |
| ;; t64x2.splat - use `pshufd` to broadcast the lower 64-bit lane to the upper |
| ;; lane. A minor specialization for sinkable loads to avoid going through a gpr |
| ;; for i64 splats is used as well when `movddup` is available. |
| (rule 0 (lower (has_type $I64X2 (splat src))) |
| (x64_pshufd (bitcast_gpr_to_xmm $I64 src) 0b01_00_01_00)) |
| (rule 0 (lower (has_type $F64X2 (splat src))) |
| (x64_pshufd src 0b01_00_01_00)) |
| (rule 6 (lower (has_type (multi_lane 64 2) (splat (sinkable_load addr)))) |
| (if-let $true (use_ssse3)) |
| (x64_movddup addr)) |
| |
| ;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule 1 (lower (vany_true val)) |
| (if-let $true (use_sse41)) |
| (let ((val Xmm val)) |
| (with_flags (x64_ptest val val) (x64_setcc (CC.NZ))))) |
| |
| ;; Any nonzero byte in `val` means that some lane is true. Compare `val` |
| ;; byte-wise against a zeroed register and extract the high bit of each |
| ;; comparison into a gpr mask. If the mask is 0xffff then every byte was equal |
| ;; to zero, so `vany_true` is the result of testing the mask against 0xffff |
| ;; for inequality (the NZ condition). |
| (rule (lower (vany_true val)) |
| (let ( |
| (any_byte_zero Xmm (x64_pcmpeqb val (xmm_zero $I8X16))) |
| (mask Gpr (x64_pmovmskb (OperandSize.Size32) any_byte_zero)) |
| ) |
| (with_flags (x64_cmp (OperandSize.Size32) (RegMemImm.Imm 0xffff) mask) |
| (x64_setcc (CC.NZ))))) |
| |
| ;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule 1 (lower (vall_true val @ (value_type ty))) |
| (if-let $true (use_sse41)) |
| (let ((src Xmm val) |
| (zeros Xmm (xmm_zero ty)) |
| (cmp Xmm (x64_pcmpeq (vec_int_type ty) src zeros))) |
| (with_flags (x64_ptest cmp cmp) (x64_setcc (CC.Z))))) |
| |
| ;; Perform an appropriately-sized lane-wise comparison with zero. If the |
| ;; result is all 0s then all of them are true because nothing was equal to |
| ;; zero. |
| (rule (lower (vall_true val @ (value_type ty))) |
| (let ((lanes_with_zero Xmm (x64_pcmpeq (vec_int_type ty) val (xmm_zero ty))) |
| (mask Gpr (x64_pmovmskb (OperandSize.Size32) lanes_with_zero))) |
| (with_flags (x64_test (OperandSize.Size32) mask mask) |
| (x64_setcc (CC.Z))))) |
| |
| ;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; The Intel specification allows using both 32-bit and 64-bit GPRs as |
| ;; destination for the "move mask" instructions. This is controlled by the REX.R |
| ;; bit: "In 64-bit mode, the instruction can access additional registers when |
| ;; used with a REX.R prefix. The default operand size is 64-bit in 64-bit mode" |
| ;; (PMOVMSKB in IA Software Development Manual, vol. 2). This being the case, we |
| ;; will always clear REX.W since its use is unnecessary (`OperandSize` is used |
| ;; for setting/clearing REX.W) as we need at most 16 bits of output for |
| ;; `vhigh_bits`. |
| |
| (rule (lower (vhigh_bits val @ (value_type (multi_lane 8 16)))) |
| (x64_pmovmskb (OperandSize.Size32) val)) |
| |
| (rule (lower (vhigh_bits val @ (value_type (multi_lane 32 4)))) |
| (x64_movmskps (OperandSize.Size32) val)) |
| |
| (rule (lower (vhigh_bits val @ (value_type (multi_lane 64 2)))) |
| (x64_movmskpd (OperandSize.Size32) val)) |
| |
| ;; There is no x86 instruction for extracting the high bit of 16-bit lanes so |
| ;; here we: |
| ;; - narrow the 16-bit lanes of `src` to 8-bit lanes with signed saturation |
| ;;   (which preserves each lane's sign bit), duplicated across both halves: |
| ;;   PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...] |
| ;; - use PMOVMSKB to gather the high bits; now we have each bit twice, though |
| ;; - shift right by 8 to discard the duplicated low half of the mask. |
| (rule (lower (vhigh_bits val @ (value_type (multi_lane 16 8)))) |
| (let ((src Xmm val) |
| (tmp Xmm (x64_packsswb src src)) |
| (tmp Gpr (x64_pmovmskb (OperandSize.Size32) tmp))) |
| (x64_shr $I64 tmp (Imm8Reg.Imm8 8)))) |
| |
| ;; Rules for `iconcat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (iconcat lo @ (value_type $I64) hi)) |
| (value_regs lo hi)) |
| |
| ;; Rules for `isplit` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (isplit val @ (value_type $I128))) |
| (let ((regs ValueRegs val) |
| (lo Reg (value_regs_get regs 0)) |
| (hi Reg (value_regs_get regs 1))) |
| (output_pair lo hi))) |
| |
| ;; Rules for `tls_value` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (has_type (tls_model (TlsModel.ElfGd)) (tls_value (symbol_value_data name _ _)))) |
| (elf_tls_get_addr name)) |
| |
| (rule (lower (has_type (tls_model (TlsModel.Macho)) (tls_value (symbol_value_data name _ _)))) |
| (macho_tls_get_addr name)) |
| |
| (rule (lower (has_type (tls_model (TlsModel.Coff)) (tls_value (symbol_value_data name _ _)))) |
| (coff_tls_get_addr name)) |
| |
| ;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
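| ;; `pmulhrsw` computes ((x * y + 0x4000) >> 15) in each 16-bit lane, but the |
| ;; one input combination that overflows 16 bits (both lanes being -0x8000, |
| ;; whose true result is +0x8000) wraps to -0x8000 instead of saturating to |
| ;; 0x7fff as `sqmul_round_sat` requires. The `pcmpeqw`/`pxor` pair below |
| ;; patches that case up: lanes equal to 0x8000 after the multiply compare to |
| ;; all-ones, and xor'ing with all-ones flips 0x8000 into the saturated 0x7fff. |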
| (rule 1 (lower (sqmul_round_sat qx @ (value_type $I16X8) qy)) |
| (if-let $true (use_ssse3)) |
| (let ((src1 Xmm qx) |
| (src2 Xmm qy) |
| |
| (mask XmmMem (emit_u128_le_const 0x8000_8000_8000_8000_8000_8000_8000_8000)) |
| (dst Xmm (x64_pmulhrsw src1 src2)) |
| (cmp Xmm (x64_pcmpeqw dst mask))) |
| (x64_pxor dst cmp))) |
| |
| ;; This operation is defined in wasm as: |
| ;; |
| ;; S.SignedSaturate((x * y + 0x4000) >> 15) |
| ;; |
| ;; so, lacking the native `pmulhrsw` instruction (it requires SSSE3), perform |
| ;; all of those operations manually below. |
| (rule (lower (sqmul_round_sat qx @ (value_type $I16X8) qy)) |
| (let ( |
| (qx Xmm qx) |
| (qy Xmm qy) |
| ;; Multiply `qx` and `qy` generating 32-bit intermediate results. The |
| ;; 32-bit results have their low-halves stored in `mul_lsb` and the |
| ;; high halves are stored in `mul_msb`. These are then shuffled into |
| ;; `mul_lo` and `mul_hi` which represent the low 4 multiplications |
| ;; and the upper 4 multiplications. |
| (mul_lsb Xmm (x64_pmullw qx qy)) |
| (mul_msb Xmm (x64_pmulhw qx qy)) |
| (mul_lo Xmm (x64_punpcklwd mul_lsb mul_msb)) |
| (mul_hi Xmm (x64_punpckhwd mul_lsb mul_msb)) |
| ;; Add the 0x4000 constant to all multiplications |
| (val Xmm (x64_movdqu_load (emit_u128_le_const 0x00004000_00004000_00004000_00004000))) |
| (mul_lo Xmm (x64_paddd mul_lo val)) |
| (mul_hi Xmm (x64_paddd mul_hi val)) |
| ;; Perform the right-shift by 15 to all multiplications |
| (lo Xmm (x64_psrad mul_lo (xmi_imm 15))) |
| (hi Xmm (x64_psrad mul_hi (xmi_imm 15))) |
| ) |
| ;; And finally perform a saturating 32-to-16-bit conversion. |
| (x64_packssdw lo hi))) |
| |
| ;; Rules for `x86_pmulhrsw` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (x86_pmulhrsw qx @ (value_type $I16X8) qy)) |
| (if-let $true (use_ssse3)) |
| (x64_pmulhrsw qx qy)) |
| |
| ;; Rules for `uunarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| ;; TODO: currently we only lower a special case of `uunarrow` needed to support |
| ;; the translation of wasm's i32x4.trunc_sat_f64x2_u_zero operation. |
| ;; https://github.com/bytecodealliance/wasmtime/issues/4791 |
| ;; |
| ;; y = i32x4.trunc_sat_f64x2_u_zero(x) is lowered to: |
| ;; MOVAPD xmm_y, xmm_x |
| ;; XORPD xmm_tmp, xmm_tmp |
| ;; MAXPD xmm_y, xmm_tmp |
| ;; MINPD xmm_y, [wasm_f64x2_splat(4294967295.0)] |
| ;; ROUNDPD xmm_y, xmm_y, 0x0B |
| ;; ADDPD xmm_y, [wasm_f64x2_splat(0x1.0p+52)] |
| ;; SHUFPS xmm_y, xmm_tmp, 0x88 |
| (rule (lower (uunarrow (fcvt_to_uint_sat src @ (value_type $F64X2)) |
| (vconst (u128_from_constant 0)))) |
| (let ((src Xmm src) |
| |
| ;; MOVAPD xmm_y, xmm_x |
| ;; XORPD xmm_tmp, xmm_tmp |
| (zeros Xmm (xmm_zero $F64X2)) |
| (dst Xmm (x64_maxpd src zeros)) |
| |
| ;; 4294967295.0 is equivalent to 0x41EFFFFFFFE00000 |
| (umax_mask XmmMem (emit_u128_le_const 0x41EFFFFFFFE00000_41EFFFFFFFE00000)) |
| |
| ;; MINPD xmm_y, [wasm_f64x2_splat(4294967295.0)] |
| (dst Xmm (x64_minpd dst umax_mask)) |
| |
| ;; ROUNDPD xmm_y, xmm_y, 0x0B |
| (dst Xmm (x64_round $F64X2 dst (RoundImm.RoundZero))) |
| |
| ;; ADDPD xmm_y, [wasm_f64x2_splat(0x1.0p+52)] |
| (uint_mask XmmMem (emit_u128_le_const 0x4330000000000000_4330000000000000)) |
| |
| (dst Xmm (x64_addpd dst uint_mask))) |
| |
| ;; SHUFPS xmm_y, xmm_tmp, 0x88 |
| (x64_shufps dst zeros 0x88))) |
| |
| ;; Rules for `nop` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| (rule (lower (nop)) |
| (invalid_reg)) |