use crate::binemit::{Addend, Reloc};
use crate::ir;
use crate::ir::immediates::{Ieee32, Ieee64};
use crate::ir::TrapCode;
use crate::ir::{KnownSymbol, LibCall, MemFlags};
use crate::isa::x64::encoding::evex::{EvexInstruction, EvexVectorLength, RegisterOrAmode};
use crate::isa::x64::encoding::rex::{
emit_simm, emit_std_enc_enc, emit_std_enc_mem, emit_std_reg_mem, emit_std_reg_reg, int_reg_enc,
low8_will_sign_extend_to_32, low8_will_sign_extend_to_64, reg_enc, LegacyPrefixes, OpcodeMap,
RexFlags,
};
use crate::isa::x64::encoding::vex::{VexInstruction, VexVectorLength};
use crate::isa::x64::inst::args::*;
use crate::isa::x64::inst::*;
use crate::machinst::{inst_common, MachBuffer, MachInstEmit, MachLabel, Reg, Writable};
use core::convert::TryInto;
/// A small helper to generate a signed conversion instruction.
fn emit_signed_cvt(
sink: &mut MachBuffer<Inst>,
info: &EmitInfo,
state: &mut EmitState,
// Required to be RealRegs.
src: Reg,
dst: Writable<Reg>,
to_f64: bool,
) {
// Handle an unsigned int, which is the "easy" case: a signed conversion will do the
// right thing.
let op = if to_f64 {
SseOpcode::Cvtsi2sd
} else {
SseOpcode::Cvtsi2ss
};
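// Note: `dst` doubles as `src1` below because cvtsi2ss/cvtsi2sd only write the
// low lane of the destination XMM register, so the conversion is modeled as a
// read-modify-write of `dst`.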
Inst::CvtIntToFloat {
op,
dst: Writable::from_reg(Xmm::new(dst.to_reg()).unwrap()),
src1: Xmm::new(dst.to_reg()).unwrap(),
src2: GprMem::new(RegMem::reg(src)).unwrap(),
src2_size: OperandSize::Size64,
}
.emit(&[], sink, info, state);
}
/// Emits a one-way conditional jump if CC is set (true).
fn one_way_jmp(sink: &mut MachBuffer<Inst>, cc: CC, label: MachLabel) {
let cond_start = sink.cur_offset();
let cond_disp_off = cond_start + 2;
sink.use_label_at_offset(cond_disp_off, label, LabelUse::JmpRel32);
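// 0F 80+cc is the near Jcc rel32 form; the displacement starts two bytes in
// (hence `cond_start + 2` above), and the zeroed 4-byte field emitted below is
// patched later through the label use recorded here.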
sink.put1(0x0F);
sink.put1(0x80 + cc.get_enc());
sink.put4(0x0);
}
/// Emits a relocation at the current offset in the instruction buffer.
fn emit_reloc(sink: &mut MachBuffer<Inst>, kind: Reloc, name: &ExternalName, addend: Addend) {
sink.add_reloc(kind, name, addend);
}
/// The top-level emit function.
///
/// Important! Do not add improved (shortened) encoding cases to existing
/// instructions without also adding tests for those improved encodings. That
/// is a dangerous game that leads to hard-to-track-down errors in the emitted
/// code.
///
/// For all instructions, make sure to have test coverage for all of the
/// following situations. Do this by creating the cross product resulting from
/// applying the following rules to each operand:
///
/// (1) for any insn that mentions a register: one test using a register from
/// the group [rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi] and a second one
/// using a register from the group [r8, r9, r10, r11, r12, r13, r14, r15].
/// This helps detect incorrect REX prefix construction.
///
/// (2) for any insn that mentions a byte register: one test for each of the
/// four encoding groups [al, cl, dl, bl], [spl, bpl, sil, dil],
/// [r8b .. r11b] and [r12b .. r15b]. This checks that
/// apparently-redundant REX prefixes are retained when required.
///
/// (3) for any insn that contains an immediate field, check the following
/// cases: field is zero, field is in simm8 range (-128 .. 127), field is
/// in simm32 range (-0x8000_0000 .. 0x7FFF_FFFF). This is because some
/// instructions that require a 32-bit immediate have a short-form encoding
/// when the imm is in simm8 range.
///
/// Rules (1), (2) and (3) don't apply for registers within address expressions
/// (`Addr`s). Those are already pretty well tested, and the registers in them
/// don't have any effect on the containing instruction (apart from possibly
/// requiring REX prefix bits).
///
/// When choosing registers for a test, avoid using registers with the same
/// offset within a given group. For example, don't use rax and r8, since they
/// both have the lowest 3 bits as 000, and so the test won't detect errors
/// where those 3-bit register sub-fields are confused by the emitter. Instead
/// use (eg) rax (lo3 = 000) and r9 (lo3 = 001). Similarly, don't use (eg) cl
/// and bpl since they have the same offset in their group; use instead (eg) cl
/// and sil.
///
/// For all instructions, also add a test that uses only low-half registers
/// (rax .. rdi, xmm0 .. xmm7) etc, so as to check that any redundant REX
/// prefixes are correctly omitted. This low-half restriction must apply to
/// _all_ registers in the insn, even those in address expressions.
///
/// Following these rules creates large numbers of test cases, but it's the
/// only way to make the emitter reliable.
///
/// Known possible improvements:
///
/// * there's a shorter encoding for shl/shr/sar by a 1-bit immediate. (Do we
/// care?)
pub(crate) fn emit(
inst: &Inst,
allocs: &mut AllocationConsumer<'_>,
sink: &mut MachBuffer<Inst>,
info: &EmitInfo,
state: &mut EmitState,
) {
let matches_isa_flags = |iset_requirement: &InstructionSet| -> bool {
match iset_requirement {
// Cranelift assumes SSE2 at least.
InstructionSet::SSE | InstructionSet::SSE2 => true,
InstructionSet::SSSE3 => info.isa_flags.use_ssse3(),
InstructionSet::SSE41 => info.isa_flags.use_sse41(),
InstructionSet::SSE42 => info.isa_flags.use_sse42(),
InstructionSet::Popcnt => info.isa_flags.use_popcnt(),
InstructionSet::Lzcnt => info.isa_flags.use_lzcnt(),
InstructionSet::BMI1 => info.isa_flags.use_bmi1(),
InstructionSet::BMI2 => info.isa_flags.has_bmi2(),
InstructionSet::FMA => info.isa_flags.has_fma(),
InstructionSet::AVX => info.isa_flags.has_avx(),
InstructionSet::AVX2 => info.isa_flags.has_avx2(),
InstructionSet::AVX512BITALG => info.isa_flags.has_avx512bitalg(),
InstructionSet::AVX512DQ => info.isa_flags.has_avx512dq(),
InstructionSet::AVX512F => info.isa_flags.has_avx512f(),
InstructionSet::AVX512VBMI => info.isa_flags.has_avx512vbmi(),
InstructionSet::AVX512VL => info.isa_flags.has_avx512vl(),
}
};
// Certain instructions may be present in more than one ISA feature set; we must at least match
// one of them in the target CPU.
let isa_requirements = inst.available_in_any_isa();
if !isa_requirements.is_empty() && !isa_requirements.iter().all(matches_isa_flags) {
panic!(
"Cannot emit inst '{:?}' for target; failed to match ISA requirements: {:?}",
inst, isa_requirements
)
}
match inst {
Inst::AluRmiR {
size,
op,
src1,
src2,
dst: reg_g,
} => {
let src1 = allocs.next(src1.to_reg());
let reg_g = allocs.next(reg_g.to_reg().to_reg());
debug_assert_eq!(src1, reg_g);
let src2 = src2.clone().to_reg_mem_imm().with_allocs(allocs);
let prefix = if *size == OperandSize::Size16 {
LegacyPrefixes::_66
} else {
LegacyPrefixes::None
};
let mut rex = RexFlags::from(*size);
if *op == AluRmiROpcode::Mul {
// We kinda freeloaded Mul into RMI_R_Op, but it doesn't fit the usual pattern, so
// we have to special-case it.
if *size == OperandSize::Size8 {
match src2 {
RegMemImm::Reg { reg: reg_e } => {
debug_assert!(reg_e.is_real());
rex.always_emit_if_8bit_needed(reg_e);
let enc_e = int_reg_enc(reg_e);
emit_std_enc_enc(sink, LegacyPrefixes::None, 0xF6, 1, 5, enc_e, rex);
}
RegMemImm::Mem { addr } => {
let amode = addr.finalize(state, sink);
emit_std_enc_mem(
sink,
LegacyPrefixes::None,
0xF6,
1,
5,
&amode,
rex,
0,
);
}
RegMemImm::Imm { .. } => {
panic!("Cannot emit 8bit imul with 8bit immediate");
}
}
} else {
match src2 {
RegMemImm::Reg { reg: reg_e } => {
emit_std_reg_reg(sink, prefix, 0x0FAF, 2, reg_g, reg_e, rex);
}
RegMemImm::Mem { addr } => {
let amode = addr.finalize(state, sink);
emit_std_reg_mem(sink, prefix, 0x0FAF, 2, reg_g, &amode, rex, 0);
}
RegMemImm::Imm { simm32 } => {
let imm_size = if low8_will_sign_extend_to_32(simm32) {
1
} else {
if *size == OperandSize::Size16 {
2
} else {
4
}
};
let opcode = if imm_size == 1 { 0x6B } else { 0x69 };
// Yes, really, reg_g twice.
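// This is the three-operand form `imul r, r/m, imm` (0x6B with imm8, 0x69 with
// a full-width immediate); since the product goes back into reg_g, the same
// register is used for both the reg and r/m operands.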
emit_std_reg_reg(sink, prefix, opcode, 1, reg_g, reg_g, rex);
emit_simm(sink, imm_size, simm32);
}
}
}
} else {
let (opcode_r, opcode_m, subopcode_i) = match op {
AluRmiROpcode::Add => (0x01, 0x03, 0),
AluRmiROpcode::Adc => (0x11, 0x03, 0),
AluRmiROpcode::Sub => (0x29, 0x2B, 5),
AluRmiROpcode::Sbb => (0x19, 0x2B, 5),
AluRmiROpcode::And => (0x21, 0x23, 4),
AluRmiROpcode::Or => (0x09, 0x0B, 1),
AluRmiROpcode::Xor => (0x31, 0x33, 6),
AluRmiROpcode::Mul => panic!("unreachable"),
};
let (opcode_r, opcode_m) = if *size == OperandSize::Size8 {
(opcode_r - 1, opcode_m - 1)
} else {
(opcode_r, opcode_m)
};
if *size == OperandSize::Size8 {
debug_assert!(reg_g.is_real());
rex.always_emit_if_8bit_needed(reg_g);
}
match src2 {
RegMemImm::Reg { reg: reg_e } => {
if *size == OperandSize::Size8 {
debug_assert!(reg_e.is_real());
rex.always_emit_if_8bit_needed(reg_e);
}
// GCC/llvm use the swapped operand encoding (viz., the R/RM vs RM/R
// duality). Do this too, so as to be able to compare generated machine
// code easily.
emit_std_reg_reg(sink, prefix, opcode_r, 1, reg_e, reg_g, rex);
}
RegMemImm::Mem { addr } => {
let amode = addr.finalize(state, sink);
// Here we revert to the "normal" G-E ordering.
emit_std_reg_mem(sink, prefix, opcode_m, 1, reg_g, &amode, rex, 0);
}
RegMemImm::Imm { simm32 } => {
let imm_size = if *size == OperandSize::Size8 {
1
} else {
if low8_will_sign_extend_to_32(simm32) {
1
} else {
if *size == OperandSize::Size16 {
2
} else {
4
}
}
};
let opcode = if *size == OperandSize::Size8 {
0x80
} else if low8_will_sign_extend_to_32(simm32) {
0x83
} else {
0x81
};
// And also here we use the "normal" G-E ordering.
let enc_g = int_reg_enc(reg_g);
emit_std_enc_enc(sink, prefix, opcode, 1, subopcode_i, enc_g, rex);
emit_simm(sink, imm_size, simm32);
}
}
}
}
Inst::AluConstOp { op, size, dst } => {
let dst = allocs.next(dst.to_reg().to_reg());
emit(
&Inst::AluRmiR {
size: *size,
op: *op,
dst: Writable::from_reg(Gpr::new(dst).unwrap()),
src1: Gpr::new(dst).unwrap(),
src2: Gpr::new(dst).unwrap().into(),
},
allocs,
sink,
info,
state,
);
}
Inst::AluRM {
size,
src1_dst,
src2,
op,
} => {
let src2 = allocs.next(src2.to_reg());
let src1_dst = src1_dst.finalize(state, sink).with_allocs(allocs);
let opcode = match op {
AluRmiROpcode::Add => 0x01,
AluRmiROpcode::Sub => 0x29,
AluRmiROpcode::And => 0x21,
AluRmiROpcode::Or => 0x09,
AluRmiROpcode::Xor => 0x31,
_ => panic!("Unsupported read-modify-write ALU opcode"),
};
let prefix = if *size == OperandSize::Size16 {
LegacyPrefixes::_66
} else {
LegacyPrefixes::None
};
let opcode = if *size == OperandSize::Size8 {
opcode - 1
} else {
opcode
};
let mut rex = RexFlags::from(*size);
if *size == OperandSize::Size8 {
debug_assert!(src2.is_real());
rex.always_emit_if_8bit_needed(src2);
}
let enc_g = int_reg_enc(src2);
emit_std_enc_mem(sink, prefix, opcode, 1, enc_g, &src1_dst, rex, 0);
}
Inst::AluRmRVex {
size,
op,
dst,
src1,
src2,
} => {
use AluRmROpcode::*;
use LegacyPrefixes as LP;
let dst = allocs.next(dst.to_reg().to_reg());
let src1 = allocs.next(src1.to_reg());
let src2 = match src2.clone().to_reg_mem().with_allocs(allocs) {
RegMem::Reg { reg } => {
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
}
RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
};
let w = match size {
OperandSize::Size32 => false,
OperandSize::Size64 => true,
// the other cases would be rejected by isle constructors
_ => unreachable!(),
};
let (prefix, opcode) = match op {
Andn => (LP::None, 0xf2),
Sarx => (LP::_F3, 0xf7),
Shrx => (LP::_F2, 0xf7),
Shlx => (LP::_66, 0xf7),
Bzhi => (LP::None, 0xf5),
};
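// All of these are VEX-encoded BMI1/BMI2 instructions in the 0F38 opcode map;
// VEX.W selects the 32- vs 64-bit operand size and the legacy-prefix slot
// distinguishes the shift variants (SARX/SHRX/SHLX).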
VexInstruction::new()
.prefix(prefix)
.map(OpcodeMap::_0F38)
.w(w)
.reg(dst.to_real_reg().unwrap().hw_enc())
.vvvv(src1.to_real_reg().unwrap().hw_enc())
.rm(src2)
.opcode(opcode)
.encode(sink);
}
Inst::UnaryRmR { size, op, src, dst } => {
let dst = allocs.next(dst.to_reg().to_reg());
let rex_flags = RexFlags::from(*size);
use UnaryRmROpcode::*;
let prefix = match size {
OperandSize::Size16 => match op {
Bsr | Bsf => LegacyPrefixes::_66,
Lzcnt | Tzcnt | Popcnt => LegacyPrefixes::_66F3,
},
OperandSize::Size32 | OperandSize::Size64 => match op {
Bsr | Bsf => LegacyPrefixes::None,
Lzcnt | Tzcnt | Popcnt => LegacyPrefixes::_F3,
},
_ => unreachable!(),
};
let (opcode, num_opcodes) = match op {
Bsr => (0x0fbd, 2),
Bsf => (0x0fbc, 2),
Lzcnt => (0x0fbd, 2),
Tzcnt => (0x0fbc, 2),
Popcnt => (0x0fb8, 2),
};
match src.clone().into() {
RegMem::Reg { reg: src } => {
let src = allocs.next(src);
emit_std_reg_reg(sink, prefix, opcode, num_opcodes, dst, src, rex_flags);
}
RegMem::Mem { addr: src } => {
let amode = src.finalize(state, sink).with_allocs(allocs);
emit_std_reg_mem(sink, prefix, opcode, num_opcodes, dst, &amode, rex_flags, 0);
}
}
}
Inst::UnaryRmRVex { size, op, src, dst } => {
let dst = allocs.next(dst.to_reg().to_reg());
let src = match src.clone().to_reg_mem().with_allocs(allocs) {
RegMem::Reg { reg } => {
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
}
RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
};
let (opcode, opcode_ext) = match op {
UnaryRmRVexOpcode::Blsr => (0xF3, 1),
UnaryRmRVexOpcode::Blsmsk => (0xF3, 2),
UnaryRmRVexOpcode::Blsi => (0xF3, 3),
};
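// BLSR/BLSMSK/BLSI all share opcode 0xF3 in the VEX 0F38 map; the particular
// operation is selected by the ModRM reg field (1, 2, or 3) and the
// destination register is carried in VEX.vvvv.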
VexInstruction::new()
.map(OpcodeMap::_0F38)
.w(*size == OperandSize::Size64)
.opcode(opcode)
.reg(opcode_ext)
.vvvv(dst.to_real_reg().unwrap().hw_enc())
.rm(src)
.encode(sink);
}
Inst::UnaryRmRImmVex {
size,
op,
src,
dst,
imm,
} => {
let dst = allocs.next(dst.to_reg().to_reg());
let src = match src.clone().to_reg_mem().with_allocs(allocs) {
RegMem::Reg { reg } => {
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
}
RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
};
let opcode = match op {
UnaryRmRImmVexOpcode::Rorx => 0xF0,
};
VexInstruction::new()
.prefix(LegacyPrefixes::_F2)
.map(OpcodeMap::_0F3A)
.w(*size == OperandSize::Size64)
.opcode(opcode)
.reg(dst.to_real_reg().unwrap().hw_enc())
.rm(src)
.imm(*imm)
.encode(sink);
}
Inst::Not { size, src, dst } => {
let src = allocs.next(src.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
debug_assert_eq!(src, dst);
let rex_flags = RexFlags::from((*size, dst));
let (opcode, prefix) = match size {
OperandSize::Size8 => (0xF6, LegacyPrefixes::None),
OperandSize::Size16 => (0xF7, LegacyPrefixes::_66),
OperandSize::Size32 => (0xF7, LegacyPrefixes::None),
OperandSize::Size64 => (0xF7, LegacyPrefixes::None),
};
let subopcode = 2;
let enc_src = int_reg_enc(dst);
emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, enc_src, rex_flags)
}
Inst::Neg { size, src, dst } => {
let src = allocs.next(src.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
debug_assert_eq!(src, dst);
let rex_flags = RexFlags::from((*size, dst));
let (opcode, prefix) = match size {
OperandSize::Size8 => (0xF6, LegacyPrefixes::None),
OperandSize::Size16 => (0xF7, LegacyPrefixes::_66),
OperandSize::Size32 => (0xF7, LegacyPrefixes::None),
OperandSize::Size64 => (0xF7, LegacyPrefixes::None),
};
let subopcode = 3;
let enc_src = int_reg_enc(dst);
emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, enc_src, rex_flags)
}
Inst::Div {
sign,
trap,
divisor,
..
}
| Inst::Div8 {
sign,
trap,
divisor,
..
} => {
let divisor = divisor.clone().to_reg_mem().with_allocs(allocs);
let size = match inst {
Inst::Div {
size,
dividend_lo,
dividend_hi,
dst_quotient,
dst_remainder,
..
} => {
let dividend_lo = allocs.next(dividend_lo.to_reg());
let dividend_hi = allocs.next(dividend_hi.to_reg());
let dst_quotient = allocs.next(dst_quotient.to_reg().to_reg());
let dst_remainder = allocs.next(dst_remainder.to_reg().to_reg());
debug_assert_eq!(dividend_lo, regs::rax());
debug_assert_eq!(dividend_hi, regs::rdx());
debug_assert_eq!(dst_quotient, regs::rax());
debug_assert_eq!(dst_remainder, regs::rdx());
*size
}
Inst::Div8 { dividend, dst, .. } => {
let dividend = allocs.next(dividend.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
debug_assert_eq!(dividend, regs::rax());
debug_assert_eq!(dst, regs::rax());
OperandSize::Size8
}
_ => unreachable!(),
};
let (opcode, prefix) = match size {
OperandSize::Size8 => (0xF6, LegacyPrefixes::None),
OperandSize::Size16 => (0xF7, LegacyPrefixes::_66),
OperandSize::Size32 => (0xF7, LegacyPrefixes::None),
OperandSize::Size64 => (0xF7, LegacyPrefixes::None),
};
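// Register the trap at the offset of the `div`/`idiv` itself: a hardware #DE
// fault there (division by zero, or overflow for signed division) is reported
// as this trap code.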
sink.add_trap(*trap);
let subopcode = match sign {
DivSignedness::Signed => 7,
DivSignedness::Unsigned => 6,
};
match divisor {
RegMem::Reg { reg } => {
let src = int_reg_enc(reg);
emit_std_enc_enc(
sink,
prefix,
opcode,
1,
subopcode,
src,
RexFlags::from((size, reg)),
)
}
RegMem::Mem { addr: src } => {
let amode = src.finalize(state, sink);
emit_std_enc_mem(
sink,
prefix,
opcode,
1,
subopcode,
&amode,
RexFlags::from(size),
0,
);
}
}
}
Inst::MulHi {
size,
signed,
src1,
src2,
dst_lo,
dst_hi,
} => {
let src1 = allocs.next(src1.to_reg());
let dst_lo = allocs.next(dst_lo.to_reg().to_reg());
let dst_hi = allocs.next(dst_hi.to_reg().to_reg());
debug_assert_eq!(src1, regs::rax());
debug_assert_eq!(dst_lo, regs::rax());
debug_assert_eq!(dst_hi, regs::rdx());
let rex_flags = RexFlags::from(*size);
let prefix = match size {
OperandSize::Size16 => LegacyPrefixes::_66,
OperandSize::Size32 => LegacyPrefixes::None,
OperandSize::Size64 => LegacyPrefixes::None,
_ => unreachable!(),
};
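// One-operand widening multiply: F7 /4 is unsigned `mul`, F7 /5 is signed
// `imul`; the double-width result is written to rdx:rax.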
let subopcode = if *signed { 5 } else { 4 };
match src2.clone().to_reg_mem() {
RegMem::Reg { reg } => {
let reg = allocs.next(reg);
let src = int_reg_enc(reg);
emit_std_enc_enc(sink, prefix, 0xF7, 1, subopcode, src, rex_flags)
}
RegMem::Mem { addr: src } => {
let amode = src.finalize(state, sink).with_allocs(allocs);
emit_std_enc_mem(sink, prefix, 0xF7, 1, subopcode, &amode, rex_flags, 0);
}
}
}
Inst::UMulLo {
size,
src1,
src2,
dst,
} => {
let src1 = allocs.next(src1.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
debug_assert_eq!(src1, regs::rax());
debug_assert_eq!(dst, regs::rax());
let mut rex = RexFlags::from(*size);
let prefix = match size {
OperandSize::Size16 => LegacyPrefixes::_66,
_ => LegacyPrefixes::None,
};
let opcode = if *size == OperandSize::Size8 {
0xF6
} else {
0xF7
};
match src2.clone().to_reg_mem() {
RegMem::Reg { reg } => {
let reg = allocs.next(reg);
if *size == OperandSize::Size8 {
rex.always_emit_if_8bit_needed(reg);
}
let reg_e = int_reg_enc(reg);
emit_std_enc_enc(sink, prefix, opcode, 1, 4, reg_e, rex);
}
RegMem::Mem { addr: src } => {
let amode = src.finalize(state, sink).with_allocs(allocs);
emit_std_enc_mem(sink, prefix, opcode, 1, 4, &amode, rex, 0);
}
}
}
Inst::SignExtendData { size, src, dst } => {
let src = allocs.next(src.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
debug_assert_eq!(src, regs::rax());
if *size == OperandSize::Size8 {
debug_assert_eq!(dst, regs::rax());
} else {
debug_assert_eq!(dst, regs::rdx());
}
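// These are the cbw/cwd/cdq/cqo family: 0x98 sign-extends within rax
// (al -> ax), while 0x99 sign-extends rax into rdx (cwd/cdq/cqo), with the
// operand size chosen by the 0x66 or REX.W prefix.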
match size {
OperandSize::Size8 => {
sink.put1(0x66);
sink.put1(0x98);
}
OperandSize::Size16 => {
sink.put1(0x66);
sink.put1(0x99);
}
OperandSize::Size32 => sink.put1(0x99),
OperandSize::Size64 => {
sink.put1(0x48);
sink.put1(0x99);
}
}
}
Inst::CheckedSRemSeq { divisor, .. } | Inst::CheckedSRemSeq8 { divisor, .. } => {
let divisor = allocs.next(divisor.to_reg());
// Validate that the register constraints of the dividend and the
// destination are all as expected.
let (dst, size) = match inst {
Inst::CheckedSRemSeq {
dividend_lo,
dividend_hi,
dst_quotient,
dst_remainder,
size,
..
} => {
let dividend_lo = allocs.next(dividend_lo.to_reg());
let dividend_hi = allocs.next(dividend_hi.to_reg());
let dst_quotient = allocs.next(dst_quotient.to_reg().to_reg());
let dst_remainder = allocs.next(dst_remainder.to_reg().to_reg());
debug_assert_eq!(dividend_lo, regs::rax());
debug_assert_eq!(dividend_hi, regs::rdx());
debug_assert_eq!(dst_quotient, regs::rax());
debug_assert_eq!(dst_remainder, regs::rdx());
(regs::rdx(), *size)
}
Inst::CheckedSRemSeq8 { dividend, dst, .. } => {
let dividend = allocs.next(dividend.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
debug_assert_eq!(dividend, regs::rax());
debug_assert_eq!(dst, regs::rax());
(regs::rax(), OperandSize::Size8)
}
_ => unreachable!(),
};
// Generates the following code sequence:
//
// cmp -1 %divisor
// jnz $do_op
//
// ;; for srem, result is 0
// mov #0, %dst
// j $done
//
// $do_op:
// idiv %divisor
//
// $done:
let do_op = sink.get_label();
let done_label = sink.get_label();
// Check if the divisor is -1, and if it isn't then immediately
// go to the `idiv`.
let inst = Inst::cmp_rmi_r(size, RegMemImm::imm(0xffffffff), divisor);
inst.emit(&[], sink, info, state);
one_way_jmp(sink, CC::NZ, do_op);
// ... otherwise the divisor is -1 and the result is always 0. This
// is written to the destination register which will be %rax for
// 8-bit srem and %rdx otherwise.
//
// Note that for 16-to-64-bit srem operations this leaves the
// second destination, %rax, unchanged. This isn't semantically
// correct if a lowering actually tries to use the `dst_quotient`
// output but for srem only the `dst_remainder` output is used for
// now.
let inst = Inst::imm(OperandSize::Size64, 0, Writable::from_reg(dst));
inst.emit(&[], sink, info, state);
let inst = Inst::jmp_known(done_label);
inst.emit(&[], sink, info, state);
// Here the `idiv` is executed, which is different depending on the
// size
sink.bind_label(do_op, state.ctrl_plane_mut());
let inst = match size {
OperandSize::Size8 => Inst::div8(
DivSignedness::Signed,
TrapCode::IntegerDivisionByZero,
RegMem::reg(divisor),
Gpr::new(regs::rax()).unwrap(),
Writable::from_reg(Gpr::new(regs::rax()).unwrap()),
),
_ => Inst::div(
size,
DivSignedness::Signed,
TrapCode::IntegerDivisionByZero,
RegMem::reg(divisor),
Gpr::new(regs::rax()).unwrap(),
Gpr::new(regs::rdx()).unwrap(),
Writable::from_reg(Gpr::new(regs::rax()).unwrap()),
Writable::from_reg(Gpr::new(regs::rdx()).unwrap()),
),
};
inst.emit(&[], sink, info, state);
sink.bind_label(done_label, state.ctrl_plane_mut());
}
Inst::Imm {
dst_size,
simm64,
dst,
} => {
let dst = allocs.next(dst.to_reg().to_reg());
let enc_dst = int_reg_enc(dst);
if *dst_size == OperandSize::Size64 {
if low32_will_sign_extend_to_64(*simm64) {
// Sign-extended move imm32.
emit_std_enc_enc(
sink,
LegacyPrefixes::None,
0xC7,
1,
/* subopcode */ 0,
enc_dst,
RexFlags::set_w(),
);
sink.put4(*simm64 as u32);
} else {
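// Otherwise fall back to the full `movabs` form: a REX prefix with W=1 (and B
// set for r8..r15), then B8+rd followed by the 8-byte immediate.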
sink.put1(0x48 | ((enc_dst >> 3) & 1));
sink.put1(0xB8 | (enc_dst & 7));
sink.put8(*simm64);
}
} else {
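// 32-bit destination: optional REX.B (0x41) for r8..r15, then B8+rd with a
// 4-byte immediate; writing the 32-bit register implicitly zeroes the upper
// half.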
if ((enc_dst >> 3) & 1) == 1 {
sink.put1(0x41);
}
sink.put1(0xB8 | (enc_dst & 7));
sink.put4(*simm64 as u32);
}
}
Inst::MovImmM { size, simm32, dst } => {
let dst = &dst.finalize(state, sink).with_allocs(allocs);
let default_rex = RexFlags::clear_w();
let default_opcode = 0xC7;
let bytes = size.to_bytes();
let prefix = LegacyPrefixes::None;
let (opcode, rex, size, prefix) = match *size {
// In the 8-bit case, we don't need to enforce REX flags via
// `always_emit_if_8bit_needed()` since the destination
// operand is a memory operand, not a possibly 8-bit register.
OperandSize::Size8 => (0xC6, default_rex, bytes, prefix),
OperandSize::Size16 => (0xC7, default_rex, bytes, LegacyPrefixes::_66),
OperandSize::Size64 => (default_opcode, RexFlags::from(*size), bytes, prefix),
_ => (default_opcode, default_rex, bytes, prefix),
};
// 8-bit C6 /0 ib
// 16-bit 0x66 C7 /0 iw
// 32-bit C7 /0 id
// 64-bit REX.W C7 /0 id
emit_std_enc_mem(sink, prefix, opcode, 1, /*subopcode*/ 0, dst, rex, 0);
emit_simm(sink, size, *simm32 as u32);
}
Inst::MovRR { size, src, dst } => {
let src = allocs.next(src.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
emit_std_reg_reg(
sink,
LegacyPrefixes::None,
0x89,
1,
src,
dst,
RexFlags::from(*size),
);
}
Inst::MovFromPReg { src, dst } => {
allocs.next_fixed_nonallocatable(*src);
let src: Reg = (*src).into();
debug_assert!([regs::rsp(), regs::rbp(), regs::pinned_reg()].contains(&src));
let src = Gpr::new(src).unwrap();
let size = OperandSize::Size64;
let dst = allocs.next(dst.to_reg().to_reg());
let dst = WritableGpr::from_writable_reg(Writable::from_reg(dst)).unwrap();
Inst::MovRR { size, src, dst }.emit(&[], sink, info, state);
}
Inst::MovToPReg { src, dst } => {
let src = allocs.next(src.to_reg());
let src = Gpr::new(src).unwrap();
allocs.next_fixed_nonallocatable(*dst);
let dst: Reg = (*dst).into();
debug_assert!([regs::rsp(), regs::rbp(), regs::pinned_reg()].contains(&dst));
let dst = WritableGpr::from_writable_reg(Writable::from_reg(dst)).unwrap();
let size = OperandSize::Size64;
Inst::MovRR { size, src, dst }.emit(&[], sink, info, state);
}
Inst::MovzxRmR { ext_mode, src, dst } => {
let dst = allocs.next(dst.to_reg().to_reg());
let (opcodes, num_opcodes, mut rex_flags) = match ext_mode {
ExtMode::BL => {
// MOVZBL is (REX.W==0) 0F B6 /r
(0x0FB6, 2, RexFlags::clear_w())
}
ExtMode::BQ => {
// MOVZBQ is (REX.W==1) 0F B6 /r
// I'm not sure why the Intel manual offers different
// encodings for MOVZBQ than for MOVZBL. AIUI they should
// achieve the same, since MOVZBL is just going to zero out
// the upper half of the destination anyway.
(0x0FB6, 2, RexFlags::set_w())
}
ExtMode::WL => {
// MOVZWL is (REX.W==0) 0F B7 /r
(0x0FB7, 2, RexFlags::clear_w())
}
ExtMode::WQ => {
// MOVZWQ is (REX.W==1) 0F B7 /r
(0x0FB7, 2, RexFlags::set_w())
}
ExtMode::LQ => {
// This is just a standard 32 bit load, and we rely on the
// default zero-extension rule to perform the extension.
// Note that in reg/reg mode, gcc seems to use the swapped form R/RM, which we
// don't do here, since it's the same encoding size.
// MOV r/m32, r32 is (REX.W==0) 8B /r
(0x8B, 1, RexFlags::clear_w())
}
};
match src.clone().to_reg_mem() {
RegMem::Reg { reg: src } => {
let src = allocs.next(src);
match ext_mode {
ExtMode::BL | ExtMode::BQ => {
// A redundant REX prefix must be emitted for certain register inputs.
rex_flags.always_emit_if_8bit_needed(src);
}
_ => {}
}
emit_std_reg_reg(
sink,
LegacyPrefixes::None,
opcodes,
num_opcodes,
dst,
src,
rex_flags,
)
}
RegMem::Mem { addr: src } => {
let src = &src.finalize(state, sink).with_allocs(allocs);
emit_std_reg_mem(
sink,
LegacyPrefixes::None,
opcodes,
num_opcodes,
dst,
src,
rex_flags,
0,
)
}
}
}
Inst::Mov64MR { src, dst } => {
let dst = allocs.next(dst.to_reg().to_reg());
let src = &src.finalize(state, sink).with_allocs(allocs);
emit_std_reg_mem(
sink,
LegacyPrefixes::None,
0x8B,
1,
dst,
src,
RexFlags::set_w(),
0,
)
}
Inst::LoadEffectiveAddress { addr, dst, size } => {
let dst = allocs.next(dst.to_reg().to_reg());
let amode = addr.finalize(state, sink).with_allocs(allocs);
// If this `lea` can actually get encoded as an `add` then do that
// instead. Currently all candidate `iadd`s become an `lea`
// pseudo-instruction here but maximizing the use of `lea` is not
// necessarily optimal. The `lea` instruction goes through dedicated
// address units on cores which are finite and disjoint from the
// general ALU, so if everything uses `lea` then those units can get
// saturated while leaving the ALU idle.
//
// To help make use of more parts of a cpu, this attempts to use
// `add` when it's semantically equivalent to `lea`, or otherwise
// when the `dst` register is the same as the `base` or `index`
// register.
//
// FIXME: ideally regalloc is informed of this constraint. Register
// allocation of `lea` should "attempt" to put the `base` in the
// same register as `dst` but not at the expense of generating a
// `mov` instruction. Currently that's not possible but perhaps one
// day it may be worth it.
match amode {
// If `base == dst` then this is `add $imm, %dst`, so encode
// that instead.
Amode::ImmReg {
simm32,
base,
flags: _,
} if base == dst => {
let inst = Inst::alu_rmi_r(
*size,
AluRmiROpcode::Add,
RegMemImm::imm(simm32 as u32),
Writable::from_reg(dst),
);
inst.emit(&[], sink, info, state);
}
// If the offset is 0 and the shift is 0 (meaning multiplication
// by 1) then:
//
// * If `base == dst`, then this is `add %index, %base`
// * If `index == dst`, then this is `add %base, %index`
//
// Encode the appropriate instruction here in that case.
Amode::ImmRegRegShift {
simm32: 0,
base,
index,
shift: 0,
flags: _,
} if base == dst || index == dst => {
let (dst, operand) = if base == dst {
(base, index)
} else {
(index, base)
};
let inst = Inst::alu_rmi_r(
*size,
AluRmiROpcode::Add,
RegMemImm::reg(operand.to_reg()),
Writable::from_reg(dst.to_reg()),
);
inst.emit(&[], sink, info, state);
}
// If `lea`'s 3-operand mode is leveraged by regalloc, or if
// it's fancy like imm-plus-shift-plus-base, then `lea` is
// actually emitted.
_ => {
let flags = match size {
OperandSize::Size32 => RexFlags::clear_w(),
OperandSize::Size64 => RexFlags::set_w(),
_ => unreachable!(),
};
emit_std_reg_mem(sink, LegacyPrefixes::None, 0x8D, 1, dst, &amode, flags, 0);
}
};
}
Inst::MovsxRmR { ext_mode, src, dst } => {
let dst = allocs.next(dst.to_reg().to_reg());
let (opcodes, num_opcodes, mut rex_flags) = match ext_mode {
ExtMode::BL => {
// MOVSBL is (REX.W==0) 0F BE /r
(0x0FBE, 2, RexFlags::clear_w())
}
ExtMode::BQ => {
// MOVSBQ is (REX.W==1) 0F BE /r
(0x0FBE, 2, RexFlags::set_w())
}
ExtMode::WL => {
// MOVSWL is (REX.W==0) 0F BF /r
(0x0FBF, 2, RexFlags::clear_w())
}
ExtMode::WQ => {
// MOVSWQ is (REX.W==1) 0F BF /r
(0x0FBF, 2, RexFlags::set_w())
}
ExtMode::LQ => {
// MOVSLQ is (REX.W==1) 63 /r
(0x63, 1, RexFlags::set_w())
}
};
match src.clone().to_reg_mem() {
RegMem::Reg { reg: src } => {
let src = allocs.next(src);
match ext_mode {
ExtMode::BL | ExtMode::BQ => {
// A redundant REX prefix must be emitted for certain register inputs.
rex_flags.always_emit_if_8bit_needed(src);
}
_ => {}
}
emit_std_reg_reg(
sink,
LegacyPrefixes::None,
opcodes,
num_opcodes,
dst,
src,
rex_flags,
)
}
RegMem::Mem { addr: src } => {
let src = &src.finalize(state, sink).with_allocs(allocs);
emit_std_reg_mem(
sink,
LegacyPrefixes::None,
opcodes,
num_opcodes,
dst,
src,
rex_flags,
0,
)
}
}
}
Inst::MovRM { size, src, dst } => {
let src = allocs.next(src.to_reg());
let dst = &dst.finalize(state, sink).with_allocs(allocs);
let prefix = match size {
OperandSize::Size16 => LegacyPrefixes::_66,
_ => LegacyPrefixes::None,
};
let opcode = match size {
OperandSize::Size8 => 0x88,
_ => 0x89,
};
// This is one of the few places where the presence of a
// redundant REX prefix changes the meaning of the
// instruction.
let rex = RexFlags::from((*size, src));
// 8-bit: MOV r8, r/m8 is (REX.W==0) 88 /r
// 16-bit: MOV r16, r/m16 is 66 (REX.W==0) 89 /r
// 32-bit: MOV r32, r/m32 is (REX.W==0) 89 /r
// 64-bit: MOV r64, r/m64 is (REX.W==1) 89 /r
emit_std_reg_mem(sink, prefix, opcode, 1, src, dst, rex, 0);
}
Inst::ShiftR {
size,
kind,
src,
num_bits,
dst,
} => {
let src = allocs.next(src.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
debug_assert_eq!(src, dst);
let subopcode = match kind {
ShiftKind::RotateLeft => 0,
ShiftKind::RotateRight => 1,
ShiftKind::ShiftLeft => 4,
ShiftKind::ShiftRightLogical => 5,
ShiftKind::ShiftRightArithmetic => 7,
};
let enc_dst = int_reg_enc(dst);
let rex_flags = RexFlags::from((*size, dst));
match num_bits.clone().to_imm8_reg() {
Imm8Reg::Reg { reg } => {
let reg = allocs.next(reg);
debug_assert_eq!(reg, regs::rcx());
let (opcode, prefix) = match size {
OperandSize::Size8 => (0xD2, LegacyPrefixes::None),
OperandSize::Size16 => (0xD3, LegacyPrefixes::_66),
OperandSize::Size32 => (0xD3, LegacyPrefixes::None),
OperandSize::Size64 => (0xD3, LegacyPrefixes::None),
};
// SHL/SHR/SAR %cl, reg8 is (REX.W==0) D2 /subopcode
// SHL/SHR/SAR %cl, reg16 is 66 (REX.W==0) D3 /subopcode
// SHL/SHR/SAR %cl, reg32 is (REX.W==0) D3 /subopcode
// SHL/SHR/SAR %cl, reg64 is (REX.W==1) D3 /subopcode
emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, enc_dst, rex_flags);
}
Imm8Reg::Imm8 { imm: num_bits } => {
let (opcode, prefix) = match size {
OperandSize::Size8 => (0xC0, LegacyPrefixes::None),
OperandSize::Size16 => (0xC1, LegacyPrefixes::_66),
OperandSize::Size32 => (0xC1, LegacyPrefixes::None),
OperandSize::Size64 => (0xC1, LegacyPrefixes::None),
};
// SHL/SHR/SAR $ib, reg8 is (REX.W==0) C0 /subopcode
// SHL/SHR/SAR $ib, reg16 is 66 (REX.W==0) C1 /subopcode
// SHL/SHR/SAR $ib, reg32 is (REX.W==0) C1 /subopcode ib
// SHL/SHR/SAR $ib, reg64 is (REX.W==1) C1 /subopcode ib
// When the shift amount is 1, there's an even shorter encoding, but we don't
// bother with that nicety here.
emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, enc_dst, rex_flags);
sink.put1(num_bits);
}
}
}
Inst::XmmRmiReg {
opcode,
src1,
src2,
dst,
} => {
let src1 = allocs.next(src1.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
debug_assert_eq!(src1, dst);
let rex = RexFlags::clear_w();
let prefix = LegacyPrefixes::_66;
let src2 = src2.clone().to_reg_mem_imm();
if let RegMemImm::Imm { simm32 } = src2 {
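// The immediate-shift forms use the group opcodes 0F 71/72/73, with the shift
// kind encoded in the ModRM reg field and the XMM register in the r/m field,
// followed by an 8-bit shift amount.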
let (opcode_bytes, reg_digit) = match opcode {
SseOpcode::Psllw => (0x0F71, 6),
SseOpcode::Pslld => (0x0F72, 6),
SseOpcode::Psllq => (0x0F73, 6),
SseOpcode::Psraw => (0x0F71, 4),
SseOpcode::Psrad => (0x0F72, 4),
SseOpcode::Psrlw => (0x0F71, 2),
SseOpcode::Psrld => (0x0F72, 2),
SseOpcode::Psrlq => (0x0F73, 2),
_ => panic!("invalid opcode: {}", opcode),
};
let dst_enc = reg_enc(dst);
emit_std_enc_enc(sink, prefix, opcode_bytes, 2, reg_digit, dst_enc, rex);
let imm = (simm32)
.try_into()
.expect("the immediate must be convertible to a u8");
sink.put1(imm);
} else {
let opcode_bytes = match opcode {
SseOpcode::Psllw => 0x0FF1,
SseOpcode::Pslld => 0x0FF2,
SseOpcode::Psllq => 0x0FF3,
SseOpcode::Psraw => 0x0FE1,
SseOpcode::Psrad => 0x0FE2,
SseOpcode::Psrlw => 0x0FD1,
SseOpcode::Psrld => 0x0FD2,
SseOpcode::Psrlq => 0x0FD3,
_ => panic!("invalid opcode: {}", opcode),
};
match src2 {
RegMemImm::Reg { reg } => {
let reg = allocs.next(reg);
emit_std_reg_reg(sink, prefix, opcode_bytes, 2, dst, reg, rex);
}
RegMemImm::Mem { addr } => {
let addr = &addr.finalize(state, sink).with_allocs(allocs);
emit_std_reg_mem(sink, prefix, opcode_bytes, 2, dst, addr, rex, 0);
}
RegMemImm::Imm { .. } => unreachable!(),
}
};
}
Inst::CmpRmiR {
size,
src: src_e,
dst: reg_g,
opcode,
} => {
let reg_g = allocs.next(reg_g.to_reg());
let is_cmp = match opcode {
CmpOpcode::Cmp => true,
CmpOpcode::Test => false,
};
let mut prefix = LegacyPrefixes::None;
if *size == OperandSize::Size16 {
prefix = LegacyPrefixes::_66;
}
// A redundant REX prefix can change the meaning of this instruction.
let mut rex = RexFlags::from((*size, reg_g));
match src_e.clone().to_reg_mem_imm() {
RegMemImm::Reg { reg: reg_e } => {
let reg_e = allocs.next(reg_e);
if *size == OperandSize::Size8 {
// Check whether the E register forces the use of a redundant REX.
rex.always_emit_if_8bit_needed(reg_e);
}
// Use the swapped operands encoding for CMP, to stay consistent with the output of
// gcc/llvm.
let opcode = match (*size, is_cmp) {
(OperandSize::Size8, true) => 0x38,
(_, true) => 0x39,
(OperandSize::Size8, false) => 0x84,
(_, false) => 0x85,
};
emit_std_reg_reg(sink, prefix, opcode, 1, reg_e, reg_g, rex);
}
RegMemImm::Mem { addr } => {
let addr = &addr.finalize(state, sink).with_allocs(allocs);
// Whereas here we revert to the "normal" G-E ordering for CMP.
let opcode = match (*size, is_cmp) {
(OperandSize::Size8, true) => 0x3A,
(_, true) => 0x3B,
(OperandSize::Size8, false) => 0x84,
(_, false) => 0x85,
};
emit_std_reg_mem(sink, prefix, opcode, 1, reg_g, addr, rex, 0);
}
RegMemImm::Imm { simm32 } => {
// FIXME JRS 2020Feb11: there are shorter encodings for
// cmp $imm, rax/eax/ax/al.
let use_imm8 = is_cmp && low8_will_sign_extend_to_32(simm32);
// And also here we use the "normal" G-E ordering.
let opcode = if is_cmp {
if *size == OperandSize::Size8 {
0x80
} else if use_imm8 {
0x83
} else {
0x81
}
} else {
if *size == OperandSize::Size8 {
0xF6
} else {
0xF7
}
};
let subopcode = if is_cmp { 7 } else { 0 };
let enc_g = int_reg_enc(reg_g);
emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, enc_g, rex);
emit_simm(sink, if use_imm8 { 1 } else { size.to_bytes() }, simm32);
}
}
}
Inst::Setcc { cc, dst } => {
let dst = allocs.next(dst.to_reg().to_reg());
let opcode = 0x0f90 + cc.get_enc() as u32;
let mut rex_flags = RexFlags::clear_w();
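// SETcc writes an 8-bit register, so force a REX prefix: with REX present,
// encodings 4..7 select spl/bpl/sil/dil rather than ah/ch/dh/bh.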
rex_flags.always_emit();
emit_std_enc_enc(
sink,
LegacyPrefixes::None,
opcode,
2,
0,
reg_enc(dst),
rex_flags,
);
}
Inst::Bswap { size, src, dst } => {
let src = allocs.next(src.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
debug_assert_eq!(src, dst);
let enc_reg = int_reg_enc(dst);
// BSWAP reg32 is (REX.W==0) 0F C8
// BSWAP reg64 is (REX.W==1) 0F C8
let rex_flags = RexFlags::from(*size);
rex_flags.emit_one_op(sink, enc_reg);
sink.put1(0x0F);
sink.put1(0xC8 | (enc_reg & 7));
}
Inst::Cmove {
size,
cc,
consequent,
alternative,
dst,
} => {
let alternative = allocs.next(alternative.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
debug_assert_eq!(alternative, dst);
let rex_flags = RexFlags::from(*size);
let prefix = match size {
OperandSize::Size16 => LegacyPrefixes::_66,
OperandSize::Size32 => LegacyPrefixes::None,
OperandSize::Size64 => LegacyPrefixes::None,
_ => unreachable!("invalid size spec for cmove"),
};
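// CMOVcc r, r/m is 0F 40+cc /r; the condition code is folded into the low
// nibble of the second opcode byte.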
let opcode = 0x0F40 + cc.get_enc() as u32;
match consequent.clone().to_reg_mem() {
RegMem::Reg { reg } => {
let reg = allocs.next(reg);
emit_std_reg_reg(sink, prefix, opcode, 2, dst, reg, rex_flags);
}
RegMem::Mem { addr } => {
let addr = &addr.finalize(state, sink).with_allocs(allocs);
emit_std_reg_mem(sink, prefix, opcode, 2, dst, addr, rex_flags, 0);
}
}
}
Inst::XmmCmove {
ty,
cc,
consequent,
alternative,
dst,
} => {
let alternative = allocs.next(alternative.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
debug_assert_eq!(alternative, dst);
let consequent = consequent.clone().to_reg_mem().with_allocs(allocs);
// Lowering of the Select IR opcode when the input is an fcmp relies on the fact that
// this doesn't clobber flags. Make sure to not do so here.
let next = sink.get_label();
// Jump if cc is *not* set.
one_way_jmp(sink, cc.invert(), next);
let op = match *ty {
types::F64 => SseOpcode::Movsd,
types::F32 => SseOpcode::Movsd,
types::F32X4 => SseOpcode::Movaps,
types::F64X2 => SseOpcode::Movapd,
ty => {
debug_assert!(ty.is_vector() && ty.bytes() == 16);
SseOpcode::Movdqa
}
};
let inst = Inst::xmm_unary_rm_r(op, consequent, Writable::from_reg(dst));
inst.emit(&[], sink, info, state);
sink.bind_label(next, state.ctrl_plane_mut());
}
Inst::Push64 { src } => {
let src = src.clone().to_reg_mem_imm().with_allocs(allocs);
match src {
RegMemImm::Reg { reg } => {
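// `push r64` is 50+rd; the operand size defaults to 64 bits in long mode, so
// only REX.B is needed (for r8..r15).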
let enc_reg = int_reg_enc(reg);
let rex = 0x40 | ((enc_reg >> 3) & 1);
if rex != 0x40 {
sink.put1(rex);
}
sink.put1(0x50 | (enc_reg & 7));
}
RegMemImm::Mem { addr } => {
let addr = &addr.finalize(state, sink);
emit_std_enc_mem(
sink,
LegacyPrefixes::None,
0xFF,
1,
6, /*subopcode*/
addr,
RexFlags::clear_w(),
0,
);
}
RegMemImm::Imm { simm32 } => {
if low8_will_sign_extend_to_64(simm32) {
sink.put1(0x6A);
sink.put1(simm32 as u8);
} else {
sink.put1(0x68);
sink.put4(simm32);
}
}
}
}
Inst::Pop64 { dst } => {
let dst = allocs.next(dst.to_reg().to_reg());
let enc_dst = int_reg_enc(dst);
if enc_dst >= 8 {
// 0x41 == REX.{W=0, B=1}. It seems that REX.W is irrelevant here.
sink.put1(0x41);
}
sink.put1(0x58 + (enc_dst & 7));
}
Inst::StackProbeLoop {
tmp,
frame_size,
guard_size,
} => {
assert!(info.flags.enable_probestack());
assert!(guard_size.is_power_of_two());
let tmp = allocs.next_writable(*tmp);
// Number of probes that we need to perform
let probe_count = align_to(*frame_size, *guard_size) / guard_size;
// The inline stack probe loop has 3 phases:
//
// We generate the "guard area" register which is essentially the frame_size aligned to
// guard_size. We copy the stack pointer and subtract the guard area from it. This
// gets us a register that we can use to compare when looping.
//
// After that we emit the loop. Essentially we just adjust the stack pointer one guard_size'd
// distance at a time and then touch the stack by writing anything to it. We use the previously
// created "guard area" register to know when to stop looping.
//
// When we have touched all the pages that we need, we have to restore the stack pointer
// to where it was before.
//
// Generate the following code:
// mov tmp_reg, rsp
// sub tmp_reg, guard_size * probe_count
// .loop_start:
// sub rsp, guard_size
// mov [rsp], rsp
// cmp rsp, tmp_reg
// jne .loop_start
// add rsp, guard_size * probe_count
// Create the guard bound register
// mov tmp_reg, rsp
let inst = Inst::gen_move(tmp, regs::rsp(), types::I64);
inst.emit(&[], sink, info, state);
// sub tmp_reg, GUARD_SIZE * probe_count
let inst = Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Sub,
RegMemImm::imm(guard_size * probe_count),
tmp,
);
inst.emit(&[], sink, info, state);
// Emit the main loop!
let loop_start = sink.get_label();
sink.bind_label(loop_start, state.ctrl_plane_mut());
// sub rsp, GUARD_SIZE
let inst = Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Sub,
RegMemImm::imm(*guard_size),
Writable::from_reg(regs::rsp()),
);
inst.emit(&[], sink, info, state);
// TODO: `mov [rsp], 0` would be better, but we don't have that instruction
// Probe the stack! We don't use Inst::gen_store_stack here because we need a predictable
// instruction size.
// mov [rsp], rsp
let inst = Inst::mov_r_m(
OperandSize::Size32, // Use Size32 since it saves us one byte
regs::rsp(),
SyntheticAmode::Real(Amode::imm_reg(0, regs::rsp())),
);
inst.emit(&[], sink, info, state);
// Compare and jump if we are not done yet
// cmp rsp, tmp_reg
let inst = Inst::cmp_rmi_r(
OperandSize::Size64,
RegMemImm::reg(regs::rsp()),
tmp.to_reg(),
);
inst.emit(&[], sink, info, state);
// jne .loop_start
// TODO: Encoding the JmpIf as a short jump saves us 4 bytes here.
one_way_jmp(sink, CC::NZ, loop_start);
// The regular prologue code is going to emit a `sub` after this, so we need to
// reset the stack pointer
//
// TODO: It would be better if we could avoid the `add` + `sub` that is generated here
// and in the stack adj portion of the prologue
//
// add rsp, GUARD_SIZE * probe_count
let inst = Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Add,
RegMemImm::imm(guard_size * probe_count),
Writable::from_reg(regs::rsp()),
);
inst.emit(&[], sink, info, state);
}
Inst::CallKnown {
dest,
info: call_info,
..
} => {
if let Some(s) = state.take_stack_map() {
sink.add_stack_map(StackMapExtent::UpcomingBytes(5), s);
}
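// `call rel32` is E8 followed by a 4-byte displacement, i.e. 5 bytes total,
// matching the stack-map extent registered above.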
sink.put1(0xE8);
// The addend adjusts for the difference between the end of the instruction and the
// beginning of the immediate field.
emit_reloc(sink, Reloc::X86CallPCRel4, &dest, -4);
sink.put4(0);
if call_info.opcode.is_call() {
sink.add_call_site(call_info.opcode);
}
let callee_pop_size = i64::from(call_info.callee_pop_size);
state.adjust_virtual_sp_offset(-callee_pop_size);
}
Inst::ReturnCallKnown {
callee,
info: call_info,
} => {
emit_return_call_common_sequence(
allocs,
sink,
info,
state,
call_info.new_stack_arg_size,
call_info.old_stack_arg_size,
call_info.ret_addr,
call_info.fp,
call_info.tmp,
&call_info.uses,
);
// Finally, jump to the callee!
//
// Note: this is not `Inst::Jmp { .. }.emit(..)` because we have
// different metadata in this case: we don't have a label for the
// target, but rather a function relocation.
sink.put1(0xE9);
// The addend adjusts for the difference between the end of the instruction and the
// beginning of the immediate field.
emit_reloc(sink, Reloc::X86CallPCRel4, &callee, -4);
sink.put4(0);
sink.add_call_site(ir::Opcode::ReturnCall);
}
Inst::ReturnCallUnknown {
callee,
info: call_info,
} => {
let callee = callee.with_allocs(allocs);
emit_return_call_common_sequence(
allocs,
sink,
info,
state,
call_info.new_stack_arg_size,
call_info.old_stack_arg_size,
call_info.ret_addr,
call_info.fp,
call_info.tmp,
&call_info.uses,
);
Inst::JmpUnknown { target: callee }.emit(&[], sink, info, state);
sink.add_call_site(ir::Opcode::ReturnCallIndirect);
}
Inst::CallUnknown {
dest,
info: call_info,
..
} => {
let dest = dest.with_allocs(allocs);
let start_offset = sink.cur_offset();
match dest {
RegMem::Reg { reg } => {
let reg_enc = int_reg_enc(reg);
emit_std_enc_enc(
sink,
LegacyPrefixes::None,
0xFF,
1,
2, /*subopcode*/
reg_enc,
RexFlags::clear_w(),
);
}
RegMem::Mem { addr } => {
let addr = &addr.finalize(state, sink);
emit_std_enc_mem(
sink,
LegacyPrefixes::None,
0xFF,
1,
2, /*subopcode*/
addr,
RexFlags::clear_w(),
0,
);
}
}
if let Some(s) = state.take_stack_map() {
sink.add_stack_map(StackMapExtent::StartedAtOffset(start_offset), s);
}
if call_info.opcode.is_call() {
sink.add_call_site(call_info.opcode);
}
let callee_pop_size = i64::from(call_info.callee_pop_size);
state.adjust_virtual_sp_offset(-callee_pop_size);
}
Inst::Args { .. } => {}
Inst::Rets { .. } => {}
Inst::Ret {
stack_bytes_to_pop: 0,
} => sink.put1(0xC3),
Inst::Ret { stack_bytes_to_pop } => {
sink.put1(0xC2);
sink.put2(u16::try_from(*stack_bytes_to_pop).unwrap());
}
Inst::JmpKnown { dst } => {
let br_start = sink.cur_offset();
let br_disp_off = br_start + 1;
let br_end = br_start + 5;
sink.use_label_at_offset(br_disp_off, *dst, LabelUse::JmpRel32);
sink.add_uncond_branch(br_start, br_end, *dst);
sink.put1(0xE9);
// Placeholder for the label value.
sink.put4(0x0);
}
Inst::JmpIf { cc, taken } => {
let cond_start = sink.cur_offset();
let cond_disp_off = cond_start + 2;
sink.use_label_at_offset(cond_disp_off, *taken, LabelUse::JmpRel32);
// Since this is not a terminator, don't enroll in the branch inversion mechanism.
sink.put1(0x0F);
sink.put1(0x80 + cc.get_enc());
// Placeholder for the label value.
sink.put4(0x0);
}
Inst::JmpCond {
cc,
taken,
not_taken,
} => {
// If taken.
let cond_start = sink.cur_offset();
let cond_disp_off = cond_start + 2;
let cond_end = cond_start + 6;
sink.use_label_at_offset(cond_disp_off, *taken, LabelUse::JmpRel32);
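// Record the inverted form of this conditional as well, so that the
// MachBuffer's branch optimization can flip the condition and drop the
// unconditional jump when the taken target ends up being the fallthrough.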
let inverted: [u8; 6] = [0x0F, 0x80 + (cc.invert().get_enc()), 0x00, 0x00, 0x00, 0x00];
sink.add_cond_branch(cond_start, cond_end, *taken, &inverted[..]);
sink.put1(0x0F);
sink.put1(0x80 + cc.get_enc());
// Placeholder for the label value.
sink.put4(0x0);
// If not taken.
let uncond_start = sink.cur_offset();
let uncond_disp_off = uncond_start + 1;
let uncond_end = uncond_start + 5;
sink.use_label_at_offset(uncond_disp_off, *not_taken, LabelUse::JmpRel32);
sink.add_uncond_branch(uncond_start, uncond_end, *not_taken);
sink.put1(0xE9);
// Placeholder for the label value.
sink.put4(0x0);
}
Inst::JmpUnknown { target } => {
let target = target.with_allocs(allocs);
match target {
RegMem::Reg { reg } => {
let reg_enc = int_reg_enc(reg);
emit_std_enc_enc(
sink,
LegacyPrefixes::None,
0xFF,
1,
4, /*subopcode*/
reg_enc,
RexFlags::clear_w(),
);
}
RegMem::Mem { addr } => {
let addr = &addr.finalize(state, sink);
emit_std_enc_mem(
sink,
LegacyPrefixes::None,
0xFF,
1,
4, /*subopcode*/
addr,
RexFlags::clear_w(),
0,
);
}
}
}
Inst::JmpTableSeq {
idx,
tmp1,
tmp2,
ref targets,
default_target,
..
} => {
let idx = allocs.next(*idx);
let tmp1 = Writable::from_reg(allocs.next(tmp1.to_reg()));
let tmp2 = Writable::from_reg(allocs.next(tmp2.to_reg()));
// This sequence is *one* instruction in the vcode, and is expanded only here at
// emission time, because we cannot allow the regalloc to insert spills/reloads in
// the middle; we depend on hardcoded PC-rel addressing below.
//
// We don't have to worry about emitting islands, because the only label-use type has a
// maximum range of 2 GB. If we later consider using shorter-range label references,
// this will need to be revisited.
// We generate the following sequence. Note that the only read of %idx is before the
// write to %tmp2, so regalloc may use the same register for both; fix x64/inst/mod.rs
// if you change this.
// lea start_of_jump_table_offset(%rip), %tmp1
// movslq [%tmp1, %idx, 4], %tmp2 ;; shift of 2, viz. multiply index by 4
// addq %tmp2, %tmp1
// j *%tmp1
// $start_of_jump_table:
// -- jump table entries
// Load base address of jump table.
let start_of_jumptable = sink.get_label();
let inst = Inst::lea(Amode::rip_relative(start_of_jumptable), tmp1);
inst.emit(&[], sink, info, state);
// Load value out of the jump table. It's a relative offset to the target block, so it
// might be negative; use a sign-extension.
let inst = Inst::movsx_rm_r(
ExtMode::LQ,
RegMem::mem(Amode::imm_reg_reg_shift(
0,
Gpr::new(tmp1.to_reg()).unwrap(),
Gpr::new(idx).unwrap(),
2,
)),
tmp2,
);
inst.emit(&[], sink, info, state);
// Add base of jump table to jump-table-sourced block offset.
let inst = Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Add,
RegMemImm::reg(tmp2.to_reg()),
tmp1,
);
inst.emit(&[], sink, info, state);
// Branch to computed address.
let inst = Inst::jmp_unknown(RegMem::reg(tmp1.to_reg()));
inst.emit(&[], sink, info, state);
// Emit jump table (table of 32-bit offsets).
sink.bind_label(start_of_jumptable, state.ctrl_plane_mut());
let jt_off = sink.cur_offset();
for &target in targets.iter().chain(std::iter::once(default_target)) {
let word_off = sink.cur_offset();
// off_into_table is an addend here embedded in the label to be later patched at
// the end of codegen. The offset is initially relative to this jump table entry;
// with the extra addend, it'll be relative to the jump table's start, after
// patching.
let off_into_table = word_off - jt_off;
sink.use_label_at_offset(word_off, target, LabelUse::PCRel32);
sink.put4(off_into_table);
}
}
Inst::TrapIf { cc, trap_code } => {
let trap_label = sink.defer_trap(*trap_code, state.take_stack_map());
one_way_jmp(sink, *cc, trap_label);
}
Inst::TrapIfAnd {
cc1,
cc2,
trap_code,
} => {
let trap_label = sink.defer_trap(*trap_code, state.take_stack_map());
let else_label = sink.get_label();
// Jump to the end if the first condition isn't true, and then if
// the second condition is true go to the trap.
one_way_jmp(sink, cc1.invert(), else_label);
one_way_jmp(sink, *cc2, trap_label);
sink.bind_label(else_label, state.ctrl_plane_mut());
}
Inst::TrapIfOr {
cc1,
cc2,
trap_code,
} => {
let trap_label = sink.defer_trap(*trap_code, state.take_stack_map());
// Emit two jumps to the same trap if either condition code is true.
one_way_jmp(sink, *cc1, trap_label);
one_way_jmp(sink, *cc2, trap_label);
}
Inst::XmmUnaryRmR { op, src, dst } => {
emit(
&Inst::XmmUnaryRmRUnaligned {
op: *op,
src: XmmMem::new(src.clone().into()).unwrap(),
dst: *dst,
},
allocs,
sink,
info,
state,
);
}
Inst::XmmUnaryRmRUnaligned {
op,
src: src_e,
dst: reg_g,
} => {
let reg_g = allocs.next(reg_g.to_reg().to_reg());
let src_e = src_e.clone().to_reg_mem().with_allocs(allocs);
let rex = RexFlags::clear_w();
let (prefix, opcode, num_opcodes) = match op {
SseOpcode::Cvtdq2pd => (LegacyPrefixes::_F3, 0x0FE6, 2),
SseOpcode::Cvtpd2ps => (LegacyPrefixes::_66, 0x0F5A, 2),
SseOpcode::Cvtps2pd => (LegacyPrefixes::None, 0x0F5A, 2),
SseOpcode::Cvtdq2ps => (LegacyPrefixes::None, 0x0F5B, 2),
SseOpcode::Cvtss2sd => (LegacyPrefixes::_F3, 0x0F5A, 2),
SseOpcode::Cvtsd2ss => (LegacyPrefixes::_F2, 0x0F5A, 2),
SseOpcode::Cvttpd2dq => (LegacyPrefixes::_66, 0x0FE6, 2),
SseOpcode::Cvttps2dq => (LegacyPrefixes::_F3, 0x0F5B, 2),
SseOpcode::Movaps => (LegacyPrefixes::None, 0x0F28, 2),
SseOpcode::Movapd => (LegacyPrefixes::_66, 0x0F28, 2),
SseOpcode::Movdqa => (LegacyPrefixes::_66, 0x0F6F, 2),
SseOpcode::Movdqu => (LegacyPrefixes::_F3, 0x0F6F, 2),
SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F10, 2),
SseOpcode::Movss => (LegacyPrefixes::_F3, 0x0F10, 2),
SseOpcode::Movups => (LegacyPrefixes::None, 0x0F10, 2),
SseOpcode::Movupd => (LegacyPrefixes::_66, 0x0F10, 2),
SseOpcode::Pabsb => (LegacyPrefixes::_66, 0x0F381C, 3),
SseOpcode::Pabsw => (LegacyPrefixes::_66, 0x0F381D, 3),
SseOpcode::Pabsd => (LegacyPrefixes::_66, 0x0F381E, 3),
SseOpcode::Pmovsxbd => (LegacyPrefixes::_66, 0x0F3821, 3),
SseOpcode::Pmovsxbw => (LegacyPrefixes::_66, 0x0F3820, 3),
SseOpcode::Pmovsxbq => (LegacyPrefixes::_66, 0x0F3822, 3),
SseOpcode::Pmovsxwd => (LegacyPrefixes::_66, 0x0F3823, 3),
SseOpcode::Pmovsxwq => (LegacyPrefixes::_66, 0x0F3824, 3),
SseOpcode::Pmovsxdq => (LegacyPrefixes::_66, 0x0F3825, 3),
SseOpcode::Pmovzxbd => (LegacyPrefixes::_66, 0x0F3831, 3),
SseOpcode::Pmovzxbw => (LegacyPrefixes::_66, 0x0F3830, 3),
SseOpcode::Pmovzxbq => (LegacyPrefixes::_66, 0x0F3832, 3),
SseOpcode::Pmovzxwd => (LegacyPrefixes::_66, 0x0F3833, 3),
SseOpcode::Pmovzxwq => (LegacyPrefixes::_66, 0x0F3834, 3),
SseOpcode::Pmovzxdq => (LegacyPrefixes::_66, 0x0F3835, 3),
SseOpcode::Sqrtps => (LegacyPrefixes::None, 0x0F51, 2),
SseOpcode::Sqrtpd => (LegacyPrefixes::_66, 0x0F51, 2),
SseOpcode::Sqrtss => (LegacyPrefixes::_F3, 0x0F51, 2),
SseOpcode::Sqrtsd => (LegacyPrefixes::_F2, 0x0F51, 2),
SseOpcode::Movddup => (LegacyPrefixes::_F2, 0x0F12, 2),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
match src_e {
RegMem::Reg { reg: reg_e } => {
emit_std_reg_reg(sink, prefix, opcode, num_opcodes, reg_g, reg_e, rex);
}
RegMem::Mem { addr } => {
let addr = &addr.finalize(state, sink);
emit_std_reg_mem(sink, prefix, opcode, num_opcodes, reg_g, addr, rex, 0);
}
};
}
Inst::XmmUnaryRmRImm { op, src, dst, imm } => {
let dst = allocs.next(dst.to_reg().to_reg());
let src = src.clone().to_reg_mem().with_allocs(allocs);
let rex = RexFlags::clear_w();
let (prefix, opcode, len) = match op {
SseOpcode::Roundps => (LegacyPrefixes::_66, 0x0F3A08, 3),
SseOpcode::Roundss => (LegacyPrefixes::_66, 0x0F3A0A, 3),
SseOpcode::Roundpd => (LegacyPrefixes::_66, 0x0F3A09, 3),
SseOpcode::Roundsd => (LegacyPrefixes::_66, 0x0F3A0B, 3),
SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2),
SseOpcode::Pshuflw => (LegacyPrefixes::_F2, 0x0F70, 2),
SseOpcode::Pshufhw => (LegacyPrefixes::_F3, 0x0F70, 2),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
match src {
RegMem::Reg { reg } => {
emit_std_reg_reg(sink, prefix, opcode, len, dst, reg, rex);
}
RegMem::Mem { addr } => {
let addr = &addr.finalize(state, sink);
// N.B.: bytes_at_end == 1, because of the `imm` byte below.
emit_std_reg_mem(sink, prefix, opcode, len, dst, addr, rex, 1);
}
}
sink.put1(*imm);
}
Inst::XmmUnaryRmREvex { op, src, dst } => {
let dst = allocs.next(dst.to_reg().to_reg());
let src = match src.clone().to_reg_mem().with_allocs(allocs) {
RegMem::Reg { reg } => {
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
}
RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
};
let (prefix, map, w, opcode) = match op {
Avx512Opcode::Vcvtudq2ps => (LegacyPrefixes::_F2, OpcodeMap::_0F, false, 0x7a),
Avx512Opcode::Vpabsq => (LegacyPrefixes::_66, OpcodeMap::_0F38, true, 0x1f),
Avx512Opcode::Vpopcntb => (LegacyPrefixes::_66, OpcodeMap::_0F38, false, 0x54),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
EvexInstruction::new()
.length(EvexVectorLength::V128)
.prefix(prefix)
.map(map)
.w(w)
.opcode(opcode)
.tuple_type(op.tuple_type())
.reg(dst.to_real_reg().unwrap().hw_enc())
.rm(src)
.encode(sink);
}
Inst::XmmUnaryRmRImmEvex { op, src, dst, imm } => {
let dst = allocs.next(dst.to_reg().to_reg());
let src = match src.clone().to_reg_mem().with_allocs(allocs) {
RegMem::Reg { reg } => {
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
}
RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
};
let (opcode, opcode_ext, w) = match op {
Avx512Opcode::VpsraqImm => (0x72, 4, true),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
EvexInstruction::new()
.length(EvexVectorLength::V128)
.prefix(LegacyPrefixes::_66)
.map(OpcodeMap::_0F)
.w(w)
.opcode(opcode)
.reg(opcode_ext)
.vvvvv(dst.to_real_reg().unwrap().hw_enc())
.tuple_type(op.tuple_type())
.rm(src)
.imm(*imm)
.encode(sink);
}
Inst::XmmRmR {
op,
src1,
src2,
dst,
} => emit(
&Inst::XmmRmRUnaligned {
op: *op,
dst: *dst,
src1: *src1,
src2: XmmMem::new(src2.clone().to_reg_mem()).unwrap(),
},
allocs,
sink,
info,
state,
),
Inst::XmmRmRUnaligned {
op,
src1,
src2: src_e,
dst: reg_g,
} => {
let src1 = allocs.next(src1.to_reg());
let reg_g = allocs.next(reg_g.to_reg().to_reg());
let src_e = src_e.clone().to_reg_mem().with_allocs(allocs);
debug_assert_eq!(src1, reg_g);
let rex = RexFlags::clear_w();
let (prefix, opcode, length) = match op {
SseOpcode::Addps => (LegacyPrefixes::None, 0x0F58, 2),
SseOpcode::Addpd => (LegacyPrefixes::_66, 0x0F58, 2),
SseOpcode::Addss => (LegacyPrefixes::_F3, 0x0F58, 2),
SseOpcode::Addsd => (LegacyPrefixes::_F2, 0x0F58, 2),
SseOpcode::Andps => (LegacyPrefixes::None, 0x0F54, 2),
SseOpcode::Andpd => (LegacyPrefixes::_66, 0x0F54, 2),
SseOpcode::Andnps => (LegacyPrefixes::None, 0x0F55, 2),
SseOpcode::Andnpd => (LegacyPrefixes::_66, 0x0F55, 2),
SseOpcode::Divps => (LegacyPrefixes::None, 0x0F5E, 2),
SseOpcode::Divpd => (LegacyPrefixes::_66, 0x0F5E, 2),
SseOpcode::Divss => (LegacyPrefixes::_F3, 0x0F5E, 2),
SseOpcode::Divsd => (LegacyPrefixes::_F2, 0x0F5E, 2),
SseOpcode::Maxps => (LegacyPrefixes::None, 0x0F5F, 2),
SseOpcode::Maxpd => (LegacyPrefixes::_66, 0x0F5F, 2),
SseOpcode::Maxss => (LegacyPrefixes::_F3, 0x0F5F, 2),
SseOpcode::Maxsd => (LegacyPrefixes::_F2, 0x0F5F, 2),
SseOpcode::Minps => (LegacyPrefixes::None, 0x0F5D, 2),
SseOpcode::Minpd => (LegacyPrefixes::_66, 0x0F5D, 2),
SseOpcode::Minss => (LegacyPrefixes::_F3, 0x0F5D, 2),
SseOpcode::Minsd => (LegacyPrefixes::_F2, 0x0F5D, 2),
SseOpcode::Movlhps => (LegacyPrefixes::None, 0x0F16, 2),
SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F10, 2),
SseOpcode::Mulps => (LegacyPrefixes::None, 0x0F59, 2),
SseOpcode::Mulpd => (LegacyPrefixes::_66, 0x0F59, 2),
SseOpcode::Mulss => (LegacyPrefixes::_F3, 0x0F59, 2),
SseOpcode::Mulsd => (LegacyPrefixes::_F2, 0x0F59, 2),
SseOpcode::Orpd => (LegacyPrefixes::_66, 0x0F56, 2),
SseOpcode::Orps => (LegacyPrefixes::None, 0x0F56, 2),
SseOpcode::Packssdw => (LegacyPrefixes::_66, 0x0F6B, 2),
SseOpcode::Packsswb => (LegacyPrefixes::_66, 0x0F63, 2),
SseOpcode::Packusdw => (LegacyPrefixes::_66, 0x0F382B, 3),
SseOpcode::Packuswb => (LegacyPrefixes::_66, 0x0F67, 2),
SseOpcode::Paddb => (LegacyPrefixes::_66, 0x0FFC, 2),
SseOpcode::Paddd => (LegacyPrefixes::_66, 0x0FFE, 2),
SseOpcode::Paddq => (LegacyPrefixes::_66, 0x0FD4, 2),
SseOpcode::Paddw => (LegacyPrefixes::_66, 0x0FFD, 2),
SseOpcode::Paddsb => (LegacyPrefixes::_66, 0x0FEC, 2),
SseOpcode::Paddsw => (LegacyPrefixes::_66, 0x0FED, 2),
SseOpcode::Paddusb => (LegacyPrefixes::_66, 0x0FDC, 2),
SseOpcode::Paddusw => (LegacyPrefixes::_66, 0x0FDD, 2),
SseOpcode::Pmaddubsw => (LegacyPrefixes::_66, 0x0F3804, 3),
SseOpcode::Pand => (LegacyPrefixes::_66, 0x0FDB, 2),
SseOpcode::Pandn => (LegacyPrefixes::_66, 0x0FDF, 2),
SseOpcode::Pavgb => (LegacyPrefixes::_66, 0x0FE0, 2),
SseOpcode::Pavgw => (LegacyPrefixes::_66, 0x0FE3, 2),
SseOpcode::Pcmpeqb => (LegacyPrefixes::_66, 0x0F74, 2),
SseOpcode::Pcmpeqw => (LegacyPrefixes::_66, 0x0F75, 2),
SseOpcode::Pcmpeqd => (LegacyPrefixes::_66, 0x0F76, 2),
SseOpcode::Pcmpeqq => (LegacyPrefixes::_66, 0x0F3829, 3),
SseOpcode::Pcmpgtb => (LegacyPrefixes::_66, 0x0F64, 2),
SseOpcode::Pcmpgtw => (LegacyPrefixes::_66, 0x0F65, 2),
SseOpcode::Pcmpgtd => (LegacyPrefixes::_66, 0x0F66, 2),
SseOpcode::Pcmpgtq => (LegacyPrefixes::_66, 0x0F3837, 3),
SseOpcode::Pmaddwd => (LegacyPrefixes::_66, 0x0FF5, 2),
SseOpcode::Pmaxsb => (LegacyPrefixes::_66, 0x0F383C, 3),
SseOpcode::Pmaxsw => (LegacyPrefixes::_66, 0x0FEE, 2),
SseOpcode::Pmaxsd => (LegacyPrefixes::_66, 0x0F383D, 3),
SseOpcode::Pmaxub => (LegacyPrefixes::_66, 0x0FDE, 2),
SseOpcode::Pmaxuw => (LegacyPrefixes::_66, 0x0F383E, 3),
SseOpcode::Pmaxud => (LegacyPrefixes::_66, 0x0F383F, 3),
SseOpcode::Pminsb => (LegacyPrefixes::_66, 0x0F3838, 3),
SseOpcode::Pminsw => (LegacyPrefixes::_66, 0x0FEA, 2),
SseOpcode::Pminsd => (LegacyPrefixes::_66, 0x0F3839, 3),
SseOpcode::Pminub => (LegacyPrefixes::_66, 0x0FDA, 2),
SseOpcode::Pminuw => (LegacyPrefixes::_66, 0x0F383A, 3),
SseOpcode::Pminud => (LegacyPrefixes::_66, 0x0F383B, 3),
SseOpcode::Pmuldq => (LegacyPrefixes::_66, 0x0F3828, 3),
SseOpcode::Pmulhw => (LegacyPrefixes::_66, 0x0FE5, 2),
SseOpcode::Pmulhrsw => (LegacyPrefixes::_66, 0x0F380B, 3),
SseOpcode::Pmulhuw => (LegacyPrefixes::_66, 0x0FE4, 2),
SseOpcode::Pmulld => (LegacyPrefixes::_66, 0x0F3840, 3),
SseOpcode::Pmullw => (LegacyPrefixes::_66, 0x0FD5, 2),
SseOpcode::Pmuludq => (LegacyPrefixes::_66, 0x0FF4, 2),
SseOpcode::Por => (LegacyPrefixes::_66, 0x0FEB, 2),
SseOpcode::Pshufb => (LegacyPrefixes::_66, 0x0F3800, 3),
SseOpcode::Psubb => (LegacyPrefixes::_66, 0x0FF8, 2),
SseOpcode::Psubd => (LegacyPrefixes::_66, 0x0FFA, 2),
SseOpcode::Psubq => (LegacyPrefixes::_66, 0x0FFB, 2),
SseOpcode::Psubw => (LegacyPrefixes::_66, 0x0FF9, 2),
SseOpcode::Psubsb => (LegacyPrefixes::_66, 0x0FE8, 2),
SseOpcode::Psubsw => (LegacyPrefixes::_66, 0x0FE9, 2),
SseOpcode::Psubusb => (LegacyPrefixes::_66, 0x0FD8, 2),
SseOpcode::Psubusw => (LegacyPrefixes::_66, 0x0FD9, 2),
SseOpcode::Punpckhbw => (LegacyPrefixes::_66, 0x0F68, 2),
SseOpcode::Punpckhwd => (LegacyPrefixes::_66, 0x0F69, 2),
SseOpcode::Punpcklbw => (LegacyPrefixes::_66, 0x0F60, 2),
SseOpcode::Punpcklwd => (LegacyPrefixes::_66, 0x0F61, 2),
SseOpcode::Punpckldq => (LegacyPrefixes::_66, 0x0F62, 2),
SseOpcode::Punpcklqdq => (LegacyPrefixes::_66, 0x0F6C, 2),
SseOpcode::Punpckhdq => (LegacyPrefixes::_66, 0x0F6A, 2),
SseOpcode::Punpckhqdq => (LegacyPrefixes::_66, 0x0F6D, 2),
SseOpcode::Pxor => (LegacyPrefixes::_66, 0x0FEF, 2),
SseOpcode::Subps => (LegacyPrefixes::None, 0x0F5C, 2),
SseOpcode::Subpd => (LegacyPrefixes::_66, 0x0F5C, 2),
SseOpcode::Subss => (LegacyPrefixes::_F3, 0x0F5C, 2),
SseOpcode::Subsd => (LegacyPrefixes::_F2, 0x0F5C, 2),
SseOpcode::Unpcklps => (LegacyPrefixes::None, 0x0F14, 2),
SseOpcode::Unpckhps => (LegacyPrefixes::None, 0x0F15, 2),
SseOpcode::Xorps => (LegacyPrefixes::None, 0x0F57, 2),
SseOpcode::Xorpd => (LegacyPrefixes::_66, 0x0F57, 2),
SseOpcode::Phaddw => (LegacyPrefixes::_66, 0x0F3801, 3),
SseOpcode::Phaddd => (LegacyPrefixes::_66, 0x0F3802, 3),
SseOpcode::Movss => (LegacyPrefixes::_F3, 0x0F10, 2),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
match src_e {
RegMem::Reg { reg: reg_e } => {
emit_std_reg_reg(sink, prefix, opcode, length, reg_g, reg_e, rex);
}
RegMem::Mem { addr } => {
let addr = &addr.finalize(state, sink);
emit_std_reg_mem(sink, prefix, opcode, length, reg_g, addr, rex, 0);
}
}
}
Inst::XmmRmRBlend {
op,
src1,
src2,
dst,
mask,
} => {
let src1 = allocs.next(src1.to_reg());
let mask = allocs.next(mask.to_reg());
debug_assert_eq!(mask, regs::xmm0());
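// The non-VEX blendv instructions (blendvps/blendvpd/pblendvb) use xmm0 as an
// implicit mask operand, hence the fixed-register check above.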
let reg_g = allocs.next(dst.to_reg().to_reg());
debug_assert_eq!(src1, reg_g);
let src_e = src2.clone().to_reg_mem().with_allocs(allocs);
let rex = RexFlags::clear_w();
let (prefix, opcode, length) = match op {
SseOpcode::Blendvps => (LegacyPrefixes::_66, 0x0F3814, 3),
SseOpcode::Blendvpd => (LegacyPrefixes::_66, 0x0F3815, 3),
SseOpcode::Pblendvb => (LegacyPrefixes::_66, 0x0F3810, 3),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
match src_e {
RegMem::Reg { reg: reg_e } => {
emit_std_reg_reg(sink, prefix, opcode, length, reg_g, reg_e, rex);
}
RegMem::Mem { addr } => {
let addr = &addr.finalize(state, sink);
emit_std_reg_mem(sink, prefix, opcode, length, reg_g, addr, rex, 0);
}
}
}
Inst::XmmRmiRVex {
op,
src1,
src2,
dst,
} => {
use LegacyPrefixes as LP;
use OpcodeMap as OM;
let dst = allocs.next(dst.to_reg().to_reg());
let src1 = allocs.next(src1.to_reg());
let src2 = src2.clone().to_reg_mem_imm().with_allocs(allocs);
// If the opcode is commutative, src1 is in xmm0..xmm7, and src2 is in
// xmm8..xmm15, swap the operands to save one byte in the instruction's
// encoding.
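// (A two-byte VEX prefix cannot encode the B bit needed to name xmm8..xmm15 in
// the ModRM r/m slot; keeping the r/m operand in xmm0..xmm7 allows the shorter
// prefix to be used.)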
let (src1, src2) = match (src1, src2) {
(src1, RegMemImm::Reg { reg: src2 })
if op.is_commutative()
&& src1.to_real_reg().unwrap().hw_enc() < 8
&& src2.to_real_reg().unwrap().hw_enc() >= 8 =>
{
(src2, RegMemImm::Reg { reg: src1 })
}
(src1, src2) => (src1, src2),
};
let src2 = match src2 {
// For opcodes where one of the operands is an immediate, the
// encoding differs: the ModRM reg field holds an `opcode_ext`
// instead of a register, so handle that case specially here.
RegMemImm::Imm { simm32 } => {
let (opcode, opcode_ext, prefix) = match op {
AvxOpcode::Vpsrlw => (0x71, 2, LegacyPrefixes::_66),
AvxOpcode::Vpsrld => (0x72, 2, LegacyPrefixes::_66),
AvxOpcode::Vpsrlq => (0x73, 2, LegacyPrefixes::_66),
AvxOpcode::Vpsllw => (0x71, 6, LegacyPrefixes::_66),
AvxOpcode::Vpslld => (0x72, 6, LegacyPrefixes::_66),
AvxOpcode::Vpsllq => (0x73, 6, LegacyPrefixes::_66),
AvxOpcode::Vpsraw => (0x71, 4, LegacyPrefixes::_66),
AvxOpcode::Vpsrad => (0x72, 4, LegacyPrefixes::_66),
_ => panic!("unexpected rmi_r_vex opcode with immediate {op:?}"),
};
VexInstruction::new()
.length(VexVectorLength::V128)
.prefix(prefix)
.map(OpcodeMap::_0F)
.opcode(opcode)
.opcode_ext(opcode_ext)
.vvvv(dst.to_real_reg().unwrap().hw_enc())
.rm(src1.to_real_reg().unwrap().hw_enc())
.imm(simm32.try_into().unwrap())
.encode(sink);
return;
}
RegMemImm::Reg { reg } => {
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
}
RegMemImm::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
};
let (prefix, map, opcode) = match op {
AvxOpcode::Vminps => (LP::None, OM::_0F, 0x5D),
AvxOpcode::Vminpd => (LP::_66, OM::_0F, 0x5D),
AvxOpcode::Vmaxps => (LP::None, OM::_0F, 0x5F),
AvxOpcode::Vmaxpd => (LP::_66, OM::_0F, 0x5F),
AvxOpcode::Vandnps => (LP::None, OM::_0F, 0x55),
AvxOpcode::Vandnpd => (LP::_66, OM::_0F, 0x55),
AvxOpcode::Vpandn => (LP::_66, OM::_0F, 0xDF),
AvxOpcode::Vpsrlw => (LP::_66, OM::_0F, 0xD1),
AvxOpcode::Vpsrld => (LP::_66, OM::_0F, 0xD2),
AvxOpcode::Vpsrlq => (LP::_66, OM::_0F, 0xD3),
AvxOpcode::Vpaddb => (LP::_66, OM::_0F, 0xFC),
AvxOpcode::Vpaddw => (LP::_66, OM::_0F, 0xFD),
AvxOpcode::Vpaddd => (LP::_66, OM::_0F, 0xFE),
AvxOpcode::Vpaddq => (LP::_66, OM::_0F, 0xD4),
AvxOpcode::Vpaddsb => (LP::_66, OM::_0F, 0xEC),
AvxOpcode::Vpaddsw => (LP::_66, OM::_0F, 0xED),
AvxOpcode::Vpaddusb => (LP::_66, OM::_0F, 0xDC),
AvxOpcode::Vpaddusw => (LP::_66, OM::_0F, 0xDD),
AvxOpcode::Vpsubb => (LP::_66, OM::_0F, 0xF8),
AvxOpcode::Vpsubw => (LP::_66, OM::_0F, 0xF9),
AvxOpcode::Vpsubd => (LP::_66, OM::_0F, 0xFA),
AvxOpcode::Vpsubq => (LP::_66, OM::_0F, 0xFB),
AvxOpcode::Vpsubsb => (LP::_66, OM::_0F, 0xE8),
AvxOpcode::Vpsubsw => (LP::_66, OM::_0F, 0xE9),
AvxOpcode::Vpsubusb => (LP::_66, OM::_0F, 0xD8),
AvxOpcode::Vpsubusw => (LP::_66, OM::_0F, 0xD9),
AvxOpcode::Vpavgb => (LP::_66, OM::_0F, 0xE0),
AvxOpcode::Vpavgw => (LP::_66, OM::_0F, 0xE3),
AvxOpcode::Vpand => (LP::_66, OM::_0F, 0xDB),
AvxOpcode::Vandps => (LP::None, OM::_0F, 0x54),
AvxOpcode::Vandpd => (LP::_66, OM::_0F, 0x54),
AvxOpcode::Vpor => (LP::_66, OM::_0F, 0xEB),
AvxOpcode::Vorps => (LP::None, OM::_0F, 0x56),
AvxOpcode::Vorpd => (LP::_66, OM::_0F, 0x56),
AvxOpcode::Vpxor => (LP::_66, OM::_0F, 0xEF),
AvxOpcode::Vxorps => (LP::None, OM::_0F, 0x57),
AvxOpcode::Vxorpd => (LP::_66, OM::_0F, 0x57),
AvxOpcode::Vpmullw => (LP::_66, OM::_0F, 0xD5),
AvxOpcode::Vpmulld => (LP::_66, OM::_0F38, 0x40),
AvxOpcode::Vpmulhw => (LP::_66, OM::_0F, 0xE5),
AvxOpcode::Vpmulhrsw => (LP::_66, OM::_0F38, 0x0B),
AvxOpcode::Vpmulhuw => (LP::_66, OM::_0F, 0xE4),
AvxOpcode::Vpmuldq => (LP::_66, OM::_0F38, 0x28),
AvxOpcode::Vpmuludq => (LP::_66, OM::_0F, 0xF4),
AvxOpcode::Vpunpckhwd => (LP::_66, OM::_0F, 0x69),
AvxOpcode::Vpunpcklwd => (LP::_66, OM::_0F, 0x61),
AvxOpcode::Vunpcklps => (LP::None, OM::_0F, 0x14),
AvxOpcode::Vunpckhps => (LP::None, OM::_0F, 0x15),
AvxOpcode::Vaddps => (LP::None, OM::_0F, 0x58),
AvxOpcode::Vaddpd => (LP::_66, OM::_0F, 0x58),
AvxOpcode::Vsubps => (LP::None, OM::_0F, 0x5C),
AvxOpcode::Vsubpd => (LP::_66, OM::_0F, 0x5C),
AvxOpcode::Vmulps => (LP::None, OM::_0F, 0x59),
AvxOpcode::Vmulpd => (LP::_66, OM::_0F, 0x59),
AvxOpcode::Vdivps => (LP::None, OM::_0F, 0x5E),
AvxOpcode::Vdivpd => (LP::_66, OM::_0F, 0x5E),
AvxOpcode::Vpcmpeqb => (LP::_66, OM::_0F, 0x74),
AvxOpcode::Vpcmpeqw => (LP::_66, OM::_0F, 0x75),
AvxOpcode::Vpcmpeqd => (LP::_66, OM::_0F, 0x76),
AvxOpcode::Vpcmpeqq => (LP::_66, OM::_0F38, 0x29),
AvxOpcode::Vpcmpgtb => (LP::_66, OM::_0F, 0x64),
AvxOpcode::Vpcmpgtw => (LP::_66, OM::_0F, 0x65),
AvxOpcode::Vpcmpgtd => (LP::_66, OM::_0F, 0x66),
AvxOpcode::Vpcmpgtq => (LP::_66, OM::_0F38, 0x37),
AvxOpcode::Vmovlhps => (LP::None, OM::_0F, 0x16),
AvxOpcode::Vpminsb => (LP::_66, OM::_0F38, 0x38),
AvxOpcode::Vpminsw => (LP::_66, OM::_0F, 0xEA),
AvxOpcode::Vpminsd => (LP::_66, OM::_0F38, 0x39),
AvxOpcode::Vpmaxsb => (LP::_66, OM::_0F38, 0x3C),
AvxOpcode::Vpmaxsw => (LP::_66, OM::_0F, 0xEE),
AvxOpcode::Vpmaxsd => (LP::_66, OM::_0F38, 0x3D),
AvxOpcode::Vpminub => (LP::_66, OM::_0F, 0xDA),
AvxOpcode::Vpminuw => (LP::_66, OM::_0F38, 0x3A),
AvxOpcode::Vpminud => (LP::_66, OM::_0F38, 0x3B),
AvxOpcode::Vpmaxub => (LP::_66, OM::_0F, 0xDE),
AvxOpcode::Vpmaxuw => (LP::_66, OM::_0F38, 0x3E),
AvxOpcode::Vpmaxud => (LP::_66, OM::_0F38, 0x3F),
AvxOpcode::Vpunpcklbw => (LP::_66, OM::_0F, 0x60),
AvxOpcode::Vpunpckhbw => (LP::_66, OM::_0F, 0x68),
AvxOpcode::Vpacksswb => (LP::_66, OM::_0F, 0x63),
AvxOpcode::Vpackssdw => (LP::_66, OM::_0F, 0x6B),
AvxOpcode::Vpackuswb => (LP::_66, OM::_0F, 0x67),
AvxOpcode::Vpackusdw => (LP::_66, OM::_0F38, 0x2B),
AvxOpcode::Vpmaddwd => (LP::_66, OM::_0F, 0xF5),
AvxOpcode::Vpmaddubsw => (LP::_66, OM::_0F38, 0x04),
AvxOpcode::Vpshufb => (LP::_66, OM::_0F38, 0x00),
AvxOpcode::Vpsllw => (LP::_66, OM::_0F, 0xF1),
AvxOpcode::Vpslld => (LP::_66, OM::_0F, 0xF2),
AvxOpcode::Vpsllq => (LP::_66, OM::_0F, 0xF3),
AvxOpcode::Vpsraw => (LP::_66, OM::_0F, 0xE1),
AvxOpcode::Vpsrad => (LP::_66, OM::_0F, 0xE2),
AvxOpcode::Vaddss => (LP::_F3, OM::_0F, 0x58),
AvxOpcode::Vaddsd => (LP::_F2, OM::_0F, 0x58),
AvxOpcode::Vmulss => (LP::_F3, OM::_0F, 0x59),
AvxOpcode::Vmulsd => (LP::_F2, OM::_0F, 0x59),
AvxOpcode::Vsubss => (LP::_F3, OM::_0F, 0x5C),
AvxOpcode::Vsubsd => (LP::_F2, OM::_0F, 0x5C),
AvxOpcode::Vdivss => (LP::_F3, OM::_0F, 0x5E),
AvxOpcode::Vdivsd => (LP::_F2, OM::_0F, 0x5E),
AvxOpcode::Vminss => (LP::_F3, OM::_0F, 0x5D),
AvxOpcode::Vminsd => (LP::_F2, OM::_0F, 0x5D),
AvxOpcode::Vmaxss => (LP::_F3, OM::_0F, 0x5F),
AvxOpcode::Vmaxsd => (LP::_F2, OM::_0F, 0x5F),
AvxOpcode::Vphaddw => (LP::_66, OM::_0F38, 0x01),
AvxOpcode::Vphaddd => (LP::_66, OM::_0F38, 0x02),
AvxOpcode::Vpunpckldq => (LP::_66, OM::_0F, 0x62),
AvxOpcode::Vpunpckhdq => (LP::_66, OM::_0F, 0x6A),
AvxOpcode::Vpunpcklqdq => (LP::_66, OM::_0F, 0x6C),
AvxOpcode::Vpunpckhqdq => (LP::_66, OM::_0F, 0x6D),
AvxOpcode::Vmovsd => (LP::_F2, OM::_0F, 0x10),
AvxOpcode::Vmovss => (LP::_F3, OM::_0F, 0x10),
_ => panic!("unexpected rmir vex opcode {op:?}"),
};
VexInstruction::new()
.length(VexVectorLength::V128)
.prefix(prefix)
.map(map)
.opcode(opcode)
.reg(dst.to_real_reg().unwrap().hw_enc())
.vvvv(src1.to_real_reg().unwrap().hw_enc())
.rm(src2)
.encode(sink);
}
Inst::XmmRmRImmVex {
op,
src1,
src2,
dst,
imm,
} => {
let dst = allocs.next(dst.to_reg().to_reg());
let src1 = allocs.next(src1.to_reg());
let src2 = match src2.clone().to_reg_mem().with_allocs(allocs) {
RegMem::Reg { reg } => {
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
}
RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
};
let (w, prefix, map, opcode) = match op {
AvxOpcode::Vcmpps => (false, LegacyPrefixes::None, OpcodeMap::_0F, 0xC2),
AvxOpcode::Vcmppd => (false, LegacyPrefixes::_66, OpcodeMap::_0F, 0xC2),
AvxOpcode::Vpalignr => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0F),
AvxOpcode::Vinsertps => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x21),
AvxOpcode::Vshufps => (false, LegacyPrefixes::None, OpcodeMap::_0F, 0xC6),
AvxOpcode::Vpblendw => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0E),
_ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
};
VexInstruction::new()
.length(VexVectorLength::V128)
.prefix(prefix)
.map(map)
.w(w)
.opcode(opcode)
.reg(dst.to_real_reg().unwrap().hw_enc())
.vvvv(src1.to_real_reg().unwrap().hw_enc())
.rm(src2)
.imm(*imm)
.encode(sink);
}
Inst::XmmVexPinsr {
op,
src1,
src2,
dst,
imm,
} => {
let dst = allocs.next(dst.to_reg().to_reg());
let src1 = allocs.next(src1.to_reg());
let src2 = match src2.clone().to_reg_mem().with_allocs(allocs) {
RegMem::Reg { reg } => {
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
}
RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
};
let (w, map, opcode) = match op {
AvxOpcode::Vpinsrb => (false, OpcodeMap::_0F3A, 0x20),
AvxOpcode::Vpinsrw => (false, OpcodeMap::_0F, 0xC4),
AvxOpcode::Vpinsrd => (false, OpcodeMap::_0F3A, 0x22),
AvxOpcode::Vpinsrq => (true, OpcodeMap::_0F3A, 0x22),
_ => panic!("unexpected vex_pinsr opcode {op:?}"),
};
VexInstruction::new()
.length(VexVectorLength::V128)
.prefix(LegacyPrefixes::_66)
.map(map)
.w(w)
.opcode(opcode)
.reg(dst.to_real_reg().unwrap().hw_enc())
.vvvv(src1.to_real_reg().unwrap().hw_enc())
.rm(src2)
.imm(*imm)
.encode(sink);
}
Inst::XmmRmRVex3 {
op,
src1,
src2,
src3,
dst,
} => {
let src1 = allocs.next(src1.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
debug_assert_eq!(src1, dst);
let src2 = allocs.next(src2.to_reg());
let src3 = match src3.clone().to_reg_mem().with_allocs(allocs) {
RegMem::Reg { reg } => {
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
}
RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
};
let (w, map, opcode) = match op {
AvxOpcode::Vfmadd132ss => (false, OpcodeMap::_0F38, 0x99),
AvxOpcode::Vfmadd213ss => (false, OpcodeMap::_0F38, 0xA9),
AvxOpcode::Vfnmadd132ss => (false, OpcodeMap::_0F38, 0x9D),
AvxOpcode::Vfnmadd213ss => (false, OpcodeMap::_0F38, 0xAD),
AvxOpcode::Vfmadd132sd => (true, OpcodeMap::_0F38, 0x99),
AvxOpcode::Vfmadd213sd => (true, OpcodeMap::_0F38, 0xA9),
AvxOpcode::Vfnmadd132sd => (true, OpcodeMap::_0F38, 0x9D),
AvxOpcode::Vfnmadd213sd => (true, OpcodeMap::_0F38, 0xAD),
AvxOpcode::Vfmadd132ps => (false, OpcodeMap::_0F38, 0x98),
AvxOpcode::Vfmadd213ps => (false, OpcodeMap::_0F38, 0xA8),
AvxOpcode::Vfnmadd132ps => (false, OpcodeMap::_0F38, 0x9C),
AvxOpcode::Vfnmadd213ps => (false, OpcodeMap::_0F38, 0xAC),
AvxOpcode::Vfmadd132pd => (true, OpcodeMap::_0F38, 0x98),
AvxOpcode::Vfmadd213pd => (true, OpcodeMap::_0F38, 0xA8),
AvxOpcode::Vfnmadd132pd => (true, OpcodeMap::_0F38, 0x9C),
AvxOpcode::Vfnmadd213pd => (true, OpcodeMap::_0F38, 0xAC),
AvxOpcode::Vblendvps => (false, OpcodeMap::_0F3A, 0x4A),
AvxOpcode::Vblendvpd => (false, OpcodeMap::_0F3A, 0x4B),
AvxOpcode::Vpblendvb => (false, OpcodeMap::_0F3A, 0x4C),
_ => unreachable!(),
};
VexInstruction::new()
.length(VexVectorLength::V128)
.prefix(LegacyPrefixes::_66)
.map(map)
.w(w)
.opcode(opcode)
.reg(dst.to_real_reg().unwrap().hw_enc())
.rm(src3)
.vvvv(src2.to_real_reg().unwrap().hw_enc())
.encode(sink);
}
Inst::XmmRmRBlendVex {
op,
src1,
src2,
mask,
dst,
} => {
let dst = allocs.next(dst.to_reg().to_reg());
let src1 = allocs.next(src1.to_reg());
let src2 = match src2.clone().to_reg_mem().with_allocs(allocs) {
RegMem::Reg { reg } => {
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
}
RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
};
let mask = allocs.next(mask.to_reg());
let opcode = match op {
AvxOpcode::Vblendvps => 0x4A,
AvxOpcode::Vblendvpd => 0x4B,
AvxOpcode::Vpblendvb => 0x4C,
_ => unreachable!(),
};
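// The VEX-encoded blends take their mask register in the top four bits of the
// trailing immediate byte (the /is4 operand form), hence the shift by four below.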
VexInstruction::new()
.length(VexVectorLength::V128)
.prefix(LegacyPrefixes::_66)
.map(OpcodeMap::_0F3A)
.opcode(opcode)
.reg(dst.to_real_reg().unwrap().hw_enc())
.vvvv(src1.to_real_reg().unwrap().hw_enc())
.rm(src2)
.imm(mask.to_real_reg().unwrap().hw_enc() << 4)
.encode(sink);
}
Inst::XmmUnaryRmRVex { op, src, dst } => {
let dst = allocs.next(dst.to_reg().to_reg());
let src = match src.clone().to_reg_mem().with_allocs(allocs) {
RegMem::Reg { reg } => {
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
}
RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
};
let (prefix, map, opcode) = match op {
AvxOpcode::Vpmovsxbw => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x20),
AvxOpcode::Vpmovzxbw => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x30),
AvxOpcode::Vpmovsxwd => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x23),
AvxOpcode::Vpmovzxwd => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x33),
AvxOpcode::Vpmovsxdq => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x25),
AvxOpcode::Vpmovzxdq => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x35),
AvxOpcode::Vpabsb => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x1C),
AvxOpcode::Vpabsw => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x1D),
AvxOpcode::Vpabsd => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x1E),
AvxOpcode::Vsqrtps => (LegacyPrefixes::None, OpcodeMap::_0F, 0x51),
AvxOpcode::Vsqrtpd => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x51),
AvxOpcode::Vcvtdq2pd => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0xE6),
AvxOpcode::Vcvtdq2ps => (LegacyPrefixes::None, OpcodeMap::_0F, 0x5B),
AvxOpcode::Vcvtpd2ps => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x5A),
AvxOpcode::Vcvtps2pd => (LegacyPrefixes::None, OpcodeMap::_0F, 0x5A),
AvxOpcode::Vcvttpd2dq => (LegacyPrefixes::_66, OpcodeMap::_0F, 0xE6),
AvxOpcode::Vcvttps2dq => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x5B),
AvxOpcode::Vmovdqu => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x6F),
AvxOpcode::Vmovups => (LegacyPrefixes::None, OpcodeMap::_0F, 0x10),
AvxOpcode::Vmovupd => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x10),
// Note that for `vmovss`/`vmovsd` the `inst.isle` rules should
// statically ensure that only `Amode` (memory) operands reach
// this unary form; the register-to-register encodings of
// `vmovss`/`vmovsd` behave like 2-operand instructions, which
// this unary shape cannot express.
AvxOpcode::Vmovss => match &src {
RegisterOrAmode::Amode(_) => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x10),
_ => unreachable!(),
},
AvxOpcode::Vmovsd => match &src {
RegisterOrAmode::Amode(_) => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x10),
_ => unreachable!(),
},
AvxOpcode::Vpbroadcastb => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x78),
AvxOpcode::Vpbroadcastw => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x79),
AvxOpcode::Vpbroadcastd => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x58),
AvxOpcode::Vbroadcastss => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x18),
AvxOpcode::Vmovddup => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x12),
AvxOpcode::Vcvtss2sd => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x5A),
AvxOpcode::Vcvtsd2ss => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x5A),
AvxOpcode::Vsqrtss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x51),
AvxOpcode::Vsqrtsd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x51),
_ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
};
let vex = VexInstruction::new()
.length(VexVectorLength::V128)
.prefix(prefix)
.map(map)
.opcode(opcode)
.reg(dst.to_real_reg().unwrap().hw_enc())
.rm(src);
// These opcodes take a second operand through `vvvv` which copies
// the upper bits into the destination register. That's not
// reflected in the CLIF instruction, however, since the SSE version
// doesn't have this functionality. Instead just copy whatever
// happens to already be in the destination, which at least is what
// LLVM seems to do.
let vex = match op {
AvxOpcode::Vcvtss2sd
| AvxOpcode::Vcvtsd2ss
| AvxOpcode::Vsqrtss
| AvxOpcode::Vsqrtsd => vex.vvvv(dst.to_real_reg().unwrap().hw_enc()),
_ => vex,
};
vex.encode(sink);
}
Inst::XmmUnaryRmRImmVex { op, src, dst, imm } => {
let dst = allocs.next(dst.to_reg().to_reg());
let src = match src.clone().to_reg_mem().with_allocs(allocs) {
RegMem::Reg { reg } => {
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
}
RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
};
let (prefix, map, opcode) = match op {
AvxOpcode::Vroundps => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x08),
AvxOpcode::Vroundpd => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x09),
AvxOpcode::Vpshuflw => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x70),
AvxOpcode::Vpshufhw => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x70),
AvxOpcode::Vpshufd => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x70),
AvxOpcode::Vroundss => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0A),
AvxOpcode::Vroundsd => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0B),
_ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
};
let vex = VexInstruction::new()
.length(VexVectorLength::V128)
.prefix(prefix)
.map(map)
.opcode(opcode)
.reg(dst.to_real_reg().unwrap().hw_enc())
.rm(src)
.imm(*imm);
// See comments in similar block above in `XmmUnaryRmRVex` for what
// this is doing.
let vex = match op {
AvxOpcode::Vroundss | AvxOpcode::Vroundsd => {
vex.vvvv(dst.to_real_reg().unwrap().hw_enc())
}
_ => vex,
};
vex.encode(sink);
}
Inst::XmmMovRMVex { op, src, dst } => {
let src = allocs.next(src.to_reg());
let dst = dst.with_allocs(allocs).finalize(state, sink);
let (prefix, map, opcode) = match op {
AvxOpcode::Vmovdqu => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x7F),
AvxOpcode::Vmovss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x11),
AvxOpcode::Vmovsd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x11),
AvxOpcode::Vmovups => (LegacyPrefixes::None, OpcodeMap::_0F, 0x11),
AvxOpcode::Vmovupd => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x11),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
VexInstruction::new()
.length(VexVectorLength::V128)
.prefix(prefix)
.map(map)
.opcode(opcode)
.rm(dst)
.reg(src.to_real_reg().unwrap().hw_enc())
.encode(sink);
}
Inst::XmmMovRMImmVex { op, src, dst, imm } => {
let src = allocs.next(src.to_reg());
let dst = dst.with_allocs(allocs).finalize(state, sink);
let (w, prefix, map, opcode) = match op {
AvxOpcode::Vpextrb => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x14),
AvxOpcode::Vpextrw => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x15),
AvxOpcode::Vpextrd => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x16),
AvxOpcode::Vpextrq => (true, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x16),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
VexInstruction::new()
.length(VexVectorLength::V128)
.w(w)
.prefix(prefix)
.map(map)
.opcode(opcode)
.rm(dst)
.reg(src.to_real_reg().unwrap().hw_enc())
.imm(*imm)
.encode(sink);
}
Inst::XmmToGprImmVex { op, src, dst, imm } => {
let src = allocs.next(src.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
let (w, prefix, map, opcode) = match op {
AvxOpcode::Vpextrb => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x14),
AvxOpcode::Vpextrw => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x15),
AvxOpcode::Vpextrd => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x16),
AvxOpcode::Vpextrq => (true, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x16),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
VexInstruction::new()
.length(VexVectorLength::V128)
.w(w)
.prefix(prefix)
.map(map)
.opcode(opcode)
.rm(dst.to_real_reg().unwrap().hw_enc())
.reg(src.to_real_reg().unwrap().hw_enc())
.imm(*imm)
.encode(sink);
}
Inst::XmmToGprVex {
op,
src,
dst,
dst_size,
} => {
let src = allocs.next(src.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
let (prefix, map, opcode) = match op {
// vmovd/vmovq are differentiated by `w`
AvxOpcode::Vmovd | AvxOpcode::Vmovq => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x7E),
AvxOpcode::Vmovmskps => (LegacyPrefixes::None, OpcodeMap::_0F, 0x50),
AvxOpcode::Vmovmskpd => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x50),
AvxOpcode::Vpmovmskb => (LegacyPrefixes::_66, OpcodeMap::_0F, 0xD7),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
let w = match dst_size {
OperandSize::Size64 => true,
_ => false,
};
let mut vex = VexInstruction::new()
.length(VexVectorLength::V128)
.w(w)
.prefix(prefix)
.map(map)
.opcode(opcode);
vex = match op {
// `vmovd`/`vmovq` reverse the order of destination and source
// relative to the other opcodes using this shape of instruction.
AvxOpcode::Vmovd | AvxOpcode::Vmovq => vex
.rm(dst.to_real_reg().unwrap().hw_enc())
.reg(src.to_real_reg().unwrap().hw_enc()),
_ => vex
.rm(src.to_real_reg().unwrap().hw_enc())
.reg(dst.to_real_reg().unwrap().hw_enc()),
};
vex.encode(sink);
}
Inst::GprToXmmVex {
op,
src,
dst,
src_size,
} => {
let dst = allocs.next(dst.to_reg().to_reg());
let src = match src.clone().to_reg_mem().with_allocs(allocs) {
RegMem::Reg { reg } => {
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
}
RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
};
let (prefix, map, opcode) = match op {
// vmovd/vmovq are differentiated by `w`
AvxOpcode::Vmovd | AvxOpcode::Vmovq => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x6E),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
let w = match src_size {
OperandSize::Size64 => true,
_ => false,
};
VexInstruction::new()
.length(VexVectorLength::V128)
.w(w)
.prefix(prefix)
.map(map)
.opcode(opcode)
.rm(src)
.reg(dst.to_real_reg().unwrap().hw_enc())
.encode(sink);
}
Inst::XmmRmREvex {
op,
src1,
src2,
dst,
}
| Inst::XmmRmREvex3 {
op,
src1: _, // `dst` reuses `src1`.
src2: src1,
src3: src2,
dst,
} => {
let reused_src = match inst {
Inst::XmmRmREvex3 { src1, .. } => Some(allocs.next(src1.to_reg())),
_ => None,
};
let src1 = allocs.next(src1.to_reg());
let src2 = match src2.clone().to_reg_mem().with_allocs(allocs) {
RegMem::Reg { reg } => {
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
}
RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
};
let dst = allocs.next(dst.to_reg().to_reg());
if let Some(src1) = reused_src {
debug_assert_eq!(src1, dst);
}
let (w, opcode, map) = match op {
Avx512Opcode::Vpermi2b => (false, 0x75, OpcodeMap::_0F38),
Avx512Opcode::Vpmullq => (true, 0x40, OpcodeMap::_0F38),
Avx512Opcode::Vpsraq => (true, 0xE2, OpcodeMap::_0F),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
EvexInstruction::new()
.length(EvexVectorLength::V128)
.prefix(LegacyPrefixes::_66)
.map(map)
.w(w)
.opcode(opcode)
.tuple_type(op.tuple_type())
.reg(dst.to_real_reg().unwrap().hw_enc())
.vvvvv(src1.to_real_reg().unwrap().hw_enc())
.rm(src2)
.encode(sink);
}
Inst::XmmMinMaxSeq {
size,
is_min,
lhs,
rhs,
dst,
} => {
let rhs = allocs.next(rhs.to_reg());
let lhs = allocs.next(lhs.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
debug_assert_eq!(rhs, dst);
// Generates the following sequence:
// ucomiss/ucomisd %lhs, %rhs_dst
// jnz do_min_max
// jp propagate_nan
//
// ;; ordered and equal: propagate the sign bit (for -0 vs 0):
// {and,or}{ps,pd} %lhs, %rhs_dst
// j done
//
// ;; to get the desired NaN behavior (a signalling NaN is transformed into a
// ;; quiet NaN, and the NaN value is returned), we add both inputs.
// propagate_nan:
// add{ss,sd} %lhs, %rhs_dst
// j done
//
// do_min_max:
// {min,max}{ss,sd} %lhs, %rhs_dst
//
// done:
let done = sink.get_label();
let propagate_nan = sink.get_label();
let do_min_max = sink.get_label();
let (add_op, cmp_op, and_op, or_op, min_max_op) = match size {
OperandSize::Size32 => (
SseOpcode::Addss,
SseOpcode::Ucomiss,
SseOpcode::Andps,
SseOpcode::Orps,
if *is_min {
SseOpcode::Minss
} else {
SseOpcode::Maxss
},
),
OperandSize::Size64 => (
SseOpcode::Addsd,
SseOpcode::Ucomisd,
SseOpcode::Andpd,
SseOpcode::Orpd,
if *is_min {
SseOpcode::Minsd
} else {
SseOpcode::Maxsd
},
),
_ => unreachable!(),
};
let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(lhs), dst);
inst.emit(&[], sink, info, state);
one_way_jmp(sink, CC::NZ, do_min_max);
one_way_jmp(sink, CC::P, propagate_nan);
// Ordered and equal. The operands are bit-identical unless one is +0.0 and
// the other is -0.0. In that case these instructions merge the sign bits;
// otherwise they are no-ops.
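// For example, per lane: or(0x0000_0000, 0x8000_0000) = 0x8000_0000, giving
// min(+0.0, -0.0) = -0.0, while and(...) = 0x0000_0000 gives max(+0.0, -0.0) = +0.0.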
let op = if *is_min { or_op } else { and_op };
let inst = Inst::xmm_rm_r(op, RegMem::reg(lhs), Writable::from_reg(dst));
inst.emit(&[], sink, info, state);
let inst = Inst::jmp_known(done);
inst.emit(&[], sink, info, state);
// x86's min/max are not commutative: if either operand is a NaN, they return
// the second (read-only) source operand. To get the desired NaN propagation,
// perform an addition between the two operands instead; the add quiets a
// signalling NaN and yields a NaN whenever either input is one.
sink.bind_label(propagate_nan, state.ctrl_plane_mut());
let inst = Inst::xmm_rm_r(add_op, RegMem::reg(lhs), Writable::from_reg(dst));
inst.emit(&[], sink, info, state);
one_way_jmp(sink, CC::P, done);
sink.bind_label(do_min_max, state.ctrl_plane_mut());
let inst = Inst::xmm_rm_r(min_max_op, RegMem::reg(lhs), Writable::from_reg(dst));
inst.emit(&[], sink, info, state);
sink.bind_label(done, state.ctrl_plane_mut());
}
Inst::XmmRmRImm {
op,
src1,
src2,
dst,
imm,
size,
} => {
let src1 = allocs.next(*src1);
let dst = allocs.next(dst.to_reg());
let src2 = src2.with_allocs(allocs);
debug_assert_eq!(src1, dst);
let (prefix, opcode, len) = match op {
SseOpcode::Cmpps => (LegacyPrefixes::None, 0x0FC2, 2),
SseOpcode::Cmppd => (LegacyPrefixes::_66, 0x0FC2, 2),
SseOpcode::Cmpss => (LegacyPrefixes::_F3, 0x0FC2, 2),
SseOpcode::Cmpsd => (LegacyPrefixes::_F2, 0x0FC2, 2),
SseOpcode::Insertps => (LegacyPrefixes::_66, 0x0F3A21, 3),
SseOpcode::Palignr => (LegacyPrefixes::_66, 0x0F3A0F, 3),
SseOpcode::Pinsrb => (LegacyPrefixes::_66, 0x0F3A20, 3),
SseOpcode::Pinsrw => (LegacyPrefixes::_66, 0x0FC4, 2),
SseOpcode::Pinsrd => (LegacyPrefixes::_66, 0x0F3A22, 3),
SseOpcode::Shufps => (LegacyPrefixes::None, 0x0FC6, 2),
SseOpcode::Pblendw => (LegacyPrefixes::_66, 0x0F3A0E, 3),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
let rex = RexFlags::from(*size);
let regs_swapped = match *op {
// These opcodes (and not the SSE2 version of PEXTRW) flip the operand
// encoding: `dst` in ModRM's r/m, `src` in ModRM's reg field.
SseOpcode::Pextrb | SseOpcode::Pextrd => true,
// The rest of the opcodes have the customary encoding: `dst` in ModRM's reg,
// `src` in ModRM's r/m field.
_ => false,
};
match src2 {
RegMem::Reg { reg } => {
if regs_swapped {
emit_std_reg_reg(sink, prefix, opcode, len, reg, dst, rex);
} else {
emit_std_reg_reg(sink, prefix, opcode, len, dst, reg, rex);
}
}
RegMem::Mem { addr } => {
let addr = &addr.finalize(state, sink);
assert!(
!regs_swapped,
"No existing way to encode a mem argument in the ModRM r/m field."
);
// N.B.: bytes_at_end == 1, because of the `imm` byte below.
emit_std_reg_mem(sink, prefix, opcode, len, dst, addr, rex, 1);
}
}
sink.put1(*imm);
}
Inst::XmmUninitializedValue { .. } => {
// This instruction format only exists to declare a register as a `def`; no code is
// emitted.
}
Inst::XmmMovRM { op, src, dst } => {
let src = allocs.next(src.to_reg());
let dst = dst.with_allocs(allocs);
let (prefix, opcode) = match op {
SseOpcode::Movaps => (LegacyPrefixes::None, 0x0F29),
SseOpcode::Movapd => (LegacyPrefixes::_66, 0x0F29),
SseOpcode::Movdqu => (LegacyPrefixes::_F3, 0x0F7F),
SseOpcode::Movss => (LegacyPrefixes::_F3, 0x0F11),
SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F11),
SseOpcode::Movups => (LegacyPrefixes::None, 0x0F11),
SseOpcode::Movupd => (LegacyPrefixes::_66, 0x0F11),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
let dst = &dst.finalize(state, sink);
emit_std_reg_mem(sink, prefix, opcode, 2, src, dst, RexFlags::clear_w(), 0);
}
Inst::XmmMovRMImm { op, src, dst, imm } => {
let src = allocs.next(src.to_reg());
let dst = dst.with_allocs(allocs);
let (w, prefix, opcode) = match op {
SseOpcode::Pextrb => (false, LegacyPrefixes::_66, 0x0F3A14),
SseOpcode::Pextrw => (false, LegacyPrefixes::_66, 0x0F3A15),
SseOpcode::Pextrd => (false, LegacyPrefixes::_66, 0x0F3A16),
SseOpcode::Pextrq => (true, LegacyPrefixes::_66, 0x0F3A16),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
let rex = if w {
RexFlags::set_w()
} else {
RexFlags::clear_w()
};
let dst = &dst.finalize(state, sink);
emit_std_reg_mem(sink, prefix, opcode, 3, src, dst, rex, 1);
sink.put1(*imm);
}
Inst::XmmToGpr {
op,
src,
dst,
dst_size,
} => {
let src = allocs.next(src.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
let (prefix, opcode, dst_first) = match op {
SseOpcode::Cvttss2si => (LegacyPrefixes::_F3, 0x0F2C, true),
SseOpcode::Cvttsd2si => (LegacyPrefixes::_F2, 0x0F2C, true),
// Movd and movq use the same opcode; the REX.W bit (derived from
// `dst_size` below) determines which one is emitted.
SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F7E, false),
SseOpcode::Movmskps => (LegacyPrefixes::None, 0x0F50, true),
SseOpcode::Movmskpd => (LegacyPrefixes::_66, 0x0F50, true),
SseOpcode::Pmovmskb => (LegacyPrefixes::_66, 0x0FD7, true),
_ => panic!("unexpected opcode {:?}", op),
};
let rex = RexFlags::from(*dst_size);
let (src, dst) = if dst_first { (dst, src) } else { (src, dst) };
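// `emit_std_reg_reg` encodes its first register argument in the ModRM reg
// field and its second in ModRM r/m; the swap above picks which operand
// lands in the reg field for this opcode.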
emit_std_reg_reg(sink, prefix, opcode, 2, src, dst, rex);
}
Inst::XmmToGprImm { op, src, dst, imm } => {
use OperandSize as OS;
let src = allocs.next(src.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
let (prefix, opcode, opcode_bytes, dst_size, dst_first) = match op {
SseOpcode::Pextrb => (LegacyPrefixes::_66, 0x0F3A14, 3, OS::Size32, false),
SseOpcode::Pextrw => (LegacyPrefixes::_66, 0x0FC5, 2, OS::Size32, true),
SseOpcode::Pextrd => (LegacyPrefixes::_66, 0x0F3A16, 3, OS::Size32, false),
SseOpcode::Pextrq => (LegacyPrefixes::_66, 0x0F3A16, 3, OS::Size64, false),
_ => panic!("unexpected opcode {:?}", op),
};
let rex = RexFlags::from(dst_size);
let (src, dst) = if dst_first { (dst, src) } else { (src, dst) };
emit_std_reg_reg(sink, prefix, opcode, opcode_bytes, src, dst, rex);
sink.put1(*imm);
}
Inst::GprToXmm {
op,
src: src_e,
dst: reg_g,
src_size,
} => {
let reg_g = allocs.next(reg_g.to_reg().to_reg());
let src_e = src_e.clone().to_reg_mem().with_allocs(allocs);
let (prefix, opcode) = match op {
// Movd and movq use the same opcode; the REX.W bit (derived from
// `src_size` below) determines which one is emitted.
SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F6E),
_ => panic!("unexpected opcode {:?}", op),
};
let rex = RexFlags::from(*src_size);
match src_e {
RegMem::Reg { reg: reg_e } => {
emit_std_reg_reg(sink, prefix, opcode, 2, reg_g, reg_e, rex);
}
RegMem::Mem { addr } => {
let addr = &addr.finalize(state, sink);
emit_std_reg_mem(sink, prefix, opcode, 2, reg_g, addr, rex, 0);
}
}
}
Inst::XmmCmpRmR { op, src, dst } => {
let dst = allocs.next(dst.to_reg());
let src = src.clone().to_reg_mem().with_allocs(allocs);
let rex = RexFlags::clear_w();
let (prefix, opcode, len) = match op {
SseOpcode::Ptest => (LegacyPrefixes::_66, 0x0F3817, 3),
SseOpcode::Ucomisd => (LegacyPrefixes::_66, 0x0F2E, 2),
SseOpcode::Ucomiss => (LegacyPrefixes::None, 0x0F2E, 2),
_ => unimplemented!("Emit xmm cmp rm r"),
};
match src {
RegMem::Reg { reg } => {
emit_std_reg_reg(sink, prefix, opcode, len, dst, reg, rex);
}
RegMem::Mem { addr } => {
let addr = &addr.finalize(state, sink);
emit_std_reg_mem(sink, prefix, opcode, len, dst, addr, rex, 0);
}
}
}
Inst::CvtIntToFloat {
op,
src1,
src2,
dst,
src2_size,
} => {
let src1 = allocs.next(src1.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
assert_eq!(src1, dst);
let src2 = src2.clone().to_reg_mem().with_allocs(allocs);
let (prefix, opcode) = match op {
SseOpcode::Cvtsi2ss => (LegacyPrefixes::_F3, 0x0F2A),
SseOpcode::Cvtsi2sd => (LegacyPrefixes::_F2, 0x0F2A),
_ => panic!("unexpected opcode {:?}", op),
};
let rex = RexFlags::from(*src2_size);
match src2 {
RegMem::Reg { reg: src2 } => {
emit_std_reg_reg(sink, prefix, opcode, 2, dst, src2, rex);
}
RegMem::Mem { addr } => {
let addr = &addr.finalize(state, sink);
emit_std_reg_mem(sink, prefix, opcode, 2, dst, addr, rex, 0);
}
}
}
Inst::CvtIntToFloatVex {
op,
src1,
src2,
dst,
src2_size,
} => {
let dst = allocs.next(dst.to_reg().to_reg());
let src1 = allocs.next(src1.to_reg());
let src2 = match src2.clone().to_reg_mem().with_allocs(allocs) {
RegMem::Reg { reg } => {
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
}
RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
};
let (prefix, map, opcode) = match op {
AvxOpcode::Vcvtsi2ss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x2A),
AvxOpcode::Vcvtsi2sd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x2A),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
let w = match src2_size {
OperandSize::Size64 => true,
_ => false,
};
VexInstruction::new()
.length(VexVectorLength::V128)
.w(w)
.prefix(prefix)
.map(map)
.opcode(opcode)
.rm(src2)
.reg(dst.to_real_reg().unwrap().hw_enc())
.vvvv(src1.to_real_reg().unwrap().hw_enc())
.encode(sink);
}
Inst::CvtUint64ToFloatSeq {
dst_size,
src,
dst,
tmp_gpr1,
tmp_gpr2,
} => {
let src = allocs.next(src.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
let tmp_gpr1 = allocs.next(tmp_gpr1.to_reg().to_reg());
let tmp_gpr2 = allocs.next(tmp_gpr2.to_reg().to_reg());
// Note: this sequence is specific to 64-bit mode; a 32-bit mode would require a
// different sequence.
//
// Emit the following sequence:
//
// cmp 0, %src
// jl handle_negative
//
// ;; handle positive, which can't overflow
// cvtsi2sd/cvtsi2ss %src, %dst
// j done
//
// ;; handle negative: see below for an explanation of what it's doing.
// handle_negative:
// mov %src, %tmp_gpr1
// shr $1, %tmp_gpr1
// mov %src, %tmp_gpr2
// and $1, %tmp_gpr2
// or %tmp_gpr1, %tmp_gpr2
// cvtsi2sd/cvtsi2ss %tmp_gpr2, %dst
// addsd/addss %dst, %dst
//
// done:
assert_ne!(src, tmp_gpr1);
assert_ne!(src, tmp_gpr2);
assert_ne!(tmp_gpr1, tmp_gpr2);
let handle_negative = sink.get_label();
let done = sink.get_label();
// If x seen as a signed int64 is not negative, a signed-conversion will do the right
// thing.
// TODO: use `test src, src` here.
let inst = Inst::cmp_rmi_r(OperandSize::Size64, RegMemImm::imm(0), src);
inst.emit(&[], sink, info, state);
one_way_jmp(sink, CC::L, handle_negative);
// Handle a positive int64, which is the "easy" case: a signed conversion will do the
// right thing.
emit_signed_cvt(
sink,
info,
state,
src,
Writable::from_reg(dst),
*dst_size == OperandSize::Size64,
);
let inst = Inst::jmp_known(done);
inst.emit(&[], sink, info, state);
sink.bind_label(handle_negative, state.ctrl_plane_mut());
// Divide x by two to get it in range for the signed conversion, keep the LSB, and
// scale it back up on the FP side.
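// Keeping the LSB (OR-ed back in below) makes the halving a round-to-odd
// operation, so the final add that doubles the result does not introduce a
// double-rounding error.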
let inst = Inst::gen_move(Writable::from_reg(tmp_gpr1), src, types::I64);
inst.emit(&[], sink, info, state);
// tmp_gpr1 := src >> 1
let inst = Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightLogical,
Imm8Gpr::new(Imm8Reg::Imm8 { imm: 1 }).unwrap(),
tmp_gpr1,
Writable::from_reg(tmp_gpr1),
);
inst.emit(&[], sink, info, state);
let inst = Inst::gen_move(Writable::from_reg(tmp_gpr2), src, types::I64);
inst.emit(&[], sink, info, state);
let inst = Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::imm(1),
Writable::from_reg(tmp_gpr2),
);
inst.emit(&[], sink, info, state);
let inst = Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Or,
RegMemImm::reg(tmp_gpr1),
Writable::from_reg(tmp_gpr2),
);
inst.emit(&[], sink, info, state);
emit_signed_cvt(
sink,
info,
state,
tmp_gpr2,
Writable::from_reg(dst),
*dst_size == OperandSize::Size64,
);
let add_op = if *dst_size == OperandSize::Size64 {
SseOpcode::Addsd
} else {
SseOpcode::Addss
};
let inst = Inst::xmm_rm_r(add_op, RegMem::reg(dst), Writable::from_reg(dst));
inst.emit(&[], sink, info, state);
sink.bind_label(done, state.ctrl_plane_mut());
}
Inst::CvtFloatToSintSeq {
src_size,
dst_size,
is_saturating,
src,
dst,
tmp_gpr,
tmp_xmm,
} => {
let src = allocs.next(src.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
let tmp_gpr = allocs.next(tmp_gpr.to_reg().to_reg());
let tmp_xmm = allocs.next(tmp_xmm.to_reg().to_reg());
// Emits the following common sequence:
//
// cvttss2si/cvttsd2si %src, %dst
// cmp $1, %dst
// jno done
//
// Then, for saturating conversions:
//
// ;; check for NaN
// ucomiss/ucomisd %src, %src
// jnp not_nan
// xor %dst, %dst
// j done
//
// ;; positive inputs get saturated to INT_MAX; negative ones to INT_MIN, which is
// ;; already in %dst.
// not_nan:
// xorpd %tmp_xmm, %tmp_xmm
// ucomiss/ucomisd %src, %tmp_xmm
// jnb done
// mov $INT_MAX, %dst
//
// done:
//
// Then, for non-saturating conversions:
//
// ;; check for NaN
// ucomiss/ucomisd %src, %src
// jnp not_nan
// ud2 trap BadConversionToInteger
//
// ;; check if INT_MIN was the correct result, against a magic constant:
// not_nan:
// mov $magic, %tmp_gpr
// movq/movd %tmp_gpr, %tmp_xmm
// ucomiss/ucomisd %tmp_xmm, %src
// jnb/jnbe $check_positive
// ud2 trap IntegerOverflow
//
// ;; if positive, it was a real overflow
// check_positive:
// xorpd %tmp_xmm, %tmp_xmm
// ucomiss/ucomisd %src, %tmp_xmm
// jnb done
// ud2 trap IntegerOverflow
//
// done:
let (cast_op, cmp_op, trunc_op) = match src_size {
OperandSize::Size64 => (SseOpcode::Movq, SseOpcode::Ucomisd, SseOpcode::Cvttsd2si),
OperandSize::Size32 => (SseOpcode::Movd, SseOpcode::Ucomiss, SseOpcode::Cvttss2si),
_ => unreachable!(),
};
let done = sink.get_label();
// The truncation.
let inst = Inst::xmm_to_gpr(trunc_op, src, Writable::from_reg(dst), *dst_size);
inst.emit(&[], sink, info, state);
// Compare against 1: `dst - 1` sets the overflow flag only when `dst` is
// INT_MIN, which is what the truncation returns on overflow (and NaN).
let inst = Inst::cmp_rmi_r(*dst_size, RegMemImm::imm(1), dst);
inst.emit(&[], sink, info, state);
one_way_jmp(sink, CC::NO, done); // no overflow => done
// Check for NaN.
let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), src);
inst.emit(&[], sink, info, state);
if *is_saturating {
let not_nan = sink.get_label();
one_way_jmp(sink, CC::NP, not_nan); // go to not_nan if not a NaN
// For NaN, emit 0.
let inst = Inst::alu_rmi_r(
*dst_size,
AluRmiROpcode::Xor,
RegMemImm::reg(dst),
Writable::from_reg(dst),
);
inst.emit(&[], sink, info, state);
let inst = Inst::jmp_known(done);
inst.emit(&[], sink, info, state);
sink.bind_label(not_nan, state.ctrl_plane_mut());
// If the input was positive, saturate to INT_MAX.
// Zero out tmp_xmm.
let inst = Inst::xmm_rm_r(
SseOpcode::Xorpd,
RegMem::reg(tmp_xmm),
Writable::from_reg(tmp_xmm),
);
inst.emit(&[], sink, info, state);
let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm);
inst.emit(&[], sink, info, state);
// Jump if >= to done.
one_way_jmp(sink, CC::NB, done);
// Otherwise, put INT_MAX.
if *dst_size == OperandSize::Size64 {
let inst = Inst::imm(
OperandSize::Size64,
0x7fffffffffffffff,
Writable::from_reg(dst),
);
inst.emit(&[], sink, info, state);
} else {
let inst = Inst::imm(OperandSize::Size32, 0x7fffffff, Writable::from_reg(dst));
inst.emit(&[], sink, info, state);
}
} else {
let inst = Inst::trap_if(CC::P, TrapCode::BadConversionToInteger);
inst.emit(&[], sink, info, state);
// Check if INT_MIN was the correct result: determine the smallest floating point
// number that would convert to INT_MIN, put it in a temporary register, and compare
// against the src register.
// If the src register is less (or in some cases, less-or-equal) than the threshold,
// trap!
let mut no_overflow_cc = CC::NB; // >=
let output_bits = dst_size.to_bits();
match *src_size {
OperandSize::Size32 => {
let cst = Ieee32::pow2(output_bits - 1).neg().bits();
let inst =
Inst::imm(OperandSize::Size32, cst as u64, Writable::from_reg(tmp_gpr));
inst.emit(&[], sink, info, state);
}
OperandSize::Size64 => {
// An f64 can represent `i32::min_value() - 1` exactly with precision to spare,
// so there are values less than -2^(N-1) that convert correctly to INT_MIN.
let cst = if output_bits < 64 {
no_overflow_cc = CC::NBE; // >
Ieee64::fcvt_to_sint_negative_overflow(output_bits)
} else {
Ieee64::pow2(output_bits - 1).neg()
};
let inst =
Inst::imm(OperandSize::Size64, cst.bits(), Writable::from_reg(tmp_gpr));
inst.emit(&[], sink, info, state);
}
_ => unreachable!(),
}
let inst = Inst::gpr_to_xmm(
cast_op,
RegMem::reg(tmp_gpr),
*src_size,
Writable::from_reg(tmp_xmm),
);
inst.emit(&[], sink, info, state);
let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(tmp_xmm), src);
inst.emit(&[], sink, info, state);
// no trap if src >= or > threshold
let inst = Inst::trap_if(no_overflow_cc.invert(), TrapCode::IntegerOverflow);
inst.emit(&[], sink, info, state);
// If positive, it was a real overflow.
// Zero out the tmp_xmm register.
let inst = Inst::xmm_rm_r(
SseOpcode::Xorpd,
RegMem::reg(tmp_xmm),
Writable::from_reg(tmp_xmm),
);
inst.emit(&[], sink, info, state);
let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm);
inst.emit(&[], sink, info, state);
// no trap if 0 >= src
let inst = Inst::trap_if(CC::B, TrapCode::IntegerOverflow);
inst.emit(&[], sink, info, state);
}
sink.bind_label(done, state.ctrl_plane_mut());
}
Inst::CvtFloatToUintSeq {
src_size,
dst_size,
is_saturating,
src,
dst,
tmp_gpr,
tmp_xmm,
tmp_xmm2,
} => {
let src = allocs.next(src.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
let tmp_gpr = allocs.next(tmp_gpr.to_reg().to_reg());
let tmp_xmm = allocs.next(tmp_xmm.to_reg().to_reg());
let tmp_xmm2 = allocs.next(tmp_xmm2.to_reg().to_reg());
// The only difference in behavior between saturating and non-saturating is how we
// handle errors. Emits the following sequence:
//
// mov 2**(int_width - 1), %tmp_gpr
// movq/movd %tmp_gpr, %tmp_xmm
// ucomiss/ucomisd %tmp_xmm, %src
// jnb is_large
//
// ;; check for NaN inputs
// jnp not_nan
// -- non-saturating: ud2 trap BadConversionToInteger
// -- saturating: xor %dst, %dst; j done
//
// not_nan:
// cvttss2si/cvttsd2si %src, %dst
// cmp $0, %dst
// jnl done
// -- non-saturating: ud2 trap IntegerOverflow
// -- saturating: xor %dst, %dst; j done
//
// is_large:
// mov %src, %tmp_xmm2
// subss/subsd %tmp_xmm, %tmp_xmm2
// cvttss2si/cvttsd2si %tmp_xmm2, %dst
// cmp $0, %dst
// jnl next_is_large
// -- non-saturating: ud2 trap IntegerOverflow
// -- saturating: mov $UINT_MAX, %dst; j done
//
// next_is_large:
// add 2**(int_width - 1), %dst ;; 2 instructions for 64-bit integers
//
// done:
assert_ne!(tmp_xmm, src, "tmp_xmm clobbers src!");
let (sub_op, cast_op, cmp_op, trunc_op) = match src_size {
OperandSize::Size32 => (
SseOpcode::Subss,
SseOpcode::Movd,
SseOpcode::Ucomiss,
SseOpcode::Cvttss2si,
),
OperandSize::Size64 => (
SseOpcode::Subsd,
SseOpcode::Movq,
SseOpcode::Ucomisd,
SseOpcode::Cvttsd2si,
),
_ => unreachable!(),
};
let done = sink.get_label();
let cst = match src_size {
OperandSize::Size32 => Ieee32::pow2(dst_size.to_bits() - 1).bits() as u64,
OperandSize::Size64 => Ieee64::pow2(dst_size.to_bits() - 1).bits(),
_ => unreachable!(),
};
let inst = Inst::imm(*src_size, cst, Writable::from_reg(tmp_gpr));
inst.emit(&[], sink, info, state);
let inst = Inst::gpr_to_xmm(
cast_op,
RegMem::reg(tmp_gpr),
*src_size,
Writable::from_reg(tmp_xmm),
);
inst.emit(&[], sink, info, state);
let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(tmp_xmm), src);
inst.emit(&[], sink, info, state);
let handle_large = sink.get_label();
one_way_jmp(sink, CC::NB, handle_large); // jump to handle_large if src >= large_threshold
if *is_saturating {
// If not NaN jump over this 0-return, otherwise return 0
let not_nan = sink.get_label();
one_way_jmp(sink, CC::NP, not_nan);
let inst = Inst::alu_rmi_r(
*dst_size,
AluRmiROpcode::Xor,
RegMemImm::reg(dst),
Writable::from_reg(dst),
);
inst.emit(&[], sink, info, state);
let inst = Inst::jmp_known(done);
inst.emit(&[], sink, info, state);
sink.bind_label(not_nan, state.ctrl_plane_mut());
} else {
// Trap.
let inst = Inst::trap_if(CC::P, TrapCode::BadConversionToInteger);
inst.emit(&[], sink, info, state);
}
// Actual truncation for small inputs: if the result is negative, then we had
// an overflow.
let inst = Inst::xmm_to_gpr(trunc_op, src, Writable::from_reg(dst), *dst_size);
inst.emit(&[], sink, info, state);
let inst = Inst::cmp_rmi_r(*dst_size, RegMemImm::imm(0), dst);
inst.emit(&[], sink, info, state);
one_way_jmp(sink, CC::NL, done); // if dst >= 0, jump to done
if *is_saturating {
// The input was "small" (< 2**(width -1)), so the only way to get an integer
// overflow is because the input was too small: saturate to the min value, i.e. 0.
let inst = Inst::alu_rmi_r(
*dst_size,
AluRmiROpcode::Xor,
RegMemImm::reg(dst),
Writable::from_reg(dst),
);
inst.emit(&[], sink, info, state);
let inst = Inst::jmp_known(done);
inst.emit(&[], sink, info, state);
} else {
// Trap.
let inst = Inst::trap(TrapCode::IntegerOverflow);
inst.emit(&[], sink, info, state);
}
// Now handle large inputs.
sink.bind_label(handle_large, state.ctrl_plane_mut());
let inst = Inst::gen_move(Writable::from_reg(tmp_xmm2), src, types::F64);
inst.emit(&[], sink, info, state);
let inst = Inst::xmm_rm_r(sub_op, RegMem::reg(tmp_xmm), Writable::from_reg(tmp_xmm2));
inst.emit(&[], sink, info, state);
let inst = Inst::xmm_to_gpr(trunc_op, tmp_xmm2, Writable::from_reg(dst), *dst_size);
inst.emit(&[], sink, info, state);
let inst = Inst::cmp_rmi_r(*dst_size, RegMemImm::imm(0), dst);
inst.emit(&[], sink, info, state);
if *is_saturating {
let next_is_large = sink.get_label();
one_way_jmp(sink, CC::NL, next_is_large); // if dst >= 0, jump to next_is_large
// The input was "large" (>= 2**(width -1)), so the only way to get an integer
// overflow is because the input was too large: saturate to the max value.
let inst = Inst::imm(
OperandSize::Size64,
if *dst_size == OperandSize::Size64 {
u64::max_value()
} else {
u32::max_value() as u64
},
Writable::from_reg(dst),
);
inst.emit(&[], sink, info, state);
let inst = Inst::jmp_known(done);
inst.emit(&[], sink, info, state);
sink.bind_label(next_is_large, state.ctrl_plane_mut());
} else {
let inst = Inst::trap_if(CC::L, TrapCode::IntegerOverflow);
inst.emit(&[], sink, info, state);
}
if *dst_size == OperandSize::Size64 {
let inst = Inst::imm(OperandSize::Size64, 1 << 63, Writable::from_reg(tmp_gpr));
inst.emit(&[], sink, info, state);
let inst = Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Add,
RegMemImm::reg(tmp_gpr),
Writable::from_reg(dst),
);
inst.emit(&[], sink, info, state);
} else {
let inst = Inst::alu_rmi_r(
OperandSize::Size32,
AluRmiROpcode::Add,
RegMemImm::imm(1 << 31),
Writable::from_reg(dst),
);
inst.emit(&[], sink, info, state);
}
sink.bind_label(done, state.ctrl_plane_mut());
}
Inst::LoadExtName {
dst,
name,
offset,
distance,
} => {
let dst = allocs.next(dst.to_reg());
if info.flags.is_pic() {
// Generates: movq symbol@GOTPCREL(%rip), %dst
let enc_dst = int_reg_enc(dst);
sink.put1(0x48 | ((enc_dst >> 3) & 1) << 2);
sink.put1(0x8B);
sink.put1(0x05 | ((enc_dst & 7) << 3));
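// ModRM mod=00, r/m=101 selects RIP-relative addressing; the 4-byte
// displacement emitted below is filled in by the relocation.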
emit_reloc(sink, Reloc::X86GOTPCRel4, name, -4);
sink.put4(0);
// An offset applied through the relocation above would be relative to the
// address of the *GOT entry*, not the loaded address, so we instead emit a
// separate add or sub instruction below if an offset is needed.
if *offset < 0 {
assert!(*offset >= -i32::MAX as i64);
sink.put1(0x48 | ((enc_dst >> 3) & 1));
sink.put1(0x81);
sink.put1(0xe8 | (enc_dst & 7));
sink.put4((-*offset) as u32);
} else if *offset > 0 {
assert!(*offset <= i32::MAX as i64);
sink.put1(0x48 | ((enc_dst >> 3) & 1));
sink.put1(0x81);
sink.put1(0xc0 | (enc_dst & 7));
sink.put4(*offset as u32);
}
} else if distance == &RelocDistance::Near {
// If we know the distance to the name is within 2GB (e.g., a module-local function),
// we can generate a RIP-relative address, with a relocation.
// Generates: lea $name(%rip), $dst
let enc_dst = int_reg_enc(dst);
sink.put1(0x48 | ((enc_dst >> 3) & 1) << 2);
sink.put1(0x8D);
sink.put1(0x05 | ((enc_dst & 7) << 3));
emit_reloc(sink, Reloc::X86CallPCRel4, name, -4);
sink.put4(0);
} else {
// The full address can be encoded in the register, with a relocation.
// Generates: movabsq $name, %dst
let enc_dst = int_reg_enc(dst);
sink.put1(0x48 | ((enc_dst >> 3) & 1));
sink.put1(0xB8 | (enc_dst & 7));
emit_reloc(sink, Reloc::Abs8, name, *offset);
sink.put8(0);
}
}
Inst::LockCmpxchg {
ty,
replacement,
expected,
mem,
dst_old,
} => {
let replacement = allocs.next(*replacement);
let expected = allocs.next(*expected);
let dst_old = allocs.next(dst_old.to_reg());
let mem = mem.with_allocs(allocs);
debug_assert_eq!(expected, regs::rax());
debug_assert_eq!(dst_old, regs::rax());
// lock cmpxchg{b,w,l,q} %replacement, (mem)
// Note that 0xF0 is the Lock prefix.
let (prefix, opcodes) = match *ty {
types::I8 => (LegacyPrefixes::_F0, 0x0FB0),
types::I16 => (LegacyPrefixes::_66F0, 0x0FB1),
types::I32 => (LegacyPrefixes::_F0, 0x0FB1),
types::I64 => (LegacyPrefixes::_F0, 0x0FB1),
_ => unreachable!(),
};
let rex = RexFlags::from((OperandSize::from_ty(*ty), replacement));
let amode = mem.finalize(state, sink);
emit_std_reg_mem(sink, prefix, opcodes, 2, replacement, &amode, rex, 0);
}
Inst::AtomicRmwSeq {
ty,
op,
mem,
operand,
temp,
dst_old,
} => {
let operand = allocs.next(*operand);
let temp = allocs.next_writable(*temp);
let dst_old = allocs.next_writable(*dst_old);
debug_assert_eq!(dst_old.to_reg(), regs::rax());
let mem = mem.finalize(state, sink).with_allocs(allocs);
// Emit this:
// mov{zbq,zwq,zlq,q} (%r_address), %rax // rax = old value
// again:
// movq %rax, %r_temp // rax = old value, r_temp = old value
// `op`q %r_operand, %r_temp // rax = old value, r_temp = new value
// lock cmpxchg{b,w,l,q} %r_temp, (%r_address) // try to store new value
// jnz again // If this is taken, rax will have a "revised" old value
//
// Operand conventions: IN: %r_address, %r_operand OUT: %rax (old
// value), %r_temp (trashed), %rflags (trashed)
//
// In the case where the operation is 'xchg', the "`op`q" instruction is
// instead `movq %r_operand, %r_temp`, so that we simply write the operand
// (the "2nd arg for `op`") into the destination.
//
// TODO: this sequence can be significantly improved (e.g., to `lock
// <op>`) when it is known that `dst_old` is not used later, see
// https://github.com/bytecodealliance/wasmtime/issues/2153.
let again_label = sink.get_label();
// mov{zbq,zwq,zlq,q} (%r_address), %rax
// No need to call `add_trap` here, since the `i1` emit will do that.
let i1 = Inst::load(*ty, mem.clone(), dst_old, ExtKind::ZeroExtend);
i1.emit(&[], sink, info, state);
// again:
sink.bind_label(again_label, state.ctrl_plane_mut());
// movq %rax, %r_temp
let i2 = Inst::mov_r_r(OperandSize::Size64, dst_old.to_reg(), temp);
i2.emit(&[], sink, info, state);
let operand_rmi = RegMemImm::reg(operand);
use inst_common::MachAtomicRmwOp as RmwOp;
match op {
RmwOp::Xchg => {
// movq %r_operand, %r_temp
let i3 = Inst::mov_r_r(OperandSize::Size64, operand, temp);
i3.emit(&[], sink, info, state);
}
RmwOp::Nand => {
// andq %r_operand, %r_temp
let i3 =
Inst::alu_rmi_r(OperandSize::Size64, AluRmiROpcode::And, operand_rmi, temp);
i3.emit(&[], sink, info, state);
// notq %r_temp
let i4 = Inst::not(OperandSize::Size64, temp);
i4.emit(&[], sink, info, state);
}
RmwOp::Umin | RmwOp::Umax | RmwOp::Smin | RmwOp::Smax => {
// cmp %r_temp, %r_operand
let i3 = Inst::cmp_rmi_r(
OperandSize::from_ty(*ty),
RegMemImm::reg(temp.to_reg()),
operand,
);
i3.emit(&[], sink, info, state);
// cmovcc %r_operand, %r_temp
let cc = match op {
RmwOp::Umin => CC::BE,
RmwOp::Umax => CC::NB,
RmwOp::Smin => CC::LE,
RmwOp::Smax => CC::NL,
_ => unreachable!(),
};
let i4 = Inst::cmove(OperandSize::Size64, cc, RegMem::reg(operand), temp);
i4.emit(&[], sink, info, state);
}
_ => {
// opq %r_operand, %r_temp
let alu_op = match op {
RmwOp::Add => AluRmiROpcode::Add,
RmwOp::Sub => AluRmiROpcode::Sub,
RmwOp::And => AluRmiROpcode::And,
RmwOp::Or => AluRmiROpcode::Or,
RmwOp::Xor => AluRmiROpcode::Xor,
RmwOp::Xchg
| RmwOp::Nand
| RmwOp::Umin
| RmwOp::Umax
| RmwOp::Smin
| RmwOp::Smax => unreachable!(),
};
let i3 = Inst::alu_rmi_r(OperandSize::Size64, alu_op, operand_rmi, temp);
i3.emit(&[], sink, info, state);
}
}
// lock cmpxchg{b,w,l,q} %r_temp, (%r_address)
// No need to call `add_trap` here, since the `i4` emit will do that.
let i4 = Inst::LockCmpxchg {
ty: *ty,
replacement: temp.to_reg(),
expected: dst_old.to_reg(),
mem: mem.into(),
dst_old,
};
i4.emit(&[], sink, info, state);
// jnz again
one_way_jmp(sink, CC::NZ, again_label);
}
Inst::Fence { kind } => {
sink.put1(0x0F);
sink.put1(0xAE);
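// All three fence variants share the 0x0F 0xAE opcode; the ModRM byte below
// selects which one.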
match kind {
FenceKind::MFence => sink.put1(0xF0), // mfence = 0F AE F0
FenceKind::LFence => sink.put1(0xE8), // lfence = 0F AE E8
FenceKind::SFence => sink.put1(0xF8), // sfence = 0F AE F8
}
}
Inst::Hlt => {
sink.put1(0xcc); // 0xCC is the one-byte int3 (breakpoint) encoding.
}
Inst::Ud2 { trap_code } => {
sink.add_trap(*trap_code);
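// If a stack map is pending at this safepoint, attach it to the two bytes of
// the trap opcode emitted just below.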
if let Some(s) = state.take_stack_map() {
sink.add_stack_map(StackMapExtent::UpcomingBytes(2), s);
}
sink.put_data(Inst::TRAP_OPCODE);
}
Inst::VirtualSPOffsetAdj { offset } => {
state.adjust_virtual_sp_offset(*offset);
}
Inst::Nop { len } => {
// These encodings can all be found in Intel's architecture manual, at the NOP
// instruction description.
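// The loop below emits the recommended multi-byte NOP forms, at most 9 bytes
// per instruction, until the requested length is covered.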
let mut len = *len;
while len != 0 {
let emitted = u8::min(len, 9);
match emitted {
0 => {}
1 => sink.put1(0x90), // NOP
2 => {
// 66 NOP
sink.put1(0x66);
sink.put1(0x90);
}
3 => {
// NOP [EAX]
sink.put1(0x0F);
sink.put1(0x1F);
sink.put1(0x00);
}
4 => {
// NOP 0(EAX), with 0 a 1-byte displacement.
sink.put1(0x0F);
sink.put1(0x1F);
sink.put1(0x40);
sink.put1(0x00);
}
5 => {
// NOP 0[EAX, EAX, 1], with 0 a 1-byte displacement.
sink.put1(0x0F);
sink.put1(0x1F);
sink.put1(0x44);
sink.put1(0x00);
sink.put1(0x00);
}
6 => {
// 66 NOP 0[EAX, EAX, 1], with 0 a 1-byte displacement.
sink.put1(0x66);
sink.put1(0x0F);
sink.put1(0x1F);
sink.put1(0x44);
sink.put1(0x00);
sink.put1(0x00);
}
7 => {
// NOP 0[EAX], with 0 a 4-byte displacement.
sink.put1(0x0F);
sink.put1(0x1F);
sink.put1(0x80);
sink.put1(0x00);
sink.put1(0x00);
sink.put1(0x00);
sink.put1(0x00);
}
8 => {
// NOP 0[EAX, EAX, 1], with 0 a 4-byte displacement.
sink.put1(0x0F);
sink.put1(0x1F);
sink.put1(0x84);
sink.put1(0x00);
sink.put1(0x00);
sink.put1(0x00);
sink.put1(0x00);
sink.put1(0x00);
}
9 => {
// 66 NOP 0[EAX, EAX, 1], with 0 a 4-byte displacement.
sink.put1(0x66);
sink.put1(0x0F);
sink.put1(0x1F);
sink.put1(0x84);
sink.put1(0x00);
sink.put1(0x00);
sink.put1(0x00);
sink.put1(0x00);
sink.put1(0x00);
}
_ => unreachable!(),
}
len -= emitted;
}
}
Inst::ElfTlsGetAddr { ref symbol, dst } => {
let dst = allocs.next(dst.to_reg().to_reg());
debug_assert_eq!(dst, regs::rax());
// N.B.: Must be exactly this byte sequence; the linker requires it,
// because it must know how to rewrite the bytes.
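// (Keeping the exact sequence also lets the linker relax this general-dynamic
// access to an initial-exec or local-exec one.)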
// data16 lea gv@tlsgd(%rip),%rdi
sink.put1(0x66); // data16
sink.put1(0b01001000); // REX.W
sink.put1(0x8d); // LEA
sink.put1(0x3d); // ModRM byte
emit_reloc(sink, Reloc::ElfX86_64TlsGd, symbol, -4);
sink.put4(0); // offset
// data16 data16 callq __tls_get_addr-4
sink.put1(0x66); // data16
sink.put1(0x66); // data16
sink.put1(0b01001000); // REX.W
sink.put1(0xe8); // CALL
emit_reloc(
sink,
Reloc::X86CallPLTRel4,
&ExternalName::LibCall(LibCall::ElfTlsGetAddr),
-4,
);
sink.put4(0); // offset
}
Inst::MachOTlsGetAddr { ref symbol, dst } => {
let dst = allocs.next(dst.to_reg().to_reg());
debug_assert_eq!(dst, regs::rax());
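// Mach-O TLS: load the address of the symbol's TLV descriptor into %rdi and
// call through the descriptor's first field (its access thunk), which returns
// the variable's address in %rax.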
// movq gv@tlv(%rip), %rdi
sink.put1(0x48); // REX.W
sink.put1(0x8b); // MOV
sink.put1(0x3d); // ModRM byte
emit_reloc(sink, Reloc::MachOX86_64Tlv, symbol, -4);
sink.put4(0); // offset
// callq *(%rdi)
sink.put1(0xff);
sink.put1(0x17);
}
Inst::CoffTlsGetAddr {
ref symbol,
dst,
tmp,
} => {
let dst = allocs.next(dst.to_reg().to_reg());
debug_assert_eq!(dst, regs::rax());
// tmp is used below directly as %rcx
let tmp = allocs.next(tmp.to_reg().to_reg());
debug_assert_eq!(tmp, regs::rcx());
// See: https://gcc.godbolt.org/z/M8or9x6ss
// And: https://github.com/bjorn3/rustc_codegen_cranelift/issues/388#issuecomment-532930282
// Emit the following sequence
// movl (%rip), %eax ; IMAGE_REL_AMD64_REL32 _tls_index
// movq %gs:88, %rcx
// movq (%rcx,%rax,8), %rax
// leaq (%rax), %rax ; Reloc: IMAGE_REL_AMD64_SECREL symbol
// Load TLS index for current thread
// movl (%rip), %eax
sink.put1(0x8b); // MOV r32, r/m32
sink.put1(0x05); // ModRM: %eax, RIP-relative
emit_reloc(
sink,
Reloc::X86PCRel4,
&ExternalName::KnownSymbol(KnownSymbol::CoffTlsIndex),
-4,
);
sink.put4(0); // offset
// movq %gs:88, %rcx
// Load the TLS Storage Array pointer
// The gs segment register refers to the base address of the TEB on x64.
// 0x58 is the offset in the TEB for the ThreadLocalStoragePointer member on x64:
sink.put_data(&[
0x65, // GS segment override prefix
0x48, // REX.W
0x8b, // MOV
0x0c, 0x25, // ModRM + SIB: absolute disp32 addressing, dst %rcx
0x58, 0x00, 0x00, 0x00, // disp32 0x58: ThreadLocalStoragePointer offset in the TEB
]);
// movq (%rcx,%rax,8), %rax
// Load the actual TLS entry for this thread.
// Computes ThreadLocalStoragePointer + _tls_index*8
sink.put_data(&[0x48, 0x8b, 0x04, 0xc1]);
// leaq (%rax), %rax
sink.put1(0x48);
sink.put1(0x8d);
sink.put1(0x80);
emit_reloc(sink, Reloc::X86SecRel, symbol, 0);
sink.put4(0); // offset
}
Inst::Unwind { ref inst } => {
sink.add_unwind(inst.clone());
}
Inst::DummyUse { .. } => {
// Nothing.
}
}
state.clear_post_insn();
}
/// Emit the common sequence used for both direct and indirect tail calls:
///
/// * Copy the new frame's stack arguments over the top of our current frame.
///
/// * Restore the old frame pointer.
///
/// * Initialize the tail callee's stack pointer (simultaneously deallocating
/// the temporary stack space we allocated when creating the new frame's stack
/// arguments).
///
/// * Move the return address into its stack slot.
fn emit_return_call_common_sequence(
allocs: &mut AllocationConsumer<'_>,
sink: &mut MachBuffer<Inst>,
info: &EmitInfo,
state: &mut EmitState,
new_stack_arg_size: u32,
old_stack_arg_size: u32,
ret_addr: Option<Gpr>,
fp: Gpr,
tmp: WritableGpr,
uses: &CallArgList,
) {
assert!(
info.flags.preserve_frame_pointers(),
"frame pointers aren't fundamentally required for tail calls, \
but the current implementation relies on them being present"
);
for u in uses {
let _ = allocs.next(u.vreg);
}
let ret_addr = ret_addr.map(|r| Gpr::new(allocs.next(*r)).unwrap());
let fp = allocs.next(*fp);
let tmp = allocs.next(tmp.to_reg().to_reg());
let tmp = Gpr::new(tmp).unwrap();
let tmp_w = WritableGpr::from_reg(tmp);
// Copy the new frame's stack arguments (the `new_stack_arg_size` bytes
// just above SP) over our current frame's stack arguments, using only
// volatile, non-argument registers.
//
// The current stack layout is the following:
//
// | ... |
// +---------------------+
// | ... |
// | stack arguments |
// | ... |
// current | return address |
// frame | old FP | <-- FP
// | ... |
// | old stack slots |
// | ... |
// +---------------------+
// | ... |
// new | new stack arguments |
// frame | ... | <-- SP
// +---------------------+
//
// We need to restore the old FP, copy the new stack arguments over the old
// stack arguments, write the return address into the correct slot just
// after the new stack arguments, adjust SP to point to the new return
// address, and then jump to the callee (which will push the old FP again).
// Restore the old FP into `rbp`.
Inst::Mov64MR {
src: SyntheticAmode::Real(Amode::ImmReg {
simm32: 0,
base: fp,
flags: MemFlags::trusted(),
}),
dst: Writable::from_reg(Gpr::new(regs::rbp()).unwrap()),
}
.emit(&[], sink, info, state);
// The new lowest address (top of stack) -- relative to FP -- for
// our tail callee. We compute this now so that we can move our
// stack arguments into place.
let callee_sp_relative_to_fp = i64::from(old_stack_arg_size) - i64::from(new_stack_arg_size);
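// (Negative when the callee needs more stack-argument space than our caller
// gave us, positive when it needs less.)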
// Copy over each word, using `tmp` as a temporary register.
//
// Note that we have to do this from stack slots with the highest
// address to lowest address because in the case of when the tail
// callee has more stack arguments than we do, we might otherwise
// overwrite some of our stack arguments before they've been copied
// into place.
assert_eq!(
new_stack_arg_size % 8,
0,
"stack argument space sizes should always be 8-byte aligned"
);
for i in (0..new_stack_arg_size / 8).rev() {
Inst::Mov64MR {
src: SyntheticAmode::Real(Amode::ImmReg {
simm32: (i * 8).try_into().unwrap(),
base: regs::rsp(),
flags: MemFlags::trusted(),
}),
dst: tmp_w,
}
.emit(&[], sink, info, state);
Inst::MovRM {
size: OperandSize::Size64,
src: tmp,
dst: SyntheticAmode::Real(Amode::ImmReg {
// Add 2 because we need to skip over the old FP and the
// return address.
simm32: (callee_sp_relative_to_fp + i64::from((i + 2) * 8))
.try_into()
.unwrap(),
base: fp,
flags: MemFlags::trusted(),
}),
}
.emit(&[], sink, info, state);
}
// Initialize SP for the tail callee, deallocating the temporary
// stack arguments space at the same time.
Inst::LoadEffectiveAddress {
size: OperandSize::Size64,
addr: SyntheticAmode::Real(Amode::ImmReg {
// NB: We add a word to `callee_sp_relative_to_fp` here because the
// callee will push FP, not us.
simm32: callee_sp_relative_to_fp.wrapping_add(8).try_into().unwrap(),
base: fp,
flags: MemFlags::trusted(),
}),
dst: Writable::from_reg(Gpr::new(regs::rsp()).unwrap()),
}
.emit(&[], sink, info, state);
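// The temporary area that held the new stack arguments is now part of the
// callee's frame, so bring the emit state's virtual SP offset back in sync.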
state.adjust_virtual_sp_offset(-i64::from(new_stack_arg_size));
// Write the return address into the correct stack slot.
if let Some(ret_addr) = ret_addr {
Inst::MovRM {
size: OperandSize::Size64,
src: ret_addr,
dst: SyntheticAmode::Real(Amode::ImmReg {
simm32: 0,
base: regs::rsp(),
flags: MemFlags::trusted(),
}),
}
.emit(&[], sink, info, state);
}
}