//! Riscv64 ISA: binary code emission.
use crate::binemit::StackMap;
use crate::ir::RelSourceLoc;
use crate::ir::TrapCode;
use crate::isa::riscv64::inst::*;
use crate::isa::riscv64::inst::{zero_reg, AluOPRRR};
use crate::machinst::{AllocationConsumer, Reg, Writable};
use crate::trace;
use cranelift_control::ControlPlane;
use regalloc2::Allocation;
pub struct EmitInfo {
shared_flag: settings::Flags,
isa_flags: super::super::riscv_settings::Flags,
}
impl EmitInfo {
pub(crate) fn new(
shared_flag: settings::Flags,
isa_flags: super::super::riscv_settings::Flags,
) -> Self {
Self {
shared_flag,
isa_flags,
}
}
}
/// Load a constant by placing it in the code stream.
/// We compute the current PC with `auipc` and read the constant with a
/// PC-relative load. This is only allowed during the emit stage, because the
/// instructions and the inline data must be emitted together.
/// See https://github.com/bytecodealliance/wasmtime/pull/5612
#[derive(Clone, Copy)]
pub(crate) enum LoadConstant {
U32(u32),
U64(u64),
}
impl LoadConstant {
fn to_le_bytes(self) -> Vec<u8> {
match self {
LoadConstant::U32(x) => Vec::from_iter(x.to_le_bytes().into_iter()),
LoadConstant::U64(x) => Vec::from_iter(x.to_le_bytes().into_iter()),
}
}
fn load_op(self) -> LoadOP {
match self {
LoadConstant::U32(_) => LoadOP::Lwu,
LoadConstant::U64(_) => LoadOP::Ld,
}
}
fn load_ty(self) -> Type {
match self {
LoadConstant::U32(_) => R32,
LoadConstant::U64(_) => R64,
}
}
pub(crate) fn load_constant<F: FnMut(Type) -> Writable<Reg>>(
self,
rd: Writable<Reg>,
alloc_tmp: &mut F,
) -> SmallInstVec<Inst> {
let mut insts = SmallInstVec::new();
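        // Sequence: auipc to capture the PC, a PC-relative load of the inline
        // constant at pc + 12, a jal over the data, then the raw data itself.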
// get current pc.
let pc = alloc_tmp(I64);
insts.push(Inst::Auipc {
rd: pc,
imm: Imm20 { bits: 0 },
});
// load
insts.push(Inst::Load {
rd,
op: self.load_op(),
flags: MemFlags::new(),
from: AMode::RegOffset(pc.to_reg(), 12, self.load_ty()),
});
let data = self.to_le_bytes();
// jump over.
insts.push(Inst::Jal {
dest: BranchTarget::ResolvedOffset(Inst::INSTRUCTION_SIZE + data.len() as i32),
});
insts.push(Inst::RawData { data });
insts
}
    // Load the constant into `rd`, then add `rs` to it.
pub(crate) fn load_constant_and_add(self, rd: Writable<Reg>, rs: Reg) -> SmallInstVec<Inst> {
let mut insts = self.load_constant(rd, &mut |_| rd);
insts.push(Inst::AluRRR {
alu_op: AluOPRRR::Add,
rd,
rs1: rd.to_reg(),
rs2: rs,
});
insts
}
}
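/// Extract the 5-bit hardware encoding of a register for use in an
/// instruction encoding.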
pub(crate) fn reg_to_gpr_num(m: Reg) -> u32 {
u32::try_from(m.to_real_reg().unwrap().hw_enc() & 31).unwrap()
}
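/// Tracked state of the vector unit at a given emission point.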
#[derive(Clone, Debug, PartialEq, Default)]
pub enum EmitVState {
#[default]
Unknown,
Known(VState),
}
/// State carried between emissions of a sequence of instructions.
#[derive(Default, Clone, Debug)]
pub struct EmitState {
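    /// Adjustment of the real SP relative to the nominal SP, e.g. while
    /// outgoing call arguments are pushed.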
pub(crate) virtual_sp_offset: i64,
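    /// Distance from the nominal SP to the frame pointer; initialized to the
    /// frame size.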
pub(crate) nominal_sp_to_fp: i64,
/// Safepoint stack map for upcoming instruction, as provided to `pre_safepoint()`.
stack_map: Option<StackMap>,
/// Current source-code location corresponding to instruction to be emitted.
cur_srcloc: RelSourceLoc,
    /// Only used during fuzz-testing. Otherwise, it is a zero-sized struct and
    /// optimized away at compile time. See [cranelift_control].
ctrl_plane: ControlPlane,
/// Vector State
/// Controls the current state of the vector unit at the emission point.
vstate: EmitVState,
}
impl EmitState {
fn take_stack_map(&mut self) -> Option<StackMap> {
self.stack_map.take()
}
fn clear_post_insn(&mut self) {
self.stack_map = None;
}
fn cur_srcloc(&self) -> RelSourceLoc {
self.cur_srcloc
}
}
impl MachInstEmitState<Inst> for EmitState {
fn new(
abi: &Callee<crate::isa::riscv64::abi::Riscv64MachineDeps>,
ctrl_plane: ControlPlane,
) -> Self {
EmitState {
virtual_sp_offset: 0,
nominal_sp_to_fp: abi.frame_size() as i64,
stack_map: None,
cur_srcloc: RelSourceLoc::default(),
ctrl_plane,
vstate: EmitVState::Unknown,
}
}
fn pre_safepoint(&mut self, stack_map: StackMap) {
self.stack_map = Some(stack_map);
}
fn pre_sourceloc(&mut self, srcloc: RelSourceLoc) {
self.cur_srcloc = srcloc;
}
fn ctrl_plane_mut(&mut self) -> &mut ControlPlane {
&mut self.ctrl_plane
}
fn take_ctrl_plane(self) -> ControlPlane {
self.ctrl_plane
}
fn on_new_block(&mut self) {
// Reset the vector state.
self.vstate = EmitVState::Unknown;
}
}
impl Inst {
    /// Construct an "imm - rs" sequence.
pub(crate) fn construct_imm_sub_rs(rd: Writable<Reg>, imm: u64, rs: Reg) -> SmallInstVec<Inst> {
let mut insts = Inst::load_constant_u64(rd, imm, &mut |_| rd);
insts.push(Inst::AluRRR {
alu_op: AluOPRRR::Sub,
rd,
rs1: rd.to_reg(),
rs2: rs,
});
insts
}
    /// Load an all-ones mask of `ty`'s width into `rd`
    /// (e.g. 0xff for I8, 0xffff for I16, ...).
pub(crate) fn load_int_mask(rd: Writable<Reg>, ty: Type) -> SmallInstVec<Inst> {
let mut insts = SmallInstVec::new();
assert!(ty.is_int() && ty.bits() <= 64);
match ty {
I64 => {
insts.push(Inst::load_imm12(rd, Imm12::from_bits(-1)));
}
I32 | I16 => {
insts.push(Inst::load_imm12(rd, Imm12::from_bits(-1)));
insts.push(Inst::Extend {
rd: rd,
rn: rd.to_reg(),
signed: false,
from_bits: ty.bits() as u8,
to_bits: 64,
});
}
I8 => {
insts.push(Inst::load_imm12(rd, Imm12::from_bits(255)));
}
_ => unreachable!("ty:{:?}", ty),
}
insts
}
    /// Invert all bits.
pub(crate) fn construct_bit_not(rd: Writable<Reg>, rs: Reg) -> Inst {
Inst::AluRRImm12 {
alu_op: AluOPRRI::Xori,
rd,
rs,
imm12: Imm12::from_bits(-1),
}
}
    // Emit a check that a float is not a NaN (`feq rs, rs` is 1 iff `rs` is not NaN).
pub(crate) fn emit_not_nan(rd: Writable<Reg>, rs: Reg, ty: Type) -> Inst {
Inst::FpuRRR {
alu_op: if ty == F32 {
FpuOPRRR::FeqS
} else {
FpuOPRRR::FeqD
},
frm: None,
rd: rd,
rs1: rs,
rs2: rs,
}
}
pub(crate) fn emit_fabs(rd: Writable<Reg>, rs: Reg, ty: Type) -> Inst {
Inst::FpuRRR {
alu_op: if ty == F32 {
FpuOPRRR::FsgnjxS
} else {
FpuOPRRR::FsgnjxD
},
frm: None,
rd: rd,
rs1: rs,
rs2: rs,
}
}
    /// Branch to `taken` if the float in `rs` is not zero.
pub(crate) fn emit_if_float_not_zero(
tmp: Writable<Reg>,
rs: Reg,
ty: Type,
taken: BranchTarget,
not_taken: BranchTarget,
) -> SmallInstVec<Inst> {
let mut insts = SmallInstVec::new();
let class_op = if ty == F32 {
FpuOPRR::FclassS
} else {
FpuOPRR::FclassD
};
insts.push(Inst::FpuRR {
alu_op: class_op,
frm: None,
rd: tmp,
rs: rs,
});
insts.push(Inst::AluRRImm12 {
alu_op: AluOPRRI::Andi,
rd: tmp,
rs: tmp.to_reg(),
imm12: Imm12::from_bits(FClassResult::is_zero_bits() as i16),
});
insts.push(Inst::CondBr {
taken,
not_taken,
kind: IntegerCompare {
kind: IntCC::Equal,
rs1: tmp.to_reg(),
rs2: zero_reg(),
},
});
insts
}
pub(crate) fn emit_fneg(rd: Writable<Reg>, rs: Reg, ty: Type) -> Inst {
Inst::FpuRRR {
alu_op: if ty == F32 {
FpuOPRRR::FsgnjnS
} else {
FpuOPRRR::FsgnjnD
},
frm: None,
rd: rd,
rs1: rs,
rs2: rs,
}
}
pub(crate) fn lower_br_icmp(
cc: IntCC,
a: ValueRegs<Reg>,
b: ValueRegs<Reg>,
taken: BranchTarget,
not_taken: BranchTarget,
ty: Type,
) -> SmallInstVec<Inst> {
let mut insts = SmallInstVec::new();
if ty.bits() <= 64 {
let rs1 = a.only_reg().unwrap();
let rs2 = b.only_reg().unwrap();
let inst = Inst::CondBr {
taken,
not_taken,
kind: IntegerCompare { kind: cc, rs1, rs2 },
};
insts.push(inst);
return insts;
}
// compare i128
let low = |cc: IntCC| -> IntegerCompare {
IntegerCompare {
rs1: a.regs()[0],
rs2: b.regs()[0],
kind: cc,
}
};
let high = |cc: IntCC| -> IntegerCompare {
IntegerCompare {
rs1: a.regs()[1],
rs2: b.regs()[1],
kind: cc,
}
};
match cc {
IntCC::Equal => {
                // If the high parts are not equal, the values differ, so go to
                // not_taken; otherwise fall through to compare the low parts.
insts.push(Inst::CondBr {
taken: not_taken,
not_taken: BranchTarget::zero(),
kind: high(IntCC::NotEqual),
});
                // The high parts are equal; the low parts decide.
insts.push(Inst::CondBr {
taken,
not_taken,
kind: low(IntCC::Equal),
});
}
IntCC::NotEqual => {
                // If the high parts are not equal, the whole values must be
                // unequal, so go to taken; otherwise fall through.
insts.push(Inst::CondBr {
taken,
not_taken: BranchTarget::zero(), // no branch
kind: high(IntCC::NotEqual),
});
insts.push(Inst::CondBr {
taken,
not_taken,
kind: low(IntCC::NotEqual),
});
}
IntCC::SignedGreaterThanOrEqual
| IntCC::SignedLessThanOrEqual
| IntCC::UnsignedGreaterThanOrEqual
| IntCC::UnsignedLessThanOrEqual
| IntCC::SignedGreaterThan
| IntCC::SignedLessThan
| IntCC::UnsignedLessThan
| IntCC::UnsignedGreaterThan => {
                // Branch to taken if the high parts strictly satisfy the comparison.
insts.push(Inst::CondBr {
taken,
not_taken: BranchTarget::zero(),
kind: high(cc.without_equal()),
});
                // The high parts are not strictly ordered; if they differ at
                // all, the comparison is false.
insts.push(Inst::CondBr {
taken: not_taken,
not_taken: BranchTarget::zero(),
kind: high(IntCC::NotEqual),
});
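                // The high parts are equal; the low parts decide, compared
                // unsigned regardless of the signedness of `cc`.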
insts.push(Inst::CondBr {
taken,
not_taken,
kind: low(cc.unsigned()),
});
}
}
insts
}
    /// Returns Some(VState) if this instruction is expecting a specific vector state
/// before emission.
fn expected_vstate(&self) -> Option<&VState> {
match self {
Inst::Nop0
| Inst::Nop4
| Inst::BrTable { .. }
| Inst::Auipc { .. }
| Inst::Lui { .. }
| Inst::LoadConst32 { .. }
| Inst::LoadConst64 { .. }
| Inst::AluRRR { .. }
| Inst::FpuRRR { .. }
| Inst::AluRRImm12 { .. }
| Inst::Load { .. }
| Inst::Store { .. }
| Inst::Args { .. }
| Inst::Ret { .. }
| Inst::Extend { .. }
| Inst::AdjustSp { .. }
| Inst::Call { .. }
| Inst::CallInd { .. }
| Inst::TrapIf { .. }
| Inst::Jal { .. }
| Inst::CondBr { .. }
| Inst::LoadExtName { .. }
| Inst::LoadAddr { .. }
| Inst::VirtualSPOffsetAdj { .. }
| Inst::Mov { .. }
| Inst::MovFromPReg { .. }
| Inst::Fence { .. }
| Inst::FenceI
| Inst::ECall
| Inst::EBreak
| Inst::Udf { .. }
| Inst::FpuRR { .. }
| Inst::FpuRRRR { .. }
| Inst::Jalr { .. }
| Inst::Atomic { .. }
| Inst::Select { .. }
| Inst::AtomicCas { .. }
| Inst::IntSelect { .. }
| Inst::Icmp { .. }
| Inst::SelectReg { .. }
| Inst::FcvtToInt { .. }
| Inst::RawData { .. }
| Inst::AtomicStore { .. }
| Inst::AtomicLoad { .. }
| Inst::AtomicRmwLoop { .. }
| Inst::TrapIfC { .. }
| Inst::Unwind { .. }
| Inst::DummyUse { .. }
| Inst::FloatRound { .. }
| Inst::FloatSelect { .. }
| Inst::FloatSelectPseudo { .. }
| Inst::Popcnt { .. }
| Inst::Rev8 { .. }
| Inst::Cltz { .. }
| Inst::Brev8 { .. }
| Inst::StackProbeLoop { .. } => None,
// VecSetState does not expect any vstate, rather it updates it.
Inst::VecSetState { .. } => None,
// `vmv` instructions copy a set of registers and ignore vstate.
Inst::VecAluRRImm5 { op: VecAluOpRRImm5::VmvrV, .. } => None,
Inst::VecAluRR { vstate, .. } |
Inst::VecAluRRR { vstate, .. } |
Inst::VecAluRImm5 { vstate, .. } |
Inst::VecAluRRImm5 { vstate, .. } |
Inst::VecAluRRRImm5 { vstate, .. } |
// TODO: Unit-stride loads and stores only need the AVL to be correct, not
// the full vtype. A future optimization could be to decouple these two when
// updating vstate. This would allow us to avoid emitting a VecSetState in
// some cases.
Inst::VecLoad { vstate, .. }
| Inst::VecStore { vstate, .. } => Some(vstate),
}
}
}
impl MachInstEmit for Inst {
type State = EmitState;
type Info = EmitInfo;
fn emit(
&self,
allocs: &[Allocation],
sink: &mut MachBuffer<Inst>,
emit_info: &Self::Info,
state: &mut EmitState,
) {
let mut allocs = AllocationConsumer::new(allocs);
// Check if we need to update the vector state before emitting this instruction
if let Some(expected) = self.expected_vstate() {
if state.vstate != EmitVState::Known(expected.clone()) {
// Update the vector state.
Inst::VecSetState {
rd: writable_zero_reg(),
vstate: expected.clone(),
}
.emit(&[], sink, emit_info, state);
}
}
// N.B.: we *must* not exceed the "worst-case size" used to compute
// where to insert islands, except when islands are explicitly triggered
// (with an `EmitIsland`). We check this in debug builds. This is `mut`
// to allow disabling the check for `JTSequence`, which is always
// emitted following an `EmitIsland`.
let mut start_off = sink.cur_offset();
match self {
&Inst::Nop0 => {
// do nothing
}
// Addi x0, x0, 0
&Inst::Nop4 => {
let x = Inst::AluRRImm12 {
alu_op: AluOPRRI::Addi,
rd: Writable::from_reg(zero_reg()),
rs: zero_reg(),
imm12: Imm12::zero(),
};
x.emit(&[], sink, emit_info, state)
}
&Inst::RawData { ref data } => {
                // Right now we only put a u32 or u64 in this instruction.
                // It is not very long, so there is no need to check for an
                // `emit_island`. If the data were very long, that would be a
                // bug: RawData is typically used to load some data at a known
                // position in the code stream, and we might exceed
                // `Inst::worst_case_size`.
                // For more information see https://github.com/bytecodealliance/wasmtime/pull/5612.
sink.put_data(&data[..]);
}
&Inst::Lui { rd, ref imm } => {
let rd = allocs.next_writable(rd);
let x: u32 = 0b0110111 | reg_to_gpr_num(rd.to_reg()) << 7 | (imm.as_u32() << 12);
sink.put4(x);
}
&Inst::LoadConst32 { rd, imm } => {
let rd = allocs.next_writable(rd);
LoadConstant::U32(imm)
.load_constant(rd, &mut |_| rd)
.into_iter()
.for_each(|inst| inst.emit(&[], sink, emit_info, state));
}
&Inst::LoadConst64 { rd, imm } => {
let rd = allocs.next_writable(rd);
LoadConstant::U64(imm)
.load_constant(rd, &mut |_| rd)
.into_iter()
.for_each(|inst| inst.emit(&[], sink, emit_info, state));
}
&Inst::FpuRR {
frm,
alu_op,
rd,
rs,
} => {
let rs = allocs.next(rs);
let rd = allocs.next_writable(rd);
let x = alu_op.op_code()
| reg_to_gpr_num(rd.to_reg()) << 7
| alu_op.funct3(frm) << 12
| reg_to_gpr_num(rs) << 15
| alu_op.rs2_funct5() << 20
| alu_op.funct7() << 25;
let srcloc = state.cur_srcloc();
if !srcloc.is_default() && alu_op.is_convert_to_int() {
sink.add_trap(TrapCode::BadConversionToInteger);
}
sink.put4(x);
}
&Inst::FpuRRRR {
alu_op,
rd,
rs1,
rs2,
rs3,
frm,
} => {
let rs1 = allocs.next(rs1);
let rs2 = allocs.next(rs2);
let rs3 = allocs.next(rs3);
let rd = allocs.next_writable(rd);
let x = alu_op.op_code()
| reg_to_gpr_num(rd.to_reg()) << 7
| alu_op.funct3(frm) << 12
| reg_to_gpr_num(rs1) << 15
| reg_to_gpr_num(rs2) << 20
| alu_op.funct2() << 25
| reg_to_gpr_num(rs3) << 27;
sink.put4(x);
}
&Inst::FpuRRR {
alu_op,
frm,
rd,
rs1,
rs2,
} => {
let rs1 = allocs.next(rs1);
let rs2 = allocs.next(rs2);
let rd = allocs.next_writable(rd);
let x: u32 = alu_op.op_code()
| reg_to_gpr_num(rd.to_reg()) << 7
| (alu_op.funct3(frm)) << 12
| reg_to_gpr_num(rs1) << 15
| reg_to_gpr_num(rs2) << 20
| alu_op.funct7() << 25;
sink.put4(x);
}
&Inst::Unwind { ref inst } => {
sink.add_unwind(inst.clone());
}
&Inst::DummyUse { reg } => {
allocs.next(reg);
}
&Inst::AluRRR {
alu_op,
rd,
rs1,
rs2,
} => {
let rs1 = allocs.next(rs1);
let rs2 = allocs.next(rs2);
let rd = allocs.next_writable(rd);
let (rs1, rs2) = if alu_op.reverse_rs() {
(rs2, rs1)
} else {
(rs1, rs2)
};
sink.put4(encode_r_type(
alu_op.op_code(),
rd,
alu_op.funct3(),
rs1,
rs2,
alu_op.funct7(),
));
}
&Inst::AluRRImm12 {
alu_op,
rd,
rs,
imm12,
} => {
let rs = allocs.next(rs);
let rd = allocs.next_writable(rd);
let x = alu_op.op_code()
| reg_to_gpr_num(rd.to_reg()) << 7
| alu_op.funct3() << 12
| reg_to_gpr_num(rs) << 15
| alu_op.imm12(imm12) << 20;
sink.put4(x);
}
&Inst::Load {
rd,
op,
from,
flags,
} => {
let from = from.clone().with_allocs(&mut allocs);
let rd = allocs.next_writable(rd);
let base = from.get_base_register();
let offset = from.get_offset_with_state(state);
let offset_imm12 = Imm12::maybe_from_u64(offset as u64);
let (addr, imm12) = match (base, offset_imm12) {
// If the offset fits into an imm12 we can directly encode it.
(Some(base), Some(imm12)) => (base, imm12),
                    // Otherwise load the address into a register and load from there.
_ => {
let tmp = writable_spilltmp_reg();
Inst::LoadAddr { rd: tmp, mem: from }.emit(&[], sink, emit_info, state);
(tmp.to_reg(), Imm12::zero())
}
};
let srcloc = state.cur_srcloc();
if !srcloc.is_default() && !flags.notrap() {
// Register the offset at which the actual load instruction starts.
sink.add_trap(TrapCode::HeapOutOfBounds);
}
sink.put4(encode_i_type(op.op_code(), rd, op.funct3(), addr, imm12));
}
&Inst::Store { op, src, flags, to } => {
let to = to.clone().with_allocs(&mut allocs);
let src = allocs.next(src);
let base = to.get_base_register();
let offset = to.get_offset_with_state(state);
let offset_imm12 = Imm12::maybe_from_u64(offset as u64);
let (addr, imm12) = match (base, offset_imm12) {
// If the offset fits into an imm12 we can directly encode it.
(Some(base), Some(imm12)) => (base, imm12),
                    // Otherwise load the address into a register and store through it.
_ => {
let tmp = writable_spilltmp_reg();
Inst::LoadAddr { rd: tmp, mem: to }.emit(&[], sink, emit_info, state);
(tmp.to_reg(), Imm12::zero())
}
};
let srcloc = state.cur_srcloc();
if !srcloc.is_default() && !flags.notrap() {
                    // Register the offset at which the actual store instruction starts.
sink.add_trap(TrapCode::HeapOutOfBounds);
}
sink.put4(encode_s_type(op.op_code(), op.funct3(), addr, src, imm12));
}
&Inst::Args { .. } => {
// Nothing: this is a pseudoinstruction that serves
// only to constrain registers at a certain point.
}
&Inst::Ret {
stack_bytes_to_pop, ..
} => {
if stack_bytes_to_pop != 0 {
Inst::AdjustSp {
amount: i64::from(stack_bytes_to_pop),
}
.emit(&[], sink, emit_info, state);
}
                // jalr x0, x1, 0
let x: u32 = (0b1100111) | (1 << 15);
sink.put4(x);
}
&Inst::Extend {
rd,
rn,
signed,
from_bits,
to_bits: _to_bits,
} => {
let rn = allocs.next(rn);
let rd = allocs.next_writable(rd);
let mut insts = SmallInstVec::new();
let shift_bits = (64 - from_bits) as i16;
let is_u8 = || from_bits == 8 && signed == false;
if is_u8() {
                    // Special case for u8: andi with 0xff zero-extends directly.
insts.push(Inst::AluRRImm12 {
alu_op: AluOPRRI::Andi,
rd,
rs: rn,
imm12: Imm12::from_bits(255),
});
} else {
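                    // Shift the value to the top of the register, then shift it
                    // back down: srai replicates the sign bit, srli zeroes the
                    // upper bits.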
insts.push(Inst::AluRRImm12 {
alu_op: AluOPRRI::Slli,
rd,
rs: rn,
imm12: Imm12::from_bits(shift_bits),
});
insts.push(Inst::AluRRImm12 {
alu_op: if signed {
AluOPRRI::Srai
} else {
AluOPRRI::Srli
},
rd,
rs: rd.to_reg(),
imm12: Imm12::from_bits(shift_bits),
});
}
insts
.into_iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
}
&Inst::AdjustSp { amount } => {
if let Some(imm) = Imm12::maybe_from_u64(amount as u64) {
Inst::AluRRImm12 {
alu_op: AluOPRRI::Addi,
rd: writable_stack_reg(),
rs: stack_reg(),
imm12: imm,
}
.emit(&[], sink, emit_info, state);
} else {
let tmp = writable_spilltmp_reg();
let mut insts = Inst::load_constant_u64(tmp, amount as u64, &mut |_| tmp);
insts.push(Inst::AluRRR {
alu_op: AluOPRRR::Add,
rd: writable_stack_reg(),
rs1: tmp.to_reg(),
rs2: stack_reg(),
});
insts
.into_iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
}
}
&Inst::Call { ref info } => {
// call
match info.dest {
ExternalName::User { .. } => {
if info.opcode.is_call() {
sink.add_call_site(info.opcode);
}
sink.add_reloc(Reloc::RiscvCall, &info.dest, 0);
if let Some(s) = state.take_stack_map() {
sink.add_stack_map(StackMapExtent::UpcomingBytes(8), s);
}
Inst::construct_auipc_and_jalr(
Some(writable_link_reg()),
writable_link_reg(),
0,
)
.into_iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
}
ExternalName::LibCall(..)
| ExternalName::TestCase { .. }
| ExternalName::KnownSymbol(..) => {
                        // Use an indirect call; it is simpler.
                        // Load the external name's address.
Inst::LoadExtName {
rd: writable_spilltmp_reg2(),
name: Box::new(info.dest.clone()),
offset: 0,
}
.emit(&[], sink, emit_info, state);
if let Some(s) = state.take_stack_map() {
sink.add_stack_map(StackMapExtent::UpcomingBytes(4), s);
}
if info.opcode.is_call() {
sink.add_call_site(info.opcode);
}
// call
Inst::Jalr {
rd: writable_link_reg(),
base: spilltmp_reg2(),
offset: Imm12::zero(),
}
.emit(&[], sink, emit_info, state);
}
}
let callee_pop_size = i64::from(info.callee_pop_size);
state.virtual_sp_offset -= callee_pop_size;
trace!(
"call adjusts virtual sp offset by {callee_pop_size} -> {}",
state.virtual_sp_offset
);
}
&Inst::CallInd { ref info } => {
let rn = allocs.next(info.rn);
if let Some(s) = state.take_stack_map() {
sink.add_stack_map(StackMapExtent::UpcomingBytes(4), s);
}
if info.opcode.is_call() {
sink.add_call_site(info.opcode);
}
Inst::Jalr {
rd: writable_link_reg(),
base: rn,
offset: Imm12::zero(),
}
.emit(&[], sink, emit_info, state);
let callee_pop_size = i64::from(info.callee_pop_size);
state.virtual_sp_offset -= callee_pop_size;
trace!(
"call adjusts virtual sp offset by {callee_pop_size} -> {}",
state.virtual_sp_offset
);
}
&Inst::Jal { dest } => {
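                // Base encoding for JAL with rd = x0: a plain unconditional jump.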
let code: u32 = 0b1101111;
match dest {
                    BranchTarget::Label(label) => {
                        sink.use_label_at_offset(start_off, label, LabelUse::Jal20);
                        sink.add_uncond_branch(start_off, start_off + 4, label);
sink.put4(code);
}
BranchTarget::ResolvedOffset(offset) => {
let offset = offset as i64;
if offset != 0 {
if LabelUse::Jal20.offset_in_range(offset) {
let mut code = code.to_le_bytes();
LabelUse::Jal20.patch_raw_offset(&mut code, offset);
sink.put_data(&code[..]);
} else {
Inst::construct_auipc_and_jalr(
None,
writable_spilltmp_reg(),
offset,
)
.into_iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
}
} else {
                            // CondBr often generates `Jal { dest: 0 }` for the
                            // fallthrough case, meaning no jump is needed.
}
}
}
}
&Inst::CondBr {
taken,
not_taken,
mut kind,
} => {
kind.rs1 = allocs.next(kind.rs1);
kind.rs2 = allocs.next(kind.rs2);
match taken {
BranchTarget::Label(label) => {
let code = kind.emit();
let code_inverse = kind.inverse().emit().to_le_bytes();
sink.use_label_at_offset(start_off, label, LabelUse::B12);
sink.add_cond_branch(start_off, start_off + 4, label, &code_inverse);
sink.put4(code);
}
BranchTarget::ResolvedOffset(offset) => {
assert!(offset != 0);
if LabelUse::B12.offset_in_range(offset as i64) {
let code = kind.emit();
let mut code = code.to_le_bytes();
LabelUse::B12.patch_raw_offset(&mut code, offset as i64);
sink.put_data(&code[..])
} else {
let mut code = kind.emit().to_le_bytes();
                            // Branch over the condbr itself (4 bytes).
LabelUse::B12.patch_raw_offset(&mut code[..], 4);
sink.put_data(&code[..]);
Inst::construct_auipc_and_jalr(
None,
writable_spilltmp_reg(),
offset as i64,
)
.into_iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
}
}
}
Inst::Jal { dest: not_taken }.emit(&[], sink, emit_info, state);
}
&Inst::Mov { rd, rm, ty } => {
debug_assert_eq!(rd.to_reg().class(), rm.class());
if rd.to_reg() == rm {
return;
}
let rm = allocs.next(rm);
let rd = allocs.next_writable(rd);
match rm.class() {
RegClass::Int => Inst::AluRRImm12 {
alu_op: AluOPRRI::Ori,
rd: rd,
rs: rm,
imm12: Imm12::zero(),
},
RegClass::Float => Inst::FpuRRR {
alu_op: if ty == F32 {
FpuOPRRR::FsgnjS
} else {
FpuOPRRR::FsgnjD
},
frm: None,
rd: rd,
rs1: rm,
rs2: rm,
},
RegClass::Vector => Inst::VecAluRRImm5 {
op: VecAluOpRRImm5::VmvrV,
vd: rd,
vs2: rm,
// Imm 0 means copy 1 register.
imm: Imm5::maybe_from_i8(0).unwrap(),
mask: VecOpMasking::Disabled,
// Vstate for this instruction is ignored.
vstate: VState::from_type(ty),
},
}
.emit(&[], sink, emit_info, state);
}
&Inst::MovFromPReg { rd, rm } => {
debug_assert!([px_reg(2), px_reg(8)].contains(&rm));
let rd = allocs.next_writable(rd);
let x = Inst::AluRRImm12 {
alu_op: AluOPRRI::Ori,
rd,
rs: Reg::from(rm),
imm12: Imm12::zero(),
};
x.emit(&[], sink, emit_info, state);
}
&Inst::BrTable {
index,
tmp1,
tmp2,
ref targets,
} => {
let index = allocs.next(index);
let tmp1 = allocs.next_writable(tmp1);
let tmp2 = allocs.next_writable(tmp2);
let ext_index = writable_spilltmp_reg();
                // The default target is passed in as the 0th element of `targets`;
                // separate it here for clarity.
let default_target = targets[0];
let targets = &targets[1..];
// We emit a bounds check on the index, if the index is larger than the number of
// jump table entries, we jump to the default block. Otherwise we compute a jump
// offset by multiplying the index by 8 (the size of each entry) and then jump to
// that offset. Each jump table entry is a regular auipc+jalr which we emit sequentially.
//
// Build the following sequence:
//
// extend_index:
// zext.w ext_index, index
// bounds_check:
// li tmp, n_labels
// bltu ext_index, tmp, compute_target
// jump_to_default_block:
// auipc pc, 0
// jalr zero, pc, default_block
// compute_target:
// auipc pc, 0
// slli tmp, ext_index, 3
// add pc, pc, tmp
// jalr zero, pc, 0x10
// jump_table:
// ; This repeats for each entry in the jumptable
// auipc pc, 0
// jalr zero, pc, block_target
// Extend the index to 64 bits.
//
                // This prevents us from branching on the top 32 bits of the
                // index, which are undefined.
Inst::Extend {
rd: ext_index,
rn: index,
signed: false,
from_bits: 32,
to_bits: 64,
}
.emit(&[], sink, emit_info, state);
// Bounds check.
//
                // Check whether the index passed in is larger than the number
                // of jumptable entries that we have. If it is, we fall through
                // to a jump to the default block.
Inst::load_constant_u32(tmp2, targets.len() as u64, &mut |_| tmp2)
.iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
Inst::CondBr {
taken: BranchTarget::offset(Inst::INSTRUCTION_SIZE * 3),
not_taken: BranchTarget::zero(),
kind: IntegerCompare {
kind: IntCC::UnsignedLessThan,
rs1: ext_index.to_reg(),
rs2: tmp2.to_reg(),
},
}
.emit(&[], sink, emit_info, state);
sink.use_label_at_offset(
sink.cur_offset(),
default_target.as_label().unwrap(),
LabelUse::PCRel32,
);
Inst::construct_auipc_and_jalr(None, tmp2, 0)
.iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
                // Compute the jump table offset.
                // We need a PC-relative offset; get the current PC first.
Inst::Auipc {
rd: tmp1,
imm: Imm20::from_bits(0),
}
.emit(&[], sink, emit_info, state);
                // Multiply the index by 8, since that is the size in bytes of
                // each jump table entry.
Inst::AluRRImm12 {
alu_op: AluOPRRI::Slli,
rd: tmp2,
rs: ext_index.to_reg(),
imm12: Imm12::from_bits(3),
}
.emit(&[], sink, emit_info, state);
// Calculate the base of the jump, PC + the offset from above.
Inst::AluRRR {
alu_op: AluOPRRR::Add,
rd: tmp1,
rs1: tmp1.to_reg(),
rs2: tmp2.to_reg(),
}
.emit(&[], sink, emit_info, state);
                // Jump to the start of the jump table.
                // We add a 16-byte offset, since the table starts four
                // instructions (auipc, slli, add, jalr) after the AUIPC that
                // captured the PC.
Inst::Jalr {
rd: writable_zero_reg(),
base: tmp1.to_reg(),
offset: Imm12::from_bits((4 * Inst::INSTRUCTION_SIZE) as i16),
}
.emit(&[], sink, emit_info, state);
                // Emit the jump table.
                //
                // Each entry is an auipc + jalr to the target block. We may
                // also need to start with an island.
                // Each entry in the jump table is 2 instructions, so 8 bytes.
                // Check whether we need to emit an island here so those
                // branches stay in range.
let distance = (targets.len() * 2 * Inst::INSTRUCTION_SIZE as usize) as u32;
if sink.island_needed(distance) {
sink.emit_island(distance, &mut state.ctrl_plane);
}
// Emit the jumps back to back
for target in targets.iter() {
sink.use_label_at_offset(
sink.cur_offset(),
target.as_label().unwrap(),
LabelUse::PCRel32,
);
Inst::construct_auipc_and_jalr(None, tmp2, 0)
.iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
}
// We've just emitted an island that is safe up to *here*.
// Mark it as such so that we don't needlessly emit additional islands.
start_off = sink.cur_offset();
}
&Inst::VirtualSPOffsetAdj { amount } => {
crate::trace!(
"virtual sp offset adjusted by {} -> {}",
amount,
state.virtual_sp_offset + amount
);
state.virtual_sp_offset += amount;
}
&Inst::Atomic {
op,
rd,
addr,
src,
amo,
} => {
let addr = allocs.next(addr);
let src = allocs.next(src);
let rd = allocs.next_writable(rd);
let srcloc = state.cur_srcloc();
if !srcloc.is_default() {
sink.add_trap(TrapCode::HeapOutOfBounds);
}
let x = op.op_code()
| reg_to_gpr_num(rd.to_reg()) << 7
| op.funct3() << 12
| reg_to_gpr_num(addr) << 15
| reg_to_gpr_num(src) << 20
| op.funct7(amo) << 25;
sink.put4(x);
}
&Inst::Fence { pred, succ } => {
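                // FENCE encoding: the successor set goes in bits 23..=20 and
                // the predecessor set in bits 27..=24.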
let x = 0b0001111
| 0b00000 << 7
| 0b000 << 12
| 0b00000 << 15
| (succ as u32) << 20
| (pred as u32) << 24;
sink.put4(x);
}
&Inst::FenceI => sink.put4(0x0000100f),
&Inst::Auipc { rd, imm } => {
let rd = allocs.next_writable(rd);
let x = enc_auipc(rd, imm);
sink.put4(x);
}
&Inst::LoadAddr { rd, mem } => {
let mem = mem.with_allocs(&mut allocs);
let rd = allocs.next_writable(rd);
let base = mem.get_base_register();
let offset = mem.get_offset_with_state(state);
let offset_imm12 = Imm12::maybe_from_u64(offset as u64);
match (mem, base, offset_imm12) {
(_, Some(rs), Some(imm12)) => {
Inst::AluRRImm12 {
alu_op: AluOPRRI::Addi,
rd,
rs,
imm12,
}
.emit(&[], sink, emit_info, state);
}
(_, Some(rs), None) => {
LoadConstant::U64(offset as u64)
.load_constant_and_add(rd, rs)
.into_iter()
.for_each(|inst| inst.emit(&[], sink, emit_info, state));
}
(AMode::Const(addr), None, _) => {
// Get an address label for the constant and recurse.
let label = sink.get_label_for_constant(addr);
Inst::LoadAddr {
rd,
mem: AMode::Label(label),
}
.emit(&[], sink, emit_info, state);
}
(AMode::Label(label), None, _) => {
// Get the current PC.
sink.use_label_at_offset(sink.cur_offset(), label, LabelUse::PCRelHi20);
let inst = Inst::Auipc {
rd,
imm: Imm20::from_bits(0),
};
inst.emit(&[], sink, emit_info, state);
// Emit an add to the address with a relocation.
// This later gets patched up with the correct offset.
sink.use_label_at_offset(sink.cur_offset(), label, LabelUse::PCRelLo12I);
Inst::AluRRImm12 {
alu_op: AluOPRRI::Addi,
rd,
rs: rd.to_reg(),
imm12: Imm12::zero(),
}
.emit(&[], sink, emit_info, state);
}
(amode, _, _) => {
unimplemented!("LoadAddr: {:?}", amode);
}
}
}
&Inst::Select {
ref dst,
condition,
ref x,
ref y,
ty: _ty,
} => {
let condition = allocs.next(condition);
let x = alloc_value_regs(x, &mut allocs);
let y = alloc_value_regs(y, &mut allocs);
let dst: Vec<_> = dst
.clone()
.into_iter()
.map(|r| allocs.next_writable(r))
.collect();
let mut insts = SmallInstVec::new();
let label_false = sink.get_label();
insts.push(Inst::CondBr {
taken: BranchTarget::Label(label_false),
not_taken: BranchTarget::zero(),
kind: IntegerCompare {
kind: IntCC::Equal,
rs1: condition,
rs2: zero_reg(),
},
});
                // Here the condition is true:
                // select the first value.
insts.extend(gen_moves(&dst[..], x.regs()));
let label_jump_over = sink.get_label();
insts.push(Inst::Jal {
dest: BranchTarget::Label(label_jump_over),
});
                // Emit the true branch, then bind the label for the false branch.
insts
.drain(..)
.for_each(|i: Inst| i.emit(&[], sink, emit_info, state));
sink.bind_label(label_false, &mut state.ctrl_plane);
                // Select the second value.
insts.extend(gen_moves(&dst[..], y.regs()));
insts
.into_iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
sink.bind_label(label_jump_over, &mut state.ctrl_plane);
}
&Inst::Jalr { rd, base, offset } => {
let rd = allocs.next_writable(rd);
let x = enc_jalr(rd, base, offset);
sink.put4(x);
}
&Inst::ECall => {
sink.put4(0x00000073);
}
&Inst::EBreak => {
sink.put4(0x00100073);
}
&Inst::Icmp {
cc,
rd,
ref a,
ref b,
ty,
} => {
let a = alloc_value_regs(a, &mut allocs);
let b = alloc_value_regs(b, &mut allocs);
let rd = allocs.next_writable(rd);
let label_true = sink.get_label();
let label_false = sink.get_label();
Inst::lower_br_icmp(
cc,
a,
b,
BranchTarget::Label(label_true),
BranchTarget::Label(label_false),
ty,
)
.into_iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
sink.bind_label(label_true, &mut state.ctrl_plane);
Inst::load_imm12(rd, Imm12::TRUE).emit(&[], sink, emit_info, state);
Inst::Jal {
dest: BranchTarget::offset(Inst::INSTRUCTION_SIZE * 2),
}
.emit(&[], sink, emit_info, state);
sink.bind_label(label_false, &mut state.ctrl_plane);
Inst::load_imm12(rd, Imm12::FALSE).emit(&[], sink, emit_info, state);
}
&Inst::AtomicCas {
offset,
t0,
dst,
e,
addr,
v,
ty,
} => {
let offset = allocs.next(offset);
let e = allocs.next(e);
let addr = allocs.next(addr);
let v = allocs.next(v);
let t0 = allocs.next_writable(t0);
let dst = allocs.next_writable(dst);
                // # addr holds address of memory location
                // # e holds expected value
                // # v holds desired value
                // # dst holds return value
                // cas:
                //   lr.w dst, (addr)      # Load original value.
                //   bne dst, e, fail      # Doesn't match, so fail.
                //   sc.w t0, v, (addr)    # Try to update.
                //   bnez t0, cas          # If the store failed, retry.
                // fail:
let fail_label = sink.get_label();
                let cas_label = sink.get_label();
                sink.bind_label(cas_label, &mut state.ctrl_plane);
Inst::Atomic {
op: AtomicOP::load_op(ty),
rd: dst,
addr,
src: zero_reg(),
amo: AMO::SeqCst,
}
.emit(&[], sink, emit_info, state);
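                // Sub-word types are emulated on an aligned word: extract the
                // addressed lane before comparing against the expected value.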
if ty.bits() < 32 {
AtomicOP::extract(dst, offset, dst.to_reg(), ty)
.iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
} else if ty.bits() == 32 {
Inst::Extend {
rd: dst,
rn: dst.to_reg(),
signed: false,
from_bits: 32,
to_bits: 64,
}
.emit(&[], sink, emit_info, state);
}
Inst::CondBr {
taken: BranchTarget::Label(fail_label),
not_taken: BranchTarget::zero(),
kind: IntegerCompare {
kind: IntCC::NotEqual,
rs1: e,
rs2: dst.to_reg(),
},
}
.emit(&[], sink, emit_info, state);
let store_value = if ty.bits() < 32 {
// reload value to t0.
Inst::Atomic {
op: AtomicOP::load_op(ty),
rd: t0,
addr,
src: zero_reg(),
amo: AMO::SeqCst,
}
.emit(&[], sink, emit_info, state);
                    // Merge the new value into the rest of the word.
AtomicOP::merge(t0, writable_spilltmp_reg(), offset, v, ty)
.iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
t0.to_reg()
} else {
v
};
Inst::Atomic {
op: AtomicOP::store_op(ty),
rd: t0,
addr,
src: store_value,
amo: AMO::SeqCst,
}
.emit(&[], sink, emit_info, state);
                // Check whether our value was stored.
Inst::CondBr {
                    taken: BranchTarget::Label(cas_label),
not_taken: BranchTarget::zero(),
kind: IntegerCompare {
kind: IntCC::NotEqual,
rs1: t0.to_reg(),
rs2: zero_reg(),
},
}
.emit(&[], sink, emit_info, state);
sink.bind_label(fail_label, &mut state.ctrl_plane);
}
&Inst::AtomicRmwLoop {
offset,
op,
dst,
ty,
p,
x,
t0,
} => {
let offset = allocs.next(offset);
let p = allocs.next(p);
let x = allocs.next(x);
let t0 = allocs.next_writable(t0);
let dst = allocs.next_writable(dst);
let retry = sink.get_label();
sink.bind_label(retry, &mut state.ctrl_plane);
// load old value.
Inst::Atomic {
op: AtomicOP::load_op(ty),
rd: dst,
addr: p,
src: zero_reg(),
amo: AMO::SeqCst,
}
.emit(&[], sink, emit_info, state);
                // Compute the value to store back.
let store_value: Reg = match op {
crate::ir::AtomicRmwOp::Add
| crate::ir::AtomicRmwOp::Sub
| crate::ir::AtomicRmwOp::And
| crate::ir::AtomicRmwOp::Or
| crate::ir::AtomicRmwOp::Xor => {
AtomicOP::extract(dst, offset, dst.to_reg(), ty)
.iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
Inst::AluRRR {
alu_op: match op {
crate::ir::AtomicRmwOp::Add => AluOPRRR::Add,
crate::ir::AtomicRmwOp::Sub => AluOPRRR::Sub,
crate::ir::AtomicRmwOp::And => AluOPRRR::And,
crate::ir::AtomicRmwOp::Or => AluOPRRR::Or,
crate::ir::AtomicRmwOp::Xor => AluOPRRR::Xor,
_ => unreachable!(),
},
rd: t0,
rs1: dst.to_reg(),
rs2: x,
}
.emit(&[], sink, emit_info, state);
Inst::Atomic {
op: AtomicOP::load_op(ty),
rd: writable_spilltmp_reg2(),
addr: p,
src: zero_reg(),
amo: AMO::SeqCst,
}
.emit(&[], sink, emit_info, state);
AtomicOP::merge(
writable_spilltmp_reg2(),
writable_spilltmp_reg(),
offset,
t0.to_reg(),
ty,
)
.iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
spilltmp_reg2()
}
crate::ir::AtomicRmwOp::Nand => {
if ty.bits() < 32 {
AtomicOP::extract(dst, offset, dst.to_reg(), ty)
.iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
}
Inst::AluRRR {
alu_op: AluOPRRR::And,
rd: t0,
rs1: x,
rs2: dst.to_reg(),
}
.emit(&[], sink, emit_info, state);
Inst::construct_bit_not(t0, t0.to_reg()).emit(&[], sink, emit_info, state);
if ty.bits() < 32 {
Inst::Atomic {
op: AtomicOP::load_op(ty),
rd: writable_spilltmp_reg2(),
addr: p,
src: zero_reg(),
amo: AMO::SeqCst,
}
.emit(&[], sink, emit_info, state);
AtomicOP::merge(
writable_spilltmp_reg2(),
writable_spilltmp_reg(),
offset,
t0.to_reg(),
ty,
)
.iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
spilltmp_reg2()
} else {
t0.to_reg()
}
}
crate::ir::AtomicRmwOp::Umin
| crate::ir::AtomicRmwOp::Umax
| crate::ir::AtomicRmwOp::Smin
| crate::ir::AtomicRmwOp::Smax => {
let label_select_dst = sink.get_label();
let label_select_done = sink.get_label();
if op == crate::ir::AtomicRmwOp::Umin || op == crate::ir::AtomicRmwOp::Umax
{
AtomicOP::extract(dst, offset, dst.to_reg(), ty)
} else {
AtomicOP::extract_sext(dst, offset, dst.to_reg(), ty)
}
.iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
Inst::lower_br_icmp(
match op {
crate::ir::AtomicRmwOp::Umin => IntCC::UnsignedLessThan,
crate::ir::AtomicRmwOp::Umax => IntCC::UnsignedGreaterThan,
crate::ir::AtomicRmwOp::Smin => IntCC::SignedLessThan,
crate::ir::AtomicRmwOp::Smax => IntCC::SignedGreaterThan,
_ => unreachable!(),
},
ValueRegs::one(dst.to_reg()),
ValueRegs::one(x),
BranchTarget::Label(label_select_dst),
BranchTarget::zero(),
ty,
)
.iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
// here we select x.
Inst::gen_move(t0, x, I64).emit(&[], sink, emit_info, state);
Inst::Jal {
dest: BranchTarget::Label(label_select_done),
}
.emit(&[], sink, emit_info, state);
sink.bind_label(label_select_dst, &mut state.ctrl_plane);
Inst::gen_move(t0, dst.to_reg(), I64).emit(&[], sink, emit_info, state);
sink.bind_label(label_select_done, &mut state.ctrl_plane);
Inst::Atomic {
op: AtomicOP::load_op(ty),
rd: writable_spilltmp_reg2(),
addr: p,
src: zero_reg(),
amo: AMO::SeqCst,
}
.emit(&[], sink, emit_info, state);
AtomicOP::merge(
writable_spilltmp_reg2(),
writable_spilltmp_reg(),
offset,
t0.to_reg(),
ty,
)
.iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
spilltmp_reg2()
}
crate::ir::AtomicRmwOp::Xchg => {
AtomicOP::extract(dst, offset, dst.to_reg(), ty)
.iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
Inst::Atomic {
op: AtomicOP::load_op(ty),
rd: writable_spilltmp_reg2(),
addr: p,
src: zero_reg(),
amo: AMO::SeqCst,
}
.emit(&[], sink, emit_info, state);
AtomicOP::merge(
writable_spilltmp_reg2(),
writable_spilltmp_reg(),
offset,
x,
ty,
)
.iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
spilltmp_reg2()
}
};
Inst::Atomic {
op: AtomicOP::store_op(ty),
rd: t0,
addr: p,
src: store_value,
amo: AMO::SeqCst,
}
.emit(&[], sink, emit_info, state);
                // If the store failed, retry.
Inst::CondBr {
taken: BranchTarget::Label(retry),
not_taken: BranchTarget::zero(),
kind: IntegerCompare {
kind: IntCC::NotEqual,
rs1: t0.to_reg(),
rs2: zero_reg(),
},
}
.emit(&[], sink, emit_info, state);
}
&Inst::IntSelect {
op,
ref dst,
ref x,
ref y,
ty,
} => {
let x = alloc_value_regs(x, &mut allocs);
let y = alloc_value_regs(y, &mut allocs);
let dst: Vec<_> = dst.iter().map(|r| allocs.next_writable(*r)).collect();
let label_true = sink.get_label();
let label_false = sink.get_label();
let label_done = sink.get_label();
Inst::lower_br_icmp(
op.to_int_cc(),
x,
y,
BranchTarget::Label(label_true),
BranchTarget::Label(label_false),
ty,
)
.into_iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
let gen_move = |dst: &Vec<Writable<Reg>>,
val: &ValueRegs<Reg>,
sink: &mut MachBuffer<Inst>,
state: &mut EmitState| {
let mut insts = SmallInstVec::new();
insts.push(Inst::Mov {
rd: dst[0],
rm: val.regs()[0],
ty: I64,
});
if ty.bits() == 128 {
insts.push(Inst::Mov {
rd: dst[1],
rm: val.regs()[1],
ty,
});
}
insts
.into_iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
};
                // Here the condition is true; use x.
sink.bind_label(label_true, &mut state.ctrl_plane);
gen_move(&dst, &x, sink, state);
Inst::gen_jump(label_done).emit(&[], sink, emit_info, state);
                // Here the condition is false; use y.
sink.bind_label(label_false, &mut state.ctrl_plane);
gen_move(&dst, &y, sink, state);
sink.bind_label(label_done, &mut state.ctrl_plane);
}
&Inst::SelectReg {
condition,
rd,
rs1,
rs2,
} => {
let mut condition = condition.clone();
condition.rs1 = allocs.next(condition.rs1);
condition.rs2 = allocs.next(condition.rs2);
let rs1 = allocs.next(rs1);
let rs2 = allocs.next(rs2);
let rd = allocs.next_writable(rd);
let label_true = sink.get_label();
let label_jump_over = sink.get_label();
let ty = Inst::canonical_type_for_rc(rs1.class());
sink.use_label_at_offset(sink.cur_offset(), label_true, LabelUse::B12);
let x = condition.emit();
sink.put4(x);
                // Here the condition is false; use rs2.
Inst::gen_move(rd, rs2, ty).emit(&[], sink, emit_info, state);
// and jump over
Inst::Jal {
dest: BranchTarget::Label(label_jump_over),
}
.emit(&[], sink, emit_info, state);
                // Here the condition is true; use rs1.
sink.bind_label(label_true, &mut state.ctrl_plane);
Inst::gen_move(rd, rs1, ty).emit(&[], sink, emit_info, state);
sink.bind_label(label_jump_over, &mut state.ctrl_plane);
}
&Inst::FcvtToInt {
is_sat,
rd,
rs,
is_signed,
in_type,
out_type,
tmp,
} => {
let rs = allocs.next(rs);
let tmp = allocs.next_writable(tmp);
let rd = allocs.next_writable(rd);
let label_nan = sink.get_label();
let label_jump_over = sink.get_label();
                // Check whether the input is NaN.
Inst::emit_not_nan(rd, rs, in_type).emit(&[], sink, emit_info, state);
                // Jump to the NaN handler if so.
Inst::CondBr {
taken: BranchTarget::Label(label_nan),
not_taken: BranchTarget::zero(),
kind: IntegerCompare {
kind: IntCC::Equal,
rs2: zero_reg(),
rs1: rd.to_reg(),
},
}
.emit(&[], sink, emit_info, state);
if !is_sat {
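                    // Non-saturating conversion: trap with IntegerOverflow when
                    // the input is at or beyond the representable bounds. Load
                    // the lower bound and trap if rs <= bound, then the upper
                    // bound and trap if rs >= bound.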
let f32_bounds = f32_cvt_to_int_bounds(is_signed, out_type.bits() as u8);
let f64_bounds = f64_cvt_to_int_bounds(is_signed, out_type.bits() as u8);
if in_type == F32 {
Inst::load_fp_constant32(tmp, f32_bits(f32_bounds.0), |_| {
writable_spilltmp_reg()
})
} else {
Inst::load_fp_constant64(tmp, f64_bits(f64_bounds.0), |_| {
writable_spilltmp_reg()
})
}
.iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
let le_op = if in_type == F32 {
FpuOPRRR::FleS
} else {
FpuOPRRR::FleD
};
// rd := rs <= tmp
Inst::FpuRRR {
alu_op: le_op,
frm: None,
rd,
rs1: rs,
rs2: tmp.to_reg(),
}
.emit(&[], sink, emit_info, state);
Inst::TrapIf {
test: rd.to_reg(),
trap_code: TrapCode::IntegerOverflow,
}
.emit(&[], sink, emit_info, state);
if in_type == F32 {
Inst::load_fp_constant32(tmp, f32_bits(f32_bounds.1), |_| {
writable_spilltmp_reg()
})
} else {
Inst::load_fp_constant64(tmp, f64_bits(f64_bounds.1), |_| {
writable_spilltmp_reg()
})
}
.iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
// rd := rs >= tmp
Inst::FpuRRR {
alu_op: le_op,
frm: None,
rd,
rs1: tmp.to_reg(),
rs2: rs,
}
.emit(&[], sink, emit_info, state);
Inst::TrapIf {
test: rd.to_reg(),
trap_code: TrapCode::IntegerOverflow,
}
.emit(&[], sink, emit_info, state);
}
// convert to int normally.
Inst::FpuRR {
frm: Some(FRM::RTZ),
alu_op: FpuOPRR::float_convert_2_int_op(in_type, is_signed, out_type),
rd,
rs,
}
.emit(&[], sink, emit_info, state);
if out_type.bits() < 32 && is_signed {
// load value part mask.
Inst::load_constant_u32(
writable_spilltmp_reg(),
if 16 == out_type.bits() {
(u16::MAX >> 1) as u64
} else {
// I8
(u8::MAX >> 1) as u64
},
&mut |_| writable_spilltmp_reg2(),
)
.into_iter()
.for_each(|x| x.emit(&[], sink, emit_info, state));
// keep value part.
Inst::AluRRR {
alu_op: AluOPRRR::And,
rd: writable_spilltmp_reg(),
rs1: rd.to_reg(),
rs2: spilltmp_reg(),
}
.emit(&[], sink, emit_info, state);
                    // Extract the sign bit.
Inst::AluRRImm12 {
alu_op: AluOPRRI::Srli,
rd: rd,
rs: rd.to_reg(),
imm12: Imm12::from_bits(31),
}
.emit(&[], sink, emit_info, state);
Inst::AluRRImm12 {
alu_op: AluOPRRI::Slli,
rd: rd,
rs: rd.to_reg(),
imm12: Imm12::from_bits(if 16 == out_type.bits() {
15
} else {
// I8
7
}),
}
.emit(&[], sink, emit_info, state);
                    // Combine the sign bit and the value part into the result.
Inst::AluRRR {
alu_op: AluOPRRR::Or,
rd: rd,
rs1: rd.to_reg(),
rs2: spilltmp_reg(),
}
.emit(&[], sink, emit_info, state);
}
                // We already have the result; jump over.
Inst::Jal {
dest: BranchTarget::Label(label_jump_over),
}
.emit(&[], sink, emit_info, state);
                // Here the input is NaN; for a saturating conversion, move 0 into rd.
sink.bind_label(label_nan, &mut state.ctrl_plane);
if is_sat {
Inst::load_imm12(rd, Imm12::from_bits(0)).emit(&[], sink, emit_info, state);
} else {
                    // Non-saturating conversion: trap on NaN.
Inst::Udf {
trap_code: TrapCode::BadConversionToInteger,
}
.emit(&[], sink, emit_info, state);
}
// bind jump_over
sink.bind_label(label_jump_over, &mut state.ctrl_plane);
}
&Inst::LoadExtName {
rd,
ref name,
offset,
} => {
let rd = allocs.next_writable(rd);
// get the current pc.
Inst::Auipc {
rd: rd,
imm: Imm20::from_bits(0),
}
.emit(&[], sink, emit_info, state);
// load the value.
Inst::Load {
rd: rd,
op: LoadOP::Ld,
flags: MemFlags::trusted(),
from: AMode::RegOffset(
rd.to_reg(),
                        12, // Past the auipc, this load, and the jal.
I64,
),
}
.emit(&[], sink, emit_info, state);
                // Jump over the inline data.
                Inst::Jal {
                    // 4 bytes of jal plus the 8-byte absolute address = 12.
                    dest: BranchTarget::offset(12),
}
.emit(&[], sink, emit_info, state);
sink.add_reloc(Reloc::Abs8, name.as_ref(), offset);
sink.put8(0);
}
&Inst::TrapIfC {
rs1,
rs2,
cc,
trap_code,
} => {
let rs1 = allocs.next(rs1);
let rs2 = allocs.next(rs2);
let label_trap = sink.get_label();
let label_jump_over = sink.get_label();
Inst::CondBr {
taken: BranchTarget::Label(label_trap),
not_taken: BranchTarget::Label(label_jump_over),
kind: IntegerCompare { kind: cc, rs1, rs2 },
}
.emit(&[], sink, emit_info, state);
// trap
sink.bind_label(label_trap, &mut state.ctrl_plane);
Inst::Udf {
trap_code: trap_code,
}
.emit(&[], sink, emit_info, state);
sink.bind_label(label_jump_over, &mut state.ctrl_plane);
}
&Inst::TrapIf { test, trap_code } => {
let test = allocs.next(test);
let label_trap = sink.get_label();
let label_jump_over = sink.get_label();
Inst::CondBr {
taken: BranchTarget::Label(label_trap),
not_taken: BranchTarget::Label(label_jump_over),
kind: IntegerCompare {
kind: IntCC::NotEqual,
rs1: test,
rs2: zero_reg(),
},
}
.emit(&[], sink, emit_info, state);
// trap
sink.bind_label(label_trap, &mut state.ctrl_plane);
Inst::Udf {
trap_code: trap_code,
}
.emit(&[], sink, emit_info, state);
sink.bind_label(label_jump_over, &mut state.ctrl_plane);
}
&Inst::Udf { trap_code } => {
sink.add_trap(trap_code);
if let Some(s) = state.take_stack_map() {
sink.add_stack_map(StackMapExtent::UpcomingBytes(4), s);
}
sink.put_data(Inst::TRAP_OPCODE);
}
&Inst::AtomicLoad { rd, ty, p } => {
let p = allocs.next(p);
let rd = allocs.next_writable(rd);
// emit the fence.
Inst::Fence {
pred: Inst::FENCE_REQ_R | Inst::FENCE_REQ_W,
succ: Inst::FENCE_REQ_R | Inst::FENCE_REQ_W,
}
.emit(&[], sink, emit_info, state);
// load.
Inst::Load {
rd: rd,
op: LoadOP::from_type(ty),
flags: MemFlags::new(),
from: AMode::RegOffset(p, 0, ty),
}
.emit(&[], sink, emit_info, state);
Inst::Fence {
pred: Inst::FENCE_REQ_R,
succ: Inst::FENCE_REQ_R | Inst::FENCE_REQ_W,
}
.emit(&[], sink, emit_info, state);
}
&Inst::AtomicStore { src, ty, p } => {
let src = allocs.next(src);
let p = allocs.next(p);
Inst::Fence {
pred: Inst::FENCE_REQ_R | Inst::FENCE_REQ_W,
succ: Inst::FENCE_REQ_W,
}
.emit(&[], sink, emit_info, state);
Inst::Store {
to: AMode::RegOffset(p, 0, ty),
op: StoreOP::from_type(ty),
flags: MemFlags::new(),
src,
}
.emit(&[], sink, emit_info, state);
}
&Inst::FloatRound {
op,
rd,
int_tmp,
f_tmp,
rs,
ty,
} => {
                // This code is ported from the glibc ceil/floor (etc.) implementations.
let rs = allocs.next(rs);
let int_tmp = allocs.next_writable(int_tmp);
let f_tmp = allocs.next_writable(f_tmp);
let rd = allocs.next_writable(rd);
let label_nan = sink.get_label();
let label_x = sink.get_label();
let label_jump_over = sink.get_label();
                // Check whether the input is NaN.
Inst::emit_not_nan(int_tmp, rs, ty).emit(&[], sink, emit_info, state);
Inst::CondBr {
taken: BranchTarget::Label(label_nan),
not_taken: BranchTarget::zero(),
kind: IntegerCompare {
kind: IntCC::Equal,
rs1: int_tmp.to_reg(),
rs2: zero_reg(),
},
}
.emit(&[], sink, emit_info, state);
fn max_value_need_round(ty: Type) -> u64 {
match ty {
F32 => {
let x: u64 = 1 << f32::MANTISSA_DIGITS;
let x = x as f32;
let x = u32::from_le_bytes(x.to_le_bytes());
x as u64
}
F64 => {
let x: u64 = 1 << f64::MANTISSA_DIGITS;
let x = x as f64;
u64::from_le_bytes(x.to_le_bytes())
}
_ => unreachable!(),
}
}
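                // Values at or above 2^MANTISSA_DIGITS have no fractional part,
                // so no rounding is required; such inputs take the `label_x` path.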
                // Load the rounding threshold into f_tmp.
if ty == F32 {
Inst::load_fp_constant32(f_tmp, max_value_need_round(ty) as u32, &mut |_| {
writable_spilltmp_reg()
})
} else {
Inst::load_fp_constant64(f_tmp, max_value_need_round(ty), &mut |_| {
writable_spilltmp_reg()
})
}
.into_iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
// get abs value.
Inst::emit_fabs(rd, rs, ty).emit(&[], sink, emit_info, state);
// branch if f_tmp < rd
Inst::FpuRRR {
frm: None,
alu_op: if ty == F32 {
FpuOPRRR::FltS
} else {
FpuOPRRR::FltD
},
rd: int_tmp,
rs1: f_tmp.to_reg(),
rs2: rd.to_reg(),
}
.emit(&[], sink, emit_info, state);
Inst::CondBr {
taken: BranchTarget::Label(label_x),
not_taken: BranchTarget::zero(),
kind: IntegerCompare {
kind: IntCC::NotEqual,
rs1: int_tmp.to_reg(),
rs2: zero_reg(),
},
}
.emit(&[], sink, emit_info, state);
                // Convert to int.
Inst::FpuRR {
alu_op: FpuOPRR::float_convert_2_int_op(ty, true, I64),
frm: Some(op.to_frm()),
rd: int_tmp,
rs: rs,
}
.emit(&[], sink, emit_info, state);
                // Convert back to float.
Inst::FpuRR {
alu_op: FpuOPRR::int_convert_2_float_op(I64, true, ty),
frm: Some(op.to_frm()),
rd,
rs: int_tmp.to_reg(),
}
.emit(&[], sink, emit_info, state);
// copy sign.
Inst::FpuRRR {
alu_op: if ty == F32 {
FpuOPRRR::FsgnjS
} else {
FpuOPRRR::FsgnjD
},
frm: None,
rd,
rs1: rd.to_reg(),
rs2: rs,
}
.emit(&[], sink, emit_info, state);
// jump over.
Inst::Jal {
dest: BranchTarget::Label(label_jump_over),
}
.emit(&[], sink, emit_info, state);
// here is nan.
sink.bind_label(label_nan, &mut state.ctrl_plane);
Inst::FpuRRR {
alu_op: if ty == F32 {
FpuOPRRR::FaddS
} else {
FpuOPRRR::FaddD
},
frm: None,
rd: rd,
rs1: rs,
rs2: rs,
}
.emit(&[], sink, emit_info, state);
Inst::Jal {
dest: BranchTarget::Label(label_jump_over),
}
.emit(&[], sink, emit_info, state);
                // Here, select the original x.
sink.bind_label(label_x, &mut state.ctrl_plane);
Inst::gen_move(rd, rs, ty).emit(&[], sink, emit_info, state);
sink.bind_label(label_jump_over, &mut state.ctrl_plane);
}
&Inst::FloatSelectPseudo {
op,
rd,
tmp,
rs1,
rs2,
ty,
} => {
let rs1 = allocs.next(rs1);
let rs2 = allocs.next(rs2);
let tmp = allocs.next_writable(tmp);
let rd = allocs.next_writable(rd);
let label_rs2 = sink.get_label();
let label_jump_over = sink.get_label();
let lt_op = if ty == F32 {
FpuOPRRR::FltS
} else {
FpuOPRRR::FltD
};
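                // Compare such that `tmp` is set when rs2 should be selected:
                // for max, when rs1 < rs2; for min, when rs2 < rs1.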
Inst::FpuRRR {
alu_op: lt_op,
frm: None,
rd: tmp,
rs1: if op == FloatSelectOP::Max { rs1 } else { rs2 },
rs2: if op == FloatSelectOP::Max { rs2 } else { rs1 },
}
.emit(&[], sink, emit_info, state);
Inst::CondBr {
taken: BranchTarget::Label(label_rs2),
not_taken: BranchTarget::zero(),
kind: IntegerCompare {
kind: IntCC::NotEqual,
rs1: tmp.to_reg(),
rs2: zero_reg(),
},
}
.emit(&[], sink, emit_info, state);
                // Here, select rs1 as the result.
Inst::gen_move(rd, rs1, ty).emit(&[], sink, emit_info, state);
Inst::Jal {
dest: BranchTarget::Label(label_jump_over),
}
.emit(&[], sink, emit_info, state);
sink.bind_label(label_rs2, &mut state.ctrl_plane);
Inst::gen_move(rd, rs2, ty).emit(&[], sink, emit_info, state);
sink.bind_label(label_jump_over, &mut state.ctrl_plane);
}
&Inst::FloatSelect {
op,
rd,
tmp,
rs1,
rs2,
ty,
} => {
let rs1 = allocs.next(rs1);
let rs2 = allocs.next(rs2);
let tmp = allocs.next_writable(tmp);
let rd = allocs.next_writable(rd);
let label_nan = sink.get_label();
let label_jump_over = sink.get_label();
// check if rs1 is nan.
Inst::emit_not_nan(tmp, rs1, ty).emit(&[], sink, emit_info, state);
Inst::CondBr {
taken: BranchTarget::Label(label_nan),
not_taken: BranchTarget::zero(),
kind: IntegerCompare {
kind: IntCC::Equal,
rs1: tmp.to_reg(),
rs2: zero_reg(),
},
}
.emit(&[], sink, emit_info, state);
// check if rs2 is nan.
Inst::emit_not_nan(tmp, rs2, ty).emit(&[], sink, emit_info, state);
Inst::CondBr {
taken: BranchTarget::Label(label_nan),
not_taken: BranchTarget::zero(),
kind: IntegerCompare {
kind: IntCC::Equal,
rs1: tmp.to_reg(),
rs2: zero_reg(),
},
}
.emit(&[], sink, emit_info, state);
                // Here neither rs1 nor rs2 is NaN.
Inst::FpuRRR {
alu_op: op.to_fpuoprrr(ty),
frm: None,
rd: rd,
rs1: rs1,
rs2: rs2,
}
.emit(&[], sink, emit_info, state);
                // Special handling for +0 and -0.
{
                    // Check whether rs1 and rs2 are both zero.
let label_done = sink.get_label();
{
// if rs1 == 0
let mut insts = Inst::emit_if_float_not_zero(
tmp,
rs1,
ty,
BranchTarget::Label(label_done),
BranchTarget::zero(),
);
insts.extend(Inst::emit_if_float_not_zero(
tmp,
rs2,
ty,
BranchTarget::Label(label_done),
BranchTarget::zero(),
));
insts
.iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
}
Inst::FpuRR {
alu_op: FpuOPRR::move_f_to_x_op(ty),
frm: None,
rd: tmp,
rs: rs1,
}
.emit(&[], sink, emit_info, state);
Inst::FpuRR {
alu_op: FpuOPRR::move_f_to_x_op(ty),
frm: None,
rd: writable_spilltmp_reg(),
rs: rs2,
}
.emit(&[], sink, emit_info, state);
Inst::AluRRR {
alu_op: if op == FloatSelectOP::Max {
AluOPRRR::And
} else {
AluOPRRR::Or
},
rd: tmp,
rs1: tmp.to_reg(),
rs2: spilltmp_reg(),
}
.emit(&[], sink, emit_info, state);
// move back to rd.
Inst::FpuRR {
alu_op: FpuOPRR::move_x_to_f_op(ty),
frm: None,
rd,
rs: tmp.to_reg(),
}
.emit(&[], sink, emit_info, state);
                    // Done handling signed zeros.
sink.bind_label(label_done, &mut state.ctrl_plane);
}
                // We have the result; jump over.
Inst::Jal {
dest: BranchTarget::Label(label_jump_over),
}
.emit(&[], sink, emit_info, state);
// here is nan.
sink.bind_label(label_nan, &mut state.ctrl_plane);
op.snan_bits(tmp, ty)
.into_iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
// move to rd.
Inst::FpuRR {
alu_op: FpuOPRR::move_x_to_f_op(ty),
frm: None,
rd,
rs: tmp.to_reg(),
}
.emit(&[], sink, emit_info, state);
sink.bind_label(label_jump_over, &mut state.ctrl_plane);
}
&Inst::Popcnt {
sum,
tmp,
step,
rs,
ty,
} => {
let rs = allocs.next(rs);
let tmp = allocs.next_writable(tmp);
let step = allocs.next_writable(step);
let sum = allocs.next_writable(sum);
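                // Naive popcount loop: `tmp` holds a single-bit probe mask that
                // starts at the MSB of `ty` and shifts right each iteration;
                // `sum` is incremented for every set bit and `step` counts the
                // remaining iterations.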
                // Initialize sum to 0.
Inst::gen_move(sum, zero_reg(), I64).emit(&[], sink, emit_info, state);
                // Load the bit count into step.
Inst::load_imm12(step, Imm12::from_bits(ty.bits() as i16)).emit(
&[],
sink,
emit_info,
state,
);
                // Build the probe mask: load 1 and shift it to the MSB of ty.
Inst::load_imm12(tmp, Imm12::from_bits(1)).emit(&[], sink, emit_info, state);
Inst::AluRRImm12 {
alu_op: AluOPRRI::Slli,
rd: tmp,
rs: tmp.to_reg(),
imm12: Imm12::from_bits((ty.bits() - 1) as i16),
}
.emit(&[], sink, emit_info, state);
let label_done = sink.get_label();
let label_loop = sink.get_label();
sink.bind_label(label_loop, &mut state.ctrl_plane);
Inst::CondBr {
taken: BranchTarget::Label(label_done),
not_taken: BranchTarget::zero(),
kind: IntegerCompare {
kind: IntCC::SignedLessThanOrEqual,
rs1: step.to_reg(),
rs2: zero_reg(),
},
}
.emit(&[], sink, emit_info, state);
                // Test the current bit and increment sum if it is set.
{
Inst::AluRRR {
alu_op: AluOPRRR::And,
rd: writable_spilltmp_reg2(),
rs1: tmp.to_reg(),
rs2: rs,
}
.emit(&[], sink, emit_info, state);
let label_over = sink.get_label();
Inst::CondBr {
taken: BranchTarget::Label(label_over),
not_taken: BranchTarget::zero(),
kind: IntegerCompare {
kind: IntCC::Equal,
rs1: zero_reg(),
rs2: spilltmp_reg2(),
},
}
.emit(&[], sink, emit_info, state);
Inst::AluRRImm12 {
alu_op: AluOPRRI::Addi,
rd: sum,
rs: sum.to_reg(),
imm12: Imm12::from_bits(1),
}
.emit(&[], sink, emit_info, state);
sink.bind_label(label_over, &mut state.ctrl_plane);
}
                // Advance: decrement step and shift the mask right.
{
Inst::AluRRImm12 {
alu_op: AluOPRRI::Addi,
rd: step,
rs: step.to_reg(),
imm12: Imm12::from_bits(-1),
}
.emit(&[], sink, emit_info, state);
Inst::AluRRImm12 {
alu_op: AluOPRRI::Srli,
rd: tmp,
rs: tmp.to_reg(),
imm12: Imm12::from_bits(1),
}
.emit(&[], sink, emit_info, state);
Inst::Jal {
dest: BranchTarget::Label(label_loop),
}
.emit(&[], sink, emit_info, state);
}
sink.bind_label(label_done, &mut state.ctrl_plane);
}
&Inst::Rev8 { rs, rd, tmp, step } => {
let rs = allocs.next(rs);
let tmp = allocs.next_writable(tmp);
let step = allocs.next_writable(step);
let rd = allocs.next_writable(rd);
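                // Byte-reversal loop: peel the low byte of `tmp`, shift it left
                // by `step` (56, 48, ..., 0), and OR it into `rd`.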
// init.
Inst::gen_move(rd, zero_reg(), I64).emit(&[], sink, emit_info, state);
Inst::gen_move(tmp, rs, I64).emit(&[], sink, emit_info, state);
// load 56 to step.
Inst::load_imm12(step, Imm12::from_bits(56)).emit(&[], sink, emit_info, state);
let label_done = sink.get_label();
let label_loop = sink.get_label();
sink.bind_label(label_loop, &mut state.ctrl_plane);
Inst::CondBr {
taken: BranchTarget::Label(label_done),
not_taken: BranchTarget::zero(),
kind: IntegerCompare {
kind: IntCC::SignedLessThan,
rs1: step.to_reg(),
rs2: zero_reg(),
},
}
.emit(&[], sink, emit_info, state);
Inst::AluRRImm12 {
alu_op: AluOPRRI::Andi,
rd: writable_spilltmp_reg(),
rs: tmp.to_reg(),
imm12: Imm12::from_bits(255),
}
.emit(&[], sink, emit_info, state);
Inst::AluRRR {
alu_op: AluOPRRR::Sll,
rd: writable_spilltmp_reg(),
rs1: spilltmp_reg(),
rs2: step.to_reg(),
}
.emit(&[], sink, emit_info, state);
Inst::AluRRR {
alu_op: AluOPRRR::Or,
rd: rd,
rs1: rd.to_reg(),
rs2: spilltmp_reg(),
}
.emit(&[], sink, emit_info, state);
{
                    // Decrement step by 8 (one byte).
Inst::AluRRImm12 {
alu_op: AluOPRRI::Addi,
rd: step,
rs: step.to_reg(),
imm12: Imm12::from_bits(-8),
}
.emit(&[], sink, emit_info, state);
                    // Shift tmp right by one byte.
Inst::AluRRImm12 {
alu_op: AluOPRRI::Srli,
rd: tmp,
rs: tmp.to_reg(),
imm12: Imm12::from_bits(8),
}
.emit(&[], sink, emit_info, state);
// loop.
Inst::Jal {
dest: BranchTarget::Label(label_loop),
}
}
.emit(&[], sink, emit_info, state);
sink.bind_label(label_done, &mut state.ctrl_plane);
}
&Inst::Cltz {
sum,
tmp,
step,
rs,
leading,
ty,
} => {
let rs = allocs.next(rs);
let tmp = allocs.next_writable(tmp);
let step = allocs.next_writable(step);
let sum = allocs.next_writable(sum);
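                // Count leading/trailing zeros by scanning with a one-bit probe
                // mask, from the MSB down (leading) or from the LSB up
                // (trailing), stopping at the first set bit.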
                // Initialize sum to 0.
Inst::gen_move(sum, zero_reg(), I64).emit(&[], sink, emit_info, state);
                // Load the bit count into step.
Inst::load_imm12(step, Imm12::from_bits(ty.bits() as i16)).emit(
&[],
sink,
emit_info,
state,
);
                // Build the probe mask: load 1; for leading-zero counts it is
                // shifted to the MSB below.
Inst::load_imm12(tmp, Imm12::from_bits(1)).emit(&[], sink, emit_info, state);
if leading {
Inst::AluRRImm12 {
alu_op: AluOPRRI::Slli,
rd: tmp,
rs: tmp.to_reg(),
imm12: Imm12::from_bits((ty.bits() - 1) as i16),
}
.emit(&[], sink, emit_info, state);
}
let label_done = sink.get_label();
let label_loop = sink.get_label();
sink.bind_label(label_loop, &mut state.ctrl_plane);
Inst::CondBr {
taken: BranchTarget::Label(label_done),
not_taken: BranchTarget::zero(),
kind: IntegerCompare {
kind: IntCC::SignedLessThanOrEqual,
rs1: step.to_reg(),
rs2: zero_reg(),
},
}
.emit(&[], sink, emit_info, state);
// test the current bit: if it is set the count is done, otherwise increment sum.
{
Inst::AluRRR {
alu_op: AluOPRRR::And,
rd: writable_spilltmp_reg2(),
rs1: tmp.to_reg(),
rs2: rs,
}
.emit(&[], sink, emit_info, state);
Inst::CondBr {
taken: BranchTarget::Label(label_done),
not_taken: BranchTarget::zero(),
kind: IntegerCompare {
kind: IntCC::NotEqual,
rs1: zero_reg(),
rs2: spilltmp_reg2(),
},
}
.emit(&[], sink, emit_info, state);
Inst::AluRRImm12 {
alu_op: AluOPRRI::Addi,
rd: sum,
rs: sum.to_reg(),
imm12: Imm12::from_bits(1),
}
.emit(&[], sink, emit_info, state);
}
// decrement step and move the mask to the next bit.
{
Inst::AluRRImm12 {
alu_op: AluOPRRI::Addi,
rd: step,
rs: step.to_reg(),
imm12: Imm12::from_bits(-1),
}
.emit(&[], sink, emit_info, state);
Inst::AluRRImm12 {
alu_op: if leading {
AluOPRRI::Srli
} else {
AluOPRRI::Slli
},
rd: tmp,
rs: tmp.to_reg(),
imm12: Imm12::from_bits(1),
}
.emit(&[], sink, emit_info, state);
Inst::Jal {
dest: BranchTarget::Label(label_loop),
}
.emit(&[], sink, emit_info, state);
}
sink.bind_label(label_done, &mut state.ctrl_plane);
}
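// `Brev8` reverses the bits within each byte. `tmp` scans the source bits
// from the MSB down while `tmp2` tracks the mirrored destination bit,
// which moves *up* within each output byte and drops to the LSB of the
// next lower byte whenever an 8-bit boundary is crossed. An illustrative
// sketch:
//
//   rd = 0;
//   step = ty.bits();
//   tmp = 1 << (ty.bits() - 1);  // source mask, MSB first.
//   tmp2 = 1 << (ty.bits() - 8); // dest mask, LSB of the top byte.
//   while step > 0 {
//       if rs & tmp != 0 { rd |= tmp2; }
//       step -= 1;
//       tmp >>= 1;
//       if step % 8 == 0 { tmp2 >>= 15 } else { tmp2 <<= 1 }
//   }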
&Inst::Brev8 {
rs,
ty,
step,
tmp,
tmp2,
rd,
} => {
let rs = allocs.next(rs);
let step = allocs.next_writable(step);
let tmp = allocs.next_writable(tmp);
let tmp2 = allocs.next_writable(tmp2);
let rd = allocs.next_writable(rd);
Inst::gen_move(rd, zero_reg(), I64).emit(&[], sink, emit_info, state);
Inst::load_imm12(step, Imm12::from_bits(ty.bits() as i16)).emit(
&[],
sink,
emit_info,
state,
);
// load 1 into tmp; the shift below moves it to the MSB, the first source bit.
Inst::load_imm12(tmp, Imm12::from_bits(1)).emit(&[], sink, emit_info, state);
Inst::AluRRImm12 {
alu_op: AluOPRRI::Slli,
rd: tmp,
rs: tmp.to_reg(),
imm12: Imm12::from_bits((ty.bits() - 1) as i16),
}
.emit(&[], sink, emit_info, state);
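// tmp2 starts at bit `ty.bits() - 8`, the LSB of the most significant
// byte, which is where the source MSB lands after per-byte reversal.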
Inst::load_imm12(tmp2, Imm12::from_bits(1)).emit(&[], sink, emit_info, state);
Inst::AluRRImm12 {
alu_op: AluOPRRI::Slli,
rd: tmp2,
rs: tmp2.to_reg(),
imm12: Imm12::from_bits((ty.bits() - 8) as i16),
}
.emit(&[], sink, emit_info, state);
let label_done = sink.get_label();
let label_loop = sink.get_label();
sink.bind_label(label_loop, &mut state.ctrl_plane);
Inst::CondBr {
taken: BranchTarget::Label(label_done),
not_taken: BranchTarget::zero(),
kind: IntegerCompare {
kind: IntCC::SignedLessThanOrEqual,
rs1: step.to_reg(),
rs2: zero_reg(),
},
}
.emit(&[], sink, emit_info, state);
// test the source bit; if it is set, set the mirrored bit in rd.
{
Inst::AluRRR {
alu_op: AluOPRRR::And,
rd: writable_spilltmp_reg2(),
rs1: tmp.to_reg(),
rs2: rs,
}
.emit(&[], sink, emit_info, state);
let label_over = sink.get_label();
Inst::CondBr {
taken: BranchTarget::Label(label_over),
not_taken: BranchTarget::zero(),
kind: IntegerCompare {
kind: IntCC::Equal,
rs1: zero_reg(),
rs2: spilltmp_reg2(),
},
}
.emit(&[], sink, emit_info, state);
Inst::AluRRR {
alu_op: AluOPRRR::Or,
rd,
rs1: rd.to_reg(),
rs2: tmp2.to_reg(),
}
.emit(&[], sink, emit_info, state);
sink.bind_label(label_over, &mut state.ctrl_plane);
}
// update step, tmp, and tmp2.
{
Inst::AluRRImm12 {
alu_op: AluOPRRI::Addi,
rd: step,
rs: step.to_reg(),
imm12: Imm12::from_bits(-1),
}
.emit(&[], sink, emit_info, state);
Inst::AluRRImm12 {
alu_op: AluOPRRI::Srli,
rd: tmp,
rs: tmp.to_reg(),
imm12: Imm12::from_bits(1),
}
.emit(&[], sink, emit_info, state);
{
// advance tmp2: if step % 8 == 0 we just crossed a byte boundary, so
// move tmp2 from the MSB of the current output byte down to the LSB
// of the next lower byte (a net shift right by 15); otherwise shift
// it left by 1 within the current byte.
let label_over = sink.get_label();
let label_sll_1 = sink.get_label();
Inst::load_imm12(writable_spilltmp_reg2(), Imm12::from_bits(8)).emit(
&[],
sink,
emit_info,
state,
);
Inst::AluRRR {
alu_op: AluOPRRR::Rem,
rd: writable_spilltmp_reg2(),
rs1: step.to_reg(),
rs2: spilltmp_reg2(),
}
.emit(&[], sink, emit_info, state);
Inst::CondBr {
taken: BranchTarget::Label(label_sll_1),
not_taken: BranchTarget::zero(),
kind: IntegerCompare {
kind: IntCC::NotEqual,
rs1: spilltmp_reg2(),
rs2: zero_reg(),
},
}
.emit(&[], sink, emit_info, state);
Inst::AluRRImm12 {
alu_op: AluOPRRI::Srli,
rd: tmp2,
rs: tmp2.to_reg(),
imm12: Imm12::from_bits(15),
}
.emit(&[], sink, emit_info, state);
Inst::Jal {
dest: BranchTarget::Label(label_over),
}
.emit(&[], sink, emit_info, state);
sink.bind_label(label_sll_1, &mut state.ctrl_plane);
Inst::AluRRImm12 {
alu_op: AluOPRRI::Slli,
rd: tmp2,
rs: tmp2.to_reg(),
imm12: Imm12::from_bits(1),
}
.emit(&[], sink, emit_info, state);
sink.bind_label(label_over, &mut state.ctrl_plane);
}
Inst::Jal {
dest: BranchTarget::Label(label_loop),
}
.emit(&[], sink, emit_info, state);
}
sink.bind_label(label_done, &mut state.ctrl_plane);
}
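// `StackProbeLoop` writes a zero byte at guard-size intervals below the
// stack pointer so that every page of a large new frame is touched (or
// faults) before the frame is used. An illustrative sketch:
//
//   step = guard_size * probe_count;
//   while step > guard_size {
//       *(sp - step) = 0; // one byte per page.
//       step -= guard_size;
//   }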
&Inst::StackProbeLoop {
guard_size,
probe_count,
tmp: guard_size_tmp,
} => {
let step = writable_spilltmp_reg();
Inst::load_constant_u64(
step,
(guard_size as u64) * (probe_count as u64),
&mut |_| step,
)
.iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
Inst::load_constant_u64(guard_size_tmp, guard_size as u64, &mut |_| guard_size_tmp)
.iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));
let loop_start = sink.get_label();
let label_done = sink.get_label();
sink.bind_label(loop_start, &mut state.ctrl_plane);
Inst::CondBr {
taken: BranchTarget::Label(label_done),
not_taken: BranchTarget::zero(),
kind: IntegerCompare {
kind: IntCC::UnsignedLessThanOrEqual,
rs1: step.to_reg(),
rs2: guard_size_tmp.to_reg(),
},
}
.emit(&[], sink, emit_info, state);
// compute the probe address: sp - step.
Inst::AluRRR {
alu_op: AluOPRRR::Sub,
rd: writable_spilltmp_reg2(),
rs1: stack_reg(),
rs2: step.to_reg(),
}
.emit(&[], sink, emit_info, state);
Inst::Store {
to: AMode::RegOffset(spilltmp_reg2(), 0, I8),
op: StoreOP::Sb,
flags: MemFlags::new(),
src: zero_reg(),
}
.emit(&[], sink, emit_info, state);
// step -= guard_size.
Inst::AluRRR {
alu_op: AluOPRRR::Sub,
rd: step,
rs1: step.to_reg(),
rs2: guard_size_tmp.to_reg(),
}
.emit(&[], sink, emit_info, state);
Inst::Jal {
dest: BranchTarget::Label(loop_start),
}
.emit(&[], sink, emit_info, state);
sink.bind_label(label_done, &mut state.ctrl_plane);
}
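// The `VecAlu*` arms below each emit a single 4-byte OP-V encoding via the
// `encode_*` helpers. `VecAluRRRImm5` is the one non-mechanical case: the
// destination is also a source operand, so regalloc must have assigned
// `vd` and `vd_src` to the same register (checked by the debug_assert).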
&Inst::VecAluRRRImm5 {
op,
vd,
vd_src,
imm,
vs2,
ref mask,
..
} => {
let vs2 = allocs.next(vs2);
let vd_src = allocs.next(vd_src);
let vd = allocs.next_writable(vd);
let mask = mask.with_allocs(&mut allocs);
debug_assert_eq!(vd.to_reg(), vd_src);
sink.put4(encode_valu_rrr_imm(op, vd, imm, vs2, mask));
}
&Inst::VecAluRRR {
op,
vd,
vs1,
vs2,
ref mask,
..
} => {
let vs1 = allocs.next(vs1);
let vs2 = allocs.next(vs2);
let vd = allocs.next_writable(vd);
let mask = mask.with_allocs(&mut allocs);
sink.put4(encode_valu(op, vd, vs1, vs2, mask));
}
&Inst::VecAluRRImm5 {
op,
vd,
imm,
vs2,
ref mask,
..
} => {
let vs2 = allocs.next(vs2);
let vd = allocs.next_writable(vd);
let mask = mask.with_allocs(&mut allocs);
sink.put4(encode_valu_rr_imm(op, vd, imm, vs2, mask));
}
&Inst::VecAluRR {
op,
vd,
vs,
ref mask,
..
} => {
let vs = allocs.next(vs);
let vd = allocs.next_writable(vd);
let mask = mask.with_allocs(&mut allocs);
sink.put4(encode_valu_rr(op, vd, vs, mask));
}
&Inst::VecAluRImm5 {
op,
vd,
imm,
ref mask,
..
} => {
let vd = allocs.next_writable(vd);
let mask = mask.with_allocs(&mut allocs);
sink.put4(encode_valu_r_imm(op, vd, imm, mask));
}
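// `VecSetState` emits a vector-configuration instruction with an immediate
// AVL (0x57 is the OP-V major opcode), setting the active vtype. The
// resulting configuration is recorded in `state.vstate` so subsequent
// vector instructions can rely on a known vector unit state.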
&Inst::VecSetState { rd, ref vstate } => {
let rd = allocs.next_writable(rd);
sink.put4(encode_vcfg_imm(
0x57,
rd.to_reg(),
vstate.avl.unwrap_static(),
&vstate.vtype,
));
// Update the current vector emit state.
state.vstate = EmitVState::Known(vstate.clone());
}
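// Vector loads are encoded under the LOAD-FP major opcode (0x07) with
// vector-specific element-width, addressing-mode (mop), and segment-count
// (nf) fields. Only unit-stride addressing is generated here, and the base
// address must already sit in a scalar register.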
&Inst::VecLoad {
eew,
to,
ref from,
ref mask,
flags,
..
} => {
let from = from.clone().with_allocs(&mut allocs);
let to = allocs.next_writable(to);
let mask = mask.with_allocs(&mut allocs);
// Vector loads don't support immediate offsets, so the address may need to be materialized in a register.
let addr = match from {
VecAMode::UnitStride { base } => {
let base_reg = base.get_base_register();
let offset = base.get_offset_with_state(state);
// Reg+0 Offset can be directly encoded
if let (Some(base_reg), 0) = (base_reg, offset) {
base_reg
} else {
// Otherwise materialize the full address in a temporary register and load from that.
let tmp = writable_spilltmp_reg();
Inst::LoadAddr {
rd: tmp,
mem: base.clone(),
}
.emit(&[], sink, emit_info, state);
tmp.to_reg()
}
}
};
let srcloc = state.cur_srcloc();
if !srcloc.is_default() && !flags.notrap() {
// Register the offset at which the actual load instruction starts.
sink.add_trap(TrapCode::HeapOutOfBounds);
}
sink.put4(encode_vmem_load(
0x07,
to.to_reg(),
eew,
addr,
from.lumop(),
mask,
from.mop(),
from.nf(),
));
}
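// Vector stores mirror the vector-load case above, but use the STORE-FP
// major opcode (0x27) and take `from` as the vector data source rather
// than a destination.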
&Inst::VecStore {
eew,
ref to,
from,
ref mask,
flags,
..
} => {
let to = to.clone().with_allocs(&mut allocs);
let from = allocs.next(from);
let mask = mask.with_allocs(&mut allocs);
// Vector stores don't support immediate offsets, so the address may need to be materialized in a register.
let addr = match to {
VecAMode::UnitStride { base } => {
let base_reg = base.get_base_register();
let offset = base.get_offset_with_state(state);
// Reg+0 Offset can be directly encoded
if let (Some(base_reg), 0) = (base_reg, offset) {
base_reg
} else {
// Otherwise materialize the full address in a temporary register and store through it.
let tmp = writable_spilltmp_reg();
Inst::LoadAddr {
rd: tmp,
mem: base.clone(),
}
.emit(&[], sink, emit_info, state);
tmp.to_reg()
}
}
};
let srcloc = state.cur_srcloc();
if !srcloc.is_default() && !flags.notrap() {
// Register the offset at which the actual store instruction starts.
sink.add_trap(TrapCode::HeapOutOfBounds);
}
sink.put4(encode_vmem_store(
0x27,
from,
eew,
addr,
to.sumop(),
mask,
to.mop(),
to.nf(),
));
}
};
let end_off = sink.cur_offset();
assert!(
(end_off - start_off) <= Inst::worst_case_size(),
"Inst:{:?} length:{} worst_case_size:{}",
self,
end_off - start_off,
Inst::worst_case_size()
);
}
fn pretty_print_inst(&self, allocs: &[Allocation], state: &mut Self::State) -> String {
let mut allocs = AllocationConsumer::new(allocs);
self.print_with_state(state, &mut allocs)
}
}
// Helper: consume one allocation for each register of a `ValueRegs`.
fn alloc_value_regs(origin: &ValueRegs<Reg>, alloc: &mut AllocationConsumer) -> ValueRegs<Reg> {
match origin.regs().len() {
1 => ValueRegs::one(alloc.next(origin.regs()[0])),
2 => ValueRegs::two(alloc.next(origin.regs()[0]), alloc.next(origin.regs()[1])),
_ => unreachable!(),
}
}