| field | value | date |
|---|---|---|
| author | Takashi Kokubun <takashi.kokubun@shopify.com> | 2025-11-03 16:49:25 -0800 |
| committer | GitHub <noreply@github.com> | 2025-11-03 16:49:25 -0800 |
| commit | 9ca940757315e53d1eaddc83071b1b4581f8f578 | |
| tree | 82429b9d0516f431f07207a1c820e9045226f3b3 | |
| parent | 0d210f4d39c1cff71f34dbe9938345897f3e5eb1 | |
ZJIT: Implement register spill (#14936)
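
The change in a sentence: when the linear-scan register allocator runs out of physical registers, ZJIT now spills the virtual register to a slot on the native C stack (tracked by a new `StackState`) instead of failing compilation with `RegisterSpillOnAlloc`/`RegisterSpillOnCCall`. The sketch below is a deliberately simplified illustration of that fallback, using made-up names and types rather than the actual `RegisterPool`/`StackState` code in `zjit/src/backend/lir.rs` further down:

```rust
// Hypothetical, simplified sketch of the spill fallback: when every physical
// register is live, hand out a stack slot instead of aborting compilation.
// Names and types are illustrative, not the actual ZJIT definitions.

#[derive(Clone, Copy, Debug, PartialEq)]
enum Location {
    Reg(u8),               // physical register number
    Stack { slot: usize }, // spill slot index below the frame base
}

struct Allocator {
    regs: Vec<u8>,              // allocatable registers
    in_use: Vec<Option<usize>>, // which vreg holds each register, if any
    slots: Vec<Option<usize>>,  // which vreg holds each spill slot, if any
}

impl Allocator {
    fn new(regs: Vec<u8>) -> Self {
        let in_use = vec![None; regs.len()];
        Allocator { regs, in_use, slots: vec![] }
    }

    /// Allocate a location for `vreg`: prefer a free register, otherwise spill.
    fn alloc(&mut self, vreg: usize) -> Location {
        // Take the first free register, if any.
        if let Some(i) = self.in_use.iter().position(|u| u.is_none()) {
            self.in_use[i] = Some(vreg);
            return Location::Reg(self.regs[i]);
        }
        // All registers are live: reuse a free spill slot or grow the frame.
        if let Some(slot) = self.slots.iter().position(|s| s.is_none()) {
            self.slots[slot] = Some(vreg);
            return Location::Stack { slot };
        }
        self.slots.push(Some(vreg));
        Location::Stack { slot: self.slots.len() - 1 }
    }

    /// Release whatever location `vreg` holds once its live range ends.
    fn free(&mut self, loc: Location) {
        match loc {
            Location::Reg(r) => {
                let i = self.regs.iter().position(|&x| x == r).unwrap();
                self.in_use[i] = None;
            }
            Location::Stack { slot } => self.slots[slot] = None,
        }
    }
}

fn main() {
    let mut a = Allocator::new(vec![0, 1]); // only two registers
    let l0 = a.alloc(100);
    let l1 = a.alloc(101);
    let l2 = a.alloc(102); // no register left -> spilled
    assert_eq!(l2, Location::Stack { slot: 0 });
    a.free(l1);
    assert_eq!(a.alloc(103), Location::Reg(1)); // register 1 is free again
    println!("{l0:?} {l1:?} {l2:?}");
}
```

In the real allocator the returned location is an `Opnd` (`Opnd::Reg` or a `NATIVE_BASE_PTR`-relative `Opnd::Mem`), and the maximum number of slots ever used is patched into `Insn::FrameSetup`'s `slot_count` at the end of `alloc_regs`.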
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | zjit/src/asm/x86_64/mod.rs | 7 |
| -rw-r--r-- | zjit/src/asm/x86_64/tests.rs | 18 |
| -rw-r--r-- | zjit/src/backend/arm64/mod.rs | 187 |
| -rw-r--r-- | zjit/src/backend/lir.rs | 213 |
| -rw-r--r-- | zjit/src/backend/x86_64/mod.rs | 157 |
5 files changed, 431 insertions, 151 deletions
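
Spilled VRegs introduce a wrinkle: a memory operand whose *base* is a spilled VReg can no longer be encoded directly, so `alloc_regs` rewrites such bases to the new `MemBase::Stack` variant and the per-architecture `*_scratch_split` passes lower it back to a register base using a scratch register. The following is a rough, self-contained sketch of that lowering; the types, register numbers, and the `SIZEOF_VALUE` constant are illustrative stand-ins, not the backend's real definitions:

```rust
// Rough sketch (not the real backend code) of how a memory operand whose base
// was spilled gets lowered in scratch_split: first load the base pointer from
// its stack slot into a scratch register, then rewrite the operand to use that
// register as its base.

const SIZEOF_VALUE: i32 = 8; // VALUE is 8 bytes on 64-bit targets

#[derive(Clone, Copy, Debug)]
enum Base {
    Reg(u8),
    Stack { slot: usize }, // spilled base: lives below the native frame base
}

#[derive(Clone, Copy, Debug)]
struct Mem { base: Base, disp: i32 }

#[derive(Debug)]
enum Insn {
    Load { dst_reg: u8, src: Mem }, // dst_reg <- [src]
}

const FRAME_BASE_REG: u8 = 29; // e.g. x29 / rbp
const SCRATCH_REG: u8 = 17;    // e.g. x17 / r11

/// Turn a Stack-based memory operand into a Reg-based one, emitting the
/// extra load into `out` when needed.
fn lower_stack_base(mem: Mem, stack_base_idx: usize, out: &mut Vec<Insn>) -> Mem {
    match mem.base {
        Base::Reg(_) => mem,
        Base::Stack { slot } => {
            // Address of the spill slot relative to the frame base.
            let slot_disp = -((stack_base_idx + slot + 1) as i32) * SIZEOF_VALUE;
            out.push(Insn::Load {
                dst_reg: SCRATCH_REG,
                src: Mem { base: Base::Reg(FRAME_BASE_REG), disp: slot_disp },
            });
            Mem { base: Base::Reg(SCRATCH_REG), disp: mem.disp }
        }
    }
}

fn main() {
    let mut insns = vec![];
    // A load from [spilled_base + 16], where the base VReg sits in spill slot 0.
    let opnd = Mem { base: Base::Stack { slot: 0 }, disp: 16 };
    let lowered = lower_stack_base(opnd, 2, &mut insns);
    println!("extra insns: {insns:?}");
    println!("lowered operand: {lowered:?}");
}
```

The displacement arithmetic mirrors `StackState::stack_idx_to_disp` in the diff: slot `i` lives at `-(stack_base_idx + i + 1) * SIZEOF_VALUE` bytes off the native base pointer, so with `stack_base_idx = 2` the first spill slot sits at `[frame_base - 24]`.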
diff --git a/zjit/src/asm/x86_64/mod.rs b/zjit/src/asm/x86_64/mod.rs index 9d3bf18dcd..cfedca4540 100644 --- a/zjit/src/asm/x86_64/mod.rs +++ b/zjit/src/asm/x86_64/mod.rs @@ -779,13 +779,6 @@ pub fn imul(cb: &mut CodeBlock, opnd0: X86Opnd, opnd1: X86Opnd) { write_rm(cb, false, true, opnd0, opnd1, None, &[0x0F, 0xAF]); } - // Flip the operands to handle this case. This instruction has weird encoding restrictions. - (X86Opnd::Mem(_), X86Opnd::Reg(_)) => { - //REX.W + 0F AF /rIMUL r64, r/m64 - // Quadword register := Quadword register * r/m64. - write_rm(cb, false, true, opnd1, opnd0, None, &[0x0F, 0xAF]); - } - _ => unreachable!() } } diff --git a/zjit/src/asm/x86_64/tests.rs b/zjit/src/asm/x86_64/tests.rs index 0f86725946..d574bdb034 100644 --- a/zjit/src/asm/x86_64/tests.rs +++ b/zjit/src/asm/x86_64/tests.rs @@ -228,23 +228,29 @@ fn test_cqo() { fn test_imul() { let cb1 = compile(|cb| imul(cb, RAX, RBX)); let cb2 = compile(|cb| imul(cb, RDX, mem_opnd(64, RAX, 0))); - // Operands flipped for encoding since multiplication is commutative - let cb3 = compile(|cb| imul(cb, mem_opnd(64, RAX, 0), RDX)); - assert_disasm_snapshot!(disasms!(cb1, cb2, cb3), @r" + assert_disasm_snapshot!(disasms!(cb1, cb2), @r" 0x0: imul rax, rbx 0x0: imul rdx, qword ptr [rax] - 0x0: imul rdx, qword ptr [rax] "); - assert_snapshot!(hexdumps!(cb1, cb2, cb3), @r" + assert_snapshot!(hexdumps!(cb1, cb2), @r" 480fafc3 480faf10 - 480faf10 "); } #[test] +#[should_panic] +fn test_imul_mem_reg() { + // imul doesn't have (Mem, Reg) encoding. Since multiplication is communicative, imul() could + // swap operands. However, x86_scratch_split may need to move the result to the output operand, + // which can be complicated if the assembler may sometimes change the result operand. + // So x86_scratch_split should be responsible for that swap, not the assembler. + compile(|cb| imul(cb, mem_opnd(64, RAX, 0), RDX)); +} + +#[test] fn test_jge_label() { let cb = compile(|cb| { let label_idx = cb.new_label("loop".to_owned()); diff --git a/zjit/src/backend/arm64/mod.rs b/zjit/src/backend/arm64/mod.rs index d762b14c91..acf0576f9c 100644 --- a/zjit/src/backend/arm64/mod.rs +++ b/zjit/src/backend/arm64/mod.rs @@ -79,6 +79,9 @@ impl From<Opnd> for A64Opnd { Opnd::Mem(Mem { base: MemBase::VReg(_), .. }) => { panic!("attempted to lower an Opnd::Mem with a MemBase::VReg base") }, + Opnd::Mem(Mem { base: MemBase::Stack { .. }, .. }) => { + panic!("attempted to lower an Opnd::Mem with a MemBase::Stack base") + }, Opnd::VReg { .. } => panic!("attempted to lower an Opnd::VReg"), Opnd::Value(_) => panic!("attempted to lower an Opnd::Value"), Opnd::None => panic!( @@ -203,6 +206,7 @@ pub const ALLOC_REGS: &[Reg] = &[ /// [`Assembler::arm64_scratch_split`] or [`Assembler::new_with_scratch_reg`]. const SCRATCH0_OPND: Opnd = Opnd::Reg(X15_REG); const SCRATCH1_OPND: Opnd = Opnd::Reg(X17_REG); +const SCRATCH2_OPND: Opnd = Opnd::Reg(X14_REG); impl Assembler { /// Special register for intermediate processing in arm64_emit. It should be used only by arm64_emit. @@ -690,22 +694,129 @@ impl Assembler { /// need to be split with registers after `alloc_regs`, e.g. for `compile_exits`, so this /// splits them and uses scratch registers for it. fn arm64_scratch_split(self) -> Assembler { - let mut asm = Assembler::new_with_asm(&self); + /// If opnd is Opnd::Mem with a too large disp, make the disp smaller using lea. + fn split_large_disp(asm: &mut Assembler, opnd: Opnd, scratch_opnd: Opnd) -> Opnd { + match opnd { + Opnd::Mem(Mem { num_bits, disp, .. 
}) if !mem_disp_fits_bits(disp) => { + asm.lea_into(scratch_opnd, opnd); + Opnd::mem(num_bits, scratch_opnd, 0) + } + _ => opnd, + } + } + + /// If opnd is Opnd::Mem with MemBase::Stack, lower it to Opnd::Mem with MemBase::Reg, and split a large disp. + fn split_stack_membase(asm: &mut Assembler, opnd: Opnd, scratch_opnd: Opnd, stack_state: &StackState) -> Opnd { + let opnd = split_only_stack_membase(asm, opnd, scratch_opnd, stack_state); + split_large_disp(asm, opnd, scratch_opnd) + } + + /// split_stack_membase but without split_large_disp. This should be used only by lea. + fn split_only_stack_membase(asm: &mut Assembler, opnd: Opnd, scratch_opnd: Opnd, stack_state: &StackState) -> Opnd { + if let Opnd::Mem(Mem { base: stack_membase @ MemBase::Stack { .. }, disp: opnd_disp, num_bits: opnd_num_bits }) = opnd { + let base = Opnd::Mem(stack_state.stack_membase_to_mem(stack_membase)); + let base = split_large_disp(asm, base, scratch_opnd); + asm.load_into(scratch_opnd, base); + Opnd::Mem(Mem { base: MemBase::Reg(scratch_opnd.unwrap_reg().reg_no), disp: opnd_disp, num_bits: opnd_num_bits }) + } else { + opnd + } + } + + /// If opnd is Opnd::Mem, lower it to scratch_opnd. You should use this when `opnd` is read by the instruction, not written. + fn split_memory_read(asm: &mut Assembler, opnd: Opnd, scratch_opnd: Opnd) -> Opnd { + if let Opnd::Mem(_) = opnd { + let opnd = split_large_disp(asm, opnd, scratch_opnd); + let scratch_opnd = opnd.num_bits().map(|num_bits| scratch_opnd.with_num_bits(num_bits)).unwrap_or(scratch_opnd); + asm.load_into(scratch_opnd, opnd); + scratch_opnd + } else { + opnd + } + } + + /// If opnd is Opnd::Mem, set scratch_reg to *opnd. Return Some(Opnd::Mem) if it needs to be written back from scratch_reg. + fn split_memory_write(opnd: &mut Opnd, scratch_opnd: Opnd) -> Option<Opnd> { + if let Opnd::Mem(_) = opnd { + let mem_opnd = opnd.clone(); + *opnd = opnd.num_bits().map(|num_bits| scratch_opnd.with_num_bits(num_bits)).unwrap_or(scratch_opnd); + Some(mem_opnd) + } else { + None + } + } + + // Prepare StackState to lower MemBase::Stack + let stack_state = StackState::new(self.stack_base_idx); + + let mut asm_local = Assembler::new_with_asm(&self); + let asm = &mut asm_local; asm.accept_scratch_reg = true; let mut iterator = self.insns.into_iter().enumerate().peekable(); while let Some((_, mut insn)) = iterator.next() { match &mut insn { - &mut Insn::Mul { out, .. 
} => { + Insn::Add { left, right, out } | + Insn::Sub { left, right, out } | + Insn::And { left, right, out } | + Insn::Or { left, right, out } | + Insn::Xor { left, right, out } | + Insn::CSelZ { truthy: left, falsy: right, out } | + Insn::CSelNZ { truthy: left, falsy: right, out } | + Insn::CSelE { truthy: left, falsy: right, out } | + Insn::CSelNE { truthy: left, falsy: right, out } | + Insn::CSelL { truthy: left, falsy: right, out } | + Insn::CSelLE { truthy: left, falsy: right, out } | + Insn::CSelG { truthy: left, falsy: right, out } | + Insn::CSelGE { truthy: left, falsy: right, out } => { + *left = split_memory_read(asm, *left, SCRATCH0_OPND); + *right = split_memory_read(asm, *right, SCRATCH1_OPND); + let mem_out = split_memory_write(out, SCRATCH0_OPND); + + asm.push_insn(insn); + + if let Some(mem_out) = mem_out { + let mem_out = split_large_disp(asm, mem_out, SCRATCH1_OPND); + asm.store(mem_out, SCRATCH0_OPND); + } + } + Insn::Mul { left, right, out } => { + *left = split_memory_read(asm, *left, SCRATCH0_OPND); + *right = split_memory_read(asm, *right, SCRATCH1_OPND); + let mem_out = split_memory_write(out, SCRATCH0_OPND); + let reg_out = out.clone(); + asm.push_insn(insn); + if let Some(mem_out) = mem_out { + let mem_out = split_large_disp(asm, mem_out, SCRATCH1_OPND); + asm.store(mem_out, SCRATCH0_OPND); + }; + // If the next instruction is JoMul if matches!(iterator.peek(), Some((_, Insn::JoMul(_)))) { // Produce a register that is all zeros or all ones // Based on the sign bit of the 64-bit mul result - asm.push_insn(Insn::RShift { out: SCRATCH0_OPND, opnd: out, shift: Opnd::UImm(63) }); + asm.push_insn(Insn::RShift { out: SCRATCH0_OPND, opnd: reg_out, shift: Opnd::UImm(63) }); + } + } + Insn::RShift { opnd, out, .. } => { + *opnd = split_memory_read(asm, *opnd, SCRATCH0_OPND); + let mem_out = split_memory_write(out, SCRATCH0_OPND); + + asm.push_insn(insn); + + if let Some(mem_out) = mem_out { + let mem_out = split_large_disp(asm, mem_out, SCRATCH1_OPND); + asm.store(mem_out, SCRATCH0_OPND); } } + Insn::Cmp { left, right } | + Insn::Test { left, right } => { + *left = split_memory_read(asm, *left, SCRATCH0_OPND); + *right = split_memory_read(asm, *right, SCRATCH1_OPND); + asm.push_insn(insn); + } // For compile_exits, support splitting simple C arguments here Insn::CCall { opnds, .. } if !opnds.is_empty() => { for (i, opnd) in opnds.iter().enumerate() { @@ -714,16 +825,32 @@ impl Assembler { *opnds = vec![]; asm.push_insn(insn); } - &mut Insn::Lea { opnd, out } => { - match (opnd, out) { - // Split here for compile_exits - (Opnd::Mem(_), Opnd::Mem(_)) => { - asm.lea_into(SCRATCH0_OPND, opnd); - asm.store(out, SCRATCH0_OPND); - } - _ => { - asm.push_insn(insn); + Insn::Lea { opnd, out } => { + *opnd = split_only_stack_membase(asm, *opnd, SCRATCH0_OPND, &stack_state); + let mem_out = split_memory_write(out, SCRATCH0_OPND); + + asm.push_insn(insn); + + if let Some(mem_out) = mem_out { + let mem_out = split_large_disp(asm, mem_out, SCRATCH1_OPND); + asm.store(mem_out, SCRATCH0_OPND); + } + } + Insn::Load { opnd, out } | + Insn::LoadInto { opnd, dest: out } => { + *opnd = split_stack_membase(asm, *opnd, SCRATCH0_OPND, &stack_state); + *out = split_stack_membase(asm, *out, SCRATCH1_OPND, &stack_state); + + if let Opnd::Mem(_) = out { + // If NATIVE_STACK_PTR is used as a source for Store, it's handled as xzr, storeing zero. + // To save the content of NATIVE_STACK_PTR, we need to load it into another register first. 
+ if *opnd == NATIVE_STACK_PTR { + asm.load_into(SCRATCH0_OPND, NATIVE_STACK_PTR); + *opnd = SCRATCH0_OPND; } + asm.store(*out, *opnd); + } else { + asm.push_insn(insn); } } &mut Insn::IncrCounter { mem, value } => { @@ -741,31 +868,24 @@ impl Assembler { asm.cmp(SCRATCH1_OPND, 0.into()); asm.jne(label); } - &mut Insn::Store { dest, src } => { - let Opnd::Mem(Mem { num_bits: dest_num_bits, disp: dest_disp, .. }) = dest else { - panic!("Insn::Store destination must be Opnd::Mem: {dest:?}, {src:?}"); - }; - - // Split dest using a scratch register if necessary. - let dest = if mem_disp_fits_bits(dest_disp) { - dest - } else { - asm.lea_into(SCRATCH0_OPND, dest); - Opnd::mem(dest_num_bits, SCRATCH0_OPND, 0) - }; - - asm.store(dest, src); + Insn::Store { dest, .. } => { + *dest = split_stack_membase(asm, *dest, SCRATCH0_OPND, &stack_state); + asm.push_insn(insn); } - &mut Insn::Mov { dest, src } => { + Insn::Mov { dest, src } => { + *src = split_stack_membase(asm, *src, SCRATCH0_OPND, &stack_state); + *dest = split_large_disp(asm, *dest, SCRATCH1_OPND); match dest { - Opnd::Reg(_) => asm.load_into(dest, src), - Opnd::Mem(_) => asm.store(dest, src), + Opnd::Reg(_) => asm.load_into(*dest, *src), + Opnd::Mem(_) => asm.store(*dest, *src), _ => asm.push_insn(insn), } } // Resolve ParallelMov that couldn't be handled without a scratch register. Insn::ParallelMov { moves } => { for (dst, src) in Self::resolve_parallel_moves(moves, Some(SCRATCH0_OPND)).unwrap() { + let src = split_stack_membase(asm, src, SCRATCH1_OPND, &stack_state); + let dst = split_large_disp(asm, dst, SCRATCH2_OPND); match dst { Opnd::Reg(_) => asm.load_into(dst, src), Opnd::Mem(_) => asm.store(dst, src), @@ -779,7 +899,7 @@ impl Assembler { } } - asm + asm_local } /// Emit platform-specific machine code @@ -1157,10 +1277,11 @@ impl Assembler { load_effective_address(cb, Self::EMIT_OPND, src_base_reg_no, src_disp); A64Opnd::new_mem(dest.rm_num_bits(), Self::EMIT_OPND, 0) }; + let dst = A64Opnd::Reg(Self::EMIT_REG.with_num_bits(src_num_bits)); match src_num_bits { - 64 | 32 => ldur(cb, Self::EMIT_OPND, src_mem), - 16 => ldurh(cb, Self::EMIT_OPND, src_mem), - 8 => ldurb(cb, Self::EMIT_OPND, src_mem), + 64 | 32 => ldur(cb, dst, src_mem), + 16 => ldurh(cb, dst, src_mem), + 8 => ldurb(cb, dst, src_mem), num_bits => panic!("unexpected num_bits: {num_bits}") }; Self::EMIT_REG diff --git a/zjit/src/backend/lir.rs b/zjit/src/backend/lir.rs index 584251de80..66e89a1304 100644 --- a/zjit/src/backend/lir.rs +++ b/zjit/src/backend/lir.rs @@ -28,8 +28,12 @@ pub static JIT_PRESERVED_REGS: &[Opnd] = &[CFP, SP, EC]; #[derive(Clone, Copy, PartialEq, Eq, Debug)] pub enum MemBase { + /// Register: Every Opnd::Mem should have MemBase::Reg as of emit. Reg(u8), + /// Virtual register: Lowered to MemBase::Reg or MemBase::Stack in alloc_regs. VReg(usize), + /// Stack slot: Lowered to MemBase::Reg in scratch_split. 
+ Stack { stack_idx: usize, num_bits: u8 }, } // Memory location @@ -55,6 +59,8 @@ impl fmt::Display for Mem { match self.base { MemBase::Reg(reg_no) => write!(f, "{}", mem_base_reg(reg_no))?, MemBase::VReg(idx) => write!(f, "v{idx}")?, + MemBase::Stack { stack_idx, num_bits } if num_bits == 64 => write!(f, "Stack[{stack_idx}]")?, + MemBase::Stack { stack_idx, num_bits } => write!(f, "Stack{num_bits}[{stack_idx}]")?, } if self.disp != 0 { let sign = if self.disp > 0 { '+' } else { '-' }; @@ -1143,6 +1149,81 @@ impl LiveRange { } } +/// StackState manages which stack slots are used by which VReg +pub struct StackState { + /// The maximum number of spilled VRegs at a time + stack_size: usize, + /// Map from index at the C stack for spilled VRegs to Some(vreg_idx) if allocated + stack_slots: Vec<Option<usize>>, + /// Copy of Assembler::stack_base_idx. Used for calculating stack slot offsets. + stack_base_idx: usize, +} + +impl StackState { + /// Initialize a stack allocator + pub(super) fn new(stack_base_idx: usize) -> Self { + StackState { + stack_size: 0, + stack_slots: vec![], + stack_base_idx, + } + } + + /// Allocate a stack slot for a given vreg_idx + fn alloc_stack(&mut self, vreg_idx: usize) -> Opnd { + for stack_idx in 0..self.stack_size { + if self.stack_slots[stack_idx].is_none() { + self.stack_slots[stack_idx] = Some(vreg_idx); + return Opnd::mem(64, NATIVE_BASE_PTR, self.stack_idx_to_disp(stack_idx)); + } + } + // Every stack slot is in use. Allocate a new stack slot. + self.stack_size += 1; + self.stack_slots.push(Some(vreg_idx)); + Opnd::mem(64, NATIVE_BASE_PTR, self.stack_idx_to_disp(self.stack_slots.len() - 1)) + } + + /// Deallocate a stack slot for a given disp + fn dealloc_stack(&mut self, disp: i32) { + let stack_idx = self.disp_to_stack_idx(disp); + if self.stack_slots[stack_idx].is_some() { + self.stack_slots[stack_idx] = None; + } + } + + /// Convert the `disp` of a stack slot operand to the stack index + fn disp_to_stack_idx(&self, disp: i32) -> usize { + (-disp / SIZEOF_VALUE_I32) as usize - self.stack_base_idx - 1 + } + + /// Convert a stack index to the `disp` of the stack slot + fn stack_idx_to_disp(&self, stack_idx: usize) -> i32 { + (self.stack_base_idx + stack_idx + 1) as i32 * -SIZEOF_VALUE_I32 + } + + /// Convert Mem to MemBase::Stack + fn mem_to_stack_membase(&self, mem: Mem) -> MemBase { + match mem { + Mem { base: MemBase::Reg(reg_no), disp, num_bits } if NATIVE_BASE_PTR.unwrap_reg().reg_no == reg_no => { + let stack_idx = self.disp_to_stack_idx(disp); + MemBase::Stack { stack_idx, num_bits } + } + _ => unreachable!(), + } + } + + /// Convert MemBase::Stack to Mem + pub(super) fn stack_membase_to_mem(&self, membase: MemBase) -> Mem { + match membase { + MemBase::Stack { stack_idx, num_bits } => { + let disp = self.stack_idx_to_disp(stack_idx); + Mem { base: MemBase::Reg(NATIVE_BASE_PTR.unwrap_reg().reg_no), disp, num_bits } + } + _ => unreachable!(), + } + } +} + /// RegisterPool manages which registers are used by which VReg struct RegisterPool { /// List of registers that can be allocated @@ -1155,45 +1236,54 @@ struct RegisterPool { /// The number of live registers. /// Provides a quick way to query `pool.filter(|r| r.is_some()).count()` live_regs: usize, + + /// Fallback to let StackState allocate stack slots when RegisterPool runs out of registers. 
+ stack_state: StackState, } impl RegisterPool { /// Initialize a register pool - fn new(regs: Vec<Reg>) -> Self { + fn new(regs: Vec<Reg>, stack_base_idx: usize) -> Self { let pool = vec![None; regs.len()]; RegisterPool { regs, pool, live_regs: 0, + stack_state: StackState::new(stack_base_idx), } } /// Mutate the pool to indicate that the register at the index /// has been allocated and is live. - fn alloc_reg(&mut self, vreg_idx: usize) -> Option<Reg> { + fn alloc_opnd(&mut self, vreg_idx: usize) -> Opnd { for (reg_idx, reg) in self.regs.iter().enumerate() { if self.pool[reg_idx].is_none() { self.pool[reg_idx] = Some(vreg_idx); self.live_regs += 1; - return Some(*reg); + return Opnd::Reg(*reg); } } - None + self.stack_state.alloc_stack(vreg_idx) } /// Allocate a specific register - fn take_reg(&mut self, reg: &Reg, vreg_idx: usize) -> Reg { + fn take_reg(&mut self, reg: &Reg, vreg_idx: usize) -> Opnd { let reg_idx = self.regs.iter().position(|elem| elem.reg_no == reg.reg_no) .unwrap_or_else(|| panic!("Unable to find register: {}", reg.reg_no)); assert_eq!(self.pool[reg_idx], None, "register already allocated for VReg({:?})", self.pool[reg_idx]); self.pool[reg_idx] = Some(vreg_idx); self.live_regs += 1; - *reg + Opnd::Reg(*reg) } // Mutate the pool to indicate that the given register is being returned // as it is no longer used by the instruction that previously held it. - fn dealloc_reg(&mut self, reg: &Reg) { + fn dealloc_opnd(&mut self, opnd: &Opnd) { + if let Opnd::Mem(Mem { disp, .. }) = *opnd { + return self.stack_state.dealloc_stack(disp); + } + + let reg = opnd.unwrap_reg(); let reg_idx = self.regs.iter().position(|elem| elem.reg_no == reg.reg_no) .unwrap_or_else(|| panic!("Unable to find register: {}", reg.reg_no)); if self.pool[reg_idx].is_some() { @@ -1428,61 +1518,40 @@ impl Assembler /// registers because their output is used as the operand on a subsequent /// instruction. This is our implementation of the linear scan algorithm. pub(super) fn alloc_regs(mut self, regs: Vec<Reg>) -> Result<Assembler, CompileError> { - // Dump live registers for register spill debugging. - fn dump_live_regs(insns: Vec<Insn>, live_ranges: Vec<LiveRange>, num_regs: usize, spill_index: usize) { - // Convert live_ranges to live_regs: the number of live registers at each index - let mut live_regs: Vec<usize> = vec![]; - for insn_idx in 0..insns.len() { - let live_count = live_ranges.iter().filter(|range| - match (range.start, range.end) { - (Some(start), Some(end)) => start <= insn_idx && insn_idx <= end, - _ => false, - } - ).count(); - live_regs.push(live_count); - } - - // Dump insns along with live registers - for (insn_idx, insn) in insns.iter().enumerate() { - eprint!("{:3} ", if spill_index == insn_idx { "==>" } else { "" }); - for reg in 0..=num_regs { - eprint!("{:1}", if reg < live_regs[insn_idx] { "|" } else { "" }); - } - eprintln!(" [{:3}] {:?}", insn_idx, insn); - } - } - // First, create the pool of registers. - let mut pool = RegisterPool::new(regs.clone()); + let mut pool = RegisterPool::new(regs.clone(), self.stack_base_idx); - // Mapping between VReg and allocated VReg for each VReg index. - // None if no register has been allocated for the VReg. - let mut reg_mapping: Vec<Option<Reg>> = vec![None; self.live_ranges.len()]; + // Mapping between VReg and register or stack slot for each VReg index. + // None if no register or stack slot has been allocated for the VReg. 
+ let mut vreg_opnd: Vec<Option<Opnd>> = vec![None; self.live_ranges.len()]; // List of registers saved before a C call, paired with the VReg index. let mut saved_regs: Vec<(Reg, usize)> = vec![]; + // Remember the indexes of Insn::FrameSetup to update the stack size later + let mut frame_setup_idxs: Vec<usize> = vec![]; + // live_ranges is indexed by original `index` given by the iterator. let mut asm = Assembler::new_with_asm(&self); let live_ranges: Vec<LiveRange> = take(&mut self.live_ranges); let mut iterator = self.insns.into_iter().enumerate().peekable(); while let Some((index, mut insn)) = iterator.next() { + // Remember the index of FrameSetup to bump slot_count when we know the max number of spilled VRegs. + if let Insn::FrameSetup { .. } = insn { + frame_setup_idxs.push(asm.insns.len()); + } + let before_ccall = match (&insn, iterator.peek().map(|(_, insn)| insn)) { (Insn::ParallelMov { .. }, Some(Insn::CCall { .. })) | (Insn::CCall { .. }, _) if !pool.is_empty() => { // If C_RET_REG is in use, move it to another register. // This must happen before last-use registers are deallocated. if let Some(vreg_idx) = pool.vreg_for(&C_RET_REG) { - let new_reg = if let Some(new_reg) = pool.alloc_reg(vreg_idx) { - new_reg - } else { - debug!("spilling VReg is not implemented yet, can't evacuate C_RET_REG on CCall"); - return Err(CompileError::RegisterSpillOnCCall); - }; - asm.mov(Opnd::Reg(new_reg), C_RET_OPND); - pool.dealloc_reg(&C_RET_REG); - reg_mapping[vreg_idx] = Some(new_reg); + let new_opnd = pool.alloc_opnd(vreg_idx); + asm.mov(new_opnd, C_RET_OPND); + pool.dealloc_opnd(&Opnd::Reg(C_RET_REG)); + vreg_opnd[vreg_idx] = Some(new_opnd); } true @@ -1501,8 +1570,8 @@ impl Assembler // uses this operand. If it is, we can return the allocated // register to the pool. if live_ranges[idx].end() == index { - if let Some(reg) = reg_mapping[idx] { - pool.dealloc_reg(®); + if let Some(opnd) = vreg_opnd[idx] { + pool.dealloc_opnd(&opnd); } else { unreachable!("no register allocated for insn {:?}", insn); } @@ -1520,7 +1589,7 @@ impl Assembler // Save live registers for &(reg, _) in saved_regs.iter() { asm.cpush(Opnd::Reg(reg)); - pool.dealloc_reg(®); + pool.dealloc_opnd(&Opnd::Reg(reg)); } // On x86_64, maintain 16-byte stack alignment if cfg!(target_arch = "x86_64") && saved_regs.len() % 2 == 1 { @@ -1560,7 +1629,7 @@ impl Assembler if let Some(Opnd::VReg{ idx, .. }) = opnd_iter.next() { if live_ranges[*idx].end() == index { - if let Some(reg) = reg_mapping[*idx] { + if let Some(Opnd::Reg(reg)) = vreg_opnd[*idx] { out_reg = Some(pool.take_reg(®, vreg_idx)); } } @@ -1569,23 +1638,7 @@ impl Assembler // Allocate a new register for this instruction if one is not // already allocated. - if out_reg.is_none() { - out_reg = match pool.alloc_reg(vreg_idx) { - Some(reg) => Some(reg), - None => { - if get_option!(debug) { - let mut insns = asm.insns; - insns.push(insn); - for (_, insn) in iterator.by_ref() { - insns.push(insn); - } - dump_live_regs(insns, live_ranges, regs.len(), index); - } - debug!("Register spill not supported"); - return Err(CompileError::RegisterSpillOnAlloc); - } - }; - } + let out_opnd = out_reg.unwrap_or_else(|| pool.alloc_opnd(vreg_idx)); // Set the output operand on the instruction let out_num_bits = Opnd::match_num_bits_iter(insn.opnd_iter()); @@ -1594,9 +1647,9 @@ impl Assembler // output operand on this instruction because the live range // extends beyond the index of the instruction. 
let out = insn.out_opnd_mut().unwrap(); - let reg = out_reg.unwrap().with_num_bits(out_num_bits); - reg_mapping[out.vreg_idx()] = Some(reg); - *out = Opnd::Reg(reg); + let out_opnd = out_opnd.with_num_bits(out_num_bits); + vreg_opnd[out.vreg_idx()] = Some(out_opnd); + *out = out_opnd; } // Replace VReg and Param operands by their corresponding register @@ -1604,11 +1657,15 @@ impl Assembler while let Some(opnd) = opnd_iter.next() { match *opnd { Opnd::VReg { idx, num_bits } => { - *opnd = Opnd::Reg(reg_mapping[idx].unwrap()).with_num_bits(num_bits); + *opnd = vreg_opnd[idx].unwrap().with_num_bits(num_bits); }, Opnd::Mem(Mem { base: MemBase::VReg(idx), disp, num_bits }) => { - let base = MemBase::Reg(reg_mapping[idx].unwrap().reg_no); - *opnd = Opnd::Mem(Mem { base, disp, num_bits }); + *opnd = match vreg_opnd[idx].unwrap() { + Opnd::Reg(reg) => Opnd::Mem(Mem { base: MemBase::Reg(reg.reg_no), disp, num_bits }), + // If the base is spilled, lower it to MemBase::Stack, which scratch_split will lower to MemBase::Reg. + Opnd::Mem(mem) => Opnd::Mem(Mem { base: pool.stack_state.mem_to_stack_membase(mem), disp, num_bits }), + _ => unreachable!(), + } } _ => {}, } @@ -1618,8 +1675,8 @@ impl Assembler // register if let Some(idx) = vreg_idx { if live_ranges[idx].end() == index { - if let Some(reg) = reg_mapping[idx] { - pool.dealloc_reg(®); + if let Some(opnd) = vreg_opnd[idx] { + pool.dealloc_opnd(&opnd); } else { unreachable!("no register allocated for insn {:?}", insn); } @@ -1671,6 +1728,16 @@ impl Assembler } } + // Extend the stack space for spilled operands + for frame_setup_idx in frame_setup_idxs { + match &mut asm.insns[frame_setup_idx] { + Insn::FrameSetup { slot_count, .. } => { + *slot_count += pool.stack_state.stack_size; + } + _ => unreachable!(), + } + } + assert!(pool.is_empty(), "Expected all registers to be returned to the pool"); Ok(asm) } diff --git a/zjit/src/backend/x86_64/mod.rs b/zjit/src/backend/x86_64/mod.rs index 14c0df8dd0..1d5d90a856 100644 --- a/zjit/src/backend/x86_64/mod.rs +++ b/zjit/src/backend/x86_64/mod.rs @@ -1,4 +1,4 @@ -use std::mem::take; +use std::mem::{self, take}; use crate::asm::*; use crate::asm::x86_64::*; @@ -97,13 +97,13 @@ pub const ALLOC_REGS: &[Reg] = &[ RCX_REG, R8_REG, R9_REG, - R10_REG, RAX_REG, ]; /// Special scratch register for intermediate processing. It should be used only by /// [`Assembler::x86_scratch_split`] or [`Assembler::new_with_scratch_reg`]. const SCRATCH0_OPND: Opnd = Opnd::Reg(R11_REG); +const SCRATCH1_OPND: Opnd = Opnd::Reg(R10_REG); impl Assembler { /// Return an Assembler with scratch registers disabled in the backend, and a scratch register. @@ -395,13 +395,13 @@ impl Assembler { /// without requiring more registers to be available in the register /// allocator. So we just use the SCRATCH0_OPND register temporarily to hold /// the value before we immediately use it. 
- fn split_64bit_immediate(asm: &mut Assembler, opnd: Opnd) -> Opnd { + fn split_64bit_immediate(asm: &mut Assembler, opnd: Opnd, scratch_opnd: Opnd) -> Opnd { match opnd { Opnd::Imm(value) => { // 32-bit values will be sign-extended if imm_num_bits(value) > 32 { - asm.mov(SCRATCH0_OPND, opnd); - SCRATCH0_OPND + asm.mov(scratch_opnd, opnd); + scratch_opnd } else { opnd } @@ -409,8 +409,8 @@ impl Assembler { Opnd::UImm(value) => { // 32-bit values will be sign-extended if imm_num_bits(value as i64) > 32 { - asm.mov(SCRATCH0_OPND, opnd); - SCRATCH0_OPND + asm.mov(scratch_opnd, opnd); + scratch_opnd } else { Opnd::Imm(value as i64) } @@ -419,23 +419,102 @@ impl Assembler { } } - let mut asm = Assembler::new_with_asm(&self); + /// If a given operand is Opnd::Mem and it uses MemBase::Stack, lower it to MemBase::Reg using a scratch regsiter. + fn split_stack_membase(asm: &mut Assembler, opnd: Opnd, scratch_opnd: Opnd, stack_state: &StackState) -> Opnd { + if let Opnd::Mem(Mem { base: stack_membase @ MemBase::Stack { .. }, disp, num_bits }) = opnd { + let base = Opnd::Mem(stack_state.stack_membase_to_mem(stack_membase)); + asm.load_into(scratch_opnd, base); + Opnd::Mem(Mem { base: MemBase::Reg(scratch_opnd.unwrap_reg().reg_no), disp, num_bits }) + } else { + opnd + } + } + + /// If opnd is Opnd::Mem, set scratch_reg to *opnd. Return Some(Opnd::Mem) if it needs to be written back from scratch_reg. + fn split_memory_write(opnd: &mut Opnd, scratch_opnd: Opnd) -> Option<Opnd> { + if let Opnd::Mem(_) = opnd { + let mem_opnd = opnd.clone(); + *opnd = opnd.num_bits().map(|num_bits| scratch_opnd.with_num_bits(num_bits)).unwrap_or(scratch_opnd); + Some(mem_opnd) + } else { + None + } + } + + /// If both opnd and other are Opnd::Mem, split opnd with scratch_opnd. + fn split_if_both_memory(asm: &mut Assembler, opnd: Opnd, other: Opnd, scratch_opnd: Opnd) -> Opnd { + if let (Opnd::Mem(_), Opnd::Mem(_)) = (opnd, other) { + asm.load_into(scratch_opnd.with_num_bits(opnd.rm_num_bits()), opnd); + scratch_opnd.with_num_bits(opnd.rm_num_bits()) + } else { + opnd + } + } + + /// Move src to dst, splitting it with scratch_opnd if it's a Mem-to-Mem move. Skip it if dst == src. + fn asm_mov(asm: &mut Assembler, dst: Opnd, src: Opnd, scratch_opnd: Opnd) { + if dst != src { + if let (Opnd::Mem(_), Opnd::Mem(_)) = (dst, src) { + asm.mov(scratch_opnd, src); + asm.mov(dst, scratch_opnd); + } else { + asm.mov(dst, src); + } + } + } + + // Prepare StackState to lower MemBase::Stack + let stack_state = StackState::new(self.stack_base_idx); + + let mut asm_local = Assembler::new_with_asm(&self); + let asm = &mut asm_local; asm.accept_scratch_reg = true; let mut iterator = self.insns.into_iter().enumerate().peekable(); while let Some((_, mut insn)) = iterator.next() { match &mut insn { - Insn::Add { right, .. } | - Insn::Sub { right, .. } | - Insn::Mul { right, .. } | - Insn::And { right, .. } | - Insn::Or { right, .. } | - Insn::Xor { right, .. } | - Insn::Test { right, .. 
} => { - *right = split_64bit_immediate(&mut asm, *right); + Insn::Add { left, right, out } | + Insn::Sub { left, right, out } | + Insn::And { left, right, out } | + Insn::Or { left, right, out } | + Insn::Xor { left, right, out } => { + *left = split_stack_membase(asm, *left, SCRATCH0_OPND, &stack_state); + *left = split_if_both_memory(asm, *left, *right, SCRATCH0_OPND); + *right = split_stack_membase(asm, *right, SCRATCH1_OPND, &stack_state); + *right = split_64bit_immediate(asm, *right, SCRATCH1_OPND); + + let (out, left) = (*out, *left); + asm.push_insn(insn); + asm_mov(asm, out, left, SCRATCH0_OPND); + } + Insn::Mul { left, right, out } => { + *left = split_stack_membase(asm, *left, SCRATCH0_OPND, &stack_state); + *left = split_if_both_memory(asm, *left, *right, SCRATCH0_OPND); + *right = split_stack_membase(asm, *right, SCRATCH1_OPND, &stack_state); + *right = split_64bit_immediate(asm, *right, SCRATCH1_OPND); + + // imul doesn't have (Mem, Reg) encoding. Swap left and right in that case. + if let (Opnd::Mem(_), Opnd::Reg(_)) = (&left, &right) { + mem::swap(left, right); + } + + let (out, left) = (*out, *left); asm.push_insn(insn); + asm_mov(asm, out, left, SCRATCH0_OPND); } + &mut Insn::Not { opnd, out } | + &mut Insn::LShift { opnd, out, .. } | + &mut Insn::RShift { opnd, out, .. } | + &mut Insn::URShift { opnd, out, .. } => { + asm.push_insn(insn); + asm_mov(asm, out, opnd, SCRATCH0_OPND); + } + Insn::Test { left, right } | Insn::Cmp { left, right } => { + *left = split_stack_membase(asm, *left, SCRATCH1_OPND, &stack_state); + *right = split_stack_membase(asm, *right, SCRATCH0_OPND, &stack_state); + *right = split_if_both_memory(asm, *right, *left, SCRATCH0_OPND); + let num_bits = match right { Opnd::Imm(value) => Some(imm_num_bits(*value)), Opnd::UImm(value) => Some(uimm_num_bits(*value)), @@ -450,7 +529,7 @@ impl Assembler { // directly in the instruction. let use_imm = num_bits.is_some() && left.num_bits() == num_bits && num_bits.unwrap() < 64; if !use_imm { - *right = split_64bit_immediate(&mut asm, *right); + *right = split_64bit_immediate(asm, *right, SCRATCH0_OPND); } asm.push_insn(insn); } @@ -462,16 +541,19 @@ impl Assembler { *opnds = vec![]; asm.push_insn(insn); } - &mut Insn::Lea { opnd, out } => { - match (opnd, out) { - // Split here for compile_exits - (Opnd::Mem(_), Opnd::Mem(_)) => { - asm.lea_into(SCRATCH0_OPND, opnd); - asm.store(out, SCRATCH0_OPND); - } - _ => { - asm.push_insn(insn); - } + Insn::CSelZ { out, .. } | + Insn::CSelNZ { out, .. } | + Insn::CSelE { out, .. } | + Insn::CSelNE { out, .. } | + Insn::CSelL { out, .. } | + Insn::CSelLE { out, .. } | + Insn::CSelG { out, .. } | + Insn::CSelGE { out, .. } | + Insn::Lea { out, .. } => { + let mem_out = split_memory_write(out, SCRATCH0_OPND); + asm.push_insn(insn); + if let Some(mem_out) = mem_out { + asm.store(mem_out, SCRATCH0_OPND); } } Insn::LeaJumpTarget { target, out } => { @@ -480,6 +562,15 @@ impl Assembler { asm.mov(*out, SCRATCH0_OPND); } } + Insn::Load { out, opnd } | + Insn::LoadInto { dest: out, opnd } => { + *opnd = split_stack_membase(asm, *opnd, SCRATCH0_OPND, &stack_state); + let mem_out = split_memory_write(out, SCRATCH0_OPND); + asm.push_insn(insn); + if let Some(mem_out) = mem_out { + asm.store(mem_out, SCRATCH0_OPND.with_num_bits(mem_out.rm_num_bits())); + } + } // Convert Opnd::const_ptr into Opnd::Mem. This split is done here to give // a register for compile_exits. 
&mut Insn::IncrCounter { mem, value } => { @@ -487,17 +578,19 @@ impl Assembler { asm.load_into(SCRATCH0_OPND, mem); asm.incr_counter(Opnd::mem(64, SCRATCH0_OPND, 0), value); } + &mut Insn::Mov { dest, src } => { + asm_mov(asm, dest, src, SCRATCH0_OPND); + } // Resolve ParallelMov that couldn't be handled without a scratch register. Insn::ParallelMov { moves } => { for (dst, src) in Self::resolve_parallel_moves(&moves, Some(SCRATCH0_OPND)).unwrap() { - asm.mov(dst, src) + asm_mov(asm, dst, src, SCRATCH0_OPND); } } // Handle various operand combinations for spills on compile_exits. &mut Insn::Store { dest, src } => { - let Opnd::Mem(Mem { num_bits, .. }) = dest else { - panic!("Unexpected Insn::Store destination in x86_scratch_split: {dest:?}"); - }; + let num_bits = dest.rm_num_bits(); + let dest = split_stack_membase(asm, dest, SCRATCH1_OPND, &stack_state); let src = match src { Opnd::Reg(_) => src, @@ -541,7 +634,7 @@ impl Assembler { } } - asm + asm_local } /// Emit platform-specific machine code |
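
A closing x86-specific note: two-operand `imul` only encodes `reg <- reg * r/m`, so a `(Mem, Reg)` multiply must have its operands swapped, and this commit moves that swap out of the assembler (which now panics on that combination, see `test_imul_mem_reg`) into `x86_scratch_split`, the pass that also knows whether the product must be stored back to a memory output. A minimal, hypothetical illustration of the swap decision:

```rust
// Not the actual backend code: a tiny model of normalizing imul operands so the
// left operand (the implicit destination of two-operand imul) is a register.
use std::mem;

#[derive(Clone, Copy, Debug, PartialEq)]
enum Opnd { Reg(u8), Mem { base: u8, disp: i32 } }

/// Swap a (Mem, Reg) multiply into (Reg, Mem). Returns true if swapped.
fn normalize_imul(left: &mut Opnd, right: &mut Opnd) -> bool {
    if matches!((*left, *right), (Opnd::Mem { .. }, Opnd::Reg(_))) {
        mem::swap(left, right);
        true
    } else {
        false
    }
}

fn main() {
    let mut left = Opnd::Mem { base: 0, disp: 8 };
    let mut right = Opnd::Reg(2);
    let swapped = normalize_imul(&mut left, &mut right);
    // After the swap, `left` is the register that receives the product, so the
    // split pass knows the original Mem operand may still need a store of the result.
    assert!(swapped);
    assert_eq!(left, Opnd::Reg(2));
    assert_eq!(right, Opnd::Mem { base: 0, disp: 8 });
    println!("left={left:?} right={right:?}");
}
```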
