| field | value | date |
|---|---|---|
| author | Takashi Kokubun <takashi.kokubun@shopify.com> | 2025-11-03 16:49:25 -0800 |
| committer | GitHub <noreply@github.com> | 2025-11-03 16:49:25 -0800 |
| commit | 9ca940757315e53d1eaddc83071b1b4581f8f578 | |
| tree | 82429b9d0516f431f07207a1c820e9045226f3b3 | |
| parent | 0d210f4d39c1cff71f34dbe9938345897f3e5eb1 | |
ZJIT: Implement register spill (#14936)
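
The change in a sentence: when the linear-scan register allocator runs out of physical registers, ZJIT now spills the virtual register to a slot on the native C stack (tracked by a new `StackState`) instead of failing compilation with `RegisterSpillOnAlloc`/`RegisterSpillOnCCall`. The sketch below is a deliberately simplified illustration of that fallback, using made-up names and types rather than the actual `RegisterPool`/`StackState` code in `zjit/src/backend/lir.rs` further down:

```rust
// Hypothetical, simplified sketch of the spill fallback: when every physical
// register is live, hand out a stack slot instead of aborting compilation.
// Names and types are illustrative, not the actual ZJIT definitions.

#[derive(Clone, Copy, Debug, PartialEq)]
enum Location {
    Reg(u8),               // physical register number
    Stack { slot: usize }, // spill slot index below the frame base
}

struct Allocator {
    regs: Vec<u8>,              // allocatable registers
    in_use: Vec<Option<usize>>, // which vreg holds each register, if any
    slots: Vec<Option<usize>>,  // which vreg holds each spill slot, if any
}

impl Allocator {
    fn new(regs: Vec<u8>) -> Self {
        let in_use = vec![None; regs.len()];
        Allocator { regs, in_use, slots: vec![] }
    }

    /// Allocate a location for `vreg`: prefer a free register, otherwise spill.
    fn alloc(&mut self, vreg: usize) -> Location {
        // Take the first free register, if any.
        if let Some(i) = self.in_use.iter().position(|u| u.is_none()) {
            self.in_use[i] = Some(vreg);
            return Location::Reg(self.regs[i]);
        }
        // All registers are live: reuse a free spill slot or grow the frame.
        if let Some(slot) = self.slots.iter().position(|s| s.is_none()) {
            self.slots[slot] = Some(vreg);
            return Location::Stack { slot };
        }
        self.slots.push(Some(vreg));
        Location::Stack { slot: self.slots.len() - 1 }
    }

    /// Release whatever location `vreg` holds once its live range ends.
    fn free(&mut self, loc: Location) {
        match loc {
            Location::Reg(r) => {
                let i = self.regs.iter().position(|&x| x == r).unwrap();
                self.in_use[i] = None;
            }
            Location::Stack { slot } => self.slots[slot] = None,
        }
    }
}

fn main() {
    let mut a = Allocator::new(vec![0, 1]); // only two registers
    let l0 = a.alloc(100);
    let l1 = a.alloc(101);
    let l2 = a.alloc(102); // no register left -> spilled
    assert_eq!(l2, Location::Stack { slot: 0 });
    a.free(l1);
    assert_eq!(a.alloc(103), Location::Reg(1)); // register 1 is free again
    println!("{l0:?} {l1:?} {l2:?}");
}
```

In the real allocator the returned location is an `Opnd` (`Opnd::Reg` or a `NATIVE_BASE_PTR`-relative `Opnd::Mem`), and the maximum number of slots ever used is patched into `Insn::FrameSetup`'s `slot_count` at the end of `alloc_regs`.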
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | zjit/src/asm/x86_64/mod.rs | 7 |
| -rw-r--r-- | zjit/src/asm/x86_64/tests.rs | 18 |
| -rw-r--r-- | zjit/src/backend/arm64/mod.rs | 187 |
| -rw-r--r-- | zjit/src/backend/lir.rs | 213 |
| -rw-r--r-- | zjit/src/backend/x86_64/mod.rs | 157 |
5 files changed, 431 insertions, 151 deletions
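
Spilled VRegs introduce a wrinkle: a memory operand whose *base* is a spilled VReg can no longer be encoded directly, so `alloc_regs` rewrites such bases to the new `MemBase::Stack` variant and the per-architecture `*_scratch_split` passes lower it back to a register base using a scratch register. The following is a rough, self-contained sketch of that lowering; the types, register numbers, and the `SIZEOF_VALUE` constant are illustrative stand-ins, not the backend's real definitions:

```rust
// Rough sketch (not the real backend code) of how a memory operand whose base
// was spilled gets lowered in scratch_split: first load the base pointer from
// its stack slot into a scratch register, then rewrite the operand to use that
// register as its base.

const SIZEOF_VALUE: i32 = 8; // VALUE is 8 bytes on 64-bit targets

#[derive(Clone, Copy, Debug)]
enum Base {
    Reg(u8),
    Stack { slot: usize }, // spilled base: lives below the native frame base
}

#[derive(Clone, Copy, Debug)]
struct Mem { base: Base, disp: i32 }

#[derive(Debug)]
enum Insn {
    Load { dst_reg: u8, src: Mem }, // dst_reg <- [src]
}

const FRAME_BASE_REG: u8 = 29; // e.g. x29 / rbp
const SCRATCH_REG: u8 = 17;    // e.g. x17 / r11

/// Turn a Stack-based memory operand into a Reg-based one, emitting the
/// extra load into `out` when needed.
fn lower_stack_base(mem: Mem, stack_base_idx: usize, out: &mut Vec<Insn>) -> Mem {
    match mem.base {
        Base::Reg(_) => mem,
        Base::Stack { slot } => {
            // Address of the spill slot relative to the frame base.
            let slot_disp = -((stack_base_idx + slot + 1) as i32) * SIZEOF_VALUE;
            out.push(Insn::Load {
                dst_reg: SCRATCH_REG,
                src: Mem { base: Base::Reg(FRAME_BASE_REG), disp: slot_disp },
            });
            Mem { base: Base::Reg(SCRATCH_REG), disp: mem.disp }
        }
    }
}

fn main() {
    let mut insns = vec![];
    // A load from [spilled_base + 16], where the base VReg sits in spill slot 0.
    let opnd = Mem { base: Base::Stack { slot: 0 }, disp: 16 };
    let lowered = lower_stack_base(opnd, 2, &mut insns);
    println!("extra insns: {insns:?}");
    println!("lowered operand: {lowered:?}");
}
```

The displacement arithmetic mirrors `StackState::stack_idx_to_disp` in the diff: slot `i` lives at `-(stack_base_idx + i + 1) * SIZEOF_VALUE` bytes off the native base pointer, so with `stack_base_idx = 2` the first spill slot sits at `[frame_base - 24]`.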
diff --git a/zjit/src/asm/x86_64/mod.rs b/zjit/src/asm/x86_64/mod.rs index 9d3bf18dcd..cfedca4540 100644 --- a/zjit/src/asm/x86_64/mod.rs +++ b/zjit/src/asm/x86_64/mod.rs @@ -779,13 +779,6 @@ pub fn imul(cb: &mut CodeBlock, opnd0: X86Opnd, opnd1: X86Opnd) { write_rm(cb, false, true, opnd0, opnd1, None, &[0x0F, 0xAF]); } - // Flip the operands to handle this case. This instruction has weird encoding restrictions. - (X86Opnd::Mem(_), X86Opnd::Reg(_)) => { - //REX.W + 0F AF /rIMUL r64, r/m64 - // Quadword register := Quadword register * r/m64. - write_rm(cb, false, true, opnd1, opnd0, None, &[0x0F, 0xAF]); - } - _ => unreachable!() } } diff --git a/zjit/src/asm/x86_64/tests.rs b/zjit/src/asm/x86_64/tests.rs index 0f86725946..d574bdb034 100644 --- a/zjit/src/asm/x86_64/tests.rs +++ b/zjit/src/asm/x86_64/tests.rs @@ -228,23 +228,29 @@ fn test_cqo() { fn test_imul() { let cb1 = compile(|cb| imul(cb, RAX, RBX)); let cb2 = compile(|cb| imul(cb, RDX, mem_opnd(64, RAX, 0))); - // Operands flipped for encoding since multiplication is commutative - let cb3 = compile(|cb| imul(cb, mem_opnd(64, RAX, 0), RDX)); - assert_disasm_snapshot!(disasms!(cb1, cb2, cb3), @r" + assert_disasm_snapshot!(disasms!(cb1, cb2), @r" 0x0: imul rax, rbx 0x0: imul rdx, qword ptr [rax] - 0x0: imul rdx, qword ptr [rax] "); - assert_snapshot!(hexdumps!(cb1, cb2, cb3), @r" + assert_snapshot!(hexdumps!(cb1, cb2), @r" 480fafc3 480faf10 - 480faf10 "); } #[test] +#[should_panic] +fn test_imul_mem_reg() { + // imul doesn't have (Mem, Reg) encoding. Since multiplication is communicative, imul() could + // swap operands. However, x86_scratch_split may need to move the result to the output operand, + // which can be complicated if the assembler may sometimes change the result operand. + // So x86_scratch_split should be responsible for that swap, not the assembler. + compile(|cb| imul(cb, mem_opnd(64, RAX, 0), RDX)); +} + +#[test] fn test_jge_label() { let cb = compile(|cb| { let label_idx = cb.new_label("loop".to_owned()); diff --git a/zjit/src/backend/arm64/mod.rs b/zjit/src/backend/arm64/mod.rs index d762b14c91..acf0576f9c 100644 --- a/zjit/src/backend/arm64/mod.rs +++ b/zjit/src/backend/arm64/mod.rs @@ -79,6 +79,9 @@ impl From<Opnd> for A64Opnd { Opnd::Mem(Mem { base: MemBase::VReg(_), .. }) => { panic!("attempted to lower an Opnd::Mem with a MemBase::VReg base") }, + Opnd::Mem(Mem { base: MemBase::Stack { .. }, .. }) => { + panic!("attempted to lower an Opnd::Mem with a MemBase::Stack base") + }, Opnd::VReg { .. } => panic!("attempted to lower an Opnd::VReg"), Opnd::Value(_) => panic!("attempted to lower an Opnd::Value"), Opnd::None => panic!( @@ -203,6 +206,7 @@ pub const ALLOC_REGS: &[Reg] = &[ /// [`Assembler::arm64_scratch_split`] or [`Assembler::new_with_scratch_reg`]. const SCRATCH0_OPND: Opnd = Opnd::Reg(X15_REG); const SCRATCH1_OPND: Opnd = Opnd::Reg(X17_REG); +const SCRATCH2_OPND: Opnd = Opnd::Reg(X14_REG); impl Assembler { /// Special register for intermediate processing in arm64_emit. It should be used only by arm64_emit. @@ -690,22 +694,129 @@ impl Assembler { /// need to be split with registers after `alloc_regs`, e.g. for `compile_exits`, so this /// splits them and uses scratch registers for it. fn arm64_scratch_split(self) -> Assembler { - let mut asm = Assembler::new_with_asm(&self); + /// If opnd is Opnd::Mem with a too large disp, make the disp smaller using lea. + fn split_large_disp(asm: &mut Assembler, opnd: Opnd, scratch_opnd: Opnd) -> Opnd { + match opnd { + Opnd::Mem(Mem { num_bits, disp, .. 
}) if !mem_disp_fits_bits(disp) => { + asm.lea_into(scratch_opnd, opnd); + Opnd::mem(num_bits, scratch_opnd, 0) + } + _ => opnd, + } + } + + /// If opnd is Opnd::Mem with MemBase::Stack, lower it to Opnd::Mem with MemBase::Reg, and split a large disp. + fn split_stack_membase(asm: &mut Assembler, opnd: Opnd, scratch_opnd: Opnd, stack_state: &StackState) -> Opnd { + let opnd = split_only_stack_membase(asm, opnd, scratch_opnd, stack_state); + split_large_disp(asm, opnd, scratch_opnd) + } + + /// split_stack_membase but without split_large_disp. This should be used only by lea. + fn split_only_stack_membase(asm: &mut Assembler, opnd: Opnd, scratch_opnd: Opnd, stack_state: &StackState) -> Opnd { + if let Opnd::Mem(Mem { base: stack_membase @ MemBase::Stack { .. }, disp: opnd_disp, num_bits: opnd_num_bits }) = opnd { + let base = Opnd::Mem(stack_state.stack_membase_to_mem(stack_membase)); + let base = split_large_disp(asm, base, scratch_opnd); + asm.load_into(scratch_opnd, base); + Opnd::Mem(Mem { base: MemBase::Reg(scratch_opnd.unwrap_reg().reg_no), disp: opnd_disp, num_bits: opnd_num_bits }) + } else { + opnd + } + } + + /// If opnd is Opnd::Mem, lower it to scratch_opnd. You should use this when `opnd` is read by the instruction, not written. + fn split_memory_read(asm: &mut Assembler, opnd: Opnd, scratch_opnd: Opnd) -> Opnd { + if let Opnd::Mem(_) = opnd { + let opnd = split_large_disp(asm, opnd, scratch_opnd); + let scratch_opnd = opnd.num_bits().map(|num_bits| scratch_opnd.with_num_bits(num_bits)).unwrap_or(scratch_opnd); + asm.load_into(scratch_opnd, opnd); + scratch_opnd + } else { + opnd + } + } + + /// If opnd is Opnd::Mem, set scratch_reg to *opnd. Return Some(Opnd::Mem) if it needs to be written back from scratch_reg. + fn split_memory_write(opnd: &mut Opnd, scratch_opnd: Opnd) -> Option<Opnd> { + if let Opnd::Mem(_) = opnd { + let mem_opnd = opnd.clone(); + *opnd = opnd.num_bits().map(|num_bits| scratch_opnd.with_num_bits(num_bits)).unwrap_or(scratch_opnd); + Some(mem_opnd) + } else { + None + } + } + + // Prepare StackState to lower MemBase::Stack + let stack_state = StackState::new(self.stack_base_idx); + + let mut asm_local = Assembler::new_with_asm(&self); + let asm = &mut asm_local; asm.accept_scratch_reg = true; let mut iterator = self.insns.into_iter().enumerate().peekable(); while let Some((_, mut insn)) = iterator.next() { match &mut insn { - &mut Insn::Mul { out, .. 
} => { + Insn::Add { left, right, out } | + Insn::Sub { left, right, out } | + Insn::And { left, right, out } | + Insn::Or { left, right, out } | + Insn::Xor { left, right, out } | + Insn::CSelZ { truthy: left, falsy: right, out } | + Insn::CSelNZ { truthy: left, falsy: right, out } | + Insn::CSelE { truthy: left, falsy: right, out } | + Insn::CSelNE { truthy: left, falsy: right, out } | + Insn::CSelL { truthy: left, falsy: right, out } | + Insn::CSelLE { truthy: left, falsy: right, out } | + Insn::CSelG { truthy: left, falsy: right, out } | + Insn::CSelGE { truthy: left, falsy: right, out } => { + *left = split_memory_read(asm, *left, SCRATCH0_OPND); + *right = split_memory_read(asm, *right, SCRATCH1_OPND); + let mem_out = split_memory_write(out, SCRATCH0_OPND); + + asm.push_insn(insn); + + if let Some(mem_out) = mem_out { + let mem_out = split_large_disp(asm, mem_out, SCRATCH1_OPND); + asm.store(mem_out, SCRATCH0_OPND); + } + } + Insn::Mul { left, right, out } => { + *left = split_memory_read(asm, *left, SCRATCH0_OPND); + *right = split_memory_read(asm, *right, SCRATCH1_OPND); + let mem_out = split_memory_write(out, SCRATCH0_OPND); + let reg_out = out.clone(); + asm.push_insn(insn); + if let Some(mem_out) = mem_out { + let mem_out = split_large_disp(asm, mem_out, SCRATCH1_OPND); + asm.store(mem_out, SCRATCH0_OPND); + }; + // If the next instruction is JoMul if matches!(iterator.peek(), Some((_, Insn::JoMul(_)))) { // Produce a register that is all zeros or all ones // Based on the sign bit of the 64-bit mul result - asm.push_insn(Insn::RShift { out: SCRATCH0_OPND, opnd: out, shift: Opnd::UImm(63) }); + asm.push_insn(Insn::RShift { out: SCRATCH0_OPND, opnd: reg_out, shift: Opnd::UImm(63) }); + } + } + Insn::RShift { opnd, out, .. } => { + *opnd = split_memory_read(asm, *opnd, SCRATCH0_OPND); + let mem_out = split_memory_write(out, SCRATCH0_OPND); + + asm.push_insn(insn); + + if let Some(mem_out) = mem_out { + let mem_out = split_large_disp(asm, mem_out, SCRATCH1_OPND); + asm.store(mem_out, SCRATCH0_OPND); } } + Insn::Cmp { left, right } | + Insn::Test { left, right } => { + *left = split_memory_read(asm, *left, SCRATCH0_OPND); + *right = split_memory_read(asm, *right, SCRATCH1_OPND); + asm.push_insn(insn); + } // For compile_exits, support splitting simple C arguments here Insn::CCall { opnds, .. } if !opnds.is_empty() => { for (i, opnd) in opnds.iter().enumerate() { @@ -714,16 +825,32 @@ impl Assembler { *opnds = vec![]; asm.push_insn(insn); } - &mut Insn::Lea { opnd, out } => { - match (opnd, out) { - // Split here for compile_exits - (Opnd::Mem(_), Opnd::Mem(_)) => { - asm.lea_into(SCRATCH0_OPND, opnd); - asm.store(out, SCRATCH0_OPND); - } - _ => { - asm.push_insn(insn); + Insn::Lea { opnd, out } => { + *opnd = split_only_stack_membase(asm, *opnd, SCRATCH0_OPND, &stack_state); + let mem_out = split_memory_write(out, SCRATCH0_OPND); + + asm.push_insn(insn); + + if let Some(mem_out) = mem_out { + let mem_out = split_large_disp(asm, mem_out, SCRATCH1_OPND); + asm.store(mem_out, SCRATCH0_OPND); + } + } + Insn::Load { opnd, out } | + Insn::LoadInto { opnd, dest: out } => { + *opnd = split_stack_membase(asm, *opnd, SCRATCH0_OPND, &stack_state); + *out = split_stack_membase(asm, *out, SCRATCH1_OPND, &stack_state); + + if let Opnd::Mem(_) = out { + // If NATIVE_STACK_PTR is used as a source for Store, it's handled as xzr, storeing zero. + // To save the content of NATIVE_STACK_PTR, we need to load it into another register first. 
+ if *opnd == NATIVE_STACK_PTR { + asm.load_into(SCRATCH0_OPND, NATIVE_STACK_PTR); + *opnd = SCRATCH0_OPND; } + asm.store(*out, *opnd); + } else { + asm.push_insn(insn); } } &mut Insn::IncrCounter { mem, value } => { @@ -741,31 +868,24 @@ impl Assembler { asm.cmp(SCRATCH1_OPND, 0.into()); asm.jne(label); } - &mut Insn::Store { dest, src } => { - let Opnd::Mem(Mem { num_bits: dest_num_bits, disp: dest_disp, .. }) = dest else { - panic!("Insn::Store destination must be Opnd::Mem: {dest:?}, {src:?}"); - }; - - // Split dest using a scratch register if necessary. - let dest = if mem_disp_fits_bits(dest_disp) { - dest - } else { - asm.lea_into(SCRATCH0_OPND, dest); - Opnd::mem(dest_num_bits, SCRATCH0_OPND, 0) - }; - - asm.store(dest, src); + Insn::Store { dest, .. } => { + *dest = split_stack_membase(asm, *dest, SCRATCH0_OPND, &stack_state); + asm.push_insn(insn); } - &mut Insn::Mov { dest, src } => { + Insn::Mov { dest, src } => { + *src = split_stack_membase(asm, *src, SCRATCH0_OPND, &stack_state); + *dest = split_large_disp(asm, *dest, SCRATCH1_OPND); match dest { - Opnd::Reg(_) => asm.load_into(dest, src), - Opnd::Mem(_) => asm.store(dest, src), + Opnd::Reg(_) => asm.load_into(*dest, *src), + Opnd::Mem(_) => asm.store(*dest, *src), _ => asm.push_insn(insn), } } // Resolve ParallelMov that couldn't be handled without a scratch register. Insn::ParallelMov { moves } => { for (dst, src) in Self::resolve_parallel_moves(moves, Some(SCRATCH0_OPND)).unwrap() { + let src = split_stack_membase(asm, src, SCRATCH1_OPND, &stack_state); + let dst = split_large_disp(asm, dst, SCRATCH2_OPND); match dst { Opnd::Reg(_) => asm.load_into(dst, src), Opnd::Mem(_) => asm.store(dst, src), @@ -779,7 +899,7 @@ impl Assembler { } } - asm + asm_local } /// Emit platform-specific machine code @@ -1157,10 +1277,11 @@ impl Assembler { load_effective_address(cb, Self::EMIT_OPND, src_base_reg_no, src_disp); A64Opnd::new_mem(dest.rm_num_bits(), Self::EMIT_OPND, 0) }; + let dst = A64Opnd::Reg(Self::EMIT_REG.with_num_bits(src_num_bits)); match src_num_bits { - 64 | 32 => ldur(cb, Self::EMIT_OPND, src_mem), - 16 => ldurh(cb, Self::EMIT_OPND, src_mem), - 8 => ldurb(cb, Self::EMIT_OPND, src_mem), + 64 | 32 => ldur(cb, dst, src_mem), + 16 => ldurh(cb, dst, src_mem), + 8 => ldurb(cb, dst, src_mem), num_bits => panic!("unexpected num_bits: {num_bits}") }; Self::EMIT_REG diff --git a/zjit/src/backend/lir.rs b/zjit/src/backend/lir.rs index 584251de80..66e89a1304 100644 --- a/zjit/src/backend/lir.rs +++ b/zjit/src/backend/lir.rs @@ -28,8 +28,12 @@ pub static JIT_PRESERVED_REGS: &[Opnd] = &[CFP, SP, EC]; #[derive(Clone, Copy, PartialEq, Eq, Debug)] pub enum MemBase { + /// Register: Every Opnd::Mem should have MemBase::Reg as of emit. Reg(u8), + /// Virtual register: Lowered to MemBase::Reg or MemBase::Stack in alloc_regs. VReg(usize), + /// Stack slot: Lowered to MemBase::Reg in scratch_split. 
+ Stack { stack_idx: usize, num_bits: u8 }, } // Memory location @@ -55,6 +59,8 @@ impl fmt::Display for Mem { match self.base { MemBase::Reg(reg_no) => write!(f, "{}", mem_base_reg(reg_no))?, MemBase::VReg(idx) => write!(f, "v{idx}")?, + MemBase::Stack { stack_idx, num_bits } if num_bits == 64 => write!(f, "Stack[{stack_idx}]")?, + MemBase::Stack { stack_idx, num_bits } => write!(f, "Stack{num_bits}[{stack_idx}]")?, } if self.disp != 0 { let sign = if self.disp > 0 { '+' } else { '-' }; @@ -1143,6 +1149,81 @@ impl LiveRange { } } +/// StackState manages which stack slots are used by which VReg +pub struct StackState { + /// The maximum number of spilled VRegs at a time + stack_size: usize, + /// Map from index at the C stack for spilled VRegs to Some(vreg_idx) if allocated + stack_slots: Vec<Option<usize>>, + /// Copy of Assembler::stack_base_idx. Used for calculating stack slot offsets. + stack_base_idx: usize, +} + +impl StackState { + /// Initialize a stack allocator + pub(super) fn new(stack_base_idx: usize) -> Self { + StackState { + stack_size: 0, + stack_slots: vec![], + stack_base_idx, + } + } + + /// Allocate a stack slot for a given vreg_idx + fn alloc_stack(&mut self, vreg_idx: usize) -> Opnd { + for stack_idx in 0..self.stack_size { + if self.stack_slots[stack_idx].is_none() { + self.stack_slots[stack_idx] = Some(vreg_idx); + return Opnd::mem(64, NATIVE_BASE_PTR, self.stack_idx_to_disp(stack_idx)); + } + } + // Every stack slot is in use. Allocate a new stack slot. + self.stack_size += 1; + self.stack_slots.push(Some(vreg_idx)); + Opnd::mem(64, NATIVE_BASE_PTR, self.stack_idx_to_disp(self.stack_slots.len() - 1)) + } + + /// Deallocate a stack slot for a given disp + fn dealloc_stack(&mut self, disp: i32) { + let stack_idx = self.disp_to_stack_idx(disp); + if self.stack_slots[stack_idx].is_some() { + self.stack_slots[stack_idx] = None; + } + } + + /// Convert the `disp` of a stack slot operand to the stack index + fn disp_to_stack_idx(&self, disp: i32) -> usize { + (-disp / SIZEOF_VALUE_I32) as usize - self.stack_base_idx - 1 + } + + /// Convert a stack index to the `disp` of the stack slot + fn stack_idx_to_disp(&self, stack_idx: usize) -> i32 { + (self.stack_base_idx + stack_idx + 1) as i32 * -SIZEOF_VALUE_I32 + } + + /// Convert Mem to MemBase::Stack + fn mem_to_stack_membase(&self, mem: Mem) -> MemBase { + match mem { + Mem { base: MemBase::Reg(reg_no), disp, num_bits } if NATIVE_BASE_PTR.unwrap_reg().reg_no == reg_no => { + let stack_idx = self.disp_to_stack_idx(disp); + MemBase::Stack { stack_idx, num_bits } + } + _ => unreachable!(), + } + } + + /// Convert MemBase::Stack to Mem + pub(super) fn stack_membase_to_mem(&self, membase: MemBase) -> Mem { + match membase { + MemBase::Stack { stack_idx, num_bits } => { + let disp = self.stack_idx_to_disp(stack_idx); + Mem { base: MemBase::Reg(NATIVE_BASE_PTR.unwrap_reg().reg_no), disp, num_bits } + } + _ => unreachable!(), + } + } +} + /// RegisterPool manages which registers are used by which VReg struct RegisterPool { /// List of registers that can be allocated @@ -1155,45 +1236,54 @@ struct RegisterPool { /// The number of live registers. /// Provides a quick way to query `pool.filter(|r| r.is_some()).count()` live_regs: usize, + + /// Fallback to let StackState allocate stack slots when RegisterPool runs out of registers. 
+ stack_state: StackState, } impl RegisterPool { /// Initialize a register pool - fn new(regs: Vec<Reg>) -> Self { + fn new(regs: Vec<Reg>, stack_base_idx: usize) -> Self { let pool = vec![None; regs.len()]; RegisterPool { regs, pool, live_regs: 0, + stack_state: StackState::new(stack_base_idx), } } /// Mutate the pool to indicate that the register at the index /// has been allocated and is live. - fn alloc_reg(&mut self, vreg_idx: usize) -> Option<Reg> { + fn alloc_opnd(&mut self, vreg_idx: usize) -> Opnd { for (reg_idx, reg) in self.regs.iter().enumerate() { if self.pool[reg_idx].is_none() { self.pool[reg_idx] = Some(vreg_idx); self.live_regs += 1; - return Some(*reg); + return Opnd::Reg(*reg); } } - None + self.stack_state.alloc_stack(vreg_idx) } /// Allocate a specific register - fn take_reg(&mut self, reg: &Reg, vreg_idx: usize) -> Reg { + fn take_reg(&mut self, reg: &Reg, vreg_idx: usize) -> Opnd { let reg_idx = self.regs.iter().position(|elem| elem.reg_no == reg.reg_no) .unwrap_or_else(|| panic!("Unable to find register: {}", reg.reg_no)); assert_eq!(self.pool[reg_idx], None, "register already allocated for VReg({:?})", self.pool[reg_idx]); self.pool[reg_idx] = Some(vreg_idx); self.live_regs += 1; - *reg + Opnd::Reg(*reg) } // Mutate the pool to indicate that the given register is being returned // as it is no longer used by the instruction that previously held it. - fn dealloc_reg(&mut self, reg: &Reg) { + fn dealloc_opnd(&mut self, opnd: &Opnd) { + if let Opnd::Mem(Mem { disp, .. }) = *opnd { + return self.stack_state.dealloc_stack(disp); + } + + let reg = opnd.unwrap_reg(); let reg_idx = self.regs.iter().position(|elem| elem.reg_no == reg.reg_no) .unwrap_or_else(|| panic!("Unable to find register: {}", reg.reg_no)); if self.pool[reg_idx].is_some() { @@ -1428,61 +1518,40 @@ impl Assembler /// registers because their output is used as the operand on a subsequent /// instruction. This is our implementation of the linear scan algorithm. pub(super) fn alloc_regs(mut self, regs: Vec<Reg>) -> Result<Assembler, CompileError> { - // Dump live registers for register spill debugging. - fn dump_live_regs(insns: Vec<Insn>, live_ranges: Vec<LiveRange>, num_regs: usize, spill_index: usize) { - // Convert live_ranges to live_regs: the number of live registers at each index - let mut live_regs: Vec<usize> = vec![]; - for insn_idx in 0..insns.len() { - let live_count = live_ranges.iter().filter(|range| - match (range.start, range.end) { - (Some(start), Some(end)) => start <= insn_idx && insn_idx <= end, - _ => false, - } - ).count(); - live_regs.push(live_count); - } - - // Dump insns along with live registers - for (insn_idx, insn) in insns.iter().enumerate() { - eprint!("{:3} ", if spill_index == insn_idx { "==>" } else { "" }); - for reg in 0..=num_regs { - eprint!("{:1}", if reg < live_regs[insn_idx] { "|" } else { "" }); - } - eprintln!(" [{:3}] {:?}", insn_idx, insn); - } - } - // First, create the pool of registers. - let mut pool = RegisterPool::new(regs.clone()); + let mut pool = RegisterPool::new(regs.clone(), self.stack_base_idx); - // Mapping between VReg and allocated VReg for each VReg index. - // None if no register has been allocated for the VReg. - let mut reg_mapping: Vec<Option<Reg>> = vec![None; self.live_ranges.len()]; + // Mapping between VReg and register or stack slot for each VReg index. + // None if no register or stack slot has been allocated for the VReg. 
+ let mut vreg_opnd: Vec<Option<Opnd>> = vec![None; self.live_ranges.len()]; // List of registers saved before a C call, paired with the VReg index. let mut saved_regs: Vec<(Reg, usize)> = vec![]; + // Remember the indexes of Insn::FrameSetup to update the stack size later + let mut frame_setup_idxs: Vec<usize> = vec![]; + // live_ranges is indexed by original `index` given by the iterator. let mut asm = Assembler::new_with_asm(&self); let live_ranges: Vec<LiveRange> = take(&mut self.live_ranges); let mut iterator = self.insns.into_iter().enumerate().peekable(); while let Some((index, mut insn)) = iterator.next() { + // Remember the index of FrameSetup to bump slot_count when we know the max number of spilled VRegs. + if let Insn::FrameSetup { .. } = insn { + frame_setup_idxs.push(asm.insns.len()); + } + let before_ccall = match (&insn, iterator.peek().map(|(_, insn)| insn)) { (Insn::ParallelMov { .. }, Some(Insn::CCall { .. })) | (Insn::CCall { .. }, _) if !pool.is_empty() => { // If C_RET_REG is in use, move it to another register. // This must happen before last-use registers are deallocated. if let Some(vreg_idx) = pool.vreg_for(&C_RET_REG) { - let new_reg = if let Some(new_reg) = pool.alloc_reg(vreg_idx) { - new_reg - } else { - debug!("spilling VReg is not implemented yet, can't evacuate C_RET_REG on CCall"); - return Err(CompileError::RegisterSpillOnCCall); - }; - asm.mov(Opnd::Reg(new_reg), C_RET_OPND); - pool.dealloc_reg(&C_RET_REG); - reg_mapping[vreg_idx] = Some(new_reg); + let new_opnd = pool.alloc_opnd(vreg_idx); + asm.mov(new_opnd, C_RET_OPND); + pool.dealloc_opnd(&Opnd::Reg(C_RET_REG)); + vreg_opnd[vreg_idx] = Some(new_opnd); } true @@ -1501,8 +1570,8 @@ impl Assembler // uses this operand. If it is, we can return the allocated // register to the pool. if live_ranges[idx].end() == index { - if let Some(reg) = reg_mapping[idx] { - pool.dealloc_reg(®); + if let Some(opnd) = vreg_opnd[idx] { + pool.dealloc_opnd(&opnd); } else { unreachable!("no register allocated for insn {:?}", insn); } @@ -1520,7 +1589,7 @@ impl Assembler // Save live registers for &(reg, _) in saved_regs.iter() { asm.cpush(Opnd::Reg(reg)); - pool.dealloc_reg(®); + pool.dealloc_opnd(&Opnd::Reg(reg)); } // On x86_64, maintain 16-byte stack alignment if cfg!(target_arch = "x86_64") && saved_regs.len() % 2 == 1 { @@ -1560,7 +1629,7 @@ impl Assembler if let Some(Opnd::VReg{ idx, .. }) = opnd_iter.next() { if live_ranges[*idx].end() == index { - if let Some(reg) = reg_mapping[*idx] { + if let Some(Opnd::Reg(reg)) = vreg_opnd[*idx] { out_reg = Some(pool.take_reg(®, vreg_idx)); } } @@ -1569,23 +1638,7 @@ impl Assembler // Allocate a new register for this instruction if one is not // already allocated. - if out_reg.is_none() { - out_reg = match pool.alloc_reg(vreg_idx) { - Some(reg) => Some(reg), - None => { - if get_option!(debug) { - let mut insns = asm.insns; - insns.push(insn); - for (_, insn) in iterator.by_ref() { - insns.push(insn); - } - dump_live_regs(insns, live_ranges, regs.len(), index); - } - debug!("Register spill not supported"); - return Err(CompileError::RegisterSpillOnAlloc); - } - }; - } + let out_opnd = out_reg.unwrap_or_else(|| pool.alloc_opnd(vreg_idx)); // Set the output operand on the instruction let out_num_bits = Opnd::match_num_bits_iter(insn.opnd_iter()); @@ -1594,9 +1647,9 @@ impl Assembler // output operand on this instruction because the live range // extends beyond the index of the instruction. 
let out = insn.out_opnd_mut().unwrap(); - let reg = out_reg.unwrap().with_num_bits(out_num_bits); - reg_mapping[out.vreg_idx()] = Some(reg); - *out = Opnd::Reg(reg); + let out_opnd = out_opnd.with_num_bits(out_num_bits); + vreg_opnd[out.vreg_idx()] = Some(out_opnd); + *out = out_opnd; } // Replace VReg and Param operands by their corresponding register @@ -1604,11 +1657,15 @@ impl Assembler while let Some(opnd) = opnd_iter.next() { match *opnd { Opnd::VReg { idx, num_bits } => { - *opnd = Opnd::Reg(reg_mapping[idx].unwrap()).with_num_bits(num_bits); + *opnd = vreg_opnd[idx].unwrap().with_num_bits(num_bits); }, Opnd::Mem(Mem { base: MemBase::VReg(idx), disp, num_bits }) => { - let base = MemBase::Reg(reg_mapping[idx].unwrap().reg_no); - *opnd = Opnd::Mem(Mem { base, disp, num_bits }); + *opnd = match vreg_opnd[idx].unwrap() { + Opnd::Reg(reg) => Opnd::Mem(Mem { base: MemBase::Reg(reg.reg_no), disp, num_bits }), + // If the base is spilled, lower it to MemBase::Stack, which scratch_split will lower to MemBase::Reg. + Opnd::Mem(mem) => Opnd::Mem(Mem { base: pool.stack_state.mem_to_stack_membase(mem), disp, num_bits }), + _ => unreachable!(), + } } _ => {}, } @@ -1618,8 +1675,8 @@ impl Assembler // register if let Some(idx) = vreg_idx { if live_ranges[idx].end() == index { - if let Some(reg) = reg_mapping[idx] { - pool.dealloc_reg(®); + if let Some(opnd) = vreg_opnd[idx] { + pool.dealloc_opnd(&opnd); } else { unreachable!("no register allocated for insn {:?}", insn); } @@ -1671,6 +1728,16 @@ impl Assembler } } + // Extend the stack space for spilled operands + for frame_setup_idx in frame_setup_idxs { + match &mut asm.insns[frame_setup_idx] { + Insn::FrameSetup { slot_count, .. } => { + *slot_count += pool.stack_state.stack_size; + } + _ => unreachable!(), + } + } + assert!(pool.is_empty(), "Expected all registers to be returned to the pool"); Ok(asm) } diff --git a/zjit/src/backend/x86_64/mod.rs b/zjit/src/backend/x86_64/mod.rs index 14c0df8dd0..1d5d90a856 100644 --- a/zjit/src/backend/x86_64/mod.rs +++ b/zjit/src/backend/x86_64/mod.rs @@ -1,4 +1,4 @@ -use std::mem::take; +use std::mem::{self, take}; use crate::asm::*; use crate::asm::x86_64::*; @@ -97,13 +97,13 @@ pub const ALLOC_REGS: &[Reg] = &[ RCX_REG, R8_REG, R9_REG, - R10_REG, RAX_REG, ]; /// Special scratch register for intermediate processing. It should be used only by /// [`Assembler::x86_scratch_split`] or [`Assembler::new_with_scratch_reg`]. const SCRATCH0_OPND: Opnd = Opnd::Reg(R11_REG); +const SCRATCH1_OPND: Opnd = Opnd::Reg(R10_REG); impl Assembler { /// Return an Assembler with scratch registers disabled in the backend, and a scratch register. @@ -395,13 +395,13 @@ impl Assembler { /// without requiring more registers to be available in the register /// allocator. So we just use the SCRATCH0_OPND register temporarily to hold /// the value before we immediately use it. 
- fn split_64bit_immediate(asm: &mut Assembler, opnd: Opnd) -> Opnd { + fn split_64bit_immediate(asm: &mut Assembler, opnd: Opnd, scratch_opnd: Opnd) -> Opnd { match opnd { Opnd::Imm(value) => { // 32-bit values will be sign-extended if imm_num_bits(value) > 32 { - asm.mov(SCRATCH0_OPND, opnd); - SCRATCH0_OPND + asm.mov(scratch_opnd, opnd); + scratch_opnd } else { opnd } @@ -409,8 +409,8 @@ impl Assembler { Opnd::UImm(value) => { // 32-bit values will be sign-extended if imm_num_bits(value as i64) > 32 { - asm.mov(SCRATCH0_OPND, opnd); - SCRATCH0_OPND + asm.mov(scratch_opnd, opnd); + scratch_opnd } else { Opnd::Imm(value as i64) } @@ -419,23 +419,102 @@ impl Assembler { } } - let mut asm = Assembler::new_with_asm(&self); + /// If a given operand is Opnd::Mem and it uses MemBase::Stack, lower it to MemBase::Reg using a scratch regsiter. + fn split_stack_membase(asm: &mut Assembler, opnd: Opnd, scratch_opnd: Opnd, stack_state: &StackState) -> Opnd { + if let Opnd::Mem(Mem { base: stack_membase @ MemBase::Stack { .. }, disp, num_bits }) = opnd { + let base = Opnd::Mem(stack_state.stack_membase_to_mem(stack_membase)); + asm.load_into(scratch_opnd, base); + Opnd::Mem(Mem { base: MemBase::Reg(scratch_opnd.unwrap_reg().reg_no), disp, num_bits }) + } else { + opnd + } + } + + /// If opnd is Opnd::Mem, set scratch_reg to *opnd. Return Some(Opnd::Mem) if it needs to be written back from scratch_reg. + fn split_memory_write(opnd: &mut Opnd, scratch_opnd: Opnd) -> Option<Opnd> { + if let Opnd::Mem(_) = opnd { + let mem_opnd = opnd.clone(); + *opnd = opnd.num_bits().map(|num_bits| scratch_opnd.with_num_bits(num_bits)).unwrap_or(scratch_opnd); + Some(mem_opnd) + } else { + None + } + } + + /// If both opnd and other are Opnd::Mem, split opnd with scratch_opnd. + fn split_if_both_memory(asm: &mut Assembler, opnd: Opnd, other: Opnd, scratch_opnd: Opnd) -> Opnd { + if let (Opnd::Mem(_), Opnd::Mem(_)) = (opnd, other) { + asm.load_into(scratch_opnd.with_num_bits(opnd.rm_num_bits()), opnd); + scratch_opnd.with_num_bits(opnd.rm_num_bits()) + } else { + opnd + } + } + + /// Move src to dst, splitting it with scratch_opnd if it's a Mem-to-Mem move. Skip it if dst == src. + fn asm_mov(asm: &mut Assembler, dst: Opnd, src: Opnd, scratch_opnd: Opnd) { + if dst != src { + if let (Opnd::Mem(_), Opnd::Mem(_)) = (dst, src) { + asm.mov(scratch_opnd, src); + asm.mov(dst, scratch_opnd); + } else { + asm.mov(dst, src); + } + } + } + + // Prepare StackState to lower MemBase::Stack + let stack_state = StackState::new(self.stack_base_idx); + + let mut asm_local = Assembler::new_with_asm(&self); + let asm = &mut asm_local; asm.accept_scratch_reg = true; let mut iterator = self.insns.into_iter().enumerate().peekable(); while let Some((_, mut insn)) = iterator.next() { match &mut insn { - Insn::Add { right, .. } | - Insn::Sub { right, .. } | - Insn::Mul { right, .. } | - Insn::And { right, .. } | - Insn::Or { right, .. } | - Insn::Xor { right, .. } | - Insn::Test { right, .. 
} => { - *right = split_64bit_immediate(&mut asm, *right); + Insn::Add { left, right, out } | + Insn::Sub { left, right, out } | + Insn::And { left, right, out } | + Insn::Or { left, right, out } | + Insn::Xor { left, right, out } => { + *left = split_stack_membase(asm, *left, SCRATCH0_OPND, &stack_state); + *left = split_if_both_memory(asm, *left, *right, SCRATCH0_OPND); + *right = split_stack_membase(asm, *right, SCRATCH1_OPND, &stack_state); + *right = split_64bit_immediate(asm, *right, SCRATCH1_OPND); + + let (out, left) = (*out, *left); + asm.push_insn(insn); + asm_mov(asm, out, left, SCRATCH0_OPND); + } + Insn::Mul { left, right, out } => { + *left = split_stack_membase(asm, *left, SCRATCH0_OPND, &stack_state); + *left = split_if_both_memory(asm, *left, *right, SCRATCH0_OPND); + *right = split_stack_membase(asm, *right, SCRATCH1_OPND, &stack_state); + *right = split_64bit_immediate(asm, *right, SCRATCH1_OPND); + + // imul doesn't have (Mem, Reg) encoding. Swap left and right in that case. + if let (Opnd::Mem(_), Opnd::Reg(_)) = (&left, &right) { + mem::swap(left, right); + } + + let (out, left) = (*out, *left); asm.push_insn(insn); + asm_mov(asm, out, left, SCRATCH0_OPND); } + &mut Insn::Not { opnd, out } | + &mut Insn::LShift { opnd, out, .. } | + &mut Insn::RShift { opnd, out, .. } | + &mut Insn::URShift { opnd, out, .. } => { + asm.push_insn(insn); + asm_mov(asm, out, opnd, SCRATCH0_OPND); + } + Insn::Test { left, right } | Insn::Cmp { left, right } => { + *left = split_stack_membase(asm, *left, SCRATCH1_OPND, &stack_state); + *right = split_stack_membase(asm, *right, SCRATCH0_OPND, &stack_state); + *right = split_if_both_memory(asm, *right, *left, SCRATCH0_OPND); + let num_bits = match right { Opnd::Imm(value) => Some(imm_num_bits(*value)), Opnd::UImm(value) => Some(uimm_num_bits(*value)), @@ -450,7 +529,7 @@ impl Assembler { // directly in the instruction. let use_imm = num_bits.is_some() && left.num_bits() == num_bits && num_bits.unwrap() < 64; if !use_imm { - *right = split_64bit_immediate(&mut asm, *right); + *right = split_64bit_immediate(asm, *right, SCRATCH0_OPND); } asm.push_insn(insn); } @@ -462,16 +541,19 @@ impl Assembler { *opnds = vec![]; asm.push_insn(insn); } - &mut Insn::Lea { opnd, out } => { - match (opnd, out) { - // Split here for compile_exits - (Opnd::Mem(_), Opnd::Mem(_)) => { - asm.lea_into(SCRATCH0_OPND, opnd); - asm.store(out, SCRATCH0_OPND); - } - _ => { - asm.push_insn(insn); - } + Insn::CSelZ { out, .. } | + Insn::CSelNZ { out, .. } | + Insn::CSelE { out, .. } | + Insn::CSelNE { out, .. } | + Insn::CSelL { out, .. } | + Insn::CSelLE { out, .. } | + Insn::CSelG { out, .. } | + Insn::CSelGE { out, .. } | + Insn::Lea { out, .. } => { + let mem_out = split_memory_write(out, SCRATCH0_OPND); + asm.push_insn(insn); + if let Some(mem_out) = mem_out { + asm.store(mem_out, SCRATCH0_OPND); } } Insn::LeaJumpTarget { target, out } => { @@ -480,6 +562,15 @@ impl Assembler { asm.mov(*out, SCRATCH0_OPND); } } + Insn::Load { out, opnd } | + Insn::LoadInto { dest: out, opnd } => { + *opnd = split_stack_membase(asm, *opnd, SCRATCH0_OPND, &stack_state); + let mem_out = split_memory_write(out, SCRATCH0_OPND); + asm.push_insn(insn); + if let Some(mem_out) = mem_out { + asm.store(mem_out, SCRATCH0_OPND.with_num_bits(mem_out.rm_num_bits())); + } + } // Convert Opnd::const_ptr into Opnd::Mem. This split is done here to give // a register for compile_exits. 
&mut Insn::IncrCounter { mem, value } => { @@ -487,17 +578,19 @@ impl Assembler { asm.load_into(SCRATCH0_OPND, mem); asm.incr_counter(Opnd::mem(64, SCRATCH0_OPND, 0), value); } + &mut Insn::Mov { dest, src } => { + asm_mov(asm, dest, src, SCRATCH0_OPND); + } // Resolve ParallelMov that couldn't be handled without a scratch register. Insn::ParallelMov { moves } => { for (dst, src) in Self::resolve_parallel_moves(&moves, Some(SCRATCH0_OPND)).unwrap() { - asm.mov(dst, src) + asm_mov(asm, dst, src, SCRATCH0_OPND); } } // Handle various operand combinations for spills on compile_exits. &mut Insn::Store { dest, src } => { - let Opnd::Mem(Mem { num_bits, .. }) = dest else { - panic!("Unexpected Insn::Store destination in x86_scratch_split: {dest:?}"); - }; + let num_bits = dest.rm_num_bits(); + let dest = split_stack_membase(asm, dest, SCRATCH1_OPND, &stack_state); let src = match src { Opnd::Reg(_) => src, @@ -541,7 +634,7 @@ impl Assembler { } } - asm + asm_local } /// Emit platform-specific machine code |
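
A closing x86-specific note: two-operand `imul` only encodes `reg <- reg * r/m`, so a `(Mem, Reg)` multiply must have its operands swapped, and this commit moves that swap out of the assembler (which now panics on that combination, see `test_imul_mem_reg`) into `x86_scratch_split`, the pass that also knows whether the product must be stored back to a memory output. A minimal, hypothetical illustration of the swap decision:

```rust
// Not the actual backend code: a tiny model of normalizing imul operands so the
// left operand (the implicit destination of two-operand imul) is a register.
use std::mem;

#[derive(Clone, Copy, Debug, PartialEq)]
enum Opnd { Reg(u8), Mem { base: u8, disp: i32 } }

/// Swap a (Mem, Reg) multiply into (Reg, Mem). Returns true if swapped.
fn normalize_imul(left: &mut Opnd, right: &mut Opnd) -> bool {
    if matches!((*left, *right), (Opnd::Mem { .. }, Opnd::Reg(_))) {
        mem::swap(left, right);
        true
    } else {
        false
    }
}

fn main() {
    let mut left = Opnd::Mem { base: 0, disp: 8 };
    let mut right = Opnd::Reg(2);
    let swapped = normalize_imul(&mut left, &mut right);
    // After the swap, `left` is the register that receives the product, so the
    // split pass knows the original Mem operand may still need a store of the result.
    assert!(swapped);
    assert_eq!(left, Opnd::Reg(2));
    assert_eq!(right, Opnd::Mem { base: 0, disp: 8 });
    println!("left={left:?} right={right:?}");
}
```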
