 -rw-r--r--  zjit/src/backend/arm64/mod.rs   | 114
 -rw-r--r--  zjit/src/backend/lir.rs         | 623
 -rw-r--r--  zjit/src/backend/tests.rs       |  10
 -rw-r--r--  zjit/src/backend/x86_64/mod.rs  |  47
 -rw-r--r--  zjit/src/codegen.rs             | 190
 -rw-r--r--  zjit/src/hir.rs                 |   4
 -rw-r--r--  zjit/src/invariants.rs          |   1
 -rw-r--r--  zjit/src/options.rs             |   3
 8 files changed, 773 insertions(+), 219 deletions(-)
diff --git a/zjit/src/backend/arm64/mod.rs b/zjit/src/backend/arm64/mod.rs index 574249dabd..d06e84536f 100644 --- a/zjit/src/backend/arm64/mod.rs +++ b/zjit/src/backend/arm64/mod.rs @@ -694,7 +694,8 @@ impl Assembler { /// VRegs, most splits should happen in [`Self::arm64_split`]. However, some instructions /// need to be split with registers after `alloc_regs`, e.g. for `compile_exits`, so this /// splits them and uses scratch registers for it. - fn arm64_scratch_split(mut self) -> Assembler { + /// Linearizes all blocks into a single giant block. + fn arm64_scratch_split(self) -> Assembler { /// If opnd is Opnd::Mem with a too large disp, make the disp smaller using lea. fn split_large_disp(asm: &mut Assembler, opnd: Opnd, scratch_opnd: Opnd) -> Opnd { match opnd { @@ -750,12 +751,23 @@ impl Assembler { // Prepare StackState to lower MemBase::Stack let stack_state = StackState::new(self.stack_base_idx); - let mut asm_local = Assembler::new_with_asm(&self); + let mut asm_local = Assembler::new(); + asm_local.accept_scratch_reg = true; + asm_local.stack_base_idx = self.stack_base_idx; + asm_local.label_names = self.label_names.clone(); + asm_local.live_ranges.resize(self.live_ranges.len(), LiveRange { start: None, end: None }); + + // Create one giant block to linearize everything into + asm_local.new_block_without_id(); + let asm = &mut asm_local; - asm.accept_scratch_reg = true; - let iterator = &mut self.instruction_iterator(); - while let Some((_, mut insn)) = iterator.next(asm) { + // Get linearized instructions with branch parameters expanded into ParallelMov + let linearized_insns = self.linearize_instructions(); + + // Process each linearized instruction + for (idx, insn) in linearized_insns.iter().enumerate() { + let mut insn = insn.clone(); match &mut insn { Insn::Add { left, right, out } | Insn::Sub { left, right, out } | @@ -795,7 +807,7 @@ impl Assembler { }; // If the next instruction is JoMul - if matches!(iterator.peek(), Some((_, Insn::JoMul(_)))) { + if idx + 1 < linearized_insns.len() && matches!(linearized_insns[idx + 1], Insn::JoMul(_)) { // Produce a register that is all zeros or all ones // Based on the sign bit of the 64-bit mul result asm.push_insn(Insn::RShift { out: SCRATCH0_OPND, opnd: reg_out, shift: Opnd::UImm(63) }); @@ -940,7 +952,7 @@ impl Assembler { /// Emit a conditional jump instruction to a specific target. This is /// called when lowering any of the conditional jump instructions. - fn emit_conditional_jump<const CONDITION: u8>(cb: &mut CodeBlock, target: Target) { + fn emit_conditional_jump<const CONDITION: u8>(asm: &Assembler, cb: &mut CodeBlock, target: Target) { fn generate_branch<const CONDITION: u8>(cb: &mut CodeBlock, src_addr: i64, dst_addr: i64) { let num_insns = if bcond_offset_fits_bits((dst_addr - src_addr) / 4) { // If the jump offset fits into the conditional jump as @@ -991,30 +1003,31 @@ impl Assembler { (num_insns..cb.conditional_jump_insns()).for_each(|_| nop(cb)); } - match target { + let label = match target { Target::CodePtr(dst_ptr) => { let dst_addr = dst_ptr.as_offset(); let src_addr = cb.get_write_ptr().as_offset(); generate_branch::<CONDITION>(cb, src_addr, dst_addr); + return; }, - Target::Label(label_idx) => { - // Try to use a single B.cond instruction - cb.label_ref(label_idx, 4, |cb, src_addr, dst_addr| { - // +1 since src_addr is after the instruction while A64 - // counts the offset relative to the start. 
- let offset = (dst_addr - src_addr) / 4 + 1; - if bcond_offset_fits_bits(offset) { - bcond(cb, CONDITION, InstructionOffset::from_insns(offset as i32)); - Ok(()) - } else { - Err(()) - } - }); - }, + Target::Label(l) => l, + Target::Block(ref edge) => asm.block_label(edge.target), Target::SideExit { .. } => { unreachable!("Target::SideExit should have been compiled by compile_exits") }, }; + // Try to use a single B.cond instruction + cb.label_ref(label, 4, |cb, src_addr, dst_addr| { + // +1 since src_addr is after the instruction while A64 + // counts the offset relative to the start. + let offset = (dst_addr - src_addr) / 4 + 1; + if bcond_offset_fits_bits(offset) { + bcond(cb, CONDITION, InstructionOffset::from_insns(offset as i32)); + Ok(()) + } else { + Err(()) + } + }); } /// Emit a CBZ or CBNZ which branches when a register is zero or non-zero @@ -1117,8 +1130,13 @@ impl Assembler { let (_hook, mut hook_insn_idx) = AssemblerPanicHook::new(self, 0); // For each instruction + // NOTE: At this point, the assembler should have been linearized into a single giant block + // by either resolve_parallel_mov_pass() or arm64_scratch_split(). let mut insn_idx: usize = 0; - while let Some(insn) = self.insns.get(insn_idx) { + assert_eq!(self.basic_blocks.len(), 1, "Assembler should be linearized into a single block before arm64_emit"); + let insns = &self.basic_blocks[0].insns; + + while let Some(insn) = insns.get(insn_idx) { // Update insn_idx that is shown on panic hook_insn_idx.as_mut().map(|idx| idx.lock().map(|mut idx| *idx = insn_idx).unwrap()); @@ -1222,7 +1240,7 @@ impl Assembler { }, Insn::Mul { left, right, out } => { // If the next instruction is JoMul with RShift created by arm64_scratch_split - match (self.insns.get(insn_idx + 1), self.insns.get(insn_idx + 2)) { + match (insns.get(insn_idx + 1), insns.get(insn_idx + 2)) { (Some(Insn::RShift { out: out_sign, opnd: out_opnd, shift: out_shift }), Some(Insn::JoMul(_))) => { // Compute the high 64 bits smulh(cb, Self::EMIT_OPND, left.into(), right.into()); @@ -1487,34 +1505,48 @@ impl Assembler { } }); }, + Target::Block(ref edge) => { + let label = self.block_label(edge.target); + cb.label_ref(label, 4, |cb, src_addr, dst_addr| { + // +1 since src_addr is after the instruction while A64 + // counts the offset relative to the start. + let offset = (dst_addr - src_addr) / 4 + 1; + if b_offset_fits_bits(offset) { + b(cb, InstructionOffset::from_insns(offset as i32)); + Ok(()) + } else { + Err(()) + } + }); + }, Target::SideExit { .. 
} => { unreachable!("Target::SideExit should have been compiled by compile_exits") }, }; }, Insn::Je(target) | Insn::Jz(target) => { - emit_conditional_jump::<{Condition::EQ}>(cb, target.clone()); + emit_conditional_jump::<{Condition::EQ}>(self, cb, target.clone()); }, Insn::Jne(target) | Insn::Jnz(target) | Insn::JoMul(target) => { - emit_conditional_jump::<{Condition::NE}>(cb, target.clone()); + emit_conditional_jump::<{Condition::NE}>(self, cb, target.clone()); }, Insn::Jl(target) => { - emit_conditional_jump::<{Condition::LT}>(cb, target.clone()); + emit_conditional_jump::<{Condition::LT}>(self, cb, target.clone()); }, Insn::Jg(target) => { - emit_conditional_jump::<{Condition::GT}>(cb, target.clone()); + emit_conditional_jump::<{Condition::GT}>(self, cb, target.clone()); }, Insn::Jge(target) => { - emit_conditional_jump::<{Condition::GE}>(cb, target.clone()); + emit_conditional_jump::<{Condition::GE}>(self, cb, target.clone()); }, Insn::Jbe(target) => { - emit_conditional_jump::<{Condition::LS}>(cb, target.clone()); + emit_conditional_jump::<{Condition::LS}>(self, cb, target.clone()); }, Insn::Jb(target) => { - emit_conditional_jump::<{Condition::CC}>(cb, target.clone()); + emit_conditional_jump::<{Condition::CC}>(self, cb, target.clone()); }, Insn::Jo(target) => { - emit_conditional_jump::<{Condition::VS}>(cb, target.clone()); + emit_conditional_jump::<{Condition::VS}>(self, cb, target.clone()); }, Insn::Joz(opnd, target) => { emit_cmp_zero_jump(cb, opnd.into(), true, target.clone()); @@ -1537,8 +1569,8 @@ impl Assembler { let Some(Insn::Cmp { left: status_reg @ Opnd::Reg(_), right: Opnd::UImm(_) | Opnd::Imm(_), - }) = self.insns.get(insn_idx + 1) else { - panic!("arm64_scratch_split should add Cmp after IncrCounter: {:?}", self.insns.get(insn_idx + 1)); + }) = insns.get(insn_idx + 1) else { + panic!("arm64_scratch_split should add Cmp after IncrCounter: {:?}", insns.get(insn_idx + 1)); }; // Attempt to increment a counter @@ -1587,7 +1619,7 @@ impl Assembler { } else { // No bytes dropped, so the pos markers point to valid code for (insn_idx, pos) in pos_markers { - if let Insn::PosMarker(callback) = self.insns.get(insn_idx).unwrap() { + if let Insn::PosMarker(callback) = insns.get(insn_idx).unwrap() { callback(pos, cb); } else { panic!("non-PosMarker in pos_markers insn_idx={insn_idx} {self:?}"); @@ -1617,6 +1649,10 @@ impl Assembler { if use_scratch_reg { asm = asm.arm64_scratch_split(); asm_dump!(asm, scratch_split); + } else { + // For trampolines that use scratch registers, resolve ParallelMov without scratch_reg. 
+ asm = asm.resolve_parallel_mov_pass(); + asm_dump!(asm, resolve_parallel_mov); } // Create label instances in the code block @@ -1681,12 +1717,15 @@ mod tests { use super::*; use insta::assert_snapshot; + use crate::hir; static TEMP_REGS: [Reg; 5] = [X1_REG, X9_REG, X10_REG, X14_REG, X15_REG]; fn setup_asm() -> (Assembler, CodeBlock) { crate::options::rb_zjit_prepare_options(); // Allow `get_option!` in Assembler - (Assembler::new(), CodeBlock::new_dummy()) + let mut asm = Assembler::new(); + asm.new_block_without_id(); + (asm, CodeBlock::new_dummy()) } #[test] @@ -1694,6 +1733,7 @@ mod tests { use crate::hir::SideExitReason; let mut asm = Assembler::new(); + asm.new_block_without_id(); asm.stack_base_idx = 1; let label = asm.new_label("bb0"); @@ -2107,6 +2147,7 @@ mod tests { #[test] fn test_store_with_valid_scratch_reg() { let (mut asm, scratch_reg) = Assembler::new_with_scratch_reg(); + asm.new_block_without_id(); let mut cb = CodeBlock::new_dummy(); asm.store(Opnd::mem(64, scratch_reg, 0), 0x83902.into()); @@ -2560,6 +2601,7 @@ mod tests { crate::options::rb_zjit_prepare_options(); // Allow `get_option!` in Assembler let mut asm = Assembler::new(); + asm.new_block_without_id(); let mut cb = CodeBlock::new_dummy_sized(memory_required); let far_label = asm.new_label("far"); diff --git a/zjit/src/backend/lir.rs b/zjit/src/backend/lir.rs index 06127b5c1a..f2f7bc6165 100644 --- a/zjit/src/backend/lir.rs +++ b/zjit/src/backend/lir.rs @@ -7,6 +7,7 @@ use std::sync::{Arc, Mutex}; use crate::codegen::local_size_and_idx_to_ep_offset; use crate::cruby::{Qundef, RUBY_OFFSET_CFP_PC, RUBY_OFFSET_CFP_SP, SIZEOF_VALUE_I32, vm_stack_canary}; use crate::hir::{Invariant, SideExitReason}; +use crate::hir; use crate::options::{TraceExits, debug, get_option}; use crate::cruby::VALUE; use crate::payload::IseqVersionRef; @@ -15,6 +16,104 @@ use crate::virtualmem::CodePtr; use crate::asm::{CodeBlock, Label}; use crate::state::rb_zjit_record_exit_stack; +/// LIR Block ID. Unique ID for each block, and also defined in LIR so +/// we can differentiate it from HIR block ids. +#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug, PartialOrd, Ord)] +pub struct BlockId(pub usize); + +impl From<BlockId> for usize { + fn from(val: BlockId) -> Self { + val.0 + } +} + +impl std::fmt::Display for BlockId { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "l{}", self.0) + } +} + +/// Dummy HIR block ID used when creating test or invalid LIR blocks +const DUMMY_HIR_BLOCK_ID: usize = usize::MAX; +/// Dummy RPO index used when creating test or invalid LIR blocks +const DUMMY_RPO_INDEX: usize = usize::MAX; + +#[derive(Debug, PartialEq, Clone)] +pub struct BranchEdge { + pub target: BlockId, + pub args: Vec<Opnd>, +} + +#[derive(Clone, Debug)] +pub struct BasicBlock { + // Unique id for this block + pub id: BlockId, + + // HIR block this LIR block was lowered from. Not injective: multiple LIR blocks may share + // the same hir_block_id because we split HIR blocks into multiple LIR blocks during lowering. 
+ pub hir_block_id: hir::BlockId, + + pub is_entry: bool, + + // Instructions in this basic block + pub insns: Vec<Insn>, + + // Input parameters for this block + pub parameters: Vec<Opnd>, + + // RPO position of the source HIR block + pub rpo_index: usize, +} + +pub struct EdgePair(Option<BranchEdge>, Option<BranchEdge>); + +impl BasicBlock { + fn new(id: BlockId, hir_block_id: hir::BlockId, is_entry: bool, rpo_index: usize) -> Self { + Self { + id, + hir_block_id, + is_entry, + insns: vec![], + parameters: vec![], + rpo_index, + } + } + + pub fn add_parameter(&mut self, param: Opnd) { + self.parameters.push(param); + } + + pub fn push_insn(&mut self, insn: Insn) { + self.insns.push(insn); + } + + pub fn edges(&self) -> EdgePair { + assert!(self.insns.last().unwrap().is_terminator()); + let extract_edge = |insn: &Insn| -> Option<BranchEdge> { + if let Some(Target::Block(edge)) = insn.target() { + Some(edge.clone()) + } else { + None + } + }; + + match self.insns.as_slice() { + [] => panic!("empty block"), + [.., second_last, last] => { + EdgePair(extract_edge(second_last), extract_edge(last)) + }, + [.., last] => { + EdgePair(extract_edge(last), None) + } + } + } + + /// Sort key for scheduling blocks in code layout order + pub fn sort_key(&self) -> (usize, usize) { + (self.rpo_index, self.id.0) + } +} + pub use crate::backend::current::{ mem_base_reg, Reg, @@ -309,13 +408,15 @@ pub struct SideExit { /// Branch target (something that we can jump to) /// for branch instructions -#[derive(Clone, Debug)] +#[derive(Clone)] pub enum Target { /// Pointer to a piece of ZJIT-generated code CodePtr(CodePtr), /// A label within the generated code Label(Label), + /// An LIR branch edge + Block(BranchEdge), /// Side exit to the interpreter SideExit { /// Context used for compiling the side exit @@ -325,6 +426,32 @@ pub enum Target }, } +impl fmt::Debug for Target { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Target::CodePtr(ptr) => write!(f, "CodePtr({:?})", ptr), + Target::Label(label) => write!(f, "Label({:?})", label), + Target::Block(edge) => { + if edge.args.is_empty() { + write!(f, "Block({:?})", edge.target) + } else { + write!(f, "Block({:?}(", edge.target)?; + for (i, arg) in edge.args.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{:?}", arg)?; + } + write!(f, "))") + } + } + Target::SideExit { exit, reason } => { + write!(f, "SideExit {{ exit: {:?}, reason: {:?} }}", exit, reason) + } + } + } +} + impl Target { pub fn unwrap_label(&self) -> Label { @@ -771,6 +898,29 @@ impl Insn { _ => None } } + + /// Returns true if this instruction is a terminator (ends a basic block). + pub fn is_terminator(&self) -> bool { + match self { + Insn::Jbe(_) | + Insn::Jb(_) | + Insn::Je(_) | + Insn::Jl(_) | + Insn::Jg(_) | + Insn::Jge(_) | + Insn::Jmp(_) | + Insn::JmpOpnd(_) | + Insn::Jne(_) | + Insn::Jnz(_) | + Insn::Jo(_) | + Insn::JoMul(_) | + Insn::Jz(_) | + Insn::Joz(..) | + Insn::Jonz(..) | + Insn::CRet(_) => true, + _ => false + } + } } /// An iterator that will yield a non-mutable reference to each operand in turn @@ -806,22 +956,33 @@ impl<'a> Iterator for InsnOpndIterator<'a> { Insn::Label(target) | Insn::LeaJumpTarget { target, .. } | Insn::PatchPoint { target, .. } => { - if let Target::SideExit { exit: SideExit { stack, locals, .. }, .. 
} = target { - let stack_idx = self.idx; - if stack_idx < stack.len() { - let opnd = &stack[stack_idx]; - self.idx += 1; - return Some(opnd); - } + match target { + Target::SideExit { exit: SideExit { stack, locals, .. }, .. } => { + let stack_idx = self.idx; + if stack_idx < stack.len() { + let opnd = &stack[stack_idx]; + self.idx += 1; + return Some(opnd); + } - let local_idx = self.idx - stack.len(); - if local_idx < locals.len() { - let opnd = &locals[local_idx]; - self.idx += 1; - return Some(opnd); + let local_idx = self.idx - stack.len(); + if local_idx < locals.len() { + let opnd = &locals[local_idx]; + self.idx += 1; + return Some(opnd); + } + None + } + Target::Block(edge) => { + if self.idx < edge.args.len() { + let opnd = &edge.args[self.idx]; + self.idx += 1; + return Some(opnd); + } + None } + _ => None } - None } Insn::Joz(opnd, target) | @@ -831,22 +992,34 @@ impl<'a> Iterator for InsnOpndIterator<'a> { return Some(opnd); } - if let Target::SideExit { exit: SideExit { stack, locals, .. }, .. } = target { - let stack_idx = self.idx - 1; - if stack_idx < stack.len() { - let opnd = &stack[stack_idx]; - self.idx += 1; - return Some(opnd); - } + match target { + Target::SideExit { exit: SideExit { stack, locals, .. }, .. } => { + let stack_idx = self.idx - 1; + if stack_idx < stack.len() { + let opnd = &stack[stack_idx]; + self.idx += 1; + return Some(opnd); + } - let local_idx = stack_idx - stack.len(); - if local_idx < locals.len() { - let opnd = &locals[local_idx]; - self.idx += 1; - return Some(opnd); + let local_idx = stack_idx - stack.len(); + if local_idx < locals.len() { + let opnd = &locals[local_idx]; + self.idx += 1; + return Some(opnd); + } + None + } + Target::Block(edge) => { + let arg_idx = self.idx - 1; + if arg_idx < edge.args.len() { + let opnd = &edge.args[arg_idx]; + self.idx += 1; + return Some(opnd); + } + None } + _ => None } - None } Insn::BakeString(_) | @@ -975,22 +1148,33 @@ impl<'a> InsnOpndMutIterator<'a> { Insn::Label(target) | Insn::LeaJumpTarget { target, .. } | Insn::PatchPoint { target, .. } => { - if let Target::SideExit { exit: SideExit { stack, locals, .. }, .. } = target { - let stack_idx = self.idx; - if stack_idx < stack.len() { - let opnd = &mut stack[stack_idx]; - self.idx += 1; - return Some(opnd); - } + match target { + Target::SideExit { exit: SideExit { stack, locals, .. }, .. } => { + let stack_idx = self.idx; + if stack_idx < stack.len() { + let opnd = &mut stack[stack_idx]; + self.idx += 1; + return Some(opnd); + } - let local_idx = self.idx - stack.len(); - if local_idx < locals.len() { - let opnd = &mut locals[local_idx]; - self.idx += 1; - return Some(opnd); + let local_idx = self.idx - stack.len(); + if local_idx < locals.len() { + let opnd = &mut locals[local_idx]; + self.idx += 1; + return Some(opnd); + } + None + } + Target::Block(edge) => { + if self.idx < edge.args.len() { + let opnd = &mut edge.args[self.idx]; + self.idx += 1; + return Some(opnd); + } + None } + _ => None } - None } Insn::Joz(opnd, target) | @@ -1000,22 +1184,34 @@ impl<'a> InsnOpndMutIterator<'a> { return Some(opnd); } - if let Target::SideExit { exit: SideExit { stack, locals, .. }, .. } = target { - let stack_idx = self.idx - 1; - if stack_idx < stack.len() { - let opnd = &mut stack[stack_idx]; - self.idx += 1; - return Some(opnd); - } + match target { + Target::SideExit { exit: SideExit { stack, locals, .. }, .. 
} => { + let stack_idx = self.idx - 1; + if stack_idx < stack.len() { + let opnd = &mut stack[stack_idx]; + self.idx += 1; + return Some(opnd); + } - let local_idx = stack_idx - stack.len(); - if local_idx < locals.len() { - let opnd = &mut locals[local_idx]; - self.idx += 1; - return Some(opnd); + let local_idx = stack_idx - stack.len(); + if local_idx < locals.len() { + let opnd = &mut locals[local_idx]; + self.idx += 1; + return Some(opnd); + } + None } + Target::Block(edge) => { + let arg_idx = self.idx - 1; + if arg_idx < edge.args.len() { + let opnd = &mut edge.args[arg_idx]; + self.idx += 1; + return Some(opnd); + } + None + } + _ => None } - None } Insn::BakeString(_) | @@ -1332,7 +1528,12 @@ const ASSEMBLER_INSNS_CAPACITY: usize = 256; /// optimized and lowered #[derive(Clone)] pub struct Assembler { - pub(super) insns: Vec<Insn>, + pub basic_blocks: Vec<BasicBlock>, + + /// The block to which new instructions are added. Used during HIR to LIR lowering to + /// determine which LIR block we should add instructions to. Set by `set_current_block()` + /// and automatically set to new entry blocks created by `new_block()`. + current_block_id: BlockId, /// Live range for each VReg indexed by its `idx`` pub(super) live_ranges: Vec<LiveRange>, @@ -1350,7 +1551,10 @@ pub struct Assembler { pub(super) stack_base_idx: usize, /// If Some, the next ccall should verify its leafness - leaf_ccall_stack_size: Option<usize> + leaf_ccall_stack_size: Option<usize>, + + /// Current instruction index, incremented for each instruction pushed + idx: usize, } impl Assembler @@ -1358,12 +1562,14 @@ impl Assembler /// Create an Assembler with defaults pub fn new() -> Self { Self { - insns: Vec::with_capacity(ASSEMBLER_INSNS_CAPACITY), - live_ranges: Vec::with_capacity(ASSEMBLER_INSNS_CAPACITY), label_names: Vec::default(), accept_scratch_reg: false, stack_base_idx: 0, leaf_ccall_stack_size: None, + basic_blocks: Vec::default(), + current_block_id: BlockId(0), + live_ranges: Vec::default(), + idx: 0, } } @@ -1387,11 +1593,62 @@ impl Assembler stack_base_idx: old_asm.stack_base_idx, ..Self::new() }; - // Bump the initial VReg index to allow the use of the VRegs for the old Assembler + + // Initialize basic blocks from the old assembler, preserving hir_block_id and entry flag + // but with empty instruction lists + for old_block in &old_asm.basic_blocks { + asm.new_block_from_old_block(&old_block); + } + + // Initialize live_ranges to match the old assembler's size + // This allows reusing VRegs from the old assembler asm.live_ranges.resize(old_asm.live_ranges.len(), LiveRange { start: None, end: None }); + asm } + // Create a new LIR basic block. Returns the newly created block ID + pub fn new_block(&mut self, hir_block_id: hir::BlockId, is_entry: bool, rpo_index: usize) -> BlockId { + let bb_id = BlockId(self.basic_blocks.len()); + let lir_bb = BasicBlock::new(bb_id, hir_block_id, is_entry, rpo_index); + self.basic_blocks.push(lir_bb); + if is_entry { + self.set_current_block(bb_id); + } + bb_id + } + + // Create a new LIR basic block from an old one. This should only be used + // when creating new assemblers during passes when we want to translate + // one assembler to a new one. 
+ pub fn new_block_from_old_block(&mut self, old_block: &BasicBlock) -> BlockId { + let bb_id = BlockId(self.basic_blocks.len()); + let lir_bb = BasicBlock::new(bb_id, old_block.hir_block_id, old_block.is_entry, old_block.rpo_index); + self.basic_blocks.push(lir_bb); + bb_id + } + + // Create a LIR basic block without a valid HIR block ID (for testing or internal use). + pub fn new_block_without_id(&mut self) -> BlockId { + self.new_block(hir::BlockId(DUMMY_HIR_BLOCK_ID), true, DUMMY_RPO_INDEX) + } + + pub fn set_current_block(&mut self, block_id: BlockId) { + self.current_block_id = block_id; + } + + pub fn current_block(&mut self) -> &mut BasicBlock { + &mut self.basic_blocks[self.current_block_id.0] + } + + /// Return basic blocks sorted by RPO index, then by block ID. + /// TODO: Use a more advanced scheduling algorithm + pub fn sorted_blocks(&self) -> Vec<&BasicBlock> { + let mut sorted: Vec<&BasicBlock> = self.basic_blocks.iter().collect(); + sorted.sort_by_key(|block| block.sort_key()); + sorted + } + /// Return true if `opnd` is or depends on `reg` pub fn has_reg(opnd: Opnd, reg: Reg) -> bool { match opnd { @@ -1402,11 +1659,100 @@ impl Assembler } pub fn instruction_iterator(&mut self) -> InsnIter { - let insns = take(&mut self.insns); - InsnIter { - old_insns_iter: insns.into_iter(), + let mut blocks = take(&mut self.basic_blocks); + blocks.sort_by_key(|block| block.sort_key()); + + let mut iter = InsnIter { + blocks, + current_block_idx: 0, + current_insn_iter: vec![].into_iter(), // Will be replaced immediately peeked: None, index: 0, + }; + + // Set up first block's iterator + if !iter.blocks.is_empty() { + iter.current_insn_iter = take(&mut iter.blocks[0].insns).into_iter(); + } + + iter + } + + /// Return an operand for a basic block argument at a given index. + /// To simplify the implementation, we allocate a fixed register or a stack slot + /// for each basic block argument. + pub fn param_opnd(idx: usize) -> Opnd { + use crate::backend::current::ALLOC_REGS; + use crate::cruby::SIZEOF_VALUE_I32; + + if idx < ALLOC_REGS.len() { + Opnd::Reg(ALLOC_REGS[idx]) + } else { + // With FrameSetup, the address that NATIVE_BASE_PTR points to stores an old value in the register. + // To avoid clobbering it, we need to start from the next slot, hence `+ 1` for the index. + Opnd::mem(64, NATIVE_BASE_PTR, (idx - ALLOC_REGS.len() + 1) as i32 * -SIZEOF_VALUE_I32) + } + } + + pub fn linearize_instructions(&self) -> Vec<Insn> { + // Emit instructions with labels, expanding branch parameters + let mut insns = Vec::with_capacity(ASSEMBLER_INSNS_CAPACITY); + + for block in self.sorted_blocks() { + // Process each instruction, expanding branch params if needed + for insn in &block.insns { + self.expand_branch_insn(insn, &mut insns); + } + } + insns + } + + /// Expand and linearize a branch instruction: + /// 1. If the branch has Target::Block with arguments, insert a ParallelMov first + /// 2. Convert Target::Block to Target::Label + /// 3. 
Push the converted instruction + fn expand_branch_insn(&self, insn: &Insn, insns: &mut Vec<Insn>) { + // Helper to process branch arguments and return the label target + let mut process_edge = |edge: &BranchEdge| -> Label { + if !edge.args.is_empty() { + insns.push(Insn::ParallelMov { + moves: edge.args.iter().enumerate() + .map(|(idx, &arg)| (Assembler::param_opnd(idx), arg)) + .collect() + }); + } + self.block_label(edge.target) + }; + + // Convert Target::Block to Target::Label, processing args if needed + let stripped_insn = match insn { + Insn::Jmp(Target::Block(edge)) => Insn::Jmp(Target::Label(process_edge(edge))), + Insn::Jz(Target::Block(edge)) => Insn::Jz(Target::Label(process_edge(edge))), + Insn::Jnz(Target::Block(edge)) => Insn::Jnz(Target::Label(process_edge(edge))), + Insn::Je(Target::Block(edge)) => Insn::Je(Target::Label(process_edge(edge))), + Insn::Jne(Target::Block(edge)) => Insn::Jne(Target::Label(process_edge(edge))), + Insn::Jl(Target::Block(edge)) => Insn::Jl(Target::Label(process_edge(edge))), + Insn::Jg(Target::Block(edge)) => Insn::Jg(Target::Label(process_edge(edge))), + Insn::Jge(Target::Block(edge)) => Insn::Jge(Target::Label(process_edge(edge))), + Insn::Jbe(Target::Block(edge)) => Insn::Jbe(Target::Label(process_edge(edge))), + Insn::Jb(Target::Block(edge)) => Insn::Jb(Target::Label(process_edge(edge))), + Insn::Jo(Target::Block(edge)) => Insn::Jo(Target::Label(process_edge(edge))), + Insn::JoMul(Target::Block(edge)) => Insn::JoMul(Target::Label(process_edge(edge))), + Insn::Joz(opnd, Target::Block(edge)) => Insn::Joz(*opnd, Target::Label(process_edge(edge))), + Insn::Jonz(opnd, Target::Block(edge)) => Insn::Jonz(*opnd, Target::Label(process_edge(edge))), + _ => insn.clone() + }; + + // Push the stripped instruction + insns.push(stripped_insn); + } + + // Get the label for a given block by extracting it from the first instruction. + pub(super) fn block_label(&self, block_id: BlockId) -> Label { + let block = &self.basic_blocks[block_id.0]; + match block.insns.first() { + Some(Insn::Label(Target::Label(label))) => *label, + other => panic!("Expected first instruction of block {:?} to be a Label, but found: {:?}", block_id, other), } } @@ -1444,7 +1790,7 @@ impl Assembler /// operands to this instruction. pub fn push_insn(&mut self, insn: Insn) { // Index of this instruction - let insn_idx = self.insns.len(); + let insn_idx = self.idx; // Initialize the live range of the output VReg to insn_idx..=insn_idx if let Some(Opnd::VReg { idx, .. }) = insn.out_opnd() { @@ -1475,7 +1821,9 @@ impl Assembler } } - self.insns.push(insn); + self.idx += 1; + + self.current_block().push_insn(insn); } /// Create a new label instance that we can jump to @@ -1533,6 +1881,7 @@ impl Assembler Some(new_moves) } + /// Sets the out field on the various instructions that require allocated /// registers because their output is used as the operand on a subsequent /// instruction. This is our implementation of the linear scan algorithm. @@ -1548,17 +1897,22 @@ impl Assembler let mut saved_regs: Vec<(Reg, usize)> = vec![]; // Remember the indexes of Insn::FrameSetup to update the stack size later - let mut frame_setup_idxs: Vec<usize> = vec![]; + let mut frame_setup_idxs: Vec<(BlockId, usize)> = vec![]; // live_ranges is indexed by original `index` given by the iterator. 
- let mut asm = Assembler::new_with_asm(&self); + let mut asm_local = Assembler::new_with_asm(&self); + + let iterator = &mut self.instruction_iterator(); + + let asm = &mut asm_local; + let live_ranges: Vec<LiveRange> = take(&mut self.live_ranges); - let mut iterator = self.insns.into_iter().enumerate().peekable(); - while let Some((index, mut insn)) = iterator.next() { + while let Some((index, mut insn)) = iterator.next(asm) { // Remember the index of FrameSetup to bump slot_count when we know the max number of spilled VRegs. if let Insn::FrameSetup { .. } = insn { - frame_setup_idxs.push(asm.insns.len()); + assert!(asm.current_block().is_entry); + frame_setup_idxs.push((asm.current_block().id, asm.current_block().insns.len())); } let before_ccall = match (&insn, iterator.peek().map(|(_, insn)| insn)) { @@ -1715,17 +2069,6 @@ impl Assembler // Push instruction(s) let is_ccall = matches!(insn, Insn::CCall { .. }); match insn { - Insn::ParallelMov { moves } => { - // For trampolines that use scratch registers, attempt to lower ParallelMov without scratch_reg. - if let Some(moves) = Self::resolve_parallel_moves(&moves, None) { - for (dst, src) in moves { - asm.mov(dst, src); - } - } else { - // If it needs a scratch_reg, leave it to *_split_with_scratch_regs to handle it. - asm.push_insn(Insn::ParallelMov { moves }); - } - } Insn::CCall { opnds, fptr, start_marker, end_marker, out } => { // Split start_marker and end_marker here to avoid inserting push/pop between them. if let Some(start_marker) = start_marker { @@ -1768,8 +2111,8 @@ impl Assembler } // Extend the stack space for spilled operands - for frame_setup_idx in frame_setup_idxs { - match &mut asm.insns[frame_setup_idx] { + for (block_id, frame_setup_idx) in frame_setup_idxs { + match &mut asm.basic_blocks[block_id.0].insns[frame_setup_idx] { Insn::FrameSetup { slot_count, .. } => { *slot_count += pool.stack_state.stack_size; } @@ -1778,7 +2121,7 @@ impl Assembler } assert!(pool.is_empty(), "Expected all registers to be returned to the pool"); - Ok(asm) + Ok(asm_local) } /// Compile the instructions down to machine code. @@ -1852,16 +2195,19 @@ impl Assembler // Extract targets first so that we can update instructions while referencing part of them. let mut targets = HashMap::new(); - for (idx, insn) in self.insns.iter().enumerate() { - if let Some(target @ Target::SideExit { .. }) = insn.target() { - targets.insert(idx, target.clone()); + + for block in self.sorted_blocks().iter() { + for (idx, insn) in block.insns.iter().enumerate() { + if let Some(target @ Target::SideExit { .. }) = insn.target() { + targets.insert((block.id.0, idx), target.clone()); + } } } // Map from SideExit to compiled Label. This table is used to deduplicate side exit code. let mut compiled_exits: HashMap<SideExit, Label> = HashMap::new(); - for (idx, target) in targets { + for ((block_id, idx), target) in targets { // Compile a side exit. Note that this is past the split pass and alloc_regs(), // so you can't use an instruction that returns a VReg. if let Target::SideExit { exit: exit @ SideExit { pc, .. 
}, reason } = target { @@ -1914,7 +2260,7 @@ impl Assembler new_exit }; - *self.insns[idx].target_mut().unwrap() = counted_exit.unwrap_or(compiled_exit); + *self.basic_blocks[block_id].insns[idx].target_mut().unwrap() = counted_exit.unwrap_or(compiled_exit); } } } @@ -1949,7 +2295,7 @@ impl fmt::Display for Assembler { } } - for insn in self.insns.iter() { + for insn in self.linearize_instructions().iter() { match insn { Insn::Comment(comment) => { writeln!(f, " {bold_begin}# {comment}{bold_end}")?; @@ -1985,6 +2331,20 @@ impl fmt::Display for Assembler { Target::CodePtr(code_ptr) => write!(f, " {code_ptr:?}")?, Target::Label(Label(label_idx)) => write!(f, " {}", label_name(self, *label_idx, &label_counts))?, Target::SideExit { reason, .. } => write!(f, " Exit({reason})")?, + Target::Block(edge) => { + if edge.args.is_empty() { + write!(f, " bb{}", edge.target.0)?; + } else { + write!(f, " bb{}(", edge.target.0)?; + for (i, arg) in edge.args.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{}", arg)?; + } + write!(f, ")")?; + } + } } } @@ -2000,6 +2360,17 @@ impl fmt::Display for Assembler { } _ => {} } + } else if let Some(Target::Block(_)) = insn.target() { + // If the instruction has a Block target, avoid using opnd_iter() for branch args + // since they're already printed inline with the target. Only print non-target operands. + match insn { + Insn::Joz(opnd, _) | + Insn::Jonz(opnd, _) | + Insn::LeaJumpTarget { out: opnd, target: _ } => { + write!(f, ", {opnd}")?; + } + _ => {} + } } else if let Insn::ParallelMov { moves } = insn { // Print operands with a special syntax for ParallelMov moves.iter().try_fold(" ", |prefix, (dst, src)| write!(f, "{prefix}{dst} <- {src}").and(Ok(", ")))?; @@ -2019,7 +2390,7 @@ impl fmt::Debug for Assembler { fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { writeln!(fmt, "Assembler")?; - for (idx, insn) in self.insns.iter().enumerate() { + for (idx, insn) in self.linearize_instructions().iter().enumerate() { writeln!(fmt, " {idx:03} {insn:?}")?; } @@ -2028,7 +2399,9 @@ impl fmt::Debug for Assembler { } pub struct InsnIter { - old_insns_iter: std::vec::IntoIter<Insn>, + blocks: Vec<BasicBlock>, + current_block_idx: usize, + current_insn_iter: std::vec::IntoIter<Insn>, peeked: Option<(usize, Insn)>, index: usize, } @@ -2039,7 +2412,7 @@ impl InsnIter { pub fn peek(&mut self) -> Option<&(usize, Insn)> { // If we don't have a peeked value, get one if self.peeked.is_none() { - let insn = self.old_insns_iter.next()?; + let insn = self.current_insn_iter.next()?; let idx = self.index; self.index += 1; self.peeked = Some((idx, insn)); @@ -2048,17 +2421,34 @@ impl InsnIter { self.peeked.as_ref() } - // Get the next instruction. Right now we're passing the "new" assembler - // (the assembler we're copying in to) as a parameter. Once we've - // introduced basic blocks to LIR, we'll use the to set the correct BB - // on the new assembler, but for now it is unused. - pub fn next(&mut self, _new_asm: &mut Assembler) -> Option<(usize, Insn)> { + // Get the next instruction, advancing to the next block when current block is exhausted. + // Sets the current block on new_asm when moving to a new block. 
+ pub fn next(&mut self, new_asm: &mut Assembler) -> Option<(usize, Insn)> { // If we have a peeked value, return it if let Some(item) = self.peeked.take() { return Some(item); } - // Otherwise get the next from underlying iterator - let insn = self.old_insns_iter.next()?; + + // Try to get the next instruction from current block + if let Some(insn) = self.current_insn_iter.next() { + let idx = self.index; + self.index += 1; + return Some((idx, insn)); + } + + // Current block is exhausted, move to next block + self.current_block_idx += 1; + if self.current_block_idx >= self.blocks.len() { + return None; + } + + // Set up the next block + let next_block = &mut self.blocks[self.current_block_idx]; + new_asm.set_current_block(next_block.id); + self.current_insn_iter = take(&mut next_block.insns).into_iter(); + + // Get first instruction from the new block + let insn = self.current_insn_iter.next()?; let idx = self.index; self.index += 1; Some((idx, insn)) @@ -2451,6 +2841,43 @@ impl Assembler { self.push_insn(Insn::Xor { left, right, out }); out } + + /// This is used for trampolines that don't allow scratch registers. + /// Linearizes all blocks into a single giant block. + pub fn resolve_parallel_mov_pass(self) -> Assembler { + let mut asm_local = Assembler::new(); + asm_local.accept_scratch_reg = self.accept_scratch_reg; + asm_local.stack_base_idx = self.stack_base_idx; + asm_local.label_names = self.label_names.clone(); + asm_local.live_ranges.resize(self.live_ranges.len(), LiveRange { start: None, end: None }); + + // Create one giant block to linearize everything into + asm_local.new_block_without_id(); + + // Get linearized instructions with branch parameters expanded into ParallelMov + let linearized_insns = self.linearize_instructions(); + + // Process each linearized instruction + for insn in linearized_insns { + match insn { + Insn::ParallelMov { moves } => { + // Resolve parallel moves without scratch register + if let Some(resolved_moves) = Assembler::resolve_parallel_moves(&moves, None) { + for (dst, src) in resolved_moves { + asm_local.mov(dst, src); + } + } else { + unreachable!("ParallelMov requires scratch register but scratch_reg is not allowed"); + } + } + _ => { + asm_local.push_insn(insn); + } + } + } + + asm_local + } } /// Macro to use format! for Insn::Comment, which skips a format! call diff --git a/zjit/src/backend/tests.rs b/zjit/src/backend/tests.rs index ece6f8605f..701029b8ec 100644 --- a/zjit/src/backend/tests.rs +++ b/zjit/src/backend/tests.rs @@ -3,10 +3,12 @@ use crate::backend::lir::*; use crate::cruby::*; use crate::codegen::c_callable; use crate::options::rb_zjit_prepare_options; +use crate::hir; #[test] fn test_add() { let mut asm = Assembler::new(); + asm.new_block_without_id(); let out = asm.add(SP, Opnd::UImm(1)); let _ = asm.add(out, Opnd::UImm(2)); } @@ -15,6 +17,7 @@ fn test_add() { fn test_alloc_regs() { rb_zjit_prepare_options(); // for asm.alloc_regs let mut asm = Assembler::new(); + asm.new_block_without_id(); // Get the first output that we're going to reuse later. let out1 = asm.add(EC, Opnd::UImm(1)); @@ -37,7 +40,7 @@ fn test_alloc_regs() { let _ = asm.add(out3, Opnd::UImm(6)); // Here we're going to allocate the registers. - let result = asm.alloc_regs(Assembler::get_alloc_regs()).unwrap(); + let result = &asm.alloc_regs(Assembler::get_alloc_regs()).unwrap().basic_blocks[0]; // Now we're going to verify that the out field has been appropriately // updated for each of the instructions that needs it. 
@@ -63,7 +66,9 @@ fn test_alloc_regs() { fn setup_asm() -> (Assembler, CodeBlock) { rb_zjit_prepare_options(); // for get_option! on asm.compile - (Assembler::new(), CodeBlock::new_dummy()) + let mut asm = Assembler::new(); + asm.new_block_without_id(); + (asm, CodeBlock::new_dummy()) } // Test full codegen pipeline @@ -293,6 +298,7 @@ fn test_no_pos_marker_callback_when_compile_fails() { // We don't want to invoke the pos_marker callbacks with positions of malformed code. let mut asm = Assembler::new(); rb_zjit_prepare_options(); // for asm.compile + asm.new_block_without_id(); // Markers around code to exhaust memory limit let fail_if_called = |_code_ptr, _cb: &_| panic!("pos_marker callback should not be called"); diff --git a/zjit/src/backend/x86_64/mod.rs b/zjit/src/backend/x86_64/mod.rs index 38b9f2791b..a4cf8dfcc5 100644 --- a/zjit/src/backend/x86_64/mod.rs +++ b/zjit/src/backend/x86_64/mod.rs @@ -392,7 +392,7 @@ impl Assembler { /// for VRegs, most splits should happen in [`Self::x86_split`]. However, some instructions /// need to be split with registers after `alloc_regs`, e.g. for `compile_exits`, so /// this splits them and uses scratch registers for it. - pub fn x86_scratch_split(mut self) -> Assembler { + pub fn x86_scratch_split(self) -> Assembler { /// For some instructions, we want to be able to lower a 64-bit operand /// without requiring more registers to be available in the register /// allocator. So we just use the SCRATCH0_OPND register temporarily to hold @@ -468,12 +468,22 @@ impl Assembler { // Prepare StackState to lower MemBase::Stack let stack_state = StackState::new(self.stack_base_idx); - let mut asm_local = Assembler::new_with_asm(&self); + let mut asm_local = Assembler::new(); + asm_local.accept_scratch_reg = true; + asm_local.stack_base_idx = self.stack_base_idx; + asm_local.label_names = self.label_names.clone(); + asm_local.live_ranges.resize(self.live_ranges.len(), LiveRange { start: None, end: None }); + + // Create one giant block to linearize everything into + asm_local.new_block_without_id(); + let asm = &mut asm_local; - asm.accept_scratch_reg = true; - let mut iterator = self.instruction_iterator(); - while let Some((_, mut insn)) = iterator.next(asm) { + // Get linearized instructions with branch parameters expanded into ParallelMov + let linearized_insns = self.linearize_instructions(); + + for insn in linearized_insns.iter() { + let mut insn = insn.clone(); match &mut insn { Insn::Add { left, right, out } | Insn::Sub { left, right, out } | @@ -703,7 +713,10 @@ impl Assembler { // For each instruction let mut insn_idx: usize = 0; - while let Some(insn) = self.insns.get(insn_idx) { + assert_eq!(self.basic_blocks.len(), 1, "Assembler should be linearized into a single block before arm64_emit"); + let insns = &self.basic_blocks[0].insns; + + while let Some(insn) = insns.get(insn_idx) { // Update insn_idx that is shown on panic hook_insn_idx.as_mut().map(|idx| idx.lock().map(|mut idx| *idx = insn_idx).unwrap()); @@ -907,6 +920,7 @@ impl Assembler { match *target { Target::CodePtr(code_ptr) => jmp_ptr(cb, code_ptr), Target::Label(label) => jmp_label(cb, label), + Target::Block(ref edge) => jmp_label(cb, self.block_label(edge.target)), Target::SideExit { .. 
} => unreachable!("Target::SideExit should have been compiled by compile_exits"), } } @@ -915,6 +929,7 @@ impl Assembler { match *target { Target::CodePtr(code_ptr) => je_ptr(cb, code_ptr), Target::Label(label) => je_label(cb, label), + Target::Block(ref edge) => je_label(cb, self.block_label(edge.target)), Target::SideExit { .. } => unreachable!("Target::SideExit should have been compiled by compile_exits"), } } @@ -923,6 +938,7 @@ impl Assembler { match *target { Target::CodePtr(code_ptr) => jne_ptr(cb, code_ptr), Target::Label(label) => jne_label(cb, label), + Target::Block(ref edge) => jne_label(cb, self.block_label(edge.target)), Target::SideExit { .. } => unreachable!("Target::SideExit should have been compiled by compile_exits"), } } @@ -931,6 +947,7 @@ impl Assembler { match *target { Target::CodePtr(code_ptr) => jl_ptr(cb, code_ptr), Target::Label(label) => jl_label(cb, label), + Target::Block(ref edge) => jl_label(cb, self.block_label(edge.target)), Target::SideExit { .. } => unreachable!("Target::SideExit should have been compiled by compile_exits"), } }, @@ -939,6 +956,7 @@ impl Assembler { match *target { Target::CodePtr(code_ptr) => jg_ptr(cb, code_ptr), Target::Label(label) => jg_label(cb, label), + Target::Block(ref edge) => jg_label(cb, self.block_label(edge.target)), Target::SideExit { .. } => unreachable!("Target::SideExit should have been compiled by compile_exits"), } }, @@ -947,6 +965,7 @@ impl Assembler { match *target { Target::CodePtr(code_ptr) => jge_ptr(cb, code_ptr), Target::Label(label) => jge_label(cb, label), + Target::Block(ref edge) => jge_label(cb, self.block_label(edge.target)), Target::SideExit { .. } => unreachable!("Target::SideExit should have been compiled by compile_exits"), } }, @@ -955,6 +974,7 @@ impl Assembler { match *target { Target::CodePtr(code_ptr) => jbe_ptr(cb, code_ptr), Target::Label(label) => jbe_label(cb, label), + Target::Block(ref edge) => jbe_label(cb, self.block_label(edge.target)), Target::SideExit { .. } => unreachable!("Target::SideExit should have been compiled by compile_exits"), } }, @@ -963,6 +983,7 @@ impl Assembler { match *target { Target::CodePtr(code_ptr) => jb_ptr(cb, code_ptr), Target::Label(label) => jb_label(cb, label), + Target::Block(ref edge) => jb_label(cb, self.block_label(edge.target)), Target::SideExit { .. } => unreachable!("Target::SideExit should have been compiled by compile_exits"), } }, @@ -971,6 +992,7 @@ impl Assembler { match *target { Target::CodePtr(code_ptr) => jz_ptr(cb, code_ptr), Target::Label(label) => jz_label(cb, label), + Target::Block(ref edge) => jz_label(cb, self.block_label(edge.target)), Target::SideExit { .. } => unreachable!("Target::SideExit should have been compiled by compile_exits"), } } @@ -979,6 +1001,7 @@ impl Assembler { match *target { Target::CodePtr(code_ptr) => jnz_ptr(cb, code_ptr), Target::Label(label) => jnz_label(cb, label), + Target::Block(ref edge) => jnz_label(cb, self.block_label(edge.target)), Target::SideExit { .. } => unreachable!("Target::SideExit should have been compiled by compile_exits"), } } @@ -988,6 +1011,7 @@ impl Assembler { match *target { Target::CodePtr(code_ptr) => jo_ptr(cb, code_ptr), Target::Label(label) => jo_label(cb, label), + Target::Block(ref edge) => jo_label(cb, self.block_label(edge.target)), Target::SideExit { .. 
} => unreachable!("Target::SideExit should have been compiled by compile_exits"), } } @@ -1052,7 +1076,7 @@ impl Assembler { } else { // No bytes dropped, so the pos markers point to valid code for (insn_idx, pos) in pos_markers { - if let Insn::PosMarker(callback) = self.insns.get(insn_idx).unwrap() { + if let Insn::PosMarker(callback) = insns.get(insn_idx).unwrap() { callback(pos, cb); } else { panic!("non-PosMarker in pos_markers insn_idx={insn_idx} {self:?}"); @@ -1082,6 +1106,10 @@ impl Assembler { if use_scratch_regs { asm = asm.x86_scratch_split(); asm_dump!(asm, scratch_split); + } else { + // For trampolines that use scratch registers, resolve ParallelMov without scratch_reg. + asm = asm.resolve_parallel_mov_pass(); + asm_dump!(asm, resolve_parallel_mov); } // Create label instances in the code block @@ -1115,7 +1143,9 @@ mod tests { fn setup_asm() -> (Assembler, CodeBlock) { rb_zjit_prepare_options(); // for get_option! on asm.compile - (Assembler::new(), CodeBlock::new_dummy()) + let mut asm = Assembler::new(); + asm.new_block_without_id(); + (asm, CodeBlock::new_dummy()) } #[test] @@ -1123,6 +1153,7 @@ mod tests { use crate::hir::SideExitReason; let mut asm = Assembler::new(); + asm.new_block_without_id(); asm.stack_base_idx = 1; let label = asm.new_label("bb0"); diff --git a/zjit/src/codegen.rs b/zjit/src/codegen.rs index a3cf09d7c4..7c867cd6b4 100644 --- a/zjit/src/codegen.rs +++ b/zjit/src/codegen.rs @@ -18,8 +18,8 @@ use crate::state::ZJITState; use crate::stats::{CompileError, exit_counter_for_compile_error, exit_counter_for_unhandled_hir_insn, incr_counter, incr_counter_by, send_fallback_counter, send_fallback_counter_for_method_type, send_fallback_counter_for_super_method_type, send_fallback_counter_ptr_for_opcode, send_without_block_fallback_counter_for_method_type, send_without_block_fallback_counter_for_optimized_method_type}; use crate::stats::{counter_ptr, with_time_stat, Counter, Counter::{compile_time_ns, exit_compile_error}}; use crate::{asm::CodeBlock, cruby::*, options::debug, virtualmem::CodePtr}; -use crate::backend::lir::{self, Assembler, C_ARG_OPNDS, C_RET_OPND, CFP, EC, NATIVE_BASE_PTR, NATIVE_STACK_PTR, Opnd, SP, SideExit, Target, asm_ccall, asm_comment}; -use crate::hir::{iseq_to_hir, BlockId, BranchEdge, Invariant, RangeType, SideExitReason::{self, *}, SpecialBackrefSymbol, SpecialObjectType}; +use crate::backend::lir::{self, Assembler, C_ARG_OPNDS, C_RET_OPND, CFP, EC, NATIVE_STACK_PTR, Opnd, SP, SideExit, Target, asm_ccall, asm_comment}; +use crate::hir::{iseq_to_hir, BlockId, Invariant, RangeType, SideExitReason::{self, *}, SpecialBackrefSymbol, SpecialObjectType}; use crate::hir::{Const, FrameState, Function, Insn, InsnId, SendFallbackReason}; use crate::hir_type::{types, Type}; use crate::options::get_option; @@ -75,12 +75,17 @@ impl JITState { } /// Find or create a label for a given BlockId - fn get_label(&mut self, asm: &mut Assembler, block_id: BlockId) -> Target { - match &self.labels[block_id.0] { + fn get_label(&mut self, asm: &mut Assembler, lir_block_id: lir::BlockId, hir_block_id: BlockId) -> Target { + // Extend labels vector if the requested index is out of bounds + if lir_block_id.0 >= self.labels.len() { + self.labels.resize(lir_block_id.0 + 1, None); + } + + match &self.labels[lir_block_id.0] { Some(label) => label.clone(), None => { - let label = asm.new_label(&format!("{block_id}")); - self.labels[block_id.0] = Some(label.clone()); + let label = asm.new_label(&format!("{hir_block_id}_{lir_block_id}")); + self.labels[lir_block_id.0] 
= Some(label.clone()); label } } @@ -176,6 +181,7 @@ fn register_with_perf(iseq_name: String, start_ptr: usize, code_size: usize) { pub fn gen_entry_trampoline(cb: &mut CodeBlock) -> Result<CodePtr, CompileError> { // Set up registers for CFP, EC, SP, and basic block arguments let mut asm = Assembler::new(); + asm.new_block_without_id(); gen_entry_prologue(&mut asm); // Jump to the first block using a call instruction. This trampoline is used @@ -264,11 +270,28 @@ fn gen_function(cb: &mut CodeBlock, iseq: IseqPtr, version: IseqVersionRef, func let mut jit = JITState::new(iseq, version, function.num_insns(), function.num_blocks()); let mut asm = Assembler::new_with_stack_slots(num_spilled_params); - // Compile each basic block + // Mapping from HIR block IDs to LIR block IDs. + // This is is a one-to-one mapping from HIR to LIR blocks used for finding + // jump targets in LIR (LIR should always jump to the head of an HIR block) + let mut hir_to_lir: Vec<Option<lir::BlockId>> = vec![None; function.num_blocks()]; + let reverse_post_order = function.rpo(); - for &block_id in reverse_post_order.iter() { + + // Create all LIR basic blocks corresponding to HIR basic blocks + for (rpo_idx, &block_id) in reverse_post_order.iter().enumerate() { + let lir_block_id = asm.new_block(block_id, function.is_entry_block(block_id), rpo_idx); + hir_to_lir[block_id.0] = Some(lir_block_id); + } + + // Compile each basic block + for (rpo_idx, &block_id) in reverse_post_order.iter().enumerate() { + // Set the current block to the LIR block that corresponds to this + // HIR block. + let lir_block_id = hir_to_lir[block_id.0].unwrap(); + asm.set_current_block(lir_block_id); + // Write a label to jump to the basic block - let label = jit.get_label(&mut asm, block_id); + let label = jit.get_label(&mut asm, lir_block_id, block_id); asm.write_label(label); let block = function.block(block_id); @@ -291,15 +314,73 @@ fn gen_function(cb: &mut CodeBlock, iseq: IseqPtr, version: IseqVersionRef, func // Compile all instructions for &insn_id in block.insns() { let insn = function.find(insn_id); - if let Err(last_snapshot) = gen_insn(cb, &mut jit, &mut asm, function, insn_id, &insn) { - debug!("ZJIT: gen_function: Failed to compile insn: {insn_id} {insn}. Generating side-exit."); - gen_incr_counter(&mut asm, exit_counter_for_unhandled_hir_insn(&insn)); - gen_side_exit(&mut jit, &mut asm, &SideExitReason::UnhandledHIRInsn(insn_id), &function.frame_state(last_snapshot)); - // Don't bother generating code after a side-exit. We won't run it. - // TODO(max): Generate ud2 or equivalent. 
- break; - }; - // It's fine; we generated the instruction + match insn { + Insn::IfFalse { val, target } => { + let val_opnd = jit.get_opnd(val); + + let lir_target = hir_to_lir[target.target.0].unwrap(); + + let fall_through_target = asm.new_block(block_id, false, rpo_idx); + + let branch_edge = lir::BranchEdge { + target: lir_target, + args: target.args.iter().map(|insn_id| jit.get_opnd(*insn_id)).collect() + }; + + let fall_through_edge = lir::BranchEdge { + target: fall_through_target, + args: vec![] + }; + + gen_if_false(&mut asm, val_opnd, branch_edge, fall_through_edge); + asm.set_current_block(fall_through_target); + + let label = jit.get_label(&mut asm, fall_through_target, block_id); + asm.write_label(label); + }, + Insn::IfTrue { val, target } => { + let val_opnd = jit.get_opnd(val); + + let lir_target = hir_to_lir[target.target.0].unwrap(); + + let fall_through_target = asm.new_block(block_id, false, rpo_idx); + + let branch_edge = lir::BranchEdge { + target: lir_target, + args: target.args.iter().map(|insn_id| jit.get_opnd(*insn_id)).collect() + }; + + let fall_through_edge = lir::BranchEdge { + target: fall_through_target, + args: vec![] + }; + + gen_if_true(&mut asm, val_opnd, branch_edge, fall_through_edge); + asm.set_current_block(fall_through_target); + + let label = jit.get_label(&mut asm, fall_through_target, block_id); + asm.write_label(label); + } + Insn::Jump(target) => { + let lir_target = hir_to_lir[target.target.0].unwrap(); + let branch_edge = lir::BranchEdge { + target: lir_target, + args: target.args.iter().map(|insn_id| jit.get_opnd(*insn_id)).collect() + }; + gen_jump(&mut asm, branch_edge); + }, + _ => { + if let Err(last_snapshot) = gen_insn(cb, &mut jit, &mut asm, function, insn_id, &insn) { + debug!("ZJIT: gen_function: Failed to compile insn: {insn_id} {insn}. Generating side-exit."); + gen_incr_counter(&mut asm, exit_counter_for_unhandled_hir_insn(&insn)); + gen_side_exit(&mut jit, &mut asm, &SideExitReason::UnhandledHIRInsn(insn_id), &function.frame_state(last_snapshot)); + // Don't bother generating code after a side-exit. We won't run it. + // TODO(max): Generate ud2 or equivalent. + break; + }; + // It's fine; we generated the instruction + } + } } // Make sure the last patch point has enough space to insert a jump asm.pad_patch_point(); @@ -395,9 +476,6 @@ fn gen_insn(cb: &mut CodeBlock, jit: &mut JITState, asm: &mut Assembler, functio Insn::ToRegexp { opt, values, state } => gen_toregexp(jit, asm, *opt, opnds!(values), &function.frame_state(*state)), Insn::Param => unreachable!("block.insns should not have Insn::Param"), Insn::Snapshot { .. } => return Ok(()), // we don't need to do anything for this instruction at the moment - Insn::Jump(branch) => no_output!(gen_jump(jit, asm, branch)), - Insn::IfTrue { val, target } => no_output!(gen_if_true(jit, asm, opnd!(val), target)), - Insn::IfFalse { val, target } => no_output!(gen_if_false(jit, asm, opnd!(val), target)), &Insn::Send { cd, blockiseq, state, reason, .. } => gen_send(jit, asm, cd, blockiseq, &function.frame_state(state), reason), &Insn::SendForward { cd, blockiseq, state, reason, .. } => gen_send_forward(jit, asm, cd, blockiseq, &function.frame_state(state), reason), &Insn::SendWithoutBlock { cd, state, reason, .. } => gen_send_without_block(jit, asm, cd, &function.frame_state(state), reason), @@ -511,6 +589,8 @@ fn gen_insn(cb: &mut CodeBlock, jit: &mut JITState, asm: &mut Assembler, functio &Insn::ArrayMax { state, .. } | &Insn::Throw { state, .. 
} => return Err(state), + &Insn::IfFalse { .. } | Insn::IfTrue { .. } + | &Insn::Jump { .. } => unreachable!(), }; assert!(insn.has_output(), "Cannot write LIR output of HIR instruction with no output: {insn}"); @@ -1190,18 +1270,6 @@ fn gen_entry_prologue(asm: &mut Assembler) { asm.mov(SP, Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP)); } -/// Set branch params to basic block arguments -fn gen_branch_params(jit: &mut JITState, asm: &mut Assembler, branch: &BranchEdge) { - if branch.args.is_empty() { - return; - } - - asm_comment!(asm, "set branch params: {}", branch.args.len()); - asm.parallel_mov(branch.args.iter().enumerate().map(|(idx, &arg)| - (param_opnd(idx), jit.get_opnd(arg)) - ).collect()); -} - /// Compile a constant fn gen_const_value(val: VALUE) -> lir::Opnd { // Just propagate the constant value and generate nothing @@ -1228,7 +1296,7 @@ fn gen_const_uint32(val: u32) -> lir::Opnd { /// Compile a basic block argument fn gen_param(asm: &mut Assembler, idx: usize) -> lir::Opnd { // Allocate a register or a stack slot - match param_opnd(idx) { + match Assembler::param_opnd(idx) { // If it's a register, insert LiveReg instruction to reserve the register // in the register pool for register allocation. param @ Opnd::Reg(_) => asm.live_reg_opnd(param), @@ -1237,45 +1305,25 @@ fn gen_param(asm: &mut Assembler, idx: usize) -> lir::Opnd { } /// Compile a jump to a basic block -fn gen_jump(jit: &mut JITState, asm: &mut Assembler, branch: &BranchEdge) { - // Set basic block arguments - gen_branch_params(jit, asm, branch); - +fn gen_jump(asm: &mut Assembler, branch: lir::BranchEdge) { // Jump to the basic block - let target = jit.get_label(asm, branch.target); - asm.jmp(target); + asm.jmp(Target::Block(branch)); } /// Compile a conditional branch to a basic block -fn gen_if_true(jit: &mut JITState, asm: &mut Assembler, val: lir::Opnd, branch: &BranchEdge) { +fn gen_if_true(asm: &mut Assembler, val: lir::Opnd, branch: lir::BranchEdge, fall_through: lir::BranchEdge) { // If val is zero, move on to the next instruction. - let if_false = asm.new_label("if_false"); asm.test(val, val); - asm.jz(if_false.clone()); - - // If val is not zero, set basic block arguments and jump to the branch target. - // TODO: Consider generating the loads out-of-line - let if_true = jit.get_label(asm, branch.target); - gen_branch_params(jit, asm, branch); - asm.jmp(if_true); - - asm.write_label(if_false); + asm.jz(Target::Block(fall_through)); + asm.jmp(Target::Block(branch)); } /// Compile a conditional branch to a basic block -fn gen_if_false(jit: &mut JITState, asm: &mut Assembler, val: lir::Opnd, branch: &BranchEdge) { +fn gen_if_false(asm: &mut Assembler, val: lir::Opnd, branch: lir::BranchEdge, fall_through: lir::BranchEdge) { // If val is not zero, move on to the next instruction. - let if_true = asm.new_label("if_true"); asm.test(val, val); - asm.jnz(if_true.clone()); - - // If val is zero, set basic block arguments and jump to the branch target. 
- // TODO: Consider generating the loads out-of-line - let if_false = jit.get_label(asm, branch.target); - gen_branch_params(jit, asm, branch); - asm.jmp(if_false); - - asm.write_label(if_true); + asm.jnz(Target::Block(fall_through)); + asm.jmp(Target::Block(branch)); } /// Compile a dynamic dispatch with block @@ -2411,19 +2459,6 @@ fn gen_stack_overflow_check(jit: &mut JITState, asm: &mut Assembler, state: &Fra asm.jbe(side_exit(jit, state, StackOverflow)); } -/// Return an operand we use for the basic block argument at a given index -fn param_opnd(idx: usize) -> Opnd { - // To simplify the implementation, allocate a fixed register or a stack slot for each basic block argument for now. - // Note that this is implemented here as opposed to automatically inside LIR machineries. - // TODO: Allow allocating arbitrary registers for basic block arguments - if idx < ALLOC_REGS.len() { - Opnd::Reg(ALLOC_REGS[idx]) - } else { - // With FrameSetup, the address that NATIVE_BASE_PTR points to stores an old value in the register. - // To avoid clobbering it, we need to start from the next slot, hence `+ 1` for the index. - Opnd::mem(64, NATIVE_BASE_PTR, (idx - ALLOC_REGS.len() + 1) as i32 * -SIZEOF_VALUE_I32) - } -} /// Inverse of ep_offset_to_local_idx(). See ep_offset_to_local_idx() for details. pub fn local_idx_to_ep_offset(iseq: IseqPtr, local_idx: usize) -> i32 { @@ -2618,6 +2653,7 @@ fn function_stub_hit_body(cb: &mut CodeBlock, iseq_call: &IseqCallRef) -> Result /// Compile a stub for an ISEQ called by SendWithoutBlockDirect fn gen_function_stub(cb: &mut CodeBlock, iseq_call: IseqCallRef) -> Result<CodePtr, CompileError> { let (mut asm, scratch_reg) = Assembler::new_with_scratch_reg(); + asm.new_block_without_id(); asm_comment!(asm, "Stub: {}", iseq_get_location(iseq_call.iseq.get(), 0)); // Call function_stub_hit using the shared trampoline. See `gen_function_stub_hit_trampoline`. @@ -2635,6 +2671,7 @@ fn gen_function_stub(cb: &mut CodeBlock, iseq_call: IseqCallRef) -> Result<CodeP /// See [gen_function_stub] for how it's used. pub fn gen_function_stub_hit_trampoline(cb: &mut CodeBlock) -> Result<CodePtr, CompileError> { let (mut asm, scratch_reg) = Assembler::new_with_scratch_reg(); + asm.new_block_without_id(); asm_comment!(asm, "function_stub_hit trampoline"); // Maintain alignment for x86_64, and set up a frame for arm64 properly @@ -2693,6 +2730,7 @@ pub fn gen_function_stub_hit_trampoline(cb: &mut CodeBlock) -> Result<CodePtr, C /// Generate a trampoline that is used when a function exits without restoring PC and the stack pub fn gen_exit_trampoline(cb: &mut CodeBlock) -> Result<CodePtr, CompileError> { let mut asm = Assembler::new(); + asm.new_block_without_id(); asm_comment!(asm, "side-exit trampoline"); asm.frame_teardown(&[]); // matching the setup in gen_entry_point() @@ -2707,6 +2745,7 @@ pub fn gen_exit_trampoline(cb: &mut CodeBlock) -> Result<CodePtr, CompileError> /// Generate a trampoline that increments exit_compilation_failure and jumps to exit_trampoline. 
pub fn gen_exit_trampoline_with_counter(cb: &mut CodeBlock, exit_trampoline: CodePtr) -> Result<CodePtr, CompileError> { let mut asm = Assembler::new(); + asm.new_block_without_id(); asm_comment!(asm, "function stub exit trampoline"); gen_incr_counter(&mut asm, exit_compile_error); @@ -2915,6 +2954,7 @@ impl IseqCall { fn regenerate(&self, cb: &mut CodeBlock, callback: impl Fn(&mut Assembler)) { cb.with_write_ptr(self.start_addr.get().unwrap(), |cb| { let mut asm = Assembler::new(); + asm.new_block_without_id(); callback(&mut asm); asm.compile(cb).unwrap(); assert_eq!(self.end_addr.get().unwrap(), cb.get_write_ptr()); diff --git a/zjit/src/hir.rs b/zjit/src/hir.rs index fc49dc0c3c..6bbf477c94 100644 --- a/zjit/src/hir.rs +++ b/zjit/src/hir.rs @@ -4702,6 +4702,10 @@ impl Function { entry_blocks } + pub fn is_entry_block(&self, block_id: BlockId) -> bool { + self.entry_block == block_id || self.jit_entry_blocks.contains(&block_id) + } + /// Return a traversal of the `Function`'s `BlockId`s in reverse post-order. pub fn rpo(&self) -> Vec<BlockId> { let mut result = self.po_from(self.entry_blocks()); diff --git a/zjit/src/invariants.rs b/zjit/src/invariants.rs index d183eb18ab..f1180acf2a 100644 --- a/zjit/src/invariants.rs +++ b/zjit/src/invariants.rs @@ -16,6 +16,7 @@ macro_rules! compile_patch_points { for patch_point in $patch_points { let written_range = $cb.with_write_ptr(patch_point.patch_point_ptr, |cb| { let mut asm = Assembler::new(); + asm.new_block_without_id(); asm_comment!(asm, $($comment_args)*); asm.jmp(patch_point.side_exit_ptr.into()); asm.compile(cb).expect("can write existing code"); diff --git a/zjit/src/options.rs b/zjit/src/options.rs index 40b49146b7..9121e49bff 100644 --- a/zjit/src/options.rs +++ b/zjit/src/options.rs @@ -180,6 +180,8 @@ pub enum DumpLIR { alloc_regs, /// Dump LIR after compile_exits compile_exits, + /// Dump LIR after resolve_parallel_mov + resolve_parallel_mov, /// Dump LIR after {arch}_scratch_split scratch_split, } @@ -190,6 +192,7 @@ const DUMP_LIR_ALL: &[DumpLIR] = &[ DumpLIR::split, DumpLIR::alloc_regs, DumpLIR::compile_exits, + DumpLIR::resolve_parallel_mov, DumpLIR::scratch_split, ]; |
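Note on the lowering above: the codegen.rs changes turn HIR Jump/IfTrue/IfFalse into LIR branches carrying Target::Block(BranchEdge), and linearize_instructions() later expands each edge's arguments into a ParallelMov into fixed parameter operands before the jump. The following is a minimal standalone sketch of that expansion idea; Opnd, Insn, param_opnd and NUM_PARAM_REGS here are simplified stand-ins for illustration, not the patch's real lir types.

#[derive(Clone, Copy, Debug, PartialEq)]
enum Opnd {
    Reg(u8),        // fixed parameter register (stand-in for ALLOC_REGS[idx])
    StackSlot(i32), // native-stack slot for parameter indices past the registers
    VReg(usize),    // a virtual register produced earlier in the block
}

#[derive(Debug)]
enum Insn {
    ParallelMov(Vec<(Opnd, Opnd)>), // (dst, src) pairs executed "at once"
    Jmp(usize),                     // jump to a label index
}

const NUM_PARAM_REGS: usize = 3; // stand-in for ALLOC_REGS.len()

/// Stand-in for Assembler::param_opnd: fixed register for the first few block
/// parameters, then stack slots below the frame base (skipping the base slot).
fn param_opnd(idx: usize) -> Opnd {
    if idx < NUM_PARAM_REGS {
        Opnd::Reg(idx as u8)
    } else {
        Opnd::StackSlot(-8 * (idx - NUM_PARAM_REGS + 1) as i32)
    }
}

/// Expand a branch edge `target_label(args...)` into linear instructions.
fn expand_branch(target_label: usize, args: &[Opnd], out: &mut Vec<Insn>) {
    if !args.is_empty() {
        let moves = args
            .iter()
            .enumerate()
            .map(|(idx, &arg)| (param_opnd(idx), arg))
            .collect();
        out.push(Insn::ParallelMov(moves));
    }
    out.push(Insn::Jmp(target_label));
}

fn main() {
    let mut insns = Vec::new();
    // A jump to block label 7 passing four arguments: three go to registers,
    // the fourth spills to a stack slot.
    let args = [Opnd::VReg(0), Opnd::VReg(1), Opnd::VReg(2), Opnd::VReg(3)];
    expand_branch(7, &args, &mut insns);
    for insn in &insns {
        println!("{insn:?}");
    }
}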
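The new resolve_parallel_mov_pass() above depends on resolve_parallel_moves() returning None when a ParallelMov cannot be serialized without a scratch register. One common way to make that decision is sketched below under simplified assumptions (registers as plain u8, no swap or scratch fallback); the patch's actual resolve_parallel_moves may differ.

/// Serialize a set of parallel (dst <- src) moves into sequential moves, or
/// return None if the remaining moves form a cycle (a scratch would be needed).
fn resolve_parallel_moves(moves: &[(u8, u8)]) -> Option<Vec<(u8, u8)>> {
    // Drop no-op self moves up front.
    let mut pending: Vec<(u8, u8)> =
        moves.iter().copied().filter(|&(dst, src)| dst != src).collect();
    let mut ordered = Vec::with_capacity(pending.len());
    while !pending.is_empty() {
        // Pick a move whose destination is no longer needed as a source;
        // if none exists, every remaining destination is still read later,
        // i.e. the moves form a cycle, so give up.
        let idx = pending
            .iter()
            .position(|&(dst, _)| !pending.iter().any(|&(_, src)| src == dst))?;
        ordered.push(pending.swap_remove(idx));
    }
    Some(ordered)
}

fn main() {
    // r0 <- r1, r1 <- r2: orderable without a scratch register.
    assert_eq!(resolve_parallel_moves(&[(0, 1), (1, 2)]), Some(vec![(0, 1), (1, 2)]));
    // r0 <- r1, r1 <- r0: a cycle, so a scratch register (or swap) is required.
    assert_eq!(resolve_parallel_moves(&[(0, 1), (1, 0)]), None);
}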
