1 files changed, 6112 insertions, 3156 deletions
diff --git a/yjit/src/codegen.rs b/yjit/src/codegen.rs
index 6da3b1fc68..961d6438e3 100644
--- a/yjit/src/codegen.rs
+++ b/yjit/src/codegen.rs
@@ -12,12 +12,17 @@ use crate::utils::*;
 use CodegenStatus::*;
 use YARVOpnd::*;
 
+use std::cell::Cell;
 use std::cmp;
+use std::cmp::min;
 use std::collections::HashMap;
+use std::ffi::c_void;
 use std::ffi::CStr;
-use std::mem::{self, size_of};
-use std::os::raw::{c_int, c_uint};
+use std::mem;
+use std::os::raw::c_int;
 use std::ptr;
+use std::rc::Rc;
+use std::cell::RefCell;
 use std::slice;
 
 pub use crate::virtualmem::CodePtr;
@@ -26,67 +31,115 @@ pub use crate::virtualmem::CodePtr;
 #[derive(PartialEq, Debug)]
 enum CodegenStatus {
     KeepCompiling,
-    CantCompile,
     EndBlock,
 }
 
 /// Code generation function signature
 type InsnGenFn = fn(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus;
+) -> Option<CodegenStatus>;
 
-/// Code generation state
-/// This struct only lives while code is being generated
+/// Ephemeral code generation state.
+/// Represents a [core::Block] while we build it.
 pub struct JITState {
-    // Block version being compiled
-    block: BlockRef,
+    /// Instruction sequence for the compiling block
+    pub iseq: IseqPtr,
 
-    // Instruction sequence this is associated with
-    iseq: IseqPtr,
+    /// The iseq index of the first instruction in the block
+    starting_insn_idx: IseqIdx,
+
+    /// The [Context] entering into the first instruction of the block
+    starting_ctx: Context,
+
+    /// The placement for the machine code of the [Block]
+    output_ptr: CodePtr,
 
-    // Index of the current instruction being compiled
-    insn_idx: u32,
+    /// Index of the current instruction being compiled
+    insn_idx: IseqIdx,
 
-    // Opcode for the instruction being compiled
+    /// Opcode for the instruction being compiled
     opcode: usize,
 
-    // PC of the instruction being compiled
+    /// PC of the instruction being compiled
     pc: *mut VALUE,
 
-    // Side exit to the instruction being compiled. See :side-exit:.
-    side_exit_for_pc: Option<CodePtr>,
+    /// stack_size when it started to compile the current instruction.
+    stack_size_for_pc: u8,
+
+    /// Execution context when compilation started
+    /// This allows us to peek at run-time values
+    ec: EcPtr,
+
+    /// The outgoing branches the block will have
+    pub pending_outgoing: Vec<PendingBranchRef>,
+
+    // --- Fields for block invalidation and invariants tracking below:
+    // Public mostly so into_block defined in the sibling module core
+    // can partially move out of Self.
+
+    /// Whether we need to record the code address at
+    /// the end of this bytecode instruction for global invalidation
+    pub record_boundary_patch_point: bool,
+
+    /// Code for immediately exiting upon entry to the block.
+    /// Required for invalidation.
+    pub block_entry_exit: Option<CodePtr>,
+
+    /// A list of callable method entries that must be valid for the block to be valid.
+    pub method_lookup_assumptions: Vec<CmePtr>,
 
-    // Execution context when compilation started
-    // This allows us to peek at run-time values
-    ec: Option<EcPtr>,
+    /// A list of basic operators that must not be redefined for the block to be valid.
+    pub bop_assumptions: Vec<(RedefinitionFlag, ruby_basic_operators)>,
 
-    // Whether we need to record the code address at
-    // the end of this bytecode instruction for global invalidation
-    record_boundary_patch_point: bool,
+    /// A list of constant expression path segments that must
+    /// not have been written to for the block to be valid.
+    pub stable_constant_names_assumption: Option<*const ID>,
+
+    /// A list of classes that are not supposed to have a singleton class.
+    pub no_singleton_class_assumptions: Vec<VALUE>,
+
+    /// When true, the block is valid only when base pointer is equal to environment pointer.
+    pub no_ep_escape: bool,
+
+    /// When true, the block is valid only when there is a total of one ractor running
+    pub block_assumes_single_ractor: bool,
+
+    /// Address range for Linux perf's [JIT interface](https://github.com/torvalds/linux/blob/master/tools/perf/Documentation/jit-interface.txt)
+    perf_map: Rc::<RefCell::<Vec<(CodePtr, Option<CodePtr>, String)>>>,
+
+    /// Stack of symbol names for --yjit-perf
+    perf_stack: Vec<String>,
 }
 
 impl JITState {
-    pub fn new(blockref: &BlockRef) -> Self {
+    pub fn new(blockid: BlockId, starting_ctx: Context, output_ptr: CodePtr, ec: EcPtr) -> Self {
         JITState {
-            block: blockref.clone(),
-            iseq: ptr::null(), // TODO: initialize this from the blockid
+            iseq: blockid.iseq,
+            starting_insn_idx: blockid.idx,
+            starting_ctx,
+            output_ptr,
             insn_idx: 0,
             opcode: 0,
             pc: ptr::null_mut::<VALUE>(),
-            side_exit_for_pc: None,
-            ec: None,
+            stack_size_for_pc: starting_ctx.get_stack_size(),
+            pending_outgoing: vec![],
+            ec,
             record_boundary_patch_point: false,
+            block_entry_exit: None,
+            method_lookup_assumptions: vec![],
+            bop_assumptions: vec![],
+            stable_constant_names_assumption: None,
+            no_singleton_class_assumptions: vec![],
+            no_ep_escape: false,
+            block_assumes_single_ractor: false,
+            perf_map: Rc::default(),
+            perf_stack: vec![],
         }
     }
 
-    pub fn get_block(&self) -> BlockRef {
-        self.block.clone()
-    }
-
-    pub fn get_insn_idx(&self) -> u32 {
+    pub fn get_insn_idx(&self) -> IseqIdx {
         self.insn_idx
     }
 
@@ -101,124 +154,313 @@ impl JITState {
     pub fn get_pc(self: &JITState) -> *mut VALUE {
         self.pc
     }
-}
 
-use crate::codegen::JCCKinds::*;
+    pub fn get_starting_insn_idx(&self) -> IseqIdx {
+        self.starting_insn_idx
+    }
 
-#[allow(non_camel_case_types, unused)]
-pub enum JCCKinds {
-    JCC_JNE,
-    JCC_JNZ,
-    JCC_JZ,
-    JCC_JE,
-    JCC_JBE,
-    JCC_JNA,
-}
+    pub fn get_block_entry_exit(&self) -> Option<CodePtr> {
+        self.block_entry_exit
+    }
 
-pub fn jit_get_arg(jit: &JITState, arg_idx: isize) -> VALUE {
-    // insn_len require non-test config
-    #[cfg(not(test))]
-    assert!(insn_len(jit.get_opcode()) > (arg_idx + 1).try_into().unwrap());
-    unsafe { *(jit.pc.offset(arg_idx + 1)) }
-}
+    pub fn get_starting_ctx(&self) -> Context {
+        self.starting_ctx
+    }
 
-// Get the index of the next instruction
-fn jit_next_insn_idx(jit: &JITState) -> u32 {
-    jit.insn_idx + insn_len(jit.get_opcode())
-}
+    pub fn get_arg(&self, arg_idx: isize) -> VALUE {
+        // insn_len require non-test config
+        #[cfg(not(test))]
+        assert!(insn_len(self.get_opcode()) > (arg_idx + 1).try_into().unwrap());
+        unsafe { *(self.pc.offset(arg_idx + 1)) }
+    }
+
+    /// Return true if the current ISEQ could escape an environment.
+    ///
+    /// As of vm_push_frame(), EP is always equal to BP. However, after pushing
+    /// a frame, some ISEQ setups call vm_bind_update_env(), which redirects EP.
+    /// Also, some method calls escape the environment to the heap.
+    fn escapes_ep(&self) -> bool {
+        match unsafe { get_iseq_body_type(self.iseq) } {
+            // <main> frame is always associated to TOPLEVEL_BINDING.
+            ISEQ_TYPE_MAIN |
+            // Kernel#eval uses a heap EP when a Binding argument is not nil.
+            ISEQ_TYPE_EVAL => true,
+            // If this ISEQ has previously escaped EP, give up the optimization.
+            _ if iseq_escapes_ep(self.iseq) => true,
+            _ => false,
+        }
+    }
 
-// Check if we are compiling the instruction at the stub PC
-// Meaning we are compiling the instruction that is next to execute
-fn jit_at_current_insn(jit: &JITState) -> bool {
-    let ec_pc: *mut VALUE = unsafe { get_cfp_pc(get_ec_cfp(jit.ec.unwrap())) };
-    ec_pc == jit.pc
-}
+    // Get the index of the next instruction
+    fn next_insn_idx(&self) -> u16 {
+        self.insn_idx + insn_len(self.get_opcode()) as u16
+    }
 
-// Peek at the nth topmost value on the Ruby stack.
-// Returns the topmost value when n == 0.
-fn jit_peek_at_stack(jit: &JITState, ctx: &Context, n: isize) -> VALUE {
-    assert!(jit_at_current_insn(jit));
-    assert!(n < ctx.get_stack_size() as isize);
+    /// Get the index of the instruction after the next instruction
+    fn next_next_insn_idx(&self) -> u16 {
+        let next_pc = unsafe { rb_iseq_pc_at_idx(self.iseq, self.next_insn_idx().into()) };
+        let next_opcode: usize = unsafe { rb_iseq_opcode_at_pc(self.iseq, next_pc) }.try_into().unwrap();
+        self.next_insn_idx() + insn_len(next_opcode) as u16
+    }
 
-    // Note: this does not account for ctx->sp_offset because
-    // this is only available when hitting a stub, and while
-    // hitting a stub, cfp->sp needs to be up to date in case
-    // codegen functions trigger GC. See :stub-sp-flush:.
-    return unsafe {
-        let sp: *mut VALUE = get_cfp_sp(get_ec_cfp(jit.ec.unwrap()));
+    // Check if we are compiling the instruction at the stub PC
+    // Meaning we are compiling the instruction that is next to execute
+    pub fn at_current_insn(&self) -> bool {
+        let ec_pc: *mut VALUE = unsafe { get_cfp_pc(self.get_cfp()) };
+        ec_pc == self.pc
+    }
 
-        *(sp.offset(-1 - n))
-    };
-}
+    // Peek at the nth topmost value on the Ruby stack.
+    // Returns the topmost value when n == 0.
+    pub fn peek_at_stack(&self, ctx: &Context, n: isize) -> VALUE {
+        assert!(self.at_current_insn());
+        assert!(n < ctx.get_stack_size() as isize);
 
-fn jit_peek_at_self(jit: &JITState) -> VALUE {
-    unsafe { get_cfp_self(get_ec_cfp(jit.ec.unwrap())) }
-}
+        // Note: this does not account for ctx->sp_offset because
+        // this is only available when hitting a stub, and while
+        // hitting a stub, cfp->sp needs to be up to date in case
+        // codegen functions trigger GC. See :stub-sp-flush:.
+        return unsafe {
+            let sp: *mut VALUE = get_cfp_sp(self.get_cfp());
 
-fn jit_peek_at_local(jit: &JITState, n: i32) -> VALUE {
-    assert!(jit_at_current_insn(jit));
+            *(sp.offset(-1 - n))
+        };
+    }
 
-    let local_table_size: isize = unsafe { get_iseq_body_local_table_size(jit.iseq) }
-        .try_into()
-        .unwrap();
-    assert!(n < local_table_size.try_into().unwrap());
+    fn peek_at_self(&self) -> VALUE {
+        unsafe { get_cfp_self(self.get_cfp()) }
+    }
 
-    unsafe {
-        let ep = get_cfp_ep(get_ec_cfp(jit.ec.unwrap()));
-        let n_isize: isize = n.try_into().unwrap();
-        let offs: isize = -(VM_ENV_DATA_SIZE as isize) - local_table_size + n_isize + 1;
-        *ep.offset(offs)
+    fn peek_at_local(&self, n: i32) -> VALUE {
+        assert!(self.at_current_insn());
+
+        let local_table_size: isize = unsafe { get_iseq_body_local_table_size(self.iseq) }
+            .try_into()
+            .unwrap();
+        assert!(n < local_table_size.try_into().unwrap());
+
+        unsafe {
+            let ep = get_cfp_ep(self.get_cfp());
+            let n_isize: isize = n.try_into().unwrap();
+            let offs: isize = -(VM_ENV_DATA_SIZE as isize) - local_table_size + n_isize + 1;
+            *ep.offset(offs)
+        }
     }
-}
 
-fn jit_peek_at_block_handler(jit: &JITState, level: u32) -> VALUE {
-    assert!(jit_at_current_insn(jit));
+    fn peek_at_block_handler(&self, level: u32) -> VALUE {
+        assert!(self.at_current_insn());
 
-    unsafe {
-        let ep = get_cfp_ep_level(get_ec_cfp(jit.ec.unwrap()), level);
-        *ep.offset(VM_ENV_DATA_INDEX_SPECVAL as isize)
+        unsafe {
+            let ep = get_cfp_ep_level(self.get_cfp(), level);
+            *ep.offset(VM_ENV_DATA_INDEX_SPECVAL as isize)
+        }
+    }
+
+    pub fn assume_expected_cfunc(
+        &mut self,
+        asm: &mut Assembler,
+        ocb: &mut OutlinedCb,
+        class: VALUE,
+        method: ID,
+        cfunc: *mut c_void,
+    ) -> bool {
+        let cme = unsafe { rb_callable_method_entry(class, method) };
+
+        if cme.is_null() {
+            return false;
+        }
+
+        let def_type = unsafe { get_cme_def_type(cme) };
+        if def_type != VM_METHOD_TYPE_CFUNC {
+            return false;
+        }
+        if unsafe { get_mct_func(get_cme_def_body_cfunc(cme)) } != cfunc {
+            return false;
+        }
+
+        self.assume_method_lookup_stable(asm, ocb, cme);
+
+        true
+    }
+
+    pub fn assume_method_lookup_stable(&mut self, asm: &mut Assembler, ocb: &mut OutlinedCb, cme: CmePtr) -> Option<()> {
+        jit_ensure_block_entry_exit(self, asm, ocb)?;
+        self.method_lookup_assumptions.push(cme);
+
+        Some(())
+    }
+
+    /// Assume that objects of a given class will have no singleton class.
+    /// Return true if there has been no such singleton class since boot
+    /// and we can safely invalidate it.
+    pub fn assume_no_singleton_class(&mut self, asm: &mut Assembler, ocb: &mut OutlinedCb, klass: VALUE) -> bool {
+        if jit_ensure_block_entry_exit(self, asm, ocb).is_none() {
+            return false; // out of space, give up
+        }
+        if has_singleton_class_of(klass) {
+            return false; // we've seen a singleton class. disable the optimization to avoid an invalidation loop.
+        }
+        self.no_singleton_class_assumptions.push(klass);
+        true
+    }
+
+    /// Assume that base pointer is equal to environment pointer in the current ISEQ.
+    /// Return true if it's safe to assume so.
+    fn assume_no_ep_escape(&mut self, asm: &mut Assembler, ocb: &mut OutlinedCb) -> bool {
+        if jit_ensure_block_entry_exit(self, asm, ocb).is_none() {
+            return false; // out of space, give up
+        }
+        if self.escapes_ep() {
+            return false; // EP has been escaped in this ISEQ. disable the optimization to avoid an invalidation loop.
+        }
+        self.no_ep_escape = true;
+        true
+    }
+
+    fn get_cfp(&self) -> *mut rb_control_frame_struct {
+        unsafe { get_ec_cfp(self.ec) }
+    }
+
+    pub fn assume_stable_constant_names(&mut self, asm: &mut Assembler, ocb: &mut OutlinedCb, id: *const ID) -> Option<()> {
+        jit_ensure_block_entry_exit(self, asm, ocb)?;
+        self.stable_constant_names_assumption = Some(id);
+
+        Some(())
+    }
+
+    pub fn queue_outgoing_branch(&mut self, branch: PendingBranchRef) {
+        self.pending_outgoing.push(branch)
+    }
+
+    /// Push a symbol for --yjit-perf
+    fn perf_symbol_push(&mut self, asm: &mut Assembler, symbol_name: &str) {
+        if !self.perf_stack.is_empty() {
+            self.perf_symbol_range_end(asm);
+        }
+        self.perf_stack.push(symbol_name.to_string());
+        self.perf_symbol_range_start(asm, symbol_name);
+    }
+
+    /// Pop the stack-top symbol for --yjit-perf
+    fn perf_symbol_pop(&mut self, asm: &mut Assembler) {
+        self.perf_symbol_range_end(asm);
+        self.perf_stack.pop();
+        if let Some(symbol_name) = self.perf_stack.get(0) {
+            self.perf_symbol_range_start(asm, symbol_name);
+        }
+    }
+
+    /// Mark the start address of a symbol to be reported to perf
+    fn perf_symbol_range_start(&self, asm: &mut Assembler, symbol_name: &str) {
+        let symbol_name = format!("[JIT] {}", symbol_name);
+        let syms = self.perf_map.clone();
+        asm.pos_marker(move |start, _| syms.borrow_mut().push((start, None, symbol_name.clone())));
+    }
+
+    /// Mark the end address of a symbol to be reported to perf
+    fn perf_symbol_range_end(&self, asm: &mut Assembler) {
+        let syms = self.perf_map.clone();
+        asm.pos_marker(move |end, _| {
+            if let Some((_, ref mut end_store, _)) = syms.borrow_mut().last_mut() {
+                assert_eq!(None, *end_store);
+                *end_store = Some(end);
+            }
+        });
+    }
+
+    /// Flush addresses and symbols to /tmp/perf-{pid}.map
+    fn flush_perf_symbols(&self, cb: &CodeBlock) {
+        assert_eq!(0, self.perf_stack.len());
+        let path = format!("/tmp/perf-{}.map", std::process::id());
+        let mut f = std::fs::File::options().create(true).append(true).open(path).unwrap();
+        for sym in self.perf_map.borrow().iter() {
+            if let (start, Some(end), name) = sym {
+                // In case the code straddles two pages, part of it belongs to the symbol.
+                for (inline_start, inline_end) in cb.writable_addrs(*start, *end) {
+                    use std::io::Write;
+                    let code_size = inline_end - inline_start;
+                    writeln!(f, "{inline_start:x} {code_size:x} {name}").unwrap();
+                }
+            }
+        }
+    }
+
+    /// Return true if we're compiling a send-like instruction, not an opt_* instruction.
+    pub fn is_sendish(&self) -> bool {
+        match unsafe { rb_iseq_opcode_at_pc(self.iseq, self.pc) } as u32 {
+            YARVINSN_send |
+            YARVINSN_opt_send_without_block |
+            YARVINSN_invokesuper => true,
+            _ => false,
+        }
     }
 }
 
-macro_rules! gen_counter_incr {
-    ($asm:tt, $counter_name:ident) => {
-        if (get_option!(gen_stats)) {
-            // Get a pointer to the counter variable
-            let ptr = ptr_to_counter!($counter_name);
+/// Macro to call jit.perf_symbol_push() without evaluating arguments when
+/// the option is turned off, which is useful for avoiding string allocation.
+macro_rules! jit_perf_symbol_push {
+    ($jit:expr, $asm:expr, $symbol_name:expr, $perf_map:expr) => {
+        if get_option!(perf_map) == Some($perf_map) {
+            $jit.perf_symbol_push($asm, $symbol_name);
+        }
+    };
+}
 
-            // Load the pointer into a register
-            $asm.comment(&format!("increment counter {}", stringify!($counter_name)));
-            let ptr_reg = $asm.load(Opnd::const_ptr(ptr as *const u8));
-            let counter_opnd = Opnd::mem(64, ptr_reg, 0);
+/// Macro to call jit.perf_symbol_pop(), for consistency with jit_perf_symbol_push!().
+macro_rules! jit_perf_symbol_pop {
+    ($jit:expr, $asm:expr, $perf_map:expr) => {
+        if get_option!(perf_map) == Some($perf_map) {
+            $jit.perf_symbol_pop($asm);
+        }
+    };
+}
 
-            // Increment and store the updated value
-            $asm.incr_counter(counter_opnd, Opnd::UImm(1));
+/// Macro to push and pop a perf symbol around a function call.
+macro_rules! perf_call {
+    // perf_call!("prefix: ", func(...)) uses "prefix: func" as a symbol.
+    ($prefix:expr, $func_name:ident($jit:expr, $asm:expr$(, $arg:expr)*$(,)?) ) => {
+        {
+            jit_perf_symbol_push!($jit, $asm, &format!("{}{}", $prefix, stringify!($func_name)), PerfMap::Codegen);
+            let ret = $func_name($jit, $asm, $($arg),*);
+            jit_perf_symbol_pop!($jit, $asm, PerfMap::Codegen);
+            ret
         }
     };
+    // perf_call! { func(...) } uses "func" as a symbol.
+    { $func_name:ident($jit:expr, $asm:expr$(, $arg:expr)*$(,)?) } => {
+        perf_call!("", $func_name($jit, $asm, $($arg),*))
+    };
 }
 
-macro_rules! counted_exit {
-    ($ocb:tt, $existing_side_exit:tt, $counter_name:ident) => {
-        // The counter is only incremented when stats are enabled
-        if (!get_option!(gen_stats)) {
-            $existing_side_exit
-        } else {
-            let ocb = $ocb.unwrap();
-            let code_ptr = ocb.get_write_ptr();
+use crate::codegen::JCCKinds::*;
 
-            let mut ocb_asm = Assembler::new();
+#[allow(non_camel_case_types, unused)]
+pub enum JCCKinds {
+    JCC_JNE,
+    JCC_JNZ,
+    JCC_JZ,
+    JCC_JE,
+    JCC_JB,
+    JCC_JBE,
+    JCC_JNA,
+    JCC_JNAE,
+    JCC_JO_MUL,
+}
 
-            // Increment the counter
-            gen_counter_incr!(ocb_asm, $counter_name);
+#[inline(always)]
+fn gen_counter_incr(asm: &mut Assembler, counter: Counter) {
+    // Assert that default counters are not incremented by generated code as this would impact performance
+    assert!(!DEFAULT_COUNTERS.contains(&counter), "gen_counter_incr incremented {:?}", counter);
 
-            // Jump to the existing side exit
-            ocb_asm.jmp($existing_side_exit);
-            ocb_asm.compile(ocb);
+    if get_option!(gen_stats) {
+        asm_comment!(asm, "increment counter {}", counter.get_name());
+        let ptr = get_counter_ptr(&counter.get_name());
+        let ptr_reg = asm.load(Opnd::const_ptr(ptr as *const u8));
+        let counter_opnd = Opnd::mem(64, ptr_reg, 0);
 
-            // Pointer to the side-exit code
-            code_ptr.as_side_exit()
-        }
-    };
+        // Increment and store the updated value
+        asm.incr_counter(counter_opnd, Opnd::UImm(1));
+    }
 }
 
 // Save the incremented PC on the CFP
@@ -230,7 +472,7 @@ fn jit_save_pc(jit: &JITState, asm: &mut Assembler) {
         pc.offset(cur_insn_len)
     };
 
-    asm.comment("save PC to CFP");
+    asm_comment!(asm, "save PC to CFP");
     asm.mov(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_PC), Opnd::const_ptr(ptr as *const u8));
 }
 
@@ -238,43 +480,116 @@ fn jit_save_pc(jit: &JITState, asm: &mut Assembler) {
 /// This realigns the interpreter SP with the JIT SP
 /// Note: this will change the current value of REG_SP,
 ///       which could invalidate memory operands
-fn gen_save_sp(_jit: &JITState, asm: &mut Assembler, ctx: &mut Context) {
-    if ctx.get_sp_offset() != 0 {
-        asm.comment("save SP to CFP");
-        let stack_pointer = ctx.sp_opnd(0);
+fn gen_save_sp(asm: &mut Assembler) {
+    gen_save_sp_with_offset(asm, 0);
+}
+
+/// Save the current SP + offset on the CFP
+fn gen_save_sp_with_offset(asm: &mut Assembler, offset: i8) {
+    if asm.ctx.get_sp_offset() != -offset {
+        asm_comment!(asm, "save SP to CFP");
+        let stack_pointer = asm.ctx.sp_opnd(offset as i32);
         let sp_addr = asm.lea(stack_pointer);
         asm.mov(SP, sp_addr);
         let cfp_sp_opnd = Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP);
         asm.mov(cfp_sp_opnd, SP);
-        ctx.set_sp_offset(0);
+        asm.ctx.set_sp_offset(-offset);
+    }
+}
+
+/// Basically jit_prepare_non_leaf_call(), but this registers the current PC
+/// to lazily push a C method frame when it's necessary.
+fn jit_prepare_lazy_frame_call(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    cme: *const rb_callable_method_entry_t,
+    recv_opnd: YARVOpnd,
+) -> bool {
+    // We can use this only when the receiver is on stack.
+    let recv_idx = match recv_opnd {
+        StackOpnd(recv_idx) => recv_idx,
+        _ => unreachable!("recv_opnd must be on stack, but got: {:?}", recv_opnd),
+    };
+
+    // Get the next PC. jit_save_pc() saves that PC.
+    let pc: *mut VALUE = unsafe {
+        let cur_insn_len = insn_len(jit.get_opcode()) as isize;
+        jit.get_pc().offset(cur_insn_len)
+    };
+
+    let pc_to_cfunc = CodegenGlobals::get_pc_to_cfunc();
+    match pc_to_cfunc.get(&pc) {
+        Some(&(other_cme, _)) if other_cme != cme => {
+            // Bail out if it's not the only cme on this callsite.
+            incr_counter!(lazy_frame_failure);
+            return false;
+        }
+        _ => {
+            // Let rb_yjit_lazy_push_frame() lazily push a C frame on this PC.
+            incr_counter!(lazy_frame_count);
+            pc_to_cfunc.insert(pc, (cme, recv_idx));
+        }
+    }
+
+    // Save the PC to trigger a lazy frame push, and save the SP to get the receiver.
+    // The C func may call a method that doesn't raise, so prepare for invalidation too.
+    jit_prepare_non_leaf_call(jit, asm);
+
+    // Make sure we're ready for calling rb_vm_push_cfunc_frame().
+    let cfunc_argc = unsafe { get_mct_argc(get_cme_def_body_cfunc(cme)) };
+    if cfunc_argc != -1 {
+        assert_eq!(recv_idx as i32, cfunc_argc); // verify the receiver index if possible
     }
+    assert!(asm.get_leaf_ccall()); // It checks the stack canary we set for known_cfunc_codegen.
+
+    true
 }
 
-/// jit_save_pc() + gen_save_sp(). Should be used before calling a routine that
-/// could:
+/// jit_save_pc() + gen_save_sp(). Should be used before calling a routine that could:
 ///  - Perform GC allocation
 ///  - Take the VM lock through RB_VM_LOCK_ENTER()
 ///  - Perform Ruby method call
-fn jit_prepare_routine_call(
+///
+/// If the routine doesn't call arbitrary methods, use jit_prepare_call_with_gc() instead.
+fn jit_prepare_non_leaf_call(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler
 ) {
-    jit.record_boundary_patch_point = true;
-    jit_save_pc(jit, asm);
-    gen_save_sp(jit, asm, ctx);
+    // Prepare for GC. Setting PC also prepares for showing a backtrace.
+    jit.record_boundary_patch_point = true; // VM lock could trigger invalidation
+    jit_save_pc(jit, asm); // for allocation tracing
+    gen_save_sp(asm); // protect objects from GC
 
     // In case the routine calls Ruby methods, it can set local variables
-    // through Kernel#binding and other means.
-    ctx.clear_local_types();
+    // through Kernel#binding, rb_debug_inspector API, and other means.
+    asm.clear_local_types();
+}
+
+/// jit_save_pc() + gen_save_sp(). Should be used before calling a routine that could:
+///  - Perform GC allocation
+///  - Take the VM lock through RB_VM_LOCK_ENTER()
+fn jit_prepare_call_with_gc(
+    jit: &mut JITState,
+    asm: &mut Assembler
+) {
+    jit.record_boundary_patch_point = true; // VM lock could trigger invalidation
+    jit_save_pc(jit, asm); // for allocation tracing
+    gen_save_sp(asm); // protect objects from GC
+
+    // Expect a leaf ccall(). Use jit_prepare_non_leaf_call() otherwise.
+    asm.expect_leaf_ccall();
 }
 
 /// Record the current codeblock write position for rewriting into a jump into
 /// the outlined block later. Used to implement global code invalidation.
 fn record_global_inval_patch(asm: &mut Assembler, outline_block_target_pos: CodePtr) {
+    // We add padding before pos_marker so that the previous patch will not overlap this.
+    // jump_to_next_insn() puts a patch point at the end of the block in fallthrough cases.
+    // In the fallthrough case, the next block should start with the same Context, so the
+    // patch is fine, but it should not overlap another patch.
     asm.pad_inval_patch();
-    asm.pos_marker(move |code_ptr| {
-        CodegenGlobals::push_global_inval_patch(code_ptr, outline_block_target_pos);
+    asm.pos_marker(move |code_ptr, cb| {
+        CodegenGlobals::push_global_inval_patch(code_ptr, outline_block_target_pos, cb);
     });
 }
 
@@ -285,14 +600,36 @@ fn verify_ctx(jit: &JITState, ctx: &Context) {
         unsafe { CStr::from_ptr(rb_obj_info(val)).to_str().unwrap() }
     }
 
+    // Some types such as CString only assert the class field of the object
+    // when there has never been a singleton class created for objects of that class.
+    // Once there is a singleton class created they become their weaker
+    // `T*` variant, and more objects should pass the verification.
+    fn relax_type_with_singleton_class_assumption(ty: Type) -> Type {
+        if let Type::CString | Type::CArray | Type::CHash = ty {
+            if has_singleton_class_of(ty.known_class().unwrap()) {
+                match ty {
+                    Type::CString => return Type::TString,
+                    Type::CArray => return Type::TArray,
+                    Type::CHash => return Type::THash,
+                    _ => (),
+                }
+            }
+        }
+
+        ty
+    }
+
     // Only able to check types when at current insn
-    assert!(jit_at_current_insn(jit));
+    assert!(jit.at_current_insn());
 
-    let self_val = jit_peek_at_self(jit);
+    let self_val = jit.peek_at_self();
     let self_val_type = Type::from(self_val);
+    let learned_self_type = ctx.get_opnd_type(SelfOpnd);
+    let learned_self_type = relax_type_with_singleton_class_assumption(learned_self_type);
+
 
     // Verify self operand type
-    if self_val_type.diff(ctx.get_opnd_type(SelfOpnd)) == usize::MAX {
+    if self_val_type.diff(learned_self_type) == TypeDiff::Incompatible {
         panic!(
             "verify_ctx: ctx self type ({:?}) incompatible with actual value of self {}",
             ctx.get_opnd_type(SelfOpnd),
@@ -301,14 +638,17 @@ fn verify_ctx(jit: &JITState, ctx: &Context) {
     }
 
     // Verify stack operand types
-    let top_idx = cmp::min(ctx.get_stack_size(), MAX_TEMP_TYPES as u16);
+    let top_idx = cmp::min(ctx.get_stack_size(), MAX_TEMP_TYPES as u8);
     for i in 0..top_idx {
-        let (learned_mapping, learned_type) = ctx.get_opnd_mapping(StackOpnd(i));
-        let stack_val = jit_peek_at_stack(jit, ctx, i as isize);
+        let learned_mapping = ctx.get_opnd_mapping(StackOpnd(i));
+        let learned_type = ctx.get_opnd_type(StackOpnd(i));
+        let learned_type = relax_type_with_singleton_class_assumption(learned_type);
+
+        let stack_val = jit.peek_at_stack(ctx, i as isize);
         let val_type = Type::from(stack_val);
 
-        match learned_mapping {
-            TempMapping::MapToSelf => {
+        match learned_mapping.get_kind() {
+            TempMappingKind::MapToSelf => {
                 if self_val != stack_val {
                     panic!(
                         "verify_ctx: stack value was mapped to self, but values did not match!\n  stack: {}\n  self: {}",
@@ -317,8 +657,9 @@ fn verify_ctx(jit: &JITState, ctx: &Context) {
                     );
                 }
             }
-            TempMapping::MapToLocal(local_idx) => {
-                let local_val = jit_peek_at_local(jit, local_idx.into());
+            TempMappingKind::MapToLocal => {
+                let local_idx: u8 = learned_mapping.get_local_idx();
+                let local_val = jit.peek_at_local(local_idx.into());
                 if local_val != stack_val {
                     panic!(
                         "verify_ctx: stack value was mapped to local, but values did not match\n  stack: {}\n  local {}: {}",
@@ -328,15 +669,16 @@ fn verify_ctx(jit: &JITState, ctx: &Context) {
                     );
                 }
             }
-            TempMapping::MapToStack => {}
+            TempMappingKind::MapToStack => {}
         }
 
         // If the actual type differs from the learned type
-        if val_type.diff(learned_type) == usize::MAX {
+        if val_type.diff(learned_type) == TypeDiff::Incompatible {
             panic!(
-                "verify_ctx: ctx type ({:?}) incompatible with actual value on stack: {}",
+                "verify_ctx: ctx type ({:?}) incompatible with actual value on stack: {} ({:?})",
                 learned_type,
-                obj_info_str(stack_val)
+                obj_info_str(stack_val),
+                val_type,
             );
         }
     }
@@ -346,10 +688,11 @@ fn verify_ctx(jit: &JITState, ctx: &Context) {
     let top_idx: usize = cmp::min(local_table_size as usize, MAX_TEMP_TYPES);
     for i in 0..top_idx {
         let learned_type = ctx.get_local_type(i);
-        let local_val = jit_peek_at_local(jit, i as i32);
+        let learned_type = relax_type_with_singleton_class_assumption(learned_type);
+        let local_val = jit.peek_at_local(i as i32);
         let local_type = Type::from(local_val);
 
-        if local_type.diff(learned_type) == usize::MAX {
+        if local_type.diff(learned_type) == TypeDiff::Incompatible {
             panic!(
                 "verify_ctx: ctx type ({:?}) incompatible with actual value of local: {} (type {:?})",
                 learned_type,
@@ -364,14 +707,13 @@ fn verify_ctx(jit: &JITState, ctx: &Context) {
 // to the interpreter when it cannot service a stub by generating new code.
 // Before coming here, branch_stub_hit() takes care of fully reconstructing
 // interpreter state.
-fn gen_code_for_exit_from_stub(ocb: &mut OutlinedCb) -> CodePtr {
+fn gen_stub_exit(ocb: &mut OutlinedCb) -> Option<CodePtr> {
     let ocb = ocb.unwrap();
-    let code_ptr = ocb.get_write_ptr();
     let mut asm = Assembler::new();
 
-    gen_counter_incr!(asm, exit_from_branch_stub);
+    gen_counter_incr(&mut asm, Counter::exit_from_branch_stub);
 
-    asm.comment("exit from branch stub");
+    asm_comment!(asm, "exit from branch stub");
     asm.cpop_into(SP);
     asm.cpop_into(EC);
     asm.cpop_into(CFP);
@@ -380,23 +722,30 @@ fn gen_code_for_exit_from_stub(ocb: &mut OutlinedCb) -> CodePtr {
 
     asm.cret(Qundef.into());
 
-    asm.compile(ocb);
-
-    code_ptr
+    asm.compile(ocb, None).map(|(code_ptr, _)| code_ptr)
 }
 
 /// Generate an exit to return to the interpreter
-fn gen_exit(exit_pc: *mut VALUE, ctx: &Context, asm: &mut Assembler) {
+fn gen_exit(exit_pc: *mut VALUE, asm: &mut Assembler) {
     #[cfg(all(feature = "disasm", not(test)))]
     {
         let opcode = unsafe { rb_vm_insn_addr2opcode((*exit_pc).as_ptr()) };
-        asm.comment(&format!("exit to interpreter on {}", insn_name(opcode as usize)));
+        asm_comment!(asm, "exit to interpreter on {}", insn_name(opcode as usize));
     }
 
+    if asm.ctx.is_return_landing() {
+        asm.mov(SP, Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP));
+        let top = asm.stack_push(Type::Unknown);
+        asm.mov(top, C_RET_OPND);
+    }
+
+    // Spill stack temps before returning to the interpreter
+    asm.spill_temps();
+
     // Generate the code to exit to the interpreters
     // Write the adjusted SP back into the CFP
-    if ctx.get_sp_offset() != 0 {
-        let sp_opnd = asm.lea(ctx.sp_opnd(0));
+    if asm.ctx.get_sp_offset() != 0 {
+        let sp_opnd = asm.lea(asm.ctx.sp_opnd(0));
         asm.mov(
             Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP),
             sp_opnd
@@ -416,9 +765,9 @@ fn gen_exit(exit_pc: *mut VALUE, ctx: &Context, asm: &mut Assembler) {
             vec![Opnd::const_ptr(exit_pc as *const u8)]
         );
 
-        // If --yjit-trace-exits option is enabled, record the exit stack
-        // while recording the side exits.
-        if get_option!(gen_trace_exits) {
+        // If --yjit-trace-exits is enabled, record the exit stack while recording
+        // the side exits. TraceExits::CountedExit is handled by gen_counted_exit().
+        if get_option!(trace_exits) == Some(TraceExits::All) {
             asm.ccall(
                 rb_yjit_record_exit_stack as *const u8,
                 vec![Opnd::const_ptr(exit_pc as *const u8)]
@@ -435,81 +784,112 @@ fn gen_exit(exit_pc: *mut VALUE, ctx: &Context, asm: &mut Assembler) {
     asm.cret(Qundef.into());
 }
 
-/// Generate an exit to the interpreter in the outlined code block
-fn gen_outlined_exit(exit_pc: *mut VALUE, ctx: &Context, ocb: &mut OutlinedCb) -> CodePtr {
+/// :side-exit:
+/// Get an exit for the current instruction in the outlined block. The code
+/// for each instruction often begins with several guards before proceeding
+/// to do work. When guards fail, an option we have is to exit to the
+/// interpreter at an instruction boundary. The piece of code that takes
+/// care of reconstructing interpreter state and exiting out of generated
+/// code is called the side exit.
+///
+/// No guards change the logic for reconstructing interpreter state at the
+/// moment, so there is one unique side exit for each context. Note that
+/// it's incorrect to jump to the side exit after any ctx stack push operations
+/// since they change the logic required for reconstructing interpreter state.
+pub fn gen_outlined_exit(exit_pc: *mut VALUE, ctx: &Context, ocb: &mut OutlinedCb) -> Option<CodePtr> {
     let mut cb = ocb.unwrap();
-    let exit_code = cb.get_write_ptr();
     let mut asm = Assembler::new();
+    asm.ctx = *ctx;
+    asm.set_reg_temps(ctx.get_reg_temps());
 
-    gen_exit(exit_pc, ctx, &mut asm);
+    gen_exit(exit_pc, &mut asm);
+
+    asm.compile(&mut cb, None).map(|(code_ptr, _)| code_ptr)
+}
 
-    asm.compile(&mut cb);
+/// Get a side exit. Increment a counter in it if --yjit-stats is enabled.
+pub fn gen_counted_exit(exit_pc: *mut VALUE, side_exit: CodePtr, ocb: &mut OutlinedCb, counter: Option<Counter>) -> Option<CodePtr> {
+    // The counter is only incremented when stats are enabled
+    if !get_option!(gen_stats) {
+        return Some(side_exit);
+    }
+    let counter = match counter {
+        Some(counter) => counter,
+        None => return Some(side_exit),
+    };
+
+    let mut asm = Assembler::new();
 
-    exit_code
+    // Increment a counter
+    gen_counter_incr(&mut asm, counter);
+
+    // Trace a counted exit if --yjit-trace-exits=counter is given.
+    // TraceExits::All is handled by gen_exit().
+    if get_option!(trace_exits) == Some(TraceExits::CountedExit(counter)) {
+        with_caller_saved_temp_regs(&mut asm, |asm| {
+            asm.ccall(rb_yjit_record_exit_stack as *const u8, vec![Opnd::const_ptr(exit_pc as *const u8)]);
+        });
+    }
+
+    // Jump to the existing side exit
+    asm.jmp(Target::CodePtr(side_exit));
+
+    let ocb = ocb.unwrap();
+    asm.compile(ocb, None).map(|(code_ptr, _)| code_ptr)
 }
 
-// :side-exit:
-// Get an exit for the current instruction in the outlined block. The code
-// for each instruction often begins with several guards before proceeding
-// to do work. When guards fail, an option we have is to exit to the
-// interpreter at an instruction boundary. The piece of code that takes
-// care of reconstructing interpreter state and exiting out of generated
-// code is called the side exit.
-//
-// No guards change the logic for reconstructing interpreter state at the
-// moment, so there is one unique side exit for each context. Note that
-// it's incorrect to jump to the side exit after any ctx stack push operations
-// since they change the logic required for reconstructing interpreter state.
-fn get_side_exit(jit: &mut JITState, ocb: &mut OutlinedCb, ctx: &Context) -> Target {
-    match jit.side_exit_for_pc {
-        None => {
-            let exit_code = gen_outlined_exit(jit.pc, ctx, ocb);
-            jit.side_exit_for_pc = Some(exit_code);
-            exit_code.as_side_exit()
-        }
-        Some(code_ptr) => code_ptr.as_side_exit()
+/// Preserve caller-saved stack temp registers during the call of a given block
+fn with_caller_saved_temp_regs<F, R>(asm: &mut Assembler, block: F) -> R where F: FnOnce(&mut Assembler) -> R {
+    for &reg in caller_saved_temp_regs() {
+        asm.cpush(Opnd::Reg(reg)); // save stack temps
     }
+    let ret = block(asm);
+    for &reg in caller_saved_temp_regs().rev() {
+        asm.cpop_into(Opnd::Reg(reg)); // restore stack temps
+    }
+    ret
 }
 
 // Ensure that there is an exit for the start of the block being compiled.
 // Block invalidation uses this exit.
-pub fn jit_ensure_block_entry_exit(jit: &mut JITState, ocb: &mut OutlinedCb) {
-    let blockref = jit.block.clone();
-    let mut block = blockref.borrow_mut();
-    let block_ctx = block.get_ctx();
-    let blockid = block.get_blockid();
-
-    if block.entry_exit.is_some() {
-        return;
+#[must_use]
+pub fn jit_ensure_block_entry_exit(jit: &mut JITState, asm: &mut Assembler, ocb: &mut OutlinedCb) -> Option<()> {
+    if jit.block_entry_exit.is_some() {
+        return Some(());
     }
 
+    let block_starting_context = &jit.get_starting_ctx();
+
     // If we're compiling the first instruction in the block.
-    if jit.insn_idx == blockid.idx {
-        // Generate the exit with the cache in jitstate.
-        block.entry_exit = Some(get_side_exit(jit, ocb, &block_ctx).unwrap_code_ptr());
+    if jit.insn_idx == jit.starting_insn_idx {
+        // Generate the exit with the cache in Assembler.
+        let side_exit_context = SideExitContext::new(jit.pc, *block_starting_context);
+        let entry_exit = asm.get_side_exit(&side_exit_context, None, ocb);
+        jit.block_entry_exit = Some(entry_exit?);
     } else {
-        let _pc = unsafe { rb_iseq_pc_at_idx(blockid.iseq, blockid.idx) };
-        block.entry_exit = Some(gen_outlined_exit(jit.pc, &block_ctx, ocb));
+        let block_entry_pc = unsafe { rb_iseq_pc_at_idx(jit.iseq, jit.starting_insn_idx.into()) };
+        jit.block_entry_exit = Some(gen_outlined_exit(block_entry_pc, block_starting_context, ocb)?);
     }
+
+    Some(())
 }
 
 // Landing code for when c_return tracing is enabled. See full_cfunc_return().
-fn gen_full_cfunc_return(ocb: &mut OutlinedCb) -> CodePtr {
+fn gen_full_cfunc_return(ocb: &mut OutlinedCb) -> Option<CodePtr> {
     let ocb = ocb.unwrap();
-    let code_ptr = ocb.get_write_ptr();
     let mut asm = Assembler::new();
 
     // This chunk of code expects REG_EC to be filled properly and
     // RAX to contain the return value of the C method.
 
-    asm.comment("full cfunc return");
+    asm_comment!(asm, "full cfunc return");
     asm.ccall(
         rb_full_cfunc_return as *const u8,
         vec![EC, C_RET_OPND]
     );
 
     // Count the exit
-    gen_counter_incr!(asm, traced_cfunc_return);
+    gen_counter_incr(&mut asm, Counter::traced_cfunc_return);
 
     // Return to the interpreter
     asm.cpop_into(SP);
@@ -520,16 +900,13 @@ fn gen_full_cfunc_return(ocb: &mut OutlinedCb) -> CodePtr {
 
     asm.cret(Qundef.into());
 
-    asm.compile(ocb);
-
-    return code_ptr;
+    asm.compile(ocb, None).map(|(code_ptr, _)| code_ptr)
 }
 
 /// Generate a continuation for leave that exits to the interpreter at REG_CFP->pc.
 /// This is used by gen_leave() and gen_entry_prologue()
-fn gen_leave_exit(ocb: &mut OutlinedCb) -> CodePtr {
+fn gen_leave_exit(ocb: &mut OutlinedCb) -> Option<CodePtr> {
     let ocb = ocb.unwrap();
-    let code_ptr = ocb.get_write_ptr();
     let mut asm = Assembler::new();
 
     // gen_leave() fully reconstructs interpreter state and leaves the
@@ -537,9 +914,9 @@ fn gen_leave_exit(ocb: &mut OutlinedCb) -> CodePtr {
     let ret_opnd = asm.live_reg_opnd(C_RET_OPND);
 
     // Every exit to the interpreter should be counted
-    gen_counter_incr!(asm, leave_interp_return);
+    gen_counter_incr(&mut asm, Counter::leave_interp_return);
 
-    asm.comment("exit from leave");
+    asm_comment!(asm, "exit from leave");
     asm.cpop_into(SP);
     asm.cpop_into(EC);
     asm.cpop_into(CFP);
@@ -548,51 +925,89 @@ fn gen_leave_exit(ocb: &mut OutlinedCb) -> CodePtr {
 
     asm.cret(ret_opnd);
 
-    asm.compile(ocb);
-
-    return code_ptr;
+    asm.compile(ocb, None).map(|(code_ptr, _)| code_ptr)
 }
 
-// Generate a runtime guard that ensures the PC is at the expected
-// instruction index in the iseq, otherwise takes a side-exit.
-// This is to handle the situation of optional parameters.
-// When a function with optional parameters is called, the entry
-// PC for the method isn't necessarily 0.
-fn gen_pc_guard(asm: &mut Assembler, iseq: IseqPtr, insn_idx: u32) {
-    let pc_opnd = Opnd::mem(64, CFP, RUBY_OFFSET_CFP_PC);
-    let expected_pc = unsafe { rb_iseq_pc_at_idx(iseq, insn_idx) };
-    let expected_pc_opnd = Opnd::const_ptr(expected_pc as *const u8);
+// Increment SP and transfer the execution to the interpreter after jit_exec_exception().
+// On jit_exec_exception(), you need to return Qundef to keep executing caller non-FINISH
+// frames on the interpreter. You also need to increment SP to push the return value to
+// the caller's stack, which is different from gen_stub_exit().
+fn gen_leave_exception(ocb: &mut OutlinedCb) -> Option<CodePtr> {
+    let ocb = ocb.unwrap();
+    let mut asm = Assembler::new();
 
-    asm.cmp(pc_opnd, expected_pc_opnd);
+    // gen_leave() leaves the return value in C_RET_OPND before coming here.
+    let ruby_ret_val = asm.live_reg_opnd(C_RET_OPND);
 
-    let pc_match = asm.new_label("pc_match");
-    asm.je(pc_match);
+    // Every exit to the interpreter should be counted
+    gen_counter_incr(&mut asm, Counter::leave_interp_return);
 
-    // We're not starting at the first PC, so we need to exit.
-    gen_counter_incr!(asm, leave_start_pc_non_zero);
+    asm_comment!(asm, "push return value through cfp->sp");
+    let cfp_sp = Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP);
+    let sp = asm.load(cfp_sp);
+    asm.mov(Opnd::mem(64, sp, 0), ruby_ret_val);
+    let new_sp = asm.add(sp, SIZEOF_VALUE.into());
+    asm.mov(cfp_sp, new_sp);
 
+    asm_comment!(asm, "exit from exception");
     asm.cpop_into(SP);
     asm.cpop_into(EC);
     asm.cpop_into(CFP);
 
     asm.frame_teardown();
 
+    // Return Qundef so that the interpreter keeps executing (vm_exec_core)
     asm.cret(Qundef.into());
 
-    // PC should match the expected insn_idx
-    asm.write_label(pc_match);
+    asm.compile(ocb, None).map(|(code_ptr, _)| code_ptr)
+}
+
+// Generate a runtime guard that ensures the PC is at the expected
+// instruction index in the iseq, otherwise takes an entry stub
+// that generates another check and entry.
+// This is to handle the situation of optional parameters.
+// When a function with optional parameters is called, the entry
+// PC for the method isn't necessarily 0.
+pub fn gen_entry_chain_guard(
+    asm: &mut Assembler,
+    ocb: &mut OutlinedCb,
+    iseq: IseqPtr,
+    insn_idx: u16,
+) -> Option<PendingEntryRef> {
+    let entry = new_pending_entry();
+    let stub_addr = gen_entry_stub(entry.uninit_entry.as_ptr() as usize, ocb)?;
+
+    let pc_opnd = Opnd::mem(64, CFP, RUBY_OFFSET_CFP_PC);
+    let expected_pc = unsafe { rb_iseq_pc_at_idx(iseq, insn_idx.into()) };
+    let expected_pc_opnd = Opnd::const_ptr(expected_pc as *const u8);
+
+    asm_comment!(asm, "guard expected PC");
+    asm.cmp(pc_opnd, expected_pc_opnd);
+
+    asm.mark_entry_start(&entry);
+    asm.jne(stub_addr.into());
+    asm.mark_entry_end(&entry);
+    return Some(entry);
 }
 
 /// Compile an interpreter entry block to be inserted into an iseq
 /// Returns None if compilation fails.
-pub fn gen_entry_prologue(cb: &mut CodeBlock, iseq: IseqPtr, insn_idx: u32) -> Option<CodePtr> {
+/// If jit_exception is true, compile JIT code for handling exceptions.
+/// See [jit_compile_exception] for details.
+pub fn gen_entry_prologue(
+    cb: &mut CodeBlock,
+    ocb: &mut OutlinedCb,
+    iseq: IseqPtr,
+    insn_idx: u16,
+    jit_exception: bool,
+) -> Option<CodePtr> {
     let code_ptr = cb.get_write_ptr();
 
     let mut asm = Assembler::new();
     if get_option_ref!(dump_disasm).is_some() {
-        asm.comment(&format!("YJIT entry point: {}", iseq_get_location(iseq, 0)));
+        asm_comment!(asm, "YJIT entry point: {}", iseq_get_location(iseq, 0));
     } else {
-        asm.comment("YJIT entry");
+        asm_comment!(asm, "YJIT entry");
     }
 
     asm.frame_setup();
@@ -610,22 +1025,42 @@ pub fn gen_entry_prologue(cb: &mut CodeBlock, iseq: IseqPtr, insn_idx: u32) -> O
     asm.mov(SP, Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP));
 
     // Setup cfp->jit_return
-    asm.mov(
-        Opnd::mem(64, CFP, RUBY_OFFSET_CFP_JIT_RETURN),
-        Opnd::const_ptr(CodegenGlobals::get_leave_exit_code().raw_ptr()),
-    );
+    // If this is an exception handler entry point
+    if jit_exception {
+        // On jit_exec_exception(), it's NOT safe to return a non-Qundef value
+        // from a non-FINISH frame. rb_yjit_set_exception_return() fixes that problem.
+        // See [jit_compile_exception] for details.
+        asm.ccall(
+            rb_yjit_set_exception_return as *mut u8,
+            vec![
+                CFP,
+                Opnd::const_ptr(CodegenGlobals::get_leave_exit_code().raw_ptr(cb)),
+                Opnd::const_ptr(CodegenGlobals::get_leave_exception_code().raw_ptr(cb)),
+            ],
+        );
+    } else {
+        // On jit_exec() or JIT_EXEC(), it's safe to return a non-Qundef value
+        // on the entry frame. See [jit_compile] for details.
+        asm.mov(
+            Opnd::mem(64, CFP, RUBY_OFFSET_CFP_JIT_RETURN),
+            Opnd::const_ptr(CodegenGlobals::get_leave_exit_code().raw_ptr(cb)),
+        );
+    }
 
-    // We're compiling iseqs that we *expect* to start at `insn_idx`. But in
-    // the case of optional parameters, the interpreter can set the pc to a
-    // different location depending on the optional parameters.  If an iseq
-    // has optional parameters, we'll add a runtime check that the PC we've
+    // We're compiling iseqs that we *expect* to start at `insn_idx`.
+    // But in the case of optional parameters or when handling exceptions,
+    // the interpreter can set the pc to a different location. For
+    // such scenarios, we'll add a runtime check that the PC we've
     // compiled for is the same PC that the interpreter wants us to run with.
-    // If they don't match, then we'll take a side exit.
-    if unsafe { get_iseq_flags_has_opt(iseq) } {
-        gen_pc_guard(&mut asm, iseq, insn_idx);
-    }
+    // If they don't match, then we'll jump to an entry stub and generate
+    // another PC check and entry there.
+    let pending_entry = if unsafe { get_iseq_flags_has_opt(iseq) } || jit_exception {
+        Some(gen_entry_chain_guard(&mut asm, ocb, iseq, insn_idx)?)
+    } else {
+        None
+    };
 
-    asm.compile(cb);
+    asm.compile(cb, Some(ocb))?;
 
     if cb.has_dropped_bytes() {
         None
@@ -635,53 +1070,71 @@ pub fn gen_entry_prologue(cb: &mut CodeBlock, iseq: IseqPtr, insn_idx: u32) -> O
         for page in cb.addrs_to_pages(code_ptr, cb.get_write_ptr()) {
             iseq_payload.pages.insert(page);
         }
+        // Write an entry to the heap and push it to the ISEQ
+        if let Some(pending_entry) = pending_entry {
+            let pending_entry = Rc::try_unwrap(pending_entry)
+                .ok().expect("PendingEntry should be unique");
+            iseq_payload.entries.push(pending_entry.into_entry());
+        }
         Some(code_ptr)
     }
 }
 
 // Generate code to check for interrupts and take a side-exit.
 // Warning: this function clobbers REG0
-fn gen_check_ints(asm: &mut Assembler, side_exit: Target) {
+fn gen_check_ints(
+    asm: &mut Assembler,
+    counter: Counter,
+) {
     // Check for interrupts
     // see RUBY_VM_CHECK_INTS(ec) macro
-    asm.comment("RUBY_VM_CHECK_INTS(ec)");
+    asm_comment!(asm, "RUBY_VM_CHECK_INTS(ec)");
 
     // Not checking interrupt_mask since it's zero outside finalize_deferred_heap_pages,
     // signal_exec, or rb_postponed_job_flush.
     let interrupt_flag = asm.load(Opnd::mem(32, EC, RUBY_OFFSET_EC_INTERRUPT_FLAG));
     asm.test(interrupt_flag, interrupt_flag);
 
-    asm.jnz(side_exit);
+    asm.jnz(Target::side_exit(counter));
 }
 
 // Generate a stubbed unconditional jump to the next bytecode instruction.
 // Blocks that are part of a guard chain can use this to share the same successor.
 fn jump_to_next_insn(
     jit: &mut JITState,
-    current_context: &Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) {
-    // Reset the depth since in current usages we only ever jump to to
+) -> Option<CodegenStatus> {
+    end_block_with_jump(jit, asm, ocb, jit.next_insn_idx())
+}
+
+fn end_block_with_jump(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    ocb: &mut OutlinedCb,
+    continuation_insn_idx: u16,
+) -> Option<CodegenStatus> {
+    // Reset the depth since in current usages we only ever jump to
     // chain_depth > 0 from the same instruction.
-    let mut reset_depth = current_context.clone();
-    reset_depth.reset_chain_depth();
+    let mut reset_depth = asm.ctx;
+    reset_depth.reset_chain_depth_and_defer();
 
     let jump_block = BlockId {
         iseq: jit.iseq,
-        idx: jit_next_insn_idx(jit),
+        idx: continuation_insn_idx,
     };
 
     // We are at the end of the current instruction. Record the boundary.
     if jit.record_boundary_patch_point {
-        let exit_pc = unsafe { jit.pc.offset(insn_len(jit.opcode).try_into().unwrap()) };
-        let exit_pos = gen_outlined_exit(exit_pc, &reset_depth, ocb);
-        record_global_inval_patch(asm, exit_pos);
         jit.record_boundary_patch_point = false;
+        let exit_pc = unsafe { rb_iseq_pc_at_idx(jit.iseq, continuation_insn_idx.into())};
+        let exit_pos = gen_outlined_exit(exit_pc, &reset_depth, ocb);
+        record_global_inval_patch(asm, exit_pos?);
     }
 
     // Generate the jump instruction
     gen_direct_jump(jit, &reset_depth, jump_block, asm);
+    Some(EndBlock)
 }
 
 // Compile a sequence of bytecode instructions for a given basic block version.
@@ -696,42 +1149,62 @@ pub fn gen_single_block(
     ocb: &mut OutlinedCb,
 ) -> Result<BlockRef, ()> {
     // Limit the number of specialized versions for this block
-    let mut ctx = limit_block_versions(blockid, start_ctx);
+    let ctx = limit_block_versions(blockid, start_ctx);
 
     verify_blockid(blockid);
     assert!(!(blockid.idx == 0 && ctx.get_stack_size() > 0));
 
+    // Save machine code placement of the block. `cb` might page switch when we
+    // generate code in `ocb`.
+    let block_start_addr = cb.get_write_ptr();
+
     // Instruction sequence to compile
     let iseq = blockid.iseq;
     let iseq_size = unsafe { get_iseq_encoded_size(iseq) };
-    let mut insn_idx: c_uint = blockid.idx;
-    let starting_insn_idx = insn_idx;
-
-    // Allocate the new block
-    let blockref = Block::new(blockid, &ctx);
+    let iseq_size: IseqIdx = if let Ok(size) = iseq_size.try_into() {
+        size
+    } else {
+        // ISeq too large to compile
+        return Err(());
+    };
+    let mut insn_idx: IseqIdx = blockid.idx;
 
     // Initialize a JIT state object
-    let mut jit = JITState::new(&blockref);
+    let mut jit = JITState::new(blockid, ctx, cb.get_write_ptr(), ec);
     jit.iseq = blockid.iseq;
-    jit.ec = Some(ec);
-
-    // Mark the start position of the block
-    blockref.borrow_mut().set_start_addr(cb.get_write_ptr());
 
     // Create a backend assembler instance
     let mut asm = Assembler::new();
+    asm.ctx = ctx;
 
     #[cfg(feature = "disasm")]
     if get_option_ref!(dump_disasm).is_some() {
         let blockid_idx = blockid.idx;
-        asm.comment(&format!("Block: {} (ISEQ offset: {})", iseq_get_location(blockid.iseq, blockid_idx), blockid_idx));
+        let chain_depth = if asm.ctx.get_chain_depth() > 0 { format!("(chain_depth: {})", asm.ctx.get_chain_depth()) } else { "".to_string() };
+        asm_comment!(asm, "Block: {} {}", iseq_get_location(blockid.iseq, blockid_idx), chain_depth);
+        asm_comment!(asm, "reg_temps: {:08b}", asm.ctx.get_reg_temps().as_u8());
+    }
+
+    // Mark the start of an ISEQ for --yjit-perf
+    jit_perf_symbol_push!(jit, &mut asm, &get_iseq_name(iseq), PerfMap::ISEQ);
+
+    if asm.ctx.is_return_landing() {
+        // Continuation of the end of gen_leave().
+        // Reload REG_SP for the current frame and transfer the return value
+        // to the stack top.
+        asm.mov(SP, Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP));
+
+        let top = asm.stack_push(Type::Unknown);
+        asm.mov(top, C_RET_OPND);
+
+        asm.ctx.clear_return_landing();
     }
 
     // For each instruction to compile
     // NOTE: could rewrite this loop with a std::iter::Iterator
     while insn_idx < iseq_size {
         // Get the current pc and opcode
-        let pc = unsafe { rb_iseq_pc_at_idx(iseq, insn_idx) };
+        let pc = unsafe { rb_iseq_pc_at_idx(iseq, insn_idx.into()) };
         // try_into() call below is unfortunate. Maybe pick i32 instead of usize for opcodes.
         let opcode: usize = unsafe { rb_iseq_opcode_at_pc(iseq, pc) }
             .try_into()
@@ -740,8 +1213,8 @@ pub fn gen_single_block(
         // We need opt_getconstant_path to be in a block all on its own. Cut the block short
         // if we run into it. This is necessary because we want to invalidate based on the
         // instruction's index.
-        if opcode == YARVINSN_opt_getconstant_path.as_usize() && insn_idx > starting_insn_idx {
-            jump_to_next_insn(&mut jit, &ctx, &mut asm, ocb);
+        if opcode == YARVINSN_opt_getconstant_path.as_usize() && insn_idx > jit.starting_insn_idx {
+            jump_to_next_insn(&mut jit, &mut asm, ocb);
             break;
         }
 
@@ -749,31 +1222,38 @@ pub fn gen_single_block(
         jit.insn_idx = insn_idx;
         jit.opcode = opcode;
         jit.pc = pc;
-        jit.side_exit_for_pc = None;
+        jit.stack_size_for_pc = asm.ctx.get_stack_size();
+        asm.set_side_exit_context(pc, asm.ctx.get_stack_size());
+
+        // stack_pop doesn't immediately deallocate a register for stack temps,
+        // but it's safe to do so at this instruction boundary.
+        for stack_idx in asm.ctx.get_stack_size()..MAX_REG_TEMPS {
+            asm.ctx.dealloc_temp_reg(stack_idx);
+        }
 
         // If previous instruction requested to record the boundary
         if jit.record_boundary_patch_point {
             // Generate an exit to this instruction and record it
-            let exit_pos = gen_outlined_exit(jit.pc, &ctx, ocb);
+            let exit_pos = gen_outlined_exit(jit.pc, &asm.ctx, ocb).ok_or(())?;
             record_global_inval_patch(&mut asm, exit_pos);
             jit.record_boundary_patch_point = false;
         }
 
         // In debug mode, verify our existing assumption
-        if cfg!(debug_assertions) && get_option!(verify_ctx) && jit_at_current_insn(&jit) {
-            verify_ctx(&jit, &ctx);
+        if cfg!(debug_assertions) && get_option!(verify_ctx) && jit.at_current_insn() {
+            verify_ctx(&jit, &asm.ctx);
         }
 
+        // :count-placement:
+        // Count bytecode instructions that execute in generated code.
+        // Note that the increment happens even when the output takes a side exit.
+        gen_counter_incr(&mut asm, Counter::yjit_insns_count);
+
         // Lookup the codegen function for this instruction
-        let mut status = CantCompile;
+        let mut status = None;
         if let Some(gen_fn) = get_gen_fn(VALUE(opcode)) {
-            // :count-placement:
-            // Count bytecode instructions that execute in generated code.
-            // Note that the increment happens even when the output takes side exit.
-            gen_counter_incr!(asm, exec_instruction);
-
             // Add a comment for the name of the YARV instruction
-            asm.comment(&format!("Insn: {}", insn_name(opcode)));
+            asm_comment!(asm, "Insn: {:04} {} (stack_size: {})", insn_idx, insn_name(opcode), asm.ctx.get_stack_size());
 
             // If requested, dump instructions for debugging
             if get_option!(dump_insns) {
@@ -782,27 +1262,30 @@ pub fn gen_single_block(
             }
 
             // Call the code generation function
-            status = gen_fn(&mut jit, &mut ctx, &mut asm, ocb);
+            jit_perf_symbol_push!(jit, &mut asm, &insn_name(opcode), PerfMap::Codegen);
+            status = gen_fn(&mut jit, &mut asm, ocb);
+            jit_perf_symbol_pop!(jit, &mut asm, PerfMap::Codegen);
+
+            #[cfg(debug_assertions)]
+            assert!(!asm.get_leaf_ccall(), "ccall() wasn't used after leaf_ccall was set in {}", insn_name(opcode));
         }
 
         // If we can't compile this instruction
         // exit to the interpreter and stop compiling
-        if status == CantCompile {
+        if status == None {
             if get_option!(dump_insns) {
                 println!("can't compile {}", insn_name(opcode));
             }
 
-            let mut block = jit.block.borrow_mut();
-
-            // TODO: if the codegen function makes changes to ctx and then return YJIT_CANT_COMPILE,
-            // the exit this generates would be wrong. We could save a copy of the entry context
-            // and assert that ctx is the same here.
-            gen_exit(jit.pc, &ctx, &mut asm);
+            // Rewind stack_size using ctx.with_stack_size to allow stack_size changes
+            // before you return None.
+            asm.ctx = asm.ctx.with_stack_size(jit.stack_size_for_pc);
+            gen_exit(jit.pc, &mut asm);
 
-            // If this is the first instruction in the block, then we can use
-            // the exit for block->entry_exit.
-            if insn_idx == block.get_blockid().idx {
-                block.entry_exit = block.get_start_addr();
+            // If this is the first instruction in the block, then
+            // the entry address is the address for block_entry_exit
+            if insn_idx == jit.starting_insn_idx {
+                jit.block_entry_exit = Some(jit.output_ptr);
             }
 
             break;
@@ -810,347 +1293,385 @@ pub fn gen_single_block(
 
         // For now, reset the chain depth after each instruction as only the
         // first instruction in the block can concern itself with the depth.
-        ctx.reset_chain_depth();
+        asm.ctx.reset_chain_depth_and_defer();
 
         // Move to the next instruction to compile
-        insn_idx += insn_len(opcode);
+        insn_idx += insn_len(opcode) as u16;
 
         // If the instruction terminates this block
-        if status == EndBlock {
+        if status == Some(EndBlock) {
             break;
         }
     }
+    let end_insn_idx = insn_idx;
 
-    // Finish filling out the block
-    {
-        let mut block = jit.block.borrow_mut();
-        if block.entry_exit.is_some() {
-            asm.pad_inval_patch();
-        }
+    // We currently can't handle cases where the request is for a block that
+    // doesn't go to the next instruction in the same iseq.
+    assert!(!jit.record_boundary_patch_point);
 
-        // Compile code into the code block
-        let gc_offsets = asm.compile(cb);
+    // Pad the block if it has the potential to be invalidated
+    if jit.block_entry_exit.is_some() {
+        asm.pad_inval_patch();
+    }
 
-        // Add the GC offsets to the block
-        block.add_gc_obj_offsets(gc_offsets);
+    // Mark the end of an ISEQ for --yjit-perf
+    jit_perf_symbol_pop!(jit, &mut asm, PerfMap::ISEQ);
 
-        // Mark the end position of the block
-        block.set_end_addr(cb.get_write_ptr());
+    // Compile code into the code block
+    let (_, gc_offsets) = asm.compile(cb, Some(ocb)).ok_or(())?;
+    let end_addr = cb.get_write_ptr();
 
-        // Store the index of the last instruction in the block
-        block.set_end_idx(insn_idx);
+    // Flush perf symbols after asm.compile() writes addresses
+    if get_option!(perf_map).is_some() {
+        jit.flush_perf_symbols(cb);
     }
 
-    // We currently can't handle cases where the request is for a block that
-    // doesn't go to the next instruction.
-    assert!(!jit.record_boundary_patch_point);
-
     // If code for the block doesn't fit, fail
     if cb.has_dropped_bytes() || ocb.unwrap().has_dropped_bytes() {
-        free_block(&blockref);
         return Err(());
     }
 
     // Block compiled successfully
-    Ok(blockref)
+    Ok(jit.into_block(end_insn_idx, block_start_addr, end_addr, gc_offsets))
 }
 
 fn gen_nop(
     _jit: &mut JITState,
-    _ctx: &mut Context,
     _asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
     // Do nothing
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 fn gen_pop(
     _jit: &mut JITState,
-    ctx: &mut Context,
-    _asm: &mut Assembler,
+    asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
     // Decrement SP
-    ctx.stack_pop(1);
-    KeepCompiling
+    asm.stack_pop(1);
+    Some(KeepCompiling)
 }
 
 fn gen_dup(
     _jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-
-    let dup_val = ctx.stack_pop(0);
-    let (mapping, tmp_type) = ctx.get_opnd_mapping(StackOpnd(0));
+) -> Option<CodegenStatus> {
+    let dup_val = asm.stack_opnd(0);
+    let mapping = asm.ctx.get_opnd_mapping(dup_val.into());
 
-    let loc0 = ctx.stack_push_mapping((mapping, tmp_type));
+    let loc0 = asm.stack_push_mapping(mapping);
     asm.mov(loc0, dup_val);
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 // duplicate stack top n elements
 fn gen_dupn(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let n = jit_get_arg(jit, 0).as_usize();
+) -> Option<CodegenStatus> {
+    let n = jit.get_arg(0).as_usize();
 
     // In practice, seems to be only used for n==2
     if n != 2 {
-        return CantCompile;
+        return None;
     }
 
-    let opnd1: Opnd = ctx.stack_opnd(1);
-    let opnd0: Opnd = ctx.stack_opnd(0);
+    let opnd1: Opnd = asm.stack_opnd(1);
+    let opnd0: Opnd = asm.stack_opnd(0);
 
-    let mapping1 = ctx.get_opnd_mapping(StackOpnd(1));
-    let mapping0 = ctx.get_opnd_mapping(StackOpnd(0));
+    let mapping1 = asm.ctx.get_opnd_mapping(opnd1.into());
+    let mapping0 = asm.ctx.get_opnd_mapping(opnd0.into());
 
-    let dst1: Opnd = ctx.stack_push_mapping(mapping1);
+    let dst1: Opnd = asm.stack_push_mapping(mapping1);
     asm.mov(dst1, opnd1);
 
-    let dst0: Opnd = ctx.stack_push_mapping(mapping0);
+    let dst0: Opnd = asm.stack_push_mapping(mapping0);
     asm.mov(dst0, opnd0);
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 // Swap top 2 stack entries
 fn gen_swap(
-    jit: &mut JITState,
-    ctx: &mut Context,
+    _jit: &mut JITState,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    stack_swap(jit, ctx, asm, 0, 1);
-    KeepCompiling
+) -> Option<CodegenStatus> {
+    stack_swap(asm, 0, 1);
+    Some(KeepCompiling)
 }
 
 fn stack_swap(
-    _jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
-    offset0: u16,
-    offset1: u16,
+    offset0: i32,
+    offset1: i32,
 ) {
-    let stack0_mem = ctx.stack_opnd(offset0 as i32);
-    let stack1_mem = ctx.stack_opnd(offset1 as i32);
+    let stack0_mem = asm.stack_opnd(offset0);
+    let stack1_mem = asm.stack_opnd(offset1);
 
-    let mapping0 = ctx.get_opnd_mapping(StackOpnd(offset0));
-    let mapping1 = ctx.get_opnd_mapping(StackOpnd(offset1));
+    let mapping0 = asm.ctx.get_opnd_mapping(stack0_mem.into());
+    let mapping1 = asm.ctx.get_opnd_mapping(stack1_mem.into());
 
     let stack0_reg = asm.load(stack0_mem);
     let stack1_reg = asm.load(stack1_mem);
     asm.mov(stack0_mem, stack1_reg);
     asm.mov(stack1_mem, stack0_reg);
 
-    ctx.set_opnd_mapping(StackOpnd(offset0), mapping1);
-    ctx.set_opnd_mapping(StackOpnd(offset1), mapping0);
+    asm.ctx.set_opnd_mapping(stack0_mem.into(), mapping1);
+    asm.ctx.set_opnd_mapping(stack1_mem.into(), mapping0);
 }
 
 fn gen_putnil(
-    jit: &mut JITState,
-    ctx: &mut Context,
+    _jit: &mut JITState,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    jit_putobject(jit, ctx, asm, Qnil);
-    KeepCompiling
+) -> Option<CodegenStatus> {
+    jit_putobject(asm, Qnil);
+    Some(KeepCompiling)
 }
 
-fn jit_putobject(_jit: &mut JITState, ctx: &mut Context, asm: &mut Assembler, arg: VALUE) {
+fn jit_putobject(asm: &mut Assembler, arg: VALUE) {
     let val_type: Type = Type::from(arg);
-    let stack_top = ctx.stack_push(val_type);
+    let stack_top = asm.stack_push(val_type);
     asm.mov(stack_top, arg.into());
 }
 
 fn gen_putobject_int2fix(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
-    _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
+    ocb: &mut OutlinedCb,
+) -> Option<CodegenStatus> {
     let opcode = jit.opcode;
     let cst_val: usize = if opcode == YARVINSN_putobject_INT2FIX_0_.as_usize() {
         0
     } else {
         1
     };
+    let cst_val = VALUE::fixnum_from_usize(cst_val);
 
-    jit_putobject(jit, ctx, asm, VALUE::fixnum_from_usize(cst_val));
-    KeepCompiling
+    if let Some(result) = fuse_putobject_opt_ltlt(jit, asm, cst_val, ocb) {
+        return Some(result);
+    }
+
+    jit_putobject(asm, cst_val);
+    Some(KeepCompiling)
 }
 
 fn gen_putobject(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
-    _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let arg: VALUE = jit_get_arg(jit, 0);
+    ocb: &mut OutlinedCb,
+) -> Option<CodegenStatus> {
+    let arg: VALUE = jit.get_arg(0);
+
+    if let Some(result) = fuse_putobject_opt_ltlt(jit, asm, arg, ocb) {
+        return Some(result);
+    }
+
+    jit_putobject(asm, arg);
+    Some(KeepCompiling)
+}
+
+/// Combine `putobject` and `opt_ltlt` together if profitable, for example when
+/// left shifting an integer by a constant amount.
+fn fuse_putobject_opt_ltlt(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    constant_object: VALUE,
+    ocb: &mut OutlinedCb,
+) -> Option<CodegenStatus> {
+    let next_opcode = unsafe { rb_vm_insn_addr2opcode(jit.pc.add(insn_len(jit.opcode).as_usize()).read().as_ptr()) };
+    if next_opcode == YARVINSN_opt_ltlt as i32 && constant_object.fixnum_p() {
+        // Untag the fixnum shift amount
+        let shift_amt = constant_object.as_isize() >> 1;
+        if shift_amt > 63 || shift_amt < 0 {
+            return None;
+        }
+        if !jit.at_current_insn() {
+            defer_compilation(jit, asm, ocb);
+            return Some(EndBlock);
+        }
+
+        let lhs = jit.peek_at_stack(&asm.ctx, 0);
+        if !lhs.fixnum_p() {
+            return None;
+        }
+
+        if !assume_bop_not_redefined(jit, asm, ocb, INTEGER_REDEFINED_OP_FLAG, BOP_LTLT) {
+            return None;
+        }
+
+        asm_comment!(asm, "integer left shift with rhs={shift_amt}");
+        let lhs = asm.stack_opnd(0);
 
-    jit_putobject(jit, ctx, asm, arg);
-    KeepCompiling
+        // Guard that lhs is a fixnum if necessary
+        let lhs_type = asm.ctx.get_opnd_type(lhs.into());
+        if lhs_type != Type::Fixnum {
+            asm_comment!(asm, "guard arg0 fixnum");
+            asm.test(lhs, Opnd::UImm(RUBY_FIXNUM_FLAG as u64));
+
+            jit_chain_guard(
+                JCC_JZ,
+                jit,
+                asm,
+                ocb,
+                SEND_MAX_DEPTH,
+                Counter::guard_send_not_fixnums,
+            );
+        }
+
+        asm.stack_pop(1);
+        fixnum_left_shift_body(asm, lhs, shift_amt as u64);
+        return end_block_with_jump(jit, asm, ocb, jit.next_next_insn_idx());
+    }
+    return None;
 }
 
 fn gen_putself(
     _jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
 
     // Write it on the stack
-    let stack_top = ctx.stack_push_self();
+    let stack_top = asm.stack_push_self();
     asm.mov(
         stack_top,
         Opnd::mem(VALUE_BITS, CFP, RUBY_OFFSET_CFP_SELF)
     );
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 fn gen_putspecialobject(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let object_type = jit_get_arg(jit, 0).as_usize();
+) -> Option<CodegenStatus> {
+    let object_type = jit.get_arg(0).as_usize();
 
     if object_type == VM_SPECIAL_OBJECT_VMCORE.as_usize() {
-        let stack_top = ctx.stack_push(Type::UnknownHeap);
+        let stack_top = asm.stack_push(Type::UnknownHeap);
         let frozen_core = unsafe { rb_mRubyVMFrozenCore };
         asm.mov(stack_top, frozen_core.into());
-        KeepCompiling
+        Some(KeepCompiling)
     } else {
         // TODO: implement for VM_SPECIAL_OBJECT_CBASE and
         // VM_SPECIAL_OBJECT_CONST_BASE
-        CantCompile
+        None
     }
 }
 
 // set Nth stack entry to stack top
 fn gen_setn(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let n = jit_get_arg(jit, 0).as_usize();
+) -> Option<CodegenStatus> {
+    let n = jit.get_arg(0).as_usize();
 
-    let top_val = ctx.stack_pop(0);
-    let dst_opnd = ctx.stack_opnd(n.try_into().unwrap());
+    let top_val = asm.stack_opnd(0);
+    let dst_opnd = asm.stack_opnd(n.try_into().unwrap());
     asm.mov(
         dst_opnd,
         top_val
     );
 
-    let mapping = ctx.get_opnd_mapping(StackOpnd(0));
-    ctx.set_opnd_mapping(StackOpnd(n.try_into().unwrap()), mapping);
+    let mapping = asm.ctx.get_opnd_mapping(top_val.into());
+    asm.ctx.set_opnd_mapping(dst_opnd.into(), mapping);
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 // get nth stack value, then push it
 fn gen_topn(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let n = jit_get_arg(jit, 0).as_usize();
+) -> Option<CodegenStatus> {
+    let n = jit.get_arg(0).as_usize();
 
-    let top_n_val = ctx.stack_opnd(n.try_into().unwrap());
-    let mapping = ctx.get_opnd_mapping(StackOpnd(n.try_into().unwrap()));
-    let loc0 = ctx.stack_push_mapping(mapping);
+    let top_n_val = asm.stack_opnd(n.try_into().unwrap());
+    let mapping = asm.ctx.get_opnd_mapping(top_n_val.into());
+    let loc0 = asm.stack_push_mapping(mapping);
     asm.mov(loc0, top_n_val);
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 // Pop n values off the stack
 fn gen_adjuststack(
     jit: &mut JITState,
-    ctx: &mut Context,
-    _cb: &mut Assembler,
+    asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let n = jit_get_arg(jit, 0).as_usize();
-    ctx.stack_pop(n);
-    KeepCompiling
+) -> Option<CodegenStatus> {
+    let n = jit.get_arg(0).as_usize();
+    asm.stack_pop(n);
+    Some(KeepCompiling)
 }
 
 fn gen_opt_plus(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    if !jit_at_current_insn(jit) {
-        defer_compilation(jit, ctx, asm, ocb);
-        return EndBlock;
-    }
-
-    let comptime_a = jit_peek_at_stack(jit, ctx, 1);
-    let comptime_b = jit_peek_at_stack(jit, ctx, 0);
-
-    if comptime_a.fixnum_p() && comptime_b.fixnum_p() {
-        // Create a side-exit to fall back to the interpreter
-        // Note: we generate the side-exit before popping operands from the stack
-        let side_exit = get_side_exit(jit, ocb, ctx);
+) -> Option<CodegenStatus> {
+    let two_fixnums = match asm.ctx.two_fixnums_on_stack(jit) {
+        Some(two_fixnums) => two_fixnums,
+        None => {
+            defer_compilation(jit, asm, ocb);
+            return Some(EndBlock);
+        }
+    };
 
-        if !assume_bop_not_redefined(jit, ocb, INTEGER_REDEFINED_OP_FLAG, BOP_PLUS) {
-            return CantCompile;
+    if two_fixnums {
+        if !assume_bop_not_redefined(jit, asm, ocb, INTEGER_REDEFINED_OP_FLAG, BOP_PLUS) {
+            return None;
         }
 
         // Check that both operands are fixnums
-        guard_two_fixnums(jit, ctx, asm, ocb, side_exit);
+        guard_two_fixnums(jit, asm, ocb);
 
         // Get the operands from the stack
-        let arg1 = ctx.stack_pop(1);
-        let arg0 = ctx.stack_pop(1);
+        let arg1 = asm.stack_pop(1);
+        let arg0 = asm.stack_pop(1);
 
         // Add arg0 + arg1 and test for overflow
         let arg0_untag = asm.sub(arg0, Opnd::Imm(1));
         let out_val = asm.add(arg0_untag, arg1);
-        asm.jo(side_exit);
+        asm.jo(Target::side_exit(Counter::opt_plus_overflow));
 
         // Push the output on the stack
-        let dst = ctx.stack_push(Type::Fixnum);
+        let dst = asm.stack_push(Type::Fixnum);
         asm.mov(dst, out_val);
 
-        KeepCompiling
+        Some(KeepCompiling)
     } else {
-        gen_opt_send_without_block(jit, ctx, asm, ocb)
+        gen_opt_send_without_block(jit, asm, ocb)
     }
 }
 
 // new array initialized from top N values
 fn gen_newarray(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let n = jit_get_arg(jit, 0).as_u32();
+) -> Option<CodegenStatus> {
+    let n = jit.get_arg(0).as_u32();
 
     // Save the PC and SP because we are allocating
-    jit_prepare_routine_call(jit, ctx, asm);
+    jit_prepare_call_with_gc(jit, asm);
 
     // If n is 0, then elts is never going to be read, so we can just pass null
     let values_ptr = if n == 0 {
         Opnd::UImm(0)
     } else {
-        asm.comment("load pointer to array elts");
-        let offset_magnitude = (SIZEOF_VALUE as u32) * n;
-        let values_opnd = ctx.sp_opnd(-(offset_magnitude as isize));
+        asm_comment!(asm, "load pointer to array elements");
+        let values_opnd = asm.ctx.sp_opnd(-(n as i32));
         asm.lea(values_opnd)
     };
 
@@ -1164,24 +1685,23 @@ fn gen_newarray(
         ]
     );
 
-    ctx.stack_pop(n.as_usize());
-    let stack_ret = ctx.stack_push(Type::CArray);
+    asm.stack_pop(n.as_usize());
+    let stack_ret = asm.stack_push(Type::CArray);
     asm.mov(stack_ret, new_ary);
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 // dup array
 fn gen_duparray(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let ary = jit_get_arg(jit, 0);
+) -> Option<CodegenStatus> {
+    let ary = jit.get_arg(0);
 
     // Save the PC and SP because we are allocating
-    jit_prepare_routine_call(jit, ctx, asm);
+    jit_prepare_call_with_gc(jit, asm);
 
     // call rb_ary_resurrect(VALUE ary);
     let new_ary = asm.ccall(
@@ -1189,141 +1709,341 @@ fn gen_duparray(
         vec![ary.into()],
     );
 
-    let stack_ret = ctx.stack_push(Type::CArray);
+    let stack_ret = asm.stack_push(Type::CArray);
     asm.mov(stack_ret, new_ary);
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 // dup hash
 fn gen_duphash(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let hash = jit_get_arg(jit, 0);
+) -> Option<CodegenStatus> {
+    let hash = jit.get_arg(0);
 
     // Save the PC and SP because we are allocating
-    jit_prepare_routine_call(jit, ctx, asm);
+    jit_prepare_call_with_gc(jit, asm);
 
     // call rb_hash_resurrect(VALUE hash);
     let hash = asm.ccall(rb_hash_resurrect as *const u8, vec![hash.into()]);
 
-    let stack_ret = ctx.stack_push(Type::Hash);
+    let stack_ret = asm.stack_push(Type::CHash);
     asm.mov(stack_ret, hash);
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 // call to_a on the array on the stack
 fn gen_splatarray(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let flag = jit_get_arg(jit, 0).as_usize();
+) -> Option<CodegenStatus> {
+    let flag = jit.get_arg(0).as_usize();
 
-    // Save the PC and SP because the callee may allocate
+    // Save the PC and SP because the callee may call #to_a
     // Note that this modifies REG_SP, which is why we do it first
-    jit_prepare_routine_call(jit, ctx, asm);
+    jit_prepare_non_leaf_call(jit, asm);
 
     // Get the operands from the stack
-    let ary_opnd = ctx.stack_pop(1);
+    let ary_opnd = asm.stack_opnd(0);
 
     // Call rb_vm_splat_array(flag, ary)
     let ary = asm.ccall(rb_vm_splat_array as *const u8, vec![flag.into(), ary_opnd]);
+    asm.stack_pop(1); // Keep it on stack during ccall for GC
 
-    let stack_ret = ctx.stack_push(Type::TArray);
+    let stack_ret = asm.stack_push(Type::TArray);
     asm.mov(stack_ret, ary);
 
-    KeepCompiling
+    Some(KeepCompiling)
+}
+
+// call to_hash on hash to keyword splat before converting block
+// e.g. foo(**object, &block)
+fn gen_splatkw(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    ocb: &mut OutlinedCb,
+) -> Option<CodegenStatus> {
+    // Defer compilation so we can specialize on a runtime hash operand
+    if !jit.at_current_insn() {
+        defer_compilation(jit, asm, ocb);
+        return Some(EndBlock);
+    }
+
+    let comptime_hash = jit.peek_at_stack(&asm.ctx, 1);
+    if comptime_hash.hash_p() {
+        // If a compile-time hash operand is T_HASH, just guard that it's T_HASH.
+        let hash_opnd = asm.stack_opnd(1);
+        guard_object_is_hash(asm, hash_opnd, hash_opnd.into(), Counter::splatkw_not_hash);
+    } else if comptime_hash.nil_p() {
+        // Speculate we'll see nil if compile-time hash operand is nil
+        let hash_opnd = asm.stack_opnd(1);
+        let hash_opnd_type = asm.ctx.get_opnd_type(hash_opnd.into());
+
+        if hash_opnd_type != Type::Nil {
+            asm.cmp(hash_opnd, Qnil.into());
+            asm.jne(Target::side_exit(Counter::splatkw_not_nil));
+
+            if Type::Nil.diff(hash_opnd_type) != TypeDiff::Incompatible {
+                asm.ctx.upgrade_opnd_type(hash_opnd.into(), Type::Nil);
+            }
+        }
+    } else {
+        // Otherwise, call #to_hash on the operand if it's not nil.
+
+        // Save the PC and SP because the callee may call #to_hash
+        jit_prepare_non_leaf_call(jit, asm);
+
+        // Get the operands from the stack
+        let block_opnd = asm.stack_opnd(0);
+        let block_type = asm.ctx.get_opnd_type(block_opnd.into());
+        let hash_opnd = asm.stack_opnd(1);
+
+        c_callable! {
+            fn to_hash_if_not_nil(mut obj: VALUE) -> VALUE {
+                if obj != Qnil {
+                    obj = unsafe { rb_to_hash_type(obj) };
+                }
+                obj
+            }
+        }
+
+        let hash = asm.ccall(to_hash_if_not_nil as _, vec![hash_opnd]);
+        asm.stack_pop(2); // Keep it on stack during ccall for GC
+
+        let stack_ret = asm.stack_push(Type::Unknown);
+        asm.mov(stack_ret, hash);
+        asm.stack_push(block_type);
+        // Leave block_opnd spilled by ccall as is
+        asm.ctx.dealloc_temp_reg(asm.ctx.get_stack_size() - 1);
+    }
+
+    Some(KeepCompiling)
 }
 
 // concat two arrays
 fn gen_concatarray(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    // Save the PC and SP because the callee may allocate
+) -> Option<CodegenStatus> {
+    // Save the PC and SP because the callee may call #to_a
     // Note that this modifies REG_SP, which is why we do it first
-    jit_prepare_routine_call(jit, ctx, asm);
+    jit_prepare_non_leaf_call(jit, asm);
 
     // Get the operands from the stack
-    let ary2st_opnd = ctx.stack_pop(1);
-    let ary1_opnd = ctx.stack_pop(1);
+    let ary2st_opnd = asm.stack_opnd(0);
+    let ary1_opnd = asm.stack_opnd(1);
 
     // Call rb_vm_concat_array(ary1, ary2st)
     let ary = asm.ccall(rb_vm_concat_array as *const u8, vec![ary1_opnd, ary2st_opnd]);
+    asm.stack_pop(2); // Keep them on stack during ccall for GC
 
-    let stack_ret = ctx.stack_push(Type::TArray);
+    let stack_ret = asm.stack_push(Type::TArray);
     asm.mov(stack_ret, ary);
 
-    KeepCompiling
+    Some(KeepCompiling)
+}
+
+// concat second array to first array.
+// first argument must already be an array.
+// attempts to convert second object to array using to_a.
+fn gen_concattoarray(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    _ocb: &mut OutlinedCb,
+) -> Option<CodegenStatus> {
+    // Save the PC and SP because the callee may call #to_a
+    jit_prepare_non_leaf_call(jit, asm);
+
+    // Get the operands from the stack
+    let ary2_opnd = asm.stack_opnd(0);
+    let ary1_opnd = asm.stack_opnd(1);
+
+    let ary = asm.ccall(rb_vm_concat_to_array as *const u8, vec![ary1_opnd, ary2_opnd]);
+    asm.stack_pop(2); // Keep them on stack during ccall for GC
+
+    let stack_ret = asm.stack_push(Type::TArray);
+    asm.mov(stack_ret, ary);
+
+    Some(KeepCompiling)
+}
+
+// Push the top `num` stack values onto the array located directly below them on the stack.
+fn gen_pushtoarray(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    _ocb: &mut OutlinedCb,
+) -> Option<CodegenStatus> {
+    let num = jit.get_arg(0).as_u64();
+
+    // Save the PC and SP because the callee may allocate
+    jit_prepare_call_with_gc(jit, asm);
+
+    // Get the operands from the stack
+    let ary_opnd = asm.stack_opnd(num as i32);
+    let objp_opnd = asm.lea(asm.ctx.sp_opnd(-(num as i32)));
+
+    let ary = asm.ccall(rb_ary_cat as *const u8, vec![ary_opnd, objp_opnd, num.into()]);
+    asm.stack_pop(num as usize + 1); // Keep it on stack during ccall for GC
+
+    let stack_ret = asm.stack_push(Type::TArray);
+    asm.mov(stack_ret, ary);
+
+    Some(KeepCompiling)
 }
 
 // new range initialized from top 2 values
 fn gen_newrange(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let flag = jit_get_arg(jit, 0).as_usize();
+) -> Option<CodegenStatus> {
+    let flag = jit.get_arg(0).as_usize();
 
     // rb_range_new() allocates and can raise
-    jit_prepare_routine_call(jit, ctx, asm);
+    jit_prepare_non_leaf_call(jit, asm);
 
     // val = rb_range_new(low, high, (int)flag);
     let range_opnd = asm.ccall(
         rb_range_new as *const u8,
         vec![
-            ctx.stack_opnd(1),
-            ctx.stack_opnd(0),
+            asm.stack_opnd(1),
+            asm.stack_opnd(0),
             flag.into()
         ]
     );
 
-    ctx.stack_pop(2);
-    let stack_ret = ctx.stack_push(Type::UnknownHeap);
+    asm.stack_pop(2);
+    let stack_ret = asm.stack_push(Type::UnknownHeap);
     asm.mov(stack_ret, range_opnd);
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 fn guard_object_is_heap(
     asm: &mut Assembler,
-    object_opnd: Opnd,
-    side_exit: Target,
+    object: Opnd,
+    object_opnd: YARVOpnd,
+    counter: Counter,
 ) {
-    asm.comment("guard object is heap");
+    let object_type = asm.ctx.get_opnd_type(object_opnd);
+    if object_type.is_heap() {
+        return;
+    }
+
+    asm_comment!(asm, "guard object is heap");
 
     // Test that the object is not an immediate
-    asm.test(object_opnd, (RUBY_IMMEDIATE_MASK as u64).into());
-    asm.jnz(side_exit);
+    asm.test(object, (RUBY_IMMEDIATE_MASK as u64).into());
+    asm.jnz(Target::side_exit(counter));
 
     // Test that the object is not false
-    asm.cmp(object_opnd, Qfalse.into());
-    asm.je(side_exit);
+    asm.cmp(object, Qfalse.into());
+    asm.je(Target::side_exit(counter));
+
+    if Type::UnknownHeap.diff(object_type) != TypeDiff::Incompatible {
+        asm.ctx.upgrade_opnd_type(object_opnd, Type::UnknownHeap);
+    }
 }
 
 fn guard_object_is_array(
     asm: &mut Assembler,
-    object_opnd: Opnd,
-    side_exit: Target,
+    object: Opnd,
+    object_opnd: YARVOpnd,
+    counter: Counter,
 ) {
-    asm.comment("guard object is array");
+    let object_type = asm.ctx.get_opnd_type(object_opnd);
+    if object_type.is_array() {
+        return;
+    }
+
+    let object_reg = match object {
+        Opnd::InsnOut { .. } => object,
+        _ => asm.load(object),
+    };
+    guard_object_is_heap(asm, object_reg, object_opnd, counter);
+
+    asm_comment!(asm, "guard object is array");
 
     // Pull out the type mask
-    let flags_opnd = Opnd::mem(VALUE_BITS, object_opnd, RUBY_OFFSET_RBASIC_FLAGS);
+    let flags_opnd = Opnd::mem(VALUE_BITS, object_reg, RUBY_OFFSET_RBASIC_FLAGS);
     let flags_opnd = asm.and(flags_opnd, (RUBY_T_MASK as u64).into());
 
     // Compare the result with T_ARRAY
     asm.cmp(flags_opnd, (RUBY_T_ARRAY as u64).into());
-    asm.jne(side_exit);
+    asm.jne(Target::side_exit(counter));
+
+    if Type::TArray.diff(object_type) != TypeDiff::Incompatible {
+        asm.ctx.upgrade_opnd_type(object_opnd, Type::TArray);
+    }
+}
+
+fn guard_object_is_hash(
+    asm: &mut Assembler,
+    object: Opnd,
+    object_opnd: YARVOpnd,
+    counter: Counter,
+) {
+    let object_type = asm.ctx.get_opnd_type(object_opnd);
+    if object_type.is_hash() {
+        return;
+    }
+
+    let object_reg = match object {
+        Opnd::InsnOut { .. } => object,
+        _ => asm.load(object),
+    };
+    guard_object_is_heap(asm, object_reg, object_opnd, counter);
+
+    asm_comment!(asm, "guard object is hash");
+
+    // Pull out the type mask
+    let flags_opnd = Opnd::mem(VALUE_BITS, object_reg, RUBY_OFFSET_RBASIC_FLAGS);
+    let flags_opnd = asm.and(flags_opnd, (RUBY_T_MASK as u64).into());
+
+    // Compare the result with T_HASH
+    asm.cmp(flags_opnd, (RUBY_T_HASH as u64).into());
+    asm.jne(Target::side_exit(counter));
+
+    if Type::THash.diff(object_type) != TypeDiff::Incompatible {
+        asm.ctx.upgrade_opnd_type(object_opnd, Type::THash);
+    }
+}
+
+fn guard_object_is_string(
+    asm: &mut Assembler,
+    object: Opnd,
+    object_opnd: YARVOpnd,
+    counter: Counter,
+) {
+    let object_type = asm.ctx.get_opnd_type(object_opnd);
+    if object_type.is_string() {
+        return;
+    }
+
+    let object_reg = match object {
+        Opnd::InsnOut { .. } => object,
+        _ => asm.load(object),
+    };
+    guard_object_is_heap(asm, object_reg, object_opnd, counter);
+
+    asm_comment!(asm, "guard object is string");
+
+    // Pull out the type mask
+    let flags_reg = asm.load(Opnd::mem(VALUE_BITS, object_reg, RUBY_OFFSET_RBASIC_FLAGS));
+    let flags_reg = asm.and(flags_reg, Opnd::UImm(RUBY_T_MASK as u64));
+
+    // Compare the result with T_STRING
+    asm.cmp(flags_reg, Opnd::UImm(RUBY_T_STRING as u64));
+    asm.jne(Target::side_exit(counter));
+
+    if Type::TString.diff(object_type) != TypeDiff::Incompatible {
+        asm.ctx.upgrade_opnd_type(object_opnd, Type::TString);
+    }
 }
 
 /// This guards that a special flag is not set on a hash.
@@ -1333,9 +2053,9 @@ fn guard_object_is_array(
 fn guard_object_is_not_ruby2_keyword_hash(
     asm: &mut Assembler,
     object_opnd: Opnd,
-    side_exit: Target,
+    counter: Counter,
 ) {
-    asm.comment("guard object is not ruby2 keyword hash");
+    asm_comment!(asm, "guard object is not ruby2 keyword hash");
 
     let not_ruby2_keyword = asm.new_label("not_ruby2_keyword");
     asm.test(object_opnd, (RUBY_IMMEDIATE_MASK as u64).into());
@@ -1355,159 +2075,166 @@ fn guard_object_is_not_ruby2_keyword_hash(
     asm.jne(not_ruby2_keyword);
 
     asm.test(flags_opnd, (RHASH_PASS_AS_KEYWORDS as u64).into());
-    asm.jnz(side_exit);
+    asm.jnz(Target::side_exit(counter));
 
     asm.write_label(not_ruby2_keyword);
 }
 
-fn guard_object_is_string(
-    asm: &mut Assembler,
-    object_reg: Opnd,
-    side_exit: Target,
-) {
-    asm.comment("guard object is string");
-
-    // Pull out the type mask
-    let flags_reg = asm.load(Opnd::mem(VALUE_BITS, object_reg, RUBY_OFFSET_RBASIC_FLAGS));
-    let flags_reg = asm.and(flags_reg, Opnd::UImm(RUBY_T_MASK as u64));
-
-    // Compare the result with T_STRING
-    asm.cmp(flags_reg, Opnd::UImm(RUBY_T_STRING as u64));
-    asm.jne(side_exit);
-}
-
-// push enough nils onto the stack to fill out an array
+/// This instruction pops a single value off the stack, converts it to an
+/// array if it isn’t already one using the #to_ary method, and then pushes
+/// the values from the array back onto the stack.
 fn gen_expandarray(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
     // Both arguments are rb_num_t which is unsigned
-    let num = jit_get_arg(jit, 0).as_usize();
-    let flag = jit_get_arg(jit, 1).as_usize();
+    let num = jit.get_arg(0).as_u32();
+    let flag = jit.get_arg(1).as_usize();
 
     // If this instruction has the splat flag, then bail out.
     if flag & 0x01 != 0 {
-        gen_counter_incr!(asm, expandarray_splat);
-        return CantCompile;
+        gen_counter_incr(asm, Counter::expandarray_splat);
+        return None;
     }
 
     // If this instruction has the postarg flag, then bail out.
     if flag & 0x02 != 0 {
-        gen_counter_incr!(asm, expandarray_postarg);
-        return CantCompile;
+        gen_counter_incr(asm, Counter::expandarray_postarg);
+        return None;
     }
 
-    let side_exit = get_side_exit(jit, ocb, ctx);
+    let array_opnd = asm.stack_opnd(0);
+
+    // Defer compilation so we can specialize on the runtime value at the top of the stack
+    if !jit.at_current_insn() {
+        defer_compilation(jit, asm, ocb);
+        return Some(EndBlock);
+    }
+
+    let comptime_recv = jit.peek_at_stack(&asm.ctx, 0);
+
+    // If the comptime receiver is not an array
+    if !unsafe { RB_TYPE_P(comptime_recv, RUBY_T_ARRAY) } {
+        // at compile time, ensure to_ary is not defined
+        let target_cme = unsafe { rb_callable_method_entry_or_negative(comptime_recv.class_of(), ID!(to_ary)) };
+        let cme_def_type = unsafe { get_cme_def_type(target_cme) };
+
+        // if to_ary is defined, give up compiling (return None) so to_ary can be called
+        if cme_def_type != VM_METHOD_TYPE_UNDEF {
+            gen_counter_incr(asm, Counter::expandarray_to_ary);
+            return None;
+        }
+
+        // invalidate the compiled block if to_ary is later defined
+        jit.assume_method_lookup_stable(asm, ocb, target_cme);
+
+        jit_guard_known_klass(
+            jit,
+            asm,
+            ocb,
+            comptime_recv.class_of(),
+            array_opnd,
+            array_opnd.into(),
+            comptime_recv,
+            SEND_MAX_DEPTH,
+            Counter::expandarray_not_array,
+        );
+
+        let opnd = asm.stack_pop(1); // pop after using the type info
 
-    let array_type = ctx.get_opnd_type(StackOpnd(0));
-    let array_opnd = ctx.stack_pop(1);
+        // If we don't actually want any values, then just keep going
+        if num == 0 {
+            return Some(KeepCompiling);
+        }
 
-    // num is the number of requested values. If there aren't enough in the
-    // array then we're going to push on nils.
-    if matches!(array_type, Type::Nil) {
-        // special case for a, b = nil pattern
-        // push N nils onto the stack
-        for _ in 0..num {
-            let push_opnd = ctx.stack_push(Type::Nil);
+        // load opnd to avoid a race because we are also pushing onto the stack
+        let opnd = asm.load(opnd);
+
+        for _ in 1..num {
+            let push_opnd = asm.stack_push(Type::Nil);
             asm.mov(push_opnd, Qnil.into());
         }
-        return KeepCompiling;
+
+        let push_opnd = asm.stack_push(Type::Unknown);
+        asm.mov(push_opnd, opnd);
+
+        return Some(KeepCompiling);
     }
 
+    // Get the compile-time array length
+    let comptime_len = unsafe { rb_yjit_array_len(comptime_recv) as u32 };
+
     // Move the array from the stack and check that it's an array.
-    let array_reg = asm.load(array_opnd);
-    guard_object_is_heap(
-        asm,
-        array_reg,
-        counted_exit!(ocb, side_exit, expandarray_not_array),
-    );
     guard_object_is_array(
         asm,
-        array_reg,
-        counted_exit!(ocb, side_exit, expandarray_not_array),
+        array_opnd,
+        array_opnd.into(),
+        Counter::expandarray_not_array,
     );
 
     // If we don't actually want any values, then just return.
     if num == 0 {
-        return KeepCompiling;
+        asm.stack_pop(1); // pop the array
+        return Some(KeepCompiling);
     }
 
-    // Pull out the embed flag to check if it's an embedded array.
-    let flags_opnd = Opnd::mem(VALUE_BITS, array_reg, RUBY_OFFSET_RBASIC_FLAGS);
-
-    // Move the length of the embedded array into REG1.
-    let emb_len_opnd = asm.and(flags_opnd, (RARRAY_EMBED_LEN_MASK as u64).into());
-    let emb_len_opnd = asm.rshift(emb_len_opnd, (RARRAY_EMBED_LEN_SHIFT as u64).into());
-
-    // Conditionally move the length of the heap array into REG1.
-    let flags_opnd = Opnd::mem(VALUE_BITS, array_reg, RUBY_OFFSET_RBASIC_FLAGS);
-    asm.test(flags_opnd, (RARRAY_EMBED_FLAG as u64).into());
-    let array_len_opnd = Opnd::mem(
-        (8 * size_of::<std::os::raw::c_long>()) as u8,
-        asm.load(array_opnd),
-        RUBY_OFFSET_RARRAY_AS_HEAP_LEN,
-    );
-    let array_len_opnd = asm.csel_nz(emb_len_opnd, array_len_opnd);
-
-    // Only handle the case where the number of values in the array is greater
-    // than or equal to the number of values requested.
-    asm.cmp(array_len_opnd, num.into());
-    asm.jl(counted_exit!(ocb, side_exit, expandarray_rhs_too_small));
-
-    // Load the address of the embedded array into REG1.
-    // (struct RArray *)(obj)->as.ary
+    let array_opnd = asm.stack_opnd(0);
     let array_reg = asm.load(array_opnd);
-    let ary_opnd = asm.lea(Opnd::mem(VALUE_BITS, array_reg, RUBY_OFFSET_RARRAY_AS_ARY));
+    let array_len_opnd = get_array_len(asm, array_reg);
 
-    // Conditionally load the address of the heap array into REG1.
-    // (struct RArray *)(obj)->as.heap.ptr
-    let flags_opnd = Opnd::mem(VALUE_BITS, array_reg, RUBY_OFFSET_RBASIC_FLAGS);
-    asm.test(flags_opnd, Opnd::UImm(RARRAY_EMBED_FLAG as u64));
-    let heap_ptr_opnd = Opnd::mem(
-        (8 * size_of::<usize>()) as u8,
-        asm.load(array_opnd),
-        RUBY_OFFSET_RARRAY_AS_HEAP_PTR,
-    );
-    let ary_opnd = asm.csel_nz(ary_opnd, heap_ptr_opnd);
+    // Guard on the comptime/expected array length
+    if comptime_len >= num {
+        asm_comment!(asm, "guard array length >= {}", num);
+        asm.cmp(array_len_opnd, num.into());
+        jit_chain_guard(
+            JCC_JB,
+            jit,
+            asm,
+            ocb,
+            EXPANDARRAY_MAX_CHAIN_DEPTH,
+            Counter::expandarray_chain_max_depth,
+        );
 
-    // Loop backward through the array and push each element onto the stack.
-    for i in (0..num).rev() {
-        let top = ctx.stack_push(Type::Unknown);
-        let offset = i32::try_from(i * SIZEOF_VALUE).unwrap();
-        asm.mov(top, Opnd::mem(64, ary_opnd, offset));
+    } else {
+        asm_comment!(asm, "guard array length == {}", comptime_len);
+        asm.cmp(array_len_opnd, comptime_len.into());
+        jit_chain_guard(
+            JCC_JNE,
+            jit,
+            asm,
+            ocb,
+            EXPANDARRAY_MAX_CHAIN_DEPTH,
+            Counter::expandarray_chain_max_depth,
+        );
     }
 
-    KeepCompiling
-}
+    let array_opnd = asm.stack_pop(1); // pop after using the type info
 
-fn gen_getlocal_wc0(
-    jit: &mut JITState,
-    ctx: &mut Context,
-    asm: &mut Assembler,
-    _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    // Compute the offset from BP to the local
-    let slot_idx = jit_get_arg(jit, 0).as_i32();
-    let offs: i32 = -SIZEOF_VALUE_I32 * slot_idx;
-    let local_idx = slot_to_local_idx(jit.get_iseq(), slot_idx);
-
-    // Load environment pointer EP (level 0) from CFP
-    let ep_opnd = gen_get_ep(asm, 0);
+    // Load the pointer to the embedded or heap array
+    let ary_opnd = if comptime_len > 0 {
+        let array_reg = asm.load(array_opnd);
+        Some(get_array_ptr(asm, array_reg))
+    } else {
+        None
+    };
 
-    // Load the local from the EP
-    let local_opnd = Opnd::mem(64, ep_opnd, offs);
+    // Loop backward through the array and push each element onto the stack.
+    for i in (0..num).rev() {
+        let top = asm.stack_push(if i < comptime_len { Type::Unknown } else { Type::Nil });
+        let offset = i32::try_from(i * (SIZEOF_VALUE as u32)).unwrap();
 
-    // Write the local at SP
-    let stack_top = ctx.stack_push_local(local_idx.as_usize());
-    asm.mov(stack_top, local_opnd);
+        // Missing elements are Qnil
+        asm_comment!(asm, "load array[{}]", i);
+        let elem_opnd = if i < comptime_len { Opnd::mem(64, ary_opnd.unwrap(), offset) } else { Qnil.into() };
+        asm.mov(top, elem_opnd);
+    }
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 // Compute the index of a local variable from its slot index
-fn slot_to_local_idx(iseq: IseqPtr, slot_idx: i32) -> u32 {
+fn ep_offset_to_local_idx(iseq: IseqPtr, ep_offset: u32) -> u32 {
     // Layout illustration
     // This is an array of VALUE
     //                                           | VM_ENV_DATA_SIZE |
@@ -1518,7 +2245,7 @@ fn slot_to_local_idx(iseq: IseqPtr, slot_idx: i32) -> u32 {
     //           ^       ^                       ^                  ^
     //           +-------+---local_table_size----+         cfp->ep--+
     //                   |                                          |
-    //                   +------------------slot_idx----------------+
+    //                   +------------------ep_offset---------------+
     //
     // See usages of local_var_name() from iseq.c for similar calculation.
 
@@ -1526,7 +2253,7 @@ fn slot_to_local_idx(iseq: IseqPtr, slot_idx: i32) -> u32 {
     let local_table_size: i32 = unsafe { get_iseq_body_local_table_size(iseq) }
         .try_into()
         .unwrap();
-    let op = slot_idx - (VM_ENV_DATA_SIZE as i32);
+    let op = (ep_offset - VM_ENV_DATA_SIZE) as i32;
     let local_idx = local_table_size - op - 1;
     assert!(local_idx >= 0 && local_idx < local_table_size);
     local_idx.try_into().unwrap()
@@ -1552,7 +2279,7 @@ fn gen_get_ep(asm: &mut Assembler, level: u32) -> Opnd {
 
 // Gets the EP of the ISeq of the containing method, or "local level".
 // Equivalent of GET_LEP() macro.
-fn gen_get_lep(jit: &mut JITState, asm: &mut Assembler) -> Opnd {
+fn gen_get_lep(jit: &JITState, asm: &mut Assembler) -> Opnd {
     // Equivalent of get_lvar_level() in compile.c
     fn get_lvar_level(iseq: IseqPtr) -> u32 {
         if iseq == unsafe { rb_get_iseq_body_local_iseq(iseq) } {
@@ -1567,180 +2294,184 @@ fn gen_get_lep(jit: &mut JITState, asm: &mut Assembler) -> Opnd {
 }
 
 fn gen_getlocal_generic(
-    ctx: &mut Context,
+    jit: &mut JITState,
     asm: &mut Assembler,
-    local_idx: u32,
+    ocb: &mut OutlinedCb,
+    ep_offset: u32,
     level: u32,
-) -> CodegenStatus {
-    // Load environment pointer EP (level 0) from CFP
-    let ep_opnd = gen_get_ep(asm, level);
+) -> Option<CodegenStatus> {
+    let local_opnd = if level == 0 && jit.assume_no_ep_escape(asm, ocb) {
+        // Load the local using SP register
+        asm.ctx.ep_opnd(-(ep_offset as i32))
+    } else {
+        // Load environment pointer EP at the given level from CFP
+        let ep_opnd = gen_get_ep(asm, level);
 
-    // Load the local from the block
-    // val = *(vm_get_ep(GET_EP(), level) - idx);
-    let offs = -(SIZEOF_VALUE_I32 * local_idx as i32);
-    let local_opnd = Opnd::mem(64, ep_opnd, offs);
+        // Load the local from the block
+        // val = *(vm_get_ep(GET_EP(), level) - idx);
+        let offs = -(SIZEOF_VALUE_I32 * ep_offset as i32);
+        Opnd::mem(64, ep_opnd, offs)
+    };
 
     // Write the local at SP
-    let stack_top = ctx.stack_push(Type::Unknown);
+    let stack_top = if level == 0 {
+        let local_idx = ep_offset_to_local_idx(jit.get_iseq(), ep_offset);
+        asm.stack_push_local(local_idx.as_usize())
+    } else {
+        asm.stack_push(Type::Unknown)
+    };
 
     asm.mov(stack_top, local_opnd);
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 fn gen_getlocal(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
-    _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let idx = jit_get_arg(jit, 0);
-    let level = jit_get_arg(jit, 1);
-    gen_getlocal_generic(ctx, asm, idx.as_u32(), level.as_u32())
+    ocb: &mut OutlinedCb,
+) -> Option<CodegenStatus> {
+    let idx = jit.get_arg(0).as_u32();
+    let level = jit.get_arg(1).as_u32();
+    gen_getlocal_generic(jit, asm, ocb, idx, level)
 }
 
-fn gen_getlocal_wc1(
+fn gen_getlocal_wc0(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
-    _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let idx = jit_get_arg(jit, 0);
-    gen_getlocal_generic(ctx, asm, idx.as_u32(), 1)
+    ocb: &mut OutlinedCb,
+) -> Option<CodegenStatus> {
+    let idx = jit.get_arg(0).as_u32();
+    gen_getlocal_generic(jit, asm, ocb, idx, 0)
 }
 
-fn gen_setlocal_wc0(
+fn gen_getlocal_wc1(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    /*
-    vm_env_write(const VALUE *ep, int index, VALUE v)
-    {
-        VALUE flags = ep[VM_ENV_DATA_INDEX_FLAGS];
-        if (LIKELY((flags & VM_ENV_FLAG_WB_REQUIRED) == 0)) {
-            VM_STACK_ENV_WRITE(ep, index, v);
-        }
-        else {
-            vm_env_write_slowpath(ep, index, v);
-        }
-    }
-    */
-
-    let slot_idx = jit_get_arg(jit, 0).as_i32();
-    let local_idx = slot_to_local_idx(jit.get_iseq(), slot_idx).as_usize();
-    let value_type = ctx.get_opnd_type(StackOpnd(0));
-
-    // Load environment pointer EP (level 0) from CFP
-    let ep_opnd = gen_get_ep(asm, 0);
-
-    // Write barriers may be required when VM_ENV_FLAG_WB_REQUIRED is set, however write barriers
-    // only affect heap objects being written. If we know an immediate value is being written we
-    // can skip this check.
-    if !value_type.is_imm() {
-        // flags & VM_ENV_FLAG_WB_REQUIRED
-        let flags_opnd = Opnd::mem(
-            64,
-            ep_opnd,
-            SIZEOF_VALUE_I32 * VM_ENV_DATA_INDEX_FLAGS as i32,
-        );
-        asm.test(flags_opnd, VM_ENV_FLAG_WB_REQUIRED.into());
-
-        // Create a side-exit to fall back to the interpreter
-        let side_exit = get_side_exit(jit, ocb, ctx);
-
-        // if (flags & VM_ENV_FLAG_WB_REQUIRED) != 0
-        asm.jnz(side_exit);
-    }
-
-    // Set the type of the local variable in the context
-    ctx.set_local_type(local_idx, value_type);
-
-    // Pop the value to write from the stack
-    let stack_top = ctx.stack_pop(1);
-
-    // Write the value at the environment pointer
-    let offs: i32 = -8 * slot_idx;
-    asm.mov(Opnd::mem(64, ep_opnd, offs), stack_top);
-
-    KeepCompiling
+) -> Option<CodegenStatus> {
+    let idx = jit.get_arg(0).as_u32();
+    gen_getlocal_generic(jit, asm, ocb, idx, 1)
 }
 
 fn gen_setlocal_generic(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-    local_idx: i32,
+    ep_offset: u32,
     level: u32,
-) -> CodegenStatus {
-    let value_type = ctx.get_opnd_type(StackOpnd(0));
+) -> Option<CodegenStatus> {
+    let value_type = asm.ctx.get_opnd_type(StackOpnd(0));
+
+    // Fallback because of write barrier
+    if asm.ctx.get_chain_depth() > 0 {
+        // Load environment pointer EP at level
+        let ep_opnd = gen_get_ep(asm, level);
+
+        // This function should not yield to the GC.
+        // void rb_vm_env_write(const VALUE *ep, int index, VALUE v)
+        let index = -(ep_offset as i64);
+        let value_opnd = asm.stack_opnd(0);
+        asm.ccall(
+            rb_vm_env_write as *const u8,
+            vec![
+                ep_opnd,
+                index.into(),
+                value_opnd,
+            ]
+        );
+        asm.stack_pop(1);
 
-    // Load environment pointer EP at level
-    let ep_opnd = gen_get_ep(asm, level);
+        return Some(KeepCompiling);
+    }
 
-    // Write barriers may be required when VM_ENV_FLAG_WB_REQUIRED is set, however write barriers
-    // only affect heap objects being written. If we know an immediate value is being written we
-    // can skip this check.
-    if !value_type.is_imm() {
-        // flags & VM_ENV_FLAG_WB_REQUIRED
+    let (flags_opnd, local_opnd) = if level == 0 && jit.assume_no_ep_escape(asm, ocb) {
+        // Load flags and the local using SP register
+        let local_opnd = asm.ctx.ep_opnd(-(ep_offset as i32));
+        let flags_opnd = asm.ctx.ep_opnd(VM_ENV_DATA_INDEX_FLAGS as i32);
+        (flags_opnd, local_opnd)
+    } else {
+        // Load flags and the local for the level
+        let ep_opnd = gen_get_ep(asm, level);
         let flags_opnd = Opnd::mem(
             64,
             ep_opnd,
             SIZEOF_VALUE_I32 * VM_ENV_DATA_INDEX_FLAGS as i32,
         );
-        asm.test(flags_opnd, VM_ENV_FLAG_WB_REQUIRED.into());
+        (flags_opnd, Opnd::mem(64, ep_opnd, -SIZEOF_VALUE_I32 * ep_offset as i32))
+    };
 
-        // Create a side-exit to fall back to the interpreter
-        let side_exit = get_side_exit(jit, ocb, ctx);
+    // Write barriers may be required when VM_ENV_FLAG_WB_REQUIRED is set, however write barriers
+    // only affect heap objects being written. If we know an immediate value is being written we
+    // can skip this check.
+    if !value_type.is_imm() {
+        // flags & VM_ENV_FLAG_WB_REQUIRED
+        asm.test(flags_opnd, VM_ENV_FLAG_WB_REQUIRED.into());
 
         // if (flags & VM_ENV_FLAG_WB_REQUIRED) != 0
-        asm.jnz(side_exit);
+        assert!(asm.ctx.get_chain_depth() == 0);
+        jit_chain_guard(
+            JCC_JNZ,
+            jit,
+            asm,
+            ocb,
+            1,
+            Counter::setlocal_wb_required,
+        );
+    }
+
+    if level == 0 {
+        let local_idx = ep_offset_to_local_idx(jit.get_iseq(), ep_offset).as_usize();
+        asm.ctx.set_local_type(local_idx, value_type);
     }
 
     // Pop the value to write from the stack
-    let stack_top = ctx.stack_pop(1);
+    let stack_top = asm.stack_pop(1);
 
     // Write the value at the environment pointer
-    let offs = -(SIZEOF_VALUE_I32 * local_idx);
-    asm.mov(Opnd::mem(64, ep_opnd, offs), stack_top);
+    asm.mov(local_opnd, stack_top);
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 fn gen_setlocal(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let idx = jit_get_arg(jit, 0).as_i32();
-    let level = jit_get_arg(jit, 1).as_u32();
-    gen_setlocal_generic(jit, ctx, asm, ocb, idx, level)
+) -> Option<CodegenStatus> {
+    let idx = jit.get_arg(0).as_u32();
+    let level = jit.get_arg(1).as_u32();
+    gen_setlocal_generic(jit, asm, ocb, idx, level)
+}
+
+fn gen_setlocal_wc0(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    ocb: &mut OutlinedCb,
+) -> Option<CodegenStatus> {
+    let idx = jit.get_arg(0).as_u32();
+    gen_setlocal_generic(jit, asm, ocb, idx, 0)
 }
 
 fn gen_setlocal_wc1(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let idx = jit_get_arg(jit, 0).as_i32();
-    gen_setlocal_generic(jit, ctx, asm, ocb, idx, 1)
+) -> Option<CodegenStatus> {
+    let idx = jit.get_arg(0).as_u32();
+    gen_setlocal_generic(jit, asm, ocb, idx, 1)
 }
 
 // new hash initialized from top N values
 fn gen_newhash(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let num: u64 = jit_get_arg(jit, 0).as_u64();
+) -> Option<CodegenStatus> {
+    let num: u64 = jit.get_arg(0).as_u64();
 
     // Save the PC and SP because we are allocating
-    jit_prepare_routine_call(jit, ctx, asm);
+    jit_prepare_call_with_gc(jit, asm);
 
     if num != 0 {
         // val = rb_hash_new_with_size(num / 2);
@@ -1754,7 +2485,7 @@ fn gen_newhash(
         asm.cpush(new_hash); // x86 alignment
 
         // Get a pointer to the values to insert into the hash
-        let stack_addr_from_top = asm.lea(ctx.stack_opnd((num - 1) as i32));
+        let stack_addr_from_top = asm.lea(asm.stack_opnd((num - 1) as i32));
 
         // rb_hash_bulk_insert(num, STACK_ADDR_FROM_TOP(num), val);
         asm.ccall(
@@ -1769,60 +2500,107 @@ fn gen_newhash(
         let new_hash = asm.cpop();
         asm.cpop_into(new_hash); // x86 alignment
 
-        ctx.stack_pop(num.try_into().unwrap());
-        let stack_ret = ctx.stack_push(Type::Hash);
+        asm.stack_pop(num.try_into().unwrap());
+        let stack_ret = asm.stack_push(Type::CHash);
         asm.mov(stack_ret, new_hash);
     } else {
         // val = rb_hash_new();
         let new_hash = asm.ccall(rb_hash_new as *const u8, vec![]);
-        let stack_ret = ctx.stack_push(Type::Hash);
+        let stack_ret = asm.stack_push(Type::CHash);
         asm.mov(stack_ret, new_hash);
     }
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 fn gen_putstring(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let put_val = jit_get_arg(jit, 0);
+) -> Option<CodegenStatus> {
+    let put_val = jit.get_arg(0);
+
+    // Save the PC and SP because the callee will allocate
+    jit_prepare_call_with_gc(jit, asm);
+
+    let str_opnd = asm.ccall(
+        rb_ec_str_resurrect as *const u8,
+        vec![EC, put_val.into(), 0.into()]
+    );
+
+    let stack_top = asm.stack_push(Type::CString);
+    asm.mov(stack_top, str_opnd);
+
+    Some(KeepCompiling)
+}
+
+fn gen_putchilledstring(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    _ocb: &mut OutlinedCb,
+) -> Option<CodegenStatus> {
+    let put_val = jit.get_arg(0);
 
     // Save the PC and SP because the callee will allocate
-    jit_prepare_routine_call(jit, ctx, asm);
+    jit_prepare_call_with_gc(jit, asm);
 
     let str_opnd = asm.ccall(
         rb_ec_str_resurrect as *const u8,
-        vec![EC, put_val.into()]
+        vec![EC, put_val.into(), 1.into()]
     );
 
-    let stack_top = ctx.stack_push(Type::CString);
+    let stack_top = asm.stack_push(Type::CString);
     asm.mov(stack_top, str_opnd);
 
-    KeepCompiling
+    Some(KeepCompiling)
+}
+
+fn gen_checkmatch(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    _ocb: &mut OutlinedCb,
+) -> Option<CodegenStatus> {
+    let flag = jit.get_arg(0).as_u32();
+
+    // rb_vm_check_match is not leaf unless flag is VM_CHECKMATCH_TYPE_WHEN.
+    // See also: leafness_of_checkmatch() and check_match()
+    if flag != VM_CHECKMATCH_TYPE_WHEN {
+        jit_prepare_non_leaf_call(jit, asm);
+    }
+
+    let pattern = asm.stack_opnd(0);
+    let target = asm.stack_opnd(1);
+
+    extern "C" {
+        fn rb_vm_check_match(ec: EcPtr, target: VALUE, pattern: VALUE, num: u32) -> VALUE;
+    }
+    let result = asm.ccall(rb_vm_check_match as *const u8, vec![EC, target, pattern, flag.into()]);
+    asm.stack_pop(2); // Keep them on stack during ccall for GC
+
+    let stack_ret = asm.stack_push(Type::Unknown);
+    asm.mov(stack_ret, result);
+
+    Some(KeepCompiling)
 }
 
 // Push Qtrue or Qfalse depending on whether the given keyword was supplied by
 // the caller
 fn gen_checkkeyword(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
     // When a keyword is unspecified past index 32, a hash will be used
     // instead. This can only happen in iseqs taking more than 32 keywords.
     if unsafe { (*get_iseq_body_param_keyword(jit.iseq)).num >= 32 } {
-        return CantCompile;
+        return None;
     }
 
     // The EP offset to the undefined bits local
-    let bits_offset = jit_get_arg(jit, 0).as_i32();
+    let bits_offset = jit.get_arg(0).as_i32();
 
     // The index of the keyword we want to check
-    let index: i64 = jit_get_arg(jit, 1).as_i64();
+    let index: i64 = jit.get_arg(1).as_i64();
 
     // Load environment pointer EP
     let ep_opnd = gen_get_ep(asm, 0);
@@ -1838,67 +2616,33 @@ fn gen_checkkeyword(
     asm.test(bits_opnd, Opnd::Imm(bit_test));
     let ret_opnd = asm.csel_z(Qtrue.into(), Qfalse.into());
 
-    let stack_ret = ctx.stack_push(Type::UnknownImm);
+    let stack_ret = asm.stack_push(Type::UnknownImm);
     asm.mov(stack_ret, ret_opnd);
 
-    KeepCompiling
-}
-
-fn gen_jnz_to_target0(
-    asm: &mut Assembler,
-    target0: CodePtr,
-    _target1: Option<CodePtr>,
-    shape: BranchShape,
-) {
-    match shape {
-        BranchShape::Next0 | BranchShape::Next1 => unreachable!(),
-        BranchShape::Default => asm.jnz(target0.into()),
-    }
-}
-
-fn gen_jz_to_target0(
-    asm: &mut Assembler,
-    target0: CodePtr,
-    _target1: Option<CodePtr>,
-    shape: BranchShape,
-) {
-    match shape {
-        BranchShape::Next0 | BranchShape::Next1 => unreachable!(),
-        BranchShape::Default => asm.jz(Target::CodePtr(target0)),
-    }
-}
-
-fn gen_jbe_to_target0(
-    asm: &mut Assembler,
-    target0: CodePtr,
-    _target1: Option<CodePtr>,
-    shape: BranchShape,
-) {
-    match shape {
-        BranchShape::Next0 | BranchShape::Next1 => unreachable!(),
-        BranchShape::Default => asm.jbe(Target::CodePtr(target0)),
-    }
+    Some(KeepCompiling)
 }
 
 // Generate a jump to a stub that recompiles the current YARV instruction on failure.
 // When depth_limit is exceeded, generate a jump to a side exit.
 fn jit_chain_guard(
     jcc: JCCKinds,
-    jit: &JITState,
-    ctx: &Context,
+    jit: &mut JITState,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-    depth_limit: i32,
-    side_exit: Target,
+    depth_limit: u8,
+    counter: Counter,
 ) {
     let target0_gen_fn = match jcc {
-        JCC_JNE | JCC_JNZ => gen_jnz_to_target0,
-        JCC_JZ | JCC_JE => gen_jz_to_target0,
-        JCC_JBE | JCC_JNA => gen_jbe_to_target0,
+        JCC_JNE | JCC_JNZ => BranchGenFn::JNZToTarget0,
+        JCC_JZ | JCC_JE => BranchGenFn::JZToTarget0,
+        JCC_JBE | JCC_JNA => BranchGenFn::JBEToTarget0,
+        JCC_JB | JCC_JNAE => BranchGenFn::JBToTarget0,
+        JCC_JO_MUL => BranchGenFn::JOMulToTarget0,
     };
 
-    if (ctx.get_chain_depth() as i32) < depth_limit {
-        let mut deeper = ctx.clone();
+    if asm.ctx.get_chain_depth() < depth_limit {
+        // Rewind Context to use the stack_size at the beginning of this instruction.
+        let mut deeper = asm.ctx.with_stack_size(jit.stack_size_for_pc);
         deeper.increment_chain_depth();
         let bid = BlockId {
             iseq: jit.iseq,
@@ -1907,73 +2651,29 @@ fn jit_chain_guard(
 
         gen_branch(jit, asm, ocb, bid, &deeper, None, None, target0_gen_fn);
     } else {
-        target0_gen_fn(asm, side_exit.unwrap_code_ptr(), None, BranchShape::Default);
+        target0_gen_fn.call(asm, Target::side_exit(counter), None);
     }
 }
 
-// up to 5 different classes, and embedded or not for each
-pub const GET_IVAR_MAX_DEPTH: i32 = 10;
+// up to 8 different shapes for each
+pub const GET_IVAR_MAX_DEPTH: u8 = 8;
 
-// up to 5 different classes, and embedded or not for each
-pub const SET_IVAR_MAX_DEPTH: i32 = 10;
+// up to 8 different shapes for each
+pub const SET_IVAR_MAX_DEPTH: u8 = 8;
 
 // hashes and arrays
-pub const OPT_AREF_MAX_CHAIN_DEPTH: i32 = 2;
+pub const OPT_AREF_MAX_CHAIN_DEPTH: u8 = 2;
 
-// up to 5 different classes
-pub const SEND_MAX_DEPTH: i32 = 5;
+// expandarray
+pub const EXPANDARRAY_MAX_CHAIN_DEPTH: u8 = 4;
 
-// up to 20 different methods for send
-pub const SEND_MAX_CHAIN_DEPTH: i32 = 20;
+// up to 5 different methods for send
+pub const SEND_MAX_DEPTH: u8 = 5;
 
 // up to 20 different offsets for case-when
-pub const CASE_WHEN_MAX_DEPTH: i32 = 20;
-
-// Codegen for setting an instance variable.
-// Preconditions:
-//   - receiver is in REG0
-//   - receiver has the same class as CLASS_OF(comptime_receiver)
-//   - no stack push or pops to ctx since the entry to the codegen of the instruction being compiled
-fn gen_set_ivar(
-    jit: &mut JITState,
-    ctx: &mut Context,
-    asm: &mut Assembler,
-    _recv: VALUE,
-    ivar_name: ID,
-    flags: u32,
-    argc: i32,
-) -> CodegenStatus {
-
-    // This is a .send call and we need to adjust the stack
-    if flags & VM_CALL_OPT_SEND != 0 {
-        handle_opt_send_shift_stack(asm, argc, ctx);
-    }
-
-    // Save the PC and SP because the callee may allocate
-    // Note that this modifies REG_SP, which is why we do it first
-    jit_prepare_routine_call(jit, ctx, asm);
-
-    // Get the operands from the stack
-    let val_opnd = ctx.stack_pop(1);
-    let recv_opnd = ctx.stack_pop(1);
-
-    // Call rb_vm_set_ivar_id with the receiver, the ivar name, and the value
-    let val = asm.ccall(
-        rb_vm_set_ivar_id as *const u8,
-        vec![
-            recv_opnd,
-            Opnd::UImm(ivar_name),
-            val_opnd,
-        ],
-    );
-
-    let out_opnd = ctx.stack_push(Type::Unknown);
-    asm.mov(out_opnd, val);
-
-    KeepCompiling
-}
-
+pub const CASE_WHEN_MAX_DEPTH: u8 = 20;
 
+pub const MAX_SPLAT_LENGTH: i32 = 127;
 
 // Codegen for getting an instance variable.
 // Preconditions:
@@ -1981,27 +2681,19 @@ fn gen_set_ivar(
 //   - no stack push or pops to ctx since the entry to the codegen of the instruction being compiled
 fn gen_get_ivar(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-    max_chain_depth: i32,
+    max_chain_depth: u8,
     comptime_receiver: VALUE,
     ivar_name: ID,
     recv: Opnd,
     recv_opnd: YARVOpnd,
-    side_exit: Target,
-) -> CodegenStatus {
-    // If the object has a too complex shape, we exit
-    if comptime_receiver.shape_too_complex() {
-        return CantCompile;
-    }
-
+) -> Option<CodegenStatus> {
     let comptime_val_klass = comptime_receiver.class_of();
-    let starting_context = ctx.clone(); // make a copy for use with jit_chain_guard
 
     // If recv isn't already a register, load it.
     let recv = match recv {
-        Opnd::Reg(_) => recv,
+        Opnd::InsnOut { .. } => recv,
         _ => asm.load(recv),
     };
 
@@ -2017,32 +2709,38 @@ fn gen_get_ivar(
 
     // Check if the comptime receiver is a T_OBJECT
     let receiver_t_object = unsafe { RB_TYPE_P(comptime_receiver, RUBY_T_OBJECT) };
+    // Use a general C call at the last chain to avoid exits on megamorphic shapes
+    let megamorphic = asm.ctx.get_chain_depth() >= max_chain_depth;
+    if megamorphic {
+        gen_counter_incr(asm, Counter::num_getivar_megamorphic);
+    }
 
     // If the class uses the default allocator, instances should all be T_OBJECT
     // NOTE: This assumes nobody changes the allocator of the class after allocation.
     //       Eventually, we can encode whether an object is T_OBJECT or not
     //       inside object shapes.
-    if !receiver_t_object || uses_custom_allocator {
+    // too-complex shapes can't use index access, so we use rb_ivar_get for them too.
+    if !receiver_t_object || uses_custom_allocator || comptime_receiver.shape_too_complex() || megamorphic {
         // General case. Call rb_ivar_get().
         // VALUE rb_ivar_get(VALUE obj, ID id)
-        asm.comment("call rb_ivar_get()");
+        asm_comment!(asm, "call rb_ivar_get()");
 
-        // The function could raise exceptions.
-        jit_prepare_routine_call(jit, ctx, asm);
+        // The function could raise RactorIsolationError.
+        jit_prepare_non_leaf_call(jit, asm);
 
         let ivar_val = asm.ccall(rb_ivar_get as *const u8, vec![recv, Opnd::UImm(ivar_name)]);
 
         if recv_opnd != SelfOpnd {
-            ctx.stack_pop(1);
+            asm.stack_pop(1);
         }
 
         // Push the ivar on the stack
-        let out_opnd = ctx.stack_push(Type::Unknown);
+        let out_opnd = asm.stack_push(Type::Unknown);
         asm.mov(out_opnd, ivar_val);
 
         // Jump to next instruction. This allows guard chains to share the same successor.
-        jump_to_next_insn(jit, ctx, asm, ocb);
-        return EndBlock;
+        jump_to_next_insn(jit, asm, ocb);
+        return Some(EndBlock);
     }
 
     let ivar_index = unsafe {
@@ -2056,23 +2754,8 @@ fn gen_get_ivar(
         }
     };
 
-    // must be before stack_pop
-    let recv_type = ctx.get_opnd_type(recv_opnd);
-
-    // Upgrade type
-    if !recv_type.is_heap() {
-        ctx.upgrade_opnd_type(recv_opnd, Type::UnknownHeap);
-    }
-
-    // Pop receiver if it's on the temp stack
-    if recv_opnd != SelfOpnd {
-        ctx.stack_pop(1);
-    }
-
-    // Guard heap object
-    if !recv_type.is_heap() {
-        guard_object_is_heap(asm, recv, side_exit);
-    }
+    // Guard heap object (recv_opnd must be used before stack_pop)
+    guard_object_is_heap(asm, recv, recv_opnd, Counter::getivar_not_heap);
 
     // Compile time self is embedded and the ivar index lands within the object
     let embed_test_result = unsafe { FL_TEST_RAW(comptime_receiver, VALUE(ROBJECT_EMBED.as_usize())) != VALUE(0) };
@@ -2081,25 +2764,28 @@ fn gen_get_ivar(
     let shape_id_offset = unsafe { rb_shape_id_offset() };
     let shape_opnd = Opnd::mem(SHAPE_ID_NUM_BITS as u8, recv, shape_id_offset);
 
-    asm.comment("guard shape");
+    asm_comment!(asm, "guard shape");
     asm.cmp(shape_opnd, Opnd::UImm(expected_shape as u64));
-    let megamorphic_side_exit = counted_exit!(ocb, side_exit, getivar_megamorphic);
     jit_chain_guard(
         JCC_JNE,
         jit,
-        &starting_context,
         asm,
         ocb,
         max_chain_depth,
-        megamorphic_side_exit,
+        Counter::getivar_megamorphic,
     );
 
+    // Pop receiver if it's on the temp stack
+    if recv_opnd != SelfOpnd {
+        asm.stack_pop(1);
+    }
+
     match ivar_index {
         // If there is no IVAR index, then the ivar was undefined
         // when we entered the compiler.  That means we can just return
         // nil for this shape + iv name
         None => {
-            let out_opnd = ctx.stack_push(Type::Nil);
+            let out_opnd = asm.stack_push(Type::Nil);
             asm.mov(out_opnd, Qnil.into());
         }
         Some(ivar_index) => {
@@ -2107,57 +2793,52 @@ fn gen_get_ivar(
                 // See ROBJECT_IVPTR() from include/ruby/internal/core/robject.h
 
                 // Load the variable
-                let offs = ROBJECT_OFFSET_AS_ARY + (ivar_index * SIZEOF_VALUE) as i32;
+                let offs = ROBJECT_OFFSET_AS_ARY as i32 + (ivar_index * SIZEOF_VALUE) as i32;
                 let ivar_opnd = Opnd::mem(64, recv, offs);
 
                 // Push the ivar on the stack
-                let out_opnd = ctx.stack_push(Type::Unknown);
+                let out_opnd = asm.stack_push(Type::Unknown);
                 asm.mov(out_opnd, ivar_opnd);
             } else {
                 // Compile time value is *not* embedded.
 
                 // Get a pointer to the extended table
-                let tbl_opnd = asm.load(Opnd::mem(64, recv, ROBJECT_OFFSET_AS_HEAP_IVPTR));
+                let tbl_opnd = asm.load(Opnd::mem(64, recv, ROBJECT_OFFSET_AS_HEAP_IVPTR as i32));
 
                 // Read the ivar from the extended table
                 let ivar_opnd = Opnd::mem(64, tbl_opnd, (SIZEOF_VALUE * ivar_index) as i32);
 
-                let out_opnd = ctx.stack_push(Type::Unknown);
+                let out_opnd = asm.stack_push(Type::Unknown);
                 asm.mov(out_opnd, ivar_opnd);
             }
         }
     }
 
     // Jump to next instruction. This allows guard chains to share the same successor.
-    jump_to_next_insn(jit, ctx, asm, ocb);
-    EndBlock
+    jump_to_next_insn(jit, asm, ocb);
+    Some(EndBlock)
 }
 
 fn gen_getinstancevariable(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
     // Defer compilation so we can specialize on a runtime `self`
-    if !jit_at_current_insn(jit) {
-        defer_compilation(jit, ctx, asm, ocb);
-        return EndBlock;
+    if !jit.at_current_insn() {
+        defer_compilation(jit, asm, ocb);
+        return Some(EndBlock);
     }
 
-    let ivar_name = jit_get_arg(jit, 0).as_u64();
+    let ivar_name = jit.get_arg(0).as_u64();
 
-    let comptime_val = jit_peek_at_self(jit);
-
-    // Generate a side exit
-    let side_exit = get_side_exit(jit, ocb, ctx);
+    let comptime_val = jit.peek_at_self();
 
     // Guard that the receiver has the same class as the one from compile time.
     let self_asm_opnd = Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF);
 
     gen_get_ivar(
         jit,
-        ctx,
         asm,
         ocb,
         GET_IVAR_MAX_DEPTH,
@@ -2165,7 +2846,6 @@ fn gen_getinstancevariable(
         ivar_name,
         self_asm_opnd,
         SelfOpnd,
-        side_exit,
     )
 }
 
@@ -2185,52 +2865,74 @@ fn gen_write_iv(
 
     if embed_test_result {
         // Find the IV offset
-        let offs = ROBJECT_OFFSET_AS_ARY + (ivar_index * SIZEOF_VALUE) as i32;
+        let offs = ROBJECT_OFFSET_AS_ARY as i32 + (ivar_index * SIZEOF_VALUE) as i32;
         let ivar_opnd = Opnd::mem(64, recv, offs);
 
         // Write the IV
-        asm.comment("write IV");
+        asm_comment!(asm, "write IV");
         asm.mov(ivar_opnd, set_value);
     } else {
         // Compile time value is *not* embedded.
 
         // Get a pointer to the extended table
-        let tbl_opnd = asm.load(Opnd::mem(64, recv, ROBJECT_OFFSET_AS_HEAP_IVPTR));
+        let tbl_opnd = asm.load(Opnd::mem(64, recv, ROBJECT_OFFSET_AS_HEAP_IVPTR as i32));
 
         // Write the ivar in to the extended table
         let ivar_opnd = Opnd::mem(64, tbl_opnd, (SIZEOF_VALUE * ivar_index) as i32);
 
-        asm.comment("write IV");
+        asm_comment!(asm, "write IV");
         asm.mov(ivar_opnd, set_value);
     }
 }
 
 fn gen_setinstancevariable(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let starting_context = ctx.clone(); // make a copy for use with jit_chain_guard
-
+) -> Option<CodegenStatus> {
     // Defer compilation so we can specialize on a runtime `self`
-    if !jit_at_current_insn(jit) {
-        defer_compilation(jit, ctx, asm, ocb);
-        return EndBlock;
+    if !jit.at_current_insn() {
+        defer_compilation(jit, asm, ocb);
+        return Some(EndBlock);
     }
 
-    let ivar_name = jit_get_arg(jit, 0).as_u64();
-    let comptime_receiver = jit_peek_at_self(jit);
+    let ivar_name = jit.get_arg(0).as_u64();
+    let ic = jit.get_arg(1).as_ptr();
+    let comptime_receiver = jit.peek_at_self();
+    gen_set_ivar(
+        jit,
+        asm,
+        ocb,
+        comptime_receiver,
+        ivar_name,
+        SelfOpnd,
+        Some(ic),
+    )
+}
+
+/// Set an instance variable on setinstancevariable or attr_writer.
+/// It switches the behavior based on what recv_opnd is given.
+/// * SelfOpnd: setinstancevariable, which doesn't push a result onto the stack.
+/// * StackOpnd: attr_writer, which pushes a result onto the stack.
+fn gen_set_ivar(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    ocb: &mut OutlinedCb,
+    comptime_receiver: VALUE,
+    ivar_name: ID,
+    recv_opnd: YARVOpnd,
+    ic: Option<*const iseq_inline_iv_cache_entry>,
+) -> Option<CodegenStatus> {
     let comptime_val_klass = comptime_receiver.class_of();
 
     // If the comptime receiver is frozen, writing an IV will raise an exception
     // and we don't want to JIT code to deal with that situation.
-    // If the object has a too complex shape, we will also exit
-    if comptime_receiver.is_frozen() || comptime_receiver.shape_too_complex() {
-        return CantCompile;
+    if comptime_receiver.is_frozen() {
+        gen_counter_incr(asm, Counter::setivar_frozen);
+        return None;
     }
 
-    let (_, stack_type) = ctx.get_opnd_mapping(StackOpnd(0));
+    let stack_type = asm.ctx.get_opnd_type(StackOpnd(0));
 
     // Check if the comptime class uses a custom allocator
     let custom_allocator = unsafe { rb_get_alloc_func(comptime_val_klass) };
@@ -2244,75 +2946,118 @@ fn gen_setinstancevariable(
 
     // Check if the comptime receiver is a T_OBJECT
     let receiver_t_object = unsafe { RB_TYPE_P(comptime_receiver, RUBY_T_OBJECT) };
+    // Use a general C call at the last chain to avoid exits on megamorphic shapes
+    let megamorphic = asm.ctx.get_chain_depth() >= SET_IVAR_MAX_DEPTH;
+    if megamorphic {
+        gen_counter_incr(asm, Counter::num_setivar_megamorphic);
+    }
 
-    // If the receiver isn't a T_OBJECT, or uses a custom allocator,
-    // then just write out the IV write as a function call
-    if !receiver_t_object || uses_custom_allocator {
-        asm.comment("call rb_vm_setinstancevariable()");
+    // Get the iv index
+    let shape_too_complex = comptime_receiver.shape_too_complex();
+    let ivar_index = if !shape_too_complex {
+        let shape_id = comptime_receiver.shape_id_of();
+        let shape = unsafe { rb_shape_get_shape_by_id(shape_id) };
+        let mut ivar_index: u32 = 0;
+        if unsafe { rb_shape_get_iv_index(shape, ivar_name, &mut ivar_index) } {
+            Some(ivar_index as usize)
+        } else {
+            None
+        }
+    } else {
+        None
+    };
 
-        let ic = jit_get_arg(jit, 1).as_u64(); // type IVC
+    // The current shape doesn't contain this iv, we need to transition to another shape.
+    let new_shape = if !shape_too_complex && receiver_t_object && ivar_index.is_none() {
+        let current_shape = comptime_receiver.shape_of();
+        let next_shape = unsafe { rb_shape_get_next_no_warnings(current_shape, comptime_receiver, ivar_name) };
+        let next_shape_id = unsafe { rb_shape_id(next_shape) };
 
-        // The function could raise exceptions.
-        // Note that this modifies REG_SP, which is why we do it first
-        jit_prepare_routine_call(jit, ctx, asm);
+        // If the VM ran out of shapes, or this class generated too many leaf shapes,
+        // it may be de-optimized into OBJ_TOO_COMPLEX_SHAPE (hash-table).
+        if next_shape_id == OBJ_TOO_COMPLEX_SHAPE_ID {
+            Some((next_shape_id, None, 0_usize))
+        } else {
+            let current_capacity = unsafe { (*current_shape).capacity };
 
-        // Get the operands from the stack
-        let val_opnd = ctx.stack_pop(1);
+            // If the new shape has a different capacity, we'll have to
+            // reallocate it. (The TOO_COMPLEX case was handled above.)
+            let needs_extension = unsafe { (*current_shape).capacity != (*next_shape).capacity };
 
-        // Call rb_vm_setinstancevariable(iseq, obj, id, val, ic);
-        asm.ccall(
-            rb_vm_setinstancevariable as *const u8,
-            vec![
-                Opnd::const_ptr(jit.iseq as *const u8),
-                Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF),
-                ivar_name.into(),
-                val_opnd,
-                Opnd::const_ptr(ic as *const u8),
-            ]
-        );
-    } else {
-        // Get the iv index
-        let ivar_index = unsafe {
-            let shape_id = comptime_receiver.shape_id_of();
-            let shape = rb_shape_get_shape_by_id(shape_id);
-            let mut ivar_index: u32 = 0;
-            if rb_shape_get_iv_index(shape, ivar_name, &mut ivar_index) {
-                Some(ivar_index as usize)
+            // We can write to the object, but we need to transition the shape
+            let ivar_index = unsafe { (*current_shape).next_iv_index } as usize;
+
+            let needs_extension = if needs_extension {
+                Some((current_capacity, unsafe { (*next_shape).capacity }))
             } else {
                 None
-            }
-        };
+            };
+            Some((next_shape_id, needs_extension, ivar_index))
+        }
+    } else {
+        None
+    };
+    let new_shape_too_complex = matches!(new_shape, Some((OBJ_TOO_COMPLEX_SHAPE_ID, _, _)));
 
-        // Get the receiver
-        let mut recv = asm.load(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF));
+    // If the receiver isn't a T_OBJECT, or uses a custom allocator,
+    // then just write out the IV write as a function call.
+    // too-complex shapes can't use index access, so we use the C function call for them too.
+    if !receiver_t_object || uses_custom_allocator || shape_too_complex || new_shape_too_complex || megamorphic {
+        // The function could raise FrozenError.
+        // Note that this modifies REG_SP, which is why we do it first
+        jit_prepare_non_leaf_call(jit, asm);
 
-        let recv_opnd = SelfOpnd;
-        let recv_type = ctx.get_opnd_type(recv_opnd);
+        // Get the operands from the stack
+        let val_opnd = asm.stack_opnd(0);
 
-        // Generate a side exit
-        let side_exit = get_side_exit(jit, ocb, ctx);
+        if let StackOpnd(index) = recv_opnd { // attr_writer
+            let recv = asm.stack_opnd(index as i32);
+            asm_comment!(asm, "call rb_vm_set_ivar_id()");
+            asm.ccall(
+                rb_vm_set_ivar_id as *const u8,
+                vec![
+                    recv,
+                    Opnd::UImm(ivar_name),
+                    val_opnd,
+                ],
+            );
+        } else { // setinstancevariable
+            asm_comment!(asm, "call rb_vm_setinstancevariable()");
+            asm.ccall(
+                rb_vm_setinstancevariable as *const u8,
+                vec![
+                    Opnd::const_ptr(jit.iseq as *const u8),
+                    Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF),
+                    ivar_name.into(),
+                    val_opnd,
+                    Opnd::const_ptr(ic.unwrap() as *const u8),
+                ],
+            );
+        }
+    } else {
+        // Get the receiver
+        let mut recv = asm.load(if let StackOpnd(index) = recv_opnd {
+            asm.stack_opnd(index as i32)
+        } else {
+            Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF)
+        });
 
         // Upgrade type
-        if !recv_type.is_heap() { // Must be a heap type
-            ctx.upgrade_opnd_type(recv_opnd, Type::UnknownHeap);
-            guard_object_is_heap(asm, recv, side_exit);
-        }
+        guard_object_is_heap(asm, recv, recv_opnd, Counter::setivar_not_heap);
 
         let expected_shape = unsafe { rb_shape_get_shape_id(comptime_receiver) };
         let shape_id_offset = unsafe { rb_shape_id_offset() };
         let shape_opnd = Opnd::mem(SHAPE_ID_NUM_BITS as u8, recv, shape_id_offset);
 
-        asm.comment("guard shape");
+        asm_comment!(asm, "guard shape");
         asm.cmp(shape_opnd, Opnd::UImm(expected_shape as u64));
-        let megamorphic_side_exit = counted_exit!(ocb, side_exit, setivar_megamorphic);
         jit_chain_guard(
             JCC_JNE,
             jit,
-            &starting_context,
             asm,
             ocb,
             SET_IVAR_MAX_DEPTH,
-            megamorphic_side_exit,
+            Counter::setivar_megamorphic,
         );
 
         let write_val;
@@ -2321,42 +3066,15 @@ fn gen_setinstancevariable(
             // If we don't have an instance variable index, then we need to
             // transition out of the current shape.
             None => {
-                let shape = comptime_receiver.shape_of();
-
-                let current_capacity = unsafe { (*shape).capacity };
-                let new_capacity = current_capacity * 2;
-
-                // If the object doesn't have the capacity to store the IV,
-                // then we'll need to allocate it.
-                let needs_extension = unsafe { (*shape).next_iv_index >= current_capacity };
-
-                // We can write to the object, but we need to transition the shape
-                let ivar_index = unsafe { (*shape).next_iv_index } as usize;
-
-                let capa_shape = if needs_extension {
-                    // We need to add an extended table to the object
-                    // First, create an outgoing transition that increases the
-                    // capacity
-                    Some(unsafe { rb_shape_transition_shape_capa(shape, new_capacity) })
-                } else {
-                    None
-                };
-
-                let dest_shape = if let Some(capa_shape) = capa_shape {
-                    unsafe { rb_shape_get_next(capa_shape, comptime_receiver, ivar_name) }
-                } else {
-                    unsafe { rb_shape_get_next(shape, comptime_receiver, ivar_name) }
-                };
-
-                let new_shape_id = unsafe { rb_shape_id(dest_shape) };
-
-                if new_shape_id == OBJ_TOO_COMPLEX_SHAPE_ID {
-                    return CantCompile;
-                }
-
-                if needs_extension {
+                let (new_shape_id, needs_extension, ivar_index) = new_shape.unwrap();
+                if let Some((current_capacity, new_capacity)) = needs_extension {
                     // Generate the C call so that runtime code will increase
                     // the capacity and set the buffer.
+                    asm_comment!(asm, "call rb_ensure_iv_list_size");
+
+                    // It allocates so can trigger GC, which takes the VM lock
+                    // so could yield to a different ractor.
+                    jit_prepare_call_with_gc(jit, asm);
                     asm.ccall(rb_ensure_iv_list_size as *const u8,
                               vec![
                                   recv,
@@ -2366,13 +3084,17 @@ fn gen_setinstancevariable(
                     );
 
                     // Load the receiver again after the function call
-                    recv = asm.load(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF))
+                    recv = asm.load(if let StackOpnd(index) = recv_opnd {
+                        asm.stack_opnd(index as i32)
+                    } else {
+                        Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF)
+                    });
                 }
 
-                write_val = ctx.stack_pop(1);
-                gen_write_iv(asm, comptime_receiver, recv, ivar_index, write_val, needs_extension);
+                write_val = asm.stack_opnd(0);
+                gen_write_iv(asm, comptime_receiver, recv, ivar_index, write_val, needs_extension.is_some());
 
-                asm.comment("write shape");
+                asm_comment!(asm, "write shape");
 
                 let shape_id_offset = unsafe { rb_shape_id_offset() };
                 let shape_opnd = Opnd::mem(SHAPE_ID_NUM_BITS as u8, recv, shape_id_offset);
@@ -2387,7 +3109,7 @@ fn gen_setinstancevariable(
                 // the iv index by searching up the shape tree.  If we've
                 // made the transition already, then there's no reason to
                 // update the shape on the object.  Just set the IV.
-                write_val = ctx.stack_pop(1);
+                write_val = asm.stack_opnd(0);
                 gen_write_iv(asm, comptime_receiver, recv, ivar_index, write_val, false);
             },
         }
@@ -2395,6 +3117,7 @@ fn gen_setinstancevariable(
         // If we know the stack value is an immediate, there's no need to
         // generate WB code.
         if !stack_type.is_imm() {
+            asm.spill_temps(); // for ccall (unconditionally spill them for RegTemps consistency)
             let skip_wb = asm.new_label("skip_wb");
             // If the value we're writing is an immediate, we don't need to WB
             asm.test(write_val, (RUBY_IMMEDIATE_MASK as u64).into());
@@ -2404,7 +3127,7 @@ fn gen_setinstancevariable(
             asm.cmp(write_val, Qnil.into());
             asm.jbe(skip_wb);
 
-            asm.comment("write barrier");
+            asm_comment!(asm, "write barrier");
             asm.ccall(
                 rb_gc_writebarrier as *const u8,
                 vec![
@@ -2416,70 +3139,168 @@ fn gen_setinstancevariable(
             asm.write_label(skip_wb);
         }
     }
+    let write_val = asm.stack_pop(1); // Keep write_val on stack during ccall for GC
 
-    KeepCompiling
+    // If it's attr_writer, i.e. recv_opnd is StackOpnd, we need to pop
+    // the receiver and push the written value onto the stack.
+    if let StackOpnd(_) = recv_opnd {
+        asm.stack_pop(1); // Pop receiver
+
+        let out_opnd = asm.stack_push(Type::Unknown); // Push a return value
+        asm.mov(out_opnd, write_val);
+    }
+
+    Some(KeepCompiling)
 }
 
 fn gen_defined(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let op_type = jit_get_arg(jit, 0).as_u64();
-    let obj = jit_get_arg(jit, 1);
-    let pushval = jit_get_arg(jit, 2);
+) -> Option<CodegenStatus> {
+    let op_type = jit.get_arg(0).as_u64();
+    let obj = jit.get_arg(1);
+    let pushval = jit.get_arg(2);
 
-    // Save the PC and SP because the callee may allocate
-    // Note that this modifies REG_SP, which is why we do it first
-    jit_prepare_routine_call(jit, ctx, asm);
+    match op_type as u32 {
+        DEFINED_YIELD => {
+            asm.stack_pop(1); // v operand is not used
+            let out_opnd = asm.stack_push(Type::Unknown); // nil or "yield"
 
-    // Get the operands from the stack
-    let v_opnd = ctx.stack_pop(1);
+            gen_block_given(jit, asm, out_opnd, pushval.into(), Qnil.into());
+        }
+        _ => {
+            // Save the PC and SP because the callee may allocate or call #respond_to?
+            // Note that this modifies REG_SP, which is why we do it first
+            jit_prepare_non_leaf_call(jit, asm);
 
-    // Call vm_defined(ec, reg_cfp, op_type, obj, v)
-    let def_result = asm.ccall(rb_vm_defined as *const u8, vec![EC, CFP, op_type.into(), obj.into(), v_opnd]);
+            // Get the operands from the stack
+            let v_opnd = asm.stack_opnd(0);
 
-    // if (vm_defined(ec, GET_CFP(), op_type, obj, v)) {
-    //  val = pushval;
-    // }
-    asm.test(def_result, Opnd::UImm(255));
-    let out_value = asm.csel_nz(pushval.into(), Qnil.into());
+            // Call vm_defined(ec, reg_cfp, op_type, obj, v)
+            let def_result = asm.ccall(rb_vm_defined as *const u8, vec![EC, CFP, op_type.into(), obj.into(), v_opnd]);
+            asm.stack_pop(1); // Keep it on stack during ccall for GC
 
-    // Push the return value onto the stack
-    let out_type = if pushval.special_const_p() {
-        Type::UnknownImm
-    } else {
-        Type::Unknown
+            // if (vm_defined(ec, GET_CFP(), op_type, obj, v)) {
+            //  val = pushval;
+            // }
+            asm.test(def_result, Opnd::UImm(255));
+            let out_value = asm.csel_nz(pushval.into(), Qnil.into());
+
+            // Push the return value onto the stack
+            let out_type = if pushval.special_const_p() {
+                Type::UnknownImm
+            } else {
+                Type::Unknown
+            };
+            let stack_ret = asm.stack_push(out_type);
+            asm.mov(stack_ret, out_value);
+        }
+    }
+
+    Some(KeepCompiling)
+}
+
+fn gen_definedivar(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    ocb: &mut OutlinedCb,
+) -> Option<CodegenStatus> {
+    // Defer compilation so we can specialize based on a runtime receiver
+    if !jit.at_current_insn() {
+        defer_compilation(jit, asm, ocb);
+        return Some(EndBlock);
+    }
+
+    let ivar_name = jit.get_arg(0).as_u64();
+    // Value that will be pushed on the stack if the ivar is defined. In practice this is always the
+    // string "instance-variable". If the ivar is not defined, nil will be pushed instead.
+    let pushval = jit.get_arg(2);
+
+    // Get the receiver
+    let recv = asm.load(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF));
+
+    // Specialize based on compile time values
+    let comptime_receiver = jit.peek_at_self();
+
+    if comptime_receiver.shape_too_complex() || asm.ctx.get_chain_depth() >= GET_IVAR_MAX_DEPTH {
+        // Fall back to calling rb_ivar_defined
+
+        // Save the PC and SP because the callee may allocate
+        // Note that this modifies REG_SP, which is why we do it first
+        jit_prepare_call_with_gc(jit, asm);
+
+        // Call rb_ivar_defined(recv, ivar_name)
+        let def_result = asm.ccall(rb_ivar_defined as *const u8, vec![recv, ivar_name.into()]);
+
+        // if (rb_ivar_defined(recv, ivar_name)) {
+        //  val = pushval;
+        // }
+        asm.test(def_result, Opnd::UImm(255));
+        let out_value = asm.csel_nz(pushval.into(), Qnil.into());
+
+        // Push the return value onto the stack
+        let out_type = if pushval.special_const_p() { Type::UnknownImm } else { Type::Unknown };
+        let stack_ret = asm.stack_push(out_type);
+        asm.mov(stack_ret, out_value);
+
+        return Some(KeepCompiling)
+    }
+
+    let shape_id = comptime_receiver.shape_id_of();
+    let ivar_exists = unsafe {
+        let shape = rb_shape_get_shape_by_id(shape_id);
+        let mut ivar_index: u32 = 0;
+        rb_shape_get_iv_index(shape, ivar_name, &mut ivar_index)
     };
-    let stack_ret = ctx.stack_push(out_type);
-    asm.mov(stack_ret, out_value);
 
-    KeepCompiling
+    // Guard heap object (recv_opnd must be used before stack_pop)
+    guard_object_is_heap(asm, recv, SelfOpnd, Counter::definedivar_not_heap);
+
+    let shape_id_offset = unsafe { rb_shape_id_offset() };
+    let shape_opnd = Opnd::mem(SHAPE_ID_NUM_BITS as u8, recv, shape_id_offset);
+
+    asm_comment!(asm, "guard shape");
+    asm.cmp(shape_opnd, Opnd::UImm(shape_id as u64));
+    jit_chain_guard(
+        JCC_JNE,
+        jit,
+        asm,
+        ocb,
+        GET_IVAR_MAX_DEPTH,
+        Counter::definedivar_megamorphic,
+    );
+
+    let result = if ivar_exists { pushval } else { Qnil };
+    jit_putobject(asm, result);
+
+    // Jump to next instruction. This allows guard chains to share the same successor.
+    jump_to_next_insn(jit, asm, ocb);
+
+    return Some(EndBlock);
 }
 
 fn gen_checktype(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let type_val = jit_get_arg(jit, 0).as_u32();
+) -> Option<CodegenStatus> {
+    let type_val = jit.get_arg(0).as_u32();
 
     // Only three types are emitted by compile.c at the moment
     if let RUBY_T_STRING | RUBY_T_ARRAY | RUBY_T_HASH = type_val {
-        let val_type = ctx.get_opnd_type(StackOpnd(0));
-        let val = asm.load(ctx.stack_pop(1));
+        let val_type = asm.ctx.get_opnd_type(StackOpnd(0));
+        let val = asm.stack_pop(1);
 
         // Check if we know from type information
         match val_type.known_value_type() {
             Some(value_type) => {
                 if value_type == type_val {
-                    jit_putobject(jit, ctx, asm, Qtrue);
-                    return KeepCompiling;
+                    jit_putobject(asm, Qtrue);
+                    return Some(KeepCompiling);
                 } else {
-                    jit_putobject(jit, ctx, asm, Qfalse);
-                    return KeepCompiling;
+                    jit_putobject(asm, Qfalse);
+                    return Some(KeepCompiling);
                 }
             },
             _ => (),
@@ -2487,6 +3308,7 @@ fn gen_checktype(
 
         let ret = asm.new_label("ret");
 
+        let val = asm.load(val);
         if !val_type.is_heap() {
             // if (SPECIAL_CONST_P(val)) {
             // Return Qfalse via REG1 if not on heap
@@ -2504,27 +3326,26 @@ fn gen_checktype(
         let ret_opnd = asm.csel_e(Qtrue.into(), Qfalse.into());
 
         asm.write_label(ret);
-        let stack_ret = ctx.stack_push(Type::UnknownImm);
+        let stack_ret = asm.stack_push(Type::UnknownImm);
         asm.mov(stack_ret, ret_opnd);
 
-        KeepCompiling
+        Some(KeepCompiling)
     } else {
-        CantCompile
+        None
     }
 }
 
 fn gen_concatstrings(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let n = jit_get_arg(jit, 0).as_usize();
+) -> Option<CodegenStatus> {
+    let n = jit.get_arg(0).as_usize();
 
-    // Save the PC and SP because we are allocating
-    jit_prepare_routine_call(jit, ctx, asm);
+    // rb_str_concat_literals may raise Encoding::CompatibilityError
+    jit_prepare_non_leaf_call(jit, asm);
 
-    let values_ptr = asm.lea(ctx.sp_opnd(-((SIZEOF_VALUE as isize) * n as isize)));
+    let values_ptr = asm.lea(asm.ctx.sp_opnd(-(n as i32)));
 
     // call rb_str_concat_literals(size_t n, const VALUE *strings);
     let return_value = asm.ccall(
@@ -2532,39 +3353,43 @@ fn gen_concatstrings(
         vec![n.into(), values_ptr]
     );
 
-    ctx.stack_pop(n);
-    let stack_ret = ctx.stack_push(Type::CString);
+    asm.stack_pop(n);
+    let stack_ret = asm.stack_push(Type::TString);
     asm.mov(stack_ret, return_value);
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 fn guard_two_fixnums(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-    side_exit: Target
 ) {
+    let counter = Counter::guard_send_not_fixnums;
+
+    // Get stack operands without popping them
+    let arg1 = asm.stack_opnd(0);
+    let arg0 = asm.stack_opnd(1);
+
     // Get the stack operand types
-    let arg1_type = ctx.get_opnd_type(StackOpnd(0));
-    let arg0_type = ctx.get_opnd_type(StackOpnd(1));
+    let arg1_type = asm.ctx.get_opnd_type(arg1.into());
+    let arg0_type = asm.ctx.get_opnd_type(arg0.into());
 
     if arg0_type.is_heap() || arg1_type.is_heap() {
-        asm.comment("arg is heap object");
-        asm.jmp(side_exit);
+        asm_comment!(asm, "arg is heap object");
+        asm.jmp(Target::side_exit(counter));
         return;
     }
 
     if arg0_type != Type::Fixnum && arg0_type.is_specific() {
-        asm.comment("arg0 not fixnum");
-        asm.jmp(side_exit);
+        asm_comment!(asm, "arg0 not fixnum");
+        asm.jmp(Target::side_exit(counter));
         return;
     }
 
     if arg1_type != Type::Fixnum && arg1_type.is_specific() {
-        asm.comment("arg1 not fixnum");
-        asm.jmp(side_exit);
+        asm_comment!(asm, "arg1 not fixnum");
+        asm.jmp(Target::side_exit(counter));
         return;
     }
 
@@ -2573,43 +3398,37 @@ fn guard_two_fixnums(
     assert!(arg0_type == Type::Fixnum || arg0_type.is_unknown());
     assert!(arg1_type == Type::Fixnum || arg1_type.is_unknown());
 
-    // Get stack operands without popping them
-    let arg1 = ctx.stack_opnd(0);
-    let arg0 = ctx.stack_opnd(1);
-
     // If not fixnums at run-time, fall back
     if arg0_type != Type::Fixnum {
-        asm.comment("guard arg0 fixnum");
+        asm_comment!(asm, "guard arg0 fixnum");
         asm.test(arg0, Opnd::UImm(RUBY_FIXNUM_FLAG as u64));
 
         jit_chain_guard(
             JCC_JZ,
             jit,
-            &ctx,
             asm,
             ocb,
             SEND_MAX_DEPTH,
-            side_exit,
+            counter,
         );
     }
     if arg1_type != Type::Fixnum {
-        asm.comment("guard arg1 fixnum");
+        asm_comment!(asm, "guard arg1 fixnum");
         asm.test(arg1, Opnd::UImm(RUBY_FIXNUM_FLAG as u64));
 
         jit_chain_guard(
             JCC_JZ,
             jit,
-            &ctx,
             asm,
             ocb,
             SEND_MAX_DEPTH,
-            side_exit,
+            counter,
         );
     }
 
     // Set stack types in context
-    ctx.upgrade_opnd_type(StackOpnd(0), Type::Fixnum);
-    ctx.upgrade_opnd_type(StackOpnd(1), Type::Fixnum);
+    asm.ctx.upgrade_opnd_type(arg1.into(), Type::Fixnum);
+    asm.ctx.upgrade_opnd_type(arg0.into(), Type::Fixnum);
 }
 
 // Conditional move operation used by comparison operators
@@ -2617,753 +3436,782 @@ type CmovFn = fn(cb: &mut Assembler, opnd0: Opnd, opnd1: Opnd) -> Opnd;
 
 fn gen_fixnum_cmp(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
     cmov_op: CmovFn,
-) -> CodegenStatus {
-    // Defer compilation so we can specialize base on a runtime receiver
-    if !jit_at_current_insn(jit) {
-        defer_compilation(jit, ctx, asm, ocb);
-        return EndBlock;
-    }
-
-    let comptime_a = jit_peek_at_stack(jit, ctx, 1);
-    let comptime_b = jit_peek_at_stack(jit, ctx, 0);
-
-    if comptime_a.fixnum_p() && comptime_b.fixnum_p() {
-        // Create a side-exit to fall back to the interpreter
-        // Note: we generate the side-exit before popping operands from the stack
-        let side_exit = get_side_exit(jit, ocb, ctx);
+    bop: ruby_basic_operators,
+) -> Option<CodegenStatus> {
+    let two_fixnums = match asm.ctx.two_fixnums_on_stack(jit) {
+        Some(two_fixnums) => two_fixnums,
+        None => {
+            // Defer compilation so we can specialize based on a runtime receiver
+            defer_compilation(jit, asm, ocb);
+            return Some(EndBlock);
+        }
+    };
 
-        if !assume_bop_not_redefined(jit, ocb, INTEGER_REDEFINED_OP_FLAG, BOP_LT) {
-            return CantCompile;
+    if two_fixnums {
+        if !assume_bop_not_redefined(jit, asm, ocb, INTEGER_REDEFINED_OP_FLAG, bop) {
+            return None;
         }
 
         // Check that both operands are fixnums
-        guard_two_fixnums(jit, ctx, asm, ocb, side_exit);
+        guard_two_fixnums(jit, asm, ocb);
 
         // Get the operands from the stack
-        let arg1 = ctx.stack_pop(1);
-        let arg0 = ctx.stack_pop(1);
+        let arg1 = asm.stack_pop(1);
+        let arg0 = asm.stack_pop(1);
 
         // Compare the arguments
         asm.cmp(arg0, arg1);
         let bool_opnd = cmov_op(asm, Qtrue.into(), Qfalse.into());
 
         // Push the output on the stack
-        let dst = ctx.stack_push(Type::Unknown);
+        let dst = asm.stack_push(Type::UnknownImm);
         asm.mov(dst, bool_opnd);
 
-        KeepCompiling
+        Some(KeepCompiling)
     } else {
-        gen_opt_send_without_block(jit, ctx, asm, ocb)
+        gen_opt_send_without_block(jit, asm, ocb)
     }
 }
 
 fn gen_opt_lt(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    gen_fixnum_cmp(jit, ctx, asm, ocb, Assembler::csel_l)
+) -> Option<CodegenStatus> {
+    gen_fixnum_cmp(jit, asm, ocb, Assembler::csel_l, BOP_LT)
 }
 
 fn gen_opt_le(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    gen_fixnum_cmp(jit, ctx, asm, ocb, Assembler::csel_le)
+) -> Option<CodegenStatus> {
+    gen_fixnum_cmp(jit, asm, ocb, Assembler::csel_le, BOP_LE)
 }
 
 fn gen_opt_ge(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    gen_fixnum_cmp(jit, ctx, asm, ocb, Assembler::csel_ge)
+) -> Option<CodegenStatus> {
+    gen_fixnum_cmp(jit, asm, ocb, Assembler::csel_ge, BOP_GE)
 }
 
 fn gen_opt_gt(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    gen_fixnum_cmp(jit, ctx, asm, ocb, Assembler::csel_g)
+) -> Option<CodegenStatus> {
+    gen_fixnum_cmp(jit, asm, ocb, Assembler::csel_g, BOP_GT)
 }
 
 // Implements specialized equality for either two fixnum or two strings
-// Returns true if code was generated, otherwise false
+// Returns None if enough type information isn't available, Some(true)
+// if code was generated, otherwise Some(false).
 fn gen_equality_specialized(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-    side_exit: Target,
-) -> bool {
-    let comptime_a = jit_peek_at_stack(jit, ctx, 1);
-    let comptime_b = jit_peek_at_stack(jit, ctx, 0);
-
-    let a_opnd = ctx.stack_opnd(1);
-    let b_opnd = ctx.stack_opnd(0);
+    gen_eq: bool,
+) -> Option<bool> {
+    let a_opnd = asm.stack_opnd(1);
+    let b_opnd = asm.stack_opnd(0);
+
+    let two_fixnums = match asm.ctx.two_fixnums_on_stack(jit) {
+        Some(two_fixnums) => two_fixnums,
+        None => return None,
+    };
 
-    if comptime_a.fixnum_p() && comptime_b.fixnum_p() {
-        if !assume_bop_not_redefined(jit, ocb, INTEGER_REDEFINED_OP_FLAG, BOP_EQ) {
+    if two_fixnums {
+        if !assume_bop_not_redefined(jit, asm, ocb, INTEGER_REDEFINED_OP_FLAG, BOP_EQ) {
             // if overridden, emit the generic version
-            return false;
+            return Some(false);
         }
 
-        guard_two_fixnums(jit, ctx, asm, ocb, side_exit);
+        guard_two_fixnums(jit, asm, ocb);
 
         asm.cmp(a_opnd, b_opnd);
-
-        let val = asm.csel_ne(Qfalse.into(), Qtrue.into());
+        let val = if gen_eq {
+            asm.csel_e(Qtrue.into(), Qfalse.into())
+        } else {
+            asm.csel_ne(Qtrue.into(), Qfalse.into())
+        };
 
         // Push the output on the stack
-        ctx.stack_pop(2);
-        let dst = ctx.stack_push(Type::UnknownImm);
+        asm.stack_pop(2);
+        let dst = asm.stack_push(Type::UnknownImm);
         asm.mov(dst, val);
 
-        true
+        return Some(true);
     }
-    else if unsafe { comptime_a.class_of() == rb_cString && comptime_b.class_of() == rb_cString }
-    {
-        if !assume_bop_not_redefined(jit, ocb, STRING_REDEFINED_OP_FLAG, BOP_EQ) {
+
+    if !jit.at_current_insn() {
+        return None;
+    }
+    let comptime_a = jit.peek_at_stack(&asm.ctx, 1);
+    let comptime_b = jit.peek_at_stack(&asm.ctx, 0);
+
+    if unsafe { comptime_a.class_of() == rb_cString && comptime_b.class_of() == rb_cString } {
+        if !assume_bop_not_redefined(jit, asm, ocb, STRING_REDEFINED_OP_FLAG, BOP_EQ) {
             // if overridden, emit the generic version
-            return false;
+            return Some(false);
         }
 
         // Guard that a is a String
         jit_guard_known_klass(
             jit,
-            ctx,
             asm,
             ocb,
             unsafe { rb_cString },
             a_opnd,
-            StackOpnd(1),
+            a_opnd.into(),
             comptime_a,
             SEND_MAX_DEPTH,
-            side_exit,
+            Counter::guard_send_not_string,
         );
 
         let equal = asm.new_label("equal");
         let ret = asm.new_label("ret");
 
+        // Spill for ccall. For safety, unconditionally spill temps before branching.
+        asm.spill_temps();
+
         // If they are equal by identity, return true
         asm.cmp(a_opnd, b_opnd);
         asm.je(equal);
 
         // Otherwise guard that b is a T_STRING (from type info) or String (from runtime guard)
-        let btype = ctx.get_opnd_type(StackOpnd(0));
+        let btype = asm.ctx.get_opnd_type(b_opnd.into());
         if btype.known_value_type() != Some(RUBY_T_STRING) {
             // Note: any T_STRING is valid here, but we check for a ::String for simplicity
             // To pass a mutable static variable (rb_cString) requires an unsafe block
             jit_guard_known_klass(
                 jit,
-                ctx,
                 asm,
                 ocb,
                 unsafe { rb_cString },
                 b_opnd,
-                StackOpnd(0),
+                b_opnd.into(),
                 comptime_b,
                 SEND_MAX_DEPTH,
-                side_exit,
+                Counter::guard_send_not_string,
             );
         }
 
         // Call rb_str_eql_internal(a, b)
-        let val = asm.ccall(rb_str_eql_internal as *const u8, vec![a_opnd, b_opnd]);
+        let val = asm.ccall(
+            if gen_eq { rb_str_eql_internal } else { rb_str_neq_internal } as *const u8,
+            vec![a_opnd, b_opnd],
+        );
 
         // Push the output on the stack
-        ctx.stack_pop(2);
-        let dst = ctx.stack_push(Type::UnknownImm);
+        asm.stack_pop(2);
+        let dst = asm.stack_push(Type::UnknownImm);
         asm.mov(dst, val);
         asm.jmp(ret);
 
         asm.write_label(equal);
-        asm.mov(dst, Qtrue.into());
+        asm.mov(dst, if gen_eq { Qtrue } else { Qfalse }.into());
 
         asm.write_label(ret);
 
-        true
+        Some(true)
     } else {
-        false
+        Some(false)
     }
 }
 
 fn gen_opt_eq(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    // Defer compilation so we can specialize base on a runtime receiver
-    if !jit_at_current_insn(jit) {
-        defer_compilation(jit, ctx, asm, ocb);
-        return EndBlock;
-    }
-
-    // Create a side-exit to fall back to the interpreter
-    let side_exit = get_side_exit(jit, ocb, ctx);
+) -> Option<CodegenStatus> {
+    let specialized = match gen_equality_specialized(jit, asm, ocb, true) {
+        Some(specialized) => specialized,
+        None => {
+            // Defer compilation so we can specialize based on a runtime receiver
+            defer_compilation(jit, asm, ocb);
+            return Some(EndBlock);
+        }
+    };
 
-    if gen_equality_specialized(jit, ctx, asm, ocb, side_exit) {
-        jump_to_next_insn(jit, ctx, asm, ocb);
-        EndBlock
+    if specialized {
+        jump_to_next_insn(jit, asm, ocb);
+        Some(EndBlock)
     } else {
-        gen_opt_send_without_block(jit, ctx, asm, ocb)
+        gen_opt_send_without_block(jit, asm, ocb)
     }
 }
 
 fn gen_opt_neq(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
     // opt_neq is passed two rb_call_data as arguments:
     // first for ==, second for !=
-    let cd = jit_get_arg(jit, 1).as_ptr();
-    return gen_send_general(jit, ctx, asm, ocb, cd, None);
+    let cd = jit.get_arg(1).as_ptr();
+    perf_call! { gen_send_general(jit, asm, ocb, cd, None) }
 }
 
 fn gen_opt_aref(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let cd: *const rb_call_data = jit_get_arg(jit, 0).as_ptr();
+) -> Option<CodegenStatus> {
+    let cd: *const rb_call_data = jit.get_arg(0).as_ptr();
     let argc = unsafe { vm_ci_argc((*cd).ci) };
 
     // Only JIT one arg calls like `ary[6]`
     if argc != 1 {
-        gen_counter_incr!(asm, oaref_argc_not_one);
-        return CantCompile;
+        gen_counter_incr(asm, Counter::opt_aref_argc_not_one);
+        return None;
     }
 
     // Defer compilation so we can specialize base on a runtime receiver
-    if !jit_at_current_insn(jit) {
-        defer_compilation(jit, ctx, asm, ocb);
-        return EndBlock;
+    if !jit.at_current_insn() {
+        defer_compilation(jit, asm, ocb);
+        return Some(EndBlock);
     }
 
     // Specialize base on compile time values
-    let comptime_idx = jit_peek_at_stack(jit, ctx, 0);
-    let comptime_recv = jit_peek_at_stack(jit, ctx, 1);
-
-    // Create a side-exit to fall back to the interpreter
-    let side_exit = get_side_exit(jit, ocb, ctx);
+    let comptime_idx = jit.peek_at_stack(&asm.ctx, 0);
+    let comptime_recv = jit.peek_at_stack(&asm.ctx, 1);
 
     if comptime_recv.class_of() == unsafe { rb_cArray } && comptime_idx.fixnum_p() {
-        if !assume_bop_not_redefined(jit, ocb, ARRAY_REDEFINED_OP_FLAG, BOP_AREF) {
-            return CantCompile;
+        if !assume_bop_not_redefined(jit, asm, ocb, ARRAY_REDEFINED_OP_FLAG, BOP_AREF) {
+            return None;
         }
 
         // Get the stack operands
-        let idx_opnd = ctx.stack_opnd(0);
-        let recv_opnd = ctx.stack_opnd(1);
+        let idx_opnd = asm.stack_opnd(0);
+        let recv_opnd = asm.stack_opnd(1);
 
         // Guard that the receiver is an ::Array
         // BOP_AREF check above is only good for ::Array.
         jit_guard_known_klass(
             jit,
-            ctx,
             asm,
             ocb,
             unsafe { rb_cArray },
             recv_opnd,
-            StackOpnd(1),
+            recv_opnd.into(),
             comptime_recv,
             OPT_AREF_MAX_CHAIN_DEPTH,
-            side_exit,
+            Counter::opt_aref_not_array,
         );
 
         // Bail if idx is not a FIXNUM
         let idx_reg = asm.load(idx_opnd);
         asm.test(idx_reg, (RUBY_FIXNUM_FLAG as u64).into());
-        asm.jz(counted_exit!(ocb, side_exit, oaref_arg_not_fixnum));
+        asm.jz(Target::side_exit(Counter::opt_aref_arg_not_fixnum));
 
         // Call VALUE rb_ary_entry_internal(VALUE ary, long offset).
         // It never raises or allocates, so we don't need to write to cfp->pc.
         {
+            // Pop the argument and the receiver
+            asm.stack_pop(2);
+
             let idx_reg = asm.rshift(idx_reg, Opnd::UImm(1)); // Convert fixnum to int
             let val = asm.ccall(rb_ary_entry_internal as *const u8, vec![recv_opnd, idx_reg]);
 
-            // Pop the argument and the receiver
-            ctx.stack_pop(2);
-
             // Push the return value onto the stack
-            let stack_ret = ctx.stack_push(Type::Unknown);
+            let stack_ret = asm.stack_push(Type::Unknown);
             asm.mov(stack_ret, val);
         }
 
         // Jump to next instruction. This allows guard chains to share the same successor.
-        jump_to_next_insn(jit, ctx, asm, ocb);
-        return EndBlock;
+        jump_to_next_insn(jit, asm, ocb);
+        return Some(EndBlock);
     } else if comptime_recv.class_of() == unsafe { rb_cHash } {
-        if !assume_bop_not_redefined(jit, ocb, HASH_REDEFINED_OP_FLAG, BOP_AREF) {
-            return CantCompile;
+        if !assume_bop_not_redefined(jit, asm, ocb, HASH_REDEFINED_OP_FLAG, BOP_AREF) {
+            return None;
         }
 
-        let recv_opnd = ctx.stack_opnd(1);
+        let recv_opnd = asm.stack_opnd(1);
 
         // Guard that the receiver is a hash
         jit_guard_known_klass(
             jit,
-            ctx,
             asm,
             ocb,
             unsafe { rb_cHash },
             recv_opnd,
-            StackOpnd(1),
+            recv_opnd.into(),
             comptime_recv,
             OPT_AREF_MAX_CHAIN_DEPTH,
-            side_exit,
+            Counter::opt_aref_not_hash,
         );
 
         // Prepare to call rb_hash_aref(). It might call #hash on the key.
-        jit_prepare_routine_call(jit, ctx, asm);
+        jit_prepare_non_leaf_call(jit, asm);
 
         // Call rb_hash_aref
-        let key_opnd = ctx.stack_opnd(0);
-        let recv_opnd = ctx.stack_opnd(1);
+        let key_opnd = asm.stack_opnd(0);
+        let recv_opnd = asm.stack_opnd(1);
         let val = asm.ccall(rb_hash_aref as *const u8, vec![recv_opnd, key_opnd]);
 
         // Pop the key and the receiver
-        ctx.stack_pop(2);
+        asm.stack_pop(2);
 
         // Push the return value onto the stack
-        let stack_ret = ctx.stack_push(Type::Unknown);
+        let stack_ret = asm.stack_push(Type::Unknown);
         asm.mov(stack_ret, val);
 
         // Jump to next instruction. This allows guard chains to share the same successor.
-        jump_to_next_insn(jit, ctx, asm, ocb);
-        EndBlock
+        jump_to_next_insn(jit, asm, ocb);
+        Some(EndBlock)
     } else {
         // General case. Call the [] method.
-        gen_opt_send_without_block(jit, ctx, asm, ocb)
+        gen_opt_send_without_block(jit, asm, ocb)
     }
 }
 
 fn gen_opt_aset(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
     // Defer compilation so we can specialize on a runtime `self`
-    if !jit_at_current_insn(jit) {
-        defer_compilation(jit, ctx, asm, ocb);
-        return EndBlock;
+    if !jit.at_current_insn() {
+        defer_compilation(jit, asm, ocb);
+        return Some(EndBlock);
     }
 
-    let comptime_recv = jit_peek_at_stack(jit, ctx, 2);
-    let comptime_key = jit_peek_at_stack(jit, ctx, 1);
+    let comptime_recv = jit.peek_at_stack(&asm.ctx, 2);
+    let comptime_key = jit.peek_at_stack(&asm.ctx, 1);
 
     // Get the operands from the stack
-    let recv = ctx.stack_opnd(2);
-    let key = ctx.stack_opnd(1);
-    let _val = ctx.stack_opnd(0);
+    let recv = asm.stack_opnd(2);
+    let key = asm.stack_opnd(1);
+    let _val = asm.stack_opnd(0);
 
     if comptime_recv.class_of() == unsafe { rb_cArray } && comptime_key.fixnum_p() {
-        let side_exit = get_side_exit(jit, ocb, ctx);
-
         // Guard receiver is an Array
         jit_guard_known_klass(
             jit,
-            ctx,
             asm,
             ocb,
             unsafe { rb_cArray },
             recv,
-            StackOpnd(2),
+            recv.into(),
             comptime_recv,
             SEND_MAX_DEPTH,
-            side_exit,
+            Counter::opt_aset_not_array,
         );
 
         // Guard key is a fixnum
         jit_guard_known_klass(
             jit,
-            ctx,
             asm,
             ocb,
             unsafe { rb_cInteger },
             key,
-            StackOpnd(1),
+            key.into(),
             comptime_key,
             SEND_MAX_DEPTH,
-            side_exit,
+            Counter::opt_aset_not_fixnum,
         );
 
         // We might allocate or raise
-        jit_prepare_routine_call(jit, ctx, asm);
+        jit_prepare_non_leaf_call(jit, asm);
 
         // Call rb_ary_store
-        let recv = ctx.stack_opnd(2);
-        let key = asm.load(ctx.stack_opnd(1));
+        let recv = asm.stack_opnd(2);
+        let key = asm.load(asm.stack_opnd(1));
         let key = asm.rshift(key, Opnd::UImm(1)); // FIX2LONG(key)
-        let val = ctx.stack_opnd(0);
+        let val = asm.stack_opnd(0);
         asm.ccall(rb_ary_store as *const u8, vec![recv, key, val]);
 
         // rb_ary_store returns void
         // stored value should still be on stack
-        let val = asm.load(ctx.stack_opnd(0));
+        let val = asm.load(asm.stack_opnd(0));
 
         // Push the return value onto the stack
-        ctx.stack_pop(3);
-        let stack_ret = ctx.stack_push(Type::Unknown);
+        asm.stack_pop(3);
+        let stack_ret = asm.stack_push(Type::Unknown);
         asm.mov(stack_ret, val);
 
-        jump_to_next_insn(jit, ctx, asm, ocb);
-        return EndBlock;
+        jump_to_next_insn(jit, asm, ocb);
+        return Some(EndBlock);
     } else if comptime_recv.class_of() == unsafe { rb_cHash } {
-        let side_exit = get_side_exit(jit, ocb, ctx);
-
         // Guard receiver is a Hash
         jit_guard_known_klass(
             jit,
-            ctx,
             asm,
             ocb,
             unsafe { rb_cHash },
             recv,
-            StackOpnd(2),
+            recv.into(),
             comptime_recv,
             SEND_MAX_DEPTH,
-            side_exit,
+            Counter::opt_aset_not_hash,
         );
 
         // We might allocate or raise
-        jit_prepare_routine_call(jit, ctx, asm);
+        jit_prepare_non_leaf_call(jit, asm);
 
         // Call rb_hash_aset
-        let recv = ctx.stack_opnd(2);
-        let key = ctx.stack_opnd(1);
-        let val = ctx.stack_opnd(0);
+        let recv = asm.stack_opnd(2);
+        let key = asm.stack_opnd(1);
+        let val = asm.stack_opnd(0);
         let ret = asm.ccall(rb_hash_aset as *const u8, vec![recv, key, val]);
 
         // Push the return value onto the stack
-        ctx.stack_pop(3);
-        let stack_ret = ctx.stack_push(Type::Unknown);
+        asm.stack_pop(3);
+        let stack_ret = asm.stack_push(Type::Unknown);
         asm.mov(stack_ret, ret);
 
-        jump_to_next_insn(jit, ctx, asm, ocb);
-        EndBlock
+        jump_to_next_insn(jit, asm, ocb);
+        Some(EndBlock)
     } else {
-        gen_opt_send_without_block(jit, ctx, asm, ocb)
+        gen_opt_send_without_block(jit, asm, ocb)
     }
 }
 
-fn gen_opt_and(
+fn gen_opt_aref_with(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
-    ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    // Defer compilation so we can specialize on a runtime `self`
-    if !jit_at_current_insn(jit) {
-        defer_compilation(jit, ctx, asm, ocb);
-        return EndBlock;
+    _ocb: &mut OutlinedCb,
+) -> Option<CodegenStatus>{
+    // We might allocate or raise
+    jit_prepare_non_leaf_call(jit, asm);
+
+    let key_opnd = Opnd::Value(jit.get_arg(0));
+    let recv_opnd = asm.stack_opnd(0);
+
+    extern "C" {
+        fn rb_vm_opt_aref_with(recv: VALUE, key: VALUE) -> VALUE;
     }
 
-    let comptime_a = jit_peek_at_stack(jit, ctx, 1);
-    let comptime_b = jit_peek_at_stack(jit, ctx, 0);
+    let val_opnd = asm.ccall(
+        rb_vm_opt_aref_with as *const u8,
+        vec![
+            recv_opnd,
+            key_opnd
+        ],
+    );
+    asm.stack_pop(1); // Keep it on stack during GC
+
+    asm.cmp(val_opnd, Qundef.into());
+    asm.je(Target::side_exit(Counter::opt_aref_with_qundef));
 
-    if comptime_a.fixnum_p() && comptime_b.fixnum_p() {
-        // Create a side-exit to fall back to the interpreter
-        // Note: we generate the side-exit before popping operands from the stack
-        let side_exit = get_side_exit(jit, ocb, ctx);
+    let top = asm.stack_push(Type::Unknown);
+    asm.mov(top, val_opnd);
+
+    return Some(KeepCompiling);
+}
 
-        if !assume_bop_not_redefined(jit, ocb, INTEGER_REDEFINED_OP_FLAG, BOP_AND) {
-            return CantCompile;
+fn gen_opt_and(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    ocb: &mut OutlinedCb,
+) -> Option<CodegenStatus> {
+    let two_fixnums = match asm.ctx.two_fixnums_on_stack(jit) {
+        Some(two_fixnums) => two_fixnums,
+        None => {
+            // Defer compilation so we can specialize on a runtime `self`
+            defer_compilation(jit, asm, ocb);
+            return Some(EndBlock);
+        }
+    };
+
+    if two_fixnums {
+        if !assume_bop_not_redefined(jit, asm, ocb, INTEGER_REDEFINED_OP_FLAG, BOP_AND) {
+            return None;
         }
 
         // Check that both operands are fixnums
-        guard_two_fixnums(jit, ctx, asm, ocb, side_exit);
+        guard_two_fixnums(jit, asm, ocb);
 
         // Get the operands and destination from the stack
-        let arg1 = ctx.stack_pop(1);
-        let arg0 = ctx.stack_pop(1);
+        let arg1 = asm.stack_pop(1);
+        let arg0 = asm.stack_pop(1);
 
         // Do the bitwise and arg0 & arg1
         let val = asm.and(arg0, arg1);
 
         // Push the output on the stack
-        let dst = ctx.stack_push(Type::Fixnum);
-        asm.store(dst, val);
+        let dst = asm.stack_push(Type::Fixnum);
+        asm.mov(dst, val);
 
-        KeepCompiling
+        Some(KeepCompiling)
     } else {
         // Delegate to send, call the method on the recv
-        gen_opt_send_without_block(jit, ctx, asm, ocb)
+        gen_opt_send_without_block(jit, asm, ocb)
     }
 }
 
 fn gen_opt_or(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    // Defer compilation so we can specialize on a runtime `self`
-    if !jit_at_current_insn(jit) {
-        defer_compilation(jit, ctx, asm, ocb);
-        return EndBlock;
-    }
-
-    let comptime_a = jit_peek_at_stack(jit, ctx, 1);
-    let comptime_b = jit_peek_at_stack(jit, ctx, 0);
-
-    if comptime_a.fixnum_p() && comptime_b.fixnum_p() {
-        // Create a side-exit to fall back to the interpreter
-        // Note: we generate the side-exit before popping operands from the stack
-        let side_exit = get_side_exit(jit, ocb, ctx);
+) -> Option<CodegenStatus> {
+    let two_fixnums = match asm.ctx.two_fixnums_on_stack(jit) {
+        Some(two_fixnums) => two_fixnums,
+        None => {
+            // Defer compilation so we can specialize on a runtime `self`
+            defer_compilation(jit, asm, ocb);
+            return Some(EndBlock);
+        }
+    };
 
-        if !assume_bop_not_redefined(jit, ocb, INTEGER_REDEFINED_OP_FLAG, BOP_OR) {
-            return CantCompile;
+    if two_fixnums {
+        if !assume_bop_not_redefined(jit, asm, ocb, INTEGER_REDEFINED_OP_FLAG, BOP_OR) {
+            return None;
         }
 
         // Check that both operands are fixnums
-        guard_two_fixnums(jit, ctx, asm, ocb, side_exit);
+        guard_two_fixnums(jit, asm, ocb);
 
         // Get the operands and destination from the stack
-        let arg1 = ctx.stack_pop(1);
-        let arg0 = ctx.stack_pop(1);
+        let arg1 = asm.stack_pop(1);
+        let arg0 = asm.stack_pop(1);
 
         // Do the bitwise or arg0 | arg1
         let val = asm.or(arg0, arg1);
 
         // Push the output on the stack
-        let dst = ctx.stack_push(Type::Fixnum);
-        asm.store(dst, val);
+        let dst = asm.stack_push(Type::Fixnum);
+        asm.mov(dst, val);
 
-        KeepCompiling
+        Some(KeepCompiling)
     } else {
         // Delegate to send, call the method on the recv
-        gen_opt_send_without_block(jit, ctx, asm, ocb)
+        gen_opt_send_without_block(jit, asm, ocb)
     }
 }
 
 fn gen_opt_minus(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    // Defer compilation so we can specialize on a runtime `self`
-    if !jit_at_current_insn(jit) {
-        defer_compilation(jit, ctx, asm, ocb);
-        return EndBlock;
-    }
-
-    let comptime_a = jit_peek_at_stack(jit, ctx, 1);
-    let comptime_b = jit_peek_at_stack(jit, ctx, 0);
-
-    if comptime_a.fixnum_p() && comptime_b.fixnum_p() {
-        // Create a side-exit to fall back to the interpreter
-        // Note: we generate the side-exit before popping operands from the stack
-        let side_exit = get_side_exit(jit, ocb, ctx);
+) -> Option<CodegenStatus> {
+    let two_fixnums = match asm.ctx.two_fixnums_on_stack(jit) {
+        Some(two_fixnums) => two_fixnums,
+        None => {
+            // Defer compilation so we can specialize on a runtime `self`
+            defer_compilation(jit, asm, ocb);
+            return Some(EndBlock);
+        }
+    };
 
-        if !assume_bop_not_redefined(jit, ocb, INTEGER_REDEFINED_OP_FLAG, BOP_MINUS) {
-            return CantCompile;
+    if two_fixnums {
+        if !assume_bop_not_redefined(jit, asm, ocb, INTEGER_REDEFINED_OP_FLAG, BOP_MINUS) {
+            return None;
         }
 
         // Check that both operands are fixnums
-        guard_two_fixnums(jit, ctx, asm, ocb, side_exit);
+        guard_two_fixnums(jit, asm, ocb);
 
         // Get the operands and destination from the stack
-        let arg1 = ctx.stack_pop(1);
-        let arg0 = ctx.stack_pop(1);
+        let arg1 = asm.stack_pop(1);
+        let arg0 = asm.stack_pop(1);
 
         // Subtract arg0 - arg1 and test for overflow
         let val_untag = asm.sub(arg0, arg1);
-        asm.jo(side_exit);
+        asm.jo(Target::side_exit(Counter::opt_minus_overflow));
         let val = asm.add(val_untag, Opnd::Imm(1));
 
         // Push the output on the stack
-        let dst = ctx.stack_push(Type::Fixnum);
-        asm.store(dst, val);
+        let dst = asm.stack_push(Type::Fixnum);
+        asm.mov(dst, val);
 
-        KeepCompiling
+        Some(KeepCompiling)
     } else {
         // Delegate to send, call the method on the recv
-        gen_opt_send_without_block(jit, ctx, asm, ocb)
+        gen_opt_send_without_block(jit, asm, ocb)
     }
 }
 
 fn gen_opt_mult(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    // Delegate to send, call the method on the recv
-    gen_opt_send_without_block(jit, ctx, asm, ocb)
+) -> Option<CodegenStatus> {
+    let two_fixnums = match asm.ctx.two_fixnums_on_stack(jit) {
+        Some(two_fixnums) => two_fixnums,
+        None => {
+            defer_compilation(jit, asm, ocb);
+            return Some(EndBlock);
+        }
+    };
+
+    // Fall back to a method call if it overflows
+    if two_fixnums && asm.ctx.get_chain_depth() == 0 {
+        if !assume_bop_not_redefined(jit, asm, ocb, INTEGER_REDEFINED_OP_FLAG, BOP_MULT) {
+            return None;
+        }
+
+        // Check that both operands are fixnums
+        guard_two_fixnums(jit, asm, ocb);
+
+        // Get the operands from the stack
+        let arg1 = asm.stack_pop(1);
+        let arg0 = asm.stack_pop(1);
+
+        // Do some bitwise gymnastics to handle tag bits
+        // x * y is translated to (x >> 1) * (y - 1) + 1
+        let arg0_untag = asm.rshift(arg0, Opnd::UImm(1));
+        let arg1_untag = asm.sub(arg1, Opnd::UImm(1));
+        let out_val = asm.mul(arg0_untag, arg1_untag);
+        jit_chain_guard(JCC_JO_MUL, jit, asm, ocb, 1, Counter::opt_mult_overflow);
+        let out_val = asm.add(out_val, Opnd::UImm(1));
+
+        // Push the output on the stack
+        let dst = asm.stack_push(Type::Fixnum);
+        asm.mov(dst, out_val);
+
+        Some(KeepCompiling)
+    } else {
+        gen_opt_send_without_block(jit, asm, ocb)
+    }
 }
 
 fn gen_opt_div(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
     // Delegate to send, call the method on the recv
-    gen_opt_send_without_block(jit, ctx, asm, ocb)
+    gen_opt_send_without_block(jit, asm, ocb)
 }
 
 fn gen_opt_mod(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    // Defer compilation so we can specialize on a runtime `self`
-    if !jit_at_current_insn(jit) {
-        defer_compilation(jit, ctx, asm, ocb);
-        return EndBlock;
-    }
-
-    let comptime_a = jit_peek_at_stack(jit, ctx, 1);
-    let comptime_b = jit_peek_at_stack(jit, ctx, 0);
-
-    if comptime_a.fixnum_p() && comptime_b.fixnum_p() {
-        // Create a side-exit to fall back to the interpreter
-        // Note: we generate the side-exit before popping operands from the stack
-        let side_exit = get_side_exit(jit, ocb, ctx);
+) -> Option<CodegenStatus> {
+    let two_fixnums = match asm.ctx.two_fixnums_on_stack(jit) {
+        Some(two_fixnums) => two_fixnums,
+        None => {
+            // Defer compilation so we can specialize on whether both runtime operands are fixnums
+            defer_compilation(jit, asm, ocb);
+            return Some(EndBlock);
+        }
+    };
 
-        if !assume_bop_not_redefined(jit, ocb, INTEGER_REDEFINED_OP_FLAG, BOP_MOD) {
-            return CantCompile;
+    if two_fixnums {
+        if !assume_bop_not_redefined(jit, asm, ocb, INTEGER_REDEFINED_OP_FLAG, BOP_MOD) {
+            return None;
         }
 
         // Check that both operands are fixnums
-        guard_two_fixnums(jit, ctx, asm, ocb, side_exit);
+        guard_two_fixnums(jit, asm, ocb);
 
         // Get the operands and destination from the stack
-        let arg1 = ctx.stack_pop(1);
-        let arg0 = ctx.stack_pop(1);
+        let arg1 = asm.stack_pop(1);
+        let arg0 = asm.stack_pop(1);
 
         // Check for arg0 % 0
         asm.cmp(arg1, Opnd::Imm(VALUE::fixnum_from_usize(0).as_i64()));
-        asm.je(side_exit);
+        asm.je(Target::side_exit(Counter::opt_mod_zero));
 
         // Call rb_fix_mod_fix(VALUE recv, VALUE obj)
         let ret = asm.ccall(rb_fix_mod_fix as *const u8, vec![arg0, arg1]);
 
         // Push the return value onto the stack
-        let stack_ret = ctx.stack_push(Type::Unknown);
+        // When the two arguments are fixnums, the modulo output is always a fixnum
+        let stack_ret = asm.stack_push(Type::Fixnum);
         asm.mov(stack_ret, ret);
 
-        KeepCompiling
+        Some(KeepCompiling)
     } else {
         // Delegate to send, call the method on the recv
-        gen_opt_send_without_block(jit, ctx, asm, ocb)
+        gen_opt_send_without_block(jit, asm, ocb)
     }
 }
 
 fn gen_opt_ltlt(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
     // Delegate to send, call the method on the recv
-    gen_opt_send_without_block(jit, ctx, asm, ocb)
+    gen_opt_send_without_block(jit, asm, ocb)
 }
 
 fn gen_opt_nil_p(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
     // Delegate to send, call the method on the recv
-    gen_opt_send_without_block(jit, ctx, asm, ocb)
+    gen_opt_send_without_block(jit, asm, ocb)
 }
 
 fn gen_opt_empty_p(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
     // Delegate to send, call the method on the recv
-    gen_opt_send_without_block(jit, ctx, asm, ocb)
+    gen_opt_send_without_block(jit, asm, ocb)
 }
 
 fn gen_opt_succ(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
     // Delegate to send, call the method on the recv
-    gen_opt_send_without_block(jit, ctx, asm, ocb)
+    gen_opt_send_without_block(jit, asm, ocb)
 }
 
-
 fn gen_opt_str_freeze(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    if !assume_bop_not_redefined(jit, ocb, STRING_REDEFINED_OP_FLAG, BOP_FREEZE) {
-        return CantCompile;
+) -> Option<CodegenStatus> {
+    if !assume_bop_not_redefined(jit, asm, ocb, STRING_REDEFINED_OP_FLAG, BOP_FREEZE) {
+        return None;
     }
 
-    let str = jit_get_arg(jit, 0);
+    let str = jit.get_arg(0);
 
     // Push the return value onto the stack
-    let stack_ret = ctx.stack_push(Type::CString);
+    let stack_ret = asm.stack_push(Type::CString);
     asm.mov(stack_ret, str.into());
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 fn gen_opt_str_uminus(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    if !assume_bop_not_redefined(jit, ocb, STRING_REDEFINED_OP_FLAG, BOP_UMINUS) {
-        return CantCompile;
+) -> Option<CodegenStatus> {
+    if !assume_bop_not_redefined(jit, asm, ocb, STRING_REDEFINED_OP_FLAG, BOP_UMINUS) {
+        return None;
     }
 
-    let str = jit_get_arg(jit, 0);
+    let str = jit.get_arg(0);
 
     // Push the return value onto the stack
-    let stack_ret = ctx.stack_push(Type::CString);
+    let stack_ret = asm.stack_push(Type::CString);
     asm.mov(stack_ret, str.into());
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 fn gen_opt_newarray_max(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let num = jit_get_arg(jit, 0).as_u32();
+) -> Option<CodegenStatus> {
+    let num = jit.get_arg(0).as_u32();
 
-    // Save the PC and SP because we may allocate
-    jit_prepare_routine_call(jit, ctx, asm);
+    // Save the PC and SP because we may call #max
+    jit_prepare_non_leaf_call(jit, asm);
 
     extern "C" {
         fn rb_vm_opt_newarray_max(ec: EcPtr, num: u32, elts: *const VALUE) -> VALUE;
     }
 
-    let offset_magnitude = (SIZEOF_VALUE as u32) * num;
-    let values_opnd = ctx.sp_opnd(-(offset_magnitude as isize));
+    let values_opnd = asm.ctx.sp_opnd(-(num as i32));
     let values_ptr = asm.lea(values_opnd);
 
     let val_opnd = asm.ccall(
@@ -3375,31 +4223,120 @@ fn gen_opt_newarray_max(
         ],
     );
 
-    ctx.stack_pop(num.as_usize());
-    let stack_ret = ctx.stack_push(Type::Unknown);
+    asm.stack_pop(num.as_usize());
+    let stack_ret = asm.stack_push(Type::Unknown);
     asm.mov(stack_ret, val_opnd);
 
-    KeepCompiling
+    Some(KeepCompiling)
+}
+
+fn gen_opt_newarray_send(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    _ocb: &mut OutlinedCb,
+) -> Option<CodegenStatus> {
+    let method = jit.get_arg(1).as_u64();
+
+    if method == ID!(min) {
+        gen_opt_newarray_min(jit, asm, _ocb)
+    } else if method == ID!(max) {
+        gen_opt_newarray_max(jit, asm, _ocb)
+    } else if method == ID!(hash) {
+        gen_opt_newarray_hash(jit, asm, _ocb)
+    } else if method == ID!(pack) {
+        gen_opt_newarray_pack(jit, asm, _ocb)
+    } else {
+        None
+    }
+}
+
+fn gen_opt_newarray_pack(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    _ocb: &mut OutlinedCb,
+) -> Option<CodegenStatus> {
+    // num counts the array elements plus the trailing format string (e.g. num == 4 for three elements)
+    let num = jit.get_arg(0).as_u32();
+
+    // Save the PC and SP because we may call #pack
+    jit_prepare_non_leaf_call(jit, asm);
+
+    extern "C" {
+        fn rb_vm_opt_newarray_pack(ec: EcPtr, num: u32, elts: *const VALUE, fmt: VALUE) -> VALUE;
+    }
+
+    let values_opnd = asm.ctx.sp_opnd(-(num as i32));
+    let values_ptr = asm.lea(values_opnd);
+
+    let fmt_string = asm.ctx.sp_opnd(-1);
+
+    let val_opnd = asm.ccall(
+        rb_vm_opt_newarray_pack as *const u8,
+        vec![
+            EC,
+            (num - 1).into(),
+            values_ptr,
+            fmt_string
+        ],
+    );
+
+    asm.stack_pop(num.as_usize());
+    let stack_ret = asm.stack_push(Type::CString);
+    asm.mov(stack_ret, val_opnd);
+
+    Some(KeepCompiling)
+}
+
+fn gen_opt_newarray_hash(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    _ocb: &mut OutlinedCb,
+) -> Option<CodegenStatus> {
+
+    let num = jit.get_arg(0).as_u32();
+
+    // Save the PC and SP because we may call #hash
+    jit_prepare_non_leaf_call(jit, asm);
+
+    extern "C" {
+        fn rb_vm_opt_newarray_hash(ec: EcPtr, num: u32, elts: *const VALUE) -> VALUE;
+    }
+
+    let values_opnd = asm.ctx.sp_opnd(-(num as i32));
+    let values_ptr = asm.lea(values_opnd);
+
+    let val_opnd = asm.ccall(
+        rb_vm_opt_newarray_hash as *const u8,
+        vec![
+            EC,
+            num.into(),
+            values_ptr
+        ],
+    );
+
+    asm.stack_pop(num.as_usize());
+    let stack_ret = asm.stack_push(Type::Unknown);
+    asm.mov(stack_ret, val_opnd);
+
+    Some(KeepCompiling)
 }
 
 fn gen_opt_newarray_min(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
 
-    let num = jit_get_arg(jit, 0).as_u32();
+    let num = jit.get_arg(0).as_u32();
 
-    // Save the PC and SP because we may allocate
-    jit_prepare_routine_call(jit, ctx, asm);
+    // Save the PC and SP because we may call #min
+    jit_prepare_non_leaf_call(jit, asm);
 
     extern "C" {
         fn rb_vm_opt_newarray_min(ec: EcPtr, num: u32, elts: *const VALUE) -> VALUE;
     }
 
-    let offset_magnitude = (SIZEOF_VALUE as u32) * num;
-    let values_opnd = ctx.sp_opnd(-(offset_magnitude as isize));
+    let values_opnd = asm.ctx.sp_opnd(-(num as i32));
     let values_ptr = asm.lea(values_opnd);
 
     let val_opnd = asm.ccall(
@@ -3411,55 +4348,50 @@ fn gen_opt_newarray_min(
         ],
     );
 
-    ctx.stack_pop(num.as_usize());
-    let stack_ret = ctx.stack_push(Type::Unknown);
+    asm.stack_pop(num.as_usize());
+    let stack_ret = asm.stack_push(Type::Unknown);
     asm.mov(stack_ret, val_opnd);
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 fn gen_opt_not(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    return gen_opt_send_without_block(jit, ctx, asm, ocb);
+) -> Option<CodegenStatus> {
+    return gen_opt_send_without_block(jit, asm, ocb);
 }
 
 fn gen_opt_size(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    return gen_opt_send_without_block(jit, ctx, asm, ocb);
+) -> Option<CodegenStatus> {
+    return gen_opt_send_without_block(jit, asm, ocb);
 }
 
 fn gen_opt_length(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    return gen_opt_send_without_block(jit, ctx, asm, ocb);
+) -> Option<CodegenStatus> {
+    return gen_opt_send_without_block(jit, asm, ocb);
 }
 
 fn gen_opt_regexpmatch2(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    return gen_opt_send_without_block(jit, ctx, asm, ocb);
+) -> Option<CodegenStatus> {
+    return gen_opt_send_without_block(jit, asm, ocb);
 }
 
 fn gen_opt_case_dispatch(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
     // Normally this instruction would lookup the key in a hash and jump to an
     // offset based on that.
     // Instead we can take the fallback case and continue with the next
@@ -3467,19 +4399,18 @@ fn gen_opt_case_dispatch(
     // We'd hope that our jitted code will be sufficiently fast without the
     // hash lookup, at least for small hashes, but it's worth revisiting this
     // assumption in the future.
-    if !jit_at_current_insn(jit) {
-        defer_compilation(jit, ctx, asm, ocb);
-        return EndBlock;
+    if !jit.at_current_insn() {
+        defer_compilation(jit, asm, ocb);
+        return Some(EndBlock);
     }
-    let starting_context = ctx.clone();
 
-    let case_hash = jit_get_arg(jit, 0);
-    let else_offset = jit_get_arg(jit, 1).as_u32();
+    let case_hash = jit.get_arg(0);
+    let else_offset = jit.get_arg(1).as_u32();
 
     // Try to reorder case/else branches so that ones that are actually used come first.
     // Supporting only Fixnum for now so that the implementation can be an equality check.
-    let key_opnd = ctx.stack_pop(1);
-    let comptime_key = jit_peek_at_stack(jit, ctx, 0);
+    let key_opnd = asm.stack_opnd(0);
+    let comptime_key = jit.peek_at_stack(&asm.ctx, 0);
 
     // Check that all cases are fixnums to avoid having to register BOP assumptions on
     // all the types that case hashes support. This spends compile time to save memory.
@@ -3500,23 +4431,28 @@ fn gen_opt_case_dispatch(
         all_fixnum
     }
 
-    if comptime_key.fixnum_p() && comptime_key.0 <= u32::MAX.as_usize() && case_hash_all_fixnum_p(case_hash) {
-        if !assume_bop_not_redefined(jit, ocb, INTEGER_REDEFINED_OP_FLAG, BOP_EQQ) {
-            return CantCompile;
+    // If megamorphic, fallback to compiling branch instructions after opt_case_dispatch
+    let megamorphic = asm.ctx.get_chain_depth() >= CASE_WHEN_MAX_DEPTH;
+    if megamorphic {
+        gen_counter_incr(asm, Counter::num_opt_case_dispatch_megamorphic);
+    }
+
+    if comptime_key.fixnum_p() && comptime_key.0 <= u32::MAX.as_usize() && case_hash_all_fixnum_p(case_hash) && !megamorphic {
+        if !assume_bop_not_redefined(jit, asm, ocb, INTEGER_REDEFINED_OP_FLAG, BOP_EQQ) {
+            return None;
         }
 
         // Check if the key is the same value
         asm.cmp(key_opnd, comptime_key.into());
-        let side_exit = get_side_exit(jit, ocb, &starting_context);
         jit_chain_guard(
             JCC_JNE,
             jit,
-            &starting_context,
             asm,
             ocb,
             CASE_WHEN_MAX_DEPTH,
-            side_exit,
+            Counter::opt_case_dispatch_megamorphic,
         );
+        asm.stack_pop(1); // Pop key_opnd
 
         // Get the offset for the compile-time key
         let mut offset = 0;
@@ -3529,51 +4465,30 @@ fn gen_opt_case_dispatch(
         };
 
         // Jump to the offset of case or else
-        let jump_block = BlockId { iseq: jit.iseq, idx: jit_next_insn_idx(jit) + jump_offset };
-        gen_direct_jump(jit, &ctx, jump_block, asm);
-        EndBlock
+        let jump_idx = jit.next_insn_idx() as u32 + jump_offset;
+        let jump_block = BlockId { iseq: jit.iseq, idx: jump_idx.try_into().unwrap() };
+        gen_direct_jump(jit, &asm.ctx.clone(), jump_block, asm);
+        Some(EndBlock)
     } else {
-        KeepCompiling // continue with === branches
-    }
-}
-
-fn gen_branchif_branch(
-    asm: &mut Assembler,
-    target0: CodePtr,
-    target1: Option<CodePtr>,
-    shape: BranchShape,
-) {
-    assert!(target1 != None);
-    match shape {
-        BranchShape::Next0 => {
-            asm.jz(target1.unwrap().into());
-        }
-        BranchShape::Next1 => {
-            asm.jnz(target0.into());
-        }
-        BranchShape::Default => {
-            asm.jnz(target0.into());
-            asm.jmp(target1.unwrap().into());
-        }
+        asm.stack_pop(1); // Pop key_opnd
+        Some(KeepCompiling) // continue with === branches
     }
 }
 
 fn gen_branchif(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let jump_offset = jit_get_arg(jit, 0).as_i32();
+) -> Option<CodegenStatus> {
+    let jump_offset = jit.get_arg(0).as_i32();
 
     // Check for interrupts, but only on backward branches that may create loops
     if jump_offset < 0 {
-        let side_exit = get_side_exit(jit, ocb, ctx);
-        gen_check_ints(asm, side_exit);
+        gen_check_ints(asm, Counter::branchif_interrupted);
     }
 
     // Get the branch target instruction offsets
-    let next_idx = jit_next_insn_idx(jit);
+    let next_idx = jit.next_insn_idx();
     let jump_idx = (next_idx as i32) + jump_offset;
     let next_block = BlockId {
         iseq: jit.iseq,
@@ -3581,68 +4496,54 @@ fn gen_branchif(
     };
     let jump_block = BlockId {
         iseq: jit.iseq,
-        idx: jump_idx as u32,
+        idx: jump_idx.try_into().unwrap(),
     };
 
     // Test if any bit (outside of the Qnil bit) is on
     // See RB_TEST()
-    let val_type = ctx.get_opnd_type(StackOpnd(0));
-    let val_opnd = ctx.stack_pop(1);
+    let val_type = asm.ctx.get_opnd_type(StackOpnd(0));
+    let val_opnd = asm.stack_pop(1);
+
+    incr_counter!(branch_insn_count);
 
     if let Some(result) = val_type.known_truthy() {
         let target = if result { jump_block } else { next_block };
-        gen_direct_jump(jit, ctx, target, asm);
+        gen_direct_jump(jit, &asm.ctx.clone(), target, asm);
+        incr_counter!(branch_known_count);
     } else {
         asm.test(val_opnd, Opnd::Imm(!Qnil.as_i64()));
 
         // Generate the branch instructions
+        let ctx = asm.ctx;
         gen_branch(
             jit,
             asm,
             ocb,
             jump_block,
-            ctx,
+            &ctx,
             Some(next_block),
-            Some(ctx),
-            gen_branchif_branch,
+            Some(&ctx),
+            BranchGenFn::BranchIf(Cell::new(BranchShape::Default)),
         );
     }
 
-    EndBlock
-}
-
-fn gen_branchunless_branch(
-    asm: &mut Assembler,
-    target0: CodePtr,
-    target1: Option<CodePtr>,
-    shape: BranchShape,
-) {
-    match shape {
-        BranchShape::Next0 => asm.jnz(target1.unwrap().into()),
-        BranchShape::Next1 => asm.jz(target0.into()),
-        BranchShape::Default => {
-            asm.jz(target0.into());
-            asm.jmp(target1.unwrap().into());
-        }
-    }
+    Some(EndBlock)
 }
 
 fn gen_branchunless(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let jump_offset = jit_get_arg(jit, 0).as_i32();
+) -> Option<CodegenStatus> {
+    let jump_offset = jit.get_arg(0).as_i32();
 
     // Check for interrupts, but only on backward branches that may create loops
     if jump_offset < 0 {
-        let side_exit = get_side_exit(jit, ocb, ctx);
-        gen_check_ints(asm, side_exit);
+        gen_check_ints(asm, Counter::branchunless_interrupted);
     }
 
     // Get the branch target instruction offsets
-    let next_idx = jit_next_insn_idx(jit) as i32;
+    let next_idx = jit.next_insn_idx() as i32;
     let jump_idx = next_idx + jump_offset;
     let next_block = BlockId {
         iseq: jit.iseq,
@@ -3653,12 +4554,15 @@ fn gen_branchunless(
         idx: jump_idx.try_into().unwrap(),
     };
 
-    let val_type = ctx.get_opnd_type(StackOpnd(0));
-    let val_opnd = ctx.stack_pop(1);
+    let val_type = asm.ctx.get_opnd_type(StackOpnd(0));
+    let val_opnd = asm.stack_pop(1);
+
+    incr_counter!(branch_insn_count);
 
     if let Some(result) = val_type.known_truthy() {
         let target = if result { next_block } else { jump_block };
-        gen_direct_jump(jit, ctx, target, asm);
+        gen_direct_jump(jit, &asm.ctx.clone(), target, asm);
+        incr_counter!(branch_known_count);
     } else {
         // Test if any bit (outside of the Qnil bit) is on
         // See RB_TEST()
@@ -3666,53 +4570,36 @@ fn gen_branchunless(
         asm.test(val_opnd, not_qnil.into());
 
         // Generate the branch instructions
+        let ctx = asm.ctx;
         gen_branch(
             jit,
             asm,
             ocb,
             jump_block,
-            ctx,
+            &ctx,
             Some(next_block),
-            Some(ctx),
-            gen_branchunless_branch,
+            Some(&ctx),
+            BranchGenFn::BranchUnless(Cell::new(BranchShape::Default)),
         );
     }
 
-    EndBlock
-}
-
-fn gen_branchnil_branch(
-    asm: &mut Assembler,
-    target0: CodePtr,
-    target1: Option<CodePtr>,
-    shape: BranchShape,
-) {
-    match shape {
-        BranchShape::Next0 => asm.jne(target1.unwrap().into()),
-        BranchShape::Next1 => asm.je(target0.into()),
-        BranchShape::Default => {
-            asm.je(target0.into());
-            asm.jmp(target1.unwrap().into());
-        }
-    }
+    Some(EndBlock)
 }
 
 fn gen_branchnil(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let jump_offset = jit_get_arg(jit, 0).as_i32();
+) -> Option<CodegenStatus> {
+    let jump_offset = jit.get_arg(0).as_i32();
 
     // Check for interrupts, but only on backward branches that may create loops
     if jump_offset < 0 {
-        let side_exit = get_side_exit(jit, ocb, ctx);
-        gen_check_ints(asm, side_exit);
+        gen_check_ints(asm, Counter::branchnil_interrupted);
     }
 
     // Get the branch target instruction offsets
-    let next_idx = jit_next_insn_idx(jit) as i32;
+    let next_idx = jit.next_insn_idx() as i32;
     let jump_idx = next_idx + jump_offset;
     let next_block = BlockId {
         iseq: jit.iseq,
@@ -3723,56 +4610,101 @@ fn gen_branchnil(
         idx: jump_idx.try_into().unwrap(),
     };
 
-    let val_type = ctx.get_opnd_type(StackOpnd(0));
-    let val_opnd = ctx.stack_pop(1);
+    let val_type = asm.ctx.get_opnd_type(StackOpnd(0));
+    let val_opnd = asm.stack_pop(1);
+
+    incr_counter!(branch_insn_count);
 
     if let Some(result) = val_type.known_nil() {
         let target = if result { jump_block } else { next_block };
-        gen_direct_jump(jit, ctx, target, asm);
+        gen_direct_jump(jit, &asm.ctx.clone(), target, asm);
+        incr_counter!(branch_known_count);
     } else {
         // Test if the value is Qnil
         asm.cmp(val_opnd, Opnd::UImm(Qnil.into()));
         // Generate the branch instructions
+        let ctx = asm.ctx;
         gen_branch(
             jit,
             asm,
             ocb,
             jump_block,
-            ctx,
+            &ctx,
             Some(next_block),
-            Some(ctx),
-            gen_branchnil_branch,
+            Some(&ctx),
+            BranchGenFn::BranchNil(Cell::new(BranchShape::Default)),
         );
     }
 
-    EndBlock
+    Some(EndBlock)
+}
+
+fn gen_throw(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    _ocb: &mut OutlinedCb,
+) -> Option<CodegenStatus> {
+    let throw_state = jit.get_arg(0).as_u64();
+    let throwobj = asm.stack_pop(1);
+    let throwobj = asm.load(throwobj);
+
+    // Gather some statistics about throw
+    gen_counter_incr(asm, Counter::num_throw);
+    match (throw_state & VM_THROW_STATE_MASK as u64) as u32 {
+        RUBY_TAG_BREAK => gen_counter_incr(asm, Counter::num_throw_break),
+        RUBY_TAG_RETRY => gen_counter_incr(asm, Counter::num_throw_retry),
+        RUBY_TAG_RETURN => gen_counter_incr(asm, Counter::num_throw_return),
+        _ => {},
+    }
+
+    // THROW_DATA_NEW allocates. Save SP for GC and PC for allocation tracing as
+    // well as handling the catch table. However, not using jit_prepare_call_with_gc
+    // since we don't need a patch point for this implementation.
+    jit_save_pc(jit, asm);
+    gen_save_sp(asm);
+
+    // rb_vm_throw verifies it's a valid throw, sets ec->tag->state, and returns throw
+    // data, which is throwobj or a vm_throw_data wrapping it. When ec->tag->state is
+    // set, JIT code callers will handle the throw with vm_exec_handle_exception.
+    extern "C" {
+        fn rb_vm_throw(ec: EcPtr, reg_cfp: CfpPtr, throw_state: u32, throwobj: VALUE) -> VALUE;
+    }
+    let val = asm.ccall(rb_vm_throw as *mut u8, vec![EC, CFP, throw_state.into(), throwobj]);
+
+    asm_comment!(asm, "exit from throw");
+    asm.cpop_into(SP);
+    asm.cpop_into(EC);
+    asm.cpop_into(CFP);
+
+    asm.frame_teardown();
+
+    asm.cret(val);
+    Some(EndBlock)
 }
 
 fn gen_jump(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
-    ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let jump_offset = jit_get_arg(jit, 0).as_i32();
+    _ocb: &mut OutlinedCb,
+) -> Option<CodegenStatus> {
+    let jump_offset = jit.get_arg(0).as_i32();
 
     // Check for interrupts, but only on backward branches that may create loops
     if jump_offset < 0 {
-        let side_exit = get_side_exit(jit, ocb, ctx);
-        gen_check_ints(asm, side_exit);
+        gen_check_ints(asm, Counter::jump_interrupted);
     }
 
     // Get the branch target instruction offsets
-    let jump_idx = (jit_next_insn_idx(jit) as i32) + jump_offset;
+    let jump_idx = jit.next_insn_idx() as i32 + jump_offset;
     let jump_block = BlockId {
         iseq: jit.iseq,
-        idx: jump_idx as u32,
+        idx: jump_idx.try_into().unwrap(),
     };
 
     // Generate the jump instruction
-    gen_direct_jump(jit, ctx, jump_block, asm);
+    gen_direct_jump(jit, &asm.ctx.clone(), jump_block, asm);
 
-    EndBlock
+    Some(EndBlock)
 }
 
 /// Guard that self or a stack operand has the same class as `known_klass`, using
@@ -3783,60 +4715,69 @@ fn gen_jump(
 /// Recompile as contingency if possible, or take side exit a last resort.
 fn jit_guard_known_klass(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
     known_klass: VALUE,
     obj_opnd: Opnd,
     insn_opnd: YARVOpnd,
     sample_instance: VALUE,
-    max_chain_depth: i32,
-    side_exit: Target,
+    max_chain_depth: u8,
+    counter: Counter,
 ) {
-    let val_type = ctx.get_opnd_type(insn_opnd);
+    let val_type = asm.ctx.get_opnd_type(insn_opnd);
 
     if val_type.known_class() == Some(known_klass) {
-        // We already know from type information that this is a match
-        return;
+        // Unless frozen, Array, Hash, and String objects may change their RBASIC_CLASS
+        // when they get a singleton class. Those types need invalidations.
+        if unsafe { [rb_cArray, rb_cHash, rb_cString].contains(&known_klass) } {
+            if jit.assume_no_singleton_class(asm, ocb, known_klass) {
+                // Speculate that this object will not have a singleton class,
+                // and invalidate the block in case it does.
+                return;
+            }
+        } else {
+            // We already know from type information that this is a match
+            return;
+        }
     }
 
     if unsafe { known_klass == rb_cNilClass } {
         assert!(!val_type.is_heap());
         assert!(val_type.is_unknown());
 
-        asm.comment("guard object is nil");
+        asm_comment!(asm, "guard object is nil");
         asm.cmp(obj_opnd, Qnil.into());
-        jit_chain_guard(JCC_JNE, jit, ctx, asm, ocb, max_chain_depth, side_exit);
+        jit_chain_guard(JCC_JNE, jit, asm, ocb, max_chain_depth, counter);
 
-        ctx.upgrade_opnd_type(insn_opnd, Type::Nil);
+        asm.ctx.upgrade_opnd_type(insn_opnd, Type::Nil);
     } else if unsafe { known_klass == rb_cTrueClass } {
         assert!(!val_type.is_heap());
         assert!(val_type.is_unknown());
 
-        asm.comment("guard object is true");
+        asm_comment!(asm, "guard object is true");
         asm.cmp(obj_opnd, Qtrue.into());
-        jit_chain_guard(JCC_JNE, jit, ctx, asm, ocb, max_chain_depth, side_exit);
+        jit_chain_guard(JCC_JNE, jit, asm, ocb, max_chain_depth, counter);
 
-        ctx.upgrade_opnd_type(insn_opnd, Type::True);
+        asm.ctx.upgrade_opnd_type(insn_opnd, Type::True);
     } else if unsafe { known_klass == rb_cFalseClass } {
         assert!(!val_type.is_heap());
         assert!(val_type.is_unknown());
 
-        asm.comment("guard object is false");
+        asm_comment!(asm, "guard object is false");
         assert!(Qfalse.as_i32() == 0);
         asm.test(obj_opnd, obj_opnd);
-        jit_chain_guard(JCC_JNZ, jit, ctx, asm, ocb, max_chain_depth, side_exit);
+        jit_chain_guard(JCC_JNZ, jit, asm, ocb, max_chain_depth, counter);
 
-        ctx.upgrade_opnd_type(insn_opnd, Type::False);
+        asm.ctx.upgrade_opnd_type(insn_opnd, Type::False);
     } else if unsafe { known_klass == rb_cInteger } && sample_instance.fixnum_p() {
         // We will guard fixnum and bignum as though they were separate classes
         // BIGNUM can be handled by the general else case below
         assert!(val_type.is_unknown());
 
-        asm.comment("guard object is fixnum");
+        asm_comment!(asm, "guard object is fixnum");
         asm.test(obj_opnd, Opnd::Imm(RUBY_FIXNUM_FLAG as i64));
-        jit_chain_guard(JCC_JZ, jit, ctx, asm, ocb, max_chain_depth, side_exit);
-        ctx.upgrade_opnd_type(insn_opnd, Type::Fixnum);
+        jit_chain_guard(JCC_JZ, jit, asm, ocb, max_chain_depth, counter);
+        asm.ctx.upgrade_opnd_type(insn_opnd, Type::Fixnum);
     } else if unsafe { known_klass == rb_cSymbol } && sample_instance.static_sym_p() {
         assert!(!val_type.is_heap());
         // We will guard STATIC vs DYNAMIC as though they were separate classes
@@ -3844,11 +4785,11 @@ fn jit_guard_known_klass(
         if val_type != Type::ImmSymbol || !val_type.is_imm() {
             assert!(val_type.is_unknown());
 
-            asm.comment("guard object is static symbol");
+            asm_comment!(asm, "guard object is static symbol");
             assert!(RUBY_SPECIAL_SHIFT == 8);
             asm.cmp(obj_opnd.with_num_bits(8).unwrap(), Opnd::UImm(RUBY_SYMBOL_FLAG as u64));
-            jit_chain_guard(JCC_JNE, jit, ctx, asm, ocb, max_chain_depth, side_exit);
-            ctx.upgrade_opnd_type(insn_opnd, Type::ImmSymbol);
+            jit_chain_guard(JCC_JNE, jit, asm, ocb, max_chain_depth, counter);
+            asm.ctx.upgrade_opnd_type(insn_opnd, Type::ImmSymbol);
         }
     } else if unsafe { known_klass == rb_cFloat } && sample_instance.flonum_p() {
         assert!(!val_type.is_heap());
@@ -3856,15 +4797,16 @@ fn jit_guard_known_klass(
             assert!(val_type.is_unknown());
 
             // We will guard flonum vs heap float as though they were separate classes
-            asm.comment("guard object is flonum");
+            asm_comment!(asm, "guard object is flonum");
             let flag_bits = asm.and(obj_opnd, Opnd::UImm(RUBY_FLONUM_MASK as u64));
             asm.cmp(flag_bits, Opnd::UImm(RUBY_FLONUM_FLAG as u64));
-            jit_chain_guard(JCC_JNE, jit, ctx, asm, ocb, max_chain_depth, side_exit);
-            ctx.upgrade_opnd_type(insn_opnd, Type::Flonum);
+            jit_chain_guard(JCC_JNE, jit, asm, ocb, max_chain_depth, counter);
+            asm.ctx.upgrade_opnd_type(insn_opnd, Type::Flonum);
         }
     } else if unsafe {
         FL_TEST(known_klass, VALUE(RUBY_FL_SINGLETON as usize)) != VALUE(0)
-            && sample_instance == rb_attr_get(known_klass, id__attached__ as ID)
+            && sample_instance == rb_class_attached_object(known_klass)
+            && !rb_obj_is_kind_of(sample_instance, rb_cIO).test()
     } {
         // Singleton classes are attached to one specific object, so we can
         // avoid one memory access (and potentially the is_heap check) by
@@ -3876,9 +4818,11 @@ fn jit_guard_known_klass(
         // that its singleton class is empty, so we can't avoid the memory
         // access. As an example, `Object.new.singleton_class` is an object in
         // this situation.
-        asm.comment("guard known object with singleton class");
+        // Also, guarding by identity is incorrect for IO objects because
+        // IO#reopen can be used to change the class and singleton class of IO objects!
+        asm_comment!(asm, "guard known object with singleton class");
         asm.cmp(obj_opnd, sample_instance.into());
-        jit_chain_guard(JCC_JNE, jit, ctx, asm, ocb, max_chain_depth, side_exit);
+        jit_chain_guard(JCC_JNE, jit, asm, ocb, max_chain_depth, counter);
     } else if val_type == Type::CString && unsafe { known_klass == rb_cString } {
         // guard elided because the context says we've already checked
         unsafe {
@@ -3890,32 +4834,34 @@ fn jit_guard_known_klass(
         // Check that the receiver is a heap object
         // Note: if we get here, the class doesn't have immediate instances.
         if !val_type.is_heap() {
-            asm.comment("guard not immediate");
+            asm_comment!(asm, "guard not immediate");
             asm.test(obj_opnd, (RUBY_IMMEDIATE_MASK as u64).into());
-            jit_chain_guard(JCC_JNZ, jit, ctx, asm, ocb, max_chain_depth, side_exit);
+            jit_chain_guard(JCC_JNZ, jit, asm, ocb, max_chain_depth, counter);
             asm.cmp(obj_opnd, Qfalse.into());
-            jit_chain_guard(JCC_JE, jit, ctx, asm, ocb, max_chain_depth, side_exit);
+            jit_chain_guard(JCC_JE, jit, asm, ocb, max_chain_depth, counter);
 
-            ctx.upgrade_opnd_type(insn_opnd, Type::UnknownHeap);
+            asm.ctx.upgrade_opnd_type(insn_opnd, Type::UnknownHeap);
         }
 
         // If obj_opnd isn't already a register, load it.
         let obj_opnd = match obj_opnd {
-            Opnd::Reg(_) => obj_opnd,
+            Opnd::InsnOut { .. } => obj_opnd,
             _ => asm.load(obj_opnd),
         };
         let klass_opnd = Opnd::mem(64, obj_opnd, RUBY_OFFSET_RBASIC_KLASS);
 
         // Bail if receiver class is different from known_klass
         // TODO: jit_mov_gc_ptr keeps a strong reference, which leaks the class.
-        asm.comment("guard known class");
+        asm_comment!(asm, "guard known class");
         asm.cmp(klass_opnd, known_klass.into());
-        jit_chain_guard(JCC_JNE, jit, ctx, asm, ocb, max_chain_depth, side_exit);
+        jit_chain_guard(JCC_JNE, jit, asm, ocb, max_chain_depth, counter);
 
         if known_klass == unsafe { rb_cString } {
-            ctx.upgrade_opnd_type(insn_opnd, Type::CString);
+            asm.ctx.upgrade_opnd_type(insn_opnd, Type::CString);
         } else if known_klass == unsafe { rb_cArray } {
-            ctx.upgrade_opnd_type(insn_opnd, Type::CArray);
+            asm.ctx.upgrade_opnd_type(insn_opnd, Type::CArray);
+        } else if known_klass == unsafe { rb_cHash } {
+            asm.ctx.upgrade_opnd_type(insn_opnd, Type::CHash);
         }
     }
 }
@@ -3923,11 +4869,8 @@ fn jit_guard_known_klass(
 // Generate ancestry guard for protected callee.
 // Calls to protected callees only go through when self.is_a?(klass_that_defines_the_callee).
 fn jit_protected_callee_ancestry_guard(
-    _jit: &mut JITState,
     asm: &mut Assembler,
-    ocb: &mut OutlinedCb,
     cme: *const rb_callable_method_entry_t,
-    side_exit: Target,
 ) {
     // See vm_call_method().
     let def_class = unsafe { (*cme).defined_class };
@@ -3942,7 +4885,7 @@ fn jit_protected_callee_ancestry_guard(
         ],
     );
     asm.test(val, val);
-    asm.jz(counted_exit!(ocb, side_exit, send_se_protected_check_failed))
+    asm.jz(Target::side_exit(Counter::guard_send_se_protected_check_failed))
 }
 
 // Codegen for rb_obj_not().
@@ -3950,29 +4893,28 @@ fn jit_protected_callee_ancestry_guard(
 // arity guards.
 fn jit_rb_obj_not(
     _jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
     _ci: *const rb_callinfo,
     _cme: *const rb_callable_method_entry_t,
-    _block: Option<IseqPtr>,
+    _block: Option<BlockHandler>,
     _argc: i32,
-    _known_recv_class: *const VALUE,
+    _known_recv_class: Option<VALUE>,
 ) -> bool {
-    let recv_opnd = ctx.get_opnd_type(StackOpnd(0));
+    let recv_opnd = asm.ctx.get_opnd_type(StackOpnd(0));
 
     match recv_opnd.known_truthy() {
         Some(false) => {
-            asm.comment("rb_obj_not(nil_or_false)");
-            ctx.stack_pop(1);
-            let out_opnd = ctx.stack_push(Type::True);
+            asm_comment!(asm, "rb_obj_not(nil_or_false)");
+            asm.stack_pop(1);
+            let out_opnd = asm.stack_push(Type::True);
             asm.mov(out_opnd, Qtrue.into());
         },
         Some(true) => {
             // Note: recv_opnd != Type::Nil && recv_opnd != Type::False.
-            asm.comment("rb_obj_not(truthy)");
-            ctx.stack_pop(1);
-            let out_opnd = ctx.stack_push(Type::False);
+            asm_comment!(asm, "rb_obj_not(truthy)");
+            asm.stack_pop(1);
+            let out_opnd = asm.stack_push(Type::False);
             asm.mov(out_opnd, Qfalse.into());
         },
         _ => {
@@ -3986,18 +4928,17 @@ fn jit_rb_obj_not(
 // Codegen for rb_true()
 fn jit_rb_true(
     _jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
     _ci: *const rb_callinfo,
     _cme: *const rb_callable_method_entry_t,
-    _block: Option<IseqPtr>,
+    _block: Option<BlockHandler>,
     _argc: i32,
-    _known_recv_class: *const VALUE,
+    _known_recv_class: Option<VALUE>,
 ) -> bool {
-    asm.comment("nil? == true");
-    ctx.stack_pop(1);
-    let stack_ret = ctx.stack_push(Type::True);
+    asm_comment!(asm, "nil? == true");
+    asm.stack_pop(1);
+    let stack_ret = asm.stack_push(Type::True);
     asm.mov(stack_ret, Qtrue.into());
     true
 }
@@ -4005,104 +4946,692 @@ fn jit_rb_true(
 // Codegen for rb_false()
 fn jit_rb_false(
     _jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
     _ci: *const rb_callinfo,
     _cme: *const rb_callable_method_entry_t,
-    _block: Option<IseqPtr>,
+    _block: Option<BlockHandler>,
     _argc: i32,
-    _known_recv_class: *const VALUE,
+    _known_recv_class: Option<VALUE>,
 ) -> bool {
-    asm.comment("nil? == false");
-    ctx.stack_pop(1);
-    let stack_ret = ctx.stack_push(Type::False);
+    asm_comment!(asm, "nil? == false");
+    asm.stack_pop(1);
+    let stack_ret = asm.stack_push(Type::False);
     asm.mov(stack_ret, Qfalse.into());
     true
 }
 
+/// Codegen for Kernel#is_a?
+fn jit_rb_kernel_is_a(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    _ocb: &mut OutlinedCb,
+    _ci: *const rb_callinfo,
+    _cme: *const rb_callable_method_entry_t,
+    _block: Option<BlockHandler>,
+    argc: i32,
+    known_recv_class: Option<VALUE>,
+) -> bool {
+    if argc != 1 {
+        return false;
+    }
+
+    // If this is a super call we might not know the class
+    if known_recv_class.is_none() {
+        return false;
+    }
+
+    // Important note: The output code will simply `return true/false`.
+    // Correctness follows from:
+    //  - `known_recv_class` implies there is a guard scheduled before here
+    //    for a particular `CLASS_OF(lhs)`.
+    //  - We guard that rhs is identical to the compile-time sample
+    //  - In general, for any two Class instances A, B, `A < B` does not change at runtime.
+    //    Class#superclass is stable.
+
+    let sample_rhs = jit.peek_at_stack(&asm.ctx, 0);
+    let sample_lhs = jit.peek_at_stack(&asm.ctx, 1);
+
+    // We are not allowing module here because the module hierarchy can change at runtime.
+    if !unsafe { RB_TYPE_P(sample_rhs, RUBY_T_CLASS) } {
+        return false;
+    }
+    let sample_is_a = unsafe { rb_obj_is_kind_of(sample_lhs, sample_rhs) == Qtrue };
+
+    asm_comment!(asm, "Kernel#is_a?");
+    asm.cmp(asm.stack_opnd(0), sample_rhs.into());
+    asm.jne(Target::side_exit(Counter::guard_send_is_a_class_mismatch));
+
+    asm.stack_pop(2);
+
+    if sample_is_a {
+        let stack_ret = asm.stack_push(Type::True);
+        asm.mov(stack_ret, Qtrue.into());
+    } else {
+        let stack_ret = asm.stack_push(Type::False);
+        asm.mov(stack_ret, Qfalse.into());
+    }
+    return true;
+}
+
+/// Codegen for Kernel#instance_of?
+fn jit_rb_kernel_instance_of(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    ocb: &mut OutlinedCb,
+    _ci: *const rb_callinfo,
+    _cme: *const rb_callable_method_entry_t,
+    _block: Option<BlockHandler>,
+    argc: i32,
+    known_recv_class: Option<VALUE>,
+) -> bool {
+    if argc != 1 {
+        return false;
+    }
+
+    // If this is a super call we might not know the class
+    if known_recv_class.is_none() {
+        return false;
+    }
+
+    // Important note: The output code will simply `return true/false`.
+    // Correctness follows from:
+    //  - `known_recv_class` implies there is a guard scheduled before here
+    //    for a particular `CLASS_OF(lhs)`.
+    //  - We guard that rhs is identical to the compile-time sample
+    //  - For a particular `CLASS_OF(lhs)`, `rb_obj_class(lhs)` does not change.
+    //    (because for any singleton class `s`, `s.superclass.equal?(s.attached_object.class)`)
+
+    let sample_rhs = jit.peek_at_stack(&asm.ctx, 0);
+    let sample_lhs = jit.peek_at_stack(&asm.ctx, 1);
+
+    // Filters out cases where the C implementation raises
+    if unsafe { !(RB_TYPE_P(sample_rhs, RUBY_T_CLASS) || RB_TYPE_P(sample_rhs, RUBY_T_MODULE)) } {
+        return false;
+    }
+
+    // We need to grab the class here to deal with singleton classes.
+    // Kernel#instance_of? compares against the "real class" of the object
+    // rather than its singleton class.
+    let sample_lhs_real_class = unsafe { rb_obj_class(sample_lhs) };
+
+    let sample_instance_of = sample_lhs_real_class == sample_rhs;
+
+    asm_comment!(asm, "Kernel#instance_of?");
+    asm.cmp(asm.stack_opnd(0), sample_rhs.into());
+    jit_chain_guard(
+        JCC_JNE,
+        jit,
+        asm,
+        ocb,
+        SEND_MAX_DEPTH,
+        Counter::guard_send_instance_of_class_mismatch,
+    );
+
+    asm.stack_pop(2);
+
+    if sample_instance_of {
+        let stack_ret = asm.stack_push(Type::True);
+        asm.mov(stack_ret, Qtrue.into());
+    } else {
+        let stack_ret = asm.stack_push(Type::False);
+        asm.mov(stack_ret, Qfalse.into());
+    }
+    return true;
+}
+
+fn jit_rb_mod_eqq(
+    _jit: &mut JITState,
+    asm: &mut Assembler,
+    _ocb: &mut OutlinedCb,
+    _ci: *const rb_callinfo,
+    _cme: *const rb_callable_method_entry_t,
+    _block: Option<BlockHandler>,
+    argc: i32,
+    _known_recv_class: Option<VALUE>,
+) -> bool {
+    if argc != 1 {
+        return false;
+    }
+
+    asm_comment!(asm, "Module#===");
+    // By being here, we know that the receiver is a T_MODULE or a T_CLASS, because Module#=== can
+    // only live on these objects. With that, we can call rb_obj_is_kind_of() without
+    // jit_prepare_non_leaf_call() or a control frame push because it can't raise, allocate, or call
+    // Ruby methods with these inputs.
+    // Note the difference in approach from Kernel#is_a? because we don't get a free guard for the
+    // right hand side.
+    let rhs = asm.stack_pop(1);
+    let lhs = asm.stack_pop(1); // the module
+    let ret = asm.ccall(rb_obj_is_kind_of as *const u8, vec![rhs, lhs]);
+
+    // Return the result
+    let stack_ret = asm.stack_push(Type::UnknownImm);
+    asm.mov(stack_ret, ret);
+
+    return true;
+}
+
 // Codegen for rb_obj_equal()
 // object identity comparison
 fn jit_rb_obj_equal(
     _jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
     _ci: *const rb_callinfo,
     _cme: *const rb_callable_method_entry_t,
-    _block: Option<IseqPtr>,
+    _block: Option<BlockHandler>,
     _argc: i32,
-    _known_recv_class: *const VALUE,
+    _known_recv_class: Option<VALUE>,
 ) -> bool {
-    asm.comment("equal?");
-    let obj1 = ctx.stack_pop(1);
-    let obj2 = ctx.stack_pop(1);
+    asm_comment!(asm, "equal?");
+    let obj1 = asm.stack_pop(1);
+    let obj2 = asm.stack_pop(1);
 
     asm.cmp(obj1, obj2);
     let ret_opnd = asm.csel_e(Qtrue.into(), Qfalse.into());
 
-    let stack_ret = ctx.stack_push(Type::UnknownImm);
+    let stack_ret = asm.stack_push(Type::UnknownImm);
     asm.mov(stack_ret, ret_opnd);
     true
 }
 
+// Codegen for rb_obj_not_equal()
+// object identity comparison
+fn jit_rb_obj_not_equal(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    ocb: &mut OutlinedCb,
+    _ci: *const rb_callinfo,
+    _cme: *const rb_callable_method_entry_t,
+    _block: Option<BlockHandler>,
+    _argc: i32,
+    _known_recv_class: Option<VALUE>,
+) -> bool {
+    gen_equality_specialized(jit, asm, ocb, false) == Some(true)
+}
+
 // Codegen for rb_int_equal()
 fn jit_rb_int_equal(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
     _ci: *const rb_callinfo,
     _cme: *const rb_callable_method_entry_t,
-    _block: Option<IseqPtr>,
+    _block: Option<BlockHandler>,
     _argc: i32,
-    _known_recv_class: *const VALUE,
+    _known_recv_class: Option<VALUE>,
 ) -> bool {
-    let side_exit = get_side_exit(jit, ocb, ctx);
-
     // Check that both operands are fixnums
-    guard_two_fixnums(jit, ctx, asm, ocb, side_exit);
+    guard_two_fixnums(jit, asm, ocb);
 
     // Compare the arguments
-    asm.comment("rb_int_equal");
-    let arg1 = ctx.stack_pop(1);
-    let arg0 = ctx.stack_pop(1);
+    asm_comment!(asm, "rb_int_equal");
+    let arg1 = asm.stack_pop(1);
+    let arg0 = asm.stack_pop(1);
     asm.cmp(arg0, arg1);
     let ret_opnd = asm.csel_e(Qtrue.into(), Qfalse.into());
 
-    let stack_ret = ctx.stack_push(Type::UnknownImm);
+    let stack_ret = asm.stack_push(Type::UnknownImm);
     asm.mov(stack_ret, ret_opnd);
     true
 }
 
-/// If string is frozen, duplicate it to get a non-frozen string. Otherwise, return it.
-fn jit_rb_str_uplus(
+fn jit_rb_int_succ(
     _jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
     _ci: *const rb_callinfo,
     _cme: *const rb_callable_method_entry_t,
-    _block: Option<IseqPtr>,
+    _block: Option<BlockHandler>,
     _argc: i32,
-    _known_recv_class: *const VALUE,
+    _known_recv_class: Option<VALUE>,
+) -> bool {
+    // Guard the receiver is fixnum
+    let recv_type = asm.ctx.get_opnd_type(StackOpnd(0));
+    let recv = asm.stack_pop(1);
+    if recv_type != Type::Fixnum {
+        asm_comment!(asm, "guard object is fixnum");
+        asm.test(recv, Opnd::Imm(RUBY_FIXNUM_FLAG as i64));
+        asm.jz(Target::side_exit(Counter::opt_succ_not_fixnum));
+    }
+
+    asm_comment!(asm, "Integer#succ");
+    let out_val = asm.add(recv, Opnd::Imm(2)); // 2 is untagged Fixnum 1
+    asm.jo(Target::side_exit(Counter::opt_succ_overflow));
+
+    // Push the output onto the stack
+    let dst = asm.stack_push(Type::Fixnum);
+    asm.mov(dst, out_val);
+
+    true
+}
+
+fn jit_rb_int_div(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    ocb: &mut OutlinedCb,
+    _ci: *const rb_callinfo,
+    _cme: *const rb_callable_method_entry_t,
+    _block: Option<BlockHandler>,
+    _argc: i32,
+    _known_recv_class: Option<VALUE>,
+) -> bool {
+    if asm.ctx.two_fixnums_on_stack(jit) != Some(true) {
+        return false;
+    }
+    guard_two_fixnums(jit, asm, ocb);
+
+    // rb_fix_div_fix may GC-allocate for Bignum
+    jit_prepare_call_with_gc(jit, asm);
+
+    asm_comment!(asm, "Integer#/");
+    let obj = asm.stack_opnd(0);
+    let recv = asm.stack_opnd(1);
+
+    // Check for division by zero (arg0 / 0)
+    asm.cmp(obj, VALUE::fixnum_from_usize(0).as_i64().into());
+    asm.je(Target::side_exit(Counter::opt_div_zero));
+
+    let ret = asm.ccall(rb_fix_div_fix as *const u8, vec![recv, obj]);
+    asm.stack_pop(2); // Keep them during ccall for GC
+
+    let ret_opnd = asm.stack_push(Type::Unknown);
+    asm.mov(ret_opnd, ret);
+    true
+}
+
+fn jit_rb_int_lshift(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    ocb: &mut OutlinedCb,
+    _ci: *const rb_callinfo,
+    _cme: *const rb_callable_method_entry_t,
+    _block: Option<BlockHandler>,
+    _argc: i32,
+    _known_recv_class: Option<VALUE>,
+) -> bool {
+    if asm.ctx.two_fixnums_on_stack(jit) != Some(true) {
+        return false;
+    }
+    guard_two_fixnums(jit, asm, ocb);
+
+    let comptime_shift = jit.peek_at_stack(&asm.ctx, 0);
+
+    if !comptime_shift.fixnum_p() {
+        return false;
+    }
+
+    // Untag the fixnum shift amount
+    let shift_amt = comptime_shift.as_isize() >> 1;
+    if shift_amt > 63 || shift_amt < 0 {
+        return false;
+    }
+
+    // Fallback to a C call if the shift amount varies
+    // This check is needed because the chain guard will side-exit
+    // if its max depth is reached
+    if asm.ctx.get_chain_depth() > 0 {
+        return false;
+    }
+
+    let rhs = asm.stack_pop(1);
+    let lhs = asm.stack_pop(1);
+
+    // Guard on the shift amount we speculated on
+    asm.cmp(rhs, comptime_shift.into());
+    jit_chain_guard(
+        JCC_JNE,
+        jit,
+        asm,
+        ocb,
+        1,
+        Counter::lshift_amount_changed,
+    );
+
+    fixnum_left_shift_body(asm, lhs, shift_amt as u64);
+    true
+}
+
+fn fixnum_left_shift_body(asm: &mut Assembler, lhs: Opnd, shift_amt: u64) {
+    let in_val = asm.sub(lhs, 1.into());
+    let shift_opnd = Opnd::UImm(shift_amt);
+    let out_val = asm.lshift(in_val, shift_opnd);
+    let unshifted = asm.rshift(out_val, shift_opnd);
+
+    // Guard that we did not overflow
+    asm.cmp(unshifted, in_val);
+    asm.jne(Target::side_exit(Counter::lshift_overflow));
+
+    // Re-tag the output value
+    let out_val = asm.add(out_val, 1.into());
+
+    let ret_opnd = asm.stack_push(Type::Fixnum);
+    asm.mov(ret_opnd, out_val);
+}
+
+fn jit_rb_int_rshift(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    ocb: &mut OutlinedCb,
+    _ci: *const rb_callinfo,
+    _cme: *const rb_callable_method_entry_t,
+    _block: Option<BlockHandler>,
+    _argc: i32,
+    _known_recv_class: Option<VALUE>,
+) -> bool {
+    if asm.ctx.two_fixnums_on_stack(jit) != Some(true) {
+        return false;
+    }
+    guard_two_fixnums(jit, asm, ocb);
+
+    let comptime_shift = jit.peek_at_stack(&asm.ctx, 0);
+
+    // Untag the fixnum shift amount
+    let shift_amt = comptime_shift.as_isize() >> 1;
+    if shift_amt > 63 || shift_amt < 0 {
+        return false;
+    }
+
+    // Fallback to a C call if the shift amount varies
+    // This check is needed because the chain guard will side-exit
+    // if its max depth is reached
+    if asm.ctx.get_chain_depth() > 0 {
+        return false;
+    }
+
+    let rhs = asm.stack_pop(1);
+    let lhs = asm.stack_pop(1);
+
+    // Guard on the shift amount we speculated on
+    asm.cmp(rhs, comptime_shift.into());
+    jit_chain_guard(
+        JCC_JNE,
+        jit,
+        asm,
+        ocb,
+        1,
+        Counter::rshift_amount_changed,
+    );
+
+    let shift_opnd = Opnd::UImm(shift_amt as u64);
+    let out_val = asm.rshift(lhs, shift_opnd);
+    let out_val = asm.or(out_val, 1.into());
+
+    let ret_opnd = asm.stack_push(Type::Fixnum);
+    asm.mov(ret_opnd, out_val);
+    true
+}
+
+fn jit_rb_int_xor(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    ocb: &mut OutlinedCb,
+    _ci: *const rb_callinfo,
+    _cme: *const rb_callable_method_entry_t,
+    _block: Option<BlockHandler>,
+    _argc: i32,
+    _known_recv_class: Option<VALUE>,
+) -> bool {
+    if asm.ctx.two_fixnums_on_stack(jit) != Some(true) {
+        return false;
+    }
+    guard_two_fixnums(jit, asm, ocb);
+
+    let rhs = asm.stack_pop(1);
+    let lhs = asm.stack_pop(1);
+
+    // XOR and then re-tag the resulting fixnum
+    let out_val = asm.xor(lhs, rhs);
+    let out_val = asm.or(out_val, 1.into());
+
+    let ret_opnd = asm.stack_push(Type::Fixnum);
+    asm.mov(ret_opnd, out_val);
+    true
+}
+
+fn jit_rb_int_aref(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    ocb: &mut OutlinedCb,
+    _ci: *const rb_callinfo,
+    _cme: *const rb_callable_method_entry_t,
+    _block: Option<BlockHandler>,
+    argc: i32,
+    _known_recv_class: Option<VALUE>,
+) -> bool {
+    if argc != 1 {
+        return false;
+    }
+    if asm.ctx.two_fixnums_on_stack(jit) != Some(true) {
+        return false;
+    }
+    guard_two_fixnums(jit, asm, ocb);
+
+    asm_comment!(asm, "Integer#[]");
+    let obj = asm.stack_pop(1);
+    let recv = asm.stack_pop(1);
+
+    let ret = asm.ccall(rb_fix_aref as *const u8, vec![recv, obj]);
+
+    let ret_opnd = asm.stack_push(Type::Fixnum);
+    asm.mov(ret_opnd, ret);
+    true
+}
+
+fn jit_rb_float_plus(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    ocb: &mut OutlinedCb,
+    _ci: *const rb_callinfo,
+    _cme: *const rb_callable_method_entry_t,
+    _block: Option<BlockHandler>,
+    _argc: i32,
+    _known_recv_class: Option<VALUE>,
+) -> bool {
+    // Guard obj is Fixnum or Flonum to avoid rb_funcall on rb_num_coerce_bin
+    let comptime_obj = jit.peek_at_stack(&asm.ctx, 0);
+    if comptime_obj.fixnum_p() || comptime_obj.flonum_p() {
+        let obj = asm.stack_opnd(0);
+        jit_guard_known_klass(
+            jit,
+            asm,
+            ocb,
+            comptime_obj.class_of(),
+            obj,
+            obj.into(),
+            comptime_obj,
+            SEND_MAX_DEPTH,
+            Counter::guard_send_not_fixnum_or_flonum,
+        );
+    } else {
+        return false;
+    }
+
+    // Save the PC and SP because the callee may allocate Float on heap
+    jit_prepare_call_with_gc(jit, asm);
+
+    asm_comment!(asm, "Float#+");
+    let obj = asm.stack_opnd(0);
+    let recv = asm.stack_opnd(1);
+
+    let ret = asm.ccall(rb_float_plus as *const u8, vec![recv, obj]);
+    asm.stack_pop(2); // Keep recv during ccall for GC
+
+    let ret_opnd = asm.stack_push(Type::Unknown); // Flonum or heap Float
+    asm.mov(ret_opnd, ret);
+    true
+}
+
+fn jit_rb_float_minus(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    ocb: &mut OutlinedCb,
+    _ci: *const rb_callinfo,
+    _cme: *const rb_callable_method_entry_t,
+    _block: Option<BlockHandler>,
+    _argc: i32,
+    _known_recv_class: Option<VALUE>,
+) -> bool {
+    // Guard obj is Fixnum or Flonum to avoid rb_funcall on rb_num_coerce_bin
+    let comptime_obj = jit.peek_at_stack(&asm.ctx, 0);
+    if comptime_obj.fixnum_p() || comptime_obj.flonum_p() {
+        let obj = asm.stack_opnd(0);
+        jit_guard_known_klass(
+            jit,
+            asm,
+            ocb,
+            comptime_obj.class_of(),
+            obj,
+            obj.into(),
+            comptime_obj,
+            SEND_MAX_DEPTH,
+            Counter::guard_send_not_fixnum_or_flonum,
+        );
+    } else {
+        return false;
+    }
+
+    // Save the PC and SP because the callee may allocate Float on heap
+    jit_prepare_call_with_gc(jit, asm);
+
+    asm_comment!(asm, "Float#-");
+    let obj = asm.stack_opnd(0);
+    let recv = asm.stack_opnd(1);
+
+    let ret = asm.ccall(rb_float_minus as *const u8, vec![recv, obj]);
+    asm.stack_pop(2); // Keep recv during ccall for GC
+
+    let ret_opnd = asm.stack_push(Type::Unknown); // Flonum or heap Float
+    asm.mov(ret_opnd, ret);
+    true
+}
+
+fn jit_rb_float_mul(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    ocb: &mut OutlinedCb,
+    _ci: *const rb_callinfo,
+    _cme: *const rb_callable_method_entry_t,
+    _block: Option<BlockHandler>,
+    _argc: i32,
+    _known_recv_class: Option<VALUE>,
+) -> bool {
+    // Guard obj is Fixnum or Flonum to avoid rb_funcall on rb_num_coerce_bin
+    let comptime_obj = jit.peek_at_stack(&asm.ctx, 0);
+    if comptime_obj.fixnum_p() || comptime_obj.flonum_p() {
+        let obj = asm.stack_opnd(0);
+        jit_guard_known_klass(
+            jit,
+            asm,
+            ocb,
+            comptime_obj.class_of(),
+            obj,
+            obj.into(),
+            comptime_obj,
+            SEND_MAX_DEPTH,
+            Counter::guard_send_not_fixnum_or_flonum,
+        );
+    } else {
+        return false;
+    }
+
+    // Save the PC and SP because the callee may allocate Float on heap
+    jit_prepare_call_with_gc(jit, asm);
+
+    asm_comment!(asm, "Float#*");
+    let obj = asm.stack_opnd(0);
+    let recv = asm.stack_opnd(1);
+
+    let ret = asm.ccall(rb_float_mul as *const u8, vec![recv, obj]);
+    asm.stack_pop(2); // Keep recv during ccall for GC
+
+    let ret_opnd = asm.stack_push(Type::Unknown); // Flonum or heap Float
+    asm.mov(ret_opnd, ret);
+    true
+}
+
+fn jit_rb_float_div(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    ocb: &mut OutlinedCb,
+    _ci: *const rb_callinfo,
+    _cme: *const rb_callable_method_entry_t,
+    _block: Option<BlockHandler>,
+    _argc: i32,
+    _known_recv_class: Option<VALUE>,
+) -> bool {
+    // Guard obj is Fixnum or Flonum to avoid rb_funcall on rb_num_coerce_bin
+    let comptime_obj = jit.peek_at_stack(&asm.ctx, 0);
+    if comptime_obj.fixnum_p() || comptime_obj.flonum_p() {
+        let obj = asm.stack_opnd(0);
+        jit_guard_known_klass(
+            jit,
+            asm,
+            ocb,
+            comptime_obj.class_of(),
+            obj,
+            obj.into(),
+            comptime_obj,
+            SEND_MAX_DEPTH,
+            Counter::guard_send_not_fixnum_or_flonum,
+        );
+    } else {
+        return false;
+    }
+
+    // Save the PC and SP because the callee may allocate Float on heap
+    jit_prepare_call_with_gc(jit, asm);
+
+    asm_comment!(asm, "Float#/");
+    let obj = asm.stack_opnd(0);
+    let recv = asm.stack_opnd(1);
+
+    let ret = asm.ccall(rb_float_div as *const u8, vec![recv, obj]);
+    asm.stack_pop(2); // Keep recv during ccall for GC
+
+    let ret_opnd = asm.stack_push(Type::Unknown); // Flonum or heap Float
+    asm.mov(ret_opnd, ret);
+    true
+}
+
+/// If string is frozen, duplicate it to get a non-frozen string. Otherwise, return it.
+fn jit_rb_str_uplus(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    _ocb: &mut OutlinedCb,
+    _ci: *const rb_callinfo,
+    _cme: *const rb_callable_method_entry_t,
+    _block: Option<BlockHandler>,
+    argc: i32,
+    _known_recv_class: Option<VALUE>,
 ) -> bool
 {
-    asm.comment("Unary plus on string");
-    let recv_opnd = asm.load(ctx.stack_pop(1));
+    if argc != 0 {
+        return false;
+    }
+
+    // We allocate when we dup the string
+    jit_prepare_call_with_gc(jit, asm);
+    asm.spill_temps(); // For ccall. Unconditionally spill them for RegTemps consistency.
+
+    asm_comment!(asm, "Unary plus on string");
+    let recv_opnd = asm.stack_pop(1);
+    let recv_opnd = asm.load(recv_opnd);
     let flags_opnd = asm.load(Opnd::mem(64, recv_opnd, RUBY_OFFSET_RBASIC_FLAGS));
-    asm.test(flags_opnd, Opnd::Imm(RUBY_FL_FREEZE as i64));
+    asm.test(flags_opnd, Opnd::Imm(RUBY_FL_FREEZE as i64 | RSTRING_CHILLED as i64));
 
     let ret_label = asm.new_label("stack_ret");
 
-    // We guard for the receiver being a ::String, so the return value is too
-    let stack_ret = ctx.stack_push(Type::CString);
+    // String#+@ can only exist on T_STRING
+    let stack_ret = asm.stack_push(Type::TString);
 
     // If the string isn't frozen, we just return it.
     asm.mov(stack_ret, recv_opnd);
     asm.jz(ret_label);
 
     // Str is frozen - duplicate it
+    asm.spill_temps(); // for ccall
     let ret_opnd = asm.ccall(rb_str_dup as *const u8, vec![recv_opnd]);
     asm.mov(stack_ret, ret_opnd);
 
@@ -4111,23 +5640,203 @@ fn jit_rb_str_uplus(
     true
 }
 
+fn jit_rb_str_length(
+    _jit: &mut JITState,
+    asm: &mut Assembler,
+    _ocb: &mut OutlinedCb,
+    _ci: *const rb_callinfo,
+    _cme: *const rb_callable_method_entry_t,
+    _block: Option<BlockHandler>,
+    _argc: i32,
+    _known_recv_class: Option<VALUE>,
+) -> bool {
+    asm_comment!(asm, "String#length");
+    extern "C" {
+        fn rb_str_length(str: VALUE) -> VALUE;
+    }
+
+    // This function cannot allocate or raise exceptions
+    let recv = asm.stack_opnd(0);
+    let ret_opnd = asm.ccall(rb_str_length as *const u8, vec![recv]);
+    asm.stack_pop(1); // Keep recv on stack during ccall for GC
+
+    // Should be guaranteed to be a fixnum on 64-bit systems
+    let out_opnd = asm.stack_push(Type::Fixnum);
+    asm.mov(out_opnd, ret_opnd);
+
+    true
+}
+
 fn jit_rb_str_bytesize(
     _jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
     _ci: *const rb_callinfo,
     _cme: *const rb_callable_method_entry_t,
-    _block: Option<IseqPtr>,
+    _block: Option<BlockHandler>,
     _argc: i32,
-    _known_recv_class: *const VALUE,
+    _known_recv_class: Option<VALUE>,
+) -> bool {
+    asm_comment!(asm, "String#bytesize");
+
+    let recv = asm.stack_pop(1);
+
+    asm_comment!(asm, "get string length");
+    let str_len_opnd = Opnd::mem(
+        std::os::raw::c_long::BITS as u8,
+        asm.load(recv),
+        RUBY_OFFSET_RSTRING_LEN as i32,
+    );
+
+    let len = asm.load(str_len_opnd);
+    let shifted_val = asm.lshift(len, Opnd::UImm(1));
+    let out_val = asm.or(shifted_val, Opnd::UImm(RUBY_FIXNUM_FLAG as u64));
+
+    let out_opnd = asm.stack_push(Type::Fixnum);
+
+    asm.mov(out_opnd, out_val);
+
+    true
+}
+
+fn jit_rb_str_byteslice(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    _ocb: &mut OutlinedCb,
+    _ci: *const rb_callinfo,
+    cme: *const rb_callable_method_entry_t,
+    _block: Option<BlockHandler>,
+    argc: i32,
+    _known_recv_class: Option<VALUE>,
 ) -> bool {
-    asm.comment("String#bytesize");
+    if argc != 2 {
+        return false
+    }
+
+    // rb_str_byte_substr should be leaf if indexes are fixnums
+    match (asm.ctx.get_opnd_type(StackOpnd(0)), asm.ctx.get_opnd_type(StackOpnd(1))) {
+        (Type::Fixnum, Type::Fixnum) => {},
+        // Raises when non-integers are passed in, which requires the method frame
+        // to be pushed for the backtrace
+        _ => if !jit_prepare_lazy_frame_call(jit, asm, cme, StackOpnd(2)) {
+            return false;
+        }
+    }
+    asm_comment!(asm, "String#byteslice");
+
+    // rb_str_byte_substr allocates a substring
+    jit_prepare_call_with_gc(jit, asm);
+
+    // Get stack operands after potential SP change
+    let len = asm.stack_opnd(0);
+    let beg = asm.stack_opnd(1);
+    let recv = asm.stack_opnd(2);
 
-    let recv = ctx.stack_pop(1);
-    let ret_opnd = asm.ccall(rb_str_bytesize as *const u8, vec![recv]);
+    let ret_opnd = asm.ccall(rb_str_byte_substr as *const u8, vec![recv, beg, len]);
+    asm.stack_pop(3);
 
-    let out_opnd = ctx.stack_push(Type::Fixnum);
+    let out_opnd = asm.stack_push(Type::Unknown);
+    asm.mov(out_opnd, ret_opnd);
+
+    true
+}
+
+fn jit_rb_str_getbyte(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    ocb: &mut OutlinedCb,
+    _ci: *const rb_callinfo,
+    _cme: *const rb_callable_method_entry_t,
+    _block: Option<BlockHandler>,
+    _argc: i32,
+    _known_recv_class: Option<VALUE>,
+) -> bool {
+    asm_comment!(asm, "String#getbyte");
+
+    // Don't pop since we may bail
+    let idx = asm.stack_opnd(0);
+    let recv = asm.stack_opnd(1);
+
+    let comptime_idx = jit.peek_at_stack(&asm.ctx, 0);
+    if comptime_idx.fixnum_p(){
+        jit_guard_known_klass(
+            jit,
+            asm,
+            ocb,
+            comptime_idx.class_of(),
+            idx,
+            idx.into(),
+            comptime_idx,
+            SEND_MAX_DEPTH,
+            Counter::getbyte_idx_not_fixnum,
+        );
+    } else {
+        return false;
+    }
+
+    // Untag the index
+    let idx = asm.rshift(idx, Opnd::UImm(1));
+
+    // If index is negative, exit
+    asm.cmp(idx, Opnd::UImm(0));
+    asm.jl(Target::side_exit(Counter::getbyte_idx_negative));
+
+    asm_comment!(asm, "get string length");
+    let recv = asm.load(recv);
+    let str_len_opnd = Opnd::mem(
+        std::os::raw::c_long::BITS as u8,
+        asm.load(recv),
+        RUBY_OFFSET_RSTRING_LEN as i32,
+    );
+
+    // Exit if the index is out of bounds
+    asm.cmp(idx, str_len_opnd);
+    asm.jge(Target::side_exit(Counter::getbyte_idx_out_of_bounds));
+
+    let str_ptr = get_string_ptr(asm, recv);
+    // FIXME: could use SIB indexing here with proper support in backend
+    let str_ptr = asm.add(str_ptr, idx);
+    let byte = asm.load(Opnd::mem(8, str_ptr, 0));
+
+    // Zero-extend the byte to 64 bits
+    let byte = byte.with_num_bits(64).unwrap();
+    let byte = asm.and(byte, 0xFF.into());
+
+    // Tag the byte
+    let byte = asm.lshift(byte, Opnd::UImm(1));
+    let byte = asm.or(byte, Opnd::UImm(1));
+
+    asm.stack_pop(2); // Popped only now: the guards above may bail and need both operands on the stack
+    let out_opnd = asm.stack_push(Type::Fixnum);
+    asm.mov(out_opnd, byte);
+
+    true
+}
+
+fn jit_rb_str_setbyte(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    _ocb: &mut OutlinedCb,
+    _ci: *const rb_callinfo,
+    cme: *const rb_callable_method_entry_t,
+    _block: Option<BlockHandler>,
+    _argc: i32,
+    _known_recv_class: Option<VALUE>,
+) -> bool {
+    // Raises when index is out of range. Lazily push a frame in that case.
+    if !jit_prepare_lazy_frame_call(jit, asm, cme, StackOpnd(2)) {
+        return false;
+    }
+    asm_comment!(asm, "String#setbyte");
+
+    let value = asm.stack_opnd(0);
+    let index = asm.stack_opnd(1);
+    let recv = asm.stack_opnd(2);
+
+    let ret_opnd = asm.ccall(rb_str_setbyte as *const u8, vec![recv, index, value]);
+    asm.stack_pop(3); // Keep them on stack during ccall for GC
+
+    let out_opnd = asm.stack_push(Type::UnknownImm);
     asm.mov(out_opnd, ret_opnd);
 
     true
@@ -4139,17 +5848,16 @@ fn jit_rb_str_bytesize(
 // this situation happens a lot in some workloads.
 fn jit_rb_str_to_s(
     _jit: &mut JITState,
-    _ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
     _ci: *const rb_callinfo,
     _cme: *const rb_callable_method_entry_t,
-    _block: Option<IseqPtr>,
+    _block: Option<BlockHandler>,
     _argc: i32,
-    known_recv_class: *const VALUE,
+    known_recv_class: Option<VALUE>,
 ) -> bool {
-    if !known_recv_class.is_null() && unsafe { *known_recv_class == rb_cString } {
-        asm.comment("to_s on plain string");
+    if unsafe { known_recv_class == Some(rb_cString) } {
+        asm_comment!(asm, "to_s on plain string");
         // The method returns the receiver, which is already on the stack.
         // No stack movement.
         return true;
@@ -4157,34 +5865,29 @@ fn jit_rb_str_to_s(
     false
 }
 
-// Codegen for rb_str_empty()
-fn jit_rb_str_empty(
+// Codegen for rb_str_empty_p()
+fn jit_rb_str_empty_p(
     _jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
     _ci: *const rb_callinfo,
     _cme: *const rb_callable_method_entry_t,
-    _block: Option<IseqPtr>,
+    _block: Option<BlockHandler>,
     _argc: i32,
-    _known_recv_class: *const VALUE,
+    _known_recv_class: Option<VALUE>,
 ) -> bool {
-    const _: () = assert!(
-        RUBY_OFFSET_RSTRING_AS_HEAP_LEN == RUBY_OFFSET_RSTRING_EMBED_LEN,
-        "same offset to len embedded or not so we can use one code path to read the length",
-    );
-
-    let recv_opnd = ctx.stack_pop(1);
-    let out_opnd = ctx.stack_push(Type::UnknownImm);
+    let recv_opnd = asm.stack_pop(1);
 
+    asm_comment!(asm, "get string length");
     let str_len_opnd = Opnd::mem(
-        (8 * size_of::<std::os::raw::c_long>()) as u8,
+        std::os::raw::c_long::BITS as u8,
         asm.load(recv_opnd),
-        RUBY_OFFSET_RSTRING_AS_HEAP_LEN as i32,
+        RUBY_OFFSET_RSTRING_LEN as i32,
     );
 
     asm.cmp(str_len_opnd, Opnd::UImm(0));
     let string_empty = asm.csel_e(Qtrue.into(), Qfalse.into());
+    let out_opnd = asm.stack_push(Type::UnknownImm);
     asm.mov(out_opnd, string_empty);
 
     return true;
@@ -4195,49 +5898,38 @@ fn jit_rb_str_empty(
 // This is common in Erb and similar templating languages.
 fn jit_rb_str_concat(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
-    ocb: &mut OutlinedCb,
+    _ocb: &mut OutlinedCb,
     _ci: *const rb_callinfo,
     _cme: *const rb_callable_method_entry_t,
-    _block: Option<IseqPtr>,
+    _block: Option<BlockHandler>,
     _argc: i32,
-    _known_recv_class: *const VALUE,
+    _known_recv_class: Option<VALUE>,
 ) -> bool {
     // The << operator can accept integer codepoints for characters
     // as the argument. We only specially optimise string arguments.
     // If the peeked-at compile time argument is something other than
     // a string, assume it won't be a string later either.
-    let comptime_arg = jit_peek_at_stack(jit, ctx, 0);
+    let comptime_arg = jit.peek_at_stack(&asm.ctx, 0);
     if ! unsafe { RB_TYPE_P(comptime_arg, RUBY_T_STRING) } {
         return false;
     }
 
-    // Generate a side exit
-    let side_exit = get_side_exit(jit, ocb, ctx);
-
-    // Guard that the argument is of class String at runtime.
-    let arg_type = ctx.get_opnd_type(StackOpnd(0));
+    // Guard that the concat argument is a string
+    guard_object_is_string(asm, asm.stack_opnd(0), StackOpnd(0), Counter::guard_send_not_string);
 
-    let concat_arg = ctx.stack_pop(1);
-    let recv = ctx.stack_pop(1);
+    // Guard buffers from GC since rb_str_buf_append may allocate.
+    // rb_str_buf_append may raise Encoding::CompatibilityError, but we accept compromised
+    // backtraces on this method since the interpreter does the same thing on opt_ltlt.
+    jit_prepare_non_leaf_call(jit, asm);
+    asm.spill_temps(); // For ccall. Unconditionally spill them for RegTemps consistency.
 
-    // If we're not compile-time certain that this will always be a string, guard at runtime
-    if arg_type != Type::CString && arg_type != Type::TString {
-        let arg_opnd = asm.load(concat_arg);
-        if !arg_type.is_heap() {
-            asm.comment("guard arg not immediate");
-            asm.test(arg_opnd, (RUBY_IMMEDIATE_MASK as u64).into());
-            asm.jnz(side_exit);
-            asm.cmp(arg_opnd, Qfalse.into());
-            asm.je(side_exit);
-        }
-        guard_object_is_string(asm, arg_opnd, side_exit);
-    }
+    let concat_arg = asm.stack_pop(1);
+    let recv = asm.stack_pop(1);
 
     // Test if string encodings differ. If different, use rb_str_append. If the same,
     // use rb_yjit_str_simple_append, which calls rb_str_cat.
-    asm.comment("<< on strings");
+    asm_comment!(asm, "<< on strings");
 
     // Take receiver's object flags XOR arg's flags. If any
     // string-encoding flags are different between the two,
@@ -4250,21 +5942,22 @@ fn jit_rb_str_concat(
     );
     asm.test(flags_xor, Opnd::UImm(RUBY_ENCODING_MASK as u64));
 
-    // Push once, use the resulting operand in both branches below.
-    let stack_ret = ctx.stack_push(Type::CString);
-
     let enc_mismatch = asm.new_label("enc_mismatch");
     asm.jnz(enc_mismatch);
 
     // If encodings match, call the simple append function and jump to return
     let ret_opnd = asm.ccall(rb_yjit_str_simple_append as *const u8, vec![recv, concat_arg]);
     let ret_label = asm.new_label("func_return");
+    let stack_ret = asm.stack_push(Type::TString);
     asm.mov(stack_ret, ret_opnd);
+    asm.stack_pop(1); // forget stack_ret to re-push after ccall
     asm.jmp(ret_label);
 
     // If encodings are different, use a slower encoding-aware concatenate
     asm.write_label(enc_mismatch);
+    asm.spill_temps(); // Ignore the register for the other local branch
     let ret_opnd = asm.ccall(rb_str_buf_append as *const u8, vec![recv, concat_arg]);
+    let stack_ret = asm.stack_push(Type::TString);
     asm.mov(stack_ret, ret_opnd);
     // Drop through to return
 
@@ -4273,30 +5966,125 @@ fn jit_rb_str_concat(
     true
 }
 
+// Codegen for rb_ary_empty_p(): inlines Array#empty? by testing the array length for zero.
+fn jit_rb_ary_empty_p(
+    _jit: &mut JITState,
+    asm: &mut Assembler,
+    _ocb: &mut OutlinedCb,
+    _ci: *const rb_callinfo,
+    _cme: *const rb_callable_method_entry_t,
+    _block: Option<BlockHandler>,
+    _argc: i32,
+    _known_recv_class: Option<VALUE>,
+) -> bool {
+    let array_opnd = asm.stack_pop(1); // the array operand is consumed; only its length is used
+    let array_reg = asm.load(array_opnd);
+    let len_opnd = get_array_len(asm, array_reg);
+
+    asm.test(len_opnd, len_opnd); // zero flag is set exactly when the length is 0
+    let bool_val = asm.csel_z(Qtrue.into(), Qfalse.into());
+
+    let out_opnd = asm.stack_push(Type::UnknownImm);
+    asm.store(out_opnd, bool_val);
+
+    true // specialized code was emitted
+}
+
+// Codegen for rb_ary_length(): reads the array length inline and pushes it as a tagged fixnum.
+fn jit_rb_ary_length(
+    _jit: &mut JITState,
+    asm: &mut Assembler,
+    _ocb: &mut OutlinedCb,
+    _ci: *const rb_callinfo,
+    _cme: *const rb_callable_method_entry_t,
+    _block: Option<BlockHandler>,
+    _argc: i32,
+    _known_recv_class: Option<VALUE>,
+) -> bool {
+    let array_opnd = asm.stack_pop(1); // the array operand is consumed; only its length is used
+    let array_reg = asm.load(array_opnd);
+    let len_opnd = get_array_len(asm, array_reg);
+
+    // Convert the length to a fixnum: shift left by one bit, then set RUBY_FIXNUM_FLAG
+    let shifted_val = asm.lshift(len_opnd, Opnd::UImm(1));
+    let out_val = asm.or(shifted_val, Opnd::UImm(RUBY_FIXNUM_FLAG as u64));
+
+    let out_opnd = asm.stack_push(Type::Fixnum);
+    asm.store(out_opnd, out_val);
+
+    true // specialized code was emitted
+}
+
+// Codegen for Array#<<: calls rb_ary_push(ary, item) and pushes the call's return value.
+fn jit_rb_ary_push(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    _ocb: &mut OutlinedCb,
+    _ci: *const rb_callinfo,
+    _cme: *const rb_callable_method_entry_t,
+    _block: Option<BlockHandler>,
+    _argc: i32,
+    _known_recv_class: Option<VALUE>,
+) -> bool {
+    asm_comment!(asm, "Array#<<");
+
+    // rb_ary_push allocates memory for buffer extension and can raise FrozenError. Not using a
+    // lazy frame here since the interpreter also has a truncated stack trace from opt_ltlt.
+    jit_prepare_non_leaf_call(jit, asm);
+
+    let item_opnd = asm.stack_opnd(0); // top of stack: passed as the second C argument
+    let ary_opnd = asm.stack_opnd(1); // one below the top: passed as the first C argument
+    let ret = asm.ccall(rb_ary_push as *const u8, vec![ary_opnd, item_opnd]);
+    asm.stack_pop(2); // Keep them on stack during ccall for GC
+
+    let ret_opnd = asm.stack_push(Type::TArray); // the result slot is typed as an array
+    asm.mov(ret_opnd, ret);
+    true
+}
+
+// Codegen for Hash#empty?: calls rb_hash_empty_p() on the popped operand and pushes the result.
+// Just a leaf method, but not using `Primitive.attr! :leaf` since BOP methods can't use it.
+fn jit_rb_hash_empty_p(
+    _jit: &mut JITState,
+    asm: &mut Assembler,
+    _ocb: &mut OutlinedCb,
+    _ci: *const rb_callinfo,
+    _cme: *const rb_callable_method_entry_t,
+    _block: Option<BlockHandler>,
+    _argc: i32,
+    _known_recv_class: Option<VALUE>,
+) -> bool {
+    asm_comment!(asm, "Hash#empty?");
+    let hash_opnd = asm.stack_pop(1); // popped before the ccall; no jit_prepare_* call is made here
+    let ret = asm.ccall(rb_hash_empty_p as *const u8, vec![hash_opnd]);
+
+    let ret_opnd = asm.stack_push(Type::UnknownImm);
+    asm.mov(ret_opnd, ret);
+    true
+}
+
 fn jit_obj_respond_to(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
     _ci: *const rb_callinfo,
     _cme: *const rb_callable_method_entry_t,
-    _block: Option<IseqPtr>,
+    _block: Option<BlockHandler>,
     argc: i32,
-    known_recv_class: *const VALUE,
+    known_recv_class: Option<VALUE>,
 ) -> bool {
     // respond_to(:sym) or respond_to(:sym, true)
     if argc != 1 && argc != 2 {
         return false;
     }
 
-    if known_recv_class.is_null() {
-        return false;
-    }
-
-    let recv_class = unsafe { *known_recv_class };
+    let recv_class = match known_recv_class {
+        Some(class) => class,
+        None => return false,
+    };
 
     // Get the method_id from compile time. We will later add a guard against it.
-    let mid_sym = jit_peek_at_stack(jit, ctx, (argc - 1) as isize);
+    let mid_sym = jit.peek_at_stack(&asm.ctx, (argc - 1) as isize);
     if !mid_sym.static_sym_p() {
         return false
     }
@@ -4308,7 +6096,7 @@ fn jit_obj_respond_to(
         Some(false)
     } else {
         // Get value from type information (may or may not be known)
-        ctx.get_opnd_type(StackOpnd(0)).known_truthy()
+        asm.ctx.get_opnd_type(StackOpnd(0)).known_truthy()
     };
 
     let target_cme = unsafe { rb_callable_method_entry_or_negative(recv_class, mid) };
@@ -4329,59 +6117,167 @@ fn jit_obj_respond_to(
     };
 
     let result = match (visibility, allow_priv) {
-        (METHOD_VISI_UNDEF, _) => Qfalse, // No method => false
-        (METHOD_VISI_PUBLIC, _) => Qtrue, // Public method => true regardless of include_all
-        (_, Some(true)) => Qtrue, // include_all => always true
+        (METHOD_VISI_UNDEF, _) => {
+            // No method, we can return false given respond_to_missing? hasn't been overridden.
+            // In the future, we might want to jit the call to respond_to_missing?
+            if !assume_method_basic_definition(jit, asm, ocb, recv_class, ID!(respond_to_missing)) {
+                return false;
+            }
+            Qfalse
+        }
+        (METHOD_VISI_PUBLIC, _) | // Public method => fine regardless of include_all
+        (_, Some(true)) => { // include_all => all visibility are acceptable
+            // Method exists and has acceptable visibility
+            if cme_def_type == VM_METHOD_TYPE_NOTIMPLEMENTED {
+                // C method with rb_f_notimplement(). `respond_to?` returns false
+                // without consulting `respond_to_missing?`. See also: rb_add_method_cfunc()
+                Qfalse
+            } else {
+                Qtrue
+            }
+        }
         (_, _) => return false // not public and include_all not known, can't compile
     };
 
-    if result != Qtrue {
-        // Only if respond_to_missing? hasn't been overridden
-        // In the future, we might want to jit the call to respond_to_missing?
-        if !assume_method_basic_definition(jit, ocb, recv_class, idRespond_to_missing.into()) {
-            return false;
-        }
-    }
-
     // Invalidate this block if method lookup changes for the method being queried. This works
     // both for the case where a method does or does not exist, as for the latter we asked for a
     // "negative CME" earlier.
-    assume_method_lookup_stable(jit, ocb, target_cme);
-
-    // Generate a side exit
-    let side_exit = get_side_exit(jit, ocb, ctx);
+    jit.assume_method_lookup_stable(asm, ocb, target_cme);
 
     if argc == 2 {
         // pop include_all argument (we only use its type info)
-        ctx.stack_pop(1);
+        asm.stack_pop(1);
     }
 
-    let sym_opnd = ctx.stack_pop(1);
-    let _recv_opnd = ctx.stack_pop(1);
+    let sym_opnd = asm.stack_pop(1);
+    let _recv_opnd = asm.stack_pop(1);
 
     // This is necessary because we have no guarantee that sym_opnd is a constant
-    asm.comment("guard known mid");
+    asm_comment!(asm, "guard known mid");
     asm.cmp(sym_opnd, mid_sym.into());
-    asm.jne(side_exit);
+    jit_chain_guard(
+        JCC_JNE,
+        jit,
+        asm,
+        ocb,
+        SEND_MAX_DEPTH,
+        Counter::guard_send_respond_to_mid_mismatch,
+    );
 
-    jit_putobject(jit, ctx, asm, result);
+    jit_putobject(asm, result);
+
+    true
+}
+
+// Codegen for block_given?: replaces the top stack slot with Qtrue or Qfalse via gen_block_given().
+fn jit_rb_f_block_given_p(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    _ocb: &mut OutlinedCb,
+    _ci: *const rb_callinfo,
+    _cme: *const rb_callable_method_entry_t,
+    _block: Option<BlockHandler>,
+    _argc: i32,
+    _known_recv_class: Option<VALUE>,
+) -> bool {
+    asm.stack_pop(1); // discard the top slot (presumably the receiver; its value is not read)
+    let out_opnd = asm.stack_push(Type::UnknownImm); // slot that receives the boolean result
+
+    gen_block_given(jit, asm, out_opnd, Qtrue.into(), Qfalse.into());
+    true
+}
+
+// Emits code that writes `true_opnd` or `false_opnd` to `out_opnd` depending on the block handler.
+fn gen_block_given(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    out_opnd: Opnd, // destination of the selected value
+    true_opnd: Opnd, // written when the block handler is not VM_BLOCK_HANDLER_NONE
+    false_opnd: Opnd, // written when the block handler is VM_BLOCK_HANDLER_NONE
+) {
+    asm_comment!(asm, "block_given?");
+    // Same as rb_vm_frame_block_handler: read the specval slot of the EP from gen_get_lep()
+    let ep_opnd = gen_get_lep(jit, asm);
+    let block_handler = asm.load(
+        Opnd::mem(64, ep_opnd, SIZEOF_VALUE_I32 * VM_ENV_DATA_INDEX_SPECVAL)
+    );
+
+    // Select `true_opnd` when `block_handler != VM_BLOCK_HANDLER_NONE`, else `false_opnd`
+    asm.cmp(block_handler, VM_BLOCK_HANDLER_NONE.into());
+    let block_given = asm.csel_ne(true_opnd, false_opnd);
+    asm.mov(out_opnd, block_given);
+}
+
+// Codegen for rb_class_superclass(): calls the C function on the receiver and pushes its result.
+fn jit_rb_class_superclass(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    _ocb: &mut OutlinedCb,
+    _ci: *const rb_callinfo,
+    cme: *const rb_callable_method_entry_t,
+    _block: Option<crate::codegen::BlockHandler>,
+    _argc: i32,
+    _known_recv_class: Option<VALUE>,
+) -> bool {
+    extern "C" { // C function called by the generated code below
+        fn rb_class_superclass(klass: VALUE) -> VALUE;
+    }
+
+    if !jit_prepare_lazy_frame_call(jit, asm, cme, StackOpnd(0)) { // StackOpnd(0) is the receiver slot
+        return false; // lazy frame setup was declined, so no specialized code is emitted
+    }
+
+    asm_comment!(asm, "Class#superclass");
+    let recv_opnd = asm.stack_opnd(0); // read without popping
+    let ret = asm.ccall(rb_class_superclass as *const u8, vec![recv_opnd]);
+
+    asm.stack_pop(1); // the receiver is popped only after the ccall has been emitted
+    let ret_opnd = asm.stack_push(Type::Unknown); // no type information is recorded for the result
+    asm.mov(ret_opnd, ret);
+
+    true
+}
+
+// Codegen for `===` when the ID!(eq) method can be assumed to be rb_obj_equal: compares operands inline.
+fn jit_rb_case_equal(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    ocb: &mut OutlinedCb,
+    _ci: *const rb_callinfo,
+    _cme: *const rb_callable_method_entry_t,
+    _block: Option<BlockHandler>,
+    _argc: i32,
+    known_recv_class: Option<VALUE>,
+) -> bool {
+    if !known_recv_class.map_or(false, |klass| jit.assume_expected_cfunc(asm, ocb, klass, ID!(eq), rb_obj_equal as _)) {
+        return false; // receiver class unknown (no unwrap panic) or the assumption was rejected
+    }
+
+    asm_comment!(asm, "case_equal: {}#===", get_class_name(known_recv_class));
+    // Compare the arguments: push Qtrue when both operands are equal, Qfalse otherwise
+    let arg1 = asm.stack_pop(1);
+    let arg0 = asm.stack_pop(1);
+    asm.cmp(arg0, arg1);
+    let ret_opnd = asm.csel_e(Qtrue.into(), Qfalse.into());
+
+    let stack_ret = asm.stack_push(Type::UnknownImm);
+    asm.mov(stack_ret, ret_opnd);
 
     true
 }
 
 fn jit_thread_s_current(
     _jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
     _ci: *const rb_callinfo,
     _cme: *const rb_callable_method_entry_t,
-    _block: Option<IseqPtr>,
+    _block: Option<BlockHandler>,
     _argc: i32,
-    _known_recv_class: *const VALUE,
+    _known_recv_class: Option<VALUE>,
 ) -> bool {
-    asm.comment("Thread.current");
-    ctx.stack_pop(1);
+    asm_comment!(asm, "Thread.current");
+    asm.stack_pop(1);
 
     // ec->thread_ptr
     let ec_thread_opnd = asm.load(Opnd::mem(64, EC, RUBY_OFFSET_EC_THREAD_PTR));
@@ -4389,7 +6285,7 @@ fn jit_thread_s_current(
     // thread->self
     let thread_self = Opnd::mem(64, ec_thread_opnd, RUBY_OFFSET_THREAD_SELF);
 
-    let stack_ret = ctx.stack_push(Type::UnknownHeap);
+    let stack_ret = asm.stack_push(Type::UnknownHeap);
     asm.mov(stack_ret, thread_self);
     true
 }
@@ -4397,15 +6293,20 @@ fn jit_thread_s_current(
 // Check if we know how to codegen for a particular cfunc method
 fn lookup_cfunc_codegen(def: *const rb_method_definition_t) -> Option<MethodGenFn> {
     let method_serial = unsafe { get_def_method_serial(def) };
+    let table = unsafe { METHOD_CODEGEN_TABLE.as_ref().unwrap() };
 
-    CodegenGlobals::look_up_codegen_method(method_serial)
+    let option_ref = table.get(&method_serial);
+    match option_ref {
+        None => None,
+        Some(&mgf) => Some(mgf), // Deref
+    }
 }
 
 // Is anyone listening for :c_call and :c_return event currently?
 fn c_method_tracing_currently_enabled(jit: &JITState) -> bool {
     // Defer to C implementation in yjit.c
     unsafe {
-        rb_c_method_tracing_currently_enabled(jit.ec.unwrap() as *mut rb_execution_context_struct)
+        rb_c_method_tracing_currently_enabled(jit.ec) // passes the execution context held by the JITState
     }
 }
 
@@ -4430,13 +6331,25 @@ unsafe extern "C" fn build_kwhash(ci: *const rb_callinfo, sp: *const VALUE) -> V
 // at sp[-2]. Depending on the frame type, it can serve different purposes,
 // which are covered here by enum variants.
 enum SpecVal {
-    None,
-    BlockISeq(IseqPtr),
-    BlockParamProxy,
+    BlockHandler(Option<BlockHandler>), // `None` is written as VM_BLOCK_HANDLER_NONE by gen_push_frame
     PrevEP(*const VALUE),
     PrevEPOpnd(Opnd),
 }
 
+// Where a call's block handler comes from. Each variant represents a branch in vm_caller_setup_arg_block.
+#[derive(Clone, Copy)]
+pub enum BlockHandler {
+    // send, invokesuper: the blockiseq operand of the instruction
+    BlockISeq(IseqPtr),
+    // invokesuper: GET_BLOCK_HANDLER() (GET_LEP()[VM_ENV_DATA_INDEX_SPECVAL])
+    LEPSpecVal,
+    // part of the allocation-free block forwarding scheme
+    BlockParamProxy,
+    // To avoid holding the block arg (e.g. proc and symbol) across C calls, the block handler may be
+    // set early in the call sequence; gen_push_frame then skips the specval store for this variant
+    AlreadySet,
+}
+
 struct ControlFrame {
     recv: Opnd,
     sp: Opnd,
@@ -4445,7 +6358,6 @@ struct ControlFrame {
     frame_type: u32,
     specval: SpecVal,
     cme: *const rb_callable_method_entry_t,
-    local_size: i32
 }
 
 // Codegen performing a similar (but not identical) function to vm_push_frame
@@ -4460,21 +6372,17 @@ struct ControlFrame {
 //   * Provided sp should point to the new frame's sp, immediately following locals and the environment
 //   * At entry, CFP points to the caller (not callee) frame
 //   * At exit, ec->cfp is updated to the pushed CFP
-//   * CFP and SP registers are updated only if set_sp_cfp is set
+//   * SP register is updated only if frame.iseq is set
 //   * Stack overflow is not checked (should be done by the caller)
 //   * Interrupts are not checked (should be done by the caller)
 fn gen_push_frame(
     jit: &mut JITState,
-    _ctx: &mut Context,
     asm: &mut Assembler,
-    set_sp_cfp: bool, // if true CFP and SP will be switched to the callee
     frame: ControlFrame,
 ) {
-    assert!(frame.local_size >= 0);
-
     let sp = frame.sp;
 
-    asm.comment("push cme, specval, frame type");
+    asm_comment!(asm, "push cme, specval, frame type");
 
     // Write method entry at sp[-3]
     // sp[-3] = me;
@@ -4486,27 +6394,31 @@ fn gen_push_frame(
     // the outer environment depending on the frame type.
     // sp[-2] = specval;
     let specval: Opnd = match frame.specval {
-        SpecVal::None => {
-            VM_BLOCK_HANDLER_NONE.into()
-        }
-        SpecVal::BlockISeq(block_iseq) => {
-            // Change cfp->block_code in the current frame. See vm_caller_setup_arg_block().
-            // VM_CFP_TO_CAPTURED_BLOCK does &cfp->self, rb_captured_block->code.iseq aliases
-            // with cfp->block_code.
-            asm.store(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_BLOCK_CODE), VALUE::from(block_iseq).into());
-
-            let cfp_self = asm.lea(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF));
-            asm.or(cfp_self, Opnd::Imm(1))
-        }
-        SpecVal::BlockParamProxy => {
-            let ep_opnd = gen_get_lep(jit, asm);
-            let block_handler = asm.load(
-                Opnd::mem(64, ep_opnd, SIZEOF_VALUE_I32 * VM_ENV_DATA_INDEX_SPECVAL)
-            );
-
-            asm.store(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_BLOCK_CODE), block_handler);
-
-            block_handler
+        SpecVal::BlockHandler(None) => VM_BLOCK_HANDLER_NONE.into(),
+        SpecVal::BlockHandler(Some(block_handler)) => {
+            match block_handler {
+                BlockHandler::BlockISeq(block_iseq) => {
+                    // Change cfp->block_code in the current frame. See vm_caller_setup_arg_block().
+                    // VM_CFP_TO_CAPTURED_BLOCK does &cfp->self, rb_captured_block->code.iseq aliases
+                    // with cfp->block_code.
+                    asm.store(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_BLOCK_CODE), VALUE::from(block_iseq).into());
+
+                    let cfp_self = asm.lea(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF));
+                    asm.or(cfp_self, Opnd::Imm(1))
+                }
+                BlockHandler::LEPSpecVal => {
+                    let lep_opnd = gen_get_lep(jit, asm);
+                    asm.load(Opnd::mem(64, lep_opnd, SIZEOF_VALUE_I32 * VM_ENV_DATA_INDEX_SPECVAL))
+                }
+                BlockHandler::BlockParamProxy => {
+                    let ep_opnd = gen_get_lep(jit, asm);
+                    let block_handler = asm.load(
+                        Opnd::mem(64, ep_opnd, SIZEOF_VALUE_I32 * VM_ENV_DATA_INDEX_SPECVAL)
+                    );
+                    block_handler
+                }
+                BlockHandler::AlreadySet => 0.into(), // unused
+            }
         }
         SpecVal::PrevEP(prev_ep) => {
             let tagged_prev_ep = (prev_ep as usize) | 1;
@@ -4514,9 +6426,13 @@ fn gen_push_frame(
         }
         SpecVal::PrevEPOpnd(ep_opnd) => {
             asm.or(ep_opnd, 1.into())
-        },
+        }
     };
-    asm.store(Opnd::mem(64, sp, SIZEOF_VALUE_I32 * -2), specval);
+    if let SpecVal::BlockHandler(Some(BlockHandler::AlreadySet)) = frame.specval {
+        asm_comment!(asm, "specval should have been set");
+    } else {
+        asm.store(Opnd::mem(64, sp, SIZEOF_VALUE_I32 * -2), specval);
+    }
 
     // Write env flags at sp[-1]
     // sp[-1] = frame_type;
@@ -4535,16 +6451,14 @@ fn gen_push_frame(
     //    .self       = recv,
     //    .ep         = <sp - 1>,
     //    .block_code = 0,
-    //    .__bp__     = sp,
     // };
-    asm.comment("push callee control frame");
+    asm_comment!(asm, "push callee control frame");
 
     // For an iseq call PC may be None, in which case we will not set PC and will allow jitted code
     // to set it as necessary.
-    let _pc = if let Some(pc) = frame.pc {
+    if let Some(pc) = frame.pc {
         asm.mov(cfp_opnd(RUBY_OFFSET_CFP_PC), pc.into());
     };
-    asm.mov(cfp_opnd(RUBY_OFFSET_CFP_BP), sp);
     asm.mov(cfp_opnd(RUBY_OFFSET_CFP_SP), sp);
     let iseq: Opnd = if let Some(iseq) = frame.iseq {
         VALUE::from(iseq).into()
@@ -4555,89 +6469,45 @@ fn gen_push_frame(
     asm.mov(cfp_opnd(RUBY_OFFSET_CFP_SELF), frame.recv);
     asm.mov(cfp_opnd(RUBY_OFFSET_CFP_BLOCK_CODE), 0.into());
 
-    // This Qnil fill snippet potentially requires 2 more registers on Arm, one for Qnil and
-    // another for calculating the address in case there are a lot of local variables. So doing
-    // this after releasing the register for specval and the receiver to avoid register spill.
-    let num_locals = frame.local_size;
-    if num_locals > 0 {
-        asm.comment("initialize locals");
-
-        // Initialize local variables to Qnil
-        for i in 0..num_locals {
-            let offs = SIZEOF_VALUE_I32 * (i - num_locals - 3);
-            asm.store(Opnd::mem(64, sp, offs), Qnil.into());
-        }
-    }
+    if frame.iseq.is_some() {
+        // Spill stack temps to let the callee use them (must be done before changing the SP register)
+        asm.spill_temps();
 
-    if set_sp_cfp {
         // Saving SP before calculating ep avoids a dependency on a register
         // However this must be done after referencing frame.recv, which may be SP-relative
         asm.mov(SP, sp);
     }
     let ep = asm.sub(sp, SIZEOF_VALUE.into());
     asm.mov(cfp_opnd(RUBY_OFFSET_CFP_EP), ep);
-
-    asm.comment("switch to new CFP");
-    let new_cfp = asm.lea(cfp_opnd(0));
-    if set_sp_cfp {
-        asm.mov(CFP, new_cfp);
-        asm.store(Opnd::mem(64, EC, RUBY_OFFSET_EC_CFP), CFP);
-    } else {
-        asm.store(Opnd::mem(64, EC, RUBY_OFFSET_EC_CFP), new_cfp);
-    }
 }
 
 fn gen_send_cfunc(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
     ci: *const rb_callinfo,
     cme: *const rb_callable_method_entry_t,
-    block: Option<IseqPtr>,
-    recv_known_klass: *const VALUE,
+    block: Option<BlockHandler>,
+    recv_known_class: Option<VALUE>,
     flags: u32,
     argc: i32,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
     let cfunc = unsafe { get_cme_def_body_cfunc(cme) };
     let cfunc_argc = unsafe { get_mct_argc(cfunc) };
     let mut argc = argc;
 
-    // Create a side-exit to fall back to the interpreter
-    let side_exit = get_side_exit(jit, ocb, ctx);
-
-    // If the function expects a Ruby array of arguments
-    if cfunc_argc < 0 && cfunc_argc != -1 {
-        gen_counter_incr!(asm, send_cfunc_ruby_array_varg);
-        return CantCompile;
-    }
-
-    // We aren't handling a vararg cfuncs with splat currently.
-    if flags & VM_CALL_ARGS_SPLAT != 0 && cfunc_argc == -1 {
-        gen_counter_incr!(asm, send_args_splat_cfunc_var_args);
-        return CantCompile;
-    }
+    // Splat call to a C method that takes `VALUE *` and `len`
+    let variable_splat = flags & VM_CALL_ARGS_SPLAT != 0 && cfunc_argc == -1;
+    let block_arg = flags & VM_CALL_ARGS_BLOCKARG != 0;
 
-    if flags & VM_CALL_ARGS_SPLAT != 0 && flags & VM_CALL_ZSUPER != 0 {
-        // zsuper methods are super calls without any arguments.
-        // They are also marked as splat, but don't actually have an array
-        // they pull arguments from, instead we need to change to call
-        // a different method with the current stack.
-        gen_counter_incr!(asm, send_args_splat_cfunc_zuper);
-        return CantCompile;
+    // If it's a splat and the method expects a Ruby array of arguments
+    if cfunc_argc == -2 && flags & VM_CALL_ARGS_SPLAT != 0 {
+        gen_counter_incr(asm, Counter::send_cfunc_splat_neg2);
+        return None;
     }
 
-    // In order to handle backwards compatibility between ruby 3 and 2
-    // ruby2_keywords was introduced. It is called only on methods
-    // with splat and changes they way they handle them.
-    // We are just going to not compile these.
-    // https://docs.ruby-lang.org/en/3.2/Module.html#method-i-ruby2_keywords
-    if unsafe {
-        get_iseq_flags_ruby2_keywords(jit.iseq) && flags & VM_CALL_ARGS_SPLAT != 0
-    } {
-        gen_counter_incr!(asm, send_args_splat_cfunc_ruby2_keywords);
-        return CantCompile;
-    }
+    exit_if_kwsplat_non_nil(asm, flags, Counter::send_cfunc_kw_splat_non_nil)?;
+    let kw_splat = flags & VM_CALL_KW_SPLAT != 0;
 
     let kw_arg = unsafe { vm_ci_kwarg(ci) };
     let kw_arg_num = if kw_arg.is_null() {
@@ -4647,39 +6517,79 @@ fn gen_send_cfunc(
     };
 
     if kw_arg_num != 0 && flags & VM_CALL_ARGS_SPLAT != 0 {
-        gen_counter_incr!(asm, send_cfunc_splat_with_kw);
-        return CantCompile;
+        gen_counter_incr(asm, Counter::send_cfunc_splat_with_kw);
+        return None;
     }
 
     if c_method_tracing_currently_enabled(jit) {
         // Don't JIT if tracing c_call or c_return
-        gen_counter_incr!(asm, send_cfunc_tracing);
-        return CantCompile;
+        gen_counter_incr(asm, Counter::send_cfunc_tracing);
+        return None;
     }
 
+    // Increment total cfunc send count
+    gen_counter_incr(asm, Counter::num_send_cfunc);
+
     // Delegate to codegen for C methods if we have it.
-    if kw_arg.is_null() && flags & VM_CALL_OPT_SEND == 0 {
-        let codegen_p = lookup_cfunc_codegen(unsafe { (*cme).def });
-        if let Some(known_cfunc_codegen) = codegen_p {
-            if known_cfunc_codegen(jit, ctx, asm, ocb, ci, cme, block, argc, recv_known_klass) {
+    if kw_arg.is_null() &&
+            !kw_splat &&
+            flags & VM_CALL_OPT_SEND == 0 &&
+            flags & VM_CALL_ARGS_SPLAT == 0 &&
+            (cfunc_argc == -1 || argc == cfunc_argc) {
+        let expected_stack_after = asm.ctx.get_stack_size() as i32 - argc;
+        if let Some(known_cfunc_codegen) = lookup_cfunc_codegen(unsafe { (*cme).def }) {
+            // We don't push a frame for specialized cfunc codegen, so the generated code must be leaf.
+            // However, the interpreter doesn't push a frame on opt_* instruction either, so we allow
+            // non-sendish instructions to break this rule as an exception.
+            let cfunc_codegen = if jit.is_sendish() {
+                asm.with_leaf_ccall(|asm|
+                    perf_call!("gen_send_cfunc: ", known_cfunc_codegen(jit, asm, ocb, ci, cme, block, argc, recv_known_class))
+                )
+            } else {
+                perf_call!("gen_send_cfunc: ", known_cfunc_codegen(jit, asm, ocb, ci, cme, block, argc, recv_known_class))
+            };
+
+            if cfunc_codegen {
+                assert_eq!(expected_stack_after, asm.ctx.get_stack_size() as i32);
+                gen_counter_incr(asm, Counter::num_send_cfunc_inline);
                 // cfunc codegen generated code. Terminate the block so
                 // there isn't multiple calls in the same block.
-                jump_to_next_insn(jit, ctx, asm, ocb);
-                return EndBlock;
+                jump_to_next_insn(jit, asm, ocb);
+                return Some(EndBlock);
             }
         }
     }
 
     // Check for interrupts
-    gen_check_ints(asm, side_exit);
+    gen_check_ints(asm, Counter::guard_send_interrupted);
 
     // Stack overflow check
     // #define CHECK_VM_STACK_OVERFLOW0(cfp, sp, margin)
     // REG_CFP <= REG_SP + 4 * SIZEOF_VALUE + sizeof(rb_control_frame_t)
-    asm.comment("stack overflow check");
-    let stack_limit = asm.lea(ctx.sp_opnd((SIZEOF_VALUE * 4 + 2 * RUBY_SIZEOF_CONTROL_FRAME) as isize));
+    asm_comment!(asm, "stack overflow check");
+    const _: () = assert!(RUBY_SIZEOF_CONTROL_FRAME % SIZEOF_VALUE == 0, "sizeof(rb_control_frame_t) is a multiple of sizeof(VALUE)");
+    let stack_limit = asm.lea(asm.ctx.sp_opnd((4 + 2 * (RUBY_SIZEOF_CONTROL_FRAME / SIZEOF_VALUE)) as i32));
     asm.cmp(CFP, stack_limit);
-    asm.jbe(counted_exit!(ocb, side_exit, send_se_cf_overflow));
+    asm.jbe(Target::side_exit(Counter::guard_send_se_cf_overflow));
+
+    // Guard for variable length splat call before any modifications to the stack
+    if variable_splat {
+        let splat_array_idx = i32::from(kw_splat) + i32::from(block_arg);
+        let comptime_splat_array = jit.peek_at_stack(&asm.ctx, splat_array_idx as isize);
+        if unsafe { rb_yjit_ruby2_keywords_splat_p(comptime_splat_array) } != 0 {
+            gen_counter_incr(asm, Counter::send_cfunc_splat_varg_ruby2_keywords);
+            return None;
+        }
+
+        let splat_array = asm.stack_opnd(splat_array_idx);
+        guard_object_is_array(asm, splat_array, splat_array.into(), Counter::guard_send_splat_not_array);
+
+        asm_comment!(asm, "guard variable length splat call servicable");
+        let sp = asm.ctx.sp_opnd(0);
+        let proceed = asm.ccall(rb_yjit_splat_varg_checks as _, vec![sp, splat_array, CFP]);
+        asm.cmp(proceed, Qfalse.into());
+        asm.je(Target::side_exit(Counter::guard_send_cfunc_bad_splat_vargs));
+    }
 
     // Number of args which will be passed through to the callee
     // This is adjusted by the kwargs being combined into a hash.
@@ -4689,22 +6599,25 @@ fn gen_send_cfunc(
         argc - kw_arg_num + 1
     };
 
+    // Exclude the kw_splat hash from arity check
+    if kw_splat {
+        passed_argc -= 1;
+    }
 
     // If the argument count doesn't match
     if cfunc_argc >= 0 && cfunc_argc != passed_argc && flags & VM_CALL_ARGS_SPLAT == 0 {
-        gen_counter_incr!(asm, send_cfunc_argc_mismatch);
-        return CantCompile;
+        gen_counter_incr(asm, Counter::send_cfunc_argc_mismatch);
+        return None;
     }
 
     // Don't JIT functions that need C stack arguments for now
     if cfunc_argc >= 0 && passed_argc + 1 > (C_ARG_OPNDS.len() as i32) {
-        gen_counter_incr!(asm, send_cfunc_toomany_args);
-        return CantCompile;
+        gen_counter_incr(asm, Counter::send_cfunc_toomany_args);
+        return None;
     }
 
-    let block_arg = flags & VM_CALL_ARGS_BLOCKARG != 0;
     let block_arg_type = if block_arg {
-        Some(ctx.get_opnd_type(StackOpnd(0)))
+        Some(asm.ctx.get_opnd_type(StackOpnd(0)))
     } else {
         None
     };
@@ -4717,19 +6630,19 @@ fn gen_send_cfunc(
             // Nothing to do
         }
         _ => {
-            gen_counter_incr!(asm, send_block_arg);
-            return CantCompile;
+            gen_counter_incr(asm, Counter::send_cfunc_block_arg);
+            return None;
         }
     }
 
     match block_arg_type {
         Some(Type::Nil) => {
             // We have a nil block arg, so let's pop it off the args
-            ctx.stack_pop(1);
+            asm.stack_pop(1);
         }
         Some(Type::BlockParamProxy) => {
             // We don't need the actual stack value
-            ctx.stack_pop(1);
+            asm.stack_pop(1);
         }
         None => {
             // Nothing to do
@@ -4739,43 +6652,71 @@ fn gen_send_cfunc(
         }
     }
 
-    // This is a .send call and we need to adjust the stack
-    if flags & VM_CALL_OPT_SEND != 0 {
-        handle_opt_send_shift_stack(asm, argc, ctx);
+    // Pop the empty kw_splat hash
+    if kw_splat {
+        // Only `**nil` is supported right now. Checked in exit_if_kwsplat_non_nil()
+        assert_eq!(Type::Nil, asm.ctx.get_opnd_type(StackOpnd(0)));
+        asm.stack_pop(1);
+        argc -= 1;
     }
 
-    // push_splat_args does stack manipulation so we can no longer side exit
-    if flags & VM_CALL_ARGS_SPLAT != 0 {
+    // Splat handling when C method takes a static number of arguments.
+    // push_splat_args() does stack manipulation so we can no longer side exit
+    if flags & VM_CALL_ARGS_SPLAT != 0 && cfunc_argc >= 0 {
         let required_args : u32 = (cfunc_argc as u32).saturating_sub(argc as u32 - 1);
         // + 1 because we pass self
         if required_args + 1 >= C_ARG_OPNDS.len() as u32 {
-            gen_counter_incr!(asm, send_cfunc_toomany_args);
-            return CantCompile;
+            gen_counter_incr(asm, Counter::send_cfunc_toomany_args);
+            return None;
         }
+
         // We are going to assume that the splat fills
-        // all the remaining arguments. In the generated code
-        // we test if this is true and if not side exit.
-        argc = required_args as i32;
+        // all the remaining arguments. So the number of args
+        // should just equal the number of args the cfunc takes.
+        // In the generated code we test if this is true
+        // and if not side exit.
+        argc = cfunc_argc;
         passed_argc = argc;
-        push_splat_args(required_args, ctx, asm, ocb, side_exit)
+        push_splat_args(required_args, asm)
     }
 
+    // This is a .send call and we need to adjust the stack
+    if flags & VM_CALL_OPT_SEND != 0 {
+        handle_opt_send_shift_stack(asm, argc);
+    }
+
+    // Push a dynamic number of items from the splat array to the stack when calling a vargs method
+    let dynamic_splat_size = if variable_splat {
+        asm_comment!(asm, "variable length splat");
+        let stack_splat_array = asm.lea(asm.stack_opnd(0));
+        Some(asm.ccall(rb_yjit_splat_varg_cfunc as _, vec![stack_splat_array]))
+    } else {
+        None
+    };
+
     // Points to the receiver operand on the stack
-    let recv = ctx.stack_opnd(argc);
+    let recv = asm.stack_opnd(argc);
 
     // Store incremented PC into current control frame in case callee raises.
     jit_save_pc(jit, asm);
 
-    // Increment the stack pointer by 3 (in the callee)
-    // sp += 3
-    let sp = asm.lea(ctx.sp_opnd((SIZEOF_VALUE as isize) * 3));
+    // Find callee's SP with space for metadata.
+    // Usually sp+3.
+    let sp = if let Some(splat_size) = dynamic_splat_size {
+        // Compute the callee's SP at runtime in case we accept a variable size for the splat array
+        const _: () = assert!(SIZEOF_VALUE == 8, "opting for a shift since mul on A64 takes no immediates");
+        let splat_size_bytes = asm.lshift(splat_size, 3usize.into());
+        // 3 items for method metadata, minus one to remove the splat array
+        let static_stack_top = asm.lea(asm.ctx.sp_opnd(2));
+        asm.add(static_stack_top, splat_size_bytes)
+    } else {
+        asm.lea(asm.ctx.sp_opnd(3))
+    };
 
     let specval = if block_arg_type == Some(Type::BlockParamProxy) {
-        SpecVal::BlockParamProxy
-    } else if let Some(block_iseq) = block {
-        SpecVal::BlockISeq(block_iseq)
+        SpecVal::BlockHandler(Some(BlockHandler::BlockParamProxy))
     } else {
-        SpecVal::None
+        SpecVal::BlockHandler(block)
     };
 
     let mut frame_type = VM_FRAME_MAGIC_CFUNC | VM_FRAME_FLAG_CFRAME | VM_ENV_FLAG_LOCAL;
@@ -4783,60 +6724,83 @@ fn gen_send_cfunc(
         frame_type |= VM_FRAME_FLAG_CFRAME_KW
     }
 
-    gen_push_frame(jit, ctx, asm, false, ControlFrame {
+    perf_call!("gen_send_cfunc: ", gen_push_frame(jit, asm, ControlFrame {
         frame_type,
         specval,
         cme,
         recv,
         sp,
-        pc: Some(0),
+        pc: if cfg!(debug_assertions) {
+            Some(!0) // Poison value. Helps to fail fast.
+        } else {
+            None     // Leave PC uninitialized as cfuncs shouldn't read it
+        },
         iseq: None,
-        local_size: 0,
-    });
+    }));
+
+    asm_comment!(asm, "set ec->cfp");
+    let new_cfp = asm.lea(Opnd::mem(64, CFP, -(RUBY_SIZEOF_CONTROL_FRAME as i32)));
+    asm.store(Opnd::mem(64, EC, RUBY_OFFSET_EC_CFP), new_cfp);
 
     if !kw_arg.is_null() {
         // Build a hash from all kwargs passed
-        asm.comment("build_kwhash");
+        asm_comment!(asm, "build_kwhash");
         let imemo_ci = VALUE(ci as usize);
         assert_ne!(0, unsafe { rb_IMEMO_TYPE_P(imemo_ci, imemo_callinfo) },
             "we assume all callinfos with kwargs are on the GC heap");
-        let sp = asm.lea(ctx.sp_opnd(0));
+        let sp = asm.lea(asm.ctx.sp_opnd(0));
         let kwargs = asm.ccall(build_kwhash as *const u8, vec![imemo_ci.into(), sp]);
 
         // Replace the stack location at the start of kwargs with the new hash
-        let stack_opnd = ctx.stack_opnd(argc - passed_argc);
+        let stack_opnd = asm.stack_opnd(argc - passed_argc);
         asm.mov(stack_opnd, kwargs);
     }
 
-    // Copy SP because REG_SP will get overwritten
-    let sp = asm.lea(ctx.sp_opnd(0));
-
-    // Pop the C function arguments from the stack (in the caller)
-    ctx.stack_pop((argc + 1).try_into().unwrap());
-
     // Write interpreter SP into CFP.
-    // Needed in case the callee yields to the block.
-    gen_save_sp(jit, asm, ctx);
+    // We don't pop arguments yet to use registers for passing them, but we
+    // have to set cfp->sp below them for full_cfunc_return() invalidation.
+    gen_save_sp_with_offset(asm, -(argc + 1) as i8);
 
     // Non-variadic method
     let args = if cfunc_argc >= 0 {
         // Copy the arguments from the stack to the C argument registers
         // self is the 0th argument and is at index argc from the stack top
         (0..=passed_argc).map(|i|
-            Opnd::mem(64, sp, -(argc + 1 - i) * SIZEOF_VALUE_I32)
+            asm.stack_opnd(argc - i)
         ).collect()
     }
     // Variadic method
     else if cfunc_argc == -1 {
         // The method gets a pointer to the first argument
         // rb_f_puts(int argc, VALUE *argv, VALUE recv)
+
+        let passed_argc_opnd = if let Some(splat_size) = dynamic_splat_size {
+            // The final argc is passed_argc plus the size of the splat, minus one for the splat array itself
+            asm.add(splat_size, (passed_argc - 1).into())
+        } else {
+            // Without a splat, passed_argc is static
+            Opnd::Imm(passed_argc.into())
+        };
+
         vec![
-            Opnd::Imm(passed_argc.into()),
-            asm.lea(Opnd::mem(64, sp, -(argc) * SIZEOF_VALUE_I32)),
-            Opnd::mem(64, sp, -(argc + 1) * SIZEOF_VALUE_I32),
+            passed_argc_opnd,
+            asm.lea(asm.ctx.sp_opnd(-argc)),
+            asm.stack_opnd(argc),
         ]
     }
-    else {
+    // Variadic method taking a Ruby array
+    else if cfunc_argc == -2 {
+        // Slurp up all the arguments into an array
+        let stack_args = asm.lea(asm.ctx.sp_opnd(-argc));
+        let args_array = asm.ccall(
+            rb_ec_ary_new_from_values as _,
+            vec![EC, passed_argc.into(), stack_args]
+        );
+
+        // Example signature:
+        // VALUE neg2_method(VALUE self, VALUE argv)
+        vec![asm.stack_opnd(argc), args_array]
+    } else {
         panic!("unexpected cfunc_args: {}", cfunc_argc)
     };
 
@@ -4844,16 +6808,31 @@ fn gen_send_cfunc(
     // VALUE ret = (cfunc->func)(recv, argv[0], argv[1]);
     // cfunc comes from compile-time cme->def, which we assume to be stable.
     // Invalidation logic is in yjit_method_lookup_change()
-    asm.comment("call C function");
+    asm_comment!(asm, "call C function");
     let ret = asm.ccall(unsafe { get_mct_func(cfunc) }.cast(), args);
+    asm.stack_pop((argc + 1).try_into().unwrap()); // Pop arguments after ccall to use registers for passing them.
 
     // Record code position for TracePoint patching. See full_cfunc_return().
     record_global_inval_patch(asm, CodegenGlobals::get_outline_full_cfunc_return_pos());
 
     // Push the return value on the Ruby stack
-    let stack_ret = ctx.stack_push(Type::Unknown);
+    let stack_ret = asm.stack_push(Type::Unknown);
     asm.mov(stack_ret, ret);
 
+    // Log the name of the method we're calling to. We intentionally don't do this for inlined cfuncs.
+    // We also do this after the C call to minimize the impact of spill_temps() on asm.ccall().
+    if get_option!(gen_stats) {
+        // Assemble the method name string
+        let mid = unsafe { vm_ci_mid(ci) };
+        let name_str = get_method_name(recv_known_class, mid);
+
+        // Get an index for this cfunc name
+        let cfunc_idx = get_cfunc_idx(&name_str);
+
+        // Increment the counter for this cfunc
+        asm.ccall(incr_cfunc_counter as *const u8, vec![cfunc_idx.into()]);
+    }
+
     // Pop the stack frame (ec->cfp++)
     // Instead of recalculating, we can reuse the previous CFP, which is stored in a callee-saved
     // register
@@ -4861,56 +6840,27 @@ fn gen_send_cfunc(
     asm.store(ec_cfp_opnd, CFP);
 
     // cfunc calls may corrupt types
-    ctx.clear_local_types();
+    asm.clear_local_types();
 
     // Note: the return block of gen_send_iseq() has ctx->sp_offset == 1
     // which allows for sharing the same successor.
 
     // Jump (fall through) to the call continuation block
     // We do this to end the current block after the call
-    jump_to_next_insn(jit, ctx, asm, ocb);
-    EndBlock
+    jump_to_next_insn(jit, asm, ocb);
+    Some(EndBlock)
 }
 
-fn gen_return_branch(
-    asm: &mut Assembler,
-    target0: CodePtr,
-    _target1: Option<CodePtr>,
-    shape: BranchShape,
-) {
-    match shape {
-        BranchShape::Next0 | BranchShape::Next1 => unreachable!(),
-        BranchShape::Default => {
-            asm.comment("update cfp->jit_return");
-            asm.mov(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_JIT_RETURN), Opnd::const_ptr(target0.raw_ptr()));
-        }
-    }
-}
-
-/// Pushes arguments from an array to the stack that are passed with a splat (i.e. *args)
-/// It optimistically compiles to a static size that is the exact number of arguments
-/// needed for the function.
-fn push_splat_args(required_args: u32, ctx: &mut Context, asm: &mut Assembler, ocb: &mut OutlinedCb, side_exit: Target) {
-
-    asm.comment("push_splat_args");
-
-    let array_opnd = ctx.stack_opnd(0);
-    let array_reg = asm.load(array_opnd);
-
-    guard_object_is_heap(
-        asm,
-        array_reg,
-        counted_exit!(ocb, side_exit, send_splat_not_array),
-    );
-    guard_object_is_array(
-        asm,
-        array_reg,
-        counted_exit!(ocb, side_exit, send_splat_not_array),
-    );
-
-    asm.comment("Get array length for embedded or heap");
+// Generate RARRAY_LEN. For array_opnd, use Opnd::Reg to reduce memory access,
+// and use Opnd::Mem to save registers.
+fn get_array_len(asm: &mut Assembler, array_opnd: Opnd) -> Opnd {
+    asm_comment!(asm, "get array length for embedded or heap");
 
     // Pull out the embed flag to check if it's an embedded array.
+    let array_reg = match array_opnd {
+        Opnd::InsnOut { .. } => array_opnd,
+        _ => asm.load(array_opnd),
+    };
     let flags_opnd = Opnd::mem(VALUE_BITS, array_reg, RUBY_OFFSET_RBASIC_FLAGS);
 
     // Get the length of the array
@@ -4921,94 +6871,144 @@ fn push_splat_args(required_args: u32, ctx: &mut Context, asm: &mut Assembler, o
     let flags_opnd = Opnd::mem(VALUE_BITS, array_reg, RUBY_OFFSET_RBASIC_FLAGS);
     asm.test(flags_opnd, (RARRAY_EMBED_FLAG as u64).into());
 
-    // Need to repeat this here to deal with register allocation
-    let array_opnd = ctx.stack_opnd(0);
-    let array_reg = asm.load(array_opnd);
-
+    let array_reg = match array_opnd {
+        Opnd::InsnOut { .. } => array_opnd,
+        _ => asm.load(array_opnd),
+    };
     let array_len_opnd = Opnd::mem(
-        (8 * size_of::<std::os::raw::c_long>()) as u8,
+        std::os::raw::c_long::BITS as u8,
         array_reg,
         RUBY_OFFSET_RARRAY_AS_HEAP_LEN,
     );
-    let array_len_opnd = asm.csel_nz(emb_len_opnd, array_len_opnd);
-
-    asm.comment("Side exit if length doesn't not equal remaining args");
-    asm.cmp(array_len_opnd, required_args.into());
-    asm.jne(counted_exit!(ocb, side_exit, send_splatarray_length_not_equal));
 
-    asm.comment("Check last argument is not ruby2keyword hash");
+    // Select the array length value
+    asm.csel_nz(emb_len_opnd, array_len_opnd)
+}
 
-    // Need to repeat this here to deal with register allocation
-    let array_reg = asm.load(ctx.stack_opnd(0));
+// Generate RARRAY_CONST_PTR (part of RARRAY_AREF)
+fn get_array_ptr(asm: &mut Assembler, array_reg: Opnd) -> Opnd {
+    asm_comment!(asm, "get array pointer for embedded or heap");
 
     let flags_opnd = Opnd::mem(VALUE_BITS, array_reg, RUBY_OFFSET_RBASIC_FLAGS);
     asm.test(flags_opnd, (RARRAY_EMBED_FLAG as u64).into());
     let heap_ptr_opnd = Opnd::mem(
-        (8 * size_of::<usize>()) as u8,
+        usize::BITS as u8,
         array_reg,
         RUBY_OFFSET_RARRAY_AS_HEAP_PTR,
     );
+
     // Load the address of the embedded array
     // (struct RArray *)(obj)->as.ary
     let ary_opnd = asm.lea(Opnd::mem(VALUE_BITS, array_reg, RUBY_OFFSET_RARRAY_AS_ARY));
-    let ary_opnd = asm.csel_nz(ary_opnd, heap_ptr_opnd);
+    asm.csel_nz(ary_opnd, heap_ptr_opnd)
+}
+
+// Generate RSTRING_PTR
+fn get_string_ptr(asm: &mut Assembler, string_reg: Opnd) -> Opnd {
+    asm_comment!(asm, "get string pointer for embedded or heap");
 
-    let last_array_value = asm.load(Opnd::mem(64, ary_opnd, (required_args as i32 - 1) * (SIZEOF_VALUE as i32)));
+    let flags_opnd = Opnd::mem(VALUE_BITS, string_reg, RUBY_OFFSET_RBASIC_FLAGS);
+    asm.test(flags_opnd, (RSTRING_NOEMBED as u64).into());
+    let heap_ptr_opnd = asm.load(Opnd::mem(
+        usize::BITS as u8,
+        string_reg,
+        RUBY_OFFSET_RSTRING_AS_HEAP_PTR,
+    ));
 
-    guard_object_is_not_ruby2_keyword_hash(
+    // Load the address of the embedded array
+    // (struct RString *)(obj)->as.ary
+    let ary_opnd = asm.lea(Opnd::mem(VALUE_BITS, string_reg, RUBY_OFFSET_RSTRING_AS_ARY));
+    asm.csel_nz(heap_ptr_opnd, ary_opnd)
+}
+
+/// Pushes arguments from an array to the stack. Differs from push_splat_args() because
+/// the array can have items left over. Array is assumed to be T_ARRAY without guards.
+fn copy_splat_args_for_rest_callee(array: Opnd, num_args: u32, asm: &mut Assembler) {
+    asm_comment!(asm, "copy_splat_args_for_rest_callee");
+
+    // Unused operands cause the backend to panic
+    if num_args == 0 {
+        return;
+    }
+
+    asm_comment!(asm, "Push arguments from array");
+
+    let array_reg = asm.load(array);
+    let ary_opnd = get_array_ptr(asm, array_reg);
+    for i in 0..num_args {
+        let top = asm.stack_push(Type::Unknown);
+        asm.mov(top, Opnd::mem(64, ary_opnd, i as i32 * SIZEOF_VALUE_I32));
+    }
+}
+
+/// Pushes arguments from an array to the stack that are passed with a splat (i.e. *args)
+/// It optimistically compiles to a static size that is the exact number of arguments
+/// needed for the function.
+fn push_splat_args(required_args: u32, asm: &mut Assembler) {
+    asm_comment!(asm, "push_splat_args");
+
+    let array_opnd = asm.stack_opnd(0);
+    guard_object_is_array(
         asm,
-        last_array_value,
-        counted_exit!(ocb, side_exit, send_splatarray_last_ruby_2_keywords));
+        array_opnd,
+        array_opnd.into(),
+        Counter::guard_send_splat_not_array,
+    );
+
+    let array_len_opnd = get_array_len(asm, array_opnd);
 
-    asm.comment("Push arguments from array");
-    let array_opnd = ctx.stack_pop(1);
+    asm_comment!(asm, "Guard for expected splat length");
+    asm.cmp(array_len_opnd, required_args.into());
+    asm.jne(Target::side_exit(Counter::guard_send_splatarray_length_not_equal));
 
+    // Check last element of array if present
     if required_args > 0 {
-        // Load the address of the embedded array
-        // (struct RArray *)(obj)->as.ary
-        let array_reg = asm.load(array_opnd);
+        asm_comment!(asm, "Check last argument is not ruby2keyword hash");
 
-        // Conditionally load the address of the heap array
-        // (struct RArray *)(obj)->as.heap.ptr
-        let flags_opnd = Opnd::mem(VALUE_BITS, array_reg, RUBY_OFFSET_RBASIC_FLAGS);
-        asm.test(flags_opnd, Opnd::UImm(RARRAY_EMBED_FLAG as u64));
-        let heap_ptr_opnd = Opnd::mem(
-            (8 * size_of::<usize>()) as u8,
-            array_reg,
-            RUBY_OFFSET_RARRAY_AS_HEAP_PTR,
+        // Need to repeat this here to deal with register allocation
+        let array_reg = asm.load(asm.stack_opnd(0));
+        let ary_opnd = get_array_ptr(asm, array_reg);
+        let last_array_value = asm.load(Opnd::mem(64, ary_opnd, (required_args as i32 - 1) * (SIZEOF_VALUE as i32)));
+        guard_object_is_not_ruby2_keyword_hash(
+            asm,
+            last_array_value,
+            Counter::guard_send_splatarray_last_ruby2_keywords,
         );
-        // Load the address of the embedded array
-        // (struct RArray *)(obj)->as.ary
-        let ary_opnd = asm.lea(Opnd::mem(VALUE_BITS, array_reg, RUBY_OFFSET_RARRAY_AS_ARY));
-        let ary_opnd = asm.csel_nz(ary_opnd, heap_ptr_opnd);
+    }
+
+    asm_comment!(asm, "Push arguments from array");
+    let array_opnd = asm.stack_pop(1);
+
+    if required_args > 0 {
+        let array_reg = asm.load(array_opnd);
+        let ary_opnd = get_array_ptr(asm, array_reg);
 
         for i in 0..required_args {
-            let top = ctx.stack_push(Type::Unknown);
+            let top = asm.stack_push(Type::Unknown);
             asm.mov(top, Opnd::mem(64, ary_opnd, i as i32 * SIZEOF_VALUE_I32));
         }
 
-        asm.comment("end push_each");
+        asm_comment!(asm, "end push_each");
     }
 }
 
 fn gen_send_bmethod(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
     ci: *const rb_callinfo,
     cme: *const rb_callable_method_entry_t,
-    block: Option<IseqPtr>,
+    block: Option<BlockHandler>,
     flags: u32,
     argc: i32,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
     let procv = unsafe { rb_get_def_bmethod_proc((*cme).def) };
 
     let proc = unsafe { rb_yjit_get_proc_ptr(procv) };
     let proc_block = unsafe { &(*proc).block };
 
     if proc_block.type_ != block_type_iseq {
-        return CantCompile;
+        return None;
     }
 
     let capture = unsafe { proc_block.as_.captured.as_ref() };
@@ -5016,25 +7016,73 @@ fn gen_send_bmethod(
 
     // Optimize for single ractor mode and avoid runtime check for
     // "defined with an un-shareable Proc in a different Ractor"
-    if !assume_single_ractor_mode(jit, ocb) {
-        gen_counter_incr!(asm, send_bmethod_ractor);
-        return CantCompile;
+    if !assume_single_ractor_mode(jit, asm, ocb) {
+        gen_counter_incr(asm, Counter::send_bmethod_ractor);
+        return None;
     }
 
     // Passing a block to a block needs logic different from passing
     // a block to a method and sometimes requires allocation. Bail for now.
     if block.is_some() {
-        gen_counter_incr!(asm, send_bmethod_block_arg);
-        return CantCompile;
+        gen_counter_incr(asm, Counter::send_bmethod_block_arg);
+        return None;
     }
 
     let frame_type = VM_FRAME_MAGIC_BLOCK | VM_FRAME_FLAG_BMETHOD | VM_FRAME_FLAG_LAMBDA;
-    gen_send_iseq(jit, ctx, asm, ocb, iseq, ci, frame_type, Some(capture.ep), cme, block, flags, argc, None)
+    perf_call! { gen_send_iseq(jit, asm, ocb, iseq, ci, frame_type, Some(capture.ep), cme, block, flags, argc, None) }
+}
+
+/// The kind of a value an ISEQ returns
+enum IseqReturn {
+    Value(VALUE),
+    LocalVariable(u32),
+    Receiver,
+}
+
+extern {
+    fn rb_simple_iseq_p(iseq: IseqPtr) -> bool;
+}
+
+/// Return the ISEQ's return value if it consists of one simple instruction and leave.
+fn iseq_get_return_value(iseq: IseqPtr, captured_opnd: Option<Opnd>, ci_flags: u32) -> Option<IseqReturn> {
+    // Expect only two instructions and one possible operand
+    let iseq_size = unsafe { get_iseq_encoded_size(iseq) };
+    if !(2..=3).contains(&iseq_size) {
+        return None;
+    }
+
+    // Get the first two instructions
+    let first_insn = iseq_opcode_at_idx(iseq, 0);
+    let second_insn = iseq_opcode_at_idx(iseq, insn_len(first_insn as usize));
+
+    // Extract the return value if known
+    if second_insn != YARVINSN_leave {
+        return None;
+    }
+    match first_insn {
+        YARVINSN_getlocal_WC_0  => {
+            // Only accept simple positional only cases for both the caller and the callee.
+            // Reject block ISEQs to avoid autosplat and other block parameter complications.
+            if captured_opnd.is_none() && unsafe { rb_simple_iseq_p(iseq) } && ci_flags & VM_CALL_ARGS_SIMPLE != 0 {
+                let ep_offset = unsafe { *rb_iseq_pc_at_idx(iseq, 1) }.as_u32();
+                let local_idx = ep_offset_to_local_idx(iseq, ep_offset);
+                Some(IseqReturn::LocalVariable(local_idx))
+            } else {
+                None
+            }
+        }
+        YARVINSN_putnil => Some(IseqReturn::Value(Qnil)),
+        YARVINSN_putobject => Some(IseqReturn::Value(unsafe { *rb_iseq_pc_at_idx(iseq, 1) })),
+        YARVINSN_putobject_INT2FIX_0_ => Some(IseqReturn::Value(VALUE::fixnum_from_usize(0))),
+        YARVINSN_putobject_INT2FIX_1_ => Some(IseqReturn::Value(VALUE::fixnum_from_usize(1))),
+        // We don't support invokeblock for now. Such ISEQs are likely not used by blocks anyway.
+        YARVINSN_putself if captured_opnd.is_none() => Some(IseqReturn::Receiver),
+        _ => None,
+    }
 }
 
 fn gen_send_iseq(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
     iseq: *const rb_iseq_t,
@@ -5042,99 +7090,41 @@ fn gen_send_iseq(
     frame_type: u32,
     prev_ep: Option<*const VALUE>,
     cme: *const rb_callable_method_entry_t,
-    block: Option<IseqPtr>,
+    block: Option<BlockHandler>,
     flags: u32,
     argc: i32,
     captured_opnd: Option<Opnd>,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
+    // Argument count. We will change this as we gather values from
+    // sources to satisfy the callee's parameters. To help make sense
+    // of changes, note that:
+    //   - Parameters syntactically on the left have lower addresses.
+    //     For example, all the lead (required) and optional parameters
+    //     have lower addresses than the rest parameter array.
+    //   - The larger the index one passes to Assembler::stack_opnd(),
+    //     the *lower* the address.
     let mut argc = argc;
 
-    // Create a side-exit to fall back to the interpreter
-    let side_exit = get_side_exit(jit, ocb, ctx);
-
-    // When you have keyword arguments, there is an extra object that gets
-    // placed on the stack the represents a bitmap of the keywords that were not
-    // specified at the call site. We need to keep track of the fact that this
-    // value is present on the stack in order to properly set up the callee's
-    // stack pointer.
-    let doing_kw_call = unsafe { get_iseq_flags_has_kw(iseq) };
+    // Iseqs with keyword parameters have a hidden, unnamed parameter local
+    // that the callee could use to know which keywords are unspecified
+    // (see the `checkkeyword` instruction and check `ruby --dump=insn -e 'def foo(k:itself)=k'`).
+    // We always need to set up this local if the call goes through.
+    let has_kwrest = unsafe { get_iseq_flags_has_kwrest(iseq) };
+    let doing_kw_call = unsafe { get_iseq_flags_has_kw(iseq) } || has_kwrest;
     let supplying_kws = unsafe { vm_ci_flag(ci) & VM_CALL_KWARG } != 0;
+    let iseq_has_rest = unsafe { get_iseq_flags_has_rest(iseq) };
+    let iseq_has_block_param = unsafe { get_iseq_flags_has_block(iseq) };
+    let arg_setup_block = captured_opnd.is_some(); // arg_setup_type: arg_setup_block (invokeblock)
+    let kw_splat = flags & VM_CALL_KW_SPLAT != 0;
+    let splat_call = flags & VM_CALL_ARGS_SPLAT != 0;
 
-    if unsafe { vm_ci_flag(ci) } & VM_CALL_TAILCALL != 0 {
-        // We can't handle tailcalls
-        gen_counter_incr!(asm, send_iseq_tailcall);
-        return CantCompile;
-    }
-
-    // No support for callees with these parameters yet as they require allocation
-    // or complex handling.
-    if unsafe { get_iseq_flags_has_rest(iseq) } {
-        gen_counter_incr!(asm, send_iseq_has_rest);
-        return CantCompile;
-    }
-    if unsafe { get_iseq_flags_has_post(iseq) } {
-        gen_counter_incr!(asm, send_iseq_has_post);
-        return CantCompile;
-    }
-    if unsafe { get_iseq_flags_has_kwrest(iseq) } {
-        gen_counter_incr!(asm, send_iseq_has_kwrest);
-        return CantCompile;
-    }
+    let forwarding_call = unsafe { rb_get_iseq_flags_forwardable(iseq) };
 
-    // In order to handle backwards compatibility between ruby 3 and 2
-    // ruby2_keywords was introduced. It is called only on methods
-    // with splat and changes they way they handle them.
-    // We are just going to not compile these.
-    // https://www.rubydoc.info/stdlib/core/Proc:ruby2_keywords
-    if unsafe {
-        get_iseq_flags_ruby2_keywords(jit.iseq) && flags & VM_CALL_ARGS_SPLAT != 0
-    } {
-        gen_counter_incr!(asm, send_iseq_ruby2_keywords);
-        return CantCompile;
-    }
+    // For computing offsets to callee locals
+    let num_params = unsafe { get_iseq_body_param_size(iseq) as i32 };
+    let num_locals = unsafe { get_iseq_body_local_table_size(iseq) as i32 };
 
-    // If we have keyword arguments being passed to a callee that only takes
-    // positionals, then we need to allocate a hash. For now we're going to
-    // call that too complex and bail.
-    if supplying_kws && !unsafe { get_iseq_flags_has_kw(iseq) } {
-        gen_counter_incr!(asm, send_iseq_has_no_kw);
-        return CantCompile;
-    }
-
-    // If we have a method accepting no kwargs (**nil), exit if we have passed
-    // it any kwargs.
-    if supplying_kws && unsafe { get_iseq_flags_accepts_no_kwarg(iseq) } {
-        gen_counter_incr!(asm, send_iseq_accepts_no_kwarg);
-        return CantCompile;
-    }
-
-    // For computing number of locals to set up for the callee
-    let mut num_params = unsafe { get_iseq_body_param_size(iseq) };
-
-    // Block parameter handling. This mirrors setup_parameters_complex().
-    if unsafe { get_iseq_flags_has_block(iseq) } {
-        if unsafe { get_iseq_body_local_iseq(iseq) == iseq } {
-            num_params -= 1;
-        } else {
-            // In this case (param.flags.has_block && local_iseq != iseq),
-            // the block argument is setup as a local variable and requires
-            // materialization (allocation). Bail.
-            gen_counter_incr!(asm, send_iseq_materialized_block);
-            return CantCompile;
-        }
-    }
-
-
-    if flags & VM_CALL_ARGS_SPLAT != 0 && flags & VM_CALL_ZSUPER != 0 {
-        // zsuper methods are super calls without any arguments.
-        // They are also marked as splat, but don't actually have an array
-        // they pull arguments from, instead we need to change to call
-        // a different method with the current stack.
-        gen_counter_incr!(asm, send_iseq_zsuper);
-        return CantCompile;
-    }
-
-    let mut start_pc_offset = 0;
+    let mut start_pc_offset: u16 = 0;
     let required_num = unsafe { get_iseq_body_param_lead_num(iseq) };
 
     // This struct represents the metadata about the caller-specified
@@ -5146,392 +7136,608 @@ fn gen_send_iseq(
         unsafe { get_cikw_keyword_len(kw_arg) }
     };
 
-    // Arity handling and optional parameter setup
-    let opts_filled = argc - required_num - kw_arg_num;
+    // Arity handling and optional parameter setup for positional arguments.
+    // Splats are handled later.
+    let mut opts_filled = argc - required_num - kw_arg_num - i32::from(kw_splat) - i32::from(splat_call);
     let opt_num = unsafe { get_iseq_body_param_opt_num(iseq) };
-    let opts_missing: i32 = opt_num - opts_filled;
-
-
-    if opt_num > 0 && flags & VM_CALL_ARGS_SPLAT != 0 {
-        gen_counter_incr!(asm, send_iseq_splat_with_opt);
-        return CantCompile;
+    // With a rest parameter or a yield to a block,
+    // callers can pass more than required + optional.
+    // So we cap opts_filled at opt_num.
+    if iseq_has_rest || arg_setup_block {
+        opts_filled = min(opts_filled, opt_num);
     }
+    let mut opts_missing: i32 = opt_num - opts_filled;
 
-    if doing_kw_call && flags & VM_CALL_ARGS_SPLAT != 0 {
-        gen_counter_incr!(asm, send_iseq_splat_with_kw);
-        return CantCompile;
+    let block_arg = flags & VM_CALL_ARGS_BLOCKARG != 0;
+    // Stack index of the splat array
+    let splat_pos = i32::from(block_arg) + i32::from(kw_splat) + kw_arg_num;
+
+    exit_if_stack_too_large(iseq)?;
+    exit_if_tail_call(asm, ci)?;
+    exit_if_has_post(asm, iseq)?;
+    exit_if_kwsplat_non_nil(asm, flags, Counter::send_iseq_kw_splat_non_nil)?;
+    exit_if_has_rest_and_captured(asm, iseq_has_rest, captured_opnd)?;
+    exit_if_has_kwrest_and_captured(asm, has_kwrest, captured_opnd)?;
+    exit_if_has_rest_and_supplying_kws(asm, iseq_has_rest, supplying_kws)?;
+    exit_if_supplying_kw_and_has_no_kw(asm, supplying_kws, doing_kw_call)?;
+    exit_if_supplying_kws_and_accept_no_kwargs(asm, supplying_kws, iseq)?;
+    exit_if_doing_kw_and_splat(asm, doing_kw_call, flags)?;
+    if !forwarding_call {
+        exit_if_wrong_number_arguments(asm, arg_setup_block, opts_filled, flags, opt_num, iseq_has_rest)?;
+    }
+    exit_if_doing_kw_and_opts_missing(asm, doing_kw_call, opts_missing)?;
+    exit_if_has_rest_and_optional_and_block(asm, iseq_has_rest, opt_num, iseq, block_arg)?;
+    if forwarding_call && flags & VM_CALL_OPT_SEND != 0 {
+        gen_counter_incr(asm, Counter::send_iseq_send_forwarding);
+        return None;
+    }
+    let block_arg_type = exit_if_unsupported_block_arg_type(jit, asm, block_arg)?;
+
+    // Bail if we can't drop extra arguments for a yield by just popping them
+    if supplying_kws && arg_setup_block && argc > (kw_arg_num + required_num + opt_num) {
+        gen_counter_incr(asm, Counter::send_iseq_complex_discard_extras);
+        return None;
     }
 
-    if opts_filled < 0 && flags & VM_CALL_ARGS_SPLAT == 0  {
-        // Too few arguments and no splat to make up for it
-        gen_counter_incr!(asm, send_iseq_arity_error);
-        return CantCompile;
+    // Block parameter handling. This mirrors setup_parameters_complex().
+    if iseq_has_block_param {
+        if unsafe { get_iseq_body_local_iseq(iseq) == iseq } {
+            // Do nothing
+        } else {
+            // In this case (param.flags.has_block && local_iseq != iseq),
+            // the block argument is setup as a local variable and requires
+            // materialization (allocation). Bail.
+            gen_counter_incr(asm, Counter::send_iseq_materialized_block);
+            return None;
+        }
     }
 
-    if opts_filled > opt_num {
-        // Too many arguments
-        gen_counter_incr!(asm, send_iseq_arity_error);
-        return CantCompile;
+    // Check that required keyword arguments are supplied and find any extras
+    // that should go into the keyword rest parameter (**kw_rest).
+    if doing_kw_call {
+        gen_iseq_kw_call_checks(asm, iseq, kw_arg, has_kwrest, kw_arg_num)?;
     }
 
-    let block_arg = flags & VM_CALL_ARGS_BLOCKARG != 0;
-    let block_arg_type = if block_arg {
-        Some(ctx.get_opnd_type(StackOpnd(0)))
+    let splat_array_length = if splat_call {
+        let array = jit.peek_at_stack(&asm.ctx, splat_pos as isize);
+        let array_length = if array == Qnil {
+            0
+        } else if unsafe { !RB_TYPE_P(array, RUBY_T_ARRAY) } {
+            gen_counter_incr(asm, Counter::send_iseq_splat_not_array);
+            return None;
+        } else {
+            unsafe { rb_yjit_array_len(array) as u32}
+        };
+
+        // Arity check accounting for size of the splat. When callee has rest parameters, we insert
+        // runtime guards later in copy_splat_args_for_rest_callee()
+        if !iseq_has_rest {
+            let supplying = argc - 1 - i32::from(kw_splat) + array_length as i32;
+            if (required_num..=required_num + opt_num).contains(&supplying) == false {
+                gen_counter_incr(asm, Counter::send_iseq_splat_arity_error);
+                return None;
+            }
+        }
+
+        if iseq_has_rest && opt_num > 0 {
+            // If we have a rest and optional arguments
+            // we are going to set the pc_offset for where
+            // to jump in the called method.
+            // If the number of args changes, that would need to
+            // change and we don't change that dynamically so we side exit.
+            // On a normal splat without rest and optional args this is handled
+            // elsewhere depending on the case
+            asm_comment!(asm, "Side exit if length doesn't not equal compile time length");
+            let array_len_opnd = get_array_len(asm, asm.stack_opnd(splat_pos));
+            asm.cmp(array_len_opnd, array_length.into());
+            asm.jne(Target::side_exit(Counter::guard_send_splatarray_length_not_equal));
+        }
+
+        Some(array_length)
     } else {
         None
     };
 
-    match block_arg_type {
-        Some(Type::Nil | Type::BlockParamProxy) => {
-            // We'll handle this later
-        }
-        None => {
-            // Nothing to do
+    // Check if we need the arg0 splat handling of vm_callee_setup_block_arg()
+    // Also known as "autosplat" inside setup_parameters_complex().
+    // Autosplat checks argc == 1 after splat and kwsplat processing, so make
+    // sure to amend this if we start supporting kw_splat.
+    let block_arg0_splat = arg_setup_block
+        && (argc == 1 || (argc == 2 && splat_array_length == Some(0)))
+        && !supplying_kws && !doing_kw_call
+        && unsafe {
+            (get_iseq_flags_has_lead(iseq) || opt_num > 1)
+                && !get_iseq_flags_ambiguous_param0(iseq)
+        };
+    if block_arg0_splat {
+        // If block_arg0_splat, we still need side exits after splat, but
+        // the splat modifies the stack which breaks side exits. So bail out.
+        if splat_call {
+            gen_counter_incr(asm, Counter::invokeblock_iseq_arg0_args_splat);
+            return None;
         }
-        _ => {
-            gen_counter_incr!(asm, send_block_arg);
-            return CantCompile;
+        // The block_arg0_splat implementation cannot deal with optional parameters.
+        // This is a setup_parameters_complex() situation and interacts with the
+        // starting position of the callee.
+        if opt_num > 1 {
+            gen_counter_incr(asm, Counter::invokeblock_iseq_arg0_optional);
+            return None;
         }
     }
 
-    // If we have unfilled optional arguments and keyword arguments then we
-    // would need to adjust the arguments location to account for that.
-    // For now we aren't handling this case.
-    if doing_kw_call && opts_missing > 0 {
-        gen_counter_incr!(asm, send_iseq_missing_optional_kw);
-        return CantCompile;
+    // Adjust `opts_filled` and `opts_missing` taking
+    // into account the size of the splat expansion.
+    if let Some(len) = splat_array_length {
+        assert_eq!(kw_arg_num, 0); // Due to exit_if_doing_kw_and_splat().
+                                   // Simplifies calculation below.
+        let num_args = argc - 1 - i32::from(kw_splat) + len as i32;
+
+        opts_filled = if num_args >= required_num {
+            min(num_args - required_num, opt_num)
+        } else {
+            0
+        };
+        opts_missing = opt_num - opts_filled;
     }
 
+    assert_eq!(opts_missing + opts_filled, opt_num);
+    assert!(opts_filled >= 0);
+
+    // ISeq with optional parameters start at different
+    // locations depending on the number of optionals given.
     if opt_num > 0 {
-        num_params -= opts_missing as u32;
+        assert!(opts_filled >= 0);
         unsafe {
             let opt_table = get_iseq_body_param_opt_table(iseq);
-            start_pc_offset = (*opt_table.offset(opts_filled as isize)).as_u32();
+            start_pc_offset = opt_table.offset(opts_filled as isize).read().try_into().unwrap();
         }
     }
 
-    if doing_kw_call {
-        // Here we're calling a method with keyword arguments and specifying
-        // keyword arguments at this call site.
-
-        // This struct represents the metadata about the callee-specified
-        // keyword parameters.
-        let keyword = unsafe { get_iseq_body_param_keyword(iseq) };
-        let keyword_num: usize = unsafe { (*keyword).num }.try_into().unwrap();
-        let keyword_required_num: usize = unsafe { (*keyword).required_num }.try_into().unwrap();
-
-        let mut required_kwargs_filled = 0;
-
-        if keyword_num > 30 {
-            // We have so many keywords that (1 << num) encoded as a FIXNUM
-            // (which shifts it left one more) no longer fits inside a 32-bit
-            // immediate.
-            gen_counter_incr!(asm, send_iseq_too_many_kwargs);
-            return CantCompile;
-        }
-
-        // Check that the kwargs being passed are valid
-        if supplying_kws {
-            // This is the list of keyword arguments that the callee specified
-            // in its initial declaration.
-            // SAFETY: see compile.c for sizing of this slice.
-            let callee_kwargs = unsafe { slice::from_raw_parts((*keyword).table, keyword_num) };
-
-            // Here we're going to build up a list of the IDs that correspond to
-            // the caller-specified keyword arguments. If they're not in the
-            // same order as the order specified in the callee declaration, then
-            // we're going to need to generate some code to swap values around
-            // on the stack.
-            let kw_arg_keyword_len: usize =
-                unsafe { get_cikw_keyword_len(kw_arg) }.try_into().unwrap();
-            let mut caller_kwargs: Vec<ID> = vec![0; kw_arg_keyword_len];
-            for kwarg_idx in 0..kw_arg_keyword_len {
-                let sym = unsafe { get_cikw_keywords_idx(kw_arg, kwarg_idx.try_into().unwrap()) };
-                caller_kwargs[kwarg_idx] = unsafe { rb_sym2id(sym) };
-            }
+    // Increment total ISEQ send count
+    gen_counter_incr(asm, Counter::num_send_iseq);
 
-            // First, we're going to be sure that the names of every
-            // caller-specified keyword argument correspond to a name in the
-            // list of callee-specified keyword parameters.
-            for caller_kwarg in caller_kwargs {
-                let search_result = callee_kwargs
-                    .iter()
-                    .enumerate() // inject element index
-                    .find(|(_, &kwarg)| kwarg == caller_kwarg);
-
-                match search_result {
-                    None => {
-                        // If the keyword was never found, then we know we have a
-                        // mismatch in the names of the keyword arguments, so we need to
-                        // bail.
-                        gen_counter_incr!(asm, send_iseq_kwargs_mismatch);
-                        return CantCompile;
-                    }
-                    Some((callee_idx, _)) if callee_idx < keyword_required_num => {
-                        // Keep a count to ensure all required kwargs are specified
-                        required_kwargs_filled += 1;
-                    }
-                    _ => (),
+    // Shortcut for special `Primitive.attr! :leaf` builtins
+    let builtin_attrs = unsafe { rb_yjit_iseq_builtin_attrs(iseq) };
+    let builtin_func_raw = unsafe { rb_yjit_builtin_function(iseq) };
+    let builtin_func = if builtin_func_raw.is_null() { None } else { Some(builtin_func_raw) };
+    let opt_send_call = flags & VM_CALL_OPT_SEND != 0; // .send call is not currently supported for builtins
+    if let (None, Some(builtin_info), true, false, None | Some(0)) =
+           (block, builtin_func, builtin_attrs & BUILTIN_ATTR_LEAF != 0, opt_send_call, splat_array_length) {
+        let builtin_argc = unsafe { (*builtin_info).argc };
+        if builtin_argc + 1 < (C_ARG_OPNDS.len() as i32) {
+            // We pop the block arg without using it because:
+            //  - the builtin is leaf, so it promises to not `yield`.
+            //  - no leaf builtins have block param at the time of writing, and
+            //    adding one requires interpreter changes to support.
+            if block_arg_type.is_some() {
+                if iseq_has_block_param {
+                    gen_counter_incr(asm, Counter::send_iseq_leaf_builtin_block_arg_block_param);
+                    return None;
                 }
+                asm.stack_pop(1);
             }
-        }
-        assert!(required_kwargs_filled <= keyword_required_num);
-        if required_kwargs_filled != keyword_required_num {
-            gen_counter_incr!(asm, send_iseq_kwargs_mismatch);
-            return CantCompile;
-        }
-    }
-
-    // Number of locals that are not parameters
-    let num_locals = unsafe { get_iseq_body_local_table_size(iseq) as i32 } - (num_params as i32);
-
-    // Check for interrupts
-    gen_check_ints(asm, side_exit);
-
-    match block_arg_type {
-        Some(Type::Nil) => {
-            // We have a nil block arg, so let's pop it off the args
-            ctx.stack_pop(1);
-        }
-        Some(Type::BlockParamProxy) => {
-            // We don't need the actual stack value
-            ctx.stack_pop(1);
-        }
-        None => {
-            // Nothing to do
-        }
-        _ => {
-            assert!(false);
-        }
-    }
 
-    let leaf_builtin_raw = unsafe { rb_leaf_builtin_function(iseq) };
-    let leaf_builtin: Option<*const rb_builtin_function> = if leaf_builtin_raw.is_null() {
-        None
-    } else {
-        Some(leaf_builtin_raw)
-    };
-    if let (None, Some(builtin_info)) = (block, leaf_builtin) {
+            // Pop empty kw_splat hash which passes nothing (exit_if_kwsplat_non_nil())
+            if kw_splat {
+                asm.stack_pop(1);
+            }
 
-        // this is a .send call not currently supported for builtins
-        if flags & VM_CALL_OPT_SEND != 0 {
-            gen_counter_incr!(asm, send_send_builtin);
-            return CantCompile;
-        }
+            // Pop empty splat array which passes nothing
+            if let Some(0) = splat_array_length {
+                asm.stack_pop(1);
+            }
 
-        let builtin_argc = unsafe { (*builtin_info).argc };
-        if builtin_argc + 1 < (C_ARG_OPNDS.len() as i32) {
-            asm.comment("inlined leaf builtin");
+            asm_comment!(asm, "inlined leaf builtin");
+            gen_counter_incr(asm, Counter::num_send_iseq_leaf);
 
-            // Save the PC and SP because the callee may allocate
-            // e.g. Integer#abs on a bignum
-            jit_prepare_routine_call(jit, ctx, asm);
+            // The callee may allocate, e.g. Integer#abs on a Bignum.
+            // Save SP for GC, save PC for allocation tracing, and prepare
+            // for global invalidation after GC's VM lock contention.
+            jit_prepare_call_with_gc(jit, asm);
 
             // Call the builtin func (ec, recv, arg1, arg2, ...)
             let mut args = vec![EC];
 
             // Copy self and arguments
             for i in 0..=builtin_argc {
-                let stack_opnd = ctx.stack_opnd(builtin_argc - i);
+                let stack_opnd = asm.stack_opnd(builtin_argc - i);
                 args.push(stack_opnd);
             }
-            ctx.stack_pop((builtin_argc + 1).try_into().unwrap());
             let val = asm.ccall(unsafe { (*builtin_info).func_ptr as *const u8 }, args);
+            asm.stack_pop((builtin_argc + 1).try_into().unwrap()); // Keep them on stack during ccall for GC
 
             // Push the return value
-            let stack_ret = ctx.stack_push(Type::Unknown);
+            let stack_ret = asm.stack_push(Type::Unknown);
             asm.mov(stack_ret, val);
 
             // Note: assuming that the leaf builtin doesn't change local variables here.
             // Seems like a safe assumption.
 
-            return KeepCompiling;
+            // Let guard chains share the same successor
+            jump_to_next_insn(jit, asm, ocb);
+            return Some(EndBlock);
         }
     }
 
+    // Inline simple ISEQs whose return value is known at compile time
+    if let (Some(value), None, false) = (iseq_get_return_value(iseq, captured_opnd, flags), block_arg_type, opt_send_call) {
+        asm_comment!(asm, "inlined simple ISEQ");
+        gen_counter_incr(asm, Counter::num_send_iseq_inline);
+
+        match value {
+            IseqReturn::LocalVariable(local_idx) => {
+                // Put the local variable at the return slot
+                let stack_local = asm.stack_opnd(argc - 1 - local_idx as i32);
+                let stack_return = asm.stack_opnd(argc);
+                asm.mov(stack_return, stack_local);
+
+                // Update the mapping for the return value
+                let mapping = asm.ctx.get_opnd_mapping(stack_local.into());
+                asm.ctx.set_opnd_mapping(stack_return.into(), mapping);
+
+                // Pop everything but the return value
+                asm.stack_pop(argc as usize);
+            }
+            IseqReturn::Value(value) => {
+                // Pop receiver and arguments
+                asm.stack_pop(argc as usize + if captured_opnd.is_some() { 0 } else { 1 });
+
+                // Push the return value
+                let stack_ret = asm.stack_push(Type::from(value));
+                asm.mov(stack_ret, value.into());
+            },
+            IseqReturn::Receiver => {
+                // Just pop arguments and leave the receiver on stack
+                asm.stack_pop(argc as usize);
+            }
+        }
+
+        // Let guard chains share the same successor
+        jump_to_next_insn(jit, asm, ocb);
+        return Some(EndBlock);
+    }
+
     // Stack overflow check
     // Note that vm_push_frame checks it against a decremented cfp, hence the multiply by 2.
     // #define CHECK_VM_STACK_OVERFLOW0(cfp, sp, margin)
-    asm.comment("stack overflow check");
+    asm_comment!(asm, "stack overflow check");
+    const _: () = assert!(RUBY_SIZEOF_CONTROL_FRAME % SIZEOF_VALUE == 0, "sizeof(rb_control_frame_t) is a multiple of sizeof(VALUE)");
     let stack_max: i32 = unsafe { get_iseq_body_stack_max(iseq) }.try_into().unwrap();
-    let locals_offs =
-        SIZEOF_VALUE_I32 * (num_locals + stack_max) + 2 * (RUBY_SIZEOF_CONTROL_FRAME as i32);
-    let stack_limit = asm.lea(ctx.sp_opnd(locals_offs as isize));
+    let locals_offs = (num_locals + stack_max) + 2 * (RUBY_SIZEOF_CONTROL_FRAME / SIZEOF_VALUE) as i32;
+    let stack_limit = asm.lea(asm.ctx.sp_opnd(locals_offs));
     asm.cmp(CFP, stack_limit);
-    asm.jbe(counted_exit!(ocb, side_exit, send_se_cf_overflow));
-
-    // push_splat_args does stack manipulation so we can no longer side exit
-    if flags & VM_CALL_ARGS_SPLAT != 0 {
-        let required_args = num_params - (argc as u32 - 1);
-        // We are going to assume that the splat fills
-        // all the remaining arguments. In the generated code
-        // we test if this is true and if not side exit.
-        argc = num_params as i32;
-        push_splat_args(required_args, ctx, asm, ocb, side_exit)
-    }
+    asm.jbe(Target::side_exit(Counter::guard_send_se_cf_overflow));
+
+    if iseq_has_rest && splat_call {
+        // Insert length guard for a call to copy_splat_args_for_rest_callee()
+        // that will come later. We will have made changes to
+        // the stack by spilling or handling __send__ shifting
+        // by the time we get to that code, so we need the
+        // guard here where we can still side exit.
+        let non_rest_arg_count = argc - i32::from(kw_splat) - 1;
+        if non_rest_arg_count < required_num + opt_num {
+            let take_count: u32 = (required_num - non_rest_arg_count + opts_filled)
+                .try_into().unwrap();
+
+            if take_count > 0 {
+                asm_comment!(asm, "guard splat_array_length >= {take_count}");
+
+                let splat_array = asm.stack_opnd(splat_pos);
+                let array_len_opnd = get_array_len(asm, splat_array);
+                asm.cmp(array_len_opnd, take_count.into());
+                asm.jl(Target::side_exit(Counter::guard_send_iseq_has_rest_and_splat_too_few));
+            }
+        }
 
-    // This is a .send call and we need to adjust the stack
-    if flags & VM_CALL_OPT_SEND != 0 {
-        handle_opt_send_shift_stack(asm, argc, ctx);
+        // All splats need to guard for ruby2_keywords hash. Check with a function call when
+        // splatting into a rest param since the index for the last item in the array is dynamic.
+        asm_comment!(asm, "guard no ruby2_keywords hash in splat");
+        let bad_splat = asm.ccall(rb_yjit_ruby2_keywords_splat_p as _, vec![asm.stack_opnd(splat_pos)]);
+        asm.cmp(bad_splat, 0.into());
+        asm.jnz(Target::side_exit(Counter::guard_send_splatarray_last_ruby2_keywords));
     }
 
-    if doing_kw_call {
-        // Here we're calling a method with keyword arguments and specifying
-        // keyword arguments at this call site.
+    match block_arg_type {
+        Some(BlockArg::Nil) => {
+            // We have a nil block arg, so let's pop it off the args
+            asm.stack_pop(1);
+        }
+        Some(BlockArg::BlockParamProxy) => {
+            // We don't need the actual stack value
+            asm.stack_pop(1);
+        }
+        Some(BlockArg::TProc) => {
+            // Place the proc as the block handler. We do this early because
+            // the block arg being at the top of the stack gets in the way of
+            // rest param handling later. Also, since there are C calls that
+            // come later, we can't hold this value in a register and place it
+            // near the end when we push a new control frame.
+            asm_comment!(asm, "guard block arg is a proc");
+            // Simple predicate, no need for jit_prepare_non_leaf_call().
+            let is_proc = asm.ccall(rb_obj_is_proc as _, vec![asm.stack_opnd(0)]);
+            asm.cmp(is_proc, Qfalse.into());
+            jit_chain_guard(
+                JCC_JE,
+                jit,
+                asm,
+                ocb,
+                SEND_MAX_DEPTH,
+                Counter::guard_send_block_arg_type,
+            );
 
-        // Number of positional arguments the callee expects before the first
-        // keyword argument
-        let args_before_kw = required_num + opt_num;
+            let callee_ep = -argc + num_locals + VM_ENV_DATA_SIZE as i32 - 1;
+            let callee_specval = callee_ep + VM_ENV_DATA_INDEX_SPECVAL;
+            if callee_specval < 0 {
+                // Can't write to sp[-n] since that's where the arguments are
+                gen_counter_incr(asm, Counter::send_iseq_clobbering_block_arg);
+                return None;
+            }
+            let proc = asm.stack_pop(1); // Pop first, as argc doesn't account for the block arg
+            let callee_specval = asm.ctx.sp_opnd(callee_specval);
+            asm.store(callee_specval, proc);
+        }
+        None => {
+            // Nothing to do
+        }
+    }
 
-        // This struct represents the metadata about the caller-specified
-        // keyword arguments.
-        let ci_kwarg = unsafe { vm_ci_kwarg(ci) };
-        let caller_keyword_len: usize = if ci_kwarg.is_null() {
-            0
-        } else {
-            unsafe { get_cikw_keyword_len(ci_kwarg) }
-                .try_into()
-                .unwrap()
-        };
+    if kw_splat {
+        // Only `**nil` is supported right now. Checked in exit_if_kwsplat_non_nil()
+        assert_eq!(Type::Nil, asm.ctx.get_opnd_type(StackOpnd(0)));
+        asm.stack_pop(1);
+        argc -= 1;
+    }
 
-        // This struct represents the metadata about the callee-specified
-        // keyword parameters.
-        let keyword = unsafe { get_iseq_body_param_keyword(iseq) };
+    // push_splat_args does stack manipulation so we can no longer side exit
+    if let Some(array_length) = splat_array_length {
+        if !iseq_has_rest {
+            // Speculate that future splats will be done with
+            // an array that has the same length. We will insert guards.
+            argc = argc - 1 + array_length as i32;
+            if argc + asm.ctx.get_stack_size() as i32 > MAX_SPLAT_LENGTH {
+                gen_counter_incr(asm, Counter::send_splat_too_long);
+                return None;
+            }
+            push_splat_args(array_length, asm);
+        }
+    }
 
-        asm.comment("keyword args");
+    // This is a .send call and we need to adjust the stack
+    // TODO: This can be more efficient if we do it before
+    //       extracting from the splat array above.
+    if flags & VM_CALL_OPT_SEND != 0 {
+        handle_opt_send_shift_stack(asm, argc);
+    }
+
+    if iseq_has_rest {
+        // We are going to allocate so setting pc and sp.
+        jit_save_pc(jit, asm);
+        gen_save_sp(asm);
+
+        let rest_param_array = if splat_call {
+            let non_rest_arg_count = argc - 1;
+            // We start by dupping the array because someone else might have
+            // a reference to it. This also normalizes to an ::Array instance.
+            let array = asm.stack_opnd(0);
+            let array = asm.ccall(
+                rb_ary_dup as *const u8,
+                vec![array],
+            );
+            asm.stack_pop(1); // Pop array after ccall to use a register for passing it.
+
+            // This is the end stack state of all `non_rest_arg_count` situations below
+            argc = required_num + opts_filled;
+
+            if non_rest_arg_count > required_num + opt_num {
+                // If we have more arguments than required, we need to prepend
+                // the items from the stack onto the array.
+                let diff: u32 = (non_rest_arg_count - (required_num + opt_num))
+                    .try_into().unwrap();
+
+                // diff is >0 so no need to worry about null pointer
+                asm_comment!(asm, "load pointer to array elements");
+                let values_opnd = asm.ctx.sp_opnd(-(diff as i32));
+                let values_ptr = asm.lea(values_opnd);
+
+                asm_comment!(asm, "prepend stack values to rest array");
+                let array = asm.ccall(
+                    rb_ary_unshift_m as *const u8,
+                    vec![Opnd::UImm(diff as u64), values_ptr, array],
+                );
+                asm.stack_pop(diff as usize);
 
-        // This is the list of keyword arguments that the callee specified
-        // in its initial declaration.
-        let callee_kwargs = unsafe { (*keyword).table };
-        let total_kwargs: usize = unsafe { (*keyword).num }.try_into().unwrap();
+                array
+            } else if non_rest_arg_count < required_num + opt_num {
+                // If we have fewer arguments than required, we need to take some
+                // from the array and move them to the stack.
+                asm_comment!(asm, "take items from splat array");
 
-        // Here we're going to build up a list of the IDs that correspond to
-        // the caller-specified keyword arguments. If they're not in the
-        // same order as the order specified in the callee declaration, then
-        // we're going to need to generate some code to swap values around
-        // on the stack.
-        let mut caller_kwargs: Vec<ID> = vec![0; total_kwargs];
+                let take_count: u32 = (required_num - non_rest_arg_count + opts_filled)
+                    .try_into().unwrap();
 
-        for kwarg_idx in 0..caller_keyword_len {
-            let sym = unsafe { get_cikw_keywords_idx(ci_kwarg, kwarg_idx.try_into().unwrap()) };
-            caller_kwargs[kwarg_idx] = unsafe { rb_sym2id(sym) };
-        }
-        let mut kwarg_idx = caller_keyword_len;
+                // Copy required arguments to the stack without modifying the array
+                copy_splat_args_for_rest_callee(array, take_count, asm);
 
-        let mut unspecified_bits = 0;
+                // We will now slice the array to give us a new array of the correct size
+                let sliced = asm.ccall(rb_yjit_rb_ary_subseq_length as *const u8, vec![array, Opnd::UImm(take_count.into())]);
 
-        let keyword_required_num: usize = unsafe { (*keyword).required_num }.try_into().unwrap();
-        for callee_idx in keyword_required_num..total_kwargs {
-            let mut already_passed = false;
-            let callee_kwarg = unsafe { *(callee_kwargs.offset(callee_idx.try_into().unwrap())) };
+                sliced
+            } else {
+                // The arguments are equal so we can just push to the stack
+                asm_comment!(asm, "same length for splat array and rest param");
+                assert!(non_rest_arg_count == required_num + opt_num);
 
-            for caller_idx in 0..caller_keyword_len {
-                if caller_kwargs[caller_idx] == callee_kwarg {
-                    already_passed = true;
-                    break;
-                }
+                array
             }
+        } else {
+            asm_comment!(asm, "rest parameter without splat");
+
+            assert!(argc >= required_num);
+            let n = (argc - required_num - opts_filled) as u32;
+            argc = required_num + opts_filled;
+            // If n is 0, then elts is never going to be read, so we can just pass null
+            let values_ptr = if n == 0 {
+                Opnd::UImm(0)
+            } else {
+                asm_comment!(asm, "load pointer to array elements");
+                let values_opnd = asm.ctx.sp_opnd(-(n as i32));
+                asm.lea(values_opnd)
+            };
 
-            if !already_passed {
-                // Reserve space on the stack for each default value we'll be
-                // filling in (which is done in the next loop). Also increments
-                // argc so that the callee's SP is recorded correctly.
-                argc += 1;
-                let default_arg = ctx.stack_push(Type::Unknown);
-
-                // callee_idx - keyword->required_num is used in a couple of places below.
-                let req_num: isize = unsafe { (*keyword).required_num }.try_into().unwrap();
-                let callee_idx_isize: isize = callee_idx.try_into().unwrap();
-                let extra_args = callee_idx_isize - req_num;
-
-                //VALUE default_value = keyword->default_values[callee_idx - keyword->required_num];
-                let mut default_value = unsafe { *((*keyword).default_values.offset(extra_args)) };
-
-                if default_value == Qundef {
-                    // Qundef means that this value is not constant and must be
-                    // recalculated at runtime, so we record it in unspecified_bits
-                    // (Qnil is then used as a placeholder instead of Qundef).
-                    unspecified_bits |= 0x01 << extra_args;
-                    default_value = Qnil;
-                }
+            let new_ary = asm.ccall(
+                rb_ec_ary_new_from_values as *const u8,
+                vec![
+                    EC,
+                    Opnd::UImm(n.into()),
+                    values_ptr
+                ]
+            );
+            asm.stack_pop(n.as_usize());
 
-                asm.mov(default_arg, default_value.into());
+            new_ary
+        };
 
-                caller_kwargs[kwarg_idx] = callee_kwarg;
-                kwarg_idx += 1;
-            }
+        // Find where to put the rest parameter array
+        let rest_param = if opts_missing == 0 {
+            // All optionals are filled, the rest param goes at the top of the stack
+            argc += 1;
+            asm.stack_push(Type::TArray)
+        } else {
+            // The top of the stack will be a missing optional, but the rest
+            // parameter needs to be placed after all the missing optionals.
+            // Place it using a stack operand with a negative stack index.
+            // (Negative stack indexes of higher magnitude have higher addresses.)
+            assert!(opts_missing > 0);
+            // The argument deepest in the stack will be the 0th local in the callee.
+            let callee_locals_base = argc - 1;
+            let rest_param_stack_idx = callee_locals_base - required_num - opt_num;
+            assert!(rest_param_stack_idx < 0);
+            asm.stack_opnd(rest_param_stack_idx)
+        };
+        // Store rest param to memory to avoid register shuffle as
+        // we won't be reading it for the remainder of the block.
+        asm.ctx.dealloc_temp_reg(rest_param.stack_idx());
+        asm.store(rest_param, rest_param_array);
+    }
+
+    // Pop surplus positional arguments when yielding
+    if arg_setup_block {
+        let extras = argc - required_num - opt_num;
+        if extras > 0 {
+            // Checked earlier. If there are keyword args, then
+            // the positional arguments are not at the stack top.
+            assert_eq!(0, kw_arg_num);
+
+            asm.stack_pop(extras as usize);
+            argc = required_num + opt_num;
         }
+    }
 
-        assert!(kwarg_idx == total_kwargs);
+    // Keyword argument passing
+    if doing_kw_call {
+        argc = gen_iseq_kw_call(jit, asm, kw_arg, iseq, argc, has_kwrest);
+    }
+
+    // Same as vm_callee_setup_block_arg_arg0_check and vm_callee_setup_block_arg_arg0_splat
+    // on vm_callee_setup_block_arg for arg_setup_block. This is done after CALLER_SETUP_ARG
+    // and CALLER_REMOVE_EMPTY_KW_SPLAT, so this implementation is put here. This may need
+    // side exits, so you still need to allow side exits here if block_arg0_splat is true.
+    // Note that you can't have side exits after this arg0 splat.
+    if block_arg0_splat {
+        let arg0_opnd = asm.stack_opnd(0);
+
+        // Only handle the case that you don't need to_ary conversion
+        let not_array_counter = Counter::invokeblock_iseq_arg0_not_array;
+        guard_object_is_array(asm, arg0_opnd, arg0_opnd.into(), not_array_counter);
+
+        // Only handle the case that the array length == ISEQ's lead_num (most common)
+        let arg0_len_opnd = get_array_len(asm, arg0_opnd);
+        let lead_num = unsafe { rb_get_iseq_body_param_lead_num(iseq) };
+        asm.cmp(arg0_len_opnd, lead_num.into());
+        asm.jne(Target::side_exit(Counter::invokeblock_iseq_arg0_wrong_len));
+
+        let arg0_reg = asm.load(arg0_opnd);
+        let array_opnd = get_array_ptr(asm, arg0_reg);
+        asm_comment!(asm, "push splat arg0 onto the stack");
+        asm.stack_pop(argc.try_into().unwrap());
+        for i in 0..lead_num {
+            let stack_opnd = asm.stack_push(Type::Unknown);
+            asm.mov(stack_opnd, Opnd::mem(64, array_opnd, SIZEOF_VALUE_I32 * i));
+        }
+        argc = lead_num;
+    }
 
-        // Next, we're going to loop through every keyword that was
-        // specified by the caller and make sure that it's in the correct
-        // place. If it's not we're going to swap it around with another one.
-        for kwarg_idx in 0..total_kwargs {
-            let kwarg_idx_isize: isize = kwarg_idx.try_into().unwrap();
-            let callee_kwarg = unsafe { *(callee_kwargs.offset(kwarg_idx_isize)) };
+    fn nil_fill(comment: &'static str, fill_range: std::ops::Range<i32>, asm: &mut Assembler) {
+        if fill_range.is_empty() {
+            return;
+        }
 
-            // If the argument is already in the right order, then we don't
-            // need to generate any code since the expected value is already
-            // in the right place on the stack.
-            if callee_kwarg == caller_kwargs[kwarg_idx] {
-                continue;
-            }
+        asm_comment!(asm, "{}", comment);
+        for i in fill_range {
+            let value_slot = asm.ctx.sp_opnd(i);
+            asm.store(value_slot, Qnil.into());
+        }
+    }
 
-            // In this case the argument is not in the right place, so we
-            // need to find its position where it _should_ be and swap with
-            // that location.
-            for swap_idx in (kwarg_idx + 1)..total_kwargs {
-                if callee_kwarg == caller_kwargs[swap_idx] {
-                    // First we're going to generate the code that is going
-                    // to perform the actual swapping at runtime.
-                    let swap_idx_i32: i32 = swap_idx.try_into().unwrap();
-                    let kwarg_idx_i32: i32 = kwarg_idx.try_into().unwrap();
-                    let offset0: u16 = (argc - 1 - swap_idx_i32 - args_before_kw)
-                        .try_into()
-                        .unwrap();
-                    let offset1: u16 = (argc - 1 - kwarg_idx_i32 - args_before_kw)
-                        .try_into()
-                        .unwrap();
-                    stack_swap(jit, ctx, asm, offset0, offset1);
-
-                    // Next we're going to do some bookkeeping on our end so
-                    // that we know the order that the arguments are
-                    // actually in now.
-                    caller_kwargs.swap(kwarg_idx, swap_idx);
+    if !forwarding_call {
+        // Nil-initialize missing optional parameters
+        nil_fill(
+            "nil-initialize missing optionals",
+            {
+                let begin = -argc + required_num + opts_filled;
+                let end   = -argc + required_num + opt_num;
 
-                    break;
-                }
-            }
+                begin..end
+            },
+            asm
+        );
+        // Nil-initialize the block parameter. It's the last parameter local
+        if iseq_has_block_param {
+            let block_param = asm.ctx.sp_opnd(-argc + num_params - 1);
+            asm.store(block_param, Qnil.into());
         }
+        // Nil-initialize non-parameter locals
+        nil_fill(
+            "nil-initialize locals",
+        {
+            let begin = -argc + num_params;
+            let end   = -argc + num_locals;
 
-        // Keyword arguments cause a special extra local variable to be
-        // pushed onto the stack that represents the parameters that weren't
-        // explicitly given a value and have a non-constant default.
-        let unspec_opnd = VALUE::fixnum_from_usize(unspecified_bits).as_u64();
-        asm.mov(ctx.stack_opnd(-1), unspec_opnd.into());
+            begin..end
+        },
+        asm
+        );
+    }
+
+    if forwarding_call {
+        assert_eq!(1, num_params);
+        asm.mov(asm.stack_opnd(-1), VALUE(ci as usize).into());
     }
 
     // Points to the receiver operand on the stack unless a captured environment is used
     let recv = match captured_opnd {
         Some(captured_opnd) => asm.load(Opnd::mem(64, captured_opnd, 0)), // captured->self
-        _ => ctx.stack_opnd(argc),
+        _ => asm.stack_opnd(argc),
     };
     let captured_self = captured_opnd.is_some();
-    let sp_offset = (argc as isize) + if captured_self { 0 } else { 1 };
+    let sp_offset = argc + if captured_self { 0 } else { 1 };
 
     // Store the updated SP on the current frame (pop arguments and receiver)
-    asm.comment("store caller sp");
-    let caller_sp = asm.lea(ctx.sp_opnd((SIZEOF_VALUE as isize) * -sp_offset));
+    asm_comment!(asm, "store caller sp");
+    let caller_sp = asm.lea(asm.ctx.sp_opnd(-sp_offset));
     asm.store(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP), caller_sp);
 
     // Store the next PC in the current frame
     jit_save_pc(jit, asm);
 
     // Adjust the callee's stack pointer
-    let offs =
-        (SIZEOF_VALUE as isize) * (3 + (num_locals as isize) + if doing_kw_call { 1 } else { 0 });
-    let callee_sp = asm.lea(ctx.sp_opnd(offs));
+    let callee_sp = if forwarding_call {
+        let offs = num_locals + VM_ENV_DATA_SIZE as i32;
+        asm.lea(asm.ctx.sp_opnd(offs))
+    } else {
+        let offs = -argc + num_locals + VM_ENV_DATA_SIZE as i32;
+        asm.lea(asm.ctx.sp_opnd(offs))
+    };
 
     let specval = if let Some(prev_ep) = prev_ep {
         // We've already side-exited if the callee expects a block, so we
@@ -5540,16 +7746,16 @@ fn gen_send_iseq(
     } else if let Some(captured_opnd) = captured_opnd {
         let ep_opnd = asm.load(Opnd::mem(64, captured_opnd, SIZEOF_VALUE_I32)); // captured->ep
         SpecVal::PrevEPOpnd(ep_opnd)
-    } else if block_arg_type == Some(Type::BlockParamProxy) {
-        SpecVal::BlockParamProxy
-    } else if let Some(block_val) = block {
-        SpecVal::BlockISeq(block_val)
+    } else if let Some(BlockArg::TProc) = block_arg_type {
+        SpecVal::BlockHandler(Some(BlockHandler::AlreadySet))
+    } else if let Some(BlockArg::BlockParamProxy) = block_arg_type {
+        SpecVal::BlockHandler(Some(BlockHandler::BlockParamProxy))
     } else {
-        SpecVal::None
+        SpecVal::BlockHandler(block)
     };
 
     // Setup the new frame
-    gen_push_frame(jit, ctx, asm, true, ControlFrame {
+    perf_call!("gen_send_iseq: ", gen_push_frame(jit, asm, ControlFrame {
         frame_type,
         specval,
         cme,
@@ -5557,8 +7763,20 @@ fn gen_send_iseq(
         sp: callee_sp,
         iseq: Some(iseq),
         pc: None, // We are calling into jitted code, which will set the PC as necessary
-        local_size: num_locals
-    });
+    }));
+
+    // Log the name of the method we're calling to. We intentionally don't do this for inlined ISEQs.
+    // We also do this after gen_push_frame() to minimize the impact of spill_temps() on asm.ccall().
+    if get_option!(gen_stats) {
+        // Assemble the ISEQ name string
+        let name_str = get_iseq_name(iseq);
+
+        // Get an index for this ISEQ name
+        let iseq_idx = get_iseq_idx(&name_str);
+
+        // Increment the counter for this ISEQ
+        asm.ccall(incr_iseq_counter as *const u8, vec![iseq_idx.into()]);
+    }
 
     // No need to set cfp->pc since the callee sets it whenever calling into routines
     // that could look at it through jit_save_pc().
@@ -5568,37 +7786,43 @@ fn gen_send_iseq(
     // Stub so we can return to JITted code
     let return_block = BlockId {
         iseq: jit.iseq,
-        idx: jit_next_insn_idx(jit),
+        idx: jit.next_insn_idx(),
     };
 
     // Create a context for the callee
     let mut callee_ctx = Context::default();
 
+    // If the callee has :inline_block annotation and the callsite has a block ISEQ,
+    // duplicate a callee block for each block ISEQ to make its `yield` monomorphic.
+    if let (Some(BlockHandler::BlockISeq(iseq)), true) = (block, builtin_attrs & BUILTIN_ATTR_INLINE_BLOCK != 0) {
+        callee_ctx.set_inline_block(iseq);
+    }
+
     // Set the argument types in the callee's context
     for arg_idx in 0..argc {
-        let stack_offs: u16 = (argc - arg_idx - 1).try_into().unwrap();
-        let arg_type = ctx.get_opnd_type(StackOpnd(stack_offs));
+        let stack_offs: u8 = (argc - arg_idx - 1).try_into().unwrap();
+        let arg_type = asm.ctx.get_opnd_type(StackOpnd(stack_offs));
         callee_ctx.set_local_type(arg_idx.try_into().unwrap(), arg_type);
     }
 
     let recv_type = if captured_self {
         Type::Unknown // we don't track the type information of captured->self for now
     } else {
-        ctx.get_opnd_type(StackOpnd(argc.try_into().unwrap()))
+        asm.ctx.get_opnd_type(StackOpnd(argc.try_into().unwrap()))
     };
     callee_ctx.upgrade_opnd_type(SelfOpnd, recv_type);
 
     // The callee might change locals through Kernel#binding and other means.
-    ctx.clear_local_types();
+    asm.clear_local_types();
 
-    // Pop arguments and receiver in return context, push the return value
-    // After the return, sp_offset will be 1. The codegen for leave writes
-    // the return value in case of JIT-to-JIT return.
-    let mut return_ctx = ctx.clone();
-    return_ctx.stack_pop(sp_offset.try_into().unwrap());
-    return_ctx.stack_push(Type::Unknown);
-    return_ctx.set_sp_offset(1);
-    return_ctx.reset_chain_depth();
+    // Pop arguments and receiver in return context and
+    // mark it as a continuation of gen_leave()
+    let mut return_asm = Assembler::new();
+    return_asm.ctx = asm.ctx;
+    return_asm.stack_pop(sp_offset.try_into().unwrap());
+    return_asm.ctx.set_sp_offset(0); // We set SP on the caller's frame above
+    return_asm.ctx.reset_chain_depth_and_defer();
+    return_asm.ctx.set_as_return_landing();
 
     // Write the JIT return address on the callee frame
     gen_branch(
@@ -5606,14 +7830,17 @@ fn gen_send_iseq(
         asm,
         ocb,
         return_block,
-        &return_ctx,
+        &return_asm.ctx,
         None,
         None,
-        gen_return_branch,
+        BranchGenFn::JITReturn,
     );
 
-    //print_str(cb, "calling Ruby func:");
-    //print_str(cb, rb_id2name(vm_ci_mid(ci)));
+    // ec->cfp is updated after cfp->jit_return for rb_profile_frames() safety
+    asm_comment!(asm, "switch to new CFP");
+    let new_cfp = asm.sub(CFP, RUBY_SIZEOF_CONTROL_FRAME.into());
+    asm.mov(CFP, new_cfp);
+    asm.store(Opnd::mem(64, EC, RUBY_OFFSET_EC_CFP), CFP);
 
     // Directly jump to the entry point of the callee
     gen_direct_jump(
@@ -5626,24 +7853,511 @@ fn gen_send_iseq(
         asm,
     );
 
-    EndBlock
+    Some(EndBlock)
+}
+
+// Check if we can handle a keyword call
+fn gen_iseq_kw_call_checks(
+    asm: &mut Assembler,
+    iseq: *const rb_iseq_t,
+    kw_arg: *const rb_callinfo_kwarg,
+    has_kwrest: bool,
+    caller_kw_num: i32
+) -> Option<()> {
+    // This struct represents the metadata about the callee-specified
+    // keyword parameters.
+    let keyword = unsafe { get_iseq_body_param_keyword(iseq) };
+    let keyword_num: usize = unsafe { (*keyword).num }.try_into().unwrap();
+    let keyword_required_num: usize = unsafe { (*keyword).required_num }.try_into().unwrap();
+
+    let mut required_kwargs_filled = 0;
+
+    if keyword_num > 30 || caller_kw_num > 64 {
+        // We have so many keywords that (1 << num) encoded as a FIXNUM
+        // (which shifts it left one more) no longer fits inside a 32-bit
+        // immediate. Similarly, we use a u64 in case of keyword rest parameter.
+        gen_counter_incr(asm, Counter::send_iseq_too_many_kwargs);
+        return None;
+    }
+
+    // Check that the kwargs being passed are valid
+    if caller_kw_num > 0 {
+        // This is the list of keyword arguments that the callee specified
+        // in its initial declaration.
+        // SAFETY: see compile.c for sizing of this slice.
+        let callee_kwargs = if keyword_num == 0 {
+            &[]
+        } else {
+            unsafe { slice::from_raw_parts((*keyword).table, keyword_num) }
+        };
+
+        // Here we're going to build up a list of the IDs that correspond to
+        // the caller-specified keyword arguments. If they're not in the
+        // same order as the order specified in the callee declaration, then
+        // we're going to need to generate some code to swap values around
+        // on the stack.
+        let kw_arg_keyword_len = caller_kw_num as usize;
+        let mut caller_kwargs: Vec<ID> = vec![0; kw_arg_keyword_len];
+        for kwarg_idx in 0..kw_arg_keyword_len {
+            let sym = unsafe { get_cikw_keywords_idx(kw_arg, kwarg_idx.try_into().unwrap()) };
+            caller_kwargs[kwarg_idx] = unsafe { rb_sym2id(sym) };
+        }
+
+        // First, we're going to be sure that the names of every
+        // caller-specified keyword argument correspond to a name in the
+        // list of callee-specified keyword parameters.
+        for caller_kwarg in caller_kwargs {
+            let search_result = callee_kwargs
+                .iter()
+                .enumerate() // inject element index
+                .find(|(_, &kwarg)| kwarg == caller_kwarg);
+
+            match search_result {
+                None if !has_kwrest => {
+                    // If the keyword was never found, then we know we have a
+                    // mismatch in the names of the keyword arguments, so we need to
+                    // bail.
+                    gen_counter_incr(asm, Counter::send_iseq_kwargs_mismatch);
+                    return None;
+                }
+                Some((callee_idx, _)) if callee_idx < keyword_required_num => {
+                    // Keep a count to ensure all required kwargs are specified
+                    required_kwargs_filled += 1;
+                }
+                _ => (),
+            }
+        }
+    }
+    assert!(required_kwargs_filled <= keyword_required_num);
+    if required_kwargs_filled != keyword_required_num {
+        gen_counter_incr(asm, Counter::send_iseq_kwargs_mismatch);
+        return None;
+    }
+
+    Some(())
+}
+
+// Codegen for keyword argument handling. Essentially private to gen_send_iseq() since
+// there are a lot of preconditions to check before reaching this code.
+fn gen_iseq_kw_call(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    ci_kwarg: *const rb_callinfo_kwarg,
+    iseq: *const rb_iseq_t,
+    mut argc: i32,
+    has_kwrest: bool,
+) -> i32 {
+    let caller_keyword_len_i32: i32 = if ci_kwarg.is_null() {
+        0
+    } else {
+        unsafe { get_cikw_keyword_len(ci_kwarg) }
+    };
+    let caller_keyword_len: usize = caller_keyword_len_i32.try_into().unwrap();
+    let anon_kwrest = unsafe { rb_get_iseq_flags_anon_kwrest(iseq) && !get_iseq_flags_has_kw(iseq) };
+
+    // This struct represents the metadata about the callee-specified
+    // keyword parameters.
+    let keyword = unsafe { get_iseq_body_param_keyword(iseq) };
+
+    asm_comment!(asm, "keyword args");
+
+    // This is the list of keyword arguments that the callee specified
+    // in its initial declaration.
+    let callee_kwargs = unsafe { (*keyword).table };
+    let callee_kw_count_i32: i32 = unsafe { (*keyword).num };
+    let callee_kw_count: usize = callee_kw_count_i32.try_into().unwrap();
+    let keyword_required_num: usize = unsafe { (*keyword).required_num }.try_into().unwrap();
+
+    // Here we're going to build up a list of the IDs that correspond to
+    // the caller-specified keyword arguments. If they're not in the
+    // same order as the order specified in the callee declaration, then
+    // we're going to need to generate some code to swap values around
+    // on the stack.
+    let mut kwargs_order: Vec<ID> = vec![0; cmp::max(caller_keyword_len, callee_kw_count)];
+    for kwarg_idx in 0..caller_keyword_len {
+        let sym = unsafe { get_cikw_keywords_idx(ci_kwarg, kwarg_idx.try_into().unwrap()) };
+        kwargs_order[kwarg_idx] = unsafe { rb_sym2id(sym) };
+    }
+
+    let mut unspecified_bits = 0;
+
+    // The stack_opnd() index to the 0th keyword argument.
+    let kwargs_stack_base = caller_keyword_len_i32 - 1;
+
+    // Build the keyword rest parameter hash before we make any changes to the order of
+    // the supplied keyword arguments
+    let kwrest_type = if has_kwrest {
+        c_callable! {
+            fn build_kw_rest(rest_mask: u64, stack_kwargs: *const VALUE, keywords: *const rb_callinfo_kwarg) -> VALUE {
+                if keywords.is_null() {
+                    return unsafe { rb_hash_new() };
+                }
+
+                // Use the total number of supplied keywords as a size upper bound
+                let keyword_len = unsafe { (*keywords).keyword_len } as usize;
+                let hash = unsafe { rb_hash_new_with_size(keyword_len as u64) };
+
+                // Put pairs into the kwrest hash as the mask describes
+                for kwarg_idx in 0..keyword_len {
+                    if (rest_mask & (1 << kwarg_idx)) != 0 {
+                        unsafe {
+                            let keyword_symbol = (*keywords).keywords.as_ptr().add(kwarg_idx).read();
+                            let keyword_value = stack_kwargs.add(kwarg_idx).read();
+                            rb_hash_aset(hash, keyword_symbol, keyword_value);
+                        }
+                    }
+                }
+                return hash;
+            }
+        }
+
+        asm_comment!(asm, "build kwrest hash");
+
+        // Make a bit mask describing which keywords should go into kwrest.
+        let mut rest_mask: u64 = 0;
+        // Index for one argument that will go into kwrest.
+        let mut rest_collected_idx = None;
+        for (supplied_kw_idx, &supplied_kw) in kwargs_order.iter().take(caller_keyword_len).enumerate() {
+            let mut found = false;
+            for callee_idx in 0..callee_kw_count {
+                let callee_kw = unsafe { callee_kwargs.add(callee_idx).read() };
+                if callee_kw == supplied_kw {
+                    found = true;
+                    break;
+                }
+            }
+            if !found {
+                rest_mask |= 1 << supplied_kw_idx;
+                if rest_collected_idx.is_none() {
+                    rest_collected_idx = Some(supplied_kw_idx as i32);
+                }
+            }
+        }
+
+        let (kwrest, kwrest_type) = if rest_mask == 0 && anon_kwrest {
+            // In case the kwrest hash should be empty and is anonymous in the callee,
+            // we can pass nil instead of allocating. Anonymous kwrest can only be
+            // delegated, and nil is the same as an empty hash when delegating.
+            (Qnil.into(), Type::Nil)
+        } else {
+            // Save PC and SP before allocating
+            jit_save_pc(jit, asm);
+            gen_save_sp(asm);
+
+            // Build the kwrest hash. `struct rb_callinfo_kwarg` is malloc'd, so no GC concerns.
+            let kwargs_start = asm.lea(asm.ctx.sp_opnd(-caller_keyword_len_i32));
+            let hash = asm.ccall(
+                build_kw_rest as _,
+                vec![rest_mask.into(), kwargs_start, Opnd::const_ptr(ci_kwarg.cast())]
+            );
+            (hash, Type::THash)
+        };
+
+        // The kwrest parameter sits after `unspecified_bits` if the callee specifies any
+        // keywords.
+        let stack_kwrest_idx = kwargs_stack_base - callee_kw_count_i32 - i32::from(callee_kw_count > 0);
+        let stack_kwrest = asm.stack_opnd(stack_kwrest_idx);
+        // If `stack_kwrest` already has another argument there, we need to stow it elsewhere
+        // first before putting kwrest there. Use `rest_collected_idx` because that value went
+        // into kwrest so the slot is now free.
+        let kwrest_idx = callee_kw_count + usize::from(callee_kw_count > 0);
+        if let (Some(rest_collected_idx), true) = (rest_collected_idx, kwrest_idx < caller_keyword_len) {
+            let rest_collected = asm.stack_opnd(kwargs_stack_base - rest_collected_idx);
+            let mapping = asm.ctx.get_opnd_mapping(stack_kwrest.into());
+            asm.mov(rest_collected, stack_kwrest);
+            asm.ctx.set_opnd_mapping(rest_collected.into(), mapping);
+            // Update our bookkeeping to inform the reordering step later.
+            kwargs_order[rest_collected_idx as usize] = kwargs_order[kwrest_idx];
+            kwargs_order[kwrest_idx] = 0;
+        }
+        // Put kwrest straight into memory, since we might pop it later
+        asm.ctx.dealloc_temp_reg(stack_kwrest.stack_idx());
+        asm.mov(stack_kwrest, kwrest);
+        if stack_kwrest_idx >= 0 {
+            asm.ctx.set_opnd_mapping(stack_kwrest.into(), TempMapping::map_to_stack(kwrest_type));
+        }
+
+        Some(kwrest_type)
+    } else {
+        None
+    };
+
+    // Ensure the stack is large enough for the callee
+    for _ in caller_keyword_len..callee_kw_count {
+        argc += 1;
+        asm.stack_push(Type::Unknown);
+    }
+    // Now this is the stack_opnd() index to the 0th keyword argument.
+    let kwargs_stack_base = kwargs_order.len() as i32 - 1;
+
+    // Next, we're going to loop through every keyword that was
+    // specified by the caller and make sure that it's in the correct
+    // place. If it's not we're going to swap it around with another one.
+    for kwarg_idx in 0..callee_kw_count {
+        let callee_kwarg = unsafe { callee_kwargs.add(kwarg_idx).read() };
+
+        // If the argument is already in the right order, then we don't
+        // need to generate any code since the expected value is already
+        // in the right place on the stack.
+        if callee_kwarg == kwargs_order[kwarg_idx] {
+            continue;
+        }
+
+        // In this case the argument is not in the right place, so we
+        // need to find its position where it _should_ be and swap with
+        // that location.
+        for swap_idx in 0..kwargs_order.len() {
+            if callee_kwarg == kwargs_order[swap_idx] {
+                // First we're going to generate the code that is going
+                // to perform the actual swapping at runtime.
+                let swap_idx_i32: i32 = swap_idx.try_into().unwrap();
+                let kwarg_idx_i32: i32 = kwarg_idx.try_into().unwrap();
+                let offset0 = kwargs_stack_base - swap_idx_i32;
+                let offset1 = kwargs_stack_base - kwarg_idx_i32;
+                stack_swap(asm, offset0, offset1);
+
+                // Next we're going to do some bookkeeping on our end so
+                // that we know the order that the arguments are
+                // actually in now.
+                kwargs_order.swap(kwarg_idx, swap_idx);
+
+                break;
+            }
+        }
+    }
+
+    // Now that every caller specified kwarg is in the right place, filling
+    // in unspecified default parameters won't overwrite anything.
+    for kwarg_idx in keyword_required_num..callee_kw_count {
+        if kwargs_order[kwarg_idx] != unsafe { callee_kwargs.add(kwarg_idx).read() } {
+            let default_param_idx = kwarg_idx - keyword_required_num;
+            let mut default_value = unsafe { (*keyword).default_values.add(default_param_idx).read() };
+
+            if default_value == Qundef {
+                // Qundef means that this value is not constant and must be
+                // recalculated at runtime, so we record it in unspecified_bits
+                // (Qnil is then used as a placeholder instead of Qundef).
+                unspecified_bits |= 0x01 << default_param_idx;
+                default_value = Qnil;
+            }
+
+            let default_param = asm.stack_opnd(kwargs_stack_base - kwarg_idx as i32);
+            let param_type = Type::from(default_value);
+            asm.mov(default_param, default_value.into());
+            asm.ctx.set_opnd_mapping(default_param.into(), TempMapping::map_to_stack(param_type));
+        }
+    }
+
+    // Pop extra arguments that went into kwrest now that they're at stack top
+    if has_kwrest && caller_keyword_len > callee_kw_count {
+        let extra_kwarg_count = caller_keyword_len - callee_kw_count;
+        asm.stack_pop(extra_kwarg_count);
+        argc = argc - extra_kwarg_count as i32;
+    }
+
+    // Keyword arguments cause a special extra local variable to be
+    // pushed onto the stack that represents the parameters that weren't
+    // explicitly given a value and have a non-constant default.
+    if callee_kw_count > 0 {
+        let unspec_opnd = VALUE::fixnum_from_usize(unspecified_bits).as_u64();
+        let top = asm.stack_push(Type::Fixnum);
+        asm.mov(top, unspec_opnd.into());
+        argc += 1;
+    }
+
+    // The kwrest parameter sits after `unspecified_bits`
+    if let Some(kwrest_type) = kwrest_type {
+        let kwrest = asm.stack_push(kwrest_type);
+        // We put the kwrest parameter in memory earlier
+        asm.ctx.dealloc_temp_reg(kwrest.stack_idx());
+        argc += 1;
+    }
+
+    argc
+}
+
+/// This is a helper function to allow us to exit early
+/// during code generation if a predicate is true.
+/// We return Option<()> here because we will be able to
+/// short-circuit using the ? operator if we return None.
+/// It would be great if Rust let you implement ? for your
+/// own types, but as of right now it doesn't.
+fn exit_if(asm: &mut Assembler, pred: bool, counter: Counter) -> Option<()> {
+    if pred {
+        gen_counter_incr(asm, counter);
+        return None
+    }
+    Some(())
+}
+
+#[must_use]
+fn exit_if_tail_call(asm: &mut Assembler, ci: *const rb_callinfo) -> Option<()> {
+    exit_if(asm, unsafe { vm_ci_flag(ci) } & VM_CALL_TAILCALL != 0, Counter::send_iseq_tailcall)
+}
+
+#[must_use]
+fn exit_if_has_post(asm: &mut Assembler, iseq: *const rb_iseq_t) -> Option<()> {
+    exit_if(asm, unsafe { get_iseq_flags_has_post(iseq) }, Counter::send_iseq_has_post)
+}
+
+#[must_use]
+fn exit_if_kwsplat_non_nil(asm: &mut Assembler, flags: u32, counter: Counter) -> Option<()> {
+    let kw_splat = flags & VM_CALL_KW_SPLAT != 0;
+    let kw_splat_stack = StackOpnd((flags & VM_CALL_ARGS_BLOCKARG != 0).into());
+    exit_if(asm, kw_splat && asm.ctx.get_opnd_type(kw_splat_stack) != Type::Nil, counter)
+}
+
+#[must_use]
+fn exit_if_has_rest_and_captured(asm: &mut Assembler, iseq_has_rest: bool, captured_opnd: Option<Opnd>) -> Option<()> {
+    exit_if(asm, iseq_has_rest && captured_opnd.is_some(), Counter::send_iseq_has_rest_and_captured)
+}
+
+#[must_use]
+fn exit_if_has_kwrest_and_captured(asm: &mut Assembler, iseq_has_kwrest: bool, captured_opnd: Option<Opnd>) -> Option<()> {
+    // We need to call a C function to allocate the kwrest hash, but also need to hold the captured
+    // block across the call, which we can't do.
+    exit_if(asm, iseq_has_kwrest && captured_opnd.is_some(), Counter::send_iseq_has_kwrest_and_captured)
+}
+
+#[must_use]
+fn exit_if_has_rest_and_supplying_kws(asm: &mut Assembler, iseq_has_rest: bool, supplying_kws: bool) -> Option<()> {
+    // There can be a gap between the rest parameter array and the supplied keywords, or
+    // no space to put the rest array (e.g. `def foo(*arr, k:) = arr; foo(k: 1)` 1 is
+    // sitting where the rest array should be).
+    exit_if(
+        asm,
+        iseq_has_rest && supplying_kws,
+        Counter::send_iseq_has_rest_and_kw_supplied,
+    )
+}
+
+#[must_use]
+fn exit_if_supplying_kw_and_has_no_kw(asm: &mut Assembler, supplying_kws: bool, callee_kws: bool) -> Option<()> {
+    // Passing keyword arguments to a callee means allocating a hash and treating
+    // that as a positional argument. Bail for now.
+    exit_if(
+        asm,
+        supplying_kws && !callee_kws,
+        Counter::send_iseq_has_no_kw,
+    )
+}
+
+#[must_use]
+fn exit_if_supplying_kws_and_accept_no_kwargs(asm: &mut Assembler, supplying_kws: bool, iseq: *const rb_iseq_t) -> Option<()> {
+    // If we have a method accepting no kwargs (**nil), exit if we have passed
+    // it any kwargs.
+    exit_if(
+        asm,
+        supplying_kws && unsafe { get_iseq_flags_accepts_no_kwarg(iseq) },
+        Counter::send_iseq_accepts_no_kwarg
+    )
+}
+
+#[must_use]
+fn exit_if_doing_kw_and_splat(asm: &mut Assembler, doing_kw_call: bool, flags: u32) -> Option<()> {
+    exit_if(asm, doing_kw_call && flags & VM_CALL_ARGS_SPLAT != 0, Counter::send_iseq_splat_with_kw)
+}
+
+#[must_use]
+fn exit_if_wrong_number_arguments(
+    asm: &mut Assembler,
+    args_setup_block: bool,
+    opts_filled: i32,
+    flags: u32,
+    opt_num: i32,
+    iseq_has_rest: bool,
+) -> Option<()> {
+    // Too few arguments and no splat to make up for it
+    let too_few = opts_filled < 0 && flags & VM_CALL_ARGS_SPLAT == 0;
+    // Too many arguments and no sink that takes them
+    let too_many = opts_filled > opt_num && !(iseq_has_rest || args_setup_block);
+
+    exit_if(asm, too_few || too_many, Counter::send_iseq_arity_error)
+}
+
+#[must_use]
+fn exit_if_doing_kw_and_opts_missing(asm: &mut Assembler, doing_kw_call: bool, opts_missing: i32) -> Option<()> {
+    // If we have unfilled optional arguments and keyword arguments then we
+    // would need to adjust the arguments location to account for that.
+    // For now we aren't handling this case.
+    exit_if(asm, doing_kw_call && opts_missing > 0, Counter::send_iseq_missing_optional_kw)
+}
+
+#[must_use]
+fn exit_if_has_rest_and_optional_and_block(asm: &mut Assembler, iseq_has_rest: bool, opt_num: i32, iseq: *const rb_iseq_t, block_arg: bool) -> Option<()> {
+    exit_if(
+        asm,
+        iseq_has_rest && opt_num != 0 && (unsafe { get_iseq_flags_has_block(iseq) } || block_arg),
+        Counter::send_iseq_has_rest_opt_and_block
+    )
+}
+
+#[derive(Clone, Copy)]
+enum BlockArg {
+    Nil,
+    /// A special sentinel value indicating the block parameter should be read from
+    /// the current surrounding cfp
+    BlockParamProxy,
+    /// A proc object. Could be an instance of a subclass of ::rb_cProc
+    TProc,
+}
+
+#[must_use]
+fn exit_if_unsupported_block_arg_type(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    supplying_block_arg: bool
+) -> Option<Option<BlockArg>> {
+    let block_arg_type = if supplying_block_arg {
+        asm.ctx.get_opnd_type(StackOpnd(0))
+    } else {
+        // Passing no block argument
+        return Some(None);
+    };
+
+    match block_arg_type {
+        // We'll handle Nil and BlockParamProxy later
+        Type::Nil => Some(Some(BlockArg::Nil)),
+        Type::BlockParamProxy => Some(Some(BlockArg::BlockParamProxy)),
+        _ if {
+            let sample_block_arg = jit.peek_at_stack(&asm.ctx, 0);
+            unsafe { rb_obj_is_proc(sample_block_arg) }.test()
+        } => {
+            // Speculate that we'll have a proc as the block arg
+            Some(Some(BlockArg::TProc))
+        }
+        _ => {
+            gen_counter_incr(asm, Counter::send_iseq_block_arg_type);
+            None
+        }
+    }
+}
+
+#[must_use]
+fn exit_if_stack_too_large(iseq: *const rb_iseq_t) -> Option<()> {
+    let stack_max = unsafe { rb_get_iseq_body_stack_max(iseq) };
+    // Reject ISEQs with very large temp stacks,
+    // this will allow us to use u8/i8 values to track stack_size and sp_offset
+    if stack_max >= i8::MAX as u32 {
+        incr_counter!(iseq_stack_too_large);
+        return None;
+    }
+    Some(())
 }
 
 fn gen_struct_aref(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
     ci: *const rb_callinfo,
     cme: *const rb_callable_method_entry_t,
     comptime_recv: VALUE,
-    _comptime_recv_klass: VALUE,
     flags: u32,
     argc: i32,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
 
     if unsafe { vm_ci_argc(ci) } != 0 {
-        return CantCompile;
+        return None;
     }
 
     let off: i32 = unsafe { get_cme_def_body_optimized_index(cme) }
@@ -5659,13 +8373,20 @@ fn gen_struct_aref(
     {
         let native_off = (off as i64) * (SIZEOF_VALUE as i64);
         if native_off > (i32::MAX as i64) {
-            return CantCompile;
+            return None;
         }
     }
 
+    if c_method_tracing_currently_enabled(jit) {
+        // Struct accesses need to fire c_call and c_return events, which we can't support
+        // See :attr-tracing:
+        gen_counter_incr(asm, Counter::send_cfunc_tracing);
+        return None;
+    }
+
     // This is a .send call and we need to adjust the stack
     if flags & VM_CALL_OPT_SEND != 0 {
-        handle_opt_send_shift_stack(asm, argc, ctx);
+        handle_opt_send_shift_stack(asm, argc);
     }
 
     // All structs from the same Struct class should have the same
@@ -5674,9 +8395,10 @@ fn gen_struct_aref(
     // true of the converse.
     let embedded = unsafe { FL_TEST_RAW(comptime_recv, VALUE(RSTRUCT_EMBED_LEN_MASK)) };
 
-    asm.comment("struct aref");
+    asm_comment!(asm, "struct aref");
 
-    let recv = asm.load(ctx.stack_pop(1));
+    let recv = asm.stack_pop(1);
+    let recv = asm.load(recv);
 
     let val = if embedded != VALUE(0) {
         Opnd::mem(64, recv, RUBY_OFFSET_RSTRUCT_AS_ARY + (SIZEOF_VALUE_I32 * off))
@@ -5685,32 +8407,37 @@ fn gen_struct_aref(
         Opnd::mem(64, rstruct_ptr, SIZEOF_VALUE_I32 * off)
     };
 
-    let ret = ctx.stack_push(Type::Unknown);
+    let ret = asm.stack_push(Type::Unknown);
     asm.mov(ret, val);
 
-    jump_to_next_insn(jit, ctx, asm, ocb);
-    EndBlock
+    jump_to_next_insn(jit, asm, ocb);
+    Some(EndBlock)
 }
 
 fn gen_struct_aset(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
     ci: *const rb_callinfo,
     cme: *const rb_callable_method_entry_t,
     comptime_recv: VALUE,
-    _comptime_recv_klass: VALUE,
     flags: u32,
     argc: i32,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
     if unsafe { vm_ci_argc(ci) } != 1 {
-        return CantCompile;
+        return None;
+    }
+
+    if c_method_tracing_currently_enabled(jit) {
+        // Struct accesses need to fire c_call and c_return events, which we can't support
+        // See :attr-tracing:
+        gen_counter_incr(asm, Counter::send_cfunc_tracing);
+        return None;
     }
 
     // This is a .send call and we need to adjust the stack
     if flags & VM_CALL_OPT_SEND != 0 {
-        handle_opt_send_shift_stack(asm, argc, ctx);
+        handle_opt_send_shift_stack(asm, argc);
     }
 
     let off: i32 = unsafe { get_cme_def_body_optimized_index(cme) }
@@ -5721,28 +8448,77 @@ fn gen_struct_aset(
     assert!(unsafe { RB_TYPE_P(comptime_recv, RUBY_T_STRUCT) });
     assert!((off as i64) < unsafe { RSTRUCT_LEN(comptime_recv) });
 
-    asm.comment("struct aset");
+    asm_comment!(asm, "struct aset");
 
-    let val = ctx.stack_pop(1);
-    let recv = ctx.stack_pop(1);
+    let val = asm.stack_pop(1);
+    let recv = asm.stack_pop(1);
 
     let val = asm.ccall(RSTRUCT_SET as *const u8, vec![recv, (off as i64).into(), val]);
 
-    let ret = ctx.stack_push(Type::Unknown);
+    let ret = asm.stack_push(Type::Unknown);
     asm.mov(ret, val);
 
-    jump_to_next_insn(jit, ctx, asm, ocb);
-    EndBlock
+    jump_to_next_insn(jit, asm, ocb);
+    Some(EndBlock)
+}
+
+// Generate code that calls a method with dynamic dispatch
+fn gen_send_dynamic<F: Fn(&mut Assembler) -> Opnd>(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    ocb: &mut OutlinedCb,
+    cd: *const rb_call_data,
+    sp_pops: usize,
+    vm_sendish: F,
+) -> Option<CodegenStatus> {
+    // Our frame handling is not compatible with tailcall
+    if unsafe { vm_ci_flag((*cd).ci) } & VM_CALL_TAILCALL != 0 {
+        return None;
+    }
+    jit_perf_symbol_push!(jit, asm, "gen_send_dynamic", PerfMap::Codegen);
+
+    // Rewind stack_size using ctx.with_stack_size to allow stack_size changes
+    // before you return None.
+    asm.ctx = asm.ctx.with_stack_size(jit.stack_size_for_pc);
+
+    // Save PC and SP to prepare for dynamic dispatch
+    jit_prepare_non_leaf_call(jit, asm);
+
+    // Squash stack canary that might be left over from elsewhere
+    assert_eq!(false, asm.get_leaf_ccall());
+    if cfg!(debug_assertions) {
+        asm.store(asm.ctx.sp_opnd(0), 0.into());
+    }
+
+    // Dispatch a method
+    let ret = vm_sendish(asm);
+
+    // Pop arguments and a receiver
+    asm.stack_pop(sp_pops);
+
+    // Push the return value
+    let stack_ret = asm.stack_push(Type::Unknown);
+    asm.mov(stack_ret, ret);
+
+    // Fix the interpreter SP deviated by vm_sendish
+    asm.mov(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP), SP);
+
+    gen_counter_incr(asm, Counter::num_send_dynamic);
+
+    jit_perf_symbol_pop!(jit, asm, PerfMap::Codegen);
+
+    // End the current block for invalidation and sharing the same successor
+    jump_to_next_insn(jit, asm, ocb);
+    Some(EndBlock)
 }
 
 fn gen_send_general(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
     cd: *const rb_call_data,
-    block: Option<IseqPtr>,
-) -> CodegenStatus {
+    block: Option<BlockHandler>,
+) -> Option<CodegenStatus> {
     // Relevant definitions:
     // rb_execution_context_t       : vm_core.h
     // invoker, cfunc logic         : method.h, vm_method.c
@@ -5758,32 +8534,61 @@ fn gen_send_general(
     let mut mid = unsafe { vm_ci_mid(ci) };
     let mut flags = unsafe { vm_ci_flag(ci) };
 
-    // Don't JIT calls with keyword splat
-    if flags & VM_CALL_KW_SPLAT != 0 {
-        gen_counter_incr!(asm, send_kw_splat);
-        return CantCompile;
+    // Defer compilation so we can specialize on class of receiver
+    if !jit.at_current_insn() {
+        defer_compilation(jit, asm, ocb);
+        return Some(EndBlock);
     }
 
-    // Defer compilation so we can specialize on class of receiver
-    if !jit_at_current_insn(jit) {
-        defer_compilation(jit, ctx, asm, ocb);
-        return EndBlock;
+    let ci_flags = unsafe { vm_ci_flag(ci) };
+
+    // Dynamic stack layout. No good way to support without inlining.
+    if ci_flags & VM_CALL_FORWARDING != 0 {
+        gen_counter_incr(asm, Counter::send_iseq_forwarding);
+        return None;
     }
 
     let recv_idx = argc + if flags & VM_CALL_ARGS_BLOCKARG != 0 { 1 } else { 0 };
-
-    let comptime_recv = jit_peek_at_stack(jit, ctx, recv_idx as isize);
+    let comptime_recv = jit.peek_at_stack(&asm.ctx, recv_idx as isize);
     let comptime_recv_klass = comptime_recv.class_of();
+    assert_eq!(RUBY_T_CLASS, comptime_recv_klass.builtin_type(),
+        "objects visible to ruby code should have a T_CLASS in their klass field");
 
-    // Guard that the receiver has the same class as the one from compile time
-    let side_exit = get_side_exit(jit, ocb, ctx);
+    // Don't compile calls through singleton classes to avoid retaining the receiver.
+    // Make an exception for class methods since classes tend to be retained anyways.
+    // Also compile calls on top_self to help tests.
+    if VALUE(0) != unsafe { FL_TEST(comptime_recv_klass, VALUE(RUBY_FL_SINGLETON as usize)) }
+        && comptime_recv != unsafe { rb_vm_top_self() }
+        && !unsafe { RB_TYPE_P(comptime_recv, RUBY_T_CLASS) }
+        && !unsafe { RB_TYPE_P(comptime_recv, RUBY_T_MODULE) } {
+        gen_counter_incr(asm, Counter::send_singleton_class);
+        return None;
+    }
 
     // Points to the receiver operand on the stack
-    let recv = ctx.stack_opnd(recv_idx);
-    let recv_opnd = StackOpnd(recv_idx.try_into().unwrap());
-    jit_guard_known_klass(
+    let recv = asm.stack_opnd(recv_idx);
+    let recv_opnd: YARVOpnd = recv.into();
+
+    // Log the name of the method we're calling to
+    #[cfg(feature = "disasm")]
+    asm_comment!(asm, "call to {}", get_method_name(Some(comptime_recv_klass), mid));
+
+    // Gather some statistics about sends
+    gen_counter_incr(asm, Counter::num_send);
+    if let Some(_known_klass) = asm.ctx.get_opnd_type(recv_opnd).known_class()  {
+        gen_counter_incr(asm, Counter::num_send_known_class);
+    }
+    if asm.ctx.get_chain_depth() > 1 {
+        gen_counter_incr(asm, Counter::num_send_polymorphic);
+    }
+    // If megamorphic, let the caller fall back to dynamic dispatch
+    if asm.ctx.get_chain_depth() >= SEND_MAX_DEPTH {
+        gen_counter_incr(asm, Counter::send_megamorphic);
+        return None;
+    }
+
+    perf_call!("gen_send_general: ", jit_guard_known_klass(
         jit,
-        ctx,
         asm,
         ocb,
         comptime_recv_klass,
@@ -5791,16 +8596,20 @@ fn gen_send_general(
         recv_opnd,
         comptime_recv,
         SEND_MAX_DEPTH,
-        side_exit,
-    );
+        Counter::guard_send_klass_megamorphic,
+    ));
 
     // Do method lookup
     let mut cme = unsafe { rb_callable_method_entry(comptime_recv_klass, mid) };
     if cme.is_null() {
-        // TODO: counter
-        return CantCompile;
+        gen_counter_incr(asm, Counter::send_cme_not_found);
+        return None;
     }
 
+    // Load an overloaded cme if applicable. See vm_search_cc().
+    // It allows you to use a faster ISEQ if possible.
+    cme = unsafe { rb_check_overloaded_cme(cme, ci) };
+
     let visi = unsafe { METHOD_ENTRY_VISI(cme) };
     match visi {
         METHOD_VISI_PUBLIC => {
@@ -5810,15 +8619,16 @@ fn gen_send_general(
             if flags & VM_CALL_FCALL == 0 {
                 // Can only call private methods with FCALL callsites.
                 // (at the moment they are callsites without a receiver or an explicit `self` receiver)
-                return CantCompile;
+                gen_counter_incr(asm, Counter::send_private_not_fcall);
+                return None;
             }
         }
         METHOD_VISI_PROTECTED => {
             // If the method call is an FCALL, it is always valid
             if flags & VM_CALL_FCALL == 0 {
-                // otherwise we need an ancestry check to ensure the receiver is vaild to be called
+                // otherwise we need an ancestry check to ensure the receiver is valid to be called
                 // as protected
-                jit_protected_callee_ancestry_guard(jit, asm, ocb, cme, side_exit);
+                jit_protected_callee_ancestry_guard(asm, cme);
             }
         }
         _ => {
@@ -5828,7 +8638,7 @@ fn gen_send_general(
 
     // Register block for invalidation
     //assert!(cme->called_id == mid);
-    assume_method_lookup_stable(jit, ocb, cme);
+    jit.assume_method_lookup_stable(asm, ocb, cme);
 
     // To handle the aliased method case (VM_METHOD_TYPE_ALIAS)
     loop {
@@ -5838,38 +8648,59 @@ fn gen_send_general(
             VM_METHOD_TYPE_ISEQ => {
                 let iseq = unsafe { get_def_iseq_ptr((*cme).def) };
                 let frame_type = VM_FRAME_MAGIC_METHOD | VM_ENV_FLAG_LOCAL;
-                return gen_send_iseq(jit, ctx, asm, ocb, iseq, ci, frame_type, None, cme, block, flags, argc, None);
+                return perf_call! { gen_send_iseq(jit, asm, ocb, iseq, ci, frame_type, None, cme, block, flags, argc, None) };
             }
             VM_METHOD_TYPE_CFUNC => {
-                return gen_send_cfunc(
+                return perf_call! { gen_send_cfunc(
                     jit,
-                    ctx,
                     asm,
                     ocb,
                     ci,
                     cme,
                     block,
-                    &comptime_recv_klass,
+                    Some(comptime_recv_klass),
                     flags,
                     argc,
-                );
+                ) };
             }
             VM_METHOD_TYPE_IVAR => {
-                if flags & VM_CALL_ARGS_SPLAT != 0 {
-                    gen_counter_incr!(asm, send_args_splat_ivar);
-                    return CantCompile;
+                // This is a .send call not supported right now for attr_reader
+                if flags & VM_CALL_OPT_SEND != 0 {
+                    gen_counter_incr(asm, Counter::send_send_attr_reader);
+                    return None;
                 }
 
-                if argc != 0 {
-                    // Argument count mismatch. Getters take no arguments.
-                    gen_counter_incr!(asm, send_getter_arity);
-                    return CantCompile;
+                if flags & VM_CALL_ARGS_BLOCKARG != 0 {
+                    match asm.ctx.get_opnd_type(StackOpnd(0)) {
+                        Type::Nil | Type::BlockParamProxy => {
+                            // Getters ignore the block arg, and these types of block args can be
+                            // passed without side-effect (never any `to_proc` call).
+                            asm.stack_pop(1);
+                        }
+                        _ => {
+                            gen_counter_incr(asm, Counter::send_getter_block_arg);
+                            return None;
+                        }
+                    }
                 }
 
-                // This is a .send call not supported right now for getters
-                if flags & VM_CALL_OPT_SEND != 0 {
-                    gen_counter_incr!(asm, send_send_getter);
-                    return CantCompile;
+                if argc != 0 {
+                    // Guard for simple splat of empty array
+                    if VM_CALL_ARGS_SPLAT == flags & (VM_CALL_ARGS_SPLAT | VM_CALL_KWARG | VM_CALL_KW_SPLAT)
+                        && argc == 1 {
+                        // Not using chain guards since on failure these likely end up just raising
+                        // ArgumentError
+                        let splat = asm.stack_opnd(0);
+                        guard_object_is_array(asm, splat, splat.into(), Counter::guard_send_getter_splat_non_empty);
+                        let splat_len = get_array_len(asm, splat);
+                        asm.cmp(splat_len, 0.into());
+                        asm.jne(Target::side_exit(Counter::guard_send_getter_splat_non_empty));
+                        asm.stack_pop(1);
+                    } else {
+                        // Argument count mismatch. Getters take no arguments.
+                        gen_counter_incr(asm, Counter::send_getter_arity);
+                        return None;
+                    }
                 }
 
                 if c_method_tracing_currently_enabled(jit) {
@@ -5878,198 +8709,136 @@ fn gen_send_general(
                     // Handling the C method tracing events for attr_accessor
                     // methods is easier than regular C methods as we know the
                     // "method" we are calling into never enables those tracing
-                    // events. Once global invalidation runs, the code for the
-                    // attr_accessor is invalidated and we exit at the closest
-                    // instruction boundary which is always outside of the body of
-                    // the attr_accessor code.
-                    gen_counter_incr!(asm, send_cfunc_tracing);
-                    return CantCompile;
+                    // events. We are never inside the code that needs to be
+                    // invalidated when invalidation happens.
+                    gen_counter_incr(asm, Counter::send_cfunc_tracing);
+                    return None;
                 }
 
+                let recv = asm.stack_opnd(0); // the receiver should now be the stack top
                 let ivar_name = unsafe { get_cme_def_body_attr_id(cme) };
 
-                if flags & VM_CALL_ARGS_BLOCKARG != 0 {
-                    gen_counter_incr!(asm, send_block_arg);
-                    return CantCompile;
-                }
-
                 return gen_get_ivar(
                     jit,
-                    ctx,
                     asm,
                     ocb,
                     SEND_MAX_DEPTH,
                     comptime_recv,
                     ivar_name,
                     recv,
-                    recv_opnd,
-                    side_exit,
+                    recv.into(),
                 );
             }
             VM_METHOD_TYPE_ATTRSET => {
+                // This is a .send call not supported right now for attr_writer
+                if flags & VM_CALL_OPT_SEND != 0 {
+                    gen_counter_incr(asm, Counter::send_send_attr_writer);
+                    return None;
+                }
                 if flags & VM_CALL_ARGS_SPLAT != 0 {
-                    gen_counter_incr!(asm, send_args_splat_attrset);
-                    return CantCompile;
+                    gen_counter_incr(asm, Counter::send_args_splat_attrset);
+                    return None;
                 }
                 if flags & VM_CALL_KWARG != 0 {
-                    gen_counter_incr!(asm, send_attrset_kwargs);
-                    return CantCompile;
+                    gen_counter_incr(asm, Counter::send_attrset_kwargs);
+                    return None;
                 } else if argc != 1 || unsafe { !RB_TYPE_P(comptime_recv, RUBY_T_OBJECT) } {
-                    gen_counter_incr!(asm, send_ivar_set_method);
-                    return CantCompile;
+                    gen_counter_incr(asm, Counter::send_ivar_set_method);
+                    return None;
                 } else if c_method_tracing_currently_enabled(jit) {
                     // Can't generate code for firing c_call and c_return events
                     // See :attr-tracing:
-                    gen_counter_incr!(asm, send_cfunc_tracing);
-                    return CantCompile;
+                    gen_counter_incr(asm, Counter::send_cfunc_tracing);
+                    return None;
                 } else if flags & VM_CALL_ARGS_BLOCKARG != 0 {
-                    gen_counter_incr!(asm, send_block_arg);
-                    return CantCompile;
+                    gen_counter_incr(asm, Counter::send_attrset_block_arg);
+                    return None;
                 } else {
                     let ivar_name = unsafe { get_cme_def_body_attr_id(cme) };
-                    return gen_set_ivar(jit, ctx, asm, comptime_recv, ivar_name, flags, argc);
+                    return gen_set_ivar(jit, asm, ocb, comptime_recv, ivar_name, StackOpnd(1), None);
                 }
             }
             // Block method, e.g. define_method(:foo) { :my_block }
             VM_METHOD_TYPE_BMETHOD => {
                 if flags & VM_CALL_ARGS_SPLAT != 0 {
-                    gen_counter_incr!(asm, send_args_splat_bmethod);
-                    return CantCompile;
+                    gen_counter_incr(asm, Counter::send_args_splat_bmethod);
+                    return None;
                 }
-                return gen_send_bmethod(jit, ctx, asm, ocb, ci, cme, block, flags, argc);
-            }
-            VM_METHOD_TYPE_ZSUPER => {
-                gen_counter_incr!(asm, send_zsuper_method);
-                return CantCompile;
+                return gen_send_bmethod(jit, asm, ocb, ci, cme, block, flags, argc);
             }
             VM_METHOD_TYPE_ALIAS => {
                 // Retrieve the aliased method and re-enter the switch
                 cme = unsafe { rb_aliased_callable_method_entry(cme) };
                 continue;
             }
-            VM_METHOD_TYPE_UNDEF => {
-                gen_counter_incr!(asm, send_undef_method);
-                return CantCompile;
-            }
-            VM_METHOD_TYPE_NOTIMPLEMENTED => {
-                gen_counter_incr!(asm, send_not_implemented_method);
-                return CantCompile;
-            }
             // Send family of methods, e.g. call/apply
             VM_METHOD_TYPE_OPTIMIZED => {
                 if flags & VM_CALL_ARGS_BLOCKARG != 0 {
-                    gen_counter_incr!(asm, send_block_arg);
-                    return CantCompile;
-                }
-
-                if flags & VM_CALL_ARGS_SPLAT != 0 {
-                    gen_counter_incr!(asm, send_args_splat_optimized);
-                    return CantCompile;
+                    gen_counter_incr(asm, Counter::send_optimized_block_arg);
+                    return None;
                 }
 
                 let opt_type = unsafe { get_cme_def_body_optimized_type(cme) };
                 match opt_type {
                     OPTIMIZED_METHOD_TYPE_SEND => {
-
                         // This is for method calls like `foo.send(:bar)`
                         // The `send` method does not get its own stack frame.
                         // instead we look up the method and call it,
                         // doing some stack shifting based on the VM_CALL_OPT_SEND flag
 
-                        let starting_context = ctx.clone();
+                        // Reject nested cases such as `send(:send, :alias_for_send, :foo)`.
+                        // We would need to do some stack manipulation here or keep track of how
+                        // many levels deep we need to stack manipulate. Because of how exits
+                        // currently work, we can't do stack manipulation until we will no longer
+                        // side exit.
+                        if flags & VM_CALL_OPT_SEND != 0 {
+                            gen_counter_incr(asm, Counter::send_send_nested);
+                            return None;
+                        }
 
                         if argc == 0 {
-                            gen_counter_incr!(asm, send_send_wrong_args);
-                            return CantCompile;
+                            gen_counter_incr(asm, Counter::send_send_wrong_args);
+                            return None;
                         }
 
                         argc -= 1;
 
-                        let compile_time_name = jit_peek_at_stack(jit, ctx, argc as isize);
-
-                        if !compile_time_name.string_p() && !compile_time_name.static_sym_p()  {
-                            gen_counter_incr!(asm, send_send_chain_not_string_or_sym);
-                            return CantCompile;
-                        }
+                        let compile_time_name = jit.peek_at_stack(&asm.ctx, argc as isize);
 
                         mid = unsafe { rb_get_symbol_id(compile_time_name) };
                         if mid == 0 {
-                            gen_counter_incr!(asm, send_send_null_mid);
-                            return CantCompile;
+                            // This also rejects method names that need conversion
+                            gen_counter_incr(asm, Counter::send_send_null_mid);
+                            return None;
                         }
 
                         cme = unsafe { rb_callable_method_entry(comptime_recv_klass, mid) };
                         if cme.is_null() {
-                            gen_counter_incr!(asm, send_send_null_cme);
-                            return CantCompile;
-                        }
-
-                        // We aren't going to handle `send(send(:foo))`. We would need to
-                        // do some stack manipulation here or keep track of how many levels
-                        // deep we need to stack manipulate
-                        // Because of how exits currently work, we can't do stack manipulation
-                        // until we will no longer side exit.
-                        let def_type = unsafe { get_cme_def_type(cme) };
-                        if let VM_METHOD_TYPE_OPTIMIZED = def_type {
-                            let opt_type = unsafe { get_cme_def_body_optimized_type(cme) };
-                            if let OPTIMIZED_METHOD_TYPE_SEND = opt_type {
-                                gen_counter_incr!(asm, send_send_nested);
-                                return CantCompile;
-                            }
+                            gen_counter_incr(asm, Counter::send_send_null_cme);
+                            return None;
                         }
 
                         flags |= VM_CALL_FCALL | VM_CALL_OPT_SEND;
 
-                        assume_method_lookup_stable(jit, ocb, cme);
+                        jit.assume_method_lookup_stable(asm, ocb, cme);
 
-                        let (known_class, type_mismatch_exit) = {
-                            if compile_time_name.string_p() {
-                                (
-                                    unsafe { rb_cString },
-                                    counted_exit!(ocb, side_exit, send_send_chain_not_string),
-
-                                )
-                            } else {
-                                (
-                                    unsafe { rb_cSymbol },
-                                    counted_exit!(ocb, side_exit, send_send_chain_not_sym),
-                                )
-                            }
-                        };
-
-                        jit_guard_known_klass(
-                            jit,
-                            ctx,
+                        asm_comment!(
                             asm,
-                            ocb,
-                            known_class,
-                            ctx.stack_opnd(argc),
-                            StackOpnd(argc as u16),
-                            compile_time_name,
-                            2, // We have string or symbol, so max depth is 2
-                            type_mismatch_exit
+                            "guard sending method name \'{}\'",
+                            unsafe { cstr_to_rust_string(rb_id2name(mid)) }.unwrap_or_else(|| "<unknown>".to_owned()),
                         );
 
-                        // Need to do this here so we don't have too many live
-                        // values for the register allocator.
-                        let name_opnd = asm.load(ctx.stack_opnd(argc));
-
+                        let name_opnd = asm.stack_opnd(argc);
                         let symbol_id_opnd = asm.ccall(rb_get_symbol_id as *const u8, vec![name_opnd]);
 
-                        asm.comment("chain_guard_send");
-                        let chain_exit = counted_exit!(ocb, side_exit, send_send_chain);
-                        asm.cmp(symbol_id_opnd, 0.into());
-                        asm.jbe(chain_exit);
-
                         asm.cmp(symbol_id_opnd, mid.into());
                         jit_chain_guard(
                             JCC_JNE,
                             jit,
-                            &starting_context,
                             asm,
                             ocb,
-                            SEND_MAX_CHAIN_DEPTH,
-                            chain_exit,
+                            SEND_MAX_DEPTH,
+                            Counter::guard_send_send_name_chain,
                         );
 
                         // We have changed the argc, flags, mid, and cme, so we need to re-enter the match
@@ -6080,34 +8849,39 @@ fn gen_send_general(
                     OPTIMIZED_METHOD_TYPE_CALL => {
 
                         if block.is_some() {
-                            gen_counter_incr!(asm, send_call_block);
-                            return CantCompile;
+                            gen_counter_incr(asm, Counter::send_call_block);
+                            return None;
                         }
 
                         if flags & VM_CALL_KWARG != 0 {
-                            gen_counter_incr!(asm, send_call_kwarg);
-                            return CantCompile;
+                            gen_counter_incr(asm, Counter::send_call_kwarg);
+                            return None;
+                        }
+
+                        if flags & VM_CALL_ARGS_SPLAT != 0 {
+                            gen_counter_incr(asm, Counter::send_args_splat_opt_call);
+                            return None;
                         }
 
                         // Optimize for single ractor mode and avoid runtime check for
                         // "defined with an un-shareable Proc in a different Ractor"
-                        if !assume_single_ractor_mode(jit, ocb) {
-                            gen_counter_incr!(asm, send_call_multi_ractor);
-                            return CantCompile;
+                        if !assume_single_ractor_mode(jit, asm, ocb) {
+                            gen_counter_incr(asm, Counter::send_call_multi_ractor);
+                            return None;
                         }
 
                         // If this is a .send call we need to adjust the stack
                         if flags & VM_CALL_OPT_SEND != 0 {
-                            handle_opt_send_shift_stack(asm, argc, ctx);
+                            handle_opt_send_shift_stack(asm, argc);
                         }
 
                         // About to reset the SP, need to load this here
                         let recv_load = asm.load(recv);
 
-                        let sp = asm.lea(ctx.sp_opnd(0));
+                        let sp = asm.lea(asm.ctx.sp_opnd(0));
 
                         // Save the PC and SP because the callee can make Ruby calls
-                        jit_prepare_routine_call(jit, ctx, asm);
+                        jit_prepare_non_leaf_call(jit, asm);
 
                         let kw_splat = flags & VM_CALL_KW_SPLAT;
                         let stack_argument_pointer = asm.lea(Opnd::mem(64, sp, -(argc) * SIZEOF_VALUE_I32));
@@ -6121,49 +8895,45 @@ fn gen_send_general(
                             VM_BLOCK_HANDLER_NONE.into(),
                         ]);
 
-                        ctx.stack_pop(argc as usize + 1);
+                        asm.stack_pop(argc as usize + 1);
 
-                        let stack_ret = ctx.stack_push(Type::Unknown);
+                        let stack_ret = asm.stack_push(Type::Unknown);
                         asm.mov(stack_ret, ret);
-                        return KeepCompiling;
+                        return Some(KeepCompiling);
 
                     }
                     OPTIMIZED_METHOD_TYPE_BLOCK_CALL => {
-                        gen_counter_incr!(asm, send_optimized_method_block_call);
-                        return CantCompile;
+                        gen_counter_incr(asm, Counter::send_optimized_method_block_call);
+                        return None;
                     }
                     OPTIMIZED_METHOD_TYPE_STRUCT_AREF => {
                         if flags & VM_CALL_ARGS_SPLAT != 0 {
-                            gen_counter_incr!(asm, send_args_splat_aref);
-                            return CantCompile;
+                            gen_counter_incr(asm, Counter::send_args_splat_aref);
+                            return None;
                         }
                         return gen_struct_aref(
                             jit,
-                            ctx,
                             asm,
                             ocb,
                             ci,
                             cme,
                             comptime_recv,
-                            comptime_recv_klass,
                             flags,
                             argc,
                         );
                     }
                     OPTIMIZED_METHOD_TYPE_STRUCT_ASET => {
                         if flags & VM_CALL_ARGS_SPLAT != 0 {
-                            gen_counter_incr!(asm, send_args_splat_aset);
-                            return CantCompile;
+                            gen_counter_incr(asm, Counter::send_args_splat_aset);
+                            return None;
                         }
                         return gen_struct_aset(
                             jit,
-                            ctx,
                             asm,
                             ocb,
                             ci,
                             cme,
                             comptime_recv,
-                            comptime_recv_klass,
                             flags,
                             argc,
                         );
@@ -6173,13 +8943,25 @@ fn gen_send_general(
                     }
                 }
             }
+            VM_METHOD_TYPE_ZSUPER => {
+                gen_counter_incr(asm, Counter::send_zsuper_method);
+                return None;
+            }
+            VM_METHOD_TYPE_UNDEF => {
+                gen_counter_incr(asm, Counter::send_undef_method);
+                return None;
+            }
+            VM_METHOD_TYPE_NOTIMPLEMENTED => {
+                gen_counter_incr(asm, Counter::send_not_implemented_method);
+                return None;
+            }
             VM_METHOD_TYPE_MISSING => {
-                gen_counter_incr!(asm, send_missing_method);
-                return CantCompile;
+                gen_counter_incr(asm, Counter::send_missing_method);
+                return None;
             }
             VM_METHOD_TYPE_REFINED => {
-                gen_counter_incr!(asm, send_refined_method);
-                return CantCompile;
+                gen_counter_incr(asm, Counter::send_refined_method);
+                return None;
             }
             _ => {
                 unreachable!();
@@ -6188,6 +8970,32 @@ fn gen_send_general(
     }
 }
 
+/// Get class name from a class pointer.
+fn get_class_name(class: Option<VALUE>) -> String {
+    class.and_then(|class| unsafe {
+        cstr_to_rust_string(rb_class2name(class))
+    }).unwrap_or_else(|| "Unknown".to_string())
+}
+
+/// Assemble "{class_name}#{method_name}" from a class pointer and a method ID
+fn get_method_name(class: Option<VALUE>, mid: u64) -> String {
+    let class_name = get_class_name(class);
+    let method_name = if mid != 0 {
+        unsafe { cstr_to_rust_string(rb_id2name(mid)) }
+    } else {
+        None
+    }.unwrap_or_else(|| "Unknown".to_string());
+    format!("{}#{}", class_name, method_name)
+}
+
+/// Assemble "{label}@{iseq_path}:{lineno}" (iseq_inspect() format) from an ISEQ
+fn get_iseq_name(iseq: IseqPtr) -> String {
+    let c_string = unsafe { rb_yjit_iseq_inspect(iseq) };
+    let string = unsafe { CStr::from_ptr(c_string) }.to_str()
+        .unwrap_or_else(|_| "not UTF-8").to_string();
+    unsafe { ruby_xfree(c_string as *mut c_void); }
+    string
+}
 
 /// Shifts the stack for send in order to remove the name of the method
 /// Comment below borrow from vm_call_opt_send in vm_insnhelper.c
@@ -6204,164 +9012,305 @@ fn gen_send_general(
 ///--+------+--------+------+------
 ///
 /// We do this for our compiletime context and the actual stack
-fn handle_opt_send_shift_stack(asm: &mut Assembler, argc: i32, ctx: &mut Context) {
-    asm.comment("shift_stack");
+fn handle_opt_send_shift_stack(asm: &mut Assembler, argc: i32) {
+    asm_comment!(asm, "shift_stack");
     for j in (0..argc).rev() {
-        let opnd = ctx.stack_opnd(j);
-        let opnd2 = ctx.stack_opnd(j + 1);
+        let opnd = asm.stack_opnd(j);
+        let opnd2 = asm.stack_opnd(j + 1);
         asm.mov(opnd2, opnd);
     }
-    ctx.shift_stack(argc as usize);
+    asm.shift_stack(argc as usize);
 }
 
 fn gen_opt_send_without_block(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let cd = jit_get_arg(jit, 0).as_ptr();
+) -> Option<CodegenStatus> {
+    // Generate specialized code if possible
+    let cd = jit.get_arg(0).as_ptr();
+    if let Some(status) = perf_call! { gen_send_general(jit, asm, ocb, cd, None) } {
+        return Some(status);
+    }
 
-    gen_send_general(jit, ctx, asm, ocb, cd, None)
+    // Otherwise, fall back to dynamic dispatch using the interpreter's implementation of send
+    gen_send_dynamic(jit, asm, ocb, cd, unsafe { rb_yjit_sendish_sp_pops((*cd).ci) }, |asm| {
+        extern "C" {
+            fn rb_vm_opt_send_without_block(ec: EcPtr, cfp: CfpPtr, cd: VALUE) -> VALUE;
+        }
+        asm.ccall(
+            rb_vm_opt_send_without_block as *const u8,
+            vec![EC, CFP, (cd as usize).into()],
+        )
+    })
 }
 
 fn gen_send(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let cd = jit_get_arg(jit, 0).as_ptr();
-    let block = jit_get_arg(jit, 1).as_optional_ptr();
-    return gen_send_general(jit, ctx, asm, ocb, cd, block);
+) -> Option<CodegenStatus> {
+    // Generate specialized code if possible
+    let cd = jit.get_arg(0).as_ptr();
+    let block = jit.get_arg(1).as_optional_ptr().map(|iseq| BlockHandler::BlockISeq(iseq));
+    if let Some(status) = perf_call! { gen_send_general(jit, asm, ocb, cd, block) } {
+        return Some(status);
+    }
+
+    // Otherwise, fall back to dynamic dispatch using the interpreter's implementation of send
+    let blockiseq = jit.get_arg(1).as_iseq();
+    gen_send_dynamic(jit, asm, ocb, cd, unsafe { rb_yjit_sendish_sp_pops((*cd).ci) }, |asm| {
+        extern "C" {
+            fn rb_vm_send(ec: EcPtr, cfp: CfpPtr, cd: VALUE, blockiseq: IseqPtr) -> VALUE;
+        }
+        asm.ccall(
+            rb_vm_send as *const u8,
+            vec![EC, CFP, (cd as usize).into(), VALUE(blockiseq as usize).into()],
+        )
+    })
+}
+
+fn gen_sendforward(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    ocb: &mut OutlinedCb,
+) -> Option<CodegenStatus> {
+    return gen_send(jit, asm, ocb);
 }
 
 fn gen_invokeblock(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    if !jit_at_current_insn(jit) {
-        defer_compilation(jit, ctx, asm, ocb);
-        return EndBlock;
+) -> Option<CodegenStatus> {
+    // Generate specialized code if possible
+    let cd = jit.get_arg(0).as_ptr();
+    if let Some(status) = gen_invokeblock_specialized(jit, asm, ocb, cd) {
+        return Some(status);
+    }
+
+    // Otherwise, fall back to dynamic dispatch using the interpreter's implementation of invokeblock
+    gen_send_dynamic(jit, asm, ocb, cd, unsafe { rb_yjit_invokeblock_sp_pops((*cd).ci) }, |asm| {
+        extern "C" {
+            fn rb_vm_invokeblock(ec: EcPtr, cfp: CfpPtr, cd: VALUE) -> VALUE;
+        }
+        asm.ccall(
+            rb_vm_invokeblock as *const u8,
+            vec![EC, CFP, (cd as usize).into()],
+        )
+    })
+}
+
+fn gen_invokeblock_specialized(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    ocb: &mut OutlinedCb,
+    cd: *const rb_call_data,
+) -> Option<CodegenStatus> {
+    if !jit.at_current_insn() {
+        defer_compilation(jit, asm, ocb);
+        return Some(EndBlock);
+    }
+
+    // Fall back to dynamic dispatch if this callsite is megamorphic
+    if asm.ctx.get_chain_depth() >= SEND_MAX_DEPTH {
+        gen_counter_incr(asm, Counter::invokeblock_megamorphic);
+        return None;
     }
 
     // Get call info
-    let cd = jit_get_arg(jit, 0).as_ptr();
     let ci = unsafe { get_call_data_ci(cd) };
     let argc: i32 = unsafe { vm_ci_argc(ci) }.try_into().unwrap();
     let flags = unsafe { vm_ci_flag(ci) };
 
     // Get block_handler
-    let cfp = unsafe { get_ec_cfp(jit.ec.unwrap()) };
+    let cfp = jit.get_cfp();
     let lep = unsafe { rb_vm_ep_local_ep(get_cfp_ep(cfp)) };
     let comptime_handler = unsafe { *lep.offset(VM_ENV_DATA_INDEX_SPECVAL.try_into().unwrap()) };
 
     // Handle each block_handler type
     if comptime_handler.0 == VM_BLOCK_HANDLER_NONE as usize { // no block given
-        gen_counter_incr!(asm, invokeblock_none);
-        CantCompile
+        gen_counter_incr(asm, Counter::invokeblock_none);
+        None
     } else if comptime_handler.0 & 0x3 == 0x1 { // VM_BH_ISEQ_BLOCK_P
-        asm.comment("get local EP");
+        asm_comment!(asm, "get local EP");
         let ep_opnd = gen_get_lep(jit, asm);
         let block_handler_opnd = asm.load(
             Opnd::mem(64, ep_opnd, SIZEOF_VALUE_I32 * VM_ENV_DATA_INDEX_SPECVAL)
         );
 
-        asm.comment("guard block_handler type");
-        let side_exit = get_side_exit(jit, ocb, ctx);
+        asm_comment!(asm, "guard block_handler type");
         let tag_opnd = asm.and(block_handler_opnd, 0x3.into()); // block_handler is a tagged pointer
         asm.cmp(tag_opnd, 0x1.into()); // VM_BH_ISEQ_BLOCK_P
-        asm.jne(counted_exit!(ocb, side_exit, invokeblock_iseq_tag_changed));
+        jit_chain_guard(
+            JCC_JNE,
+            jit,
+            asm,
+            ocb,
+            SEND_MAX_DEPTH,
+            Counter::guard_invokeblock_tag_changed,
+        );
+
+        // If the current ISEQ is annotated to be inlined but it's not being inlined here,
+        // generate a dynamic dispatch to avoid making this yield megamorphic.
+        if unsafe { rb_yjit_iseq_builtin_attrs(jit.iseq) } & BUILTIN_ATTR_INLINE_BLOCK != 0 && !asm.ctx.inline() {
+            gen_counter_incr(asm, Counter::invokeblock_iseq_not_inlined);
+            return None;
+        }
 
-        // Not supporting vm_callee_setup_block_arg_arg0_splat for now
         let comptime_captured = unsafe { ((comptime_handler.0 & !0x3) as *const rb_captured_block).as_ref().unwrap() };
         let comptime_iseq = unsafe { *comptime_captured.code.iseq.as_ref() };
-        if argc == 1 && unsafe { get_iseq_flags_has_lead(comptime_iseq) && !get_iseq_flags_ambiguous_param0(comptime_iseq) } {
-            gen_counter_incr!(asm, invokeblock_iseq_arg0_splat);
-            return CantCompile;
-        }
 
-        asm.comment("guard known ISEQ");
+        asm_comment!(asm, "guard known ISEQ");
         let captured_opnd = asm.and(block_handler_opnd, Opnd::Imm(!0x3));
         let iseq_opnd = asm.load(Opnd::mem(64, captured_opnd, SIZEOF_VALUE_I32 * 2));
-        asm.cmp(iseq_opnd, (comptime_iseq as usize).into());
-        let block_changed_exit = counted_exit!(ocb, side_exit, invokeblock_iseq_block_changed);
+        asm.cmp(iseq_opnd, VALUE::from(comptime_iseq).into());
         jit_chain_guard(
             JCC_JNE,
             jit,
-            ctx,
             asm,
             ocb,
-            SEND_MAX_CHAIN_DEPTH,
-            block_changed_exit,
+            SEND_MAX_DEPTH,
+            Counter::guard_invokeblock_iseq_block_changed,
+        );
+
+        perf_call! { gen_send_iseq(jit, asm, ocb, comptime_iseq, ci, VM_FRAME_MAGIC_BLOCK, None, 0 as _, None, flags, argc, Some(captured_opnd)) }
+    } else if comptime_handler.0 & 0x3 == 0x3 { // VM_BH_IFUNC_P
+        // We aren't handling CALLER_SETUP_ARG and CALLER_REMOVE_EMPTY_KW_SPLAT yet.
+        if flags & VM_CALL_ARGS_SPLAT != 0 {
+            gen_counter_incr(asm, Counter::invokeblock_ifunc_args_splat);
+            return None;
+        }
+        if flags & VM_CALL_KW_SPLAT != 0 {
+            gen_counter_incr(asm, Counter::invokeblock_ifunc_kw_splat);
+            return None;
+        }
+
+        asm_comment!(asm, "get local EP");
+        let ep_opnd = gen_get_lep(jit, asm);
+        let block_handler_opnd = asm.load(
+            Opnd::mem(64, ep_opnd, SIZEOF_VALUE_I32 * VM_ENV_DATA_INDEX_SPECVAL)
         );
 
-        gen_send_iseq(
+        asm_comment!(asm, "guard block_handler type");
+        let tag_opnd = asm.and(block_handler_opnd, 0x3.into()); // block_handler is a tagged pointer
+        asm.cmp(tag_opnd, 0x3.into()); // VM_BH_IFUNC_P
+        jit_chain_guard(
+            JCC_JNE,
             jit,
-            ctx,
             asm,
             ocb,
-            comptime_iseq,
-            ci,
-            VM_FRAME_MAGIC_BLOCK,
-            None,
-            0 as _,
-            None,
-            flags,
-            argc,
-            Some(captured_opnd),
-        )
-    } else if comptime_handler.0 & 0x3 == 0x3 { // VM_BH_IFUNC_P
-        gen_counter_incr!(asm, invokeblock_ifunc);
-        CantCompile
+            SEND_MAX_DEPTH,
+            Counter::guard_invokeblock_tag_changed,
+        );
+
+        // The cfunc may not be a leaf
+        jit_prepare_non_leaf_call(jit, asm);
+
+        extern "C" {
+            fn rb_vm_yield_with_cfunc(ec: EcPtr, captured: *const rb_captured_block, argc: c_int, argv: *const VALUE) -> VALUE;
+        }
+        asm_comment!(asm, "call ifunc");
+        let captured_opnd = asm.and(block_handler_opnd, Opnd::Imm(!0x3));
+        let argv = asm.lea(asm.ctx.sp_opnd(-argc));
+        let ret = asm.ccall(
+            rb_vm_yield_with_cfunc as *const u8,
+            vec![EC, captured_opnd, argc.into(), argv],
+        );
+
+        asm.stack_pop(argc.try_into().unwrap());
+        let stack_ret = asm.stack_push(Type::Unknown);
+        asm.mov(stack_ret, ret);
+
+        // cfunc calls may corrupt types
+        asm.clear_local_types();
+
+        // Share the successor with other chains
+        jump_to_next_insn(jit, asm, ocb);
+        Some(EndBlock)
     } else if comptime_handler.symbol_p() {
-        gen_counter_incr!(asm, invokeblock_symbol);
-        CantCompile
+        gen_counter_incr(asm, Counter::invokeblock_symbol);
+        None
     } else { // Proc
-        gen_counter_incr!(asm, invokeblock_proc);
-        CantCompile
+        gen_counter_incr(asm, Counter::invokeblock_proc);
+        None
     }
 }
 
 fn gen_invokesuper(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let cd: *const rb_call_data = jit_get_arg(jit, 0).as_ptr();
-    let block: Option<IseqPtr> = jit_get_arg(jit, 1).as_optional_ptr();
+) -> Option<CodegenStatus> {
+    // Generate specialized code if possible
+    let cd = jit.get_arg(0).as_ptr();
+    if let Some(status) = gen_invokesuper_specialized(jit, asm, ocb, cd) {
+        return Some(status);
+    }
+
+    // Otherwise, fall back to dynamic dispatch using the interpreter's implementation of invokesuper
+    let blockiseq = jit.get_arg(1).as_iseq();
+    gen_send_dynamic(jit, asm, ocb, cd, unsafe { rb_yjit_sendish_sp_pops((*cd).ci) }, |asm| {
+        extern "C" {
+            fn rb_vm_invokesuper(ec: EcPtr, cfp: CfpPtr, cd: VALUE, blockiseq: IseqPtr) -> VALUE;
+        }
+        asm.ccall(
+            rb_vm_invokesuper as *const u8,
+            vec![EC, CFP, (cd as usize).into(), VALUE(blockiseq as usize).into()],
+        )
+    })
+}
 
+fn gen_invokesuperforward(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    ocb: &mut OutlinedCb,
+) -> Option<CodegenStatus> {
+    return gen_invokesuper(jit, asm, ocb);
+}
+
+fn gen_invokesuper_specialized(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    ocb: &mut OutlinedCb,
+    cd: *const rb_call_data,
+) -> Option<CodegenStatus> {
     // Defer compilation so we can specialize on class of receiver
-    if !jit_at_current_insn(jit) {
-        defer_compilation(jit, ctx, asm, ocb);
-        return EndBlock;
+    if !jit.at_current_insn() {
+        defer_compilation(jit, asm, ocb);
+        return Some(EndBlock);
     }
 
-    let me = unsafe { rb_vm_frame_method_entry(get_ec_cfp(jit.ec.unwrap())) };
+    // Handle the last two branches of vm_caller_setup_arg_block
+    let block = if let Some(iseq) = jit.get_arg(1).as_optional_ptr() {
+        BlockHandler::BlockISeq(iseq)
+    } else {
+        BlockHandler::LEPSpecVal
+    };
+
+    // Fall back to dynamic dispatch if this callsite is megamorphic
+    if asm.ctx.get_chain_depth() >= SEND_MAX_DEPTH {
+        gen_counter_incr(asm, Counter::invokesuper_megamorphic);
+        return None;
+    }
+
+    let me = unsafe { rb_vm_frame_method_entry(jit.get_cfp()) };
     if me.is_null() {
-        return CantCompile;
+        gen_counter_incr(asm, Counter::invokesuper_no_me);
+        return None;
     }
 
     // FIXME: We should track and invalidate this block when this cme is invalidated
     let current_defined_class = unsafe { (*me).defined_class };
     let mid = unsafe { get_def_original_id((*me).def) };
 
-    if me != unsafe { rb_callable_method_entry(current_defined_class, (*me).called_id) } {
-        // Though we likely could generate this call, as we are only concerned
-        // with the method entry remaining valid, assume_method_lookup_stable
-        // below requires that the method lookup matches as well
-        return CantCompile;
-    }
-
     // vm_search_normal_superclass
     let rbasic_ptr: *const RBasic = current_defined_class.as_ptr();
     if current_defined_class.builtin_type() == RUBY_T_ICLASS
         && unsafe { RB_TYPE_P((*rbasic_ptr).klass, RUBY_T_MODULE) && FL_TEST_RAW((*rbasic_ptr).klass, VALUE(RMODULE_IS_REFINEMENT.as_usize())) != VALUE(0) }
     {
-        return CantCompile;
+        gen_counter_incr(asm, Counter::invokesuper_refinement);
+        return None;
     }
     let comptime_superclass =
         unsafe { rb_class_get_superclass(RCLASS_ORIGIN(current_defined_class)) };
@@ -6375,16 +9324,16 @@ fn gen_invokesuper(
     // Note, not using VM_CALL_ARGS_SIMPLE because sometimes we pass a block.
 
     if ci_flags & VM_CALL_KWARG != 0 {
-        gen_counter_incr!(asm, send_keywords);
-        return CantCompile;
+        gen_counter_incr(asm, Counter::invokesuper_kwarg);
+        return None;
     }
     if ci_flags & VM_CALL_KW_SPLAT != 0 {
-        gen_counter_incr!(asm, send_kw_splat);
-        return CantCompile;
+        gen_counter_incr(asm, Counter::invokesuper_kw_splat);
+        return None;
     }
-    if ci_flags & VM_CALL_ARGS_BLOCKARG != 0 {
-        gen_counter_incr!(asm, send_block_arg);
-        return CantCompile;
+    if ci_flags & VM_CALL_FORWARDING != 0 {
+        gen_counter_incr(asm, Counter::invokesuper_forwarding);
+        return None;
     }
 
     // Ensure we haven't rebound this method onto an incompatible class.
@@ -6392,267 +9341,242 @@ fn gen_invokesuper(
     // cheaper calculations first, but since we specialize on the method entry
     // and so only have to do this once at compile time this is fine to always
     // check and side exit.
-    let comptime_recv = jit_peek_at_stack(jit, ctx, argc as isize);
+    let comptime_recv = jit.peek_at_stack(&asm.ctx, argc as isize);
     if unsafe { rb_obj_is_kind_of(comptime_recv, current_defined_class) } == VALUE(0) {
-        return CantCompile;
+        gen_counter_incr(asm, Counter::invokesuper_defined_class_mismatch);
+        return None;
+    }
+
+    // Don't compile `super` on objects with singleton class to avoid retaining the receiver.
+    if VALUE(0) != unsafe { FL_TEST(comptime_recv.class_of(), VALUE(RUBY_FL_SINGLETON as usize)) } {
+        gen_counter_incr(asm, Counter::invokesuper_singleton_class);
+        return None;
     }
 
     // Do method lookup
     let cme = unsafe { rb_callable_method_entry(comptime_superclass, mid) };
-
     if cme.is_null() {
-        return CantCompile;
+        gen_counter_incr(asm, Counter::invokesuper_no_cme);
+        return None;
     }
 
     // Check that we'll be able to write this method dispatch before generating checks
     let cme_def_type = unsafe { get_cme_def_type(cme) };
     if cme_def_type != VM_METHOD_TYPE_ISEQ && cme_def_type != VM_METHOD_TYPE_CFUNC {
         // others unimplemented
-        return CantCompile;
-    }
-
-    // Guard that the receiver has the same class as the one from compile time
-    let side_exit = get_side_exit(jit, ocb, ctx);
-
-    let cfp = unsafe { get_ec_cfp(jit.ec.unwrap()) };
-    let ep = unsafe { get_cfp_ep(cfp) };
-    let cref_me = unsafe { *ep.offset(VM_ENV_DATA_INDEX_ME_CREF.try_into().unwrap()) };
-    let me_as_value = VALUE(me as usize);
-    if cref_me != me_as_value {
-        // This will be the case for super within a block
-        return CantCompile;
+        gen_counter_incr(asm, Counter::invokesuper_not_iseq_or_cfunc);
+        return None;
     }
 
-    asm.comment("guard known me");
-    let ep_opnd = asm.load(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_EP));
+    asm_comment!(asm, "guard known me");
+    let lep_opnd = gen_get_lep(jit, asm);
     let ep_me_opnd = Opnd::mem(
         64,
-        ep_opnd,
+        lep_opnd,
         SIZEOF_VALUE_I32 * VM_ENV_DATA_INDEX_ME_CREF,
     );
+
+    let me_as_value = VALUE(me as usize);
     asm.cmp(ep_me_opnd, me_as_value.into());
-    asm.jne(counted_exit!(ocb, side_exit, invokesuper_me_changed));
-
-    if block.is_none() {
-        // Guard no block passed
-        // rb_vm_frame_block_handler(GET_EC()->cfp) == VM_BLOCK_HANDLER_NONE
-        // note, we assume VM_ASSERT(VM_ENV_LOCAL_P(ep))
-        //
-        // TODO: this could properly forward the current block handler, but
-        // would require changes to gen_send_*
-        asm.comment("guard no block given");
-        // EP is in REG0 from above
-        let ep_opnd = asm.load(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_EP));
-        let ep_specval_opnd = Opnd::mem(
-            64,
-            ep_opnd,
-            SIZEOF_VALUE_I32 * VM_ENV_DATA_INDEX_SPECVAL,
-        );
-        asm.cmp(ep_specval_opnd, VM_BLOCK_HANDLER_NONE.into());
-        asm.jne(counted_exit!(ocb, side_exit, invokesuper_block));
-    }
+    jit_chain_guard(
+        JCC_JNE,
+        jit,
+        asm,
+        ocb,
+        SEND_MAX_DEPTH,
+        Counter::guard_invokesuper_me_changed,
+    );
 
     // We need to assume that both our current method entry and the super
     // method entry we invoke remain stable
-    assume_method_lookup_stable(jit, ocb, me);
-    assume_method_lookup_stable(jit, ocb, cme);
+    jit.assume_method_lookup_stable(asm, ocb, me);
+    jit.assume_method_lookup_stable(asm, ocb, cme);
 
     // Method calls may corrupt types
-    ctx.clear_local_types();
+    asm.clear_local_types();
 
     match cme_def_type {
         VM_METHOD_TYPE_ISEQ => {
             let iseq = unsafe { get_def_iseq_ptr((*cme).def) };
             let frame_type = VM_FRAME_MAGIC_METHOD | VM_ENV_FLAG_LOCAL;
-            gen_send_iseq(jit, ctx, asm, ocb, iseq, ci, frame_type, None, cme, block, ci_flags, argc, None)
+            perf_call! { gen_send_iseq(jit, asm, ocb, iseq, ci, frame_type, None, cme, Some(block), ci_flags, argc, None) }
         }
         VM_METHOD_TYPE_CFUNC => {
-            gen_send_cfunc(jit, ctx, asm, ocb, ci, cme, block, ptr::null(), ci_flags, argc)
+            perf_call! { gen_send_cfunc(jit, asm, ocb, ci, cme, Some(block), None, ci_flags, argc) }
         }
         _ => unreachable!(),
     }
 }
 
 fn gen_leave(
-    jit: &mut JITState,
-    ctx: &mut Context,
+    _jit: &mut JITState,
     asm: &mut Assembler,
-    ocb: &mut OutlinedCb,
-) -> CodegenStatus {
+    _ocb: &mut OutlinedCb,
+) -> Option<CodegenStatus> {
     // Only the return value should be on the stack
-    assert_eq!(1, ctx.get_stack_size());
-
-    // Create a side-exit to fall back to the interpreter
-    let side_exit = get_side_exit(jit, ocb, ctx);
-    let ocb_asm = Assembler::new();
+    assert_eq!(1, asm.ctx.get_stack_size(), "leave instruction expects stack size 1, but was: {}", asm.ctx.get_stack_size());
 
     // Check for interrupts
-    gen_check_ints(asm, counted_exit!(ocb, side_exit, leave_se_interrupt));
-    ocb_asm.compile(ocb.unwrap());
+    gen_check_ints(asm, Counter::leave_se_interrupt);
 
     // Pop the current frame (ec->cfp++)
     // Note: the return PC is already in the previous CFP
-    asm.comment("pop stack frame");
+    asm_comment!(asm, "pop stack frame");
     let incr_cfp = asm.add(CFP, RUBY_SIZEOF_CONTROL_FRAME.into());
     asm.mov(CFP, incr_cfp);
     asm.mov(Opnd::mem(64, EC, RUBY_OFFSET_EC_CFP), CFP);
 
     // Load the return value
-    let retval_opnd = ctx.stack_pop(1);
+    let retval_opnd = asm.stack_pop(1);
 
-    // Move the return value into the C return register for gen_leave_exit()
+    // Move the return value into the C return register
     asm.mov(C_RET_OPND, retval_opnd);
 
-    // Reload REG_SP for the caller and write the return value.
-    // Top of the stack is REG_SP[0] since the caller has sp_offset=1.
-    asm.mov(SP, Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP));
-    asm.mov(Opnd::mem(64, SP, 0), C_RET_OPND);
-
-    // Jump to the JIT return address on the frame that was just popped
+    // Jump to the JIT return address on the frame that was just popped.
+    // There are a few possible jump targets:
+    //   - gen_leave_exit() and gen_leave_exception(), for C callers
+    //   - Return context set up by gen_send_iseq()
+    // We don't write the return value to stack memory like the interpreter here.
+    // Each jump target does it as necessary.
     let offset_to_jit_return =
         -(RUBY_SIZEOF_CONTROL_FRAME as i32) + RUBY_OFFSET_CFP_JIT_RETURN;
     asm.jmp_opnd(Opnd::mem(64, CFP, offset_to_jit_return));
 
-    EndBlock
+    Some(EndBlock)
 }
 
 fn gen_getglobal(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let gid = jit_get_arg(jit, 0).as_usize();
+) -> Option<CodegenStatus> {
+    let gid = jit.get_arg(0).as_usize();
 
     // Save the PC and SP because we might make a Ruby call for warning
-    jit_prepare_routine_call(jit, ctx, asm);
+    jit_prepare_non_leaf_call(jit, asm);
 
     let val_opnd = asm.ccall(
         rb_gvar_get as *const u8,
         vec![ gid.into() ]
     );
 
-    let top = ctx.stack_push(Type::Unknown);
+    let top = asm.stack_push(Type::Unknown);
     asm.mov(top, val_opnd);
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 fn gen_setglobal(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let gid = jit_get_arg(jit, 0).as_usize();
+) -> Option<CodegenStatus> {
+    let gid = jit.get_arg(0).as_usize();
 
     // Save the PC and SP because we might make a Ruby call for
     // Kernel#set_trace_var
-    jit_prepare_routine_call(jit, ctx, asm);
+    jit_prepare_non_leaf_call(jit, asm);
 
+    let val = asm.stack_opnd(0);
     asm.ccall(
         rb_gvar_set as *const u8,
         vec![
             gid.into(),
-            ctx.stack_pop(1),
+            val,
         ],
     );
+    asm.stack_pop(1); // Keep it during ccall for GC
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 fn gen_anytostring(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
     // Save the PC and SP since we might call #to_s
-    jit_prepare_routine_call(jit, ctx, asm);
+    jit_prepare_non_leaf_call(jit, asm);
 
-    let str = ctx.stack_pop(1);
-    let val = ctx.stack_pop(1);
+    let str = asm.stack_opnd(0);
+    let val = asm.stack_opnd(1);
 
     let val = asm.ccall(rb_obj_as_string_result as *const u8, vec![str, val]);
+    asm.stack_pop(2); // Keep them during ccall for GC
 
     // Push the return value
-    let stack_ret = ctx.stack_push(Type::TString);
+    let stack_ret = asm.stack_push(Type::TString);
     asm.mov(stack_ret, val);
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 fn gen_objtostring(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    if !jit_at_current_insn(jit) {
-        defer_compilation(jit, ctx, asm, ocb);
-        return EndBlock;
+) -> Option<CodegenStatus> {
+    if !jit.at_current_insn() {
+        defer_compilation(jit, asm, ocb);
+        return Some(EndBlock);
     }
 
-    let recv = ctx.stack_opnd(0);
-    let comptime_recv = jit_peek_at_stack(jit, ctx, 0);
+    let recv = asm.stack_opnd(0);
+    let comptime_recv = jit.peek_at_stack(&asm.ctx, 0);
 
     if unsafe { RB_TYPE_P(comptime_recv, RUBY_T_STRING) } {
-        let side_exit = get_side_exit(jit, ocb, ctx);
-
         jit_guard_known_klass(
             jit,
-            ctx,
             asm,
             ocb,
             comptime_recv.class_of(),
             recv,
-            StackOpnd(0),
+            recv.into(),
             comptime_recv,
             SEND_MAX_DEPTH,
-            side_exit,
+            Counter::objtostring_not_string,
         );
+
         // No work needed. The string value is already on the top of the stack.
-        KeepCompiling
+        Some(KeepCompiling)
     } else {
-        let cd = jit_get_arg(jit, 0).as_ptr();
-        gen_send_general(jit, ctx, asm, ocb, cd, None)
+        let cd = jit.get_arg(0).as_ptr();
+        perf_call! { gen_send_general(jit, asm, ocb, cd, None) }
     }
 }
 
 fn gen_intern(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
     // Save the PC and SP because we might allocate
-    jit_prepare_routine_call(jit, ctx, asm);
+    jit_prepare_call_with_gc(jit, asm);
 
-    let str = ctx.stack_pop(1);
+    let str = asm.stack_opnd(0);
     let sym = asm.ccall(rb_str_intern as *const u8, vec![str]);
+    asm.stack_pop(1); // Keep it during ccall for GC
 
     // Push the return value
-    let stack_ret = ctx.stack_push(Type::Unknown);
+    let stack_ret = asm.stack_push(Type::Unknown);
     asm.mov(stack_ret, sym);
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 fn gen_toregexp(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let opt = jit_get_arg(jit, 0).as_i64();
-    let cnt = jit_get_arg(jit, 1).as_usize();
+) -> Option<CodegenStatus> {
+    let opt = jit.get_arg(0).as_i64();
+    let cnt = jit.get_arg(1).as_usize();
 
     // Save the PC and SP because this allocates an object and could
     // raise an exception.
-    jit_prepare_routine_call(jit, ctx, asm);
+    jit_prepare_non_leaf_call(jit, asm);
 
-    let values_ptr = asm.lea(ctx.sp_opnd(-((SIZEOF_VALUE as isize) * (cnt as isize))));
-    ctx.stack_pop(cnt);
+    let values_ptr = asm.lea(asm.ctx.sp_opnd(-(cnt as i32)));
 
     let ary = asm.ccall(
         rb_ary_tmp_new_from_values as *const u8,
@@ -6662,6 +9586,7 @@ fn gen_toregexp(
             values_ptr,
         ]
     );
+    asm.stack_pop(cnt); // Let ccall spill them
 
     // Save the array so we can clear it later
     asm.cpush(ary);
@@ -6681,77 +9606,76 @@ fn gen_toregexp(
     asm.cpop_into(ary);
 
     // The value we want to push on the stack is in RAX right now
-    let stack_ret = ctx.stack_push(Type::Unknown);
+    let stack_ret = asm.stack_push(Type::UnknownHeap);
     asm.mov(stack_ret, val);
 
     // Clear the temp array.
     asm.ccall(rb_ary_clear as *const u8, vec![ary]);
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 fn gen_getspecial(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
     // This takes two arguments, key and type
     // key is only used when type == 0
     // A non-zero type determines which type of backref to fetch
-    //rb_num_t key = jit_get_arg(jit, 0);
-    let rtype = jit_get_arg(jit, 1).as_u64();
+    //rb_num_t key = jit.get_arg(0);
+    let rtype = jit.get_arg(1).as_u64();
 
     if rtype == 0 {
         // not yet implemented
-        return CantCompile;
+        return None;
     } else if rtype & 0x01 != 0 {
         // Fetch a "special" backref based on a char encoded by shifting by 1
 
         // Can raise if matchdata uninitialized
-        jit_prepare_routine_call(jit, ctx, asm);
+        jit_prepare_non_leaf_call(jit, asm);
 
         // call rb_backref_get()
-        asm.comment("rb_backref_get");
+        asm_comment!(asm, "rb_backref_get");
         let backref = asm.ccall(rb_backref_get as *const u8, vec![]);
 
         let rt_u8: u8 = (rtype >> 1).try_into().unwrap();
         let val = match rt_u8.into() {
             '&' => {
-                asm.comment("rb_reg_last_match");
+                asm_comment!(asm, "rb_reg_last_match");
                 asm.ccall(rb_reg_last_match as *const u8, vec![backref])
             }
             '`' => {
-                asm.comment("rb_reg_match_pre");
+                asm_comment!(asm, "rb_reg_match_pre");
                 asm.ccall(rb_reg_match_pre as *const u8, vec![backref])
             }
             '\'' => {
-                asm.comment("rb_reg_match_post");
+                asm_comment!(asm, "rb_reg_match_post");
                 asm.ccall(rb_reg_match_post as *const u8, vec![backref])
             }
             '+' => {
-                asm.comment("rb_reg_match_last");
+                asm_comment!(asm, "rb_reg_match_last");
                 asm.ccall(rb_reg_match_last as *const u8, vec![backref])
             }
             _ => panic!("invalid back-ref"),
         };
 
-        let stack_ret = ctx.stack_push(Type::Unknown);
+        let stack_ret = asm.stack_push(Type::Unknown);
         asm.mov(stack_ret, val);
 
-        KeepCompiling
+        Some(KeepCompiling)
     } else {
         // Fetch the N-th match from the last backref based on type shifted by 1
 
         // Can raise if matchdata uninitialized
-        jit_prepare_routine_call(jit, ctx, asm);
+        jit_prepare_non_leaf_call(jit, asm);
 
         // call rb_backref_get()
-        asm.comment("rb_backref_get");
+        asm_comment!(asm, "rb_backref_get");
         let backref = asm.ccall(rb_backref_get as *const u8, vec![]);
 
         // rb_reg_nth_match((int)(type >> 1), backref);
-        asm.comment("rb_reg_nth_match");
+        asm_comment!(asm, "rb_reg_nth_match");
         let val = asm.ccall(
             rb_reg_nth_match as *const u8,
             vec![
@@ -6760,75 +9684,74 @@ fn gen_getspecial(
             ]
         );
 
-        let stack_ret = ctx.stack_push(Type::Unknown);
+        let stack_ret = asm.stack_push(Type::Unknown);
         asm.mov(stack_ret, val);
 
-        KeepCompiling
+        Some(KeepCompiling)
     }
 }
 
 fn gen_getclassvariable(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
     // rb_vm_getclassvariable can raise exceptions.
-    jit_prepare_routine_call(jit, ctx, asm);
+    jit_prepare_non_leaf_call(jit, asm);
 
     let val_opnd = asm.ccall(
         rb_vm_getclassvariable as *const u8,
         vec![
             Opnd::mem(64, CFP, RUBY_OFFSET_CFP_ISEQ),
             CFP,
-            Opnd::UImm(jit_get_arg(jit, 0).as_u64()),
-            Opnd::UImm(jit_get_arg(jit, 1).as_u64()),
+            Opnd::UImm(jit.get_arg(0).as_u64()),
+            Opnd::UImm(jit.get_arg(1).as_u64()),
         ],
     );
 
-    let top = ctx.stack_push(Type::Unknown);
+    let top = asm.stack_push(Type::Unknown);
     asm.mov(top, val_opnd);
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 fn gen_setclassvariable(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
     // rb_vm_setclassvariable can raise exceptions.
-    jit_prepare_routine_call(jit, ctx, asm);
+    jit_prepare_non_leaf_call(jit, asm);
 
+    let val = asm.stack_opnd(0);
     asm.ccall(
         rb_vm_setclassvariable as *const u8,
         vec![
             Opnd::mem(64, CFP, RUBY_OFFSET_CFP_ISEQ),
             CFP,
-            Opnd::UImm(jit_get_arg(jit, 0).as_u64()),
-            ctx.stack_pop(1),
-            Opnd::UImm(jit_get_arg(jit, 1).as_u64()),
+            Opnd::UImm(jit.get_arg(0).as_u64()),
+            val,
+            Opnd::UImm(jit.get_arg(1).as_u64()),
         ],
     );
+    asm.stack_pop(1); // Keep it during ccall for GC
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 fn gen_getconstant(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
+) -> Option<CodegenStatus> {
 
-    let id = jit_get_arg(jit, 0).as_usize();
+    let id = jit.get_arg(0).as_usize();
 
     // vm_get_ev_const can raise exceptions.
-    jit_prepare_routine_call(jit, ctx, asm);
+    jit_prepare_non_leaf_call(jit, asm);
 
-    let allow_nil_opnd = ctx.stack_pop(1);
-    let klass_opnd = ctx.stack_pop(1);
+    let allow_nil_opnd = asm.stack_opnd(0);
+    let klass_opnd = asm.stack_opnd(1);
 
     extern "C" {
         fn rb_vm_get_ev_const(ec: EcPtr, klass: VALUE, id: ID, allow_nil: VALUE) -> VALUE;
@@ -6843,39 +9766,52 @@ fn gen_getconstant(
             allow_nil_opnd
         ],
     );
+    asm.stack_pop(2); // Keep them during ccall for GC
 
-    let top = ctx.stack_push(Type::Unknown);
+    let top = asm.stack_push(Type::Unknown);
     asm.mov(top, val_opnd);
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 fn gen_opt_getconstant_path(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let const_cache_as_value = jit_get_arg(jit, 0);
+) -> Option<CodegenStatus> {
+    let const_cache_as_value = jit.get_arg(0);
     let ic: *const iseq_inline_constant_cache = const_cache_as_value.as_ptr();
     let idlist: *const ID = unsafe { (*ic).segments };
 
+    // Make sure there is an exit for this block as the interpreter might want
+    // to invalidate this block from yjit_constant_ic_update().
+    jit_ensure_block_entry_exit(jit, asm, ocb)?;
+
     // See vm_ic_hit_p(). The same conditions are checked in yjit_constant_ic_update().
+    // If a cache is not filled, fall back to the general C call.
     let ice = unsafe { (*ic).entry };
     if ice.is_null() {
-        // In this case, leave a block that unconditionally side exits
-        // for the interpreter to invalidate.
-        return CantCompile;
-    }
+        // Prepare for const_missing
+        jit_prepare_non_leaf_call(jit, asm);
 
-    // Make sure there is an exit for this block as the interpreter might want
-    // to invalidate this block from yjit_constant_ic_update().
-    jit_ensure_block_entry_exit(jit, ocb);
+        // If this does not trigger const_missing, vm_ic_update will invalidate this block.
+        extern "C" {
+            fn rb_vm_opt_getconstant_path(ec: EcPtr, cfp: CfpPtr, ic: *const u8) -> VALUE;
+        }
+        let val = asm.ccall(
+            rb_vm_opt_getconstant_path as *const u8,
+            vec![EC, CFP, Opnd::const_ptr(ic as *const u8)],
+        );
+
+        let stack_top = asm.stack_push(Type::Unknown);
+        asm.store(stack_top, val);
+
+        jump_to_next_insn(jit, asm, ocb);
+        return Some(EndBlock);
+    }
 
     if !unsafe { (*ice).ic_cref }.is_null() {
         // Cache is keyed on a certain lexical scope. Use the interpreter's cache.
-        let side_exit = get_side_exit(jit, ocb, ctx);
-
         let inline_cache = asm.load(Opnd::const_ptr(ic as *const u8));
 
         // Call function to verify the cache. It doesn't allocate or call methods.
@@ -6887,7 +9823,7 @@ fn gen_opt_getconstant_path(
         // Check the result. SysV only specifies one byte for _Bool return values,
         // so it's important we only check one bit to ignore the higher bits in the register.
         asm.test(ret_val, 1.into());
-        asm.jz(counted_exit!(ocb, side_exit, opt_getinlinecache_miss));
+        asm.jz(Target::side_exit(Counter::opt_getconstant_path_ic_miss));
 
         let inline_cache = asm.load(Opnd::const_ptr(ic as *const u8));
 
@@ -6904,24 +9840,24 @@ fn gen_opt_getconstant_path(
         ));
 
         // Push ic->entry->value
-        let stack_top = ctx.stack_push(Type::Unknown);
+        let stack_top = asm.stack_push(Type::Unknown);
         asm.store(stack_top, ic_entry_val);
     } else {
         // Optimize for single ractor mode.
-        // FIXME: This leaks when st_insert raises NoMemoryError
-        if !assume_single_ractor_mode(jit, ocb) {
-            return CantCompile;
+        if !assume_single_ractor_mode(jit, asm, ocb) {
+            gen_counter_incr(asm, Counter::opt_getconstant_path_multi_ractor);
+            return None;
         }
 
         // Invalidate output code on any constant writes associated with
         // constants referenced within the current block.
-        assume_stable_constant_names(jit, ocb, idlist);
+        jit.assume_stable_constant_names(asm, ocb, idlist);
 
-        jit_putobject(jit, ctx, asm, unsafe { (*ice).value });
+        jit_putobject(asm, unsafe { (*ice).value });
     }
 
-    jump_to_next_insn(jit, ctx, asm, ocb);
-    EndBlock
+    jump_to_next_insn(jit, asm, ocb);
+    Some(EndBlock)
 }
 
 // Push the explicit block parameter onto the temporary stack. Part of the
@@ -6929,31 +9865,29 @@ fn gen_opt_getconstant_path(
 // explicit block parameters.
 fn gen_getblockparamproxy(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    if !jit_at_current_insn(jit) {
-        defer_compilation(jit, ctx, asm, ocb);
-        return EndBlock;
+) -> Option<CodegenStatus> {
+    if !jit.at_current_insn() {
+        defer_compilation(jit, asm, ocb);
+        return Some(EndBlock);
     }
 
-    let starting_context = ctx.clone(); // make a copy for use with jit_chain_guard
-
-    // A mirror of the interpreter code. Checking for the case
-    // where it's pushing rb_block_param_proxy.
-    let side_exit = get_side_exit(jit, ocb, ctx);
-
     // EP level
-    let level = jit_get_arg(jit, 1).as_u32();
+    let level = jit.get_arg(1).as_u32();
 
     // Peek at the block handler so we can check whether it's nil
-    let comptime_handler = jit_peek_at_block_handler(jit, level);
+    let comptime_handler = jit.peek_at_block_handler(level);
 
-    // When a block handler is present, it should always be a GC-guarded
-    // pointer (VM_BH_ISEQ_BLOCK_P)
-    if comptime_handler.as_u64() != 0 && comptime_handler.as_u64() & 0x3 != 0x1 {
-        return CantCompile;
+    // Filter for the 4 cases we currently handle
+    if !(comptime_handler.as_u64() == 0 ||              // no block given
+            comptime_handler.as_u64() & 0x3 == 0x1 ||   // iseq block (no associated GC managed object)
+            comptime_handler.as_u64() & 0x3 == 0x3 ||   // ifunc block (no associated GC managed object)
+            unsafe { rb_obj_is_proc(comptime_handler) }.test() // block is a Proc
+        ) {
+        // Missing the symbol case, where we basically need to call Symbol#to_proc at runtime
+        gen_counter_incr(asm, Counter::gbpp_unsupported_type);
+        return None;
     }
 
     // Load environment pointer EP from CFP
@@ -6966,7 +9900,7 @@ fn gen_getblockparamproxy(
         SIZEOF_VALUE_I32 * (VM_ENV_DATA_INDEX_FLAGS as i32),
     );
     asm.test(flag_check, VM_FRAME_FLAG_MODIFIED_BLOCK_PARAM.into());
-    asm.jnz(counted_exit!(ocb, side_exit, gbpp_block_param_modified));
+    asm.jnz(Target::side_exit(Counter::gbpp_block_param_modified));
 
     // Load the block handler for the current frame
     // note, VM_ASSERT(VM_ENV_LOCAL_P(ep))
@@ -6974,7 +9908,12 @@ fn gen_getblockparamproxy(
         Opnd::mem(64, ep_opnd, SIZEOF_VALUE_I32 * VM_ENV_DATA_INDEX_SPECVAL)
     );
 
-    // Specialize compilation for the case where no block handler is present
+    // Use block handler sample to guide specialization...
+    // NOTE: we use jit_chain_guard() in this decision tree, and since
+    // there are only a few cases, it should never reach the depth limit or use
+    // the exit counter we pass to it.
+    //
+    // No block given
     if comptime_handler.as_u64() == 0 {
         // Bail if there is a block handler
         asm.cmp(block_handler, Opnd::UImm(0));
@@ -6982,58 +9921,91 @@ fn gen_getblockparamproxy(
         jit_chain_guard(
             JCC_JNZ,
             jit,
-            &starting_context,
             asm,
             ocb,
             SEND_MAX_DEPTH,
-            side_exit,
+            Counter::gbpp_block_handler_not_none,
         );
 
-        jit_putobject(jit, ctx, asm, Qnil);
-    } else {
-        // Block handler is a tagged pointer. Look at the tag. 0x03 is from VM_BH_ISEQ_BLOCK_P().
-        let block_handler = asm.and(block_handler, 0x3.into());
-
-        // Bail unless VM_BH_ISEQ_BLOCK_P(bh). This also checks for null.
-        asm.cmp(block_handler, 0x1.into());
-
+        jit_putobject(asm, Qnil);
+    } else if comptime_handler.as_u64() & 0x1 == 0x1 {
+        // This handles two cases which are nearly identical
+        // Block handler is a tagged pointer. Look at the tag.
+        //   VM_BH_ISEQ_BLOCK_P(): block_handler & 0x03 == 0x01
+        //   VM_BH_IFUNC_P():      block_handler & 0x03 == 0x03
+        // So to check for either of those cases we can use: val & 0x1 == 0x1
+        const _: () = assert!(RUBY_SYMBOL_FLAG & 1 == 0, "guard below rejects symbol block handlers");
+        // Procs are aligned heap pointers so testing the bit rejects them too.
+
+        asm.test(block_handler, 0x1.into());
         jit_chain_guard(
-            JCC_JNZ,
+            JCC_JZ,
             jit,
-            &starting_context,
             asm,
             ocb,
             SEND_MAX_DEPTH,
-            side_exit,
+            Counter::gbpp_block_handler_not_iseq,
         );
 
         // Push rb_block_param_proxy. It's a root, so no need to use jit_mov_gc_ptr.
         assert!(!unsafe { rb_block_param_proxy }.special_const_p());
 
-        let top = ctx.stack_push(Type::BlockParamProxy);
+        let top = asm.stack_push(Type::BlockParamProxy);
         asm.mov(top, Opnd::const_ptr(unsafe { rb_block_param_proxy }.as_ptr()));
+    } else if unsafe { rb_obj_is_proc(comptime_handler) }.test() {
+        // The block parameter is a Proc
+        c_callable! {
+            // We can't hold values across C calls due to a backend limitation,
+            // so we'll use this thin wrapper around rb_obj_is_proc().
+            fn is_proc(object: VALUE) -> VALUE {
+                if unsafe { rb_obj_is_proc(object) }.test() {
+                    // VM_BH_TO_PROC() is the identity function.
+                    object
+                } else {
+                    Qfalse
+                }
+            }
+        }
+
+        // Simple predicate, no need to jit_prepare_non_leaf_call()
+        let proc_or_false = asm.ccall(is_proc as _, vec![block_handler]);
+
+        // Guard for proc
+        asm.cmp(proc_or_false, Qfalse.into());
+        jit_chain_guard(
+            JCC_JE,
+            jit,
+            asm,
+            ocb,
+            SEND_MAX_DEPTH,
+            Counter::gbpp_block_handler_not_proc,
+        );
+
+        let top = asm.stack_push(Type::Unknown);
+        asm.mov(top, proc_or_false);
+    } else {
+        unreachable!("absurd given initial filtering");
     }
 
-    jump_to_next_insn(jit, ctx, asm, ocb);
+    jump_to_next_insn(jit, asm, ocb);
 
-    EndBlock
+    Some(EndBlock)
 }
 
 fn gen_getblockparam(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
-    ocb: &mut OutlinedCb,
-) -> CodegenStatus {
+    _ocb: &mut OutlinedCb,
+) -> Option<CodegenStatus> {
     // EP level
-    let level = jit_get_arg(jit, 1).as_u32();
+    let level = jit.get_arg(1).as_u32();
 
     // Save the PC and SP because we might allocate
-    jit_prepare_routine_call(jit, ctx, asm);
+    jit_prepare_call_with_gc(jit, asm);
+    asm.spill_temps(); // For ccall. Unconditionally spill them for RegTemps consistency.
 
     // A mirror of the interpreter code. Checking for the case
     // where it's pushing rb_block_param_proxy.
-    let side_exit = get_side_exit(jit, ocb, ctx);
 
     // Load environment pointer EP from CFP
     let ep_opnd = gen_get_ep(asm, level);
@@ -7061,7 +10033,7 @@ fn gen_getblockparam(
     asm.test(flags_opnd, VM_ENV_FLAG_WB_REQUIRED.into());
 
     // if (flags & VM_ENV_FLAG_WB_REQUIRED) != 0
-    asm.jnz(side_exit);
+    asm.jnz(Target::side_exit(Counter::gbp_wb_required));
 
     // Convert the block handler in to a proc
     // call rb_vm_bh_to_procval(const rb_execution_context_t *ec, VALUE block_handler)
@@ -7083,7 +10055,7 @@ fn gen_getblockparam(
     let ep_opnd = gen_get_ep(asm, level);
 
     // Write the value at the environment pointer
-    let idx = jit_get_arg(jit, 0).as_i32();
+    let idx = jit.get_arg(0).as_i32();
     let offs = -(SIZEOF_VALUE_I32 * idx);
     asm.mov(Opnd::mem(64, ep_opnd, offs), proc);
 
@@ -7095,47 +10067,47 @@ fn gen_getblockparam(
     asm.write_label(frame_flag_modified);
 
     // Push the proc on the stack
-    let stack_ret = ctx.stack_push(Type::Unknown);
+    let stack_ret = asm.stack_push(Type::Unknown);
     let ep_opnd = gen_get_ep(asm, level);
     asm.mov(stack_ret, Opnd::mem(64, ep_opnd, offs));
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 fn gen_invokebuiltin(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let bf: *const rb_builtin_function = jit_get_arg(jit, 0).as_ptr();
+) -> Option<CodegenStatus> {
+    let bf: *const rb_builtin_function = jit.get_arg(0).as_ptr();
     let bf_argc: usize = unsafe { (*bf).argc }.try_into().expect("non negative argc");
 
     // ec, self, and arguments
     if bf_argc + 2 > C_ARG_OPNDS.len() {
-        return CantCompile;
+        incr_counter!(invokebuiltin_too_many_args);
+        return None;
     }
 
     // If the calls don't allocate, do they need up to date PC, SP?
-    jit_prepare_routine_call(jit, ctx, asm);
+    jit_prepare_non_leaf_call(jit, asm);
 
     // Call the builtin func (ec, recv, arg1, arg2, ...)
     let mut args = vec![EC, Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF)];
 
     // Copy arguments from locals
     for i in 0..bf_argc {
-        let stack_opnd = ctx.stack_opnd((bf_argc - i - 1) as i32);
+        let stack_opnd = asm.stack_opnd((bf_argc - i - 1) as i32);
         args.push(stack_opnd);
     }
 
     let val = asm.ccall(unsafe { (*bf).func_ptr } as *const u8, args);
 
     // Push the return value
-    ctx.stack_pop(bf_argc);
-    let stack_ret = ctx.stack_push(Type::Unknown);
+    asm.stack_pop(bf_argc);
+    let stack_ret = asm.stack_push(Type::Unknown);
     asm.mov(stack_ret, val);
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 // opt_invokebuiltin_delegate calls a builtin function, like
@@ -7143,21 +10115,21 @@ fn gen_invokebuiltin(
 // stack uses the argument locals (and self) from the current method.
 fn gen_opt_invokebuiltin_delegate(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     _ocb: &mut OutlinedCb,
-) -> CodegenStatus {
-    let bf: *const rb_builtin_function = jit_get_arg(jit, 0).as_ptr();
+) -> Option<CodegenStatus> {
+    let bf: *const rb_builtin_function = jit.get_arg(0).as_ptr();
     let bf_argc = unsafe { (*bf).argc };
-    let start_index = jit_get_arg(jit, 1).as_i32();
+    let start_index = jit.get_arg(1).as_i32();
 
     // ec, self, and arguments
     if bf_argc + 2 > (C_ARG_OPNDS.len() as i32) {
-        return CantCompile;
+        incr_counter!(invokebuiltin_too_many_args);
+        return None;
     }
 
     // If the calls don't allocate, do they need up to date PC, SP?
-    jit_prepare_routine_call(jit, ctx, asm);
+    jit_prepare_non_leaf_call(jit, asm);
 
     // Call the builtin func (ec, recv, arg1, arg2, ...)
     let mut args = vec![EC, Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF)];
@@ -7177,10 +10149,10 @@ fn gen_opt_invokebuiltin_delegate(
     let val = asm.ccall(unsafe { (*bf).func_ptr } as *const u8, args);
 
     // Push the return value
-    let stack_ret = ctx.stack_push(Type::Unknown);
+    let stack_ret = asm.stack_push(Type::Unknown);
     asm.mov(stack_ret, val);
 
-    KeepCompiling
+    Some(KeepCompiling)
 }
 
 /// Maps a YARV opcode to a code generation function (if supported)
@@ -7227,14 +10199,19 @@ fn get_gen_fn(opcode: VALUE) -> Option<InsnGenFn> {
         YARVINSN_opt_mod => Some(gen_opt_mod),
         YARVINSN_opt_str_freeze => Some(gen_opt_str_freeze),
         YARVINSN_opt_str_uminus => Some(gen_opt_str_uminus),
-        YARVINSN_opt_newarray_max => Some(gen_opt_newarray_max),
-        YARVINSN_opt_newarray_min => Some(gen_opt_newarray_min),
+        YARVINSN_opt_newarray_send => Some(gen_opt_newarray_send),
         YARVINSN_splatarray => Some(gen_splatarray),
+        YARVINSN_splatkw => Some(gen_splatkw),
         YARVINSN_concatarray => Some(gen_concatarray),
+        YARVINSN_concattoarray => Some(gen_concattoarray),
+        YARVINSN_pushtoarray => Some(gen_pushtoarray),
         YARVINSN_newrange => Some(gen_newrange),
         YARVINSN_putstring => Some(gen_putstring),
+        YARVINSN_putchilledstring => Some(gen_putchilledstring),
         YARVINSN_expandarray => Some(gen_expandarray),
         YARVINSN_defined => Some(gen_defined),
+        YARVINSN_definedivar => Some(gen_definedivar),
+        YARVINSN_checkmatch => Some(gen_checkmatch),
         YARVINSN_checkkeyword => Some(gen_checkkeyword),
         YARVINSN_concatstrings => Some(gen_concatstrings),
         YARVINSN_getinstancevariable => Some(gen_getinstancevariable),
@@ -7244,6 +10221,7 @@ fn get_gen_fn(opcode: VALUE) -> Option<InsnGenFn> {
         YARVINSN_opt_neq => Some(gen_opt_neq),
         YARVINSN_opt_aref => Some(gen_opt_aref),
         YARVINSN_opt_aset => Some(gen_opt_aset),
+        YARVINSN_opt_aref_with => Some(gen_opt_aref_with),
         YARVINSN_opt_mult => Some(gen_opt_mult),
         YARVINSN_opt_div => Some(gen_opt_div),
         YARVINSN_opt_ltlt => Some(gen_opt_ltlt),
@@ -7263,14 +10241,17 @@ fn get_gen_fn(opcode: VALUE) -> Option<InsnGenFn> {
         YARVINSN_branchif => Some(gen_branchif),
         YARVINSN_branchunless => Some(gen_branchunless),
         YARVINSN_branchnil => Some(gen_branchnil),
+        YARVINSN_throw => Some(gen_throw),
         YARVINSN_jump => Some(gen_jump),
 
         YARVINSN_getblockparamproxy => Some(gen_getblockparamproxy),
         YARVINSN_getblockparam => Some(gen_getblockparam),
         YARVINSN_opt_send_without_block => Some(gen_opt_send_without_block),
         YARVINSN_send => Some(gen_send),
+        YARVINSN_sendforward => Some(gen_sendforward),
         YARVINSN_invokeblock => Some(gen_invokeblock),
         YARVINSN_invokesuper => Some(gen_invokesuper),
+        YARVINSN_invokesuperforward => Some(gen_invokesuperforward),
         YARVINSN_leave => Some(gen_leave),
 
         YARVINSN_getglobal => Some(gen_getglobal),
@@ -7289,22 +10270,123 @@ fn get_gen_fn(opcode: VALUE) -> Option<InsnGenFn> {
 }
 
 // Return true when the codegen function generates code.
-// known_recv_klass is non-NULL when the caller has used jit_guard_known_klass().
+// known_recv_class has Some value when the caller has used jit_guard_known_klass().
 // See yjit_reg_method().
 type MethodGenFn = fn(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
     ci: *const rb_callinfo,
     cme: *const rb_callable_method_entry_t,
-    block: Option<IseqPtr>,
+    block: Option<BlockHandler>,
     argc: i32,
-    known_recv_class: *const VALUE,
+    known_recv_class: Option<VALUE>,
 ) -> bool;
 
+/// Methods for generating code for hardcoded (usually C) methods
+static mut METHOD_CODEGEN_TABLE: Option<HashMap<usize, MethodGenFn>> = None;
+
+/// Register codegen functions for some Ruby core methods
+pub fn yjit_reg_method_codegen_fns() {
+    unsafe {
+        assert!(METHOD_CODEGEN_TABLE.is_none());
+        METHOD_CODEGEN_TABLE = Some(HashMap::default());
+
+        // Specialization for C methods. See yjit_reg_method() for details.
+        yjit_reg_method(rb_cBasicObject, "!", jit_rb_obj_not);
+
+        yjit_reg_method(rb_cNilClass, "nil?", jit_rb_true);
+        yjit_reg_method(rb_mKernel, "nil?", jit_rb_false);
+        yjit_reg_method(rb_mKernel, "is_a?", jit_rb_kernel_is_a);
+        yjit_reg_method(rb_mKernel, "kind_of?", jit_rb_kernel_is_a);
+        yjit_reg_method(rb_mKernel, "instance_of?", jit_rb_kernel_instance_of);
+
+        yjit_reg_method(rb_cBasicObject, "==", jit_rb_obj_equal);
+        yjit_reg_method(rb_cBasicObject, "equal?", jit_rb_obj_equal);
+        yjit_reg_method(rb_cBasicObject, "!=", jit_rb_obj_not_equal);
+        yjit_reg_method(rb_mKernel, "eql?", jit_rb_obj_equal);
+        yjit_reg_method(rb_cModule, "==", jit_rb_obj_equal);
+        yjit_reg_method(rb_cModule, "===", jit_rb_mod_eqq);
+        yjit_reg_method(rb_cSymbol, "==", jit_rb_obj_equal);
+        yjit_reg_method(rb_cSymbol, "===", jit_rb_obj_equal);
+        yjit_reg_method(rb_cInteger, "==", jit_rb_int_equal);
+        yjit_reg_method(rb_cInteger, "===", jit_rb_int_equal);
+
+        yjit_reg_method(rb_cInteger, "succ", jit_rb_int_succ);
+        yjit_reg_method(rb_cInteger, "/", jit_rb_int_div);
+        yjit_reg_method(rb_cInteger, "<<", jit_rb_int_lshift);
+        yjit_reg_method(rb_cInteger, ">>", jit_rb_int_rshift);
+        yjit_reg_method(rb_cInteger, "^", jit_rb_int_xor);
+        yjit_reg_method(rb_cInteger, "[]", jit_rb_int_aref);
+
+        yjit_reg_method(rb_cFloat, "+", jit_rb_float_plus);
+        yjit_reg_method(rb_cFloat, "-", jit_rb_float_minus);
+        yjit_reg_method(rb_cFloat, "*", jit_rb_float_mul);
+        yjit_reg_method(rb_cFloat, "/", jit_rb_float_div);
+
+        yjit_reg_method(rb_cString, "empty?", jit_rb_str_empty_p);
+        yjit_reg_method(rb_cString, "to_s", jit_rb_str_to_s);
+        yjit_reg_method(rb_cString, "to_str", jit_rb_str_to_s);
+        yjit_reg_method(rb_cString, "length", jit_rb_str_length);
+        yjit_reg_method(rb_cString, "size", jit_rb_str_length);
+        yjit_reg_method(rb_cString, "bytesize", jit_rb_str_bytesize);
+        yjit_reg_method(rb_cString, "getbyte", jit_rb_str_getbyte);
+        yjit_reg_method(rb_cString, "setbyte", jit_rb_str_setbyte);
+        yjit_reg_method(rb_cString, "byteslice", jit_rb_str_byteslice);
+        yjit_reg_method(rb_cString, "<<", jit_rb_str_concat);
+        yjit_reg_method(rb_cString, "+@", jit_rb_str_uplus);
+
+        yjit_reg_method(rb_cNilClass, "===", jit_rb_case_equal);
+        yjit_reg_method(rb_cTrueClass, "===", jit_rb_case_equal);
+        yjit_reg_method(rb_cFalseClass, "===", jit_rb_case_equal);
+
+        yjit_reg_method(rb_cArray, "empty?", jit_rb_ary_empty_p);
+        yjit_reg_method(rb_cArray, "length", jit_rb_ary_length);
+        yjit_reg_method(rb_cArray, "size", jit_rb_ary_length);
+        yjit_reg_method(rb_cArray, "<<", jit_rb_ary_push);
+
+        yjit_reg_method(rb_cHash, "empty?", jit_rb_hash_empty_p);
+
+        yjit_reg_method(rb_mKernel, "respond_to?", jit_obj_respond_to);
+        yjit_reg_method(rb_mKernel, "block_given?", jit_rb_f_block_given_p);
+
+        yjit_reg_method(rb_cClass, "superclass", jit_rb_class_superclass);
+
+        yjit_reg_method(rb_singleton_class(rb_cThread), "current", jit_thread_s_current);
+    }
+}
+
+// Register a specialized codegen function for a particular method. Note that
+// if the function returns true, the code it generates runs without a
+// control frame and without interrupt checks. To avoid creating observable
+// behavior changes, the codegen function should only target simple code paths
+// that do not allocate and do not make method calls.
+fn yjit_reg_method(klass: VALUE, mid_str: &str, gen_fn: MethodGenFn) {
+    let id_string = std::ffi::CString::new(mid_str).expect("couldn't convert to CString!");
+    let mid = unsafe { rb_intern(id_string.as_ptr()) };
+    let me = unsafe { rb_method_entry_at(klass, mid) };
+
+    if me.is_null() {
+        panic!("undefined optimized method!: {mid_str}");
+    }
+
+    // For now, only cfuncs are supported
+    //RUBY_ASSERT(me && me->def);
+    //RUBY_ASSERT(me->def->type == VM_METHOD_TYPE_CFUNC);
+
+    let method_serial = unsafe {
+        let def = (*me).def;
+        get_def_method_serial(def)
+    };
+
+    unsafe { METHOD_CODEGEN_TABLE.as_mut().unwrap().insert(method_serial, gen_fn); }
+}
+
 /// Global state needed for code generation
 pub struct CodegenGlobals {
+    /// Flat vector of bits to store compressed context data
+    context_data: BitVector,
+
     /// Inline code block (fast path)
     inline_cb: CodeBlock,
 
@@ -7314,36 +10396,31 @@ pub struct CodegenGlobals {
     /// Code for exiting back to the interpreter from the leave instruction
     leave_exit_code: CodePtr,
 
+    /// Code for exiting back to the interpreter after handling an exception
+    leave_exception_code: CodePtr,
+
     // For exiting from YJIT frame from branch_stub_hit().
-    // Filled by gen_code_for_exit_from_stub().
+    // Filled by gen_stub_exit().
     stub_exit_code: CodePtr,
 
     // For servicing branch stubs
     branch_stub_hit_trampoline: CodePtr,
 
+    // For servicing entry stubs
+    entry_stub_hit_trampoline: CodePtr,
+
     // Code for full logic of returning from C method and exiting to the interpreter
     outline_full_cfunc_return_pos: CodePtr,
 
     /// For implementing global code invalidation
     global_inval_patches: Vec<CodepagePatch>,
 
-    /// For implementing global code invalidation. The number of bytes counting from the beginning
-    /// of the inline code block that should not be changed. After patching for global invalidation,
-    /// no one should make changes to the invalidated code region anymore. This is used to
-    /// break out of invalidation race when there are multiple ractors.
-    inline_frozen_bytes: usize,
-
-    // Methods for generating code for hardcoded (usually C) methods
-    method_codegen_table: HashMap<usize, MethodGenFn>,
-
     /// Page indexes for outlined code that are not associated to any ISEQ.
     ocb_pages: Vec<usize>,
 
-    /// Freed page indexes. None if code GC has not been used.
-    freed_pages: Option<Vec<usize>>,
-
-    /// How many times code GC has been executed.
-    code_gc_count: usize,
+    /// Map of cfunc YARV PCs to CMEs and receiver indexes, used to lazily push
+    /// a frame when rb_yjit_lazy_push_frame() is called with a PC in this HashMap.
+    pc_to_cfunc: HashMap<*mut VALUE, (*const rb_callable_method_entry_t, u8)>,
 }
 
 /// For implementing global code invalidation. A position in the inline
@@ -7363,12 +10440,8 @@ impl CodegenGlobals {
         // Executable memory and code page size in bytes
         let mem_size = get_option!(exec_mem_size);
 
-
         #[cfg(not(test))]
         let (mut cb, mut ocb) = {
-            use std::cell::RefCell;
-            use std::rc::Rc;
-
             let virt_block: *mut u8 = unsafe { rb_yjit_reserve_addr_space(mem_size as u32) };
 
             // Memory protection syscalls need page-aligned addresses, so check it here. Assuming
@@ -7395,10 +10468,9 @@ impl CodegenGlobals {
             );
             let mem_block = Rc::new(RefCell::new(mem_block));
 
-            let cb = CodeBlock::new(mem_block.clone(), false);
-            let ocb = OutlinedCb::wrap(CodeBlock::new(mem_block, true));
-
-            assert_eq!(cb.page_size() % page_size.as_usize(), 0, "code page size is not page-aligned");
+            let freed_pages = Rc::new(None);
+            let cb = CodeBlock::new(mem_block.clone(), false, freed_pages.clone());
+            let ocb = OutlinedCb::wrap(CodeBlock::new(mem_block, true, freed_pages));
 
             (cb, ocb)
         };
@@ -7411,14 +10483,16 @@ impl CodegenGlobals {
         let mut ocb = OutlinedCb::wrap(CodeBlock::new_dummy(mem_size / 2));
 
         let ocb_start_addr = ocb.unwrap().get_write_ptr();
-        let leave_exit_code = gen_leave_exit(&mut ocb);
+        let leave_exit_code = gen_leave_exit(&mut ocb).unwrap();
+        let leave_exception_code = gen_leave_exception(&mut ocb).unwrap();
 
-        let stub_exit_code = gen_code_for_exit_from_stub(&mut ocb);
+        let stub_exit_code = gen_stub_exit(&mut ocb).unwrap();
 
-        let branch_stub_hit_trampoline = gen_branch_stub_hit_trampoline(&mut ocb);
+        let branch_stub_hit_trampoline = gen_branch_stub_hit_trampoline(&mut ocb).unwrap();
+        let entry_stub_hit_trampoline = gen_entry_stub_hit_trampoline(&mut ocb).unwrap();
 
         // Generate full exit code for C func
-        let cfunc_exit_code = gen_full_cfunc_return(&mut ocb);
+        let cfunc_exit_code = gen_full_cfunc_return(&mut ocb).unwrap();
 
         let ocb_end_addr = ocb.unwrap().get_write_ptr();
         let ocb_pages = ocb.unwrap().addrs_to_pages(ocb_start_addr, ocb_end_addr);
@@ -7427,93 +10501,27 @@ impl CodegenGlobals {
         cb.mark_all_executable();
         ocb.unwrap().mark_all_executable();
 
-        let mut codegen_globals = CodegenGlobals {
+        let codegen_globals = CodegenGlobals {
+            context_data: BitVector::new(),
             inline_cb: cb,
             outlined_cb: ocb,
             leave_exit_code,
-            stub_exit_code: stub_exit_code,
+            leave_exception_code,
+            stub_exit_code,
             outline_full_cfunc_return_pos: cfunc_exit_code,
             branch_stub_hit_trampoline,
+            entry_stub_hit_trampoline,
             global_inval_patches: Vec::new(),
-            inline_frozen_bytes: 0,
-            method_codegen_table: HashMap::new(),
             ocb_pages,
-            freed_pages: None,
-            code_gc_count: 0,
+            pc_to_cfunc: HashMap::new(),
         };
 
-        // Register the method codegen functions
-        codegen_globals.reg_method_codegen_fns();
-
         // Initialize the codegen globals instance
         unsafe {
             CODEGEN_GLOBALS = Some(codegen_globals);
         }
     }
 
-    // Register a specialized codegen function for a particular method. Note that
-    // the if the function returns true, the code it generates runs without a
-    // control frame and without interrupt checks. To avoid creating observable
-    // behavior changes, the codegen function should only target simple code paths
-    // that do not allocate and do not make method calls.
-    fn yjit_reg_method(&mut self, klass: VALUE, mid_str: &str, gen_fn: MethodGenFn) {
-        let id_string = std::ffi::CString::new(mid_str).expect("couldn't convert to CString!");
-        let mid = unsafe { rb_intern(id_string.as_ptr()) };
-        let me = unsafe { rb_method_entry_at(klass, mid) };
-
-        if me.is_null() {
-            panic!("undefined optimized method!");
-        }
-
-        // For now, only cfuncs are supported
-        //RUBY_ASSERT(me && me->def);
-        //RUBY_ASSERT(me->def->type == VM_METHOD_TYPE_CFUNC);
-
-        let method_serial = unsafe {
-            let def = (*me).def;
-            get_def_method_serial(def)
-        };
-
-        self.method_codegen_table.insert(method_serial, gen_fn);
-    }
-
-    /// Register codegen functions for some Ruby core methods
-    fn reg_method_codegen_fns(&mut self) {
-        unsafe {
-            // Specialization for C methods. See yjit_reg_method() for details.
-            self.yjit_reg_method(rb_cBasicObject, "!", jit_rb_obj_not);
-
-            self.yjit_reg_method(rb_cNilClass, "nil?", jit_rb_true);
-            self.yjit_reg_method(rb_mKernel, "nil?", jit_rb_false);
-
-            self.yjit_reg_method(rb_cBasicObject, "==", jit_rb_obj_equal);
-            self.yjit_reg_method(rb_cBasicObject, "equal?", jit_rb_obj_equal);
-            self.yjit_reg_method(rb_mKernel, "eql?", jit_rb_obj_equal);
-            self.yjit_reg_method(rb_cModule, "==", jit_rb_obj_equal);
-            self.yjit_reg_method(rb_cSymbol, "==", jit_rb_obj_equal);
-            self.yjit_reg_method(rb_cSymbol, "===", jit_rb_obj_equal);
-            self.yjit_reg_method(rb_cInteger, "==", jit_rb_int_equal);
-            self.yjit_reg_method(rb_cInteger, "===", jit_rb_int_equal);
-
-            // rb_str_to_s() methods in string.c
-            self.yjit_reg_method(rb_cString, "empty?", jit_rb_str_empty);
-            self.yjit_reg_method(rb_cString, "to_s", jit_rb_str_to_s);
-            self.yjit_reg_method(rb_cString, "to_str", jit_rb_str_to_s);
-            self.yjit_reg_method(rb_cString, "bytesize", jit_rb_str_bytesize);
-            self.yjit_reg_method(rb_cString, "<<", jit_rb_str_concat);
-            self.yjit_reg_method(rb_cString, "+@", jit_rb_str_uplus);
-
-            self.yjit_reg_method(rb_mKernel, "respond_to?", jit_obj_respond_to);
-
-            // Thread.current
-            self.yjit_reg_method(
-                rb_singleton_class(rb_cThread),
-                "current",
-                jit_thread_s_current,
-            );
-        }
-    }
-
     /// Get a mutable reference to the codegen globals instance
     pub fn get_instance() -> &'static mut CodegenGlobals {
         unsafe { CODEGEN_GLOBALS.as_mut().unwrap() }
@@ -7523,6 +10531,11 @@ impl CodegenGlobals {
         unsafe { CODEGEN_GLOBALS.as_mut().is_some() }
     }
 
+    /// Get a mutable reference to the context data
+    pub fn get_context_data() -> &'static mut BitVector {
+        &mut CodegenGlobals::get_instance().context_data
+    }
+
     /// Get a mutable reference to the inline code block
     pub fn get_inline_cb() -> &'static mut CodeBlock {
         &mut CodegenGlobals::get_instance().inline_cb
@@ -7537,14 +10550,26 @@ impl CodegenGlobals {
         CodegenGlobals::get_instance().leave_exit_code
     }
 
+    pub fn get_leave_exception_code() -> CodePtr {
+        CodegenGlobals::get_instance().leave_exception_code
+    }
+
     pub fn get_stub_exit_code() -> CodePtr {
         CodegenGlobals::get_instance().stub_exit_code
     }
 
-    pub fn push_global_inval_patch(i_pos: CodePtr, o_pos: CodePtr) {
+    pub fn push_global_inval_patch(inline_pos: CodePtr, outlined_pos: CodePtr, cb: &CodeBlock) {
+        if let Some(last_patch) = CodegenGlobals::get_instance().global_inval_patches.last() {
+            let patch_offset = inline_pos.as_offset() - last_patch.inline_patch_pos.as_offset();
+            assert!(
+                patch_offset < 0 || cb.jmp_ptr_bytes() as i64 <= patch_offset,
+                "patches should not overlap (patch_offset: {patch_offset})",
+            );
+        }
+
         let patch = CodepagePatch {
-            inline_patch_pos: i_pos,
-            outlined_target_pos: o_pos,
+            inline_patch_pos: inline_pos,
+            outlined_target_pos: outlined_pos,
         };
         CodegenGlobals::get_instance()
             .global_inval_patches
@@ -7557,14 +10582,6 @@ impl CodegenGlobals {
         mem::take(&mut globals.global_inval_patches)
     }
 
-    pub fn get_inline_frozen_bytes() -> usize {
-        CodegenGlobals::get_instance().inline_frozen_bytes
-    }
-
-    pub fn set_inline_frozen_bytes(frozen_bytes: usize) {
-        CodegenGlobals::get_instance().inline_frozen_bytes = frozen_bytes;
-    }
-
     pub fn get_outline_full_cfunc_return_pos() -> CodePtr {
         CodegenGlobals::get_instance().outline_full_cfunc_return_pos
     }
@@ -7573,31 +10590,16 @@ impl CodegenGlobals {
         CodegenGlobals::get_instance().branch_stub_hit_trampoline
     }
 
-    pub fn look_up_codegen_method(method_serial: usize) -> Option<MethodGenFn> {
-        let table = &CodegenGlobals::get_instance().method_codegen_table;
-
-        let option_ref = table.get(&method_serial);
-        match option_ref {
-            None => None,
-            Some(&mgf) => Some(mgf), // Deref
-        }
+    pub fn get_entry_stub_hit_trampoline() -> CodePtr {
+        CodegenGlobals::get_instance().entry_stub_hit_trampoline
     }
 
     pub fn get_ocb_pages() -> &'static Vec<usize> {
         &CodegenGlobals::get_instance().ocb_pages
     }
 
-    pub fn get_freed_pages() -> &'static mut Option<Vec<usize>> {
-        &mut CodegenGlobals::get_instance().freed_pages
-    }
-
-    pub fn set_freed_pages(freed_pages: Vec<usize>) {
-        CodegenGlobals::get_instance().freed_pages = Some(freed_pages);
-        CodegenGlobals::get_instance().code_gc_count += 1;
-    }
-
-    pub fn get_code_gc_count() -> usize {
-        CodegenGlobals::get_instance().code_gc_count
+    pub fn get_pc_to_cfunc() -> &'static mut HashMap<*mut VALUE, (*const rb_callable_method_entry_t, u8)> {
+        &mut CodegenGlobals::get_instance().pc_to_cfunc
     }
 }
 
@@ -7606,17 +10608,18 @@ mod tests {
     use super::*;
 
     fn setup_codegen() -> (JITState, Context, Assembler, CodeBlock, OutlinedCb) {
-        let blockid = BlockId {
-            iseq: ptr::null(),
-            idx: 0,
-        };
-        let block = Block::new(blockid, &Context::default());
+        let cb = CodeBlock::new_dummy(256 * 1024);
 
         return (
-            JITState::new(&block),
+            JITState::new(
+                BlockId { iseq: std::ptr::null(), idx: 0 },
+                Context::default(),
+                cb.get_write_ptr(),
+                ptr::null(), // No execution context in tests. No peeking!
+            ),
             Context::default(),
             Assembler::new(),
-            CodeBlock::new_dummy(256 * 1024),
+            cb,
             OutlinedCb::wrap(CodeBlock::new_dummy(256 * 1024)),
         );
     }
@@ -7630,250 +10633,203 @@ mod tests {
 
     #[test]
     fn test_gen_exit() {
-        let (_, ctx, mut asm, mut cb, _) = setup_codegen();
-        gen_exit(0 as *mut VALUE, &ctx, &mut asm);
-        asm.compile(&mut cb);
+        let (_, _ctx, mut asm, mut cb, _) = setup_codegen();
+        gen_exit(0 as *mut VALUE, &mut asm);
+        asm.compile(&mut cb, None).unwrap();
         assert!(cb.get_write_pos() > 0);
     }
 
     #[test]
     fn test_get_side_exit() {
-        let (mut jit, ctx, _, _, mut ocb) = setup_codegen();
-         get_side_exit(&mut jit, &mut ocb, &ctx);
+        let (_jit, ctx, mut asm, _, mut ocb) = setup_codegen();
+        let side_exit_context = SideExitContext::new(0 as _, ctx);
+        asm.get_side_exit(&side_exit_context, None, &mut ocb);
         assert!(ocb.unwrap().get_write_pos() > 0);
     }
 
     #[test]
     fn test_gen_check_ints() {
-        let (_, _ctx, mut asm, _cb, mut ocb) = setup_codegen();
-        let side_exit = ocb.unwrap().get_write_ptr().as_side_exit();
-        gen_check_ints(&mut asm, side_exit);
+        let (_jit, _ctx, mut asm, _cb, _ocb) = setup_codegen();
+        asm.set_side_exit_context(0 as _, 0);
+        gen_check_ints(&mut asm, Counter::guard_send_interrupted);
     }
 
     #[test]
     fn test_gen_nop() {
-        let (mut jit, mut context, mut asm, mut cb, mut ocb) = setup_codegen();
-        let status = gen_nop(&mut jit, &mut context, &mut asm, &mut ocb);
-        asm.compile(&mut cb);
+        let (mut jit, context, mut asm, mut cb, mut ocb) = setup_codegen();
+        let status = gen_nop(&mut jit, &mut asm, &mut ocb);
+        asm.compile(&mut cb, None).unwrap();
 
-        assert_eq!(status, KeepCompiling);
-        assert_eq!(context.diff(&Context::default()), 0);
+        assert_eq!(status, Some(KeepCompiling));
+        assert_eq!(context.diff(&Context::default()), TypeDiff::Compatible(0));
         assert_eq!(cb.get_write_pos(), 0);
     }
 
     #[test]
     fn test_gen_pop() {
         let (mut jit, _, mut asm, _cb, mut ocb) = setup_codegen();
-        let mut context = Context::default();
-        context.stack_push(Type::Fixnum);
-        let status = gen_pop(&mut jit, &mut context, &mut asm, &mut ocb);
+        let context = Context::default();
+        asm.stack_push(Type::Fixnum);
+        let status = gen_pop(&mut jit, &mut asm, &mut ocb);
 
-        assert_eq!(status, KeepCompiling);
-        assert_eq!(context.diff(&Context::default()), 0);
+        assert_eq!(status, Some(KeepCompiling));
+        let mut default = Context::default();
+        default.set_reg_temps(context.get_reg_temps());
+        assert_eq!(context.diff(&default), TypeDiff::Compatible(0));
     }
 
     #[test]
     fn test_gen_dup() {
-        let (mut jit, mut context, mut asm, mut cb, mut ocb) = setup_codegen();
-        context.stack_push(Type::Fixnum);
-        let status = gen_dup(&mut jit, &mut context, &mut asm, &mut ocb);
+        let (mut jit, _context, mut asm, mut cb, mut ocb) = setup_codegen();
+        asm.stack_push(Type::Fixnum);
+        let status = gen_dup(&mut jit, &mut asm, &mut ocb);
 
-        assert_eq!(status, KeepCompiling);
+        assert_eq!(status, Some(KeepCompiling));
 
         // Did we duplicate the type information for the Fixnum type?
-        assert_eq!(Type::Fixnum, context.get_opnd_type(StackOpnd(0)));
-        assert_eq!(Type::Fixnum, context.get_opnd_type(StackOpnd(1)));
+        assert_eq!(Type::Fixnum, asm.ctx.get_opnd_type(StackOpnd(0)));
+        assert_eq!(Type::Fixnum, asm.ctx.get_opnd_type(StackOpnd(1)));
 
-        asm.compile(&mut cb);
+        asm.compile(&mut cb, None).unwrap();
         assert!(cb.get_write_pos() > 0); // Write some movs
     }
 
     #[test]
     fn test_gen_dupn() {
-        let (mut jit, mut context, mut asm, mut cb, mut ocb) = setup_codegen();
-        context.stack_push(Type::Fixnum);
-        context.stack_push(Type::Flonum);
+        let (mut jit, _context, mut asm, mut cb, mut ocb) = setup_codegen();
+        asm.stack_push(Type::Fixnum);
+        asm.stack_push(Type::Flonum);
 
         let mut value_array: [u64; 2] = [0, 2]; // We only compile for n == 2
         let pc: *mut VALUE = &mut value_array as *mut u64 as *mut VALUE;
         jit.pc = pc;
 
-        let status = gen_dupn(&mut jit, &mut context, &mut asm, &mut ocb);
+        let status = gen_dupn(&mut jit, &mut asm, &mut ocb);
 
-        assert_eq!(status, KeepCompiling);
+        assert_eq!(status, Some(KeepCompiling));
 
-        assert_eq!(Type::Fixnum, context.get_opnd_type(StackOpnd(3)));
-        assert_eq!(Type::Flonum, context.get_opnd_type(StackOpnd(2)));
-        assert_eq!(Type::Fixnum, context.get_opnd_type(StackOpnd(1)));
-        assert_eq!(Type::Flonum, context.get_opnd_type(StackOpnd(0)));
+        assert_eq!(Type::Fixnum, asm.ctx.get_opnd_type(StackOpnd(3)));
+        assert_eq!(Type::Flonum, asm.ctx.get_opnd_type(StackOpnd(2)));
+        assert_eq!(Type::Fixnum, asm.ctx.get_opnd_type(StackOpnd(1)));
+        assert_eq!(Type::Flonum, asm.ctx.get_opnd_type(StackOpnd(0)));
 
         // TODO: this is writing zero bytes on x86. Why?
-        asm.compile(&mut cb);
+        asm.compile(&mut cb, None).unwrap();
         assert!(cb.get_write_pos() > 0); // Write some movs
     }
 
     #[test]
     fn test_gen_swap() {
-        let (mut jit, mut context, mut asm, _cb, mut ocb) = setup_codegen();
-        context.stack_push(Type::Fixnum);
-        context.stack_push(Type::Flonum);
+        let (mut jit, _context, mut asm, _cb, mut ocb) = setup_codegen();
+        asm.stack_push(Type::Fixnum);
+        asm.stack_push(Type::Flonum);
 
-        let status = gen_swap(&mut jit, &mut context, &mut asm, &mut ocb);
+        let status = gen_swap(&mut jit, &mut asm, &mut ocb);
 
-        let (_, tmp_type_top) = context.get_opnd_mapping(StackOpnd(0));
-        let (_, tmp_type_next) = context.get_opnd_mapping(StackOpnd(1));
+        let tmp_type_top = asm.ctx.get_opnd_type(StackOpnd(0));
+        let tmp_type_next = asm.ctx.get_opnd_type(StackOpnd(1));
 
-        assert_eq!(status, KeepCompiling);
+        assert_eq!(status, Some(KeepCompiling));
         assert_eq!(tmp_type_top, Type::Fixnum);
         assert_eq!(tmp_type_next, Type::Flonum);
     }
 
     #[test]
     fn test_putnil() {
-        let (mut jit, mut context, mut asm, mut cb, mut ocb) = setup_codegen();
-        let status = gen_putnil(&mut jit, &mut context, &mut asm, &mut ocb);
+        let (mut jit, _context, mut asm, mut cb, mut ocb) = setup_codegen();
+        let status = gen_putnil(&mut jit, &mut asm, &mut ocb);
 
-        let (_, tmp_type_top) = context.get_opnd_mapping(StackOpnd(0));
+        let tmp_type_top = asm.ctx.get_opnd_type(StackOpnd(0));
 
-        assert_eq!(status, KeepCompiling);
+        assert_eq!(status, Some(KeepCompiling));
         assert_eq!(tmp_type_top, Type::Nil);
-        asm.compile(&mut cb);
+        asm.compile(&mut cb, None).unwrap();
         assert!(cb.get_write_pos() > 0);
     }
 
-    #[test]
-    fn test_putobject_qtrue() {
-        // Test gen_putobject with Qtrue
-        let (mut jit, mut context, mut asm, mut cb, mut ocb) = setup_codegen();
-
-        let mut value_array: [u64; 2] = [0, Qtrue.into()];
-        let pc: *mut VALUE = &mut value_array as *mut u64 as *mut VALUE;
-        jit.pc = pc;
-
-        let status = gen_putobject(&mut jit, &mut context, &mut asm, &mut ocb);
-
-        let (_, tmp_type_top) = context.get_opnd_mapping(StackOpnd(0));
-
-        assert_eq!(status, KeepCompiling);
-        assert_eq!(tmp_type_top, Type::True);
-        asm.compile(&mut cb);
-        assert!(cb.get_write_pos() > 0);
-    }
-
-    #[test]
-    fn test_putobject_fixnum() {
-        // Test gen_putobject with a Fixnum to test another conditional branch
-        let (mut jit, mut context, mut asm, mut cb, mut ocb) = setup_codegen();
-
-        // The Fixnum 7 is encoded as 7 * 2 + 1, or 15
-        let mut value_array: [u64; 2] = [0, 15];
-        let pc: *mut VALUE = &mut value_array as *mut u64 as *mut VALUE;
-        jit.pc = pc;
-
-        let status = gen_putobject(&mut jit, &mut context, &mut asm, &mut ocb);
-
-        let (_, tmp_type_top) = context.get_opnd_mapping(StackOpnd(0));
-
-        assert_eq!(status, KeepCompiling);
-        assert_eq!(tmp_type_top, Type::Fixnum);
-        asm.compile(&mut cb);
-        assert!(cb.get_write_pos() > 0);
-    }
-
-    #[test]
-    fn test_int2fix() {
-        let (mut jit, mut context, mut asm, _cb, mut ocb) = setup_codegen();
-        jit.opcode = YARVINSN_putobject_INT2FIX_0_.as_usize();
-        let status = gen_putobject_int2fix(&mut jit, &mut context, &mut asm, &mut ocb);
-
-        let (_, tmp_type_top) = context.get_opnd_mapping(StackOpnd(0));
-
-        // Right now we're not testing the generated machine code to make sure a literal 1 or 0 was pushed. I've checked locally.
-        assert_eq!(status, KeepCompiling);
-        assert_eq!(tmp_type_top, Type::Fixnum);
-    }
 
     #[test]
     fn test_putself() {
-        let (mut jit, mut context, mut asm, mut cb, mut ocb) = setup_codegen();
-        let status = gen_putself(&mut jit, &mut context, &mut asm, &mut ocb);
+        let (mut jit, _context, mut asm, mut cb, mut ocb) = setup_codegen();
+        let status = gen_putself(&mut jit, &mut asm, &mut ocb);
 
-        assert_eq!(status, KeepCompiling);
-        asm.compile(&mut cb);
+        assert_eq!(status, Some(KeepCompiling));
+        asm.compile(&mut cb, None).unwrap();
         assert!(cb.get_write_pos() > 0);
     }
 
     #[test]
     fn test_gen_setn() {
-        let (mut jit, mut context, mut asm, mut cb, mut ocb) = setup_codegen();
-        context.stack_push(Type::Fixnum);
-        context.stack_push(Type::Flonum);
-        context.stack_push(Type::CString);
+        let (mut jit, _context, mut asm, mut cb, mut ocb) = setup_codegen();
+        asm.stack_push(Type::Fixnum);
+        asm.stack_push(Type::Flonum);
+        asm.stack_push(Type::CString);
 
         let mut value_array: [u64; 2] = [0, 2];
         let pc: *mut VALUE = &mut value_array as *mut u64 as *mut VALUE;
         jit.pc = pc;
 
-        let status = gen_setn(&mut jit, &mut context, &mut asm, &mut ocb);
+        let status = gen_setn(&mut jit, &mut asm, &mut ocb);
 
-        assert_eq!(status, KeepCompiling);
+        assert_eq!(status, Some(KeepCompiling));
 
-        assert_eq!(Type::CString, context.get_opnd_type(StackOpnd(2)));
-        assert_eq!(Type::Flonum, context.get_opnd_type(StackOpnd(1)));
-        assert_eq!(Type::CString, context.get_opnd_type(StackOpnd(0)));
+        assert_eq!(Type::CString, asm.ctx.get_opnd_type(StackOpnd(2)));
+        assert_eq!(Type::Flonum, asm.ctx.get_opnd_type(StackOpnd(1)));
+        assert_eq!(Type::CString, asm.ctx.get_opnd_type(StackOpnd(0)));
 
-        asm.compile(&mut cb);
+        asm.compile(&mut cb, None).unwrap();
         assert!(cb.get_write_pos() > 0);
     }
 
     #[test]
     fn test_gen_topn() {
-        let (mut jit, mut context, mut asm, mut cb, mut ocb) = setup_codegen();
-        context.stack_push(Type::Flonum);
-        context.stack_push(Type::CString);
+        let (mut jit, _context, mut asm, mut cb, mut ocb) = setup_codegen();
+        asm.stack_push(Type::Flonum);
+        asm.stack_push(Type::CString);
 
         let mut value_array: [u64; 2] = [0, 1];
         let pc: *mut VALUE = &mut value_array as *mut u64 as *mut VALUE;
         jit.pc = pc;
 
-        let status = gen_topn(&mut jit, &mut context, &mut asm, &mut ocb);
+        let status = gen_topn(&mut jit, &mut asm, &mut ocb);
 
-        assert_eq!(status, KeepCompiling);
+        assert_eq!(status, Some(KeepCompiling));
 
-        assert_eq!(Type::Flonum, context.get_opnd_type(StackOpnd(2)));
-        assert_eq!(Type::CString, context.get_opnd_type(StackOpnd(1)));
-        assert_eq!(Type::Flonum, context.get_opnd_type(StackOpnd(0)));
+        assert_eq!(Type::Flonum, asm.ctx.get_opnd_type(StackOpnd(2)));
+        assert_eq!(Type::CString, asm.ctx.get_opnd_type(StackOpnd(1)));
+        assert_eq!(Type::Flonum, asm.ctx.get_opnd_type(StackOpnd(0)));
 
-        asm.compile(&mut cb);
+        asm.compile(&mut cb, None).unwrap();
         assert!(cb.get_write_pos() > 0); // Write some movs
     }
 
     #[test]
     fn test_gen_adjuststack() {
-        let (mut jit, mut context, mut asm, mut cb, mut ocb) = setup_codegen();
-        context.stack_push(Type::Flonum);
-        context.stack_push(Type::CString);
-        context.stack_push(Type::Fixnum);
+        let (mut jit, _context, mut asm, mut cb, mut ocb) = setup_codegen();
+        asm.stack_push(Type::Flonum);
+        asm.stack_push(Type::CString);
+        asm.stack_push(Type::Fixnum);
 
         let mut value_array: [u64; 3] = [0, 2, 0];
         let pc: *mut VALUE = &mut value_array as *mut u64 as *mut VALUE;
         jit.pc = pc;
 
-        let status = gen_adjuststack(&mut jit, &mut context, &mut asm, &mut ocb);
+        let status = gen_adjuststack(&mut jit, &mut asm, &mut ocb);
 
-        assert_eq!(status, KeepCompiling);
+        assert_eq!(status, Some(KeepCompiling));
 
-        assert_eq!(Type::Flonum, context.get_opnd_type(StackOpnd(0)));
+        assert_eq!(Type::Flonum, asm.ctx.get_opnd_type(StackOpnd(0)));
 
-        asm.compile(&mut cb);
+        asm.compile(&mut cb, None).unwrap();
         assert!(cb.get_write_pos() == 0); // No instructions written
     }
 
     #[test]
     fn test_gen_leave() {
-        let (mut jit, mut context, mut asm, _cb, mut ocb) = setup_codegen();
+        let (mut jit, _context, mut asm, _cb, mut ocb) = setup_codegen();
         // Push return value
-        context.stack_push(Type::Fixnum);
-        gen_leave(&mut jit, &mut context, &mut asm, &mut ocb);
+        asm.stack_push(Type::Fixnum);
+        asm.set_side_exit_context(0 as _, 0);
+        gen_leave(&mut jit, &mut asm, &mut ocb);
     }
 }