Diffstat (limited to 'yjit/src')
 yjit/src/asm/mod.rs            |   54
 yjit/src/asm/x86_64/mod.rs     |   10
 yjit/src/asm/x86_64/tests.rs   |    1
 yjit/src/backend/arm64/mod.rs  |   30
 yjit/src/backend/ir.rs         |   20
 yjit/src/backend/tests.rs      |    4
 yjit/src/backend/x86_64/mod.rs |   30
 yjit/src/codegen.rs            | 1216
 yjit/src/core.rs               |  258
 yjit/src/cruby.rs              |   49
 yjit/src/cruby_bindings.inc.rs |  475
 yjit/src/disasm.rs             |   46
 yjit/src/invariants.rs         |    8
 yjit/src/lib.rs                |   13
 yjit/src/log.rs                |    2
 yjit/src/options.rs            |   25
 yjit/src/stats.rs              |  104
 yjit/src/utils.rs              |   22
 yjit/src/virtualmem.rs         |   99
 yjit/src/yjit.rs               |   47
 20 files changed, 1604 insertions(+), 909 deletions(-)
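A recurring change in the diff below is that CodeBlock now holds Rc<VirtualMem> rather than Rc<RefCell<VirtualMem>>, so the .borrow()/.borrow_mut() calls at the call sites disappear and writes go through &self methods; the interior mutability presumably moves inside VirtualMem itself. A minimal standalone sketch of that ownership pattern, using illustrative names rather than the real VirtualMem API:

use std::cell::RefCell;
use std::rc::Rc;

// Stand-in for VirtualMem: interior mutability lives inside the type,
// so shared owners only need `Rc<Mem>` and call `&self` methods.
struct Mem {
    bytes: RefCell<Vec<u8>>,
}

impl Mem {
    fn write_byte(&self, pos: usize, byte: u8) {
        self.bytes.borrow_mut()[pos] = byte;
    }

    fn size(&self) -> usize {
        self.bytes.borrow().len()
    }
}

fn main() {
    let mem = Rc::new(Mem { bytes: RefCell::new(vec![0u8; 4]) });
    let code_block = Rc::clone(&mem);
    code_block.write_byte(0, 0xc3); // no outer RefCell, no borrow_mut() at the call site
    assert_eq!(mem.size(), 4);
}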
diff --git a/yjit/src/asm/mod.rs b/yjit/src/asm/mod.rs index ed6feb3174..9ef675b34d 100644 --- a/yjit/src/asm/mod.rs +++ b/yjit/src/asm/mod.rs @@ -1,4 +1,3 @@ -use std::cell::RefCell; use std::fmt; use std::mem; use std::rc::Rc; @@ -44,7 +43,7 @@ pub struct LabelRef { /// Block of memory into which instructions can be assembled pub struct CodeBlock { // Memory for storing the encoded instructions - mem_block: Rc<RefCell<VirtualMem>>, + mem_block: Rc<VirtualMem>, // Size of a code page in bytes. Each code page is split into an inlined and an outlined portion. // Code GC collects code memory at this granularity. @@ -107,16 +106,16 @@ impl CodeBlock { const PREFERRED_CODE_PAGE_SIZE: usize = 16 * 1024; /// Make a new CodeBlock - pub fn new(mem_block: Rc<RefCell<VirtualMem>>, outlined: bool, freed_pages: Rc<Option<Vec<usize>>>, keep_comments: bool) -> Self { + pub fn new(mem_block: Rc<VirtualMem>, outlined: bool, freed_pages: Rc<Option<Vec<usize>>>, keep_comments: bool) -> Self { // Pick the code page size - let system_page_size = mem_block.borrow().system_page_size(); + let system_page_size = mem_block.system_page_size(); let page_size = if 0 == Self::PREFERRED_CODE_PAGE_SIZE % system_page_size { Self::PREFERRED_CODE_PAGE_SIZE } else { system_page_size }; - let mem_size = mem_block.borrow().virtual_region_size(); + let mem_size = mem_block.virtual_region_size(); let mut cb = Self { mem_block, mem_size, @@ -145,6 +144,7 @@ impl CodeBlock { /// Move the CodeBlock to the next page. If it's on the furthest page, /// move the other CodeBlock to the next page as well. + #[must_use] pub fn next_page<F: Fn(&mut CodeBlock, CodePtr)>(&mut self, base_ptr: CodePtr, jmp_ptr: F) -> bool { let old_write_ptr = self.get_write_ptr(); self.set_write_ptr(base_ptr); @@ -237,9 +237,9 @@ impl CodeBlock { } // Free the grouped pages at once - let start_ptr = self.mem_block.borrow().start_ptr().add_bytes(page_idx * self.page_size); + let start_ptr = self.mem_block.start_ptr().add_bytes(page_idx * self.page_size); let batch_size = self.page_size * batch_idxs.len(); - self.mem_block.borrow_mut().free_bytes(start_ptr, batch_size as u32); + self.mem_block.free_bytes(start_ptr, batch_size as u32); } } @@ -248,13 +248,13 @@ impl CodeBlock { } pub fn mapped_region_size(&self) -> usize { - self.mem_block.borrow().mapped_region_size() + self.mem_block.mapped_region_size() } /// Size of the region in bytes where writes could be attempted. #[cfg(target_arch = "aarch64")] pub fn virtual_region_size(&self) -> usize { - self.mem_block.borrow().virtual_region_size() + self.mem_block.virtual_region_size() } /// Return the number of code pages that have been mapped by the VirtualMemory. @@ -266,7 +266,7 @@ impl CodeBlock { /// Return the number of code pages that have been reserved by the VirtualMemory. 
pub fn num_virtual_pages(&self) -> usize { - let virtual_region_size = self.mem_block.borrow().virtual_region_size(); + let virtual_region_size = self.mem_block.virtual_region_size(); // CodeBlock's page size != VirtualMem's page size on Linux, // so mapped_region_size % self.page_size may not be 0 ((virtual_region_size - 1) / self.page_size) + 1 @@ -408,7 +408,7 @@ impl CodeBlock { } pub fn write_mem(&self, write_ptr: CodePtr, byte: u8) -> Result<(), WriteError> { - self.mem_block.borrow_mut().write_byte(write_ptr, byte) + self.mem_block.write_byte(write_ptr, byte) } // Set the current write position @@ -422,31 +422,31 @@ impl CodeBlock { // Set the current write position from a pointer pub fn set_write_ptr(&mut self, code_ptr: CodePtr) { - let pos = code_ptr.as_offset() - self.mem_block.borrow().start_ptr().as_offset(); + let pos = code_ptr.as_offset() - self.mem_block.start_ptr().as_offset(); self.set_pos(pos.try_into().unwrap()); } /// Get a (possibly dangling) direct pointer into the executable memory block pub fn get_ptr(&self, offset: usize) -> CodePtr { - self.mem_block.borrow().start_ptr().add_bytes(offset) + self.mem_block.start_ptr().add_bytes(offset) } /// Convert an address range to memory page indexes against a num_pages()-sized array. pub fn addrs_to_pages(&self, start_addr: CodePtr, end_addr: CodePtr) -> impl Iterator<Item = usize> { - let mem_start = self.mem_block.borrow().start_ptr().raw_addr(self); - let mem_end = self.mem_block.borrow().mapped_end_ptr().raw_addr(self); + let mem_start = self.mem_block.start_ptr().raw_addr(self); + let mem_end = self.mem_block.mapped_end_ptr().raw_addr(self); assert!(mem_start <= start_addr.raw_addr(self)); assert!(start_addr.raw_addr(self) <= end_addr.raw_addr(self)); assert!(end_addr.raw_addr(self) <= mem_end); // Ignore empty code ranges if start_addr == end_addr { - return (0..0).into_iter(); + return 0..0; } let start_page = (start_addr.raw_addr(self) - mem_start) / self.page_size; let end_page = (end_addr.raw_addr(self) - mem_start - 1) / self.page_size; - (start_page..end_page + 1).into_iter() + start_page..end_page + 1 } /// Get a (possibly dangling) direct pointer to the current write position @@ -457,7 +457,7 @@ impl CodeBlock { /// Write a single byte at the current position. pub fn write_byte(&mut self, byte: u8) { let write_ptr = self.get_write_ptr(); - if self.has_capacity(1) && self.mem_block.borrow_mut().write_byte(write_ptr, byte).is_ok() { + if self.has_capacity(1) && self.mem_block.write_byte(write_ptr, byte).is_ok() { self.write_pos += 1; } else { self.dropped_bytes = true; @@ -589,8 +589,12 @@ impl CodeBlock { self.label_refs = state.label_refs; } + pub fn mark_all_writeable(&mut self) { + self.mem_block.mark_all_writeable(); + } + pub fn mark_all_executable(&mut self) { - self.mem_block.borrow_mut().mark_all_executable(); + self.mem_block.mark_all_executable(); } /// Code GC. Free code pages that are not on stack and reuse them. @@ -688,7 +692,7 @@ impl CodeBlock { let mem_start: *const u8 = alloc.mem_start(); let virt_mem = VirtualMem::new(alloc, 1, NonNull::new(mem_start as *mut u8).unwrap(), mem_size, 128 * 1024 * 1024); - Self::new(Rc::new(RefCell::new(virt_mem)), false, Rc::new(None), true) + Self::new(Rc::new(virt_mem), false, Rc::new(None), true) } /// Stubbed CodeBlock for testing conditions that can arise due to code GC. Can't execute generated code. 
@@ -706,7 +710,7 @@ impl CodeBlock { let mem_start: *const u8 = alloc.mem_start(); let virt_mem = VirtualMem::new(alloc, 1, NonNull::new(mem_start as *mut u8).unwrap(), mem_size, 128 * 1024 * 1024); - Self::new(Rc::new(RefCell::new(virt_mem)), false, Rc::new(Some(freed_pages)), true) + Self::new(Rc::new(virt_mem), false, Rc::new(Some(freed_pages)), true) } } @@ -714,7 +718,7 @@ impl CodeBlock { impl fmt::LowerHex for CodeBlock { fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { for pos in 0..self.write_pos { - let mem_block = &*self.mem_block.borrow(); + let mem_block = &*self.mem_block; let byte = unsafe { mem_block.start_ptr().raw_ptr(mem_block).add(pos).read() }; fmtr.write_fmt(format_args!("{:02x}", byte))?; } @@ -724,7 +728,7 @@ impl fmt::LowerHex for CodeBlock { impl crate::virtualmem::CodePtrBase for CodeBlock { fn base_ptr(&self) -> std::ptr::NonNull<u8> { - self.mem_block.borrow().base_ptr() + self.mem_block.base_ptr() } } @@ -823,7 +827,7 @@ mod tests assert_eq!(cb.code_size(), 4); // Moving to the next page should not increase code_size - cb.next_page(cb.get_write_ptr(), |_, _| {}); + assert!(cb.next_page(cb.get_write_ptr(), |_, _| {})); assert_eq!(cb.code_size(), 4); // Write 4 bytes in the second page @@ -836,7 +840,7 @@ mod tests cb.write_bytes(&[1, 1, 1, 1]); // Moving from an old page to the next page should not increase code_size - cb.next_page(cb.get_write_ptr(), |_, _| {}); + assert!(cb.next_page(cb.get_write_ptr(), |_, _| {})); cb.set_pos(old_write_pos); assert_eq!(cb.code_size(), 8); } diff --git a/yjit/src/asm/x86_64/mod.rs b/yjit/src/asm/x86_64/mod.rs index fbbfa714d8..0ef5e92117 100644 --- a/yjit/src/asm/x86_64/mod.rs +++ b/yjit/src/asm/x86_64/mod.rs @@ -1027,7 +1027,10 @@ pub fn mov(cb: &mut CodeBlock, dst: X86Opnd, src: X86Opnd) { } let output_num_bits:u32 = if mem.num_bits > 32 { 32 } else { mem.num_bits.into() }; - assert!(imm_num_bits(imm.value) <= (output_num_bits as u8)); + assert!( + mem.num_bits < 64 || imm_num_bits(imm.value) <= (output_num_bits as u8), + "immediate value should be small enough to survive sign extension" + ); cb.write_int(imm.value as u64, output_num_bits); }, // M + UImm @@ -1042,7 +1045,10 @@ pub fn mov(cb: &mut CodeBlock, dst: X86Opnd, src: X86Opnd) { } let output_num_bits = if mem.num_bits > 32 { 32 } else { mem.num_bits.into() }; - assert!(imm_num_bits(uimm.value as i64) <= (output_num_bits as u8)); + assert!( + mem.num_bits < 64 || imm_num_bits(uimm.value as i64) <= (output_num_bits as u8), + "immediate value should be small enough to survive sign extension" + ); cb.write_int(uimm.value, output_num_bits); }, // * + Imm/UImm diff --git a/yjit/src/asm/x86_64/tests.rs b/yjit/src/asm/x86_64/tests.rs index 5ae983270f..eefcbfd52e 100644 --- a/yjit/src/asm/x86_64/tests.rs +++ b/yjit/src/asm/x86_64/tests.rs @@ -193,6 +193,7 @@ fn test_mov() { check_bytes("48c7470801000000", |cb| mov(cb, mem_opnd(64, RDI, 8), imm_opnd(1))); //check_bytes("67c7400411000000", |cb| mov(cb, mem_opnd(32, EAX, 4), imm_opnd(0x34))); // We don't distinguish between EAX and RAX here - that's probably fine? 
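// Note on the relaxed mov assertion above and the new 0x80000001 check that follows
// (descriptive comment, not part of the upstream test): `mov m32, imm32` stores the four
// immediate bytes verbatim, so 0x8000_0001 round-trips exactly. Only `mov m64, imm32`
// sign-extends the immediate to 64 bits, where 0x8000_0001 would become
// 0xFFFF_FFFF_8000_0001. That is why the assertion now only rejects wide immediates
// when the destination memory operand is 64 bits.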
check_bytes("c7400411000000", |cb| mov(cb, mem_opnd(32, RAX, 4), imm_opnd(17))); + check_bytes("c7400401000080", |cb| mov(cb, mem_opnd(32, RAX, 4), uimm_opnd(0x80000001))); check_bytes("41895814", |cb| mov(cb, mem_opnd(32, R8, 20), EBX)); check_bytes("4d8913", |cb| mov(cb, mem_opnd(64, R11, 0), R10)); check_bytes("48c742f8f4ffffff", |cb| mov(cb, mem_opnd(64, RDX, -8), imm_opnd(-12))); diff --git a/yjit/src/backend/arm64/mod.rs b/yjit/src/backend/arm64/mod.rs index b695f8da96..0521e09d0b 100644 --- a/yjit/src/backend/arm64/mod.rs +++ b/yjit/src/backend/arm64/mod.rs @@ -98,7 +98,7 @@ fn emit_jmp_ptr_with_invalidation(cb: &mut CodeBlock, dst_ptr: CodePtr) { #[cfg(not(test))] { let end = cb.get_write_ptr(); - unsafe { rb_yjit_icache_invalidate(start.raw_ptr(cb) as _, end.raw_ptr(cb) as _) }; + unsafe { rb_jit_icache_invalidate(start.raw_ptr(cb) as _, end.raw_ptr(cb) as _) }; } } @@ -878,14 +878,13 @@ impl Assembler } } - /// Emit a push instruction for the given operand by adding to the stack - /// pointer and then storing the given value. + /// Push a value to the stack by subtracting from the stack pointer then storing, + /// leaving an 8-byte gap for alignment. fn emit_push(cb: &mut CodeBlock, opnd: A64Opnd) { str_pre(cb, opnd, A64Opnd::new_mem(64, C_SP_REG, -C_SP_STEP)); } - /// Emit a pop instruction into the given operand by loading the value - /// and then subtracting from the stack pointer. + /// Pop a value from the stack by loading `[sp]` then adding to the stack pointer. fn emit_pop(cb: &mut CodeBlock, opnd: A64Opnd) { ldr_post(cb, opnd, A64Opnd::new_mem(64, C_SP_REG, C_SP_STEP)); } @@ -1155,8 +1154,8 @@ impl Assembler let regs = Assembler::get_caller_save_regs(); // Pop the state/flags register - msr(cb, SystemRegister::NZCV, Self::SCRATCH0); emit_pop(cb, Self::SCRATCH0); + msr(cb, SystemRegister::NZCV, Self::SCRATCH0); for reg in regs.into_iter().rev() { emit_pop(cb, A64Opnd::Reg(reg)); @@ -1341,16 +1340,13 @@ impl Assembler Err(EmitError::RetryOnNextPage) => { // we want to lower jumps to labels to b.cond instructions, which have a 1 MiB // range limit. We can easily exceed the limit in case the jump straddles two pages. - // In this case, we retry with a fresh page. + // In this case, we retry with a fresh page once. cb.set_label_state(starting_label_state); - cb.next_page(start_ptr, emit_jmp_ptr_with_invalidation); - let result = asm.arm64_emit(cb, &mut ocb); - assert_ne!( - Err(EmitError::RetryOnNextPage), - result, - "should not fail when writing to a fresh code page" - ); - result + if cb.next_page(start_ptr, emit_jmp_ptr_with_invalidation) { + asm.arm64_emit(cb, &mut ocb) + } else { + Err(EmitError::OutOfMemory) + } } result => result }; @@ -1364,7 +1360,7 @@ impl Assembler #[cfg(not(test))] cb.without_page_end_reserve(|cb| { for (start, end) in cb.writable_addrs(start_ptr, cb.get_write_ptr()) { - unsafe { rb_yjit_icache_invalidate(start as _, end as _) }; + unsafe { rb_jit_icache_invalidate(start as _, end as _) }; } }); @@ -1422,7 +1418,7 @@ mod tests { fn test_emit_cpop_all() { let (mut asm, mut cb) = setup_asm(); - asm.cpop_all(); + asm.cpop_all(crate::core::RegMapping::default()); asm.compile_with_num_regs(&mut cb, 0); } diff --git a/yjit/src/backend/ir.rs b/yjit/src/backend/ir.rs index eb32dac987..3fb67bc7cc 100644 --- a/yjit/src/backend/ir.rs +++ b/yjit/src/backend/ir.rs @@ -528,13 +528,13 @@ pub enum Insn { impl Insn { /// Create an iterator that will yield a non-mutable reference to each /// operand in turn for this instruction. 
- pub(super) fn opnd_iter(&self) -> InsnOpndIterator { + pub(super) fn opnd_iter(&self) -> InsnOpndIterator<'_> { InsnOpndIterator::new(self) } /// Create an iterator that will yield a mutable reference to each operand /// in turn for this instruction. - pub(super) fn opnd_iter_mut(&mut self) -> InsnOpndMutIterator { + pub(super) fn opnd_iter_mut(&mut self) -> InsnOpndMutIterator<'_> { InsnOpndMutIterator::new(self) } @@ -1086,7 +1086,7 @@ impl Assembler } /// Get the list of registers that can be used for stack temps. - pub fn get_temp_regs2() -> &'static [Reg] { + pub fn get_temp_regs() -> &'static [Reg] { let num_regs = get_option!(num_temp_regs); &TEMP_REGS[0..num_regs] } @@ -1204,7 +1204,7 @@ impl Assembler // Convert Opnd::Stack to Opnd::Reg fn reg_opnd(opnd: &Opnd, reg_idx: usize) -> Opnd { - let regs = Assembler::get_temp_regs2(); + let regs = Assembler::get_temp_regs(); if let Opnd::Stack { num_bits, .. } = *opnd { incr_counter!(temp_reg_opnd); Opnd::Reg(regs[reg_idx]).with_num_bits(num_bits).unwrap() @@ -1317,7 +1317,7 @@ impl Assembler } /// Spill a stack temp from a register to the stack - fn spill_reg(&mut self, opnd: Opnd) { + pub fn spill_reg(&mut self, opnd: Opnd) { assert_ne!(self.ctx.get_reg_mapping().get_reg(opnd.reg_opnd()), None); // Use different RegMappings for dest and src operands @@ -1602,7 +1602,7 @@ impl Assembler if c_args.len() > 0 { // Resolve C argument dependencies let c_args_len = c_args.len() as isize; - let moves = Self::reorder_reg_moves(&c_args.drain(..).into_iter().collect()); + let moves = Self::reorder_reg_moves(&std::mem::take(&mut c_args)); shift_live_ranges(&mut shifted_live_ranges, asm.insns.len(), moves.len() as isize - c_args_len); // Push batched C arguments @@ -1824,12 +1824,12 @@ impl Assembler { out } - pub fn cpop_all(&mut self) { + pub fn cpop_all(&mut self, reg_mapping: RegMapping) { self.push_insn(Insn::CPopAll); // Re-enable ccall's RegMappings assertion disabled by cpush_all. // cpush_all + cpop_all preserve all stack temp registers, so it's safe. - self.set_reg_mapping(self.ctx.get_reg_mapping()); + self.set_reg_mapping(reg_mapping); } pub fn cpop_into(&mut self, opnd: Opnd) { @@ -1840,14 +1840,16 @@ impl Assembler { self.push_insn(Insn::CPush(opnd)); } - pub fn cpush_all(&mut self) { + pub fn cpush_all(&mut self) -> RegMapping { self.push_insn(Insn::CPushAll); // Mark all temps as not being in registers. // Temps will be marked back as being in registers by cpop_all. // We assume that cpush_all + cpop_all are used for C functions in utils.rs // that don't require spill_regs for GC. 
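// Illustrative caller pattern for the new cpush_all/cpop_all signatures
// (sketch only, not code from this diff; `func_ptr` is a placeholder):
//
//     let saved_regs = asm.cpush_all();   // saves C registers and clears the reg mapping
//     asm.ccall(func_ptr, vec![]);
//     asm.cpop_all(saved_regs);           // restores registers and the saved mapping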
+ let mapping = self.ctx.get_reg_mapping(); self.set_reg_mapping(RegMapping::default()); + mapping } pub fn cret(&mut self, opnd: Opnd) { diff --git a/yjit/src/backend/tests.rs b/yjit/src/backend/tests.rs index ac2f35b3d9..bfeea5163a 100644 --- a/yjit/src/backend/tests.rs +++ b/yjit/src/backend/tests.rs @@ -232,9 +232,9 @@ fn test_jcc_ptr() let (mut asm, mut cb) = setup_asm(); let side_exit = Target::CodePtr(cb.get_write_ptr().add_bytes(4)); - let not_mask = asm.not(Opnd::mem(32, EC, RUBY_OFFSET_EC_INTERRUPT_MASK)); + let not_mask = asm.not(Opnd::mem(32, EC, RUBY_OFFSET_EC_INTERRUPT_MASK as i32)); asm.test( - Opnd::mem(32, EC, RUBY_OFFSET_EC_INTERRUPT_FLAG), + Opnd::mem(32, EC, RUBY_OFFSET_EC_INTERRUPT_FLAG as i32), not_mask, ); asm.jnz(side_exit); diff --git a/yjit/src/backend/x86_64/mod.rs b/yjit/src/backend/x86_64/mod.rs index c0d42e79e6..ef435bca7e 100644 --- a/yjit/src/backend/x86_64/mod.rs +++ b/yjit/src/backend/x86_64/mod.rs @@ -315,19 +315,24 @@ impl Assembler let opnd1 = asm.load(*src); asm.mov(*dest, opnd1); }, - (Opnd::Mem(_), Opnd::UImm(value)) => { - // 32-bit values will be sign-extended - if imm_num_bits(*value as i64) > 32 { + (Opnd::Mem(Mem { num_bits, .. }), Opnd::UImm(value)) => { + // For 64 bit destinations, 32-bit values will be sign-extended + if *num_bits == 64 && imm_num_bits(*value as i64) > 32 { let opnd1 = asm.load(*src); asm.mov(*dest, opnd1); } else { asm.mov(*dest, *src); } }, - (Opnd::Mem(_), Opnd::Imm(value)) => { - if imm_num_bits(*value) > 32 { + (Opnd::Mem(Mem { num_bits, .. }), Opnd::Imm(value)) => { + // For 64 bit destinations, 32-bit values will be sign-extended + if *num_bits == 64 && imm_num_bits(*value) > 32 { let opnd1 = asm.load(*src); asm.mov(*dest, opnd1); + } else if uimm_num_bits(*value as u64) <= *num_bits { + // If the bit string is short enough for the destination, use the unsigned representation. + // Note that 64-bit and negative values are ruled out. + asm.mov(*dest, Opnd::UImm(*value as u64)); } else { asm.mov(*dest, *src); } @@ -1317,4 +1322,19 @@ mod tests { 0x13: mov qword ptr [rbx], rax "}); } + + #[test] + fn test_mov_m32_imm32() { + let (mut asm, mut cb) = setup_asm(); + + let shape_opnd = Opnd::mem(32, C_RET_OPND, 0); + asm.mov(shape_opnd, Opnd::UImm(0x8000_0001)); + asm.mov(shape_opnd, Opnd::Imm(0x8000_0001)); + + asm.compile_with_num_regs(&mut cb, 0); + assert_disasm!(cb, "c70001000080c70001000080", {" + 0x0: mov dword ptr [rax], 0x80000001 + 0x6: mov dword ptr [rax], 0x80000001 + "}); + } } diff --git a/yjit/src/codegen.rs b/yjit/src/codegen.rs index ded89457c6..0fbca85716 100644 --- a/yjit/src/codegen.rs +++ b/yjit/src/codegen.rs @@ -195,6 +195,45 @@ impl<'a> JITState<'a> { self.outlined_code_block } + /// Leave a code stub to re-enter the compiler at runtime when the compiling program point is + /// reached. Should always be used in tail position like `return jit.defer_compilation(asm);`. + #[must_use] + fn defer_compilation(&mut self, asm: &mut Assembler) -> Option<CodegenStatus> { + if crate::core::defer_compilation(self, asm).is_err() { + // If we can't leave a stub, the block isn't usable and we have to bail. 
+ self.block_abandoned = true; + } + Some(EndBlock) + } + + /// Generate a branch with either end possibly stubbed out + fn gen_branch( + &mut self, + asm: &mut Assembler, + target0: BlockId, + ctx0: &Context, + target1: Option<BlockId>, + ctx1: Option<&Context>, + gen_fn: BranchGenFn, + ) { + if crate::core::gen_branch(self, asm, target0, ctx0, target1, ctx1, gen_fn).is_none() { + // If we can't meet the request for a branch, the code is + // essentially corrupt and we have to discard the block. + self.block_abandoned = true; + } + } + + /// Wrapper for [self::gen_outlined_exit] with error handling. + fn gen_outlined_exit(&mut self, exit_pc: *mut VALUE, ctx: &Context) -> Option<CodePtr> { + let result = gen_outlined_exit(exit_pc, self.num_locals(), ctx, self.get_ocb()); + if result.is_none() { + // When we can't have the exits, the code is incomplete and we have to bail. + self.block_abandoned = true; + } + + result + } + /// Return true if the current ISEQ could escape an environment. /// /// As of vm_push_frame(), EP is always equal to BP. However, after pushing @@ -399,7 +438,7 @@ impl<'a> JITState<'a> { fn flush_perf_symbols(&self, cb: &CodeBlock) { assert_eq!(0, self.perf_stack.len()); let path = format!("/tmp/perf-{}.map", std::process::id()); - let mut f = std::fs::File::options().create(true).append(true).open(path).unwrap(); + let mut f = std::io::BufWriter::new(std::fs::File::options().create(true).append(true).open(path).unwrap()); for sym in self.perf_map.borrow().iter() { if let (start, Some(end), name) = sym { // In case the code straddles two pages, part of it belongs to the symbol. @@ -782,11 +821,11 @@ fn gen_stub_exit(ocb: &mut OutlinedCb) -> Option<CodePtr> { /// Generate an exit to return to the interpreter fn gen_exit(exit_pc: *mut VALUE, asm: &mut Assembler) { - #[cfg(all(feature = "disasm", not(test)))] - { + #[cfg(not(test))] + asm_comment!(asm, "exit to interpreter on {}", { let opcode = unsafe { rb_vm_insn_addr2opcode((*exit_pc).as_ptr()) }; - asm_comment!(asm, "exit to interpreter on {}", insn_name(opcode as usize)); - } + insn_name(opcode as usize) + }); if asm.ctx.is_return_landing() { asm.mov(SP, Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP)); @@ -851,6 +890,10 @@ fn gen_exit(exit_pc: *mut VALUE, asm: &mut Assembler) { /// moment, so there is one unique side exit for each context. Note that /// it's incorrect to jump to the side exit after any ctx stack push operations /// since they change the logic required for reconstructing interpreter state. +/// +/// If you're in [the codegen module][self], use [JITState::gen_outlined_exit] +/// instead of calling this directly. 
+#[must_use] pub fn gen_outlined_exit(exit_pc: *mut VALUE, num_locals: u32, ctx: &Context, ocb: &mut OutlinedCb) -> Option<CodePtr> { let mut cb = ocb.unwrap(); let mut asm = Assembler::new(num_locals); @@ -915,7 +958,7 @@ pub fn jit_ensure_block_entry_exit(jit: &mut JITState, asm: &mut Assembler) -> O jit.block_entry_exit = Some(entry_exit?); } else { let block_entry_pc = unsafe { rb_iseq_pc_at_idx(jit.iseq, jit.starting_insn_idx.into()) }; - jit.block_entry_exit = Some(gen_outlined_exit(block_entry_pc, jit.num_locals(), block_starting_context, jit.get_ocb())?); + jit.block_entry_exit = Some(jit.gen_outlined_exit(block_entry_pc, block_starting_context)?); } Some(()) @@ -1018,14 +1061,13 @@ fn gen_leave_exception(ocb: &mut OutlinedCb) -> Option<CodePtr> { pub fn gen_entry_chain_guard( asm: &mut Assembler, ocb: &mut OutlinedCb, - iseq: IseqPtr, - insn_idx: u16, + blockid: BlockId, ) -> Option<PendingEntryRef> { let entry = new_pending_entry(); let stub_addr = gen_entry_stub(entry.uninit_entry.as_ptr() as usize, ocb)?; let pc_opnd = Opnd::mem(64, CFP, RUBY_OFFSET_CFP_PC); - let expected_pc = unsafe { rb_iseq_pc_at_idx(iseq, insn_idx.into()) }; + let expected_pc = unsafe { rb_iseq_pc_at_idx(blockid.iseq, blockid.idx.into()) }; let expected_pc_opnd = Opnd::const_ptr(expected_pc as *const u8); asm_comment!(asm, "guard expected PC"); @@ -1044,18 +1086,15 @@ pub fn gen_entry_chain_guard( pub fn gen_entry_prologue( cb: &mut CodeBlock, ocb: &mut OutlinedCb, - iseq: IseqPtr, - insn_idx: u16, + blockid: BlockId, + stack_size: u8, jit_exception: bool, -) -> Option<CodePtr> { +) -> Option<(CodePtr, RegMapping)> { + let iseq = blockid.iseq; let code_ptr = cb.get_write_ptr(); let mut asm = Assembler::new(unsafe { get_iseq_body_local_table_size(iseq) }); - if get_option_ref!(dump_disasm).is_some() { - asm_comment!(asm, "YJIT entry point: {}", iseq_get_location(iseq, 0)); - } else { - asm_comment!(asm, "YJIT entry"); - } + asm_comment!(asm, "YJIT entry point: {}", iseq_get_location(iseq, 0)); asm.frame_setup(); @@ -1102,10 +1141,11 @@ pub fn gen_entry_prologue( // If they don't match, then we'll jump to an entry stub and generate // another PC check and entry there. let pending_entry = if unsafe { get_iseq_flags_has_opt(iseq) } || jit_exception { - Some(gen_entry_chain_guard(&mut asm, ocb, iseq, insn_idx)?) + Some(gen_entry_chain_guard(&mut asm, ocb, blockid)?) } else { None }; + let reg_mapping = gen_entry_reg_mapping(&mut asm, blockid, stack_size); asm.compile(cb, Some(ocb))?; @@ -1123,8 +1163,37 @@ pub fn gen_entry_prologue( .ok().expect("PendingEntry should be unique"); iseq_payload.entries.push(pending_entry.into_entry()); } - Some(code_ptr) + Some((code_ptr, reg_mapping)) + } +} + +/// Generate code to load registers for a JIT entry. When the entry block is compiled for +/// the first time, it loads no register. When it has been already compiled as a callee +/// block, it loads some registers to reuse the block. +pub fn gen_entry_reg_mapping(asm: &mut Assembler, blockid: BlockId, stack_size: u8) -> RegMapping { + // Find an existing callee block. If it's not found or uses no register, skip loading registers. + let mut ctx = Context::default(); + ctx.set_stack_size(stack_size); + let reg_mapping = find_most_compatible_reg_mapping(blockid, &ctx).unwrap_or(RegMapping::default()); + if reg_mapping == RegMapping::default() { + return reg_mapping; + } + + // If found, load the same registers to reuse the block. 
+ asm_comment!(asm, "reuse maps: {:?}", reg_mapping); + let local_table_size: u32 = unsafe { get_iseq_body_local_table_size(blockid.iseq) }.try_into().unwrap(); + for ®_opnd in reg_mapping.get_reg_opnds().iter() { + match reg_opnd { + RegOpnd::Local(local_idx) => { + let loaded_reg = TEMP_REGS[reg_mapping.get_reg(reg_opnd).unwrap()]; + let loaded_temp = asm.local_opnd(local_table_size - local_idx as u32 + VM_ENV_DATA_SIZE - 1); + asm.load_into(Opnd::Reg(loaded_reg), loaded_temp); + } + RegOpnd::Stack(_) => unreachable!("find_most_compatible_reg_mapping should not leave {:?}", reg_opnd), + } } + + reg_mapping } // Generate code to check for interrupts and take a side-exit. @@ -1139,7 +1208,7 @@ fn gen_check_ints( // Not checking interrupt_mask since it's zero outside finalize_deferred_heap_pages, // signal_exec, or rb_postponed_job_flush. - let interrupt_flag = asm.load(Opnd::mem(32, EC, RUBY_OFFSET_EC_INTERRUPT_FLAG)); + let interrupt_flag = asm.load(Opnd::mem(32, EC, RUBY_OFFSET_EC_INTERRUPT_FLAG as i32)); asm.test(interrupt_flag, interrupt_flag); asm.jnz(Target::side_exit(counter)); @@ -1173,7 +1242,7 @@ fn end_block_with_jump( if jit.record_boundary_patch_point { jit.record_boundary_patch_point = false; let exit_pc = unsafe { rb_iseq_pc_at_idx(jit.iseq, continuation_insn_idx.into())}; - let exit_pos = gen_outlined_exit(exit_pc, jit.num_locals(), &reset_depth, jit.get_ocb()); + let exit_pos = jit.gen_outlined_exit(exit_pc, &reset_depth); record_global_inval_patch(asm, exit_pos?); } @@ -1223,7 +1292,6 @@ pub fn gen_single_block( let mut asm = Assembler::new(jit.num_locals()); asm.ctx = ctx; - #[cfg(feature = "disasm")] if get_option_ref!(dump_disasm).is_some() { let blockid_idx = blockid.idx; let chain_depth = if asm.ctx.get_chain_depth() > 0 { format!("(chain_depth: {})", asm.ctx.get_chain_depth()) } else { "".to_string() }; @@ -1282,7 +1350,7 @@ pub fn gen_single_block( // If previous instruction requested to record the boundary if jit.record_boundary_patch_point { // Generate an exit to this instruction and record it - let exit_pos = gen_outlined_exit(jit.pc, jit.num_locals(), &asm.ctx, jit.get_ocb()).ok_or(())?; + let exit_pos = jit.gen_outlined_exit(jit.pc, &asm.ctx).ok_or(())?; record_global_inval_patch(&mut asm, exit_pos); jit.record_boundary_patch_point = false; } @@ -1446,6 +1514,18 @@ fn gen_dupn( Some(KeepCompiling) } +// Reverse top X stack entries +fn gen_opt_reverse( + jit: &mut JITState, + asm: &mut Assembler, +) -> Option<CodegenStatus> { + let count = jit.get_arg(0).as_i32(); + for n in 0..(count/2) { + stack_swap(asm, n, count - 1 - n); + } + Some(KeepCompiling) +} + // Swap top 2 stack entries fn gen_swap( _jit: &mut JITState, @@ -1538,8 +1618,7 @@ fn fuse_putobject_opt_ltlt( return None; } if !jit.at_compile_target() { - defer_compilation(jit, asm); - return Some(EndBlock); + return jit.defer_compilation(asm); } let lhs = jit.peek_at_stack(&asm.ctx, 0); @@ -1661,8 +1740,7 @@ fn gen_opt_plus( let two_fixnums = match asm.ctx.two_fixnums_on_stack(jit) { Some(two_fixnums) => two_fixnums, None => { - defer_compilation(jit, asm); - return Some(EndBlock); + return jit.defer_compilation(asm); } }; @@ -1802,8 +1880,7 @@ fn gen_splatkw( ) -> Option<CodegenStatus> { // Defer compilation so we can specialize on a runtime hash operand if !jit.at_compile_target() { - defer_compilation(jit, asm); - return Some(EndBlock); + return jit.defer_compilation(asm); } let comptime_hash = jit.peek_at_stack(&asm.ctx, 1); @@ -2176,13 +2253,13 @@ fn gen_expandarray( // Defer compilation so we can 
specialize on a runtime `self` if !jit.at_compile_target() { - defer_compilation(jit, asm); - return Some(EndBlock); + return jit.defer_compilation(asm); } let comptime_recv = jit.peek_at_stack(&asm.ctx, 0); - // If the comptime receiver is not an array + // If the comptime receiver is not an array, speculate for when the `rb_check_array_type()` + // conversion returns nil and without side-effects (e.g. arbitrary method calls). if !unsafe { RB_TYPE_P(comptime_recv, RUBY_T_ARRAY) } { // at compile time, ensure to_ary is not defined let target_cme = unsafe { rb_callable_method_entry_or_negative(comptime_recv.class_of(), ID!(to_ary)) }; @@ -2194,13 +2271,19 @@ fn gen_expandarray( return None; } + // Bail when method_missing is defined to avoid generating code to call it. + // Also, for simplicity, bail when BasicObject#method_missing has been removed. + if !assume_method_basic_definition(jit, asm, comptime_recv.class_of(), ID!(method_missing)) { + gen_counter_incr(jit, asm, Counter::expandarray_method_missing); + return None; + } + // invalidate compile block if to_ary is later defined jit.assume_method_lookup_stable(asm, target_cme); jit_guard_known_klass( jit, asm, - comptime_recv.class_of(), array_opnd, array_opnd.into(), comptime_recv, @@ -2230,7 +2313,7 @@ fn gen_expandarray( } // Get the compile-time array length - let comptime_len = unsafe { rb_yjit_array_len(comptime_recv) as u32 }; + let comptime_len = unsafe { rb_jit_array_len(comptime_recv) as u32 }; // Move the array from the stack and check that it's an array. guard_object_is_array( @@ -2364,6 +2447,11 @@ fn gen_getlocal_generic( ep_offset: u32, level: u32, ) -> Option<CodegenStatus> { + // Split the block if we need to invalidate this instruction when EP escapes + if level == 0 && !jit.escapes_ep() && !jit.at_compile_target() { + return jit.defer_compilation(asm); + } + let local_opnd = if level == 0 && jit.assume_no_ep_escape(asm) { // Load the local using SP register asm.local_opnd(ep_offset) @@ -2430,6 +2518,7 @@ fn gen_setlocal_generic( ep_offset: u32, level: u32, ) -> Option<CodegenStatus> { + // Post condition: The type of of the set local is updated in the Context. let value_type = asm.ctx.get_opnd_type(StackOpnd(0)); // Fallback because of write barrier @@ -2451,9 +2540,19 @@ fn gen_setlocal_generic( ); asm.stack_pop(1); + // Set local type in the context + if level == 0 { + let local_idx = ep_offset_to_local_idx(jit.get_iseq(), ep_offset).as_usize(); + asm.ctx.set_local_type(local_idx, value_type); + } return Some(KeepCompiling); } + // Split the block if we need to invalidate this instruction when EP escapes + if level == 0 && !jit.escapes_ep() && !jit.at_compile_target() { + return jit.defer_compilation(asm); + } + let (flags_opnd, local_opnd) = if level == 0 && jit.assume_no_ep_escape(asm) { // Load flags and the local using SP register let flags_opnd = asm.ctx.ep_opnd(VM_ENV_DATA_INDEX_FLAGS as i32); @@ -2498,6 +2597,7 @@ fn gen_setlocal_generic( ); } + // Set local type in the context if level == 0 { let local_idx = ep_offset_to_local_idx(jit.get_iseq(), ep_offset).as_usize(); asm.ctx.set_local_type(local_idx, value_type); @@ -2662,7 +2762,7 @@ fn gen_checkkeyword( ) -> Option<CodegenStatus> { // When a keyword is unspecified past index 32, a hash will be used // instead. This can only happen in iseqs taking more than 32 keywords. 
- if unsafe { (*get_iseq_body_param_keyword(jit.iseq)).num >= 32 } { + if unsafe { (*get_iseq_body_param_keyword(jit.iseq)).num >= VM_KW_SPECIFIED_BITS_MAX.try_into().unwrap() } { return None; } @@ -2718,10 +2818,7 @@ fn jit_chain_guard( idx: jit.insn_idx, }; - // Bail if we can't generate the branch - if gen_branch(jit, asm, bid, &deeper, None, None, target0_gen_fn).is_none() { - jit.block_abandoned = true; - } + jit.gen_branch(asm, bid, &deeper, None, None, target0_gen_fn); } else { target0_gen_fn.call(asm, Target::side_exit(counter), None); } @@ -2760,24 +2857,12 @@ fn gen_get_ivar( recv: Opnd, recv_opnd: YARVOpnd, ) -> Option<CodegenStatus> { - let comptime_val_klass = comptime_receiver.class_of(); - // If recv isn't already a register, load it. let recv = match recv { Opnd::InsnOut { .. } => recv, _ => asm.load(recv), }; - // Check if the comptime class uses a custom allocator - let custom_allocator = unsafe { rb_get_alloc_func(comptime_val_klass) }; - let uses_custom_allocator = match custom_allocator { - Some(alloc_fun) => { - let allocate_instance = rb_class_allocate_instance as *const u8; - alloc_fun as *const u8 != allocate_instance - } - None => false, - }; - // Check if the comptime receiver is a T_OBJECT let receiver_t_object = unsafe { RB_TYPE_P(comptime_receiver, RUBY_T_OBJECT) }; // Use a general C call at the last chain to avoid exits on megamorphic shapes @@ -2786,12 +2871,9 @@ fn gen_get_ivar( gen_counter_incr(jit, asm, Counter::num_getivar_megamorphic); } - // If the class uses the default allocator, instances should all be T_OBJECT - // NOTE: This assumes nobody changes the allocator of the class after allocation. - // Eventually, we can encode whether an object is T_OBJECT or not - // inside object shapes. + // NOTE: This assumes T_OBJECT can't ever have the same shape_id as any other type. // too-complex shapes can't use index access, so we use rb_ivar_get for them too. - if !receiver_t_object || uses_custom_allocator || comptime_receiver.shape_too_complex() || megamorphic { + if !comptime_receiver.heap_object_p() || comptime_receiver.shape_too_complex() || megamorphic { // General case. Call rb_ivar_get(). 
// VALUE rb_ivar_get(VALUE obj, ID id) asm_comment!(asm, "call rb_ivar_get()"); @@ -2816,9 +2898,8 @@ fn gen_get_ivar( let ivar_index = unsafe { let shape_id = comptime_receiver.shape_id_of(); - let shape = rb_shape_get_shape_by_id(shape_id); - let mut ivar_index: u32 = 0; - if rb_shape_get_iv_index(shape, ivar_name, &mut ivar_index) { + let mut ivar_index: u16 = 0; + if rb_shape_get_iv_index(shape_id, ivar_name, &mut ivar_index) { Some(ivar_index as usize) } else { None @@ -2828,10 +2909,7 @@ fn gen_get_ivar( // Guard heap object (recv_opnd must be used before stack_pop) guard_object_is_heap(asm, recv, recv_opnd, Counter::getivar_not_heap); - // Compile time self is embedded and the ivar index lands within the object - let embed_test_result = unsafe { FL_TEST_RAW(comptime_receiver, VALUE(ROBJECT_EMBED.as_usize())) != VALUE(0) }; - - let expected_shape = unsafe { rb_shape_get_shape_id(comptime_receiver) }; + let expected_shape = unsafe { rb_obj_shape_id(comptime_receiver) }; let shape_id_offset = unsafe { rb_shape_id_offset() }; let shape_opnd = Opnd::mem(SHAPE_ID_NUM_BITS as u8, recv, shape_id_offset); @@ -2859,28 +2937,37 @@ fn gen_get_ivar( asm.mov(out_opnd, Qnil.into()); } Some(ivar_index) => { - if embed_test_result { - // See ROBJECT_IVPTR() from include/ruby/internal/core/robject.h - - // Load the variable - let offs = ROBJECT_OFFSET_AS_ARY as i32 + (ivar_index * SIZEOF_VALUE) as i32; - let ivar_opnd = Opnd::mem(64, recv, offs); - - // Push the ivar on the stack - let out_opnd = asm.stack_push(Type::Unknown); - asm.mov(out_opnd, ivar_opnd); + let ivar_opnd = if receiver_t_object { + if comptime_receiver.embedded_p() { + // See ROBJECT_FIELDS() from include/ruby/internal/core/robject.h + + // Load the variable + let offs = ROBJECT_OFFSET_AS_ARY as i32 + (ivar_index * SIZEOF_VALUE) as i32; + Opnd::mem(64, recv, offs) + } else { + // Compile time value is *not* embedded. + + // Get a pointer to the extended table + let tbl_opnd = asm.load(Opnd::mem(64, recv, ROBJECT_OFFSET_AS_HEAP_FIELDS as i32)); + + // Read the ivar from the extended table + Opnd::mem(64, tbl_opnd, (SIZEOF_VALUE * ivar_index) as i32) + } } else { - // Compile time value is *not* embedded. - - // Get a pointer to the extended table - let tbl_opnd = asm.load(Opnd::mem(64, recv, ROBJECT_OFFSET_AS_HEAP_IVPTR as i32)); + asm_comment!(asm, "call rb_ivar_get_at()"); - // Read the ivar from the extended table - let ivar_opnd = Opnd::mem(64, tbl_opnd, (SIZEOF_VALUE * ivar_index) as i32); + if assume_single_ractor_mode(jit, asm) { + asm.ccall(rb_ivar_get_at_no_ractor_check as *const u8, vec![recv, Opnd::UImm((ivar_index as u32).into())]) + } else { + // The function could raise RactorIsolationError. + jit_prepare_non_leaf_call(jit, asm); + asm.ccall(rb_ivar_get_at as *const u8, vec![recv, Opnd::UImm((ivar_index as u32).into()), Opnd::UImm(ivar_name)]) + } + }; - let out_opnd = asm.stack_push(Type::Unknown); - asm.mov(out_opnd, ivar_opnd); - } + // Push the ivar on the stack + let out_opnd = asm.stack_push(Type::Unknown); + asm.mov(out_opnd, ivar_opnd); } } @@ -2895,8 +2982,7 @@ fn gen_getinstancevariable( ) -> Option<CodegenStatus> { // Defer compilation so we can specialize on a runtime `self` if !jit.at_compile_target() { - defer_compilation(jit, asm); - return Some(EndBlock); + return jit.defer_compilation(asm); } let ivar_name = jit.get_arg(0).as_u64(); @@ -2943,7 +3029,7 @@ fn gen_write_iv( // Compile time value is *not* embedded. 
// Get a pointer to the extended table - let tbl_opnd = asm.load(Opnd::mem(64, recv, ROBJECT_OFFSET_AS_HEAP_IVPTR as i32)); + let tbl_opnd = asm.load(Opnd::mem(64, recv, ROBJECT_OFFSET_AS_HEAP_FIELDS as i32)); // Write the ivar in to the extended table let ivar_opnd = Opnd::mem(64, tbl_opnd, (SIZEOF_VALUE * ivar_index) as i32); @@ -2959,8 +3045,7 @@ fn gen_setinstancevariable( ) -> Option<CodegenStatus> { // Defer compilation so we can specialize on a runtime `self` if !jit.at_compile_target() { - defer_compilation(jit, asm); - return Some(EndBlock); + return jit.defer_compilation(asm); } let ivar_name = jit.get_arg(0).as_u64(); @@ -2988,8 +3073,6 @@ fn gen_set_ivar( recv_opnd: YARVOpnd, ic: Option<*const iseq_inline_iv_cache_entry>, ) -> Option<CodegenStatus> { - let comptime_val_klass = comptime_receiver.class_of(); - // If the comptime receiver is frozen, writing an IV will raise an exception // and we don't want to JIT code to deal with that situation. if comptime_receiver.is_frozen() { @@ -2999,16 +3082,6 @@ fn gen_set_ivar( let stack_type = asm.ctx.get_opnd_type(StackOpnd(0)); - // Check if the comptime class uses a custom allocator - let custom_allocator = unsafe { rb_get_alloc_func(comptime_val_klass) }; - let uses_custom_allocator = match custom_allocator { - Some(alloc_fun) => { - let allocate_instance = rb_class_allocate_instance as *const u8; - alloc_fun as *const u8 != allocate_instance - } - None => false, - }; - // Check if the comptime receiver is a T_OBJECT let receiver_t_object = unsafe { RB_TYPE_P(comptime_receiver, RUBY_T_OBJECT) }; // Use a general C call at the last chain to avoid exits on megamorphic shapes @@ -3019,11 +3092,10 @@ fn gen_set_ivar( // Get the iv index let shape_too_complex = comptime_receiver.shape_too_complex(); - let ivar_index = if !shape_too_complex { + let ivar_index = if !comptime_receiver.special_const_p() && !shape_too_complex { let shape_id = comptime_receiver.shape_id_of(); - let shape = unsafe { rb_shape_get_shape_by_id(shape_id) }; - let mut ivar_index: u32 = 0; - if unsafe { rb_shape_get_iv_index(shape, ivar_name, &mut ivar_index) } { + let mut ivar_index: u16 = 0; + if unsafe { rb_shape_get_iv_index(shape_id, ivar_name, &mut ivar_index) } { Some(ivar_index as usize) } else { None @@ -3033,27 +3105,31 @@ fn gen_set_ivar( }; // The current shape doesn't contain this iv, we need to transition to another shape. + let mut new_shape_too_complex = false; let new_shape = if !shape_too_complex && receiver_t_object && ivar_index.is_none() { - let current_shape = comptime_receiver.shape_of(); - let next_shape = unsafe { rb_shape_get_next_no_warnings(current_shape, comptime_receiver, ivar_name) }; - let next_shape_id = unsafe { rb_shape_id(next_shape) }; + let current_shape_id = comptime_receiver.shape_id_of(); + // We don't need to check about imemo_fields here because we're definitely looking at a T_OBJECT. + let klass = unsafe { rb_obj_class(comptime_receiver) }; + let next_shape_id = unsafe { rb_shape_transition_add_ivar_no_warnings(klass, current_shape_id, ivar_name) }; // If the VM ran out of shapes, or this class generated too many leaf, // it may be de-optimized into OBJ_TOO_COMPLEX_SHAPE (hash-table). 
- if next_shape_id == OBJ_TOO_COMPLEX_SHAPE_ID { + new_shape_too_complex = unsafe { rb_jit_shape_too_complex_p(next_shape_id) }; + if new_shape_too_complex { Some((next_shape_id, None, 0_usize)) } else { - let current_capacity = unsafe { (*current_shape).capacity }; + let current_capacity = unsafe { rb_yjit_shape_capacity(current_shape_id) }; + let next_capacity = unsafe { rb_yjit_shape_capacity(next_shape_id) }; // If the new shape has a different capacity, or is TOO_COMPLEX, we'll have to // reallocate it. - let needs_extension = unsafe { (*current_shape).capacity != (*next_shape).capacity }; + let needs_extension = next_capacity != current_capacity; // We can write to the object, but we need to transition the shape - let ivar_index = unsafe { (*current_shape).next_iv_index } as usize; + let ivar_index = unsafe { rb_yjit_shape_index(next_shape_id) } as usize; let needs_extension = if needs_extension { - Some((current_capacity, unsafe { (*next_shape).capacity })) + Some((current_capacity, next_capacity)) } else { None }; @@ -3062,12 +3138,10 @@ fn gen_set_ivar( } else { None }; - let new_shape_too_complex = matches!(new_shape, Some((OBJ_TOO_COMPLEX_SHAPE_ID, _, _))); - // If the receiver isn't a T_OBJECT, or uses a custom allocator, - // then just write out the IV write as a function call. + // If the receiver isn't a T_OBJECT, then just write out the IV write as a function call. // too-complex shapes can't use index access, so we use rb_ivar_get for them too. - if !receiver_t_object || uses_custom_allocator || shape_too_complex || new_shape_too_complex || megamorphic { + if !receiver_t_object || shape_too_complex || new_shape_too_complex || megamorphic { // The function could raise FrozenError. // Note that this modifies REG_SP, which is why we do it first jit_prepare_non_leaf_call(jit, asm); @@ -3091,7 +3165,7 @@ fn gen_set_ivar( asm.ccall( rb_vm_setinstancevariable as *const u8, vec![ - Opnd::const_ptr(jit.iseq as *const u8), + VALUE(jit.iseq as usize).into(), Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF), ivar_name.into(), val_opnd, @@ -3110,7 +3184,7 @@ fn gen_set_ivar( // Upgrade type guard_object_is_heap(asm, recv, recv_opnd, Counter::setivar_not_heap); - let expected_shape = unsafe { rb_shape_get_shape_id(comptime_receiver) }; + let expected_shape = unsafe { rb_obj_shape_id(comptime_receiver) }; let shape_id_offset = unsafe { rb_shape_id_offset() }; let shape_opnd = Opnd::mem(SHAPE_ID_NUM_BITS as u8, recv, shape_id_offset); @@ -3270,8 +3344,7 @@ fn gen_definedivar( ) -> Option<CodegenStatus> { // Defer compilation so we can specialize base on a runtime receiver if !jit.at_compile_target() { - defer_compilation(jit, asm); - return Some(EndBlock); + return jit.defer_compilation(asm); } let ivar_name = jit.get_arg(0).as_u64(); @@ -3285,7 +3358,7 @@ fn gen_definedivar( // Specialize base on compile time values let comptime_receiver = jit.peek_at_self(); - if comptime_receiver.shape_too_complex() || asm.ctx.get_chain_depth() >= GET_IVAR_MAX_DEPTH { + if comptime_receiver.special_const_p() || comptime_receiver.shape_too_complex() || asm.ctx.get_chain_depth() >= GET_IVAR_MAX_DEPTH { // Fall back to calling rb_ivar_defined // Save the PC and SP because the callee may allocate @@ -3311,9 +3384,8 @@ fn gen_definedivar( let shape_id = comptime_receiver.shape_id_of(); let ivar_exists = unsafe { - let shape = rb_shape_get_shape_by_id(shape_id); - let mut ivar_index: u32 = 0; - rb_shape_get_iv_index(shape, ivar_name, &mut ivar_index) + let mut ivar_index: u16 = 0; + rb_shape_get_iv_index(shape_id, 
ivar_name, &mut ivar_index) }; // Guard heap object (recv_opnd must be used before stack_pop) @@ -3336,9 +3408,7 @@ fn gen_definedivar( jit_putobject(asm, result); // Jump to next instruction. This allows guard chains to share the same successor. - jump_to_next_insn(jit, asm); - - return Some(EndBlock); + return jump_to_next_insn(jit, asm); } fn gen_checktype( @@ -3500,8 +3570,7 @@ fn gen_fixnum_cmp( Some(two_fixnums) => two_fixnums, None => { // Defer compilation so we can specialize based on a runtime receiver - defer_compilation(jit, asm); - return Some(EndBlock); + return jit.defer_compilation(asm); } }; @@ -3614,7 +3683,6 @@ fn gen_equality_specialized( jit_guard_known_klass( jit, asm, - unsafe { rb_cString }, a_opnd, a_opnd.into(), comptime_a, @@ -3640,7 +3708,6 @@ fn gen_equality_specialized( jit_guard_known_klass( jit, asm, - unsafe { rb_cString }, b_opnd, b_opnd.into(), comptime_b, @@ -3680,14 +3747,12 @@ fn gen_opt_eq( Some(specialized) => specialized, None => { // Defer compilation so we can specialize base on a runtime receiver - defer_compilation(jit, asm); - return Some(EndBlock); + return jit.defer_compilation(asm); } }; if specialized { - jump_to_next_insn(jit, asm); - Some(EndBlock) + jump_to_next_insn(jit, asm) } else { gen_opt_send_without_block(jit, asm) } @@ -3718,8 +3783,7 @@ fn gen_opt_aref( // Defer compilation so we can specialize base on a runtime receiver if !jit.at_compile_target() { - defer_compilation(jit, asm); - return Some(EndBlock); + return jit.defer_compilation(asm); } // Specialize base on compile time values @@ -3740,7 +3804,6 @@ fn gen_opt_aref( jit_guard_known_klass( jit, asm, - unsafe { rb_cArray }, recv_opnd, recv_opnd.into(), comptime_recv, @@ -3768,8 +3831,7 @@ fn gen_opt_aref( } // Jump to next instruction. This allows guard chains to share the same successor. - jump_to_next_insn(jit, asm); - return Some(EndBlock); + return jump_to_next_insn(jit, asm); } else if comptime_recv.class_of() == unsafe { rb_cHash } { if !assume_bop_not_redefined(jit, asm, HASH_REDEFINED_OP_FLAG, BOP_AREF) { return None; @@ -3781,7 +3843,6 @@ fn gen_opt_aref( jit_guard_known_klass( jit, asm, - unsafe { rb_cHash }, recv_opnd, recv_opnd.into(), comptime_recv, @@ -3805,8 +3866,7 @@ fn gen_opt_aref( asm.mov(stack_ret, val); // Jump to next instruction. This allows guard chains to share the same successor. - jump_to_next_insn(jit, asm); - Some(EndBlock) + jump_to_next_insn(jit, asm) } else { // General case. Call the [] method. 
gen_opt_send_without_block(jit, asm) @@ -3819,8 +3879,7 @@ fn gen_opt_aset( ) -> Option<CodegenStatus> { // Defer compilation so we can specialize on a runtime `self` if !jit.at_compile_target() { - defer_compilation(jit, asm); - return Some(EndBlock); + return jit.defer_compilation(asm); } let comptime_recv = jit.peek_at_stack(&asm.ctx, 2); @@ -3836,7 +3895,6 @@ fn gen_opt_aset( jit_guard_known_klass( jit, asm, - unsafe { rb_cArray }, recv, recv.into(), comptime_recv, @@ -3848,7 +3906,6 @@ fn gen_opt_aset( jit_guard_known_klass( jit, asm, - unsafe { rb_cInteger }, key, key.into(), comptime_key, @@ -3875,14 +3932,12 @@ fn gen_opt_aset( let stack_ret = asm.stack_push(Type::Unknown); asm.mov(stack_ret, val); - jump_to_next_insn(jit, asm); - return Some(EndBlock); + return jump_to_next_insn(jit, asm) } else if comptime_recv.class_of() == unsafe { rb_cHash } { // Guard receiver is a Hash jit_guard_known_klass( jit, asm, - unsafe { rb_cHash }, recv, recv.into(), comptime_recv, @@ -3904,45 +3959,12 @@ fn gen_opt_aset( let stack_ret = asm.stack_push(Type::Unknown); asm.mov(stack_ret, ret); - jump_to_next_insn(jit, asm); - Some(EndBlock) + jump_to_next_insn(jit, asm) } else { gen_opt_send_without_block(jit, asm) } } -fn gen_opt_aref_with( - jit: &mut JITState, - asm: &mut Assembler, -) -> Option<CodegenStatus>{ - // We might allocate or raise - jit_prepare_non_leaf_call(jit, asm); - - let key_opnd = Opnd::Value(jit.get_arg(0)); - let recv_opnd = asm.stack_opnd(0); - - extern "C" { - fn rb_vm_opt_aref_with(recv: VALUE, key: VALUE) -> VALUE; - } - - let val_opnd = asm.ccall( - rb_vm_opt_aref_with as *const u8, - vec![ - recv_opnd, - key_opnd - ], - ); - asm.stack_pop(1); // Keep it on stack during GC - - asm.cmp(val_opnd, Qundef.into()); - asm.je(Target::side_exit(Counter::opt_aref_with_qundef)); - - let top = asm.stack_push(Type::Unknown); - asm.mov(top, val_opnd); - - return Some(KeepCompiling); -} - fn gen_opt_and( jit: &mut JITState, asm: &mut Assembler, @@ -3951,8 +3973,7 @@ fn gen_opt_and( Some(two_fixnums) => two_fixnums, None => { // Defer compilation so we can specialize on a runtime `self` - defer_compilation(jit, asm); - return Some(EndBlock); + return jit.defer_compilation(asm); } }; @@ -3990,8 +4011,7 @@ fn gen_opt_or( Some(two_fixnums) => two_fixnums, None => { // Defer compilation so we can specialize on a runtime `self` - defer_compilation(jit, asm); - return Some(EndBlock); + return jit.defer_compilation(asm); } }; @@ -4029,8 +4049,7 @@ fn gen_opt_minus( Some(two_fixnums) => two_fixnums, None => { // Defer compilation so we can specialize on a runtime `self` - defer_compilation(jit, asm); - return Some(EndBlock); + return jit.defer_compilation(asm); } }; @@ -4069,8 +4088,7 @@ fn gen_opt_mult( let two_fixnums = match asm.ctx.two_fixnums_on_stack(jit) { Some(two_fixnums) => two_fixnums, None => { - defer_compilation(jit, asm); - return Some(EndBlock); + return jit.defer_compilation(asm); } }; @@ -4121,8 +4139,7 @@ fn gen_opt_mod( Some(two_fixnums) => two_fixnums, None => { // Defer compilation so we can specialize on a runtime `self` - defer_compilation(jit, asm); - return Some(EndBlock); + return jit.defer_compilation(asm); } }; @@ -4214,11 +4231,11 @@ fn gen_opt_ary_freeze( return None; } - let str = jit.get_arg(0); + let ary = jit.get_arg(0); // Push the return value onto the stack let stack_ret = asm.stack_push(Type::CArray); - asm.mov(stack_ret, str.into()); + asm.mov(stack_ret, ary.into()); Some(KeepCompiling) } @@ -4231,11 +4248,11 @@ fn gen_opt_hash_freeze( return None; } - 
let str = jit.get_arg(0); + let hash = jit.get_arg(0); // Push the return value onto the stack let stack_ret = asm.stack_push(Type::CHash); - asm.mov(stack_ret, str.into()); + asm.mov(stack_ret, hash.into()); Some(KeepCompiling) } @@ -4289,6 +4306,53 @@ fn gen_opt_newarray_max( Some(KeepCompiling) } +fn gen_opt_duparray_send( + jit: &mut JITState, + asm: &mut Assembler, +) -> Option<CodegenStatus> { + let method = jit.get_arg(1).as_u64(); + + if method == ID!(include_p) { + gen_opt_duparray_send_include_p(jit, asm) + } else { + None + } +} + +fn gen_opt_duparray_send_include_p( + jit: &mut JITState, + asm: &mut Assembler, +) -> Option<CodegenStatus> { + asm_comment!(asm, "opt_duparray_send include_p"); + + let ary = jit.get_arg(0); + let argc = jit.get_arg(2).as_usize(); + + // Save the PC and SP because we may call #include? + jit_prepare_non_leaf_call(jit, asm); + + extern "C" { + fn rb_vm_opt_duparray_include_p(ec: EcPtr, ary: VALUE, target: VALUE) -> VALUE; + } + + let target = asm.ctx.sp_opnd(-1); + + let val_opnd = asm.ccall( + rb_vm_opt_duparray_include_p as *const u8, + vec![ + EC, + ary.into(), + target, + ], + ); + + asm.stack_pop(argc); + let stack_ret = asm.stack_push(Type::Unknown); + asm.mov(stack_ret, val_opnd); + + Some(KeepCompiling) +} + fn gen_opt_newarray_send( jit: &mut JITState, asm: &mut Assembler, @@ -4301,6 +4365,8 @@ fn gen_opt_newarray_send( gen_opt_newarray_max(jit, asm) } else if method == VM_OPT_NEWARRAY_SEND_HASH { gen_opt_newarray_hash(jit, asm) + } else if method == VM_OPT_NEWARRAY_SEND_INCLUDE_P { + gen_opt_newarray_include_p(jit, asm) } else if method == VM_OPT_NEWARRAY_SEND_PACK { gen_opt_newarray_pack_buffer(jit, asm, 1, None) } else if method == VM_OPT_NEWARRAY_SEND_PACK_BUFFER { @@ -4386,6 +4452,42 @@ fn gen_opt_newarray_hash( Some(KeepCompiling) } +fn gen_opt_newarray_include_p( + jit: &mut JITState, + asm: &mut Assembler, +) -> Option<CodegenStatus> { + asm_comment!(asm, "opt_newarray_send include?"); + + let num = jit.get_arg(0).as_u32(); + + // Save the PC and SP because we may call customized methods. + jit_prepare_non_leaf_call(jit, asm); + + extern "C" { + fn rb_vm_opt_newarray_include_p(ec: EcPtr, num: u32, elts: *const VALUE, target: VALUE) -> VALUE; + } + + let values_opnd = asm.ctx.sp_opnd(-(num as i32)); + let values_ptr = asm.lea(values_opnd); + let target = asm.ctx.sp_opnd(-1); + + let val_opnd = asm.ccall( + rb_vm_opt_newarray_include_p as *const u8, + vec![ + EC, + (num - 1).into(), + values_ptr, + target + ], + ); + + asm.stack_pop(num.as_usize()); + let stack_ret = asm.stack_push(Type::Unknown); + asm.mov(stack_ret, val_opnd); + + Some(KeepCompiling) +} + fn gen_opt_newarray_min( jit: &mut JITState, asm: &mut Assembler, @@ -4459,8 +4561,7 @@ fn gen_opt_case_dispatch( // hash lookup, at least for small hashes, but it's worth revisiting this // assumption in the future. 
if !jit.at_compile_target() { - defer_compilation(jit, asm); - return Some(EndBlock); + return jit.defer_compilation(asm); } let case_hash = jit.get_arg(0); @@ -4572,15 +4673,14 @@ fn gen_branchif( // Generate the branch instructions let ctx = asm.ctx; - gen_branch( - jit, + jit.gen_branch( asm, jump_block, &ctx, Some(next_block), Some(&ctx), BranchGenFn::BranchIf(Cell::new(BranchShape::Default)), - )?; + ); } Some(EndBlock) @@ -4626,15 +4726,14 @@ fn gen_branchunless( // Generate the branch instructions let ctx = asm.ctx; - gen_branch( - jit, + jit.gen_branch( asm, jump_block, &ctx, Some(next_block), Some(&ctx), BranchGenFn::BranchUnless(Cell::new(BranchShape::Default)), - )?; + ); } Some(EndBlock) @@ -4677,15 +4776,14 @@ fn gen_branchnil( asm.cmp(val_opnd, Opnd::UImm(Qnil.into())); // Generate the branch instructions let ctx = asm.ctx; - gen_branch( - jit, + jit.gen_branch( asm, jump_block, &ctx, Some(next_block), Some(&ctx), BranchGenFn::BranchNil(Cell::new(BranchShape::Default)), - )?; + ); } Some(EndBlock) @@ -4733,6 +4831,69 @@ fn gen_throw( Some(EndBlock) } +fn gen_opt_new( + jit: &mut JITState, + asm: &mut Assembler, +) -> Option<CodegenStatus> { + let cd = jit.get_arg(0).as_ptr(); + let jump_offset = jit.get_arg(1).as_i32(); + + if !jit.at_compile_target() { + return jit.defer_compilation(asm); + } + + let ci = unsafe { get_call_data_ci(cd) }; // info about the call site + let mid = unsafe { vm_ci_mid(ci) }; + let argc: i32 = unsafe { vm_ci_argc(ci) }.try_into().unwrap(); + + let recv_idx = argc; + let comptime_recv = jit.peek_at_stack(&asm.ctx, recv_idx as isize); + + // This is a singleton class + let comptime_recv_klass = comptime_recv.class_of(); + + let recv = asm.stack_opnd(recv_idx); + + perf_call!("opt_new: ", jit_guard_known_klass( + jit, + asm, + recv, + recv.into(), + comptime_recv, + SEND_MAX_DEPTH, + Counter::guard_send_klass_megamorphic, + )); + + // We now know that it's always comptime_recv_klass + if jit.assume_expected_cfunc(asm, comptime_recv_klass, mid, rb_class_new_instance_pass_kw as _) { + // Fast path + // call rb_class_alloc to actually allocate + jit_prepare_non_leaf_call(jit, asm); + let obj = asm.ccall(rb_obj_alloc as _, vec![comptime_recv.into()]); + + // Get a reference to the stack location where we need to save the + // return instance. 
+ let result = asm.stack_opnd(recv_idx + 1); + let recv = asm.stack_opnd(recv_idx); + + // Replace the receiver for the upcoming initialize call + asm.ctx.set_opnd_mapping(recv.into(), TempMapping::MapToStack(Type::UnknownHeap)); + asm.mov(recv, obj); + + // Save the allocated object for return + asm.ctx.set_opnd_mapping(result.into(), TempMapping::MapToStack(Type::UnknownHeap)); + asm.mov(result, obj); + + jump_to_next_insn(jit, asm) + } else { + // general case + + // Get the branch target instruction offsets + let jump_idx = jit.next_insn_idx() as i32 + jump_offset; + return end_block_with_jump(jit, asm, jump_idx as u16); + } +} + fn gen_jump( jit: &mut JITState, asm: &mut Assembler, @@ -4766,13 +4927,13 @@ fn gen_jump( fn jit_guard_known_klass( jit: &mut JITState, asm: &mut Assembler, - known_klass: VALUE, obj_opnd: Opnd, insn_opnd: YARVOpnd, sample_instance: VALUE, max_chain_depth: u8, counter: Counter, ) { + let known_klass = sample_instance.class_of(); let val_type = asm.ctx.get_opnd_type(insn_opnd); if val_type.known_class() == Some(known_klass) { @@ -4878,7 +5039,7 @@ fn jit_guard_known_klass( assert_eq!(sample_instance.class_of(), rb_cString, "context says class is exactly ::String") }; } else { - assert!(!val_type.is_imm()); + assert!(!val_type.is_imm(), "{insn_opnd:?} should be a heap object, but was {val_type:?} for {sample_instance:?}"); // Check that the receiver is a heap object // Note: if we get here, the class doesn't have immediate instances. @@ -5272,6 +5433,35 @@ fn jit_rb_int_succ( true } +fn jit_rb_int_pred( + _jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + _argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + // Guard the receiver is fixnum + let recv_type = asm.ctx.get_opnd_type(StackOpnd(0)); + let recv = asm.stack_pop(1); + if recv_type != Type::Fixnum { + asm_comment!(asm, "guard object is fixnum"); + asm.test(recv, Opnd::Imm(RUBY_FIXNUM_FLAG as i64)); + asm.jz(Target::side_exit(Counter::send_pred_not_fixnum)); + } + + asm_comment!(asm, "Integer#pred"); + let out_val = asm.sub(recv, Opnd::Imm(2)); // 2 is untagged Fixnum 1 + asm.jo(Target::side_exit(Counter::send_pred_underflow)); + + // Push the output onto the stack + let dst = asm.stack_push(Type::Fixnum); + asm.mov(dst, out_val); + + true +} + fn jit_rb_int_div( jit: &mut JITState, asm: &mut Assembler, @@ -5493,7 +5683,6 @@ fn jit_rb_float_plus( jit_guard_known_klass( jit, asm, - comptime_obj.class_of(), obj, obj.into(), comptime_obj, @@ -5535,7 +5724,6 @@ fn jit_rb_float_minus( jit_guard_known_klass( jit, asm, - comptime_obj.class_of(), obj, obj.into(), comptime_obj, @@ -5577,7 +5765,6 @@ fn jit_rb_float_mul( jit_guard_known_klass( jit, asm, - comptime_obj.class_of(), obj, obj.into(), comptime_obj, @@ -5619,7 +5806,6 @@ fn jit_rb_float_div( jit_guard_known_klass( jit, asm, - comptime_obj.class_of(), obj, obj.into(), comptime_obj, @@ -5787,6 +5973,82 @@ fn jit_rb_str_byteslice( true } +fn jit_rb_str_aref_m( + jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + // In yjit-bench the most common usages by far are single fixnum or two fixnums. 
+ // rb_str_substr should be leaf if indexes are fixnums + if argc == 2 { + match (asm.ctx.get_opnd_type(StackOpnd(0)), asm.ctx.get_opnd_type(StackOpnd(1))) { + (Type::Fixnum, Type::Fixnum) => {}, + // There is a two-argument form of (RegExp, Fixnum) which needs a different c func. + // Other types will raise. + _ => { return false }, + } + } else if argc == 1 { + match asm.ctx.get_opnd_type(StackOpnd(0)) { + Type::Fixnum => {}, + // Besides Fixnum this could also be a Range or a RegExp which are handled by separate c funcs. + // Other types will raise. + _ => { + // If the context doesn't have the type info we try a little harder. + let comptime_arg = jit.peek_at_stack(&asm.ctx, 0); + let arg0 = asm.stack_opnd(0); + if comptime_arg.fixnum_p() { + asm.test(arg0, Opnd::UImm(RUBY_FIXNUM_FLAG as u64)); + + jit_chain_guard( + JCC_JZ, + jit, + asm, + SEND_MAX_DEPTH, + Counter::guard_send_str_aref_not_fixnum, + ); + } else { + return false + } + }, + } + } else { + return false + } + + asm_comment!(asm, "String#[]"); + + // rb_str_substr allocates a substring + jit_prepare_call_with_gc(jit, asm); + + // Get stack operands after potential SP change + + // The "empty" arg distinguishes between the normal "one arg" behavior + // and the "two arg" special case that returns an empty string + // when the begin index is the length of the string. + // See the usages of rb_str_substr in string.c for more information. + let (beg_idx, empty, len) = if argc == 2 { + (1, Opnd::Imm(1), asm.stack_opnd(0)) + } else { + // If there is only one arg, the length will be 1. + (0, Opnd::Imm(0), VALUE::fixnum_from_usize(1).into()) + }; + + let beg = asm.stack_opnd(beg_idx); + let recv = asm.stack_opnd(beg_idx + 1); + + let ret_opnd = asm.ccall(rb_str_substr_two_fixnums as *const u8, vec![recv, beg, len, empty]); + asm.stack_pop(beg_idx as usize + 2); + + let out_opnd = asm.stack_push(Type::Unknown); + asm.mov(out_opnd, ret_opnd); + + true +} + fn jit_rb_str_getbyte( jit: &mut JITState, asm: &mut Assembler, @@ -5807,7 +6069,6 @@ fn jit_rb_str_getbyte( jit_guard_known_klass( jit, asm, - comptime_idx.class_of(), idx, idx.into(), comptime_idx, @@ -5907,6 +6168,41 @@ fn jit_rb_str_to_s( false } +fn jit_rb_str_dup( + jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + _argc: i32, + known_recv_class: Option<VALUE>, +) -> bool { + // We specialize only the BARE_STRING_P case. Otherwise it's not leaf. 
+ if unsafe { known_recv_class != Some(rb_cString) } { + return false; + } + asm_comment!(asm, "String#dup"); + + jit_prepare_call_with_gc(jit, asm); + + let recv_opnd = asm.stack_opnd(0); + let recv_opnd = asm.load(recv_opnd); + + let shape_id_offset = unsafe { rb_shape_id_offset() }; + let shape_opnd = Opnd::mem(64, recv_opnd, shape_id_offset); + asm.test(shape_opnd, Opnd::UImm(SHAPE_ID_HAS_IVAR_MASK as u64)); + asm.jnz(Target::side_exit(Counter::send_str_dup_exivar)); + + // Call rb_str_dup + let ret_opnd = asm.ccall(rb_str_dup as *const u8, vec![recv_opnd]); + + asm.stack_pop(1); + let stack_ret = asm.stack_push(Type::CString); + asm.mov(stack_ret, ret_opnd); + + true +} + // Codegen for rb_str_empty_p() fn jit_rb_str_empty_p( _jit: &mut JITState, @@ -5957,7 +6253,7 @@ fn jit_rb_str_concat_codepoint( guard_object_is_fixnum(jit, asm, codepoint, StackOpnd(0)); - asm.ccall(rb_yjit_str_concat_codepoint as *const u8, vec![recv, codepoint]); + asm.ccall(rb_jit_str_concat_codepoint as *const u8, vec![recv, codepoint]); // The receiver is the return value, so we only need to pop the codepoint argument off the stack. // We can reuse the receiver slot in the stack as the return value. @@ -6263,6 +6559,7 @@ fn jit_rb_f_block_given_p( true } +/// Codegen for `block_given?` and `defined?(yield)` fn gen_block_given( jit: &mut JITState, asm: &mut Assembler, @@ -6272,16 +6569,24 @@ fn gen_block_given( ) { asm_comment!(asm, "block_given?"); - // Same as rb_vm_frame_block_handler - let ep_opnd = gen_get_lep(jit, asm); - let block_handler = asm.load( - Opnd::mem(64, ep_opnd, SIZEOF_VALUE_I32 * VM_ENV_DATA_INDEX_SPECVAL) - ); + // `yield` goes to the block handler stowed in the "local" iseq which is + // the current iseq or a parent. Only the "method" iseq type can be passed a + // block handler. (e.g. `yield` in the top level script is a syntax error.) + let local_iseq = unsafe { rb_get_iseq_body_local_iseq(jit.iseq) }; + if unsafe { rb_get_iseq_body_type(local_iseq) } == ISEQ_TYPE_METHOD { + // Same as rb_vm_frame_block_handler + let ep_opnd = gen_get_lep(jit, asm); + let block_handler = asm.load( + Opnd::mem(64, ep_opnd, SIZEOF_VALUE_I32 * VM_ENV_DATA_INDEX_SPECVAL) + ); - // Return `block_handler != VM_BLOCK_HANDLER_NONE` - asm.cmp(block_handler, VM_BLOCK_HANDLER_NONE.into()); - let block_given = asm.csel_ne(true_opnd, false_opnd); - asm.mov(out_opnd, block_given); + // Return `block_handler != VM_BLOCK_HANDLER_NONE` + asm.cmp(block_handler, VM_BLOCK_HANDLER_NONE.into()); + let block_given = asm.csel_ne(true_opnd, false_opnd); + asm.mov(out_opnd, block_given); + } else { + asm.mov(out_opnd, false_opnd); + } } // Codegen for rb_class_superclass() @@ -6298,6 +6603,7 @@ fn jit_rb_class_superclass( fn rb_class_superclass(klass: VALUE) -> VALUE; } + // It may raise "uninitialized class" if !jit_prepare_lazy_frame_call(jit, asm, cme, StackOpnd(0)) { return false; } @@ -6353,7 +6659,7 @@ fn jit_thread_s_current( asm.stack_pop(1); // ec->thread_ptr - let ec_thread_opnd = asm.load(Opnd::mem(64, EC, RUBY_OFFSET_EC_THREAD_PTR)); + let ec_thread_opnd = asm.load(Opnd::mem(64, EC, RUBY_OFFSET_EC_THREAD_PTR as i32)); // thread->self let thread_self = Opnd::mem(64, ec_thread_opnd, RUBY_OFFSET_THREAD_SELF); @@ -6616,11 +6922,12 @@ fn gen_send_cfunc( // Increment total cfunc send count gen_counter_incr(jit, asm, Counter::num_send_cfunc); - // Delegate to codegen for C methods if we have it. + // Delegate to codegen for C methods if we have it and the callsite is simple enough. 
if kw_arg.is_null() && !kw_splat && flags & VM_CALL_OPT_SEND == 0 && flags & VM_CALL_ARGS_SPLAT == 0 && + flags & VM_CALL_ARGS_BLOCKARG == 0 && (cfunc_argc == -1 || argc == cfunc_argc) { let expected_stack_after = asm.ctx.get_stack_size() as i32 - argc; if let Some(known_cfunc_codegen) = lookup_cfunc_codegen(unsafe { (*cme).def }) { @@ -6640,8 +6947,7 @@ fn gen_send_cfunc( gen_counter_incr(jit, asm, Counter::num_send_cfunc_inline); // cfunc codegen generated code. Terminate the block so // there isn't multiple calls in the same block. - jump_to_next_insn(jit, asm); - return Some(EndBlock); + return jump_to_next_insn(jit, asm); } } } @@ -6702,7 +7008,7 @@ fn gen_send_cfunc( return None; } - let block_arg_type = if block_arg { + let mut block_arg_type = if block_arg { Some(asm.ctx.get_opnd_type(StackOpnd(0))) } else { None @@ -6710,33 +7016,25 @@ fn gen_send_cfunc( match block_arg_type { Some(Type::Nil | Type::BlockParamProxy) => { - // We'll handle this later - } - None => { - // Nothing to do - } - _ => { - gen_counter_incr(jit, asm, Counter::send_cfunc_block_arg); - return None; - } - } - - match block_arg_type { - Some(Type::Nil) => { - // We have a nil block arg, so let's pop it off the args + // We don't need the actual stack value for these asm.stack_pop(1); } - Some(Type::BlockParamProxy) => { - // We don't need the actual stack value + Some(Type::Unknown | Type::UnknownImm) if jit.peek_at_stack(&asm.ctx, 0).nil_p() => { + // The sample blockarg is nil, so speculate that's the case. + asm.cmp(asm.stack_opnd(0), Qnil.into()); + asm.jne(Target::side_exit(Counter::guard_send_cfunc_block_not_nil)); + block_arg_type = Some(Type::Nil); asm.stack_pop(1); } None => { // Nothing to do } _ => { - assert!(false); + gen_counter_incr(jit, asm, Counter::send_cfunc_block_arg); + return None; } } + let block_arg_type = block_arg_type; // drop `mut` // Pop the empty kw_splat hash if kw_splat { @@ -6826,7 +7124,7 @@ fn gen_send_cfunc( asm_comment!(asm, "set ec->cfp"); let new_cfp = asm.lea(Opnd::mem(64, CFP, -(RUBY_SIZEOF_CONTROL_FRAME as i32))); - asm.store(Opnd::mem(64, EC, RUBY_OFFSET_EC_CFP), new_cfp); + asm.store(Opnd::mem(64, EC, RUBY_OFFSET_EC_CFP as i32), new_cfp); if !kw_arg.is_null() { // Build a hash from all kwargs passed @@ -6922,7 +7220,7 @@ fn gen_send_cfunc( // Pop the stack frame (ec->cfp++) // Instead of recalculating, we can reuse the previous CFP, which is stored in a callee-saved // register - let ec_cfp_opnd = Opnd::mem(64, EC, RUBY_OFFSET_EC_CFP); + let ec_cfp_opnd = Opnd::mem(64, EC, RUBY_OFFSET_EC_CFP as i32); asm.store(ec_cfp_opnd, CFP); // cfunc calls may corrupt types @@ -6933,8 +7231,7 @@ fn gen_send_cfunc( // Jump (fall through) to the call continuation block // We do this to end the current block after the call - jump_to_next_insn(jit, asm); - Some(EndBlock) + jump_to_next_insn(jit, asm) } // Generate RARRAY_LEN. 
For array_opnd, use Opnd::Reg to reduce memory access, @@ -7089,7 +7386,7 @@ fn gen_send_bmethod( ) -> Option<CodegenStatus> { let procv = unsafe { rb_get_def_bmethod_proc((*cme).def) }; - let proc = unsafe { rb_yjit_get_proc_ptr(procv) }; + let proc = unsafe { rb_jit_get_proc_ptr(procv) }; let proc_block = unsafe { &(*proc).block }; if proc_block.type_ != block_type_iseq { @@ -7099,11 +7396,12 @@ fn gen_send_bmethod( let capture = unsafe { proc_block.as_.captured.as_ref() }; let iseq = unsafe { *capture.code.iseq.as_ref() }; - // Optimize for single ractor mode and avoid runtime check for - // "defined with an un-shareable Proc in a different Ractor" - if !assume_single_ractor_mode(jit, asm) { - gen_counter_incr(jit, asm, Counter::send_bmethod_ractor); - return None; + if !procv.shareable_p() { + let ractor_serial = unsafe { rb_yjit_cme_ractor_serial(cme) }; + asm_comment!(asm, "guard current ractor == {}", ractor_serial); + let current_ractor_serial = asm.load(Opnd::mem(64, EC, RUBY_OFFSET_EC_RACTOR_ID as i32)); + asm.cmp(current_ractor_serial, ractor_serial.into()); + asm.jne(Target::side_exit(Counter::send_bmethod_ractor)); } // Passing a block to a block needs logic different from passing @@ -7124,7 +7422,7 @@ enum IseqReturn { Receiver, } -extern { +extern "C" { fn rb_simple_iseq_p(iseq: IseqPtr) -> bool; fn rb_iseq_only_kwparam_p(iseq: IseqPtr) -> bool; } @@ -7169,6 +7467,12 @@ fn iseq_get_return_value(iseq: IseqPtr, captured_opnd: Option<Opnd>, block: Opti let ep_offset = unsafe { *rb_iseq_pc_at_idx(iseq, 1) }.as_u32(); let local_idx = ep_offset_to_local_idx(iseq, ep_offset); + // Only inline getlocal on a parameter. DCE in the IESQ builder can + // make a two-instruction ISEQ that does not return a parameter. + if local_idx >= unsafe { get_iseq_body_param_size(iseq) } { + return None; + } + if unsafe { rb_simple_iseq_p(iseq) } { return Some(IseqReturn::LocalVariable(local_idx)); } else if unsafe { rb_iseq_only_kwparam_p(iseq) } { @@ -7325,7 +7629,7 @@ fn gen_send_iseq( gen_counter_incr(jit, asm, Counter::send_iseq_splat_not_array); return None; } else { - unsafe { rb_yjit_array_len(array) as u32} + unsafe { rb_jit_array_len(array) as u32} }; // Arity check accounting for size of the splat. When callee has rest parameters, we insert @@ -7416,7 +7720,7 @@ fn gen_send_iseq( gen_counter_incr(jit, asm, Counter::num_send_iseq); // Shortcut for special `Primitive.attr! :leaf` builtins - let builtin_attrs = unsafe { rb_yjit_iseq_builtin_attrs(iseq) }; + let builtin_attrs = unsafe { rb_jit_iseq_builtin_attrs(iseq) }; let builtin_func_raw = unsafe { rb_yjit_builtin_function(iseq) }; let builtin_func = if builtin_func_raw.is_null() { None } else { Some(builtin_func_raw) }; let opt_send_call = flags & VM_CALL_OPT_SEND != 0; // .send call is not currently supported for builtins @@ -7473,8 +7777,7 @@ fn gen_send_iseq( // Seems like a safe assumption. 
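// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the diff): the recurring rewrite of
//     jump_to_next_insn(jit, asm);
//     Some(EndBlock)
// into a plain
//     jump_to_next_insn(jit, asm)
// reads naturally if the helper itself now returns Option<CodegenStatus>, so a
// failure inside it (for example, running out of code memory) propagates to
// the caller instead of being hidden behind an unconditional Some(EndBlock).
// Hypothetical, simplified signatures:
#[allow(dead_code)]
#[derive(Debug, PartialEq)]
enum CodegenStatusSketch { KeepCompiling, EndBlock }

fn jump_to_next_insn_sketch(out_of_memory: bool) -> Option<CodegenStatusSketch> {
    if out_of_memory { None } else { Some(CodegenStatusSketch::EndBlock) }
}

fn gen_some_insn_sketch(out_of_memory: bool) -> Option<CodegenStatusSketch> {
    // Forward the helper's result instead of always reporting success
    jump_to_next_insn_sketch(out_of_memory)
}
// ---------------------------------------------------------------------------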
// Let guard chains share the same successor - jump_to_next_insn(jit, asm); - return Some(EndBlock); + return jump_to_next_insn(jit, asm); } } @@ -7512,8 +7815,7 @@ fn gen_send_iseq( } // Let guard chains share the same successor - jump_to_next_insn(jit, asm); - return Some(EndBlock); + return jump_to_next_insn(jit, asm); } // Stack overflow check @@ -7595,6 +7897,11 @@ fn gen_send_iseq( gen_counter_incr(jit, asm, Counter::send_iseq_clobbering_block_arg); return None; } + if iseq_has_rest || has_kwrest { + // The proc would be stored above the current stack top, where GC can't see it + gen_counter_incr(jit, asm, Counter::send_iseq_block_arg_gc_unsafe); + return None; + } let proc = asm.stack_pop(1); // Pop first, as argc doesn't account for the block arg let callee_specval = asm.ctx.sp_opnd(callee_specval); asm.store(callee_specval, proc); @@ -7745,14 +8052,14 @@ fn gen_send_iseq( // Pop surplus positional arguments when yielding if arg_setup_block { - let extras = argc - required_num - opt_num; + let extras = argc - required_num - opt_num - kw_arg_num; if extras > 0 { // Checked earlier. If there are keyword args, then // the positional arguments are not at the stack top. assert_eq!(0, kw_arg_num); asm.stack_pop(extras as usize); - argc = required_num + opt_num; + argc = required_num + opt_num + kw_arg_num; } } @@ -7802,7 +8109,6 @@ fn gen_send_iseq( } } - // Don't nil fill forwarding iseqs if !forwarding { // Nil-initialize missing optional parameters nil_fill( @@ -7823,13 +8129,13 @@ fn gen_send_iseq( // Nil-initialize non-parameter locals nil_fill( "nil-initialize locals", - { - let begin = -argc + num_params; - let end = -argc + num_locals; + { + let begin = -argc + num_params; + let end = -argc + num_locals; - begin..end - }, - asm + begin..end + }, + asm ); } @@ -7837,9 +8143,13 @@ fn gen_send_iseq( assert_eq!(1, num_params); // Write the CI in to the stack and ensure that it actually gets // flushed to memory + asm_comment!(asm, "put call info for forwarding"); let ci_opnd = asm.stack_opnd(-1); asm.ctx.dealloc_reg(ci_opnd.reg_opnd()); asm.mov(ci_opnd, VALUE(ci as usize).into()); + + // Nil-initialize other locals which are above the CI + nil_fill("nil-initialize locals", 1..num_locals, asm); } // Points to the receiver operand on the stack unless a captured environment is used @@ -7893,53 +8203,16 @@ fn gen_send_iseq( pc: None, // We are calling into jitted code, which will set the PC as necessary })); - // Create a context for the callee - let mut callee_ctx = Context::default(); - - // Transfer some stack temp registers to the callee's locals for arguments. - let mapped_temps = if !forwarding { - asm.map_temp_regs_to_args(&mut callee_ctx, argc) - } else { - // When forwarding, the callee's local table has only a callinfo, - // so we can't map the actual arguments to the callee's locals. - vec![] - }; - - // Spill stack temps and locals that are not used by the callee. - // This must be done before changing the SP register. - asm.spill_regs_except(&mapped_temps); - - // Saving SP before calculating ep avoids a dependency on a register - // However this must be done after referencing frame.recv, which may be SP-relative - asm.mov(SP, callee_sp); - - // Log the name of the method we're calling to. We intentionally don't do this for inlined ISEQs. - // We also do this after gen_push_frame() to minimize the impact of spill_temps() on asm.ccall(). 
- if get_option!(gen_stats) { - // Protect caller-saved registers in case they're used for arguments - asm.cpush_all(); - - // Assemble the ISEQ name string - let name_str = get_iseq_name(iseq); - - // Get an index for this ISEQ name - let iseq_idx = get_iseq_idx(&name_str); - - // Increment the counter for this cfunc - asm.ccall(incr_iseq_counter as *const u8, vec![iseq_idx.into()]); - asm.cpop_all(); - } - // No need to set cfp->pc since the callee sets it whenever calling into routines // that could look at it through jit_save_pc(). // mov(cb, REG0, const_ptr_opnd(start_pc)); // mov(cb, member_opnd(REG_CFP, rb_control_frame_t, pc), REG0); - // Stub so we can return to JITted code - let return_block = BlockId { - iseq: jit.iseq, - idx: jit.next_insn_idx(), - }; + // Create a blockid for the callee + let callee_blockid = BlockId { iseq, idx: start_pc_offset }; + + // Create a context for the callee + let mut callee_ctx = Context::default(); // If the callee has :inline_block annotation and the callsite has a block ISEQ, // duplicate a callee block for each block ISEQ to make its `yield` monomorphic. @@ -7968,29 +8241,92 @@ fn gen_send_iseq( }; callee_ctx.upgrade_opnd_type(SelfOpnd, recv_type); - // Now that callee_ctx is prepared, discover a block that can be reused if we move some registers. - // If there's such a block, move registers accordingly to avoid creating a new block. - let blockid = BlockId { iseq, idx: start_pc_offset }; - if !mapped_temps.is_empty() { - // Discover a block that have the same things in different (or same) registers - if let Some(block_ctx) = find_block_ctx_with_same_regs(blockid, &callee_ctx) { - // List pairs of moves for making the register mappings compatible + // Spill or preserve argument registers + if forwarding { + // When forwarding, the callee's local table has only a callinfo, + // so we can't map the actual arguments to the callee's locals. + asm.spill_regs(); + } else { + // Discover stack temp registers that can be used as the callee's locals + let mapped_temps = asm.map_temp_regs_to_args(&mut callee_ctx, argc); + + // Spill stack temps and locals that are not used by the callee. + // This must be done before changing the SP register. + asm.spill_regs_except(&mapped_temps); + + // If the callee block has been compiled before, spill/move registers to reuse the existing block + // for minimizing the number of blocks we need to compile. + if let Some(existing_reg_mapping) = find_most_compatible_reg_mapping(callee_blockid, &callee_ctx) { + asm_comment!(asm, "reuse maps: {:?} -> {:?}", callee_ctx.get_reg_mapping(), existing_reg_mapping); + + // Spill the registers that are not used in the existing block. + // When the same ISEQ is compiled as an entry block, it starts with no registers allocated. + for ®_opnd in callee_ctx.get_reg_mapping().get_reg_opnds().iter() { + if existing_reg_mapping.get_reg(reg_opnd).is_none() { + match reg_opnd { + RegOpnd::Local(local_idx) => { + let spilled_temp = asm.stack_opnd(argc - local_idx as i32 - 1); + asm.spill_reg(spilled_temp); + callee_ctx.dealloc_reg(reg_opnd); + } + RegOpnd::Stack(_) => unreachable!("callee {:?} should have been spilled", reg_opnd), + } + } + } + assert!(callee_ctx.get_reg_mapping().get_reg_opnds().len() <= existing_reg_mapping.get_reg_opnds().len()); + + // Load the registers that are spilled in this block but used in the existing block. + // When there are multiple callsites, some registers spilled in this block may be used at other callsites. 
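// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the diff): the block-reuse logic above and
// below boils down to (1) counting how many register slots disagree between
// the caller's mapping and an already-compiled callee block's mapping, and
// (2) preferring the candidate with the fewest mismatches, then fixing the
// remainder by spilling, loading, or shuffling registers. A hypothetical,
// much-simplified model of RegMapping (slots holding an operand id or None):
type MappingSketch = [Option<u8>; 5];

// Mirrors the reworked RegMapping::diff further down: count differing slots.
fn mapping_diff(src: &MappingSketch, dst: &MappingSketch) -> usize {
    src.iter().zip(dst).filter(|(a, b)| a != b).count()
}

// Simplified stand-in for find_most_compatible_reg_mapping: pick the
// already-compiled mapping that needs the fewest fixes (ignoring the type
// compatibility checks the real code also performs).
fn most_compatible(candidates: &[MappingSketch], want: &MappingSketch) -> Option<MappingSketch> {
    candidates.iter().copied().min_by_key(|cand| mapping_diff(want, cand))
}
// ---------------------------------------------------------------------------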
+ for ®_opnd in existing_reg_mapping.get_reg_opnds().iter() { + if callee_ctx.get_reg_mapping().get_reg(reg_opnd).is_none() { + match reg_opnd { + RegOpnd::Local(local_idx) => { + callee_ctx.alloc_reg(reg_opnd); + let loaded_reg = TEMP_REGS[callee_ctx.get_reg_mapping().get_reg(reg_opnd).unwrap()]; + let loaded_temp = asm.stack_opnd(argc - local_idx as i32 - 1); + asm.load_into(Opnd::Reg(loaded_reg), loaded_temp); + } + RegOpnd::Stack(_) => unreachable!("find_most_compatible_reg_mapping should not leave {:?}", reg_opnd), + } + } + } + assert_eq!(callee_ctx.get_reg_mapping().get_reg_opnds().len(), existing_reg_mapping.get_reg_opnds().len()); + + // Shuffle registers to make the register mappings compatible let mut moves = vec![]; for ®_opnd in callee_ctx.get_reg_mapping().get_reg_opnds().iter() { let old_reg = TEMP_REGS[callee_ctx.get_reg_mapping().get_reg(reg_opnd).unwrap()]; - let new_reg = TEMP_REGS[block_ctx.get_reg_mapping().get_reg(reg_opnd).unwrap()]; + let new_reg = TEMP_REGS[existing_reg_mapping.get_reg(reg_opnd).unwrap()]; moves.push((new_reg, Opnd::Reg(old_reg))); } - - // Shuffle them to break cycles and generate the moves - let moves = Assembler::reorder_reg_moves(&moves); - for (reg, opnd) in moves { + for (reg, opnd) in Assembler::reorder_reg_moves(&moves) { asm.load_into(Opnd::Reg(reg), opnd); } - callee_ctx.set_reg_mapping(block_ctx.get_reg_mapping()); + callee_ctx.set_reg_mapping(existing_reg_mapping); } } + // Update SP register for the callee. This must be done after referencing frame.recv, + // which may be SP-relative. + asm.mov(SP, callee_sp); + + // Log the name of the method we're calling to. We intentionally don't do this for inlined ISEQs. + // We also do this after spill_regs() to avoid doubly spilling the same thing on asm.ccall(). + if get_option!(gen_stats) { + // Protect caller-saved registers in case they're used for arguments + let mapping = asm.cpush_all(); + + // Assemble the ISEQ name string + let name_str = get_iseq_name(iseq); + + // Get an index for this ISEQ name + let iseq_idx = get_iseq_idx(&name_str); + + // Increment the counter for this cfunc + asm.ccall(incr_iseq_counter as *const u8, vec![iseq_idx.into()]); + asm.cpop_all(mapping); + } + // The callee might change locals through Kernel#binding and other means. asm.clear_local_types(); @@ -8003,32 +8339,33 @@ fn gen_send_iseq( return_asm.ctx.reset_chain_depth_and_defer(); return_asm.ctx.set_as_return_landing(); + // Stub so we can return to JITted code + let return_block = BlockId { + iseq: jit.iseq, + idx: jit.next_insn_idx(), + }; + // Write the JIT return address on the callee frame - if gen_branch( - jit, + jit.gen_branch( asm, return_block, &return_asm.ctx, None, None, BranchGenFn::JITReturn, - ).is_none() { - // Returning None here would have send_dynamic() code following incomplete - // send code. Abandon the block instead. 
- jit.block_abandoned = true; - } + ); // ec->cfp is updated after cfp->jit_return for rb_profile_frames() safety asm_comment!(asm, "switch to new CFP"); let new_cfp = asm.sub(CFP, RUBY_SIZEOF_CONTROL_FRAME.into()); asm.mov(CFP, new_cfp); - asm.store(Opnd::mem(64, EC, RUBY_OFFSET_EC_CFP), CFP); + asm.store(Opnd::mem(64, EC, RUBY_OFFSET_EC_CFP as i32), CFP); // Directly jump to the entry point of the callee gen_direct_jump( jit, &callee_ctx, - blockid, + callee_blockid, asm, ); @@ -8594,8 +8931,7 @@ fn gen_struct_aref( let ret = asm.stack_push(Type::Unknown); asm.mov(ret, val); - jump_to_next_insn(jit, asm); - Some(EndBlock) + jump_to_next_insn(jit, asm) } fn gen_struct_aset( @@ -8611,6 +8947,12 @@ fn gen_struct_aset( return None; } + // If the comptime receiver is frozen, writing a struct member will raise an exception + // and we don't want to JIT code to deal with that situation. + if comptime_recv.is_frozen() { + return None; + } + if c_method_tracing_currently_enabled(jit) { // Struct accesses need fire c_call and c_return events, which we can't support // See :attr-tracing: @@ -8631,6 +8973,17 @@ fn gen_struct_aset( assert!(unsafe { RB_TYPE_P(comptime_recv, RUBY_T_STRUCT) }); assert!((off as i64) < unsafe { RSTRUCT_LEN(comptime_recv) }); + // Even if the comptime recv was not frozen, future recv may be. So we need to emit a guard + // that the recv is not frozen. + // We know all structs are heap objects, so we can check the flag directly. + let recv = asm.stack_opnd(1); + let recv = asm.load(recv); + let flags = asm.load(Opnd::mem(VALUE_BITS, recv, RUBY_OFFSET_RBASIC_FLAGS)); + asm.test(flags, (RUBY_FL_FREEZE as u64).into()); + asm.jnz(Target::side_exit(Counter::opt_aset_frozen)); + + // Not frozen, so we can proceed. + asm_comment!(asm, "struct aset"); let val = asm.stack_pop(1); @@ -8641,8 +8994,7 @@ fn gen_struct_aset( let ret = asm.stack_push(Type::Unknown); asm.mov(ret, val); - jump_to_next_insn(jit, asm); - Some(EndBlock) + jump_to_next_insn(jit, asm) } // Generate code that calls a method with dynamic dispatch @@ -8684,8 +9036,7 @@ fn gen_send_dynamic<F: Fn(&mut Assembler) -> Opnd>( jit_perf_symbol_pop!(jit, asm, PerfMap::Codegen); // End the current block for invalidationg and sharing the same successor - jump_to_next_insn(jit, asm); - Some(EndBlock) + jump_to_next_insn(jit, asm) } fn gen_send_general( @@ -8711,15 +9062,14 @@ fn gen_send_general( // Defer compilation so we can specialize on class of receiver if !jit.at_compile_target() { - defer_compilation(jit, asm); - return Some(EndBlock); + return jit.defer_compilation(asm); } let ci_flags = unsafe { vm_ci_flag(ci) }; // Dynamic stack layout. No good way to support without inlining. 
if ci_flags & VM_CALL_FORWARDING != 0 { - gen_counter_incr(jit, asm, Counter::send_iseq_forwarding); + gen_counter_incr(jit, asm, Counter::send_forwarding); return None; } @@ -8745,7 +9095,6 @@ fn gen_send_general( let recv_opnd: YARVOpnd = recv.into(); // Log the name of the method we're calling to - #[cfg(feature = "disasm")] asm_comment!(asm, "call to {}", get_method_name(Some(comptime_recv_klass), mid)); // Gather some statistics about sends @@ -8765,7 +9114,6 @@ fn gen_send_general( perf_call!("gen_send_general: ", jit_guard_known_klass( jit, asm, - comptime_recv_klass, recv, recv_opnd, comptime_recv, @@ -9018,7 +9366,6 @@ fn gen_send_general( } OPTIMIZED_METHOD_TYPE_CALL => { - if block.is_some() { gen_counter_incr(jit, asm, Counter::send_call_block); return None; @@ -9034,13 +9381,6 @@ fn gen_send_general( return None; } - // Optimize for single ractor mode and avoid runtime check for - // "defined with an un-shareable Proc in a different Ractor" - if !assume_single_ractor_mode(jit, asm) { - gen_counter_incr(jit, asm, Counter::send_call_multi_ractor); - return None; - } - // If this is a .send call we need to adjust the stack if flags & VM_CALL_OPT_SEND != 0 { handle_opt_send_shift_stack(asm, argc); @@ -9070,8 +9410,9 @@ fn gen_send_general( let stack_ret = asm.stack_push(Type::Unknown); asm.mov(stack_ret, ret); - return Some(KeepCompiling); + // End the block to allow invalidating the next instruction + return jump_to_next_insn(jit, asm); } OPTIMIZED_METHOD_TYPE_BLOCK_CALL => { gen_counter_incr(jit, asm, Counter::send_optimized_method_block_call); @@ -9244,7 +9585,24 @@ fn gen_sendforward( jit: &mut JITState, asm: &mut Assembler, ) -> Option<CodegenStatus> { - return gen_send(jit, asm); + // Generate specialized code if possible + let cd = jit.get_arg(0).as_ptr(); + let block = jit.get_arg(1).as_optional_ptr().map(|iseq| BlockHandler::BlockISeq(iseq)); + if let Some(status) = perf_call! { gen_send_general(jit, asm, cd, block) } { + return Some(status); + } + + // Otherwise, fallback to dynamic dispatch using the interpreter's implementation of sendforward + let blockiseq = jit.get_arg(1).as_iseq(); + gen_send_dynamic(jit, asm, cd, unsafe { rb_yjit_sendish_sp_pops((*cd).ci) }, |asm| { + extern "C" { + fn rb_vm_sendforward(ec: EcPtr, cfp: CfpPtr, cd: VALUE, blockiseq: IseqPtr) -> VALUE; + } + asm.ccall( + rb_vm_sendforward as *const u8, + vec![EC, CFP, (cd as usize).into(), VALUE(blockiseq as usize).into()], + ) + }) } fn gen_invokeblock( @@ -9275,8 +9633,7 @@ fn gen_invokeblock_specialized( cd: *const rb_call_data, ) -> Option<CodegenStatus> { if !jit.at_compile_target() { - defer_compilation(jit, asm); - return Some(EndBlock); + return jit.defer_compilation(asm); } // Fallback to dynamic dispatch if this callsite is megamorphic @@ -9319,7 +9676,7 @@ fn gen_invokeblock_specialized( // If the current ISEQ is annotated to be inlined but it's not being inlined here, // generate a dynamic dispatch to avoid making this yield megamorphic. 
- if unsafe { rb_yjit_iseq_builtin_attrs(jit.iseq) } & BUILTIN_ATTR_INLINE_BLOCK != 0 && !asm.ctx.inline() { + if unsafe { rb_jit_iseq_builtin_attrs(jit.iseq) } & BUILTIN_ATTR_INLINE_BLOCK != 0 && !asm.ctx.inline() { gen_counter_incr(jit, asm, Counter::invokeblock_iseq_not_inlined); return None; } @@ -9390,8 +9747,7 @@ fn gen_invokeblock_specialized( asm.clear_local_types(); // Share the successor with other chains - jump_to_next_insn(jit, asm); - Some(EndBlock) + jump_to_next_insn(jit, asm) } else if comptime_handler.symbol_p() { gen_counter_incr(jit, asm, Counter::invokeblock_symbol); None @@ -9411,7 +9767,7 @@ fn gen_invokesuper( return Some(status); } - // Otherwise, fallback to dynamic dispatch using the interpreter's implementation of send + // Otherwise, fallback to dynamic dispatch using the interpreter's implementation of invokesuper let blockiseq = jit.get_arg(1).as_iseq(); gen_send_dynamic(jit, asm, cd, unsafe { rb_yjit_sendish_sp_pops((*cd).ci) }, |asm| { extern "C" { @@ -9428,7 +9784,23 @@ fn gen_invokesuperforward( jit: &mut JITState, asm: &mut Assembler, ) -> Option<CodegenStatus> { - return gen_invokesuper(jit, asm); + // Generate specialized code if possible + let cd = jit.get_arg(0).as_ptr(); + if let Some(status) = gen_invokesuper_specialized(jit, asm, cd) { + return Some(status); + } + + // Otherwise, fallback to dynamic dispatch using the interpreter's implementation of invokesuperforward + let blockiseq = jit.get_arg(1).as_iseq(); + gen_send_dynamic(jit, asm, cd, unsafe { rb_yjit_sendish_sp_pops((*cd).ci) }, |asm| { + extern "C" { + fn rb_vm_invokesuperforward(ec: EcPtr, cfp: CfpPtr, cd: VALUE, blockiseq: IseqPtr) -> VALUE; + } + asm.ccall( + rb_vm_invokesuperforward as *const u8, + vec![EC, CFP, (cd as usize).into(), VALUE(blockiseq as usize).into()], + ) + }) } fn gen_invokesuper_specialized( @@ -9438,8 +9810,7 @@ fn gen_invokesuper_specialized( ) -> Option<CodegenStatus> { // Defer compilation so we can specialize on class of receiver if !jit.at_compile_target() { - defer_compilation(jit, asm); - return Some(EndBlock); + return jit.defer_compilation(asm); } // Handle the last two branches of vm_caller_setup_arg_block @@ -9583,7 +9954,7 @@ fn gen_leave( asm_comment!(asm, "pop stack frame"); let incr_cfp = asm.add(CFP, RUBY_SIZEOF_CONTROL_FRAME.into()); asm.mov(CFP, incr_cfp); - asm.mov(Opnd::mem(64, EC, RUBY_OFFSET_EC_CFP), CFP); + asm.mov(Opnd::mem(64, EC, RUBY_OFFSET_EC_CFP as i32), CFP); // Load the return value let retval_opnd = asm.stack_pop(1); @@ -9672,8 +10043,7 @@ fn gen_objtostring( asm: &mut Assembler, ) -> Option<CodegenStatus> { if !jit.at_compile_target() { - defer_compilation(jit, asm); - return Some(EndBlock); + return jit.defer_compilation(asm); } let recv = asm.stack_opnd(0); @@ -9683,7 +10053,6 @@ fn gen_objtostring( jit_guard_known_klass( jit, asm, - comptime_recv.class_of(), recv, recv.into(), comptime_recv, @@ -9693,6 +10062,34 @@ fn gen_objtostring( // No work needed. The string value is already on the top of the stack. 
Some(KeepCompiling) + } else if unsafe { RB_TYPE_P(comptime_recv, RUBY_T_SYMBOL) } && assume_method_basic_definition(jit, asm, comptime_recv.class_of(), ID!(to_s)) { + jit_guard_known_klass( + jit, + asm, + recv, + recv.into(), + comptime_recv, + SEND_MAX_DEPTH, + Counter::objtostring_not_string, + ); + + extern "C" { + fn rb_sym2str(sym: VALUE) -> VALUE; + } + + // Same optimization done in the interpreter: rb_sym_to_s() allocates a mutable string, but since we are only + // going to use this string for interpolation, it's fine to use the + // frozen string. + // rb_sym2str does not allocate. + let sym = recv; + let str = asm.ccall(rb_sym2str as *const u8, vec![sym]); + asm.stack_pop(1); + + // Push the return value + let stack_ret = asm.stack_push(Type::TString); + asm.mov(stack_ret, str); + + Some(KeepCompiling) } else { let cd = jit.get_arg(0).as_ptr(); perf_call! { gen_send_general(jit, asm, cd, None) } @@ -9852,7 +10249,7 @@ fn gen_getclassvariable( let val_opnd = asm.ccall( rb_vm_getclassvariable as *const u8, vec![ - Opnd::mem(64, CFP, RUBY_OFFSET_CFP_ISEQ), + VALUE(jit.iseq as usize).into(), CFP, Opnd::UImm(jit.get_arg(0).as_u64()), Opnd::UImm(jit.get_arg(1).as_u64()), @@ -9876,7 +10273,7 @@ fn gen_setclassvariable( asm.ccall( rb_vm_setclassvariable as *const u8, vec![ - Opnd::mem(64, CFP, RUBY_OFFSET_CFP_ISEQ), + VALUE(jit.iseq as usize).into(), CFP, Opnd::UImm(jit.get_arg(0).as_u64()), val, @@ -9953,8 +10350,7 @@ fn gen_opt_getconstant_path( let stack_top = asm.stack_push(Type::Unknown); asm.store(stack_top, val); - jump_to_next_insn(jit, asm); - return Some(EndBlock); + return jump_to_next_insn(jit, asm); } let cref_sensitive = !unsafe { (*ice).ic_cref }.is_null(); @@ -10002,8 +10398,7 @@ fn gen_opt_getconstant_path( jit_putobject(asm, unsafe { (*ice).value }); } - jump_to_next_insn(jit, asm); - Some(EndBlock) + jump_to_next_insn(jit, asm) } // Push the explicit block parameter onto the temporary stack. 
Part of the @@ -10014,8 +10409,7 @@ fn gen_getblockparamproxy( asm: &mut Assembler, ) -> Option<CodegenStatus> { if !jit.at_compile_target() { - defer_compilation(jit, asm); - return Some(EndBlock); + return jit.defer_compilation(asm); } // EP level @@ -10129,9 +10523,7 @@ fn gen_getblockparamproxy( unreachable!("absurd given initial filtering"); } - jump_to_next_insn(jit, asm); - - Some(EndBlock) + jump_to_next_insn(jit, asm) } fn gen_getblockparam( @@ -10306,6 +10698,7 @@ fn get_gen_fn(opcode: VALUE) -> Option<InsnGenFn> { YARVINSN_dup => Some(gen_dup), YARVINSN_dupn => Some(gen_dupn), YARVINSN_swap => Some(gen_swap), + YARVINSN_opt_reverse => Some(gen_opt_reverse), YARVINSN_putnil => Some(gen_putnil), YARVINSN_putobject => Some(gen_putobject), YARVINSN_putobject_INT2FIX_0_ => Some(gen_putobject_int2fix), @@ -10340,6 +10733,7 @@ fn get_gen_fn(opcode: VALUE) -> Option<InsnGenFn> { YARVINSN_opt_hash_freeze => Some(gen_opt_hash_freeze), YARVINSN_opt_str_freeze => Some(gen_opt_str_freeze), YARVINSN_opt_str_uminus => Some(gen_opt_str_uminus), + YARVINSN_opt_duparray_send => Some(gen_opt_duparray_send), YARVINSN_opt_newarray_send => Some(gen_opt_newarray_send), YARVINSN_splatarray => Some(gen_splatarray), YARVINSN_splatkw => Some(gen_splatkw), @@ -10362,7 +10756,6 @@ fn get_gen_fn(opcode: VALUE) -> Option<InsnGenFn> { YARVINSN_opt_neq => Some(gen_opt_neq), YARVINSN_opt_aref => Some(gen_opt_aref), YARVINSN_opt_aset => Some(gen_opt_aset), - YARVINSN_opt_aref_with => Some(gen_opt_aref_with), YARVINSN_opt_mult => Some(gen_opt_mult), YARVINSN_opt_div => Some(gen_opt_div), YARVINSN_opt_ltlt => Some(gen_opt_ltlt), @@ -10384,6 +10777,7 @@ fn get_gen_fn(opcode: VALUE) -> Option<InsnGenFn> { YARVINSN_branchnil => Some(gen_branchnil), YARVINSN_throw => Some(gen_throw), YARVINSN_jump => Some(gen_jump), + YARVINSN_opt_new => Some(gen_opt_new), YARVINSN_getblockparamproxy => Some(gen_getblockparamproxy), YARVINSN_getblockparam => Some(gen_getblockparam), @@ -10454,6 +10848,7 @@ pub fn yjit_reg_method_codegen_fns() { reg_method_codegen(rb_cInteger, "===", jit_rb_int_equal); reg_method_codegen(rb_cInteger, "succ", jit_rb_int_succ); + reg_method_codegen(rb_cInteger, "pred", jit_rb_int_pred); reg_method_codegen(rb_cInteger, "/", jit_rb_int_div); reg_method_codegen(rb_cInteger, "<<", jit_rb_int_lshift); reg_method_codegen(rb_cInteger, ">>", jit_rb_int_rshift); @@ -10465,6 +10860,7 @@ pub fn yjit_reg_method_codegen_fns() { reg_method_codegen(rb_cFloat, "*", jit_rb_float_mul); reg_method_codegen(rb_cFloat, "/", jit_rb_float_div); + reg_method_codegen(rb_cString, "dup", jit_rb_str_dup); reg_method_codegen(rb_cString, "empty?", jit_rb_str_empty_p); reg_method_codegen(rb_cString, "to_s", jit_rb_str_to_s); reg_method_codegen(rb_cString, "to_str", jit_rb_str_to_s); @@ -10474,6 +10870,8 @@ pub fn yjit_reg_method_codegen_fns() { reg_method_codegen(rb_cString, "getbyte", jit_rb_str_getbyte); reg_method_codegen(rb_cString, "setbyte", jit_rb_str_setbyte); reg_method_codegen(rb_cString, "byteslice", jit_rb_str_byteslice); + reg_method_codegen(rb_cString, "[]", jit_rb_str_aref_m); + reg_method_codegen(rb_cString, "slice", jit_rb_str_aref_m); reg_method_codegen(rb_cString, "<<", jit_rb_str_concat); reg_method_codegen(rb_cString, "+@", jit_rb_str_uplus); @@ -10506,13 +10904,12 @@ pub fn yjit_reg_method_codegen_fns() { /// and do not make method calls. /// /// See also: [lookup_cfunc_codegen]. 
-fn reg_method_codegen(klass: VALUE, mid_str: &str, gen_fn: MethodGenFn) { - let id_string = std::ffi::CString::new(mid_str).expect("couldn't convert to CString!"); - let mid = unsafe { rb_intern(id_string.as_ptr()) }; +fn reg_method_codegen(klass: VALUE, method_name: &str, gen_fn: MethodGenFn) { + let mid = unsafe { rb_intern2(method_name.as_ptr().cast(), method_name.len().try_into().unwrap()) }; let me = unsafe { rb_method_entry_at(klass, mid) }; if me.is_null() { - panic!("undefined optimized method!: {mid_str}"); + panic!("undefined optimized method!: {method_name}"); } // For now, only cfuncs are supported (me->cme cast fine since it's just me->def->type). @@ -10526,6 +10923,10 @@ fn reg_method_codegen(klass: VALUE, mid_str: &str, gen_fn: MethodGenFn) { unsafe { METHOD_CODEGEN_TABLE.as_mut().unwrap().insert(method_serial, gen_fn); } } +pub fn yjit_shutdown_free_codegen_table() { + unsafe { METHOD_CODEGEN_TABLE = None; }; +} + /// Global state needed for code generation pub struct CodegenGlobals { /// Flat vector of bits to store compressed context data @@ -10586,7 +10987,7 @@ impl CodegenGlobals { #[cfg(not(test))] let (mut cb, mut ocb) = { - let virt_block: *mut u8 = unsafe { rb_yjit_reserve_addr_space(exec_mem_size as u32) }; + let virt_block: *mut u8 = unsafe { rb_jit_reserve_addr_space(exec_mem_size as u32) }; // Memory protection syscalls need page-aligned addresses, so check it here. Assuming // `virt_block` is page-aligned, `second_half` should be page-aligned as long as the @@ -10595,7 +10996,7 @@ impl CodegenGlobals { // // Basically, we don't support x86-64 2MiB and 1GiB pages. ARMv8 can do up to 64KiB // (2¹⁶ bytes) pages, which should be fine. 4KiB pages seem to be the most popular though. - let page_size = unsafe { rb_yjit_get_page_size() }; + let page_size = unsafe { rb_jit_get_page_size() }; assert_eq!( virt_block as usize % page_size.as_usize(), 0, "Start of virtual address block should be page-aligned", @@ -10611,7 +11012,7 @@ impl CodegenGlobals { exec_mem_size, get_option!(mem_size), ); - let mem_block = Rc::new(RefCell::new(mem_block)); + let mem_block = Rc::new(mem_block); let freed_pages = Rc::new(None); @@ -10874,6 +11275,41 @@ mod tests { } #[test] + fn test_gen_opt_reverse() { + let (_context, mut asm, mut cb, mut ocb) = setup_codegen(); + let mut jit = dummy_jit_state(&mut cb, &mut ocb); + + // Odd number of elements + asm.stack_push(Type::Fixnum); + asm.stack_push(Type::Flonum); + asm.stack_push(Type::CString); + + let mut value_array: [u64; 2] = [0, 3]; + let pc: *mut VALUE = &mut value_array as *mut u64 as *mut VALUE; + jit.pc = pc; + + let mut status = gen_opt_reverse(&mut jit, &mut asm); + + assert_eq!(status, Some(KeepCompiling)); + + assert_eq!(Type::CString, asm.ctx.get_opnd_type(StackOpnd(2))); + assert_eq!(Type::Flonum, asm.ctx.get_opnd_type(StackOpnd(1))); + assert_eq!(Type::Fixnum, asm.ctx.get_opnd_type(StackOpnd(0))); + + // Try again with an even number of elements. 
+ asm.stack_push(Type::Nil); + value_array[1] = 4; + status = gen_opt_reverse(&mut jit, &mut asm); + + assert_eq!(status, Some(KeepCompiling)); + + assert_eq!(Type::Nil, asm.ctx.get_opnd_type(StackOpnd(3))); + assert_eq!(Type::Fixnum, asm.ctx.get_opnd_type(StackOpnd(2))); + assert_eq!(Type::Flonum, asm.ctx.get_opnd_type(StackOpnd(1))); + assert_eq!(Type::CString, asm.ctx.get_opnd_type(StackOpnd(0))); + } + + #[test] fn test_gen_swap() { let (_context, mut asm, mut cb, mut ocb) = setup_codegen(); let mut jit = dummy_jit_state(&mut cb, &mut ocb); diff --git a/yjit/src/core.rs b/yjit/src/core.rs index aaf9ca2055..0590135392 100644 --- a/yjit/src/core.rs +++ b/yjit/src/core.rs @@ -447,25 +447,9 @@ impl RegMapping { self.0.iter().filter_map(|®_opnd| reg_opnd).collect() } - /// Return TypeDiff::Compatible(diff) if dst has a mapping that can be made by moving registers - /// in self `diff` times. TypeDiff::Incompatible if they have different things in registers. - pub fn diff(&self, dst: RegMapping) -> TypeDiff { - let src_opnds = self.get_reg_opnds(); - let dst_opnds = dst.get_reg_opnds(); - if src_opnds.len() != dst_opnds.len() { - return TypeDiff::Incompatible; - } - - let mut diff = 0; - for ®_opnd in src_opnds.iter() { - match (self.get_reg(reg_opnd), dst.get_reg(reg_opnd)) { - (Some(src_idx), Some(dst_idx)) => if src_idx != dst_idx { - diff += 1; - } - _ => return TypeDiff::Incompatible, - } - } - TypeDiff::Compatible(diff) + /// Count the number of registers that store a different operand from `dst`. + pub fn diff(&self, dst: RegMapping) -> usize { + self.0.iter().enumerate().filter(|&(reg_idx, ®)| reg != dst.0[reg_idx]).count() } } @@ -974,13 +958,13 @@ impl Context { if CTX_DECODE_CACHE == None { // Here we use the vec syntax to avoid allocating the large table on the stack, // as this can cause a stack overflow - let tbl = vec![(Context::default(), 0); CTX_ENCODE_CACHE_SIZE].into_boxed_slice().try_into().unwrap(); + let tbl = vec![(Context::default(), 0); CTX_DECODE_CACHE_SIZE].into_boxed_slice().try_into().unwrap(); CTX_DECODE_CACHE = Some(tbl); } // Write a cache entry for this context let cache = CTX_DECODE_CACHE.as_mut().unwrap(); - cache[idx as usize % CTX_ENCODE_CACHE_SIZE] = (*ctx, idx); + cache[idx as usize % CTX_DECODE_CACHE_SIZE] = (*ctx, idx); } } @@ -1115,7 +1099,7 @@ impl Context { MapToLocal(local_idx) => { bits.push_op(CtxOp::MapTempLocal); bits.push_u3(stack_idx as u8); - bits.push_u3(local_idx as u8); + bits.push_u3(local_idx); } MapToSelf => { @@ -1834,7 +1818,7 @@ pub fn for_each_iseq<F: FnMut(IseqPtr)>(mut callback: F) { callback(iseq); } let mut data: &mut dyn FnMut(IseqPtr) = &mut callback; - unsafe { rb_yjit_for_each_iseq(Some(callback_wrapper), (&mut data) as *mut _ as *mut c_void) }; + unsafe { rb_jit_for_each_iseq(Some(callback_wrapper), (&mut data) as *mut _ as *mut c_void) }; } /// Iterate over all on-stack ISEQs @@ -1936,7 +1920,7 @@ pub extern "C" fn rb_yjit_iseq_mark(payload: *mut c_void) { // For aliasing, having the VM lock hopefully also implies that no one // else has an overlapping &mut IseqPayload. unsafe { - rb_yjit_assert_holding_vm_lock(); + rb_assert_holding_vm_lock(); &*(payload as *const IseqPayload) } }; @@ -2025,7 +2009,7 @@ pub extern "C" fn rb_yjit_iseq_update_references(iseq: IseqPtr) { // For aliasing, having the VM lock hopefully also implies that no one // else has an overlapping &mut IseqPayload. 
unsafe { - rb_yjit_assert_holding_vm_lock(); + rb_assert_holding_vm_lock(); &*(payload as *const IseqPayload) } }; @@ -2051,13 +2035,6 @@ pub extern "C" fn rb_yjit_iseq_update_references(iseq: IseqPtr) { block_update_references(block, cb, true); } - // Note that we would have returned already if YJIT is off. - cb.mark_all_executable(); - - CodegenGlobals::get_outlined_cb() - .unwrap() - .mark_all_executable(); - return; fn block_update_references(block: &Block, cb: &mut CodeBlock, dead: bool) { @@ -2114,11 +2091,9 @@ pub extern "C" fn rb_yjit_iseq_update_references(iseq: IseqPtr) { // Only write when the VALUE moves, to be copy-on-write friendly. if new_addr != object { - for (byte_idx, &byte) in new_addr.as_u64().to_le_bytes().iter().enumerate() { - let byte_code_ptr = value_code_ptr.add_bytes(byte_idx); - cb.write_mem(byte_code_ptr, byte) - .expect("patching existing code should be within bounds"); - } + // SAFETY: Since we already set code memory writable before the compacting phase, + // we can use raw memory accesses directly. + unsafe { value_ptr.write_unaligned(new_addr); } } } } @@ -2126,6 +2101,34 @@ pub extern "C" fn rb_yjit_iseq_update_references(iseq: IseqPtr) { } } +/// Mark all code memory as writable. +/// This function is useful for garbage collectors that update references in JIT-compiled code in +/// bulk. +#[no_mangle] +pub extern "C" fn rb_yjit_mark_all_writeable() { + if CodegenGlobals::has_instance() { + CodegenGlobals::get_inline_cb().mark_all_writeable(); + + CodegenGlobals::get_outlined_cb() + .unwrap() + .mark_all_writeable(); + } +} + +/// Mark all code memory as executable. +/// This function is useful for garbage collectors that update references in JIT-compiled code in +/// bulk. +#[no_mangle] +pub extern "C" fn rb_yjit_mark_all_executable() { + if CodegenGlobals::has_instance() { + CodegenGlobals::get_inline_cb().mark_all_executable(); + + CodegenGlobals::get_outlined_cb() + .unwrap() + .mark_all_executable(); + } +} + /// Get all blocks for a particular place in an iseq. fn get_version_list(blockid: BlockId) -> Option<&'static mut VersionList> { let insn_idx = blockid.idx.as_usize(); @@ -2240,13 +2243,12 @@ fn find_block_version(blockid: BlockId, ctx: &Context) -> Option<BlockRef> { return best_version; } -/// Basically find_block_version() but allows RegMapping incompatibility -/// that can be fixed by register moves and returns Context -pub fn find_block_ctx_with_same_regs(blockid: BlockId, ctx: &Context) -> Option<Context> { +/// Find the closest RegMapping among ones that have already been compiled. 
+pub fn find_most_compatible_reg_mapping(blockid: BlockId, ctx: &Context) -> Option<RegMapping> { let versions = get_version_list(blockid)?; // Best match found - let mut best_ctx: Option<Context> = None; + let mut best_mapping: Option<RegMapping> = None; let mut best_diff = usize::MAX; // For each version matching the blockid @@ -2254,17 +2256,17 @@ pub fn find_block_ctx_with_same_regs(blockid: BlockId, ctx: &Context) -> Option< let block = unsafe { blockref.as_ref() }; let block_ctx = Context::decode(block.ctx); - // Discover the best block that is compatible if we move registers - match ctx.diff_with_same_regs(&block_ctx) { + // Discover the best block that is compatible if we load/spill registers + match ctx.diff_allowing_reg_mismatch(&block_ctx) { TypeDiff::Compatible(diff) if diff < best_diff => { - best_ctx = Some(block_ctx); + best_mapping = Some(block_ctx.get_reg_mapping()); best_diff = diff; } _ => {} } } - best_ctx + best_mapping } /// Allow inlining a Block up to MAX_INLINE_VERSIONS times. @@ -2309,7 +2311,9 @@ pub fn limit_block_versions(blockid: BlockId, ctx: &Context) -> Context { return generic_ctx; } - incr_counter_to!(max_inline_versions, next_versions); + if ctx.inline() { + incr_counter_to!(max_inline_versions, next_versions); + } return *ctx; } @@ -2367,6 +2371,9 @@ unsafe fn add_block_version(blockref: BlockRef, cb: &CodeBlock) { } incr_counter!(compiled_block_count); + if Context::decode(block.ctx).inline() { + incr_counter!(inline_block_count); + } // Mark code pages for code GC let iseq_payload = get_iseq_payload(block.iseq.get()).unwrap(); @@ -2412,7 +2419,9 @@ impl<'a> JITState<'a> { // Pending branches => actual branches outgoing: MutableBranchList(Cell::new(self.pending_outgoing.into_iter().map(|pending_out| { let pending_out = Rc::try_unwrap(pending_out) - .ok().expect("all PendingBranchRefs should be unique when ready to construct a Block"); + .unwrap_or_else(|rc| panic!( + "PendingBranchRef should be unique when ready to construct a Block. \ + strong={} weak={}", Rc::strong_count(&rc), Rc::weak_count(&rc))); pending_out.into_branch(NonNull::new(blockref as *mut Block).expect("no null from Box")) }).collect())) }); @@ -2420,7 +2429,7 @@ impl<'a> JITState<'a> { // SAFETY: allocated with Box above unsafe { ptr::write(blockref, block) }; - // Block is initialized now. Note that MaybeUnint<T> has the same layout as T. + // Block is initialized now. Note that MaybeUninit<T> has the same layout as T. let blockref = NonNull::new(blockref as *mut Block).expect("no null from Box"); // Track all the assumptions the block makes as invariants @@ -2591,6 +2600,14 @@ impl Context { self.sp_opnd(-ep_offset + offset) } + /// Start using a register for a given stack temp or a local. + pub fn alloc_reg(&mut self, opnd: RegOpnd) { + let mut reg_mapping = self.get_reg_mapping(); + if reg_mapping.alloc_reg(opnd) { + self.set_reg_mapping(reg_mapping); + } + } + /// Stop using a register for a given stack temp or a local. /// This allows us to reuse the register for a value that we know is dead /// and will no longer be used (e.g. popped stack temp). @@ -2893,19 +2910,26 @@ impl Context { return TypeDiff::Compatible(diff); } - /// Basically diff() but allows RegMapping incompatibility that can be fixed - /// by register moves. - pub fn diff_with_same_regs(&self, dst: &Context) -> TypeDiff { + /// Basically diff() but allows RegMapping incompatibility that could be fixed by + /// spilling, loading, or shuffling registers. 
+ pub fn diff_allowing_reg_mismatch(&self, dst: &Context) -> TypeDiff { + // We shuffle only RegOpnd::Local and spill any other RegOpnd::Stack. + // If dst has RegOpnd::Stack, we can't reuse the block as a callee. + for reg_opnd in dst.get_reg_mapping().get_reg_opnds() { + if matches!(reg_opnd, RegOpnd::Stack(_)) { + return TypeDiff::Incompatible; + } + } + // Prepare a Context with the same registers let mut dst_with_same_regs = dst.clone(); dst_with_same_regs.set_reg_mapping(self.get_reg_mapping()); // Diff registers and other stuff separately, and merge them - match (self.diff(&dst_with_same_regs), self.get_reg_mapping().diff(dst.get_reg_mapping())) { - (TypeDiff::Compatible(ctx_diff), TypeDiff::Compatible(reg_diff)) => { - TypeDiff::Compatible(ctx_diff + reg_diff) - } - _ => TypeDiff::Incompatible + if let TypeDiff::Compatible(ctx_diff) = self.diff(&dst_with_same_regs) { + TypeDiff::Compatible(ctx_diff + self.get_reg_mapping().diff(dst.get_reg_mapping())) + } else { + TypeDiff::Incompatible } } @@ -3198,16 +3222,33 @@ pub fn gen_entry_point(iseq: IseqPtr, ec: EcPtr, jit_exception: bool) -> Option< let cb = CodegenGlobals::get_inline_cb(); let ocb = CodegenGlobals::get_outlined_cb(); + let code_ptr = gen_entry_point_body(blockid, stack_size, ec, jit_exception, cb, ocb); + + cb.mark_all_executable(); + ocb.unwrap().mark_all_executable(); + + code_ptr +} + +fn gen_entry_point_body(blockid: BlockId, stack_size: u8, ec: EcPtr, jit_exception: bool, cb: &mut CodeBlock, ocb: &mut OutlinedCb) -> Option<*const u8> { // Write the interpreter entry prologue. Might be NULL when out of memory. - let code_ptr = gen_entry_prologue(cb, ocb, iseq, insn_idx, jit_exception); + let (code_ptr, reg_mapping) = gen_entry_prologue(cb, ocb, blockid, stack_size, jit_exception)?; - // Try to generate code for the entry block + // Find or compile a block version let mut ctx = Context::default(); ctx.stack_size = stack_size; - let block = gen_block_series(blockid, &ctx, ec, cb, ocb); - - cb.mark_all_executable(); - ocb.unwrap().mark_all_executable(); + ctx.reg_mapping = reg_mapping; + let block = match find_block_version(blockid, &ctx) { + // If an existing block is found, generate a jump to the block. + Some(blockref) => { + let mut asm = Assembler::new_without_iseq(); + asm.jmp(unsafe { blockref.as_ref() }.start_addr.into()); + asm.compile(cb, Some(ocb))?; + Some(blockref) + } + // If this block hasn't yet been compiled, generate blocks after the entry guard. + None => gen_block_series(blockid, &ctx, ec, cb, ocb), + }; match block { // Compilation failed @@ -3232,7 +3273,7 @@ pub fn gen_entry_point(iseq: IseqPtr, ec: EcPtr, jit_exception: bool) -> Option< incr_counter!(compiled_iseq_entry); // Compilation successful and block not empty - code_ptr.map(|ptr| ptr.raw_ptr(cb)) + Some(code_ptr.raw_ptr(cb)) } // Change the entry's jump target from an entry stub to a next entry @@ -3307,20 +3348,22 @@ fn entry_stub_hit_body( let cfp = unsafe { get_ec_cfp(ec) }; let iseq = unsafe { get_cfp_iseq(cfp) }; let insn_idx = iseq_pc_to_insn_idx(iseq, unsafe { get_cfp_pc(cfp) })?; + let blockid = BlockId { iseq, idx: insn_idx }; let stack_size: u8 = unsafe { u8::try_from(get_cfp_sp(cfp).offset_from(get_cfp_bp(cfp))).ok()? 
}; // Compile a new entry guard as a next entry let next_entry = cb.get_write_ptr(); - let mut asm = Assembler::new_without_iseq(); - let pending_entry = gen_entry_chain_guard(&mut asm, ocb, iseq, insn_idx)?; + let mut asm = Assembler::new(unsafe { get_iseq_body_local_table_size(iseq) }); + let pending_entry = gen_entry_chain_guard(&mut asm, ocb, blockid)?; + let reg_mapping = gen_entry_reg_mapping(&mut asm, blockid, stack_size); asm.compile(cb, Some(ocb))?; // Find or compile a block version - let blockid = BlockId { iseq, idx: insn_idx }; let mut ctx = Context::default(); ctx.stack_size = stack_size; + ctx.reg_mapping = reg_mapping; let blockref = match find_block_version(blockid, &ctx) { // If an existing block is found, generate a jump to the block. Some(blockref) => { @@ -3344,8 +3387,9 @@ fn entry_stub_hit_body( get_or_create_iseq_payload(iseq).entries.push(pending_entry.into_entry()); } - // Let the stub jump to the block - blockref.map(|block| unsafe { block.as_ref() }.start_addr.raw_ptr(cb)) + // Return a code pointer if the block is successfully compiled. The entry stub needs + // to jump to the entry preceding the block to load the registers in reg_mapping. + blockref.map(|_block| next_entry.raw_ptr(cb)) } /// Generate a stub that calls entry_stub_hit @@ -3549,6 +3593,13 @@ fn branch_stub_hit_body(branch_ptr: *const c_void, target_idx: u32, ec: EcPtr) - return CodegenGlobals::get_stub_exit_code().raw_ptr(cb); } + // Bail if this branch is housed in an invalidated (dead) block. + // This only happens in rare invalidation scenarios and we need + // to avoid linking a dead block to a live block with a branch. + if branch.block.get().as_ref().iseq.get().is_null() { + return CodegenGlobals::get_stub_exit_code().raw_ptr(cb); + } + (cfp, original_interp_sp) }; @@ -3748,7 +3799,7 @@ pub fn gen_branch_stub_hit_trampoline(ocb: &mut OutlinedCb) -> Option<CodePtr> { let mut asm = Assembler::new_without_iseq(); // For `branch_stub_hit(branch_ptr, target_idx, ec)`, - // `branch_ptr` and `target_idx` is different for each stub, + // `branch_ptr` and `target_idx` are different for each stub, // but the call and what's after is the same. This trampoline // is the unchanging part. // Since this trampoline is static, it allows code GC inside @@ -3782,7 +3833,7 @@ pub fn gen_branch_stub_hit_trampoline(ocb: &mut OutlinedCb) -> Option<CodePtr> { /// Return registers to be pushed and popped on branch_stub_hit. pub fn caller_saved_temp_regs() -> impl Iterator<Item = &'static Reg> + DoubleEndedIterator { - let temp_regs = Assembler::get_temp_regs2().iter(); + let temp_regs = Assembler::get_temp_regs().iter(); let len = temp_regs.len(); // The return value gen_leave() leaves in C_RET_REG // needs to survive the branch_stub_hit() call. @@ -3916,10 +3967,7 @@ pub fn gen_direct_jump(jit: &mut JITState, ctx: &Context, target0: BlockId, asm: } /// Create a stub to force the code up to this point to be executed -pub fn defer_compilation( - jit: &mut JITState, - asm: &mut Assembler, -) { +pub fn defer_compilation(jit: &mut JITState, asm: &mut Assembler) -> Result<(), ()> { if asm.ctx.is_deferred() { panic!("Double defer!"); } @@ -3936,7 +3984,7 @@ pub fn defer_compilation( }; // Likely a stub since the context is marked as deferred(). - let target0_address = branch.set_target(0, blockid, &next_ctx, jit); + let dst_addr = branch.set_target(0, blockid, &next_ctx, jit).ok_or(())?; // Pad the block if it has the potential to be invalidated. 
This must be // done before gen_fn() in case the jump is overwritten by a fallthrough. @@ -3947,9 +3995,7 @@ pub fn defer_compilation( // Call the branch generation function asm_comment!(asm, "defer_compilation"); asm.mark_branch_start(&branch); - if let Some(dst_addr) = target0_address { - branch.gen_fn.call(asm, Target::CodePtr(dst_addr), None); - } + branch.gen_fn.call(asm, Target::CodePtr(dst_addr), None); asm.mark_branch_end(&branch); // If the block we're deferring from is empty @@ -3958,6 +4004,8 @@ pub fn defer_compilation( } incr_counter!(defer_count); + + Ok(()) } /// Remove a block from the live control flow graph. @@ -4138,7 +4186,23 @@ pub fn invalidate_block_version(blockref: &BlockRef) { } // For each incoming branch - for branchref in block.incoming.0.take().iter() { + let mut incoming_branches = block.incoming.0.take(); + + // An adjacent branch will write into the start of the block being invalidated, possibly + // overwriting the block's exit. If we run out of memory after doing this, any subsequent + // incoming branches we rewrite won't be able use the block's exit as a fallback when they + // are unable to generate a stub. To avoid this, if there's an incoming branch that's + // adjacent to the invalidated block, make sure we process it last. + let adjacent_branch_idx = incoming_branches.iter().position(|branchref| { + let branch = unsafe { branchref.as_ref() }; + let target_next = block.start_addr == branch.end_addr.get(); + target_next + }); + if let Some(adjacent_branch_idx) = adjacent_branch_idx { + incoming_branches.swap(adjacent_branch_idx, incoming_branches.len() - 1) + } + + for (i, branchref) in incoming_branches.iter().enumerate() { let branch = unsafe { branchref.as_ref() }; let target_idx = if branch.get_target_address(0) == Some(block_start) { 0 @@ -4178,10 +4242,18 @@ pub fn invalidate_block_version(blockref: &BlockRef) { let target_next = block.start_addr == branch.end_addr.get(); if target_next { - // The new block will no longer be adjacent. - // Note that we could be enlarging the branch and writing into the - // start of the block being invalidated. - branch.gen_fn.set_shape(BranchShape::Default); + if stub_addr != block.start_addr { + // The new block will no longer be adjacent. + // Note that we could be enlarging the branch and writing into the + // start of the block being invalidated. + branch.gen_fn.set_shape(BranchShape::Default); + } else { + // The branch target is still adjacent, so the branch must remain + // a fallthrough so we don't overwrite the target with a jump. + // + // This can happen if we're unable to generate a stub and the + // target block also exits on entry (block_start == block_entry_exit). + } } // Rewrite the branch with the new jump target address @@ -4191,6 +4263,11 @@ pub fn invalidate_block_version(blockref: &BlockRef) { if target_next && branch.end_addr > block.end_addr { panic!("yjit invalidate rewrote branch past end of invalidated block: {:?} (code_size: {})", branch, block.code_size()); } + let is_last_incoming_branch = i == incoming_branches.len() - 1; + if target_next && branch.end_addr.get() > block_entry_exit && !is_last_incoming_branch { + // We might still need to jump to this exit if we run out of memory when rewriting another incoming branch. 
+ panic!("yjit invalidate rewrote branch over exit of invalidated block: {:?}", branch); + } if !target_next && branch.code_size() > old_branch_size { panic!( "invalidated branch grew in size (start_addr: {:?}, old_size: {}, new_size: {})", @@ -4229,11 +4306,9 @@ pub fn invalidate_block_version(blockref: &BlockRef) { incr_counter!(invalidation_count); } -// We cannot deallocate blocks immediately after invalidation since there -// could be stubs waiting to access branch pointers. Return stubs can do -// this since patching the code for setting up return addresses does not -// affect old return addresses that are already set up to use potentially -// invalidated branch pointers. Example: +// We cannot deallocate blocks immediately after invalidation since patching the code for setting +// up return addresses does not affect outstanding return addresses that are on stack and will use +// invalidated branch pointers when hit. Example: // def foo(n) // if n == 2 // # 1.times.each to create a cfunc frame to preserve the JIT frame @@ -4241,13 +4316,16 @@ pub fn invalidate_block_version(blockref: &BlockRef) { // return 1.times.each { Object.define_method(:foo) {} } // end // -// foo(n + 1) +// foo(n + 1) # The block for this call houses the return branch stub // end // p foo(1) pub fn delayed_deallocation(blockref: BlockRef) { block_assumptions_free(blockref); - let payload = get_iseq_payload(unsafe { blockref.as_ref() }.iseq.get()).unwrap(); + let block = unsafe { blockref.as_ref() }; + // Set null ISEQ on the block to signal that it's dead. + let iseq = block.iseq.replace(ptr::null()); + let payload = get_iseq_payload(iseq).unwrap(); payload.dead_blocks.push(blockref); } diff --git a/yjit/src/cruby.rs b/yjit/src/cruby.rs index 25fabec1d0..d34b049a45 100644 --- a/yjit/src/cruby.rs +++ b/yjit/src/cruby.rs @@ -123,7 +123,6 @@ extern "C" { pub fn rb_float_new(d: f64) -> VALUE; pub fn rb_hash_empty_p(hash: VALUE) -> VALUE; - pub fn rb_yjit_str_concat_codepoint(str: VALUE, codepoint: VALUE); pub fn rb_str_setbyte(str: VALUE, index: VALUE, value: VALUE) -> VALUE; pub fn rb_vm_splat_array(flag: VALUE, ary: VALUE) -> VALUE; pub fn rb_vm_concat_array(ary1: VALUE, ary2st: VALUE) -> VALUE; @@ -198,8 +197,8 @@ pub use rb_get_cikw_keywords_idx as get_cikw_keywords_idx; pub use rb_get_call_data_ci as get_call_data_ci; pub use rb_yarv_str_eql_internal as rb_str_eql_internal; pub use rb_yarv_ary_entry_internal as rb_ary_entry_internal; -pub use rb_yjit_fix_div_fix as rb_fix_div_fix; -pub use rb_yjit_fix_mod_fix as rb_fix_mod_fix; +pub use rb_jit_fix_div_fix as rb_fix_div_fix; +pub use rb_jit_fix_mod_fix as rb_fix_mod_fix; pub use rb_FL_TEST as FL_TEST; pub use rb_FL_TEST_RAW as FL_TEST_RAW; pub use rb_RB_TYPE_P as RB_TYPE_P; @@ -362,6 +361,11 @@ impl VALUE { !self.special_const_p() } + /// Shareability between ractors. `RB_OBJ_SHAREABLE_P()`. 
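
The hunks above work as a pair: delayed_deallocation tombstones a dead block by nulling its iseq pointer, and branch_stub_hit_body bails out when it sees that tombstone. A toy, self-contained version of the pattern, using a made-up Block type and a fake address rather than YJIT's real structures:

    use std::cell::Cell;
    use std::ptr;

    struct Block {
        iseq: Cell<*const u8>, // null signals that the block is dead
    }

    impl Block {
        fn invalidate(&self) -> *const u8 {
            // Tombstone the block; return the old pointer so the caller can still find its payload.
            self.iseq.replace(ptr::null())
        }

        fn is_dead(&self) -> bool {
            self.iseq.get().is_null()
        }
    }

    fn main() {
        let block = Block { iseq: Cell::new(0x1000 as *const u8) };
        assert!(!block.is_dead());
        block.invalidate();
        assert!(block.is_dead());
    }
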
+ pub fn shareable_p(self) -> bool { + (self.builtin_flags() & RUBY_FL_SHAREABLE as usize) != 0 + } + /// Return true if the value is a Ruby Fixnum (immediate-size integer) pub fn fixnum_p(self) -> bool { let VALUE(cval) = self; @@ -441,28 +445,16 @@ impl VALUE { } pub fn shape_too_complex(self) -> bool { - unsafe { rb_shape_obj_too_complex(self) } + unsafe { rb_yjit_shape_obj_too_complex_p(self) } } pub fn shape_id_of(self) -> u32 { - unsafe { rb_shape_get_shape_id(self) } - } - - pub fn shape_of(self) -> *mut rb_shape { - unsafe { - let shape = rb_shape_get_shape_by_id(self.shape_id_of()); - - if shape.is_null() { - panic!("Shape should not be null"); - } else { - shape - } - } + unsafe { rb_obj_shape_id(self) } } pub fn embedded_p(self) -> bool { unsafe { - FL_TEST_RAW(self, VALUE(ROBJECT_EMBED as usize)) != VALUE(0) + FL_TEST_RAW(self, VALUE(ROBJECT_HEAP as usize)) == VALUE(0) } } @@ -613,9 +605,15 @@ pub fn rust_str_to_ruby(str: &str) -> VALUE { /// Produce a Ruby symbol from a Rust string slice pub fn rust_str_to_sym(str: &str) -> VALUE { + let id = rust_str_to_id(str); + unsafe { rb_id2sym(id) } +} + +/// Produce an ID from a Rust string slice +pub fn rust_str_to_id(str: &str) -> ID { let c_str = CString::new(str).unwrap(); let c_ptr: *const c_char = c_str.as_ptr(); - unsafe { rb_id2sym(rb_intern(c_ptr)) } + unsafe { rb_intern(c_ptr) } } /// Produce an owned Rust String from a C char pointer @@ -683,7 +681,7 @@ where let line = loc.line; let mut recursive_lock_level: c_uint = 0; - unsafe { rb_yjit_vm_lock_then_barrier(&mut recursive_lock_level, file, line) }; + unsafe { rb_jit_vm_lock_then_barrier(&mut recursive_lock_level, file, line) }; let ret = match catch_unwind(func) { Ok(result) => result, @@ -703,7 +701,7 @@ where } }; - unsafe { rb_yjit_vm_unlock(&mut recursive_lock_level, file, line) }; + unsafe { rb_jit_vm_unlock(&mut recursive_lock_level, file, line) }; ret } @@ -774,12 +772,6 @@ mod manual_defs { pub const RUBY_OFFSET_CFP_JIT_RETURN: i32 = 48; pub const RUBY_SIZEOF_CONTROL_FRAME: usize = 56; - // Constants from rb_execution_context_t vm_core.h - pub const RUBY_OFFSET_EC_CFP: i32 = 16; - pub const RUBY_OFFSET_EC_INTERRUPT_FLAG: i32 = 32; // rb_atomic_t (u32) - pub const RUBY_OFFSET_EC_INTERRUPT_MASK: i32 = 36; // rb_atomic_t (u32) - pub const RUBY_OFFSET_EC_THREAD_PTR: i32 = 48; - // Constants from rb_thread_t in vm_core.h pub const RUBY_OFFSET_THREAD_SELF: i32 = 16; @@ -822,8 +814,11 @@ pub(crate) mod ids { def_ids! { name: NULL content: b"" name: respond_to_missing content: b"respond_to_missing?" + name: method_missing content: b"method_missing" name: to_ary content: b"to_ary" + name: to_s content: b"to_s" name: eq content: b"==" + name: include_p content: b"include?" 
} } diff --git a/yjit/src/cruby_bindings.inc.rs b/yjit/src/cruby_bindings.inc.rs index 4eb44634a1..56994388a3 100644 --- a/yjit/src/cruby_bindings.inc.rs +++ b/yjit/src/cruby_bindings.inc.rs @@ -165,13 +165,13 @@ pub const NIL_REDEFINED_OP_FLAG: u32 = 512; pub const TRUE_REDEFINED_OP_FLAG: u32 = 1024; pub const FALSE_REDEFINED_OP_FLAG: u32 = 2048; pub const PROC_REDEFINED_OP_FLAG: u32 = 4096; +pub const VM_KW_SPECIFIED_BITS_MAX: u32 = 31; pub const VM_ENV_DATA_SIZE: u32 = 3; pub const VM_ENV_DATA_INDEX_ME_CREF: i32 = -2; pub const VM_ENV_DATA_INDEX_SPECVAL: i32 = -1; pub const VM_ENV_DATA_INDEX_FLAGS: u32 = 0; pub const VM_BLOCK_HANDLER_NONE: u32 = 0; pub const SHAPE_ID_NUM_BITS: u32 = 32; -pub const OBJ_TOO_COMPLEX_SHAPE_ID: u32 = 2; pub type ID = ::std::os::raw::c_ulong; pub type rb_alloc_func_t = ::std::option::Option<unsafe extern "C" fn(klass: VALUE) -> VALUE>; pub const RUBY_Qfalse: ruby_special_consts = 0; @@ -223,13 +223,12 @@ pub const RUBY_FL_USHIFT: ruby_fl_ushift = 12; pub type ruby_fl_ushift = u32; pub const RUBY_FL_WB_PROTECTED: ruby_fl_type = 32; pub const RUBY_FL_PROMOTED: ruby_fl_type = 32; -pub const RUBY_FL_UNUSED6: ruby_fl_type = 64; +pub const RUBY_FL_USERPRIV0: ruby_fl_type = 64; pub const RUBY_FL_FINALIZE: ruby_fl_type = 128; -pub const RUBY_FL_TAINT: ruby_fl_type = 0; +pub const RUBY_FL_EXIVAR: ruby_fl_type = 0; pub const RUBY_FL_SHAREABLE: ruby_fl_type = 256; -pub const RUBY_FL_UNTRUSTED: ruby_fl_type = 0; -pub const RUBY_FL_SEEN_OBJ_ID: ruby_fl_type = 512; -pub const RUBY_FL_EXIVAR: ruby_fl_type = 1024; +pub const RUBY_FL_WEAK_REFERENCE: ruby_fl_type = 512; +pub const RUBY_FL_UNUSED10: ruby_fl_type = 1024; pub const RUBY_FL_FREEZE: ruby_fl_type = 2048; pub const RUBY_FL_USER0: ruby_fl_type = 4096; pub const RUBY_FL_USER1: ruby_fl_type = 8192; @@ -251,7 +250,7 @@ pub const RUBY_FL_USER16: ruby_fl_type = 268435456; pub const RUBY_FL_USER17: ruby_fl_type = 536870912; pub const RUBY_FL_USER18: ruby_fl_type = 1073741824; pub const RUBY_FL_USER19: ruby_fl_type = -2147483648; -pub const RUBY_ELTS_SHARED: ruby_fl_type = 16384; +pub const RUBY_ELTS_SHARED: ruby_fl_type = 4096; pub const RUBY_FL_SINGLETON: ruby_fl_type = 8192; pub type ruby_fl_type = i32; pub const RSTRING_NOEMBED: ruby_rstring_flags = 8192; @@ -277,9 +276,9 @@ pub const RARRAY_EMBED_LEN_MASK: ruby_rarray_flags = 4161536; pub type ruby_rarray_flags = u32; pub const RARRAY_EMBED_LEN_SHIFT: ruby_rarray_consts = 15; pub type ruby_rarray_consts = u32; -pub const RMODULE_IS_REFINEMENT: ruby_rmodule_flags = 32768; +pub const RMODULE_IS_REFINEMENT: ruby_rmodule_flags = 8192; pub type ruby_rmodule_flags = u32; -pub const ROBJECT_EMBED: ruby_robject_flags = 8192; +pub const ROBJECT_HEAP: ruby_robject_flags = 65536; pub type ruby_robject_flags = u32; pub type rb_block_call_func = ::std::option::Option< unsafe extern "C" fn( @@ -329,21 +328,23 @@ pub const BOP_NIL_P: ruby_basic_operators = 15; pub const BOP_SUCC: ruby_basic_operators = 16; pub const BOP_GT: ruby_basic_operators = 17; pub const BOP_GE: ruby_basic_operators = 18; -pub const BOP_NOT: ruby_basic_operators = 19; -pub const BOP_NEQ: ruby_basic_operators = 20; -pub const BOP_MATCH: ruby_basic_operators = 21; -pub const BOP_FREEZE: ruby_basic_operators = 22; -pub const BOP_UMINUS: ruby_basic_operators = 23; -pub const BOP_MAX: ruby_basic_operators = 24; -pub const BOP_MIN: ruby_basic_operators = 25; -pub const BOP_HASH: ruby_basic_operators = 26; -pub const BOP_CALL: ruby_basic_operators = 27; -pub const BOP_AND: ruby_basic_operators = 28; -pub const BOP_OR: 
ruby_basic_operators = 29; -pub const BOP_CMP: ruby_basic_operators = 30; -pub const BOP_DEFAULT: ruby_basic_operators = 31; -pub const BOP_PACK: ruby_basic_operators = 32; -pub const BOP_LAST_: ruby_basic_operators = 33; +pub const BOP_GTGT: ruby_basic_operators = 19; +pub const BOP_NOT: ruby_basic_operators = 20; +pub const BOP_NEQ: ruby_basic_operators = 21; +pub const BOP_MATCH: ruby_basic_operators = 22; +pub const BOP_FREEZE: ruby_basic_operators = 23; +pub const BOP_UMINUS: ruby_basic_operators = 24; +pub const BOP_MAX: ruby_basic_operators = 25; +pub const BOP_MIN: ruby_basic_operators = 26; +pub const BOP_HASH: ruby_basic_operators = 27; +pub const BOP_CALL: ruby_basic_operators = 28; +pub const BOP_AND: ruby_basic_operators = 29; +pub const BOP_OR: ruby_basic_operators = 30; +pub const BOP_CMP: ruby_basic_operators = 31; +pub const BOP_DEFAULT: ruby_basic_operators = 32; +pub const BOP_PACK: ruby_basic_operators = 33; +pub const BOP_INCLUDE_P: ruby_basic_operators = 34; +pub const BOP_LAST_: ruby_basic_operators = 35; pub type ruby_basic_operators = u32; pub type rb_serial_t = ::std::os::raw::c_ulonglong; pub const imemo_env: imemo_type = 0; @@ -355,11 +356,10 @@ pub const imemo_memo: imemo_type = 5; pub const imemo_ment: imemo_type = 6; pub const imemo_iseq: imemo_type = 7; pub const imemo_tmpbuf: imemo_type = 8; -pub const imemo_ast: imemo_type = 9; -pub const imemo_parser_strterm: imemo_type = 10; -pub const imemo_callinfo: imemo_type = 11; -pub const imemo_callcache: imemo_type = 12; -pub const imemo_constcache: imemo_type = 13; +pub const imemo_callinfo: imemo_type = 10; +pub const imemo_callcache: imemo_type = 11; +pub const imemo_constcache: imemo_type = 12; +pub const imemo_fields: imemo_type = 13; pub type imemo_type = u32; #[repr(C)] #[derive(Debug, Copy, Clone)] @@ -434,11 +434,6 @@ pub const OPTIMIZED_METHOD_TYPE_STRUCT_AREF: method_optimized_type = 3; pub const OPTIMIZED_METHOD_TYPE_STRUCT_ASET: method_optimized_type = 4; pub const OPTIMIZED_METHOD_TYPE__MAX: method_optimized_type = 5; pub type method_optimized_type = u32; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct rb_id_table { - _unused: [u8; 0], -} pub type rb_num_t = ::std::os::raw::c_ulong; pub const RUBY_TAG_NONE: ruby_tag_type = 0; pub const RUBY_TAG_RETURN: ruby_tag_type = 1; @@ -458,8 +453,6 @@ pub type ruby_vm_throw_flags = u32; pub struct iseq_inline_constant_cache_entry { pub flags: VALUE, pub value: VALUE, - pub _unused1: VALUE, - pub _unused2: VALUE, pub ic_cref: *const rb_cref_t, } #[repr(C)] @@ -471,7 +464,7 @@ pub struct iseq_inline_constant_cache { #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct iseq_inline_iv_cache_entry { - pub value: usize, + pub value: u64, pub iv_set_name: ID, } #[repr(C)] @@ -492,10 +485,11 @@ pub type rb_iseq_type = u32; pub const BUILTIN_ATTR_LEAF: rb_builtin_attr = 1; pub const BUILTIN_ATTR_SINGLE_NOARG_LEAF: rb_builtin_attr = 2; pub const BUILTIN_ATTR_INLINE_BLOCK: rb_builtin_attr = 4; +pub const BUILTIN_ATTR_C_TRACE: rb_builtin_attr = 8; pub type rb_builtin_attr = u32; #[repr(C)] #[derive(Debug, Copy, Clone)] -pub struct rb_iseq_constant_body__bindgen_ty_1_rb_iseq_param_keyword { +pub struct rb_iseq_constant_body_rb_iseq_parameters_rb_iseq_param_keyword { pub num: ::std::os::raw::c_int, pub required_num: ::std::os::raw::c_int, pub bits_start: ::std::os::raw::c_int, @@ -606,6 +600,7 @@ pub const VM_OPT_NEWARRAY_SEND_MIN: vm_opt_newarray_send_type = 2; pub const VM_OPT_NEWARRAY_SEND_HASH: vm_opt_newarray_send_type = 3; pub const VM_OPT_NEWARRAY_SEND_PACK: 
vm_opt_newarray_send_type = 4; pub const VM_OPT_NEWARRAY_SEND_PACK_BUFFER: vm_opt_newarray_send_type = 5; +pub const VM_OPT_NEWARRAY_SEND_INCLUDE_P: vm_opt_newarray_send_type = 6; pub type vm_opt_newarray_send_type = u32; pub const VM_SPECIAL_OBJECT_VMCORE: vm_special_object_type = 1; pub const VM_SPECIAL_OBJECT_CBASE: vm_special_object_type = 2; @@ -631,36 +626,16 @@ pub const VM_FRAME_FLAG_LAMBDA: vm_frame_env_flags = 256; pub const VM_FRAME_FLAG_MODIFIED_BLOCK_PARAM: vm_frame_env_flags = 512; pub const VM_FRAME_FLAG_CFRAME_KW: vm_frame_env_flags = 1024; pub const VM_FRAME_FLAG_PASSED: vm_frame_env_flags = 2048; +pub const VM_FRAME_FLAG_BOX_REQUIRE: vm_frame_env_flags = 4096; pub const VM_ENV_FLAG_LOCAL: vm_frame_env_flags = 2; pub const VM_ENV_FLAG_ESCAPED: vm_frame_env_flags = 4; pub const VM_ENV_FLAG_WB_REQUIRED: vm_frame_env_flags = 8; pub const VM_ENV_FLAG_ISOLATED: vm_frame_env_flags = 16; pub type vm_frame_env_flags = u32; -pub type attr_index_t = u32; +pub type attr_index_t = u16; pub type shape_id_t = u32; -pub type redblack_id_t = u32; -pub type redblack_node_t = redblack_node; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct rb_shape { - pub edges: *mut rb_id_table, - pub edge_name: ID, - pub next_iv_index: attr_index_t, - pub capacity: u32, - pub type_: u8, - pub heap_index: u8, - pub parent_id: shape_id_t, - pub ancestor_index: *mut redblack_node_t, -} -pub type rb_shape_t = rb_shape; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct redblack_node { - pub key: ID, - pub value: *mut rb_shape_t, - pub l: redblack_id_t, - pub r: redblack_id_t, -} +pub const SHAPE_ID_HAS_IVAR_MASK: shape_id_mask = 134742014; +pub type shape_id_mask = u32; #[repr(C)] pub struct rb_cvar_class_tbl_entry { pub index: u32, @@ -704,7 +679,7 @@ pub struct rb_call_data { pub ci: *const rb_callinfo, pub cc: *const rb_callcache, } -pub const RSTRING_CHILLED: ruby_rstring_private_flags = 32768; +pub const RSTRING_CHILLED: ruby_rstring_private_flags = 49152; pub type ruby_rstring_private_flags = u32; pub const RHASH_PASS_AS_KEYWORDS: ruby_rhash_flags = 8192; pub const RHASH_PROC_DEFAULT: ruby_rhash_flags = 16384; @@ -782,42 +757,42 @@ pub const YARVINSN_definesmethod: ruby_vminsn_type = 54; pub const YARVINSN_send: ruby_vminsn_type = 55; pub const YARVINSN_sendforward: ruby_vminsn_type = 56; pub const YARVINSN_opt_send_without_block: ruby_vminsn_type = 57; -pub const YARVINSN_objtostring: ruby_vminsn_type = 58; -pub const YARVINSN_opt_ary_freeze: ruby_vminsn_type = 59; -pub const YARVINSN_opt_hash_freeze: ruby_vminsn_type = 60; -pub const YARVINSN_opt_str_freeze: ruby_vminsn_type = 61; -pub const YARVINSN_opt_nil_p: ruby_vminsn_type = 62; -pub const YARVINSN_opt_str_uminus: ruby_vminsn_type = 63; -pub const YARVINSN_opt_newarray_send: ruby_vminsn_type = 64; -pub const YARVINSN_invokesuper: ruby_vminsn_type = 65; -pub const YARVINSN_invokesuperforward: ruby_vminsn_type = 66; -pub const YARVINSN_invokeblock: ruby_vminsn_type = 67; -pub const YARVINSN_leave: ruby_vminsn_type = 68; -pub const YARVINSN_throw: ruby_vminsn_type = 69; -pub const YARVINSN_jump: ruby_vminsn_type = 70; -pub const YARVINSN_branchif: ruby_vminsn_type = 71; -pub const YARVINSN_branchunless: ruby_vminsn_type = 72; -pub const YARVINSN_branchnil: ruby_vminsn_type = 73; -pub const YARVINSN_once: ruby_vminsn_type = 74; -pub const YARVINSN_opt_case_dispatch: ruby_vminsn_type = 75; -pub const YARVINSN_opt_plus: ruby_vminsn_type = 76; -pub const YARVINSN_opt_minus: ruby_vminsn_type = 77; -pub const YARVINSN_opt_mult: ruby_vminsn_type 
= 78; -pub const YARVINSN_opt_div: ruby_vminsn_type = 79; -pub const YARVINSN_opt_mod: ruby_vminsn_type = 80; -pub const YARVINSN_opt_eq: ruby_vminsn_type = 81; -pub const YARVINSN_opt_neq: ruby_vminsn_type = 82; -pub const YARVINSN_opt_lt: ruby_vminsn_type = 83; -pub const YARVINSN_opt_le: ruby_vminsn_type = 84; -pub const YARVINSN_opt_gt: ruby_vminsn_type = 85; -pub const YARVINSN_opt_ge: ruby_vminsn_type = 86; -pub const YARVINSN_opt_ltlt: ruby_vminsn_type = 87; -pub const YARVINSN_opt_and: ruby_vminsn_type = 88; -pub const YARVINSN_opt_or: ruby_vminsn_type = 89; -pub const YARVINSN_opt_aref: ruby_vminsn_type = 90; -pub const YARVINSN_opt_aset: ruby_vminsn_type = 91; -pub const YARVINSN_opt_aset_with: ruby_vminsn_type = 92; -pub const YARVINSN_opt_aref_with: ruby_vminsn_type = 93; +pub const YARVINSN_opt_new: ruby_vminsn_type = 58; +pub const YARVINSN_objtostring: ruby_vminsn_type = 59; +pub const YARVINSN_opt_ary_freeze: ruby_vminsn_type = 60; +pub const YARVINSN_opt_hash_freeze: ruby_vminsn_type = 61; +pub const YARVINSN_opt_str_freeze: ruby_vminsn_type = 62; +pub const YARVINSN_opt_nil_p: ruby_vminsn_type = 63; +pub const YARVINSN_opt_str_uminus: ruby_vminsn_type = 64; +pub const YARVINSN_opt_duparray_send: ruby_vminsn_type = 65; +pub const YARVINSN_opt_newarray_send: ruby_vminsn_type = 66; +pub const YARVINSN_invokesuper: ruby_vminsn_type = 67; +pub const YARVINSN_invokesuperforward: ruby_vminsn_type = 68; +pub const YARVINSN_invokeblock: ruby_vminsn_type = 69; +pub const YARVINSN_leave: ruby_vminsn_type = 70; +pub const YARVINSN_throw: ruby_vminsn_type = 71; +pub const YARVINSN_jump: ruby_vminsn_type = 72; +pub const YARVINSN_branchif: ruby_vminsn_type = 73; +pub const YARVINSN_branchunless: ruby_vminsn_type = 74; +pub const YARVINSN_branchnil: ruby_vminsn_type = 75; +pub const YARVINSN_once: ruby_vminsn_type = 76; +pub const YARVINSN_opt_case_dispatch: ruby_vminsn_type = 77; +pub const YARVINSN_opt_plus: ruby_vminsn_type = 78; +pub const YARVINSN_opt_minus: ruby_vminsn_type = 79; +pub const YARVINSN_opt_mult: ruby_vminsn_type = 80; +pub const YARVINSN_opt_div: ruby_vminsn_type = 81; +pub const YARVINSN_opt_mod: ruby_vminsn_type = 82; +pub const YARVINSN_opt_eq: ruby_vminsn_type = 83; +pub const YARVINSN_opt_neq: ruby_vminsn_type = 84; +pub const YARVINSN_opt_lt: ruby_vminsn_type = 85; +pub const YARVINSN_opt_le: ruby_vminsn_type = 86; +pub const YARVINSN_opt_gt: ruby_vminsn_type = 87; +pub const YARVINSN_opt_ge: ruby_vminsn_type = 88; +pub const YARVINSN_opt_ltlt: ruby_vminsn_type = 89; +pub const YARVINSN_opt_and: ruby_vminsn_type = 90; +pub const YARVINSN_opt_or: ruby_vminsn_type = 91; +pub const YARVINSN_opt_aref: ruby_vminsn_type = 92; +pub const YARVINSN_opt_aset: ruby_vminsn_type = 93; pub const YARVINSN_opt_length: ruby_vminsn_type = 94; pub const YARVINSN_opt_size: ruby_vminsn_type = 95; pub const YARVINSN_opt_empty_p: ruby_vminsn_type = 96; @@ -891,42 +866,42 @@ pub const YARVINSN_trace_definesmethod: ruby_vminsn_type = 163; pub const YARVINSN_trace_send: ruby_vminsn_type = 164; pub const YARVINSN_trace_sendforward: ruby_vminsn_type = 165; pub const YARVINSN_trace_opt_send_without_block: ruby_vminsn_type = 166; -pub const YARVINSN_trace_objtostring: ruby_vminsn_type = 167; -pub const YARVINSN_trace_opt_ary_freeze: ruby_vminsn_type = 168; -pub const YARVINSN_trace_opt_hash_freeze: ruby_vminsn_type = 169; -pub const YARVINSN_trace_opt_str_freeze: ruby_vminsn_type = 170; -pub const YARVINSN_trace_opt_nil_p: ruby_vminsn_type = 171; -pub const YARVINSN_trace_opt_str_uminus: 
ruby_vminsn_type = 172; -pub const YARVINSN_trace_opt_newarray_send: ruby_vminsn_type = 173; -pub const YARVINSN_trace_invokesuper: ruby_vminsn_type = 174; -pub const YARVINSN_trace_invokesuperforward: ruby_vminsn_type = 175; -pub const YARVINSN_trace_invokeblock: ruby_vminsn_type = 176; -pub const YARVINSN_trace_leave: ruby_vminsn_type = 177; -pub const YARVINSN_trace_throw: ruby_vminsn_type = 178; -pub const YARVINSN_trace_jump: ruby_vminsn_type = 179; -pub const YARVINSN_trace_branchif: ruby_vminsn_type = 180; -pub const YARVINSN_trace_branchunless: ruby_vminsn_type = 181; -pub const YARVINSN_trace_branchnil: ruby_vminsn_type = 182; -pub const YARVINSN_trace_once: ruby_vminsn_type = 183; -pub const YARVINSN_trace_opt_case_dispatch: ruby_vminsn_type = 184; -pub const YARVINSN_trace_opt_plus: ruby_vminsn_type = 185; -pub const YARVINSN_trace_opt_minus: ruby_vminsn_type = 186; -pub const YARVINSN_trace_opt_mult: ruby_vminsn_type = 187; -pub const YARVINSN_trace_opt_div: ruby_vminsn_type = 188; -pub const YARVINSN_trace_opt_mod: ruby_vminsn_type = 189; -pub const YARVINSN_trace_opt_eq: ruby_vminsn_type = 190; -pub const YARVINSN_trace_opt_neq: ruby_vminsn_type = 191; -pub const YARVINSN_trace_opt_lt: ruby_vminsn_type = 192; -pub const YARVINSN_trace_opt_le: ruby_vminsn_type = 193; -pub const YARVINSN_trace_opt_gt: ruby_vminsn_type = 194; -pub const YARVINSN_trace_opt_ge: ruby_vminsn_type = 195; -pub const YARVINSN_trace_opt_ltlt: ruby_vminsn_type = 196; -pub const YARVINSN_trace_opt_and: ruby_vminsn_type = 197; -pub const YARVINSN_trace_opt_or: ruby_vminsn_type = 198; -pub const YARVINSN_trace_opt_aref: ruby_vminsn_type = 199; -pub const YARVINSN_trace_opt_aset: ruby_vminsn_type = 200; -pub const YARVINSN_trace_opt_aset_with: ruby_vminsn_type = 201; -pub const YARVINSN_trace_opt_aref_with: ruby_vminsn_type = 202; +pub const YARVINSN_trace_opt_new: ruby_vminsn_type = 167; +pub const YARVINSN_trace_objtostring: ruby_vminsn_type = 168; +pub const YARVINSN_trace_opt_ary_freeze: ruby_vminsn_type = 169; +pub const YARVINSN_trace_opt_hash_freeze: ruby_vminsn_type = 170; +pub const YARVINSN_trace_opt_str_freeze: ruby_vminsn_type = 171; +pub const YARVINSN_trace_opt_nil_p: ruby_vminsn_type = 172; +pub const YARVINSN_trace_opt_str_uminus: ruby_vminsn_type = 173; +pub const YARVINSN_trace_opt_duparray_send: ruby_vminsn_type = 174; +pub const YARVINSN_trace_opt_newarray_send: ruby_vminsn_type = 175; +pub const YARVINSN_trace_invokesuper: ruby_vminsn_type = 176; +pub const YARVINSN_trace_invokesuperforward: ruby_vminsn_type = 177; +pub const YARVINSN_trace_invokeblock: ruby_vminsn_type = 178; +pub const YARVINSN_trace_leave: ruby_vminsn_type = 179; +pub const YARVINSN_trace_throw: ruby_vminsn_type = 180; +pub const YARVINSN_trace_jump: ruby_vminsn_type = 181; +pub const YARVINSN_trace_branchif: ruby_vminsn_type = 182; +pub const YARVINSN_trace_branchunless: ruby_vminsn_type = 183; +pub const YARVINSN_trace_branchnil: ruby_vminsn_type = 184; +pub const YARVINSN_trace_once: ruby_vminsn_type = 185; +pub const YARVINSN_trace_opt_case_dispatch: ruby_vminsn_type = 186; +pub const YARVINSN_trace_opt_plus: ruby_vminsn_type = 187; +pub const YARVINSN_trace_opt_minus: ruby_vminsn_type = 188; +pub const YARVINSN_trace_opt_mult: ruby_vminsn_type = 189; +pub const YARVINSN_trace_opt_div: ruby_vminsn_type = 190; +pub const YARVINSN_trace_opt_mod: ruby_vminsn_type = 191; +pub const YARVINSN_trace_opt_eq: ruby_vminsn_type = 192; +pub const YARVINSN_trace_opt_neq: ruby_vminsn_type = 193; +pub const 
YARVINSN_trace_opt_lt: ruby_vminsn_type = 194; +pub const YARVINSN_trace_opt_le: ruby_vminsn_type = 195; +pub const YARVINSN_trace_opt_gt: ruby_vminsn_type = 196; +pub const YARVINSN_trace_opt_ge: ruby_vminsn_type = 197; +pub const YARVINSN_trace_opt_ltlt: ruby_vminsn_type = 198; +pub const YARVINSN_trace_opt_and: ruby_vminsn_type = 199; +pub const YARVINSN_trace_opt_or: ruby_vminsn_type = 200; +pub const YARVINSN_trace_opt_aref: ruby_vminsn_type = 201; +pub const YARVINSN_trace_opt_aset: ruby_vminsn_type = 202; pub const YARVINSN_trace_opt_length: ruby_vminsn_type = 203; pub const YARVINSN_trace_opt_size: ruby_vminsn_type = 204; pub const YARVINSN_trace_opt_empty_p: ruby_vminsn_type = 205; @@ -942,7 +917,38 @@ pub const YARVINSN_trace_setlocal_WC_0: ruby_vminsn_type = 214; pub const YARVINSN_trace_setlocal_WC_1: ruby_vminsn_type = 215; pub const YARVINSN_trace_putobject_INT2FIX_0_: ruby_vminsn_type = 216; pub const YARVINSN_trace_putobject_INT2FIX_1_: ruby_vminsn_type = 217; -pub const VM_INSTRUCTION_SIZE: ruby_vminsn_type = 218; +pub const YARVINSN_zjit_getinstancevariable: ruby_vminsn_type = 218; +pub const YARVINSN_zjit_setinstancevariable: ruby_vminsn_type = 219; +pub const YARVINSN_zjit_definedivar: ruby_vminsn_type = 220; +pub const YARVINSN_zjit_send: ruby_vminsn_type = 221; +pub const YARVINSN_zjit_opt_send_without_block: ruby_vminsn_type = 222; +pub const YARVINSN_zjit_objtostring: ruby_vminsn_type = 223; +pub const YARVINSN_zjit_opt_nil_p: ruby_vminsn_type = 224; +pub const YARVINSN_zjit_invokesuper: ruby_vminsn_type = 225; +pub const YARVINSN_zjit_invokeblock: ruby_vminsn_type = 226; +pub const YARVINSN_zjit_opt_plus: ruby_vminsn_type = 227; +pub const YARVINSN_zjit_opt_minus: ruby_vminsn_type = 228; +pub const YARVINSN_zjit_opt_mult: ruby_vminsn_type = 229; +pub const YARVINSN_zjit_opt_div: ruby_vminsn_type = 230; +pub const YARVINSN_zjit_opt_mod: ruby_vminsn_type = 231; +pub const YARVINSN_zjit_opt_eq: ruby_vminsn_type = 232; +pub const YARVINSN_zjit_opt_neq: ruby_vminsn_type = 233; +pub const YARVINSN_zjit_opt_lt: ruby_vminsn_type = 234; +pub const YARVINSN_zjit_opt_le: ruby_vminsn_type = 235; +pub const YARVINSN_zjit_opt_gt: ruby_vminsn_type = 236; +pub const YARVINSN_zjit_opt_ge: ruby_vminsn_type = 237; +pub const YARVINSN_zjit_opt_ltlt: ruby_vminsn_type = 238; +pub const YARVINSN_zjit_opt_and: ruby_vminsn_type = 239; +pub const YARVINSN_zjit_opt_or: ruby_vminsn_type = 240; +pub const YARVINSN_zjit_opt_aref: ruby_vminsn_type = 241; +pub const YARVINSN_zjit_opt_aset: ruby_vminsn_type = 242; +pub const YARVINSN_zjit_opt_length: ruby_vminsn_type = 243; +pub const YARVINSN_zjit_opt_size: ruby_vminsn_type = 244; +pub const YARVINSN_zjit_opt_empty_p: ruby_vminsn_type = 245; +pub const YARVINSN_zjit_opt_succ: ruby_vminsn_type = 246; +pub const YARVINSN_zjit_opt_not: ruby_vminsn_type = 247; +pub const YARVINSN_zjit_opt_regexpmatch2: ruby_vminsn_type = 248; +pub const VM_INSTRUCTION_SIZE: ruby_vminsn_type = 249; pub type ruby_vminsn_type = u32; pub type rb_iseq_callback = ::std::option::Option< unsafe extern "C" fn(arg1: *const rb_iseq_t, arg2: *mut ::std::os::raw::c_void), @@ -966,13 +972,19 @@ pub const DEFINED_REF: defined_type = 15; pub const DEFINED_FUNC: defined_type = 16; pub const DEFINED_CONST_FROM: defined_type = 17; pub type defined_type = u32; -pub const ROBJECT_OFFSET_AS_HEAP_IVPTR: robject_offsets = 16; -pub const ROBJECT_OFFSET_AS_HEAP_IV_INDEX_TBL: robject_offsets = 24; -pub const ROBJECT_OFFSET_AS_ARY: robject_offsets = 16; -pub type robject_offsets = u32; -pub 
const RUBY_OFFSET_RSTRING_LEN: rstring_offsets = 16; -pub type rstring_offsets = u32; -pub type rb_seq_param_keyword_struct = rb_iseq_constant_body__bindgen_ty_1_rb_iseq_param_keyword; +pub type rb_seq_param_keyword_struct = + rb_iseq_constant_body_rb_iseq_parameters_rb_iseq_param_keyword; +pub const ROBJECT_OFFSET_AS_HEAP_FIELDS: jit_bindgen_constants = 16; +pub const ROBJECT_OFFSET_AS_ARY: jit_bindgen_constants = 16; +pub const RUBY_OFFSET_RSTRING_LEN: jit_bindgen_constants = 16; +pub const RUBY_OFFSET_EC_CFP: jit_bindgen_constants = 16; +pub const RUBY_OFFSET_EC_INTERRUPT_FLAG: jit_bindgen_constants = 32; +pub const RUBY_OFFSET_EC_INTERRUPT_MASK: jit_bindgen_constants = 36; +pub const RUBY_OFFSET_EC_THREAD_PTR: jit_bindgen_constants = 48; +pub const RUBY_OFFSET_EC_RACTOR_ID: jit_bindgen_constants = 64; +pub type jit_bindgen_constants = u32; +pub type rb_iseq_param_keyword_struct = + rb_iseq_constant_body_rb_iseq_parameters_rb_iseq_param_keyword; extern "C" { pub fn ruby_xfree(ptr: *mut ::std::os::raw::c_void); pub fn rb_class_attached_object(klass: VALUE) -> VALUE; @@ -986,6 +998,7 @@ extern "C" { pub fn rb_gc_location(obj: VALUE) -> VALUE; pub fn rb_gc_writebarrier(old: VALUE, young: VALUE); pub fn rb_class_get_superclass(klass: VALUE) -> VALUE; + pub fn rb_funcall(recv: VALUE, mid: ID, n: ::std::os::raw::c_int, ...) -> VALUE; pub static mut rb_mKernel: VALUE; pub static mut rb_cBasicObject: VALUE; pub static mut rb_cArray: VALUE; @@ -1021,7 +1034,13 @@ extern "C" { pub fn rb_intern2(name: *const ::std::os::raw::c_char, len: ::std::os::raw::c_long) -> ID; pub fn rb_id2name(id: ID) -> *const ::std::os::raw::c_char; pub fn rb_class2name(klass: VALUE) -> *const ::std::os::raw::c_char; + pub fn rb_class_new_instance_pass_kw( + argc: ::std::os::raw::c_int, + argv: *const VALUE, + klass: VALUE, + ) -> VALUE; pub fn rb_obj_is_kind_of(obj: VALUE, klass: VALUE) -> VALUE; + pub fn rb_obj_alloc(klass: VALUE) -> VALUE; pub fn rb_obj_frozen_p(obj: VALUE) -> VALUE; pub fn rb_backref_get() -> VALUE; pub fn rb_range_new(beg: VALUE, end: VALUE, excl: ::std::os::raw::c_int) -> VALUE; @@ -1041,6 +1060,7 @@ extern "C" { pub fn rb_ivar_get(obj: VALUE, name: ID) -> VALUE; pub fn rb_ivar_defined(obj: VALUE, name: ID) -> VALUE; pub fn rb_attr_get(obj: VALUE, name: ID) -> VALUE; + pub fn rb_const_get(space: VALUE, name: ID) -> VALUE; pub fn rb_obj_info_dump(obj: VALUE); pub fn rb_class_allocate_instance(klass: VALUE) -> VALUE; pub fn rb_obj_equal(obj1: VALUE, obj2: VALUE) -> VALUE; @@ -1056,13 +1076,14 @@ extern "C" { elts: *const VALUE, ) -> VALUE; pub fn rb_vm_top_self() -> VALUE; - pub static mut rb_vm_insns_count: u64; + pub static mut rb_vm_insn_count: u64; pub fn rb_method_entry_at(obj: VALUE, id: ID) -> *const rb_method_entry_t; pub fn rb_callable_method_entry(klass: VALUE, id: ID) -> *const rb_callable_method_entry_t; pub fn rb_callable_method_entry_or_negative( klass: VALUE, id: ID, ) -> *const rb_callable_method_entry_t; + pub static mut rb_cRubyVM: VALUE; pub static mut rb_mRubyVMFrozenCore: VALUE; pub static mut rb_block_param_proxy: VALUE; pub fn rb_vm_ep_local_ep(ep: *const VALUE) -> *const VALUE; @@ -1075,21 +1096,26 @@ extern "C" { pub fn rb_obj_info(obj: VALUE) -> *const ::std::os::raw::c_char; pub fn rb_ec_stack_check(ec: *mut rb_execution_context_struct) -> ::std::os::raw::c_int; pub fn rb_shape_id_offset() -> i32; - pub fn rb_shape_get_shape_by_id(shape_id: shape_id_t) -> *mut rb_shape_t; - pub fn rb_shape_get_shape_id(obj: VALUE) -> shape_id_t; - pub fn rb_shape_get_iv_index(shape: *mut 
rb_shape_t, id: ID, value: *mut attr_index_t) -> bool; - pub fn rb_shape_obj_too_complex(obj: VALUE) -> bool; - pub fn rb_shape_get_next_no_warnings( - shape: *mut rb_shape_t, - obj: VALUE, + pub fn rb_obj_shape_id(obj: VALUE) -> shape_id_t; + pub fn rb_shape_get_iv_index(shape_id: shape_id_t, id: ID, value: *mut attr_index_t) -> bool; + pub fn rb_shape_transition_add_ivar_no_warnings( + klass: VALUE, + original_shape_id: shape_id_t, id: ID, - ) -> *mut rb_shape_t; - pub fn rb_shape_id(shape: *mut rb_shape_t) -> shape_id_t; + ) -> shape_id_t; + pub fn rb_ivar_get_at(obj: VALUE, index: attr_index_t, id: ID) -> VALUE; + pub fn rb_ivar_get_at_no_ractor_check(obj: VALUE, index: attr_index_t) -> VALUE; pub fn rb_gvar_get(arg1: ID) -> VALUE; pub fn rb_gvar_set(arg1: ID, arg2: VALUE) -> VALUE; - pub fn rb_ensure_iv_list_size(obj: VALUE, len: u32, newsize: u32); + pub fn rb_ensure_iv_list_size(obj: VALUE, current_len: u32, newsize: u32); pub fn rb_vm_barrier(); pub fn rb_str_byte_substr(str_: VALUE, beg: VALUE, len: VALUE) -> VALUE; + pub fn rb_str_substr_two_fixnums( + str_: VALUE, + beg: VALUE, + len: VALUE, + empty: ::std::os::raw::c_int, + ) -> VALUE; pub fn rb_obj_as_string_result(str_: VALUE, obj: VALUE) -> VALUE; pub fn rb_str_concat_literals(num: usize, strary: *const VALUE) -> VALUE; pub fn rb_ec_str_resurrect( @@ -1128,32 +1154,58 @@ extern "C" { lines: *mut ::std::os::raw::c_int, ) -> ::std::os::raw::c_int; pub fn rb_jit_cont_each_iseq(callback: rb_iseq_callback, data: *mut ::std::os::raw::c_void); - pub fn rb_yjit_mark_writable(mem_block: *mut ::std::os::raw::c_void, mem_size: u32) -> bool; - pub fn rb_yjit_mark_executable(mem_block: *mut ::std::os::raw::c_void, mem_size: u32); - pub fn rb_yjit_mark_unused(mem_block: *mut ::std::os::raw::c_void, mem_size: u32) -> bool; - pub fn rb_yjit_array_len(a: VALUE) -> ::std::os::raw::c_long; - pub fn rb_yjit_icache_invalidate( - start: *mut ::std::os::raw::c_void, - end: *mut ::std::os::raw::c_void, - ); pub fn rb_yjit_exit_locations_dict( yjit_raw_samples: *mut VALUE, yjit_line_samples: *mut ::std::os::raw::c_int, samples_len: ::std::os::raw::c_int, ) -> VALUE; - pub fn rb_yjit_get_page_size() -> u32; - pub fn rb_yjit_reserve_addr_space(mem_size: u32) -> *mut u8; pub fn rb_c_method_tracing_currently_enabled(ec: *const rb_execution_context_t) -> bool; pub fn rb_full_cfunc_return(ec: *mut rb_execution_context_t, return_value: VALUE); - pub fn rb_iseq_encoded_size(iseq: *const rb_iseq_t) -> ::std::os::raw::c_uint; pub fn rb_iseq_get_yjit_payload(iseq: *const rb_iseq_t) -> *mut ::std::os::raw::c_void; pub fn rb_iseq_set_yjit_payload(iseq: *const rb_iseq_t, payload: *mut ::std::os::raw::c_void); - pub fn rb_iseq_reset_jit_func(iseq: *const rb_iseq_t); + pub fn rb_get_symbol_id(namep: VALUE) -> ID; + pub fn rb_yjit_builtin_function(iseq: *const rb_iseq_t) -> *const rb_builtin_function; + pub fn rb_yjit_str_simple_append(str1: VALUE, str2: VALUE) -> VALUE; + pub fn rb_vm_base_ptr(cfp: *mut rb_control_frame_struct) -> *mut VALUE; + pub fn rb_str_neq_internal(str1: VALUE, str2: VALUE) -> VALUE; + pub fn rb_ary_unshift_m(argc: ::std::os::raw::c_int, argv: *mut VALUE, ary: VALUE) -> VALUE; + pub fn rb_yjit_rb_ary_subseq_length(ary: VALUE, beg: ::std::os::raw::c_long) -> VALUE; + pub fn rb_yjit_ruby2_keywords_splat_p(obj: VALUE) -> usize; + pub fn rb_yjit_splat_varg_checks( + sp: *mut VALUE, + splat_array: VALUE, + cfp: *mut rb_control_frame_t, + ) -> VALUE; + pub fn rb_yjit_splat_varg_cfunc(stack_splat_array: *mut VALUE) -> ::std::os::raw::c_int; + pub fn 
rb_yjit_dump_iseq_loc(iseq: *const rb_iseq_t, insn_idx: u32); + pub fn rb_yjit_iseq_inspect(iseq: *const rb_iseq_t) -> *mut ::std::os::raw::c_char; + pub fn rb_RSTRUCT_SET(st: VALUE, k: ::std::os::raw::c_int, v: VALUE); + pub fn rb_ENCODING_GET(obj: VALUE) -> ::std::os::raw::c_int; + pub fn rb_yjit_constcache_shareable(ice: *const iseq_inline_constant_cache_entry) -> bool; + pub fn rb_yjit_obj_written( + old: VALUE, + young: VALUE, + file: *const ::std::os::raw::c_char, + line: ::std::os::raw::c_int, + ); + pub fn rb_object_shape_count() -> VALUE; + pub fn rb_yjit_shape_obj_too_complex_p(obj: VALUE) -> bool; + pub fn rb_yjit_shape_capacity(shape_id: shape_id_t) -> attr_index_t; + pub fn rb_yjit_shape_index(shape_id: shape_id_t) -> attr_index_t; + pub fn rb_yjit_sendish_sp_pops(ci: *const rb_callinfo) -> usize; + pub fn rb_yjit_invokeblock_sp_pops(ci: *const rb_callinfo) -> usize; + pub fn rb_yjit_cme_ractor_serial(cme: *const rb_callable_method_entry_t) -> rb_serial_t; + pub fn rb_yjit_set_exception_return( + cfp: *mut rb_control_frame_t, + leave_exit: *mut ::std::os::raw::c_void, + leave_exception: *mut ::std::os::raw::c_void, + ); + pub fn rb_vm_instruction_size() -> u32; + pub fn rb_iseq_encoded_size(iseq: *const rb_iseq_t) -> ::std::os::raw::c_uint; pub fn rb_iseq_pc_at_idx(iseq: *const rb_iseq_t, insn_idx: u32) -> *mut VALUE; pub fn rb_iseq_opcode_at_pc(iseq: *const rb_iseq_t, pc: *const VALUE) -> ::std::os::raw::c_int; pub fn rb_RSTRING_LEN(str_: VALUE) -> ::std::os::raw::c_ulong; pub fn rb_RSTRING_PTR(str_: VALUE) -> *mut ::std::os::raw::c_char; - pub fn rb_yjit_get_proc_ptr(procv: VALUE) -> *mut rb_proc_t; pub fn rb_insn_name(insn: VALUE) -> *const ::std::os::raw::c_char; pub fn rb_vm_ci_argc(ci: *const rb_callinfo) -> ::std::os::raw::c_uint; pub fn rb_vm_ci_mid(ci: *const rb_callinfo) -> ID; @@ -1167,7 +1219,6 @@ extern "C" { pub fn rb_METHOD_ENTRY_VISI(me: *const rb_callable_method_entry_t) -> rb_method_visibility_t; pub fn rb_get_cme_def_type(cme: *const rb_callable_method_entry_t) -> rb_method_type_t; pub fn rb_get_cme_def_body_attr_id(cme: *const rb_callable_method_entry_t) -> ID; - pub fn rb_get_symbol_id(namep: VALUE) -> ID; pub fn rb_get_cme_def_body_optimized_type( cme: *const rb_callable_method_entry_t, ) -> method_optimized_type; @@ -1179,10 +1230,20 @@ extern "C" { ) -> *mut rb_method_cfunc_t; pub fn rb_get_def_method_serial(def: *const rb_method_definition_t) -> usize; pub fn rb_get_def_original_id(def: *const rb_method_definition_t) -> ID; + pub fn rb_get_def_bmethod_proc(def: *mut rb_method_definition_t) -> VALUE; + pub fn rb_jit_get_proc_ptr(procv: VALUE) -> *mut rb_proc_t; + pub fn rb_optimized_call( + recv: *mut VALUE, + ec: *mut rb_execution_context_t, + argc: ::std::os::raw::c_int, + argv: *mut VALUE, + kw_splat: ::std::os::raw::c_int, + block_handler: VALUE, + ) -> VALUE; + pub fn rb_jit_iseq_builtin_attrs(iseq: *const rb_iseq_t) -> ::std::os::raw::c_uint; pub fn rb_get_mct_argc(mct: *const rb_method_cfunc_t) -> ::std::os::raw::c_int; pub fn rb_get_mct_func(mct: *const rb_method_cfunc_t) -> *mut ::std::os::raw::c_void; pub fn rb_get_def_iseq_ptr(def: *mut rb_method_definition_t) -> *const rb_iseq_t; - pub fn rb_get_def_bmethod_proc(def: *mut rb_method_definition_t) -> VALUE; pub fn rb_get_iseq_body_local_iseq(iseq: *const rb_iseq_t) -> *const rb_iseq_t; pub fn rb_get_iseq_body_parent_iseq(iseq: *const rb_iseq_t) -> *const rb_iseq_t; pub fn rb_get_iseq_body_local_table_size(iseq: *const rb_iseq_t) -> ::std::os::raw::c_uint; @@ -1203,87 +1264,59 @@ extern "C" { 
pub fn rb_get_iseq_flags_forwardable(iseq: *const rb_iseq_t) -> bool; pub fn rb_get_iseq_body_param_keyword( iseq: *const rb_iseq_t, - ) -> *const rb_seq_param_keyword_struct; + ) -> *const rb_iseq_param_keyword_struct; pub fn rb_get_iseq_body_param_size(iseq: *const rb_iseq_t) -> ::std::os::raw::c_uint; pub fn rb_get_iseq_body_param_lead_num(iseq: *const rb_iseq_t) -> ::std::os::raw::c_int; pub fn rb_get_iseq_body_param_opt_num(iseq: *const rb_iseq_t) -> ::std::os::raw::c_int; pub fn rb_get_iseq_body_param_opt_table(iseq: *const rb_iseq_t) -> *const VALUE; - pub fn rb_optimized_call( - recv: *mut VALUE, - ec: *mut rb_execution_context_t, - argc: ::std::os::raw::c_int, - argv: *mut VALUE, - kw_splat: ::std::os::raw::c_int, - block_handler: VALUE, - ) -> VALUE; - pub fn rb_yjit_iseq_builtin_attrs(iseq: *const rb_iseq_t) -> ::std::os::raw::c_uint; - pub fn rb_yjit_builtin_function(iseq: *const rb_iseq_t) -> *const rb_builtin_function; - pub fn rb_yjit_str_simple_append(str1: VALUE, str2: VALUE) -> VALUE; pub fn rb_get_ec_cfp(ec: *const rb_execution_context_t) -> *mut rb_control_frame_struct; pub fn rb_get_cfp_iseq(cfp: *mut rb_control_frame_struct) -> *const rb_iseq_t; pub fn rb_get_cfp_pc(cfp: *mut rb_control_frame_struct) -> *mut VALUE; pub fn rb_get_cfp_sp(cfp: *mut rb_control_frame_struct) -> *mut VALUE; - pub fn rb_set_cfp_pc(cfp: *mut rb_control_frame_struct, pc: *const VALUE); - pub fn rb_set_cfp_sp(cfp: *mut rb_control_frame_struct, sp: *mut VALUE); pub fn rb_get_cfp_self(cfp: *mut rb_control_frame_struct) -> VALUE; pub fn rb_get_cfp_ep(cfp: *mut rb_control_frame_struct) -> *mut VALUE; pub fn rb_get_cfp_ep_level(cfp: *mut rb_control_frame_struct, lv: u32) -> *const VALUE; - pub fn rb_vm_base_ptr(cfp: *mut rb_control_frame_struct) -> *mut VALUE; pub fn rb_yarv_class_of(obj: VALUE) -> VALUE; - pub fn rb_yarv_str_eql_internal(str1: VALUE, str2: VALUE) -> VALUE; - pub fn rb_str_neq_internal(str1: VALUE, str2: VALUE) -> VALUE; - pub fn rb_yarv_ary_entry_internal(ary: VALUE, offset: ::std::os::raw::c_long) -> VALUE; - pub fn rb_ary_unshift_m(argc: ::std::os::raw::c_int, argv: *mut VALUE, ary: VALUE) -> VALUE; - pub fn rb_yjit_rb_ary_subseq_length(ary: VALUE, beg: ::std::os::raw::c_long) -> VALUE; - pub fn rb_yjit_fix_div_fix(recv: VALUE, obj: VALUE) -> VALUE; - pub fn rb_yjit_fix_mod_fix(recv: VALUE, obj: VALUE) -> VALUE; - pub fn rb_yjit_ruby2_keywords_splat_p(obj: VALUE) -> usize; - pub fn rb_yjit_splat_varg_checks( - sp: *mut VALUE, - splat_array: VALUE, - cfp: *mut rb_control_frame_t, - ) -> VALUE; - pub fn rb_yjit_splat_varg_cfunc(stack_splat_array: *mut VALUE) -> ::std::os::raw::c_int; - pub fn rb_yjit_dump_iseq_loc(iseq: *const rb_iseq_t, insn_idx: u32); - pub fn rb_yjit_iseq_inspect(iseq: *const rb_iseq_t) -> *mut ::std::os::raw::c_char; pub fn rb_FL_TEST(obj: VALUE, flags: VALUE) -> VALUE; pub fn rb_FL_TEST_RAW(obj: VALUE, flags: VALUE) -> VALUE; pub fn rb_RB_TYPE_P(obj: VALUE, t: ruby_value_type) -> bool; pub fn rb_RSTRUCT_LEN(st: VALUE) -> ::std::os::raw::c_long; - pub fn rb_RSTRUCT_SET(st: VALUE, k: ::std::os::raw::c_int, v: VALUE); pub fn rb_get_call_data_ci(cd: *const rb_call_data) -> *const rb_callinfo; pub fn rb_BASIC_OP_UNREDEFINED_P(bop: ruby_basic_operators, klass: u32) -> bool; pub fn rb_RCLASS_ORIGIN(c: VALUE) -> VALUE; - pub fn rb_ENCODING_GET(obj: VALUE) -> ::std::os::raw::c_int; - pub fn rb_yjit_multi_ractor_p() -> bool; pub fn rb_assert_iseq_handle(handle: VALUE); + pub fn rb_assert_holding_vm_lock(); pub fn rb_IMEMO_TYPE_P(imemo: VALUE, imemo_type: imemo_type) -> 
::std::os::raw::c_int; - pub fn rb_yjit_constcache_shareable(ice: *const iseq_inline_constant_cache_entry) -> bool; pub fn rb_assert_cme_handle(handle: VALUE); - pub fn rb_yjit_for_each_iseq(callback: rb_iseq_callback, data: *mut ::std::os::raw::c_void); - pub fn rb_yjit_obj_written( - old: VALUE, - young: VALUE, - file: *const ::std::os::raw::c_char, - line: ::std::os::raw::c_int, - ); - pub fn rb_yjit_vm_lock_then_barrier( + pub fn rb_yarv_ary_entry_internal(ary: VALUE, offset: ::std::os::raw::c_long) -> VALUE; + pub fn rb_jit_array_len(a: VALUE) -> ::std::os::raw::c_long; + pub fn rb_set_cfp_pc(cfp: *mut rb_control_frame_struct, pc: *const VALUE); + pub fn rb_set_cfp_sp(cfp: *mut rb_control_frame_struct, sp: *mut VALUE); + pub fn rb_jit_shape_too_complex_p(shape_id: shape_id_t) -> bool; + pub fn rb_jit_multi_ractor_p() -> bool; + pub fn rb_jit_vm_lock_then_barrier( recursive_lock_level: *mut ::std::os::raw::c_uint, file: *const ::std::os::raw::c_char, line: ::std::os::raw::c_int, ); - pub fn rb_yjit_vm_unlock( + pub fn rb_jit_vm_unlock( recursive_lock_level: *mut ::std::os::raw::c_uint, file: *const ::std::os::raw::c_char, line: ::std::os::raw::c_int, ); - pub fn rb_object_shape_count() -> VALUE; - pub fn rb_yjit_assert_holding_vm_lock(); - pub fn rb_yjit_sendish_sp_pops(ci: *const rb_callinfo) -> usize; - pub fn rb_yjit_invokeblock_sp_pops(ci: *const rb_callinfo) -> usize; - pub fn rb_yjit_set_exception_return( - cfp: *mut rb_control_frame_t, - leave_exit: *mut ::std::os::raw::c_void, - leave_exception: *mut ::std::os::raw::c_void, + pub fn rb_iseq_reset_jit_func(iseq: *const rb_iseq_t); + pub fn rb_jit_get_page_size() -> u32; + pub fn rb_jit_reserve_addr_space(mem_size: u32) -> *mut u8; + pub fn rb_jit_for_each_iseq(callback: rb_iseq_callback, data: *mut ::std::os::raw::c_void); + pub fn rb_jit_mark_writable(mem_block: *mut ::std::os::raw::c_void, mem_size: u32) -> bool; + pub fn rb_jit_mark_executable(mem_block: *mut ::std::os::raw::c_void, mem_size: u32); + pub fn rb_jit_mark_unused(mem_block: *mut ::std::os::raw::c_void, mem_size: u32) -> bool; + pub fn rb_jit_icache_invalidate( + start: *mut ::std::os::raw::c_void, + end: *mut ::std::os::raw::c_void, ); + pub fn rb_jit_fix_mod_fix(recv: VALUE, obj: VALUE) -> VALUE; + pub fn rb_jit_fix_div_fix(recv: VALUE, obj: VALUE) -> VALUE; + pub fn rb_yarv_str_eql_internal(str1: VALUE, str2: VALUE) -> VALUE; + pub fn rb_jit_str_concat_codepoint(str_: VALUE, codepoint: VALUE); } diff --git a/yjit/src/disasm.rs b/yjit/src/disasm.rs index 89da07beda..4f85937ee9 100644 --- a/yjit/src/disasm.rs +++ b/yjit/src/disasm.rs @@ -7,6 +7,38 @@ use crate::options::DumpDisasm; use std::fmt::Write; +#[cfg_attr(not(feature = "disasm"), allow(dead_code))] +#[derive(Copy, Clone, Debug)] +pub struct TerminalColor { + pub blue_begin: &'static str, + pub blue_end: &'static str, + pub bold_begin: &'static str, + pub bold_end: &'static str, +} + +pub static TTY_TERMINAL_COLOR: TerminalColor = TerminalColor { + blue_begin: "\x1b[34m", + blue_end: "\x1b[0m", + bold_begin: "\x1b[1m", + bold_end: "\x1b[22m", +}; + +pub static NON_TTY_TERMINAL_COLOR: TerminalColor = TerminalColor { + blue_begin: "", + blue_end: "", + bold_begin: "", + bold_end: "", +}; + +/// Terminal escape codes for colors, font weight, etc. Only enabled if stdout is a TTY. 
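
get_colors() just below picks between the two palettes above depending on whether stdout is a TTY, and the disassembly writers splice the chosen escape codes around comments and outlined code. Roughly, the effect is the following (a standalone sketch; the real check goes through crate::utils::stdout_supports_colors(), with std::io::IsTerminal standing in for it here):

    use std::io::IsTerminal;

    // Emit ANSI bold only when stdout is a terminal, so piped output stays clean.
    fn bold(text: &str) -> String {
        if std::io::stdout().is_terminal() {
            format!("\x1b[1m{text}\x1b[22m")
        } else {
            text.to_string()
        }
    }

    fn main() {
        println!("{}", bold("# illustrative disasm comment"));
    }
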
+pub fn get_colors() -> &'static TerminalColor { + if crate::utils::stdout_supports_colors() { + &TTY_TERMINAL_COLOR + } else { + &NON_TTY_TERMINAL_COLOR + } +} + /// Primitive called in yjit.rb /// Produce a string representing the disassembly for an ISEQ #[no_mangle] @@ -120,7 +152,7 @@ pub fn dump_disasm_addr_range(cb: &CodeBlock, start_addr: CodePtr, end_addr: Cod // Write with the fd opened during boot let mut file = unsafe { std::fs::File::from_raw_fd(*fd) }; file.write_all(disasm.as_bytes()).unwrap(); - file.into_raw_fd(); // keep the fd open + let _ = file.into_raw_fd(); // keep the fd open } }; } @@ -158,6 +190,7 @@ pub fn disasm_addr_range(cb: &CodeBlock, start_addr: usize, end_addr: usize) -> #[cfg(test)] let start_addr = 0; let insns = cs.disasm_all(code_slice, start_addr as u64).unwrap(); + let colors = get_colors(); // For each instruction in this block for insn in insns.as_ref() { @@ -165,17 +198,17 @@ pub fn disasm_addr_range(cb: &CodeBlock, start_addr: usize, end_addr: usize) -> if let Some(comment_list) = cb.comments_at(insn.address() as usize) { for comment in comment_list { if cb.outlined { - write!(&mut out, "\x1b[34m").unwrap(); // Make outlined code blue + write!(&mut out, "{}", colors.blue_begin).unwrap(); // Make outlined code blue } - writeln!(&mut out, " \x1b[1m# {comment}\x1b[22m").unwrap(); // Make comments bold + writeln!(&mut out, " {}# {comment}{}", colors.bold_begin, colors.bold_end).unwrap(); // Make comments bold } } if cb.outlined { - write!(&mut out, "\x1b[34m").unwrap(); // Make outlined code blue + write!(&mut out, "{}", colors.blue_begin).unwrap(); // Make outlined code blue } writeln!(&mut out, " {insn}").unwrap(); if cb.outlined { - write!(&mut out, "\x1b[0m").unwrap(); // Disable blue + write!(&mut out, "{}", colors.blue_end).unwrap(); // Disable blue } } @@ -188,6 +221,7 @@ pub fn disasm_addr_range(cb: &CodeBlock, start_addr: usize, end_addr: usize) -> let mut out = String::new(); let mut line_byte_idx = 0; const MAX_BYTES_PER_LINE: usize = 16; + let colors = get_colors(); for addr in start_addr..end_addr { if let Some(comment_list) = cb.comments_at(addr) { @@ -197,7 +231,7 @@ pub fn disasm_addr_range(cb: &CodeBlock, start_addr: usize, end_addr: usize) -> line_byte_idx = 0; } for comment in comment_list { - writeln!(&mut out, " \x1b[1m# {comment}\x1b[22m").unwrap(); // Make comments bold + writeln!(&mut out, " {}# {comment}{}", colors.bold_begin, colors.bold_end).unwrap(); // Make comments bold } } if line_byte_idx == 0 { diff --git a/yjit/src/invariants.rs b/yjit/src/invariants.rs index d468cfebd9..0f22fba6b8 100644 --- a/yjit/src/invariants.rs +++ b/yjit/src/invariants.rs @@ -206,7 +206,7 @@ pub fn assume_method_basic_definition( /// Tracks that a block is assuming it is operating in single-ractor mode. #[must_use] pub fn assume_single_ractor_mode(jit: &mut JITState, asm: &mut Assembler) -> bool { - if unsafe { rb_yjit_multi_ractor_p() } { + if unsafe { rb_jit_multi_ractor_p() } { false } else { if jit_ensure_block_entry_exit(jit, asm).is_none() { @@ -303,7 +303,7 @@ pub extern "C" fn rb_yjit_cme_invalidate(callee_cme: *const rb_callable_method_e }); } -/// Callback for then Ruby is about to spawn a ractor. In that case we need to +/// Callback for when Ruby is about to spawn a ractor. In that case we need to /// invalidate every block that is assuming single ractor mode. 
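
The invariants hunks above touch YJIT's single-ractor assumption: assume_single_ractor_mode records the assumption while compiling (or refuses it once multiple ractors exist), and rb_yjit_before_ractor_spawn, declared just below, invalidates every block that recorded it. A deliberately simplified model of that register-then-invalidate bookkeeping, with integer ids standing in for YJIT's BlockRefs:

    use std::collections::HashSet;

    #[derive(Default)]
    struct Invariants {
        single_ractor: HashSet<usize>, // ids of blocks assuming single-ractor mode
    }

    impl Invariants {
        fn assume_single_ractor_mode(&mut self, block_id: usize, multi_ractor: bool) -> bool {
            if multi_ractor {
                false // cannot assume; the caller falls back to a less optimized path
            } else {
                self.single_ractor.insert(block_id);
                true
            }
        }

        fn before_ractor_spawn(&mut self, invalidate: impl Fn(usize)) {
            for block_id in self.single_ractor.drain() {
                invalidate(block_id);
            }
        }
    }

    fn main() {
        let mut inv = Invariants::default();
        assert!(inv.assume_single_ractor_mode(1, false));
        assert!(!inv.assume_single_ractor_mode(2, true));
        inv.before_ractor_spawn(|block_id| println!("invalidating block {block_id}"));
    }
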
#[no_mangle] pub extern "C" fn rb_yjit_before_ractor_spawn() { @@ -495,7 +495,7 @@ pub extern "C" fn rb_yjit_constant_ic_update(iseq: *const rb_iseq_t, ic: IC, ins return; }; - if !unsafe { (*(*ic).entry).ic_cref }.is_null() || unsafe { rb_yjit_multi_ractor_p() } { + if !unsafe { (*(*ic).entry).ic_cref }.is_null() || unsafe { rb_jit_multi_ractor_p() } { // We can't generate code in these situations, so no need to invalidate. // See gen_opt_getinlinecache. return; @@ -626,6 +626,8 @@ pub extern "C" fn rb_yjit_tracing_invalidate_all() { return; } + incr_counter!(invalidate_everything); + // Stop other ractors since we are going to patch machine code. with_vm_lock(src_loc!(), || { // Make it so all live block versions are no longer valid branch targets diff --git a/yjit/src/lib.rs b/yjit/src/lib.rs index 1e3f31b88b..f3247fbf1a 100644 --- a/yjit/src/lib.rs +++ b/yjit/src/lib.rs @@ -3,6 +3,19 @@ #![allow(clippy::too_many_arguments)] // :shrug: #![allow(clippy::identity_op)] // Sometimes we do it for style +// TODO(alan): This lint is right -- the way we use `static mut` is UB happy. We have many globals +// and take `&mut` frequently, sometimes with a method that easily allows calling it twice. +// +// All of our globals rely on us running single threaded, which outside of boot-time relies on the +// VM lock (which signals and waits for all other threads to pause). To fix this properly, we should +// gather up all the globals into a struct to centralize the safety reasoning. That way we can also +// check for re-entrance in one place. +// +// We're too close to release to do that, though, so disable the lint for now. +#![allow(unknown_lints)] +#![allow(static_mut_refs)] +#![warn(unknown_lints)] + pub mod asm; mod backend; mod codegen; diff --git a/yjit/src/log.rs b/yjit/src/log.rs index f2dcf519e0..c5a724f7e1 100644 --- a/yjit/src/log.rs +++ b/yjit/src/log.rs @@ -81,7 +81,7 @@ impl Log { let mut file = unsafe { std::fs::File::from_raw_fd(fd) }; writeln!(file, "{}", entry).unwrap(); file.flush().unwrap(); - file.into_raw_fd(); // keep the fd open + let _ = file.into_raw_fd(); // keep the fd open } LogOutput::MemoryOnly => () // Don't print or write anything diff --git a/yjit/src/options.rs b/yjit/src/options.rs index b993b5685b..c87a436091 100644 --- a/yjit/src/options.rs +++ b/yjit/src/options.rs @@ -1,5 +1,5 @@ use std::{ffi::{CStr, CString}, ptr::null, fs::File}; -use crate::{backend::current::TEMP_REGS, stats::Counter}; +use crate::{backend::current::TEMP_REGS, cruby::*, stats::Counter}; use std::os::raw::{c_char, c_int, c_uint}; // Call threshold for small deployments and command-line apps @@ -46,6 +46,9 @@ pub struct Options { // The number of registers allocated for stack temps pub num_temp_regs: usize, + // Disable Ruby builtin methods defined by `with_jit` hooks, e.g. 
Array#each in Ruby + pub c_builtin: bool, + // Capture stats pub gen_stats: bool, @@ -94,6 +97,7 @@ pub static mut OPTIONS: Options = Options { no_type_prop: false, max_versions: 4, num_temp_regs: 5, + c_builtin: false, gen_stats: false, trace_exits: None, print_stats: true, @@ -117,7 +121,7 @@ pub const YJIT_OPTIONS: &'static [(&str, &str)] = &[ ("--yjit-call-threshold=num", "Number of calls to trigger JIT."), ("--yjit-cold-threshold=num", "Global calls after which ISEQs not compiled (default: 200K)."), ("--yjit-stats", "Enable collecting YJIT statistics."), - ("--yjit--log[=file|dir]", "Enable logging of YJIT's compilation activity."), + ("--yjit-log[=file|dir]", "Enable logging of YJIT's compilation activity."), ("--yjit-disable", "Disable YJIT for lazily enabling it with RubyVM::YJIT.enable."), ("--yjit-code-gc", "Run code GC when the code size reaches the limit."), ("--yjit-perf", "Enable frame pointers and perf profiling."), @@ -148,7 +152,6 @@ pub enum DumpDisasm { // Dump to stdout Stdout, // Dump to "yjit_{pid}.log" file under the specified directory - #[cfg_attr(not(feature = "disasm"), allow(dead_code))] File(std::os::unix::io::RawFd), } @@ -169,7 +172,7 @@ macro_rules! get_option { { // Make this a statement since attributes on expressions are experimental #[allow(unused_unsafe)] - let ret = unsafe { OPTIONS.$option_name }; + let ret = unsafe { crate::options::OPTIONS.$option_name }; ret } }; @@ -270,6 +273,10 @@ pub fn parse_option(str_ptr: *const std::os::raw::c_char) -> Option<()> { } }, + ("c-builtin", _) => unsafe { + OPTIONS.c_builtin = true; + }, + ("code-gc", _) => unsafe { OPTIONS.code_gc = true; }, @@ -413,3 +420,13 @@ pub extern "C" fn rb_yjit_show_usage(help: c_int, highlight: c_int, width: c_uin unsafe { ruby_show_usage_line(name.as_ptr(), null(), description.as_ptr(), help, highlight, width, columns) } } } + +/// Return true if --yjit-c-builtin is given +#[no_mangle] +pub extern "C" fn rb_yjit_c_builtin_p(_ec: EcPtr, _self: VALUE) -> VALUE { + if get_option!(c_builtin) { + Qtrue + } else { + Qfalse + } +} diff --git a/yjit/src/stats.rs b/yjit/src/stats.rs index 3dc37d4bac..105def2fff 100644 --- a/yjit/src/stats.rs +++ b/yjit/src/stats.rs @@ -1,18 +1,19 @@ //! Everything related to the collection of runtime stats in YJIT -//! See the stats feature and the --yjit-stats command-line option +//! See the --yjit-stats command-line option -#![allow(dead_code)] // Counters are only used with the stats features - -use std::alloc::{GlobalAlloc, Layout, System}; use std::ptr::addr_of_mut; -use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::atomic::Ordering; use std::time::Instant; use std::collections::HashMap; use crate::codegen::CodegenGlobals; use crate::cruby::*; use crate::options::*; -use crate::yjit::yjit_enabled_p; +use crate::yjit::{yjit_enabled_p, YJIT_INIT_TIME}; + +#[cfg(feature = "stats_allocator")] +#[path = "../../jit/src/lib.rs"] +mod jit; /// Running total of how many ISeqs are in the system. #[no_mangle] @@ -22,43 +23,9 @@ pub static mut rb_yjit_live_iseq_count: u64 = 0; #[no_mangle] pub static mut rb_yjit_iseq_alloc_count: u64 = 0; -/// A middleware to count Rust-allocated bytes as yjit_alloc_size. 
-#[global_allocator] -static GLOBAL_ALLOCATOR: StatsAlloc = StatsAlloc { alloc_size: AtomicUsize::new(0) }; - -pub struct StatsAlloc { - alloc_size: AtomicUsize, -} - -unsafe impl GlobalAlloc for StatsAlloc { - unsafe fn alloc(&self, layout: Layout) -> *mut u8 { - self.alloc_size.fetch_add(layout.size(), Ordering::SeqCst); - System.alloc(layout) - } - - unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) { - self.alloc_size.fetch_sub(layout.size(), Ordering::SeqCst); - System.dealloc(ptr, layout) - } - - unsafe fn alloc_zeroed(&self, layout: Layout) -> *mut u8 { - self.alloc_size.fetch_add(layout.size(), Ordering::SeqCst); - System.alloc_zeroed(layout) - } - - unsafe fn realloc(&self, ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 { - if new_size > layout.size() { - self.alloc_size.fetch_add(new_size - layout.size(), Ordering::SeqCst); - } else if new_size < layout.size() { - self.alloc_size.fetch_sub(layout.size() - new_size, Ordering::SeqCst); - } - System.realloc(ptr, layout, new_size) - } -} - /// The number of bytes YJIT has allocated on the Rust heap. pub fn yjit_alloc_size() -> usize { - GLOBAL_ALLOCATOR.alloc_size.load(Ordering::SeqCst) + jit::GLOBAL_ALLOCATOR.alloc_size.load(Ordering::SeqCst) } /// Mapping of C function / ISEQ name to integer indices @@ -123,7 +90,9 @@ pub extern "C" fn incr_iseq_counter(idx: usize) { iseq_call_count[idx] += 1; } -// YJIT exit counts for each instruction type +/// YJIT exit counts for each instruction type. +/// Note that `VM_INSTRUCTION_SIZE` is an upper bound and the actual number +/// of VM opcodes may be different in the build. See [`rb_vm_instruction_size()`] const VM_INSTRUCTION_SIZE_USIZE: usize = VM_INSTRUCTION_SIZE as usize; static mut EXIT_OP_COUNT: [u64; VM_INSTRUCTION_SIZE_USIZE] = [0; VM_INSTRUCTION_SIZE_USIZE]; @@ -281,7 +250,9 @@ pub const DEFAULT_COUNTERS: &'static [Counter] = &[ Counter::deleted_defer_block_count, Counter::compiled_branch_count, Counter::compile_time_ns, + Counter::compilation_failure, Counter::max_inline_versions, + Counter::inline_block_count, Counter::num_contexts_encoded, Counter::context_cache_hits, @@ -293,6 +264,7 @@ pub const DEFAULT_COUNTERS: &'static [Counter] = &[ Counter::invalidate_constant_ic_fill, Counter::invalidate_no_singleton_class, Counter::invalidate_ep_escape, + Counter::invalidate_everything, ]; /// Macro to increase a counter by name and count @@ -348,7 +320,6 @@ macro_rules! ptr_to_counter { } }; } -pub(crate) use ptr_to_counter; // Declare all the counters we track make_counters! { @@ -356,6 +327,7 @@ make_counters! { // Method calls that fallback to dynamic dispatch send_singleton_class, + send_forwarding, send_ivar_set_method, send_zsuper_method, send_undef_method, @@ -382,8 +354,8 @@ make_counters! { send_iseq_arity_error, send_iseq_block_arg_type, send_iseq_clobbering_block_arg, + send_iseq_block_arg_gc_unsafe, send_iseq_complex_discard_extras, - send_iseq_forwarding, send_iseq_leaf_builtin_block_arg_block_param, send_iseq_kw_splat_non_nil, send_iseq_kwargs_mismatch, @@ -419,6 +391,9 @@ make_counters! { send_bmethod_ractor, send_bmethod_block_arg, send_optimized_block_arg, + send_pred_not_fixnum, + send_pred_underflow, + send_str_dup_exivar, invokesuper_defined_class_mismatch, invokesuper_forwarding, @@ -462,8 +437,10 @@ make_counters! 
{ guard_send_not_fixnum_or_flonum, guard_send_not_string, guard_send_respond_to_mid_mismatch, + guard_send_str_aref_not_fixnum, guard_send_cfunc_bad_splat_vargs, + guard_send_cfunc_block_not_nil, guard_invokesuper_me_changed, @@ -512,8 +489,7 @@ make_counters! { opt_aset_not_array, opt_aset_not_fixnum, opt_aset_not_hash, - - opt_aref_with_qundef, + opt_aset_frozen, opt_case_dispatch_megamorphic, @@ -524,6 +500,7 @@ make_counters! { expandarray_postarg, expandarray_not_array, expandarray_to_ary, + expandarray_method_missing, expandarray_chain_max_depth, // getblockparam @@ -569,6 +546,7 @@ make_counters! { branch_insn_count, branch_known_count, max_inline_versions, + inline_block_count, num_contexts_encoded, freed_iseq_count, @@ -583,6 +561,7 @@ make_counters! { invalidate_constant_ic_fill, invalidate_no_singleton_class, invalidate_ep_escape, + invalidate_everything, // Currently, it's out of the ordinary (might be impossible) for YJIT to leave gaps in // executable memory, so this should be 0. @@ -662,8 +641,7 @@ pub extern "C" fn rb_yjit_get_stats(_ec: EcPtr, _ruby_self: VALUE, key: VALUE) - /// Primitive called in yjit.rb /// -/// Check if trace_exits generation is enabled. Requires the stats feature -/// to be enabled. +/// Check if trace_exits generation is enabled. #[no_mangle] pub extern "C" fn rb_yjit_trace_exit_locations_enabled_p(_ec: EcPtr, _ruby_self: VALUE) -> VALUE { if get_option!(trace_exits).is_some() { @@ -687,7 +665,7 @@ pub extern "C" fn rb_yjit_get_exit_locations(_ec: EcPtr, _ruby_self: VALUE) -> V return Qnil; } - // If the stats feature is enabled, pass yjit_raw_samples and yjit_line_samples + // Pass yjit_raw_samples and yjit_line_samples // to the C function called rb_yjit_exit_locations_dict for parsing. let yjit_raw_samples = YjitExitLocations::get_raw_samples(); let yjit_line_samples = YjitExitLocations::get_line_samples(); @@ -784,12 +762,18 @@ fn rb_yjit_gen_stats_dict(key: VALUE) -> VALUE { set_stat_usize!(hash, "context_cache_bytes", crate::core::CTX_ENCODE_CACHE_BYTES + crate::core::CTX_DECODE_CACHE_BYTES); // VM instructions count - set_stat_usize!(hash, "vm_insns_count", rb_vm_insns_count as usize); + if rb_vm_insn_count > 0 { + set_stat_usize!(hash, "vm_insns_count", rb_vm_insn_count as usize); + } set_stat_usize!(hash, "live_iseq_count", rb_yjit_live_iseq_count as usize); set_stat_usize!(hash, "iseq_alloc_count", rb_yjit_iseq_alloc_count as usize); set_stat!(hash, "object_shape_count", rb_object_shape_count()); + + // Time since YJIT init in nanoseconds + let time_nanos = Instant::now().duration_since(YJIT_INIT_TIME.unwrap()).as_nanos(); + set_stat_usize!(hash, "yjit_active_ns", time_nanos as usize); } // If we're not generating stats, put only default counters @@ -824,7 +808,8 @@ fn rb_yjit_gen_stats_dict(key: VALUE) -> VALUE { // For each entry in exit_op_count, add a stats entry with key "exit_INSTRUCTION_NAME" // and the value is the count of side exits for that instruction. 
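Earlier in this stats hunk, the generated hash gains a yjit_active_ns entry derived from YJIT_INIT_TIME. A small sketch of that measurement, assuming only that an Instant is stored once at initialization; names here are illustrative:

use std::time::Instant;

// Set once during initialization, read when the stats hash is built.
static mut INIT_TIME: Option<Instant> = None;

fn init() {
    unsafe { INIT_TIME = Some(Instant::now()); }
}

fn active_ns() -> u128 {
    let start = unsafe { INIT_TIME }.expect("init() must run first");
    Instant::now().duration_since(start).as_nanos()
}

fn main() {
    init();
    std::thread::sleep(std::time::Duration::from_millis(1));
    println!("active for {} ns", active_ns());
}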
- for op_idx in 0..VM_INSTRUCTION_SIZE_USIZE { + use crate::utils::IntoUsize; + for op_idx in 0..rb_vm_instruction_size().as_usize() { let op_name = insn_name(op_idx); let key_string = "exit_".to_owned() + &op_name; let count = EXIT_OP_COUNT[op_idx]; @@ -850,11 +835,13 @@ fn rb_yjit_gen_stats_dict(key: VALUE) -> VALUE { set_stat_double!(hash, "avg_len_in_yjit", avg_len_in_yjit); // Proportion of instructions that retire in YJIT - let total_insns_count = retired_in_yjit + rb_vm_insns_count; - set_stat_usize!(hash, "total_insns_count", total_insns_count as usize); + if rb_vm_insn_count > 0 { + let total_insns_count = retired_in_yjit + rb_vm_insn_count; + set_stat_usize!(hash, "total_insns_count", total_insns_count as usize); - let ratio_in_yjit: f64 = 100.0 * retired_in_yjit as f64 / total_insns_count as f64; - set_stat_double!(hash, "ratio_in_yjit", ratio_in_yjit); + let ratio_in_yjit: f64 = 100.0 * retired_in_yjit as f64 / total_insns_count as f64; + set_stat_double!(hash, "ratio_in_yjit", ratio_in_yjit); + } // Set method call counts in a Ruby dict fn set_call_counts( @@ -905,13 +892,13 @@ fn rb_yjit_gen_stats_dict(key: VALUE) -> VALUE { } /// Record the backtrace when a YJIT exit occurs. This functionality requires -/// that the stats feature is enabled as well as the --yjit-trace-exits option. +/// the --yjit-trace-exits option. /// /// This function will fill two Vec's in YjitExitLocations to record the raw samples /// and line samples. Their length should be the same, however the data stored in /// them is different. #[no_mangle] -pub extern "C" fn rb_yjit_record_exit_stack(_exit_pc: *const VALUE) +pub extern "C" fn rb_yjit_record_exit_stack(exit_pc: *const VALUE) { // Return if YJIT is not enabled if !yjit_enabled_p() { @@ -935,10 +922,11 @@ pub extern "C" fn rb_yjit_record_exit_stack(_exit_pc: *const VALUE) // rb_vm_insn_addr2opcode won't work in cargo test --all-features // because it's a C function. Without insn call, this function is useless // so wrap the whole thing in a not test check. + let _ = exit_pc; #[cfg(not(test))] { // Get the opcode from the encoded insn handler at this PC - let insn = unsafe { rb_vm_insn_addr2opcode((*_exit_pc).as_ptr()) }; + let insn = unsafe { rb_vm_insn_addr2opcode((*exit_pc).as_ptr()) }; // Use the same buffer size as Stackprof. const BUFF_LEN: usize = 2048; diff --git a/yjit/src/utils.rs b/yjit/src/utils.rs index c4b5fbd2e7..251628fabf 100644 --- a/yjit/src/utils.rs +++ b/yjit/src/utils.rs @@ -3,6 +3,7 @@ use crate::backend::ir::*; use crate::cruby::*; use std::slice; +use std::os::raw::c_int; /// Trait for casting to [usize] that allows you to say `.as_usize()`. /// Implementation conditional on the cast preserving the numeric value on @@ -91,10 +92,7 @@ pub fn ruby_str_to_rust(v: VALUE) -> String { let str_ptr = unsafe { rb_RSTRING_PTR(v) } as *mut u8; let str_len: usize = unsafe { rb_RSTRING_LEN(v) }.try_into().unwrap(); let str_slice: &[u8] = unsafe { slice::from_raw_parts(str_ptr, str_len) }; - match String::from_utf8(str_slice.to_vec()) { - Ok(utf8) => utf8, - Err(_) => String::new(), - } + String::from_utf8(str_slice.to_vec()).unwrap_or_default() } // Location is the file defining the method, colon, method name. @@ -162,8 +160,6 @@ pub fn print_int(asm: &mut Assembler, opnd: Opnd) { } } - asm.cpush_all(); - let argument = match opnd { Opnd::Mem(_) | Opnd::Reg(_) | Opnd::InsnOut { .. 
} => { // Sign-extend the value if necessary @@ -178,7 +174,6 @@ pub fn print_int(asm: &mut Assembler, opnd: Opnd) { }; asm.ccall(print_int_fn as *const u8, vec![argument]); - asm.cpop_all(); } /// Generate code to print a pointer @@ -191,9 +186,7 @@ pub fn print_ptr(asm: &mut Assembler, opnd: Opnd) { assert!(opnd.rm_num_bits() == 64); - asm.cpush_all(); asm.ccall(print_ptr_fn as *const u8, vec![opnd]); - asm.cpop_all(); } /// Generate code to print a value @@ -206,9 +199,7 @@ pub fn print_value(asm: &mut Assembler, opnd: Opnd) { assert!(matches!(opnd, Opnd::Value(_))); - asm.cpush_all(); asm.ccall(print_value_fn as *const u8, vec![opnd]); - asm.cpop_all(); } /// Generate code to print constant string to stdout @@ -223,7 +214,6 @@ pub fn print_str(asm: &mut Assembler, str: &str) { } } - asm.cpush_all(); let string_data = asm.new_label("string_data"); let after_string = asm.new_label("after_string"); @@ -235,8 +225,14 @@ pub fn print_str(asm: &mut Assembler, str: &str) { let opnd = asm.lea_jump_target(string_data); asm.ccall(print_str_cfun as *const u8, vec![opnd, Opnd::UImm(str.len() as u64)]); +} - asm.cpop_all(); +pub fn stdout_supports_colors() -> bool { + // TODO(max): Use std::io::IsTerminal after upgrading Rust to 1.70 + extern "C" { fn isatty(fd: c_int) -> c_int; } + let stdout = 1; + let is_terminal = unsafe { isatty(stdout) } == 1; + is_terminal } #[cfg(test)] diff --git a/yjit/src/virtualmem.rs b/yjit/src/virtualmem.rs index f56b0d8213..9126cf300e 100644 --- a/yjit/src/virtualmem.rs +++ b/yjit/src/virtualmem.rs @@ -3,10 +3,13 @@ // usize->pointer casts is viable. It seems like a lot of work for us to participate for not much // benefit. -use std::ptr::NonNull; +use std::{cell::RefCell, ptr::NonNull}; use crate::{backend::ir::Target, stats::yjit_alloc_size, utils::IntoUsize}; +#[cfg(test)] +use crate::options::get_option; + #[cfg(not(test))] pub type VirtualMem = VirtualMemory<sys::SystemAllocator>; @@ -36,8 +39,14 @@ pub struct VirtualMemory<A: Allocator> { /// granularity. page_size_bytes: usize, + /// Mutable parts. + mutable: RefCell<VirtualMemoryMut<A>>, +} + +/// Mutable parts of [`VirtualMemory`]. +pub struct VirtualMemoryMut<A: Allocator> { /// Number of bytes that have we have allocated physical memory for starting at - /// [Self::region_start]. + /// [VirtualMemory::region_start]. mapped_region_bytes: usize, /// Keep track of the address of the last written to page. @@ -124,9 +133,11 @@ impl<A: Allocator> VirtualMemory<A> { region_size_bytes, memory_limit_bytes, page_size_bytes, - mapped_region_bytes: 0, - current_write_page: None, - allocator, + mutable: RefCell::new(VirtualMemoryMut { + mapped_region_bytes: 0, + current_write_page: None, + allocator, + }), } } @@ -137,7 +148,7 @@ impl<A: Allocator> VirtualMemory<A> { } pub fn mapped_end_ptr(&self) -> CodePtr { - self.start_ptr().add_bytes(self.mapped_region_bytes) + self.start_ptr().add_bytes(self.mutable.borrow().mapped_region_bytes) } pub fn virtual_end_ptr(&self) -> CodePtr { @@ -146,7 +157,7 @@ impl<A: Allocator> VirtualMemory<A> { /// Size of the region in bytes that we have allocated physical memory for. pub fn mapped_region_size(&self) -> usize { - self.mapped_region_bytes + self.mutable.borrow().mapped_region_bytes } /// Size of the region in bytes where writes could be attempted. @@ -161,19 +172,21 @@ impl<A: Allocator> VirtualMemory<A> { } /// Write a single byte. The first write to a page makes it readable. 
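The virtualmem.rs change above splits VirtualMemory into an immutable part plus a RefCell-wrapped VirtualMemoryMut, so callers can hold a plain Rc<VirtualMem> instead of Rc<RefCell<VirtualMem>> and methods can take &self. A condensed sketch of that interior-mutability refactor, using toy types rather than the real ones:

use std::cell::RefCell;
use std::rc::Rc;

struct Region {
    size_bytes: usize,           // fixed at construction
    mutable: RefCell<RegionMut>, // interior mutability for everything that changes
}

struct RegionMut {
    mapped_bytes: usize,
}

impl Region {
    fn new(size_bytes: usize) -> Self {
        Region { size_bytes, mutable: RefCell::new(RegionMut { mapped_bytes: 0 }) }
    }

    // Note: &self, not &mut self, thanks to the RefCell.
    fn map_more(&self, bytes: usize) {
        let mut m = self.mutable.borrow_mut();
        m.mapped_bytes = (m.mapped_bytes + bytes).min(self.size_bytes);
    }

    fn mapped_bytes(&self) -> usize {
        self.mutable.borrow().mapped_bytes
    }
}

fn main() {
    let region = Rc::new(Region::new(4096));
    let alias = Rc::clone(&region); // shared handle, no RefCell at the call site
    alias.map_more(1024);
    assert_eq!(region.mapped_bytes(), 1024);
}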
- pub fn write_byte(&mut self, write_ptr: CodePtr, byte: u8) -> Result<(), WriteError> { + pub fn write_byte(&self, write_ptr: CodePtr, byte: u8) -> Result<(), WriteError> { + let mut mutable = self.mutable.borrow_mut(); + let page_size = self.page_size_bytes; let raw: *mut u8 = write_ptr.raw_ptr(self) as *mut u8; let page_addr = (raw as usize / page_size) * page_size; - if self.current_write_page == Some(page_addr) { + if mutable.current_write_page == Some(page_addr) { // Writing within the last written to page, nothing to do } else { // Switching to a different and potentially new page let start = self.region_start.as_ptr(); - let mapped_region_end = start.wrapping_add(self.mapped_region_bytes); + let mapped_region_end = start.wrapping_add(mutable.mapped_region_bytes); let whole_region_end = start.wrapping_add(self.region_size_bytes); - let alloc = &mut self.allocator; + let alloc = &mut mutable.allocator; assert!((start..=whole_region_end).contains(&mapped_region_end)); @@ -185,7 +198,7 @@ impl<A: Allocator> VirtualMemory<A> { return Err(FailedPageMapping); } - self.current_write_page = Some(page_addr); + mutable.current_write_page = Some(page_addr); } else if (start..whole_region_end).contains(&raw) && (page_addr + page_size - start as usize) + yjit_alloc_size() < self.memory_limit_bytes { // Writing to a brand new page @@ -217,9 +230,9 @@ impl<A: Allocator> VirtualMemory<A> { unreachable!("unknown arch"); } } - self.mapped_region_bytes = self.mapped_region_bytes + alloc_size; + mutable.mapped_region_bytes = mutable.mapped_region_bytes + alloc_size; - self.current_write_page = Some(page_addr); + mutable.current_write_page = Some(page_addr); } else { return Err(OutOfBounds); } @@ -231,20 +244,41 @@ impl<A: Allocator> VirtualMemory<A> { Ok(()) } + /// Make all the code in the region writeable. + /// Call this during GC before the phase of updating reference fields. + pub fn mark_all_writeable(&self) { + let mut mutable = self.mutable.borrow_mut(); + + mutable.current_write_page = None; + + let region_start = self.region_start; + let mapped_region_bytes: u32 = mutable.mapped_region_bytes.try_into().unwrap(); + + // Make mapped region executable + if !mutable.allocator.mark_writable(region_start.as_ptr(), mapped_region_bytes) { + panic!("Cannot make memory region writable: {:?}-{:?}", + region_start.as_ptr(), + unsafe { region_start.as_ptr().add(mapped_region_bytes as usize)} + ); + } + } + /// Make all the code in the region executable. Call this at the end of a write session. /// See [Self] for usual usage flow. - pub fn mark_all_executable(&mut self) { - self.current_write_page = None; + pub fn mark_all_executable(&self) { + let mut mutable = self.mutable.borrow_mut(); + + mutable.current_write_page = None; let region_start = self.region_start; - let mapped_region_bytes: u32 = self.mapped_region_bytes.try_into().unwrap(); + let mapped_region_bytes: u32 = mutable.mapped_region_bytes.try_into().unwrap(); // Make mapped region executable - self.allocator.mark_executable(region_start.as_ptr(), mapped_region_bytes); + mutable.allocator.mark_executable(region_start.as_ptr(), mapped_region_bytes); } /// Free a range of bytes. start_ptr must be memory page-aligned. - pub fn free_bytes(&mut self, start_ptr: CodePtr, size: u32) { + pub fn free_bytes(&self, start_ptr: CodePtr, size: u32) { assert_eq!(start_ptr.raw_ptr(self) as usize % self.page_size_bytes, 0); // Bounds check the request. We should only free memory we manage. 
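write_byte above only talks to the allocator when a write lands on a different page than the most recently written one, tracked via current_write_page. A toy sketch of that bookkeeping; the syscalls field stands in for mark_writable requests and the types are illustrative, not the patch's own:

// Round an address down to the start of its page.
fn page_start(addr: usize, page_size: usize) -> usize {
    (addr / page_size) * page_size
}

struct PageTracker {
    page_size: usize,
    current_write_page: Option<usize>,
    syscalls: usize, // stand-in for mark_writable/mprotect requests
}

impl PageTracker {
    fn on_write(&mut self, addr: usize) {
        let page = page_start(addr, self.page_size);
        if self.current_write_page != Some(page) {
            self.syscalls += 1; // the real code would call allocator.mark_writable here
            self.current_write_page = Some(page);
        }
    }
}

fn main() {
    let mut t = PageTracker { page_size: 4096, current_write_page: None, syscalls: 0 };
    t.on_write(10);
    t.on_write(20);   // same page: no extra request
    t.on_write(5000); // new page: one more request
    assert_eq!(t.syscalls, 2);
}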
@@ -257,7 +291,8 @@ impl<A: Allocator> VirtualMemory<A> { // code page, it's more appropriate to check the last byte against the virtual region. assert!(virtual_region.contains(&last_byte_to_free)); - self.allocator.mark_unused(start_ptr.raw_ptr(self), size); + let mut mutable = self.mutable.borrow_mut(); + mutable.allocator.mark_unused(start_ptr.raw_ptr(self), size); } } @@ -284,15 +319,15 @@ mod sys { impl super::Allocator for SystemAllocator { fn mark_writable(&mut self, ptr: *const u8, size: u32) -> bool { - unsafe { rb_yjit_mark_writable(ptr as VoidPtr, size) } + unsafe { rb_jit_mark_writable(ptr as VoidPtr, size) } } fn mark_executable(&mut self, ptr: *const u8, size: u32) { - unsafe { rb_yjit_mark_executable(ptr as VoidPtr, size) } + unsafe { rb_jit_mark_executable(ptr as VoidPtr, size) } } fn mark_unused(&mut self, ptr: *const u8, size: u32) -> bool { - unsafe { rb_yjit_mark_unused(ptr as VoidPtr, size) } + unsafe { rb_jit_mark_unused(ptr as VoidPtr, size) } } } } @@ -379,18 +414,18 @@ pub mod tests { PAGE_SIZE.try_into().unwrap(), NonNull::new(mem_start as *mut u8).unwrap(), mem_size, - 128 * 1024 * 1024, + get_option!(mem_size), ) } #[test] #[cfg(target_arch = "x86_64")] fn new_memory_is_initialized() { - let mut virt = new_dummy_virt_mem(); + let virt = new_dummy_virt_mem(); virt.write_byte(virt.start_ptr(), 1).unwrap(); assert!( - virt.allocator.memory[..PAGE_SIZE].iter().all(|&byte| byte != 0), + virt.mutable.borrow().allocator.memory[..PAGE_SIZE].iter().all(|&byte| byte != 0), "Entire page should be initialized", ); @@ -398,21 +433,21 @@ pub mod tests { let three_pages = 3 * PAGE_SIZE; virt.write_byte(virt.start_ptr().add_bytes(three_pages), 1).unwrap(); assert!( - virt.allocator.memory[..three_pages].iter().all(|&byte| byte != 0), + virt.mutable.borrow().allocator.memory[..three_pages].iter().all(|&byte| byte != 0), "Gaps between write requests should be filled", ); } #[test] fn no_redundant_syscalls_when_writing_to_the_same_page() { - let mut virt = new_dummy_virt_mem(); + let virt = new_dummy_virt_mem(); virt.write_byte(virt.start_ptr(), 1).unwrap(); virt.write_byte(virt.start_ptr(), 0).unwrap(); assert!( matches!( - virt.allocator.requests[..], + virt.mutable.borrow().allocator.requests[..], [MarkWritable { start_idx: 0, length: PAGE_SIZE }], ) ); @@ -421,7 +456,7 @@ pub mod tests { #[test] fn bounds_checking() { use super::WriteError::*; - let mut virt = new_dummy_virt_mem(); + let virt = new_dummy_virt_mem(); let one_past_end = virt.start_ptr().add_bytes(virt.virtual_region_size()); assert_eq!(Err(OutOfBounds), virt.write_byte(one_past_end, 0)); @@ -434,7 +469,7 @@ pub mod tests { fn only_written_to_regions_become_executable() { // ... 
so we catch attempts to read/write/execute never-written-to regions const THREE_PAGES: usize = PAGE_SIZE * 3; - let mut virt = new_dummy_virt_mem(); + let virt = new_dummy_virt_mem(); let page_two_start = virt.start_ptr().add_bytes(PAGE_SIZE * 2); virt.write_byte(page_two_start, 1).unwrap(); virt.mark_all_executable(); @@ -442,7 +477,7 @@ pub mod tests { assert!(virt.virtual_region_size() > THREE_PAGES); assert!( matches!( - virt.allocator.requests[..], + virt.mutable.borrow().allocator.requests[..], [ MarkWritable { start_idx: 0, length: THREE_PAGES }, MarkExecutable { start_idx: 0, length: THREE_PAGES }, diff --git a/yjit/src/yjit.rs b/yjit/src/yjit.rs index a9ecc24a80..517a0daae5 100644 --- a/yjit/src/yjit.rs +++ b/yjit/src/yjit.rs @@ -7,7 +7,8 @@ use crate::stats::YjitExitLocations; use crate::stats::incr_counter; use crate::stats::with_compile_time; -use std::os::raw; +use std::os::raw::{c_char, c_int}; +use std::time::Instant; use crate::log::Log; /// Is YJIT on? The interpreter uses this variable to decide whether to trigger @@ -16,10 +17,13 @@ use crate::log::Log; #[no_mangle] pub static mut rb_yjit_enabled_p: bool = false; +// Time when YJIT was yjit was initialized (see yjit_init) +pub static mut YJIT_INIT_TIME: Option<Instant> = None; + /// Parse one command-line option. /// This is called from ruby.c #[no_mangle] -pub extern "C" fn rb_yjit_parse_option(str_ptr: *const raw::c_char) -> bool { +pub extern "C" fn rb_yjit_parse_option(str_ptr: *const c_char) -> bool { return parse_option(str_ptr).is_some(); } @@ -50,6 +54,12 @@ fn yjit_init() { // TODO: need to make sure that command-line options have been // initialized by CRuby + // Call YJIT hooks before enabling YJIT to avoid compiling the hooks themselves + unsafe { + let yjit = rb_const_get(rb_cRubyVM, rust_str_to_id("YJIT")); + rb_funcall(yjit, rust_str_to_id("call_jit_hooks"), 0); + } + // Catch panics to avoid UB for unwinding into C frames. // See https://doc.rust-lang.org/nomicon/exception-safety.html let result = std::panic::catch_unwind(|| { @@ -76,6 +86,16 @@ fn yjit_init() { let _ = std::fs::remove_file(&perf_map); println!("YJIT perf map: {perf_map}"); } + + // Note the time when YJIT was initialized + unsafe { + YJIT_INIT_TIME = Some(Instant::now()); + } +} + +#[no_mangle] +pub extern "C" fn rb_yjit_free_at_exit() { + yjit_shutdown_free_codegen_table(); } /// At the moment, we abort in all cases we panic. @@ -102,7 +122,10 @@ fn rb_bug_panic_hook() { env::set_var("RUST_BACKTRACE", "1"); previous_hook(panic_info); - unsafe { rb_bug(b"YJIT panicked\0".as_ref().as_ptr() as *const raw::c_char); } + // Abort with rb_bug(). It has a length limit on the message. 
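The new panic hook caps the formatted panic message before handing it to rb_bug, since rb_bug limits the message length. A hedged sketch of one way to truncate a Rust string to a byte budget without splitting a UTF-8 character; the helper name is hypothetical, not part of the patch:

// Cap a message at max_bytes, backing off to a char boundary if needed.
fn truncated(msg: &str, max_bytes: usize) -> &str {
    let mut end = msg.len().min(max_bytes);
    while !msg.is_char_boundary(end) {
        end -= 1;
    }
    &msg[..end]
}

fn main() {
    let long = "x".repeat(1000);
    assert_eq!(truncated(&long, 0x100).len(), 0x100);
    assert_eq!(truncated("short", 0x100), "short");
}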
+ let panic_message = &format!("{}", panic_info)[..]; + let len = std::cmp::min(0x100, panic_message.len()) as c_int; + unsafe { rb_bug(b"YJIT: %*s\0".as_ref().as_ptr() as *const c_char, len, panic_message.as_ptr()); } })); } @@ -168,8 +191,24 @@ pub extern "C" fn rb_yjit_code_gc(_ec: EcPtr, _ruby_self: VALUE) -> VALUE { /// Enable YJIT compilation, returning true if YJIT was previously disabled #[no_mangle] -pub extern "C" fn rb_yjit_enable(_ec: EcPtr, _ruby_self: VALUE, gen_stats: VALUE, print_stats: VALUE, gen_log: VALUE, print_log: VALUE) -> VALUE { +pub extern "C" fn rb_yjit_enable(_ec: EcPtr, _ruby_self: VALUE, gen_stats: VALUE, print_stats: VALUE, gen_log: VALUE, print_log: VALUE, mem_size: VALUE, call_threshold: VALUE) -> VALUE { with_vm_lock(src_loc!(), || { + + if !mem_size.nil_p() { + let mem_size_mb = mem_size.as_isize() >> 1; + let mem_size_bytes = mem_size_mb * 1024 * 1024; + unsafe { + OPTIONS.mem_size = mem_size_bytes as usize; + } + } + + if !call_threshold.nil_p() { + let threshold = call_threshold.as_isize() >> 1; + unsafe { + rb_yjit_call_threshold = threshold as u64; + } + } + // Initialize and enable YJIT if gen_stats.test() { unsafe { |

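The rb_yjit_enable change above decodes its optional Fixnum arguments with an arithmetic right shift by one, which recovers the integer from Ruby's tagged Fixnum representation, and scales the mem-size value from MiB to bytes. A small sketch of that arithmetic; the constants are illustrative only:

// A Fixnum stores the integer shifted left by one with the low tag bit set,
// so an arithmetic right shift by one recovers the numeric value.
fn fixnum_to_isize(fixnum_bits: isize) -> isize {
    fixnum_bits >> 1
}

fn mem_size_bytes(mem_size_fixnum_bits: isize) -> usize {
    let mib = fixnum_to_isize(mem_size_fixnum_bits);
    (mib as usize) * 1024 * 1024
}

fn main() {
    // 128 encoded as a Fixnum: (128 << 1) | 1
    let encoded = (128_isize << 1) | 1;
    assert_eq!(fixnum_to_isize(encoded), 128);
    assert_eq!(mem_size_bytes(encoded), 128 * 1024 * 1024);
}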