Diffstat (limited to 'yjit/src')
51 files changed, 15314 insertions, 6334 deletions
diff --git a/yjit/src/asm/arm64/arg/bitmask_imm.rs b/yjit/src/asm/arm64/arg/bitmask_imm.rs index ff9b2c8a2d..70a439afd5 100644 --- a/yjit/src/asm/arm64/arg/bitmask_imm.rs +++ b/yjit/src/asm/arm64/arg/bitmask_imm.rs @@ -42,7 +42,7 @@ impl TryFrom<u64> for BitmaskImmediate { /// Attempt to convert a u64 into a BitmaskImmediate. /// /// The implementation here is largely based on this blog post: - /// https://dougallj.wordpress.com/2021/10/30/bit-twiddling-optimising-aarch64-logical-immediate-encoding-and-decoding/ + /// <https://dougallj.wordpress.com/2021/10/30/bit-twiddling-optimising-aarch64-logical-immediate-encoding-and-decoding/> fn try_from(value: u64) -> Result<Self, Self::Error> { if value == 0 || value == u64::MAX { return Err(()); @@ -106,7 +106,7 @@ mod tests { #[test] fn test_failures() { - vec![5, 9, 10, 11, 13, 17, 18, 19].iter().for_each(|&imm| { + [5, 9, 10, 11, 13, 17, 18, 19].iter().for_each(|&imm| { assert!(BitmaskImmediate::try_from(imm).is_err()); }); } diff --git a/yjit/src/asm/arm64/arg/condition.rs b/yjit/src/asm/arm64/arg/condition.rs index bb9ce570c3..f711b8b0d8 100644 --- a/yjit/src/asm/arm64/arg/condition.rs +++ b/yjit/src/asm/arm64/arg/condition.rs @@ -49,4 +49,4 @@ impl Condition { } } -}
\ No newline at end of file +} diff --git a/yjit/src/asm/arm64/arg/sys_reg.rs b/yjit/src/asm/arm64/arg/sys_reg.rs index 41d71920cb..6229d5c1fd 100644 --- a/yjit/src/asm/arm64/arg/sys_reg.rs +++ b/yjit/src/asm/arm64/arg/sys_reg.rs @@ -1,6 +1,6 @@ /// The encoded representation of an A64 system register. -/// https://developer.arm.com/documentation/ddi0601/2022-06/AArch64-Registers/ +/// <https://developer.arm.com/documentation/ddi0601/2022-06/AArch64-Registers/> pub enum SystemRegister { - /// https://developer.arm.com/documentation/ddi0601/2022-06/AArch64-Registers/NZCV--Condition-Flags?lang=en + /// <https://developer.arm.com/documentation/ddi0601/2022-06/AArch64-Registers/NZCV--Condition-Flags?lang=en> NZCV = 0b1_011_0100_0010_000 } diff --git a/yjit/src/asm/arm64/inst/atomic.rs b/yjit/src/asm/arm64/inst/atomic.rs index 5ce497209c..dce9affedf 100644 --- a/yjit/src/asm/arm64/inst/atomic.rs +++ b/yjit/src/asm/arm64/inst/atomic.rs @@ -43,13 +43,13 @@ pub struct Atomic { impl Atomic { /// LDADDAL - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDADD--LDADDA--LDADDAL--LDADDL--Atomic-add-on-word-or-doubleword-in-memory-?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDADD--LDADDA--LDADDAL--LDADDL--Atomic-add-on-word-or-doubleword-in-memory-?lang=en> pub fn ldaddal(rs: u8, rt: u8, rn: u8, num_bits: u8) -> Self { Self { rt, rn, rs, size: num_bits.into() } } } -/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Loads-and-Stores?lang=en +/// <https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Loads-and-Stores?lang=en> const FAMILY: u32 = 0b0100; impl From<Atomic> for u32 { diff --git a/yjit/src/asm/arm64/inst/branch.rs b/yjit/src/asm/arm64/inst/branch.rs index f15ef2a9b0..14fcb2e9fd 100644 --- a/yjit/src/asm/arm64/inst/branch.rs +++ b/yjit/src/asm/arm64/inst/branch.rs @@ -28,25 +28,25 @@ pub struct Branch { impl Branch { /// BR - /// https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/BR--Branch-to-Register-?lang=en + /// <https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/BR--Branch-to-Register-?lang=en> pub fn br(rn: u8) -> Self { Self { rn, op: Op::BR } } /// BLR - /// https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/BLR--Branch-with-Link-to-Register-?lang=en + /// <https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/BLR--Branch-with-Link-to-Register-?lang=en> pub fn blr(rn: u8) -> Self { Self { rn, op: Op::BLR } } /// RET - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/RET--Return-from-subroutine-?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/RET--Return-from-subroutine-?lang=en> pub fn ret(rn: u8) -> Self { Self { rn, op: Op::RET } } } -/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en +/// <https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en> const FAMILY: u32 = 0b101; impl From<Branch> for u32 { diff --git a/yjit/src/asm/arm64/inst/branch_cond.rs b/yjit/src/asm/arm64/inst/branch_cond.rs index fcc07f69aa..266e9ccb31 100644 --- a/yjit/src/asm/arm64/inst/branch_cond.rs +++ b/yjit/src/asm/arm64/inst/branch_cond.rs @@ -19,13 +19,13 @@ pub struct BranchCond { impl BranchCond { /// B.cond - /// 
https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/B-cond--Branch-conditionally- + /// <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/B-cond--Branch-conditionally-> pub fn bcond(cond: u8, offset: InstructionOffset) -> Self { Self { cond, offset } } } -/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en +/// <https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en> const FAMILY: u32 = 0b101; impl From<BranchCond> for u32 { diff --git a/yjit/src/asm/arm64/inst/breakpoint.rs b/yjit/src/asm/arm64/inst/breakpoint.rs index be4920ac76..d66a35c4c6 100644 --- a/yjit/src/asm/arm64/inst/breakpoint.rs +++ b/yjit/src/asm/arm64/inst/breakpoint.rs @@ -13,13 +13,13 @@ pub struct Breakpoint { impl Breakpoint { /// BRK - /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/BRK--Breakpoint-instruction- + /// <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/BRK--Breakpoint-instruction-> pub fn brk(imm16: u16) -> Self { Self { imm16 } } } -/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en#control +/// <https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en#control> const FAMILY: u32 = 0b101; impl From<Breakpoint> for u32 { diff --git a/yjit/src/asm/arm64/inst/call.rs b/yjit/src/asm/arm64/inst/call.rs index 74debac7f7..fd26d09f8a 100644 --- a/yjit/src/asm/arm64/inst/call.rs +++ b/yjit/src/asm/arm64/inst/call.rs @@ -29,19 +29,19 @@ pub struct Call { impl Call { /// B - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/B--Branch- + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/B--Branch-> pub fn b(offset: InstructionOffset) -> Self { Self { offset, op: Op::Branch } } /// BL - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/BL--Branch-with-Link-?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/BL--Branch-with-Link-?lang=en> pub fn bl(offset: InstructionOffset) -> Self { Self { offset, op: Op::BranchWithLink } } } -/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en +/// <https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en> const FAMILY: u32 = 0b101; impl From<Call> for u32 { diff --git a/yjit/src/asm/arm64/inst/conditional.rs b/yjit/src/asm/arm64/inst/conditional.rs index e1950e95b4..1e26c7408b 100644 --- a/yjit/src/asm/arm64/inst/conditional.rs +++ b/yjit/src/asm/arm64/inst/conditional.rs @@ -28,13 +28,13 @@ pub struct Conditional { impl Conditional { /// CSEL - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CSEL--Conditional-Select-?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CSEL--Conditional-Select-?lang=en> pub fn csel(rd: u8, rn: u8, rm: u8, cond: u8, num_bits: u8) -> Self { Self { rd, rn, cond, rm, sf: num_bits.into() } } } -/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Register?lang=en#condsel +/// 
<https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Register?lang=en#condsel> const FAMILY: u32 = 0b101; impl From<Conditional> for u32 { diff --git a/yjit/src/asm/arm64/inst/data_imm.rs b/yjit/src/asm/arm64/inst/data_imm.rs index b474b00a52..ea71705478 100644 --- a/yjit/src/asm/arm64/inst/data_imm.rs +++ b/yjit/src/asm/arm64/inst/data_imm.rs @@ -44,37 +44,37 @@ pub struct DataImm { impl DataImm { /// ADD (immediate) - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADD--immediate---Add--immediate--?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADD--immediate---Add--immediate--?lang=en> pub fn add(rd: u8, rn: u8, imm: ShiftedImmediate, num_bits: u8) -> Self { Self { rd, rn, imm, s: S::LeaveFlags, op: Op::Add, sf: num_bits.into() } } /// ADDS (immediate, set flags) - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADDS--immediate---Add--immediate---setting-flags-?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADDS--immediate---Add--immediate---setting-flags-?lang=en> pub fn adds(rd: u8, rn: u8, imm: ShiftedImmediate, num_bits: u8) -> Self { Self { rd, rn, imm, s: S::UpdateFlags, op: Op::Add, sf: num_bits.into() } } /// CMP (immediate) - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CMP--immediate---Compare--immediate---an-alias-of-SUBS--immediate--?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CMP--immediate---Compare--immediate---an-alias-of-SUBS--immediate--?lang=en> pub fn cmp(rn: u8, imm: ShiftedImmediate, num_bits: u8) -> Self { Self::subs(31, rn, imm, num_bits) } /// SUB (immediate) - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/SUB--immediate---Subtract--immediate--?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/SUB--immediate---Subtract--immediate--?lang=en> pub fn sub(rd: u8, rn: u8, imm: ShiftedImmediate, num_bits: u8) -> Self { Self { rd, rn, imm, s: S::LeaveFlags, op: Op::Sub, sf: num_bits.into() } } /// SUBS (immediate, set flags) - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/SUBS--immediate---Subtract--immediate---setting-flags-?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/SUBS--immediate---Subtract--immediate---setting-flags-?lang=en> pub fn subs(rd: u8, rn: u8, imm: ShiftedImmediate, num_bits: u8) -> Self { Self { rd, rn, imm, s: S::UpdateFlags, op: Op::Sub, sf: num_bits.into() } } } -/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Immediate?lang=en +/// <https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Immediate?lang=en> const FAMILY: u32 = 0b1000; impl From<DataImm> for u32 { diff --git a/yjit/src/asm/arm64/inst/data_reg.rs b/yjit/src/asm/arm64/inst/data_reg.rs index a742121f1f..ed4afa956b 100644 --- a/yjit/src/asm/arm64/inst/data_reg.rs +++ b/yjit/src/asm/arm64/inst/data_reg.rs @@ -57,7 +57,7 @@ pub struct DataReg { impl DataReg { /// ADD (shifted register) - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADD--shifted-register---Add--shifted-register--?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADD--shifted-register---Add--shifted-register--?lang=en> pub fn add(rd: u8, rn: u8, rm: u8, num_bits: u8) 
-> Self { Self { rd, @@ -72,7 +72,7 @@ impl DataReg { } /// ADDS (shifted register, set flags) - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADDS--shifted-register---Add--shifted-register---setting-flags-?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADDS--shifted-register---Add--shifted-register---setting-flags-?lang=en> pub fn adds(rd: u8, rn: u8, rm: u8, num_bits: u8) -> Self { Self { rd, @@ -87,13 +87,13 @@ impl DataReg { } /// CMP (shifted register) - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CMP--shifted-register---Compare--shifted-register---an-alias-of-SUBS--shifted-register--?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CMP--shifted-register---Compare--shifted-register---an-alias-of-SUBS--shifted-register--?lang=en> pub fn cmp(rn: u8, rm: u8, num_bits: u8) -> Self { Self::subs(31, rn, rm, num_bits) } /// SUB (shifted register) - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/SUB--shifted-register---Subtract--shifted-register--?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/SUB--shifted-register---Subtract--shifted-register--?lang=en> pub fn sub(rd: u8, rn: u8, rm: u8, num_bits: u8) -> Self { Self { rd, @@ -108,7 +108,7 @@ impl DataReg { } /// SUBS (shifted register, set flags) - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/SUBS--shifted-register---Subtract--shifted-register---setting-flags-?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/SUBS--shifted-register---Subtract--shifted-register---setting-flags-?lang=en> pub fn subs(rd: u8, rn: u8, rm: u8, num_bits: u8) -> Self { Self { rd, @@ -123,7 +123,7 @@ impl DataReg { } } -/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Register?lang=en +/// <https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Register?lang=en> const FAMILY: u32 = 0b0101; impl From<DataReg> for u32 { diff --git a/yjit/src/asm/arm64/inst/halfword_imm.rs b/yjit/src/asm/arm64/inst/halfword_imm.rs index 0ddae8e8de..863ac947dd 100644 --- a/yjit/src/asm/arm64/inst/halfword_imm.rs +++ b/yjit/src/asm/arm64/inst/halfword_imm.rs @@ -53,43 +53,43 @@ pub struct HalfwordImm { impl HalfwordImm { /// LDRH - /// https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate-- + /// <https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate--> pub fn ldrh(rt: u8, rn: u8, imm12: i16) -> Self { Self { rt, rn, index: Index::None, imm: imm12, op: Op::Load } } /// LDRH (pre-index) - /// https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate-- + /// <https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate--> pub fn ldrh_pre(rt: u8, rn: u8, imm9: i16) -> Self { Self { rt, rn, index: Index::PreIndex, imm: imm9, op: Op::Load } } /// LDRH (post-index) - /// https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate-- + /// <https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate--> pub fn ldrh_post(rt: 
u8, rn: u8, imm9: i16) -> Self { Self { rt, rn, index: Index::PostIndex, imm: imm9, op: Op::Load } } /// STRH - /// https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/STRH--immediate---Store-Register-Halfword--immediate-- + /// <https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/STRH--immediate---Store-Register-Halfword--immediate--> pub fn strh(rt: u8, rn: u8, imm12: i16) -> Self { Self { rt, rn, index: Index::None, imm: imm12, op: Op::Store } } /// STRH (pre-index) - /// https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/STRH--immediate---Store-Register-Halfword--immediate-- + /// <https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/STRH--immediate---Store-Register-Halfword--immediate--> pub fn strh_pre(rt: u8, rn: u8, imm9: i16) -> Self { Self { rt, rn, index: Index::PreIndex, imm: imm9, op: Op::Store } } /// STRH (post-index) - /// https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/STRH--immediate---Store-Register-Halfword--immediate-- + /// <https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/STRH--immediate---Store-Register-Halfword--immediate--> pub fn strh_post(rt: u8, rn: u8, imm9: i16) -> Self { Self { rt, rn, index: Index::PostIndex, imm: imm9, op: Op::Store } } } -/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Loads-and-Stores?lang=en +/// <https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Loads-and-Stores?lang=en> const FAMILY: u32 = 0b111100; impl From<HalfwordImm> for u32 { diff --git a/yjit/src/asm/arm64/inst/load_literal.rs b/yjit/src/asm/arm64/inst/load_literal.rs index 3eade205c8..817e893553 100644 --- a/yjit/src/asm/arm64/inst/load_literal.rs +++ b/yjit/src/asm/arm64/inst/load_literal.rs @@ -40,13 +40,13 @@ pub struct LoadLiteral { impl LoadLiteral { /// LDR (load literal) - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--literal---Load-Register--literal--?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--literal---Load-Register--literal--?lang=en> pub fn ldr_literal(rt: u8, offset: InstructionOffset, num_bits: u8) -> Self { Self { rt, offset, opc: num_bits.into() } } } -/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Loads-and-Stores?lang=en +/// <https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Loads-and-Stores?lang=en> const FAMILY: u32 = 0b0100; impl From<LoadLiteral> for u32 { diff --git a/yjit/src/asm/arm64/inst/load_register.rs b/yjit/src/asm/arm64/inst/load_register.rs index 3426b9ba5f..3d94e8da1f 100644 --- a/yjit/src/asm/arm64/inst/load_register.rs +++ b/yjit/src/asm/arm64/inst/load_register.rs @@ -61,13 +61,13 @@ pub struct LoadRegister { impl LoadRegister { /// LDR - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--register---Load-Register--register--?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--register---Load-Register--register--?lang=en> pub fn ldr(rt: u8, rn: u8, rm: u8, num_bits: u8) -> Self { Self { rt, rn, s: S::NoShift, option: Option::LSL, rm, size: num_bits.into() } } } -/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Loads-and-Stores?lang=en +/// <https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Loads-and-Stores?lang=en> const FAMILY: u32 = 0b0100; impl From<LoadRegister> for u32 { diff --git 
a/yjit/src/asm/arm64/inst/load_store.rs b/yjit/src/asm/arm64/inst/load_store.rs index b5c8a3c294..e27909ae35 100644 --- a/yjit/src/asm/arm64/inst/load_store.rs +++ b/yjit/src/asm/arm64/inst/load_store.rs @@ -66,67 +66,67 @@ pub struct LoadStore { impl LoadStore { /// LDR (immediate, post-index) - /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LDR--immediate---Load-Register--immediate-- + /// <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LDR--immediate---Load-Register--immediate--> pub fn ldr_post(rt: u8, rn: u8, imm9: i16, num_bits: u8) -> Self { Self { rt, rn, idx: Index::PostIndex, imm9, opc: Opc::LDR, size: num_bits.into() } } /// LDR (immediate, pre-index) - /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LDR--immediate---Load-Register--immediate-- + /// <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LDR--immediate---Load-Register--immediate--> pub fn ldr_pre(rt: u8, rn: u8, imm9: i16, num_bits: u8) -> Self { Self { rt, rn, idx: Index::PreIndex, imm9, opc: Opc::LDR, size: num_bits.into() } } /// LDUR (load register, unscaled) - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDUR--Load-Register--unscaled--?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDUR--Load-Register--unscaled--?lang=en> pub fn ldur(rt: u8, rn: u8, imm9: i16, num_bits: u8) -> Self { Self { rt, rn, idx: Index::None, imm9, opc: Opc::LDR, size: num_bits.into() } } /// LDURH Load Register Halfword (unscaled) - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDURH--Load-Register-Halfword--unscaled--?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDURH--Load-Register-Halfword--unscaled--?lang=en> pub fn ldurh(rt: u8, rn: u8, imm9: i16) -> Self { Self { rt, rn, idx: Index::None, imm9, opc: Opc::LDR, size: Size::Size16 } } /// LDURB (load register, byte, unscaled) - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDURB--Load-Register-Byte--unscaled--?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDURB--Load-Register-Byte--unscaled--?lang=en> pub fn ldurb(rt: u8, rn: u8, imm9: i16) -> Self { Self { rt, rn, idx: Index::None, imm9, opc: Opc::LDR, size: Size::Size8 } } /// LDURSW (load register, unscaled, signed) - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDURSW--Load-Register-Signed-Word--unscaled--?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDURSW--Load-Register-Signed-Word--unscaled--?lang=en> pub fn ldursw(rt: u8, rn: u8, imm9: i16) -> Self { Self { rt, rn, idx: Index::None, imm9, opc: Opc::LDURSW, size: Size::Size32 } } /// STR (immediate, post-index) - /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/STR--immediate---Store-Register--immediate-- + /// <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/STR--immediate---Store-Register--immediate--> pub fn str_post(rt: u8, rn: u8, imm9: i16, num_bits: u8) -> Self { Self { rt, rn, idx: Index::PostIndex, imm9, opc: Opc::STR, size: num_bits.into() } } /// STR (immediate, pre-index) - /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/STR--immediate---Store-Register--immediate-- + /// 
<https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/STR--immediate---Store-Register--immediate--> pub fn str_pre(rt: u8, rn: u8, imm9: i16, num_bits: u8) -> Self { Self { rt, rn, idx: Index::PreIndex, imm9, opc: Opc::STR, size: num_bits.into() } } /// STUR (store register, unscaled) - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STUR--Store-Register--unscaled--?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STUR--Store-Register--unscaled--?lang=en> pub fn stur(rt: u8, rn: u8, imm9: i16, num_bits: u8) -> Self { Self { rt, rn, idx: Index::None, imm9, opc: Opc::STR, size: num_bits.into() } } /// STURH (store register, halfword, unscaled) - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STURH--Store-Register-Halfword--unscaled--?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STURH--Store-Register-Halfword--unscaled--?lang=en> pub fn sturh(rt: u8, rn: u8, imm9: i16) -> Self { Self { rt, rn, idx: Index::None, imm9, opc: Opc::STR, size: Size::Size16 } } } -/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Loads-and-Stores?lang=en +/// <https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Loads-and-Stores?lang=en> const FAMILY: u32 = 0b0100; impl From<LoadStore> for u32 { diff --git a/yjit/src/asm/arm64/inst/load_store_exclusive.rs b/yjit/src/asm/arm64/inst/load_store_exclusive.rs index 8216c2200a..1106b4cb37 100644 --- a/yjit/src/asm/arm64/inst/load_store_exclusive.rs +++ b/yjit/src/asm/arm64/inst/load_store_exclusive.rs @@ -52,19 +52,19 @@ pub struct LoadStoreExclusive { impl LoadStoreExclusive { /// LDAXR - /// https://developer.arm.com/documentation/ddi0602/2021-12/Base-Instructions/LDAXR--Load-Acquire-Exclusive-Register- + /// <https://developer.arm.com/documentation/ddi0602/2021-12/Base-Instructions/LDAXR--Load-Acquire-Exclusive-Register-> pub fn ldaxr(rt: u8, rn: u8, num_bits: u8) -> Self { Self { rt, rn, rs: 31, op: Op::Load, size: num_bits.into() } } /// STLXR - /// https://developer.arm.com/documentation/ddi0602/2021-12/Base-Instructions/STLXR--Store-Release-Exclusive-Register- + /// <https://developer.arm.com/documentation/ddi0602/2021-12/Base-Instructions/STLXR--Store-Release-Exclusive-Register-> pub fn stlxr(rs: u8, rt: u8, rn: u8, num_bits: u8) -> Self { Self { rt, rn, rs, op: Op::Store, size: num_bits.into() } } } -/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Loads-and-Stores?lang=en +/// <https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Loads-and-Stores?lang=en> const FAMILY: u32 = 0b0100; impl From<LoadStoreExclusive> for u32 { diff --git a/yjit/src/asm/arm64/inst/logical_imm.rs b/yjit/src/asm/arm64/inst/logical_imm.rs index b24916f8a5..d57ad5f5b7 100644 --- a/yjit/src/asm/arm64/inst/logical_imm.rs +++ b/yjit/src/asm/arm64/inst/logical_imm.rs @@ -44,43 +44,43 @@ pub struct LogicalImm { impl LogicalImm { /// AND (bitmask immediate) - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/AND--immediate---Bitwise-AND--immediate--?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/AND--immediate---Bitwise-AND--immediate--?lang=en> pub fn and(rd: u8, rn: u8, imm: BitmaskImmediate, num_bits: u8) -> Self { Self { rd, rn, imm, opc: Opc::And, sf: num_bits.into() } } /// ANDS (bitmask immediate) - /// 
https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ANDS--immediate---Bitwise-AND--immediate---setting-flags-?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ANDS--immediate---Bitwise-AND--immediate---setting-flags-?lang=en> pub fn ands(rd: u8, rn: u8, imm: BitmaskImmediate, num_bits: u8) -> Self { Self { rd, rn, imm, opc: Opc::Ands, sf: num_bits.into() } } /// EOR (bitmask immediate) - /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/EOR--immediate---Bitwise-Exclusive-OR--immediate-- + /// <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/EOR--immediate---Bitwise-Exclusive-OR--immediate--> pub fn eor(rd: u8, rn: u8, imm: BitmaskImmediate, num_bits: u8) -> Self { Self { rd, rn, imm, opc: Opc::Eor, sf: num_bits.into() } } /// MOV (bitmask immediate) - /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/MOV--bitmask-immediate---Move--bitmask-immediate---an-alias-of-ORR--immediate--?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/MOV--bitmask-immediate---Move--bitmask-immediate---an-alias-of-ORR--immediate--?lang=en> pub fn mov(rd: u8, imm: BitmaskImmediate, num_bits: u8) -> Self { Self { rd, rn: 0b11111, imm, opc: Opc::Orr, sf: num_bits.into() } } /// ORR (bitmask immediate) - /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/ORR--immediate---Bitwise-OR--immediate-- + /// <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/ORR--immediate---Bitwise-OR--immediate--> pub fn orr(rd: u8, rn: u8, imm: BitmaskImmediate, num_bits: u8) -> Self { Self { rd, rn, imm, opc: Opc::Orr, sf: num_bits.into() } } /// TST (bitmask immediate) - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/TST--immediate---Test-bits--immediate---an-alias-of-ANDS--immediate--?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/TST--immediate---Test-bits--immediate---an-alias-of-ANDS--immediate--?lang=en> pub fn tst(rn: u8, imm: BitmaskImmediate, num_bits: u8) -> Self { Self::ands(31, rn, imm, num_bits) } } -/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Immediate?lang=en#log_imm +/// <https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Immediate?lang=en#log_imm> const FAMILY: u32 = 0b1001; impl From<LogicalImm> for u32 { diff --git a/yjit/src/asm/arm64/inst/logical_reg.rs b/yjit/src/asm/arm64/inst/logical_reg.rs index a96805c9f9..18edff606f 100644 --- a/yjit/src/asm/arm64/inst/logical_reg.rs +++ b/yjit/src/asm/arm64/inst/logical_reg.rs @@ -70,55 +70,55 @@ pub struct LogicalReg { impl LogicalReg { /// AND (shifted register) - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/AND--shifted-register---Bitwise-AND--shifted-register--?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/AND--shifted-register---Bitwise-AND--shifted-register--?lang=en> pub fn and(rd: u8, rn: u8, rm: u8, num_bits: u8) -> Self { Self { rd, rn, imm6: 0, rm, n: N::No, shift: Shift::LSL, opc: Opc::And, sf: num_bits.into() } } /// ANDS (shifted register) - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ANDS--shifted-register---Bitwise-AND--shifted-register---setting-flags-?lang=en + /// 
<https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ANDS--shifted-register---Bitwise-AND--shifted-register---setting-flags-?lang=en> pub fn ands(rd: u8, rn: u8, rm: u8, num_bits: u8) -> Self { Self { rd, rn, imm6: 0, rm, n: N::No, shift: Shift::LSL, opc: Opc::Ands, sf: num_bits.into() } } /// EOR (shifted register) - /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/EOR--shifted-register---Bitwise-Exclusive-OR--shifted-register-- + /// <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/EOR--shifted-register---Bitwise-Exclusive-OR--shifted-register--> pub fn eor(rd: u8, rn: u8, rm: u8, num_bits: u8) -> Self { Self { rd, rn, imm6: 0, rm, n: N::No, shift: Shift::LSL, opc: Opc::Eor, sf: num_bits.into() } } /// MOV (register) - /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/MOV--register---Move--register---an-alias-of-ORR--shifted-register--?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/MOV--register---Move--register---an-alias-of-ORR--shifted-register--?lang=en> pub fn mov(rd: u8, rm: u8, num_bits: u8) -> Self { Self { rd, rn: 0b11111, imm6: 0, rm, n: N::No, shift: Shift::LSL, opc: Opc::Orr, sf: num_bits.into() } } /// MVN (shifted register) - /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/MVN--Bitwise-NOT--an-alias-of-ORN--shifted-register--?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/MVN--Bitwise-NOT--an-alias-of-ORN--shifted-register--?lang=en> pub fn mvn(rd: u8, rm: u8, num_bits: u8) -> Self { Self { rd, rn: 0b11111, imm6: 0, rm, n: N::Yes, shift: Shift::LSL, opc: Opc::Orr, sf: num_bits.into() } } /// ORN (shifted register) - /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/ORN--shifted-register---Bitwise-OR-NOT--shifted-register-- + /// <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/ORN--shifted-register---Bitwise-OR-NOT--shifted-register--> pub fn orn(rd: u8, rn: u8, rm: u8, num_bits: u8) -> Self { Self { rd, rn, imm6: 0, rm, n: N::Yes, shift: Shift::LSL, opc: Opc::Orr, sf: num_bits.into() } } /// ORR (shifted register) - /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/ORR--shifted-register---Bitwise-OR--shifted-register-- + /// <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/ORR--shifted-register---Bitwise-OR--shifted-register--> pub fn orr(rd: u8, rn: u8, rm: u8, num_bits: u8) -> Self { Self { rd, rn, imm6: 0, rm, n: N::No, shift: Shift::LSL, opc: Opc::Orr, sf: num_bits.into() } } /// TST (shifted register) - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/TST--shifted-register---Test--shifted-register---an-alias-of-ANDS--shifted-register--?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/TST--shifted-register---Test--shifted-register---an-alias-of-ANDS--shifted-register--?lang=en> pub fn tst(rn: u8, rm: u8, num_bits: u8) -> Self { Self { rd: 31, rn, imm6: 0, rm, n: N::No, shift: Shift::LSL, opc: Opc::Ands, sf: num_bits.into() } } } -/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Register?lang=en +/// <https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Register?lang=en> const FAMILY: u32 = 0b0101; impl From<LogicalReg> for u32 { diff --git a/yjit/src/asm/arm64/inst/madd.rs 
b/yjit/src/asm/arm64/inst/madd.rs new file mode 100644 index 0000000000..71f2ab230a --- /dev/null +++ b/yjit/src/asm/arm64/inst/madd.rs @@ -0,0 +1,73 @@ +use super::super::arg::Sf; + +/// The struct that represents an A64 multiply-add instruction that can be +/// encoded. +/// +/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+ +/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 | +/// | 0 0 1 1 0 1 1 0 0 0 0 | +/// | sf rm.............. ra.............. rn.............. rd.............. | +/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+ +/// +pub struct MAdd { + /// The number of the general-purpose destination register. + rd: u8, + + /// The number of the first general-purpose source register. + rn: u8, + + /// The number of the third general-purpose source register. + ra: u8, + + /// The number of the second general-purpose source register. + rm: u8, + + /// The size of the registers of this instruction. + sf: Sf +} + +impl MAdd { + /// MUL + /// <https://developer.arm.com/documentation/ddi0602/2023-06/Base-Instructions/MUL--Multiply--an-alias-of-MADD-> + pub fn mul(rd: u8, rn: u8, rm: u8, num_bits: u8) -> Self { + Self { rd, rn, ra: 0b11111, rm, sf: num_bits.into() } + } +} + +impl From<MAdd> for u32 { + /// Convert an instruction into a 32-bit value. + fn from(inst: MAdd) -> Self { + 0 + | ((inst.sf as u32) << 31) + | (0b11011 << 24) + | ((inst.rm as u32) << 16) + | ((inst.ra as u32) << 10) + | ((inst.rn as u32) << 5) + | (inst.rd as u32) + } +} + +impl From<MAdd> for [u8; 4] { + /// Convert an instruction into a 4 byte array. + fn from(inst: MAdd) -> [u8; 4] { + let result: u32 = inst.into(); + result.to_le_bytes() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_mul_32() { + let result: u32 = MAdd::mul(0, 1, 2, 32).into(); + assert_eq!(0x1B027C20, result); + } + + #[test] + fn test_mul_64() { + let result: u32 = MAdd::mul(0, 1, 2, 64).into(); + assert_eq!(0x9B027C20, result); + } +} diff --git a/yjit/src/asm/arm64/inst/mod.rs b/yjit/src/asm/arm64/inst/mod.rs index 9821e6a334..bfffd914ef 100644 --- a/yjit/src/asm/arm64/inst/mod.rs +++ b/yjit/src/asm/arm64/inst/mod.rs @@ -16,6 +16,8 @@ mod load_store; mod load_store_exclusive; mod logical_imm; mod logical_reg; +mod madd; +mod smulh; mod mov; mod nop; mod pc_rel; @@ -40,6 +42,8 @@ pub use load_store::LoadStore; pub use load_store_exclusive::LoadStoreExclusive; pub use logical_imm::LogicalImm; pub use logical_reg::LogicalReg; +pub use madd::MAdd; +pub use smulh::SMulH; pub use mov::Mov; pub use nop::Nop; pub use pc_rel::PCRelative; diff --git a/yjit/src/asm/arm64/inst/mov.rs b/yjit/src/asm/arm64/inst/mov.rs index e7cb9215b0..eae4565c3a 100644 --- a/yjit/src/asm/arm64/inst/mov.rs +++ b/yjit/src/asm/arm64/inst/mov.rs @@ -56,19 +56,19 @@ pub struct Mov { impl Mov { /// MOVK - /// https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/MOVK--Move-wide-with-keep-?lang=en + /// <https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/MOVK--Move-wide-with-keep-?lang=en> pub fn movk(rd: u8, imm16: u16, hw: u8, num_bits: u8) -> Self { Self { rd, imm16, hw: hw.into(), op: Op::MOVK, sf: num_bits.into() } } /// MOVZ - /// https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/MOVZ--Move-wide-with-zero-?lang=en + /// 
<https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/MOVZ--Move-wide-with-zero-?lang=en> pub fn movz(rd: u8, imm16: u16, hw: u8, num_bits: u8) -> Self { Self { rd, imm16, hw: hw.into(), op: Op::MOVZ, sf: num_bits.into() } } } -/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Immediate?lang=en +/// <https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Immediate?lang=en> const FAMILY: u32 = 0b1000; impl From<Mov> for u32 { diff --git a/yjit/src/asm/arm64/inst/nop.rs b/yjit/src/asm/arm64/inst/nop.rs index d58b3574a9..081d8558f5 100644 --- a/yjit/src/asm/arm64/inst/nop.rs +++ b/yjit/src/asm/arm64/inst/nop.rs @@ -10,7 +10,7 @@ pub struct Nop; impl Nop { /// NOP - /// https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/NOP--No-Operation- + /// <https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/NOP--No-Operation-> pub fn nop() -> Self { Self {} } diff --git a/yjit/src/asm/arm64/inst/pc_rel.rs b/yjit/src/asm/arm64/inst/pc_rel.rs index bd1a2b9367..2ea586a778 100644 --- a/yjit/src/asm/arm64/inst/pc_rel.rs +++ b/yjit/src/asm/arm64/inst/pc_rel.rs @@ -30,19 +30,19 @@ pub struct PCRelative { impl PCRelative { /// ADR - /// https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/ADR--Form-PC-relative-address- + /// <https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/ADR--Form-PC-relative-address-> pub fn adr(rd: u8, imm: i32) -> Self { Self { rd, imm, op: Op::ADR } } /// ADRP - /// https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/ADRP--Form-PC-relative-address-to-4KB-page- + /// <https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/ADRP--Form-PC-relative-address-to-4KB-page-> pub fn adrp(rd: u8, imm: i32) -> Self { Self { rd, imm: imm >> 12, op: Op::ADRP } } } -/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Immediate?lang=en +/// <https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Immediate?lang=en> const FAMILY: u32 = 0b1000; impl From<PCRelative> for u32 { diff --git a/yjit/src/asm/arm64/inst/reg_pair.rs b/yjit/src/asm/arm64/inst/reg_pair.rs index 87690e3b4a..9bffcd8479 100644 --- a/yjit/src/asm/arm64/inst/reg_pair.rs +++ b/yjit/src/asm/arm64/inst/reg_pair.rs @@ -68,49 +68,49 @@ impl RegisterPair { } /// LDP (signed offset) - /// LDP <Xt1>, <Xt2>, [<Xn|SP>{, #<imm>}] - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers-?lang=en + /// `LDP <Xt1>, <Xt2>, [<Xn|SP>{, #<imm>}]` + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers-?lang=en> pub fn ldp(rt1: u8, rt2: u8, rn: u8, disp: i16, num_bits: u8) -> Self { Self::new(rt1, rt2, rn, disp, Index::LoadSignedOffset, num_bits) } /// LDP (pre-index) - /// LDP <Xt1>, <Xt2>, [<Xn|SP>, #<imm>]! 
- /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers-?lang=en + /// `LDP <Xt1>, <Xt2>, [<Xn|SP>, #<imm>]!` + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers-?lang=en> pub fn ldp_pre(rt1: u8, rt2: u8, rn: u8, disp: i16, num_bits: u8) -> Self { Self::new(rt1, rt2, rn, disp, Index::LoadPreIndex, num_bits) } /// LDP (post-index) - /// LDP <Xt1>, <Xt2>, [<Xn|SP>], #<imm> - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers-?lang=en + /// `LDP <Xt1>, <Xt2>, [<Xn|SP>], #<imm>` + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers-?lang=en> pub fn ldp_post(rt1: u8, rt2: u8, rn: u8, disp: i16, num_bits: u8) -> Self { Self::new(rt1, rt2, rn, disp, Index::LoadPostIndex, num_bits) } /// STP (signed offset) - /// STP <Xt1>, <Xt2>, [<Xn|SP>{, #<imm>}] - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STP--Store-Pair-of-Registers-?lang=en + /// `STP <Xt1>, <Xt2>, [<Xn|SP>{, #<imm>}]` + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STP--Store-Pair-of-Registers-?lang=en> pub fn stp(rt1: u8, rt2: u8, rn: u8, disp: i16, num_bits: u8) -> Self { Self::new(rt1, rt2, rn, disp, Index::StoreSignedOffset, num_bits) } /// STP (pre-index) - /// STP <Xt1>, <Xt2>, [<Xn|SP>, #<imm>]! - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STP--Store-Pair-of-Registers-?lang=en + /// `STP <Xt1>, <Xt2>, [<Xn|SP>, #<imm>]!` + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STP--Store-Pair-of-Registers-?lang=en> pub fn stp_pre(rt1: u8, rt2: u8, rn: u8, disp: i16, num_bits: u8) -> Self { Self::new(rt1, rt2, rn, disp, Index::StorePreIndex, num_bits) } /// STP (post-index) - /// STP <Xt1>, <Xt2>, [<Xn|SP>], #<imm> - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STP--Store-Pair-of-Registers-?lang=en + /// `STP <Xt1>, <Xt2>, [<Xn|SP>], #<imm>` + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STP--Store-Pair-of-Registers-?lang=en> pub fn stp_post(rt1: u8, rt2: u8, rn: u8, disp: i16, num_bits: u8) -> Self { Self::new(rt1, rt2, rn, disp, Index::StorePostIndex, num_bits) } } -/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Loads-and-Stores?lang=en +/// <https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Loads-and-Stores?lang=en> const FAMILY: u32 = 0b0100; impl From<RegisterPair> for u32 { diff --git a/yjit/src/asm/arm64/inst/sbfm.rs b/yjit/src/asm/arm64/inst/sbfm.rs index 8602998980..12944ba722 100644 --- a/yjit/src/asm/arm64/inst/sbfm.rs +++ b/yjit/src/asm/arm64/inst/sbfm.rs @@ -32,7 +32,7 @@ pub struct SBFM { impl SBFM { /// ASR - /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/ASR--immediate---Arithmetic-Shift-Right--immediate---an-alias-of-SBFM-?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/ASR--immediate---Arithmetic-Shift-Right--immediate---an-alias-of-SBFM-?lang=en> pub fn asr(rd: u8, rn: u8, shift: u8, num_bits: u8) -> Self { let (imms, n) = if num_bits == 64 { (0b111111, true) @@ -44,13 +44,13 @@ impl SBFM { } /// SXTW - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/SXTW--Sign-Extend-Word--an-alias-of-SBFM-?lang=en + /// 
<https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/SXTW--Sign-Extend-Word--an-alias-of-SBFM-?lang=en> pub fn sxtw(rd: u8, rn: u8) -> Self { Self { rd, rn, immr: 0, imms: 31, n: true, sf: Sf::Sf64 } } } -/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Immediate?lang=en#bitfield +/// <https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Immediate?lang=en#bitfield> const FAMILY: u32 = 0b1001; impl From<SBFM> for u32 { diff --git a/yjit/src/asm/arm64/inst/shift_imm.rs b/yjit/src/asm/arm64/inst/shift_imm.rs index 3d2685a997..9dac9a1408 100644 --- a/yjit/src/asm/arm64/inst/shift_imm.rs +++ b/yjit/src/asm/arm64/inst/shift_imm.rs @@ -38,13 +38,13 @@ pub struct ShiftImm { impl ShiftImm { /// LSL (immediate) - /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LSL--immediate---Logical-Shift-Left--immediate---an-alias-of-UBFM-?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LSL--immediate---Logical-Shift-Left--immediate---an-alias-of-UBFM-?lang=en> pub fn lsl(rd: u8, rn: u8, shift: u8, num_bits: u8) -> Self { ShiftImm { rd, rn, shift, opc: Opc::LSL, sf: num_bits.into() } } /// LSR (immediate) - /// https://developer.arm.com/documentation/ddi0602/2021-12/Base-Instructions/LSR--immediate---Logical-Shift-Right--immediate---an-alias-of-UBFM-?lang=en + /// <https://developer.arm.com/documentation/ddi0602/2021-12/Base-Instructions/LSR--immediate---Logical-Shift-Right--immediate---an-alias-of-UBFM-?lang=en> pub fn lsr(rd: u8, rn: u8, shift: u8, num_bits: u8) -> Self { ShiftImm { rd, rn, shift, opc: Opc::LSR, sf: num_bits.into() } } @@ -85,7 +85,7 @@ impl ShiftImm { } } -/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Immediate?lang=en#bitfield +/// <https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Immediate?lang=en#bitfield> const FAMILY: u32 = 0b10011; impl From<ShiftImm> for u32 { diff --git a/yjit/src/asm/arm64/inst/smulh.rs b/yjit/src/asm/arm64/inst/smulh.rs new file mode 100644 index 0000000000..f355cb6531 --- /dev/null +++ b/yjit/src/asm/arm64/inst/smulh.rs @@ -0,0 +1,60 @@ +/// The struct that represents an A64 signed multiply high instruction +/// +/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+ +/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 | +/// | 1 0 0 1 1 0 1 1 0 1 0 0 | +/// | rm.............. ra.............. rn.............. rd.............. | +/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+ +/// +pub struct SMulH { + /// The number of the general-purpose destination register. + rd: u8, + + /// The number of the first general-purpose source register. + rn: u8, + + /// The number of the third general-purpose source register. + ra: u8, + + /// The number of the second general-purpose source register. + rm: u8, +} + +impl SMulH { + /// SMULH + /// <https://developer.arm.com/documentation/ddi0602/2023-06/Base-Instructions/SMULH--Signed-Multiply-High-> + pub fn smulh(rd: u8, rn: u8, rm: u8) -> Self { + Self { rd, rn, ra: 0b11111, rm } + } +} + +impl From<SMulH> for u32 { + /// Convert an instruction into a 32-bit value. 
+ fn from(inst: SMulH) -> Self { + 0 + | (0b10011011010 << 21) + | ((inst.rm as u32) << 16) + | ((inst.ra as u32) << 10) + | ((inst.rn as u32) << 5) + | (inst.rd as u32) + } +} + +impl From<SMulH> for [u8; 4] { + /// Convert an instruction into a 4 byte array. + fn from(inst: SMulH) -> [u8; 4] { + let result: u32 = inst.into(); + result.to_le_bytes() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_smulh() { + let result: u32 = SMulH::smulh(0, 1, 2).into(); + assert_eq!(0x9b427c20, result); + } +} diff --git a/yjit/src/asm/arm64/inst/sys_reg.rs b/yjit/src/asm/arm64/inst/sys_reg.rs index 108737a870..7191dfbfd9 100644 --- a/yjit/src/asm/arm64/inst/sys_reg.rs +++ b/yjit/src/asm/arm64/inst/sys_reg.rs @@ -32,19 +32,19 @@ pub struct SysReg { impl SysReg { /// MRS (register) - /// https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/MRS--Move-System-Register-?lang=en + /// <https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/MRS--Move-System-Register-?lang=en> pub fn mrs(rt: u8, systemreg: SystemRegister) -> Self { SysReg { rt, systemreg, l: L::MRS } } /// MSR (register) - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/MSR--register---Move-general-purpose-register-to-System-Register-?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/MSR--register---Move-general-purpose-register-to-System-Register-?lang=en> pub fn msr(systemreg: SystemRegister, rt: u8) -> Self { SysReg { rt, systemreg, l: L::MSR } } } -/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en#systemmove +/// <https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en#systemmove> const FAMILY: u32 = 0b110101010001; impl From<SysReg> for u32 { diff --git a/yjit/src/asm/arm64/inst/test_bit.rs b/yjit/src/asm/arm64/inst/test_bit.rs index c57a05ad2b..f7aeca70fd 100644 --- a/yjit/src/asm/arm64/inst/test_bit.rs +++ b/yjit/src/asm/arm64/inst/test_bit.rs @@ -60,19 +60,19 @@ pub struct TestBit { impl TestBit { /// TBNZ - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/TBNZ--Test-bit-and-Branch-if-Nonzero-?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/TBNZ--Test-bit-and-Branch-if-Nonzero-?lang=en> pub fn tbnz(rt: u8, bit_num: u8, offset: i16) -> Self { Self { rt, imm14: offset, b40: bit_num & 0b11111, op: Op::TBNZ, b5: bit_num.into() } } /// TBZ - /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/TBZ--Test-bit-and-Branch-if-Zero-?lang=en + /// <https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/TBZ--Test-bit-and-Branch-if-Zero-?lang=en> pub fn tbz(rt: u8, bit_num: u8, offset: i16) -> Self { Self { rt, imm14: offset, b40: bit_num & 0b11111, op: Op::TBZ, b5: bit_num.into() } } } -/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en +/// <https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en> const FAMILY: u32 = 0b11011; impl From<TestBit> for u32 { diff --git a/yjit/src/asm/arm64/mod.rs b/yjit/src/asm/arm64/mod.rs index 9bc697ecfb..18b5270f9d 100644 --- a/yjit/src/asm/arm64/mod.rs +++ b/yjit/src/asm/arm64/mod.rs @@ -186,7 +186,7 @@ pub fn asr(cb: &mut CodeBlock, 
rd: A64Opnd, rn: A64Opnd, shift: A64Opnd) { SBFM::asr(rd.reg_no, rn.reg_no, shift.try_into().unwrap(), rd.num_bits).into() }, - _ => panic!("Invalid operand combination to asr instruction."), + _ => panic!("Invalid operand combination to asr instruction: asr {:?}, {:?}, {:?}", rd, rn, shift), }; cb.write_bytes(&bytes); @@ -215,6 +215,9 @@ pub const fn bcond_offset_fits_bits(offset: i64) -> bool { imm_fits_bits(offset, 19) } +/// CBZ and CBNZ also have a limit of 19 bits for the branch offset. +pub use bcond_offset_fits_bits as cmp_branch_offset_fits_bits; + /// B.cond - branch to target if condition is true pub fn bcond(cb: &mut CodeBlock, cond: u8, offset: InstructionOffset) { assert!(bcond_offset_fits_bits(offset.into()), "The offset must be 19 bits or less."); @@ -254,7 +257,7 @@ pub fn br(cb: &mut CodeBlock, rn: A64Opnd) { /// BRK - create a breakpoint pub fn brk(cb: &mut CodeBlock, imm16: A64Opnd) { let bytes: [u8; 4] = match imm16 { - A64Opnd::None => Breakpoint::brk(0).into(), + A64Opnd::None => Breakpoint::brk(0xf000).into(), A64Opnd::UImm(imm16) => { assert!(uimm_fits_bits(imm16, 16), "The immediate operand must be 16 bits or less."); Breakpoint::brk(imm16 as u16).into() @@ -276,6 +279,9 @@ pub fn cmp(cb: &mut CodeBlock, rn: A64Opnd, rm: A64Opnd) { DataReg::cmp(rn.reg_no, rm.reg_no, rn.num_bits).into() }, + (A64Opnd::Reg(rn), A64Opnd::Imm(imm12)) => { + DataImm::cmp(rn.reg_no, (imm12 as u64).try_into().unwrap(), rn.num_bits).into() + }, (A64Opnd::Reg(rn), A64Opnd::UImm(imm12)) => { DataImm::cmp(rn.reg_no, imm12.try_into().unwrap(), rn.num_bits).into() }, @@ -699,6 +705,35 @@ pub fn msr(cb: &mut CodeBlock, systemregister: SystemRegister, rt: A64Opnd) { cb.write_bytes(&bytes); } +/// MUL - multiply two registers, put the result in a third register +pub fn mul(cb: &mut CodeBlock, rd: A64Opnd, rn: A64Opnd, rm: A64Opnd) { + let bytes: [u8; 4] = match (rd, rn, rm) { + (A64Opnd::Reg(rd), A64Opnd::Reg(rn), A64Opnd::Reg(rm)) => { + assert!(rd.num_bits == rn.num_bits && rn.num_bits == rm.num_bits, "Expected registers to be the same size"); + + MAdd::mul(rd.reg_no, rn.reg_no, rm.reg_no, rd.num_bits).into() + }, + _ => panic!("Invalid operand combination to mul instruction") + }; + + cb.write_bytes(&bytes); +} + +/// SMULH - multiply two 64-bit registers to produce a 128-bit result, put the high 64-bits of the result into rd +pub fn smulh(cb: &mut CodeBlock, rd: A64Opnd, rn: A64Opnd, rm: A64Opnd) { + let bytes: [u8; 4] = match (rd, rn, rm) { + (A64Opnd::Reg(rd), A64Opnd::Reg(rn), A64Opnd::Reg(rm)) => { + assert!(rd.num_bits == rn.num_bits && rn.num_bits == rm.num_bits, "Expected registers to be the same size"); + assert!(rd.num_bits == 64, "smulh only applicable to 64-bit registers"); + + SMulH::smulh(rd.reg_no, rn.reg_no, rm.reg_no).into() + }, + _ => panic!("Invalid operand combination to mul instruction") + }; + + cb.write_bytes(&bytes); +} + /// MVN - move a value in a register to another register, negating it pub fn mvn(cb: &mut CodeBlock, rd: A64Opnd, rm: A64Opnd) { let bytes: [u8; 4] = match (rd, rm) { @@ -1064,6 +1099,48 @@ pub fn tst(cb: &mut CodeBlock, rn: A64Opnd, rm: A64Opnd) { cb.write_bytes(&bytes); } +/// CBZ - branch if a register is zero +pub fn cbz(cb: &mut CodeBlock, rt: A64Opnd, offset: InstructionOffset) { + assert!(imm_fits_bits(offset.into(), 19), "jump offset for cbz must fit in 19 bits"); + let bytes: [u8; 4] = if let A64Opnd::Reg(rt) = rt { + cbz_cbnz(rt.num_bits, false, offset, rt.reg_no) + } else { + panic!("Invalid operand combination to cbz instruction.") + }; + 
+ cb.write_bytes(&bytes); +} + +/// CBNZ - branch if a register is non-zero +pub fn cbnz(cb: &mut CodeBlock, rt: A64Opnd, offset: InstructionOffset) { + assert!(imm_fits_bits(offset.into(), 19), "jump offset for cbz must fit in 19 bits"); + let bytes: [u8; 4] = if let A64Opnd::Reg(rt) = rt { + cbz_cbnz(rt.num_bits, true, offset, rt.reg_no) + } else { + panic!("Invalid operand combination to cbnz instruction.") + }; + + cb.write_bytes(&bytes); +} + +/// Encode Compare and Branch on Zero (CBZ) with `op=0` or Compare and Branch on Nonzero (CBNZ) +/// with `op=1`. +/// +/// <https://developer.arm.com/documentation/ddi0602/2024-03/Base-Instructions/CBZ--Compare-and-Branch-on-Zero-> +/// +/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+ +/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 | +/// | sf 0 1 1 0 1 0 op | +/// | imm19........................................................... Rt.............. | +/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+ +fn cbz_cbnz(num_bits: u8, op: bool, offset: InstructionOffset, rt: u8) -> [u8; 4] { + ((Sf::from(num_bits) as u32) << 31 | + 0b11010 << 25 | + u32::from(op) << 24 | + truncate_imm::<_, 19>(offset) << 5 | + rt as u32).to_le_bytes() +} + #[cfg(test)] mod tests { use super::*; @@ -1134,7 +1211,7 @@ mod tests { } #[test] - fn test_adds_imm_negatve() { + fn test_adds_imm_negative() { check_bytes("201c00f1", |cb| adds(cb, X0, X1, A64Opnd::new_imm(-7))); } @@ -1159,7 +1236,7 @@ mod tests { } #[test] - fn test_and_32b_immedaite() { + fn test_and_32b_immediate() { check_bytes("404c0012", |cb| and(cb, W0, W2, A64Opnd::new_uimm(0xfffff))); } @@ -1239,8 +1316,26 @@ mod tests { } #[test] + fn test_cbz() { + let offset = InstructionOffset::from_insns(-1); + check_bytes("e0ffffb4e0ffff34", |cb| { + cbz(cb, X0, offset); + cbz(cb, W0, offset); + }); + } + + #[test] + fn test_cbnz() { + let offset = InstructionOffset::from_insns(2); + check_bytes("540000b554000035", |cb| { + cbnz(cb, X20, offset); + cbnz(cb, W20, offset); + }); + } + + #[test] fn test_brk_none() { - check_bytes("000020d4", |cb| brk(cb, A64Opnd::None)); + check_bytes("00003ed4", |cb| brk(cb, A64Opnd::None)); } #[test] @@ -1414,6 +1509,11 @@ mod tests { } #[test] + fn test_mul() { + check_bytes("6a7d0c9b", |cb| mul(cb, X10, X11, X12)); + } + + #[test] fn test_mvn() { check_bytes("ea032baa", |cb| mvn(cb, X10, X11)); } diff --git a/yjit/src/asm/mod.rs b/yjit/src/asm/mod.rs index 648041bbab..9ef675b34d 100644 --- a/yjit/src/asm/mod.rs +++ b/yjit/src/asm/mod.rs @@ -1,20 +1,14 @@ -use std::cell::RefCell; use std::fmt; use std::mem; use std::rc::Rc; -#[cfg(target_arch = "x86_64")] -use crate::backend::x86_64::JMP_PTR_BYTES; -#[cfg(target_arch = "aarch64")] -use crate::backend::arm64::JMP_PTR_BYTES; +use std::collections::BTreeMap; + use crate::core::IseqPayload; use crate::core::for_each_off_stack_iseq_payload; use crate::core::for_each_on_stack_iseq_payload; use crate::invariants::rb_yjit_tracing_invalidate_all; +use crate::stats::incr_counter; use crate::virtualmem::WriteError; - -#[cfg(feature = "disasm")] -use std::collections::BTreeMap; - use crate::codegen::CodegenGlobals; use crate::virtualmem::{VirtualMem, CodePtr}; @@ -24,9 +18,6 @@ pub mod x86_64; pub mod arm64; -/// Size of a code page in bytes. Each code page is split into an inlined and an outlined portion. 
-const CODE_PAGE_SIZE: usize = 16 * 1024; - // // TODO: need a field_size_of macro, to compute the size of a struct field in bytes // @@ -52,7 +43,12 @@ pub struct LabelRef { /// Block of memory into which instructions can be assembled pub struct CodeBlock { // Memory for storing the encoded instructions - mem_block: Rc<RefCell<VirtualMem>>, + mem_block: Rc<VirtualMem>, + + // Size of a code page in bytes. Each code page is split into an inlined and an outlined portion. + // Code GC collects code memory at this granularity. + // Must be a multiple of the OS page size. + page_size: usize, // Memory block size mem_size: usize, @@ -60,6 +56,12 @@ pub struct CodeBlock { // Current writing position write_pos: usize, + // The index of the last page with written bytes + last_page_idx: usize, + + // Total number of bytes written to past pages + past_page_bytes: usize, + // Size reserved for writing a jump to the next page page_end_reserve: usize, @@ -72,8 +74,10 @@ pub struct CodeBlock { // References to labels label_refs: Vec<LabelRef>, + // A switch for keeping comments. They take up memory. + keep_comments: bool, + // Comments for assembly instructions, if that feature is enabled - #[cfg(feature = "disasm")] asm_comments: BTreeMap<usize, Vec<String>>, // True for OutlinedCb @@ -83,6 +87,10 @@ pub struct CodeBlock { // for example, when there is not enough space or when a jump // target is too far away. dropped_bytes: bool, + + // Keeps track of what pages we can write to after code gc. + // `None` means all pages are free. + freed_pages: Rc<Option<Vec<usize>>>, } /// Set of CodeBlock label states. Used for recovering the previous state. @@ -93,38 +101,60 @@ pub struct LabelState { } impl CodeBlock { + /// Works for common AArch64 systems that have 16 KiB pages and + /// common x86_64 systems that use 4 KiB pages. + const PREFERRED_CODE_PAGE_SIZE: usize = 16 * 1024; + /// Make a new CodeBlock - pub fn new(mem_block: Rc<RefCell<VirtualMem>>, outlined: bool) -> Self { - let mem_size = mem_block.borrow().virtual_region_size(); + pub fn new(mem_block: Rc<VirtualMem>, outlined: bool, freed_pages: Rc<Option<Vec<usize>>>, keep_comments: bool) -> Self { + // Pick the code page size + let system_page_size = mem_block.system_page_size(); + let page_size = if 0 == Self::PREFERRED_CODE_PAGE_SIZE % system_page_size { + Self::PREFERRED_CODE_PAGE_SIZE + } else { + system_page_size + }; + + let mem_size = mem_block.virtual_region_size(); let mut cb = Self { mem_block, mem_size, + page_size, write_pos: 0, - page_end_reserve: JMP_PTR_BYTES, + last_page_idx: 0, + past_page_bytes: 0, + page_end_reserve: 0, label_addrs: Vec::new(), label_names: Vec::new(), label_refs: Vec::new(), - #[cfg(feature = "disasm")] + keep_comments, asm_comments: BTreeMap::new(), outlined, dropped_bytes: false, + freed_pages, }; + cb.page_end_reserve = cb.jmp_ptr_bytes(); cb.write_pos = cb.page_start(); + + #[cfg(not(test))] + assert_eq!(0, mem_size % page_size, "partially in-bounds code pages should be impossible"); + cb } /// Move the CodeBlock to the next page. If it's on the furthest page, /// move the other CodeBlock to the next page as well. + #[must_use] pub fn next_page<F: Fn(&mut CodeBlock, CodePtr)>(&mut self, base_ptr: CodePtr, jmp_ptr: F) -> bool { let old_write_ptr = self.get_write_ptr(); self.set_write_ptr(base_ptr); // Use the freed_pages list if code GC has been used. Otherwise use the next page. 
- let next_page_idx = if let Some(freed_pages) = CodegenGlobals::get_freed_pages() { - let current_page = self.write_pos / CODE_PAGE_SIZE; + let next_page_idx = if let Some(freed_pages) = self.freed_pages.as_ref() { + let current_page = self.write_pos / self.page_size; freed_pages.iter().find(|&&page| current_page < page).map(|&page| page) } else { - Some(self.write_pos / CODE_PAGE_SIZE + 1) + Some(self.write_pos / self.page_size + 1) }; // Move self to the next page @@ -133,8 +163,10 @@ impl CodeBlock { return false; } - // Move the other CodeBlock to the same page if it'S on the furthest page - self.other_cb().unwrap().set_page(next_page_idx.unwrap(), &jmp_ptr); + // Move the other CodeBlock to the same page if it's on the furthest page + if cfg!(not(test)) { + self.other_cb().unwrap().set_page(next_page_idx.unwrap(), &jmp_ptr); + } return !self.dropped_bytes; } @@ -161,26 +193,32 @@ impl CodeBlock { // but you need to waste some space for keeping write_pos for every single page. // It doesn't seem necessary for performance either. So we're currently not doing it. let dst_pos = self.get_page_pos(page_idx); - if CODE_PAGE_SIZE * page_idx < self.mem_size && self.write_pos < dst_pos { + if self.write_pos < dst_pos { + // Fail if next page is out of bounds + if dst_pos >= self.mem_size { + return false; + } + // Reset dropped_bytes self.dropped_bytes = false; - // Convert dst_pos to dst_ptr - let src_pos = self.write_pos; - self.write_pos = dst_pos; - let dst_ptr = self.get_write_ptr(); - self.write_pos = src_pos; - self.without_page_end_reserve(|cb| assert!(cb.has_capacity(JMP_PTR_BYTES))); - // Generate jmp_ptr from src_pos to dst_pos + let dst_ptr = self.get_ptr(dst_pos); self.without_page_end_reserve(|cb| { + assert!(cb.has_capacity(cb.jmp_ptr_bytes())); cb.add_comment("jump to next page"); jmp_ptr(cb, dst_ptr); - assert!(!cb.has_dropped_bytes()); }); + // Update past_page_bytes for code_size() if this is a new page + if self.last_page_idx < page_idx { + self.past_page_bytes += self.current_page_bytes(); + } + // Start the next code from dst_pos self.write_pos = dst_pos; + // Update the last_page_idx if page_idx points to the furthest page + self.last_page_idx = usize::max(self.last_page_idx, page_idx); } !self.dropped_bytes } @@ -199,33 +237,39 @@ impl CodeBlock { } // Free the grouped pages at once - let start_ptr = self.mem_block.borrow().start_ptr().add_bytes(page_idx * CODE_PAGE_SIZE); - let batch_size = CODE_PAGE_SIZE * batch_idxs.len(); - self.mem_block.borrow_mut().free_bytes(start_ptr, batch_size as u32); + let start_ptr = self.mem_block.start_ptr().add_bytes(page_idx * self.page_size); + let batch_size = self.page_size * batch_idxs.len(); + self.mem_block.free_bytes(start_ptr, batch_size as u32); } } pub fn page_size(&self) -> usize { - CODE_PAGE_SIZE + self.page_size } pub fn mapped_region_size(&self) -> usize { - self.mem_block.borrow().mapped_region_size() + self.mem_block.mapped_region_size() + } + + /// Size of the region in bytes where writes could be attempted. + #[cfg(target_arch = "aarch64")] + pub fn virtual_region_size(&self) -> usize { + self.mem_block.virtual_region_size() } /// Return the number of code pages that have been mapped by the VirtualMemory. 
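The page-size pick in CodeBlock::new above boils down to one divisibility test against the OS page size. A standalone sketch of just that arithmetic, with an illustrative helper name rather than the real constructor:

const PREFERRED_CODE_PAGE_SIZE: usize = 16 * 1024;

// Mirrors the selection in CodeBlock::new: keep 16 KiB code pages when the system
// page size divides them evenly, otherwise fall back to the system page size.
fn pick_code_page_size(system_page_size: usize) -> usize {
    if PREFERRED_CODE_PAGE_SIZE % system_page_size == 0 {
        PREFERRED_CODE_PAGE_SIZE
    } else {
        system_page_size
    }
}

fn main() {
    assert_eq!(pick_code_page_size(4 * 1024), 16 * 1024);  // common x86_64 systems
    assert_eq!(pick_code_page_size(16 * 1024), 16 * 1024); // common AArch64 systems
    assert_eq!(pick_code_page_size(64 * 1024), 64 * 1024); // 64 KiB-page kernels
}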
pub fn num_mapped_pages(&self) -> usize { // CodeBlock's page size != VirtualMem's page size on Linux, - // so mapped_region_size % CODE_PAGE_SIZE may not be 0 - ((self.mapped_region_size() - 1) / CODE_PAGE_SIZE) + 1 + // so mapped_region_size % self.page_size may not be 0 + ((self.mapped_region_size() - 1) / self.page_size) + 1 } /// Return the number of code pages that have been reserved by the VirtualMemory. pub fn num_virtual_pages(&self) -> usize { - let virtual_region_size = self.mem_block.borrow().virtual_region_size(); + let virtual_region_size = self.mem_block.virtual_region_size(); // CodeBlock's page size != VirtualMem's page size on Linux, - // so mapped_region_size % CODE_PAGE_SIZE may not be 0 - ((virtual_region_size - 1) / CODE_PAGE_SIZE) + 1 + // so mapped_region_size % self.page_size may not be 0 + ((virtual_region_size - 1) / self.page_size) + 1 } /// Return the number of code pages that have been freed and not used yet. @@ -234,18 +278,18 @@ impl CodeBlock { } pub fn has_freed_page(&self, page_idx: usize) -> bool { - CodegenGlobals::get_freed_pages().as_ref().map_or(false, |pages| pages.contains(&page_idx)) && // code GCed - self.write_pos < page_idx * CODE_PAGE_SIZE // and not written yet + self.freed_pages.as_ref().as_ref().map_or(false, |pages| pages.contains(&page_idx)) && // code GCed + self.write_pos < page_idx * self.page_size // and not written yet } /// Convert a page index to the write_pos for the page start. fn get_page_pos(&self, page_idx: usize) -> usize { - CODE_PAGE_SIZE * page_idx + self.page_start() + self.page_size * page_idx + self.page_start() } /// write_pos of the current page start pub fn page_start_pos(&self) -> usize { - self.get_write_pos() / CODE_PAGE_SIZE * CODE_PAGE_SIZE + self.page_start() + self.get_write_pos() / self.page_size * self.page_size + self.page_start() } /// Offset of each page where CodeBlock should start writing @@ -253,12 +297,12 @@ impl CodeBlock { let mut start = if self.inline() { 0 } else { - CODE_PAGE_SIZE / 2 + self.page_size / 2 }; if cfg!(debug_assertions) && !cfg!(test) { // Leave illegal instructions at the beginning of each page to assert // we're not accidentally crossing page boundaries. - start += JMP_PTR_BYTES; + start += self.jmp_ptr_bytes(); } start } @@ -266,9 +310,9 @@ impl CodeBlock { /// Offset of each page where CodeBlock should stop writing (exclusive) pub fn page_end(&self) -> usize { let page_end = if self.inline() { - CODE_PAGE_SIZE / 2 + self.page_size / 2 } else { - CODE_PAGE_SIZE + self.page_size }; page_end - self.page_end_reserve // reserve space to jump to the next page } @@ -282,63 +326,52 @@ impl CodeBlock { } /// Return the address ranges of a given address range that this CodeBlock can write. 
- #[cfg(any(feature = "disasm", target_arch = "aarch64"))] #[allow(dead_code)] pub fn writable_addrs(&self, start_ptr: CodePtr, end_ptr: CodePtr) -> Vec<(usize, usize)> { - // CodegenGlobals is not initialized when we write initial ocb code - let freed_pages = if CodegenGlobals::has_instance() { - CodegenGlobals::get_freed_pages().as_ref() - } else { - None - }; - - let region_start = self.get_ptr(0).into_usize(); - let region_end = self.get_ptr(self.get_mem_size()).into_usize(); - let mut start = start_ptr.into_usize(); - let end = std::cmp::min(end_ptr.into_usize(), region_end); + let region_start = self.get_ptr(0).raw_addr(self); + let region_end = self.get_ptr(self.get_mem_size()).raw_addr(self); + let mut start = start_ptr.raw_addr(self); + let end = std::cmp::min(end_ptr.raw_addr(self), region_end); + let freed_pages = self.freed_pages.as_ref().as_ref(); let mut addrs = vec![]; while start < end { - let page_idx = start.saturating_sub(region_start) / CODE_PAGE_SIZE; - let current_page = region_start + (page_idx * CODE_PAGE_SIZE); + let page_idx = start.saturating_sub(region_start) / self.page_size; + let current_page = region_start + (page_idx * self.page_size); let page_end = std::cmp::min(end, current_page + self.page_end()); // If code GC has been used, skip pages that are used by past on-stack code if freed_pages.map_or(true, |pages| pages.contains(&page_idx)) { addrs.push((start, page_end)); } - start = current_page + CODE_PAGE_SIZE + self.page_start(); + start = current_page + self.page_size + self.page_start(); } addrs } - /// Return the code size that has been used by this CodeBlock. + /// Return the number of bytes written by this CodeBlock. pub fn code_size(&self) -> usize { - let mut size = 0; - let current_page_idx = self.write_pos / CODE_PAGE_SIZE; - for page_idx in 0..self.num_mapped_pages() { - if page_idx == current_page_idx { - // Count only actually used bytes for the current page. - size += (self.write_pos % CODE_PAGE_SIZE).saturating_sub(self.page_start()); - } else if !self.has_freed_page(page_idx) { - // Count an entire range for any non-freed pages that have been used. - size += self.page_end() - self.page_start() + self.page_end_reserve; - } - } - size + self.current_page_bytes() + self.past_page_bytes + } + + /// Return the number of bytes written to the current page. + fn current_page_bytes(&self) -> usize { + (self.write_pos % self.page_size).saturating_sub(self.page_start()) } /// Check if this code block has sufficient remaining capacity pub fn has_capacity(&self, num_bytes: usize) -> bool { - let page_offset = self.write_pos % CODE_PAGE_SIZE; + let page_offset = self.write_pos % self.page_size; let capacity = self.page_end().saturating_sub(page_offset); num_bytes <= capacity } /// Add an assembly comment if the feature is on. - /// If not, this becomes an inline no-op. - #[cfg(feature = "disasm")] pub fn add_comment(&mut self, comment: &str) { - let cur_ptr = self.get_write_ptr().into_usize(); + if !self.keep_comments { + return; + } + + let cur_ptr = self.get_write_ptr().raw_addr(self); // If there's no current list of comments for this line number, add one. 
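With the new past_page_bytes field, code_size reduces to simple bookkeeping: bytes written on the current page plus a running total for pages already left behind. The SizeModel type below is an invented toy model of that arithmetic, not a YJIT type, and walks the same scenario as the test_code_size test added later in this diff.

// Toy model of the code_size bookkeeping: page_start is where writing begins on
// each page, and past_page_bytes accumulates when we move to a fresh page.
struct SizeModel { page_size: usize, page_start: usize, write_pos: usize, past_page_bytes: usize }

impl SizeModel {
    fn current_page_bytes(&self) -> usize {
        (self.write_pos % self.page_size).saturating_sub(self.page_start)
    }
    fn code_size(&self) -> usize {
        self.current_page_bytes() + self.past_page_bytes
    }
    fn move_to_next_page(&mut self) {
        self.past_page_bytes += self.current_page_bytes();
        self.write_pos = (self.write_pos / self.page_size + 1) * self.page_size + self.page_start;
    }
}

fn main() {
    let mut m = SizeModel { page_size: 16 * 1024, page_start: 0, write_pos: 0, past_page_bytes: 0 };
    m.write_pos += 4;        // write 4 bytes on the first page
    assert_eq!(m.code_size(), 4);
    m.move_to_next_page();   // jumping pages must not change the total
    assert_eq!(m.code_size(), 4);
    m.write_pos += 4;        // write 4 bytes on the second page
    assert_eq!(m.code_size(), 8);
}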
let this_line_comments = self.asm_comments.entry(cur_ptr).or_default(); @@ -348,28 +381,21 @@ impl CodeBlock { this_line_comments.push(comment.to_string()); } } - #[cfg(not(feature = "disasm"))] - #[inline] - pub fn add_comment(&mut self, _: &str) {} - #[cfg(feature = "disasm")] pub fn comments_at(&self, pos: usize) -> Option<&Vec<String>> { self.asm_comments.get(&pos) } - #[allow(unused_variables)] - #[cfg(feature = "disasm")] pub fn remove_comments(&mut self, start_addr: CodePtr, end_addr: CodePtr) { - for addr in start_addr.into_usize()..end_addr.into_usize() { + if self.asm_comments.is_empty() { + return; + } + for addr in start_addr.raw_addr(self)..end_addr.raw_addr(self) { self.asm_comments.remove(&addr); } } - #[cfg(not(feature = "disasm"))] - #[inline] - pub fn remove_comments(&mut self, _: CodePtr, _: CodePtr) {} pub fn clear_comments(&mut self) { - #[cfg(feature = "disasm")] self.asm_comments.clear(); } @@ -382,7 +408,7 @@ impl CodeBlock { } pub fn write_mem(&self, write_ptr: CodePtr, byte: u8) -> Result<(), WriteError> { - self.mem_block.borrow_mut().write_byte(write_ptr, byte) + self.mem_block.write_byte(write_ptr, byte) } // Set the current write position @@ -396,31 +422,31 @@ impl CodeBlock { // Set the current write position from a pointer pub fn set_write_ptr(&mut self, code_ptr: CodePtr) { - let pos = code_ptr.into_usize() - self.mem_block.borrow().start_ptr().into_usize(); - self.set_pos(pos); + let pos = code_ptr.as_offset() - self.mem_block.start_ptr().as_offset(); + self.set_pos(pos.try_into().unwrap()); } /// Get a (possibly dangling) direct pointer into the executable memory block pub fn get_ptr(&self, offset: usize) -> CodePtr { - self.mem_block.borrow().start_ptr().add_bytes(offset) + self.mem_block.start_ptr().add_bytes(offset) } /// Convert an address range to memory page indexes against a num_pages()-sized array. - pub fn addrs_to_pages(&self, start_addr: CodePtr, end_addr: CodePtr) -> Vec<usize> { - let mem_start = self.mem_block.borrow().start_ptr().into_usize(); - let mem_end = self.mem_block.borrow().end_ptr().into_usize(); - assert!(mem_start <= start_addr.into_usize()); - assert!(start_addr.into_usize() <= end_addr.into_usize()); - assert!(end_addr.into_usize() <= mem_end); + pub fn addrs_to_pages(&self, start_addr: CodePtr, end_addr: CodePtr) -> impl Iterator<Item = usize> { + let mem_start = self.mem_block.start_ptr().raw_addr(self); + let mem_end = self.mem_block.mapped_end_ptr().raw_addr(self); + assert!(mem_start <= start_addr.raw_addr(self)); + assert!(start_addr.raw_addr(self) <= end_addr.raw_addr(self)); + assert!(end_addr.raw_addr(self) <= mem_end); // Ignore empty code ranges if start_addr == end_addr { - return vec![]; + return 0..0; } - let start_page = (start_addr.into_usize() - mem_start) / CODE_PAGE_SIZE; - let end_page = (end_addr.into_usize() - mem_start - 1) / CODE_PAGE_SIZE; - (start_page..=end_page).collect() // TODO: consider returning an iterator + let start_page = (start_addr.raw_addr(self) - mem_start) / self.page_size; + let end_page = (end_addr.raw_addr(self) - mem_start - 1) / self.page_size; + start_page..end_page + 1 } /// Get a (possibly dangling) direct pointer to the current write position @@ -431,7 +457,7 @@ impl CodeBlock { /// Write a single byte at the current position. 
pub fn write_byte(&mut self, byte: u8) { let write_ptr = self.get_write_ptr(); - if self.has_capacity(1) && self.mem_block.borrow_mut().write_byte(write_ptr, byte).is_ok() { + if self.has_capacity(1) && self.mem_block.write_byte(write_ptr, byte).is_ok() { self.write_pos += 1; } else { self.dropped_bytes = true; @@ -563,14 +589,20 @@ impl CodeBlock { self.label_refs = state.label_refs; } + pub fn mark_all_writeable(&mut self) { + self.mem_block.mark_all_writeable(); + } + pub fn mark_all_executable(&mut self) { - self.mem_block.borrow_mut().mark_all_executable(); + self.mem_block.mark_all_executable(); } /// Code GC. Free code pages that are not on stack and reuse them. - pub fn code_gc(&mut self) { + pub fn code_gc(&mut self, ocb: &mut OutlinedCb) { + assert!(self.inline(), "must use on inline code block"); + // The previous code GC failed to free any pages. Give up. - if CodegenGlobals::get_freed_pages() == &Some(vec![]) { + if self.freed_pages.as_ref() == &Some(vec![]) { return; } @@ -596,11 +628,13 @@ impl CodeBlock { // This currently patches every ISEQ, which works, but in the future, // we could limit that to patch only on-stack ISEQs for optimizing code GC. rb_yjit_tracing_invalidate_all(); - // When code GC runs next time, we could have reused pages in between - // invalidated pages. To invalidate them, we skip freezing them here. - // We free or not reuse the bytes frozen by any past invalidation, so this - // can be safely reset to pass the frozen bytes check on invalidation. - CodegenGlobals::set_inline_frozen_bytes(0); + + // Assert that all code pages are freeable + assert_eq!( + 0, + self.mem_size % self.page_size, + "end of the last code page should be the end of the entire region" + ); // Let VirtuamMem free the pages let mut freed_pages: Vec<usize> = pages_in_use.iter().enumerate() @@ -614,18 +648,21 @@ impl CodeBlock { freed_pages.append(&mut virtual_pages); if let Some(&first_page) = freed_pages.first() { - let mut cb = CodegenGlobals::get_inline_cb(); - cb.write_pos = cb.get_page_pos(first_page); - cb.dropped_bytes = false; - cb.clear_comments(); - - let mut ocb = CodegenGlobals::get_outlined_cb().unwrap(); - ocb.write_pos = ocb.get_page_pos(first_page); - ocb.dropped_bytes = false; - ocb.clear_comments(); + for cb in [&mut *self, ocb.unwrap()] { + cb.write_pos = cb.get_page_pos(first_page); + cb.past_page_bytes = 0; + cb.dropped_bytes = false; + cb.clear_comments(); + } } - CodegenGlobals::set_freed_pages(freed_pages); + // Track which pages are free. + let new_freed_pages = Rc::new(Some(freed_pages)); + let old_freed_pages = mem::replace(&mut self.freed_pages, Rc::clone(&new_freed_pages)); + ocb.unwrap().freed_pages = new_freed_pages; + assert_eq!(1, Rc::strong_count(&old_freed_pages)); // will deallocate + + incr_counter!(code_gc_count); } pub fn inline(&self) -> bool { @@ -653,9 +690,27 @@ impl CodeBlock { let alloc = TestingAllocator::new(mem_size); let mem_start: *const u8 = alloc.mem_start(); - let virt_mem = VirtualMem::new(alloc, 1, NonNull::new(mem_start as *mut u8).unwrap(), mem_size); + let virt_mem = VirtualMem::new(alloc, 1, NonNull::new(mem_start as *mut u8).unwrap(), mem_size, 128 * 1024 * 1024); + + Self::new(Rc::new(virt_mem), false, Rc::new(None), true) + } + + /// Stubbed CodeBlock for testing conditions that can arise due to code GC. Can't execute generated code. 
+ #[cfg(target_arch = "aarch64")] + pub fn new_dummy_with_freed_pages(mut freed_pages: Vec<usize>) -> Self { + use std::ptr::NonNull; + use crate::virtualmem::*; + use crate::virtualmem::tests::TestingAllocator; + + freed_pages.sort_unstable(); + let mem_size = Self::PREFERRED_CODE_PAGE_SIZE * + (1 + freed_pages.last().expect("freed_pages vec should not be empty")); + + let alloc = TestingAllocator::new(mem_size); + let mem_start: *const u8 = alloc.mem_start(); + let virt_mem = VirtualMem::new(alloc, 1, NonNull::new(mem_start as *mut u8).unwrap(), mem_size, 128 * 1024 * 1024); - Self::new(Rc::new(RefCell::new(virt_mem)), false) + Self::new(Rc::new(virt_mem), false, Rc::new(Some(freed_pages)), true) } } @@ -663,13 +718,20 @@ impl CodeBlock { impl fmt::LowerHex for CodeBlock { fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { for pos in 0..self.write_pos { - let byte = unsafe { self.mem_block.borrow().start_ptr().raw_ptr().add(pos).read() }; + let mem_block = &*self.mem_block; + let byte = unsafe { mem_block.start_ptr().raw_ptr(mem_block).add(pos).read() }; fmtr.write_fmt(format_args!("{:02x}", byte))?; } Ok(()) } } +impl crate::virtualmem::CodePtrBase for CodeBlock { + fn base_ptr(&self) -> std::ptr::NonNull<u8> { + self.mem_block.base_ptr() + } +} + /// Wrapper struct so we can use the type system to distinguish /// Between the inlined and outlined code blocks pub struct OutlinedCb { @@ -756,4 +818,30 @@ mod tests assert_eq!(uimm_num_bits((u32::MAX as u64) + 1), 64); assert_eq!(uimm_num_bits(u64::MAX), 64); } + + #[test] + fn test_code_size() { + // Write 4 bytes in the first page + let mut cb = CodeBlock::new_dummy(CodeBlock::PREFERRED_CODE_PAGE_SIZE * 2); + cb.write_bytes(&[0, 0, 0, 0]); + assert_eq!(cb.code_size(), 4); + + // Moving to the next page should not increase code_size + assert!(cb.next_page(cb.get_write_ptr(), |_, _| {})); + assert_eq!(cb.code_size(), 4); + + // Write 4 bytes in the second page + cb.write_bytes(&[0, 0, 0, 0]); + assert_eq!(cb.code_size(), 8); + + // Rewrite 4 bytes in the first page + let old_write_pos = cb.get_write_pos(); + cb.set_pos(0); + cb.write_bytes(&[1, 1, 1, 1]); + + // Moving from an old page to the next page should not increase code_size + assert!(cb.next_page(cb.get_write_ptr(), |_, _| {})); + cb.set_pos(old_write_pos); + assert_eq!(cb.code_size(), 8); + } } diff --git a/yjit/src/asm/x86_64/mod.rs b/yjit/src/asm/x86_64/mod.rs index 67bb5d1ffb..0ef5e92117 100644 --- a/yjit/src/asm/x86_64/mod.rs +++ b/yjit/src/asm/x86_64/mod.rs @@ -362,11 +362,6 @@ pub fn const_ptr_opnd(ptr: *const u8) -> X86Opnd uimm_opnd(ptr as u64) } -pub fn code_ptr_opnd(code_ptr: CodePtr) -> X86Opnd -{ - uimm_opnd(code_ptr.raw_ptr() as u64) -} - /// Write the REX byte fn write_rex(cb: &mut CodeBlock, w_flag: bool, reg_no: u8, idx_reg_no: u8, rm_reg_no: u8) { // 0 1 0 0 w r x b @@ -635,7 +630,7 @@ fn write_rm_multi(cb: &mut CodeBlock, op_mem_reg8: u8, op_mem_reg_pref: u8, op_r panic!("immediate value too large (num_bits={}, num={uimm:?})", num_bits); } }, - _ => unreachable!() + _ => panic!("unknown encoding combo: {opnd0:?} {opnd1:?}") }; } @@ -696,17 +691,17 @@ pub fn call_ptr(cb: &mut CodeBlock, scratch_opnd: X86Opnd, dst_ptr: *const u8) { let end_ptr = cb.get_ptr(cb.write_pos + 5); // Compute the jump offset - let rel64: i64 = dst_ptr as i64 - end_ptr.into_i64(); + let rel64: i64 = dst_ptr as i64 - end_ptr.raw_ptr(cb) as i64; // If the offset fits in 32-bit if rel64 >= i32::MIN.into() && rel64 <= i32::MAX.into() { - incr_counter!(x86_call_rel32); + 
incr_counter!(num_send_x86_rel32); call_rel32(cb, rel64.try_into().unwrap()); return; } // Move the pointer into the scratch register and call - incr_counter!(x86_call_reg); + incr_counter!(num_send_x86_reg); mov(cb, scratch_opnd, const_ptr_opnd(dst_ptr)); call(cb, scratch_opnd); } else { @@ -805,6 +800,31 @@ pub fn cqo(cb: &mut CodeBlock) { cb.write_bytes(&[0x48, 0x99]); } +/// imul - signed integer multiply +pub fn imul(cb: &mut CodeBlock, opnd0: X86Opnd, opnd1: X86Opnd) { + assert!(opnd0.num_bits() == 64); + assert!(opnd1.num_bits() == 64); + assert!(matches!(opnd0, X86Opnd::Reg(_) | X86Opnd::Mem(_))); + assert!(matches!(opnd1, X86Opnd::Reg(_) | X86Opnd::Mem(_))); + + match (opnd0, opnd1) { + (X86Opnd::Reg(_), X86Opnd::Reg(_) | X86Opnd::Mem(_)) => { + //REX.W + 0F AF /rIMUL r64, r/m64 + // Quadword register := Quadword register * r/m64. + write_rm(cb, false, true, opnd0, opnd1, None, &[0x0F, 0xAF]); + } + + // Flip the operands to handle this case. This instruction has weird encoding restrictions. + (X86Opnd::Mem(_), X86Opnd::Reg(_)) => { + //REX.W + 0F AF /rIMUL r64, r/m64 + // Quadword register := Quadword register * r/m64. + write_rm(cb, false, true, opnd1, opnd0, None, &[0x0F, 0xAF]); + } + + _ => unreachable!() + } +} + /// Interrupt 3 - trap to debugger pub fn int3(cb: &mut CodeBlock) { cb.write_byte(0xcc); @@ -872,7 +892,7 @@ fn write_jcc_ptr(cb: &mut CodeBlock, op0: u8, op1: u8, dst_ptr: CodePtr) { let end_ptr = cb.get_ptr(cb.write_pos + 4); // Compute the jump offset - let rel64 = dst_ptr.into_i64() - end_ptr.into_i64(); + let rel64 = dst_ptr.as_offset() - end_ptr.as_offset(); if rel64 >= i32::MIN.into() && rel64 <= i32::MAX.into() { // Write the relative 32-bit jump offset @@ -932,6 +952,7 @@ pub fn jmp32(cb: &mut CodeBlock, offset: i32) { pub fn lea(cb: &mut CodeBlock, dst: X86Opnd, src: X86Opnd) { if let X86Opnd::Reg(reg) = dst { assert!(reg.num_bits == 64); + assert!(matches!(src, X86Opnd::Mem(_) | X86Opnd::IPRel(_))); write_rm(cb, false, true, dst, src, None, &[0x8d]); } else { unreachable!(); @@ -1006,7 +1027,10 @@ pub fn mov(cb: &mut CodeBlock, dst: X86Opnd, src: X86Opnd) { } let output_num_bits:u32 = if mem.num_bits > 32 { 32 } else { mem.num_bits.into() }; - assert!(imm_num_bits(imm.value) <= (output_num_bits as u8)); + assert!( + mem.num_bits < 64 || imm_num_bits(imm.value) <= (output_num_bits as u8), + "immediate value should be small enough to survive sign extension" + ); cb.write_int(imm.value as u64, output_num_bits); }, // M + UImm @@ -1021,7 +1045,10 @@ pub fn mov(cb: &mut CodeBlock, dst: X86Opnd, src: X86Opnd) { } let output_num_bits = if mem.num_bits > 32 { 32 } else { mem.num_bits.into() }; - assert!(imm_num_bits(uimm.value as i64) <= (output_num_bits as u8)); + assert!( + mem.num_bits < 64 || imm_num_bits(uimm.value as i64) <= (output_num_bits as u8), + "immediate value should be small enough to survive sign extension" + ); cb.write_int(uimm.value, output_num_bits); }, // * + Imm/UImm @@ -1223,8 +1250,8 @@ pub fn ret(cb: &mut CodeBlock) { cb.write_byte(0xC3); } -// Encode a single-operand shift instruction -fn write_shift(cb: &mut CodeBlock, op_mem_one_pref: u8, _op_mem_cl_pref: u8, op_mem_imm_pref: u8, op_ext: Option<u8>, opnd0: X86Opnd, opnd1: X86Opnd) { +// Encode a bitwise shift instruction +fn write_shift(cb: &mut CodeBlock, op_mem_one_pref: u8, op_mem_cl_pref: u8, op_mem_imm_pref: u8, op_ext: u8, opnd0: X86Opnd, opnd1: X86Opnd) { assert!(matches!(opnd0, X86Opnd::Reg(_) | X86Opnd::Mem(_))); // Check the size of opnd0 @@ -1234,16 +1261,26 @@ fn 
write_shift(cb: &mut CodeBlock, op_mem_one_pref: u8, _op_mem_cl_pref: u8, op_ let sz_pref = opnd_size == 16; let rex_w = opnd_size == 64; - if let X86Opnd::UImm(imm) = opnd1 { - if imm.value == 1 { - write_rm(cb, sz_pref, rex_w, X86Opnd::None, opnd0, op_ext, &[op_mem_one_pref]); - } else { - assert!(imm.num_bits <= 8); - write_rm(cb, sz_pref, rex_w, X86Opnd::None, opnd0, op_ext, &[op_mem_imm_pref]); - cb.write_byte(imm.value as u8); + match opnd1 { + X86Opnd::UImm(imm) => { + if imm.value == 1 { + write_rm(cb, sz_pref, rex_w, X86Opnd::None, opnd0, Some(op_ext), &[op_mem_one_pref]); + } else { + assert!(imm.num_bits <= 8); + write_rm(cb, sz_pref, rex_w, X86Opnd::None, opnd0, Some(op_ext), &[op_mem_imm_pref]); + cb.write_byte(imm.value as u8); + } + } + + X86Opnd::Reg(reg) => { + // We can only use CL/RCX as the shift amount + assert!(reg.reg_no == RCX_REG.reg_no); + write_rm(cb, sz_pref, rex_w, X86Opnd::None, opnd0, Some(op_ext), &[op_mem_cl_pref]); + } + + _ => { + unreachable!("unsupported operands: {:?}, {:?}", opnd0, opnd1); } - } else { - unreachable!(); } } @@ -1254,7 +1291,7 @@ pub fn sal(cb: &mut CodeBlock, opnd0: X86Opnd, opnd1: X86Opnd) { 0xD1, // opMemOnePref, 0xD3, // opMemClPref, 0xC1, // opMemImmPref, - Some(0x04), + 0x04, opnd0, opnd1 ); @@ -1267,7 +1304,7 @@ pub fn sar(cb: &mut CodeBlock, opnd0: X86Opnd, opnd1: X86Opnd) { 0xD1, // opMemOnePref, 0xD3, // opMemClPref, 0xC1, // opMemImmPref, - Some(0x07), + 0x07, opnd0, opnd1 ); @@ -1280,7 +1317,7 @@ pub fn shl(cb: &mut CodeBlock, opnd0: X86Opnd, opnd1: X86Opnd) { 0xD1, // opMemOnePref, 0xD3, // opMemClPref, 0xC1, // opMemImmPref, - Some(0x04), + 0x04, opnd0, opnd1 ); @@ -1293,7 +1330,7 @@ pub fn shr(cb: &mut CodeBlock, opnd0: X86Opnd, opnd1: X86Opnd) { 0xD1, // opMemOnePref, 0xD3, // opMemClPref, 0xC1, // opMemImmPref, - Some(0x05), + 0x05, opnd0, opnd1 ); diff --git a/yjit/src/asm/x86_64/tests.rs b/yjit/src/asm/x86_64/tests.rs index 1cd005747d..eefcbfd52e 100644 --- a/yjit/src/asm/x86_64/tests.rs +++ b/yjit/src/asm/x86_64/tests.rs @@ -68,7 +68,7 @@ fn test_call_ptr() { // calling a lower address check_bytes("e8fbffffff", |cb| { let ptr = cb.get_write_ptr(); - call_ptr(cb, RAX, ptr.raw_ptr()); + call_ptr(cb, RAX, ptr.raw_ptr(cb)); }); } @@ -106,6 +106,15 @@ fn test_cqo() { } #[test] +fn test_imul() { + check_bytes("480fafc3", |cb| imul(cb, RAX, RBX)); + check_bytes("480faf10", |cb| imul(cb, RDX, mem_opnd(64, RAX, 0))); + + // Operands flipped for encoding since multiplication is commutative + check_bytes("480faf10", |cb| imul(cb, mem_opnd(64, RAX, 0), RDX)); +} + +#[test] fn test_jge_label() { check_bytes("0f8dfaffffff", |cb| { let label_idx = cb.new_label("loop".to_owned()); @@ -184,6 +193,7 @@ fn test_mov() { check_bytes("48c7470801000000", |cb| mov(cb, mem_opnd(64, RDI, 8), imm_opnd(1))); //check_bytes("67c7400411000000", |cb| mov(cb, mem_opnd(32, EAX, 4), imm_opnd(0x34))); // We don't distinguish between EAX and RAX here - that's probably fine? 
check_bytes("c7400411000000", |cb| mov(cb, mem_opnd(32, RAX, 4), imm_opnd(17))); + check_bytes("c7400401000080", |cb| mov(cb, mem_opnd(32, RAX, 4), uimm_opnd(0x80000001))); check_bytes("41895814", |cb| mov(cb, mem_opnd(32, R8, 20), EBX)); check_bytes("4d8913", |cb| mov(cb, mem_opnd(64, R11, 0), R10)); check_bytes("48c742f8f4ffffff", |cb| mov(cb, mem_opnd(64, RDX, -8), imm_opnd(-12))); @@ -340,6 +350,7 @@ fn test_sal() { check_bytes("d1e1", |cb| sal(cb, ECX, uimm_opnd(1))); check_bytes("c1e505", |cb| sal(cb, EBP, uimm_opnd(5))); check_bytes("d1642444", |cb| sal(cb, mem_opnd(32, RSP, 68), uimm_opnd(1))); + check_bytes("48d3e1", |cb| sal(cb, RCX, CL)); } #[test] @@ -361,7 +372,7 @@ fn test_sub() { #[test] #[should_panic] fn test_sub_uimm_too_large() { - // This immedaite becomes a different value after + // This immediate becomes a different value after // sign extension, so not safe to encode. check_bytes("ff", |cb| sub(cb, RCX, uimm_opnd(0x8000_0000))); } @@ -432,15 +443,15 @@ fn basic_capstone_usage() -> std::result::Result<(), capstone::Error> { fn block_comments() { let mut cb = super::CodeBlock::new_dummy(4096); - let first_write_ptr = cb.get_write_ptr().into_usize(); + let first_write_ptr = cb.get_write_ptr().raw_addr(&cb); cb.add_comment("Beginning"); xor(&mut cb, EAX, EAX); // 2 bytes long - let second_write_ptr = cb.get_write_ptr().into_usize(); + let second_write_ptr = cb.get_write_ptr().raw_addr(&cb); cb.add_comment("Two bytes in"); cb.add_comment("Still two bytes in"); cb.add_comment("Still two bytes in"); // Duplicate, should be ignored test(&mut cb, mem_opnd(64, RSI, 64), imm_opnd(!0x08)); // 8 bytes long - let third_write_ptr = cb.get_write_ptr().into_usize(); + let third_write_ptr = cb.get_write_ptr().raw_addr(&cb); cb.add_comment("Ten bytes in"); assert_eq!(&vec!( "Beginning".to_string() ), cb.comments_at(first_write_ptr).unwrap()); diff --git a/yjit/src/backend/arm64/mod.rs b/yjit/src/backend/arm64/mod.rs index eb096ce677..0521e09d0b 100644 --- a/yjit/src/backend/arm64/mod.rs +++ b/yjit/src/backend/arm64/mod.rs @@ -1,13 +1,11 @@ -#![allow(dead_code)] -#![allow(unused_variables)] -#![allow(unused_imports)] +use std::mem::take; -use crate::asm::{CodeBlock}; +use crate::asm::{CodeBlock, OutlinedCb}; use crate::asm::arm64::*; -use crate::codegen::{JITState, CodegenGlobals}; use crate::cruby::*; use crate::backend::ir::*; use crate::virtualmem::CodePtr; +use crate::utils::*; // Use the arm64 register type for this platform pub type Reg = A64Reg; @@ -36,8 +34,25 @@ pub const _C_RET_OPND: Opnd = Opnd::Reg(X0_REG); pub const C_SP_REG: A64Opnd = X31; pub const C_SP_STEP: i32 = 16; -// The number of bytes that are generated by emit_jmp_ptr -pub const JMP_PTR_BYTES: usize = 20; +impl CodeBlock { + // The maximum number of bytes that can be generated by emit_jmp_ptr. + pub fn jmp_ptr_bytes(&self) -> usize { + // b instruction's offset is encoded as imm26 times 4. It can jump to + // +/-128MiB, so this can be used when --yjit-exec-mem-size <= 128. + let num_insns = if b_offset_fits_bits(self.virtual_region_size() as i64 / 4) { + 1 // b instruction + } else { + 5 // 4 instructions to load a 64-bit absolute address + br instruction + }; + num_insns * 4 + } + + // The maximum number of instructions that can be generated by emit_conditional_jump. + fn conditional_jump_insns(&self) -> i32 { + // The worst case is instructions for a jump + bcond. 
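The 128 MiB bound mentioned in jmp_ptr_bytes above is just the reach of a signed imm26 field scaled by the 4-byte instruction size. A few lines of plain arithmetic make that explicit; no YJIT helpers are involved.

fn main() {
    // imm26 is a signed 26-bit instruction count, so the reach is +/- 2^25 instructions.
    let max_insns: i64 = 1 << 25;
    // Each A64 instruction is 4 bytes, so the byte reach is +/- 128 MiB.
    let max_bytes = max_insns * 4;
    assert_eq!(max_bytes, 128 * 1024 * 1024);
    // Hence a single `b` suffices whenever the whole code region fits in 128 MiB,
    // and the 5-instruction absolute-address-plus-br sequence is the fallback.
    let region: i64 = 64 * 1024 * 1024; // e.g. a 64 MiB --yjit-exec-mem-size
    assert!(region / 4 <= max_insns);
}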
+ self.jmp_ptr_bytes() as i32 / 4 + 1 + } +} /// Map Opnd to A64Opnd impl From<Opnd> for A64Opnd { @@ -52,8 +67,10 @@ impl From<Opnd> for A64Opnd { Opnd::Mem(Mem { base: MemBase::InsnOut(_), .. }) => { panic!("attempted to lower an Opnd::Mem with a MemBase::InsnOut base") }, + Opnd::CArg(_) => panic!("attempted to lower an Opnd::CArg"), Opnd::InsnOut { .. } => panic!("attempted to lower an Opnd::InsnOut"), Opnd::Value(_) => panic!("attempted to lower an Opnd::Value"), + Opnd::Stack { .. } => panic!("attempted to lower an Opnd::Stack"), Opnd::None => panic!( "Attempted to lower an Opnd::None. This often happens when an out operand was not allocated for an instruction because the output of the instruction was not used. Please ensure you are using the output." ), @@ -69,11 +86,112 @@ impl From<&Opnd> for A64Opnd { } } +/// Call emit_jmp_ptr and immediately invalidate the written range. +/// This is needed when next_page also moves other_cb that is not invalidated +/// by compile_with_regs. Doing it here allows you to avoid invalidating a lot +/// more than necessary when other_cb jumps from a position early in the page. +/// This invalidates a small range of cb twice, but we accept the small cost. +fn emit_jmp_ptr_with_invalidation(cb: &mut CodeBlock, dst_ptr: CodePtr) { + #[cfg(not(test))] + let start = cb.get_write_ptr(); + emit_jmp_ptr(cb, dst_ptr, true); + #[cfg(not(test))] + { + let end = cb.get_write_ptr(); + unsafe { rb_jit_icache_invalidate(start.raw_ptr(cb) as _, end.raw_ptr(cb) as _) }; + } +} + +fn emit_jmp_ptr(cb: &mut CodeBlock, dst_ptr: CodePtr, padding: bool) { + let src_addr = cb.get_write_ptr().as_offset(); + let dst_addr = dst_ptr.as_offset(); + + // If the offset is short enough, then we'll use the + // branch instruction. Otherwise, we'll move the + // destination into a register and use the branch + // register instruction. + let num_insns = if b_offset_fits_bits((dst_addr - src_addr) / 4) { + b(cb, InstructionOffset::from_bytes((dst_addr - src_addr) as i32)); + 1 + } else { + let num_insns = emit_load_value(cb, Assembler::SCRATCH0, dst_addr as u64); + br(cb, Assembler::SCRATCH0); + num_insns + 1 + }; + + if padding { + // Make sure it's always a consistent number of + // instructions in case it gets patched and has to + // use the other branch. + assert!(num_insns * 4 <= cb.jmp_ptr_bytes()); + for _ in num_insns..(cb.jmp_ptr_bytes() / 4) { + nop(cb); + } + } +} + +/// Emit the required instructions to load the given value into the +/// given register. Our goal here is to use as few instructions as +/// possible to get this value into the register. +fn emit_load_value(cb: &mut CodeBlock, rd: A64Opnd, value: u64) -> usize { + let mut current = value; + + if current <= 0xffff { + // If the value fits into a single movz + // instruction, then we'll use that. + movz(cb, rd, A64Opnd::new_uimm(current), 0); + return 1; + } else if BitmaskImmediate::try_from(current).is_ok() { + // Otherwise, if the immediate can be encoded + // with the special bitmask immediate encoding, + // we'll use that. + mov(cb, rd, A64Opnd::new_uimm(current)); + return 1; + } else { + // Finally we'll fall back to encoding the value + // using movz for the first 16 bits and movk for + // each subsequent set of 16 bits as long we + // they are necessary. + movz(cb, rd, A64Opnd::new_uimm(current & 0xffff), 0); + let mut num_insns = 1; + + // (We're sure this is necessary since we + // checked if it only fit into movz above). 
+ current >>= 16; + movk(cb, rd, A64Opnd::new_uimm(current & 0xffff), 16); + num_insns += 1; + + if current > 0xffff { + current >>= 16; + movk(cb, rd, A64Opnd::new_uimm(current & 0xffff), 32); + num_insns += 1; + } + + if current > 0xffff { + current >>= 16; + movk(cb, rd, A64Opnd::new_uimm(current & 0xffff), 48); + num_insns += 1; + } + return num_insns; + } +} + +/// List of registers that can be used for stack temps. +/// These are caller-saved registers. +pub static TEMP_REGS: [Reg; 5] = [X1_REG, X9_REG, X10_REG, X14_REG, X15_REG]; + +#[derive(Debug, PartialEq)] +enum EmitError { + RetryOnNextPage, + OutOfMemory, +} + impl Assembler { - // A special scratch register for intermediate processing. + // Special scratch registers for intermediate processing. // This register is caller-saved (so we don't have to save it before using it) - const SCRATCH0: A64Opnd = A64Opnd::Reg(X16_REG); + pub const SCRATCH_REG: Reg = X16_REG; + const SCRATCH0: A64Opnd = A64Opnd::Reg(Assembler::SCRATCH_REG); const SCRATCH1: A64Opnd = A64Opnd::Reg(X17_REG); /// Get the list of registers from which we will allocate on this platform @@ -86,7 +204,7 @@ impl Assembler /// Get a list of all of the caller-saved registers pub fn get_caller_save_regs() -> Vec<Reg> { - vec![X9_REG, X10_REG, X11_REG, X12_REG, X13_REG, X14_REG, X15_REG] + vec![X1_REG, X9_REG, X10_REG, X11_REG, X12_REG, X13_REG, X14_REG, X15_REG] } /// Split platform-specific instructions @@ -162,7 +280,7 @@ impl Assembler /// do follow that encoding, and if they don't then we load them first. fn split_bitmask_immediate(asm: &mut Assembler, opnd: Opnd, dest_num_bits: u8) -> Opnd { match opnd { - Opnd::Reg(_) | Opnd::InsnOut { .. } => opnd, + Opnd::Reg(_) | Opnd::CArg(_) | Opnd::InsnOut { .. } | Opnd::Stack { .. } => opnd, Opnd::Mem(_) => split_load_operand(asm, opnd), Opnd::Imm(imm) => { if imm == 0 { @@ -195,9 +313,13 @@ impl Assembler /// a certain size. If they don't then we need to load them first. fn split_shifted_immediate(asm: &mut Assembler, opnd: Opnd) -> Opnd { match opnd { - Opnd::Reg(_) | Opnd::InsnOut { .. } => opnd, + Opnd::Reg(_) | Opnd::CArg(_) | Opnd::InsnOut { .. } => opnd, Opnd::Mem(_) => split_load_operand(asm, opnd), - Opnd::Imm(_) => asm.load(opnd), + Opnd::Imm(imm) => if ShiftedImmediate::try_from(imm as u64).is_ok() { + opnd + } else { + asm.load(opnd) + } Opnd::UImm(uimm) => { if ShiftedImmediate::try_from(uimm).is_ok() { opnd @@ -205,7 +327,7 @@ impl Assembler asm.load(opnd) } }, - Opnd::None | Opnd::Value(_) => unreachable!() + Opnd::None | Opnd::Value(_) | Opnd::Stack { .. } => unreachable!() } } @@ -258,7 +380,8 @@ impl Assembler } } - let mut asm_local = Assembler::new_with_label_names(std::mem::take(&mut self.label_names)); + let live_ranges: Vec<usize> = take(&mut self.live_ranges); + let mut asm_local = Assembler::new_with_label_names(take(&mut self.label_names), take(&mut self.side_exits), self.num_locals); let asm = &mut asm_local; let mut iterator = self.into_draining_iter(); @@ -280,6 +403,9 @@ impl Assembler *opnd = asm.load(*opnd); } }, + Opnd::Stack { .. } => { + *opnd = asm.lower_stack_opnd(opnd); + } _ => {} }; } @@ -287,11 +413,11 @@ impl Assembler // We are replacing instructions here so we know they are already // being used. It is okay not to use their output here. #[allow(unused_must_use)] - match insn { + match &mut insn { Insn::Add { left, right, .. } => { - match (left, right) { + match (*left, *right) { (Opnd::Reg(_) | Opnd::InsnOut { .. }, Opnd::Reg(_) | Opnd::InsnOut { .. 
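For a rough sense of what the movz/movk path in emit_load_value above costs, the sketch below counts non-zero 16-bit chunks. It is an illustrative model only: the real function also has a bitmask-immediate fast path and always emits the second movk once the single-movz case has been ruled out.

// Approximate instruction count for loading a 64-bit constant: one movz for the
// low 16 bits, plus one movk per additional non-zero 16-bit chunk.
// This deliberately skips the single-instruction bitmask-immediate case.
fn movz_movk_count(value: u64) -> usize {
    1 + (1..4).filter(|i| (value >> (16 * i)) & 0xffff != 0).count()
}

fn main() {
    assert_eq!(movz_movk_count(0xffff), 1);                  // fits in a single movz
    assert_eq!(movz_movk_count(0x0011_2233), 2);             // movz + one movk
    assert_eq!(movz_movk_count(0x1122_3344_5566_7788), 4);   // movz + three movk
}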
}) => { - asm.add(left, right); + asm.add(*left, *right); }, (reg_opnd @ (Opnd::Reg(_) | Opnd::InsnOut { .. }), other_opnd) | (other_opnd, reg_opnd @ (Opnd::Reg(_) | Opnd::InsnOut { .. })) => { @@ -299,24 +425,68 @@ impl Assembler asm.add(reg_opnd, opnd1); }, _ => { - let opnd0 = split_load_operand(asm, left); - let opnd1 = split_shifted_immediate(asm, right); + let opnd0 = split_load_operand(asm, *left); + let opnd1 = split_shifted_immediate(asm, *right); asm.add(opnd0, opnd1); } } }, - Insn::And { left, right, .. } => { - let (opnd0, opnd1) = split_boolean_operands(asm, left, right); - asm.and(opnd0, opnd1); - }, - Insn::Or { left, right, .. } => { - let (opnd0, opnd1) = split_boolean_operands(asm, left, right); - asm.or(opnd0, opnd1); - }, - Insn::Xor { left, right, .. } => { - let (opnd0, opnd1) = split_boolean_operands(asm, left, right); - asm.xor(opnd0, opnd1); - }, + Insn::And { left, right, out } | + Insn::Or { left, right, out } | + Insn::Xor { left, right, out } => { + let (opnd0, opnd1) = split_boolean_operands(asm, *left, *right); + *left = opnd0; + *right = opnd1; + + // Since these instructions are lowered to an instruction that have 2 input + // registers and an output register, look to merge with an `Insn::Mov` that + // follows which puts the output in another register. For example: + // `Add a, b => out` followed by `Mov c, out` becomes `Add a, b => c`. + if let (Opnd::Reg(_), Opnd::Reg(_), Some(Insn::Mov { dest, src })) = (left, right, iterator.peek()) { + if live_ranges[index] == index + 1 { + // Check after potentially lowering a stack operand to a register operand + let lowered_dest = if let Opnd::Stack { .. } = dest { + asm.lower_stack_opnd(dest) + } else { + *dest + }; + if out == src && matches!(lowered_dest, Opnd::Reg(_)) { + *out = lowered_dest; + iterator.map_insn_index(asm); + iterator.next_unmapped(); // Pop merged Insn::Mov + } + } + } + + asm.push_insn(insn); + } + // Lower to Joz and Jonz for generating CBZ/CBNZ for compare-with-0-and-branch. + ref insn @ Insn::Cmp { ref left, right: ref right @ (Opnd::UImm(0) | Opnd::Imm(0)) } | + ref insn @ Insn::Test { ref left, right: ref right @ (Opnd::InsnOut { .. } | Opnd::Reg(_)) } if { + let same_opnd_if_test = if let Insn::Test { .. } = insn { + left == right + } else { + true + }; + + same_opnd_if_test && if let Some( + Insn::Jz(target) | Insn::Je(target) | Insn::Jnz(target) | Insn::Jne(target) + ) = iterator.peek() { + matches!(target, Target::SideExit { .. }) + } else { + false + } + } => { + let reg = split_load_operand(asm, *left); + match iterator.peek() { + Some(Insn::Jz(target) | Insn::Je(target)) => asm.push_insn(Insn::Joz(reg, *target)), + Some(Insn::Jnz(target) | Insn::Jne(target)) => asm.push_insn(Insn::Jonz(reg, *target)), + _ => () + } + + iterator.map_insn_index(asm); + iterator.next_unmapped(); // Pop merged jump instruction + } Insn::CCall { opnds, fptr, .. } => { assert!(opnds.len() <= C_ARG_OPNDS.len()); @@ -330,21 +500,21 @@ impl Assembler // a UImm of 0 along as the argument to the move. let value = match opnd { Opnd::UImm(0) | Opnd::Imm(0) => Opnd::UImm(0), - Opnd::Mem(_) => split_memory_address(asm, opnd), - _ => opnd + Opnd::Mem(_) => split_memory_address(asm, *opnd), + _ => *opnd }; - asm.load_into(C_ARG_OPNDS[idx], value); + asm.load_into(Opnd::c_arg(C_ARG_OPNDS[idx]), value); } // Now we push the CCall without any arguments so that it // just performs the call. 
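The Mov-folding peephole described above (rewriting `Add a, b => out; Mov c, out` into `Add a, b => c` when `out` dies at the Mov) can be illustrated on a made-up three-address IR. The enum and helper below are invented for illustration and skip the Opnd::Stack lowering and live-range plumbing of the real backend.

// Toy three-address IR used only to demonstrate the folding rewrite.
#[derive(Clone, Copy, Debug, PartialEq)]
enum Op { Add { a: u8, b: u8, out: u8 }, Mov { dest: u8, src: u8 } }

// If `Add .. => t` is immediately followed by `Mov d, t` and t is dead afterwards,
// emit `Add .. => d` instead and drop the Mov.
fn fold_mov(prog: &[Op], last_use: &[usize]) -> Vec<Op> {
    let mut folded = Vec::new();
    let mut i = 0;
    while i < prog.len() {
        match (prog[i], prog.get(i + 1)) {
            (Op::Add { a, b, out: t }, Some(&Op::Mov { dest, src }))
                if src == t && last_use[t as usize] == i + 1 =>
            {
                folded.push(Op::Add { a, b, out: dest });
                i += 2; // the Mov is consumed
            }
            (insn, _) => { folded.push(insn); i += 1; }
        }
    }
    folded
}

fn main() {
    // Add r1, r2 => r3; Mov r0, r3  becomes  Add r1, r2 => r0
    let prog = [Op::Add { a: 1, b: 2, out: 3 }, Op::Mov { dest: 0, src: 3 }];
    let last_use = [0, 0, 0, 1]; // r3 dies at index 1 (the Mov)
    assert_eq!(fold_mov(&prog, &last_use), vec![Op::Add { a: 1, b: 2, out: 0 }]);
}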
- asm.ccall(fptr, vec![]); + asm.ccall(*fptr, vec![]); }, Insn::Cmp { left, right } => { - let opnd0 = split_load_operand(asm, left); + let opnd0 = split_load_operand(asm, *left); let opnd0 = split_less_than_32_cmp(asm, opnd0); - let split_right = split_shifted_immediate(asm, right); + let split_right = split_shifted_immediate(asm, *right); let opnd1 = match split_right { Opnd::InsnOut { .. } if opnd0.num_bits() != split_right.num_bits() => { split_right.with_num_bits(opnd0.num_bits().unwrap()).unwrap() @@ -364,81 +534,66 @@ impl Assembler // make sure the displacement isn't too large and then // load it into the return register. Opnd::Mem(_) => { - let split = split_memory_address(asm, opnd); + let split = split_memory_address(asm, *opnd); asm.load_into(C_RET_OPND, split); }, // Otherwise we just need to load the value into the // return register. _ => { - asm.load_into(C_RET_OPND, opnd); + asm.load_into(C_RET_OPND, *opnd); } } asm.cret(C_RET_OPND); }, - Insn::CSelZ { truthy, falsy, .. } => { - let (opnd0, opnd1) = split_csel_operands(asm, truthy, falsy); - asm.csel_z(opnd0, opnd1); - }, - Insn::CSelNZ { truthy, falsy, .. } => { - let (opnd0, opnd1) = split_csel_operands(asm, truthy, falsy); - asm.csel_nz(opnd0, opnd1); - }, - Insn::CSelE { truthy, falsy, .. } => { - let (opnd0, opnd1) = split_csel_operands(asm, truthy, falsy); - asm.csel_e(opnd0, opnd1); - }, - Insn::CSelNE { truthy, falsy, .. } => { - let (opnd0, opnd1) = split_csel_operands(asm, truthy, falsy); - asm.csel_ne(opnd0, opnd1); - }, - Insn::CSelL { truthy, falsy, .. } => { - let (opnd0, opnd1) = split_csel_operands(asm, truthy, falsy); - asm.csel_l(opnd0, opnd1); - }, - Insn::CSelLE { truthy, falsy, .. } => { - let (opnd0, opnd1) = split_csel_operands(asm, truthy, falsy); - asm.csel_le(opnd0, opnd1); - }, - Insn::CSelG { truthy, falsy, .. } => { - let (opnd0, opnd1) = split_csel_operands(asm, truthy, falsy); - asm.csel_g(opnd0, opnd1); - }, - Insn::CSelGE { truthy, falsy, .. } => { - let (opnd0, opnd1) = split_csel_operands(asm, truthy, falsy); - asm.csel_ge(opnd0, opnd1); + Insn::CSelZ { truthy, falsy, out } | + Insn::CSelNZ { truthy, falsy, out } | + Insn::CSelE { truthy, falsy, out } | + Insn::CSelNE { truthy, falsy, out } | + Insn::CSelL { truthy, falsy, out } | + Insn::CSelLE { truthy, falsy, out } | + Insn::CSelG { truthy, falsy, out } | + Insn::CSelGE { truthy, falsy, out } => { + let (opnd0, opnd1) = split_csel_operands(asm, *truthy, *falsy); + *truthy = opnd0; + *falsy = opnd1; + // Merge `csel` and `mov` into a single `csel` when possible + match iterator.peek() { + Some(Insn::Mov { dest: Opnd::Reg(reg), src }) + if matches!(out, Opnd::InsnOut { .. }) && *out == *src && live_ranges[index] == index + 1 => { + *out = Opnd::Reg(*reg); + asm.push_insn(insn); + iterator.map_insn_index(asm); + iterator.next_unmapped(); // Pop merged Insn::Mov + } + _ => { + asm.push_insn(insn); + } + } }, Insn::IncrCounter { mem, value } => { let counter_addr = match mem { - Opnd::Mem(_) => split_lea_operand(asm, mem), - _ => mem + Opnd::Mem(_) => split_lea_operand(asm, *mem), + _ => *mem }; - asm.incr_counter(counter_addr, value); + asm.incr_counter(counter_addr, *value); }, Insn::JmpOpnd(opnd) => { if let Opnd::Mem(_) = opnd { - let opnd0 = split_load_operand(asm, opnd); + let opnd0 = split_load_operand(asm, *opnd); asm.jmp_opnd(opnd0); } else { - asm.jmp_opnd(opnd); + asm.jmp_opnd(*opnd); } }, - Insn::Load { opnd, .. 
} => { - let value = match opnd { - Opnd::Mem(_) => split_memory_address(asm, opnd), - _ => opnd - }; - - asm.load(value); - }, - Insn::LoadInto { dest, opnd } => { - let value = match opnd { - Opnd::Mem(_) => split_memory_address(asm, opnd), - _ => opnd + Insn::Load { opnd, .. } | + Insn::LoadInto { opnd, .. } => { + *opnd = match opnd { + Opnd::Mem(_) => split_memory_address(asm, *opnd), + _ => *opnd }; - - asm.load_into(dest, value); + asm.push_insn(insn); }, Insn::LoadSExt { opnd, .. } => { match opnd { @@ -449,39 +604,50 @@ impl Assembler Opnd::Reg(Reg { num_bits: 32, .. }) | Opnd::InsnOut { num_bits: 32, .. } | Opnd::Mem(Mem { num_bits: 32, .. }) => { - asm.load_sext(opnd); + asm.load_sext(*opnd); }, _ => { - asm.load(opnd); + asm.load(*opnd); } }; }, Insn::Mov { dest, src } => { - let value: Opnd = match (dest, src) { - // If the first operand is zero, then we can just use - // the zero register. - (Opnd::Mem(_), Opnd::UImm(0) | Opnd::Imm(0)) => Opnd::Reg(XZR_REG), - // If the first operand is a memory operand, we're going - // to transform this into a store instruction, so we'll - // need to load this anyway. - (Opnd::Mem(_), Opnd::UImm(_)) => asm.load(src), - // The value that is being moved must be either a - // register or an immediate that can be encoded as a - // bitmask immediate. Otherwise, we'll need to split the - // move into multiple instructions. - _ => split_bitmask_immediate(asm, src, dest.rm_num_bits()) - }; + match (&dest, &src) { + // If we're attempting to load into a memory operand, then + // we'll switch over to the store instruction. + (Opnd::Mem(_), _) => { + let opnd0 = split_memory_address(asm, *dest); + let value = match *src { + // If the first operand is zero, then we can just use + // the zero register. + Opnd::UImm(0) | Opnd::Imm(0) => Opnd::Reg(XZR_REG), + // If the first operand is a memory operand, we're going + // to transform this into a store instruction, so we'll + // need to load this anyway. + Opnd::UImm(_) => asm.load(*src), + // The value that is being moved must be either a + // register or an immediate that can be encoded as a + // bitmask immediate. Otherwise, we'll need to split the + // move into multiple instructions. + _ => split_bitmask_immediate(asm, *src, dest.rm_num_bits()) + }; - // If we're attempting to load into a memory operand, then - // we'll switch over to the store instruction. Otherwise - // we'll use the normal mov instruction. - match dest { - Opnd::Mem(_) => { - let opnd0 = split_memory_address(asm, dest); asm.store(opnd0, value); }, - Opnd::Reg(_) => { - asm.mov(dest, value); + // If we're loading a memory operand into a register, then + // we'll switch over to the load instruction. + (Opnd::Reg(_), Opnd::Mem(_)) => { + let value = split_memory_address(asm, *src); + asm.load_into(*dest, value); + }, + // Otherwise we'll use the normal mov instruction. + (Opnd::Reg(_), _) => { + let value = match *src { + // Unlike other instructions, we can avoid splitting this case, using movz. + Opnd::UImm(uimm) if uimm <= 0xffff => *src, + _ => split_bitmask_immediate(asm, *src, dest.rm_num_bits()), + }; + asm.mov(*dest, value); }, _ => unreachable!() }; @@ -490,18 +656,26 @@ impl Assembler // The value that is being negated must be in a register, so // if we get anything else we need to load it first. 
let opnd0 = match opnd { - Opnd::Mem(_) => split_load_operand(asm, opnd), - _ => opnd + Opnd::Mem(_) => split_load_operand(asm, *opnd), + _ => *opnd }; asm.not(opnd0); }, - Insn::Store { dest, src } => { - // The displacement for the STUR instruction can't be more - // than 9 bits long. If it's longer, we need to load the - // memory address into a register first. - let opnd0 = split_memory_address(asm, dest); + Insn::LShift { opnd, .. } | + Insn::RShift { opnd, .. } | + Insn::URShift { opnd, .. } => { + // The operand must be in a register, so + // if we get anything else we need to load it first. + let opnd0 = match opnd { + Opnd::Mem(_) => split_load_operand(asm, *opnd), + _ => *opnd + }; + *opnd = opnd0; + asm.push_insn(insn); + }, + Insn::Store { dest, src } => { // The value being stored must be in a register, so if it's // not already one we'll load it first. let opnd1 = match src { @@ -509,26 +683,43 @@ impl Assembler // the zero register. Opnd::UImm(0) | Opnd::Imm(0) => Opnd::Reg(XZR_REG), // Otherwise we'll check if we need to load it first. - _ => split_load_operand(asm, src) + _ => split_load_operand(asm, *src) }; - asm.store(opnd0, opnd1); + match dest { + Opnd::Reg(_) => { + // Store does not support a register as a dest operand. + asm.mov(*dest, opnd1); + } + _ => { + // The displacement for the STUR instruction can't be more + // than 9 bits long. If it's longer, we need to load the + // memory address into a register first. + let opnd0 = split_memory_address(asm, *dest); + asm.store(opnd0, opnd1); + } + } }, Insn::Sub { left, right, .. } => { - let opnd0 = split_load_operand(asm, left); - let opnd1 = split_shifted_immediate(asm, right); + let opnd0 = split_load_operand(asm, *left); + let opnd1 = split_shifted_immediate(asm, *right); asm.sub(opnd0, opnd1); }, + Insn::Mul { left, right, .. } => { + let opnd0 = split_load_operand(asm, *left); + let opnd1 = split_load_operand(asm, *right); + asm.mul(opnd0, opnd1); + }, Insn::Test { left, right } => { // The value being tested must be in a register, so if it's // not already one we'll load it first. - let opnd0 = split_load_operand(asm, left); + let opnd0 = split_load_operand(asm, *left); // The second value must be either a register or an // unsigned immediate that can be encoded as a bitmask // immediate. If it's not one of those, we'll need to load // it first. - let opnd1 = split_bitmask_immediate(asm, right, opnd0.rm_num_bits()); + let opnd1 = split_bitmask_immediate(asm, *right, opnd0.rm_num_bits()); asm.test(opnd0, opnd1); }, _ => { @@ -551,9 +742,8 @@ impl Assembler } /// Emit platform-specific machine code - /// Returns a list of GC offsets - pub fn arm64_emit(&mut self, cb: &mut CodeBlock) -> Vec<u32> - { + /// Returns a list of GC offsets. Can return failure to signal caller to retry. + fn arm64_emit(&mut self, cb: &mut CodeBlock, ocb: &mut Option<&mut OutlinedCb>) -> Result<Vec<u32>, EmitError> { /// Determine how many instructions it will take to represent moving /// this value into a register. Note that the return value of this /// function must correspond to how many instructions are used to @@ -574,59 +764,13 @@ impl Assembler } } - /// Emit the required instructions to load the given value into the - /// given register. Our goal here is to use as few instructions as - /// possible to get this value into the register. 
- fn emit_load_value(cb: &mut CodeBlock, rd: A64Opnd, value: u64) -> usize { - let mut current = value; - - if current <= 0xffff { - // If the value fits into a single movz - // instruction, then we'll use that. - movz(cb, rd, A64Opnd::new_uimm(current), 0); - return 1; - } else if BitmaskImmediate::try_from(current).is_ok() { - // Otherwise, if the immediate can be encoded - // with the special bitmask immediate encoding, - // we'll use that. - mov(cb, rd, A64Opnd::new_uimm(current)); - return 1; - } else { - // Finally we'll fall back to encoding the value - // using movz for the first 16 bits and movk for - // each subsequent set of 16 bits as long we - // they are necessary. - movz(cb, rd, A64Opnd::new_uimm(current & 0xffff), 0); - let mut num_insns = 1; - - // (We're sure this is necessary since we - // checked if it only fit into movz above). - current >>= 16; - movk(cb, rd, A64Opnd::new_uimm(current & 0xffff), 16); - num_insns += 1; - - if current > 0xffff { - current >>= 16; - movk(cb, rd, A64Opnd::new_uimm(current & 0xffff), 32); - num_insns += 1; - } - - if current > 0xffff { - current >>= 16; - movk(cb, rd, A64Opnd::new_uimm(current & 0xffff), 48); - num_insns += 1; - } - return num_insns; - } - } - /// Emit a conditional jump instruction to a specific target. This is /// called when lowering any of the conditional jump instructions. fn emit_conditional_jump<const CONDITION: u8>(cb: &mut CodeBlock, target: Target) { match target { Target::CodePtr(dst_ptr) | Target::SideExitPtr(dst_ptr) => { - let dst_addr = dst_ptr.into_i64(); - let src_addr = cb.get_write_ptr().into_i64(); + let dst_addr = dst_ptr.as_offset(); + let src_addr = cb.get_write_ptr().as_offset(); let num_insns = if bcond_offset_fits_bits((dst_addr - src_addr) / 4) { // If the jump offset fits into the conditional jump as @@ -640,10 +784,22 @@ impl Assembler // Here we're going to return 1 because we've only // written out 1 instruction. 1 + } else if b_offset_fits_bits((dst_addr - (src_addr + 4)) / 4) { // + 4 for bcond + // If the jump offset fits into the unconditional jump as + // an immediate value, we can use inverse b.cond + b. + // + // We're going to write out the inverse condition so + // that if it doesn't match it will skip over the + // instruction used for branching. + bcond(cb, Condition::inverse(CONDITION), 2.into()); + b(cb, InstructionOffset::from_bytes((dst_addr - (src_addr + 4)) as i32)); // + 4 for bcond + + // We've only written out 2 instructions. + 2 } else { // Otherwise, we need to load the address into a // register and use the branch register instruction. - let dst_addr = dst_ptr.into_u64(); + let dst_addr = (dst_ptr.raw_ptr(cb) as usize).as_u64(); let load_insns: i32 = emit_load_size(dst_addr).into(); // We're going to write out the inverse condition so @@ -663,7 +819,8 @@ impl Assembler // We need to make sure we have at least 6 instructions for // every kind of jump for invalidation purposes, so we're // going to write out padding nop instructions here. - for _ in num_insns..6 { nop(cb); } + assert!(num_insns <= cb.conditional_jump_insns()); + for _ in num_insns..cb.conditional_jump_insns() { nop(cb); } } }, Target::Label(label_idx) => { @@ -676,62 +833,74 @@ impl Assembler bcond(cb, CONDITION, InstructionOffset::from_bytes(bytes)); }); }, + Target::SideExit { .. } => { + unreachable!("Target::SideExit should have been compiled by compile_side_exit") + }, }; } - /// Emit a push instruction for the given operand by adding to the stack - /// pointer and then storing the given value. 
+ /// Emit a CBZ or CBNZ which branches when a register is zero or non-zero + fn emit_cmp_zero_jump(cb: &mut CodeBlock, reg: A64Opnd, branch_if_zero: bool, target: Target) { + if let Target::SideExitPtr(dst_ptr) = target { + let dst_addr = dst_ptr.as_offset(); + let src_addr = cb.get_write_ptr().as_offset(); + + if cmp_branch_offset_fits_bits((dst_addr - src_addr) / 4) { + // If the offset fits in one instruction, generate cbz or cbnz + let bytes = (dst_addr - src_addr) as i32; + if branch_if_zero { + cbz(cb, reg, InstructionOffset::from_bytes(bytes)); + } else { + cbnz(cb, reg, InstructionOffset::from_bytes(bytes)); + } + } else { + // Otherwise, we load the address into a register and + // use the branch register instruction. Note that because + // side exits should always be close, this form should be + // rare or impossible to see. + let dst_addr = dst_ptr.raw_addr(cb) as u64; + let load_insns: i32 = emit_load_size(dst_addr).into(); + + // Write out the inverse condition so that if + // it doesn't match it will skip over the + // instructions used for branching. + if branch_if_zero { + cbnz(cb, reg, InstructionOffset::from_insns(load_insns + 2)); + } else { + cbz(cb, reg, InstructionOffset::from_insns(load_insns + 2)); + } + emit_load_value(cb, Assembler::SCRATCH0, dst_addr); + br(cb, Assembler::SCRATCH0); + + } + } else { + unreachable!("We should only generate Joz/Jonz with side-exit targets"); + } + } + + /// Push a value to the stack by subtracting from the stack pointer then storing, + /// leaving an 8-byte gap for alignment. fn emit_push(cb: &mut CodeBlock, opnd: A64Opnd) { str_pre(cb, opnd, A64Opnd::new_mem(64, C_SP_REG, -C_SP_STEP)); } - /// Emit a pop instruction into the given operand by loading the value - /// and then subtracting from the stack pointer. + /// Pop a value from the stack by loading `[sp]` then adding to the stack pointer. fn emit_pop(cb: &mut CodeBlock, opnd: A64Opnd) { ldr_post(cb, opnd, A64Opnd::new_mem(64, C_SP_REG, C_SP_STEP)); } - fn emit_jmp_ptr(cb: &mut CodeBlock, dst_ptr: CodePtr, padding: bool) { - let src_addr = cb.get_write_ptr().into_i64(); - let dst_addr = dst_ptr.into_i64(); - - // If the offset is short enough, then we'll use the - // branch instruction. Otherwise, we'll move the - // destination into a register and use the branch - // register instruction. - let num_insns = if b_offset_fits_bits((dst_addr - src_addr) / 4) { - b(cb, InstructionOffset::from_bytes((dst_addr - src_addr) as i32)); - 1 + /// Compile a side exit if Target::SideExit is given. + fn compile_side_exit( + target: Target, + asm: &mut Assembler, + ocb: &mut Option<&mut OutlinedCb>, + ) -> Result<Target, EmitError> { + if let Target::SideExit { counter, context } = target { + let side_exit = asm.get_side_exit(&context.unwrap(), Some(counter), ocb.as_mut().unwrap()) + .ok_or(EmitError::OutOfMemory)?; + Ok(Target::SideExitPtr(side_exit)) } else { - let num_insns = emit_load_value(cb, Assembler::SCRATCH0, dst_addr as u64); - br(cb, Assembler::SCRATCH0); - num_insns + 1 - }; - - if padding { - // Make sure it's always a consistent number of - // instructions in case it gets patched and has to - // use the other branch. - for _ in num_insns..(JMP_PTR_BYTES / 4) { - nop(cb); - } - } - } - - /// Call emit_jmp_ptr and immediately invalidate the written range. - /// This is needed when next_page also moves other_cb that is not invalidated - /// by compile_with_regs. 
Doing it here allows you to avoid invalidating a lot - /// more than necessary when other_cb jumps from a position early in the page. - /// This invalidates a small range of cb twice, but we accept the small cost. - fn emit_jmp_ptr_with_invalidation(cb: &mut CodeBlock, dst_ptr: CodePtr) { - #[cfg(not(test))] - let start = cb.get_write_ptr(); - emit_jmp_ptr(cb, dst_ptr, true); - #[cfg(not(test))] - { - let end = cb.get_write_ptr(); - use crate::cruby::rb_yjit_icache_invalidate; - unsafe { rb_yjit_icache_invalidate(start.raw_ptr() as _, end.raw_ptr() as _) }; + Ok(target) } } @@ -740,6 +909,9 @@ impl Assembler // List of GC offsets let mut gc_offsets: Vec<u32> = Vec::new(); + // Buffered list of PosMarker callbacks to fire if codegen is successful + let mut pos_markers: Vec<(usize, CodePtr)> = vec![]; + // For each instruction let start_write_pos = cb.get_write_pos(); let mut insn_idx: usize = 0; @@ -751,16 +923,14 @@ impl Assembler match insn { Insn::Comment(text) => { - if cfg!(feature = "disasm") { - cb.add_comment(text); - } + cb.add_comment(text); }, Insn::Label(target) => { cb.write_label(target.unwrap_label_idx()); }, // Report back the current position in the generated code - Insn::PosMarker(pos_marker) => { - pos_marker(cb.get_write_ptr()); + Insn::PosMarker(..) => { + pos_markers.push((insn_idx, cb.get_write_ptr())) } Insn::BakeString(text) => { for byte in text.as_bytes() { @@ -777,9 +947,6 @@ impl Assembler cb.write_byte(0); } }, - Insn::Add { left, right, out } => { - adds(cb, out.into(), left.into(), right.into()); - }, Insn::FrameSetup => { stp_pre(cb, X29, X30, A64Opnd::new_mem(128, C_SP_REG, -16)); @@ -792,9 +959,39 @@ impl Assembler ldp_post(cb, X29, X30, A64Opnd::new_mem(128, C_SP_REG, 16)); }, + Insn::Add { left, right, out } => { + adds(cb, out.into(), left.into(), right.into()); + }, Insn::Sub { left, right, out } => { subs(cb, out.into(), left.into(), right.into()); }, + Insn::Mul { left, right, out } => { + // If the next instruction is jo (jump on overflow) + match (self.insns.get(insn_idx + 1), self.insns.get(insn_idx + 2)) { + (Some(Insn::JoMul(_)), _) | + (Some(Insn::PosMarker(_)), Some(Insn::JoMul(_))) => { + // Compute the high 64 bits + smulh(cb, Self::SCRATCH0, left.into(), right.into()); + + // Compute the low 64 bits + // This may clobber one of the input registers, + // so we do it after smulh + mul(cb, out.into(), left.into(), right.into()); + + // Produce a register that is all zeros or all ones + // Based on the sign bit of the 64-bit mul result + asr(cb, Self::SCRATCH1, out.into(), A64Opnd::UImm(63)); + + // If the high 64-bits are not all zeros or all ones, + // matching the sign bit, then we have an overflow + cmp(cb, Self::SCRATCH0, Self::SCRATCH1); + // Insn::JoMul will emit_conditional_jump::<{Condition::NE}> + } + _ => { + mul(cb, out.into(), left.into(), right.into()); + } + } + }, Insn::And { left, right, out } => { and(cb, out.into(), left.into(), right.into()); }, @@ -864,6 +1061,12 @@ impl Assembler let ptr_offset: u32 = (cb.get_write_pos() as u32) - (SIZEOF_VALUE as u32); insn_gc_offsets.push(ptr_offset); }, + Opnd::CArg { .. } => { + unreachable!("C argument operand was not lowered before arm64_emit"); + } + Opnd::Stack { .. 
} => { + unreachable!("Stack operand was not lowered before arm64_emit"); + } Opnd::None => { unreachable!("Attempted to load from None operand"); } @@ -882,7 +1085,18 @@ impl Assembler }; }, Insn::Mov { dest, src } => { - mov(cb, dest.into(), src.into()); + // This supports the following two kinds of immediates: + // * The value fits into a single movz instruction + // * It can be encoded with the special bitmask immediate encoding + // arm64_split() should have split other immediates that require multiple instructions. + match src { + Opnd::UImm(uimm) if *uimm <= 0xffff => { + movz(cb, dest.into(), A64Opnd::new_uimm(*uimm), 0); + }, + _ => { + mov(cb, dest.into(), src.into()); + } + } }, Insn::Lea { opnd, out } => { let opnd: A64Opnd = opnd.into(); @@ -901,14 +1115,20 @@ impl Assembler } }; }, - Insn::LeaLabel { out, target, .. } => { - let label_idx = target.unwrap_label_idx(); + Insn::LeaJumpTarget { out, target, .. } => { + if let Target::Label(label_idx) = target { + // Set output to the raw address of the label + cb.label_ref(*label_idx, 4, |cb, end_addr, dst_addr| { + adr(cb, Self::SCRATCH0, A64Opnd::new_imm(dst_addr - (end_addr - 4))); + }); - cb.label_ref(label_idx, 4, |cb, end_addr, dst_addr| { - adr(cb, Self::SCRATCH0, A64Opnd::new_imm(dst_addr - (end_addr - 4))); - }); - - mov(cb, out.into(), Self::SCRATCH0); + mov(cb, out.into(), Self::SCRATCH0); + } else { + // Set output to the jump target's raw address + let target_code = target.unwrap_code_ptr(); + let target_addr = target_code.raw_addr(cb).as_u64(); + emit_load_value(cb, out.into(), target_addr); + } }, Insn::CPush(opnd) => { emit_push(cb, opnd.into()); @@ -934,8 +1154,8 @@ impl Assembler let regs = Assembler::get_caller_save_regs(); // Pop the state/flags register - msr(cb, SystemRegister::NZCV, Self::SCRATCH0); emit_pop(cb, Self::SCRATCH0); + msr(cb, SystemRegister::NZCV, Self::SCRATCH0); for reg in regs.into_iter().rev() { emit_pop(cb, A64Opnd::Reg(reg)); @@ -943,7 +1163,7 @@ impl Assembler }, Insn::CCall { fptr, .. } => { // The offset to the call target in bytes - let src_addr = cb.get_write_ptr().into_i64(); + let src_addr = cb.get_write_ptr().raw_ptr(cb) as i64; let dst_addr = *fptr as i64; // Use BL if the offset is short enough to encode as an immediate. @@ -968,12 +1188,12 @@ impl Assembler br(cb, opnd.into()); }, Insn::Jmp(target) => { - match target { + match compile_side_exit(*target, self, ocb)? { Target::CodePtr(dst_ptr) => { - emit_jmp_ptr(cb, *dst_ptr, true); + emit_jmp_ptr(cb, dst_ptr, true); }, Target::SideExitPtr(dst_ptr) => { - emit_jmp_ptr(cb, *dst_ptr, false); + emit_jmp_ptr(cb, dst_ptr, false); }, Target::Label(label_idx) => { // Here we're going to save enough space for @@ -981,27 +1201,45 @@ impl Assembler // instruction once we know the offset. We're going // to assume we can fit into a single b instruction. // It will panic otherwise. - cb.label_ref(*label_idx, 4, |cb, src_addr, dst_addr| { + cb.label_ref(label_idx, 4, |cb, src_addr, dst_addr| { let bytes: i32 = (dst_addr - (src_addr - 4)).try_into().unwrap(); b(cb, InstructionOffset::from_bytes(bytes)); }); }, + Target::SideExit { .. 
} => { + unreachable!("Target::SideExit should have been compiled by compile_side_exit") + }, }; }, Insn::Je(target) | Insn::Jz(target) => { - emit_conditional_jump::<{Condition::EQ}>(cb, *target); + emit_conditional_jump::<{Condition::EQ}>(cb, compile_side_exit(*target, self, ocb)?); }, - Insn::Jne(target) | Insn::Jnz(target) => { - emit_conditional_jump::<{Condition::NE}>(cb, *target); + Insn::Jne(target) | Insn::Jnz(target) | Insn::JoMul(target) => { + emit_conditional_jump::<{Condition::NE}>(cb, compile_side_exit(*target, self, ocb)?); }, Insn::Jl(target) => { - emit_conditional_jump::<{Condition::LT}>(cb, *target); + emit_conditional_jump::<{Condition::LT}>(cb, compile_side_exit(*target, self, ocb)?); + }, + Insn::Jg(target) => { + emit_conditional_jump::<{Condition::GT}>(cb, compile_side_exit(*target, self, ocb)?); + }, + Insn::Jge(target) => { + emit_conditional_jump::<{Condition::GE}>(cb, compile_side_exit(*target, self, ocb)?); }, Insn::Jbe(target) => { - emit_conditional_jump::<{Condition::LS}>(cb, *target); + emit_conditional_jump::<{Condition::LS}>(cb, compile_side_exit(*target, self, ocb)?); + }, + Insn::Jb(target) => { + emit_conditional_jump::<{Condition::CC}>(cb, compile_side_exit(*target, self, ocb)?); }, Insn::Jo(target) => { - emit_conditional_jump::<{Condition::VS}>(cb, *target); + emit_conditional_jump::<{Condition::VS}>(cb, compile_side_exit(*target, self, ocb)?); + }, + Insn::Joz(opnd, target) => { + emit_cmp_zero_jump(cb, opnd.into(), true, compile_side_exit(*target, self, ocb)?); + }, + Insn::Jonz(opnd, target) => { + emit_cmp_zero_jump(cb, opnd.into(), false, compile_side_exit(*target, self, ocb)?); }, Insn::IncrCounter { mem, value } => { let label = cb.new_label("incr_counter_loop".to_string()); @@ -1045,7 +1283,7 @@ impl Assembler } Insn::LiveReg { .. } => (), // just a reg alloc signal, no code Insn::PadInvalPatch => { - while (cb.get_write_pos().saturating_sub(std::cmp::max(start_write_pos, cb.page_start_pos()))) < JMP_PTR_BYTES && !cb.has_dropped_bytes() { + while (cb.get_write_pos().saturating_sub(std::cmp::max(start_write_pos, cb.page_start_pos()))) < cb.jmp_ptr_bytes() && !cb.has_dropped_bytes() { nop(cb); } } @@ -1055,19 +1293,39 @@ impl Assembler if !had_dropped_bytes && cb.has_dropped_bytes() && cb.next_page(src_ptr, emit_jmp_ptr_with_invalidation) { // Reset cb states before retrying the current Insn cb.set_label_state(old_label_state); + + // We don't want label references to cross page boundaries. Signal caller for + // retry. 
+ if !self.label_names.is_empty() { + return Err(EmitError::RetryOnNextPage); + } } else { insn_idx += 1; gc_offsets.append(&mut insn_gc_offsets); } } - gc_offsets + // Error if we couldn't write out everything + if cb.has_dropped_bytes() { + return Err(EmitError::OutOfMemory) + } else { + // No bytes dropped, so the pos markers point to valid code + for (insn_idx, pos) in pos_markers { + if let Insn::PosMarker(callback) = self.insns.get(insn_idx).unwrap() { + callback(pos, &cb); + } else { + panic!("non-PosMarker in pos_markers insn_idx={insn_idx} {self:?}"); + } + } + + return Ok(gc_offsets) + } } /// Optimize and compile the stored instructions - pub fn compile_with_regs(self, cb: &mut CodeBlock, regs: Vec<Reg>) -> Vec<u32> - { - let mut asm = self.arm64_split().alloc_regs(regs); + pub fn compile_with_regs(self, cb: &mut CodeBlock, ocb: Option<&mut OutlinedCb>, regs: Vec<Reg>) -> Option<(CodePtr, Vec<u32>)> { + let asm = self.arm64_split(); + let mut asm = asm.alloc_regs(regs); // Create label instances in the code block for (idx, name) in asm.label_names.iter().enumerate() { @@ -1076,11 +1334,24 @@ impl Assembler } let start_ptr = cb.get_write_ptr(); - let gc_offsets = asm.arm64_emit(cb); + let starting_label_state = cb.get_label_state(); + let mut ocb = ocb; // for &mut + let emit_result = match asm.arm64_emit(cb, &mut ocb) { + Err(EmitError::RetryOnNextPage) => { + // we want to lower jumps to labels to b.cond instructions, which have a 1 MiB + // range limit. We can easily exceed the limit in case the jump straddles two pages. + // In this case, we retry with a fresh page once. + cb.set_label_state(starting_label_state); + if cb.next_page(start_ptr, emit_jmp_ptr_with_invalidation) { + asm.arm64_emit(cb, &mut ocb) + } else { + Err(EmitError::OutOfMemory) + } + } + result => result + }; - if cb.has_dropped_bytes() { - cb.clear_labels(); - } else { + if let (Ok(gc_offsets), false) = (emit_result, cb.has_dropped_bytes()) { cb.link_labels(); // Invalidate icache for newly written out region so we don't run stale code. @@ -1089,21 +1360,26 @@ impl Assembler #[cfg(not(test))] cb.without_page_end_reserve(|cb| { for (start, end) in cb.writable_addrs(start_ptr, cb.get_write_ptr()) { - unsafe { rb_yjit_icache_invalidate(start as _, end as _) }; + unsafe { rb_jit_icache_invalidate(start as _, end as _) }; } }); - } - gc_offsets + Some((start_ptr, gc_offsets)) + } else { + cb.clear_labels(); + + None + } } } #[cfg(test)] mod tests { use super::*; + use crate::disasm::*; fn setup_asm() -> (Assembler, CodeBlock) { - (Assembler::new(), CodeBlock::new_dummy(1024)) + (Assembler::new(0), CodeBlock::new_dummy(1024)) } #[test] @@ -1112,7 +1388,7 @@ mod tests { let opnd = asm.add(Opnd::Reg(X0_REG), Opnd::Reg(X1_REG)); asm.store(Opnd::mem(64, Opnd::Reg(X2_REG), 0), opnd); - asm.compile_with_regs(&mut cb, vec![X3_REG]); + asm.compile_with_regs(&mut cb, None, vec![X3_REG]); // Assert that only 2 instructions were written. 
assert_eq!(8, cb.get_write_pos()); @@ -1142,7 +1418,7 @@ mod tests { fn test_emit_cpop_all() { let (mut asm, mut cb) = setup_asm(); - asm.cpop_all(); + asm.cpop_all(crate::core::RegMapping::default()); asm.compile_with_num_regs(&mut cb, 0); } @@ -1159,8 +1435,7 @@ mod tests { fn test_emit_je_fits_into_bcond() { let (mut asm, mut cb) = setup_asm(); - let offset = 80; - let target: CodePtr = ((cb.get_write_ptr().into_u64() + offset) as *mut u8).into(); + let target: CodePtr = cb.get_write_ptr().add_bytes(80); asm.je(Target::CodePtr(target)); asm.compile_with_num_regs(&mut cb, 0); @@ -1171,7 +1446,7 @@ mod tests { let (mut asm, mut cb) = setup_asm(); let offset = 1 << 21; - let target: CodePtr = ((cb.get_write_ptr().into_u64() + offset) as *mut u8).into(); + let target: CodePtr = cb.get_write_ptr().add_bytes(offset); asm.je(Target::CodePtr(target)); asm.compile_with_num_regs(&mut cb, 0); @@ -1182,7 +1457,7 @@ mod tests { let (mut asm, mut cb) = setup_asm(); let label = asm.new_label("label"); - let opnd = asm.lea_label(label); + let opnd = asm.lea_jump_target(label); asm.write_label(label); asm.bake_string("Hello, world!"); @@ -1401,6 +1676,47 @@ mod tests { } #[test] + fn test_bcond_straddling_code_pages() { + const LANDING_PAGE: usize = 65; + let mut asm = Assembler::new(0); + let mut cb = CodeBlock::new_dummy_with_freed_pages(vec![0, LANDING_PAGE]); + + // Skip to near the end of the page. Room for two instructions. + cb.set_pos(cb.page_start_pos() + cb.page_end() - 8); + + let end = asm.new_label("end"); + // Start with a conditional jump... + asm.jz(end); + + // A few instructions, enough to cause a page switch. + let sum = asm.add(399.into(), 111.into()); + let xorred = asm.xor(sum, 859.into()); + asm.store(Opnd::mem(64, Opnd::Reg(X2_REG), 0), xorred); + asm.store(Opnd::mem(64, Opnd::Reg(X0_REG), 0), xorred); + + // The branch target. It should be in the landing page. + asm.write_label(end); + asm.cret(xorred); + + // [Bug #19385] + // This used to panic with "The offset must be 19 bits or less." + // due to attempting to lower the `asm.jz` above to a `b.e` with an offset that's > 1 MiB. 
+ let starting_pos = cb.get_write_pos(); + asm.compile_with_num_regs(&mut cb, 2); + let gap = cb.get_write_pos() - starting_pos; + assert!(gap > 0b1111111111111111111); + + let instruction_at_starting_pos: [u8; 4] = unsafe { + std::slice::from_raw_parts(cb.get_ptr(starting_pos).raw_ptr(&cb), 4) + }.try_into().unwrap(); + assert_eq!( + 0b000101 << 26_u32, + u32::from_le_bytes(instruction_at_starting_pos) & (0b111111 << 26_u32), + "starting instruction should be an unconditional branch to the new page (B)" + ); + } + + #[test] fn test_emit_xor() { let (mut asm, mut cb) = setup_asm(); @@ -1408,6 +1724,11 @@ mod tests { asm.store(Opnd::mem(64, Opnd::Reg(X2_REG), 0), opnd); asm.compile_with_num_regs(&mut cb, 1); + + assert_disasm!(cb, "0b0001ca4b0000f8", " + 0x0: eor x11, x0, x1 + 0x4: stur x11, [x2] + "); } #[test] @@ -1433,4 +1754,76 @@ mod tests { )), } } + + #[test] + fn test_replace_mov_with_ldur() { + let (mut asm, mut cb) = setup_asm(); + + asm.mov(Opnd::Reg(TEMP_REGS[0]), Opnd::mem(64, CFP, 8)); + asm.compile_with_num_regs(&mut cb, 1); + + assert_disasm!(cb, "618240f8", {" + 0x0: ldur x1, [x19, #8] + "}); + } + + #[test] + fn test_not_split_mov() { + let (mut asm, mut cb) = setup_asm(); + + asm.mov(Opnd::Reg(TEMP_REGS[0]), Opnd::UImm(0xffff)); + asm.mov(Opnd::Reg(TEMP_REGS[0]), Opnd::UImm(0x10000)); + asm.compile_with_num_regs(&mut cb, 1); + + assert_disasm!(cb, "e1ff9fd2e10370b2", {" + 0x0: mov x1, #0xffff + 0x4: orr x1, xzr, #0x10000 + "}); + } + + #[test] + fn test_merge_csel_mov() { + let (mut asm, mut cb) = setup_asm(); + + let out = asm.csel_l(Qtrue.into(), Qfalse.into()); + asm.mov(Opnd::Reg(TEMP_REGS[0]), out); + asm.compile_with_num_regs(&mut cb, 2); + + assert_disasm!(cb, "8b0280d20c0080d261b18c9a", {" + 0x0: mov x11, #0x14 + 0x4: mov x12, #0 + 0x8: csel x1, x11, x12, lt + "}); + } + + #[test] + fn test_add_with_immediate() { + let (mut asm, mut cb) = setup_asm(); + + let out = asm.add(Opnd::Reg(TEMP_REGS[1]), 1.into()); + let out = asm.add(out, 1_usize.into()); + asm.mov(Opnd::Reg(TEMP_REGS[0]), out); + asm.compile_with_num_regs(&mut cb, 2); + + assert_disasm!(cb, "2b0500b16b0500b1e1030baa", {" + 0x0: adds x11, x9, #1 + 0x4: adds x11, x11, #1 + 0x8: mov x1, x11 + "}); + } + + #[test] + fn test_mul_with_immediate() { + let (mut asm, mut cb) = setup_asm(); + + let out = asm.mul(Opnd::Reg(TEMP_REGS[1]), 3.into()); + asm.mov(Opnd::Reg(TEMP_REGS[0]), out); + asm.compile_with_num_regs(&mut cb, 2); + + assert_disasm!(cb, "6b0080d22b7d0b9be1030baa", {" + 0x0: mov x11, #3 + 0x4: mul x11, x9, x11 + 0x8: mov x1, x11 + "}); + } } diff --git a/yjit/src/backend/ir.rs b/yjit/src/backend/ir.rs index c97276de9b..3fb67bc7cc 100644 --- a/yjit/src/backend/ir.rs +++ b/yjit/src/backend/ir.rs @@ -1,23 +1,16 @@ -#![allow(dead_code)] -#![allow(unused_variables)] -#![allow(unused_imports)] - -use std::cell::Cell; +use std::collections::HashMap; use std::fmt; use std::convert::From; -use std::io::Write; use std::mem::take; -use crate::cruby::{VALUE}; -use crate::virtualmem::{CodePtr}; -use crate::asm::{CodeBlock, uimm_num_bits, imm_num_bits}; -use crate::core::{Context, Type, TempMapping}; +use crate::codegen::{gen_counted_exit, gen_outlined_exit}; +use crate::cruby::{vm_stack_canary, SIZEOF_VALUE_I32, VALUE, VM_ENV_DATA_SIZE}; +use crate::virtualmem::CodePtr; +use crate::asm::{CodeBlock, OutlinedCb}; +use crate::core::{Context, RegMapping, RegOpnd, MAX_CTX_TEMPS}; use crate::options::*; +use crate::stats::*; -#[cfg(target_arch = "x86_64")] -use crate::backend::x86_64::*; - -#[cfg(target_arch = 
"aarch64")] -use crate::backend::arm64::*; +use crate::backend::current::*; pub const EC: Opnd = _EC; pub const CFP: Opnd = _CFP; @@ -25,6 +18,7 @@ pub const SP: Opnd = _SP; pub const C_ARG_OPNDS: [Opnd; 6] = _C_ARG_OPNDS; pub const C_RET_OPND: Opnd = _C_RET_OPND; +pub use crate::backend::current::{Reg, C_RET_REG}; // Memory operand base #[derive(Clone, Copy, PartialEq, Eq, Debug)] @@ -69,9 +63,28 @@ pub enum Opnd // Immediate Ruby value, may be GC'd, movable Value(VALUE), + /// C argument register. The alloc_regs resolves its register dependencies. + CArg(Reg), + // Output of a preceding instruction in this block InsnOut{ idx: usize, num_bits: u8 }, + /// Pointer to a slot on the VM stack + Stack { + /// Index from stack top. Used for conversion to StackOpnd. + idx: i32, + /// Number of bits for Opnd::Reg and Opnd::Mem. + num_bits: u8, + /// ctx.stack_size when this operand is made. Used with idx for Opnd::Reg. + stack_size: u8, + /// The number of local variables in the current ISEQ. Used only for locals. + num_locals: Option<u32>, + /// ctx.sp_offset when this operand is made. Used with idx for Opnd::Mem. + sp_offset: i8, + /// ctx.reg_mapping when this operand is read. Used for register allocation. + reg_mapping: Option<RegMapping> + }, + // Low-level operands, for lowering Imm(i64), // Raw signed immediate UImm(u64), // Raw unsigned immediate @@ -85,6 +98,8 @@ impl fmt::Debug for Opnd { match self { Self::None => write!(fmt, "None"), Value(val) => write!(fmt, "Value({val:?})"), + CArg(reg) => write!(fmt, "CArg({reg:?})"), + Stack { idx, sp_offset, .. } => write!(fmt, "SP[{}]", *sp_offset as i32 - idx - 1), InsnOut { idx, num_bits } => write!(fmt, "Out{num_bits}({idx})"), Imm(signed) => write!(fmt, "{signed:x}_i64"), UImm(unsigned) => write!(fmt, "{unsigned:x}_u64"), @@ -127,10 +142,11 @@ impl Opnd Opnd::UImm(ptr as u64) } - pub fn is_some(&self) -> bool { - match *self { - Opnd::None => false, - _ => true, + /// Constructor for a C argument operand + pub fn c_arg(reg_opnd: Opnd) -> Self { + match reg_opnd { + Opnd::Reg(reg) => Opnd::CArg(reg), + _ => unreachable!(), } } @@ -158,6 +174,7 @@ impl Opnd Opnd::Reg(reg) => Some(Opnd::Reg(reg.with_num_bits(num_bits))), Opnd::Mem(Mem { base, disp, .. }) => Some(Opnd::Mem(Mem { base, disp, num_bits })), Opnd::InsnOut { idx, .. } => Some(Opnd::InsnOut { idx, num_bits }), + Opnd::Stack { idx, stack_size, num_locals, sp_offset, reg_mapping, .. } => Some(Opnd::Stack { idx, num_bits, stack_size, num_locals, sp_offset, reg_mapping }), _ => None, } } @@ -211,6 +228,29 @@ impl Opnd pub fn match_num_bits(opnds: &[Opnd]) -> u8 { Self::match_num_bits_iter(opnds.iter()) } + + /// Convert Opnd::Stack into RegMapping + pub fn reg_opnd(&self) -> RegOpnd { + self.get_reg_opnd().unwrap() + } + + /// Convert an operand into RegMapping if it's Opnd::Stack + pub fn get_reg_opnd(&self) -> Option<RegOpnd> { + match *self { + Opnd::Stack { idx, stack_size, num_locals, .. 
} => Some( + if let Some(num_locals) = num_locals { + let last_idx = stack_size as i32 + VM_ENV_DATA_SIZE as i32 - 1; + assert!(last_idx <= idx, "Local index {} must be >= last local index {}", idx, last_idx); + assert!(idx <= last_idx + num_locals as i32, "Local index {} must be < last local index {} + local size {}", idx, last_idx, num_locals); + RegOpnd::Local((last_idx + num_locals as i32 - idx) as u8) + } else { + assert!(idx < stack_size as i32); + RegOpnd::Stack((stack_size as i32 - idx - 1) as u8) + } + ), + _ => None, + } + } } impl From<usize> for Opnd { @@ -254,13 +294,22 @@ impl From<VALUE> for Opnd { #[derive(Clone, Copy, PartialEq, Eq, Debug)] pub enum Target { - CodePtr(CodePtr), // Pointer to a piece of YJIT-generated code - SideExitPtr(CodePtr), // Pointer to a side exit code - Label(usize), // A label within the generated code + /// Pointer to a piece of YJIT-generated code + CodePtr(CodePtr), + /// Side exit with a counter + SideExit { counter: Counter, context: Option<SideExitContext> }, + /// Pointer to a side exit code + SideExitPtr(CodePtr), + /// A label within the generated code + Label(usize), } impl Target { + pub fn side_exit(counter: Counter) -> Target { + Target::SideExit { counter, context: None } + } + pub fn unwrap_label_idx(&self) -> usize { match self { Target::Label(idx) => *idx, @@ -283,7 +332,7 @@ impl From<CodePtr> for Target { } } -type PosMarkerFn = Box<dyn Fn(CodePtr)>; +type PosMarkerFn = Box<dyn Fn(CodePtr, &CodeBlock)>; /// YJIT IR instruction pub enum Insn { @@ -298,6 +347,7 @@ pub enum Insn { BakeString(String), // Trigger a debugger breakpoint + #[allow(dead_code)] Breakpoint, /// Add a comment into the IR at the point that this instruction is added. @@ -363,15 +413,24 @@ pub enum Insn { // Produces no output IncrCounter { mem: Opnd, value: Opnd }, - /// Jump if below or equal + /// Jump if below or equal (unsigned) Jbe(Target), + /// Jump if below (unsigned) + Jb(Target), + /// Jump if equal Je(Target), /// Jump if lower Jl(Target), + /// Jump if greater + Jg(Target), + + /// Jump if greater or equal + Jge(Target), + // Unconditional jump to a branch target Jmp(Target), @@ -387,15 +446,23 @@ pub enum Insn { /// Jump if overflow Jo(Target), + /// Jump if overflow in multiplication + JoMul(Target), + /// Jump if zero Jz(Target), + /// Jump if operand is zero (only used during lowering at the moment) + Joz(Opnd, Target), + + /// Jump if operand is non-zero (only used during lowering at the moment) + Jonz(Opnd, Target), + // Add a label into the IR at the point that this instruction is added. Label(Target), - // Load effective address relative to the current instruction pointer. It - // accepts a single signed immediate operand. - LeaLabel { target: Target, out: Opnd }, + /// Get the code address of a jump target + LeaJumpTarget { target: Target, out: Opnd }, // Load effective address Lea { opnd: Opnd, out: Opnd }, @@ -428,7 +495,7 @@ pub enum Insn { // binary OR operation. Or { left: Opnd, right: Opnd, out: Opnd }, - /// Pad nop instructions to accomodate Op::Jmp in case the block or the insn + /// Pad nop instructions to accommodate Op::Jmp in case the block or the insn /// is invalidated. PadInvalPatch, @@ -441,9 +508,12 @@ pub enum Insn { // Low-level instruction to store a value to memory. Store { dest: Opnd, src: Opnd }, - // This is the same as the OP_ADD instruction, except for subtraction. + // This is the same as the add instruction, except for subtraction. 
Sub { left: Opnd, right: Opnd, out: Opnd }, + // Integer multiplication + Mul { left: Opnd, right: Opnd, out: Opnd }, + // Bitwise AND test instruction Test { left: Opnd, right: Opnd }, @@ -458,16 +528,41 @@ pub enum Insn { impl Insn { /// Create an iterator that will yield a non-mutable reference to each /// operand in turn for this instruction. - pub(super) fn opnd_iter(&self) -> InsnOpndIterator { + pub(super) fn opnd_iter(&self) -> InsnOpndIterator<'_> { InsnOpndIterator::new(self) } /// Create an iterator that will yield a mutable reference to each operand /// in turn for this instruction. - pub(super) fn opnd_iter_mut(&mut self) -> InsnOpndMutIterator { + pub(super) fn opnd_iter_mut(&mut self) -> InsnOpndMutIterator<'_> { InsnOpndMutIterator::new(self) } + /// Get a mutable reference to a Target if it exists. + pub(super) fn target_mut(&mut self) -> Option<&mut Target> { + match self { + Insn::Jbe(target) | + Insn::Jb(target) | + Insn::Je(target) | + Insn::Jl(target) | + Insn::Jg(target) | + Insn::Jge(target) | + Insn::Jmp(target) | + Insn::Jne(target) | + Insn::Jnz(target) | + Insn::Jo(target) | + Insn::Jz(target) | + Insn::Label(target) | + Insn::JoMul(target) | + Insn::Joz(_, target) | + Insn::Jonz(_, target) | + Insn::LeaJumpTarget { target, .. } => { + Some(target) + } + _ => None, + } + } + /// Returns a string that describes which operation this instruction is /// performing. This is used for debugging. fn op(&self) -> &'static str { @@ -497,16 +592,22 @@ impl Insn { Insn::FrameTeardown => "FrameTeardown", Insn::IncrCounter { .. } => "IncrCounter", Insn::Jbe(_) => "Jbe", + Insn::Jb(_) => "Jb", Insn::Je(_) => "Je", Insn::Jl(_) => "Jl", + Insn::Jg(_) => "Jg", + Insn::Jge(_) => "Jge", Insn::Jmp(_) => "Jmp", Insn::JmpOpnd(_) => "JmpOpnd", Insn::Jne(_) => "Jne", Insn::Jnz(_) => "Jnz", Insn::Jo(_) => "Jo", + Insn::JoMul(_) => "JoMul", Insn::Jz(_) => "Jz", + Insn::Joz(..) => "Joz", + Insn::Jonz(..) => "Jonz", Insn::Label(_) => "Label", - Insn::LeaLabel { .. } => "LeaLabel", + Insn::LeaJumpTarget { .. } => "LeaJumpTarget", Insn::Lea { .. } => "Lea", Insn::LiveReg { .. } => "LiveReg", Insn::Load { .. } => "Load", @@ -521,6 +622,7 @@ impl Insn { Insn::RShift { .. } => "RShift", Insn::Store { .. } => "Store", Insn::Sub { .. } => "Sub", + Insn::Mul { .. } => "Mul", Insn::Test { .. } => "Test", Insn::URShift { .. } => "URShift", Insn::Xor { .. } => "Xor" @@ -544,7 +646,7 @@ impl Insn { Insn::CSelNZ { out, .. } | Insn::CSelZ { out, .. } | Insn::Lea { out, .. } | - Insn::LeaLabel { out, .. } | + Insn::LeaJumpTarget { out, .. } | Insn::LiveReg { out, .. } | Insn::Load { out, .. } | Insn::LoadSExt { out, .. } | @@ -553,6 +655,7 @@ impl Insn { Insn::Or { out, .. } | Insn::RShift { out, .. } | Insn::Sub { out, .. } | + Insn::Mul { out, .. } | Insn::URShift { out, .. } | Insn::Xor { out, .. } => Some(out), _ => None @@ -576,7 +679,7 @@ impl Insn { Insn::CSelNZ { out, .. } | Insn::CSelZ { out, .. } | Insn::Lea { out, .. } | - Insn::LeaLabel { out, .. } | + Insn::LeaJumpTarget { out, .. } | Insn::LiveReg { out, .. } | Insn::Load { out, .. } | Insn::LoadSExt { out, .. } | @@ -585,6 +688,7 @@ impl Insn { Insn::Or { out, .. } | Insn::RShift { out, .. } | Insn::Sub { out, .. } | + Insn::Mul { out, .. } | Insn::URShift { out, .. } | Insn::Xor { out, .. 
} => Some(out), _ => None @@ -595,14 +699,17 @@ impl Insn { pub fn target(&self) -> Option<&Target> { match self { Insn::Jbe(target) | + Insn::Jb(target) | Insn::Je(target) | Insn::Jl(target) | + Insn::Jg(target) | + Insn::Jge(target) | Insn::Jmp(target) | Insn::Jne(target) | Insn::Jnz(target) | Insn::Jo(target) | Insn::Jz(target) | - Insn::LeaLabel { target, .. } => Some(target), + Insn::LeaJumpTarget { target, .. } => Some(target), _ => None } } @@ -644,17 +751,22 @@ impl<'a> Iterator for InsnOpndIterator<'a> { Insn::FrameSetup | Insn::FrameTeardown | Insn::Jbe(_) | + Insn::Jb(_) | Insn::Je(_) | Insn::Jl(_) | + Insn::Jg(_) | + Insn::Jge(_) | Insn::Jmp(_) | Insn::Jne(_) | Insn::Jnz(_) | Insn::Jo(_) | + Insn::JoMul(_) | Insn::Jz(_) | Insn::Label(_) | - Insn::LeaLabel { .. } | + Insn::LeaJumpTarget { .. } | Insn::PadInvalPatch | Insn::PosMarker(_) => None, + Insn::CPopInto(opnd) | Insn::CPush(opnd) | Insn::CRet(opnd) | @@ -663,6 +775,8 @@ impl<'a> Iterator for InsnOpndIterator<'a> { Insn::LiveReg { opnd, .. } | Insn::Load { opnd, .. } | Insn::LoadSExt { opnd, .. } | + Insn::Joz(opnd, _) | + Insn::Jonz(opnd, _) | Insn::Not { opnd, .. } => { match self.idx { 0 => { @@ -691,6 +805,7 @@ impl<'a> Iterator for InsnOpndIterator<'a> { Insn::RShift { opnd: opnd0, shift: opnd1, .. } | Insn::Store { dest: opnd0, src: opnd1 } | Insn::Sub { left: opnd0, right: opnd1, .. } | + Insn::Mul { left: opnd0, right: opnd1, .. } | Insn::Test { left: opnd0, right: opnd1 } | Insn::URShift { opnd: opnd0, shift: opnd1, .. } | Insn::Xor { left: opnd0, right: opnd1, .. } => { @@ -741,17 +856,22 @@ impl<'a> InsnOpndMutIterator<'a> { Insn::FrameSetup | Insn::FrameTeardown | Insn::Jbe(_) | + Insn::Jb(_) | Insn::Je(_) | Insn::Jl(_) | + Insn::Jg(_) | + Insn::Jge(_) | Insn::Jmp(_) | Insn::Jne(_) | Insn::Jnz(_) | Insn::Jo(_) | + Insn::JoMul(_) | Insn::Jz(_) | Insn::Label(_) | - Insn::LeaLabel { .. } | + Insn::LeaJumpTarget { .. } | Insn::PadInvalPatch | Insn::PosMarker(_) => None, + Insn::CPopInto(opnd) | Insn::CPush(opnd) | Insn::CRet(opnd) | @@ -760,6 +880,8 @@ impl<'a> InsnOpndMutIterator<'a> { Insn::LiveReg { opnd, .. } | Insn::Load { opnd, .. } | Insn::LoadSExt { opnd, .. } | + Insn::Joz(opnd, _) | + Insn::Jonz(opnd, _) | Insn::Not { opnd, .. } => { match self.idx { 0 => { @@ -788,6 +910,7 @@ impl<'a> InsnOpndMutIterator<'a> { Insn::RShift { opnd: opnd0, shift: opnd1, .. } | Insn::Store { dest: opnd0, src: opnd1 } | Insn::Sub { left: opnd0, right: opnd1, .. } | + Insn::Mul { left: opnd0, right: opnd1, .. } | Insn::Test { left: opnd0, right: opnd1 } | Insn::URShift { opnd: opnd0, shift: opnd1, .. } | Insn::Xor { left: opnd0, right: opnd1, .. 
} => { @@ -842,10 +965,60 @@ impl fmt::Debug for Insn { } } +/// Set of variables used for generating side exits +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub struct SideExitContext { + /// PC of the instruction being compiled + pub pc: *mut VALUE, + + /// Context fields used by get_generic_ctx() + pub stack_size: u8, + pub sp_offset: i8, + pub reg_mapping: RegMapping, + pub is_return_landing: bool, + pub is_deferred: bool, +} + +impl SideExitContext { + /// Convert PC and Context into SideExitContext + pub fn new(pc: *mut VALUE, ctx: Context) -> Self { + let exit_ctx = SideExitContext { + pc, + stack_size: ctx.get_stack_size(), + sp_offset: ctx.get_sp_offset(), + reg_mapping: ctx.get_reg_mapping(), + is_return_landing: ctx.is_return_landing(), + is_deferred: ctx.is_deferred(), + }; + if cfg!(debug_assertions) { + // Assert that we're not losing any mandatory metadata + assert_eq!(exit_ctx.get_ctx(), ctx.get_generic_ctx()); + } + exit_ctx + } + + /// Convert SideExitContext to Context + fn get_ctx(&self) -> Context { + let mut ctx = Context::default(); + ctx.set_stack_size(self.stack_size); + ctx.set_sp_offset(self.sp_offset); + ctx.set_reg_mapping(self.reg_mapping); + if self.is_return_landing { + ctx.set_as_return_landing(); + } + if self.is_deferred { + ctx.mark_as_deferred(); + } + ctx + } +} + +/// Initial capacity for asm.insns vector +const ASSEMBLER_INSNS_CAPACITY: usize = 256; + /// Object into which we assemble instructions to be /// optimized and lowered -pub struct Assembler -{ +pub struct Assembler { pub(super) insns: Vec<Insn>, /// Parallel vec with insns @@ -854,22 +1027,81 @@ pub struct Assembler /// Names of labels pub(super) label_names: Vec<String>, + + /// Context for generating the current insn + pub ctx: Context, + + /// The current ISEQ's local table size. asm.local_opnd() uses this, and it's + /// sometimes hard to pass this value, e.g. asm.spill_regs() in asm.ccall(). + /// + /// `None` means we're not assembling for an ISEQ, or that the local size is + /// not relevant. + pub(super) num_locals: Option<u32>, + + /// Side exit caches for each SideExitContext + pub(super) side_exits: HashMap<SideExitContext, CodePtr>, + + /// PC for Target::SideExit + side_exit_pc: Option<*mut VALUE>, + + /// Stack size for Target::SideExit + side_exit_stack_size: Option<u8>, + + /// If true, the next ccall() should verify its leafness + leaf_ccall: bool, } impl Assembler { - pub fn new() -> Self { - Self::new_with_label_names(Vec::default()) + /// Create an Assembler for ISEQ-specific code. + /// It includes all inline code and some outlined code like side exits and stubs. + pub fn new(num_locals: u32) -> Self { + Self::new_with_label_names(Vec::default(), HashMap::default(), Some(num_locals)) + } + + /// Create an Assembler for outlined code that are not specific to any ISEQ, + /// e.g. trampolines that are shared globally. + pub fn new_without_iseq() -> Self { + Self::new_with_label_names(Vec::default(), HashMap::default(), None) } - pub fn new_with_label_names(label_names: Vec<String>) -> Self { + /// Create an Assembler with parameters that are populated by another Assembler instance. + /// This API is used for copying an Assembler for the next compiler pass. 
+ pub fn new_with_label_names( + label_names: Vec<String>, + side_exits: HashMap<SideExitContext, CodePtr>, + num_locals: Option<u32> + ) -> Self { Self { - insns: Vec::default(), - live_ranges: Vec::default(), - label_names + insns: Vec::with_capacity(ASSEMBLER_INSNS_CAPACITY), + live_ranges: Vec::with_capacity(ASSEMBLER_INSNS_CAPACITY), + label_names, + ctx: Context::default(), + num_locals, + side_exits, + side_exit_pc: None, + side_exit_stack_size: None, + leaf_ccall: false, } } + /// Get the list of registers that can be used for stack temps. + pub fn get_temp_regs() -> &'static [Reg] { + let num_regs = get_option!(num_temp_regs); + &TEMP_REGS[0..num_regs] + } + + /// Get the number of locals for the ISEQ being compiled + pub fn get_num_locals(&self) -> Option<u32> { + self.num_locals + } + + /// Set a context for generating side exits + pub fn set_side_exit_context(&mut self, pc: *mut VALUE, stack_size: u8) { + self.side_exit_pc = Some(pc); + self.side_exit_stack_size = Some(stack_size); + } + /// Build an Opnd::InsnOut from the current index of the assembler and the /// given number of bits. pub(super) fn next_opnd_out(&self, num_bits: u8) -> Opnd { @@ -879,31 +1111,75 @@ impl Assembler /// Append an instruction onto the current list of instructions and update /// the live ranges of any instructions whose outputs are being used as /// operands to this instruction. - pub(super) fn push_insn(&mut self, insn: Insn) { + pub fn push_insn(&mut self, mut insn: Insn) { // Index of this instruction let insn_idx = self.insns.len(); - // If we find any InsnOut from previous instructions, we're going to - // update the live range of the previous instruction to point to this - // one. - for opnd in insn.opnd_iter() { - match opnd { + let mut opnd_iter = insn.opnd_iter_mut(); + while let Some(opnd) = opnd_iter.next() { + match *opnd { + // If we find any InsnOut from previous instructions, we're going to update + // the live range of the previous instruction to point to this one. Opnd::InsnOut { idx, .. } => { - assert!(*idx < self.insns.len()); - self.live_ranges[*idx] = insn_idx; + assert!(idx < self.insns.len()); + self.live_ranges[idx] = insn_idx; } Opnd::Mem(Mem { base: MemBase::InsnOut(idx), .. }) => { - assert!(*idx < self.insns.len()); - self.live_ranges[*idx] = insn_idx; + assert!(idx < self.insns.len()); + self.live_ranges[idx] = insn_idx; + } + // Set current ctx.reg_mapping to Opnd::Stack. + Opnd::Stack { idx, num_bits, stack_size, num_locals, sp_offset, reg_mapping: None } => { + assert_eq!( + self.ctx.get_stack_size() as i16 - self.ctx.get_sp_offset() as i16, + stack_size as i16 - sp_offset as i16, + "Opnd::Stack (stack_size: {}, sp_offset: {}) expects a different SP position from asm.ctx (stack_size: {}, sp_offset: {})", + stack_size, sp_offset, self.ctx.get_stack_size(), self.ctx.get_sp_offset(), + ); + *opnd = Opnd::Stack { + idx, + num_bits, + stack_size, + num_locals, + sp_offset, + reg_mapping: Some(self.ctx.get_reg_mapping()), + }; } _ => {} } } + // Set a side exit context to Target::SideExit + if let Some(Target::SideExit { context, .. }) = insn.target_mut() { + // We should skip this when this instruction is being copied from another Assembler. 
+ if context.is_none() { + *context = Some(SideExitContext::new( + self.side_exit_pc.unwrap(), + self.ctx.with_stack_size(self.side_exit_stack_size.unwrap()), + )); + } + } + self.insns.push(insn); self.live_ranges.push(insn_idx); } + /// Get a cached side exit, wrapping a counter if specified + pub fn get_side_exit(&mut self, side_exit_context: &SideExitContext, counter: Option<Counter>, ocb: &mut OutlinedCb) -> Option<CodePtr> { + // Get a cached side exit + let side_exit = match self.side_exits.get(&side_exit_context) { + None => { + let exit_code = gen_outlined_exit(side_exit_context.pc, self.num_locals.unwrap(), &side_exit_context.get_ctx(), ocb)?; + self.side_exits.insert(*side_exit_context, exit_code); + exit_code + } + Some(code_ptr) => *code_ptr, + }; + + // Wrap a counter if needed + gen_counted_exit(side_exit_context.pc, side_exit, ocb, counter) + } + /// Create a new label instance that we can jump to pub fn new_label(&mut self, name: &str) -> Target { @@ -914,6 +1190,198 @@ impl Assembler Target::Label(label_idx) } + /// Convert Opnd::Stack to Opnd::Mem or Opnd::Reg + pub fn lower_stack_opnd(&self, opnd: &Opnd) -> Opnd { + // Convert Opnd::Stack to Opnd::Mem + fn mem_opnd(opnd: &Opnd) -> Opnd { + if let Opnd::Stack { idx, sp_offset, num_bits, .. } = *opnd { + incr_counter!(temp_mem_opnd); + Opnd::mem(num_bits, SP, (sp_offset as i32 - idx - 1) * SIZEOF_VALUE_I32) + } else { + unreachable!() + } + } + + // Convert Opnd::Stack to Opnd::Reg + fn reg_opnd(opnd: &Opnd, reg_idx: usize) -> Opnd { + let regs = Assembler::get_temp_regs(); + if let Opnd::Stack { num_bits, .. } = *opnd { + incr_counter!(temp_reg_opnd); + Opnd::Reg(regs[reg_idx]).with_num_bits(num_bits).unwrap() + } else { + unreachable!() + } + } + + match opnd { + Opnd::Stack { reg_mapping, .. } => { + if let Some(reg_idx) = reg_mapping.unwrap().get_reg(opnd.reg_opnd()) { + reg_opnd(opnd, reg_idx) + } else { + mem_opnd(opnd) + } + } + _ => unreachable!(), + } + } + + /// Allocate a register to a stack temp if available. + pub fn alloc_reg(&mut self, mapping: RegOpnd) { + // Allocate a register if there's no conflict. + let mut reg_mapping = self.ctx.get_reg_mapping(); + if reg_mapping.alloc_reg(mapping) { + self.set_reg_mapping(reg_mapping); + } + } + + /// Erase local variable type information + /// eg: because of a call we can't track + pub fn clear_local_types(&mut self) { + asm_comment!(self, "clear local variable types"); + self.ctx.clear_local_types(); + } + + /// Repurpose stack temp registers to the corresponding locals for arguments + pub fn map_temp_regs_to_args(&mut self, callee_ctx: &mut Context, argc: i32) -> Vec<RegOpnd> { + let mut callee_reg_mapping = callee_ctx.get_reg_mapping(); + let mut mapped_temps = vec![]; + + for arg_idx in 0..argc { + let stack_idx: u8 = (self.ctx.get_stack_size() as i32 - argc + arg_idx).try_into().unwrap(); + let temp_opnd = RegOpnd::Stack(stack_idx); + + // For each argument, if the stack temp for it has a register, + // let the callee use the register for the local variable. 
+ if let Some(reg_idx) = self.ctx.get_reg_mapping().get_reg(temp_opnd) { + let local_opnd = RegOpnd::Local(arg_idx.try_into().unwrap()); + callee_reg_mapping.set_reg(local_opnd, reg_idx); + mapped_temps.push(temp_opnd); + } + } + + asm_comment!(self, "local maps: {:?}", callee_reg_mapping); + callee_ctx.set_reg_mapping(callee_reg_mapping); + mapped_temps + } + + /// Spill all live registers to the stack + pub fn spill_regs(&mut self) { + self.spill_regs_except(&vec![]); + } + + /// Spill all live registers except `ignored_temps` to the stack + pub fn spill_regs_except(&mut self, ignored_temps: &Vec<RegOpnd>) { + // Forget registers above the stack top + let mut reg_mapping = self.ctx.get_reg_mapping(); + for stack_idx in self.ctx.get_stack_size()..MAX_CTX_TEMPS as u8 { + reg_mapping.dealloc_reg(RegOpnd::Stack(stack_idx)); + } + self.set_reg_mapping(reg_mapping); + + // If no registers are in use, skip all checks + if self.ctx.get_reg_mapping() == RegMapping::default() { + return; + } + + // Collect stack temps to be spilled + let mut spilled_opnds = vec![]; + for stack_idx in 0..u8::min(MAX_CTX_TEMPS as u8, self.ctx.get_stack_size()) { + let reg_opnd = RegOpnd::Stack(stack_idx); + if !ignored_temps.contains(®_opnd) && reg_mapping.dealloc_reg(reg_opnd) { + let idx = self.ctx.get_stack_size() - 1 - stack_idx; + let spilled_opnd = self.stack_opnd(idx.into()); + spilled_opnds.push(spilled_opnd); + reg_mapping.dealloc_reg(spilled_opnd.reg_opnd()); + } + } + + // Collect locals to be spilled + for local_idx in 0..MAX_CTX_TEMPS as u8 { + if reg_mapping.dealloc_reg(RegOpnd::Local(local_idx)) { + let first_local_ep_offset = self.num_locals.unwrap() + VM_ENV_DATA_SIZE - 1; + let ep_offset = first_local_ep_offset - local_idx as u32; + let spilled_opnd = self.local_opnd(ep_offset); + spilled_opnds.push(spilled_opnd); + reg_mapping.dealloc_reg(spilled_opnd.reg_opnd()); + } + } + + // Spill stack temps and locals + if !spilled_opnds.is_empty() { + asm_comment!(self, "spill_regs: {:?} -> {:?}", self.ctx.get_reg_mapping(), reg_mapping); + for &spilled_opnd in spilled_opnds.iter() { + self.spill_reg(spilled_opnd); + } + self.ctx.set_reg_mapping(reg_mapping); + } + } + + /// Spill a stack temp from a register to the stack + pub fn spill_reg(&mut self, opnd: Opnd) { + assert_ne!(self.ctx.get_reg_mapping().get_reg(opnd.reg_opnd()), None); + + // Use different RegMappings for dest and src operands + let reg_mapping = self.ctx.get_reg_mapping(); + let mut mem_mappings = reg_mapping; + mem_mappings.dealloc_reg(opnd.reg_opnd()); + + // Move the stack operand from a register to memory + match opnd { + Opnd::Stack { idx, num_bits, stack_size, num_locals, sp_offset, .. } => { + self.mov( + Opnd::Stack { idx, num_bits, stack_size, num_locals, sp_offset, reg_mapping: Some(mem_mappings) }, + Opnd::Stack { idx, num_bits, stack_size, num_locals, sp_offset, reg_mapping: Some(reg_mapping) }, + ); + } + _ => unreachable!(), + } + incr_counter!(temp_spill); + } + + /// Update which stack temps are in a register + pub fn set_reg_mapping(&mut self, reg_mapping: RegMapping) { + if self.ctx.get_reg_mapping() != reg_mapping { + asm_comment!(self, "reg_mapping: {:?} -> {:?}", self.ctx.get_reg_mapping(), reg_mapping); + self.ctx.set_reg_mapping(reg_mapping); + } + } + + // Shuffle register moves, sometimes adding extra moves using SCRATCH_REG, + // so that they will not rewrite each other before they are used. 
+ pub fn reorder_reg_moves(old_moves: &Vec<(Reg, Opnd)>) -> Vec<(Reg, Opnd)> { + // Return the index of a move whose destination is not used as a source if any. + fn find_safe_move(moves: &Vec<(Reg, Opnd)>) -> Option<usize> { + moves.iter().enumerate().find(|(_, &(dest_reg, _))| { + moves.iter().all(|&(_, src_opnd)| src_opnd != Opnd::Reg(dest_reg)) + }).map(|(index, _)| index) + } + + // Remove moves whose source and destination are the same + let mut old_moves: Vec<(Reg, Opnd)> = old_moves.clone().into_iter() + .filter(|&(reg, opnd)| Opnd::Reg(reg) != opnd).collect(); + + let mut new_moves = vec![]; + while old_moves.len() > 0 { + // Keep taking safe moves + while let Some(index) = find_safe_move(&old_moves) { + new_moves.push(old_moves.remove(index)); + } + + // No safe move. Load the source of one move into SCRATCH_REG, and + // then load SCRATCH_REG into the destination when it's safe. + if old_moves.len() > 0 { + // Make sure it's safe to use SCRATCH_REG + assert!(old_moves.iter().all(|&(_, opnd)| opnd != Opnd::Reg(Assembler::SCRATCH_REG))); + + // Move SCRATCH <- opnd, and delay reg <- SCRATCH + let (reg, opnd) = old_moves.remove(0); + new_moves.push((Assembler::SCRATCH_REG, opnd)); + old_moves.push((reg, Opnd::Reg(Assembler::SCRATCH_REG))); + } + } + new_moves + } + /// Sets the out field on the various instructions that require allocated /// registers because their output is used as the operand on a subsequent /// instruction. This is our implementation of the linear scan algorithm. @@ -959,6 +1427,19 @@ impl Assembler } } + // Adjust the number of entries in live_ranges so that it can be indexed by mapped indexes. + fn shift_live_ranges(live_ranges: &mut Vec<usize>, start_index: usize, shift_offset: isize) { + if shift_offset >= 0 { + for index in 0..(shift_offset as usize) { + live_ranges.insert(start_index + index, start_index + index); + } + } else { + for _ in 0..-shift_offset { + live_ranges.remove(start_index); + } + } + } + // Dump live registers for register spill debugging. fn dump_live_regs(insns: Vec<Insn>, live_ranges: Vec<usize>, num_regs: usize, spill_index: usize) { // Convert live_ranges to live_regs: the number of live registers at each index @@ -982,11 +1463,18 @@ impl Assembler } } + // We may need to reorder LoadInto instructions with a C argument operand. + // This buffers the operands of such instructions to process them in batches. + let mut c_args: Vec<(Reg, Opnd)> = vec![]; + + // live_ranges is indexed by original `index` given by the iterator. let live_ranges: Vec<usize> = take(&mut self.live_ranges); - let mut asm = Assembler::new_with_label_names(take(&mut self.label_names)); + // shifted_live_ranges is indexed by mapped indexes in insn operands. + let mut shifted_live_ranges: Vec<usize> = live_ranges.clone(); + let mut asm = Assembler::new_with_label_names(take(&mut self.label_names), take(&mut self.side_exits), self.num_locals); let mut iterator = self.into_draining_iter(); - while let Some((index, mut insn)) = iterator.next_unmapped() { + while let Some((index, mut insn)) = iterator.next_mapped() { // Check if this is the last instruction that uses an operand that // spans more than one instruction. In that case, return the // allocated register to the pool. @@ -997,12 +1485,11 @@ impl Assembler // Since we have an InsnOut, we know it spans more that one // instruction. let start_index = *idx; - assert!(start_index < index); // We're going to check if this is the last instruction that // uses this operand. 
If it is, we can return the allocated // register to the pool. - if live_ranges[start_index] == index { + if shifted_live_ranges[start_index] == index { if let Some(Opnd::Reg(reg)) = asm.insns[start_index].out_opnd() { dealloc_reg(&mut pool, ®s, reg); } else { @@ -1049,7 +1536,7 @@ impl Assembler let mut opnd_iter = insn.opnd_iter(); if let Some(Opnd::InsnOut{ idx, .. }) = opnd_iter.next() { - if live_ranges[*idx] == index { + if shifted_live_ranges[*idx] == index { if let Some(Opnd::Reg(reg)) = asm.insns[*idx].out_opnd() { out_reg = Some(take_reg(&mut pool, ®s, reg)); } @@ -1106,39 +1593,57 @@ impl Assembler } } - asm.push_insn(insn); + // Push instruction(s). Batch and reorder C argument operations if needed. + if let Insn::LoadInto { dest: Opnd::CArg(reg), opnd } = insn { + // Buffer C arguments + c_args.push((reg, opnd)); + } else { + // C arguments are buffered until CCall + if c_args.len() > 0 { + // Resolve C argument dependencies + let c_args_len = c_args.len() as isize; + let moves = Self::reorder_reg_moves(&std::mem::take(&mut c_args)); + shift_live_ranges(&mut shifted_live_ranges, asm.insns.len(), moves.len() as isize - c_args_len); + + // Push batched C arguments + for (reg, opnd) in moves { + asm.load_into(Opnd::Reg(reg), opnd); + } + } + // Other instructions are pushed as is + asm.push_insn(insn); + } + iterator.map_insn_index(&mut asm); } assert_eq!(pool, 0, "Expected all registers to be returned to the pool"); asm } - /// Compile the instructions down to machine code - /// NOTE: should compile return a list of block labels to enable - /// compiling multiple blocks at a time? - pub fn compile(self, cb: &mut CodeBlock) -> Vec<u32> + /// Compile the instructions down to machine code. + /// Can fail due to lack of code memory and inopportune code placement, among other reasons. + #[must_use] + pub fn compile(self, cb: &mut CodeBlock, ocb: Option<&mut OutlinedCb>) -> Option<(CodePtr, Vec<u32>)> { - #[cfg(feature = "disasm")] let start_addr = cb.get_write_ptr(); - let alloc_regs = Self::get_alloc_regs(); - let gc_offsets = self.compile_with_regs(cb, alloc_regs); + let ret = self.compile_with_regs(cb, ocb, alloc_regs); - #[cfg(feature = "disasm")] if let Some(dump_disasm) = get_option_ref!(dump_disasm) { use crate::disasm::dump_disasm_addr_range; let end_addr = cb.get_write_ptr(); dump_disasm_addr_range(cb, start_addr, end_addr, dump_disasm) } - gc_offsets + ret } /// Compile with a limited number of registers. Used only for unit tests. - pub fn compile_with_num_regs(self, cb: &mut CodeBlock, num_regs: usize) -> Vec<u32> + #[cfg(test)] + pub fn compile_with_num_regs(self, cb: &mut CodeBlock, num_regs: usize) -> (CodePtr, Vec<u32>) { let mut alloc_regs = Self::get_alloc_regs(); let alloc_regs = alloc_regs.drain(0..num_regs).collect(); - self.compile_with_regs(cb, alloc_regs) + self.compile_with_regs(cb, None, alloc_regs).unwrap() } /// Consume the assembler by creating a new draining iterator. @@ -1146,16 +1651,21 @@ impl Assembler AssemblerDrainingIterator::new(self) } - /// Consume the assembler by creating a new lookback iterator. - pub fn into_lookback_iter(self) -> AssemblerLookbackIterator { - AssemblerLookbackIterator::new(self) + /// Return true if the next ccall() is expected to be leaf. + pub fn get_leaf_ccall(&mut self) -> bool { + self.leaf_ccall + } + + /// Assert that the next ccall() is going to be leaf. 
+ pub fn expect_leaf_ccall(&mut self) { + self.leaf_ccall = true; } } /// A struct that allows iterating through an assembler's instructions and /// consuming them as it iterates. pub struct AssemblerDrainingIterator { - insns: std::vec::IntoIter<Insn>, + insns: std::iter::Peekable<std::vec::IntoIter<Insn>>, index: usize, indices: Vec<usize> } @@ -1163,9 +1673,9 @@ pub struct AssemblerDrainingIterator { impl AssemblerDrainingIterator { fn new(asm: Assembler) -> Self { Self { - insns: asm.insns.into_iter(), + insns: asm.insns.into_iter().peekable(), index: 0, - indices: Vec::default() + indices: Vec::with_capacity(ASSEMBLER_INSNS_CAPACITY), } } @@ -1177,10 +1687,11 @@ impl AssemblerDrainingIterator { /// end of the current list of instructions in order to maintain that /// alignment. pub fn map_insn_index(&mut self, asm: &mut Assembler) { - self.indices.push(asm.insns.len() - 1); + self.indices.push(asm.insns.len().saturating_sub(1)); } /// Map an operand by using this iterator's list of mapped indices. + #[cfg(target_arch = "x86_64")] pub fn map_opnd(&self, opnd: Opnd) -> Opnd { opnd.map_index(&self.indices) } @@ -1205,51 +1716,10 @@ impl AssemblerDrainingIterator { self.index += 1; self.insns.next().map(|insn| (index, insn)) } -} - -/// A struct that allows iterating through references to an assembler's -/// instructions without consuming them. -pub struct AssemblerLookbackIterator { - asm: Assembler, - index: Cell<usize> -} - -impl AssemblerLookbackIterator { - fn new(asm: Assembler) -> Self { - Self { asm, index: Cell::new(0) } - } - - /// Fetches a reference to an instruction at a specific index. - pub fn get(&self, index: usize) -> Option<&Insn> { - self.asm.insns.get(index) - } - /// Fetches a reference to an instruction in the list relative to the - /// current cursor location of this iterator. - pub fn get_relative(&self, difference: i32) -> Option<&Insn> { - let index: Result<i32, _> = self.index.get().try_into(); - let relative: Result<usize, _> = index.and_then(|value| (value + difference).try_into()); - relative.ok().and_then(|value| self.asm.insns.get(value)) - } - - /// Fetches the previous instruction relative to the current cursor location - /// of this iterator. - pub fn get_previous(&self) -> Option<&Insn> { - self.get_relative(-1) - } - - /// Fetches the next instruction relative to the current cursor location of - /// this iterator. - pub fn get_next(&self) -> Option<&Insn> { - self.get_relative(1) - } - - /// Returns the next instruction in the list with the indices corresponding - /// to the previous list of instructions. - pub fn next_unmapped(&self) -> Option<(usize, &Insn)> { - let index = self.index.get(); - self.index.set(index + 1); - self.asm.insns.get(index).map(|insn| (index, insn)) + /// Returns the next instruction without incrementing the iterator's index. + pub fn peek(&mut self) -> Option<&Insn> { + self.insns.peek() } } @@ -1284,22 +1754,67 @@ impl Assembler { self.push_insn(Insn::BakeString(text.to_string())); } + #[allow(dead_code)] pub fn breakpoint(&mut self) { self.push_insn(Insn::Breakpoint); } pub fn ccall(&mut self, fptr: *const u8, opnds: Vec<Opnd>) -> Opnd { + // Let vm_check_canary() assert this ccall's leafness if leaf_ccall is set + let canary_opnd = self.set_stack_canary(&opnds); + + let old_temps = self.ctx.get_reg_mapping(); // with registers + // Spill stack temp registers since they are caller-saved registers. + // Note that this doesn't spill stack temps that are already popped + // but may still be used in the C arguments. 
+ self.spill_regs(); + let new_temps = self.ctx.get_reg_mapping(); // all spilled + + // Temporarily manipulate RegMappings so that we can use registers + // to pass stack operands that are already spilled above. + self.ctx.set_reg_mapping(old_temps); + + // Call a C function let out = self.next_opnd_out(Opnd::match_num_bits(&opnds)); self.push_insn(Insn::CCall { fptr, opnds, out }); + + // Registers in old_temps may be clobbered by the above C call, + // so rollback the manipulated RegMappings to a spilled version. + self.ctx.set_reg_mapping(new_temps); + + // Clear the canary after use + if let Some(canary_opnd) = canary_opnd { + self.mov(canary_opnd, 0.into()); + } + out } - pub fn cmp(&mut self, left: Opnd, right: Opnd) { - self.push_insn(Insn::Cmp { left, right }); + /// Let vm_check_canary() assert the leafness of this ccall if leaf_ccall is set + fn set_stack_canary(&mut self, opnds: &Vec<Opnd>) -> Option<Opnd> { + // Use the slot right above the stack top for verifying leafness. + let canary_opnd = self.stack_opnd(-1); + + // If the slot is already used, which is a valid optimization to avoid spills, + // give up the verification. + let canary_opnd = if cfg!(feature = "runtime_checks") && self.leaf_ccall && opnds.iter().all(|opnd| + opnd.get_reg_opnd() != canary_opnd.get_reg_opnd() + ) { + asm_comment!(self, "set stack canary"); + self.mov(canary_opnd, vm_stack_canary().into()); + Some(canary_opnd) + } else { + None + }; + + // Avoid carrying the flag to the next instruction whether we verified it or not. + self.leaf_ccall = false; + + canary_opnd } - pub fn comment(&mut self, text: &str) { - self.push_insn(Insn::Comment(text.to_string())); + pub fn cmp(&mut self, left: Opnd, right: Opnd) { + self.push_insn(Insn::Cmp { left, right }); } #[must_use] @@ -1309,8 +1824,12 @@ impl Assembler { out } - pub fn cpop_all(&mut self) { + pub fn cpop_all(&mut self, reg_mapping: RegMapping) { self.push_insn(Insn::CPopAll); + + // Re-enable ccall's RegMappings assertion disabled by cpush_all. + // cpush_all + cpop_all preserve all stack temp registers, so it's safe. + self.set_reg_mapping(reg_mapping); } pub fn cpop_into(&mut self, opnd: Opnd) { @@ -1321,8 +1840,16 @@ impl Assembler { self.push_insn(Insn::CPush(opnd)); } - pub fn cpush_all(&mut self) { + pub fn cpush_all(&mut self) -> RegMapping { self.push_insn(Insn::CPushAll); + + // Mark all temps as not being in registers. + // Temps will be marked back as being in registers by cpop_all. + // We assume that cpush_all + cpop_all are used for C functions in utils.rs + // that don't require spill_regs for GC. 
+ let mapping = self.ctx.get_reg_mapping(); + self.set_reg_mapping(RegMapping::default()); + mapping } pub fn cret(&mut self, opnd: Opnd) { @@ -1401,6 +1928,10 @@ impl Assembler { self.push_insn(Insn::Jbe(target)); } + pub fn jb(&mut self, target: Target) { + self.push_insn(Insn::Jb(target)); + } + pub fn je(&mut self, target: Target) { self.push_insn(Insn::Je(target)); } @@ -1409,6 +1940,16 @@ impl Assembler { self.push_insn(Insn::Jl(target)); } + #[allow(dead_code)] + pub fn jg(&mut self, target: Target) { + self.push_insn(Insn::Jg(target)); + } + + #[allow(dead_code)] + pub fn jge(&mut self, target: Target) { + self.push_insn(Insn::Jge(target)); + } + pub fn jmp(&mut self, target: Target) { self.push_insn(Insn::Jmp(target)); } @@ -1429,6 +1970,10 @@ impl Assembler { self.push_insn(Insn::Jo(target)); } + pub fn jo_mul(&mut self, target: Target) { + self.push_insn(Insn::JoMul(target)); + } + pub fn jz(&mut self, target: Target) { self.push_insn(Insn::Jz(target)); } @@ -1441,9 +1986,9 @@ impl Assembler { } #[must_use] - pub fn lea_label(&mut self, target: Target) -> Opnd { + pub fn lea_jump_target(&mut self, target: Target) -> Opnd { let out = self.next_opnd_out(Opnd::DEFAULT_NUM_BITS); - self.push_insn(Insn::LeaLabel { target, out }); + self.push_insn(Insn::LeaJumpTarget { target, out }); out } @@ -1462,7 +2007,10 @@ impl Assembler { } pub fn load_into(&mut self, dest: Opnd, opnd: Opnd) { - self.push_insn(Insn::LoadInto { dest, opnd }); + match (dest, opnd) { + (Opnd::Reg(dest), Opnd::Reg(opnd)) if dest == opnd => {}, // skip if noop + _ => self.push_insn(Insn::LoadInto { dest, opnd }), + } } #[must_use] @@ -1502,7 +2050,7 @@ impl Assembler { } //pub fn pos_marker<F: FnMut(CodePtr)>(&mut self, marker_fn: F) - pub fn pos_marker(&mut self, marker_fn: impl Fn(CodePtr) + 'static) { + pub fn pos_marker(&mut self, marker_fn: impl Fn(CodePtr, &CodeBlock) + 'static) { self.push_insn(Insn::PosMarker(Box::new(marker_fn))); } @@ -1524,17 +2072,35 @@ impl Assembler { out } + #[must_use] + pub fn mul(&mut self, left: Opnd, right: Opnd) -> Opnd { + let out = self.next_opnd_out(Opnd::match_num_bits(&[left, right])); + self.push_insn(Insn::Mul { left, right, out }); + out + } + pub fn test(&mut self, left: Opnd, right: Opnd) { self.push_insn(Insn::Test { left, right }); } #[must_use] + #[allow(dead_code)] pub fn urshift(&mut self, opnd: Opnd, shift: Opnd) -> Opnd { let out = self.next_opnd_out(Opnd::match_num_bits(&[opnd, shift])); self.push_insn(Insn::URShift { opnd, shift, out }); out } + /// Verify the leafness of the given block + pub fn with_leaf_ccall<F, R>(&mut self, mut block: F) -> R + where F: FnMut(&mut Self) -> R { + let old_leaf_ccall = self.leaf_ccall; + self.leaf_ccall = true; + let ret = block(self); + self.leaf_ccall = old_leaf_ccall; + ret + } + /// Add a label at the current position pub fn write_label(&mut self, target: Target) { assert!(target.unwrap_label_idx() < self.label_names.len()); @@ -1549,6 +2115,17 @@ impl Assembler { } } +/// Macro to use format! for Insn::Comment, which skips a format! call +/// when not dumping disassembly. +macro_rules! 
asm_comment { + ($asm:expr, $($fmt:tt)*) => { + if $crate::options::get_option_ref!(dump_disasm).is_some() { + $asm.push_insn(Insn::Comment(format!($($fmt)*))); + } + }; +} +pub(crate) use asm_comment; + #[cfg(test)] mod tests { use super::*; diff --git a/yjit/src/backend/mod.rs b/yjit/src/backend/mod.rs index 4794695094..6921244c72 100644 --- a/yjit/src/backend/mod.rs +++ b/yjit/src/backend/mod.rs @@ -4,5 +4,11 @@ pub mod x86_64; #[cfg(target_arch = "aarch64")] pub mod arm64; +#[cfg(target_arch = "x86_64")] +pub use x86_64 as current; + +#[cfg(target_arch = "aarch64")] +pub use arm64 as current; + pub mod ir; mod tests; diff --git a/yjit/src/backend/tests.rs b/yjit/src/backend/tests.rs index 3098c7e3b0..bfeea5163a 100644 --- a/yjit/src/backend/tests.rs +++ b/yjit/src/backend/tests.rs @@ -1,19 +1,19 @@ #![cfg(test)] -use crate::asm::{CodeBlock}; +use crate::asm::CodeBlock; use crate::backend::ir::*; use crate::cruby::*; use crate::utils::c_callable; #[test] fn test_add() { - let mut asm = Assembler::new(); + let mut asm = Assembler::new(0); let out = asm.add(SP, Opnd::UImm(1)); let _ = asm.add(out, Opnd::UImm(2)); } #[test] fn test_alloc_regs() { - let mut asm = Assembler::new(); + let mut asm = Assembler::new(0); // Get the first output that we're going to reuse later. let out1 = asm.add(EC, Opnd::UImm(1)); @@ -62,7 +62,7 @@ fn test_alloc_regs() { fn setup_asm() -> (Assembler, CodeBlock) { return ( - Assembler::new(), + Assembler::new(0), CodeBlock::new_dummy(1024) ); } @@ -87,7 +87,7 @@ fn test_mov_mem2mem() { let (mut asm, mut cb) = setup_asm(); - asm.comment("check that comments work too"); + asm_comment!(asm, "check that comments work too"); asm.mov(Opnd::mem(64, SP, 0), Opnd::mem(64, SP, 8)); asm.compile_with_num_regs(&mut cb, 1); @@ -194,12 +194,12 @@ fn test_c_call() #[test] fn test_alloc_ccall_regs() { - let mut asm = Assembler::new(); + let mut asm = Assembler::new(0); let out1 = asm.ccall(0 as *const u8, vec![]); let out2 = asm.ccall(0 as *const u8, vec![out1]); asm.mov(EC, out2); let mut cb = CodeBlock::new_dummy(1024); - asm.compile_with_regs(&mut cb, Assembler::get_alloc_regs()); + asm.compile_with_regs(&mut cb, None, Assembler::get_alloc_regs()); } #[test] @@ -231,10 +231,10 @@ fn test_jcc_ptr() { let (mut asm, mut cb) = setup_asm(); - let side_exit = Target::CodePtr(((cb.get_write_ptr().raw_ptr() as usize + 4) as *mut u8).into()); - let not_mask = asm.not(Opnd::mem(32, EC, RUBY_OFFSET_EC_INTERRUPT_MASK)); + let side_exit = Target::CodePtr(cb.get_write_ptr().add_bytes(4)); + let not_mask = asm.not(Opnd::mem(32, EC, RUBY_OFFSET_EC_INTERRUPT_MASK as i32)); asm.test( - Opnd::mem(32, EC, RUBY_OFFSET_EC_INTERRUPT_FLAG), + Opnd::mem(32, EC, RUBY_OFFSET_EC_INTERRUPT_FLAG as i32), not_mask, ); asm.jnz(side_exit); @@ -248,7 +248,7 @@ fn test_jmp_ptr() { let (mut asm, mut cb) = setup_asm(); - let stub = Target::CodePtr(((cb.get_write_ptr().raw_ptr() as usize + 4) as *mut u8).into()); + let stub = Target::CodePtr(cb.get_write_ptr().add_bytes(4)); asm.jmp(stub); asm.compile_with_num_regs(&mut cb, 0); @@ -259,7 +259,7 @@ fn test_jo() { let (mut asm, mut cb) = setup_asm(); - let side_exit = Target::CodePtr(((cb.get_write_ptr().raw_ptr() as usize + 4) as *mut u8).into()); + let side_exit = Target::CodePtr(cb.get_write_ptr().add_bytes(4)); let arg1 = Opnd::mem(64, SP, 0); let arg0 = Opnd::mem(64, SP, 8); @@ -283,8 +283,7 @@ fn test_bake_string() { #[test] fn test_draining_iterator() { - - let mut asm = Assembler::new(); + let mut asm = Assembler::new(0); let _ = asm.load(Opnd::None); 
asm.store(Opnd::None, Opnd::None); @@ -303,25 +302,6 @@ fn test_draining_iterator() { } #[test] -fn test_lookback_iterator() { - let mut asm = Assembler::new(); - - let _ = asm.load(Opnd::None); - asm.store(Opnd::None, Opnd::None); - asm.store(Opnd::None, Opnd::None); - - let iter = asm.into_lookback_iter(); - - while let Some((index, insn)) = iter.next_unmapped() { - if index > 0 { - let opnd_iter = iter.get_previous().unwrap().opnd_iter(); - assert_eq!(opnd_iter.take(1).next(), Some(&Opnd::None)); - assert!(matches!(insn, Insn::Store { .. })); - } - } -} - -#[test] fn test_cmp_8_bit() { let (mut asm, mut cb) = setup_asm(); let reg = Assembler::get_alloc_regs()[0]; @@ -329,3 +309,21 @@ fn test_cmp_8_bit() { asm.compile_with_num_regs(&mut cb, 1); } + +#[test] +fn test_no_pos_marker_callback_when_compile_fails() { + // When compilation fails (e.g. when out of memory), the code written out is malformed. + // We don't want to invoke the pos_marker callbacks with positions of malformed code. + let mut asm = Assembler::new(0); + + // Markers around code to exhaust memory limit + let fail_if_called = |_code_ptr, _cb: &_| panic!("pos_marker callback should not be called"); + asm.pos_marker(fail_if_called); + let zero = asm.load(0.into()); + let sum = asm.add(zero, 500.into()); + asm.store(Opnd::mem(64, SP, 8), sum); + asm.pos_marker(fail_if_called); + + let cb = &mut CodeBlock::new_dummy(8); + assert!(asm.compile(cb, None).is_none(), "should fail due to tiny size limit"); +} diff --git a/yjit/src/backend/x86_64/mod.rs b/yjit/src/backend/x86_64/mod.rs index 297a0fd852..ef435bca7e 100644 --- a/yjit/src/backend/x86_64/mod.rs +++ b/yjit/src/backend/x86_64/mod.rs @@ -1,15 +1,12 @@ -#![allow(dead_code)] -#![allow(unused_variables)] -#![allow(unused_imports)] - use std::mem::take; use crate::asm::*; use crate::asm::x86_64::*; -use crate::codegen::{JITState}; +use crate::codegen::CodePtr; use crate::cruby::*; use crate::backend::ir::*; -use crate::codegen::CodegenGlobals; +use crate::options::*; +use crate::utils::*; // Use the x86 register type for this platform pub type Reg = X86Reg; @@ -33,8 +30,10 @@ pub const _C_ARG_OPNDS: [Opnd; 6] = [ pub const C_RET_REG: Reg = RAX_REG; pub const _C_RET_OPND: Opnd = Opnd::Reg(RAX_REG); -// The number of bytes that are generated by jmp_ptr -pub const JMP_PTR_BYTES: usize = 6; +impl CodeBlock { + // The number of bytes that are generated by jmp_ptr + pub fn jmp_ptr_bytes(&self) -> usize { 5 } +} /// Map Opnd to X86Opnd impl From<Opnd> for X86Opnd { @@ -80,12 +79,16 @@ impl From<&Opnd> for X86Opnd { } } +/// List of registers that can be used for stack temps and locals. +pub static TEMP_REGS: [Reg; 5] = [RSI_REG, RDI_REG, R8_REG, R9_REG, R10_REG]; + impl Assembler { // A special scratch register for intermediate processing. - // Note: right now this is only used by LeaLabel because label_ref accepts - // a closure and we don't want it to have to capture anything. 
- const SCRATCH0: X86Opnd = X86Opnd::Reg(R11_REG); + // This register is caller-saved (so we don't have to save it before using it) + pub const SCRATCH_REG: Reg = R11_REG; + const SCRATCH0: X86Opnd = X86Opnd::Reg(Assembler::SCRATCH_REG); + /// Get the list of registers from which we can allocate on this platform pub fn get_alloc_regs() -> Vec<Reg> @@ -109,7 +112,7 @@ impl Assembler fn x86_split(mut self) -> Assembler { let live_ranges: Vec<usize> = take(&mut self.live_ranges); - let mut asm = Assembler::new_with_label_names(take(&mut self.label_names)); + let mut asm = Assembler::new_with_label_names(take(&mut self.label_names), take(&mut self.side_exits), self.num_locals); let mut iterator = self.into_draining_iter(); while let Some((index, mut insn)) = iterator.next_unmapped() { @@ -132,7 +135,7 @@ impl Assembler // Opnd::Value operands into registers here because: // // - Most instructions can't be encoded with 64-bit immediates. - // - We look for Op::Load specifically when emiting to keep GC'ed + // - We look for Op::Load specifically when emitting to keep GC'ed // VALUEs alive. This is a sort of canonicalization. let mut unmapped_opnds: Vec<Opnd> = vec![]; @@ -140,21 +143,23 @@ impl Assembler let mut opnd_iter = insn.opnd_iter_mut(); while let Some(opnd) = opnd_iter.next() { + if let Opnd::Stack { .. } = opnd { + *opnd = asm.lower_stack_opnd(opnd); + } unmapped_opnds.push(*opnd); - *opnd = if is_load { - iterator.map_opnd(*opnd) - } else if let Opnd::Value(value) = opnd { - // Since mov(mem64, imm32) sign extends, as_i64() makes sure - // we split when the extended value is different. - if !value.special_const_p() || imm_num_bits(value.as_i64()) > 32 { - asm.load(iterator.map_opnd(*opnd)) - } else { - Opnd::UImm(value.as_u64()) + *opnd = match opnd { + Opnd::Value(value) if !is_load => { + // Since mov(mem64, imm32) sign extends, as_i64() makes sure + // we split when the extended value is different. + if !value.special_const_p() || imm_num_bits(value.as_i64()) > 32 { + asm.load(iterator.map_opnd(*opnd)) + } else { + Opnd::UImm(value.as_u64()) + } } - } else { - iterator.map_opnd(*opnd) - } + _ => iterator.map_opnd(*opnd), + }; } // We are replacing instructions here so we know they are already @@ -163,40 +168,86 @@ impl Assembler match &mut insn { Insn::Add { left, right, out } | Insn::Sub { left, right, out } | + Insn::Mul { left, right, out } | Insn::And { left, right, out } | Insn::Or { left, right, out } | Insn::Xor { left, right, out } => { - match (unmapped_opnds[0], unmapped_opnds[1]) { - (Opnd::Mem(_), Opnd::Mem(_)) => { - *left = asm.load(*left); - *right = asm.load(*right); - }, - (Opnd::Mem(_), Opnd::UImm(_) | Opnd::Imm(_)) => { - *left = asm.load(*left); - }, - // Instruction output whose live range spans beyond this instruction - (Opnd::InsnOut { idx, .. }, _) => { - if live_ranges[idx] > index { - *left = asm.load(*left); + match (&left, &right, iterator.peek()) { + // Merge this insn, e.g. 
`add REG, right -> out`, and `mov REG, out` if possible + (Opnd::Reg(_), Opnd::UImm(value), Some(Insn::Mov { dest, src })) + if out == src && left == dest && live_ranges[index] == index + 1 && uimm_num_bits(*value) <= 32 => { + *out = *dest; + asm.push_insn(insn); + iterator.map_insn_index(&mut asm); + iterator.next_unmapped(); // Pop merged Insn::Mov + } + (Opnd::Reg(_), Opnd::Reg(_), Some(Insn::Mov { dest, src })) + if out == src && live_ranges[index] == index + 1 && { + // We want to do `dest == left`, but `left` has already gone + // through lower_stack_opnd() while `dest` has not. So we + // lower `dest` before comparing. + let lowered_dest = if let Opnd::Stack { .. } = dest { + asm.lower_stack_opnd(dest) + } else { + *dest + }; + lowered_dest == *left + } => { + *out = *dest; + asm.push_insn(insn); + iterator.map_insn_index(&mut asm); + iterator.next_unmapped(); // Pop merged Insn::Mov + } + _ => { + match (unmapped_opnds[0], unmapped_opnds[1]) { + (Opnd::Mem(_), Opnd::Mem(_)) => { + *left = asm.load(*left); + *right = asm.load(*right); + }, + (Opnd::Mem(_), Opnd::UImm(_) | Opnd::Imm(_)) => { + *left = asm.load(*left); + }, + // Instruction output whose live range spans beyond this instruction + (Opnd::InsnOut { idx, .. }, _) => { + if live_ranges[idx] > index { + *left = asm.load(*left); + } + }, + // We have to load memory operands to avoid corrupting them + (Opnd::Mem(_) | Opnd::Reg(_), _) => { + *left = asm.load(*left); + }, + _ => {} + }; + + *out = asm.next_opnd_out(Opnd::match_num_bits(&[*left, *right])); + asm.push_insn(insn); + } + } + }, + Insn::Cmp { left, right } => { + // Replace `cmp REG, 0` (4 bytes) with `test REG, REG` (3 bytes) + // when next IR is `je`, `jne`, `csel_e`, or `csel_ne` + match (&left, &right, iterator.peek()) { + (Opnd::InsnOut { .. }, + Opnd::UImm(0) | Opnd::Imm(0), + Some(Insn::Je(_) | Insn::Jne(_) | Insn::CSelE { .. } | Insn::CSelNE { .. })) => { + asm.push_insn(Insn::Test { left: *left, right: *left }); + } + _ => { + if let (Opnd::Mem(_), Opnd::Mem(_)) = (&left, &right) { + let loaded = asm.load(*right); + *right = loaded; } - }, - // We have to load memory operands to avoid corrupting them - (Opnd::Mem(_) | Opnd::Reg(_), _) => { - *left = asm.load(*left); - }, - _ => {} - }; - - *out = asm.next_opnd_out(Opnd::match_num_bits(&[*left, *right])); - asm.push_insn(insn); + asm.push_insn(insn); + } + } }, - Insn::Cmp { left, right } | Insn::Test { left, right } => { if let (Opnd::Mem(_), Opnd::Mem(_)) = (&left, &right) { let loaded = asm.load(*right); *right = loaded; } - asm.push_insn(insn); }, // These instructions modify their input operand in-place, so we @@ -237,7 +288,11 @@ impl Assembler *truthy = asm.load(*truthy); } }, - Opnd::UImm(_) | Opnd::Imm(_) | Opnd::Value(_) => { + Opnd::UImm(_) | Opnd::Imm(_) => { + *truthy = asm.load(*truthy); + }, + // Opnd::Value could have already been split + Opnd::Value(_) if !matches!(truthy, Opnd::InsnOut { .. }) => { *truthy = asm.load(*truthy); }, _ => {} @@ -253,26 +308,31 @@ impl Assembler *out = asm.next_opnd_out(Opnd::match_num_bits(&[*truthy, *falsy])); asm.push_insn(insn); }, - Insn::Mov { dest, src } => { + Insn::Mov { dest, src } | Insn::Store { dest, src } => { match (&dest, &src) { (Opnd::Mem(_), Opnd::Mem(_)) => { // We load opnd1 because for mov, opnd0 is the output let opnd1 = asm.load(*src); asm.mov(*dest, opnd1); }, - (Opnd::Mem(_), Opnd::UImm(value)) => { - // 32-bit values will be sign-extended - if imm_num_bits(*value as i64) > 32 { + (Opnd::Mem(Mem { num_bits, .. 
}), Opnd::UImm(value)) => { + // For 64 bit destinations, 32-bit values will be sign-extended + if *num_bits == 64 && imm_num_bits(*value as i64) > 32 { let opnd1 = asm.load(*src); asm.mov(*dest, opnd1); } else { asm.mov(*dest, *src); } }, - (Opnd::Mem(_), Opnd::Imm(value)) => { - if imm_num_bits(*value) > 32 { + (Opnd::Mem(Mem { num_bits, .. }), Opnd::Imm(value)) => { + // For 64 bit destinations, 32-bit values will be sign-extended + if *num_bits == 64 && imm_num_bits(*value) > 32 { let opnd1 = asm.load(*src); asm.mov(*dest, opnd1); + } else if uimm_num_bits(*value as u64) <= *num_bits { + // If the bit string is short enough for the destination, use the unsigned representation. + // Note that 64-bit and negative values are ruled out. + asm.mov(*dest, Opnd::UImm(*value as u64)); } else { asm.mov(*dest, *src); } @@ -310,13 +370,25 @@ impl Assembler // Load each operand into the corresponding argument // register. for (idx, opnd) in opnds.into_iter().enumerate() { - asm.load_into(C_ARG_OPNDS[idx], *opnd); + asm.load_into(Opnd::c_arg(C_ARG_OPNDS[idx]), *opnd); } // Now we push the CCall without any arguments so that it // just performs the call. asm.ccall(*fptr, vec![]); }, + Insn::Lea { .. } => { + // Merge `lea` and `mov` into a single `lea` when possible + match (&insn, iterator.peek()) { + (Insn::Lea { opnd, out }, Some(Insn::Mov { dest: Opnd::Reg(reg), src })) + if matches!(out, Opnd::InsnOut { .. }) && out == src && live_ranges[index] == index + 1 => { + asm.push_insn(Insn::Lea { opnd: *opnd, out: Opnd::Reg(*reg) }); + iterator.map_insn_index(&mut asm); + iterator.next_unmapped(); // Pop merged Insn::Mov + } + _ => asm.push_insn(insn), + } + }, _ => { if insn.out_opnd().is_some() { let out_num_bits = Opnd::match_num_bits_iter(insn.opnd_iter()); @@ -335,7 +407,7 @@ impl Assembler } /// Emit platform-specific machine code - pub fn x86_emit(&mut self, cb: &mut CodeBlock) -> Vec<u32> + pub fn x86_emit(&mut self, cb: &mut CodeBlock, ocb: &mut Option<&mut OutlinedCb>) -> Option<Vec<u32>> { /// For some instructions, we want to be able to lower a 64-bit operand /// without requiring more registers to be available in the register @@ -365,12 +437,45 @@ impl Assembler } } + /// Compile a side exit if Target::SideExit is given. 
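+        /// Returns None if the side exit cannot be generated, so the caller can abort code emission.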
+ fn compile_side_exit( + target: Target, + asm: &mut Assembler, + ocb: &mut Option<&mut OutlinedCb>, + ) -> Option<Target> { + if let Target::SideExit { counter, context } = target { + let side_exit = asm.get_side_exit(&context.unwrap(), Some(counter), ocb.as_mut().unwrap()); + Some(Target::SideExitPtr(side_exit?)) + } else { + Some(target) + } + } + + fn emit_csel( + cb: &mut CodeBlock, + truthy: Opnd, + falsy: Opnd, + out: Opnd, + cmov_fn: fn(&mut CodeBlock, X86Opnd, X86Opnd), + cmov_neg: fn(&mut CodeBlock, X86Opnd, X86Opnd)){ + + // Assert that output is a register + out.unwrap_reg(); + + // If the truthy value is a memory operand + if let Opnd::Mem(_) = truthy { + if out != falsy { + mov(cb, out.into(), falsy.into()); + } + + cmov_fn(cb, out.into(), truthy.into()); + } else { + if out != truthy { + mov(cb, out.into(), truthy.into()); + } - fn emit_csel(cb: &mut CodeBlock, truthy: Opnd, falsy: Opnd, out: Opnd, cmov_fn: fn(&mut CodeBlock, X86Opnd, X86Opnd)) { - if out != truthy { - mov(cb, out.into(), truthy.into()); + cmov_neg(cb, out.into(), falsy.into()); } - cmov_fn(cb, out.into(), falsy.into()); } //dbg!(&self.insns); @@ -378,10 +483,13 @@ impl Assembler // List of GC offsets let mut gc_offsets: Vec<u32> = Vec::new(); + // Buffered list of PosMarker callbacks to fire if codegen is successful + let mut pos_markers: Vec<(usize, CodePtr)> = vec![]; + // For each instruction let start_write_pos = cb.get_write_pos(); - let mut insns_idx: usize = 0; - while let Some(insn) = self.insns.get(insns_idx) { + let mut insn_idx: usize = 0; + while let Some(insn) = self.insns.get(insn_idx) { let src_ptr = cb.get_write_ptr(); let had_dropped_bytes = cb.has_dropped_bytes(); let old_label_state = cb.get_label_state(); @@ -389,9 +497,7 @@ impl Assembler match insn { Insn::Comment(text) => { - if cfg!(feature = "disasm") { - cb.add_comment(text); - } + cb.add_comment(text); }, // Write the label at the current position @@ -400,8 +506,8 @@ impl Assembler }, // Report back the current position in the generated code - Insn::PosMarker(pos_marker) => { - pos_marker(cb.get_write_ptr()); + Insn::PosMarker(..) => { + pos_markers.push((insn_idx, cb.get_write_ptr())); }, Insn::BakeString(text) => { @@ -414,19 +520,37 @@ impl Assembler cb.write_byte(0); }, + // Set up RBP to work with frame pointer unwinding + // (e.g. with Linux `perf record --call-graph fp`) + Insn::FrameSetup => { + if get_option!(frame_pointer) { + push(cb, RBP); + mov(cb, RBP, RSP); + push(cb, RBP); + } + }, + Insn::FrameTeardown => { + if get_option!(frame_pointer) { + pop(cb, RBP); + pop(cb, RBP); + } + }, + Insn::Add { left, right, .. } => { let opnd1 = emit_64bit_immediate(cb, right); add(cb, left.into(), opnd1); }, - Insn::FrameSetup => {}, - Insn::FrameTeardown => {}, - Insn::Sub { left, right, .. } => { let opnd1 = emit_64bit_immediate(cb, right); sub(cb, left.into(), opnd1); }, + Insn::Mul { left, right, .. } => { + let opnd1 = emit_64bit_immediate(cb, right); + imul(cb, left.into(), opnd1); + }, + Insn::And { left, right, .. 
} => { let opnd1 = emit_64bit_immediate(cb, right); and(cb, left.into(), opnd1); @@ -490,16 +614,23 @@ impl Assembler lea(cb, out.into(), opnd.into()); }, - // Load relative address - Insn::LeaLabel { target, out } => { - let label_idx = target.unwrap_label_idx(); - - cb.label_ref(label_idx, 7, |cb, src_addr, dst_addr| { - let disp = dst_addr - src_addr; - lea(cb, Self::SCRATCH0, mem_opnd(8, RIP, disp.try_into().unwrap())); - }); + // Load address of jump target + Insn::LeaJumpTarget { target, out } => { + if let Target::Label(label_idx) = target { + // Set output to the raw address of the label + cb.label_ref(*label_idx, 7, |cb, src_addr, dst_addr| { + let disp = dst_addr - src_addr; + lea(cb, Self::SCRATCH0, mem_opnd(8, RIP, disp.try_into().unwrap())); + }); - mov(cb, out.into(), Self::SCRATCH0); + mov(cb, out.into(), Self::SCRATCH0); + } else { + // Set output to the jump target's raw address + let target_code = target.unwrap_code_ptr(); + let target_addr = target_code.raw_addr(cb).as_u64(); + // Constant encoded length important for patching + movabs(cb, out.into(), target_addr); + } }, // Push and pop to/from the C stack @@ -580,61 +711,96 @@ impl Assembler // Conditional jump to a label Insn::Jmp(target) => { - match *target { + match compile_side_exit(*target, self, ocb)? { Target::CodePtr(code_ptr) | Target::SideExitPtr(code_ptr) => jmp_ptr(cb, code_ptr), Target::Label(label_idx) => jmp_label(cb, label_idx), + Target::SideExit { .. } => unreachable!("Target::SideExit should have been compiled by compile_side_exit"), } } Insn::Je(target) => { - match *target { + match compile_side_exit(*target, self, ocb)? { Target::CodePtr(code_ptr) | Target::SideExitPtr(code_ptr) => je_ptr(cb, code_ptr), Target::Label(label_idx) => je_label(cb, label_idx), + Target::SideExit { .. } => unreachable!("Target::SideExit should have been compiled by compile_side_exit"), } } Insn::Jne(target) => { - match *target { + match compile_side_exit(*target, self, ocb)? { Target::CodePtr(code_ptr) | Target::SideExitPtr(code_ptr) => jne_ptr(cb, code_ptr), Target::Label(label_idx) => jne_label(cb, label_idx), + Target::SideExit { .. } => unreachable!("Target::SideExit should have been compiled by compile_side_exit"), } } Insn::Jl(target) => { - match *target { + match compile_side_exit(*target, self, ocb)? { Target::CodePtr(code_ptr) | Target::SideExitPtr(code_ptr) => jl_ptr(cb, code_ptr), Target::Label(label_idx) => jl_label(cb, label_idx), + Target::SideExit { .. } => unreachable!("Target::SideExit should have been compiled by compile_side_exit"), + } + }, + + Insn::Jg(target) => { + match compile_side_exit(*target, self, ocb)? { + Target::CodePtr(code_ptr) | Target::SideExitPtr(code_ptr) => jg_ptr(cb, code_ptr), + Target::Label(label_idx) => jg_label(cb, label_idx), + Target::SideExit { .. } => unreachable!("Target::SideExit should have been compiled by compile_side_exit"), + } + }, + + Insn::Jge(target) => { + match compile_side_exit(*target, self, ocb)? { + Target::CodePtr(code_ptr) | Target::SideExitPtr(code_ptr) => jge_ptr(cb, code_ptr), + Target::Label(label_idx) => jge_label(cb, label_idx), + Target::SideExit { .. } => unreachable!("Target::SideExit should have been compiled by compile_side_exit"), } }, Insn::Jbe(target) => { - match *target { + match compile_side_exit(*target, self, ocb)? { Target::CodePtr(code_ptr) | Target::SideExitPtr(code_ptr) => jbe_ptr(cb, code_ptr), Target::Label(label_idx) => jbe_label(cb, label_idx), + Target::SideExit { .. 
} => unreachable!("Target::SideExit should have been compiled by compile_side_exit"), + } + }, + + Insn::Jb(target) => { + match compile_side_exit(*target, self, ocb)? { + Target::CodePtr(code_ptr) | Target::SideExitPtr(code_ptr) => jb_ptr(cb, code_ptr), + Target::Label(label_idx) => jb_label(cb, label_idx), + Target::SideExit { .. } => unreachable!("Target::SideExit should have been compiled by compile_side_exit"), } }, Insn::Jz(target) => { - match *target { + match compile_side_exit(*target, self, ocb)? { Target::CodePtr(code_ptr) | Target::SideExitPtr(code_ptr) => jz_ptr(cb, code_ptr), Target::Label(label_idx) => jz_label(cb, label_idx), + Target::SideExit { .. } => unreachable!("Target::SideExit should have been compiled by compile_side_exit"), } } Insn::Jnz(target) => { - match *target { + match compile_side_exit(*target, self, ocb)? { Target::CodePtr(code_ptr) | Target::SideExitPtr(code_ptr) => jnz_ptr(cb, code_ptr), Target::Label(label_idx) => jnz_label(cb, label_idx), + Target::SideExit { .. } => unreachable!("Target::SideExit should have been compiled by compile_side_exit"), } } - Insn::Jo(target) => { - match *target { + Insn::Jo(target) | + Insn::JoMul(target) => { + match compile_side_exit(*target, self, ocb)? { Target::CodePtr(code_ptr) | Target::SideExitPtr(code_ptr) => jo_ptr(cb, code_ptr), Target::Label(label_idx) => jo_label(cb, label_idx), + Target::SideExit { .. } => unreachable!("Target::SideExit should have been compiled by compile_side_exit"), } } + Insn::Joz(..) | Insn::Jonz(..) => unreachable!("Joz/Jonz should be unused for now"), + // Atomically increment a counter at a given memory location Insn::IncrCounter { mem, value } => { assert!(matches!(mem, Opnd::Mem(_))); @@ -646,43 +812,36 @@ impl Assembler Insn::Breakpoint => int3(cb), Insn::CSelZ { truthy, falsy, out } => { - emit_csel(cb, *truthy, *falsy, *out, cmovnz); + emit_csel(cb, *truthy, *falsy, *out, cmovz, cmovnz); }, Insn::CSelNZ { truthy, falsy, out } => { - emit_csel(cb, *truthy, *falsy, *out, cmovz); + emit_csel(cb, *truthy, *falsy, *out, cmovnz, cmovz); }, Insn::CSelE { truthy, falsy, out } => { - emit_csel(cb, *truthy, *falsy, *out, cmovne); + emit_csel(cb, *truthy, *falsy, *out, cmove, cmovne); }, Insn::CSelNE { truthy, falsy, out } => { - emit_csel(cb, *truthy, *falsy, *out, cmove); + emit_csel(cb, *truthy, *falsy, *out, cmovne, cmove); }, Insn::CSelL { truthy, falsy, out } => { - emit_csel(cb, *truthy, *falsy, *out, cmovge); + emit_csel(cb, *truthy, *falsy, *out, cmovl, cmovge); }, Insn::CSelLE { truthy, falsy, out } => { - emit_csel(cb, *truthy, *falsy, *out, cmovg); + emit_csel(cb, *truthy, *falsy, *out, cmovle, cmovg); }, Insn::CSelG { truthy, falsy, out } => { - emit_csel(cb, *truthy, *falsy, *out, cmovle); + emit_csel(cb, *truthy, *falsy, *out, cmovg, cmovle); }, Insn::CSelGE { truthy, falsy, out } => { - emit_csel(cb, *truthy, *falsy, *out, cmovl); + emit_csel(cb, *truthy, *falsy, *out, cmovge, cmovl); } Insn::LiveReg { .. } => (), // just a reg alloc signal, no code Insn::PadInvalPatch => { let code_size = cb.get_write_pos().saturating_sub(std::cmp::max(start_write_pos, cb.page_start_pos())); - if code_size < JMP_PTR_BYTES { - nop(cb, (JMP_PTR_BYTES - code_size) as u32); + if code_size < cb.jmp_ptr_bytes() { + nop(cb, (cb.jmp_ptr_bytes() - code_size) as u32); } } - - // We want to keep the panic here because some instructions that - // we feed to the backend could get lowered into other - // instructions. 
So it's possible that some of our backend - // instructions can never make it to the emit stage. - #[allow(unreachable_patterns)] - _ => panic!("unsupported instruction passed to x86 backend: {:?}", insn) }; // On failure, jump to the next page and retry the current insn @@ -690,18 +849,32 @@ impl Assembler // Reset cb states before retrying the current Insn cb.set_label_state(old_label_state); } else { - insns_idx += 1; + insn_idx += 1; gc_offsets.append(&mut insn_gc_offsets); } } - gc_offsets + // Error if we couldn't write out everything + if cb.has_dropped_bytes() { + return None + } else { + // No bytes dropped, so the pos markers point to valid code + for (insn_idx, pos) in pos_markers { + if let Insn::PosMarker(callback) = self.insns.get(insn_idx).unwrap() { + callback(pos, &cb); + } else { + panic!("non-PosMarker in pos_markers insn_idx={insn_idx} {self:?}"); + } + } + + return Some(gc_offsets) + } } /// Optimize and compile the stored instructions - pub fn compile_with_regs(self, cb: &mut CodeBlock, regs: Vec<Reg>) -> Vec<u32> - { - let mut asm = self.x86_split().alloc_regs(regs); + pub fn compile_with_regs(self, cb: &mut CodeBlock, ocb: Option<&mut OutlinedCb>, regs: Vec<Reg>) -> Option<(CodePtr, Vec<u32>)> { + let asm = self.x86_split(); + let mut asm = asm.alloc_regs(regs); // Create label instances in the code block for (idx, name) in asm.label_names.iter().enumerate() { @@ -709,24 +882,32 @@ impl Assembler assert!(label_idx == idx); } - let gc_offsets = asm.x86_emit(cb); + let mut ocb = ocb; // for &mut + let start_ptr = cb.get_write_ptr(); + let gc_offsets = asm.x86_emit(cb, &mut ocb); - if cb.has_dropped_bytes() { - cb.clear_labels(); - } else { + if let (Some(gc_offsets), false) = (gc_offsets, cb.has_dropped_bytes()) { cb.link_labels(); - } - gc_offsets + Some((start_ptr, gc_offsets)) + } else { + cb.clear_labels(); + + None + } } } #[cfg(test)] mod tests { + use crate::disasm::assert_disasm; + #[cfg(feature = "disasm")] + use crate::disasm::{unindent, disasm_addr_range}; + use super::*; fn setup_asm() -> (Assembler, CodeBlock) { - (Assembler::new(), CodeBlock::new_dummy(1024)) + (Assembler::new(0), CodeBlock::new_dummy(1024)) } #[test] @@ -892,4 +1073,268 @@ mod tests { assert_eq!(format!("{:x}", cb), "4889c049bbffffffffffff00004c31d8"); } + + #[test] + fn test_merge_lea_reg() { + let (mut asm, mut cb) = setup_asm(); + + let sp = asm.lea(Opnd::mem(64, SP, 8)); + asm.mov(SP, sp); // should be merged to lea + asm.compile_with_num_regs(&mut cb, 1); + + assert_disasm!(cb, "488d5b08", {" + 0x0: lea rbx, [rbx + 8] + "}); + } + + #[test] + fn test_merge_lea_mem() { + let (mut asm, mut cb) = setup_asm(); + + let sp = asm.lea(Opnd::mem(64, SP, 8)); + asm.mov(Opnd::mem(64, SP, 0), sp); // should NOT be merged to lea + asm.compile_with_num_regs(&mut cb, 1); + + assert_disasm!(cb, "488d4308488903", {" + 0x0: lea rax, [rbx + 8] + 0x4: mov qword ptr [rbx], rax + "}); + } + + #[test] + fn test_replace_cmp_0() { + let (mut asm, mut cb) = setup_asm(); + + let val = asm.load(Opnd::mem(64, SP, 8)); + asm.cmp(val, 0.into()); + let result = asm.csel_e(Qtrue.into(), Qfalse.into()); + asm.mov(Opnd::Reg(RAX_REG), result); + asm.compile_with_num_regs(&mut cb, 2); + + assert_eq!(format!("{:x}", cb), "488b43084885c0b814000000b900000000480f45c14889c0"); + } + + #[test] + fn test_merge_add_mov() { + let (mut asm, mut cb) = setup_asm(); + + let sp = asm.add(CFP, Opnd::UImm(0x40)); + asm.mov(CFP, sp); // should be merged to add + asm.compile_with_num_regs(&mut cb, 1); + + assert_eq!(format!("{:x}", 
cb), "4983c540"); + } + + #[test] + fn test_merge_sub_mov() { + let (mut asm, mut cb) = setup_asm(); + + let sp = asm.sub(CFP, Opnd::UImm(0x40)); + asm.mov(CFP, sp); // should be merged to add + asm.compile_with_num_regs(&mut cb, 1); + + assert_eq!(format!("{:x}", cb), "4983ed40"); + } + + #[test] + fn test_merge_and_mov() { + let (mut asm, mut cb) = setup_asm(); + + let sp = asm.and(CFP, Opnd::UImm(0x40)); + asm.mov(CFP, sp); // should be merged to add + asm.compile_with_num_regs(&mut cb, 1); + + assert_eq!(format!("{:x}", cb), "4983e540"); + } + + #[test] + fn test_merge_or_mov() { + let (mut asm, mut cb) = setup_asm(); + + let sp = asm.or(CFP, Opnd::UImm(0x40)); + asm.mov(CFP, sp); // should be merged to add + asm.compile_with_num_regs(&mut cb, 1); + + assert_eq!(format!("{:x}", cb), "4983cd40"); + } + + #[test] + fn test_merge_xor_mov() { + let (mut asm, mut cb) = setup_asm(); + + let sp = asm.xor(CFP, Opnd::UImm(0x40)); + asm.mov(CFP, sp); // should be merged to add + asm.compile_with_num_regs(&mut cb, 1); + + assert_eq!(format!("{:x}", cb), "4983f540"); + } + + #[test] + fn test_reorder_c_args_no_cycle() { + let (mut asm, mut cb) = setup_asm(); + + asm.ccall(0 as _, vec![ + C_ARG_OPNDS[0], // mov rdi, rdi (optimized away) + C_ARG_OPNDS[1], // mov rsi, rsi (optimized away) + ]); + asm.compile_with_num_regs(&mut cb, 0); + + assert_disasm!(cb, "b800000000ffd0", {" + 0x0: mov eax, 0 + 0x5: call rax + "}); + } + + #[test] + fn test_reorder_c_args_single_cycle() { + let (mut asm, mut cb) = setup_asm(); + + // rdi and rsi form a cycle + asm.ccall(0 as _, vec![ + C_ARG_OPNDS[1], // mov rdi, rsi + C_ARG_OPNDS[0], // mov rsi, rdi + C_ARG_OPNDS[2], // mov rdx, rdx (optimized away) + ]); + asm.compile_with_num_regs(&mut cb, 0); + + assert_disasm!(cb, "4989f34889fe4c89dfb800000000ffd0", {" + 0x0: mov r11, rsi + 0x3: mov rsi, rdi + 0x6: mov rdi, r11 + 0x9: mov eax, 0 + 0xe: call rax + "}); + } + + #[test] + fn test_reorder_c_args_two_cycles() { + let (mut asm, mut cb) = setup_asm(); + + // rdi and rsi form a cycle, and rdx and rcx form another cycle + asm.ccall(0 as _, vec![ + C_ARG_OPNDS[1], // mov rdi, rsi + C_ARG_OPNDS[0], // mov rsi, rdi + C_ARG_OPNDS[3], // mov rdx, rcx + C_ARG_OPNDS[2], // mov rcx, rdx + ]); + asm.compile_with_num_regs(&mut cb, 0); + + assert_disasm!(cb, "4989f34889fe4c89df4989cb4889d14c89dab800000000ffd0", {" + 0x0: mov r11, rsi + 0x3: mov rsi, rdi + 0x6: mov rdi, r11 + 0x9: mov r11, rcx + 0xc: mov rcx, rdx + 0xf: mov rdx, r11 + 0x12: mov eax, 0 + 0x17: call rax + "}); + } + + #[test] + fn test_reorder_c_args_large_cycle() { + let (mut asm, mut cb) = setup_asm(); + + // rdi, rsi, and rdx form a cycle + asm.ccall(0 as _, vec![ + C_ARG_OPNDS[1], // mov rdi, rsi + C_ARG_OPNDS[2], // mov rsi, rdx + C_ARG_OPNDS[0], // mov rdx, rdi + ]); + asm.compile_with_num_regs(&mut cb, 0); + + assert_disasm!(cb, "4989f34889d64889fa4c89dfb800000000ffd0", {" + 0x0: mov r11, rsi + 0x3: mov rsi, rdx + 0x6: mov rdx, rdi + 0x9: mov rdi, r11 + 0xc: mov eax, 0 + 0x11: call rax + "}); + } + + #[test] + fn test_reorder_c_args_with_insn_out() { + let (mut asm, mut cb) = setup_asm(); + + let rax = asm.load(Opnd::UImm(1)); + let rcx = asm.load(Opnd::UImm(2)); + let rdx = asm.load(Opnd::UImm(3)); + // rcx and rdx form a cycle + asm.ccall(0 as _, vec![ + rax, // mov rdi, rax + rcx, // mov rsi, rcx + rcx, // mov rdx, rcx + rdx, // mov rcx, rdx + ]); + asm.compile_with_num_regs(&mut cb, 3); + + assert_disasm!(cb, "b801000000b902000000ba030000004889c74889ce4989cb4889d14c89dab800000000ffd0", {" + 0x0: mov eax, 
1 + 0x5: mov ecx, 2 + 0xa: mov edx, 3 + 0xf: mov rdi, rax + 0x12: mov rsi, rcx + 0x15: mov r11, rcx + 0x18: mov rcx, rdx + 0x1b: mov rdx, r11 + 0x1e: mov eax, 0 + 0x23: call rax + "}); + } + + #[test] + fn test_cmov_mem() { + let (mut asm, mut cb) = setup_asm(); + + let top = Opnd::mem(64, SP, 0); + let ary_opnd = SP; + let array_len_opnd = Opnd::mem(64, SP, 16); + + asm.cmp(array_len_opnd, 1.into()); + let elem_opnd = asm.csel_g(Opnd::mem(64, ary_opnd, 0), Qnil.into()); + asm.mov(top, elem_opnd); + + asm.compile_with_num_regs(&mut cb, 1); + + assert_disasm!(cb, "48837b1001b804000000480f4f03488903", {" + 0x0: cmp qword ptr [rbx + 0x10], 1 + 0x5: mov eax, 4 + 0xa: cmovg rax, qword ptr [rbx] + 0xe: mov qword ptr [rbx], rax + "}); + } + + #[test] + fn test_csel_split() { + let (mut asm, mut cb) = setup_asm(); + + let stack_top = Opnd::mem(64, SP, 0); + let elem_opnd = asm.csel_ne(VALUE(0x7f22c88d1930).into(), Qnil.into()); + asm.mov(stack_top, elem_opnd); + + asm.compile_with_num_regs(&mut cb, 3); + + assert_disasm!(cb, "48b830198dc8227f0000b904000000480f44c1488903", {" + 0x0: movabs rax, 0x7f22c88d1930 + 0xa: mov ecx, 4 + 0xf: cmove rax, rcx + 0x13: mov qword ptr [rbx], rax + "}); + } + + #[test] + fn test_mov_m32_imm32() { + let (mut asm, mut cb) = setup_asm(); + + let shape_opnd = Opnd::mem(32, C_RET_OPND, 0); + asm.mov(shape_opnd, Opnd::UImm(0x8000_0001)); + asm.mov(shape_opnd, Opnd::Imm(0x8000_0001)); + + asm.compile_with_num_regs(&mut cb, 0); + assert_disasm!(cb, "c70001000080c70001000080", {" + 0x0: mov dword ptr [rax], 0x80000001 + 0x6: mov dword ptr [rax], 0x80000001 + "}); + } } diff --git a/yjit/src/codegen.rs b/yjit/src/codegen.rs index 5b8b1a1ff9..0fbca85716 100644 --- a/yjit/src/codegen.rs +++ b/yjit/src/codegen.rs @@ -3,6 +3,7 @@ use crate::asm::*; use crate::backend::ir::*; +use crate::backend::current::TEMP_REGS; use crate::core::*; use crate::cruby::*; use crate::invariants::*; @@ -12,12 +13,17 @@ use crate::utils::*; use CodegenStatus::*; use YARVOpnd::*; +use std::cell::Cell; use std::cmp; +use std::cmp::min; use std::collections::HashMap; +use std::ffi::c_void; use std::ffi::CStr; -use std::mem::{self, size_of}; -use std::os::raw::{c_int, c_uint}; +use std::mem; +use std::os::raw::c_int; use std::ptr; +use std::rc::Rc; +use std::cell::RefCell; use std::slice; pub use crate::virtualmem::CodePtr; @@ -26,199 +32,530 @@ pub use crate::virtualmem::CodePtr; #[derive(PartialEq, Debug)] enum CodegenStatus { KeepCompiling, - CantCompile, EndBlock, } /// Code generation function signature type InsnGenFn = fn( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus; +) -> Option<CodegenStatus>; + +/// Ephemeral code generation state. +/// Represents a [crate::core::Block] while we build it. 
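+/// The lifetime parameter ties this state to the OutlinedCb it borrows for stubs and exits.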
+pub struct JITState<'a> { + /// Instruction sequence for the compiling block + pub iseq: IseqPtr, + + /// The iseq index of the first instruction in the block + starting_insn_idx: IseqIdx, -/// Code generation state -/// This struct only lives while code is being generated -pub struct JITState { - // Block version being compiled - block: BlockRef, + /// The [Context] entering into the first instruction of the block + starting_ctx: Context, - // Instruction sequence this is associated with - iseq: IseqPtr, + /// The placement for the machine code of the [Block] + output_ptr: CodePtr, - // Index of the current instruction being compiled - insn_idx: u32, + /// Index of the current instruction being compiled + insn_idx: IseqIdx, - // Opcode for the instruction being compiled + /// Opcode for the instruction being compiled opcode: usize, - // PC of the instruction being compiled + /// PC of the instruction being compiled pc: *mut VALUE, - // Side exit to the instruction being compiled. See :side-exit:. - side_exit_for_pc: Option<CodePtr>, + /// stack_size when it started to compile the current instruction. + stack_size_for_pc: u8, + + /// Execution context when compilation started + /// This allows us to peek at run-time values + ec: EcPtr, + + /// The code block used for stubs, exits, and other code that are + /// not on the hot path. + outlined_code_block: &'a mut OutlinedCb, + + /// The outgoing branches the block will have + pub pending_outgoing: Vec<PendingBranchRef>, + + // --- Fields for block invalidation and invariants tracking below: + // Public mostly so into_block defined in the sibling module core + // can partially move out of Self. + + /// Whether we need to record the code address at + /// the end of this bytecode instruction for global invalidation + pub record_boundary_patch_point: bool, + + /// Code for immediately exiting upon entry to the block. + /// Required for invalidation. + pub block_entry_exit: Option<CodePtr>, + + /// A list of callable method entries that must be valid for the block to be valid. + pub method_lookup_assumptions: Vec<CmePtr>, - // Execution context when compilation started - // This allows us to peek at run-time values - ec: Option<EcPtr>, + /// A list of basic operators that not be redefined for the block to be valid. + pub bop_assumptions: Vec<(RedefinitionFlag, ruby_basic_operators)>, - // Whether we need to record the code address at - // the end of this bytecode instruction for global invalidation - record_boundary_patch_point: bool, + /// A list of constant expression path segments that must have + /// not been written to for the block to be valid. + pub stable_constant_names_assumption: Option<*const ID>, + + /// A list of classes that are not supposed to have a singleton class. + pub no_singleton_class_assumptions: Vec<VALUE>, + + /// When true, the block is valid only when base pointer is equal to environment pointer. + pub no_ep_escape: bool, + + /// When true, the block is valid only when there is a total of one ractor running + pub block_assumes_single_ractor: bool, + + /// Address range for Linux perf's [JIT interface](https://github.com/torvalds/linux/blob/master/tools/perf/Documentation/jit-interface.txt) + perf_map: Rc::<RefCell::<Vec<(CodePtr, Option<CodePtr>, String)>>>, + + /// Stack of symbol names for --yjit-perf + perf_stack: Vec<String>, + + /// When true, this block is the first block compiled by gen_block_series(). + first_block: bool, + + /// A killswitch for bailing out of compilation. 
Used in rare situations where we need to fail + /// compilation deep in the stack (e.g. codegen failed for some jump target, but not due to + /// OOM). Because these situations are so rare it's not worth it to check and propogate at each + /// site. Instead, we check this once at the end. + block_abandoned: bool, } -impl JITState { - pub fn new(blockref: &BlockRef) -> Self { +impl<'a> JITState<'a> { + pub fn new(blockid: BlockId, starting_ctx: Context, output_ptr: CodePtr, ec: EcPtr, ocb: &'a mut OutlinedCb, first_block: bool) -> Self { JITState { - block: blockref.clone(), - iseq: ptr::null(), // TODO: initialize this from the blockid + iseq: blockid.iseq, + starting_insn_idx: blockid.idx, + starting_ctx, + output_ptr, insn_idx: 0, opcode: 0, pc: ptr::null_mut::<VALUE>(), - side_exit_for_pc: None, - ec: None, + stack_size_for_pc: starting_ctx.get_stack_size(), + pending_outgoing: vec![], + ec, + outlined_code_block: ocb, record_boundary_patch_point: false, + block_entry_exit: None, + method_lookup_assumptions: vec![], + bop_assumptions: vec![], + stable_constant_names_assumption: None, + no_singleton_class_assumptions: vec![], + no_ep_escape: false, + block_assumes_single_ractor: false, + perf_map: Rc::default(), + perf_stack: vec![], + first_block, + block_abandoned: false, } } - pub fn get_block(&self) -> BlockRef { - self.block.clone() - } - - pub fn get_insn_idx(&self) -> u32 { + pub fn get_insn_idx(&self) -> IseqIdx { self.insn_idx } - pub fn get_iseq(self: &JITState) -> IseqPtr { + pub fn get_iseq(&self) -> IseqPtr { self.iseq } - pub fn get_opcode(self: &JITState) -> usize { + pub fn get_opcode(&self) -> usize { self.opcode } - pub fn get_pc(self: &JITState) -> *mut VALUE { + pub fn get_pc(&self) -> *mut VALUE { self.pc } -} -use crate::codegen::JCCKinds::*; + pub fn get_starting_insn_idx(&self) -> IseqIdx { + self.starting_insn_idx + } -#[allow(non_camel_case_types, unused)] -pub enum JCCKinds { - JCC_JNE, - JCC_JNZ, - JCC_JZ, - JCC_JE, - JCC_JBE, - JCC_JNA, -} + pub fn get_block_entry_exit(&self) -> Option<CodePtr> { + self.block_entry_exit + } -pub fn jit_get_arg(jit: &JITState, arg_idx: isize) -> VALUE { - // insn_len require non-test config - #[cfg(not(test))] - assert!(insn_len(jit.get_opcode()) > (arg_idx + 1).try_into().unwrap()); - unsafe { *(jit.pc.offset(arg_idx + 1)) } -} + pub fn get_starting_ctx(&self) -> Context { + self.starting_ctx + } -// Get the index of the next instruction -fn jit_next_insn_idx(jit: &JITState) -> u32 { - jit.insn_idx + insn_len(jit.get_opcode()) -} + pub fn get_arg(&self, arg_idx: isize) -> VALUE { + // insn_len require non-test config + #[cfg(not(test))] + assert!(insn_len(self.get_opcode()) > (arg_idx + 1).try_into().unwrap()); + unsafe { *(self.pc.offset(arg_idx + 1)) } + } -// Check if we are compiling the instruction at the stub PC -// Meaning we are compiling the instruction that is next to execute -fn jit_at_current_insn(jit: &JITState) -> bool { - let ec_pc: *mut VALUE = unsafe { get_cfp_pc(get_ec_cfp(jit.ec.unwrap())) }; - ec_pc == jit.pc -} + /// Get [Self::outlined_code_block] + pub fn get_ocb(&mut self) -> &mut OutlinedCb { + self.outlined_code_block + } + + /// Leave a code stub to re-enter the compiler at runtime when the compiling program point is + /// reached. Should always be used in tail position like `return jit.defer_compilation(asm);`. 
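+    /// Returns Some(EndBlock) so that the current block ends at this instruction.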
+ #[must_use] + fn defer_compilation(&mut self, asm: &mut Assembler) -> Option<CodegenStatus> { + if crate::core::defer_compilation(self, asm).is_err() { + // If we can't leave a stub, the block isn't usable and we have to bail. + self.block_abandoned = true; + } + Some(EndBlock) + } + + /// Generate a branch with either end possibly stubbed out + fn gen_branch( + &mut self, + asm: &mut Assembler, + target0: BlockId, + ctx0: &Context, + target1: Option<BlockId>, + ctx1: Option<&Context>, + gen_fn: BranchGenFn, + ) { + if crate::core::gen_branch(self, asm, target0, ctx0, target1, ctx1, gen_fn).is_none() { + // If we can't meet the request for a branch, the code is + // essentially corrupt and we have to discard the block. + self.block_abandoned = true; + } + } -// Peek at the nth topmost value on the Ruby stack. -// Returns the topmost value when n == 0. -fn jit_peek_at_stack(jit: &JITState, ctx: &Context, n: isize) -> VALUE { - assert!(jit_at_current_insn(jit)); - assert!(n < ctx.get_stack_size() as isize); + /// Wrapper for [self::gen_outlined_exit] with error handling. + fn gen_outlined_exit(&mut self, exit_pc: *mut VALUE, ctx: &Context) -> Option<CodePtr> { + let result = gen_outlined_exit(exit_pc, self.num_locals(), ctx, self.get_ocb()); + if result.is_none() { + // When we can't have the exits, the code is incomplete and we have to bail. + self.block_abandoned = true; + } - // Note: this does not account for ctx->sp_offset because - // this is only available when hitting a stub, and while - // hitting a stub, cfp->sp needs to be up to date in case - // codegen functions trigger GC. See :stub-sp-flush:. - return unsafe { - let sp: *mut VALUE = get_cfp_sp(get_ec_cfp(jit.ec.unwrap())); + result + } + + /// Return true if the current ISEQ could escape an environment. + /// + /// As of vm_push_frame(), EP is always equal to BP. However, after pushing + /// a frame, some ISEQ setups call vm_bind_update_env(), which redirects EP. + /// Also, some method calls escape the environment to the heap. + fn escapes_ep(&self) -> bool { + match unsafe { get_iseq_body_type(self.iseq) } { + // <main> frame is always associated to TOPLEVEL_BINDING. + ISEQ_TYPE_MAIN | + // Kernel#eval uses a heap EP when a Binding argument is not nil. + ISEQ_TYPE_EVAL => true, + // If this ISEQ has previously escaped EP, give up the optimization. + _ if iseq_escapes_ep(self.iseq) => true, + _ => false, + } + } - *(sp.offset(-1 - n)) - }; -} + // Get the index of the next instruction + fn next_insn_idx(&self) -> u16 { + self.insn_idx + insn_len(self.get_opcode()) as u16 + } -fn jit_peek_at_self(jit: &JITState) -> VALUE { - unsafe { get_cfp_self(get_ec_cfp(jit.ec.unwrap())) } -} + /// Get the index of the next instruction of the next instruction + fn next_next_insn_idx(&self) -> u16 { + let next_pc = unsafe { rb_iseq_pc_at_idx(self.iseq, self.next_insn_idx().into()) }; + let next_opcode: usize = unsafe { rb_iseq_opcode_at_pc(self.iseq, next_pc) }.try_into().unwrap(); + self.next_insn_idx() + insn_len(next_opcode) as u16 + } + + // Check if we are compiling the instruction at the stub PC with the target Context + // Meaning we are compiling the instruction that is next to execute + pub fn at_compile_target(&self) -> bool { + // If this is not the first block compiled by gen_block_series(), + // it might be compiling the same block again with a different Context. + // In that case, it should defer_compilation() and inspect the stack there. 
+ if !self.first_block { + return false; + } -fn jit_peek_at_local(jit: &JITState, n: i32) -> VALUE { - assert!(jit_at_current_insn(jit)); + let ec_pc: *mut VALUE = unsafe { get_cfp_pc(self.get_cfp()) }; + ec_pc == self.pc + } - let local_table_size: isize = unsafe { get_iseq_body_local_table_size(jit.iseq) } - .try_into() - .unwrap(); - assert!(n < local_table_size.try_into().unwrap()); + // Peek at the nth topmost value on the Ruby stack. + // Returns the topmost value when n == 0. + pub fn peek_at_stack(&self, ctx: &Context, n: isize) -> VALUE { + assert!(self.at_compile_target()); + assert!(n < ctx.get_stack_size() as isize); - unsafe { - let ep = get_cfp_ep(get_ec_cfp(jit.ec.unwrap())); - let n_isize: isize = n.try_into().unwrap(); - let offs: isize = -(VM_ENV_DATA_SIZE as isize) - local_table_size + n_isize + 1; - *ep.offset(offs) + // Note: this does not account for ctx->sp_offset because + // this is only available when hitting a stub, and while + // hitting a stub, cfp->sp needs to be up to date in case + // codegen functions trigger GC. See :stub-sp-flush:. + return unsafe { + let sp: *mut VALUE = get_cfp_sp(self.get_cfp()); + + *(sp.offset(-1 - n)) + }; } -} -fn jit_peek_at_block_handler(jit: &JITState, level: u32) -> VALUE { - assert!(jit_at_current_insn(jit)); + fn peek_at_self(&self) -> VALUE { + unsafe { get_cfp_self(self.get_cfp()) } + } - unsafe { - let ep = get_cfp_ep_level(get_ec_cfp(jit.ec.unwrap()), level); - *ep.offset(VM_ENV_DATA_INDEX_SPECVAL as isize) + fn peek_at_local(&self, n: i32) -> VALUE { + assert!(self.at_compile_target()); + + let local_table_size: isize = unsafe { get_iseq_body_local_table_size(self.iseq) } + .try_into() + .unwrap(); + assert!(n < local_table_size.try_into().unwrap()); + + unsafe { + let ep = get_cfp_ep(self.get_cfp()); + let n_isize: isize = n.try_into().unwrap(); + let offs: isize = -(VM_ENV_DATA_SIZE as isize) - local_table_size + n_isize + 1; + *ep.offset(offs) + } + } + + fn peek_at_block_handler(&self, level: u32) -> VALUE { + assert!(self.at_compile_target()); + + unsafe { + let ep = get_cfp_ep_level(self.get_cfp(), level); + *ep.offset(VM_ENV_DATA_INDEX_SPECVAL as isize) + } + } + + pub fn assume_expected_cfunc( + &mut self, + asm: &mut Assembler, + class: VALUE, + method: ID, + cfunc: *mut c_void, + ) -> bool { + let cme = unsafe { rb_callable_method_entry(class, method) }; + + if cme.is_null() { + return false; + } + + let def_type = unsafe { get_cme_def_type(cme) }; + if def_type != VM_METHOD_TYPE_CFUNC { + return false; + } + if unsafe { get_mct_func(get_cme_def_body_cfunc(cme)) } != cfunc { + return false; + } + + self.assume_method_lookup_stable(asm, cme); + + true + } + + pub fn assume_method_lookup_stable(&mut self, asm: &mut Assembler, cme: CmePtr) -> Option<()> { + jit_ensure_block_entry_exit(self, asm)?; + self.method_lookup_assumptions.push(cme); + + Some(()) + } + + /// Assume that objects of a given class will have no singleton class. + /// Return true if there has been no such singleton class since boot + /// and we can safely invalidate it. + pub fn assume_no_singleton_class(&mut self, asm: &mut Assembler, klass: VALUE) -> bool { + if jit_ensure_block_entry_exit(self, asm).is_none() { + return false; // out of space, give up + } + if has_singleton_class_of(klass) { + return false; // we've seen a singleton class. disable the optimization to avoid an invalidation loop. 
+ } + self.no_singleton_class_assumptions.push(klass); + true + } + + /// Assume that base pointer is equal to environment pointer in the current ISEQ. + /// Return true if it's safe to assume so. + fn assume_no_ep_escape(&mut self, asm: &mut Assembler) -> bool { + if jit_ensure_block_entry_exit(self, asm).is_none() { + return false; // out of space, give up + } + if self.escapes_ep() { + return false; // EP has been escaped in this ISEQ. disable the optimization to avoid an invalidation loop. + } + self.no_ep_escape = true; + true + } + + fn get_cfp(&self) -> *mut rb_control_frame_struct { + unsafe { get_ec_cfp(self.ec) } + } + + pub fn assume_stable_constant_names(&mut self, asm: &mut Assembler, id: *const ID) -> Option<()> { + jit_ensure_block_entry_exit(self, asm)?; + self.stable_constant_names_assumption = Some(id); + + Some(()) + } + + pub fn queue_outgoing_branch(&mut self, branch: PendingBranchRef) { + self.pending_outgoing.push(branch) + } + + /// Push a symbol for --yjit-perf + fn perf_symbol_push(&mut self, asm: &mut Assembler, symbol_name: &str) { + if !self.perf_stack.is_empty() { + self.perf_symbol_range_end(asm); + } + self.perf_stack.push(symbol_name.to_string()); + self.perf_symbol_range_start(asm, symbol_name); + } + + /// Pop the stack-top symbol for --yjit-perf + fn perf_symbol_pop(&mut self, asm: &mut Assembler) { + self.perf_symbol_range_end(asm); + self.perf_stack.pop(); + if let Some(symbol_name) = self.perf_stack.get(0) { + self.perf_symbol_range_start(asm, symbol_name); + } + } + + /// Mark the start address of a symbol to be reported to perf + fn perf_symbol_range_start(&self, asm: &mut Assembler, symbol_name: &str) { + let symbol_name = format!("[JIT] {}", symbol_name); + let syms = self.perf_map.clone(); + asm.pos_marker(move |start, _| syms.borrow_mut().push((start, None, symbol_name.clone()))); + } + + /// Mark the end address of a symbol to be reported to perf + fn perf_symbol_range_end(&self, asm: &mut Assembler) { + let syms = self.perf_map.clone(); + asm.pos_marker(move |end, _| { + if let Some((_, ref mut end_store, _)) = syms.borrow_mut().last_mut() { + assert_eq!(None, *end_store); + *end_store = Some(end); + } + }); + } + + /// Flush addresses and symbols to /tmp/perf-{pid}.map + fn flush_perf_symbols(&self, cb: &CodeBlock) { + assert_eq!(0, self.perf_stack.len()); + let path = format!("/tmp/perf-{}.map", std::process::id()); + let mut f = std::io::BufWriter::new(std::fs::File::options().create(true).append(true).open(path).unwrap()); + for sym in self.perf_map.borrow().iter() { + if let (start, Some(end), name) = sym { + // In case the code straddles two pages, part of it belongs to the symbol. + for (inline_start, inline_end) in cb.writable_addrs(*start, *end) { + use std::io::Write; + let code_size = inline_end - inline_start; + writeln!(f, "{inline_start:x} {code_size:x} {name}").unwrap(); + } + } + } + } + + /// Return true if we're compiling a send-like instruction, not an opt_* instruction. + pub fn is_sendish(&self) -> bool { + match unsafe { rb_iseq_opcode_at_pc(self.iseq, self.pc) } as u32 { + YARVINSN_send | + YARVINSN_opt_send_without_block | + YARVINSN_invokesuper => true, + _ => false, + } + } + + /// Return the number of locals in the current ISEQ + pub fn num_locals(&self) -> u32 { + unsafe { get_iseq_body_local_table_size(self.iseq) } } } -macro_rules! 
gen_counter_incr { - ($asm:tt, $counter_name:ident) => { - if (get_option!(gen_stats)) { - // Get a pointer to the counter variable - let ptr = ptr_to_counter!($counter_name); +/// Macro to call jit.perf_symbol_push() without evaluating arguments when +/// the option is turned off, which is useful for avoiding string allocation. +macro_rules! jit_perf_symbol_push { + ($jit:expr, $asm:expr, $symbol_name:expr, $perf_map:expr) => { + if get_option!(perf_map) == Some($perf_map) { + $jit.perf_symbol_push($asm, $symbol_name); + } + }; +} - // Load the pointer into a register - $asm.comment(&format!("increment counter {}", stringify!($counter_name))); - let ptr_reg = $asm.load(Opnd::const_ptr(ptr as *const u8)); - let counter_opnd = Opnd::mem(64, ptr_reg, 0); +/// Macro to call jit.perf_symbol_pop(), for consistency with jit_perf_symbol_push!(). +macro_rules! jit_perf_symbol_pop { + ($jit:expr, $asm:expr, $perf_map:expr) => { + if get_option!(perf_map) == Some($perf_map) { + $jit.perf_symbol_pop($asm); + } + }; +} - // Increment and store the updated value - $asm.incr_counter(counter_opnd, Opnd::UImm(1)); +/// Macro to push and pop a perf symbol around a function call. +macro_rules! perf_call { + // perf_call!("prefix: ", func(...)) uses "prefix: func" as a symbol. + ($prefix:expr, $func_name:ident($jit:expr, $asm:expr$(, $arg:expr)*$(,)?) ) => { + { + jit_perf_symbol_push!($jit, $asm, &format!("{}{}", $prefix, stringify!($func_name)), PerfMap::Codegen); + let ret = $func_name($jit, $asm, $($arg),*); + jit_perf_symbol_pop!($jit, $asm, PerfMap::Codegen); + ret } }; + // perf_call! { func(...) } uses "func" as a symbol. + { $func_name:ident($jit:expr, $asm:expr$(, $arg:expr)*$(,)?) } => { + perf_call!("", $func_name($jit, $asm, $($arg),*)) + }; } -macro_rules! counted_exit { - ($ocb:tt, $existing_side_exit:tt, $counter_name:ident) => { - // The counter is only incremented when stats are enabled - if (!get_option!(gen_stats)) { - $existing_side_exit - } else { - let ocb = $ocb.unwrap(); - let code_ptr = ocb.get_write_ptr(); +use crate::codegen::JCCKinds::*; +use crate::log::Log; - let mut ocb_asm = Assembler::new(); +#[allow(non_camel_case_types, unused)] +pub enum JCCKinds { + JCC_JNE, + JCC_JNZ, + JCC_JZ, + JCC_JE, + JCC_JB, + JCC_JBE, + JCC_JNA, + JCC_JNAE, + JCC_JO_MUL, +} - // Increment the counter - gen_counter_incr!(ocb_asm, $counter_name); +/// Generate code to increment a given counter. With --yjit-trace-exits=counter, +/// the counter is traced when it's incremented by this function. +#[inline(always)] +fn gen_counter_incr(jit: &JITState, asm: &mut Assembler, counter: Counter) { + gen_counter_incr_with_pc(asm, counter, jit.pc); +} - // Jump to the existing side exit - ocb_asm.jmp($existing_side_exit); - ocb_asm.compile(ocb); +/// Same as gen_counter_incr(), but takes PC isntead of JITState. +#[inline(always)] +fn gen_counter_incr_with_pc(asm: &mut Assembler, counter: Counter, pc: *mut VALUE) { + gen_counter_incr_without_pc(asm, counter); - // Pointer to the side-exit code - code_ptr.as_side_exit() - } - }; + // Trace a counter if --yjit-trace-exits=counter is given. + // TraceExits::All is handled by gen_exit(). + if get_option!(trace_exits) == Some(TraceExits::Counter(counter)) { + with_caller_saved_temp_regs(asm, |asm| { + asm.ccall(rb_yjit_record_exit_stack as *const u8, vec![Opnd::const_ptr(pc as *const u8)]); + }); + } +} + +/// Generate code to increment a given counter. Not traced by --yjit-trace-exits=counter +/// unlike gen_counter_incr() or gen_counter_incr_with_pc(). 
+#[inline(always)] +fn gen_counter_incr_without_pc(asm: &mut Assembler, counter: Counter) { + // Assert that default counters are not incremented by generated code as this would impact performance + assert!(!DEFAULT_COUNTERS.contains(&counter), "gen_counter_incr incremented {:?}", counter); + + if get_option!(gen_stats) { + asm_comment!(asm, "increment counter {}", counter.get_name()); + let ptr = get_counter_ptr(&counter.get_name()); + let ptr_reg = asm.load(Opnd::const_ptr(ptr as *const u8)); + let counter_opnd = Opnd::mem(64, ptr_reg, 0); + + // Increment and store the updated value + asm.incr_counter(counter_opnd, Opnd::UImm(1)); + } } // Save the incremented PC on the CFP @@ -230,7 +567,7 @@ fn jit_save_pc(jit: &JITState, asm: &mut Assembler) { pc.offset(cur_insn_len) }; - asm.comment("save PC to CFP"); + asm_comment!(asm, "save PC to CFP"); asm.mov(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_PC), Opnd::const_ptr(ptr as *const u8)); } @@ -238,43 +575,116 @@ fn jit_save_pc(jit: &JITState, asm: &mut Assembler) { /// This realigns the interpreter SP with the JIT SP /// Note: this will change the current value of REG_SP, /// which could invalidate memory operands -fn gen_save_sp(_jit: &JITState, asm: &mut Assembler, ctx: &mut Context) { - if ctx.get_sp_offset() != 0 { - asm.comment("save SP to CFP"); - let stack_pointer = ctx.sp_opnd(0); +fn gen_save_sp(asm: &mut Assembler) { + gen_save_sp_with_offset(asm, 0); +} + +/// Save the current SP + offset on the CFP +fn gen_save_sp_with_offset(asm: &mut Assembler, offset: i8) { + if asm.ctx.get_sp_offset() != -offset { + asm_comment!(asm, "save SP to CFP"); + let stack_pointer = asm.ctx.sp_opnd(offset as i32); let sp_addr = asm.lea(stack_pointer); asm.mov(SP, sp_addr); let cfp_sp_opnd = Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP); asm.mov(cfp_sp_opnd, SP); - ctx.set_sp_offset(0); + asm.ctx.set_sp_offset(-offset); + } +} + +/// Basically jit_prepare_non_leaf_call(), but this registers the current PC +/// to lazily push a C method frame when it's necessary. +fn jit_prepare_lazy_frame_call( + jit: &mut JITState, + asm: &mut Assembler, + cme: *const rb_callable_method_entry_t, + recv_opnd: YARVOpnd, +) -> bool { + // We can use this only when the receiver is on stack. + let recv_idx = match recv_opnd { + StackOpnd(recv_idx) => recv_idx, + _ => unreachable!("recv_opnd must be on stack, but got: {:?}", recv_opnd), + }; + + // Get the next PC. jit_save_pc() saves that PC. + let pc: *mut VALUE = unsafe { + let cur_insn_len = insn_len(jit.get_opcode()) as isize; + jit.get_pc().offset(cur_insn_len) + }; + + let pc_to_cfunc = CodegenGlobals::get_pc_to_cfunc(); + match pc_to_cfunc.get(&pc) { + Some(&(other_cme, _)) if other_cme != cme => { + // Bail out if it's not the only cme on this callsite. + incr_counter!(lazy_frame_failure); + return false; + } + _ => { + // Let rb_yjit_lazy_push_frame() lazily push a C frame on this PC. + incr_counter!(lazy_frame_count); + pc_to_cfunc.insert(pc, (cme, recv_idx)); + } } + + // Save the PC to trigger a lazy frame push, and save the SP to get the receiver. + // The C func may call a method that doesn't raise, so prepare for invalidation too. + jit_prepare_non_leaf_call(jit, asm); + + // Make sure we're ready for calling rb_vm_push_cfunc_frame(). + let cfunc_argc = unsafe { get_mct_argc(get_cme_def_body_cfunc(cme)) }; + if cfunc_argc != -1 { + assert_eq!(recv_idx as i32, cfunc_argc); // verify the receiver index if possible + } + assert!(asm.get_leaf_ccall()); // It checks the stack canary we set for known_cfunc_codegen. 
+
+    true
 }
 
-/// jit_save_pc() + gen_save_sp(). Should be used before calling a routine that
-/// could:
+/// jit_save_pc() + gen_save_sp(). Should be used before calling a routine that could:
 /// - Perform GC allocation
 /// - Take the VM lock through RB_VM_LOCK_ENTER()
 /// - Perform Ruby method call
-fn jit_prepare_routine_call(
+///
+/// If the routine doesn't call arbitrary methods, use jit_prepare_call_with_gc() instead.
+fn jit_prepare_non_leaf_call(
     jit: &mut JITState,
-    ctx: &mut Context,
     asm: &mut Assembler
 ) {
-    jit.record_boundary_patch_point = true;
-    jit_save_pc(jit, asm);
-    gen_save_sp(jit, asm, ctx);
+    // Prepare for GC. Setting PC also prepares for showing a backtrace.
+    jit.record_boundary_patch_point = true; // VM lock could trigger invalidation
+    jit_save_pc(jit, asm); // for allocation tracing
+    gen_save_sp(asm); // protect objects from GC
 
     // In case the routine calls Ruby methods, it can set local variables
-    // through Kernel#binding and other means.
-    ctx.clear_local_types();
+    // through Kernel#binding, rb_debug_inspector API, and other means.
+    asm.clear_local_types();
+}
+
+/// jit_save_pc() + gen_save_sp(). Should be used before calling a routine that could:
+/// - Perform GC allocation
+/// - Take the VM lock through RB_VM_LOCK_ENTER()
+fn jit_prepare_call_with_gc(
+    jit: &mut JITState,
+    asm: &mut Assembler
+) {
+    jit.record_boundary_patch_point = true; // VM lock could trigger invalidation
+    jit_save_pc(jit, asm); // for allocation tracing
+    gen_save_sp(asm); // protect objects from GC
+
+    // Expect a leaf ccall(). You should use jit_prepare_non_leaf_call() if otherwise.
+    asm.expect_leaf_ccall();
 }
 
 /// Record the current codeblock write position for rewriting into a jump into
 /// the outlined block later. Used to implement global code invalidation.
 fn record_global_inval_patch(asm: &mut Assembler, outline_block_target_pos: CodePtr) {
+    // We add a padding before pos_marker so that the previous patch will not overlap this.
+    // jump_to_next_insn() puts a patch point at the end of the block in fallthrough cases.
+    // In the fallthrough case, the next block should start with the same Context, so the
+    // patch is fine, but it should not overlap another patch.
     asm.pad_inval_patch();
-    asm.pos_marker(move |code_ptr| {
-        CodegenGlobals::push_global_inval_patch(code_ptr, outline_block_target_pos);
+    asm.pos_marker(move |code_ptr, cb| {
+        CodegenGlobals::push_global_inval_patch(code_ptr, outline_block_target_pos, cb);
     });
 }
 
@@ -285,14 +695,36 @@ fn verify_ctx(jit: &JITState, ctx: &Context) {
         unsafe { CStr::from_ptr(rb_obj_info(val)).to_str().unwrap() }
     }
 
+    // Some types such as CString only assert the class field of the object
+    // when there has never been a singleton class created for objects of that class.
+    // Once there is a singleton class created they become their weaker
+    // `T*` variant, and more objects should pass the verification.
+ fn relax_type_with_singleton_class_assumption(ty: Type) -> Type { + if let Type::CString | Type::CArray | Type::CHash = ty { + if has_singleton_class_of(ty.known_class().unwrap()) { + match ty { + Type::CString => return Type::TString, + Type::CArray => return Type::TArray, + Type::CHash => return Type::THash, + _ => (), + } + } + } + + ty + } + // Only able to check types when at current insn - assert!(jit_at_current_insn(jit)); + assert!(jit.at_compile_target()); - let self_val = jit_peek_at_self(jit); + let self_val = jit.peek_at_self(); let self_val_type = Type::from(self_val); + let learned_self_type = ctx.get_opnd_type(SelfOpnd); + let learned_self_type = relax_type_with_singleton_class_assumption(learned_self_type); + // Verify self operand type - if self_val_type.diff(ctx.get_opnd_type(SelfOpnd)) == usize::MAX { + if self_val_type.diff(learned_self_type) == TypeDiff::Incompatible { panic!( "verify_ctx: ctx self type ({:?}) incompatible with actual value of self {}", ctx.get_opnd_type(SelfOpnd), @@ -301,10 +733,13 @@ fn verify_ctx(jit: &JITState, ctx: &Context) { } // Verify stack operand types - let top_idx = cmp::min(ctx.get_stack_size(), MAX_TEMP_TYPES as u16); + let top_idx = cmp::min(ctx.get_stack_size(), MAX_CTX_TEMPS as u8); for i in 0..top_idx { - let (learned_mapping, learned_type) = ctx.get_opnd_mapping(StackOpnd(i)); - let stack_val = jit_peek_at_stack(jit, ctx, i as isize); + let learned_mapping = ctx.get_opnd_mapping(StackOpnd(i)); + let learned_type = ctx.get_opnd_type(StackOpnd(i)); + let learned_type = relax_type_with_singleton_class_assumption(learned_type); + + let stack_val = jit.peek_at_stack(ctx, i as isize); let val_type = Type::from(stack_val); match learned_mapping { @@ -318,7 +753,7 @@ fn verify_ctx(jit: &JITState, ctx: &Context) { } } TempMapping::MapToLocal(local_idx) => { - let local_val = jit_peek_at_local(jit, local_idx.into()); + let local_val = jit.peek_at_local(local_idx.into()); if local_val != stack_val { panic!( "verify_ctx: stack value was mapped to local, but values did not match\n stack: {}\n local {}: {}", @@ -328,28 +763,30 @@ fn verify_ctx(jit: &JITState, ctx: &Context) { ); } } - TempMapping::MapToStack => {} + TempMapping::MapToStack(_) => {} } // If the actual type differs from the learned type - if val_type.diff(learned_type) == usize::MAX { + if val_type.diff(learned_type) == TypeDiff::Incompatible { panic!( - "verify_ctx: ctx type ({:?}) incompatible with actual value on stack: {}", + "verify_ctx: ctx type ({:?}) incompatible with actual value on stack: {} ({:?})", learned_type, - obj_info_str(stack_val) + obj_info_str(stack_val), + val_type, ); } } // Verify local variable types let local_table_size = unsafe { get_iseq_body_local_table_size(jit.iseq) }; - let top_idx: usize = cmp::min(local_table_size as usize, MAX_TEMP_TYPES); + let top_idx: usize = cmp::min(local_table_size as usize, MAX_CTX_TEMPS); for i in 0..top_idx { let learned_type = ctx.get_local_type(i); - let local_val = jit_peek_at_local(jit, i as i32); + let learned_type = relax_type_with_singleton_class_assumption(learned_type); + let local_val = jit.peek_at_local(i as i32); let local_type = Type::from(local_val); - if local_type.diff(learned_type) == usize::MAX { + if local_type.diff(learned_type) == TypeDiff::Incompatible { panic!( "verify_ctx: ctx type ({:?}) incompatible with actual value of local: {} (type {:?})", learned_type, @@ -364,14 +801,13 @@ fn verify_ctx(jit: &JITState, ctx: &Context) { // to the interpreter when it cannot service a stub by generating new 
code. // Before coming here, branch_stub_hit() takes care of fully reconstructing // interpreter state. -fn gen_code_for_exit_from_stub(ocb: &mut OutlinedCb) -> CodePtr { +fn gen_stub_exit(ocb: &mut OutlinedCb) -> Option<CodePtr> { let ocb = ocb.unwrap(); - let code_ptr = ocb.get_write_ptr(); - let mut asm = Assembler::new(); + let mut asm = Assembler::new_without_iseq(); - gen_counter_incr!(asm, exit_from_branch_stub); + gen_counter_incr_without_pc(&mut asm, Counter::exit_from_branch_stub); - asm.comment("exit from branch stub"); + asm_comment!(asm, "exit from branch stub"); asm.cpop_into(SP); asm.cpop_into(EC); asm.cpop_into(CFP); @@ -380,23 +816,30 @@ fn gen_code_for_exit_from_stub(ocb: &mut OutlinedCb) -> CodePtr { asm.cret(Qundef.into()); - asm.compile(ocb); - - code_ptr + asm.compile(ocb, None).map(|(code_ptr, _)| code_ptr) } /// Generate an exit to return to the interpreter -fn gen_exit(exit_pc: *mut VALUE, ctx: &Context, asm: &mut Assembler) { - #[cfg(all(feature = "disasm", not(test)))] - { +fn gen_exit(exit_pc: *mut VALUE, asm: &mut Assembler) { + #[cfg(not(test))] + asm_comment!(asm, "exit to interpreter on {}", { let opcode = unsafe { rb_vm_insn_addr2opcode((*exit_pc).as_ptr()) }; - asm.comment(&format!("exit to interpreter on {}", insn_name(opcode as usize))); + insn_name(opcode as usize) + }); + + if asm.ctx.is_return_landing() { + asm.mov(SP, Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP)); + let top = asm.stack_push(Type::Unknown); + asm.mov(top, C_RET_OPND); } + // Spill stack temps before returning to the interpreter + asm.spill_regs(); + // Generate the code to exit to the interpreters // Write the adjusted SP back into the CFP - if ctx.get_sp_offset() != 0 { - let sp_opnd = asm.lea(ctx.sp_opnd(0)); + if asm.ctx.get_sp_offset() != 0 { + let sp_opnd = asm.lea(asm.ctx.sp_opnd(0)); asm.mov( Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP), sp_opnd @@ -416,9 +859,9 @@ fn gen_exit(exit_pc: *mut VALUE, ctx: &Context, asm: &mut Assembler) { vec![Opnd::const_ptr(exit_pc as *const u8)] ); - // If --yjit-trace-exits option is enabled, record the exit stack - // while recording the side exits. - if get_option!(gen_trace_exits) { + // If --yjit-trace-exits is enabled, record the exit stack while recording + // the side exits. TraceExits::Counter is handled by gen_counted_exit(). + if get_option!(trace_exits) == Some(TraceExits::All) { asm.ccall( rb_yjit_record_exit_stack as *const u8, vec![Opnd::const_ptr(exit_pc as *const u8)] @@ -435,81 +878,108 @@ fn gen_exit(exit_pc: *mut VALUE, ctx: &Context, asm: &mut Assembler) { asm.cret(Qundef.into()); } -/// Generate an exit to the interpreter in the outlined code block -fn gen_outlined_exit(exit_pc: *mut VALUE, ctx: &Context, ocb: &mut OutlinedCb) -> CodePtr { +/// :side-exit: +/// Get an exit for the current instruction in the outlined block. The code +/// for each instruction often begins with several guards before proceeding +/// to do work. When guards fail, an option we have is to exit to the +/// interpreter at an instruction boundary. The piece of code that takes +/// care of reconstructing interpreter state and exiting out of generated +/// code is called the side exit. +/// +/// No guards change the logic for reconstructing interpreter state at the +/// moment, so there is one unique side exit for each context. Note that +/// it's incorrect to jump to the side exit after any ctx stack push operations +/// since they change the logic required for reconstructing interpreter state. 
+///
+/// If you're in [the codegen module][self], use [JITState::gen_outlined_exit]
+/// instead of calling this directly.
+#[must_use]
+pub fn gen_outlined_exit(exit_pc: *mut VALUE, num_locals: u32, ctx: &Context, ocb: &mut OutlinedCb) -> Option<CodePtr> {
     let mut cb = ocb.unwrap();
-    let exit_code = cb.get_write_ptr();
-    let mut asm = Assembler::new();
+    let mut asm = Assembler::new(num_locals);
+    asm.ctx = *ctx;
+    asm.set_reg_mapping(ctx.get_reg_mapping());
+
+    gen_exit(exit_pc, &mut asm);
+
+    asm.compile(&mut cb, None).map(|(code_ptr, _)| code_ptr)
+}
+
+/// Get a side exit. Increment a counter in it if --yjit-stats is enabled.
+pub fn gen_counted_exit(exit_pc: *mut VALUE, side_exit: CodePtr, ocb: &mut OutlinedCb, counter: Option<Counter>) -> Option<CodePtr> {
+    // The counter is only incremented when stats are enabled
+    if !get_option!(gen_stats) {
+        return Some(side_exit);
+    }
+    let counter = match counter {
+        Some(counter) => counter,
+        None => return Some(side_exit),
+    };
 
-    gen_exit(exit_pc, ctx, &mut asm);
+    let mut asm = Assembler::new_without_iseq();
 
-    asm.compile(&mut cb);
+    // Increment a counter
+    gen_counter_incr_with_pc(&mut asm, counter, exit_pc);
 
-    exit_code
+    // Jump to the existing side exit
+    asm.jmp(Target::CodePtr(side_exit));
+
+    let ocb = ocb.unwrap();
+    asm.compile(ocb, None).map(|(code_ptr, _)| code_ptr)
 }
 
-// :side-exit:
-// Get an exit for the current instruction in the outlined block. The code
-// for each instruction often begins with several guards before proceeding
-// to do work. When guards fail, an option we have is to exit to the
-// interpreter at an instruction boundary. The piece of code that takes
-// care of reconstructing interpreter state and exiting out of generated
-// code is called the side exit.
-//
-// No guards change the logic for reconstructing interpreter state at the
-// moment, so there is one unique side exit for each context. Note that
-// it's incorrect to jump to the side exit after any ctx stack push operations
-// since they change the logic required for reconstructing interpreter state.
-fn get_side_exit(jit: &mut JITState, ocb: &mut OutlinedCb, ctx: &Context) -> Target {
-    match jit.side_exit_for_pc {
-        None => {
-            let exit_code = gen_outlined_exit(jit.pc, ctx, ocb);
-            jit.side_exit_for_pc = Some(exit_code);
-            exit_code.as_side_exit()
-        }
-        Some(code_ptr) => code_ptr.as_side_exit()
+/// Preserve caller-saved stack temp registers during the call of a given block
+fn with_caller_saved_temp_regs<F, R>(asm: &mut Assembler, block: F) -> R where F: FnOnce(&mut Assembler) -> R {
+    for &reg in caller_saved_temp_regs() {
+        asm.cpush(Opnd::Reg(reg)); // save stack temps
+    }
+    let ret = block(asm);
+    for &reg in caller_saved_temp_regs().rev() {
+        asm.cpop_into(Opnd::Reg(reg)); // restore stack temps
     }
+    ret
 }
 
 // Ensure that there is an exit for the start of the block being compiled.
 // Block invalidation uses this exit.
-pub fn jit_ensure_block_entry_exit(jit: &mut JITState, ocb: &mut OutlinedCb) {
-    let blockref = jit.block.clone();
-    let mut block = blockref.borrow_mut();
-    let block_ctx = block.get_ctx();
-    let blockid = block.get_blockid();
-
-    if block.entry_exit.is_some() {
-        return;
+#[must_use]
+pub fn jit_ensure_block_entry_exit(jit: &mut JITState, asm: &mut Assembler) -> Option<()> {
+    if jit.block_entry_exit.is_some() {
+        return Some(());
     }
 
+    let block_starting_context = &jit.get_starting_ctx();
+
     // If we're compiling the first instruction in the block.
- if jit.insn_idx == blockid.idx { - // Generate the exit with the cache in jitstate. - block.entry_exit = Some(get_side_exit(jit, ocb, &block_ctx).unwrap_code_ptr()); + if jit.insn_idx == jit.starting_insn_idx { + // Generate the exit with the cache in Assembler. + let side_exit_context = SideExitContext::new(jit.pc, *block_starting_context); + let entry_exit = asm.get_side_exit(&side_exit_context, None, jit.get_ocb()); + jit.block_entry_exit = Some(entry_exit?); } else { - let _pc = unsafe { rb_iseq_pc_at_idx(blockid.iseq, blockid.idx) }; - block.entry_exit = Some(gen_outlined_exit(jit.pc, &block_ctx, ocb)); + let block_entry_pc = unsafe { rb_iseq_pc_at_idx(jit.iseq, jit.starting_insn_idx.into()) }; + jit.block_entry_exit = Some(jit.gen_outlined_exit(block_entry_pc, block_starting_context)?); } + + Some(()) } // Landing code for when c_return tracing is enabled. See full_cfunc_return(). -fn gen_full_cfunc_return(ocb: &mut OutlinedCb) -> CodePtr { +fn gen_full_cfunc_return(ocb: &mut OutlinedCb) -> Option<CodePtr> { let ocb = ocb.unwrap(); - let code_ptr = ocb.get_write_ptr(); - let mut asm = Assembler::new(); + let mut asm = Assembler::new_without_iseq(); // This chunk of code expects REG_EC to be filled properly and // RAX to contain the return value of the C method. - asm.comment("full cfunc return"); + asm_comment!(asm, "full cfunc return"); asm.ccall( rb_full_cfunc_return as *const u8, vec![EC, C_RET_OPND] ); // Count the exit - gen_counter_incr!(asm, traced_cfunc_return); + gen_counter_incr_without_pc(&mut asm, Counter::traced_cfunc_return); // Return to the interpreter asm.cpop_into(SP); @@ -520,26 +990,23 @@ fn gen_full_cfunc_return(ocb: &mut OutlinedCb) -> CodePtr { asm.cret(Qundef.into()); - asm.compile(ocb); - - return code_ptr; + asm.compile(ocb, None).map(|(code_ptr, _)| code_ptr) } /// Generate a continuation for leave that exits to the interpreter at REG_CFP->pc. /// This is used by gen_leave() and gen_entry_prologue() -fn gen_leave_exit(ocb: &mut OutlinedCb) -> CodePtr { +fn gen_leave_exit(ocb: &mut OutlinedCb) -> Option<CodePtr> { let ocb = ocb.unwrap(); - let code_ptr = ocb.get_write_ptr(); - let mut asm = Assembler::new(); + let mut asm = Assembler::new_without_iseq(); // gen_leave() fully reconstructs interpreter state and leaves the // return value in C_RET_OPND before coming here. let ret_opnd = asm.live_reg_opnd(C_RET_OPND); // Every exit to the interpreter should be counted - gen_counter_incr!(asm, leave_interp_return); + gen_counter_incr_without_pc(&mut asm, Counter::leave_interp_return); - asm.comment("exit from leave"); + asm_comment!(asm, "exit from leave"); asm.cpop_into(SP); asm.cpop_into(EC); asm.cpop_into(CFP); @@ -548,52 +1015,86 @@ fn gen_leave_exit(ocb: &mut OutlinedCb) -> CodePtr { asm.cret(ret_opnd); - asm.compile(ocb); - - return code_ptr; + asm.compile(ocb, None).map(|(code_ptr, _)| code_ptr) } -// Generate a runtime guard that ensures the PC is at the expected -// instruction index in the iseq, otherwise takes a side-exit. -// This is to handle the situation of optional parameters. -// When a function with optional parameters is called, the entry -// PC for the method isn't necessarily 0. -fn gen_pc_guard(asm: &mut Assembler, iseq: IseqPtr, insn_idx: u32) { - let pc_opnd = Opnd::mem(64, CFP, RUBY_OFFSET_CFP_PC); - let expected_pc = unsafe { rb_iseq_pc_at_idx(iseq, insn_idx) }; - let expected_pc_opnd = Opnd::const_ptr(expected_pc as *const u8); +// Increment SP and transfer the execution to the interpreter after jit_exec_exception(). 
+// On jit_exec_exception(), you need to return Qundef to keep executing caller non-FINISH +// frames on the interpreter. You also need to increment SP to push the return value to +// the caller's stack, which is different from gen_stub_exit(). +fn gen_leave_exception(ocb: &mut OutlinedCb) -> Option<CodePtr> { + let ocb = ocb.unwrap(); + let mut asm = Assembler::new_without_iseq(); - asm.cmp(pc_opnd, expected_pc_opnd); + // gen_leave() leaves the return value in C_RET_OPND before coming here. + let ruby_ret_val = asm.live_reg_opnd(C_RET_OPND); - let pc_match = asm.new_label("pc_match"); - asm.je(pc_match); + // Every exit to the interpreter should be counted + gen_counter_incr_without_pc(&mut asm, Counter::leave_interp_return); - // We're not starting at the first PC, so we need to exit. - gen_counter_incr!(asm, leave_start_pc_non_zero); + asm_comment!(asm, "push return value through cfp->sp"); + let cfp_sp = Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP); + let sp = asm.load(cfp_sp); + asm.mov(Opnd::mem(64, sp, 0), ruby_ret_val); + let new_sp = asm.add(sp, SIZEOF_VALUE.into()); + asm.mov(cfp_sp, new_sp); + asm_comment!(asm, "exit from exception"); asm.cpop_into(SP); asm.cpop_into(EC); asm.cpop_into(CFP); asm.frame_teardown(); + // Execute vm_exec_core asm.cret(Qundef.into()); - // PC should match the expected insn_idx - asm.write_label(pc_match); + asm.compile(ocb, None).map(|(code_ptr, _)| code_ptr) +} + +// Generate a runtime guard that ensures the PC is at the expected +// instruction index in the iseq, otherwise takes an entry stub +// that generates another check and entry. +// This is to handle the situation of optional parameters. +// When a function with optional parameters is called, the entry +// PC for the method isn't necessarily 0. +pub fn gen_entry_chain_guard( + asm: &mut Assembler, + ocb: &mut OutlinedCb, + blockid: BlockId, +) -> Option<PendingEntryRef> { + let entry = new_pending_entry(); + let stub_addr = gen_entry_stub(entry.uninit_entry.as_ptr() as usize, ocb)?; + + let pc_opnd = Opnd::mem(64, CFP, RUBY_OFFSET_CFP_PC); + let expected_pc = unsafe { rb_iseq_pc_at_idx(blockid.iseq, blockid.idx.into()) }; + let expected_pc_opnd = Opnd::const_ptr(expected_pc as *const u8); + + asm_comment!(asm, "guard expected PC"); + asm.cmp(pc_opnd, expected_pc_opnd); + + asm.mark_entry_start(&entry); + asm.jne(stub_addr.into()); + asm.mark_entry_end(&entry); + return Some(entry); } /// Compile an interpreter entry block to be inserted into an iseq /// Returns None if compilation fails. -pub fn gen_entry_prologue(cb: &mut CodeBlock, iseq: IseqPtr, insn_idx: u32) -> Option<CodePtr> { +/// If jit_exception is true, compile JIT code for handling exceptions. +/// See jit_compile_exception() for details. 
+pub fn gen_entry_prologue( + cb: &mut CodeBlock, + ocb: &mut OutlinedCb, + blockid: BlockId, + stack_size: u8, + jit_exception: bool, +) -> Option<(CodePtr, RegMapping)> { + let iseq = blockid.iseq; let code_ptr = cb.get_write_ptr(); - let mut asm = Assembler::new(); - if get_option_ref!(dump_disasm).is_some() { - asm.comment(&format!("YJIT entry point: {}", iseq_get_location(iseq, 0))); - } else { - asm.comment("YJIT entry"); - } + let mut asm = Assembler::new(unsafe { get_iseq_body_local_table_size(iseq) }); + asm_comment!(asm, "YJIT entry point: {}", iseq_get_location(iseq, 0)); asm.frame_setup(); @@ -610,22 +1111,43 @@ pub fn gen_entry_prologue(cb: &mut CodeBlock, iseq: IseqPtr, insn_idx: u32) -> O asm.mov(SP, Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP)); // Setup cfp->jit_return - asm.mov( - Opnd::mem(64, CFP, RUBY_OFFSET_CFP_JIT_RETURN), - Opnd::const_ptr(CodegenGlobals::get_leave_exit_code().raw_ptr()), - ); + // If this is an exception handler entry point + if jit_exception { + // On jit_exec_exception(), it's NOT safe to return a non-Qundef value + // from a non-FINISH frame. This function fixes that problem. + // See [jit_compile_exception] for details. + asm.ccall( + rb_yjit_set_exception_return as *mut u8, + vec![ + CFP, + Opnd::const_ptr(CodegenGlobals::get_leave_exit_code().raw_ptr(cb)), + Opnd::const_ptr(CodegenGlobals::get_leave_exception_code().raw_ptr(cb)), + ], + ); + } else { + // On jit_exec() or JIT_EXEC(), it's safe to return a non-Qundef value + // on the entry frame. See [jit_compile] for details. + asm.mov( + Opnd::mem(64, CFP, RUBY_OFFSET_CFP_JIT_RETURN), + Opnd::const_ptr(CodegenGlobals::get_leave_exit_code().raw_ptr(cb)), + ); + } - // We're compiling iseqs that we *expect* to start at `insn_idx`. But in - // the case of optional parameters, the interpreter can set the pc to a - // different location depending on the optional parameters. If an iseq - // has optional parameters, we'll add a runtime check that the PC we've + // We're compiling iseqs that we *expect* to start at `insn_idx`. + // But in the case of optional parameters or when handling exceptions, + // the interpreter can set the pc to a different location. For + // such scenarios, we'll add a runtime check that the PC we've // compiled for is the same PC that the interpreter wants us to run with. - // If they don't match, then we'll take a side exit. - if unsafe { get_iseq_flags_has_opt(iseq) } { - gen_pc_guard(&mut asm, iseq, insn_idx); - } + // If they don't match, then we'll jump to an entry stub and generate + // another PC check and entry there. + let pending_entry = if unsafe { get_iseq_flags_has_opt(iseq) } || jit_exception { + Some(gen_entry_chain_guard(&mut asm, ocb, blockid)?) + } else { + None + }; + let reg_mapping = gen_entry_reg_mapping(&mut asm, blockid, stack_size); - asm.compile(cb); + asm.compile(cb, Some(ocb))?; if cb.has_dropped_bytes() { None @@ -635,53 +1157,98 @@ pub fn gen_entry_prologue(cb: &mut CodeBlock, iseq: IseqPtr, insn_idx: u32) -> O for page in cb.addrs_to_pages(code_ptr, cb.get_write_ptr()) { iseq_payload.pages.insert(page); } - Some(code_ptr) + // Write an entry to the heap and push it to the ISEQ + if let Some(pending_entry) = pending_entry { + let pending_entry = Rc::try_unwrap(pending_entry) + .ok().expect("PendingEntry should be unique"); + iseq_payload.entries.push(pending_entry.into_entry()); + } + Some((code_ptr, reg_mapping)) + } +} + +/// Generate code to load registers for a JIT entry. 
When the entry block is compiled for
+/// the first time, it loads no register. When it has already been compiled as a callee
+/// block, it loads some registers to reuse the block.
+pub fn gen_entry_reg_mapping(asm: &mut Assembler, blockid: BlockId, stack_size: u8) -> RegMapping {
+    // Find an existing callee block. If it's not found or uses no register, skip loading registers.
+    let mut ctx = Context::default();
+    ctx.set_stack_size(stack_size);
+    let reg_mapping = find_most_compatible_reg_mapping(blockid, &ctx).unwrap_or(RegMapping::default());
+    if reg_mapping == RegMapping::default() {
+        return reg_mapping;
+    }
+
+    // If found, load the same registers to reuse the block.
+    asm_comment!(asm, "reuse maps: {:?}", reg_mapping);
+    let local_table_size: u32 = unsafe { get_iseq_body_local_table_size(blockid.iseq) }.try_into().unwrap();
+    for &reg_opnd in reg_mapping.get_reg_opnds().iter() {
+        match reg_opnd {
+            RegOpnd::Local(local_idx) => {
+                let loaded_reg = TEMP_REGS[reg_mapping.get_reg(reg_opnd).unwrap()];
+                let loaded_temp = asm.local_opnd(local_table_size - local_idx as u32 + VM_ENV_DATA_SIZE - 1);
+                asm.load_into(Opnd::Reg(loaded_reg), loaded_temp);
+            }
+            RegOpnd::Stack(_) => unreachable!("find_most_compatible_reg_mapping should not leave {:?}", reg_opnd),
+        }
     }
+
+    reg_mapping
 }
 
 // Generate code to check for interrupts and take a side-exit.
 // Warning: this function clobbers REG0
-fn gen_check_ints(asm: &mut Assembler, side_exit: Target) {
+fn gen_check_ints(
+    asm: &mut Assembler,
+    counter: Counter,
+) {
     // Check for interrupts
     // see RUBY_VM_CHECK_INTS(ec) macro
-    asm.comment("RUBY_VM_CHECK_INTS(ec)");
+    asm_comment!(asm, "RUBY_VM_CHECK_INTS(ec)");
 
     // Not checking interrupt_mask since it's zero outside finalize_deferred_heap_pages,
     // signal_exec, or rb_postponed_job_flush.
-    let interrupt_flag = asm.load(Opnd::mem(32, EC, RUBY_OFFSET_EC_INTERRUPT_FLAG));
+    let interrupt_flag = asm.load(Opnd::mem(32, EC, RUBY_OFFSET_EC_INTERRUPT_FLAG as i32));
     asm.test(interrupt_flag, interrupt_flag);
-    asm.jnz(side_exit);
+    asm.jnz(Target::side_exit(counter));
 }
 
 // Generate a stubbed unconditional jump to the next bytecode instruction.
 // Blocks that are part of a guard chain can use this to share the same successor.
 fn jump_to_next_insn(
     jit: &mut JITState,
-    current_context: &Context,
     asm: &mut Assembler,
-    ocb: &mut OutlinedCb,
-) {
-    // Reset the depth since in current usages we only ever jump to to
+) -> Option<CodegenStatus> {
+    end_block_with_jump(jit, asm, jit.next_insn_idx())
+}
+
+fn end_block_with_jump(
+    jit: &mut JITState,
+    asm: &mut Assembler,
+    continuation_insn_idx: u16,
+) -> Option<CodegenStatus> {
+    // Reset the depth since in current usages we only ever jump to
     // chain_depth > 0 from the same instruction.
-    let mut reset_depth = current_context.clone();
-    reset_depth.reset_chain_depth();
+    let mut reset_depth = asm.ctx;
+    reset_depth.reset_chain_depth_and_defer();
 
     let jump_block = BlockId {
         iseq: jit.iseq,
-        idx: jit_next_insn_idx(jit),
+        idx: continuation_insn_idx,
     };
 
     // We are at the end of the current instruction. Record the boundary.
if jit.record_boundary_patch_point { - let exit_pc = unsafe { jit.pc.offset(insn_len(jit.opcode).try_into().unwrap()) }; - let exit_pos = gen_outlined_exit(exit_pc, &reset_depth, ocb); - record_global_inval_patch(asm, exit_pos); jit.record_boundary_patch_point = false; + let exit_pc = unsafe { rb_iseq_pc_at_idx(jit.iseq, continuation_insn_idx.into())}; + let exit_pos = jit.gen_outlined_exit(exit_pc, &reset_depth); + record_global_inval_patch(asm, exit_pos?); } // Generate the jump instruction gen_direct_jump(jit, &reset_depth, jump_block, asm); + Some(EndBlock) } // Compile a sequence of bytecode instructions for a given basic block version. @@ -694,44 +1261,66 @@ pub fn gen_single_block( ec: EcPtr, cb: &mut CodeBlock, ocb: &mut OutlinedCb, + first_block: bool, ) -> Result<BlockRef, ()> { // Limit the number of specialized versions for this block - let mut ctx = limit_block_versions(blockid, start_ctx); + let ctx = limit_block_versions(blockid, start_ctx); verify_blockid(blockid); assert!(!(blockid.idx == 0 && ctx.get_stack_size() > 0)); + // Save machine code placement of the block. `cb` might page switch when we + // generate code in `ocb`. + let block_start_addr = cb.get_write_ptr(); + // Instruction sequence to compile let iseq = blockid.iseq; let iseq_size = unsafe { get_iseq_encoded_size(iseq) }; - let mut insn_idx: c_uint = blockid.idx; - let starting_insn_idx = insn_idx; - - // Allocate the new block - let blockref = Block::new(blockid, &ctx); + let iseq_size: IseqIdx = if let Ok(size) = iseq_size.try_into() { + size + } else { + // ISeq too large to compile + return Err(()); + }; + let mut insn_idx: IseqIdx = blockid.idx; // Initialize a JIT state object - let mut jit = JITState::new(&blockref); + let mut jit = JITState::new(blockid, ctx, cb.get_write_ptr(), ec, ocb, first_block); jit.iseq = blockid.iseq; - jit.ec = Some(ec); - - // Mark the start position of the block - blockref.borrow_mut().set_start_addr(cb.get_write_ptr()); // Create a backend assembler instance - let mut asm = Assembler::new(); + let mut asm = Assembler::new(jit.num_locals()); + asm.ctx = ctx; - #[cfg(feature = "disasm")] if get_option_ref!(dump_disasm).is_some() { let blockid_idx = blockid.idx; - asm.comment(&format!("Block: {} (ISEQ offset: {})", iseq_get_location(blockid.iseq, blockid_idx), blockid_idx)); + let chain_depth = if asm.ctx.get_chain_depth() > 0 { format!("(chain_depth: {})", asm.ctx.get_chain_depth()) } else { "".to_string() }; + asm_comment!(asm, "Block: {} {}", iseq_get_location(blockid.iseq, blockid_idx), chain_depth); + asm_comment!(asm, "reg_mapping: {:?}", asm.ctx.get_reg_mapping()); + } + + Log::add_block_with_chain_depth(blockid, asm.ctx.get_chain_depth()); + + // Mark the start of an ISEQ for --yjit-perf + jit_perf_symbol_push!(jit, &mut asm, &get_iseq_name(iseq), PerfMap::ISEQ); + + if asm.ctx.is_return_landing() { + // Continuation of the end of gen_leave(). + // Reload REG_SP for the current frame and transfer the return value + // to the stack top. + asm.mov(SP, Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP)); + + let top = asm.stack_push(Type::Unknown); + asm.mov(top, C_RET_OPND); + + asm.ctx.clear_return_landing(); } // For each instruction to compile // NOTE: could rewrite this loop with a std::iter::Iterator while insn_idx < iseq_size { // Get the current pc and opcode - let pc = unsafe { rb_iseq_pc_at_idx(iseq, insn_idx) }; + let pc = unsafe { rb_iseq_pc_at_idx(iseq, insn_idx.into()) }; // try_into() call below is unfortunate. Maybe pick i32 instead of usize for opcodes. 
let opcode: usize = unsafe { rb_iseq_opcode_at_pc(iseq, pc) } .try_into() @@ -740,8 +1329,8 @@ pub fn gen_single_block( // We need opt_getconstant_path to be in a block all on its own. Cut the block short // if we run into it. This is necessary because we want to invalidate based on the // instruction's index. - if opcode == YARVINSN_opt_getconstant_path.as_usize() && insn_idx > starting_insn_idx { - jump_to_next_insn(&mut jit, &ctx, &mut asm, ocb); + if opcode == YARVINSN_opt_getconstant_path.as_usize() && insn_idx > jit.starting_insn_idx { + jump_to_next_insn(&mut jit, &mut asm); break; } @@ -749,31 +1338,38 @@ pub fn gen_single_block( jit.insn_idx = insn_idx; jit.opcode = opcode; jit.pc = pc; - jit.side_exit_for_pc = None; + jit.stack_size_for_pc = asm.ctx.get_stack_size(); + asm.set_side_exit_context(pc, asm.ctx.get_stack_size()); + + // stack_pop doesn't immediately deallocate a register for stack temps, + // but it's safe to do so at this instruction boundary. + for stack_idx in asm.ctx.get_stack_size()..MAX_CTX_TEMPS as u8 { + asm.ctx.dealloc_reg(RegOpnd::Stack(stack_idx)); + } // If previous instruction requested to record the boundary if jit.record_boundary_patch_point { // Generate an exit to this instruction and record it - let exit_pos = gen_outlined_exit(jit.pc, &ctx, ocb); + let exit_pos = jit.gen_outlined_exit(jit.pc, &asm.ctx).ok_or(())?; record_global_inval_patch(&mut asm, exit_pos); jit.record_boundary_patch_point = false; } // In debug mode, verify our existing assumption - if cfg!(debug_assertions) && get_option!(verify_ctx) && jit_at_current_insn(&jit) { - verify_ctx(&jit, &ctx); + if cfg!(debug_assertions) && get_option!(verify_ctx) && jit.at_compile_target() { + verify_ctx(&jit, &asm.ctx); } + // :count-placement: + // Count bytecode instructions that execute in generated code. + // Note that the increment happens even when the output takes side exit. + gen_counter_incr(&jit, &mut asm, Counter::yjit_insns_count); + // Lookup the codegen function for this instruction - let mut status = CantCompile; + let mut status = None; if let Some(gen_fn) = get_gen_fn(VALUE(opcode)) { - // :count-placement: - // Count bytecode instructions that execute in generated code. - // Note that the increment happens even when the output takes side exit. - gen_counter_incr!(asm, exec_instruction); - // Add a comment for the name of the YARV instruction - asm.comment(&format!("Insn: {}", insn_name(opcode))); + asm_comment!(asm, "Insn: {:04} {} (stack_size: {})", insn_idx, insn_name(opcode), asm.ctx.get_stack_size()); // If requested, dump instructions for debugging if get_option!(dump_insns) { @@ -782,27 +1378,30 @@ pub fn gen_single_block( } // Call the code generation function - status = gen_fn(&mut jit, &mut ctx, &mut asm, ocb); + jit_perf_symbol_push!(jit, &mut asm, &insn_name(opcode), PerfMap::Codegen); + status = gen_fn(&mut jit, &mut asm); + jit_perf_symbol_pop!(jit, &mut asm, PerfMap::Codegen); + + #[cfg(debug_assertions)] + assert!(!asm.get_leaf_ccall(), "ccall() wasn't used after leaf_ccall was set in {}", insn_name(opcode)); } // If we can't compile this instruction // exit to the interpreter and stop compiling - if status == CantCompile { + if status == None { if get_option!(dump_insns) { println!("can't compile {}", insn_name(opcode)); } - let mut block = jit.block.borrow_mut(); - - // TODO: if the codegen function makes changes to ctx and then return YJIT_CANT_COMPILE, - // the exit this generates would be wrong. 
We could save a copy of the entry context - // and assert that ctx is the same here. - gen_exit(jit.pc, &ctx, &mut asm); + // Rewind stack_size using ctx.with_stack_size to allow stack_size changes + // before you return None. + asm.ctx = asm.ctx.with_stack_size(jit.stack_size_for_pc); + gen_exit(jit.pc, &mut asm); - // If this is the first instruction in the block, then we can use - // the exit for block->entry_exit. - if insn_idx == block.get_blockid().idx { - block.entry_exit = block.get_start_addr(); + // If this is the first instruction in the block, then + // the entry address is the address for block_entry_exit + if insn_idx == jit.starting_insn_idx { + jit.block_entry_exit = Some(jit.output_ptr); } break; @@ -810,347 +1409,384 @@ pub fn gen_single_block( // For now, reset the chain depth after each instruction as only the // first instruction in the block can concern itself with the depth. - ctx.reset_chain_depth(); + asm.ctx.reset_chain_depth_and_defer(); // Move to the next instruction to compile - insn_idx += insn_len(opcode); + insn_idx += insn_len(opcode) as u16; // If the instruction terminates this block - if status == EndBlock { + if status == Some(EndBlock) { break; } } + let end_insn_idx = insn_idx; - // Finish filling out the block - { - let mut block = jit.block.borrow_mut(); - if block.entry_exit.is_some() { - asm.pad_inval_patch(); - } + // We currently can't handle cases where the request is for a block that + // doesn't go to the next instruction in the same iseq. + assert!(!jit.record_boundary_patch_point); - // Compile code into the code block - let gc_offsets = asm.compile(cb); + // Bail when requested to. + if jit.block_abandoned { + incr_counter!(abandoned_block_count); + return Err(()); + } - // Add the GC offsets to the block - block.add_gc_obj_offsets(gc_offsets); + // Pad the block if it has the potential to be invalidated + if jit.block_entry_exit.is_some() { + asm.pad_inval_patch(); + } - // Mark the end position of the block - block.set_end_addr(cb.get_write_ptr()); + // Mark the end of an ISEQ for --yjit-perf + jit_perf_symbol_pop!(jit, &mut asm, PerfMap::ISEQ); - // Store the index of the last instruction in the block - block.set_end_idx(insn_idx); - } + // Compile code into the code block + let (_, gc_offsets) = asm.compile(cb, Some(jit.get_ocb())).ok_or(())?; + let end_addr = cb.get_write_ptr(); - // We currently can't handle cases where the request is for a block that - // doesn't go to the next instruction. 
- assert!(!jit.record_boundary_patch_point); + // Flush perf symbols after asm.compile() writes addresses + if get_option!(perf_map).is_some() { + jit.flush_perf_symbols(cb); + } // If code for the block doesn't fit, fail - if cb.has_dropped_bytes() || ocb.unwrap().has_dropped_bytes() { - free_block(&blockref); + if cb.has_dropped_bytes() || jit.get_ocb().unwrap().has_dropped_bytes() { return Err(()); } // Block compiled successfully - Ok(blockref) + Ok(jit.into_block(end_insn_idx, block_start_addr, end_addr, gc_offsets)) } fn gen_nop( _jit: &mut JITState, - _ctx: &mut Context, _asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { +) -> Option<CodegenStatus> { // Do nothing - KeepCompiling + Some(KeepCompiling) } fn gen_pop( _jit: &mut JITState, - ctx: &mut Context, - _asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { + asm: &mut Assembler, +) -> Option<CodegenStatus> { // Decrement SP - ctx.stack_pop(1); - KeepCompiling + asm.stack_pop(1); + Some(KeepCompiling) } fn gen_dup( _jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - - let dup_val = ctx.stack_pop(0); - let (mapping, tmp_type) = ctx.get_opnd_mapping(StackOpnd(0)); +) -> Option<CodegenStatus> { + let dup_val = asm.stack_opnd(0); + let mapping = asm.ctx.get_opnd_mapping(dup_val.into()); - let loc0 = ctx.stack_push_mapping((mapping, tmp_type)); + let loc0 = asm.stack_push_mapping(mapping); asm.mov(loc0, dup_val); - KeepCompiling + Some(KeepCompiling) } // duplicate stack top n elements fn gen_dupn( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - let n = jit_get_arg(jit, 0).as_usize(); +) -> Option<CodegenStatus> { + let n = jit.get_arg(0).as_usize(); // In practice, seems to be only used for n==2 if n != 2 { - return CantCompile; + return None; } - let opnd1: Opnd = ctx.stack_opnd(1); - let opnd0: Opnd = ctx.stack_opnd(0); + let opnd1: Opnd = asm.stack_opnd(1); + let opnd0: Opnd = asm.stack_opnd(0); - let mapping1 = ctx.get_opnd_mapping(StackOpnd(1)); - let mapping0 = ctx.get_opnd_mapping(StackOpnd(0)); + let mapping1 = asm.ctx.get_opnd_mapping(opnd1.into()); + let mapping0 = asm.ctx.get_opnd_mapping(opnd0.into()); - let dst1: Opnd = ctx.stack_push_mapping(mapping1); + let dst1: Opnd = asm.stack_push_mapping(mapping1); asm.mov(dst1, opnd1); - let dst0: Opnd = ctx.stack_push_mapping(mapping0); + let dst0: Opnd = asm.stack_push_mapping(mapping0); asm.mov(dst0, opnd0); - KeepCompiling + Some(KeepCompiling) +} + +// Reverse top X stack entries +fn gen_opt_reverse( + jit: &mut JITState, + asm: &mut Assembler, +) -> Option<CodegenStatus> { + let count = jit.get_arg(0).as_i32(); + for n in 0..(count/2) { + stack_swap(asm, n, count - 1 - n); + } + Some(KeepCompiling) } // Swap top 2 stack entries fn gen_swap( - jit: &mut JITState, - ctx: &mut Context, + _jit: &mut JITState, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - stack_swap(jit, ctx, asm, 0, 1); - KeepCompiling +) -> Option<CodegenStatus> { + stack_swap(asm, 0, 1); + Some(KeepCompiling) } fn stack_swap( - _jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - offset0: u16, - offset1: u16, + offset0: i32, + offset1: i32, ) { - let stack0_mem = ctx.stack_opnd(offset0 as i32); - let stack1_mem = ctx.stack_opnd(offset1 as i32); + let stack0_mem = asm.stack_opnd(offset0); + let stack1_mem = asm.stack_opnd(offset1); - let mapping0 = ctx.get_opnd_mapping(StackOpnd(offset0)); - let mapping1 = 
ctx.get_opnd_mapping(StackOpnd(offset1)); + let mapping0 = asm.ctx.get_opnd_mapping(stack0_mem.into()); + let mapping1 = asm.ctx.get_opnd_mapping(stack1_mem.into()); let stack0_reg = asm.load(stack0_mem); let stack1_reg = asm.load(stack1_mem); asm.mov(stack0_mem, stack1_reg); asm.mov(stack1_mem, stack0_reg); - ctx.set_opnd_mapping(StackOpnd(offset0), mapping1); - ctx.set_opnd_mapping(StackOpnd(offset1), mapping0); + asm.ctx.set_opnd_mapping(stack0_mem.into(), mapping1); + asm.ctx.set_opnd_mapping(stack1_mem.into(), mapping0); } fn gen_putnil( - jit: &mut JITState, - ctx: &mut Context, + _jit: &mut JITState, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - jit_putobject(jit, ctx, asm, Qnil); - KeepCompiling +) -> Option<CodegenStatus> { + jit_putobject(asm, Qnil); + Some(KeepCompiling) } -fn jit_putobject(_jit: &mut JITState, ctx: &mut Context, asm: &mut Assembler, arg: VALUE) { +fn jit_putobject(asm: &mut Assembler, arg: VALUE) { let val_type: Type = Type::from(arg); - let stack_top = ctx.stack_push(val_type); + let stack_top = asm.stack_push(val_type); asm.mov(stack_top, arg.into()); } fn gen_putobject_int2fix( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { +) -> Option<CodegenStatus> { let opcode = jit.opcode; let cst_val: usize = if opcode == YARVINSN_putobject_INT2FIX_0_.as_usize() { 0 } else { 1 }; + let cst_val = VALUE::fixnum_from_usize(cst_val); + + if let Some(result) = fuse_putobject_opt_ltlt(jit, asm, cst_val) { + return Some(result); + } - jit_putobject(jit, ctx, asm, VALUE::fixnum_from_usize(cst_val)); - KeepCompiling + jit_putobject(asm, cst_val); + Some(KeepCompiling) } fn gen_putobject( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - let arg: VALUE = jit_get_arg(jit, 0); +) -> Option<CodegenStatus> { + let arg: VALUE = jit.get_arg(0); + + if let Some(result) = fuse_putobject_opt_ltlt(jit, asm, arg) { + return Some(result); + } - jit_putobject(jit, ctx, asm, arg); - KeepCompiling + jit_putobject(asm, arg); + Some(KeepCompiling) +} + +/// Combine `putobject` and `opt_ltlt` together if profitable, for example when +/// left shifting an integer by a constant amount. 
+fn fuse_putobject_opt_ltlt( + jit: &mut JITState, + asm: &mut Assembler, + constant_object: VALUE, +) -> Option<CodegenStatus> { + let next_opcode = unsafe { rb_vm_insn_addr2opcode(jit.pc.add(insn_len(jit.opcode).as_usize()).read().as_ptr()) }; + if next_opcode == YARVINSN_opt_ltlt as i32 && constant_object.fixnum_p() { + // Untag the fixnum shift amount + let shift_amt = constant_object.as_isize() >> 1; + if shift_amt > 63 || shift_amt < 0 { + return None; + } + if !jit.at_compile_target() { + return jit.defer_compilation(asm); + } + + let lhs = jit.peek_at_stack(&asm.ctx, 0); + if !lhs.fixnum_p() { + return None; + } + + if !assume_bop_not_redefined(jit, asm, INTEGER_REDEFINED_OP_FLAG, BOP_LTLT) { + return None; + } + + asm_comment!(asm, "integer left shift with rhs={shift_amt}"); + let lhs = asm.stack_opnd(0); + + // Guard that lhs is a fixnum if necessary + let lhs_type = asm.ctx.get_opnd_type(lhs.into()); + if lhs_type != Type::Fixnum { + asm_comment!(asm, "guard arg0 fixnum"); + asm.test(lhs, Opnd::UImm(RUBY_FIXNUM_FLAG as u64)); + + jit_chain_guard( + JCC_JZ, + jit, + asm, + SEND_MAX_DEPTH, + Counter::guard_send_not_fixnums, + ); + } + + asm.stack_pop(1); + fixnum_left_shift_body(asm, lhs, shift_amt as u64); + return end_block_with_jump(jit, asm, jit.next_next_insn_idx()); + } + return None; } fn gen_putself( _jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { +) -> Option<CodegenStatus> { // Write it on the stack - let stack_top = ctx.stack_push_self(); + let stack_top = asm.stack_push_self(); asm.mov( stack_top, Opnd::mem(VALUE_BITS, CFP, RUBY_OFFSET_CFP_SELF) ); - KeepCompiling + Some(KeepCompiling) } fn gen_putspecialobject( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - let object_type = jit_get_arg(jit, 0).as_usize(); +) -> Option<CodegenStatus> { + let object_type = jit.get_arg(0).as_usize(); if object_type == VM_SPECIAL_OBJECT_VMCORE.as_usize() { - let stack_top = ctx.stack_push(Type::UnknownHeap); + let stack_top = asm.stack_push(Type::UnknownHeap); let frozen_core = unsafe { rb_mRubyVMFrozenCore }; asm.mov(stack_top, frozen_core.into()); - KeepCompiling + Some(KeepCompiling) } else { // TODO: implement for VM_SPECIAL_OBJECT_CBASE and // VM_SPECIAL_OBJECT_CONST_BASE - CantCompile + None } } // set Nth stack entry to stack top fn gen_setn( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - let n = jit_get_arg(jit, 0).as_usize(); +) -> Option<CodegenStatus> { + let n = jit.get_arg(0).as_usize(); - let top_val = ctx.stack_pop(0); - let dst_opnd = ctx.stack_opnd(n.try_into().unwrap()); + let top_val = asm.stack_opnd(0); + let dst_opnd = asm.stack_opnd(n.try_into().unwrap()); asm.mov( dst_opnd, top_val ); - let mapping = ctx.get_opnd_mapping(StackOpnd(0)); - ctx.set_opnd_mapping(StackOpnd(n.try_into().unwrap()), mapping); + let mapping = asm.ctx.get_opnd_mapping(top_val.into()); + asm.ctx.set_opnd_mapping(dst_opnd.into(), mapping); - KeepCompiling + Some(KeepCompiling) } // get nth stack value, then push it fn gen_topn( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - let n = jit_get_arg(jit, 0).as_usize(); +) -> Option<CodegenStatus> { + let n = jit.get_arg(0).as_usize(); - let top_n_val = ctx.stack_opnd(n.try_into().unwrap()); - let mapping = ctx.get_opnd_mapping(StackOpnd(n.try_into().unwrap())); - let loc0 = 
ctx.stack_push_mapping(mapping); + let top_n_val = asm.stack_opnd(n.try_into().unwrap()); + let mapping = asm.ctx.get_opnd_mapping(top_n_val.into()); + let loc0 = asm.stack_push_mapping(mapping); asm.mov(loc0, top_n_val); - KeepCompiling + Some(KeepCompiling) } // Pop n values off the stack fn gen_adjuststack( jit: &mut JITState, - ctx: &mut Context, - _cb: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - let n = jit_get_arg(jit, 0).as_usize(); - ctx.stack_pop(n); - KeepCompiling + asm: &mut Assembler, +) -> Option<CodegenStatus> { + let n = jit.get_arg(0).as_usize(); + asm.stack_pop(n); + Some(KeepCompiling) } fn gen_opt_plus( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - if !jit_at_current_insn(jit) { - defer_compilation(jit, ctx, asm, ocb); - return EndBlock; - } - - let comptime_a = jit_peek_at_stack(jit, ctx, 1); - let comptime_b = jit_peek_at_stack(jit, ctx, 0); - - if comptime_a.fixnum_p() && comptime_b.fixnum_p() { - // Create a side-exit to fall back to the interpreter - // Note: we generate the side-exit before popping operands from the stack - let side_exit = get_side_exit(jit, ocb, ctx); +) -> Option<CodegenStatus> { + let two_fixnums = match asm.ctx.two_fixnums_on_stack(jit) { + Some(two_fixnums) => two_fixnums, + None => { + return jit.defer_compilation(asm); + } + }; - if !assume_bop_not_redefined(jit, ocb, INTEGER_REDEFINED_OP_FLAG, BOP_PLUS) { - return CantCompile; + if two_fixnums { + if !assume_bop_not_redefined(jit, asm, INTEGER_REDEFINED_OP_FLAG, BOP_PLUS) { + return None; } // Check that both operands are fixnums - guard_two_fixnums(jit, ctx, asm, ocb, side_exit); + guard_two_fixnums(jit, asm); // Get the operands from the stack - let arg1 = ctx.stack_pop(1); - let arg0 = ctx.stack_pop(1); + let arg1 = asm.stack_pop(1); + let arg0 = asm.stack_pop(1); // Add arg0 + arg1 and test for overflow let arg0_untag = asm.sub(arg0, Opnd::Imm(1)); let out_val = asm.add(arg0_untag, arg1); - asm.jo(side_exit); + asm.jo(Target::side_exit(Counter::opt_plus_overflow)); // Push the output on the stack - let dst = ctx.stack_push(Type::Fixnum); + let dst = asm.stack_push(Type::Fixnum); asm.mov(dst, out_val); - KeepCompiling + Some(KeepCompiling) } else { - gen_opt_send_without_block(jit, ctx, asm, ocb) + gen_opt_send_without_block(jit, asm) } } // new array initialized from top N values fn gen_newarray( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - let n = jit_get_arg(jit, 0).as_u32(); +) -> Option<CodegenStatus> { + let n = jit.get_arg(0).as_u32(); // Save the PC and SP because we are allocating - jit_prepare_routine_call(jit, ctx, asm); + jit_prepare_call_with_gc(jit, asm); // If n is 0, then elts is never going to be read, so we can just pass null let values_ptr = if n == 0 { Opnd::UImm(0) } else { - asm.comment("load pointer to array elts"); - let offset_magnitude = (SIZEOF_VALUE as u32) * n; - let values_opnd = ctx.sp_opnd(-(offset_magnitude as isize)); + asm_comment!(asm, "load pointer to array elements"); + let values_opnd = asm.ctx.sp_opnd(-(n as i32)); asm.lea(values_opnd) }; @@ -1164,24 +1800,22 @@ fn gen_newarray( ] ); - ctx.stack_pop(n.as_usize()); - let stack_ret = ctx.stack_push(Type::CArray); + asm.stack_pop(n.as_usize()); + let stack_ret = asm.stack_push(Type::CArray); asm.mov(stack_ret, new_ary); - KeepCompiling + Some(KeepCompiling) } // dup array fn gen_duparray( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - 
_ocb: &mut OutlinedCb, -) -> CodegenStatus { - let ary = jit_get_arg(jit, 0); +) -> Option<CodegenStatus> { + let ary = jit.get_arg(0); // Save the PC and SP because we are allocating - jit_prepare_routine_call(jit, ctx, asm); + jit_prepare_call_with_gc(jit, asm); // call rb_ary_resurrect(VALUE ary); let new_ary = asm.ccall( @@ -1189,141 +1823,373 @@ fn gen_duparray( vec![ary.into()], ); - let stack_ret = ctx.stack_push(Type::CArray); + let stack_ret = asm.stack_push(Type::CArray); asm.mov(stack_ret, new_ary); - KeepCompiling + Some(KeepCompiling) } // dup hash fn gen_duphash( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - let hash = jit_get_arg(jit, 0); +) -> Option<CodegenStatus> { + let hash = jit.get_arg(0); // Save the PC and SP because we are allocating - jit_prepare_routine_call(jit, ctx, asm); + jit_prepare_call_with_gc(jit, asm); // call rb_hash_resurrect(VALUE hash); let hash = asm.ccall(rb_hash_resurrect as *const u8, vec![hash.into()]); - let stack_ret = ctx.stack_push(Type::Hash); + let stack_ret = asm.stack_push(Type::CHash); asm.mov(stack_ret, hash); - KeepCompiling + Some(KeepCompiling) } // call to_a on the array on the stack fn gen_splatarray( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - let flag = jit_get_arg(jit, 0).as_usize(); +) -> Option<CodegenStatus> { + let flag = jit.get_arg(0).as_usize(); - // Save the PC and SP because the callee may allocate + // Save the PC and SP because the callee may call #to_a // Note that this modifies REG_SP, which is why we do it first - jit_prepare_routine_call(jit, ctx, asm); + jit_prepare_non_leaf_call(jit, asm); // Get the operands from the stack - let ary_opnd = ctx.stack_pop(1); + let ary_opnd = asm.stack_opnd(0); // Call rb_vm_splat_array(flag, ary) let ary = asm.ccall(rb_vm_splat_array as *const u8, vec![flag.into(), ary_opnd]); + asm.stack_pop(1); // Keep it on stack during ccall for GC - let stack_ret = ctx.stack_push(Type::TArray); + let stack_ret = asm.stack_push(Type::TArray); asm.mov(stack_ret, ary); - KeepCompiling + Some(KeepCompiling) +} + +// call to_hash on hash to keyword splat before converting block +// e.g. foo(**object, &block) +fn gen_splatkw( + jit: &mut JITState, + asm: &mut Assembler, +) -> Option<CodegenStatus> { + // Defer compilation so we can specialize on a runtime hash operand + if !jit.at_compile_target() { + return jit.defer_compilation(asm); + } + + let comptime_hash = jit.peek_at_stack(&asm.ctx, 1); + if comptime_hash.hash_p() { + // If a compile-time hash operand is T_HASH, just guard that it's T_HASH. + let hash_opnd = asm.stack_opnd(1); + guard_object_is_hash(asm, hash_opnd, hash_opnd.into(), Counter::splatkw_not_hash); + } else if comptime_hash.nil_p() { + // Speculate we'll see nil if compile-time hash operand is nil + let hash_opnd = asm.stack_opnd(1); + let hash_opnd_type = asm.ctx.get_opnd_type(hash_opnd.into()); + + if hash_opnd_type != Type::Nil { + asm.cmp(hash_opnd, Qnil.into()); + asm.jne(Target::side_exit(Counter::splatkw_not_nil)); + + if Type::Nil.diff(hash_opnd_type) != TypeDiff::Incompatible { + asm.ctx.upgrade_opnd_type(hash_opnd.into(), Type::Nil); + } + } + } else { + // Otherwise, call #to_hash on the operand if it's not nil. 
+ + // Save the PC and SP because the callee may call #to_hash + jit_prepare_non_leaf_call(jit, asm); + + // Get the operands from the stack + let block_opnd = asm.stack_opnd(0); + let block_type = asm.ctx.get_opnd_type(block_opnd.into()); + let hash_opnd = asm.stack_opnd(1); + + c_callable! { + fn to_hash_if_not_nil(mut obj: VALUE) -> VALUE { + if obj != Qnil { + obj = unsafe { rb_to_hash_type(obj) }; + } + obj + } + } + + let hash = asm.ccall(to_hash_if_not_nil as _, vec![hash_opnd]); + asm.stack_pop(2); // Keep it on stack during ccall for GC + + let stack_ret = asm.stack_push(Type::Unknown); + asm.mov(stack_ret, hash); + asm.stack_push(block_type); + // Leave block_opnd spilled by ccall as is + asm.ctx.dealloc_reg(RegOpnd::Stack(asm.ctx.get_stack_size() - 1)); + } + + Some(KeepCompiling) } // concat two arrays fn gen_concatarray( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - // Save the PC and SP because the callee may allocate +) -> Option<CodegenStatus> { + // Save the PC and SP because the callee may call #to_a // Note that this modifies REG_SP, which is why we do it first - jit_prepare_routine_call(jit, ctx, asm); + jit_prepare_non_leaf_call(jit, asm); // Get the operands from the stack - let ary2st_opnd = ctx.stack_pop(1); - let ary1_opnd = ctx.stack_pop(1); + let ary2st_opnd = asm.stack_opnd(0); + let ary1_opnd = asm.stack_opnd(1); // Call rb_vm_concat_array(ary1, ary2st) let ary = asm.ccall(rb_vm_concat_array as *const u8, vec![ary1_opnd, ary2st_opnd]); + asm.stack_pop(2); // Keep them on stack during ccall for GC + + let stack_ret = asm.stack_push(Type::TArray); + asm.mov(stack_ret, ary); + + Some(KeepCompiling) +} + +// concat second array to first array. +// first argument must already be an array. +// attempts to convert second object to array using to_a. +fn gen_concattoarray( + jit: &mut JITState, + asm: &mut Assembler, +) -> Option<CodegenStatus> { + // Save the PC and SP because the callee may call #to_a + jit_prepare_non_leaf_call(jit, asm); + + // Get the operands from the stack + let ary2_opnd = asm.stack_opnd(0); + let ary1_opnd = asm.stack_opnd(1); + + let ary = asm.ccall(rb_vm_concat_to_array as *const u8, vec![ary1_opnd, ary2_opnd]); + asm.stack_pop(2); // Keep them on stack during ccall for GC + + let stack_ret = asm.stack_push(Type::TArray); + asm.mov(stack_ret, ary); + + Some(KeepCompiling) +} + +// push given number of objects to array directly before. 
+fn gen_pushtoarray( + jit: &mut JITState, + asm: &mut Assembler, +) -> Option<CodegenStatus> { + let num = jit.get_arg(0).as_u64(); - let stack_ret = ctx.stack_push(Type::TArray); + // Save the PC and SP because the callee may allocate + jit_prepare_call_with_gc(jit, asm); + + // Get the operands from the stack + let ary_opnd = asm.stack_opnd(num as i32); + let objp_opnd = asm.lea(asm.ctx.sp_opnd(-(num as i32))); + + let ary = asm.ccall(rb_ary_cat as *const u8, vec![ary_opnd, objp_opnd, num.into()]); + asm.stack_pop(num as usize + 1); // Keep it on stack during ccall for GC + + let stack_ret = asm.stack_push(Type::TArray); asm.mov(stack_ret, ary); - KeepCompiling + Some(KeepCompiling) } // new range initialized from top 2 values fn gen_newrange( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - let flag = jit_get_arg(jit, 0).as_usize(); +) -> Option<CodegenStatus> { + let flag = jit.get_arg(0).as_usize(); // rb_range_new() allocates and can raise - jit_prepare_routine_call(jit, ctx, asm); + jit_prepare_non_leaf_call(jit, asm); // val = rb_range_new(low, high, (int)flag); let range_opnd = asm.ccall( rb_range_new as *const u8, vec![ - ctx.stack_opnd(1), - ctx.stack_opnd(0), + asm.stack_opnd(1), + asm.stack_opnd(0), flag.into() ] ); - ctx.stack_pop(2); - let stack_ret = ctx.stack_push(Type::UnknownHeap); + asm.stack_pop(2); + let stack_ret = asm.stack_push(Type::UnknownHeap); asm.mov(stack_ret, range_opnd); - KeepCompiling + Some(KeepCompiling) } fn guard_object_is_heap( asm: &mut Assembler, - object_opnd: Opnd, - side_exit: Target, + object: Opnd, + object_opnd: YARVOpnd, + counter: Counter, ) { - asm.comment("guard object is heap"); + let object_type = asm.ctx.get_opnd_type(object_opnd); + if object_type.is_heap() { + return; + } + + asm_comment!(asm, "guard object is heap"); // Test that the object is not an immediate - asm.test(object_opnd, (RUBY_IMMEDIATE_MASK as u64).into()); - asm.jnz(side_exit); + asm.test(object, (RUBY_IMMEDIATE_MASK as u64).into()); + asm.jnz(Target::side_exit(counter)); // Test that the object is not false - asm.cmp(object_opnd, Qfalse.into()); - asm.je(side_exit); + asm.cmp(object, Qfalse.into()); + asm.je(Target::side_exit(counter)); + + if Type::UnknownHeap.diff(object_type) != TypeDiff::Incompatible { + asm.ctx.upgrade_opnd_type(object_opnd, Type::UnknownHeap); + } } fn guard_object_is_array( asm: &mut Assembler, - object_opnd: Opnd, - side_exit: Target, + object: Opnd, + object_opnd: YARVOpnd, + counter: Counter, ) { - asm.comment("guard object is array"); + let object_type = asm.ctx.get_opnd_type(object_opnd); + if object_type.is_array() { + return; + } + + let object_reg = match object { + Opnd::InsnOut { .. 
} => object, + _ => asm.load(object), + }; + guard_object_is_heap(asm, object_reg, object_opnd, counter); + + asm_comment!(asm, "guard object is array"); // Pull out the type mask - let flags_opnd = Opnd::mem(VALUE_BITS, object_opnd, RUBY_OFFSET_RBASIC_FLAGS); + let flags_opnd = Opnd::mem(VALUE_BITS, object_reg, RUBY_OFFSET_RBASIC_FLAGS); let flags_opnd = asm.and(flags_opnd, (RUBY_T_MASK as u64).into()); // Compare the result with T_ARRAY asm.cmp(flags_opnd, (RUBY_T_ARRAY as u64).into()); - asm.jne(side_exit); + asm.jne(Target::side_exit(counter)); + + if Type::TArray.diff(object_type) != TypeDiff::Incompatible { + asm.ctx.upgrade_opnd_type(object_opnd, Type::TArray); + } +} + +fn guard_object_is_hash( + asm: &mut Assembler, + object: Opnd, + object_opnd: YARVOpnd, + counter: Counter, +) { + let object_type = asm.ctx.get_opnd_type(object_opnd); + if object_type.is_hash() { + return; + } + + let object_reg = match object { + Opnd::InsnOut { .. } => object, + _ => asm.load(object), + }; + guard_object_is_heap(asm, object_reg, object_opnd, counter); + + asm_comment!(asm, "guard object is hash"); + + // Pull out the type mask + let flags_opnd = Opnd::mem(VALUE_BITS, object_reg, RUBY_OFFSET_RBASIC_FLAGS); + let flags_opnd = asm.and(flags_opnd, (RUBY_T_MASK as u64).into()); + + // Compare the result with T_HASH + asm.cmp(flags_opnd, (RUBY_T_HASH as u64).into()); + asm.jne(Target::side_exit(counter)); + + if Type::THash.diff(object_type) != TypeDiff::Incompatible { + asm.ctx.upgrade_opnd_type(object_opnd, Type::THash); + } +} + +fn guard_object_is_fixnum( + jit: &mut JITState, + asm: &mut Assembler, + object: Opnd, + object_opnd: YARVOpnd +) { + let object_type = asm.ctx.get_opnd_type(object_opnd); + if object_type.is_heap() { + asm_comment!(asm, "arg is heap object"); + asm.jmp(Target::side_exit(Counter::guard_send_not_fixnum)); + return; + } + + if object_type != Type::Fixnum && object_type.is_specific() { + asm_comment!(asm, "arg is not fixnum"); + asm.jmp(Target::side_exit(Counter::guard_send_not_fixnum)); + return; + } + + assert!(!object_type.is_heap()); + assert!(object_type == Type::Fixnum || object_type.is_unknown()); + + // If not fixnums at run-time, fall back + if object_type != Type::Fixnum { + asm_comment!(asm, "guard object fixnum"); + asm.test(object, Opnd::UImm(RUBY_FIXNUM_FLAG as u64)); + + jit_chain_guard( + JCC_JZ, + jit, + asm, + SEND_MAX_DEPTH, + Counter::guard_send_not_fixnum, + ); + } + + // Set the stack type in the context. + asm.ctx.upgrade_opnd_type(object.into(), Type::Fixnum); +} + +fn guard_object_is_string( + asm: &mut Assembler, + object: Opnd, + object_opnd: YARVOpnd, + counter: Counter, +) { + let object_type = asm.ctx.get_opnd_type(object_opnd); + if object_type.is_string() { + return; + } + + let object_reg = match object { + Opnd::InsnOut { .. } => object, + _ => asm.load(object), + }; + guard_object_is_heap(asm, object_reg, object_opnd, counter); + + asm_comment!(asm, "guard object is string"); + + // Pull out the type mask + let flags_reg = asm.load(Opnd::mem(VALUE_BITS, object_reg, RUBY_OFFSET_RBASIC_FLAGS)); + let flags_reg = asm.and(flags_reg, Opnd::UImm(RUBY_T_MASK as u64)); + + // Compare the result with T_STRING + asm.cmp(flags_reg, Opnd::UImm(RUBY_T_STRING as u64)); + asm.jne(Target::side_exit(counter)); + + if Type::TString.diff(object_type) != TypeDiff::Incompatible { + asm.ctx.upgrade_opnd_type(object_opnd, Type::TString); + } } /// This guards that a special flag is not set on a hash. 
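+/// The flag in question is RHASH_PASS_AS_KEYWORDS, which marks a ruby2_keywords hash.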
@@ -1333,9 +2199,9 @@ fn guard_object_is_array( fn guard_object_is_not_ruby2_keyword_hash( asm: &mut Assembler, object_opnd: Opnd, - side_exit: Target, + counter: Counter, ) { - asm.comment("guard object is not ruby2 keyword hash"); + asm_comment!(asm, "guard object is not ruby2 keyword hash"); let not_ruby2_keyword = asm.new_label("not_ruby2_keyword"); asm.test(object_opnd, (RUBY_IMMEDIATE_MASK as u64).into()); @@ -1355,159 +2221,168 @@ fn guard_object_is_not_ruby2_keyword_hash( asm.jne(not_ruby2_keyword); asm.test(flags_opnd, (RHASH_PASS_AS_KEYWORDS as u64).into()); - asm.jnz(side_exit); + asm.jnz(Target::side_exit(counter)); asm.write_label(not_ruby2_keyword); } -fn guard_object_is_string( - asm: &mut Assembler, - object_reg: Opnd, - side_exit: Target, -) { - asm.comment("guard object is string"); - - // Pull out the type mask - let flags_reg = asm.load(Opnd::mem(VALUE_BITS, object_reg, RUBY_OFFSET_RBASIC_FLAGS)); - let flags_reg = asm.and(flags_reg, Opnd::UImm(RUBY_T_MASK as u64)); - - // Compare the result with T_STRING - asm.cmp(flags_reg, Opnd::UImm(RUBY_T_STRING as u64)); - asm.jne(side_exit); -} - -// push enough nils onto the stack to fill out an array +/// This instruction pops a single value off the stack, converts it to an +/// array (using the #to_ary method) if it isn't already one, and then pushes +/// the values from the array back onto the stack. fn gen_expandarray( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { +) -> Option<CodegenStatus> { // Both arguments are rb_num_t which is unsigned - let num = jit_get_arg(jit, 0).as_usize(); - let flag = jit_get_arg(jit, 1).as_usize(); + let num = jit.get_arg(0).as_u32(); + let flag = jit.get_arg(1).as_usize(); // If this instruction has the splat flag, then bail out. if flag & 0x01 != 0 { - gen_counter_incr!(asm, expandarray_splat); - return CantCompile; + gen_counter_incr(jit, asm, Counter::expandarray_splat); + return None; } // If this instruction has the postarg flag, then bail out. if flag & 0x02 != 0 { - gen_counter_incr!(asm, expandarray_postarg); - return CantCompile; + gen_counter_incr(jit, asm, Counter::expandarray_postarg); + return None; } - let side_exit = get_side_exit(jit, ocb, ctx); + let array_opnd = asm.stack_opnd(0); - let array_type = ctx.get_opnd_type(StackOpnd(0)); - let array_opnd = ctx.stack_pop(1); + // Defer compilation so we can specialize on a runtime `self` + if !jit.at_compile_target() { + return jit.defer_compilation(asm); + } - // num is the number of requested values. If there aren't enough in the - // array then we're going to push on nils. - if matches!(array_type, Type::Nil) { - // special case for a, b = nil pattern - // push N nils onto the stack - for _ in 0..num { - let push_opnd = ctx.stack_push(Type::Nil); + let comptime_recv = jit.peek_at_stack(&asm.ctx, 0); + + // If the comptime receiver is not an array, speculate for when the `rb_check_array_type()` + // conversion returns nil without side-effects (e.g. arbitrary method calls).
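+ // When that speculation holds, the receiver behaves like a one-element array: the object itself plus Qnil for any remaining slots (see the nil-pushing loop below).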
+ if !unsafe { RB_TYPE_P(comptime_recv, RUBY_T_ARRAY) } { + // at compile time, ensure to_ary is not defined + let target_cme = unsafe { rb_callable_method_entry_or_negative(comptime_recv.class_of(), ID!(to_ary)) }; + let cme_def_type = unsafe { get_cme_def_type(target_cme) }; + + // if to_ary is defined, return can't compile so to_ary can be called + if cme_def_type != VM_METHOD_TYPE_UNDEF { + gen_counter_incr(jit, asm, Counter::expandarray_to_ary); + return None; + } + + // Bail when method_missing is defined to avoid generating code to call it. + // Also, for simplicity, bail when BasicObject#method_missing has been removed. + if !assume_method_basic_definition(jit, asm, comptime_recv.class_of(), ID!(method_missing)) { + gen_counter_incr(jit, asm, Counter::expandarray_method_missing); + return None; + } + + // invalidate compile block if to_ary is later defined + jit.assume_method_lookup_stable(asm, target_cme); + + jit_guard_known_klass( + jit, + asm, + array_opnd, + array_opnd.into(), + comptime_recv, + SEND_MAX_DEPTH, + Counter::expandarray_not_array, + ); + + let opnd = asm.stack_pop(1); // pop after using the type info + + // If we don't actually want any values, then just keep going + if num == 0 { + return Some(KeepCompiling); + } + + // load opnd to avoid a race because we are also pushing onto the stack + let opnd = asm.load(opnd); + + for _ in 1..num { + let push_opnd = asm.stack_push(Type::Nil); asm.mov(push_opnd, Qnil.into()); } - return KeepCompiling; + + let push_opnd = asm.stack_push(Type::Unknown); + asm.mov(push_opnd, opnd); + + return Some(KeepCompiling); } + // Get the compile-time array length + let comptime_len = unsafe { rb_jit_array_len(comptime_recv) as u32 }; + // Move the array from the stack and check that it's an array. - let array_reg = asm.load(array_opnd); - guard_object_is_heap( - asm, - array_reg, - counted_exit!(ocb, side_exit, expandarray_not_array), - ); guard_object_is_array( asm, - array_reg, - counted_exit!(ocb, side_exit, expandarray_not_array), + array_opnd, + array_opnd.into(), + Counter::expandarray_not_array, ); // If we don't actually want any values, then just return. if num == 0 { - return KeepCompiling; + asm.stack_pop(1); // pop the array + return Some(KeepCompiling); } - // Pull out the embed flag to check if it's an embedded array. - let flags_opnd = Opnd::mem(VALUE_BITS, array_reg, RUBY_OFFSET_RBASIC_FLAGS); - - // Move the length of the embedded array into REG1. - let emb_len_opnd = asm.and(flags_opnd, (RARRAY_EMBED_LEN_MASK as u64).into()); - let emb_len_opnd = asm.rshift(emb_len_opnd, (RARRAY_EMBED_LEN_SHIFT as u64).into()); - - // Conditionally move the length of the heap array into REG1. - let flags_opnd = Opnd::mem(VALUE_BITS, array_reg, RUBY_OFFSET_RBASIC_FLAGS); - asm.test(flags_opnd, (RARRAY_EMBED_FLAG as u64).into()); - let array_len_opnd = Opnd::mem( - (8 * size_of::<std::os::raw::c_long>()) as u8, - asm.load(array_opnd), - RUBY_OFFSET_RARRAY_AS_HEAP_LEN, - ); - let array_len_opnd = asm.csel_nz(emb_len_opnd, array_len_opnd); - - // Only handle the case where the number of values in the array is greater - // than or equal to the number of values requested. - asm.cmp(array_len_opnd, num.into()); - asm.jl(counted_exit!(ocb, side_exit, expandarray_rhs_too_small)); - - // Load the address of the embedded array into REG1. 
- // (struct RArray *)(obj)->as.ary + let array_opnd = asm.stack_opnd(0); let array_reg = asm.load(array_opnd); - let ary_opnd = asm.lea(Opnd::mem(VALUE_BITS, array_reg, RUBY_OFFSET_RARRAY_AS_ARY)); + let array_len_opnd = get_array_len(asm, array_reg); - // Conditionally load the address of the heap array into REG1. - // (struct RArray *)(obj)->as.heap.ptr - let flags_opnd = Opnd::mem(VALUE_BITS, array_reg, RUBY_OFFSET_RBASIC_FLAGS); - asm.test(flags_opnd, Opnd::UImm(RARRAY_EMBED_FLAG as u64)); - let heap_ptr_opnd = Opnd::mem( - (8 * size_of::<usize>()) as u8, - asm.load(array_opnd), - RUBY_OFFSET_RARRAY_AS_HEAP_PTR, - ); - let ary_opnd = asm.csel_nz(ary_opnd, heap_ptr_opnd); + // Guard on the comptime/expected array length + if comptime_len >= num { + asm_comment!(asm, "guard array length >= {}", num); + asm.cmp(array_len_opnd, num.into()); + jit_chain_guard( + JCC_JB, + jit, + asm, + EXPANDARRAY_MAX_CHAIN_DEPTH, + Counter::expandarray_chain_max_depth, + ); - // Loop backward through the array and push each element onto the stack. - for i in (0..num).rev() { - let top = ctx.stack_push(Type::Unknown); - let offset = i32::try_from(i * SIZEOF_VALUE).unwrap(); - asm.mov(top, Opnd::mem(64, ary_opnd, offset)); + } else { + asm_comment!(asm, "guard array length == {}", comptime_len); + asm.cmp(array_len_opnd, comptime_len.into()); + jit_chain_guard( + JCC_JNE, + jit, + asm, + EXPANDARRAY_MAX_CHAIN_DEPTH, + Counter::expandarray_chain_max_depth, + ); } - KeepCompiling -} + let array_opnd = asm.stack_pop(1); // pop after using the type info -fn gen_getlocal_wc0( - jit: &mut JITState, - ctx: &mut Context, - asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - // Compute the offset from BP to the local - let slot_idx = jit_get_arg(jit, 0).as_i32(); - let offs: i32 = -SIZEOF_VALUE_I32 * slot_idx; - let local_idx = slot_to_local_idx(jit.get_iseq(), slot_idx); - - // Load environment pointer EP (level 0) from CFP - let ep_opnd = gen_get_ep(asm, 0); + // Load the pointer to the embedded or heap array + let ary_opnd = if comptime_len > 0 { + let array_reg = asm.load(array_opnd); + Some(get_array_ptr(asm, array_reg)) + } else { + None + }; - // Load the local from the EP - let local_opnd = Opnd::mem(64, ep_opnd, offs); + // Loop backward through the array and push each element onto the stack. + for i in (0..num).rev() { + let top = asm.stack_push(if i < comptime_len { Type::Unknown } else { Type::Nil }); + let offset = i32::try_from(i * (SIZEOF_VALUE as u32)).unwrap(); - // Write the local at SP - let stack_top = ctx.stack_push_local(local_idx.as_usize()); - asm.mov(stack_top, local_opnd); + // Missing elements are Qnil + asm_comment!(asm, "load array[{}]", i); + let elem_opnd = if i < comptime_len { Opnd::mem(64, ary_opnd.unwrap(), offset) } else { Qnil.into() }; + asm.mov(top, elem_opnd); + } - KeepCompiling + Some(KeepCompiling) } // Compute the index of a local variable from its slot index -fn slot_to_local_idx(iseq: IseqPtr, slot_idx: i32) -> u32 { +fn ep_offset_to_local_idx(iseq: IseqPtr, ep_offset: u32) -> u32 { // Layout illustration // This is an array of VALUE // | VM_ENV_DATA_SIZE | @@ -1518,7 +2393,7 @@ fn slot_to_local_idx(iseq: IseqPtr, slot_idx: i32) -> u32 { // ^ ^ ^ ^ // +-------+---local_table_size----+ cfp->ep--+ // | | - // +------------------slot_idx----------------+ + // +------------------ep_offset---------------+ // // See usages of local_var_name() from iseq.c for similar calculation. 
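+ // For example, if local_table_size == 3 and VM_ENV_DATA_SIZE == 3, then ep_offset == 5 maps to local_idx == 3 - (5 - 3) - 1 == 0, i.e. the first local in the table.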
@@ -1526,7 +2401,7 @@ fn slot_to_local_idx(iseq: IseqPtr, slot_idx: i32) -> u32 { let local_table_size: i32 = unsafe { get_iseq_body_local_table_size(iseq) } .try_into() .unwrap(); - let op = slot_idx - (VM_ENV_DATA_SIZE as i32); + let op = (ep_offset - VM_ENV_DATA_SIZE) as i32; let local_idx = local_table_size - op - 1; assert!(local_idx >= 0 && local_idx < local_table_size); local_idx.try_into().unwrap() @@ -1552,7 +2427,7 @@ fn gen_get_ep(asm: &mut Assembler, level: u32) -> Opnd { // Gets the EP of the ISeq of the containing method, or "local level". // Equivalent of GET_LEP() macro. -fn gen_get_lep(jit: &mut JITState, asm: &mut Assembler) -> Opnd { +fn gen_get_lep(jit: &JITState, asm: &mut Assembler) -> Opnd { // Equivalent of get_lvar_level() in compile.c fn get_lvar_level(iseq: IseqPtr) -> u32 { if iseq == unsafe { rb_get_iseq_body_local_iseq(iseq) } { @@ -1567,180 +2442,210 @@ fn gen_get_lep(jit: &mut JITState, asm: &mut Assembler) -> Opnd { } fn gen_getlocal_generic( - ctx: &mut Context, + jit: &mut JITState, asm: &mut Assembler, - local_idx: u32, + ep_offset: u32, level: u32, -) -> CodegenStatus { - // Load environment pointer EP (level 0) from CFP - let ep_opnd = gen_get_ep(asm, level); +) -> Option<CodegenStatus> { + // Split the block if we need to invalidate this instruction when EP escapes + if level == 0 && !jit.escapes_ep() && !jit.at_compile_target() { + return jit.defer_compilation(asm); + } - // Load the local from the block - // val = *(vm_get_ep(GET_EP(), level) - idx); - let offs = -(SIZEOF_VALUE_I32 * local_idx as i32); - let local_opnd = Opnd::mem(64, ep_opnd, offs); + let local_opnd = if level == 0 && jit.assume_no_ep_escape(asm) { + // Load the local using SP register + asm.local_opnd(ep_offset) + } else { + // Load environment pointer EP (level 0) from CFP + let ep_opnd = gen_get_ep(asm, level); + + // Load the local from the block + // val = *(vm_get_ep(GET_EP(), level) - idx); + let offs = -(SIZEOF_VALUE_I32 * ep_offset as i32); + let local_opnd = Opnd::mem(64, ep_opnd, offs); + + // Write back an argument register to the stack. If the local variable + // is an argument, it might have an allocated register, but if this ISEQ + // is known to escape EP, the register shouldn't be used after this getlocal. 
+ if level == 0 && asm.ctx.get_reg_mapping().get_reg(asm.local_opnd(ep_offset).reg_opnd()).is_some() { + asm.mov(local_opnd, asm.local_opnd(ep_offset)); + } + + local_opnd + }; // Write the local at SP - let stack_top = ctx.stack_push(Type::Unknown); + let stack_top = if level == 0 { + let local_idx = ep_offset_to_local_idx(jit.get_iseq(), ep_offset); + asm.stack_push_local(local_idx.as_usize()) + } else { + asm.stack_push(Type::Unknown) + }; asm.mov(stack_top, local_opnd); - KeepCompiling + Some(KeepCompiling) } fn gen_getlocal( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - let idx = jit_get_arg(jit, 0); - let level = jit_get_arg(jit, 1); - gen_getlocal_generic(ctx, asm, idx.as_u32(), level.as_u32()) +) -> Option<CodegenStatus> { + let idx = jit.get_arg(0).as_u32(); + let level = jit.get_arg(1).as_u32(); + gen_getlocal_generic(jit, asm, idx, level) +} + +fn gen_getlocal_wc0( + jit: &mut JITState, + asm: &mut Assembler, +) -> Option<CodegenStatus> { + let idx = jit.get_arg(0).as_u32(); + gen_getlocal_generic(jit, asm, idx, 0) } fn gen_getlocal_wc1( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - let idx = jit_get_arg(jit, 0); - gen_getlocal_generic(ctx, asm, idx.as_u32(), 1) +) -> Option<CodegenStatus> { + let idx = jit.get_arg(0).as_u32(); + gen_getlocal_generic(jit, asm, idx, 1) } -fn gen_setlocal_wc0( +fn gen_setlocal_generic( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - /* - vm_env_write(const VALUE *ep, int index, VALUE v) - { - VALUE flags = ep[VM_ENV_DATA_INDEX_FLAGS]; - if (LIKELY((flags & VM_ENV_FLAG_WB_REQUIRED) == 0)) { - VM_STACK_ENV_WRITE(ep, index, v); - } - else { - vm_env_write_slowpath(ep, index, v); + ep_offset: u32, + level: u32, +) -> Option<CodegenStatus> { + // Post condition: The type of the set local is updated in the Context. + let value_type = asm.ctx.get_opnd_type(StackOpnd(0)); + + // Fallback because of write barrier + if asm.ctx.get_chain_depth() > 0 { + // Load environment pointer EP at level + let ep_opnd = gen_get_ep(asm, level); + + // This function should not yield to the GC.
+ // void rb_vm_env_write(const VALUE *ep, int index, VALUE v) + let index = -(ep_offset as i64); + let value_opnd = asm.stack_opnd(0); + asm.ccall( + rb_vm_env_write as *const u8, + vec![ + ep_opnd, + index.into(), + value_opnd, + ] + ); + asm.stack_pop(1); + + // Set local type in the context + if level == 0 { + let local_idx = ep_offset_to_local_idx(jit.get_iseq(), ep_offset).as_usize(); + asm.ctx.set_local_type(local_idx, value_type); } + return Some(KeepCompiling); } - */ - let slot_idx = jit_get_arg(jit, 0).as_i32(); - let local_idx = slot_to_local_idx(jit.get_iseq(), slot_idx).as_usize(); - let value_type = ctx.get_opnd_type(StackOpnd(0)); + // Split the block if we need to invalidate this instruction when EP escapes + if level == 0 && !jit.escapes_ep() && !jit.at_compile_target() { + return jit.defer_compilation(asm); + } - // Load environment pointer EP (level 0) from CFP - let ep_opnd = gen_get_ep(asm, 0); + let (flags_opnd, local_opnd) = if level == 0 && jit.assume_no_ep_escape(asm) { + // Load flags and the local using SP register + let flags_opnd = asm.ctx.ep_opnd(VM_ENV_DATA_INDEX_FLAGS as i32); + let local_opnd = asm.local_opnd(ep_offset); - // Write barriers may be required when VM_ENV_FLAG_WB_REQUIRED is set, however write barriers - // only affect heap objects being written. If we know an immediate value is being written we - // can skip this check. - if !value_type.is_imm() { - // flags & VM_ENV_FLAG_WB_REQUIRED + // Allocate a register to the new local operand + asm.alloc_reg(local_opnd.reg_opnd()); + (flags_opnd, local_opnd) + } else { + // Make sure getlocal doesn't read a stale register. If the local variable + // is an argument, it might have an allocated register, but if this ISEQ + // is known to escape EP, the register shouldn't be used after this setlocal. + if level == 0 { + asm.ctx.dealloc_reg(asm.local_opnd(ep_offset).reg_opnd()); + } + + // Load flags and the local for the level + let ep_opnd = gen_get_ep(asm, level); let flags_opnd = Opnd::mem( 64, ep_opnd, SIZEOF_VALUE_I32 * VM_ENV_DATA_INDEX_FLAGS as i32, ); - asm.test(flags_opnd, VM_ENV_FLAG_WB_REQUIRED.into()); - - // Create a side-exit to fall back to the interpreter - let side_exit = get_side_exit(jit, ocb, ctx); - - // if (flags & VM_ENV_FLAG_WB_REQUIRED) != 0 - asm.jnz(side_exit); - } - - // Set the type of the local variable in the context - ctx.set_local_type(local_idx, value_type); - - // Pop the value to write from the stack - let stack_top = ctx.stack_pop(1); - - // Write the value at the environment pointer - let offs: i32 = -8 * slot_idx; - asm.mov(Opnd::mem(64, ep_opnd, offs), stack_top); - - KeepCompiling -} - -fn gen_setlocal_generic( - jit: &mut JITState, - ctx: &mut Context, - asm: &mut Assembler, - ocb: &mut OutlinedCb, - local_idx: i32, - level: u32, -) -> CodegenStatus { - let value_type = ctx.get_opnd_type(StackOpnd(0)); - - // Load environment pointer EP at level - let ep_opnd = gen_get_ep(asm, level); + (flags_opnd, Opnd::mem(64, ep_opnd, -SIZEOF_VALUE_I32 * ep_offset as i32)) + }; // Write barriers may be required when VM_ENV_FLAG_WB_REQUIRED is set, however write barriers // only affect heap objects being written. If we know an immediate value is being written we // can skip this check. 
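+ // (Immediate values are never managed by the GC, so storing one never requires a write barrier.)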
if !value_type.is_imm() { // flags & VM_ENV_FLAG_WB_REQUIRED - let flags_opnd = Opnd::mem( - 64, - ep_opnd, - SIZEOF_VALUE_I32 * VM_ENV_DATA_INDEX_FLAGS as i32, - ); asm.test(flags_opnd, VM_ENV_FLAG_WB_REQUIRED.into()); - // Create a side-exit to fall back to the interpreter - let side_exit = get_side_exit(jit, ocb, ctx); - // if (flags & VM_ENV_FLAG_WB_REQUIRED) != 0 - asm.jnz(side_exit); + assert!(asm.ctx.get_chain_depth() == 0); + jit_chain_guard( + JCC_JNZ, + jit, + asm, + 1, + Counter::setlocal_wb_required, + ); + } + + // Set local type in the context + if level == 0 { + let local_idx = ep_offset_to_local_idx(jit.get_iseq(), ep_offset).as_usize(); + asm.ctx.set_local_type(local_idx, value_type); } // Pop the value to write from the stack - let stack_top = ctx.stack_pop(1); + let stack_top = asm.stack_pop(1); // Write the value at the environment pointer - let offs = -(SIZEOF_VALUE_I32 * local_idx); - asm.mov(Opnd::mem(64, ep_opnd, offs), stack_top); + asm.mov(local_opnd, stack_top); - KeepCompiling + Some(KeepCompiling) } fn gen_setlocal( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - let idx = jit_get_arg(jit, 0).as_i32(); - let level = jit_get_arg(jit, 1).as_u32(); - gen_setlocal_generic(jit, ctx, asm, ocb, idx, level) +) -> Option<CodegenStatus> { + let idx = jit.get_arg(0).as_u32(); + let level = jit.get_arg(1).as_u32(); + gen_setlocal_generic(jit, asm, idx, level) +} + +fn gen_setlocal_wc0( + jit: &mut JITState, + asm: &mut Assembler, +) -> Option<CodegenStatus> { + let idx = jit.get_arg(0).as_u32(); + gen_setlocal_generic(jit, asm, idx, 0) } fn gen_setlocal_wc1( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - let idx = jit_get_arg(jit, 0).as_i32(); - gen_setlocal_generic(jit, ctx, asm, ocb, idx, 1) +) -> Option<CodegenStatus> { + let idx = jit.get_arg(0).as_u32(); + gen_setlocal_generic(jit, asm, idx, 1) } // new hash initialized from top N values fn gen_newhash( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - let num: u64 = jit_get_arg(jit, 0).as_u64(); +) -> Option<CodegenStatus> { + let num: u64 = jit.get_arg(0).as_u64(); // Save the PC and SP because we are allocating - jit_prepare_routine_call(jit, ctx, asm); + jit_prepare_call_with_gc(jit, asm); if num != 0 { // val = rb_hash_new_with_size(num / 2); @@ -1754,7 +2659,7 @@ fn gen_newhash( asm.cpush(new_hash); // x86 alignment // Get a pointer to the values to insert into the hash - let stack_addr_from_top = asm.lea(ctx.stack_opnd((num - 1) as i32)); + let stack_addr_from_top = asm.lea(asm.stack_opnd((num - 1) as i32)); // rb_hash_bulk_insert(num, STACK_ADDR_FROM_TOP(num), val); asm.ccall( @@ -1769,66 +2674,109 @@ fn gen_newhash( let new_hash = asm.cpop(); asm.cpop_into(new_hash); // x86 alignment - ctx.stack_pop(num.try_into().unwrap()); - let stack_ret = ctx.stack_push(Type::Hash); + asm.stack_pop(num.try_into().unwrap()); + let stack_ret = asm.stack_push(Type::CHash); asm.mov(stack_ret, new_hash); } else { // val = rb_hash_new(); let new_hash = asm.ccall(rb_hash_new as *const u8, vec![]); - let stack_ret = ctx.stack_push(Type::Hash); + let stack_ret = asm.stack_push(Type::CHash); asm.mov(stack_ret, new_hash); } - KeepCompiling + Some(KeepCompiling) } fn gen_putstring( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - let put_val = jit_get_arg(jit, 0); +) -> 
Option<CodegenStatus> { + let put_val = jit.get_arg(0); // Save the PC and SP because the callee will allocate - jit_prepare_routine_call(jit, ctx, asm); + jit_prepare_call_with_gc(jit, asm); let str_opnd = asm.ccall( rb_ec_str_resurrect as *const u8, - vec![EC, put_val.into()] + vec![EC, put_val.into(), 0.into()] ); - let stack_top = ctx.stack_push(Type::CString); + let stack_top = asm.stack_push(Type::CString); asm.mov(stack_top, str_opnd); - KeepCompiling + Some(KeepCompiling) +} + +fn gen_putchilledstring( + jit: &mut JITState, + asm: &mut Assembler, +) -> Option<CodegenStatus> { + let put_val = jit.get_arg(0); + + // Save the PC and SP because the callee will allocate + jit_prepare_call_with_gc(jit, asm); + + let str_opnd = asm.ccall( + rb_ec_str_resurrect as *const u8, + vec![EC, put_val.into(), 1.into()] + ); + + let stack_top = asm.stack_push(Type::CString); + asm.mov(stack_top, str_opnd); + + Some(KeepCompiling) +} + +fn gen_checkmatch( + jit: &mut JITState, + asm: &mut Assembler, +) -> Option<CodegenStatus> { + let flag = jit.get_arg(0).as_u32(); + + // rb_vm_check_match is not leaf unless flag is VM_CHECKMATCH_TYPE_WHEN. + // See also: leafness_of_checkmatch() and check_match() + if flag != VM_CHECKMATCH_TYPE_WHEN { + jit_prepare_non_leaf_call(jit, asm); + } + + let pattern = asm.stack_opnd(0); + let target = asm.stack_opnd(1); + + extern "C" { + fn rb_vm_check_match(ec: EcPtr, target: VALUE, pattern: VALUE, num: u32) -> VALUE; + } + let result = asm.ccall(rb_vm_check_match as *const u8, vec![EC, target, pattern, flag.into()]); + asm.stack_pop(2); // Keep them on stack during ccall for GC + + let stack_ret = asm.stack_push(Type::Unknown); + asm.mov(stack_ret, result); + + Some(KeepCompiling) } // Push Qtrue or Qfalse depending on whether the given keyword was supplied by // the caller fn gen_checkkeyword( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { +) -> Option<CodegenStatus> { // When a keyword is unspecified past index 32, a hash will be used // instead. This can only happen in iseqs taking more than 32 keywords. - if unsafe { (*get_iseq_body_param_keyword(jit.iseq)).num >= 32 } { - return CantCompile; + if unsafe { (*get_iseq_body_param_keyword(jit.iseq)).num >= VM_KW_SPECIFIED_BITS_MAX.try_into().unwrap() } { + return None; } // The EP offset to the undefined bits local - let bits_offset = jit_get_arg(jit, 0).as_i32(); + let bits_offset = jit.get_arg(0).as_i32(); // The index of the keyword we want to check - let index: i64 = jit_get_arg(jit, 1).as_i64(); - - // Load environment pointer EP - let ep_opnd = gen_get_ep(asm, 0); + let index: i64 = jit.get_arg(1).as_i64(); - // VALUE kw_bits = *(ep - bits); - let bits_opnd = Opnd::mem(64, ep_opnd, SIZEOF_VALUE_I32 * -bits_offset); + // `unspecified_bits` is a part of the local table. Therefore, we may allocate a register for + // that "local" when passing it as an argument. We must use such a register to avoid loading + // random bits from the stack if any. We assume that EP is not escaped as of entering a method + // with keyword arguments. 
+ let bits_opnd = asm.local_opnd(bits_offset as u32); // unsigned int b = (unsigned int)FIX2ULONG(kw_bits); // if ((b & (0x01 << idx))) { @@ -1838,142 +2786,63 @@ fn gen_checkkeyword( asm.test(bits_opnd, Opnd::Imm(bit_test)); let ret_opnd = asm.csel_z(Qtrue.into(), Qfalse.into()); - let stack_ret = ctx.stack_push(Type::UnknownImm); + let stack_ret = asm.stack_push(Type::UnknownImm); asm.mov(stack_ret, ret_opnd); - KeepCompiling -} - -fn gen_jnz_to_target0( - asm: &mut Assembler, - target0: CodePtr, - _target1: Option<CodePtr>, - shape: BranchShape, -) { - match shape { - BranchShape::Next0 | BranchShape::Next1 => unreachable!(), - BranchShape::Default => asm.jnz(target0.into()), - } -} - -fn gen_jz_to_target0( - asm: &mut Assembler, - target0: CodePtr, - _target1: Option<CodePtr>, - shape: BranchShape, -) { - match shape { - BranchShape::Next0 | BranchShape::Next1 => unreachable!(), - BranchShape::Default => asm.jz(Target::CodePtr(target0)), - } -} - -fn gen_jbe_to_target0( - asm: &mut Assembler, - target0: CodePtr, - _target1: Option<CodePtr>, - shape: BranchShape, -) { - match shape { - BranchShape::Next0 | BranchShape::Next1 => unreachable!(), - BranchShape::Default => asm.jbe(Target::CodePtr(target0)), - } + Some(KeepCompiling) } // Generate a jump to a stub that recompiles the current YARV instruction on failure. // When depth_limit is exceeded, generate a jump to a side exit. fn jit_chain_guard( jcc: JCCKinds, - jit: &JITState, - ctx: &Context, + jit: &mut JITState, asm: &mut Assembler, - ocb: &mut OutlinedCb, - depth_limit: i32, - side_exit: Target, + depth_limit: u8, + counter: Counter, ) { let target0_gen_fn = match jcc { - JCC_JNE | JCC_JNZ => gen_jnz_to_target0, - JCC_JZ | JCC_JE => gen_jz_to_target0, - JCC_JBE | JCC_JNA => gen_jbe_to_target0, + JCC_JNE | JCC_JNZ => BranchGenFn::JNZToTarget0, + JCC_JZ | JCC_JE => BranchGenFn::JZToTarget0, + JCC_JBE | JCC_JNA => BranchGenFn::JBEToTarget0, + JCC_JB | JCC_JNAE => BranchGenFn::JBToTarget0, + JCC_JO_MUL => BranchGenFn::JOMulToTarget0, }; - if (ctx.get_chain_depth() as i32) < depth_limit { - let mut deeper = ctx.clone(); + if asm.ctx.get_chain_depth() < depth_limit { + // Rewind Context to use the stack_size at the beginning of this instruction. 
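+ // That way, the chained retry block starts this instruction again with the stack size it had on entry.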
+ let mut deeper = asm.ctx.with_stack_size(jit.stack_size_for_pc); deeper.increment_chain_depth(); let bid = BlockId { iseq: jit.iseq, idx: jit.insn_idx, }; - gen_branch(jit, asm, ocb, bid, &deeper, None, None, target0_gen_fn); + jit.gen_branch(asm, bid, &deeper, None, None, target0_gen_fn); } else { - target0_gen_fn(asm, side_exit.unwrap_code_ptr(), None, BranchShape::Default); + target0_gen_fn.call(asm, Target::side_exit(counter), None); } } -// up to 5 different classes, and embedded or not for each -pub const GET_IVAR_MAX_DEPTH: i32 = 10; +// up to 8 different shapes for each +pub const GET_IVAR_MAX_DEPTH: u8 = 8; -// up to 5 different classes, and embedded or not for each -pub const SET_IVAR_MAX_DEPTH: i32 = 10; +// up to 8 different shapes for each +pub const SET_IVAR_MAX_DEPTH: u8 = 8; // hashes and arrays -pub const OPT_AREF_MAX_CHAIN_DEPTH: i32 = 2; +pub const OPT_AREF_MAX_CHAIN_DEPTH: u8 = 2; -// up to 5 different classes -pub const SEND_MAX_DEPTH: i32 = 5; +// expandarray +pub const EXPANDARRAY_MAX_CHAIN_DEPTH: u8 = 4; -// up to 20 different methods for send -pub const SEND_MAX_CHAIN_DEPTH: i32 = 20; +// up to 5 different methods for send +pub const SEND_MAX_DEPTH: u8 = 5; // up to 20 different offsets for case-when -pub const CASE_WHEN_MAX_DEPTH: i32 = 20; - -// Codegen for setting an instance variable. -// Preconditions: -// - receiver is in REG0 -// - receiver has the same class as CLASS_OF(comptime_receiver) -// - no stack push or pops to ctx since the entry to the codegen of the instruction being compiled -fn gen_set_ivar( - jit: &mut JITState, - ctx: &mut Context, - asm: &mut Assembler, - _recv: VALUE, - ivar_name: ID, - flags: u32, - argc: i32, -) -> CodegenStatus { - - // This is a .send call and we need to adjust the stack - if flags & VM_CALL_OPT_SEND != 0 { - handle_opt_send_shift_stack(asm, argc, ctx); - } - - // Save the PC and SP because the callee may allocate - // Note that this modifies REG_SP, which is why we do it first - jit_prepare_routine_call(jit, ctx, asm); - - // Get the operands from the stack - let val_opnd = ctx.stack_pop(1); - let recv_opnd = ctx.stack_pop(1); - - // Call rb_vm_set_ivar_id with the receiver, the ivar name, and the value - let val = asm.ccall( - rb_vm_set_ivar_id as *const u8, - vec![ - recv_opnd, - Opnd::UImm(ivar_name), - val_opnd, - ], - ); - - let out_opnd = ctx.stack_push(Type::Unknown); - asm.mov(out_opnd, val); - - KeepCompiling -} - +pub const CASE_WHEN_MAX_DEPTH: u8 = 20; +pub const MAX_SPLAT_LENGTH: i32 = 127; // Codegen for getting an instance variable. // Preconditions: @@ -1981,191 +2850,156 @@ fn gen_set_ivar( // - no stack push or pops to ctx since the entry to the codegen of the instruction being compiled fn gen_get_ivar( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, - max_chain_depth: i32, + max_chain_depth: u8, comptime_receiver: VALUE, ivar_name: ID, recv: Opnd, recv_opnd: YARVOpnd, - side_exit: Target, -) -> CodegenStatus { - // If the object has a too complex shape, we exit - if comptime_receiver.shape_too_complex() { - return CantCompile; - } - - let comptime_val_klass = comptime_receiver.class_of(); - let starting_context = ctx.clone(); // make a copy for use with jit_chain_guard - +) -> Option<CodegenStatus> { // If recv isn't already a register, load it. let recv = match recv { - Opnd::Reg(_) => recv, + Opnd::InsnOut { .. 
} => recv, _ => asm.load(recv), }; - // Check if the comptime class uses a custom allocator - let custom_allocator = unsafe { rb_get_alloc_func(comptime_val_klass) }; - let uses_custom_allocator = match custom_allocator { - Some(alloc_fun) => { - let allocate_instance = rb_class_allocate_instance as *const u8; - alloc_fun as *const u8 != allocate_instance - } - None => false, - }; - // Check if the comptime receiver is a T_OBJECT let receiver_t_object = unsafe { RB_TYPE_P(comptime_receiver, RUBY_T_OBJECT) }; + // Use a general C call at the last chain to avoid exits on megamorphic shapes + let megamorphic = asm.ctx.get_chain_depth() >= max_chain_depth; + if megamorphic { + gen_counter_incr(jit, asm, Counter::num_getivar_megamorphic); + } - // If the class uses the default allocator, instances should all be T_OBJECT - // NOTE: This assumes nobody changes the allocator of the class after allocation. - // Eventually, we can encode whether an object is T_OBJECT or not - // inside object shapes. - if !receiver_t_object || uses_custom_allocator { + // NOTE: This assumes T_OBJECT can't ever have the same shape_id as any other type. + // too-complex shapes can't use index access, so we use rb_ivar_get for them too. + if !comptime_receiver.heap_object_p() || comptime_receiver.shape_too_complex() || megamorphic { // General case. Call rb_ivar_get(). // VALUE rb_ivar_get(VALUE obj, ID id) - asm.comment("call rb_ivar_get()"); + asm_comment!(asm, "call rb_ivar_get()"); - // The function could raise exceptions. - jit_prepare_routine_call(jit, ctx, asm); + // The function could raise RactorIsolationError. + jit_prepare_non_leaf_call(jit, asm); let ivar_val = asm.ccall(rb_ivar_get as *const u8, vec![recv, Opnd::UImm(ivar_name)]); if recv_opnd != SelfOpnd { - ctx.stack_pop(1); + asm.stack_pop(1); } // Push the ivar on the stack - let out_opnd = ctx.stack_push(Type::Unknown); + let out_opnd = asm.stack_push(Type::Unknown); asm.mov(out_opnd, ivar_val); // Jump to next instruction. This allows guard chains to share the same successor. 
- jump_to_next_insn(jit, ctx, asm, ocb); - return EndBlock; + jump_to_next_insn(jit, asm); + return Some(EndBlock); } let ivar_index = unsafe { let shape_id = comptime_receiver.shape_id_of(); - let shape = rb_shape_get_shape_by_id(shape_id); - let mut ivar_index: u32 = 0; - if rb_shape_get_iv_index(shape, ivar_name, &mut ivar_index) { + let mut ivar_index: u16 = 0; + if rb_shape_get_iv_index(shape_id, ivar_name, &mut ivar_index) { Some(ivar_index as usize) } else { None } }; - // must be before stack_pop - let recv_type = ctx.get_opnd_type(recv_opnd); - - // Upgrade type - if !recv_type.is_heap() { - ctx.upgrade_opnd_type(recv_opnd, Type::UnknownHeap); - } + // Guard heap object (recv_opnd must be used before stack_pop) + guard_object_is_heap(asm, recv, recv_opnd, Counter::getivar_not_heap); - // Pop receiver if it's on the temp stack - if recv_opnd != SelfOpnd { - ctx.stack_pop(1); - } - - // Guard heap object - if !recv_type.is_heap() { - guard_object_is_heap(asm, recv, side_exit); - } - - // Compile time self is embedded and the ivar index lands within the object - let embed_test_result = unsafe { FL_TEST_RAW(comptime_receiver, VALUE(ROBJECT_EMBED.as_usize())) != VALUE(0) }; - - let expected_shape = unsafe { rb_shape_get_shape_id(comptime_receiver) }; + let expected_shape = unsafe { rb_obj_shape_id(comptime_receiver) }; let shape_id_offset = unsafe { rb_shape_id_offset() }; let shape_opnd = Opnd::mem(SHAPE_ID_NUM_BITS as u8, recv, shape_id_offset); - asm.comment("guard shape"); + asm_comment!(asm, "guard shape"); asm.cmp(shape_opnd, Opnd::UImm(expected_shape as u64)); - let megamorphic_side_exit = counted_exit!(ocb, side_exit, getivar_megamorphic); jit_chain_guard( JCC_JNE, jit, - &starting_context, asm, - ocb, max_chain_depth, - megamorphic_side_exit, + Counter::getivar_megamorphic, ); + // Pop receiver if it's on the temp stack + if recv_opnd != SelfOpnd { + asm.stack_pop(1); + } + match ivar_index { // If there is no IVAR index, then the ivar was undefined // when we entered the compiler. That means we can just return // nil for this shape + iv name None => { - let out_opnd = ctx.stack_push(Type::Nil); + let out_opnd = asm.stack_push(Type::Nil); asm.mov(out_opnd, Qnil.into()); } Some(ivar_index) => { - if embed_test_result { - // See ROBJECT_IVPTR() from include/ruby/internal/core/robject.h - - // Load the variable - let offs = ROBJECT_OFFSET_AS_ARY + (ivar_index * SIZEOF_VALUE) as i32; - let ivar_opnd = Opnd::mem(64, recv, offs); - - // Push the ivar on the stack - let out_opnd = ctx.stack_push(Type::Unknown); - asm.mov(out_opnd, ivar_opnd); + let ivar_opnd = if receiver_t_object { + if comptime_receiver.embedded_p() { + // See ROBJECT_FIELDS() from include/ruby/internal/core/robject.h + + // Load the variable + let offs = ROBJECT_OFFSET_AS_ARY as i32 + (ivar_index * SIZEOF_VALUE) as i32; + Opnd::mem(64, recv, offs) + } else { + // Compile time value is *not* embedded. + + // Get a pointer to the extended table + let tbl_opnd = asm.load(Opnd::mem(64, recv, ROBJECT_OFFSET_AS_HEAP_FIELDS as i32)); + + // Read the ivar from the extended table + Opnd::mem(64, tbl_opnd, (SIZEOF_VALUE * ivar_index) as i32) + } } else { - // Compile time value is *not* embedded. 
- - // Get a pointer to the extended table - let tbl_opnd = asm.load(Opnd::mem(64, recv, ROBJECT_OFFSET_AS_HEAP_IVPTR)); + asm_comment!(asm, "call rb_ivar_get_at()"); - // Read the ivar from the extended table - let ivar_opnd = Opnd::mem(64, tbl_opnd, (SIZEOF_VALUE * ivar_index) as i32); + if assume_single_ractor_mode(jit, asm) { + asm.ccall(rb_ivar_get_at_no_ractor_check as *const u8, vec![recv, Opnd::UImm((ivar_index as u32).into())]) + } else { + // The function could raise RactorIsolationError. + jit_prepare_non_leaf_call(jit, asm); + asm.ccall(rb_ivar_get_at as *const u8, vec![recv, Opnd::UImm((ivar_index as u32).into()), Opnd::UImm(ivar_name)]) + } + }; - let out_opnd = ctx.stack_push(Type::Unknown); - asm.mov(out_opnd, ivar_opnd); - } + // Push the ivar on the stack + let out_opnd = asm.stack_push(Type::Unknown); + asm.mov(out_opnd, ivar_opnd); } } // Jump to next instruction. This allows guard chains to share the same successor. - jump_to_next_insn(jit, ctx, asm, ocb); - EndBlock + jump_to_next_insn(jit, asm); + Some(EndBlock) } fn gen_getinstancevariable( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { +) -> Option<CodegenStatus> { // Defer compilation so we can specialize on a runtime `self` - if !jit_at_current_insn(jit) { - defer_compilation(jit, ctx, asm, ocb); - return EndBlock; + if !jit.at_compile_target() { + return jit.defer_compilation(asm); } - let ivar_name = jit_get_arg(jit, 0).as_u64(); + let ivar_name = jit.get_arg(0).as_u64(); - let comptime_val = jit_peek_at_self(jit); - - // Generate a side exit - let side_exit = get_side_exit(jit, ocb, ctx); + let comptime_val = jit.peek_at_self(); // Guard that the receiver has the same class as the one from compile time. let self_asm_opnd = Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF); gen_get_ivar( jit, - ctx, asm, - ocb, GET_IVAR_MAX_DEPTH, comptime_val, ivar_name, self_asm_opnd, SelfOpnd, - side_exit, ) } @@ -2185,134 +3019,183 @@ fn gen_write_iv( if embed_test_result { // Find the IV offset - let offs = ROBJECT_OFFSET_AS_ARY + (ivar_index * SIZEOF_VALUE) as i32; + let offs = ROBJECT_OFFSET_AS_ARY as i32 + (ivar_index * SIZEOF_VALUE) as i32; let ivar_opnd = Opnd::mem(64, recv, offs); // Write the IV - asm.comment("write IV"); + asm_comment!(asm, "write IV"); asm.mov(ivar_opnd, set_value); } else { // Compile time value is *not* embedded. 
// Get a pointer to the extended table - let tbl_opnd = asm.load(Opnd::mem(64, recv, ROBJECT_OFFSET_AS_HEAP_IVPTR)); + let tbl_opnd = asm.load(Opnd::mem(64, recv, ROBJECT_OFFSET_AS_HEAP_FIELDS as i32)); // Write the ivar in to the extended table let ivar_opnd = Opnd::mem(64, tbl_opnd, (SIZEOF_VALUE * ivar_index) as i32); - asm.comment("write IV"); + asm_comment!(asm, "write IV"); asm.mov(ivar_opnd, set_value); } } fn gen_setinstancevariable( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - let starting_context = ctx.clone(); // make a copy for use with jit_chain_guard - +) -> Option<CodegenStatus> { // Defer compilation so we can specialize on a runtime `self` - if !jit_at_current_insn(jit) { - defer_compilation(jit, ctx, asm, ocb); - return EndBlock; + if !jit.at_compile_target() { + return jit.defer_compilation(asm); } - let ivar_name = jit_get_arg(jit, 0).as_u64(); - let comptime_receiver = jit_peek_at_self(jit); - let comptime_val_klass = comptime_receiver.class_of(); + let ivar_name = jit.get_arg(0).as_u64(); + let ic = jit.get_arg(1).as_ptr(); + let comptime_receiver = jit.peek_at_self(); + gen_set_ivar( + jit, + asm, + comptime_receiver, + ivar_name, + SelfOpnd, + Some(ic), + ) +} +/// Set an instance variable on setinstancevariable or attr_writer. +/// It switches the behavior based on what recv_opnd is given. +/// * SelfOpnd: setinstancevariable, which doesn't push a result onto the stack. +/// * StackOpnd: attr_writer, which pushes a result onto the stack. +fn gen_set_ivar( + jit: &mut JITState, + asm: &mut Assembler, + comptime_receiver: VALUE, + ivar_name: ID, + recv_opnd: YARVOpnd, + ic: Option<*const iseq_inline_iv_cache_entry>, +) -> Option<CodegenStatus> { // If the comptime receiver is frozen, writing an IV will raise an exception // and we don't want to JIT code to deal with that situation. 
- // If the object has a too complex shape, we will also exit - if comptime_receiver.is_frozen() || comptime_receiver.shape_too_complex() { - return CantCompile; + if comptime_receiver.is_frozen() { + gen_counter_incr(jit, asm, Counter::setivar_frozen); + return None; } - let (_, stack_type) = ctx.get_opnd_mapping(StackOpnd(0)); - - // Check if the comptime class uses a custom allocator - let custom_allocator = unsafe { rb_get_alloc_func(comptime_val_klass) }; - let uses_custom_allocator = match custom_allocator { - Some(alloc_fun) => { - let allocate_instance = rb_class_allocate_instance as *const u8; - alloc_fun as *const u8 != allocate_instance - } - None => false, - }; + let stack_type = asm.ctx.get_opnd_type(StackOpnd(0)); // Check if the comptime receiver is a T_OBJECT let receiver_t_object = unsafe { RB_TYPE_P(comptime_receiver, RUBY_T_OBJECT) }; + // Use a general C call at the last chain to avoid exits on megamorphic shapes + let megamorphic = asm.ctx.get_chain_depth() >= SET_IVAR_MAX_DEPTH; + if megamorphic { + gen_counter_incr(jit, asm, Counter::num_setivar_megamorphic); + } - // If the receiver isn't a T_OBJECT, or uses a custom allocator, - // then just write out the IV write as a function call - if !receiver_t_object || uses_custom_allocator { - asm.comment("call rb_vm_setinstancevariable()"); + // Get the iv index + let shape_too_complex = comptime_receiver.shape_too_complex(); + let ivar_index = if !comptime_receiver.special_const_p() && !shape_too_complex { + let shape_id = comptime_receiver.shape_id_of(); + let mut ivar_index: u16 = 0; + if unsafe { rb_shape_get_iv_index(shape_id, ivar_name, &mut ivar_index) } { + Some(ivar_index as usize) + } else { + None + } + } else { + None + }; - let ic = jit_get_arg(jit, 1).as_u64(); // type IVC + // The current shape doesn't contain this iv, so we need to transition to another shape. + let mut new_shape_too_complex = false; + let new_shape = if !shape_too_complex && receiver_t_object && ivar_index.is_none() { + let current_shape_id = comptime_receiver.shape_id_of(); + // We don't need to check for imemo_fields here because we're definitely looking at a T_OBJECT. + let klass = unsafe { rb_obj_class(comptime_receiver) }; + let next_shape_id = unsafe { rb_shape_transition_add_ivar_no_warnings(klass, current_shape_id, ivar_name) }; + + // If the VM ran out of shapes, or this class generated too many leaf shapes, + // it may be de-optimized into OBJ_TOO_COMPLEX_SHAPE (hash-table). + new_shape_too_complex = unsafe { rb_jit_shape_too_complex_p(next_shape_id) }; + if new_shape_too_complex { + Some((next_shape_id, None, 0_usize)) + } else { + let current_capacity = unsafe { rb_yjit_shape_capacity(current_shape_id) }; + let next_capacity = unsafe { rb_yjit_shape_capacity(next_shape_id) }; - // The function could raise exceptions. - // Note that this modifies REG_SP, which is why we do it first - jit_prepare_routine_call(jit, ctx, asm); + // If the new shape has a different capacity, or is TOO_COMPLEX, we'll have to + // reallocate it.
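+ // (The reallocation itself is emitted below as a call to rb_ensure_iv_list_size().)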
+ let needs_extension = next_capacity != current_capacity; - // Get the operands from the stack - let val_opnd = ctx.stack_pop(1); + // We can write to the object, but we need to transition the shape + let ivar_index = unsafe { rb_yjit_shape_index(next_shape_id) } as usize; - // Call rb_vm_setinstancevariable(iseq, obj, id, val, ic); - asm.ccall( - rb_vm_setinstancevariable as *const u8, - vec![ - Opnd::const_ptr(jit.iseq as *const u8), - Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF), - ivar_name.into(), - val_opnd, - Opnd::const_ptr(ic as *const u8), - ] - ); - } else { - // Get the iv index - let ivar_index = unsafe { - let shape_id = comptime_receiver.shape_id_of(); - let shape = rb_shape_get_shape_by_id(shape_id); - let mut ivar_index: u32 = 0; - if rb_shape_get_iv_index(shape, ivar_name, &mut ivar_index) { - Some(ivar_index as usize) + let needs_extension = if needs_extension { + Some((current_capacity, next_capacity)) } else { None - } - }; + }; + Some((next_shape_id, needs_extension, ivar_index)) + } + } else { + None + }; - // Get the receiver - let mut recv = asm.load(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF)); + // If the receiver isn't a T_OBJECT, then just write out the IV write as a function call. + // too-complex shapes can't use index access, so we use rb_ivar_get for them too. + if !receiver_t_object || shape_too_complex || new_shape_too_complex || megamorphic { + // The function could raise FrozenError. + // Note that this modifies REG_SP, which is why we do it first + jit_prepare_non_leaf_call(jit, asm); - let recv_opnd = SelfOpnd; - let recv_type = ctx.get_opnd_type(recv_opnd); + // Get the operands from the stack + let val_opnd = asm.stack_opnd(0); - // Generate a side exit - let side_exit = get_side_exit(jit, ocb, ctx); + if let StackOpnd(index) = recv_opnd { // attr_writer + let recv = asm.stack_opnd(index as i32); + asm_comment!(asm, "call rb_vm_set_ivar_id()"); + asm.ccall( + rb_vm_set_ivar_id as *const u8, + vec![ + recv, + Opnd::UImm(ivar_name), + val_opnd, + ], + ); + } else { // setinstancevariable + asm_comment!(asm, "call rb_vm_setinstancevariable()"); + asm.ccall( + rb_vm_setinstancevariable as *const u8, + vec![ + VALUE(jit.iseq as usize).into(), + Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF), + ivar_name.into(), + val_opnd, + Opnd::const_ptr(ic.unwrap() as *const u8), + ], + ); + } + } else { + // Get the receiver + let mut recv = asm.load(if let StackOpnd(index) = recv_opnd { + asm.stack_opnd(index as i32) + } else { + Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF) + }); // Upgrade type - if !recv_type.is_heap() { // Must be a heap type - ctx.upgrade_opnd_type(recv_opnd, Type::UnknownHeap); - guard_object_is_heap(asm, recv, side_exit); - } + guard_object_is_heap(asm, recv, recv_opnd, Counter::setivar_not_heap); - let expected_shape = unsafe { rb_shape_get_shape_id(comptime_receiver) }; + let expected_shape = unsafe { rb_obj_shape_id(comptime_receiver) }; let shape_id_offset = unsafe { rb_shape_id_offset() }; let shape_opnd = Opnd::mem(SHAPE_ID_NUM_BITS as u8, recv, shape_id_offset); - asm.comment("guard shape"); + asm_comment!(asm, "guard shape"); asm.cmp(shape_opnd, Opnd::UImm(expected_shape as u64)); - let megamorphic_side_exit = counted_exit!(ocb, side_exit, setivar_megamorphic); jit_chain_guard( JCC_JNE, jit, - &starting_context, asm, - ocb, SET_IVAR_MAX_DEPTH, - megamorphic_side_exit, + Counter::setivar_megamorphic, ); let write_val; @@ -2321,42 +3204,15 @@ fn gen_setinstancevariable( // If we don't have an instance variable index, then we need to // transition out 
of the current shape. None => { - let shape = comptime_receiver.shape_of(); - - let current_capacity = unsafe { (*shape).capacity }; - let new_capacity = current_capacity * 2; - - // If the object doesn't have the capacity to store the IV, - // then we'll need to allocate it. - let needs_extension = unsafe { (*shape).next_iv_index >= current_capacity }; - - // We can write to the object, but we need to transition the shape - let ivar_index = unsafe { (*shape).next_iv_index } as usize; - - let capa_shape = if needs_extension { - // We need to add an extended table to the object - // First, create an outgoing transition that increases the - // capacity - Some(unsafe { rb_shape_transition_shape_capa(shape, new_capacity) }) - } else { - None - }; - - let dest_shape = if let Some(capa_shape) = capa_shape { - unsafe { rb_shape_get_next(capa_shape, comptime_receiver, ivar_name) } - } else { - unsafe { rb_shape_get_next(shape, comptime_receiver, ivar_name) } - }; - - let new_shape_id = unsafe { rb_shape_id(dest_shape) }; - - if new_shape_id == OBJ_TOO_COMPLEX_SHAPE_ID { - return CantCompile; - } - - if needs_extension { + let (new_shape_id, needs_extension, ivar_index) = new_shape.unwrap(); + if let Some((current_capacity, new_capacity)) = needs_extension { // Generate the C call so that runtime code will increase // the capacity and set the buffer. + asm_comment!(asm, "call rb_ensure_iv_list_size"); + + // It allocates so can trigger GC, which takes the VM lock + // so could yield to a different ractor. + jit_prepare_call_with_gc(jit, asm); asm.ccall(rb_ensure_iv_list_size as *const u8, vec![ recv, @@ -2366,13 +3222,17 @@ fn gen_setinstancevariable( ); // Load the receiver again after the function call - recv = asm.load(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF)) + recv = asm.load(if let StackOpnd(index) = recv_opnd { + asm.stack_opnd(index as i32) + } else { + Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF) + }); } - write_val = ctx.stack_pop(1); - gen_write_iv(asm, comptime_receiver, recv, ivar_index, write_val, needs_extension); + write_val = asm.stack_opnd(0); + gen_write_iv(asm, comptime_receiver, recv, ivar_index, write_val, needs_extension.is_some()); - asm.comment("write shape"); + asm_comment!(asm, "write shape"); let shape_id_offset = unsafe { rb_shape_id_offset() }; let shape_opnd = Opnd::mem(SHAPE_ID_NUM_BITS as u8, recv, shape_id_offset); @@ -2387,7 +3247,7 @@ fn gen_setinstancevariable( // the iv index by searching up the shape tree. If we've // made the transition already, then there's no reason to // update the shape on the object. Just set the IV. - write_val = ctx.stack_pop(1); + write_val = asm.stack_opnd(0); gen_write_iv(asm, comptime_receiver, recv, ivar_index, write_val, false); }, } @@ -2395,6 +3255,7 @@ fn gen_setinstancevariable( // If we know the stack value is an immediate, there's no need to // generate WB code. 
if !stack_type.is_imm() { + asm.spill_regs(); // for ccall (unconditionally spill them for RegMappings consistency) let skip_wb = asm.new_label("skip_wb"); // If the value we're writing is an immediate, we don't need to WB asm.test(write_val, (RUBY_IMMEDIATE_MASK as u64).into()); @@ -2404,7 +3265,7 @@ fn gen_setinstancevariable( asm.cmp(write_val, Qnil.into()); asm.jbe(skip_wb); - asm.comment("write barrier"); + asm_comment!(asm, "write barrier"); asm.ccall( rb_gc_writebarrier as *const u8, vec![ @@ -2416,70 +3277,160 @@ fn gen_setinstancevariable( asm.write_label(skip_wb); } } + let write_val = asm.stack_pop(1); // Keep write_val on stack during ccall for GC + + // If it's attr_writer, i.e. recv_opnd is StackOpnd, we need to pop + // the receiver and push the written value onto the stack. + if let StackOpnd(_) = recv_opnd { + asm.stack_pop(1); // Pop receiver + + let out_opnd = asm.stack_push(Type::Unknown); // Push a return value + asm.mov(out_opnd, write_val); + } - KeepCompiling + Some(KeepCompiling) } fn gen_defined( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - let op_type = jit_get_arg(jit, 0).as_u64(); - let obj = jit_get_arg(jit, 1); - let pushval = jit_get_arg(jit, 2); +) -> Option<CodegenStatus> { + let op_type = jit.get_arg(0).as_u64(); + let obj = jit.get_arg(1); + let pushval = jit.get_arg(2); - // Save the PC and SP because the callee may allocate - // Note that this modifies REG_SP, which is why we do it first - jit_prepare_routine_call(jit, ctx, asm); + match op_type as u32 { + DEFINED_YIELD => { + asm.stack_pop(1); // v operand is not used + let out_opnd = asm.stack_push(Type::Unknown); // nil or "yield" - // Get the operands from the stack - let v_opnd = ctx.stack_pop(1); + gen_block_given(jit, asm, out_opnd, pushval.into(), Qnil.into()); + } + _ => { + // Save the PC and SP because the callee may allocate or call #respond_to? 
+ // Note that this modifies REG_SP, which is why we do it first + jit_prepare_non_leaf_call(jit, asm); - // Call vm_defined(ec, reg_cfp, op_type, obj, v) - let def_result = asm.ccall(rb_vm_defined as *const u8, vec![EC, CFP, op_type.into(), obj.into(), v_opnd]); + // Get the operands from the stack + let v_opnd = asm.stack_opnd(0); - // if (vm_defined(ec, GET_CFP(), op_type, obj, v)) { - // val = pushval; - // } - asm.test(def_result, Opnd::UImm(255)); - let out_value = asm.csel_nz(pushval.into(), Qnil.into()); + // Call vm_defined(ec, reg_cfp, op_type, obj, v) + let def_result = asm.ccall(rb_vm_defined as *const u8, vec![EC, CFP, op_type.into(), obj.into(), v_opnd]); + asm.stack_pop(1); // Keep it on stack during ccall for GC - // Push the return value onto the stack - let out_type = if pushval.special_const_p() { - Type::UnknownImm - } else { - Type::Unknown + // if (vm_defined(ec, GET_CFP(), op_type, obj, v)) { + // val = pushval; + // } + asm.test(def_result, Opnd::UImm(255)); + let out_value = asm.csel_nz(pushval.into(), Qnil.into()); + + // Push the return value onto the stack + let out_type = if pushval.special_const_p() { + Type::UnknownImm + } else { + Type::Unknown + }; + let stack_ret = asm.stack_push(out_type); + asm.mov(stack_ret, out_value); + } + } + + Some(KeepCompiling) +} + +fn gen_definedivar( + jit: &mut JITState, + asm: &mut Assembler, +) -> Option<CodegenStatus> { + // Defer compilation so we can specialize based on a runtime receiver + if !jit.at_compile_target() { + return jit.defer_compilation(asm); + } + + let ivar_name = jit.get_arg(0).as_u64(); + // Value that will be pushed on the stack if the ivar is defined. In practice this is always the + // string "instance-variable". If the ivar is not defined, nil will be pushed instead.
+ let pushval = jit.get_arg(2); + + // Get the receiver + let recv = asm.load(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF)); + + // Specialize base on compile time values + let comptime_receiver = jit.peek_at_self(); + + if comptime_receiver.special_const_p() || comptime_receiver.shape_too_complex() || asm.ctx.get_chain_depth() >= GET_IVAR_MAX_DEPTH { + // Fall back to calling rb_ivar_defined + + // Save the PC and SP because the callee may allocate + // Note that this modifies REG_SP, which is why we do it first + jit_prepare_call_with_gc(jit, asm); + + // Call rb_ivar_defined(recv, ivar_name) + let def_result = asm.ccall(rb_ivar_defined as *const u8, vec![recv, ivar_name.into()]); + + // if (rb_ivar_defined(recv, ivar_name)) { + // val = pushval; + // } + asm.test(def_result, Opnd::UImm(255)); + let out_value = asm.csel_nz(pushval.into(), Qnil.into()); + + // Push the return value onto the stack + let out_type = if pushval.special_const_p() { Type::UnknownImm } else { Type::Unknown }; + let stack_ret = asm.stack_push(out_type); + asm.mov(stack_ret, out_value); + + return Some(KeepCompiling) + } + + let shape_id = comptime_receiver.shape_id_of(); + let ivar_exists = unsafe { + let mut ivar_index: u16 = 0; + rb_shape_get_iv_index(shape_id, ivar_name, &mut ivar_index) }; - let stack_ret = ctx.stack_push(out_type); - asm.mov(stack_ret, out_value); - KeepCompiling + // Guard heap object (recv_opnd must be used before stack_pop) + guard_object_is_heap(asm, recv, SelfOpnd, Counter::definedivar_not_heap); + + let shape_id_offset = unsafe { rb_shape_id_offset() }; + let shape_opnd = Opnd::mem(SHAPE_ID_NUM_BITS as u8, recv, shape_id_offset); + + asm_comment!(asm, "guard shape"); + asm.cmp(shape_opnd, Opnd::UImm(shape_id as u64)); + jit_chain_guard( + JCC_JNE, + jit, + asm, + GET_IVAR_MAX_DEPTH, + Counter::definedivar_megamorphic, + ); + + let result = if ivar_exists { pushval } else { Qnil }; + jit_putobject(asm, result); + + // Jump to next instruction. This allows guard chains to share the same successor. 
+ return jump_to_next_insn(jit, asm); } fn gen_checktype( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - let type_val = jit_get_arg(jit, 0).as_u32(); +) -> Option<CodegenStatus> { + let type_val = jit.get_arg(0).as_u32(); // Only three types are emitted by compile.c at the moment if let RUBY_T_STRING | RUBY_T_ARRAY | RUBY_T_HASH = type_val { - let val_type = ctx.get_opnd_type(StackOpnd(0)); - let val = asm.load(ctx.stack_pop(1)); + let val_type = asm.ctx.get_opnd_type(StackOpnd(0)); + let val = asm.stack_pop(1); // Check if we know from type information match val_type.known_value_type() { Some(value_type) => { if value_type == type_val { - jit_putobject(jit, ctx, asm, Qtrue); - return KeepCompiling; + jit_putobject(asm, Qtrue); + return Some(KeepCompiling); } else { - jit_putobject(jit, ctx, asm, Qfalse); - return KeepCompiling; + jit_putobject(asm, Qfalse); + return Some(KeepCompiling); } }, _ => (), @@ -2487,6 +3438,7 @@ fn gen_checktype( let ret = asm.new_label("ret"); + let val = asm.load(val); if !val_type.is_heap() { // if (SPECIAL_CONST_P(val)) { // Return Qfalse via REG1 if not on heap @@ -2504,27 +3456,25 @@ fn gen_checktype( let ret_opnd = asm.csel_e(Qtrue.into(), Qfalse.into()); asm.write_label(ret); - let stack_ret = ctx.stack_push(Type::UnknownImm); + let stack_ret = asm.stack_push(Type::UnknownImm); asm.mov(stack_ret, ret_opnd); - KeepCompiling + Some(KeepCompiling) } else { - CantCompile + None } } fn gen_concatstrings( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - let n = jit_get_arg(jit, 0).as_usize(); +) -> Option<CodegenStatus> { + let n = jit.get_arg(0).as_usize(); - // Save the PC and SP because we are allocating - jit_prepare_routine_call(jit, ctx, asm); + // rb_str_concat_literals may raise Encoding::CompatibilityError + jit_prepare_non_leaf_call(jit, asm); - let values_ptr = asm.lea(ctx.sp_opnd(-((SIZEOF_VALUE as isize) * n as isize))); + let values_ptr = asm.lea(asm.ctx.sp_opnd(-(n as i32))); // call rb_str_concat_literals(size_t n, const VALUE *strings); let return_value = asm.ccall( @@ -2532,39 +3482,42 @@ fn gen_concatstrings( vec![n.into(), values_ptr] ); - ctx.stack_pop(n); - let stack_ret = ctx.stack_push(Type::CString); + asm.stack_pop(n); + let stack_ret = asm.stack_push(Type::TString); asm.mov(stack_ret, return_value); - KeepCompiling + Some(KeepCompiling) } fn guard_two_fixnums( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, - side_exit: Target ) { + let counter = Counter::guard_send_not_fixnums; + + // Get stack operands without popping them + let arg1 = asm.stack_opnd(0); + let arg0 = asm.stack_opnd(1); + // Get the stack operand types - let arg1_type = ctx.get_opnd_type(StackOpnd(0)); - let arg0_type = ctx.get_opnd_type(StackOpnd(1)); + let arg1_type = asm.ctx.get_opnd_type(arg1.into()); + let arg0_type = asm.ctx.get_opnd_type(arg0.into()); if arg0_type.is_heap() || arg1_type.is_heap() { - asm.comment("arg is heap object"); - asm.jmp(side_exit); + asm_comment!(asm, "arg is heap object"); + asm.jmp(Target::side_exit(counter)); return; } if arg0_type != Type::Fixnum && arg0_type.is_specific() { - asm.comment("arg0 not fixnum"); - asm.jmp(side_exit); + asm_comment!(asm, "arg0 not fixnum"); + asm.jmp(Target::side_exit(counter)); return; } if arg1_type != Type::Fixnum && arg1_type.is_specific() { - asm.comment("arg1 not fixnum"); - asm.jmp(side_exit); + asm_comment!(asm, "arg1 not fixnum"); + 
asm.jmp(Target::side_exit(counter)); return; } @@ -2573,43 +3526,35 @@ fn guard_two_fixnums( assert!(arg0_type == Type::Fixnum || arg0_type.is_unknown()); assert!(arg1_type == Type::Fixnum || arg1_type.is_unknown()); - // Get stack operands without popping them - let arg1 = ctx.stack_opnd(0); - let arg0 = ctx.stack_opnd(1); - // If not fixnums at run-time, fall back if arg0_type != Type::Fixnum { - asm.comment("guard arg0 fixnum"); + asm_comment!(asm, "guard arg0 fixnum"); asm.test(arg0, Opnd::UImm(RUBY_FIXNUM_FLAG as u64)); jit_chain_guard( JCC_JZ, jit, - &ctx, asm, - ocb, SEND_MAX_DEPTH, - side_exit, + counter, ); } if arg1_type != Type::Fixnum { - asm.comment("guard arg1 fixnum"); + asm_comment!(asm, "guard arg1 fixnum"); asm.test(arg1, Opnd::UImm(RUBY_FIXNUM_FLAG as u64)); jit_chain_guard( JCC_JZ, jit, - &ctx, asm, - ocb, SEND_MAX_DEPTH, - side_exit, + counter, ); } // Set stack types in context - ctx.upgrade_opnd_type(StackOpnd(0), Type::Fixnum); - ctx.upgrade_opnd_type(StackOpnd(1), Type::Fixnum); + asm.ctx.upgrade_opnd_type(arg1.into(), Type::Fixnum); + asm.ctx.upgrade_opnd_type(arg0.into(), Type::Fixnum); } // Conditional move operation used by comparison operators @@ -2617,753 +3562,732 @@ type CmovFn = fn(cb: &mut Assembler, opnd0: Opnd, opnd1: Opnd) -> Opnd; fn gen_fixnum_cmp( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, cmov_op: CmovFn, -) -> CodegenStatus { - // Defer compilation so we can specialize base on a runtime receiver - if !jit_at_current_insn(jit) { - defer_compilation(jit, ctx, asm, ocb); - return EndBlock; - } - - let comptime_a = jit_peek_at_stack(jit, ctx, 1); - let comptime_b = jit_peek_at_stack(jit, ctx, 0); - - if comptime_a.fixnum_p() && comptime_b.fixnum_p() { - // Create a side-exit to fall back to the interpreter - // Note: we generate the side-exit before popping operands from the stack - let side_exit = get_side_exit(jit, ocb, ctx); + bop: ruby_basic_operators, +) -> Option<CodegenStatus> { + let two_fixnums = match asm.ctx.two_fixnums_on_stack(jit) { + Some(two_fixnums) => two_fixnums, + None => { + // Defer compilation so we can specialize based on a runtime receiver + return jit.defer_compilation(asm); + } + }; - if !assume_bop_not_redefined(jit, ocb, INTEGER_REDEFINED_OP_FLAG, BOP_LT) { - return CantCompile; + if two_fixnums { + if !assume_bop_not_redefined(jit, asm, INTEGER_REDEFINED_OP_FLAG, bop) { + return None; } // Check that both operands are fixnums - guard_two_fixnums(jit, ctx, asm, ocb, side_exit); + guard_two_fixnums(jit, asm); // Get the operands from the stack - let arg1 = ctx.stack_pop(1); - let arg0 = ctx.stack_pop(1); + let arg1 = asm.stack_pop(1); + let arg0 = asm.stack_pop(1); // Compare the arguments asm.cmp(arg0, arg1); let bool_opnd = cmov_op(asm, Qtrue.into(), Qfalse.into()); // Push the output on the stack - let dst = ctx.stack_push(Type::Unknown); + let dst = asm.stack_push(Type::UnknownImm); asm.mov(dst, bool_opnd); - KeepCompiling + Some(KeepCompiling) } else { - gen_opt_send_without_block(jit, ctx, asm, ocb) + gen_opt_send_without_block(jit, asm) } } fn gen_opt_lt( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - gen_fixnum_cmp(jit, ctx, asm, ocb, Assembler::csel_l) +) -> Option<CodegenStatus> { + gen_fixnum_cmp(jit, asm, Assembler::csel_l, BOP_LT) } fn gen_opt_le( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - gen_fixnum_cmp(jit, ctx, asm, ocb, 
Assembler::csel_le) +) -> Option<CodegenStatus> { + gen_fixnum_cmp(jit, asm, Assembler::csel_le, BOP_LE) } fn gen_opt_ge( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - gen_fixnum_cmp(jit, ctx, asm, ocb, Assembler::csel_ge) +) -> Option<CodegenStatus> { + gen_fixnum_cmp(jit, asm, Assembler::csel_ge, BOP_GE) } fn gen_opt_gt( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - gen_fixnum_cmp(jit, ctx, asm, ocb, Assembler::csel_g) +) -> Option<CodegenStatus> { + gen_fixnum_cmp(jit, asm, Assembler::csel_g, BOP_GT) } // Implements specialized equality for either two fixnum or two strings -// Returns true if code was generated, otherwise false +// Returns None if enough type information isn't available, Some(true) +// if code was generated, otherwise Some(false). fn gen_equality_specialized( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, - side_exit: Target, -) -> bool { - let comptime_a = jit_peek_at_stack(jit, ctx, 1); - let comptime_b = jit_peek_at_stack(jit, ctx, 0); + gen_eq: bool, +) -> Option<bool> { + let a_opnd = asm.stack_opnd(1); + let b_opnd = asm.stack_opnd(0); - let a_opnd = ctx.stack_opnd(1); - let b_opnd = ctx.stack_opnd(0); + let two_fixnums = match asm.ctx.two_fixnums_on_stack(jit) { + Some(two_fixnums) => two_fixnums, + None => return None, + }; - if comptime_a.fixnum_p() && comptime_b.fixnum_p() { - if !assume_bop_not_redefined(jit, ocb, INTEGER_REDEFINED_OP_FLAG, BOP_EQ) { + if two_fixnums { + if !assume_bop_not_redefined(jit, asm, INTEGER_REDEFINED_OP_FLAG, BOP_EQ) { // if overridden, emit the generic version - return false; + return Some(false); } - guard_two_fixnums(jit, ctx, asm, ocb, side_exit); + guard_two_fixnums(jit, asm); asm.cmp(a_opnd, b_opnd); - - let val = asm.csel_ne(Qfalse.into(), Qtrue.into()); + let val = if gen_eq { + asm.csel_e(Qtrue.into(), Qfalse.into()) + } else { + asm.csel_ne(Qtrue.into(), Qfalse.into()) + }; // Push the output on the stack - ctx.stack_pop(2); - let dst = ctx.stack_push(Type::UnknownImm); + asm.stack_pop(2); + let dst = asm.stack_push(Type::UnknownImm); asm.mov(dst, val); - true + return Some(true); } - else if unsafe { comptime_a.class_of() == rb_cString && comptime_b.class_of() == rb_cString } - { - if !assume_bop_not_redefined(jit, ocb, STRING_REDEFINED_OP_FLAG, BOP_EQ) { + + if !jit.at_compile_target() { + return None; + } + let comptime_a = jit.peek_at_stack(&asm.ctx, 1); + let comptime_b = jit.peek_at_stack(&asm.ctx, 0); + + if unsafe { comptime_a.class_of() == rb_cString && comptime_b.class_of() == rb_cString } { + if !assume_bop_not_redefined(jit, asm, STRING_REDEFINED_OP_FLAG, BOP_EQ) { // if overridden, emit the generic version - return false; + return Some(false); } // Guard that a is a String jit_guard_known_klass( jit, - ctx, asm, - ocb, - unsafe { rb_cString }, a_opnd, - StackOpnd(1), + a_opnd.into(), comptime_a, SEND_MAX_DEPTH, - side_exit, + Counter::guard_send_not_string, ); let equal = asm.new_label("equal"); let ret = asm.new_label("ret"); + // Spill for ccall. For safety, unconditionally spill temps before branching. 
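    // (Likely rationale: the fast path that takes the `equal` branch skips the
    // ccall below, and a C call needs stack temporaries in memory rather than
    // in registers, so spilling before the branch keeps both paths, which
    // rejoin at `ret`, in a consistent register state.)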
+ asm.spill_regs(); + // If they are equal by identity, return true asm.cmp(a_opnd, b_opnd); asm.je(equal); // Otherwise guard that b is a T_STRING (from type info) or String (from runtime guard) - let btype = ctx.get_opnd_type(StackOpnd(0)); + let btype = asm.ctx.get_opnd_type(b_opnd.into()); if btype.known_value_type() != Some(RUBY_T_STRING) { // Note: any T_STRING is valid here, but we check for a ::String for simplicity // To pass a mutable static variable (rb_cString) requires an unsafe block jit_guard_known_klass( jit, - ctx, asm, - ocb, - unsafe { rb_cString }, b_opnd, - StackOpnd(0), + b_opnd.into(), comptime_b, SEND_MAX_DEPTH, - side_exit, + Counter::guard_send_not_string, ); } // Call rb_str_eql_internal(a, b) - let val = asm.ccall(rb_str_eql_internal as *const u8, vec![a_opnd, b_opnd]); + let val = asm.ccall( + if gen_eq { rb_str_eql_internal } else { rb_str_neq_internal } as *const u8, + vec![a_opnd, b_opnd], + ); // Push the output on the stack - ctx.stack_pop(2); - let dst = ctx.stack_push(Type::UnknownImm); + asm.stack_pop(2); + let dst = asm.stack_push(Type::UnknownImm); asm.mov(dst, val); asm.jmp(ret); asm.write_label(equal); - asm.mov(dst, Qtrue.into()); + asm.mov(dst, if gen_eq { Qtrue } else { Qfalse }.into()); asm.write_label(ret); - true + Some(true) } else { - false + Some(false) } } fn gen_opt_eq( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - // Defer compilation so we can specialize base on a runtime receiver - if !jit_at_current_insn(jit) { - defer_compilation(jit, ctx, asm, ocb); - return EndBlock; - } - - // Create a side-exit to fall back to the interpreter - let side_exit = get_side_exit(jit, ocb, ctx); +) -> Option<CodegenStatus> { + let specialized = match gen_equality_specialized(jit, asm, true) { + Some(specialized) => specialized, + None => { + // Defer compilation so we can specialize base on a runtime receiver + return jit.defer_compilation(asm); + } + }; - if gen_equality_specialized(jit, ctx, asm, ocb, side_exit) { - jump_to_next_insn(jit, ctx, asm, ocb); - EndBlock + if specialized { + jump_to_next_insn(jit, asm) } else { - gen_opt_send_without_block(jit, ctx, asm, ocb) + gen_opt_send_without_block(jit, asm) } } fn gen_opt_neq( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { +) -> Option<CodegenStatus> { // opt_neq is passed two rb_call_data as arguments: // first for ==, second for != - let cd = jit_get_arg(jit, 1).as_ptr(); - return gen_send_general(jit, ctx, asm, ocb, cd, None); + let cd = jit.get_arg(1).as_ptr(); + perf_call! 
{ gen_send_general(jit, asm, cd, None) } } fn gen_opt_aref( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - let cd: *const rb_call_data = jit_get_arg(jit, 0).as_ptr(); +) -> Option<CodegenStatus> { + let cd: *const rb_call_data = jit.get_arg(0).as_ptr(); let argc = unsafe { vm_ci_argc((*cd).ci) }; // Only JIT one arg calls like `ary[6]` if argc != 1 { - gen_counter_incr!(asm, oaref_argc_not_one); - return CantCompile; + gen_counter_incr(jit, asm, Counter::opt_aref_argc_not_one); + return None; } // Defer compilation so we can specialize base on a runtime receiver - if !jit_at_current_insn(jit) { - defer_compilation(jit, ctx, asm, ocb); - return EndBlock; + if !jit.at_compile_target() { + return jit.defer_compilation(asm); } // Specialize base on compile time values - let comptime_idx = jit_peek_at_stack(jit, ctx, 0); - let comptime_recv = jit_peek_at_stack(jit, ctx, 1); - - // Create a side-exit to fall back to the interpreter - let side_exit = get_side_exit(jit, ocb, ctx); + let comptime_idx = jit.peek_at_stack(&asm.ctx, 0); + let comptime_recv = jit.peek_at_stack(&asm.ctx, 1); if comptime_recv.class_of() == unsafe { rb_cArray } && comptime_idx.fixnum_p() { - if !assume_bop_not_redefined(jit, ocb, ARRAY_REDEFINED_OP_FLAG, BOP_AREF) { - return CantCompile; + if !assume_bop_not_redefined(jit, asm, ARRAY_REDEFINED_OP_FLAG, BOP_AREF) { + return None; } // Get the stack operands - let idx_opnd = ctx.stack_opnd(0); - let recv_opnd = ctx.stack_opnd(1); + let idx_opnd = asm.stack_opnd(0); + let recv_opnd = asm.stack_opnd(1); // Guard that the receiver is an ::Array // BOP_AREF check above is only good for ::Array. jit_guard_known_klass( jit, - ctx, asm, - ocb, - unsafe { rb_cArray }, recv_opnd, - StackOpnd(1), + recv_opnd.into(), comptime_recv, OPT_AREF_MAX_CHAIN_DEPTH, - side_exit, + Counter::opt_aref_not_array, ); // Bail if idx is not a FIXNUM let idx_reg = asm.load(idx_opnd); asm.test(idx_reg, (RUBY_FIXNUM_FLAG as u64).into()); - asm.jz(counted_exit!(ocb, side_exit, oaref_arg_not_fixnum)); + asm.jz(Target::side_exit(Counter::opt_aref_arg_not_fixnum)); // Call VALUE rb_ary_entry_internal(VALUE ary, long offset). // It never raises or allocates, so we don't need to write to cfp->pc. { + // Pop the argument and the receiver + asm.stack_pop(2); + let idx_reg = asm.rshift(idx_reg, Opnd::UImm(1)); // Convert fixnum to int let val = asm.ccall(rb_ary_entry_internal as *const u8, vec![recv_opnd, idx_reg]); - // Pop the argument and the receiver - ctx.stack_pop(2); - // Push the return value onto the stack - let stack_ret = ctx.stack_push(Type::Unknown); + let stack_ret = asm.stack_push(Type::Unknown); asm.mov(stack_ret, val); } // Jump to next instruction. This allows guard chains to share the same successor. - jump_to_next_insn(jit, ctx, asm, ocb); - return EndBlock; + return jump_to_next_insn(jit, asm); } else if comptime_recv.class_of() == unsafe { rb_cHash } { - if !assume_bop_not_redefined(jit, ocb, HASH_REDEFINED_OP_FLAG, BOP_AREF) { - return CantCompile; + if !assume_bop_not_redefined(jit, asm, HASH_REDEFINED_OP_FLAG, BOP_AREF) { + return None; } - let recv_opnd = ctx.stack_opnd(1); + let recv_opnd = asm.stack_opnd(1); // Guard that the receiver is a hash jit_guard_known_klass( jit, - ctx, asm, - ocb, - unsafe { rb_cHash }, recv_opnd, - StackOpnd(1), + recv_opnd.into(), comptime_recv, OPT_AREF_MAX_CHAIN_DEPTH, - side_exit, + Counter::opt_aref_not_hash, ); // Prepare to call rb_hash_aref(). It might call #hash on the key. 
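    // #hash (or #eql?) on the key can run arbitrary Ruby code, so the call may
    // allocate, raise, or inspect the stack; the PC and SP therefore have to be
    // written back to the control frame first, which is what the non-leaf call
    // preparation below does.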
- jit_prepare_routine_call(jit, ctx, asm); + jit_prepare_non_leaf_call(jit, asm); // Call rb_hash_aref - let key_opnd = ctx.stack_opnd(0); - let recv_opnd = ctx.stack_opnd(1); + let key_opnd = asm.stack_opnd(0); + let recv_opnd = asm.stack_opnd(1); let val = asm.ccall(rb_hash_aref as *const u8, vec![recv_opnd, key_opnd]); // Pop the key and the receiver - ctx.stack_pop(2); + asm.stack_pop(2); // Push the return value onto the stack - let stack_ret = ctx.stack_push(Type::Unknown); + let stack_ret = asm.stack_push(Type::Unknown); asm.mov(stack_ret, val); // Jump to next instruction. This allows guard chains to share the same successor. - jump_to_next_insn(jit, ctx, asm, ocb); - EndBlock + jump_to_next_insn(jit, asm) } else { // General case. Call the [] method. - gen_opt_send_without_block(jit, ctx, asm, ocb) + gen_opt_send_without_block(jit, asm) } } fn gen_opt_aset( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { +) -> Option<CodegenStatus> { // Defer compilation so we can specialize on a runtime `self` - if !jit_at_current_insn(jit) { - defer_compilation(jit, ctx, asm, ocb); - return EndBlock; + if !jit.at_compile_target() { + return jit.defer_compilation(asm); } - let comptime_recv = jit_peek_at_stack(jit, ctx, 2); - let comptime_key = jit_peek_at_stack(jit, ctx, 1); + let comptime_recv = jit.peek_at_stack(&asm.ctx, 2); + let comptime_key = jit.peek_at_stack(&asm.ctx, 1); // Get the operands from the stack - let recv = ctx.stack_opnd(2); - let key = ctx.stack_opnd(1); - let _val = ctx.stack_opnd(0); + let recv = asm.stack_opnd(2); + let key = asm.stack_opnd(1); + let _val = asm.stack_opnd(0); if comptime_recv.class_of() == unsafe { rb_cArray } && comptime_key.fixnum_p() { - let side_exit = get_side_exit(jit, ocb, ctx); - // Guard receiver is an Array jit_guard_known_klass( jit, - ctx, asm, - ocb, - unsafe { rb_cArray }, recv, - StackOpnd(2), + recv.into(), comptime_recv, SEND_MAX_DEPTH, - side_exit, + Counter::opt_aset_not_array, ); // Guard key is a fixnum jit_guard_known_klass( jit, - ctx, asm, - ocb, - unsafe { rb_cInteger }, key, - StackOpnd(1), + key.into(), comptime_key, SEND_MAX_DEPTH, - side_exit, + Counter::opt_aset_not_fixnum, ); // We might allocate or raise - jit_prepare_routine_call(jit, ctx, asm); + jit_prepare_non_leaf_call(jit, asm); // Call rb_ary_store - let recv = ctx.stack_opnd(2); - let key = asm.load(ctx.stack_opnd(1)); + let recv = asm.stack_opnd(2); + let key = asm.load(asm.stack_opnd(1)); let key = asm.rshift(key, Opnd::UImm(1)); // FIX2LONG(key) - let val = ctx.stack_opnd(0); + let val = asm.stack_opnd(0); asm.ccall(rb_ary_store as *const u8, vec![recv, key, val]); // rb_ary_store returns void // stored value should still be on stack - let val = asm.load(ctx.stack_opnd(0)); + let val = asm.load(asm.stack_opnd(0)); // Push the return value onto the stack - ctx.stack_pop(3); - let stack_ret = ctx.stack_push(Type::Unknown); + asm.stack_pop(3); + let stack_ret = asm.stack_push(Type::Unknown); asm.mov(stack_ret, val); - jump_to_next_insn(jit, ctx, asm, ocb); - return EndBlock; + return jump_to_next_insn(jit, asm) } else if comptime_recv.class_of() == unsafe { rb_cHash } { - let side_exit = get_side_exit(jit, ocb, ctx); - // Guard receiver is a Hash jit_guard_known_klass( jit, - ctx, asm, - ocb, - unsafe { rb_cHash }, recv, - StackOpnd(2), + recv.into(), comptime_recv, SEND_MAX_DEPTH, - side_exit, + Counter::opt_aset_not_hash, ); // We might allocate or raise - jit_prepare_routine_call(jit, ctx, asm); + 
jit_prepare_non_leaf_call(jit, asm); // Call rb_hash_aset - let recv = ctx.stack_opnd(2); - let key = ctx.stack_opnd(1); - let val = ctx.stack_opnd(0); + let recv = asm.stack_opnd(2); + let key = asm.stack_opnd(1); + let val = asm.stack_opnd(0); let ret = asm.ccall(rb_hash_aset as *const u8, vec![recv, key, val]); // Push the return value onto the stack - ctx.stack_pop(3); - let stack_ret = ctx.stack_push(Type::Unknown); + asm.stack_pop(3); + let stack_ret = asm.stack_push(Type::Unknown); asm.mov(stack_ret, ret); - jump_to_next_insn(jit, ctx, asm, ocb); - EndBlock + jump_to_next_insn(jit, asm) } else { - gen_opt_send_without_block(jit, ctx, asm, ocb) + gen_opt_send_without_block(jit, asm) } } fn gen_opt_and( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - // Defer compilation so we can specialize on a runtime `self` - if !jit_at_current_insn(jit) { - defer_compilation(jit, ctx, asm, ocb); - return EndBlock; - } - - let comptime_a = jit_peek_at_stack(jit, ctx, 1); - let comptime_b = jit_peek_at_stack(jit, ctx, 0); - - if comptime_a.fixnum_p() && comptime_b.fixnum_p() { - // Create a side-exit to fall back to the interpreter - // Note: we generate the side-exit before popping operands from the stack - let side_exit = get_side_exit(jit, ocb, ctx); +) -> Option<CodegenStatus> { + let two_fixnums = match asm.ctx.two_fixnums_on_stack(jit) { + Some(two_fixnums) => two_fixnums, + None => { + // Defer compilation so we can specialize on a runtime `self` + return jit.defer_compilation(asm); + } + }; - if !assume_bop_not_redefined(jit, ocb, INTEGER_REDEFINED_OP_FLAG, BOP_AND) { - return CantCompile; + if two_fixnums { + if !assume_bop_not_redefined(jit, asm, INTEGER_REDEFINED_OP_FLAG, BOP_AND) { + return None; } // Check that both operands are fixnums - guard_two_fixnums(jit, ctx, asm, ocb, side_exit); + guard_two_fixnums(jit, asm); // Get the operands and destination from the stack - let arg1 = ctx.stack_pop(1); - let arg0 = ctx.stack_pop(1); + let arg1 = asm.stack_pop(1); + let arg0 = asm.stack_pop(1); // Do the bitwise and arg0 & arg1 let val = asm.and(arg0, arg1); // Push the output on the stack - let dst = ctx.stack_push(Type::Fixnum); - asm.store(dst, val); + let dst = asm.stack_push(Type::Fixnum); + asm.mov(dst, val); - KeepCompiling + Some(KeepCompiling) } else { // Delegate to send, call the method on the recv - gen_opt_send_without_block(jit, ctx, asm, ocb) + gen_opt_send_without_block(jit, asm) } } fn gen_opt_or( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - // Defer compilation so we can specialize on a runtime `self` - if !jit_at_current_insn(jit) { - defer_compilation(jit, ctx, asm, ocb); - return EndBlock; - } - - let comptime_a = jit_peek_at_stack(jit, ctx, 1); - let comptime_b = jit_peek_at_stack(jit, ctx, 0); - - if comptime_a.fixnum_p() && comptime_b.fixnum_p() { - // Create a side-exit to fall back to the interpreter - // Note: we generate the side-exit before popping operands from the stack - let side_exit = get_side_exit(jit, ocb, ctx); +) -> Option<CodegenStatus> { + let two_fixnums = match asm.ctx.two_fixnums_on_stack(jit) { + Some(two_fixnums) => two_fixnums, + None => { + // Defer compilation so we can specialize on a runtime `self` + return jit.defer_compilation(asm); + } + }; - if !assume_bop_not_redefined(jit, ocb, INTEGER_REDEFINED_OP_FLAG, BOP_OR) { - return CantCompile; + if two_fixnums { + if !assume_bop_not_redefined(jit, asm, 
INTEGER_REDEFINED_OP_FLAG, BOP_OR) { + return None; } // Check that both operands are fixnums - guard_two_fixnums(jit, ctx, asm, ocb, side_exit); + guard_two_fixnums(jit, asm); // Get the operands and destination from the stack - let arg1 = ctx.stack_pop(1); - let arg0 = ctx.stack_pop(1); + let arg1 = asm.stack_pop(1); + let arg0 = asm.stack_pop(1); // Do the bitwise or arg0 | arg1 let val = asm.or(arg0, arg1); // Push the output on the stack - let dst = ctx.stack_push(Type::Fixnum); - asm.store(dst, val); + let dst = asm.stack_push(Type::Fixnum); + asm.mov(dst, val); - KeepCompiling + Some(KeepCompiling) } else { // Delegate to send, call the method on the recv - gen_opt_send_without_block(jit, ctx, asm, ocb) + gen_opt_send_without_block(jit, asm) } } fn gen_opt_minus( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - // Defer compilation so we can specialize on a runtime `self` - if !jit_at_current_insn(jit) { - defer_compilation(jit, ctx, asm, ocb); - return EndBlock; - } - - let comptime_a = jit_peek_at_stack(jit, ctx, 1); - let comptime_b = jit_peek_at_stack(jit, ctx, 0); - - if comptime_a.fixnum_p() && comptime_b.fixnum_p() { - // Create a side-exit to fall back to the interpreter - // Note: we generate the side-exit before popping operands from the stack - let side_exit = get_side_exit(jit, ocb, ctx); +) -> Option<CodegenStatus> { + let two_fixnums = match asm.ctx.two_fixnums_on_stack(jit) { + Some(two_fixnums) => two_fixnums, + None => { + // Defer compilation so we can specialize on a runtime `self` + return jit.defer_compilation(asm); + } + }; - if !assume_bop_not_redefined(jit, ocb, INTEGER_REDEFINED_OP_FLAG, BOP_MINUS) { - return CantCompile; + if two_fixnums { + if !assume_bop_not_redefined(jit, asm, INTEGER_REDEFINED_OP_FLAG, BOP_MINUS) { + return None; } // Check that both operands are fixnums - guard_two_fixnums(jit, ctx, asm, ocb, side_exit); + guard_two_fixnums(jit, asm); // Get the operands and destination from the stack - let arg1 = ctx.stack_pop(1); - let arg0 = ctx.stack_pop(1); + let arg1 = asm.stack_pop(1); + let arg0 = asm.stack_pop(1); // Subtract arg0 - arg1 and test for overflow let val_untag = asm.sub(arg0, arg1); - asm.jo(side_exit); + asm.jo(Target::side_exit(Counter::opt_minus_overflow)); let val = asm.add(val_untag, Opnd::Imm(1)); // Push the output on the stack - let dst = ctx.stack_push(Type::Fixnum); - asm.store(dst, val); + let dst = asm.stack_push(Type::Fixnum); + asm.mov(dst, val); - KeepCompiling + Some(KeepCompiling) } else { // Delegate to send, call the method on the recv - gen_opt_send_without_block(jit, ctx, asm, ocb) + gen_opt_send_without_block(jit, asm) } } fn gen_opt_mult( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - // Delegate to send, call the method on the recv - gen_opt_send_without_block(jit, ctx, asm, ocb) +) -> Option<CodegenStatus> { + let two_fixnums = match asm.ctx.two_fixnums_on_stack(jit) { + Some(two_fixnums) => two_fixnums, + None => { + return jit.defer_compilation(asm); + } + }; + + // Fallback to a method call if it overflows + if two_fixnums && asm.ctx.get_chain_depth() == 0 { + if !assume_bop_not_redefined(jit, asm, INTEGER_REDEFINED_OP_FLAG, BOP_MULT) { + return None; + } + + // Check that both operands are fixnums + guard_two_fixnums(jit, asm); + + // Get the operands from the stack + let arg1 = asm.stack_pop(1); + let arg0 = asm.stack_pop(1); + + // Do some bitwise gymnastics to handle tag bits + 
// x * y is translated to (x >> 1) * (y - 1) + 1 + let arg0_untag = asm.rshift(arg0, Opnd::UImm(1)); + let arg1_untag = asm.sub(arg1, Opnd::UImm(1)); + let out_val = asm.mul(arg0_untag, arg1_untag); + jit_chain_guard(JCC_JO_MUL, jit, asm, 1, Counter::opt_mult_overflow); + let out_val = asm.add(out_val, Opnd::UImm(1)); + + // Push the output on the stack + let dst = asm.stack_push(Type::Fixnum); + asm.mov(dst, out_val); + + Some(KeepCompiling) + } else { + gen_opt_send_without_block(jit, asm) + } } fn gen_opt_div( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { +) -> Option<CodegenStatus> { // Delegate to send, call the method on the recv - gen_opt_send_without_block(jit, ctx, asm, ocb) + gen_opt_send_without_block(jit, asm) } fn gen_opt_mod( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - // Defer compilation so we can specialize on a runtime `self` - if !jit_at_current_insn(jit) { - defer_compilation(jit, ctx, asm, ocb); - return EndBlock; - } - - let comptime_a = jit_peek_at_stack(jit, ctx, 1); - let comptime_b = jit_peek_at_stack(jit, ctx, 0); - - if comptime_a.fixnum_p() && comptime_b.fixnum_p() { - // Create a side-exit to fall back to the interpreter - // Note: we generate the side-exit before popping operands from the stack - let side_exit = get_side_exit(jit, ocb, ctx); +) -> Option<CodegenStatus> { + let two_fixnums = match asm.ctx.two_fixnums_on_stack(jit) { + Some(two_fixnums) => two_fixnums, + None => { + // Defer compilation so we can specialize on a runtime `self` + return jit.defer_compilation(asm); + } + }; - if !assume_bop_not_redefined(jit, ocb, INTEGER_REDEFINED_OP_FLAG, BOP_MOD) { - return CantCompile; + if two_fixnums { + if !assume_bop_not_redefined(jit, asm, INTEGER_REDEFINED_OP_FLAG, BOP_MOD) { + return None; } // Check that both operands are fixnums - guard_two_fixnums(jit, ctx, asm, ocb, side_exit); + guard_two_fixnums(jit, asm); // Get the operands and destination from the stack - let arg1 = ctx.stack_pop(1); - let arg0 = ctx.stack_pop(1); + let arg1 = asm.stack_pop(1); + let arg0 = asm.stack_pop(1); // Check for arg0 % 0 asm.cmp(arg1, Opnd::Imm(VALUE::fixnum_from_usize(0).as_i64())); - asm.je(side_exit); + asm.je(Target::side_exit(Counter::opt_mod_zero)); // Call rb_fix_mod_fix(VALUE recv, VALUE obj) let ret = asm.ccall(rb_fix_mod_fix as *const u8, vec![arg0, arg1]); // Push the return value onto the stack - let stack_ret = ctx.stack_push(Type::Unknown); + // When the two arguments are fixnums, the modulo output is always a fixnum + let stack_ret = asm.stack_push(Type::Fixnum); asm.mov(stack_ret, ret); - KeepCompiling + Some(KeepCompiling) } else { // Delegate to send, call the method on the recv - gen_opt_send_without_block(jit, ctx, asm, ocb) + gen_opt_send_without_block(jit, asm) } } fn gen_opt_ltlt( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { +) -> Option<CodegenStatus> { // Delegate to send, call the method on the recv - gen_opt_send_without_block(jit, ctx, asm, ocb) + gen_opt_send_without_block(jit, asm) } fn gen_opt_nil_p( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { +) -> Option<CodegenStatus> { // Delegate to send, call the method on the recv - gen_opt_send_without_block(jit, ctx, asm, ocb) + gen_opt_send_without_block(jit, asm) } fn gen_opt_empty_p( jit: &mut JITState, - ctx: &mut Context, asm: &mut 
Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { +) -> Option<CodegenStatus> { // Delegate to send, call the method on the recv - gen_opt_send_without_block(jit, ctx, asm, ocb) + gen_opt_send_without_block(jit, asm) } fn gen_opt_succ( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { +) -> Option<CodegenStatus> { // Delegate to send, call the method on the recv - gen_opt_send_without_block(jit, ctx, asm, ocb) + gen_opt_send_without_block(jit, asm) } - fn gen_opt_str_freeze( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - if !assume_bop_not_redefined(jit, ocb, STRING_REDEFINED_OP_FLAG, BOP_FREEZE) { - return CantCompile; +) -> Option<CodegenStatus> { + if !assume_bop_not_redefined(jit, asm, STRING_REDEFINED_OP_FLAG, BOP_FREEZE) { + return None; } - let str = jit_get_arg(jit, 0); + let str = jit.get_arg(0); // Push the return value onto the stack - let stack_ret = ctx.stack_push(Type::CString); + let stack_ret = asm.stack_push(Type::CString); asm.mov(stack_ret, str.into()); - KeepCompiling + Some(KeepCompiling) +} + +fn gen_opt_ary_freeze( + jit: &mut JITState, + asm: &mut Assembler, +) -> Option<CodegenStatus> { + if !assume_bop_not_redefined(jit, asm, ARRAY_REDEFINED_OP_FLAG, BOP_FREEZE) { + return None; + } + + let ary = jit.get_arg(0); + + // Push the return value onto the stack + let stack_ret = asm.stack_push(Type::CArray); + asm.mov(stack_ret, ary.into()); + + Some(KeepCompiling) +} + +fn gen_opt_hash_freeze( + jit: &mut JITState, + asm: &mut Assembler, +) -> Option<CodegenStatus> { + if !assume_bop_not_redefined(jit, asm, HASH_REDEFINED_OP_FLAG, BOP_FREEZE) { + return None; + } + + let hash = jit.get_arg(0); + + // Push the return value onto the stack + let stack_ret = asm.stack_push(Type::CHash); + asm.mov(stack_ret, hash.into()); + + Some(KeepCompiling) } fn gen_opt_str_uminus( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - if !assume_bop_not_redefined(jit, ocb, STRING_REDEFINED_OP_FLAG, BOP_UMINUS) { - return CantCompile; +) -> Option<CodegenStatus> { + if !assume_bop_not_redefined(jit, asm, STRING_REDEFINED_OP_FLAG, BOP_UMINUS) { + return None; } - let str = jit_get_arg(jit, 0); + let str = jit.get_arg(0); // Push the return value onto the stack - let stack_ret = ctx.stack_push(Type::CString); + let stack_ret = asm.stack_push(Type::CString); asm.mov(stack_ret, str.into()); - KeepCompiling + Some(KeepCompiling) } fn gen_opt_newarray_max( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - let num = jit_get_arg(jit, 0).as_u32(); +) -> Option<CodegenStatus> { + let num = jit.get_arg(0).as_u32(); - // Save the PC and SP because we may allocate - jit_prepare_routine_call(jit, ctx, asm); + // Save the PC and SP because we may call #max + jit_prepare_non_leaf_call(jit, asm); extern "C" { fn rb_vm_opt_newarray_max(ec: EcPtr, num: u32, elts: *const VALUE) -> VALUE; } - let offset_magnitude = (SIZEOF_VALUE as u32) * num; - let values_opnd = ctx.sp_opnd(-(offset_magnitude as isize)); + let values_opnd = asm.ctx.sp_opnd(-(num as i32)); let values_ptr = asm.lea(values_opnd); let val_opnd = asm.ccall( @@ -3375,31 +4299,210 @@ fn gen_opt_newarray_max( ], ); - ctx.stack_pop(num.as_usize()); - let stack_ret = ctx.stack_push(Type::Unknown); + asm.stack_pop(num.as_usize()); + let stack_ret = asm.stack_push(Type::Unknown); + asm.mov(stack_ret, val_opnd); 
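// The fixnum fast paths above (gen_opt_minus, gen_opt_mult) operate directly on
// tagged VALUEs, where a fixnum n is encoded as (n << 1) | 1. The following is a
// minimal standalone sketch of that arithmetic; the helper names are illustrative
// only (not YJIT APIs), and it omits the overflow guards the JIT emits via
// asm.jo / JCC_JO_MUL.

fn fixnum_encode(n: i64) -> i64 { (n << 1) | 1 }
fn fixnum_decode(v: i64) -> i64 { v >> 1 }

// opt_minus: subtracting two tagged values cancels both tag bits, so add one back.
fn fixnum_sub(a: i64, b: i64) -> i64 { (a - b) + 1 }

// opt_mult: a >> 1 recovers x, b - 1 gives 2*y, the product is 2*x*y, and
// adding 1 re-tags it as the fixnum for x*y.
fn fixnum_mul(a: i64, b: i64) -> i64 { (a >> 1) * (b - 1) + 1 }

fn main() {
    let (a, b) = (fixnum_encode(6), fixnum_encode(7));
    assert_eq!(fixnum_decode(fixnum_sub(b, a)), 1);  // 7 - 6
    assert_eq!(fixnum_decode(fixnum_mul(a, b)), 42); // 6 * 7
}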
+ + Some(KeepCompiling) +} + +fn gen_opt_duparray_send( + jit: &mut JITState, + asm: &mut Assembler, +) -> Option<CodegenStatus> { + let method = jit.get_arg(1).as_u64(); + + if method == ID!(include_p) { + gen_opt_duparray_send_include_p(jit, asm) + } else { + None + } +} + +fn gen_opt_duparray_send_include_p( + jit: &mut JITState, + asm: &mut Assembler, +) -> Option<CodegenStatus> { + asm_comment!(asm, "opt_duparray_send include_p"); + + let ary = jit.get_arg(0); + let argc = jit.get_arg(2).as_usize(); + + // Save the PC and SP because we may call #include? + jit_prepare_non_leaf_call(jit, asm); + + extern "C" { + fn rb_vm_opt_duparray_include_p(ec: EcPtr, ary: VALUE, target: VALUE) -> VALUE; + } + + let target = asm.ctx.sp_opnd(-1); + + let val_opnd = asm.ccall( + rb_vm_opt_duparray_include_p as *const u8, + vec![ + EC, + ary.into(), + target, + ], + ); + + asm.stack_pop(argc); + let stack_ret = asm.stack_push(Type::Unknown); + asm.mov(stack_ret, val_opnd); + + Some(KeepCompiling) +} + +fn gen_opt_newarray_send( + jit: &mut JITState, + asm: &mut Assembler, +) -> Option<CodegenStatus> { + let method = jit.get_arg(1).as_u32(); + + if method == VM_OPT_NEWARRAY_SEND_MIN { + gen_opt_newarray_min(jit, asm) + } else if method == VM_OPT_NEWARRAY_SEND_MAX { + gen_opt_newarray_max(jit, asm) + } else if method == VM_OPT_NEWARRAY_SEND_HASH { + gen_opt_newarray_hash(jit, asm) + } else if method == VM_OPT_NEWARRAY_SEND_INCLUDE_P { + gen_opt_newarray_include_p(jit, asm) + } else if method == VM_OPT_NEWARRAY_SEND_PACK { + gen_opt_newarray_pack_buffer(jit, asm, 1, None) + } else if method == VM_OPT_NEWARRAY_SEND_PACK_BUFFER { + gen_opt_newarray_pack_buffer(jit, asm, 2, Some(1)) + } else { + None + } +} + +fn gen_opt_newarray_pack_buffer( + jit: &mut JITState, + asm: &mut Assembler, + fmt_offset: u32, + buffer: Option<u32>, +) -> Option<CodegenStatus> { + asm_comment!(asm, "opt_newarray_send pack"); + + let num = jit.get_arg(0).as_u32(); + + // Save the PC and SP because we may call #pack + jit_prepare_non_leaf_call(jit, asm); + + extern "C" { + fn rb_vm_opt_newarray_pack_buffer(ec: EcPtr, num: u32, elts: *const VALUE, fmt: VALUE, buffer: VALUE) -> VALUE; + } + + let values_opnd = asm.ctx.sp_opnd(-(num as i32)); + let values_ptr = asm.lea(values_opnd); + + let fmt_string = asm.ctx.sp_opnd(-(fmt_offset as i32)); + + let val_opnd = asm.ccall( + rb_vm_opt_newarray_pack_buffer as *const u8, + vec![ + EC, + (num - fmt_offset).into(), + values_ptr, + fmt_string, + match buffer { + None => Qundef.into(), + Some(i) => asm.ctx.sp_opnd(-(i as i32)), + }, + ], + ); + + asm.stack_pop(num.as_usize()); + let stack_ret = asm.stack_push(Type::CString); + asm.mov(stack_ret, val_opnd); + + Some(KeepCompiling) +} + +fn gen_opt_newarray_hash( + jit: &mut JITState, + asm: &mut Assembler, +) -> Option<CodegenStatus> { + + let num = jit.get_arg(0).as_u32(); + + // Save the PC and SP because we may call #hash + jit_prepare_non_leaf_call(jit, asm); + + extern "C" { + fn rb_vm_opt_newarray_hash(ec: EcPtr, num: u32, elts: *const VALUE) -> VALUE; + } + + let values_opnd = asm.ctx.sp_opnd(-(num as i32)); + let values_ptr = asm.lea(values_opnd); + + let val_opnd = asm.ccall( + rb_vm_opt_newarray_hash as *const u8, + vec![ + EC, + num.into(), + values_ptr + ], + ); + + asm.stack_pop(num.as_usize()); + let stack_ret = asm.stack_push(Type::Unknown); asm.mov(stack_ret, val_opnd); - KeepCompiling + Some(KeepCompiling) +} + +fn gen_opt_newarray_include_p( + jit: &mut JITState, + asm: &mut Assembler, +) -> Option<CodegenStatus> { + 
asm_comment!(asm, "opt_newarray_send include?"); + + let num = jit.get_arg(0).as_u32(); + + // Save the PC and SP because we may call customized methods. + jit_prepare_non_leaf_call(jit, asm); + + extern "C" { + fn rb_vm_opt_newarray_include_p(ec: EcPtr, num: u32, elts: *const VALUE, target: VALUE) -> VALUE; + } + + let values_opnd = asm.ctx.sp_opnd(-(num as i32)); + let values_ptr = asm.lea(values_opnd); + let target = asm.ctx.sp_opnd(-1); + + let val_opnd = asm.ccall( + rb_vm_opt_newarray_include_p as *const u8, + vec![ + EC, + (num - 1).into(), + values_ptr, + target + ], + ); + + asm.stack_pop(num.as_usize()); + let stack_ret = asm.stack_push(Type::Unknown); + asm.mov(stack_ret, val_opnd); + + Some(KeepCompiling) } fn gen_opt_newarray_min( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { +) -> Option<CodegenStatus> { - let num = jit_get_arg(jit, 0).as_u32(); + let num = jit.get_arg(0).as_u32(); - // Save the PC and SP because we may allocate - jit_prepare_routine_call(jit, ctx, asm); + // Save the PC and SP because we may call #min + jit_prepare_non_leaf_call(jit, asm); extern "C" { fn rb_vm_opt_newarray_min(ec: EcPtr, num: u32, elts: *const VALUE) -> VALUE; } - let offset_magnitude = (SIZEOF_VALUE as u32) * num; - let values_opnd = ctx.sp_opnd(-(offset_magnitude as isize)); + let values_opnd = asm.ctx.sp_opnd(-(num as i32)); let values_ptr = asm.lea(values_opnd); let val_opnd = asm.ccall( @@ -3411,55 +4514,45 @@ fn gen_opt_newarray_min( ], ); - ctx.stack_pop(num.as_usize()); - let stack_ret = ctx.stack_push(Type::Unknown); + asm.stack_pop(num.as_usize()); + let stack_ret = asm.stack_push(Type::Unknown); asm.mov(stack_ret, val_opnd); - KeepCompiling + Some(KeepCompiling) } fn gen_opt_not( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - return gen_opt_send_without_block(jit, ctx, asm, ocb); +) -> Option<CodegenStatus> { + return gen_opt_send_without_block(jit, asm); } fn gen_opt_size( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - return gen_opt_send_without_block(jit, ctx, asm, ocb); +) -> Option<CodegenStatus> { + return gen_opt_send_without_block(jit, asm); } fn gen_opt_length( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - return gen_opt_send_without_block(jit, ctx, asm, ocb); +) -> Option<CodegenStatus> { + return gen_opt_send_without_block(jit, asm); } fn gen_opt_regexpmatch2( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - return gen_opt_send_without_block(jit, ctx, asm, ocb); +) -> Option<CodegenStatus> { + return gen_opt_send_without_block(jit, asm); } fn gen_opt_case_dispatch( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { +) -> Option<CodegenStatus> { // Normally this instruction would lookup the key in a hash and jump to an // offset based on that. // Instead we can take the fallback case and continue with the next @@ -3467,19 +4560,17 @@ fn gen_opt_case_dispatch( // We'd hope that our jitted code will be sufficiently fast without the // hash lookup, at least for small hashes, but it's worth revisiting this // assumption in the future. 
- if !jit_at_current_insn(jit) { - defer_compilation(jit, ctx, asm, ocb); - return EndBlock; + if !jit.at_compile_target() { + return jit.defer_compilation(asm); } - let starting_context = ctx.clone(); - let case_hash = jit_get_arg(jit, 0); - let else_offset = jit_get_arg(jit, 1).as_u32(); + let case_hash = jit.get_arg(0); + let else_offset = jit.get_arg(1).as_u32(); // Try to reorder case/else branches so that ones that are actually used come first. // Supporting only Fixnum for now so that the implementation can be an equality check. - let key_opnd = ctx.stack_pop(1); - let comptime_key = jit_peek_at_stack(jit, ctx, 0); + let key_opnd = asm.stack_opnd(0); + let comptime_key = jit.peek_at_stack(&asm.ctx, 0); // Check that all cases are fixnums to avoid having to register BOP assumptions on // all the types that case hashes support. This spends compile time to save memory. @@ -3500,23 +4591,27 @@ fn gen_opt_case_dispatch( all_fixnum } - if comptime_key.fixnum_p() && comptime_key.0 <= u32::MAX.as_usize() && case_hash_all_fixnum_p(case_hash) { - if !assume_bop_not_redefined(jit, ocb, INTEGER_REDEFINED_OP_FLAG, BOP_EQQ) { - return CantCompile; + // If megamorphic, fallback to compiling branch instructions after opt_case_dispatch + let megamorphic = asm.ctx.get_chain_depth() >= CASE_WHEN_MAX_DEPTH; + if megamorphic { + gen_counter_incr(jit, asm, Counter::num_opt_case_dispatch_megamorphic); + } + + if comptime_key.fixnum_p() && comptime_key.0 <= u32::MAX.as_usize() && case_hash_all_fixnum_p(case_hash) && !megamorphic { + if !assume_bop_not_redefined(jit, asm, INTEGER_REDEFINED_OP_FLAG, BOP_EQQ) { + return None; } // Check if the key is the same value asm.cmp(key_opnd, comptime_key.into()); - let side_exit = get_side_exit(jit, ocb, &starting_context); jit_chain_guard( JCC_JNE, jit, - &starting_context, asm, - ocb, CASE_WHEN_MAX_DEPTH, - side_exit, + Counter::opt_case_dispatch_megamorphic, ); + asm.stack_pop(1); // Pop key_opnd // Get the offset for the compile-time key let mut offset = 0; @@ -3529,51 +4624,29 @@ fn gen_opt_case_dispatch( }; // Jump to the offset of case or else - let jump_block = BlockId { iseq: jit.iseq, idx: jit_next_insn_idx(jit) + jump_offset }; - gen_direct_jump(jit, &ctx, jump_block, asm); - EndBlock + let jump_idx = jit.next_insn_idx() as u32 + jump_offset; + let jump_block = BlockId { iseq: jit.iseq, idx: jump_idx.try_into().unwrap() }; + gen_direct_jump(jit, &asm.ctx.clone(), jump_block, asm); + Some(EndBlock) } else { - KeepCompiling // continue with === branches - } -} - -fn gen_branchif_branch( - asm: &mut Assembler, - target0: CodePtr, - target1: Option<CodePtr>, - shape: BranchShape, -) { - assert!(target1 != None); - match shape { - BranchShape::Next0 => { - asm.jz(target1.unwrap().into()); - } - BranchShape::Next1 => { - asm.jnz(target0.into()); - } - BranchShape::Default => { - asm.jnz(target0.into()); - asm.jmp(target1.unwrap().into()); - } + asm.stack_pop(1); // Pop key_opnd + Some(KeepCompiling) // continue with === branches } } fn gen_branchif( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - let jump_offset = jit_get_arg(jit, 0).as_i32(); +) -> Option<CodegenStatus> { + let jump_offset = jit.get_arg(0).as_i32(); // Check for interrupts, but only on backward branches that may create loops if jump_offset < 0 { - let side_exit = get_side_exit(jit, ocb, ctx); - gen_check_ints(asm, side_exit); + gen_check_ints(asm, Counter::branchif_interrupted); } // Get the branch target instruction offsets - let 
next_idx = jit_next_insn_idx(jit); + let next_idx = jit.next_insn_idx(); let jump_idx = (next_idx as i32) + jump_offset; let next_block = BlockId { iseq: jit.iseq, @@ -3581,68 +4654,51 @@ fn gen_branchif( }; let jump_block = BlockId { iseq: jit.iseq, - idx: jump_idx as u32, + idx: jump_idx.try_into().unwrap(), }; // Test if any bit (outside of the Qnil bit) is on // See RB_TEST() - let val_type = ctx.get_opnd_type(StackOpnd(0)); - let val_opnd = ctx.stack_pop(1); + let val_type = asm.ctx.get_opnd_type(StackOpnd(0)); + let val_opnd = asm.stack_pop(1); + + incr_counter!(branch_insn_count); if let Some(result) = val_type.known_truthy() { let target = if result { jump_block } else { next_block }; - gen_direct_jump(jit, ctx, target, asm); + gen_direct_jump(jit, &asm.ctx.clone(), target, asm); + incr_counter!(branch_known_count); } else { asm.test(val_opnd, Opnd::Imm(!Qnil.as_i64())); // Generate the branch instructions - gen_branch( - jit, + let ctx = asm.ctx; + jit.gen_branch( asm, - ocb, jump_block, - ctx, + &ctx, Some(next_block), - Some(ctx), - gen_branchif_branch, + Some(&ctx), + BranchGenFn::BranchIf(Cell::new(BranchShape::Default)), ); } - EndBlock -} - -fn gen_branchunless_branch( - asm: &mut Assembler, - target0: CodePtr, - target1: Option<CodePtr>, - shape: BranchShape, -) { - match shape { - BranchShape::Next0 => asm.jnz(target1.unwrap().into()), - BranchShape::Next1 => asm.jz(target0.into()), - BranchShape::Default => { - asm.jz(target0.into()); - asm.jmp(target1.unwrap().into()); - } - } + Some(EndBlock) } fn gen_branchunless( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - let jump_offset = jit_get_arg(jit, 0).as_i32(); +) -> Option<CodegenStatus> { + let jump_offset = jit.get_arg(0).as_i32(); // Check for interrupts, but only on backward branches that may create loops if jump_offset < 0 { - let side_exit = get_side_exit(jit, ocb, ctx); - gen_check_ints(asm, side_exit); + gen_check_ints(asm, Counter::branchunless_interrupted); } // Get the branch target instruction offsets - let next_idx = jit_next_insn_idx(jit) as i32; + let next_idx = jit.next_insn_idx() as i32; let jump_idx = next_idx + jump_offset; let next_block = BlockId { iseq: jit.iseq, @@ -3653,12 +4709,15 @@ fn gen_branchunless( idx: jump_idx.try_into().unwrap(), }; - let val_type = ctx.get_opnd_type(StackOpnd(0)); - let val_opnd = ctx.stack_pop(1); + let val_type = asm.ctx.get_opnd_type(StackOpnd(0)); + let val_opnd = asm.stack_pop(1); + + incr_counter!(branch_insn_count); if let Some(result) = val_type.known_truthy() { let target = if result { next_block } else { jump_block }; - gen_direct_jump(jit, ctx, target, asm); + gen_direct_jump(jit, &asm.ctx.clone(), target, asm); + incr_counter!(branch_known_count); } else { // Test if any bit (outside of the Qnil bit) is on // See RB_TEST() @@ -3666,53 +4725,33 @@ fn gen_branchunless( asm.test(val_opnd, not_qnil.into()); // Generate the branch instructions - gen_branch( - jit, + let ctx = asm.ctx; + jit.gen_branch( asm, - ocb, jump_block, - ctx, + &ctx, Some(next_block), - Some(ctx), - gen_branchunless_branch, + Some(&ctx), + BranchGenFn::BranchUnless(Cell::new(BranchShape::Default)), ); } - EndBlock -} - -fn gen_branchnil_branch( - asm: &mut Assembler, - target0: CodePtr, - target1: Option<CodePtr>, - shape: BranchShape, -) { - match shape { - BranchShape::Next0 => asm.jne(target1.unwrap().into()), - BranchShape::Next1 => asm.je(target0.into()), - BranchShape::Default => { - asm.je(target0.into()); - 
asm.jmp(target1.unwrap().into()); - } - } + Some(EndBlock) } fn gen_branchnil( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - let jump_offset = jit_get_arg(jit, 0).as_i32(); +) -> Option<CodegenStatus> { + let jump_offset = jit.get_arg(0).as_i32(); // Check for interrupts, but only on backward branches that may create loops if jump_offset < 0 { - let side_exit = get_side_exit(jit, ocb, ctx); - gen_check_ints(asm, side_exit); + gen_check_ints(asm, Counter::branchnil_interrupted); } // Get the branch target instruction offsets - let next_idx = jit_next_insn_idx(jit) as i32; + let next_idx = jit.next_insn_idx() as i32; let jump_idx = next_idx + jump_offset; let next_block = BlockId { iseq: jit.iseq, @@ -3723,56 +4762,160 @@ fn gen_branchnil( idx: jump_idx.try_into().unwrap(), }; - let val_type = ctx.get_opnd_type(StackOpnd(0)); - let val_opnd = ctx.stack_pop(1); + let val_type = asm.ctx.get_opnd_type(StackOpnd(0)); + let val_opnd = asm.stack_pop(1); + + incr_counter!(branch_insn_count); if let Some(result) = val_type.known_nil() { let target = if result { jump_block } else { next_block }; - gen_direct_jump(jit, ctx, target, asm); + gen_direct_jump(jit, &asm.ctx.clone(), target, asm); + incr_counter!(branch_known_count); } else { // Test if the value is Qnil asm.cmp(val_opnd, Opnd::UImm(Qnil.into())); // Generate the branch instructions - gen_branch( - jit, + let ctx = asm.ctx; + jit.gen_branch( asm, - ocb, jump_block, - ctx, + &ctx, Some(next_block), - Some(ctx), - gen_branchnil_branch, + Some(&ctx), + BranchGenFn::BranchNil(Cell::new(BranchShape::Default)), ); } - EndBlock + Some(EndBlock) +} + +fn gen_throw( + jit: &mut JITState, + asm: &mut Assembler, +) -> Option<CodegenStatus> { + let throw_state = jit.get_arg(0).as_u64(); + let throwobj = asm.stack_pop(1); + let throwobj = asm.load(throwobj); + + // Gather some statistics about throw + gen_counter_incr(jit, asm, Counter::num_throw); + match (throw_state & VM_THROW_STATE_MASK as u64) as u32 { + RUBY_TAG_BREAK => gen_counter_incr(jit, asm, Counter::num_throw_break), + RUBY_TAG_RETRY => gen_counter_incr(jit, asm, Counter::num_throw_retry), + RUBY_TAG_RETURN => gen_counter_incr(jit, asm, Counter::num_throw_return), + _ => {}, + } + + // THROW_DATA_NEW allocates. Save SP for GC and PC for allocation tracing as + // well as handling the catch table. However, not using jit_prepare_call_with_gc + // since we don't need a patch point for this implementation. + jit_save_pc(jit, asm); + gen_save_sp(asm); + + // rb_vm_throw verifies it's a valid throw, sets ec->tag->state, and returns throw + // data, which is throwobj or a vm_throw_data wrapping it. When ec->tag->state is + // set, JIT code callers will handle the throw with vm_exec_handle_exception. 
+ extern "C" { + fn rb_vm_throw(ec: EcPtr, reg_cfp: CfpPtr, throw_state: u32, throwobj: VALUE) -> VALUE; + } + let val = asm.ccall(rb_vm_throw as *mut u8, vec![EC, CFP, throw_state.into(), throwobj]); + + asm_comment!(asm, "exit from throw"); + asm.cpop_into(SP); + asm.cpop_into(EC); + asm.cpop_into(CFP); + + asm.frame_teardown(); + + asm.cret(val); + Some(EndBlock) +} + +fn gen_opt_new( + jit: &mut JITState, + asm: &mut Assembler, +) -> Option<CodegenStatus> { + let cd = jit.get_arg(0).as_ptr(); + let jump_offset = jit.get_arg(1).as_i32(); + + if !jit.at_compile_target() { + return jit.defer_compilation(asm); + } + + let ci = unsafe { get_call_data_ci(cd) }; // info about the call site + let mid = unsafe { vm_ci_mid(ci) }; + let argc: i32 = unsafe { vm_ci_argc(ci) }.try_into().unwrap(); + + let recv_idx = argc; + let comptime_recv = jit.peek_at_stack(&asm.ctx, recv_idx as isize); + + // This is a singleton class + let comptime_recv_klass = comptime_recv.class_of(); + + let recv = asm.stack_opnd(recv_idx); + + perf_call!("opt_new: ", jit_guard_known_klass( + jit, + asm, + recv, + recv.into(), + comptime_recv, + SEND_MAX_DEPTH, + Counter::guard_send_klass_megamorphic, + )); + + // We now know that it's always comptime_recv_klass + if jit.assume_expected_cfunc(asm, comptime_recv_klass, mid, rb_class_new_instance_pass_kw as _) { + // Fast path + // call rb_class_alloc to actually allocate + jit_prepare_non_leaf_call(jit, asm); + let obj = asm.ccall(rb_obj_alloc as _, vec![comptime_recv.into()]); + + // Get a reference to the stack location where we need to save the + // return instance. + let result = asm.stack_opnd(recv_idx + 1); + let recv = asm.stack_opnd(recv_idx); + + // Replace the receiver for the upcoming initialize call + asm.ctx.set_opnd_mapping(recv.into(), TempMapping::MapToStack(Type::UnknownHeap)); + asm.mov(recv, obj); + + // Save the allocated object for return + asm.ctx.set_opnd_mapping(result.into(), TempMapping::MapToStack(Type::UnknownHeap)); + asm.mov(result, obj); + + jump_to_next_insn(jit, asm) + } else { + // general case + + // Get the branch target instruction offsets + let jump_idx = jit.next_insn_idx() as i32 + jump_offset; + return end_block_with_jump(jit, asm, jump_idx as u16); + } } fn gen_jump( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - let jump_offset = jit_get_arg(jit, 0).as_i32(); +) -> Option<CodegenStatus> { + let jump_offset = jit.get_arg(0).as_i32(); // Check for interrupts, but only on backward branches that may create loops if jump_offset < 0 { - let side_exit = get_side_exit(jit, ocb, ctx); - gen_check_ints(asm, side_exit); + gen_check_ints(asm, Counter::jump_interrupted); } // Get the branch target instruction offsets - let jump_idx = (jit_next_insn_idx(jit) as i32) + jump_offset; + let jump_idx = jit.next_insn_idx() as i32 + jump_offset; let jump_block = BlockId { iseq: jit.iseq, - idx: jump_idx as u32, + idx: jump_idx.try_into().unwrap(), }; // Generate the jump instruction - gen_direct_jump(jit, ctx, jump_block, asm); + gen_direct_jump(jit, &asm.ctx.clone(), jump_block, asm); - EndBlock + Some(EndBlock) } /// Guard that self or a stack operand has the same class as `known_klass`, using @@ -3783,60 +4926,68 @@ fn gen_jump( /// Recompile as contingency if possible, or take side exit a last resort. 
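// The guards below classify the receiver by the tag bits of its VALUE word before
// falling back to loading RBASIC_CLASS. The following is a rough standalone
// illustration of that classification; the tag constants are the conventional
// 64-bit, flonum-enabled CRuby values, assumed here rather than taken from this
// diff, and the function itself is not part of YJIT.

fn classify_value(v: u64) -> &'static str {
    // Tag constants assumed (typical 64-bit CRuby with flonums enabled).
    const RUBY_FIXNUM_FLAG: u64 = 0x1;   // low bit set => fixnum
    const RUBY_FLONUM_MASK: u64 = 0x3;
    const RUBY_FLONUM_FLAG: u64 = 0x2;   // low two bits == 10 => flonum
    const RUBY_SYMBOL_FLAG: u64 = 0x0c;  // low byte == 0x0c => static symbol
    const RUBY_IMMEDIATE_MASK: u64 = 0x7;
    const QFALSE: u64 = 0x00;
    const QNIL: u64 = 0x04;
    const QTRUE: u64 = 0x14;

    if v == QNIL { "nil" }
    else if v == QTRUE { "true" }
    else if v == QFALSE { "false" }
    else if v & RUBY_FIXNUM_FLAG != 0 { "fixnum" }
    else if v & RUBY_FLONUM_MASK == RUBY_FLONUM_FLAG { "flonum" }
    else if v & 0xff == RUBY_SYMBOL_FLAG { "static symbol" }
    else if v & RUBY_IMMEDIATE_MASK == 0 { "heap object (class read from RBASIC_CLASS)" }
    else { "other immediate" }
}

fn main() {
    assert_eq!(classify_value((42 << 1) | 1), "fixnum");
    assert_eq!(classify_value(0x04), "nil");
}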
fn jit_guard_known_klass( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, - known_klass: VALUE, obj_opnd: Opnd, insn_opnd: YARVOpnd, sample_instance: VALUE, - max_chain_depth: i32, - side_exit: Target, + max_chain_depth: u8, + counter: Counter, ) { - let val_type = ctx.get_opnd_type(insn_opnd); + let known_klass = sample_instance.class_of(); + let val_type = asm.ctx.get_opnd_type(insn_opnd); if val_type.known_class() == Some(known_klass) { - // We already know from type information that this is a match - return; + // Unless frozen, Array, Hash, and String objects may change their RBASIC_CLASS + // when they get a singleton class. Those types need invalidations. + if unsafe { [rb_cArray, rb_cHash, rb_cString].contains(&known_klass) } { + if jit.assume_no_singleton_class(asm, known_klass) { + // Speculate that this object will not have a singleton class, + // and invalidate the block in case it does. + return; + } + } else { + // We already know from type information that this is a match + return; + } } if unsafe { known_klass == rb_cNilClass } { assert!(!val_type.is_heap()); assert!(val_type.is_unknown()); - asm.comment("guard object is nil"); + asm_comment!(asm, "guard object is nil"); asm.cmp(obj_opnd, Qnil.into()); - jit_chain_guard(JCC_JNE, jit, ctx, asm, ocb, max_chain_depth, side_exit); + jit_chain_guard(JCC_JNE, jit, asm, max_chain_depth, counter); - ctx.upgrade_opnd_type(insn_opnd, Type::Nil); + asm.ctx.upgrade_opnd_type(insn_opnd, Type::Nil); } else if unsafe { known_klass == rb_cTrueClass } { assert!(!val_type.is_heap()); assert!(val_type.is_unknown()); - asm.comment("guard object is true"); + asm_comment!(asm, "guard object is true"); asm.cmp(obj_opnd, Qtrue.into()); - jit_chain_guard(JCC_JNE, jit, ctx, asm, ocb, max_chain_depth, side_exit); + jit_chain_guard(JCC_JNE, jit, asm, max_chain_depth, counter); - ctx.upgrade_opnd_type(insn_opnd, Type::True); + asm.ctx.upgrade_opnd_type(insn_opnd, Type::True); } else if unsafe { known_klass == rb_cFalseClass } { assert!(!val_type.is_heap()); assert!(val_type.is_unknown()); - asm.comment("guard object is false"); + asm_comment!(asm, "guard object is false"); assert!(Qfalse.as_i32() == 0); asm.test(obj_opnd, obj_opnd); - jit_chain_guard(JCC_JNZ, jit, ctx, asm, ocb, max_chain_depth, side_exit); + jit_chain_guard(JCC_JNZ, jit, asm, max_chain_depth, counter); - ctx.upgrade_opnd_type(insn_opnd, Type::False); + asm.ctx.upgrade_opnd_type(insn_opnd, Type::False); } else if unsafe { known_klass == rb_cInteger } && sample_instance.fixnum_p() { // We will guard fixnum and bignum as though they were separate classes // BIGNUM can be handled by the general else case below assert!(val_type.is_unknown()); - asm.comment("guard object is fixnum"); + asm_comment!(asm, "guard object is fixnum"); asm.test(obj_opnd, Opnd::Imm(RUBY_FIXNUM_FLAG as i64)); - jit_chain_guard(JCC_JZ, jit, ctx, asm, ocb, max_chain_depth, side_exit); - ctx.upgrade_opnd_type(insn_opnd, Type::Fixnum); + jit_chain_guard(JCC_JZ, jit, asm, max_chain_depth, counter); + asm.ctx.upgrade_opnd_type(insn_opnd, Type::Fixnum); } else if unsafe { known_klass == rb_cSymbol } && sample_instance.static_sym_p() { assert!(!val_type.is_heap()); // We will guard STATIC vs DYNAMIC as though they were separate classes @@ -3844,11 +4995,11 @@ fn jit_guard_known_klass( if val_type != Type::ImmSymbol || !val_type.is_imm() { assert!(val_type.is_unknown()); - asm.comment("guard object is static symbol"); + asm_comment!(asm, "guard object is static symbol"); 
assert!(RUBY_SPECIAL_SHIFT == 8); asm.cmp(obj_opnd.with_num_bits(8).unwrap(), Opnd::UImm(RUBY_SYMBOL_FLAG as u64)); - jit_chain_guard(JCC_JNE, jit, ctx, asm, ocb, max_chain_depth, side_exit); - ctx.upgrade_opnd_type(insn_opnd, Type::ImmSymbol); + jit_chain_guard(JCC_JNE, jit, asm, max_chain_depth, counter); + asm.ctx.upgrade_opnd_type(insn_opnd, Type::ImmSymbol); } } else if unsafe { known_klass == rb_cFloat } && sample_instance.flonum_p() { assert!(!val_type.is_heap()); @@ -3856,15 +5007,16 @@ fn jit_guard_known_klass( assert!(val_type.is_unknown()); // We will guard flonum vs heap float as though they were separate classes - asm.comment("guard object is flonum"); + asm_comment!(asm, "guard object is flonum"); let flag_bits = asm.and(obj_opnd, Opnd::UImm(RUBY_FLONUM_MASK as u64)); asm.cmp(flag_bits, Opnd::UImm(RUBY_FLONUM_FLAG as u64)); - jit_chain_guard(JCC_JNE, jit, ctx, asm, ocb, max_chain_depth, side_exit); - ctx.upgrade_opnd_type(insn_opnd, Type::Flonum); + jit_chain_guard(JCC_JNE, jit, asm, max_chain_depth, counter); + asm.ctx.upgrade_opnd_type(insn_opnd, Type::Flonum); } } else if unsafe { FL_TEST(known_klass, VALUE(RUBY_FL_SINGLETON as usize)) != VALUE(0) - && sample_instance == rb_attr_get(known_klass, id__attached__ as ID) + && sample_instance == rb_class_attached_object(known_klass) + && !rb_obj_is_kind_of(sample_instance, rb_cIO).test() } { // Singleton classes are attached to one specific object, so we can // avoid one memory access (and potentially the is_heap check) by @@ -3876,46 +5028,50 @@ fn jit_guard_known_klass( // that its singleton class is empty, so we can't avoid the memory // access. As an example, `Object.new.singleton_class` is an object in // this situation. - asm.comment("guard known object with singleton class"); + // Also, guarding by identity is incorrect for IO objects because + // IO#reopen can be used to change the class and singleton class of IO objects! + asm_comment!(asm, "guard known object with singleton class"); asm.cmp(obj_opnd, sample_instance.into()); - jit_chain_guard(JCC_JNE, jit, ctx, asm, ocb, max_chain_depth, side_exit); + jit_chain_guard(JCC_JNE, jit, asm, max_chain_depth, counter); } else if val_type == Type::CString && unsafe { known_klass == rb_cString } { // guard elided because the context says we've already checked unsafe { assert_eq!(sample_instance.class_of(), rb_cString, "context says class is exactly ::String") }; } else { - assert!(!val_type.is_imm()); + assert!(!val_type.is_imm(), "{insn_opnd:?} should be a heap object, but was {val_type:?} for {sample_instance:?}"); // Check that the receiver is a heap object // Note: if we get here, the class doesn't have immediate instances. if !val_type.is_heap() { - asm.comment("guard not immediate"); + asm_comment!(asm, "guard not immediate"); asm.test(obj_opnd, (RUBY_IMMEDIATE_MASK as u64).into()); - jit_chain_guard(JCC_JNZ, jit, ctx, asm, ocb, max_chain_depth, side_exit); + jit_chain_guard(JCC_JNZ, jit, asm, max_chain_depth, counter); asm.cmp(obj_opnd, Qfalse.into()); - jit_chain_guard(JCC_JE, jit, ctx, asm, ocb, max_chain_depth, side_exit); + jit_chain_guard(JCC_JE, jit, asm, max_chain_depth, counter); - ctx.upgrade_opnd_type(insn_opnd, Type::UnknownHeap); + asm.ctx.upgrade_opnd_type(insn_opnd, Type::UnknownHeap); } // If obj_opnd isn't already a register, load it. let obj_opnd = match obj_opnd { - Opnd::Reg(_) => obj_opnd, + Opnd::InsnOut { .. 
} => obj_opnd, _ => asm.load(obj_opnd), }; let klass_opnd = Opnd::mem(64, obj_opnd, RUBY_OFFSET_RBASIC_KLASS); // Bail if receiver class is different from known_klass // TODO: jit_mov_gc_ptr keeps a strong reference, which leaks the class. - asm.comment("guard known class"); + asm_comment!(asm, "guard known class"); asm.cmp(klass_opnd, known_klass.into()); - jit_chain_guard(JCC_JNE, jit, ctx, asm, ocb, max_chain_depth, side_exit); + jit_chain_guard(JCC_JNE, jit, asm, max_chain_depth, counter); if known_klass == unsafe { rb_cString } { - ctx.upgrade_opnd_type(insn_opnd, Type::CString); + asm.ctx.upgrade_opnd_type(insn_opnd, Type::CString); } else if known_klass == unsafe { rb_cArray } { - ctx.upgrade_opnd_type(insn_opnd, Type::CArray); + asm.ctx.upgrade_opnd_type(insn_opnd, Type::CArray); + } else if known_klass == unsafe { rb_cHash } { + asm.ctx.upgrade_opnd_type(insn_opnd, Type::CHash); } } } @@ -3923,11 +5079,8 @@ fn jit_guard_known_klass( // Generate ancestry guard for protected callee. // Calls to protected callees only go through when self.is_a?(klass_that_defines_the_callee). fn jit_protected_callee_ancestry_guard( - _jit: &mut JITState, asm: &mut Assembler, - ocb: &mut OutlinedCb, cme: *const rb_callable_method_entry_t, - side_exit: Target, ) { // See vm_call_method(). let def_class = unsafe { (*cme).defined_class }; @@ -3942,7 +5095,7 @@ fn jit_protected_callee_ancestry_guard( ], ); asm.test(val, val); - asm.jz(counted_exit!(ocb, side_exit, send_se_protected_check_failed)) + asm.jz(Target::side_exit(Counter::guard_send_se_protected_check_failed)) } // Codegen for rb_obj_not(). @@ -3950,29 +5103,27 @@ fn jit_protected_callee_ancestry_guard( // arity guards. fn jit_rb_obj_not( _jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, _ci: *const rb_callinfo, _cme: *const rb_callable_method_entry_t, - _block: Option<IseqPtr>, + _block: Option<BlockHandler>, _argc: i32, - _known_recv_class: *const VALUE, + _known_recv_class: Option<VALUE>, ) -> bool { - let recv_opnd = ctx.get_opnd_type(StackOpnd(0)); + let recv_opnd = asm.ctx.get_opnd_type(StackOpnd(0)); match recv_opnd.known_truthy() { Some(false) => { - asm.comment("rb_obj_not(nil_or_false)"); - ctx.stack_pop(1); - let out_opnd = ctx.stack_push(Type::True); + asm_comment!(asm, "rb_obj_not(nil_or_false)"); + asm.stack_pop(1); + let out_opnd = asm.stack_push(Type::True); asm.mov(out_opnd, Qtrue.into()); }, Some(true) => { // Note: recv_opnd != Type::Nil && recv_opnd != Type::False. - asm.comment("rb_obj_not(truthy)"); - ctx.stack_pop(1); - let out_opnd = ctx.stack_push(Type::False); + asm_comment!(asm, "rb_obj_not(truthy)"); + asm.stack_pop(1); + let out_opnd = asm.stack_push(Type::False); asm.mov(out_opnd, Qfalse.into()); }, _ => { @@ -3986,18 +5137,16 @@ fn jit_rb_obj_not( // Codegen for rb_true() fn jit_rb_true( _jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, _ci: *const rb_callinfo, _cme: *const rb_callable_method_entry_t, - _block: Option<IseqPtr>, + _block: Option<BlockHandler>, _argc: i32, - _known_recv_class: *const VALUE, + _known_recv_class: Option<VALUE>, ) -> bool { - asm.comment("nil? == true"); - ctx.stack_pop(1); - let stack_ret = ctx.stack_push(Type::True); + asm_comment!(asm, "nil? 
== true"); + asm.stack_pop(1); + let stack_ret = asm.stack_push(Type::True); asm.mov(stack_ret, Qtrue.into()); true } @@ -4005,104 +5154,719 @@ fn jit_rb_true( // Codegen for rb_false() fn jit_rb_false( _jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, _ci: *const rb_callinfo, _cme: *const rb_callable_method_entry_t, - _block: Option<IseqPtr>, + _block: Option<BlockHandler>, _argc: i32, - _known_recv_class: *const VALUE, + _known_recv_class: Option<VALUE>, ) -> bool { - asm.comment("nil? == false"); - ctx.stack_pop(1); - let stack_ret = ctx.stack_push(Type::False); + asm_comment!(asm, "nil? == false"); + asm.stack_pop(1); + let stack_ret = asm.stack_push(Type::False); asm.mov(stack_ret, Qfalse.into()); true } +/// Codegen for Kernel#is_a? +fn jit_rb_kernel_is_a( + jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + argc: i32, + known_recv_class: Option<VALUE>, +) -> bool { + if argc != 1 { + return false; + } + + // If this is a super call we might not know the class + if known_recv_class.is_none() { + return false; + } + + // Important note: The output code will simply `return true/false`. + // Correctness follows from: + // - `known_recv_class` implies there is a guard scheduled before here + // for a particular `CLASS_OF(lhs)`. + // - We guard that rhs is identical to the compile-time sample + // - In general, for any two Class instances A, B, `A < B` does not change at runtime. + // Class#superclass is stable. + + let sample_rhs = jit.peek_at_stack(&asm.ctx, 0); + let sample_lhs = jit.peek_at_stack(&asm.ctx, 1); + + // We are not allowing module here because the module hierarchy can change at runtime. + if !unsafe { RB_TYPE_P(sample_rhs, RUBY_T_CLASS) } { + return false; + } + let sample_is_a = unsafe { rb_obj_is_kind_of(sample_lhs, sample_rhs) == Qtrue }; + + asm_comment!(asm, "Kernel#is_a?"); + asm.cmp(asm.stack_opnd(0), sample_rhs.into()); + asm.jne(Target::side_exit(Counter::guard_send_is_a_class_mismatch)); + + asm.stack_pop(2); + + if sample_is_a { + let stack_ret = asm.stack_push(Type::True); + asm.mov(stack_ret, Qtrue.into()); + } else { + let stack_ret = asm.stack_push(Type::False); + asm.mov(stack_ret, Qfalse.into()); + } + return true; +} + +/// Codegen for Kernel#instance_of? +fn jit_rb_kernel_instance_of( + jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + argc: i32, + known_recv_class: Option<VALUE>, +) -> bool { + if argc != 1 { + return false; + } + + // If this is a super call we might not know the class + if known_recv_class.is_none() { + return false; + } + + // Important note: The output code will simply `return true/false`. + // Correctness follows from: + // - `known_recv_class` implies there is a guard scheduled before here + // for a particular `CLASS_OF(lhs)`. + // - We guard that rhs is identical to the compile-time sample + // - For a particular `CLASS_OF(lhs)`, `rb_obj_class(lhs)` does not change. + // (because for any singleton class `s`, `s.superclass.equal?(s.attached_object.class)`) + + let sample_rhs = jit.peek_at_stack(&asm.ctx, 0); + let sample_lhs = jit.peek_at_stack(&asm.ctx, 1); + + // Filters out cases where the C implementation raises + if unsafe { !(RB_TYPE_P(sample_rhs, RUBY_T_CLASS) || RB_TYPE_P(sample_rhs, RUBY_T_MODULE)) } { + return false; + } + + // We need to grab the class here to deal with singleton classes. 
+ // Instance of grabs the "real class" of the object rather than the + // singleton class. + let sample_lhs_real_class = unsafe { rb_obj_class(sample_lhs) }; + + let sample_instance_of = sample_lhs_real_class == sample_rhs; + + asm_comment!(asm, "Kernel#instance_of?"); + asm.cmp(asm.stack_opnd(0), sample_rhs.into()); + jit_chain_guard( + JCC_JNE, + jit, + asm, + SEND_MAX_DEPTH, + Counter::guard_send_instance_of_class_mismatch, + ); + + asm.stack_pop(2); + + if sample_instance_of { + let stack_ret = asm.stack_push(Type::True); + asm.mov(stack_ret, Qtrue.into()); + } else { + let stack_ret = asm.stack_push(Type::False); + asm.mov(stack_ret, Qfalse.into()); + } + return true; +} + +fn jit_rb_mod_eqq( + _jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + if argc != 1 { + return false; + } + + asm_comment!(asm, "Module#==="); + // By being here, we know that the receiver is a T_MODULE or a T_CLASS, because Module#=== can + // only live on these objects. With that, we can call rb_obj_is_kind_of() without + // jit_prepare_non_leaf_call() or a control frame push because it can't raise, allocate, or call + // Ruby methods with these inputs. + // Note the difference in approach from Kernel#is_a? because we don't get a free guard for the + // right hand side. + let rhs = asm.stack_pop(1); + let lhs = asm.stack_pop(1); // the module + let ret = asm.ccall(rb_obj_is_kind_of as *const u8, vec![rhs, lhs]); + + // Return the result + let stack_ret = asm.stack_push(Type::UnknownImm); + asm.mov(stack_ret, ret); + + return true; +} + +// Substitution for rb_mod_name(). Returns the name of a module/class. +fn jit_rb_mod_name( + _jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + if argc != 0 { + return false; + } + + asm_comment!(asm, "Module#name"); + + // rb_mod_name() never allocates, so no preparation needed. 
+ let name = asm.ccall(rb_mod_name as _, vec![asm.stack_opnd(0)]); + + let _ = asm.stack_pop(1); // pop self + // call-seq: mod.name -> string or nil + let ret = asm.stack_push(Type::Unknown); + asm.mov(ret, name); + + true +} + // Codegen for rb_obj_equal() // object identity comparison fn jit_rb_obj_equal( _jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, _ci: *const rb_callinfo, _cme: *const rb_callable_method_entry_t, - _block: Option<IseqPtr>, + _block: Option<BlockHandler>, _argc: i32, - _known_recv_class: *const VALUE, + _known_recv_class: Option<VALUE>, ) -> bool { - asm.comment("equal?"); - let obj1 = ctx.stack_pop(1); - let obj2 = ctx.stack_pop(1); + asm_comment!(asm, "equal?"); + let obj1 = asm.stack_pop(1); + let obj2 = asm.stack_pop(1); asm.cmp(obj1, obj2); let ret_opnd = asm.csel_e(Qtrue.into(), Qfalse.into()); - let stack_ret = ctx.stack_push(Type::UnknownImm); + let stack_ret = asm.stack_push(Type::UnknownImm); asm.mov(stack_ret, ret_opnd); true } +// Codegen for rb_obj_not_equal() +// object identity comparison +fn jit_rb_obj_not_equal( + jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + _argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + gen_equality_specialized(jit, asm, false) == Some(true) +} + // Codegen for rb_int_equal() fn jit_rb_int_equal( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, _ci: *const rb_callinfo, _cme: *const rb_callable_method_entry_t, - _block: Option<IseqPtr>, + _block: Option<BlockHandler>, _argc: i32, - _known_recv_class: *const VALUE, + _known_recv_class: Option<VALUE>, ) -> bool { - let side_exit = get_side_exit(jit, ocb, ctx); - // Check that both operands are fixnums - guard_two_fixnums(jit, ctx, asm, ocb, side_exit); + guard_two_fixnums(jit, asm); // Compare the arguments - asm.comment("rb_int_equal"); - let arg1 = ctx.stack_pop(1); - let arg0 = ctx.stack_pop(1); + asm_comment!(asm, "rb_int_equal"); + let arg1 = asm.stack_pop(1); + let arg0 = asm.stack_pop(1); asm.cmp(arg0, arg1); let ret_opnd = asm.csel_e(Qtrue.into(), Qfalse.into()); - let stack_ret = ctx.stack_push(Type::UnknownImm); + let stack_ret = asm.stack_push(Type::UnknownImm); asm.mov(stack_ret, ret_opnd); true } -/// If string is frozen, duplicate it to get a non-frozen string. Otherwise, return it. 
-fn jit_rb_str_uplus( +fn jit_rb_int_succ( _jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, _ci: *const rb_callinfo, _cme: *const rb_callable_method_entry_t, - _block: Option<IseqPtr>, + _block: Option<BlockHandler>, _argc: i32, - _known_recv_class: *const VALUE, + _known_recv_class: Option<VALUE>, +) -> bool { + // Guard the receiver is fixnum + let recv_type = asm.ctx.get_opnd_type(StackOpnd(0)); + let recv = asm.stack_pop(1); + if recv_type != Type::Fixnum { + asm_comment!(asm, "guard object is fixnum"); + asm.test(recv, Opnd::Imm(RUBY_FIXNUM_FLAG as i64)); + asm.jz(Target::side_exit(Counter::opt_succ_not_fixnum)); + } + + asm_comment!(asm, "Integer#succ"); + let out_val = asm.add(recv, Opnd::Imm(2)); // 2 is untagged Fixnum 1 + asm.jo(Target::side_exit(Counter::opt_succ_overflow)); + + // Push the output onto the stack + let dst = asm.stack_push(Type::Fixnum); + asm.mov(dst, out_val); + + true +} + +fn jit_rb_int_pred( + _jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + _argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + // Guard the receiver is fixnum + let recv_type = asm.ctx.get_opnd_type(StackOpnd(0)); + let recv = asm.stack_pop(1); + if recv_type != Type::Fixnum { + asm_comment!(asm, "guard object is fixnum"); + asm.test(recv, Opnd::Imm(RUBY_FIXNUM_FLAG as i64)); + asm.jz(Target::side_exit(Counter::send_pred_not_fixnum)); + } + + asm_comment!(asm, "Integer#pred"); + let out_val = asm.sub(recv, Opnd::Imm(2)); // 2 is untagged Fixnum 1 + asm.jo(Target::side_exit(Counter::send_pred_underflow)); + + // Push the output onto the stack + let dst = asm.stack_push(Type::Fixnum); + asm.mov(dst, out_val); + + true +} + +fn jit_rb_int_div( + jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + _argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + if asm.ctx.two_fixnums_on_stack(jit) != Some(true) { + return false; + } + guard_two_fixnums(jit, asm); + + // rb_fix_div_fix may GC-allocate for Bignum + jit_prepare_call_with_gc(jit, asm); + + asm_comment!(asm, "Integer#/"); + let obj = asm.stack_opnd(0); + let recv = asm.stack_opnd(1); + + // Check for arg0 % 0 + asm.cmp(obj, VALUE::fixnum_from_usize(0).as_i64().into()); + asm.je(Target::side_exit(Counter::opt_div_zero)); + + let ret = asm.ccall(rb_fix_div_fix as *const u8, vec![recv, obj]); + asm.stack_pop(2); // Keep them during ccall for GC + + let ret_opnd = asm.stack_push(Type::Unknown); + asm.mov(ret_opnd, ret); + true +} + +fn jit_rb_int_lshift( + jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + _argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + if asm.ctx.two_fixnums_on_stack(jit) != Some(true) { + return false; + } + guard_two_fixnums(jit, asm); + + let comptime_shift = jit.peek_at_stack(&asm.ctx, 0); + + if !comptime_shift.fixnum_p() { + return false; + } + + // Untag the fixnum shift amount + let shift_amt = comptime_shift.as_isize() >> 1; + if shift_amt > 63 || shift_amt < 0 { + return false; + } + + // Fallback to a C call if the shift amount varies + // This check is needed because the chain guard will side-exit + // if its max depth is reached + if asm.ctx.get_chain_depth() > 0 { + return false; + } + + let rhs = asm.stack_pop(1); + let lhs = asm.stack_pop(1); + + // Guard on 
the shift amount we speculated on + asm.cmp(rhs, comptime_shift.into()); + jit_chain_guard( + JCC_JNE, + jit, + asm, + 1, + Counter::lshift_amount_changed, + ); + + fixnum_left_shift_body(asm, lhs, shift_amt as u64); + true +} + +fn fixnum_left_shift_body(asm: &mut Assembler, lhs: Opnd, shift_amt: u64) { + let in_val = asm.sub(lhs, 1.into()); + let shift_opnd = Opnd::UImm(shift_amt); + let out_val = asm.lshift(in_val, shift_opnd); + let unshifted = asm.rshift(out_val, shift_opnd); + + // Guard that we did not overflow + asm.cmp(unshifted, in_val); + asm.jne(Target::side_exit(Counter::lshift_overflow)); + + // Re-tag the output value + let out_val = asm.add(out_val, 1.into()); + + let ret_opnd = asm.stack_push(Type::Fixnum); + asm.mov(ret_opnd, out_val); +} + +fn jit_rb_int_rshift( + jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + _argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + if asm.ctx.two_fixnums_on_stack(jit) != Some(true) { + return false; + } + guard_two_fixnums(jit, asm); + + let comptime_shift = jit.peek_at_stack(&asm.ctx, 0); + + // Untag the fixnum shift amount + let shift_amt = comptime_shift.as_isize() >> 1; + if shift_amt > 63 || shift_amt < 0 { + return false; + } + + // Fallback to a C call if the shift amount varies + // This check is needed because the chain guard will side-exit + // if its max depth is reached + if asm.ctx.get_chain_depth() > 0 { + return false; + } + + let rhs = asm.stack_pop(1); + let lhs = asm.stack_pop(1); + + // Guard on the shift amount we speculated on + asm.cmp(rhs, comptime_shift.into()); + jit_chain_guard( + JCC_JNE, + jit, + asm, + 1, + Counter::rshift_amount_changed, + ); + + let shift_opnd = Opnd::UImm(shift_amt as u64); + let out_val = asm.rshift(lhs, shift_opnd); + let out_val = asm.or(out_val, 1.into()); + + let ret_opnd = asm.stack_push(Type::Fixnum); + asm.mov(ret_opnd, out_val); + true +} + +fn jit_rb_int_xor( + jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + _argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + if asm.ctx.two_fixnums_on_stack(jit) != Some(true) { + return false; + } + guard_two_fixnums(jit, asm); + + let rhs = asm.stack_pop(1); + let lhs = asm.stack_pop(1); + + // XOR and then re-tag the resulting fixnum + let out_val = asm.xor(lhs, rhs); + let out_val = asm.or(out_val, 1.into()); + + let ret_opnd = asm.stack_push(Type::Fixnum); + asm.mov(ret_opnd, out_val); + true +} + +fn jit_rb_int_aref( + jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + if argc != 1 { + return false; + } + if asm.ctx.two_fixnums_on_stack(jit) != Some(true) { + return false; + } + guard_two_fixnums(jit, asm); + + asm_comment!(asm, "Integer#[]"); + let obj = asm.stack_pop(1); + let recv = asm.stack_pop(1); + + let ret = asm.ccall(rb_fix_aref as *const u8, vec![recv, obj]); + + let ret_opnd = asm.stack_push(Type::Fixnum); + asm.mov(ret_opnd, ret); + true +} + +fn jit_rb_float_plus( + jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + _argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + // Guard obj is Fixnum or Flonum to avoid rb_funcall on rb_num_coerce_bin + let 
comptime_obj = jit.peek_at_stack(&asm.ctx, 0); + if comptime_obj.fixnum_p() || comptime_obj.flonum_p() { + let obj = asm.stack_opnd(0); + jit_guard_known_klass( + jit, + asm, + obj, + obj.into(), + comptime_obj, + SEND_MAX_DEPTH, + Counter::guard_send_not_fixnum_or_flonum, + ); + } else { + return false; + } + + // Save the PC and SP because the callee may allocate Float on heap + jit_prepare_call_with_gc(jit, asm); + + asm_comment!(asm, "Float#+"); + let obj = asm.stack_opnd(0); + let recv = asm.stack_opnd(1); + + let ret = asm.ccall(rb_float_plus as *const u8, vec![recv, obj]); + asm.stack_pop(2); // Keep recv during ccall for GC + + let ret_opnd = asm.stack_push(Type::Unknown); // Flonum or heap Float + asm.mov(ret_opnd, ret); + true +} + +fn jit_rb_float_minus( + jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + _argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + // Guard obj is Fixnum or Flonum to avoid rb_funcall on rb_num_coerce_bin + let comptime_obj = jit.peek_at_stack(&asm.ctx, 0); + if comptime_obj.fixnum_p() || comptime_obj.flonum_p() { + let obj = asm.stack_opnd(0); + jit_guard_known_klass( + jit, + asm, + obj, + obj.into(), + comptime_obj, + SEND_MAX_DEPTH, + Counter::guard_send_not_fixnum_or_flonum, + ); + } else { + return false; + } + + // Save the PC and SP because the callee may allocate Float on heap + jit_prepare_call_with_gc(jit, asm); + + asm_comment!(asm, "Float#-"); + let obj = asm.stack_opnd(0); + let recv = asm.stack_opnd(1); + + let ret = asm.ccall(rb_float_minus as *const u8, vec![recv, obj]); + asm.stack_pop(2); // Keep recv during ccall for GC + + let ret_opnd = asm.stack_push(Type::Unknown); // Flonum or heap Float + asm.mov(ret_opnd, ret); + true +} + +fn jit_rb_float_mul( + jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + _argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + // Guard obj is Fixnum or Flonum to avoid rb_funcall on rb_num_coerce_bin + let comptime_obj = jit.peek_at_stack(&asm.ctx, 0); + if comptime_obj.fixnum_p() || comptime_obj.flonum_p() { + let obj = asm.stack_opnd(0); + jit_guard_known_klass( + jit, + asm, + obj, + obj.into(), + comptime_obj, + SEND_MAX_DEPTH, + Counter::guard_send_not_fixnum_or_flonum, + ); + } else { + return false; + } + + // Save the PC and SP because the callee may allocate Float on heap + jit_prepare_call_with_gc(jit, asm); + + asm_comment!(asm, "Float#*"); + let obj = asm.stack_opnd(0); + let recv = asm.stack_opnd(1); + + let ret = asm.ccall(rb_float_mul as *const u8, vec![recv, obj]); + asm.stack_pop(2); // Keep recv during ccall for GC + + let ret_opnd = asm.stack_push(Type::Unknown); // Flonum or heap Float + asm.mov(ret_opnd, ret); + true +} + +fn jit_rb_float_div( + jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + _argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + // Guard obj is Fixnum or Flonum to avoid rb_funcall on rb_num_coerce_bin + let comptime_obj = jit.peek_at_stack(&asm.ctx, 0); + if comptime_obj.fixnum_p() || comptime_obj.flonum_p() { + let obj = asm.stack_opnd(0); + jit_guard_known_klass( + jit, + asm, + obj, + obj.into(), + comptime_obj, + SEND_MAX_DEPTH, + Counter::guard_send_not_fixnum_or_flonum, + ); + } else { + return false; + } + + // Save the PC and SP because the callee 
may allocate Float on heap
+ jit_prepare_call_with_gc(jit, asm);
+
+ asm_comment!(asm, "Float#/");
+ let obj = asm.stack_opnd(0);
+ let recv = asm.stack_opnd(1);
+
+ let ret = asm.ccall(rb_float_div as *const u8, vec![recv, obj]);
+ asm.stack_pop(2); // Keep recv during ccall for GC
+
+ let ret_opnd = asm.stack_push(Type::Unknown); // Flonum or heap Float
+ asm.mov(ret_opnd, ret);
+ true
+}
+
+/// If string is frozen, duplicate it to get a non-frozen string. Otherwise, return it.
+fn jit_rb_str_uplus(
+ jit: &mut JITState,
+ asm: &mut Assembler,
+ _ci: *const rb_callinfo,
+ _cme: *const rb_callable_method_entry_t,
+ _block: Option<BlockHandler>,
+ argc: i32,
+ _known_recv_class: Option<VALUE>,
) -> bool {
- asm.comment("Unary plus on string");
- let recv_opnd = asm.load(ctx.stack_pop(1));
+ if argc != 0 {
+ return false;
+ }
+
+ // We allocate when we dup the string
+ jit_prepare_call_with_gc(jit, asm);
+ asm.spill_regs(); // For ccall. Unconditionally spill them for RegMappings consistency.
+
+ asm_comment!(asm, "Unary plus on string");
+ let recv_opnd = asm.stack_pop(1);
+ let recv_opnd = asm.load(recv_opnd);
let flags_opnd = asm.load(Opnd::mem(64, recv_opnd, RUBY_OFFSET_RBASIC_FLAGS));
- asm.test(flags_opnd, Opnd::Imm(RUBY_FL_FREEZE as i64));
+ asm.test(flags_opnd, Opnd::Imm(RUBY_FL_FREEZE as i64 | RSTRING_CHILLED as i64));
let ret_label = asm.new_label("stack_ret");
- // We guard for the receiver being a ::String, so the return value is too
- let stack_ret = ctx.stack_push(Type::CString);
+ // String#+@ can only exist on T_STRING
+ let stack_ret = asm.stack_push(Type::TString);
// If the string isn't frozen, we just return it.
asm.mov(stack_ret, recv_opnd);
asm.jz(ret_label);
// Str is frozen - duplicate it
+ asm.spill_regs(); // for ccall
let ret_opnd = asm.ccall(rb_str_dup as *const u8, vec![recv_opnd]);
asm.mov(stack_ret, ret_opnd);
@@ -4111,23 +5875,272 @@ fn jit_rb_str_uplus(
true
}
+fn jit_rb_str_length(
+ _jit: &mut JITState,
+ asm: &mut Assembler,
+ _ci: *const rb_callinfo,
+ _cme: *const rb_callable_method_entry_t,
+ _block: Option<BlockHandler>,
+ _argc: i32,
+ _known_recv_class: Option<VALUE>,
+) -> bool {
+ asm_comment!(asm, "String#length");
+ extern "C" {
+ fn rb_str_length(str: VALUE) -> VALUE;
+ }
+
+ // This function cannot allocate or raise an exception
+ let recv = asm.stack_opnd(0);
+ let ret_opnd = asm.ccall(rb_str_length as *const u8, vec![recv]);
+ asm.stack_pop(1); // Keep recv on stack during ccall for GC
+
+ // Should be guaranteed to be a fixnum on 64-bit systems
+ let out_opnd = asm.stack_push(Type::Fixnum);
+ asm.mov(out_opnd, ret_opnd);
+
+ true
+}
+
fn jit_rb_str_bytesize(
_jit: &mut JITState,
- ctx: &mut Context,
asm: &mut Assembler,
- _ocb: &mut OutlinedCb,
_ci: *const rb_callinfo,
_cme: *const rb_callable_method_entry_t,
- _block: Option<IseqPtr>,
+ _block: Option<BlockHandler>,
_argc: i32,
- _known_recv_class: *const VALUE,
+ _known_recv_class: Option<VALUE>,
) -> bool {
- asm.comment("String#bytesize");
+ asm_comment!(asm, "String#bytesize");
+
+ let recv = asm.stack_pop(1);
- let recv = ctx.stack_pop(1);
- let ret_opnd = asm.ccall(rb_str_bytesize as *const u8, vec![recv]);
+ asm_comment!(asm, "get string length");
+ let str_len_opnd = Opnd::mem(
+ std::os::raw::c_long::BITS as u8,
+ asm.load(recv),
+ RUBY_OFFSET_RSTRING_LEN as i32,
+ );
- let out_opnd = ctx.stack_push(Type::Fixnum);
+ let len = asm.load(str_len_opnd);
+ let shifted_val = asm.lshift(len, Opnd::UImm(1));
+ let out_val = asm.or(shifted_val, Opnd::UImm(RUBY_FIXNUM_FLAG as u64));
+
let out_opnd = asm.stack_push(Type::Fixnum); + + asm.mov(out_opnd, out_val); + + true +} + +fn jit_rb_str_byteslice( + jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + if argc != 2 { + return false + } + + // rb_str_byte_substr should be leaf if indexes are fixnums + match (asm.ctx.get_opnd_type(StackOpnd(0)), asm.ctx.get_opnd_type(StackOpnd(1))) { + (Type::Fixnum, Type::Fixnum) => {}, + // Raises when non-integers are passed in, which requires the method frame + // to be pushed for the backtrace + _ => if !jit_prepare_lazy_frame_call(jit, asm, cme, StackOpnd(2)) { + return false; + } + } + asm_comment!(asm, "String#byteslice"); + + // rb_str_byte_substr allocates a substring + jit_prepare_call_with_gc(jit, asm); + + // Get stack operands after potential SP change + let len = asm.stack_opnd(0); + let beg = asm.stack_opnd(1); + let recv = asm.stack_opnd(2); + + let ret_opnd = asm.ccall(rb_str_byte_substr as *const u8, vec![recv, beg, len]); + asm.stack_pop(3); + + let out_opnd = asm.stack_push(Type::Unknown); + asm.mov(out_opnd, ret_opnd); + + true +} + +fn jit_rb_str_aref_m( + jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + // In yjit-bench the most common usages by far are single fixnum or two fixnums. + // rb_str_substr should be leaf if indexes are fixnums + if argc == 2 { + match (asm.ctx.get_opnd_type(StackOpnd(0)), asm.ctx.get_opnd_type(StackOpnd(1))) { + (Type::Fixnum, Type::Fixnum) => {}, + // There is a two-argument form of (RegExp, Fixnum) which needs a different c func. + // Other types will raise. + _ => { return false }, + } + } else if argc == 1 { + match asm.ctx.get_opnd_type(StackOpnd(0)) { + Type::Fixnum => {}, + // Besides Fixnum this could also be a Range or a RegExp which are handled by separate c funcs. + // Other types will raise. + _ => { + // If the context doesn't have the type info we try a little harder. + let comptime_arg = jit.peek_at_stack(&asm.ctx, 0); + let arg0 = asm.stack_opnd(0); + if comptime_arg.fixnum_p() { + asm.test(arg0, Opnd::UImm(RUBY_FIXNUM_FLAG as u64)); + + jit_chain_guard( + JCC_JZ, + jit, + asm, + SEND_MAX_DEPTH, + Counter::guard_send_str_aref_not_fixnum, + ); + } else { + return false + } + }, + } + } else { + return false + } + + asm_comment!(asm, "String#[]"); + + // rb_str_substr allocates a substring + jit_prepare_call_with_gc(jit, asm); + + // Get stack operands after potential SP change + + // The "empty" arg distinguishes between the normal "one arg" behavior + // and the "two arg" special case that returns an empty string + // when the begin index is the length of the string. + // See the usages of rb_str_substr in string.c for more information. + let (beg_idx, empty, len) = if argc == 2 { + (1, Opnd::Imm(1), asm.stack_opnd(0)) + } else { + // If there is only one arg, the length will be 1. 
+ (0, Opnd::Imm(0), VALUE::fixnum_from_usize(1).into()) + }; + + let beg = asm.stack_opnd(beg_idx); + let recv = asm.stack_opnd(beg_idx + 1); + + let ret_opnd = asm.ccall(rb_str_substr_two_fixnums as *const u8, vec![recv, beg, len, empty]); + asm.stack_pop(beg_idx as usize + 2); + + let out_opnd = asm.stack_push(Type::Unknown); + asm.mov(out_opnd, ret_opnd); + + true +} + +fn jit_rb_str_getbyte( + jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + _argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + asm_comment!(asm, "String#getbyte"); + + // Don't pop since we may bail + let idx = asm.stack_opnd(0); + let recv = asm.stack_opnd(1); + + let comptime_idx = jit.peek_at_stack(&asm.ctx, 0); + if comptime_idx.fixnum_p(){ + jit_guard_known_klass( + jit, + asm, + idx, + idx.into(), + comptime_idx, + SEND_MAX_DEPTH, + Counter::getbyte_idx_not_fixnum, + ); + } else { + return false; + } + + // Untag the index + let idx = asm.rshift(idx, Opnd::UImm(1)); + + // If index is negative, exit + asm.cmp(idx, Opnd::UImm(0)); + asm.jl(Target::side_exit(Counter::getbyte_idx_negative)); + + asm_comment!(asm, "get string length"); + let recv = asm.load(recv); + let str_len_opnd = Opnd::mem( + std::os::raw::c_long::BITS as u8, + asm.load(recv), + RUBY_OFFSET_RSTRING_LEN as i32, + ); + + // Exit if the index is out of bounds + asm.cmp(idx, str_len_opnd); + asm.jge(Target::side_exit(Counter::getbyte_idx_out_of_bounds)); + + let str_ptr = get_string_ptr(asm, recv); + // FIXME: could use SIB indexing here with proper support in backend + let str_ptr = asm.add(str_ptr, idx); + let byte = asm.load(Opnd::mem(8, str_ptr, 0)); + + // Zero-extend the byte to 64 bits + let byte = byte.with_num_bits(64).unwrap(); + let byte = asm.and(byte, 0xFF.into()); + + // Tag the byte + let byte = asm.lshift(byte, Opnd::UImm(1)); + let byte = asm.or(byte, Opnd::UImm(1)); + + asm.stack_pop(2); // Keep them on stack during ccall for GC + let out_opnd = asm.stack_push(Type::Fixnum); + asm.mov(out_opnd, byte); + + true +} + +fn jit_rb_str_setbyte( + jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + _argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + // Raises when index is out of range. Lazily push a frame in that case. + if !jit_prepare_lazy_frame_call(jit, asm, cme, StackOpnd(2)) { + return false; + } + asm_comment!(asm, "String#setbyte"); + + let value = asm.stack_opnd(0); + let index = asm.stack_opnd(1); + let recv = asm.stack_opnd(2); + + let ret_opnd = asm.ccall(rb_str_setbyte as *const u8, vec![recv, index, value]); + asm.stack_pop(3); // Keep them on stack during ccall for GC + + let out_opnd = asm.stack_push(Type::UnknownImm); asm.mov(out_opnd, ret_opnd); true @@ -4139,17 +6152,15 @@ fn jit_rb_str_bytesize( // this situation happens a lot in some workloads. 
fn jit_rb_str_to_s( _jit: &mut JITState, - _ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, _ci: *const rb_callinfo, _cme: *const rb_callable_method_entry_t, - _block: Option<IseqPtr>, + _block: Option<BlockHandler>, _argc: i32, - known_recv_class: *const VALUE, + known_recv_class: Option<VALUE>, ) -> bool { - if !known_recv_class.is_null() && unsafe { *known_recv_class == rb_cString } { - asm.comment("to_s on plain string"); + if unsafe { known_recv_class == Some(rb_cString) } { + asm_comment!(asm, "to_s on plain string"); // The method returns the receiver, which is already on the stack. // No stack movement. return true; @@ -4157,87 +6168,147 @@ fn jit_rb_str_to_s( false } -// Codegen for rb_str_empty() -fn jit_rb_str_empty( - _jit: &mut JITState, - ctx: &mut Context, +fn jit_rb_str_dup( + jit: &mut JITState, asm: &mut Assembler, - _ocb: &mut OutlinedCb, _ci: *const rb_callinfo, _cme: *const rb_callable_method_entry_t, - _block: Option<IseqPtr>, + _block: Option<BlockHandler>, _argc: i32, - _known_recv_class: *const VALUE, + known_recv_class: Option<VALUE>, ) -> bool { - const _: () = assert!( - RUBY_OFFSET_RSTRING_AS_HEAP_LEN == RUBY_OFFSET_RSTRING_EMBED_LEN, - "same offset to len embedded or not so we can use one code path to read the length", - ); + // We specialize only the BARE_STRING_P case. Otherwise it's not leaf. + if unsafe { known_recv_class != Some(rb_cString) } { + return false; + } + asm_comment!(asm, "String#dup"); + + jit_prepare_call_with_gc(jit, asm); - let recv_opnd = ctx.stack_pop(1); - let out_opnd = ctx.stack_push(Type::UnknownImm); + let recv_opnd = asm.stack_opnd(0); + let recv_opnd = asm.load(recv_opnd); + let shape_id_offset = unsafe { rb_shape_id_offset() }; + let shape_opnd = Opnd::mem(64, recv_opnd, shape_id_offset); + asm.test(shape_opnd, Opnd::UImm(SHAPE_ID_HAS_IVAR_MASK as u64)); + asm.jnz(Target::side_exit(Counter::send_str_dup_exivar)); + + // Call rb_str_dup + let ret_opnd = asm.ccall(rb_str_dup as *const u8, vec![recv_opnd]); + + asm.stack_pop(1); + let stack_ret = asm.stack_push(Type::CString); + asm.mov(stack_ret, ret_opnd); + + true +} + +// Codegen for rb_str_empty_p() +fn jit_rb_str_empty_p( + _jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + _argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + let recv_opnd = asm.stack_pop(1); + + asm_comment!(asm, "get string length"); let str_len_opnd = Opnd::mem( - (8 * size_of::<std::os::raw::c_long>()) as u8, + std::os::raw::c_long::BITS as u8, asm.load(recv_opnd), - RUBY_OFFSET_RSTRING_AS_HEAP_LEN as i32, + RUBY_OFFSET_RSTRING_LEN as i32, ); asm.cmp(str_len_opnd, Opnd::UImm(0)); let string_empty = asm.csel_e(Qtrue.into(), Qfalse.into()); + let out_opnd = asm.stack_push(Type::UnknownImm); asm.mov(out_opnd, string_empty); return true; } +// Codegen for rb_str_concat() with an integer argument -- *not* String#concat +// Using strings as a byte buffer often includes appending byte values to the end of the string. +fn jit_rb_str_concat_codepoint( + jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + _argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + asm_comment!(asm, "String#<< with codepoint argument"); + + // Either of the string concatenation functions we call will reallocate the string to grow its + // capacity if necessary. 
In extremely rare cases (i.e., string exceeds `LONG_MAX` bytes),
+ // either of the called functions will raise an exception.
+ jit_prepare_non_leaf_call(jit, asm);
+
+ let codepoint = asm.stack_opnd(0);
+ let recv = asm.stack_opnd(1);
+
+ guard_object_is_fixnum(jit, asm, codepoint, StackOpnd(0));
+
+ asm.ccall(rb_jit_str_concat_codepoint as *const u8, vec![recv, codepoint]);
+
+ // The receiver is the return value, so we only need to pop the codepoint argument off the stack.
+ // We can reuse the receiver slot in the stack as the return value.
+ asm.stack_pop(1);
+
+ true
+}
+
// Codegen for rb_str_concat() -- *not* String#concat
// Frequently strings are concatenated using "out_str << next_str".
// This is common in Erb and similar templating languages.
fn jit_rb_str_concat(
jit: &mut JITState,
- ctx: &mut Context,
asm: &mut Assembler,
- ocb: &mut OutlinedCb,
- _ci: *const rb_callinfo,
- _cme: *const rb_callable_method_entry_t,
- _block: Option<IseqPtr>,
- _argc: i32,
- _known_recv_class: *const VALUE,
+ ci: *const rb_callinfo,
+ cme: *const rb_callable_method_entry_t,
+ block: Option<BlockHandler>,
+ argc: i32,
+ known_recv_class: Option<VALUE>,
) -> bool {
// The << operator can accept integer codepoints for characters
// as the argument. We only specially optimise string arguments.
// If the peeked-at compile time argument is something other than
// a string, assume it won't be a string later either.
- let comptime_arg = jit_peek_at_stack(jit, ctx, 0);
+ let comptime_arg = jit.peek_at_stack(&asm.ctx, 0);
+ if unsafe { RB_TYPE_P(comptime_arg, RUBY_T_FIXNUM) } {
+ return jit_rb_str_concat_codepoint(jit, asm, ci, cme, block, argc, known_recv_class);
+ }
+
if ! unsafe { RB_TYPE_P(comptime_arg, RUBY_T_STRING) } {
return false;
}
- // Generate a side exit
- let side_exit = get_side_exit(jit, ocb, ctx);
+ // Guard that the concat argument is a string
+ guard_object_is_string(asm, asm.stack_opnd(0), StackOpnd(0), Counter::guard_send_not_string);
- // Guard that the argument is of class String at runtime.
- let arg_type = ctx.get_opnd_type(StackOpnd(0));
+ // Guard buffers from GC since rb_str_buf_append may allocate.
+ // rb_str_buf_append may raise Encoding::CompatibilityError, but we accept compromised
+ // backtraces on this method since the interpreter does the same thing on opt_ltlt.
+ jit_prepare_non_leaf_call(jit, asm);
- let concat_arg = ctx.stack_pop(1);
- let recv = ctx.stack_pop(1);
+ // Explicitly spill temps before making any C calls. `ccall` will spill temps, but it does a
+ // check to only spill if it thinks it's necessary. That logic can't see through the runtime
+ // branching occurring in the code generated for this function. Consequently, the branch for
+ // the first `ccall` will spill registers but the second one will not. At run time, we may
+ // jump over that spill code when executing the second branch, leading to situations that are
+ // quite hard to debug. If we spill up front we avoid diverging behavior.
+ asm.spill_regs(); - // If we're not compile-time certain that this will always be a string, guard at runtime - if arg_type != Type::CString && arg_type != Type::TString { - let arg_opnd = asm.load(concat_arg); - if !arg_type.is_heap() { - asm.comment("guard arg not immediate"); - asm.test(arg_opnd, (RUBY_IMMEDIATE_MASK as u64).into()); - asm.jnz(side_exit); - asm.cmp(arg_opnd, Qfalse.into()); - asm.je(side_exit); - } - guard_object_is_string(asm, arg_opnd, side_exit); - } + let concat_arg = asm.stack_pop(1); + let recv = asm.stack_pop(1); // Test if string encodings differ. If different, use rb_str_append. If the same, // use rb_yjit_str_simple_append, which calls rb_str_cat. - asm.comment("<< on strings"); + asm_comment!(asm, "<< on strings"); // Take receiver's object flags XOR arg's flags. If any // string-encoding flags are different between the two, @@ -4250,21 +6321,22 @@ fn jit_rb_str_concat( ); asm.test(flags_xor, Opnd::UImm(RUBY_ENCODING_MASK as u64)); - // Push once, use the resulting operand in both branches below. - let stack_ret = ctx.stack_push(Type::CString); - let enc_mismatch = asm.new_label("enc_mismatch"); asm.jnz(enc_mismatch); // If encodings match, call the simple append function and jump to return let ret_opnd = asm.ccall(rb_yjit_str_simple_append as *const u8, vec![recv, concat_arg]); let ret_label = asm.new_label("func_return"); + let stack_ret = asm.stack_push(Type::TString); asm.mov(stack_ret, ret_opnd); + asm.stack_pop(1); // forget stack_ret to re-push after ccall asm.jmp(ret_label); // If encodings are different, use a slower encoding-aware concatenate asm.write_label(enc_mismatch); + asm.spill_regs(); // Ignore the register for the other local branch let ret_opnd = asm.ccall(rb_str_buf_append as *const u8, vec![recv, concat_arg]); + let stack_ret = asm.stack_push(Type::TString); asm.mov(stack_ret, ret_opnd); // Drop through to return @@ -4273,30 +6345,120 @@ fn jit_rb_str_concat( true } +// Codegen for rb_ary_empty_p() +fn jit_rb_ary_empty_p( + _jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + _argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + let array_opnd = asm.stack_pop(1); + let array_reg = asm.load(array_opnd); + let len_opnd = get_array_len(asm, array_reg); + + asm.test(len_opnd, len_opnd); + let bool_val = asm.csel_z(Qtrue.into(), Qfalse.into()); + + let out_opnd = asm.stack_push(Type::UnknownImm); + asm.store(out_opnd, bool_val); + + return true; +} + +// Codegen for rb_ary_length() +fn jit_rb_ary_length( + _jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + _argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + let array_opnd = asm.stack_pop(1); + let array_reg = asm.load(array_opnd); + let len_opnd = get_array_len(asm, array_reg); + + // Convert the length to a fixnum + let shifted_val = asm.lshift(len_opnd, Opnd::UImm(1)); + let out_val = asm.or(shifted_val, Opnd::UImm(RUBY_FIXNUM_FLAG as u64)); + + let out_opnd = asm.stack_push(Type::Fixnum); + asm.store(out_opnd, out_val); + + return true; +} + +fn jit_rb_ary_push( + jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + _argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + asm_comment!(asm, "Array#<<"); + + // rb_ary_push allocates memory for buffer extension and can raise 
FrozenError + // Not using a lazy frame here since the interpreter also has a truncated + // stack trace from opt_ltlt. + jit_prepare_non_leaf_call(jit, asm); + + let item_opnd = asm.stack_opnd(0); + let ary_opnd = asm.stack_opnd(1); + let ret = asm.ccall(rb_ary_push as *const u8, vec![ary_opnd, item_opnd]); + asm.stack_pop(2); // Keep them on stack during ccall for GC + + let ret_opnd = asm.stack_push(Type::TArray); + asm.mov(ret_opnd, ret); + true +} + +// Just a leaf method, but not using `Primitive.attr! :leaf` since BOP methods can't use it. +fn jit_rb_hash_empty_p( + _jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + _argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + asm_comment!(asm, "Hash#empty?"); + + let hash_opnd = asm.stack_pop(1); + let ret = asm.ccall(rb_hash_empty_p as *const u8, vec![hash_opnd]); + + let ret_opnd = asm.stack_push(Type::UnknownImm); + asm.mov(ret_opnd, ret); + true +} + fn jit_obj_respond_to( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, _ci: *const rb_callinfo, _cme: *const rb_callable_method_entry_t, - _block: Option<IseqPtr>, + _block: Option<BlockHandler>, argc: i32, - known_recv_class: *const VALUE, + known_recv_class: Option<VALUE>, ) -> bool { // respond_to(:sym) or respond_to(:sym, true) if argc != 1 && argc != 2 { return false; } - if known_recv_class.is_null() { - return false; - } - - let recv_class = unsafe { *known_recv_class }; + let recv_class = match known_recv_class { + Some(class) => class, + None => return false, + }; // Get the method_id from compile time. We will later add a guard against it. - let mid_sym = jit_peek_at_stack(jit, ctx, (argc - 1) as isize); + let mid_sym = jit.peek_at_stack(&asm.ctx, (argc - 1) as isize); if !mid_sym.static_sym_p() { return false } @@ -4308,7 +6470,7 @@ fn jit_obj_respond_to( Some(false) } else { // Get value from type information (may or may not be known) - ctx.get_opnd_type(StackOpnd(0)).known_truthy() + asm.ctx.get_opnd_type(StackOpnd(0)).known_truthy() }; let target_cme = unsafe { rb_callable_method_entry_or_negative(recv_class, mid) }; @@ -4329,83 +6491,223 @@ fn jit_obj_respond_to( }; let result = match (visibility, allow_priv) { - (METHOD_VISI_UNDEF, _) => Qfalse, // No method => false - (METHOD_VISI_PUBLIC, _) => Qtrue, // Public method => true regardless of include_all - (_, Some(true)) => Qtrue, // include_all => always true + (METHOD_VISI_UNDEF, _) => { + // No method, we can return false given respond_to_missing? hasn't been overridden. + // In the future, we might want to jit the call to respond_to_missing? + if !assume_method_basic_definition(jit, asm, recv_class, ID!(respond_to_missing)) { + return false; + } + Qfalse + } + (METHOD_VISI_PUBLIC, _) | // Public method => fine regardless of include_all + (_, Some(true)) => { // include_all => all visibility are acceptable + // Method exists and has acceptable visibility + if cme_def_type == VM_METHOD_TYPE_NOTIMPLEMENTED { + // C method with rb_f_notimplement(). `respond_to?` returns false + // without consulting `respond_to_missing?`. See also: rb_add_method_cfunc() + Qfalse + } else { + Qtrue + } + } (_, _) => return false // not public and include_all not known, can't compile }; - if result != Qtrue { - // Only if respond_to_missing? hasn't been overridden - // In the future, we might want to jit the call to respond_to_missing? 
- if !assume_method_basic_definition(jit, ocb, recv_class, idRespond_to_missing.into()) { - return false; - } - } - // Invalidate this block if method lookup changes for the method being queried. This works // both for the case where a method does or does not exist, as for the latter we asked for a // "negative CME" earlier. - assume_method_lookup_stable(jit, ocb, target_cme); - - // Generate a side exit - let side_exit = get_side_exit(jit, ocb, ctx); + jit.assume_method_lookup_stable(asm, target_cme); if argc == 2 { // pop include_all argument (we only use its type info) - ctx.stack_pop(1); + asm.stack_pop(1); } - let sym_opnd = ctx.stack_pop(1); - let _recv_opnd = ctx.stack_pop(1); + let sym_opnd = asm.stack_pop(1); + let _recv_opnd = asm.stack_pop(1); // This is necessary because we have no guarantee that sym_opnd is a constant - asm.comment("guard known mid"); + asm_comment!(asm, "guard known mid"); asm.cmp(sym_opnd, mid_sym.into()); - asm.jne(side_exit); + jit_chain_guard( + JCC_JNE, + jit, + asm, + SEND_MAX_DEPTH, + Counter::guard_send_respond_to_mid_mismatch, + ); + + jit_putobject(asm, result); + + true +} + +fn jit_rb_f_block_given_p( + jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + _argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + asm.stack_pop(1); + let out_opnd = asm.stack_push(Type::UnknownImm); + + gen_block_given(jit, asm, out_opnd, Qtrue.into(), Qfalse.into()); + + true +} + +/// Codegen for `block_given?` and `defined?(yield)` +fn gen_block_given( + jit: &mut JITState, + asm: &mut Assembler, + out_opnd: Opnd, + true_opnd: Opnd, + false_opnd: Opnd, +) { + asm_comment!(asm, "block_given?"); + + // `yield` goes to the block handler stowed in the "local" iseq which is + // the current iseq or a parent. Only the "method" iseq type can be passed a + // block handler. (e.g. `yield` in the top level script is a syntax error.) 
+ let local_iseq = unsafe { rb_get_iseq_body_local_iseq(jit.iseq) }; + if unsafe { rb_get_iseq_body_type(local_iseq) } == ISEQ_TYPE_METHOD { + // Same as rb_vm_frame_block_handler + let ep_opnd = gen_get_lep(jit, asm); + let block_handler = asm.load( + Opnd::mem(64, ep_opnd, SIZEOF_VALUE_I32 * VM_ENV_DATA_INDEX_SPECVAL) + ); + + // Return `block_handler != VM_BLOCK_HANDLER_NONE` + asm.cmp(block_handler, VM_BLOCK_HANDLER_NONE.into()); + let block_given = asm.csel_ne(true_opnd, false_opnd); + asm.mov(out_opnd, block_given); + } else { + asm.mov(out_opnd, false_opnd); + } +} + +// Codegen for rb_class_superclass() +fn jit_rb_class_superclass( + jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + cme: *const rb_callable_method_entry_t, + _block: Option<crate::codegen::BlockHandler>, + _argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + extern "C" { + fn rb_class_superclass(klass: VALUE) -> VALUE; + } + + // It may raise "uninitialized class" + if !jit_prepare_lazy_frame_call(jit, asm, cme, StackOpnd(0)) { + return false; + } + + asm_comment!(asm, "Class#superclass"); + let recv_opnd = asm.stack_opnd(0); + let ret = asm.ccall(rb_class_superclass as *const u8, vec![recv_opnd]); - jit_putobject(jit, ctx, asm, result); + asm.stack_pop(1); + let ret_opnd = asm.stack_push(Type::Unknown); + asm.mov(ret_opnd, ret); + + true +} + +fn jit_rb_case_equal( + jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + _argc: i32, + known_recv_class: Option<VALUE>, +) -> bool { + if !jit.assume_expected_cfunc(asm, known_recv_class.unwrap(), ID!(eq), rb_obj_equal as _) { + return false; + } + + asm_comment!(asm, "case_equal: {}#===", get_class_name(known_recv_class)); + + // Compare the arguments + let arg1 = asm.stack_pop(1); + let arg0 = asm.stack_pop(1); + asm.cmp(arg0, arg1); + let ret_opnd = asm.csel_e(Qtrue.into(), Qfalse.into()); + + let stack_ret = asm.stack_push(Type::UnknownImm); + asm.mov(stack_ret, ret_opnd); true } fn jit_thread_s_current( _jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, _ci: *const rb_callinfo, _cme: *const rb_callable_method_entry_t, - _block: Option<IseqPtr>, + _block: Option<BlockHandler>, _argc: i32, - _known_recv_class: *const VALUE, + _known_recv_class: Option<VALUE>, ) -> bool { - asm.comment("Thread.current"); - ctx.stack_pop(1); + asm_comment!(asm, "Thread.current"); + asm.stack_pop(1); // ec->thread_ptr - let ec_thread_opnd = asm.load(Opnd::mem(64, EC, RUBY_OFFSET_EC_THREAD_PTR)); + let ec_thread_opnd = asm.load(Opnd::mem(64, EC, RUBY_OFFSET_EC_THREAD_PTR as i32)); // thread->self let thread_self = Opnd::mem(64, ec_thread_opnd, RUBY_OFFSET_THREAD_SELF); - let stack_ret = ctx.stack_push(Type::UnknownHeap); + let stack_ret = asm.stack_push(Type::UnknownHeap); asm.mov(stack_ret, thread_self); true } -// Check if we know how to codegen for a particular cfunc method +/// Specialization for rb_obj_dup() (Kernel#dup) +fn jit_rb_obj_dup( + _jit: &mut JITState, + asm: &mut Assembler, + _ci: *const rb_callinfo, + _cme: *const rb_callable_method_entry_t, + _block: Option<BlockHandler>, + _argc: i32, + _known_recv_class: Option<VALUE>, +) -> bool { + // Kernel#dup has arity=0, and caller already did argument count check. + let self_type = asm.ctx.get_opnd_type(StackOpnd(0)); + + if self_type.is_imm() { + // Method is no-op when receiver is an immediate value. 
+ true + } else { + false + } +} + +/// Check if we know how to codegen for a particular cfunc method +/// See also: [reg_method_codegen]. fn lookup_cfunc_codegen(def: *const rb_method_definition_t) -> Option<MethodGenFn> { let method_serial = unsafe { get_def_method_serial(def) }; + let table = unsafe { METHOD_CODEGEN_TABLE.as_ref().unwrap() }; - CodegenGlobals::look_up_codegen_method(method_serial) + let option_ref = table.get(&method_serial); + match option_ref { + None => None, + Some(&mgf) => Some(mgf), // Deref + } } // Is anyone listening for :c_call and :c_return event currently? fn c_method_tracing_currently_enabled(jit: &JITState) -> bool { // Defer to C implementation in yjit.c unsafe { - rb_c_method_tracing_currently_enabled(jit.ec.unwrap() as *mut rb_execution_context_struct) + rb_c_method_tracing_currently_enabled(jit.ec) } } @@ -4430,13 +6732,25 @@ unsafe extern "C" fn build_kwhash(ci: *const rb_callinfo, sp: *const VALUE) -> V // at sp[-2]. Depending on the frame type, it can serve different purposes, // which are covered here by enum variants. enum SpecVal { - None, - BlockISeq(IseqPtr), - BlockParamProxy, + BlockHandler(Option<BlockHandler>), PrevEP(*const VALUE), PrevEPOpnd(Opnd), } +// Each variant represents a branch in vm_caller_setup_arg_block. +#[derive(Clone, Copy)] +pub enum BlockHandler { + // send, invokesuper: blockiseq operand + BlockISeq(IseqPtr), + // invokesuper: GET_BLOCK_HANDLER() (GET_LEP()[VM_ENV_DATA_INDEX_SPECVAL]) + LEPSpecVal, + // part of the allocate-free block forwarding scheme + BlockParamProxy, + // To avoid holding the block arg (e.g. proc and symbol) across C calls, + // we might need to set the block handler early in the call sequence + AlreadySet, +} + struct ControlFrame { recv: Opnd, sp: Opnd, @@ -4445,7 +6759,6 @@ struct ControlFrame { frame_type: u32, specval: SpecVal, cme: *const rb_callable_method_entry_t, - local_size: i32 } // Codegen performing a similar (but not identical) function to vm_push_frame @@ -4460,21 +6773,17 @@ struct ControlFrame { // * Provided sp should point to the new frame's sp, immediately following locals and the environment // * At entry, CFP points to the caller (not callee) frame // * At exit, ec->cfp is updated to the pushed CFP -// * CFP and SP registers are updated only if set_sp_cfp is set +// * SP register is updated only if frame.iseq is set // * Stack overflow is not checked (should be done by the caller) // * Interrupts are not checked (should be done by the caller) fn gen_push_frame( jit: &mut JITState, - _ctx: &mut Context, asm: &mut Assembler, - set_sp_cfp: bool, // if true CFP and SP will be switched to the callee frame: ControlFrame, ) { - assert!(frame.local_size >= 0); - let sp = frame.sp; - asm.comment("push cme, specval, frame type"); + asm_comment!(asm, "push cme, specval, frame type"); // Write method entry at sp[-3] // sp[-3] = me; @@ -4486,27 +6795,31 @@ fn gen_push_frame( // the outer environment depending on the frame type. // sp[-2] = specval; let specval: Opnd = match frame.specval { - SpecVal::None => { - VM_BLOCK_HANDLER_NONE.into() - } - SpecVal::BlockISeq(block_iseq) => { - // Change cfp->block_code in the current frame. See vm_caller_setup_arg_block(). - // VM_CFP_TO_CAPTURED_BLOCK does &cfp->self, rb_captured_block->code.iseq aliases - // with cfp->block_code. 
- asm.store(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_BLOCK_CODE), VALUE::from(block_iseq).into()); - - let cfp_self = asm.lea(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF)); - asm.or(cfp_self, Opnd::Imm(1)) - } - SpecVal::BlockParamProxy => { - let ep_opnd = gen_get_lep(jit, asm); - let block_handler = asm.load( - Opnd::mem(64, ep_opnd, SIZEOF_VALUE_I32 * VM_ENV_DATA_INDEX_SPECVAL) - ); - - asm.store(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_BLOCK_CODE), block_handler); - - block_handler + SpecVal::BlockHandler(None) => VM_BLOCK_HANDLER_NONE.into(), + SpecVal::BlockHandler(Some(block_handler)) => { + match block_handler { + BlockHandler::BlockISeq(block_iseq) => { + // Change cfp->block_code in the current frame. See vm_caller_setup_arg_block(). + // VM_CFP_TO_CAPTURED_BLOCK does &cfp->self, rb_captured_block->code.iseq aliases + // with cfp->block_code. + asm.store(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_BLOCK_CODE), VALUE::from(block_iseq).into()); + + let cfp_self = asm.lea(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF)); + asm.or(cfp_self, Opnd::Imm(1)) + } + BlockHandler::LEPSpecVal => { + let lep_opnd = gen_get_lep(jit, asm); + asm.load(Opnd::mem(64, lep_opnd, SIZEOF_VALUE_I32 * VM_ENV_DATA_INDEX_SPECVAL)) + } + BlockHandler::BlockParamProxy => { + let ep_opnd = gen_get_lep(jit, asm); + let block_handler = asm.load( + Opnd::mem(64, ep_opnd, SIZEOF_VALUE_I32 * VM_ENV_DATA_INDEX_SPECVAL) + ); + block_handler + } + BlockHandler::AlreadySet => 0.into(), // unused + } } SpecVal::PrevEP(prev_ep) => { let tagged_prev_ep = (prev_ep as usize) | 1; @@ -4514,9 +6827,13 @@ fn gen_push_frame( } SpecVal::PrevEPOpnd(ep_opnd) => { asm.or(ep_opnd, 1.into()) - }, + } }; - asm.store(Opnd::mem(64, sp, SIZEOF_VALUE_I32 * -2), specval); + if let SpecVal::BlockHandler(Some(BlockHandler::AlreadySet)) = frame.specval { + asm_comment!(asm, "specval should have been set"); + } else { + asm.store(Opnd::mem(64, sp, SIZEOF_VALUE_I32 * -2), specval); + } // Write env flags at sp[-1] // sp[-1] = frame_type; @@ -4535,16 +6852,14 @@ fn gen_push_frame( // .self = recv, // .ep = <sp - 1>, // .block_code = 0, - // .__bp__ = sp, // }; - asm.comment("push callee control frame"); + asm_comment!(asm, "push callee control frame"); // For an iseq call PC may be None, in which case we will not set PC and will allow jitted code // to set it as necessary. - let _pc = if let Some(pc) = frame.pc { + if let Some(pc) = frame.pc { asm.mov(cfp_opnd(RUBY_OFFSET_CFP_PC), pc.into()); }; - asm.mov(cfp_opnd(RUBY_OFFSET_CFP_BP), sp); asm.mov(cfp_opnd(RUBY_OFFSET_CFP_SP), sp); let iseq: Opnd = if let Some(iseq) = frame.iseq { VALUE::from(iseq).into() @@ -4555,89 +6870,36 @@ fn gen_push_frame( asm.mov(cfp_opnd(RUBY_OFFSET_CFP_SELF), frame.recv); asm.mov(cfp_opnd(RUBY_OFFSET_CFP_BLOCK_CODE), 0.into()); - // This Qnil fill snippet potentially requires 2 more registers on Arm, one for Qnil and - // another for calculating the address in case there are a lot of local variables. So doing - // this after releasing the register for specval and the receiver to avoid register spill. 
- let num_locals = frame.local_size; - if num_locals > 0 { - asm.comment("initialize locals"); - - // Initialize local variables to Qnil - for i in 0..num_locals { - let offs = SIZEOF_VALUE_I32 * (i - num_locals - 3); - asm.store(Opnd::mem(64, sp, offs), Qnil.into()); - } - } - - if set_sp_cfp { - // Saving SP before calculating ep avoids a dependency on a register - // However this must be done after referencing frame.recv, which may be SP-relative - asm.mov(SP, sp); - } let ep = asm.sub(sp, SIZEOF_VALUE.into()); asm.mov(cfp_opnd(RUBY_OFFSET_CFP_EP), ep); - - asm.comment("switch to new CFP"); - let new_cfp = asm.lea(cfp_opnd(0)); - if set_sp_cfp { - asm.mov(CFP, new_cfp); - asm.store(Opnd::mem(64, EC, RUBY_OFFSET_EC_CFP), CFP); - } else { - asm.store(Opnd::mem(64, EC, RUBY_OFFSET_EC_CFP), new_cfp); - } } fn gen_send_cfunc( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, ci: *const rb_callinfo, cme: *const rb_callable_method_entry_t, - block: Option<IseqPtr>, - recv_known_klass: *const VALUE, + block: Option<BlockHandler>, + recv_known_class: Option<VALUE>, flags: u32, argc: i32, -) -> CodegenStatus { +) -> Option<CodegenStatus> { let cfunc = unsafe { get_cme_def_body_cfunc(cme) }; let cfunc_argc = unsafe { get_mct_argc(cfunc) }; let mut argc = argc; - // Create a side-exit to fall back to the interpreter - let side_exit = get_side_exit(jit, ocb, ctx); - - // If the function expects a Ruby array of arguments - if cfunc_argc < 0 && cfunc_argc != -1 { - gen_counter_incr!(asm, send_cfunc_ruby_array_varg); - return CantCompile; - } - - // We aren't handling a vararg cfuncs with splat currently. - if flags & VM_CALL_ARGS_SPLAT != 0 && cfunc_argc == -1 { - gen_counter_incr!(asm, send_args_splat_cfunc_var_args); - return CantCompile; - } + // Splat call to a C method that takes `VALUE *` and `len` + let variable_splat = flags & VM_CALL_ARGS_SPLAT != 0 && cfunc_argc == -1; + let block_arg = flags & VM_CALL_ARGS_BLOCKARG != 0; - if flags & VM_CALL_ARGS_SPLAT != 0 && flags & VM_CALL_ZSUPER != 0 { - // zsuper methods are super calls without any arguments. - // They are also marked as splat, but don't actually have an array - // they pull arguments from, instead we need to change to call - // a different method with the current stack. - gen_counter_incr!(asm, send_args_splat_cfunc_zuper); - return CantCompile; + // If it's a splat and the method expects a Ruby array of arguments + if cfunc_argc == -2 && flags & VM_CALL_ARGS_SPLAT != 0 { + gen_counter_incr(jit, asm, Counter::send_cfunc_splat_neg2); + return None; } - // In order to handle backwards compatibility between ruby 3 and 2 - // ruby2_keywords was introduced. It is called only on methods - // with splat and changes they way they handle them. - // We are just going to not compile these. 
- // https://docs.ruby-lang.org/en/3.2/Module.html#method-i-ruby2_keywords - if unsafe { - get_iseq_flags_ruby2_keywords(jit.iseq) && flags & VM_CALL_ARGS_SPLAT != 0 - } { - gen_counter_incr!(asm, send_args_splat_cfunc_ruby2_keywords); - return CantCompile; - } + exit_if_kwsplat_non_nil(jit, asm, flags, Counter::send_cfunc_kw_splat_non_nil)?; + let kw_splat = flags & VM_CALL_KW_SPLAT != 0; let kw_arg = unsafe { vm_ci_kwarg(ci) }; let kw_arg_num = if kw_arg.is_null() { @@ -4647,39 +6909,79 @@ fn gen_send_cfunc( }; if kw_arg_num != 0 && flags & VM_CALL_ARGS_SPLAT != 0 { - gen_counter_incr!(asm, send_cfunc_splat_with_kw); - return CantCompile; + gen_counter_incr(jit, asm, Counter::send_cfunc_splat_with_kw); + return None; } if c_method_tracing_currently_enabled(jit) { // Don't JIT if tracing c_call or c_return - gen_counter_incr!(asm, send_cfunc_tracing); - return CantCompile; - } + gen_counter_incr(jit, asm, Counter::send_cfunc_tracing); + return None; + } + + // Increment total cfunc send count + gen_counter_incr(jit, asm, Counter::num_send_cfunc); + + // Delegate to codegen for C methods if we have it and the callsite is simple enough. + if kw_arg.is_null() && + !kw_splat && + flags & VM_CALL_OPT_SEND == 0 && + flags & VM_CALL_ARGS_SPLAT == 0 && + flags & VM_CALL_ARGS_BLOCKARG == 0 && + (cfunc_argc == -1 || argc == cfunc_argc) { + let expected_stack_after = asm.ctx.get_stack_size() as i32 - argc; + if let Some(known_cfunc_codegen) = lookup_cfunc_codegen(unsafe { (*cme).def }) { + // We don't push a frame for specialized cfunc codegen, so the generated code must be leaf. + // However, the interpreter doesn't push a frame on opt_* instruction either, so we allow + // non-sendish instructions to break this rule as an exception. + let cfunc_codegen = if jit.is_sendish() { + asm.with_leaf_ccall(|asm| + perf_call!("gen_send_cfunc: ", known_cfunc_codegen(jit, asm, ci, cme, block, argc, recv_known_class)) + ) + } else { + perf_call!("gen_send_cfunc: ", known_cfunc_codegen(jit, asm, ci, cme, block, argc, recv_known_class)) + }; - // Delegate to codegen for C methods if we have it. - if kw_arg.is_null() && flags & VM_CALL_OPT_SEND == 0 { - let codegen_p = lookup_cfunc_codegen(unsafe { (*cme).def }); - if let Some(known_cfunc_codegen) = codegen_p { - if known_cfunc_codegen(jit, ctx, asm, ocb, ci, cme, block, argc, recv_known_klass) { + if cfunc_codegen { + assert_eq!(expected_stack_after, asm.ctx.get_stack_size() as i32); + gen_counter_incr(jit, asm, Counter::num_send_cfunc_inline); // cfunc codegen generated code. Terminate the block so // there isn't multiple calls in the same block. 
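// A minimal standalone sketch (toy helper, not YJIT code) of the invariant the
// assert above enforces: every specialized cfunc codegen reached through
// lookup_cfunc_codegen() must pop the receiver plus argc arguments and push
// exactly one result, so the net stack effect is always -argc.
fn toy_cfunc_stack_effect_ok(stack_before: i32, stack_after: i32, argc: i32) -> bool {
    // pop argc args + receiver, push 1 return value => net change of -argc
    stack_after == stack_before - argc
}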
- jump_to_next_insn(jit, ctx, asm, ocb); - return EndBlock; + return jump_to_next_insn(jit, asm); } } } // Check for interrupts - gen_check_ints(asm, side_exit); + gen_check_ints(asm, Counter::guard_send_interrupted); // Stack overflow check // #define CHECK_VM_STACK_OVERFLOW0(cfp, sp, margin) // REG_CFP <= REG_SP + 4 * SIZEOF_VALUE + sizeof(rb_control_frame_t) - asm.comment("stack overflow check"); - let stack_limit = asm.lea(ctx.sp_opnd((SIZEOF_VALUE * 4 + 2 * RUBY_SIZEOF_CONTROL_FRAME) as isize)); + asm_comment!(asm, "stack overflow check"); + const _: () = assert!(RUBY_SIZEOF_CONTROL_FRAME % SIZEOF_VALUE == 0, "sizeof(rb_control_frame_t) is a multiple of sizeof(VALUE)"); + let stack_limit = asm.lea(asm.ctx.sp_opnd((4 + 2 * (RUBY_SIZEOF_CONTROL_FRAME / SIZEOF_VALUE)) as i32)); asm.cmp(CFP, stack_limit); - asm.jbe(counted_exit!(ocb, side_exit, send_se_cf_overflow)); + asm.jbe(Target::side_exit(Counter::guard_send_se_cf_overflow)); + + // Guard for variable length splat call before any modifications to the stack + if variable_splat { + let splat_array_idx = i32::from(kw_splat) + i32::from(block_arg); + let comptime_splat_array = jit.peek_at_stack(&asm.ctx, splat_array_idx as isize); + if unsafe { rb_yjit_ruby2_keywords_splat_p(comptime_splat_array) } != 0 { + gen_counter_incr(jit, asm, Counter::send_cfunc_splat_varg_ruby2_keywords); + return None; + } + + let splat_array = asm.stack_opnd(splat_array_idx); + guard_object_is_array(asm, splat_array, splat_array.into(), Counter::guard_send_splat_not_array); + + asm_comment!(asm, "guard variable length splat call servicable"); + let sp = asm.ctx.sp_opnd(0); + let proceed = asm.ccall(rb_yjit_splat_varg_checks as _, vec![sp, splat_array, CFP]); + asm.cmp(proceed, Qfalse.into()); + asm.je(Target::side_exit(Counter::guard_send_cfunc_bad_splat_vargs)); + } // Number of args which will be passed through to the callee // This is adjusted by the kwargs being combined into a hash. 
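// A worked example of the adjustment described above, as a toy helper (not
// YJIT code): "combined into a hash" refers to build_kwhash() further down,
// which collapses the kwargs into a single trailing Hash argument. For
// `recv.m(1, 2, k: 3)` we have argc = 3 and kw_arg_num = 1, so the callee sees
// 3 - 1 + 1 = 3 values: 1, 2 and {k: 3}. The kw_arg_num == 0 branch mirrors
// the kw_arg.is_null() case.
fn toy_passed_argc(argc: i32, kw_arg_num: i32) -> i32 {
    if kw_arg_num == 0 { argc } else { argc - kw_arg_num + 1 }
}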
@@ -4689,93 +6991,116 @@ fn gen_send_cfunc( argc - kw_arg_num + 1 }; + // Exclude the kw_splat hash from arity check + if kw_splat { + passed_argc -= 1; + } // If the argument count doesn't match if cfunc_argc >= 0 && cfunc_argc != passed_argc && flags & VM_CALL_ARGS_SPLAT == 0 { - gen_counter_incr!(asm, send_cfunc_argc_mismatch); - return CantCompile; + gen_counter_incr(jit, asm, Counter::send_cfunc_argc_mismatch); + return None; } // Don't JIT functions that need C stack arguments for now if cfunc_argc >= 0 && passed_argc + 1 > (C_ARG_OPNDS.len() as i32) { - gen_counter_incr!(asm, send_cfunc_toomany_args); - return CantCompile; + gen_counter_incr(jit, asm, Counter::send_cfunc_toomany_args); + return None; } - let block_arg = flags & VM_CALL_ARGS_BLOCKARG != 0; - let block_arg_type = if block_arg { - Some(ctx.get_opnd_type(StackOpnd(0))) + let mut block_arg_type = if block_arg { + Some(asm.ctx.get_opnd_type(StackOpnd(0))) } else { None }; match block_arg_type { Some(Type::Nil | Type::BlockParamProxy) => { - // We'll handle this later + // We don't need the actual stack value for these + asm.stack_pop(1); } - None => { - // Nothing to do - } - _ => { - gen_counter_incr!(asm, send_block_arg); - return CantCompile; - } - } - - match block_arg_type { - Some(Type::Nil) => { - // We have a nil block arg, so let's pop it off the args - ctx.stack_pop(1); - } - Some(Type::BlockParamProxy) => { - // We don't need the actual stack value - ctx.stack_pop(1); + Some(Type::Unknown | Type::UnknownImm) if jit.peek_at_stack(&asm.ctx, 0).nil_p() => { + // The sample blockarg is nil, so speculate that's the case. + asm.cmp(asm.stack_opnd(0), Qnil.into()); + asm.jne(Target::side_exit(Counter::guard_send_cfunc_block_not_nil)); + block_arg_type = Some(Type::Nil); + asm.stack_pop(1); } None => { // Nothing to do } _ => { - assert!(false); + gen_counter_incr(jit, asm, Counter::send_cfunc_block_arg); + return None; } } + let block_arg_type = block_arg_type; // drop `mut` - // This is a .send call and we need to adjust the stack - if flags & VM_CALL_OPT_SEND != 0 { - handle_opt_send_shift_stack(asm, argc, ctx); + // Pop the empty kw_splat hash + if kw_splat { + // Only `**nil` is supported right now. Checked in exit_if_kwsplat_non_nil() + assert_eq!(Type::Nil, asm.ctx.get_opnd_type(StackOpnd(0))); + asm.stack_pop(1); + argc -= 1; } - // push_splat_args does stack manipulation so we can no longer side exit - if flags & VM_CALL_ARGS_SPLAT != 0 { + // Splat handling when C method takes a static number of arguments. + // push_splat_args() does stack manipulation so we can no longer side exit + if flags & VM_CALL_ARGS_SPLAT != 0 && cfunc_argc >= 0 { let required_args : u32 = (cfunc_argc as u32).saturating_sub(argc as u32 - 1); // + 1 because we pass self if required_args + 1 >= C_ARG_OPNDS.len() as u32 { - gen_counter_incr!(asm, send_cfunc_toomany_args); - return CantCompile; + gen_counter_incr(jit, asm, Counter::send_cfunc_toomany_args); + return None; } + // We are going to assume that the splat fills - // all the remaining arguments. In the generated code - // we test if this is true and if not side exit. - argc = required_args as i32; + // all the remaining arguments. So the number of args + // should just equal the number of args the cfunc takes. + // In the generated code we test if this is true + // and if not side exit. 
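// A toy helper (not YJIT code) restating the assumption above: with a
// fixed-arity cfunc, the splat must supply exactly the parameters not already
// given positionally, i.e. cfunc_argc - (argc - 1), where the -1 drops the
// splat array itself from the count. For example, a two-argument cfunc called
// as `recv.m(0, *args)` (argc = 2) needs a one-element splat; the generated
// length guard side-exits otherwise.
fn toy_splat_required_args(cfunc_argc: u32, argc: u32) -> u32 {
    cfunc_argc.saturating_sub(argc - 1)
}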
+ argc = cfunc_argc; passed_argc = argc; - push_splat_args(required_args, ctx, asm, ocb, side_exit) + push_splat_args(required_args, asm) } + // This is a .send call and we need to adjust the stack + if flags & VM_CALL_OPT_SEND != 0 { + handle_opt_send_shift_stack(asm, argc); + } + + // Push a dynamic number of items from the splat array to the stack when calling a vargs method + let dynamic_splat_size = if variable_splat { + asm_comment!(asm, "variable length splat"); + let stack_splat_array = asm.lea(asm.stack_opnd(0)); + Some(asm.ccall(rb_yjit_splat_varg_cfunc as _, vec![stack_splat_array])) + } else { + None + }; + // Points to the receiver operand on the stack - let recv = ctx.stack_opnd(argc); + let recv = asm.stack_opnd(argc); // Store incremented PC into current control frame in case callee raises. jit_save_pc(jit, asm); - // Increment the stack pointer by 3 (in the callee) - // sp += 3 - let sp = asm.lea(ctx.sp_opnd((SIZEOF_VALUE as isize) * 3)); + // Find callee's SP with space for metadata. + // Usually sp+3. + let sp = if let Some(splat_size) = dynamic_splat_size { + // Compute the callee's SP at runtime in case we accept a variable size for the splat array + const _: () = assert!(SIZEOF_VALUE == 8, "opting for a shift since mul on A64 takes no immediates"); + let splat_size_bytes = asm.lshift(splat_size, 3usize.into()); + // 3 items for method metadata, minus one to remove the splat array + let static_stack_top = asm.lea(asm.ctx.sp_opnd(2)); + asm.add(static_stack_top, splat_size_bytes) + } else { + asm.lea(asm.ctx.sp_opnd(3)) + }; let specval = if block_arg_type == Some(Type::BlockParamProxy) { - SpecVal::BlockParamProxy - } else if let Some(block_iseq) = block { - SpecVal::BlockISeq(block_iseq) + SpecVal::BlockHandler(Some(BlockHandler::BlockParamProxy)) } else { - SpecVal::None + SpecVal::BlockHandler(block) }; let mut frame_type = VM_FRAME_MAGIC_CFUNC | VM_FRAME_FLAG_CFRAME | VM_ENV_FLAG_LOCAL; @@ -4783,60 +7108,83 @@ fn gen_send_cfunc( frame_type |= VM_FRAME_FLAG_CFRAME_KW } - gen_push_frame(jit, ctx, asm, false, ControlFrame { + perf_call!("gen_send_cfunc: ", gen_push_frame(jit, asm, ControlFrame { frame_type, specval, cme, recv, sp, - pc: Some(0), + pc: if cfg!(feature = "runtime_checks") { + Some(!0) // Poison value. Helps to fail fast. + } else { + None // Leave PC uninitialized as cfuncs shouldn't read it + }, iseq: None, - local_size: 0, - }); + })); + + asm_comment!(asm, "set ec->cfp"); + let new_cfp = asm.lea(Opnd::mem(64, CFP, -(RUBY_SIZEOF_CONTROL_FRAME as i32))); + asm.store(Opnd::mem(64, EC, RUBY_OFFSET_EC_CFP as i32), new_cfp); if !kw_arg.is_null() { // Build a hash from all kwargs passed - asm.comment("build_kwhash"); + asm_comment!(asm, "build_kwhash"); let imemo_ci = VALUE(ci as usize); assert_ne!(0, unsafe { rb_IMEMO_TYPE_P(imemo_ci, imemo_callinfo) }, "we assume all callinfos with kwargs are on the GC heap"); - let sp = asm.lea(ctx.sp_opnd(0)); + let sp = asm.lea(asm.ctx.sp_opnd(0)); let kwargs = asm.ccall(build_kwhash as *const u8, vec![imemo_ci.into(), sp]); // Replace the stack location at the start of kwargs with the new hash - let stack_opnd = ctx.stack_opnd(argc - passed_argc); + let stack_opnd = asm.stack_opnd(argc - passed_argc); asm.mov(stack_opnd, kwargs); } - // Copy SP because REG_SP will get overwritten - let sp = asm.lea(ctx.sp_opnd(0)); - - // Pop the C function arguments from the stack (in the caller) - ctx.stack_pop((argc + 1).try_into().unwrap()); - // Write interpreter SP into CFP. - // Needed in case the callee yields to the block. 
- gen_save_sp(jit, asm, ctx); + // We don't pop arguments yet to use registers for passing them, but we + // have to set cfp->sp below them for full_cfunc_return() invalidation. + gen_save_sp_with_offset(asm, -(argc + 1) as i8); // Non-variadic method let args = if cfunc_argc >= 0 { // Copy the arguments from the stack to the C argument registers // self is the 0th argument and is at index argc from the stack top (0..=passed_argc).map(|i| - Opnd::mem(64, sp, -(argc + 1 - i) * SIZEOF_VALUE_I32) + asm.stack_opnd(argc - i) ).collect() } // Variadic method else if cfunc_argc == -1 { // The method gets a pointer to the first argument // rb_f_puts(int argc, VALUE *argv, VALUE recv) + + let passed_argc_opnd = if let Some(splat_size) = dynamic_splat_size { + // The final argc is the size of the splat, minus one for the splat array itself + asm.add(splat_size, (passed_argc - 1).into()) + } else { + // Without a splat, passed_argc is static + Opnd::Imm(passed_argc.into()) + }; + vec![ - Opnd::Imm(passed_argc.into()), - asm.lea(Opnd::mem(64, sp, -(argc) * SIZEOF_VALUE_I32)), - Opnd::mem(64, sp, -(argc + 1) * SIZEOF_VALUE_I32), + passed_argc_opnd, + asm.lea(asm.ctx.sp_opnd(-argc)), + asm.stack_opnd(argc), ] } - else { + // Variadic method taking a Ruby array + else if cfunc_argc == -2 { + // Slurp up all the arguments into an array + let stack_args = asm.lea(asm.ctx.sp_opnd(-argc)); + let args_array = asm.ccall( + rb_ec_ary_new_from_values as _, + vec![EC, passed_argc.into(), stack_args] + ); + + // Example signature: + // VALUE neg2_method(VALUE self, VALUE argv) + vec![asm.stack_opnd(argc), args_array] + } else { panic!("unexpected cfunc_args: {}", cfunc_argc) }; @@ -4844,73 +7192,58 @@ fn gen_send_cfunc( // VALUE ret = (cfunc->func)(recv, argv[0], argv[1]); // cfunc comes from compile-time cme->def, which we assume to be stable. // Invalidation logic is in yjit_method_lookup_change() - asm.comment("call C function"); + asm_comment!(asm, "call C function"); let ret = asm.ccall(unsafe { get_mct_func(cfunc) }.cast(), args); + asm.stack_pop((argc + 1).try_into().unwrap()); // Pop arguments after ccall to use registers for passing them. // Record code position for TracePoint patching. See full_cfunc_return(). record_global_inval_patch(asm, CodegenGlobals::get_outline_full_cfunc_return_pos()); // Push the return value on the Ruby stack - let stack_ret = ctx.stack_push(Type::Unknown); + let stack_ret = asm.stack_push(Type::Unknown); asm.mov(stack_ret, ret); + // Log the name of the method we're calling to. We intentionally don't do this for inlined cfuncs. + // We also do this after the C call to minimize the impact of spill_temps() on asm.ccall(). 
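// A standalone approximation (toy code, not YJIT's actual stats plumbing) of
// the two-step scheme used below: resolve the method name to a stable counter
// index at compile time, then have the generated code bump only that slot at
// runtime via the ccall to incr_cfunc_counter.
fn toy_cfunc_idx(
    indices: &mut std::collections::HashMap<String, usize>,
    counters: &mut Vec<u64>,
    name: &str,
) -> usize {
    *indices.entry(name.to_string()).or_insert_with(|| {
        counters.push(0);      // new counter slot for a cfunc seen for the first time
        counters.len() - 1     // index the jitted code will increment
    })
}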
+ if get_option!(gen_stats) { + // Assemble the method name string + let mid = unsafe { rb_get_def_original_id((*cme).def) }; + let name_str = get_method_name(Some(unsafe { (*cme).owner }), mid); + + // Get an index for this cfunc name + let cfunc_idx = get_cfunc_idx(&name_str); + + // Increment the counter for this cfunc + asm.ccall(incr_cfunc_counter as *const u8, vec![cfunc_idx.into()]); + } + // Pop the stack frame (ec->cfp++) // Instead of recalculating, we can reuse the previous CFP, which is stored in a callee-saved // register - let ec_cfp_opnd = Opnd::mem(64, EC, RUBY_OFFSET_EC_CFP); + let ec_cfp_opnd = Opnd::mem(64, EC, RUBY_OFFSET_EC_CFP as i32); asm.store(ec_cfp_opnd, CFP); // cfunc calls may corrupt types - ctx.clear_local_types(); + asm.clear_local_types(); // Note: the return block of gen_send_iseq() has ctx->sp_offset == 1 // which allows for sharing the same successor. // Jump (fall through) to the call continuation block // We do this to end the current block after the call - jump_to_next_insn(jit, ctx, asm, ocb); - EndBlock + jump_to_next_insn(jit, asm) } -fn gen_return_branch( - asm: &mut Assembler, - target0: CodePtr, - _target1: Option<CodePtr>, - shape: BranchShape, -) { - match shape { - BranchShape::Next0 | BranchShape::Next1 => unreachable!(), - BranchShape::Default => { - asm.comment("update cfp->jit_return"); - asm.mov(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_JIT_RETURN), Opnd::const_ptr(target0.raw_ptr())); - } - } -} - -/// Pushes arguments from an array to the stack that are passed with a splat (i.e. *args) -/// It optimistically compiles to a static size that is the exact number of arguments -/// needed for the function. -fn push_splat_args(required_args: u32, ctx: &mut Context, asm: &mut Assembler, ocb: &mut OutlinedCb, side_exit: Target) { - - asm.comment("push_splat_args"); - - let array_opnd = ctx.stack_opnd(0); - let array_reg = asm.load(array_opnd); - - guard_object_is_heap( - asm, - array_reg, - counted_exit!(ocb, side_exit, send_splat_not_array), - ); - guard_object_is_array( - asm, - array_reg, - counted_exit!(ocb, side_exit, send_splat_not_array), - ); - - asm.comment("Get array length for embedded or heap"); +// Generate RARRAY_LEN. For array_opnd, use Opnd::Reg to reduce memory access, +// and use Opnd::Mem to save registers. +fn get_array_len(asm: &mut Assembler, array_opnd: Opnd) -> Opnd { + asm_comment!(asm, "get array length for embedded or heap"); // Pull out the embed flag to check if it's an embedded array. + let array_reg = match array_opnd { + Opnd::InsnOut { .. } => array_opnd, + _ => asm.load(array_opnd), + }; let flags_opnd = Opnd::mem(VALUE_BITS, array_reg, RUBY_OFFSET_RBASIC_FLAGS); // Get the length of the array @@ -4921,220 +7254,300 @@ fn push_splat_args(required_args: u32, ctx: &mut Context, asm: &mut Assembler, o let flags_opnd = Opnd::mem(VALUE_BITS, array_reg, RUBY_OFFSET_RBASIC_FLAGS); asm.test(flags_opnd, (RARRAY_EMBED_FLAG as u64).into()); - // Need to repeat this here to deal with register allocation - let array_opnd = ctx.stack_opnd(0); - let array_reg = asm.load(array_opnd); - + let array_reg = match array_opnd { + Opnd::InsnOut { .. 
} => array_opnd, + _ => asm.load(array_opnd), + }; let array_len_opnd = Opnd::mem( - (8 * size_of::<std::os::raw::c_long>()) as u8, + std::os::raw::c_long::BITS as u8, array_reg, RUBY_OFFSET_RARRAY_AS_HEAP_LEN, ); - let array_len_opnd = asm.csel_nz(emb_len_opnd, array_len_opnd); - - asm.comment("Side exit if length doesn't not equal remaining args"); - asm.cmp(array_len_opnd, required_args.into()); - asm.jne(counted_exit!(ocb, side_exit, send_splatarray_length_not_equal)); - asm.comment("Check last argument is not ruby2keyword hash"); + // Select the array length value + asm.csel_nz(emb_len_opnd, array_len_opnd) +} - // Need to repeat this here to deal with register allocation - let array_reg = asm.load(ctx.stack_opnd(0)); +// Generate RARRAY_CONST_PTR (part of RARRAY_AREF) +fn get_array_ptr(asm: &mut Assembler, array_reg: Opnd) -> Opnd { + asm_comment!(asm, "get array pointer for embedded or heap"); let flags_opnd = Opnd::mem(VALUE_BITS, array_reg, RUBY_OFFSET_RBASIC_FLAGS); asm.test(flags_opnd, (RARRAY_EMBED_FLAG as u64).into()); let heap_ptr_opnd = Opnd::mem( - (8 * size_of::<usize>()) as u8, + usize::BITS as u8, array_reg, RUBY_OFFSET_RARRAY_AS_HEAP_PTR, ); + // Load the address of the embedded array // (struct RArray *)(obj)->as.ary let ary_opnd = asm.lea(Opnd::mem(VALUE_BITS, array_reg, RUBY_OFFSET_RARRAY_AS_ARY)); - let ary_opnd = asm.csel_nz(ary_opnd, heap_ptr_opnd); + asm.csel_nz(ary_opnd, heap_ptr_opnd) +} - let last_array_value = asm.load(Opnd::mem(64, ary_opnd, (required_args as i32 - 1) * (SIZEOF_VALUE as i32))); +// Generate RSTRING_PTR +fn get_string_ptr(asm: &mut Assembler, string_reg: Opnd) -> Opnd { + asm_comment!(asm, "get string pointer for embedded or heap"); + + let flags_opnd = Opnd::mem(VALUE_BITS, string_reg, RUBY_OFFSET_RBASIC_FLAGS); + asm.test(flags_opnd, (RSTRING_NOEMBED as u64).into()); + let heap_ptr_opnd = asm.load(Opnd::mem( + usize::BITS as u8, + string_reg, + RUBY_OFFSET_RSTRING_AS_HEAP_PTR, + )); - guard_object_is_not_ruby2_keyword_hash( + // Load the address of the embedded array + // (struct RString *)(obj)->as.ary + let ary_opnd = asm.lea(Opnd::mem(VALUE_BITS, string_reg, RUBY_OFFSET_RSTRING_AS_ARY)); + asm.csel_nz(heap_ptr_opnd, ary_opnd) +} + +/// Pushes arguments from an array to the stack. Differs from push splat because +/// the array can have items left over. Array is assumed to be T_ARRAY without guards. +fn copy_splat_args_for_rest_callee(array: Opnd, num_args: u32, asm: &mut Assembler) { + asm_comment!(asm, "copy_splat_args_for_rest_callee"); + + // Unused operands cause the backend to panic + if num_args == 0 { + return; + } + + asm_comment!(asm, "Push arguments from array"); + + let array_reg = asm.load(array); + let ary_opnd = get_array_ptr(asm, array_reg); + for i in 0..num_args { + let top = asm.stack_push(Type::Unknown); + asm.mov(top, Opnd::mem(64, ary_opnd, i as i32 * SIZEOF_VALUE_I32)); + } +} + +/// Pushes arguments from an array to the stack that are passed with a splat (i.e. *args) +/// It optimistically compiles to a static size that is the exact number of arguments +/// needed for the function. 
+fn push_splat_args(required_args: u32, asm: &mut Assembler) { + asm_comment!(asm, "push_splat_args"); + + let array_opnd = asm.stack_opnd(0); + guard_object_is_array( asm, - last_array_value, - counted_exit!(ocb, side_exit, send_splatarray_last_ruby_2_keywords)); + array_opnd, + array_opnd.into(), + Counter::guard_send_splat_not_array, + ); + + let array_len_opnd = get_array_len(asm, array_opnd); - asm.comment("Push arguments from array"); - let array_opnd = ctx.stack_pop(1); + asm_comment!(asm, "Guard for expected splat length"); + asm.cmp(array_len_opnd, required_args.into()); + asm.jne(Target::side_exit(Counter::guard_send_splatarray_length_not_equal)); + // Check last element of array if present if required_args > 0 { - // Load the address of the embedded array - // (struct RArray *)(obj)->as.ary - let array_reg = asm.load(array_opnd); + asm_comment!(asm, "Check last argument is not ruby2keyword hash"); - // Conditionally load the address of the heap array - // (struct RArray *)(obj)->as.heap.ptr - let flags_opnd = Opnd::mem(VALUE_BITS, array_reg, RUBY_OFFSET_RBASIC_FLAGS); - asm.test(flags_opnd, Opnd::UImm(RARRAY_EMBED_FLAG as u64)); - let heap_ptr_opnd = Opnd::mem( - (8 * size_of::<usize>()) as u8, - array_reg, - RUBY_OFFSET_RARRAY_AS_HEAP_PTR, + // Need to repeat this here to deal with register allocation + let array_reg = asm.load(asm.stack_opnd(0)); + let ary_opnd = get_array_ptr(asm, array_reg); + let last_array_value = asm.load(Opnd::mem(64, ary_opnd, (required_args as i32 - 1) * (SIZEOF_VALUE as i32))); + guard_object_is_not_ruby2_keyword_hash( + asm, + last_array_value, + Counter::guard_send_splatarray_last_ruby2_keywords, ); - // Load the address of the embedded array - // (struct RArray *)(obj)->as.ary - let ary_opnd = asm.lea(Opnd::mem(VALUE_BITS, array_reg, RUBY_OFFSET_RARRAY_AS_ARY)); - let ary_opnd = asm.csel_nz(ary_opnd, heap_ptr_opnd); + } + + asm_comment!(asm, "Push arguments from array"); + let array_opnd = asm.stack_pop(1); + + if required_args > 0 { + let array_reg = asm.load(array_opnd); + let ary_opnd = get_array_ptr(asm, array_reg); for i in 0..required_args { - let top = ctx.stack_push(Type::Unknown); + let top = asm.stack_push(Type::Unknown); asm.mov(top, Opnd::mem(64, ary_opnd, i as i32 * SIZEOF_VALUE_I32)); } - asm.comment("end push_each"); + asm_comment!(asm, "end push_each"); } } fn gen_send_bmethod( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, ci: *const rb_callinfo, cme: *const rb_callable_method_entry_t, - block: Option<IseqPtr>, + block: Option<BlockHandler>, flags: u32, argc: i32, -) -> CodegenStatus { +) -> Option<CodegenStatus> { let procv = unsafe { rb_get_def_bmethod_proc((*cme).def) }; - let proc = unsafe { rb_yjit_get_proc_ptr(procv) }; + let proc = unsafe { rb_jit_get_proc_ptr(procv) }; let proc_block = unsafe { &(*proc).block }; if proc_block.type_ != block_type_iseq { - return CantCompile; + return None; } let capture = unsafe { proc_block.as_.captured.as_ref() }; let iseq = unsafe { *capture.code.iseq.as_ref() }; - // Optimize for single ractor mode and avoid runtime check for - // "defined with an un-shareable Proc in a different Ractor" - if !assume_single_ractor_mode(jit, ocb) { - gen_counter_incr!(asm, send_bmethod_ractor); - return CantCompile; + if !procv.shareable_p() { + let ractor_serial = unsafe { rb_yjit_cme_ractor_serial(cme) }; + asm_comment!(asm, "guard current ractor == {}", ractor_serial); + let current_ractor_serial = asm.load(Opnd::mem(64, EC, RUBY_OFFSET_EC_RACTOR_ID as i32)); + 
asm.cmp(current_ractor_serial, ractor_serial.into()); + asm.jne(Target::side_exit(Counter::send_bmethod_ractor)); } // Passing a block to a block needs logic different from passing // a block to a method and sometimes requires allocation. Bail for now. if block.is_some() { - gen_counter_incr!(asm, send_bmethod_block_arg); - return CantCompile; + gen_counter_incr(jit, asm, Counter::send_bmethod_block_arg); + return None; } let frame_type = VM_FRAME_MAGIC_BLOCK | VM_FRAME_FLAG_BMETHOD | VM_FRAME_FLAG_LAMBDA; - gen_send_iseq(jit, ctx, asm, ocb, iseq, ci, frame_type, Some(capture.ep), cme, block, flags, argc, None) + perf_call! { gen_send_iseq(jit, asm, iseq, ci, frame_type, Some(capture.ep), cme, block, flags, argc, None) } +} + +/// The kind of a value an ISEQ returns +enum IseqReturn { + Value(VALUE), + LocalVariable(u32), + Receiver, +} + +extern "C" { + fn rb_simple_iseq_p(iseq: IseqPtr) -> bool; + fn rb_iseq_only_kwparam_p(iseq: IseqPtr) -> bool; +} + +/// Return the ISEQ's return value if it consists of one simple instruction and leave. +fn iseq_get_return_value(iseq: IseqPtr, captured_opnd: Option<Opnd>, block: Option<BlockHandler>, ci_flags: u32) -> Option<IseqReturn> { + // Expect only two instructions and one possible operand + // NOTE: If an ISEQ has an optional keyword parameter with a default value that requires + // computation, the ISEQ will always have more than two instructions and won't be inlined. + let iseq_size = unsafe { get_iseq_encoded_size(iseq) }; + if !(2..=3).contains(&iseq_size) { + return None; + } + + // Get the first two instructions + let first_insn = iseq_opcode_at_idx(iseq, 0); + let second_insn = iseq_opcode_at_idx(iseq, insn_len(first_insn as usize)); + + // Extract the return value if known + if second_insn != YARVINSN_leave { + return None; + } + match first_insn { + YARVINSN_getlocal_WC_0 => { + // Accept only cases where only positional arguments are used by both the callee and the caller. + // Keyword arguments may be specified by the callee or the caller but not used. + // Reject block ISEQs to avoid autosplat and other block parameter complications. + if captured_opnd.is_some() + // Reject if block ISEQ is present + || block.is_some() + // Equivalent to `VM_CALL_ARGS_SIMPLE - VM_CALL_KWARG - has_block_iseq` + || ci_flags & ( + VM_CALL_ARGS_SPLAT + | VM_CALL_KW_SPLAT + | VM_CALL_ARGS_BLOCKARG + | VM_CALL_FORWARDING + ) != 0 + { + return None; + } + + let ep_offset = unsafe { *rb_iseq_pc_at_idx(iseq, 1) }.as_u32(); + let local_idx = ep_offset_to_local_idx(iseq, ep_offset); + + // Only inline getlocal on a parameter. DCE in the IESQ builder can + // make a two-instruction ISEQ that does not return a parameter. + if local_idx >= unsafe { get_iseq_body_param_size(iseq) } { + return None; + } + + if unsafe { rb_simple_iseq_p(iseq) } { + return Some(IseqReturn::LocalVariable(local_idx)); + } else if unsafe { rb_iseq_only_kwparam_p(iseq) } { + // Inline if only positional parameters are used + if let Ok(i) = i32::try_from(local_idx) { + if i < unsafe { rb_get_iseq_body_param_lead_num(iseq) } { + return Some(IseqReturn::LocalVariable(local_idx)); + } + } + } + + return None; + } + YARVINSN_putnil => Some(IseqReturn::Value(Qnil)), + YARVINSN_putobject => Some(IseqReturn::Value(unsafe { *rb_iseq_pc_at_idx(iseq, 1) })), + YARVINSN_putobject_INT2FIX_0_ => Some(IseqReturn::Value(VALUE::fixnum_from_usize(0))), + YARVINSN_putobject_INT2FIX_1_ => Some(IseqReturn::Value(VALUE::fixnum_from_usize(1))), + // We don't support invokeblock for now. 
Such ISEQs are likely not used by blocks anyway. + YARVINSN_putself if captured_opnd.is_none() => Some(IseqReturn::Receiver), + _ => None, + } } fn gen_send_iseq( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, iseq: *const rb_iseq_t, ci: *const rb_callinfo, frame_type: u32, prev_ep: Option<*const VALUE>, cme: *const rb_callable_method_entry_t, - block: Option<IseqPtr>, + block: Option<BlockHandler>, flags: u32, argc: i32, captured_opnd: Option<Opnd>, -) -> CodegenStatus { +) -> Option<CodegenStatus> { + // Argument count. We will change this as we gather values from + // sources to satisfy the callee's parameters. To help make sense + // of changes, note that: + // - Parameters syntactically on the left have lower addresses. + // For example, all the lead (required) and optional parameters + // have lower addresses than the rest parameter array. + // - The larger the index one passes to Assembler::stack_opnd(), + // the *lower* the address. let mut argc = argc; - // Create a side-exit to fall back to the interpreter - let side_exit = get_side_exit(jit, ocb, ctx); - - // When you have keyword arguments, there is an extra object that gets - // placed on the stack the represents a bitmap of the keywords that were not - // specified at the call site. We need to keep track of the fact that this - // value is present on the stack in order to properly set up the callee's - // stack pointer. - let doing_kw_call = unsafe { get_iseq_flags_has_kw(iseq) }; + // Iseqs with keyword parameters have a hidden, unnamed parameter local + // that the callee could use to know which keywords are unspecified + // (see the `checkkeyword` instruction and check `ruby --dump=insn -e 'def foo(k:itself)=k'`). + // We always need to set up this local if the call goes through. + let has_kwrest = unsafe { get_iseq_flags_has_kwrest(iseq) }; + let doing_kw_call = unsafe { get_iseq_flags_has_kw(iseq) } || has_kwrest; let supplying_kws = unsafe { vm_ci_flag(ci) & VM_CALL_KWARG } != 0; + let iseq_has_rest = unsafe { get_iseq_flags_has_rest(iseq) }; + let iseq_has_block_param = unsafe { get_iseq_flags_has_block(iseq) }; + let arg_setup_block = captured_opnd.is_some(); // arg_setup_type: arg_setup_block (invokeblock) + + // Is this iseq tagged as "forwardable"? Iseqs that take `...` as a + // parameter are tagged as forwardable (e.g. `def foo(...); end`) + let forwarding = unsafe { rb_get_iseq_flags_forwardable(iseq) }; + + // If a "forwardable" iseq has been called with a splat, then we _do not_ + // want to expand the splat to the stack. So we'll only consider this + // a splat call if the callee iseq is not forwardable. For example, + // we do not want to handle the following code: + // + // `def foo(...); end; foo(*blah)` + let splat_call = (flags & VM_CALL_ARGS_SPLAT != 0) && !forwarding; + let kw_splat = (flags & VM_CALL_KW_SPLAT != 0) && !forwarding; - if unsafe { vm_ci_flag(ci) } & VM_CALL_TAILCALL != 0 { - // We can't handle tailcalls - gen_counter_incr!(asm, send_iseq_tailcall); - return CantCompile; - } - - // No support for callees with these parameters yet as they require allocation - // or complex handling. 
- if unsafe { get_iseq_flags_has_rest(iseq) } { - gen_counter_incr!(asm, send_iseq_has_rest); - return CantCompile; - } - if unsafe { get_iseq_flags_has_post(iseq) } { - gen_counter_incr!(asm, send_iseq_has_post); - return CantCompile; - } - if unsafe { get_iseq_flags_has_kwrest(iseq) } { - gen_counter_incr!(asm, send_iseq_has_kwrest); - return CantCompile; - } - - // In order to handle backwards compatibility between ruby 3 and 2 - // ruby2_keywords was introduced. It is called only on methods - // with splat and changes they way they handle them. - // We are just going to not compile these. - // https://www.rubydoc.info/stdlib/core/Proc:ruby2_keywords - if unsafe { - get_iseq_flags_ruby2_keywords(jit.iseq) && flags & VM_CALL_ARGS_SPLAT != 0 - } { - gen_counter_incr!(asm, send_iseq_ruby2_keywords); - return CantCompile; - } - - // If we have keyword arguments being passed to a callee that only takes - // positionals, then we need to allocate a hash. For now we're going to - // call that too complex and bail. - if supplying_kws && !unsafe { get_iseq_flags_has_kw(iseq) } { - gen_counter_incr!(asm, send_iseq_has_no_kw); - return CantCompile; - } - - // If we have a method accepting no kwargs (**nil), exit if we have passed - // it any kwargs. - if supplying_kws && unsafe { get_iseq_flags_accepts_no_kwarg(iseq) } { - gen_counter_incr!(asm, send_iseq_accepts_no_kwarg); - return CantCompile; - } - - // For computing number of locals to set up for the callee - let mut num_params = unsafe { get_iseq_body_param_size(iseq) }; - - // Block parameter handling. This mirrors setup_parameters_complex(). - if unsafe { get_iseq_flags_has_block(iseq) } { - if unsafe { get_iseq_body_local_iseq(iseq) == iseq } { - num_params -= 1; - } else { - // In this case (param.flags.has_block && local_iseq != iseq), - // the block argument is setup as a local variable and requires - // materialization (allocation). Bail. - gen_counter_incr!(asm, send_iseq_materialized_block); - return CantCompile; - } - } - + // For computing offsets to callee locals + let num_params = unsafe { get_iseq_body_param_size(iseq) as i32 }; + let num_locals = unsafe { get_iseq_body_local_table_size(iseq) as i32 }; - if flags & VM_CALL_ARGS_SPLAT != 0 && flags & VM_CALL_ZSUPER != 0 { - // zsuper methods are super calls without any arguments. - // They are also marked as splat, but don't actually have an array - // they pull arguments from, instead we need to change to call - // a different method with the current stack. - gen_counter_incr!(asm, send_iseq_zsuper); - return CantCompile; - } - - let mut start_pc_offset = 0; + let mut start_pc_offset: u16 = 0; let required_num = unsafe { get_iseq_body_param_lead_num(iseq) }; // This struct represents the metadata about the caller-specified @@ -5146,392 +7559,623 @@ fn gen_send_iseq( unsafe { get_cikw_keyword_len(kw_arg) } }; - // Arity handling and optional parameter setup - let opts_filled = argc - required_num - kw_arg_num; + // Arity handling and optional parameter setup for positional arguments. + // Splats are handled later. + let mut opts_filled = argc - required_num - kw_arg_num - i32::from(kw_splat) - i32::from(splat_call); let opt_num = unsafe { get_iseq_body_param_opt_num(iseq) }; - let opts_missing: i32 = opt_num - opts_filled; - - - if opt_num > 0 && flags & VM_CALL_ARGS_SPLAT != 0 { - gen_counter_incr!(asm, send_iseq_splat_with_opt); - return CantCompile; + // With a rest parameter or a yield to a block, + // callers can pass more than required + optional. 
+ // So we cap ops_filled at opt_num. + if iseq_has_rest || arg_setup_block { + opts_filled = min(opts_filled, opt_num); } + let mut opts_missing: i32 = opt_num - opts_filled; - if doing_kw_call && flags & VM_CALL_ARGS_SPLAT != 0 { - gen_counter_incr!(asm, send_iseq_splat_with_kw); - return CantCompile; + let block_arg = flags & VM_CALL_ARGS_BLOCKARG != 0; + // Stack index of the splat array + let splat_pos = i32::from(block_arg) + i32::from(kw_splat) + kw_arg_num; + + exit_if_stack_too_large(iseq)?; + exit_if_tail_call(jit, asm, ci)?; + exit_if_has_post(jit, asm, iseq)?; + exit_if_kwsplat_non_nil(jit, asm, flags, Counter::send_iseq_kw_splat_non_nil)?; + exit_if_has_rest_and_captured(jit, asm, iseq_has_rest, captured_opnd)?; + exit_if_has_kwrest_and_captured(jit, asm, has_kwrest, captured_opnd)?; + exit_if_has_rest_and_supplying_kws(jit, asm, iseq_has_rest, supplying_kws)?; + exit_if_supplying_kw_and_has_no_kw(jit, asm, supplying_kws, doing_kw_call)?; + exit_if_supplying_kws_and_accept_no_kwargs(jit, asm, supplying_kws, iseq)?; + exit_if_doing_kw_and_splat(jit, asm, doing_kw_call, flags)?; + if !forwarding { + exit_if_wrong_number_arguments(jit, asm, arg_setup_block, opts_filled, flags, opt_num, iseq_has_rest)?; + } + exit_if_doing_kw_and_opts_missing(jit, asm, doing_kw_call, opts_missing)?; + exit_if_has_rest_and_optional_and_block(jit, asm, iseq_has_rest, opt_num, iseq, block_arg)?; + if forwarding && flags & VM_CALL_OPT_SEND != 0 { + gen_counter_incr(jit, asm, Counter::send_iseq_send_forwarding); + return None; + } + let block_arg_type = exit_if_unsupported_block_arg_type(jit, asm, block_arg)?; + + // Bail if we can't drop extra arguments for a yield by just popping them + if supplying_kws && arg_setup_block && argc > (kw_arg_num + required_num + opt_num) { + gen_counter_incr(jit, asm, Counter::send_iseq_complex_discard_extras); + return None; } - if opts_filled < 0 && flags & VM_CALL_ARGS_SPLAT == 0 { - // Too few arguments and no splat to make up for it - gen_counter_incr!(asm, send_iseq_arity_error); - return CantCompile; + // Block parameter handling. This mirrors setup_parameters_complex(). + if iseq_has_block_param { + if unsafe { get_iseq_body_local_iseq(iseq) == iseq } { + // Do nothing + } else { + // In this case (param.flags.has_block && local_iseq != iseq), + // the block argument is setup as a local variable and requires + // materialization (allocation). Bail. + gen_counter_incr(jit, asm, Counter::send_iseq_materialized_block); + return None; + } } - if opts_filled > opt_num { - // Too many arguments - gen_counter_incr!(asm, send_iseq_arity_error); - return CantCompile; + // Check that required keyword arguments are supplied and find any extras + // that should go into the keyword rest parameter (**kw_rest). + if doing_kw_call { + gen_iseq_kw_call_checks(jit, asm, iseq, kw_arg, has_kwrest, kw_arg_num)?; } - let block_arg = flags & VM_CALL_ARGS_BLOCKARG != 0; - let block_arg_type = if block_arg { - Some(ctx.get_opnd_type(StackOpnd(0))) + let splat_array_length = if splat_call { + let array = jit.peek_at_stack(&asm.ctx, splat_pos as isize); + let array_length = if array == Qnil { + 0 + } else if unsafe { !RB_TYPE_P(array, RUBY_T_ARRAY) } { + gen_counter_incr(jit, asm, Counter::send_iseq_splat_not_array); + return None; + } else { + unsafe { rb_jit_array_len(array) as u32} + }; + + // Arity check accounting for size of the splat. 
When callee has rest parameters, we insert
+        // runtime guards later in copy_splat_args_for_rest_callee()
+        if !iseq_has_rest {
+            let supplying = argc - 1 - i32::from(kw_splat) + array_length as i32;
+            if (required_num..=required_num + opt_num).contains(&supplying) == false {
+                gen_counter_incr(jit, asm, Counter::send_iseq_splat_arity_error);
+                return None;
+            }
+        }
+
+        if iseq_has_rest && opt_num > 0 {
+            // If we have rest and optional arguments,
+            // we are going to set the pc_offset for where
+            // to jump in the called method.
+            // If the number of args changes, that offset would need to
+            // change too, and we don't change it dynamically, so we side exit.
+            // On a normal splat without rest and optional args this is handled
+            // elsewhere depending on the case.
+            asm_comment!(asm, "Side exit if length doesn't equal compile time length");
+            let array_len_opnd = get_array_len(asm, asm.stack_opnd(splat_pos));
+            asm.cmp(array_len_opnd, array_length.into());
+            asm.jne(Target::side_exit(Counter::guard_send_splatarray_length_not_equal));
+        }
+
+        Some(array_length)
    } else {
        None
    };
-    match block_arg_type {
-        Some(Type::Nil | Type::BlockParamProxy) => {
-            // We'll handle this later
-        }
-        None => {
-            // Nothing to do
+    // Check if we need the arg0 splat handling of vm_callee_setup_block_arg()
+    // Also known as "autosplat" inside setup_parameters_complex().
+    // Autosplat checks argc == 1 after splat and kwsplat processing, so make
+    // sure to amend this if we start supporting kw_splat.
+    let block_arg0_splat = arg_setup_block
+        && (argc == 1 || (argc == 2 && splat_array_length == Some(0)))
+        && !supplying_kws && !doing_kw_call
+        && unsafe {
+            (get_iseq_flags_has_lead(iseq) || opt_num > 1)
+                && !get_iseq_flags_ambiguous_param0(iseq)
+        };
+    if block_arg0_splat {
+        // If block_arg0_splat, we still need side exits after splat, but
+        // the splat modifies the stack which breaks side exits. So bail out.
+        if splat_call {
+            gen_counter_incr(jit, asm, Counter::invokeblock_iseq_arg0_args_splat);
+            return None;
        }
-        _ => {
-            gen_counter_incr!(asm, send_block_arg);
-            return CantCompile;
+        // The block_arg0_splat implementation cannot deal with optional parameters.
+        // This is a setup_parameters_complex() situation and interacts with the
+        // starting position of the callee.
+        if opt_num > 1 {
+            gen_counter_incr(jit, asm, Counter::invokeblock_iseq_arg0_optional);
+            return None;
+        }
    }
-    // If we have unfilled optional arguments and keyword arguments then we
-    // would need to adjust the arguments location to account for that.
-    // For now we aren't handling this case.
-    if doing_kw_call && opts_missing > 0 {
-        gen_counter_incr!(asm, send_iseq_missing_optional_kw);
-        return CantCompile;
+    // Adjust `opts_filled` and `opts_missing` taking
+    // into account the size of the splat expansion.
+    if let Some(len) = splat_array_length {
+        assert_eq!(kw_arg_num, 0); // Due to exit_if_doing_kw_and_splat().
+        // Simplifies calculation below.
+        let num_args = argc - 1 - i32::from(kw_splat) + len as i32;
+
+        opts_filled = if num_args >= required_num {
+            min(num_args - required_num, opt_num)
+        } else {
+            0
+        };
+        opts_missing = opt_num - opts_filled;
    }
+    assert_eq!(opts_missing + opts_filled, opt_num);
+    assert!(opts_filled >= 0);
+
+    // ISeqs with optional parameters start at different
+    // locations depending on the number of optionals given.
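// A toy model (not YJIT code) of the entry-point selection below: for
// `def m(a, b = 1, c = 2)` the ISEQ records one entry point per count of
// supplied optionals (0, 1 or 2) in its opt_table, which has opt_num + 1
// entries. Because opts_filled is known at compile time, the JIT can start the
// callee past the default-value code that is already satisfied.
fn toy_start_pc(opt_table: &[u16], opts_filled: usize) -> u16 {
    opt_table[opts_filled] // index by how many optionals the caller supplied
}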
if opt_num > 0 { - num_params -= opts_missing as u32; + assert!(opts_filled >= 0); unsafe { let opt_table = get_iseq_body_param_opt_table(iseq); - start_pc_offset = (*opt_table.offset(opts_filled as isize)).as_u32(); + start_pc_offset = opt_table.offset(opts_filled as isize).read().try_into().unwrap(); } } - if doing_kw_call { - // Here we're calling a method with keyword arguments and specifying - // keyword arguments at this call site. - - // This struct represents the metadata about the callee-specified - // keyword parameters. - let keyword = unsafe { get_iseq_body_param_keyword(iseq) }; - let keyword_num: usize = unsafe { (*keyword).num }.try_into().unwrap(); - let keyword_required_num: usize = unsafe { (*keyword).required_num }.try_into().unwrap(); - - let mut required_kwargs_filled = 0; - - if keyword_num > 30 { - // We have so many keywords that (1 << num) encoded as a FIXNUM - // (which shifts it left one more) no longer fits inside a 32-bit - // immediate. - gen_counter_incr!(asm, send_iseq_too_many_kwargs); - return CantCompile; - } - - // Check that the kwargs being passed are valid - if supplying_kws { - // This is the list of keyword arguments that the callee specified - // in its initial declaration. - // SAFETY: see compile.c for sizing of this slice. - let callee_kwargs = unsafe { slice::from_raw_parts((*keyword).table, keyword_num) }; - - // Here we're going to build up a list of the IDs that correspond to - // the caller-specified keyword arguments. If they're not in the - // same order as the order specified in the callee declaration, then - // we're going to need to generate some code to swap values around - // on the stack. - let kw_arg_keyword_len: usize = - unsafe { get_cikw_keyword_len(kw_arg) }.try_into().unwrap(); - let mut caller_kwargs: Vec<ID> = vec![0; kw_arg_keyword_len]; - for kwarg_idx in 0..kw_arg_keyword_len { - let sym = unsafe { get_cikw_keywords_idx(kw_arg, kwarg_idx.try_into().unwrap()) }; - caller_kwargs[kwarg_idx] = unsafe { rb_sym2id(sym) }; - } + // Increment total ISEQ send count + gen_counter_incr(jit, asm, Counter::num_send_iseq); - // First, we're going to be sure that the names of every - // caller-specified keyword argument correspond to a name in the - // list of callee-specified keyword parameters. - for caller_kwarg in caller_kwargs { - let search_result = callee_kwargs - .iter() - .enumerate() // inject element index - .find(|(_, &kwarg)| kwarg == caller_kwarg); - - match search_result { - None => { - // If the keyword was never found, then we know we have a - // mismatch in the names of the keyword arguments, so we need to - // bail. - gen_counter_incr!(asm, send_iseq_kwargs_mismatch); - return CantCompile; - } - Some((callee_idx, _)) if callee_idx < keyword_required_num => { - // Keep a count to ensure all required kwargs are specified - required_kwargs_filled += 1; - } - _ => (), + // Shortcut for special `Primitive.attr! 
:leaf` builtins + let builtin_attrs = unsafe { rb_jit_iseq_builtin_attrs(iseq) }; + let builtin_func_raw = unsafe { rb_yjit_builtin_function(iseq) }; + let builtin_func = if builtin_func_raw.is_null() { None } else { Some(builtin_func_raw) }; + let opt_send_call = flags & VM_CALL_OPT_SEND != 0; // .send call is not currently supported for builtins + if let (None, Some(builtin_info), true, false, None | Some(0)) = + (block, builtin_func, builtin_attrs & BUILTIN_ATTR_LEAF != 0, opt_send_call, splat_array_length) { + let builtin_argc = unsafe { (*builtin_info).argc }; + if builtin_argc + 1 < (C_ARG_OPNDS.len() as i32) { + // We pop the block arg without using it because: + // - the builtin is leaf, so it promises to not `yield`. + // - no leaf builtins have block param at the time of writing, and + // adding one requires interpreter changes to support. + if block_arg_type.is_some() { + if iseq_has_block_param { + gen_counter_incr(jit, asm, Counter::send_iseq_leaf_builtin_block_arg_block_param); + return None; } + asm.stack_pop(1); } - } - assert!(required_kwargs_filled <= keyword_required_num); - if required_kwargs_filled != keyword_required_num { - gen_counter_incr!(asm, send_iseq_kwargs_mismatch); - return CantCompile; - } - } - - // Number of locals that are not parameters - let num_locals = unsafe { get_iseq_body_local_table_size(iseq) as i32 } - (num_params as i32); - // Check for interrupts - gen_check_ints(asm, side_exit); - - match block_arg_type { - Some(Type::Nil) => { - // We have a nil block arg, so let's pop it off the args - ctx.stack_pop(1); - } - Some(Type::BlockParamProxy) => { - // We don't need the actual stack value - ctx.stack_pop(1); - } - None => { - // Nothing to do - } - _ => { - assert!(false); - } - } - - let leaf_builtin_raw = unsafe { rb_leaf_builtin_function(iseq) }; - let leaf_builtin: Option<*const rb_builtin_function> = if leaf_builtin_raw.is_null() { - None - } else { - Some(leaf_builtin_raw) - }; - if let (None, Some(builtin_info)) = (block, leaf_builtin) { + // Pop empty kw_splat hash which passes nothing (exit_if_kwsplat_non_nil()) + if kw_splat { + asm.stack_pop(1); + } - // this is a .send call not currently supported for builtins - if flags & VM_CALL_OPT_SEND != 0 { - gen_counter_incr!(asm, send_send_builtin); - return CantCompile; - } + // Pop empty splat array which passes nothing + if let Some(0) = splat_array_length { + asm.stack_pop(1); + } - let builtin_argc = unsafe { (*builtin_info).argc }; - if builtin_argc + 1 < (C_ARG_OPNDS.len() as i32) { - asm.comment("inlined leaf builtin"); + asm_comment!(asm, "inlined leaf builtin"); + gen_counter_incr(jit, asm, Counter::num_send_iseq_leaf); - // Save the PC and SP because the callee may allocate - // e.g. Integer#abs on a bignum - jit_prepare_routine_call(jit, ctx, asm); + // The callee may allocate, e.g. Integer#abs on a Bignum. + // Save SP for GC, save PC for allocation tracing, and prepare + // for global invalidation after GC's VM lock contention. + jit_prepare_call_with_gc(jit, asm); // Call the builtin func (ec, recv, arg1, arg2, ...) 
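// Leaf builtins are plain C functions taking the execution context, then the
// receiver, then each argument as a VALUE, which is the order the args vector
// below is built in. A toy signature for a one-argument builtin, purely
// illustrative (not a real CRuby symbol):
#[allow(dead_code)]
unsafe extern "C" fn toy_leaf_builtin(_ec: *mut std::ffi::c_void, _recv: u64, arg: u64) -> u64 {
    // A real builtin would operate on VALUEs; this toy just echoes an argument.
    arg
}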
let mut args = vec![EC]; // Copy self and arguments for i in 0..=builtin_argc { - let stack_opnd = ctx.stack_opnd(builtin_argc - i); + let stack_opnd = asm.stack_opnd(builtin_argc - i); args.push(stack_opnd); } - ctx.stack_pop((builtin_argc + 1).try_into().unwrap()); let val = asm.ccall(unsafe { (*builtin_info).func_ptr as *const u8 }, args); + asm.stack_pop((builtin_argc + 1).try_into().unwrap()); // Keep them on stack during ccall for GC // Push the return value - let stack_ret = ctx.stack_push(Type::Unknown); + let stack_ret = asm.stack_push(Type::Unknown); asm.mov(stack_ret, val); // Note: assuming that the leaf builtin doesn't change local variables here. // Seems like a safe assumption. - return KeepCompiling; + // Let guard chains share the same successor + return jump_to_next_insn(jit, asm); + } + } + + // Inline simple ISEQs whose return value is known at compile time + if let (Some(value), None, false) = (iseq_get_return_value(iseq, captured_opnd, block, flags), block_arg_type, opt_send_call) { + asm_comment!(asm, "inlined simple ISEQ"); + gen_counter_incr(jit, asm, Counter::num_send_iseq_inline); + + match value { + IseqReturn::LocalVariable(local_idx) => { + // Put the local variable at the return slot + let stack_local = asm.stack_opnd(argc - 1 - local_idx as i32); + let stack_return = asm.stack_opnd(argc); + asm.mov(stack_return, stack_local); + + // Update the mapping for the return value + let mapping = asm.ctx.get_opnd_mapping(stack_local.into()); + asm.ctx.set_opnd_mapping(stack_return.into(), mapping); + + // Pop everything but the return value + asm.stack_pop(argc as usize); + } + IseqReturn::Value(value) => { + // Pop receiver and arguments + asm.stack_pop(argc as usize + if captured_opnd.is_some() { 0 } else { 1 }); + + // Push the return value + let stack_ret = asm.stack_push(Type::from(value)); + asm.mov(stack_ret, value.into()); + }, + IseqReturn::Receiver => { + // Just pop arguments and leave the receiver on stack + asm.stack_pop(argc as usize); + } } + + // Let guard chains share the same successor + return jump_to_next_insn(jit, asm); } // Stack overflow check // Note that vm_push_frame checks it against a decremented cfp, hence the multiply by 2. // #define CHECK_VM_STACK_OVERFLOW0(cfp, sp, margin) - asm.comment("stack overflow check"); + asm_comment!(asm, "stack overflow check"); + const _: () = assert!(RUBY_SIZEOF_CONTROL_FRAME % SIZEOF_VALUE == 0, "sizeof(rb_control_frame_t) is a multiple of sizeof(VALUE)"); let stack_max: i32 = unsafe { get_iseq_body_stack_max(iseq) }.try_into().unwrap(); - let locals_offs = - SIZEOF_VALUE_I32 * (num_locals + stack_max) + 2 * (RUBY_SIZEOF_CONTROL_FRAME as i32); - let stack_limit = asm.lea(ctx.sp_opnd(locals_offs as isize)); + let locals_offs = (num_locals + stack_max) + 2 * (RUBY_SIZEOF_CONTROL_FRAME / SIZEOF_VALUE) as i32; + let stack_limit = asm.lea(asm.ctx.sp_opnd(locals_offs)); asm.cmp(CFP, stack_limit); - asm.jbe(counted_exit!(ocb, side_exit, send_se_cf_overflow)); - - // push_splat_args does stack manipulation so we can no longer side exit - if flags & VM_CALL_ARGS_SPLAT != 0 { - let required_args = num_params - (argc as u32 - 1); - // We are going to assume that the splat fills - // all the remaining arguments. In the generated code - // we test if this is true and if not side exit. 
- argc = num_params as i32; - push_splat_args(required_args, ctx, asm, ocb, side_exit) - } + asm.jbe(Target::side_exit(Counter::guard_send_se_cf_overflow)); + + if iseq_has_rest && splat_call { + // Insert length guard for a call to copy_splat_args_for_rest_callee() + // that will come later. We will have made changes to + // the stack by spilling or handling __send__ shifting + // by the time we get to that code, so we need the + // guard here where we can still side exit. + let non_rest_arg_count = argc - i32::from(kw_splat) - 1; + if non_rest_arg_count < required_num + opt_num { + let take_count: u32 = (required_num - non_rest_arg_count + opts_filled) + .try_into().unwrap(); + + if take_count > 0 { + asm_comment!(asm, "guard splat_array_length >= {take_count}"); + + let splat_array = asm.stack_opnd(splat_pos); + let array_len_opnd = get_array_len(asm, splat_array); + asm.cmp(array_len_opnd, take_count.into()); + asm.jl(Target::side_exit(Counter::guard_send_iseq_has_rest_and_splat_too_few)); + } + } - // This is a .send call and we need to adjust the stack - if flags & VM_CALL_OPT_SEND != 0 { - handle_opt_send_shift_stack(asm, argc, ctx); + // All splats need to guard for ruby2_keywords hash. Check with a function call when + // splatting into a rest param since the index for the last item in the array is dynamic. + asm_comment!(asm, "guard no ruby2_keywords hash in splat"); + let bad_splat = asm.ccall(rb_yjit_ruby2_keywords_splat_p as _, vec![asm.stack_opnd(splat_pos)]); + asm.cmp(bad_splat, 0.into()); + asm.jnz(Target::side_exit(Counter::guard_send_splatarray_last_ruby2_keywords)); } - if doing_kw_call { - // Here we're calling a method with keyword arguments and specifying - // keyword arguments at this call site. + match block_arg_type { + Some(BlockArg::Nil) => { + // We have a nil block arg, so let's pop it off the args + asm.stack_pop(1); + } + Some(BlockArg::BlockParamProxy) => { + // We don't need the actual stack value + asm.stack_pop(1); + } + Some(BlockArg::TProc) => { + // Place the proc as the block handler. We do this early because + // the block arg being at the top of the stack gets in the way of + // rest param handling later. Also, since there are C calls that + // come later, we can't hold this value in a register and place it + // near the end when we push a new control frame. + asm_comment!(asm, "guard block arg is a proc"); + // Simple predicate, no need for jit_prepare_non_leaf_call(). 
+ let is_proc = asm.ccall(rb_obj_is_proc as _, vec![asm.stack_opnd(0)]); + asm.cmp(is_proc, Qfalse.into()); + jit_chain_guard( + JCC_JE, + jit, + asm, + SEND_MAX_DEPTH, + Counter::guard_send_block_arg_type, + ); - // Number of positional arguments the callee expects before the first - // keyword argument - let args_before_kw = required_num + opt_num; + // If this is a forwardable iseq, adjust the stack size accordingly + let callee_ep = if forwarding { + -1 + num_locals + VM_ENV_DATA_SIZE as i32 + } else { + -argc + num_locals + VM_ENV_DATA_SIZE as i32 - 1 + }; + let callee_specval = callee_ep + VM_ENV_DATA_INDEX_SPECVAL; + if callee_specval < 0 { + // Can't write to sp[-n] since that's where the arguments are + gen_counter_incr(jit, asm, Counter::send_iseq_clobbering_block_arg); + return None; + } + if iseq_has_rest || has_kwrest { + // The proc would be stored above the current stack top, where GC can't see it + gen_counter_incr(jit, asm, Counter::send_iseq_block_arg_gc_unsafe); + return None; + } + let proc = asm.stack_pop(1); // Pop first, as argc doesn't account for the block arg + let callee_specval = asm.ctx.sp_opnd(callee_specval); + asm.store(callee_specval, proc); + } + None => { + // Nothing to do + } + } - // This struct represents the metadata about the caller-specified - // keyword arguments. - let ci_kwarg = unsafe { vm_ci_kwarg(ci) }; - let caller_keyword_len: usize = if ci_kwarg.is_null() { - 0 - } else { - unsafe { get_cikw_keyword_len(ci_kwarg) } - .try_into() - .unwrap() - }; + if kw_splat { + // Only `**nil` is supported right now. Checked in exit_if_kwsplat_non_nil() + assert_eq!(Type::Nil, asm.ctx.get_opnd_type(StackOpnd(0))); + asm.stack_pop(1); + argc -= 1; + } - // This struct represents the metadata about the callee-specified - // keyword parameters. - let keyword = unsafe { get_iseq_body_param_keyword(iseq) }; + // push_splat_args does stack manipulation so we can no longer side exit + if let Some(array_length) = splat_array_length { + if !iseq_has_rest { + // Speculate that future splats will be done with + // an array that has the same length. We will insert guards. + argc = argc - 1 + array_length as i32; + if argc + asm.ctx.get_stack_size() as i32 > MAX_SPLAT_LENGTH { + gen_counter_incr(jit, asm, Counter::send_splat_too_long); + return None; + } + push_splat_args(array_length, asm); + } + } - asm.comment("keyword args"); + // This is a .send call and we need to adjust the stack + // TODO: This can be more efficient if we do it before + // extracting from the splat array above. + if flags & VM_CALL_OPT_SEND != 0 { + handle_opt_send_shift_stack(asm, argc); + } + + if iseq_has_rest { + // We are going to allocate so setting pc and sp. + jit_save_pc(jit, asm); + gen_save_sp(asm); + + let rest_param_array = if splat_call { + let non_rest_arg_count = argc - 1; + // We start by dupping the array because someone else might have + // a reference to it. This also normalizes to an ::Array instance. + let array = asm.stack_opnd(0); + let array = asm.ccall( + rb_ary_dup as *const u8, + vec![array], + ); + asm.stack_pop(1); // Pop array after ccall to use a register for passing it. + + // This is the end stack state of all `non_rest_arg_count` situations below + argc = required_num + opts_filled; + + if non_rest_arg_count > required_num + opt_num { + // If we have more arguments than required, we need to prepend + // the items from the stack onto the array. 
+ let diff: u32 = (non_rest_arg_count - (required_num + opt_num)) + .try_into().unwrap(); + + // diff is >0 so no need to worry about null pointer + asm_comment!(asm, "load pointer to array elements"); + let values_opnd = asm.ctx.sp_opnd(-(diff as i32)); + let values_ptr = asm.lea(values_opnd); + + asm_comment!(asm, "prepend stack values to rest array"); + let array = asm.ccall( + rb_ary_unshift_m as *const u8, + vec![Opnd::UImm(diff as u64), values_ptr, array], + ); + asm.stack_pop(diff as usize); - // This is the list of keyword arguments that the callee specified - // in its initial declaration. - let callee_kwargs = unsafe { (*keyword).table }; - let total_kwargs: usize = unsafe { (*keyword).num }.try_into().unwrap(); + array + } else if non_rest_arg_count < required_num + opt_num { + // If we have fewer arguments than required, we need to take some + // from the array and move them to the stack. + asm_comment!(asm, "take items from splat array"); - // Here we're going to build up a list of the IDs that correspond to - // the caller-specified keyword arguments. If they're not in the - // same order as the order specified in the callee declaration, then - // we're going to need to generate some code to swap values around - // on the stack. - let mut caller_kwargs: Vec<ID> = vec![0; total_kwargs]; + let take_count: u32 = (required_num - non_rest_arg_count + opts_filled) + .try_into().unwrap(); - for kwarg_idx in 0..caller_keyword_len { - let sym = unsafe { get_cikw_keywords_idx(ci_kwarg, kwarg_idx.try_into().unwrap()) }; - caller_kwargs[kwarg_idx] = unsafe { rb_sym2id(sym) }; - } - let mut kwarg_idx = caller_keyword_len; + // Copy required arguments to the stack without modifying the array + copy_splat_args_for_rest_callee(array, take_count, asm); - let mut unspecified_bits = 0; + // We will now slice the array to give us a new array of the correct size + let sliced = asm.ccall(rb_yjit_rb_ary_subseq_length as *const u8, vec![array, Opnd::UImm(take_count.into())]); - let keyword_required_num: usize = unsafe { (*keyword).required_num }.try_into().unwrap(); - for callee_idx in keyword_required_num..total_kwargs { - let mut already_passed = false; - let callee_kwarg = unsafe { *(callee_kwargs.offset(callee_idx.try_into().unwrap())) }; + sliced + } else { + // The arguments are equal so we can just push to the stack + asm_comment!(asm, "same length for splat array and rest param"); + assert!(non_rest_arg_count == required_num + opt_num); - for caller_idx in 0..caller_keyword_len { - if caller_kwargs[caller_idx] == callee_kwarg { - already_passed = true; - break; - } + array } + } else { + asm_comment!(asm, "rest parameter without splat"); + + assert!(argc >= required_num); + let n = (argc - required_num - opts_filled) as u32; + argc = required_num + opts_filled; + // If n is 0, then elts is never going to be read, so we can just pass null + let values_ptr = if n == 0 { + Opnd::UImm(0) + } else { + asm_comment!(asm, "load pointer to array elements"); + let values_opnd = asm.ctx.sp_opnd(-(n as i32)); + asm.lea(values_opnd) + }; - if !already_passed { - // Reserve space on the stack for each default value we'll be - // filling in (which is done in the next loop). Also increments - // argc so that the callee's SP is recorded correctly. - argc += 1; - let default_arg = ctx.stack_push(Type::Unknown); - - // callee_idx - keyword->required_num is used in a couple of places below. 
- let req_num: isize = unsafe { (*keyword).required_num }.try_into().unwrap(); - let callee_idx_isize: isize = callee_idx.try_into().unwrap(); - let extra_args = callee_idx_isize - req_num; - - //VALUE default_value = keyword->default_values[callee_idx - keyword->required_num]; - let mut default_value = unsafe { *((*keyword).default_values.offset(extra_args)) }; - - if default_value == Qundef { - // Qundef means that this value is not constant and must be - // recalculated at runtime, so we record it in unspecified_bits - // (Qnil is then used as a placeholder instead of Qundef). - unspecified_bits |= 0x01 << extra_args; - default_value = Qnil; - } + let new_ary = asm.ccall( + rb_ec_ary_new_from_values as *const u8, + vec![ + EC, + Opnd::UImm(n.into()), + values_ptr + ] + ); + asm.stack_pop(n.as_usize()); - asm.mov(default_arg, default_value.into()); + new_ary + }; - caller_kwargs[kwarg_idx] = callee_kwarg; - kwarg_idx += 1; - } + // Find where to put the rest parameter array + let rest_param = if opts_missing == 0 { + // All optionals are filled, the rest param goes at the top of the stack + argc += 1; + asm.stack_push(Type::TArray) + } else { + // The top of the stack will be a missing optional, but the rest + // parameter needs to be placed after all the missing optionals. + // Place it using a stack operand with a negative stack index. + // (Higher magnitude negative stack index have higher address.) + assert!(opts_missing > 0); + // The argument deepest in the stack will be the 0th local in the callee. + let callee_locals_base = argc - 1; + let rest_param_stack_idx = callee_locals_base - required_num - opt_num; + assert!(rest_param_stack_idx < 0); + asm.stack_opnd(rest_param_stack_idx) + }; + // Store rest param to memory to avoid register shuffle as + // we won't be reading it for the remainder of the block. + asm.ctx.dealloc_reg(rest_param.reg_opnd()); + asm.store(rest_param, rest_param_array); + } + + // Pop surplus positional arguments when yielding + if arg_setup_block { + let extras = argc - required_num - opt_num - kw_arg_num; + if extras > 0 { + // Checked earlier. If there are keyword args, then + // the positional arguments are not at the stack top. + assert_eq!(0, kw_arg_num); + + asm.stack_pop(extras as usize); + argc = required_num + opt_num + kw_arg_num; } + } - assert!(kwarg_idx == total_kwargs); + // Keyword argument passing + if doing_kw_call { + argc = gen_iseq_kw_call(jit, asm, kw_arg, iseq, argc, has_kwrest); + } + + // Same as vm_callee_setup_block_arg_arg0_check and vm_callee_setup_block_arg_arg0_splat + // on vm_callee_setup_block_arg for arg_setup_block. This is done after CALLER_SETUP_ARG + // and CALLER_REMOVE_EMPTY_KW_SPLAT, so this implementation is put here. This may need + // side exits, so you still need to allow side exits here if block_arg0_splat is true. + // Note that you can't have side exits after this arg0 splat. 
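Editorial aside: the rest-parameter handling in this hunk has three shapes, chosen by how many non-rest arguments were supplied next to the splat: prepend the surplus onto a dup of the splat array, take the shortfall from the front of the splat, or use the splat as-is. The sketch below is a pure-Rust toy model of those three cases on plain Vec<i64> values, with no optional parameters (so opts_filled is implicitly zero); it is an illustration of the shape, not the register-level codegen.

/// Toy model: split positional args plus a splatted array into
/// (fixed args for the callee, rest array). `required` plays the role of
/// required_num with opt_num == 0.
fn split_for_rest(mut positional: Vec<i64>, mut splat: Vec<i64>, required: usize) -> (Vec<i64>, Vec<i64>) {
    if positional.len() > required {
        // Surplus fixed arguments are prepended onto the (dup'd) splat array,
        // mirroring the rb_ary_unshift_m call in the patch.
        let extra = positional.split_off(required);
        let mut rest = extra;
        rest.extend(splat);
        (positional, rest)
    } else if positional.len() < required {
        // Not enough fixed arguments: take the shortfall from the front of the
        // splat (copy_splat_args_for_rest_callee plus the array slice in the patch).
        let take = required - positional.len();
        assert!(splat.len() >= take, "guarded earlier by the splat length check");
        let rest = splat.split_off(take);
        positional.extend(splat);
        (positional, rest)
    } else {
        // Exactly enough: the splat array itself becomes the rest parameter.
        (positional, splat)
    }
}

fn main() {
    assert_eq!(split_for_rest(vec![1, 2, 3], vec![4, 5], 2), (vec![1, 2], vec![3, 4, 5]));
    assert_eq!(split_for_rest(vec![1], vec![2, 3, 4], 2), (vec![1, 2], vec![3, 4]));
    assert_eq!(split_for_rest(vec![1, 2], vec![3], 2), (vec![1, 2], vec![3]));
}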
+ if block_arg0_splat { + let arg0_opnd = asm.stack_opnd(0); + + // Only handle the case that you don't need to_ary conversion + let not_array_counter = Counter::invokeblock_iseq_arg0_not_array; + guard_object_is_array(asm, arg0_opnd, arg0_opnd.into(), not_array_counter); + + // Only handle the same that the array length == ISEQ's lead_num (most common) + let arg0_len_opnd = get_array_len(asm, arg0_opnd); + let lead_num = unsafe { rb_get_iseq_body_param_lead_num(iseq) }; + asm.cmp(arg0_len_opnd, lead_num.into()); + asm.jne(Target::side_exit(Counter::invokeblock_iseq_arg0_wrong_len)); + + let arg0_reg = asm.load(arg0_opnd); + let array_opnd = get_array_ptr(asm, arg0_reg); + asm_comment!(asm, "push splat arg0 onto the stack"); + asm.stack_pop(argc.try_into().unwrap()); + for i in 0..lead_num { + let stack_opnd = asm.stack_push(Type::Unknown); + asm.mov(stack_opnd, Opnd::mem(64, array_opnd, SIZEOF_VALUE_I32 * i)); + } + argc = lead_num; + } - // Next, we're going to loop through every keyword that was - // specified by the caller and make sure that it's in the correct - // place. If it's not we're going to swap it around with another one. - for kwarg_idx in 0..total_kwargs { - let kwarg_idx_isize: isize = kwarg_idx.try_into().unwrap(); - let callee_kwarg = unsafe { *(callee_kwargs.offset(kwarg_idx_isize)) }; + fn nil_fill(comment: &'static str, fill_range: std::ops::Range<i32>, asm: &mut Assembler) { + if fill_range.is_empty() { + return; + } - // If the argument is already in the right order, then we don't - // need to generate any code since the expected value is already - // in the right place on the stack. - if callee_kwarg == caller_kwargs[kwarg_idx] { - continue; - } + asm_comment!(asm, "{}", comment); + for i in fill_range { + let value_slot = asm.ctx.sp_opnd(i); + asm.store(value_slot, Qnil.into()); + } + } - // In this case the argument is not in the right place, so we - // need to find its position where it _should_ be and swap with - // that location. - for swap_idx in (kwarg_idx + 1)..total_kwargs { - if callee_kwarg == caller_kwargs[swap_idx] { - // First we're going to generate the code that is going - // to perform the actual swapping at runtime. - let swap_idx_i32: i32 = swap_idx.try_into().unwrap(); - let kwarg_idx_i32: i32 = kwarg_idx.try_into().unwrap(); - let offset0: u16 = (argc - 1 - swap_idx_i32 - args_before_kw) - .try_into() - .unwrap(); - let offset1: u16 = (argc - 1 - kwarg_idx_i32 - args_before_kw) - .try_into() - .unwrap(); - stack_swap(jit, ctx, asm, offset0, offset1); - - // Next we're going to do some bookkeeping on our end so - // that we know the order that the arguments are - // actually in now. - caller_kwargs.swap(kwarg_idx, swap_idx); + if !forwarding { + // Nil-initialize missing optional parameters + nil_fill( + "nil-initialize missing optionals", + { + let begin = -argc + required_num + opts_filled; + let end = -argc + required_num + opt_num; - break; - } - } + begin..end + }, + asm + ); + // Nil-initialize the block parameter. 
It's the last parameter local + if iseq_has_block_param { + let block_param = asm.ctx.sp_opnd(-argc + num_params - 1); + asm.store(block_param, Qnil.into()); } + // Nil-initialize non-parameter locals + nil_fill( + "nil-initialize locals", + { + let begin = -argc + num_params; + let end = -argc + num_locals; + + begin..end + }, + asm + ); + } - // Keyword arguments cause a special extra local variable to be - // pushed onto the stack that represents the parameters that weren't - // explicitly given a value and have a non-constant default. - let unspec_opnd = VALUE::fixnum_from_usize(unspecified_bits).as_u64(); - asm.mov(ctx.stack_opnd(-1), unspec_opnd.into()); + if forwarding { + assert_eq!(1, num_params); + // Write the CI in to the stack and ensure that it actually gets + // flushed to memory + asm_comment!(asm, "put call info for forwarding"); + let ci_opnd = asm.stack_opnd(-1); + asm.ctx.dealloc_reg(ci_opnd.reg_opnd()); + asm.mov(ci_opnd, VALUE(ci as usize).into()); + + // Nil-initialize other locals which are above the CI + nil_fill("nil-initialize locals", 1..num_locals, asm); } // Points to the receiver operand on the stack unless a captured environment is used let recv = match captured_opnd { Some(captured_opnd) => asm.load(Opnd::mem(64, captured_opnd, 0)), // captured->self - _ => ctx.stack_opnd(argc), + _ => asm.stack_opnd(argc), }; let captured_self = captured_opnd.is_some(); - let sp_offset = (argc as isize) + if captured_self { 0 } else { 1 }; + let sp_offset = argc + if captured_self { 0 } else { 1 }; // Store the updated SP on the current frame (pop arguments and receiver) - asm.comment("store caller sp"); - let caller_sp = asm.lea(ctx.sp_opnd((SIZEOF_VALUE as isize) * -sp_offset)); + asm_comment!(asm, "store caller sp"); + let caller_sp = asm.lea(asm.ctx.sp_opnd(-sp_offset)); asm.store(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP), caller_sp); // Store the next PC in the current frame jit_save_pc(jit, asm); // Adjust the callee's stack pointer - let offs = - (SIZEOF_VALUE as isize) * (3 + (num_locals as isize) + if doing_kw_call { 1 } else { 0 }); - let callee_sp = asm.lea(ctx.sp_opnd(offs)); + let callee_sp = if forwarding { + let offs = num_locals + VM_ENV_DATA_SIZE as i32; + asm.lea(asm.ctx.sp_opnd(offs)) + } else { + let offs = -argc + num_locals + VM_ENV_DATA_SIZE as i32; + asm.lea(asm.ctx.sp_opnd(offs)) + }; let specval = if let Some(prev_ep) = prev_ep { // We've already side-exited if the callee expects a block, so we @@ -5540,16 +8184,16 @@ fn gen_send_iseq( } else if let Some(captured_opnd) = captured_opnd { let ep_opnd = asm.load(Opnd::mem(64, captured_opnd, SIZEOF_VALUE_I32)); // captured->ep SpecVal::PrevEPOpnd(ep_opnd) - } else if block_arg_type == Some(Type::BlockParamProxy) { - SpecVal::BlockParamProxy - } else if let Some(block_val) = block { - SpecVal::BlockISeq(block_val) + } else if let Some(BlockArg::TProc) = block_arg_type { + SpecVal::BlockHandler(Some(BlockHandler::AlreadySet)) + } else if let Some(BlockArg::BlockParamProxy) = block_arg_type { + SpecVal::BlockHandler(Some(BlockHandler::BlockParamProxy)) } else { - SpecVal::None + SpecVal::BlockHandler(block) }; // Setup the new frame - gen_push_frame(jit, ctx, asm, true, ControlFrame { + perf_call!("gen_send_iseq: ", gen_push_frame(jit, asm, ControlFrame { frame_type, specval, cme, @@ -5557,93 +8201,684 @@ fn gen_send_iseq( sp: callee_sp, iseq: Some(iseq), pc: None, // We are calling into jitted code, which will set the PC as necessary - local_size: num_locals - }); + })); // No need to set cfp->pc since the 
callee sets it whenever calling into routines // that could look at it through jit_save_pc(). // mov(cb, REG0, const_ptr_opnd(start_pc)); // mov(cb, member_opnd(REG_CFP, rb_control_frame_t, pc), REG0); - // Stub so we can return to JITted code - let return_block = BlockId { - iseq: jit.iseq, - idx: jit_next_insn_idx(jit), - }; + // Create a blockid for the callee + let callee_blockid = BlockId { iseq, idx: start_pc_offset }; // Create a context for the callee let mut callee_ctx = Context::default(); + // If the callee has :inline_block annotation and the callsite has a block ISEQ, + // duplicate a callee block for each block ISEQ to make its `yield` monomorphic. + if let (Some(BlockHandler::BlockISeq(iseq)), true) = (block, builtin_attrs & BUILTIN_ATTR_INLINE_BLOCK != 0) { + callee_ctx.set_inline_block(iseq); + } + // Set the argument types in the callee's context for arg_idx in 0..argc { - let stack_offs: u16 = (argc - arg_idx - 1).try_into().unwrap(); - let arg_type = ctx.get_opnd_type(StackOpnd(stack_offs)); + let stack_offs: u8 = (argc - arg_idx - 1).try_into().unwrap(); + let arg_type = asm.ctx.get_opnd_type(StackOpnd(stack_offs)); callee_ctx.set_local_type(arg_idx.try_into().unwrap(), arg_type); } + // If we're in a forwarding callee, there will be one unknown type + // written in to the local table (the caller's CI object) + if forwarding { + callee_ctx.set_local_type(0, Type::Unknown) + } + + // Set the receiver type in the callee's context let recv_type = if captured_self { Type::Unknown // we don't track the type information of captured->self for now } else { - ctx.get_opnd_type(StackOpnd(argc.try_into().unwrap())) + asm.ctx.get_opnd_type(StackOpnd(argc.try_into().unwrap())) }; callee_ctx.upgrade_opnd_type(SelfOpnd, recv_type); + // Spill or preserve argument registers + if forwarding { + // When forwarding, the callee's local table has only a callinfo, + // so we can't map the actual arguments to the callee's locals. + asm.spill_regs(); + } else { + // Discover stack temp registers that can be used as the callee's locals + let mapped_temps = asm.map_temp_regs_to_args(&mut callee_ctx, argc); + + // Spill stack temps and locals that are not used by the callee. + // This must be done before changing the SP register. + asm.spill_regs_except(&mapped_temps); + + // If the callee block has been compiled before, spill/move registers to reuse the existing block + // for minimizing the number of blocks we need to compile. + if let Some(existing_reg_mapping) = find_most_compatible_reg_mapping(callee_blockid, &callee_ctx) { + asm_comment!(asm, "reuse maps: {:?} -> {:?}", callee_ctx.get_reg_mapping(), existing_reg_mapping); + + // Spill the registers that are not used in the existing block. + // When the same ISEQ is compiled as an entry block, it starts with no registers allocated. + for ®_opnd in callee_ctx.get_reg_mapping().get_reg_opnds().iter() { + if existing_reg_mapping.get_reg(reg_opnd).is_none() { + match reg_opnd { + RegOpnd::Local(local_idx) => { + let spilled_temp = asm.stack_opnd(argc - local_idx as i32 - 1); + asm.spill_reg(spilled_temp); + callee_ctx.dealloc_reg(reg_opnd); + } + RegOpnd::Stack(_) => unreachable!("callee {:?} should have been spilled", reg_opnd), + } + } + } + assert!(callee_ctx.get_reg_mapping().get_reg_opnds().len() <= existing_reg_mapping.get_reg_opnds().len()); + + // Load the registers that are spilled in this block but used in the existing block. + // When there are multiple callsites, some registers spilled in this block may be used at other callsites. 
+ for ®_opnd in existing_reg_mapping.get_reg_opnds().iter() { + if callee_ctx.get_reg_mapping().get_reg(reg_opnd).is_none() { + match reg_opnd { + RegOpnd::Local(local_idx) => { + callee_ctx.alloc_reg(reg_opnd); + let loaded_reg = TEMP_REGS[callee_ctx.get_reg_mapping().get_reg(reg_opnd).unwrap()]; + let loaded_temp = asm.stack_opnd(argc - local_idx as i32 - 1); + asm.load_into(Opnd::Reg(loaded_reg), loaded_temp); + } + RegOpnd::Stack(_) => unreachable!("find_most_compatible_reg_mapping should not leave {:?}", reg_opnd), + } + } + } + assert_eq!(callee_ctx.get_reg_mapping().get_reg_opnds().len(), existing_reg_mapping.get_reg_opnds().len()); + + // Shuffle registers to make the register mappings compatible + let mut moves = vec![]; + for ®_opnd in callee_ctx.get_reg_mapping().get_reg_opnds().iter() { + let old_reg = TEMP_REGS[callee_ctx.get_reg_mapping().get_reg(reg_opnd).unwrap()]; + let new_reg = TEMP_REGS[existing_reg_mapping.get_reg(reg_opnd).unwrap()]; + moves.push((new_reg, Opnd::Reg(old_reg))); + } + for (reg, opnd) in Assembler::reorder_reg_moves(&moves) { + asm.load_into(Opnd::Reg(reg), opnd); + } + callee_ctx.set_reg_mapping(existing_reg_mapping); + } + } + + // Update SP register for the callee. This must be done after referencing frame.recv, + // which may be SP-relative. + asm.mov(SP, callee_sp); + + // Log the name of the method we're calling to. We intentionally don't do this for inlined ISEQs. + // We also do this after spill_regs() to avoid doubly spilling the same thing on asm.ccall(). + if get_option!(gen_stats) { + // Protect caller-saved registers in case they're used for arguments + let mapping = asm.cpush_all(); + + // Assemble the ISEQ name string + let name_str = get_iseq_name(iseq); + + // Get an index for this ISEQ name + let iseq_idx = get_iseq_idx(&name_str); + + // Increment the counter for this cfunc + asm.ccall(incr_iseq_counter as *const u8, vec![iseq_idx.into()]); + asm.cpop_all(mapping); + } + // The callee might change locals through Kernel#binding and other means. - ctx.clear_local_types(); + asm.clear_local_types(); - // Pop arguments and receiver in return context, push the return value - // After the return, sp_offset will be 1. The codegen for leave writes - // the return value in case of JIT-to-JIT return. 
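Editorial aside: making the callee's register mapping line up with an already-compiled block boils down to emitting a set of parallel register-to-register moves in an order that never clobbers a source before it is read. The sketch below is one self-contained way to order such moves, assuming a free scratch register for breaking cycles; the actual Assembler::reorder_reg_moves may use a different strategy.

/// Order parallel moves (dst, src) so no source is overwritten before it is read.
/// Cycles are broken by parking one value in `scratch`.
fn order_moves(mut pending: Vec<(u8, u8)>, scratch: u8) -> Vec<(u8, u8)> {
    let mut ordered = Vec::new();
    while !pending.is_empty() {
        // A move is safe when its destination is not read by any other pending move.
        let safe_idx = (0..pending.len()).find(|&i| {
            let dst = pending[i].0;
            pending.iter().enumerate().all(|(j, &(_, src))| j == i || src != dst)
        });
        match safe_idx {
            Some(i) => ordered.push(pending.remove(i)),
            None => {
                // Every destination is still needed as a source: a cycle.
                // Save one source to the scratch register and retry the rewritten move.
                let (dst, src) = pending.remove(0);
                ordered.push((scratch, src));
                pending.push((dst, scratch));
            }
        }
    }
    ordered
}

fn main() {
    // Swap r0 and r1, plus an independent move r3 <- r2, with r15 as scratch.
    let moves = order_moves(vec![(0, 1), (1, 0), (3, 2)], 15);
    println!("{:?}", moves); // [(3, 2), (15, 1), (1, 0), (0, 15)]
}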
- let mut return_ctx = ctx.clone(); - return_ctx.stack_pop(sp_offset.try_into().unwrap()); - return_ctx.stack_push(Type::Unknown); - return_ctx.set_sp_offset(1); - return_ctx.reset_chain_depth(); + // Pop arguments and receiver in return context and + // mark it as a continuation of gen_leave() + let mut return_asm = Assembler::new(jit.num_locals()); + return_asm.ctx = asm.ctx; + return_asm.stack_pop(sp_offset.try_into().unwrap()); + return_asm.ctx.set_sp_offset(0); // We set SP on the caller's frame above + return_asm.ctx.reset_chain_depth_and_defer(); + return_asm.ctx.set_as_return_landing(); + + // Stub so we can return to JITted code + let return_block = BlockId { + iseq: jit.iseq, + idx: jit.next_insn_idx(), + }; // Write the JIT return address on the callee frame - gen_branch( - jit, + jit.gen_branch( asm, - ocb, return_block, - &return_ctx, + &return_asm.ctx, None, None, - gen_return_branch, + BranchGenFn::JITReturn, ); - //print_str(cb, "calling Ruby func:"); - //print_str(cb, rb_id2name(vm_ci_mid(ci))); + // ec->cfp is updated after cfp->jit_return for rb_profile_frames() safety + asm_comment!(asm, "switch to new CFP"); + let new_cfp = asm.sub(CFP, RUBY_SIZEOF_CONTROL_FRAME.into()); + asm.mov(CFP, new_cfp); + asm.store(Opnd::mem(64, EC, RUBY_OFFSET_EC_CFP as i32), CFP); // Directly jump to the entry point of the callee gen_direct_jump( jit, &callee_ctx, - BlockId { - iseq: iseq, - idx: start_pc_offset, - }, + callee_blockid, asm, ); - EndBlock + Some(EndBlock) +} + +// Check if we can handle a keyword call +fn gen_iseq_kw_call_checks( + jit: &JITState, + asm: &mut Assembler, + iseq: *const rb_iseq_t, + kw_arg: *const rb_callinfo_kwarg, + has_kwrest: bool, + caller_kw_num: i32 +) -> Option<()> { + // This struct represents the metadata about the callee-specified + // keyword parameters. + let keyword = unsafe { get_iseq_body_param_keyword(iseq) }; + let keyword_num: usize = unsafe { (*keyword).num }.try_into().unwrap(); + let keyword_required_num: usize = unsafe { (*keyword).required_num }.try_into().unwrap(); + + let mut required_kwargs_filled = 0; + + if keyword_num > 30 || caller_kw_num > 64 { + // We have so many keywords that (1 << num) encoded as a FIXNUM + // (which shifts it left one more) no longer fits inside a 32-bit + // immediate. Similarly, we use a u64 in case of keyword rest parameter. + gen_counter_incr(jit, asm, Counter::send_iseq_too_many_kwargs); + return None; + } + + // Check that the kwargs being passed are valid + if caller_kw_num > 0 { + // This is the list of keyword arguments that the callee specified + // in its initial declaration. + // SAFETY: see compile.c for sizing of this slice. + let callee_kwargs = if keyword_num == 0 { + &[] + } else { + unsafe { slice::from_raw_parts((*keyword).table, keyword_num) } + }; + + // Here we're going to build up a list of the IDs that correspond to + // the caller-specified keyword arguments. If they're not in the + // same order as the order specified in the callee declaration, then + // we're going to need to generate some code to swap values around + // on the stack. 
+ let kw_arg_keyword_len = caller_kw_num as usize; + let mut caller_kwargs: Vec<ID> = vec![0; kw_arg_keyword_len]; + for kwarg_idx in 0..kw_arg_keyword_len { + let sym = unsafe { get_cikw_keywords_idx(kw_arg, kwarg_idx.try_into().unwrap()) }; + caller_kwargs[kwarg_idx] = unsafe { rb_sym2id(sym) }; + } + + // First, we're going to be sure that the names of every + // caller-specified keyword argument correspond to a name in the + // list of callee-specified keyword parameters. + for caller_kwarg in caller_kwargs { + let search_result = callee_kwargs + .iter() + .enumerate() // inject element index + .find(|(_, &kwarg)| kwarg == caller_kwarg); + + match search_result { + None if !has_kwrest => { + // If the keyword was never found, then we know we have a + // mismatch in the names of the keyword arguments, so we need to + // bail. + gen_counter_incr(jit, asm, Counter::send_iseq_kwargs_mismatch); + return None; + } + Some((callee_idx, _)) if callee_idx < keyword_required_num => { + // Keep a count to ensure all required kwargs are specified + required_kwargs_filled += 1; + } + _ => (), + } + } + } + assert!(required_kwargs_filled <= keyword_required_num); + if required_kwargs_filled != keyword_required_num { + gen_counter_incr(jit, asm, Counter::send_iseq_kwargs_mismatch); + return None; + } + + Some(()) +} + +// Codegen for keyword argument handling. Essentially private to gen_send_iseq() since +// there are a lot of preconditions to check before reaching this code. +fn gen_iseq_kw_call( + jit: &mut JITState, + asm: &mut Assembler, + ci_kwarg: *const rb_callinfo_kwarg, + iseq: *const rb_iseq_t, + mut argc: i32, + has_kwrest: bool, +) -> i32 { + let caller_keyword_len_i32: i32 = if ci_kwarg.is_null() { + 0 + } else { + unsafe { get_cikw_keyword_len(ci_kwarg) } + }; + let caller_keyword_len: usize = caller_keyword_len_i32.try_into().unwrap(); + let anon_kwrest = unsafe { rb_get_iseq_flags_anon_kwrest(iseq) && !get_iseq_flags_has_kw(iseq) }; + + // This struct represents the metadata about the callee-specified + // keyword parameters. + let keyword = unsafe { get_iseq_body_param_keyword(iseq) }; + + asm_comment!(asm, "keyword args"); + + // This is the list of keyword arguments that the callee specified + // in its initial declaration. + let callee_kwargs = unsafe { (*keyword).table }; + let callee_kw_count_i32: i32 = unsafe { (*keyword).num }; + let callee_kw_count: usize = callee_kw_count_i32.try_into().unwrap(); + let keyword_required_num: usize = unsafe { (*keyword).required_num }.try_into().unwrap(); + + // Here we're going to build up a list of the IDs that correspond to + // the caller-specified keyword arguments. If they're not in the + // same order as the order specified in the callee declaration, then + // we're going to need to generate some code to swap values around + // on the stack. + let mut kwargs_order: Vec<ID> = vec![0; cmp::max(caller_keyword_len, callee_kw_count)]; + for kwarg_idx in 0..caller_keyword_len { + let sym = unsafe { get_cikw_keywords_idx(ci_kwarg, kwarg_idx.try_into().unwrap()) }; + kwargs_order[kwarg_idx] = unsafe { rb_sym2id(sym) }; + } + + let mut unspecified_bits = 0; + + // The stack_opnd() index to the 0th keyword argument. + let kwargs_stack_base = caller_keyword_len_i32 - 1; + + // Build the keyword rest parameter hash before we make any changes to the order of + // the supplied keyword arguments + let kwrest_type = if has_kwrest { + c_callable! 
{ + fn build_kw_rest(rest_mask: u64, stack_kwargs: *const VALUE, keywords: *const rb_callinfo_kwarg) -> VALUE { + if keywords.is_null() { + return unsafe { rb_hash_new() }; + } + + // Use the total number of supplied keywords as a size upper bound + let keyword_len = unsafe { (*keywords).keyword_len } as usize; + let hash = unsafe { rb_hash_new_with_size(keyword_len as u64) }; + + // Put pairs into the kwrest hash as the mask describes + for kwarg_idx in 0..keyword_len { + if (rest_mask & (1 << kwarg_idx)) != 0 { + unsafe { + let keyword_symbol = (*keywords).keywords.as_ptr().add(kwarg_idx).read(); + let keyword_value = stack_kwargs.add(kwarg_idx).read(); + rb_hash_aset(hash, keyword_symbol, keyword_value); + } + } + } + return hash; + } + } + + asm_comment!(asm, "build kwrest hash"); + + // Make a bit mask describing which keywords should go into kwrest. + let mut rest_mask: u64 = 0; + // Index for one argument that will go into kwrest. + let mut rest_collected_idx = None; + for (supplied_kw_idx, &supplied_kw) in kwargs_order.iter().take(caller_keyword_len).enumerate() { + let mut found = false; + for callee_idx in 0..callee_kw_count { + let callee_kw = unsafe { callee_kwargs.add(callee_idx).read() }; + if callee_kw == supplied_kw { + found = true; + break; + } + } + if !found { + rest_mask |= 1 << supplied_kw_idx; + if rest_collected_idx.is_none() { + rest_collected_idx = Some(supplied_kw_idx as i32); + } + } + } + + let (kwrest, kwrest_type) = if rest_mask == 0 && anon_kwrest { + // In case the kwrest hash should be empty and is anonymous in the callee, + // we can pass nil instead of allocating. Anonymous kwrest can only be + // delegated, and nil is the same as an empty hash when delegating. + (Qnil.into(), Type::Nil) + } else { + // Save PC and SP before allocating + jit_save_pc(jit, asm); + gen_save_sp(asm); + + // Build the kwrest hash. `struct rb_callinfo_kwarg` is malloc'd, so no GC concerns. + let kwargs_start = asm.lea(asm.ctx.sp_opnd(-caller_keyword_len_i32)); + let hash = asm.ccall( + build_kw_rest as _, + vec![rest_mask.into(), kwargs_start, Opnd::const_ptr(ci_kwarg.cast())] + ); + (hash, Type::THash) + }; + + // The kwrest parameter sits after `unspecified_bits` if the callee specifies any + // keywords. + let stack_kwrest_idx = kwargs_stack_base - callee_kw_count_i32 - i32::from(callee_kw_count > 0); + let stack_kwrest = asm.stack_opnd(stack_kwrest_idx); + // If `stack_kwrest` already has another argument there, we need to stow it elsewhere + // first before putting kwrest there. Use `rest_collected_idx` because that value went + // into kwrest so the slot is now free. + let kwrest_idx = callee_kw_count + usize::from(callee_kw_count > 0); + if let (Some(rest_collected_idx), true) = (rest_collected_idx, kwrest_idx < caller_keyword_len) { + let rest_collected = asm.stack_opnd(kwargs_stack_base - rest_collected_idx); + let mapping = asm.ctx.get_opnd_mapping(stack_kwrest.into()); + asm.mov(rest_collected, stack_kwrest); + asm.ctx.set_opnd_mapping(rest_collected.into(), mapping); + // Update our bookkeeping to inform the reordering step later. 
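Editorial aside: the keyword-rest handling here splits the work in two. At compile time it builds a bit mask of the caller keywords the callee does not accept; at run time build_kw_rest copies exactly those entries into the kwrest hash. The sketch below models both halves with u64 symbol IDs and a std HashMap standing in for Ruby symbols and the Hash object; it is an illustration under those stand-ins, not the C-callable helper itself.

use std::collections::HashMap;

/// Compile-time half: bit i is set iff the i-th caller keyword is not a callee keyword.
fn kwrest_mask(caller_kws: &[u64], callee_kws: &[u64]) -> u64 {
    let mut mask = 0u64;
    for (i, kw) in caller_kws.iter().enumerate() {
        if !callee_kws.contains(kw) {
            mask |= 1 << i;
        }
    }
    mask
}

/// Run-time half: copy the masked (keyword, value) pairs into the rest hash,
/// the way build_kw_rest does with rb_hash_aset.
fn build_kw_rest(mask: u64, caller_kws: &[u64], values: &[i64]) -> HashMap<u64, i64> {
    let mut rest = HashMap::with_capacity(caller_kws.len());
    for i in 0..caller_kws.len() {
        if mask & (1 << i) != 0 {
            rest.insert(caller_kws[i], values[i]);
        }
    }
    rest
}

fn main() {
    let caller = [10, 20, 30]; // e.g. foo(a: 1, b: 2, c: 3)
    let callee = [10];         // def foo(a:, **rest)
    let mask = kwrest_mask(&caller, &callee);
    assert_eq!(mask, 0b110);
    let rest = build_kw_rest(mask, &caller, &[1, 2, 3]);
    assert_eq!(rest.get(&20), Some(&2));
    assert_eq!(rest.get(&30), Some(&3));
    assert_eq!(rest.len(), 2);
}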
+ kwargs_order[rest_collected_idx as usize] = kwargs_order[kwrest_idx]; + kwargs_order[kwrest_idx] = 0; + } + // Put kwrest straight into memory, since we might pop it later + asm.ctx.dealloc_reg(stack_kwrest.reg_opnd()); + asm.mov(stack_kwrest, kwrest); + if stack_kwrest_idx >= 0 { + asm.ctx.set_opnd_mapping(stack_kwrest.into(), TempMapping::MapToStack(kwrest_type)); + } + + Some(kwrest_type) + } else { + None + }; + + // Ensure the stack is large enough for the callee + for _ in caller_keyword_len..callee_kw_count { + argc += 1; + asm.stack_push(Type::Unknown); + } + // Now this is the stack_opnd() index to the 0th keyword argument. + let kwargs_stack_base = kwargs_order.len() as i32 - 1; + + // Next, we're going to loop through every keyword that was + // specified by the caller and make sure that it's in the correct + // place. If it's not we're going to swap it around with another one. + for kwarg_idx in 0..callee_kw_count { + let callee_kwarg = unsafe { callee_kwargs.add(kwarg_idx).read() }; + + // If the argument is already in the right order, then we don't + // need to generate any code since the expected value is already + // in the right place on the stack. + if callee_kwarg == kwargs_order[kwarg_idx] { + continue; + } + + // In this case the argument is not in the right place, so we + // need to find its position where it _should_ be and swap with + // that location. + for swap_idx in 0..kwargs_order.len() { + if callee_kwarg == kwargs_order[swap_idx] { + // First we're going to generate the code that is going + // to perform the actual swapping at runtime. + let swap_idx_i32: i32 = swap_idx.try_into().unwrap(); + let kwarg_idx_i32: i32 = kwarg_idx.try_into().unwrap(); + let offset0 = kwargs_stack_base - swap_idx_i32; + let offset1 = kwargs_stack_base - kwarg_idx_i32; + stack_swap(asm, offset0, offset1); + + // Next we're going to do some bookkeeping on our end so + // that we know the order that the arguments are + // actually in now. + kwargs_order.swap(kwarg_idx, swap_idx); + + break; + } + } + } + + // Now that every caller specified kwarg is in the right place, filling + // in unspecified default paramters won't overwrite anything. + for kwarg_idx in keyword_required_num..callee_kw_count { + if kwargs_order[kwarg_idx] != unsafe { callee_kwargs.add(kwarg_idx).read() } { + let default_param_idx = kwarg_idx - keyword_required_num; + let mut default_value = unsafe { (*keyword).default_values.add(default_param_idx).read() }; + + if default_value == Qundef { + // Qundef means that this value is not constant and must be + // recalculated at runtime, so we record it in unspecified_bits + // (Qnil is then used as a placeholder instead of Qundef). + unspecified_bits |= 0x01 << default_param_idx; + default_value = Qnil; + } + + let default_param = asm.stack_opnd(kwargs_stack_base - kwarg_idx as i32); + let param_type = Type::from(default_value); + asm.mov(default_param, default_value.into()); + asm.ctx.set_opnd_mapping(default_param.into(), TempMapping::MapToStack(param_type)); + } + } + + // Pop extra arguments that went into kwrest now that they're at stack top + if has_kwrest && caller_keyword_len > callee_kw_count { + let extra_kwarg_count = caller_keyword_len - callee_kw_count; + asm.stack_pop(extra_kwarg_count); + argc = argc - extra_kwarg_count as i32; + } + + // Keyword arguments cause a special extra local variable to be + // pushed onto the stack that represents the parameters that weren't + // explicitly given a value and have a non-constant default. 
+ if callee_kw_count > 0 { + let unspec_opnd = VALUE::fixnum_from_usize(unspecified_bits).as_u64(); + let top = asm.stack_push(Type::Fixnum); + asm.mov(top, unspec_opnd.into()); + argc += 1; + } + + // The kwrest parameter sits after `unspecified_bits` + if let Some(kwrest_type) = kwrest_type { + let kwrest = asm.stack_push(kwrest_type); + // We put the kwrest parameter in memory earlier + asm.ctx.dealloc_reg(kwrest.reg_opnd()); + argc += 1; + } + + argc +} + +/// This is a helper function to allow us to exit early +/// during code generation if a predicate is true. +/// We return Option<()> here because we will be able to +/// short-circuit using the ? operator if we return None. +/// It would be great if rust let you implement ? for your +/// own types, but as of right now they don't. +fn exit_if(jit: &JITState, asm: &mut Assembler, pred: bool, counter: Counter) -> Option<()> { + if pred { + gen_counter_incr(jit, asm, counter); + return None + } + Some(()) +} + +#[must_use] +fn exit_if_tail_call(jit: &JITState, asm: &mut Assembler, ci: *const rb_callinfo) -> Option<()> { + exit_if(jit, asm, unsafe { vm_ci_flag(ci) } & VM_CALL_TAILCALL != 0, Counter::send_iseq_tailcall) +} + +#[must_use] +fn exit_if_has_post(jit: &JITState, asm: &mut Assembler, iseq: *const rb_iseq_t) -> Option<()> { + exit_if(jit, asm, unsafe { get_iseq_flags_has_post(iseq) }, Counter::send_iseq_has_post) +} + +#[must_use] +fn exit_if_kwsplat_non_nil(jit: &JITState, asm: &mut Assembler, flags: u32, counter: Counter) -> Option<()> { + let kw_splat = flags & VM_CALL_KW_SPLAT != 0; + let kw_splat_stack = StackOpnd((flags & VM_CALL_ARGS_BLOCKARG != 0).into()); + exit_if(jit, asm, kw_splat && asm.ctx.get_opnd_type(kw_splat_stack) != Type::Nil, counter) +} + +#[must_use] +fn exit_if_has_rest_and_captured(jit: &JITState, asm: &mut Assembler, iseq_has_rest: bool, captured_opnd: Option<Opnd>) -> Option<()> { + exit_if(jit, asm, iseq_has_rest && captured_opnd.is_some(), Counter::send_iseq_has_rest_and_captured) +} + +#[must_use] +fn exit_if_has_kwrest_and_captured(jit: &JITState, asm: &mut Assembler, iseq_has_kwrest: bool, captured_opnd: Option<Opnd>) -> Option<()> { + // We need to call a C function to allocate the kwrest hash, but also need to hold the captred + // block across the call, which we can't do. + exit_if(jit, asm, iseq_has_kwrest && captured_opnd.is_some(), Counter::send_iseq_has_kwrest_and_captured) +} + +#[must_use] +fn exit_if_has_rest_and_supplying_kws(jit: &JITState, asm: &mut Assembler, iseq_has_rest: bool, supplying_kws: bool) -> Option<()> { + // There can be a gap between the rest parameter array and the supplied keywords, or + // no space to put the rest array (e.g. `def foo(*arr, k:) = arr; foo(k: 1)` 1 is + // sitting where the rest array should be). + exit_if( + jit, + asm, + iseq_has_rest && supplying_kws, + Counter::send_iseq_has_rest_and_kw_supplied, + ) +} + +#[must_use] +fn exit_if_supplying_kw_and_has_no_kw(jit: &JITState, asm: &mut Assembler, supplying_kws: bool, callee_kws: bool) -> Option<()> { + // Passing keyword arguments to a callee means allocating a hash and treating + // that as a positional argument. Bail for now. + exit_if( + jit, + asm, + supplying_kws && !callee_kws, + Counter::send_iseq_has_no_kw, + ) +} + +#[must_use] +fn exit_if_supplying_kws_and_accept_no_kwargs(jit: &JITState, asm: &mut Assembler, supplying_kws: bool, iseq: *const rb_iseq_t) -> Option<()> { + // If we have a method accepting no kwargs (**nil), exit if we have passed + // it any kwargs. 
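Editorial aside: the keyword shuffling in gen_iseq_kw_call is essentially a selection-style reorder of the caller's keyword slots into the callee's declared order, followed by filling constant defaults and flagging non-constant ones in a bit mask the callee reads. The sketch below reproduces that bookkeeping on plain vectors, with u64 IDs, 0 as the empty-slot marker (like kwargs_order), and None standing in for a Qundef default; it assumes every caller keyword is accepted, which the patch checks earlier in gen_iseq_kw_call_checks.

/// Reorder caller keyword slots into callee order, fill constant defaults for
/// keywords the caller left out, and set a bit for each default that is not a
/// compile-time constant so the callee computes it itself.
fn arrange_kwargs(
    caller_kws: &[u64],
    caller_vals: &[i64],
    callee_kws: &[u64],
    required: usize,
    defaults: &[Option<i64>],
) -> (Vec<i64>, u64) {
    let total = callee_kws.len();
    let mut order = caller_kws.to_vec();
    let mut vals = caller_vals.to_vec();
    order.resize(total, 0); // assumes caller_kws.len() <= total
    vals.resize(total, 0);

    // Swap each slot into the position the callee declared it in (stack_swap in the patch).
    for idx in 0..total {
        if order[idx] != callee_kws[idx] {
            if let Some(swap) = (0..total).find(|&j| order[j] == callee_kws[idx]) {
                order.swap(idx, swap);
                vals.swap(idx, swap);
            }
        }
    }

    // Fill defaults for optional keywords the caller never supplied.
    let mut unspecified_bits = 0u64;
    for idx in required..total {
        if order[idx] != callee_kws[idx] {
            let opt_idx = idx - required;
            match defaults[opt_idx] {
                Some(v) => vals[idx] = v, // constant default, written directly
                None => {
                    // Non-constant default (Qundef in the patch): leave a nil-like
                    // placeholder and flag it for the callee.
                    vals[idx] = 0;
                    unspecified_bits |= 1 << opt_idx;
                }
            }
        }
    }
    (vals, unspecified_bits)
}

fn main() {
    // Callee: def m(a:, b: 1, c: compute_default); IDs a=1, b=2, c=3.
    // Caller: m(b: 20, a: 10). c is left to its non-constant default.
    let (vals, bits) = arrange_kwargs(&[2, 1], &[20, 10], &[1, 2, 3], 1, &[Some(1), None]);
    assert_eq!(vals, vec![10, 20, 0]); // a, b, placeholder for c
    assert_eq!(bits, 0b10);            // bit 1: c must be computed by the callee
}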
+ exit_if( + jit, + asm, + supplying_kws && unsafe { get_iseq_flags_accepts_no_kwarg(iseq) }, + Counter::send_iseq_accepts_no_kwarg + ) +} + +#[must_use] +fn exit_if_doing_kw_and_splat(jit: &JITState, asm: &mut Assembler, doing_kw_call: bool, flags: u32) -> Option<()> { + exit_if(jit, asm, doing_kw_call && flags & VM_CALL_ARGS_SPLAT != 0, Counter::send_iseq_splat_with_kw) +} + +#[must_use] +fn exit_if_wrong_number_arguments( + jit: &JITState, + asm: &mut Assembler, + args_setup_block: bool, + opts_filled: i32, + flags: u32, + opt_num: i32, + iseq_has_rest: bool, +) -> Option<()> { + // Too few arguments and no splat to make up for it + let too_few = opts_filled < 0 && flags & VM_CALL_ARGS_SPLAT == 0; + // Too many arguments and no sink that take them + let too_many = opts_filled > opt_num && !(iseq_has_rest || args_setup_block); + + exit_if(jit, asm, too_few || too_many, Counter::send_iseq_arity_error) +} + +#[must_use] +fn exit_if_doing_kw_and_opts_missing(jit: &JITState, asm: &mut Assembler, doing_kw_call: bool, opts_missing: i32) -> Option<()> { + // If we have unfilled optional arguments and keyword arguments then we + // would need to adjust the arguments location to account for that. + // For now we aren't handling this case. + exit_if(jit, asm, doing_kw_call && opts_missing > 0, Counter::send_iseq_missing_optional_kw) +} + +#[must_use] +fn exit_if_has_rest_and_optional_and_block(jit: &JITState, asm: &mut Assembler, iseq_has_rest: bool, opt_num: i32, iseq: *const rb_iseq_t, block_arg: bool) -> Option<()> { + exit_if( + jit, + asm, + iseq_has_rest && opt_num != 0 && (unsafe { get_iseq_flags_has_block(iseq) } || block_arg), + Counter::send_iseq_has_rest_opt_and_block + ) +} + +#[derive(Clone, Copy)] +enum BlockArg { + Nil, + /// A special sentinel value indicating the block parameter should be read from + /// the current surrounding cfp + BlockParamProxy, + /// A proc object. 
Could be an instance of a subclass of ::rb_cProc + TProc, +} + +#[must_use] +fn exit_if_unsupported_block_arg_type( + jit: &mut JITState, + asm: &mut Assembler, + supplying_block_arg: bool +) -> Option<Option<BlockArg>> { + let block_arg_type = if supplying_block_arg { + asm.ctx.get_opnd_type(StackOpnd(0)) + } else { + // Passing no block argument + return Some(None); + }; + + match block_arg_type { + // We'll handle Nil and BlockParamProxy later + Type::Nil => Some(Some(BlockArg::Nil)), + Type::BlockParamProxy => Some(Some(BlockArg::BlockParamProxy)), + _ if { + let sample_block_arg = jit.peek_at_stack(&asm.ctx, 0); + unsafe { rb_obj_is_proc(sample_block_arg) }.test() + } => { + // Speculate that we'll have a proc as the block arg + Some(Some(BlockArg::TProc)) + } + _ => { + gen_counter_incr(jit, asm, Counter::send_iseq_block_arg_type); + None + } + } +} + +#[must_use] +fn exit_if_stack_too_large(iseq: *const rb_iseq_t) -> Option<()> { + let stack_max = unsafe { rb_get_iseq_body_stack_max(iseq) }; + // Reject ISEQs with very large temp stacks, + // this will allow us to use u8/i8 values to track stack_size and sp_offset + if stack_max >= i8::MAX as u32 { + incr_counter!(iseq_stack_too_large); + return None; + } + Some(()) } fn gen_struct_aref( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, ci: *const rb_callinfo, cme: *const rb_callable_method_entry_t, comptime_recv: VALUE, - _comptime_recv_klass: VALUE, flags: u32, argc: i32, -) -> CodegenStatus { +) -> Option<CodegenStatus> { if unsafe { vm_ci_argc(ci) } != 0 { - return CantCompile; + return None; } let off: i32 = unsafe { get_cme_def_body_optimized_index(cme) } @@ -5659,13 +8894,20 @@ fn gen_struct_aref( { let native_off = (off as i64) * (SIZEOF_VALUE as i64); if native_off > (i32::MAX as i64) { - return CantCompile; + return None; } } + if c_method_tracing_currently_enabled(jit) { + // Struct accesses need fire c_call and c_return events, which we can't support + // See :attr-tracing: + gen_counter_incr(jit, asm, Counter::send_cfunc_tracing); + return None; + } + // This is a .send call and we need to adjust the stack if flags & VM_CALL_OPT_SEND != 0 { - handle_opt_send_shift_stack(asm, argc, ctx); + handle_opt_send_shift_stack(asm, argc); } // All structs from the same Struct class should have the same @@ -5674,9 +8916,10 @@ fn gen_struct_aref( // true of the converse. 
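Editorial aside: the exit_if_* helpers above all lean on the same trick: returning Option<()> lets a long chain of preconditions short-circuit with the ? operator while still recording why compilation was abandoned. A minimal standalone sketch of that pattern, with a hypothetical reason list in place of YJIT's Counter enum and gen_counter_incr:

#[derive(Debug)]
enum FallbackReason { TailCall, TooManyArgs }

fn exit_if(pred: bool, reason: FallbackReason, counters: &mut Vec<FallbackReason>) -> Option<()> {
    if pred {
        counters.push(reason); // stand-in for gen_counter_incr
        return None;
    }
    Some(())
}

/// Returns Some(()) only when every precondition holds; the first failing
/// check records why we bailed and aborts the rest via `?`.
fn try_compile(tail_call: bool, argc: usize, counters: &mut Vec<FallbackReason>) -> Option<()> {
    exit_if(tail_call, FallbackReason::TailCall, counters)?;
    exit_if(argc > 30, FallbackReason::TooManyArgs, counters)?;
    Some(())
}

fn main() {
    let mut counters = Vec::new();
    assert!(try_compile(false, 2, &mut counters).is_some());
    assert!(try_compile(true, 2, &mut counters).is_none());
    println!("fallbacks: {:?}", counters); // [TailCall]
}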
let embedded = unsafe { FL_TEST_RAW(comptime_recv, VALUE(RSTRUCT_EMBED_LEN_MASK)) }; - asm.comment("struct aref"); + asm_comment!(asm, "struct aref"); - let recv = asm.load(ctx.stack_pop(1)); + let recv = asm.stack_pop(1); + let recv = asm.load(recv); let val = if embedded != VALUE(0) { Opnd::mem(64, recv, RUBY_OFFSET_RSTRUCT_AS_ARY + (SIZEOF_VALUE_I32 * off)) @@ -5685,32 +8928,41 @@ fn gen_struct_aref( Opnd::mem(64, rstruct_ptr, SIZEOF_VALUE_I32 * off) }; - let ret = ctx.stack_push(Type::Unknown); + let ret = asm.stack_push(Type::Unknown); asm.mov(ret, val); - jump_to_next_insn(jit, ctx, asm, ocb); - EndBlock + jump_to_next_insn(jit, asm) } fn gen_struct_aset( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, ci: *const rb_callinfo, cme: *const rb_callable_method_entry_t, comptime_recv: VALUE, - _comptime_recv_klass: VALUE, flags: u32, argc: i32, -) -> CodegenStatus { +) -> Option<CodegenStatus> { if unsafe { vm_ci_argc(ci) } != 1 { - return CantCompile; + return None; + } + + // If the comptime receiver is frozen, writing a struct member will raise an exception + // and we don't want to JIT code to deal with that situation. + if comptime_recv.is_frozen() { + return None; + } + + if c_method_tracing_currently_enabled(jit) { + // Struct accesses need fire c_call and c_return events, which we can't support + // See :attr-tracing: + gen_counter_incr(jit, asm, Counter::send_cfunc_tracing); + return None; } // This is a .send call and we need to adjust the stack if flags & VM_CALL_OPT_SEND != 0 { - handle_opt_send_shift_stack(asm, argc, ctx); + handle_opt_send_shift_stack(asm, argc); } let off: i32 = unsafe { get_cme_def_body_optimized_index(cme) } @@ -5721,28 +8973,78 @@ fn gen_struct_aset( assert!(unsafe { RB_TYPE_P(comptime_recv, RUBY_T_STRUCT) }); assert!((off as i64) < unsafe { RSTRUCT_LEN(comptime_recv) }); - asm.comment("struct aset"); + // Even if the comptime recv was not frozen, future recv may be. So we need to emit a guard + // that the recv is not frozen. + // We know all structs are heap objects, so we can check the flag directly. + let recv = asm.stack_opnd(1); + let recv = asm.load(recv); + let flags = asm.load(Opnd::mem(VALUE_BITS, recv, RUBY_OFFSET_RBASIC_FLAGS)); + asm.test(flags, (RUBY_FL_FREEZE as u64).into()); + asm.jnz(Target::side_exit(Counter::opt_aset_frozen)); + + // Not frozen, so we can proceed. - let val = ctx.stack_pop(1); - let recv = ctx.stack_pop(1); + asm_comment!(asm, "struct aset"); + + let val = asm.stack_pop(1); + let recv = asm.stack_pop(1); let val = asm.ccall(RSTRUCT_SET as *const u8, vec![recv, (off as i64).into(), val]); - let ret = ctx.stack_push(Type::Unknown); + let ret = asm.stack_push(Type::Unknown); asm.mov(ret, val); - jump_to_next_insn(jit, ctx, asm, ocb); - EndBlock + jump_to_next_insn(jit, asm) +} + +// Generate code that calls a method with dynamic dispatch +fn gen_send_dynamic<F: Fn(&mut Assembler) -> Opnd>( + jit: &mut JITState, + asm: &mut Assembler, + cd: *const rb_call_data, + sp_pops: usize, + vm_sendish: F, +) -> Option<CodegenStatus> { + // Our frame handling is not compatible with tailcall + if unsafe { vm_ci_flag((*cd).ci) } & VM_CALL_TAILCALL != 0 { + return None; + } + jit_perf_symbol_push!(jit, asm, "gen_send_dynamic", PerfMap::Codegen); + + // Rewind stack_size using ctx.with_stack_size to allow stack_size changes + // before you return None. 
+ asm.ctx = asm.ctx.with_stack_size(jit.stack_size_for_pc); + + // Save PC and SP to prepare for dynamic dispatch + jit_prepare_non_leaf_call(jit, asm); + + // Dispatch a method + let ret = vm_sendish(asm); + + // Pop arguments and a receiver + asm.stack_pop(sp_pops); + + // Push the return value + let stack_ret = asm.stack_push(Type::Unknown); + asm.mov(stack_ret, ret); + + // Fix the interpreter SP deviated by vm_sendish + asm.mov(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP), SP); + + gen_counter_incr(jit, asm, Counter::num_send_dynamic); + + jit_perf_symbol_pop!(jit, asm, PerfMap::Codegen); + + // End the current block for invalidationg and sharing the same successor + jump_to_next_insn(jit, asm) } fn gen_send_general( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, cd: *const rb_call_data, - block: Option<IseqPtr>, -) -> CodegenStatus { + block: Option<BlockHandler>, +) -> Option<CodegenStatus> { // Relevant definitions: // rb_execution_context_t : vm_core.h // invoker, cfunc logic : method.h, vm_method.c @@ -5758,49 +9060,78 @@ fn gen_send_general( let mut mid = unsafe { vm_ci_mid(ci) }; let mut flags = unsafe { vm_ci_flag(ci) }; - // Don't JIT calls with keyword splat - if flags & VM_CALL_KW_SPLAT != 0 { - gen_counter_incr!(asm, send_kw_splat); - return CantCompile; + // Defer compilation so we can specialize on class of receiver + if !jit.at_compile_target() { + return jit.defer_compilation(asm); } - // Defer compilation so we can specialize on class of receiver - if !jit_at_current_insn(jit) { - defer_compilation(jit, ctx, asm, ocb); - return EndBlock; + let ci_flags = unsafe { vm_ci_flag(ci) }; + + // Dynamic stack layout. No good way to support without inlining. + if ci_flags & VM_CALL_FORWARDING != 0 { + gen_counter_incr(jit, asm, Counter::send_forwarding); + return None; } let recv_idx = argc + if flags & VM_CALL_ARGS_BLOCKARG != 0 { 1 } else { 0 }; - - let comptime_recv = jit_peek_at_stack(jit, ctx, recv_idx as isize); + let comptime_recv = jit.peek_at_stack(&asm.ctx, recv_idx as isize); let comptime_recv_klass = comptime_recv.class_of(); + assert_eq!(RUBY_T_CLASS, comptime_recv_klass.builtin_type(), + "objects visible to ruby code should have a T_CLASS in their klass field"); - // Guard that the receiver has the same class as the one from compile time - let side_exit = get_side_exit(jit, ocb, ctx); + // Don't compile calls through singleton classes to avoid retaining the receiver. + // Make an exception for class methods since classes tend to be retained anyways. + // Also compile calls on top_self to help tests. 
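Editorial aside: gen_send_dynamic follows one fixed shape no matter which vm_sendish routine the closure wraps: pop the receiver and arguments, let the closure perform the dispatch, then push the single return value. The sketch below is a toy model of that shape over a plain Vec, with the closure parameter standing in for the vm_sendish argument and none of the PC/SP bookkeeping.

/// Toy value stack; i64 stands in for VALUE.
struct ToyStack { slots: Vec<i64> }

impl ToyStack {
    /// Pop `sp_pops` operands (receiver plus arguments), let `dispatch` compute the
    /// result from them, then push that result.
    fn send_dynamic<F: Fn(&[i64]) -> i64>(&mut self, sp_pops: usize, dispatch: F) -> i64 {
        let base = self.slots.len() - sp_pops;
        let ret = dispatch(&self.slots[base..]);
        self.slots.truncate(base); // pop arguments and the receiver
        self.slots.push(ret);      // push the return value
        ret
    }
}

fn main() {
    let mut stack = ToyStack { slots: vec![7, 40, 2] }; // receiver 7, args 40 and 2
    // "Dispatch" that adds the two arguments and ignores the receiver.
    let ret = stack.send_dynamic(3, |operands| operands[1] + operands[2]);
    assert_eq!(ret, 42);
    assert_eq!(stack.slots, vec![42]); // operands replaced by the return value
}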
+ if VALUE(0) != unsafe { FL_TEST(comptime_recv_klass, VALUE(RUBY_FL_SINGLETON as usize)) } + && comptime_recv != unsafe { rb_vm_top_self() } + && !unsafe { RB_TYPE_P(comptime_recv, RUBY_T_CLASS) } + && !unsafe { RB_TYPE_P(comptime_recv, RUBY_T_MODULE) } { + gen_counter_incr(jit, asm, Counter::send_singleton_class); + return None; + } // Points to the receiver operand on the stack - let recv = ctx.stack_opnd(recv_idx); - let recv_opnd = StackOpnd(recv_idx.try_into().unwrap()); - jit_guard_known_klass( + let recv = asm.stack_opnd(recv_idx); + let recv_opnd: YARVOpnd = recv.into(); + + // Log the name of the method we're calling to + asm_comment!(asm, "call to {}", get_method_name(Some(comptime_recv_klass), mid)); + + // Gather some statistics about sends + gen_counter_incr(jit, asm, Counter::num_send); + if let Some(_known_klass) = asm.ctx.get_opnd_type(recv_opnd).known_class() { + gen_counter_incr(jit, asm, Counter::num_send_known_class); + } + if asm.ctx.get_chain_depth() > 1 { + gen_counter_incr(jit, asm, Counter::num_send_polymorphic); + } + // If megamorphic, let the caller fallback to dynamic dispatch + if asm.ctx.get_chain_depth() >= SEND_MAX_DEPTH { + gen_counter_incr(jit, asm, Counter::send_megamorphic); + return None; + } + + perf_call!("gen_send_general: ", jit_guard_known_klass( jit, - ctx, asm, - ocb, - comptime_recv_klass, recv, recv_opnd, comptime_recv, SEND_MAX_DEPTH, - side_exit, - ); + Counter::guard_send_klass_megamorphic, + )); // Do method lookup let mut cme = unsafe { rb_callable_method_entry(comptime_recv_klass, mid) }; if cme.is_null() { - // TODO: counter - return CantCompile; + gen_counter_incr(jit, asm, Counter::send_cme_not_found); + return None; } + // Load an overloaded cme if applicable. See vm_search_cc(). + // It allows you to use a faster ISEQ if possible. + cme = unsafe { rb_check_overloaded_cme(cme, ci) }; + let visi = unsafe { METHOD_ENTRY_VISI(cme) }; match visi { METHOD_VISI_PUBLIC => { @@ -5810,15 +9141,16 @@ fn gen_send_general( if flags & VM_CALL_FCALL == 0 { // Can only call private methods with FCALL callsites. // (at the moment they are callsites without a receiver or an explicit `self` receiver) - return CantCompile; + gen_counter_incr(jit, asm, Counter::send_private_not_fcall); + return None; } } METHOD_VISI_PROTECTED => { // If the method call is an FCALL, it is always valid if flags & VM_CALL_FCALL == 0 { - // otherwise we need an ancestry check to ensure the receiver is vaild to be called + // otherwise we need an ancestry check to ensure the receiver is valid to be called // as protected - jit_protected_callee_ancestry_guard(jit, asm, ocb, cme, side_exit); + jit_protected_callee_ancestry_guard(asm, cme); } } _ => { @@ -5828,7 +9160,7 @@ fn gen_send_general( // Register block for invalidation //assert!(cme->called_id == mid); - assume_method_lookup_stable(jit, ocb, cme); + jit.assume_method_lookup_stable(asm, cme); // To handle the aliased method case (VM_METHOD_TYPE_ALIAS) loop { @@ -5838,38 +9170,58 @@ fn gen_send_general( VM_METHOD_TYPE_ISEQ => { let iseq = unsafe { get_def_iseq_ptr((*cme).def) }; let frame_type = VM_FRAME_MAGIC_METHOD | VM_ENV_FLAG_LOCAL; - return gen_send_iseq(jit, ctx, asm, ocb, iseq, ci, frame_type, None, cme, block, flags, argc, None); + return perf_call! { gen_send_iseq(jit, asm, iseq, ci, frame_type, None, cme, block, flags, argc, None) }; } VM_METHOD_TYPE_CFUNC => { - return gen_send_cfunc( + return perf_call! 
{ gen_send_cfunc( jit, - ctx, asm, - ocb, ci, cme, block, - &comptime_recv_klass, + Some(comptime_recv_klass), flags, argc, - ); + ) }; } VM_METHOD_TYPE_IVAR => { - if flags & VM_CALL_ARGS_SPLAT != 0 { - gen_counter_incr!(asm, send_args_splat_ivar); - return CantCompile; + // This is a .send call not supported right now for attr_reader + if flags & VM_CALL_OPT_SEND != 0 { + gen_counter_incr(jit, asm, Counter::send_send_attr_reader); + return None; } - if argc != 0 { - // Argument count mismatch. Getters take no arguments. - gen_counter_incr!(asm, send_getter_arity); - return CantCompile; + if flags & VM_CALL_ARGS_BLOCKARG != 0 { + match asm.ctx.get_opnd_type(StackOpnd(0)) { + Type::Nil | Type::BlockParamProxy => { + // Getters ignore the block arg, and these types of block args can be + // passed without side-effect (never any `to_proc` call). + asm.stack_pop(1); + } + _ => { + gen_counter_incr(jit, asm, Counter::send_getter_block_arg); + return None; + } + } } - // This is a .send call not supported right now for getters - if flags & VM_CALL_OPT_SEND != 0 { - gen_counter_incr!(asm, send_send_getter); - return CantCompile; + if argc != 0 { + // Guard for simple splat of empty array + if VM_CALL_ARGS_SPLAT == flags & (VM_CALL_ARGS_SPLAT | VM_CALL_KWARG | VM_CALL_KW_SPLAT) + && argc == 1 { + // Not using chain guards since on failure these likely end up just raising + // ArgumentError + let splat = asm.stack_opnd(0); + guard_object_is_array(asm, splat, splat.into(), Counter::guard_send_getter_splat_non_empty); + let splat_len = get_array_len(asm, splat); + asm.cmp(splat_len, 0.into()); + asm.jne(Target::side_exit(Counter::guard_send_getter_splat_non_empty)); + asm.stack_pop(1); + } else { + // Argument count mismatch. Getters take no arguments. + gen_counter_incr(jit, asm, Counter::send_getter_arity); + return None; + } } if c_method_tracing_currently_enabled(jit) { @@ -5878,198 +9230,134 @@ fn gen_send_general( // Handling the C method tracing events for attr_accessor // methods is easier than regular C methods as we know the // "method" we are calling into never enables those tracing - // events. Once global invalidation runs, the code for the - // attr_accessor is invalidated and we exit at the closest - // instruction boundary which is always outside of the body of - // the attr_accessor code. - gen_counter_incr!(asm, send_cfunc_tracing); - return CantCompile; + // events. We are never inside the code that needs to be + // invalidated when invalidation happens. 
+ gen_counter_incr(jit, asm, Counter::send_cfunc_tracing); + return None; } + let recv = asm.stack_opnd(0); // the receiver should now be the stack top let ivar_name = unsafe { get_cme_def_body_attr_id(cme) }; - if flags & VM_CALL_ARGS_BLOCKARG != 0 { - gen_counter_incr!(asm, send_block_arg); - return CantCompile; - } - return gen_get_ivar( jit, - ctx, asm, - ocb, SEND_MAX_DEPTH, comptime_recv, ivar_name, recv, - recv_opnd, - side_exit, + recv.into(), ); } VM_METHOD_TYPE_ATTRSET => { + // This is a .send call not supported right now for attr_writer + if flags & VM_CALL_OPT_SEND != 0 { + gen_counter_incr(jit, asm, Counter::send_send_attr_writer); + return None; + } if flags & VM_CALL_ARGS_SPLAT != 0 { - gen_counter_incr!(asm, send_args_splat_attrset); - return CantCompile; + gen_counter_incr(jit, asm, Counter::send_args_splat_attrset); + return None; } if flags & VM_CALL_KWARG != 0 { - gen_counter_incr!(asm, send_attrset_kwargs); - return CantCompile; + gen_counter_incr(jit, asm, Counter::send_attrset_kwargs); + return None; } else if argc != 1 || unsafe { !RB_TYPE_P(comptime_recv, RUBY_T_OBJECT) } { - gen_counter_incr!(asm, send_ivar_set_method); - return CantCompile; + gen_counter_incr(jit, asm, Counter::send_ivar_set_method); + return None; } else if c_method_tracing_currently_enabled(jit) { // Can't generate code for firing c_call and c_return events // See :attr-tracing: - gen_counter_incr!(asm, send_cfunc_tracing); - return CantCompile; + gen_counter_incr(jit, asm, Counter::send_cfunc_tracing); + return None; } else if flags & VM_CALL_ARGS_BLOCKARG != 0 { - gen_counter_incr!(asm, send_block_arg); - return CantCompile; + gen_counter_incr(jit, asm, Counter::send_attrset_block_arg); + return None; } else { let ivar_name = unsafe { get_cme_def_body_attr_id(cme) }; - return gen_set_ivar(jit, ctx, asm, comptime_recv, ivar_name, flags, argc); + return gen_set_ivar(jit, asm, comptime_recv, ivar_name, StackOpnd(1), None); } } // Block method, e.g. define_method(:foo) { :my_block } VM_METHOD_TYPE_BMETHOD => { if flags & VM_CALL_ARGS_SPLAT != 0 { - gen_counter_incr!(asm, send_args_splat_bmethod); - return CantCompile; + gen_counter_incr(jit, asm, Counter::send_args_splat_bmethod); + return None; } - return gen_send_bmethod(jit, ctx, asm, ocb, ci, cme, block, flags, argc); - } - VM_METHOD_TYPE_ZSUPER => { - gen_counter_incr!(asm, send_zsuper_method); - return CantCompile; + return gen_send_bmethod(jit, asm, ci, cme, block, flags, argc); } VM_METHOD_TYPE_ALIAS => { // Retrieve the aliased method and re-enter the switch cme = unsafe { rb_aliased_callable_method_entry(cme) }; continue; } - VM_METHOD_TYPE_UNDEF => { - gen_counter_incr!(asm, send_undef_method); - return CantCompile; - } - VM_METHOD_TYPE_NOTIMPLEMENTED => { - gen_counter_incr!(asm, send_not_implemented_method); - return CantCompile; - } // Send family of methods, e.g. call/apply VM_METHOD_TYPE_OPTIMIZED => { if flags & VM_CALL_ARGS_BLOCKARG != 0 { - gen_counter_incr!(asm, send_block_arg); - return CantCompile; - } - - if flags & VM_CALL_ARGS_SPLAT != 0 { - gen_counter_incr!(asm, send_args_splat_optimized); - return CantCompile; + gen_counter_incr(jit, asm, Counter::send_optimized_block_arg); + return None; } let opt_type = unsafe { get_cme_def_body_optimized_type(cme) }; match opt_type { OPTIMIZED_METHOD_TYPE_SEND => { - // This is for method calls like `foo.send(:bar)` // The `send` method does not get its own stack frame. 
// instead we look up the method and call it, // doing some stack shifting based on the VM_CALL_OPT_SEND flag - let starting_context = ctx.clone(); + // Reject nested cases such as `send(:send, :alias_for_send, :foo))`. + // We would need to do some stack manipulation here or keep track of how + // many levels deep we need to stack manipulate. Because of how exits + // currently work, we can't do stack manipulation until we will no longer + // side exit. + if flags & VM_CALL_OPT_SEND != 0 { + gen_counter_incr(jit, asm, Counter::send_send_nested); + return None; + } if argc == 0 { - gen_counter_incr!(asm, send_send_wrong_args); - return CantCompile; + gen_counter_incr(jit, asm, Counter::send_send_wrong_args); + return None; } argc -= 1; - let compile_time_name = jit_peek_at_stack(jit, ctx, argc as isize); - - if !compile_time_name.string_p() && !compile_time_name.static_sym_p() { - gen_counter_incr!(asm, send_send_chain_not_string_or_sym); - return CantCompile; - } + let compile_time_name = jit.peek_at_stack(&asm.ctx, argc as isize); mid = unsafe { rb_get_symbol_id(compile_time_name) }; if mid == 0 { - gen_counter_incr!(asm, send_send_null_mid); - return CantCompile; + // This also rejects method names that need conversion + gen_counter_incr(jit, asm, Counter::send_send_null_mid); + return None; } cme = unsafe { rb_callable_method_entry(comptime_recv_klass, mid) }; if cme.is_null() { - gen_counter_incr!(asm, send_send_null_cme); - return CantCompile; - } - - // We aren't going to handle `send(send(:foo))`. We would need to - // do some stack manipulation here or keep track of how many levels - // deep we need to stack manipulate - // Because of how exits currently work, we can't do stack manipulation - // until we will no longer side exit. - let def_type = unsafe { get_cme_def_type(cme) }; - if let VM_METHOD_TYPE_OPTIMIZED = def_type { - let opt_type = unsafe { get_cme_def_body_optimized_type(cme) }; - if let OPTIMIZED_METHOD_TYPE_SEND = opt_type { - gen_counter_incr!(asm, send_send_nested); - return CantCompile; - } + gen_counter_incr(jit, asm, Counter::send_send_null_cme); + return None; } flags |= VM_CALL_FCALL | VM_CALL_OPT_SEND; - assume_method_lookup_stable(jit, ocb, cme); - - let (known_class, type_mismatch_exit) = { - if compile_time_name.string_p() { - ( - unsafe { rb_cString }, - counted_exit!(ocb, side_exit, send_send_chain_not_string), + jit.assume_method_lookup_stable(asm, cme); - ) - } else { - ( - unsafe { rb_cSymbol }, - counted_exit!(ocb, side_exit, send_send_chain_not_sym), - ) - } - }; - - jit_guard_known_klass( - jit, - ctx, + asm_comment!( asm, - ocb, - known_class, - ctx.stack_opnd(argc), - StackOpnd(argc as u16), - compile_time_name, - 2, // We have string or symbol, so max depth is 2 - type_mismatch_exit + "guard sending method name \'{}\'", + unsafe { cstr_to_rust_string(rb_id2name(mid)) }.unwrap_or_else(|| "<unknown>".to_owned()), ); - // Need to do this here so we don't have too many live - // values for the register allocator. 
- let name_opnd = asm.load(ctx.stack_opnd(argc)); - + let name_opnd = asm.stack_opnd(argc); let symbol_id_opnd = asm.ccall(rb_get_symbol_id as *const u8, vec![name_opnd]); - asm.comment("chain_guard_send"); - let chain_exit = counted_exit!(ocb, side_exit, send_send_chain); - asm.cmp(symbol_id_opnd, 0.into()); - asm.jbe(chain_exit.into()); - asm.cmp(symbol_id_opnd, mid.into()); jit_chain_guard( JCC_JNE, jit, - &starting_context, asm, - ocb, - SEND_MAX_CHAIN_DEPTH, - chain_exit, + SEND_MAX_DEPTH, + Counter::guard_send_send_name_chain, ); // We have changed the argc, flags, mid, and cme, so we need to re-enter the match @@ -6078,36 +9366,33 @@ fn gen_send_general( } OPTIMIZED_METHOD_TYPE_CALL => { - if block.is_some() { - gen_counter_incr!(asm, send_call_block); - return CantCompile; + gen_counter_incr(jit, asm, Counter::send_call_block); + return None; } if flags & VM_CALL_KWARG != 0 { - gen_counter_incr!(asm, send_call_kwarg); - return CantCompile; + gen_counter_incr(jit, asm, Counter::send_call_kwarg); + return None; } - // Optimize for single ractor mode and avoid runtime check for - // "defined with an un-shareable Proc in a different Ractor" - if !assume_single_ractor_mode(jit, ocb) { - gen_counter_incr!(asm, send_call_multi_ractor); - return CantCompile; + if flags & VM_CALL_ARGS_SPLAT != 0 { + gen_counter_incr(jit, asm, Counter::send_args_splat_opt_call); + return None; } // If this is a .send call we need to adjust the stack if flags & VM_CALL_OPT_SEND != 0 { - handle_opt_send_shift_stack(asm, argc, ctx); + handle_opt_send_shift_stack(asm, argc); } // About to reset the SP, need to load this here let recv_load = asm.load(recv); - let sp = asm.lea(ctx.sp_opnd(0)); + let sp = asm.lea(asm.ctx.sp_opnd(0)); // Save the PC and SP because the callee can make Ruby calls - jit_prepare_routine_call(jit, ctx, asm); + jit_prepare_non_leaf_call(jit, asm); let kw_splat = flags & VM_CALL_KW_SPLAT; let stack_argument_pointer = asm.lea(Opnd::mem(64, sp, -(argc) * SIZEOF_VALUE_I32)); @@ -6121,49 +9406,44 @@ fn gen_send_general( VM_BLOCK_HANDLER_NONE.into(), ]); - ctx.stack_pop(argc as usize + 1); + asm.stack_pop(argc as usize + 1); - let stack_ret = ctx.stack_push(Type::Unknown); + let stack_ret = asm.stack_push(Type::Unknown); asm.mov(stack_ret, ret); - return KeepCompiling; + // End the block to allow invalidating the next instruction + return jump_to_next_insn(jit, asm); } OPTIMIZED_METHOD_TYPE_BLOCK_CALL => { - gen_counter_incr!(asm, send_optimized_method_block_call); - return CantCompile; + gen_counter_incr(jit, asm, Counter::send_optimized_method_block_call); + return None; } OPTIMIZED_METHOD_TYPE_STRUCT_AREF => { if flags & VM_CALL_ARGS_SPLAT != 0 { - gen_counter_incr!(asm, send_args_splat_aref); - return CantCompile; + gen_counter_incr(jit, asm, Counter::send_args_splat_aref); + return None; } return gen_struct_aref( jit, - ctx, asm, - ocb, ci, cme, comptime_recv, - comptime_recv_klass, flags, argc, ); } OPTIMIZED_METHOD_TYPE_STRUCT_ASET => { if flags & VM_CALL_ARGS_SPLAT != 0 { - gen_counter_incr!(asm, send_args_splat_aset); - return CantCompile; + gen_counter_incr(jit, asm, Counter::send_args_splat_aset); + return None; } return gen_struct_aset( jit, - ctx, asm, - ocb, ci, cme, comptime_recv, - comptime_recv_klass, flags, argc, ); @@ -6173,13 +9453,25 @@ fn gen_send_general( } } } + VM_METHOD_TYPE_ZSUPER => { + gen_counter_incr(jit, asm, Counter::send_zsuper_method); + return None; + } + VM_METHOD_TYPE_UNDEF => { + gen_counter_incr(jit, asm, Counter::send_undef_method); + return None; + } + 
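// A standalone model of the stack shift performed for `.send` above (see
// handle_opt_send_shift_stack): the method name sits between the receiver and
// the arguments and is squeezed out so the remaining layout matches an ordinary
// call. Plain i64 values stand in for VALUEs; this is illustrative only.
fn shift_out_send_name(stack: &mut Vec<i64>, argc: usize) {
    // Stack layout (top = end of the Vec): .., recv, name, arg1, .., argN
    // Copy each argument one slot toward the receiver, overwriting the name...
    let top = stack.len();
    for j in (0..argc).rev() {
        // index `top - 1 - j` models stack_opnd(j), the value at depth j from the top
        stack[top - 1 - (j + 1)] = stack[top - 1 - j];
    }
    // ...then drop the now-duplicated top slot, which is the net effect of
    // ctx.shift_stack()/asm.shift_stack() in this model.
    stack.truncate(top - 1);
}

fn main() {
    // recv = 10, name = 99 (the :sym), args = 1, 2, 3
    let mut stack = vec![10, 99, 1, 2, 3];
    shift_out_send_name(&mut stack, 3);
    assert_eq!(stack, vec![10, 1, 2, 3]);
}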
VM_METHOD_TYPE_NOTIMPLEMENTED => { + gen_counter_incr(jit, asm, Counter::send_not_implemented_method); + return None; + } VM_METHOD_TYPE_MISSING => { - gen_counter_incr!(asm, send_missing_method); - return CantCompile; + gen_counter_incr(jit, asm, Counter::send_missing_method); + return None; } VM_METHOD_TYPE_REFINED => { - gen_counter_incr!(asm, send_refined_method); - return CantCompile; + gen_counter_incr(jit, asm, Counter::send_refined_method); + return None; } _ => { unreachable!(); @@ -6188,6 +9480,35 @@ fn gen_send_general( } } +/// Get class name from a class pointer. +fn get_class_name(class: Option<VALUE>) -> String { + class.filter(|&class| { + // type checks for rb_class2name() + unsafe { RB_TYPE_P(class, RUBY_T_MODULE) || RB_TYPE_P(class, RUBY_T_CLASS) } + }).and_then(|class| unsafe { + cstr_to_rust_string(rb_class2name(class)) + }).unwrap_or_else(|| "Unknown".to_string()) +} + +/// Assemble "{class_name}#{method_name}" from a class pointer and a method ID +fn get_method_name(class: Option<VALUE>, mid: u64) -> String { + let class_name = get_class_name(class); + let method_name = if mid != 0 { + unsafe { cstr_to_rust_string(rb_id2name(mid)) } + } else { + None + }.unwrap_or_else(|| "Unknown".to_string()); + format!("{}#{}", class_name, method_name) +} + +/// Assemble "{label}@{iseq_path}:{lineno}" (iseq_inspect() format) from an ISEQ +fn get_iseq_name(iseq: IseqPtr) -> String { + let c_string = unsafe { rb_yjit_iseq_inspect(iseq) }; + let string = unsafe { CStr::from_ptr(c_string) }.to_str() + .unwrap_or_else(|_| "not UTF-8").to_string(); + unsafe { ruby_xfree(c_string as *mut c_void); } + string +} /// Shifts the stack for send in order to remove the name of the method /// Comment below borrow from vm_call_opt_send in vm_insnhelper.c @@ -6204,164 +9525,324 @@ fn gen_send_general( ///--+------+--------+------+------ /// /// We do this for our compiletime context and the actual stack -fn handle_opt_send_shift_stack(asm: &mut Assembler, argc: i32, ctx: &mut Context) { - asm.comment("shift_stack"); +fn handle_opt_send_shift_stack(asm: &mut Assembler, argc: i32) { + asm_comment!(asm, "shift_stack"); for j in (0..argc).rev() { - let opnd = ctx.stack_opnd(j); - let opnd2 = ctx.stack_opnd(j + 1); + let opnd = asm.stack_opnd(j); + let opnd2 = asm.stack_opnd(j + 1); asm.mov(opnd2, opnd); } - ctx.shift_stack(argc as usize); + asm.shift_stack(argc as usize); } fn gen_opt_send_without_block( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - let cd = jit_get_arg(jit, 0).as_ptr(); +) -> Option<CodegenStatus> { + // Generate specialized code if possible + let cd = jit.get_arg(0).as_ptr(); + if let Some(status) = perf_call! 
{ gen_send_general(jit, asm, cd, None) } { + return Some(status); + } - gen_send_general(jit, ctx, asm, ocb, cd, None) + // Otherwise, fallback to dynamic dispatch using the interpreter's implementation of send + gen_send_dynamic(jit, asm, cd, unsafe { rb_yjit_sendish_sp_pops((*cd).ci) }, |asm| { + extern "C" { + fn rb_vm_opt_send_without_block(ec: EcPtr, cfp: CfpPtr, cd: VALUE) -> VALUE; + } + asm.ccall( + rb_vm_opt_send_without_block as *const u8, + vec![EC, CFP, (cd as usize).into()], + ) + }) } fn gen_send( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - let cd = jit_get_arg(jit, 0).as_ptr(); - let block = jit_get_arg(jit, 1).as_optional_ptr(); - return gen_send_general(jit, ctx, asm, ocb, cd, block); +) -> Option<CodegenStatus> { + // Generate specialized code if possible + let cd = jit.get_arg(0).as_ptr(); + let block = jit.get_arg(1).as_optional_ptr().map(|iseq| BlockHandler::BlockISeq(iseq)); + if let Some(status) = perf_call! { gen_send_general(jit, asm, cd, block) } { + return Some(status); + } + + // Otherwise, fallback to dynamic dispatch using the interpreter's implementation of send + let blockiseq = jit.get_arg(1).as_iseq(); + gen_send_dynamic(jit, asm, cd, unsafe { rb_yjit_sendish_sp_pops((*cd).ci) }, |asm| { + extern "C" { + fn rb_vm_send(ec: EcPtr, cfp: CfpPtr, cd: VALUE, blockiseq: IseqPtr) -> VALUE; + } + asm.ccall( + rb_vm_send as *const u8, + vec![EC, CFP, (cd as usize).into(), VALUE(blockiseq as usize).into()], + ) + }) +} + +fn gen_sendforward( + jit: &mut JITState, + asm: &mut Assembler, +) -> Option<CodegenStatus> { + // Generate specialized code if possible + let cd = jit.get_arg(0).as_ptr(); + let block = jit.get_arg(1).as_optional_ptr().map(|iseq| BlockHandler::BlockISeq(iseq)); + if let Some(status) = perf_call! 
{ gen_send_general(jit, asm, cd, block) } { + return Some(status); + } + + // Otherwise, fallback to dynamic dispatch using the interpreter's implementation of sendforward + let blockiseq = jit.get_arg(1).as_iseq(); + gen_send_dynamic(jit, asm, cd, unsafe { rb_yjit_sendish_sp_pops((*cd).ci) }, |asm| { + extern "C" { + fn rb_vm_sendforward(ec: EcPtr, cfp: CfpPtr, cd: VALUE, blockiseq: IseqPtr) -> VALUE; + } + asm.ccall( + rb_vm_sendforward as *const u8, + vec![EC, CFP, (cd as usize).into(), VALUE(blockiseq as usize).into()], + ) + }) } fn gen_invokeblock( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - if !jit_at_current_insn(jit) { - defer_compilation(jit, ctx, asm, ocb); - return EndBlock; +) -> Option<CodegenStatus> { + // Generate specialized code if possible + let cd = jit.get_arg(0).as_ptr(); + if let Some(status) = gen_invokeblock_specialized(jit, asm, cd) { + return Some(status); + } + + // Otherwise, fallback to dynamic dispatch using the interpreter's implementation of send + gen_send_dynamic(jit, asm, cd, unsafe { rb_yjit_invokeblock_sp_pops((*cd).ci) }, |asm| { + extern "C" { + fn rb_vm_invokeblock(ec: EcPtr, cfp: CfpPtr, cd: VALUE) -> VALUE; + } + asm.ccall( + rb_vm_invokeblock as *const u8, + vec![EC, CFP, (cd as usize).into()], + ) + }) +} + +fn gen_invokeblock_specialized( + jit: &mut JITState, + asm: &mut Assembler, + cd: *const rb_call_data, +) -> Option<CodegenStatus> { + if !jit.at_compile_target() { + return jit.defer_compilation(asm); + } + + // Fallback to dynamic dispatch if this callsite is megamorphic + if asm.ctx.get_chain_depth() >= SEND_MAX_DEPTH { + gen_counter_incr(jit, asm, Counter::invokeblock_megamorphic); + return None; } // Get call info - let cd = jit_get_arg(jit, 0).as_ptr(); let ci = unsafe { get_call_data_ci(cd) }; let argc: i32 = unsafe { vm_ci_argc(ci) }.try_into().unwrap(); let flags = unsafe { vm_ci_flag(ci) }; // Get block_handler - let cfp = unsafe { get_ec_cfp(jit.ec.unwrap()) }; + let cfp = jit.get_cfp(); let lep = unsafe { rb_vm_ep_local_ep(get_cfp_ep(cfp)) }; let comptime_handler = unsafe { *lep.offset(VM_ENV_DATA_INDEX_SPECVAL.try_into().unwrap()) }; // Handle each block_handler type if comptime_handler.0 == VM_BLOCK_HANDLER_NONE as usize { // no block given - gen_counter_incr!(asm, invokeblock_none); - CantCompile + gen_counter_incr(jit, asm, Counter::invokeblock_none); + None } else if comptime_handler.0 & 0x3 == 0x1 { // VM_BH_ISEQ_BLOCK_P - asm.comment("get local EP"); + asm_comment!(asm, "get local EP"); let ep_opnd = gen_get_lep(jit, asm); let block_handler_opnd = asm.load( Opnd::mem(64, ep_opnd, SIZEOF_VALUE_I32 * VM_ENV_DATA_INDEX_SPECVAL) ); - asm.comment("guard block_handler type"); - let side_exit = get_side_exit(jit, ocb, ctx); + asm_comment!(asm, "guard block_handler type"); let tag_opnd = asm.and(block_handler_opnd, 0x3.into()); // block_handler is a tagged pointer asm.cmp(tag_opnd, 0x1.into()); // VM_BH_ISEQ_BLOCK_P - asm.jne(counted_exit!(ocb, side_exit, invokeblock_iseq_tag_changed)); + jit_chain_guard( + JCC_JNE, + jit, + asm, + SEND_MAX_DEPTH, + Counter::guard_invokeblock_tag_changed, + ); + + // If the current ISEQ is annotated to be inlined but it's not being inlined here, + // generate a dynamic dispatch to avoid making this yield megamorphic. 
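// A rough standalone model of the "chain depth" cutoff used above: each failed
// guard adds one more specialized version for a call site, and once the chain
// reaches a maximum depth the site is treated as megamorphic and falls back to a
// single generic path. YJIT chains guarded block versions rather than keeping a
// map, and MAX_DEPTH and the shape key below are illustrative, but the depth
// cutoff plays the same role.
use std::collections::HashMap;

const MAX_DEPTH: usize = 5; // stand-in for SEND_MAX_DEPTH

#[derive(Default)]
struct CallSite {
    // One entry per specialized "chain link", keyed by the observed receiver shape.
    specialized: HashMap<u64, &'static str>,
    megamorphic: bool,
}

impl CallSite {
    fn dispatch(&mut self, receiver_shape: u64) -> &'static str {
        if self.megamorphic {
            return "generic dispatch";
        }
        if let Some(&target) = self.specialized.get(&receiver_shape) {
            return target; // guard hit: fast path
        }
        if self.specialized.len() >= MAX_DEPTH {
            // Too many shapes seen at this site: stop adding guards.
            self.megamorphic = true;
            return "generic dispatch";
        }
        // Compile one more specialized version for this shape.
        self.specialized.insert(receiver_shape, "specialized code");
        "specialized code"
    }
}

fn main() {
    let mut site = CallSite::default();
    for shape in 0..10u64 {
        site.dispatch(shape);
    }
    assert!(site.megamorphic);
}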
+ if unsafe { rb_jit_iseq_builtin_attrs(jit.iseq) } & BUILTIN_ATTR_INLINE_BLOCK != 0 && !asm.ctx.inline() { + gen_counter_incr(jit, asm, Counter::invokeblock_iseq_not_inlined); + return None; + } - // Not supporting vm_callee_setup_block_arg_arg0_splat for now let comptime_captured = unsafe { ((comptime_handler.0 & !0x3) as *const rb_captured_block).as_ref().unwrap() }; let comptime_iseq = unsafe { *comptime_captured.code.iseq.as_ref() }; - if argc == 1 && unsafe { get_iseq_flags_has_lead(comptime_iseq) && !get_iseq_flags_ambiguous_param0(comptime_iseq) } { - gen_counter_incr!(asm, invokeblock_iseq_arg0_splat); - return CantCompile; - } - asm.comment("guard known ISEQ"); + asm_comment!(asm, "guard known ISEQ"); let captured_opnd = asm.and(block_handler_opnd, Opnd::Imm(!0x3)); let iseq_opnd = asm.load(Opnd::mem(64, captured_opnd, SIZEOF_VALUE_I32 * 2)); - asm.cmp(iseq_opnd, (comptime_iseq as usize).into()); - let block_changed_exit = counted_exit!(ocb, side_exit, invokeblock_iseq_block_changed); + asm.cmp(iseq_opnd, VALUE::from(comptime_iseq).into()); jit_chain_guard( JCC_JNE, jit, - ctx, asm, - ocb, - SEND_MAX_CHAIN_DEPTH, - block_changed_exit, + SEND_MAX_DEPTH, + Counter::guard_invokeblock_iseq_block_changed, ); - gen_send_iseq( + perf_call! { gen_send_iseq(jit, asm, comptime_iseq, ci, VM_FRAME_MAGIC_BLOCK, None, 0 as _, None, flags, argc, Some(captured_opnd)) } + } else if comptime_handler.0 & 0x3 == 0x3 { // VM_BH_IFUNC_P + // We aren't handling CALLER_SETUP_ARG and CALLER_REMOVE_EMPTY_KW_SPLAT yet. + if flags & VM_CALL_ARGS_SPLAT != 0 { + gen_counter_incr(jit, asm, Counter::invokeblock_ifunc_args_splat); + return None; + } + if flags & VM_CALL_KW_SPLAT != 0 { + gen_counter_incr(jit, asm, Counter::invokeblock_ifunc_kw_splat); + return None; + } + + asm_comment!(asm, "get local EP"); + let ep_opnd = gen_get_lep(jit, asm); + let block_handler_opnd = asm.load( + Opnd::mem(64, ep_opnd, SIZEOF_VALUE_I32 * VM_ENV_DATA_INDEX_SPECVAL) + ); + + asm_comment!(asm, "guard block_handler type"); + let tag_opnd = asm.and(block_handler_opnd, 0x3.into()); // block_handler is a tagged pointer + asm.cmp(tag_opnd, 0x3.into()); // VM_BH_IFUNC_P + jit_chain_guard( + JCC_JNE, jit, - ctx, asm, - ocb, - comptime_iseq, - ci, - VM_FRAME_MAGIC_BLOCK, - None, - 0 as _, - None, - flags, - argc, - Some(captured_opnd), - ) - } else if comptime_handler.0 & 0x3 == 0x3 { // VM_BH_IFUNC_P - gen_counter_incr!(asm, invokeblock_ifunc); - CantCompile + SEND_MAX_DEPTH, + Counter::guard_invokeblock_tag_changed, + ); + + // The cfunc may not be leaf + jit_prepare_non_leaf_call(jit, asm); + + extern "C" { + fn rb_vm_yield_with_cfunc(ec: EcPtr, captured: *const rb_captured_block, argc: c_int, argv: *const VALUE) -> VALUE; + } + asm_comment!(asm, "call ifunc"); + let captured_opnd = asm.and(block_handler_opnd, Opnd::Imm(!0x3)); + let argv = asm.lea(asm.ctx.sp_opnd(-argc)); + let ret = asm.ccall( + rb_vm_yield_with_cfunc as *const u8, + vec![EC, captured_opnd, argc.into(), argv], + ); + + asm.stack_pop(argc.try_into().unwrap()); + let stack_ret = asm.stack_push(Type::Unknown); + asm.mov(stack_ret, ret); + + // cfunc calls may corrupt types + asm.clear_local_types(); + + // Share the successor with other chains + jump_to_next_insn(jit, asm) } else if comptime_handler.symbol_p() { - gen_counter_incr!(asm, invokeblock_symbol); - CantCompile + gen_counter_incr(jit, asm, Counter::invokeblock_symbol); + None } else { // Proc - gen_counter_incr!(asm, invokeblock_proc); - CantCompile + gen_counter_incr(jit, asm, Counter::invokeblock_proc); 
+ None } } fn gen_invokesuper( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - let cd: *const rb_call_data = jit_get_arg(jit, 0).as_ptr(); - let block: Option<IseqPtr> = jit_get_arg(jit, 1).as_optional_ptr(); +) -> Option<CodegenStatus> { + // Generate specialized code if possible + let cd = jit.get_arg(0).as_ptr(); + if let Some(status) = gen_invokesuper_specialized(jit, asm, cd) { + return Some(status); + } + // Otherwise, fallback to dynamic dispatch using the interpreter's implementation of invokesuper + let blockiseq = jit.get_arg(1).as_iseq(); + gen_send_dynamic(jit, asm, cd, unsafe { rb_yjit_sendish_sp_pops((*cd).ci) }, |asm| { + extern "C" { + fn rb_vm_invokesuper(ec: EcPtr, cfp: CfpPtr, cd: VALUE, blockiseq: IseqPtr) -> VALUE; + } + asm.ccall( + rb_vm_invokesuper as *const u8, + vec![EC, CFP, (cd as usize).into(), VALUE(blockiseq as usize).into()], + ) + }) +} + +fn gen_invokesuperforward( + jit: &mut JITState, + asm: &mut Assembler, +) -> Option<CodegenStatus> { + // Generate specialized code if possible + let cd = jit.get_arg(0).as_ptr(); + if let Some(status) = gen_invokesuper_specialized(jit, asm, cd) { + return Some(status); + } + + // Otherwise, fallback to dynamic dispatch using the interpreter's implementation of invokesuperforward + let blockiseq = jit.get_arg(1).as_iseq(); + gen_send_dynamic(jit, asm, cd, unsafe { rb_yjit_sendish_sp_pops((*cd).ci) }, |asm| { + extern "C" { + fn rb_vm_invokesuperforward(ec: EcPtr, cfp: CfpPtr, cd: VALUE, blockiseq: IseqPtr) -> VALUE; + } + asm.ccall( + rb_vm_invokesuperforward as *const u8, + vec![EC, CFP, (cd as usize).into(), VALUE(blockiseq as usize).into()], + ) + }) +} + +fn gen_invokesuper_specialized( + jit: &mut JITState, + asm: &mut Assembler, + cd: *const rb_call_data, +) -> Option<CodegenStatus> { // Defer compilation so we can specialize on class of receiver - if !jit_at_current_insn(jit) { - defer_compilation(jit, ctx, asm, ocb); - return EndBlock; + if !jit.at_compile_target() { + return jit.defer_compilation(asm); } - let me = unsafe { rb_vm_frame_method_entry(get_ec_cfp(jit.ec.unwrap())) }; + // Handle the last two branches of vm_caller_setup_arg_block + let block = if let Some(iseq) = jit.get_arg(1).as_optional_ptr() { + BlockHandler::BlockISeq(iseq) + } else { + BlockHandler::LEPSpecVal + }; + + // Fallback to dynamic dispatch if this callsite is megamorphic + if asm.ctx.get_chain_depth() >= SEND_MAX_DEPTH { + gen_counter_incr(jit, asm, Counter::invokesuper_megamorphic); + return None; + } + + let me = unsafe { rb_vm_frame_method_entry(jit.get_cfp()) }; if me.is_null() { - return CantCompile; + gen_counter_incr(jit, asm, Counter::invokesuper_no_me); + return None; } // FIXME: We should track and invalidate this block when this cme is invalidated let current_defined_class = unsafe { (*me).defined_class }; let mid = unsafe { get_def_original_id((*me).def) }; - if me != unsafe { rb_callable_method_entry(current_defined_class, (*me).called_id) } { - // Though we likely could generate this call, as we are only concerned - // with the method entry remaining valid, assume_method_lookup_stable - // below requires that the method lookup matches as well - return CantCompile; - } - // vm_search_normal_superclass let rbasic_ptr: *const RBasic = current_defined_class.as_ptr(); if current_defined_class.builtin_type() == RUBY_T_ICLASS && unsafe { RB_TYPE_P((*rbasic_ptr).klass, RUBY_T_MODULE) && FL_TEST_RAW((*rbasic_ptr).klass, VALUE(RMODULE_IS_REFINEMENT.as_usize())) != 
VALUE(0) } { - return CantCompile; + gen_counter_incr(jit, asm, Counter::invokesuper_refinement); + return None; } let comptime_superclass = unsafe { rb_class_get_superclass(RCLASS_ORIGIN(current_defined_class)) }; @@ -6375,16 +9856,16 @@ fn gen_invokesuper( // Note, not using VM_CALL_ARGS_SIMPLE because sometimes we pass a block. if ci_flags & VM_CALL_KWARG != 0 { - gen_counter_incr!(asm, send_keywords); - return CantCompile; + gen_counter_incr(jit, asm, Counter::invokesuper_kwarg); + return None; } if ci_flags & VM_CALL_KW_SPLAT != 0 { - gen_counter_incr!(asm, send_kw_splat); - return CantCompile; + gen_counter_incr(jit, asm, Counter::invokesuper_kw_splat); + return None; } - if ci_flags & VM_CALL_ARGS_BLOCKARG != 0 { - gen_counter_incr!(asm, send_block_arg); - return CantCompile; + if ci_flags & VM_CALL_FORWARDING != 0 { + gen_counter_incr(jit, asm, Counter::invokesuper_forwarding); + return None; } // Ensure we haven't rebound this method onto an incompatible class. @@ -6392,267 +9873,259 @@ fn gen_invokesuper( // cheaper calculations first, but since we specialize on the method entry // and so only have to do this once at compile time this is fine to always // check and side exit. - let comptime_recv = jit_peek_at_stack(jit, ctx, argc as isize); + let comptime_recv = jit.peek_at_stack(&asm.ctx, argc as isize); if unsafe { rb_obj_is_kind_of(comptime_recv, current_defined_class) } == VALUE(0) { - return CantCompile; + gen_counter_incr(jit, asm, Counter::invokesuper_defined_class_mismatch); + return None; + } + + // Don't compile `super` on objects with singleton class to avoid retaining the receiver. + if VALUE(0) != unsafe { FL_TEST(comptime_recv.class_of(), VALUE(RUBY_FL_SINGLETON as usize)) } { + gen_counter_incr(jit, asm, Counter::invokesuper_singleton_class); + return None; } // Do method lookup let cme = unsafe { rb_callable_method_entry(comptime_superclass, mid) }; - if cme.is_null() { - return CantCompile; + gen_counter_incr(jit, asm, Counter::invokesuper_no_cme); + return None; } // Check that we'll be able to write this method dispatch before generating checks let cme_def_type = unsafe { get_cme_def_type(cme) }; if cme_def_type != VM_METHOD_TYPE_ISEQ && cme_def_type != VM_METHOD_TYPE_CFUNC { // others unimplemented - return CantCompile; + gen_counter_incr(jit, asm, Counter::invokesuper_not_iseq_or_cfunc); + return None; } - // Guard that the receiver has the same class as the one from compile time - let side_exit = get_side_exit(jit, ocb, ctx); - - let cfp = unsafe { get_ec_cfp(jit.ec.unwrap()) }; - let ep = unsafe { get_cfp_ep(cfp) }; - let cref_me = unsafe { *ep.offset(VM_ENV_DATA_INDEX_ME_CREF.try_into().unwrap()) }; - let me_as_value = VALUE(me as usize); - if cref_me != me_as_value { - // This will be the case for super within a block - return CantCompile; - } - - asm.comment("guard known me"); - let ep_opnd = asm.load(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_EP)); + asm_comment!(asm, "guard known me"); + let lep_opnd = gen_get_lep(jit, asm); let ep_me_opnd = Opnd::mem( 64, - ep_opnd, + lep_opnd, SIZEOF_VALUE_I32 * VM_ENV_DATA_INDEX_ME_CREF, ); + + let me_as_value = VALUE(me as usize); asm.cmp(ep_me_opnd, me_as_value.into()); - asm.jne(counted_exit!(ocb, side_exit, invokesuper_me_changed)); - - if block.is_none() { - // Guard no block passed - // rb_vm_frame_block_handler(GET_EC()->cfp) == VM_BLOCK_HANDLER_NONE - // note, we assume VM_ASSERT(VM_ENV_LOCAL_P(ep)) - // - // TODO: this could properly forward the current block handler, but - // would require changes to gen_send_* - 
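// A simplified standalone model of how `super` resolves above: the lookup starts
// at the superclass of the class that *defines* the currently running method, not
// at the receiver's class. The flat table below is illustrative only; module
// origins, refinements, and singleton classes are deliberately ignored here.
use std::collections::HashMap;

struct Class {
    superclass: Option<usize>, // index of the parent in the table, None for the root
    methods: HashMap<&'static str, &'static str>,
}

fn super_lookup(classes: &[Class], defined_in: usize, mid: &str) -> Option<&'static str> {
    // Start one level above the defining class and walk up the chain.
    let mut current = classes[defined_in].superclass;
    while let Some(idx) = current {
        if let Some(&body) = classes[idx].methods.get(mid) {
            return Some(body);
        }
        current = classes[idx].superclass;
    }
    None
}

fn main() {
    // Index 0: A (root, defines #greet); index 1: B < A (also defines #greet).
    let classes = vec![
        Class { superclass: None,    methods: HashMap::from([("greet", "A#greet")]) },
        Class { superclass: Some(0), methods: HashMap::from([("greet", "B#greet")]) },
    ];
    // `super` inside B#greet starts the search at B's superclass and finds A#greet,
    // regardless of how specialized the receiver's own class is.
    assert_eq!(super_lookup(&classes, 1, "greet"), Some("A#greet"));
}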
asm.comment("guard no block given"); - // EP is in REG0 from above - let ep_opnd = asm.load(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_EP)); - let ep_specval_opnd = Opnd::mem( - 64, - ep_opnd, - SIZEOF_VALUE_I32 * VM_ENV_DATA_INDEX_SPECVAL, - ); - asm.cmp(ep_specval_opnd, VM_BLOCK_HANDLER_NONE.into()); - asm.jne(counted_exit!(ocb, side_exit, invokesuper_block)); - } + jit_chain_guard( + JCC_JNE, + jit, + asm, + SEND_MAX_DEPTH, + Counter::guard_invokesuper_me_changed, + ); // We need to assume that both our current method entry and the super // method entry we invoke remain stable - assume_method_lookup_stable(jit, ocb, me); - assume_method_lookup_stable(jit, ocb, cme); + jit.assume_method_lookup_stable(asm, me); + jit.assume_method_lookup_stable(asm, cme); // Method calls may corrupt types - ctx.clear_local_types(); + asm.clear_local_types(); match cme_def_type { VM_METHOD_TYPE_ISEQ => { let iseq = unsafe { get_def_iseq_ptr((*cme).def) }; let frame_type = VM_FRAME_MAGIC_METHOD | VM_ENV_FLAG_LOCAL; - gen_send_iseq(jit, ctx, asm, ocb, iseq, ci, frame_type, None, cme, block, ci_flags, argc, None) + perf_call! { gen_send_iseq(jit, asm, iseq, ci, frame_type, None, cme, Some(block), ci_flags, argc, None) } } VM_METHOD_TYPE_CFUNC => { - gen_send_cfunc(jit, ctx, asm, ocb, ci, cme, block, ptr::null(), ci_flags, argc) + perf_call! { gen_send_cfunc(jit, asm, ci, cme, Some(block), None, ci_flags, argc) } } _ => unreachable!(), } } fn gen_leave( - jit: &mut JITState, - ctx: &mut Context, + _jit: &mut JITState, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { +) -> Option<CodegenStatus> { // Only the return value should be on the stack - assert_eq!(1, ctx.get_stack_size()); - - // Create a side-exit to fall back to the interpreter - let side_exit = get_side_exit(jit, ocb, ctx); - let ocb_asm = Assembler::new(); + assert_eq!(1, asm.ctx.get_stack_size(), "leave instruction expects stack size 1, but was: {}", asm.ctx.get_stack_size()); // Check for interrupts - gen_check_ints(asm, counted_exit!(ocb, side_exit, leave_se_interrupt)); - ocb_asm.compile(ocb.unwrap()); + gen_check_ints(asm, Counter::leave_se_interrupt); // Pop the current frame (ec->cfp++) // Note: the return PC is already in the previous CFP - asm.comment("pop stack frame"); + asm_comment!(asm, "pop stack frame"); let incr_cfp = asm.add(CFP, RUBY_SIZEOF_CONTROL_FRAME.into()); asm.mov(CFP, incr_cfp); - asm.mov(Opnd::mem(64, EC, RUBY_OFFSET_EC_CFP), CFP); + asm.mov(Opnd::mem(64, EC, RUBY_OFFSET_EC_CFP as i32), CFP); // Load the return value - let retval_opnd = ctx.stack_pop(1); + let retval_opnd = asm.stack_pop(1); - // Move the return value into the C return register for gen_leave_exit() + // Move the return value into the C return register asm.mov(C_RET_OPND, retval_opnd); - // Reload REG_SP for the caller and write the return value. - // Top of the stack is REG_SP[0] since the caller has sp_offset=1. - asm.mov(SP, Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP)); - asm.mov(Opnd::mem(64, SP, 0), C_RET_OPND); - - // Jump to the JIT return address on the frame that was just popped + // Jump to the JIT return address on the frame that was just popped. + // There are a few possible jump targets: + // - gen_leave_exit() and gen_leave_exception(), for C callers + // - Return context set up by gen_send_iseq() + // We don't write the return value to stack memory like the interpreter here. + // Each jump target do it as necessary. 
let offset_to_jit_return = -(RUBY_SIZEOF_CONTROL_FRAME as i32) + RUBY_OFFSET_CFP_JIT_RETURN; asm.jmp_opnd(Opnd::mem(64, CFP, offset_to_jit_return)); - EndBlock + Some(EndBlock) } fn gen_getglobal( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - let gid = jit_get_arg(jit, 0).as_usize(); +) -> Option<CodegenStatus> { + let gid = jit.get_arg(0).as_usize(); // Save the PC and SP because we might make a Ruby call for warning - jit_prepare_routine_call(jit, ctx, asm); + jit_prepare_non_leaf_call(jit, asm); let val_opnd = asm.ccall( rb_gvar_get as *const u8, vec![ gid.into() ] ); - let top = ctx.stack_push(Type::Unknown); + let top = asm.stack_push(Type::Unknown); asm.mov(top, val_opnd); - KeepCompiling + Some(KeepCompiling) } fn gen_setglobal( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - let gid = jit_get_arg(jit, 0).as_usize(); +) -> Option<CodegenStatus> { + let gid = jit.get_arg(0).as_usize(); // Save the PC and SP because we might make a Ruby call for // Kernel#set_trace_var - jit_prepare_routine_call(jit, ctx, asm); + jit_prepare_non_leaf_call(jit, asm); + let val = asm.stack_opnd(0); asm.ccall( rb_gvar_set as *const u8, vec![ gid.into(), - ctx.stack_pop(1), + val, ], ); + asm.stack_pop(1); // Keep it during ccall for GC - KeepCompiling + Some(KeepCompiling) } fn gen_anytostring( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { +) -> Option<CodegenStatus> { // Save the PC and SP since we might call #to_s - jit_prepare_routine_call(jit, ctx, asm); + jit_prepare_non_leaf_call(jit, asm); - let str = ctx.stack_pop(1); - let val = ctx.stack_pop(1); + let str = asm.stack_opnd(0); + let val = asm.stack_opnd(1); let val = asm.ccall(rb_obj_as_string_result as *const u8, vec![str, val]); + asm.stack_pop(2); // Keep them during ccall for GC // Push the return value - let stack_ret = ctx.stack_push(Type::TString); + let stack_ret = asm.stack_push(Type::TString); asm.mov(stack_ret, val); - KeepCompiling + Some(KeepCompiling) } fn gen_objtostring( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - if !jit_at_current_insn(jit) { - defer_compilation(jit, ctx, asm, ocb); - return EndBlock; +) -> Option<CodegenStatus> { + if !jit.at_compile_target() { + return jit.defer_compilation(asm); } - let recv = ctx.stack_opnd(0); - let comptime_recv = jit_peek_at_stack(jit, ctx, 0); + let recv = asm.stack_opnd(0); + let comptime_recv = jit.peek_at_stack(&asm.ctx, 0); if unsafe { RB_TYPE_P(comptime_recv, RUBY_T_STRING) } { - let side_exit = get_side_exit(jit, ocb, ctx); - jit_guard_known_klass( jit, - ctx, asm, - ocb, - comptime_recv.class_of(), recv, - StackOpnd(0), + recv.into(), comptime_recv, SEND_MAX_DEPTH, - side_exit, + Counter::objtostring_not_string, ); + // No work needed. The string value is already on the top of the stack. 
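// A standalone model of the "pop after the ccall" pattern used in gen_setglobal and
// gen_anytostring above: values still on the VM temp stack are GC roots, so operands
// are popped only after the C call that might allocate. The GC here is faked with a
// log that records which values it could still see; everything below is illustrative.
struct ToyVm {
    stack: Vec<u64>, // the temp stack doubles as the root set in this model
}

impl ToyVm {
    // A "C call" that may trigger GC before using its argument.
    fn ccall_that_allocates(&mut self, arg: u64, gc_log: &mut Vec<Vec<u64>>) -> u64 {
        // GC runs: it can only see values that are still on the stack.
        gc_log.push(self.stack.clone());
        arg.wrapping_mul(2)
    }

    fn set_global_like(&mut self, gc_log: &mut Vec<Vec<u64>>) -> u64 {
        // Read the operand without popping it (like asm.stack_opnd(0)).
        let val = *self.stack.last().expect("operand on stack");
        let ret = self.ccall_that_allocates(val, gc_log);
        // Only now is it safe to drop the operand (like asm.stack_pop(1)).
        self.stack.pop();
        ret
    }
}

fn main() {
    let mut vm = ToyVm { stack: vec![0x1234] };
    let mut gc_log = Vec::new();
    vm.set_global_like(&mut gc_log);
    // The operand was visible to the GC that ran during the call.
    assert_eq!(gc_log[0], vec![0x1234]);
    assert!(vm.stack.is_empty());
}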
- KeepCompiling + Some(KeepCompiling) + } else if unsafe { RB_TYPE_P(comptime_recv, RUBY_T_SYMBOL) } && assume_method_basic_definition(jit, asm, comptime_recv.class_of(), ID!(to_s)) { + jit_guard_known_klass( + jit, + asm, + recv, + recv.into(), + comptime_recv, + SEND_MAX_DEPTH, + Counter::objtostring_not_string, + ); + + extern "C" { + fn rb_sym2str(sym: VALUE) -> VALUE; + } + + // Same optimization done in the interpreter: rb_sym_to_s() allocates a mutable string, but since we are only + // going to use this string for interpolation, it's fine to use the + // frozen string. + // rb_sym2str does not allocate. + let sym = recv; + let str = asm.ccall(rb_sym2str as *const u8, vec![sym]); + asm.stack_pop(1); + + // Push the return value + let stack_ret = asm.stack_push(Type::TString); + asm.mov(stack_ret, str); + + Some(KeepCompiling) } else { - let cd = jit_get_arg(jit, 0).as_ptr(); - gen_send_general(jit, ctx, asm, ocb, cd, None) + let cd = jit.get_arg(0).as_ptr(); + perf_call! { gen_send_general(jit, asm, cd, None) } } } fn gen_intern( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { +) -> Option<CodegenStatus> { // Save the PC and SP because we might allocate - jit_prepare_routine_call(jit, ctx, asm); + jit_prepare_call_with_gc(jit, asm); - let str = ctx.stack_pop(1); + let str = asm.stack_opnd(0); let sym = asm.ccall(rb_str_intern as *const u8, vec![str]); + asm.stack_pop(1); // Keep it during ccall for GC // Push the return value - let stack_ret = ctx.stack_push(Type::Unknown); + let stack_ret = asm.stack_push(Type::Unknown); asm.mov(stack_ret, sym); - KeepCompiling + Some(KeepCompiling) } fn gen_toregexp( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - let opt = jit_get_arg(jit, 0).as_i64(); - let cnt = jit_get_arg(jit, 1).as_usize(); +) -> Option<CodegenStatus> { + let opt = jit.get_arg(0).as_i64(); + let cnt = jit.get_arg(1).as_usize(); // Save the PC and SP because this allocates an object and could // raise an exception. - jit_prepare_routine_call(jit, ctx, asm); + jit_prepare_non_leaf_call(jit, asm); - let values_ptr = asm.lea(ctx.sp_opnd(-((SIZEOF_VALUE as isize) * (cnt as isize)))); - ctx.stack_pop(cnt); + let values_ptr = asm.lea(asm.ctx.sp_opnd(-(cnt as i32))); let ary = asm.ccall( rb_ary_tmp_new_from_values as *const u8, @@ -6662,6 +10135,7 @@ fn gen_toregexp( values_ptr, ] ); + asm.stack_pop(cnt); // Let ccall spill them // Save the array so we can clear it later asm.cpush(ary); @@ -6681,77 +10155,75 @@ fn gen_toregexp( asm.cpop_into(ary); // The value we want to push on the stack is in RAX right now - let stack_ret = ctx.stack_push(Type::Unknown); + let stack_ret = asm.stack_push(Type::UnknownHeap); asm.mov(stack_ret, val); // Clear the temp array. 
asm.ccall(rb_ary_clear as *const u8, vec![ary]); - KeepCompiling + Some(KeepCompiling) } fn gen_getspecial( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { +) -> Option<CodegenStatus> { // This takes two arguments, key and type // key is only used when type == 0 // A non-zero type determines which type of backref to fetch - //rb_num_t key = jit_get_arg(jit, 0); - let rtype = jit_get_arg(jit, 1).as_u64(); + //rb_num_t key = jit.jit_get_arg(0); + let rtype = jit.get_arg(1).as_u64(); if rtype == 0 { // not yet implemented - return CantCompile; + return None; } else if rtype & 0x01 != 0 { // Fetch a "special" backref based on a char encoded by shifting by 1 // Can raise if matchdata uninitialized - jit_prepare_routine_call(jit, ctx, asm); + jit_prepare_non_leaf_call(jit, asm); // call rb_backref_get() - asm.comment("rb_backref_get"); + asm_comment!(asm, "rb_backref_get"); let backref = asm.ccall(rb_backref_get as *const u8, vec![]); let rt_u8: u8 = (rtype >> 1).try_into().unwrap(); let val = match rt_u8.into() { '&' => { - asm.comment("rb_reg_last_match"); + asm_comment!(asm, "rb_reg_last_match"); asm.ccall(rb_reg_last_match as *const u8, vec![backref]) } '`' => { - asm.comment("rb_reg_match_pre"); + asm_comment!(asm, "rb_reg_match_pre"); asm.ccall(rb_reg_match_pre as *const u8, vec![backref]) } '\'' => { - asm.comment("rb_reg_match_post"); + asm_comment!(asm, "rb_reg_match_post"); asm.ccall(rb_reg_match_post as *const u8, vec![backref]) } '+' => { - asm.comment("rb_reg_match_last"); + asm_comment!(asm, "rb_reg_match_last"); asm.ccall(rb_reg_match_last as *const u8, vec![backref]) } _ => panic!("invalid back-ref"), }; - let stack_ret = ctx.stack_push(Type::Unknown); + let stack_ret = asm.stack_push(Type::Unknown); asm.mov(stack_ret, val); - KeepCompiling + Some(KeepCompiling) } else { // Fetch the N-th match from the last backref based on type shifted by 1 // Can raise if matchdata uninitialized - jit_prepare_routine_call(jit, ctx, asm); + jit_prepare_non_leaf_call(jit, asm); // call rb_backref_get() - asm.comment("rb_backref_get"); + asm_comment!(asm, "rb_backref_get"); let backref = asm.ccall(rb_backref_get as *const u8, vec![]); // rb_reg_nth_match((int)(type >> 1), backref); - asm.comment("rb_reg_nth_match"); + asm_comment!(asm, "rb_reg_nth_match"); let val = asm.ccall( rb_reg_nth_match as *const u8, vec![ @@ -6760,75 +10232,71 @@ fn gen_getspecial( ] ); - let stack_ret = ctx.stack_push(Type::Unknown); + let stack_ret = asm.stack_push(Type::Unknown); asm.mov(stack_ret, val); - KeepCompiling + Some(KeepCompiling) } } fn gen_getclassvariable( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { +) -> Option<CodegenStatus> { // rb_vm_getclassvariable can raise exceptions. 
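// A standalone decoder for the `getspecial` type operand handled above: bit 0
// distinguishes the named back-references ($&, $`, $', $+), whose character is
// stored shifted left by one, from the numbered ones ($1, $2, ...), whose index
// is also stored shifted left by one. The encoding shown matches the checks in
// the generated code above; the enum itself is just for illustration.
#[derive(Debug, PartialEq)]
enum Backref {
    Special(char), // $&, $`, $', $+
    Nth(u32),      // $1, $2, ...
}

fn decode_getspecial_type(rtype: u64) -> Option<Backref> {
    if rtype == 0 {
        None // key-based lookup; not modelled here (and not compiled above)
    } else if rtype & 0x01 != 0 {
        Some(Backref::Special((rtype >> 1) as u8 as char))
    } else {
        Some(Backref::Nth((rtype >> 1) as u32))
    }
}

fn main() {
    assert_eq!(decode_getspecial_type((('&' as u64) << 1) | 1), Some(Backref::Special('&')));
    assert_eq!(decode_getspecial_type(3 << 1), Some(Backref::Nth(3)));
    assert_eq!(decode_getspecial_type(0), None);
}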
- jit_prepare_routine_call(jit, ctx, asm); + jit_prepare_non_leaf_call(jit, asm); let val_opnd = asm.ccall( rb_vm_getclassvariable as *const u8, vec![ - Opnd::mem(64, CFP, RUBY_OFFSET_CFP_ISEQ), + VALUE(jit.iseq as usize).into(), CFP, - Opnd::UImm(jit_get_arg(jit, 0).as_u64()), - Opnd::UImm(jit_get_arg(jit, 1).as_u64()), + Opnd::UImm(jit.get_arg(0).as_u64()), + Opnd::UImm(jit.get_arg(1).as_u64()), ], ); - let top = ctx.stack_push(Type::Unknown); + let top = asm.stack_push(Type::Unknown); asm.mov(top, val_opnd); - KeepCompiling + Some(KeepCompiling) } fn gen_setclassvariable( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { +) -> Option<CodegenStatus> { // rb_vm_setclassvariable can raise exceptions. - jit_prepare_routine_call(jit, ctx, asm); + jit_prepare_non_leaf_call(jit, asm); + let val = asm.stack_opnd(0); asm.ccall( rb_vm_setclassvariable as *const u8, vec![ - Opnd::mem(64, CFP, RUBY_OFFSET_CFP_ISEQ), + VALUE(jit.iseq as usize).into(), CFP, - Opnd::UImm(jit_get_arg(jit, 0).as_u64()), - ctx.stack_pop(1), - Opnd::UImm(jit_get_arg(jit, 1).as_u64()), + Opnd::UImm(jit.get_arg(0).as_u64()), + val, + Opnd::UImm(jit.get_arg(1).as_u64()), ], ); + asm.stack_pop(1); // Keep it during ccall for GC - KeepCompiling + Some(KeepCompiling) } fn gen_getconstant( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { +) -> Option<CodegenStatus> { - let id = jit_get_arg(jit, 0).as_usize(); + let id = jit.get_arg(0).as_usize(); // vm_get_ev_const can raise exceptions. - jit_prepare_routine_call(jit, ctx, asm); + jit_prepare_non_leaf_call(jit, asm); - let allow_nil_opnd = ctx.stack_pop(1); - let klass_opnd = ctx.stack_pop(1); + let allow_nil_opnd = asm.stack_opnd(0); + let klass_opnd = asm.stack_opnd(1); extern "C" { fn rb_vm_get_ev_const(ec: EcPtr, klass: VALUE, id: ID, allow_nil: VALUE) -> VALUE; @@ -6843,42 +10311,58 @@ fn gen_getconstant( allow_nil_opnd ], ); + asm.stack_pop(2); // Keep them during ccall for GC - let top = ctx.stack_push(Type::Unknown); + let top = asm.stack_push(Type::Unknown); asm.mov(top, val_opnd); - KeepCompiling + Some(KeepCompiling) } fn gen_opt_getconstant_path( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - let const_cache_as_value = jit_get_arg(jit, 0); +) -> Option<CodegenStatus> { + let const_cache_as_value = jit.get_arg(0); let ic: *const iseq_inline_constant_cache = const_cache_as_value.as_ptr(); let idlist: *const ID = unsafe { (*ic).segments }; + // Make sure there is an exit for this block as the interpreter might want + // to invalidate this block from yjit_constant_ic_update(). + jit_ensure_block_entry_exit(jit, asm)?; + // See vm_ic_hit_p(). The same conditions are checked in yjit_constant_ic_update(). + // If a cache is not filled, fallback to the general C call. let ice = unsafe { (*ic).entry }; if ice.is_null() { - // In this case, leave a block that unconditionally side exits - // for the interpreter to invalidate. - return CantCompile; + // Prepare for const_missing + jit_prepare_non_leaf_call(jit, asm); + + // If this does not trigger const_missing, vm_ic_update will invalidate this block. 
+ extern "C" { + fn rb_vm_opt_getconstant_path(ec: EcPtr, cfp: CfpPtr, ic: *const u8) -> VALUE; + } + let val = asm.ccall( + rb_vm_opt_getconstant_path as *const u8, + vec![EC, CFP, Opnd::const_ptr(ic as *const u8)], + ); + + let stack_top = asm.stack_push(Type::Unknown); + asm.store(stack_top, val); + + return jump_to_next_insn(jit, asm); } - // Make sure there is an exit for this block as the interpreter might want - // to invalidate this block from yjit_constant_ic_update(). - jit_ensure_block_entry_exit(jit, ocb); + let cref_sensitive = !unsafe { (*ice).ic_cref }.is_null(); + let is_shareable = unsafe { rb_yjit_constcache_shareable(ice) }; + let needs_checks = cref_sensitive || (!is_shareable && !assume_single_ractor_mode(jit, asm)); - if !unsafe { (*ice).ic_cref }.is_null() { + if needs_checks { // Cache is keyed on a certain lexical scope. Use the interpreter's cache. - let side_exit = get_side_exit(jit, ocb, ctx); - let inline_cache = asm.load(Opnd::const_ptr(ic as *const u8)); // Call function to verify the cache. It doesn't allocate or call methods. + // This includes a check for Ractor safety let ret_val = asm.ccall( rb_vm_ic_hit_p as *const u8, vec![inline_cache, Opnd::mem(64, CFP, RUBY_OFFSET_CFP_EP)] @@ -6887,7 +10371,7 @@ fn gen_opt_getconstant_path( // Check the result. SysV only specifies one byte for _Bool return values, // so it's important we only check one bit to ignore the higher bits in the register. asm.test(ret_val, 1.into()); - asm.jz(counted_exit!(ocb, side_exit, opt_getinlinecache_miss)); + asm.jz(Target::side_exit(Counter::opt_getconstant_path_ic_miss)); let inline_cache = asm.load(Opnd::const_ptr(ic as *const u8)); @@ -6904,24 +10388,17 @@ fn gen_opt_getconstant_path( )); // Push ic->entry->value - let stack_top = ctx.stack_push(Type::Unknown); + let stack_top = asm.stack_push(Type::Unknown); asm.store(stack_top, ic_entry_val); } else { - // Optimize for single ractor mode. - // FIXME: This leaks when st_insert raises NoMemoryError - if !assume_single_ractor_mode(jit, ocb) { - return CantCompile; - } - // Invalidate output code on any constant writes associated with // constants referenced within the current block. - assume_stable_constant_names(jit, ocb, idlist); + jit.assume_stable_constant_names(asm, idlist); - jit_putobject(jit, ctx, asm, unsafe { (*ice).value }); + jit_putobject(asm, unsafe { (*ice).value }); } - jump_to_next_insn(jit, ctx, asm, ocb); - EndBlock + jump_to_next_insn(jit, asm) } // Push the explicit block parameter onto the temporary stack. Part of the @@ -6929,31 +10406,27 @@ fn gen_opt_getconstant_path( // explicit block parameters. fn gen_getblockparamproxy( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { - if !jit_at_current_insn(jit) { - defer_compilation(jit, ctx, asm, ocb); - return EndBlock; +) -> Option<CodegenStatus> { + if !jit.at_compile_target() { + return jit.defer_compilation(asm); } - let starting_context = ctx.clone(); // make a copy for use with jit_chain_guard - - // A mirror of the interpreter code. Checking for the case - // where it's pushing rb_block_param_proxy. 
- let side_exit = get_side_exit(jit, ocb, ctx); - // EP level - let level = jit_get_arg(jit, 1).as_u32(); + let level = jit.get_arg(1).as_u32(); // Peek at the block handler so we can check whether it's nil - let comptime_handler = jit_peek_at_block_handler(jit, level); + let comptime_handler = jit.peek_at_block_handler(level); - // When a block handler is present, it should always be a GC-guarded - // pointer (VM_BH_ISEQ_BLOCK_P) - if comptime_handler.as_u64() != 0 && comptime_handler.as_u64() & 0x3 != 0x1 { - return CantCompile; + // Filter for the 4 cases we currently handle + if !(comptime_handler.as_u64() == 0 || // no block given + comptime_handler.as_u64() & 0x3 == 0x1 || // iseq block (no associated GC managed object) + comptime_handler.as_u64() & 0x3 == 0x3 || // ifunc block (no associated GC managed object) + unsafe { rb_obj_is_proc(comptime_handler) }.test() // block is a Proc + ) { + // Missing the symbol case, where we basically need to call Symbol#to_proc at runtime + gen_counter_incr(jit, asm, Counter::gbpp_unsupported_type); + return None; } // Load environment pointer EP from CFP @@ -6966,7 +10439,7 @@ fn gen_getblockparamproxy( SIZEOF_VALUE_I32 * (VM_ENV_DATA_INDEX_FLAGS as i32), ); asm.test(flag_check, VM_FRAME_FLAG_MODIFIED_BLOCK_PARAM.into()); - asm.jnz(counted_exit!(ocb, side_exit, gbpp_block_param_modified)); + asm.jnz(Target::side_exit(Counter::gbpp_block_param_modified)); // Load the block handler for the current frame // note, VM_ASSERT(VM_ENV_LOCAL_P(ep)) @@ -6974,7 +10447,12 @@ fn gen_getblockparamproxy( Opnd::mem(64, ep_opnd, SIZEOF_VALUE_I32 * VM_ENV_DATA_INDEX_SPECVAL) ); - // Specialize compilation for the case where no block handler is present + // Use block handler sample to guide specialization... + // NOTE: we use jit_chain_guard() in this decision tree, and since + // there are only a few cases, it should never reach the depth limit use + // the exit counter we pass to it. + // + // No block given if comptime_handler.as_u64() == 0 { // Bail if there is a block handler asm.cmp(block_handler, Opnd::UImm(0)); @@ -6982,58 +10460,85 @@ fn gen_getblockparamproxy( jit_chain_guard( JCC_JNZ, jit, - &starting_context, asm, - ocb, SEND_MAX_DEPTH, - side_exit, + Counter::gbpp_block_handler_not_none, ); - jit_putobject(jit, ctx, asm, Qnil); - } else { - // Block handler is a tagged pointer. Look at the tag. 0x03 is from VM_BH_ISEQ_BLOCK_P(). - let block_handler = asm.and(block_handler, 0x3.into()); - - // Bail unless VM_BH_ISEQ_BLOCK_P(bh). This also checks for null. - asm.cmp(block_handler, 0x1.into()); - + jit_putobject(asm, Qnil); + } else if comptime_handler.as_u64() & 0x1 == 0x1 { + // This handles two cases which are nearly identical + // Block handler is a tagged pointer. Look at the tag. + // VM_BH_ISEQ_BLOCK_P(): block_handler & 0x03 == 0x01 + // VM_BH_IFUNC_P(): block_handler & 0x03 == 0x03 + // So to check for either of those cases we can use: val & 0x1 == 0x1 + const _: () = assert!(RUBY_SYMBOL_FLAG & 1 == 0, "guard below rejects symbol block handlers"); + // Procs are aligned heap pointers so testing the bit rejects them too. + + asm.test(block_handler, 0x1.into()); jit_chain_guard( - JCC_JNZ, + JCC_JZ, jit, - &starting_context, asm, - ocb, SEND_MAX_DEPTH, - side_exit, + Counter::gbpp_block_handler_not_iseq, ); // Push rb_block_param_proxy. It's a root, so no need to use jit_mov_gc_ptr. 
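// A standalone model of the block-handler tagging scheme relied on above. The
// numeric layout is illustrative: Proc objects are modelled as aligned "heap
// pointers" with clear low bits, and static symbols are assumed to have bit 0
// clear (which is what the const assert above checks), so the low two bits
// classify the handler the same way the guards above do.
#[derive(Debug, PartialEq)]
enum BlockHandlerKind {
    None,       // VM_BLOCK_HANDLER_NONE (0): no block given
    IseqBlock,  // tagged pointer, low bits 0b01
    IfuncBlock, // tagged pointer, low bits 0b11
    ProcOrSym,  // untagged object reference (aligned pointer or symbol)
}

fn classify(handler: usize) -> BlockHandlerKind {
    if handler == 0 {
        BlockHandlerKind::None
    } else if handler & 0x3 == 0x1 {
        BlockHandlerKind::IseqBlock
    } else if handler & 0x3 == 0x3 {
        BlockHandlerKind::IfuncBlock
    } else {
        // Low bit clear: either a Proc object or a Symbol; the generated code
        // needs a further runtime check (e.g. rb_obj_is_proc) to tell them apart.
        BlockHandlerKind::ProcOrSym
    }
}

fn main() {
    let captured_block = 0x1000usize; // some aligned captured-block address
    assert_eq!(classify(0), BlockHandlerKind::None);
    assert_eq!(classify(captured_block | 0x1), BlockHandlerKind::IseqBlock);
    assert_eq!(classify(captured_block | 0x3), BlockHandlerKind::IfuncBlock);
    assert_eq!(classify(captured_block), BlockHandlerKind::ProcOrSym);
    // A single-bit test (handler & 1 == 1) accepts both tagged kinds at once,
    // which is what the getblockparamproxy guard above does.
}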
assert!(!unsafe { rb_block_param_proxy }.special_const_p()); - let top = ctx.stack_push(Type::BlockParamProxy); + let top = asm.stack_push(Type::BlockParamProxy); asm.mov(top, Opnd::const_ptr(unsafe { rb_block_param_proxy }.as_ptr())); - } + } else if unsafe { rb_obj_is_proc(comptime_handler) }.test() { + // The block parameter is a Proc + c_callable! { + // We can't hold values across C calls due to a backend limitation, + // so we'll use this thin wrapper around rb_obj_is_proc(). + fn is_proc(object: VALUE) -> VALUE { + if unsafe { rb_obj_is_proc(object) }.test() { + // VM_BH_TO_PROC() is the identify function. + object + } else { + Qfalse + } + } + } + + // Simple predicate, no need to jit_prepare_non_leaf_call() + let proc_or_false = asm.ccall(is_proc as _, vec![block_handler]); - jump_to_next_insn(jit, ctx, asm, ocb); + // Guard for proc + asm.cmp(proc_or_false, Qfalse.into()); + jit_chain_guard( + JCC_JE, + jit, + asm, + SEND_MAX_DEPTH, + Counter::gbpp_block_handler_not_proc, + ); - EndBlock + let top = asm.stack_push(Type::Unknown); + asm.mov(top, proc_or_false); + } else { + unreachable!("absurd given initial filtering"); + } + + jump_to_next_insn(jit, asm) } fn gen_getblockparam( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, -) -> CodegenStatus { +) -> Option<CodegenStatus> { // EP level - let level = jit_get_arg(jit, 1).as_u32(); + let level = jit.get_arg(1).as_u32(); // Save the PC and SP because we might allocate - jit_prepare_routine_call(jit, ctx, asm); + jit_prepare_call_with_gc(jit, asm); + asm.spill_regs(); // For ccall. Unconditionally spill them for RegMappings consistency. // A mirror of the interpreter code. Checking for the case // where it's pushing rb_block_param_proxy. - let side_exit = get_side_exit(jit, ocb, ctx); // Load environment pointer EP from CFP let ep_opnd = gen_get_ep(asm, level); @@ -7061,7 +10566,7 @@ fn gen_getblockparam( asm.test(flags_opnd, VM_ENV_FLAG_WB_REQUIRED.into()); // if (flags & VM_ENV_FLAG_WB_REQUIRED) != 0 - asm.jnz(side_exit); + asm.jnz(Target::side_exit(Counter::gbp_wb_required)); // Convert the block handler in to a proc // call rb_vm_bh_to_procval(const rb_execution_context_t *ec, VALUE block_handler) @@ -7083,7 +10588,7 @@ fn gen_getblockparam( let ep_opnd = gen_get_ep(asm, level); // Write the value at the environment pointer - let idx = jit_get_arg(jit, 0).as_i32(); + let idx = jit.get_arg(0).as_i32(); let offs = -(SIZEOF_VALUE_I32 * idx); asm.mov(Opnd::mem(64, ep_opnd, offs), proc); @@ -7095,47 +10600,46 @@ fn gen_getblockparam( asm.write_label(frame_flag_modified); // Push the proc on the stack - let stack_ret = ctx.stack_push(Type::Unknown); + let stack_ret = asm.stack_push(Type::Unknown); let ep_opnd = gen_get_ep(asm, level); asm.mov(stack_ret, Opnd::mem(64, ep_opnd, offs)); - KeepCompiling + Some(KeepCompiling) } fn gen_invokebuiltin( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - let bf: *const rb_builtin_function = jit_get_arg(jit, 0).as_ptr(); +) -> Option<CodegenStatus> { + let bf: *const rb_builtin_function = jit.get_arg(0).as_ptr(); let bf_argc: usize = unsafe { (*bf).argc }.try_into().expect("non negative argc"); // ec, self, and arguments if bf_argc + 2 > C_ARG_OPNDS.len() { - return CantCompile; + incr_counter!(invokebuiltin_too_many_args); + return None; } // If the calls don't allocate, do they need up to date PC, SP? 
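// A small standalone check mirroring the invokebuiltin guard above: a builtin is
// called as (ec, recv, arg1, ..., argN), so it only fits when N + 2 arguments fit
// in the C argument registers. The register count below is an assumption for
// illustration (System V x86-64 passes 6 integer arguments in registers); the real
// code asks C_ARG_OPNDS.len() for the current platform.
const C_ARG_REGS: usize = 6; // stand-in for C_ARG_OPNDS.len(); platform-dependent

fn builtin_fits_in_registers(bf_argc: usize) -> bool {
    // +2 accounts for the implicit `ec` and `self` arguments.
    bf_argc + 2 <= C_ARG_REGS
}

fn main() {
    assert!(builtin_fits_in_registers(4));  // ec + self + 4 args = 6 registers
    assert!(!builtin_fits_in_registers(5)); // would need a stack argument: bail out
}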
- jit_prepare_routine_call(jit, ctx, asm); + jit_prepare_non_leaf_call(jit, asm); // Call the builtin func (ec, recv, arg1, arg2, ...) let mut args = vec![EC, Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF)]; // Copy arguments from locals for i in 0..bf_argc { - let stack_opnd = ctx.stack_opnd((bf_argc - i - 1) as i32); + let stack_opnd = asm.stack_opnd((bf_argc - i - 1) as i32); args.push(stack_opnd); } let val = asm.ccall(unsafe { (*bf).func_ptr } as *const u8, args); // Push the return value - ctx.stack_pop(bf_argc); - let stack_ret = ctx.stack_push(Type::Unknown); + asm.stack_pop(bf_argc); + let stack_ret = asm.stack_push(Type::Unknown); asm.mov(stack_ret, val); - KeepCompiling + Some(KeepCompiling) } // opt_invokebuiltin_delegate calls a builtin function, like @@ -7143,21 +10647,20 @@ fn gen_invokebuiltin( // stack uses the argument locals (and self) from the current method. fn gen_opt_invokebuiltin_delegate( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - _ocb: &mut OutlinedCb, -) -> CodegenStatus { - let bf: *const rb_builtin_function = jit_get_arg(jit, 0).as_ptr(); +) -> Option<CodegenStatus> { + let bf: *const rb_builtin_function = jit.get_arg(0).as_ptr(); let bf_argc = unsafe { (*bf).argc }; - let start_index = jit_get_arg(jit, 1).as_i32(); + let start_index = jit.get_arg(1).as_i32(); // ec, self, and arguments if bf_argc + 2 > (C_ARG_OPNDS.len() as i32) { - return CantCompile; + incr_counter!(invokebuiltin_too_many_args); + return None; } // If the calls don't allocate, do they need up to date PC, SP? - jit_prepare_routine_call(jit, ctx, asm); + jit_prepare_non_leaf_call(jit, asm); // Call the builtin func (ec, recv, arg1, arg2, ...) let mut args = vec![EC, Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF)]; @@ -7177,10 +10680,10 @@ fn gen_opt_invokebuiltin_delegate( let val = asm.ccall(unsafe { (*bf).func_ptr } as *const u8, args); // Push the return value - let stack_ret = ctx.stack_push(Type::Unknown); + let stack_ret = asm.stack_push(Type::Unknown); asm.mov(stack_ret, val); - KeepCompiling + Some(KeepCompiling) } /// Maps a YARV opcode to a code generation function (if supported) @@ -7195,6 +10698,7 @@ fn get_gen_fn(opcode: VALUE) -> Option<InsnGenFn> { YARVINSN_dup => Some(gen_dup), YARVINSN_dupn => Some(gen_dupn), YARVINSN_swap => Some(gen_swap), + YARVINSN_opt_reverse => Some(gen_opt_reverse), YARVINSN_putnil => Some(gen_putnil), YARVINSN_putobject => Some(gen_putobject), YARVINSN_putobject_INT2FIX_0_ => Some(gen_putobject_int2fix), @@ -7225,16 +10729,24 @@ fn get_gen_fn(opcode: VALUE) -> Option<InsnGenFn> { YARVINSN_opt_gt => Some(gen_opt_gt), YARVINSN_opt_ge => Some(gen_opt_ge), YARVINSN_opt_mod => Some(gen_opt_mod), + YARVINSN_opt_ary_freeze => Some(gen_opt_ary_freeze), + YARVINSN_opt_hash_freeze => Some(gen_opt_hash_freeze), YARVINSN_opt_str_freeze => Some(gen_opt_str_freeze), YARVINSN_opt_str_uminus => Some(gen_opt_str_uminus), - YARVINSN_opt_newarray_max => Some(gen_opt_newarray_max), - YARVINSN_opt_newarray_min => Some(gen_opt_newarray_min), + YARVINSN_opt_duparray_send => Some(gen_opt_duparray_send), + YARVINSN_opt_newarray_send => Some(gen_opt_newarray_send), YARVINSN_splatarray => Some(gen_splatarray), + YARVINSN_splatkw => Some(gen_splatkw), YARVINSN_concatarray => Some(gen_concatarray), + YARVINSN_concattoarray => Some(gen_concattoarray), + YARVINSN_pushtoarray => Some(gen_pushtoarray), YARVINSN_newrange => Some(gen_newrange), YARVINSN_putstring => Some(gen_putstring), + YARVINSN_putchilledstring => Some(gen_putchilledstring), YARVINSN_expandarray => 
Some(gen_expandarray), YARVINSN_defined => Some(gen_defined), + YARVINSN_definedivar => Some(gen_definedivar), + YARVINSN_checkmatch => Some(gen_checkmatch), YARVINSN_checkkeyword => Some(gen_checkkeyword), YARVINSN_concatstrings => Some(gen_concatstrings), YARVINSN_getinstancevariable => Some(gen_getinstancevariable), @@ -7263,14 +10775,18 @@ fn get_gen_fn(opcode: VALUE) -> Option<InsnGenFn> { YARVINSN_branchif => Some(gen_branchif), YARVINSN_branchunless => Some(gen_branchunless), YARVINSN_branchnil => Some(gen_branchnil), + YARVINSN_throw => Some(gen_throw), YARVINSN_jump => Some(gen_jump), + YARVINSN_opt_new => Some(gen_opt_new), YARVINSN_getblockparamproxy => Some(gen_getblockparamproxy), YARVINSN_getblockparam => Some(gen_getblockparam), YARVINSN_opt_send_without_block => Some(gen_opt_send_without_block), YARVINSN_send => Some(gen_send), + YARVINSN_sendforward => Some(gen_sendforward), YARVINSN_invokeblock => Some(gen_invokeblock), YARVINSN_invokesuper => Some(gen_invokesuper), + YARVINSN_invokesuperforward => Some(gen_invokesuperforward), YARVINSN_leave => Some(gen_leave), YARVINSN_getglobal => Some(gen_getglobal), @@ -7288,23 +10804,134 @@ fn get_gen_fn(opcode: VALUE) -> Option<InsnGenFn> { } } -// Return true when the codegen function generates code. -// known_recv_klass is non-NULL when the caller has used jit_guard_known_klass(). -// See yjit_reg_method(). +/// Return true when the codegen function generates code. +/// known_recv_class has Some value when the caller has used jit_guard_known_klass(). +/// See [reg_method_codegen] type MethodGenFn = fn( jit: &mut JITState, - ctx: &mut Context, asm: &mut Assembler, - ocb: &mut OutlinedCb, ci: *const rb_callinfo, cme: *const rb_callable_method_entry_t, - block: Option<IseqPtr>, + block: Option<BlockHandler>, argc: i32, - known_recv_class: *const VALUE, + known_recv_class: Option<VALUE>, ) -> bool; +/// Methods for generating code for hardcoded (usually C) methods +static mut METHOD_CODEGEN_TABLE: Option<HashMap<usize, MethodGenFn>> = None; + +/// Register codegen functions for some Ruby core methods +pub fn yjit_reg_method_codegen_fns() { + unsafe { + assert!(METHOD_CODEGEN_TABLE.is_none()); + METHOD_CODEGEN_TABLE = Some(HashMap::default()); + + // Specialization for C methods. See the function's docs for details. 
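// A simplified standalone model of the codegen table populated below: each
// (class, method) pair is resolved once at boot to a stable key and mapped to a
// function that can emit specialized code, and call sites consult the map before
// falling back to the generic C-function call path. The string key used here is a
// stand-in for the method serial derived from the method entry; this is a sketch
// of the idea, not the actual registration code.
use std::collections::HashMap;

type ToyMethodGenFn = fn(argc: i32) -> bool; // returns true if it generated code

fn gen_obj_not(_argc: i32) -> bool { true } // stand-in for a specialized generator

struct ToyCodegenTable {
    table: HashMap<(&'static str, &'static str), ToyMethodGenFn>,
}

impl ToyCodegenTable {
    fn reg(&mut self, klass: &'static str, name: &'static str, gen_fn: ToyMethodGenFn) {
        self.table.insert((klass, name), gen_fn);
    }

    // Mirrors the idea of lookup_cfunc_codegen(): hand back a specialized generator
    // if one was registered, otherwise None means "use the normal cfunc call".
    fn lookup(&self, klass: &'static str, name: &'static str) -> Option<ToyMethodGenFn> {
        self.table.get(&(klass, name)).copied()
    }
}

fn main() {
    let mut table = ToyCodegenTable { table: HashMap::new() };
    table.reg("BasicObject", "!", gen_obj_not);
    assert!(table.lookup("BasicObject", "!").is_some());
    assert!(table.lookup("String", "upcase").is_none());
}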
+ reg_method_codegen(rb_cBasicObject, "!", jit_rb_obj_not); + + reg_method_codegen(rb_cNilClass, "nil?", jit_rb_true); + reg_method_codegen(rb_mKernel, "nil?", jit_rb_false); + reg_method_codegen(rb_mKernel, "is_a?", jit_rb_kernel_is_a); + reg_method_codegen(rb_mKernel, "kind_of?", jit_rb_kernel_is_a); + reg_method_codegen(rb_mKernel, "instance_of?", jit_rb_kernel_instance_of); + + reg_method_codegen(rb_cBasicObject, "==", jit_rb_obj_equal); + reg_method_codegen(rb_cBasicObject, "equal?", jit_rb_obj_equal); + reg_method_codegen(rb_cBasicObject, "!=", jit_rb_obj_not_equal); + reg_method_codegen(rb_mKernel, "eql?", jit_rb_obj_equal); + reg_method_codegen(rb_cModule, "==", jit_rb_obj_equal); + reg_method_codegen(rb_cModule, "===", jit_rb_mod_eqq); + reg_method_codegen(rb_cModule, "name", jit_rb_mod_name); + reg_method_codegen(rb_cSymbol, "==", jit_rb_obj_equal); + reg_method_codegen(rb_cSymbol, "===", jit_rb_obj_equal); + reg_method_codegen(rb_cInteger, "==", jit_rb_int_equal); + reg_method_codegen(rb_cInteger, "===", jit_rb_int_equal); + + reg_method_codegen(rb_cInteger, "succ", jit_rb_int_succ); + reg_method_codegen(rb_cInteger, "pred", jit_rb_int_pred); + reg_method_codegen(rb_cInteger, "/", jit_rb_int_div); + reg_method_codegen(rb_cInteger, "<<", jit_rb_int_lshift); + reg_method_codegen(rb_cInteger, ">>", jit_rb_int_rshift); + reg_method_codegen(rb_cInteger, "^", jit_rb_int_xor); + reg_method_codegen(rb_cInteger, "[]", jit_rb_int_aref); + + reg_method_codegen(rb_cFloat, "+", jit_rb_float_plus); + reg_method_codegen(rb_cFloat, "-", jit_rb_float_minus); + reg_method_codegen(rb_cFloat, "*", jit_rb_float_mul); + reg_method_codegen(rb_cFloat, "/", jit_rb_float_div); + + reg_method_codegen(rb_cString, "dup", jit_rb_str_dup); + reg_method_codegen(rb_cString, "empty?", jit_rb_str_empty_p); + reg_method_codegen(rb_cString, "to_s", jit_rb_str_to_s); + reg_method_codegen(rb_cString, "to_str", jit_rb_str_to_s); + reg_method_codegen(rb_cString, "length", jit_rb_str_length); + reg_method_codegen(rb_cString, "size", jit_rb_str_length); + reg_method_codegen(rb_cString, "bytesize", jit_rb_str_bytesize); + reg_method_codegen(rb_cString, "getbyte", jit_rb_str_getbyte); + reg_method_codegen(rb_cString, "setbyte", jit_rb_str_setbyte); + reg_method_codegen(rb_cString, "byteslice", jit_rb_str_byteslice); + reg_method_codegen(rb_cString, "[]", jit_rb_str_aref_m); + reg_method_codegen(rb_cString, "slice", jit_rb_str_aref_m); + reg_method_codegen(rb_cString, "<<", jit_rb_str_concat); + reg_method_codegen(rb_cString, "+@", jit_rb_str_uplus); + + reg_method_codegen(rb_cNilClass, "===", jit_rb_case_equal); + reg_method_codegen(rb_cTrueClass, "===", jit_rb_case_equal); + reg_method_codegen(rb_cFalseClass, "===", jit_rb_case_equal); + + reg_method_codegen(rb_cArray, "empty?", jit_rb_ary_empty_p); + reg_method_codegen(rb_cArray, "length", jit_rb_ary_length); + reg_method_codegen(rb_cArray, "size", jit_rb_ary_length); + reg_method_codegen(rb_cArray, "<<", jit_rb_ary_push); + + reg_method_codegen(rb_cHash, "empty?", jit_rb_hash_empty_p); + + reg_method_codegen(rb_mKernel, "respond_to?", jit_obj_respond_to); + reg_method_codegen(rb_mKernel, "block_given?", jit_rb_f_block_given_p); + reg_method_codegen(rb_mKernel, "dup", jit_rb_obj_dup); + + reg_method_codegen(rb_cClass, "superclass", jit_rb_class_superclass); + + reg_method_codegen(rb_singleton_class(rb_cThread), "current", jit_thread_s_current); + } +} + +/// Register a specialized codegen function for a particular method. 
Note that +/// if the function returns true, the code it generates runs without a +/// control frame and without interrupt checks, completely substituting the +/// original implementation of the method. To avoid creating observable +/// behavior changes, prefer targeting simple code paths that do not allocate +/// and do not make method calls. +/// +/// See also: [lookup_cfunc_codegen]. +fn reg_method_codegen(klass: VALUE, method_name: &str, gen_fn: MethodGenFn) { + let mid = unsafe { rb_intern2(method_name.as_ptr().cast(), method_name.len().try_into().unwrap()) }; + let me = unsafe { rb_method_entry_at(klass, mid) }; + + if me.is_null() { + panic!("undefined optimized method!: {method_name}"); + } + + // For now, only cfuncs are supported (me->cme cast fine since it's just me->def->type). + debug_assert_eq!(VM_METHOD_TYPE_CFUNC, unsafe { get_cme_def_type(me.cast()) }); + + let method_serial = unsafe { + let def = (*me).def; + get_def_method_serial(def) + }; + + unsafe { METHOD_CODEGEN_TABLE.as_mut().unwrap().insert(method_serial, gen_fn); } +} + +pub fn yjit_shutdown_free_codegen_table() { + unsafe { METHOD_CODEGEN_TABLE = None; }; +} + /// Global state needed for code generation pub struct CodegenGlobals { + /// Flat vector of bits to store compressed context data + context_data: BitVector, + /// Inline code block (fast path) inline_cb: CodeBlock, @@ -7314,36 +10941,31 @@ pub struct CodegenGlobals { /// Code for exiting back to the interpreter from the leave instruction leave_exit_code: CodePtr, + /// Code for exiting back to the interpreter after handling an exception + leave_exception_code: CodePtr, + // For exiting from YJIT frame from branch_stub_hit(). - // Filled by gen_code_for_exit_from_stub(). + // Filled by gen_stub_exit(). stub_exit_code: CodePtr, // For servicing branch stubs branch_stub_hit_trampoline: CodePtr, + // For servicing entry stubs + entry_stub_hit_trampoline: CodePtr, + // Code for full logic of returning from C method and exiting to the interpreter outline_full_cfunc_return_pos: CodePtr, /// For implementing global code invalidation global_inval_patches: Vec<CodepagePatch>, - /// For implementing global code invalidation. The number of bytes counting from the beginning - /// of the inline code block that should not be changed. After patching for global invalidation, - /// no one should make changes to the invalidated code region anymore. This is used to - /// break out of invalidation race when there are multiple ractors. - inline_frozen_bytes: usize, - - // Methods for generating code for hardcoded (usually C) methods - method_codegen_table: HashMap<usize, MethodGenFn>, - /// Page indexes for outlined code that are not associated to any ISEQ. ocb_pages: Vec<usize>, - /// Freed page indexes. None if code GC has not been used. - freed_pages: Option<Vec<usize>>, - - /// How many times code GC has been executed. - code_gc_count: usize, + /// Map of cfunc YARV PCs to CMEs and receiver indexes, used to lazily push + /// a frame when rb_yjit_lazy_push_frame() is called with a PC in this HashMap. + pc_to_cfunc: HashMap<*mut VALUE, (*const rb_callable_method_entry_t, u8)>, } /// For implementing global code invalidation. 
A position in the inline @@ -7361,15 +10983,11 @@ impl CodegenGlobals { /// Initialize the codegen globals pub fn init() { // Executable memory and code page size in bytes - let mem_size = get_option!(exec_mem_size); - + let exec_mem_size = get_option!(exec_mem_size).unwrap_or(get_option!(mem_size)); #[cfg(not(test))] let (mut cb, mut ocb) = { - use std::cell::RefCell; - use std::rc::Rc; - - let virt_block: *mut u8 = unsafe { rb_yjit_reserve_addr_space(mem_size as u32) }; + let virt_block: *mut u8 = unsafe { rb_jit_reserve_addr_space(exec_mem_size as u32) }; // Memory protection syscalls need page-aligned addresses, so check it here. Assuming // `virt_block` is page-aligned, `second_half` should be page-aligned as long as the @@ -7378,7 +10996,7 @@ impl CodegenGlobals { // // Basically, we don't support x86-64 2MiB and 1GiB pages. ARMv8 can do up to 64KiB // (2¹⁶ bytes) pages, which should be fine. 4KiB pages seem to be the most popular though. - let page_size = unsafe { rb_yjit_get_page_size() }; + let page_size = unsafe { rb_jit_get_page_size() }; assert_eq!( virt_block as usize % page_size.as_usize(), 0, "Start of virtual address block should be page-aligned", @@ -7391,14 +11009,16 @@ impl CodegenGlobals { SystemAllocator {}, page_size, NonNull::new(virt_block).unwrap(), - mem_size, + exec_mem_size, + get_option!(mem_size), ); - let mem_block = Rc::new(RefCell::new(mem_block)); + let mem_block = Rc::new(mem_block); - let cb = CodeBlock::new(mem_block.clone(), false); - let ocb = OutlinedCb::wrap(CodeBlock::new(mem_block, true)); + let freed_pages = Rc::new(None); - assert_eq!(cb.page_size() % page_size.as_usize(), 0, "code page size is not page-aligned"); + let asm_comments = get_option_ref!(dump_disasm).is_some(); + let cb = CodeBlock::new(mem_block.clone(), false, freed_pages.clone(), asm_comments); + let ocb = OutlinedCb::wrap(CodeBlock::new(mem_block, true, freed_pages, asm_comments)); (cb, ocb) }; @@ -7406,114 +11026,49 @@ impl CodegenGlobals { // In test mode we're not linking with the C code // so we don't allocate executable memory #[cfg(test)] - let mut cb = CodeBlock::new_dummy(mem_size / 2); + let mut cb = CodeBlock::new_dummy(exec_mem_size / 2); #[cfg(test)] - let mut ocb = OutlinedCb::wrap(CodeBlock::new_dummy(mem_size / 2)); + let mut ocb = OutlinedCb::wrap(CodeBlock::new_dummy(exec_mem_size / 2)); let ocb_start_addr = ocb.unwrap().get_write_ptr(); - let leave_exit_code = gen_leave_exit(&mut ocb); + let leave_exit_code = gen_leave_exit(&mut ocb).unwrap(); + let leave_exception_code = gen_leave_exception(&mut ocb).unwrap(); - let stub_exit_code = gen_code_for_exit_from_stub(&mut ocb); + let stub_exit_code = gen_stub_exit(&mut ocb).unwrap(); - let branch_stub_hit_trampoline = gen_branch_stub_hit_trampoline(&mut ocb); + let branch_stub_hit_trampoline = gen_branch_stub_hit_trampoline(&mut ocb).unwrap(); + let entry_stub_hit_trampoline = gen_entry_stub_hit_trampoline(&mut ocb).unwrap(); // Generate full exit code for C func - let cfunc_exit_code = gen_full_cfunc_return(&mut ocb); + let cfunc_exit_code = gen_full_cfunc_return(&mut ocb).unwrap(); let ocb_end_addr = ocb.unwrap().get_write_ptr(); - let ocb_pages = ocb.unwrap().addrs_to_pages(ocb_start_addr, ocb_end_addr); + let ocb_pages = ocb.unwrap().addrs_to_pages(ocb_start_addr, ocb_end_addr).collect(); // Mark all code memory as executable cb.mark_all_executable(); - ocb.unwrap().mark_all_executable(); - let mut codegen_globals = CodegenGlobals { + let codegen_globals = CodegenGlobals { + context_data: BitVector::new(), 
inline_cb: cb, outlined_cb: ocb, + ocb_pages, leave_exit_code, - stub_exit_code: stub_exit_code, + leave_exception_code, + stub_exit_code, outline_full_cfunc_return_pos: cfunc_exit_code, branch_stub_hit_trampoline, + entry_stub_hit_trampoline, global_inval_patches: Vec::new(), - inline_frozen_bytes: 0, - method_codegen_table: HashMap::new(), - ocb_pages, - freed_pages: None, - code_gc_count: 0, + pc_to_cfunc: HashMap::new(), }; - // Register the method codegen functions - codegen_globals.reg_method_codegen_fns(); - // Initialize the codegen globals instance unsafe { CODEGEN_GLOBALS = Some(codegen_globals); } } - // Register a specialized codegen function for a particular method. Note that - // the if the function returns true, the code it generates runs without a - // control frame and without interrupt checks. To avoid creating observable - // behavior changes, the codegen function should only target simple code paths - // that do not allocate and do not make method calls. - fn yjit_reg_method(&mut self, klass: VALUE, mid_str: &str, gen_fn: MethodGenFn) { - let id_string = std::ffi::CString::new(mid_str).expect("couldn't convert to CString!"); - let mid = unsafe { rb_intern(id_string.as_ptr()) }; - let me = unsafe { rb_method_entry_at(klass, mid) }; - - if me.is_null() { - panic!("undefined optimized method!"); - } - - // For now, only cfuncs are supported - //RUBY_ASSERT(me && me->def); - //RUBY_ASSERT(me->def->type == VM_METHOD_TYPE_CFUNC); - - let method_serial = unsafe { - let def = (*me).def; - get_def_method_serial(def) - }; - - self.method_codegen_table.insert(method_serial, gen_fn); - } - - /// Register codegen functions for some Ruby core methods - fn reg_method_codegen_fns(&mut self) { - unsafe { - // Specialization for C methods. See yjit_reg_method() for details. 
- self.yjit_reg_method(rb_cBasicObject, "!", jit_rb_obj_not); - - self.yjit_reg_method(rb_cNilClass, "nil?", jit_rb_true); - self.yjit_reg_method(rb_mKernel, "nil?", jit_rb_false); - - self.yjit_reg_method(rb_cBasicObject, "==", jit_rb_obj_equal); - self.yjit_reg_method(rb_cBasicObject, "equal?", jit_rb_obj_equal); - self.yjit_reg_method(rb_mKernel, "eql?", jit_rb_obj_equal); - self.yjit_reg_method(rb_cModule, "==", jit_rb_obj_equal); - self.yjit_reg_method(rb_cSymbol, "==", jit_rb_obj_equal); - self.yjit_reg_method(rb_cSymbol, "===", jit_rb_obj_equal); - self.yjit_reg_method(rb_cInteger, "==", jit_rb_int_equal); - self.yjit_reg_method(rb_cInteger, "===", jit_rb_int_equal); - - // rb_str_to_s() methods in string.c - self.yjit_reg_method(rb_cString, "empty?", jit_rb_str_empty); - self.yjit_reg_method(rb_cString, "to_s", jit_rb_str_to_s); - self.yjit_reg_method(rb_cString, "to_str", jit_rb_str_to_s); - self.yjit_reg_method(rb_cString, "bytesize", jit_rb_str_bytesize); - self.yjit_reg_method(rb_cString, "<<", jit_rb_str_concat); - self.yjit_reg_method(rb_cString, "+@", jit_rb_str_uplus); - - self.yjit_reg_method(rb_mKernel, "respond_to?", jit_obj_respond_to); - - // Thread.current - self.yjit_reg_method( - rb_singleton_class(rb_cThread), - "current", - jit_thread_s_current, - ); - } - } - /// Get a mutable reference to the codegen globals instance pub fn get_instance() -> &'static mut CodegenGlobals { unsafe { CODEGEN_GLOBALS.as_mut().unwrap() } @@ -7523,6 +11078,11 @@ impl CodegenGlobals { unsafe { CODEGEN_GLOBALS.as_mut().is_some() } } + /// Get a mutable reference to the context data + pub fn get_context_data() -> &'static mut BitVector { + &mut CodegenGlobals::get_instance().context_data + } + /// Get a mutable reference to the inline code block pub fn get_inline_cb() -> &'static mut CodeBlock { &mut CodegenGlobals::get_instance().inline_cb @@ -7537,14 +11097,26 @@ impl CodegenGlobals { CodegenGlobals::get_instance().leave_exit_code } + pub fn get_leave_exception_code() -> CodePtr { + CodegenGlobals::get_instance().leave_exception_code + } + pub fn get_stub_exit_code() -> CodePtr { CodegenGlobals::get_instance().stub_exit_code } - pub fn push_global_inval_patch(i_pos: CodePtr, o_pos: CodePtr) { + pub fn push_global_inval_patch(inline_pos: CodePtr, outlined_pos: CodePtr, cb: &CodeBlock) { + if let Some(last_patch) = CodegenGlobals::get_instance().global_inval_patches.last() { + let patch_offset = inline_pos.as_offset() - last_patch.inline_patch_pos.as_offset(); + assert!( + patch_offset < 0 || cb.jmp_ptr_bytes() as i64 <= patch_offset, + "patches should not overlap (patch_offset: {patch_offset})", + ); + } + let patch = CodepagePatch { - inline_patch_pos: i_pos, - outlined_target_pos: o_pos, + inline_patch_pos: inline_pos, + outlined_target_pos: outlined_pos, }; CodegenGlobals::get_instance() .global_inval_patches @@ -7557,14 +11129,6 @@ impl CodegenGlobals { mem::take(&mut globals.global_inval_patches) } - pub fn get_inline_frozen_bytes() -> usize { - CodegenGlobals::get_instance().inline_frozen_bytes - } - - pub fn set_inline_frozen_bytes(frozen_bytes: usize) { - CodegenGlobals::get_instance().inline_frozen_bytes = frozen_bytes; - } - pub fn get_outline_full_cfunc_return_pos() -> CodePtr { CodegenGlobals::get_instance().outline_full_cfunc_return_pos } @@ -7573,31 +11137,16 @@ impl CodegenGlobals { CodegenGlobals::get_instance().branch_stub_hit_trampoline } - pub fn look_up_codegen_method(method_serial: usize) -> Option<MethodGenFn> { - let table = 
&CodegenGlobals::get_instance().method_codegen_table; - - let option_ref = table.get(&method_serial); - match option_ref { - None => None, - Some(&mgf) => Some(mgf), // Deref - } + pub fn get_entry_stub_hit_trampoline() -> CodePtr { + CodegenGlobals::get_instance().entry_stub_hit_trampoline } pub fn get_ocb_pages() -> &'static Vec<usize> { &CodegenGlobals::get_instance().ocb_pages } - pub fn get_freed_pages() -> &'static mut Option<Vec<usize>> { - &mut CodegenGlobals::get_instance().freed_pages - } - - pub fn set_freed_pages(freed_pages: Vec<usize>) { - CodegenGlobals::get_instance().freed_pages = Some(freed_pages); - CodegenGlobals::get_instance().code_gc_count += 1; - } - - pub fn get_code_gc_count() -> usize { - CodegenGlobals::get_instance().code_gc_count + pub fn get_pc_to_cfunc() -> &'static mut HashMap<*mut VALUE, (*const rb_callable_method_entry_t, u8)> { + &mut CodegenGlobals::get_instance().pc_to_cfunc } } @@ -7605,22 +11154,28 @@ impl CodegenGlobals { mod tests { use super::*; - fn setup_codegen() -> (JITState, Context, Assembler, CodeBlock, OutlinedCb) { - let blockid = BlockId { - iseq: ptr::null(), - idx: 0, - }; - let block = Block::new(blockid, &Context::default()); + fn setup_codegen() -> (Context, Assembler, CodeBlock, OutlinedCb) { + let cb = CodeBlock::new_dummy(256 * 1024); return ( - JITState::new(&block), Context::default(), - Assembler::new(), - CodeBlock::new_dummy(256 * 1024), + Assembler::new(0), + cb, OutlinedCb::wrap(CodeBlock::new_dummy(256 * 1024)), ); } + fn dummy_jit_state<'a>(cb: &mut CodeBlock, ocb: &'a mut OutlinedCb) -> JITState<'a> { + JITState::new( + BlockId { iseq: std::ptr::null(), idx: 0 }, + Context::default(), + cb.get_write_ptr(), + ptr::null(), // No execution context in tests. No peeking! + ocb, + true, + ) + } + #[test] fn test_gen_leave_exit() { let mut ocb = OutlinedCb::wrap(CodeBlock::new_dummy(256 * 1024)); @@ -7630,250 +11185,249 @@ mod tests { #[test] fn test_gen_exit() { - let (_, ctx, mut asm, mut cb, _) = setup_codegen(); - gen_exit(0 as *mut VALUE, &ctx, &mut asm); - asm.compile(&mut cb); + let (_ctx, mut asm, mut cb, _) = setup_codegen(); + gen_exit(0 as *mut VALUE, &mut asm); + asm.compile(&mut cb, None).unwrap(); assert!(cb.get_write_pos() > 0); } #[test] fn test_get_side_exit() { - let (mut jit, ctx, _, _, mut ocb) = setup_codegen(); - get_side_exit(&mut jit, &mut ocb, &ctx); + let (ctx, mut asm, _, mut ocb) = setup_codegen(); + let side_exit_context = SideExitContext::new(0 as _, ctx); + asm.get_side_exit(&side_exit_context, None, &mut ocb); assert!(ocb.unwrap().get_write_pos() > 0); } #[test] fn test_gen_check_ints() { - let (_, _ctx, mut asm, _cb, mut ocb) = setup_codegen(); - let side_exit = ocb.unwrap().get_write_ptr().as_side_exit(); - gen_check_ints(&mut asm, side_exit); + let (_ctx, mut asm, _cb, _ocb) = setup_codegen(); + asm.set_side_exit_context(0 as _, 0); + gen_check_ints(&mut asm, Counter::guard_send_interrupted); } #[test] fn test_gen_nop() { - let (mut jit, mut context, mut asm, mut cb, mut ocb) = setup_codegen(); - let status = gen_nop(&mut jit, &mut context, &mut asm, &mut ocb); - asm.compile(&mut cb); + let (context, mut asm, mut cb, mut ocb) = setup_codegen(); + let mut jit = dummy_jit_state(&mut cb, &mut ocb); + let status = gen_nop(&mut jit, &mut asm); + asm.compile(&mut cb, None).unwrap(); - assert_eq!(status, KeepCompiling); - assert_eq!(context.diff(&Context::default()), 0); + assert_eq!(status, Some(KeepCompiling)); + assert_eq!(context.diff(&Context::default()), TypeDiff::Compatible(0)); 
assert_eq!(cb.get_write_pos(), 0); } #[test] fn test_gen_pop() { - let (mut jit, _, mut asm, _cb, mut ocb) = setup_codegen(); - let mut context = Context::default(); - context.stack_push(Type::Fixnum); - let status = gen_pop(&mut jit, &mut context, &mut asm, &mut ocb); + let (_, mut asm, mut cb, mut ocb) = setup_codegen(); + let mut jit = dummy_jit_state(&mut cb, &mut ocb); + let context = Context::default(); + asm.stack_push(Type::Fixnum); + let status = gen_pop(&mut jit, &mut asm); - assert_eq!(status, KeepCompiling); - assert_eq!(context.diff(&Context::default()), 0); + assert_eq!(status, Some(KeepCompiling)); + let mut default = Context::default(); + default.set_reg_mapping(context.get_reg_mapping()); + assert_eq!(context.diff(&default), TypeDiff::Compatible(0)); } #[test] fn test_gen_dup() { - let (mut jit, mut context, mut asm, mut cb, mut ocb) = setup_codegen(); - context.stack_push(Type::Fixnum); - let status = gen_dup(&mut jit, &mut context, &mut asm, &mut ocb); + let (_context, mut asm, mut cb, mut ocb) = setup_codegen(); + let mut jit = dummy_jit_state(&mut cb, &mut ocb); + asm.stack_push(Type::Fixnum); + let status = gen_dup(&mut jit, &mut asm); - assert_eq!(status, KeepCompiling); + assert_eq!(status, Some(KeepCompiling)); // Did we duplicate the type information for the Fixnum type? - assert_eq!(Type::Fixnum, context.get_opnd_type(StackOpnd(0))); - assert_eq!(Type::Fixnum, context.get_opnd_type(StackOpnd(1))); + assert_eq!(Type::Fixnum, asm.ctx.get_opnd_type(StackOpnd(0))); + assert_eq!(Type::Fixnum, asm.ctx.get_opnd_type(StackOpnd(1))); - asm.compile(&mut cb); + asm.compile(&mut cb, None).unwrap(); assert!(cb.get_write_pos() > 0); // Write some movs } #[test] fn test_gen_dupn() { - let (mut jit, mut context, mut asm, mut cb, mut ocb) = setup_codegen(); - context.stack_push(Type::Fixnum); - context.stack_push(Type::Flonum); + let (_context, mut asm, mut cb, mut ocb) = setup_codegen(); + let mut jit = dummy_jit_state(&mut cb, &mut ocb); + asm.stack_push(Type::Fixnum); + asm.stack_push(Type::Flonum); let mut value_array: [u64; 2] = [0, 2]; // We only compile for n == 2 let pc: *mut VALUE = &mut value_array as *mut u64 as *mut VALUE; jit.pc = pc; - let status = gen_dupn(&mut jit, &mut context, &mut asm, &mut ocb); + let status = gen_dupn(&mut jit, &mut asm); - assert_eq!(status, KeepCompiling); + assert_eq!(status, Some(KeepCompiling)); - assert_eq!(Type::Fixnum, context.get_opnd_type(StackOpnd(3))); - assert_eq!(Type::Flonum, context.get_opnd_type(StackOpnd(2))); - assert_eq!(Type::Fixnum, context.get_opnd_type(StackOpnd(1))); - assert_eq!(Type::Flonum, context.get_opnd_type(StackOpnd(0))); + assert_eq!(Type::Fixnum, asm.ctx.get_opnd_type(StackOpnd(3))); + assert_eq!(Type::Flonum, asm.ctx.get_opnd_type(StackOpnd(2))); + assert_eq!(Type::Fixnum, asm.ctx.get_opnd_type(StackOpnd(1))); + assert_eq!(Type::Flonum, asm.ctx.get_opnd_type(StackOpnd(0))); // TODO: this is writing zero bytes on x86. Why? 
- asm.compile(&mut cb); + asm.compile(&mut cb, None).unwrap(); assert!(cb.get_write_pos() > 0); // Write some movs } #[test] - fn test_gen_swap() { - let (mut jit, mut context, mut asm, _cb, mut ocb) = setup_codegen(); - context.stack_push(Type::Fixnum); - context.stack_push(Type::Flonum); - - let status = gen_swap(&mut jit, &mut context, &mut asm, &mut ocb); - - let (_, tmp_type_top) = context.get_opnd_mapping(StackOpnd(0)); - let (_, tmp_type_next) = context.get_opnd_mapping(StackOpnd(1)); + fn test_gen_opt_reverse() { + let (_context, mut asm, mut cb, mut ocb) = setup_codegen(); + let mut jit = dummy_jit_state(&mut cb, &mut ocb); - assert_eq!(status, KeepCompiling); - assert_eq!(tmp_type_top, Type::Fixnum); - assert_eq!(tmp_type_next, Type::Flonum); - } - - #[test] - fn test_putnil() { - let (mut jit, mut context, mut asm, mut cb, mut ocb) = setup_codegen(); - let status = gen_putnil(&mut jit, &mut context, &mut asm, &mut ocb); + // Odd number of elements + asm.stack_push(Type::Fixnum); + asm.stack_push(Type::Flonum); + asm.stack_push(Type::CString); - let (_, tmp_type_top) = context.get_opnd_mapping(StackOpnd(0)); + let mut value_array: [u64; 2] = [0, 3]; + let pc: *mut VALUE = &mut value_array as *mut u64 as *mut VALUE; + jit.pc = pc; - assert_eq!(status, KeepCompiling); - assert_eq!(tmp_type_top, Type::Nil); - asm.compile(&mut cb); - assert!(cb.get_write_pos() > 0); - } + let mut status = gen_opt_reverse(&mut jit, &mut asm); - #[test] - fn test_putobject_qtrue() { - // Test gen_putobject with Qtrue - let (mut jit, mut context, mut asm, mut cb, mut ocb) = setup_codegen(); + assert_eq!(status, Some(KeepCompiling)); - let mut value_array: [u64; 2] = [0, Qtrue.into()]; - let pc: *mut VALUE = &mut value_array as *mut u64 as *mut VALUE; - jit.pc = pc; + assert_eq!(Type::CString, asm.ctx.get_opnd_type(StackOpnd(2))); + assert_eq!(Type::Flonum, asm.ctx.get_opnd_type(StackOpnd(1))); + assert_eq!(Type::Fixnum, asm.ctx.get_opnd_type(StackOpnd(0))); - let status = gen_putobject(&mut jit, &mut context, &mut asm, &mut ocb); + // Try again with an even number of elements. 
+ asm.stack_push(Type::Nil); + value_array[1] = 4; + status = gen_opt_reverse(&mut jit, &mut asm); - let (_, tmp_type_top) = context.get_opnd_mapping(StackOpnd(0)); + assert_eq!(status, Some(KeepCompiling)); - assert_eq!(status, KeepCompiling); - assert_eq!(tmp_type_top, Type::True); - asm.compile(&mut cb); - assert!(cb.get_write_pos() > 0); + assert_eq!(Type::Nil, asm.ctx.get_opnd_type(StackOpnd(3))); + assert_eq!(Type::Fixnum, asm.ctx.get_opnd_type(StackOpnd(2))); + assert_eq!(Type::Flonum, asm.ctx.get_opnd_type(StackOpnd(1))); + assert_eq!(Type::CString, asm.ctx.get_opnd_type(StackOpnd(0))); } #[test] - fn test_putobject_fixnum() { - // Test gen_putobject with a Fixnum to test another conditional branch - let (mut jit, mut context, mut asm, mut cb, mut ocb) = setup_codegen(); - - // The Fixnum 7 is encoded as 7 * 2 + 1, or 15 - let mut value_array: [u64; 2] = [0, 15]; - let pc: *mut VALUE = &mut value_array as *mut u64 as *mut VALUE; - jit.pc = pc; + fn test_gen_swap() { + let (_context, mut asm, mut cb, mut ocb) = setup_codegen(); + let mut jit = dummy_jit_state(&mut cb, &mut ocb); + asm.stack_push(Type::Fixnum); + asm.stack_push(Type::Flonum); - let status = gen_putobject(&mut jit, &mut context, &mut asm, &mut ocb); + let status = gen_swap(&mut jit, &mut asm); - let (_, tmp_type_top) = context.get_opnd_mapping(StackOpnd(0)); + let tmp_type_top = asm.ctx.get_opnd_type(StackOpnd(0)); + let tmp_type_next = asm.ctx.get_opnd_type(StackOpnd(1)); - assert_eq!(status, KeepCompiling); + assert_eq!(status, Some(KeepCompiling)); assert_eq!(tmp_type_top, Type::Fixnum); - asm.compile(&mut cb); - assert!(cb.get_write_pos() > 0); + assert_eq!(tmp_type_next, Type::Flonum); } #[test] - fn test_int2fix() { - let (mut jit, mut context, mut asm, _cb, mut ocb) = setup_codegen(); - jit.opcode = YARVINSN_putobject_INT2FIX_0_.as_usize(); - let status = gen_putobject_int2fix(&mut jit, &mut context, &mut asm, &mut ocb); + fn test_putnil() { + let (_context, mut asm, mut cb, mut ocb) = setup_codegen(); + let mut jit = dummy_jit_state(&mut cb, &mut ocb); + let status = gen_putnil(&mut jit, &mut asm); - let (_, tmp_type_top) = context.get_opnd_mapping(StackOpnd(0)); + let tmp_type_top = asm.ctx.get_opnd_type(StackOpnd(0)); - // Right now we're not testing the generated machine code to make sure a literal 1 or 0 was pushed. I've checked locally. 
- assert_eq!(status, KeepCompiling); - assert_eq!(tmp_type_top, Type::Fixnum); + assert_eq!(status, Some(KeepCompiling)); + assert_eq!(tmp_type_top, Type::Nil); + asm.compile(&mut cb, None).unwrap(); + assert!(cb.get_write_pos() > 0); } + #[test] fn test_putself() { - let (mut jit, mut context, mut asm, mut cb, mut ocb) = setup_codegen(); - let status = gen_putself(&mut jit, &mut context, &mut asm, &mut ocb); + let (_context, mut asm, mut cb, mut ocb) = setup_codegen(); + let mut jit = dummy_jit_state(&mut cb, &mut ocb); + let status = gen_putself(&mut jit, &mut asm); - assert_eq!(status, KeepCompiling); - asm.compile(&mut cb); + assert_eq!(status, Some(KeepCompiling)); + asm.compile(&mut cb, None).unwrap(); assert!(cb.get_write_pos() > 0); } #[test] fn test_gen_setn() { - let (mut jit, mut context, mut asm, mut cb, mut ocb) = setup_codegen(); - context.stack_push(Type::Fixnum); - context.stack_push(Type::Flonum); - context.stack_push(Type::CString); + let (_context, mut asm, mut cb, mut ocb) = setup_codegen(); + let mut jit = dummy_jit_state(&mut cb, &mut ocb); + asm.stack_push(Type::Fixnum); + asm.stack_push(Type::Flonum); + asm.stack_push(Type::CString); let mut value_array: [u64; 2] = [0, 2]; let pc: *mut VALUE = &mut value_array as *mut u64 as *mut VALUE; jit.pc = pc; - let status = gen_setn(&mut jit, &mut context, &mut asm, &mut ocb); + let status = gen_setn(&mut jit, &mut asm); - assert_eq!(status, KeepCompiling); + assert_eq!(status, Some(KeepCompiling)); - assert_eq!(Type::CString, context.get_opnd_type(StackOpnd(2))); - assert_eq!(Type::Flonum, context.get_opnd_type(StackOpnd(1))); - assert_eq!(Type::CString, context.get_opnd_type(StackOpnd(0))); + assert_eq!(Type::CString, asm.ctx.get_opnd_type(StackOpnd(2))); + assert_eq!(Type::Flonum, asm.ctx.get_opnd_type(StackOpnd(1))); + assert_eq!(Type::CString, asm.ctx.get_opnd_type(StackOpnd(0))); - asm.compile(&mut cb); + asm.compile(&mut cb, None).unwrap(); assert!(cb.get_write_pos() > 0); } #[test] fn test_gen_topn() { - let (mut jit, mut context, mut asm, mut cb, mut ocb) = setup_codegen(); - context.stack_push(Type::Flonum); - context.stack_push(Type::CString); + let (_context, mut asm, mut cb, mut ocb) = setup_codegen(); + let mut jit = dummy_jit_state(&mut cb, &mut ocb); + asm.stack_push(Type::Flonum); + asm.stack_push(Type::CString); let mut value_array: [u64; 2] = [0, 1]; let pc: *mut VALUE = &mut value_array as *mut u64 as *mut VALUE; jit.pc = pc; - let status = gen_topn(&mut jit, &mut context, &mut asm, &mut ocb); + let status = gen_topn(&mut jit, &mut asm); - assert_eq!(status, KeepCompiling); + assert_eq!(status, Some(KeepCompiling)); - assert_eq!(Type::Flonum, context.get_opnd_type(StackOpnd(2))); - assert_eq!(Type::CString, context.get_opnd_type(StackOpnd(1))); - assert_eq!(Type::Flonum, context.get_opnd_type(StackOpnd(0))); + assert_eq!(Type::Flonum, asm.ctx.get_opnd_type(StackOpnd(2))); + assert_eq!(Type::CString, asm.ctx.get_opnd_type(StackOpnd(1))); + assert_eq!(Type::Flonum, asm.ctx.get_opnd_type(StackOpnd(0))); - asm.compile(&mut cb); + asm.compile(&mut cb, None).unwrap(); assert!(cb.get_write_pos() > 0); // Write some movs } #[test] fn test_gen_adjuststack() { - let (mut jit, mut context, mut asm, mut cb, mut ocb) = setup_codegen(); - context.stack_push(Type::Flonum); - context.stack_push(Type::CString); - context.stack_push(Type::Fixnum); + let (_context, mut asm, mut cb, mut ocb) = setup_codegen(); + let mut jit = dummy_jit_state(&mut cb, &mut ocb); + asm.stack_push(Type::Flonum); + asm.stack_push(Type::CString); 
+ asm.stack_push(Type::Fixnum); let mut value_array: [u64; 3] = [0, 2, 0]; let pc: *mut VALUE = &mut value_array as *mut u64 as *mut VALUE; jit.pc = pc; - let status = gen_adjuststack(&mut jit, &mut context, &mut asm, &mut ocb); + let status = gen_adjuststack(&mut jit, &mut asm); - assert_eq!(status, KeepCompiling); + assert_eq!(status, Some(KeepCompiling)); - assert_eq!(Type::Flonum, context.get_opnd_type(StackOpnd(0))); + assert_eq!(Type::Flonum, asm.ctx.get_opnd_type(StackOpnd(0))); - asm.compile(&mut cb); + asm.compile(&mut cb, None).unwrap(); assert!(cb.get_write_pos() == 0); // No instructions written } #[test] fn test_gen_leave() { - let (mut jit, mut context, mut asm, _cb, mut ocb) = setup_codegen(); + let (_context, mut asm, mut cb, mut ocb) = setup_codegen(); + let mut jit = dummy_jit_state(&mut cb, &mut ocb); // Push return value - context.stack_push(Type::Fixnum); - gen_leave(&mut jit, &mut context, &mut asm, &mut ocb); + asm.stack_push(Type::Fixnum); + asm.set_side_exit_context(0 as _, 0); + gen_leave(&mut jit, &mut asm); } } diff --git a/yjit/src/core.rs b/yjit/src/core.rs index 15b8fe4466..0590135392 100644 --- a/yjit/src/core.rs +++ b/yjit/src/core.rs @@ -1,3 +1,8 @@ +//! Code versioning, retained live control flow graph mutations, type tracking, etc. + +// So we can comment on individual uses of `unsafe` in `unsafe` functions +#![warn(unsafe_op_in_unsafe_fn)] + use crate::asm::*; use crate::backend::ir::*; use crate::codegen::*; @@ -10,24 +15,36 @@ use crate::utils::*; use crate::disasm::*; use core::ffi::c_void; use std::cell::*; +use std::fmt; +use std::mem; +use std::mem::transmute; +use std::ops::Range; +use std::rc::Rc; use std::collections::HashSet; +use std::collections::hash_map::DefaultHasher; use std::hash::{Hash, Hasher}; -use std::mem; -use std::rc::{Rc}; +use mem::MaybeUninit; +use std::ptr; +use ptr::NonNull; use YARVOpnd::*; use TempMapping::*; -use crate::invariants::block_assumptions_free; +use crate::invariants::*; -// Maximum number of temp value types we keep track of -pub const MAX_TEMP_TYPES: usize = 8; +// Maximum number of temp value types or registers we keep track of +pub const MAX_CTX_TEMPS: usize = 8; -// Maximum number of local variable types we keep track of -const MAX_LOCAL_TYPES: usize = 8; +// Maximum number of local variable types or registers we keep track of +const MAX_CTX_LOCALS: usize = 8; + +/// An index into `ISEQ_BODY(iseq)->iseq_encoded`. Points +/// to a YARV instruction or an instruction operand. 
+pub type IseqIdx = u16; // Represent the type of a value (local/stack/self) in YJIT -#[derive(Copy, Clone, PartialEq, Eq, Debug)] +#[derive(Copy, Clone, Hash, PartialEq, Eq, Debug)] +#[repr(u8)] pub enum Type { - Unknown, + Unknown = 0, UnknownImm, UnknownHeap, Nil, @@ -35,19 +52,20 @@ pub enum Type { False, Fixnum, Flonum, - Hash, ImmSymbol, - #[allow(unused)] - HeapSymbol, - TString, // An object with the T_STRING flag set, possibly an rb_cString - CString, // An un-subclassed string of type rb_cString (can have instance vars in some cases) + CString, // An object that at one point had its class field equal rb_cString (creating a singleton class changes it) TArray, // An object with the T_ARRAY flag set, possibly an rb_cArray - CArray, // An un-subclassed string of type rb_cArray (can have instance vars in some cases) + CArray, // An object that at one point had its class field equal rb_cArray (creating a singleton class changes it) + THash, // An object with the T_HASH flag set, possibly an rb_cHash + CHash, // An object that at one point had its class field equal rb_cHash (creating a singleton class changes it) BlockParamProxy, // A special sentinel value indicating the block parameter should be read from // the current surrounding cfp + + // The context currently relies on types taking at most 4 bits (max value 15) + // to encode, so if we add any more, we will need to refactor the context. } // Default initialization @@ -80,12 +98,11 @@ impl Type { // Core.rs can't reference rb_cString because it's linked by Rust-only tests. // But CString vs TString is only an optimisation and shouldn't affect correctness. #[cfg(not(test))] - if val.class_of() == unsafe { rb_cString } { - return Type::CString; - } - #[cfg(not(test))] - if val.class_of() == unsafe { rb_cArray } { - return Type::CArray; + match val.class_of() { + class if class == unsafe { rb_cArray } => return Type::CArray, + class if class == unsafe { rb_cHash } => return Type::CHash, + class if class == unsafe { rb_cString } => return Type::CString, + _ => {} } // We likewise can't reference rb_block_param_proxy, but it's again an optimisation; // we can just treat it as a normal Object. 
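// Editorial worked example, not part of this commit (the VALUE names are
// hypothetical): with the class_of() fast path above, an instance whose class
// is exactly rb_cString maps to Type::CString, while an instance of a String
// subclass falls through to the builtin_type() match below and becomes
// Type::TString:
//
//     assert_eq!(Type::from(plain_string_val), Type::CString);
//     assert_eq!(Type::from(string_subclass_val), Type::TString);
//
// Both results still satisfy Type::is_string(), so guards that only need
// "some T_STRING" accept either.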
@@ -95,7 +112,7 @@ impl Type { } match val.builtin_type() { RUBY_T_ARRAY => Type::TArray, - RUBY_T_HASH => Type::Hash, + RUBY_T_HASH => Type::THash, RUBY_T_STRING => Type::TString, _ => Type::UnknownHeap, } @@ -137,14 +154,30 @@ impl Type { Type::UnknownHeap => true, Type::TArray => true, Type::CArray => true, - Type::Hash => true, - Type::HeapSymbol => true, + Type::THash => true, + Type::CHash => true, Type::TString => true, Type::CString => true, + Type::BlockParamProxy => true, _ => false, } } + /// Check if it's a T_ARRAY object (both TArray and CArray are T_ARRAY) + pub fn is_array(&self) -> bool { + matches!(self, Type::TArray | Type::CArray) + } + + /// Check if it's a T_HASH object (both THash and CHash are T_HASH) + pub fn is_hash(&self) -> bool { + matches!(self, Type::THash | Type::CHash) + } + + /// Check if it's a T_STRING object (both TString and CString are T_STRING) + pub fn is_string(&self) -> bool { + matches!(self, Type::TString | Type::CString) + } + /// Returns an Option with the T_ value type if it is known, otherwise None pub fn known_value_type(&self) -> Option<ruby_value_type> { match self { @@ -154,8 +187,8 @@ impl Type { Type::Fixnum => Some(RUBY_T_FIXNUM), Type::Flonum => Some(RUBY_T_FLOAT), Type::TArray | Type::CArray => Some(RUBY_T_ARRAY), - Type::Hash => Some(RUBY_T_HASH), - Type::ImmSymbol | Type::HeapSymbol => Some(RUBY_T_SYMBOL), + Type::THash | Type::CHash => Some(RUBY_T_HASH), + Type::ImmSymbol => Some(RUBY_T_SYMBOL), Type::TString | Type::CString => Some(RUBY_T_STRING), Type::Unknown | Type::UnknownImm | Type::UnknownHeap => None, Type::BlockParamProxy => None, @@ -171,9 +204,10 @@ impl Type { Type::False => Some(rb_cFalseClass), Type::Fixnum => Some(rb_cInteger), Type::Flonum => Some(rb_cFloat), - Type::ImmSymbol | Type::HeapSymbol => Some(rb_cSymbol), - Type::CString => Some(rb_cString), + Type::ImmSymbol => Some(rb_cSymbol), Type::CArray => Some(rb_cArray), + Type::CHash => Some(rb_cHash), + Type::CString => Some(rb_cString), _ => None, } } @@ -212,66 +246,83 @@ impl Type { } /// Compute a difference between two value types - /// Returns 0 if the two are the same - /// Returns > 0 if different but compatible - /// Returns usize::MAX if incompatible - pub fn diff(self, dst: Self) -> usize { + pub fn diff(self, dst: Self) -> TypeDiff { // Perfect match, difference is zero if self == dst { - return 0; + return TypeDiff::Compatible(0); } // Any type can flow into an unknown type if dst == Type::Unknown { - return 1; - } - - // A CString is also a TString. - if self == Type::CString && dst == Type::TString { - return 1; + return TypeDiff::Compatible(1); } // A CArray is also a TArray. if self == Type::CArray && dst == Type::TArray { - return 1; + return TypeDiff::Compatible(1); + } + + // A CHash is also a THash. + if self == Type::CHash && dst == Type::THash { + return TypeDiff::Compatible(1); + } + + // A CString is also a TString. 
+ if self == Type::CString && dst == Type::TString { + return TypeDiff::Compatible(1); } // Specific heap type into unknown heap type is imperfect but valid if self.is_heap() && dst == Type::UnknownHeap { - return 1; + return TypeDiff::Compatible(1); } // Specific immediate type into unknown immediate type is imperfect but valid if self.is_imm() && dst == Type::UnknownImm { - return 1; + return TypeDiff::Compatible(1); } // Incompatible types - return usize::MAX; + return TypeDiff::Incompatible; } /// Upgrade this type into a more specific compatible type /// The new type must be compatible and at least as specific as the previously known type. - fn upgrade(&mut self, src: Self) { - // Here we're checking that src is more specific than self - assert!(src.diff(*self) != usize::MAX); - *self = src; + fn upgrade(&mut self, new_type: Self) { + // We can only upgrade to a type that is more specific + assert!(new_type.diff(*self) != TypeDiff::Incompatible); + *self = new_type; } } -// Potential mapping of a value on the temporary stack to -// self, a local variable or constant so that we can track its type -#[derive(Copy, Clone, Eq, PartialEq, Debug)] +#[derive(Debug, Eq, PartialEq)] +pub enum TypeDiff { + // usize == 0: Same type + // usize >= 1: Different but compatible. The smaller, the more compatible. + Compatible(usize), + Incompatible, +} + +#[derive(Copy, Clone, Eq, Hash, PartialEq, Debug)] pub enum TempMapping { - MapToStack, // Normal stack value - MapToSelf, // Temp maps to the self operand - MapToLocal(u8), // Temp maps to a local variable with index - //ConstMapping, // Small constant (0, 1, 2, Qnil, Qfalse, Qtrue) + MapToStack(Type), + MapToSelf, + MapToLocal(u8), } impl Default for TempMapping { fn default() -> Self { - MapToStack + TempMapping::MapToStack(Type::default()) + } +} + +impl TempMapping { + /// Return TempMapping without type information in MapToStack + pub fn without_type(&self) -> TempMapping { + match self { + MapToStack(_) => TempMapping::MapToStack(Type::default()), + _ => *self, + } } } @@ -282,35 +333,883 @@ pub enum YARVOpnd { SelfOpnd, // Temporary stack operand with stack index - StackOpnd(u16), + StackOpnd(u8), +} + +impl From<Opnd> for YARVOpnd { + fn from(value: Opnd) -> Self { + match value { + Opnd::Stack { idx, .. } => StackOpnd(idx.try_into().unwrap()), + _ => unreachable!("{:?} cannot be converted to YARVOpnd", value) + } + } +} + +/// Number of registers that can be used for stack temps or locals +pub const MAX_MAPPED_REGS: usize = 5; + +/// A stack slot or a local variable. u8 represents the index of it (<= 8). +#[derive(Copy, Clone, Eq, Hash, PartialEq, Debug)] +pub enum RegOpnd { + Stack(u8), + Local(u8), +} + +/// RegMappings manages a set of registers used for stack temps and locals. +/// Each element of the array represents each of the registers. +/// If an element is Some, the stack temp or the local uses a register. +/// +/// Note that Opnd::InsnOut uses a separate set of registers at the moment. +#[derive(Copy, Clone, Default, Eq, Hash, PartialEq)] +pub struct RegMapping([Option<RegOpnd>; MAX_MAPPED_REGS]); + +impl RegMapping { + /// Return the index of the register for a given operand if allocated. + pub fn get_reg(&self, opnd: RegOpnd) -> Option<usize> { + self.0.iter().enumerate() + .find(|(_, ®_opnd)| reg_opnd == Some(opnd)) + .map(|(reg_idx, _)| reg_idx) + } + + /// Set a given operand to the register at a given index. 
+ pub fn set_reg(&mut self, opnd: RegOpnd, reg_idx: usize) { + assert!(self.0[reg_idx].is_none()); + self.0[reg_idx] = Some(opnd); + } + + /// Allocate a register for a given operand if available. + /// Return true if self is updated. + pub fn alloc_reg(&mut self, opnd: RegOpnd) -> bool { + // If a given opnd already has a register, skip allocation. + if self.get_reg(opnd).is_some() { + return false; + } + + // If the index is too large to encode with with 3 bits, give up. + match opnd { + RegOpnd::Stack(stack_idx) => if stack_idx >= MAX_CTX_TEMPS as u8 { + return false; + } + RegOpnd::Local(local_idx) => if local_idx >= MAX_CTX_LOCALS as u8 { + return false; + } + }; + + // Allocate a register if available. + if let Some(reg_idx) = self.find_unused_reg(opnd) { + self.0[reg_idx] = Some(opnd); + return true; + } + false + } + + /// Deallocate a register for a given operand if in use. + /// Return true if self is updated. + pub fn dealloc_reg(&mut self, opnd: RegOpnd) -> bool { + for reg_opnd in self.0.iter_mut() { + if *reg_opnd == Some(opnd) { + *reg_opnd = None; + return true; + } + } + false + } + + /// Find an available register and return the index of it. + fn find_unused_reg(&self, opnd: RegOpnd) -> Option<usize> { + let num_regs = get_option!(num_temp_regs); + if num_regs == 0 { + return None; + } + assert!(num_regs <= MAX_MAPPED_REGS); + + // If the default index for the operand is available, use that to minimize + // discrepancies among Contexts. + let default_idx = match opnd { + RegOpnd::Stack(stack_idx) => stack_idx.as_usize() % num_regs, + RegOpnd::Local(local_idx) => num_regs - (local_idx.as_usize() % num_regs) - 1, + }; + if self.0[default_idx].is_none() { + return Some(default_idx); + } + + // If not, pick any other available register. Like default indexes, prefer + // lower indexes for Stack, and higher indexes for Local. + let mut index_temps = self.0.iter().enumerate(); + match opnd { + RegOpnd::Stack(_) => index_temps.find(|(_, reg_opnd)| reg_opnd.is_none()), + RegOpnd::Local(_) => index_temps.rev().find(|(_, reg_opnd)| reg_opnd.is_none()), + }.map(|(index, _)| index) + } + + /// Return a vector of RegOpnds that have an allocated register + pub fn get_reg_opnds(&self) -> Vec<RegOpnd> { + self.0.iter().filter_map(|®_opnd| reg_opnd).collect() + } + + /// Count the number of registers that store a different operand from `dst`. + pub fn diff(&self, dst: RegMapping) -> usize { + self.0.iter().enumerate().filter(|&(reg_idx, ®)| reg != dst.0[reg_idx]).count() + } } +impl fmt::Debug for RegMapping { + /// Print `[None, ...]` instead of the default `RegMappings([None, ...])` + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + write!(fmt, "{:?}", self.0) + } +} + +/// Maximum value of the chain depth (should fit in 5 bits) +const CHAIN_DEPTH_MAX: u8 = 0b11111; // 31 + /// Code generation context /// Contains information we can use to specialize/optimize code -/// There are a lot of context objects so we try to keep the size small. 
-#[derive(Clone, Default, PartialEq, Debug)] +#[derive(Copy, Clone, Default, Eq, Hash, PartialEq, Debug)] pub struct Context { // Number of values currently on the temporary stack - stack_size: u16, + stack_size: u8, // Offset of the JIT SP relative to the interpreter SP // This represents how far the JIT's SP is from the "real" SP - sp_offset: i16, + sp_offset: i8, + + /// Which stack temps or locals are in a register + reg_mapping: RegMapping, // Depth of this block in the sidechain (eg: inline-cache chain) + // 6 bits, max 63 chain_depth: u8, - // Local variable types we keep track of - local_types: [Type; MAX_LOCAL_TYPES], + // Whether this code is the target of a JIT-to-JIT Ruby return ([Self::is_return_landing]) + is_return_landing: bool, - // Temporary variable types we keep track of - temp_types: [Type; MAX_TEMP_TYPES], + // Whether the compilation of this code has been deferred ([Self::is_deferred]) + is_deferred: bool, // Type we track for self self_type: Type, - // Mapping of temp stack entries to types we track - temp_mapping: [TempMapping; MAX_TEMP_TYPES], + // Local variable types we keep track of + local_types: [Type; MAX_CTX_LOCALS], + + // Temp mapping type/local_idx we track + temp_mapping: [TempMapping; MAX_CTX_TEMPS], + + /// A pointer to a block ISEQ supplied by the caller. 0 if not inlined. + inline_block: Option<IseqPtr>, +} + +#[derive(Clone)] +pub struct BitVector { + // Flat vector of bytes to write into + bytes: Vec<u8>, + + // Number of bits taken out of bytes allocated + num_bits: usize, +} + +impl BitVector { + pub fn new() -> Self { + Self { + bytes: Vec::with_capacity(4096), + num_bits: 0, + } + } + + #[allow(unused)] + pub fn num_bits(&self) -> usize { + self.num_bits + } + + // Total number of bytes taken + #[allow(unused)] + pub fn num_bytes(&self) -> usize { + (self.num_bits / 8) + if (self.num_bits % 8) != 0 { 1 } else { 0 } + } + + // Write/append an unsigned integer value + fn push_uint(&mut self, mut val: u64, mut num_bits: usize) { + assert!(num_bits <= 64); + + // Mask out bits above the number of bits requested + let mut val_bits = val; + if num_bits < 64 { + val_bits &= (1 << num_bits) - 1; + assert!(val == val_bits); + } + + // Number of bits encoded in the last byte + let rem_bits = self.num_bits % 8; + + // Encode as many bits as we can in this last byte + if rem_bits != 0 { + let num_enc = std::cmp::min(num_bits, 8 - rem_bits); + let bit_mask = (1 << num_enc) - 1; + let frac_bits = (val & bit_mask) << rem_bits; + let frac_bits: u8 = frac_bits.try_into().unwrap(); + let last_byte_idx = self.bytes.len() - 1; + self.bytes[last_byte_idx] |= frac_bits; + + self.num_bits += num_enc; + num_bits -= num_enc; + val >>= num_enc; + } + + // While we have bits left to encode + while num_bits > 0 { + // Grow with a 1.2x growth factor instead of 2x + assert!(self.num_bits % 8 == 0); + let num_bytes = self.num_bits / 8; + if num_bytes == self.bytes.capacity() { + self.bytes.reserve_exact(self.bytes.len() / 5); + } + + let bits = val & 0xFF; + let bits: u8 = bits.try_into().unwrap(); + self.bytes.push(bits); + + let bits_to_encode = std::cmp::min(num_bits, 8); + self.num_bits += bits_to_encode; + num_bits -= bits_to_encode; + val >>= bits_to_encode; + } + } + + fn push_u8(&mut self, val: u8) { + self.push_uint(val as u64, 8); + } + + fn push_u5(&mut self, val: u8) { + assert!(val <= 0b11111); + self.push_uint(val as u64, 5); + } + + fn push_u4(&mut self, val: u8) { + assert!(val <= 0b1111); + self.push_uint(val as u64, 4); + } + + fn push_u3(&mut self, val: 
u8) { + assert!(val <= 0b111); + self.push_uint(val as u64, 3); + } + + fn push_u2(&mut self, val: u8) { + assert!(val <= 0b11); + self.push_uint(val as u64, 2); + } + + fn push_u1(&mut self, val: u8) { + assert!(val <= 0b1); + self.push_uint(val as u64, 1); + } + + fn push_bool(&mut self, val: bool) { + self.push_u1(if val { 1 } else { 0 }); + } + + // Push a context encoding opcode + fn push_op(&mut self, op: CtxOp) { + self.push_u4(op as u8); + } + + // Read a uint value at a given bit index + // The bit index is incremented after the value is read + fn read_uint(&self, bit_idx: &mut usize, mut num_bits: usize) -> u64 { + let start_bit_idx = *bit_idx; + let mut cur_idx = *bit_idx; + + // Read the bits in the first byte + let bit_mod = cur_idx % 8; + let bits_in_byte = self.bytes[cur_idx / 8] >> bit_mod; + + let num_bits_in_byte = std::cmp::min(num_bits, 8 - bit_mod); + cur_idx += num_bits_in_byte; + num_bits -= num_bits_in_byte; + + let mut out_bits = (bits_in_byte as u64) & ((1 << num_bits_in_byte) - 1); + + // While we have bits left to read + while num_bits > 0 { + let num_bits_in_byte = std::cmp::min(num_bits, 8); + assert!(cur_idx % 8 == 0); + let byte = self.bytes[cur_idx / 8] as u64; + + let bits_in_byte = byte & ((1 << num_bits) - 1); + out_bits |= bits_in_byte << (cur_idx - start_bit_idx); + + // Move to the next byte/offset + cur_idx += num_bits_in_byte; + num_bits -= num_bits_in_byte; + } + + // Update the read index + *bit_idx = cur_idx; + + out_bits + } + + fn read_u8(&self, bit_idx: &mut usize) -> u8 { + self.read_uint(bit_idx, 8) as u8 + } + + fn read_u5(&self, bit_idx: &mut usize) -> u8 { + self.read_uint(bit_idx, 5) as u8 + } + + fn read_u4(&self, bit_idx: &mut usize) -> u8 { + self.read_uint(bit_idx, 4) as u8 + } + + fn read_u3(&self, bit_idx: &mut usize) -> u8 { + self.read_uint(bit_idx, 3) as u8 + } + + fn read_u2(&self, bit_idx: &mut usize) -> u8 { + self.read_uint(bit_idx, 2) as u8 + } + + fn read_u1(&self, bit_idx: &mut usize) -> u8 { + self.read_uint(bit_idx, 1) as u8 + } + + fn read_bool(&self, bit_idx: &mut usize) -> bool { + self.read_u1(bit_idx) != 0 + } + + fn read_op(&self, bit_idx: &mut usize) -> CtxOp { + unsafe { std::mem::transmute(self.read_u4(bit_idx)) } + } +} + +impl fmt::Debug for BitVector { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // We print the higher bytes first + for (idx, byte) in self.bytes.iter().enumerate().rev() { + write!(f, "{:08b}", byte)?; + + // Insert a separator between each byte + if idx > 0 { + write!(f, "|")?; + } + } + + Ok(()) + } +} + +#[cfg(test)] +mod bitvector_tests { + use super::*; + + #[test] + fn write_3() { + let mut arr = BitVector::new(); + arr.push_uint(3, 2); + assert!(arr.read_uint(&mut 0, 2) == 3); + } + + #[test] + fn write_11() { + let mut arr = BitVector::new(); + arr.push_uint(1, 1); + arr.push_uint(1, 1); + assert!(arr.read_uint(&mut 0, 2) == 3); + } + + #[test] + fn write_11_overlap() { + let mut arr = BitVector::new(); + arr.push_uint(0, 7); + arr.push_uint(3, 2); + arr.push_uint(1, 1); + + //dbg!(arr.read_uint(7, 2)); + assert!(arr.read_uint(&mut 7, 2) == 3); + } + + #[test] + fn write_ff_0() { + let mut arr = BitVector::new(); + arr.push_uint(0xFF, 8); + assert!(arr.read_uint(&mut 0, 8) == 0xFF); + } + + #[test] + fn write_ff_3() { + // Write 0xFF at bit index 3 + let mut arr = BitVector::new(); + arr.push_uint(0, 3); + arr.push_uint(0xFF, 8); + assert!(arr.read_uint(&mut 3, 8) == 0xFF); + } + + #[test] + fn write_ff_sandwich() { + // Write 0xFF sandwiched between zeros + let mut 
arr = BitVector::new(); + arr.push_uint(0, 3); + arr.push_u8(0xFF); + arr.push_uint(0, 3); + assert!(arr.read_uint(&mut 3, 8) == 0xFF); + } + + #[test] + fn write_read_u32_max() { + let mut arr = BitVector::new(); + arr.push_uint(0xFF_FF_FF_FF, 32); + assert!(arr.read_uint(&mut 0, 32) == 0xFF_FF_FF_FF); + } + + #[test] + fn write_read_u32_max_64b() { + let mut arr = BitVector::new(); + arr.push_uint(0xFF_FF_FF_FF, 64); + assert!(arr.read_uint(&mut 0, 64) == 0xFF_FF_FF_FF); + } + + #[test] + fn write_read_u64_max() { + let mut arr = BitVector::new(); + arr.push_uint(u64::MAX, 64); + assert!(arr.read_uint(&mut 0, 64) == u64::MAX); + } + + #[test] + fn encode_default() { + let mut bits = BitVector::new(); + let ctx = Context::default(); + let start_idx = ctx.encode_into(&mut bits); + assert!(start_idx == 0); + assert!(bits.num_bits() > 0); + assert!(bits.num_bytes() > 0); + + // Make sure that the round trip matches the input + let ctx2 = Context::decode_from(&bits, 0); + assert!(ctx2 == ctx); + } + + #[test] + fn encode_default_2x() { + let mut bits = BitVector::new(); + + let ctx0 = Context::default(); + let idx0 = ctx0.encode_into(&mut bits); + + let mut ctx1 = Context::default(); + ctx1.reg_mapping = RegMapping([Some(RegOpnd::Stack(0)), None, None, None, None]); + let idx1 = ctx1.encode_into(&mut bits); + + // Make sure that we can encode two contexts successively + let ctx0_dec = Context::decode_from(&bits, idx0); + let ctx1_dec = Context::decode_from(&bits, idx1); + assert!(ctx0_dec == ctx0); + assert!(ctx1_dec == ctx1); + } + + #[test] + fn regress_reg_mapping() { + let mut bits = BitVector::new(); + let mut ctx = Context::default(); + ctx.reg_mapping = RegMapping([Some(RegOpnd::Stack(0)), None, None, None, None]); + ctx.encode_into(&mut bits); + + let b0 = bits.read_u1(&mut 0); + assert!(b0 == 1); + + // Make sure that the round trip matches the input + let ctx2 = Context::decode_from(&bits, 0); + assert!(ctx2 == ctx); + } +} + +// Context encoding opcodes (4 bits) +#[derive(Debug, Copy, Clone)] +#[repr(u8)] +enum CtxOp { + // Self type (4 bits) + SetSelfType = 0, + + // Local idx (3 bits), temp type (4 bits) + SetLocalType, + + // Map stack temp to self with known type + // Temp idx (3 bits), known type (4 bits) + SetTempType, + + // Map stack temp to a local variable + // Temp idx (3 bits), local idx (3 bits) + MapTempLocal, + + // Map a stack temp to self + // Temp idx (3 bits) + MapTempSelf, + + // Set inline block pointer (8 bytes) + SetInlineBlock, + + // End of encoding + EndOfCode, +} + +// Number of entries in the context cache +const CTX_ENCODE_CACHE_SIZE: usize = 1024; +const CTX_DECODE_CACHE_SIZE: usize = 1024; + +// Cache of the last contexts encoded/decoded +// Empirically this saves a few percent of memory and speeds up compilation +// We can experiment with varying the size of this cache +pub type CtxEncodeCache = [(Context, u32); CTX_ENCODE_CACHE_SIZE]; +static mut CTX_ENCODE_CACHE: Option<Box<CtxEncodeCache>> = None; + +// Cache of the last contexts encoded/decoded +// This speeds up compilation +pub type CtxDecodeCache = [(Context, u32); CTX_DECODE_CACHE_SIZE]; +static mut CTX_DECODE_CACHE: Option<Box<CtxDecodeCache>> = None; + +// Size of the context cache in bytes +pub const CTX_ENCODE_CACHE_BYTES: usize = std::mem::size_of::<CtxEncodeCache>(); +pub const CTX_DECODE_CACHE_BYTES: usize = std::mem::size_of::<CtxDecodeCache>(); + +impl Context { + // Encode a context into the global context data, or return + // a cached previously encoded offset if one is found + pub 
fn encode(&self) -> u32 { + incr_counter!(num_contexts_encoded); + + if *self == Context::default() { + incr_counter!(context_cache_hits); + return 0; + } + + if let Some(idx) = Self::encode_cache_get(self) { + incr_counter!(context_cache_hits); + debug_assert!(Self::decode(idx) == *self); + return idx; + } + + let context_data = CodegenGlobals::get_context_data(); + + // Make sure we don't use offset 0 because + // it's is reserved for the default context + if context_data.num_bits() == 0 { + context_data.push_u1(0); + } + + let idx = self.encode_into(context_data); + let idx: u32 = idx.try_into().unwrap(); + + // Save this offset into the cache + Self::encode_cache_set(self, idx); + Self::decode_cache_set(self, idx); + + // In debug mode, check that the round-trip decoding always matches + debug_assert!(Self::decode(idx) == *self); + + idx + } + + pub fn decode(start_idx: u32) -> Context { + if start_idx == 0 { + return Context::default(); + }; + + if let Some(ctx) = Self::decode_cache_get(start_idx) { + return ctx; + } + + let context_data = CodegenGlobals::get_context_data(); + let ctx = Self::decode_from(context_data, start_idx as usize); + + Self::encode_cache_set(&ctx, start_idx); + Self::decode_cache_set(&ctx, start_idx); + + ctx + } + + // Store an entry in a cache of recently encoded/decoded contexts for encoding + fn encode_cache_set(ctx: &Context, idx: u32) + { + // Compute the hash for this context + let mut hasher = DefaultHasher::new(); + ctx.hash(&mut hasher); + let ctx_hash = hasher.finish() as usize; + + unsafe { + // Lazily initialize the context cache + if CTX_ENCODE_CACHE == None { + // Here we use the vec syntax to avoid allocating the large table on the stack, + // as this can cause a stack overflow + let tbl = vec![(Context::default(), 0); CTX_ENCODE_CACHE_SIZE].into_boxed_slice().try_into().unwrap(); + CTX_ENCODE_CACHE = Some(tbl); + } + + // Write a cache entry for this context + let cache = CTX_ENCODE_CACHE.as_mut().unwrap(); + cache[ctx_hash % CTX_ENCODE_CACHE_SIZE] = (*ctx, idx); + } + } + + // Store an entry in a cache of recently encoded/decoded contexts for decoding + fn decode_cache_set(ctx: &Context, idx: u32) { + unsafe { + // Lazily initialize the context cache + if CTX_DECODE_CACHE == None { + // Here we use the vec syntax to avoid allocating the large table on the stack, + // as this can cause a stack overflow + let tbl = vec![(Context::default(), 0); CTX_DECODE_CACHE_SIZE].into_boxed_slice().try_into().unwrap(); + CTX_DECODE_CACHE = Some(tbl); + } + + // Write a cache entry for this context + let cache = CTX_DECODE_CACHE.as_mut().unwrap(); + cache[idx as usize % CTX_DECODE_CACHE_SIZE] = (*ctx, idx); + } + } + + // Lookup the context in a cache of recently encoded/decoded contexts for encoding + fn encode_cache_get(ctx: &Context) -> Option<u32> + { + // Compute the hash for this context + let mut hasher = DefaultHasher::new(); + ctx.hash(&mut hasher); + let ctx_hash = hasher.finish() as usize; + + unsafe { + if CTX_ENCODE_CACHE == None { + return None; + } + + let cache = CTX_ENCODE_CACHE.as_mut().unwrap(); + + // Check that the context for this cache entry matches + let cache_entry = &cache[ctx_hash % CTX_ENCODE_CACHE_SIZE]; + if cache_entry.0 == *ctx { + debug_assert!(cache_entry.1 != 0); + return Some(cache_entry.1); + } + + return None; + } + } + + // Lookup the context in a cache of recently encoded/decoded contexts for decoding + fn decode_cache_get(start_idx: u32) -> Option<Context> { + unsafe { + if CTX_DECODE_CACHE == None { + return None; + } 
+ + let cache = CTX_DECODE_CACHE.as_mut().unwrap(); + + // Check that the start_idx for this cache entry matches + let cache_entry = &cache[start_idx as usize % CTX_DECODE_CACHE_SIZE]; + if cache_entry.1 == start_idx { + return Some(cache_entry.0); + } + + return None; + } + } + + // Encode into a compressed context representation in a bit vector + fn encode_into(&self, bits: &mut BitVector) -> usize { + let start_idx = bits.num_bits(); + + // Most of the time, the stack size is small and sp offset has the same value + if (self.stack_size as i64) == (self.sp_offset as i64) && self.stack_size < 4 { + // One single bit to signify a compact stack_size/sp_offset encoding + debug_assert!(self.sp_offset >= 0); + bits.push_u1(1); + bits.push_u2(self.stack_size); + } else { + // Full stack size encoding + bits.push_u1(0); + + // Number of values currently on the temporary stack + bits.push_u8(self.stack_size); + + // sp_offset: i8, + bits.push_u8(self.sp_offset as u8); + } + + // Which stack temps or locals are in a register + for &temp in self.reg_mapping.0.iter() { + if let Some(temp) = temp { + bits.push_u1(1); // Some + match temp { + RegOpnd::Stack(stack_idx) => { + bits.push_u1(0); // Stack + bits.push_u3(stack_idx); + } + RegOpnd::Local(local_idx) => { + bits.push_u1(1); // Local + bits.push_u3(local_idx); + } + } + } else { + bits.push_u1(0); // None + } + } + + bits.push_bool(self.is_deferred); + bits.push_bool(self.is_return_landing); + + // The chain depth is most often 0 or 1 + if self.chain_depth < 2 { + bits.push_u1(0); + bits.push_u1(self.chain_depth); + + } else { + bits.push_u1(1); + bits.push_u5(self.chain_depth); + } + + // Encode the self type if known + if self.self_type != Type::Unknown { + bits.push_op(CtxOp::SetSelfType); + bits.push_u4(self.self_type as u8); + } + + // Encode the local types if known + for local_idx in 0..MAX_CTX_LOCALS { + let t = self.get_local_type(local_idx); + if t != Type::Unknown { + bits.push_op(CtxOp::SetLocalType); + bits.push_u3(local_idx as u8); + bits.push_u4(t as u8); + } + } + + // Encode stack temps + for stack_idx in 0..MAX_CTX_TEMPS { + let mapping = self.get_temp_mapping(stack_idx); + + match mapping { + MapToStack(temp_type) => { + if temp_type != Type::Unknown { + // Temp idx (3 bits), known type (4 bits) + bits.push_op(CtxOp::SetTempType); + bits.push_u3(stack_idx as u8); + bits.push_u4(temp_type as u8); + } + } + + MapToLocal(local_idx) => { + bits.push_op(CtxOp::MapTempLocal); + bits.push_u3(stack_idx as u8); + bits.push_u3(local_idx); + } + + MapToSelf => { + // Temp idx (3 bits) + bits.push_op(CtxOp::MapTempSelf); + bits.push_u3(stack_idx as u8); + } + } + } + + // Inline block pointer + if let Some(iseq) = self.inline_block { + bits.push_op(CtxOp::SetInlineBlock); + bits.push_uint(iseq as u64, 64); + } + + // TODO: should we add an op for end-of-encoding, + // or store num ops at the beginning? 
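        // Editorial worked example (not in the original source): a context with
        // stack_size == 2, sp_offset == 2, no registers mapped, both flags
        // false, chain_depth == 0, and no known self/local/temp types takes the
        // compact stack encoding above, so it costs 1 + 2 bits for
        // stack_size/sp_offset, 5 bits for the five empty register slots,
        // 2 flag bits, 2 bits of chain depth, and the 4-bit EndOfCode opcode
        // pushed below, for 16 bits in total.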
+ bits.push_op(CtxOp::EndOfCode); + + start_idx + } + + // Decode a compressed context representation from a bit vector + fn decode_from(bits: &BitVector, start_idx: usize) -> Context { + let mut ctx = Context::default(); + + let mut idx = start_idx; + + // Small vs large stack size encoding + if bits.read_u1(&mut idx) == 1 { + ctx.stack_size = bits.read_u2(&mut idx); + ctx.sp_offset = ctx.stack_size as i8; + } else { + ctx.stack_size = bits.read_u8(&mut idx); + let sp_offset_bits = bits.read_u8(&mut idx); + ctx.sp_offset = sp_offset_bits as i8; + + // If the top bit is set, then the sp offset must be negative + debug_assert!(!( (sp_offset_bits & 0x80) != 0 && ctx.sp_offset > 0 )); + } + + // Which stack temps or locals are in a register + for index in 0..MAX_MAPPED_REGS { + if bits.read_u1(&mut idx) == 1 { // Some + let temp = if bits.read_u1(&mut idx) == 0 { // RegMapping::Stack + RegOpnd::Stack(bits.read_u3(&mut idx)) + } else { + RegOpnd::Local(bits.read_u3(&mut idx)) + }; + ctx.reg_mapping.0[index] = Some(temp); + } + } + + ctx.is_deferred = bits.read_bool(&mut idx); + ctx.is_return_landing = bits.read_bool(&mut idx); + + if bits.read_u1(&mut idx) == 0 { + ctx.chain_depth = bits.read_u1(&mut idx) + } else { + ctx.chain_depth = bits.read_u5(&mut idx) + } + + loop { + //println!("reading op"); + let op = bits.read_op(&mut idx); + //println!("got op {:?}", op); + + match op { + CtxOp::SetSelfType => { + ctx.self_type = unsafe { transmute(bits.read_u4(&mut idx)) }; + } + + CtxOp::SetLocalType => { + let local_idx = bits.read_u3(&mut idx) as usize; + let t = unsafe { transmute(bits.read_u4(&mut idx)) }; + ctx.set_local_type(local_idx, t); + } + + // Map temp to stack (known type) + CtxOp::SetTempType => { + let temp_idx = bits.read_u3(&mut idx) as usize; + let temp_type = unsafe { transmute(bits.read_u4(&mut idx)) }; + ctx.set_temp_mapping(temp_idx, TempMapping::MapToStack(temp_type)); + } + + // Map temp to local + CtxOp::MapTempLocal => { + let temp_idx = bits.read_u3(&mut idx) as usize; + let local_idx = bits.read_u3(&mut idx); + ctx.set_temp_mapping(temp_idx, TempMapping::MapToLocal(local_idx)); + } + + // Map temp to self + CtxOp::MapTempSelf => { + let temp_idx = bits.read_u3(&mut idx) as usize; + ctx.set_temp_mapping(temp_idx, TempMapping::MapToSelf); + } + + // Inline block pointer + CtxOp::SetInlineBlock => { + ctx.inline_block = Some(bits.read_uint(&mut idx, 64) as IseqPtr); + } + + CtxOp::EndOfCode => break, + } + } + + ctx + } } /// Tuple of (iseq, idx) used to identify basic blocks @@ -322,7 +1221,7 @@ pub struct BlockId { pub iseq: IseqPtr, /// Index in the iseq where the block starts - pub idx: u32, + pub idx: u16, } /// Branch code shape enumeration @@ -333,12 +1232,127 @@ pub enum BranchShape { Default, // Neither target is next } -// Branch code generation function signature -type BranchGenFn = - fn(cb: &mut Assembler, target0: CodePtr, target1: Option<CodePtr>, shape: BranchShape) -> (); +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum BranchGenFn { + BranchIf(Cell<BranchShape>), + BranchNil(Cell<BranchShape>), + BranchUnless(Cell<BranchShape>), + JumpToTarget0(Cell<BranchShape>), + JNZToTarget0, + JZToTarget0, + JBEToTarget0, + JBToTarget0, + JOMulToTarget0, + JITReturn, +} + +impl BranchGenFn { + pub fn call(&self, asm: &mut Assembler, target0: Target, target1: Option<Target>) { + match self { + BranchGenFn::BranchIf(shape) => { + match shape.get() { + BranchShape::Next0 => asm.jz(target1.unwrap()), + BranchShape::Next1 => asm.jnz(target0), + BranchShape::Default => { 
+ asm.jnz(target0); + asm.jmp(target1.unwrap()); + } + } + } + BranchGenFn::BranchNil(shape) => { + match shape.get() { + BranchShape::Next0 => asm.jne(target1.unwrap()), + BranchShape::Next1 => asm.je(target0), + BranchShape::Default => { + asm.je(target0); + asm.jmp(target1.unwrap()); + } + } + } + BranchGenFn::BranchUnless(shape) => { + match shape.get() { + BranchShape::Next0 => asm.jnz(target1.unwrap()), + BranchShape::Next1 => asm.jz(target0), + BranchShape::Default => { + asm.jz(target0); + asm.jmp(target1.unwrap()); + } + } + } + BranchGenFn::JumpToTarget0(shape) => { + if shape.get() == BranchShape::Next1 { + panic!("Branch shape Next1 not allowed in JumpToTarget0!"); + } + if shape.get() == BranchShape::Default { + asm.jmp(target0); + } + } + BranchGenFn::JNZToTarget0 => { + asm.jnz(target0) + } + BranchGenFn::JZToTarget0 => { + asm.jz(target0) + } + BranchGenFn::JBEToTarget0 => { + asm.jbe(target0) + } + BranchGenFn::JBToTarget0 => { + asm.jb(target0) + } + BranchGenFn::JOMulToTarget0 => { + asm.jo_mul(target0) + } + BranchGenFn::JITReturn => { + asm_comment!(asm, "update cfp->jit_return"); + let jit_return = RUBY_OFFSET_CFP_JIT_RETURN - RUBY_SIZEOF_CONTROL_FRAME as i32; + let raw_ptr = asm.lea_jump_target(target0); + asm.mov(Opnd::mem(64, CFP, jit_return), raw_ptr); + } + } + } + + pub fn get_shape(&self) -> BranchShape { + match self { + BranchGenFn::BranchIf(shape) | + BranchGenFn::BranchNil(shape) | + BranchGenFn::BranchUnless(shape) | + BranchGenFn::JumpToTarget0(shape) => shape.get(), + BranchGenFn::JNZToTarget0 | + BranchGenFn::JZToTarget0 | + BranchGenFn::JBEToTarget0 | + BranchGenFn::JBToTarget0 | + BranchGenFn::JOMulToTarget0 | + BranchGenFn::JITReturn => BranchShape::Default, + } + } + + pub fn set_shape(&self, new_shape: BranchShape) { + match self { + BranchGenFn::BranchIf(shape) | + BranchGenFn::BranchNil(shape) | + BranchGenFn::BranchUnless(shape) => { + shape.set(new_shape); + } + BranchGenFn::JumpToTarget0(shape) => { + if new_shape == BranchShape::Next1 { + panic!("Branch shape Next1 not allowed in JumpToTarget0!"); + } + shape.set(new_shape); + } + BranchGenFn::JNZToTarget0 | + BranchGenFn::JZToTarget0 | + BranchGenFn::JBEToTarget0 | + BranchGenFn::JBToTarget0 | + BranchGenFn::JOMulToTarget0 | + BranchGenFn::JITReturn => { + assert_eq!(new_shape, BranchShape::Default); + } + } + } +} /// A place that a branch could jump to -#[derive(Debug)] +#[derive(Debug, Clone)] enum BranchTarget { Stub(Box<BranchStub>), // Not compiled yet Block(BlockRef), // Already compiled @@ -348,88 +1362,255 @@ impl BranchTarget { fn get_address(&self) -> Option<CodePtr> { match self { BranchTarget::Stub(stub) => stub.address, - BranchTarget::Block(blockref) => blockref.borrow().start_addr, + BranchTarget::Block(blockref) => Some(unsafe { blockref.as_ref() }.start_addr), } } fn get_blockid(&self) -> BlockId { match self { - BranchTarget::Stub(stub) => stub.id, - BranchTarget::Block(blockref) => blockref.borrow().blockid, + BranchTarget::Stub(stub) => BlockId { iseq: stub.iseq.get(), idx: stub.iseq_idx }, + BranchTarget::Block(blockref) => unsafe { blockref.as_ref() }.get_blockid(), } } - fn get_ctx(&self) -> Context { + fn get_ctx(&self) -> u32 { match self { - BranchTarget::Stub(stub) => stub.ctx.clone(), - BranchTarget::Block(blockref) => blockref.borrow().ctx.clone(), + BranchTarget::Stub(stub) => stub.ctx, + BranchTarget::Block(blockref) => unsafe { blockref.as_ref() }.ctx, } } fn get_block(&self) -> Option<BlockRef> { match self { BranchTarget::Stub(_) => None, - 
BranchTarget::Block(blockref) => Some(blockref.clone()), + BranchTarget::Block(blockref) => Some(*blockref), } } - fn set_iseq(&mut self, iseq: IseqPtr) { + fn set_iseq(&self, iseq: IseqPtr) { match self { - BranchTarget::Stub(stub) => stub.id.iseq = iseq, - BranchTarget::Block(blockref) => blockref.borrow_mut().blockid.iseq = iseq, + BranchTarget::Stub(stub) => stub.iseq.set(iseq), + BranchTarget::Block(blockref) => unsafe { blockref.as_ref() }.iseq.set(iseq), } } } -#[derive(Debug)] +#[derive(Debug, Clone)] struct BranchStub { address: Option<CodePtr>, - id: BlockId, - ctx: Context, + iseq: Cell<IseqPtr>, + iseq_idx: IseqIdx, + ctx: u32, } /// Store info about an outgoing branch in a code segment /// Note: care must be taken to minimize the size of branch objects -struct Branch { +pub struct Branch { // Block this is attached to - block: BlockRef, + block: Cell<BlockRef>, // Positions where the generated code starts and ends - start_addr: Option<CodePtr>, - end_addr: Option<CodePtr>, // exclusive + start_addr: CodePtr, + end_addr: Cell<CodePtr>, // exclusive // Branch target blocks and their contexts - targets: [Option<Box<BranchTarget>>; 2], + targets: [Cell<Option<Box<BranchTarget>>>; 2], // Branch code generation function gen_fn: BranchGenFn, +} + +/// A [Branch] for a [Block] that is under construction. +/// Fields correspond, but may be `None` during construction. +pub struct PendingBranch { + /// Allocation holder for the address of the constructed branch + /// in error paths Box deallocates it. + uninit_branch: Box<MaybeUninit<Branch>>, + + /// Branch code generation function + gen_fn: BranchGenFn, + + /// Positions where the generated code starts and ends + start_addr: Cell<Option<CodePtr>>, + end_addr: Cell<Option<CodePtr>>, // exclusive + + /// Branch target blocks and their contexts + targets: [Cell<Option<Box<BranchTarget>>>; 2], +} + +impl Branch { + // Compute the size of the branch code + fn code_size(&self) -> usize { + (self.end_addr.get().as_offset() - self.start_addr.as_offset()) as usize + } + + /// Get the address of one of the branch destination + fn get_target_address(&self, target_idx: usize) -> Option<CodePtr> { + unsafe { + self.targets[target_idx] + .ref_unchecked() + .as_ref() + .and_then(|target| target.get_address()) + } + } + + fn get_stub_count(&self) -> usize { + let mut count = 0; + for target in self.targets.iter() { + if unsafe { + // SAFETY: no mutation + matches!( + target.ref_unchecked().as_ref().map(Box::as_ref), + Some(BranchTarget::Stub(_)) + ) + } { + count += 1; + } + } + count + } - // Shape of the branch - shape: BranchShape, + fn assert_layout(&self) { + let shape = self.gen_fn.get_shape(); + assert!( + !(shape == BranchShape::Default && 0 == self.code_size()), + "zero-size branches are incorrect when code for neither targets are adjacent" + // One needs to issue some instruction to steer to the branch target + // when falling through isn't an option. + ); + } } impl std::fmt::Debug for Branch { + // Can't derive this because `targets: !Copy` due to Cell. fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // TODO: expand this if needed. #[derive(Debug)] on Branch gave a - // strange error related to BranchGenFn + let targets = unsafe { + // SAFETY: + // While the references are live for the result of this function, + // no mutation happens because we are only calling derived fmt::Debug functions. 
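// ---- Editor's sketch (not part of the diff) ----
// The intent behind BranchShape, as used by BranchGenFn::call() and checked by
// assert_layout() above: when one of the two targets is generated immediately after the
// branch, the branch falls through to it and only the other target needs an explicit jump.
// Shape and emit() are hypothetical stand-ins for the Assembler jz/jnz/jmp calls.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum Shape {
    Next0,   // target0 is the next block in memory: fall through to it
    Next1,   // target1 is the next block in memory: fall through to it
    Default, // neither is adjacent: both sides need an explicit jump
}

// Pseudo-instructions a two-way "branch if nonzero" has to emit for each shape.
fn emit(shape: Shape) -> Vec<&'static str> {
    match shape {
        // Invert the condition and jump to target1; target0 is reached by falling through.
        Shape::Next0 => vec!["jz target1"],
        // Keep the condition and jump to target0; target1 is reached by falling through.
        Shape::Next1 => vec!["jnz target0"],
        // No fallthrough available: a conditional jump plus an unconditional one.
        Shape::Default => vec!["jnz target0", "jmp target1"],
    }
}
// ---- end of editor's sketch ----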
+ [self.targets[0].as_ptr().as_ref().unwrap(), self.targets[1].as_ptr().as_ref().unwrap()] + }; + formatter .debug_struct("Branch") + .field("block", &self.block) .field("start", &self.start_addr) .field("end", &self.end_addr) - .field("targets", &self.targets) + .field("targets", &targets) + .field("gen_fn", &self.gen_fn) .finish() } } -impl Branch { - // Compute the size of the branch code - fn code_size(&self) -> usize { - (self.end_addr.unwrap().raw_ptr() as usize) - (self.start_addr.unwrap().raw_ptr() as usize) +impl PendingBranch { + /// Set up a branch target at `target_idx`. Find an existing block to branch to + /// or generate a stub for one. + #[must_use] + fn set_target( + &self, + target_idx: u32, + target: BlockId, + ctx: &Context, + jit: &mut JITState, + ) -> Option<CodePtr> { + // If the block already exists + if let Some(blockref) = find_block_version(target, ctx) { + let block = unsafe { blockref.as_ref() }; + + // Fill out the target with this block + self.targets[target_idx.as_usize()] + .set(Some(Box::new(BranchTarget::Block(blockref)))); + return Some(block.start_addr); + } + + // Compress/encode the context + let ctx = Context::encode(ctx); + + // The branch struct is uninitialized right now but as a stable address. + // We make sure the stub runs after the branch is initialized. + let branch_struct_addr = self.uninit_branch.as_ptr() as usize; + let stub_addr = gen_branch_stub(ctx, jit.iseq, jit.get_ocb(), branch_struct_addr, target_idx); + + if let Some(stub_addr) = stub_addr { + // Fill the branch target with a stub + self.targets[target_idx.as_usize()].set(Some(Box::new(BranchTarget::Stub(Box::new(BranchStub { + address: Some(stub_addr), + iseq: Cell::new(target.iseq), + iseq_idx: target.idx, + ctx, + }))))); + } + + stub_addr } - /// Get the address of one of the branch destination - fn get_target_address(&self, target_idx: usize) -> Option<CodePtr> { - self.targets[target_idx].as_ref().and_then(|target| target.get_address()) + // Construct the branch and wire it up in the grpah + fn into_branch(mut self, uninit_block: BlockRef) -> BranchRef { + // Make the branch + let branch = Branch { + block: Cell::new(uninit_block), + start_addr: self.start_addr.get().unwrap(), + end_addr: Cell::new(self.end_addr.get().unwrap()), + targets: self.targets, + gen_fn: self.gen_fn, + }; + // Move it to the designated place on + // the heap and unwrap MaybeUninit. + self.uninit_branch.write(branch); + let raw_branch: *mut MaybeUninit<Branch> = Box::into_raw(self.uninit_branch); + let branchref = NonNull::new(raw_branch as *mut Branch).expect("no null from Box"); + + // SAFETY: just allocated it + let branch = unsafe { branchref.as_ref() }; + // For block branch targets, put the new branch in the + // appropriate incoming list. + for target in branch.targets.iter() { + // SAFETY: no mutation + let out_block: Option<BlockRef> = unsafe { + target.ref_unchecked().as_ref().and_then(|target| target.get_block()) + }; + + if let Some(out_block) = out_block { + // SAFETY: These blockrefs come from set_target() which only puts blocks from + // ISeqs, which are all initialized. Note that uninit_block isn't in any ISeq + // payload yet. 
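// ---- Editor's sketch (not part of the diff) ----
// The allocate-first, initialize-later pattern that PendingBranch/into_branch() uses above.
// The heap slot's address is stable before the value is written, so it can be handed out
// (e.g. baked into a branch stub) ahead of time. Payload and make_payload are hypothetical.
use std::mem::MaybeUninit;
use std::ptr::NonNull;

struct Payload {
    value: u64,
}

fn make_payload(value: u64) -> NonNull<Payload> {
    // Reserve heap space without constructing a Payload yet. The slot's address is already
    // final even though its contents are still uninitialized.
    let mut uninit: Box<MaybeUninit<Payload>> = Box::new(MaybeUninit::uninit());
    let stable_addr = uninit.as_ptr() as usize;

    // ...the address could now be published before the value exists...

    // Fill the slot, then give up ownership of the Box. The allocation stays live until
    // someone reconstructs it with Box::from_raw and drops it, as rb_yjit_iseq_free does
    // for Entry objects further down.
    uninit.write(Payload { value });
    let raw: *mut MaybeUninit<Payload> = Box::into_raw(uninit);
    let ptr = NonNull::new(raw as *mut Payload).expect("Box never returns null");

    assert_eq!(ptr.as_ptr() as usize, stable_addr);
    ptr
}
// ---- end of editor's sketch ----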
+ unsafe { out_block.as_ref() }.incoming.push(branchref); + } + } + + branch.assert_layout(); + incr_counter!(compiled_branch_count); + + branchref + } +} + +// Store info about code used on YJIT entry +pub struct Entry { + // Positions where the generated code starts and ends + start_addr: CodePtr, + end_addr: CodePtr, // exclusive +} + +/// A [Branch] for a [Block] that is under construction. +pub struct PendingEntry { + pub uninit_entry: Box<MaybeUninit<Entry>>, + start_addr: Cell<Option<CodePtr>>, + end_addr: Cell<Option<CodePtr>>, // exclusive +} + +impl PendingEntry { + // Construct the entry in the heap + pub fn into_entry(mut self) -> EntryRef { + // Make the entry + let entry = Entry { + start_addr: self.start_addr.get().unwrap(), + end_addr: self.end_addr.get().unwrap(), + }; + // Move it to the designated place on the heap and unwrap MaybeUninit. + self.uninit_entry.write(entry); + let raw_entry: *mut MaybeUninit<Entry> = Box::into_raw(self.uninit_entry); + NonNull::new(raw_entry as *mut Entry).expect("no null from Box") } } @@ -441,50 +1622,69 @@ pub type CmePtr = *const rb_callable_method_entry_t; /// Note: care must be taken to minimize the size of block_t objects #[derive(Debug)] pub struct Block { - // Bytecode sequence (iseq, idx) this is a version of - blockid: BlockId, + // The byte code instruction sequence this is a version of. + // Can change due to moving GC. + iseq: Cell<IseqPtr>, - // Index one past the last instruction for this block in the iseq - end_idx: u32, + // Index range covered by this version in `ISEQ_BODY(iseq)->iseq_encoded`. + iseq_range: Range<IseqIdx>, // Context at the start of the block // This should never be mutated - ctx: Context, + ctx: u32, // Positions where the generated code starts and ends - start_addr: Option<CodePtr>, - end_addr: Option<CodePtr>, + start_addr: CodePtr, + end_addr: Cell<CodePtr>, // List of incoming branches (from predecessors) - // These are reference counted (ownership shared between predecessor and successors) - incoming: Vec<BranchRef>, + incoming: MutableBranchList, - // NOTE: we might actually be able to store the branches here without refcounting - // however, using a RefCell makes it easy to get a pointer to Branch objects - // // List of outgoing branches (to successors) - outgoing: Vec<BranchRef>, + // Infrequently mutated for control flow graph edits for saving memory. + outgoing: MutableBranchList, // FIXME: should these be code pointers instead? // Offsets for GC managed objects in the mainline code block - gc_obj_offsets: Vec<u32>, + gc_obj_offsets: Box<[u32]>, // CME dependencies of this block, to help to remove all pointers to this // block in the system. - cme_dependencies: Vec<CmePtr>, + cme_dependencies: Box<[Cell<CmePtr>]>, // Code address of an exit for `ctx` and `blockid`. // Used for block invalidation. - pub entry_exit: Option<CodePtr>, + entry_exit: Option<CodePtr>, } -/// Reference-counted pointer to a block that can be borrowed mutably. -/// Wrapped so we could implement [Hash] and [Eq] for use with stdlib collections. -#[derive(Debug)] -pub struct BlockRef(Rc<RefCell<Block>>); - -/// Reference-counted pointer to a branch that can be borrowed mutably -type BranchRef = Rc<RefCell<Branch>>; +/// Pointer to a [Block]. +/// +/// # Safety +/// +/// _Never_ derive a `&mut Block` from this and always use +/// [std::ptr::NonNull::as_ref] to get a `&Block`. `&'a mut` +/// in Rust asserts that there are no other references live +/// over the lifetime `'a`. 
This uniqueness assertion does +/// not hold in many situations for us, even when you ignore +/// the fact that our control flow graph can have cycles. +/// Here are just two examples where we have overlapping references: +/// - Yielding to a different OS thread within the same +/// ractor during compilation +/// - The GC calling [rb_yjit_iseq_mark] during compilation +/// +/// Technically, for soundness, we also need to ensure that +/// the we have the VM lock while the result of `as_ref()` +/// is live, so that no deallocation happens while the +/// shared reference is live. The vast majority of our code run while +/// holding the VM lock, though. +pub type BlockRef = NonNull<Block>; + +/// Pointer to a [Branch]. See [BlockRef] for notes about +/// proper usage. +pub type BranchRef = NonNull<Branch>; + +/// Pointer to an entry that is already added to an ISEQ +pub type EntryRef = NonNull<Entry>; /// List of block versions for a given blockid type VersionList = Vec<BlockRef>; @@ -493,48 +1693,53 @@ type VersionList = Vec<BlockRef>; /// An instance of this is stored on each iseq type VersionMap = Vec<VersionList>; -impl BlockRef { - /// Constructor - pub fn new(rc: Rc<RefCell<Block>>) -> Self { - Self(rc) +/// [Interior mutability][1] wrapper for a list of branches. +/// O(n) insertion, but space efficient. We generally expect +/// blocks to have only a few branches. +/// +/// [1]: https://doc.rust-lang.org/std/cell/struct.UnsafeCell.html +#[repr(transparent)] +struct MutableBranchList(Cell<Box<[BranchRef]>>); + +impl MutableBranchList { + fn push(&self, branch: BranchRef) { + // Temporary move the boxed slice out of self. + // oom=abort is load bearing here... + let mut current_list = self.0.take().into_vec(); + current_list.push(branch); + self.0.set(current_list.into_boxed_slice()); } - /// Borrow the block through [RefCell]. - pub fn borrow(&self) -> Ref<'_, Block> { - self.0.borrow() - } - - /// Borrow the block for mutation through [RefCell]. - pub fn borrow_mut(&self) -> RefMut<'_, Block> { - self.0.borrow_mut() + /// Iterate through branches in the list by moving out of the cell + /// and then putting it back when done. Modifications to this cell + /// during iteration will be discarded. + /// + /// Assumes panic=abort since panic=unwind during iteration would + /// leave the cell empty. + fn for_each(&self, mut f: impl FnMut(BranchRef)) { + let list = self.0.take(); + for branch in list.iter() { + f(*branch); + } + self.0.set(list); } -} -impl Clone for BlockRef { - /// Clone the [Rc] - fn clone(&self) -> Self { - Self(self.0.clone()) + /// Length of the list. + fn len(&self) -> usize { + // SAFETY: No cell mutation inside unsafe. 
+ unsafe { self.0.ref_unchecked().len() } } } -impl Hash for BlockRef { - /// Hash the reference by hashing the pointer - fn hash<H: Hasher>(&self, state: &mut H) { - let rc_ptr = Rc::as_ptr(&self.0); - rc_ptr.hash(state); - } -} +impl fmt::Debug for MutableBranchList { + fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + // SAFETY: the derived Clone for boxed slices does not mutate this Cell + let branches = unsafe { self.0.ref_unchecked().clone() }; -impl PartialEq for BlockRef { - /// Equality defined by allocation identity - fn eq(&self, other: &Self) -> bool { - Rc::ptr_eq(&self.0, &other.0) + formatter.debug_list().entries(branches.iter()).finish() } } -/// It's comparison by identity so all the requirements are statisfied -impl Eq for BlockRef {} - /// This is all the data YJIT stores on an iseq /// This will be dynamically allocated by C code /// C code should pass an &mut IseqPayload to us @@ -542,11 +1747,14 @@ impl Eq for BlockRef {} #[derive(Default)] pub struct IseqPayload { // Basic block versions - version_map: VersionMap, + pub version_map: VersionMap, - // Indexes of code pages used by this this ISEQ + // Indexes of code pages used by this ISEQ pub pages: HashSet<usize>, + // List of ISEQ entry codes + pub entries: Vec<EntryRef>, + // Blocks that are invalidated but are not yet deallocated. // The code GC will free them later. pub dead_blocks: Vec<BlockRef>, @@ -605,17 +1813,19 @@ pub fn get_or_create_iseq_payload(iseq: IseqPtr) -> &'static mut IseqPayload { /// Iterate over all existing ISEQs pub fn for_each_iseq<F: FnMut(IseqPtr)>(mut callback: F) { unsafe extern "C" fn callback_wrapper(iseq: IseqPtr, data: *mut c_void) { - let callback: &mut &mut dyn FnMut(IseqPtr) -> bool = std::mem::transmute(&mut *data); + // SAFETY: points to the local below + let callback: &mut &mut dyn FnMut(IseqPtr) -> bool = unsafe { std::mem::transmute(&mut *data) }; callback(iseq); } let mut data: &mut dyn FnMut(IseqPtr) = &mut callback; - unsafe { rb_yjit_for_each_iseq(Some(callback_wrapper), (&mut data) as *mut _ as *mut c_void) }; + unsafe { rb_jit_for_each_iseq(Some(callback_wrapper), (&mut data) as *mut _ as *mut c_void) }; } /// Iterate over all on-stack ISEQs pub fn for_each_on_stack_iseq<F: FnMut(IseqPtr)>(mut callback: F) { unsafe extern "C" fn callback_wrapper(iseq: IseqPtr, data: *mut c_void) { - let callback: &mut &mut dyn FnMut(IseqPtr) -> bool = std::mem::transmute(&mut *data); + // SAFETY: points to the local below + let callback: &mut &mut dyn FnMut(IseqPtr) -> bool = unsafe { std::mem::transmute(&mut *data) }; callback(iseq); } let mut data: &mut dyn FnMut(IseqPtr) = &mut callback; @@ -633,23 +1843,34 @@ pub fn for_each_on_stack_iseq_payload<F: FnMut(&IseqPayload)>(mut callback: F) { /// Iterate over all NOT on-stack ISEQ payloads pub fn for_each_off_stack_iseq_payload<F: FnMut(&mut IseqPayload)>(mut callback: F) { - let mut on_stack_iseqs: Vec<IseqPtr> = vec![]; - for_each_on_stack_iseq(|iseq| { - on_stack_iseqs.push(iseq); - }); - for_each_iseq(|iseq| { + // Get all ISEQs on the heap. Note that rb_objspace_each_objects() runs GC first, + // which could move ISEQ pointers when GC.auto_compact = true. + // So for_each_on_stack_iseq() must be called after this, which doesn't run GC. + let mut iseqs: Vec<IseqPtr> = vec![]; + for_each_iseq(|iseq| iseqs.push(iseq)); + + // Get all ISEQs that are on a CFP of existing ECs. 
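// ---- Editor's sketch (not part of the diff) ----
// The take-modify-put-back idiom behind MutableBranchList above. A Cell<Box<[T]>> gives
// interior mutability without RefCell's runtime borrow flag and without the spare capacity a
// Vec keeps around. CellList is a hypothetical stand-in; the ref_unchecked() helper used above
// is avoided here so the sketch stays in safe Rust.
use std::cell::Cell;

#[derive(Default)]
struct CellList(Cell<Box<[u32]>>);

impl CellList {
    // O(n) push: move the slice out, grow it, move it back.
    fn push(&self, item: u32) {
        let mut items = self.0.take().into_vec();
        items.push(item);
        self.0.set(items.into_boxed_slice());
    }

    // Iterate by temporarily taking ownership; pushes made through `self` during the
    // callback would be discarded when the list is put back, just as documented above.
    fn for_each(&self, mut f: impl FnMut(u32)) {
        let items = self.0.take();
        for &item in items.iter() {
            f(item);
        }
        self.0.set(items);
    }
}

fn cell_list_demo() -> u32 {
    let list = CellList::default();
    list.push(1);
    list.push(2);
    let mut sum = 0;
    list.for_each(|x| sum += x);
    sum // 3
}
// ---- end of editor's sketch ----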
+ let mut on_stack_iseqs: HashSet<IseqPtr> = HashSet::new(); + for_each_on_stack_iseq(|iseq| { on_stack_iseqs.insert(iseq); }); + + // Invoke the callback for iseqs - on_stack_iseqs + for iseq in iseqs { if !on_stack_iseqs.contains(&iseq) { if let Some(iseq_payload) = get_iseq_payload(iseq) { callback(iseq_payload); } } - }) + } } /// Free the per-iseq payload #[no_mangle] -pub extern "C" fn rb_yjit_iseq_free(payload: *mut c_void) { +pub extern "C" fn rb_yjit_iseq_free(iseq: IseqPtr) { + // Free invariants for the ISEQ + iseq_free_invariants(iseq); + let payload = { + let payload = unsafe { rb_iseq_get_yjit_payload(iseq) }; if payload.is_null() { // Nothing to free. return; @@ -663,27 +1884,45 @@ pub extern "C" fn rb_yjit_iseq_free(payload: *mut c_void) { // SAFETY: We got the pointer from Box::into_raw(). let payload = unsafe { Box::from_raw(payload) }; - // Increment the freed iseq count - incr_counter!(freed_iseq_count); - - // Free all blocks in the payload + // Free all blocks in version_map. The GC doesn't free running iseqs. for versions in &payload.version_map { for block in versions { - free_block(block); + // SAFETY: blocks in the version_map are always well connected + unsafe { free_block(*block, true) }; } } + + // Free dead blocks + for block in payload.dead_blocks { + unsafe { free_block(block, false) }; + } + + // Free all entries + for entryref in payload.entries.iter() { + let entry = unsafe { Box::from_raw(entryref.as_ptr()) }; + mem::drop(entry); + } + + // Increment the freed iseq count + incr_counter!(freed_iseq_count); } -/// GC callback for marking GC objects in the the per-iseq payload. +/// GC callback for marking GC objects in the per-iseq payload. #[no_mangle] pub extern "C" fn rb_yjit_iseq_mark(payload: *mut c_void) { let payload = if payload.is_null() { // Nothing to mark. return; } else { - // SAFETY: It looks like the GC takes the VM lock while marking - // so we should be satisfying aliasing rules here. - unsafe { &*(payload as *const IseqPayload) } + // SAFETY: The GC takes the VM lock while marking, which + // we assert, so we should be synchronized and data race free. + // + // For aliasing, having the VM lock hopefully also implies that no one + // else has an overlapping &mut IseqPayload. + unsafe { + rb_assert_holding_vm_lock(); + &*(payload as *const IseqPayload) + } }; // For marking VALUEs written into the inline code block. @@ -692,26 +1931,56 @@ pub extern "C" fn rb_yjit_iseq_mark(payload: *mut c_void) { for versions in &payload.version_map { for block in versions { - let block = block.borrow(); + // SAFETY: all blocks inside version_map are initialized. 
+ let block = unsafe { block.as_ref() }; + mark_block(block, cb, false); + } + } + // Mark dead blocks, since there could be stubs pointing at them + for blockref in &payload.dead_blocks { + // SAFETY: dead blocks come from version_map, which only have initialized blocks + let block = unsafe { blockref.as_ref() }; + mark_block(block, cb, true); + } - unsafe { rb_gc_mark_movable(block.blockid.iseq.into()) }; + return; - // Mark method entry dependencies - for &cme_dep in &block.cme_dependencies { - unsafe { rb_gc_mark_movable(cme_dep.into()) }; - } + fn mark_block(block: &Block, cb: &CodeBlock, dead: bool) { + unsafe { rb_gc_mark_movable(block.iseq.get().into()) }; + + // Mark method entry dependencies + for cme_dep in block.cme_dependencies.iter() { + unsafe { rb_gc_mark_movable(cme_dep.get().into()) }; + } - // Mark outgoing branch entries - for branch in &block.outgoing { - let branch = branch.borrow(); - for target in branch.targets.iter().flatten() { - unsafe { rb_gc_mark_movable(target.get_blockid().iseq.into()) }; + // Mark outgoing branch entries + block.outgoing.for_each(|branch| { + let branch = unsafe { branch.as_ref() }; + for target in branch.targets.iter() { + // SAFETY: no mutation inside unsafe + let target_iseq = unsafe { + target.ref_unchecked().as_ref().and_then(|target| { + // Avoid get_blockid() on blockref. Can be dangling on dead blocks, + // and the iseq housing the block already naturally handles it. + if target.get_block().is_some() { + None + } else { + Some(target.get_blockid().iseq) + } + }) + }; + + if let Some(target_iseq) = target_iseq { + unsafe { rb_gc_mark_movable(target_iseq.into()) }; } } + }); - // Walk over references to objects in generated code. - for offset in &block.gc_obj_offsets { - let value_address: *const u8 = cb.get_ptr(offset.as_usize()).raw_ptr(); + // Mark references to objects in generated code. + // Skip for dead blocks since they shouldn't run. + if !dead { + for offset in block.gc_obj_offsets.iter() { + let value_address: *const u8 = cb.get_ptr(offset.as_usize()).raw_ptr(cb); // Creating an unaligned pointer is well defined unlike in C. let value_address = value_address as *const VALUE; @@ -725,17 +1994,24 @@ pub extern "C" fn rb_yjit_iseq_mark(payload: *mut c_void) { } } -/// GC callback for updating GC objects in the the per-iseq payload. +/// GC callback for updating GC objects in the per-iseq payload. /// This is a mirror of [rb_yjit_iseq_mark]. #[no_mangle] -pub extern "C" fn rb_yjit_iseq_update_references(payload: *mut c_void) { +pub extern "C" fn rb_yjit_iseq_update_references(iseq: IseqPtr) { + let payload = unsafe { rb_iseq_get_yjit_payload(iseq) }; let payload = if payload.is_null() { // Nothing to update. return; } else { - // SAFETY: It looks like the GC takes the VM lock while updating references - // so we should be satisfying aliasing rules here. - unsafe { &*(payload as *const IseqPayload) } + // SAFETY: The GC takes the VM lock while marking, which + // we assert, so we should be synchronized and data race free. + // + // For aliasing, having the VM lock hopefully also implies that no one + // else has an overlapping &mut IseqPayload. + unsafe { + rb_assert_holding_vm_lock(); + &*(payload as *const IseqPayload) + } }; // Evict other threads from generated code since we are about to patch them. 
@@ -746,29 +2022,66 @@ pub extern "C" fn rb_yjit_iseq_update_references(payload: *mut c_void) { let cb = CodegenGlobals::get_inline_cb(); for versions in &payload.version_map { - for block in versions { - let mut block = block.borrow_mut(); + for version in versions { + // SAFETY: all blocks inside version_map are initialized + let block = unsafe { version.as_ref() }; + block_update_references(block, cb, false); + } + } + // Update dead blocks, since there could be stubs pointing at them + for blockref in &payload.dead_blocks { + // SAFETY: dead blocks come from version_map, which only have initialized blocks + let block = unsafe { blockref.as_ref() }; + block_update_references(block, cb, true); + } - block.blockid.iseq = unsafe { rb_gc_location(block.blockid.iseq.into()) }.as_iseq(); + return; - // Update method entry dependencies - for cme_dep in &mut block.cme_dependencies { - *cme_dep = unsafe { rb_gc_location((*cme_dep).into()) }.as_cme(); - } + fn block_update_references(block: &Block, cb: &mut CodeBlock, dead: bool) { + block.iseq.set(unsafe { rb_gc_location(block.iseq.get().into()) }.as_iseq()); + + // Update method entry dependencies + for cme_dep in block.cme_dependencies.iter() { + let cur_cme: VALUE = cme_dep.get().into(); + let new_cme = unsafe { rb_gc_location(cur_cme) }.as_cme(); + cme_dep.set(new_cme); + } + + // Update outgoing branch entries + block.outgoing.for_each(|branch| { + let branch = unsafe { branch.as_ref() }; + for target in branch.targets.iter() { + // SAFETY: no mutation inside unsafe + let current_iseq = unsafe { + target.ref_unchecked().as_ref().and_then(|target| { + // Avoid get_blockid() on blockref. Can be dangling on dead blocks, + // and the iseq housing the block already naturally handles it. + if target.get_block().is_some() { + None + } else { + Some(target.get_blockid().iseq) + } + }) + }; - // Update outgoing branch entries - for branch in &block.outgoing { - let mut branch = branch.borrow_mut(); - for target in branch.targets.iter_mut().flatten() { - target.set_iseq(unsafe { rb_gc_location(target.get_blockid().iseq.into()) }.as_iseq()); + if let Some(current_iseq) = current_iseq { + let updated_iseq = unsafe { rb_gc_location(current_iseq.into()) } + .as_iseq(); + // SAFETY: the Cell::set is not on the reference given out + // by ref_unchecked. + unsafe { target.ref_unchecked().as_ref().unwrap().set_iseq(updated_iseq) }; } } + }); - // Walk over references to objects in generated code. - for offset in &block.gc_obj_offsets { + // Update references to objects in generated code. + // Skip for dead blocks since they shouldn't run and + // so there is no potential of writing over invalidation jumps + if !dead { + for offset in block.gc_obj_offsets.iter() { let offset_to_value = offset.as_usize(); let value_code_ptr = cb.get_ptr(offset_to_value); - let value_ptr: *const u8 = value_code_ptr.raw_ptr(); + let value_ptr: *const u8 = value_code_ptr.raw_ptr(cb); // Creating an unaligned pointer is well defined unlike in C. let value_ptr = value_ptr as *mut VALUE; @@ -778,22 +2091,42 @@ pub extern "C" fn rb_yjit_iseq_update_references(payload: *mut c_void) { // Only write when the VALUE moves, to be copy-on-write friendly. 
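// ---- Editor's sketch (not part of the diff) ----
// The unaligned read/write access pattern used when patching object references embedded in
// generated machine code (see the gc_obj_offsets loops nearby and the
// value_ptr.write_unaligned() call just below). Instruction streams give no alignment
// guarantees, so a plain aligned *ptr = val would be undefined behavior; patch_u64 is a
// hypothetical stand-alone illustration.
fn patch_u64(code: &mut [u8], offset: usize, new_value: u64) -> u64 {
    assert!(offset + 8 <= code.len());
    let ptr = unsafe { code.as_mut_ptr().add(offset) } as *mut u64;
    // SAFETY: the pointer stays inside `code` (checked above) but may be unaligned.
    unsafe {
        let old = ptr.read_unaligned();
        if old != new_value {
            // Only write when the value actually changed, to stay copy-on-write friendly.
            ptr.write_unaligned(new_value);
        }
        old
    }
}
// ---- end of editor's sketch ----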
if new_addr != object { - for (byte_idx, &byte) in new_addr.as_u64().to_le_bytes().iter().enumerate() { - let byte_code_ptr = value_code_ptr.add_bytes(byte_idx); - cb.write_mem(byte_code_ptr, byte) - .expect("patching existing code should be within bounds"); - } + // SAFETY: Since we already set code memory writable before the compacting phase, + // we can use raw memory accesses directly. + unsafe { value_ptr.write_unaligned(new_addr); } } } } + } +} - // Note that we would have returned already if YJIT is off. - cb.mark_all_executable(); +/// Mark all code memory as writable. +/// This function is useful for garbage collectors that update references in JIT-compiled code in +/// bulk. +#[no_mangle] +pub extern "C" fn rb_yjit_mark_all_writeable() { + if CodegenGlobals::has_instance() { + CodegenGlobals::get_inline_cb().mark_all_writeable(); + + CodegenGlobals::get_outlined_cb() + .unwrap() + .mark_all_writeable(); + } +} + +/// Mark all code memory as executable. +/// This function is useful for garbage collectors that update references in JIT-compiled code in +/// bulk. +#[no_mangle] +pub extern "C" fn rb_yjit_mark_all_executable() { + if CodegenGlobals::has_instance() { + CodegenGlobals::get_inline_cb().mark_all_executable(); - CodegenGlobals::get_outlined_cb() - .unwrap() - .mark_all_executable(); + CodegenGlobals::get_outlined_cb() + .unwrap() + .mark_all_executable(); + } } /// Get all blocks for a particular place in an iseq. @@ -833,15 +2166,28 @@ pub fn take_version_list(blockid: BlockId) -> VersionList { } } -/// Count the number of block versions matching a given blockid -fn get_num_versions(blockid: BlockId) -> usize { +/// Count the number of block versions that match a given BlockId and part of a Context +fn get_num_versions(blockid: BlockId, ctx: &Context) -> usize { let insn_idx = blockid.idx.as_usize(); match get_iseq_payload(blockid.iseq) { + + // FIXME: this counting logic is going to be expensive. + // We should avoid it if possible + Some(payload) => { payload .version_map .get(insn_idx) - .map(|versions| versions.len()) + .map(|versions| { + versions.iter().filter(|&&version| { + let version_ctx = Context::decode(unsafe { version.as_ref() }.ctx); + // Inline versions are counted separately towards MAX_INLINE_VERSIONS. + version_ctx.inline() == ctx.inline() && + // find_block_versions() finds only blocks with compatible reg_mapping, + // so count only versions with compatible reg_mapping. 
+ version_ctx.reg_mapping == ctx.reg_mapping + }).count() + }) .unwrap_or(0) } None => 0, @@ -862,7 +2208,7 @@ pub fn get_or_create_iseq_block_list(iseq: IseqPtr) -> Vec<BlockRef> { // For each version at this instruction index for version in version_list { // Clone the block ref and add it to the list - blocks.push(version.clone()); + blocks.push(*version); } } @@ -872,90 +2218,151 @@ pub fn get_or_create_iseq_block_list(iseq: IseqPtr) -> Vec<BlockRef> { /// Retrieve a basic block version for an (iseq, idx) tuple /// This will return None if no version is found fn find_block_version(blockid: BlockId, ctx: &Context) -> Option<BlockRef> { - let versions = match get_version_list(blockid) { - Some(versions) => versions, - None => return None, - }; + let versions = get_version_list(blockid)?; // Best match found let mut best_version: Option<BlockRef> = None; let mut best_diff = usize::MAX; // For each version matching the blockid - for blockref in versions.iter_mut() { - let block = blockref.borrow(); - let diff = ctx.diff(&block.ctx); + for blockref in versions.iter() { + let block = unsafe { blockref.as_ref() }; + let block_ctx = Context::decode(block.ctx); // Note that we always prefer the first matching // version found because of inline-cache chains - if diff < best_diff { - best_version = Some(blockref.clone()); - best_diff = diff; + match ctx.diff(&block_ctx) { + TypeDiff::Compatible(diff) if diff < best_diff => { + best_version = Some(*blockref); + best_diff = diff; + } + _ => {} } } - // If greedy versioning is enabled - if get_option!(greedy_versioning) { - // If we're below the version limit, don't settle for an imperfect match - if versions.len() + 1 < get_option!(max_versions) && best_diff > 0 { - return None; + return best_version; +} + +/// Find the closest RegMapping among ones that have already been compiled. +pub fn find_most_compatible_reg_mapping(blockid: BlockId, ctx: &Context) -> Option<RegMapping> { + let versions = get_version_list(blockid)?; + + // Best match found + let mut best_mapping: Option<RegMapping> = None; + let mut best_diff = usize::MAX; + + // For each version matching the blockid + for blockref in versions.iter() { + let block = unsafe { blockref.as_ref() }; + let block_ctx = Context::decode(block.ctx); + + // Discover the best block that is compatible if we load/spill registers + match ctx.diff_allowing_reg_mismatch(&block_ctx) { + TypeDiff::Compatible(diff) if diff < best_diff => { + best_mapping = Some(block_ctx.get_reg_mapping()); + best_diff = diff; + } + _ => {} } } - return best_version; + best_mapping } +/// Allow inlining a Block up to MAX_INLINE_VERSIONS times. +const MAX_INLINE_VERSIONS: usize = 1000; + /// Produce a generic context when the block version limit is hit for a blockid pub fn limit_block_versions(blockid: BlockId, ctx: &Context) -> Context { // Guard chains implement limits separately, do nothing - if ctx.chain_depth > 0 { - return ctx.clone(); + if ctx.get_chain_depth() > 0 { + return *ctx; } + let next_versions = get_num_versions(blockid, ctx) + 1; + let max_versions = if ctx.inline() { + MAX_INLINE_VERSIONS + } else { + get_option!(max_versions) + }; + // If this block version we're about to add will hit the version limit - if get_num_versions(blockid) + 1 >= get_option!(max_versions) { + if next_versions >= max_versions { // Produce a generic context that stores no type information, // but still respects the stack_size and sp_offset constraints. // This new context will then match all future requests. 
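// ---- Editor's sketch (not part of the diff) ----
// The "smallest compatible difference wins" selection used by find_block_version() and
// find_most_compatible_reg_mapping() above. Diff and score() are simplified stand-ins for
// TypeDiff and Context::diff(); the real compatibility rules are much richer. Like the code
// above, ties keep the first candidate found, which favors existing inline-cache chains.
#[derive(PartialEq, Eq, Debug)]
enum Diff {
    Compatible(usize), // usable as-is; the payload is how much type information we give up
    Incompatible,      // cannot branch to this version at all
}

fn score(want: u8, have: u8) -> Diff {
    if want == have {
        Diff::Compatible(0)
    } else if have == 0 {
        // A version compiled with no assumptions (0) accepts anything, at some cost.
        Diff::Compatible(1)
    } else {
        Diff::Incompatible
    }
}

// Pick the index of the compatible candidate with the lowest cost, if any.
fn best_version(want: u8, candidates: &[u8]) -> Option<usize> {
    let mut best: Option<(usize, usize)> = None; // (index, cost)
    for (i, &have) in candidates.iter().enumerate() {
        if let Diff::Compatible(cost) = score(want, have) {
            if best.map_or(true, |(_, c)| cost < c) {
                best = Some((i, cost));
            }
        }
    }
    best.map(|(i, _)| i)
}
// ---- end of editor's sketch ----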
- let mut generic_ctx = Context::default(); - generic_ctx.stack_size = ctx.stack_size; - generic_ctx.sp_offset = ctx.sp_offset; + let generic_ctx = ctx.get_generic_ctx(); + + if cfg!(debug_assertions) { + let mut ctx = ctx.clone(); + if ctx.inline() { + // Suppress TypeDiff::Incompatible from ctx.diff(). We return TypeDiff::Incompatible + // to keep inlining blocks until we hit the limit, but it's safe to give up inlining. + ctx.inline_block = None; + assert!(generic_ctx.inline_block == None); + } - debug_assert_ne!( - usize::MAX, - ctx.diff(&generic_ctx), - "should substitute a compatible context", - ); + assert_ne!( + TypeDiff::Incompatible, + ctx.diff(&generic_ctx), + "should substitute a compatible context", + ); + } return generic_ctx; } + if ctx.inline() { + incr_counter_to!(max_inline_versions, next_versions); + } - return ctx.clone(); + return *ctx; } -/// Keep track of a block version. Block should be fully constructed. -/// Uses `cb` for running write barriers. -fn add_block_version(blockref: &BlockRef, cb: &CodeBlock) { - let block = blockref.borrow(); +/// Install a block version into its [IseqPayload], letting the GC track its +/// lifetime, and allowing it to be considered for use for other +/// blocks we might generate. Uses `cb` for running write barriers. +/// +/// # Safety +/// +/// The block must be fully initialized. Its incoming and outgoing edges, +/// if there are any, must point to initialized blocks, too. +/// +/// Note that the block might gain edges after this function returns, +/// as can happen during [gen_block_series]. Initialized here doesn't mean +/// ready to be consumed or that the machine code tracked by the block is +/// ready to be run. +/// +/// Due to this transient state where a block is tracked by the GC by +/// being inside an [IseqPayload] but not ready to be executed, it's +/// generally unsound to call any Ruby methods during codegen. That has +/// the potential to run blocks which are not ready. +unsafe fn add_block_version(blockref: BlockRef, cb: &CodeBlock) { + // SAFETY: caller ensures initialization + let block = unsafe { blockref.as_ref() }; // Function entry blocks must have stack size 0 - assert!(!(block.blockid.idx == 0 && block.ctx.stack_size > 0)); + debug_assert!(!(block.iseq_range.start == 0 && Context::decode(block.ctx).stack_size > 0)); + + let version_list = get_or_create_version_list(block.get_blockid()); - let version_list = get_or_create_version_list(block.blockid); + // If this the first block being compiled with this block id + if version_list.len() == 0 { + incr_counter!(compiled_blockid_count); + } - version_list.push(blockref.clone()); + version_list.push(blockref); version_list.shrink_to_fit(); // By writing the new block to the iseq, the iseq now // contains new references to Ruby objects. Run write barriers. - let iseq: VALUE = block.blockid.iseq.into(); - for &dep in block.iter_cme_deps() { + let iseq: VALUE = block.iseq.get().into(); + for dep in block.iter_cme_deps() { obj_written!(iseq, dep.into()); } // Run write barriers for all objects in generated code. - for offset in &block.gc_obj_offsets { - let value_address: *const u8 = cb.get_ptr(offset.as_usize()).raw_ptr(); + for offset in block.gc_obj_offsets.iter() { + let value_address: *const u8 = cb.get_ptr(offset.as_usize()).raw_ptr(cb); // Creating an unaligned pointer is well defined unlike in C. 
let value_address: *const VALUE = value_address.cast(); @@ -964,18 +2371,21 @@ fn add_block_version(blockref: &BlockRef, cb: &CodeBlock) { } incr_counter!(compiled_block_count); + if Context::decode(block.ctx).inline() { + incr_counter!(inline_block_count); + } // Mark code pages for code GC - let iseq_payload = get_iseq_payload(block.blockid.iseq).unwrap(); - for page in cb.addrs_to_pages(block.start_addr.unwrap(), block.end_addr.unwrap()) { + let iseq_payload = get_iseq_payload(block.iseq.get()).unwrap(); + for page in cb.addrs_to_pages(block.start_addr, block.end_addr.get()) { iseq_payload.pages.insert(page); } } /// Remove a block version from the version map of its parent ISEQ fn remove_block_version(blockref: &BlockRef) { - let block = blockref.borrow(); - let version_list = match get_version_list(block.blockid) { + let block = unsafe { blockref.as_ref() }; + let version_list = match get_version_list(block.get_blockid()) { Some(version_list) => version_list, None => return, }; @@ -984,241 +2394,228 @@ fn remove_block_version(blockref: &BlockRef) { version_list.retain(|other| blockref != other); } -//=========================================================================== -// I put the implementation of traits for core.rs types below -// We can move these closer to the above structs later if we want. -//=========================================================================== +impl<'a> JITState<'a> { + // Finish compiling and turn a jit state into a block + // note that the block is still not in shape. + pub fn into_block(self, end_insn_idx: IseqIdx, start_addr: CodePtr, end_addr: CodePtr, gc_obj_offsets: Vec<u32>) -> BlockRef { + // Allocate the block and get its pointer + let blockref: *mut MaybeUninit<Block> = Box::into_raw(Box::new(MaybeUninit::uninit())); + + incr_counter_by!(num_gc_obj_refs, gc_obj_offsets.len()); + + let ctx = Context::encode(&self.get_starting_ctx()); + + // Make the new block + let block = MaybeUninit::new(Block { + start_addr, + iseq: Cell::new(self.get_iseq()), + iseq_range: self.get_starting_insn_idx()..end_insn_idx, + ctx, + end_addr: Cell::new(end_addr), + incoming: MutableBranchList(Cell::default()), + gc_obj_offsets: gc_obj_offsets.into_boxed_slice(), + entry_exit: self.get_block_entry_exit(), + cme_dependencies: self.method_lookup_assumptions.into_iter().map(Cell::new).collect(), + // Pending branches => actual branches + outgoing: MutableBranchList(Cell::new(self.pending_outgoing.into_iter().map(|pending_out| { + let pending_out = Rc::try_unwrap(pending_out) + .unwrap_or_else(|rc| panic!( + "PendingBranchRef should be unique when ready to construct a Block. \ + strong={} weak={}", Rc::strong_count(&rc), Rc::weak_count(&rc))); + pending_out.into_branch(NonNull::new(blockref as *mut Block).expect("no null from Box")) + }).collect())) + }); + // Initialize it on the heap + // SAFETY: allocated with Box above + unsafe { ptr::write(blockref, block) }; -impl Block { - pub fn new(blockid: BlockId, ctx: &Context) -> BlockRef { - let block = Block { - blockid, - end_idx: 0, - ctx: ctx.clone(), - start_addr: None, - end_addr: None, - incoming: Vec::new(), - outgoing: Vec::new(), - gc_obj_offsets: Vec::new(), - cme_dependencies: Vec::new(), - entry_exit: None, - }; + // Block is initialized now. Note that MaybeUninit<T> has the same layout as T. 
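// ---- Editor's sketch (not part of the diff) ----
// The Rc::try_unwrap() step in into_block() below: while a block is being compiled, pending
// branches are shared through Rc; once the block is finalized there must be exactly one owner
// left, and try_unwrap() both asserts and exploits that uniqueness. finalize() is a
// hypothetical stand-in using String in place of PendingBranch.
use std::rc::Rc;

fn finalize(pending: Rc<String>) -> String {
    Rc::try_unwrap(pending).unwrap_or_else(|rc| {
        panic!(
            "pending value should be uniquely owned here; strong={} weak={}",
            Rc::strong_count(&rc),
            Rc::weak_count(&rc)
        )
    })
}
// ---- end of editor's sketch ----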
+ let blockref = NonNull::new(blockref as *mut Block).expect("no null from Box"); + + // Track all the assumptions the block makes as invariants + if self.block_assumes_single_ractor { + track_single_ractor_assumption(blockref); + } + for bop in self.bop_assumptions { + track_bop_assumption(blockref, bop); + } + // SAFETY: just allocated it above + for cme in unsafe { blockref.as_ref() }.cme_dependencies.iter() { + track_method_lookup_stability_assumption(blockref, cme.get()); + } + if let Some(idlist) = self.stable_constant_names_assumption { + track_stable_constant_names_assumption(blockref, idlist); + } + for klass in self.no_singleton_class_assumptions { + track_no_singleton_class_assumption(blockref, klass); + } + if self.no_ep_escape { + track_no_ep_escape_assumption(blockref, self.iseq); + } - // Wrap the block in a reference counted refcell - // so that the block ownership can be shared - BlockRef::new(Rc::new(RefCell::new(block))) + blockref } +} +impl Block { pub fn get_blockid(&self) -> BlockId { - self.blockid + BlockId { iseq: self.iseq.get(), idx: self.iseq_range.start } } - pub fn get_end_idx(&self) -> u32 { - self.end_idx + pub fn get_end_idx(&self) -> IseqIdx { + self.iseq_range.end } - pub fn get_ctx(&self) -> Context { - self.ctx.clone() + pub fn get_ctx_count(&self) -> usize { + let mut count = 1; // block.ctx + self.outgoing.for_each(|branch| { + // SAFETY: &self implies it's initialized + count += unsafe { branch.as_ref() }.get_stub_count(); + }); + count } #[allow(unused)] - pub fn get_start_addr(&self) -> Option<CodePtr> { + pub fn get_start_addr(&self) -> CodePtr { self.start_addr } #[allow(unused)] - pub fn get_end_addr(&self) -> Option<CodePtr> { - self.end_addr + pub fn get_end_addr(&self) -> CodePtr { + self.end_addr.get() } /// Get an immutable iterator over cme dependencies - pub fn iter_cme_deps(&self) -> std::slice::Iter<'_, CmePtr> { - self.cme_dependencies.iter() - } - - /// Set the starting address in the generated code for the block - /// This can be done only once for a block - pub fn set_start_addr(&mut self, addr: CodePtr) { - assert!(self.start_addr.is_none()); - self.start_addr = Some(addr); + pub fn iter_cme_deps(&self) -> impl Iterator<Item = CmePtr> + '_ { + self.cme_dependencies.iter().map(Cell::get) } - /// Set the end address in the generated for the block - /// This can be done only once for a block - pub fn set_end_addr(&mut self, addr: CodePtr) { - // The end address can only be set after the start address is set - assert!(self.start_addr.is_some()); - - // TODO: assert constraint that blocks can shrink but not grow in length - self.end_addr = Some(addr); + // Push an incoming branch ref and shrink the vector + fn push_incoming(&self, branch: BranchRef) { + self.incoming.push(branch); } - /// Set the index of the last instruction in the block - /// This can be done only once for a block - pub fn set_end_idx(&mut self, end_idx: u32) { - assert!(self.end_idx == 0); - self.end_idx = end_idx; + // Compute the size of the block code + pub fn code_size(&self) -> usize { + (self.end_addr.get().as_offset() - self.start_addr.as_offset()).try_into().unwrap() } +} - pub fn add_gc_obj_offsets(self: &mut Block, gc_offsets: Vec<u32>) { - for offset in gc_offsets { - self.gc_obj_offsets.push(offset); - incr_counter!(num_gc_obj_refs); - } - self.gc_obj_offsets.shrink_to_fit(); +impl Context { + pub fn get_stack_size(&self) -> u8 { + self.stack_size } - /// Instantiate a new CmeDependency struct and add it to the list of - /// dependencies for this block. 
- pub fn add_cme_dependency(&mut self, callee_cme: CmePtr) { - self.cme_dependencies.push(callee_cme); - self.cme_dependencies.shrink_to_fit(); + pub fn set_stack_size(&mut self, stack_size: u8) { + self.stack_size = stack_size; } - // Push an incoming branch ref and shrink the vector - fn push_incoming(&mut self, branch: BranchRef) { - self.incoming.push(branch); - self.incoming.shrink_to_fit(); + /// Create a new Context that is compatible with self but doesn't have type information. + pub fn get_generic_ctx(&self) -> Context { + let mut generic_ctx = Context::default(); + generic_ctx.stack_size = self.stack_size; + generic_ctx.sp_offset = self.sp_offset; + generic_ctx.reg_mapping = self.reg_mapping; + if self.is_return_landing() { + generic_ctx.set_as_return_landing(); + } + if self.is_deferred() { + generic_ctx.mark_as_deferred(); + } + generic_ctx } - // Push an outgoing branch ref and shrink the vector - fn push_outgoing(&mut self, branch: BranchRef) { - self.outgoing.push(branch); - self.outgoing.shrink_to_fit(); + /// Create a new Context instance with a given stack_size and sp_offset adjusted + /// accordingly. This is useful when you want to virtually rewind a stack_size for + /// generating a side exit while considering past sp_offset changes on gen_save_sp. + pub fn with_stack_size(&self, stack_size: u8) -> Context { + let mut ctx = *self; + ctx.sp_offset -= (ctx.get_stack_size() as isize - stack_size as isize) as i8; + ctx.stack_size = stack_size; + ctx } - // Compute the size of the block code - pub fn code_size(&self) -> usize { - (self.end_addr.unwrap().raw_ptr() as usize) - (self.start_addr.unwrap().raw_ptr() as usize) + pub fn get_sp_offset(&self) -> i8 { + self.sp_offset } -} -impl Context { - pub fn get_stack_size(&self) -> u16 { - self.stack_size + pub fn set_sp_offset(&mut self, offset: i8) { + self.sp_offset = offset; } - pub fn get_sp_offset(&self) -> i16 { - self.sp_offset + pub fn get_reg_mapping(&self) -> RegMapping { + self.reg_mapping } - pub fn set_sp_offset(&mut self, offset: i16) { - self.sp_offset = offset; + pub fn set_reg_mapping(&mut self, reg_mapping: RegMapping) { + self.reg_mapping = reg_mapping; } pub fn get_chain_depth(&self) -> u8 { self.chain_depth } - pub fn reset_chain_depth(&mut self) { + pub fn reset_chain_depth_and_defer(&mut self) { self.chain_depth = 0; + self.is_deferred = false; } pub fn increment_chain_depth(&mut self) { + if self.get_chain_depth() == CHAIN_DEPTH_MAX { + panic!("max block version chain depth reached!"); + } self.chain_depth += 1; } - /// Get an operand for the adjusted stack pointer address - pub fn sp_opnd(&self, offset_bytes: isize) -> Opnd { - let offset = ((self.sp_offset as isize) * (SIZEOF_VALUE as isize)) + offset_bytes; - let offset = offset as i32; - return Opnd::mem(64, SP, offset); + pub fn set_as_return_landing(&mut self) { + self.is_return_landing = true; } - /// Push one new value on the temp stack with an explicit mapping - /// Return a pointer to the new stack top - pub fn stack_push_mapping(&mut self, (mapping, temp_type): (TempMapping, Type)) -> Opnd { - // If type propagation is disabled, store no types - if get_option!(no_type_prop) { - return self.stack_push_mapping((mapping, Type::Unknown)); - } - - let stack_size: usize = self.stack_size.into(); - - // Keep track of the type and mapping of the value - if stack_size < MAX_TEMP_TYPES { - self.temp_mapping[stack_size] = mapping; - self.temp_types[stack_size] = temp_type; - - if let MapToLocal(idx) = mapping { - assert!((idx as usize) < 
MAX_LOCAL_TYPES); - } - } - - self.stack_size += 1; - self.sp_offset += 1; - - // SP points just above the topmost value - let offset = ((self.sp_offset as i32) - 1) * (SIZEOF_VALUE as i32); - return Opnd::mem(64, SP, offset); + pub fn clear_return_landing(&mut self) { + self.is_return_landing = false; } - /// Push one new value on the temp stack - /// Return a pointer to the new stack top - pub fn stack_push(&mut self, val_type: Type) -> Opnd { - return self.stack_push_mapping((MapToStack, val_type)); + pub fn is_return_landing(&self) -> bool { + self.is_return_landing } - /// Push the self value on the stack - pub fn stack_push_self(&mut self) -> Opnd { - return self.stack_push_mapping((MapToSelf, Type::Unknown)); + pub fn mark_as_deferred(&mut self) { + self.is_deferred = true; } - /// Push a local variable on the stack - pub fn stack_push_local(&mut self, local_idx: usize) -> Opnd { - if local_idx >= MAX_LOCAL_TYPES { - return self.stack_push(Type::Unknown); - } - - return self.stack_push_mapping((MapToLocal(local_idx as u8), Type::Unknown)); + pub fn is_deferred(&self) -> bool { + self.is_deferred } - // Pop N values off the stack - // Return a pointer to the stack top before the pop operation - pub fn stack_pop(&mut self, n: usize) -> Opnd { - assert!(n <= self.stack_size.into()); - - // SP points just above the topmost value - let offset = ((self.sp_offset as i32) - 1) * (SIZEOF_VALUE as i32); - let top = Opnd::mem(64, SP, offset); - - // Clear the types of the popped values - for i in 0..n { - let idx: usize = (self.stack_size as usize) - i - 1; - - if idx < MAX_TEMP_TYPES { - self.temp_types[idx] = Type::Unknown; - self.temp_mapping[idx] = MapToStack; - } - } - - self.stack_size -= n as u16; - self.sp_offset -= n as i16; - - return top; + /// Get an operand for the adjusted stack pointer address + pub fn sp_opnd(&self, offset: i32) -> Opnd { + let offset = (self.sp_offset as i32 + offset) * SIZEOF_VALUE_I32; + return Opnd::mem(64, SP, offset); } - pub fn shift_stack(&mut self, argc: usize) { - assert!(argc < self.stack_size.into()); - - let method_name_index = (self.stack_size - argc as u16 - 1) as usize; - - for i in method_name_index..(self.stack_size - 1) as usize { + /// Get an operand for the adjusted environment pointer address using SP register. + /// This is valid only when a Binding object hasn't been created for the frame. + pub fn ep_opnd(&self, offset: i32) -> Opnd { + let ep_offset = self.get_stack_size() as i32 + 1; + self.sp_opnd(-ep_offset + offset) + } - if i + 1 < MAX_TEMP_TYPES { - self.temp_types[i] = self.temp_types[i + 1]; - self.temp_mapping[i] = self.temp_mapping[i + 1]; - } + /// Start using a register for a given stack temp or a local. + pub fn alloc_reg(&mut self, opnd: RegOpnd) { + let mut reg_mapping = self.get_reg_mapping(); + if reg_mapping.alloc_reg(opnd) { + self.set_reg_mapping(reg_mapping); } - self.stack_pop(1); } - /// Get an operand pointing to a slot on the temp stack - pub fn stack_opnd(&self, idx: i32) -> Opnd { - // SP points just above the topmost value - let offset = ((self.sp_offset as i32) - 1 - idx) * (SIZEOF_VALUE as i32); - let opnd = Opnd::mem(64, SP, offset); - return opnd; + /// Stop using a register for a given stack temp or a local. + /// This allows us to reuse the register for a value that we know is dead + /// and will no longer be used (e.g. popped stack temp). 
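// ---- Editor's sketch (not part of the diff) ----
// The pointer arithmetic behind sp_opnd()/ep_opnd() above. Offsets are counted in VALUE-sized
// slots relative to the SP register, and the EP sits one slot below the bottom of the
// temporary stack. SIZEOF_VALUE = 8 is an assumption (64-bit VALUE); the real code uses
// SIZEOF_VALUE_I32.
const SIZEOF_VALUE: i32 = 8;

// Byte displacement from SP for a slot `offset` slots away from the virtual stack pointer,
// which itself sits `sp_offset` slots above SP.
fn sp_byte_offset(sp_offset: i8, offset: i32) -> i32 {
    (sp_offset as i32 + offset) * SIZEOF_VALUE
}

// Byte displacement from SP for a slot relative to the environment pointer (EP),
// valid only while no Binding object has materialized the environment elsewhere.
fn ep_byte_offset(sp_offset: i8, stack_size: u8, offset: i32) -> i32 {
    let ep_offset = stack_size as i32 + 1; // EP is one slot below the bottom of the temps
    sp_byte_offset(sp_offset, -ep_offset + offset)
}
// ---- end of editor's sketch ----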
+ pub fn dealloc_reg(&mut self, opnd: RegOpnd) { + let mut reg_mapping = self.get_reg_mapping(); + if reg_mapping.dealloc_reg(opnd) { + self.set_reg_mapping(reg_mapping); + } } /// Get the type of an instruction operand @@ -1230,18 +2627,18 @@ impl Context { let stack_idx: usize = (self.stack_size - 1 - idx).into(); // If outside of tracked range, do nothing - if stack_idx >= MAX_TEMP_TYPES { + if stack_idx >= MAX_CTX_TEMPS { return Type::Unknown; } - let mapping = self.temp_mapping[stack_idx]; + let mapping = self.get_temp_mapping(stack_idx); match mapping { MapToSelf => self.self_type, - MapToStack => self.temp_types[(self.stack_size - 1 - idx) as usize], - MapToLocal(idx) => { - assert!((idx as usize) < MAX_LOCAL_TYPES); - return self.local_types[idx as usize]; + MapToStack(temp_type) => temp_type, + MapToLocal(local_idx) => { + assert!((local_idx as usize) < MAX_CTX_LOCALS); + return self.get_local_type(local_idx.into()); } } } @@ -1249,8 +2646,24 @@ impl Context { } /// Get the currently tracked type for a local variable - pub fn get_local_type(&self, idx: usize) -> Type { - *self.local_types.get(idx).unwrap_or(&Type::Unknown) + pub fn get_local_type(&self, local_idx: usize) -> Type { + if local_idx >= MAX_CTX_LOCALS { + Type::Unknown + } else { + self.local_types[local_idx] + } + } + + /// Get the current temp mapping for a given stack slot + fn get_temp_mapping(&self, temp_idx: usize) -> TempMapping { + assert!(temp_idx < MAX_CTX_TEMPS); + self.temp_mapping[temp_idx] + } + + /// Set the current temp mapping for a given stack slot + fn set_temp_mapping(&mut self, temp_idx: usize, mapping: TempMapping) { + assert!(temp_idx < MAX_CTX_TEMPS); + self.temp_mapping[temp_idx] = mapping; } /// Upgrade (or "learn") the type of an instruction operand @@ -1270,19 +2683,27 @@ impl Context { let stack_idx = (self.stack_size - 1 - idx) as usize; // If outside of tracked range, do nothing - if stack_idx >= MAX_TEMP_TYPES { + if stack_idx >= MAX_CTX_TEMPS { return; } - let mapping = self.temp_mapping[stack_idx]; + let mapping = self.get_temp_mapping(stack_idx); match mapping { MapToSelf => self.self_type.upgrade(opnd_type), - MapToStack => self.temp_types[stack_idx].upgrade(opnd_type), - MapToLocal(idx) => { - let idx = idx as usize; - assert!(idx < MAX_LOCAL_TYPES); - self.local_types[idx].upgrade(opnd_type); + MapToStack(mut temp_type) => { + temp_type.upgrade(opnd_type); + self.set_temp_mapping(stack_idx, TempMapping::MapToStack(temp_type)); + } + MapToLocal(local_idx) => { + let idx = local_idx as usize; + assert!(idx < MAX_CTX_LOCALS); + let mut new_type = self.get_local_type(idx); + new_type.upgrade(opnd_type); + self.set_local_type(idx, new_type); + // Re-attach MapToLocal for this StackOpnd(idx). set_local_type() detaches + // all MapToLocal mappings, including the one we're upgrading here. + self.set_opnd_mapping(opnd, mapping); } } } @@ -1294,29 +2715,29 @@ impl Context { This is can be used with stack_push_mapping or set_opnd_mapping to copy a stack value's type while maintaining the mapping. 
*/ - pub fn get_opnd_mapping(&self, opnd: YARVOpnd) -> (TempMapping, Type) { + pub fn get_opnd_mapping(&self, opnd: YARVOpnd) -> TempMapping { let opnd_type = self.get_opnd_type(opnd); match opnd { - SelfOpnd => (MapToSelf, opnd_type), + SelfOpnd => TempMapping::MapToSelf, StackOpnd(idx) => { assert!(idx < self.stack_size); let stack_idx = (self.stack_size - 1 - idx) as usize; - if stack_idx < MAX_TEMP_TYPES { - (self.temp_mapping[stack_idx], opnd_type) + if stack_idx < MAX_CTX_TEMPS { + self.get_temp_mapping(stack_idx) } else { // We can't know the source of this stack operand, so we assume it is // a stack-only temporary. type will be UNKNOWN assert!(opnd_type == Type::Unknown); - (MapToStack, opnd_type) + TempMapping::MapToStack(opnd_type) } } } } /// Overwrite both the type and mapping of a stack operand. - pub fn set_opnd_mapping(&mut self, opnd: YARVOpnd, (mapping, opnd_type): (TempMapping, Type)) { + pub fn set_opnd_mapping(&mut self, opnd: YARVOpnd, mapping: TempMapping) { match opnd { SelfOpnd => unreachable!("self always maps to self"), StackOpnd(idx) => { @@ -1329,48 +2750,46 @@ impl Context { } // If outside of tracked range, do nothing - if stack_idx >= MAX_TEMP_TYPES { + if stack_idx >= MAX_CTX_TEMPS { return; } - self.temp_mapping[stack_idx] = mapping; - - // Only used when mapping == MAP_STACK - self.temp_types[stack_idx] = opnd_type; + self.set_temp_mapping(stack_idx, mapping); } } } /// Set the type of a local variable pub fn set_local_type(&mut self, local_idx: usize, local_type: Type) { - let ctx = self; - // If type propagation is disabled, store no types if get_option!(no_type_prop) { return; } - if local_idx >= MAX_LOCAL_TYPES { - return; + if local_idx >= MAX_CTX_LOCALS { + return } // If any values on the stack map to this local we must detach them - for (i, mapping) in ctx.temp_mapping.iter_mut().enumerate() { - *mapping = match *mapping { - MapToStack => MapToStack, - MapToSelf => MapToSelf, + for mapping_idx in 0..MAX_CTX_TEMPS { + let mapping = self.get_temp_mapping(mapping_idx); + let tm = match mapping { + MapToStack(_) => mapping, + MapToSelf => mapping, MapToLocal(idx) => { if idx as usize == local_idx { - ctx.temp_types[i] = ctx.local_types[idx as usize]; - MapToStack + let local_type = self.get_local_type(local_idx); + TempMapping::MapToStack(local_type) } else { - MapToLocal(idx) + TempMapping::MapToLocal(idx) } } - } + }; + self.set_temp_mapping(mapping_idx, tm); } - ctx.local_types[local_idx] = local_type; + // Update the type + self.local_types[local_idx] = local_type; } /// Erase local variable type information @@ -1378,99 +2797,273 @@ impl Context { pub fn clear_local_types(&mut self) { // When clearing local types we must detach any stack mappings to those // locals. Even if local values may have changed, stack values will not. 
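The detach rule described in the comment above, which both set_local_type() and clear_local_types() below follow, can be condensed into a small sketch. The types here are illustrative stand-ins, not YJIT's actual TempMapping/Type definitions: before a local's tracked type is changed or cleared, any stack slot mapped to that local is rewritten as a plain stack mapping that snapshots the local's current type.

    #[derive(Clone, Copy, PartialEq, Debug)]
    enum Ty { Unknown, Fixnum }

    #[derive(Clone, Copy)]
    enum Mapping { ToStack(Ty), ToSelf, ToLocal(u8) }

    // Detach every stack slot that still points at `local_idx`, snapshotting the
    // local's current type so later writes to the local can't change what we
    // already know about the value sitting on the stack.
    fn detach_local(temps: &mut [Mapping], locals: &[Ty], local_idx: u8) {
        for mapping in temps.iter_mut() {
            if let Mapping::ToLocal(idx) = *mapping {
                if idx == local_idx {
                    *mapping = Mapping::ToStack(locals[idx as usize]);
                }
            }
        }
    }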
- for (i, mapping) in self.temp_mapping.iter_mut().enumerate() { - *mapping = match *mapping { - MapToStack => MapToStack, - MapToSelf => MapToSelf, - MapToLocal(idx) => { - self.temp_types[i] = self.local_types[idx as usize]; - MapToStack - } + + for mapping_idx in 0..MAX_CTX_TEMPS { + let mapping = self.get_temp_mapping(mapping_idx); + if let MapToLocal(local_idx) = mapping { + let local_idx = local_idx as usize; + self.set_temp_mapping(mapping_idx, TempMapping::MapToStack(self.get_local_type(local_idx))); } } // Clear the local types - self.local_types = [Type::default(); MAX_LOCAL_TYPES]; + self.local_types = [Type::default(); MAX_CTX_LOCALS]; + } + + /// Return true if the code is inlined by the caller + pub fn inline(&self) -> bool { + self.inline_block.is_some() + } + + /// Set a block ISEQ given to the Block of this Context + pub fn set_inline_block(&mut self, iseq: IseqPtr) { + self.inline_block = Some(iseq); } /// Compute a difference score for two context objects - /// Returns 0 if the two contexts are the same - /// Returns > 0 if different but compatible - /// Returns usize::MAX if incompatible - pub fn diff(&self, dst: &Context) -> usize { + pub fn diff(&self, dst: &Context) -> TypeDiff { // Self is the source context (at the end of the predecessor) let src = self; // Can only lookup the first version in the chain - if dst.chain_depth != 0 { - return usize::MAX; + if dst.get_chain_depth() != 0 { + return TypeDiff::Incompatible; } // Blocks with depth > 0 always produce new versions // Sidechains cannot overlap - if src.chain_depth != 0 { - return usize::MAX; + if src.get_chain_depth() != 0 { + return TypeDiff::Incompatible; + } + + if src.is_return_landing() != dst.is_return_landing() { + return TypeDiff::Incompatible; + } + + if src.is_deferred() != dst.is_deferred() { + return TypeDiff::Incompatible; } if dst.stack_size != src.stack_size { - return usize::MAX; + return TypeDiff::Incompatible; } if dst.sp_offset != src.sp_offset { - return usize::MAX; + return TypeDiff::Incompatible; + } + + if dst.reg_mapping != src.reg_mapping { + return TypeDiff::Incompatible; } // Difference sum let mut diff = 0; // Check the type of self - let self_diff = src.self_type.diff(dst.self_type); + diff += match src.self_type.diff(dst.self_type) { + TypeDiff::Compatible(diff) => diff, + TypeDiff::Incompatible => return TypeDiff::Incompatible, + }; - if self_diff == usize::MAX { - return usize::MAX; + // Check the block to inline + if src.inline_block != dst.inline_block { + // find_block_version should not find existing blocks with different + // inline_block so that their yield will not be megamorphic. + return TypeDiff::Incompatible; } - diff += self_diff; - // For each local type we track - for i in 0..src.local_types.len() { - let t_src = src.local_types[i]; - let t_dst = dst.local_types[i]; - let temp_diff = t_src.diff(t_dst); - - if temp_diff == usize::MAX { - return usize::MAX; - } - - diff += temp_diff; + for i in 0.. 
MAX_CTX_LOCALS { + let t_src = src.get_local_type(i); + let t_dst = dst.get_local_type(i); + diff += match t_src.diff(t_dst) { + TypeDiff::Compatible(diff) => diff, + TypeDiff::Incompatible => return TypeDiff::Incompatible, + }; } // For each value on the temp stack for i in 0..src.stack_size { - let (src_mapping, src_type) = src.get_opnd_mapping(StackOpnd(i)); - let (dst_mapping, dst_type) = dst.get_opnd_mapping(StackOpnd(i)); + let src_mapping = src.get_opnd_mapping(StackOpnd(i)); + let dst_mapping = dst.get_opnd_mapping(StackOpnd(i)); // If the two mappings aren't the same if src_mapping != dst_mapping { - if dst_mapping == MapToStack { + if matches!(dst_mapping, MapToStack(_)) { // We can safely drop information about the source of the temp // stack operand. diff += 1; } else { - return usize::MAX; + return TypeDiff::Incompatible; } } - let temp_diff = src_type.diff(dst_type); + let src_type = src.get_opnd_type(StackOpnd(i)); + let dst_type = dst.get_opnd_type(StackOpnd(i)); + + diff += match src_type.diff(dst_type) { + TypeDiff::Compatible(diff) => diff, + TypeDiff::Incompatible => return TypeDiff::Incompatible, + }; + } - if temp_diff == usize::MAX { - return usize::MAX; + return TypeDiff::Compatible(diff); + } + + /// Basically diff() but allows RegMapping incompatibility that could be fixed by + /// spilling, loading, or shuffling registers. + pub fn diff_allowing_reg_mismatch(&self, dst: &Context) -> TypeDiff { + // We shuffle only RegOpnd::Local and spill any other RegOpnd::Stack. + // If dst has RegOpnd::Stack, we can't reuse the block as a callee. + for reg_opnd in dst.get_reg_mapping().get_reg_opnds() { + if matches!(reg_opnd, RegOpnd::Stack(_)) { + return TypeDiff::Incompatible; } + } - diff += temp_diff; + // Prepare a Context with the same registers + let mut dst_with_same_regs = dst.clone(); + dst_with_same_regs.set_reg_mapping(self.get_reg_mapping()); + + // Diff registers and other stuff separately, and merge them + if let TypeDiff::Compatible(ctx_diff) = self.diff(&dst_with_same_regs) { + TypeDiff::Compatible(ctx_diff + self.get_reg_mapping().diff(dst.get_reg_mapping())) + } else { + TypeDiff::Incompatible + } + } + + pub fn two_fixnums_on_stack(&self, jit: &mut JITState) -> Option<bool> { + if jit.at_compile_target() { + let comptime_recv = jit.peek_at_stack(self, 1); + let comptime_arg = jit.peek_at_stack(self, 0); + return Some(comptime_recv.fixnum_p() && comptime_arg.fixnum_p()); } - return diff; + let recv_type = self.get_opnd_type(StackOpnd(1)); + let arg_type = self.get_opnd_type(StackOpnd(0)); + match (recv_type, arg_type) { + (Type::Fixnum, Type::Fixnum) => Some(true), + (Type::Unknown | Type::UnknownImm, Type::Unknown | Type::UnknownImm) => None, + _ => Some(false), + } + } +} + +impl Assembler { + /// Push one new value on the temp stack with an explicit mapping + /// Return a pointer to the new stack top + pub fn stack_push_mapping(&mut self, mapping: TempMapping) -> Opnd { + // If type propagation is disabled, store no types + if get_option!(no_type_prop) { + return self.stack_push_mapping(mapping.without_type()); + } + + let stack_size: usize = self.ctx.stack_size.into(); + + // Keep track of the type and mapping of the value + if stack_size < MAX_CTX_TEMPS { + self.ctx.set_temp_mapping(stack_size, mapping); + + if let MapToLocal(local_idx) = mapping { + assert!((local_idx as usize) < MAX_CTX_LOCALS); + } + } + + self.ctx.stack_size += 1; + self.ctx.sp_offset += 1; + + // Allocate a register to the new stack operand + let stack_opnd = self.stack_opnd(0); + 
self.alloc_reg(stack_opnd.reg_opnd()); + + stack_opnd + } + + /// Push one new value on the temp stack + /// Return a pointer to the new stack top + pub fn stack_push(&mut self, val_type: Type) -> Opnd { + return self.stack_push_mapping(TempMapping::MapToStack(val_type)); + } + + /// Push the self value on the stack + pub fn stack_push_self(&mut self) -> Opnd { + return self.stack_push_mapping(TempMapping::MapToSelf); + } + + /// Push a local variable on the stack + pub fn stack_push_local(&mut self, local_idx: usize) -> Opnd { + if local_idx >= MAX_CTX_LOCALS { + return self.stack_push(Type::Unknown); + } + + return self.stack_push_mapping(TempMapping::MapToLocal(local_idx as u8)); + } + + // Pop N values off the stack + // Return a pointer to the stack top before the pop operation + pub fn stack_pop(&mut self, n: usize) -> Opnd { + assert!(n <= self.ctx.stack_size.into()); + + let top = self.stack_opnd(0); + + // Clear the types of the popped values + for i in 0..n { + let idx: usize = (self.ctx.stack_size as usize) - i - 1; + + if idx < MAX_CTX_TEMPS { + self.ctx.set_temp_mapping(idx, TempMapping::MapToStack(Type::Unknown)); + } + } + + self.ctx.stack_size -= n as u8; + self.ctx.sp_offset -= n as i8; + + return top; + } + + /// Shift stack temps to remove a Symbol for #send. + pub fn shift_stack(&mut self, argc: usize) { + assert!(argc < self.ctx.stack_size.into()); + + let method_name_index = (self.ctx.stack_size as usize) - argc - 1; + + for i in method_name_index..(self.ctx.stack_size - 1) as usize { + if i < MAX_CTX_TEMPS { + let next_arg_mapping = if i + 1 < MAX_CTX_TEMPS { + self.ctx.get_temp_mapping(i + 1) + } else { + TempMapping::MapToStack(Type::Unknown) + }; + self.ctx.set_temp_mapping(i, next_arg_mapping); + } + } + self.stack_pop(1); + } + + /// Get an operand pointing to a slot on the temp stack + pub fn stack_opnd(&self, idx: i32) -> Opnd { + Opnd::Stack { + idx, + num_bits: 64, + stack_size: self.ctx.stack_size, + num_locals: None, // not needed for stack temps + sp_offset: self.ctx.sp_offset, + reg_mapping: None, // push_insn will set this + } + } + + /// Get an operand pointing to a local variable + pub fn local_opnd(&self, ep_offset: u32) -> Opnd { + let idx = self.ctx.stack_size as i32 + ep_offset as i32; + Opnd::Stack { + idx, + num_bits: 64, + stack_size: self.ctx.stack_size, + num_locals: Some(self.get_num_locals().unwrap()), // this must exist for locals + sp_offset: self.ctx.sp_offset, + reg_mapping: None, // push_insn will set this + } } } @@ -1479,7 +3072,7 @@ impl BlockId { #[cfg(debug_assertions)] #[allow(dead_code)] pub fn dump_src_loc(&self) { - unsafe { rb_yjit_dump_iseq_loc(self.iseq, self.idx) } + unsafe { rb_yjit_dump_iseq_loc(self.iseq, self.idx as u32) } } } @@ -1513,51 +3106,57 @@ fn gen_block_series_body( let mut batch = Vec::with_capacity(EXPECTED_BATCH_SIZE); // Generate code for the first block - let first_block = gen_single_block(blockid, start_ctx, ec, cb, ocb).ok()?; - batch.push(first_block.clone()); // Keep track of this block version + let first_block = gen_single_block(blockid, start_ctx, ec, cb, ocb, true).ok()?; + batch.push(first_block); // Keep track of this block version // Add the block version to the VersionMap for this ISEQ - add_block_version(&first_block, cb); + unsafe { add_block_version(first_block, cb) }; // Loop variable - let mut last_blockref = first_block.clone(); + let mut last_blockref = first_block; loop { // Get the last outgoing branch from the previous block. 
- let last_branchref = { - let last_block = last_blockref.borrow(); - match last_block.outgoing.last() { - Some(branch) => branch.clone(), + // SAFETY: No cell mutation inside unsafe. Copying out a BranchRef. + let last_branchref: BranchRef = unsafe { + let last_block = last_blockref.as_ref(); + match last_block.outgoing.0.ref_unchecked().last() { + Some(branch) => *branch, None => { break; } // If last block has no branches, stop. } }; - let mut last_branch = last_branchref.borrow_mut(); + let last_branch = unsafe { last_branchref.as_ref() }; + + incr_counter!(block_next_count); // gen_direct_jump() can request a block to be placed immediately after by // leaving a single target that has a `None` address. - let last_target = match &mut last_branch.targets { - [Some(last_target), None] if last_target.get_address().is_none() => last_target, - _ => break + // SAFETY: no mutation inside the unsafe block + let (requested_blockid, requested_ctx) = unsafe { + match (last_branch.targets[0].ref_unchecked(), last_branch.targets[1].ref_unchecked()) { + (Some(last_target), None) if last_target.get_address().is_none() => { + (last_target.get_blockid(), last_target.get_ctx()) + } + _ => { + // We're done when no fallthrough block is requested + break; + } + } }; - incr_counter!(block_next_count); - - // Get id and context for the new block - let requested_blockid = last_target.get_blockid(); - let requested_ctx = last_target.get_ctx(); - // Generate new block using context from the last branch. - let result = gen_single_block(requested_blockid, &requested_ctx, ec, cb, ocb); + let requested_ctx = Context::decode(requested_ctx); + let result = gen_single_block(requested_blockid, &requested_ctx, ec, cb, ocb, false); // If the block failed to compile if result.is_err() { // Remove previously compiled block // versions from the version map - mem::drop(last_branch); // end borrow - for blockref in &batch { - free_block(blockref); - remove_block_version(blockref); + for blockref in batch { + remove_block_version(&blockref); + // SAFETY: block was well connected because it was in a version_map + unsafe { free_block(blockref, false) }; } // Stop compiling @@ -1567,16 +3166,14 @@ fn gen_block_series_body( let new_blockref = result.unwrap(); // Add the block version to the VersionMap for this ISEQ - add_block_version(&new_blockref, cb); + unsafe { add_block_version(new_blockref, cb) }; // Connect the last branch and the new block - last_branch.targets[0] = Some(Box::new(BranchTarget::Block(new_blockref.clone()))); - new_blockref - .borrow_mut() - .push_incoming(last_branchref.clone()); + last_branch.targets[0].set(Some(Box::new(BranchTarget::Block(new_blockref)))); + unsafe { new_blockref.as_ref().incoming.push(last_branchref) }; // Track the block - batch.push(new_blockref.clone()); + batch.push(new_blockref); // Repeat with newest block last_blockref = new_blockref; @@ -1587,12 +3184,12 @@ fn gen_block_series_body( // If dump_iseq_disasm is active, see if this iseq's location matches the given substring. // If so, we print the new blocks to the console. 
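The compilation loop above can be summarized with a minimal sketch. The closure and struct below are hypothetical stand-ins for YJIT's Block/Branch machinery: keep compiling blocks while the newest block's last outgoing branch leaves a single target whose address is still None, i.e. a request to place the next block immediately after it.

    struct FallthroughRequest { address: Option<usize>, block_id: usize }

    // Compile a series of blocks; `compile` returns the new block plus an optional
    // fallthrough request taken from its last outgoing branch.
    fn compile_series(
        first_id: usize,
        mut compile: impl FnMut(usize) -> Option<(usize, Option<FallthroughRequest>)>,
    ) -> Vec<usize> {
        let mut batch = Vec::new();
        let mut next = Some(FallthroughRequest { address: None, block_id: first_id });
        while let Some(request) = next.take() {
            match compile(request.block_id) {
                Some((block, fallthrough)) => {
                    batch.push(block);
                    // Continue only while a fallthrough block is requested.
                    next = fallthrough.filter(|req| req.address.is_none());
                }
                None => break, // compilation failed; the caller rolls `batch` back
            }
        }
        batch
    }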
if let Some(substr) = get_option_ref!(dump_iseq_disasm).as_ref() { - let blockid_idx = blockid.idx; - let iseq_location = iseq_get_location(blockid.iseq, blockid_idx); + let iseq_location = iseq_get_location(blockid.iseq, blockid.idx); if iseq_location.contains(substr) { - let last_block = last_blockref.borrow(); - println!("Compiling {} block(s) for {}, ISEQ offsets [{}, {})", batch.len(), iseq_location, blockid_idx, last_block.end_idx); - print!("{}", disasm_iseq_insn_range(blockid.iseq, blockid.idx, last_block.end_idx)); + let last_block = unsafe { last_blockref.as_ref() }; + let iseq_range = &last_block.iseq_range; + println!("Compiling {} block(s) for {}, ISEQ offsets [{}, {})", batch.len(), iseq_location, iseq_range.start, iseq_range.end); + print!("{}", disasm_iseq_insn_range(blockid.iseq, iseq_range.start, iseq_range.end)); } } } @@ -1602,12 +3199,17 @@ fn gen_block_series_body( /// Generate a block version that is an entry point inserted into an iseq /// NOTE: this function assumes that the VM lock has been taken -pub fn gen_entry_point(iseq: IseqPtr, ec: EcPtr) -> Option<CodePtr> { +/// If jit_exception is true, compile JIT code for handling exceptions. +/// See jit_compile_exception() for details. +pub fn gen_entry_point(iseq: IseqPtr, ec: EcPtr, jit_exception: bool) -> Option<*const u8> { // Compute the current instruction index based on the current PC - let insn_idx: u32 = unsafe { - let pc_zero = rb_iseq_pc_at_idx(iseq, 0); - let ec_pc = get_cfp_pc(get_ec_cfp(ec)); - ec_pc.offset_from(pc_zero).try_into().ok()? + let cfp = unsafe { get_ec_cfp(ec) }; + let insn_idx: u16 = unsafe { + let ec_pc = get_cfp_pc(cfp); + iseq_pc_to_insn_idx(iseq, ec_pc)? + }; + let stack_size: u8 = unsafe { + u8::try_from(get_cfp_sp(cfp).offset_from(get_cfp_bp(cfp))).ok()? }; // The entry context makes no assumptions about types @@ -1620,77 +3222,247 @@ pub fn gen_entry_point(iseq: IseqPtr, ec: EcPtr) -> Option<CodePtr> { let cb = CodegenGlobals::get_inline_cb(); let ocb = CodegenGlobals::get_outlined_cb(); - // Write the interpreter entry prologue. Might be NULL when out of memory. - let code_ptr = gen_entry_prologue(cb, iseq, insn_idx); - - // Try to generate code for the entry block - let block = gen_block_series(blockid, &Context::default(), ec, cb, ocb); + let code_ptr = gen_entry_point_body(blockid, stack_size, ec, jit_exception, cb, ocb); cb.mark_all_executable(); ocb.unwrap().mark_all_executable(); + code_ptr +} + +fn gen_entry_point_body(blockid: BlockId, stack_size: u8, ec: EcPtr, jit_exception: bool, cb: &mut CodeBlock, ocb: &mut OutlinedCb) -> Option<*const u8> { + // Write the interpreter entry prologue. Might be NULL when out of memory. + let (code_ptr, reg_mapping) = gen_entry_prologue(cb, ocb, blockid, stack_size, jit_exception)?; + + // Find or compile a block version + let mut ctx = Context::default(); + ctx.stack_size = stack_size; + ctx.reg_mapping = reg_mapping; + let block = match find_block_version(blockid, &ctx) { + // If an existing block is found, generate a jump to the block. + Some(blockref) => { + let mut asm = Assembler::new_without_iseq(); + asm.jmp(unsafe { blockref.as_ref() }.start_addr.into()); + asm.compile(cb, Some(ocb))?; + Some(blockref) + } + // If this block hasn't yet been compiled, generate blocks after the entry guard. + None => gen_block_series(blockid, &ctx, ec, cb, ocb), + }; + match block { // Compilation failed None => { // Trigger code GC. This entry point will be recompiled later. 
- cb.code_gc(); + if get_option!(code_gc) { + cb.code_gc(ocb); + } return None; } // If the block contains no Ruby instructions Some(block) => { - let block = block.borrow(); - if block.end_idx == insn_idx { + let block = unsafe { block.as_ref() }; + if block.iseq_range.is_empty() { return None; } } } + // Count the number of entry points we compile + incr_counter!(compiled_iseq_entry); + // Compilation successful and block not empty - return code_ptr; + Some(code_ptr.raw_ptr(cb)) } -/// Generate code for a branch, possibly rewriting and changing the size of it -fn regenerate_branch(cb: &mut CodeBlock, branch: &mut Branch) { - // FIXME - /* - if (branch->start_addr < cb_get_ptr(cb, yjit_codepage_frozen_bytes)) { - // Generating this branch would modify frozen bytes. Do nothing. - return; +// Change the entry's jump target from an entry stub to a next entry +pub fn regenerate_entry(cb: &mut CodeBlock, entryref: &EntryRef, next_entry: CodePtr) { + let mut asm = Assembler::new_without_iseq(); + asm_comment!(asm, "regenerate_entry"); + + // gen_entry_guard generates cmp + jne. We're rewriting only jne. + asm.jne(next_entry.into()); + + // Move write_pos to rewrite the entry + let old_write_pos = cb.get_write_pos(); + let old_dropped_bytes = cb.has_dropped_bytes(); + cb.set_write_ptr(unsafe { entryref.as_ref() }.start_addr); + cb.set_dropped_bytes(false); + asm.compile(cb, None).expect("can rewrite existing code"); + + // Rewind write_pos to the original one + assert_eq!(cb.get_write_ptr(), unsafe { entryref.as_ref() }.end_addr); + cb.set_pos(old_write_pos); + cb.set_dropped_bytes(old_dropped_bytes); +} + +pub type PendingEntryRef = Rc<PendingEntry>; + +/// Create a new entry reference for an ISEQ +pub fn new_pending_entry() -> PendingEntryRef { + let entry = PendingEntry { + uninit_entry: Box::new(MaybeUninit::uninit()), + start_addr: Cell::new(None), + end_addr: Cell::new(None), + }; + return Rc::new(entry); +} + +c_callable! { + /// Generated code calls this function with the SysV calling convention. + /// See [gen_entry_stub]. + fn entry_stub_hit(entry_ptr: *const c_void, ec: EcPtr) -> *const u8 { + with_compile_time(|| { + with_vm_lock(src_loc!(), || { + let cb = CodegenGlobals::get_inline_cb(); + let ocb = CodegenGlobals::get_outlined_cb(); + + let addr = entry_stub_hit_body(entry_ptr, ec, cb, ocb) + .unwrap_or_else(|| { + // Trigger code GC (e.g. no space). + // This entry point will be recompiled later. + if get_option!(code_gc) { + cb.code_gc(ocb); + } + CodegenGlobals::get_stub_exit_code().raw_ptr(cb) + }); + + cb.mark_all_executable(); + ocb.unwrap().mark_all_executable(); + + addr + }) + }) } - */ +} - // Remove old comments - if let (Some(start_addr), Some(end_addr)) = (branch.start_addr, branch.end_addr) { - cb.remove_comments(start_addr, end_addr) +/// Called by the generated code when an entry stub is executed +fn entry_stub_hit_body( + entry_ptr: *const c_void, + ec: EcPtr, + cb: &mut CodeBlock, + ocb: &mut OutlinedCb +) -> Option<*const u8> { + // Get ISEQ and insn_idx from the current ec->cfp + let cfp = unsafe { get_ec_cfp(ec) }; + let iseq = unsafe { get_cfp_iseq(cfp) }; + let insn_idx = iseq_pc_to_insn_idx(iseq, unsafe { get_cfp_pc(cfp) })?; + let blockid = BlockId { iseq, idx: insn_idx }; + let stack_size: u8 = unsafe { + u8::try_from(get_cfp_sp(cfp).offset_from(get_cfp_bp(cfp))).ok()? 
+ }; + + // Compile a new entry guard as a next entry + let next_entry = cb.get_write_ptr(); + let mut asm = Assembler::new(unsafe { get_iseq_body_local_table_size(iseq) }); + let pending_entry = gen_entry_chain_guard(&mut asm, ocb, blockid)?; + let reg_mapping = gen_entry_reg_mapping(&mut asm, blockid, stack_size); + asm.compile(cb, Some(ocb))?; + + // Find or compile a block version + let mut ctx = Context::default(); + ctx.stack_size = stack_size; + ctx.reg_mapping = reg_mapping; + let blockref = match find_block_version(blockid, &ctx) { + // If an existing block is found, generate a jump to the block. + Some(blockref) => { + let mut asm = Assembler::new_without_iseq(); + asm.jmp(unsafe { blockref.as_ref() }.start_addr.into()); + asm.compile(cb, Some(ocb))?; + Some(blockref) + } + // If this block hasn't yet been compiled, generate blocks after the entry guard. + None => gen_block_series(blockid, &ctx, ec, cb, ocb), + }; + + // Commit or retry the entry + if blockref.is_some() { + // Regenerate the previous entry + let entryref = NonNull::<Entry>::new(entry_ptr as *mut Entry).expect("Entry should not be null"); + regenerate_entry(cb, &entryref, next_entry); + + // Write an entry to the heap and push it to the ISEQ + let pending_entry = Rc::try_unwrap(pending_entry).ok().expect("PendingEntry should be unique"); + get_or_create_iseq_payload(iseq).entries.push(pending_entry.into_entry()); } - let branch_terminates_block = branch.end_addr == branch.block.borrow().end_addr; + // Return a code pointer if the block is successfully compiled. The entry stub needs + // to jump to the entry preceding the block to load the registers in reg_mapping. + blockref.map(|_block| next_entry.raw_ptr(cb)) +} + +/// Generate a stub that calls entry_stub_hit +pub fn gen_entry_stub(entry_address: usize, ocb: &mut OutlinedCb) -> Option<CodePtr> { + let ocb = ocb.unwrap(); + + let mut asm = Assembler::new_without_iseq(); + asm_comment!(asm, "entry stub hit"); + + asm.mov(C_ARG_OPNDS[0], entry_address.into()); + + // Jump to trampoline to call entry_stub_hit() + // Not really a side exit, just don't need a padded jump here. + asm.jmp(CodegenGlobals::get_entry_stub_hit_trampoline().as_side_exit()); + + asm.compile(ocb, None).map(|(code_ptr, _)| code_ptr) +} + +/// A trampoline used by gen_entry_stub. entry_stub_hit may issue Code GC, so +/// it's useful for Code GC to call entry_stub_hit from a globally shared code. +pub fn gen_entry_stub_hit_trampoline(ocb: &mut OutlinedCb) -> Option<CodePtr> { + let ocb = ocb.unwrap(); + let mut asm = Assembler::new_without_iseq(); + + // See gen_entry_guard for how it's used. + asm_comment!(asm, "entry_stub_hit() trampoline"); + let jump_addr = asm.ccall(entry_stub_hit as *mut u8, vec![C_ARG_OPNDS[0], EC]); + + // Jump to the address returned by the entry_stub_hit() call + asm.jmp_opnd(jump_addr); + + asm.compile(ocb, None).map(|(code_ptr, _)| code_ptr) +} + +/// Generate code for a branch, possibly rewriting and changing the size of it +fn regenerate_branch(cb: &mut CodeBlock, branch: &Branch) { + // Remove old comments + cb.remove_comments(branch.start_addr, branch.end_addr.get()); + + // SAFETY: having a &Branch implies branch.block is initialized. 
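regenerate_entry() above and the rest of regenerate_branch() below share one patching idiom. A toy sketch, using a hypothetical CodeBuffer rather than YJIT's CodeBlock: save the write cursor, point it at the old code, emit the replacement over it, then restore the cursor so normal emission resumes where it left off.

    struct CodeBuffer {
        bytes: Vec<u8>,
        write_pos: usize,
    }

    impl CodeBuffer {
        fn emit(&mut self, code: &[u8]) {
            for &byte in code {
                if self.write_pos < self.bytes.len() {
                    self.bytes[self.write_pos] = byte; // overwrite existing code
                } else {
                    self.bytes.push(byte); // append new code at the end
                }
                self.write_pos += 1;
            }
        }

        fn rewrite_at(&mut self, pos: usize, code: &[u8]) {
            let saved_pos = self.write_pos; // remember the current cursor
            self.write_pos = pos;           // jump back to the patch site
            self.emit(code);                // the patch must not grow past the old code
            self.write_pos = saved_pos;     // resume emitting at the original position
        }
    }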
+ let block = unsafe { branch.block.get().as_ref() }; + + let branch_terminates_block = branch.end_addr.get() == block.get_end_addr(); // Generate the branch - let mut asm = Assembler::new(); - asm.comment("regenerate_branch"); - (branch.gen_fn)( + let mut asm = Assembler::new_without_iseq(); + asm_comment!(asm, "regenerate_branch"); + branch.gen_fn.call( &mut asm, - branch.get_target_address(0).unwrap(), - branch.get_target_address(1), - branch.shape, + Target::CodePtr(branch.get_target_address(0).unwrap()), + branch.get_target_address(1).map(|addr| Target::CodePtr(addr)), ); + // If the entire block is the branch and the block could be invalidated, + // we need to pad to ensure there is room for invalidation patching. + if branch.start_addr == block.start_addr && branch_terminates_block && block.entry_exit.is_some() { + asm.pad_inval_patch(); + } + // Rewrite the branch let old_write_pos = cb.get_write_pos(); let old_dropped_bytes = cb.has_dropped_bytes(); - cb.set_write_ptr(branch.start_addr.unwrap()); + cb.set_write_ptr(branch.start_addr); cb.set_dropped_bytes(false); - asm.compile(cb); + asm.compile(cb, None).expect("can rewrite existing code"); + let new_end_addr = cb.get_write_ptr(); - branch.end_addr = Some(cb.get_write_ptr()); + branch.end_addr.set(new_end_addr); // The block may have shrunk after the branch is rewritten - let mut block = branch.block.borrow_mut(); if branch_terminates_block { // Adjust block size - block.end_addr = branch.end_addr; + block.end_addr.set(new_end_addr); } // cb.write_pos is both a write cursor and a marker for the end of @@ -1708,46 +3480,38 @@ fn regenerate_branch(cb: &mut CodeBlock, branch: &mut Branch) { // The branch sits at the end of cb and consumed some memory. // Keep cb.write_pos. } -} -/// Create a new outgoing branch entry for a block -fn make_branch_entry(block: &BlockRef, gen_fn: BranchGenFn) -> BranchRef { - let branch = Branch { - // Block this is attached to - block: block.clone(), - - // Positions where the generated code starts and ends - start_addr: None, - end_addr: None, - - // Branch target blocks and their contexts - targets: [None, None], + branch.assert_layout(); +} - // Branch code generation function - gen_fn: gen_fn, +pub type PendingBranchRef = Rc<PendingBranch>; - // Shape of the branch - shape: BranchShape::Default, - }; +/// Create a new outgoing branch entry for a block +fn new_pending_branch(jit: &mut JITState, gen_fn: BranchGenFn) -> PendingBranchRef { + let branch = Rc::new(PendingBranch { + uninit_branch: Box::new(MaybeUninit::uninit()), + gen_fn, + start_addr: Cell::new(None), + end_addr: Cell::new(None), + targets: [Cell::new(None), Cell::new(None)], + }); // Add to the list of outgoing branches for the block - let branchref = Rc::new(RefCell::new(branch)); - block.borrow_mut().push_outgoing(branchref.clone()); - incr_counter!(compiled_branch_count); + jit.queue_outgoing_branch(branch.clone()); - return branchref; + branch } c_callable! { /// Generated code calls this function with the SysV calling convention. - /// See [set_branch_target]. + /// See [gen_branch_stub]. 
fn branch_stub_hit( branch_ptr: *const c_void, target_idx: u32, ec: EcPtr, ) -> *const u8 { with_vm_lock(src_loc!(), || { - branch_stub_hit_body(branch_ptr, target_idx, ec) + with_compile_time(|| { branch_stub_hit_body(branch_ptr, target_idx, ec) }) }) } } @@ -1759,24 +3523,18 @@ fn branch_stub_hit_body(branch_ptr: *const c_void, target_idx: u32, ec: EcPtr) - println!("branch_stub_hit"); } - assert!(!branch_ptr.is_null()); - - //branch_ptr is actually: - //branch_ptr: *const RefCell<Branch> - let branch_rc = unsafe { BranchRef::from_raw(branch_ptr as *const RefCell<Branch>) }; + let branch_ref = NonNull::<Branch>::new(branch_ptr as *mut Branch) + .expect("Branches should not be null"); - // We increment the strong count because we want to keep the reference owned - // by the branch stub alive. Return branch stubs can be hit multiple times. - unsafe { Rc::increment_strong_count(branch_ptr) }; - - let mut branch = branch_rc.borrow_mut(); + // SAFETY: We have the VM lock, and the branch is initialized by the time generated + // code calls this function. + // + // Careful, don't make a `&Block` from `branch.block` here because we might + // delete it later in delete_empty_defer_block(). + let branch = unsafe { branch_ref.as_ref() }; let branch_size_on_entry = branch.code_size(); let target_idx: usize = target_idx.as_usize(); - let target = branch.targets[target_idx].as_ref().unwrap(); - let target_blockid = target.get_blockid(); - let target_ctx = target.get_ctx(); - let target_branch_shape = match target_idx { 0 => BranchShape::Next0, 1 => BranchShape::Next1, @@ -1786,22 +3544,33 @@ fn branch_stub_hit_body(branch_ptr: *const c_void, target_idx: u32, ec: EcPtr) - let cb = CodegenGlobals::get_inline_cb(); let ocb = CodegenGlobals::get_outlined_cb(); - // If this branch has already been patched, return the dst address - // Note: ractors can cause the same stub to be hit multiple times - if let BranchTarget::Block(_) = target.as_ref() { - return target.get_address().unwrap().raw_ptr(); - } + let (target_blockid, target_ctx): (BlockId, Context) = unsafe { + // SAFETY: no mutation of the target's Cell. Just reading out data. + let target = branch.targets[target_idx].ref_unchecked().as_ref().unwrap(); + + // If this branch has already been patched, return the dst address + // Note: recursion can cause the same stub to be hit multiple times + if let BranchTarget::Block(_) = target.as_ref() { + return target.get_address().unwrap().raw_ptr(cb); + } + + let target_ctx = Context::decode(target.get_ctx()); + (target.get_blockid(), target_ctx) + }; let (cfp, original_interp_sp) = unsafe { let cfp = get_ec_cfp(ec); let original_interp_sp = get_cfp_sp(cfp); - let running_iseq = rb_cfp_get_iseq(cfp); - let reconned_pc = rb_iseq_pc_at_idx(running_iseq, target_blockid.idx); - let reconned_sp = original_interp_sp.offset(target_ctx.sp_offset.into()); - + let running_iseq = get_cfp_iseq(cfp); assert_eq!(running_iseq, target_blockid.iseq as _, "each stub expects a particular iseq"); + let reconned_pc = rb_iseq_pc_at_idx(running_iseq, target_blockid.idx.into()); + let reconned_sp = original_interp_sp.offset(target_ctx.sp_offset.into()); + // Unlike in the interpreter, our `leave` doesn't write to the caller's + // SP -- we do it in the returned-to code. Account for this difference. 
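The adjustment described above and applied on the next line reduces to a small calculation. A worked sketch in VALUE-slot units, using indices instead of raw pointers purely for illustration:

    // sp_offset is the branch target's offset from the interpreter SP, and a
    // return landing adds one slot for the return value that only JIT code has
    // produced so far.
    fn reconstructed_sp_index(interp_sp_idx: isize, sp_offset: i8, is_return_landing: bool) -> isize {
        interp_sp_idx + sp_offset as isize + is_return_landing as isize
    }

    // For example, sp_offset = 2 at a return landing reconstructs SP three slots
    // above the interpreter's current SP.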
+ let reconned_sp = reconned_sp.add(target_ctx.is_return_landing().into()); + // Update the PC in the current CFP, because it may be out of sync in JITted code rb_set_cfp_pc(cfp, reconned_pc); @@ -1813,83 +3582,97 @@ fn branch_stub_hit_body(branch_ptr: *const c_void, target_idx: u32, ec: EcPtr) - // So we do it here instead. rb_set_cfp_sp(cfp, reconned_sp); + // Bail if code GC is disabled and we've already run out of spaces. + if !get_option!(code_gc) && (cb.has_dropped_bytes() || ocb.unwrap().has_dropped_bytes()) { + return CodegenGlobals::get_stub_exit_code().raw_ptr(cb); + } + + // Bail if we're about to run out of native stack space. + // We've just reconstructed interpreter state. + if rb_ec_stack_check(ec as _) != 0 { + return CodegenGlobals::get_stub_exit_code().raw_ptr(cb); + } + + // Bail if this branch is housed in an invalidated (dead) block. + // This only happens in rare invalidation scenarios and we need + // to avoid linking a dead block to a live block with a branch. + if branch.block.get().as_ref().iseq.get().is_null() { + return CodegenGlobals::get_stub_exit_code().raw_ptr(cb); + } + (cfp, original_interp_sp) }; // Try to find an existing compiled version of this block let mut block = find_block_version(target_blockid, &target_ctx); - + let mut branch_modified = false; // If this block hasn't yet been compiled if block.is_none() { - let branch_old_shape = branch.shape; - let mut branch_modified = false; + let branch_old_shape = branch.gen_fn.get_shape(); // If the new block can be generated right after the branch (at cb->write_pos) - if Some(cb.get_write_ptr()) == branch.end_addr { + if cb.get_write_ptr() == branch.end_addr.get() { // This branch should be terminating its block - assert!(branch.end_addr == branch.block.borrow().end_addr); + assert!(branch.end_addr == unsafe { branch.block.get().as_ref() }.end_addr); // Change the branch shape to indicate the target block will be placed next - branch.shape = target_branch_shape; + branch.gen_fn.set_shape(target_branch_shape); // Rewrite the branch with the new, potentially more compact shape - regenerate_branch(cb, &mut branch); + regenerate_branch(cb, branch); branch_modified = true; // Ensure that the branch terminates the codeblock just like // before entering this if block. This drops bytes off the end // in case we shrank the branch when regenerating. - cb.set_write_ptr(branch.end_addr.unwrap()); + cb.set_write_ptr(branch.end_addr.get()); } // Compile the new block version - drop(branch); // Stop mutable RefCell borrow since GC might borrow branch for marking block = gen_block_series(target_blockid, &target_ctx, ec, cb, ocb); - branch = branch_rc.borrow_mut(); if block.is_none() && branch_modified { // We couldn't generate a new block for the branch, but we modified the branch. // Restore the branch by regenerating it. 
- branch.shape = branch_old_shape; - regenerate_branch(cb, &mut branch); + branch.gen_fn.set_shape(branch_old_shape); + regenerate_branch(cb, branch); } } // Finish building the new block let dst_addr = match block { - Some(block_rc) => { - let mut block: RefMut<_> = block_rc.borrow_mut(); + Some(new_block) => { + let new_block = unsafe { new_block.as_ref() }; // Branch shape should reflect layout - assert!(!(branch.shape == target_branch_shape && block.start_addr != branch.end_addr)); + assert!(!(branch.gen_fn.get_shape() == target_branch_shape && new_block.start_addr != branch.end_addr.get())); + + // When block housing this branch is empty, try to free it + delete_empty_defer_block(branch, new_block, target_ctx, target_blockid); // Add this branch to the list of incoming branches for the target - block.push_incoming(branch_rc.clone()); - mem::drop(block); // end mut borrow + new_block.push_incoming(branch_ref); // Update the branch target address - branch.targets[target_idx] = Some(Box::new(BranchTarget::Block(block_rc.clone()))); + branch.targets[target_idx].set(Some(Box::new(BranchTarget::Block(new_block.into())))); // Rewrite the branch with the new jump target address - regenerate_branch(cb, &mut branch); + regenerate_branch(cb, branch); // Restore interpreter sp, since the code hitting the stub expects the original. unsafe { rb_set_cfp_sp(cfp, original_interp_sp) }; - block_rc.borrow().start_addr.unwrap() + new_block.start_addr } None => { - // Code GC needs to borrow blocks for invalidation, so their mutable - // borrows must be dropped first. - drop(block); - drop(branch); // Trigger code GC. The whole ISEQ will be recompiled later. // We shouldn't trigger it in the middle of compilation in branch_stub_hit // because incomplete code could be used when cb.dropped_bytes is flipped // by code GC. So this place, after all compilation, is the safest place // to hook code GC on branch_stub_hit. - cb.code_gc(); - branch = branch_rc.borrow_mut(); + if get_option!(code_gc) { + cb.code_gc(ocb); + } // Failed to service the stub by generating a new block so now we // need to exit to the interpreter at the stubbed location. We are @@ -1909,88 +3692,120 @@ fn branch_stub_hit_body(branch_ptr: *const c_void, target_idx: u32, ec: EcPtr) - assert!( new_branch_size <= branch_size_on_entry, "branch stubs should never enlarge branches (start_addr: {:?}, old_size: {}, new_size: {})", - branch.start_addr.unwrap().raw_ptr(), branch_size_on_entry, new_branch_size, + branch.start_addr.raw_ptr(cb), branch_size_on_entry, new_branch_size, ); // Return a pointer to the compiled block version - dst_addr.raw_ptr() + dst_addr.raw_ptr(cb) } -/// Set up a branch target at an index with a block version or a stub -fn set_branch_target( - target_idx: u32, - target: BlockId, - ctx: &Context, - branchref: &BranchRef, - branch: &mut Branch, - ocb: &mut OutlinedCb, -) { - let maybe_block = find_block_version(target, ctx); - - // If the block already exists - if let Some(blockref) = maybe_block { - let mut block = blockref.borrow_mut(); - - // Add an incoming branch into this block - block.push_incoming(branchref.clone()); +/// Part of branch_stub_hit(). +/// If we've hit a deferred branch, and the housing block consists solely of the branch, rewire +/// incoming branches to the new block and delete the housing block. 
+fn delete_empty_defer_block(branch: &Branch, new_block: &Block, target_ctx: Context, target_blockid: BlockId) +{ + // This &Block should be unique, relying on the VM lock + let housing_block: &Block = unsafe { branch.block.get().as_ref() }; + if target_ctx.is_deferred() && + target_blockid == housing_block.get_blockid() && + housing_block.outgoing.len() == 1 && + { + // The block is empty when iseq_range is one instruction long. + let range = &housing_block.iseq_range; + let iseq = housing_block.iseq.get(); + let start_opcode = iseq_opcode_at_idx(iseq, range.start.into()) as usize; + let empty_end = range.start + insn_len(start_opcode) as IseqIdx; + range.end == empty_end + } + { + // Divert incoming branches of housing_block to the new block + housing_block.incoming.for_each(|incoming| { + let incoming = unsafe { incoming.as_ref() }; + for target in 0..incoming.targets.len() { + // SAFETY: No cell mutation; copying out a BlockRef. + if Some(BlockRef::from(housing_block)) == unsafe { + incoming.targets[target] + .ref_unchecked() + .as_ref() + .and_then(|target| target.get_block()) + } { + incoming.targets[target].set(Some(Box::new(BranchTarget::Block(new_block.into())))); + } + } + new_block.push_incoming(incoming.into()); + }); - // Fill out the target with this block - branch.targets[target_idx.as_usize()] = Some(Box::new(BranchTarget::Block(blockref.clone()))); + // Transplant the branch we've just hit to the new block + mem::drop(housing_block.outgoing.0.take()); + new_block.outgoing.push(branch.into()); + let housing_block: BlockRef = branch.block.replace(new_block.into()); + // Free the old housing block; there should now be no live &Block. + remove_block_version(&housing_block); + unsafe { free_block(housing_block, false) }; - return; + incr_counter!(deleted_defer_block_count); } +} +/// Generate a "stub", a piece of code that calls the compiler back when run. +/// A piece of code that redeems for more code; a thunk for code. +fn gen_branch_stub( + ctx: u32, + iseq: IseqPtr, + ocb: &mut OutlinedCb, + branch_struct_address: usize, + target_idx: u32, +) -> Option<CodePtr> { let ocb = ocb.unwrap(); - // Generate an outlined stub that will call branch_stub_hit() - let stub_addr = ocb.get_write_ptr(); + let mut asm = Assembler::new(unsafe { get_iseq_body_local_table_size(iseq) }); + asm.ctx = Context::decode(ctx); + asm.set_reg_mapping(asm.ctx.reg_mapping); + asm_comment!(asm, "branch stub hit"); - // Get a raw pointer to the branch. We clone and then decrement the strong count which overall - // balances the strong count. We do this so that we're passing the result of [Rc::into_raw] to - // [Rc::from_raw] as required. - // We make sure the block housing the branch is still alive when branch_stub_hit() is running. - let branch_ptr: *const RefCell<Branch> = BranchRef::into_raw(branchref.clone()); - unsafe { BranchRef::decrement_strong_count(branch_ptr) }; + if asm.ctx.is_return_landing() { + asm.mov(SP, Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP)); + let top = asm.stack_push(Type::Unknown); + asm.mov(top, C_RET_OPND); + } + + // Save caller-saved registers before C_ARG_OPNDS get clobbered. + // Spill all registers for consistency with the trampoline. 
+ for &reg in caller_saved_temp_regs() {
+ asm.cpush(Opnd::Reg(reg));
+ }

- let mut asm = Assembler::new();
- asm.comment("branch stub hit");
+ // Spill temps to the VM stack as well for jit.peek_at_stack()
+ asm.spill_regs();

 // Set up the arguments unique to this stub for:
- // branch_stub_hit(branch_ptr, target_idx, ec)
- asm.mov(C_ARG_OPNDS[0], Opnd::const_ptr(branch_ptr as *const u8));
+ //
+ // branch_stub_hit(branch_ptr, target_idx, ec)
+ //
+ // Bake pointer to Branch into output code.
+ // We make sure the block housing the branch is still alive when branch_stub_hit() is running.
+ asm.mov(C_ARG_OPNDS[0], branch_struct_address.into());
 asm.mov(C_ARG_OPNDS[1], target_idx.into());

 // Jump to trampoline to call branch_stub_hit()
 // Not really a side exit, just don't need a padded jump here.
 asm.jmp(CodegenGlobals::get_branch_stub_hit_trampoline().as_side_exit());

- asm.compile(ocb);
-
- if ocb.has_dropped_bytes() {
- // No space
- } else {
- // Fill the branch target with a stub
- branch.targets[target_idx.as_usize()] = Some(Box::new(BranchTarget::Stub(Box::new(BranchStub {
- address: Some(stub_addr),
- id: target,
- ctx: ctx.clone(),
- }))));
- }
+ asm.compile(ocb, None).map(|(code_ptr, _)| code_ptr)
}

-pub fn gen_branch_stub_hit_trampoline(ocb: &mut OutlinedCb) -> CodePtr {
+pub fn gen_branch_stub_hit_trampoline(ocb: &mut OutlinedCb) -> Option<CodePtr> {
 let ocb = ocb.unwrap();
- let code_ptr = ocb.get_write_ptr();
- let mut asm = Assembler::new();
+ let mut asm = Assembler::new_without_iseq();

 // For `branch_stub_hit(branch_ptr, target_idx, ec)`,
- // `branch_ptr` and `target_idx` is different for each stub,
+ // `branch_ptr` and `target_idx` are different for each stub,
 // but the call and what's after is the same. This trampoline
 // is the unchanging part.
 // Since this trampoline is static, it allows code GC inside
 // branch_stub_hit() to free stubs without problems.
- asm.comment("branch_stub_hit() trampoline");
- let jump_addr = asm.ccall(
+ asm_comment!(asm, "branch_stub_hit() trampoline");
+ let stub_hit_ret = asm.ccall(
 branch_stub_hit as *mut u8,
 vec![
 C_ARG_OPNDS[0],
@@ -1998,226 +3813,303 @@ pub fn gen_branch_stub_hit_trampoline(ocb: &mut OutlinedCb) -> CodePtr {
 EC,
 ]
 );
+ let jump_addr = asm.load(stub_hit_ret);
+
+ // Restore caller-saved registers for stack temps
+ for &reg in caller_saved_temp_regs().rev() {
+ asm.cpop_into(Opnd::Reg(reg));
+ }

 // Jump to the address returned by the branch_stub_hit() call
 asm.jmp_opnd(jump_addr);

- asm.compile(ocb);
+ // HACK: popping into C_RET_REG clobbers the return value of branch_stub_hit() we need to jump
+ // to, so we need a scratch register to preserve it. This extends the live range of the C
+ // return register so we get something else for the return value.
+ let _ = asm.live_reg_opnd(stub_hit_ret);

- code_ptr
+ asm.compile(ocb, None).map(|(code_ptr, _)| code_ptr)
+}
+
+/// Return registers to be pushed and popped on branch_stub_hit.
+pub fn caller_saved_temp_regs() -> impl Iterator<Item = &'static Reg> + DoubleEndedIterator {
+ let temp_regs = Assembler::get_temp_regs().iter();
+ let len = temp_regs.len();
+ // The return value gen_leave() leaves in C_RET_REG
+ // needs to survive the branch_stub_hit() call.
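The register list assembled just below, including the C_RET_REG noted above, reduces to simple arithmetic. A sketch of the count, under the assumption that each push is 8 bytes and the x86-64 SysV ABI expects 16-byte stack alignment around calls:

    // Count how many registers end up pushed around the branch_stub_hit() call.
    fn pushes_around_call(temp_reg_count: usize) -> usize {
        let with_ret_reg = temp_reg_count + 1; // C_RET_REG is always chained in
        if cfg!(target_arch = "x86_64") && with_ret_reg % 2 == 1 {
            with_ret_reg + 1 // pad to an even count to keep SP 16-byte aligned
        } else {
            with_ret_reg
        }
    }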
+ let regs = temp_regs.chain(std::iter::once(&C_RET_REG)); + + // On x86_64, maintain 16-byte stack alignment + if cfg!(target_arch = "x86_64") && len % 2 == 0 { + static ONE_MORE: [Reg; 1] = [C_RET_REG]; + regs.chain(ONE_MORE.iter()) + } else { + regs.chain(&[]) + } } impl Assembler { + /// Mark the start position of a patchable entry point in the machine code + pub fn mark_entry_start(&mut self, entryref: &PendingEntryRef) { + // We need to create our own entry rc object + // so that we can move the closure below + let entryref = entryref.clone(); + + self.pos_marker(move |code_ptr, _| { + entryref.start_addr.set(Some(code_ptr)); + }); + } + + /// Mark the end position of a patchable entry point in the machine code + pub fn mark_entry_end(&mut self, entryref: &PendingEntryRef) { + // We need to create our own entry rc object + // so that we can move the closure below + let entryref = entryref.clone(); + + self.pos_marker(move |code_ptr, _| { + entryref.end_addr.set(Some(code_ptr)); + }); + } + // Mark the start position of a patchable branch in the machine code - fn mark_branch_start(&mut self, branchref: &BranchRef) + fn mark_branch_start(&mut self, branchref: &PendingBranchRef) { // We need to create our own branch rc object // so that we can move the closure below let branchref = branchref.clone(); - self.pos_marker(move |code_ptr| { - let mut branch = branchref.borrow_mut(); - branch.start_addr = Some(code_ptr); + self.pos_marker(move |code_ptr, _| { + branchref.start_addr.set(Some(code_ptr)); }); } // Mark the end position of a patchable branch in the machine code - fn mark_branch_end(&mut self, branchref: &BranchRef) + fn mark_branch_end(&mut self, branchref: &PendingBranchRef) { // We need to create our own branch rc object // so that we can move the closure below let branchref = branchref.clone(); - self.pos_marker(move |code_ptr| { - let mut branch = branchref.borrow_mut(); - branch.end_addr = Some(code_ptr); + self.pos_marker(move |code_ptr, _| { + branchref.end_addr.set(Some(code_ptr)); }); } } +#[must_use] pub fn gen_branch( - jit: &JITState, + jit: &mut JITState, asm: &mut Assembler, - ocb: &mut OutlinedCb, target0: BlockId, ctx0: &Context, target1: Option<BlockId>, ctx1: Option<&Context>, gen_fn: BranchGenFn, -) { - let branchref = make_branch_entry(&jit.get_block(), gen_fn); - let branch = &mut branchref.borrow_mut(); +) -> Option<()> { + let branch = new_pending_branch(jit, gen_fn); // Get the branch targets or stubs - set_branch_target(0, target0, ctx0, &branchref, branch, ocb); - if let Some(ctx) = ctx1 { - set_branch_target(1, target1.unwrap(), ctx, &branchref, branch, ocb); - if branch.targets[1].is_none() { - return; // avoid unwrap() in gen_fn() + let target0_addr = branch.set_target(0, target0, ctx0, jit)?; + let target1_addr = if let Some(ctx) = ctx1 { + let addr = branch.set_target(1, target1.unwrap(), ctx, jit); + if addr.is_none() { + // target1 requested but we're out of memory. 
+ // Avoid unwrap() in gen_fn() + return None; } - } - // Call the branch generation function - asm.mark_branch_start(&branchref); - if let Some(dst_addr) = branch.get_target_address(0) { - gen_fn(asm, dst_addr, branch.get_target_address(1), BranchShape::Default); - } - asm.mark_branch_end(&branchref); -} + addr + } else { None }; -fn gen_jump_branch( - asm: &mut Assembler, - target0: CodePtr, - _target1: Option<CodePtr>, - shape: BranchShape, -) { - if shape == BranchShape::Next1 { - panic!("Branch shape Next1 not allowed in gen_jump_branch!"); - } + // Call the branch generation function + asm.mark_branch_start(&branch); + branch.gen_fn.call(asm, Target::CodePtr(target0_addr), target1_addr.map(|addr| Target::CodePtr(addr))); + asm.mark_branch_end(&branch); - if shape == BranchShape::Default { - asm.jmp(target0.into()); - } + Some(()) } -pub fn gen_direct_jump(jit: &JITState, ctx: &Context, target0: BlockId, asm: &mut Assembler) { - let branchref = make_branch_entry(&jit.get_block(), gen_jump_branch); - let mut branch = branchref.borrow_mut(); - - let mut new_target = BranchTarget::Stub(Box::new(BranchStub { - address: None, - ctx: ctx.clone(), - id: target0, - })); - +pub fn gen_direct_jump(jit: &mut JITState, ctx: &Context, target0: BlockId, asm: &mut Assembler) { + let branch = new_pending_branch(jit, BranchGenFn::JumpToTarget0(Cell::new(BranchShape::Default))); let maybe_block = find_block_version(target0, ctx); // If the block already exists - if let Some(blockref) = maybe_block { - let mut block = blockref.borrow_mut(); - let block_addr = block.start_addr.unwrap(); - - block.push_incoming(branchref.clone()); - - new_target = BranchTarget::Block(blockref.clone()); - - branch.shape = BranchShape::Default; + let new_target = if let Some(blockref) = maybe_block { + let block = unsafe { blockref.as_ref() }; + let block_addr = block.start_addr; // Call the branch generation function - asm.comment("gen_direct_jmp: existing block"); - asm.mark_branch_start(&branchref); - gen_jump_branch(asm, block_addr, None, BranchShape::Default); - asm.mark_branch_end(&branchref); - } else { - // `None` in new_target.address signals gen_block_series() to compile the - // target block right after this one (fallthrough). - branch.shape = BranchShape::Next0; + asm_comment!(asm, "gen_direct_jmp: existing block"); + asm.mark_branch_start(&branch); + branch.gen_fn.call(asm, Target::CodePtr(block_addr), None); + asm.mark_branch_end(&branch); + BranchTarget::Block(blockref) + } else { // The branch is effectively empty (a noop) - asm.comment("gen_direct_jmp: fallthrough"); - asm.mark_branch_start(&branchref); - asm.mark_branch_end(&branchref); - } + asm_comment!(asm, "gen_direct_jmp: fallthrough"); + asm.mark_branch_start(&branch); + asm.mark_branch_end(&branch); + branch.gen_fn.set_shape(BranchShape::Next0); + + // `None` in new_target.address signals gen_block_series() to + // compile the target block right after this one (fallthrough). 
+ BranchTarget::Stub(Box::new(BranchStub { + address: None, + ctx: Context::encode(ctx), + iseq: Cell::new(target0.iseq), + iseq_idx: target0.idx, + })) + }; - branch.targets[0] = Some(Box::new(new_target)); + branch.targets[0].set(Some(Box::new(new_target))); } /// Create a stub to force the code up to this point to be executed -pub fn defer_compilation( - jit: &JITState, - cur_ctx: &Context, - asm: &mut Assembler, - ocb: &mut OutlinedCb, -) { - if cur_ctx.chain_depth != 0 { +pub fn defer_compilation(jit: &mut JITState, asm: &mut Assembler) -> Result<(), ()> { + if asm.ctx.is_deferred() { panic!("Double defer!"); } - let mut next_ctx = cur_ctx.clone(); + let mut next_ctx = asm.ctx; - if next_ctx.chain_depth == u8::MAX { - panic!("max block version chain depth reached!"); - } - next_ctx.chain_depth += 1; + next_ctx.mark_as_deferred(); - let block_rc = jit.get_block(); - let branch_rc = make_branch_entry(&jit.get_block(), gen_jump_branch); - let mut branch = branch_rc.borrow_mut(); - let block = block_rc.borrow(); + let branch = new_pending_branch(jit, BranchGenFn::JumpToTarget0(Cell::new(BranchShape::Default))); let blockid = BlockId { - iseq: block.blockid.iseq, + iseq: jit.get_iseq(), idx: jit.get_insn_idx(), }; - set_branch_target(0, blockid, &next_ctx, &branch_rc, &mut branch, ocb); + + // Likely a stub since the context is marked as deferred(). + let dst_addr = branch.set_target(0, blockid, &next_ctx, jit).ok_or(())?; + + // Pad the block if it has the potential to be invalidated. This must be + // done before gen_fn() in case the jump is overwritten by a fallthrough. + if jit.block_entry_exit.is_some() { + asm.pad_inval_patch(); + } // Call the branch generation function - asm.comment("defer_compilation"); - asm.mark_branch_start(&branch_rc); - if let Some(dst_addr) = branch.get_target_address(0) { - gen_jump_branch(asm, dst_addr, None, BranchShape::Default); + asm_comment!(asm, "defer_compilation"); + asm.mark_branch_start(&branch); + branch.gen_fn.call(asm, Target::CodePtr(dst_addr), None); + asm.mark_branch_end(&branch); + + // If the block we're deferring from is empty + if jit.get_starting_insn_idx() == jit.get_insn_idx() { + incr_counter!(defer_empty_count); } - asm.mark_branch_end(&branch_rc); incr_counter!(defer_count); + + Ok(()) } -fn remove_from_graph(blockref: &BlockRef) { - let block = blockref.borrow(); +/// Remove a block from the live control flow graph. +/// Block must be initialized and incoming/outgoing edges +/// must also point to initialized blocks. 
+unsafe fn remove_from_graph(blockref: BlockRef) { + let block = unsafe { blockref.as_ref() }; // Remove this block from the predecessor's targets - for pred_branchref in &block.incoming { + for pred_branchref in block.incoming.0.take().iter() { // Branch from the predecessor to us - let mut pred_branch = pred_branchref.borrow_mut(); + let pred_branch = unsafe { pred_branchref.as_ref() }; // If this is us, nullify the target block - for target_idx in 0..=1 { - if let Some(target) = pred_branch.targets[target_idx].as_ref() { - if target.get_block().as_ref() == Some(blockref) { - pred_branch.targets[target_idx] = None; - } + for target_idx in 0..pred_branch.targets.len() { + // SAFETY: no mutation inside unsafe + let target_is_us = unsafe { + pred_branch.targets[target_idx] + .ref_unchecked() + .as_ref() + .and_then(|target| target.get_block()) + .and_then(|target_block| (target_block == blockref).then(|| ())) + .is_some() + }; + + if target_is_us { + pred_branch.targets[target_idx].set(None); } } } // For each outgoing branch - for out_branchref in &block.outgoing { - let out_branch = out_branchref.borrow(); - + block.outgoing.for_each(|out_branchref| { + let out_branch = unsafe { out_branchref.as_ref() }; // For each successor block - for out_target in out_branch.targets.iter().flatten() { - if let Some(succ_blockref) = &out_target.get_block() { + for out_target in out_branch.targets.iter() { + // SAFETY: copying out an Option<BlockRef>. No mutation. + let succ_block: Option<BlockRef> = unsafe { + out_target.ref_unchecked().as_ref().and_then(|target| target.get_block()) + }; + + if let Some(succ_block) = succ_block { // Remove outgoing branch from the successor's incoming list - let mut succ_block = succ_blockref.borrow_mut(); - succ_block - .incoming - .retain(|succ_incoming| !Rc::ptr_eq(succ_incoming, out_branchref)); + // SAFETY: caller promises the block has valid outgoing edges. + let succ_block = unsafe { succ_block.as_ref() }; + // Temporarily move out of succ_block.incoming. + let succ_incoming = succ_block.incoming.0.take(); + let mut succ_incoming = succ_incoming.into_vec(); + succ_incoming.retain(|branch| *branch != out_branchref); + succ_block.incoming.0.set(succ_incoming.into_boxed_slice()); // allocs. Rely on oom=abort } } - } + }); } -/// Remove most references to a block to deallocate it. -/// Does not touch references from iseq payloads. -pub fn free_block(blockref: &BlockRef) { - block_assumptions_free(blockref); +/// Tear down a block and deallocate it. +/// Caller has to ensure that the code tracked by the block is not +/// running, as running code may hit [branch_stub_hit] who expects +/// [Branch] to be live. +/// +/// We currently ensure this through the `jit_cont` system in cont.c +/// and sometimes through the GC calling [rb_yjit_iseq_free]. The GC +/// has proven that an ISeq is not running if it calls us to free it. +/// +/// For delayed deallocation, since dead blocks don't keep +/// blocks they refer alive, by the time we get here their outgoing +/// edges may be dangling. Pass `graph_intact=false` such these cases. +pub unsafe fn free_block(blockref: BlockRef, graph_intact: bool) { + // Careful with order here. + // First, remove all pointers to the referent block + unsafe { + block_assumptions_free(blockref); + + if graph_intact { + remove_from_graph(blockref); + } + } - remove_from_graph(blockref); + // SAFETY: we should now have a unique pointer to the block + unsafe { dealloc_block(blockref) } +} - // Branches have a Rc pointing at the block housing them. 
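free_block() above and dealloc_block() below lean on one ownership convention. A sketch with an illustrative type, not YJIT's Block/Branch: metadata is leaked with Box::into_raw once generated code starts referring to it, and reclaimed with a matching Box::from_raw only after nothing can reach it anymore.

    use std::ptr::NonNull;

    struct Metadata {
        #[allow(dead_code)]
        payload: u64,
    }

    fn publish(meta: Box<Metadata>) -> NonNull<Metadata> {
        // Leak: machine code and other structs may hold this raw pointer.
        NonNull::new(Box::into_raw(meta)).expect("Box pointers are never null")
    }

    unsafe fn reclaim(meta: NonNull<Metadata>) {
        // SAFETY: the caller guarantees nothing still points at `meta`;
        // this Box::from_raw matches the Box::into_raw in `publish`.
        unsafe { drop(Box::from_raw(meta.as_ptr())) };
    }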
- // Break the cycle. - blockref.borrow_mut().incoming.clear(); - blockref.borrow_mut().outgoing.clear(); +/// Deallocate a block and its outgoing branches. Blocks own their outgoing branches. +/// Caller must ensure that we have unique ownership for the referent block +unsafe fn dealloc_block(blockref: BlockRef) { + unsafe { + for outgoing in blockref.as_ref().outgoing.0.take().iter() { + // this Box::from_raw matches the Box::into_raw from PendingBranch::into_branch + mem::drop(Box::from_raw(outgoing.as_ptr())); + } + } - // No explicit deallocation here as blocks are ref-counted. + // Deallocate the referent Block + unsafe { + // this Box::from_raw matches the Box::into_raw from JITState::into_block + mem::drop(Box::from_raw(blockref.as_ptr())); + } } // Some runtime checks for integrity of a program location pub fn verify_blockid(blockid: BlockId) { unsafe { assert!(rb_IMEMO_TYPE_P(blockid.iseq.into(), imemo_iseq) != 0); - assert!(blockid.idx < get_iseq_encoded_size(blockid.iseq)); + assert!(u32::from(blockid.idx) < get_iseq_encoded_size(blockid.iseq)); } } @@ -2228,20 +4120,21 @@ pub fn invalidate_block_version(blockref: &BlockRef) { // TODO: want to assert that all other ractors are stopped here. Can't patch // machine code that some other thread is running. - let block = blockref.borrow(); + let block = unsafe { (*blockref).as_ref() }; + let id_being_invalidated = block.get_blockid(); let mut cb = CodegenGlobals::get_inline_cb(); let ocb = CodegenGlobals::get_outlined_cb(); - verify_blockid(block.blockid); + verify_blockid(id_being_invalidated); #[cfg(feature = "disasm")] { // If dump_iseq_disasm is specified, print to console that blocks for matching ISEQ names were invalidated. if let Some(substr) = get_option_ref!(dump_iseq_disasm).as_ref() { - let blockid_idx = block.blockid.idx; - let iseq_location = iseq_get_location(block.blockid.iseq, blockid_idx); + let iseq_range = &block.iseq_range; + let iseq_location = iseq_get_location(block.iseq.get(), iseq_range.start); if iseq_location.contains(substr) { - println!("Invalidating block from {}, ISEQ offsets [{}, {})", iseq_location, blockid_idx, block.end_idx); + println!("Invalidating block from {}, ISEQ offsets [{}, {})", iseq_location, iseq_range.start, iseq_range.end); } } } @@ -2252,7 +4145,7 @@ pub fn invalidate_block_version(blockref: &BlockRef) { // Get a pointer to the generated code for this block let block_start = block.start_addr; - // Make the the start of the block do an exit. This handles OOM situations + // Make the start of the block do an exit. This handles OOM situations // and some cases where we can't efficiently patch incoming branches. // Do this first, since in case there is a fallthrough branch into this // block, the patching loop below can overwrite the start of the block. @@ -2263,36 +4156,29 @@ pub fn invalidate_block_version(blockref: &BlockRef) { .entry_exit .expect("invalidation needs the entry_exit field"); { - let block_start = block - .start_addr - .expect("invalidation needs constructed block"); - let block_end = block - .end_addr - .expect("invalidation needs constructed block"); + let block_end = block.get_end_addr(); if block_start == block_entry_exit { // Some blocks exit on entry. Patching a jump to the entry at the // entry makes an infinite loop. } else { - // TODO(alan) - // if (block.start_addr >= cb_get_ptr(cb, yjit_codepage_frozen_bytes)) // Don't patch frozen code region - // Patch in a jump to block.entry_exit. 
let cur_pos = cb.get_write_ptr(); let cur_dropped_bytes = cb.has_dropped_bytes(); cb.set_write_ptr(block_start); - let mut asm = Assembler::new(); + let mut asm = Assembler::new_without_iseq(); asm.jmp(block_entry_exit.as_side_exit()); cb.set_dropped_bytes(false); - asm.compile(&mut cb); + asm.compile(&mut cb, Some(ocb)).expect("can rewrite existing code"); assert!( cb.get_write_ptr() <= block_end, - "invalidation wrote past end of block (code_size: {:?}, new_size: {})", + "invalidation wrote past end of block (code_size: {:?}, new_size: {}, start_addr: {:?})", block.code_size(), - cb.get_write_ptr().into_i64() - block_start.into_i64(), + cb.get_write_ptr().as_offset() - block_start.as_offset(), + block.start_addr.raw_ptr(cb), ); cb.set_write_ptr(cur_pos); cb.set_dropped_bytes(cur_dropped_bytes); @@ -2300,64 +4186,92 @@ pub fn invalidate_block_version(blockref: &BlockRef) { } // For each incoming branch - for branchref in &block.incoming { - let mut branch = branchref.borrow_mut(); - let target_idx = if branch.get_target_address(0) == block_start { + let mut incoming_branches = block.incoming.0.take(); + + // An adjacent branch will write into the start of the block being invalidated, possibly + // overwriting the block's exit. If we run out of memory after doing this, any subsequent + // incoming branches we rewrite won't be able use the block's exit as a fallback when they + // are unable to generate a stub. To avoid this, if there's an incoming branch that's + // adjacent to the invalidated block, make sure we process it last. + let adjacent_branch_idx = incoming_branches.iter().position(|branchref| { + let branch = unsafe { branchref.as_ref() }; + let target_next = block.start_addr == branch.end_addr.get(); + target_next + }); + if let Some(adjacent_branch_idx) = adjacent_branch_idx { + incoming_branches.swap(adjacent_branch_idx, incoming_branches.len() - 1) + } + + for (i, branchref) in incoming_branches.iter().enumerate() { + let branch = unsafe { branchref.as_ref() }; + let target_idx = if branch.get_target_address(0) == Some(block_start) { 0 } else { 1 }; // Assert that the incoming branch indeed points to the block being invalidated - let incoming_target = branch.targets[target_idx].as_ref().unwrap(); - assert_eq!(block_start, incoming_target.get_address()); - if let Some(incoming_block) = &incoming_target.get_block() { - assert_eq!(blockref, incoming_block); - } - - // TODO(alan): - // Don't patch frozen code region - // if (branch.start_addr < cb_get_ptr(cb, yjit_codepage_frozen_bytes)) { - // continue; - // } - - // Create a stub for this branch target or rewire it to a valid block - set_branch_target(target_idx as u32, block.blockid, &block.ctx, branchref, &mut branch, ocb); - - if branch.targets[target_idx].is_none() { - // We were unable to generate a stub (e.g. OOM). Use the block's - // exit instead of a stub for the block. It's important that we - // still patch the branch in this situation so stubs are unique - // to branches. Think about what could go wrong if we run out of - // memory in the middle of this loop. - branch.targets[target_idx] = Some(Box::new(BranchTarget::Stub(Box::new(BranchStub { - address: block.entry_exit, - id: block.blockid, - ctx: block.ctx.clone(), - })))); + // SAFETY: no mutation. 
+ unsafe { + let incoming_target = branch.targets[target_idx].ref_unchecked().as_ref().unwrap(); + assert_eq!(Some(block_start), incoming_target.get_address()); + if let Some(incoming_block) = &incoming_target.get_block() { + assert_eq!(blockref, incoming_block); + } } + // Create a stub for this branch target + let stub_addr = gen_branch_stub(block.ctx, block.iseq.get(), ocb, branchref.as_ptr() as usize, target_idx as u32); + + // In case we were unable to generate a stub (e.g. OOM). Use the block's + // exit instead of a stub for the block. It's important that we + // still patch the branch in this situation so stubs are unique + // to branches. Think about what could go wrong if we run out of + // memory in the middle of this loop. + let stub_addr = stub_addr.unwrap_or(block_entry_exit); + + // Fill the branch target with a stub + branch.targets[target_idx].set(Some(Box::new(BranchTarget::Stub(Box::new(BranchStub { + address: Some(stub_addr), + iseq: block.iseq.clone(), + iseq_idx: block.iseq_range.start, + ctx: block.ctx, + }))))); + // Check if the invalidated block immediately follows - let target_next = block.start_addr == branch.end_addr; + let target_next = block.start_addr == branch.end_addr.get(); if target_next { - // The new block will no longer be adjacent. - // Note that we could be enlarging the branch and writing into the - // start of the block being invalidated. - branch.shape = BranchShape::Default; + if stub_addr != block.start_addr { + // The new block will no longer be adjacent. + // Note that we could be enlarging the branch and writing into the + // start of the block being invalidated. + branch.gen_fn.set_shape(BranchShape::Default); + } else { + // The branch target is still adjacent, so the branch must remain + // a fallthrough so we don't overwrite the target with a jump. + // + // This can happen if we're unable to generate a stub and the + // target block also exits on entry (block_start == block_entry_exit). + } } // Rewrite the branch with the new jump target address let old_branch_size = branch.code_size(); - regenerate_branch(cb, &mut branch); + regenerate_branch(cb, branch); if target_next && branch.end_addr > block.end_addr { panic!("yjit invalidate rewrote branch past end of invalidated block: {:?} (code_size: {})", branch, block.code_size()); } + let is_last_incoming_branch = i == incoming_branches.len() - 1; + if target_next && branch.end_addr.get() > block_entry_exit && !is_last_incoming_branch { + // We might still need to jump to this exit if we run out of memory when rewriting another incoming branch. + panic!("yjit invalidate rewrote branch over exit of invalidated block: {:?}", branch); + } if !target_next && branch.code_size() > old_branch_size { panic!( "invalidated branch grew in size (start_addr: {:?}, old_size: {}, new_size: {})", - branch.start_addr.unwrap().raw_ptr(), old_branch_size, branch.code_size() + branch.start_addr.raw_ptr(cb), old_branch_size, branch.code_size() ); } } @@ -2370,17 +4284,21 @@ pub fn invalidate_block_version(blockref: &BlockRef) { // points will always have an instruction index of 0. 
We'll need to // change this in the future when we support optional parameters because // they enter the function with a non-zero PC - if block.blockid.idx == 0 { + if block.iseq_range.start == 0 { // TODO: // We could reset the exec counter to zero in rb_iseq_reset_jit_func() // so that we eventually compile a new entry point when useful - unsafe { rb_iseq_reset_jit_func(block.blockid.iseq) }; + unsafe { rb_iseq_reset_jit_func(block.iseq.get()) }; } // FIXME: // Call continuation addresses on the stack can also be atomically replaced by jumps going to the stub. - delayed_deallocation(blockref); + // SAFETY: This block was in a version_map earlier + // in this function before we removed it, so it's well connected. + unsafe { remove_from_graph(*blockref) }; + + delayed_deallocation(*blockref); ocb.unwrap().mark_all_executable(); cb.mark_all_executable(); @@ -2388,29 +4306,57 @@ pub fn invalidate_block_version(blockref: &BlockRef) { incr_counter!(invalidation_count); } -// We cannot deallocate blocks immediately after invalidation since there -// could be stubs waiting to access branch pointers. Return stubs can do -// this since patching the code for setting up return addresses does not -// affect old return addresses that are already set up to use potentially -// invalidated branch pointers. Example: +// We cannot deallocate blocks immediately after invalidation since patching the code for setting +// up return addresses does not affect outstanding return addresses that are on stack and will use +// invalidated branch pointers when hit. Example: // def foo(n) // if n == 2 -// return 1.times { Object.define_method(:foo) {} } +// # 1.times.each to create a cfunc frame to preserve the JIT frame +// # which will return to a stub housed in an invalidated block +// return 1.times.each { Object.define_method(:foo) {} } // end // -// foo(n + 1) +// foo(n + 1) # The block for this call houses the return branch stub // end // p foo(1) -pub fn delayed_deallocation(blockref: &BlockRef) { +pub fn delayed_deallocation(blockref: BlockRef) { block_assumptions_free(blockref); - // We do this another time when we deem that it's safe - // to deallocate in case there is another Ractor waiting to acquire the - // VM lock inside branch_stub_hit(). - remove_from_graph(blockref); + let block = unsafe { blockref.as_ref() }; + // Set null ISEQ on the block to signal that it's dead. + let iseq = block.iseq.replace(ptr::null()); + let payload = get_iseq_payload(iseq).unwrap(); + payload.dead_blocks.push(blockref); +} - let payload = get_iseq_payload(blockref.borrow().blockid.iseq).unwrap(); - payload.dead_blocks.push(blockref.clone()); +trait RefUnchecked { + type Contained; + unsafe fn ref_unchecked(&self) -> &Self::Contained; +} + +impl<T> RefUnchecked for Cell<T> { + type Contained = T; + + /// Gives a reference to the contents of a [Cell]. + /// Dangerous; please include a SAFETY note. + /// + /// An easy way to use this without triggering Undefined Behavior is to + /// 1. ensure there is transitively no Cell/UnsafeCell mutation in the `unsafe` block + /// 2. ensure the `unsafe` block does not return any references, so our + /// analysis is lexically confined. This is trivially true if the block + /// returns a `bool`, for example. Aggregates that store references have + /// explicit lifetime parameters that look like `<'a>`. + /// + /// There are other subtler situations that don't follow these rules yet + /// are still sound. + /// See `test_miri_ref_unchecked()` for examples. 
You can play with it + /// with `cargo +nightly miri test miri`. + unsafe fn ref_unchecked(&self) -> &Self::Contained { + // SAFETY: pointer is dereferenceable because it's from a &Cell. + // It's up to the caller to follow aliasing rules with the output + // reference. + unsafe { self.as_ptr().as_ref().unwrap() } + } } #[cfg(test)] @@ -2418,31 +4364,240 @@ mod tests { use crate::core::*; #[test] + fn type_size() { + // Check that we can store types in 4 bits, + // and all local types in 32 bits + assert_eq!(mem::size_of::<Type>(), 1); + assert!(Type::BlockParamProxy as usize <= 0b1111); + assert!(MAX_CTX_LOCALS * 4 <= 32); + } + + #[test] + fn local_types() { + let mut ctx = Context::default(); + + for i in 0..MAX_CTX_LOCALS { + ctx.set_local_type(i, Type::Fixnum); + assert_eq!(ctx.get_local_type(i), Type::Fixnum); + ctx.set_local_type(i, Type::BlockParamProxy); + assert_eq!(ctx.get_local_type(i), Type::BlockParamProxy); + } + + ctx.set_local_type(0, Type::Fixnum); + ctx.clear_local_types(); + assert!(ctx.get_local_type(0) == Type::Unknown); + + // Make sure we don't accidentally set bits incorrectly + let mut ctx = Context::default(); + ctx.set_local_type(0, Type::Fixnum); + assert_eq!(ctx.get_local_type(0), Type::Fixnum); + ctx.set_local_type(2, Type::Fixnum); + ctx.set_local_type(1, Type::BlockParamProxy); + assert_eq!(ctx.get_local_type(0), Type::Fixnum); + assert_eq!(ctx.get_local_type(2), Type::Fixnum); + } + + #[test] fn types() { // Valid src => dst - assert_eq!(Type::Unknown.diff(Type::Unknown), 0); - assert_eq!(Type::UnknownImm.diff(Type::UnknownImm), 0); - assert_ne!(Type::UnknownImm.diff(Type::Unknown), usize::MAX); - assert_ne!(Type::Fixnum.diff(Type::Unknown), usize::MAX); - assert_ne!(Type::Fixnum.diff(Type::UnknownImm), usize::MAX); + assert_eq!(Type::Unknown.diff(Type::Unknown), TypeDiff::Compatible(0)); + assert_eq!(Type::UnknownImm.diff(Type::UnknownImm), TypeDiff::Compatible(0)); + assert_ne!(Type::UnknownImm.diff(Type::Unknown), TypeDiff::Incompatible); + assert_ne!(Type::Fixnum.diff(Type::Unknown), TypeDiff::Incompatible); + assert_ne!(Type::Fixnum.diff(Type::UnknownImm), TypeDiff::Incompatible); // Invalid src => dst - assert_eq!(Type::Unknown.diff(Type::UnknownImm), usize::MAX); - assert_eq!(Type::Unknown.diff(Type::Fixnum), usize::MAX); - assert_eq!(Type::Fixnum.diff(Type::UnknownHeap), usize::MAX); + assert_eq!(Type::Unknown.diff(Type::UnknownImm), TypeDiff::Incompatible); + assert_eq!(Type::Unknown.diff(Type::Fixnum), TypeDiff::Incompatible); + assert_eq!(Type::Fixnum.diff(Type::UnknownHeap), TypeDiff::Incompatible); + } + + #[test] + fn reg_mapping() { + let mut reg_mapping = RegMapping([None, None, None, None, None]); + + // 0 means every slot is not spilled + for stack_idx in 0..MAX_CTX_TEMPS as u8 { + assert_eq!(reg_mapping.get_reg(RegOpnd::Stack(stack_idx)), None); + } + + // Set 0, 2, 6 (RegMapping: [Some(0), Some(6), Some(2), None, None]) + reg_mapping.alloc_reg(RegOpnd::Stack(0)); + reg_mapping.alloc_reg(RegOpnd::Stack(2)); + reg_mapping.alloc_reg(RegOpnd::Stack(3)); + reg_mapping.dealloc_reg(RegOpnd::Stack(3)); + reg_mapping.alloc_reg(RegOpnd::Stack(6)); + + // Get 0..8 + assert_eq!(reg_mapping.get_reg(RegOpnd::Stack(0)), Some(0)); + assert_eq!(reg_mapping.get_reg(RegOpnd::Stack(1)), None); + assert_eq!(reg_mapping.get_reg(RegOpnd::Stack(2)), Some(2)); + assert_eq!(reg_mapping.get_reg(RegOpnd::Stack(3)), None); + assert_eq!(reg_mapping.get_reg(RegOpnd::Stack(4)), None); + assert_eq!(reg_mapping.get_reg(RegOpnd::Stack(5)), None); + 
assert_eq!(reg_mapping.get_reg(RegOpnd::Stack(6)), Some(1));
+        assert_eq!(reg_mapping.get_reg(RegOpnd::Stack(7)), None);
     }
 
     #[test]
     fn context() {
         // Valid src => dst
-        assert_eq!(Context::default().diff(&Context::default()), 0);
+        assert_eq!(Context::default().diff(&Context::default()), TypeDiff::Compatible(0));
 
         // Try pushing an operand and getting its type
-        let mut ctx = Context::default();
-        ctx.stack_push(Type::Fixnum);
-        let top_type = ctx.get_opnd_type(StackOpnd(0));
+        let mut asm = Assembler::new(0);
+        asm.stack_push(Type::Fixnum);
+        let top_type = asm.ctx.get_opnd_type(StackOpnd(0));
         assert!(top_type == Type::Fixnum);
 
         // TODO: write more tests for Context type diff
     }
+
+    #[test]
+    fn context_upgrade_local() {
+        let mut asm = Assembler::new(0);
+        asm.stack_push_local(0);
+        asm.ctx.upgrade_opnd_type(StackOpnd(0), Type::Nil);
+        assert_eq!(Type::Nil, asm.ctx.get_opnd_type(StackOpnd(0)));
+    }
+
+    #[test]
+    fn context_chain_depth() {
+        let mut ctx = Context::default();
+        assert_eq!(ctx.get_chain_depth(), 0);
+        assert_eq!(ctx.is_return_landing(), false);
+        assert_eq!(ctx.is_deferred(), false);
+
+        for _ in 0..5 {
+            ctx.increment_chain_depth();
+        }
+        assert_eq!(ctx.get_chain_depth(), 5);
+
+        ctx.set_as_return_landing();
+        assert_eq!(ctx.is_return_landing(), true);
+
+        ctx.clear_return_landing();
+        assert_eq!(ctx.is_return_landing(), false);
+
+        ctx.mark_as_deferred();
+        assert_eq!(ctx.is_deferred(), true);
+
+        ctx.reset_chain_depth_and_defer();
+        assert_eq!(ctx.get_chain_depth(), 0);
+        assert_eq!(ctx.is_deferred(), false);
+    }
+
+    #[test]
+    fn shift_stack_for_send() {
+        let mut asm = Assembler::new(0);
+
+        // Push values to simulate send(:name, arg) with 6 items already on-stack
+        for _ in 0..6 {
+            asm.stack_push(Type::Fixnum);
+        }
+        asm.stack_push(Type::Unknown);
+        asm.stack_push(Type::ImmSymbol);
+        asm.stack_push(Type::Unknown);
+
+        // This method takes argc of the sendee, not argc of send
+        asm.shift_stack(1);
+
+        // The symbol should be gone
+        assert_eq!(Type::Unknown, asm.ctx.get_opnd_type(StackOpnd(0)));
+        assert_eq!(Type::Unknown, asm.ctx.get_opnd_type(StackOpnd(1)));
+    }
+
+    #[test]
+    fn test_miri_ref_unchecked() {
+        let blockid = BlockId {
+            iseq: ptr::null(),
+            idx: 0,
+        };
+        let cb = CodeBlock::new_dummy(1024);
+        let mut ocb = OutlinedCb::wrap(CodeBlock::new_dummy(1024));
+        let dumm_addr = cb.get_write_ptr();
+        let block = JITState::new(blockid, Context::default(), dumm_addr, ptr::null(), &mut ocb, true)
+            .into_block(0, dumm_addr, dumm_addr, vec![]);
+        let _dropper = BlockDropper(block);
+
+        // Outside of brief moments during construction,
+        // we're always working with &Branch (a shared reference to a Branch).
+        let branch: &Branch = &Branch {
+            gen_fn: BranchGenFn::JZToTarget0,
+            block: Cell::new(block),
+            start_addr: dumm_addr,
+            end_addr: Cell::new(dumm_addr),
+            targets: [Cell::new(None), Cell::new(Some(Box::new(BranchTarget::Stub(Box::new(BranchStub {
+                iseq: Cell::new(ptr::null()),
+                iseq_idx: 0,
+                address: None,
+                ctx: 0,
+            })))))]
+        };
+        // For easier soundness reasoning, make sure the reference returned does not outlive the
+        // `unsafe` block! It's tempting to do, but it leads to non-local issues.
+        // Here is an example where it goes wrong:
+        if false {
+            for target in branch.targets.iter().as_ref() {
+                if let Some(btarget) = unsafe { target.ref_unchecked() } {
+                    // btarget is derived from the unsafe block!
+                    target.set(None); // This drops the contents of the cell...
+                    assert!(btarget.get_address().is_none()); // but `btarget` is still live! UB.
+                }
+            }
+        }
+
+        // Do something like this instead. It's not pretty, but it's easier to vet for UB this way.
+        for target in branch.targets.iter().as_ref() {
+            // SAFETY: no mutation within unsafe
+            if unsafe { target.ref_unchecked().is_none() } {
+                continue;
+            }
+            // SAFETY: no mutation within unsafe
+            assert!(unsafe { target.ref_unchecked().as_ref().unwrap().get_address().is_none() });
+            target.set(None);
+        }
+
+        // A more subtle situation where we do Cell/UnsafeCell mutation over the
+        // lifetime of the reference released by ref_unchecked().
+        branch.targets[0].set(Some(Box::new(BranchTarget::Stub(Box::new(BranchStub {
+            iseq: Cell::new(ptr::null()),
+            iseq_idx: 0,
+            address: None,
+            ctx: 0,
+        })))));
+        // Invalid ISeq; we never dereference it.
+        let secret_iseq = NonNull::<rb_iseq_t>::dangling().as_ptr();
+        unsafe {
+            if let Some(branch_target) = branch.targets[0].ref_unchecked().as_ref() {
+                if let BranchTarget::Stub(stub) = branch_target.as_ref() {
+                    // SAFETY:
+                    // This is a Cell mutation, but it mutates the contents
+                    // of a Cell<IseqPtr>, which is a different type
+                    // from the type of Cell found in `Branch::targets`, so
+                    // there is no chance of mutating the Cell that we called
+                    // ref_unchecked() on above.
+                    Cell::set(&stub.iseq, secret_iseq);
+                }
+            }
+        };
+        // Check that we indeed changed the iseq of the stub
+        // Cell::take moves out of the cell.
+        assert_eq!(
+            secret_iseq as usize,
+            branch.targets[0].take().unwrap().get_blockid().iseq as usize
+        );
+
+        struct BlockDropper(BlockRef);
+        impl Drop for BlockDropper {
+            fn drop(&mut self) {
+                // SAFETY: we have ownership because the test doesn't stash
+                // the block away in any global structure.
+                // Note that the test being self-contained is also why we
+                // use dealloc_block() over free_block(), as free_block() touches
+                // the global invariants tables unavailable in tests.
+                unsafe { dealloc_block(self.0) };
+            }
+        }
+    }
 }
diff --git a/yjit/src/cruby.rs b/yjit/src/cruby.rs
index f4a6956926..d34b049a45 100644
--- a/yjit/src/cruby.rs
+++ b/yjit/src/cruby.rs
@@ -83,7 +83,8 @@ #![allow(non_upper_case_globals)]
 
 use std::convert::From;
-use std::ffi::CString;
+use std::ffi::{CString, CStr};
+use std::fmt::{Debug, Formatter};
 use std::os::raw::{c_char, c_int, c_uint};
 use std::panic::{catch_unwind, UnwindSafe};
 
@@ -96,7 +97,7 @@ pub type size_t = u64;
 pub type RedefinitionFlag = u32;
 
 #[allow(dead_code)]
-#[allow(clippy::useless_transmute)]
+#[allow(clippy::all)]
 mod autogened {
     use super::*;
     // Textually include output from rust-bindgen as suggested by its user guide.
@@ -107,13 +108,25 @@ pub use autogened::*;
 
 // TODO: For #defines that affect memory layout, we need to check for them
 // on build and fail if they're wrong. e.g. USE_FLONUM *must* be true.
 
-// These are functions we expose from vm_insnhelper.c, not in any header.
+// These are functions we expose from C files, not in any header.
 // Parsing it would result in a lot of duplicate definitions.
 // Use bindgen for functions that are defined in headers or in yjit.c.
 #[cfg_attr(test, allow(unused))] // We don't link against C code when testing
 extern "C" {
+    pub fn rb_check_overloaded_cme(
+        me: *const rb_callable_method_entry_t,
+        ci: *const rb_callinfo,
+    ) -> *const rb_callable_method_entry_t;
+
+    // Floats within range will be encoded without creating objects in the heap.
+    // (Range is 0x3000000000000001 to 0x4fffffffffffffff (1.7272337110188893E-77 to 2.3158417847463237E+77)).
+ pub fn rb_float_new(d: f64) -> VALUE; + + pub fn rb_hash_empty_p(hash: VALUE) -> VALUE; + pub fn rb_str_setbyte(str: VALUE, index: VALUE, value: VALUE) -> VALUE; pub fn rb_vm_splat_array(flag: VALUE, ary: VALUE) -> VALUE; pub fn rb_vm_concat_array(ary1: VALUE, ary2st: VALUE) -> VALUE; + pub fn rb_vm_concat_to_array(ary1: VALUE, ary2st: VALUE) -> VALUE; pub fn rb_vm_defined( ec: EcPtr, reg_cfp: CfpPtr, @@ -135,19 +148,20 @@ extern "C" { ic: ICVARC, ) -> VALUE; pub fn rb_vm_ic_hit_p(ic: IC, reg_ep: *const VALUE) -> bool; - pub fn rb_str_bytesize(str: VALUE) -> VALUE; + pub fn rb_vm_stack_canary() -> VALUE; + pub fn rb_vm_push_cfunc_frame(cme: *const rb_callable_method_entry_t, recv_idx: c_int); } // Renames pub use rb_insn_name as raw_insn_name; -pub use rb_insn_len as raw_insn_len; -pub use rb_yarv_class_of as CLASS_OF; pub use rb_get_ec_cfp as get_ec_cfp; +pub use rb_get_cfp_iseq as get_cfp_iseq; pub use rb_get_cfp_pc as get_cfp_pc; pub use rb_get_cfp_sp as get_cfp_sp; pub use rb_get_cfp_self as get_cfp_self; pub use rb_get_cfp_ep as get_cfp_ep; pub use rb_get_cfp_ep_level as get_cfp_ep_level; +pub use rb_vm_base_ptr as get_cfp_bp; pub use rb_get_cme_def_type as get_cme_def_type; pub use rb_get_cme_def_body_attr_id as get_cme_def_body_attr_id; pub use rb_get_cme_def_body_optimized_type as get_cme_def_body_optimized_type; @@ -162,11 +176,11 @@ pub use rb_iseq_encoded_size as get_iseq_encoded_size; pub use rb_get_iseq_body_local_iseq as get_iseq_body_local_iseq; pub use rb_get_iseq_body_iseq_encoded as get_iseq_body_iseq_encoded; pub use rb_get_iseq_body_stack_max as get_iseq_body_stack_max; +pub use rb_get_iseq_body_type as get_iseq_body_type; pub use rb_get_iseq_flags_has_lead as get_iseq_flags_has_lead; pub use rb_get_iseq_flags_has_opt as get_iseq_flags_has_opt; pub use rb_get_iseq_flags_has_kw as get_iseq_flags_has_kw; pub use rb_get_iseq_flags_has_rest as get_iseq_flags_has_rest; -pub use rb_get_iseq_flags_ruby2_keywords as get_iseq_flags_ruby2_keywords; pub use rb_get_iseq_flags_has_post as get_iseq_flags_has_post; pub use rb_get_iseq_flags_has_kwrest as get_iseq_flags_has_kwrest; pub use rb_get_iseq_flags_has_block as get_iseq_flags_has_block; @@ -183,7 +197,8 @@ pub use rb_get_cikw_keywords_idx as get_cikw_keywords_idx; pub use rb_get_call_data_ci as get_call_data_ci; pub use rb_yarv_str_eql_internal as rb_str_eql_internal; pub use rb_yarv_ary_entry_internal as rb_ary_entry_internal; -pub use rb_yarv_fix_mod_fix as rb_fix_mod_fix; +pub use rb_jit_fix_div_fix as rb_fix_div_fix; +pub use rb_jit_fix_mod_fix as rb_fix_mod_fix; pub use rb_FL_TEST as FL_TEST; pub use rb_FL_TEST_RAW as FL_TEST_RAW; pub use rb_RB_TYPE_P as RB_TYPE_P; @@ -199,8 +214,6 @@ pub use rb_RCLASS_ORIGIN as RCLASS_ORIGIN; /// Helper so we can get a Rust string for insn_name() pub fn insn_name(opcode: usize) -> String { - use std::ffi::CStr; - unsafe { // Look up Ruby's NULL-terminated insn name string let op_name = raw_insn_name(VALUE(opcode)); @@ -220,7 +233,7 @@ pub fn insn_len(opcode: usize) -> u32 { #[cfg(not(test))] unsafe { - raw_insn_len(VALUE(opcode)).try_into().unwrap() + rb_insn_len(VALUE(opcode)).try_into().unwrap() } } @@ -243,6 +256,30 @@ pub struct VALUE(pub usize); /// Pointer to an ISEQ pub type IseqPtr = *const rb_iseq_t; +// Given an ISEQ pointer, convert PC to insn_idx +pub fn iseq_pc_to_insn_idx(iseq: IseqPtr, pc: *mut VALUE) -> Option<u16> { + let pc_zero = unsafe { rb_iseq_pc_at_idx(iseq, 0) }; + unsafe { pc.offset_from(pc_zero) }.try_into().ok() +} + +/// Given an ISEQ pointer and an 
instruction index, return an opcode. +pub fn iseq_opcode_at_idx(iseq: IseqPtr, insn_idx: u32) -> u32 { + let pc = unsafe { rb_iseq_pc_at_idx(iseq, insn_idx) }; + unsafe { rb_iseq_opcode_at_pc(iseq, pc) as u32 } +} + +/// Return a poison value to be set above the stack top to verify leafness. +#[cfg(not(test))] +pub fn vm_stack_canary() -> u64 { + unsafe { rb_vm_stack_canary() }.as_u64() +} + +/// Avoid linking the C function in `cargo test` +#[cfg(test)] +pub fn vm_stack_canary() -> u64 { + 0 +} + /// Opaque execution-context type from vm_core.h #[repr(C)] pub struct rb_execution_context_struct { @@ -277,13 +314,6 @@ pub struct rb_callcache { _marker: core::marker::PhantomData<(*mut u8, core::marker::PhantomPinned)>, } -/// Opaque call-info type from vm_callinfo.h -#[repr(C)] -pub struct rb_callinfo_kwarg { - _data: [u8; 0], - _marker: core::marker::PhantomData<(*mut u8, core::marker::PhantomPinned)>, -} - /// Opaque control_frame (CFP) struct from vm_core.h #[repr(C)] pub struct rb_control_frame_struct { @@ -331,6 +361,11 @@ impl VALUE { !self.special_const_p() } + /// Shareability between ractors. `RB_OBJ_SHAREABLE_P()`. + pub fn shareable_p(self) -> bool { + (self.builtin_flags() & RUBY_FL_SHAREABLE as usize) != 0 + } + /// Return true if the value is a Ruby Fixnum (immediate-size integer) pub fn fixnum_p(self) -> bool { let VALUE(cval) = self; @@ -367,6 +402,11 @@ impl VALUE { } } + /// Returns true if the value is T_HASH + pub fn hash_p(self) -> bool { + !self.special_const_p() && self.builtin_type() == RUBY_T_HASH + } + /// Returns true or false depending on whether the value is nil pub fn nil_p(self) -> bool { self == Qnil @@ -391,7 +431,13 @@ impl VALUE { } pub fn class_of(self) -> VALUE { - unsafe { CLASS_OF(self) } + if !self.special_const_p() { + let builtin_type = self.builtin_type(); + assert_ne!(builtin_type, RUBY_T_NONE, "YJIT should only see live objects"); + assert_ne!(builtin_type, RUBY_T_MOVED, "YJIT should only see live objects"); + } + + unsafe { rb_yarv_class_of(self) } } pub fn is_frozen(self) -> bool { @@ -399,28 +445,16 @@ impl VALUE { } pub fn shape_too_complex(self) -> bool { - unsafe { rb_shape_obj_too_complex(self) } + unsafe { rb_yjit_shape_obj_too_complex_p(self) } } pub fn shape_id_of(self) -> u32 { - unsafe { rb_shape_get_shape_id(self) } - } - - pub fn shape_of(self) -> *mut rb_shape { - unsafe { - let shape = rb_shape_get_shape_by_id(self.shape_id_of()); - - if shape.is_null() { - panic!("Shape should not be null"); - } else { - shape - } - } + unsafe { rb_obj_shape_id(self) } } pub fn embedded_p(self) -> bool { unsafe { - FL_TEST_RAW(self, VALUE(ROBJECT_EMBED as usize)) != VALUE(0) + FL_TEST_RAW(self, VALUE(ROBJECT_HEAP as usize)) == VALUE(0) } } @@ -463,7 +497,7 @@ impl VALUE { us as *mut T } - /// For working with opague pointers and encoding null check. + /// For working with opaque pointers and encoding null check. /// Similar to [std::ptr::NonNull], but for `*const T`. `NonNull<T>` /// is for `*mut T` while our C functions are setup to use `*const T`. /// Casting from `NonNull<T>` to `*const T` is too noisy. 
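// A minimal sketch of the "encode the NULL check" idea described in the doc
// comment above: wrap a raw `*const T` so callers get an Option instead of a
// possibly-NULL pointer. The `OpaquePtr` type and `as_optional` method here are
// hypothetical stand-ins for illustration; the actual helper on VALUE in
// cruby.rs may be named and shaped differently.
struct OpaquePtr<T>(*const T);

impl<T> OpaquePtr<T> {
    // Do the NULL comparison once, here, so call sites can pattern match on
    // Option instead of remembering to compare against NULL themselves.
    fn as_optional(&self) -> Option<*const T> {
        if self.0.is_null() { None } else { Some(self.0) }
    }
}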
@@ -500,9 +534,7 @@ impl VALUE { ptr } -} -impl VALUE { pub fn fixnum_from_usize(item: usize) -> Self { assert!(item <= (RUBY_FIXNUM_MAX as usize)); // An unsigned will always be greater than RUBY_FIXNUM_MIN let k: usize = item.wrapping_add(item.wrapping_add(1)); @@ -524,6 +556,18 @@ impl From<*const rb_callable_method_entry_t> for VALUE { } } +impl From<&str> for VALUE { + fn from(value: &str) -> Self { + rust_str_to_ruby(value) + } +} + +impl From<String> for VALUE { + fn from(value: String) -> Self { + rust_str_to_ruby(&value) + } +} + impl From<VALUE> for u64 { fn from(value: VALUE) -> Self { let VALUE(uimm) = value; @@ -555,33 +599,58 @@ impl From<VALUE> for u16 { } /// Produce a Ruby string from a Rust string slice -#[cfg(feature = "disasm")] pub fn rust_str_to_ruby(str: &str) -> VALUE { unsafe { rb_utf8_str_new(str.as_ptr() as *const _, str.len() as i64) } } /// Produce a Ruby symbol from a Rust string slice pub fn rust_str_to_sym(str: &str) -> VALUE { + let id = rust_str_to_id(str); + unsafe { rb_id2sym(id) } +} + +/// Produce an ID from a Rust string slice +pub fn rust_str_to_id(str: &str) -> ID { let c_str = CString::new(str).unwrap(); let c_ptr: *const c_char = c_str.as_ptr(); + unsafe { rb_intern(c_ptr) } +} + +/// Produce an owned Rust String from a C char pointer +pub fn cstr_to_rust_string(c_char_ptr: *const c_char) -> Option<String> { + assert!(c_char_ptr != std::ptr::null()); + + let c_str: &CStr = unsafe { CStr::from_ptr(c_char_ptr) }; - unsafe { rb_id2sym(rb_intern(c_ptr)) } + match c_str.to_str() { + Ok(rust_str) => Some(rust_str.to_string()), + Err(_) => None + } } /// A location in Rust code for integrating with debugging facilities defined in C. /// Use the [src_loc!] macro to crate an instance. pub struct SourceLocation { - pub file: CString, + pub file: &'static CStr, pub line: c_int, } +impl Debug for SourceLocation { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.write_fmt(format_args!("{}:{}", self.file.to_string_lossy(), self.line)) + } +} + /// Make a [SourceLocation] at the current spot. macro_rules! src_loc { () => { - // NOTE(alan): `CString::new` allocates so we might want to limit this to debug builds. - $crate::cruby::SourceLocation { - file: std::ffi::CString::new(file!()).unwrap(), // ASCII source file paths - line: line!().try_into().unwrap(), // not that many lines + { + // Nul-terminated string with static lifetime, make a CStr out of it safely. + let file: &'static str = concat!(file!(), '\0'); + $crate::cruby::SourceLocation { + file: unsafe { std::ffi::CStr::from_ptr(file.as_ptr().cast()) }, + line: line!().try_into().unwrap(), + } } }; } @@ -612,28 +681,27 @@ where let line = loc.line; let mut recursive_lock_level: c_uint = 0; - unsafe { rb_yjit_vm_lock_then_barrier(&mut recursive_lock_level, file, line) }; + unsafe { rb_jit_vm_lock_then_barrier(&mut recursive_lock_level, file, line) }; let ret = match catch_unwind(func) { Ok(result) => result, Err(_) => { // Theoretically we can recover from some of these panics, // but it's too late if the unwind reaches here. - use std::{process, str}; let _ = catch_unwind(|| { // IO functions can panic too. eprintln!( "YJIT panicked while holding VM lock acquired at {}:{}. 
Aborting...", - str::from_utf8(loc.file.as_bytes()).unwrap_or("<not utf8>"), + loc.file.to_string_lossy(), line, ); }); - process::abort(); + std::process::abort(); } }; - unsafe { rb_yjit_vm_unlock(&mut recursive_lock_level, file, line) }; + unsafe { rb_jit_vm_unlock(&mut recursive_lock_level, file, line) }; ret } @@ -663,8 +731,10 @@ mod manual_defs { pub const RUBY_FIXNUM_MAX: isize = RUBY_LONG_MAX / 2; // From vm_callinfo.h - uses calculation that seems to confuse bindgen + pub const VM_CALL_ARGS_SIMPLE: u32 = 1 << VM_CALL_ARGS_SIMPLE_bit; pub const VM_CALL_ARGS_SPLAT: u32 = 1 << VM_CALL_ARGS_SPLAT_bit; pub const VM_CALL_ARGS_BLOCKARG: u32 = 1 << VM_CALL_ARGS_BLOCKARG_bit; + pub const VM_CALL_FORWARDING: u32 = 1 << VM_CALL_FORWARDING_bit; pub const VM_CALL_FCALL: u32 = 1 << VM_CALL_FCALL_bit; pub const VM_CALL_KWARG: u32 = 1 << VM_CALL_KWARG_bit; pub const VM_CALL_KW_SPLAT: u32 = 1 << VM_CALL_KW_SPLAT_bit; @@ -673,7 +743,7 @@ mod manual_defs { pub const VM_CALL_OPT_SEND : u32 = 1 << VM_CALL_OPT_SEND_bit; // From internal/struct.h - in anonymous enum, so we can't easily import it - pub const RSTRUCT_EMBED_LEN_MASK: usize = (RUBY_FL_USER2 | RUBY_FL_USER1) as usize; + pub const RSTRUCT_EMBED_LEN_MASK: usize = (RUBY_FL_USER7 | RUBY_FL_USER6 | RUBY_FL_USER5 | RUBY_FL_USER4 | RUBY_FL_USER3 |RUBY_FL_USER2 | RUBY_FL_USER1) as usize; // From iseq.h - via a different constant, which seems to confuse bindgen pub const ISEQ_TRANSLATED: usize = RUBY_FL_USER7 as usize; @@ -689,6 +759,9 @@ mod manual_defs { pub const RUBY_OFFSET_RSTRUCT_AS_HEAP_PTR: i32 = 24; // struct RStruct, subfield "as.heap.ptr" pub const RUBY_OFFSET_RSTRUCT_AS_ARY: i32 = 16; // struct RStruct, subfield "as.ary" + pub const RUBY_OFFSET_RSTRING_AS_HEAP_PTR: i32 = 24; // struct RString, subfield "as.heap.ptr" + pub const RUBY_OFFSET_RSTRING_AS_ARY: i32 = 24; // struct RString, subfield "as.embed.ary" + // Constants from rb_control_frame_t vm_core.h pub const RUBY_OFFSET_CFP_PC: i32 = 0; pub const RUBY_OFFSET_CFP_SP: i32 = 8; @@ -696,15 +769,8 @@ mod manual_defs { pub const RUBY_OFFSET_CFP_SELF: i32 = 24; pub const RUBY_OFFSET_CFP_EP: i32 = 32; pub const RUBY_OFFSET_CFP_BLOCK_CODE: i32 = 40; - pub const RUBY_OFFSET_CFP_BP: i32 = 48; // field __bp__ - pub const RUBY_OFFSET_CFP_JIT_RETURN: i32 = 56; - pub const RUBY_SIZEOF_CONTROL_FRAME: usize = 64; - - // Constants from rb_execution_context_t vm_core.h - pub const RUBY_OFFSET_EC_CFP: i32 = 16; - pub const RUBY_OFFSET_EC_INTERRUPT_FLAG: i32 = 32; // rb_atomic_t (u32) - pub const RUBY_OFFSET_EC_INTERRUPT_MASK: i32 = 36; // rb_atomic_t (u32) - pub const RUBY_OFFSET_EC_THREAD_PTR: i32 = 48; + pub const RUBY_OFFSET_CFP_JIT_RETURN: i32 = 48; + pub const RUBY_SIZEOF_CONTROL_FRAME: usize = 56; // Constants from rb_thread_t in vm_core.h pub const RUBY_OFFSET_THREAD_SELF: i32 = 16; @@ -714,3 +780,52 @@ mod manual_defs { pub const RUBY_OFFSET_ICE_VALUE: i32 = 8; } pub use manual_defs::*; + +/// Interned ID values for Ruby symbols and method names. +/// See [type@crate::cruby::ID] and usages outside of YJIT. +pub(crate) mod ids { + use std::sync::atomic::AtomicU64; + /// Globals to cache IDs on boot. Atomic to use with relaxed ordering + /// so reads can happen without `unsafe`. Synchronization done through + /// the VM lock. + macro_rules! 
def_ids { + ($(name: $ident:ident content: $str:literal)*) => { + $( + #[doc = concat!("[type@crate::cruby::ID] for `", stringify!($str), "`")] + pub static $ident: AtomicU64 = AtomicU64::new(0); + )* + + pub(crate) fn init() { + $( + let content = &$str; + let ptr: *const u8 = content.as_ptr(); + + // Lookup and cache each ID + $ident.store( + unsafe { $crate::cruby::rb_intern2(ptr.cast(), content.len() as _) }, + std::sync::atomic::Ordering::Relaxed + ); + )* + + } + } + } + + def_ids! { + name: NULL content: b"" + name: respond_to_missing content: b"respond_to_missing?" + name: method_missing content: b"method_missing" + name: to_ary content: b"to_ary" + name: to_s content: b"to_s" + name: eq content: b"==" + name: include_p content: b"include?" + } +} + +/// Get an CRuby `ID` to an interned string, e.g. a particular method name. +macro_rules! ID { + ($id_name:ident) => { + $crate::cruby::ids::$id_name.load(std::sync::atomic::Ordering::Relaxed) + } +} +pub(crate) use ID; diff --git a/yjit/src/cruby_bindings.inc.rs b/yjit/src/cruby_bindings.inc.rs index b8a8c91f38..56994388a3 100644 --- a/yjit/src/cruby_bindings.inc.rs +++ b/yjit/src/cruby_bindings.inc.rs @@ -1,4 +1,4 @@ -/* automatically generated by rust-bindgen 0.63.0 */ +/* automatically generated by rust-bindgen 0.70.1 */ #[repr(C)] #[derive(Copy, Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] @@ -81,6 +81,36 @@ where } } #[repr(C)] +#[derive(Default)] +pub struct __IncompleteArrayField<T>(::std::marker::PhantomData<T>, [T; 0]); +impl<T> __IncompleteArrayField<T> { + #[inline] + pub const fn new() -> Self { + __IncompleteArrayField(::std::marker::PhantomData, []) + } + #[inline] + pub fn as_ptr(&self) -> *const T { + self as *const _ as *const T + } + #[inline] + pub fn as_mut_ptr(&mut self) -> *mut T { + self as *mut _ as *mut T + } + #[inline] + pub unsafe fn as_slice(&self, len: usize) -> &[T] { + ::std::slice::from_raw_parts(self.as_ptr(), len) + } + #[inline] + pub unsafe fn as_mut_slice(&mut self, len: usize) -> &mut [T] { + ::std::slice::from_raw_parts_mut(self.as_mut_ptr(), len) + } +} +impl<T> ::std::fmt::Debug for __IncompleteArrayField<T> { + fn fmt(&self, fmt: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result { + fmt.write_str("__IncompleteArrayField") + } +} +#[repr(C)] pub struct __BindgenUnionField<T>(::std::marker::PhantomData<T>); impl<T> __BindgenUnionField<T> { #[inline] @@ -105,7 +135,7 @@ impl<T> ::std::default::Default for __BindgenUnionField<T> { impl<T> ::std::clone::Clone for __BindgenUnionField<T> { #[inline] fn clone(&self) -> Self { - Self::new() + *self } } impl<T> ::std::marker::Copy for __BindgenUnionField<T> {} @@ -123,8 +153,6 @@ impl<T> ::std::cmp::PartialEq for __BindgenUnionField<T> { } } impl<T> ::std::cmp::Eq for __BindgenUnionField<T> {} -pub const SHAPE_ID_NUM_BITS: u32 = 32; -pub const OBJ_TOO_COMPLEX_SHAPE_ID: u32 = 11; pub const INTEGER_REDEFINED_OP_FLAG: u32 = 1; pub const FLOAT_REDEFINED_OP_FLAG: u32 = 2; pub const STRING_REDEFINED_OP_FLAG: u32 = 4; @@ -137,11 +165,13 @@ pub const NIL_REDEFINED_OP_FLAG: u32 = 512; pub const TRUE_REDEFINED_OP_FLAG: u32 = 1024; pub const FALSE_REDEFINED_OP_FLAG: u32 = 2048; pub const PROC_REDEFINED_OP_FLAG: u32 = 4096; +pub const VM_KW_SPECIFIED_BITS_MAX: u32 = 31; pub const VM_ENV_DATA_SIZE: u32 = 3; pub const VM_ENV_DATA_INDEX_ME_CREF: i32 = -2; pub const VM_ENV_DATA_INDEX_SPECVAL: i32 = -1; pub const VM_ENV_DATA_INDEX_FLAGS: u32 = 0; pub const VM_BLOCK_HANDLER_NONE: u32 = 0; +pub const SHAPE_ID_NUM_BITS: u32 = 32; pub type ID = 
::std::os::raw::c_ulong; pub type rb_alloc_func_t = ::std::option::Option<unsafe extern "C" fn(klass: VALUE) -> VALUE>; pub const RUBY_Qfalse: ruby_special_consts = 0; @@ -192,15 +222,13 @@ pub type ruby_value_type = u32; pub const RUBY_FL_USHIFT: ruby_fl_ushift = 12; pub type ruby_fl_ushift = u32; pub const RUBY_FL_WB_PROTECTED: ruby_fl_type = 32; -pub const RUBY_FL_PROMOTED0: ruby_fl_type = 32; -pub const RUBY_FL_PROMOTED1: ruby_fl_type = 64; -pub const RUBY_FL_PROMOTED: ruby_fl_type = 96; +pub const RUBY_FL_PROMOTED: ruby_fl_type = 32; +pub const RUBY_FL_USERPRIV0: ruby_fl_type = 64; pub const RUBY_FL_FINALIZE: ruby_fl_type = 128; -pub const RUBY_FL_TAINT: ruby_fl_type = 256; +pub const RUBY_FL_EXIVAR: ruby_fl_type = 0; pub const RUBY_FL_SHAREABLE: ruby_fl_type = 256; -pub const RUBY_FL_UNTRUSTED: ruby_fl_type = 256; -pub const RUBY_FL_SEEN_OBJ_ID: ruby_fl_type = 512; -pub const RUBY_FL_EXIVAR: ruby_fl_type = 1024; +pub const RUBY_FL_WEAK_REFERENCE: ruby_fl_type = 512; +pub const RUBY_FL_UNUSED10: ruby_fl_type = 1024; pub const RUBY_FL_FREEZE: ruby_fl_type = 2048; pub const RUBY_FL_USER0: ruby_fl_type = 4096; pub const RUBY_FL_USER1: ruby_fl_type = 8192; @@ -222,8 +250,8 @@ pub const RUBY_FL_USER16: ruby_fl_type = 268435456; pub const RUBY_FL_USER17: ruby_fl_type = 536870912; pub const RUBY_FL_USER18: ruby_fl_type = 1073741824; pub const RUBY_FL_USER19: ruby_fl_type = -2147483648; -pub const RUBY_ELTS_SHARED: ruby_fl_type = 16384; -pub const RUBY_FL_SINGLETON: ruby_fl_type = 4096; +pub const RUBY_ELTS_SHARED: ruby_fl_type = 4096; +pub const RUBY_FL_SINGLETON: ruby_fl_type = 8192; pub type ruby_fl_type = i32; pub const RSTRING_NOEMBED: ruby_rstring_flags = 8192; pub const RSTRING_FSTR: ruby_rstring_flags = 536870912; @@ -245,17 +273,13 @@ pub type st_foreach_callback_func = ::std::option::Option< >; pub const RARRAY_EMBED_FLAG: ruby_rarray_flags = 8192; pub const RARRAY_EMBED_LEN_MASK: ruby_rarray_flags = 4161536; -pub const RARRAY_TRANSIENT_FLAG: ruby_rarray_flags = 33554432; pub type ruby_rarray_flags = u32; pub const RARRAY_EMBED_LEN_SHIFT: ruby_rarray_consts = 15; pub type ruby_rarray_consts = u32; -pub const RMODULE_IS_REFINEMENT: ruby_rmodule_flags = 32768; +pub const RMODULE_IS_REFINEMENT: ruby_rmodule_flags = 8192; pub type ruby_rmodule_flags = u32; -pub const ROBJECT_EMBED: ruby_robject_flags = 8192; +pub const ROBJECT_HEAP: ruby_robject_flags = 65536; pub type ruby_robject_flags = u32; -pub const ROBJECT_OFFSET_AS_HEAP_IVPTR: i32 = 16; -pub const ROBJECT_OFFSET_AS_HEAP_IV_INDEX_TBL: i32 = 24; -pub const ROBJECT_OFFSET_AS_ARY: i32 = 16; pub type rb_block_call_func = ::std::option::Option< unsafe extern "C" fn( yielded_arg: VALUE, @@ -285,233 +309,6 @@ pub const RUBY_ENCINDEX_EUC_JP: ruby_preserved_encindex = 10; pub const RUBY_ENCINDEX_Windows_31J: ruby_preserved_encindex = 11; pub const RUBY_ENCINDEX_BUILTIN_MAX: ruby_preserved_encindex = 12; pub type ruby_preserved_encindex = u32; -pub type attr_index_t = u32; -pub type shape_id_t = u32; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct rb_shape { - pub edges: *mut rb_id_table, - pub edge_name: ID, - pub next_iv_index: attr_index_t, - pub capacity: u32, - pub type_: u8, - pub size_pool_index: u8, - pub parent_id: shape_id_t, -} -pub type rb_shape_t = rb_shape; -pub const idDot2: ruby_method_ids = 128; -pub const idDot3: ruby_method_ids = 129; -pub const idUPlus: ruby_method_ids = 132; -pub const idUMinus: ruby_method_ids = 133; -pub const idPow: ruby_method_ids = 134; -pub const idCmp: ruby_method_ids = 135; -pub const 
idPLUS: ruby_method_ids = 43; -pub const idMINUS: ruby_method_ids = 45; -pub const idMULT: ruby_method_ids = 42; -pub const idDIV: ruby_method_ids = 47; -pub const idMOD: ruby_method_ids = 37; -pub const idLTLT: ruby_method_ids = 136; -pub const idGTGT: ruby_method_ids = 137; -pub const idLT: ruby_method_ids = 60; -pub const idLE: ruby_method_ids = 138; -pub const idGT: ruby_method_ids = 62; -pub const idGE: ruby_method_ids = 139; -pub const idEq: ruby_method_ids = 140; -pub const idEqq: ruby_method_ids = 141; -pub const idNeq: ruby_method_ids = 142; -pub const idNot: ruby_method_ids = 33; -pub const idAnd: ruby_method_ids = 38; -pub const idOr: ruby_method_ids = 124; -pub const idBackquote: ruby_method_ids = 96; -pub const idEqTilde: ruby_method_ids = 143; -pub const idNeqTilde: ruby_method_ids = 144; -pub const idAREF: ruby_method_ids = 145; -pub const idASET: ruby_method_ids = 146; -pub const idCOLON2: ruby_method_ids = 147; -pub const idANDOP: ruby_method_ids = 148; -pub const idOROP: ruby_method_ids = 149; -pub const idANDDOT: ruby_method_ids = 150; -pub const tPRESERVED_ID_BEGIN: ruby_method_ids = 150; -pub const idNilP: ruby_method_ids = 151; -pub const idNULL: ruby_method_ids = 152; -pub const idEmptyP: ruby_method_ids = 153; -pub const idEqlP: ruby_method_ids = 154; -pub const idRespond_to: ruby_method_ids = 155; -pub const idRespond_to_missing: ruby_method_ids = 156; -pub const idIFUNC: ruby_method_ids = 157; -pub const idCFUNC: ruby_method_ids = 158; -pub const id_core_set_method_alias: ruby_method_ids = 159; -pub const id_core_set_variable_alias: ruby_method_ids = 160; -pub const id_core_undef_method: ruby_method_ids = 161; -pub const id_core_define_method: ruby_method_ids = 162; -pub const id_core_define_singleton_method: ruby_method_ids = 163; -pub const id_core_set_postexe: ruby_method_ids = 164; -pub const id_core_hash_merge_ptr: ruby_method_ids = 165; -pub const id_core_hash_merge_kwd: ruby_method_ids = 166; -pub const id_core_raise: ruby_method_ids = 167; -pub const id_core_sprintf: ruby_method_ids = 168; -pub const id_debug_created_info: ruby_method_ids = 169; -pub const tPRESERVED_ID_END: ruby_method_ids = 170; -pub const tTOKEN_LOCAL_BEGIN: ruby_method_ids = 169; -pub const tMax: ruby_method_ids = 170; -pub const tMin: ruby_method_ids = 171; -pub const tFreeze: ruby_method_ids = 172; -pub const tInspect: ruby_method_ids = 173; -pub const tIntern: ruby_method_ids = 174; -pub const tObject_id: ruby_method_ids = 175; -pub const tConst_added: ruby_method_ids = 176; -pub const tConst_missing: ruby_method_ids = 177; -pub const tMethodMissing: ruby_method_ids = 178; -pub const tMethod_added: ruby_method_ids = 179; -pub const tSingleton_method_added: ruby_method_ids = 180; -pub const tMethod_removed: ruby_method_ids = 181; -pub const tSingleton_method_removed: ruby_method_ids = 182; -pub const tMethod_undefined: ruby_method_ids = 183; -pub const tSingleton_method_undefined: ruby_method_ids = 184; -pub const tLength: ruby_method_ids = 185; -pub const tSize: ruby_method_ids = 186; -pub const tGets: ruby_method_ids = 187; -pub const tSucc: ruby_method_ids = 188; -pub const tEach: ruby_method_ids = 189; -pub const tProc: ruby_method_ids = 190; -pub const tLambda: ruby_method_ids = 191; -pub const tSend: ruby_method_ids = 192; -pub const t__send__: ruby_method_ids = 193; -pub const t__attached__: ruby_method_ids = 194; -pub const t__recursive_key__: ruby_method_ids = 195; -pub const tInitialize: ruby_method_ids = 196; -pub const tInitialize_copy: ruby_method_ids = 197; -pub const 
tInitialize_clone: ruby_method_ids = 198; -pub const tInitialize_dup: ruby_method_ids = 199; -pub const tTo_int: ruby_method_ids = 200; -pub const tTo_ary: ruby_method_ids = 201; -pub const tTo_str: ruby_method_ids = 202; -pub const tTo_sym: ruby_method_ids = 203; -pub const tTo_hash: ruby_method_ids = 204; -pub const tTo_proc: ruby_method_ids = 205; -pub const tTo_io: ruby_method_ids = 206; -pub const tTo_a: ruby_method_ids = 207; -pub const tTo_s: ruby_method_ids = 208; -pub const tTo_i: ruby_method_ids = 209; -pub const tTo_f: ruby_method_ids = 210; -pub const tTo_r: ruby_method_ids = 211; -pub const tBt: ruby_method_ids = 212; -pub const tBt_locations: ruby_method_ids = 213; -pub const tCall: ruby_method_ids = 214; -pub const tMesg: ruby_method_ids = 215; -pub const tException: ruby_method_ids = 216; -pub const tLocals: ruby_method_ids = 217; -pub const tNOT: ruby_method_ids = 218; -pub const tAND: ruby_method_ids = 219; -pub const tOR: ruby_method_ids = 220; -pub const tDiv: ruby_method_ids = 221; -pub const tDivmod: ruby_method_ids = 222; -pub const tFdiv: ruby_method_ids = 223; -pub const tQuo: ruby_method_ids = 224; -pub const tName: ruby_method_ids = 225; -pub const tNil: ruby_method_ids = 226; -pub const tPath: ruby_method_ids = 227; -pub const tUScore: ruby_method_ids = 228; -pub const tNUMPARAM_1: ruby_method_ids = 229; -pub const tNUMPARAM_2: ruby_method_ids = 230; -pub const tNUMPARAM_3: ruby_method_ids = 231; -pub const tNUMPARAM_4: ruby_method_ids = 232; -pub const tNUMPARAM_5: ruby_method_ids = 233; -pub const tNUMPARAM_6: ruby_method_ids = 234; -pub const tNUMPARAM_7: ruby_method_ids = 235; -pub const tNUMPARAM_8: ruby_method_ids = 236; -pub const tNUMPARAM_9: ruby_method_ids = 237; -pub const tDefault: ruby_method_ids = 238; -pub const tTOKEN_LOCAL_END: ruby_method_ids = 239; -pub const tTOKEN_INSTANCE_BEGIN: ruby_method_ids = 238; -pub const tTOKEN_INSTANCE_END: ruby_method_ids = 239; -pub const tTOKEN_GLOBAL_BEGIN: ruby_method_ids = 238; -pub const tLASTLINE: ruby_method_ids = 239; -pub const tBACKREF: ruby_method_ids = 240; -pub const tERROR_INFO: ruby_method_ids = 241; -pub const tTOKEN_GLOBAL_END: ruby_method_ids = 242; -pub const tTOKEN_CONST_BEGIN: ruby_method_ids = 241; -pub const tTOKEN_CONST_END: ruby_method_ids = 242; -pub const tTOKEN_CLASS_BEGIN: ruby_method_ids = 241; -pub const tTOKEN_CLASS_END: ruby_method_ids = 242; -pub const tTOKEN_ATTRSET_BEGIN: ruby_method_ids = 241; -pub const tTOKEN_ATTRSET_END: ruby_method_ids = 242; -pub const tNEXT_ID: ruby_method_ids = 242; -pub const idMax: ruby_method_ids = 2721; -pub const idMin: ruby_method_ids = 2737; -pub const idFreeze: ruby_method_ids = 2753; -pub const idInspect: ruby_method_ids = 2769; -pub const idIntern: ruby_method_ids = 2785; -pub const idObject_id: ruby_method_ids = 2801; -pub const idConst_added: ruby_method_ids = 2817; -pub const idConst_missing: ruby_method_ids = 2833; -pub const idMethodMissing: ruby_method_ids = 2849; -pub const idMethod_added: ruby_method_ids = 2865; -pub const idSingleton_method_added: ruby_method_ids = 2881; -pub const idMethod_removed: ruby_method_ids = 2897; -pub const idSingleton_method_removed: ruby_method_ids = 2913; -pub const idMethod_undefined: ruby_method_ids = 2929; -pub const idSingleton_method_undefined: ruby_method_ids = 2945; -pub const idLength: ruby_method_ids = 2961; -pub const idSize: ruby_method_ids = 2977; -pub const idGets: ruby_method_ids = 2993; -pub const idSucc: ruby_method_ids = 3009; -pub const idEach: ruby_method_ids = 3025; -pub const idProc: 
ruby_method_ids = 3041; -pub const idLambda: ruby_method_ids = 3057; -pub const idSend: ruby_method_ids = 3073; -pub const id__send__: ruby_method_ids = 3089; -pub const id__attached__: ruby_method_ids = 3105; -pub const id__recursive_key__: ruby_method_ids = 3121; -pub const idInitialize: ruby_method_ids = 3137; -pub const idInitialize_copy: ruby_method_ids = 3153; -pub const idInitialize_clone: ruby_method_ids = 3169; -pub const idInitialize_dup: ruby_method_ids = 3185; -pub const idTo_int: ruby_method_ids = 3201; -pub const idTo_ary: ruby_method_ids = 3217; -pub const idTo_str: ruby_method_ids = 3233; -pub const idTo_sym: ruby_method_ids = 3249; -pub const idTo_hash: ruby_method_ids = 3265; -pub const idTo_proc: ruby_method_ids = 3281; -pub const idTo_io: ruby_method_ids = 3297; -pub const idTo_a: ruby_method_ids = 3313; -pub const idTo_s: ruby_method_ids = 3329; -pub const idTo_i: ruby_method_ids = 3345; -pub const idTo_f: ruby_method_ids = 3361; -pub const idTo_r: ruby_method_ids = 3377; -pub const idBt: ruby_method_ids = 3393; -pub const idBt_locations: ruby_method_ids = 3409; -pub const idCall: ruby_method_ids = 3425; -pub const idMesg: ruby_method_ids = 3441; -pub const idException: ruby_method_ids = 3457; -pub const idLocals: ruby_method_ids = 3473; -pub const idNOT: ruby_method_ids = 3489; -pub const idAND: ruby_method_ids = 3505; -pub const idOR: ruby_method_ids = 3521; -pub const idDiv: ruby_method_ids = 3537; -pub const idDivmod: ruby_method_ids = 3553; -pub const idFdiv: ruby_method_ids = 3569; -pub const idQuo: ruby_method_ids = 3585; -pub const idName: ruby_method_ids = 3601; -pub const idNil: ruby_method_ids = 3617; -pub const idPath: ruby_method_ids = 3633; -pub const idUScore: ruby_method_ids = 3649; -pub const idNUMPARAM_1: ruby_method_ids = 3665; -pub const idNUMPARAM_2: ruby_method_ids = 3681; -pub const idNUMPARAM_3: ruby_method_ids = 3697; -pub const idNUMPARAM_4: ruby_method_ids = 3713; -pub const idNUMPARAM_5: ruby_method_ids = 3729; -pub const idNUMPARAM_6: ruby_method_ids = 3745; -pub const idNUMPARAM_7: ruby_method_ids = 3761; -pub const idNUMPARAM_8: ruby_method_ids = 3777; -pub const idNUMPARAM_9: ruby_method_ids = 3793; -pub const idDefault: ruby_method_ids = 3809; -pub const idLASTLINE: ruby_method_ids = 3831; -pub const idBACKREF: ruby_method_ids = 3847; -pub const idERROR_INFO: ruby_method_ids = 3863; -pub const tLAST_OP_ID: ruby_method_ids = 169; -pub const idLAST_OP_ID: ruby_method_ids = 10; -pub type ruby_method_ids = u32; pub const BOP_PLUS: ruby_basic_operators = 0; pub const BOP_MINUS: ruby_basic_operators = 1; pub const BOP_MULT: ruby_basic_operators = 2; @@ -531,19 +328,23 @@ pub const BOP_NIL_P: ruby_basic_operators = 15; pub const BOP_SUCC: ruby_basic_operators = 16; pub const BOP_GT: ruby_basic_operators = 17; pub const BOP_GE: ruby_basic_operators = 18; -pub const BOP_NOT: ruby_basic_operators = 19; -pub const BOP_NEQ: ruby_basic_operators = 20; -pub const BOP_MATCH: ruby_basic_operators = 21; -pub const BOP_FREEZE: ruby_basic_operators = 22; -pub const BOP_UMINUS: ruby_basic_operators = 23; -pub const BOP_MAX: ruby_basic_operators = 24; -pub const BOP_MIN: ruby_basic_operators = 25; -pub const BOP_CALL: ruby_basic_operators = 26; -pub const BOP_AND: ruby_basic_operators = 27; -pub const BOP_OR: ruby_basic_operators = 28; -pub const BOP_CMP: ruby_basic_operators = 29; -pub const BOP_DEFAULT: ruby_basic_operators = 30; -pub const BOP_LAST_: ruby_basic_operators = 31; +pub const BOP_GTGT: ruby_basic_operators = 19; +pub const BOP_NOT: 
ruby_basic_operators = 20; +pub const BOP_NEQ: ruby_basic_operators = 21; +pub const BOP_MATCH: ruby_basic_operators = 22; +pub const BOP_FREEZE: ruby_basic_operators = 23; +pub const BOP_UMINUS: ruby_basic_operators = 24; +pub const BOP_MAX: ruby_basic_operators = 25; +pub const BOP_MIN: ruby_basic_operators = 26; +pub const BOP_HASH: ruby_basic_operators = 27; +pub const BOP_CALL: ruby_basic_operators = 28; +pub const BOP_AND: ruby_basic_operators = 29; +pub const BOP_OR: ruby_basic_operators = 30; +pub const BOP_CMP: ruby_basic_operators = 31; +pub const BOP_DEFAULT: ruby_basic_operators = 32; +pub const BOP_PACK: ruby_basic_operators = 33; +pub const BOP_INCLUDE_P: ruby_basic_operators = 34; +pub const BOP_LAST_: ruby_basic_operators = 35; pub type ruby_basic_operators = u32; pub type rb_serial_t = ::std::os::raw::c_ulonglong; pub const imemo_env: imemo_type = 0; @@ -555,11 +356,10 @@ pub const imemo_memo: imemo_type = 5; pub const imemo_ment: imemo_type = 6; pub const imemo_iseq: imemo_type = 7; pub const imemo_tmpbuf: imemo_type = 8; -pub const imemo_ast: imemo_type = 9; -pub const imemo_parser_strterm: imemo_type = 10; -pub const imemo_callinfo: imemo_type = 11; -pub const imemo_callcache: imemo_type = 12; -pub const imemo_constcache: imemo_type = 13; +pub const imemo_callinfo: imemo_type = 10; +pub const imemo_callcache: imemo_type = 11; +pub const imemo_constcache: imemo_type = 12; +pub const imemo_fields: imemo_type = 13; pub type imemo_type = u32; #[repr(C)] #[derive(Debug, Copy, Clone)] @@ -570,7 +370,7 @@ pub struct vm_ifunc_argc { #[repr(C)] pub struct vm_ifunc { pub flags: VALUE, - pub reserved: VALUE, + pub svar_lep: *mut VALUE, pub func: rb_block_call_func_t, pub data: *const ::std::os::raw::c_void, pub argc: vm_ifunc_argc, @@ -612,10 +412,11 @@ pub const VM_METHOD_TYPE_OPTIMIZED: rb_method_type_t = 9; pub const VM_METHOD_TYPE_MISSING: rb_method_type_t = 10; pub const VM_METHOD_TYPE_REFINED: rb_method_type_t = 11; pub type rb_method_type_t = u32; +pub type rb_cfunc_t = ::std::option::Option<unsafe extern "C" fn() -> VALUE>; #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct rb_method_cfunc_struct { - pub func: ::std::option::Option<unsafe extern "C" fn() -> VALUE>, + pub func: rb_cfunc_t, pub invoker: ::std::option::Option< unsafe extern "C" fn( recv: VALUE, @@ -633,18 +434,25 @@ pub const OPTIMIZED_METHOD_TYPE_STRUCT_AREF: method_optimized_type = 3; pub const OPTIMIZED_METHOD_TYPE_STRUCT_ASET: method_optimized_type = 4; pub const OPTIMIZED_METHOD_TYPE__MAX: method_optimized_type = 5; pub type method_optimized_type = u32; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct rb_id_table { - _unused: [u8; 0], -} pub type rb_num_t = ::std::os::raw::c_ulong; +pub const RUBY_TAG_NONE: ruby_tag_type = 0; +pub const RUBY_TAG_RETURN: ruby_tag_type = 1; +pub const RUBY_TAG_BREAK: ruby_tag_type = 2; +pub const RUBY_TAG_NEXT: ruby_tag_type = 3; +pub const RUBY_TAG_RETRY: ruby_tag_type = 4; +pub const RUBY_TAG_REDO: ruby_tag_type = 5; +pub const RUBY_TAG_RAISE: ruby_tag_type = 6; +pub const RUBY_TAG_THROW: ruby_tag_type = 7; +pub const RUBY_TAG_FATAL: ruby_tag_type = 8; +pub const RUBY_TAG_MASK: ruby_tag_type = 15; +pub type ruby_tag_type = u32; +pub const VM_THROW_NO_ESCAPE_FLAG: ruby_vm_throw_flags = 32768; +pub const VM_THROW_STATE_MASK: ruby_vm_throw_flags = 255; +pub type ruby_vm_throw_flags = u32; #[repr(C)] pub struct iseq_inline_constant_cache_entry { pub flags: VALUE, pub value: VALUE, - pub _unused1: VALUE, - pub _unused2: VALUE, pub ic_cref: *const rb_cref_t, } 
#[repr(C)] @@ -656,7 +464,7 @@ pub struct iseq_inline_constant_cache { #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct iseq_inline_iv_cache_entry { - pub value: usize, + pub value: u64, pub iv_set_name: ID, } #[repr(C)] @@ -664,9 +472,24 @@ pub struct iseq_inline_iv_cache_entry { pub struct iseq_inline_cvar_cache_entry { pub entry: *mut rb_cvar_class_tbl_entry, } +pub const ISEQ_TYPE_TOP: rb_iseq_type = 0; +pub const ISEQ_TYPE_METHOD: rb_iseq_type = 1; +pub const ISEQ_TYPE_BLOCK: rb_iseq_type = 2; +pub const ISEQ_TYPE_CLASS: rb_iseq_type = 3; +pub const ISEQ_TYPE_RESCUE: rb_iseq_type = 4; +pub const ISEQ_TYPE_ENSURE: rb_iseq_type = 5; +pub const ISEQ_TYPE_EVAL: rb_iseq_type = 6; +pub const ISEQ_TYPE_MAIN: rb_iseq_type = 7; +pub const ISEQ_TYPE_PLAIN: rb_iseq_type = 8; +pub type rb_iseq_type = u32; +pub const BUILTIN_ATTR_LEAF: rb_builtin_attr = 1; +pub const BUILTIN_ATTR_SINGLE_NOARG_LEAF: rb_builtin_attr = 2; +pub const BUILTIN_ATTR_INLINE_BLOCK: rb_builtin_attr = 4; +pub const BUILTIN_ATTR_C_TRACE: rb_builtin_attr = 8; +pub type rb_builtin_attr = u32; #[repr(C)] #[derive(Debug, Copy, Clone)] -pub struct rb_iseq_constant_body__bindgen_ty_1_rb_iseq_param_keyword { +pub struct rb_iseq_constant_body_rb_iseq_parameters_rb_iseq_param_keyword { pub num: ::std::os::raw::c_int, pub required_num: ::std::os::raw::c_int, pub bits_start: ::std::os::raw::c_int, @@ -768,6 +591,17 @@ impl rb_proc_t { __bindgen_bitfield_unit } } +pub const VM_CHECKMATCH_TYPE_WHEN: vm_check_match_type = 1; +pub const VM_CHECKMATCH_TYPE_CASE: vm_check_match_type = 2; +pub const VM_CHECKMATCH_TYPE_RESCUE: vm_check_match_type = 3; +pub type vm_check_match_type = u32; +pub const VM_OPT_NEWARRAY_SEND_MAX: vm_opt_newarray_send_type = 1; +pub const VM_OPT_NEWARRAY_SEND_MIN: vm_opt_newarray_send_type = 2; +pub const VM_OPT_NEWARRAY_SEND_HASH: vm_opt_newarray_send_type = 3; +pub const VM_OPT_NEWARRAY_SEND_PACK: vm_opt_newarray_send_type = 4; +pub const VM_OPT_NEWARRAY_SEND_PACK_BUFFER: vm_opt_newarray_send_type = 5; +pub const VM_OPT_NEWARRAY_SEND_INCLUDE_P: vm_opt_newarray_send_type = 6; +pub type vm_opt_newarray_send_type = u32; pub const VM_SPECIAL_OBJECT_VMCORE: vm_special_object_type = 1; pub const VM_SPECIAL_OBJECT_CBASE: vm_special_object_type = 2; pub const VM_SPECIAL_OBJECT_CONST_BASE: vm_special_object_type = 3; @@ -792,15 +626,21 @@ pub const VM_FRAME_FLAG_LAMBDA: vm_frame_env_flags = 256; pub const VM_FRAME_FLAG_MODIFIED_BLOCK_PARAM: vm_frame_env_flags = 512; pub const VM_FRAME_FLAG_CFRAME_KW: vm_frame_env_flags = 1024; pub const VM_FRAME_FLAG_PASSED: vm_frame_env_flags = 2048; +pub const VM_FRAME_FLAG_BOX_REQUIRE: vm_frame_env_flags = 4096; pub const VM_ENV_FLAG_LOCAL: vm_frame_env_flags = 2; pub const VM_ENV_FLAG_ESCAPED: vm_frame_env_flags = 4; pub const VM_ENV_FLAG_WB_REQUIRED: vm_frame_env_flags = 8; pub const VM_ENV_FLAG_ISOLATED: vm_frame_env_flags = 16; pub type vm_frame_env_flags = u32; +pub type attr_index_t = u16; +pub type shape_id_t = u32; +pub const SHAPE_ID_HAS_IVAR_MASK: shape_id_mask = 134742014; +pub type shape_id_mask = u32; #[repr(C)] pub struct rb_cvar_class_tbl_entry { pub index: u32, pub global_cvar_state: rb_serial_t, + pub cref: *const rb_cref_t, pub class_value: VALUE, } pub const VM_CALL_ARGS_SPLAT_bit: vm_call_flag_bits = 0; @@ -808,17 +648,24 @@ pub const VM_CALL_ARGS_BLOCKARG_bit: vm_call_flag_bits = 1; pub const VM_CALL_FCALL_bit: vm_call_flag_bits = 2; pub const VM_CALL_VCALL_bit: vm_call_flag_bits = 3; pub const VM_CALL_ARGS_SIMPLE_bit: vm_call_flag_bits = 4; -pub const 
VM_CALL_BLOCKISEQ_bit: vm_call_flag_bits = 5; -pub const VM_CALL_KWARG_bit: vm_call_flag_bits = 6; -pub const VM_CALL_KW_SPLAT_bit: vm_call_flag_bits = 7; -pub const VM_CALL_TAILCALL_bit: vm_call_flag_bits = 8; -pub const VM_CALL_SUPER_bit: vm_call_flag_bits = 9; -pub const VM_CALL_ZSUPER_bit: vm_call_flag_bits = 10; -pub const VM_CALL_OPT_SEND_bit: vm_call_flag_bits = 11; -pub const VM_CALL_KW_SPLAT_MUT_bit: vm_call_flag_bits = 12; -pub const VM_CALL__END: vm_call_flag_bits = 13; +pub const VM_CALL_KWARG_bit: vm_call_flag_bits = 5; +pub const VM_CALL_KW_SPLAT_bit: vm_call_flag_bits = 6; +pub const VM_CALL_TAILCALL_bit: vm_call_flag_bits = 7; +pub const VM_CALL_SUPER_bit: vm_call_flag_bits = 8; +pub const VM_CALL_ZSUPER_bit: vm_call_flag_bits = 9; +pub const VM_CALL_OPT_SEND_bit: vm_call_flag_bits = 10; +pub const VM_CALL_KW_SPLAT_MUT_bit: vm_call_flag_bits = 11; +pub const VM_CALL_ARGS_SPLAT_MUT_bit: vm_call_flag_bits = 12; +pub const VM_CALL_FORWARDING_bit: vm_call_flag_bits = 13; +pub const VM_CALL__END: vm_call_flag_bits = 14; pub type vm_call_flag_bits = u32; #[repr(C)] +pub struct rb_callinfo_kwarg { + pub keyword_len: ::std::os::raw::c_int, + pub references: ::std::os::raw::c_int, + pub keywords: __IncompleteArrayField<VALUE>, +} +#[repr(C)] pub struct rb_callinfo { pub flags: VALUE, pub kwarg: *const rb_callinfo_kwarg, @@ -832,6 +679,8 @@ pub struct rb_call_data { pub ci: *const rb_callinfo, pub cc: *const rb_callcache, } +pub const RSTRING_CHILLED: ruby_rstring_private_flags = 49152; +pub type ruby_rstring_private_flags = u32; pub const RHASH_PASS_AS_KEYWORDS: ruby_rhash_flags = 8192; pub const RHASH_PROC_DEFAULT: ruby_rhash_flags = 16384; pub const RHASH_ST_TABLE_FLAG: ruby_rhash_flags = 32768; @@ -839,7 +688,6 @@ pub const RHASH_AR_TABLE_SIZE_MASK: ruby_rhash_flags = 983040; pub const RHASH_AR_TABLE_SIZE_SHIFT: ruby_rhash_flags = 16; pub const RHASH_AR_TABLE_BOUND_MASK: ruby_rhash_flags = 15728640; pub const RHASH_AR_TABLE_BOUND_SHIFT: ruby_rhash_flags = 20; -pub const RHASH_TRANSIENT_FLAG: ruby_rhash_flags = 16777216; pub const RHASH_LEV_SHIFT: ruby_rhash_flags = 25; pub const RHASH_LEV_MAX: ruby_rhash_flags = 127; pub type ruby_rhash_flags = u32; @@ -850,14 +698,6 @@ pub struct rb_builtin_function { pub argc: ::std::os::raw::c_int, pub index: ::std::os::raw::c_int, pub name: *const ::std::os::raw::c_char, - pub compiler: ::std::option::Option< - unsafe extern "C" fn( - arg1: VALUE, - arg2: ::std::os::raw::c_long, - arg3: ::std::os::raw::c_uint, - arg4: bool, - ), - >, } pub const YARVINSN_nop: ruby_vminsn_type = 0; pub const YARVINSN_getlocal: ruby_vminsn_type = 1; @@ -881,229 +721,326 @@ pub const YARVINSN_putself: ruby_vminsn_type = 18; pub const YARVINSN_putobject: ruby_vminsn_type = 19; pub const YARVINSN_putspecialobject: ruby_vminsn_type = 20; pub const YARVINSN_putstring: ruby_vminsn_type = 21; -pub const YARVINSN_concatstrings: ruby_vminsn_type = 22; -pub const YARVINSN_anytostring: ruby_vminsn_type = 23; -pub const YARVINSN_toregexp: ruby_vminsn_type = 24; -pub const YARVINSN_intern: ruby_vminsn_type = 25; -pub const YARVINSN_newarray: ruby_vminsn_type = 26; -pub const YARVINSN_newarraykwsplat: ruby_vminsn_type = 27; -pub const YARVINSN_duparray: ruby_vminsn_type = 28; -pub const YARVINSN_duphash: ruby_vminsn_type = 29; -pub const YARVINSN_expandarray: ruby_vminsn_type = 30; -pub const YARVINSN_concatarray: ruby_vminsn_type = 31; -pub const YARVINSN_splatarray: ruby_vminsn_type = 32; -pub const YARVINSN_newhash: ruby_vminsn_type = 33; -pub const YARVINSN_newrange: 
ruby_vminsn_type = 34; -pub const YARVINSN_pop: ruby_vminsn_type = 35; -pub const YARVINSN_dup: ruby_vminsn_type = 36; -pub const YARVINSN_dupn: ruby_vminsn_type = 37; -pub const YARVINSN_swap: ruby_vminsn_type = 38; -pub const YARVINSN_opt_reverse: ruby_vminsn_type = 39; -pub const YARVINSN_topn: ruby_vminsn_type = 40; -pub const YARVINSN_setn: ruby_vminsn_type = 41; -pub const YARVINSN_adjuststack: ruby_vminsn_type = 42; -pub const YARVINSN_defined: ruby_vminsn_type = 43; -pub const YARVINSN_checkmatch: ruby_vminsn_type = 44; -pub const YARVINSN_checkkeyword: ruby_vminsn_type = 45; -pub const YARVINSN_checktype: ruby_vminsn_type = 46; -pub const YARVINSN_defineclass: ruby_vminsn_type = 47; -pub const YARVINSN_definemethod: ruby_vminsn_type = 48; -pub const YARVINSN_definesmethod: ruby_vminsn_type = 49; -pub const YARVINSN_send: ruby_vminsn_type = 50; -pub const YARVINSN_opt_send_without_block: ruby_vminsn_type = 51; -pub const YARVINSN_objtostring: ruby_vminsn_type = 52; -pub const YARVINSN_opt_str_freeze: ruby_vminsn_type = 53; -pub const YARVINSN_opt_nil_p: ruby_vminsn_type = 54; -pub const YARVINSN_opt_str_uminus: ruby_vminsn_type = 55; -pub const YARVINSN_opt_newarray_max: ruby_vminsn_type = 56; -pub const YARVINSN_opt_newarray_min: ruby_vminsn_type = 57; -pub const YARVINSN_invokesuper: ruby_vminsn_type = 58; -pub const YARVINSN_invokeblock: ruby_vminsn_type = 59; -pub const YARVINSN_leave: ruby_vminsn_type = 60; -pub const YARVINSN_throw: ruby_vminsn_type = 61; -pub const YARVINSN_jump: ruby_vminsn_type = 62; -pub const YARVINSN_branchif: ruby_vminsn_type = 63; -pub const YARVINSN_branchunless: ruby_vminsn_type = 64; -pub const YARVINSN_branchnil: ruby_vminsn_type = 65; -pub const YARVINSN_once: ruby_vminsn_type = 66; -pub const YARVINSN_opt_case_dispatch: ruby_vminsn_type = 67; -pub const YARVINSN_opt_plus: ruby_vminsn_type = 68; -pub const YARVINSN_opt_minus: ruby_vminsn_type = 69; -pub const YARVINSN_opt_mult: ruby_vminsn_type = 70; -pub const YARVINSN_opt_div: ruby_vminsn_type = 71; -pub const YARVINSN_opt_mod: ruby_vminsn_type = 72; -pub const YARVINSN_opt_eq: ruby_vminsn_type = 73; -pub const YARVINSN_opt_neq: ruby_vminsn_type = 74; -pub const YARVINSN_opt_lt: ruby_vminsn_type = 75; -pub const YARVINSN_opt_le: ruby_vminsn_type = 76; -pub const YARVINSN_opt_gt: ruby_vminsn_type = 77; -pub const YARVINSN_opt_ge: ruby_vminsn_type = 78; -pub const YARVINSN_opt_ltlt: ruby_vminsn_type = 79; -pub const YARVINSN_opt_and: ruby_vminsn_type = 80; -pub const YARVINSN_opt_or: ruby_vminsn_type = 81; -pub const YARVINSN_opt_aref: ruby_vminsn_type = 82; -pub const YARVINSN_opt_aset: ruby_vminsn_type = 83; -pub const YARVINSN_opt_aset_with: ruby_vminsn_type = 84; -pub const YARVINSN_opt_aref_with: ruby_vminsn_type = 85; -pub const YARVINSN_opt_length: ruby_vminsn_type = 86; -pub const YARVINSN_opt_size: ruby_vminsn_type = 87; -pub const YARVINSN_opt_empty_p: ruby_vminsn_type = 88; -pub const YARVINSN_opt_succ: ruby_vminsn_type = 89; -pub const YARVINSN_opt_not: ruby_vminsn_type = 90; -pub const YARVINSN_opt_regexpmatch2: ruby_vminsn_type = 91; -pub const YARVINSN_invokebuiltin: ruby_vminsn_type = 92; -pub const YARVINSN_opt_invokebuiltin_delegate: ruby_vminsn_type = 93; -pub const YARVINSN_opt_invokebuiltin_delegate_leave: ruby_vminsn_type = 94; -pub const YARVINSN_getlocal_WC_0: ruby_vminsn_type = 95; -pub const YARVINSN_getlocal_WC_1: ruby_vminsn_type = 96; -pub const YARVINSN_setlocal_WC_0: ruby_vminsn_type = 97; -pub const YARVINSN_setlocal_WC_1: ruby_vminsn_type = 98; -pub const 
YARVINSN_putobject_INT2FIX_0_: ruby_vminsn_type = 99; -pub const YARVINSN_putobject_INT2FIX_1_: ruby_vminsn_type = 100; -pub const YARVINSN_trace_nop: ruby_vminsn_type = 101; -pub const YARVINSN_trace_getlocal: ruby_vminsn_type = 102; -pub const YARVINSN_trace_setlocal: ruby_vminsn_type = 103; -pub const YARVINSN_trace_getblockparam: ruby_vminsn_type = 104; -pub const YARVINSN_trace_setblockparam: ruby_vminsn_type = 105; -pub const YARVINSN_trace_getblockparamproxy: ruby_vminsn_type = 106; -pub const YARVINSN_trace_getspecial: ruby_vminsn_type = 107; -pub const YARVINSN_trace_setspecial: ruby_vminsn_type = 108; -pub const YARVINSN_trace_getinstancevariable: ruby_vminsn_type = 109; -pub const YARVINSN_trace_setinstancevariable: ruby_vminsn_type = 110; -pub const YARVINSN_trace_getclassvariable: ruby_vminsn_type = 111; -pub const YARVINSN_trace_setclassvariable: ruby_vminsn_type = 112; -pub const YARVINSN_trace_opt_getconstant_path: ruby_vminsn_type = 113; -pub const YARVINSN_trace_getconstant: ruby_vminsn_type = 114; -pub const YARVINSN_trace_setconstant: ruby_vminsn_type = 115; -pub const YARVINSN_trace_getglobal: ruby_vminsn_type = 116; -pub const YARVINSN_trace_setglobal: ruby_vminsn_type = 117; -pub const YARVINSN_trace_putnil: ruby_vminsn_type = 118; -pub const YARVINSN_trace_putself: ruby_vminsn_type = 119; -pub const YARVINSN_trace_putobject: ruby_vminsn_type = 120; -pub const YARVINSN_trace_putspecialobject: ruby_vminsn_type = 121; -pub const YARVINSN_trace_putstring: ruby_vminsn_type = 122; -pub const YARVINSN_trace_concatstrings: ruby_vminsn_type = 123; -pub const YARVINSN_trace_anytostring: ruby_vminsn_type = 124; -pub const YARVINSN_trace_toregexp: ruby_vminsn_type = 125; -pub const YARVINSN_trace_intern: ruby_vminsn_type = 126; -pub const YARVINSN_trace_newarray: ruby_vminsn_type = 127; -pub const YARVINSN_trace_newarraykwsplat: ruby_vminsn_type = 128; -pub const YARVINSN_trace_duparray: ruby_vminsn_type = 129; -pub const YARVINSN_trace_duphash: ruby_vminsn_type = 130; -pub const YARVINSN_trace_expandarray: ruby_vminsn_type = 131; -pub const YARVINSN_trace_concatarray: ruby_vminsn_type = 132; -pub const YARVINSN_trace_splatarray: ruby_vminsn_type = 133; -pub const YARVINSN_trace_newhash: ruby_vminsn_type = 134; -pub const YARVINSN_trace_newrange: ruby_vminsn_type = 135; -pub const YARVINSN_trace_pop: ruby_vminsn_type = 136; -pub const YARVINSN_trace_dup: ruby_vminsn_type = 137; -pub const YARVINSN_trace_dupn: ruby_vminsn_type = 138; -pub const YARVINSN_trace_swap: ruby_vminsn_type = 139; -pub const YARVINSN_trace_opt_reverse: ruby_vminsn_type = 140; -pub const YARVINSN_trace_topn: ruby_vminsn_type = 141; -pub const YARVINSN_trace_setn: ruby_vminsn_type = 142; -pub const YARVINSN_trace_adjuststack: ruby_vminsn_type = 143; -pub const YARVINSN_trace_defined: ruby_vminsn_type = 144; -pub const YARVINSN_trace_checkmatch: ruby_vminsn_type = 145; -pub const YARVINSN_trace_checkkeyword: ruby_vminsn_type = 146; -pub const YARVINSN_trace_checktype: ruby_vminsn_type = 147; -pub const YARVINSN_trace_defineclass: ruby_vminsn_type = 148; -pub const YARVINSN_trace_definemethod: ruby_vminsn_type = 149; -pub const YARVINSN_trace_definesmethod: ruby_vminsn_type = 150; -pub const YARVINSN_trace_send: ruby_vminsn_type = 151; -pub const YARVINSN_trace_opt_send_without_block: ruby_vminsn_type = 152; -pub const YARVINSN_trace_objtostring: ruby_vminsn_type = 153; -pub const YARVINSN_trace_opt_str_freeze: ruby_vminsn_type = 154; -pub const YARVINSN_trace_opt_nil_p: ruby_vminsn_type = 155; -pub const 
YARVINSN_trace_opt_str_uminus: ruby_vminsn_type = 156; -pub const YARVINSN_trace_opt_newarray_max: ruby_vminsn_type = 157; -pub const YARVINSN_trace_opt_newarray_min: ruby_vminsn_type = 158; -pub const YARVINSN_trace_invokesuper: ruby_vminsn_type = 159; -pub const YARVINSN_trace_invokeblock: ruby_vminsn_type = 160; -pub const YARVINSN_trace_leave: ruby_vminsn_type = 161; -pub const YARVINSN_trace_throw: ruby_vminsn_type = 162; -pub const YARVINSN_trace_jump: ruby_vminsn_type = 163; -pub const YARVINSN_trace_branchif: ruby_vminsn_type = 164; -pub const YARVINSN_trace_branchunless: ruby_vminsn_type = 165; -pub const YARVINSN_trace_branchnil: ruby_vminsn_type = 166; -pub const YARVINSN_trace_once: ruby_vminsn_type = 167; -pub const YARVINSN_trace_opt_case_dispatch: ruby_vminsn_type = 168; -pub const YARVINSN_trace_opt_plus: ruby_vminsn_type = 169; -pub const YARVINSN_trace_opt_minus: ruby_vminsn_type = 170; -pub const YARVINSN_trace_opt_mult: ruby_vminsn_type = 171; -pub const YARVINSN_trace_opt_div: ruby_vminsn_type = 172; -pub const YARVINSN_trace_opt_mod: ruby_vminsn_type = 173; -pub const YARVINSN_trace_opt_eq: ruby_vminsn_type = 174; -pub const YARVINSN_trace_opt_neq: ruby_vminsn_type = 175; -pub const YARVINSN_trace_opt_lt: ruby_vminsn_type = 176; -pub const YARVINSN_trace_opt_le: ruby_vminsn_type = 177; -pub const YARVINSN_trace_opt_gt: ruby_vminsn_type = 178; -pub const YARVINSN_trace_opt_ge: ruby_vminsn_type = 179; -pub const YARVINSN_trace_opt_ltlt: ruby_vminsn_type = 180; -pub const YARVINSN_trace_opt_and: ruby_vminsn_type = 181; -pub const YARVINSN_trace_opt_or: ruby_vminsn_type = 182; -pub const YARVINSN_trace_opt_aref: ruby_vminsn_type = 183; -pub const YARVINSN_trace_opt_aset: ruby_vminsn_type = 184; -pub const YARVINSN_trace_opt_aset_with: ruby_vminsn_type = 185; -pub const YARVINSN_trace_opt_aref_with: ruby_vminsn_type = 186; -pub const YARVINSN_trace_opt_length: ruby_vminsn_type = 187; -pub const YARVINSN_trace_opt_size: ruby_vminsn_type = 188; -pub const YARVINSN_trace_opt_empty_p: ruby_vminsn_type = 189; -pub const YARVINSN_trace_opt_succ: ruby_vminsn_type = 190; -pub const YARVINSN_trace_opt_not: ruby_vminsn_type = 191; -pub const YARVINSN_trace_opt_regexpmatch2: ruby_vminsn_type = 192; -pub const YARVINSN_trace_invokebuiltin: ruby_vminsn_type = 193; -pub const YARVINSN_trace_opt_invokebuiltin_delegate: ruby_vminsn_type = 194; -pub const YARVINSN_trace_opt_invokebuiltin_delegate_leave: ruby_vminsn_type = 195; -pub const YARVINSN_trace_getlocal_WC_0: ruby_vminsn_type = 196; -pub const YARVINSN_trace_getlocal_WC_1: ruby_vminsn_type = 197; -pub const YARVINSN_trace_setlocal_WC_0: ruby_vminsn_type = 198; -pub const YARVINSN_trace_setlocal_WC_1: ruby_vminsn_type = 199; -pub const YARVINSN_trace_putobject_INT2FIX_0_: ruby_vminsn_type = 200; -pub const YARVINSN_trace_putobject_INT2FIX_1_: ruby_vminsn_type = 201; -pub const VM_INSTRUCTION_SIZE: ruby_vminsn_type = 202; +pub const YARVINSN_putchilledstring: ruby_vminsn_type = 22; +pub const YARVINSN_concatstrings: ruby_vminsn_type = 23; +pub const YARVINSN_anytostring: ruby_vminsn_type = 24; +pub const YARVINSN_toregexp: ruby_vminsn_type = 25; +pub const YARVINSN_intern: ruby_vminsn_type = 26; +pub const YARVINSN_newarray: ruby_vminsn_type = 27; +pub const YARVINSN_pushtoarraykwsplat: ruby_vminsn_type = 28; +pub const YARVINSN_duparray: ruby_vminsn_type = 29; +pub const YARVINSN_duphash: ruby_vminsn_type = 30; +pub const YARVINSN_expandarray: ruby_vminsn_type = 31; +pub const YARVINSN_concatarray: ruby_vminsn_type = 32; +pub const 
YARVINSN_concattoarray: ruby_vminsn_type = 33; +pub const YARVINSN_pushtoarray: ruby_vminsn_type = 34; +pub const YARVINSN_splatarray: ruby_vminsn_type = 35; +pub const YARVINSN_splatkw: ruby_vminsn_type = 36; +pub const YARVINSN_newhash: ruby_vminsn_type = 37; +pub const YARVINSN_newrange: ruby_vminsn_type = 38; +pub const YARVINSN_pop: ruby_vminsn_type = 39; +pub const YARVINSN_dup: ruby_vminsn_type = 40; +pub const YARVINSN_dupn: ruby_vminsn_type = 41; +pub const YARVINSN_swap: ruby_vminsn_type = 42; +pub const YARVINSN_opt_reverse: ruby_vminsn_type = 43; +pub const YARVINSN_topn: ruby_vminsn_type = 44; +pub const YARVINSN_setn: ruby_vminsn_type = 45; +pub const YARVINSN_adjuststack: ruby_vminsn_type = 46; +pub const YARVINSN_defined: ruby_vminsn_type = 47; +pub const YARVINSN_definedivar: ruby_vminsn_type = 48; +pub const YARVINSN_checkmatch: ruby_vminsn_type = 49; +pub const YARVINSN_checkkeyword: ruby_vminsn_type = 50; +pub const YARVINSN_checktype: ruby_vminsn_type = 51; +pub const YARVINSN_defineclass: ruby_vminsn_type = 52; +pub const YARVINSN_definemethod: ruby_vminsn_type = 53; +pub const YARVINSN_definesmethod: ruby_vminsn_type = 54; +pub const YARVINSN_send: ruby_vminsn_type = 55; +pub const YARVINSN_sendforward: ruby_vminsn_type = 56; +pub const YARVINSN_opt_send_without_block: ruby_vminsn_type = 57; +pub const YARVINSN_opt_new: ruby_vminsn_type = 58; +pub const YARVINSN_objtostring: ruby_vminsn_type = 59; +pub const YARVINSN_opt_ary_freeze: ruby_vminsn_type = 60; +pub const YARVINSN_opt_hash_freeze: ruby_vminsn_type = 61; +pub const YARVINSN_opt_str_freeze: ruby_vminsn_type = 62; +pub const YARVINSN_opt_nil_p: ruby_vminsn_type = 63; +pub const YARVINSN_opt_str_uminus: ruby_vminsn_type = 64; +pub const YARVINSN_opt_duparray_send: ruby_vminsn_type = 65; +pub const YARVINSN_opt_newarray_send: ruby_vminsn_type = 66; +pub const YARVINSN_invokesuper: ruby_vminsn_type = 67; +pub const YARVINSN_invokesuperforward: ruby_vminsn_type = 68; +pub const YARVINSN_invokeblock: ruby_vminsn_type = 69; +pub const YARVINSN_leave: ruby_vminsn_type = 70; +pub const YARVINSN_throw: ruby_vminsn_type = 71; +pub const YARVINSN_jump: ruby_vminsn_type = 72; +pub const YARVINSN_branchif: ruby_vminsn_type = 73; +pub const YARVINSN_branchunless: ruby_vminsn_type = 74; +pub const YARVINSN_branchnil: ruby_vminsn_type = 75; +pub const YARVINSN_once: ruby_vminsn_type = 76; +pub const YARVINSN_opt_case_dispatch: ruby_vminsn_type = 77; +pub const YARVINSN_opt_plus: ruby_vminsn_type = 78; +pub const YARVINSN_opt_minus: ruby_vminsn_type = 79; +pub const YARVINSN_opt_mult: ruby_vminsn_type = 80; +pub const YARVINSN_opt_div: ruby_vminsn_type = 81; +pub const YARVINSN_opt_mod: ruby_vminsn_type = 82; +pub const YARVINSN_opt_eq: ruby_vminsn_type = 83; +pub const YARVINSN_opt_neq: ruby_vminsn_type = 84; +pub const YARVINSN_opt_lt: ruby_vminsn_type = 85; +pub const YARVINSN_opt_le: ruby_vminsn_type = 86; +pub const YARVINSN_opt_gt: ruby_vminsn_type = 87; +pub const YARVINSN_opt_ge: ruby_vminsn_type = 88; +pub const YARVINSN_opt_ltlt: ruby_vminsn_type = 89; +pub const YARVINSN_opt_and: ruby_vminsn_type = 90; +pub const YARVINSN_opt_or: ruby_vminsn_type = 91; +pub const YARVINSN_opt_aref: ruby_vminsn_type = 92; +pub const YARVINSN_opt_aset: ruby_vminsn_type = 93; +pub const YARVINSN_opt_length: ruby_vminsn_type = 94; +pub const YARVINSN_opt_size: ruby_vminsn_type = 95; +pub const YARVINSN_opt_empty_p: ruby_vminsn_type = 96; +pub const YARVINSN_opt_succ: ruby_vminsn_type = 97; +pub const YARVINSN_opt_not: ruby_vminsn_type 
= 98; +pub const YARVINSN_opt_regexpmatch2: ruby_vminsn_type = 99; +pub const YARVINSN_invokebuiltin: ruby_vminsn_type = 100; +pub const YARVINSN_opt_invokebuiltin_delegate: ruby_vminsn_type = 101; +pub const YARVINSN_opt_invokebuiltin_delegate_leave: ruby_vminsn_type = 102; +pub const YARVINSN_getlocal_WC_0: ruby_vminsn_type = 103; +pub const YARVINSN_getlocal_WC_1: ruby_vminsn_type = 104; +pub const YARVINSN_setlocal_WC_0: ruby_vminsn_type = 105; +pub const YARVINSN_setlocal_WC_1: ruby_vminsn_type = 106; +pub const YARVINSN_putobject_INT2FIX_0_: ruby_vminsn_type = 107; +pub const YARVINSN_putobject_INT2FIX_1_: ruby_vminsn_type = 108; +pub const YARVINSN_trace_nop: ruby_vminsn_type = 109; +pub const YARVINSN_trace_getlocal: ruby_vminsn_type = 110; +pub const YARVINSN_trace_setlocal: ruby_vminsn_type = 111; +pub const YARVINSN_trace_getblockparam: ruby_vminsn_type = 112; +pub const YARVINSN_trace_setblockparam: ruby_vminsn_type = 113; +pub const YARVINSN_trace_getblockparamproxy: ruby_vminsn_type = 114; +pub const YARVINSN_trace_getspecial: ruby_vminsn_type = 115; +pub const YARVINSN_trace_setspecial: ruby_vminsn_type = 116; +pub const YARVINSN_trace_getinstancevariable: ruby_vminsn_type = 117; +pub const YARVINSN_trace_setinstancevariable: ruby_vminsn_type = 118; +pub const YARVINSN_trace_getclassvariable: ruby_vminsn_type = 119; +pub const YARVINSN_trace_setclassvariable: ruby_vminsn_type = 120; +pub const YARVINSN_trace_opt_getconstant_path: ruby_vminsn_type = 121; +pub const YARVINSN_trace_getconstant: ruby_vminsn_type = 122; +pub const YARVINSN_trace_setconstant: ruby_vminsn_type = 123; +pub const YARVINSN_trace_getglobal: ruby_vminsn_type = 124; +pub const YARVINSN_trace_setglobal: ruby_vminsn_type = 125; +pub const YARVINSN_trace_putnil: ruby_vminsn_type = 126; +pub const YARVINSN_trace_putself: ruby_vminsn_type = 127; +pub const YARVINSN_trace_putobject: ruby_vminsn_type = 128; +pub const YARVINSN_trace_putspecialobject: ruby_vminsn_type = 129; +pub const YARVINSN_trace_putstring: ruby_vminsn_type = 130; +pub const YARVINSN_trace_putchilledstring: ruby_vminsn_type = 131; +pub const YARVINSN_trace_concatstrings: ruby_vminsn_type = 132; +pub const YARVINSN_trace_anytostring: ruby_vminsn_type = 133; +pub const YARVINSN_trace_toregexp: ruby_vminsn_type = 134; +pub const YARVINSN_trace_intern: ruby_vminsn_type = 135; +pub const YARVINSN_trace_newarray: ruby_vminsn_type = 136; +pub const YARVINSN_trace_pushtoarraykwsplat: ruby_vminsn_type = 137; +pub const YARVINSN_trace_duparray: ruby_vminsn_type = 138; +pub const YARVINSN_trace_duphash: ruby_vminsn_type = 139; +pub const YARVINSN_trace_expandarray: ruby_vminsn_type = 140; +pub const YARVINSN_trace_concatarray: ruby_vminsn_type = 141; +pub const YARVINSN_trace_concattoarray: ruby_vminsn_type = 142; +pub const YARVINSN_trace_pushtoarray: ruby_vminsn_type = 143; +pub const YARVINSN_trace_splatarray: ruby_vminsn_type = 144; +pub const YARVINSN_trace_splatkw: ruby_vminsn_type = 145; +pub const YARVINSN_trace_newhash: ruby_vminsn_type = 146; +pub const YARVINSN_trace_newrange: ruby_vminsn_type = 147; +pub const YARVINSN_trace_pop: ruby_vminsn_type = 148; +pub const YARVINSN_trace_dup: ruby_vminsn_type = 149; +pub const YARVINSN_trace_dupn: ruby_vminsn_type = 150; +pub const YARVINSN_trace_swap: ruby_vminsn_type = 151; +pub const YARVINSN_trace_opt_reverse: ruby_vminsn_type = 152; +pub const YARVINSN_trace_topn: ruby_vminsn_type = 153; +pub const YARVINSN_trace_setn: ruby_vminsn_type = 154; +pub const YARVINSN_trace_adjuststack: 
ruby_vminsn_type = 155; +pub const YARVINSN_trace_defined: ruby_vminsn_type = 156; +pub const YARVINSN_trace_definedivar: ruby_vminsn_type = 157; +pub const YARVINSN_trace_checkmatch: ruby_vminsn_type = 158; +pub const YARVINSN_trace_checkkeyword: ruby_vminsn_type = 159; +pub const YARVINSN_trace_checktype: ruby_vminsn_type = 160; +pub const YARVINSN_trace_defineclass: ruby_vminsn_type = 161; +pub const YARVINSN_trace_definemethod: ruby_vminsn_type = 162; +pub const YARVINSN_trace_definesmethod: ruby_vminsn_type = 163; +pub const YARVINSN_trace_send: ruby_vminsn_type = 164; +pub const YARVINSN_trace_sendforward: ruby_vminsn_type = 165; +pub const YARVINSN_trace_opt_send_without_block: ruby_vminsn_type = 166; +pub const YARVINSN_trace_opt_new: ruby_vminsn_type = 167; +pub const YARVINSN_trace_objtostring: ruby_vminsn_type = 168; +pub const YARVINSN_trace_opt_ary_freeze: ruby_vminsn_type = 169; +pub const YARVINSN_trace_opt_hash_freeze: ruby_vminsn_type = 170; +pub const YARVINSN_trace_opt_str_freeze: ruby_vminsn_type = 171; +pub const YARVINSN_trace_opt_nil_p: ruby_vminsn_type = 172; +pub const YARVINSN_trace_opt_str_uminus: ruby_vminsn_type = 173; +pub const YARVINSN_trace_opt_duparray_send: ruby_vminsn_type = 174; +pub const YARVINSN_trace_opt_newarray_send: ruby_vminsn_type = 175; +pub const YARVINSN_trace_invokesuper: ruby_vminsn_type = 176; +pub const YARVINSN_trace_invokesuperforward: ruby_vminsn_type = 177; +pub const YARVINSN_trace_invokeblock: ruby_vminsn_type = 178; +pub const YARVINSN_trace_leave: ruby_vminsn_type = 179; +pub const YARVINSN_trace_throw: ruby_vminsn_type = 180; +pub const YARVINSN_trace_jump: ruby_vminsn_type = 181; +pub const YARVINSN_trace_branchif: ruby_vminsn_type = 182; +pub const YARVINSN_trace_branchunless: ruby_vminsn_type = 183; +pub const YARVINSN_trace_branchnil: ruby_vminsn_type = 184; +pub const YARVINSN_trace_once: ruby_vminsn_type = 185; +pub const YARVINSN_trace_opt_case_dispatch: ruby_vminsn_type = 186; +pub const YARVINSN_trace_opt_plus: ruby_vminsn_type = 187; +pub const YARVINSN_trace_opt_minus: ruby_vminsn_type = 188; +pub const YARVINSN_trace_opt_mult: ruby_vminsn_type = 189; +pub const YARVINSN_trace_opt_div: ruby_vminsn_type = 190; +pub const YARVINSN_trace_opt_mod: ruby_vminsn_type = 191; +pub const YARVINSN_trace_opt_eq: ruby_vminsn_type = 192; +pub const YARVINSN_trace_opt_neq: ruby_vminsn_type = 193; +pub const YARVINSN_trace_opt_lt: ruby_vminsn_type = 194; +pub const YARVINSN_trace_opt_le: ruby_vminsn_type = 195; +pub const YARVINSN_trace_opt_gt: ruby_vminsn_type = 196; +pub const YARVINSN_trace_opt_ge: ruby_vminsn_type = 197; +pub const YARVINSN_trace_opt_ltlt: ruby_vminsn_type = 198; +pub const YARVINSN_trace_opt_and: ruby_vminsn_type = 199; +pub const YARVINSN_trace_opt_or: ruby_vminsn_type = 200; +pub const YARVINSN_trace_opt_aref: ruby_vminsn_type = 201; +pub const YARVINSN_trace_opt_aset: ruby_vminsn_type = 202; +pub const YARVINSN_trace_opt_length: ruby_vminsn_type = 203; +pub const YARVINSN_trace_opt_size: ruby_vminsn_type = 204; +pub const YARVINSN_trace_opt_empty_p: ruby_vminsn_type = 205; +pub const YARVINSN_trace_opt_succ: ruby_vminsn_type = 206; +pub const YARVINSN_trace_opt_not: ruby_vminsn_type = 207; +pub const YARVINSN_trace_opt_regexpmatch2: ruby_vminsn_type = 208; +pub const YARVINSN_trace_invokebuiltin: ruby_vminsn_type = 209; +pub const YARVINSN_trace_opt_invokebuiltin_delegate: ruby_vminsn_type = 210; +pub const YARVINSN_trace_opt_invokebuiltin_delegate_leave: ruby_vminsn_type = 211; +pub const 
YARVINSN_trace_getlocal_WC_0: ruby_vminsn_type = 212; +pub const YARVINSN_trace_getlocal_WC_1: ruby_vminsn_type = 213; +pub const YARVINSN_trace_setlocal_WC_0: ruby_vminsn_type = 214; +pub const YARVINSN_trace_setlocal_WC_1: ruby_vminsn_type = 215; +pub const YARVINSN_trace_putobject_INT2FIX_0_: ruby_vminsn_type = 216; +pub const YARVINSN_trace_putobject_INT2FIX_1_: ruby_vminsn_type = 217; +pub const YARVINSN_zjit_getinstancevariable: ruby_vminsn_type = 218; +pub const YARVINSN_zjit_setinstancevariable: ruby_vminsn_type = 219; +pub const YARVINSN_zjit_definedivar: ruby_vminsn_type = 220; +pub const YARVINSN_zjit_send: ruby_vminsn_type = 221; +pub const YARVINSN_zjit_opt_send_without_block: ruby_vminsn_type = 222; +pub const YARVINSN_zjit_objtostring: ruby_vminsn_type = 223; +pub const YARVINSN_zjit_opt_nil_p: ruby_vminsn_type = 224; +pub const YARVINSN_zjit_invokesuper: ruby_vminsn_type = 225; +pub const YARVINSN_zjit_invokeblock: ruby_vminsn_type = 226; +pub const YARVINSN_zjit_opt_plus: ruby_vminsn_type = 227; +pub const YARVINSN_zjit_opt_minus: ruby_vminsn_type = 228; +pub const YARVINSN_zjit_opt_mult: ruby_vminsn_type = 229; +pub const YARVINSN_zjit_opt_div: ruby_vminsn_type = 230; +pub const YARVINSN_zjit_opt_mod: ruby_vminsn_type = 231; +pub const YARVINSN_zjit_opt_eq: ruby_vminsn_type = 232; +pub const YARVINSN_zjit_opt_neq: ruby_vminsn_type = 233; +pub const YARVINSN_zjit_opt_lt: ruby_vminsn_type = 234; +pub const YARVINSN_zjit_opt_le: ruby_vminsn_type = 235; +pub const YARVINSN_zjit_opt_gt: ruby_vminsn_type = 236; +pub const YARVINSN_zjit_opt_ge: ruby_vminsn_type = 237; +pub const YARVINSN_zjit_opt_ltlt: ruby_vminsn_type = 238; +pub const YARVINSN_zjit_opt_and: ruby_vminsn_type = 239; +pub const YARVINSN_zjit_opt_or: ruby_vminsn_type = 240; +pub const YARVINSN_zjit_opt_aref: ruby_vminsn_type = 241; +pub const YARVINSN_zjit_opt_aset: ruby_vminsn_type = 242; +pub const YARVINSN_zjit_opt_length: ruby_vminsn_type = 243; +pub const YARVINSN_zjit_opt_size: ruby_vminsn_type = 244; +pub const YARVINSN_zjit_opt_empty_p: ruby_vminsn_type = 245; +pub const YARVINSN_zjit_opt_succ: ruby_vminsn_type = 246; +pub const YARVINSN_zjit_opt_not: ruby_vminsn_type = 247; +pub const YARVINSN_zjit_opt_regexpmatch2: ruby_vminsn_type = 248; +pub const VM_INSTRUCTION_SIZE: ruby_vminsn_type = 249; pub type ruby_vminsn_type = u32; pub type rb_iseq_callback = ::std::option::Option< unsafe extern "C" fn(arg1: *const rb_iseq_t, arg2: *mut ::std::os::raw::c_void), >; -pub const RUBY_OFFSET_RSTRING_AS_HEAP_LEN: rstring_offsets = 16; -pub const RUBY_OFFSET_RSTRING_EMBED_LEN: rstring_offsets = 16; -pub type rstring_offsets = u32; -pub type rb_seq_param_keyword_struct = rb_iseq_constant_body__bindgen_ty_1_rb_iseq_param_keyword; +pub const DEFINED_NOT_DEFINED: defined_type = 0; +pub const DEFINED_NIL: defined_type = 1; +pub const DEFINED_IVAR: defined_type = 2; +pub const DEFINED_LVAR: defined_type = 3; +pub const DEFINED_GVAR: defined_type = 4; +pub const DEFINED_CVAR: defined_type = 5; +pub const DEFINED_CONST: defined_type = 6; +pub const DEFINED_METHOD: defined_type = 7; +pub const DEFINED_YIELD: defined_type = 8; +pub const DEFINED_ZSUPER: defined_type = 9; +pub const DEFINED_SELF: defined_type = 10; +pub const DEFINED_TRUE: defined_type = 11; +pub const DEFINED_FALSE: defined_type = 12; +pub const DEFINED_ASGN: defined_type = 13; +pub const DEFINED_EXPR: defined_type = 14; +pub const DEFINED_REF: defined_type = 15; +pub const DEFINED_FUNC: defined_type = 16; +pub const DEFINED_CONST_FROM: defined_type = 17; +pub 
type defined_type = u32; +pub type rb_seq_param_keyword_struct = + rb_iseq_constant_body_rb_iseq_parameters_rb_iseq_param_keyword; +pub const ROBJECT_OFFSET_AS_HEAP_FIELDS: jit_bindgen_constants = 16; +pub const ROBJECT_OFFSET_AS_ARY: jit_bindgen_constants = 16; +pub const RUBY_OFFSET_RSTRING_LEN: jit_bindgen_constants = 16; +pub const RUBY_OFFSET_EC_CFP: jit_bindgen_constants = 16; +pub const RUBY_OFFSET_EC_INTERRUPT_FLAG: jit_bindgen_constants = 32; +pub const RUBY_OFFSET_EC_INTERRUPT_MASK: jit_bindgen_constants = 36; +pub const RUBY_OFFSET_EC_THREAD_PTR: jit_bindgen_constants = 48; +pub const RUBY_OFFSET_EC_RACTOR_ID: jit_bindgen_constants = 64; +pub type jit_bindgen_constants = u32; +pub type rb_iseq_param_keyword_struct = + rb_iseq_constant_body_rb_iseq_parameters_rb_iseq_param_keyword; extern "C" { + pub fn ruby_xfree(ptr: *mut ::std::os::raw::c_void); + pub fn rb_class_attached_object(klass: VALUE) -> VALUE; pub fn rb_singleton_class(obj: VALUE) -> VALUE; pub fn rb_get_alloc_func(klass: VALUE) -> rb_alloc_func_t; pub fn rb_method_basic_definition_p(klass: VALUE, mid: ID) -> ::std::os::raw::c_int; + pub fn rb_bug(fmt: *const ::std::os::raw::c_char, ...) -> !; + pub fn rb_float_new(d: f64) -> VALUE; + pub fn rb_gc_mark(obj: VALUE); + pub fn rb_gc_mark_movable(obj: VALUE); + pub fn rb_gc_location(obj: VALUE) -> VALUE; pub fn rb_gc_writebarrier(old: VALUE, young: VALUE); pub fn rb_class_get_superclass(klass: VALUE) -> VALUE; + pub fn rb_funcall(recv: VALUE, mid: ID, n: ::std::os::raw::c_int, ...) -> VALUE; pub static mut rb_mKernel: VALUE; pub static mut rb_cBasicObject: VALUE; pub static mut rb_cArray: VALUE; + pub static mut rb_cClass: VALUE; pub static mut rb_cFalseClass: VALUE; pub static mut rb_cFloat: VALUE; pub static mut rb_cHash: VALUE; + pub static mut rb_cIO: VALUE; pub static mut rb_cInteger: VALUE; pub static mut rb_cModule: VALUE; pub static mut rb_cNilClass: VALUE; + pub static mut rb_cNumeric: VALUE; pub static mut rb_cString: VALUE; pub static mut rb_cSymbol: VALUE; pub static mut rb_cThread: VALUE; pub static mut rb_cTrueClass: VALUE; + pub fn rb_obj_class(obj: VALUE) -> VALUE; pub fn rb_ary_new_capa(capa: ::std::os::raw::c_long) -> VALUE; pub fn rb_ary_store(ary: VALUE, key: ::std::os::raw::c_long, val: VALUE); + pub fn rb_ary_dup(ary: VALUE) -> VALUE; pub fn rb_ary_resurrect(ary: VALUE) -> VALUE; + pub fn rb_ary_cat(ary: VALUE, train: *const VALUE, len: ::std::os::raw::c_long) -> VALUE; + pub fn rb_ary_push(ary: VALUE, elem: VALUE) -> VALUE; pub fn rb_ary_clear(ary: VALUE) -> VALUE; pub fn rb_hash_new() -> VALUE; pub fn rb_hash_aref(hash: VALUE, key: VALUE) -> VALUE; pub fn rb_hash_aset(hash: VALUE, key: VALUE, val: VALUE) -> VALUE; pub fn rb_hash_bulk_insert(argc: ::std::os::raw::c_long, argv: *const VALUE, hash: VALUE); + pub fn rb_obj_is_proc(recv: VALUE) -> VALUE; pub fn rb_sym2id(obj: VALUE) -> ID; pub fn rb_id2sym(id: ID) -> VALUE; pub fn rb_intern(name: *const ::std::os::raw::c_char) -> ID; - pub fn rb_gc_mark(obj: VALUE); - pub fn rb_gc_mark_movable(obj: VALUE); - pub fn rb_gc_location(obj: VALUE) -> VALUE; + pub fn rb_intern2(name: *const ::std::os::raw::c_char, len: ::std::os::raw::c_long) -> ID; + pub fn rb_id2name(id: ID) -> *const ::std::os::raw::c_char; + pub fn rb_class2name(klass: VALUE) -> *const ::std::os::raw::c_char; + pub fn rb_class_new_instance_pass_kw( + argc: ::std::os::raw::c_int, + argv: *const VALUE, + klass: VALUE, + ) -> VALUE; pub fn rb_obj_is_kind_of(obj: VALUE, klass: VALUE) -> VALUE; + pub fn rb_obj_alloc(klass: VALUE) -> VALUE; pub 
fn rb_obj_frozen_p(obj: VALUE) -> VALUE; pub fn rb_backref_get() -> VALUE; pub fn rb_range_new(beg: VALUE, end: VALUE, excl: ::std::os::raw::c_int) -> VALUE; @@ -1119,23 +1056,15 @@ extern "C" { pub fn rb_str_buf_append(dst: VALUE, src: VALUE) -> VALUE; pub fn rb_str_dup(str_: VALUE) -> VALUE; pub fn rb_str_intern(str_: VALUE) -> VALUE; + pub fn rb_mod_name(mod_: VALUE) -> VALUE; pub fn rb_ivar_get(obj: VALUE, name: ID) -> VALUE; + pub fn rb_ivar_defined(obj: VALUE, name: ID) -> VALUE; pub fn rb_attr_get(obj: VALUE, name: ID) -> VALUE; + pub fn rb_const_get(space: VALUE, name: ID) -> VALUE; pub fn rb_obj_info_dump(obj: VALUE); - pub fn rb_reg_new_ary(ary: VALUE, options: ::std::os::raw::c_int) -> VALUE; pub fn rb_class_allocate_instance(klass: VALUE) -> VALUE; - pub fn rb_obj_info(obj: VALUE) -> *const ::std::os::raw::c_char; - pub fn rb_shape_id_offset() -> i32; - pub fn rb_shape_get_shape_by_id(shape_id: shape_id_t) -> *mut rb_shape_t; - pub fn rb_shape_get_shape_id(obj: VALUE) -> shape_id_t; - pub fn rb_shape_get_iv_index(shape: *mut rb_shape_t, id: ID, value: *mut attr_index_t) -> bool; - pub fn rb_shape_obj_too_complex(obj: VALUE) -> bool; - pub fn rb_shape_transition_shape_capa( - shape: *mut rb_shape_t, - new_capacity: u32, - ) -> *mut rb_shape_t; - pub fn rb_shape_get_next(shape: *mut rb_shape_t, obj: VALUE, id: ID) -> *mut rb_shape_t; - pub fn rb_shape_id(shape: *mut rb_shape_t) -> shape_id_t; + pub fn rb_obj_equal(obj1: VALUE, obj2: VALUE) -> VALUE; + pub fn rb_reg_new_ary(ary: VALUE, options: ::std::os::raw::c_int) -> VALUE; pub fn rb_ary_tmp_new_from_values( arg1: VALUE, arg2: ::std::os::raw::c_long, @@ -1146,23 +1075,55 @@ extern "C" { n: ::std::os::raw::c_long, elts: *const VALUE, ) -> VALUE; + pub fn rb_vm_top_self() -> VALUE; + pub static mut rb_vm_insn_count: u64; pub fn rb_method_entry_at(obj: VALUE, id: ID) -> *const rb_method_entry_t; pub fn rb_callable_method_entry(klass: VALUE, id: ID) -> *const rb_callable_method_entry_t; pub fn rb_callable_method_entry_or_negative( klass: VALUE, id: ID, ) -> *const rb_callable_method_entry_t; + pub static mut rb_cRubyVM: VALUE; pub static mut rb_mRubyVMFrozenCore: VALUE; pub static mut rb_block_param_proxy: VALUE; pub fn rb_vm_ep_local_ep(ep: *const VALUE) -> *const VALUE; pub fn rb_iseq_path(iseq: *const rb_iseq_t) -> VALUE; + pub fn rb_vm_env_write(ep: *const VALUE, index: ::std::os::raw::c_int, v: VALUE); pub fn rb_vm_bh_to_procval(ec: *const rb_execution_context_t, block_handler: VALUE) -> VALUE; pub fn rb_vm_frame_method_entry( cfp: *const rb_control_frame_t, ) -> *const rb_callable_method_entry_t; + pub fn rb_obj_info(obj: VALUE) -> *const ::std::os::raw::c_char; + pub fn rb_ec_stack_check(ec: *mut rb_execution_context_struct) -> ::std::os::raw::c_int; + pub fn rb_shape_id_offset() -> i32; + pub fn rb_obj_shape_id(obj: VALUE) -> shape_id_t; + pub fn rb_shape_get_iv_index(shape_id: shape_id_t, id: ID, value: *mut attr_index_t) -> bool; + pub fn rb_shape_transition_add_ivar_no_warnings( + klass: VALUE, + original_shape_id: shape_id_t, + id: ID, + ) -> shape_id_t; + pub fn rb_ivar_get_at(obj: VALUE, index: attr_index_t, id: ID) -> VALUE; + pub fn rb_ivar_get_at_no_ractor_check(obj: VALUE, index: attr_index_t) -> VALUE; + pub fn rb_gvar_get(arg1: ID) -> VALUE; + pub fn rb_gvar_set(arg1: ID, arg2: VALUE) -> VALUE; + pub fn rb_ensure_iv_list_size(obj: VALUE, current_len: u32, newsize: u32); + pub fn rb_vm_barrier(); + pub fn rb_str_byte_substr(str_: VALUE, beg: VALUE, len: VALUE) -> VALUE; + pub fn rb_str_substr_two_fixnums( + str_: 
VALUE, + beg: VALUE, + len: VALUE, + empty: ::std::os::raw::c_int, + ) -> VALUE; pub fn rb_obj_as_string_result(str_: VALUE, obj: VALUE) -> VALUE; pub fn rb_str_concat_literals(num: usize, strary: *const VALUE) -> VALUE; - pub fn rb_ec_str_resurrect(ec: *mut rb_execution_context_struct, str_: VALUE) -> VALUE; + pub fn rb_ec_str_resurrect( + ec: *mut rb_execution_context_struct, + str_: VALUE, + chilled: bool, + ) -> VALUE; + pub fn rb_to_hash_type(obj: VALUE) -> VALUE; pub fn rb_hash_stlike_foreach( hash: VALUE, func: st_foreach_callback_func, @@ -1175,15 +1136,17 @@ extern "C" { key: st_data_t, pval: *mut st_data_t, ) -> ::std::os::raw::c_int; - pub fn rb_gvar_get(arg1: ID) -> VALUE; - pub fn rb_gvar_set(arg1: ID, arg2: VALUE) -> VALUE; - pub fn rb_ensure_iv_list_size(obj: VALUE, len: u32, newsize: u32); + pub fn rb_insn_len(insn: VALUE) -> ::std::os::raw::c_int; pub fn rb_vm_insn_decode(encoded: VALUE) -> ::std::os::raw::c_int; + pub fn rb_float_plus(x: VALUE, y: VALUE) -> VALUE; + pub fn rb_float_minus(x: VALUE, y: VALUE) -> VALUE; + pub fn rb_float_mul(x: VALUE, y: VALUE) -> VALUE; + pub fn rb_float_div(x: VALUE, y: VALUE) -> VALUE; + pub fn rb_fix_aref(fix: VALUE, idx: VALUE) -> VALUE; pub fn rb_vm_insn_addr2opcode(addr: *const ::std::os::raw::c_void) -> ::std::os::raw::c_int; pub fn rb_iseq_line_no(iseq: *const rb_iseq_t, pos: usize) -> ::std::os::raw::c_uint; pub fn rb_iseqw_to_iseq(iseqw: VALUE) -> *const rb_iseq_t; pub fn rb_iseq_label(iseq: *const rb_iseq_t) -> VALUE; - pub fn rb_vm_barrier(); pub fn rb_profile_frames( start: ::std::os::raw::c_int, limit: ::std::os::raw::c_int, @@ -1191,33 +1154,59 @@ extern "C" { lines: *mut ::std::os::raw::c_int, ) -> ::std::os::raw::c_int; pub fn rb_jit_cont_each_iseq(callback: rb_iseq_callback, data: *mut ::std::os::raw::c_void); - pub fn rb_yjit_mark_writable(mem_block: *mut ::std::os::raw::c_void, mem_size: u32) -> bool; - pub fn rb_yjit_mark_executable(mem_block: *mut ::std::os::raw::c_void, mem_size: u32); - pub fn rb_yjit_mark_unused(mem_block: *mut ::std::os::raw::c_void, mem_size: u32) -> bool; - pub fn rb_yjit_icache_invalidate( - start: *mut ::std::os::raw::c_void, - end: *mut ::std::os::raw::c_void, - ); pub fn rb_yjit_exit_locations_dict( yjit_raw_samples: *mut VALUE, yjit_line_samples: *mut ::std::os::raw::c_int, samples_len: ::std::os::raw::c_int, ) -> VALUE; - pub fn rb_yjit_get_page_size() -> u32; - pub fn rb_yjit_reserve_addr_space(mem_size: u32) -> *mut u8; - pub fn rb_c_method_tracing_currently_enabled(ec: *mut rb_execution_context_t) -> bool; + pub fn rb_c_method_tracing_currently_enabled(ec: *const rb_execution_context_t) -> bool; pub fn rb_full_cfunc_return(ec: *mut rb_execution_context_t, return_value: VALUE); - pub fn rb_iseq_encoded_size(iseq: *const rb_iseq_t) -> ::std::os::raw::c_uint; pub fn rb_iseq_get_yjit_payload(iseq: *const rb_iseq_t) -> *mut ::std::os::raw::c_void; pub fn rb_iseq_set_yjit_payload(iseq: *const rb_iseq_t, payload: *mut ::std::os::raw::c_void); - pub fn rb_iseq_reset_jit_func(iseq: *const rb_iseq_t); + pub fn rb_get_symbol_id(namep: VALUE) -> ID; + pub fn rb_yjit_builtin_function(iseq: *const rb_iseq_t) -> *const rb_builtin_function; + pub fn rb_yjit_str_simple_append(str1: VALUE, str2: VALUE) -> VALUE; + pub fn rb_vm_base_ptr(cfp: *mut rb_control_frame_struct) -> *mut VALUE; + pub fn rb_str_neq_internal(str1: VALUE, str2: VALUE) -> VALUE; + pub fn rb_ary_unshift_m(argc: ::std::os::raw::c_int, argv: *mut VALUE, ary: VALUE) -> VALUE; + pub fn rb_yjit_rb_ary_subseq_length(ary: VALUE, beg: 
::std::os::raw::c_long) -> VALUE; + pub fn rb_yjit_ruby2_keywords_splat_p(obj: VALUE) -> usize; + pub fn rb_yjit_splat_varg_checks( + sp: *mut VALUE, + splat_array: VALUE, + cfp: *mut rb_control_frame_t, + ) -> VALUE; + pub fn rb_yjit_splat_varg_cfunc(stack_splat_array: *mut VALUE) -> ::std::os::raw::c_int; + pub fn rb_yjit_dump_iseq_loc(iseq: *const rb_iseq_t, insn_idx: u32); + pub fn rb_yjit_iseq_inspect(iseq: *const rb_iseq_t) -> *mut ::std::os::raw::c_char; + pub fn rb_RSTRUCT_SET(st: VALUE, k: ::std::os::raw::c_int, v: VALUE); + pub fn rb_ENCODING_GET(obj: VALUE) -> ::std::os::raw::c_int; + pub fn rb_yjit_constcache_shareable(ice: *const iseq_inline_constant_cache_entry) -> bool; + pub fn rb_yjit_obj_written( + old: VALUE, + young: VALUE, + file: *const ::std::os::raw::c_char, + line: ::std::os::raw::c_int, + ); + pub fn rb_object_shape_count() -> VALUE; + pub fn rb_yjit_shape_obj_too_complex_p(obj: VALUE) -> bool; + pub fn rb_yjit_shape_capacity(shape_id: shape_id_t) -> attr_index_t; + pub fn rb_yjit_shape_index(shape_id: shape_id_t) -> attr_index_t; + pub fn rb_yjit_sendish_sp_pops(ci: *const rb_callinfo) -> usize; + pub fn rb_yjit_invokeblock_sp_pops(ci: *const rb_callinfo) -> usize; + pub fn rb_yjit_cme_ractor_serial(cme: *const rb_callable_method_entry_t) -> rb_serial_t; + pub fn rb_yjit_set_exception_return( + cfp: *mut rb_control_frame_t, + leave_exit: *mut ::std::os::raw::c_void, + leave_exception: *mut ::std::os::raw::c_void, + ); + pub fn rb_vm_instruction_size() -> u32; + pub fn rb_iseq_encoded_size(iseq: *const rb_iseq_t) -> ::std::os::raw::c_uint; pub fn rb_iseq_pc_at_idx(iseq: *const rb_iseq_t, insn_idx: u32) -> *mut VALUE; pub fn rb_iseq_opcode_at_pc(iseq: *const rb_iseq_t, pc: *const VALUE) -> ::std::os::raw::c_int; pub fn rb_RSTRING_LEN(str_: VALUE) -> ::std::os::raw::c_ulong; pub fn rb_RSTRING_PTR(str_: VALUE) -> *mut ::std::os::raw::c_char; - pub fn rb_yjit_get_proc_ptr(procv: VALUE) -> *mut rb_proc_t; pub fn rb_insn_name(insn: VALUE) -> *const ::std::os::raw::c_char; - pub fn rb_insn_len(insn: VALUE) -> ::std::os::raw::c_int; pub fn rb_vm_ci_argc(ci: *const rb_callinfo) -> ::std::os::raw::c_uint; pub fn rb_vm_ci_mid(ci: *const rb_callinfo) -> ID; pub fn rb_vm_ci_flag(ci: *const rb_callinfo) -> ::std::os::raw::c_uint; @@ -1230,7 +1219,6 @@ extern "C" { pub fn rb_METHOD_ENTRY_VISI(me: *const rb_callable_method_entry_t) -> rb_method_visibility_t; pub fn rb_get_cme_def_type(cme: *const rb_callable_method_entry_t) -> rb_method_type_t; pub fn rb_get_cme_def_body_attr_id(cme: *const rb_callable_method_entry_t) -> ID; - pub fn rb_get_symbol_id(namep: VALUE) -> ID; pub fn rb_get_cme_def_body_optimized_type( cme: *const rb_callable_method_entry_t, ) -> method_optimized_type; @@ -1242,85 +1230,93 @@ extern "C" { ) -> *mut rb_method_cfunc_t; pub fn rb_get_def_method_serial(def: *const rb_method_definition_t) -> usize; pub fn rb_get_def_original_id(def: *const rb_method_definition_t) -> ID; + pub fn rb_get_def_bmethod_proc(def: *mut rb_method_definition_t) -> VALUE; + pub fn rb_jit_get_proc_ptr(procv: VALUE) -> *mut rb_proc_t; + pub fn rb_optimized_call( + recv: *mut VALUE, + ec: *mut rb_execution_context_t, + argc: ::std::os::raw::c_int, + argv: *mut VALUE, + kw_splat: ::std::os::raw::c_int, + block_handler: VALUE, + ) -> VALUE; + pub fn rb_jit_iseq_builtin_attrs(iseq: *const rb_iseq_t) -> ::std::os::raw::c_uint; pub fn rb_get_mct_argc(mct: *const rb_method_cfunc_t) -> ::std::os::raw::c_int; pub fn rb_get_mct_func(mct: *const rb_method_cfunc_t) -> *mut ::std::os::raw::c_void; 
pub fn rb_get_def_iseq_ptr(def: *mut rb_method_definition_t) -> *const rb_iseq_t; - pub fn rb_get_def_bmethod_proc(def: *mut rb_method_definition_t) -> VALUE; pub fn rb_get_iseq_body_local_iseq(iseq: *const rb_iseq_t) -> *const rb_iseq_t; pub fn rb_get_iseq_body_parent_iseq(iseq: *const rb_iseq_t) -> *const rb_iseq_t; pub fn rb_get_iseq_body_local_table_size(iseq: *const rb_iseq_t) -> ::std::os::raw::c_uint; pub fn rb_get_iseq_body_iseq_encoded(iseq: *const rb_iseq_t) -> *mut VALUE; pub fn rb_get_iseq_body_stack_max(iseq: *const rb_iseq_t) -> ::std::os::raw::c_uint; + pub fn rb_get_iseq_body_type(iseq: *const rb_iseq_t) -> rb_iseq_type; pub fn rb_get_iseq_flags_has_lead(iseq: *const rb_iseq_t) -> bool; pub fn rb_get_iseq_flags_has_opt(iseq: *const rb_iseq_t) -> bool; pub fn rb_get_iseq_flags_has_kw(iseq: *const rb_iseq_t) -> bool; pub fn rb_get_iseq_flags_has_post(iseq: *const rb_iseq_t) -> bool; pub fn rb_get_iseq_flags_has_kwrest(iseq: *const rb_iseq_t) -> bool; + pub fn rb_get_iseq_flags_anon_kwrest(iseq: *const rb_iseq_t) -> bool; pub fn rb_get_iseq_flags_has_rest(iseq: *const rb_iseq_t) -> bool; pub fn rb_get_iseq_flags_ruby2_keywords(iseq: *const rb_iseq_t) -> bool; pub fn rb_get_iseq_flags_has_block(iseq: *const rb_iseq_t) -> bool; pub fn rb_get_iseq_flags_ambiguous_param0(iseq: *const rb_iseq_t) -> bool; pub fn rb_get_iseq_flags_accepts_no_kwarg(iseq: *const rb_iseq_t) -> bool; + pub fn rb_get_iseq_flags_forwardable(iseq: *const rb_iseq_t) -> bool; pub fn rb_get_iseq_body_param_keyword( iseq: *const rb_iseq_t, - ) -> *const rb_seq_param_keyword_struct; + ) -> *const rb_iseq_param_keyword_struct; pub fn rb_get_iseq_body_param_size(iseq: *const rb_iseq_t) -> ::std::os::raw::c_uint; pub fn rb_get_iseq_body_param_lead_num(iseq: *const rb_iseq_t) -> ::std::os::raw::c_int; pub fn rb_get_iseq_body_param_opt_num(iseq: *const rb_iseq_t) -> ::std::os::raw::c_int; pub fn rb_get_iseq_body_param_opt_table(iseq: *const rb_iseq_t) -> *const VALUE; - pub fn rb_optimized_call( - recv: *mut VALUE, - ec: *mut rb_execution_context_t, - argc: ::std::os::raw::c_int, - argv: *mut VALUE, - kw_splat: ::std::os::raw::c_int, - block_handler: VALUE, - ) -> VALUE; - pub fn rb_leaf_invokebuiltin_iseq_p(iseq: *const rb_iseq_t) -> bool; - pub fn rb_leaf_builtin_function(iseq: *const rb_iseq_t) -> *const rb_builtin_function; - pub fn rb_yjit_str_simple_append(str1: VALUE, str2: VALUE) -> VALUE; pub fn rb_get_ec_cfp(ec: *const rb_execution_context_t) -> *mut rb_control_frame_struct; + pub fn rb_get_cfp_iseq(cfp: *mut rb_control_frame_struct) -> *const rb_iseq_t; pub fn rb_get_cfp_pc(cfp: *mut rb_control_frame_struct) -> *mut VALUE; pub fn rb_get_cfp_sp(cfp: *mut rb_control_frame_struct) -> *mut VALUE; - pub fn rb_set_cfp_pc(cfp: *mut rb_control_frame_struct, pc: *const VALUE); - pub fn rb_set_cfp_sp(cfp: *mut rb_control_frame_struct, sp: *mut VALUE); - pub fn rb_cfp_get_iseq(cfp: *mut rb_control_frame_struct) -> *mut rb_iseq_t; pub fn rb_get_cfp_self(cfp: *mut rb_control_frame_struct) -> VALUE; pub fn rb_get_cfp_ep(cfp: *mut rb_control_frame_struct) -> *mut VALUE; pub fn rb_get_cfp_ep_level(cfp: *mut rb_control_frame_struct, lv: u32) -> *const VALUE; pub fn rb_yarv_class_of(obj: VALUE) -> VALUE; - pub fn rb_yarv_str_eql_internal(str1: VALUE, str2: VALUE) -> VALUE; - pub fn rb_yarv_ary_entry_internal(ary: VALUE, offset: ::std::os::raw::c_long) -> VALUE; - pub fn rb_yarv_fix_mod_fix(recv: VALUE, obj: VALUE) -> VALUE; - pub fn rb_yjit_dump_iseq_loc(iseq: *const rb_iseq_t, insn_idx: u32); pub fn rb_FL_TEST(obj: VALUE, 
flags: VALUE) -> VALUE; pub fn rb_FL_TEST_RAW(obj: VALUE, flags: VALUE) -> VALUE; pub fn rb_RB_TYPE_P(obj: VALUE, t: ruby_value_type) -> bool; pub fn rb_RSTRUCT_LEN(st: VALUE) -> ::std::os::raw::c_long; - pub fn rb_RSTRUCT_SET(st: VALUE, k: ::std::os::raw::c_int, v: VALUE); pub fn rb_get_call_data_ci(cd: *const rb_call_data) -> *const rb_callinfo; pub fn rb_BASIC_OP_UNREDEFINED_P(bop: ruby_basic_operators, klass: u32) -> bool; pub fn rb_RCLASS_ORIGIN(c: VALUE) -> VALUE; - pub fn rb_ENCODING_GET(obj: VALUE) -> ::std::os::raw::c_int; - pub fn rb_yjit_multi_ractor_p() -> bool; pub fn rb_assert_iseq_handle(handle: VALUE); + pub fn rb_assert_holding_vm_lock(); pub fn rb_IMEMO_TYPE_P(imemo: VALUE, imemo_type: imemo_type) -> ::std::os::raw::c_int; pub fn rb_assert_cme_handle(handle: VALUE); - pub fn rb_yjit_for_each_iseq(callback: rb_iseq_callback, data: *mut ::std::os::raw::c_void); - pub fn rb_yjit_obj_written( - old: VALUE, - young: VALUE, - file: *const ::std::os::raw::c_char, - line: ::std::os::raw::c_int, - ); - pub fn rb_yjit_vm_lock_then_barrier( + pub fn rb_yarv_ary_entry_internal(ary: VALUE, offset: ::std::os::raw::c_long) -> VALUE; + pub fn rb_jit_array_len(a: VALUE) -> ::std::os::raw::c_long; + pub fn rb_set_cfp_pc(cfp: *mut rb_control_frame_struct, pc: *const VALUE); + pub fn rb_set_cfp_sp(cfp: *mut rb_control_frame_struct, sp: *mut VALUE); + pub fn rb_jit_shape_too_complex_p(shape_id: shape_id_t) -> bool; + pub fn rb_jit_multi_ractor_p() -> bool; + pub fn rb_jit_vm_lock_then_barrier( recursive_lock_level: *mut ::std::os::raw::c_uint, file: *const ::std::os::raw::c_char, line: ::std::os::raw::c_int, ); - pub fn rb_yjit_vm_unlock( + pub fn rb_jit_vm_unlock( recursive_lock_level: *mut ::std::os::raw::c_uint, file: *const ::std::os::raw::c_char, line: ::std::os::raw::c_int, ); + pub fn rb_iseq_reset_jit_func(iseq: *const rb_iseq_t); + pub fn rb_jit_get_page_size() -> u32; + pub fn rb_jit_reserve_addr_space(mem_size: u32) -> *mut u8; + pub fn rb_jit_for_each_iseq(callback: rb_iseq_callback, data: *mut ::std::os::raw::c_void); + pub fn rb_jit_mark_writable(mem_block: *mut ::std::os::raw::c_void, mem_size: u32) -> bool; + pub fn rb_jit_mark_executable(mem_block: *mut ::std::os::raw::c_void, mem_size: u32); + pub fn rb_jit_mark_unused(mem_block: *mut ::std::os::raw::c_void, mem_size: u32) -> bool; + pub fn rb_jit_icache_invalidate( + start: *mut ::std::os::raw::c_void, + end: *mut ::std::os::raw::c_void, + ); + pub fn rb_jit_fix_mod_fix(recv: VALUE, obj: VALUE) -> VALUE; + pub fn rb_jit_fix_div_fix(recv: VALUE, obj: VALUE) -> VALUE; + pub fn rb_yarv_str_eql_internal(str1: VALUE, str2: VALUE) -> VALUE; + pub fn rb_jit_str_concat_codepoint(str_: VALUE, codepoint: VALUE); } diff --git a/yjit/src/disasm.rs b/yjit/src/disasm.rs index 6fcec5b580..4f85937ee9 100644 --- a/yjit/src/disasm.rs +++ b/yjit/src/disasm.rs @@ -1,16 +1,44 @@ use crate::core::*; use crate::cruby::*; use crate::yjit::yjit_enabled_p; -#[cfg(feature = "disasm")] use crate::asm::CodeBlock; -#[cfg(feature = "disasm")] use crate::codegen::CodePtr; -#[cfg(feature = "disasm")] use crate::options::DumpDisasm; -#[cfg(feature = "disasm")] use std::fmt::Write; +#[cfg_attr(not(feature = "disasm"), allow(dead_code))] +#[derive(Copy, Clone, Debug)] +pub struct TerminalColor { + pub blue_begin: &'static str, + pub blue_end: &'static str, + pub bold_begin: &'static str, + pub bold_end: &'static str, +} + +pub static TTY_TERMINAL_COLOR: TerminalColor = TerminalColor { + blue_begin: "\x1b[34m", + blue_end: "\x1b[0m", + bold_begin: "\x1b[1m", + 
bold_end: "\x1b[22m", +}; + +pub static NON_TTY_TERMINAL_COLOR: TerminalColor = TerminalColor { + blue_begin: "", + blue_end: "", + bold_begin: "", + bold_end: "", +}; + +/// Terminal escape codes for colors, font weight, etc. Only enabled if stdout is a TTY. +pub fn get_colors() -> &'static TerminalColor { + if crate::utils::stdout_supports_colors() { + &TTY_TERMINAL_COLOR + } else { + &NON_TTY_TERMINAL_COLOR + } +} + /// Primitive called in yjit.rb /// Produce a string representing the disassembly for an ISEQ #[no_mangle] @@ -23,11 +51,6 @@ pub extern "C" fn rb_yjit_disasm_iseq(_ec: EcPtr, _ruby_self: VALUE, iseqw: VALU #[cfg(feature = "disasm")] { - // TODO: - //if unsafe { CLASS_OF(iseqw) != rb_cISeq } { - // return Qnil; - //} - if !yjit_enabled_p() { return Qnil; } @@ -37,56 +60,46 @@ pub extern "C" fn rb_yjit_disasm_iseq(_ec: EcPtr, _ruby_self: VALUE, iseqw: VALU // This will truncate disassembly of methods with 10k+ bytecodes. // That's a good thing - this prints to console. - let out_string = disasm_iseq_insn_range(iseq, 0, 9999); + let out_string = with_vm_lock(src_loc!(), || disasm_iseq_insn_range(iseq, 0, 9999)); return rust_str_to_ruby(&out_string); } } +/// Only call while holding the VM lock. #[cfg(feature = "disasm")] -pub fn disasm_iseq_insn_range(iseq: IseqPtr, start_idx: u32, end_idx: u32) -> String { +pub fn disasm_iseq_insn_range(iseq: IseqPtr, start_idx: u16, end_idx: u16) -> String { let mut out = String::from(""); // Get a list of block versions generated for this iseq - let mut block_list = get_or_create_iseq_block_list(iseq); + let block_list = get_or_create_iseq_block_list(iseq); + let mut block_list: Vec<&Block> = block_list.into_iter().map(|blockref| { + // SAFETY: We have the VM lock here and all the blocks on iseqs are valid. 
+ unsafe { blockref.as_ref() } + }).collect(); // Get a list of codeblocks relevant to this iseq let global_cb = crate::codegen::CodegenGlobals::get_inline_cb(); // Sort the blocks by increasing start addresses - block_list.sort_by(|a, b| { - use std::cmp::Ordering; - - // Get the start addresses for each block - let addr_a = a.borrow().get_start_addr().unwrap().raw_ptr(); - let addr_b = b.borrow().get_start_addr().unwrap().raw_ptr(); - - if addr_a < addr_b { - Ordering::Less - } else if addr_a == addr_b { - Ordering::Equal - } else { - Ordering::Greater - } - }); + block_list.sort_by_key(|block| block.get_start_addr().as_offset()); // Compute total code size in bytes for all blocks in the function let mut total_code_size = 0; for blockref in &block_list { - total_code_size += blockref.borrow().code_size(); + total_code_size += blockref.code_size(); } writeln!(out, "NUM BLOCK VERSIONS: {}", block_list.len()).unwrap(); writeln!(out, "TOTAL INLINE CODE SIZE: {} bytes", total_code_size).unwrap(); // For each block, sorted by increasing start address - for block_idx in 0..block_list.len() { - let block = block_list[block_idx].borrow(); + for (block_idx, block) in block_list.iter().enumerate() { let blockid = block.get_blockid(); if blockid.idx >= start_idx && blockid.idx < end_idx { let end_idx = block.get_end_idx(); - let start_addr = block.get_start_addr().unwrap(); - let end_addr = block.get_end_addr().unwrap(); + let start_addr = block.get_start_addr(); + let end_addr = block.get_end_addr(); let code_size = block.code_size(); // Write some info about the current block @@ -110,9 +123,9 @@ pub fn disasm_iseq_insn_range(iseq: IseqPtr, start_idx: u32, end_idx: u32) -> St // If this is not the last block if block_idx < block_list.len() - 1 { // Compute the size of the gap between this block and the next - let next_block = block_list[block_idx + 1].borrow(); - let next_start_addr = next_block.get_start_addr().unwrap(); - let gap_size = next_start_addr.into_usize() - end_addr.into_usize(); + let next_block = block_list[block_idx + 1]; + let next_start_addr = next_block.get_start_addr(); + let gap_size = next_start_addr.as_offset() - end_addr.as_offset(); // Log the size of the gap between the blocks if nonzero if gap_size > 0 { @@ -125,19 +138,21 @@ pub fn disasm_iseq_insn_range(iseq: IseqPtr, start_idx: u32, end_idx: u32) -> St return out; } -#[cfg(feature = "disasm")] +/// Dump disassembly for a range in a [CodeBlock]. VM lock required.
pub fn dump_disasm_addr_range(cb: &CodeBlock, start_addr: CodePtr, end_addr: CodePtr, dump_disasm: &DumpDisasm) { - use std::fs::File; - use std::io::Write; - for (start_addr, end_addr) in cb.writable_addrs(start_addr, end_addr) { let disasm = disasm_addr_range(cb, start_addr, end_addr); if disasm.len() > 0 { match dump_disasm { DumpDisasm::Stdout => println!("{disasm}"), - DumpDisasm::File(path) => { - let mut f = File::options().create(true).append(true).open(path).unwrap(); - f.write_all(disasm.as_bytes()).unwrap(); + DumpDisasm::File(fd) => { + use std::os::unix::io::{FromRawFd, IntoRawFd}; + use std::io::Write; + + // Write with the fd opened during boot + let mut file = unsafe { std::fs::File::from_raw_fd(*fd) }; + file.write_all(disasm.as_bytes()).unwrap(); + let _ = file.into_raw_fd(); // keep the fd open } }; } @@ -171,75 +186,181 @@ pub fn disasm_addr_range(cb: &CodeBlock, start_addr: usize, end_addr: usize) -> // Disassemble the instructions let code_size = end_addr - start_addr; let code_slice = unsafe { std::slice::from_raw_parts(start_addr as _, code_size) }; + // Stabilize output for cargo test + #[cfg(test)] + let start_addr = 0; let insns = cs.disasm_all(code_slice, start_addr as u64).unwrap(); + let colors = get_colors(); - // Colorize outlined code in blue - if cb.outlined { - write!(&mut out, "\x1b[34m").unwrap(); - } // For each instruction in this block for insn in insns.as_ref() { // Comments for this block if let Some(comment_list) = cb.comments_at(insn.address() as usize) { for comment in comment_list { - writeln!(&mut out, " \x1b[1m# {comment}\x1b[22m").unwrap(); // Make comments bold + if cb.outlined { + write!(&mut out, "{}", colors.blue_begin).unwrap(); // Make outlined code blue + } + writeln!(&mut out, " {}# {comment}{}", colors.bold_begin, colors.bold_end).unwrap(); // Make comments bold } } + if cb.outlined { + write!(&mut out, "{}", colors.blue_begin).unwrap(); // Make outlined code blue + } writeln!(&mut out, " {insn}").unwrap(); - } - // Disable blue color - if cb.outlined { - write!(&mut out, "\x1b[0m").unwrap(); + if cb.outlined { + write!(&mut out, "{}", colors.blue_end).unwrap(); // Disable blue + } } return out; } +/// Fallback version without dependency on a disassembler which prints just bytes and comments. +#[cfg(not(feature = "disasm"))] +pub fn disasm_addr_range(cb: &CodeBlock, start_addr: usize, end_addr: usize) -> String { + let mut out = String::new(); + let mut line_byte_idx = 0; + const MAX_BYTES_PER_LINE: usize = 16; + let colors = get_colors(); + + for addr in start_addr..end_addr { + if let Some(comment_list) = cb.comments_at(addr) { + // Start a new line if we're in the middle of one + if line_byte_idx != 0 { + writeln!(&mut out).unwrap(); + line_byte_idx = 0; + } + for comment in comment_list { + writeln!(&mut out, " {}# {comment}{}", colors.bold_begin, colors.bold_end).unwrap(); // Make comments bold + } + } + if line_byte_idx == 0 { + write!(&mut out, " 0x{addr:x}: ").unwrap(); + } else { + write!(&mut out, " ").unwrap(); + } + let byte = unsafe { (addr as *const u8).read() }; + write!(&mut out, "{byte:02x}").unwrap(); + line_byte_idx += 1; + if line_byte_idx == MAX_BYTES_PER_LINE - 1 { + writeln!(&mut out).unwrap(); + line_byte_idx = 0; + } + } + + if !out.is_empty() { + writeln!(&mut out).unwrap(); + } + + out +} + +/// Assert that CodeBlock has the code specified with hex. In addition, if tested with +/// `cargo test --all-features`, it also checks it generates the specified disasm. +#[cfg(test)] +macro_rules! 
assert_disasm { + ($cb:expr, $hex:expr, $disasm:expr) => { + #[cfg(feature = "disasm")] + { + let disasm = disasm_addr_range( + &$cb, + $cb.get_ptr(0).raw_addr(&$cb), + $cb.get_write_ptr().raw_addr(&$cb), + ); + assert_eq!(unindent(&disasm, false), unindent(&$disasm, true)); + } + assert_eq!(format!("{:x}", $cb), $hex); + }; +} +#[cfg(test)] +pub(crate) use assert_disasm; + +/// Remove the minimum indent from every line, skipping the first line if `skip_first`. +#[cfg(all(feature = "disasm", test))] +pub fn unindent(string: &str, trim_lines: bool) -> String { + fn split_lines(string: &str) -> Vec<String> { + let mut result: Vec<String> = vec![]; + let mut buf: Vec<u8> = vec![]; + for byte in string.as_bytes().iter() { + buf.push(*byte); + if *byte == b'\n' { + result.push(String::from_utf8(buf).unwrap()); + buf = vec![]; + } + } + if !buf.is_empty() { + result.push(String::from_utf8(buf).unwrap()); + } + result + } + + // Break up a string into multiple lines + let mut lines = split_lines(string); + if trim_lines { // raw string literals come with extra lines + lines.remove(0); + lines.remove(lines.len() - 1); + } + + // Count the minimum number of spaces + let spaces = lines.iter().filter_map(|line| { + for (i, ch) in line.as_bytes().iter().enumerate() { + if *ch != b' ' { + return Some(i); + } + } + None + }).min().unwrap_or(0); + + // Join lines, removing spaces + let mut unindented: Vec<u8> = vec![]; + for line in lines.iter() { + if line.len() > spaces { + unindented.extend_from_slice(&line.as_bytes()[spaces..]); + } else { + unindented.extend_from_slice(&line.as_bytes()); + } + } + String::from_utf8(unindented).unwrap() +} + /// Primitive called in yjit.rb /// Produce a list of instructions compiled for an isew #[no_mangle] pub extern "C" fn rb_yjit_insns_compiled(_ec: EcPtr, _ruby_self: VALUE, iseqw: VALUE) -> VALUE { - { - // TODO: - //if unsafe { CLASS_OF(iseqw) != rb_cISeq } { - // return Qnil; - //} - - if !yjit_enabled_p() { - return Qnil; - } - - // Get the iseq pointer from the wrapper - let iseq = unsafe { rb_iseqw_to_iseq(iseqw) }; + if !yjit_enabled_p() { + return Qnil; + } - // Get the list of instructions compiled - let insn_vec = insns_compiled(iseq); + // Get the iseq pointer from the wrapper + let iseq = unsafe { rb_iseqw_to_iseq(iseqw) }; - unsafe { - let insn_ary = rb_ary_new_capa((insn_vec.len() * 2) as i64); + // Get the list of instructions compiled + let insn_vec = insns_compiled(iseq); - // For each instruction compiled - for idx in 0..insn_vec.len() { - let op_name = &insn_vec[idx].0; - let insn_idx = insn_vec[idx].1; + unsafe { + let insn_ary = rb_ary_new_capa((insn_vec.len() * 2) as i64); - let op_sym = rust_str_to_sym(&op_name); + // For each instruction compiled + for idx in 0..insn_vec.len() { + let op_name = &insn_vec[idx].0; + let insn_idx = insn_vec[idx].1; - // Store the instruction index and opcode symbol - rb_ary_store( - insn_ary, - (2 * idx + 0) as i64, - VALUE::fixnum_from_usize(insn_idx as usize), - ); - rb_ary_store(insn_ary, (2 * idx + 1) as i64, op_sym); - } + let op_sym = rust_str_to_sym(&op_name); - insn_ary + // Store the instruction index and opcode symbol + rb_ary_store( + insn_ary, + (2 * idx + 0) as i64, + VALUE::fixnum_from_usize(insn_idx as usize), + ); + rb_ary_store(insn_ary, (2 * idx + 1) as i64, op_sym); } + + insn_ary } } -fn insns_compiled(iseq: IseqPtr) -> Vec<(String, u32)> { +fn insns_compiled(iseq: IseqPtr) -> Vec<(String, u16)> { let mut insn_vec = Vec::new(); // Get a list of block versions generated for this iseq @@ 
-247,16 +368,18 @@ fn insns_compiled(iseq: IseqPtr) -> Vec<(String, u32)> { // For each block associated with this iseq for blockref in &block_list { - let block = blockref.borrow(); + // SAFETY: Called as part of a Ruby method, which ensures the graph is + // well connected for the given iseq. + let block = unsafe { blockref.as_ref() }; let start_idx = block.get_blockid().idx; let end_idx = block.get_end_idx(); - assert!(end_idx <= unsafe { get_iseq_encoded_size(iseq) }); + assert!(u32::from(end_idx) <= unsafe { get_iseq_encoded_size(iseq) }); // For each YARV instruction in the block let mut insn_idx = start_idx; while insn_idx < end_idx { // Get the current pc and opcode - let pc = unsafe { rb_iseq_pc_at_idx(iseq, insn_idx) }; + let pc = unsafe { rb_iseq_pc_at_idx(iseq, insn_idx.into()) }; // try_into() call below is unfortunate. Maybe pick i32 instead of usize for opcodes. let opcode: usize = unsafe { rb_iseq_opcode_at_pc(iseq, pc) } .try_into() @@ -269,7 +392,7 @@ fn insns_compiled(iseq: IseqPtr) -> Vec<(String, u32)> { insn_vec.push((op_name, insn_idx)); // Move to the next instruction - insn_idx += insn_len(opcode); + insn_idx += insn_len(opcode) as u16; } } diff --git a/yjit/src/invariants.rs b/yjit/src/invariants.rs index 734b32c464..0f22fba6b8 100644 --- a/yjit/src/invariants.rs +++ b/yjit/src/invariants.rs @@ -1,23 +1,23 @@ //! Code to track assumptions made during code generation and invalidate //! generated code if and when these assumptions are invalidated. -use crate::asm::OutlinedCb; +use crate::backend::ir::Assembler; use crate::codegen::*; use crate::core::*; use crate::cruby::*; -use crate::options::*; use crate::stats::*; use crate::utils::IntoUsize; use crate::yjit::yjit_enabled_p; use std::collections::{HashMap, HashSet}; +use std::os::raw::c_void; use std::mem; // Invariants to track: // assume_bop_not_redefined(jit, INTEGER_REDEFINED_OP_FLAG, BOP_PLUS) // assume_method_lookup_stable(comptime_recv_klass, cme, jit); -// assume_single_ractor_mode(jit) -// assume_stable_global_constant_state(jit); +// assume_single_ractor_mode() +// track_stable_constant_names_assumption() /// Used to track all of the various block references that contain assumptions /// about the state of the virtual machine. @@ -30,7 +30,6 @@ pub struct Invariants { /// quick access to all of the blocks that are making this assumption when /// the operator is redefined. basic_operator_blocks: HashMap<(RedefinitionFlag, ruby_basic_operators), HashSet<BlockRef>>, - /// A map from a block to a set of classes and their associated basic /// operators that the block is assuming are not redefined. This is used for /// quick access to all of the assumptions that a block is making when it @@ -48,10 +47,23 @@ pub struct Invariants { /// a constant `A::B` is redefined, then all blocks that are assuming that /// `A` and `B` have not be redefined must be invalidated. constant_state_blocks: HashMap<ID, HashSet<BlockRef>>, - /// A map from a block to a set of IDs that it is assuming have not been /// redefined. block_constant_states: HashMap<BlockRef, HashSet<ID>>, + + /// A map from a class to a set of blocks that assume objects of the class + /// will have no singleton class. When the set is empty, it means that + /// there has been a singleton class for the class after boot, so you cannot + /// assume no singleton class going forward. + /// For now, the key can be only Array, Hash, or String. 
Consider making + /// an inverted HashMap if we start using this for user-defined classes + /// to maintain the performance of block_assumptions_free(). + no_singleton_classes: HashMap<VALUE, HashSet<BlockRef>>, + + /// A map from an ISEQ to a set of blocks that assume base pointer is equal + /// to environment pointer. When the set is empty, it means that EP has been + /// escaped in the ISEQ. + no_ep_escape_iseqs: HashMap<IseqPtr, HashSet<BlockRef>>, } /// Private singleton instance of the invariants global struct. @@ -68,6 +80,8 @@ impl Invariants { single_ractor: HashSet::new(), constant_state_blocks: HashMap::new(), block_constant_states: HashMap::new(), + no_singleton_classes: HashMap::new(), + no_ep_escape_iseqs: HashMap::new(), }); } } @@ -78,29 +92,20 @@ impl Invariants { } } -/// A public function that can be called from within the code generation -/// functions to ensure that the block being generated is invalidated when the -/// basic operator is redefined. +/// Mark the pending block as assuming that certain basic operators (e.g. Integer#==) +/// have not been redefined. +#[must_use] pub fn assume_bop_not_redefined( jit: &mut JITState, - ocb: &mut OutlinedCb, + asm: &mut Assembler, klass: RedefinitionFlag, bop: ruby_basic_operators, ) -> bool { if unsafe { BASIC_OP_UNREDEFINED_P(bop, klass) } { - jit_ensure_block_entry_exit(jit, ocb); - - let invariants = Invariants::get_instance(); - invariants - .basic_operator_blocks - .entry((klass, bop)) - .or_default() - .insert(jit.get_block()); - invariants - .block_basic_operators - .entry(jit.get_block()) - .or_default() - .insert((klass, bop)); + if jit_ensure_block_entry_exit(jit, asm).is_none() { + return false; + } + jit.bop_assumptions.push((klass, bop)); return true; } else { @@ -108,30 +113,75 @@ pub fn assume_bop_not_redefined( } } -// Remember that a block assumes that -// `rb_callable_method_entry(receiver_klass, cme->called_id) == cme` and that -// `cme` is valid. -// When either of these assumptions becomes invalid, rb_yjit_method_lookup_change() or -// rb_yjit_cme_invalidate() invalidates the block. -// -// @raise NoMemoryError -pub fn assume_method_lookup_stable( - jit: &mut JITState, - ocb: &mut OutlinedCb, +/// Track that a block is only valid when a certain basic operator has not been redefined +/// since the block's inception. +pub fn track_bop_assumption(uninit_block: BlockRef, bop: (RedefinitionFlag, ruby_basic_operators)) { + let invariants = Invariants::get_instance(); + invariants + .basic_operator_blocks + .entry(bop) + .or_default() + .insert(uninit_block); + invariants + .block_basic_operators + .entry(uninit_block) + .or_default() + .insert(bop); +} + +/// Track that a block will assume that `cme` is valid (false == METHOD_ENTRY_INVALIDATED(cme)). +/// [rb_yjit_cme_invalidate] invalidates the block when `cme` is invalidated. +pub fn track_method_lookup_stability_assumption( + uninit_block: BlockRef, callee_cme: *const rb_callable_method_entry_t, ) { - jit_ensure_block_entry_exit(jit, ocb); - - let block = jit.get_block(); - block - .borrow_mut() - .add_cme_dependency(callee_cme); - Invariants::get_instance() .cme_validity .entry(callee_cme) .or_default() - .insert(block); + .insert(uninit_block); +} + +/// Track that a block will assume that `klass` objects will have no singleton class. 
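+/// Once the set tracked for `klass` is emptied by invalidation, has_singleton_class_of()
+/// starts returning true and newly compiled blocks no longer make this assumption.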
+pub fn track_no_singleton_class_assumption(uninit_block: BlockRef, klass: VALUE) { + Invariants::get_instance() + .no_singleton_classes + .entry(klass) + .or_default() + .insert(uninit_block); +} + +/// Returns true if we've seen a singleton class of a given class since boot. +pub fn has_singleton_class_of(klass: VALUE) -> bool { + Invariants::get_instance() + .no_singleton_classes + .get(&klass) + .map_or(false, |blocks| blocks.is_empty()) +} + +/// Track that a block will assume that base pointer is equal to environment pointer. +pub fn track_no_ep_escape_assumption(uninit_block: BlockRef, iseq: IseqPtr) { + Invariants::get_instance() + .no_ep_escape_iseqs + .entry(iseq) + .or_default() + .insert(uninit_block); +} + +/// Returns true if a given ISEQ has previously escaped an environment. +pub fn iseq_escapes_ep(iseq: IseqPtr) -> bool { + Invariants::get_instance() + .no_ep_escape_iseqs + .get(&iseq) + .map_or(false, |blocks| blocks.is_empty()) +} + +/// Forget an ISEQ remembered in invariants +pub fn iseq_free_invariants(iseq: IseqPtr) { + if unsafe { INVARIANTS.is_none() } { + return; + } + Invariants::get_instance().no_ep_escape_iseqs.remove(&iseq); } // Checks rb_method_basic_definition_p and registers the current block for invalidation if method @@ -140,13 +190,13 @@ pub fn assume_method_lookup_stable( // default behavior. pub fn assume_method_basic_definition( jit: &mut JITState, - ocb: &mut OutlinedCb, + asm: &mut Assembler, klass: VALUE, mid: ID - ) -> bool { +) -> bool { if unsafe { rb_method_basic_definition_p(klass, mid) } != 0 { let cme = unsafe { rb_callable_method_entry(klass, mid) }; - assume_method_lookup_stable(jit, ocb, cme); + jit.assume_method_lookup_stable(asm, cme); true } else { false @@ -155,30 +205,34 @@ pub fn assume_method_basic_definition( /// Tracks that a block is assuming it is operating in single-ractor mode. #[must_use] -pub fn assume_single_ractor_mode(jit: &mut JITState, ocb: &mut OutlinedCb) -> bool { - if unsafe { rb_yjit_multi_ractor_p() } { +pub fn assume_single_ractor_mode(jit: &mut JITState, asm: &mut Assembler) -> bool { + if unsafe { rb_jit_multi_ractor_p() } { false } else { - jit_ensure_block_entry_exit(jit, ocb); - Invariants::get_instance() - .single_ractor - .insert(jit.get_block()); + if jit_ensure_block_entry_exit(jit, asm).is_none() { + return false; + } + jit.block_assumes_single_ractor = true; + true } } -/// Walk through the ISEQ to go from the current opt_getinlinecache to the -/// subsequent opt_setinlinecache and find all of the name components that are -/// associated with this constant (which correspond to the getconstant -/// arguments). -pub fn assume_stable_constant_names(jit: &mut JITState, ocb: &mut OutlinedCb, idlist: *const ID) { - /// Tracks that a block is assuming that the name component of a constant - /// has not changed since the last call to this function. +/// Track that the block will assume single ractor mode. +pub fn track_single_ractor_assumption(uninit_block: BlockRef) { + Invariants::get_instance() + .single_ractor + .insert(uninit_block); +} + +/// Track that a block will assume that the name components of a constant path expression +/// has not changed since the block's full initialization. 
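+/// `idlist` is a NULL-terminated list; ID!(NULL) entries (used for the `::` prefix) are skipped.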
+pub fn track_stable_constant_names_assumption(uninit_block: BlockRef, idlist: *const ID) { fn assume_stable_constant_name( - jit: &mut JITState, + uninit_block: BlockRef, id: ID, ) { - if id == idNULL as u64 { + if id == ID!(NULL) { // Used for :: prefix return; } @@ -188,10 +242,10 @@ pub fn assume_stable_constant_names(jit: &mut JITState, ocb: &mut OutlinedCb, id .constant_state_blocks .entry(id) .or_default() - .insert(jit.get_block()); + .insert(uninit_block); invariants .block_constant_states - .entry(jit.get_block()) + .entry(uninit_block) .or_default() .insert(id); } @@ -200,12 +254,9 @@ pub fn assume_stable_constant_names(jit: &mut JITState, ocb: &mut OutlinedCb, id for i in 0.. { match unsafe { *idlist.offset(i) } { 0 => break, // End of NULL terminated list - id => assume_stable_constant_name(jit, id), + id => assume_stable_constant_name(uninit_block, id), } } - - jit_ensure_block_entry_exit(jit, ocb); - } /// Called when a basic operator is redefined. Note that all the blocks assuming @@ -252,7 +303,7 @@ pub extern "C" fn rb_yjit_cme_invalidate(callee_cme: *const rb_callable_method_e }); } -/// Callback for then Ruby is about to spawn a ractor. In that case we need to +/// Callback for when Ruby is about to spawn a ractor. In that case we need to /// invalidate every block that is assuming single ractor mode. #[no_mangle] pub extern "C" fn rb_yjit_before_ractor_spawn() { @@ -282,32 +333,11 @@ pub extern "C" fn rb_yjit_constant_state_changed(id: ID) { } with_vm_lock(src_loc!(), || { - if get_option!(global_constant_state) { - // If the global-constant-state option is set, then we're going to - // invalidate every block that depends on any constant. - - Invariants::get_instance() - .constant_state_blocks - .keys() - .for_each(|id| { - if let Some(blocks) = - Invariants::get_instance().constant_state_blocks.remove(&id) - { - for block in &blocks { - invalidate_block_version(block); - incr_counter!(invalidate_constant_state_bump); - } - } - }); - } else { - // If the global-constant-state option is not set, then we're only going - // to invalidate the blocks that are associated with the given ID. - - if let Some(blocks) = Invariants::get_instance().constant_state_blocks.remove(&id) { - for block in &blocks { - invalidate_block_version(block); - incr_counter!(invalidate_constant_state_bump); - } + // Invalidate the blocks that are associated with the given ID. + if let Some(blocks) = Invariants::get_instance().constant_state_blocks.remove(&id) { + for block in &blocks { + invalidate_block_version(block); + incr_counter!(invalidate_constant_state_bump); } } }); @@ -327,7 +357,7 @@ pub extern "C" fn rb_yjit_root_mark() { // Why not let the GC move the cme keys in this table? // Because this is basically a compare_by_identity Hash. // If a key moves, we would need to reinsert it into the table so it is rehashed. - // That is tricky to do, espcially as it could trigger allocation which could + // That is tricky to do, especially as it could trigger allocation which could // trigger GC. Not sure if it is okay to trigger GC while the GC is updating // references. 
// @@ -344,21 +374,41 @@ pub extern "C" fn rb_yjit_root_mark() { } } +#[no_mangle] +pub extern "C" fn rb_yjit_root_update_references() { + if unsafe { INVARIANTS.is_none() } { + return; + } + let no_ep_escape_iseqs = &mut Invariants::get_instance().no_ep_escape_iseqs; + + // Make a copy of the table with updated ISEQ keys + let mut updated_copy = HashMap::with_capacity(no_ep_escape_iseqs.len()); + for (iseq, blocks) in mem::take(no_ep_escape_iseqs) { + let new_iseq = unsafe { rb_gc_location(iseq.into()) }.as_iseq(); + updated_copy.insert(new_iseq, blocks); + } + + *no_ep_escape_iseqs = updated_copy; +} + /// Remove all invariant assumptions made by the block by removing the block as /// as a key in all of the relevant tables. -pub fn block_assumptions_free(blockref: &BlockRef) { +/// For safety, the block has to be initialized and the vm lock must be held. +/// However, outgoing/incoming references to the block does _not_ need to be valid. +pub fn block_assumptions_free(blockref: BlockRef) { let invariants = Invariants::get_instance(); { - let block = blockref.borrow(); + // SAFETY: caller ensures that this reference is valid + let block = unsafe { blockref.as_ref() }; // For each method lookup dependency for dep in block.iter_cme_deps() { // Remove tracking for cme validity - if let Some(blockset) = invariants.cme_validity.get_mut(dep) { - blockset.remove(blockref); + if let Some(blockset) = invariants.cme_validity.get_mut(&dep) { + blockset.remove(&blockref); if blockset.is_empty() { - invariants.cme_validity.remove(dep); + invariants.cme_validity.remove(&dep); } } } @@ -411,19 +461,41 @@ pub fn block_assumptions_free(blockref: &BlockRef) { if invariants.constant_state_blocks.is_empty() { invariants.constant_state_blocks.shrink_to_fit(); } + + // Remove tracking for blocks assuming no singleton class + // NOTE: no_singleton_class has up to 3 keys (Array, Hash, or String) for now. + // This is effectively an O(1) access unless we start using it for more classes. + for (_, blocks) in invariants.no_singleton_classes.iter_mut() { + blocks.remove(&blockref); + } + + // Remove tracking for blocks assuming EP doesn't escape + let iseq = unsafe { blockref.as_ref() }.get_blockid().iseq; + if let Some(blocks) = invariants.no_ep_escape_iseqs.get_mut(&iseq) { + blocks.remove(&blockref); + } } /// Callback from the opt_setinlinecache instruction in the interpreter. /// Invalidate the block for the matching opt_getinlinecache so it could regenerate code /// using the new value in the constant cache. #[no_mangle] -pub extern "C" fn rb_yjit_constant_ic_update(iseq: *const rb_iseq_t, ic: IC, insn_idx: u32) { +pub extern "C" fn rb_yjit_constant_ic_update(iseq: *const rb_iseq_t, ic: IC, insn_idx: std::os::raw::c_uint) { // If YJIT isn't enabled, do nothing if !yjit_enabled_p() { return; } - if !unsafe { (*(*ic).entry).ic_cref }.is_null() || unsafe { rb_yjit_multi_ractor_p() } { + // Try to downcast the iseq index + let insn_idx: IseqIdx = if let Ok(idx) = insn_idx.try_into() { + idx + } else { + // The index is too large, YJIT can't possibly have code for it, + // so there is nothing to invalidate. + return; + }; + + if !unsafe { (*(*ic).entry).ic_cref }.is_null() || unsafe { rb_jit_multi_ractor_p() } { // We can't generate code in these situations, so no need to invalidate. // See gen_opt_getinlinecache. 
return; @@ -435,7 +507,7 @@ pub extern "C" fn rb_yjit_constant_ic_update(iseq: *const rb_iseq_t, ic: IC, ins // This should come from a running iseq, so direct threading translation // should have been done assert!(unsafe { FL_TEST(iseq.into(), VALUE(ISEQ_TRANSLATED)) } != VALUE(0)); - assert!(insn_idx < unsafe { get_iseq_encoded_size(iseq) }); + assert!(u32::from(insn_idx) < unsafe { get_iseq_encoded_size(iseq) }); // Ensure that the instruction the insn_idx is pointing to is in // fact a opt_getconstant_path instruction. @@ -468,6 +540,66 @@ pub extern "C" fn rb_yjit_constant_ic_update(iseq: *const rb_iseq_t, ic: IC, ins }); } +/// Invalidate blocks that assume objects of a given class will have no singleton class. +#[no_mangle] +pub extern "C" fn rb_yjit_invalidate_no_singleton_class(klass: VALUE) { + // Skip tracking singleton classes during boot. Such objects already have a singleton class + // before entering JIT code, so they get rejected when they're checked for the first time. + if unsafe { INVARIANTS.is_none() } { + return; + } + + // We apply this optimization only to Array, Hash, and String for now. + if unsafe { [rb_cArray, rb_cHash, rb_cString].contains(&klass) } { + with_vm_lock(src_loc!(), || { + let no_singleton_classes = &mut Invariants::get_instance().no_singleton_classes; + match no_singleton_classes.get_mut(&klass) { + Some(blocks) => { + // Invalidate existing blocks and let has_singleton_class_of() + // return true when they are compiled again + for block in mem::take(blocks) { + invalidate_block_version(&block); + incr_counter!(invalidate_no_singleton_class); + } + } + None => { + // Let has_singleton_class_of() return true for this class + no_singleton_classes.insert(klass, HashSet::new()); + } + } + }); + } +} + +/// Invalidate blocks for a given ISEQ that assumes environment pointer is +/// equal to base pointer. +#[no_mangle] +pub extern "C" fn rb_yjit_invalidate_ep_is_bp(iseq: IseqPtr) { + // Skip tracking EP escapes on boot. We don't need to invalidate anything during boot. + if unsafe { INVARIANTS.is_none() } { + return; + } + + with_vm_lock(src_loc!(), || { + // If an EP escape for this ISEQ is detected for the first time, invalidate all blocks + // associated to the ISEQ. + let no_ep_escape_iseqs = &mut Invariants::get_instance().no_ep_escape_iseqs; + match no_ep_escape_iseqs.get_mut(&iseq) { + Some(blocks) => { + // Invalidate existing blocks and make jit.ep_is_bp() return false + for block in mem::take(blocks) { + invalidate_block_version(&block); + incr_counter!(invalidate_ep_escape); + } + } + None => { + // Let jit.ep_is_bp() return false for this ISEQ + no_ep_escape_iseqs.insert(iseq, HashSet::new()); + } + } + }); +} + // Invalidate all generated code and patch C method return code to contain // logic for firing the c_return TracePoint event. Once rb_vm_barrier() // returns, all other ractors are pausing inside RB_VM_LOCK_ENTER(), which @@ -494,6 +626,8 @@ pub extern "C" fn rb_yjit_tracing_invalidate_all() { return; } + incr_counter!(invalidate_everything); + // Stop other ractors since we are going to patch machine code. 
with_vm_lock(src_loc!(), || { // Make it so all live block versions are no longer valid branch targets @@ -508,17 +642,18 @@ pub extern "C" fn rb_yjit_tracing_invalidate_all() { if on_stack_iseqs.contains(&iseq) { // This ISEQ is running, so we can't free blocks immediately for block in blocks { - delayed_deallocation(&block); + delayed_deallocation(block); } payload.dead_blocks.shrink_to_fit(); } else { // Safe to free dead blocks since the ISEQ isn't running + // Since we're freeing _all_ blocks, we don't need to keep the graph well formed for block in blocks { - free_block(&block); + unsafe { free_block(block, false) }; } mem::take(&mut payload.dead_blocks) - .iter() - .for_each(free_block); + .into_iter() + .for_each(|block| unsafe { free_block(block, false) }); } } @@ -528,37 +663,44 @@ pub extern "C" fn rb_yjit_tracing_invalidate_all() { let cb = CodegenGlobals::get_inline_cb(); + // Prevent on-stack frames from jumping to the caller on jit_exec_exception + extern "C" { + fn rb_yjit_cancel_jit_return(leave_exit: *mut c_void, leave_exception: *mut c_void) -> VALUE; + } + unsafe { + rb_yjit_cancel_jit_return( + CodegenGlobals::get_leave_exit_code().raw_ptr(cb) as _, + CodegenGlobals::get_leave_exception_code().raw_ptr(cb) as _, + ); + } + // Apply patches let old_pos = cb.get_write_pos(); let old_dropped_bytes = cb.has_dropped_bytes(); let mut patches = CodegenGlobals::take_global_inval_patches(); - patches.sort_by_cached_key(|patch| patch.inline_patch_pos.raw_ptr()); + patches.sort_by_cached_key(|patch| patch.inline_patch_pos.raw_ptr(cb)); let mut last_patch_end = std::ptr::null(); for patch in &patches { - assert!(last_patch_end <= patch.inline_patch_pos.raw_ptr(), "patches should not overlap"); - - let mut asm = crate::backend::ir::Assembler::new(); - asm.jmp(patch.outlined_target_pos.as_side_exit()); + let patch_pos = patch.inline_patch_pos.raw_ptr(cb); + assert!( + last_patch_end <= patch_pos, + "patches should not overlap (last_patch_end: {last_patch_end:?}, patch_pos: {patch_pos:?})", + ); cb.set_write_ptr(patch.inline_patch_pos); cb.set_dropped_bytes(false); - asm.compile(cb); - last_patch_end = cb.get_write_ptr().raw_ptr(); + cb.without_page_end_reserve(|cb| { + let mut asm = crate::backend::ir::Assembler::new_without_iseq(); + asm.jmp(patch.outlined_target_pos.as_side_exit()); + if asm.compile(cb, None).is_none() { + panic!("Failed to apply patch at {:?}", patch.inline_patch_pos); + } + }); + last_patch_end = cb.get_write_ptr().raw_ptr(cb); } cb.set_pos(old_pos); cb.set_dropped_bytes(old_dropped_bytes); - // Freeze invalidated part of the codepage. We only want to wait for - // running instances of the code to exit from now on, so we shouldn't - // change the code. There could be other ractors sleeping in - // branch_stub_hit(), for example. We could harden this by changing memory - // protection on the frozen range. - assert!( - CodegenGlobals::get_inline_frozen_bytes() <= old_pos, - "frozen bytes should increase monotonically" - ); - CodegenGlobals::set_inline_frozen_bytes(old_pos); - CodegenGlobals::get_outlined_cb() .unwrap() .mark_all_executable(); diff --git a/yjit/src/lib.rs b/yjit/src/lib.rs index ce87cc250a..f3247fbf1a 100644 --- a/yjit/src/lib.rs +++ b/yjit/src/lib.rs @@ -3,8 +3,20 @@ #![allow(clippy::too_many_arguments)] // :shrug: #![allow(clippy::identity_op)] // Sometimes we do it for style +// TODO(alan): This lint is right -- the way we use `static mut` is UB happy. 
We have many globals +// and take `&mut` frequently, sometimes with a method that easily allows calling it twice. +// +// All of our globals rely on us running single threaded, which outside of boot-time relies on the +// VM lock (which signals and waits for all other threads to pause). To fix this properly, we should +// gather up all the globals into a struct to centralize the safety reasoning. That way we can also +// check for re-entrance in one place. +// +// We're too close to release to do that, though, so disable the lint for now. +#![allow(unknown_lints)] +#![allow(static_mut_refs)] +#![warn(unknown_lints)] -mod asm; +pub mod asm; mod backend; mod codegen; mod core; @@ -16,3 +28,4 @@ mod stats; mod utils; mod yjit; mod virtualmem; +mod log; diff --git a/yjit/src/log.rs b/yjit/src/log.rs new file mode 100644 index 0000000000..c5a724f7e1 --- /dev/null +++ b/yjit/src/log.rs @@ -0,0 +1,179 @@ +use crate::core::BlockId; +use crate::cruby::*; +use crate::options::*; +use crate::yjit::yjit_enabled_p; + +use std::fmt::{Display, Formatter}; +use std::os::raw::c_long; +use crate::utils::iseq_get_location; + +type Timestamp = f64; + +#[derive(Clone, Debug)] +pub struct LogEntry { + /// The time when the block was compiled. + pub timestamp: Timestamp, + + /// The log message. + pub message: String, +} + +impl Display for LogEntry { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{:15.6}: {}", self.timestamp, self.message) + } +} + +pub type Log = CircularBuffer<LogEntry, 1024>; +static mut LOG: Option<Log> = None; + +impl Log { + pub fn init() { + unsafe { + LOG = Some(Log::new()); + } + } + + pub fn get_instance() -> &'static mut Log { + unsafe { + LOG.as_mut().unwrap() + } + } + + pub fn has_instance() -> bool { + unsafe { + LOG.as_mut().is_some() + } + } + + pub fn add_block_with_chain_depth(block_id: BlockId, chain_depth: u8) { + if !Self::has_instance() { + return; + } + + let print_log = get_option!(log); + let timestamp = std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH).unwrap().as_secs_f64(); + + let location = iseq_get_location(block_id.iseq, block_id.idx); + let index = block_id.idx; + let message = if chain_depth > 0 { + format!("{} (index: {}, chain_depth: {})", location, index, chain_depth) + } else { + format!("{} (index: {})", location, index) + }; + + let entry = LogEntry { + timestamp, + message + }; + + if let Some(output) = print_log { + match output { + LogOutput::Stderr => { + eprintln!("{}", entry); + } + + LogOutput::File(fd) => { + use std::os::unix::io::{FromRawFd, IntoRawFd}; + use std::io::Write; + + // Write with the fd opened during boot + let mut file = unsafe { std::fs::File::from_raw_fd(fd) }; + writeln!(file, "{}", entry).unwrap(); + file.flush().unwrap(); + let _ = file.into_raw_fd(); // keep the fd open + } + + LogOutput::MemoryOnly => () // Don't print or write anything + } + } + + Self::get_instance().push(entry); + } +} + +pub struct CircularBuffer<T, const N: usize> { + buffer: Vec<Option<T>>, + head: usize, + tail: usize, + size: usize +} + +impl<T: Clone, const N: usize> CircularBuffer<T, N> { + pub fn new() -> Self { + Self { + buffer: vec![None; N], + head: 0, + tail: 0, + size: 0 + } + } + + pub fn push(&mut self, value: T) { + self.buffer[self.head] = Some(value); + self.head = (self.head + 1) % N; + if self.size == N { + self.tail = (self.tail + 1) % N; + } else { + self.size += 1; + } + } + + pub fn pop(&mut self) -> Option<T> { + if self.size == 0 { + return None; + } + + let value = 
self.buffer[self.tail].take(); + self.tail = (self.tail + 1) % N; + self.size -= 1; + value + } + + pub fn len(&self) -> usize { + self.size + } +} + + +//=========================================================================== + +/// Primitive called in yjit.rb +/// Check if log generation is enabled +#[no_mangle] +pub extern "C" fn rb_yjit_log_enabled_p(_ec: EcPtr, _ruby_self: VALUE) -> VALUE { + if get_option!(log).is_some() { + return Qtrue; + } else { + return Qfalse; + } +} + +/// Primitive called in yjit.rb. +/// Export all YJIT log entries as a Ruby array. +#[no_mangle] +pub extern "C" fn rb_yjit_get_log(_ec: EcPtr, _ruby_self: VALUE) -> VALUE { + with_vm_lock(src_loc!(), || rb_yjit_get_log_array()) +} + +fn rb_yjit_get_log_array() -> VALUE { + if !yjit_enabled_p() || get_option!(log).is_none() { + return Qnil; + } + + let log = Log::get_instance(); + let array = unsafe { rb_ary_new_capa(log.len() as c_long) }; + + while log.len() > 0 { + let entry = log.pop().unwrap(); + + unsafe { + let entry_array = rb_ary_new_capa(2); + rb_ary_push(entry_array, rb_float_new(entry.timestamp)); + rb_ary_push(entry_array, entry.message.into()); + rb_ary_push(array, entry_array); + } + } + + return array; +} diff --git a/yjit/src/options.rs b/yjit/src/options.rs index e720c33b0b..c87a436091 100644 --- a/yjit/src/options.rs +++ b/yjit/src/options.rs @@ -1,19 +1,40 @@ -use std::ffi::CStr; +use std::{ffi::{CStr, CString}, ptr::null, fs::File}; +use crate::{backend::current::TEMP_REGS, cruby::*, stats::Counter}; +use std::os::raw::{c_char, c_int, c_uint}; + +// Call threshold for small deployments and command-line apps +pub static SMALL_CALL_THRESHOLD: u64 = 30; + +// Call threshold for larger deployments and production-sized applications +pub static LARGE_CALL_THRESHOLD: u64 = 120; + +// Number of live ISEQs after which we consider an app to be large +pub static LARGE_ISEQ_COUNT: u64 = 40_000; + +// This option is exposed to the C side in a global variable for performance, see vm.c +// Number of method calls after which to start generating code +// Threshold==1 means compile on first execution +#[no_mangle] +pub static mut rb_yjit_call_threshold: u64 = SMALL_CALL_THRESHOLD; + +// This option is exposed to the C side in a global variable for performance, see vm.c +// Number of execution requests after which a method is no longer +// considered hot. Raising this results in more generated code. +#[no_mangle] +pub static mut rb_yjit_cold_threshold: u64 = 200_000; // Command-line options -#[derive(Clone, PartialEq, Eq, Debug)] +#[derive(Debug)] #[repr(C)] pub struct Options { - // Size of the executable memory block to allocate in bytes - // Note that the command line argument is expressed in MiB and not bytes - pub exec_mem_size: usize, - - // Number of method calls after which to start generating code - // Threshold==1 means compile on first execution - pub call_threshold: usize, + /// Soft limit of all memory used by YJIT in bytes + /// VirtualMem avoids allocating new pages if code_region_size + yjit_alloc_size + /// is larger than this threshold. Rust may still allocate memory beyond this limit. 
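+    /// The corresponding --yjit-mem-size argument is given in MiB and converted to bytes when parsed.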
+ pub mem_size: usize, - // Generate versions greedily until the limit is hit - pub greedy_versioning: bool, + /// Hard limit of the executable memory block to allocate in bytes + /// Note that the command line argument is expressed in MiB and not bytes + pub exec_mem_size: Option<usize>, // Disable the propagation of type information pub no_type_prop: bool, @@ -22,11 +43,27 @@ pub struct Options { // 1 means always create generic versions pub max_versions: usize, - // Capture and print out stats + // The number of registers allocated for stack temps + pub num_temp_regs: usize, + + // Disable Ruby builtin methods defined by `with_jit` hooks, e.g. Array#each in Ruby + pub c_builtin: bool, + + // Capture stats pub gen_stats: bool, + // Print stats on exit (when gen_stats is also true) + pub print_stats: bool, + // Trace locations of exits - pub gen_trace_exits: bool, + pub trace_exits: Option<TraceExits>, + + // how often to sample exit trace data + pub trace_exits_sample_rate: usize, + + // Whether to enable YJIT at boot. This option prevents other + // YJIT tuning options from enabling YJIT at boot. + pub disable: bool, /// Dump compiled and executed instructions for debugging pub dump_insns: bool, @@ -40,35 +77,91 @@ pub struct Options { /// Verify context objects (debug mode only) pub verify_ctx: bool, - /// Whether or not to assume a global constant state (and therefore - /// invalidating code whenever any constant changes) versus assuming - /// constant name components (and therefore invalidating code whenever a - /// matching name component changes) - pub global_constant_state: bool, + /// Enable generating frame pointers (for x86. arm64 always does this) + pub frame_pointer: bool, + + /// Run code GC when exec_mem_size is reached. + pub code_gc: bool, + + /// Enable writing /tmp/perf-{pid}.map for Linux perf + pub perf_map: Option<PerfMap>, + + // Where to store the log. `None` disables the log. + pub log: Option<LogOutput>, } // Initialize the options to default values pub static mut OPTIONS: Options = Options { - exec_mem_size: 64 * 1024 * 1024, - call_threshold: 30, - greedy_versioning: false, + mem_size: 128 * 1024 * 1024, + exec_mem_size: None, no_type_prop: false, max_versions: 4, + num_temp_regs: 5, + c_builtin: false, gen_stats: false, - gen_trace_exits: false, + trace_exits: None, + print_stats: true, + trace_exits_sample_rate: 0, + disable: false, dump_insns: false, dump_disasm: None, verify_ctx: false, - global_constant_state: false, dump_iseq_disasm: None, + frame_pointer: false, + code_gc: false, + perf_map: None, + log: None, }; -#[derive(Clone, PartialEq, Eq, Debug)] +/// YJIT option descriptions for `ruby --help`. +/// Note that --help allows only 80 characters per line, including indentation. 
80-character limit --> | +pub const YJIT_OPTIONS: &'static [(&str, &str)] = &[ + ("--yjit-mem-size=num", "Soft limit on YJIT memory usage in MiB (default: 128)."), + ("--yjit-exec-mem-size=num", "Hard limit on executable memory block in MiB."), + ("--yjit-call-threshold=num", "Number of calls to trigger JIT."), + ("--yjit-cold-threshold=num", "Global calls after which ISEQs not compiled (default: 200K)."), + ("--yjit-stats", "Enable collecting YJIT statistics."), + ("--yjit-log[=file|dir]", "Enable logging of YJIT's compilation activity."), + ("--yjit-disable", "Disable YJIT for lazily enabling it with RubyVM::YJIT.enable."), + ("--yjit-code-gc", "Run code GC when the code size reaches the limit."), + ("--yjit-perf", "Enable frame pointers and perf profiling."), + ("--yjit-trace-exits", "Record Ruby source location when exiting from generated code."), + ("--yjit-trace-exits-sample-rate=num", "Trace exit locations only every Nth occurrence."), +]; + +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum TraceExits { + // Trace all exits + All, + // Trace a specific counter + Counter(Counter), +} + +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum LogOutput { + // Dump to the log file as events occur. + File(std::os::unix::io::RawFd), + // Keep the log in memory only + MemoryOnly, + // Dump to stderr when the process exits + Stderr +} + +#[derive(Debug)] pub enum DumpDisasm { // Dump to stdout Stdout, // Dump to "yjit_{pid}.log" file under the specified directory - File(String), + File(std::os::unix::io::RawFd), +} + +/// Type of symbols to dump into /tmp/perf-{pid}.map +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum PerfMap { + // Dump ISEQ symbols + ISEQ, + // Dump YJIT codegen symbols + Codegen, } /// Macro to get an option value by name @@ -76,7 +169,12 @@ macro_rules! get_option { // Unsafe is ok here because options are initialized // once before any Ruby code executes ($option_name:ident) => { - unsafe { OPTIONS.$option_name } + { + // Make this a statement since attributes on expressions are experimental + #[allow(unused_unsafe)] + let ret = unsafe { crate::options::OPTIONS.$option_name }; + ret + } }; } pub(crate) use get_option; @@ -90,6 +188,7 @@ macro_rules! get_option_ref { }; } pub(crate) use get_option_ref; +use crate::log::Log; /// Expected to receive what comes after the third dash in "--yjit-*". /// Empty string means user passed only "--yjit". 
C code rejects when @@ -111,6 +210,20 @@ pub fn parse_option(str_ptr: *const std::os::raw::c_char) -> Option<()> { match (opt_name, opt_val) { ("", "") => (), // Simply --yjit + ("mem-size", _) => match opt_val.parse::<usize>() { + Ok(n) => { + if n == 0 || n > 2 * 1024 * 1024 { + return None + } + + // Convert from MiB to bytes internally for convenience + unsafe { OPTIONS.mem_size = n * 1024 * 1024 } + } + Err(_) => { + return None; + } + }, + ("exec-mem-size", _) => match opt_val.parse::<usize>() { Ok(n) => { if n == 0 || n > 2 * 1024 * 1024 { @@ -118,7 +231,7 @@ pub fn parse_option(str_ptr: *const std::os::raw::c_char) -> Option<()> { } // Convert from MiB to bytes internally for convenience - unsafe { OPTIONS.exec_mem_size = n * 1024 * 1024 } + unsafe { OPTIONS.exec_mem_size = Some(n * 1024 * 1024) } } Err(_) => { return None; @@ -126,7 +239,14 @@ pub fn parse_option(str_ptr: *const std::os::raw::c_char) -> Option<()> { }, ("call-threshold", _) => match opt_val.parse() { - Ok(n) => unsafe { OPTIONS.call_threshold = n }, + Ok(n) => unsafe { rb_yjit_call_threshold = n }, + Err(_) => { + return None; + } + }, + + ("cold-threshold", _) => match opt_val.parse() { + Ok(n) => unsafe { rb_yjit_cold_threshold = n }, Err(_) => { return None; } @@ -139,27 +259,127 @@ pub fn parse_option(str_ptr: *const std::os::raw::c_char) -> Option<()> { } }, - ("dump-disasm", _) => match opt_val.to_string().as_str() { - "" => unsafe { OPTIONS.dump_disasm = Some(DumpDisasm::Stdout) }, - directory => { - let pid = std::process::id(); - let path = format!("{directory}/yjit_{pid}.log"); - println!("YJIT disasm dump: {path}"); - unsafe { OPTIONS.dump_disasm = Some(DumpDisasm::File(path)) } + ("disable", "") => unsafe { + OPTIONS.disable = true; + }, + + ("temp-regs", _) => match opt_val.parse() { + Ok(n) => { + assert!(n <= TEMP_REGS.len(), "--yjit-temp-regs must be <= {}", TEMP_REGS.len()); + unsafe { OPTIONS.num_temp_regs = n } } + Err(_) => { + return None; + } + }, + + ("c-builtin", _) => unsafe { + OPTIONS.c_builtin = true; + }, + + ("code-gc", _) => unsafe { + OPTIONS.code_gc = true; + }, + + ("perf", _) => match opt_val { + "" => unsafe { + OPTIONS.frame_pointer = true; + OPTIONS.perf_map = Some(PerfMap::ISEQ); + }, + "fp" => unsafe { OPTIONS.frame_pointer = true }, + "iseq" => unsafe { OPTIONS.perf_map = Some(PerfMap::ISEQ) }, + // Accept --yjit-perf=map for backward compatibility + "codegen" | "map" => unsafe { OPTIONS.perf_map = Some(PerfMap::Codegen) }, + _ => return None, }, + ("dump-disasm", _) => { + if !cfg!(feature = "disasm") { + eprintln!("WARNING: the {} option works best when YJIT is built in dev mode, i.e. ./configure --enable-yjit=dev", opt_name); + } + + match opt_val { + "" => unsafe { OPTIONS.dump_disasm = Some(DumpDisasm::Stdout) }, + directory => { + let path = format!("{directory}/yjit_{}.log", std::process::id()); + match File::options().create(true).append(true).open(&path) { + Ok(file) => { + use std::os::unix::io::IntoRawFd; + eprintln!("YJIT disasm dump: {path}"); + unsafe { OPTIONS.dump_disasm = Some(DumpDisasm::File(file.into_raw_fd())) } + } + Err(err) => eprintln!("Failed to create {path}: {err}"), + } + } + } + }, + ("dump-iseq-disasm", _) => unsafe { + if !cfg!(feature = "disasm") { + eprintln!("WARNING: the {} option is only available when YJIT is built in dev mode, i.e. 
./configure --enable-yjit=dev", opt_name); + } + OPTIONS.dump_iseq_disasm = Some(opt_val.to_string()); }, - ("greedy-versioning", "") => unsafe { OPTIONS.greedy_versioning = true }, ("no-type-prop", "") => unsafe { OPTIONS.no_type_prop = true }, - ("stats", "") => unsafe { OPTIONS.gen_stats = true }, - ("trace-exits", "") => unsafe { OPTIONS.gen_trace_exits = true; OPTIONS.gen_stats = true }, + ("stats", _) => match opt_val { + "" => unsafe { OPTIONS.gen_stats = true }, + "quiet" => unsafe { + OPTIONS.gen_stats = true; + OPTIONS.print_stats = false; + }, + _ => { + return None; + } + }, + ("log", _) => match opt_val { + "" => unsafe { + OPTIONS.log = Some(LogOutput::Stderr); + Log::init(); + }, + "quiet" => unsafe { + OPTIONS.log = Some(LogOutput::MemoryOnly); + Log::init(); + }, + arg_value => { + let log_file_path = if std::path::Path::new(arg_value).is_dir() { + format!("{arg_value}/yjit_{}.log", std::process::id()) + } else { + arg_value.to_string() + }; + + match File::options().create(true).write(true).truncate(true).open(&log_file_path) { + Ok(file) => { + use std::os::unix::io::IntoRawFd; + eprintln!("YJIT log: {log_file_path}"); + + unsafe { OPTIONS.log = Some(LogOutput::File(file.into_raw_fd())) } + Log::init() + } + Err(err) => panic!("Failed to create {log_file_path}: {err}"), + } + } + }, + ("trace-exits", _) => unsafe { + OPTIONS.gen_stats = true; + OPTIONS.trace_exits = match opt_val { + "" => Some(TraceExits::All), + name => match Counter::get(name) { + Some(counter) => Some(TraceExits::Counter(counter)), + None => return None, + }, + }; + }, + ("trace-exits-sample-rate", sample_rate) => unsafe { + OPTIONS.gen_stats = true; + if OPTIONS.trace_exits.is_none() { + OPTIONS.trace_exits = Some(TraceExits::All); + } + OPTIONS.trace_exits_sample_rate = sample_rate.parse().unwrap(); + }, ("dump-insns", "") => unsafe { OPTIONS.dump_insns = true }, ("verify-ctx", "") => unsafe { OPTIONS.verify_ctx = true }, - ("global-constant-state", "") => unsafe { OPTIONS.global_constant_state = true }, // Option name not recognized _ => { @@ -167,8 +387,46 @@ pub fn parse_option(str_ptr: *const std::os::raw::c_char) -> Option<()> { } } + // before we continue, check that sample_rate is either 0 or a prime number + let trace_sample_rate = unsafe { OPTIONS.trace_exits_sample_rate }; + if trace_sample_rate > 1 { + let mut i = 2; + while i*i <= trace_sample_rate { + if trace_sample_rate % i == 0 { + println!("Warning: using a non-prime number as your sampling rate can result in less accurate sampling data"); + return Some(()); + } + i += 1; + } + } + // dbg!(unsafe {OPTIONS}); // Option successfully parsed return Some(()); } + +/// Print YJIT options for `ruby --help`. `width` is width of option parts, and +/// `columns` is indent width of descriptions. 
+#[no_mangle] +pub extern "C" fn rb_yjit_show_usage(help: c_int, highlight: c_int, width: c_uint, columns: c_int) { + for &(name, description) in YJIT_OPTIONS.iter() { + extern "C" { + fn ruby_show_usage_line(name: *const c_char, secondary: *const c_char, description: *const c_char, + help: c_int, highlight: c_int, width: c_uint, columns: c_int); + } + let name = CString::new(name).unwrap(); + let description = CString::new(description).unwrap(); + unsafe { ruby_show_usage_line(name.as_ptr(), null(), description.as_ptr(), help, highlight, width, columns) } + } +} + +/// Return true if --yjit-c-builtin is given +#[no_mangle] +pub extern "C" fn rb_yjit_c_builtin_p(_ec: EcPtr, _self: VALUE) -> VALUE { + if get_option!(c_builtin) { + Qtrue + } else { + Qfalse + } +} diff --git a/yjit/src/stats.rs b/yjit/src/stats.rs index de1310d78b..105def2fff 100644 --- a/yjit/src/stats.rs +++ b/yjit/src/stats.rs @@ -1,20 +1,99 @@ //! Everything related to the collection of runtime stats in YJIT -//! See the stats feature and the --yjit-stats command-line option +//! See the --yjit-stats command-line option -#![allow(dead_code)] // Counters are only used with the stats features +use std::ptr::addr_of_mut; +use std::sync::atomic::Ordering; +use std::time::Instant; +use std::collections::HashMap; use crate::codegen::CodegenGlobals; use crate::cruby::*; use crate::options::*; -use crate::yjit::yjit_enabled_p; +use crate::yjit::{yjit_enabled_p, YJIT_INIT_TIME}; -// stats_alloc is a middleware to instrument global allocations in Rust. -#[cfg(feature="stats")] -#[global_allocator] -static GLOBAL_ALLOCATOR: &stats_alloc::StatsAlloc<std::alloc::System> = &stats_alloc::INSTRUMENTED_SYSTEM; +#[cfg(feature = "stats_allocator")] +#[path = "../../jit/src/lib.rs"] +mod jit; -// YJIT exit counts for each instruction type -const VM_INSTRUCTION_SIZE_USIZE:usize = VM_INSTRUCTION_SIZE as usize; +/// Running total of how many ISeqs are in the system. +#[no_mangle] +pub static mut rb_yjit_live_iseq_count: u64 = 0; + +/// Monotonically increasing total of how many ISEQs were allocated +#[no_mangle] +pub static mut rb_yjit_iseq_alloc_count: u64 = 0; + +/// The number of bytes YJIT has allocated on the Rust heap. +pub fn yjit_alloc_size() -> usize { + jit::GLOBAL_ALLOCATOR.alloc_size.load(Ordering::SeqCst) +} + +/// Mapping of C function / ISEQ name to integer indices +/// This is accessed at compilation time only (protected by a lock) +static mut CFUNC_NAME_TO_IDX: Option<HashMap<String, usize>> = None; +static mut ISEQ_NAME_TO_IDX: Option<HashMap<String, usize>> = None; + +/// Vector of call counts for each C function / ISEQ index +/// This is modified (but not resized) by JITted code +static mut CFUNC_CALL_COUNT: Option<Vec<u64>> = None; +static mut ISEQ_CALL_COUNT: Option<Vec<u64>> = None; + +/// Assign an index to a given cfunc name string +pub fn get_cfunc_idx(name: &str) -> usize { + // SAFETY: We acquire a VM lock and don't create multiple &mut references to these static mut variables. + unsafe { get_method_idx(name, &mut *addr_of_mut!(CFUNC_NAME_TO_IDX), &mut *addr_of_mut!(CFUNC_CALL_COUNT)) } +} + +/// Assign an index to a given ISEQ name string +pub fn get_iseq_idx(name: &str) -> usize { + // SAFETY: We acquire a VM lock and don't create multiple &mut references to these static mut variables. 
+ unsafe { get_method_idx(name, &mut *addr_of_mut!(ISEQ_NAME_TO_IDX), &mut *addr_of_mut!(ISEQ_CALL_COUNT)) } +} + +fn get_method_idx( + name: &str, + method_name_to_idx: &mut Option<HashMap<String, usize>>, + method_call_count: &mut Option<Vec<u64>>, +) -> usize { + //println!("{}", name); + + let name_to_idx = method_name_to_idx.get_or_insert_with(HashMap::default); + let call_count = method_call_count.get_or_insert_with(Vec::default); + + match name_to_idx.get(name) { + Some(idx) => *idx, + None => { + let idx = name_to_idx.len(); + name_to_idx.insert(name.to_string(), idx); + + // Resize the call count vector + if idx >= call_count.len() { + call_count.resize(idx + 1, 0); + } + + idx + } + } +} + +// Increment the counter for a C function +pub extern "C" fn incr_cfunc_counter(idx: usize) { + let cfunc_call_count = unsafe { CFUNC_CALL_COUNT.as_mut().unwrap() }; + assert!(idx < cfunc_call_count.len()); + cfunc_call_count[idx] += 1; +} + +// Increment the counter for an ISEQ +pub extern "C" fn incr_iseq_counter(idx: usize) { + let iseq_call_count = unsafe { ISEQ_CALL_COUNT.as_mut().unwrap() }; + assert!(idx < iseq_call_count.len()); + iseq_call_count[idx] += 1; +} + +/// YJIT exit counts for each instruction type. +/// Note that `VM_INSTRUCTION_SIZE` is an upper bound and the actual number +/// of VM opcodes may be different in the build. See [`rb_vm_instruction_size()`] +const VM_INSTRUCTION_SIZE_USIZE: usize = VM_INSTRUCTION_SIZE as usize; static mut EXIT_OP_COUNT: [u64; VM_INSTRUCTION_SIZE_USIZE] = [0; VM_INSTRUCTION_SIZE_USIZE]; /// Global state needed for collecting backtraces of exits @@ -24,7 +103,9 @@ pub struct YjitExitLocations { raw_samples: Vec<VALUE>, /// Vec to hold line_samples which represent line numbers of /// the iseq caller. - line_samples: Vec<i32> + line_samples: Vec<i32>, + /// Number of samples skipped when sampling + skipped_samples: usize } /// Private singleton instance of yjit exit locations @@ -33,19 +114,15 @@ static mut YJIT_EXIT_LOCATIONS: Option<YjitExitLocations> = None; impl YjitExitLocations { /// Initialize the yjit exit locations pub fn init() { - // Return if the stats feature is disabled - if !cfg!(feature = "stats") { - return; - } - // Return if --yjit-trace-exits isn't enabled - if !get_option!(gen_trace_exits) { + if get_option!(trace_exits).is_none() { return; } let yjit_exit_locations = YjitExitLocations { raw_samples: Vec::new(), - line_samples: Vec::new() + line_samples: Vec::new(), + skipped_samples: 0 }; // Initialize the yjit exit locations instance @@ -69,6 +146,11 @@ impl YjitExitLocations { &mut YjitExitLocations::get_instance().line_samples } + /// Get the number of samples skipped + pub fn get_skipped_samples() -> &'static mut usize { + &mut YjitExitLocations::get_instance().skipped_samples + } + /// Mark the data stored in YjitExitLocations::get_raw_samples that needs to be used by /// rb_yjit_add_frame. YjitExitLocations::get_raw_samples are an array of /// VALUE pointers, exit instruction, and number of times we've seen this stack row @@ -81,13 +163,8 @@ impl YjitExitLocations { return; } - // Return if the stats feature is disabled - if !cfg!(feature = "stats") { - return; - } - // Return if --yjit-trace-exits isn't enabled - if !get_option!(gen_trace_exits) { + if get_option!(trace_exits).is_none() { return; } @@ -109,7 +186,7 @@ impl YjitExitLocations { // Increase index for exit instruction. 
idx += 1; - // Increase index for bookeeping value (number of times we've seen this + // Increase index for bookkeeping value (number of times we've seen this // row in a stack). idx += 1; } @@ -123,6 +200,28 @@ macro_rules! make_counters { #[derive(Default, Debug)] pub struct Counters { $(pub $counter_name: u64),+ } + /// Enum to represent a counter + #[allow(non_camel_case_types)] + #[derive(Clone, Copy, PartialEq, Eq, Debug)] + pub enum Counter { $($counter_name),+ } + + impl Counter { + /// Map a counter name string to a counter enum + pub fn get(name: &str) -> Option<Counter> { + match name { + $( stringify!($counter_name) => { Some(Counter::$counter_name) } ),+ + _ => None, + } + } + + /// Get a counter name string + pub fn get_name(&self) -> String { + match self { + $( Counter::$counter_name => stringify!($counter_name).to_string() ),+ + } + } + } + /// Global counters instance, initialized to zero pub static mut COUNTERS: Counters = Counters { $($counter_name: 0),+ }; @@ -130,7 +229,7 @@ macro_rules! make_counters { const COUNTER_NAMES: &'static [&'static str] = &[ $(stringify!($counter_name)),+ ]; /// Map a counter name string to a counter pointer - fn get_counter_ptr(name: &str) -> *mut u64 { + pub fn get_counter_ptr(name: &str) -> *mut u64 { match name { $( stringify!($counter_name) => { ptr_to_counter!($counter_name) } ),+ _ => panic!() @@ -139,6 +238,66 @@ macro_rules! make_counters { } } +/// The list of counters that are available without --yjit-stats. +/// They are incremented only by `incr_counter!` and don't use `gen_counter_incr`. +pub const DEFAULT_COUNTERS: &'static [Counter] = &[ + Counter::code_gc_count, + Counter::compiled_iseq_entry, + Counter::cold_iseq_entry, + Counter::compiled_iseq_count, + Counter::compiled_blockid_count, + Counter::compiled_block_count, + Counter::deleted_defer_block_count, + Counter::compiled_branch_count, + Counter::compile_time_ns, + Counter::compilation_failure, + Counter::max_inline_versions, + Counter::inline_block_count, + Counter::num_contexts_encoded, + Counter::context_cache_hits, + + Counter::invalidation_count, + Counter::invalidate_method_lookup, + Counter::invalidate_bop_redefined, + Counter::invalidate_ractor_spawn, + Counter::invalidate_constant_state_bump, + Counter::invalidate_constant_ic_fill, + Counter::invalidate_no_singleton_class, + Counter::invalidate_ep_escape, + Counter::invalidate_everything, +]; + +/// Macro to increase a counter by name and count +macro_rules! incr_counter_by { + // Unsafe is ok here because options are initialized + // once before any Ruby code executes + ($counter_name:ident, $count:expr) => { + #[allow(unused_unsafe)] + { + unsafe { $crate::stats::COUNTERS.$counter_name += $count as u64 } + } + }; +} +pub(crate) use incr_counter_by; + +/// Macro to increase a counter if the given value is larger +macro_rules! incr_counter_to { + // Unsafe is ok here because options are initialized + // once before any Ruby code executes + ($counter_name:ident, $count:expr) => { + #[allow(unused_unsafe)] + { + unsafe { + $crate::stats::COUNTERS.$counter_name = u64::max( + $crate::stats::COUNTERS.$counter_name, + $count as u64, + ) + } + } + }; +} +pub(crate) use incr_counter_to; + /// Macro to increment a counter by name macro_rules! incr_counter { // Unsafe is ok here because options are initialized @@ -161,136 +320,235 @@ macro_rules! ptr_to_counter { } }; } -pub(crate) use ptr_to_counter; // Declare all the counters we track make_counters! 
{ - exec_instruction, + yjit_insns_count, - send_keywords, - send_kw_splat, - send_args_splat_super, - send_iseq_zsuper, - send_block_arg, + // Method calls that fallback to dynamic dispatch + send_singleton_class, + send_forwarding, send_ivar_set_method, send_zsuper_method, send_undef_method, - send_optimized_method, - send_optimized_method_call, send_optimized_method_block_call, send_call_block, send_call_kwarg, send_call_multi_ractor, + send_cme_not_found, + send_megamorphic, send_missing_method, send_refined_method, - send_cfunc_ruby_array_varg, + send_private_not_fcall, + send_cfunc_kw_splat_non_nil, + send_cfunc_splat_neg2, send_cfunc_argc_mismatch, + send_cfunc_block_arg, send_cfunc_toomany_args, send_cfunc_tracing, - send_cfunc_kwargs, send_cfunc_splat_with_kw, + send_cfunc_splat_varg_ruby2_keywords, send_attrset_kwargs, + send_attrset_block_arg, send_iseq_tailcall, send_iseq_arity_error, - send_iseq_only_keywords, - send_iseq_kwargs_req_and_opt_missing, + send_iseq_block_arg_type, + send_iseq_clobbering_block_arg, + send_iseq_block_arg_gc_unsafe, + send_iseq_complex_discard_extras, + send_iseq_leaf_builtin_block_arg_block_param, + send_iseq_kw_splat_non_nil, send_iseq_kwargs_mismatch, - send_iseq_has_rest, send_iseq_has_post, - send_iseq_has_kwrest, send_iseq_has_no_kw, send_iseq_accepts_no_kwarg, send_iseq_materialized_block, - send_iseq_splat_with_opt, + send_iseq_send_forwarding, + send_iseq_splat_not_array, send_iseq_splat_with_kw, send_iseq_missing_optional_kw, send_iseq_too_many_kwargs, send_not_implemented_method, send_getter_arity, - send_se_cf_overflow, - send_se_protected_check_failed, - send_splatarray_length_not_equal, - send_splatarray_last_ruby_2_keywords, - send_splat_not_array, - send_args_splat_non_iseq, - send_args_splat_ivar, + send_getter_block_arg, send_args_splat_attrset, send_args_splat_bmethod, send_args_splat_aref, send_args_splat_aset, - send_args_splat_optimized, - send_args_splat_cfunc_var_args, - send_args_splat_cfunc_zuper, - send_args_splat_cfunc_ruby2_keywords, - send_iseq_ruby2_keywords, - send_send_not_imm, + send_args_splat_opt_call, + send_iseq_splat_arity_error, + send_splat_too_long, send_send_wrong_args, send_send_null_mid, send_send_null_cme, send_send_nested, - send_send_chain, - send_send_chain_string, - send_send_chain_not_string, - send_send_chain_not_sym, - send_send_chain_not_string_or_sym, - send_send_getter, - send_send_builtin, - + send_send_attr_reader, + send_send_attr_writer, + send_iseq_has_rest_and_captured, + send_iseq_has_kwrest_and_captured, + send_iseq_has_rest_and_kw_supplied, + send_iseq_has_rest_opt_and_block, send_bmethod_ractor, send_bmethod_block_arg, - - traced_cfunc_return, - - invokesuper_me_changed, - invokesuper_block, - + send_optimized_block_arg, + send_pred_not_fixnum, + send_pred_underflow, + send_str_dup_exivar, + + invokesuper_defined_class_mismatch, + invokesuper_forwarding, + invokesuper_kw_splat, + invokesuper_kwarg, + invokesuper_megamorphic, + invokesuper_no_cme, + invokesuper_no_me, + invokesuper_not_iseq_or_cfunc, + invokesuper_refinement, + invokesuper_singleton_class, + + invokeblock_megamorphic, invokeblock_none, - invokeblock_iseq_arg0_splat, - invokeblock_iseq_block_changed, - invokeblock_iseq_tag_changed, - invokeblock_ifunc, + invokeblock_iseq_arg0_optional, + invokeblock_iseq_arg0_args_splat, + invokeblock_iseq_arg0_not_array, + invokeblock_iseq_arg0_wrong_len, + invokeblock_iseq_not_inlined, + invokeblock_ifunc_args_splat, + invokeblock_ifunc_kw_splat, invokeblock_proc, invokeblock_symbol, + 
// Method calls that exit to the interpreter + guard_send_block_arg_type, + guard_send_getter_splat_non_empty, + guard_send_klass_megamorphic, + guard_send_se_cf_overflow, + guard_send_se_protected_check_failed, + guard_send_splatarray_length_not_equal, + guard_send_splatarray_last_ruby2_keywords, + guard_send_splat_not_array, + guard_send_send_name_chain, + guard_send_iseq_has_rest_and_splat_too_few, + guard_send_is_a_class_mismatch, + guard_send_instance_of_class_mismatch, + guard_send_interrupted, + guard_send_not_fixnums, + guard_send_not_fixnum, + guard_send_not_fixnum_or_flonum, + guard_send_not_string, + guard_send_respond_to_mid_mismatch, + guard_send_str_aref_not_fixnum, + + guard_send_cfunc_bad_splat_vargs, + guard_send_cfunc_block_not_nil, + + guard_invokesuper_me_changed, + + guard_invokeblock_tag_changed, + guard_invokeblock_iseq_block_changed, + + traced_cfunc_return, + leave_se_interrupt, leave_interp_return, - leave_start_pc_non_zero, - getivar_se_self_not_heap, - getivar_idx_out_of_range, getivar_megamorphic, + getivar_not_heap, - setivar_se_self_not_heap, - setivar_idx_out_of_range, - setivar_val_heapobject, - setivar_name_not_mapped, - setivar_not_object, + setivar_not_heap, setivar_frozen, setivar_megamorphic, - oaref_argc_not_one, - oaref_arg_not_fixnum, + definedivar_not_heap, + definedivar_megamorphic, + + setlocal_wb_required, + + invokebuiltin_too_many_args, + + opt_plus_overflow, + opt_minus_overflow, + opt_mult_overflow, + + opt_succ_not_fixnum, + opt_succ_overflow, - opt_getinlinecache_miss, + opt_mod_zero, + opt_div_zero, + + lshift_amount_changed, + lshift_overflow, + + rshift_amount_changed, + + opt_aref_argc_not_one, + opt_aref_arg_not_fixnum, + opt_aref_not_array, + opt_aref_not_hash, + + opt_aset_not_array, + opt_aset_not_fixnum, + opt_aset_not_hash, + opt_aset_frozen, + + opt_case_dispatch_megamorphic, + + opt_getconstant_path_ic_miss, + opt_getconstant_path_multi_ractor, expandarray_splat, expandarray_postarg, expandarray_not_array, - expandarray_rhs_too_small, + expandarray_to_ary, + expandarray_method_missing, + expandarray_chain_max_depth, + + // getblockparam + gbp_wb_required, + // getblockparamproxy + gbpp_unsupported_type, gbpp_block_param_modified, + gbpp_block_handler_not_none, gbpp_block_handler_not_iseq, + gbpp_block_handler_not_proc, + + branchif_interrupted, + branchunless_interrupted, + branchnil_interrupted, + jump_interrupted, + + objtostring_not_string, + + getbyte_idx_not_fixnum, + getbyte_idx_negative, + getbyte_idx_out_of_bounds, + + splatkw_not_hash, + splatkw_not_nil, binding_allocations, binding_set, - vm_insns_count, + compiled_iseq_entry, + cold_iseq_entry, compiled_iseq_count, + compiled_blockid_count, compiled_block_count, compiled_branch_count, + compile_time_ns, compilation_failure, + abandoned_block_count, block_next_count, defer_count, + defer_empty_count, + deleted_defer_block_count, + branch_insn_count, + branch_known_count, + max_inline_versions, + inline_block_count, + num_contexts_encoded, + freed_iseq_count, exit_from_branch_stub, @@ -301,17 +559,52 @@ make_counters! { invalidate_ractor_spawn, invalidate_constant_state_bump, invalidate_constant_ic_fill, - - constant_state_bumps, + invalidate_no_singleton_class, + invalidate_ep_escape, + invalidate_everything, // Currently, it's out of the ordinary (might be impossible) for YJIT to leave gaps in // executable memory, so this should be 0. 
exec_mem_non_bump_alloc, + code_gc_count, + num_gc_obj_refs, - x86_call_rel32, - x86_call_reg, + num_send, + num_send_known_class, + num_send_polymorphic, + num_send_x86_rel32, + num_send_x86_reg, + num_send_dynamic, + num_send_cfunc, + num_send_cfunc_inline, + num_send_iseq, + num_send_iseq_leaf, + num_send_iseq_inline, + + num_getivar_megamorphic, + num_setivar_megamorphic, + num_opt_case_dispatch_megamorphic, + + num_throw, + num_throw_break, + num_throw_retry, + num_throw_return, + + num_lazy_frame_check, + num_lazy_frame_push, + lazy_frame_count, + lazy_frame_failure, + + iseq_stack_too_large, + iseq_too_long, + + temp_reg_opnd, + temp_mem_opnd, + temp_spill, + + context_cache_hits, } //=========================================================================== @@ -328,21 +621,30 @@ pub extern "C" fn rb_yjit_stats_enabled_p(_ec: EcPtr, _ruby_self: VALUE) -> VALU } } +/// Primitive called in yjit.rb +/// Check if stats generation should print at exit +#[no_mangle] +pub extern "C" fn rb_yjit_print_stats_p(_ec: EcPtr, _ruby_self: VALUE) -> VALUE { + if yjit_enabled_p() && get_option!(print_stats) { + return Qtrue; + } else { + return Qfalse; + } +} + /// Primitive called in yjit.rb. /// Export all YJIT statistics as a Ruby hash. #[no_mangle] -pub extern "C" fn rb_yjit_get_stats(_ec: EcPtr, _ruby_self: VALUE) -> VALUE { - with_vm_lock(src_loc!(), || rb_yjit_gen_stats_dict()) +pub extern "C" fn rb_yjit_get_stats(_ec: EcPtr, _ruby_self: VALUE, key: VALUE) -> VALUE { + with_vm_lock(src_loc!(), || rb_yjit_gen_stats_dict(key)) } /// Primitive called in yjit.rb /// -/// Check if trace_exits generation is enabled. Requires the stats feature -/// to be enabled. +/// Check if trace_exits generation is enabled. #[no_mangle] pub extern "C" fn rb_yjit_trace_exit_locations_enabled_p(_ec: EcPtr, _ruby_self: VALUE) -> VALUE { - #[cfg(feature = "stats")] - if get_option!(gen_trace_exits) { + if get_option!(trace_exits).is_some() { return Qtrue; } @@ -358,17 +660,12 @@ pub extern "C" fn rb_yjit_get_exit_locations(_ec: EcPtr, _ruby_self: VALUE) -> V return Qnil; } - // Return if the stats feature is disabled - if !cfg!(feature = "stats") { - return Qnil; - } - // Return if --yjit-trace-exits isn't enabled - if !get_option!(gen_trace_exits) { + if get_option!(trace_exits).is_none() { return Qnil; } - // If the stats feature is enabled, pass yjit_raw_samples and yjit_line_samples + // Pass yjit_raw_samples and yjit_line_samples // to the C function called rb_yjit_exit_locations_dict for parsing. let yjit_raw_samples = YjitExitLocations::get_raw_samples(); let yjit_line_samples = YjitExitLocations::get_line_samples(); @@ -386,102 +683,216 @@ pub extern "C" fn rb_yjit_get_exit_locations(_ec: EcPtr, _ruby_self: VALUE) -> V } } +/// Increment a counter by name from the CRuby side +/// Warning: this is not fast because it requires a hash lookup, so don't use in tight loops +#[no_mangle] +pub extern "C" fn rb_yjit_incr_counter(counter_name: *const std::os::raw::c_char) { + use std::ffi::CStr; + let counter_name = unsafe { CStr::from_ptr(counter_name).to_str().unwrap() }; + let counter_ptr = get_counter_ptr(counter_name); + unsafe { *counter_ptr += 1 }; +} + /// Export all YJIT statistics as a Ruby hash. -fn rb_yjit_gen_stats_dict() -> VALUE { +fn rb_yjit_gen_stats_dict(key: VALUE) -> VALUE { // If YJIT is not enabled, return Qnil if !yjit_enabled_p() { return Qnil; } - macro_rules! 
hash_aset_usize { - ($hash:ident, $counter_name:expr, $value:expr) => { - let key = rust_str_to_sym($counter_name); - let value = VALUE::fixnum_from_usize($value); - rb_hash_aset($hash, key, value); + let hash = if key == Qnil { + unsafe { rb_hash_new() } + } else { + Qnil + }; + + macro_rules! set_stat { + ($hash:ident, $name:expr, $value:expr) => { + let rb_key = rust_str_to_sym($name); + if key == rb_key { + return $value; + } else if hash != Qnil { + rb_hash_aset($hash, rb_key, $value); + } } } - let hash = unsafe { rb_hash_new() }; + macro_rules! set_stat_usize { + ($hash:ident, $name:expr, $value:expr) => { + set_stat!($hash, $name, VALUE::fixnum_from_usize($value)); + } + } + + macro_rules! set_stat_double { + ($hash:ident, $name:expr, $value:expr) => { + set_stat!($hash, $name, rb_float_new($value)); + } + } - // CodeBlock stats unsafe { // Get the inline and outlined code blocks let cb = CodegenGlobals::get_inline_cb(); let ocb = CodegenGlobals::get_outlined_cb(); // Inline code size - hash_aset_usize!(hash, "inline_code_size", cb.code_size()); + set_stat_usize!(hash, "inline_code_size", cb.code_size()); // Outlined code size - hash_aset_usize!(hash, "outlined_code_size", ocb.unwrap().code_size()); + set_stat_usize!(hash, "outlined_code_size", ocb.unwrap().code_size()); // GCed pages let freed_page_count = cb.num_freed_pages(); - hash_aset_usize!(hash, "freed_page_count", freed_page_count); + set_stat_usize!(hash, "freed_page_count", freed_page_count); // GCed code size - hash_aset_usize!(hash, "freed_code_size", freed_page_count * cb.page_size()); + set_stat_usize!(hash, "freed_code_size", freed_page_count * cb.page_size()); // Live pages - hash_aset_usize!(hash, "live_page_count", cb.num_mapped_pages() - freed_page_count); - - // Code GC count - hash_aset_usize!(hash, "code_gc_count", CodegenGlobals::get_code_gc_count()); + set_stat_usize!(hash, "live_page_count", cb.num_mapped_pages() - freed_page_count); // Size of memory region allocated for JIT code - hash_aset_usize!(hash, "code_region_size", cb.mapped_region_size()); + set_stat_usize!(hash, "code_region_size", cb.mapped_region_size()); // Rust global allocations in bytes - #[cfg(feature="stats")] - hash_aset_usize!(hash, "yjit_alloc_size", global_allocation_size()); + set_stat_usize!(hash, "yjit_alloc_size", yjit_alloc_size()); + + // How many bytes we are using to store context data + let context_data = CodegenGlobals::get_context_data(); + set_stat_usize!(hash, "context_data_bytes", context_data.num_bytes()); + set_stat_usize!(hash, "context_cache_bytes", crate::core::CTX_ENCODE_CACHE_BYTES + crate::core::CTX_DECODE_CACHE_BYTES); + + // VM instructions count + if rb_vm_insn_count > 0 { + set_stat_usize!(hash, "vm_insns_count", rb_vm_insn_count as usize); + } + + set_stat_usize!(hash, "live_iseq_count", rb_yjit_live_iseq_count as usize); + set_stat_usize!(hash, "iseq_alloc_count", rb_yjit_iseq_alloc_count as usize); + + set_stat!(hash, "object_shape_count", rb_object_shape_count()); + + // Time since YJIT init in nanoseconds + let time_nanos = Instant::now().duration_since(YJIT_INIT_TIME.unwrap()).as_nanos(); + set_stat_usize!(hash, "yjit_active_ns", time_nanos as usize); } - // If we're not generating stats, the hash is done + // If we're not generating stats, put only default counters if !get_option!(gen_stats) { + for counter in DEFAULT_COUNTERS { + // Get the counter value + let counter_ptr = get_counter_ptr(&counter.get_name()); + let counter_val = unsafe { *counter_ptr }; + + // Put counter into hash + let key = 
&counter.get_name(); + let value = VALUE::fixnum_from_usize(counter_val as usize); + unsafe { set_stat!(hash, key, value); } + } + return hash; } - // If the stats feature is enabled - unsafe { // Indicate that the complete set of stats is available - rb_hash_aset(hash, rust_str_to_sym("all_stats"), Qtrue); + set_stat!(hash, "all_stats", Qtrue); // For each counter we track for counter_name in COUNTER_NAMES { // Get the counter value let counter_ptr = get_counter_ptr(counter_name); let counter_val = *counter_ptr; - - #[cfg(not(feature = "stats"))] - if counter_name == &"vm_insns_count" { - // If the stats feature is disabled, we don't have vm_insns_count - // so we are going to exlcude the key - continue; - } - - // Put counter into hash - let key = rust_str_to_sym(counter_name); - let value = VALUE::fixnum_from_usize(counter_val as usize); - rb_hash_aset(hash, key, value); + set_stat_usize!(hash, counter_name, counter_val as usize); } + let mut side_exits = 0; + // For each entry in exit_op_count, add a stats entry with key "exit_INSTRUCTION_NAME" // and the value is the count of side exits for that instruction. - for op_idx in 0..VM_INSTRUCTION_SIZE_USIZE { + use crate::utils::IntoUsize; + for op_idx in 0..rb_vm_instruction_size().as_usize() { let op_name = insn_name(op_idx); let key_string = "exit_".to_owned() + &op_name; - let key = rust_str_to_sym(&key_string); - let value = VALUE::fixnum_from_usize(EXIT_OP_COUNT[op_idx] as usize); - rb_hash_aset(hash, key, value); + let count = EXIT_OP_COUNT[op_idx]; + side_exits += count; + set_stat_usize!(hash, &key_string, count as usize); + } + + set_stat_usize!(hash, "side_exit_count", side_exits as usize); + + let total_exits = side_exits + *get_counter_ptr(&Counter::leave_interp_return.get_name()); + set_stat_usize!(hash, "total_exit_count", total_exits as usize); + + // Number of instructions that finish executing in YJIT. + // See :count-placement: about the subtraction. + let retired_in_yjit = *get_counter_ptr(&Counter::yjit_insns_count.get_name()) - side_exits; + + // Average length of instruction sequences executed by YJIT + let avg_len_in_yjit: f64 = if total_exits > 0 { + retired_in_yjit as f64 / total_exits as f64 + } else { + 0_f64 + }; + set_stat_double!(hash, "avg_len_in_yjit", avg_len_in_yjit); + + // Proportion of instructions that retire in YJIT + if rb_vm_insn_count > 0 { + let total_insns_count = retired_in_yjit + rb_vm_insn_count; + set_stat_usize!(hash, "total_insns_count", total_insns_count as usize); + + let ratio_in_yjit: f64 = 100.0 * retired_in_yjit as f64 / total_insns_count as f64; + set_stat_double!(hash, "ratio_in_yjit", ratio_in_yjit); } + + // Set method call counts in a Ruby dict + fn set_call_counts( + calls_hash: VALUE, + method_name_to_idx: &mut Option<HashMap<String, usize>>, + method_call_count: &mut Option<Vec<u64>>, + ) { + if let (Some(name_to_idx), Some(call_counts)) = (method_name_to_idx, method_call_count) { + // Create a list of (name, call_count) pairs + let mut pairs = Vec::new(); + for (name, idx) in name_to_idx { + let count = call_counts[*idx]; + pairs.push((name, count)); + } + + // Sort the vectors by decreasing call counts + pairs.sort_by_key(|e| -(e.1 as i64)); + + // Cap the number of counts reported to avoid + // bloating log files, etc. 
+ pairs.truncate(20); + + // Add the pairs to the dict + for (name, call_count) in pairs { + let key = rust_str_to_sym(name); + let value = VALUE::fixnum_from_usize(call_count as usize); + unsafe { rb_hash_aset(calls_hash, key, value); } + } + } + } + + // Create a hash for the cfunc call counts + set_stat!(hash, "cfunc_calls", { + let cfunc_calls = rb_hash_new(); + set_call_counts(cfunc_calls, &mut *addr_of_mut!(CFUNC_NAME_TO_IDX), &mut *addr_of_mut!(CFUNC_CALL_COUNT)); + cfunc_calls + }); + + // Create a hash for the ISEQ call counts + set_stat!(hash, "iseq_calls", { + let iseq_calls = rb_hash_new(); + set_call_counts(iseq_calls, &mut *addr_of_mut!(ISEQ_NAME_TO_IDX), &mut *addr_of_mut!(ISEQ_CALL_COUNT)); + iseq_calls + }); } hash } /// Record the backtrace when a YJIT exit occurs. This functionality requires -/// that the stats feature is enabled as well as the --yjit-trace-exits option. +/// the --yjit-trace-exits option. /// /// This function will fill two Vec's in YjitExitLocations to record the raw samples /// and line samples. Their length should be the same, however the data stored in @@ -494,20 +905,26 @@ pub extern "C" fn rb_yjit_record_exit_stack(exit_pc: *const VALUE) return; } - // Return if the stats feature is disabled - if !cfg!(feature = "stats") { + // Return if --yjit-trace-exits isn't enabled + if get_option!(trace_exits).is_none() { return; } - // Return if --yjit-trace-exits isn't enabled - if !get_option!(gen_trace_exits) { - return; + if get_option!(trace_exits_sample_rate) > 0 { + if get_option!(trace_exits_sample_rate) <= *YjitExitLocations::get_skipped_samples() { + YjitExitLocations::get_instance().skipped_samples = 0; + } else { + YjitExitLocations::get_instance().skipped_samples += 1; + return; + } } // rb_vm_insn_addr2opcode won't work in cargo test --all-features // because it's a C function. Without insn call, this function is useless // so wrap the whole thing in a not test check. - if cfg!(not(test)) { + let _ = exit_pc; + #[cfg(not(test))] + { // Get the opcode from the encoded insn handler at this PC let insn = unsafe { rb_vm_insn_addr2opcode((*exit_pc).as_ptr()) }; @@ -540,7 +957,7 @@ pub extern "C" fn rb_yjit_record_exit_stack(exit_pc: *const VALUE) let mut prev_frame_idx = 0; let mut seen_already = true; - // If the previous stack lenght and current stack length are equal, + // If the previous stack length and current stack length are equal, // loop and compare the current frame to the previous frame. If they are // not equal, set seen_already to false and break out of the loop. if prev_stack_len == stack_length as i64 { @@ -591,10 +1008,8 @@ pub extern "C" fn rb_yjit_record_exit_stack(exit_pc: *const VALUE) // Push the insn value into the yjit_raw_samples Vec. yjit_raw_samples.push(VALUE(insn as usize)); - // Push the current line onto the yjit_line_samples Vec. This - // points to the line in insns.def. - let line = yjit_line_samples.len() - 1; - yjit_line_samples.push(line as i32); + // We don't know the line + yjit_line_samples.push(0); // Push number of times seen onto the stack, which is 1 // because it's the first time we've seen it. 
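A note on the per-method call-count plumbing added in the stats.rs hunks above (`get_method_idx`, `set_call_counts`): it interns each method name into a `HashMap<String, usize>` and keeps the actual counts in a parallel `Vec<u64>`, then reports only the most-called entries. Below is a minimal, self-contained sketch of that pattern; the `intern`/`top_n` names and the `main` driver are invented for illustration and are not part of the YJIT sources.

```rust
use std::collections::HashMap;

/// Intern `name`, returning a stable index into a parallel counts vector.
fn intern(name: &str, name_to_idx: &mut HashMap<String, usize>, counts: &mut Vec<u64>) -> usize {
    if let Some(&idx) = name_to_idx.get(name) {
        return idx;
    }
    let idx = name_to_idx.len();
    name_to_idx.insert(name.to_string(), idx);
    // Grow the counts vector so the new index is valid.
    if idx >= counts.len() {
        counts.resize(idx + 1, 0);
    }
    idx
}

/// Return up to `n` (name, count) pairs, most-called first.
fn top_n(name_to_idx: &HashMap<String, usize>, counts: &[u64], n: usize) -> Vec<(String, u64)> {
    let mut pairs: Vec<(String, u64)> = name_to_idx
        .iter()
        .map(|(name, &idx)| (name.clone(), counts[idx]))
        .collect();
    pairs.sort_by(|a, b| b.1.cmp(&a.1));
    pairs.truncate(n);
    pairs
}

fn main() {
    let mut names = HashMap::new();
    let mut counts = Vec::new();
    for call in ["Array#each", "Integer#+", "Array#each"] {
        let idx = intern(call, &mut names, &mut counts);
        counts[idx] += 1;
    }
    println!("{:?}", top_n(&names, &counts, 20));
}
```

Capping the report (the patch uses `pairs.truncate(20)`) keeps `--yjit-stats` output and log files small even for applications with many distinct call sites.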
@@ -614,12 +1029,6 @@ pub extern "C" fn rb_yjit_reset_stats_bang(_ec: EcPtr, _ruby_self: VALUE) -> VAL return Qnil; } -/// Increment the number of instructions executed by the interpreter -#[no_mangle] -pub extern "C" fn rb_yjit_collect_vm_usage_insn() { - incr_counter!(vm_insns_count); -} - #[no_mangle] pub extern "C" fn rb_yjit_collect_binding_alloc() { incr_counter!(binding_allocations); @@ -645,9 +1054,11 @@ pub extern "C" fn rb_yjit_count_side_exit_op(exit_pc: *const VALUE) -> *const VA return exit_pc; } -// Get the size of global allocations in Rust. -#[cfg(feature="stats")] -fn global_allocation_size() -> usize { - let stats = GLOBAL_ALLOCATOR.stats(); - stats.bytes_allocated.saturating_sub(stats.bytes_deallocated) +/// Measure the time taken by func() and add that to yjit_compile_time. +pub fn with_compile_time<F, R>(func: F) -> R where F: FnOnce() -> R { + let start = Instant::now(); + let ret = func(); + let nanos = Instant::now().duration_since(start).as_nanos(); + incr_counter_by!(compile_time_ns, nanos); + ret } diff --git a/yjit/src/utils.rs b/yjit/src/utils.rs index f66000381e..251628fabf 100644 --- a/yjit/src/utils.rs +++ b/yjit/src/utils.rs @@ -3,9 +3,10 @@ use crate::backend::ir::*; use crate::cruby::*; use std::slice; +use std::os::raw::c_int; /// Trait for casting to [usize] that allows you to say `.as_usize()`. -/// Implementation conditional on the the cast preserving the numeric value on +/// Implementation conditional on the cast preserving the numeric value on /// all inputs and being inexpensive. /// /// [usize] is only guaranteed to be more than 16-bit wide, so we can't use @@ -51,6 +52,20 @@ impl IntoUsize for u8 { } } +/// The `Into<u64>` Rust does not provide. +/// Convert to u64 with assurance that the value is preserved. +/// Currently, `usize::BITS == 64` holds for all platforms we support. +pub(crate) trait IntoU64 { + fn as_u64(self) -> u64; +} + +#[cfg(target_pointer_width = "64")] +impl IntoU64 for usize { + fn as_u64(self) -> u64 { + self as u64 + } +} + /// Compute an offset in bytes of a given struct field #[allow(unused)] macro_rules! offset_of { @@ -73,20 +88,17 @@ pub(crate) use offset_of; // Convert a CRuby UTF-8-encoded RSTRING into a Rust string. // This should work fine on ASCII strings and anything else // that is considered legal UTF-8, including embedded nulls. -fn ruby_str_to_rust(v: VALUE) -> String { +pub fn ruby_str_to_rust(v: VALUE) -> String { let str_ptr = unsafe { rb_RSTRING_PTR(v) } as *mut u8; let str_len: usize = unsafe { rb_RSTRING_LEN(v) }.try_into().unwrap(); let str_slice: &[u8] = unsafe { slice::from_raw_parts(str_ptr, str_len) }; - match String::from_utf8(str_slice.to_vec()) { - Ok(utf8) => utf8, - Err(_) => String::new(), - } + String::from_utf8(str_slice.to_vec()).unwrap_or_default() } // Location is the file defining the method, colon, method name. // Filenames are sometimes internal strings supplied to eval, // so be careful with them. -pub fn iseq_get_location(iseq: IseqPtr, pos: u32) -> String { +pub fn iseq_get_location(iseq: IseqPtr, pos: u16) -> String { let iseq_label = unsafe { rb_iseq_label(iseq) }; let iseq_path = unsafe { rb_iseq_path(iseq) }; let iseq_lineno = unsafe { rb_iseq_line_no(iseq, pos as usize) }; @@ -148,8 +160,6 @@ pub fn print_int(asm: &mut Assembler, opnd: Opnd) { } } - asm.cpush_all(); - let argument = match opnd { Opnd::Mem(_) | Opnd::Reg(_) | Opnd::InsnOut { .. 
} => { // Sign-extend the value if necessary @@ -164,7 +174,6 @@ pub fn print_int(asm: &mut Assembler, opnd: Opnd) { }; asm.ccall(print_int_fn as *const u8, vec![argument]); - asm.cpop_all(); } /// Generate code to print a pointer @@ -177,9 +186,7 @@ pub fn print_ptr(asm: &mut Assembler, opnd: Opnd) { assert!(opnd.rm_num_bits() == 64); - asm.cpush_all(); asm.ccall(print_ptr_fn as *const u8, vec![opnd]); - asm.cpop_all(); } /// Generate code to print a value @@ -192,9 +199,7 @@ pub fn print_value(asm: &mut Assembler, opnd: Opnd) { assert!(matches!(opnd, Opnd::Value(_))); - asm.cpush_all(); asm.ccall(print_value_fn as *const u8, vec![opnd]); - asm.cpop_all(); } /// Generate code to print constant string to stdout @@ -209,7 +214,6 @@ pub fn print_str(asm: &mut Assembler, str: &str) { } } - asm.cpush_all(); let string_data = asm.new_label("string_data"); let after_string = asm.new_label("after_string"); @@ -219,10 +223,16 @@ pub fn print_str(asm: &mut Assembler, str: &str) { asm.bake_string(str); asm.write_label(after_string); - let opnd = asm.lea_label(string_data); + let opnd = asm.lea_jump_target(string_data); asm.ccall(print_str_cfun as *const u8, vec![opnd, Opnd::UImm(str.len() as u64)]); +} - asm.cpop_all(); +pub fn stdout_supports_colors() -> bool { + // TODO(max): Use std::io::IsTerminal after upgrading Rust to 1.70 + extern "C" { fn isatty(fd: c_int) -> c_int; } + let stdout = 1; + let is_terminal = unsafe { isatty(stdout) } == 1; + is_terminal } #[cfg(test)] @@ -259,19 +269,19 @@ mod tests { #[test] fn test_print_int() { - let mut asm = Assembler::new(); + let mut asm = Assembler::new_without_iseq(); let mut cb = CodeBlock::new_dummy(1024); print_int(&mut asm, Opnd::Imm(42)); - asm.compile(&mut cb); + asm.compile(&mut cb, None).unwrap(); } #[test] fn test_print_str() { - let mut asm = Assembler::new(); + let mut asm = Assembler::new_without_iseq(); let mut cb = CodeBlock::new_dummy(1024); print_str(&mut asm, "Hello, world!"); - asm.compile(&mut cb); + asm.compile(&mut cb, None).unwrap(); } } diff --git a/yjit/src/virtualmem.rs b/yjit/src/virtualmem.rs index 1a5b2b1908..9126cf300e 100644 --- a/yjit/src/virtualmem.rs +++ b/yjit/src/virtualmem.rs @@ -3,9 +3,12 @@ // usize->pointer casts is viable. It seems like a lot of work for us to participate for not much // benefit. -use std::ptr::NonNull; +use std::{cell::RefCell, ptr::NonNull}; -use crate::{utils::IntoUsize, backend::ir::Target}; +use crate::{backend::ir::Target, stats::yjit_alloc_size, utils::IntoUsize}; + +#[cfg(test)] +use crate::options::get_option; #[cfg(not(test))] pub type VirtualMem = VirtualMemory<sys::SystemAllocator>; @@ -26,15 +29,24 @@ pub struct VirtualMemory<A: Allocator> { /// Location of the virtual memory region. region_start: NonNull<u8>, - /// Size of the region in bytes. + /// Size of this virtual memory region in bytes. region_size_bytes: usize, + /// mapped_region_bytes + yjit_alloc_size may not increase beyond this limit. + memory_limit_bytes: usize, + /// Number of bytes per "page", memory protection permission can only be controlled at this /// granularity. page_size_bytes: usize, + /// Mutable parts. + mutable: RefCell<VirtualMemoryMut<A>>, +} + +/// Mutable parts of [`VirtualMemory`]. +pub struct VirtualMemoryMut<A: Allocator> { /// Number of bytes that have we have allocated physical memory for starting at - /// [Self::region_start]. + /// [VirtualMemory::region_start]. mapped_region_bytes: usize, /// Keep track of the address of the last written to page. 
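The virtualmem.rs hunks that follow move every field that changes at runtime into a `RefCell`-wrapped `VirtualMemoryMut`, so methods such as `write_byte`, `mark_all_executable`, and `free_bytes` can take `&self` instead of `&mut self`. As a rough sketch of that interior-mutability split, assuming made-up `Region`/`RegionMut` types rather than the real YJIT ones:

```rust
use std::cell::RefCell;

/// Immutable configuration lives directly in the struct; state that changes
/// at runtime is grouped behind a RefCell so methods can take `&self`.
struct Region {
    size_bytes: usize,            // fixed after construction
    mutable: RefCell<RegionMut>,  // interior-mutable parts
}

struct RegionMut {
    mapped_bytes: usize,
}

impl Region {
    fn new(size_bytes: usize) -> Self {
        Region { size_bytes, mutable: RefCell::new(RegionMut { mapped_bytes: 0 }) }
    }

    /// Grow the mapped prefix of the region; note the `&self` receiver.
    fn map_more(&self, bytes: usize) -> Result<(), ()> {
        let mut m = self.mutable.borrow_mut();
        if m.mapped_bytes + bytes > self.size_bytes {
            return Err(());
        }
        m.mapped_bytes += bytes;
        Ok(())
    }

    fn mapped_bytes(&self) -> usize {
        self.mutable.borrow().mapped_bytes
    }
}

fn main() {
    let region = Region::new(4096);
    region.map_more(1024).unwrap();
    assert_eq!(region.mapped_bytes(), 1024);
}
```

The trade-off is that aliasing is checked at runtime: `borrow_mut()` panics if the mutable state is already borrowed, which is why each method in the patch takes the borrow once and holds it for the whole operation, as `write_byte` does below.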
@@ -57,14 +69,39 @@ pub trait Allocator { fn mark_unused(&mut self, ptr: *const u8, size: u32) -> bool; } -/// Pointer into a [VirtualMemory]. -/// We may later change this to wrap an u32. -/// Note: there is no NULL constant for CodePtr. You should use Option<CodePtr> instead. +/// Pointer into a [VirtualMemory] represented as an offset from the base. +/// Note: there is no NULL constant for [CodePtr]. You should use `Option<CodePtr>` instead. #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Debug)] #[repr(C, packed)] -pub struct CodePtr(NonNull<u8>); +pub struct CodePtr(u32); impl CodePtr { + /// Advance the CodePtr. Can return a dangling pointer. + pub fn add_bytes(self, bytes: usize) -> Self { + let CodePtr(raw) = self; + let bytes: u32 = bytes.try_into().unwrap(); + CodePtr(raw + bytes) + } + + /// Note that the raw pointer might be dangling if there hasn't + /// been any writes to it through the [VirtualMemory] yet. + pub fn raw_ptr(self, base: &impl CodePtrBase) -> *const u8 { + let CodePtr(offset) = self; + return base.base_ptr().as_ptr().wrapping_add(offset.as_usize()) + } + + /// Get the address of the code pointer. + pub fn raw_addr(self, base: &impl CodePtrBase) -> usize { + self.raw_ptr(base) as usize + } + + /// Get the offset component for the code pointer. Useful finding the distance between two + /// code pointers that share the same [VirtualMem]. + pub fn as_offset(self) -> i64 { + let CodePtr(offset) = self; + offset.into() + } + pub fn as_side_exit(self) -> Target { Target::SideExitPtr(self) } @@ -81,33 +118,46 @@ use WriteError::*; impl<A: Allocator> VirtualMemory<A> { /// Bring a part of the address space under management. - pub fn new(allocator: A, page_size: u32, virt_region_start: NonNull<u8>, size_bytes: usize) -> Self { + pub fn new( + allocator: A, + page_size: u32, + virt_region_start: NonNull<u8>, + region_size_bytes: usize, + memory_limit_bytes: usize, + ) -> Self { assert_ne!(0, page_size); let page_size_bytes = page_size.as_usize(); Self { region_start: virt_region_start, - region_size_bytes: size_bytes, + region_size_bytes, + memory_limit_bytes, page_size_bytes, - mapped_region_bytes: 0, - current_write_page: None, - allocator, + mutable: RefCell::new(VirtualMemoryMut { + mapped_region_bytes: 0, + current_write_page: None, + allocator, + }), } } /// Return the start of the region as a raw pointer. Note that it could be a dangling /// pointer so be careful dereferencing it. pub fn start_ptr(&self) -> CodePtr { - CodePtr(self.region_start) + CodePtr(0) } - pub fn end_ptr(&self) -> CodePtr { - CodePtr(NonNull::new(self.region_start.as_ptr().wrapping_add(self.mapped_region_bytes)).unwrap()) + pub fn mapped_end_ptr(&self) -> CodePtr { + self.start_ptr().add_bytes(self.mutable.borrow().mapped_region_bytes) + } + + pub fn virtual_end_ptr(&self) -> CodePtr { + self.start_ptr().add_bytes(self.region_size_bytes) } /// Size of the region in bytes that we have allocated physical memory for. pub fn mapped_region_size(&self) -> usize { - self.mapped_region_bytes + self.mutable.borrow().mapped_region_bytes } /// Size of the region in bytes where writes could be attempted. @@ -115,20 +165,28 @@ impl<A: Allocator> VirtualMemory<A> { self.region_size_bytes } + /// The granularity at which we can control memory permission. + /// On Linux, this is the page size that mmap(2) talks about. + pub fn system_page_size(&self) -> usize { + self.page_size_bytes + } + /// Write a single byte. The first write to a page makes it readable. 
- pub fn write_byte(&mut self, write_ptr: CodePtr, byte: u8) -> Result<(), WriteError> { + pub fn write_byte(&self, write_ptr: CodePtr, byte: u8) -> Result<(), WriteError> { + let mut mutable = self.mutable.borrow_mut(); + let page_size = self.page_size_bytes; - let raw: *mut u8 = write_ptr.raw_ptr() as *mut u8; + let raw: *mut u8 = write_ptr.raw_ptr(self) as *mut u8; let page_addr = (raw as usize / page_size) * page_size; - if self.current_write_page == Some(page_addr) { + if mutable.current_write_page == Some(page_addr) { // Writing within the last written to page, nothing to do } else { // Switching to a different and potentially new page let start = self.region_start.as_ptr(); - let mapped_region_end = start.wrapping_add(self.mapped_region_bytes); + let mapped_region_end = start.wrapping_add(mutable.mapped_region_bytes); let whole_region_end = start.wrapping_add(self.region_size_bytes); - let alloc = &mut self.allocator; + let alloc = &mut mutable.allocator; assert!((start..=whole_region_end).contains(&mapped_region_end)); @@ -140,8 +198,9 @@ impl<A: Allocator> VirtualMemory<A> { return Err(FailedPageMapping); } - self.current_write_page = Some(page_addr); - } else if (start..whole_region_end).contains(&raw) { + mutable.current_write_page = Some(page_addr); + } else if (start..whole_region_end).contains(&raw) && + (page_addr + page_size - start as usize) + yjit_alloc_size() < self.memory_limit_bytes { // Writing to a brand new page let mapped_region_end_addr = mapped_region_end as usize; let alloc_size = page_addr - mapped_region_end_addr + page_size; @@ -171,9 +230,9 @@ impl<A: Allocator> VirtualMemory<A> { unreachable!("unknown arch"); } } - self.mapped_region_bytes = self.mapped_region_bytes + alloc_size; + mutable.mapped_region_bytes = mutable.mapped_region_bytes + alloc_size; - self.current_write_page = Some(page_addr); + mutable.current_write_page = Some(page_addr); } else { return Err(OutOfBounds); } @@ -185,60 +244,66 @@ impl<A: Allocator> VirtualMemory<A> { Ok(()) } - /// Make all the code in the region executable. Call this at the end of a write session. - /// See [Self] for usual usage flow. - pub fn mark_all_executable(&mut self) { - self.current_write_page = None; + /// Make all the code in the region writeable. + /// Call this during GC before the phase of updating reference fields. + pub fn mark_all_writeable(&self) { + let mut mutable = self.mutable.borrow_mut(); + + mutable.current_write_page = None; let region_start = self.region_start; - let mapped_region_bytes: u32 = self.mapped_region_bytes.try_into().unwrap(); + let mapped_region_bytes: u32 = mutable.mapped_region_bytes.try_into().unwrap(); // Make mapped region executable - self.allocator.mark_executable(region_start.as_ptr(), mapped_region_bytes); + if !mutable.allocator.mark_writable(region_start.as_ptr(), mapped_region_bytes) { + panic!("Cannot make memory region writable: {:?}-{:?}", + region_start.as_ptr(), + unsafe { region_start.as_ptr().add(mapped_region_bytes as usize)} + ); + } } - /// Free a range of bytes. start_ptr must be memory page-aligned. - pub fn free_bytes(&mut self, start_ptr: CodePtr, size: u32) { - assert_eq!(start_ptr.into_usize() % self.page_size_bytes, 0); - self.allocator.mark_unused(start_ptr.0.as_ptr(), size); - } -} + /// Make all the code in the region executable. Call this at the end of a write session. + /// See [Self] for usual usage flow. 
+ pub fn mark_all_executable(&self) { + let mut mutable = self.mutable.borrow_mut(); -impl CodePtr { - /// Note that the raw pointer might be dangling if there hasn't - /// been any writes to it through the [VirtualMemory] yet. - pub fn raw_ptr(self) -> *const u8 { - let CodePtr(ptr) = self; - return ptr.as_ptr(); - } + mutable.current_write_page = None; - /// Advance the CodePtr. Can return a dangling pointer. - pub fn add_bytes(self, bytes: usize) -> Self { - let CodePtr(raw) = self; - CodePtr(NonNull::new(raw.as_ptr().wrapping_add(bytes)).unwrap()) - } + let region_start = self.region_start; + let mapped_region_bytes: u32 = mutable.mapped_region_bytes.try_into().unwrap(); - pub fn into_i64(self) -> i64 { - let CodePtr(ptr) = self; - ptr.as_ptr() as i64 + // Make mapped region executable + mutable.allocator.mark_executable(region_start.as_ptr(), mapped_region_bytes); } - #[cfg(target_arch = "aarch64")] - pub fn into_u64(self) -> u64 { - let CodePtr(ptr) = self; - ptr.as_ptr() as u64 + /// Free a range of bytes. start_ptr must be memory page-aligned. + pub fn free_bytes(&self, start_ptr: CodePtr, size: u32) { + assert_eq!(start_ptr.raw_ptr(self) as usize % self.page_size_bytes, 0); + + // Bounds check the request. We should only free memory we manage. + let mapped_region = self.start_ptr().raw_ptr(self)..self.mapped_end_ptr().raw_ptr(self); + let virtual_region = self.start_ptr().raw_ptr(self)..self.virtual_end_ptr().raw_ptr(self); + let last_byte_to_free = start_ptr.add_bytes(size.saturating_sub(1).as_usize()).raw_ptr(self); + assert!(mapped_region.contains(&start_ptr.raw_ptr(self))); + // On platforms where code page size != memory page size (e.g. Linux), we often need + // to free code pages that contain unmapped memory pages. When it happens on the last + // code page, it's more appropriate to check the last byte against the virtual region. + assert!(virtual_region.contains(&last_byte_to_free)); + + let mut mutable = self.mutable.borrow_mut(); + mutable.allocator.mark_unused(start_ptr.raw_ptr(self), size); } +} - pub fn into_usize(self) -> usize { - let CodePtr(ptr) = self; - ptr.as_ptr() as usize - } +/// Something that could provide a base pointer to compute a raw pointer from a [CodePtr]. 
+pub trait CodePtrBase { + fn base_ptr(&self) -> NonNull<u8>; } -impl From<*mut u8> for CodePtr { - fn from(value: *mut u8) -> Self { - assert!(value as usize != 0); - return CodePtr(NonNull::new(value).unwrap()); +impl<A: Allocator> CodePtrBase for VirtualMemory<A> { + fn base_ptr(&self) -> NonNull<u8> { + self.region_start } } @@ -254,15 +319,15 @@ mod sys { impl super::Allocator for SystemAllocator { fn mark_writable(&mut self, ptr: *const u8, size: u32) -> bool { - unsafe { rb_yjit_mark_writable(ptr as VoidPtr, size) } + unsafe { rb_jit_mark_writable(ptr as VoidPtr, size) } } fn mark_executable(&mut self, ptr: *const u8, size: u32) { - unsafe { rb_yjit_mark_executable(ptr as VoidPtr, size) } + unsafe { rb_jit_mark_executable(ptr as VoidPtr, size) } } fn mark_unused(&mut self, ptr: *const u8, size: u32) -> bool { - unsafe { rb_yjit_mark_unused(ptr as VoidPtr, size) } + unsafe { rb_jit_mark_unused(ptr as VoidPtr, size) } } } } @@ -349,17 +414,18 @@ pub mod tests { PAGE_SIZE.try_into().unwrap(), NonNull::new(mem_start as *mut u8).unwrap(), mem_size, + get_option!(mem_size), ) } #[test] #[cfg(target_arch = "x86_64")] fn new_memory_is_initialized() { - let mut virt = new_dummy_virt_mem(); + let virt = new_dummy_virt_mem(); virt.write_byte(virt.start_ptr(), 1).unwrap(); assert!( - virt.allocator.memory[..PAGE_SIZE].iter().all(|&byte| byte != 0), + virt.mutable.borrow().allocator.memory[..PAGE_SIZE].iter().all(|&byte| byte != 0), "Entire page should be initialized", ); @@ -367,21 +433,21 @@ pub mod tests { let three_pages = 3 * PAGE_SIZE; virt.write_byte(virt.start_ptr().add_bytes(three_pages), 1).unwrap(); assert!( - virt.allocator.memory[..three_pages].iter().all(|&byte| byte != 0), + virt.mutable.borrow().allocator.memory[..three_pages].iter().all(|&byte| byte != 0), "Gaps between write requests should be filled", ); } #[test] fn no_redundant_syscalls_when_writing_to_the_same_page() { - let mut virt = new_dummy_virt_mem(); + let virt = new_dummy_virt_mem(); virt.write_byte(virt.start_ptr(), 1).unwrap(); virt.write_byte(virt.start_ptr(), 0).unwrap(); assert!( matches!( - virt.allocator.requests[..], + virt.mutable.borrow().allocator.requests[..], [MarkWritable { start_idx: 0, length: PAGE_SIZE }], ) ); @@ -390,12 +456,12 @@ pub mod tests { #[test] fn bounds_checking() { use super::WriteError::*; - let mut virt = new_dummy_virt_mem(); + let virt = new_dummy_virt_mem(); let one_past_end = virt.start_ptr().add_bytes(virt.virtual_region_size()); assert_eq!(Err(OutOfBounds), virt.write_byte(one_past_end, 0)); - let end_of_addr_space = CodePtr(NonNull::new(usize::MAX as _).unwrap()); + let end_of_addr_space = CodePtr(u32::MAX); assert_eq!(Err(OutOfBounds), virt.write_byte(end_of_addr_space, 0)); } @@ -403,7 +469,7 @@ pub mod tests { fn only_written_to_regions_become_executable() { // ... 
so we catch attempts to read/write/execute never-written-to regions const THREE_PAGES: usize = PAGE_SIZE * 3; - let mut virt = new_dummy_virt_mem(); + let virt = new_dummy_virt_mem(); let page_two_start = virt.start_ptr().add_bytes(PAGE_SIZE * 2); virt.write_byte(page_two_start, 1).unwrap(); virt.mark_all_executable(); @@ -411,7 +477,7 @@ pub mod tests { assert!(virt.virtual_region_size() > THREE_PAGES); assert!( matches!( - virt.allocator.requests[..], + virt.mutable.borrow().allocator.requests[..], [ MarkWritable { start_idx: 0, length: THREE_PAGES }, MarkExecutable { start_idx: 0, length: THREE_PAGES }, diff --git a/yjit/src/yjit.rs b/yjit/src/yjit.rs index 4850dca7a8..517a0daae5 100644 --- a/yjit/src/yjit.rs +++ b/yjit/src/yjit.rs @@ -4,77 +4,171 @@ use crate::cruby::*; use crate::invariants::*; use crate::options::*; use crate::stats::YjitExitLocations; +use crate::stats::incr_counter; +use crate::stats::with_compile_time; -use std::os::raw; -use std::sync::atomic::{AtomicBool, Ordering}; +use std::os::raw::{c_char, c_int}; +use std::time::Instant; +use crate::log::Log; -/// For tracking whether the user enabled YJIT through command line arguments or environment -/// variables. AtomicBool to avoid `unsafe`. On x86 it compiles to simple movs. -/// See <https://doc.rust-lang.org/std/sync/atomic/enum.Ordering.html> -/// See [rb_yjit_enabled_p] -static YJIT_ENABLED: AtomicBool = AtomicBool::new(false); +/// Is YJIT on? The interpreter uses this variable to decide whether to trigger +/// compilation. See jit_exec() and jit_compile(). +#[allow(non_upper_case_globals)] +#[no_mangle] +pub static mut rb_yjit_enabled_p: bool = false; + +// Time when YJIT was yjit was initialized (see yjit_init) +pub static mut YJIT_INIT_TIME: Option<Instant> = None; /// Parse one command-line option. /// This is called from ruby.c #[no_mangle] -pub extern "C" fn rb_yjit_parse_option(str_ptr: *const raw::c_char) -> bool { +pub extern "C" fn rb_yjit_parse_option(str_ptr: *const c_char) -> bool { return parse_option(str_ptr).is_some(); } -/// Is YJIT on? The interpreter uses this function to decide whether to increment -/// ISEQ call counters. See jit_exec(). -/// This is used frequently since it's used on every method call in the interpreter. #[no_mangle] -pub extern "C" fn rb_yjit_enabled_p() -> raw::c_int { - // Note that we might want to call this function from signal handlers so - // might need to ensure signal-safety(7). - YJIT_ENABLED.load(Ordering::Acquire).into() +pub extern "C" fn rb_yjit_option_disable() -> bool { + return get_option!(disable); } /// Like rb_yjit_enabled_p, but for Rust code. pub fn yjit_enabled_p() -> bool { - YJIT_ENABLED.load(Ordering::Acquire) + unsafe { rb_yjit_enabled_p } } -/// After how many calls YJIT starts compiling a method +/// This function is called from C code #[no_mangle] -pub extern "C" fn rb_yjit_call_threshold() -> raw::c_uint { - get_option!(call_threshold) as raw::c_uint +pub extern "C" fn rb_yjit_init(yjit_enabled: bool) { + // Register the method codegen functions. This must be done at boot. + yjit_reg_method_codegen_fns(); + + // If --yjit-disable, yjit_init() will not be called until RubyVM::YJIT.enable. + if yjit_enabled { + yjit_init(); + } } -/// This function is called from C code -#[no_mangle] -pub extern "C" fn rb_yjit_init_rust() { +/// Initialize and enable YJIT. You should call this at boot or with GVL. 
+fn yjit_init() { // TODO: need to make sure that command-line options have been // initialized by CRuby + // Call YJIT hooks before enabling YJIT to avoid compiling the hooks themselves + unsafe { + let yjit = rb_const_get(rb_cRubyVM, rust_str_to_id("YJIT")); + rb_funcall(yjit, rust_str_to_id("call_jit_hooks"), 0); + } + // Catch panics to avoid UB for unwinding into C frames. // See https://doc.rust-lang.org/nomicon/exception-safety.html - // TODO: set a panic handler so the we don't print a message - // everytime we panic. let result = std::panic::catch_unwind(|| { Invariants::init(); CodegenGlobals::init(); YjitExitLocations::init(); + ids::init(); + + rb_bug_panic_hook(); // YJIT enabled and initialized successfully - YJIT_ENABLED.store(true, Ordering::Release); + assert!(unsafe{ !rb_yjit_enabled_p }); + unsafe { rb_yjit_enabled_p = true; } }); if let Err(_) = result { - println!("YJIT: rb_yjit_init_rust() panicked. Aborting."); + println!("YJIT: yjit_init() panicked. Aborting."); std::process::abort(); } + + // Make sure --yjit-perf doesn't append symbols to an old file + if get_option!(perf_map).is_some() { + let perf_map = format!("/tmp/perf-{}.map", std::process::id()); + let _ = std::fs::remove_file(&perf_map); + println!("YJIT perf map: {perf_map}"); + } + + // Note the time when YJIT was initialized + unsafe { + YJIT_INIT_TIME = Some(Instant::now()); + } +} + +#[no_mangle] +pub extern "C" fn rb_yjit_free_at_exit() { + yjit_shutdown_free_codegen_table(); +} + +/// At the moment, we abort in all cases we panic. +/// To aid with getting diagnostics in the wild without requiring +/// people to set RUST_BACKTRACE=1, register a panic hook that crash using rb_bug(). +/// rb_bug() might not be as good at printing a call trace as Rust's stdlib, but +/// it dumps some other info that might be relevant. +/// +/// In case we want to start doing fancier exception handling with panic=unwind, +/// we can revisit this later. For now, this helps to get us good bug reports. +fn rb_bug_panic_hook() { + use std::env; + use std::panic; + use std::io::{stderr, Write}; + + // Probably the default hook. We do this very early during process boot. + let previous_hook = panic::take_hook(); + + panic::set_hook(Box::new(move |panic_info| { + // Not using `eprintln` to avoid double panic. + let _ = stderr().write_all(b"ruby: YJIT has panicked. More info to follow...\n"); + + // Always show a Rust backtrace. + env::set_var("RUST_BACKTRACE", "1"); + previous_hook(panic_info); + + // Abort with rb_bug(). It has a length limit on the message. + let panic_message = &format!("{}", panic_info)[..]; + let len = std::cmp::min(0x100, panic_message.len()) as c_int; + unsafe { rb_bug(b"YJIT: %*s\0".as_ref().as_ptr() as *const c_char, len, panic_message.as_ptr()); } + })); } /// Called from C code to begin compiling a function /// NOTE: this should be wrapped in RB_VM_LOCK_ENTER(), rb_vm_barrier() on the C side +/// If jit_exception is true, compile JIT code for handling exceptions. +/// See jit_compile_exception() for details. 
#[no_mangle] -pub extern "C" fn rb_yjit_iseq_gen_entry_point(iseq: IseqPtr, ec: EcPtr) -> *const u8 { - let maybe_code_ptr = gen_entry_point(iseq, ec); +pub extern "C" fn rb_yjit_iseq_gen_entry_point(iseq: IseqPtr, ec: EcPtr, jit_exception: bool) -> *const u8 { + // Don't compile when there is insufficient native stack space + if unsafe { rb_ec_stack_check(ec as _) } != 0 { + return std::ptr::null(); + } + + // Reject ISEQs with very large temp stacks, + // this will allow us to use u8/i8 values to track stack_size and sp_offset + let stack_max = unsafe { rb_get_iseq_body_stack_max(iseq) }; + if stack_max >= i8::MAX as u32 { + incr_counter!(iseq_stack_too_large); + return std::ptr::null(); + } + + // Reject ISEQs that are too long, + // this will allow us to use u16 for instruction indices if we want to, + // very long ISEQs are also much more likely to be initialization code + let iseq_size = unsafe { get_iseq_encoded_size(iseq) }; + if iseq_size >= u16::MAX as u32 { + incr_counter!(iseq_too_long); + return std::ptr::null(); + } + + // If a custom call threshold was not specified on the command-line and + // this is a large application (has very many ISEQs), switch to + // using the call threshold for large applications after this entry point + use crate::stats::rb_yjit_live_iseq_count; + if unsafe { rb_yjit_call_threshold } == SMALL_CALL_THRESHOLD && unsafe { rb_yjit_live_iseq_count } > LARGE_ISEQ_COUNT { + unsafe { rb_yjit_call_threshold = LARGE_CALL_THRESHOLD; }; + } + + let maybe_code_ptr = with_compile_time(|| { gen_entry_point(iseq, ec, jit_exception) }); match maybe_code_ptr { - Some(ptr) => ptr.raw_ptr(), + Some(ptr) => ptr, None => std::ptr::null(), } } @@ -86,11 +180,67 @@ pub extern "C" fn rb_yjit_code_gc(_ec: EcPtr, _ruby_self: VALUE) -> VALUE { return Qnil; } - let cb = CodegenGlobals::get_inline_cb(); - cb.code_gc(); + with_vm_lock(src_loc!(), || { + let cb = CodegenGlobals::get_inline_cb(); + let ocb = CodegenGlobals::get_outlined_cb(); + cb.code_gc(ocb); + }); + Qnil } +/// Enable YJIT compilation, returning true if YJIT was previously disabled +#[no_mangle] +pub extern "C" fn rb_yjit_enable(_ec: EcPtr, _ruby_self: VALUE, gen_stats: VALUE, print_stats: VALUE, gen_log: VALUE, print_log: VALUE, mem_size: VALUE, call_threshold: VALUE) -> VALUE { + with_vm_lock(src_loc!(), || { + + if !mem_size.nil_p() { + let mem_size_mb = mem_size.as_isize() >> 1; + let mem_size_bytes = mem_size_mb * 1024 * 1024; + unsafe { + OPTIONS.mem_size = mem_size_bytes as usize; + } + } + + if !call_threshold.nil_p() { + let threshold = call_threshold.as_isize() >> 1; + unsafe { + rb_yjit_call_threshold = threshold as u64; + } + } + + // Initialize and enable YJIT + if gen_stats.test() { + unsafe { + OPTIONS.gen_stats = gen_stats.test(); + OPTIONS.print_stats = print_stats.test(); + } + } + + if gen_log.test() { + unsafe { + if print_log.test() { + OPTIONS.log = Some(LogOutput::Stderr); + } else { + OPTIONS.log = Some(LogOutput::MemoryOnly); + } + + Log::init(); + } + } + + yjit_init(); + + // Add "+YJIT" to RUBY_DESCRIPTION + extern "C" { + fn ruby_set_yjit_description(); + } + unsafe { ruby_set_yjit_description(); } + + Qtrue + }) +} + /// Simulate a situation where we are out of executable memory #[no_mangle] pub extern "C" fn rb_yjit_simulate_oom_bang(_ec: EcPtr, _ruby_self: VALUE) -> VALUE { @@ -109,3 +259,19 @@ pub extern "C" fn rb_yjit_simulate_oom_bang(_ec: EcPtr, _ruby_self: VALUE) -> VA return Qnil; } + +/// Push a C method frame if the given PC is supposed to lazily push one. 
+/// This is called from rb_raise() (at rb_exc_new_str()) and other functions
+/// that may make a method call (e.g. rb_to_int()).
+#[no_mangle]
+pub extern "C" fn rb_yjit_lazy_push_frame(pc: *mut VALUE) {
+    if !yjit_enabled_p() {
+        return;
+    }
+
+    incr_counter!(num_lazy_frame_check);
+    if let Some(&(cme, recv_idx)) = CodegenGlobals::get_pc_to_cfunc().get(&pc) {
+        incr_counter!(num_lazy_frame_push);
+        unsafe { rb_vm_push_cfunc_frame(cme, recv_idx as i32) }
+    }
+}
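One yjit.rs pattern above is worth spelling out: `yjit_init()` wraps all Rust-side setup in `std::panic::catch_unwind` and aborts on failure, because letting a panic unwind into C frames is undefined behavior. A minimal sketch of that catch-at-the-boundary idea, with an invented `init_subsystems` standing in for the real `Invariants`/`CodegenGlobals` setup:

```rust
use std::panic;
use std::process;

/// Stand-in for the real initialization work that might panic.
fn init_subsystems() {
    // ... setup goes here ...
}

/// Called from a C entry point: never let a panic unwind past this function.
fn boundary_safe_init() {
    let result = panic::catch_unwind(|| {
        init_subsystems();
    });

    if result.is_err() {
        // Unwinding into C frames is undefined behavior, so abort instead.
        println!("init panicked, aborting");
        process::abort();
    }
}

fn main() {
    boundary_safe_init();
}
```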

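Finally, `rb_yjit_iseq_gen_entry_point` above wraps code generation in `with_compile_time()`, which times a closure and accumulates the elapsed nanoseconds into the `compile_time_ns` counter. A sketch of that idea in isolation, using an `AtomicU64` for the counter (YJIT itself bumps its generated `COUNTERS` struct under the VM lock) and an invented `timed` helper:

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::Instant;

/// Total time spent inside `timed`, in nanoseconds.
static TIME_SPENT_NS: AtomicU64 = AtomicU64::new(0);

/// Run `func`, add its wall-clock duration to the counter, and return its result.
fn timed<F, R>(func: F) -> R
where
    F: FnOnce() -> R,
{
    let start = Instant::now();
    let ret = func();
    TIME_SPENT_NS.fetch_add(start.elapsed().as_nanos() as u64, Ordering::Relaxed);
    ret
}

fn main() {
    let value = timed(|| (0..1_000u64).sum::<u64>());
    println!("sum = {value}, spent = {} ns", TIME_SPENT_NS.load(Ordering::Relaxed));
}
```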