Diffstat (limited to 'yjit/src/core.rs')
-rw-r--r-- | yjit/src/core.rs | 1248 |
1 file changed, 863 insertions, 385 deletions
diff --git a/yjit/src/core.rs b/yjit/src/core.rs index 4dd0a387d5..cd6e649aa0 100644 --- a/yjit/src/core.rs +++ b/yjit/src/core.rs @@ -18,13 +18,14 @@ use std::cell::*; use std::collections::HashSet; use std::fmt; use std::mem; +use std::mem::transmute; use std::ops::Range; use std::rc::Rc; use mem::MaybeUninit; use std::ptr; use ptr::NonNull; use YARVOpnd::*; -use TempMapping::*; +use TempMappingKind::*; use crate::invariants::*; // Maximum number of temp value types we keep track of @@ -39,8 +40,9 @@ pub type IseqIdx = u16; // Represent the type of a value (local/stack/self) in YJIT #[derive(Copy, Clone, Hash, PartialEq, Eq, Debug)] +#[repr(u8)] pub enum Type { - Unknown, + Unknown = 0, UnknownImm, UnknownHeap, Nil, @@ -48,19 +50,20 @@ pub enum Type { False, Fixnum, Flonum, - Hash, ImmSymbol, - #[allow(unused)] - HeapSymbol, - TString, // An object with the T_STRING flag set, possibly an rb_cString - CString, // An un-subclassed string of type rb_cString (can have instance vars in some cases) + CString, // An object that at one point had its class field equal rb_cString (creating a singleton class changes it) TArray, // An object with the T_ARRAY flag set, possibly an rb_cArray - CArray, // An un-subclassed string of type rb_cArray (can have instance vars in some cases) + CArray, // An object that at one point had its class field equal rb_cArray (creating a singleton class changes it) + THash, // An object with the T_HASH flag set, possibly an rb_cHash + CHash, // An object that at one point had its class field equal rb_cHash (creating a singleton class changes it) BlockParamProxy, // A special sentinel value indicating the block parameter should be read from // the current surrounding cfp + + // The context currently relies on types taking at most 4 bits (max value 15) + // to encode, so if we add any more, we will need to refactor the context. } // Default initialization @@ -93,12 +96,11 @@ impl Type { // Core.rs can't reference rb_cString because it's linked by Rust-only tests. // But CString vs TString is only an optimisation and shouldn't affect correctness. #[cfg(not(test))] - if val.class_of() == unsafe { rb_cString } { - return Type::CString; - } - #[cfg(not(test))] - if val.class_of() == unsafe { rb_cArray } { - return Type::CArray; + match val.class_of() { + class if class == unsafe { rb_cArray } => return Type::CArray, + class if class == unsafe { rb_cHash } => return Type::CHash, + class if class == unsafe { rb_cString } => return Type::CString, + _ => {} } // We likewise can't reference rb_block_param_proxy, but it's again an optimisation; // we can just treat it as a normal Object. 
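
Note: the added comment says the context relies on Type fitting in 4 bits (max value 15). A standalone sketch (not the YJIT sources) of what that constraint buys: a #[repr(u8)] enum whose discriminants stay at or below 15 can be packed into a nibble and decoded back with a transmute guarded by a range check. The Ty enum and its variants here are illustrative stand-ins.

    // Standalone sketch: a #[repr(u8)] enum small enough to pack into 4 bits.
    #[derive(Copy, Clone, PartialEq, Debug)]
    #[repr(u8)]
    enum Ty {
        Unknown = 0,
        Nil,
        Fixnum,
        TArray,
        CArray,
        THash,
        CHash,
        TString,
        CString, // last discriminant; must stay <= 15 for nibble packing
    }

    // Compile-time check that every discriminant fits in 4 bits.
    const _: () = assert!(Ty::CString as u8 <= 0b1111);

    fn decode(bits: u8) -> Ty {
        assert!(bits <= Ty::CString as u8);
        // Sound because Ty is #[repr(u8)] and `bits` is a valid discriminant.
        unsafe { std::mem::transmute::<u8, Ty>(bits) }
    }

    fn main() {
        let bits = Ty::CHash as u8;
        assert_eq!(decode(bits), Ty::CHash);
    }
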
@@ -108,7 +110,7 @@ impl Type { } match val.builtin_type() { RUBY_T_ARRAY => Type::TArray, - RUBY_T_HASH => Type::Hash, + RUBY_T_HASH => Type::THash, RUBY_T_STRING => Type::TString, _ => Type::UnknownHeap, } @@ -150,8 +152,8 @@ impl Type { Type::UnknownHeap => true, Type::TArray => true, Type::CArray => true, - Type::Hash => true, - Type::HeapSymbol => true, + Type::THash => true, + Type::CHash => true, Type::TString => true, Type::CString => true, Type::BlockParamProxy => true, @@ -161,20 +163,17 @@ impl Type { /// Check if it's a T_ARRAY object (both TArray and CArray are T_ARRAY) pub fn is_array(&self) -> bool { - match self { - Type::TArray => true, - Type::CArray => true, - _ => false, - } + matches!(self, Type::TArray | Type::CArray) + } + + /// Check if it's a T_HASH object (both THash and CHash are T_HASH) + pub fn is_hash(&self) -> bool { + matches!(self, Type::THash | Type::CHash) } /// Check if it's a T_STRING object (both TString and CString are T_STRING) pub fn is_string(&self) -> bool { - match self { - Type::TString => true, - Type::CString => true, - _ => false, - } + matches!(self, Type::TString | Type::CString) } /// Returns an Option with the T_ value type if it is known, otherwise None @@ -186,8 +185,8 @@ impl Type { Type::Fixnum => Some(RUBY_T_FIXNUM), Type::Flonum => Some(RUBY_T_FLOAT), Type::TArray | Type::CArray => Some(RUBY_T_ARRAY), - Type::Hash => Some(RUBY_T_HASH), - Type::ImmSymbol | Type::HeapSymbol => Some(RUBY_T_SYMBOL), + Type::THash | Type::CHash => Some(RUBY_T_HASH), + Type::ImmSymbol => Some(RUBY_T_SYMBOL), Type::TString | Type::CString => Some(RUBY_T_STRING), Type::Unknown | Type::UnknownImm | Type::UnknownHeap => None, Type::BlockParamProxy => None, @@ -203,9 +202,10 @@ impl Type { Type::False => Some(rb_cFalseClass), Type::Fixnum => Some(rb_cInteger), Type::Flonum => Some(rb_cFloat), - Type::ImmSymbol | Type::HeapSymbol => Some(rb_cSymbol), - Type::CString => Some(rb_cString), + Type::ImmSymbol => Some(rb_cSymbol), Type::CArray => Some(rb_cArray), + Type::CHash => Some(rb_cHash), + Type::CString => Some(rb_cString), _ => None, } } @@ -255,13 +255,18 @@ impl Type { return TypeDiff::Compatible(1); } - // A CString is also a TString. - if self == Type::CString && dst == Type::TString { + // A CArray is also a TArray. + if self == Type::CArray && dst == Type::TArray { return TypeDiff::Compatible(1); } - // A CArray is also a TArray. - if self == Type::CArray && dst == Type::TArray { + // A CHash is also a THash. + if self == Type::CHash && dst == Type::THash { + return TypeDiff::Compatible(1); + } + + // A CString is also a TString. + if self == Type::CString && dst == Type::TString { return TypeDiff::Compatible(1); } @@ -296,63 +301,92 @@ pub enum TypeDiff { Incompatible, } -// Potential mapping of a value on the temporary stack to -// self, a local variable or constant so that we can track its type #[derive(Copy, Clone, Eq, Hash, PartialEq, Debug)] -pub enum TempMapping { - MapToStack, // Normal stack value - MapToSelf, // Temp maps to the self operand - MapToLocal(LocalIndex), // Temp maps to a local variable with index - //ConstMapping, // Small constant (0, 1, 2, Qnil, Qfalse, Qtrue) +#[repr(u8)] +pub enum TempMappingKind +{ + MapToStack = 0, + MapToSelf = 1, + MapToLocal = 2, } -// Index used by MapToLocal. Using this instead of u8 makes TempMapping 1 byte. 
+// Potential mapping of a value on the temporary stack to +// self, a local variable or constant so that we can track its type +// +// The highest two bits represent TempMappingKind, and the rest of +// the bits are used differently across different kinds. +// * MapToStack: The lowest 5 bits are used for mapping Type. +// * MapToSelf: The remaining bits are not used; the type is stored in self_type. +// * MapToLocal: The lowest 3 bits store the index of a local variable. #[derive(Copy, Clone, Eq, Hash, PartialEq, Debug)] -pub enum LocalIndex { - Local0, - Local1, - Local2, - Local3, - Local4, - Local5, - Local6, - Local7, -} +pub struct TempMapping(u8); -impl From<LocalIndex> for u8 { - fn from(idx: LocalIndex) -> Self { - match idx { - LocalIndex::Local0 => 0, - LocalIndex::Local1 => 1, - LocalIndex::Local2 => 2, - LocalIndex::Local3 => 3, - LocalIndex::Local4 => 4, - LocalIndex::Local5 => 5, - LocalIndex::Local6 => 6, - LocalIndex::Local7 => 7, - } +impl TempMapping { + pub fn map_to_stack(t: Type) -> TempMapping + { + let kind_bits = TempMappingKind::MapToStack as u8; + let type_bits = t as u8; + assert!(type_bits <= 0b11111); + let bits = (kind_bits << 6) | (type_bits & 0b11111); + TempMapping(bits) + } + + pub fn map_to_self() -> TempMapping + { + let kind_bits = TempMappingKind::MapToSelf as u8; + let bits = kind_bits << 6; + TempMapping(bits) } -} -impl From<u8> for LocalIndex { - fn from(idx: u8) -> Self { - match idx { - 0 => LocalIndex::Local0, - 1 => LocalIndex::Local1, - 2 => LocalIndex::Local2, - 3 => LocalIndex::Local3, - 4 => LocalIndex::Local4, - 5 => LocalIndex::Local5, - 6 => LocalIndex::Local6, - 7 => LocalIndex::Local7, - _ => unreachable!("{idx} was larger than {MAX_LOCAL_TYPES}"), + pub fn map_to_local(local_idx: u8) -> TempMapping + { + let kind_bits = TempMappingKind::MapToLocal as u8; + assert!(local_idx <= 0b111); + let bits = (kind_bits << 6) | (local_idx & 0b111); + TempMapping(bits) + } + + pub fn without_type(&self) -> TempMapping + { + if self.get_kind() != TempMappingKind::MapToStack { + return *self; } + + TempMapping::map_to_stack(Type::Unknown) + } + + pub fn get_kind(&self) -> TempMappingKind + { + // Take the two highest bits + let TempMapping(bits) = self; + let kind_bits = bits >> 6; + assert!(kind_bits <= 2); + unsafe { transmute::<u8, TempMappingKind>(kind_bits) } + } + + pub fn get_type(&self) -> Type + { + assert!(self.get_kind() == TempMappingKind::MapToStack); + + // Take the 5 lowest bits + let TempMapping(bits) = self; + let type_bits = bits & 0b11111; + unsafe { transmute::<u8, Type>(type_bits) } + } + + pub fn get_local_idx(&self) -> u8 + { + assert!(self.get_kind() == TempMappingKind::MapToLocal); + + // Take the 3 lowest bits + let TempMapping(bits) = self; + bits & 0b111 } } impl Default for TempMapping { fn default() -> Self { - MapToStack + TempMapping::map_to_stack(Type::Unknown) } } @@ -403,21 +437,27 @@ impl RegTemps { /// Return true if there's a register that conflicts with a given stack_idx. 
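
Note: a minimal sketch of the one-byte encoding described in the new comment above (kind in the top two bits, a 5-bit type or 3-bit local index in the low bits). Names here are illustrative, not the actual TempMapping API.

    #[derive(Copy, Clone, PartialEq, Debug)]
    enum Kind { Stack, SelfOpnd, Local }

    #[derive(Copy, Clone, PartialEq, Debug)]
    struct Mapping(u8);

    impl Mapping {
        fn stack(type_bits: u8) -> Self {
            assert!(type_bits <= 0b11111);
            Mapping((0 << 6) | type_bits) // kind 0 = Stack
        }
        fn local(idx: u8) -> Self {
            assert!(idx <= 0b111);
            Mapping((2 << 6) | idx) // kind 2 = Local
        }
        fn kind(self) -> Kind {
            match self.0 >> 6 {
                0 => Kind::Stack,
                1 => Kind::SelfOpnd,
                2 => Kind::Local,
                _ => unreachable!(),
            }
        }
        fn local_idx(self) -> u8 {
            assert!(self.kind() == Kind::Local);
            self.0 & 0b111
        }
    }

    fn main() {
        let m = Mapping::local(5);
        assert_eq!(m.kind(), Kind::Local);
        assert_eq!(m.local_idx(), 5);
        assert_eq!(Mapping::stack(0b10101).kind(), Kind::Stack);
        assert_eq!(std::mem::size_of::<Mapping>(), 1); // the whole mapping is one byte
    }
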
pub fn conflicts_with(&self, stack_idx: u8) -> bool { - let mut other_idx = stack_idx as isize - get_option!(num_temp_regs) as isize; - while other_idx >= 0 { - if self.get(other_idx as u8) { + let mut other_idx = stack_idx as usize % get_option!(num_temp_regs); + while other_idx < MAX_REG_TEMPS as usize { + if stack_idx as usize != other_idx && self.get(other_idx as u8) { return true; } - other_idx -= get_option!(num_temp_regs) as isize; + other_idx += get_option!(num_temp_regs); } false } } +/// Bits for chain_depth_return_landing_defer +const RETURN_LANDING_BIT: u8 = 0b10000000; +const DEFER_BIT: u8 = 0b01000000; +const CHAIN_DEPTH_MASK: u8 = 0b00111111; // 63 + /// Code generation context /// Contains information we can use to specialize/optimize code /// There are a lot of context objects so we try to keep the size small. -#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, Debug)] +#[derive(Copy, Clone, Default, Eq, Hash, PartialEq, Debug)] +#[repr(packed)] pub struct Context { // Number of values currently on the temporary stack stack_size: u8, @@ -429,20 +469,33 @@ pub struct Context { /// Bitmap of which stack temps are in a register reg_temps: RegTemps, - // Depth of this block in the sidechain (eg: inline-cache chain) - chain_depth: u8, + /// Fields packed into u8 + /// - 1st bit from the left: Whether this code is the target of a JIT-to-JIT Ruby return ([Self::is_return_landing]) + /// - 2nd bit from the left: Whether the compilation of this code has been deferred ([Self::is_deferred]) + /// - Last 6 bits (max: 63): Depth of this block in the sidechain (eg: inline-cache chain) + chain_depth_and_flags: u8, + + // Type we track for self + self_type: Type, // Local variable types we keep track of - local_types: [Type; MAX_LOCAL_TYPES], + // We store 8 local types, requiring 4 bits each, for a total of 32 bits + local_types: u32, - // Temporary variable types we keep track of - temp_types: [Type; MAX_TEMP_TYPES], + // Temp mapping kinds we track + // 8 temp mappings * 2 bits, total 16 bits + temp_mapping_kind: u16, - // Type we track for self - self_type: Type, + // Stack slot type/local_idx we track + // 8 temp types * 4 bits, total 32 bits + temp_payload: u32, - // Mapping of temp stack entries to types we track - temp_mapping: [TempMapping; MAX_TEMP_TYPES], + /// A pointer to a block ISEQ supplied by the caller. 0 if not inlined. + /// Not using IseqPtr to satisfy Default trait, and not using Option for #[repr(packed)] + /// TODO: This could be u16 if we have a global or per-ISEQ HashMap to convert IseqPtr + /// to serial indexes. We're thinking of overhauling Context structure in Ruby 3.4 which + /// could allow this to consume no bytes, so we're leaving this as is. 
+ inline_block: u64, } /// Tuple of (iseq, idx) used to identify basic blocks @@ -474,6 +527,8 @@ pub enum BranchGenFn { JNZToTarget0, JZToTarget0, JBEToTarget0, + JBToTarget0, + JOMulToTarget0, JITReturn, } @@ -485,8 +540,8 @@ impl BranchGenFn { BranchShape::Next0 => asm.jz(target1.unwrap()), BranchShape::Next1 => asm.jnz(target0), BranchShape::Default => { - asm.jnz(target0.into()); - asm.jmp(target1.unwrap().into()); + asm.jnz(target0); + asm.jmp(target1.unwrap()); } } } @@ -515,11 +570,11 @@ impl BranchGenFn { panic!("Branch shape Next1 not allowed in JumpToTarget0!"); } if shape.get() == BranchShape::Default { - asm.jmp(target0.into()); + asm.jmp(target0); } } BranchGenFn::JNZToTarget0 => { - asm.jnz(target0.into()) + asm.jnz(target0) } BranchGenFn::JZToTarget0 => { asm.jz(target0) @@ -527,9 +582,17 @@ impl BranchGenFn { BranchGenFn::JBEToTarget0 => { asm.jbe(target0) } + BranchGenFn::JBToTarget0 => { + asm.jb(target0) + } + BranchGenFn::JOMulToTarget0 => { + asm.jo_mul(target0) + } BranchGenFn::JITReturn => { - asm.comment("update cfp->jit_return"); - asm.mov(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_JIT_RETURN), Opnd::const_ptr(target0.unwrap_code_ptr().raw_ptr())); + asm_comment!(asm, "update cfp->jit_return"); + let jit_return = RUBY_OFFSET_CFP_JIT_RETURN - RUBY_SIZEOF_CONTROL_FRAME as i32; + let raw_ptr = asm.lea_jump_target(target0); + asm.mov(Opnd::mem(64, CFP, jit_return), raw_ptr); } } } @@ -543,6 +606,8 @@ impl BranchGenFn { BranchGenFn::JNZToTarget0 | BranchGenFn::JZToTarget0 | BranchGenFn::JBEToTarget0 | + BranchGenFn::JBToTarget0 | + BranchGenFn::JOMulToTarget0 | BranchGenFn::JITReturn => BranchShape::Default, } } @@ -563,6 +628,8 @@ impl BranchGenFn { BranchGenFn::JNZToTarget0 | BranchGenFn::JZToTarget0 | BranchGenFn::JBEToTarget0 | + BranchGenFn::JBToTarget0 | + BranchGenFn::JOMulToTarget0 | BranchGenFn::JITReturn => { assert_eq!(new_shape, BranchShape::Default); } @@ -594,8 +661,8 @@ impl BranchTarget { fn get_ctx(&self) -> Context { match self { - BranchTarget::Stub(stub) => stub.ctx.clone(), - BranchTarget::Block(blockref) => unsafe { blockref.as_ref() }.ctx.clone(), + BranchTarget::Stub(stub) => stub.ctx, + BranchTarget::Block(blockref) => unsafe { blockref.as_ref() }.ctx, } } @@ -660,7 +727,7 @@ pub struct PendingBranch { impl Branch { // Compute the size of the branch code fn code_size(&self) -> usize { - (self.end_addr.get().raw_ptr() as usize) - (self.start_addr.raw_ptr() as usize) + (self.end_addr.get().as_offset() - self.start_addr.as_offset()) as usize } /// Get the address of one of the branch destination @@ -752,7 +819,7 @@ impl PendingBranch { address: Some(stub_addr), iseq: Cell::new(target.iseq), iseq_idx: target.idx, - ctx: ctx.clone(), + ctx: *ctx, }))))); } @@ -937,7 +1004,6 @@ impl fmt::Debug for MutableBranchList { } } - /// This is all the data YJIT stores on an iseq /// This will be dynamically allocated by C code /// C code should pass an &mut IseqPayload to us @@ -1050,23 +1116,34 @@ pub fn for_each_on_stack_iseq_payload<F: FnMut(&IseqPayload)>(mut callback: F) { /// Iterate over all NOT on-stack ISEQ payloads pub fn for_each_off_stack_iseq_payload<F: FnMut(&mut IseqPayload)>(mut callback: F) { - let mut on_stack_iseqs: Vec<IseqPtr> = vec![]; - for_each_on_stack_iseq(|iseq| { - on_stack_iseqs.push(iseq); - }); - for_each_iseq(|iseq| { + // Get all ISEQs on the heap. Note that rb_objspace_each_objects() runs GC first, + // which could move ISEQ pointers when GC.auto_compact = true. 
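
Note: the reworked Context is #[repr(packed)], so every field contributes exactly its own size and the u64 inline_block doesn't force alignment padding. A rough sketch of that layout (field types mirror the diff, but this struct is only an illustration, not the real Context):

    // With #[repr(packed)] the struct is byte-for-byte the sum of its fields.
    #[allow(dead_code)]
    #[repr(packed)]
    struct PackedCtx {
        stack_size: u8,
        sp_offset: i8,
        reg_temps: u8,              // stands in for the RegTemps bitmap
        chain_depth_and_flags: u8,
        self_type: u8,              // stands in for Type
        local_types: u32,           // 8 locals x 4 bits
        temp_mapping_kind: u16,     // 8 temps x 2 bits
        temp_payload: u32,          // 8 temps x 4 bits
        inline_block: u64,
    }

    fn main() {
        // 5 * 1 + 4 + 2 + 4 + 8 = 23 bytes, with no padding in between.
        assert_eq!(std::mem::size_of::<PackedCtx>(), 23);
    }
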
+ // So for_each_on_stack_iseq() must be called after this, which doesn't run GC. + let mut iseqs: Vec<IseqPtr> = vec![]; + for_each_iseq(|iseq| iseqs.push(iseq)); + + // Get all ISEQs that are on a CFP of existing ECs. + let mut on_stack_iseqs: HashSet<IseqPtr> = HashSet::new(); + for_each_on_stack_iseq(|iseq| { on_stack_iseqs.insert(iseq); }); + + // Invoke the callback for iseqs - on_stack_iseqs + for iseq in iseqs { if !on_stack_iseqs.contains(&iseq) { if let Some(iseq_payload) = get_iseq_payload(iseq) { callback(iseq_payload); } } - }) + } } /// Free the per-iseq payload #[no_mangle] -pub extern "C" fn rb_yjit_iseq_free(payload: *mut c_void) { +pub extern "C" fn rb_yjit_iseq_free(iseq: IseqPtr) { + // Free invariants for the ISEQ + iseq_free_invariants(iseq); + let payload = { + let payload = unsafe { rb_iseq_get_yjit_payload(iseq) }; if payload.is_null() { // Nothing to free. return; @@ -1103,7 +1180,7 @@ pub extern "C" fn rb_yjit_iseq_free(payload: *mut c_void) { incr_counter!(freed_iseq_count); } -/// GC callback for marking GC objects in the the per-iseq payload. +/// GC callback for marking GC objects in the per-iseq payload. #[no_mangle] pub extern "C" fn rb_yjit_iseq_mark(payload: *mut c_void) { let payload = if payload.is_null() { @@ -1129,30 +1206,54 @@ pub extern "C" fn rb_yjit_iseq_mark(payload: *mut c_void) { for block in versions { // SAFETY: all blocks inside version_map are initialized. let block = unsafe { block.as_ref() }; + mark_block(block, cb, false); + } + } + // Mark dead blocks, since there could be stubs pointing at them + for blockref in &payload.dead_blocks { + // SAFETY: dead blocks come from version_map, which only have initialized blocks + let block = unsafe { blockref.as_ref() }; + mark_block(block, cb, true); + } + + return; + + fn mark_block(block: &Block, cb: &CodeBlock, dead: bool) { + unsafe { rb_gc_mark_movable(block.iseq.get().into()) }; + + // Mark method entry dependencies + for cme_dep in block.cme_dependencies.iter() { + unsafe { rb_gc_mark_movable(cme_dep.get().into()) }; + } + + // Mark outgoing branch entries + for branch in block.outgoing.iter() { + let branch = unsafe { branch.as_ref() }; + for target in branch.targets.iter() { + // SAFETY: no mutation inside unsafe + let target_iseq = unsafe { + target.ref_unchecked().as_ref().and_then(|target| { + // Avoid get_blockid() on blockref. Can be dangling on dead blocks, + // and the iseq housing the block already naturally handles it. + if target.get_block().is_some() { + None + } else { + Some(target.get_blockid().iseq) + } + }) + }; - unsafe { rb_gc_mark_movable(block.iseq.get().into()) }; - - // Mark method entry dependencies - for cme_dep in block.cme_dependencies.iter() { - unsafe { rb_gc_mark_movable(cme_dep.get().into()) }; - } - - // Mark outgoing branch entries - for branch in block.outgoing.iter() { - let branch = unsafe { branch.as_ref() }; - for target in branch.targets.iter() { - // SAFETY: no mutation inside unsafe - let target_iseq = unsafe { target.ref_unchecked().as_ref().map(|target| target.get_blockid().iseq) }; - - if let Some(target_iseq) = target_iseq { - unsafe { rb_gc_mark_movable(target_iseq.into()) }; - } + if let Some(target_iseq) = target_iseq { + unsafe { rb_gc_mark_movable(target_iseq.into()) }; } } + } - // Walk over references to objects in generated code. + // Mark references to objects in generated code. + // Skip for dead blocks since they shouldn't run. 
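
Note: the rewritten for_each_off_stack_iseq_payload() collects every ISEQ first, then builds a HashSet of the on-stack ones and visits the difference. A small sketch of that pattern with plain integers standing in for IseqPtr values:

    use std::collections::HashSet;

    fn main() {
        // All ISEQs found on the heap (collection order is preserved for the visit).
        let all: Vec<u32> = vec![1, 2, 3, 4, 5];
        // ISEQs currently on a control frame somewhere.
        let on_stack: HashSet<u32> = [2, 4].into_iter().collect();

        // Visit only the off-stack ones.
        let off_stack: Vec<u32> = all.into_iter()
            .filter(|iseq| !on_stack.contains(iseq))
            .collect();
        assert_eq!(off_stack, vec![1, 3, 5]);
    }
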
+ if !dead { for offset in block.gc_obj_offsets.iter() { - let value_address: *const u8 = cb.get_ptr(offset.as_usize()).raw_ptr(); + let value_address: *const u8 = cb.get_ptr(offset.as_usize()).raw_ptr(cb); // Creating an unaligned pointer is well defined unlike in C. let value_address = value_address as *const VALUE; @@ -1166,10 +1267,11 @@ pub extern "C" fn rb_yjit_iseq_mark(payload: *mut c_void) { } } -/// GC callback for updating GC objects in the the per-iseq payload. +/// GC callback for updating GC objects in the per-iseq payload. /// This is a mirror of [rb_yjit_iseq_mark]. #[no_mangle] -pub extern "C" fn rb_yjit_iseq_update_references(payload: *mut c_void) { +pub extern "C" fn rb_yjit_iseq_update_references(iseq: IseqPtr) { + let payload = unsafe { rb_iseq_get_yjit_payload(iseq) }; let payload = if payload.is_null() { // Nothing to update. return; @@ -1196,21 +1298,70 @@ pub extern "C" fn rb_yjit_iseq_update_references(payload: *mut c_void) { for version in versions { // SAFETY: all blocks inside version_map are initialized let block = unsafe { version.as_ref() }; + block_update_references(block, cb, false); + } + } + // Update dead blocks, since there could be stubs pointing at them + for blockref in &payload.dead_blocks { + // SAFETY: dead blocks come from version_map, which only have initialized blocks + let block = unsafe { blockref.as_ref() }; + block_update_references(block, cb, true); + } - block.iseq.set(unsafe { rb_gc_location(block.iseq.get().into()) }.as_iseq()); + // Note that we would have returned already if YJIT is off. + cb.mark_all_executable(); - // Update method entry dependencies - for cme_dep in block.cme_dependencies.iter() { - let cur_cme: VALUE = cme_dep.get().into(); - let new_cme = unsafe { rb_gc_location(cur_cme) }.as_cme(); - cme_dep.set(new_cme); + CodegenGlobals::get_outlined_cb() + .unwrap() + .mark_all_executable(); + + return; + + fn block_update_references(block: &Block, cb: &mut CodeBlock, dead: bool) { + block.iseq.set(unsafe { rb_gc_location(block.iseq.get().into()) }.as_iseq()); + + // Update method entry dependencies + for cme_dep in block.cme_dependencies.iter() { + let cur_cme: VALUE = cme_dep.get().into(); + let new_cme = unsafe { rb_gc_location(cur_cme) }.as_cme(); + cme_dep.set(new_cme); + } + + // Update outgoing branch entries + for branch in block.outgoing.iter() { + let branch = unsafe { branch.as_ref() }; + for target in branch.targets.iter() { + // SAFETY: no mutation inside unsafe + let current_iseq = unsafe { + target.ref_unchecked().as_ref().and_then(|target| { + // Avoid get_blockid() on blockref. Can be dangling on dead blocks, + // and the iseq housing the block already naturally handles it. + if target.get_block().is_some() { + None + } else { + Some(target.get_blockid().iseq) + } + }) + }; + + if let Some(current_iseq) = current_iseq { + let updated_iseq = unsafe { rb_gc_location(current_iseq.into()) } + .as_iseq(); + // SAFETY: the Cell::set is not on the reference given out + // by ref_unchecked. + unsafe { target.ref_unchecked().as_ref().unwrap().set_iseq(updated_iseq) }; + } } + } - // Walk over references to objects in generated code. + // Update references to objects in generated code. 
+ // Skip for dead blocks since they shouldn't run and + // so there is no potential of writing over invalidation jumps + if !dead { for offset in block.gc_obj_offsets.iter() { let offset_to_value = offset.as_usize(); let value_code_ptr = cb.get_ptr(offset_to_value); - let value_ptr: *const u8 = value_code_ptr.raw_ptr(); + let value_ptr: *const u8 = value_code_ptr.raw_ptr(cb); // Creating an unaligned pointer is well defined unlike in C. let value_ptr = value_ptr as *mut VALUE; @@ -1227,32 +1378,9 @@ pub extern "C" fn rb_yjit_iseq_update_references(payload: *mut c_void) { } } } - - // Update outgoing branch entries - for branch in block.outgoing.iter() { - let branch = unsafe { branch.as_ref() }; - for target in branch.targets.iter() { - // SAFETY: no mutation inside unsafe - let current_iseq = unsafe { target.ref_unchecked().as_ref().map(|target| target.get_blockid().iseq) }; - - if let Some(current_iseq) = current_iseq { - let updated_iseq = unsafe { rb_gc_location(current_iseq.into()) } - .as_iseq(); - // SAFETY: the Cell::set is not on the reference given out - // by ref_unchecked. - unsafe { target.ref_unchecked().as_ref().unwrap().set_iseq(updated_iseq) }; - } - } - } } - } - // Note that we would have returned already if YJIT is off. - cb.mark_all_executable(); - - CodegenGlobals::get_outlined_cb() - .unwrap() - .mark_all_executable(); + } } /// Get all blocks for a particular place in an iseq. @@ -1293,14 +1421,19 @@ pub fn take_version_list(blockid: BlockId) -> VersionList { } /// Count the number of block versions matching a given blockid -fn get_num_versions(blockid: BlockId) -> usize { +/// `inlined: true` counts inlined versions, and `inlined: false` counts other versions. +fn get_num_versions(blockid: BlockId, inlined: bool) -> usize { let insn_idx = blockid.idx.as_usize(); match get_iseq_payload(blockid.iseq) { Some(payload) => { payload .version_map .get(insn_idx) - .map(|versions| versions.len()) + .map(|versions| { + versions.iter().filter(|&&version| + unsafe { version.as_ref() }.ctx.inline() == inlined + ).count() + }) .unwrap_or(0) } None => 0, @@ -1355,41 +1488,54 @@ fn find_block_version(blockid: BlockId, ctx: &Context) -> Option<BlockRef> { } } - // If greedy versioning is enabled - if get_option!(greedy_versioning) { - // If we're below the version limit, don't settle for an imperfect match - if versions.len() + 1 < get_option!(max_versions) && best_diff > 0 { - return None; - } - } - return best_version; } +/// Allow inlining a Block up to MAX_INLINE_VERSIONS times. +const MAX_INLINE_VERSIONS: usize = 1000; + /// Produce a generic context when the block version limit is hit for a blockid pub fn limit_block_versions(blockid: BlockId, ctx: &Context) -> Context { // Guard chains implement limits separately, do nothing - if ctx.chain_depth > 0 { - return ctx.clone(); + if ctx.get_chain_depth() > 0 { + return *ctx; } + let next_versions = get_num_versions(blockid, ctx.inline()) + 1; + let max_versions = if ctx.inline() { + MAX_INLINE_VERSIONS + } else { + get_option!(max_versions) + }; + // If this block version we're about to add will hit the version limit - if get_num_versions(blockid) + 1 >= get_option!(max_versions) { + if next_versions >= max_versions { // Produce a generic context that stores no type information, // but still respects the stack_size and sp_offset constraints. // This new context will then match all future requests. 
let generic_ctx = ctx.get_generic_ctx(); - debug_assert_ne!( - TypeDiff::Incompatible, - ctx.diff(&generic_ctx), - "should substitute a compatible context", - ); + if cfg!(debug_assertions) { + let mut ctx = ctx.clone(); + if ctx.inline() { + // Suppress TypeDiff::Incompatible from ctx.diff(). We return TypeDiff::Incompatible + // to keep inlining blocks until we hit the limit, but it's safe to give up inlining. + ctx.inline_block = 0; + assert!(generic_ctx.inline_block == 0); + } + + assert_ne!( + TypeDiff::Incompatible, + ctx.diff(&generic_ctx), + "should substitute a compatible context", + ); + } return generic_ctx; } + incr_counter_to!(max_inline_versions, next_versions); - return ctx.clone(); + return *ctx; } /// Install a block version into its [IseqPayload], letting the GC track its @@ -1436,7 +1582,7 @@ unsafe fn add_block_version(blockref: BlockRef, cb: &CodeBlock) { // Run write barriers for all objects in generated code. for offset in block.gc_obj_offsets.iter() { - let value_address: *const u8 = cb.get_ptr(offset.as_usize()).raw_ptr(); + let value_address: *const u8 = cb.get_ptr(offset.as_usize()).raw_ptr(cb); // Creating an unaligned pointer is well defined unlike in C. let value_address: *const VALUE = value_address.cast(); @@ -1513,6 +1659,12 @@ impl JITState { if let Some(idlist) = self.stable_constant_names_assumption { track_stable_constant_names_assumption(blockref, idlist); } + for klass in self.no_singleton_class_assumptions { + track_no_singleton_class_assumption(blockref, klass); + } + if self.no_ep_escape { + track_no_ep_escape_assumption(blockref, self.iseq); + } blockref } @@ -1558,7 +1710,7 @@ impl Block { // Compute the size of the block code pub fn code_size(&self) -> usize { - (self.end_addr.get().into_usize()) - (self.start_addr.into_usize()) + (self.end_addr.get().as_offset() - self.start_addr.as_offset()).try_into().unwrap() } } @@ -1567,12 +1719,22 @@ impl Context { self.stack_size } + pub fn set_stack_size(&mut self, stack_size: u8) { + self.stack_size = stack_size; + } + /// Create a new Context that is compatible with self but doesn't have type information. pub fn get_generic_ctx(&self) -> Context { let mut generic_ctx = Context::default(); generic_ctx.stack_size = self.stack_size; generic_ctx.sp_offset = self.sp_offset; generic_ctx.reg_temps = self.reg_temps; + if self.is_return_landing() { + generic_ctx.set_as_return_landing(); + } + if self.is_deferred() { + generic_ctx.mark_as_deferred(); + } generic_ctx } @@ -1580,7 +1742,7 @@ impl Context { /// accordingly. This is useful when you want to virtually rewind a stack_size for /// generating a side exit while considering past sp_offset changes on gen_save_sp. 
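
Note: limit_block_versions() now counts only versions in the same group (inlined vs. not) and falls back to a generic, type-free context once the next version would hit that group's limit. A sketch of that decision with stand-in types and an illustrative non-inlined limit:

    const MAX_VERSIONS: usize = 4;           // illustrative non-inlined limit
    const MAX_INLINE_VERSIONS: usize = 1000;

    // `existing` records whether each already-compiled version was inlined.
    fn limited_ctx(existing: &[bool], inline: bool, ctx: u32, generic_ctx: u32) -> u32 {
        let next_versions = existing.iter().filter(|&&v| v == inline).count() + 1;
        let max_versions = if inline { MAX_INLINE_VERSIONS } else { MAX_VERSIONS };
        if next_versions >= max_versions { generic_ctx } else { ctx }
    }

    fn main() {
        let versions = [false, false, false]; // three non-inlined versions already exist
        assert_eq!(limited_ctx(&versions, false, 7, 0), 0); // limit hit: use the generic context
        assert_eq!(limited_ctx(&versions, true, 7, 0), 7);  // inlined group is far from its limit
    }
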
pub fn with_stack_size(&self, stack_size: u8) -> Context { - let mut ctx = self.clone(); + let mut ctx = *self; ctx.sp_offset -= (ctx.get_stack_size() as isize - stack_size as isize) as i8; ctx.stack_size = stack_size; ctx @@ -1603,24 +1765,54 @@ impl Context { } pub fn get_chain_depth(&self) -> u8 { - self.chain_depth + self.chain_depth_and_flags & CHAIN_DEPTH_MASK } - pub fn reset_chain_depth(&mut self) { - self.chain_depth = 0; + pub fn reset_chain_depth_and_defer(&mut self) { + self.chain_depth_and_flags &= !CHAIN_DEPTH_MASK; + self.chain_depth_and_flags &= !DEFER_BIT; } pub fn increment_chain_depth(&mut self) { - self.chain_depth += 1; + if self.get_chain_depth() == CHAIN_DEPTH_MASK { + panic!("max block version chain depth reached!"); + } + self.chain_depth_and_flags += 1; + } + + pub fn set_as_return_landing(&mut self) { + self.chain_depth_and_flags |= RETURN_LANDING_BIT; + } + + pub fn clear_return_landing(&mut self) { + self.chain_depth_and_flags &= !RETURN_LANDING_BIT; + } + + pub fn is_return_landing(&self) -> bool { + self.chain_depth_and_flags & RETURN_LANDING_BIT != 0 + } + + pub fn mark_as_deferred(&mut self) { + self.chain_depth_and_flags |= DEFER_BIT; + } + + pub fn is_deferred(&self) -> bool { + self.chain_depth_and_flags & DEFER_BIT != 0 } /// Get an operand for the adjusted stack pointer address - pub fn sp_opnd(&self, offset_bytes: isize) -> Opnd { - let offset = ((self.sp_offset as isize) * (SIZEOF_VALUE as isize)) + offset_bytes; - let offset = offset as i32; + pub fn sp_opnd(&self, offset: i32) -> Opnd { + let offset = (self.sp_offset as i32 + offset) * SIZEOF_VALUE_I32; return Opnd::mem(64, SP, offset); } + /// Get an operand for the adjusted environment pointer address using SP register. + /// This is valid only when a Binding object hasn't been created for the frame. + pub fn ep_opnd(&self, offset: i32) -> Opnd { + let ep_offset = self.get_stack_size() as i32 + 1; + self.sp_opnd(-ep_offset + offset) + } + /// Stop using a register for a given stack temp. /// This allows us to reuse the register for a value that we know is dead /// and will no longer be used (e.g. popped stack temp). 
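
Note: chain depth, the deferred flag, and the return-landing flag now share one byte. A standalone sketch of the accessors over that packing (the constants mirror the diff; the Flags wrapper is only for illustration):

    const RETURN_LANDING_BIT: u8 = 0b1000_0000;
    const DEFER_BIT: u8 = 0b0100_0000;
    const CHAIN_DEPTH_MASK: u8 = 0b0011_1111; // 63

    #[derive(Default)]
    struct Flags(u8);

    impl Flags {
        fn chain_depth(&self) -> u8 { self.0 & CHAIN_DEPTH_MASK }
        fn increment_chain_depth(&mut self) {
            // The flag bits sit above the depth bits, so a plain += 1 is safe
            // as long as the depth never reaches the mask value.
            assert!(self.chain_depth() < CHAIN_DEPTH_MASK, "max chain depth reached");
            self.0 += 1;
        }
        fn mark_as_deferred(&mut self) { self.0 |= DEFER_BIT; }
        fn is_deferred(&self) -> bool { self.0 & DEFER_BIT != 0 }
        fn set_as_return_landing(&mut self) { self.0 |= RETURN_LANDING_BIT; }
        fn is_return_landing(&self) -> bool { self.0 & RETURN_LANDING_BIT != 0 }
    }

    fn main() {
        let mut f = Flags::default();
        f.increment_chain_depth();
        f.mark_as_deferred();
        assert_eq!(f.chain_depth(), 1);
        assert!(f.is_deferred());
        assert!(!f.is_return_landing());
        f.set_as_return_landing();
        assert!(f.is_return_landing());
    }
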
@@ -1645,14 +1837,15 @@ impl Context { return Type::Unknown; } - let mapping = self.temp_mapping[stack_idx]; + let mapping = self.get_temp_mapping(stack_idx); - match mapping { + match mapping.get_kind() { MapToSelf => self.self_type, - MapToStack => self.temp_types[(self.stack_size - 1 - idx) as usize], - MapToLocal(idx) => { + MapToStack => mapping.get_type(), + MapToLocal => { + let idx = mapping.get_local_idx(); assert!((idx as usize) < MAX_LOCAL_TYPES); - return self.local_types[idx as usize]; + return self.get_local_type(idx.into()); } } } @@ -1660,8 +1853,83 @@ impl Context { } /// Get the currently tracked type for a local variable - pub fn get_local_type(&self, idx: usize) -> Type { - *self.local_types.get(idx).unwrap_or(&Type::Unknown) + pub fn get_local_type(&self, local_idx: usize) -> Type { + if local_idx >= MAX_LOCAL_TYPES { + return Type::Unknown + } else { + // Each type is stored in 4 bits + let type_bits = (self.local_types >> (4 * local_idx)) & 0b1111; + unsafe { transmute::<u8, Type>(type_bits as u8) } + } + } + + /// Get the current temp mapping for a given stack slot + fn get_temp_mapping(&self, temp_idx: usize) -> TempMapping { + assert!(temp_idx < MAX_TEMP_TYPES); + + // Extract the temp mapping kind + let kind_bits = (self.temp_mapping_kind >> (2 * temp_idx)) & 0b11; + let temp_kind = unsafe { transmute::<u8, TempMappingKind>(kind_bits as u8) }; + + // Extract the payload bits (temp type or local idx) + let payload_bits = (self.temp_payload >> (4 * temp_idx)) & 0b1111; + + match temp_kind { + MapToSelf => TempMapping::map_to_self(), + + MapToStack => { + TempMapping::map_to_stack( + unsafe { transmute::<u8, Type>(payload_bits as u8) } + ) + } + + MapToLocal => { + TempMapping::map_to_local( + payload_bits as u8 + ) + } + } + } + + /// Get the current temp mapping for a given stack slot + fn set_temp_mapping(&mut self, temp_idx: usize, mapping: TempMapping) { + assert!(temp_idx < MAX_TEMP_TYPES); + + // Extract the kind bits + let mapping_kind = mapping.get_kind(); + let kind_bits = unsafe { transmute::<TempMappingKind, u8>(mapping_kind) }; + assert!(kind_bits <= 0b11); + + // Extract the payload bits + let payload_bits = match mapping_kind { + MapToSelf => 0, + + MapToStack => { + let t = mapping.get_type(); + unsafe { transmute::<Type, u8>(t) } + } + + MapToLocal => { + mapping.get_local_idx() + } + }; + assert!(payload_bits <= 0b1111); + + // Update the kind bits + { + let mask_bits = 0b11_u16 << (2 * temp_idx); + let shifted_bits = (kind_bits as u16) << (2 * temp_idx); + let all_kind_bits = self.temp_mapping_kind as u16; + self.temp_mapping_kind = (all_kind_bits & !mask_bits) | shifted_bits; + } + + // Update the payload bits + { + let mask_bits = 0b1111_u32 << (4 * temp_idx); + let shifted_bits = (payload_bits as u32) << (4 * temp_idx); + let all_payload_bits = self.temp_payload as u32; + self.temp_payload = (all_payload_bits & !mask_bits) | shifted_bits; + } } /// Upgrade (or "learn") the type of an instruction operand @@ -1685,15 +1953,24 @@ impl Context { return; } - let mapping = self.temp_mapping[stack_idx]; + let mapping = self.get_temp_mapping(stack_idx); - match mapping { + match mapping.get_kind() { MapToSelf => self.self_type.upgrade(opnd_type), - MapToStack => self.temp_types[stack_idx].upgrade(opnd_type), - MapToLocal(idx) => { - let idx = idx as usize; + MapToStack => { + let mut temp_type = mapping.get_type(); + temp_type.upgrade(opnd_type); + self.set_temp_mapping(stack_idx, TempMapping::map_to_stack(temp_type)); + } + MapToLocal => { + let idx = 
mapping.get_local_idx() as usize; assert!(idx < MAX_LOCAL_TYPES); - self.local_types[idx].upgrade(opnd_type); + let mut new_type = self.get_local_type(idx); + new_type.upgrade(opnd_type); + self.set_local_type(idx, new_type); + // Re-attach MapToLocal for this StackOpnd(idx). set_local_type() detaches + // all MapToLocal mappings, including the one we're upgrading here. + self.set_opnd_mapping(opnd, mapping); } } } @@ -1705,29 +1982,29 @@ impl Context { This is can be used with stack_push_mapping or set_opnd_mapping to copy a stack value's type while maintaining the mapping. */ - pub fn get_opnd_mapping(&self, opnd: YARVOpnd) -> (TempMapping, Type) { + pub fn get_opnd_mapping(&self, opnd: YARVOpnd) -> TempMapping { let opnd_type = self.get_opnd_type(opnd); match opnd { - SelfOpnd => (MapToSelf, opnd_type), + SelfOpnd => TempMapping::map_to_self(), StackOpnd(idx) => { assert!(idx < self.stack_size); let stack_idx = (self.stack_size - 1 - idx) as usize; if stack_idx < MAX_TEMP_TYPES { - (self.temp_mapping[stack_idx], opnd_type) + self.get_temp_mapping(stack_idx) } else { // We can't know the source of this stack operand, so we assume it is // a stack-only temporary. type will be UNKNOWN assert!(opnd_type == Type::Unknown); - (MapToStack, opnd_type) + TempMapping::map_to_stack(opnd_type) } } } } /// Overwrite both the type and mapping of a stack operand. - pub fn set_opnd_mapping(&mut self, opnd: YARVOpnd, (mapping, opnd_type): (TempMapping, Type)) { + pub fn set_opnd_mapping(&mut self, opnd: YARVOpnd, mapping: TempMapping) { match opnd { SelfOpnd => unreachable!("self always maps to self"), StackOpnd(idx) => { @@ -1744,44 +2021,47 @@ impl Context { return; } - self.temp_mapping[stack_idx] = mapping; - - // Only used when mapping == MAP_STACK - self.temp_types[stack_idx] = opnd_type; + self.set_temp_mapping(stack_idx, mapping); } } } /// Set the type of a local variable pub fn set_local_type(&mut self, local_idx: usize, local_type: Type) { - let ctx = self; - // If type propagation is disabled, store no types if get_option!(no_type_prop) { return; } if local_idx >= MAX_LOCAL_TYPES { - return; + return } // If any values on the stack map to this local we must detach them - for (i, mapping) in ctx.temp_mapping.iter_mut().enumerate() { - *mapping = match *mapping { - MapToStack => MapToStack, - MapToSelf => MapToSelf, - MapToLocal(idx) => { + for mapping_idx in 0..MAX_TEMP_TYPES { + let mapping = self.get_temp_mapping(mapping_idx); + let tm = match mapping.get_kind() { + MapToStack => mapping, + MapToSelf => mapping, + MapToLocal => { + let idx = mapping.get_local_idx(); if idx as usize == local_idx { - ctx.temp_types[i] = ctx.local_types[idx as usize]; - MapToStack + let local_type = self.get_local_type(local_idx); + TempMapping::map_to_stack(local_type) } else { - MapToLocal(idx) + TempMapping::map_to_local(idx) } } - } + }; + self.set_temp_mapping(mapping_idx, tm); } - ctx.local_types[local_idx] = local_type; + // Update the type bits + let type_bits = local_type as u32; + assert!(type_bits <= 0b1111); + let mask_bits = 0b1111_u32 << (4 * local_idx); + let shifted_bits = type_bits << (4 * local_idx); + self.local_types = (self.local_types & !mask_bits) | shifted_bits; } /// Erase local variable type information @@ -1789,19 +2069,27 @@ impl Context { pub fn clear_local_types(&mut self) { // When clearing local types we must detach any stack mappings to those // locals. Even if local values may have changed, stack values will not. 
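
Note: get_local_type() and set_local_type() pack eight 4-bit type tags into a single u32. A standalone sketch of the nibble arithmetic they rely on (tag values are arbitrary here):

    fn get_nibble(word: u32, idx: usize) -> u8 {
        assert!(idx < 8);
        ((word >> (4 * idx)) & 0b1111) as u8
    }

    fn set_nibble(word: u32, idx: usize, value: u8) -> u32 {
        assert!(idx < 8 && value <= 0b1111);
        let mask = 0b1111_u32 << (4 * idx);
        (word & !mask) | ((value as u32) << (4 * idx))
    }

    fn main() {
        let mut local_types = 0u32;
        local_types = set_nibble(local_types, 0, 0b0101);
        local_types = set_nibble(local_types, 7, 0b1111);
        assert_eq!(get_nibble(local_types, 0), 0b0101);
        assert_eq!(get_nibble(local_types, 7), 0b1111);
        assert_eq!(get_nibble(local_types, 3), 0); // untouched slots stay 0 (Unknown)
    }

The same layout drives temp_payload (4 bits per slot) and temp_mapping_kind (2 bits per slot) in set_temp_mapping() above.
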
- for (i, mapping) in self.temp_mapping.iter_mut().enumerate() { - *mapping = match *mapping { - MapToStack => MapToStack, - MapToSelf => MapToSelf, - MapToLocal(idx) => { - self.temp_types[i] = self.local_types[idx as usize]; - MapToStack - } + + for mapping_idx in 0..MAX_TEMP_TYPES { + let mapping = self.get_temp_mapping(mapping_idx); + if mapping.get_kind() == MapToLocal { + let local_idx = mapping.get_local_idx() as usize; + self.set_temp_mapping(mapping_idx, TempMapping::map_to_stack(self.get_local_type(local_idx))); } } // Clear the local types - self.local_types = [Type::default(); MAX_LOCAL_TYPES]; + self.local_types = 0; + } + + /// Return true if the code is inlined by the caller + pub fn inline(&self) -> bool { + self.inline_block != 0 + } + + /// Set a block ISEQ given to the Block of this Context + pub fn set_inline_block(&mut self, iseq: IseqPtr) { + self.inline_block = iseq as u64 } /// Compute a difference score for two context objects @@ -1810,13 +2098,21 @@ impl Context { let src = self; // Can only lookup the first version in the chain - if dst.chain_depth != 0 { + if dst.get_chain_depth() != 0 { return TypeDiff::Incompatible; } // Blocks with depth > 0 always produce new versions // Sidechains cannot overlap - if src.chain_depth != 0 { + if src.get_chain_depth() != 0 { + return TypeDiff::Incompatible; + } + + if src.is_return_landing() != dst.is_return_landing() { + return TypeDiff::Incompatible; + } + + if src.is_deferred() != dst.is_deferred() { return TypeDiff::Incompatible; } @@ -1841,10 +2137,17 @@ impl Context { TypeDiff::Incompatible => return TypeDiff::Incompatible, }; + // Check the block to inline + if src.inline_block != dst.inline_block { + // find_block_version should not find existing blocks with different + // inline_block so that their yield will not be megamorphic. + return TypeDiff::Incompatible; + } + // For each local type we track - for i in 0..src.local_types.len() { - let t_src = src.local_types[i]; - let t_dst = dst.local_types[i]; + for i in 0.. MAX_LOCAL_TYPES { + let t_src = src.get_local_type(i); + let t_dst = dst.get_local_type(i); diff += match t_src.diff(t_dst) { TypeDiff::Compatible(diff) => diff, TypeDiff::Incompatible => return TypeDiff::Incompatible, @@ -1853,12 +2156,12 @@ impl Context { // For each value on the temp stack for i in 0..src.stack_size { - let (src_mapping, src_type) = src.get_opnd_mapping(StackOpnd(i)); - let (dst_mapping, dst_type) = dst.get_opnd_mapping(StackOpnd(i)); + let src_mapping = src.get_opnd_mapping(StackOpnd(i)); + let dst_mapping = dst.get_opnd_mapping(StackOpnd(i)); // If the two mappings aren't the same if src_mapping != dst_mapping { - if dst_mapping == MapToStack { + if dst_mapping.get_kind() == MapToStack { // We can safely drop information about the source of the temp // stack operand. 
diff += 1; @@ -1867,6 +2170,9 @@ impl Context { } } + let src_type = src.get_opnd_type(StackOpnd(i)); + let dst_type = dst.get_opnd_type(StackOpnd(i)); + diff += match src_type.diff(dst_type) { TypeDiff::Compatible(diff) => diff, TypeDiff::Incompatible => return TypeDiff::Incompatible, @@ -1896,20 +2202,20 @@ impl Context { impl Assembler { /// Push one new value on the temp stack with an explicit mapping /// Return a pointer to the new stack top - pub fn stack_push_mapping(&mut self, (mapping, temp_type): (TempMapping, Type)) -> Opnd { + pub fn stack_push_mapping(&mut self, mapping: TempMapping) -> Opnd { // If type propagation is disabled, store no types if get_option!(no_type_prop) { - return self.stack_push_mapping((mapping, Type::Unknown)); + return self.stack_push_mapping(mapping.without_type()); } let stack_size: usize = self.ctx.stack_size.into(); // Keep track of the type and mapping of the value if stack_size < MAX_TEMP_TYPES { - self.ctx.temp_mapping[stack_size] = mapping; - self.ctx.temp_types[stack_size] = temp_type; + self.ctx.set_temp_mapping(stack_size, mapping); - if let MapToLocal(idx) = mapping { + if mapping.get_kind() == MapToLocal { + let idx = mapping.get_local_idx(); assert!((idx as usize) < MAX_LOCAL_TYPES); } } @@ -1928,12 +2234,12 @@ impl Assembler { /// Push one new value on the temp stack /// Return a pointer to the new stack top pub fn stack_push(&mut self, val_type: Type) -> Opnd { - return self.stack_push_mapping((MapToStack, val_type)); + return self.stack_push_mapping(TempMapping::map_to_stack(val_type)); } /// Push the self value on the stack pub fn stack_push_self(&mut self) -> Opnd { - return self.stack_push_mapping((MapToSelf, Type::Unknown)); + return self.stack_push_mapping(TempMapping::map_to_self()); } /// Push a local variable on the stack @@ -1942,7 +2248,7 @@ impl Assembler { return self.stack_push(Type::Unknown); } - return self.stack_push_mapping((MapToLocal((local_idx as u8).into()), Type::Unknown)); + return self.stack_push_mapping(TempMapping::map_to_local(local_idx as u8)); } // Pop N values off the stack @@ -1957,8 +2263,7 @@ impl Assembler { let idx: usize = (self.ctx.stack_size as usize) - i - 1; if idx < MAX_TEMP_TYPES { - self.ctx.temp_types[idx] = Type::Unknown; - self.ctx.temp_mapping[idx] = MapToStack; + self.ctx.set_temp_mapping(idx, TempMapping::map_to_stack(Type::Unknown)); } } @@ -1972,12 +2277,16 @@ impl Assembler { pub fn shift_stack(&mut self, argc: usize) { assert!(argc < self.ctx.stack_size.into()); - let method_name_index = (self.ctx.stack_size as usize) - (argc as usize) - 1; + let method_name_index = (self.ctx.stack_size as usize) - argc - 1; for i in method_name_index..(self.ctx.stack_size - 1) as usize { - if i + 1 < MAX_TEMP_TYPES { - self.ctx.temp_types[i] = self.ctx.temp_types[i + 1]; - self.ctx.temp_mapping[i] = self.ctx.temp_mapping[i + 1]; + if i < MAX_TEMP_TYPES { + let next_arg_mapping = if i + 1 < MAX_TEMP_TYPES { + self.ctx.get_temp_mapping(i + 1) + } else { + TempMapping::map_to_stack(Type::Unknown) + }; + self.ctx.set_temp_mapping(i, next_arg_mapping); } } self.stack_pop(1); @@ -2125,12 +2434,18 @@ fn gen_block_series_body( /// Generate a block version that is an entry point inserted into an iseq /// NOTE: this function assumes that the VM lock has been taken -pub fn gen_entry_point(iseq: IseqPtr, ec: EcPtr) -> Option<CodePtr> { +/// If jit_exception is true, compile JIT code for handling exceptions. +/// See [jit_compile_exception] for details. 
+pub fn gen_entry_point(iseq: IseqPtr, ec: EcPtr, jit_exception: bool) -> Option<*const u8> { // Compute the current instruction index based on the current PC + let cfp = unsafe { get_ec_cfp(ec) }; let insn_idx: u16 = unsafe { - let ec_pc = get_cfp_pc(get_ec_cfp(ec)); + let ec_pc = get_cfp_pc(cfp); iseq_pc_to_insn_idx(iseq, ec_pc)? }; + let stack_size: u8 = unsafe { + u8::try_from(get_cfp_sp(cfp).offset_from(get_cfp_bp(cfp))).ok()? + }; // The entry context makes no assumptions about types let blockid = BlockId { @@ -2143,10 +2458,12 @@ pub fn gen_entry_point(iseq: IseqPtr, ec: EcPtr) -> Option<CodePtr> { let ocb = CodegenGlobals::get_outlined_cb(); // Write the interpreter entry prologue. Might be NULL when out of memory. - let code_ptr = gen_entry_prologue(cb, ocb, iseq, insn_idx); + let code_ptr = gen_entry_prologue(cb, ocb, iseq, insn_idx, jit_exception); // Try to generate code for the entry block - let block = gen_block_series(blockid, &Context::default(), ec, cb, ocb); + let mut ctx = Context::default(); + ctx.stack_size = stack_size; + let block = gen_block_series(blockid, &ctx, ec, cb, ocb); cb.mark_all_executable(); ocb.unwrap().mark_all_executable(); @@ -2155,7 +2472,9 @@ pub fn gen_entry_point(iseq: IseqPtr, ec: EcPtr) -> Option<CodePtr> { // Compilation failed None => { // Trigger code GC. This entry point will be recompiled later. - cb.code_gc(ocb); + if get_option!(code_gc) { + cb.code_gc(ocb); + } return None; } @@ -2168,14 +2487,17 @@ pub fn gen_entry_point(iseq: IseqPtr, ec: EcPtr) -> Option<CodePtr> { } } + // Count the number of entry points we compile + incr_counter!(compiled_iseq_entry); + // Compilation successful and block not empty - return code_ptr; + code_ptr.map(|ptr| ptr.raw_ptr(cb)) } // Change the entry's jump target from an entry stub to a next entry pub fn regenerate_entry(cb: &mut CodeBlock, entryref: &EntryRef, next_entry: CodePtr) { let mut asm = Assembler::new(); - asm.comment("regenerate_entry"); + asm_comment!(asm, "regenerate_entry"); // gen_entry_guard generates cmp + jne. We're rewriting only jne. asm.jne(next_entry.into()); @@ -2185,7 +2507,7 @@ pub fn regenerate_entry(cb: &mut CodeBlock, entryref: &EntryRef, next_entry: Cod let old_dropped_bytes = cb.has_dropped_bytes(); cb.set_write_ptr(unsafe { entryref.as_ref() }.start_addr); cb.set_dropped_bytes(false); - asm.compile(cb, None); + asm.compile(cb, None).expect("can rewrite existing code"); // Rewind write_pos to the original one assert_eq!(cb.get_write_ptr(), unsafe { entryref.as_ref() }.end_addr); @@ -2209,78 +2531,88 @@ c_callable! { /// Generated code calls this function with the SysV calling convention. /// See [gen_call_entry_stub_hit]. fn entry_stub_hit(entry_ptr: *const c_void, ec: EcPtr) -> *const u8 { - with_vm_lock(src_loc!(), || { - match entry_stub_hit_body(entry_ptr, ec) { - Some(addr) => addr, - // Failed to service the stub by generating a new block so now we - // need to exit to the interpreter at the stubbed location. - None => return CodegenGlobals::get_stub_exit_code().raw_ptr(), - } + with_compile_time(|| { + with_vm_lock(src_loc!(), || { + let cb = CodegenGlobals::get_inline_cb(); + let ocb = CodegenGlobals::get_outlined_cb(); + + let addr = entry_stub_hit_body(entry_ptr, ec, cb, ocb) + .unwrap_or_else(|| { + // Trigger code GC (e.g. no space). + // This entry point will be recompiled later. 
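
Note: gen_entry_point() and entry_stub_hit_body() now derive the entry stack size from the distance between the frame's sp and bp. A small sketch of that pointer arithmetic on an ordinary array (the frame layout itself is an assumption here, not taken from the VM):

    fn main() {
        let stack = [0u64; 16];
        let bp: *const u64 = &stack[4];
        let sp: *const u64 = &stack[7];
        // SAFETY: both pointers point into the same array.
        let stack_size = unsafe { u8::try_from(sp.offset_from(bp)).ok() };
        assert_eq!(stack_size, Some(3)); // three values sit between bp and sp
    }
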
+ if get_option!(code_gc) { + cb.code_gc(ocb); + } + CodegenGlobals::get_stub_exit_code().raw_ptr(cb) + }); + + cb.mark_all_executable(); + ocb.unwrap().mark_all_executable(); + + addr + }) }) } } /// Called by the generated code when an entry stub is executed -fn entry_stub_hit_body(entry_ptr: *const c_void, ec: EcPtr) -> Option<*const u8> { +fn entry_stub_hit_body( + entry_ptr: *const c_void, + ec: EcPtr, + cb: &mut CodeBlock, + ocb: &mut OutlinedCb +) -> Option<*const u8> { // Get ISEQ and insn_idx from the current ec->cfp let cfp = unsafe { get_ec_cfp(ec) }; let iseq = unsafe { get_cfp_iseq(cfp) }; let insn_idx = iseq_pc_to_insn_idx(iseq, unsafe { get_cfp_pc(cfp) })?; - - let cb = CodegenGlobals::get_inline_cb(); - let ocb = CodegenGlobals::get_outlined_cb(); + let stack_size: u8 = unsafe { + u8::try_from(get_cfp_sp(cfp).offset_from(get_cfp_bp(cfp))).ok()? + }; // Compile a new entry guard as a next entry let next_entry = cb.get_write_ptr(); let mut asm = Assembler::new(); let pending_entry = gen_entry_chain_guard(&mut asm, ocb, iseq, insn_idx)?; - asm.compile(cb, Some(ocb)); + asm.compile(cb, Some(ocb))?; - // Try to find an existing compiled version of this block + // Find or compile a block version let blockid = BlockId { iseq, idx: insn_idx }; - let ctx = Context::default(); + let mut ctx = Context::default(); + ctx.stack_size = stack_size; let blockref = match find_block_version(blockid, &ctx) { // If an existing block is found, generate a jump to the block. Some(blockref) => { let mut asm = Assembler::new(); asm.jmp(unsafe { blockref.as_ref() }.start_addr.into()); - asm.compile(cb, Some(ocb)); - blockref + asm.compile(cb, Some(ocb))?; + Some(blockref) } // If this block hasn't yet been compiled, generate blocks after the entry guard. - None => match gen_block_series(blockid, &ctx, ec, cb, ocb) { - Some(blockref) => blockref, - None => { // No space - // Trigger code GC. This entry point will be recompiled later. 
- cb.code_gc(ocb); - return None; - } - } + None => gen_block_series(blockid, &ctx, ec, cb, ocb), }; - // Regenerate the previous entry - assert!(!entry_ptr.is_null()); - let entryref = NonNull::<Entry>::new(entry_ptr as *mut Entry).expect("Entry should not be null"); - regenerate_entry(cb, &entryref, next_entry); - - // Write an entry to the heap and push it to the ISEQ - let pending_entry = Rc::try_unwrap(pending_entry).ok().expect("PendingEntry should be unique"); - get_or_create_iseq_payload(iseq).entries.push(pending_entry.into_entry()); + // Commit or retry the entry + if blockref.is_some() { + // Regenerate the previous entry + let entryref = NonNull::<Entry>::new(entry_ptr as *mut Entry).expect("Entry should not be null"); + regenerate_entry(cb, &entryref, next_entry); - cb.mark_all_executable(); - ocb.unwrap().mark_all_executable(); + // Write an entry to the heap and push it to the ISEQ + let pending_entry = Rc::try_unwrap(pending_entry).ok().expect("PendingEntry should be unique"); + get_or_create_iseq_payload(iseq).entries.push(pending_entry.into_entry()); + } // Let the stub jump to the block - Some(unsafe { blockref.as_ref() }.start_addr.raw_ptr()) + blockref.map(|block| unsafe { block.as_ref() }.start_addr.raw_ptr(cb)) } /// Generate a stub that calls entry_stub_hit pub fn gen_entry_stub(entry_address: usize, ocb: &mut OutlinedCb) -> Option<CodePtr> { let ocb = ocb.unwrap(); - let stub_addr = ocb.get_write_ptr(); let mut asm = Assembler::new(); - asm.comment("entry stub hit"); + asm_comment!(asm, "entry stub hit"); asm.mov(C_ARG_OPNDS[0], entry_address.into()); @@ -2288,32 +2620,23 @@ pub fn gen_entry_stub(entry_address: usize, ocb: &mut OutlinedCb) -> Option<Code // Not really a side exit, just don't need a padded jump here. asm.jmp(CodegenGlobals::get_entry_stub_hit_trampoline().as_side_exit()); - asm.compile(ocb, None); - - if ocb.has_dropped_bytes() { - return None; // No space - } else { - return Some(stub_addr); - } + asm.compile(ocb, None).map(|(code_ptr, _)| code_ptr) } /// A trampoline used by gen_entry_stub. entry_stub_hit may issue Code GC, so /// it's useful for Code GC to call entry_stub_hit from a globally shared code. -pub fn gen_entry_stub_hit_trampoline(ocb: &mut OutlinedCb) -> CodePtr { +pub fn gen_entry_stub_hit_trampoline(ocb: &mut OutlinedCb) -> Option<CodePtr> { let ocb = ocb.unwrap(); - let code_ptr = ocb.get_write_ptr(); let mut asm = Assembler::new(); // See gen_entry_guard for how it's used. - asm.comment("entry_stub_hit() trampoline"); + asm_comment!(asm, "entry_stub_hit() trampoline"); let jump_addr = asm.ccall(entry_stub_hit as *mut u8, vec![C_ARG_OPNDS[0], EC]); // Jump to the address returned by the entry_stub_hit() call asm.jmp_opnd(jump_addr); - asm.compile(ocb, None); - - code_ptr + asm.compile(ocb, None).map(|(code_ptr, _)| code_ptr) } /// Generate code for a branch, possibly rewriting and changing the size of it @@ -2328,19 +2651,25 @@ fn regenerate_branch(cb: &mut CodeBlock, branch: &Branch) { // Generate the branch let mut asm = Assembler::new(); - asm.comment("regenerate_branch"); + asm_comment!(asm, "regenerate_branch"); branch.gen_fn.call( &mut asm, Target::CodePtr(branch.get_target_address(0).unwrap()), branch.get_target_address(1).map(|addr| Target::CodePtr(addr)), ); + // If the entire block is the branch and the block could be invalidated, + // we need to pad to ensure there is room for invalidation patching. 
+ if branch.start_addr == block.start_addr && branch_terminates_block && block.entry_exit.is_some() { + asm.pad_inval_patch(); + } + // Rewrite the branch let old_write_pos = cb.get_write_pos(); let old_dropped_bytes = cb.has_dropped_bytes(); cb.set_write_ptr(branch.start_addr); cb.set_dropped_bytes(false); - asm.compile(cb, None); + asm.compile(cb, None).expect("can rewrite existing code"); let new_end_addr = cb.get_write_ptr(); branch.end_addr.set(new_end_addr); @@ -2399,7 +2728,7 @@ c_callable! { ec: EcPtr, ) -> *const u8 { with_vm_lock(src_loc!(), || { - branch_stub_hit_body(branch_ptr, target_idx, ec) + with_compile_time(|| { branch_stub_hit_body(branch_ptr, target_idx, ec) }) }) } } @@ -2427,6 +2756,9 @@ fn branch_stub_hit_body(branch_ptr: *const c_void, target_idx: u32, ec: EcPtr) - _ => unreachable!("target_idx < 2 must always hold"), }; + let cb = CodegenGlobals::get_inline_cb(); + let ocb = CodegenGlobals::get_outlined_cb(); + let (target_blockid, target_ctx): (BlockId, Context) = unsafe { // SAFETY: no mutation of the target's Cell. Just reading out data. let target = branch.targets[target_idx].ref_unchecked().as_ref().unwrap(); @@ -2434,24 +2766,24 @@ fn branch_stub_hit_body(branch_ptr: *const c_void, target_idx: u32, ec: EcPtr) - // If this branch has already been patched, return the dst address // Note: recursion can cause the same stub to be hit multiple times if let BranchTarget::Block(_) = target.as_ref() { - return target.get_address().unwrap().raw_ptr(); + return target.get_address().unwrap().raw_ptr(cb); } (target.get_blockid(), target.get_ctx()) }; - let cb = CodegenGlobals::get_inline_cb(); - let ocb = CodegenGlobals::get_outlined_cb(); - let (cfp, original_interp_sp) = unsafe { let cfp = get_ec_cfp(ec); let original_interp_sp = get_cfp_sp(cfp); - let running_iseq = rb_cfp_get_iseq(cfp); + let running_iseq = get_cfp_iseq(cfp); + assert_eq!(running_iseq, target_blockid.iseq as _, "each stub expects a particular iseq"); + let reconned_pc = rb_iseq_pc_at_idx(running_iseq, target_blockid.idx.into()); let reconned_sp = original_interp_sp.offset(target_ctx.sp_offset.into()); - - assert_eq!(running_iseq, target_blockid.iseq as _, "each stub expects a particular iseq"); + // Unlike in the interpreter, our `leave` doesn't write to the caller's + // SP -- we do it in the returned-to code. Account for this difference. + let reconned_sp = reconned_sp.add(target_ctx.is_return_landing().into()); // Update the PC in the current CFP, because it may be out of sync in JITted code rb_set_cfp_pc(cfp, reconned_pc); @@ -2464,6 +2796,17 @@ fn branch_stub_hit_body(branch_ptr: *const c_void, target_idx: u32, ec: EcPtr) - // So we do it here instead. rb_set_cfp_sp(cfp, reconned_sp); + // Bail if code GC is disabled and we've already run out of spaces. + if !get_option!(code_gc) && (cb.has_dropped_bytes() || ocb.unwrap().has_dropped_bytes()) { + return CodegenGlobals::get_stub_exit_code().raw_ptr(cb); + } + + // Bail if we're about to run out of native stack space. + // We've just reconstructed interpreter state. 
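
Note: when the stub target is a return landing, branch_stub_hit_body() reconstructs sp one slot higher, because the JIT's leave writes the return value in the returned-to code rather than through the caller's sp. A sketch of that 0/1 adjustment with plain integers standing in for stack slots:

    fn reconned_sp(sp: usize, sp_offset: isize, is_return_landing: bool) -> usize {
        let adjusted = (sp as isize + sp_offset) as usize;
        // bool converts to 0 or 1, mirroring `.add(is_return_landing.into())`.
        adjusted + usize::from(is_return_landing)
    }

    fn main() {
        assert_eq!(reconned_sp(100, 2, false), 102);
        assert_eq!(reconned_sp(100, 2, true), 103); // the return value occupies one extra slot
    }
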
+ if rb_ec_stack_check(ec as _) != 0 { + return CodegenGlobals::get_stub_exit_code().raw_ptr(cb); + } + (cfp, original_interp_sp) }; @@ -2474,7 +2817,6 @@ fn branch_stub_hit_body(branch_ptr: *const c_void, target_idx: u32, ec: EcPtr) - if block.is_none() { let branch_old_shape = branch.gen_fn.get_shape(); - // If the new block can be generated right after the branch (at cb->write_pos) if cb.get_write_ptr() == branch.end_addr.get() { // This branch should be terminating its block @@ -2532,7 +2874,9 @@ fn branch_stub_hit_body(branch_ptr: *const c_void, target_idx: u32, ec: EcPtr) - // because incomplete code could be used when cb.dropped_bytes is flipped // by code GC. So this place, after all compilation, is the safest place // to hook code GC on branch_stub_hit. - cb.code_gc(ocb); + if get_option!(code_gc) { + cb.code_gc(ocb); + } // Failed to service the stub by generating a new block so now we // need to exit to the interpreter at the stubbed location. We are @@ -2552,11 +2896,11 @@ fn branch_stub_hit_body(branch_ptr: *const c_void, target_idx: u32, ec: EcPtr) - assert!( new_branch_size <= branch_size_on_entry, "branch stubs should never enlarge branches (start_addr: {:?}, old_size: {}, new_size: {})", - branch.start_addr.raw_ptr(), branch_size_on_entry, new_branch_size, + branch.start_addr.raw_ptr(cb), branch_size_on_entry, new_branch_size, ); // Return a pointer to the compiled block version - dst_addr.raw_ptr() + dst_addr.raw_ptr(cb) } /// Generate a "stub", a piece of code that calls the compiler back when run. @@ -2569,18 +2913,21 @@ fn gen_branch_stub( ) -> Option<CodePtr> { let ocb = ocb.unwrap(); - // Generate an outlined stub that will call branch_stub_hit() - let stub_addr = ocb.get_write_ptr(); - let mut asm = Assembler::new(); - asm.ctx = ctx.clone(); + asm.ctx = *ctx; asm.set_reg_temps(ctx.reg_temps); - asm.comment("branch stub hit"); + asm_comment!(asm, "branch stub hit"); + + if asm.ctx.is_return_landing() { + asm.mov(SP, Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP)); + let top = asm.stack_push(Type::Unknown); + asm.mov(top, C_RET_OPND); + } // Save caller-saved registers before C_ARG_OPNDS get clobbered. // Spill all registers for consistency with the trampoline. - for ® in caller_saved_temp_regs().iter() { - asm.cpush(reg); + for ® in caller_saved_temp_regs() { + asm.cpush(Opnd::Reg(reg)); } // Spill temps to the VM stack as well for jit.peek_at_stack() @@ -2599,19 +2946,11 @@ fn gen_branch_stub( // Not really a side exit, just don't need a padded jump here. asm.jmp(CodegenGlobals::get_branch_stub_hit_trampoline().as_side_exit()); - asm.compile(ocb, None); - - if ocb.has_dropped_bytes() { - // No space - None - } else { - Some(stub_addr) - } + asm.compile(ocb, None).map(|(code_ptr, _)| code_ptr) } -pub fn gen_branch_stub_hit_trampoline(ocb: &mut OutlinedCb) -> CodePtr { +pub fn gen_branch_stub_hit_trampoline(ocb: &mut OutlinedCb) -> Option<CodePtr> { let ocb = ocb.unwrap(); - let code_ptr = ocb.get_write_ptr(); let mut asm = Assembler::new(); // For `branch_stub_hit(branch_ptr, target_idx, ec)`, @@ -2620,8 +2959,8 @@ pub fn gen_branch_stub_hit_trampoline(ocb: &mut OutlinedCb) -> CodePtr { // is the unchanging part. // Since this trampoline is static, it allows code GC inside // branch_stub_hit() to free stubs without problems. 
-    asm.comment("branch_stub_hit() trampoline");
-    let jump_addr = asm.ccall(
+    asm_comment!(asm, "branch_stub_hit() trampoline");
+    let stub_hit_ret = asm.ccall(
         branch_stub_hit as *mut u8,
         vec![
             C_ARG_OPNDS[0],
@@ -2629,28 +2968,39 @@ pub fn gen_branch_stub_hit_trampoline(ocb: &mut OutlinedCb) -> CodePtr {
             EC,
         ]
     );
+    let jump_addr = asm.load(stub_hit_ret);

     // Restore caller-saved registers for stack temps
-    for &reg in caller_saved_temp_regs().iter().rev() {
-        asm.cpop_into(reg);
+    for &reg in caller_saved_temp_regs().rev() {
+        asm.cpop_into(Opnd::Reg(reg));
     }

     // Jump to the address returned by the branch_stub_hit() call
     asm.jmp_opnd(jump_addr);

-    asm.compile(ocb, None);
+    // HACK: popping into C_RET_REG clobbers the return value of branch_stub_hit() we need to jump
+    // to, so we need a scratch register to preserve it. This extends the live range of the C
+    // return register so we get something else for the return value.
+    let _ = asm.live_reg_opnd(stub_hit_ret);

-    code_ptr
+    asm.compile(ocb, None).map(|(code_ptr, _)| code_ptr)
 }

 /// Return registers to be pushed and popped on branch_stub_hit.
-/// The return value may include an extra register for x86 alignment.
-fn caller_saved_temp_regs() -> Vec<Opnd> {
-    let mut regs = Assembler::get_temp_regs();
-    if regs.len() % 2 == 1 {
-        regs.push(*regs.last().unwrap()); // x86 alignment
+pub fn caller_saved_temp_regs() -> impl Iterator<Item = &'static Reg> + DoubleEndedIterator {
+    let temp_regs = Assembler::get_temp_regs().iter();
+    let len = temp_regs.len();
+    // The return value gen_leave() leaves in C_RET_REG
+    // needs to survive the branch_stub_hit() call.
+    let regs = temp_regs.chain(std::iter::once(&C_RET_REG));
+
+    // On x86_64, maintain 16-byte stack alignment
+    if cfg!(target_arch = "x86_64") && len % 2 == 0 {
+        static ONE_MORE: [Reg; 1] = [C_RET_REG];
+        regs.chain(ONE_MORE.iter())
+    } else {
+        regs.chain(&[])
     }
-    regs.iter().map(|&reg| Opnd::Reg(reg)).collect()
 }

 impl Assembler
 {
@@ -2661,7 +3011,7 @@ impl Assembler
         // so that we can move the closure below
         let entryref = entryref.clone();

-        self.pos_marker(move |code_ptr| {
+        self.pos_marker(move |code_ptr, _| {
             entryref.start_addr.set(Some(code_ptr));
         });
     }
@@ -2672,7 +3022,7 @@ impl Assembler
         // so that we can move the closure below
         let entryref = entryref.clone();

-        self.pos_marker(move |code_ptr| {
+        self.pos_marker(move |code_ptr, _| {
             entryref.end_addr.set(Some(code_ptr));
         });
     }
@@ -2684,7 +3034,7 @@ impl Assembler
         // so that we can move the closure below
         let branchref = branchref.clone();

-        self.pos_marker(move |code_ptr| {
+        self.pos_marker(move |code_ptr, _| {
             branchref.start_addr.set(Some(code_ptr));
         });
     }
@@ -2696,7 +3046,7 @@ impl Assembler
         // so that we can move the closure below
         let branchref = branchref.clone();

-        self.pos_marker(move |code_ptr| {
+        self.pos_marker(move |code_ptr, _| {
             branchref.end_addr.set(Some(code_ptr));
         });
     }
@@ -2745,7 +3095,7 @@ pub fn gen_direct_jump(jit: &mut JITState, ctx: &Context, target0: BlockId, asm:
         let block_addr = block.start_addr;

         // Call the branch generation function
-        asm.comment("gen_direct_jmp: existing block");
+        asm_comment!(asm, "gen_direct_jmp: existing block");
         asm.mark_branch_start(&branch);
         branch.gen_fn.call(asm, Target::CodePtr(block_addr), None);
         asm.mark_branch_end(&branch);
@@ -2753,7 +3103,7 @@ pub fn gen_direct_jump(jit: &mut JITState, ctx: &Context, target0: BlockId, asm:
         BranchTarget::Block(blockref)
     } else {
         // The branch is effectively empty (a noop)
-        asm.comment("gen_direct_jmp: fallthrough");
+        asm_comment!(asm, "gen_direct_jmp: fallthrough");
         asm.mark_branch_start(&branch);
         asm.mark_branch_end(&branch);
         branch.gen_fn.set_shape(BranchShape::Next0);
@@ -2762,7 +3112,7 @@ pub fn gen_direct_jump(jit: &mut JITState, ctx: &Context, target0: BlockId, asm:
         // compile the target block right after this one (fallthrough).
         BranchTarget::Stub(Box::new(BranchStub {
             address: None,
-            ctx: ctx.clone(),
+            ctx: *ctx,
             iseq: Cell::new(target0.iseq),
             iseq_idx: target0.idx,
         }))
@@ -2777,16 +3127,13 @@ pub fn defer_compilation(
     asm: &mut Assembler,
     ocb: &mut OutlinedCb,
 ) {
-    if asm.ctx.chain_depth != 0 {
+    if asm.ctx.is_deferred() {
         panic!("Double defer!");
     }

-    let mut next_ctx = asm.ctx.clone();
+    let mut next_ctx = asm.ctx;

-    if next_ctx.chain_depth == u8::MAX {
-        panic!("max block version chain depth reached!");
-    }
-    next_ctx.chain_depth += 1;
+    next_ctx.mark_as_deferred();

     let branch = new_pending_branch(jit, BranchGenFn::JumpToTarget0(Cell::new(BranchShape::Default)));

@@ -2798,8 +3145,14 @@ pub fn defer_compilation(
     // Likely a stub due to the increased chain depth
     let target0_address = branch.set_target(0, blockid, &next_ctx, ocb);

+    // Pad the block if it has the potential to be invalidated. This must be
+    // done before gen_fn() in case the jump is overwritten by a fallthrough.
+    if jit.block_entry_exit.is_some() {
+        asm.pad_inval_patch();
+    }
+
     // Call the branch generation function
-    asm.comment("defer_compilation");
+    asm_comment!(asm, "defer_compilation");
     asm.mark_branch_start(&branch);
     if let Some(dst_addr) = target0_address {
         branch.gen_fn.call(asm, Target::CodePtr(dst_addr), None);
@@ -2951,7 +3304,7 @@ pub fn invalidate_block_version(blockref: &BlockRef) {
     // Get a pointer to the generated code for this block
     let block_start = block.start_addr;

-    // Make the the start of the block do an exit. This handles OOM situations
+    // Make the start of the block do an exit. This handles OOM situations
     // and some cases where we can't efficiently patch incoming branches.
     // Do this first, since in case there is a fallthrough branch into this
     // block, the patching loop below can overwrite the start of the block.
@@ -2977,13 +3330,14 @@ pub fn invalidate_block_version(blockref: &BlockRef) {
         let mut asm = Assembler::new();
         asm.jmp(block_entry_exit.as_side_exit());
         cb.set_dropped_bytes(false);
-        asm.compile(&mut cb, Some(ocb));
+        asm.compile(&mut cb, Some(ocb)).expect("can rewrite existing code");

         assert!(
             cb.get_write_ptr() <= block_end,
-            "invalidation wrote past end of block (code_size: {:?}, new_size: {})",
+            "invalidation wrote past end of block (code_size: {:?}, new_size: {}, start_addr: {:?})",
             block.code_size(),
-            cb.get_write_ptr().into_i64() - block_start.into_i64(),
+            cb.get_write_ptr().as_offset() - block_start.as_offset(),
+            block.start_addr.raw_ptr(cb),
         );
         cb.set_write_ptr(cur_pos);
         cb.set_dropped_bytes(cur_dropped_bytes);
@@ -3024,7 +3378,7 @@ pub fn invalidate_block_version(blockref: &BlockRef) {
                 address: Some(stub_addr),
                 iseq: block.iseq.clone(),
                 iseq_idx: block.iseq_range.start,
-                ctx: block.ctx.clone(),
+                ctx: block.ctx,
             })))));

             // Check if the invalidated block immediately follows
@@ -3047,7 +3401,7 @@ pub fn invalidate_block_version(blockref: &BlockRef) {
             if !target_next && branch.code_size() > old_branch_size {
                 panic!(
                     "invalidated branch grew in size (start_addr: {:?}, old_size: {}, new_size: {})",
-                    branch.start_addr.raw_ptr(), old_branch_size, branch.code_size()
+                    branch.start_addr.raw_ptr(cb), old_branch_size, branch.code_size()
                 );
             }
         }
@@ -3089,9 +3443,9 @@ pub fn invalidate_block_version(blockref: &BlockRef) {
     // invalidated branch pointers. Example:
     //   def foo(n)
     //     if n == 2
-    //       # 1.times{} to use a cfunc to avoid exiting from the
-    //       # frame which will use the retained return address
-    //       return 1.times { Object.define_method(:foo) {} }
+    //       # 1.times.each to create a cfunc frame to preserve the JIT frame
+    //       # which will return to a stub housed in an invalidated block
+    //       return 1.times.each { Object.define_method(:foo) {} }
     //     end
     //
     //     foo(n + 1)
@@ -3139,6 +3493,65 @@ mod tests {
     use crate::core::*;

     #[test]
+    fn type_size() {
+        // Check that we can store types in 4 bits,
+        // and all local types in 32 bits
+        assert_eq!(mem::size_of::<Type>(), 1);
+        assert!(Type::BlockParamProxy as usize <= 0b1111);
+        assert!(MAX_LOCAL_TYPES * 4 <= 32);
+    }
+
+    #[test]
+    fn tempmapping_size() {
+        assert_eq!(mem::size_of::<TempMapping>(), 1);
+    }
+
+    #[test]
+    fn local_types() {
+        let mut ctx = Context::default();
+
+        for i in 0..MAX_LOCAL_TYPES {
+            ctx.set_local_type(i, Type::Fixnum);
+            assert_eq!(ctx.get_local_type(i), Type::Fixnum);
+            ctx.set_local_type(i, Type::BlockParamProxy);
+            assert_eq!(ctx.get_local_type(i), Type::BlockParamProxy);
+        }
+
+        ctx.set_local_type(0, Type::Fixnum);
+        ctx.clear_local_types();
+        assert!(ctx.get_local_type(0) == Type::Unknown);
+
+        // Make sure we don't accidentally set bits incorrectly
+        let mut ctx = Context::default();
+        ctx.set_local_type(0, Type::Fixnum);
+        assert_eq!(ctx.get_local_type(0), Type::Fixnum);
+        ctx.set_local_type(2, Type::Fixnum);
+        ctx.set_local_type(1, Type::BlockParamProxy);
+        assert_eq!(ctx.get_local_type(0), Type::Fixnum);
+        assert_eq!(ctx.get_local_type(2), Type::Fixnum);
+    }
+
+    #[test]
+    fn tempmapping() {
+        let t = TempMapping::map_to_stack(Type::Unknown);
+        assert_eq!(t.get_kind(), MapToStack);
+        assert_eq!(t.get_type(), Type::Unknown);
+
+        let t = TempMapping::map_to_stack(Type::TString);
+        assert_eq!(t.get_kind(), MapToStack);
+        assert_eq!(t.get_type(), Type::TString);
+
+        let t = TempMapping::map_to_local(7);
+        assert_eq!(t.get_kind(), MapToLocal);
+        assert_eq!(t.get_local_idx(), 7);
+    }
+
+    #[test]
+    fn context_size() {
+        assert_eq!(mem::size_of::<Context>(), 23);
+    }
+
+    #[test]
     fn types() {
         // Valid src => dst
         assert_eq!(Type::Unknown.diff(Type::Unknown), TypeDiff::Compatible(0));
@@ -3162,7 +3575,7 @@ mod tests {
             assert_eq!(reg_temps.get(stack_idx), false);
         }

-        // Set 0, 2, 7
+        // Set 0, 2, 7 (RegTemps: 10100001)
         reg_temps.set(0, true);
         reg_temps.set(2, true);
         reg_temps.set(3, true);
@@ -3178,6 +3591,17 @@ mod tests {
         assert_eq!(reg_temps.get(5), false);
         assert_eq!(reg_temps.get(6), false);
         assert_eq!(reg_temps.get(7), true);
+
+        // Test conflicts
+        assert_eq!(5, get_option!(num_temp_regs));
+        assert_eq!(reg_temps.conflicts_with(0), false); // already set, but no conflict
+        assert_eq!(reg_temps.conflicts_with(1), false);
+        assert_eq!(reg_temps.conflicts_with(2), true); // already set, and conflicts with 7
+        assert_eq!(reg_temps.conflicts_with(3), false);
+        assert_eq!(reg_temps.conflicts_with(4), false);
+        assert_eq!(reg_temps.conflicts_with(5), true); // not set, and will conflict with 0
+        assert_eq!(reg_temps.conflicts_with(6), false);
+        assert_eq!(reg_temps.conflicts_with(7), true); // already set, and conflicts with 2
     }

     #[test]
@@ -3195,6 +3619,60 @@ mod tests {
     }

     #[test]
+    fn context_upgrade_local() {
+        let mut asm = Assembler::new();
+        asm.stack_push_local(0);
+        asm.ctx.upgrade_opnd_type(StackOpnd(0), Type::Nil);
+        assert_eq!(Type::Nil, asm.ctx.get_opnd_type(StackOpnd(0)));
+    }
+
+    #[test]
+    fn context_chain_depth() {
+        let mut ctx = Context::default();
+        assert_eq!(ctx.get_chain_depth(), 0);
+        assert_eq!(ctx.is_return_landing(), false);
+        assert_eq!(ctx.is_deferred(), false);
+
+        for _ in 0..5 {
+            ctx.increment_chain_depth();
+        }
+        assert_eq!(ctx.get_chain_depth(), 5);
+
+        ctx.set_as_return_landing();
+        assert_eq!(ctx.is_return_landing(), true);
+
+        ctx.clear_return_landing();
+        assert_eq!(ctx.is_return_landing(), false);
+
+        ctx.mark_as_deferred();
+        assert_eq!(ctx.is_deferred(), true);
+
+        ctx.reset_chain_depth_and_defer();
+        assert_eq!(ctx.get_chain_depth(), 0);
+        assert_eq!(ctx.is_deferred(), false);
+    }
+
+    #[test]
+    fn shift_stack_for_send() {
+        let mut asm = Assembler::new();
+
+        // Push values to simulate send(:name, arg) with 6 items already on-stack
+        for _ in 0..6 {
+            asm.stack_push(Type::Fixnum);
+        }
+        asm.stack_push(Type::Unknown);
+        asm.stack_push(Type::ImmSymbol);
+        asm.stack_push(Type::Unknown);
+
+        // This method takes argc of the sendee, not argc of send
+        asm.shift_stack(1);
+
+        // The symbol should be gone
+        assert_eq!(Type::Unknown, asm.ctx.get_opnd_type(StackOpnd(0)));
+        assert_eq!(Type::Unknown, asm.ctx.get_opnd_type(StackOpnd(1)));
+    }
+
+    #[test]
     fn test_miri_ref_unchecked() {
         let blockid = BlockId {
             iseq: ptr::null(),