Diffstat (limited to 'yjit/src/core.rs')
-rw-r--r-- | yjit/src/core.rs | 3382
1 file changed, 2502 insertions, 880 deletions
diff --git a/yjit/src/core.rs b/yjit/src/core.rs index 64585653d9..4152eab02c 100644 --- a/yjit/src/core.rs +++ b/yjit/src/core.rs @@ -1,18 +1,32 @@ -use crate::asm::x86_64::*; +//! Code versioning, retained live control flow graph mutations, type tracking, etc. + +// So we can comment on individual uses of `unsafe` in `unsafe` functions +#![warn(unsafe_op_in_unsafe_fn)] + use crate::asm::*; +use crate::backend::ir::*; use crate::codegen::*; use crate::virtualmem::CodePtr; use crate::cruby::*; use crate::options::*; use crate::stats::*; use crate::utils::*; +#[cfg(feature="disasm")] +use crate::disasm::*; use core::ffi::c_void; use std::cell::*; -use std::hash::{Hash, Hasher}; +use std::collections::HashSet; +use std::fmt; use std::mem; -use std::rc::{Rc}; -use InsnOpnd::*; -use TempMapping::*; +use std::mem::transmute; +use std::ops::Range; +use std::rc::Rc; +use mem::MaybeUninit; +use std::ptr; +use ptr::NonNull; +use YARVOpnd::*; +use TempMappingKind::*; +use crate::invariants::*; // Maximum number of temp value types we keep track of pub const MAX_TEMP_TYPES: usize = 8; @@ -20,10 +34,15 @@ pub const MAX_TEMP_TYPES: usize = 8; // Maximum number of local variable types we keep track of const MAX_LOCAL_TYPES: usize = 8; +/// An index into `ISEQ_BODY(iseq)->iseq_encoded`. Points +/// to a YARV instruction or an instruction operand. +pub type IseqIdx = u16; + // Represent the type of a value (local/stack/self) in YJIT -#[derive(Copy, Clone, PartialEq, Eq, Debug)] +#[derive(Copy, Clone, Hash, PartialEq, Eq, Debug)] +#[repr(u8)] pub enum Type { - Unknown, + Unknown = 0, UnknownImm, UnknownHeap, Nil, @@ -31,15 +50,20 @@ pub enum Type { False, Fixnum, Flonum, - Array, - Hash, ImmSymbol, - #[allow(unused)] - HeapSymbol, - TString, // An object with the T_STRING flag set, possibly an rb_cString CString, // An un-subclassed string of type rb_cString (can have instance vars in some cases) + TArray, // An object with the T_ARRAY flag set, possibly an rb_cArray + CArray, // An un-subclassed array of type rb_cArray (can have instance vars in some cases) + THash, // An object with the T_HASH flag set, possibly an rb_cHash + CHash, // An un-subclassed hash of type rb_cHash (can have instance vars in some cases) + + BlockParamProxy, // A special sentinel value indicating the block parameter should be read from + // the current surrounding cfp + + // The context currently relies on types taking at most 4 bits (max value 15) + // to encode, so if we add any more, we will need to refactor the context. } // Default initialization @@ -66,18 +90,27 @@ impl Type { } else if val.flonum_p() { Type::Flonum } else { - unreachable!() + unreachable!("Illegal value: {:?}", val) } } else { // Core.rs can't reference rb_cString because it's linked by Rust-only tests. // But CString vs TString is only an optimisation and shouldn't affect correctness. #[cfg(not(test))] - if val.class_of() == unsafe { rb_cString } { - return Type::CString; + match val.class_of() { + class if class == unsafe { rb_cArray } => return Type::CArray, + class if class == unsafe { rb_cHash } => return Type::CHash, + class if class == unsafe { rb_cString } => return Type::CString, + _ => {} + } + // We likewise can't reference rb_block_param_proxy, but it's again an optimisation; + // we can just treat it as a normal Object. 
+ #[cfg(not(test))] + if val == unsafe { rb_block_param_proxy } { + return Type::BlockParamProxy; } match val.builtin_type() { - RUBY_T_ARRAY => Type::Array, - RUBY_T_HASH => Type::Hash, + RUBY_T_ARRAY => Type::TArray, + RUBY_T_HASH => Type::THash, RUBY_T_STRING => Type::TString, _ => Type::UnknownHeap, } @@ -117,15 +150,32 @@ impl Type { pub fn is_heap(&self) -> bool { match self { Type::UnknownHeap => true, - Type::Array => true, - Type::Hash => true, - Type::HeapSymbol => true, + Type::TArray => true, + Type::CArray => true, + Type::THash => true, + Type::CHash => true, Type::TString => true, Type::CString => true, + Type::BlockParamProxy => true, _ => false, } } + /// Check if it's a T_ARRAY object (both TArray and CArray are T_ARRAY) + pub fn is_array(&self) -> bool { + matches!(self, Type::TArray | Type::CArray) + } + + /// Check if it's a T_HASH object (both THash and CHash are T_HASH) + pub fn is_hash(&self) -> bool { + matches!(self, Type::THash | Type::CHash) + } + + /// Check if it's a T_STRING object (both TString and CString are T_STRING) + pub fn is_string(&self) -> bool { + matches!(self, Type::TString | Type::CString) + } + /// Returns an Option with the T_ value type if it is known, otherwise None pub fn known_value_type(&self) -> Option<ruby_value_type> { match self { @@ -134,11 +184,12 @@ impl Type { Type::False => Some(RUBY_T_FALSE), Type::Fixnum => Some(RUBY_T_FIXNUM), Type::Flonum => Some(RUBY_T_FLOAT), - Type::Array => Some(RUBY_T_ARRAY), - Type::Hash => Some(RUBY_T_HASH), - Type::ImmSymbol | Type::HeapSymbol => Some(RUBY_T_SYMBOL), + Type::TArray | Type::CArray => Some(RUBY_T_ARRAY), + Type::THash | Type::CHash => Some(RUBY_T_HASH), + Type::ImmSymbol => Some(RUBY_T_SYMBOL), Type::TString | Type::CString => Some(RUBY_T_STRING), - Type::Unknown | Type::UnknownImm | Type::UnknownHeap => None + Type::Unknown | Type::UnknownImm | Type::UnknownHeap => None, + Type::BlockParamProxy => None, } } @@ -151,7 +202,9 @@ impl Type { Type::False => Some(rb_cFalseClass), Type::Fixnum => Some(rb_cInteger), Type::Flonum => Some(rb_cFloat), - Type::ImmSymbol | Type::HeapSymbol => Some(rb_cSymbol), + Type::ImmSymbol => Some(rb_cSymbol), + Type::CArray => Some(rb_cArray), + Type::CHash => Some(rb_cHash), Type::CString => Some(rb_cString), _ => None, } @@ -169,7 +222,7 @@ impl Type { } } - /// Returns an Option with the exact value if it is known, otherwise None + /// Returns an Option boolean representing whether the value is truthy if known, otherwise None pub fn known_truthy(&self) -> Option<bool> { match self { Type::Nil => Some(false), @@ -180,112 +233,281 @@ impl Type { } } + /// Returns an Option boolean representing whether the value is equal to nil if known, otherwise None + pub fn known_nil(&self) -> Option<bool> { + match (self, self.known_truthy()) { + (Type::Nil, _) => Some(true), + (Type::False, _) => Some(false), // Qfalse is not nil + (_, Some(true)) => Some(false), // if truthy, can't be nil + (_, _) => None // otherwise unknown + } + } + /// Compute a difference between two value types - /// Returns 0 if the two are the same - /// Returns > 0 if different but compatible - /// Returns usize::MAX if incompatible - pub fn diff(self, dst: Self) -> usize { + pub fn diff(self, dst: Self) -> TypeDiff { // Perfect match, difference is zero if self == dst { - return 0; + return TypeDiff::Compatible(0); } // Any type can flow into an unknown type if dst == Type::Unknown { - return 1; + return TypeDiff::Compatible(1); + } + + // A CArray is also a TArray. 
+ if self == Type::CArray && dst == Type::TArray { + return TypeDiff::Compatible(1); + } + + // A CHash is also a THash. + if self == Type::CHash && dst == Type::THash { + return TypeDiff::Compatible(1); } // A CString is also a TString. if self == Type::CString && dst == Type::TString { - return 1; + return TypeDiff::Compatible(1); } // Specific heap type into unknown heap type is imperfect but valid if self.is_heap() && dst == Type::UnknownHeap { - return 1; + return TypeDiff::Compatible(1); } // Specific immediate type into unknown immediate type is imperfect but valid if self.is_imm() && dst == Type::UnknownImm { - return 1; + return TypeDiff::Compatible(1); } // Incompatible types - return usize::MAX; + return TypeDiff::Incompatible; } /// Upgrade this type into a more specific compatible type /// The new type must be compatible and at least as specific as the previously known type. - fn upgrade(&mut self, src: Self) { - // Here we're checking that src is more specific than self - assert!(src.diff(*self) != usize::MAX); - *self = src; + fn upgrade(&mut self, new_type: Self) { + // We can only upgrade to a type that is more specific + assert!(new_type.diff(*self) != TypeDiff::Incompatible); + *self = new_type; } } +#[derive(Debug, Eq, PartialEq)] +pub enum TypeDiff { + // usize == 0: Same type + // usize >= 1: Different but compatible. The smaller, the more compatible. + Compatible(usize), + Incompatible, +} + +#[derive(Copy, Clone, Eq, Hash, PartialEq, Debug)] +#[repr(u8)] +pub enum TempMappingKind +{ + MapToStack = 0, + MapToSelf = 1, + MapToLocal = 2, +} + // Potential mapping of a value on the temporary stack to // self, a local variable or constant so that we can track its type -#[derive(Copy, Clone, Eq, PartialEq, Debug)] -pub enum TempMapping { - MapToStack, // Normal stack value - MapToSelf, // Temp maps to the self operand - MapToLocal(u8), // Temp maps to a local variable with index - //ConstMapping, // Small constant (0, 1, 2, Qnil, Qfalse, Qtrue) +// +// The highest two bits represent TempMappingKind, and the rest of +// the bits are used differently across different kinds. +// * MapToStack: The lowest 5 bits are used for mapping Type. +// * MapToSelf: The remaining bits are not used; the type is stored in self_type. +// * MapToLocal: The lowest 3 bits store the index of a local variable. 
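// Note (not in the original diff): a sketch of the packing described above. With
// TempMappingKind::MapToLocal == 2, mapping local index 3 packs as (2 << 6) | 3 ==
// 0b1000_0011, i.e. the kind in the top two bits and the local index in the low three bits.
// A MapToStack mapping instead keeps a Type value in the low five bits, e.g.
// (0 << 6) | (Type::Fixnum as u8).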
+#[derive(Copy, Clone, Eq, Hash, PartialEq, Debug)] +pub struct TempMapping(u8); + +impl TempMapping { + pub fn map_to_stack(t: Type) -> TempMapping + { + let kind_bits = TempMappingKind::MapToStack as u8; + let type_bits = t as u8; + assert!(type_bits <= 0b11111); + let bits = (kind_bits << 6) | (type_bits & 0b11111); + TempMapping(bits) + } + + pub fn map_to_self() -> TempMapping + { + let kind_bits = TempMappingKind::MapToSelf as u8; + let bits = kind_bits << 6; + TempMapping(bits) + } + + pub fn map_to_local(local_idx: u8) -> TempMapping + { + let kind_bits = TempMappingKind::MapToLocal as u8; + assert!(local_idx <= 0b111); + let bits = (kind_bits << 6) | (local_idx & 0b111); + TempMapping(bits) + } + + pub fn without_type(&self) -> TempMapping + { + if self.get_kind() != TempMappingKind::MapToStack { + return *self; + } + + TempMapping::map_to_stack(Type::Unknown) + } + + pub fn get_kind(&self) -> TempMappingKind + { + // Take the two highest bits + let TempMapping(bits) = self; + let kind_bits = bits >> 6; + assert!(kind_bits <= 2); + unsafe { transmute::<u8, TempMappingKind>(kind_bits) } + } + + pub fn get_type(&self) -> Type + { + assert!(self.get_kind() == TempMappingKind::MapToStack); + + // Take the 5 lowest bits + let TempMapping(bits) = self; + let type_bits = bits & 0b11111; + unsafe { transmute::<u8, Type>(type_bits) } + } + + pub fn get_local_idx(&self) -> u8 + { + assert!(self.get_kind() == TempMappingKind::MapToLocal); + + // Take the 3 lowest bits + let TempMapping(bits) = self; + bits & 0b111 + } } impl Default for TempMapping { fn default() -> Self { - MapToStack + TempMapping::map_to_stack(Type::Unknown) } } -// Operand to a bytecode instruction +// Operand to a YARV bytecode instruction #[derive(Copy, Clone, PartialEq, Eq, Debug)] -pub enum InsnOpnd { +pub enum YARVOpnd { // The value is self SelfOpnd, // Temporary stack operand with stack index - StackOpnd(u16), + StackOpnd(u8), +} + +impl From<Opnd> for YARVOpnd { + fn from(value: Opnd) -> Self { + match value { + Opnd::Stack { idx, .. } => StackOpnd(idx.try_into().unwrap()), + _ => unreachable!("{:?} cannot be converted to YARVOpnd", value) + } + } +} + +/// Maximum index of stack temps that could be in a register +pub const MAX_REG_TEMPS: u8 = 8; + +/// Bitmap of which stack temps are in a register +#[derive(Copy, Clone, Default, Eq, Hash, PartialEq, Debug)] +pub struct RegTemps(u8); + +impl RegTemps { + pub fn get(&self, index: u8) -> bool { + assert!(index < MAX_REG_TEMPS); + (self.0 >> index) & 1 == 1 + } + + pub fn set(&mut self, index: u8, value: bool) { + assert!(index < MAX_REG_TEMPS); + if value { + self.0 = self.0 | (1 << index); + } else { + self.0 = self.0 & !(1 << index); + } + } + + pub fn as_u8(&self) -> u8 { + self.0 + } + + /// Return true if there's a register that conflicts with a given stack_idx. + pub fn conflicts_with(&self, stack_idx: u8) -> bool { + let mut other_idx = stack_idx as usize % get_option!(num_temp_regs); + while other_idx < MAX_REG_TEMPS as usize { + if stack_idx as usize != other_idx && self.get(other_idx as u8) { + return true; + } + other_idx += get_option!(num_temp_regs); + } + false + } } +/// Bits for chain_depth_return_landing_defer +const RETURN_LANDING_BIT: u8 = 0b10000000; +const DEFER_BIT: u8 = 0b01000000; +const CHAIN_DEPTH_MASK: u8 = 0b00111111; // 63 + /// Code generation context /// Contains information we can use to specialize/optimize code /// There are a lot of context objects so we try to keep the size small. 
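// Note (not in the original diff): with #[repr(packed)] the fields declared below add up
// to 23 bytes with no padding (five one-byte fields, plus u32 + u16 + u32 + u64), which is
// what keeping Context small amounts to in practice.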
-#[derive(Copy, Clone, Default, Debug)] +#[derive(Copy, Clone, Default, Eq, Hash, PartialEq, Debug)] +#[repr(packed)] pub struct Context { // Number of values currently on the temporary stack - stack_size: u16, + stack_size: u8, // Offset of the JIT SP relative to the interpreter SP // This represents how far the JIT's SP is from the "real" SP - sp_offset: i16, + sp_offset: i8, - // Depth of this block in the sidechain (eg: inline-cache chain) - chain_depth: u8, + /// Bitmap of which stack temps are in a register + reg_temps: RegTemps, - // Local variable types we keep track of - local_types: [Type; MAX_LOCAL_TYPES], - - // Temporary variable types we keep track of - temp_types: [Type; MAX_TEMP_TYPES], + /// Fields packed into u8 + /// - 1st bit from the left: Whether this code is the target of a JIT-to-JIT Ruby return ([Self::is_return_landing]) + /// - 2nd bit from the left: Whether the compilation of this code has been deferred ([Self::is_deferred]) + /// - Last 6 bits (max: 63): Depth of this block in the sidechain (eg: inline-cache chain) + chain_depth_and_flags: u8, // Type we track for self self_type: Type, - // Mapping of temp stack entries to types we track - temp_mapping: [TempMapping; MAX_TEMP_TYPES], + // Local variable types we keep track of + // We store 8 local types, requiring 4 bits each, for a total of 32 bits + local_types: u32, + + // Temp mapping kinds we track + // 8 temp mappings * 2 bits, total 16 bits + temp_mapping_kind: u16, + + // Stack slot type/local_idx we track + // 8 temp types * 4 bits, total 32 bits + temp_payload: u32, + + /// A pointer to a block ISEQ supplied by the caller. 0 if not inlined. + /// Not using IseqPtr to satisfy Default trait, and not using Option for #[repr(packed)] + /// TODO: This could be u16 if we have a global or per-ISEQ HashMap to convert IseqPtr + /// to serial indexes. We're thinking of overhauling Context structure in Ruby 3.4 which + /// could allow this to consume no bytes, so we're leaving this as is. + inline_block: u64, } /// Tuple of (iseq, idx) used to identify basic blocks /// There are a lot of blockid objects so we try to keep the size small. 
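// Note (not in the original diff): #[repr(packed)] together with the narrower u16 index
// keeps BlockId at 10 bytes on 64-bit targets (8-byte iseq pointer + 2-byte index), where
// the previous naturally aligned pointer + u32 layout occupied 16 bytes.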
#[derive(Copy, Clone, PartialEq, Eq, Debug)] +#[repr(packed)] pub struct BlockId { /// Instruction sequence pub iseq: IseqPtr, /// Index in the iseq where the block starts - pub idx: u32, + pub idx: u16, } /// Branch code shape enumeration @@ -296,116 +518,457 @@ pub enum BranchShape { Default, // Neither target is next } -// Branch code generation function signature -type BranchGenFn = - fn(cb: &mut CodeBlock, target0: CodePtr, target1: Option<CodePtr>, shape: BranchShape) -> (); +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum BranchGenFn { + BranchIf(Cell<BranchShape>), + BranchNil(Cell<BranchShape>), + BranchUnless(Cell<BranchShape>), + JumpToTarget0(Cell<BranchShape>), + JNZToTarget0, + JZToTarget0, + JBEToTarget0, + JBToTarget0, + JOMulToTarget0, + JITReturn, +} + +impl BranchGenFn { + pub fn call(&self, asm: &mut Assembler, target0: Target, target1: Option<Target>) { + match self { + BranchGenFn::BranchIf(shape) => { + match shape.get() { + BranchShape::Next0 => asm.jz(target1.unwrap()), + BranchShape::Next1 => asm.jnz(target0), + BranchShape::Default => { + asm.jnz(target0); + asm.jmp(target1.unwrap()); + } + } + } + BranchGenFn::BranchNil(shape) => { + match shape.get() { + BranchShape::Next0 => asm.jne(target1.unwrap()), + BranchShape::Next1 => asm.je(target0), + BranchShape::Default => { + asm.je(target0); + asm.jmp(target1.unwrap()); + } + } + } + BranchGenFn::BranchUnless(shape) => { + match shape.get() { + BranchShape::Next0 => asm.jnz(target1.unwrap()), + BranchShape::Next1 => asm.jz(target0), + BranchShape::Default => { + asm.jz(target0); + asm.jmp(target1.unwrap()); + } + } + } + BranchGenFn::JumpToTarget0(shape) => { + if shape.get() == BranchShape::Next1 { + panic!("Branch shape Next1 not allowed in JumpToTarget0!"); + } + if shape.get() == BranchShape::Default { + asm.jmp(target0); + } + } + BranchGenFn::JNZToTarget0 => { + asm.jnz(target0) + } + BranchGenFn::JZToTarget0 => { + asm.jz(target0) + } + BranchGenFn::JBEToTarget0 => { + asm.jbe(target0) + } + BranchGenFn::JBToTarget0 => { + asm.jb(target0) + } + BranchGenFn::JOMulToTarget0 => { + asm.jo_mul(target0) + } + BranchGenFn::JITReturn => { + asm_comment!(asm, "update cfp->jit_return"); + let jit_return = RUBY_OFFSET_CFP_JIT_RETURN - RUBY_SIZEOF_CONTROL_FRAME as i32; + let raw_ptr = asm.lea_jump_target(target0); + asm.mov(Opnd::mem(64, CFP, jit_return), raw_ptr); + } + } + } + + pub fn get_shape(&self) -> BranchShape { + match self { + BranchGenFn::BranchIf(shape) | + BranchGenFn::BranchNil(shape) | + BranchGenFn::BranchUnless(shape) | + BranchGenFn::JumpToTarget0(shape) => shape.get(), + BranchGenFn::JNZToTarget0 | + BranchGenFn::JZToTarget0 | + BranchGenFn::JBEToTarget0 | + BranchGenFn::JBToTarget0 | + BranchGenFn::JOMulToTarget0 | + BranchGenFn::JITReturn => BranchShape::Default, + } + } + + pub fn set_shape(&self, new_shape: BranchShape) { + match self { + BranchGenFn::BranchIf(shape) | + BranchGenFn::BranchNil(shape) | + BranchGenFn::BranchUnless(shape) => { + shape.set(new_shape); + } + BranchGenFn::JumpToTarget0(shape) => { + if new_shape == BranchShape::Next1 { + panic!("Branch shape Next1 not allowed in JumpToTarget0!"); + } + shape.set(new_shape); + } + BranchGenFn::JNZToTarget0 | + BranchGenFn::JZToTarget0 | + BranchGenFn::JBEToTarget0 | + BranchGenFn::JBToTarget0 | + BranchGenFn::JOMulToTarget0 | + BranchGenFn::JITReturn => { + assert_eq!(new_shape, BranchShape::Default); + } + } + } +} + +/// A place that a branch could jump to +#[derive(Debug, Clone)] +enum BranchTarget { + Stub(Box<BranchStub>), // 
Not compiled yet + Block(BlockRef), // Already compiled +} + +impl BranchTarget { + fn get_address(&self) -> Option<CodePtr> { + match self { + BranchTarget::Stub(stub) => stub.address, + BranchTarget::Block(blockref) => Some(unsafe { blockref.as_ref() }.start_addr), + } + } + + fn get_blockid(&self) -> BlockId { + match self { + BranchTarget::Stub(stub) => BlockId { iseq: stub.iseq.get(), idx: stub.iseq_idx }, + BranchTarget::Block(blockref) => unsafe { blockref.as_ref() }.get_blockid(), + } + } + + fn get_ctx(&self) -> Context { + match self { + BranchTarget::Stub(stub) => stub.ctx, + BranchTarget::Block(blockref) => unsafe { blockref.as_ref() }.ctx, + } + } + + fn get_block(&self) -> Option<BlockRef> { + match self { + BranchTarget::Stub(_) => None, + BranchTarget::Block(blockref) => Some(*blockref), + } + } + + fn set_iseq(&self, iseq: IseqPtr) { + match self { + BranchTarget::Stub(stub) => stub.iseq.set(iseq), + BranchTarget::Block(blockref) => unsafe { blockref.as_ref() }.iseq.set(iseq), + } + } +} + +#[derive(Debug, Clone)] +struct BranchStub { + address: Option<CodePtr>, + iseq: Cell<IseqPtr>, + iseq_idx: IseqIdx, + ctx: Context, +} /// Store info about an outgoing branch in a code segment /// Note: care must be taken to minimize the size of branch objects -struct Branch { +pub struct Branch { // Block this is attached to block: BlockRef, // Positions where the generated code starts and ends - start_addr: Option<CodePtr>, - end_addr: Option<CodePtr>, - - // Context right after the branch instruction - #[allow(unused)] // set but not read at the moment - src_ctx: Context, + start_addr: CodePtr, + end_addr: Cell<CodePtr>, // exclusive // Branch target blocks and their contexts - targets: [Option<BlockId>; 2], - target_ctxs: [Context; 2], - blocks: [Option<BlockRef>; 2], - - // Jump target addresses - dst_addrs: [Option<CodePtr>; 2], + targets: [Cell<Option<Box<BranchTarget>>>; 2], // Branch code generation function gen_fn: BranchGenFn, +} + +/// A [Branch] for a [Block] that is under construction. +/// Fields correspond, but may be `None` during construction. +pub struct PendingBranch { + /// Allocation holder for the address of the constructed branch + /// in error paths Box deallocates it. 
+ uninit_branch: Box<MaybeUninit<Branch>>, + + /// Branch code generation function + gen_fn: BranchGenFn, + + /// Positions where the generated code starts and ends + start_addr: Cell<Option<CodePtr>>, + end_addr: Cell<Option<CodePtr>>, // exclusive + + /// Branch target blocks and their contexts + targets: [Cell<Option<Box<BranchTarget>>>; 2], +} + +impl Branch { + // Compute the size of the branch code + fn code_size(&self) -> usize { + (self.end_addr.get().as_offset() - self.start_addr.as_offset()) as usize + } + + /// Get the address of one of the branch destination + fn get_target_address(&self, target_idx: usize) -> Option<CodePtr> { + unsafe { + self.targets[target_idx] + .ref_unchecked() + .as_ref() + .and_then(|target| target.get_address()) + } + } + + fn get_stub_count(&self) -> usize { + let mut count = 0; + for target in self.targets.iter() { + if unsafe { + // SAFETY: no mutation + matches!( + target.ref_unchecked().as_ref().map(Box::as_ref), + Some(BranchTarget::Stub(_)) + ) + } { + count += 1; + } + } + count + } - // Shape of the branch - shape: BranchShape, + fn assert_layout(&self) { + let shape = self.gen_fn.get_shape(); + assert!( + !(shape == BranchShape::Default && 0 == self.code_size()), + "zero-size branches are incorrect when code for neither targets are adjacent" + // One needs to issue some instruction to steer to the branch target + // when falling through isn't an option. + ); + } } impl std::fmt::Debug for Branch { + // Can't derive this because `targets: !Copy` due to Cell. fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // TODO: expand this if needed. #[derive(Debug)] on Branch gave a - // strange error related to BranchGenFn + let targets = unsafe { + // SAFETY: + // While the references are live for the result of this function, + // no mutation happens because we are only calling derived fmt::Debug functions. + [self.targets[0].as_ptr().as_ref().unwrap(), self.targets[1].as_ptr().as_ref().unwrap()] + }; + formatter .debug_struct("Branch") + .field("block", &self.block) .field("start", &self.start_addr) .field("end", &self.end_addr) - .field("targets", &self.targets) + .field("targets", &targets) + .field("gen_fn", &self.gen_fn) .finish() } } -impl Branch { - // Compute the size of the branch code - fn code_size(&self) -> usize { - (self.end_addr.unwrap().raw_ptr() as usize) - (self.start_addr.unwrap().raw_ptr() as usize) +impl PendingBranch { + /// Set up a branch target at `target_idx`. Find an existing block to branch to + /// or generate a stub for one. + fn set_target( + &self, + target_idx: u32, + target: BlockId, + ctx: &Context, + ocb: &mut OutlinedCb, + ) -> Option<CodePtr> { + // If the block already exists + if let Some(blockref) = find_block_version(target, ctx) { + let block = unsafe { blockref.as_ref() }; + + // Fill out the target with this block + self.targets[target_idx.as_usize()] + .set(Some(Box::new(BranchTarget::Block(blockref)))); + return Some(block.start_addr); + } + + // The branch struct is uninitialized right now but as a stable address. + // We make sure the stub runs after the branch is initialized. 
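// Note (not in the original diff): the Box<MaybeUninit<Branch>> allocated for uninit_branch
// already has a fixed heap address even though the Branch value itself is not written yet,
// so it is sound to bake that address into the stub below as long as the stub cannot run
// before into_branch() initializes the Branch.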
+ let branch_struct_addr = self.uninit_branch.as_ptr() as usize; + let stub_addr = gen_branch_stub(ctx, ocb, branch_struct_addr, target_idx); + + if let Some(stub_addr) = stub_addr { + // Fill the branch target with a stub + self.targets[target_idx.as_usize()].set(Some(Box::new(BranchTarget::Stub(Box::new(BranchStub { + address: Some(stub_addr), + iseq: Cell::new(target.iseq), + iseq_idx: target.idx, + ctx: *ctx, + }))))); + } + + stub_addr + } + + // Construct the branch and wire it up in the grpah + fn into_branch(mut self, uninit_block: BlockRef) -> BranchRef { + // Make the branch + let branch = Branch { + block: uninit_block, + start_addr: self.start_addr.get().unwrap(), + end_addr: Cell::new(self.end_addr.get().unwrap()), + targets: self.targets, + gen_fn: self.gen_fn, + }; + // Move it to the designated place on + // the heap and unwrap MaybeUninit. + self.uninit_branch.write(branch); + let raw_branch: *mut MaybeUninit<Branch> = Box::into_raw(self.uninit_branch); + let branchref = NonNull::new(raw_branch as *mut Branch).expect("no null from Box"); + + // SAFETY: just allocated it + let branch = unsafe { branchref.as_ref() }; + // For block branch targets, put the new branch in the + // appropriate incoming list. + for target in branch.targets.iter() { + // SAFETY: no mutation + let out_block: Option<BlockRef> = unsafe { + target.ref_unchecked().as_ref().and_then(|target| target.get_block()) + }; + + if let Some(out_block) = out_block { + // SAFETY: These blockrefs come from set_target() which only puts blocks from + // ISeqs, which are all initialized. Note that uninit_block isn't in any ISeq + // payload yet. + unsafe { out_block.as_ref() }.incoming.push(branchref); + } + } + + branch.assert_layout(); + + branchref } } -// In case this block is invalidated, these two pieces of info -// help to remove all pointers to this block in the system. -#[derive(Debug)] -pub struct CmeDependency { - pub receiver_klass: VALUE, - pub callee_cme: *const rb_callable_method_entry_t, +// Store info about code used on YJIT entry +pub struct Entry { + // Positions where the generated code starts and ends + start_addr: CodePtr, + end_addr: CodePtr, // exclusive } +/// A [Branch] for a [Block] that is under construction. +pub struct PendingEntry { + pub uninit_entry: Box<MaybeUninit<Entry>>, + start_addr: Cell<Option<CodePtr>>, + end_addr: Cell<Option<CodePtr>>, // exclusive +} + +impl PendingEntry { + // Construct the entry in the heap + pub fn into_entry(mut self) -> EntryRef { + // Make the entry + let entry = Entry { + start_addr: self.start_addr.get().unwrap(), + end_addr: self.end_addr.get().unwrap(), + }; + // Move it to the designated place on the heap and unwrap MaybeUninit. + self.uninit_entry.write(entry); + let raw_entry: *mut MaybeUninit<Entry> = Box::into_raw(self.uninit_entry); + NonNull::new(raw_entry as *mut Entry).expect("no null from Box") + } +} + +// In case a block is invalidated, this helps to remove all pointers to the block. +pub type CmePtr = *const rb_callable_method_entry_t; + /// Basic block version /// Represents a portion of an iseq compiled with a given context /// Note: care must be taken to minimize the size of block_t objects #[derive(Debug)] pub struct Block { - // Bytecode sequence (iseq, idx) this is a version of - blockid: BlockId, + // The byte code instruction sequence this is a version of. + // Can change due to moving GC. 
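// Note (not in the original diff): holding this pointer in a Cell is what lets
// rb_yjit_iseq_update_references() further down rewrite it through a shared &Block
// when compaction moves the iseq object.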
+ iseq: Cell<IseqPtr>, - // Index one past the last instruction for this block in the iseq - end_idx: u32, + // Index range covered by this version in `ISEQ_BODY(iseq)->iseq_encoded`. + iseq_range: Range<IseqIdx>, // Context at the start of the block // This should never be mutated ctx: Context, // Positions where the generated code starts and ends - start_addr: Option<CodePtr>, - end_addr: Option<CodePtr>, + start_addr: CodePtr, + end_addr: Cell<CodePtr>, // List of incoming branches (from predecessors) // These are reference counted (ownership shared between predecessor and successors) - incoming: Vec<BranchRef>, + incoming: MutableBranchList, // NOTE: we might actually be able to store the branches here without refcounting // however, using a RefCell makes it easy to get a pointer to Branch objects // // List of outgoing branches (to successors) - outgoing: Vec<BranchRef>, + outgoing: Box<[BranchRef]>, // FIXME: should these be code pointers instead? // Offsets for GC managed objects in the mainline code block - gc_object_offsets: Vec<u32>, + gc_obj_offsets: Box<[u32]>, // CME dependencies of this block, to help to remove all pointers to this // block in the system. - cme_dependencies: Vec<CmeDependency>, + cme_dependencies: Box<[Cell<CmePtr>]>, // Code address of an exit for `ctx` and `blockid`. // Used for block invalidation. - pub entry_exit: Option<CodePtr>, + entry_exit: Option<CodePtr>, } -/// Reference-counted pointer to a block that can be borrowed mutably. -/// Wrapped so we could implement [Hash] and [Eq] for use with stdlib collections. -#[derive(Debug)] -pub struct BlockRef(Rc<RefCell<Block>>); - -/// Reference-counted pointer to a branch that can be borrowed mutably -type BranchRef = Rc<RefCell<Branch>>; +/// Pointer to a [Block]. +/// +/// # Safety +/// +/// _Never_ derive a `&mut Block` from this and always use +/// [std::ptr::NonNull::as_ref] to get a `&Block`. `&'a mut` +/// in Rust asserts that there are no other references live +/// over the lifetime `'a`. This uniqueness assertion does +/// not hold in many situations for us, even when you ignore +/// the fact that our control flow graph can have cycles. +/// Here are just two examples where we have overlapping references: +/// - Yielding to a different OS thread within the same +/// ractor during compilation +/// - The GC calling [rb_yjit_iseq_mark] during compilation +/// +/// Technically, for soundness, we also need to ensure that +/// the we have the VM lock while the result of `as_ref()` +/// is live, so that no deallocation happens while the +/// shared reference is live. The vast majority of our code run while +/// holding the VM lock, though. +pub type BlockRef = NonNull<Block>; + +/// Pointer to a [Branch]. See [BlockRef] for notes about +/// proper usage. +pub type BranchRef = NonNull<Branch>; + +/// Pointer to an entry that is already added to an ISEQ +pub type EntryRef = NonNull<Entry>; /// List of block versions for a given blockid type VersionList = Vec<BlockRef>; @@ -414,55 +977,51 @@ type VersionList = Vec<BlockRef>; /// An instance of this is stored on each iseq type VersionMap = Vec<VersionList>; -impl BlockRef { - /// Constructor - pub fn new(rc: Rc<RefCell<Block>>) -> Self { - Self(rc) - } - - /// Borrow the block through [RefCell]. - pub fn borrow(&self) -> Ref<'_, Block> { - self.0.borrow() - } - - /// Borrow the block for mutation through [RefCell]. 
- pub fn borrow_mut(&self) -> RefMut<'_, Block> { - self.0.borrow_mut() - } -} - -impl Clone for BlockRef { - /// Clone the [Rc] - fn clone(&self) -> Self { - Self(self.0.clone()) +/// [Interior mutability][1] wrapper for a list of branches. +/// O(n) insertion, but space efficient. We generally expect +/// blocks to have only a few branches. +/// +/// [1]: https://doc.rust-lang.org/std/cell/struct.UnsafeCell.html +#[repr(transparent)] +struct MutableBranchList(Cell<Box<[BranchRef]>>); + +impl MutableBranchList { + fn push(&self, branch: BranchRef) { + // Temporary move the boxed slice out of self. + // oom=abort is load bearing here... + let mut current_list = self.0.take().into_vec(); + current_list.push(branch); + self.0.set(current_list.into_boxed_slice()); } } -impl Hash for BlockRef { - /// Hash the reference by hashing the pointer - fn hash<H: Hasher>(&self, state: &mut H) { - let rc_ptr = Rc::as_ptr(&self.0); - rc_ptr.hash(state); - } -} +impl fmt::Debug for MutableBranchList { + fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + // SAFETY: the derived Clone for boxed slices does not mutate this Cell + let branches = unsafe { self.0.ref_unchecked().clone() }; -impl PartialEq for BlockRef { - /// Equality defined by allocation identity - fn eq(&self, other: &Self) -> bool { - Rc::ptr_eq(&self.0, &other.0) + formatter.debug_list().entries(branches.into_iter()).finish() } } -/// It's comparison by identity so all the requirements are statisfied -impl Eq for BlockRef {} - /// This is all the data YJIT stores on an iseq /// This will be dynamically allocated by C code /// C code should pass an &mut IseqPayload to us /// when calling into YJIT #[derive(Default)] pub struct IseqPayload { - version_map: VersionMap, + // Basic block versions + pub version_map: VersionMap, + + // Indexes of code pages used by this this ISEQ + pub pages: HashSet<usize>, + + // List of ISEQ entry codes + pub entries: Vec<EntryRef>, + + // Blocks that are invalidated but are not yet deallocated. + // The code GC will free them later. + pub dead_blocks: Vec<BlockRef>, } impl IseqPayload { @@ -478,14 +1037,14 @@ impl IseqPayload { /// Get the payload for an iseq. For safety it's up to the caller to ensure the returned `&mut` /// upholds aliasing rules and that the argument is a valid iseq. -pub unsafe fn load_iseq_payload(iseq: IseqPtr) -> Option<&'static mut IseqPayload> { - let payload = rb_iseq_get_yjit_payload(iseq); +pub fn get_iseq_payload(iseq: IseqPtr) -> Option<&'static mut IseqPayload> { + let payload = unsafe { rb_iseq_get_yjit_payload(iseq) }; let payload: *mut IseqPayload = payload.cast(); - payload.as_mut() + unsafe { payload.as_mut() } } /// Get the payload object associated with an iseq. Create one if none exists. -fn get_iseq_payload(iseq: IseqPtr) -> &'static mut IseqPayload { +pub fn get_or_create_iseq_payload(iseq: IseqPtr) -> &'static mut IseqPayload { type VoidPtr = *mut c_void; let payload_non_null = unsafe { @@ -498,7 +1057,8 @@ fn get_iseq_payload(iseq: IseqPtr) -> &'static mut IseqPayload { // We drop the payload with Box::from_raw when the GC frees the iseq and calls us. // NOTE(alan): Sometimes we read from an iseq without ever writing to it. // We allocate in those cases anyways. 
- let new_payload = Box::into_raw(Box::new(IseqPayload::default())); + let new_payload = IseqPayload::default(); + let new_payload = Box::into_raw(Box::new(new_payload)); rb_iseq_set_yjit_payload(iseq, new_payload as VoidPtr); new_payload @@ -514,6 +1074,68 @@ fn get_iseq_payload(iseq: IseqPtr) -> &'static mut IseqPayload { unsafe { payload_non_null.as_mut() }.unwrap() } +/// Iterate over all existing ISEQs +pub fn for_each_iseq<F: FnMut(IseqPtr)>(mut callback: F) { + unsafe extern "C" fn callback_wrapper(iseq: IseqPtr, data: *mut c_void) { + // SAFETY: points to the local below + let callback: &mut &mut dyn FnMut(IseqPtr) -> bool = unsafe { std::mem::transmute(&mut *data) }; + callback(iseq); + } + let mut data: &mut dyn FnMut(IseqPtr) = &mut callback; + unsafe { rb_yjit_for_each_iseq(Some(callback_wrapper), (&mut data) as *mut _ as *mut c_void) }; +} + +/// Iterate over all ISEQ payloads +pub fn for_each_iseq_payload<F: FnMut(&IseqPayload)>(mut callback: F) { + for_each_iseq(|iseq| { + if let Some(iseq_payload) = get_iseq_payload(iseq) { + callback(iseq_payload); + } + }); +} + +/// Iterate over all on-stack ISEQs +pub fn for_each_on_stack_iseq<F: FnMut(IseqPtr)>(mut callback: F) { + unsafe extern "C" fn callback_wrapper(iseq: IseqPtr, data: *mut c_void) { + // SAFETY: points to the local below + let callback: &mut &mut dyn FnMut(IseqPtr) -> bool = unsafe { std::mem::transmute(&mut *data) }; + callback(iseq); + } + let mut data: &mut dyn FnMut(IseqPtr) = &mut callback; + unsafe { rb_jit_cont_each_iseq(Some(callback_wrapper), (&mut data) as *mut _ as *mut c_void) }; +} + +/// Iterate over all on-stack ISEQ payloads +pub fn for_each_on_stack_iseq_payload<F: FnMut(&IseqPayload)>(mut callback: F) { + for_each_on_stack_iseq(|iseq| { + if let Some(iseq_payload) = get_iseq_payload(iseq) { + callback(iseq_payload); + } + }); +} + +/// Iterate over all NOT on-stack ISEQ payloads +pub fn for_each_off_stack_iseq_payload<F: FnMut(&mut IseqPayload)>(mut callback: F) { + // Get all ISEQs on the heap. Note that rb_objspace_each_objects() runs GC first, + // which could move ISEQ pointers when GC.auto_compact = true. + // So for_each_on_stack_iseq() must be called after this, which doesn't run GC. + let mut iseqs: Vec<IseqPtr> = vec![]; + for_each_iseq(|iseq| iseqs.push(iseq)); + + // Get all ISEQs that are on a CFP of existing ECs. + let mut on_stack_iseqs: HashSet<IseqPtr> = HashSet::new(); + for_each_on_stack_iseq(|iseq| { on_stack_iseqs.insert(iseq); }); + + // Invoke the callback for iseqs - on_stack_iseqs + for iseq in iseqs { + if !on_stack_iseqs.contains(&iseq) { + if let Some(iseq_payload) = get_iseq_payload(iseq) { + callback(iseq_payload); + } + } + } +} + /// Free the per-iseq payload #[no_mangle] pub extern "C" fn rb_yjit_iseq_free(payload: *mut c_void) { @@ -526,31 +1148,50 @@ pub extern "C" fn rb_yjit_iseq_free(payload: *mut c_void) { } }; - use crate::invariants; - // Take ownership of the payload with Box::from_raw(). // It drops right before this function returns. // SAFETY: We got the pointer from Box::into_raw(). let payload = unsafe { Box::from_raw(payload) }; - // Remove all blocks in the payload from global invariants table. + // Free all blocks in version_map. The GC doesn't free running iseqs. 
for versions in &payload.version_map { for block in versions { - invariants::block_assumptions_free(&block); + // SAFETY: blocks in the version_map are always well connected + unsafe { free_block(*block, true) }; } } + + // Free dead blocks + for block in payload.dead_blocks { + unsafe { free_block(block, false) }; + } + + // Free all entries + for entryref in payload.entries.iter() { + let entry = unsafe { Box::from_raw(entryref.as_ptr()) }; + mem::drop(entry); + } + + // Increment the freed iseq count + incr_counter!(freed_iseq_count); } -/// GC callback for marking GC objects in the the per-iseq payload. +/// GC callback for marking GC objects in the per-iseq payload. #[no_mangle] pub extern "C" fn rb_yjit_iseq_mark(payload: *mut c_void) { let payload = if payload.is_null() { // Nothing to mark. return; } else { - // SAFETY: It looks like the GC takes the VM lock while marking - // so we should be satisfying aliasing rules here. - unsafe { &*(payload as *const IseqPayload) } + // SAFETY: The GC takes the VM lock while marking, which + // we assert, so we should be synchronized and data race free. + // + // For aliasing, having the VM lock hopefully also implies that no one + // else has an overlapping &mut IseqPayload. + unsafe { + rb_yjit_assert_holding_vm_lock(); + &*(payload as *const IseqPayload) + } }; // For marking VALUEs written into the inline code block. @@ -559,29 +1200,56 @@ pub extern "C" fn rb_yjit_iseq_mark(payload: *mut c_void) { for versions in &payload.version_map { for block in versions { - let block = block.borrow(); + // SAFETY: all blocks inside version_map are initialized. + let block = unsafe { block.as_ref() }; + mark_block(block, cb, false); + } + } + // Mark dead blocks, since there could be stubs pointing at them + for blockref in &payload.dead_blocks { + // SAFETY: dead blocks come from version_map, which only have initialized blocks + let block = unsafe { blockref.as_ref() }; + mark_block(block, cb, true); + } - unsafe { rb_gc_mark_movable(block.blockid.iseq.into()) }; + return; - // Mark method entry dependencies - for cme_dep in &block.cme_dependencies { - unsafe { rb_gc_mark_movable(cme_dep.receiver_klass) }; - unsafe { rb_gc_mark_movable(cme_dep.callee_cme.into()) }; - } + fn mark_block(block: &Block, cb: &CodeBlock, dead: bool) { + unsafe { rb_gc_mark_movable(block.iseq.get().into()) }; - // Mark outgoing branch entries - for branch in &block.outgoing { - let branch = branch.borrow(); - for target in &branch.targets { - if let Some(target) = target { - unsafe { rb_gc_mark_movable(target.iseq.into()) }; - } + // Mark method entry dependencies + for cme_dep in block.cme_dependencies.iter() { + unsafe { rb_gc_mark_movable(cme_dep.get().into()) }; + } + + // Mark outgoing branch entries + for branch in block.outgoing.iter() { + let branch = unsafe { branch.as_ref() }; + for target in branch.targets.iter() { + // SAFETY: no mutation inside unsafe + let target_iseq = unsafe { + target.ref_unchecked().as_ref().and_then(|target| { + // Avoid get_blockid() on blockref. Can be dangling on dead blocks, + // and the iseq housing the block already naturally handles it. + if target.get_block().is_some() { + None + } else { + Some(target.get_blockid().iseq) + } + }) + }; + + if let Some(target_iseq) = target_iseq { + unsafe { rb_gc_mark_movable(target_iseq.into()) }; } } + } - // Walk over references to objects in generated code. 
- for offset in &block.gc_object_offsets { - let value_address: *const u8 = cb.get_ptr(offset.as_usize()).raw_ptr(); + // Mark references to objects in generated code. + // Skip for dead blocks since they shouldn't run. + if !dead { + for offset in block.gc_obj_offsets.iter() { + let value_address: *const u8 = cb.get_ptr(offset.as_usize()).raw_ptr(cb); // Creating an unaligned pointer is well defined unlike in C. let value_address = value_address as *const VALUE; @@ -595,7 +1263,7 @@ pub extern "C" fn rb_yjit_iseq_mark(payload: *mut c_void) { } } -/// GC callback for updating GC objects in the the per-iseq payload. +/// GC callback for updating GC objects in the per-iseq payload. /// This is a mirror of [rb_yjit_iseq_mark]. #[no_mangle] pub extern "C" fn rb_yjit_iseq_update_references(payload: *mut c_void) { @@ -603,9 +1271,15 @@ pub extern "C" fn rb_yjit_iseq_update_references(payload: *mut c_void) { // Nothing to update. return; } else { - // SAFETY: It looks like the GC takes the VM lock while updating references - // so we should be satisfying aliasing rules here. - unsafe { &*(payload as *const IseqPayload) } + // SAFETY: The GC takes the VM lock while marking, which + // we assert, so we should be synchronized and data race free. + // + // For aliasing, having the VM lock hopefully also implies that no one + // else has an overlapping &mut IseqPayload. + unsafe { + rb_yjit_assert_holding_vm_lock(); + &*(payload as *const IseqPayload) + } }; // Evict other threads from generated code since we are about to patch them. @@ -616,32 +1290,73 @@ pub extern "C" fn rb_yjit_iseq_update_references(payload: *mut c_void) { let cb = CodegenGlobals::get_inline_cb(); for versions in &payload.version_map { - for block in versions { - let mut block = block.borrow_mut(); + for version in versions { + // SAFETY: all blocks inside version_map are initialized + let block = unsafe { version.as_ref() }; + block_update_references(block, cb, false); + } + } + // Update dead blocks, since there could be stubs pointing at them + for blockref in &payload.dead_blocks { + // SAFETY: dead blocks come from version_map, which only have initialized blocks + let block = unsafe { blockref.as_ref() }; + block_update_references(block, cb, true); + } - block.blockid.iseq = unsafe { rb_gc_location(block.blockid.iseq.into()) }.as_iseq(); + // Note that we would have returned already if YJIT is off. 
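// Note (not in the original diff): a reading of the calls below, not stated in the diff --
// patching object references with write_mem() above needs the code pages to be writable,
// so both the inline and outlined code blocks are switched back to executable here before
// the VM resumes running generated code.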
+ cb.mark_all_executable(); - // Update method entry dependencies - for cme_dep in &mut block.cme_dependencies { - cme_dep.receiver_klass = unsafe { rb_gc_location(cme_dep.receiver_klass) }; - cme_dep.callee_cme = unsafe { rb_gc_location(cme_dep.callee_cme.into()) }.as_cme(); - } + CodegenGlobals::get_outlined_cb() + .unwrap() + .mark_all_executable(); - // Update outgoing branch entries - for branch in &block.outgoing { - let mut branch = branch.borrow_mut(); - for target in &mut branch.targets { - if let Some(target) = target { - target.iseq = unsafe { rb_gc_location(target.iseq.into()) }.as_iseq(); - } + return; + + fn block_update_references(block: &Block, cb: &mut CodeBlock, dead: bool) { + block.iseq.set(unsafe { rb_gc_location(block.iseq.get().into()) }.as_iseq()); + + // Update method entry dependencies + for cme_dep in block.cme_dependencies.iter() { + let cur_cme: VALUE = cme_dep.get().into(); + let new_cme = unsafe { rb_gc_location(cur_cme) }.as_cme(); + cme_dep.set(new_cme); + } + + // Update outgoing branch entries + for branch in block.outgoing.iter() { + let branch = unsafe { branch.as_ref() }; + for target in branch.targets.iter() { + // SAFETY: no mutation inside unsafe + let current_iseq = unsafe { + target.ref_unchecked().as_ref().and_then(|target| { + // Avoid get_blockid() on blockref. Can be dangling on dead blocks, + // and the iseq housing the block already naturally handles it. + if target.get_block().is_some() { + None + } else { + Some(target.get_blockid().iseq) + } + }) + }; + + if let Some(current_iseq) = current_iseq { + let updated_iseq = unsafe { rb_gc_location(current_iseq.into()) } + .as_iseq(); + // SAFETY: the Cell::set is not on the reference given out + // by ref_unchecked. + unsafe { target.ref_unchecked().as_ref().unwrap().set_iseq(updated_iseq) }; } } + } - // Walk over references to objects in generated code. - for offset in &block.gc_object_offsets { + // Update references to objects in generated code. + // Skip for dead blocks since they shouldn't run and + // so there is no potential of writing over invalidation jumps + if !dead { + for offset in block.gc_obj_offsets.iter() { let offset_to_value = offset.as_usize(); let value_code_ptr = cb.get_ptr(offset_to_value); - let value_ptr: *const u8 = value_code_ptr.raw_ptr(); + let value_ptr: *const u8 = value_code_ptr.raw_ptr(cb); // Creating an unaligned pointer is well defined unlike in C. let value_ptr = value_ptr as *mut VALUE; @@ -653,25 +1368,30 @@ pub extern "C" fn rb_yjit_iseq_update_references(payload: *mut c_void) { if new_addr != object { for (byte_idx, &byte) in new_addr.as_u64().to_le_bytes().iter().enumerate() { let byte_code_ptr = value_code_ptr.add_bytes(byte_idx); - cb.get_mem().write_byte(byte_code_ptr, byte) + cb.write_mem(byte_code_ptr, byte) .expect("patching existing code should be within bounds"); } } } } - } - - // Note that we would have returned already if YJIT is off. - cb.mark_all_executable(); - CodegenGlobals::get_outlined_cb() - .unwrap() - .mark_all_executable(); + } } /// Get all blocks for a particular place in an iseq. 
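// Note (not in the original diff): version_map is a Vec with one VersionList per instruction
// index, so an index at or beyond version_map.len() simply means nothing has been compiled
// for that spot yet; the reworked get_version_list() below reports that case as None.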
-fn get_version_list(blockid: BlockId) -> &'static mut VersionList { - let payload = get_iseq_payload(blockid.iseq); +fn get_version_list(blockid: BlockId) -> Option<&'static mut VersionList> { + let insn_idx = blockid.idx.as_usize(); + match get_iseq_payload(blockid.iseq) { + Some(payload) if insn_idx < payload.version_map.len() => { + Some(payload.version_map.get_mut(insn_idx).unwrap()) + }, + _ => None + } +} + +/// Get or create all blocks for a particular place in an iseq. +fn get_or_create_version_list(blockid: BlockId) -> &'static mut VersionList { + let payload = get_or_create_iseq_payload(blockid.iseq); let insn_idx = blockid.idx.as_usize(); // Expand the version map as necessary @@ -686,32 +1406,39 @@ fn get_version_list(blockid: BlockId) -> &'static mut VersionList { /// Take all of the blocks for a particular place in an iseq pub fn take_version_list(blockid: BlockId) -> VersionList { - let payload = get_iseq_payload(blockid.iseq); let insn_idx = blockid.idx.as_usize(); - - if insn_idx >= payload.version_map.len() { - VersionList::default() - } else { - mem::take(&mut payload.version_map[insn_idx]) + match get_iseq_payload(blockid.iseq) { + Some(payload) if insn_idx < payload.version_map.len() => { + mem::take(&mut payload.version_map[insn_idx]) + }, + _ => VersionList::default(), } } /// Count the number of block versions matching a given blockid -fn get_num_versions(blockid: BlockId) -> usize { +/// `inlined: true` counts inlined versions, and `inlined: false` counts other versions. +fn get_num_versions(blockid: BlockId, inlined: bool) -> usize { let insn_idx = blockid.idx.as_usize(); - let payload = get_iseq_payload(blockid.iseq); - - payload - .version_map - .get(insn_idx) - .map(|versions| versions.len()) - .unwrap_or(0) + match get_iseq_payload(blockid.iseq) { + Some(payload) => { + payload + .version_map + .get(insn_idx) + .map(|versions| { + versions.iter().filter(|&&version| + unsafe { version.as_ref() }.ctx.inline() == inlined + ).count() + }) + .unwrap_or(0) + } + None => 0, + } } -/// Get a list of block versions generated for an iseq +/// Get or create a list of block versions generated for an iseq /// This is used for disassembly (see disasm.rs) -pub fn get_iseq_block_list(iseq: IseqPtr) -> Vec<BlockRef> { - let payload = get_iseq_payload(iseq); +pub fn get_or_create_iseq_block_list(iseq: IseqPtr) -> Vec<BlockRef> { + let payload = get_or_create_iseq_payload(iseq); let mut blocks = Vec::<BlockRef>::new(); @@ -722,7 +1449,7 @@ pub fn get_iseq_block_list(iseq: IseqPtr) -> Vec<BlockRef> { // For each version at this instruction index for version in version_list { // Clone the block ref and add it to the list - blocks.push(version.clone()); + blocks.push(*version); } } @@ -732,82 +1459,125 @@ pub fn get_iseq_block_list(iseq: IseqPtr) -> Vec<BlockRef> { /// Retrieve a basic block version for an (iseq, idx) tuple /// This will return None if no version is found fn find_block_version(blockid: BlockId, ctx: &Context) -> Option<BlockRef> { - let versions = get_version_list(blockid); + let versions = match get_version_list(blockid) { + Some(versions) => versions, + None => return None, + }; // Best match found let mut best_version: Option<BlockRef> = None; let mut best_diff = usize::MAX; // For each version matching the blockid - for blockref in versions.iter_mut() { - let block = blockref.borrow(); - let diff = ctx.diff(&block.ctx); + for blockref in versions.iter() { + let block = unsafe { blockref.as_ref() }; // Note that we always prefer the first matching // version 
found because of inline-cache chains - if diff < best_diff { - best_version = Some(blockref.clone()); - best_diff = diff; - } - } - - // If greedy versioning is enabled - if get_option!(greedy_versioning) { - // If we're below the version limit, don't settle for an imperfect match - if versions.len() + 1 < get_option!(max_versions) && best_diff > 0 { - return None; + match ctx.diff(&block.ctx) { + TypeDiff::Compatible(diff) if diff < best_diff => { + best_version = Some(*blockref); + best_diff = diff; + } + _ => {} } } return best_version; } +/// Allow inlining a Block up to MAX_INLINE_VERSIONS times. +const MAX_INLINE_VERSIONS: usize = 1000; + /// Produce a generic context when the block version limit is hit for a blockid pub fn limit_block_versions(blockid: BlockId, ctx: &Context) -> Context { // Guard chains implement limits separately, do nothing - if ctx.chain_depth > 0 { + if ctx.get_chain_depth() > 0 { return *ctx; } + let next_versions = get_num_versions(blockid, ctx.inline()) + 1; + let max_versions = if ctx.inline() { + MAX_INLINE_VERSIONS + } else { + get_option!(max_versions) + }; + // If this block version we're about to add will hit the version limit - if get_num_versions(blockid) + 1 >= get_option!(max_versions) { + if next_versions >= max_versions { // Produce a generic context that stores no type information, // but still respects the stack_size and sp_offset constraints. // This new context will then match all future requests. - let mut generic_ctx = Context::default(); - generic_ctx.stack_size = ctx.stack_size; - generic_ctx.sp_offset = ctx.sp_offset; + let generic_ctx = ctx.get_generic_ctx(); + + if cfg!(debug_assertions) { + let mut ctx = ctx.clone(); + if ctx.inline() { + // Suppress TypeDiff::Incompatible from ctx.diff(). We return TypeDiff::Incompatible + // to keep inlining blocks until we hit the limit, but it's safe to give up inlining. + ctx.inline_block = 0; + assert!(generic_ctx.inline_block == 0); + } + + assert_ne!( + TypeDiff::Incompatible, + ctx.diff(&generic_ctx), + "should substitute a compatible context", + ); + } - // Mutate the incoming context return generic_ctx; } + incr_counter_to!(max_inline_versions, next_versions); return *ctx; } -/// Keep track of a block version. Block should be fully constructed. -/// Uses `cb` for running write barriers. -fn add_block_version(blockref: &BlockRef, cb: &CodeBlock) { - let block = blockref.borrow(); +/// Install a block version into its [IseqPayload], letting the GC track its +/// lifetime, and allowing it to be considered for use for other +/// blocks we might generate. Uses `cb` for running write barriers. +/// +/// # Safety +/// +/// The block must be fully initialized. Its incoming and outgoing edges, +/// if there are any, must point to initialized blocks, too. +/// +/// Note that the block might gain edges after this function returns, +/// as can happen during [gen_block_series]. Initialized here doesn't mean +/// ready to be consumed or that the machine code tracked by the block is +/// ready to be run. +/// +/// Due to this transient state where a block is tracked by the GC by +/// being inside an [IseqPayload] but not ready to be executed, it's +/// generally unsound to call any Ruby methods during codegen. That has +/// the potential to run blocks which are not ready. 
+unsafe fn add_block_version(blockref: BlockRef, cb: &CodeBlock) { + // SAFETY: caller ensures initialization + let block = unsafe { blockref.as_ref() }; // Function entry blocks must have stack size 0 - assert!(!(block.blockid.idx == 0 && block.ctx.stack_size > 0)); + assert!(!(block.iseq_range.start == 0 && block.ctx.stack_size > 0)); - let version_list = get_version_list(block.blockid); + let version_list = get_or_create_version_list(block.get_blockid()); - version_list.push(blockref.clone()); + // If this the first block being compiled with this block id + if version_list.len() == 0 { + incr_counter!(compiled_blockid_count); + } + + version_list.push(blockref); + version_list.shrink_to_fit(); // By writing the new block to the iseq, the iseq now // contains new references to Ruby objects. Run write barriers. - let iseq: VALUE = block.blockid.iseq.into(); + let iseq: VALUE = block.iseq.get().into(); for dep in block.iter_cme_deps() { - obj_written!(iseq, dep.receiver_klass); - obj_written!(iseq, dep.callee_cme.into()); + obj_written!(iseq, dep.into()); } // Run write barriers for all objects in generated code. - for offset in &block.gc_object_offsets { - let value_address: *const u8 = cb.get_ptr(offset.as_usize()).raw_ptr(); + for offset in block.gc_obj_offsets.iter() { + let value_address: *const u8 = cb.get_ptr(offset.as_usize()).raw_ptr(cb); // Creating an unaligned pointer is well defined unlike in C. let value_address: *const VALUE = value_address.cast(); @@ -816,251 +1586,234 @@ fn add_block_version(blockref: &BlockRef, cb: &CodeBlock) { } incr_counter!(compiled_block_count); + + // Mark code pages for code GC + let iseq_payload = get_iseq_payload(block.iseq.get()).unwrap(); + for page in cb.addrs_to_pages(block.start_addr, block.end_addr.get()) { + iseq_payload.pages.insert(page); + } } /// Remove a block version from the version map of its parent ISEQ fn remove_block_version(blockref: &BlockRef) { - let block = blockref.borrow(); - let version_list = get_version_list(block.blockid); + let block = unsafe { blockref.as_ref() }; + let version_list = match get_version_list(block.get_blockid()) { + Some(version_list) => version_list, + None => return, + }; // Retain the versions that are not this one version_list.retain(|other| blockref != other); } -//=========================================================================== -// I put the implementation of traits for core.rs types below -// We can move these closer to the above structs later if we want. -//=========================================================================== +impl JITState { + // Finish compiling and turn a jit state into a block + // note that the block is still not in shape. 
+ pub fn into_block(self, end_insn_idx: IseqIdx, start_addr: CodePtr, end_addr: CodePtr, gc_obj_offsets: Vec<u32>) -> BlockRef { + // Allocate the block and get its pointer + let blockref: *mut MaybeUninit<Block> = Box::into_raw(Box::new(MaybeUninit::uninit())); + + incr_counter_by!(num_gc_obj_refs, gc_obj_offsets.len()); + + // Make the new block + let block = MaybeUninit::new(Block { + start_addr, + iseq: Cell::new(self.get_iseq()), + iseq_range: self.get_starting_insn_idx()..end_insn_idx, + ctx: self.get_starting_ctx(), + end_addr: Cell::new(end_addr), + incoming: MutableBranchList(Cell::default()), + gc_obj_offsets: gc_obj_offsets.into_boxed_slice(), + entry_exit: self.get_block_entry_exit(), + cme_dependencies: self.method_lookup_assumptions.into_iter().map(Cell::new).collect(), + // Pending branches => actual branches + outgoing: self.pending_outgoing.into_iter().map(|pending_out| { + let pending_out = Rc::try_unwrap(pending_out) + .ok().expect("all PendingBranchRefs should be unique when ready to construct a Block"); + pending_out.into_branch(NonNull::new(blockref as *mut Block).expect("no null from Box")) + }).collect() + }); + // Initialize it on the heap + // SAFETY: allocated with Box above + unsafe { ptr::write(blockref, block) }; + + // Block is initialized now. Note that MaybeUnint<T> has the same layout as T. + let blockref = NonNull::new(blockref as *mut Block).expect("no null from Box"); -impl Block { - pub fn new(blockid: BlockId, ctx: &Context) -> BlockRef { - let block = Block { - blockid, - end_idx: 0, - ctx: *ctx, - start_addr: None, - end_addr: None, - incoming: Vec::new(), - outgoing: Vec::new(), - gc_object_offsets: Vec::new(), - cme_dependencies: Vec::new(), - entry_exit: None, - }; + // Track all the assumptions the block makes as invariants + if self.block_assumes_single_ractor { + track_single_ractor_assumption(blockref); + } + for bop in self.bop_assumptions { + track_bop_assumption(blockref, bop); + } + // SAFETY: just allocated it above + for cme in unsafe { blockref.as_ref() }.cme_dependencies.iter() { + track_method_lookup_stability_assumption(blockref, cme.get()); + } + if let Some(idlist) = self.stable_constant_names_assumption { + track_stable_constant_names_assumption(blockref, idlist); + } + for klass in self.no_singleton_class_assumptions { + track_no_singleton_class_assumption(blockref, klass); + } - // Wrap the block in a reference counted refcell - // so that the block ownership can be shared - BlockRef::new(Rc::new(RefCell::new(block))) + blockref } +} +impl Block { pub fn get_blockid(&self) -> BlockId { - self.blockid + BlockId { iseq: self.iseq.get(), idx: self.iseq_range.start } } - pub fn get_end_idx(&self) -> u32 { - self.end_idx + pub fn get_end_idx(&self) -> IseqIdx { + self.iseq_range.end } - pub fn get_ctx(&self) -> Context { - self.ctx + pub fn get_ctx_count(&self) -> usize { + let mut count = 1; // block.ctx + for branch in self.outgoing.iter() { + // SAFETY: &self implies it's initialized + count += unsafe { branch.as_ref() }.get_stub_count(); + } + count } #[allow(unused)] - pub fn get_start_addr(&self) -> Option<CodePtr> { + pub fn get_start_addr(&self) -> CodePtr { self.start_addr } #[allow(unused)] - pub fn get_end_addr(&self) -> Option<CodePtr> { - self.end_addr + pub fn get_end_addr(&self) -> CodePtr { + self.end_addr.get() } /// Get an immutable iterator over cme dependencies - pub fn iter_cme_deps(&self) -> std::slice::Iter<'_, CmeDependency> { - self.cme_dependencies.iter() + pub fn iter_cme_deps(&self) -> impl Iterator<Item 
= CmePtr> + '_ { + self.cme_dependencies.iter().map(Cell::get) } - /// Set the starting address in the generated code for the block - /// This can be done only once for a block - pub fn set_start_addr(&mut self, addr: CodePtr) { - assert!(self.start_addr.is_none()); - self.start_addr = Some(addr); - } - - /// Set the end address in the generated for the block - /// This can be done only once for a block - pub fn set_end_addr(&mut self, addr: CodePtr) { - // The end address can only be set after the start address is set - assert!(self.start_addr.is_some()); - - // TODO: assert constraint that blocks can shrink but not grow in length - self.end_addr = Some(addr); - } - - /// Set the index of the last instruction in the block - /// This can be done only once for a block - pub fn set_end_idx(&mut self, end_idx: u32) { - assert!(self.end_idx == 0); - self.end_idx = end_idx; - } - - pub fn add_gc_object_offset(self: &mut Block, ptr_offset: u32) { - self.gc_object_offsets.push(ptr_offset); - } - - /// Instantiate a new CmeDependency struct and add it to the list of - /// dependencies for this block. - pub fn add_cme_dependency( - &mut self, - receiver_klass: VALUE, - callee_cme: *const rb_callable_method_entry_t, - ) { - self.cme_dependencies.push(CmeDependency { - receiver_klass, - callee_cme, - }); + // Push an incoming branch ref and shrink the vector + fn push_incoming(&self, branch: BranchRef) { + self.incoming.push(branch); } // Compute the size of the block code pub fn code_size(&self) -> usize { - (self.end_addr.unwrap().raw_ptr() as usize) - (self.start_addr.unwrap().raw_ptr() as usize) + (self.end_addr.get().as_offset() - self.start_addr.as_offset()).try_into().unwrap() } } impl Context { - pub fn new_with_stack_size(size: i16) -> Self { - return Context { - stack_size: size as u16, - sp_offset: size, - chain_depth: 0, - local_types: [Type::Unknown; MAX_LOCAL_TYPES], - temp_types: [Type::Unknown; MAX_TEMP_TYPES], - self_type: Type::Unknown, - temp_mapping: [MapToStack; MAX_TEMP_TYPES], - }; + pub fn get_stack_size(&self) -> u8 { + self.stack_size } - pub fn new() -> Self { - return Self::new_with_stack_size(0); + pub fn set_stack_size(&mut self, stack_size: u8) { + self.stack_size = stack_size; } - pub fn get_stack_size(&self) -> u16 { - self.stack_size + /// Create a new Context that is compatible with self but doesn't have type information. + pub fn get_generic_ctx(&self) -> Context { + let mut generic_ctx = Context::default(); + generic_ctx.stack_size = self.stack_size; + generic_ctx.sp_offset = self.sp_offset; + generic_ctx.reg_temps = self.reg_temps; + if self.is_return_landing() { + generic_ctx.set_as_return_landing(); + } + if self.is_deferred() { + generic_ctx.mark_as_deferred(); + } + generic_ctx } - pub fn get_sp_offset(&self) -> i16 { + /// Create a new Context instance with a given stack_size and sp_offset adjusted + /// accordingly. This is useful when you want to virtually rewind a stack_size for + /// generating a side exit while considering past sp_offset changes on gen_save_sp. 
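A small worked example of that adjustment, using only the accessors shown here (the concrete numbers are made up):

// Illustrative only: suppose five temps were pushed since the last SP write,
// so stack_size and sp_offset have both advanced to 5.
let mut ctx = Context::default();
ctx.set_stack_size(5);
ctx.set_sp_offset(5);

// Rewinding to stack_size 2 subtracts the same delta from sp_offset,
// so SP math in a side exit generated from `rewound` stays correct.
let rewound = ctx.with_stack_size(2);
assert_eq!(rewound.get_stack_size(), 2);
assert_eq!(rewound.get_sp_offset(), 2); // 5 - (5 - 2)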
+ pub fn with_stack_size(&self, stack_size: u8) -> Context { + let mut ctx = *self; + ctx.sp_offset -= (ctx.get_stack_size() as isize - stack_size as isize) as i8; + ctx.stack_size = stack_size; + ctx + } + + pub fn get_sp_offset(&self) -> i8 { self.sp_offset } - pub fn set_sp_offset(&mut self, offset: i16) { + pub fn set_sp_offset(&mut self, offset: i8) { self.sp_offset = offset; } - pub fn get_chain_depth(&self) -> u8 { - self.chain_depth + pub fn get_reg_temps(&self) -> RegTemps { + self.reg_temps } - pub fn reset_chain_depth(&mut self) { - self.chain_depth = 0; + pub fn set_reg_temps(&mut self, reg_temps: RegTemps) { + self.reg_temps = reg_temps; } - pub fn increment_chain_depth(&mut self) { - self.chain_depth += 1; + pub fn get_chain_depth(&self) -> u8 { + self.chain_depth_and_flags & CHAIN_DEPTH_MASK } - /// Get an operand for the adjusted stack pointer address - pub fn sp_opnd(&self, offset_bytes: isize) -> X86Opnd { - let offset = ((self.sp_offset as isize) * (SIZEOF_VALUE as isize)) + offset_bytes; - let offset = offset as i32; - return mem_opnd(64, REG_SP, offset); + pub fn reset_chain_depth_and_defer(&mut self) { + self.chain_depth_and_flags &= !CHAIN_DEPTH_MASK; + self.chain_depth_and_flags &= !DEFER_BIT; } - /// Push one new value on the temp stack with an explicit mapping - /// Return a pointer to the new stack top - pub fn stack_push_mapping(&mut self, (mapping, temp_type): (TempMapping, Type)) -> X86Opnd { - // If type propagation is disabled, store no types - if get_option!(no_type_prop) { - return self.stack_push_mapping((mapping, Type::Unknown)); - } - - let stack_size: usize = self.stack_size.into(); - - // Keep track of the type and mapping of the value - if stack_size < MAX_TEMP_TYPES { - self.temp_mapping[stack_size] = mapping; - self.temp_types[stack_size] = temp_type; - - if let MapToLocal(idx) = mapping { - assert!((idx as usize) < MAX_LOCAL_TYPES); - } + pub fn increment_chain_depth(&mut self) { + if self.get_chain_depth() == CHAIN_DEPTH_MASK { + panic!("max block version chain depth reached!"); } - - self.stack_size += 1; - self.sp_offset += 1; - - // SP points just above the topmost value - let offset = ((self.sp_offset as i32) - 1) * (SIZEOF_VALUE as i32); - return mem_opnd(64, REG_SP, offset); + self.chain_depth_and_flags += 1; } - /// Push one new value on the temp stack - /// Return a pointer to the new stack top - pub fn stack_push(&mut self, val_type: Type) -> X86Opnd { - return self.stack_push_mapping((MapToStack, val_type)); + pub fn set_as_return_landing(&mut self) { + self.chain_depth_and_flags |= RETURN_LANDING_BIT; } - /// Push the self value on the stack - pub fn stack_push_self(&mut self) -> X86Opnd { - return self.stack_push_mapping((MapToSelf, Type::Unknown)); + pub fn clear_return_landing(&mut self) { + self.chain_depth_and_flags &= !RETURN_LANDING_BIT; } - /// Push a local variable on the stack - pub fn stack_push_local(&mut self, local_idx: usize) -> X86Opnd { - if local_idx >= MAX_LOCAL_TYPES { - return self.stack_push(Type::Unknown); - } - - return self.stack_push_mapping((MapToLocal(local_idx as u8), Type::Unknown)); + pub fn is_return_landing(&self) -> bool { + self.chain_depth_and_flags & RETURN_LANDING_BIT != 0 } - // Pop N values off the stack - // Return a pointer to the stack top before the pop operation - pub fn stack_pop(&mut self, n: usize) -> X86Opnd { - assert!(n <= self.stack_size.into()); - - // SP points just above the topmost value - let offset = ((self.sp_offset as i32) - 1) * (SIZEOF_VALUE as i32); - let top = mem_opnd(64, 
REG_SP, offset); - - // Clear the types of the popped values - for i in 0..n { - let idx: usize = (self.stack_size as usize) - i - 1; - - if idx < MAX_TEMP_TYPES { - self.temp_types[idx] = Type::Unknown; - self.temp_mapping[idx] = MapToStack; - } - } - - self.stack_size -= n as u16; - self.sp_offset -= n as i16; + pub fn mark_as_deferred(&mut self) { + self.chain_depth_and_flags |= DEFER_BIT; + } - return top; + pub fn is_deferred(&self) -> bool { + self.chain_depth_and_flags & DEFER_BIT != 0 } - /// Get an operand pointing to a slot on the temp stack - pub fn stack_opnd(&self, idx: i32) -> X86Opnd { - // SP points just above the topmost value - let offset = ((self.sp_offset as i32) - 1 - idx) * (SIZEOF_VALUE as i32); - let opnd = mem_opnd(64, REG_SP, offset); - return opnd; + /// Get an operand for the adjusted stack pointer address + pub fn sp_opnd(&self, offset: i32) -> Opnd { + let offset = (self.sp_offset as i32 + offset) * SIZEOF_VALUE_I32; + return Opnd::mem(64, SP, offset); + } + + /// Stop using a register for a given stack temp. + /// This allows us to reuse the register for a value that we know is dead + /// and will no longer be used (e.g. popped stack temp). + pub fn dealloc_temp_reg(&mut self, stack_idx: u8) { + if stack_idx < MAX_REG_TEMPS { + let mut reg_temps = self.get_reg_temps(); + reg_temps.set(stack_idx, false); + self.set_reg_temps(reg_temps); + } } /// Get the type of an instruction operand - pub fn get_opnd_type(&self, opnd: InsnOpnd) -> Type { + pub fn get_opnd_type(&self, opnd: YARVOpnd) -> Type { match opnd { SelfOpnd => self.self_type, StackOpnd(idx) => { - let idx = idx as u16; assert!(idx < self.stack_size); let stack_idx: usize = (self.stack_size - 1 - idx).into(); @@ -1069,14 +1822,15 @@ impl Context { return Type::Unknown; } - let mapping = self.temp_mapping[stack_idx]; + let mapping = self.get_temp_mapping(stack_idx); - match mapping { + match mapping.get_kind() { MapToSelf => self.self_type, - MapToStack => self.temp_types[(self.stack_size - 1 - idx) as usize], - MapToLocal(idx) => { + MapToStack => mapping.get_type(), + MapToLocal => { + let idx = mapping.get_local_idx(); assert!((idx as usize) < MAX_LOCAL_TYPES); - return self.local_types[idx as usize]; + return self.get_local_type(idx.into()); } } } @@ -1084,15 +1838,90 @@ impl Context { } /// Get the currently tracked type for a local variable - pub fn get_local_type(&self, idx: usize) -> Type { - *self.local_types.get(idx).unwrap_or(&Type::Unknown) + pub fn get_local_type(&self, local_idx: usize) -> Type { + if local_idx >= MAX_LOCAL_TYPES { + return Type::Unknown + } else { + // Each type is stored in 4 bits + let type_bits = (self.local_types >> (4 * local_idx)) & 0b1111; + unsafe { transmute::<u8, Type>(type_bits as u8) } + } + } + + /// Get the current temp mapping for a given stack slot + fn get_temp_mapping(&self, temp_idx: usize) -> TempMapping { + assert!(temp_idx < MAX_TEMP_TYPES); + + // Extract the temp mapping kind + let kind_bits = (self.temp_mapping_kind >> (2 * temp_idx)) & 0b11; + let temp_kind = unsafe { transmute::<u8, TempMappingKind>(kind_bits as u8) }; + + // Extract the payload bits (temp type or local idx) + let payload_bits = (self.temp_payload >> (4 * temp_idx)) & 0b1111; + + match temp_kind { + MapToSelf => TempMapping::map_to_self(), + + MapToStack => { + TempMapping::map_to_stack( + unsafe { transmute::<u8, Type>(payload_bits as u8) } + ) + } + + MapToLocal => { + TempMapping::map_to_local( + payload_bits as u8 + ) + } + } + } + + /// Get the current temp mapping for a 
given stack slot + fn set_temp_mapping(&mut self, temp_idx: usize, mapping: TempMapping) { + assert!(temp_idx < MAX_TEMP_TYPES); + + // Extract the kind bits + let mapping_kind = mapping.get_kind(); + let kind_bits = unsafe { transmute::<TempMappingKind, u8>(mapping_kind) }; + assert!(kind_bits <= 0b11); + + // Extract the payload bits + let payload_bits = match mapping_kind { + MapToSelf => 0, + + MapToStack => { + let t = mapping.get_type(); + unsafe { transmute::<Type, u8>(t) } + } + + MapToLocal => { + mapping.get_local_idx() + } + }; + assert!(payload_bits <= 0b1111); + + // Update the kind bits + { + let mask_bits = 0b11_u16 << (2 * temp_idx); + let shifted_bits = (kind_bits as u16) << (2 * temp_idx); + let all_kind_bits = self.temp_mapping_kind as u16; + self.temp_mapping_kind = (all_kind_bits & !mask_bits) | shifted_bits; + } + + // Update the payload bits + { + let mask_bits = 0b1111_u32 << (4 * temp_idx); + let shifted_bits = (payload_bits as u32) << (4 * temp_idx); + let all_payload_bits = self.temp_payload as u32; + self.temp_payload = (all_payload_bits & !mask_bits) | shifted_bits; + } } /// Upgrade (or "learn") the type of an instruction operand /// This value must be compatible and at least as specific as the previously known type. /// If this value originated from self, or an lvar, the learned type will be /// propagated back to its source. - pub fn upgrade_opnd_type(&mut self, opnd: InsnOpnd, opnd_type: Type) { + pub fn upgrade_opnd_type(&mut self, opnd: YARVOpnd, opnd_type: Type) { // If type propagation is disabled, store no types if get_option!(no_type_prop) { return; @@ -1101,7 +1930,6 @@ impl Context { match opnd { SelfOpnd => self.self_type.upgrade(opnd_type), StackOpnd(idx) => { - let idx = idx as u16; assert!(idx < self.stack_size); let stack_idx = (self.stack_size - 1 - idx) as usize; @@ -1110,15 +1938,24 @@ impl Context { return; } - let mapping = self.temp_mapping[stack_idx]; + let mapping = self.get_temp_mapping(stack_idx); - match mapping { + match mapping.get_kind() { MapToSelf => self.self_type.upgrade(opnd_type), - MapToStack => self.temp_types[stack_idx].upgrade(opnd_type), - MapToLocal(idx) => { - let idx = idx as usize; + MapToStack => { + let mut temp_type = mapping.get_type(); + temp_type.upgrade(opnd_type); + self.set_temp_mapping(stack_idx, TempMapping::map_to_stack(temp_type)); + } + MapToLocal => { + let idx = mapping.get_local_idx() as usize; assert!(idx < MAX_LOCAL_TYPES); - self.local_types[idx].upgrade(opnd_type); + let mut new_type = self.get_local_type(idx); + new_type.upgrade(opnd_type); + self.set_local_type(idx, new_type); + // Re-attach MapToLocal for this StackOpnd(idx). set_local_type() detaches + // all MapToLocal mappings, including the one we're upgrading here. + self.set_opnd_mapping(opnd, mapping); } } } @@ -1130,30 +1967,29 @@ impl Context { This is can be used with stack_push_mapping or set_opnd_mapping to copy a stack value's type while maintaining the mapping. 
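For illustration, a minimal standalone model of this packing scheme: two kind bits per slot in one word, and a four-bit payload (a type or a local index) per slot in another. The names are placeholders, not the real YJIT types.

// Standalone model with placeholder names, not the real YJIT types.
enum Kind { Stack = 0, SelfVal = 1, Local = 2 }

struct Packed { kinds: u16, payloads: u32 }

impl Packed {
    fn set(&mut self, idx: usize, kind: Kind, payload: u8) {
        assert!(idx < 8 && payload <= 0b1111);
        let kmask = 0b11_u16 << (2 * idx);
        self.kinds = (self.kinds & !kmask) | ((kind as u16) << (2 * idx));
        let pmask = 0b1111_u32 << (4 * idx);
        self.payloads = (self.payloads & !pmask) | ((payload as u32) << (4 * idx));
    }

    fn get(&self, idx: usize) -> (u8, u8) {
        let kind = ((self.kinds >> (2 * idx)) & 0b11) as u8;
        let payload = ((self.payloads >> (4 * idx)) & 0b1111) as u8;
        (kind, payload)
    }
}

fn main() {
    let mut packed = Packed { kinds: 0, payloads: 0 };
    packed.set(0, Kind::SelfVal, 0);                    // slot 0 is `self`
    packed.set(3, Kind::Local, 5);                      // slot 3 remembers "local 5"
    assert_eq!(packed.get(3), (Kind::Local as u8, 5));  // round-trips
    assert_eq!(packed.get(1), (Kind::Stack as u8, 0));  // untouched slots stay zeroed
}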
*/ - pub fn get_opnd_mapping(&self, opnd: InsnOpnd) -> (TempMapping, Type) { + pub fn get_opnd_mapping(&self, opnd: YARVOpnd) -> TempMapping { let opnd_type = self.get_opnd_type(opnd); match opnd { - SelfOpnd => (MapToSelf, opnd_type), + SelfOpnd => TempMapping::map_to_self(), StackOpnd(idx) => { - let idx = idx as u16; assert!(idx < self.stack_size); let stack_idx = (self.stack_size - 1 - idx) as usize; if stack_idx < MAX_TEMP_TYPES { - (self.temp_mapping[stack_idx], opnd_type) + self.get_temp_mapping(stack_idx) } else { // We can't know the source of this stack operand, so we assume it is // a stack-only temporary. type will be UNKNOWN assert!(opnd_type == Type::Unknown); - (MapToStack, opnd_type) + TempMapping::map_to_stack(opnd_type) } } } } /// Overwrite both the type and mapping of a stack operand. - pub fn set_opnd_mapping(&mut self, opnd: InsnOpnd, (mapping, opnd_type): (TempMapping, Type)) { + pub fn set_opnd_mapping(&mut self, opnd: YARVOpnd, mapping: TempMapping) { match opnd { SelfOpnd => unreachable!("self always maps to self"), StackOpnd(idx) => { @@ -1170,44 +2006,47 @@ impl Context { return; } - self.temp_mapping[stack_idx] = mapping; - - // Only used when mapping == MAP_STACK - self.temp_types[stack_idx] = opnd_type; + self.set_temp_mapping(stack_idx, mapping); } } } /// Set the type of a local variable pub fn set_local_type(&mut self, local_idx: usize, local_type: Type) { - let ctx = self; - // If type propagation is disabled, store no types if get_option!(no_type_prop) { return; } if local_idx >= MAX_LOCAL_TYPES { - return; + return } // If any values on the stack map to this local we must detach them - for (i, mapping) in ctx.temp_mapping.iter_mut().enumerate() { - *mapping = match *mapping { - MapToStack => MapToStack, - MapToSelf => MapToSelf, - MapToLocal(idx) => { + for mapping_idx in 0..MAX_TEMP_TYPES { + let mapping = self.get_temp_mapping(mapping_idx); + let tm = match mapping.get_kind() { + MapToStack => mapping, + MapToSelf => mapping, + MapToLocal => { + let idx = mapping.get_local_idx(); if idx as usize == local_idx { - ctx.temp_types[i] = ctx.local_types[idx as usize]; - MapToStack + let local_type = self.get_local_type(local_idx); + TempMapping::map_to_stack(local_type) } else { - MapToLocal(idx) + TempMapping::map_to_local(idx) } } - } + }; + self.set_temp_mapping(mapping_idx, tm); } - ctx.local_types[local_idx] = local_type; + // Update the type bits + let type_bits = local_type as u32; + assert!(type_bits <= 0b1111); + let mask_bits = 0b1111_u32 << (4 * local_idx); + let shifted_bits = type_bits << (4 * local_idx); + self.local_types = (self.local_types & !mask_bits) | shifted_bits; } /// Erase local variable type information @@ -1215,99 +2054,238 @@ impl Context { pub fn clear_local_types(&mut self) { // When clearing local types we must detach any stack mappings to those // locals. Even if local values may have changed, stack values will not. 
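The detach rule that set_local_type applies can be shown with a tiny standalone model (made-up names; the real code stores these in the packed fields above):

// Made-up mini model of the detach-on-write rule.
#[derive(Clone, Copy)]
enum Ty { Fixnum, TString }

#[derive(Clone, Copy)]
enum Slot { Stack(Ty), Local(usize) }

fn set_local(locals: &mut [Ty], slots: &mut [Slot], idx: usize, ty: Ty) {
    // Any temp that says "same as local idx" must now carry the local's old type
    // itself, because the temp still holds the value from before the write.
    for slot in slots.iter_mut() {
        if let Slot::Local(i) = *slot {
            if i == idx {
                *slot = Slot::Stack(locals[idx]);
            }
        }
    }
    locals[idx] = ty;
}

fn main() {
    let mut locals = [Ty::Fixnum];
    let mut slots = [Slot::Local(0)]; // a temp pushed from local 0
    set_local(&mut locals, &mut slots, 0, Ty::TString);
    assert!(matches!(slots[0], Slot::Stack(Ty::Fixnum)));
}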
- for (i, mapping) in self.temp_mapping.iter_mut().enumerate() { - *mapping = match *mapping { - MapToStack => MapToStack, - MapToSelf => MapToSelf, - MapToLocal(idx) => { - self.temp_types[i] = self.local_types[idx as usize]; - MapToStack - } + + for mapping_idx in 0..MAX_TEMP_TYPES { + let mapping = self.get_temp_mapping(mapping_idx); + if mapping.get_kind() == MapToLocal { + let local_idx = mapping.get_local_idx() as usize; + self.set_temp_mapping(mapping_idx, TempMapping::map_to_stack(self.get_local_type(local_idx))); } } // Clear the local types - self.local_types = [Type::default(); MAX_LOCAL_TYPES]; + self.local_types = 0; + } + + /// Return true if the code is inlined by the caller + pub fn inline(&self) -> bool { + self.inline_block != 0 + } + + /// Set a block ISEQ given to the Block of this Context + pub fn set_inline_block(&mut self, iseq: IseqPtr) { + self.inline_block = iseq as u64 } /// Compute a difference score for two context objects - /// Returns 0 if the two contexts are the same - /// Returns > 0 if different but compatible - /// Returns usize::MAX if incompatible - pub fn diff(&self, dst: &Context) -> usize { + pub fn diff(&self, dst: &Context) -> TypeDiff { // Self is the source context (at the end of the predecessor) let src = self; // Can only lookup the first version in the chain - if dst.chain_depth != 0 { - return usize::MAX; + if dst.get_chain_depth() != 0 { + return TypeDiff::Incompatible; } // Blocks with depth > 0 always produce new versions // Sidechains cannot overlap - if src.chain_depth != 0 { - return usize::MAX; + if src.get_chain_depth() != 0 { + return TypeDiff::Incompatible; + } + + if src.is_return_landing() != dst.is_return_landing() { + return TypeDiff::Incompatible; + } + + if src.is_deferred() != dst.is_deferred() { + return TypeDiff::Incompatible; } if dst.stack_size != src.stack_size { - return usize::MAX; + return TypeDiff::Incompatible; } if dst.sp_offset != src.sp_offset { - return usize::MAX; + return TypeDiff::Incompatible; + } + + if dst.reg_temps != src.reg_temps { + return TypeDiff::Incompatible; } // Difference sum let mut diff = 0; // Check the type of self - let self_diff = src.self_type.diff(dst.self_type); + diff += match src.self_type.diff(dst.self_type) { + TypeDiff::Compatible(diff) => diff, + TypeDiff::Incompatible => return TypeDiff::Incompatible, + }; - if self_diff == usize::MAX { - return usize::MAX; + // Check the block to inline + if src.inline_block != dst.inline_block { + // find_block_version should not find existing blocks with different + // inline_block so that their yield will not be megamorphic. + return TypeDiff::Incompatible; } - diff += self_diff; - // For each local type we track - for i in 0..src.local_types.len() { - let t_src = src.local_types[i]; - let t_dst = dst.local_types[i]; - let temp_diff = t_src.diff(t_dst); - - if temp_diff == usize::MAX { - return usize::MAX; - } - - diff += temp_diff; + for i in 0.. 
MAX_LOCAL_TYPES { + let t_src = src.get_local_type(i); + let t_dst = dst.get_local_type(i); + diff += match t_src.diff(t_dst) { + TypeDiff::Compatible(diff) => diff, + TypeDiff::Incompatible => return TypeDiff::Incompatible, + }; } // For each value on the temp stack for i in 0..src.stack_size { - let (src_mapping, src_type) = src.get_opnd_mapping(StackOpnd(i)); - let (dst_mapping, dst_type) = dst.get_opnd_mapping(StackOpnd(i)); + let src_mapping = src.get_opnd_mapping(StackOpnd(i)); + let dst_mapping = dst.get_opnd_mapping(StackOpnd(i)); // If the two mappings aren't the same if src_mapping != dst_mapping { - if dst_mapping == MapToStack { + if dst_mapping.get_kind() == MapToStack { // We can safely drop information about the source of the temp // stack operand. diff += 1; } else { - return usize::MAX; + return TypeDiff::Incompatible; } } - let temp_diff = src_type.diff(dst_type); + let src_type = src.get_opnd_type(StackOpnd(i)); + let dst_type = dst.get_opnd_type(StackOpnd(i)); + + diff += match src_type.diff(dst_type) { + TypeDiff::Compatible(diff) => diff, + TypeDiff::Incompatible => return TypeDiff::Incompatible, + }; + } + + return TypeDiff::Compatible(diff); + } + + pub fn two_fixnums_on_stack(&self, jit: &mut JITState) -> Option<bool> { + if jit.at_current_insn() { + let comptime_recv = jit.peek_at_stack(self, 1); + let comptime_arg = jit.peek_at_stack(self, 0); + return Some(comptime_recv.fixnum_p() && comptime_arg.fixnum_p()); + } - if temp_diff == usize::MAX { - return usize::MAX; + let recv_type = self.get_opnd_type(StackOpnd(1)); + let arg_type = self.get_opnd_type(StackOpnd(0)); + match (recv_type, arg_type) { + (Type::Fixnum, Type::Fixnum) => Some(true), + (Type::Unknown | Type::UnknownImm, Type::Unknown | Type::UnknownImm) => None, + _ => Some(false), + } + } +} + +impl Assembler { + /// Push one new value on the temp stack with an explicit mapping + /// Return a pointer to the new stack top + pub fn stack_push_mapping(&mut self, mapping: TempMapping) -> Opnd { + // If type propagation is disabled, store no types + if get_option!(no_type_prop) { + return self.stack_push_mapping(mapping.without_type()); + } + + let stack_size: usize = self.ctx.stack_size.into(); + + // Keep track of the type and mapping of the value + if stack_size < MAX_TEMP_TYPES { + self.ctx.set_temp_mapping(stack_size, mapping); + + if mapping.get_kind() == MapToLocal { + let idx = mapping.get_local_idx(); + assert!((idx as usize) < MAX_LOCAL_TYPES); } + } - diff += temp_diff; + // Allocate a register to the stack operand + if self.ctx.stack_size < MAX_REG_TEMPS { + self.alloc_temp_reg(self.ctx.stack_size); } - return diff; + self.ctx.stack_size += 1; + self.ctx.sp_offset += 1; + + return self.stack_opnd(0); + } + + /// Push one new value on the temp stack + /// Return a pointer to the new stack top + pub fn stack_push(&mut self, val_type: Type) -> Opnd { + return self.stack_push_mapping(TempMapping::map_to_stack(val_type)); + } + + /// Push the self value on the stack + pub fn stack_push_self(&mut self) -> Opnd { + return self.stack_push_mapping(TempMapping::map_to_self()); + } + + /// Push a local variable on the stack + pub fn stack_push_local(&mut self, local_idx: usize) -> Opnd { + if local_idx >= MAX_LOCAL_TYPES { + return self.stack_push(Type::Unknown); + } + + return self.stack_push_mapping(TempMapping::map_to_local(local_idx as u8)); + } + + // Pop N values off the stack + // Return a pointer to the stack top before the pop operation + pub fn stack_pop(&mut self, n: usize) -> Opnd { + assert!(n 
<= self.ctx.stack_size.into()); + + let top = self.stack_opnd(0); + + // Clear the types of the popped values + for i in 0..n { + let idx: usize = (self.ctx.stack_size as usize) - i - 1; + + if idx < MAX_TEMP_TYPES { + self.ctx.set_temp_mapping(idx, TempMapping::map_to_stack(Type::Unknown)); + } + } + + self.ctx.stack_size -= n as u8; + self.ctx.sp_offset -= n as i8; + + return top; + } + + /// Shift stack temps to remove a Symbol for #send. + pub fn shift_stack(&mut self, argc: usize) { + assert!(argc < self.ctx.stack_size.into()); + + let method_name_index = (self.ctx.stack_size as usize) - argc - 1; + + for i in method_name_index..(self.ctx.stack_size - 1) as usize { + if i < MAX_TEMP_TYPES { + let next_arg_mapping = if i + 1 < MAX_TEMP_TYPES { + self.ctx.get_temp_mapping(i + 1) + } else { + TempMapping::map_to_stack(Type::Unknown) + }; + self.ctx.set_temp_mapping(i, next_arg_mapping); + } + } + self.stack_pop(1); + } + + /// Get an operand pointing to a slot on the temp stack + pub fn stack_opnd(&self, idx: i32) -> Opnd { + Opnd::Stack { + idx, + num_bits: 64, + stack_size: self.ctx.stack_size, + sp_offset: self.ctx.sp_offset, + reg_temps: None, // push_insn will set this + } } } @@ -1316,7 +2294,7 @@ impl BlockId { #[cfg(debug_assertions)] #[allow(dead_code)] pub fn dump_src_loc(&self) { - unsafe { rb_yjit_dump_iseq_loc(self.iseq, self.idx) } + unsafe { rb_yjit_dump_iseq_loc(self.iseq, self.idx as u32) } } } @@ -1351,52 +2329,54 @@ fn gen_block_series_body( // Generate code for the first block let first_block = gen_single_block(blockid, start_ctx, ec, cb, ocb).ok()?; - batch.push(first_block.clone()); // Keep track of this block version + batch.push(first_block); // Keep track of this block version // Add the block version to the VersionMap for this ISEQ - add_block_version(&first_block, cb); + unsafe { add_block_version(first_block, cb) }; // Loop variable - let mut last_blockref = first_block.clone(); + let mut last_blockref = first_block; loop { // Get the last outgoing branch from the previous block. let last_branchref = { - let last_block = last_blockref.borrow(); + let last_block = unsafe { last_blockref.as_ref() }; match last_block.outgoing.last() { - Some(branch) => branch.clone(), + Some(branch) => *branch, None => { break; } // If last block has no branches, stop. } }; - let mut last_branch = last_branchref.borrow_mut(); + let last_branch = unsafe { last_branchref.as_ref() }; + + incr_counter!(block_next_count); // gen_direct_jump() can request a block to be placed immediately after by - // leaving `None`s in the `dst_addrs` array. - match &last_branch.dst_addrs { - [None, None] => (), - _ => { - break; - } // If there is no next block to compile, stop + // leaving a single target that has a `None` address. + // SAFETY: no mutation inside the unsafe block + let (requested_blockid, requested_ctx) = unsafe { + match (last_branch.targets[0].ref_unchecked(), last_branch.targets[1].ref_unchecked()) { + (Some(last_target), None) if last_target.get_address().is_none() => { + (last_target.get_blockid(), last_target.get_ctx()) + } + _ => { + // We're done when no fallthrough block is requested + break; + } + } }; - // Get id and context for the new block - let requested_id = last_branch.targets[0].expect("block id must be filled"); - let requested_ctx = &last_branch.target_ctxs[0]; - // Generate new block using context from the last branch. 
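A rough worked model of the shift_stack helper above, treating the temp stack as a plain Vec rather than the real mappings and registers:

fn main() {
    // Model the temp stack as a Vec; the real code works on mappings/registers instead.
    let mut stack = vec!["recv", ":sym", "arg0", "arg1"];
    let argc = 2;
    let name_idx = stack.len() - argc - 1;  // stack_size - argc - 1
    stack.remove(name_idx);                 // everything above shifts down; then stack_pop(1)
    assert_eq!(stack, ["recv", "arg0", "arg1"]);
}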
- let result = gen_single_block(requested_id, requested_ctx, ec, cb, ocb); + let result = gen_single_block(requested_blockid, &requested_ctx, ec, cb, ocb); // If the block failed to compile if result.is_err() { // Remove previously compiled block // versions from the version map - for blockref in &batch { - // FIXME: should be deallocating resources here too - // e.g. invariants, etc. - //free_block(blockref) - - remove_block_version(blockref); + for blockref in batch { + remove_block_version(&blockref); + // SAFETY: block was well connected because it was in a version_map + unsafe { free_block(blockref, false) }; } // Stop compiling @@ -1406,37 +2386,50 @@ fn gen_block_series_body( let new_blockref = result.unwrap(); // Add the block version to the VersionMap for this ISEQ - add_block_version(&new_blockref, cb); + unsafe { add_block_version(new_blockref, cb) }; // Connect the last branch and the new block - last_branch.blocks[0] = Some(new_blockref.clone()); - last_branch.dst_addrs[0] = new_blockref.borrow().start_addr; - new_blockref - .borrow_mut() - .incoming - .push(last_branchref.clone()); - - // This block should immediately follow the last branch - assert!(new_blockref.borrow().start_addr == last_branch.end_addr); + last_branch.targets[0].set(Some(Box::new(BranchTarget::Block(new_blockref)))); + unsafe { new_blockref.as_ref().incoming.push(last_branchref) }; // Track the block - batch.push(new_blockref.clone()); + batch.push(new_blockref); // Repeat with newest block last_blockref = new_blockref; } + #[cfg(feature = "disasm")] + { + // If dump_iseq_disasm is active, see if this iseq's location matches the given substring. + // If so, we print the new blocks to the console. + if let Some(substr) = get_option_ref!(dump_iseq_disasm).as_ref() { + let iseq_location = iseq_get_location(blockid.iseq, blockid.idx); + if iseq_location.contains(substr) { + let last_block = unsafe { last_blockref.as_ref() }; + let iseq_range = &last_block.iseq_range; + println!("Compiling {} block(s) for {}, ISEQ offsets [{}, {})", batch.len(), iseq_location, iseq_range.start, iseq_range.end); + print!("{}", disasm_iseq_insn_range(blockid.iseq, iseq_range.start, iseq_range.end)); + } + } + } + Some(first_block) } /// Generate a block version that is an entry point inserted into an iseq /// NOTE: this function assumes that the VM lock has been taken -pub fn gen_entry_point(iseq: IseqPtr, ec: EcPtr) -> Option<CodePtr> { +/// If jit_exception is true, compile JIT code for handling exceptions. +/// See [jit_compile_exception] for details. +pub fn gen_entry_point(iseq: IseqPtr, ec: EcPtr, jit_exception: bool) -> Option<*const u8> { // Compute the current instruction index based on the current PC - let insn_idx: u32 = unsafe { - let pc_zero = rb_iseq_pc_at_idx(iseq, 0); - let ec_pc = get_cfp_pc(get_ec_cfp(ec)); - ec_pc.offset_from(pc_zero).try_into().ok()? + let cfp = unsafe { get_ec_cfp(ec) }; + let insn_idx: u16 = unsafe { + let ec_pc = get_cfp_pc(cfp); + iseq_pc_to_insn_idx(iseq, ec_pc)? + }; + let stack_size: u8 = unsafe { + u8::try_from(get_cfp_sp(cfp).offset_from(get_cfp_bp(cfp))).ok()? }; // The entry context makes no assumptions about types @@ -1450,61 +2443,226 @@ pub fn gen_entry_point(iseq: IseqPtr, ec: EcPtr) -> Option<CodePtr> { let ocb = CodegenGlobals::get_outlined_cb(); // Write the interpreter entry prologue. Might be NULL when out of memory. 
- let code_ptr = gen_entry_prologue(cb, iseq, insn_idx); + let code_ptr = gen_entry_prologue(cb, ocb, iseq, insn_idx, jit_exception); // Try to generate code for the entry block - let block = gen_block_series(blockid, &Context::default(), ec, cb, ocb); + let mut ctx = Context::default(); + ctx.stack_size = stack_size; + let block = gen_block_series(blockid, &ctx, ec, cb, ocb); cb.mark_all_executable(); ocb.unwrap().mark_all_executable(); match block { // Compilation failed - None => return None, + None => { + // Trigger code GC. This entry point will be recompiled later. + if get_option!(code_gc) { + cb.code_gc(ocb); + } + return None; + } // If the block contains no Ruby instructions Some(block) => { - let block = block.borrow(); - if block.end_idx == insn_idx { + let block = unsafe { block.as_ref() }; + if block.iseq_range.is_empty() { return None; } } } + // Count the number of entry points we compile + incr_counter!(compiled_iseq_entry); + // Compilation successful and block not empty - return code_ptr; + code_ptr.map(|ptr| ptr.raw_ptr(cb)) } -/// Generate code for a branch, possibly rewriting and changing the size of it -fn regenerate_branch(cb: &mut CodeBlock, branch: &mut Branch) { - // FIXME - /* - if (branch->start_addr < cb_get_ptr(cb, yjit_codepage_frozen_bytes)) { - // Generating this branch would modify frozen bytes. Do nothing. - return; - } - */ +// Change the entry's jump target from an entry stub to a next entry +pub fn regenerate_entry(cb: &mut CodeBlock, entryref: &EntryRef, next_entry: CodePtr) { + let mut asm = Assembler::new(); + asm_comment!(asm, "regenerate_entry"); + // gen_entry_guard generates cmp + jne. We're rewriting only jne. + asm.jne(next_entry.into()); + + // Move write_pos to rewrite the entry let old_write_pos = cb.get_write_pos(); + let old_dropped_bytes = cb.has_dropped_bytes(); + cb.set_write_ptr(unsafe { entryref.as_ref() }.start_addr); + cb.set_dropped_bytes(false); + asm.compile(cb, None).expect("can rewrite existing code"); + + // Rewind write_pos to the original one + assert_eq!(cb.get_write_ptr(), unsafe { entryref.as_ref() }.end_addr); + cb.set_pos(old_write_pos); + cb.set_dropped_bytes(old_dropped_bytes); +} - let mut block = branch.block.borrow_mut(); - let branch_terminates_block = branch.end_addr == block.end_addr; +pub type PendingEntryRef = Rc<PendingEntry>; - // Rewrite the branch - assert!(branch.dst_addrs[0].is_some()); - cb.set_write_ptr(branch.start_addr.unwrap()); - (branch.gen_fn)( - cb, - branch.dst_addrs[0].unwrap(), - branch.dst_addrs[1], - branch.shape, +/// Create a new entry reference for an ISEQ +pub fn new_pending_entry() -> PendingEntryRef { + let entry = PendingEntry { + uninit_entry: Box::new(MaybeUninit::uninit()), + start_addr: Cell::new(None), + end_addr: Cell::new(None), + }; + return Rc::new(entry); +} + +c_callable! { + /// Generated code calls this function with the SysV calling convention. + /// See [gen_call_entry_stub_hit]. + fn entry_stub_hit(entry_ptr: *const c_void, ec: EcPtr) -> *const u8 { + with_compile_time(|| { + with_vm_lock(src_loc!(), || { + let cb = CodegenGlobals::get_inline_cb(); + let ocb = CodegenGlobals::get_outlined_cb(); + + let addr = entry_stub_hit_body(entry_ptr, ec, cb, ocb) + .unwrap_or_else(|| { + // Trigger code GC (e.g. no space). + // This entry point will be recompiled later. 
+ if get_option!(code_gc) { + cb.code_gc(ocb); + } + CodegenGlobals::get_stub_exit_code().raw_ptr(cb) + }); + + cb.mark_all_executable(); + ocb.unwrap().mark_all_executable(); + + addr + }) + }) + } +} + +/// Called by the generated code when an entry stub is executed +fn entry_stub_hit_body( + entry_ptr: *const c_void, + ec: EcPtr, + cb: &mut CodeBlock, + ocb: &mut OutlinedCb +) -> Option<*const u8> { + // Get ISEQ and insn_idx from the current ec->cfp + let cfp = unsafe { get_ec_cfp(ec) }; + let iseq = unsafe { get_cfp_iseq(cfp) }; + let insn_idx = iseq_pc_to_insn_idx(iseq, unsafe { get_cfp_pc(cfp) })?; + let stack_size: u8 = unsafe { + u8::try_from(get_cfp_sp(cfp).offset_from(get_cfp_bp(cfp))).ok()? + }; + + // Compile a new entry guard as a next entry + let next_entry = cb.get_write_ptr(); + let mut asm = Assembler::new(); + let pending_entry = gen_entry_chain_guard(&mut asm, ocb, iseq, insn_idx)?; + asm.compile(cb, Some(ocb))?; + + // Find or compile a block version + let blockid = BlockId { iseq, idx: insn_idx }; + let mut ctx = Context::default(); + ctx.stack_size = stack_size; + let blockref = match find_block_version(blockid, &ctx) { + // If an existing block is found, generate a jump to the block. + Some(blockref) => { + let mut asm = Assembler::new(); + asm.jmp(unsafe { blockref.as_ref() }.start_addr.into()); + asm.compile(cb, Some(ocb))?; + Some(blockref) + } + // If this block hasn't yet been compiled, generate blocks after the entry guard. + None => gen_block_series(blockid, &ctx, ec, cb, ocb), + }; + + // Commit or retry the entry + if blockref.is_some() { + // Regenerate the previous entry + let entryref = NonNull::<Entry>::new(entry_ptr as *mut Entry).expect("Entry should not be null"); + regenerate_entry(cb, &entryref, next_entry); + + // Write an entry to the heap and push it to the ISEQ + let pending_entry = Rc::try_unwrap(pending_entry).ok().expect("PendingEntry should be unique"); + get_or_create_iseq_payload(iseq).entries.push(pending_entry.into_entry()); + } + + // Let the stub jump to the block + blockref.map(|block| unsafe { block.as_ref() }.start_addr.raw_ptr(cb)) +} + +/// Generate a stub that calls entry_stub_hit +pub fn gen_entry_stub(entry_address: usize, ocb: &mut OutlinedCb) -> Option<CodePtr> { + let ocb = ocb.unwrap(); + + let mut asm = Assembler::new(); + asm_comment!(asm, "entry stub hit"); + + asm.mov(C_ARG_OPNDS[0], entry_address.into()); + + // Jump to trampoline to call entry_stub_hit() + // Not really a side exit, just don't need a padded jump here. + asm.jmp(CodegenGlobals::get_entry_stub_hit_trampoline().as_side_exit()); + + asm.compile(ocb, None).map(|(code_ptr, _)| code_ptr) +} + +/// A trampoline used by gen_entry_stub. entry_stub_hit may issue Code GC, so +/// it's useful for Code GC to call entry_stub_hit from a globally shared code. +pub fn gen_entry_stub_hit_trampoline(ocb: &mut OutlinedCb) -> Option<CodePtr> { + let ocb = ocb.unwrap(); + let mut asm = Assembler::new(); + + // See gen_entry_guard for how it's used. 
+ asm_comment!(asm, "entry_stub_hit() trampoline"); + let jump_addr = asm.ccall(entry_stub_hit as *mut u8, vec![C_ARG_OPNDS[0], EC]); + + // Jump to the address returned by the entry_stub_hit() call + asm.jmp_opnd(jump_addr); + + asm.compile(ocb, None).map(|(code_ptr, _)| code_ptr) +} + +/// Generate code for a branch, possibly rewriting and changing the size of it +fn regenerate_branch(cb: &mut CodeBlock, branch: &Branch) { + // Remove old comments + cb.remove_comments(branch.start_addr, branch.end_addr.get()); + + // SAFETY: having a &Branch implies branch.block is initialized. + let block = unsafe { branch.block.as_ref() }; + + let branch_terminates_block = branch.end_addr.get() == block.get_end_addr(); + + // Generate the branch + let mut asm = Assembler::new(); + asm_comment!(asm, "regenerate_branch"); + branch.gen_fn.call( + &mut asm, + Target::CodePtr(branch.get_target_address(0).unwrap()), + branch.get_target_address(1).map(|addr| Target::CodePtr(addr)), ); - branch.end_addr = Some(cb.get_write_ptr()); + + // If the entire block is the branch and the block could be invalidated, + // we need to pad to ensure there is room for invalidation patching. + if branch.start_addr == block.start_addr && branch_terminates_block && block.entry_exit.is_some() { + asm.pad_inval_patch(); + } + + // Rewrite the branch + let old_write_pos = cb.get_write_pos(); + let old_dropped_bytes = cb.has_dropped_bytes(); + cb.set_write_ptr(branch.start_addr); + cb.set_dropped_bytes(false); + asm.compile(cb, None).expect("can rewrite existing code"); + let new_end_addr = cb.get_write_ptr(); + + branch.end_addr.set(new_end_addr); // The block may have shrunk after the branch is rewritten if branch_terminates_block { // Adjust block size - block.end_addr = branch.end_addr; + block.end_addr.set(new_end_addr); } // cb.write_pos is both a write cursor and a marker for the end of @@ -1517,79 +2675,66 @@ fn regenerate_branch(cb: &mut CodeBlock, branch: &mut Branch) { if old_write_pos > cb.get_write_pos() { // We rewound cb->write_pos to generate the branch, now restore it. cb.set_pos(old_write_pos); + cb.set_dropped_bytes(old_dropped_bytes); } else { // The branch sits at the end of cb and consumed some memory. // Keep cb.write_pos. } -} - -/// Create a new outgoing branch entry for a block -fn make_branch_entry(block: BlockRef, src_ctx: &Context, gen_fn: BranchGenFn) -> BranchRef { - let branch = Branch { - // Block this is attached to - block: block.clone(), - // Positions where the generated code starts and ends - start_addr: None, - end_addr: None, - - // Context right after the branch instruction - src_ctx: *src_ctx, - - // Branch target blocks and their contexts - targets: [None, None], - target_ctxs: [Context::default(), Context::default()], - blocks: [None, None], + branch.assert_layout(); +} - // Jump target addresses - dst_addrs: [None, None], +pub type PendingBranchRef = Rc<PendingBranch>; - // Branch code generation function - gen_fn: gen_fn, +/// Create a new outgoing branch entry for a block +fn new_pending_branch(jit: &mut JITState, gen_fn: BranchGenFn) -> PendingBranchRef { + let branch = Rc::new(PendingBranch { + uninit_branch: Box::new(MaybeUninit::uninit()), + gen_fn, + start_addr: Cell::new(None), + end_addr: Cell::new(None), + targets: [Cell::new(None), Cell::new(None)], + }); - // Shape of the branch - shape: BranchShape::Default, - }; + incr_counter!(compiled_branch_count); // TODO not true. 
count at finalize time // Add to the list of outgoing branches for the block - let branchref = Rc::new(RefCell::new(branch)); - block.borrow_mut().outgoing.push(branchref.clone()); + jit.queue_outgoing_branch(branch.clone()); - return branchref; + branch } -/// Generated code calls this function with the SysV calling convention. -/// See [get_branch_target]. -extern "sysv64" fn branch_stub_hit( - branch_ptr: *const c_void, - target_idx: u32, - ec: EcPtr, -) -> *const u8 { - with_vm_lock(src_loc!(), || { - branch_stub_hit_body(branch_ptr, target_idx, ec) - }) +c_callable! { + /// Generated code calls this function with the SysV calling convention. + /// See [gen_branch_stub]. + fn branch_stub_hit( + branch_ptr: *const c_void, + target_idx: u32, + ec: EcPtr, + ) -> *const u8 { + with_vm_lock(src_loc!(), || { + with_compile_time(|| { branch_stub_hit_body(branch_ptr, target_idx, ec) }) + }) + } } /// Called by the generated code when a branch stub is executed /// Triggers compilation of branches and code patching fn branch_stub_hit_body(branch_ptr: *const c_void, target_idx: u32, ec: EcPtr) -> *const u8 { - assert!(!branch_ptr.is_null()); - - //branch_ptr is actually: - //branch_ptr: *const RefCell<Branch> - let branch_rc = unsafe { BranchRef::from_raw(branch_ptr as *const RefCell<Branch>) }; + if get_option!(dump_insns) { + println!("branch_stub_hit"); + } - // We increment the strong count because we want to keep the reference owned - // by the branch stub alive. Return branch stubs can be hit multiple times. - unsafe { Rc::increment_strong_count(branch_ptr) }; + let branch_ref = NonNull::<Branch>::new(branch_ptr as *mut Branch) + .expect("Branches should not be null"); - let mut branch = branch_rc.borrow_mut(); + // SAFETY: We have the VM lock, and the branch is initialized by the time generated + // code calls this function. + let branch = unsafe { branch_ref.as_ref() }; let branch_size_on_entry = branch.code_size(); + let housing_block = unsafe { branch.block.as_ref() }; let target_idx: usize = target_idx.as_usize(); - let target = branch.targets[target_idx].unwrap(); - let target_ctx = branch.target_ctxs[target_idx]; - let target_branch_shape = match target_idx { 0 => BranchShape::Next0, 1 => BranchShape::Next1, @@ -1599,18 +2744,31 @@ fn branch_stub_hit_body(branch_ptr: *const c_void, target_idx: u32, ec: EcPtr) - let cb = CodegenGlobals::get_inline_cb(); let ocb = CodegenGlobals::get_outlined_cb(); - // If this branch has already been patched, return the dst address - // Note: ractors can cause the same stub to be hit multiple times - if let Some(_) = branch.blocks[target_idx] { - return branch.dst_addrs[target_idx].unwrap().raw_ptr(); - } + let (target_blockid, target_ctx): (BlockId, Context) = unsafe { + // SAFETY: no mutation of the target's Cell. Just reading out data. 
+ let target = branch.targets[target_idx].ref_unchecked().as_ref().unwrap(); + + // If this branch has already been patched, return the dst address + // Note: recursion can cause the same stub to be hit multiple times + if let BranchTarget::Block(_) = target.as_ref() { + return target.get_address().unwrap().raw_ptr(cb); + } + + (target.get_blockid(), target.get_ctx()) + }; let (cfp, original_interp_sp) = unsafe { let cfp = get_ec_cfp(ec); let original_interp_sp = get_cfp_sp(cfp); - let reconned_pc = rb_iseq_pc_at_idx(rb_cfp_get_iseq(cfp), target.idx); + let running_iseq = get_cfp_iseq(cfp); + assert_eq!(running_iseq, target_blockid.iseq as _, "each stub expects a particular iseq"); + + let reconned_pc = rb_iseq_pc_at_idx(running_iseq, target_blockid.idx.into()); let reconned_sp = original_interp_sp.offset(target_ctx.sp_offset.into()); + // Unlike in the interpreter, our `leave` doesn't write to the caller's + // SP -- we do it in the returned-to code. Account for this difference. + let reconned_sp = reconned_sp.add(target_ctx.is_return_landing().into()); // Update the PC in the current CFP, because it may be out of sync in JITted code rb_set_cfp_pc(cfp, reconned_pc); @@ -1623,76 +2781,88 @@ fn branch_stub_hit_body(branch_ptr: *const c_void, target_idx: u32, ec: EcPtr) - // So we do it here instead. rb_set_cfp_sp(cfp, reconned_sp); + // Bail if code GC is disabled and we've already run out of spaces. + if !get_option!(code_gc) && (cb.has_dropped_bytes() || ocb.unwrap().has_dropped_bytes()) { + return CodegenGlobals::get_stub_exit_code().raw_ptr(cb); + } + + // Bail if we're about to run out of native stack space. + // We've just reconstructed interpreter state. + if rb_ec_stack_check(ec as _) != 0 { + return CodegenGlobals::get_stub_exit_code().raw_ptr(cb); + } + (cfp, original_interp_sp) }; // Try to find an existing compiled version of this block - let mut block = find_block_version(target, &target_ctx); - + let mut block = find_block_version(target_blockid, &target_ctx); + let mut branch_modified = false; // If this block hasn't yet been compiled if block.is_none() { - let branch_old_shape = branch.shape; - let mut branch_modified = false; + let branch_old_shape = branch.gen_fn.get_shape(); // If the new block can be generated right after the branch (at cb->write_pos) - if Some(cb.get_write_ptr()) == branch.end_addr { + if cb.get_write_ptr() == branch.end_addr.get() { // This branch should be terminating its block - assert!(branch.end_addr == branch.block.borrow().end_addr); + assert!(branch.end_addr == housing_block.end_addr); // Change the branch shape to indicate the target block will be placed next - branch.shape = target_branch_shape; + branch.gen_fn.set_shape(target_branch_shape); // Rewrite the branch with the new, potentially more compact shape - regenerate_branch(cb, &mut branch); + regenerate_branch(cb, branch); branch_modified = true; // Ensure that the branch terminates the codeblock just like // before entering this if block. This drops bytes off the end // in case we shrank the branch when regenerating. 
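A schematic model of what the three branch shapes mean for layout; the mnemonics and the emit helper are made up, not the real backend:

// Illustrative pseudo-mnemonics; not the real backend.
enum Shape { Next0, Next1, Default }

fn emit(shape: Shape, target0: &str, target1: Option<&str>) -> Vec<String> {
    let mut insns = Vec::new();
    match shape {
        // target0 is laid out right after the branch: only the conditional jump remains
        Shape::Next0 => {
            if let Some(t1) = target1 {
                insns.push(format!("jcc {t1}"));
            }
        }
        // target1 is laid out right after the branch: invert the condition, jump to target0
        Shape::Next1 => insns.push(format!("jncc {target0}")),
        // neither target falls through: emit both jumps
        Shape::Default => {
            if let Some(t1) = target1 {
                insns.push(format!("jcc {t1}"));
            }
            insns.push(format!("jmp {target0}"));
        }
    }
    insns
}

fn main() {
    assert_eq!(emit(Shape::Next0, "block0", Some("block1")), vec!["jcc block1".to_string()]);
    assert_eq!(emit(Shape::Default, "block0", None), vec!["jmp block0".to_string()]);
}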
- cb.set_write_ptr(branch.end_addr.unwrap()); + cb.set_write_ptr(branch.end_addr.get()); } // Compile the new block version - drop(branch); // Stop mutable RefCell borrow since GC might borrow branch for marking - block = gen_block_series(target, &target_ctx, ec, cb, ocb); - branch = branch_rc.borrow_mut(); + block = gen_block_series(target_blockid, &target_ctx, ec, cb, ocb); if block.is_none() && branch_modified { // We couldn't generate a new block for the branch, but we modified the branch. // Restore the branch by regenerating it. - branch.shape = branch_old_shape; - regenerate_branch(cb, &mut branch); + branch.gen_fn.set_shape(branch_old_shape); + regenerate_branch(cb, branch); } } // Finish building the new block let dst_addr = match block { - Some(block_rc) => { - let mut block: RefMut<_> = block_rc.borrow_mut(); + Some(new_block) => { + let new_block = unsafe { new_block.as_ref() }; // Branch shape should reflect layout - assert!(!(branch.shape == target_branch_shape && block.start_addr != branch.end_addr)); + assert!(!(branch.gen_fn.get_shape() == target_branch_shape && new_block.start_addr != branch.end_addr.get())); // Add this branch to the list of incoming branches for the target - block.incoming.push(branch_rc.clone()); + new_block.push_incoming(branch_ref); // Update the branch target address - let dst_addr = block.start_addr; - branch.dst_addrs[target_idx] = dst_addr; - - // Mark this branch target as patched (no longer a stub) - branch.blocks[target_idx] = Some(block_rc.clone()); + branch.targets[target_idx].set(Some(Box::new(BranchTarget::Block(new_block.into())))); // Rewrite the branch with the new jump target address - mem::drop(block); // end mut borrow - regenerate_branch(cb, &mut branch); + regenerate_branch(cb, branch); // Restore interpreter sp, since the code hitting the stub expects the original. unsafe { rb_set_cfp_sp(cfp, original_interp_sp) }; - block_rc.borrow().start_addr.unwrap() + new_block.start_addr } None => { + // Trigger code GC. The whole ISEQ will be recompiled later. + // We shouldn't trigger it in the middle of compilation in branch_stub_hit + // because incomplete code could be used when cb.dropped_bytes is flipped + // by code GC. So this place, after all compilation, is the safest place + // to hook code GC on branch_stub_hit. + if get_option!(code_gc) { + cb.code_gc(ocb); + } + // Failed to service the stub by generating a new block so now we // need to exit to the interpreter at the stubbed location. We are // intentionally *not* restoring original_interp_sp. At the time of @@ -1710,67 +2880,166 @@ fn branch_stub_hit_body(branch_ptr: *const c_void, target_idx: u32, ec: EcPtr) - let new_branch_size = branch.code_size(); assert!( new_branch_size <= branch_size_on_entry, - "branch stubs should never enlarge branches" + "branch stubs should never enlarge branches (start_addr: {:?}, old_size: {}, new_size: {})", + branch.start_addr.raw_ptr(cb), branch_size_on_entry, new_branch_size, ); // Return a pointer to the compiled block version - dst_addr.raw_ptr() + dst_addr.raw_ptr(cb) } -/// Get a block version or stub corresponding to a branch target -fn get_branch_target( - target: BlockId, +/// Generate a "stub", a piece of code that calls the compiler back when run. +/// A piece of code that redeems for more code; a thunk for code. 
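As a high-level analogy in plain Rust (no machine code or patching), a stub is a callable that produces its own implementation the first time it runs and reuses it afterwards:

// Plain-Rust analogy only; YJIT patches machine code instead.
struct Stub {
    compiled: Option<fn(i64) -> i64>,
    compile: fn() -> fn(i64) -> i64,
}

impl Stub {
    fn call(&mut self, arg: i64) -> i64 {
        if self.compiled.is_none() {
            // First run: redeem the stub for real code, then remember it.
            self.compiled = Some((self.compile)());
        }
        (self.compiled.unwrap())(arg)
    }
}

fn add_one(x: i64) -> i64 { x + 1 }
fn compile_add_one() -> fn(i64) -> i64 { add_one }

fn main() {
    let mut stub = Stub { compiled: None, compile: compile_add_one };
    assert_eq!(stub.call(41), 42); // triggers "compilation"
    assert_eq!(stub.call(1), 2);   // reuses the cached code
}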
+fn gen_branch_stub(
     ctx: &Context,
-    branchref: &BranchRef,
-    target_idx: u32,
     ocb: &mut OutlinedCb,
+    branch_struct_address: usize,
+    target_idx: u32,
 ) -> Option<CodePtr> {
-    let maybe_block = find_block_version(target, ctx);
+    let ocb = ocb.unwrap();
 
-    // If the block already exists
-    if let Some(blockref) = maybe_block {
-        let mut block = blockref.borrow_mut();
+    let mut asm = Assembler::new();
+    asm.ctx = *ctx;
+    asm.set_reg_temps(ctx.reg_temps);
+    asm_comment!(asm, "branch stub hit");
 
-        // Add an incoming branch into this block
-        block.incoming.push(branchref.clone());
-        let mut branch = branchref.borrow_mut();
-        branch.blocks[target_idx.as_usize()] = Some(blockref.clone());
+    if asm.ctx.is_return_landing() {
+        asm.mov(SP, Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP));
+        let top = asm.stack_push(Type::Unknown);
+        asm.mov(top, C_RET_OPND);
+    }
 
-        // Return a pointer to the compiled code for the block
-        return block.start_addr;
+    // Save caller-saved registers before C_ARG_OPNDS get clobbered.
+    // Spill all registers for consistency with the trampoline.
+    for &reg in caller_saved_temp_regs() {
+        asm.cpush(Opnd::Reg(reg));
     }
+    // Spill temps to the VM stack as well for jit.peek_at_stack()
+    asm.spill_temps();
+
+    // Set up the arguments unique to this stub for:
+    //
+    //    branch_stub_hit(branch_ptr, target_idx, ec)
+    //
+    // Bake pointer to Branch into output code.
+    // We make sure the block housing the branch is still alive when branch_stub_hit() is running.
+    asm.mov(C_ARG_OPNDS[0], branch_struct_address.into());
+    asm.mov(C_ARG_OPNDS[1], target_idx.into());
+
+    // Jump to trampoline to call branch_stub_hit()
+    // Not really a side exit, just don't need a padded jump here.
+    asm.jmp(CodegenGlobals::get_branch_stub_hit_trampoline().as_side_exit());
+
+    asm.compile(ocb, None).map(|(code_ptr, _)| code_ptr)
+}
+
+pub fn gen_branch_stub_hit_trampoline(ocb: &mut OutlinedCb) -> Option<CodePtr> {
     let ocb = ocb.unwrap();
+    let mut asm = Assembler::new();
+
+    // For `branch_stub_hit(branch_ptr, target_idx, ec)`,
+    // `branch_ptr` and `target_idx` are different for each stub,
+    // but the call and what's after is the same. This trampoline
+    // is the unchanging part.
+    // Since this trampoline is static, it allows code GC inside
+    // branch_stub_hit() to free stubs without problems.
+    asm_comment!(asm, "branch_stub_hit() trampoline");
+    let stub_hit_ret = asm.ccall(
+        branch_stub_hit as *mut u8,
+        vec![
+            C_ARG_OPNDS[0],
+            C_ARG_OPNDS[1],
+            EC,
+        ]
+    );
+    let jump_addr = asm.load(stub_hit_ret);
 
-    // Generate an outlined stub that will call branch_stub_hit()
-    let stub_addr = ocb.get_write_ptr();
+    // Restore caller-saved registers for stack temps
+    for &reg in caller_saved_temp_regs().rev() {
+        asm.cpop_into(Opnd::Reg(reg));
+    }
 
-    // Get a raw pointer to the branch while keeping the reference count alive
-    // Here clone increments the strong count by 1
-    // This means the branch stub owns its own reference to the branch
-    let branch_ptr: *const RefCell<Branch> = BranchRef::into_raw(branchref.clone());
+    // Jump to the address returned by the branch_stub_hit() call
+    asm.jmp_opnd(jump_addr);
 
-    // Call branch_stub_hit(branch_idx, target_idx, ec)
-    mov(ocb, C_ARG_REGS[2], REG_EC);
-    mov(ocb, C_ARG_REGS[1], uimm_opnd(target_idx as u64));
-    mov(ocb, C_ARG_REGS[0], const_ptr_opnd(branch_ptr as *const u8));
-    call_ptr(ocb, REG0, branch_stub_hit as *mut u8);
+    // HACK: popping into C_RET_REG clobbers the return value of branch_stub_hit() we need to jump
+    // to, so we need a scratch register to preserve it.
This extends the live range of the C + // return register so we get something else for the return value. + let _ = asm.live_reg_opnd(stub_hit_ret); - // Jump to the address returned by the - // branch_stub_hit call - jmp_rm(ocb, RAX); + asm.compile(ocb, None).map(|(code_ptr, _)| code_ptr) +} - if ocb.has_dropped_bytes() { - None // No space +/// Return registers to be pushed and popped on branch_stub_hit. +pub fn caller_saved_temp_regs() -> impl Iterator<Item = &'static Reg> + DoubleEndedIterator { + let temp_regs = Assembler::get_temp_regs().iter(); + let len = temp_regs.len(); + // The return value gen_leave() leaves in C_RET_REG + // needs to survive the branch_stub_hit() call. + let regs = temp_regs.chain(std::iter::once(&C_RET_REG)); + + // On x86_64, maintain 16-byte stack alignment + if cfg!(target_arch = "x86_64") && len % 2 == 0 { + static ONE_MORE: [Reg; 1] = [C_RET_REG]; + regs.chain(ONE_MORE.iter()) } else { - Some(stub_addr) + regs.chain(&[]) + } +} + +impl Assembler +{ + /// Mark the start position of a patchable entry point in the machine code + pub fn mark_entry_start(&mut self, entryref: &PendingEntryRef) { + // We need to create our own entry rc object + // so that we can move the closure below + let entryref = entryref.clone(); + + self.pos_marker(move |code_ptr, _| { + entryref.start_addr.set(Some(code_ptr)); + }); + } + + /// Mark the end position of a patchable entry point in the machine code + pub fn mark_entry_end(&mut self, entryref: &PendingEntryRef) { + // We need to create our own entry rc object + // so that we can move the closure below + let entryref = entryref.clone(); + + self.pos_marker(move |code_ptr, _| { + entryref.end_addr.set(Some(code_ptr)); + }); + } + + // Mark the start position of a patchable branch in the machine code + fn mark_branch_start(&mut self, branchref: &PendingBranchRef) + { + // We need to create our own branch rc object + // so that we can move the closure below + let branchref = branchref.clone(); + + self.pos_marker(move |code_ptr, _| { + branchref.start_addr.set(Some(code_ptr)); + }); + } + + // Mark the end position of a patchable branch in the machine code + fn mark_branch_end(&mut self, branchref: &PendingBranchRef) + { + // We need to create our own branch rc object + // so that we can move the closure below + let branchref = branchref.clone(); + + self.pos_marker(move |code_ptr, _| { + branchref.end_addr.set(Some(code_ptr)); + }); } } pub fn gen_branch( - jit: &JITState, - src_ctx: &Context, - cb: &mut CodeBlock, + jit: &mut JITState, + asm: &mut Assembler, ocb: &mut OutlinedCb, target0: BlockId, ctx0: &Context, @@ -1778,168 +3047,207 @@ pub fn gen_branch( ctx1: Option<&Context>, gen_fn: BranchGenFn, ) { - let branchref = make_branch_entry(jit.get_block(), src_ctx, gen_fn); + let branch = new_pending_branch(jit, gen_fn); // Get the branch targets or stubs - let dst_addr0 = get_branch_target(target0, ctx0, &branchref, 0, ocb); - let dst_addr1 = if let Some(ctx) = ctx1 { - get_branch_target(target1.unwrap(), ctx, &branchref, 1, ocb) - } else { - None - }; - - let mut branch = branchref.borrow_mut(); - - // Set the branch target adresses - branch.dst_addrs[0] = dst_addr0; - branch.dst_addrs[1] = dst_addr1; + let target0_addr = branch.set_target(0, target0, ctx0, ocb); + let target1_addr = if let Some(ctx) = ctx1 { + let addr = branch.set_target(1, target1.unwrap(), ctx, ocb); + if addr.is_none() { + // target1 requested but we're out of memory. 
+ // Avoid unwrap() in gen_fn() + return; + } - branch.targets[0] = Some(target0); - branch.targets[1] = target1; - branch.target_ctxs[0] = *ctx0; - branch.target_ctxs[1] = if let Some(&ctx) = ctx1 { - ctx - } else { - Context::default() - }; + addr + } else { None }; // Call the branch generation function - branch.start_addr = Some(cb.get_write_ptr()); - regenerate_branch(cb, &mut branch); -} - -fn gen_jump_branch( - cb: &mut CodeBlock, - target0: CodePtr, - _target1: Option<CodePtr>, - shape: BranchShape, -) { - if shape == BranchShape::Next1 { - panic!("Branch shape Next1 not allowed in gen_jump_branch!"); - } - - if shape == BranchShape::Default { - jmp_ptr(cb, target0); + asm.mark_branch_start(&branch); + if let Some(dst_addr) = target0_addr { + branch.gen_fn.call(asm, Target::CodePtr(dst_addr), target1_addr.map(|addr| Target::CodePtr(addr))); } + asm.mark_branch_end(&branch); } -pub fn gen_direct_jump(jit: &JITState, ctx: &Context, target0: BlockId, cb: &mut CodeBlock) { - let branchref = make_branch_entry(jit.get_block(), ctx, gen_jump_branch); - let mut branch = branchref.borrow_mut(); - - branch.targets[0] = Some(target0); - branch.target_ctxs[0] = *ctx; - +pub fn gen_direct_jump(jit: &mut JITState, ctx: &Context, target0: BlockId, asm: &mut Assembler) { + let branch = new_pending_branch(jit, BranchGenFn::JumpToTarget0(Cell::new(BranchShape::Default))); let maybe_block = find_block_version(target0, ctx); // If the block already exists - if let Some(blockref) = maybe_block { - let mut block = blockref.borrow_mut(); - - block.incoming.push(branchref.clone()); - - branch.dst_addrs[0] = block.start_addr; - branch.blocks[0] = Some(blockref.clone()); - branch.shape = BranchShape::Default; + let new_target = if let Some(blockref) = maybe_block { + let block = unsafe { blockref.as_ref() }; + let block_addr = block.start_addr; // Call the branch generation function - branch.start_addr = Some(cb.get_write_ptr()); - gen_jump_branch(cb, branch.dst_addrs[0].unwrap(), None, BranchShape::Default); - branch.end_addr = Some(cb.get_write_ptr()); + asm_comment!(asm, "gen_direct_jmp: existing block"); + asm.mark_branch_start(&branch); + branch.gen_fn.call(asm, Target::CodePtr(block_addr), None); + asm.mark_branch_end(&branch); + + BranchTarget::Block(blockref) } else { - // This None target address signals gen_block_series() to compile the - // target block right after this one (fallthrough). - branch.dst_addrs[0] = None; - branch.shape = BranchShape::Next0; - branch.start_addr = Some(cb.get_write_ptr()); - branch.end_addr = Some(cb.get_write_ptr()); - } + // The branch is effectively empty (a noop) + asm_comment!(asm, "gen_direct_jmp: fallthrough"); + asm.mark_branch_start(&branch); + asm.mark_branch_end(&branch); + branch.gen_fn.set_shape(BranchShape::Next0); + + // `None` in new_target.address signals gen_block_series() to + // compile the target block right after this one (fallthrough). 
+ BranchTarget::Stub(Box::new(BranchStub { + address: None, + ctx: *ctx, + iseq: Cell::new(target0.iseq), + iseq_idx: target0.idx, + })) + }; + + branch.targets[0].set(Some(Box::new(new_target))); } /// Create a stub to force the code up to this point to be executed pub fn defer_compilation( - jit: &JITState, - cur_ctx: &Context, - cb: &mut CodeBlock, + jit: &mut JITState, + asm: &mut Assembler, ocb: &mut OutlinedCb, ) { - if cur_ctx.chain_depth != 0 { + if asm.ctx.is_deferred() { panic!("Double defer!"); } - let mut next_ctx = *cur_ctx; + let mut next_ctx = asm.ctx; - if next_ctx.chain_depth == u8::MAX { - panic!("max block version chain depth reached!"); - } - next_ctx.chain_depth += 1; + next_ctx.mark_as_deferred(); - let block_rc = jit.get_block(); - let branch_rc = make_branch_entry(jit.get_block(), cur_ctx, gen_jump_branch); - let mut branch = branch_rc.borrow_mut(); - let block = block_rc.borrow(); + let branch = new_pending_branch(jit, BranchGenFn::JumpToTarget0(Cell::new(BranchShape::Default))); let blockid = BlockId { - iseq: block.blockid.iseq, + iseq: jit.get_iseq(), idx: jit.get_insn_idx(), }; - branch.target_ctxs[0] = next_ctx; - branch.targets[0] = Some(blockid); - branch.dst_addrs[0] = get_branch_target(blockid, &next_ctx, &branch_rc, 0, ocb); + + // Likely a stub due to the increased chain depth + let target0_address = branch.set_target(0, blockid, &next_ctx, ocb); // Call the branch generation function - branch.start_addr = Some(cb.get_write_ptr()); - gen_jump_branch(cb, branch.dst_addrs[0].unwrap(), None, BranchShape::Default); - branch.end_addr = Some(cb.get_write_ptr()); -} + asm_comment!(asm, "defer_compilation"); + asm.mark_branch_start(&branch); + if let Some(dst_addr) = target0_address { + branch.gen_fn.call(asm, Target::CodePtr(dst_addr), None); + } + asm.mark_branch_end(&branch); -// Remove all references to a block then free it. -fn free_block(blockref: &BlockRef) { - use crate::invariants::*; + // If the block we're deferring from is empty + if jit.get_starting_insn_idx() == jit.get_insn_idx() { + incr_counter!(defer_empty_count); + } - block_assumptions_free(blockref); + incr_counter!(defer_count); +} - let block = blockref.borrow(); +/// Remove a block from the live control flow graph. +/// Block must be initialized and incoming/outgoing edges +/// must also point to initialized blocks. 
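// A minimal sketch of the decision gen_direct_jump() above makes: either jump
// straight to an already-compiled version of the target, or emit nothing and let
// the target be compiled immediately afterwards (the `address: None` fallthrough
// stub). `DirectJump` and `plan_direct_jump` are hypothetical names used only here.
#[cfg(test)]
mod direct_jump_sketch {
    #[derive(Debug, PartialEq)]
    enum DirectJump {
        Existing(usize), // jump to the known start address of a compiled version
        Fallthrough,     // emit nothing; the target gets compiled right after
    }

    fn plan_direct_jump(known_version_addr: Option<usize>) -> DirectJump {
        match known_version_addr {
            Some(addr) => DirectJump::Existing(addr),
            None => DirectJump::Fallthrough,
        }
    }

    #[test]
    fn jumps_to_existing_versions_only() {
        assert_eq!(plan_direct_jump(Some(0x1000)), DirectJump::Existing(0x1000));
        assert_eq!(plan_direct_jump(None), DirectJump::Fallthrough);
    }
}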
+unsafe fn remove_from_graph(blockref: BlockRef) {
+    let block = unsafe { blockref.as_ref() };
 
     // Remove this block from the predecessor's targets
-    for pred_branchref in &block.incoming {
+    for pred_branchref in block.incoming.0.take().iter() {
         // Branch from the predecessor to us
-        let mut pred_branch = pred_branchref.borrow_mut();
+        let pred_branch = unsafe { pred_branchref.as_ref() };
 
         // If this is us, nullify the target block
-        for pred_succ_ref in &mut pred_branch.blocks {
-            if let Some(pred_succ) = pred_succ_ref {
-                if pred_succ == blockref {
-                    *pred_succ_ref = None;
-                }
+        for target_idx in 0..pred_branch.targets.len() {
+            // SAFETY: no mutation inside unsafe
+            let target_is_us = unsafe {
+                pred_branch.targets[target_idx]
+                    .ref_unchecked()
+                    .as_ref()
+                    .and_then(|target| target.get_block())
+                    .and_then(|target_block| (target_block == blockref).then(|| ()))
+                    .is_some()
+            };
+
+            if target_is_us {
+                pred_branch.targets[target_idx].set(None);
+            }
             }
         }
     }
 
     // For each outgoing branch
-    for out_branchref in &block.outgoing {
-        let out_branch = out_branchref.borrow();
-
+    for out_branchref in block.outgoing.iter() {
+        let out_branch = unsafe { out_branchref.as_ref() };
         // For each successor block
-        for succ in &out_branch.blocks {
-            if let Some(succ) = succ {
+        for out_target in out_branch.targets.iter() {
+            // SAFETY: copying out an Option<BlockRef>. No mutation.
+            let succ_block: Option<BlockRef> = unsafe {
+                out_target.ref_unchecked().as_ref().and_then(|target| target.get_block())
+            };
+
+            if let Some(succ_block) = succ_block {
                 // Remove outgoing branch from the successor's incoming list
-                let mut succ_block = succ.borrow_mut();
-                succ_block
-                    .incoming
-                    .retain(|succ_incoming| !Rc::ptr_eq(succ_incoming, out_branchref));
+                // SAFETY: caller promises the block has valid outgoing edges.
+                let succ_block = unsafe { succ_block.as_ref() };
+                // Temporarily move out of succ_block.incoming.
+                let succ_incoming = succ_block.incoming.0.take();
+                let mut succ_incoming = succ_incoming.into_vec();
+                succ_incoming.retain(|branch| branch != out_branchref);
+                succ_block.incoming.0.set(succ_incoming.into_boxed_slice()); // allocs. Rely on oom=abort
             }
         }
     }
+}
+
+/// Tear down a block and deallocate it.
+/// Caller has to ensure that the code tracked by the block is not
+/// running, as running code may hit [branch_stub_hit], which expects
+/// [Branch] to be live.
+///
+/// We currently ensure this through the `jit_cont` system in cont.c
+/// and sometimes through the GC calling [rb_yjit_iseq_free]. The GC
+/// has proven that an ISeq is not running if it calls us to free it.
+///
+/// For delayed deallocation, since dead blocks don't keep the
+/// blocks they refer to alive, by the time we get here their outgoing
+/// edges may be dangling. Pass `graph_intact=false` in such cases.
+pub unsafe fn free_block(blockref: BlockRef, graph_intact: bool) {
+    // Careful with order here.
+    // First, remove all pointers to the referent block
+    unsafe {
+        block_assumptions_free(blockref);
+
+        if graph_intact {
+            remove_from_graph(blockref);
+        }
+    }
 
-    // No explicit deallocation here as blocks are ref-counted.
+    // SAFETY: we should now have a unique pointer to the block
+    unsafe { dealloc_block(blockref) }
+}
+
+/// Deallocate a block and its outgoing branches. Blocks own their outgoing branches.
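// A simplified sketch of the edge bookkeeping remove_from_graph() performs above:
// when a block dies, it is filtered out of each successor's incoming list.
// Branches are represented by plain integers here; `unlink_incoming` is a
// hypothetical helper written only for this example.
#[cfg(test)]
mod unlink_sketch {
    fn unlink_incoming(incoming: &mut Vec<usize>, dead_branch: usize) {
        incoming.retain(|&branch| branch != dead_branch);
    }

    #[test]
    fn drops_only_the_dead_edge() {
        let mut incoming = vec![1, 2, 3];
        unlink_incoming(&mut incoming, 2);
        assert_eq!(incoming, vec![1, 3]);
    }
}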
+/// Caller must ensure that we have unique ownership for the referent block +unsafe fn dealloc_block(blockref: BlockRef) { + unsafe { + for outgoing in blockref.as_ref().outgoing.iter() { + // this Box::from_raw matches the Box::into_raw from PendingBranch::into_branch + mem::drop(Box::from_raw(outgoing.as_ptr())); + } + } + + // Deallocate the referent Block + unsafe { + // this Box::from_raw matches the Box::into_raw from JITState::into_block + mem::drop(Box::from_raw(blockref.as_ptr())); + } } // Some runtime checks for integrity of a program location pub fn verify_blockid(blockid: BlockId) { unsafe { assert!(rb_IMEMO_TYPE_P(blockid.iseq.into(), imemo_iseq) != 0); - assert!(blockid.idx < get_iseq_encoded_size(blockid.iseq)); + assert!(u32::from(blockid.idx) < get_iseq_encoded_size(blockid.iseq)); } } @@ -1950,114 +3258,129 @@ pub fn invalidate_block_version(blockref: &BlockRef) { // TODO: want to assert that all other ractors are stopped here. Can't patch // machine code that some other thread is running. - let block = blockref.borrow(); - let cb = CodegenGlobals::get_inline_cb(); + let block = unsafe { (*blockref).as_ref() }; + let id_being_invalidated = block.get_blockid(); + let mut cb = CodegenGlobals::get_inline_cb(); let ocb = CodegenGlobals::get_outlined_cb(); - verify_blockid(block.blockid); + verify_blockid(id_being_invalidated); + + #[cfg(feature = "disasm")] + { + // If dump_iseq_disasm is specified, print to console that blocks for matching ISEQ names were invalidated. + if let Some(substr) = get_option_ref!(dump_iseq_disasm).as_ref() { + let iseq_range = &block.iseq_range; + let iseq_location = iseq_get_location(block.iseq.get(), iseq_range.start); + if iseq_location.contains(substr) { + println!("Invalidating block from {}, ISEQ offsets [{}, {})", iseq_location, iseq_range.start, iseq_range.end); + } + } + } // Remove this block from the version array remove_block_version(blockref); // Get a pointer to the generated code for this block - let code_ptr = block.start_addr; + let block_start = block.start_addr; - // Make the the start of the block do an exit. This handles OOM situations + // Make the start of the block do an exit. This handles OOM situations // and some cases where we can't efficiently patch incoming branches. // Do this first, since in case there is a fallthrough branch into this // block, the patching loop below can overwrite the start of the block. // In those situations, there is hopefully no jumps to the start of the block // after patching as the start of the block would be in the middle of something // generated by branch_t::gen_fn. + let block_entry_exit = block + .entry_exit + .expect("invalidation needs the entry_exit field"); { - let block_start = block - .start_addr - .expect("invalidation needs constructed block"); - let block_end = block - .end_addr - .expect("invalidation needs constructed block"); - let block_entry_exit = block - .entry_exit - .expect("invalidation needs the entry_exit field"); + let block_end = block.get_end_addr(); if block_start == block_entry_exit { // Some blocks exit on entry. Patching a jump to the entry at the // entry makes an infinite loop. } else { - // TODO(alan) - // if (block.start_addr >= cb_get_ptr(cb, yjit_codepage_frozen_bytes)) // Don't patch frozen code region - // Patch in a jump to block.entry_exit. 
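// dealloc_block() above relies on Box::into_raw()/Box::from_raw() pairing up
// exactly once. This standalone sketch shows the ownership round trip on a
// plain String; nothing here is specific to YJIT.
#[cfg(test)]
mod raw_box_sketch {
    #[test]
    fn into_raw_then_from_raw_frees_exactly_once() {
        let raw: *mut String = Box::into_raw(Box::new(String::from("block")));
        // While only the raw pointer exists, Rust no longer tracks the
        // allocation; reclaiming it must happen exactly once, by hand.
        let boxed: Box<String> = unsafe { Box::from_raw(raw) };
        assert_eq!(*boxed, "block");
        // `boxed` drops here, freeing the allocation.
    }
}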
+ let cur_pos = cb.get_write_ptr(); + let cur_dropped_bytes = cb.has_dropped_bytes(); cb.set_write_ptr(block_start); - jmp_ptr(cb, block_entry_exit); + + let mut asm = Assembler::new(); + asm.jmp(block_entry_exit.as_side_exit()); + cb.set_dropped_bytes(false); + asm.compile(&mut cb, Some(ocb)).expect("can rewrite existing code"); + assert!( - cb.get_write_ptr() < block_end, - "invalidation wrote past end of block" + cb.get_write_ptr() <= block_end, + "invalidation wrote past end of block (code_size: {:?}, new_size: {})", + block.code_size(), + cb.get_write_ptr().as_offset() - block_start.as_offset(), ); cb.set_write_ptr(cur_pos); + cb.set_dropped_bytes(cur_dropped_bytes); } } // For each incoming branch - for branchref in &block.incoming { - let mut branch = branchref.borrow_mut(); - let target_idx = if branch.dst_addrs[0] == code_ptr { + for branchref in block.incoming.0.take().iter() { + let branch = unsafe { branchref.as_ref() }; + let target_idx = if branch.get_target_address(0) == Some(block_start) { 0 } else { 1 }; - assert_eq!(branch.dst_addrs[target_idx], code_ptr); - assert_eq!(blockref, branch.blocks[target_idx].as_ref().unwrap()); - - // Mark this target as being a stub - branch.blocks[target_idx] = None; - - // TODO(alan): - // Don't patch frozen code region - // if (branch.start_addr < cb_get_ptr(cb, yjit_codepage_frozen_bytes)) { - // continue; - // } - // Create a stub for this branch target - mem::drop(branch); // end RefCell borrow as get_branch_target() can borrow the branch. - let mut branch_target = - get_branch_target(block.blockid, &block.ctx, branchref, target_idx as u32, ocb); - - if branch_target.is_none() { - // We were unable to generate a stub (e.g. OOM). Use the block's - // exit instead of a stub for the block. It's important that we - // still patch the branch in this situation so stubs are unique - // to branches. Think about what could go wrong if we run out of - // memory in the middle of this loop. - branch_target = block.entry_exit; + // Assert that the incoming branch indeed points to the block being invalidated + // SAFETY: no mutation. + unsafe { + let incoming_target = branch.targets[target_idx].ref_unchecked().as_ref().unwrap(); + assert_eq!(Some(block_start), incoming_target.get_address()); + if let Some(incoming_block) = &incoming_target.get_block() { + assert_eq!(blockref, incoming_block); + } } - branch = branchref.borrow_mut(); - branch.dst_addrs[target_idx] = branch_target; + // Create a stub for this branch target + let stub_addr = gen_branch_stub(&block.ctx, ocb, branchref.as_ptr() as usize, target_idx as u32); + + // In case we were unable to generate a stub (e.g. OOM). Use the block's + // exit instead of a stub for the block. It's important that we + // still patch the branch in this situation so stubs are unique + // to branches. Think about what could go wrong if we run out of + // memory in the middle of this loop. + let stub_addr = stub_addr.unwrap_or(block_entry_exit); + + // Fill the branch target with a stub + branch.targets[target_idx].set(Some(Box::new(BranchTarget::Stub(Box::new(BranchStub { + address: Some(stub_addr), + iseq: block.iseq.clone(), + iseq_idx: block.iseq_range.start, + ctx: block.ctx, + }))))); // Check if the invalidated block immediately follows - let target_next = block.start_addr == branch.end_addr; + let target_next = block.start_addr == branch.end_addr.get(); if target_next { // The new block will no longer be adjacent. 
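// The patching above saves the write position, rewinds to the invalidated
// block, overwrites its start with a jump, then restores the position so
// normal code emission can continue. A toy version of that save/patch/restore
// dance, with a Vec<u8> standing in for the code block (hypothetical names):
#[cfg(test)]
mod patch_cursor_sketch {
    struct Buf { bytes: Vec<u8>, pos: usize }

    impl Buf {
        fn write(&mut self, byte: u8) {
            self.bytes[self.pos] = byte;
            self.pos += 1;
        }
    }

    #[test]
    fn patch_then_restore_cursor() {
        let mut buf = Buf { bytes: vec![0; 8], pos: 6 };
        let saved = buf.pos; // remember where normal emission left off
        buf.pos = 2;         // rewind to the spot being patched
        buf.write(0xE9);     // overwrite with (say) a jump opcode
        buf.pos = saved;     // restore so later emission continues correctly
        assert_eq!(buf.bytes[2], 0xE9);
        assert_eq!(buf.pos, 6);
    }
}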
// Note that we could be enlarging the branch and writing into the // start of the block being invalidated. - branch.shape = BranchShape::Default; + branch.gen_fn.set_shape(BranchShape::Default); } // Rewrite the branch with the new jump target address - regenerate_branch(cb, &mut branch); + let old_branch_size = branch.code_size(); + regenerate_branch(cb, branch); if target_next && branch.end_addr > block.end_addr { - dbg!( - branch.block.borrow().blockid.idx, - block.blockid.idx, - branch.end_addr, - block.end_addr, - block.code_size() + panic!("yjit invalidate rewrote branch past end of invalidated block: {:?} (code_size: {})", branch, block.code_size()); + } + if !target_next && branch.code_size() > old_branch_size { + panic!( + "invalidated branch grew in size (start_addr: {:?}, old_size: {}, new_size: {})", + branch.start_addr.raw_ptr(cb), old_branch_size, branch.code_size() ); - panic!("yjit invalidate rewrote branch past end of invalidated block"); } } @@ -2069,18 +3392,21 @@ pub fn invalidate_block_version(blockref: &BlockRef) { // points will always have an instruction index of 0. We'll need to // change this in the future when we support optional parameters because // they enter the function with a non-zero PC - if block.blockid.idx == 0 { - unsafe { rb_iseq_reset_jit_func(block.blockid.iseq) }; + if block.iseq_range.start == 0 { + // TODO: + // We could reset the exec counter to zero in rb_iseq_reset_jit_func() + // so that we eventually compile a new entry point when useful + unsafe { rb_iseq_reset_jit_func(block.iseq.get()) }; } - // TODO: - // May want to recompile a new entry point (for interpreter entry blocks) - // This isn't necessary for correctness - // FIXME: // Call continuation addresses on the stack can also be atomically replaced by jumps going to the stub. - free_block(blockref); + // SAFETY: This block was in a version_map earlier + // in this function before we removed it, so it's well connected. + unsafe { remove_from_graph(*blockref) }; + + delayed_deallocation(*blockref); ocb.unwrap().mark_all_executable(); cb.mark_all_executable(); @@ -2088,36 +3414,332 @@ pub fn invalidate_block_version(blockref: &BlockRef) { incr_counter!(invalidation_count); } +// We cannot deallocate blocks immediately after invalidation since there +// could be stubs waiting to access branch pointers. Return stubs can do +// this since patching the code for setting up return addresses does not +// affect old return addresses that are already set up to use potentially +// invalidated branch pointers. Example: +// def foo(n) +// if n == 2 +// # 1.times.each to create a cfunc frame to preserve the JIT frame +// # which will return to a stub housed in an invalidated block +// return 1.times.each { Object.define_method(:foo) {} } +// end +// +// foo(n + 1) +// end +// p foo(1) +pub fn delayed_deallocation(blockref: BlockRef) { + block_assumptions_free(blockref); + + let payload = get_iseq_payload(unsafe { blockref.as_ref() }.iseq.get()).unwrap(); + payload.dead_blocks.push(blockref); +} + +trait RefUnchecked { + type Contained; + unsafe fn ref_unchecked(&self) -> &Self::Contained; +} + +impl<T> RefUnchecked for Cell<T> { + type Contained = T; + + /// Gives a reference to the contents of a [Cell]. + /// Dangerous; please include a SAFETY note. + /// + /// An easy way to use this without triggering Undefined Behavior is to + /// 1. ensure there is transitively no Cell/UnsafeCell mutation in the `unsafe` block + /// 2. 
ensure the `unsafe` block does not return any references, so our + /// analysis is lexically confined. This is trivially true if the block + /// returns a `bool`, for example. Aggregates that store references have + /// explicit lifetime parameters that look like `<'a>`. + /// + /// There are other subtler situations that don't follow these rules yet + /// are still sound. + /// See `test_miri_ref_unchecked()` for examples. You can play with it + /// with `cargo +nightly miri test miri`. + unsafe fn ref_unchecked(&self) -> &Self::Contained { + // SAFETY: pointer is dereferenceable because it's from a &Cell. + // It's up to the caller to follow aliasing rules with the output + // reference. + unsafe { self.as_ptr().as_ref().unwrap() } + } +} + #[cfg(test)] mod tests { use crate::core::*; #[test] + fn type_size() { + // Check that we can store types in 4 bits, + // and all local types in 32 bits + assert_eq!(mem::size_of::<Type>(), 1); + assert!(Type::BlockParamProxy as usize <= 0b1111); + assert!(MAX_LOCAL_TYPES * 4 <= 32); + } + + #[test] + fn tempmapping_size() { + assert_eq!(mem::size_of::<TempMapping>(), 1); + } + + #[test] + fn local_types() { + let mut ctx = Context::default(); + + for i in 0..MAX_LOCAL_TYPES { + ctx.set_local_type(i, Type::Fixnum); + assert_eq!(ctx.get_local_type(i), Type::Fixnum); + ctx.set_local_type(i, Type::BlockParamProxy); + assert_eq!(ctx.get_local_type(i), Type::BlockParamProxy); + } + + ctx.set_local_type(0, Type::Fixnum); + ctx.clear_local_types(); + assert!(ctx.get_local_type(0) == Type::Unknown); + + // Make sure we don't accidentally set bits incorrectly + let mut ctx = Context::default(); + ctx.set_local_type(0, Type::Fixnum); + assert_eq!(ctx.get_local_type(0), Type::Fixnum); + ctx.set_local_type(2, Type::Fixnum); + ctx.set_local_type(1, Type::BlockParamProxy); + assert_eq!(ctx.get_local_type(0), Type::Fixnum); + assert_eq!(ctx.get_local_type(2), Type::Fixnum); + } + + #[test] + fn tempmapping() { + let t = TempMapping::map_to_stack(Type::Unknown); + assert_eq!(t.get_kind(), MapToStack); + assert_eq!(t.get_type(), Type::Unknown); + + let t = TempMapping::map_to_stack(Type::TString); + assert_eq!(t.get_kind(), MapToStack); + assert_eq!(t.get_type(), Type::TString); + + let t = TempMapping::map_to_local(7); + assert_eq!(t.get_kind(), MapToLocal); + assert_eq!(t.get_local_idx(), 7); + } + + #[test] + fn context_size() { + assert_eq!(mem::size_of::<Context>(), 23); + } + + #[test] fn types() { // Valid src => dst - assert_eq!(Type::Unknown.diff(Type::Unknown), 0); - assert_eq!(Type::UnknownImm.diff(Type::UnknownImm), 0); - assert_ne!(Type::UnknownImm.diff(Type::Unknown), usize::MAX); - assert_ne!(Type::Fixnum.diff(Type::Unknown), usize::MAX); - assert_ne!(Type::Fixnum.diff(Type::UnknownImm), usize::MAX); + assert_eq!(Type::Unknown.diff(Type::Unknown), TypeDiff::Compatible(0)); + assert_eq!(Type::UnknownImm.diff(Type::UnknownImm), TypeDiff::Compatible(0)); + assert_ne!(Type::UnknownImm.diff(Type::Unknown), TypeDiff::Incompatible); + assert_ne!(Type::Fixnum.diff(Type::Unknown), TypeDiff::Incompatible); + assert_ne!(Type::Fixnum.diff(Type::UnknownImm), TypeDiff::Incompatible); // Invalid src => dst - assert_eq!(Type::Unknown.diff(Type::UnknownImm), usize::MAX); - assert_eq!(Type::Unknown.diff(Type::Fixnum), usize::MAX); - assert_eq!(Type::Fixnum.diff(Type::UnknownHeap), usize::MAX); + assert_eq!(Type::Unknown.diff(Type::UnknownImm), TypeDiff::Incompatible); + assert_eq!(Type::Unknown.diff(Type::Fixnum), TypeDiff::Incompatible); + 
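// A standalone illustration of the ref_unchecked() discipline documented above:
// copy what you need out of the unsafe block instead of holding a reference
// across a later Cell mutation. Uses only Cell::as_ptr(); nothing here is YJIT-specific.
#[cfg(test)]
mod cell_peek_sketch {
    use std::cell::Cell;

    #[test]
    fn peek_then_mutate() {
        let slot: Cell<Option<u32>> = Cell::new(Some(7));

        // Peek: the unsafe block returns a plain bool, not a reference.
        let was_some = unsafe { (*slot.as_ptr()).is_some() };
        assert!(was_some);

        // Only after the temporary borrow has ended do we mutate the cell.
        slot.set(None);
        assert!(slot.get().is_none());
    }
}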
assert_eq!(Type::Fixnum.diff(Type::UnknownHeap), TypeDiff::Incompatible); + } + + #[test] + fn reg_temps() { + let mut reg_temps = RegTemps(0); + + // 0 means every slot is not spilled + for stack_idx in 0..MAX_REG_TEMPS { + assert_eq!(reg_temps.get(stack_idx), false); + } + + // Set 0, 2, 7 (RegTemps: 10100001) + reg_temps.set(0, true); + reg_temps.set(2, true); + reg_temps.set(3, true); + reg_temps.set(3, false); + reg_temps.set(7, true); + + // Get 0..8 + assert_eq!(reg_temps.get(0), true); + assert_eq!(reg_temps.get(1), false); + assert_eq!(reg_temps.get(2), true); + assert_eq!(reg_temps.get(3), false); + assert_eq!(reg_temps.get(4), false); + assert_eq!(reg_temps.get(5), false); + assert_eq!(reg_temps.get(6), false); + assert_eq!(reg_temps.get(7), true); + + // Test conflicts + assert_eq!(5, get_option!(num_temp_regs)); + assert_eq!(reg_temps.conflicts_with(0), false); // already set, but no conflict + assert_eq!(reg_temps.conflicts_with(1), false); + assert_eq!(reg_temps.conflicts_with(2), true); // already set, and conflicts with 7 + assert_eq!(reg_temps.conflicts_with(3), false); + assert_eq!(reg_temps.conflicts_with(4), false); + assert_eq!(reg_temps.conflicts_with(5), true); // not set, and will conflict with 0 + assert_eq!(reg_temps.conflicts_with(6), false); + assert_eq!(reg_temps.conflicts_with(7), true); // already set, and conflicts with 2 } #[test] fn context() { // Valid src => dst - assert_eq!(Context::default().diff(&Context::default()), 0); + assert_eq!(Context::default().diff(&Context::default()), TypeDiff::Compatible(0)); // Try pushing an operand and getting its type - let mut ctx = Context::default(); - ctx.stack_push(Type::Fixnum); - let top_type = ctx.get_opnd_type(StackOpnd(0)); + let mut asm = Assembler::new(); + asm.stack_push(Type::Fixnum); + let top_type = asm.ctx.get_opnd_type(StackOpnd(0)); assert!(top_type == Type::Fixnum); // TODO: write more tests for Context type diff } + + #[test] + fn context_upgrade_local() { + let mut asm = Assembler::new(); + asm.stack_push_local(0); + asm.ctx.upgrade_opnd_type(StackOpnd(0), Type::Nil); + assert_eq!(Type::Nil, asm.ctx.get_opnd_type(StackOpnd(0))); + } + + #[test] + fn context_chain_depth() { + let mut ctx = Context::default(); + assert_eq!(ctx.get_chain_depth(), 0); + assert_eq!(ctx.is_return_landing(), false); + assert_eq!(ctx.is_deferred(), false); + + for _ in 0..5 { + ctx.increment_chain_depth(); + } + assert_eq!(ctx.get_chain_depth(), 5); + + ctx.set_as_return_landing(); + assert_eq!(ctx.is_return_landing(), true); + + ctx.clear_return_landing(); + assert_eq!(ctx.is_return_landing(), false); + + ctx.mark_as_deferred(); + assert_eq!(ctx.is_deferred(), true); + + ctx.reset_chain_depth_and_defer(); + assert_eq!(ctx.get_chain_depth(), 0); + assert_eq!(ctx.is_deferred(), false); + } + + #[test] + fn shift_stack_for_send() { + let mut asm = Assembler::new(); + + // Push values to simulate send(:name, arg) with 6 items already on-stack + for _ in 0..6 { + asm.stack_push(Type::Fixnum); + } + asm.stack_push(Type::Unknown); + asm.stack_push(Type::ImmSymbol); + asm.stack_push(Type::Unknown); + + // This method takes argc of the sendee, not argc of send + asm.shift_stack(1); + + // The symbol should be gone + assert_eq!(Type::Unknown, asm.ctx.get_opnd_type(StackOpnd(0))); + assert_eq!(Type::Unknown, asm.ctx.get_opnd_type(StackOpnd(1))); + } + + #[test] + fn test_miri_ref_unchecked() { + let blockid = BlockId { + iseq: ptr::null(), + idx: 0, + }; + let cb = CodeBlock::new_dummy(1024); + let dumm_addr = cb.get_write_ptr(); 
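// The reg_temps assertions above boil down to bit twiddling on a u8. A minimal
// stand-in (`Bitmap` is a hypothetical type; the real RegTemps also models
// register conflicts, which this sketch leaves out):
#[cfg(test)]
mod reg_bitmap_sketch {
    #[derive(Clone, Copy, Default)]
    struct Bitmap(u8);

    impl Bitmap {
        fn get(&self, idx: u8) -> bool {
            (self.0 >> idx) & 1 == 1
        }
        fn set(&mut self, idx: u8, val: bool) {
            if val { self.0 |= 1 << idx } else { self.0 &= !(1 << idx) }
        }
    }

    #[test]
    fn set_get_clear() {
        let mut map = Bitmap::default();
        map.set(0, true);
        map.set(2, true);
        map.set(2, false);
        assert!(map.get(0));
        assert!(!map.get(2));
        assert!(!map.get(7));
    }
}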
+ let block = JITState::new(blockid, Context::default(), dumm_addr, ptr::null()) + .into_block(0, dumm_addr, dumm_addr, vec![]); + let _dropper = BlockDropper(block); + + // Outside of brief moments during construction, + // we're always working with &Branch (a shared reference to a Branch). + let branch: &Branch = &Branch { + gen_fn: BranchGenFn::JZToTarget0, + block, + start_addr: dumm_addr, + end_addr: Cell::new(dumm_addr), + targets: [Cell::new(None), Cell::new(Some(Box::new(BranchTarget::Stub(Box::new(BranchStub { + iseq: Cell::new(ptr::null()), + iseq_idx: 0, + address: None, + ctx: Context::default(), + })))))] + }; + // For easier soundness reasoning, make sure the reference returned does not out live the + // `unsafe` block! It's tempting to do, but it leads to non-local issues. + // Here is an example where it goes wrong: + if false { + for target in branch.targets.iter().as_ref() { + if let Some(btarget) = unsafe { target.ref_unchecked() } { + // btarget is derived from the usnafe block! + target.set(None); // This drops the contents of the cell... + assert!(btarget.get_address().is_none()); // but `btarget` is still live! UB. + } + } + } + + // Do something like this instead. It's not pretty, but it's easier to vet for UB this way. + for target in branch.targets.iter().as_ref() { + // SAFETY: no mutation within unsafe + if unsafe { target.ref_unchecked().is_none() } { + continue; + } + // SAFETY: no mutation within unsafe + assert!(unsafe { target.ref_unchecked().as_ref().unwrap().get_address().is_none() }); + target.set(None); + } + + // A more subtle situation where we do Cell/UnsafeCell mutation over the + // lifetime of the reference released by ref_unchecked(). + branch.targets[0].set(Some(Box::new(BranchTarget::Stub(Box::new(BranchStub { + iseq: Cell::new(ptr::null()), + iseq_idx: 0, + address: None, + ctx: Context::default(), + }))))); + // Invalid ISeq; we never dereference it. + let secret_iseq = NonNull::<rb_iseq_t>::dangling().as_ptr(); + unsafe { + if let Some(branch_target) = branch.targets[0].ref_unchecked().as_ref() { + if let BranchTarget::Stub(stub) = branch_target.as_ref() { + // SAFETY: + // This is a Cell mutation, but it mutates the contents + // of a a Cell<IseqPtr>, which is a different type + // from the type of Cell found in `Branch::targets`, so + // there is no chance of mutating the Cell that we called + // ref_unchecked() on above. + Cell::set(&stub.iseq, secret_iseq); + } + } + }; + // Check that we indeed changed the iseq of the stub + // Cell::take moves out of the cell. + assert_eq!( + secret_iseq as usize, + branch.targets[0].take().unwrap().get_blockid().iseq as usize + ); + + struct BlockDropper(BlockRef); + impl Drop for BlockDropper { + fn drop(&mut self) { + // SAFETY: we have ownership because the test doesn't stash + // the block away in any global structure. + // Note that the test being self-contained is also why we + // use dealloc_block() over free_block(), as free_block() touches + // the global invariants tables unavailable in tests. + unsafe { dealloc_block(self.0) }; + } + } + } } |
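// The BlockDropper in test_miri_ref_unchecked() above is a classic drop guard:
// cleanup runs when the guard leaves scope, even if the test exits early. A
// self-contained version of the same pattern (hypothetical names only):
#[cfg(test)]
mod drop_guard_sketch {
    use std::cell::Cell;
    use std::rc::Rc;

    struct Guard(Rc<Cell<bool>>);

    impl Drop for Guard {
        fn drop(&mut self) {
            // Stand-in for dealloc_block(): record that cleanup happened.
            self.0.set(true);
        }
    }

    #[test]
    fn cleanup_runs_when_the_guard_goes_out_of_scope() {
        let freed = Rc::new(Cell::new(false));
        {
            let _guard = Guard(Rc::clone(&freed));
            assert!(!freed.get());
        }
        assert!(freed.get());
    }
}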