40 files changed, 6427 insertions, 272 deletions
diff --git a/yjit/src/asm/arm64/README.md b/yjit/src/asm/arm64/README.md
new file mode 100644
index 0000000000..edae5773e8
--- /dev/null
+++ b/yjit/src/asm/arm64/README.md
@@ -0,0 +1,16 @@
+# Arm64
+
+This module is responsible for encoding YJIT operands into an appropriate Arm64 encoding.
+
+## Architecture
+
+Every instruction in the Arm64 instruction set is 32 bits wide and is represented in little-endian order. Because they are all the same size, we represent each instruction by a struct that implements `From<T> for u32`, which contains the mechanism for encoding each instruction. The encoding for each instruction is shown in the documentation for the struct that ends up being created.
+
+In general each set of bytes inside of the struct has either a direct value (usually a `u8`/`u16`) or some kind of `enum` that can be converted directly into a `u32`. For more complicated pieces of encoding (e.g., bitmask immediates) a corresponding module under the `arg` namespace is available.
+
+## Helpful links
+
+* [Arm A64 Instruction Set Architecture](https://developer.arm.com/documentation/ddi0596/2021-12?lang=en) Official documentation
+* [armconverter.com](https://armconverter.com/) A website that encodes Arm assembly syntax
+* [hatstone](https://github.com/tenderlove/hatstone) A wrapper around the Capstone disassembler written in Ruby
+* [onlinedisassembler.com](https://onlinedisassembler.com/odaweb/) A web-based disassembler
diff --git a/yjit/src/asm/arm64/arg/bitmask_imm.rs b/yjit/src/asm/arm64/arg/bitmask_imm.rs
new file mode 100644
index 0000000000..6b71a73d2c
--- /dev/null
+++ b/yjit/src/asm/arm64/arg/bitmask_imm.rs
@@ -0,0 +1,255 @@
+/// Immediates used by the logical immediate instructions are not actually the
+/// immediate value, but instead are encoded into a 13-bit wide mask of 3
+/// elements. This allows many more values to be represented than 13 bits would
+/// normally allow, at the expense of not being able to represent every possible
+/// value.
+///
+/// In order for a number to be encodable in this form, the binary
+/// representation must consist of a single set of contiguous 1s. That pattern
+/// must then be replicatable across all of the bits either 1, 2, 4, 8, 16, or
+/// 32 times (rotated or not).
+///
+/// For example, 1 (0b1), 2 (0b10), 3 (0b11), and 4 (0b100) are all valid.
+/// However, 5 (0b101) is invalid, because it contains 2 sets of 1s and cannot
+/// be replicated across 64 bits.
+///
+/// Some more examples to illustrate the idea of replication:
+/// * 0x5555555555555555 is a valid value (0b0101...) because it consists of a
+///   single set of 1s which can be replicated across all of the bits 32 times.
+/// * 0xf0f0f0f0f0f0f0f0 is a valid value (0b1111000011110000...) because it
+///   consists of a single set of 1s which can be replicated across all of the
+///   bits 8 times (rotated by 4 bits).
+/// * 0x0ff00ff00ff00ff0 is a valid value (0b0000111111110000...) because it
+///   consists of a single set of 1s which can be replicated across all of the
+///   bits 4 times (rotated by 12 bits).
+///
+/// To encode the values, there are 3 elements:
+/// * n = 1 if the pattern is 64-bits wide, 0 otherwise
+/// * imms = the size of the pattern, a 0, and then one less than the number of
+///   sequential 1s
+/// * immr = the number of right rotations to apply to the pattern to get the
+///   target value
+///
+pub struct BitmaskImmediate {
+    n: u8, // 1 when the pattern is 64 bits wide, 0 otherwise
+    imms: u8, // pattern-size marker combined with (number of 1s - 1)
+    immr: u8 // number of right rotations applied to the pattern
+}
+
+impl TryFrom<u64> for BitmaskImmediate {
+    type Error = ();
+
+    /// Attempt to convert a u64 into a BitmaskImmediate, returning `Err(())`
+    /// when the value is not a rotated, replicated run of contiguous 1s.
+    /// The implementation here is largely based on this blog post:
+    /// https://dougallj.wordpress.com/2021/10/30/bit-twiddling-optimising-aarch64-logical-immediate-encoding-and-decoding/
+    fn try_from(value: u64) -> Result<Self, Self::Error> {
+        if value == 0 || value == u64::MAX { // all-zeros and all-ones are rejected up front
+            return Err(());
+        }
+
+        fn rotate_right(value: u64, rotations: u32) -> u64 { // 64-bit rotate right; only the low 6 bits of `rotations` are used
+            (value >> (rotations & 0x3F)) |
+            (value << (rotations.wrapping_neg() & 0x3F))
+        }
+
+        let rotations = (value & (value + 1)).trailing_zeros(); // drop the trailing run of 1s, then count trailing zeros (64 if nothing is left)
+        let normalized = rotate_right(value, rotations & 0x3F); // rotate so that a run of 1s starts at bit 0
+
+        let zeroes = normalized.leading_zeros(); // zero bits above the highest 1
+        let ones = (!normalized).trailing_zeros(); // length of the run of 1s starting at bit 0
+        let size = zeroes + ones; // candidate width of the repeating element
+
+        if rotate_right(value, size & 0x3F) != value { // the value must be unchanged when rotated by `size` bits
+            return Err(());
+        }
+
+        Ok(BitmaskImmediate {
+            n: ((size >> 6) & 1) as u8, // bit 6 of size, i.e. set only when size is 64
+            imms: (((size << 1).wrapping_neg() | (ones - 1)) & 0x3F) as u8, // size marker in the high bits, (ones - 1) in the low bits
+            immr: ((rotations.wrapping_neg() & (size - 1)) & 0x3F) as u8 // negated rotation count masked by (size - 1)
+        })
+    }
+}
+
+impl BitmaskImmediate {
+    /// Attempt to make a BitmaskImmediate for a 32 bit register.
+    /// The result has N==0, which is required for some 32-bit instructions.
+    /// Note that the exact same BitmaskImmediate produces different values
+    /// depending on the size of the target register.
+    pub fn new_32b_reg(value: u32) -> Result<Self, ()> {
+        // The same bit pattern replicated to u64
+        let value = value as u64;
+        let replicated: u64 = (value << 32) | value; // low 32 bits copied into the high 32 bits
+        let converted = Self::try_from(replicated);
+        if let Ok(ref imm) = converted {
+            assert_eq!(0, imm.n); // panics if a successful encoding used the 64-bit element size
+        }
+
+        converted
+    }
+}
+
+impl BitmaskImmediate {
+    /// Encode a bitmask immediate into a 32-bit value laid out as n:immr:imms.
+    pub fn encode(self) -> u32 {
+        0
+        | ((self.n as u32) << 12) // n is placed at bit 12
+        | ((self.immr as u32) << 6) // immr is placed starting at bit 6
+        | (self.imms as u32) // imms is placed in the lowest bits
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_failures() {
+        [5, 9, 10, 11, 13, 17, 18, 19].iter().for_each(|&imm| {
+            assert!(BitmaskImmediate::try_from(imm).is_err());
+        });
+    }
+
+    #[test]
+    fn test_negative() {
+        let bitmask: BitmaskImmediate = (-9_i64 as u64).try_into().unwrap();
+        let encoded: u32 = bitmask.encode();
+        assert_eq!(7998, encoded);
+    }
+
+    #[test]
+    fn test_size_2_minimum() {
+        let bitmask = BitmaskImmediate::try_from(0x5555555555555555);
+        assert!(matches!(bitmask, Ok(BitmaskImmediate { n: 0, immr: 0b000000, imms: 0b111100 })));
+    }
+
+    #[test]
+    fn test_size_2_maximum() {
+        let bitmask = BitmaskImmediate::try_from(0xaaaaaaaaaaaaaaaa);
+        assert!(matches!(bitmask, Ok(BitmaskImmediate { n: 0, immr: 0b000001, imms: 0b111100 })));
+    }
+
+    #[test]
+    fn test_size_4_minimum() {
+        let bitmask = BitmaskImmediate::try_from(0x1111111111111111);
+        assert!(matches!(bitmask, Ok(BitmaskImmediate { n: 0, immr: 0b000000, imms: 0b111000 })));
+    }
+
+    #[test]
+    fn test_size_4_rotated() {
+        let bitmask = BitmaskImmediate::try_from(0x6666666666666666);
+        assert!(matches!(bitmask, Ok(BitmaskImmediate { n: 0, immr: 0b000011, imms: 0b111001 })));
+    }
+
+    #[test]
+    fn test_size_4_maximum() {
+        let bitmask = BitmaskImmediate::try_from(0xeeeeeeeeeeeeeeee);
+        assert!(matches!(bitmask, Ok(BitmaskImmediate { n: 0, immr: 0b000011, imms: 0b111010 })));
+    }
+
+    #[test]
+    fn test_size_8_minimum() {
+        let bitmask = BitmaskImmediate::try_from(0x0101010101010101);
+        assert!(matches!(bitmask, Ok(BitmaskImmediate { n: 0, immr: 0b000000, imms: 0b110000 })));
+    }
+
+    #[test]
+    fn test_size_8_rotated() {
+        let bitmask = BitmaskImmediate::try_from(0x1818181818181818);
+        assert!(matches!(bitmask, Ok(BitmaskImmediate { n: 0, immr: 0b000101, imms: 0b110001 })));
+    }
+
+    #[test]
+    fn test_size_8_maximum() {
+        let bitmask = BitmaskImmediate::try_from(0xfefefefefefefefe);
+        assert!(matches!(bitmask, Ok(BitmaskImmediate { n: 0, immr: 0b000111, imms: 0b110110 })));
+    }
+
+    #[test]
+    fn test_size_16_minimum() {
+        let bitmask = BitmaskImmediate::try_from(0x0001000100010001);
+        assert!(matches!(bitmask, Ok(BitmaskImmediate { n: 0, immr: 0b000000, imms: 0b100000 })));
+    }
+
+    #[test]
+    fn test_size_16_rotated() {
+        let bitmask = BitmaskImmediate::try_from(0xff8fff8fff8fff8f);
+        assert!(matches!(bitmask, Ok(BitmaskImmediate { n: 0, immr: 0b001001, imms: 0b101100 })));
+    }
+
+    #[test]
+    fn test_size_16_maximum() {
+        let bitmask = BitmaskImmediate::try_from(0xfffefffefffefffe);
+        assert!(matches!(bitmask, Ok(BitmaskImmediate { n: 0, immr: 0b001111, imms: 0b101110 })));
+    }
+
+    #[test]
+    fn test_size_32_minimum() {
+        let bitmask = BitmaskImmediate::try_from(0x0000000100000001);
+        assert!(matches!(bitmask, Ok(BitmaskImmediate { n: 0, immr: 0b000000, imms: 0b000000 })));
+    }
+
+    #[test]
+    fn test_size_32_rotated() {
+        let bitmask = BitmaskImmediate::try_from(0x3fffff003fffff00);
+        assert!(matches!(bitmask, Ok(BitmaskImmediate { n: 0, immr: 0b011000, imms: 0b010101 })));
+    }
+
+    #[test]
+    fn test_size_32_maximum() {
+        let bitmask = BitmaskImmediate::try_from(0xfffffffefffffffe);
+        assert!(matches!(bitmask, Ok(BitmaskImmediate { n: 0, immr: 0b011111, imms: 0b011110 })));
+    }
+
+    #[test]
+    fn test_size_64_minimum() {
+        let bitmask = BitmaskImmediate::try_from(0x0000000000000001);
+        assert!(matches!(bitmask, Ok(BitmaskImmediate { n: 1, immr: 0b000000, imms: 0b000000 })));
+    }
+
+    #[test]
+    fn test_size_64_rotated() {
+        let bitmask = BitmaskImmediate::try_from(0x0000001fffff0000);
+        assert!(matches!(bitmask, Ok(BitmaskImmediate { n: 1, immr: 0b110000, imms: 0b010100 })));
+    }
+
+    #[test]
+    fn test_size_64_maximum() {
+        let bitmask = BitmaskImmediate::try_from(0xfffffffffffffffe);
+        assert!(matches!(bitmask, Ok(BitmaskImmediate { n: 1, immr: 0b111111, imms: 0b111110 })));
+    }
+
+    #[test]
+    fn test_size_64_invalid() {
+        let bitmask = BitmaskImmediate::try_from(u64::MAX);
+        assert!(matches!(bitmask, Err(())));
+    }
+
+    #[test]
+    fn test_all_valid_32b_pattern() {
+        let mut patterns = vec![];
+        for pattern_size in [2, 4, 8, 16, 32_u64] {
+            for ones_count in 1..pattern_size {
+                for rotation in 0..pattern_size {
+                    let ones = (1_u64 << ones_count) - 1;
+                    let rotated = (ones >> rotation) |
+                        ((ones & ((1 << rotation) - 1)) << (pattern_size - rotation));
+                    let mut replicated = rotated;
+                    let mut shift = pattern_size;
+                    while shift < 32 {
+                        replicated |= replicated << shift;
+                        shift *= 2;
+                    }
+                    let replicated: u32 = replicated.try_into().unwrap();
+                    assert!(BitmaskImmediate::new_32b_reg(replicated).is_ok());
+                    patterns.push(replicated);
+                }
+            }
+        }
+        patterns.sort();
+        patterns.dedup();
+        // Up to {size}-1 ones, and a total of {size} possible rotations.
+        assert_eq!(1*2 + 3*4 + 7*8 + 15*16 + 31*32, patterns.len());
+    }
+}
diff --git a/yjit/src/asm/arm64/arg/condition.rs b/yjit/src/asm/arm64/arg/condition.rs
new file mode 100644
index 0000000000..f711b8b0d8
--- /dev/null
+++ b/yjit/src/asm/arm64/arg/condition.rs
@@ -0,0 +1,52 @@
+/// Various instructions in A64 can have condition codes attached. This struct
+/// is a namespace for all of the various kinds of conditions, exposing their
+/// respective encodings as `u8` associated constants.
+pub struct Condition;
+
+impl Condition {
+    pub const EQ: u8 = 0b0000; // equal to
+    pub const NE: u8 = 0b0001; // not equal to
+    pub const CS: u8 = 0b0010; // carry set (alias for HS)
+    pub const CC: u8 = 0b0011; // carry clear (alias for LO)
+    pub const MI: u8 = 0b0100; // minus, negative
+    pub const PL: u8 = 0b0101; // positive or zero
+    pub const VS: u8 = 0b0110; // signed overflow
+    pub const VC: u8 = 0b0111; // no signed overflow
+    pub const HI: u8 = 0b1000; // greater than (unsigned)
+    pub const LS: u8 = 0b1001; // less than or equal to (unsigned)
+    pub const GE: u8 = 0b1010; // greater than or equal to (signed)
+    pub const LT: u8 = 0b1011; // less than (signed)
+    pub const GT: u8 = 0b1100; // greater than (signed)
+    pub const LE: u8 = 0b1101; // less than or equal to (signed)
+    pub const AL: u8 = 0b1110; // always
+
+    pub const fn inverse(condition: u8) -> u8 { // returns the encoding of the opposite condition
+        match condition {
+            Condition::EQ => Condition::NE,
+            Condition::NE => Condition::EQ,
+
+            Condition::CS => Condition::CC,
+            Condition::CC => Condition::CS,
+
+            Condition::MI => Condition::PL,
+            Condition::PL => Condition::MI,
+
+            Condition::VS => Condition::VC,
+            Condition::VC => Condition::VS,
+
+            Condition::HI => Condition::LS,
+            Condition::LS => Condition::HI,
+
+            Condition::LT => Condition::GE,
+            Condition::GE => Condition::LT,
+
+            Condition::GT => Condition::LE,
+            Condition::LE => Condition::GT,
+
+            Condition::AL => Condition::AL, // "always" is returned unchanged
+
+            _ => panic!("Unknown condition") // any value without a constant above panics
+
+        }
+    }
+}
diff --git a/yjit/src/asm/arm64/arg/inst_offset.rs b/yjit/src/asm/arm64/arg/inst_offset.rs
new file mode 100644
index 0000000000..f4a6bc73a0
--- /dev/null
+++ b/yjit/src/asm/arm64/arg/inst_offset.rs
@@ -0,0 +1,47 @@
+/// There are a lot of instructions in the AArch64 architecture that take an
+/// offset in terms of number of instructions. Usually they are jump
+/// instructions or instructions that load a value relative to the current PC.
+///
+/// This struct is used to mark those locations instead of a generic operand in
+/// order to give better clarity to the developer when reading the AArch64
+/// backend code. It also helps to clarify that everything is in terms of a
+/// number of instructions and not a number of bytes (i.e., the offset is the
+/// number of bytes divided by 4).
+#[derive(Copy, Clone)]
+pub struct InstructionOffset(i32);
+
+impl InstructionOffset {
+    /// Create a new instruction offset from a number of instructions.
+    pub fn from_insns(insns: i32) -> Self {
+        InstructionOffset(insns)
+    }
+
+    /// Create a new instruction offset from a byte count; panics unless it is a multiple of 4.
+    pub fn from_bytes(bytes: i32) -> Self {
+        assert_eq!(bytes % 4, 0, "Byte offset must be a multiple of 4");
+        InstructionOffset(bytes / 4) // stored as a count of 4-byte instructions
+    }
+}
+
+impl From<i32> for InstructionOffset {
+    /// Convert an i32 number of instructions into an instruction offset.
+    fn from(value: i32) -> Self {
+        InstructionOffset(value)
+    }
+}
+
+impl From<InstructionOffset> for i32 {
+    /// Convert an instruction offset into a number of instructions as an i32.
+    fn from(offset: InstructionOffset) -> Self {
+        offset.0
+    }
+}
+
+impl From<InstructionOffset> for i64 {
+    /// Convert an instruction offset into a number of instructions as an i64.
+    /// This is useful for when we're checking how many bits this offset fits
+    /// into.
+    fn from(offset: InstructionOffset) -> Self {
+        offset.0.into()
+    }
+}
diff --git a/yjit/src/asm/arm64/arg/mod.rs b/yjit/src/asm/arm64/arg/mod.rs
new file mode 100644
index 0000000000..7eb37834f9
--- /dev/null
+++ b/yjit/src/asm/arm64/arg/mod.rs
@@ -0,0 +1,18 @@
+// This module contains various A64 instruction arguments and the logic
+// necessary to encode them.
+
+mod bitmask_imm;
+mod condition;
+mod inst_offset;
+mod sf;
+mod shifted_imm;
+mod sys_reg;
+mod truncate;
+
+pub use bitmask_imm::BitmaskImmediate;
+pub use condition::Condition;
+pub use inst_offset::InstructionOffset;
+pub use sf::Sf;
+pub use shifted_imm::ShiftedImmediate;
+pub use sys_reg::SystemRegister;
+pub use truncate::{truncate_imm, truncate_uimm};
diff --git a/yjit/src/asm/arm64/arg/sf.rs b/yjit/src/asm/arm64/arg/sf.rs
new file mode 100644
index 0000000000..c2fd33302c
--- /dev/null
+++ b/yjit/src/asm/arm64/arg/sf.rs
@@ -0,0 +1,19 @@
+/// This is commonly the top-most bit in the encoding of the instruction, and
+/// represents whether register operands should be treated as 64-bit registers
+/// or 32-bit registers.
+pub enum Sf {
+    Sf32 = 0b0,
+    Sf64 = 0b1
+}
+
+/// A convenience conversion from the number of bits of a register operand
+/// (32 or 64) directly into an Sf enum variant. Panics on any other width.
+impl From<u8> for Sf {
+    fn from(num_bits: u8) -> Self {
+        match num_bits {
+            64 => Sf::Sf64,
+            32 => Sf::Sf32,
+            _ => panic!("Invalid number of bits: {}", num_bits)
+        }
+    }
+}
diff --git a/yjit/src/asm/arm64/arg/shifted_imm.rs b/yjit/src/asm/arm64/arg/shifted_imm.rs
new file mode 100644
index 0000000000..4602ac64ab
--- /dev/null
+++ b/yjit/src/asm/arm64/arg/shifted_imm.rs
@@ -0,0 +1,81 @@
+/// How much to shift the immediate by.
+pub enum Shift {
+    LSL0 = 0b0, // no shift
+    LSL12 = 0b1 // logical shift left by 12 bits
+}
+
+/// Some instructions accept a 12-bit immediate that has an optional shift
+/// attached to it. This allows encoding larger values than just fit into 12
+/// bits. We attempt to encode those here. If the values are too large we have
+/// to bail out.
+pub struct ShiftedImmediate {
+    shift: Shift,
+    value: u16
+}
+
+impl TryFrom<u64> for ShiftedImmediate {
+    type Error = ();
+
+    /// Attempt to convert a u64 into a ShiftedImmediate.
+    fn try_from(value: u64) -> Result<Self, Self::Error> {
+        let current = value;
+        if current < 2_u64.pow(12) { // fits in 12 bits as-is, so no shift is needed
+            return Ok(ShiftedImmediate { shift: Shift::LSL0, value: current as u16 });
+        }
+
+        if (current & (2_u64.pow(12) - 1) == 0) && ((current >> 12) < 2_u64.pow(12)) { // low 12 bits are clear and the remainder fits in 12 bits
+            return Ok(ShiftedImmediate { shift: Shift::LSL12, value: (current >> 12) as u16 });
+        }
+
+        Err(()) // neither form can represent the value
+    }
+}
+
+impl From<ShiftedImmediate> for u32 {
+    /// Encode a shifted immediate into a 32-bit value.
+    fn from(imm: ShiftedImmediate) -> Self {
+        0
+        | (((imm.shift as u32) & 1) << 12) // shift selector is placed at bit 12
+        | (imm.value as u32) // immediate value is placed in the low bits
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_no_shift() {
+        let expected_value = 256;
+        let result = ShiftedImmediate::try_from(expected_value);
+
+        match result {
+            Ok(ShiftedImmediate { shift: Shift::LSL0, value }) => assert_eq!(value as u64, expected_value),
+            _ => panic!("Unexpected shift value")
+        }
+    }
+
+    #[test]
+    fn test_maximum_no_shift() {
+        let expected_value = (1 << 12) - 1;
+        let result = ShiftedImmediate::try_from(expected_value);
+
+        match result {
+            Ok(ShiftedImmediate { shift: Shift::LSL0, value }) => assert_eq!(value as u64, expected_value),
+            _ => panic!("Unexpected shift value")
+        }
+    }
+
+    #[test]
+    fn test_with_shift() {
+        let result = ShiftedImmediate::try_from(256 << 12);
+
+        assert!(matches!(result, Ok(ShiftedImmediate { shift: Shift::LSL12, value: 256 })));
+    }
+
+    #[test]
+    fn test_unencodable() {
+        let result = ShiftedImmediate::try_from((256 << 12) + 1);
+        assert!(matches!(result, Err(())));
+    }
+}
diff --git a/yjit/src/asm/arm64/arg/sys_reg.rs b/yjit/src/asm/arm64/arg/sys_reg.rs
new file mode 100644
index 0000000000..41d71920cb
--- /dev/null
+++ b/yjit/src/asm/arm64/arg/sys_reg.rs
@@ -0,0 +1,6 @@
+/// The encoded representation of an A64 system register.
+/// https://developer.arm.com/documentation/ddi0601/2022-06/AArch64-Registers/
+pub enum SystemRegister {
+    /// https://developer.arm.com/documentation/ddi0601/2022-06/AArch64-Registers/NZCV--Condition-Flags?lang=en
+    NZCV = 0b1_011_0100_0010_000
+}
diff --git a/yjit/src/asm/arm64/arg/truncate.rs b/yjit/src/asm/arm64/arg/truncate.rs
new file mode 100644
index 0000000000..85d56ff202
--- /dev/null
+++ b/yjit/src/asm/arm64/arg/truncate.rs
@@ -0,0 +1,66 @@
+// There are many instances in AArch64 instruction encoding where you represent
+// an integer value with a particular bit width that isn't a power of 2. These
+// functions represent truncating those integer values down to the appropriate
+// number of bits.
+
+/// Truncate a signed immediate to fit into a compile-time known width. It is
+/// assumed before calling this function that the value fits into the correct
+/// size. If it doesn't, then this function will panic.
+///
+/// When the value is positive, this should effectively be a no-op since we're
+/// just dropping leading zeroes. When the value is negative we should only be
+/// dropping leading ones.
+pub fn truncate_imm<T: Into<i32>, const WIDTH: usize>(imm: T) -> u32 { // NOTE(review): the shifts below presumably require WIDTH < 32 — confirm
+    let value: i32 = imm.into();
+    let masked = (value as u32) & ((1 << WIDTH) - 1); // keep only the low WIDTH bits
+
+    // Assert that we didn't drop any bits by truncating.
+    if value >= 0 {
+        assert_eq!(value as u32, masked);
+    } else {
+        assert_eq!(value as u32, masked | (u32::MAX << WIDTH)); // refilling the dropped high bits with 1s must give back the original
+    }
+
+    masked
+}
+
+/// Truncate an unsigned immediate to fit into a compile-time known width. It is
+/// assumed before calling this function that the value fits into the correct
+/// size. If it doesn't, then this function will panic.
+///
+/// This should effectively be a no-op since we're just dropping leading zeroes.
+pub fn truncate_uimm<T: Into<u32>, const WIDTH: usize>(uimm: T) -> u32 { // NOTE(review): the shift below presumably requires WIDTH < 32 — confirm
+    let value: u32 = uimm.into();
+    let masked = value & ((1 << WIDTH) - 1); // keep only the low WIDTH bits
+
+    // Assert that we didn't drop any bits by truncating.
+    assert_eq!(value, masked);
+
+    masked
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_truncate_imm_positive() {
+        let inst = truncate_imm::<i32, 4>(5);
+        let result: u32 = inst;
+        assert_eq!(0b0101, result);
+    }
+
+    #[test]
+    fn test_truncate_imm_negative() {
+        let inst = truncate_imm::<i32, 4>(-5);
+        let result: u32 = inst;
+        assert_eq!(0b1011, result);
+    }
+
+    #[test]
+    fn test_truncate_uimm() {
+        let inst = truncate_uimm::<u32, 4>(5);
+        let result: u32 = inst;
+        assert_eq!(0b0101, result);
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/atomic.rs b/yjit/src/asm/arm64/inst/atomic.rs
new file mode 100644
index 0000000000..5ce497209c
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/atomic.rs
@@ -0,0 +1,86 @@
+/// The size of the register operands to this instruction.
+enum Size {
+    /// Using 32-bit registers.
+    Size32 = 0b10,
+
+    /// Using 64-bit registers.
+    Size64 = 0b11
+}
+
+/// A convenience conversion from the number of bits of a register operand
+/// (32 or 64) directly into a Size enum variant. Panics on any other width.
+impl From<u8> for Size {
+    fn from(num_bits: u8) -> Self {
+        match num_bits {
+            64 => Size::Size64,
+            32 => Size::Size32,
+            _ => panic!("Invalid number of bits: {}", num_bits)
+        }
+    }
+}
+
+/// The struct that represents an A64 atomic instruction that can be encoded.
+///
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |        1  1    1  0  0  0    1  1  1                     0  0  0  0    0  0                                   |
+/// | size                                 rs..............                       rn.............. rt.............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct Atomic {
+    /// The register holding the value to be loaded.
+    rt: u8,
+
+    /// The base register.
+    rn: u8,
+
+    /// The register holding the data value to be operated on.
+    rs: u8,
+
+    /// The size of the registers used in this instruction.
+    size: Size
+}
+
+impl Atomic {
+    /// LDADDAL
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDADD--LDADDA--LDADDAL--LDADDL--Atomic-add-on-word-or-doubleword-in-memory-?lang=en
+    pub fn ldaddal(rs: u8, rt: u8, rn: u8, num_bits: u8) -> Self {
+        Self { rt, rn, rs, size: num_bits.into() }
+    }
+}
+
+/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Loads-and-Stores?lang=en
+const FAMILY: u32 = 0b0100;
+
+impl From<Atomic> for u32 {
+    /// Convert an instruction into a 32-bit value.
+    fn from(inst: Atomic) -> Self {
+        0
+        | ((inst.size as u32) << 30)
+        | (0b11 << 28)
+        | (FAMILY << 25)
+        | (0b111 << 21)
+        | ((inst.rs as u32) << 16)
+        | ((inst.rn as u32) << 5)
+        | (inst.rt as u32)
+    }
+}
+
+impl From<Atomic> for [u8; 4] {
+    /// Convert an instruction into a 4 byte array.
+    fn from(inst: Atomic) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_ldaddal() {
+        let result: u32 = Atomic::ldaddal(20, 21, 22, 64).into();
+        assert_eq!(0xf8f402d5, result);
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/branch.rs b/yjit/src/asm/arm64/inst/branch.rs
new file mode 100644
index 0000000000..f15ef2a9b0
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/branch.rs
@@ -0,0 +1,100 @@
+/// Which operation to perform.
+enum Op {
+    /// Perform a BR instruction.
+    BR = 0b00,
+
+    /// Perform a BLR instruction.
+    BLR = 0b01,
+
+    /// Perform a RET instruction.
+    RET = 0b10
+}
+
+/// The struct that represents an A64 branch instruction that can be encoded.
+///
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |  1  1  0  1    0  1  1  0    0        1    1  1  1  1    0  0  0  0    0  0                   0    0  0  0  0 |
+/// |                                op...                                        rn.............. rm.............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct Branch {
+    /// The register holding the address to be branched to.
+    rn: u8,
+
+    /// The operation to perform.
+    op: Op
+}
+
+impl Branch {
+    /// BR
+    /// https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/BR--Branch-to-Register-?lang=en
+    pub fn br(rn: u8) -> Self {
+        Self { rn, op: Op::BR }
+    }
+
+    /// BLR
+    /// https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/BLR--Branch-with-Link-to-Register-?lang=en
+    pub fn blr(rn: u8) -> Self {
+        Self { rn, op: Op::BLR }
+    }
+
+    /// RET
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/RET--Return-from-subroutine-?lang=en
+    pub fn ret(rn: u8) -> Self {
+        Self { rn, op: Op::RET }
+    }
+}
+
+/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en
+const FAMILY: u32 = 0b101;
+
+impl From<Branch> for u32 {
+    /// Convert an instruction into a 32-bit value.
+    fn from(inst: Branch) -> Self {
+        0
+        | (0b11 << 30)
+        | (FAMILY << 26)
+        | (1 << 25)
+        | ((inst.op as u32) << 21)
+        | (0b11111 << 16)
+        | ((inst.rn as u32) << 5)
+    }
+}
+
+impl From<Branch> for [u8; 4] {
+    /// Convert an instruction into a 4 byte array.
+    fn from(inst: Branch) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_br() {
+        let result: u32 = Branch::br(0).into();
+        assert_eq!(0xd61f0000, result);
+    }
+
+    #[test]
+    fn test_blr() {
+        let result: u32 = Branch::blr(0).into();
+        assert_eq!(0xd63f0000, result);
+    }
+
+    #[test]
+    fn test_ret() {
+        let result: u32 = Branch::ret(30).into();
+        assert_eq!(0xd65f03C0, result);
+    }
+
+    #[test]
+    fn test_ret_rn() {
+        let result: u32 = Branch::ret(20).into();
+        assert_eq!(0xd65f0280, result);
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/branch_cond.rs b/yjit/src/asm/arm64/inst/branch_cond.rs
new file mode 100644
index 0000000000..fcc07f69aa
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/branch_cond.rs
@@ -0,0 +1,78 @@
+use super::super::arg::{InstructionOffset, truncate_imm};
+
+/// The struct that represents an A64 conditional branch instruction that can be
+/// encoded.
+///
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |  0  1  0  1    0  1  0  0                                                                     0               |
+/// |                             imm19...........................................................      cond....... |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct BranchCond {
+    /// The kind of condition to check before branching.
+    cond: u8,
+
+    /// The instruction offset from this instruction to branch to.
+    offset: InstructionOffset
+}
+
+impl BranchCond {
+    /// B.cond
+    /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/B-cond--Branch-conditionally-
+    pub fn bcond(cond: u8, offset: InstructionOffset) -> Self {
+        Self { cond, offset }
+    }
+}
+
+/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en
+const FAMILY: u32 = 0b101;
+
+impl From<BranchCond> for u32 {
+    /// Convert an instruction into a 32-bit value.
+    fn from(inst: BranchCond) -> Self {
+        0
+        | (1 << 30)
+        | (FAMILY << 26)
+        | (truncate_imm::<_, 19>(inst.offset) << 5)
+        | (inst.cond as u32)
+    }
+}
+
+impl From<BranchCond> for [u8; 4] {
+    /// Convert an instruction into a 4 byte array.
+    fn from(inst: BranchCond) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use super::super::super::arg::Condition;
+
+    #[test]
+    fn test_b_eq() {
+        let result: u32 = BranchCond::bcond(Condition::EQ, 32.into()).into();
+        assert_eq!(0x54000400, result);
+    }
+
+    #[test]
+    fn test_b_vs() {
+        let result: u32 = BranchCond::bcond(Condition::VS, 32.into()).into();
+        assert_eq!(0x54000406, result);
+    }
+
+    #[test]
+    fn test_b_eq_max() {
+        let result: u32 = BranchCond::bcond(Condition::EQ, ((1 << 18) - 1).into()).into();
+        assert_eq!(0x547fffe0, result);
+    }
+
+    #[test]
+    fn test_b_eq_min() {
+        let result: u32 = BranchCond::bcond(Condition::EQ, (-(1 << 18)).into()).into();
+        assert_eq!(0x54800000, result);
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/breakpoint.rs b/yjit/src/asm/arm64/inst/breakpoint.rs
new file mode 100644
index 0000000000..be4920ac76
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/breakpoint.rs
@@ -0,0 +1,55 @@
+/// The struct that represents an A64 breakpoint instruction that can be encoded.
+///
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |  1  1  0  1    0  1  0  0    0  0  1                                                          0    0  0  0  0 |
+/// |                                      imm16..................................................                  |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct Breakpoint {
+    /// The value to be captured by ESR_ELx.ISS
+    imm16: u16
+}
+
+impl Breakpoint {
+    /// BRK
+    /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/BRK--Breakpoint-instruction-
+    pub fn brk(imm16: u16) -> Self {
+        Self { imm16 }
+    }
+}
+
+/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en#control
+const FAMILY: u32 = 0b101;
+
+impl From<Breakpoint> for u32 {
+    /// Convert an instruction into a 32-bit value.
+    fn from(inst: Breakpoint) -> Self {
+        let imm16 = inst.imm16 as u32;
+
+        0
+        | (0b11 << 30)
+        | (FAMILY << 26)
+        | (1 << 21)
+        | (imm16 << 5)
+    }
+}
+
+impl From<Breakpoint> for [u8; 4] {
+    /// Convert an instruction into a 4 byte array.
+    fn from(inst: Breakpoint) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_brk() {
+        let result: u32 = Breakpoint::brk(7).into();
+        assert_eq!(0xd42000e0, result);
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/call.rs b/yjit/src/asm/arm64/inst/call.rs
new file mode 100644
index 0000000000..74debac7f7
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/call.rs
@@ -0,0 +1,104 @@
+use super::super::arg::{InstructionOffset, truncate_imm};
+
+/// The operation to perform for this instruction.
+enum Op {
+    /// Branch directly, with a hint that this is not a subroutine call or
+    /// return.
+    Branch = 0,
+
+    /// Branch directly, with a hint that this is a subroutine call or return.
+    BranchWithLink = 1
+}
+
+/// The struct that represents an A64 branch with or without link instruction
+/// that can be encoded.
+///
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |     0  0  1    0  1                                                                                           |
+/// | op                  imm26.................................................................................... |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct Call {
+    /// The PC-relative offset to jump to in terms of number of instructions.
+    offset: InstructionOffset,
+
+    /// Whether this is a plain branch (B) or a branch with link (BL).
+    op: Op
+}
+
+impl Call {
+    /// B
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/B--Branch-
+    pub fn b(offset: InstructionOffset) -> Self {
+        Self { offset, op: Op::Branch }
+    }
+
+    /// BL
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/BL--Branch-with-Link-?lang=en
+    pub fn bl(offset: InstructionOffset) -> Self {
+        Self { offset, op: Op::BranchWithLink }
+    }
+}
+
+/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en
+const FAMILY: u32 = 0b101;
+
+impl From<Call> for u32 {
+    /// Convert an instruction into a 32-bit value.
+    fn from(inst: Call) -> Self {
+        0
+        | ((inst.op as u32) << 31)
+        | (FAMILY << 26)
+        | truncate_imm::<_, 26>(inst.offset)
+    }
+}
+
+impl From<Call> for [u8; 4] {
+    /// Convert an instruction into a 4 byte array.
+    fn from(inst: Call) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_bl() {
+        let result: u32 = Call::bl(0.into()).into();
+        assert_eq!(0x94000000, result);
+    }
+
+    #[test]
+    fn test_bl_positive() {
+        let result: u32 = Call::bl(256.into()).into();
+        assert_eq!(0x94000100, result);
+    }
+
+    #[test]
+    fn test_bl_negative() {
+        let result: u32 = Call::bl((-256).into()).into();
+        assert_eq!(0x97ffff00, result);
+    }
+
+    #[test]
+    fn test_b() {
+        let result: u32 = Call::b(0.into()).into();
+        assert_eq!(0x14000000, result);
+    }
+
+    #[test]
+    fn test_b_positive() {
+        let result: u32 = Call::b(((1 << 25) - 1).into()).into();
+        assert_eq!(0x15ffffff, result);
+    }
+
+    #[test]
+    fn test_b_negative() {
+        let result: u32 = Call::b((-(1 << 25)).into()).into();
+        assert_eq!(0x16000000, result);
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/conditional.rs b/yjit/src/asm/arm64/inst/conditional.rs
new file mode 100644
index 0000000000..e1950e95b4
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/conditional.rs
@@ -0,0 +1,73 @@
+use super::super::arg::Sf;
+
+/// The struct that represents an A64 conditional instruction that can be
+/// encoded.
+///
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |     0  0  1    1  0  1  0    1  0  0                                   0  0                                   |
+/// | sf                                   rm..............   cond.......         rn.............. rd.............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct Conditional {
+    /// The number of the general-purpose destination register.
+    rd: u8,
+
+    /// The number of the first general-purpose source register.
+    rn: u8,
+
+    /// The condition to use for the conditional instruction.
+    cond: u8,
+
+    /// The number of the second general-purpose source register.
+    rm: u8,
+
+    /// The size of the registers of this instruction.
+    sf: Sf
+}
+
+impl Conditional {
+    /// CSEL
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CSEL--Conditional-Select-?lang=en
+    pub fn csel(rd: u8, rn: u8, rm: u8, cond: u8, num_bits: u8) -> Self {
+        Self { rd, rn, cond, rm, sf: num_bits.into() }
+    }
+}
+
+/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Register?lang=en#condsel
+const FAMILY: u32 = 0b101;
+
+impl From<Conditional> for u32 {
+    /// Convert an instruction into a 32-bit value.
+    fn from(inst: Conditional) -> Self {
+        0
+        | ((inst.sf as u32) << 31)
+        | (1 << 28)
+        | (FAMILY << 25)
+        | (1 << 23)
+        | ((inst.rm as u32) << 16)
+        | ((inst.cond as u32) << 12)
+        | ((inst.rn as u32) << 5)
+        | (inst.rd as u32)
+    }
+}
+
+impl From<Conditional> for [u8; 4] {
+    /// Convert an instruction into a 4 byte array.
+    fn from(inst: Conditional) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use super::super::super::arg::Condition;
+
+    #[test]
+    fn test_csel() {
+        let result: u32 = Conditional::csel(0, 1, 2, Condition::NE, 64).into();
+        assert_eq!(0x9a821020, result);
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/data_imm.rs b/yjit/src/asm/arm64/inst/data_imm.rs
new file mode 100644
index 0000000000..b474b00a52
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/data_imm.rs
@@ -0,0 +1,143 @@
+use super::super::arg::{Sf, ShiftedImmediate};
+
+/// The operation being performed by this instruction.
+enum Op {
+    Add = 0b0,
+    Sub = 0b1
+}
+
+/// Whether or not to update the flags when this instruction is performed.
+enum S {
+    LeaveFlags = 0b0,
+    UpdateFlags = 0b1
+}
+
+/// The struct that represents an A64 data processing -- immediate instruction
+/// that can be encoded.
+///
+/// Add/subtract (immediate)
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |           1    0  0  0  1    0                                                                                |
+/// | sf op  S                       sh imm12.................................... rn.............. rd.............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct DataImm {
+    /// The register number of the destination register.
+    rd: u8,
+
+    /// The register number of the first operand register.
+    rn: u8,
+
+    /// The immediate operand, including how much it is shifted by.
+    imm: ShiftedImmediate,
+
+    /// Whether or not to update the flags when this instruction is performed.
+    s: S,
+
+    /// The opcode for this instruction.
+    op: Op,
+
+    /// Whether or not this instruction is operating on 64-bit operands.
+    sf: Sf
+}
+
+impl DataImm {
+    /// ADD (immediate)
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADD--immediate---Add--immediate--?lang=en
+    pub fn add(rd: u8, rn: u8, imm: ShiftedImmediate, num_bits: u8) -> Self {
+        Self { rd, rn, imm, s: S::LeaveFlags, op: Op::Add, sf: num_bits.into() }
+    }
+
+    /// ADDS (immediate, set flags)
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADDS--immediate---Add--immediate---setting-flags-?lang=en
+    pub fn adds(rd: u8, rn: u8, imm: ShiftedImmediate, num_bits: u8) -> Self {
+        Self { rd, rn, imm, s: S::UpdateFlags, op: Op::Add, sf: num_bits.into() }
+    }
+
+    /// CMP (immediate)
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CMP--immediate---Compare--immediate---an-alias-of-SUBS--immediate--?lang=en
+    pub fn cmp(rn: u8, imm: ShiftedImmediate, num_bits: u8) -> Self {
+        Self::subs(31, rn, imm, num_bits)
+    }
+
+    /// SUB (immediate)
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/SUB--immediate---Subtract--immediate--?lang=en
+    pub fn sub(rd: u8, rn: u8, imm: ShiftedImmediate, num_bits: u8) -> Self {
+        Self { rd, rn, imm, s: S::LeaveFlags, op: Op::Sub, sf: num_bits.into() }
+    }
+
+    /// SUBS (immediate, set flags)
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/SUBS--immediate---Subtract--immediate---setting-flags-?lang=en
+    pub fn subs(rd: u8, rn: u8, imm: ShiftedImmediate, num_bits: u8) -> Self {
+        Self { rd, rn, imm, s: S::UpdateFlags, op: Op::Sub, sf: num_bits.into() }
+    }
+}
+
+/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Immediate?lang=en
+const FAMILY: u32 = 0b1000;
+
+impl From<DataImm> for u32 {
+    /// Convert an instruction into a 32-bit value.
+    fn from(inst: DataImm) -> Self {
+        let imm: u32 = inst.imm.into();
+
+        0
+        | ((inst.sf as u32) << 31)
+        | ((inst.op as u32) << 30)
+        | ((inst.s as u32) << 29)
+        | (FAMILY << 25)
+        | (1 << 24)
+        | (imm << 10)
+        | ((inst.rn as u32) << 5)
+        | inst.rd as u32
+    }
+}
+
+impl From<DataImm> for [u8; 4] {
+    /// Convert an instruction into a 4 byte array.
+    fn from(inst: DataImm) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_add() {
+        let inst = DataImm::add(0, 1, 7.try_into().unwrap(), 64);
+        let result: u32 = inst.into();
+        assert_eq!(0x91001c20, result);
+    }
+
+    #[test]
+    fn test_adds() {
+        let inst = DataImm::adds(0, 1, 7.try_into().unwrap(), 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xb1001c20, result);
+    }
+
+    #[test]
+    fn test_cmp() {
+        let inst = DataImm::cmp(0, 7.try_into().unwrap(), 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xf1001c1f, result);
+    }
+
+    #[test]
+    fn test_sub() {
+        let inst = DataImm::sub(0, 1, 7.try_into().unwrap(), 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xd1001c20, result);
+    }
+
+    #[test]
+    fn test_subs() {
+        let inst = DataImm::subs(0, 1, 7.try_into().unwrap(), 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xf1001c20, result);
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/data_reg.rs b/yjit/src/asm/arm64/inst/data_reg.rs
new file mode 100644
index 0000000000..a742121f1f
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/data_reg.rs
@@ -0,0 +1,192 @@
+use super::super::arg::{Sf, truncate_uimm};
+
+/// The operation being performed by this instruction.
+enum Op {
+    Add = 0b0,
+    Sub = 0b1
+}
+
+/// Whether or not to update the flags when this instruction is performed.
+enum S {
+    LeaveFlags = 0b0,
+    UpdateFlags = 0b1
+}
+
+/// The type of shift to perform on the second operand register.
+enum Shift {
+    LSL = 0b00, // logical shift left (unsigned)
+    LSR = 0b01, // logical shift right (unsigned)
+    ASR = 0b10  // arithmetic shift right (signed)
+}
+
+/// The struct that represents an A64 data processing -- register instruction
+/// that can be encoded.
+///
+/// Add/subtract (shifted register)
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |           0    1  0  1  1          0                                                                          |
+/// | sf op  S                    shift    rm..............   imm6............... rn.............. rd.............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct DataReg {
+    /// The register number of the destination register.
+    rd: u8,
+
+    /// The register number of the first operand register.
+    rn: u8,
+
+    /// The amount to shift the second operand register by.
+    imm6: u8,
+
+    /// The register number of the second operand register.
+    rm: u8,
+
+    /// The type of shift to perform on the second operand register.
+    shift: Shift,
+
+    /// Whether or not to update the flags when this instruction is performed.
+    s: S,
+
+    /// The opcode for this instruction.
+    op: Op,
+
+    /// Whether or not this instruction is operating on 64-bit operands.
+    sf: Sf
+}
+
+impl DataReg {
+    /// ADD (shifted register)
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADD--shifted-register---Add--shifted-register--?lang=en
+    pub fn add(rd: u8, rn: u8, rm: u8, num_bits: u8) -> Self {
+        Self {
+            rd,
+            rn,
+            imm6: 0,
+            rm,
+            shift: Shift::LSL,
+            s: S::LeaveFlags,
+            op: Op::Add,
+            sf: num_bits.into()
+        }
+    }
+
+    /// ADDS (shifted register, set flags)
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADDS--shifted-register---Add--shifted-register---setting-flags-?lang=en
+    pub fn adds(rd: u8, rn: u8, rm: u8, num_bits: u8) -> Self {
+        Self {
+            rd,
+            rn,
+            imm6: 0,
+            rm,
+            shift: Shift::LSL,
+            s: S::UpdateFlags,
+            op: Op::Add,
+            sf: num_bits.into()
+        }
+    }
+
+    /// CMP (shifted register)
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CMP--shifted-register---Compare--shifted-register---an-alias-of-SUBS--shifted-register--?lang=en
+    pub fn cmp(rn: u8, rm: u8, num_bits: u8) -> Self {
+        Self::subs(31, rn, rm, num_bits)
+    }
+
+    /// SUB (shifted register)
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/SUB--shifted-register---Subtract--shifted-register--?lang=en
+    pub fn sub(rd: u8, rn: u8, rm: u8, num_bits: u8) -> Self {
+        Self {
+            rd,
+            rn,
+            imm6: 0,
+            rm,
+            shift: Shift::LSL,
+            s: S::LeaveFlags,
+            op: Op::Sub,
+            sf: num_bits.into()
+        }
+    }
+
+    /// SUBS (shifted register, set flags)
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/SUBS--shifted-register---Subtract--shifted-register---setting-flags-?lang=en
+    pub fn subs(rd: u8, rn: u8, rm: u8, num_bits: u8) -> Self {
+        Self {
+            rd,
+            rn,
+            imm6: 0,
+            rm,
+            shift: Shift::LSL,
+            s: S::UpdateFlags,
+            op: Op::Sub,
+            sf: num_bits.into()
+        }
+    }
+}
+
+/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Register?lang=en
+const FAMILY: u32 = 0b0101;
+
+impl From<DataReg> for u32 {
+    /// Convert an instruction into a 32-bit value.
+    fn from(inst: DataReg) -> Self {
+        0
+        | ((inst.sf as u32) << 31)
+        | ((inst.op as u32) << 30)
+        | ((inst.s as u32) << 29)
+        | (FAMILY << 25)
+        | (1 << 24)
+        | ((inst.shift as u32) << 22)
+        | ((inst.rm as u32) << 16)
+        | (truncate_uimm::<_, 6>(inst.imm6) << 10)
+        | ((inst.rn as u32) << 5)
+        | inst.rd as u32
+    }
+}
+
+impl From<DataReg> for [u8; 4] {
+    /// Convert an instruction into a 4 byte array.
+    fn from(inst: DataReg) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_add() {
+        let inst = DataReg::add(0, 1, 2, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0x8b020020, result);
+    }
+
+    #[test]
+    fn test_adds() {
+        let inst = DataReg::adds(0, 1, 2, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xab020020, result);
+    }
+
+    #[test]
+    fn test_cmp() {
+        let inst = DataReg::cmp(0, 1, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xeb01001f, result);
+    }
+
+    #[test]
+    fn test_sub() {
+        let inst = DataReg::sub(0, 1, 2, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xcb020020, result);
+    }
+
+    #[test]
+    fn test_subs() {
+        let inst = DataReg::subs(0, 1, 2, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xeb020020, result);
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/halfword_imm.rs b/yjit/src/asm/arm64/inst/halfword_imm.rs
new file mode 100644
index 0000000000..0ddae8e8de
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/halfword_imm.rs
@@ -0,0 +1,179 @@
+use super::super::arg::truncate_imm;
+
+/// Whether this is a load or a store.
+enum Op {
+    Load = 1,
+    Store = 0
+}
+
+/// The type of indexing to perform for this instruction.
+enum Index {
+    /// No indexing.
+    None = 0b00,
+
+    /// Mutate the register after the load or store.
+    PostIndex = 0b01,
+
+    /// Mutate the register before the load or store.
+    PreIndex = 0b11
+}
+
+/// The struct that represents an A64 halfword instruction that can be encoded.
+///
+/// LDRH/STRH
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |  0  1  1  1    1  0  0  1    0                                                                                |
+/// |                                op imm12.................................... rn.............. rt.............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+/// LDRH/STRH (pre-index/post-index)
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |  0  1  1  1    1  0  0  0    0     0                                                                          |
+/// |                                op    imm9..........................   index rn.............. rt.............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct HalfwordImm {
+    /// The number of the 32-bit register to be loaded or stored.
+    rt: u8,
+
+    /// The number of the 64-bit base register to calculate the memory address.
+    rn: u8,
+
+    /// The type of indexing to perform for this instruction.
+    index: Index,
+
+    /// The immediate byte offset from the base register.
+    imm: i16,
+
+    /// The operation to perform.
+    op: Op
+}
+
+impl HalfwordImm {
+    /// LDRH
+    /// https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate--
+    pub fn ldrh(rt: u8, rn: u8, imm12: i16) -> Self {
+        Self { rt, rn, index: Index::None, imm: imm12, op: Op::Load }
+    }
+
+    /// LDRH (pre-index)
+    /// https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate--
+    pub fn ldrh_pre(rt: u8, rn: u8, imm9: i16) -> Self {
+        Self { rt, rn, index: Index::PreIndex, imm: imm9, op: Op::Load }
+    }
+
+    /// LDRH (post-index)
+    /// https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/LDRH--immediate---Load-Register-Halfword--immediate--
+    pub fn ldrh_post(rt: u8, rn: u8, imm9: i16) -> Self {
+        Self { rt, rn, index: Index::PostIndex, imm: imm9, op: Op::Load }
+    }
+
+    /// STRH
+    /// https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/STRH--immediate---Store-Register-Halfword--immediate--
+    pub fn strh(rt: u8, rn: u8, imm12: i16) -> Self {
+        Self { rt, rn, index: Index::None, imm: imm12, op: Op::Store }
+    }
+
+    /// STRH (pre-index)
+    /// https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/STRH--immediate---Store-Register-Halfword--immediate--
+    pub fn strh_pre(rt: u8, rn: u8, imm9: i16) -> Self {
+        Self { rt, rn, index: Index::PreIndex, imm: imm9, op: Op::Store }
+    }
+
+    /// STRH (post-index)
+    /// https://developer.arm.com/documentation/ddi0602/2022-06/Base-Instructions/STRH--immediate---Store-Register-Halfword--immediate--
+    pub fn strh_post(rt: u8, rn: u8, imm9: i16) -> Self {
+        Self { rt, rn, index: Index::PostIndex, imm: imm9, op: Op::Store }
+    }
+}
+
+/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Loads-and-Stores?lang=en
+const FAMILY: u32 = 0b111100;
+
+impl From<HalfwordImm> for u32 {
+    /// Convert an instruction into a 32-bit value. NOTE(review): `imm` is an i16 and the unindexed path only checks evenness here — confirm truncate_imm rejects negative offsets.
+    fn from(inst: HalfwordImm) -> Self {
+        let (opc, imm) = match inst.index {
+            Index::None => {
+                assert_eq!(inst.imm & 1, 0, "immediate offset must be even");
+                let imm12 = truncate_imm::<_, 12>(inst.imm / 2);
+                (0b100, imm12)
+            },
+            Index::PreIndex | Index::PostIndex => {
+                let imm9 = truncate_imm::<_, 9>(inst.imm);
+                (0b000, (imm9 << 2) | (inst.index as u32))
+            }
+        };
+
+        0
+        | (FAMILY << 25)
+        | ((opc | (inst.op as u32)) << 22)
+        | (imm << 10)
+        | ((inst.rn as u32) << 5)
+        | (inst.rt as u32)
+    }
+}
+
+impl From<HalfwordImm> for [u8; 4] {
+    /// Convert an instruction into a 4 byte array.
+    fn from(inst: HalfwordImm) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_ldrh() {
+        let inst = HalfwordImm::ldrh(0, 1, 8);
+        let result: u32 = inst.into();
+        assert_eq!(0x79401020, result);
+    }
+
+    #[test]
+    fn test_ldrh_pre() {
+        let inst = HalfwordImm::ldrh_pre(0, 1, 16);
+        let result: u32 = inst.into();
+        assert_eq!(0x78410c20, result);
+    }
+
+    #[test]
+    fn test_ldrh_post() {
+        let inst = HalfwordImm::ldrh_post(0, 1, 24);
+        let result: u32 = inst.into();
+        assert_eq!(0x78418420, result);
+    }
+
+    #[test]
+    fn test_ldrh_post_negative() {
+        let inst = HalfwordImm::ldrh_post(0, 1, -24);
+        let result: u32 = inst.into();
+        assert_eq!(0x785e8420, result);
+    }
+
+    #[test]
+    fn test_strh() {
+        let inst = HalfwordImm::strh(0, 1, 0);
+        let result: u32 = inst.into();
+        assert_eq!(0x79000020, result);
+    }
+
+    #[test]
+    fn test_strh_pre() {
+        let inst = HalfwordImm::strh_pre(0, 1, 0);
+        let result: u32 = inst.into();
+        assert_eq!(0x78000c20, result);
+    }
+
+    #[test]
+    fn test_strh_post() {
+        let inst = HalfwordImm::strh_post(0, 1, 0);
+        let result: u32 = inst.into();
+        assert_eq!(0x78000420, result);
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/load_literal.rs b/yjit/src/asm/arm64/inst/load_literal.rs
new file mode 100644
index 0000000000..3eade205c8
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/load_literal.rs
@@ -0,0 +1,89 @@
+use super::super::arg::{InstructionOffset, truncate_imm};
+
+/// The size of the operands being operated on.
+enum Opc {
+    Size32 = 0b00,
+    Size64 = 0b01,
+}
+
+/// A convenience function so that we can convert the number of bits of a
+/// register operand directly into an Opc enum variant.
+impl From<u8> for Opc {
+    fn from(num_bits: u8) -> Self {
+        match num_bits {
+            64 => Opc::Size64,
+            32 => Opc::Size32,
+            _ => panic!("Invalid number of bits: {}", num_bits)
+        }
+    }
+}
+
+/// The struct that represents an A64 load literal instruction that can be encoded.
+///
+/// LDR
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |        0  1    1  0  0  0                                                                                     |
+/// | opc..                       imm19........................................................... rt.............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct LoadLiteral {
+    /// The number of the register to load the value into.
+    rt: u8,
+
+    /// The PC-relative number of instructions to load the value from.
+    offset: InstructionOffset,
+
+    /// The size of the operands being operated on.
+    opc: Opc
+}
+
+impl LoadLiteral {
+    /// LDR (load literal)
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--literal---Load-Register--literal--?lang=en
+    pub fn ldr_literal(rt: u8, offset: InstructionOffset, num_bits: u8) -> Self {
+        Self { rt, offset, opc: num_bits.into() }
+    }
+}
+
+/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Loads-and-Stores?lang=en
+const FAMILY: u32 = 0b0100;
+
+impl From<LoadLiteral> for u32 {
+    /// Convert an instruction into a 32-bit value.
+    fn from(inst: LoadLiteral) -> Self {
+        0
+        | ((inst.opc as u32) << 30)
+        | (1 << 28)
+        | (FAMILY << 25)
+        | (truncate_imm::<_, 19>(inst.offset) << 5)
+        | (inst.rt as u32)
+    }
+}
+
+impl From<LoadLiteral> for [u8; 4] {
+    /// Convert an instruction into a 4 byte array.
+    fn from(inst: LoadLiteral) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_ldr_positive() {
+        let inst = LoadLiteral::ldr_literal(0, 5.into(), 64);
+        let result: u32 = inst.into();
+        assert_eq!(0x580000a0, result);
+    }
+
+    #[test]
+    fn test_ldr_negative() {
+        let inst = LoadLiteral::ldr_literal(0, (-5).into(), 64);
+        let result: u32 = inst.into();
+        assert_eq!(0x58ffff60, result);
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/load_register.rs b/yjit/src/asm/arm64/inst/load_register.rs
new file mode 100644
index 0000000000..3426b9ba5f
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/load_register.rs
@@ -0,0 +1,108 @@
+/// Whether or not to shift the register.
+enum S {
+    Shift = 1,
+    NoShift = 0
+}
+
+/// The option for this instruction.
+enum Option {
+    UXTW = 0b010,
+    LSL = 0b011,
+    SXTW = 0b110,
+    SXTX = 0b111
+}
+
+/// The size of the operands of this instruction.
+enum Size {
+    Size32 = 0b10,
+    Size64 = 0b11
+}
+
+/// A convenience function so that we can convert the number of bits of a
+/// register operand directly into a Size enum variant.
+impl From<u8> for Size {
+    fn from(num_bits: u8) -> Self {
+        match num_bits {
+            64 => Size::Size64,
+            32 => Size::Size32,
+            _ => panic!("Invalid number of bits: {}", num_bits)
+        }
+    }
+}
+
+/// The struct that represents an A64 load instruction that can be encoded.
+///
+/// LDR
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |        1  1    1  0  0  0    0  1  1                                   1  0                                   |
+/// | size.                                rm..............   option.. S          rn.............. rt.............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct LoadRegister {
+    /// The number of the register to load the value into.
+    rt: u8,
+
+    /// The base register with which to form the address.
+    rn: u8,
+
+    /// Whether or not to shift the value of the register.
+    s: S,
+
+    /// The option associated with this instruction that controls the shift.
+    option: Option,
+
+    /// The number of the offset register.
+    rm: u8,
+
+    /// The size of the operands.
+    size: Size
+}
+
+impl LoadRegister {
+    /// LDR
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--register---Load-Register--register--?lang=en
+    pub fn ldr(rt: u8, rn: u8, rm: u8, num_bits: u8) -> Self {
+        Self { rt, rn, s: S::NoShift, option: Option::LSL, rm, size: num_bits.into() }
+    }
+}
+
+/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Loads-and-Stores?lang=en
+const FAMILY: u32 = 0b0100;
+
+impl From<LoadRegister> for u32 {
+    /// Convert an instruction into a 32-bit value.
+    fn from(inst: LoadRegister) -> Self {
+        0
+        | ((inst.size as u32) << 30)
+        | (0b11 << 28)
+        | (FAMILY << 25)
+        | (0b11 << 21)
+        | ((inst.rm as u32) << 16)
+        | ((inst.option as u32) << 13)
+        | ((inst.s as u32) << 12)
+        | (0b10 << 10)
+        | ((inst.rn as u32) << 5)
+        | (inst.rt as u32)
+    }
+}
+
+impl From<LoadRegister> for [u8; 4] {
+    /// Convert an instruction into a 4 byte array.
+    fn from(inst: LoadRegister) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_ldr() {
+        let inst = LoadRegister::ldr(0, 1, 2, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xf8626820, result);
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/load_store.rs b/yjit/src/asm/arm64/inst/load_store.rs
new file mode 100644
index 0000000000..b5c8a3c294
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/load_store.rs
@@ -0,0 +1,249 @@
+use super::super::arg::truncate_imm;
+
+/// The size of the operands being operated on; each discriminant is the value of the 2-bit `size` field.
+enum Size {
+    Size8 = 0b00,
+    Size16 = 0b01,
+    Size32 = 0b10,
+    Size64 = 0b11,
+}
+
+/// A convenience function so that we can convert the number of bits of a
+/// register operand directly into a Size enum variant. Only 32 and 64 are accepted; any other value panics.
+impl From<u8> for Size {
+    fn from(num_bits: u8) -> Self {
+        match num_bits {
+            64 => Size::Size64,
+            32 => Size::Size32,
+            _ => panic!("Invalid number of bits: {}", num_bits)
+        }
+    }
+}
+
+/// The operation to perform for this instruction (the 2-bit `opc` field, placed at bits 23..22).
+enum Opc {
+    STR = 0b00,
+    LDR = 0b01,
+    LDURSW = 0b10
+}
+
+/// What kind of indexing to perform for this instruction (the 2-bit `idx` field, placed at bits 11..10).
+enum Index {
+    None = 0b00,
+    PostIndex = 0b01,
+    PreIndex = 0b11
+}
+
+/// The struct that represents an A64 load or store instruction that can be
+/// encoded.
+///
+/// LDR/LDUR/LDURB/LDURH/LDURSW/STR/STUR/STURH
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |        1  1    1  0  0  0          0                                                                          |
+/// | size.                       opc..    imm9..........................   idx.. rn.............. rt.............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct LoadStore {
+    /// The number of the register to load the value into (loads) or to store the value from (stores).
+    rt: u8,
+
+    /// The base register with which to form the address.
+    rn: u8,
+
+    /// What kind of indexing to perform for this instruction.
+    idx: Index,
+
+    /// The optional signed immediate byte offset from the base register, encoded in a 9-bit field.
+    imm9: i16,
+
+    /// The operation to perform for this instruction.
+    opc: Opc,
+
+    /// The size of the operands being operated on.
+    size: Size
+}
+
+impl LoadStore {
+    /// LDR (immediate, post-index); `num_bits` must be 32 or 64, otherwise the size conversion panics.
+    /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LDR--immediate---Load-Register--immediate--
+    pub fn ldr_post(rt: u8, rn: u8, imm9: i16, num_bits: u8) -> Self {
+        Self { rt, rn, idx: Index::PostIndex, imm9, opc: Opc::LDR, size: num_bits.into() }
+    }
+
+    /// LDR (immediate, pre-index); `num_bits` must be 32 or 64, otherwise the size conversion panics.
+    /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LDR--immediate---Load-Register--immediate--
+    pub fn ldr_pre(rt: u8, rn: u8, imm9: i16, num_bits: u8) -> Self {
+        Self { rt, rn, idx: Index::PreIndex, imm9, opc: Opc::LDR, size: num_bits.into() }
+    }
+
+    /// LDUR (load register, unscaled); `num_bits` must be 32 or 64, otherwise the size conversion panics.
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDUR--Load-Register--unscaled--?lang=en
+    pub fn ldur(rt: u8, rn: u8, imm9: i16, num_bits: u8) -> Self {
+        Self { rt, rn, idx: Index::None, imm9, opc: Opc::LDR, size: num_bits.into() }
+    }
+
+    /// LDURH (load register, halfword, unscaled)
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDURH--Load-Register-Halfword--unscaled--?lang=en
+    pub fn ldurh(rt: u8, rn: u8, imm9: i16) -> Self {
+        Self { rt, rn, idx: Index::None, imm9, opc: Opc::LDR, size: Size::Size16 }
+    }
+
+    /// LDURB (load register, byte, unscaled)
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDURB--Load-Register-Byte--unscaled--?lang=en
+    pub fn ldurb(rt: u8, rn: u8, imm9: i16) -> Self {
+        Self { rt, rn, idx: Index::None, imm9, opc: Opc::LDR, size: Size::Size8 }
+    }
+
+    /// LDURSW (load register, signed word, unscaled)
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDURSW--Load-Register-Signed-Word--unscaled--?lang=en
+    pub fn ldursw(rt: u8, rn: u8, imm9: i16) -> Self {
+        Self { rt, rn, idx: Index::None, imm9, opc: Opc::LDURSW, size: Size::Size32 }
+    }
+
+    /// STR (immediate, post-index); `num_bits` must be 32 or 64, otherwise the size conversion panics.
+    /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/STR--immediate---Store-Register--immediate--
+    pub fn str_post(rt: u8, rn: u8, imm9: i16, num_bits: u8) -> Self {
+        Self { rt, rn, idx: Index::PostIndex, imm9, opc: Opc::STR, size: num_bits.into() }
+    }
+
+    /// STR (immediate, pre-index); `num_bits` must be 32 or 64, otherwise the size conversion panics.
+    /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/STR--immediate---Store-Register--immediate--
+    pub fn str_pre(rt: u8, rn: u8, imm9: i16, num_bits: u8) -> Self {
+        Self { rt, rn, idx: Index::PreIndex, imm9, opc: Opc::STR, size: num_bits.into() }
+    }
+
+    /// STUR (store register, unscaled); `num_bits` must be 32 or 64, otherwise the size conversion panics.
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STUR--Store-Register--unscaled--?lang=en
+    pub fn stur(rt: u8, rn: u8, imm9: i16, num_bits: u8) -> Self {
+        Self { rt, rn, idx: Index::None, imm9, opc: Opc::STR, size: num_bits.into() }
+    }
+
+    /// STURH (store register, halfword, unscaled)
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STURH--Store-Register-Halfword--unscaled--?lang=en
+    pub fn sturh(rt: u8, rn: u8, imm9: i16) -> Self {
+        Self { rt, rn, idx: Index::None, imm9, opc: Opc::STR, size: Size::Size16 }
+    }
+}
+
+/// Encoding-group bits for loads and stores, OR-ed in at bit 25: https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Loads-and-Stores?lang=en
+const FAMILY: u32 = 0b0100;
+
+impl From<LoadStore> for u32 {
+    /// Encode the instruction as a 32-bit value by OR-ing each field and fixed bit pattern into its bit position.
+    fn from(inst: LoadStore) -> Self {
+        0
+        | ((inst.size as u32) << 30)
+        | (0b11 << 28)
+        | (FAMILY << 25)
+        | ((inst.opc as u32) << 22)
+        | (truncate_imm::<_, 9>(inst.imm9) << 12)
+        | ((inst.idx as u32) << 10)
+        | ((inst.rn as u32) << 5)
+        | (inst.rt as u32)
+    }
+}
+
+impl From<LoadStore> for [u8; 4] {
+    /// Encode the instruction as its four bytes in little-endian order.
+    fn from(inst: LoadStore) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_ldr_post() {
+        let inst = LoadStore::ldr_post(0, 1, 16, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xf8410420, result); // ldr x0, [x1], #16
+    }
+
+    #[test]
+    fn test_ldr_pre() {
+        let inst = LoadStore::ldr_pre(0, 1, 16, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xf8410c20, result); // ldr x0, [x1, #16]!
+    }
+
+    #[test]
+    fn test_ldur() {
+        let inst = LoadStore::ldur(0, 1, 0, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xf8400020, result); // ldur x0, [x1]
+    }
+
+    #[test]
+    fn test_ldurb() {
+        let inst = LoadStore::ldurb(0, 1, 0);
+        let result: u32 = inst.into();
+        assert_eq!(0x38400020, result); // ldurb w0, [x1]
+    }
+
+    #[test]
+    fn test_ldurh() {
+        let inst = LoadStore::ldurh(0, 1, 0);
+        let result: u32 = inst.into();
+        assert_eq!(0x78400020, result); // ldurh w0, [x1]
+    }
+
+    #[test]
+    fn test_ldur_with_imm() {
+        let inst = LoadStore::ldur(0, 1, 123, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xf847b020, result); // ldur x0, [x1, #123]
+    }
+
+    #[test]
+    fn test_ldursw() {
+        let inst = LoadStore::ldursw(0, 1, 0);
+        let result: u32 = inst.into();
+        assert_eq!(0xb8800020, result); // ldursw x0, [x1]
+    }
+
+    #[test]
+    fn test_ldursw_with_imm() {
+        let inst = LoadStore::ldursw(0, 1, 123);
+        let result: u32 = inst.into();
+        assert_eq!(0xb887b020, result); // ldursw x0, [x1, #123]
+    }
+
+    #[test]
+    fn test_str_post() {
+        let inst = LoadStore::str_post(0, 1, -16, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xf81f0420, result); // str x0, [x1], #-16
+    }
+
+    #[test]
+    fn test_str_pre() {
+        let inst = LoadStore::str_pre(0, 1, -16, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xf81f0c20, result); // str x0, [x1, #-16]!
+    }
+
+    #[test]
+    fn test_stur() {
+        let inst = LoadStore::stur(0, 1, 0, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xf8000020, result); // stur x0, [x1]
+    }
+
+    #[test]
+    fn test_stur_negative_offset() {
+        let inst = LoadStore::stur(0, 1, -1, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xf81ff020, result); // stur x0, [x1, #-1]
+    }
+
+    #[test]
+    fn test_stur_positive_offset() {
+        let inst = LoadStore::stur(0, 1, 255, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xf80ff020, result); // stur x0, [x1, #255]
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/load_store_exclusive.rs b/yjit/src/asm/arm64/inst/load_store_exclusive.rs
new file mode 100644
index 0000000000..8216c2200a
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/load_store_exclusive.rs
@@ -0,0 +1,109 @@
+/// The operation being performed for this instruction (the single `op` bit, placed at bit 22).
+enum Op {
+    Store = 0,
+    Load = 1
+}
+
+/// The size of the registers being operated on; each discriminant is the value of the 2-bit `size` field.
+enum Size {
+    Size32 = 0b10,
+    Size64 = 0b11
+}
+
+/// A convenience function so that we can convert the number of bits of a
+/// register operand directly into a Size enum variant. Only 32 and 64 are accepted; any other value panics.
+impl From<u8> for Size {
+    fn from(num_bits: u8) -> Self {
+        match num_bits {
+            64 => Size::Size64,
+            32 => Size::Size32,
+            _ => panic!("Invalid number of bits: {}", num_bits)
+        }
+    }
+}
+
+/// The struct that represents an A64 load or store exclusive instruction that
+/// can be encoded.
+///
+/// LDAXR/STLXR
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |  1     0  0    1  0  0  0    0     0                     1  1  1  1    1  1                                   |
+/// | size.                          op    rs..............                       rn.............. rt.............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct LoadStoreExclusive {
+    /// The number of the register to be loaded (LDAXR) or stored (STLXR).
+    rt: u8,
+
+    /// The base register with which to form the address.
+    rn: u8,
+
+    /// The register to be used for the status result if it applies to this
+    /// operation. Otherwise it's the zero register.
+    rs: u8,
+
+    /// The operation being performed for this instruction.
+    op: Op,
+
+    /// The size of the registers being operated on.
+    size: Size
+}
+
+impl LoadStoreExclusive {
+    /// LDAXR (load-acquire exclusive register); `rs` is fixed to 31 because a load has no status result.
+    /// https://developer.arm.com/documentation/ddi0602/2021-12/Base-Instructions/LDAXR--Load-Acquire-Exclusive-Register-
+    pub fn ldaxr(rt: u8, rn: u8, num_bits: u8) -> Self {
+        Self { rt, rn, rs: 31, op: Op::Load, size: num_bits.into() }
+    }
+
+    /// STLXR (store-release exclusive register); `rs` is the register that receives the status result.
+    /// https://developer.arm.com/documentation/ddi0602/2021-12/Base-Instructions/STLXR--Store-Release-Exclusive-Register-
+    pub fn stlxr(rs: u8, rt: u8, rn: u8, num_bits: u8) -> Self {
+        Self { rt, rn, rs, op: Op::Store, size: num_bits.into() }
+    }
+}
+
+/// Encoding-group bits for loads and stores, OR-ed in at bit 25: https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Loads-and-Stores?lang=en
+const FAMILY: u32 = 0b0100;
+
+impl From<LoadStoreExclusive> for u32 {
+    /// Encode the instruction as a 32-bit value by OR-ing each field and fixed bit pattern into its bit position.
+    fn from(inst: LoadStoreExclusive) -> Self {
+        0
+        | ((inst.size as u32) << 30)
+        | (FAMILY << 25)
+        | ((inst.op as u32) << 22)
+        | ((inst.rs as u32) << 16)
+        | (0b111111 << 10)
+        | ((inst.rn as u32) << 5)
+        | (inst.rt as u32)
+    }
+}
+
+impl From<LoadStoreExclusive> for [u8; 4] {
+    /// Encode the instruction as its four bytes in little-endian order.
+    fn from(inst: LoadStoreExclusive) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_ldaxr() {
+        let inst = LoadStoreExclusive::ldaxr(16, 0, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xc85ffc10, result); // ldaxr x16, [x0]
+    }
+
+    #[test]
+    fn test_stlxr() {
+        let inst = LoadStoreExclusive::stlxr(17, 16, 0, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xc811fc10, result); // stlxr w17, x16, [x0]
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/logical_imm.rs b/yjit/src/asm/arm64/inst/logical_imm.rs
new file mode 100644
index 0000000000..b24916f8a5
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/logical_imm.rs
@@ -0,0 +1,154 @@
+use super::super::arg::{BitmaskImmediate, Sf};
+
+/// Which operation to perform (the 2-bit `opc` field, placed at bits 30..29).
+enum Opc {
+    /// The AND operation.
+    And = 0b00,
+
+    /// The ORR operation.
+    Orr = 0b01,
+
+    /// The EOR operation.
+    Eor = 0b10,
+
+    /// The ANDS operation.
+    Ands = 0b11
+}
+
+/// The struct that represents an A64 bitwise immediate instruction that can be
+/// encoded.
+///
+/// AND/ANDS/EOR/ORR (immediate), plus the MOV and TST aliases
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |           1    0  0  1  0    0                                                                                |
+/// | sf opc..                       N  immr...............   imms............... rn.............. rd.............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct LogicalImm {
+    /// The register number of the destination register.
+    rd: u8,
+
+    /// The register number of the first operand register.
+    rn: u8,
+
+    /// The bitmask immediate used as the second operand.
+    imm: BitmaskImmediate,
+
+    /// The opcode for this instruction.
+    opc: Opc,
+
+    /// Whether or not this instruction is operating on 64-bit operands.
+    sf: Sf
+}
+
+impl LogicalImm {
+    /// AND (bitmask immediate)
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/AND--immediate---Bitwise-AND--immediate--?lang=en
+    pub fn and(rd: u8, rn: u8, imm: BitmaskImmediate, num_bits: u8) -> Self {
+        Self { rd, rn, imm, opc: Opc::And, sf: num_bits.into() }
+    }
+
+    /// ANDS (bitmask immediate), the flag-setting form of AND
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ANDS--immediate---Bitwise-AND--immediate---setting-flags-?lang=en
+    pub fn ands(rd: u8, rn: u8, imm: BitmaskImmediate, num_bits: u8) -> Self {
+        Self { rd, rn, imm, opc: Opc::Ands, sf: num_bits.into() }
+    }
+
+    /// EOR (bitmask immediate)
+    /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/EOR--immediate---Bitwise-Exclusive-OR--immediate--
+    pub fn eor(rd: u8, rn: u8, imm: BitmaskImmediate, num_bits: u8) -> Self {
+        Self { rd, rn, imm, opc: Opc::Eor, sf: num_bits.into() }
+    }
+
+    /// MOV (bitmask immediate), encoded as ORR with `rn` fixed to register number 31
+    /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/MOV--bitmask-immediate---Move--bitmask-immediate---an-alias-of-ORR--immediate--?lang=en
+    pub fn mov(rd: u8, imm: BitmaskImmediate, num_bits: u8) -> Self {
+        Self { rd, rn: 0b11111, imm, opc: Opc::Orr, sf: num_bits.into() }
+    }
+
+    /// ORR (bitmask immediate)
+    /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/ORR--immediate---Bitwise-OR--immediate--
+    pub fn orr(rd: u8, rn: u8, imm: BitmaskImmediate, num_bits: u8) -> Self {
+        Self { rd, rn, imm, opc: Opc::Orr, sf: num_bits.into() }
+    }
+
+    /// TST (bitmask immediate), encoded as ANDS with `rd` fixed to register number 31
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/TST--immediate---Test-bits--immediate---an-alias-of-ANDS--immediate--?lang=en
+    pub fn tst(rn: u8, imm: BitmaskImmediate, num_bits: u8) -> Self {
+        Self::ands(31, rn, imm, num_bits)
+    }
+}
+
+/// Encoding-group bits for data processing (immediate), OR-ed in at bit 25: https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Immediate?lang=en#log_imm
+const FAMILY: u32 = 0b1001;
+
+impl From<LogicalImm> for u32 {
+    /// Encode the instruction as a 32-bit value; the encoded bitmask immediate is placed starting at bit 10.
+    fn from(inst: LogicalImm) -> Self {
+        let imm: u32 = inst.imm.encode();
+
+        0
+        | ((inst.sf as u32) << 31)
+        | ((inst.opc as u32) << 29)
+        | (FAMILY << 25)
+        | (imm << 10)
+        | ((inst.rn as u32) << 5)
+        | inst.rd as u32
+    }
+}
+
+impl From<LogicalImm> for [u8; 4] {
+    /// Encode the instruction as its four bytes in little-endian order.
+    fn from(inst: LogicalImm) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_and() {
+        let inst = LogicalImm::and(0, 1, 7.try_into().unwrap(), 64);
+        let result: u32 = inst.into();
+        assert_eq!(0x92400820, result); // and x0, x1, #0x7
+    }
+
+    #[test]
+    fn test_ands() {
+        let inst = LogicalImm::ands(0, 1, 7.try_into().unwrap(), 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xf2400820, result); // ands x0, x1, #0x7
+    }
+
+    #[test]
+    fn test_eor() {
+        let inst = LogicalImm::eor(0, 1, 7.try_into().unwrap(), 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xd2400820, result); // eor x0, x1, #0x7
+    }
+
+    #[test]
+    fn test_mov() {
+        let inst = LogicalImm::mov(0, 0x5555555555555555.try_into().unwrap(), 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xb200f3e0, result); // mov x0, #0x5555555555555555
+    }
+
+    #[test]
+    fn test_orr() {
+        let inst = LogicalImm::orr(0, 1, 7.try_into().unwrap(), 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xb2400820, result); // orr x0, x1, #0x7
+    }
+
+    #[test]
+    fn test_tst() {
+        let inst = LogicalImm::tst(1, 7.try_into().unwrap(), 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xf240083f, result); // tst x1, #0x7
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/logical_reg.rs b/yjit/src/asm/arm64/inst/logical_reg.rs
new file mode 100644
index 0000000000..a96805c9f9
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/logical_reg.rs
@@ -0,0 +1,207 @@
+use super::super::arg::{Sf, truncate_uimm};
+
+/// Whether or not this is a NOT instruction (the single `N` bit, placed at bit 21).
+enum N {
+    /// This is not a NOT instruction.
+    No = 0,
+
+    /// This is a NOT instruction.
+    Yes = 1
+}
+
+/// The type of shift to perform on the second operand register (the 2-bit `shift` field, placed at bits 23..22).
+enum Shift {
+    LSL = 0b00, // logical shift left (unsigned)
+    LSR = 0b01, // logical shift right (unsigned)
+    ASR = 0b10, // arithmetic shift right (signed)
+    ROR = 0b11  // rotate right (unsigned)
+}
+
+/// Which operation to perform (the 2-bit `opc` field, placed at bits 30..29).
+enum Opc {
+    /// The AND operation.
+    And = 0b00,
+
+    /// The ORR operation.
+    Orr = 0b01,
+
+    /// The EOR operation.
+    Eor = 0b10,
+
+    /// The ANDS operation.
+    Ands = 0b11
+}
+
+/// The struct that represents an A64 logical register instruction that can be
+/// encoded.
+///
+/// AND/ANDS/EOR/ORN/ORR (shifted register), plus the MOV, MVN and TST aliases
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |           0    1  0  1  0                                                                                     |
+/// | sf opc..                    shift N  rm..............   imm6............... rn.............. rd.............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct LogicalReg {
+    /// The register number of the destination register.
+    rd: u8,
+
+    /// The register number of the first operand register.
+    rn: u8,
+
+    /// The amount to shift the second operand register (every constructor in this file passes 0).
+    imm6: u8,
+
+    /// The register number of the second operand register.
+    rm: u8,
+
+    /// Whether or not this is a NOT instruction.
+    n: N,
+
+    /// The type of shift to perform on the second operand register.
+    shift: Shift,
+
+    /// The opcode for this instruction.
+    opc: Opc,
+
+    /// Whether or not this instruction is operating on 64-bit operands.
+    sf: Sf
+}
+
+impl LogicalReg {
+    /// AND (shifted register), built with a shift amount of 0
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/AND--shifted-register---Bitwise-AND--shifted-register--?lang=en
+    pub fn and(rd: u8, rn: u8, rm: u8, num_bits: u8) -> Self {
+        Self { rd, rn, imm6: 0, rm, n: N::No, shift: Shift::LSL, opc: Opc::And, sf: num_bits.into() }
+    }
+
+    /// ANDS (shifted register), the flag-setting form of AND, built with a shift amount of 0
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ANDS--shifted-register---Bitwise-AND--shifted-register---setting-flags-?lang=en
+    pub fn ands(rd: u8, rn: u8, rm: u8, num_bits: u8) -> Self {
+        Self { rd, rn, imm6: 0, rm, n: N::No, shift: Shift::LSL, opc: Opc::Ands, sf: num_bits.into() }
+    }
+
+    /// EOR (shifted register), built with a shift amount of 0
+    /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/EOR--shifted-register---Bitwise-Exclusive-OR--shifted-register--
+    pub fn eor(rd: u8, rn: u8, rm: u8, num_bits: u8) -> Self {
+        Self { rd, rn, imm6: 0, rm, n: N::No, shift: Shift::LSL, opc: Opc::Eor, sf: num_bits.into() }
+    }
+
+    /// MOV (register), encoded as ORR with `rn` fixed to register number 31
+    /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/MOV--register---Move--register---an-alias-of-ORR--shifted-register--?lang=en
+    pub fn mov(rd: u8, rm: u8, num_bits: u8) -> Self {
+        Self { rd, rn: 0b11111, imm6: 0, rm, n: N::No, shift: Shift::LSL, opc: Opc::Orr, sf: num_bits.into() }
+    }
+
+    /// MVN (shifted register), encoded as ORN with `rn` fixed to register number 31
+    /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/MVN--Bitwise-NOT--an-alias-of-ORN--shifted-register--?lang=en
+    pub fn mvn(rd: u8, rm: u8, num_bits: u8) -> Self {
+        Self { rd, rn: 0b11111, imm6: 0, rm, n: N::Yes, shift: Shift::LSL, opc: Opc::Orr, sf: num_bits.into() }
+    }
+
+    /// ORN (shifted register), encoded as ORR with the `N` bit set
+    /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/ORN--shifted-register---Bitwise-OR-NOT--shifted-register--
+    pub fn orn(rd: u8, rn: u8, rm: u8, num_bits: u8) -> Self {
+        Self { rd, rn, imm6: 0, rm, n: N::Yes, shift: Shift::LSL, opc: Opc::Orr, sf: num_bits.into() }
+    }
+
+    /// ORR (shifted register), built with a shift amount of 0
+    /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/ORR--shifted-register---Bitwise-OR--shifted-register--
+    pub fn orr(rd: u8, rn: u8, rm: u8, num_bits: u8) -> Self {
+        Self { rd, rn, imm6: 0, rm, n: N::No, shift: Shift::LSL, opc: Opc::Orr, sf: num_bits.into() }
+    }
+
+    /// TST (shifted register), encoded as ANDS with `rd` fixed to register number 31
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/TST--shifted-register---Test--shifted-register---an-alias-of-ANDS--shifted-register--?lang=en
+    pub fn tst(rn: u8, rm: u8, num_bits: u8) -> Self {
+        Self { rd: 31, rn, imm6: 0, rm, n: N::No, shift: Shift::LSL, opc: Opc::Ands, sf: num_bits.into() }
+    }
+}
+
+/// Encoding-group bits for data processing (register), OR-ed in at bit 25: https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Register?lang=en
+const FAMILY: u32 = 0b0101;
+
+impl From<LogicalReg> for u32 {
+    /// Encode the instruction as a 32-bit value by OR-ing each field into its bit position.
+    fn from(inst: LogicalReg) -> Self {
+        0
+        | ((inst.sf as u32) << 31)
+        | ((inst.opc as u32) << 29)
+        | (FAMILY << 25)
+        | ((inst.shift as u32) << 22)
+        | ((inst.n as u32) << 21)
+        | ((inst.rm as u32) << 16)
+        | (truncate_uimm::<_, 6>(inst.imm6) << 10)
+        | ((inst.rn as u32) << 5)
+        | inst.rd as u32
+    }
+}
+
+impl From<LogicalReg> for [u8; 4] {
+    /// Encode the instruction as its four bytes in little-endian order.
+    fn from(inst: LogicalReg) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_and() {
+        let inst = LogicalReg::and(0, 1, 2, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0x8a020020, result); // and x0, x1, x2
+    }
+
+    #[test]
+    fn test_ands() {
+        let inst = LogicalReg::ands(0, 1, 2, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xea020020, result); // ands x0, x1, x2
+    }
+
+    #[test]
+    fn test_eor() {
+        let inst = LogicalReg::eor(0, 1, 2, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xca020020, result); // eor x0, x1, x2
+    }
+
+    #[test]
+    fn test_mov() {
+        let inst = LogicalReg::mov(0, 1, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xaa0103e0, result); // mov x0, x1
+    }
+
+    #[test]
+    fn test_mvn() {
+        let inst = LogicalReg::mvn(0, 1, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xaa2103e0, result); // mvn x0, x1
+    }
+
+    #[test]
+    fn test_orn() {
+        let inst = LogicalReg::orn(0, 1, 2, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xaa220020, result); // orn x0, x1, x2
+    }
+
+    #[test]
+    fn test_orr() {
+        let inst = LogicalReg::orr(0, 1, 2, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xaa020020, result); // orr x0, x1, x2
+    }
+
+    #[test]
+    fn test_tst() {
+        let inst = LogicalReg::tst(0, 1, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xea01001f, result); // tst x0, x1
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/madd.rs b/yjit/src/asm/arm64/inst/madd.rs
new file mode 100644
index 0000000000..683e643189
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/madd.rs
@@ -0,0 +1,73 @@
+use super::super::arg::Sf;
+
+/// The struct that represents an A64 multiply-add instruction that can be
+/// encoded. The only constructor in this file is the MUL alias.
+///
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |     0  0  1    1  0  1  1    0  0  0                     0                                                    |
+/// | sf                                   rm..............      ra.............. rn.............. rd.............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct MAdd {
+    /// The number of the general-purpose destination register.
+    rd: u8,
+
+    /// The number of the first general-purpose source register.
+    rn: u8,
+
+    /// The number of the third general-purpose source register (fixed to 31 by the MUL alias).
+    ra: u8,
+
+    /// The number of the second general-purpose source register.
+    rm: u8,
+
+    /// The size of the registers of this instruction.
+    sf: Sf
+}
+
+impl MAdd {
+    /// MUL, encoded as MADD with `ra` fixed to register number 31
+    /// https://developer.arm.com/documentation/ddi0602/2023-06/Base-Instructions/MUL--Multiply--an-alias-of-MADD-
+    pub fn mul(rd: u8, rn: u8, rm: u8, num_bits: u8) -> Self {
+        Self { rd, rn, ra: 0b11111, rm, sf: num_bits.into() }
+    }
+}
+
+impl From<MAdd> for u32 {
+    /// Encode the instruction as a 32-bit value by OR-ing each field and the fixed opcode bits into position.
+    fn from(inst: MAdd) -> Self {
+        0
+        | ((inst.sf as u32) << 31)
+        | (0b11011 << 24)
+        | ((inst.rm as u32) << 16)
+        | ((inst.ra as u32) << 10)
+        | ((inst.rn as u32) << 5)
+        | (inst.rd as u32)
+    }
+}
+
+impl From<MAdd> for [u8; 4] {
+    /// Encode the instruction as its four bytes in little-endian order.
+    fn from(inst: MAdd) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_mul_32() {
+        let result: u32 = MAdd::mul(0, 1, 2, 32).into();
+        assert_eq!(0x1B027C20, result); // mul w0, w1, w2
+    }
+
+    #[test]
+    fn test_mul_64() {
+        let result: u32 = MAdd::mul(0, 1, 2, 64).into();
+        assert_eq!(0x9B027C20, result); // mul x0, x1, x2
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/mod.rs b/yjit/src/asm/arm64/inst/mod.rs
new file mode 100644
index 0000000000..bfffd914ef
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/mod.rs
@@ -0,0 +1,54 @@
+// This module contains various A64 instructions and the logic necessary to
+// encode them into u32s.
+
+mod atomic;
+mod branch;
+mod branch_cond;
+mod breakpoint;
+mod call;
+mod conditional;
+mod data_imm;
+mod data_reg;
+mod halfword_imm;
+mod load_literal;
+mod load_register;
+mod load_store;
+mod load_store_exclusive;
+mod logical_imm;
+mod logical_reg;
+mod madd;
+mod smulh;
+mod mov;
+mod nop;
+mod pc_rel;
+mod reg_pair;
+mod sbfm;
+mod shift_imm;
+mod sys_reg;
+mod test_bit;
+
+pub use atomic::Atomic;
+pub use branch::Branch;
+pub use branch_cond::BranchCond;
+pub use breakpoint::Breakpoint;
+pub use call::Call;
+pub use conditional::Conditional;
+pub use data_imm::DataImm;
+pub use data_reg::DataReg;
+pub use halfword_imm::HalfwordImm;
+pub use load_literal::LoadLiteral;
+pub use load_register::LoadRegister;
+pub use load_store::LoadStore;
+pub use load_store_exclusive::LoadStoreExclusive;
+pub use logical_imm::LogicalImm;
+pub use logical_reg::LogicalReg;
+pub use madd::MAdd;
+pub use smulh::SMulH;
+pub use mov::Mov;
+pub use nop::Nop;
+pub use pc_rel::PCRelative;
+pub use reg_pair::RegisterPair;
+pub use sbfm::SBFM;
+pub use shift_imm::ShiftImm;
+pub use sys_reg::SysReg;
+pub use test_bit::TestBit;
diff --git a/yjit/src/asm/arm64/inst/mov.rs b/yjit/src/asm/arm64/inst/mov.rs
new file mode 100644
index 0000000000..e7cb9215b0
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/mov.rs
@@ -0,0 +1,155 @@
+use super::super::arg::Sf;
+
+/// Which operation is being performed.
+enum Op {
+    /// A movz operation which zeroes out the other bits.
+    MOVZ = 0b10,
+
+    /// A movk operation which keeps the other bits in place.
+    MOVK = 0b11
+}
+
+/// How much to shift the immediate by.
+enum Hw {
+    LSL0 = 0b00,
+    LSL16 = 0b01,
+    LSL32 = 0b10,
+    LSL48 = 0b11
+}
+
+impl From<u8> for Hw {
+    fn from(shift: u8) -> Self {
+        match shift {
+            0 => Hw::LSL0,
+            16 => Hw::LSL16,
+            32 => Hw::LSL32,
+            48 => Hw::LSL48,
+            _ => panic!("Invalid value for shift: {}", shift)
+        }
+    }
+}
+
+/// The struct that represents a MOVK or MOVZ instruction.
+///
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |           1    0  0  1  0    1                                                                                |
+/// | sf op...                       hw... imm16.................................................. rd.............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct Mov {
+    /// The register number of the destination register.
+    rd: u8,
+
+    /// The value to move into the register.
+    imm16: u16,
+
+    /// The shift of the value to move.
+    hw: Hw,
+
+    /// Which operation is being performed.
+    op: Op,
+
+    /// Whether or not this instruction is operating on 64-bit operands.
+    sf: Sf
+}
+
+impl Mov {
+    /// MOVK
+    /// https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/MOVK--Move-wide-with-keep-?lang=en
+    pub fn movk(rd: u8, imm16: u16, hw: u8, num_bits: u8) -> Self {
+        Self { rd, imm16, hw: hw.into(), op: Op::MOVK, sf: num_bits.into() }
+    }
+
+    /// MOVZ
+    /// https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/MOVZ--Move-wide-with-zero-?lang=en
+    pub fn movz(rd: u8, imm16: u16, hw: u8, num_bits: u8) -> Self {
+        Self { rd, imm16, hw: hw.into(), op: Op::MOVZ, sf: num_bits.into() }
+    }
+}
+
+/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Immediate?lang=en
+const FAMILY: u32 = 0b1000;
+
+impl From<Mov> for u32 {
+    /// Convert an instruction into a 32-bit value.
+    fn from(inst: Mov) -> Self {
+        0
+        | ((inst.sf as u32) << 31)
+        | ((inst.op as u32) << 29)
+        | (FAMILY << 25)
+        | (0b101 << 23)
+        | ((inst.hw as u32) << 21)
+        | ((inst.imm16 as u32) << 5)
+        | inst.rd as u32
+    }
+}
+
+impl From<Mov> for [u8; 4] {
+    /// Convert an instruction into a 4 byte array.
+    fn from(inst: Mov) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_movk_unshifted() {
+        let inst = Mov::movk(0, 123, 0, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xf2800f60, result);
+    }
+
+    #[test]
+    fn test_movk_shifted_16() {
+        let inst = Mov::movk(0, 123, 16, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xf2A00f60, result);
+    }
+
+    #[test]
+    fn test_movk_shifted_32() {
+        let inst = Mov::movk(0, 123, 32, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xf2C00f60, result);
+    }
+
+    #[test]
+    fn test_movk_shifted_48() {
+        let inst = Mov::movk(0, 123, 48, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xf2e00f60, result);
+    }
+
+    #[test]
+    fn test_movz_unshifted() {
+        let inst = Mov::movz(0, 123, 0, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xd2800f60, result);
+    }
+
+    #[test]
+    fn test_movz_shifted_16() {
+        let inst = Mov::movz(0, 123, 16, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xd2a00f60, result);
+    }
+
+    #[test]
+    fn test_movz_shifted_32() {
+        let inst = Mov::movz(0, 123, 32, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xd2c00f60, result);
+    }
+
+    #[test]
+    fn test_movz_shifted_48() {
+        let inst = Mov::movz(0, 123, 48, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xd2e00f60, result);
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/nop.rs b/yjit/src/asm/arm64/inst/nop.rs
new file mode 100644
index 0000000000..d58b3574a9
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/nop.rs
@@ -0,0 +1,44 @@
+/// The struct that represents an A64 nop instruction that can be encoded.
+///
+/// NOP
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |  1  1  0  1    0  1  0  1    0  0  0  0    0  0  1  1    0  0  1  0    0  0  0  0    0  0  0  1    1  1  1  1 |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct Nop;
+
+impl Nop {
+    /// NOP
+    /// https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/NOP--No-Operation-
+    pub fn nop() -> Self {
+        Self {}
+    }
+}
+
+impl From<Nop> for u32 {
+    /// Convert an instruction into a 32-bit value.
+    fn from(_inst: Nop) -> Self {
+        0b11010101000000110010000000011111
+    }
+}
+
+impl From<Nop> for [u8; 4] {
+    /// Convert an instruction into a 4 byte array.
+    fn from(inst: Nop) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_nop() {
+        let inst = Nop::nop();
+        let result: u32 = inst.into();
+        assert_eq!(0xd503201f, result);
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/pc_rel.rs b/yjit/src/asm/arm64/inst/pc_rel.rs
new file mode 100644
index 0000000000..bd1a2b9367
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/pc_rel.rs
@@ -0,0 +1,107 @@
+/// Which operation to perform for the PC-relative instruction.
+enum Op {
+    /// Form a PC-relative address.
+    ADR = 0,
+
+    /// Form a PC-relative address to a 4KB page.
+    ADRP = 1
+}
+
+/// The struct that represents an A64 PC-relative address instruction that can
+/// be encoded.
+///
+/// ADR/ADRP
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |           1    0  0  0  0                                                                                     |
+/// | op immlo                    immhi........................................................... rd.............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct PCRelative {
+    /// The number for the general-purpose register to load the address into.
+    rd: u8,
+
+    /// The offset to add to the PC: bytes for ADR, or the byte offset shifted right by 12 for ADRP.
+    imm: i32,
+
+    /// Which operation to perform for this instruction.
+    op: Op
+}
+
+impl PCRelative {
+    /// ADR
+    /// https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/ADR--Form-PC-relative-address-
+    pub fn adr(rd: u8, imm: i32) -> Self {
+        Self { rd, imm, op: Op::ADR }
+    }
+
+    /// ADRP
+    /// https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/ADRP--Form-PC-relative-address-to-4KB-page-
+    pub fn adrp(rd: u8, imm: i32) -> Self {
+        Self { rd, imm: imm >> 12, op: Op::ADRP }
+    }
+}
+
+/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Immediate?lang=en
+const FAMILY: u32 = 0b1000;
+
+impl From<PCRelative> for u32 {
+    /// Convert an instruction into a 32-bit value.
+    fn from(inst: PCRelative) -> Self {
+        let immlo = (inst.imm & 0b11) as u32;
+        let mut immhi = ((inst.imm >> 2) & ((1 << 18) - 1)) as u32;
+
+        // Set the top (sign) bit of the 19-bit immhi field for negative offsets.
+        if inst.imm < 0 {
+            immhi |= 1 << 18;
+        }
+
+        0
+        | ((inst.op as u32) << 31)
+        | (immlo << 29)
+        | (FAMILY << 25)
+        | (immhi << 5)
+        | inst.rd as u32
+    }
+}
+
+impl From<PCRelative> for [u8; 4] {
+    /// Convert an instruction into a 4 byte array.
+    fn from(inst: PCRelative) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_adr_positive() {
+        let inst = PCRelative::adr(0, 5);
+        let result: u32 = inst.into();
+        assert_eq!(0x30000020, result);
+    }
+
+    #[test]
+    fn test_adr_negative() {
+        let inst = PCRelative::adr(0, -5);
+        let result: u32 = inst.into();
+        assert_eq!(0x70ffffc0, result);
+    }
+
+    #[test]
+    fn test_adrp_positive() {
+        let inst = PCRelative::adrp(0, 0x4000);
+        let result: u32 = inst.into();
+        assert_eq!(0x90000020, result);
+    }
+
+    #[test]
+    fn test_adrp_negative() {
+        let inst = PCRelative::adrp(0, -0x4000);
+        let result: u32 = inst.into();
+        assert_eq!(0x90ffffe0, result);
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/reg_pair.rs b/yjit/src/asm/arm64/inst/reg_pair.rs
new file mode 100644
index 0000000000..87690e3b4a
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/reg_pair.rs
@@ -0,0 +1,212 @@
+use super::super::arg::truncate_imm;
+
+/// The operation to perform for this instruction.
+enum Opc {
+    /// When the registers are 32-bits wide.
+    Opc32 = 0b00,
+
+    /// When the registers are 64-bits wide.
+    Opc64 = 0b10
+}
+
+/// The kind of indexing to perform for this instruction.
+enum Index {
+    StorePostIndex = 0b010,
+    LoadPostIndex = 0b011,
+    StoreSignedOffset = 0b100,
+    LoadSignedOffset = 0b101,
+    StorePreIndex = 0b110,
+    LoadPreIndex = 0b111
+}
+
+/// A convenience function so that we can convert the number of bits of a
+/// register operand directly into an Opc variant.
+impl From<u8> for Opc {
+    fn from(num_bits: u8) -> Self {
+        match num_bits {
+            64 => Opc::Opc64,
+            32 => Opc::Opc32,
+            _ => panic!("Invalid number of bits: {}", num_bits)
+        }
+    }
+}
+
+/// The struct that represents an A64 register pair instruction that can be
+/// encoded.
+///
+/// STP/LDP
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |     0  1  0    1  0  0                                                                                        |
+/// | opc                    index..... imm7.................... rt2............. rn.............. rt1............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct RegisterPair {
+    /// The number of the first register to be transferred.
+    rt1: u8,
+
+    /// The number of the base register.
+    rn: u8,
+
+    /// The number of the second register to be transferred.
+    rt2: u8,
+
+    /// The signed offset as stored in the imm7 field (the byte displacement after the constructor scales it down).
+    imm7: i16,
+
+    /// The kind of indexing to use for this instruction.
+    index: Index,
+
+    /// The operation to be performed (in terms of size).
+    opc: Opc
+}
+
+impl RegisterPair {
+    /// Create a register pair instruction. `disp` is a byte offset, scaled by the register size (8 or 4 bytes).
+    fn new(rt1: u8, rt2: u8, rn: u8, disp: i16, index: Index, num_bits: u8) -> Self {
+        Self { opc: num_bits.into(), rt1, rn, rt2, imm7: disp / i16::from(num_bits / 8), index }
+    }
+
+    /// LDP (signed offset)
+    /// LDP <Xt1>, <Xt2>, [<Xn|SP>{, #<imm>}]
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers-?lang=en
+    pub fn ldp(rt1: u8, rt2: u8, rn: u8, disp: i16, num_bits: u8) -> Self {
+        Self::new(rt1, rt2, rn, disp, Index::LoadSignedOffset, num_bits)
+    }
+
+    /// LDP (pre-index)
+    /// LDP <Xt1>, <Xt2>, [<Xn|SP>, #<imm>]!
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers-?lang=en
+    pub fn ldp_pre(rt1: u8, rt2: u8, rn: u8, disp: i16, num_bits: u8) -> Self {
+        Self::new(rt1, rt2, rn, disp, Index::LoadPreIndex, num_bits)
+    }
+
+    /// LDP (post-index)
+    /// LDP <Xt1>, <Xt2>, [<Xn|SP>], #<imm>
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDP--Load-Pair-of-Registers-?lang=en
+    pub fn ldp_post(rt1: u8, rt2: u8, rn: u8, disp: i16, num_bits: u8) -> Self {
+        Self::new(rt1, rt2, rn, disp, Index::LoadPostIndex, num_bits)
+    }
+
+    /// STP (signed offset)
+    /// STP <Xt1>, <Xt2>, [<Xn|SP>{, #<imm>}]
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STP--Store-Pair-of-Registers-?lang=en
+    pub fn stp(rt1: u8, rt2: u8, rn: u8, disp: i16, num_bits: u8) -> Self {
+        Self::new(rt1, rt2, rn, disp, Index::StoreSignedOffset, num_bits)
+    }
+
+    /// STP (pre-index)
+    /// STP <Xt1>, <Xt2>, [<Xn|SP>, #<imm>]!
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STP--Store-Pair-of-Registers-?lang=en
+    pub fn stp_pre(rt1: u8, rt2: u8, rn: u8, disp: i16, num_bits: u8) -> Self {
+        Self::new(rt1, rt2, rn, disp, Index::StorePreIndex, num_bits)
+    }
+
+    /// STP (post-index)
+    /// STP <Xt1>, <Xt2>, [<Xn|SP>], #<imm>
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STP--Store-Pair-of-Registers-?lang=en
+    pub fn stp_post(rt1: u8, rt2: u8, rn: u8, disp: i16, num_bits: u8) -> Self {
+        Self::new(rt1, rt2, rn, disp, Index::StorePostIndex, num_bits)
+    }
+}
+
+/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Loads-and-Stores?lang=en
+const FAMILY: u32 = 0b0100;
+
+impl From<RegisterPair> for u32 {
+    /// Convert an instruction into a 32-bit value.
+    fn from(inst: RegisterPair) -> Self {
+        0
+        | ((inst.opc as u32) << 30)
+        | (1 << 29)
+        | (FAMILY << 25)
+        | ((inst.index as u32) << 22)
+        | (truncate_imm::<_, 7>(inst.imm7) << 15)
+        | ((inst.rt2 as u32) << 10)
+        | ((inst.rn as u32) << 5)
+        | (inst.rt1 as u32)
+    }
+}
+
+impl From<RegisterPair> for [u8; 4] {
+    /// Convert an instruction into a 4 byte array.
+    fn from(inst: RegisterPair) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_ldp() {
+        let inst = RegisterPair::ldp(0, 1, 2, 0, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xa9400440, result);
+    }
+
+    #[test]
+    fn test_ldp_maximum_displacement() {
+        let inst = RegisterPair::ldp(0, 1, 2, 504, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xa95f8440, result);
+    }
+
+    #[test]
+    fn test_ldp_minimum_displacement() {
+        let inst = RegisterPair::ldp(0, 1, 2, -512, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xa9600440, result);
+    }
+
+    #[test]
+    fn test_ldp_pre() {
+        let inst = RegisterPair::ldp_pre(0, 1, 2, 256, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xa9d00440, result);
+    }
+
+    #[test]
+    fn test_ldp_post() {
+        let inst = RegisterPair::ldp_post(0, 1, 2, 256, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xa8d00440, result);
+    }
+
+    #[test]
+    fn test_stp() {
+        let inst = RegisterPair::stp(0, 1, 2, 0, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xa9000440, result);
+    }
+
+    #[test]
+    fn test_stp_maximum_displacement() {
+        let inst = RegisterPair::stp(0, 1, 2, 504, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xa91f8440, result);
+    }
+
+    #[test]
+    fn test_stp_minimum_displacement() {
+        let inst = RegisterPair::stp(0, 1, 2, -512, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xa9200440, result);
+    }
+
+    #[test]
+    fn test_stp_pre() {
+        let inst = RegisterPair::stp_pre(0, 1, 2, 256, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xa9900440, result);
+    }
+
+    #[test]
+    fn test_stp_post() {
+        let inst = RegisterPair::stp_post(0, 1, 2, 256, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xa8900440, result);
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/sbfm.rs b/yjit/src/asm/arm64/inst/sbfm.rs
new file mode 100644
index 0000000000..8602998980
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/sbfm.rs
@@ -0,0 +1,103 @@
+use super::super::arg::{Sf, truncate_uimm};
+
+/// The struct that represents an A64 signed bitfield move instruction that can
+/// be encoded.
+///
+/// SBFM
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |     0  0  1    0  0  1  1    0                                                                                |
+/// | sf                             N  immr...............   imms............... rn.............. rd.............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct SBFM {
+    /// The number for the general-purpose register to load the value into.
+    rd: u8,
+
+    /// The number for the general-purpose register to copy from.
+    rn: u8,
+
+    /// The leftmost bit number to be moved from the source.
+    imms: u8,
+
+    /// The right rotate amount.
+    immr: u8,
+
+    /// Whether or not this is a 64-bit operation.
+    n: bool,
+
+    /// The size of this operation.
+    sf: Sf
+}
+
+impl SBFM {
+    /// ASR
+    /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/ASR--immediate---Arithmetic-Shift-Right--immediate---an-alias-of-SBFM-?lang=en
+    pub fn asr(rd: u8, rn: u8, shift: u8, num_bits: u8) -> Self {
+        let (imms, n) = if num_bits == 64 {
+            (0b111111, true)
+        } else {
+            (0b011111, false)
+        };
+
+        Self { rd, rn, immr: shift, imms, n, sf: num_bits.into() }
+    }
+
+    /// SXTW
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/SXTW--Sign-Extend-Word--an-alias-of-SBFM-?lang=en
+    pub fn sxtw(rd: u8, rn: u8) -> Self {
+        Self { rd, rn, immr: 0, imms: 31, n: true, sf: Sf::Sf64 }
+    }
+}
+
+/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Immediate?lang=en#bitfield
+const FAMILY: u32 = 0b1001;
+
+impl From<SBFM> for u32 {
+    /// Convert an instruction into a 32-bit value.
+    fn from(inst: SBFM) -> Self {
+        0
+        | ((inst.sf as u32) << 31)
+        | (FAMILY << 25)
+        | (1 << 24)
+        | ((inst.n as u32) << 22)
+        | (truncate_uimm::<_, 6>(inst.immr) << 16)
+        | (truncate_uimm::<_, 6>(inst.imms) << 10)
+        | ((inst.rn as u32) << 5)
+        | inst.rd as u32
+    }
+}
+
+impl From<SBFM> for [u8; 4] {
+    /// Convert an instruction into a 4 byte array.
+    fn from(inst: SBFM) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_asr_32_bits() {
+        let inst = SBFM::asr(0, 1, 2, 32);
+        let result: u32 = inst.into();
+        assert_eq!(0x13027c20, result);
+    }
+
+    #[test]
+    fn test_asr_64_bits() {
+        let inst = SBFM::asr(10, 11, 5, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0x9345fd6a, result);
+    }
+
+    #[test]
+    fn test_sxtw() {
+        let inst = SBFM::sxtw(0, 1);
+        let result: u32 = inst.into();
+        assert_eq!(0x93407c20, result);
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/shift_imm.rs b/yjit/src/asm/arm64/inst/shift_imm.rs
new file mode 100644
index 0000000000..3d2685a997
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/shift_imm.rs
@@ -0,0 +1,147 @@
+use super::super::arg::Sf;
+
+/// The operation to perform for this instruction.
+enum Opc {
+    /// Logical shift left
+    LSL,
+
+    /// Logical shift right
+    LSR
+}
+
+/// The struct that represents an A64 unsigned bitfield move instruction that
+/// can be encoded.
+///
+/// LSL (immediate)
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |     1  0  1    0  0  1  1    0                                                                                |
+/// | sf                             N  immr...............   imms............... rn.............. rd.............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct ShiftImm {
+    /// The register number of the destination register.
+    rd: u8,
+
+    /// The register number of the first operand register.
+    rn: u8,
+
+    /// The immediate value to shift by.
+    shift: u8,
+
+    /// The opcode for this instruction.
+    opc: Opc,
+
+    /// Whether or not this instruction is operating on 64-bit operands.
+    sf: Sf
+}
+
+impl ShiftImm {
+    /// LSL (immediate)
+    /// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LSL--immediate---Logical-Shift-Left--immediate---an-alias-of-UBFM-?lang=en
+    pub fn lsl(rd: u8, rn: u8, shift: u8, num_bits: u8) -> Self {
+        ShiftImm { rd, rn, shift, opc: Opc::LSL, sf: num_bits.into() }
+    }
+
+    /// LSR (immediate)
+    /// https://developer.arm.com/documentation/ddi0602/2021-12/Base-Instructions/LSR--immediate---Logical-Shift-Right--immediate---an-alias-of-UBFM-?lang=en
+    pub fn lsr(rd: u8, rn: u8, shift: u8, num_bits: u8) -> Self {
+        ShiftImm { rd, rn, shift, opc: Opc::LSR, sf: num_bits.into() }
+    }
+
+    /// Returns a triplet of (n, immr, imms) encoded in u32s for this
+    /// instruction. This mirrors how they will be encoded in the actual bits.
+    fn bitmask(&self) -> (u32, u32, u32) {
+        match self.opc {
+            // The key insight is a little buried in the docs, but effectively:
+            // LSL <Wd>, <Wn>, #<shift> == UBFM <Wd>, <Wn>, #(-<shift> MOD 32), #(31-<shift>)
+            // LSL <Xd>, <Xn>, #<shift> == UBFM <Xd>, <Xn>, #(-<shift> MOD 64), #(63-<shift>)
+            Opc::LSL => {
+                let shift = -(self.shift as i16);
+
+                match self.sf {
+                    Sf::Sf32 => (
+                        0,
+                        (shift.rem_euclid(32) & 0x3f) as u32,
+                        ((31 - self.shift) & 0x3f) as u32
+                    ),
+                    Sf::Sf64 => (
+                        1,
+                        (shift.rem_euclid(64) & 0x3f) as u32,
+                        ((63 - self.shift) & 0x3f) as u32
+                    )
+                }
+            },
+            // Similar to LSL:
+            // LSR <Wd>, <Wn>, #<shift> == UBFM <Wd>, <Wn>, #<shift>, #31
+            // LSR <Xd>, <Xn>, #<shift> == UBFM <Xd>, <Xn>, #<shift>, #63
+            Opc::LSR => {
+                match self.sf {
+                    Sf::Sf32 => (0, (self.shift & 0x3f) as u32, 31),
+                    Sf::Sf64 => (1, (self.shift & 0x3f) as u32, 63)
+                }
+            }
+        }
+    }
+}
+
+/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Data-Processing----Immediate?lang=en#bitfield
+const FAMILY: u32 = 0b10011;
+
+impl From<ShiftImm> for u32 {
+    /// Convert an instruction into a 32-bit value.
+    fn from(inst: ShiftImm) -> Self {
+        let (n, immr, imms) = inst.bitmask();
+
+        0
+        | ((inst.sf as u32) << 31)
+        | (1 << 30)
+        | (FAMILY << 24)
+        | (n << 22)
+        | (immr << 16)
+        | (imms << 10)
+        | ((inst.rn as u32) << 5)
+        | inst.rd as u32
+    }
+}
+
+impl From<ShiftImm> for [u8; 4] {
+    /// Convert an instruction into a 4 byte array.
+    fn from(inst: ShiftImm) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_lsl_32() {
+        let inst = ShiftImm::lsl(0, 1, 7, 32);
+        let result: u32 = inst.into();
+        assert_eq!(0x53196020, result);
+    }
+
+    #[test]
+    fn test_lsl_64() {
+        let inst = ShiftImm::lsl(0, 1, 7, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xd379e020, result);
+    }
+
+    #[test]
+    fn test_lsr_32() {
+        let inst = ShiftImm::lsr(0, 1, 7, 32);
+        let result: u32 = inst.into();
+        assert_eq!(0x53077c20, result);
+    }
+
+    #[test]
+    fn test_lsr_64() {
+        let inst = ShiftImm::lsr(0, 1, 7, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xd347fc20, result);
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/smulh.rs b/yjit/src/asm/arm64/inst/smulh.rs
new file mode 100644
index 0000000000..5e9b231fde
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/smulh.rs
@@ -0,0 +1,60 @@
+/// The struct that represents an A64 signed multiply high instruction.
+///
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |  1  0  0  1    1  0  1  1    0  1  0                     0                                                    |
+/// |                                      rm..............      ra.............. rn.............. rd.............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct SMulH {
+    /// The number of the general-purpose destination register.
+    rd: u8,
+
+    /// The number of the first general-purpose source register.
+    rn: u8,
+
+    /// The value of the ra field in the encoding; the `smulh` constructor always sets it to 0b11111.
+    ra: u8,
+
+    /// The number of the second general-purpose source register.
+    rm: u8,
+}
+
+impl SMulH {
+    /// SMULH
+    /// https://developer.arm.com/documentation/ddi0602/2023-06/Base-Instructions/SMULH--Signed-Multiply-High-
+    pub fn smulh(rd: u8, rn: u8, rm: u8) -> Self {
+        Self { rd, rn, ra: 0b11111, rm }
+    }
+}
+
+impl From<SMulH> for u32 {
+    /// Convert an instruction into a 32-bit value.
+    fn from(inst: SMulH) -> Self {
+        0
+        | (0b10011011010 << 21)
+        | ((inst.rm as u32) << 16)
+        | ((inst.ra as u32) << 10)
+        | ((inst.rn as u32) << 5)
+        | (inst.rd as u32)
+    }
+}
+
+impl From<SMulH> for [u8; 4] {
+    /// Convert an instruction into a 4 byte array.
+    fn from(inst: SMulH) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_smulh() {
+        let result: u32 = SMulH::smulh(0, 1, 2).into();
+        assert_eq!(0x9b427c20, result);
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/sys_reg.rs b/yjit/src/asm/arm64/inst/sys_reg.rs
new file mode 100644
index 0000000000..108737a870
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/sys_reg.rs
@@ -0,0 +1,86 @@
+use super::super::arg::SystemRegister;
+
+/// Which operation to perform (loading or storing the system register value).
+enum L {
+    /// Store the value of a general-purpose register in a system register.
+    MSR = 0,
+
+    /// Store the value of a system register in a general-purpose register.
+    MRS = 1
+}
+
+/// The struct that represents an A64 system register instruction that can be
+/// encoded.
+///
+/// MSR/MRS (register)
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |  1  1  0  1    0  1  0  1    0  0     1                                                                       |
+/// |                                   L       o0 op1.....   CRn........   CRm........   op2..... rt.............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct SysReg {
+    /// The general-purpose register: the destination for MRS, the source for MSR.
+    rt: u8,
+
+    /// Which system register to load or store.
+    systemreg: SystemRegister,
+
+    /// Which operation to perform (loading or storing the system register value).
+    l: L
+}
+
+impl SysReg {
+    /// MRS (register)
+    /// https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/MRS--Move-System-Register-?lang=en
+    pub fn mrs(rt: u8, systemreg: SystemRegister) -> Self {
+        SysReg { rt, systemreg, l: L::MRS }
+    }
+
+    /// MSR (register)
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/MSR--register---Move-general-purpose-register-to-System-Register-?lang=en
+    pub fn msr(systemreg: SystemRegister, rt: u8) -> Self {
+        SysReg { rt, systemreg, l: L::MSR }
+    }
+}
+
+/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en#systemmove
+const FAMILY: u32 = 0b110101010001;
+
+impl From<SysReg> for u32 {
+    /// Convert an instruction into a 32-bit value.
+    fn from(inst: SysReg) -> Self {
+        0
+        | (FAMILY << 20)
+        | ((inst.l as u32) << 21)
+        | ((inst.systemreg as u32) << 5)
+        | inst.rt as u32
+    }
+}
+
+impl From<SysReg> for [u8; 4] {
+    /// Convert an instruction into a 4 byte array.
+    fn from(inst: SysReg) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_mrs() {
+        let inst = SysReg::mrs(0, SystemRegister::NZCV);
+        let result: u32 = inst.into();
+        assert_eq!(0xd53b4200, result);
+    }
+
+    #[test]
+    fn test_msr() {
+        let inst = SysReg::msr(SystemRegister::NZCV, 0);
+        let result: u32 = inst.into();
+        assert_eq!(0xd51b4200, result);
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/test_bit.rs b/yjit/src/asm/arm64/inst/test_bit.rs
new file mode 100644
index 0000000000..c57a05ad2b
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/test_bit.rs
@@ -0,0 +1,133 @@
+use super::super::arg::truncate_imm;
+
+/// The most significant bit of the 6-bit bit number that is being tested.
+#[derive(Debug)]
+enum B5 {
+    /// Used for bit numbers 0 through 31.
+    B532 = 0b0,
+
+    /// Used for bit numbers 32 through 63.
+    B564 = 0b1,
+}
+
+/// Lets a raw bit number (0 through 63) be converted straight into the B5
+/// variant holding its upper bit. Panics for any larger bit number.
+impl From<u8> for B5 {
+    fn from(bit_num: u8) -> Self {
+        if bit_num > 63 {
+            panic!("Invalid bit number: {}", bit_num)
+        }
+
+        if bit_num < 32 { B5::B532 } else { B5::B564 }
+    }
+}
+
+/// Selects which of the two test bit instructions gets encoded.
+enum Op {
+    /// TBZ: the test bit zero operation.
+    TBZ = 0b0,
+
+    /// TBNZ: the test bit not zero operation.
+    TBNZ = 0b1,
+}
+
+/// An encodable A64 test bit instruction. Fields are listed from the high bits down.
+///
+/// TBNZ/TBZ
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |     0  1  1    0  1  1                                                                                        |
+/// | b5                     op   b40............. imm14.......................................... rt.............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct TestBit {
+    /// Upper bit of the number of the bit to test.
+    b5: B5,
+
+    /// Whether this is a TBZ or a TBNZ.
+    op: Op,
+
+    /// Lower 5 bits of the number of the bit to test.
+    b40: u8,
+
+    /// PC-relative offset to the target instruction, counted in number of
+    /// instructions.
+    imm14: i16,
+
+    /// Number of the register whose bit is tested.
+    rt: u8
+}
+
+impl TestBit {
+    /// TBNZ - test bit `bit_num` of register `rt`, branching by `offset` instructions if it is not zero.
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/TBNZ--Test-bit-and-Branch-if-Nonzero-?lang=en
+    pub fn tbnz(rt: u8, bit_num: u8, offset: i16) -> Self {
+        Self { b5: B5::from(bit_num), op: Op::TBNZ, b40: bit_num & 0b11111, imm14: offset, rt }
+    }
+
+    /// TBZ - test bit `bit_num` of register `rt`, branching by `offset` instructions if it is zero.
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/TBZ--Test-bit-and-Branch-if-Zero-?lang=en
+    pub fn tbz(rt: u8, bit_num: u8, offset: i16) -> Self {
+        Self { b5: B5::from(bit_num), op: Op::TBZ, b40: bit_num & 0b11111, imm14: offset, rt }
+    }
+}
+
+/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions?lang=en
+const FAMILY: u32 = 0b11011;
+
+impl From<TestBit> for u32 {
+    /// Assemble the 32-bit instruction word from its individual fields.
+    fn from(inst: TestBit) -> Self {
+        // The signed offset is cut down to its 14-bit field before being placed.
+        let imm14 = truncate_imm::<_, 14>(inst.imm14) << 5;
+        let b40 = ((inst.b40 & 0b11111) as u32) << 19;
+
+        let b5 = (inst.b5 as u32) << 31;
+        let family = FAMILY << 25;
+        let op = (inst.op as u32) << 24;
+        let rt = inst.rt as u32;
+
+        b5 | family | op | b40 | imm14 | rt
+    }
+}
+
+impl From<TestBit> for [u8; 4] {
+    /// Encode the instruction and return its four bytes in little-endian order.
+    fn from(inst: TestBit) -> [u8; 4] {
+        // Go through the u32 conversion so both encodings always agree.
+        u32::from(inst).to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_tbnz() {
+        // Register 0, bit 0, zero offset.
+        let encoded = u32::from(TestBit::tbnz(0, 0, 0));
+        assert_eq!(0x37000000, encoded);
+    }
+
+    #[test]
+    fn test_tbnz_negative() {
+        // An offset of -1 sets every bit of the 14-bit offset field.
+        let encoded = u32::from(TestBit::tbnz(0, 0, -1));
+        assert_eq!(0x3707ffe0, encoded);
+    }
+
+    #[test]
+    fn test_tbz() {
+        // Register 0, bit 0, zero offset.
+        let encoded = u32::from(TestBit::tbz(0, 0, 0));
+        assert_eq!(0x36000000, encoded);
+    }
+
+    #[test]
+    fn test_tbz_negative() {
+        // An offset of -1 sets every bit of the 14-bit offset field.
+        let encoded = u32::from(TestBit::tbz(0, 0, -1));
+        assert_eq!(0x3607ffe0, encoded);
+    }
+}
diff --git a/yjit/src/asm/arm64/mod.rs b/yjit/src/asm/arm64/mod.rs
new file mode 100644
index 0000000000..a94d435b7c
--- /dev/null
+++ b/yjit/src/asm/arm64/mod.rs
@@ -0,0 +1,1680 @@
+#![allow(dead_code)] // For instructions and operands we're not currently using.
+
+use crate::asm::CodeBlock;
+
+mod arg;
+mod inst;
+mod opnd;
+
+use inst::*;
+
+// We're going to make these public to make using these things easier in the
+// backend (so they don't have to have knowledge about the submodule).
+pub use arg::*;
+pub use opnd::*;
+
+/// Checks that a signed value fits within the specified number of bits.
+pub const fn imm_fits_bits(imm: i64, num_bits: u8) -> bool {
+    // 0 bits hold nothing and 64+ bits hold any i64; neither may reach the shift.
+    if num_bits == 0 { return false; }
+    if num_bits >= 64 { return true; }
+    imm >= -(1_i64 << ((num_bits - 1) as u32)) && imm < (1_i64 << ((num_bits - 1) as u32))
+}
+
+/// Checks that an unsigned value fits within the specified number of bits.
+pub const fn uimm_fits_bits(uimm: u64, num_bits: u8) -> bool {
+    // Widths of 64 or more hold every u64; computing 2^num_bits there would overflow.
+    let maximum = if num_bits >= 64 { u64::MAX } else { (1_u64 << (num_bits as u32)) - 1 };
+    uimm <= maximum
+}
+
+/// ADD - add rn and rm, put the result in rd, don't update flags (a negative Imm is emitted as a SUB)
+pub fn add(cb: &mut CodeBlock, rd: A64Opnd, rn: A64Opnd, rm: A64Opnd) {
+    let bytes: [u8; 4] = match (rd, rn, rm) {
+        (A64Opnd::Reg(rd), A64Opnd::Reg(rn), A64Opnd::Reg(rm)) => {
+            assert!(
+                rd.num_bits == rn.num_bits && rn.num_bits == rm.num_bits,
+                "All operands must be of the same size."
+            );
+
+            DataReg::add(rd.reg_no, rn.reg_no, rm.reg_no, rd.num_bits).into()
+        },
+        (A64Opnd::Reg(rd), A64Opnd::Reg(rn), A64Opnd::UImm(uimm12)) => {
+            assert!(rd.num_bits == rn.num_bits, "rd and rn must be of the same size.");
+
+            DataImm::add(rd.reg_no, rn.reg_no, uimm12.try_into().unwrap(), rd.num_bits).into()
+        },
+        (A64Opnd::Reg(rd), A64Opnd::Reg(rn), A64Opnd::Imm(imm12)) => {
+            assert!(rd.num_bits == rn.num_bits, "rd and rn must be of the same size.");
+            // unsigned_abs yields the magnitude without the overflow that negating i64::MIN hits.
+            if imm12 < 0 {
+                DataImm::sub(rd.reg_no, rn.reg_no, imm12.unsigned_abs().try_into().unwrap(), rd.num_bits).into()
+            } else {
+                DataImm::add(rd.reg_no, rn.reg_no, imm12.unsigned_abs().try_into().unwrap(), rd.num_bits).into()
+            }
+        },
+        _ => panic!("Invalid operand combination to add instruction."),
+    };
+
+    cb.write_bytes(&bytes);
+}
+
+/// ADDS - add rn and rm, put the result in rd, update flags (a negative Imm is emitted as a SUBS)
+pub fn adds(cb: &mut CodeBlock, rd: A64Opnd, rn: A64Opnd, rm: A64Opnd) {
+    let bytes: [u8; 4] = match (rd, rn, rm) {
+        (A64Opnd::Reg(rd), A64Opnd::Reg(rn), A64Opnd::Reg(rm)) => {
+            assert!(
+                rd.num_bits == rn.num_bits && rn.num_bits == rm.num_bits,
+                "All operands must be of the same size."
+            );
+
+            DataReg::adds(rd.reg_no, rn.reg_no, rm.reg_no, rd.num_bits).into()
+        },
+        (A64Opnd::Reg(rd), A64Opnd::Reg(rn), A64Opnd::UImm(imm12)) => {
+            assert!(rd.num_bits == rn.num_bits, "rd and rn must be of the same size.");
+
+            DataImm::adds(rd.reg_no, rn.reg_no, imm12.try_into().unwrap(), rd.num_bits).into()
+        },
+        (A64Opnd::Reg(rd), A64Opnd::Reg(rn), A64Opnd::Imm(imm12)) => {
+            assert!(rd.num_bits == rn.num_bits, "rd and rn must be of the same size.");
+            // unsigned_abs yields the magnitude without the overflow that negating i64::MIN hits.
+            if imm12 < 0 {
+                DataImm::subs(rd.reg_no, rn.reg_no, imm12.unsigned_abs().try_into().unwrap(), rd.num_bits).into()
+            } else {
+                DataImm::adds(rd.reg_no, rn.reg_no, imm12.unsigned_abs().try_into().unwrap(), rd.num_bits).into()
+            }
+        },
+        _ => panic!("Invalid operand combination to adds instruction."),
+    };
+
+    cb.write_bytes(&bytes);
+}
+
+/// ADR - compute a PC-relative address from a signed immediate and load it into rd
+pub fn adr(cb: &mut CodeBlock, rd: A64Opnd, imm: A64Opnd) {
+    let (dst, offset) = match (rd, imm) {
+        (A64Opnd::Reg(dst), A64Opnd::Imm(offset)) => (dst, offset),
+        _ => panic!("Invalid operand combination to adr instruction."),
+    };
+
+    assert!(dst.num_bits == 64, "The destination register must be 64 bits.");
+    assert!(imm_fits_bits(offset, 21), "The immediate operand must be 21 bits or less.");
+
+    // The range check above guarantees the narrowing cast keeps the value.
+    let encoded: [u8; 4] = PCRelative::adr(dst.reg_no, offset as i32).into();
+    cb.write_bytes(&encoded);
+}
+
+/// ADRP - form a PC-relative address to a 4KB page and load it into a register.
+/// This is effectively the same as ADR except that the immediate must be a
+/// multiple of 4KB. Panics unless rd is a 64-bit register and imm is an Imm.
+pub fn adrp(cb: &mut CodeBlock, rd: A64Opnd, imm: A64Opnd) {
+    let bytes: [u8; 4] = match (rd, imm) {
+        (A64Opnd::Reg(rd), A64Opnd::Imm(imm)) => {
+            assert!(rd.num_bits == 64, "The destination register must be 64 bits.");
+            assert!(imm_fits_bits(imm, 32), "The immediate operand must be 32 bits or less.");
+            // NOTE(review): the 4KB-multiple requirement documented above is not asserted here.
+            PCRelative::adrp(rd.reg_no, imm as i32).into()
+        },
+        _ => panic!("Invalid operand combination to adrp instruction."),
+    };
+
+    cb.write_bytes(&bytes);
+}
+
+/// AND - bitwise and of rn with rm, where rm is either a register or an
+/// unsigned immediate encodable as a bitmask; result goes to rd, flags untouched
+pub fn and(cb: &mut CodeBlock, rd: A64Opnd, rn: A64Opnd, rm: A64Opnd) {
+    let encoded: [u8; 4] = match (rd, rn, rm) {
+        (A64Opnd::Reg(dst), A64Opnd::Reg(lhs), A64Opnd::Reg(rhs)) => {
+            let same_width = dst.num_bits == lhs.num_bits && lhs.num_bits == rhs.num_bits;
+            assert!(same_width, "All operands must be of the same size.");
+
+            LogicalReg::and(dst.reg_no, lhs.reg_no, rhs.reg_no, dst.num_bits).into()
+        },
+        (A64Opnd::Reg(dst), A64Opnd::Reg(lhs), A64Opnd::UImm(imm)) => {
+            assert!(dst.num_bits == lhs.num_bits, "rd and rn must be of the same size.");
+
+            // 32-bit destinations go through the dedicated 32-bit bitmask constructor.
+            let mask: BitmaskImmediate = match dst.num_bits {
+                32 => BitmaskImmediate::new_32b_reg(imm.try_into().unwrap()).unwrap(),
+                _ => imm.try_into().unwrap(),
+            };
+
+            LogicalImm::and(dst.reg_no, lhs.reg_no, mask, dst.num_bits).into()
+        },
+        _ => panic!("Invalid operand combination to and instruction."),
+    };
+
+    cb.write_bytes(&encoded);
+}
+
+/// ANDS - bitwise and of rn with rm, where rm is either a register or an
+/// unsigned immediate encodable as a bitmask; result goes to rd, flags updated
+pub fn ands(cb: &mut CodeBlock, rd: A64Opnd, rn: A64Opnd, rm: A64Opnd) {
+    let encoded: [u8; 4] = match (rd, rn, rm) {
+        (A64Opnd::Reg(dst), A64Opnd::Reg(lhs), A64Opnd::Reg(rhs)) => {
+            let same_width = dst.num_bits == lhs.num_bits && lhs.num_bits == rhs.num_bits;
+            assert!(same_width, "All operands must be of the same size.");
+
+            LogicalReg::ands(dst.reg_no, lhs.reg_no, rhs.reg_no, dst.num_bits).into()
+        },
+        (A64Opnd::Reg(dst), A64Opnd::Reg(lhs), A64Opnd::UImm(imm)) => {
+            assert!(dst.num_bits == lhs.num_bits, "rd and rn must be of the same size.");
+
+            // 32-bit destinations go through the dedicated 32-bit bitmask constructor.
+            let mask: BitmaskImmediate = match dst.num_bits {
+                32 => BitmaskImmediate::new_32b_reg(imm.try_into().unwrap()).unwrap(),
+                _ => imm.try_into().unwrap(),
+            };
+
+            LogicalImm::ands(dst.reg_no, lhs.reg_no, mask, dst.num_bits).into()
+        },
+        _ => panic!("Invalid operand combination to ands instruction."),
+    };
+
+    cb.write_bytes(&encoded);
+}
+
+/// ASR - arithmetic shift right of rn by an unsigned immediate, result in rd,
+/// flags untouched
+pub fn asr(cb: &mut CodeBlock, rd: A64Opnd, rn: A64Opnd, shift: A64Opnd) {
+    let (dst, src, amount) = match (rd, rn, shift) {
+        (A64Opnd::Reg(dst), A64Opnd::Reg(src), A64Opnd::UImm(amount)) => (dst, src, amount),
+        _ => panic!("Invalid operand combination to asr instruction: asr {:?}, {:?}, {:?}", rd, rn, shift),
+    };
+
+    assert!(dst.num_bits == src.num_bits, "rd and rn must be of the same size.");
+    assert!(uimm_fits_bits(amount, 6), "The shift operand must be 6 bits or less.");
+
+    // ASR is encoded through the SBFM instruction.
+    let encoded: [u8; 4] = SBFM::asr(dst.reg_no, src.reg_no, amount.try_into().unwrap(), dst.num_bits).into();
+    cb.write_bytes(&encoded);
+}
+
+/// Reports whether an offset between two instructions can be encoded in a branch
+/// with or without link; if not, it has to be loaded into a register first.
+pub const fn b_offset_fits_bits(offset: i64) -> bool {
+    const B_OFFSET_BITS: u8 = 26;
+    imm_fits_bits(offset, B_OFFSET_BITS)
+}
+
+/// B - branch without link; `offset` is the number of instructions to jump
+pub fn b(cb: &mut CodeBlock, offset: InstructionOffset) {
+    let in_range = b_offset_fits_bits(offset.into());
+    assert!(in_range, "The immediate operand must be 26 bits or less.");
+    let encoded: [u8; 4] = Call::b(offset).into();
+    cb.write_bytes(&encoded);
+}
+
+/// Reports whether an offset, counted in instructions, can be encoded in a b.cond
+/// instruction. If it cannot, the value has to be loaded into a register first
+/// and b.cond is then used to skip past a direct jump.
+pub const fn bcond_offset_fits_bits(offset: i64) -> bool {
+    const BCOND_OFFSET_BITS: u8 = 19;
+    imm_fits_bits(offset, BCOND_OFFSET_BITS)
+}
+
+/// CBZ and CBNZ also have a limit of 19 bits for the branch offset.
+pub use bcond_offset_fits_bits as cmp_branch_offset_fits_bits;
+
+/// B.cond - branch by `offset` when the condition `cond` is true
+pub fn bcond(cb: &mut CodeBlock, cond: u8, offset: InstructionOffset) {
+    let in_range = bcond_offset_fits_bits(offset.into());
+    assert!(in_range, "The offset must be 19 bits or less.");
+    let encoded: [u8; 4] = BranchCond::bcond(cond, offset).into();
+    cb.write_bytes(&encoded);
+}
+
+/// BL - branch with link; `offset` is the number of instructions to jump
+pub fn bl(cb: &mut CodeBlock, offset: InstructionOffset) {
+    let in_range = b_offset_fits_bits(offset.into());
+    assert!(in_range, "The offset must be 26 bits or less.");
+    let encoded: [u8; 4] = Call::bl(offset).into();
+    cb.write_bytes(&encoded);
+}
+
+/// BLR - branch with link to a register; rn must be a register operand
+pub fn blr(cb: &mut CodeBlock, rn: A64Opnd) {
+    let encoded: [u8; 4] = if let A64Opnd::Reg(target) = rn {
+        Branch::blr(target.reg_no).into()
+    } else {
+        panic!("Invalid operand to blr instruction.")
+    };
+    cb.write_bytes(&encoded);
+}
+
+/// BR - branch to a register; rn must be a register operand
+pub fn br(cb: &mut CodeBlock, rn: A64Opnd) {
+    let encoded: [u8; 4] = if let A64Opnd::Reg(target) = rn {
+        Branch::br(target.reg_no).into()
+    } else {
+        panic!("Invalid operand to br instruction.")
+    };
+    cb.write_bytes(&encoded);
+}
+
+/// BRK - create a breakpoint; a `None` operand is encoded as immediate 0
+pub fn brk(cb: &mut CodeBlock, imm16: A64Opnd) {
+    let payload: u16 = match imm16 {
+        A64Opnd::None => 0,
+        A64Opnd::UImm(value) => {
+            assert!(uimm_fits_bits(value, 16), "The immediate operand must be 16 bits or less.");
+            value as u16
+        },
+        _ => panic!("Invalid operand combination to brk instruction.")
+    };
+    let encoded: [u8; 4] = Breakpoint::brk(payload).into();
+    cb.write_bytes(&encoded);
+}
+
+/// CMP - compare rn with rm (a register or an immediate) and update the flags
+pub fn cmp(cb: &mut CodeBlock, rn: A64Opnd, rm: A64Opnd) {
+    let encoded: [u8; 4] = match (rn, rm) {
+        (A64Opnd::Reg(lhs), A64Opnd::Reg(rhs)) => {
+            let same_width = lhs.num_bits == rhs.num_bits;
+            assert!(same_width, "All operands must be of the same size.");
+
+            DataReg::cmp(lhs.reg_no, rhs.reg_no, lhs.num_bits).into()
+        },
+        (A64Opnd::Reg(lhs), A64Opnd::Imm(signed)) => {
+            // The signed value is reinterpreted as unsigned before being converted.
+            let raw = signed as u64;
+            DataImm::cmp(lhs.reg_no, raw.try_into().unwrap(), lhs.num_bits).into()
+        },
+        (A64Opnd::Reg(lhs), A64Opnd::UImm(unsigned)) => {
+            DataImm::cmp(lhs.reg_no, unsigned.try_into().unwrap(), lhs.num_bits).into()
+        },
+        _ => panic!("Invalid operand combination to cmp instruction."),
+    };
+
+    cb.write_bytes(&encoded);
+}
+
+/// CSEL - conditionally select between registers rn and rm according to cond;
+/// rd, rn and rm must all be registers
+pub fn csel(cb: &mut CodeBlock, rd: A64Opnd, rn: A64Opnd, rm: A64Opnd, cond: u8) {
+    let (dst, lhs, rhs) = match (rd, rn, rm) {
+        (A64Opnd::Reg(dst), A64Opnd::Reg(lhs), A64Opnd::Reg(rhs)) => (dst, lhs, rhs),
+        _ => panic!("Invalid operand combination to csel instruction."),
+    };
+
+    // All three registers have to share one width.
+    let same_width = dst.num_bits == lhs.num_bits && lhs.num_bits == rhs.num_bits;
+    assert!(same_width, "All operands must be of the same size.");
+
+    let encoded: [u8; 4] = Conditional::csel(dst.reg_no, lhs.reg_no, rhs.reg_no, cond, dst.num_bits).into();
+
+    cb.write_bytes(&encoded);
+}
+
+/// EOR - bitwise XOR of rn with rm, where rm is either a register or an
+/// unsigned immediate encodable as a bitmask; result goes to rd, flags untouched
+pub fn eor(cb: &mut CodeBlock, rd: A64Opnd, rn: A64Opnd, rm: A64Opnd) {
+    let encoded: [u8; 4] = match (rd, rn, rm) {
+        (A64Opnd::Reg(dst), A64Opnd::Reg(lhs), A64Opnd::Reg(rhs)) => {
+            let same_width = dst.num_bits == lhs.num_bits && lhs.num_bits == rhs.num_bits;
+            assert!(same_width, "All operands must be of the same size.");
+
+            LogicalReg::eor(dst.reg_no, lhs.reg_no, rhs.reg_no, dst.num_bits).into()
+        },
+        (A64Opnd::Reg(dst), A64Opnd::Reg(lhs), A64Opnd::UImm(imm)) => {
+            assert!(dst.num_bits == lhs.num_bits, "rd and rn must be of the same size.");
+
+            // 32-bit destinations go through the dedicated 32-bit bitmask constructor.
+            let mask: BitmaskImmediate = match dst.num_bits {
+                32 => BitmaskImmediate::new_32b_reg(imm.try_into().unwrap()).unwrap(),
+                _ => imm.try_into().unwrap(),
+            };
+
+            LogicalImm::eor(dst.reg_no, lhs.reg_no, mask, dst.num_bits).into()
+        },
+        _ => panic!("Invalid operand combination to eor instruction."),
+    };
+
+    cb.write_bytes(&encoded);
+}
+
+/// LDADDAL - atomic add with acquire and release semantics; rs, rt and rn must
+/// all be registers
+pub fn ldaddal(cb: &mut CodeBlock, rs: A64Opnd, rt: A64Opnd, rn: A64Opnd) {
+    let (src, dst, base) = match (rs, rt, rn) {
+        (A64Opnd::Reg(src), A64Opnd::Reg(dst), A64Opnd::Reg(base)) => (src, dst, base),
+        _ => panic!("Invalid operand combination to ldaddal instruction."),
+    };
+
+    // All three registers have to share one width.
+    let same_width = src.num_bits == dst.num_bits && dst.num_bits == base.num_bits;
+    assert!(same_width, "All operands must be of the same size.");
+
+    let encoded: [u8; 4] = Atomic::ldaddal(src.reg_no, dst.reg_no, base.reg_no, src.num_bits).into();
+
+    cb.write_bytes(&encoded);
+}
+
+/// LDAXR - atomic load with acquire semantics; rn must be a 64-bit register and
+/// the width of rt selects the width of the load encoding
+pub fn ldaxr(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
+    let (dst, base) = match (rt, rn) {
+        (A64Opnd::Reg(dst), A64Opnd::Reg(base)) => (dst, base),
+        _ => panic!("Invalid operand combination to ldaxr instruction."),
+    };
+
+    assert_eq!(base.num_bits, 64, "rn must be a 64-bit register.");
+
+    let encoded: [u8; 4] = LoadStoreExclusive::ldaxr(dst.reg_no, base.reg_no, dst.num_bits).into();
+    cb.write_bytes(&encoded);
+}
+
+/// LDP (signed offset) - load two distinct registers from the memory operand rn
+pub fn ldp(cb: &mut CodeBlock, rt1: A64Opnd, rt2: A64Opnd, rn: A64Opnd) {
+    let (first, second, mem) = match (rt1, rt2, rn) {
+        (A64Opnd::Reg(first), A64Opnd::Reg(second), A64Opnd::Mem(mem)) => (first, second, mem),
+        _ => panic!("Invalid operand combination to ldp instruction.")
+    };
+
+    assert!(first.num_bits == second.num_bits, "Expected source registers to be the same size");
+    assert!(imm_fits_bits(mem.disp.into(), 10), "The displacement must be 10 bits or less.");
+    assert_ne!(first.reg_no, second.reg_no, "Behavior is unpredictable with pairs of the same register");
+
+    let disp = mem.disp as i16;
+    let encoded: [u8; 4] = RegisterPair::ldp(first.reg_no, second.reg_no, mem.base_reg_no, disp, first.num_bits).into();
+    cb.write_bytes(&encoded);
+}
+
+/// LDP (pre-index) - load two distinct registers from memory, updating the base pointer before the load
+pub fn ldp_pre(cb: &mut CodeBlock, rt1: A64Opnd, rt2: A64Opnd, rn: A64Opnd) {
+    let (first, second, mem) = match (rt1, rt2, rn) {
+        (A64Opnd::Reg(first), A64Opnd::Reg(second), A64Opnd::Mem(mem)) => (first, second, mem),
+        _ => panic!("Invalid operand combination to ldp instruction.")
+    };
+
+    assert!(first.num_bits == second.num_bits, "Expected source registers to be the same size");
+    assert!(imm_fits_bits(mem.disp.into(), 10), "The displacement must be 10 bits or less.");
+    assert_ne!(first.reg_no, second.reg_no, "Behavior is unpredictable with pairs of the same register");
+
+    let disp = mem.disp as i16;
+    let encoded: [u8; 4] = RegisterPair::ldp_pre(first.reg_no, second.reg_no, mem.base_reg_no, disp, first.num_bits).into();
+    cb.write_bytes(&encoded);
+}
+
+/// LDP (post-index) - load two distinct registers from memory, updating the base pointer after the load
+pub fn ldp_post(cb: &mut CodeBlock, rt1: A64Opnd, rt2: A64Opnd, rn: A64Opnd) {
+    let (first, second, mem) = match (rt1, rt2, rn) {
+        (A64Opnd::Reg(first), A64Opnd::Reg(second), A64Opnd::Mem(mem)) => (first, second, mem),
+        _ => panic!("Invalid operand combination to ldp instruction.")
+    };
+
+    assert!(first.num_bits == second.num_bits, "Expected source registers to be the same size");
+    assert!(imm_fits_bits(mem.disp.into(), 10), "The displacement must be 10 bits or less.");
+    assert_ne!(first.reg_no, second.reg_no, "Behavior is unpredictable with pairs of the same register");
+
+    let disp = mem.disp as i16;
+    let encoded: [u8; 4] = RegisterPair::ldp_post(first.reg_no, second.reg_no, mem.base_reg_no, disp, first.num_bits).into();
+    cb.write_bytes(&encoded);
+}
+
+/// LDR - load into rt using rn together with the register offset rm; all three
+/// operands must be registers of the same size
+pub fn ldr(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd, rm: A64Opnd) {
+    let (dst, base, index) = match (rt, rn, rm) {
+        (A64Opnd::Reg(dst), A64Opnd::Reg(base), A64Opnd::Reg(index)) => (dst, base, index),
+        _ => panic!("Invalid operand combination to ldr instruction.")
+    };
+
+    assert!(dst.num_bits == base.num_bits, "Expected registers to be the same size");
+    assert!(base.num_bits == index.num_bits, "Expected registers to be the same size");
+
+    let encoded: [u8; 4] = LoadRegister::ldr(dst.reg_no, base.reg_no, index.reg_no, dst.num_bits).into();
+    cb.write_bytes(&encoded);
+}
+
+/// LDR (literal) - load a PC-relative memory address into register rt; `rn` is
+/// the instruction offset handed to the encoder
+pub fn ldr_literal(cb: &mut CodeBlock, rt: A64Opnd, rn: InstructionOffset) {
+    let encoded: [u8; 4] = if let A64Opnd::Reg(dst) = rt {
+        LoadLiteral::ldr_literal(dst.reg_no, rn, dst.num_bits).into()
+    } else {
+        panic!("Invalid operand combination to ldr instruction.")
+    };
+
+    cb.write_bytes(&encoded);
+}
+
+/// LDRH - load a halfword from the memory operand rn into the 32-bit register rt
+pub fn ldrh(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
+    let (dst, mem) = match (rt, rn) {
+        (A64Opnd::Reg(dst), A64Opnd::Mem(mem)) => (dst, mem),
+        _ => panic!("Invalid operand combination to ldrh instruction.")
+    };
+
+    assert_eq!(dst.num_bits, 32, "Expected to be loading a halfword");
+    assert!(imm_fits_bits(mem.disp.into(), 12), "The displacement must be 12 bits or less.");
+
+    let disp = mem.disp as i16;
+    let encoded: [u8; 4] = HalfwordImm::ldrh(dst.reg_no, mem.base_reg_no, disp).into();
+    cb.write_bytes(&encoded);
+}
+
+/// LDRH (pre-index) - load a halfword from memory, updating the base pointer before the load
+pub fn ldrh_pre(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
+    let (dst, mem) = match (rt, rn) {
+        (A64Opnd::Reg(dst), A64Opnd::Mem(mem)) => (dst, mem),
+        _ => panic!("Invalid operand combination to ldrh instruction.")
+    };
+
+    assert_eq!(dst.num_bits, 32, "Expected to be loading a halfword");
+    assert!(imm_fits_bits(mem.disp.into(), 9), "The displacement must be 9 bits or less.");
+
+    let disp = mem.disp as i16;
+    let encoded: [u8; 4] = HalfwordImm::ldrh_pre(dst.reg_no, mem.base_reg_no, disp).into();
+    cb.write_bytes(&encoded);
+}
+
+/// LDRH (post-index) - load a halfword from memory, updating the base pointer after the load
+pub fn ldrh_post(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
+    let (dst, mem) = match (rt, rn) {
+        (A64Opnd::Reg(dst), A64Opnd::Mem(mem)) => (dst, mem),
+        _ => panic!("Invalid operand combination to ldrh instruction.")
+    };
+
+    assert_eq!(dst.num_bits, 32, "Expected to be loading a halfword");
+    assert!(imm_fits_bits(mem.disp.into(), 9), "The displacement must be 9 bits or less.");
+
+    let disp = mem.disp as i16;
+    let encoded: [u8; 4] = HalfwordImm::ldrh_post(dst.reg_no, mem.base_reg_no, disp).into();
+    cb.write_bytes(&encoded);
+}
+
+/// Whether or not a memory address displacement fits into 9 signed bits, so that
+/// it can be used without loading it into a register first. Usable in const contexts.
+pub const fn mem_disp_fits_bits(disp: i32) -> bool {
+    imm_fits_bits(disp as i64, 9)
+}
+
+/// LDR (post-index) - load a register from memory, updating the base pointer after the load
+pub fn ldr_post(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
+    let (dst, mem) = match (rt, rn) {
+        (A64Opnd::Reg(dst), A64Opnd::Mem(mem)) => (dst, mem),
+        _ => panic!("Invalid operand combination to ldr instruction."),
+    };
+
+    assert!(dst.num_bits == mem.num_bits, "All operands must be of the same size.");
+    assert!(mem_disp_fits_bits(mem.disp), "The displacement must be 9 bits or less.");
+
+    let disp = mem.disp as i16;
+    let encoded: [u8; 4] = LoadStore::ldr_post(dst.reg_no, mem.base_reg_no, disp, dst.num_bits).into();
+    cb.write_bytes(&encoded);
+}
+
+/// LDR (pre-index) - load a register from memory, updating the base pointer before the load
+pub fn ldr_pre(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
+    let (dst, mem) = match (rt, rn) {
+        (A64Opnd::Reg(dst), A64Opnd::Mem(mem)) => (dst, mem),
+        _ => panic!("Invalid operand combination to ldr instruction."),
+    };
+
+    assert!(dst.num_bits == mem.num_bits, "All operands must be of the same size.");
+    assert!(mem_disp_fits_bits(mem.disp), "The displacement must be 9 bits or less.");
+
+    let disp = mem.disp as i16;
+    let encoded: [u8; 4] = LoadStore::ldr_pre(dst.reg_no, mem.base_reg_no, disp, dst.num_bits).into();
+    cb.write_bytes(&encoded);
+}
+
+/// LDUR - load into rt from rn, which is either a register (used with a zero
+/// displacement) or a memory operand carrying its own displacement
+pub fn ldur(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
+    let (dst, base_reg_no, disp) = match (rt, rn) {
+        (A64Opnd::Reg(dst), A64Opnd::Reg(base)) => {
+            assert!(dst.num_bits == base.num_bits, "All operands must be of the same size.");
+            (dst, base.reg_no, 0)
+        },
+        (A64Opnd::Reg(dst), A64Opnd::Mem(mem)) => {
+            assert!(dst.num_bits == mem.num_bits, "Expected registers to be the same size");
+            assert!(mem_disp_fits_bits(mem.disp), "Expected displacement to be 9 bits or less");
+            (dst, mem.base_reg_no, mem.disp as i16)
+        },
+        _ => panic!("Invalid operands for LDUR")
+    };
+
+    let encoded: [u8; 4] = LoadStore::ldur(dst.reg_no, base_reg_no, disp, dst.num_bits).into();
+    cb.write_bytes(&encoded);
+}
+
+/// LDURH - load a halfword from memory, zero-extend it, and write it to a register
+pub fn ldurh(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
+    let (dst, mem) = match (rt, rn) {
+        (A64Opnd::Reg(dst), A64Opnd::Mem(mem)) => (dst, mem),
+        _ => panic!("Invalid operands for LDURH")
+    };
+
+    assert!(mem_disp_fits_bits(mem.disp), "Expected displacement to be 9 bits or less");
+
+    let disp = mem.disp as i16;
+    let encoded: [u8; 4] = LoadStore::ldurh(dst.reg_no, mem.base_reg_no, disp).into();
+    cb.write_bytes(&encoded);
+}
+
+/// LDURB - load a byte from memory, zero-extend it, and write it to a register
+pub fn ldurb(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
+    let (dst, mem) = match (rt, rn) {
+        (A64Opnd::Reg(dst), A64Opnd::Mem(mem)) => (dst, mem),
+        _ => panic!("Invalid operands for LDURB")
+    };
+
+    assert!(dst.num_bits == mem.num_bits, "Expected registers to be the same size");
+    assert!(dst.num_bits == 8, "Expected registers to have size 8");
+    assert!(mem_disp_fits_bits(mem.disp), "Expected displacement to be 9 bits or less");
+
+    let disp = mem.disp as i16;
+    let encoded: [u8; 4] = LoadStore::ldurb(dst.reg_no, mem.base_reg_no, disp).into();
+    cb.write_bytes(&encoded);
+}
+
+/// LDURSW - load a 32-bit value from the memory operand rn into rt and sign-extend it
+pub fn ldursw(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
+    let (dst, mem) = match (rt, rn) {
+        (A64Opnd::Reg(dst), A64Opnd::Mem(mem)) => (dst, mem),
+        _ => panic!("Invalid operand combination to ldursw instruction.")
+    };
+
+    assert!(dst.num_bits == mem.num_bits, "Expected registers to be the same size");
+    assert!(mem_disp_fits_bits(mem.disp), "Expected displacement to be 9 bits or less");
+
+    let disp = mem.disp as i16;
+    let encoded: [u8; 4] = LoadStore::ldursw(dst.reg_no, mem.base_reg_no, disp).into();
+    cb.write_bytes(&encoded);
+}
+
+/// LSL - logical shift left of register rn by an unsigned immediate, result in rd
+pub fn lsl(cb: &mut CodeBlock, rd: A64Opnd, rn: A64Opnd, shift: A64Opnd) {
+    let (dst, src, uimm) = match (rd, rn, shift) {
+        (A64Opnd::Reg(dst), A64Opnd::Reg(src), A64Opnd::UImm(uimm)) => (dst, src, uimm),
+        _ => panic!("Invalid operands combination to lsl instruction")
+    };
+
+    assert!(dst.num_bits == src.num_bits, "Expected registers to be the same size");
+    assert!(uimm_fits_bits(uimm, 6), "Expected shift to be 6 bits or less");
+
+    let amount = uimm as u8;
+    let encoded: [u8; 4] = ShiftImm::lsl(dst.reg_no, src.reg_no, amount, dst.num_bits).into();
+    cb.write_bytes(&encoded);
+}
+
+/// LSR - logical shift right of register rn by an unsigned immediate, result in rd
+pub fn lsr(cb: &mut CodeBlock, rd: A64Opnd, rn: A64Opnd, shift: A64Opnd) {
+    let (dst, src, uimm) = match (rd, rn, shift) {
+        (A64Opnd::Reg(dst), A64Opnd::Reg(src), A64Opnd::UImm(uimm)) => (dst, src, uimm),
+        _ => panic!("Invalid operands combination to lsr instruction")
+    };
+
+    assert!(dst.num_bits == src.num_bits, "Expected registers to be the same size");
+    assert!(uimm_fits_bits(uimm, 6), "Expected shift to be 6 bits or less");
+
+    let amount = uimm as u8;
+    let encoded: [u8; 4] = ShiftImm::lsr(dst.reg_no, src.reg_no, amount, dst.num_bits).into();
+    cb.write_bytes(&encoded);
+}
+
+/// MOV - copy a register or an encodable unsigned immediate into rd. Moves to or
+/// from register number 31 at 64 bits are emitted as an ADD with a zero immediate.
+pub fn mov(cb: &mut CodeBlock, rd: A64Opnd, rm: A64Opnd) {
+    let encoded: [u8; 4] = match (rd, rm) {
+        (A64Opnd::Reg(A64Reg { reg_no: 31, num_bits: 64 }), A64Opnd::Reg(src)) => {
+            assert!(src.num_bits == 64, "Expected rm to be 64 bits");
+
+            DataImm::add(31, src.reg_no, 0.try_into().unwrap(), 64).into()
+        },
+        (A64Opnd::Reg(dst), A64Opnd::Reg(A64Reg { reg_no: 31, num_bits: 64 })) => {
+            assert!(dst.num_bits == 64, "Expected rd to be 64 bits");
+
+            DataImm::add(dst.reg_no, 31, 0.try_into().unwrap(), 64).into()
+        },
+        (A64Opnd::Reg(dst), A64Opnd::Reg(src)) => {
+            assert!(dst.num_bits == src.num_bits, "Expected registers to be the same size");
+            LogicalReg::mov(dst.reg_no, src.reg_no, dst.num_bits).into()
+        },
+        // A zero immediate is moved in from XZR_REG instead of being encoded as a bitmask.
+        (A64Opnd::Reg(dst), A64Opnd::UImm(0)) => {
+            LogicalReg::mov(dst.reg_no, XZR_REG.reg_no, dst.num_bits).into()
+        },
+        (A64Opnd::Reg(dst), A64Opnd::UImm(imm)) => {
+            let mask: BitmaskImmediate = match dst.num_bits {
+                32 => BitmaskImmediate::new_32b_reg(imm.try_into().unwrap()).unwrap(),
+                _ => imm.try_into().unwrap(),
+            };
+
+            LogicalImm::mov(dst.reg_no, mask, dst.num_bits).into()
+        },
+        _ => panic!("Invalid operand combination to mov instruction")
+    };
+
+    cb.write_bytes(&encoded);
+}
+
+/// MOVK - move a 16 bit immediate into a register while keeping its other bits
+pub fn movk(cb: &mut CodeBlock, rd: A64Opnd, imm16: A64Opnd, shift: u8) {
+    let (dst, value) = match (rd, imm16) {
+        (A64Opnd::Reg(dst), A64Opnd::UImm(value)) => (dst, value),
+        _ => panic!("Invalid operand combination to movk instruction.")
+    };
+
+    assert!(uimm_fits_bits(value, 16), "The immediate operand must be 16 bits or less.");
+
+    let half = value as u16;
+    let encoded: [u8; 4] = Mov::movk(dst.reg_no, half, shift, dst.num_bits).into();
+    cb.write_bytes(&encoded);
+}
+
+/// MOVZ - move a 16 bit immediate into a register and zero its other bits
+pub fn movz(cb: &mut CodeBlock, rd: A64Opnd, imm16: A64Opnd, shift: u8) {
+    let (dst, value) = match (rd, imm16) {
+        (A64Opnd::Reg(dst), A64Opnd::UImm(value)) => (dst, value),
+        _ => panic!("Invalid operand combination to movz instruction.")
+    };
+
+    assert!(uimm_fits_bits(value, 16), "The immediate operand must be 16 bits or less.");
+
+    let half = value as u16;
+    let encoded: [u8; 4] = Mov::movz(dst.reg_no, half, shift, dst.num_bits).into();
+    cb.write_bytes(&encoded);
+}
+
+/// MRS - copy the given system register into the general-purpose register rt
+pub fn mrs(cb: &mut CodeBlock, rt: A64Opnd, systemregister: SystemRegister) {
+    // The destination has to be a register operand; any other operand kind panics.
+    let insn: [u8; 4] = if let A64Opnd::Reg(dest) = rt {
+        SysReg::mrs(dest.reg_no, systemregister).into()
+    } else {
+        panic!("Invalid operand combination to mrs instruction")
+    };
+
+    cb.write_bytes(&insn);
+}
+
+/// MSR - copy the general-purpose register rt into the given system register
+pub fn msr(cb: &mut CodeBlock, systemregister: SystemRegister, rt: A64Opnd) {
+    // The source has to be a register operand; any other operand kind panics.
+    let insn: [u8; 4] = if let A64Opnd::Reg(src) = rt {
+        SysReg::msr(systemregister, src.reg_no).into()
+    } else {
+        panic!("Invalid operand combination to msr instruction")
+    };
+
+    cb.write_bytes(&insn);
+}
+
+/// MUL - multiply rn by rm and put the product in rd
+pub fn mul(cb: &mut CodeBlock, rd: A64Opnd, rn: A64Opnd, rm: A64Opnd) {
+    // All three operands must be registers sharing one width.
+    let insn: [u8; 4] = if let (A64Opnd::Reg(dest), A64Opnd::Reg(lhs), A64Opnd::Reg(rhs)) = (rd, rn, rm) {
+        assert!(dest.num_bits == lhs.num_bits && lhs.num_bits == rhs.num_bits, "Expected registers to be the same size");
+
+        MAdd::mul(dest.reg_no, lhs.reg_no, rhs.reg_no, dest.num_bits).into()
+    } else {
+        panic!("Invalid operand combination to mul instruction")
+    };
+
+    cb.write_bytes(&insn);
+}
+
+/// SMULH - multiply two 64-bit registers to produce a 128-bit result, put the high 64-bits of the result into rd
+pub fn smulh(cb: &mut CodeBlock, rd: A64Opnd, rn: A64Opnd, rm: A64Opnd) {
+    let insn: [u8; 4] = if let (A64Opnd::Reg(dest), A64Opnd::Reg(lhs), A64Opnd::Reg(rhs)) = (rd, rn, rm) {
+        assert!(dest.num_bits == lhs.num_bits && lhs.num_bits == rhs.num_bits, "Expected registers to be the same size");
+        assert!(dest.num_bits == 64, "smulh only applicable to 64-bit registers");
+
+        SMulH::smulh(dest.reg_no, lhs.reg_no, rhs.reg_no).into()
+    } else {
+        // Name the mnemonic that was actually requested; this message used to say "mul".
+        panic!("Invalid operand combination to smulh instruction")
+    };
+
+    cb.write_bytes(&insn);
+}
+
+/// MVN - write the bitwise NOT of the value in rm into rd
+pub fn mvn(cb: &mut CodeBlock, rd: A64Opnd, rm: A64Opnd) {
+    // Both operands must be registers of the same width.
+    let insn: [u8; 4] = if let (A64Opnd::Reg(dest), A64Opnd::Reg(src)) = (rd, rm) {
+        assert!(dest.num_bits == src.num_bits, "Expected registers to be the same size");
+
+        LogicalReg::mvn(dest.reg_no, src.reg_no, dest.num_bits).into()
+    } else {
+        panic!("Invalid operand combination to mvn instruction")
+    };
+
+    cb.write_bytes(&insn);
+}
+
+/// NOP - emit one instruction that does nothing, used for alignment purposes
+pub fn nop(cb: &mut CodeBlock) {
+    // NOP takes no operands, so the encoder needs no arguments.
+    let insn: [u8; 4] = Nop::nop().into();
+    cb.write_bytes(&insn);
+}
+
+/// ORN - bitwise OR of rn with NOT rm, result goes to rd, flags are left alone
+pub fn orn(cb: &mut CodeBlock, rd: A64Opnd, rn: A64Opnd, rm: A64Opnd) {
+    // All three operands must be registers sharing one width.
+    let insn: [u8; 4] = if let (A64Opnd::Reg(dest), A64Opnd::Reg(lhs), A64Opnd::Reg(rhs)) = (rd, rn, rm) {
+        assert!(dest.num_bits == lhs.num_bits && lhs.num_bits == rhs.num_bits, "Expected registers to be the same size");
+
+        LogicalReg::orn(dest.reg_no, lhs.reg_no, rhs.reg_no, dest.num_bits).into()
+    } else {
+        panic!("Invalid operand combination to orn instruction.")
+    };
+
+    cb.write_bytes(&insn);
+}
+
+/// ORR - bitwise OR of rn with a register or an immediate, result goes to rd, flags are left alone
+pub fn orr(cb: &mut CodeBlock, rd: A64Opnd, rn: A64Opnd, rm: A64Opnd) {
+    let insn: [u8; 4] = match (rd, rn, rm) {
+        // Register form: every operand has to share one width.
+        (A64Opnd::Reg(dest), A64Opnd::Reg(lhs), A64Opnd::Reg(rhs)) => {
+            assert!(
+                dest.num_bits == lhs.num_bits && lhs.num_bits == rhs.num_bits,
+                "All operands must be of the same size."
+            );
+            LogicalReg::orr(dest.reg_no, lhs.reg_no, rhs.reg_no, dest.num_bits).into()
+        },
+        // Immediate form: the unwraps panic when the value has no bitmask-immediate form.
+        (A64Opnd::Reg(dest), A64Opnd::Reg(lhs), A64Opnd::UImm(imm)) => {
+            assert!(dest.num_bits == lhs.num_bits, "rd and rn must be of the same size.");
+            let mask = match dest.num_bits {
+                32 => BitmaskImmediate::new_32b_reg(imm.try_into().unwrap()),
+                _ => imm.try_into(),
+            }.unwrap();
+
+            LogicalImm::orr(dest.reg_no, lhs.reg_no, mask, dest.num_bits).into()
+        },
+        _ => panic!("Invalid operand combination to orr instruction."),
+    };
+
+    cb.write_bytes(&insn);
+}
+
+/// STLXR - store rt to the address in rn and release exclusive access, using rs as the 32-bit status register
+pub fn stlxr(cb: &mut CodeBlock, rs: A64Opnd, rt: A64Opnd, rn: A64Opnd) {
+    // All three operands must be registers; only the widths of rs and rn are checked here.
+    let insn: [u8; 4] = if let (A64Opnd::Reg(status), A64Opnd::Reg(value), A64Opnd::Reg(addr)) = (rs, rt, rn) {
+        assert_eq!(status.num_bits, 32, "rs must be a 32-bit register.");
+        assert_eq!(addr.num_bits, 64, "rn must be a 64-bit register.");
+
+        LoadStoreExclusive::stlxr(status.reg_no, value.reg_no, addr.reg_no, addr.num_bits).into()
+    } else {
+        panic!("Invalid operand combination to stlxr instruction.")
+    };
+
+    cb.write_bytes(&insn);
+}
+
+/// STP (signed offset) - store the register pair rt1, rt2 to the memory operand rn
+pub fn stp(cb: &mut CodeBlock, rt1: A64Opnd, rt2: A64Opnd, rn: A64Opnd) {
+    // Needs two register sources followed by a memory destination.
+    let insn: [u8; 4] = if let (A64Opnd::Reg(first), A64Opnd::Reg(second), A64Opnd::Mem(mem)) = (rt1, rt2, rn) {
+        assert!(first.num_bits == second.num_bits, "Expected source registers to be the same size");
+        assert!(imm_fits_bits(mem.disp.into(), 10), "The displacement must be 10 bits or less.");
+        assert_ne!(first.reg_no, second.reg_no, "Behavior is unpredictable with pairs of the same register");
+
+        RegisterPair::stp(first.reg_no, second.reg_no, mem.base_reg_no, mem.disp as i16, first.num_bits).into()
+    } else {
+        panic!("Invalid operand combination to stp instruction.")
+    };
+
+    cb.write_bytes(&insn);
+}
+
+/// STP (pre-index) - store a pair of registers to memory, update the base pointer before storing
+pub fn stp_pre(cb: &mut CodeBlock, rt1: A64Opnd, rt2: A64Opnd, rn: A64Opnd) {
+    // Needs two register sources followed by a memory destination.
+    let insn: [u8; 4] = if let (A64Opnd::Reg(first), A64Opnd::Reg(second), A64Opnd::Mem(mem)) = (rt1, rt2, rn) {
+        assert!(first.num_bits == second.num_bits, "Expected source registers to be the same size");
+        assert!(imm_fits_bits(mem.disp.into(), 10), "The displacement must be 10 bits or less.");
+        assert_ne!(first.reg_no, second.reg_no, "Behavior is unpredictable with pairs of the same register");
+
+        RegisterPair::stp_pre(first.reg_no, second.reg_no, mem.base_reg_no, mem.disp as i16, first.num_bits).into()
+    } else {
+        panic!("Invalid operand combination to stp instruction.")
+    };
+
+    cb.write_bytes(&insn);
+}
+
+/// STP (post-index) - store a pair of registers to memory, update the base pointer after storing
+pub fn stp_post(cb: &mut CodeBlock, rt1: A64Opnd, rt2: A64Opnd, rn: A64Opnd) {
+    // Needs two register sources followed by a memory destination.
+    let insn: [u8; 4] = if let (A64Opnd::Reg(first), A64Opnd::Reg(second), A64Opnd::Mem(mem)) = (rt1, rt2, rn) {
+        assert!(first.num_bits == second.num_bits, "Expected source registers to be the same size");
+        assert!(imm_fits_bits(mem.disp.into(), 10), "The displacement must be 10 bits or less.");
+        assert_ne!(first.reg_no, second.reg_no, "Behavior is unpredictable with pairs of the same register");
+
+        RegisterPair::stp_post(first.reg_no, second.reg_no, mem.base_reg_no, mem.disp as i16, first.num_bits).into()
+    } else {
+        panic!("Invalid operand combination to stp instruction.")
+    };
+
+    cb.write_bytes(&insn);
+}
+
+/// STR (post-index) - store a register to memory, update the base pointer after storing
+pub fn str_post(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
+    // Needs a register source and a memory destination of matching width.
+    let insn: [u8; 4] = if let (A64Opnd::Reg(src), A64Opnd::Mem(mem)) = (rt, rn) {
+        assert!(src.num_bits == mem.num_bits, "All operands must be of the same size.");
+        assert!(mem_disp_fits_bits(mem.disp), "The displacement must be 9 bits or less.");
+
+        LoadStore::str_post(src.reg_no, mem.base_reg_no, mem.disp as i16, src.num_bits).into()
+    } else {
+        panic!("Invalid operand combination to str instruction.")
+    };
+
+    cb.write_bytes(&insn);
+}
+
+/// STR (pre-index) - store a register to memory, update the base pointer before storing
+pub fn str_pre(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
+    // Needs a register source and a memory destination of matching width.
+    let insn: [u8; 4] = if let (A64Opnd::Reg(src), A64Opnd::Mem(mem)) = (rt, rn) {
+        assert!(src.num_bits == mem.num_bits, "All operands must be of the same size.");
+        assert!(mem_disp_fits_bits(mem.disp), "The displacement must be 9 bits or less.");
+
+        LoadStore::str_pre(src.reg_no, mem.base_reg_no, mem.disp as i16, src.num_bits).into()
+    } else {
+        panic!("Invalid operand combination to str instruction.")
+    };
+
+    cb.write_bytes(&insn);
+}
+
+/// STRH - store a halfword from rt into the memory operand rn
+pub fn strh(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
+    let bytes: [u8; 4] = match (rt, rn) {
+        (A64Opnd::Reg(rt), A64Opnd::Mem(rn)) => {
+            // rt must be given at 32 bits; the message now says "storing" because this is a store.
+            assert_eq!(rt.num_bits, 32, "Expected to be storing a halfword");
+            assert!(imm_fits_bits(rn.disp.into(), 12), "The displacement must be 12 bits or less.");
+            HalfwordImm::strh(rt.reg_no, rn.base_reg_no, rn.disp as i16).into()
+        },
+        _ => panic!("Invalid operand combination to strh instruction.")
+    };
+
+    cb.write_bytes(&bytes);
+}
+
+/// STRH (pre-index) - store a halfword into memory, update the base pointer before storing
+pub fn strh_pre(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
+    let bytes: [u8; 4] = match (rt, rn) {
+        (A64Opnd::Reg(rt), A64Opnd::Mem(rn)) => {
+            // rt must be given at 32 bits; the message now says "storing" because this is a store.
+            assert_eq!(rt.num_bits, 32, "Expected to be storing a halfword");
+            assert!(imm_fits_bits(rn.disp.into(), 9), "The displacement must be 9 bits or less.");
+            HalfwordImm::strh_pre(rt.reg_no, rn.base_reg_no, rn.disp as i16).into()
+        },
+        _ => panic!("Invalid operand combination to strh instruction.")
+    };
+
+    cb.write_bytes(&bytes);
+}
+
+/// STRH (post-index) - store a halfword into memory, update the base pointer after storing
+pub fn strh_post(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
+    let bytes: [u8; 4] = match (rt, rn) {
+        (A64Opnd::Reg(rt), A64Opnd::Mem(rn)) => {
+            // rt must be given at 32 bits; the message now says "storing" because this is a store.
+            assert_eq!(rt.num_bits, 32, "Expected to be storing a halfword");
+            assert!(imm_fits_bits(rn.disp.into(), 9), "The displacement must be 9 bits or less.");
+            HalfwordImm::strh_post(rt.reg_no, rn.base_reg_no, rn.disp as i16).into()
+        },
+        _ => panic!("Invalid operand combination to strh instruction.")
+    };
+
+    cb.write_bytes(&bytes);
+}
+
+/// STUR - store the value in rt to the memory operand rn (base register plus displacement)
+pub fn stur(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
+    // The store width is taken from the memory operand (32 or 64 bits), not from rt.
+    let insn: [u8; 4] = if let (A64Opnd::Reg(rt), A64Opnd::Mem(rn)) = (rt, rn) {
+        assert!(rn.num_bits == 32 || rn.num_bits == 64);
+        assert!(mem_disp_fits_bits(rn.disp), "Expected displacement to be 9 bits or less");
+
+        LoadStore::stur(rt.reg_no, rn.base_reg_no, rn.disp as i16, rn.num_bits).into()
+    } else {
+        panic!("Invalid operand combination to stur instruction.")
+    };
+
+    cb.write_bytes(&insn);
+}
+
+/// STURH - store a halfword from rt to the 16-bit memory operand rn
+pub fn sturh(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
+    let insn: [u8; 4] = if let (A64Opnd::Reg(rt), A64Opnd::Mem(rn)) = (rt, rn) {
+        assert!(rn.num_bits == 16);
+        assert!(mem_disp_fits_bits(rn.disp), "Expected displacement to be 9 bits or less");
+
+        LoadStore::sturh(rt.reg_no, rn.base_reg_no, rn.disp as i16).into()
+    } else {
+        // Name the halfword mnemonic; this message used to say plain "stur".
+        panic!("Invalid operand combination to sturh instruction.")
+    };
+
+    cb.write_bytes(&insn);
+}
+
+/// SUB - subtract rm (a register or an immediate) from rn, put the result in rd, don't update flags
+pub fn sub(cb: &mut CodeBlock, rd: A64Opnd, rn: A64Opnd, rm: A64Opnd) {
+    let bytes: [u8; 4] = match (rd, rn, rm) {
+        (A64Opnd::Reg(rd), A64Opnd::Reg(rn), A64Opnd::Reg(rm)) => {
+            assert!(
+                rd.num_bits == rn.num_bits && rn.num_bits == rm.num_bits,
+                "All operands must be of the same size."
+            );
+
+            DataReg::sub(rd.reg_no, rn.reg_no, rm.reg_no, rd.num_bits).into()
+        },
+        (A64Opnd::Reg(rd), A64Opnd::Reg(rn), A64Opnd::UImm(uimm12)) => {
+            assert!(rd.num_bits == rn.num_bits, "rd and rn must be of the same size.");
+            // unwrap() panics when the value cannot be converted into the encoder's immediate type.
+            DataImm::sub(rd.reg_no, rn.reg_no, uimm12.try_into().unwrap(), rd.num_bits).into()
+        },
+        (A64Opnd::Reg(rd), A64Opnd::Reg(rn), A64Opnd::Imm(imm12)) => {
+            assert!(rd.num_bits == rn.num_bits, "rd and rn must be of the same size.");
+            // A negative immediate is emitted as ADD of its magnitude; unsigned_abs() cannot overflow on i64::MIN.
+            if imm12 < 0 {
+                DataImm::add(rd.reg_no, rn.reg_no, imm12.unsigned_abs().try_into().unwrap(), rd.num_bits).into()
+            } else {
+                DataImm::sub(rd.reg_no, rn.reg_no, (imm12 as u64).try_into().unwrap(), rd.num_bits).into()
+            }
+        },
+        _ => panic!("Invalid operand combination to sub instruction."),
+    };
+
+    cb.write_bytes(&bytes);
+}
+
+/// SUBS - subtract rm (a register or an immediate) from rn, put the result in rd, update flags
+pub fn subs(cb: &mut CodeBlock, rd: A64Opnd, rn: A64Opnd, rm: A64Opnd) {
+    let bytes: [u8; 4] = match (rd, rn, rm) {
+        (A64Opnd::Reg(rd), A64Opnd::Reg(rn), A64Opnd::Reg(rm)) => {
+            assert!(
+                rd.num_bits == rn.num_bits && rn.num_bits == rm.num_bits,
+                "All operands must be of the same size."
+            );
+
+            DataReg::subs(rd.reg_no, rn.reg_no, rm.reg_no, rd.num_bits).into()
+        },
+        (A64Opnd::Reg(rd), A64Opnd::Reg(rn), A64Opnd::UImm(uimm12)) => {
+            assert!(rd.num_bits == rn.num_bits, "rd and rn must be of the same size.");
+            // unwrap() panics when the value cannot be converted into the encoder's immediate type.
+            DataImm::subs(rd.reg_no, rn.reg_no, uimm12.try_into().unwrap(), rd.num_bits).into()
+        },
+        (A64Opnd::Reg(rd), A64Opnd::Reg(rn), A64Opnd::Imm(imm12)) => {
+            assert!(rd.num_bits == rn.num_bits, "rd and rn must be of the same size.");
+            // A negative immediate is emitted as ADDS of its magnitude; unsigned_abs() cannot overflow on i64::MIN.
+            if imm12 < 0 {
+                DataImm::adds(rd.reg_no, rn.reg_no, imm12.unsigned_abs().try_into().unwrap(), rd.num_bits).into()
+            } else {
+                DataImm::subs(rd.reg_no, rn.reg_no, (imm12 as u64).try_into().unwrap(), rd.num_bits).into()
+            }
+        },
+        _ => panic!("Invalid operand combination to subs instruction."),
+    };
+
+    cb.write_bytes(&bytes);
+}
+
+/// SXTW - sign extend the 32-bit register rn into the 64-bit register rd
+pub fn sxtw(cb: &mut CodeBlock, rd: A64Opnd, rn: A64Opnd) {
+    // The widths are fixed: a 64-bit destination and a 32-bit source.
+    let insn: [u8; 4] = if let (A64Opnd::Reg(dest), A64Opnd::Reg(src)) = (rd, rn) {
+        assert_eq!(dest.num_bits, 64, "rd must be 64-bits wide.");
+        assert_eq!(src.num_bits, 32, "rn must be 32-bits wide.");
+
+        SBFM::sxtw(dest.reg_no, src.reg_no).into()
+    } else {
+        panic!("Invalid operand combination to sxtw instruction.")
+    };
+
+    cb.write_bytes(&insn);
+}
+
+/// RET - return unconditionally to the location in rn; `A64Opnd::None` selects register 30
+pub fn ret(cb: &mut CodeBlock, rn: A64Opnd) {
+    // Work out which register number holds the return location before encoding.
+    let target_reg_no = match rn {
+        A64Opnd::Reg(reg) => reg.reg_no,
+        A64Opnd::None => 30,
+        _ => panic!("Invalid operand to ret instruction.")
+    };
+    let insn: [u8; 4] = Branch::ret(target_reg_no).into();
+    cb.write_bytes(&insn);
+}
+
+/// TBNZ - test one bit of rt and branch to `offset` if that bit is not zero
+pub fn tbnz(cb: &mut CodeBlock, rt: A64Opnd, bit_num: A64Opnd, offset: A64Opnd) {
+    // Expects a register, an unsigned bit number and a signed offset, in that order.
+    let insn: [u8; 4] = if let (A64Opnd::Reg(reg), A64Opnd::UImm(bit), A64Opnd::Imm(delta)) = (rt, bit_num, offset) {
+        TestBit::tbnz(reg.reg_no, bit.try_into().unwrap(), delta.try_into().unwrap()).into()
+    } else {
+        panic!("Invalid operand combination to tbnz instruction.")
+    };
+
+    cb.write_bytes(&insn);
+}
+
+/// TBZ - test one bit of rt and branch to `offset` if that bit is zero
+pub fn tbz(cb: &mut CodeBlock, rt: A64Opnd, bit_num: A64Opnd, offset: A64Opnd) {
+    // Expects a register, an unsigned bit number and a signed offset, in that order.
+    let insn: [u8; 4] = if let (A64Opnd::Reg(reg), A64Opnd::UImm(bit), A64Opnd::Imm(delta)) = (rt, bit_num, offset) {
+        TestBit::tbz(reg.reg_no, bit.try_into().unwrap(), delta.try_into().unwrap()).into()
+    } else {
+        panic!("Invalid operand combination to tbz instruction.")
+    };
+
+    cb.write_bytes(&insn);
+}
+
+/// TST - test the bits of rn against a register or immediate mask, then update flags
+pub fn tst(cb: &mut CodeBlock, rn: A64Opnd, rm: A64Opnd) {
+    let insn: [u8; 4] = match (rn, rm) {
+        (A64Opnd::Reg(lhs), A64Opnd::Reg(rhs)) => {
+            assert!(lhs.num_bits == rhs.num_bits, "All operands must be of the same size.");
+
+            LogicalReg::tst(lhs.reg_no, rhs.reg_no, lhs.num_bits).into()
+        },
+        (A64Opnd::Reg(lhs), A64Opnd::UImm(imm)) => {
+            // A 32-bit register goes through the 32-bit constructor; otherwise the u64 is converted directly.
+            let mask = match lhs.num_bits {
+                32 => BitmaskImmediate::new_32b_reg(imm.try_into().unwrap()),
+                _ => imm.try_into(),
+            }.unwrap();
+
+            LogicalImm::tst(lhs.reg_no, mask, lhs.num_bits).into()
+        },
+        _ => panic!("Invalid operand combination to tst instruction."),
+    };
+
+    cb.write_bytes(&insn);
+}
+
+/// CBZ - branch to `offset` if the register rt is zero
+pub fn cbz(cb: &mut CodeBlock, rt: A64Opnd, offset: InstructionOffset) {
+    assert!(imm_fits_bits(offset.into(), 19), "jump offset for cbz must fit in 19 bits");
+    let insn: [u8; 4] = match rt {
+        // Passing op = false selects CBZ in the shared encoder.
+        A64Opnd::Reg(reg) => cbz_cbnz(reg.num_bits, false, offset, reg.reg_no),
+        _ => panic!("Invalid operand combination to cbz instruction.")
+    };
+
+    cb.write_bytes(&insn);
+}
+
+/// CBNZ - branch to `offset` if the register rt is non-zero
+pub fn cbnz(cb: &mut CodeBlock, rt: A64Opnd, offset: InstructionOffset) {
+    assert!(imm_fits_bits(offset.into(), 19), "jump offset for cbnz must fit in 19 bits");
+    let insn: [u8; 4] = match rt {
+        // Passing op = true selects CBNZ in the shared encoder.
+        A64Opnd::Reg(reg) => cbz_cbnz(reg.num_bits, true, offset, reg.reg_no),
+        _ => panic!("Invalid operand combination to cbnz instruction.")
+    };
+
+    cb.write_bytes(&insn);
+}
+
+/// Shared encoder for Compare and Branch on Zero (CBZ, `op=0`) and Compare and Branch on Nonzero
+/// (CBNZ, `op=1`); `num_bits` selects sf, `offset` is truncated into imm19 and `rt` fills Rt.
+///
+/// <https://developer.arm.com/documentation/ddi0602/2024-03/Base-Instructions/CBZ--Compare-and-Branch-on-Zero->
+///
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// | sf  0  1  1    0  1  0 op                                                                                     |
+/// |                             imm19........................................................... Rt.............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+fn cbz_cbnz(num_bits: u8, op: bool, offset: InstructionOffset, rt: u8) -> [u8; 4] {
+    let sf_field = (Sf::from(num_bits) as u32) << 31;
+    let op_field = u32::from(op) << 24;
+    let imm19_field = truncate_imm::<_, 19>(offset) << 5;
+    let fixed_bits: u32 = 0b11010 << 25;
+    (sf_field | fixed_bits | op_field | imm19_field | u32::from(rt)).to_le_bytes()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Run `run` against a fresh dummy code block and assert that its `{:x}` formatting equals `bytes`
+    fn check_bytes<R>(bytes: &str, run: R) where R: FnOnce(&mut super::CodeBlock) {
+        let mut cb = super::CodeBlock::new_dummy(128);
+        run(&mut cb);
+        assert_eq!(format!("{:x}", cb), bytes);
+    }
+
+    #[test]
+    fn test_imm_fits_bits() {
+        assert!(imm_fits_bits(i8::MAX.into(), 8));
+        assert!(imm_fits_bits(i8::MIN.into(), 8));
+
+        assert!(imm_fits_bits(i16::MAX.into(), 16));
+        assert!(imm_fits_bits(i16::MIN.into(), 16));
+
+        assert!(imm_fits_bits(i32::MAX.into(), 32));
+        assert!(imm_fits_bits(i32::MIN.into(), 32));
+
+        assert!(imm_fits_bits(i64::MAX, 64));
+        assert!(imm_fits_bits(i64::MIN, 64));
+    }
+
+    #[test]
+    fn test_uimm_fits_bits() {
+        assert!(uimm_fits_bits(u8::MAX.into(), 8));
+        assert!(uimm_fits_bits(u16::MAX.into(), 16));
+        assert!(uimm_fits_bits(u32::MAX.into(), 32));
+        assert!(uimm_fits_bits(u64::MAX, 64));
+    }
+
+    #[test]
+    fn test_add_reg() {
+        check_bytes("2000028b", |cb| add(cb, X0, X1, X2));
+    }
+
+    #[test]
+    fn test_add_uimm() {
+        check_bytes("201c0091", |cb| add(cb, X0, X1, A64Opnd::new_uimm(7)));
+    }
+
+    #[test]
+    fn test_add_imm_positive() {
+        check_bytes("201c0091", |cb| add(cb, X0, X1, A64Opnd::new_imm(7)));
+    }
+
+    #[test]
+    fn test_add_imm_negative() {
+        check_bytes("201c00d1", |cb| add(cb, X0, X1, A64Opnd::new_imm(-7))); // same bytes as test_sub_uimm
+    }
+
+    #[test]
+    fn test_adds_reg() {
+        check_bytes("200002ab", |cb| adds(cb, X0, X1, X2));
+    }
+
+    #[test]
+    fn test_adds_uimm() {
+        check_bytes("201c00b1", |cb| adds(cb, X0, X1, A64Opnd::new_uimm(7)));
+    }
+
+    #[test]
+    fn test_adds_imm_positive() {
+        check_bytes("201c00b1", |cb| adds(cb, X0, X1, A64Opnd::new_imm(7)));
+    }
+
+    #[test]
+    fn test_adds_imm_negative() {
+        check_bytes("201c00f1", |cb| adds(cb, X0, X1, A64Opnd::new_imm(-7))); // same bytes as test_subs_uimm
+    }
+
+    #[test]
+    fn test_adr() {
+        check_bytes("aa000010", |cb| adr(cb, X10, A64Opnd::new_imm(20)));
+    }
+
+    #[test]
+    fn test_adrp() {
+        check_bytes("4a000090", |cb| adrp(cb, X10, A64Opnd::new_imm(0x8000)));
+    }
+
+    #[test]
+    fn test_and_register() {
+        check_bytes("2000028a", |cb| and(cb, X0, X1, X2));
+    }
+
+    #[test]
+    fn test_and_immediate() {
+        check_bytes("20084092", |cb| and(cb, X0, X1, A64Opnd::new_uimm(7)));
+    }
+
+    #[test]
+    fn test_and_32b_immediate() {
+        check_bytes("404c0012", |cb| and(cb, W0, W2, A64Opnd::new_uimm(0xfffff)));
+    }
+
+    #[test]
+    fn test_ands_register() {
+        check_bytes("200002ea", |cb| ands(cb, X0, X1, X2));
+    }
+
+    #[test]
+    fn test_ands_immediate() {
+        check_bytes("200840f2", |cb| ands(cb, X0, X1, A64Opnd::new_uimm(7)));
+    }
+
+    #[test]
+    fn test_asr() {
+        check_bytes("b4fe4a93", |cb| asr(cb, X20, X21, A64Opnd::new_uimm(10)));
+    }
+
+    #[test]
+    fn test_bcond() {
+        let offset = InstructionOffset::from_insns(0x100);
+        check_bytes("01200054", |cb| bcond(cb, Condition::NE, offset));
+    }
+
+    #[test]
+    fn test_b() {
+        let offset = InstructionOffset::from_insns((1 << 25) - 1);
+        check_bytes("ffffff15", |cb| b(cb, offset));
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_b_too_big() {
+        // There are 26 bits available; 1 << 25 is one past the offset accepted in test_b
+        let offset = InstructionOffset::from_insns(1 << 25);
+        check_bytes("", |cb| b(cb, offset));
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_b_too_small() {
+        // There are 26 bits available; -(1 << 25) - 1 is expected to be rejected
+        let offset = InstructionOffset::from_insns(-(1 << 25) - 1);
+        check_bytes("", |cb| b(cb, offset));
+    }
+
+    #[test]
+    fn test_bl() {
+        let offset = InstructionOffset::from_insns(-(1 << 25));
+        check_bytes("00000096", |cb| bl(cb, offset));
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_bl_too_big() {
+        // There are 26 bits available; 1 << 25 is expected to be rejected
+        let offset = InstructionOffset::from_insns(1 << 25);
+        check_bytes("", |cb| bl(cb, offset));
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_bl_too_small() {
+        // There are 26 bits available; -(1 << 25) - 1 is one below the offset accepted in test_bl
+        let offset = InstructionOffset::from_insns(-(1 << 25) - 1);
+        check_bytes("", |cb| bl(cb, offset));
+    }
+
+    #[test]
+    fn test_blr() {
+        check_bytes("80023fd6", |cb| blr(cb, X20));
+    }
+
+    #[test]
+    fn test_br() {
+        check_bytes("80021fd6", |cb| br(cb, X20));
+    }
+
+    #[test]
+    fn test_cbz() {
+        let offset = InstructionOffset::from_insns(-1);
+        check_bytes("e0ffffb4e0ffff34", |cb| {
+            cbz(cb, X0, offset);
+            cbz(cb, W0, offset);
+        });
+    }
+
+    #[test]
+    fn test_cbnz() {
+        let offset = InstructionOffset::from_insns(2);
+        check_bytes("540000b554000035", |cb| {
+            cbnz(cb, X20, offset);
+            cbnz(cb, W20, offset);
+        });
+    }
+
+    #[test]
+    fn test_brk_none() {
+        check_bytes("000020d4", |cb| brk(cb, A64Opnd::None));
+    }
+
+    #[test]
+    fn test_brk_uimm() {
+        check_bytes("c00120d4", |cb| brk(cb, A64Opnd::new_uimm(14)));
+    }
+
+    #[test]
+    fn test_cmp_register() {
+        check_bytes("5f010beb", |cb| cmp(cb, X10, X11));
+    }
+
+    #[test]
+    fn test_cmp_immediate() {
+        check_bytes("5f3900f1", |cb| cmp(cb, X10, A64Opnd::new_uimm(14)));
+    }
+
+    #[test]
+    fn test_csel() {
+        check_bytes("6a018c9a", |cb| csel(cb, X10, X11, X12, Condition::EQ));
+    }
+
+    #[test]
+    fn test_eor_register() {
+        check_bytes("6a010cca", |cb| eor(cb, X10, X11, X12));
+    }
+
+    #[test]
+    fn test_eor_immediate() {
+        check_bytes("6a0940d2", |cb| eor(cb, X10, X11, A64Opnd::new_uimm(7)));
+    }
+
+    #[test]
+    fn test_eor_32b_immediate() {
+        check_bytes("29040152", |cb| eor(cb, W9, W1, A64Opnd::new_uimm(0x80000001)));
+    }
+
+    #[test]
+    fn test_ldaddal() {
+        check_bytes("8b01eaf8", |cb| ldaddal(cb, X10, X11, X12));
+    }
+
+    #[test]
+    fn test_ldaxr() {
+        check_bytes("6afd5fc8", |cb| ldaxr(cb, X10, X11));
+    }
+
+    #[test]
+    fn test_ldp() {
+        check_bytes("8a2d4da9", |cb| ldp(cb, X10, X11, A64Opnd::new_mem(64, X12, 208)));
+    }
+
+    #[test]
+    fn test_ldp_pre() {
+        check_bytes("8a2dcda9", |cb| ldp_pre(cb, X10, X11, A64Opnd::new_mem(64, X12, 208)));
+    }
+
+    #[test]
+    fn test_ldp_post() {
+        check_bytes("8a2dcda8", |cb| ldp_post(cb, X10, X11, A64Opnd::new_mem(64, X12, 208)));
+    }
+
+    #[test]
+    fn test_ldr() {
+        check_bytes("6a696cf8", |cb| ldr(cb, X10, X11, X12));
+    }
+
+    #[test]
+    fn test_ldr_literal() {
+        check_bytes("40010058", |cb| ldr_literal(cb, X0, 10.into()));
+    }
+
+    #[test]
+    fn test_ldr_post() {
+        check_bytes("6a0541f8", |cb| ldr_post(cb, X10, A64Opnd::new_mem(64, X11, 16)));
+    }
+
+    #[test]
+    fn test_ldr_pre() {
+        check_bytes("6a0d41f8", |cb| ldr_pre(cb, X10, A64Opnd::new_mem(64, X11, 16)));
+    }
+
+    #[test]
+    fn test_ldrh() {
+        check_bytes("6a194079", |cb| ldrh(cb, W10, A64Opnd::new_mem(64, X11, 12)));
+    }
+
+    #[test]
+    fn test_ldrh_pre() {
+        check_bytes("6acd4078", |cb| ldrh_pre(cb, W10, A64Opnd::new_mem(64, X11, 12)));
+    }
+
+    #[test]
+    fn test_ldrh_post() {
+        check_bytes("6ac54078", |cb| ldrh_post(cb, W10, A64Opnd::new_mem(64, X11, 12)));
+    }
+
+    #[test]
+    fn test_ldurh_memory() {
+        check_bytes("2a004078", |cb| ldurh(cb, W10, A64Opnd::new_mem(64, X1, 0)));
+        check_bytes("2ab04778", |cb| ldurh(cb, W10, A64Opnd::new_mem(64, X1, 123)));
+    }
+
+    #[test]
+    fn test_ldur_memory() {
+        check_bytes("20b047f8", |cb| ldur(cb, X0, A64Opnd::new_mem(64, X1, 123)));
+    }
+
+    #[test]
+    fn test_ldur_register() {
+        check_bytes("200040f8", |cb| ldur(cb, X0, X1));
+    }
+
+    #[test]
+    fn test_ldursw() {
+        check_bytes("6ab187b8", |cb| ldursw(cb, X10, A64Opnd::new_mem(64, X11, 123)));
+    }
+
+    #[test]
+    fn test_lsl() {
+        check_bytes("6ac572d3", |cb| lsl(cb, X10, X11, A64Opnd::new_uimm(14)));
+    }
+
+    #[test]
+    fn test_lsr() {
+        check_bytes("6afd4ed3", |cb| lsr(cb, X10, X11, A64Opnd::new_uimm(14)));
+    }
+
+    #[test]
+    fn test_mov_registers() {
+        check_bytes("ea030baa", |cb| mov(cb, X10, X11));
+    }
+
+    #[test]
+    fn test_mov_immediate() {
+        check_bytes("eaf300b2", |cb| mov(cb, X10, A64Opnd::new_uimm(0x5555555555555555)));
+    }
+
+    #[test]
+    fn test_mov_32b_immediate() {
+        check_bytes("ea070132", |cb| mov(cb, W10, A64Opnd::new_uimm(0x80000001)));
+    }
+    #[test]
+    fn test_mov_into_sp() {
+        check_bytes("1f000091", |cb| mov(cb, X31, X0));
+    }
+
+    #[test]
+    fn test_mov_from_sp() {
+        check_bytes("e0030091", |cb| mov(cb, X0, X31));
+    }
+
+    #[test]
+    fn test_movk() {
+        check_bytes("600fa0f2", |cb| movk(cb, X0, A64Opnd::new_uimm(123), 16));
+    }
+
+    #[test]
+    fn test_movz() {
+        check_bytes("600fa0d2", |cb| movz(cb, X0, A64Opnd::new_uimm(123), 16));
+    }
+
+    #[test]
+    fn test_mrs() {
+        check_bytes("0a423bd5", |cb| mrs(cb, X10, SystemRegister::NZCV));
+    }
+
+    #[test]
+    fn test_msr() {
+        check_bytes("0a421bd5", |cb| msr(cb, SystemRegister::NZCV, X10));
+    }
+
+    #[test]
+    fn test_mul() {
+        check_bytes("6a7d0c9b", |cb| mul(cb, X10, X11, X12));
+    }
+
+    #[test]
+    fn test_mvn() {
+        check_bytes("ea032baa", |cb| mvn(cb, X10, X11));
+    }
+
+    #[test]
+    fn test_nop() {
+        check_bytes("1f2003d5", |cb| nop(cb));
+    }
+
+    #[test]
+    fn test_orn() {
+        check_bytes("6a012caa", |cb| orn(cb, X10, X11, X12));
+    }
+
+    #[test]
+    fn test_orr_register() {
+        check_bytes("6a010caa", |cb| orr(cb, X10, X11, X12));
+    }
+
+    #[test]
+    fn test_orr_immediate() {
+        check_bytes("6a0940b2", |cb| orr(cb, X10, X11, A64Opnd::new_uimm(7)));
+    }
+
+    #[test]
+    fn test_orr_32b_immediate() {
+        check_bytes("6a010032", |cb| orr(cb, W10, W11, A64Opnd::new_uimm(1)));
+    }
+
+    #[test]
+    fn test_ret_none() {
+        check_bytes("c0035fd6", |cb| ret(cb, A64Opnd::None));
+    }
+
+    #[test]
+    fn test_ret_register() {
+        check_bytes("80025fd6", |cb| ret(cb, X20));
+    }
+
+    #[test]
+    fn test_stlxr() {
+        check_bytes("8bfd0ac8", |cb| stlxr(cb, W10, X11, X12));
+    }
+
+    #[test]
+    fn test_stp() {
+        check_bytes("8a2d0da9", |cb| stp(cb, X10, X11, A64Opnd::new_mem(64, X12, 208)));
+    }
+
+    #[test]
+    fn test_stp_pre() {
+        check_bytes("8a2d8da9", |cb| stp_pre(cb, X10, X11, A64Opnd::new_mem(64, X12, 208)));
+    }
+
+    #[test]
+    fn test_stp_post() {
+        check_bytes("8a2d8da8", |cb| stp_post(cb, X10, X11, A64Opnd::new_mem(64, X12, 208)));
+    }
+
+    #[test]
+    fn test_str_post() {
+        check_bytes("6a051ff8", |cb| str_post(cb, X10, A64Opnd::new_mem(64, X11, -16)));
+    }
+
+    #[test]
+    fn test_str_pre() {
+        check_bytes("6a0d1ff8", |cb| str_pre(cb, X10, A64Opnd::new_mem(64, X11, -16)));
+    }
+
+    #[test]
+    fn test_strh() {
+        check_bytes("6a190079", |cb| strh(cb, W10, A64Opnd::new_mem(64, X11, 12)));
+    }
+
+    #[test]
+    fn test_strh_pre() {
+        check_bytes("6acd0078", |cb| strh_pre(cb, W10, A64Opnd::new_mem(64, X11, 12)));
+    }
+
+    #[test]
+    fn test_strh_post() {
+        check_bytes("6ac50078", |cb| strh_post(cb, W10, A64Opnd::new_mem(64, X11, 12)));
+    }
+
+    #[test]
+    fn test_stur_64_bits() {
+        check_bytes("6a0108f8", |cb| stur(cb, X10, A64Opnd::new_mem(64, X11, 128)));
+    }
+
+    #[test]
+    fn test_stur_32_bits() {
+        check_bytes("6a0108b8", |cb| stur(cb, X10, A64Opnd::new_mem(32, X11, 128)));
+    }
+
+    #[test]
+    fn test_sub_reg() {
+        check_bytes("200002cb", |cb| sub(cb, X0, X1, X2));
+    }
+
+    #[test]
+    fn test_sub_uimm() {
+        check_bytes("201c00d1", |cb| sub(cb, X0, X1, A64Opnd::new_uimm(7)));
+    }
+
+    #[test]
+    fn test_sub_imm_positive() {
+        check_bytes("201c00d1", |cb| sub(cb, X0, X1, A64Opnd::new_imm(7)));
+    }
+
+    #[test]
+    fn test_sub_imm_negative() {
+        check_bytes("201c0091", |cb| sub(cb, X0, X1, A64Opnd::new_imm(-7))); // same bytes as test_add_uimm
+    }
+
+    #[test]
+    fn test_subs_reg() {
+        check_bytes("200002eb", |cb| subs(cb, X0, X1, X2));
+    }
+
+    #[test]
+    fn test_subs_imm_positive() {
+        check_bytes("201c00f1", |cb| subs(cb, X0, X1, A64Opnd::new_imm(7)));
+    }
+
+    #[test]
+    fn test_subs_imm_negative() {
+        check_bytes("201c00b1", |cb| subs(cb, X0, X1, A64Opnd::new_imm(-7))); // same bytes as test_adds_uimm
+    }
+
+    #[test]
+    fn test_subs_uimm() {
+        check_bytes("201c00f1", |cb| subs(cb, X0, X1, A64Opnd::new_uimm(7)));
+    }
+
+    #[test]
+    fn test_sxtw() {
+        check_bytes("6a7d4093", |cb| sxtw(cb, X10, W11));
+    }
+
+    #[test]
+    fn test_tbnz() {
+        check_bytes("4a005037", |cb| tbnz(cb, X10, A64Opnd::UImm(10), A64Opnd::Imm(2)));
+    }
+
+    #[test]
+    fn test_tbz() {
+        check_bytes("4a005036", |cb| tbz(cb, X10, A64Opnd::UImm(10), A64Opnd::Imm(2)));
+    }
+
+    #[test]
+    fn test_tst_register() {
+        check_bytes("1f0001ea", |cb| tst(cb, X0, X1));
+    }
+
+    #[test]
+    fn test_tst_immediate() {
+        check_bytes("3f0840f2", |cb| tst(cb, X1, A64Opnd::new_uimm(7)));
+    }
+
+    #[test]
+    fn test_tst_32b_immediate() {
+        check_bytes("1f3c0072", |cb| tst(cb, W0, A64Opnd::new_uimm(0xffff)));
+    }
+}
diff --git a/yjit/src/asm/arm64/opnd.rs b/yjit/src/asm/arm64/opnd.rs
new file mode 100644
index 0000000000..108824e08d
--- /dev/null
+++ b/yjit/src/asm/arm64/opnd.rs
@@ -0,0 +1,195 @@
+
+
+/// This operand represents a register.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub struct A64Reg
+{
+    /// Size in bits (64 for the X registers, 32 for the W registers)
+    pub num_bits: u8,
+
+    /// Register index number (31 is used for the zero register)
+    pub reg_no: u8,
+}
+
+impl A64Reg {
+    pub fn with_num_bits(&self, num_bits: u8) -> Self {
+        assert!(num_bits == 8 || num_bits == 16 || num_bits == 32 || num_bits == 64); // any other width panics
+        Self { num_bits, reg_no: self.reg_no } // same register index, viewed at the new width
+    }
+}
+
+#[derive(Clone, Copy, Debug)]
+pub struct A64Mem // memory operand: base register plus constant displacement
+{
+    /// Size of the memory operand in bits
+    pub num_bits: u8,
+
+    /// Base register number
+    pub base_reg_no: u8,
+
+    /// Constant displacement from the base, not scaled
+    pub disp: i32,
+}
+
+impl A64Mem {
+    /// Create a memory operand of `num_bits` bits located `disp` away from
+    /// the base register held in `reg`.
+    ///
+    /// Only the register number of `reg` is kept. Panics with
+    /// "Expected register operand" when `reg` is not an `A64Opnd::Reg`.
+    pub fn new(num_bits: u8, reg: A64Opnd, disp: i32) -> Self {
+        Self { num_bits, base_reg_no: reg.unwrap_reg().reg_no, disp }
+    }
+}
+
+#[derive(Clone, Copy, Debug)]
+pub enum A64Opnd
+{
+    /// Dummy operand
+    None,
+
+    /// Signed immediate value
+    Imm(i64),
+
+    /// Unsigned immediate value
+    UImm(u64),
+
+    /// Register
+    Reg(A64Reg),
+
+    /// Memory location (base register plus displacement)
+    Mem(A64Mem)
+}
+
+impl A64Opnd {
+    /// Create a new signed immediate value operand.
+    pub fn new_imm(value: i64) -> Self {
+        A64Opnd::Imm(value)
+    }
+
+    /// Create a new unsigned immediate value operand.
+    pub fn new_uimm(value: u64) -> Self {
+        A64Opnd::UImm(value)
+    }
+
+    /// Creates a new memory operand of `num_bits` bits at displacement `disp` from the base register `reg`.
+    /// Panics if `reg` is not a register operand.
+    pub fn new_mem(num_bits: u8, reg: A64Opnd, disp: i32) -> Self {
+        A64Opnd::Mem(A64Mem::new(num_bits, reg, disp))
+    }
+
+    /// Convenience function to check if this operand is a register.
+    pub fn is_reg(&self) -> bool {
+        matches!(self, A64Opnd::Reg(_))
+    }
+
+    /// Unwrap a register from an operand.
+    ///
+    /// Panics with "Expected register operand" if this operand is not a register.
+    pub fn unwrap_reg(&self) -> A64Reg {
+        match self {
+            A64Opnd::Reg(reg) => *reg,
+            _ => panic!("Expected register operand")
+        }
+    }
+}
+
+// argument registers
+pub const X0_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 0 };
+pub const X1_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 1 };
+pub const X2_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 2 };
+pub const X3_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 3 };
+pub const X4_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 4 };
+pub const X5_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 5 };
+
+// caller-save registers
+pub const X9_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 9 };
+pub const X10_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 10 };
+pub const X11_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 11 };
+pub const X12_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 12 };
+pub const X13_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 13 };
+pub const X14_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 14 };
+pub const X15_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 15 };
+pub const X16_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 16 };
+pub const X17_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 17 };
+
+// callee-save registers
+pub const X19_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 19 };
+pub const X20_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 20 };
+pub const X21_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 21 };
+pub const X22_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 22 };
+
+// zero register
+pub const XZR_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 31 };
+
+// 64-bit registers (operands wrapping the *_REG constants where one exists)
+pub const X0: A64Opnd = A64Opnd::Reg(X0_REG);
+pub const X1: A64Opnd = A64Opnd::Reg(X1_REG);
+pub const X2: A64Opnd = A64Opnd::Reg(X2_REG);
+pub const X3: A64Opnd = A64Opnd::Reg(X3_REG);
+pub const X4: A64Opnd = A64Opnd::Reg(X4_REG);
+pub const X5: A64Opnd = A64Opnd::Reg(X5_REG);
+pub const X6: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 64, reg_no: 6 });
+pub const X7: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 64, reg_no: 7 });
+pub const X8: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 64, reg_no: 8 });
+pub const X9: A64Opnd = A64Opnd::Reg(X9_REG);
+pub const X10: A64Opnd = A64Opnd::Reg(X10_REG);
+pub const X11: A64Opnd = A64Opnd::Reg(X11_REG);
+pub const X12: A64Opnd = A64Opnd::Reg(X12_REG);
+pub const X13: A64Opnd = A64Opnd::Reg(X13_REG);
+pub const X14: A64Opnd = A64Opnd::Reg(X14_REG);
+pub const X15: A64Opnd = A64Opnd::Reg(X15_REG);
+pub const X16: A64Opnd = A64Opnd::Reg(X16_REG);
+pub const X17: A64Opnd = A64Opnd::Reg(X17_REG);
+pub const X18: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 64, reg_no: 18 });
+pub const X19: A64Opnd = A64Opnd::Reg(X19_REG);
+pub const X20: A64Opnd = A64Opnd::Reg(X20_REG);
+pub const X21: A64Opnd = A64Opnd::Reg(X21_REG);
+pub const X22: A64Opnd = A64Opnd::Reg(X22_REG);
+pub const X23: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 64, reg_no: 23 });
+pub const X24: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 64, reg_no: 24 });
+pub const X25: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 64, reg_no: 25 });
+pub const X26: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 64, reg_no: 26 });
+pub const X27: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 64, reg_no: 27 });
+pub const X28: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 64, reg_no: 28 });
+pub const X29: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 64, reg_no: 29 });
+pub const X30: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 64, reg_no: 30 });
+pub const X31: A64Opnd = A64Opnd::Reg(XZR_REG);
+
+// 32-bit registers
+pub const W0: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 0 });
+pub const W1: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 1 });
+pub const W2: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 2 });
+pub const W3: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 3 });
+pub const W4: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 4 });
+pub const W5: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 5 });
+pub const W6: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 6 });
+pub const W7: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 7 });
+pub const W8: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 8 });
+pub const W9: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 9 });
+pub const W10: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 10 });
+pub const W11: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 11 });
+pub const W12: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 12 });
+pub const W13: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 13 });
+pub const W14: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 14 });
+pub const W15: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 15 });
+pub const W16: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 16 });
+pub const W17: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 17 });
+pub const W18: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 18 });
+pub const W19: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 19 });
+pub const W20: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 20 });
+pub const W21: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 21 });
+pub const W22: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 22 });
+pub const W23: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 23 });
+pub const W24: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 24 });
+pub const W25: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 25 });
+pub const W26: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 26 });
+pub const W27: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 27 });
+pub const W28: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 28 });
+pub const W29: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 29 });
+pub const W30: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 30 });
+pub const W31: A64Opnd = A64Opnd::Reg(A64Reg { num_bits: 32, reg_no: 31 });
+
+// C argument registers
+pub const C_ARG_REGS: [A64Opnd; 4] = [X0, X1, X2, X3];
+pub const C_ARG_REGREGS: [A64Reg; 4] = [X0_REG, X1_REG, X2_REG, X3_REG];
diff --git a/yjit/src/asm/mod.rs b/yjit/src/asm/mod.rs
index e16e856925..524d6341f5 100644
--- a/yjit/src/asm/mod.rs
+++ b/yjit/src/asm/mod.rs
@@ -1,31 +1,57 @@
+use std::cell::RefCell;
+use std::fmt;
 use std::mem;
-
-#[cfg(feature = "asm_comments")]
+use std::rc::Rc;
+use crate::core::IseqPayload;
+use crate::core::for_each_off_stack_iseq_payload;
+use crate::core::for_each_on_stack_iseq_payload;
+use crate::invariants::rb_yjit_tracing_invalidate_all;
+use crate::stats::incr_counter;
+use crate::virtualmem::WriteError;
+
+#[cfg(feature = "disasm")]
 use std::collections::BTreeMap;
 
+use crate::codegen::CodegenGlobals;
 use crate::virtualmem::{VirtualMem, CodePtr};
 
 // Lots of manual vertical alignment in there that rustfmt doesn't handle well.
 #[rustfmt::skip]
 pub mod x86_64;
 
+pub mod arm64;
+
 //
 // TODO: need a field_size_of macro, to compute the size of a struct field in bytes
 //
 
 /// Reference to an ASM label
-struct LabelRef {
+#[derive(Clone)]
+pub struct LabelRef {
     // Position in the code block where the label reference exists
     pos: usize,
 
     // Label which this refers to
     label_idx: usize,
+
+    /// The number of bytes that this label reference takes up in the memory.
+    /// It's necessary to know this ahead of time so that when we come back to
+    /// patch it it takes the same amount of space.
+    num_bytes: usize,
+
+    /// The function that encodes the branch instruction; called as encode(cb, position just past the reference, label position).
+    encode: fn(&mut CodeBlock, i64, i64)
 }
 
 /// Block of memory into which instructions can be assembled
 pub struct CodeBlock {
     // Memory for storing the encoded instructions
-    mem_block: VirtualMem,
+    mem_block: Rc<RefCell<VirtualMem>>,
+
+    // Size of a code page in bytes. Each code page is split into an inlined and an outlined portion.
+    // Code GC collects code memory at this granularity.
+    // Must be a multiple of the OS page size.
+    page_size: usize,
 
     // Memory block size
     mem_size: usize,
@@ -33,6 +59,15 @@ pub struct CodeBlock {
     // Current writing position
     write_pos: usize,
 
+    // The index of the last page with written bytes
+    last_page_idx: usize,
+
+    // Total number of bytes written to past pages
+    past_page_bytes: usize,
+
+    // Size reserved for writing a jump to the next page
+    page_end_reserve: usize,
+
     // Table of registered label addresses
     label_addrs: Vec<usize>,
 
@@ -43,41 +78,298 @@ pub struct CodeBlock {
     label_refs: Vec<LabelRef>,
 
     // Comments for assembly instructions, if that feature is enabled
-    #[cfg(feature = "asm_comments")]
+    #[cfg(feature = "disasm")]
     asm_comments: BTreeMap<usize, Vec<String>>,
 
+    // True for OutlinedCb
+    pub outlined: bool,
+
     // Set if the CodeBlock is unable to output some instructions,
     // for example, when there is not enough space or when a jump
     // target is too far away.
     dropped_bytes: bool,
+
+    // Keeps track of what pages we can write to after code gc.
+    // `None` means all pages are free.
+    freed_pages: Rc<Option<Vec<usize>>>,
+}
+
+/// Snapshot of a CodeBlock's label addresses, names and references. Used for recovering the previous state.
+pub struct LabelState {
+    label_addrs: Vec<usize>,
+    label_names: Vec<String>,
+    label_refs: Vec<LabelRef>,
 }
 
 impl CodeBlock {
+    /// Works for common AArch64 systems that have 16 KiB pages and
+    /// common x86_64 systems that use 4 KiB pages.
+    const PREFERRED_CODE_PAGE_SIZE: usize = 16 * 1024;
+
     /// Make a new CodeBlock
-    pub fn new(mem_block: VirtualMem) -> Self {
-        Self {
-            mem_size: mem_block.virtual_region_size(),
+    pub fn new(mem_block: Rc<RefCell<VirtualMem>>, outlined: bool, freed_pages: Rc<Option<Vec<usize>>>) -> Self {
+        // Pick the code page size
+        let system_page_size = mem_block.borrow().system_page_size();
+        let page_size = if 0 == Self::PREFERRED_CODE_PAGE_SIZE % system_page_size {
+            Self::PREFERRED_CODE_PAGE_SIZE
+        } else {
+            system_page_size
+        };
+
+        let mem_size = mem_block.borrow().virtual_region_size();
+        let mut cb = Self {
             mem_block,
+            mem_size,
+            page_size,
             write_pos: 0,
+            last_page_idx: 0,
+            past_page_bytes: 0,
+            page_end_reserve: 0,
             label_addrs: Vec::new(),
             label_names: Vec::new(),
             label_refs: Vec::new(),
-            #[cfg(feature = "asm_comments")]
+            #[cfg(feature = "disasm")]
             asm_comments: BTreeMap::new(),
+            outlined,
             dropped_bytes: false,
+            freed_pages,
+        };
+        cb.page_end_reserve = cb.jmp_ptr_bytes();
+        cb.write_pos = cb.page_start();
+
+        #[cfg(not(test))]
+        assert_eq!(0, mem_size % page_size, "partially in-bounds code pages should be impossible");
+
+        cb
+    }
+
+    /// Move the CodeBlock to the next page. If it's on the furthest page, move the other CodeBlock
+    /// as well. Returns false, with the original write position restored, if the move fails.
+    pub fn next_page<F: Fn(&mut CodeBlock, CodePtr)>(&mut self, base_ptr: CodePtr, jmp_ptr: F) -> bool {
+        let old_write_ptr = self.get_write_ptr();
+        self.set_write_ptr(base_ptr);
+
+        // Use the freed_pages list if code GC has been used. Otherwise use the next page.
+        let next_page_idx = if let Some(freed_pages) = self.freed_pages.as_ref() {
+            let current_page = self.write_pos / self.page_size;
+            freed_pages.iter().find(|&&page| current_page < page).copied()
+        } else {
+            Some(self.write_pos / self.page_size + 1)
+        };
+
+        // Move self to the next page
+        if next_page_idx.is_none() || !self.set_page(next_page_idx.unwrap(), &jmp_ptr) {
+            self.set_write_ptr(old_write_ptr); // rollback if there are no more pages
+            return false;
+        }
+
+        // Move the other CodeBlock to the same page if it's on the furthest page
+        if cfg!(not(test)) {
+            self.other_cb().unwrap().set_page(next_page_idx.unwrap(), &jmp_ptr);
+        }
+
+        !self.dropped_bytes
+    }
+
+    /// Move the CodeBlock to page_idx only if it's not going backwards. Returns false if the page is out of bounds or bytes were dropped.
+    fn set_page<F: Fn(&mut CodeBlock, CodePtr)>(&mut self, page_idx: usize, jmp_ptr: &F) -> bool {
+        // Do not move the CodeBlock if page_idx points to an old position so that this
+        // CodeBlock will not overwrite existing code.
+        //
+        // Let's say this is the current situation:
+        //   cb: [page1, page2, page3 (write_pos)], ocb: [page1, page2, page3 (write_pos)]
+        //
+        // When cb needs to patch page1, this will be temporarily changed to:
+        //   cb: [page1 (write_pos), page2, page3], ocb: [page1, page2, page3 (write_pos)]
+        //
+        // While patching page1, cb may need to jump to page2. What set_page currently does is:
+        //   cb: [page1, page2 (write_pos), page3], ocb: [page1, page2, page3 (write_pos)]
+        // instead of:
+        //   cb: [page1, page2 (write_pos), page3], ocb: [page1, page2 (write_pos), page3]
+        // because moving ocb's write_pos from page3 to the beginning of page2 will let ocb's
+        // write_pos point to existing code in page2, which might let ocb overwrite it later.
+        //
+        // We could remember the last write_pos in page2 and let set_page use that position,
+        // but you need to waste some space for keeping write_pos for every single page.
+        // It doesn't seem necessary for performance either. So we're currently not doing it.
+        let dst_pos = self.get_page_pos(page_idx);
+        if self.write_pos < dst_pos {
+            // Fail if next page is out of bounds
+            if dst_pos >= self.mem_size {
+                return false;
+            }
+
+            // Reset dropped_bytes
+            self.dropped_bytes = false;
+
+            // Emit a jump (via jmp_ptr) from the current write position to dst_pos
+            let dst_ptr = self.get_ptr(dst_pos);
+            self.without_page_end_reserve(|cb| {
+                assert!(cb.has_capacity(cb.jmp_ptr_bytes()));
+                cb.add_comment("jump to next page");
+                jmp_ptr(cb, dst_ptr);
+            });
+
+            // Update past_page_bytes for code_size() if this is a new page
+            if self.last_page_idx < page_idx {
+                self.past_page_bytes += self.current_page_bytes();
+            }
+
+            // Start the next code from dst_pos
+            self.write_pos = dst_pos;
+            // Update the last_page_idx if page_idx points to the furthest page
+            self.last_page_idx = usize::max(self.last_page_idx, page_idx);
+        }
+        !self.dropped_bytes
+    }
+
+    /// Free the memory pages of given code page indexes.
+    /// Runs of consecutive indexes are released with a single free_bytes() call.
+    fn free_pages(&mut self, page_idxs: &Vec<usize>) {
+        let mut run_start = 0;
+        while run_start < page_idxs.len() {
+            // Grow the run while each index is exactly one past its predecessor,
+            // so adjacent pages are freed in one batch to reduce the # of syscalls.
+            let mut run_end = run_start + 1;
+            while run_end < page_idxs.len() && page_idxs[run_end] == page_idxs[run_end - 1] + 1 {
+                run_end += 1;
+            }
+
+            // Free the grouped pages at once
+            let start_ptr = self.mem_block.borrow().start_ptr().add_bytes(page_idxs[run_start] * self.page_size);
+            let batch_size = self.page_size * (run_end - run_start);
+            self.mem_block.borrow_mut().free_bytes(start_ptr, batch_size as u32);
+            run_start = run_end;
+        }
+    }
+
+    pub fn page_size(&self) -> usize {
+        self.page_size
+    }
+
+    pub fn mapped_region_size(&self) -> usize {
+        self.mem_block.borrow().mapped_region_size()
+    }
+
+    /// Size of the region in bytes where writes could be attempted.
+    #[cfg(target_arch = "aarch64")]
+    pub fn virtual_region_size(&self) -> usize {
+        self.mem_block.borrow().virtual_region_size()
+    }
+
+    /// Return the number of code pages that have been mapped by the VirtualMemory.
+    pub fn num_mapped_pages(&self) -> usize {
+        // CodeBlock's page size != VirtualMem's page size on Linux, so mapped_region_size
+        // % self.page_size may not be 0: round up (and yield 0, not underflow, for size 0)
+        (self.mapped_region_size() + self.page_size - 1) / self.page_size
+    }
+
+    /// Return the number of code pages that have been reserved by the VirtualMemory.
+    pub fn num_virtual_pages(&self) -> usize {
+        let virtual_region_size = self.mem_block.borrow().virtual_region_size();
+        // CodeBlock's page size != VirtualMem's page size on Linux, so virtual_region_size
+        // % self.page_size may not be 0: round up (and yield 0, not underflow, for size 0)
+        (virtual_region_size + self.page_size - 1) / self.page_size
+    }
+
+    /// Return the number of code pages that have been freed and not used yet.
+    pub fn num_freed_pages(&self) -> usize {
+        (0..self.num_mapped_pages()).filter(|&page_idx| self.has_freed_page(page_idx)).count()
+    }
+
+    pub fn has_freed_page(&self, page_idx: usize) -> bool {
+        self.freed_pages.as_ref().as_ref().map_or(false, |pages| pages.contains(&page_idx)) && // code GCed
+            self.write_pos < page_idx * self.page_size // and not written yet
+    }
+
+    /// Convert a page index to the write_pos for the page start.
+    fn get_page_pos(&self, page_idx: usize) -> usize {
+        self.page_size * page_idx + self.page_start()
+    }
+
+    /// write_pos of the current page start
+    pub fn page_start_pos(&self) -> usize {
+        self.get_write_pos() / self.page_size * self.page_size + self.page_start()
+    }
+
+    /// Offset of each page where CodeBlock should start writing
+    pub fn page_start(&self) -> usize {
+        let mut start = if self.inline() {
+            0
+        } else {
+            self.page_size / 2
+        };
+        if cfg!(debug_assertions) && !cfg!(test) {
+            // Leave illegal instructions at the beginning of each page to assert
+            // we're not accidentally crossing page boundaries.
+            start += self.jmp_ptr_bytes();
+        }
+        start
+    }
+
+    /// Offset of each page where CodeBlock should stop writing (exclusive)
+    pub fn page_end(&self) -> usize {
+        let page_end = if self.inline() {
+            self.page_size / 2
+        } else {
+            self.page_size
+        };
+        page_end - self.page_end_reserve // reserve space to jump to the next page
+    }
+
+    /// Call a given function with page_end_reserve = 0
+    pub fn without_page_end_reserve<F: Fn(&mut Self)>(&mut self, block: F) {
+        let old_page_end_reserve = self.page_end_reserve;
+        self.page_end_reserve = 0;
+        block(self);
+        self.page_end_reserve = old_page_end_reserve;
+    }
+
+    /// Return the address ranges of a given address range that this CodeBlock can write.
+    #[allow(dead_code)]
+    pub fn writable_addrs(&self, start_ptr: CodePtr, end_ptr: CodePtr) -> Vec<(usize, usize)> {
+        let region_start = self.get_ptr(0).raw_addr(self);
+        let region_end = self.get_ptr(self.get_mem_size()).raw_addr(self);
+        let mut start = start_ptr.raw_addr(self);
+        let end = std::cmp::min(end_ptr.raw_addr(self), region_end);
+
+        let freed_pages = self.freed_pages.as_ref().as_ref();
+        let mut addrs = vec![];
+        while start < end {
+            let page_idx = start.saturating_sub(region_start) / self.page_size;
+            let current_page = region_start + (page_idx * self.page_size);
+            let page_end = std::cmp::min(end, current_page + self.page_end());
+            // If code GC has been used, skip pages that are used by past on-stack code
+            if freed_pages.map_or(true, |pages| pages.contains(&page_idx)) {
+                addrs.push((start, page_end));
+            }
+            start = current_page + self.page_size + self.page_start();
         }
+        addrs
+    }
+
+    /// Return the number of bytes written by this CodeBlock.
+    pub fn code_size(&self) -> usize {
+        self.current_page_bytes() + self.past_page_bytes
+    }
+
+    /// Return the number of bytes written to the current page.
+    fn current_page_bytes(&self) -> usize {
+        (self.write_pos % self.page_size).saturating_sub(self.page_start())
     }
 
     /// Check if this code block has sufficient remaining capacity
     pub fn has_capacity(&self, num_bytes: usize) -> bool {
-        self.write_pos + num_bytes < self.mem_size
+        let page_offset = self.write_pos % self.page_size;
+        let capacity = self.page_end().saturating_sub(page_offset);
+        num_bytes <= capacity
     }
 
     /// Add an assembly comment if the feature is on.
     /// If not, this becomes an inline no-op.
-    #[cfg(feature = "asm_comments")]
+    #[cfg(feature = "disasm")]
     pub fn add_comment(&mut self, comment: &str) {
-        let cur_ptr = self.get_write_ptr().into_usize();
+        let cur_ptr = self.get_write_ptr().raw_addr(self);
 
         // If there's no current list of comments for this line number, add one.
         let this_line_comments = self.asm_comments.entry(cur_ptr).or_default();
@@ -87,15 +379,31 @@ impl CodeBlock {
             this_line_comments.push(comment.to_string());
         }
     }
-    #[cfg(not(feature = "asm_comments"))]
+    #[cfg(not(feature = "disasm"))]
     #[inline]
     pub fn add_comment(&mut self, _: &str) {}
 
-    #[cfg(feature = "asm_comments")]
+    #[cfg(feature = "disasm")]
     pub fn comments_at(&self, pos: usize) -> Option<&Vec<String>> {
         self.asm_comments.get(&pos)
     }
 
+    /// Remove the comments recorded for every address in `start_addr..end_addr` (end exclusive).
+    #[cfg(feature = "disasm")]
+    pub fn remove_comments(&mut self, start_addr: CodePtr, end_addr: CodePtr) {
+        for addr in start_addr.raw_addr(self)..end_addr.raw_addr(self) {
+            self.asm_comments.remove(&addr);
+        }
+    }
+    #[cfg(not(feature = "disasm"))]
+    #[inline]
+    pub fn remove_comments(&mut self, _: CodePtr, _: CodePtr) {}
+
+    pub fn clear_comments(&mut self) {
+        #[cfg(feature = "disasm")]
+        self.asm_comments.clear();
+    }
+
     pub fn get_mem_size(&self) -> usize {
         self.mem_size
     }
@@ -104,73 +412,72 @@ impl CodeBlock {
         self.write_pos
     }
 
-    pub fn get_mem(&mut self) -> &mut VirtualMem {
-        &mut self.mem_block
+    pub fn write_mem(&self, write_ptr: CodePtr, byte: u8) -> Result<(), WriteError> {
+        self.mem_block.borrow_mut().write_byte(write_ptr, byte)
     }
 
     // Set the current write position
     pub fn set_pos(&mut self, pos: usize) {
-        // Assert here since while CodeBlock functions do bounds checking, there is
-        // nothing stopping users from taking out an out-of-bounds pointer and
-        // doing bad accesses with it.
-        assert!(pos < self.mem_size);
+        // No bounds check here since we can be out of bounds
+        // when the code block fills up. We want to be able to
+        // restore to the filled up state after patching something
+        // in the middle.
         self.write_pos = pos;
     }
 
-    // Align the current write pointer to a multiple of bytes
-    pub fn align_pos(&mut self, multiple: u32) {
-        // Compute the alignment boundary that is lower or equal
-        // Do everything with usize
-        let multiple: usize = multiple.try_into().unwrap();
-        let pos = self.get_write_ptr().raw_ptr() as usize;
-        let remainder = pos % multiple;
-        let prev_aligned = pos - remainder;
-
-        if prev_aligned == pos {
-            // Already aligned so do nothing
-        } else {
-            // Align by advancing
-            let pad = multiple - remainder;
-            self.set_pos(self.get_write_pos() + pad);
-        }
-    }
-
     // Set the current write position from a pointer
     pub fn set_write_ptr(&mut self, code_ptr: CodePtr) {
-        let pos = code_ptr.into_usize() - self.mem_block.start_ptr().into_usize();
-        self.set_pos(pos);
+        let pos = code_ptr.as_offset() - self.mem_block.borrow().start_ptr().as_offset();
+        self.set_pos(pos.try_into().unwrap());
     }
 
-    // Get a direct pointer into the executable memory block
+    /// Get a (possibly dangling) direct pointer into the executable memory block
     pub fn get_ptr(&self, offset: usize) -> CodePtr {
-        self.mem_block.start_ptr().add_bytes(offset)
+        self.mem_block.borrow().start_ptr().add_bytes(offset)
     }
 
-    // Get a direct pointer to the current write position
-    pub fn get_write_ptr(&mut self) -> CodePtr {
+    /// Convert an address range to memory page indexes against a num_mapped_pages()-sized array.
+    pub fn addrs_to_pages(&self, start_addr: CodePtr, end_addr: CodePtr) -> Vec<usize> {
+        let mem_start = self.mem_block.borrow().start_ptr().raw_addr(self);
+        let mem_end = self.mem_block.borrow().mapped_end_ptr().raw_addr(self);
+        assert!(mem_start <= start_addr.raw_addr(self));
+        assert!(start_addr.raw_addr(self) <= end_addr.raw_addr(self));
+        assert!(end_addr.raw_addr(self) <= mem_end);
+
+        // Ignore empty code ranges
+        if start_addr == end_addr {
+            return vec![];
+        }
+
+        let start_page = (start_addr.raw_addr(self) - mem_start) / self.page_size;
+        let end_page = (end_addr.raw_addr(self) - mem_start - 1) / self.page_size; // end_addr is exclusive
+        (start_page..=end_page).collect() // TODO: consider returning an iterator
+    }
+
+    /// Get a (possibly dangling) direct pointer to the current write position
+    pub fn get_write_ptr(&self) -> CodePtr {
         self.get_ptr(self.write_pos)
     }
 
-    // Write a single byte at the current position
+    /// Write a single byte at the current position.
     pub fn write_byte(&mut self, byte: u8) {
         let write_ptr = self.get_write_ptr();
-
-        if self.mem_block.write_byte(write_ptr, byte).is_ok() {
+        if self.has_capacity(1) && self.mem_block.borrow_mut().write_byte(write_ptr, byte).is_ok() {
             self.write_pos += 1;
         } else {
             self.dropped_bytes = true;
         }
     }
 
-    // Write multiple bytes starting from the current position
+    /// Write multiple bytes starting from the current position.
     pub fn write_bytes(&mut self, bytes: &[u8]) {
         for byte in bytes {
             self.write_byte(*byte);
         }
     }
 
-    // Write a signed integer over a given number of bits at the current position
-    pub fn write_int(&mut self, val: u64, num_bits: u32) {
+    /// Write an integer over the given number of bits at the current position.
+    fn write_int(&mut self, val: u64, num_bits: u32) {
         assert!(num_bits > 0);
         assert!(num_bits % 8 == 0);
 
@@ -201,8 +508,16 @@ impl CodeBlock {
         self.dropped_bytes
     }
 
+    /// To patch code that straddles pages correctly, we need to start with
+    /// the dropped bytes flag unset so we can detect when to switch to a new page.
+    pub fn set_dropped_bytes(&mut self, dropped_bytes: bool) {
+        self.dropped_bytes = dropped_bytes;
+    }
+
     /// Allocate a new label with a given name
     pub fn new_label(&mut self, name: String) -> usize {
+        assert!(!name.contains(' '), "use underscores in label names, not spaces");
+
         // This label doesn't have an address yet
         self.label_addrs.push(0);
         self.label_names.push(name);
@@ -212,22 +527,22 @@ impl CodeBlock {
 
     /// Write a label at the current address
     pub fn write_label(&mut self, label_idx: usize) {
-        // TODO: make sure that label_idx is valid
-        // TODO: add an asseer here
-
         self.label_addrs[label_idx] = self.write_pos;
     }
 
     // Add a label reference at the current write position
-    pub fn label_ref(&mut self, label_idx: usize) {
-        // TODO: make sure that label_idx is valid
-        // TODO: add an asseer here
+    pub fn label_ref(&mut self, label_idx: usize, num_bytes: usize, encode: fn(&mut CodeBlock, i64, i64)) {
+        assert!(label_idx < self.label_addrs.len());
 
         // Keep track of the reference
-        self.label_refs.push(LabelRef {
-            pos: self.write_pos,
-            label_idx,
-        });
+        self.label_refs.push(LabelRef { pos: self.write_pos, label_idx, num_bytes, encode });
+
+        // Move past however many bytes the instruction takes up
+        if self.has_capacity(num_bytes) {
+            self.write_pos += num_bytes;
+        } else {
+            self.dropped_bytes = true; // retry emitting the Insn after next_page
+        }
     }
 
     // Link internal label references
@@ -243,11 +558,12 @@ impl CodeBlock {
             let label_addr = self.label_addrs[label_idx];
             assert!(label_addr < self.mem_size);
 
-            // Compute the offset from the reference's end to the label
-            let offset = (label_addr as i64) - ((ref_pos + 4) as i64);
-
             self.set_pos(ref_pos);
-            self.write_int(offset as u64, 32);
+            (label_ref.encode)(self, (ref_pos + label_ref.num_bytes) as i64, label_addr as i64);
+
+            // Assert that we've written the same number of bytes that we
+            // expected to have written.
+            assert!(self.write_pos == ref_pos + label_ref.num_bytes);
         }
 
         self.write_pos = orig_pos;
@@ -258,8 +574,110 @@ impl CodeBlock {
         assert!(self.label_refs.is_empty());
     }
 
+    pub fn clear_labels(&mut self) {
+        self.label_addrs.clear();
+        self.label_names.clear();
+        self.label_refs.clear();
+    }
+
+    pub fn get_label_state(&self) -> LabelState {
+        LabelState {
+            label_addrs: self.label_addrs.clone(),
+            label_names: self.label_names.clone(),
+            label_refs: self.label_refs.clone(),
+        }
+    }
+
+    pub fn set_label_state(&mut self, state: LabelState) {
+        self.label_addrs = state.label_addrs;
+        self.label_names = state.label_names;
+        self.label_refs = state.label_refs;
+    }
+
     pub fn mark_all_executable(&mut self) {
-        self.mem_block.mark_all_executable();
+        self.mem_block.borrow_mut().mark_all_executable();
+    }
+
+    /// Code GC. Free code pages that are not on stack and reuse them.
+    pub fn code_gc(&mut self, ocb: &mut OutlinedCb) {
+        assert!(self.inline(), "must use on inline code block");
+
+        // The previous code GC failed to free any pages. Give up.
+        if self.freed_pages.as_ref() == &Some(vec![]) {
+            return;
+        }
+
+        // Check which pages are still in use
+        let mut pages_in_use = vec![false; self.num_mapped_pages()];
+        // For each ISEQ, we currently assume that only code pages used by inline code
+        // are used by outlined code, so we mark only code pages used by inlined code.
+        for_each_on_stack_iseq_payload(|iseq_payload| {
+            for page in &iseq_payload.pages {
+                pages_in_use[*page] = true;
+            }
+        });
+        // Avoid accumulating freed pages for future code GC
+        for_each_off_stack_iseq_payload(|iseq_payload: &mut IseqPayload| {
+            iseq_payload.pages = std::collections::HashSet::default();
+        });
+        // Outlined code generated by CodegenGlobals::init() should also be kept.
+        for page in CodegenGlobals::get_ocb_pages() {
+            pages_in_use[*page] = true;
+        }
+
+        // Invalidate everything to have more compact code after code GC.
+        // This currently patches every ISEQ, which works, but in the future,
+        // we could limit that to patch only on-stack ISEQs for optimizing code GC.
+        rb_yjit_tracing_invalidate_all();
+
+        // Assert that all code pages are freeable
+        assert_eq!(
+            0,
+            self.mem_size % self.page_size,
+            "end of the last code page should be the end of the entire region"
+        );
+
+        // Let VirtualMem free the pages
+        let mut freed_pages: Vec<usize> = pages_in_use.iter().enumerate()
+            .filter(|&(_, &in_use)| !in_use).map(|(page, _)| page).collect();
+        // ObjectSpace API may trigger Ruby's GC, which marks gc_offsets in JIT code.
+        // So this should be called after for_each_*_iseq_payload and rb_yjit_tracing_invalidate_all.
+        self.free_pages(&freed_pages);
+
+        // Append virtual pages in case RubyVM::YJIT.code_gc is manually triggered.
+        let mut virtual_pages: Vec<usize> = (self.num_mapped_pages()..self.num_virtual_pages()).collect();
+        freed_pages.append(&mut virtual_pages);
+
+        if let Some(&first_page) = freed_pages.first() {
+            for cb in [&mut *self, ocb.unwrap()] {
+                cb.write_pos = cb.get_page_pos(first_page);
+                cb.past_page_bytes = 0;
+                cb.dropped_bytes = false;
+                cb.clear_comments();
+            }
+        }
+
+        // Track which pages are free.
+        let new_freed_pages = Rc::new(Some(freed_pages));
+        let old_freed_pages = mem::replace(&mut self.freed_pages, Rc::clone(&new_freed_pages));
+        ocb.unwrap().freed_pages = new_freed_pages;
+        assert_eq!(1, Rc::strong_count(&old_freed_pages)); // will deallocate
+
+        incr_counter!(code_gc_count);
+    }
+
+    pub fn inline(&self) -> bool {
+        !self.outlined
+    }
+
+    pub fn other_cb(&self) -> Option<&'static mut Self> {
+        if !CodegenGlobals::has_instance() {
+            None
+        } else if self.inline() {
+            Some(CodegenGlobals::get_outlined_cb().unwrap())
+        } else {
+            Some(CodegenGlobals::get_inline_cb())
+        }
     }
 }
 
@@ -267,14 +685,51 @@ impl CodeBlock {
 impl CodeBlock {
     /// Stubbed CodeBlock for testing. Can't execute generated code.
     pub fn new_dummy(mem_size: usize) -> Self {
+        use std::ptr::NonNull;
         use crate::virtualmem::*;
         use crate::virtualmem::tests::TestingAllocator;
 
         let alloc = TestingAllocator::new(mem_size);
         let mem_start: *const u8 = alloc.mem_start();
-        let virt_mem = VirtualMem::new(alloc, 1, mem_start as *mut u8, mem_size);
+        let virt_mem = VirtualMem::new(alloc, 1, NonNull::new(mem_start as *mut u8).unwrap(), mem_size);
 
-        Self::new(virt_mem)
+        Self::new(Rc::new(RefCell::new(virt_mem)), false, Rc::new(None))
+    }
+
+    /// Stubbed CodeBlock for testing conditions that can arise due to code GC. Can't execute generated code.
+    #[cfg(target_arch = "aarch64")]
+    pub fn new_dummy_with_freed_pages(mut freed_pages: Vec<usize>) -> Self {
+        use std::ptr::NonNull;
+        use crate::virtualmem::*;
+        use crate::virtualmem::tests::TestingAllocator;
+
+        freed_pages.sort_unstable();
+        let mem_size = Self::PREFERRED_CODE_PAGE_SIZE *
+            (1 + freed_pages.last().expect("freed_pages vec should not be empty"));
+
+        let alloc = TestingAllocator::new(mem_size);
+        let mem_start: *const u8 = alloc.mem_start();
+        let virt_mem = VirtualMem::new(alloc, 1, NonNull::new(mem_start as *mut u8).unwrap(), mem_size);
+
+        Self::new(Rc::new(RefCell::new(virt_mem)), false, Rc::new(Some(freed_pages)))
+    }
+}
+
+/// Produce hex string output from the bytes in a code block
+impl fmt::LowerHex for CodeBlock {
+    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
+        // Take the RefCell borrow once up front rather than once per byte.
+        let mem_block = &*self.mem_block.borrow();
+        (0..self.write_pos).try_for_each(|pos| {
+            let byte = unsafe { mem_block.start_ptr().raw_ptr(mem_block).add(pos).read() };
+            write!(fmtr, "{:02x}", byte)
+        })
+    }
+}
+
+impl crate::virtualmem::CodePtrBase for CodeBlock {
+    fn base_ptr(&self) -> std::ptr::NonNull<u8> {
+        self.mem_block.borrow().base_ptr()
     }
 }
 
@@ -294,3 +749,100 @@ impl OutlinedCb {
         &mut self.cb
     }
 }
+
+/// Compute the number of bits needed to encode a signed value
+pub fn imm_num_bits(imm: i64) -> u8
+{
+    // Signed ranges, narrowest first; the first one containing `imm`
+    // gives the smallest width that can hold it.
+    let i8_range = i64::from(i8::MIN)..=i64::from(i8::MAX);
+    let i16_range = i64::from(i16::MIN)..=i64::from(i16::MAX);
+    let i32_range = i64::from(i32::MIN)..=i64::from(i32::MAX);
+
+    match imm {
+        v if i8_range.contains(&v) => 8,
+        v if i16_range.contains(&v) => 16,
+        v if i32_range.contains(&v) => 32,
+        _ => 64,
+    }
+}
+
+/// Compute the number of bits needed to encode an unsigned value
+pub fn uimm_num_bits(uimm: u64) -> u8
+{
+    // Upper bounds of the unsigned widths, widened to u64 so they can
+    // be compared against the immediate directly.
+    let u8_max = u64::from(u8::MAX);
+    let u16_max = u64::from(u16::MAX);
+    let u32_max = u64::from(u32::MAX);
+    // Pick the narrowest width whose maximum is not exceeded.
+    match uimm {
+        v if v <= u8_max => 8,
+        v if v <= u16_max => 16,
+        v if v <= u32_max => 32,
+        _ => 64,
+    }
+}
+
+#[cfg(test)]
+mod tests
+{
+    use super::*;
+
+    #[test]
+    fn test_imm_num_bits()
+    {
+        assert_eq!(imm_num_bits(i8::MIN.into()), 8);
+        assert_eq!(imm_num_bits(i8::MAX.into()), 8);
+
+        assert_eq!(imm_num_bits(i16::MIN.into()), 16);
+        assert_eq!(imm_num_bits(i16::MAX.into()), 16);
+
+        assert_eq!(imm_num_bits(i32::MIN.into()), 32);
+        assert_eq!(imm_num_bits(i32::MAX.into()), 32);
+
+        assert_eq!(imm_num_bits(i64::MIN), 64);
+        assert_eq!(imm_num_bits(i64::MAX), 64);
+    }
+
+    #[test]
+    fn test_uimm_num_bits() {
+        assert_eq!(uimm_num_bits(u8::MIN.into()), 8);
+        assert_eq!(uimm_num_bits(u8::MAX.into()), 8);
+
+        assert_eq!(uimm_num_bits(((u8::MAX as u16) + 1).into()), 16);
+        assert_eq!(uimm_num_bits(u16::MAX.into()), 16);
+
+        assert_eq!(uimm_num_bits(((u16::MAX as u32) + 1).into()), 32);
+        assert_eq!(uimm_num_bits(u32::MAX.into()), 32);
+
+        assert_eq!(uimm_num_bits((u32::MAX as u64) + 1), 64);
+        assert_eq!(uimm_num_bits(u64::MAX), 64);
+    }
+
+    #[test]
+    fn test_code_size() {
+        // Write 4 bytes in the first page
+        let mut cb = CodeBlock::new_dummy(CodeBlock::PREFERRED_CODE_PAGE_SIZE * 2);
+        cb.write_bytes(&[0, 0, 0, 0]);
+        assert_eq!(cb.code_size(), 4);
+
+        // Moving to the next page should not increase code_size
+        cb.next_page(cb.get_write_ptr(), |_, _| {});
+        assert_eq!(cb.code_size(), 4);
+
+        // Write 4 bytes in the second page
+        cb.write_bytes(&[0, 0, 0, 0]);
+        assert_eq!(cb.code_size(), 8);
+
+        // Rewrite 4 bytes in the first page
+        let old_write_pos = cb.get_write_pos();
+        cb.set_pos(0);
+        cb.write_bytes(&[1, 1, 1, 1]);
+
+        // Moving from an old page to the next page should not increase code_size
+        cb.next_page(cb.get_write_ptr(), |_, _| {});
+        cb.set_pos(old_write_pos);
+        assert_eq!(cb.code_size(), 8);
+    }
+}
diff --git a/yjit/src/asm/x86_64/mod.rs b/yjit/src/asm/x86_64/mod.rs
index 6eb7efaa0a..fbbfa714d8 100644
--- a/yjit/src/asm/x86_64/mod.rs
+++ b/yjit/src/asm/x86_64/mod.rs
@@ -5,24 +5,24 @@ use crate::asm::*;
 // Import the assembler tests module
 mod tests;
 
-#[derive(Clone, Copy, Debug)]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
 pub struct X86Imm
 {
     // Size in bits
-    num_bits: u8,
+    pub num_bits: u8,
 
     // The value of the immediate
-    value: i64
+    pub value: i64
 }
 
-#[derive(Clone, Copy, Debug)]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
 pub struct X86UImm
 {
     // Size in bits
-    num_bits: u8,
+    pub num_bits: u8,
 
     // The value of the immediate
-    value: u64
+    pub value: u64
 }
 
 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
@@ -34,36 +34,36 @@ pub enum RegType
     IP,
 }
 
-#[derive(Clone, Copy, Debug)]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
 pub struct X86Reg
 {
     // Size in bits
-    num_bits: u8,
+    pub num_bits: u8,
 
     // Register type
-    reg_type: RegType,
+    pub reg_type: RegType,
 
     // Register index number
-    reg_no: u8,
+    pub reg_no: u8,
 }
 
 #[derive(Clone, Copy, Debug)]
 pub struct X86Mem
 {
     // Size in bits
-    num_bits: u8,
+    pub num_bits: u8,
 
     /// Base register number
-    base_reg_no: u8,
+    pub base_reg_no: u8,
 
     /// Index register number
-    idx_reg_no: Option<u8>,
+    pub idx_reg_no: Option<u8>,
 
     /// SIB scale exponent value (power of two, two bits)
-    scale_exp: u8,
+    pub scale_exp: u8,
 
     /// Constant displacement from the base, not scaled
-    disp: i32,
+    pub disp: i32,
 }
 
 #[derive(Clone, Copy, Debug)]
@@ -88,6 +88,22 @@ pub enum X86Opnd
     IPRel(i32)
 }
 
+impl X86Reg {
+    pub fn with_num_bits(&self, num_bits: u8) -> Self {
+        assert!(
+            num_bits == 8 ||
+            num_bits == 16 ||
+            num_bits == 32 ||
+            num_bits == 64
+        );
+        Self {
+            num_bits,
+            reg_type: self.reg_type,
+            reg_no: self.reg_no
+        }
+    }
+}
+
 impl X86Opnd {
     fn rex_needed(&self) -> bool {
         match self {
@@ -95,7 +111,7 @@ impl X86Opnd {
             X86Opnd::Imm(_) => false,
             X86Opnd::UImm(_) => false,
             X86Opnd::Reg(reg) => reg.reg_no > 7 || reg.num_bits == 8 && reg.reg_no >= 4,
-            X86Opnd::Mem(mem) => (mem.base_reg_no > 7 || (mem.idx_reg_no.unwrap_or(0) > 7)),
+            X86Opnd::Mem(mem) => mem.base_reg_no > 7 || (mem.idx_reg_no.unwrap_or(0) > 7),
             X86Opnd::IPRel(_) => false
         }
     }
@@ -118,7 +134,7 @@ impl X86Opnd {
             X86Opnd::Mem(mem) => {
                 if mem.disp != 0 {
                     // Compute the required displacement size
-                    let num_bits = sig_imm_size(mem.disp.into());
+                    let num_bits = imm_num_bits(mem.disp.into());
                     if num_bits > 32 {
                         panic!("displacement does not fit in 32 bits");
                     }
@@ -145,6 +161,14 @@ impl X86Opnd {
             _ => unreachable!()
         }
     }
+
+    /// True for every operand except `X86Opnd::None`; lets callers
+    /// check for a present operand without an explicit `match`.
+    pub fn is_some(&self) -> bool {
+        let absent = matches!(self, X86Opnd::None);
+        !absent
+    }
+
 }
 
 // Instruction pointer
@@ -157,22 +181,39 @@ const RBP_REG_NO: u8 = 5;
 const R12_REG_NO: u8 = 12;
 const R13_REG_NO: u8 = 13;
 
-pub const RAX: X86Opnd  = X86Opnd::Reg(X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: RAX_REG_NO });
-pub const RCX: X86Opnd  = X86Opnd::Reg(X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: 1 });
-pub const RDX: X86Opnd  = X86Opnd::Reg(X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: 2 });
-pub const RBX: X86Opnd  = X86Opnd::Reg(X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: 3 });
-pub const RSP: X86Opnd  = X86Opnd::Reg(X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: RSP_REG_NO });
-pub const RBP: X86Opnd  = X86Opnd::Reg(X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: RBP_REG_NO });
-pub const RSI: X86Opnd  = X86Opnd::Reg(X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: 6 });
-pub const RDI: X86Opnd  = X86Opnd::Reg(X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: 7 });
-pub const R8:  X86Opnd  = X86Opnd::Reg(X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: 8 });
-pub const R9:  X86Opnd  = X86Opnd::Reg(X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: 9 });
-pub const R10: X86Opnd  = X86Opnd::Reg(X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: 10 });
-pub const R11: X86Opnd  = X86Opnd::Reg(X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: 11 });
-pub const R12: X86Opnd  = X86Opnd::Reg(X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: R12_REG_NO });
-pub const R13: X86Opnd  = X86Opnd::Reg(X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: R13_REG_NO });
-pub const R14: X86Opnd  = X86Opnd::Reg(X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: 14 });
-pub const R15: X86Opnd  = X86Opnd::Reg(X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: 15 });
+pub const RAX_REG: X86Reg = X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: RAX_REG_NO };
+pub const RCX_REG: X86Reg = X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: 1 };
+pub const RDX_REG: X86Reg = X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: 2 };
+pub const RBX_REG: X86Reg = X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: 3 };
+pub const RSP_REG: X86Reg = X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: RSP_REG_NO };
+pub const RBP_REG: X86Reg = X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: RBP_REG_NO };
+pub const RSI_REG: X86Reg = X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: 6 };
+pub const RDI_REG: X86Reg = X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: 7 };
+pub const R8_REG:  X86Reg = X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: 8 };
+pub const R9_REG:  X86Reg = X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: 9 };
+pub const R10_REG: X86Reg = X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: 10 };
+pub const R11_REG: X86Reg = X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: 11 };
+pub const R12_REG: X86Reg = X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: R12_REG_NO };
+pub const R13_REG: X86Reg = X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: R13_REG_NO };
+pub const R14_REG: X86Reg = X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: 14 };
+pub const R15_REG: X86Reg = X86Reg { num_bits: 64, reg_type: RegType::GP, reg_no: 15 };
+
+pub const RAX: X86Opnd  = X86Opnd::Reg(RAX_REG);
+pub const RCX: X86Opnd  = X86Opnd::Reg(RCX_REG);
+pub const RDX: X86Opnd  = X86Opnd::Reg(RDX_REG);
+pub const RBX: X86Opnd  = X86Opnd::Reg(RBX_REG);
+pub const RSP: X86Opnd  = X86Opnd::Reg(RSP_REG);
+pub const RBP: X86Opnd  = X86Opnd::Reg(RBP_REG);
+pub const RSI: X86Opnd  = X86Opnd::Reg(RSI_REG);
+pub const RDI: X86Opnd  = X86Opnd::Reg(RDI_REG);
+pub const R8:  X86Opnd  = X86Opnd::Reg(R8_REG);
+pub const R9:  X86Opnd  = X86Opnd::Reg(R9_REG);
+pub const R10: X86Opnd  = X86Opnd::Reg(R10_REG);
+pub const R11: X86Opnd  = X86Opnd::Reg(R11_REG);
+pub const R12: X86Opnd  = X86Opnd::Reg(R12_REG);
+pub const R13: X86Opnd  = X86Opnd::Reg(R13_REG);
+pub const R14: X86Opnd  = X86Opnd::Reg(R14_REG);
+pub const R15: X86Opnd  = X86Opnd::Reg(R15_REG);
 
 // 32-bit GP registers
 pub const EAX: X86Opnd  = X86Opnd::Reg(X86Reg { num_bits: 32, reg_type: RegType::GP, reg_no: 0 });
@@ -197,7 +238,7 @@ pub const AX:   X86Opnd = X86Opnd::Reg(X86Reg { num_bits: 16, reg_type: RegType:
 pub const CX:   X86Opnd = X86Opnd::Reg(X86Reg { num_bits: 16, reg_type: RegType::GP, reg_no: 1 });
 pub const DX:   X86Opnd = X86Opnd::Reg(X86Reg { num_bits: 16, reg_type: RegType::GP, reg_no: 2 });
 pub const BX:   X86Opnd = X86Opnd::Reg(X86Reg { num_bits: 16, reg_type: RegType::GP, reg_no: 3 });
-pub const SP:   X86Opnd = X86Opnd::Reg(X86Reg { num_bits: 16, reg_type: RegType::GP, reg_no: 4 });
+//pub const SP:   X86Opnd = X86Opnd::Reg(X86Reg { num_bits: 16, reg_type: RegType::GP, reg_no: 4 });
 pub const BP:   X86Opnd = X86Opnd::Reg(X86Reg { num_bits: 16, reg_type: RegType::GP, reg_no: 5 });
 pub const SI:   X86Opnd = X86Opnd::Reg(X86Reg { num_bits: 16, reg_type: RegType::GP, reg_no: 6 });
 pub const DI:   X86Opnd = X86Opnd::Reg(X86Reg { num_bits: 16, reg_type: RegType::GP, reg_no: 7 });
@@ -228,45 +269,8 @@ pub const R13B: X86Opnd = X86Opnd::Reg(X86Reg { num_bits: 8, reg_type: RegType::
 pub const R14B: X86Opnd = X86Opnd::Reg(X86Reg { num_bits: 8, reg_type: RegType::GP, reg_no: 14 });
 pub const R15B: X86Opnd = X86Opnd::Reg(X86Reg { num_bits: 8, reg_type: RegType::GP, reg_no: 15 });
 
-// C argument registers
-pub const C_ARG_REGS: [X86Opnd; 6] = [RDI, RSI, RDX, RCX, R8, R9];
-
 //===========================================================================
 
-/// Compute the number of bits needed to encode a signed value
-pub fn sig_imm_size(imm: i64) -> u8
-{
-    // Compute the smallest size this immediate fits in
-    if imm >= i8::MIN.into() && imm <= i8::MAX.into() {
-        return 8;
-    }
-    if imm >= i16::MIN.into() && imm <= i16::MAX.into() {
-        return 16;
-    }
-    if imm >= i32::MIN.into() && imm <= i32::MAX.into() {
-        return 32;
-    }
-
-    return 64;
-}
-
-/// Compute the number of bits needed to encode an unsigned value
-pub fn unsig_imm_size(imm: u64) -> u8
-{
-    // Compute the smallest size this immediate fits in
-    if imm <= u8::MAX.into() {
-        return 8;
-    }
-    else if imm <= u16::MAX.into() {
-        return 16;
-    }
-    else if imm <= u32::MAX.into() {
-        return 32;
-    }
-
-    return 64;
-}
-
 /// Shorthand for memory operand with base register and displacement
 pub fn mem_opnd(num_bits: u8, base_reg: X86Opnd, disp: i32) -> X86Opnd
 {
@@ -345,12 +349,12 @@ static x86opnd_t resize_opnd(x86opnd_t opnd, uint32_t num_bits)
 
 pub fn imm_opnd(value: i64) -> X86Opnd
 {
-    X86Opnd::Imm(X86Imm { num_bits: sig_imm_size(value), value })
+    X86Opnd::Imm(X86Imm { num_bits: imm_num_bits(value), value })
 }
 
 pub fn uimm_opnd(value: u64) -> X86Opnd
 {
-    X86Opnd::UImm(X86UImm { num_bits: unsig_imm_size(value), value })
+    X86Opnd::UImm(X86UImm { num_bits: uimm_num_bits(value), value })
 }
 
 pub fn const_ptr_opnd(ptr: *const u8) -> X86Opnd
@@ -358,11 +362,6 @@ pub fn const_ptr_opnd(ptr: *const u8) -> X86Opnd
     uimm_opnd(ptr as u64)
 }
 
-pub fn code_ptr_opnd(code_ptr: CodePtr) -> X86Opnd
-{
-    uimm_opnd(code_ptr.raw_ptr() as u64)
-}
-
 /// Write the REX byte
 fn write_rex(cb: &mut CodeBlock, w_flag: bool, reg_no: u8, idx_reg_no: u8, rm_reg_no: u8) {
     // 0 1 0 0 w r x b
@@ -386,7 +385,7 @@ fn write_opcode(cb: &mut CodeBlock, opcode: u8, reg: X86Reg) {
 }
 
 /// Encode an RM instruction
-fn write_rm(cb: &mut CodeBlock, sz_pref: bool, rex_w: bool, r_opnd: X86Opnd, rm_opnd: X86Opnd, op_ext: u8, bytes: &[u8]) {
+fn write_rm(cb: &mut CodeBlock, sz_pref: bool, rex_w: bool, r_opnd: X86Opnd, rm_opnd: X86Opnd, op_ext: Option<u8>, bytes: &[u8]) {
     let op_len = bytes.len();
     assert!(op_len > 0 && op_len <= 3);
     assert!(matches!(r_opnd, X86Opnd::Reg(_) | X86Opnd::None), "Can only encode an RM instruction with a register or a none");
@@ -443,7 +442,7 @@ fn write_rm(cb: &mut CodeBlock, sz_pref: bool, rex_w: bool, r_opnd: X86Opnd, rm_
     // MODRM.rm  (3 bits)
 
     assert!(
-        !(op_ext != 0xff && !matches!(r_opnd, X86Opnd::None)),
+        !(op_ext.is_some() && r_opnd.is_some()),
         "opcode extension and register operand present"
     );
 
@@ -464,8 +463,8 @@ fn write_rm(cb: &mut CodeBlock, sz_pref: bool, rex_w: bool, r_opnd: X86Opnd, rm_
 
     // Encode the reg field
     let reg: u8;
-    if op_ext != 0xff {
-        reg = op_ext;
+    if let Some(val) = op_ext {
+        reg = val;
     } else {
         reg = match r_opnd {
             X86Opnd::Reg(reg) => reg.reg_no & 7,
@@ -526,7 +525,7 @@ fn write_rm(cb: &mut CodeBlock, sz_pref: bool, rex_w: bool, r_opnd: X86Opnd, rm_
 }
 
 // Encode a mul-like single-operand RM instruction
-fn write_rm_unary(cb: &mut CodeBlock, op_mem_reg_8: u8, op_mem_reg_pref: u8, op_ext: u8, opnd: X86Opnd) {
+fn write_rm_unary(cb: &mut CodeBlock, op_mem_reg_8: u8, op_mem_reg_pref: u8, op_ext: Option<u8>, opnd: X86Opnd) {
     assert!(matches!(opnd, X86Opnd::Reg(_) | X86Opnd::Mem(_)));
 
     let opnd_size = opnd.num_bits();
@@ -542,7 +541,7 @@ fn write_rm_unary(cb: &mut CodeBlock, op_mem_reg_8: u8, op_mem_reg_pref: u8, op_
 }
 
 // Encode an add-like RM instruction with multiple possible encodings
-fn write_rm_multi(cb: &mut CodeBlock, op_mem_reg8: u8, op_mem_reg_pref: u8, op_reg_mem8: u8, op_reg_mem_pref: u8, op_mem_imm8: u8, op_mem_imm_sml: u8, op_mem_imm_lrg: u8, op_ext_imm: u8, opnd0: X86Opnd, opnd1: X86Opnd) {
+fn write_rm_multi(cb: &mut CodeBlock, op_mem_reg8: u8, op_mem_reg_pref: u8, op_reg_mem8: u8, op_reg_mem_pref: u8, op_mem_imm8: u8, op_mem_imm_sml: u8, op_mem_imm_lrg: u8, op_ext_imm: Option<u8>, opnd0: X86Opnd, opnd1: X86Opnd) {
     assert!(matches!(opnd0, X86Opnd::Reg(_) | X86Opnd::Mem(_)));
 
     // Check the size of opnd0
@@ -551,8 +550,8 @@ fn write_rm_multi(cb: &mut CodeBlock, op_mem_reg8: u8, op_mem_reg_pref: u8, op_r
 
     // Check the size of opnd1
     match opnd1 {
-        X86Opnd::Reg(reg) => assert!(reg.num_bits == opnd_size),
-        X86Opnd::Mem(mem) => assert!(mem.num_bits == opnd_size),
+        X86Opnd::Reg(reg) => assert_eq!(reg.num_bits, opnd_size),
+        X86Opnd::Mem(mem) => assert_eq!(mem.num_bits, opnd_size),
         X86Opnd::Imm(imm) => assert!(imm.num_bits <= opnd_size),
         X86Opnd::UImm(uimm) => assert!(uimm.num_bits <= opnd_size),
         _ => ()
@@ -565,17 +564,17 @@ fn write_rm_multi(cb: &mut CodeBlock, op_mem_reg8: u8, op_mem_reg_pref: u8, op_r
         // R/M + Reg
         (X86Opnd::Mem(_), X86Opnd::Reg(_)) | (X86Opnd::Reg(_), X86Opnd::Reg(_)) => {
             if opnd_size == 8 {
-                write_rm(cb, false, false, opnd1, opnd0, 0xff, &[op_mem_reg8]);
+                write_rm(cb, false, false, opnd1, opnd0, None, &[op_mem_reg8]);
             } else {
-                write_rm(cb, sz_pref, rex_w, opnd1, opnd0, 0xff, &[op_mem_reg_pref]);
+                write_rm(cb, sz_pref, rex_w, opnd1, opnd0, None, &[op_mem_reg_pref]);
             }
         },
         // Reg + R/M/IPRel
         (X86Opnd::Reg(_), X86Opnd::Mem(_) | X86Opnd::IPRel(_)) => {
             if opnd_size == 8 {
-                write_rm(cb, false, false, opnd0, opnd1, 0xff, &[op_reg_mem8]);
+                write_rm(cb, false, false, opnd0, opnd1, None, &[op_reg_mem8]);
             } else {
-                write_rm(cb, sz_pref, rex_w, opnd0, opnd1, 0xff, &[op_reg_mem_pref]);
+                write_rm(cb, sz_pref, rex_w, opnd0, opnd1, None, &[op_reg_mem_pref]);
             }
         },
         // R/M + Imm
@@ -602,7 +601,14 @@ fn write_rm_multi(cb: &mut CodeBlock, op_mem_reg8: u8, op_mem_reg_pref: u8, op_r
         },
         // R/M + UImm
         (_, X86Opnd::UImm(uimm)) => {
-            let num_bits = sig_imm_size(uimm.value.try_into().unwrap());
+            // If the size of left hand operand equals the number of bits
+            // required to represent the right hand immediate, then we
+            // don't care about sign extension when calculating the immediate
+            let num_bits = if opnd0.num_bits() == uimm_num_bits(uimm.value) {
+                uimm_num_bits(uimm.value)
+            } else {
+                imm_num_bits(uimm.value.try_into().unwrap())
+            };
 
             if num_bits <= 8 {
                 // 8-bit immediate
@@ -621,10 +627,10 @@ fn write_rm_multi(cb: &mut CodeBlock, op_mem_reg8: u8, op_mem_reg_pref: u8, op_r
                 write_rm(cb, sz_pref, rex_w, X86Opnd::None, opnd0, op_ext_imm, &[op_mem_imm_lrg]);
                 cb.write_int(uimm.value, if opnd_size > 32 { 32 } else { opnd_size.into() });
             } else {
-                panic!("immediate value too large");
+                panic!("immediate value too large (num_bits={}, num={uimm:?})", num_bits);
             }
         },
-        _ => unreachable!()
+        _ => panic!("unknown encoding combo: {opnd0:?} {opnd1:?}")
     };
 }
 
@@ -644,7 +650,7 @@ pub fn add(cb: &mut CodeBlock, opnd0: X86Opnd, opnd1: X86Opnd) {
         0x80, // opMemImm8
         0x83, // opMemImmSml
         0x81, // opMemImmLrg
-        0x00, // opExtImm
+        Some(0x00), // opExtImm
         opnd0,
         opnd1
     );
@@ -661,7 +667,7 @@ pub fn and(cb: &mut CodeBlock, opnd0: X86Opnd, opnd1: X86Opnd) {
         0x80, // opMemImm8
         0x83, // opMemImmSml
         0x81, // opMemImmLrg
-        0x04, // opExtImm
+        Some(0x04), // opExtImm
         opnd0,
         opnd1
     );
@@ -679,19 +685,23 @@ pub fn call_rel32(cb: &mut CodeBlock, rel32: i32) {
 /// call - Call a pointer, encode with a 32-bit offset if possible
 pub fn call_ptr(cb: &mut CodeBlock, scratch_opnd: X86Opnd, dst_ptr: *const u8) {
     if let X86Opnd::Reg(_scratch_reg) = scratch_opnd {
+        use crate::stats::{incr_counter};
+
         // Pointer to the end of this call instruction
         let end_ptr = cb.get_ptr(cb.write_pos + 5);
 
         // Compute the jump offset
-        let rel64: i64 = dst_ptr as i64 - end_ptr.into_i64();
+        let rel64: i64 = dst_ptr as i64 - end_ptr.raw_ptr(cb) as i64;
 
         // If the offset fits in 32-bit
         if rel64 >= i32::MIN.into() && rel64 <= i32::MAX.into() {
+            incr_counter!(num_send_x86_rel32);
             call_rel32(cb, rel64.try_into().unwrap());
             return;
         }
 
         // Move the pointer into the scratch register and call
+        incr_counter!(num_send_x86_reg);
         mov(cb, scratch_opnd, const_ptr_opnd(dst_ptr));
         call(cb, scratch_opnd);
     } else {
@@ -701,19 +711,15 @@ pub fn call_ptr(cb: &mut CodeBlock, scratch_opnd: X86Opnd, dst_ptr: *const u8) {
 
 /// call - Call to label with 32-bit offset
 pub fn call_label(cb: &mut CodeBlock, label_idx: usize) {
-    // Write the opcode
-    cb.write_byte(0xE8);
-
-    // Add a reference to the label
-    cb.label_ref(label_idx);
-
-    // Relative 32-bit offset to be patched
-    cb.write_int(0, 32);
+    cb.label_ref(label_idx, 5, |cb, src_addr, dst_addr| {
+        cb.write_byte(0xE8);
+        cb.write_int((dst_addr - src_addr) as u64, 32);
+    });
 }
 
 /// call - Indirect call with an R/M operand
 pub fn call(cb: &mut CodeBlock, opnd: X86Opnd) {
-    write_rm(cb, false, false, X86Opnd::None, opnd, 2, &[0xff]);
+    write_rm(cb, false, false, X86Opnd::None, opnd, Some(2), &[0xff]);
 }
 
 /// Encode a conditional move instruction
@@ -729,7 +735,7 @@ fn write_cmov(cb: &mut CodeBlock, opcode1: u8, dst: X86Opnd, src: X86Opnd) {
         let sz_pref = reg.num_bits == 16;
         let rex_w = reg.num_bits == 64;
 
-        write_rm(cb, sz_pref, rex_w, dst, src, 0xff, &[0x0f, opcode1]);
+        write_rm(cb, sz_pref, rex_w, dst, src, None, &[0x0f, opcode1]);
     } else {
         unreachable!()
     }
@@ -778,7 +784,7 @@ pub fn cmp(cb: &mut CodeBlock, opnd0: X86Opnd, opnd1: X86Opnd) {
         0x80, // opMemImm8
         0x83, // opMemImmSml
         0x81, // opMemImmLrg
-        0x07, // opExtImm
+        Some(0x07), // opExtImm
         opnd0,
         opnd1
     );
@@ -794,60 +800,84 @@ pub fn cqo(cb: &mut CodeBlock) {
     cb.write_bytes(&[0x48, 0x99]);
 }
 
+/// imul - signed integer multiply
+pub fn imul(cb: &mut CodeBlock, opnd0: X86Opnd, opnd1: X86Opnd) {
+    assert!(opnd0.num_bits() == 64);
+    assert!(opnd1.num_bits() == 64);
+    assert!(matches!(opnd0, X86Opnd::Reg(_) | X86Opnd::Mem(_)));
+    assert!(matches!(opnd1, X86Opnd::Reg(_) | X86Opnd::Mem(_)));
+
+    match (opnd0, opnd1) {
+        (X86Opnd::Reg(_), X86Opnd::Reg(_) | X86Opnd::Mem(_)) => {
+            // REX.W + 0F AF /r: IMUL r64, r/m64
+            // Quadword register := Quadword register * r/m64.
+            write_rm(cb, false, true, opnd0, opnd1, None, &[0x0F, 0xAF]);
+        }
+
+        // Flip the operands to handle this case. This instruction has weird encoding restrictions.
+        (X86Opnd::Mem(_), X86Opnd::Reg(_)) => {
+            // REX.W + 0F AF /r: IMUL r64, r/m64
+            // Quadword register := Quadword register * r/m64.
+            write_rm(cb, false, true, opnd1, opnd0, None, &[0x0F, 0xAF]);
+        }
+
+        _ => unreachable!()
+    }
+}
+
 /// Interrupt 3 - trap to debugger
 pub fn int3(cb: &mut CodeBlock) {
     cb.write_byte(0xcc);
 }
 
-// Encode a relative jump to a label (direct or conditional)
+// Encode a conditional relative jump to a label
 // Note: this always encodes a 32-bit offset
-fn write_jcc(cb: &mut CodeBlock, op0: u8, op1: u8, label_idx: usize) {
-    // Write the opcode
-    if op0 != 0xff {
-        cb.write_byte(op0);
-    }
-
-    cb.write_byte(op1);
-
-    // Add a reference to the label
-    cb.label_ref(label_idx);
-
-    // Relative 32-bit offset to be patched
-    cb.write_int( 0, 32);
+fn write_jcc<const OP: u8>(cb: &mut CodeBlock, label_idx: usize) {
+    cb.label_ref(label_idx, 6, |cb, src_addr, dst_addr| {
+        cb.write_byte(0x0F);
+        cb.write_byte(OP);
+        cb.write_int((dst_addr - src_addr) as u64, 32);
+    });
 }
 
 /// jcc - relative jumps to a label
-pub fn ja_label  (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x87, label_idx); }
-pub fn jae_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x83, label_idx); }
-pub fn jb_label  (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x82, label_idx); }
-pub fn jbe_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x86, label_idx); }
-pub fn jc_label  (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x82, label_idx); }
-pub fn je_label  (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x84, label_idx); }
-pub fn jg_label  (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x8F, label_idx); }
-pub fn jge_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x8D, label_idx); }
-pub fn jl_label  (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x8C, label_idx); }
-pub fn jle_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x8E, label_idx); }
-pub fn jna_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x86, label_idx); }
-pub fn jnae_label(cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x82, label_idx); }
-pub fn jnb_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x83, label_idx); }
-pub fn jnbe_label(cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x87, label_idx); }
-pub fn jnc_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x83, label_idx); }
-pub fn jne_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x85, label_idx); }
-pub fn jng_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x8E, label_idx); }
-pub fn jnge_label(cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x8C, label_idx); }
-pub fn jnl_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x8D, label_idx); }
-pub fn jnle_label(cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x8F, label_idx); }
-pub fn jno_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x81, label_idx); }
-pub fn jnp_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x8b, label_idx); }
-pub fn jns_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x89, label_idx); }
-pub fn jnz_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x85, label_idx); }
-pub fn jo_label  (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x80, label_idx); }
-pub fn jp_label  (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x8A, label_idx); }
-pub fn jpe_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x8A, label_idx); }
-pub fn jpo_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x8B, label_idx); }
-pub fn js_label  (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x88, label_idx); }
-pub fn jz_label  (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0x0F, 0x84, label_idx); }
-pub fn jmp_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc(cb, 0xFF, 0xE9, label_idx); }
+pub fn ja_label  (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x87>(cb, label_idx); }
+pub fn jae_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x83>(cb, label_idx); }
+pub fn jb_label  (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x82>(cb, label_idx); }
+pub fn jbe_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x86>(cb, label_idx); }
+pub fn jc_label  (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x82>(cb, label_idx); } // same opcode byte as jb_label
+pub fn je_label  (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x84>(cb, label_idx); }
+pub fn jg_label  (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x8F>(cb, label_idx); }
+pub fn jge_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x8D>(cb, label_idx); }
+pub fn jl_label  (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x8C>(cb, label_idx); }
+pub fn jle_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x8E>(cb, label_idx); }
+pub fn jna_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x86>(cb, label_idx); } // same opcode byte as jbe_label
+pub fn jnae_label(cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x82>(cb, label_idx); } // same opcode byte as jb_label
+pub fn jnb_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x83>(cb, label_idx); } // same opcode byte as jae_label
+pub fn jnbe_label(cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x87>(cb, label_idx); } // same opcode byte as ja_label
+pub fn jnc_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x83>(cb, label_idx); } // same opcode byte as jae_label
+pub fn jne_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x85>(cb, label_idx); }
+pub fn jng_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x8E>(cb, label_idx); } // same opcode byte as jle_label
+pub fn jnge_label(cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x8C>(cb, label_idx); } // same opcode byte as jl_label
+pub fn jnl_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x8D>(cb, label_idx); } // same opcode byte as jge_label
+pub fn jnle_label(cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x8F>(cb, label_idx); } // same opcode byte as jg_label
+pub fn jno_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x81>(cb, label_idx); }
+pub fn jnp_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x8b>(cb, label_idx); }
+pub fn jns_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x89>(cb, label_idx); }
+pub fn jnz_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x85>(cb, label_idx); } // same opcode byte as jne_label
+pub fn jo_label  (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x80>(cb, label_idx); }
+pub fn jp_label  (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x8A>(cb, label_idx); }
+pub fn jpe_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x8A>(cb, label_idx); } // same opcode byte as jp_label
+pub fn jpo_label (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x8B>(cb, label_idx); } // same opcode byte as jnp_label
+pub fn js_label  (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x88>(cb, label_idx); }
+pub fn jz_label  (cb: &mut CodeBlock, label_idx: usize) { write_jcc::<0x84>(cb, label_idx); } // same opcode byte as je_label
+
+pub fn jmp_label(cb: &mut CodeBlock, label_idx: usize) { // jump to a label; emits a single opcode byte, so it does not go through write_jcc
+    cb.label_ref(label_idx, 5, |cb, src_addr, dst_addr| { // the closure emits 5 bytes (1 opcode + 4 offset); presumably 5 is the reserved size -- confirm against label_ref
+        cb.write_byte(0xE9);
+        cb.write_int((dst_addr - src_addr) as u64, 32); // difference of the two addresses, written as 32 bits
+    });
+}
 
 /// Encode a relative jump to a pointer at a 32-bit offset (direct or conditional)
 fn write_jcc_ptr(cb: &mut CodeBlock, op0: u8, op1: u8, dst_ptr: CodePtr) {
@@ -862,7 +892,7 @@ fn write_jcc_ptr(cb: &mut CodeBlock, op0: u8, op1: u8, dst_ptr: CodePtr) {
     let end_ptr = cb.get_ptr(cb.write_pos + 4);
 
     // Compute the jump offset
-    let rel64 = dst_ptr.into_i64() - end_ptr.into_i64();
+    let rel64 = dst_ptr.as_offset() - end_ptr.as_offset();
 
     if rel64 >= i32::MIN.into() && rel64 <= i32::MAX.into() {
         // Write the relative 32-bit jump offset
@@ -909,7 +939,7 @@ pub fn jmp_ptr (cb: &mut CodeBlock, ptr: CodePtr) { write_jcc_ptr(cb, 0xFF, 0xE9
 
 /// jmp - Indirect jump near to an R/M operand.
 pub fn jmp_rm(cb: &mut CodeBlock, opnd: X86Opnd) {
-    write_rm(cb, false, false, X86Opnd::None, opnd, 4, &[0xff]);
+    write_rm(cb, false, false, X86Opnd::None, opnd, Some(4), &[0xff]);
 }
 
 // jmp - Jump with relative 32-bit offset
@@ -922,7 +952,8 @@ pub fn jmp32(cb: &mut CodeBlock, offset: i32) {
 pub fn lea(cb: &mut CodeBlock, dst: X86Opnd, src: X86Opnd) {
     if let X86Opnd::Reg(reg) = dst {
         assert!(reg.num_bits == 64);
-        write_rm(cb, false, true, dst, src, 0xff, &[0x8d]);
+        assert!(matches!(src, X86Opnd::Mem(_) | X86Opnd::IPRel(_)));
+        write_rm(cb, false, true, dst, src, None, &[0x8d]);
     } else {
         unreachable!();
     }
@@ -990,13 +1021,13 @@ pub fn mov(cb: &mut CodeBlock, dst: X86Opnd, src: X86Opnd) {
             assert!(imm.num_bits <= mem.num_bits);
 
             if mem.num_bits == 8 {
-                write_rm(cb, false, false, X86Opnd::None, dst, 0xff, &[0xc6]);
+                write_rm(cb, false, false, X86Opnd::None, dst, None, &[0xc6]);
             } else {
-                write_rm(cb, mem.num_bits == 16, mem.num_bits == 64, X86Opnd::None, dst, 0, &[0xc7]);
+                write_rm(cb, mem.num_bits == 16, mem.num_bits == 64, X86Opnd::None, dst, Some(0), &[0xc7]);
             }
 
             let output_num_bits:u32 = if mem.num_bits > 32 { 32 } else { mem.num_bits.into() };
-            assert!(sig_imm_size(imm.value) <= (output_num_bits as u8));
+            assert!(imm_num_bits(imm.value) <= (output_num_bits as u8));
             cb.write_int(imm.value as u64, output_num_bits);
         },
         // M + UImm
@@ -1004,14 +1035,14 @@ pub fn mov(cb: &mut CodeBlock, dst: X86Opnd, src: X86Opnd) {
             assert!(uimm.num_bits <= mem.num_bits);
 
             if mem.num_bits == 8 {
-                write_rm(cb, false, false, X86Opnd::None, dst, 0xff, &[0xc6]);
+                write_rm(cb, false, false, X86Opnd::None, dst, None, &[0xc6]);
             }
             else {
-                write_rm(cb, mem.num_bits == 16, mem.num_bits == 64, X86Opnd::None, dst, 0, &[0xc7]);
+                write_rm(cb, mem.num_bits == 16, mem.num_bits == 64, X86Opnd::None, dst, Some(0), &[0xc7]);
             }
 
             let output_num_bits = if mem.num_bits > 32 { 32 } else { mem.num_bits.into() };
-            assert!(sig_imm_size(uimm.value as i64) <= (output_num_bits as u8));
+            assert!(imm_num_bits(uimm.value as i64) <= (output_num_bits as u8));
             cb.write_int(uimm.value, output_num_bits);
         },
         // * + Imm/UImm
@@ -1027,7 +1058,7 @@ pub fn mov(cb: &mut CodeBlock, dst: X86Opnd, src: X86Opnd) {
                 0xC6, // opMemImm8
                 0xFF, // opMemImmSml (not available)
                 0xFF, // opMemImmLrg
-                0xFF, // opExtImm
+                None, // opExtImm
                 dst,
                 src
             );
@@ -1035,6 +1066,20 @@ pub fn mov(cb: &mut CodeBlock, dst: X86Opnd, src: X86Opnd) {
     };
 }
 
+/// A variant of mov that always writes `value` as a full 64-bit immediate (used for GC offsets). Panics unless `dst` is a 64-bit register.
+pub fn movabs(cb: &mut CodeBlock, dst: X86Opnd, value: u64) {
+    match dst {
+        X86Opnd::Reg(reg) => {
+            assert_eq!(reg.num_bits, 64); // narrower registers are rejected
+            write_rex(cb, true, 0, 0, reg.reg_no);
+
+            write_opcode(cb, 0xb8, reg);
+            cb.write_int(value, 64); // always 64 bits wide, however small `value` is
+        },
+        _ => unreachable!() // any non-register destination panics here
+    }
+}
+
 /// movsx - Move with sign extension (signed integers)
 pub fn movsx(cb: &mut CodeBlock, dst: X86Opnd, src: X86Opnd) {
     if let X86Opnd::Reg(_dst_reg) = dst {
@@ -1045,9 +1090,9 @@ pub fn movsx(cb: &mut CodeBlock, dst: X86Opnd, src: X86Opnd) {
         assert!(src_num_bits < dst_num_bits);
 
         match src_num_bits {
-            8 => write_rm(cb, dst_num_bits == 16, dst_num_bits == 64, dst, src, 0xff, &[0x0f, 0xbe]),
-            16 => write_rm(cb, dst_num_bits == 16, dst_num_bits == 64, dst, src, 0xff, &[0x0f, 0xbf]),
-            32 => write_rm(cb, false, true, dst, src, 0xff, &[0x63]),
+            8 => write_rm(cb, dst_num_bits == 16, dst_num_bits == 64, dst, src, None, &[0x0f, 0xbe]),
+            16 => write_rm(cb, dst_num_bits == 16, dst_num_bits == 64, dst, src, None, &[0x0f, 0xbf]),
+            32 => write_rm(cb, false, true, dst, src, None, &[0x63]),
             _ => unreachable!()
         };
     } else {
@@ -1125,7 +1170,7 @@ pub fn not(cb: &mut CodeBlock, opnd: X86Opnd) {
         cb,
         0xf6, // opMemReg8
         0xf7, // opMemRegPref
-        0x02, // opExt
+        Some(0x02), // opExt
         opnd
     );
 }
@@ -1141,7 +1186,7 @@ pub fn or(cb: &mut CodeBlock, opnd0: X86Opnd, opnd1: X86Opnd) {
         0x80, // opMemImm8
         0x83, // opMemImmSml
         0x81, // opMemImmLrg
-        0x01, // opExtImm
+        Some(0x01), // opExtImm
         opnd0,
         opnd1
     );
@@ -1161,7 +1206,7 @@ pub fn pop(cb: &mut CodeBlock, opnd: X86Opnd) {
         X86Opnd::Mem(mem) => {
             assert!(mem.num_bits == 64);
 
-            write_rm(cb, false, false, X86Opnd::None, opnd, 0, &[0x8f]);
+            write_rm(cb, false, false, X86Opnd::None, opnd, Some(0), &[0x8f]);
         },
         _ => unreachable!()
     };
@@ -1183,7 +1228,7 @@ pub fn push(cb: &mut CodeBlock, opnd: X86Opnd) {
             write_opcode(cb, 0x50, reg);
         },
         X86Opnd::Mem(_mem) => {
-            write_rm(cb, false, false, X86Opnd::None, opnd, 6, &[0xff]);
+            write_rm(cb, false, false, X86Opnd::None, opnd, Some(6), &[0xff]);
         },
         _ => unreachable!()
     }
@@ -1199,8 +1244,8 @@ pub fn ret(cb: &mut CodeBlock) {
     cb.write_byte(0xC3);
 }
 
-// Encode a single-operand shift instruction
-fn write_shift(cb: &mut CodeBlock, op_mem_one_pref: u8, _op_mem_cl_pref: u8, op_mem_imm_pref: u8, op_ext: u8, opnd0: X86Opnd, opnd1: X86Opnd) {
+// Encode a bitwise shift instruction
+fn write_shift(cb: &mut CodeBlock, op_mem_one_pref: u8, op_mem_cl_pref: u8, op_mem_imm_pref: u8, op_ext: u8, opnd0: X86Opnd, opnd1: X86Opnd) {
     assert!(matches!(opnd0, X86Opnd::Reg(_) | X86Opnd::Mem(_)));
 
     // Check the size of opnd0
@@ -1210,16 +1255,26 @@ fn write_shift(cb: &mut CodeBlock, op_mem_one_pref: u8, _op_mem_cl_pref: u8, op_
     let sz_pref = opnd_size == 16;
     let rex_w = opnd_size == 64;
 
-    if let X86Opnd::UImm(imm) = opnd1 {
-        if imm.value == 1 {
-            write_rm(cb, sz_pref, rex_w, X86Opnd::None, opnd0, op_ext, &[op_mem_one_pref]);
-        } else {
-            assert!(imm.num_bits <= 8);
-            write_rm(cb, sz_pref, rex_w, X86Opnd::None, opnd0, op_ext, &[op_mem_imm_pref]);
-            cb.write_byte(imm.value as u8);
+    match opnd1 {
+        X86Opnd::UImm(imm) => {
+            if imm.value == 1 {
+                write_rm(cb, sz_pref, rex_w, X86Opnd::None, opnd0, Some(op_ext), &[op_mem_one_pref]);
+            } else {
+                assert!(imm.num_bits <= 8);
+                write_rm(cb, sz_pref, rex_w, X86Opnd::None, opnd0, Some(op_ext), &[op_mem_imm_pref]);
+                cb.write_byte(imm.value as u8);
+            }
+        }
+
+        X86Opnd::Reg(reg) => {
+            // We can only use CL/RCX as the shift amount
+            assert!(reg.reg_no == RCX_REG.reg_no);
+            write_rm(cb, sz_pref, rex_w, X86Opnd::None, opnd0, Some(op_ext), &[op_mem_cl_pref]);
+        }
+
+        _ => {
+            unreachable!("unsupported operands: {:?}, {:?}", opnd0, opnd1);
         }
-    } else {
-        unreachable!();
     }
 }
 
@@ -1286,7 +1341,7 @@ pub fn sub(cb: &mut CodeBlock, opnd0: X86Opnd, opnd1: X86Opnd) {
         0x80, // opMemImm8
         0x83, // opMemImmSml
         0x81, // opMemImmLrg
-        0x05, // opExtImm
+        Some(0x05), // opExtImm
         opnd0,
         opnd1
     );
@@ -1323,10 +1378,10 @@ pub fn test(cb: &mut CodeBlock, rm_opnd: X86Opnd, test_opnd: X86Opnd) {
             let rm_resized = resize_opnd(rm_opnd, uimm.num_bits);
 
             if uimm.num_bits == 8 {
-                write_rm(cb, false, false, X86Opnd::None, rm_resized, 0x00, &[0xf6]);
+                write_rm(cb, false, false, X86Opnd::None, rm_resized, Some(0x00), &[0xf6]);
                 cb.write_int(uimm.value, uimm.num_bits.into());
             } else {
-                write_rm(cb, uimm.num_bits == 16, false, X86Opnd::None, rm_resized, 0x00, &[0xf7]);
+                write_rm(cb, uimm.num_bits == 16, false, X86Opnd::None, rm_resized, Some(0x00), &[0xf7]);
                 cb.write_int(uimm.value, uimm.num_bits.into());
             }
         },
@@ -1335,16 +1390,16 @@ pub fn test(cb: &mut CodeBlock, rm_opnd: X86Opnd, test_opnd: X86Opnd) {
             assert!(imm.num_bits <= 32);
             assert!(rm_num_bits == 64);
 
-            write_rm(cb, false, true, X86Opnd::None, rm_opnd, 0x00, &[0xf7]);
+            write_rm(cb, false, true, X86Opnd::None, rm_opnd, Some(0x00), &[0xf7]);
             cb.write_int(imm.value as u64, 32);
         },
         X86Opnd::Reg(reg) => {
             assert!(reg.num_bits == rm_num_bits);
 
             if rm_num_bits == 8 {
-                write_rm(cb, false, false, test_opnd, rm_opnd, 0xff, &[0x84]);
+                write_rm(cb, false, false, test_opnd, rm_opnd, None, &[0x84]);
             } else {
-                write_rm(cb, rm_num_bits == 16, rm_num_bits == 64, test_opnd, rm_opnd, 0xff, &[0x85]);
+                write_rm(cb, rm_num_bits == 16, rm_num_bits == 64, test_opnd, rm_opnd, None, &[0x85]);
             }
         },
         _ => unreachable!()
@@ -1370,7 +1425,7 @@ pub fn xchg(cb: &mut CodeBlock, rm_opnd: X86Opnd, r_opnd: X86Opnd) {
             // Write the opcode and register number
             cb.write_byte(0x90 + (r_reg.reg_no & 7));
         } else {
-            write_rm(cb, false, true, r_opnd, rm_opnd, 0xff, &[0x87]);
+            write_rm(cb, false, true, r_opnd, rm_opnd, None, &[0x87]);
         }
     } else {
         unreachable!();
@@ -1388,7 +1443,7 @@ pub fn xor(cb: &mut CodeBlock, opnd0: X86Opnd, opnd1: X86Opnd) {
         0x80, // opMemImm8
         0x83, // opMemImmSml
         0x81, // opMemImmLrg
-        0x06, // opExtImm
+        Some(0x06), // opExtImm
         opnd0,
         opnd1
     );
diff --git a/yjit/src/asm/x86_64/tests.rs b/yjit/src/asm/x86_64/tests.rs
index ffcc063420..5ae983270f 100644
--- a/yjit/src/asm/x86_64/tests.rs
+++ b/yjit/src/asm/x86_64/tests.rs
@@ -1,18 +1,6 @@
 #![cfg(test)]
 
 use crate::asm::x86_64::*;
-use std::fmt;
-
-/// Produce hex string output from the bytes in a code block
-impl<'a> fmt::LowerHex for super::CodeBlock {
-    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
-        for pos in 0..self.write_pos {
-            let byte = unsafe { self.mem_block.start_ptr().raw_ptr().add(pos).read() };
-            fmtr.write_fmt(format_args!("{:02x}", byte))?;
-        }
-        Ok(())
-    }
-}
 
 /// Check that the bytes for an instruction sequence match a hex string
 fn check_bytes<R>(bytes: &str, run: R) where R: FnOnce(&mut super::CodeBlock) {
@@ -80,7 +68,7 @@ fn test_call_ptr() {
     // calling a lower address
     check_bytes("e8fbffffff", |cb| {
         let ptr = cb.get_write_ptr();
-        call_ptr(cb, RAX, ptr.raw_ptr());
+        call_ptr(cb, RAX, ptr.raw_ptr(cb));
     });
 }
 
@@ -109,6 +97,7 @@ fn test_cmp() {
     check_bytes("39f9", |cb| cmp(cb, ECX, EDI));
     check_bytes("493b1424", |cb| cmp(cb, RDX, mem_opnd(64, R12, 0)));
     check_bytes("4883f802", |cb| cmp(cb, RAX, imm_opnd(2)));
+    check_bytes("81f900000080", |cb| cmp(cb, ECX, uimm_opnd(0x8000_0000)));
 }
 
 #[test]
@@ -117,6 +106,15 @@ fn test_cqo() {
 }
 
 #[test]
+fn test_imul() {
+    check_bytes("480fafc3", |cb| imul(cb, RAX, RBX)); // register, register
+    check_bytes("480faf10", |cb| imul(cb, RDX, mem_opnd(64, RAX, 0))); // register, memory
+
+    // Memory operand first: expects the same bytes as the line above, because the operands are flipped for encoding (multiplication is commutative)
+    check_bytes("480faf10", |cb| imul(cb, mem_opnd(64, RAX, 0), RDX));
+}
+
+#[test]
 fn test_jge_label() {
     check_bytes("0f8dfaffffff", |cb| {
         let label_idx = cb.new_label("loop".to_owned());
@@ -201,6 +199,12 @@ fn test_mov() {
 }
 
 #[test]
+fn test_movabs() {
+    check_bytes("49b83400000000000000", |cb| movabs(cb, R8, 0x34)); // a small value still takes eight immediate bytes
+    check_bytes("49b80000008000000000", |cb| movabs(cb, R8, 0x80000000)); // the upper four immediate bytes stay zero
+}
+
+#[test]
 fn test_mov_unsigned() {
     // MOV AL, imm8
     check_bytes("b001", |cb| mov(cb, AL, uimm_opnd(1)));
@@ -345,6 +349,7 @@ fn test_sal() {
     check_bytes("d1e1", |cb| sal(cb, ECX, uimm_opnd(1)));
     check_bytes("c1e505", |cb| sal(cb, EBP, uimm_opnd(5)));
     check_bytes("d1642444", |cb| sal(cb, mem_opnd(32, RSP, 68), uimm_opnd(1)));
+    check_bytes("48d3e1", |cb| sal(cb, RCX, CL));
 }
 
 #[test]
@@ -364,6 +369,14 @@ fn test_sub() {
 }
 
 #[test]
+#[should_panic]
+fn test_sub_uimm_too_large() {
+    // This immediate becomes a different value after
+    // sign extension, so not safe to encode.
+    check_bytes("ff", |cb| sub(cb, RCX, uimm_opnd(0x8000_0000))); // "ff" looks like a placeholder: the test passes only if this call panics
+}
+
+#[test]
 fn test_test() {
     check_bytes("84c0", |cb| test(cb, AL, AL));
     check_bytes("6685c0", |cb| test(cb, AX, AX));
@@ -425,19 +438,19 @@ fn basic_capstone_usage() -> std::result::Result<(), capstone::Error> {
 }
 
 #[test]
-#[cfg(feature = "asm_comments")]
+#[cfg(feature = "disasm")]
 fn block_comments() {
     let mut cb = super::CodeBlock::new_dummy(4096);
 
-    let first_write_ptr = cb.get_write_ptr().into_usize();
+    let first_write_ptr = cb.get_write_ptr().raw_addr(&cb);
     cb.add_comment("Beginning");
     xor(&mut cb, EAX, EAX); // 2 bytes long
-    let second_write_ptr = cb.get_write_ptr().into_usize();
+    let second_write_ptr = cb.get_write_ptr().raw_addr(&cb);
     cb.add_comment("Two bytes in");
     cb.add_comment("Still two bytes in");
     cb.add_comment("Still two bytes in"); // Duplicate, should be ignored
     test(&mut cb, mem_opnd(64, RSI, 64), imm_opnd(!0x08)); // 8 bytes long
-    let third_write_ptr = cb.get_write_ptr().into_usize();
+    let third_write_ptr = cb.get_write_ptr().raw_addr(&cb);
     cb.add_comment("Ten bytes in");
 
     assert_eq!(&vec!( "Beginning".to_string() ), cb.comments_at(first_write_ptr).unwrap());