author     Kevin Newton <kddnewton@gmail.com>       2022-07-08 13:01:21 -0400
committer  Takashi Kokubun <takashikkbn@gmail.com>  2022-08-29 08:46:59 -0700
commit     6773832ab9cad3c7bcb3b93ef85a4bcfc9b3a4e3
tree       b377f91fb5597d0ee141485205a2638cb35f75c8
parent     0551115912fd6682187dd501275096fdb7570084
More Arm64 lowering/backend work (https://github.com/Shopify/ruby/pull/307)
* More Arm64 lowering/backend work
* We now have encoding support for the LDR (literal) instruction, which loads a value from a PC-relative memory location (see the first sketch after this list).
* You can now call add/adds/sub/subs with signed immediates; each switches to the complementary instruction when the immediate is negative (see the second sketch after this list).
* We can now load immediates into registers using a minimal number of instructions (see the third sketch after this list):
  * If the value fits into 16 bits, we use a single movz.
  * Otherwise, if it can be encoded as a bitmask immediate, we use a single mov.
  * Otherwise we use a movz, a movk, and then optionally one or two more movks.
* Fixed several issues with the Op::Load opcode.
* We now handle GC offsets properly for Op::Load by embedding the value in the instruction stream and skipping over it with a jump instruction. (This will be improved by constant pools in the future.)
* Op::Lea now does what it's supposed to do: it computes an effective address instead of loading from one.
* Fixed a bug in the backend tests where the result of an Op::Add was never used.
* Fixed the remaining tests for Arm64.
* Moved the split-loads logic into each backend.
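The sketches referenced above follow. First, a self-contained illustration of the LDR (literal) bit packing that the new LoadLiteral struct performs. This is not YJIT's API, just a hypothetical standalone encoder, but the expected 32-bit words come from the commit's own unit tests:

    // opc (0b01 = 64-bit) sits in bits 30-31, a fixed bit at 28, the
    // loads-and-stores family (0b0100) starts at bit 25, imm19 (a signed
    // word offset) occupies bits 5-23, and rt takes the low five bits.
    fn ldr_literal_64(rt: u8, imm19: i32) -> u32 {
        assert!(rt < 32, "rt must be a valid register number");
        assert!((-(1 << 18)..(1 << 18)).contains(&imm19), "offset must fit in 19 bits");

        let imm19 = (imm19 as u32) & ((1 << 19) - 1);

        (0b01 << 30) | (1 << 28) | (0b0100 << 25) | (imm19 << 5) | (rt as u32)
    }

    fn main() {
        // Expected encodings taken from the commit's tests below.
        assert_eq!(ldr_literal_64(0, 5), 0x580000a0);
        assert_eq!(ldr_literal_64(0, -5), 0x58ffff60);
    }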
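Second, the signed-immediate switch. It relies on the fact that the ADD/SUB immediate field is unsigned and that add rd, rn, #-n computes the same result as sub rd, rn, #n. The helper below is hypothetical and returns assembly text rather than machine code:

    // Negative immediates flip ADD to SUB (and, in the real backend,
    // SUB flips to ADD symmetrically).
    fn add_imm(rd: u8, rn: u8, imm12: i64) -> String {
        // Only the magnitude has to fit, since the encoded field is a
        // 12-bit unsigned value.
        assert!(imm12 > -(1 << 12) && imm12 < (1 << 12), "magnitude must fit in 12 bits");

        if imm12 < 0 {
            format!("sub x{}, x{}, #{}", rd, rn, -imm12)
        } else {
            format!("add x{}, x{}, #{}", rd, rn, imm12)
        }
    }

    fn main() {
        assert_eq!(add_imm(0, 1, 7), "add x0, x1, #7");
        assert_eq!(add_imm(0, 1, -7), "sub x0, x1, #7");
    }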
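Third, the immediate-loading decision. This sketch also emits assembly text instead of bytes; is_bitmask_imm stands in for the real BitmaskImmediate::try_from check, and unlike the committed emit_load_value it skips 16-bit chunks that are all zero (movz has already cleared them), so treat it as an outline of the strategy rather than the exact implementation:

    // Choose the shortest mov sequence that materializes a 64-bit value.
    fn load_u64(rd: u8, value: u64, is_bitmask_imm: impl Fn(u64) -> bool) -> Vec<String> {
        if value <= 0xffff {
            // Fits in 16 bits: a single movz is enough.
            return vec![format!("movz x{}, #{:#x}", rd, value)];
        }

        if is_bitmask_imm(value) {
            // Encodable as a bitmask immediate: a single mov (an ORR alias).
            return vec![format!("mov x{}, #{:#x}", rd, value)];
        }

        // Otherwise: movz for the low 16 bits, then a movk for each
        // remaining non-zero 16-bit chunk.
        let mut insns = vec![format!("movz x{}, #{:#x}", rd, value & 0xffff)];
        for shift in [16u32, 32, 48] {
            let chunk = (value >> shift) & 0xffff;
            if chunk != 0 {
                insns.push(format!("movk x{}, #{:#x}, lsl {}", rd, chunk, shift));
            }
        }
        insns
    }

    fn main() {
        // With no bitmask encoding available, this needs a movz plus two movks.
        for insn in load_u64(0, 0x1_2345_0000, |_| false) {
            println!("{}", insn);
        }
    }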
Diffstat:
 yjit/src/asm/arm64/inst/load_literal.rs |  89
 yjit/src/asm/arm64/inst/mod.rs          |   2
 yjit/src/asm/arm64/mod.rs               | 131
 yjit/src/asm/arm64/opnd.rs              |   1
 yjit/src/asm/mod.rs                     |   2
 yjit/src/backend/arm64/mod.rs           | 172
 yjit/src/backend/ir.rs                  |  65
 yjit/src/backend/tests.rs               |  27
 yjit/src/backend/x86_64/mod.rs          | 105
9 files changed, 431 insertions, 163 deletions
diff --git a/yjit/src/asm/arm64/inst/load_literal.rs b/yjit/src/asm/arm64/inst/load_literal.rs
new file mode 100644
index 0000000000..a49130c3eb
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/load_literal.rs
@@ -0,0 +1,89 @@
+/// The size of the operands being operated on.
+enum Opc {
+    Size32 = 0b00,
+    Size64 = 0b01,
+}
+
+/// A convenience function so that we can convert the number of bits of a
+/// register operand directly into an Opc enum variant.
+impl From<u8> for Opc {
+    fn from(num_bits: u8) -> Self {
+        match num_bits {
+            64 => Opc::Size64,
+            32 => Opc::Size32,
+            _ => panic!("Invalid number of bits: {}", num_bits)
+        }
+    }
+}
+
+/// The struct that represents an A64 load literal instruction that can be encoded.
+///
+/// LDR
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |         0 1   1  0  0  0                                                                                       |
+/// | opc..               imm19...........................................................  rt..............         |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct LoadLiteral {
+    /// The number of the register to load the value into.
+    rt: u8,
+
+    /// The PC-relative number of instructions to load the value from.
+    imm19: i32,
+
+    /// The size of the operands being operated on.
+    opc: Opc
+}
+
+impl LoadLiteral {
+    /// LDR (load literal)
+    /// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--literal---Load-Register--literal--?lang=en
+    pub fn ldr(rt: u8, imm19: i32, num_bits: u8) -> Self {
+        Self { rt, imm19, opc: num_bits.into() }
+    }
+}
+
+/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Loads-and-Stores?lang=en
+const FAMILY: u32 = 0b0100;
+
+impl From<LoadLiteral> for u32 {
+    /// Convert an instruction into a 32-bit value.
+    fn from(inst: LoadLiteral) -> Self {
+        let imm19 = (inst.imm19 as u32) & ((1 << 19) - 1);
+
+        0
+        | ((inst.opc as u32) << 30)
+        | (1 << 28)
+        | (FAMILY << 25)
+        | (imm19 << 5)
+        | (inst.rt as u32)
+    }
+}
+
+impl From<LoadLiteral> for [u8; 4] {
+    /// Convert an instruction into a 4 byte array.
+    fn from(inst: LoadLiteral) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_ldr_positive() {
+        let inst = LoadLiteral::ldr(0, 5, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0x580000a0, result);
+    }
+
+    #[test]
+    fn test_ldr_negative() {
+        let inst = LoadLiteral::ldr(0, -5, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0x58ffff60, result);
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/mod.rs b/yjit/src/asm/arm64/inst/mod.rs
index ae589ca564..f402f6765a 100644
--- a/yjit/src/asm/arm64/inst/mod.rs
+++ b/yjit/src/asm/arm64/inst/mod.rs
@@ -9,6 +9,7 @@ mod call;
 mod data_imm;
 mod data_reg;
 mod load;
+mod load_literal;
 mod logical_imm;
 mod logical_reg;
 mod mov;
@@ -24,6 +25,7 @@ pub use call::Call;
 pub use data_imm::DataImm;
 pub use data_reg::DataReg;
 pub use load::Load;
+pub use load_literal::LoadLiteral;
 pub use logical_imm::LogicalImm;
 pub use logical_reg::LogicalReg;
 pub use mov::Mov;
diff --git a/yjit/src/asm/arm64/mod.rs b/yjit/src/asm/arm64/mod.rs
index ced8b262c5..2dc5aa9388 100644
--- a/yjit/src/asm/arm64/mod.rs
+++ b/yjit/src/asm/arm64/mod.rs
@@ -39,11 +39,21 @@ pub fn add(cb: &mut CodeBlock, rd: A64Opnd, rn: A64Opnd, rm: A64Opnd) {
             DataReg::add(rd.reg_no, rn.reg_no, rm.reg_no, rd.num_bits).into()
         },
-        (A64Opnd::Reg(rd), A64Opnd::Reg(rn), A64Opnd::UImm(imm12)) => {
+        (A64Opnd::Reg(rd), A64Opnd::Reg(rn), A64Opnd::UImm(uimm12)) => {
             assert!(rd.num_bits == rn.num_bits, "rd and rn must be of the same size.");
-            assert!(uimm_fits_bits(imm12, 12), "The immediate operand must be 12 bits or less.");
+            assert!(uimm_fits_bits(uimm12, 12), "The immediate operand must be 12 bits or less.");
 
-            DataImm::add(rd.reg_no, rn.reg_no, imm12 as u16, rd.num_bits).into()
+            DataImm::add(rd.reg_no, rn.reg_no, uimm12 as u16, rd.num_bits).into()
+        },
+        (A64Opnd::Reg(rd), A64Opnd::Reg(rn), A64Opnd::Imm(imm12)) => {
+            assert!(rd.num_bits == rn.num_bits, "rd and rn must be of the same size.");
+            assert!(imm_fits_bits(imm12, 12), "The immediate operand must be 12 bits or less.");
+
+            if imm12 < 0 {
+                DataImm::sub(rd.reg_no, rn.reg_no, -imm12 as u16, rd.num_bits).into()
+            } else {
+                DataImm::add(rd.reg_no, rn.reg_no, imm12 as u16, rd.num_bits).into()
+            }
         },
         _ => panic!("Invalid operand combination to add instruction."),
     };
@@ -68,6 +78,16 @@ pub fn adds(cb: &mut CodeBlock, rd: A64Opnd, rn: A64Opnd, rm: A64Opnd) {
             DataImm::adds(rd.reg_no, rn.reg_no, imm12 as u16, rd.num_bits).into()
         },
+        (A64Opnd::Reg(rd), A64Opnd::Reg(rn), A64Opnd::Imm(imm12)) => {
+            assert!(rd.num_bits == rn.num_bits, "rd and rn must be of the same size.");
+            assert!(imm_fits_bits(imm12, 12), "The immediate operand must be 12 bits or less.");
+
+            if imm12 < 0 {
+                DataImm::subs(rd.reg_no, rn.reg_no, -imm12 as u16, rd.num_bits).into()
+            } else {
+                DataImm::adds(rd.reg_no, rn.reg_no, imm12 as u16, rd.num_bits).into()
+            }
+        },
         _ => panic!("Invalid operand combination to adds instruction."),
     };
@@ -237,6 +257,18 @@ pub fn ldaddal(cb: &mut CodeBlock, rs: A64Opnd, rt: A64Opnd, rn: A64Opnd) {
     cb.write_bytes(&bytes);
 }
 
+/// LDR - load a PC-relative memory address into a register
+pub fn ldr(cb: &mut CodeBlock, rt: A64Opnd, rn: i32) {
+    let bytes: [u8; 4] = match rt {
+        A64Opnd::Reg(rt) => {
+            LoadLiteral::ldr(rt.reg_no, rn, rt.num_bits).into()
+        },
+        _ => panic!("Invalid operand combination to ldr instruction."),
+    };
+
+    cb.write_bytes(&bytes);
+}
+
 /// LDUR - load a memory address into a register
 pub fn ldur(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd)
 {
     let bytes: [u8; 4] = match (rt, rn) {
@@ -415,11 +447,21 @@ pub fn sub(cb: &mut CodeBlock, rd: A64Opnd, rn: A64Opnd, rm: A64Opnd) {
             DataReg::sub(rd.reg_no, rn.reg_no, rm.reg_no, rd.num_bits).into()
         },
-        (A64Opnd::Reg(rd), A64Opnd::Reg(rn), A64Opnd::UImm(imm12)) => {
+        (A64Opnd::Reg(rd), A64Opnd::Reg(rn), A64Opnd::UImm(uimm12)) => {
             assert!(rd.num_bits == rn.num_bits, "rd and rn must be of the same size.");
-            assert!(uimm_fits_bits(imm12, 12), "The immediate operand must be 12 bits or less.");
+            assert!(uimm_fits_bits(uimm12, 12), "The immediate operand must be 12 bits or less.");
 
-            DataImm::sub(rd.reg_no, rn.reg_no, imm12 as u16, rd.num_bits).into()
+            DataImm::sub(rd.reg_no, rn.reg_no, uimm12 as u16, rd.num_bits).into()
+        },
+        (A64Opnd::Reg(rd), A64Opnd::Reg(rn), A64Opnd::Imm(imm12)) => {
+            assert!(rd.num_bits == rn.num_bits, "rd and rn must be of the same size.");
+            assert!(imm_fits_bits(imm12, 12), "The immediate operand must be 12 bits or less.");
+
+            if imm12 < 0 {
+                DataImm::add(rd.reg_no, rn.reg_no, -imm12 as u16, rd.num_bits).into()
+            } else {
+                DataImm::sub(rd.reg_no, rn.reg_no, imm12 as u16, rd.num_bits).into()
+            }
         },
         _ => panic!("Invalid operand combination to sub instruction."),
     };
@@ -438,11 +480,21 @@ pub fn subs(cb: &mut CodeBlock, rd: A64Opnd, rn: A64Opnd, rm: A64Opnd) {
             DataReg::subs(rd.reg_no, rn.reg_no, rm.reg_no, rd.num_bits).into()
         },
-        (A64Opnd::Reg(rd), A64Opnd::Reg(rn), A64Opnd::UImm(imm12)) => {
+        (A64Opnd::Reg(rd), A64Opnd::Reg(rn), A64Opnd::UImm(uimm12)) => {
             assert!(rd.num_bits == rn.num_bits, "rd and rn must be of the same size.");
-            assert!(uimm_fits_bits(imm12, 12), "The immediate operand must be 12 bits or less.");
+            assert!(uimm_fits_bits(uimm12, 12), "The immediate operand must be 12 bits or less.");
+
+            DataImm::subs(rd.reg_no, rn.reg_no, uimm12 as u16, rd.num_bits).into()
+        },
+        (A64Opnd::Reg(rd), A64Opnd::Reg(rn), A64Opnd::Imm(imm12)) => {
+            assert!(rd.num_bits == rn.num_bits, "rd and rn must be of the same size.");
+            assert!(imm_fits_bits(imm12, 12), "The immediate operand must be 12 bits or less.");
 
-            DataImm::subs(rd.reg_no, rn.reg_no, imm12 as u16, rd.num_bits).into()
+            if imm12 < 0 {
+                DataImm::adds(rd.reg_no, rn.reg_no, -imm12 as u16, rd.num_bits).into()
+            } else {
+                DataImm::subs(rd.reg_no, rn.reg_no, imm12 as u16, rd.num_bits).into()
+            }
         },
         _ => panic!("Invalid operand combination to subs instruction."),
     };
@@ -513,26 +565,46 @@ mod tests {
     }
 
     #[test]
-    fn test_add_register() {
+    fn test_add_reg() {
         check_bytes("2000028b", |cb| add(cb, X0, X1, X2));
     }
 
     #[test]
-    fn test_add_immediate() {
+    fn test_add_uimm() {
         check_bytes("201c0091", |cb| add(cb, X0, X1, A64Opnd::new_uimm(7)));
     }
 
     #[test]
-    fn test_adds_register() {
+    fn test_add_imm_positive() {
+        check_bytes("201c0091", |cb| add(cb, X0, X1, A64Opnd::new_imm(7)));
+    }
+
+    #[test]
+    fn test_add_imm_negative() {
+        check_bytes("201c00d1", |cb| add(cb, X0, X1, A64Opnd::new_imm(-7)));
+    }
+
+    #[test]
+    fn test_adds_reg() {
         check_bytes("200002ab", |cb| adds(cb, X0, X1, X2));
     }
 
     #[test]
-    fn test_adds_immediate() {
+    fn test_adds_uimm() {
         check_bytes("201c00b1", |cb| adds(cb, X0, X1, A64Opnd::new_uimm(7)));
     }
 
     #[test]
+    fn test_adds_imm_positive() {
+        check_bytes("201c00b1", |cb| adds(cb, X0, X1, A64Opnd::new_imm(7)));
+    }
+
+    #[test]
+    fn test_adds_imm_negative() {
+        check_bytes("201c00f1", |cb| adds(cb, X0, X1, A64Opnd::new_imm(-7)));
+    }
+
+    #[test]
     fn test_and_register() {
         check_bytes("2000028a", |cb| and(cb, X0, X1, X2));
     }
@@ -598,6 +670,11 @@ mod tests {
     }
 
     #[test]
+    fn test_ldr() {
+        check_bytes("40010058", |cb| ldr(cb, X0, 10));
+    }
+
+    #[test]
     fn test_ldur_memory() {
         check_bytes("20b047f8", |cb| ldur(cb, X0, A64Opnd::new_mem(64, X1, 123)));
     }
@@ -678,22 +755,42 @@ mod tests {
     }
 
     #[test]
-    fn test_sub_register() {
+    fn test_sub_reg() {
         check_bytes("200002cb", |cb| sub(cb, X0, X1, X2));
     }
 
     #[test]
-    fn test_sub_immediate() {
+    fn test_sub_uimm() {
         check_bytes("201c00d1", |cb| sub(cb, X0, X1, A64Opnd::new_uimm(7)));
     }
 
     #[test]
-    fn test_subs_register() {
+    fn test_sub_imm_positive() {
+        check_bytes("201c00d1", |cb| sub(cb, X0, X1, A64Opnd::new_imm(7)));
+    }
+
+    #[test]
+    fn test_sub_imm_negative() {
+        check_bytes("201c0091", |cb| sub(cb, X0, X1, A64Opnd::new_imm(-7)));
+    }
+
+    #[test]
+    fn test_subs_reg() {
         check_bytes("200002eb", |cb| subs(cb, X0, X1, X2));
     }
 
     #[test]
-    fn test_subs_immediate() {
+    fn test_subs_imm_positive() {
+        check_bytes("201c00f1", |cb| subs(cb, X0, X1, A64Opnd::new_imm(7)));
+    }
+
+    #[test]
+    fn test_subs_imm_negative() {
+        check_bytes("201c00b1", |cb| subs(cb, X0, X1, A64Opnd::new_imm(-7)));
+    }
+
+    #[test]
+    fn test_subs_uimm() {
         check_bytes("201c00f1", |cb| subs(cb, X0, X1, A64Opnd::new_uimm(7)));
     }
diff --git a/yjit/src/asm/arm64/opnd.rs b/yjit/src/asm/arm64/opnd.rs
index 6c06d2db3c..1738f0985c 100644
--- a/yjit/src/asm/arm64/opnd.rs
+++ b/yjit/src/asm/arm64/opnd.rs
@@ -175,3 +175,4 @@ pub const W31: A64Reg = A64Reg { num_bits: 32, reg_no: 31 };
 
 // C argument registers
 pub const C_ARG_REGS: [A64Opnd; 4] = [X0, X1, X2, X3];
+pub const C_ARG_REGREGS: [A64Reg; 4] = [X0_REG, X1_REG, X2_REG, X3_REG];
diff --git a/yjit/src/asm/mod.rs b/yjit/src/asm/mod.rs
index 5723406aec..126c9a8548 100644
--- a/yjit/src/asm/mod.rs
+++ b/yjit/src/asm/mod.rs
@@ -174,7 +174,7 @@ impl CodeBlock {
     }
 
     /// Write multiple bytes starting from the current position.
-    fn write_bytes(&mut self, bytes: &[u8]) {
+    pub fn write_bytes(&mut self, bytes: &[u8]) {
         for byte in bytes {
             self.write_byte(*byte);
         }
diff --git a/yjit/src/backend/arm64/mod.rs b/yjit/src/backend/arm64/mod.rs
index 061d21d19b..7e6a187f8f 100644
--- a/yjit/src/backend/arm64/mod.rs
+++ b/yjit/src/backend/arm64/mod.rs
@@ -61,10 +61,7 @@ impl Assembler
     /// Get the list of registers from which we can allocate on this platform
     pub fn get_alloc_regs() -> Vec<Reg>
     {
-        vec![
-            X12_REG,
-            X13_REG
-        ]
+        vec![C_RET_REG, X12_REG]
     }
 
     /// Split platform-specific instructions
@@ -75,8 +72,21 @@ impl Assembler
     fn arm64_split(mut self) -> Assembler
     {
         self.forward_pass(|asm, index, op, opnds, target| {
+            // Load all Value operands that aren't already part of a Load
+            // instruction into registers.
+            let opnds = match op {
+                Op::Load => opnds,
+                _ => opnds.into_iter().map(|opnd| {
+                    if let Opnd::Value(_) = opnd {
+                        asm.load(opnd)
+                    } else {
+                        opnd
+                    }
+                }).collect()
+            };
+
             match op {
-                Op::Add | Op::Sub => {
+                Op::Add | Op::And | Op::Sub => {
                     // Check if one of the operands is a register. If it is,
                     // then we'll make that the first operand.
                     match (opnds[0], opnds[1]) {
@@ -95,6 +105,28 @@ impl Assembler
                         }
                     }
                 },
+                Op::CCall => {
+                    assert!(opnds.len() < C_ARG_REGS.len());
+
+                    // For each of the operands we're going to first load them
+                    // into a register and then move them into the correct
+                    // argument register.
+                    for (idx, opnd) in opnds.into_iter().enumerate() {
+                        let value = asm.load(opnd);
+                        asm.mov(Opnd::Reg(C_ARG_REGREGS[idx]), value);
+                    }
+
+                    // Now we push the CCall without any arguments so that it
+                    // just performs the call.
+                    asm.ccall(target.unwrap().unwrap_fun_ptr(), vec![]);
+                },
+                Op::CRet => {
+                    if opnds[0] != Opnd::Reg(C_RET_REG) {
+                        let value = asm.load(opnds[0]);
+                        asm.mov(C_RET_OPND, value);
+                    }
+                    asm.cret(C_RET_OPND);
+                },
                 Op::IncrCounter => {
                     // Every operand to the IncrCounter instruction needs to be
                     // a register once it gets there. So here we're going to load
@@ -154,6 +186,16 @@ impl Assembler
                     asm.store(opnds[0], opnd1);
                 },
+                Op::Test => {
+                    // The value being tested must be in a register, so if it's
+                    // not already one we'll load it first.
+                    let opnd0 = match opnds[0] {
+                        Opnd::Reg(_) | Opnd::InsnOut { .. } => opnds[0],
+                        _ => asm.load(opnds[0])
+                    };
+
+                    asm.test(opnd0, opnds[1]);
+                },
                 _ => {
                     asm.push_insn(op, opnds, target);
                 }
@@ -165,6 +207,45 @@ impl Assembler
     /// Returns a list of GC offsets
     pub fn arm64_emit(&mut self, cb: &mut CodeBlock) -> Vec<u32>
     {
+        /// Emit the required instructions to load the given value into the
+        /// given register. Our goal here is to use as few instructions as
+        /// possible to get this value into the register.
+        fn emit_load_value(cb: &mut CodeBlock, rd: A64Opnd, value: u64) {
+            let mut current = value;
+
+            if current <= 0xffff {
+                // If the value fits into a single movz
+                // instruction, then we'll use that.
+                movz(cb, rd, A64Opnd::new_uimm(current), 0);
+            } else if BitmaskImmediate::try_from(current).is_ok() {
+                // Otherwise, if the immediate can be encoded
+                // with the special bitmask immediate encoding,
+                // we'll use that.
+                mov(cb, rd, A64Opnd::new_uimm(current));
+            } else {
+                // Finally we'll fall back to encoding the value
+                // using movz for the first 16 bits and movk for
+                // each subsequent set of 16 bits, as long as they
+                // are necessary.
+                movz(cb, rd, A64Opnd::new_uimm(current & 0xffff), 0);
+
+                // (We're sure this is necessary since we checked
+                // whether it fit into a single movz above.)
+                current >>= 16;
+                movk(cb, rd, A64Opnd::new_uimm(current & 0xffff), 16);
+
+                if current > 0xffff {
+                    current >>= 16;
+                    movk(cb, rd, A64Opnd::new_uimm(current & 0xffff), 32);
+                }
+
+                if current > 0xffff {
+                    current >>= 16;
+                    movk(cb, rd, A64Opnd::new_uimm(current & 0xffff), 48);
+                }
+            }
+        }
+
         /// Emit a conditional jump instruction to a specific target. This is
         /// called when lowering any of the conditional jump instructions.
         fn emit_conditional_jump(cb: &mut CodeBlock, condition: Condition, target: Target) {
@@ -203,7 +284,7 @@ impl Assembler
                     // wasn't met, in which case we'll jump past the
                     // next instruction that performs the direct jump.
                     b(cb, A64Opnd::new_imm(8));
-                    mov(cb, X29, A64Opnd::new_uimm(dst_addr as u64));
+                    emit_load_value(cb, X29, dst_addr as u64);
                     br(cb, X29);
                 }
             }
@@ -257,25 +338,57 @@ impl Assembler
                     stur(cb, insn.opnds[1].into(), insn.opnds[0].into());
                 },
                 Op::Load => {
-                    mov(cb, insn.out.into(), insn.opnds[0].into());
-
-                    // This assumes only load instructions can contain
-                    // references to GC'd Value operands. If the value being
-                    // loaded is a heap object, we'll report that back out to
-                    // the gc_offsets list.
-                    if let Opnd::Value(val) = insn.opnds[0] {
-                        if !val.special_const_p() {
-                            // The pointer immediate is encoded as the last part of the mov written out
-                            let ptr_offset: u32 = (cb.get_write_pos() as u32) - (SIZEOF_VALUE as u32);
-                            gc_offsets.push(ptr_offset);
+                    match insn.opnds[0] {
+                        Opnd::Reg(_) | Opnd::InsnOut { .. } => {
+                            mov(cb, insn.out.into(), insn.opnds[0].into());
+                        },
+                        Opnd::UImm(uimm) => {
+                            emit_load_value(cb, insn.out.into(), uimm);
+                        },
+                        Opnd::Imm(imm) => {
+                            emit_load_value(cb, insn.out.into(), imm as u64);
+                        },
+                        Opnd::Mem(_) => {
+                            ldur(cb, insn.out.into(), insn.opnds[0].into());
+                        },
+                        Opnd::Value(value) => {
+                            // This assumes only load instructions can contain
+                            // references to GC'd Value operands. If the value
+                            // being loaded is a heap object, we'll report that
+                            // back out to the gc_offsets list.
+                            ldr(cb, insn.out.into(), 1);
+                            b(cb, A64Opnd::new_uimm((SIZEOF_VALUE as u64) / 4));
+                            cb.write_bytes(&value.as_u64().to_le_bytes());
+
+                            if !value.special_const_p() {
+                                let ptr_offset: u32 = (cb.get_write_pos() as u32) - (SIZEOF_VALUE as u32);
+                                gc_offsets.push(ptr_offset);
+                            }
+                        },
+                        Opnd::None => {
+                            unreachable!("Attempted to load from None operand");
                         }
-                    }
+                    };
                 },
                 Op::Mov => {
                     mov(cb, insn.opnds[0].into(), insn.opnds[1].into());
                 },
                 Op::Lea => {
-                    ldur(cb, insn.out.into(), insn.opnds[0].into());
+                    let opnd: A64Opnd = insn.opnds[0].into();
+
+                    match opnd {
+                        A64Opnd::Mem(mem) => {
+                            add(
+                                cb,
+                                insn.out.into(),
+                                A64Opnd::Reg(A64Reg { reg_no: mem.base_reg_no, num_bits: 64 }),
+                                A64Opnd::new_imm(mem.disp.into())
+                            );
+                        },
+                        _ => {
+                            panic!("Op::Lea only accepts Opnd::Mem operands.");
+                        }
+                    };
                 },
                 Op::CPush => {
                     add(cb, C_SP_REG, C_SP_REG, C_SP_STEP);
@@ -286,14 +399,6 @@ impl Assembler
                     sub(cb, C_SP_REG, C_SP_REG, C_SP_STEP);
                 },
                 Op::CCall => {
-                    // Temporary
-                    assert!(insn.opnds.len() < C_ARG_REGS.len());
-
-                    // For each operand
-                    for (idx, opnd) in insn.opnds.iter().enumerate() {
-                        mov(cb, C_ARG_REGS[idx], insn.opnds[idx].into());
-                    }
-
                     let src_addr = cb.get_write_ptr().into_i64() + 4;
                     let dst_addr = insn.target.unwrap().unwrap_fun_ptr() as i64;
@@ -310,17 +415,12 @@ impl Assembler
                     if b_offset_fits_bits(offset) {
                         bl(cb, A64Opnd::new_imm(offset / 4));
                     } else {
-                        mov(cb, X30, A64Opnd::new_uimm(src_addr as u64));
-                        mov(cb, X29, A64Opnd::new_uimm(dst_addr as u64));
+                        emit_load_value(cb, X30, src_addr as u64);
+                        emit_load_value(cb, X29, dst_addr as u64);
                         br(cb, X29);
                     }
                 },
                 Op::CRet => {
-                    // TODO: bias allocation towards return register
-                    if insn.opnds[0] != Opnd::Reg(C_RET_REG) {
-                        mov(cb, C_RET_OPND.into(), insn.opnds[0].into());
-                    }
-
                     ret(cb, A64Opnd::None);
                 },
                 Op::Cmp => {
@@ -351,7 +451,7 @@ impl Assembler
                     if b_offset_fits_bits(offset) {
                         b(cb, A64Opnd::new_imm(offset / 4));
                     } else {
-                        mov(cb, X29, A64Opnd::new_uimm(dst_addr as u64));
+                        emit_load_value(cb, X29, dst_addr as u64);
                         br(cb, X29);
                     }
                 },
@@ -398,7 +498,7 @@ impl Assembler
     /// Optimize and compile the stored instructions
     pub fn compile_with_regs(self, cb: &mut CodeBlock, regs: Vec<Reg>) -> Vec<u32>
     {
-        let mut asm = self.arm64_split().split_loads().alloc_regs(regs);
+        let mut asm = self.arm64_split().alloc_regs(regs);
 
         // Create label instances in the code block
         for (idx, name) in asm.label_names.iter().enumerate() {
diff --git a/yjit/src/backend/ir.rs b/yjit/src/backend/ir.rs
index c9e75df01a..cd88ec560b 100644
--- a/yjit/src/backend/ir.rs
+++ b/yjit/src/backend/ir.rs
@@ -534,71 +534,6 @@ impl Assembler
         asm
     }
 
-    /// Transforms the instructions by splitting instructions that cannot be
-    /// represented in the final architecture into multiple instructions that
-    /// can.
-    pub(super) fn split_loads(self) -> Assembler
-    {
-        // Load operands that are GC values into a register
-        fn load_gc_opnds(op: Op, opnds: Vec<Opnd>, asm: &mut Assembler) -> Vec<Opnd>
-        {
-            if op == Op::Load || op == Op::Mov {
-                return opnds;
-            }
-
-            fn map_opnd(opnd: Opnd, asm: &mut Assembler) -> Opnd {
-                if let Opnd::Value(val) = opnd {
-                    // If this is a heap object, load it into a register
-                    if !val.special_const_p() {
-                        asm.load(opnd);
-                    }
-                }
-
-                opnd
-            }
-
-            opnds.into_iter().map(|opnd| map_opnd(opnd, asm)).collect()
-        }
-
-        self.forward_pass(|asm, _, op, opnds, target| {
-            // Load heap object operands into registers because most
-            // instructions can't directly work with 64-bit constants
-            let opnds = load_gc_opnds(op, opnds, asm);
-
-            match op {
-                // Check for Add, Sub, And, Mov, with two memory operands.
-                // Load one operand into memory.
-                Op::Add | Op::Sub | Op::And | Op::Mov => {
-                    match opnds.as_slice() {
-                        [Opnd::Mem(_), Opnd::Mem(_)] => {
-                            // We load opnd1 because for mov, opnd0 is the output
-                            let opnd1 = asm.load(opnds[1]);
-                            asm.push_insn(op, vec![opnds[0], opnd1], None);
-                        },
-
-                        [Opnd::Mem(_), Opnd::UImm(val)] => {
-                            if uimm_num_bits(*val) > 32 {
-                                let opnd1 = asm.load(opnds[1]);
-                                asm.push_insn(op, vec![opnds[0], opnd1], None);
-                            }
-                            else
-                            {
-                                asm.push_insn(op, opnds, target);
-                            }
-                        },
-
-                        _ => {
-                            asm.push_insn(op, opnds, target);
-                        }
-                    }
-                },
-                _ => {
-                    asm.push_insn(op, opnds, target);
-                }
-            };
-        })
-    }
-
     /// Sets the out field on the various instructions that require allocated
     /// registers because their output is used as the operand on a subsequent
     /// instruction. This is our implementation of the linear scan algorithm.
diff --git a/yjit/src/backend/tests.rs b/yjit/src/backend/tests.rs
index d72f0ec0ac..27f799fc31 100644
--- a/yjit/src/backend/tests.rs
+++ b/yjit/src/backend/tests.rs
@@ -44,22 +44,6 @@ fn test_add() {
 }
 
 #[test]
-fn test_split_loads() {
-    let mut asm = Assembler::new();
-
-    let regs = Assembler::get_alloc_regs();
-
-    asm.add(
-        Opnd::mem(64, Opnd::Reg(regs[0]), 0),
-        Opnd::mem(64, Opnd::Reg(regs[1]), 0)
-    );
-
-    let result = asm.split_loads();
-    assert_eq!(result.insns.len(), 2);
-    assert_eq!(result.insns[0].op, Op::Load);
-}
-
-#[test]
 fn test_alloc_regs() {
     let mut asm = Assembler::new();
@@ -109,7 +93,8 @@ fn test_compile()
     let regs = Assembler::get_alloc_regs();
 
     let out = asm.add(Opnd::Reg(regs[0]), Opnd::UImm(2));
-    asm.add(out, Opnd::UImm(2));
+    let out2 = asm.add(out, Opnd::UImm(2));
+    asm.store(Opnd::mem(64, SP, 0), out2);
 
     asm.compile_with_num_regs(&mut cb, 1);
 }
@@ -162,7 +147,7 @@ fn test_reuse_reg()
     let v0 = asm.add(Opnd::mem(64, SP, 0), Opnd::UImm(1));
     let v1 = asm.add(Opnd::mem(64, SP, 8), Opnd::UImm(1));
 
-    let v2 = asm.add(v0, Opnd::UImm(1)); // Reuse v1 register
+    let v2 = asm.add(v1, Opnd::UImm(1)); // Reuse v1 register
     let v3 = asm.add(v0, v2);
 
     asm.store(Opnd::mem(64, SP, 0), v2);
@@ -202,7 +187,7 @@ fn test_base_insn_out()
     // Increment and store the updated value
     asm.incr_counter(counter_opnd, 1.into());
 
-    asm.compile_with_num_regs(&mut cb, 1);
+    asm.compile_with_num_regs(&mut cb, 2);
 }
 
 #[test]
@@ -262,7 +247,7 @@ fn test_jcc_ptr()
     );
     asm.jnz(side_exit);
 
-    asm.compile_with_num_regs(&mut cb, 1);
+    asm.compile_with_num_regs(&mut cb, 2);
 }
 
 /// Direct jump to a stub e.g. for deferred compilation
@@ -293,5 +278,5 @@ fn test_jo()
     asm.mov(Opnd::mem(64, SP, 0), out_val);
 
-    asm.compile_with_num_regs(&mut cb, 1);
+    asm.compile_with_num_regs(&mut cb, 2);
 }
diff --git a/yjit/src/backend/x86_64/mod.rs b/yjit/src/backend/x86_64/mod.rs
index 7a26650549..4fd30e7144 100644
--- a/yjit/src/backend/x86_64/mod.rs
+++ b/yjit/src/backend/x86_64/mod.rs
@@ -2,7 +2,7 @@
 #![allow(unused_variables)]
 #![allow(unused_imports)]
 
-use crate::asm::{CodeBlock};
+use crate::asm::{uimm_num_bits, CodeBlock};
 use crate::asm::x86_64::*;
 use crate::codegen::{JITState};
 use crate::cruby::*;
@@ -82,36 +82,97 @@ impl Assembler
         let live_ranges: Vec<usize> = std::mem::take(&mut self.live_ranges);
 
         self.forward_pass(|asm, index, op, opnds, target| {
+            // Load heap object operands into registers because most
+            // instructions can't directly work with 64-bit constants
+            let opnds = match op {
+                Op::Load | Op::Mov => opnds,
+                _ => opnds.into_iter().map(|opnd| {
+                    if let Opnd::Value(value) = opnd {
+                        if !value.special_const_p() {
+                            asm.load(opnd)
+                        } else {
+                            opnd
+                        }
+                    } else {
+                        opnd
+                    }
+                }).collect()
+            };
+
             match op {
-                Op::Add | Op::Sub | Op::And | Op::Not => {
-                    match opnds[0] {
+                Op::Add | Op::Sub | Op::And => {
+                    let (opnd0, opnd1) = match (opnds[0], opnds[1]) {
+                        (Opnd::Mem(_), Opnd::Mem(_)) => {
+                            (asm.load(opnds[0]), asm.load(opnds[1]))
+                        },
+                        (Opnd::Mem(_), Opnd::UImm(value)) => {
+                            if uimm_num_bits(value) > 32 {
+                                (asm.load(opnds[0]), asm.load(opnds[1]))
+                            } else {
+                                (asm.load(opnds[0]), opnds[1])
+                            }
+                        },
                         // Instruction output whose live range spans beyond this instruction
-                        Opnd::InsnOut{idx, ..} => {
+                        (Opnd::InsnOut { idx, .. }, _) => {
                             if live_ranges[idx] > index {
-                                let opnd0 = asm.load(opnds[0]);
-                                let mut new_opnds = vec![opnd0];
-                                new_opnds.extend_from_slice(&opnds[1..]);
-                                asm.push_insn(op, new_opnds, None);
-                                return;
+                                (asm.load(opnds[0]), opnds[1])
+                            } else {
+                                (opnds[0], opnds[1])
                             }
                         },
-                        // We have to load memory and register operands to avoid corrupting them
-                        Opnd::Mem(_) | Opnd::Reg(_) => {
-                            let opnd0 = asm.load(opnds[0]);
-                            let mut new_opnds = vec![opnd0];
-                            new_opnds.extend_from_slice(&opnds[1..]);
-                            asm.push_insn(op, new_opnds, None);
-                            return;
+                        (Opnd::Mem(_) | Opnd::Reg(_), _) => {
+                            (asm.load(opnds[0]), opnds[1])
                         },
+                        _ => (opnds[0], opnds[1])
+                    };
 
-                        _ => {}
+                    asm.push_insn(op, vec![opnd0, opnd1], target);
+                },
+                Op::Mov => {
+                    match (opnds[0], opnds[1]) {
+                        (Opnd::Mem(_), Opnd::Mem(_)) => {
+                            // We load opnd1 because for mov, opnd0 is the output
+                            let opnd1 = asm.load(opnds[1]);
+                            asm.mov(opnds[0], opnd1);
+                        },
+                        (Opnd::Mem(_), Opnd::UImm(value)) => {
+                            if uimm_num_bits(value) > 32 {
+                                let opnd1 = asm.load(opnds[1]);
+                                asm.mov(opnds[0], opnd1);
+                            } else {
+                                asm.mov(opnds[0], opnds[1]);
+                            }
+                        },
+                        _ => {
+                            asm.mov(opnds[0], opnds[1]);
+                        }
+                    }
                 },
-                _ => {}
+                Op::Not => {
+                    let opnd0 = match opnds[0] {
+                        // If we have an instruction output whose live range
+                        // spans beyond this instruction, we have to load it.
+                        Opnd::InsnOut { idx, .. } => {
+                            if live_ranges[idx] > index {
+                                asm.load(opnds[0])
+                            } else {
+                                opnds[0]
+                            }
+                        },
+                        // We have to load memory and register operands to avoid
+                        // corrupting them.
+                        Opnd::Mem(_) | Opnd::Reg(_) => asm.load(opnds[0]),
+                        // Otherwise we can just reuse the existing operand.
+                        _ => opnds[0]
+                    };
+
+                    asm.not(opnd0);
+                },
+                _ => {
+                    asm.push_insn(op, opnds, target);
+                }
             };
-
-            asm.push_insn(op, opnds, target);
         })
     }
@@ -270,9 +331,7 @@ impl Assembler
     /// Optimize and compile the stored instructions
     pub fn compile_with_regs(self, cb: &mut CodeBlock, regs: Vec<Reg>) -> Vec<u32>
     {
-        let mut asm = self.x86_split();
-        let mut asm = asm.split_loads();
-        let mut asm = asm.alloc_regs(regs);
+        let mut asm = self.x86_split().alloc_regs(regs);
 
         // Create label instances in the code block
         for (idx, name) in asm.label_names.iter().enumerate() {