From 28433e9aa0c765c9d20bc6397439a1b12e66bcbd Mon Sep 17 00:00:00 2001
From: Kevin Newton
Date: Tue, 27 Sep 2022 16:58:01 -0400
Subject: Change IncrCounter lowering on AArch64 (#6455)

* Change IncrCounter lowering on AArch64

Previously we were using LDADDAL which is not available on Graviton 1
chips. Instead, we're going to use an exclusive load/store group through
the LDAXR/STLXR instructions.

* Update yjit/src/backend/arm64/mod.rs

Co-authored-by: Maxime Chevalier-Boisvert
---
 yjit/src/asm/arm64/inst/load_store_exclusive.rs | 109 ++++++++++++++++++++++++
 yjit/src/asm/arm64/inst/mod.rs                  |   2 +
 yjit/src/asm/arm64/mod.rs                       |  39 +++++++
 yjit/src/asm/arm64/opnd.rs                      |  10 +++
 yjit/src/backend/arm64/mod.rs                   |  32 ++++---
 5 files changed, 181 insertions(+), 11 deletions(-)
 create mode 100644 yjit/src/asm/arm64/inst/load_store_exclusive.rs

diff --git a/yjit/src/asm/arm64/inst/load_store_exclusive.rs b/yjit/src/asm/arm64/inst/load_store_exclusive.rs
new file mode 100644
index 0000000000..8216c2200a
--- /dev/null
+++ b/yjit/src/asm/arm64/inst/load_store_exclusive.rs
@@ -0,0 +1,109 @@
+/// The operation being performed for this instruction.
+enum Op {
+    Store = 0,
+    Load = 1
+}
+
+/// The size of the registers being operated on.
+enum Size {
+    Size32 = 0b10,
+    Size64 = 0b11
+}
+
+/// A convenience function so that we can convert the number of bits of a
+/// register operand directly into a Size enum variant.
+impl From<u8> for Size {
+    fn from(num_bits: u8) -> Self {
+        match num_bits {
+            64 => Size::Size64,
+            32 => Size::Size32,
+            _ => panic!("Invalid number of bits: {}", num_bits)
+        }
+    }
+}
+
+/// The struct that represents an A64 load or store exclusive instruction that
+/// can be encoded.
+///
+/// LDAXR/STLXR
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+/// | 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12 | 11 10 09 08 | 07 06 05 04 | 03 02 01 00 |
+/// |  1     0  0    1  0  0  0    0     0                     1  1  1  1    1  1                                   |
+/// | size.                          op    rs..............                       rn.............. rt.............. |
+/// +-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
+///
+pub struct LoadStoreExclusive {
+    /// The number of the register to be loaded.
+    rt: u8,
+
+    /// The base register with which to form the address.
+    rn: u8,
+
+    /// The register to be used for the status result if it applies to this
+    /// operation. Otherwise it's the zero register.
+    rs: u8,
+
+    /// The operation being performed for this instruction.
+    op: Op,
+
+    /// The size of the registers being operated on.
+    size: Size
+}
+
+impl LoadStoreExclusive {
+    /// LDAXR
+    /// https://developer.arm.com/documentation/ddi0602/2021-12/Base-Instructions/LDAXR--Load-Acquire-Exclusive-Register-
+    pub fn ldaxr(rt: u8, rn: u8, num_bits: u8) -> Self {
+        Self { rt, rn, rs: 31, op: Op::Load, size: num_bits.into() }
+    }
+
+    /// STLXR
+    /// https://developer.arm.com/documentation/ddi0602/2021-12/Base-Instructions/STLXR--Store-Release-Exclusive-Register-
+    pub fn stlxr(rs: u8, rt: u8, rn: u8, num_bits: u8) -> Self {
+        Self { rt, rn, rs, op: Op::Store, size: num_bits.into() }
+    }
+}
+
+/// https://developer.arm.com/documentation/ddi0602/2022-03/Index-by-Encoding/Loads-and-Stores?lang=en
+const FAMILY: u32 = 0b0100;
+
+impl From<LoadStoreExclusive> for u32 {
+    /// Convert an instruction into a 32-bit value.
+    fn from(inst: LoadStoreExclusive) -> Self {
+        0
+        | ((inst.size as u32) << 30)
+        | (FAMILY << 25)
+        | ((inst.op as u32) << 22)
+        | ((inst.rs as u32) << 16)
+        | (0b111111 << 10)
+        | ((inst.rn as u32) << 5)
+        | (inst.rt as u32)
+    }
+}
+
+impl From<LoadStoreExclusive> for [u8; 4] {
+    /// Convert an instruction into a 4 byte array.
+    fn from(inst: LoadStoreExclusive) -> [u8; 4] {
+        let result: u32 = inst.into();
+        result.to_le_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_ldaxr() {
+        let inst = LoadStoreExclusive::ldaxr(16, 0, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xc85ffc10, result);
+    }
+
+    #[test]
+    fn test_stlxr() {
+        let inst = LoadStoreExclusive::stlxr(17, 16, 0, 64);
+        let result: u32 = inst.into();
+        assert_eq!(0xc811fc10, result);
+    }
+}
diff --git a/yjit/src/asm/arm64/inst/mod.rs b/yjit/src/asm/arm64/inst/mod.rs
index b3a77e73c9..9821e6a334 100644
--- a/yjit/src/asm/arm64/inst/mod.rs
+++ b/yjit/src/asm/arm64/inst/mod.rs
@@ -13,6 +13,7 @@ mod halfword_imm;
 mod load_literal;
 mod load_register;
 mod load_store;
+mod load_store_exclusive;
 mod logical_imm;
 mod logical_reg;
 mod mov;
@@ -36,6 +37,7 @@ pub use halfword_imm::HalfwordImm;
 pub use load_literal::LoadLiteral;
 pub use load_register::LoadRegister;
 pub use load_store::LoadStore;
+pub use load_store_exclusive::LoadStoreExclusive;
 pub use logical_imm::LogicalImm;
 pub use logical_reg::LogicalReg;
 pub use mov::Mov;
diff --git a/yjit/src/asm/arm64/mod.rs b/yjit/src/asm/arm64/mod.rs
index d97452a045..88431ce30a 100644
--- a/yjit/src/asm/arm64/mod.rs
+++ b/yjit/src/asm/arm64/mod.rs
@@ -331,6 +331,20 @@ pub fn ldaddal(cb: &mut CodeBlock, rs: A64Opnd, rt: A64Opnd, rn: A64Opnd) {
     cb.write_bytes(&bytes);
 }
 
+/// LDAXR - atomic load with acquire semantics
+pub fn ldaxr(cb: &mut CodeBlock, rt: A64Opnd, rn: A64Opnd) {
+    let bytes: [u8; 4] = match (rt, rn) {
+        (A64Opnd::Reg(rt), A64Opnd::Reg(rn)) => {
+            assert_eq!(rn.num_bits, 64, "rn must be a 64-bit register.");
+
+            LoadStoreExclusive::ldaxr(rt.reg_no, rn.reg_no, rt.num_bits).into()
+        },
+        _ => panic!("Invalid operand combination to ldaxr instruction."),
+    };
+
+    cb.write_bytes(&bytes);
+}
+
 /// LDP (signed offset) - load a pair of registers from memory
 pub fn ldp(cb: &mut CodeBlock, rt1: A64Opnd, rt2: A64Opnd, rn: A64Opnd) {
     let bytes: [u8; 4] = match (rt1, rt2, rn) {
@@ -707,6 +721,21 @@ pub fn orr(cb: &mut CodeBlock, rd: A64Opnd, rn: A64Opnd, rm: A64Opnd) {
     cb.write_bytes(&bytes);
 }
 
+/// STLXR - store a value to memory, release exclusive access
+pub fn stlxr(cb: &mut CodeBlock, rs: A64Opnd, rt: A64Opnd, rn: A64Opnd) {
+    let bytes: [u8; 4] = match (rs, rt, rn) {
+        (A64Opnd::Reg(rs), A64Opnd::Reg(rt), A64Opnd::Reg(rn)) => {
+            assert_eq!(rs.num_bits, 32, "rs must be a 32-bit register.");
+            assert_eq!(rn.num_bits, 64, "rn must be a 64-bit register.");
+
+            LoadStoreExclusive::stlxr(rs.reg_no, rt.reg_no, rn.reg_no, rn.num_bits).into()
+        },
+        _ => panic!("Invalid operand combination to stlxr instruction.")
+    };
+
+    cb.write_bytes(&bytes);
+}
+
 /// STP (signed offset) - store a pair of registers to memory
 pub fn stp(cb: &mut CodeBlock, rt1: A64Opnd, rt2: A64Opnd, rn: A64Opnd) {
     let bytes: [u8; 4] = match (rt1, rt2, rn) {
@@ -1183,6 +1212,11 @@ mod tests {
         check_bytes("8b01eaf8", |cb| ldaddal(cb, X10, X11, X12));
     }
 
+    #[test]
+    fn test_ldaxr() {
+        check_bytes("6afd5fc8", |cb| ldaxr(cb, X10, X11));
+    }
+
     #[test]
     fn test_ldp() {
         check_bytes("8a2d4da9", |cb| ldp(cb, X10, X11, A64Opnd::new_mem(64, X12, 208)));
@@ -1333,6 +1367,11 @@ mod tests {
         check_bytes("80025fd6", |cb| ret(cb, X20));
     }
 
+    #[test]
+    fn test_stlxr() {
+        check_bytes("8bfd0ac8", |cb| stlxr(cb, W10, X11, X12));
+    }
+
     #[test]
     fn test_stp() {
         check_bytes("8a2d0da9", |cb| stp(cb, X10, X11, A64Opnd::new_mem(64, X12, 208)));
diff --git a/yjit/src/asm/arm64/opnd.rs b/yjit/src/asm/arm64/opnd.rs
index 52b2a84637..0dc614ab4e 100644
--- a/yjit/src/asm/arm64/opnd.rs
+++ b/yjit/src/asm/arm64/opnd.rs
@@ -84,6 +84,14 @@ impl A64Opnd {
             _ => false
         }
     }
+
+    /// Unwrap a register from an operand.
+    pub fn unwrap_reg(&self) -> A64Reg {
+        match self {
+            A64Opnd::Reg(reg) => *reg,
+            _ => panic!("Expected register operand")
+        }
+    }
 }
 
 // argument registers
@@ -102,6 +110,8 @@ pub const X12_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 12 };
 pub const X13_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 13 };
 pub const X14_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 14 };
 pub const X15_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 15 };
+pub const X16_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 16 };
+pub const X17_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 17 };
 
 // callee-save registers
 pub const X19_REG: A64Reg = A64Reg { num_bits: 64, reg_no: 19 };
diff --git a/yjit/src/backend/arm64/mod.rs b/yjit/src/backend/arm64/mod.rs
index 0a5068be58..79dff530d1 100644
--- a/yjit/src/backend/arm64/mod.rs
+++ b/yjit/src/backend/arm64/mod.rs
@@ -70,7 +70,8 @@ impl Assembler
 
     // A special scratch register for intermediate processing.
    // This register is caller-saved (so we don't have to save it before using it)
-    const SCRATCH0: A64Opnd = A64Opnd::Reg(X15_REG);
+    const SCRATCH0: A64Opnd = A64Opnd::Reg(X16_REG);
+    const SCRATCH1: A64Opnd = A64Opnd::Reg(X17_REG);
 
     /// Get the list of registers from which we will allocate on this platform
     /// These are caller-saved registers
@@ -373,17 +374,12 @@ impl Assembler
                     asm.csel_ge(opnd0, opnd1);
                 },
                 Insn::IncrCounter { mem, value } => {
-                    // We'll use LDADD later which only works with registers
-                    // ... Load pointer into register
-                    let counter_addr = split_lea_operand(asm, mem);
-
-                    // Load immediates into a register
-                    let addend = match value {
-                        opnd @ Opnd::Imm(_) | opnd @ Opnd::UImm(_) => asm.load(opnd),
-                        opnd => opnd,
+                    let counter_addr = match mem {
+                        Opnd::Mem(_) => split_lea_operand(asm, mem),
+                        _ => mem
                     };
 
-                    asm.incr_counter(counter_addr, addend);
+                    asm.incr_counter(counter_addr, value);
                 },
                 Insn::JmpOpnd(opnd) => {
                     if let Opnd::Mem(_) = opnd {
@@ -936,7 +932,21 @@ impl Assembler
                     emit_conditional_jump::<{Condition::VS}>(cb, *target);
                 },
                 Insn::IncrCounter { mem, value } => {
-                    ldaddal(cb, value.into(), value.into(), mem.into());
+                    let label = cb.new_label("incr_counter_loop".to_string());
+                    cb.write_label(label);
+
+                    ldaxr(cb, Self::SCRATCH0, mem.into());
+                    add(cb, Self::SCRATCH0, Self::SCRATCH0, value.into());
+
+                    // The status register that gets used to track whether or
+                    // not the store was successful must be 32 bits. Since we
+                    // store the SCRATCH registers as their 64-bit versions, we
+                    // need to rewrap it here.
+                    let status = A64Opnd::Reg(Self::SCRATCH1.unwrap_reg().with_num_bits(32));
+                    stlxr(cb, status, Self::SCRATCH0, mem.into());
+
+                    cmp(cb, Self::SCRATCH1, A64Opnd::new_uimm(0));
+                    emit_conditional_jump::<{Condition::NE}>(cb, Target::Label(label));
                 },
                 Insn::Breakpoint => {
                     brk(cb, A64Opnd::None);
-- 
cgit v1.2.3
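
For readers who want to sanity-check the LDAXR/STLXR encoding added above outside the YJIT tree, below is a minimal standalone Rust sketch; it is not part of the patch, and the helper name encode_exclusive is invented for illustration. It packs the same bit fields as the new LoadStoreExclusive struct and reproduces the two values asserted in its unit tests. The lowered IncrCounter sequence then becomes a small loop: LDAXR loads the counter exclusively, ADD bumps it, STLXR attempts the store, and the branch on the status register retries if the exclusive monitor was lost before the store.

// Standalone sketch (illustrative only): the same bit-packing that
// LoadStoreExclusive performs, checked against the patch's test vectors.
fn encode_exclusive(load: bool, rs: u32, rn: u32, rt: u32, size64: bool) -> u32 {
    let size: u32 = if size64 { 0b11 } else { 0b10 };
    (size << 30)
        | (0b0100 << 25)        // loads-and-stores encoding family
        | ((load as u32) << 22) // L bit: 1 = LDAXR, 0 = STLXR
        | (rs << 16)            // status register (31 when unused, as in LDAXR)
        | (0b111111 << 10)      // o0 = 1, Rt2 = 0b11111
        | (rn << 5)             // base address register
        | rt                    // register being loaded or stored
}

fn main() {
    // LDAXR X16, [X0] and STLXR W17, X16, [X0] -- the same operands as the tests.
    assert_eq!(encode_exclusive(true, 31, 0, 16, true), 0xc85ffc10);
    assert_eq!(encode_exclusive(false, 17, 0, 16, true), 0xc811fc10);
    println!("encodings match the test vectors");
}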