2 files changed, 232 insertions, 0 deletions
diff --git a/ext/json/simd/conf.rb b/ext/json/simd/conf.rb
new file mode 100644
index 0000000000..76f774bc97
--- /dev/null
+++ b/ext/json/simd/conf.rb
@@ -0,0 +1,24 @@
+case RbConfig::CONFIG['host_cpu']
+when /^(arm|aarch64)/
+  # Try to compile a small program using NEON instructions
+  header, type, init, extra = 'arm_neon.h', 'uint8x16_t', 'vdupq_n_u8(32)', nil
+when /^(x86_64|x64)/
+  header, type, init, extra = 'x86intrin.h', '__m128i', '_mm_set1_epi8(32)', 'if (__builtin_cpu_supports("sse2")) { printf("OK"); }'
+end
+if header
+  if have_header(header) && try_compile(<<~SRC, '-Werror=implicit-function-declaration')
+      #{cpp_include(header)}
+      int main(int argc, char **argv) {
+        #{type} test = #{init};
+        #{extra}
+        if (argc > 100000) printf("%p", &test);
+        return 0;
+      }
+    SRC
+    $defs.push("-DJSON_ENABLE_SIMD")
+  else
+    puts "Disable SIMD"
+  end
+end
+
+have_header('cpuid.h')
diff --git a/ext/json/simd/simd.h b/ext/json/simd/simd.h
new file mode 100644
index 0000000000..611b41b066
--- /dev/null
+++ b/ext/json/simd/simd.h
@@ -0,0 +1,208 @@
+#include "../json.h"
+
+typedef enum {
+    SIMD_NONE,
+    SIMD_NEON,
+    SIMD_SSE2
+} SIMD_Implementation;
+
+#ifndef __has_builtin         // Optional of course.
+  #define __has_builtin(x) 0  // Compatibility with non-clang compilers.
+#endif
+
+#ifdef __clang__
+# if __has_builtin(__builtin_ctzll)
+#   define HAVE_BUILTIN_CTZLL 1
+# else
+#   define HAVE_BUILTIN_CTZLL 0
+# endif
+#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
+# define HAVE_BUILTIN_CTZLL 1
+#else
+# define HAVE_BUILTIN_CTZLL 0
+#endif
+
+static inline uint32_t trailing_zeros64(uint64_t input)
+{
+    JSON_ASSERT(input > 0); // __builtin_ctz(0) is undefined behavior
+
+#if HAVE_BUILTIN_CTZLL
+    return __builtin_ctzll(input);
+#else
+    uint32_t trailing_zeros = 0;
+    uint64_t temp = input;
+    while ((temp & 1) == 0 && temp > 0) {
+        trailing_zeros++;
+        temp >>= 1;
+    }
+    return trailing_zeros;
+#endif
+}
+
+static inline int trailing_zeros(int input)
+{
+    JSON_ASSERT(input > 0); // __builtin_ctz(0) is undefined behavior
+
+#if HAVE_BUILTIN_CTZLL
+    return __builtin_ctz(input);
+#else
+    int trailing_zeros = 0;
+    int temp = input;
+    while ((temp & 1) == 0 && temp > 0) {
+        trailing_zeros++;
+        temp >>= 1;
+    }
+    return trailing_zeros;
+#endif
+}
+
+#ifdef JSON_ENABLE_SIMD
+
+#define SIMD_MINIMUM_THRESHOLD 4
+
+ALWAYS_INLINE(static) void json_fast_memcpy16(char *dst, const char *src, size_t len)
+{
+    RBIMPL_ASSERT_OR_ASSUME(len < 16);
+    RBIMPL_ASSERT_OR_ASSUME(len >= SIMD_MINIMUM_THRESHOLD); // 4
+#if defined(__has_builtin) && __has_builtin(__builtin_memcpy)
+    // If __builtin_memcpy is available, use it to copy between SIMD_MINIMUM_THRESHOLD (4) and vec_len-1 (15) bytes.
+    // These copies overlap. The first copy will copy the first 8 (or 4) bytes. The second copy will copy
+    // the last 8 (or 4) bytes but overlap with the first copy. The overlapping bytes will be in the correct
+    // position in both copies.
+
+    // Please do not attempt to replace __builtin_memcpy with memcpy without profiling and/or looking at the
+    // generated assembly. On clang-specifically (tested on Apple clang version 17.0.0 (clang-1700.0.13.3)),
+    // when using memcpy, the compiler will notice the only difference is a 4 or 8 and generate a conditional
+    // select instruction instead of direct loads and stores with a branch. This ends up slower than the branch
+    // plus two loads and stores generated when using __builtin_memcpy.
+    if (len >= 8) {
+        __builtin_memcpy(dst, src, 8);
+        __builtin_memcpy(dst + len - 8, src + len - 8, 8);
+    } else {
+        __builtin_memcpy(dst, src, 4);
+        __builtin_memcpy(dst + len - 4, src + len - 4, 4);
+    }
+#else
+    MEMCPY(dst, src, char, len);
+#endif
+}
+
+#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
+#include <arm_neon.h>
+
+#define FIND_SIMD_IMPLEMENTATION_DEFINED 1
+static inline SIMD_Implementation find_simd_implementation(void)
+{
+    return SIMD_NEON;
+}
+
+#define HAVE_SIMD 1
+#define HAVE_SIMD_NEON 1
+
+// See: https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
+ALWAYS_INLINE(static) uint64_t neon_match_mask(uint8x16_t matches)
+{
+    const uint8x8_t res = vshrn_n_u16(vreinterpretq_u16_u8(matches), 4);
+    const uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(res), 0);
+    return mask & 0x8888888888888888ull;
+}
+
+ALWAYS_INLINE(static) uint64_t compute_chunk_mask_neon(const char *ptr)
+{
+    uint8x16_t chunk = vld1q_u8((const unsigned char *)ptr);
+
+    // Trick: c < 32 || c == 34 can be factored as c ^ 2 < 33
+    // https://lemire.me/blog/2025/04/13/detect-control-characters-quotes-and-backslashes-efficiently-using-swar/
+    const uint8x16_t too_low_or_dbl_quote = vcltq_u8(veorq_u8(chunk, vdupq_n_u8(2)), vdupq_n_u8(33));
+
+    uint8x16_t has_backslash = vceqq_u8(chunk, vdupq_n_u8('\\'));
+    uint8x16_t needs_escape  = vorrq_u8(too_low_or_dbl_quote, has_backslash);
+    return neon_match_mask(needs_escape);
+}
+
+ALWAYS_INLINE(static) int string_scan_simd_neon(const char **ptr, const char *end, uint64_t *mask)
+{
+    while (*ptr + sizeof(uint8x16_t) <= end) {
+        uint64_t chunk_mask = compute_chunk_mask_neon(*ptr);
+        if (chunk_mask) {
+            *mask = chunk_mask;
+            return 1;
+        }
+        *ptr += sizeof(uint8x16_t);
+    }
+    return 0;
+}
+
+#endif /* ARM Neon Support.*/
+
+#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
+
+#ifdef HAVE_X86INTRIN_H
+#include <x86intrin.h>
+
+#define HAVE_SIMD 1
+#define HAVE_SIMD_SSE2 1
+
+#ifdef HAVE_CPUID_H
+#define FIND_SIMD_IMPLEMENTATION_DEFINED 1
+
+#if defined(__clang__) || defined(__GNUC__)
+#define TARGET_SSE2 __attribute__((target("sse2")))
+#else
+#define TARGET_SSE2
+#endif
+
+#define _mm_cmpge_epu8(a, b) _mm_cmpeq_epi8(_mm_max_epu8(a, b), a)
+#define _mm_cmple_epu8(a, b) _mm_cmpge_epu8(b, a)
+#define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1))
+#define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a)
+
+ALWAYS_INLINE(static) TARGET_SSE2 int compute_chunk_mask_sse2(const char *ptr)
+{
+    __m128i chunk         = _mm_loadu_si128((__m128i const*)ptr);
+    // Trick: c < 32 || c == 34 can be factored as c ^ 2 < 33
+    // https://lemire.me/blog/2025/04/13/detect-control-characters-quotes-and-backslashes-efficiently-using-swar/
+    __m128i too_low_or_dbl_quote = _mm_cmplt_epu8(_mm_xor_si128(chunk, _mm_set1_epi8(2)), _mm_set1_epi8(33));
+    __m128i has_backslash = _mm_cmpeq_epi8(chunk, _mm_set1_epi8('\\'));
+    __m128i needs_escape  = _mm_or_si128(too_low_or_dbl_quote, has_backslash);
+    return _mm_movemask_epi8(needs_escape);
+}
+
+ALWAYS_INLINE(static) TARGET_SSE2 int string_scan_simd_sse2(const char **ptr, const char *end, int *mask)
+{
+    while (*ptr + sizeof(__m128i) <= end) {
+        int chunk_mask = compute_chunk_mask_sse2(*ptr);
+        if (chunk_mask) {
+            *mask = chunk_mask;
+            return 1;
+        }
+        *ptr += sizeof(__m128i);
+    }
+
+    return 0;
+}
+
+#include <cpuid.h>
+#endif /* HAVE_CPUID_H */
+
+static inline SIMD_Implementation find_simd_implementation(void)
+{
+    // TODO Revisit. I think the SSE version now only uses SSE2 instructions.
+    if (__builtin_cpu_supports("sse2")) {
+        return SIMD_SSE2;
+    }
+
+    return SIMD_NONE;
+}
+
+#endif /* HAVE_X86INTRIN_H */
+#endif /* X86_64 Support */
+
+#endif /* JSON_ENABLE_SIMD */
+
+#ifndef FIND_SIMD_IMPLEMENTATION_DEFINED
+static inline SIMD_Implementation find_simd_implementation(void)
+{
+    return SIMD_NONE;
+}
+#endif