1 files changed, 42 insertions, 22 deletions
diff --git a/ext/json/simd/simd.h b/ext/json/simd/simd.h
index 3abbdb0209..611b41b066 100644
--- a/ext/json/simd/simd.h
+++ b/ext/json/simd/simd.h
@@ -1,10 +1,14 @@
+#include "../json.h"
+
 typedef enum {
     SIMD_NONE,
     SIMD_NEON,
     SIMD_SSE2
 } SIMD_Implementation;
 
-#ifdef JSON_ENABLE_SIMD
+#ifndef __has_builtin         // Optional of course.
+  #define __has_builtin(x) 0  // Compatibility with non-clang compilers.
+#endif
 
 #ifdef __clang__
 # if __has_builtin(__builtin_ctzll)
@@ -20,6 +24,8 @@ typedef enum {
 
 static inline uint32_t trailing_zeros64(uint64_t input)
 {
+    JSON_ASSERT(input > 0); // __builtin_ctz(0) is undefined behavior
+
 #if HAVE_BUILTIN_CTZLL
     return __builtin_ctzll(input);
 #else
@@ -35,6 +41,8 @@ static inline uint32_t trailing_zeros64(uint64_t input)
 
 static inline int trailing_zeros(int input)
 {
+    JSON_ASSERT(input > 0); // __builtin_ctz(0) is undefined behavior
+
 #if HAVE_BUILTIN_CTZLL
     return __builtin_ctz(input);
 #else
@@ -48,14 +56,36 @@ static inline int trailing_zeros(int input)
 #endif
 }
 
-#if (defined(__GNUC__ ) || defined(__clang__))
-#define FORCE_INLINE __attribute__((always_inline))
-#else
-#define FORCE_INLINE
-#endif
+#ifdef JSON_ENABLE_SIMD
 
+#define SIMD_MINIMUM_THRESHOLD 4
 
-#define SIMD_MINIMUM_THRESHOLD 6
+ALWAYS_INLINE(static) void json_fast_memcpy16(char *dst, const char *src, size_t len)
+{
+    RBIMPL_ASSERT_OR_ASSUME(len < 16);
+    RBIMPL_ASSERT_OR_ASSUME(len >= SIMD_MINIMUM_THRESHOLD); // 4
+#if defined(__has_builtin) && __has_builtin(__builtin_memcpy)
+    // If __builtin_memcpy is available, use it to copy between SIMD_MINIMUM_THRESHOLD (4) and vec_len-1 (15) bytes.
+    // These copies overlap. The first copy will copy the first 8 (or 4) bytes. The second copy will copy
+    // the last 8 (or 4) bytes but overlap with the first copy. The overlapping bytes will be in the correct
+    // position in both copies.
+
+    // Please do not attempt to replace __builtin_memcpy with memcpy without profiling and/or looking at the
+    // generated assembly. On clang-specifically (tested on Apple clang version 17.0.0 (clang-1700.0.13.3)),
+    // when using memcpy, the compiler will notice the only difference is a 4 or 8 and generate a conditional
+    // select instruction instead of direct loads and stores with a branch. This ends up slower than the branch
+    // plus two loads and stores generated when using __builtin_memcpy.
+    if (len >= 8) {
+        __builtin_memcpy(dst, src, 8);
+        __builtin_memcpy(dst + len - 8, src + len - 8, 8);
+    } else {
+        __builtin_memcpy(dst, src, 4);
+        __builtin_memcpy(dst + len - 4, src + len - 4, 4);
+    }
+#else
+    MEMCPY(dst, src, char, len);
+#endif
+}
 
 #if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
 #include <arm_neon.h>
@@ -70,14 +100,14 @@ static inline SIMD_Implementation find_simd_implementation(void)
 #define HAVE_SIMD_NEON 1
 
 // See: https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
-static inline FORCE_INLINE uint64_t neon_match_mask(uint8x16_t matches)
+ALWAYS_INLINE(static) uint64_t neon_match_mask(uint8x16_t matches)
 {
     const uint8x8_t res = vshrn_n_u16(vreinterpretq_u16_u8(matches), 4);
     const uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(res), 0);
     return mask & 0x8888888888888888ull;
 }
 
-static inline FORCE_INLINE uint64_t compute_chunk_mask_neon(const char *ptr)
+ALWAYS_INLINE(static) uint64_t compute_chunk_mask_neon(const char *ptr)
 {
     uint8x16_t chunk = vld1q_u8((const unsigned char *)ptr);
 
@@ -90,7 +120,7 @@ static inline FORCE_INLINE uint64_t compute_chunk_mask_neon(const char *ptr)
     return neon_match_mask(needs_escape);
 }
 
-static inline FORCE_INLINE int string_scan_simd_neon(const char **ptr, const char *end, uint64_t *mask)
+ALWAYS_INLINE(static) int string_scan_simd_neon(const char **ptr, const char *end, uint64_t *mask)
 {
     while (*ptr + sizeof(uint8x16_t) <= end) {
         uint64_t chunk_mask = compute_chunk_mask_neon(*ptr);
@@ -103,16 +133,6 @@ static inline FORCE_INLINE int string_scan_simd_neon(const char **ptr, const cha
     return 0;
 }
 
-static inline uint8x16x4_t load_uint8x16_4(const unsigned char *table)
-{
-    uint8x16x4_t tab;
-    tab.val[0] = vld1q_u8(table);
-    tab.val[1] = vld1q_u8(table+16);
-    tab.val[2] = vld1q_u8(table+32);
-    tab.val[3] = vld1q_u8(table+48);
-    return tab;
-}
-
 #endif /* ARM Neon Support.*/
 
 #if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
@@ -137,7 +157,7 @@ static inline uint8x16x4_t load_uint8x16_4(const unsigned char *table)
 #define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1))
 #define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a)
 
-static inline TARGET_SSE2 FORCE_INLINE int compute_chunk_mask_sse2(const char *ptr)
+ALWAYS_INLINE(static) TARGET_SSE2 int compute_chunk_mask_sse2(const char *ptr)
 {
     __m128i chunk         = _mm_loadu_si128((__m128i const*)ptr);
     // Trick: c < 32 || c == 34 can be factored as c ^ 2 < 33
@@ -148,7 +168,7 @@ static inline TARGET_SSE2 FORCE_INLINE int compute_chunk_mask_sse2(const char *p
     return _mm_movemask_epi8(needs_escape);
 }
 
-static inline TARGET_SSE2 FORCE_INLINE int string_scan_simd_sse2(const char **ptr, const char *end, int *mask)
+ALWAYS_INLINE(static) TARGET_SSE2 int string_scan_simd_sse2(const char **ptr, const char *end, int *mask)
 {
     while (*ptr + sizeof(__m128i) <= end) {
         int chunk_mask = compute_chunk_mask_sse2(*ptr);