author     Jean Boussier <jean.boussier@gmail.com>  2026-01-16 18:43:08 +0100
committer  git <svn-admin@ruby-lang.org>  2026-01-16 17:52:54 +0000
commit     3164d4e8a26b591c6e22a0deeb750b6f21db1345 (patch)
tree       43f83d05a7ecf6cdf29c72873c3e04612687d698
parent     456ef9140acbdf643c5537ee0f5d67429f2332b6 (diff)
[ruby/json] Extract json_fast_memcpy16 for readability
https://github.com/ruby/json/commit/1b276c8623
-rw-r--r--  ext/json/generator/generator.c  34
-rw-r--r--  ext/json/json.h                  4
-rw-r--r--  ext/json/simd/simd.h            27
3 files changed, 37 insertions, 28 deletions
diff --git a/ext/json/generator/generator.c b/ext/json/generator/generator.c
index f17a2a729c..dbba99c455 100644
--- a/ext/json/generator/generator.c
+++ b/ext/json/generator/generator.c
@@ -288,6 +288,8 @@ static inline void escape_UTF8_char(search_state *search, unsigned char ch_len)
ALWAYS_INLINE(static) char *copy_remaining_bytes(search_state *search, unsigned long vec_len, unsigned long len)
{
+ RBIMPL_ASSERT_OR_ASSUME(len < vec_len);
+
// Flush the buffer so everything up until the last 'len' characters are unflushed.
search_flush(search);
@@ -303,37 +305,13 @@ ALWAYS_INLINE(static) char *copy_remaining_bytes(search_state *search, unsigned
// Optimistically copy the remaining 'len' characters to the output FBuffer. If there are no characters
// to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage.
-#if defined(__has_builtin) && __has_builtin(__builtin_memcpy)
-
-#ifdef RBIMPL_ASSERT_OR_ASSUME
- RBIMPL_ASSERT_OR_ASSUME(len < 16);
-#endif
-
- if (vec_len == 16 && len >= 4) {
- // If __builtin_memcpy is available, use it to copy between SIMD_MINIMUM_THRESHOLD and vec_len-1 bytes.
- // These copies overlap. The first copy will copy the first 8 (or 4) bytes. The second copy will copy
- // the last 8 (or 4) bytes but overlap with the first copy. The overlapping bytes will be in the correct
- // position in both copies.
-
- // Please do not attempt to replace __builtin_memcpy with memcpy without profiling and/or looking at the
- // generated assembly. On clang-specifically (tested on Apple clang version 17.0.0 (clang-1700.0.13.3)),
- // when using memcpy, the compiler will notice the only difference is a 4 or 8 and generate a conditional
- // select instruction instead of direct loads and stores with a branch. This ends up slower than the branch
- // plus two loads and stores generated when using __builtin_memcpy.
- if (len >= 8) {
- __builtin_memcpy(s, search->ptr, 8);
- __builtin_memcpy(s + len - 8, search->ptr + len - 8, 8);
- } else {
- __builtin_memcpy(s, search->ptr, 4);
- __builtin_memcpy(s + len - 4, search->ptr + len - 4, 4);
- }
+ if (vec_len == 16) {
+ RBIMPL_ASSERT_OR_ASSUME(len >= SIMD_MINIMUM_THRESHOLD);
+ json_fast_memcpy16(s, search->ptr, len);
} else {
MEMCPY(s, search->ptr, char, len);
}
-#else
- MEMCPY(s, search->ptr, char, len);
-#endif
-
+
return s;
}
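
Not part of the commit: a minimal standalone sketch of the "optimistic copy" idea described in the comment above. The tail is written into the output buffer before we know whether it needs escaping, and the copied bytes are simply treated as scratch when an escape turns out to be required. All names below are hypothetical, not the json extension's API.

#include <stddef.h>
#include <string.h>

/* Hypothetical stand-in for the extension's output buffer. */
typedef struct { char data[256]; size_t len; } out_buf;

static size_t append_tail_optimistic(out_buf *out, const char *tail, size_t len)
{
    /* Optimistically copy the tail; if nothing needs escaping it is
     * already in the right place. */
    memcpy(out->data + out->len, tail, len);

    for (size_t i = 0; i < len; i++) {
        if (tail[i] == '"' || tail[i] == '\\') {
            /* An escape is needed: the bytes just copied become scratch
             * and the (omitted) escape path rewrites them. */
            return out->len;
        }
    }

    out->len += len;   /* Fast path: keep the optimistic copy as-is. */
    return out->len;
}
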
diff --git a/ext/json/json.h b/ext/json/json.h
index 28efa04c25..9379d7ae7f 100644
--- a/ext/json/json.h
+++ b/ext/json/json.h
@@ -5,6 +5,10 @@
#include "ruby/encoding.h"
#include <stdint.h>
+#ifndef RBIMPL_ASSERT_OR_ASSUME
+# define RBIMPL_ASSERT_OR_ASSUME(x)
+#endif
+
#if defined(RUBY_DEBUG) && RUBY_DEBUG
# define JSON_ASSERT RUBY_ASSERT
#else
diff --git a/ext/json/simd/simd.h b/ext/json/simd/simd.h
index 84f6135ad1..3bb86acdec 100644
--- a/ext/json/simd/simd.h
+++ b/ext/json/simd/simd.h
@@ -60,6 +60,33 @@ static inline int trailing_zeros(int input)
#define SIMD_MINIMUM_THRESHOLD 4
+ALWAYS_INLINE(static) void json_fast_memcpy16(char *dst, const char *src, size_t len)
+{
+ RBIMPL_ASSERT_OR_ASSUME(len < 16);
+ RBIMPL_ASSERT_OR_ASSUME(len >= SIMD_MINIMUM_THRESHOLD); // 4
+#if defined(__has_builtin) && __has_builtin(__builtin_memcpy)
+ // If __builtin_memcpy is available, use it to copy between SIMD_MINIMUM_THRESHOLD (4) and vec_len-1 (15) bytes.
+ // These copies overlap. The first copy will copy the first 8 (or 4) bytes. The second copy will copy
+ // the last 8 (or 4) bytes but overlap with the first copy. The overlapping bytes will be in the correct
+ // position in both copies.
+
+ // Please do not attempt to replace __builtin_memcpy with memcpy without profiling and/or looking at the
+ // generated assembly. On clang specifically (tested on Apple clang version 17.0.0 (clang-1700.0.13.3)),
+ // when using memcpy, the compiler will notice the only difference is a 4 or 8 and generate a conditional
+ // select instruction instead of direct loads and stores with a branch. This ends up slower than the branch
+ // plus two loads and stores generated when using __builtin_memcpy.
+ if (len >= 8) {
+ __builtin_memcpy(dst, src, 8);
+ __builtin_memcpy(dst + len - 8, src + len - 8, 8);
+ } else {
+ __builtin_memcpy(dst, src, 4);
+ __builtin_memcpy(dst + len - 4, src + len - 4, 4);
+ }
+#else
+ MEMCPY(dst, src, char, len);
+#endif
+}
+
#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
#include <arm_neon.h>
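
Not part of the commit: a quick standalone check of the overlapping-copy pattern json_fast_memcpy16 relies on. Plain memcpy stands in for __builtin_memcpy and the harness is a made-up example, but the index arithmetic matches the diff: the second copy starts at len - 8 (or len - 4), so the overlapping middle bytes are written twice with identical values and every length in [SIMD_MINIMUM_THRESHOLD, 16) is covered.

#include <assert.h>
#include <stdio.h>
#include <string.h>

static void fast_memcpy16(char *dst, const char *src, size_t len)
{
    /* Precondition from the commit: 4 <= len < 16. */
    if (len >= 8) {
        memcpy(dst, src, 8);                       /* first 8 bytes        */
        memcpy(dst + len - 8, src + len - 8, 8);   /* last 8, may overlap  */
    } else {
        memcpy(dst, src, 4);                       /* first 4 bytes        */
        memcpy(dst + len - 4, src + len - 4, 4);   /* last 4, may overlap  */
    }
}

int main(void)
{
    const char src[] = "0123456789abcdef";
    for (size_t len = 4; len < 16; len++) {
        char dst[16] = {0};
        fast_memcpy16(dst, src, len);
        assert(memcmp(dst, src, len) == 0);        /* overlap is harmless  */
    }
    puts("all tail lengths 4..15 copied correctly");
    return 0;
}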