summaryrefslogtreecommitdiff
path: root/ext/json/simd/simd.h
blob: f8e5ee1880e5b119b119855fba8cbadf55c3b755 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#include "../json.h"

typedef enum {
    SIMD_NONE,
    SIMD_NEON,
    SIMD_SSE2
} SIMD_Implementation;

#ifndef __has_builtin         // Optional of course.
  #define __has_builtin(x) 0  // Compatibility with non-clang compilers.
#endif

#ifdef __clang__
# if __has_builtin(__builtin_ctzll)
#   define HAVE_BUILTIN_CTZLL 1
# else
#   define HAVE_BUILTIN_CTZLL 0
# endif
#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
# define HAVE_BUILTIN_CTZLL 1
#else
# define HAVE_BUILTIN_CTZLL 0
#endif

static inline uint32_t trailing_zeros64(uint64_t input)
{
    JSON_ASSERT(input > 0); // __builtin_ctz(0) is undefined behavior

#if HAVE_BUILTIN_CTZLL
    return __builtin_ctzll(input);
#else
    uint32_t trailing_zeros = 0;
    uint64_t temp = input;
    while ((temp & 1) == 0 && temp > 0) {
        trailing_zeros++;
        temp >>= 1;
    }
    return trailing_zeros;
#endif
}

static inline int trailing_zeros(int input)
{
    JSON_ASSERT(input > 0); // __builtin_ctz(0) is undefined behavior

#if HAVE_BUILTIN_CTZLL
    return __builtin_ctz(input);
#else
    int trailing_zeros = 0;
    int temp = input;
    while ((temp & 1) == 0 && temp > 0) {
        trailing_zeros++;
        temp >>= 1;
    }
    return trailing_zeros;
#endif
}

#ifdef JSON_ENABLE_SIMD

#define SIMD_MINIMUM_THRESHOLD 6

#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
#include <arm_neon.h>

#define FIND_SIMD_IMPLEMENTATION_DEFINED 1
static inline SIMD_Implementation find_simd_implementation(void)
{
    return SIMD_NEON;
}

#define HAVE_SIMD 1
#define HAVE_SIMD_NEON 1

// See: https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
ALWAYS_INLINE(static) uint64_t neon_match_mask(uint8x16_t matches)
{
    const uint8x8_t res = vshrn_n_u16(vreinterpretq_u16_u8(matches), 4);
    const uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(res), 0);
    return mask & 0x8888888888888888ull;
}

ALWAYS_INLINE(static) uint64_t compute_chunk_mask_neon(const char *ptr)
{
    uint8x16_t chunk = vld1q_u8((const unsigned char *)ptr);

    // Trick: c < 32 || c == 34 can be factored as c ^ 2 < 33
    // https://lemire.me/blog/2025/04/13/detect-control-characters-quotes-and-backslashes-efficiently-using-swar/
    const uint8x16_t too_low_or_dbl_quote = vcltq_u8(veorq_u8(chunk, vdupq_n_u8(2)), vdupq_n_u8(33));

    uint8x16_t has_backslash = vceqq_u8(chunk, vdupq_n_u8('\\'));
    uint8x16_t needs_escape  = vorrq_u8(too_low_or_dbl_quote, has_backslash);
    return neon_match_mask(needs_escape);
}

ALWAYS_INLINE(static) int string_scan_simd_neon(const char **ptr, const char *end, uint64_t *mask)
{
    while (*ptr + sizeof(uint8x16_t) <= end) {
        uint64_t chunk_mask = compute_chunk_mask_neon(*ptr);
        if (chunk_mask) {
            *mask = chunk_mask;
            return 1;
        }
        *ptr += sizeof(uint8x16_t);
    }
    return 0;
}

static inline uint8x16x4_t load_uint8x16_4(const unsigned char *table)
{
    uint8x16x4_t tab;
    tab.val[0] = vld1q_u8(table);
    tab.val[1] = vld1q_u8(table+16);
    tab.val[2] = vld1q_u8(table+32);
    tab.val[3] = vld1q_u8(table+48);
    return tab;
}

#endif /* ARM Neon Support.*/

#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)

#ifdef HAVE_X86INTRIN_H
#include <x86intrin.h>

#define HAVE_SIMD 1
#define HAVE_SIMD_SSE2 1

#ifdef HAVE_CPUID_H
#define FIND_SIMD_IMPLEMENTATION_DEFINED 1

#if defined(__clang__) || defined(__GNUC__)
#define TARGET_SSE2 __attribute__((target("sse2")))
#else
#define TARGET_SSE2
#endif

#define _mm_cmpge_epu8(a, b) _mm_cmpeq_epi8(_mm_max_epu8(a, b), a)
#define _mm_cmple_epu8(a, b) _mm_cmpge_epu8(b, a)
#define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1))
#define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a)

ALWAYS_INLINE(static) TARGET_SSE2 int compute_chunk_mask_sse2(const char *ptr)
{
    __m128i chunk         = _mm_loadu_si128((__m128i const*)ptr);
    // Trick: c < 32 || c == 34 can be factored as c ^ 2 < 33
    // https://lemire.me/blog/2025/04/13/detect-control-characters-quotes-and-backslashes-efficiently-using-swar/
    __m128i too_low_or_dbl_quote = _mm_cmplt_epu8(_mm_xor_si128(chunk, _mm_set1_epi8(2)), _mm_set1_epi8(33));
    __m128i has_backslash = _mm_cmpeq_epi8(chunk, _mm_set1_epi8('\\'));
    __m128i needs_escape  = _mm_or_si128(too_low_or_dbl_quote, has_backslash);
    return _mm_movemask_epi8(needs_escape);
}

ALWAYS_INLINE(static) TARGET_SSE2 int string_scan_simd_sse2(const char **ptr, const char *end, int *mask)
{
    while (*ptr + sizeof(__m128i) <= end) {
        int chunk_mask = compute_chunk_mask_sse2(*ptr);
        if (chunk_mask) {
            *mask = chunk_mask;
            return 1;
        }
        *ptr += sizeof(__m128i);
    }

    return 0;
}

#include <cpuid.h>
#endif /* HAVE_CPUID_H */

static inline SIMD_Implementation find_simd_implementation(void)
{
    // TODO Revisit. I think the SSE version now only uses SSE2 instructions.
    if (__builtin_cpu_supports("sse2")) {
        return SIMD_SSE2;
    }

    return SIMD_NONE;
}

#endif /* HAVE_X86INTRIN_H */
#endif /* X86_64 Support */

#endif /* JSON_ENABLE_SIMD */

#ifndef FIND_SIMD_IMPLEMENTATION_DEFINED
static inline SIMD_Implementation find_simd_implementation(void)
{
    return SIMD_NONE;
}
#endif