diff options
Diffstat (limited to 'ext/json/parser/parser.c')
| -rw-r--r-- | ext/json/parser/parser.c | 2068 |
1 files changed, 1310 insertions, 758 deletions
diff --git a/ext/json/parser/parser.c b/ext/json/parser/parser.c index 0a1d937575..eb7dc9aa82 100644 --- a/ext/json/parser/parser.c +++ b/ext/json/parser/parser.c @@ -1,61 +1,22 @@ -#include "ruby.h" -#include "ruby/encoding.h" - -/* shims */ -/* This is the fallback definition from Ruby 3.4 */ - -#ifndef RBIMPL_STDBOOL_H -#if defined(__cplusplus) -# if defined(HAVE_STDBOOL_H) && (__cplusplus >= 201103L) -# include <cstdbool> -# endif -#elif defined(HAVE_STDBOOL_H) -# include <stdbool.h> -#elif !defined(HAVE__BOOL) -typedef unsigned char _Bool; -# define bool _Bool -# define true ((_Bool)+1) -# define false ((_Bool)+0) -# define __bool_true_false_are_defined -#endif -#endif - -#ifndef RB_UNLIKELY -#define RB_UNLIKELY(expr) expr -#endif - -#ifndef RB_LIKELY -#define RB_LIKELY(expr) expr -#endif +#include "../json.h" +#include "../vendor/ryu.h" +#include "../simd/simd.h" static VALUE mJSON, eNestingError, Encoding_UTF_8; static VALUE CNaN, CInfinity, CMinusInfinity; -static ID i_json_creatable_p, i_json_create, i_create_id, - i_chr, i_deep_const_get, i_match, i_aset, i_aref, - i_leftshift, i_new, i_try_convert, i_uminus, i_encode; +static ID i_new, i_try_convert, i_uminus, i_encode; -static VALUE sym_max_nesting, sym_allow_nan, sym_allow_trailing_comma, sym_symbolize_names, sym_freeze, - sym_create_additions, sym_create_id, sym_object_class, sym_array_class, - sym_decimal_class, sym_match_string; +static VALUE sym_max_nesting, sym_allow_nan, sym_allow_trailing_comma, sym_allow_control_characters, + sym_allow_invalid_escape, sym_symbolize_names, sym_freeze, sym_decimal_class, sym_on_load, + sym_allow_duplicate_key; static int binary_encindex; static int utf8_encindex; -#ifdef HAVE_RB_CATEGORY_WARN -# define json_deprecated(message) rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, message) -#else -# define json_deprecated(message) rb_warn(message) -#endif - -static const char deprecated_create_additions_warning[] = - "JSON.load implicit support for `create_additions: true` is deprecated " - "and will be removed in 3.0, use JSON.unsafe_load or explicitly " - "pass `create_additions: true`"; - #ifndef HAVE_RB_HASH_BULK_INSERT // For TruffleRuby -void +static void rb_hash_bulk_insert(long count, const VALUE *pairs, VALUE hash) { long index = 0; @@ -72,6 +33,12 @@ rb_hash_bulk_insert(long count, const VALUE *pairs, VALUE hash) #define rb_hash_new_capa(n) rb_hash_new() #endif +#ifndef HAVE_RB_STR_TO_INTERNED_STR +static VALUE rb_str_to_interned_str(VALUE str) +{ + return rb_funcall(rb_str_freeze(str), i_uminus, 0); +} +#endif /* name cache */ @@ -117,116 +84,104 @@ static void rvalue_cache_insert_at(rvalue_cache *cache, int index, VALUE rstring cache->entries[index] = rstring; } -static inline int rstring_cache_cmp(const char *str, const long length, VALUE rstring) +#define rstring_cache_memcmp memcmp + +#if JSON_CPU_LITTLE_ENDIAN_64BITS +#if __has_builtin(__builtin_bswap64) +#undef rstring_cache_memcmp +ALWAYS_INLINE(static) int rstring_cache_memcmp(const char *str, const char *rptr, const long length) { - long rstring_length = RSTRING_LEN(rstring); - if (length == rstring_length) { - return memcmp(str, RSTRING_PTR(rstring), length); - } else { - return (int)(length - rstring_length); + // The libc memcmp has numerous complex optimizations, but in this particular case, + // we know the string is small (JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH), so being able to + // inline a simpler memcmp outperforms calling the libc version. + long i = 0; + + for (; i + 8 <= length; i += 8) { + uint64_t a, b; + memcpy(&a, str + i, 8); + memcpy(&b, rptr + i, 8); + if (a != b) { + a = __builtin_bswap64(a); + b = __builtin_bswap64(b); + return (a < b) ? -1 : 1; + } + } + + for (; i < length; i++) { + if (str[i] != rptr[i]) { + return (str[i] < rptr[i]) ? -1 : 1; + } } + + return 0; } +#endif +#endif -static VALUE rstring_cache_fetch(rvalue_cache *cache, const char *str, const long length) +ALWAYS_INLINE(static) int rstring_cache_cmp(const char *str, const long length, VALUE rstring) { - if (RB_UNLIKELY(length > JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH)) { - // Common names aren't likely to be very long. So we just don't - // cache names above an arbitrary threshold. - return Qfalse; - } + const char *rstring_ptr; + long rstring_length; + + RSTRING_GETMEM(rstring, rstring_ptr, rstring_length); - if (RB_UNLIKELY(!isalpha((unsigned char)str[0]))) { - // Simple heuristic, if the first character isn't a letter, - // we're much less likely to see this string again. - // We mostly want to cache strings that are likely to be repeated. - return Qfalse; + if (length == rstring_length) { + return rstring_cache_memcmp(str, rstring_ptr, length); + } else { + return (int)(length - rstring_length); } +} +ALWAYS_INLINE(static) VALUE rstring_cache_fetch(rvalue_cache *cache, const char *str, const long length) +{ int low = 0; int high = cache->length - 1; - int mid = 0; - int last_cmp = 0; while (low <= high) { - mid = (high + low) >> 1; + int mid = (high + low) >> 1; VALUE entry = cache->entries[mid]; - last_cmp = rstring_cache_cmp(str, length, entry); + int cmp = rstring_cache_cmp(str, length, entry); - if (last_cmp == 0) { + if (cmp == 0) { return entry; - } else if (last_cmp > 0) { + } else if (cmp > 0) { low = mid + 1; } else { high = mid - 1; } } - if (RB_UNLIKELY(memchr(str, '\\', length))) { - // We assume the overwhelming majority of names don't need to be escaped. - // But if they do, we have to fallback to the slow path. - return Qfalse; - } - VALUE rstring = build_interned_string(str, length); if (cache->length < JSON_RVALUE_CACHE_CAPA) { - if (last_cmp > 0) { - mid += 1; - } - - rvalue_cache_insert_at(cache, mid, rstring); + rvalue_cache_insert_at(cache, low, rstring); } return rstring; } static VALUE rsymbol_cache_fetch(rvalue_cache *cache, const char *str, const long length) { - if (RB_UNLIKELY(length > JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH)) { - // Common names aren't likely to be very long. So we just don't - // cache names above an arbitrary threshold. - return Qfalse; - } - - if (RB_UNLIKELY(!isalpha((unsigned char)str[0]))) { - // Simple heuristic, if the first character isn't a letter, - // we're much less likely to see this string again. - // We mostly want to cache strings that are likely to be repeated. - return Qfalse; - } - int low = 0; int high = cache->length - 1; - int mid = 0; - int last_cmp = 0; while (low <= high) { - mid = (high + low) >> 1; + int mid = (high + low) >> 1; VALUE entry = cache->entries[mid]; - last_cmp = rstring_cache_cmp(str, length, rb_sym2str(entry)); + int cmp = rstring_cache_cmp(str, length, rb_sym2str(entry)); - if (last_cmp == 0) { + if (cmp == 0) { return entry; - } else if (last_cmp > 0) { + } else if (cmp > 0) { low = mid + 1; } else { high = mid - 1; } } - if (RB_UNLIKELY(memchr(str, '\\', length))) { - // We assume the overwhelming majority of names don't need to be escaped. - // But if they do, we have to fallback to the slow path. - return Qfalse; - } - VALUE rsymbol = build_symbol(str, length); if (cache->length < JSON_RVALUE_CACHE_CAPA) { - if (last_cmp > 0) { - mid += 1; - } - - rvalue_cache_insert_at(cache, mid, rsymbol); + rvalue_cache_insert_at(cache, low, rsymbol); } return rsymbol; } @@ -256,7 +211,7 @@ static rvalue_stack *rvalue_stack_grow(rvalue_stack *stack, VALUE *handle, rvalu if (stack->type == RVALUE_STACK_STACK_ALLOCATED) { stack = rvalue_stack_spill(stack, handle, stack_ref); } else { - REALLOC_N(stack->ptr, VALUE, required); + JSON_SIZED_REALLOC_N(stack->ptr, VALUE, required, stack->capa); stack->capa = required; } return stack; @@ -286,17 +241,27 @@ static void rvalue_stack_mark(void *ptr) { rvalue_stack *stack = (rvalue_stack *)ptr; long index; - for (index = 0; index < stack->head; index++) { - rb_gc_mark(stack->ptr[index]); + if (stack && stack->ptr) { + for (index = 0; index < stack->head; index++) { + rb_gc_mark(stack->ptr[index]); + } } } +static void rvalue_stack_free_buffer(rvalue_stack *stack) +{ + JSON_SIZED_FREE_N(stack->ptr, stack->capa); + stack->ptr = NULL; +} + static void rvalue_stack_free(void *ptr) { rvalue_stack *stack = (rvalue_stack *)ptr; if (stack) { - ruby_xfree(stack->ptr); - ruby_xfree(stack); + rvalue_stack_free_buffer(stack); +#ifndef HAVE_RUBY_TYPED_EMBEDDABLE + JSON_SIZED_FREE(stack); +#endif } } @@ -307,14 +272,13 @@ static size_t rvalue_stack_memsize(const void *ptr) } static const rb_data_type_t JSON_Parser_rvalue_stack_type = { - "JSON::Ext::Parser/rvalue_stack", - { + .wrap_struct_name = "JSON::Ext::Parser/rvalue_stack", + .function = { .dmark = rvalue_stack_mark, .dfree = rvalue_stack_free, .dsize = rvalue_stack_memsize, }, - 0, 0, - RUBY_TYPED_FREE_IMMEDIATELY, + .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_EMBEDDABLE, }; static rvalue_stack *rvalue_stack_spill(rvalue_stack *old_stack, VALUE *handle, rvalue_stack **stack_ref) @@ -336,85 +300,197 @@ static void rvalue_stack_eagerly_release(VALUE handle) if (handle) { rvalue_stack *stack; TypedData_Get_Struct(handle, rvalue_stack, &JSON_Parser_rvalue_stack_type, stack); - RTYPEDDATA_DATA(handle) = NULL; +#ifdef HAVE_RUBY_TYPED_EMBEDDABLE + rvalue_stack_free_buffer(stack); +#else rvalue_stack_free(stack); + RTYPEDDATA_DATA(handle) = NULL; +#endif } } +/* frame stack */ + +// Iterative (non-recursive) parsing keeps an explicit stack of the containers +// currently being built, instead of relying on the C call stack. Each frame +// only needs enough bookkeeping to close its container: which kind it is, the +// rvalue_stack position where its children start (so we know how many to pop), +// and the cursor at its opening brace (used to rewind for duplicate key +// errors). Frames hold no VALUEs, so this stack needs no GC marking; it reuses +// the same stack-allocated-with-heap-spill strategy as the rvalue_stack so that +// it's freed even if parsing raises. +// +// The lifecycle helpers below (grow/push/peek/pop/spill/free/eagerly_release +// and the rb_data_type_t) deliberately mirror their rvalue_stack counterparts +// -- the element type and the absence of a mark function are the only real +// differences. Keep the two in sync: a fix to the spill/release or +// HAVE_RUBY_TYPED_EMBEDDABLE handling in one almost certainly belongs in the +// other. +#define JSON_FRAME_STACK_INITIAL_CAPA 32 + +enum json_frame_type { + JSON_FRAME_ROOT, + JSON_FRAME_ARRAY, + JSON_FRAME_OBJECT, +}; + +// Where a frame is within its container's grammar. This is the entirety of the +// parser's "what to do next" state: json_parse_any dispatches on the top +// frame's phase and holds no resume state in C locals, so a parse can stop at +// any value boundary and be resumed purely from the (persistable) frame stack. +enum json_frame_phase { + JSON_PHASE_VALUE, // expecting a value (document root, array element, or object value after ':') + JSON_PHASE_ARRAY_COMMA, // after a value: expecting ',' or the closing ']' + JSON_PHASE_OBJECT_KEY, // expecting a '"' key (after '{' or ',') + JSON_PHASE_OBJECT_COMMA, // after a value: expecting ',' or the closing '}' + JSON_PHASE_OBJECT_COLON, // object only: after a key, expecting ':' + JSON_PHASE_DONE, // root only: the document value has been parsed +}; + +typedef struct json_frame_struct { + enum json_frame_type type; + enum json_frame_phase phase; + long value_stack_head; // rvalue_stack->head when this container opened + const char *start_cursor; // object frames only (the '{'); NULL otherwise +} json_frame; + +typedef struct json_frame_stack_struct { + enum rvalue_stack_type type; // shared with rvalue_stack: is ptr stack- or heap-allocated + long capa; + long head; + json_frame *ptr; +} json_frame_stack; + +enum duplicate_key_action { + JSON_DEPRECATED = 0, + JSON_IGNORE, + JSON_RAISE, +}; + +typedef struct JSON_ParserStruct { + VALUE on_load_proc; + VALUE decimal_class; + ID decimal_method_id; + enum duplicate_key_action on_duplicate_key; + int max_nesting; + bool allow_nan; + bool allow_trailing_comma; + bool allow_control_characters; + bool allow_invalid_escape; + bool symbolize_names; + bool freeze; +} JSON_ParserConfig; + +typedef struct JSON_ParserStateStruct { + VALUE *value_stack_handle; + VALUE *frame_stack_handle; + const char *start; + const char *cursor; + const char *end; + rvalue_stack *value_stack; + json_frame_stack *frames; + rvalue_cache name_cache; + int in_array; + int current_nesting; + unsigned int emitted_deprecations; +} JSON_ParserState; + +static json_frame_stack *json_frame_stack_spill(json_frame_stack *old_stack, VALUE *handle, json_frame_stack **stack_ref); -#ifndef HAVE_STRNLEN -static size_t strnlen(const char *s, size_t maxlen) +static json_frame_stack *json_frame_stack_grow(json_frame_stack *stack, VALUE *handle, json_frame_stack **stack_ref) { - char *p; - return ((p = memchr(s, '\0', maxlen)) ? p - s : maxlen); + long required = stack->capa * 2; + + if (stack->type == RVALUE_STACK_STACK_ALLOCATED) { + stack = json_frame_stack_spill(stack, handle, stack_ref); + } else { + JSON_SIZED_REALLOC_N(stack->ptr, json_frame, required, stack->capa); + stack->capa = required; + } + return stack; } -#endif -#define PARSE_ERROR_FRAGMENT_LEN 32 -#ifdef RBIMPL_ATTR_NORETURN -RBIMPL_ATTR_NORETURN() -#endif -static void raise_parse_error(const char *format, const char *start) +static json_frame *json_frame_stack_push(JSON_ParserState *state, json_frame frame) { - unsigned char buffer[PARSE_ERROR_FRAGMENT_LEN + 1]; + json_frame_stack *stack = state->frames; + if (RB_UNLIKELY(stack->head >= stack->capa)) { + stack = json_frame_stack_grow(stack, state->frame_stack_handle, &state->frames); + } - size_t len = start ? strnlen(start, PARSE_ERROR_FRAGMENT_LEN) : 0; - const char *ptr = start; + json_frame *frame_ptr = &stack->ptr[stack->head++]; + *frame_ptr = frame; + return frame_ptr; +} - if (len == PARSE_ERROR_FRAGMENT_LEN) { - MEMCPY(buffer, start, char, PARSE_ERROR_FRAGMENT_LEN); +static inline json_frame *json_frame_stack_peek(json_frame_stack *stack) +{ + return &stack->ptr[stack->head - 1]; +} - while (buffer[len - 1] >= 0x80 && buffer[len - 1] < 0xC0) { // Is continuation byte - len--; - } +static inline void json_frame_stack_pop(json_frame_stack *stack) +{ + stack->head--; +} - if (buffer[len - 1] >= 0xC0) { // multibyte character start - len--; - } +static void json_frame_stack_free_buffer(json_frame_stack *stack) +{ + JSON_SIZED_FREE_N(stack->ptr, stack->capa); + stack->ptr = NULL; +} - buffer[len] = '\0'; - ptr = (const char *)buffer; +static void json_frame_stack_free(void *ptr) +{ + json_frame_stack *stack = (json_frame_stack *)ptr; + if (stack) { + json_frame_stack_free_buffer(stack); +#ifndef HAVE_RUBY_TYPED_EMBEDDABLE + JSON_SIZED_FREE(stack); +#endif } - - rb_enc_raise(enc_utf8, rb_path2class("JSON::ParserError"), format, ptr); } -/* unicode */ +static size_t json_frame_stack_memsize(const void *ptr) +{ + const json_frame_stack *stack = (const json_frame_stack *)ptr; + return sizeof(json_frame_stack) + sizeof(json_frame) * stack->capa; +} -static const signed char digit_values[256] = { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, - -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1 +static const rb_data_type_t JSON_Parser_frame_stack_type = { + .wrap_struct_name = "JSON::Ext::Parser/frame_stack", + .function = { + .dmark = NULL, + .dfree = json_frame_stack_free, + .dsize = json_frame_stack_memsize, + }, + .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_EMBEDDABLE, }; -static uint32_t unescape_unicode(const unsigned char *p) -{ - signed char b; - uint32_t result = 0; - b = digit_values[p[0]]; - if (b < 0) raise_parse_error("incomplete unicode character escape sequence at '%s'", (char *)p - 2); - result = (result << 4) | (unsigned char)b; - b = digit_values[p[1]]; - if (b < 0) raise_parse_error("incomplete unicode character escape sequence at '%s'", (char *)p - 2); - result = (result << 4) | (unsigned char)b; - b = digit_values[p[2]]; - if (b < 0) raise_parse_error("incomplete unicode character escape sequence at '%s'", (char *)p - 2); - result = (result << 4) | (unsigned char)b; - b = digit_values[p[3]]; - if (b < 0) raise_parse_error("incomplete unicode character escape sequence at '%s'", (char *)p - 2); - result = (result << 4) | (unsigned char)b; - return result; +static json_frame_stack *json_frame_stack_spill(json_frame_stack *old_stack, VALUE *handle, json_frame_stack **stack_ref) +{ + json_frame_stack *stack; + *handle = TypedData_Make_Struct(0, json_frame_stack, &JSON_Parser_frame_stack_type, stack); + *stack_ref = stack; + MEMCPY(stack, old_stack, json_frame_stack, 1); + + stack->capa = old_stack->capa << 1; + stack->ptr = ALLOC_N(json_frame, stack->capa); + stack->type = RVALUE_STACK_HEAP_ALLOCATED; + MEMCPY(stack->ptr, old_stack->ptr, json_frame, old_stack->head); + return stack; +} + +static void json_frame_stack_eagerly_release(VALUE handle) +{ + if (handle) { + json_frame_stack *stack; + TypedData_Get_Struct(handle, json_frame_stack, &JSON_Parser_frame_stack_type, stack); +#ifdef HAVE_RUBY_TYPED_EMBEDDABLE + json_frame_stack_free_buffer(stack); +#else + json_frame_stack_free(stack); + RTYPEDDATA_DATA(handle) = NULL; +#endif + } } static int convert_UTF32_to_UTF8(char *buf, uint32_t ch) @@ -443,32 +519,161 @@ static int convert_UTF32_to_UTF8(char *buf, uint32_t ch) return len; } -typedef struct JSON_ParserStruct { - VALUE create_id; - VALUE object_class; - VALUE array_class; - VALUE decimal_class; - ID decimal_method_id; - VALUE match_string; - int max_nesting; - bool allow_nan; - bool allow_trailing_comma; - bool parsing_name; - bool symbolize_names; - bool freeze; - bool create_additions; - bool deprecated_create_additions; -} JSON_ParserConfig; +static inline size_t rest(JSON_ParserState *state) { + return state->end - state->cursor; +} -typedef struct JSON_ParserStateStruct { - VALUE stack_handle; - const char *cursor; - const char *end; - rvalue_stack *stack; - rvalue_cache name_cache; - int in_array; - int current_nesting; -} JSON_ParserState; +static inline bool eos(JSON_ParserState *state) { + return state->cursor >= state->end; +} + +static inline char peek(JSON_ParserState *state) +{ + if (RB_UNLIKELY(eos(state))) { + return 0; + } + return *state->cursor; +} + +static void cursor_position(JSON_ParserState *state, long *line_out, long *column_out) +{ + JSON_ASSERT(state->cursor <= state->end); + + // Redundant but helpful for hardening + if (RB_UNLIKELY(state->cursor > state->end)) { + state->cursor = state->end; + } + + const char *cursor = state->cursor; + long column = 0; + long line = 1; + + while (cursor >= state->start) { + if (*cursor-- == '\n') { + break; + } + column++; + } + + while (cursor >= state->start) { + if (*cursor-- == '\n') { + line++; + } + } + *line_out = line; + *column_out = column; +} + +static void emit_parse_warning(const char *message, JSON_ParserState *state) +{ + long line, column; + cursor_position(state, &line, &column); + + VALUE warning = rb_sprintf("%s at line %ld column %ld", message, line, column); + rb_funcall(mJSON, rb_intern("deprecation_warning"), 1, warning); +} + +#define PARSE_ERROR_FRAGMENT_LEN 32 + +static VALUE build_parse_error_message(const char *format, JSON_ParserState *state, long line, long column) +{ + unsigned char buffer[PARSE_ERROR_FRAGMENT_LEN + 3]; + + const char *ptr = "EOF"; + if (state->cursor && state->cursor < state->end) { + ptr = state->cursor; + size_t len = 0; + while (len < PARSE_ERROR_FRAGMENT_LEN) { + char ch = ptr[len]; + if (!ch || ch == '\n' || ch == ' ' || ch == '\t' || ch == '\r') { + break; + } + len++; + } + + if (len) { + buffer[0] = '\''; + MEMCPY(buffer + 1, ptr, char, len); + + while (buffer[len] >= 0x80 && buffer[len] < 0xC0) { // Is continuation byte + len--; + } + + if (buffer[len] >= 0xC0) { // multibyte character start + len--; + } + + buffer[len + 1] = '\''; + buffer[len + 2] = '\0'; + ptr = (const char *)buffer; + } + } + + VALUE message = rb_enc_sprintf(enc_utf8, format, ptr); + rb_str_catf(message, " at line %ld column %ld", line, column); + return message; +} + +static VALUE parse_error_new(VALUE message, long line, long column) +{ + VALUE exc = rb_exc_new_str(rb_path2class("JSON::ParserError"), message); + rb_ivar_set(exc, rb_intern("@line"), LONG2NUM(line)); + rb_ivar_set(exc, rb_intern("@column"), LONG2NUM(column)); + return exc; +} + +NORETURN(static) void raise_parse_error(const char *format, JSON_ParserState *state) +{ + long line, column; + cursor_position(state, &line, &column); + VALUE message = build_parse_error_message(format, state, line, column); + rb_exc_raise(parse_error_new(message, line, column)); +} + +NORETURN(static) void raise_parse_error_at(const char *format, JSON_ParserState *state, const char *at) +{ + state->cursor = at; + raise_parse_error(format, state); +} + +/* unicode */ + +static const signed char digit_values[256] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, + -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1 +}; + +static uint32_t unescape_unicode(JSON_ParserState *state, const char *sp, const char *spe) +{ + if (RB_UNLIKELY(sp > spe - 4)) { + raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2); + } + + const unsigned char *p = (const unsigned char *)sp; + + const signed char b0 = digit_values[p[0]]; + const signed char b1 = digit_values[p[1]]; + const signed char b2 = digit_values[p[2]]; + const signed char b3 = digit_values[p[3]]; + + if (RB_UNLIKELY((signed char)(b0 | b1 | b2 | b3) < 0)) { + raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2); + } + + return ((uint32_t)b0 << 12) | ((uint32_t)b1 << 8) | ((uint32_t)b2 << 4) | (uint32_t)b3; +} #define GET_PARSER_CONFIG \ JSON_ParserConfig *config; \ @@ -476,62 +681,82 @@ typedef struct JSON_ParserStateStruct { static const rb_data_type_t JSON_ParserConfig_type; -static const bool whitespace[256] = { - [' '] = 1, - ['\t'] = 1, - ['\n'] = 1, - ['\r'] = 1, - ['/'] = 1, -}; - -static void +NOINLINE(static) void json_eat_comments(JSON_ParserState *state) { - if (state->cursor + 1 < state->end) { - switch(state->cursor[1]) { - case '/': { - state->cursor = memchr(state->cursor, '\n', state->end - state->cursor); - if (!state->cursor) { - state->cursor = state->end; - } else { - state->cursor++; - } - break; + const char *start = state->cursor; + state->cursor++; + + switch (peek(state)) { + case '/': { + state->cursor = memchr(state->cursor, '\n', state->end - state->cursor); + if (!state->cursor) { + state->cursor = state->end; + } else { + state->cursor++; } - case '*': { - state->cursor += 2; - while (true) { - state->cursor = memchr(state->cursor, '*', state->end - state->cursor); - if (!state->cursor) { - state->cursor = state->end; - raise_parse_error("unexpected end of input, expected closing '*/'", state->cursor); - } else { - state->cursor++; - if (state->cursor < state->end && *state->cursor == '/') { - state->cursor++; - break; - } - } + break; + } + case '*': { + state->cursor++; + + while (true) { + const char *next_match = memchr(state->cursor, '*', state->end - state->cursor); + if (!next_match) { + raise_parse_error_at("unterminated comment, expected closing '*/'", state, start); + } + + state->cursor = next_match + 1; + if (peek(state) == '/') { + state->cursor++; + break; } - break; } - default: - raise_parse_error("unexpected token at '%s'", state->cursor); - break; + break; } - } else { - raise_parse_error("unexpected token at '%s'", state->cursor); + default: + raise_parse_error_at("unexpected token %s", state, start); + break; } } -static inline void +ALWAYS_INLINE(static) void json_eat_whitespace(JSON_ParserState *state) { - while (state->cursor < state->end && RB_UNLIKELY(whitespace[(unsigned char)*state->cursor])) { - if (RB_LIKELY(*state->cursor != '/')) { - state->cursor++; - } else { - json_eat_comments(state); + while (true) { + switch (peek(state)) { + case ' ': + state->cursor++; + break; + case '\n': + state->cursor++; + + // Heuristic: if we see a newline, there is likely consecutive spaces after it. +#if JSON_CPU_LITTLE_ENDIAN_64BITS + while (rest(state) > 8) { + uint64_t chunk; + memcpy(&chunk, state->cursor, sizeof(uint64_t)); + if (chunk == 0x2020202020202020) { + state->cursor += 8; + continue; + } + + uint32_t consecutive_spaces = trailing_zeros64(chunk ^ 0x2020202020202020) / CHAR_BIT; + state->cursor += consecutive_spaces; + break; + } +#endif + break; + case '\t': + case '\r': + state->cursor++; + break; + case '/': + json_eat_comments(state); + break; + + default: + return; } } } @@ -562,11 +787,22 @@ static inline VALUE build_string(const char *start, const char *end, bool intern return result; } -static inline VALUE json_string_fastpath(JSON_ParserState *state, const char *string, const char *stringEnd, bool is_name, bool intern, bool symbolize) +static inline bool json_string_cacheable_p(const char *string, size_t length) { + // We mostly want to cache strings that are likely to be repeated. + // Simple heuristics: + // - Common names aren't likely to be very long. So we just don't cache names above an arbitrary threshold. + // - If the first character isn't a letter, we're much less likely to see this string again. + return length <= JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH && rb_isalpha(string[0]); +} + +static inline VALUE json_string_fastpath(JSON_ParserState *state, JSON_ParserConfig *config, const char *string, const char *stringEnd, bool is_name) +{ + bool intern = is_name || config->freeze; + bool symbolize = is_name && config->symbolize_names; size_t bufferSize = stringEnd - string; - if (is_name && state->in_array) { + if (is_name && state->in_array && RB_LIKELY(json_string_cacheable_p(string, bufferSize))) { VALUE cached_key; if (RB_UNLIKELY(symbolize)) { cached_key = rsymbol_cache_fetch(&state->name_cache, string, bufferSize); @@ -582,104 +818,129 @@ static inline VALUE json_string_fastpath(JSON_ParserState *state, const char *st return build_string(string, stringEnd, intern, symbolize); } -static VALUE json_string_unescape(JSON_ParserState *state, const char *string, const char *stringEnd, bool is_name, bool intern, bool symbolize) -{ - size_t bufferSize = stringEnd - string; - const char *p = string, *pe = string, *unescape, *bufferStart; - char *buffer; - int unescape_len; - char buf[4]; +#define JSON_MAX_UNESCAPE_POSITIONS 16 +typedef struct _json_unescape_positions { + long size; + const char **positions; + unsigned long additional_backslashes; +} JSON_UnescapePositions; - if (is_name && state->in_array) { - VALUE cached_key; - if (RB_UNLIKELY(symbolize)) { - cached_key = rsymbol_cache_fetch(&state->name_cache, string, bufferSize); - } else { - cached_key = rstring_cache_fetch(&state->name_cache, string, bufferSize); +static inline const char *json_next_backslash(const char *pe, const char *stringEnd, JSON_UnescapePositions *positions) +{ + while (positions->size) { + positions->size--; + const char *next_position = positions->positions[0]; + positions->positions++; + if (next_position >= pe) { + return next_position; } + } - if (RB_LIKELY(cached_key)) { - return cached_key; - } + if (positions->additional_backslashes) { + positions->additional_backslashes--; + return memchr(pe, '\\', stringEnd - pe); } + return NULL; +} + +NOINLINE(static) VALUE json_string_unescape(JSON_ParserState *state, JSON_ParserConfig *config, const char *string, const char *stringEnd, bool is_name, JSON_UnescapePositions *positions) +{ + bool intern = is_name || config->freeze; + bool symbolize = is_name && config->symbolize_names; + size_t bufferSize = stringEnd - string; + const char *p = string, *pe = string, *bufferStart; + char *buffer; + VALUE result = rb_str_buf_new(bufferSize); rb_enc_associate_index(result, utf8_encindex); buffer = RSTRING_PTR(result); bufferStart = buffer; - while ((pe = memchr(pe, '\\', stringEnd - pe))) { - unescape = (char *) "?"; - unescape_len = 1; +#define APPEND_CHAR(chr) *buffer++ = chr; p = ++pe; + + while (pe < stringEnd && (pe = json_next_backslash(pe, stringEnd, positions))) { if (pe > p) { MEMCPY(buffer, p, char, pe - p); buffer += pe - p; } switch (*++pe) { + case '"': + case '/': + p = pe; // nothing to unescape just need to skip the backslash + break; + case '\\': + APPEND_CHAR('\\'); + break; case 'n': - unescape = (char *) "\n"; + APPEND_CHAR('\n'); break; case 'r': - unescape = (char *) "\r"; + APPEND_CHAR('\r'); break; case 't': - unescape = (char *) "\t"; - break; - case '"': - unescape = (char *) "\""; - break; - case '\\': - unescape = (char *) "\\"; + APPEND_CHAR('\t'); break; case 'b': - unescape = (char *) "\b"; + APPEND_CHAR('\b'); break; case 'f': - unescape = (char *) "\f"; + APPEND_CHAR('\f'); break; - case 'u': - if (pe > stringEnd - 5) { - raise_parse_error("incomplete unicode character escape sequence at '%s'", p); - } else { - uint32_t ch = unescape_unicode((unsigned char *) ++pe); - pe += 3; - /* To handle values above U+FFFF, we take a sequence of - * \uXXXX escapes in the U+D800..U+DBFF then - * U+DC00..U+DFFF ranges, take the low 10 bits from each - * to make a 20-bit number, then add 0x10000 to get the - * final codepoint. - * - * See Unicode 15: 3.8 "Surrogates", 5.3 "Handling - * Surrogate Pairs in UTF-16", and 23.6 "Surrogates - * Area". - */ - if ((ch & 0xFC00) == 0xD800) { - pe++; - if (pe > stringEnd - 6) { - raise_parse_error("incomplete surrogate pair at '%s'", p); - } - if (pe[0] == '\\' && pe[1] == 'u') { - uint32_t sur = unescape_unicode((unsigned char *) pe + 2); - ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16) - | (sur & 0x3FF)); - pe += 5; - } else { - unescape = (char *) "?"; - break; + case 'u': { + uint32_t ch = unescape_unicode(state, ++pe, stringEnd); + pe += 3; + /* To handle values above U+FFFF, we take a sequence of + * \uXXXX escapes in the U+D800..U+DBFF then + * U+DC00..U+DFFF ranges, take the low 10 bits from each + * to make a 20-bit number, then add 0x10000 to get the + * final codepoint. + * + * See Unicode 15: 3.8 "Surrogates", 5.3 "Handling + * Surrogate Pairs in UTF-16", and 23.6 "Surrogates + * Area". + */ + if ((ch & 0xFC00) == 0xD800) { + pe++; + if (RB_LIKELY((pe <= stringEnd - 6) && memcmp(pe, "\\u", 2) == 0)) { + uint32_t sur = unescape_unicode(state, pe + 2, stringEnd); + + if (RB_UNLIKELY((sur & 0xFC00) != 0xDC00)) { + raise_parse_error_at("invalid surrogate pair at %s", state, p); } + + ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16) | (sur & 0x3FF)); + pe += 5; + } else { + raise_parse_error_at("incomplete surrogate pair at %s", state, p); + break; } - unescape_len = convert_UTF32_to_UTF8(buf, ch); - unescape = buf; } + + int unescape_len = convert_UTF32_to_UTF8(buffer, ch); + buffer += unescape_len; + p = ++pe; break; + } default: - p = pe; - continue; + if ((unsigned char)*pe < 0x20) { + if (!config->allow_control_characters) { + if (*pe == '\n') { + raise_parse_error_at("Invalid unescaped newline character (\\n) in string: %s", state, pe - 1); + } + raise_parse_error_at("invalid ASCII control character in string: %s", state, pe - 1); + } + } + + if (config->allow_invalid_escape) { + APPEND_CHAR(*pe); + } else { + raise_parse_error_at("invalid escape character in string: %s", state, pe - 1); + } + break; } - MEMCPY(buffer, unescape, char, unescape_len); - buffer += unescape_len; - p = ++pe; } +#undef APPEND_CHAR if (stringEnd > p) { MEMCPY(buffer, p, char, stringEnd - p); @@ -690,98 +951,99 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c if (symbolize) { result = rb_str_intern(result); } else if (intern) { - result = rb_funcall(rb_str_freeze(result), i_uminus, 0); + result = rb_str_to_interned_str(result); } return result; } #define MAX_FAST_INTEGER_SIZE 18 -static inline VALUE fast_decode_integer(const char *p, const char *pe) -{ - bool negative = false; - if (*p == '-') { - negative = true; - p++; - } +#define MAX_NUMBER_STACK_BUFFER 128 - long long memo = 0; - while (p < pe) { - memo *= 10; - memo += *p - '0'; - p++; - } +typedef VALUE (*json_number_decode_func_t)(const char *ptr); - if (negative) { - memo = -memo; +static inline VALUE json_decode_large_number(const char *start, long len, json_number_decode_func_t func) +{ + if (RB_LIKELY(len < MAX_NUMBER_STACK_BUFFER)) { + char buffer[MAX_NUMBER_STACK_BUFFER]; + MEMCPY(buffer, start, char, len); + buffer[len] = '\0'; + return func(buffer); + } else { + VALUE buffer_v = rb_str_tmp_new(len); + char *buffer = RSTRING_PTR(buffer_v); + MEMCPY(buffer, start, char, len); + buffer[len] = '\0'; + VALUE number = func(buffer); + RB_GC_GUARD(buffer_v); + return number; } - return LL2NUM(memo); } -static VALUE json_decode_large_integer(const char *start, long len) +static VALUE json_decode_inum(const char *buffer) { - VALUE buffer_v; - char *buffer = RB_ALLOCV_N(char, buffer_v, len + 1); - MEMCPY(buffer, start, char, len); - buffer[len] = '\0'; - VALUE number = rb_cstr2inum(buffer, 10); - RB_ALLOCV_END(buffer_v); - return number; + return rb_cstr2inum(buffer, 10); } -static inline VALUE -json_decode_integer(const char *start, const char *end) +NOINLINE(static) VALUE json_decode_large_integer(const char *start, long len) { - long len = end - start; - if (RB_LIKELY(len < MAX_FAST_INTEGER_SIZE)) { - return fast_decode_integer(start, end); + return json_decode_large_number(start, len, json_decode_inum); +} + +static inline VALUE json_decode_integer(uint64_t mantissa, int mantissa_digits, bool negative, const char *start, const char *end) +{ + if (RB_LIKELY(mantissa_digits < MAX_FAST_INTEGER_SIZE)) { + if (negative) { + return INT64T2NUM(-((int64_t)mantissa)); } - return json_decode_large_integer(start, len); + return UINT64T2NUM(mantissa); + } + + return json_decode_large_integer(start, end - start); } -static VALUE json_decode_large_float(const char *start, long len) +static VALUE json_decode_dnum(const char *buffer) { - VALUE buffer_v; - char *buffer = RB_ALLOCV_N(char, buffer_v, len + 1); - MEMCPY(buffer, start, char, len); - buffer[len] = '\0'; - VALUE number = DBL2NUM(rb_cstr_to_dbl(buffer, 1)); - RB_ALLOCV_END(buffer_v); - return number; + return DBL2NUM(rb_cstr_to_dbl(buffer, 1)); } -static VALUE json_decode_float(JSON_ParserConfig *config, const char *start, const char *end) +NOINLINE(static) VALUE json_decode_large_float(const char *start, long len) { - long len = end - start; + return json_decode_large_number(start, len, json_decode_dnum); +} +/* Ruby JSON optimized float decoder using vendored Ryu algorithm + * Accepts pre-extracted mantissa and exponent from first-pass validation + */ +static inline VALUE json_decode_float(JSON_ParserConfig *config, uint64_t mantissa, int mantissa_digits, int64_t exponent, bool negative, + const char *start, const char *end) +{ if (RB_UNLIKELY(config->decimal_class)) { - VALUE text = rb_str_new(start, len); + VALUE text = rb_str_new(start, end - start); return rb_funcallv(config->decimal_class, config->decimal_method_id, 1, &text); - } else if (RB_LIKELY(len < 64)) { - char buffer[64]; - MEMCPY(buffer, start, char, len); - buffer[len] = '\0'; - return DBL2NUM(rb_cstr_to_dbl(buffer, 1)); - } else { - return json_decode_large_float(start, len); } + + if (RB_UNLIKELY(exponent > INT32_MAX)) { + return negative ? CMinusInfinity : CInfinity; + } + + if (RB_UNLIKELY(exponent < INT32_MIN)) { + return rb_float_new(negative ? -0.0 : 0.0); + } + + // Fall back to rb_cstr_to_dbl for potential subnormals (rare edge case) + // Ryu has rounding issues with subnormals around 1e-310 (< 2.225e-308) + if (RB_UNLIKELY(mantissa_digits > 17 || mantissa_digits + exponent < -307)) { + return json_decode_large_float(start, end - start); + } + + return DBL2NUM(ryu_s2d_from_parts(mantissa, mantissa_digits, (int32_t)exponent, negative)); } static inline VALUE json_decode_array(JSON_ParserState *state, JSON_ParserConfig *config, long count) { - VALUE array; - if (RB_UNLIKELY(config->array_class)) { - array = rb_class_new_instance(0, 0, config->array_class); - VALUE *items = rvalue_stack_peek(state->stack, count); - long index; - for (index = 0; index < count; index++) { - rb_funcall(array, i_leftshift, 1, items[index]); - } - } else { - array = rb_ary_new_from_values(count, rvalue_stack_peek(state->stack, count)); - } - - rvalue_stack_pop(state->stack, count); + VALUE array = rb_ary_new_from_values(count, rvalue_stack_peek(state->value_stack, count)); + rvalue_stack_pop(state->value_stack, count); if (config->freeze) { RB_OBJ_FREEZE(array); @@ -790,89 +1052,98 @@ static inline VALUE json_decode_array(JSON_ParserState *state, JSON_ParserConfig return array; } -static inline VALUE json_decode_object(JSON_ParserState *state, JSON_ParserConfig *config, long count) +static VALUE json_find_duplicated_key(size_t count, const VALUE *pairs) { - VALUE object; - if (RB_UNLIKELY(config->object_class)) { - object = rb_class_new_instance(0, 0, config->object_class); - long index = 0; - VALUE *items = rvalue_stack_peek(state->stack, count); - while (index < count) { - VALUE name = items[index++]; - VALUE value = items[index++]; - rb_funcall(object, i_aset, 2, name, value); - } - } else { - object = rb_hash_new_capa(count); - rb_hash_bulk_insert(count, rvalue_stack_peek(state->stack, count), object); - } - - rvalue_stack_pop(state->stack, count); - - if (RB_UNLIKELY(config->create_additions)) { - VALUE klassname; - if (config->object_class) { - klassname = rb_funcall(object, i_aref, 1, config->create_id); - } else { - klassname = rb_hash_aref(object, config->create_id); - } - if (!NIL_P(klassname)) { - VALUE klass = rb_funcall(mJSON, i_deep_const_get, 1, klassname); - if (RTEST(rb_funcall(klass, i_json_creatable_p, 0))) { - if (config->deprecated_create_additions) { - json_deprecated(deprecated_create_additions_warning); - } - object = rb_funcall(klass, i_json_create, 1, object); + VALUE set = rb_hash_new_capa(count / 2); + for (size_t index = 0; index < count; index += 2) { + size_t before = RHASH_SIZE(set); + VALUE key = pairs[index]; + rb_hash_aset(set, key, Qtrue); + if (RHASH_SIZE(set) == before) { + if (RB_SYMBOL_P(key)) { + return rb_sym2str(key); } + return key; } } + return Qfalse; +} - if (config->freeze) { - RB_OBJ_FREEZE(object); - } +NOINLINE(static) void emit_duplicate_key_warning(JSON_ParserState *state, VALUE duplicate_key) +{ + VALUE message = rb_sprintf( + "detected duplicate key %"PRIsVALUE" in JSON object. This will raise an error in json 3.0 unless enabled via `allow_duplicate_key: true`", + rb_inspect(duplicate_key) + ); - return object; + emit_parse_warning(RSTRING_PTR(message), state); + RB_GC_GUARD(message); } -static int match_i(VALUE regexp, VALUE klass, VALUE memo) +NORETURN(static) void raise_duplicate_key_error(JSON_ParserState *state, VALUE duplicate_key) { - if (regexp == Qundef) return ST_STOP; - if (RTEST(rb_funcall(klass, i_json_creatable_p, 0)) && - RTEST(rb_funcall(regexp, i_match, 1, rb_ary_entry(memo, 0)))) { - rb_ary_push(memo, klass); - return ST_STOP; + VALUE message = rb_sprintf( + "duplicate key %"PRIsVALUE, + rb_inspect(duplicate_key) + ); + + long line, column; + cursor_position(state, &line, &column); + rb_str_concat(message, build_parse_error_message("", state, line, column)) ; + rb_exc_raise(parse_error_new(message, line, column)); +} + +NOINLINE(static) void json_on_duplicate_key(JSON_ParserState *state, JSON_ParserConfig *config, size_t count, const VALUE *pairs) +{ + switch (config->on_duplicate_key) { + case JSON_IGNORE: + return; + + case JSON_DEPRECATED: + // Only emit the first few deprecations to avoid spamming. + if (state->emitted_deprecations < 5) { + emit_duplicate_key_warning(state, json_find_duplicated_key(count, pairs)); + state->emitted_deprecations++; + } + return; + + case JSON_RAISE: + raise_duplicate_key_error(state, json_find_duplicated_key(count, pairs)); + return; } - return ST_CONTINUE; + UNREACHABLE; } -static inline VALUE json_decode_string(JSON_ParserState *state, JSON_ParserConfig *config, const char *start, const char *end, bool escaped, bool is_name) +static inline VALUE json_decode_object(JSON_ParserState *state, JSON_ParserConfig *config, size_t count) { - VALUE string; - bool intern = is_name || config->freeze; - bool symbolize = is_name && config->symbolize_names; - if (escaped) { - string = json_string_unescape(state, start, end, is_name, intern, symbolize); - } else { - string = json_string_fastpath(state, start, end, is_name, intern, symbolize); + size_t entries_count = count / 2; + VALUE object = rb_hash_new_capa(entries_count); + const VALUE *pairs = rvalue_stack_peek(state->value_stack, count); + rb_hash_bulk_insert(count, pairs, object); + + if (RB_UNLIKELY(RHASH_SIZE(object) < entries_count)) { + json_on_duplicate_key(state, config, count, pairs); } - if (RB_UNLIKELY(config->create_additions && RTEST(config->match_string))) { - VALUE klass; - VALUE memo = rb_ary_new2(2); - rb_ary_push(memo, string); - rb_hash_foreach(config->match_string, match_i, memo); - klass = rb_ary_entry(memo, 1); - if (RTEST(klass)) { - string = rb_funcall(klass, i_json_create, 1, string); - } + rvalue_stack_pop(state->value_stack, count); + + if (config->freeze) { + RB_OBJ_FREEZE(object); } - return string; + return object; } -#define PUSH(result) rvalue_stack_push(state->stack, result, &state->stack_handle, &state->stack) +static inline VALUE json_push_value(JSON_ParserState *state, JSON_ParserConfig *config, VALUE value) +{ + if (RB_UNLIKELY(config->on_load_proc)) { + value = rb_proc_call_with_block(config->on_load_proc, 1, &value, Qnil); + } + rvalue_stack_push(state->value_stack, value, state->value_stack_handle, &state->value_stack); + return value; +} -static const bool string_scan[256] = { +static const bool string_scan_table[256] = { // ASCII Control Characters 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -885,292 +1156,569 @@ static const bool string_scan[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; -static inline VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name) +#ifdef HAVE_SIMD +static SIMD_Implementation simd_impl = SIMD_NONE; +#endif /* HAVE_SIMD */ + +ALWAYS_INLINE(static) bool string_scan(JSON_ParserState *state) { - state->cursor++; - const char *start = state->cursor; - bool escaped = false; +#ifdef HAVE_SIMD +#if defined(HAVE_SIMD_NEON) - while (state->cursor < state->end) { - if (RB_UNLIKELY(string_scan[(unsigned char)*state->cursor])) { - switch (*state->cursor) { - case '"': { - VALUE string = json_decode_string(state, config, start, state->cursor, escaped, is_name); - state->cursor++; - return PUSH(string); - } - case '\\': { - state->cursor++; - escaped = true; - if ((unsigned char)*state->cursor < 0x20) { - raise_parse_error("invalid ASCII control character in string: %s", state->cursor); - } - break; + uint64_t mask = 0; + if (string_scan_simd_neon(&state->cursor, state->end, &mask)) { + state->cursor += trailing_zeros64(mask) >> 2; + return true; + } + +#elif defined(HAVE_SIMD_SSE2) + if (simd_impl == SIMD_SSE2) { + int mask = 0; + if (string_scan_simd_sse2(&state->cursor, state->end, &mask)) { + state->cursor += trailing_zeros(mask); + return true; + } + } +#endif /* HAVE_SIMD_NEON or HAVE_SIMD_SSE2 */ +#endif /* HAVE_SIMD */ + + while (!eos(state)) { + if (RB_UNLIKELY(string_scan_table[(unsigned char)*state->cursor])) { + return true; + } + state->cursor++; + } + + // If the string ended with an unterminated escape sequence, we might + // have gone past the end. + if (RB_UNLIKELY(state->cursor > state->end)) { + state->cursor = state->end; + } + + return false; +} + +static VALUE json_parse_escaped_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name, const char *start) +{ + const char *backslashes[JSON_MAX_UNESCAPE_POSITIONS]; + JSON_UnescapePositions positions = { + .size = 0, + .positions = backslashes, + .additional_backslashes = 0, + }; + + do { + switch (*state->cursor) { + case '"': { + VALUE string = json_string_unescape(state, config, start, state->cursor, is_name, &positions); + state->cursor++; + return json_push_value(state, config, string); + } + case '\\': { + if (RB_LIKELY(positions.size < JSON_MAX_UNESCAPE_POSITIONS)) { + backslashes[positions.size] = state->cursor; + positions.size++; + } else { + positions.additional_backslashes++; } - default: - raise_parse_error("invalid ASCII control character in string: %s", state->cursor); - break; + state->cursor++; + break; } + default: + if (!config->allow_control_characters) { + raise_parse_error("invalid ASCII control character in string: %s", state); + } + break; } state->cursor++; - } + } while (string_scan(state)); - raise_parse_error("unexpected end of input, expected closing \"", state->cursor); + raise_parse_error("unexpected end of input, expected closing \"", state); return Qfalse; } -static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) +ALWAYS_INLINE(static) VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name) { - json_eat_whitespace(state); - if (state->cursor >= state->end) { - raise_parse_error("unexpected end of input", state->cursor); + state->cursor++; + const char *start = state->cursor; + + if (RB_UNLIKELY(!string_scan(state))) { + raise_parse_error("unexpected end of input, expected closing \"", state); } - switch (*state->cursor) { - case 'n': - if ((state->end - state->cursor >= 4) && (memcmp(state->cursor, "null", 4) == 0)) { - state->cursor += 4; - return PUSH(Qnil); - } + if (RB_LIKELY(*state->cursor == '"')) { + VALUE string = json_string_fastpath(state, config, start, state->cursor, is_name); + state->cursor++; + return json_push_value(state, config, string); + } + return json_parse_escaped_string(state, config, is_name, start); +} - raise_parse_error("unexpected token at '%s'", state->cursor); - break; - case 't': - if ((state->end - state->cursor >= 4) && (memcmp(state->cursor, "true", 4) == 0)) { - state->cursor += 4; - return PUSH(Qtrue); - } +#if JSON_CPU_LITTLE_ENDIAN_64BITS +// From: https://lemire.me/blog/2022/01/21/swar-explained-parsing-eight-digits/ +// Additional References: +// https://johnnylee-sde.github.io/Fast-numeric-string-to-int/ +// http://0x80.pl/notesen/2014-10-12-parsing-decimal-numbers-part-1-swar.html +static inline uint64_t decode_8digits_unrolled(uint64_t val) { + const uint64_t mask = 0x000000FF000000FF; + const uint64_t mul1 = 0x000F424000000064; // 100 + (1000000ULL << 32) + const uint64_t mul2 = 0x0000271000000001; // 1 + (10000ULL << 32) + val -= 0x3030303030303030; + val = (val * 10) + (val >> 8); // val = (val * 2561) >> 8; + val = (((val & mask) * mul1) + (((val >> 16) & mask) * mul2)) >> 32; + return val; +} - raise_parse_error("unexpected token at '%s'", state->cursor); - break; - case 'f': - // Note: memcmp with a small power of two compile to an integer comparison - if ((state->end - state->cursor >= 5) && (memcmp(state->cursor + 1, "alse", 4) == 0)) { - state->cursor += 5; - return PUSH(Qfalse); - } +static inline uint64_t decode_4digits_unrolled(uint32_t val) { + const uint32_t mask = 0x000000FF; + const uint32_t mul1 = 100; + val -= 0x30303030; + val = (val * 10) + (val >> 8); // val = (val * 2561) >> 8; + val = ((val & mask) * mul1) + (((val >> 16) & mask)); + return val; +} +#endif - raise_parse_error("unexpected token at '%s'", state->cursor); - break; - case 'N': - // Note: memcmp with a small power of two compile to an integer comparison - if (config->allow_nan && (state->end - state->cursor >= 3) && (memcmp(state->cursor + 1, "aN", 2) == 0)) { - state->cursor += 3; - return PUSH(CNaN); - } +static inline int json_parse_digits(JSON_ParserState *state, uint64_t *accumulator) +{ + const char *start = state->cursor; - raise_parse_error("unexpected token at '%s'", state->cursor); - break; - case 'I': - if (config->allow_nan && (state->end - state->cursor >= 8) && (memcmp(state->cursor, "Infinity", 8) == 0)) { - state->cursor += 8; - return PUSH(CInfinity); - } +#if JSON_CPU_LITTLE_ENDIAN_64BITS + while (rest(state) >= sizeof(uint64_t)) { + uint64_t next_8bytes; + memcpy(&next_8bytes, state->cursor, sizeof(uint64_t)); - raise_parse_error("unexpected token at '%s'", state->cursor); - break; - case '-': - // Note: memcmp with a small power of two compile to an integer comparison - if ((state->end - state->cursor >= 9) && (memcmp(state->cursor + 1, "Infinity", 8) == 0)) { - if (config->allow_nan) { - state->cursor += 9; - return PUSH(CMinusInfinity); - } else { - raise_parse_error("unexpected token at '%s'", state->cursor); - } - } - // Fallthrough - case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { - bool integer = true; + // From: https://github.com/simdjson/simdjson/blob/32b301893c13d058095a07d9868edaaa42ee07aa/include/simdjson/generic/numberparsing.h#L333 + // Branchless version of: http://0x80.pl/articles/swar-digits-validate.html + uint64_t match = (next_8bytes & 0xF0F0F0F0F0F0F0F0) | (((next_8bytes + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4); + + if (match == 0x3333333333333333) { // 8 consecutive digits + *accumulator = (*accumulator * 100000000) + decode_8digits_unrolled(next_8bytes); + state->cursor += 8; + continue; + } - // /\A-?(0|[1-9]\d*)(\.\d+)?([Ee][-+]?\d+)?/ - const char *start = state->cursor; + uint32_t consecutive_digits = trailing_zeros64(match ^ 0x3333333333333333) / CHAR_BIT; + + if (consecutive_digits >= 4) { + *accumulator = (*accumulator * 10000) + decode_4digits_unrolled((uint32_t)next_8bytes); + state->cursor += 4; + consecutive_digits -= 4; + } + + while (consecutive_digits) { + *accumulator = *accumulator * 10 + (*state->cursor - '0'); + consecutive_digits--; state->cursor++; + } - while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) { - state->cursor++; - } + return (int)(state->cursor - start); + } +#endif - long integer_length = state->cursor - start; + char next_char; + while (rb_isdigit(next_char = peek(state))) { + *accumulator = *accumulator * 10 + (next_char - '0'); + state->cursor++; + } + return (int)(state->cursor - start); +} - if (RB_UNLIKELY(start[0] == '0' && integer_length > 1)) { - raise_parse_error("invalid number: %s", start); - } else if (RB_UNLIKELY(integer_length > 2 && start[0] == '-' && start[1] == '0')) { - raise_parse_error("invalid number: %s", start); - } else if (RB_UNLIKELY(integer_length == 1 && start[0] == '-')) { - raise_parse_error("invalid number: %s", start); - } +static inline VALUE json_parse_number(JSON_ParserState *state, JSON_ParserConfig *config, bool negative, const char *start) +{ + bool integer = true; + const char first_digit = *state->cursor; - if ((state->cursor < state->end) && (*state->cursor == '.')) { - integer = false; - state->cursor++; + // Variables for Ryu optimization - extract digits during parsing + int64_t exponent = 0; + int decimal_point_pos = -1; + uint64_t mantissa = 0; - if (state->cursor == state->end || *state->cursor < '0' || *state->cursor > '9') { - raise_parse_error("invalid number: %s", state->cursor); - } + // Parse integer part and extract mantissa digits + int mantissa_digits = json_parse_digits(state, &mantissa); - while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) { - state->cursor++; - } - } + if (RB_UNLIKELY((first_digit == '0' && mantissa_digits > 1) || (negative && mantissa_digits == 0))) { + raise_parse_error_at("invalid number: %s", state, start); + } - if ((state->cursor < state->end) && ((*state->cursor == 'e') || (*state->cursor == 'E'))) { - integer = false; - state->cursor++; - if ((state->cursor < state->end) && ((*state->cursor == '+') || (*state->cursor == '-'))) { - state->cursor++; - } + // Parse fractional part + if (peek(state) == '.') { + integer = false; + decimal_point_pos = mantissa_digits; // Remember position of decimal point + state->cursor++; - if (state->cursor == state->end || *state->cursor < '0' || *state->cursor > '9') { - raise_parse_error("invalid number: %s", state->cursor); - } + int fractional_digits = json_parse_digits(state, &mantissa); + mantissa_digits += fractional_digits; - while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) { - state->cursor++; - } - } + if (RB_UNLIKELY(!fractional_digits)) { + raise_parse_error_at("invalid number: %s", state, start); + } + } - if (integer) { - return PUSH(json_decode_integer(start, state->cursor)); - } - return PUSH(json_decode_float(config, start, state->cursor)); + // Parse exponent + if (rb_tolower(peek(state)) == 'e') { + integer = false; + state->cursor++; + + bool negative_exponent = false; + const char next_char = peek(state); + if (next_char == '-' || next_char == '+') { + negative_exponent = next_char == '-'; + state->cursor++; } - case '"': { - // %r{\A"[^"\\\t\n\x00]*(?:\\[bfnrtu\\/"][^"\\]*)*"} - return json_parse_string(state, config, false); - break; + + uint64_t abs_exponent = 0; + int exponent_digits = json_parse_digits(state, &abs_exponent); + + if (RB_UNLIKELY(!exponent_digits)) { + raise_parse_error_at("invalid number: %s", state, start); } - case '[': { - state->cursor++; - json_eat_whitespace(state); - long stack_head = state->stack->head; - if ((state->cursor < state->end) && (*state->cursor == ']')) { - state->cursor++; - return PUSH(json_decode_array(state, config, 0)); - } else { - state->current_nesting++; - if (RB_UNLIKELY(config->max_nesting && (config->max_nesting < state->current_nesting))) { - rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting); - } - state->in_array++; - json_parse_any(state, config); + if (RB_UNLIKELY(exponent_digits >= 20 || abs_exponent > (uint64_t)INT64_MAX)) { + exponent = negative_exponent ? INT64_MIN : INT64_MAX; + } else { + exponent = negative_exponent ? -(int64_t)abs_exponent : (int64_t)abs_exponent; + } + } + + if (integer) { + return json_decode_integer(mantissa, mantissa_digits, negative, start, state->cursor); + } + + // Adjust exponent based on decimal point position + if (decimal_point_pos >= 0) { + exponent -= (mantissa_digits - decimal_point_pos); + } + + return json_decode_float(config, mantissa, mantissa_digits, exponent, negative, start, state->cursor); +} + +static inline VALUE json_parse_positive_number(JSON_ParserState *state, JSON_ParserConfig *config) +{ + return json_parse_number(state, config, false, state->cursor); +} + +static inline VALUE json_parse_negative_number(JSON_ParserState *state, JSON_ParserConfig *config) +{ + const char *start = state->cursor; + state->cursor++; + return json_parse_number(state, config, true, start); +} + +// How many values (array elements, or interleaved object keys+values) have been +// pushed onto the rvalue stack since this container opened. Used to size the +// bulk decode on close, and to tell the first key/colon from later ones. +static inline long json_frame_entry_count(const json_frame *frame, const rvalue_stack *value_stack) +{ + return value_stack->head - frame->value_stack_head; +} + +// A complete value now sits on top of the rvalue stack. Advance the frame that +// was waiting for it: the root document is done, or the enclosing container +// moves on to expecting a ',' or its closing bracket. The caller passes the +// frame it already has in hand -- the one that was expecting the value -- which +// after a container close is the freshly re-exposed parent. +static inline void json_value_completed(json_frame *frame) +{ + switch (frame->type) { + case JSON_FRAME_ROOT: + frame->phase = JSON_PHASE_DONE; + return; + case JSON_FRAME_ARRAY: + frame->phase = JSON_PHASE_ARRAY_COMMA; + return; + case JSON_FRAME_OBJECT: + frame->phase = JSON_PHASE_OBJECT_COMMA; + return; + } + UNREACHABLE; +} + +// Parse an arbitrary JSON value iteratively. This is a state machine driven +// entirely by the top frame's phase so it can stop at any value boundary and +// resume purely from the frame stack. A JSON_FRAME_ROOT frame sits at the +// bottom of the stack, so the stack is never empty mid-parse and the document +// itself is just another frame whose value, once parsed, leaves its phase DONE. +static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) +{ + while (true) { + json_frame *frame = json_frame_stack_peek(state->frames); + + switch (frame->phase) { + case JSON_PHASE_DONE: { + // The root document value is parsed; it is the lone survivor on + // the rvalue stack. + return *rvalue_stack_peek(state->value_stack, 1); } - while (true) { + case JSON_PHASE_VALUE: { + JSON_PHASE_VALUE: json_eat_whitespace(state); - if (state->cursor < state->end) { - if (*state->cursor == ']') { + switch (peek(state)) { + case 'n': + if (rest(state) >= 4 && (memcmp(state->cursor, "null", 4) == 0)) { + state->cursor += 4; + json_push_value(state, config, Qnil); + json_value_completed(frame); + break; + } + + raise_parse_error("unexpected token %s", state); + case 't': + if (rest(state) >= 4 && (memcmp(state->cursor, "true", 4) == 0)) { + state->cursor += 4; + json_push_value(state, config, Qtrue); + json_value_completed(frame); + break; + } + + raise_parse_error("unexpected token %s", state); + case 'f': + // Note: memcmp with a small power of two compile to an integer comparison + if (rest(state) >= 5 && (memcmp(state->cursor + 1, "alse", 4) == 0)) { + state->cursor += 5; + json_push_value(state, config, Qfalse); + json_value_completed(frame); + break; + } + + raise_parse_error("unexpected token %s", state); + case 'N': + // Note: memcmp with a small power of two compile to an integer comparison + if (config->allow_nan && rest(state) >= 3 && (memcmp(state->cursor + 1, "aN", 2) == 0)) { + state->cursor += 3; + json_push_value(state, config, CNaN); + json_value_completed(frame); + break; + } + + raise_parse_error("unexpected token %s", state); + case 'I': + if (config->allow_nan && rest(state) >= 8 && (memcmp(state->cursor, "Infinity", 8) == 0)) { + state->cursor += 8; + json_push_value(state, config, CInfinity); + json_value_completed(frame); + break; + } + + raise_parse_error("unexpected token %s", state); + case '-': { + // Note: memcmp with a small power of two compile to an integer comparison + if (rest(state) >= 9 && (memcmp(state->cursor + 1, "Infinity", 8) == 0)) { + if (config->allow_nan) { + state->cursor += 9; + json_push_value(state, config, CMinusInfinity); + json_value_completed(frame); + break; + } else { + raise_parse_error("unexpected token %s", state); + } + } + json_push_value(state, config, json_parse_negative_number(state, config)); + json_value_completed(frame); + break; + } + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': + json_push_value(state, config, json_parse_positive_number(state, config)); + json_value_completed(frame); + break; + case '"': + // %r{\A"[^"\\\t\n\x00]*(?:\\[bfnrtu\\/"][^"\\]*)*"} + json_parse_string(state, config, false); + json_value_completed(frame); + break; + case '[': { state->cursor++; - long count = state->stack->head - stack_head; - state->current_nesting--; - state->in_array--; - return PUSH(json_decode_array(state, config, count)); + json_eat_whitespace(state); + + if (peek(state) == ']') { + state->cursor++; + json_push_value(state, config, json_decode_array(state, config, 0)); + json_value_completed(frame); + break; + } + + state->current_nesting++; + if (RB_UNLIKELY(config->max_nesting && (config->max_nesting < state->current_nesting))) { + rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting); + } + state->in_array++; + + // Phase stays VALUE: the next iteration reads the first element. + frame = json_frame_stack_push(state, (json_frame){ + .type = JSON_FRAME_ARRAY, + .phase = JSON_PHASE_VALUE, + .value_stack_head = state->value_stack->head, + }); + goto JSON_PHASE_VALUE; + break; } + case '{': { + const char *object_start_cursor = state->cursor; - if (*state->cursor == ',') { state->cursor++; - if (config->allow_trailing_comma) { - json_eat_whitespace(state); - if ((state->cursor < state->end) && (*state->cursor == ']')) { - continue; - } + json_eat_whitespace(state); + + if (peek(state) == '}') { + state->cursor++; + json_push_value(state, config, json_decode_object(state, config, 0)); + json_value_completed(frame); + break; } - json_parse_any(state, config); - continue; + + state->current_nesting++; + if (RB_UNLIKELY(config->max_nesting && (config->max_nesting < state->current_nesting))) { + rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting); + } + + // Phase KEY: the next iteration reads the first key. + frame = json_frame_stack_push(state, (json_frame){ + .type = JSON_FRAME_OBJECT, + .phase = JSON_PHASE_OBJECT_KEY, + .value_stack_head = state->value_stack->head, + .start_cursor = object_start_cursor, + }); + goto JSON_PHASE_OBJECT_KEY; + break; } - } - raise_parse_error("expected ',' or ']' after array value", state->cursor); - } - break; - } - case '{': { - state->cursor++; - json_eat_whitespace(state); - long stack_head = state->stack->head; + case 0: + raise_parse_error("unexpected end of input", state); - if ((state->cursor < state->end) && (*state->cursor == '}')) { - state->cursor++; - return PUSH(json_decode_object(state, config, 0)); - } else { - state->current_nesting++; - if (RB_UNLIKELY(config->max_nesting && (config->max_nesting < state->current_nesting))) { - rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting); + default: + raise_parse_error("unexpected character: %s", state); } + break; + } - if (*state->cursor != '"') { - raise_parse_error("expected object key, got '%s", state->cursor); - } - json_parse_string(state, config, true); + case JSON_PHASE_OBJECT_KEY: { + JSON_PHASE_OBJECT_KEY: + JSON_ASSERT(frame->type == JSON_FRAME_OBJECT); json_eat_whitespace(state); - if ((state->cursor >= state->end) || (*state->cursor != ':')) { - raise_parse_error("expected ':' after object key", state->cursor); - } - state->cursor++; - json_parse_any(state, config); + if (RB_LIKELY(peek(state) == '"')) { + json_parse_string(state, config, true); + frame->phase = JSON_PHASE_OBJECT_COLON; + goto JSON_PHASE_OBJECT_COLON; + } else { + // The message differs for the first key vs. a key after a + // ',': the first is the only one reached with nothing pushed + // for this object yet. + if (json_frame_entry_count(frame, state->value_stack) == 0) { + raise_parse_error("expected object key, got %s", state); + } else { + raise_parse_error("expected object key, got: %s", state); + } + } + break; } - while (true) { + case JSON_PHASE_OBJECT_COLON: { + JSON_PHASE_OBJECT_COLON: + JSON_ASSERT(frame->type == JSON_FRAME_OBJECT); + json_eat_whitespace(state); - if (state->cursor < state->end) { - if (*state->cursor == '}') { - state->cursor++; - state->current_nesting--; - long count = state->stack->head - stack_head; - return PUSH(json_decode_object(state, config, count)); + if (RB_LIKELY(peek(state) == ':')) { + state->cursor++; + frame->phase = JSON_PHASE_VALUE; + goto JSON_PHASE_VALUE; + } else { + // First colon (only the first pair's key is pushed, nothing + // else) vs. a later one. + if (json_frame_entry_count(frame, state->value_stack) == 1) { + raise_parse_error("expected ':' after object key", state); + } else { + raise_parse_error("expected ':' after object key, got: %s", state); } + } + break; + } - if (*state->cursor == ',') { - state->cursor++; - json_eat_whitespace(state); + case JSON_PHASE_ARRAY_COMMA: { + JSON_ASSERT(frame->type == JSON_FRAME_ARRAY); - if (config->allow_trailing_comma) { - if ((state->cursor < state->end) && (*state->cursor == '}')) { - continue; - } - } + json_eat_whitespace(state); - if (*state->cursor != '"') { - raise_parse_error("expected object key, got: '%s'", state->cursor); - } - json_parse_string(state, config, true); + const char next_char = peek(state); + if (RB_LIKELY(next_char == ',')) { + state->cursor++; + if (config->allow_trailing_comma) { json_eat_whitespace(state); - if ((state->cursor >= state->end) || (*state->cursor != ':')) { - raise_parse_error("expected ':' after object key, got: '%s", state->cursor); + if (peek(state) == ']') { + // Trailing comma: stay in COMMA to close on the next iteration. + break; } - state->cursor++; + } + frame->phase = JSON_PHASE_VALUE; + goto JSON_PHASE_VALUE; + } else if (next_char == ']') { + state->cursor++; + long count = json_frame_entry_count(frame, state->value_stack); + state->current_nesting--; + state->in_array--; + json_frame_stack_pop(state->frames); + json_push_value(state, config, json_decode_array(state, config, count)); + json_value_completed(json_frame_stack_peek(state->frames)); + } else { + raise_parse_error("expected ',' or ']' after array value", state); + } + break; + } - json_parse_any(state, config); + case JSON_PHASE_OBJECT_COMMA: { + JSON_ASSERT(frame->type == JSON_FRAME_OBJECT); - continue; + json_eat_whitespace(state); + const char next_char = peek(state); + + if (RB_LIKELY(next_char == ',')) { + state->cursor++; + json_eat_whitespace(state); + + if (config->allow_trailing_comma) { + if (peek(state) == '}') { + // Trailing comma: stay in COMMA to close on the next iteration. + break; + } } - } - raise_parse_error("expected ',' or '}' after object value, got: '%s'", state->cursor); + frame->phase = JSON_PHASE_OBJECT_KEY; + goto JSON_PHASE_OBJECT_KEY; + + break; + } else if (next_char == '}') { + state->cursor++; + state->current_nesting--; + size_t count = json_frame_entry_count(frame, state->value_stack); + + // Temporary rewind cursor in case an error is raised + const char *final_cursor = state->cursor; + state->cursor = frame->start_cursor; + VALUE object = json_decode_object(state, config, count); + state->cursor = final_cursor; + + json_frame_stack_pop(state->frames); + json_push_value(state, config, object); + json_value_completed(json_frame_stack_peek(state->frames)); + break; + } else { + raise_parse_error("expected ',' or '}' after object value, got: %s", state); + } } - break; } - - default: - raise_parse_error("unexpected character: '%s'", state->cursor); - break; } - - raise_parse_error("unreacheable: '%s'", state->cursor); } static void json_ensure_eof(JSON_ParserState *state) { json_eat_whitespace(state); - if (state->cursor != state->end) { - raise_parse_error("unexpected token at end of stream '%s'", state->cursor); + if (!eos(state)) { + raise_parse_error("unexpected token at end of stream %s", state); } } @@ -1188,40 +1736,56 @@ static void json_ensure_eof(JSON_ParserState *state) static VALUE convert_encoding(VALUE source) { - int encindex = RB_ENCODING_GET(source); + StringValue(source); + int encindex = RB_ENCODING_GET(source); - if (RB_LIKELY(encindex == utf8_encindex)) { + if (RB_LIKELY(encindex == utf8_encindex)) { + return source; + } + + if (encindex == binary_encindex) { + // For historical reason, we silently reinterpret binary strings as UTF-8 + return rb_enc_associate_index(rb_str_dup(source), utf8_encindex); + } + + source = rb_funcall(source, i_encode, 1, Encoding_UTF_8); + StringValue(source); return source; - } +} - if (encindex == binary_encindex) { - // For historical reason, we silently reinterpret binary strings as UTF-8 - return rb_enc_associate_index(rb_str_dup(source), utf8_encindex); - } +struct parser_config_init_args { + JSON_ParserConfig *config; + VALUE self; +}; - return rb_funcall(source, i_encode, 1, Encoding_UTF_8); +static void parser_config_wb_write(VALUE self, VALUE *dest, VALUE val) +{ + *dest = val; + if (self) RB_OBJ_WRITTEN(self, Qundef, val); } static int parser_config_init_i(VALUE key, VALUE val, VALUE data) { - JSON_ParserConfig *config = (JSON_ParserConfig *)data; - - if (key == sym_max_nesting) { config->max_nesting = RTEST(val) ? FIX2INT(val) : 0; } - else if (key == sym_allow_nan) { config->allow_nan = RTEST(val); } - else if (key == sym_allow_trailing_comma) { config->allow_trailing_comma = RTEST(val); } - else if (key == sym_symbolize_names) { config->symbolize_names = RTEST(val); } - else if (key == sym_freeze) { config->freeze = RTEST(val); } - else if (key == sym_create_id) { config->create_id = RTEST(val) ? val : Qfalse; } - else if (key == sym_object_class) { config->object_class = RTEST(val) ? val : Qfalse; } - else if (key == sym_array_class) { config->array_class = RTEST(val) ? val : Qfalse; } - else if (key == sym_match_string) { config->match_string = RTEST(val) ? val : Qfalse; } - else if (key == sym_decimal_class) { + struct parser_config_init_args *args = (struct parser_config_init_args *)data; + JSON_ParserConfig *config = args->config; + VALUE self = args->self; + + if (key == sym_max_nesting) { config->max_nesting = RTEST(val) ? FIX2INT(val) : 0; } + else if (key == sym_allow_nan) { config->allow_nan = RTEST(val); } + else if (key == sym_allow_trailing_comma) { config->allow_trailing_comma = RTEST(val); } + else if (key == sym_allow_control_characters) { config->allow_control_characters = RTEST(val); } + else if (key == sym_allow_invalid_escape) { config->allow_invalid_escape = RTEST(val); } + else if (key == sym_symbolize_names) { config->symbolize_names = RTEST(val); } + else if (key == sym_freeze) { config->freeze = RTEST(val); } + else if (key == sym_on_load) { parser_config_wb_write(self, &config->on_load_proc, RTEST(val) ? val : Qfalse); } + else if (key == sym_allow_duplicate_key) { config->on_duplicate_key = RTEST(val) ? JSON_IGNORE : JSON_RAISE; } + else if (key == sym_decimal_class) { if (RTEST(val)) { if (rb_respond_to(val, i_try_convert)) { - config->decimal_class = val; + parser_config_wb_write(self, &config->decimal_class, val); config->decimal_method_id = i_try_convert; } else if (rb_respond_to(val, i_new)) { - config->decimal_class = val; + parser_config_wb_write(self, &config->decimal_class, val); config->decimal_method_id = i_new; } else if (RB_TYPE_P(val, T_CLASS)) { VALUE name = rb_class_name(val); @@ -1230,7 +1794,7 @@ static int parser_config_init_i(VALUE key, VALUE val, VALUE data) if (last_colon) { const char *mod_path_end = last_colon - 1; VALUE mod_path = rb_str_substr(name, 0, mod_path_end - name_cstr); - config->decimal_class = rb_path_to_class(mod_path); + parser_config_wb_write(self, &config->decimal_class, rb_path_to_class(mod_path)); const char *method_name_beg = last_colon + 1; long before_len = method_name_beg - name_cstr; @@ -1238,45 +1802,31 @@ static int parser_config_init_i(VALUE key, VALUE val, VALUE data) VALUE method_name = rb_str_substr(name, before_len, len); config->decimal_method_id = SYM2ID(rb_str_intern(method_name)); } else { - config->decimal_class = rb_mKernel; + parser_config_wb_write(self, &config->decimal_class, rb_mKernel); config->decimal_method_id = SYM2ID(rb_str_intern(name)); } } } } - else if (key == sym_create_additions) { - if (NIL_P(val)) { - config->create_additions = true; - config->deprecated_create_additions = true; - } else { - config->create_additions = RTEST(val); - config->deprecated_create_additions = false; - } - } return ST_CONTINUE; } -static void parser_config_init(JSON_ParserConfig *config, VALUE opts) +static void parser_config_init(JSON_ParserConfig *config, VALUE opts, VALUE self) { config->max_nesting = 100; + struct parser_config_init_args args = { + .config = config, + .self = self, + }; + if (!NIL_P(opts)) { Check_Type(opts, T_HASH); if (RHASH_SIZE(opts) > 0) { // We assume in most cases few keys are set so it's faster to go over // the provided keys than to check all possible keys. - rb_hash_foreach(opts, parser_config_init_i, (VALUE)config); - - if (config->symbolize_names && config->create_additions) { - rb_raise(rb_eArgError, - "options :symbolize_names and :create_additions cannot be " - " used in conjunction"); - } - - if (config->create_additions && !config->create_id) { - config->create_id = rb_funcall(mJSON, i_create_id, 0); - } + rb_hash_foreach(opts, parser_config_init_i, (VALUE)&args); } } @@ -1301,50 +1851,68 @@ static void parser_config_init(JSON_ParserConfig *config, VALUE opts) * (keys) in a JSON object. Otherwise strings are returned, which is * also the default. It's not possible to use this option in * conjunction with the *create_additions* option. - * * *create_additions*: If set to false, the Parser doesn't create - * additions even if a matching class and create_id was found. This option - * defaults to false. - * * *object_class*: Defaults to Hash. If another type is provided, it will be used - * instead of Hash to represent JSON objects. The type must respond to - * +new+ without arguments, and return an object that respond to +[]=+. - * * *array_class*: Defaults to Array If another type is provided, it will be used - * instead of Hash to represent JSON arrays. The type must respond to - * +new+ without arguments, and return an object that respond to +<<+. * * *decimal_class*: Specifies which class to use instead of the default * (Float) when parsing decimal numbers. This class must accept a single * string argument in its constructor. */ static VALUE cParserConfig_initialize(VALUE self, VALUE opts) { + rb_check_frozen(self); GET_PARSER_CONFIG; - parser_config_init(config, opts); - - RB_OBJ_WRITTEN(self, Qundef, config->create_id); - RB_OBJ_WRITTEN(self, Qundef, config->object_class); - RB_OBJ_WRITTEN(self, Qundef, config->array_class); - RB_OBJ_WRITTEN(self, Qundef, config->decimal_class); - RB_OBJ_WRITTEN(self, Qundef, config->match_string); + parser_config_init(config, opts, self); return self; } -static VALUE cParser_parse(JSON_ParserConfig *config, VALUE Vsource) +static VALUE cParser_parse(JSON_ParserConfig *config, VALUE src) { - Vsource = convert_encoding(StringValue(Vsource)); - StringValue(Vsource); + VALUE Vsource = convert_encoding(src); + + // Ensure the string isn't mutated under us. + // The classic API to use is `rb_str_locktmp`, but then we'd + // need to use `rb_protect` to make sure we always unlock. + if (Vsource == src) { + Vsource = rb_str_new_frozen(Vsource); + } VALUE rvalue_stack_buffer[RVALUE_STACK_INITIAL_CAPA]; - rvalue_stack stack = { + rvalue_stack value_stack = { .type = RVALUE_STACK_STACK_ALLOCATED, .ptr = rvalue_stack_buffer, .capa = RVALUE_STACK_INITIAL_CAPA, }; + // Seed the frame stack with the root frame, establishing the invariant that + // json_parse_any always has a top frame to dispatch on (so the stack is never + // empty mid-parse). + json_frame frame_stack_buffer[JSON_FRAME_STACK_INITIAL_CAPA]; + frame_stack_buffer[0] = (json_frame){ + .type = JSON_FRAME_ROOT, + .phase = JSON_PHASE_VALUE, + }; + json_frame_stack frames = { + .type = RVALUE_STACK_STACK_ALLOCATED, + .ptr = frame_stack_buffer, + .capa = JSON_FRAME_STACK_INITIAL_CAPA, + .head = 1, + }; + + long len; + const char *start; + + RSTRING_GETMEM(Vsource, start, len); + + VALUE value_stack_handle = 0; + VALUE frame_stack_handle = 0; JSON_ParserState _state = { - .cursor = RSTRING_PTR(Vsource), - .end = RSTRING_END(Vsource), - .stack = &stack, + .start = start, + .cursor = start, + .end = start + len, + .value_stack = &value_stack, + .value_stack_handle = &value_stack_handle, + .frames = &frames, + .frame_stack_handle = &frame_stack_handle, }; JSON_ParserState *state = &_state; @@ -1352,8 +1920,11 @@ static VALUE cParser_parse(JSON_ParserConfig *config, VALUE Vsource) // This may be skipped in case of exception, but // it won't cause a leak. - rvalue_stack_eagerly_release(state->stack_handle); - + rvalue_stack_eagerly_release(value_stack_handle); + json_frame_stack_eagerly_release(frame_stack_handle); + RB_GC_GUARD(value_stack_handle); + RB_GC_GUARD(frame_stack_handle); + RB_GC_GUARD(Vsource); json_ensure_eof(state); return result; @@ -1374,12 +1945,9 @@ static VALUE cParserConfig_parse(VALUE self, VALUE Vsource) static VALUE cParser_m_parse(VALUE klass, VALUE Vsource, VALUE opts) { - Vsource = convert_encoding(StringValue(Vsource)); - StringValue(Vsource); - JSON_ParserConfig _config = {0}; JSON_ParserConfig *config = &_config; - parser_config_init(config, opts); + parser_config_init(config, opts, false); return cParser_parse(config, Vsource); } @@ -1387,17 +1955,8 @@ static VALUE cParser_m_parse(VALUE klass, VALUE Vsource, VALUE opts) static void JSON_ParserConfig_mark(void *ptr) { JSON_ParserConfig *config = ptr; - rb_gc_mark(config->create_id); - rb_gc_mark(config->object_class); - rb_gc_mark(config->array_class); + rb_gc_mark(config->on_load_proc); rb_gc_mark(config->decimal_class); - rb_gc_mark(config->match_string); -} - -static void JSON_ParserConfig_free(void *ptr) -{ - JSON_ParserConfig *config = ptr; - ruby_xfree(config); } static size_t JSON_ParserConfig_memsize(const void *ptr) @@ -1406,14 +1965,13 @@ static size_t JSON_ParserConfig_memsize(const void *ptr) } static const rb_data_type_t JSON_ParserConfig_type = { - "JSON::Ext::Parser/ParserConfig", - { + .wrap_struct_name = "JSON::Ext::Parser/ParserConfig", + .function = { JSON_ParserConfig_mark, - JSON_ParserConfig_free, + RUBY_DEFAULT_FREE, JSON_ParserConfig_memsize, }, - 0, 0, - RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED, + .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FROZEN_SHAREABLE | RUBY_TYPED_EMBEDDABLE, }; static VALUE cJSON_parser_s_allocate(VALUE klass) @@ -1457,24 +2015,14 @@ void Init_parser(void) sym_max_nesting = ID2SYM(rb_intern("max_nesting")); sym_allow_nan = ID2SYM(rb_intern("allow_nan")); sym_allow_trailing_comma = ID2SYM(rb_intern("allow_trailing_comma")); + sym_allow_control_characters = ID2SYM(rb_intern("allow_control_characters")); + sym_allow_invalid_escape = ID2SYM(rb_intern("allow_invalid_escape")); sym_symbolize_names = ID2SYM(rb_intern("symbolize_names")); sym_freeze = ID2SYM(rb_intern("freeze")); - sym_create_additions = ID2SYM(rb_intern("create_additions")); - sym_create_id = ID2SYM(rb_intern("create_id")); - sym_object_class = ID2SYM(rb_intern("object_class")); - sym_array_class = ID2SYM(rb_intern("array_class")); + sym_on_load = ID2SYM(rb_intern("on_load")); sym_decimal_class = ID2SYM(rb_intern("decimal_class")); - sym_match_string = ID2SYM(rb_intern("match_string")); - - i_create_id = rb_intern("create_id"); - i_json_creatable_p = rb_intern("json_creatable?"); - i_json_create = rb_intern("json_create"); - i_chr = rb_intern("chr"); - i_match = rb_intern("match"); - i_deep_const_get = rb_intern("deep_const_get"); - i_aset = rb_intern("[]="); - i_aref = rb_intern("[]"); - i_leftshift = rb_intern("<<"); + sym_allow_duplicate_key = ID2SYM(rb_intern("allow_duplicate_key")); + i_new = rb_intern("new"); i_try_convert = rb_intern("try_convert"); i_uminus = rb_intern("-@"); @@ -1483,4 +2031,8 @@ void Init_parser(void) binary_encindex = rb_ascii8bit_encindex(); utf8_encindex = rb_utf8_encindex(); enc_utf8 = rb_utf8_encoding(); + +#ifdef HAVE_SIMD + simd_impl = find_simd_implementation(); +#endif } |
