diff options
Diffstat (limited to 'ext/json/parser')
| -rw-r--r-- | ext/json/parser/depend | 182 | ||||
| -rw-r--r-- | ext/json/parser/extconf.rb | 16 | ||||
| -rw-r--r-- | ext/json/parser/parser.c | 1672 |
3 files changed, 1870 insertions, 0 deletions
diff --git a/ext/json/parser/depend b/ext/json/parser/depend new file mode 100644 index 0000000000..d4737b1dfb --- /dev/null +++ b/ext/json/parser/depend @@ -0,0 +1,182 @@ +$(OBJS): $(ruby_headers) +parser.o: parser.c $(srcdir)/../fbuffer/fbuffer.h + +# AUTOGENERATED DEPENDENCIES START +parser.o: $(RUBY_EXTCONF_H) +parser.o: $(arch_hdrdir)/ruby/config.h +parser.o: $(hdrdir)/ruby.h +parser.o: $(hdrdir)/ruby/assert.h +parser.o: $(hdrdir)/ruby/backward.h +parser.o: $(hdrdir)/ruby/backward/2/assume.h +parser.o: $(hdrdir)/ruby/backward/2/attributes.h +parser.o: $(hdrdir)/ruby/backward/2/bool.h +parser.o: $(hdrdir)/ruby/backward/2/inttypes.h +parser.o: $(hdrdir)/ruby/backward/2/limits.h +parser.o: $(hdrdir)/ruby/backward/2/long_long.h +parser.o: $(hdrdir)/ruby/backward/2/stdalign.h +parser.o: $(hdrdir)/ruby/backward/2/stdarg.h +parser.o: $(hdrdir)/ruby/defines.h +parser.o: $(hdrdir)/ruby/encoding.h +parser.o: $(hdrdir)/ruby/intern.h +parser.o: $(hdrdir)/ruby/internal/abi.h +parser.o: $(hdrdir)/ruby/internal/anyargs.h +parser.o: $(hdrdir)/ruby/internal/arithmetic.h +parser.o: $(hdrdir)/ruby/internal/arithmetic/char.h +parser.o: $(hdrdir)/ruby/internal/arithmetic/double.h +parser.o: $(hdrdir)/ruby/internal/arithmetic/fixnum.h +parser.o: $(hdrdir)/ruby/internal/arithmetic/gid_t.h +parser.o: $(hdrdir)/ruby/internal/arithmetic/int.h +parser.o: $(hdrdir)/ruby/internal/arithmetic/intptr_t.h +parser.o: $(hdrdir)/ruby/internal/arithmetic/long.h +parser.o: $(hdrdir)/ruby/internal/arithmetic/long_long.h +parser.o: $(hdrdir)/ruby/internal/arithmetic/mode_t.h +parser.o: $(hdrdir)/ruby/internal/arithmetic/off_t.h +parser.o: $(hdrdir)/ruby/internal/arithmetic/pid_t.h +parser.o: $(hdrdir)/ruby/internal/arithmetic/short.h +parser.o: $(hdrdir)/ruby/internal/arithmetic/size_t.h +parser.o: $(hdrdir)/ruby/internal/arithmetic/st_data_t.h +parser.o: $(hdrdir)/ruby/internal/arithmetic/uid_t.h +parser.o: $(hdrdir)/ruby/internal/assume.h +parser.o: $(hdrdir)/ruby/internal/attr/alloc_size.h +parser.o: $(hdrdir)/ruby/internal/attr/artificial.h +parser.o: $(hdrdir)/ruby/internal/attr/cold.h +parser.o: $(hdrdir)/ruby/internal/attr/const.h +parser.o: $(hdrdir)/ruby/internal/attr/constexpr.h +parser.o: $(hdrdir)/ruby/internal/attr/deprecated.h +parser.o: $(hdrdir)/ruby/internal/attr/diagnose_if.h +parser.o: $(hdrdir)/ruby/internal/attr/enum_extensibility.h +parser.o: $(hdrdir)/ruby/internal/attr/error.h +parser.o: $(hdrdir)/ruby/internal/attr/flag_enum.h +parser.o: $(hdrdir)/ruby/internal/attr/forceinline.h +parser.o: $(hdrdir)/ruby/internal/attr/format.h +parser.o: $(hdrdir)/ruby/internal/attr/maybe_unused.h +parser.o: $(hdrdir)/ruby/internal/attr/noalias.h +parser.o: $(hdrdir)/ruby/internal/attr/nodiscard.h +parser.o: $(hdrdir)/ruby/internal/attr/noexcept.h +parser.o: $(hdrdir)/ruby/internal/attr/noinline.h +parser.o: $(hdrdir)/ruby/internal/attr/nonnull.h +parser.o: $(hdrdir)/ruby/internal/attr/noreturn.h +parser.o: $(hdrdir)/ruby/internal/attr/packed_struct.h +parser.o: $(hdrdir)/ruby/internal/attr/pure.h +parser.o: $(hdrdir)/ruby/internal/attr/restrict.h +parser.o: $(hdrdir)/ruby/internal/attr/returns_nonnull.h +parser.o: $(hdrdir)/ruby/internal/attr/warning.h +parser.o: $(hdrdir)/ruby/internal/attr/weakref.h +parser.o: $(hdrdir)/ruby/internal/cast.h +parser.o: $(hdrdir)/ruby/internal/compiler_is.h +parser.o: $(hdrdir)/ruby/internal/compiler_is/apple.h +parser.o: $(hdrdir)/ruby/internal/compiler_is/clang.h +parser.o: $(hdrdir)/ruby/internal/compiler_is/gcc.h +parser.o: $(hdrdir)/ruby/internal/compiler_is/intel.h +parser.o: $(hdrdir)/ruby/internal/compiler_is/msvc.h +parser.o: $(hdrdir)/ruby/internal/compiler_is/sunpro.h +parser.o: $(hdrdir)/ruby/internal/compiler_since.h +parser.o: $(hdrdir)/ruby/internal/config.h +parser.o: $(hdrdir)/ruby/internal/constant_p.h +parser.o: $(hdrdir)/ruby/internal/core.h +parser.o: $(hdrdir)/ruby/internal/core/rarray.h +parser.o: $(hdrdir)/ruby/internal/core/rbasic.h +parser.o: $(hdrdir)/ruby/internal/core/rbignum.h +parser.o: $(hdrdir)/ruby/internal/core/rclass.h +parser.o: $(hdrdir)/ruby/internal/core/rdata.h +parser.o: $(hdrdir)/ruby/internal/core/rfile.h +parser.o: $(hdrdir)/ruby/internal/core/rhash.h +parser.o: $(hdrdir)/ruby/internal/core/robject.h +parser.o: $(hdrdir)/ruby/internal/core/rregexp.h +parser.o: $(hdrdir)/ruby/internal/core/rstring.h +parser.o: $(hdrdir)/ruby/internal/core/rstruct.h +parser.o: $(hdrdir)/ruby/internal/core/rtypeddata.h +parser.o: $(hdrdir)/ruby/internal/ctype.h +parser.o: $(hdrdir)/ruby/internal/dllexport.h +parser.o: $(hdrdir)/ruby/internal/dosish.h +parser.o: $(hdrdir)/ruby/internal/encoding/coderange.h +parser.o: $(hdrdir)/ruby/internal/encoding/ctype.h +parser.o: $(hdrdir)/ruby/internal/encoding/encoding.h +parser.o: $(hdrdir)/ruby/internal/encoding/pathname.h +parser.o: $(hdrdir)/ruby/internal/encoding/re.h +parser.o: $(hdrdir)/ruby/internal/encoding/sprintf.h +parser.o: $(hdrdir)/ruby/internal/encoding/string.h +parser.o: $(hdrdir)/ruby/internal/encoding/symbol.h +parser.o: $(hdrdir)/ruby/internal/encoding/transcode.h +parser.o: $(hdrdir)/ruby/internal/error.h +parser.o: $(hdrdir)/ruby/internal/eval.h +parser.o: $(hdrdir)/ruby/internal/event.h +parser.o: $(hdrdir)/ruby/internal/fl_type.h +parser.o: $(hdrdir)/ruby/internal/gc.h +parser.o: $(hdrdir)/ruby/internal/glob.h +parser.o: $(hdrdir)/ruby/internal/globals.h +parser.o: $(hdrdir)/ruby/internal/has/attribute.h +parser.o: $(hdrdir)/ruby/internal/has/builtin.h +parser.o: $(hdrdir)/ruby/internal/has/c_attribute.h +parser.o: $(hdrdir)/ruby/internal/has/cpp_attribute.h +parser.o: $(hdrdir)/ruby/internal/has/declspec_attribute.h +parser.o: $(hdrdir)/ruby/internal/has/extension.h +parser.o: $(hdrdir)/ruby/internal/has/feature.h +parser.o: $(hdrdir)/ruby/internal/has/warning.h +parser.o: $(hdrdir)/ruby/internal/intern/array.h +parser.o: $(hdrdir)/ruby/internal/intern/bignum.h +parser.o: $(hdrdir)/ruby/internal/intern/class.h +parser.o: $(hdrdir)/ruby/internal/intern/compar.h +parser.o: $(hdrdir)/ruby/internal/intern/complex.h +parser.o: $(hdrdir)/ruby/internal/intern/cont.h +parser.o: $(hdrdir)/ruby/internal/intern/dir.h +parser.o: $(hdrdir)/ruby/internal/intern/enum.h +parser.o: $(hdrdir)/ruby/internal/intern/enumerator.h +parser.o: $(hdrdir)/ruby/internal/intern/error.h +parser.o: $(hdrdir)/ruby/internal/intern/eval.h +parser.o: $(hdrdir)/ruby/internal/intern/file.h +parser.o: $(hdrdir)/ruby/internal/intern/hash.h +parser.o: $(hdrdir)/ruby/internal/intern/io.h +parser.o: $(hdrdir)/ruby/internal/intern/load.h +parser.o: $(hdrdir)/ruby/internal/intern/marshal.h +parser.o: $(hdrdir)/ruby/internal/intern/numeric.h +parser.o: $(hdrdir)/ruby/internal/intern/object.h +parser.o: $(hdrdir)/ruby/internal/intern/parse.h +parser.o: $(hdrdir)/ruby/internal/intern/proc.h +parser.o: $(hdrdir)/ruby/internal/intern/process.h +parser.o: $(hdrdir)/ruby/internal/intern/random.h +parser.o: $(hdrdir)/ruby/internal/intern/range.h +parser.o: $(hdrdir)/ruby/internal/intern/rational.h +parser.o: $(hdrdir)/ruby/internal/intern/re.h +parser.o: $(hdrdir)/ruby/internal/intern/ruby.h +parser.o: $(hdrdir)/ruby/internal/intern/select.h +parser.o: $(hdrdir)/ruby/internal/intern/select/largesize.h +parser.o: $(hdrdir)/ruby/internal/intern/set.h +parser.o: $(hdrdir)/ruby/internal/intern/signal.h +parser.o: $(hdrdir)/ruby/internal/intern/sprintf.h +parser.o: $(hdrdir)/ruby/internal/intern/string.h +parser.o: $(hdrdir)/ruby/internal/intern/struct.h +parser.o: $(hdrdir)/ruby/internal/intern/thread.h +parser.o: $(hdrdir)/ruby/internal/intern/time.h +parser.o: $(hdrdir)/ruby/internal/intern/variable.h +parser.o: $(hdrdir)/ruby/internal/intern/vm.h +parser.o: $(hdrdir)/ruby/internal/interpreter.h +parser.o: $(hdrdir)/ruby/internal/iterator.h +parser.o: $(hdrdir)/ruby/internal/memory.h +parser.o: $(hdrdir)/ruby/internal/method.h +parser.o: $(hdrdir)/ruby/internal/module.h +parser.o: $(hdrdir)/ruby/internal/newobj.h +parser.o: $(hdrdir)/ruby/internal/scan_args.h +parser.o: $(hdrdir)/ruby/internal/special_consts.h +parser.o: $(hdrdir)/ruby/internal/static_assert.h +parser.o: $(hdrdir)/ruby/internal/stdalign.h +parser.o: $(hdrdir)/ruby/internal/stdbool.h +parser.o: $(hdrdir)/ruby/internal/stdckdint.h +parser.o: $(hdrdir)/ruby/internal/symbol.h +parser.o: $(hdrdir)/ruby/internal/value.h +parser.o: $(hdrdir)/ruby/internal/value_type.h +parser.o: $(hdrdir)/ruby/internal/variable.h +parser.o: $(hdrdir)/ruby/internal/warning_push.h +parser.o: $(hdrdir)/ruby/internal/xmalloc.h +parser.o: $(hdrdir)/ruby/missing.h +parser.o: $(hdrdir)/ruby/onigmo.h +parser.o: $(hdrdir)/ruby/oniguruma.h +parser.o: $(hdrdir)/ruby/ruby.h +parser.o: $(hdrdir)/ruby/st.h +parser.o: $(hdrdir)/ruby/subst.h +parser.o: $(srcdir)/../fbuffer/fbuffer.h +parser.o: $(srcdir)/../json.h +parser.o: $(srcdir)/../simd/simd.h +parser.o: $(srcdir)/../vendor/ryu.h +parser.o: parser.c +# AUTOGENERATED DEPENDENCIES END diff --git a/ext/json/parser/extconf.rb b/ext/json/parser/extconf.rb new file mode 100644 index 0000000000..2440e66d8b --- /dev/null +++ b/ext/json/parser/extconf.rb @@ -0,0 +1,16 @@ +# frozen_string_literal: true +require 'mkmf' + +$defs << "-DJSON_DEBUG" if ENV.fetch("JSON_DEBUG", "0") != "0" +have_func("rb_enc_interned_str", "ruby/encoding.h") # RUBY_VERSION >= 3.0 +have_func("rb_str_to_interned_str", "ruby.h") # RUBY_VERSION >= 3.0 +have_func("rb_hash_new_capa", "ruby.h") # RUBY_VERSION >= 3.2 +have_func("rb_hash_bulk_insert", "ruby.h") # Missing on TruffleRuby + +append_cflags("-std=c99") + +if enable_config('parser-use-simd', default=!ENV["JSON_DISABLE_SIMD"]) + load __dir__ + "/../simd/conf.rb" +end + +create_makefile 'json/ext/parser' diff --git a/ext/json/parser/parser.c b/ext/json/parser/parser.c new file mode 100644 index 0000000000..f1ea1b6abb --- /dev/null +++ b/ext/json/parser/parser.c @@ -0,0 +1,1672 @@ +#include "../json.h" +#include "../vendor/ryu.h" +#include "../simd/simd.h" + +static VALUE mJSON, eNestingError, Encoding_UTF_8; +static VALUE CNaN, CInfinity, CMinusInfinity; + +static ID i_new, i_try_convert, i_uminus, i_encode; + +static VALUE sym_max_nesting, sym_allow_nan, sym_allow_trailing_comma, sym_allow_control_characters, sym_symbolize_names, sym_freeze, + sym_decimal_class, sym_on_load, sym_allow_duplicate_key; + +static int binary_encindex; +static int utf8_encindex; + +#ifndef HAVE_RB_HASH_BULK_INSERT +// For TruffleRuby +static void +rb_hash_bulk_insert(long count, const VALUE *pairs, VALUE hash) +{ + long index = 0; + while (index < count) { + VALUE name = pairs[index++]; + VALUE value = pairs[index++]; + rb_hash_aset(hash, name, value); + } + RB_GC_GUARD(hash); +} +#endif + +#ifndef HAVE_RB_HASH_NEW_CAPA +#define rb_hash_new_capa(n) rb_hash_new() +#endif + +#ifndef HAVE_RB_STR_TO_INTERNED_STR +static VALUE rb_str_to_interned_str(VALUE str) +{ + return rb_funcall(rb_str_freeze(str), i_uminus, 0); +} +#endif + +/* name cache */ + +#include <string.h> +#include <ctype.h> + +// Object names are likely to be repeated, and are frozen. +// As such we can re-use them if we keep a cache of the ones we've seen so far, +// and save much more expensive lookups into the global fstring table. +// This cache implementation is deliberately simple, as we're optimizing for compactness, +// to be able to fit safely on the stack. +// As such, binary search into a sorted array gives a good tradeoff between compactness and +// performance. +#define JSON_RVALUE_CACHE_CAPA 63 +typedef struct rvalue_cache_struct { + int length; + VALUE entries[JSON_RVALUE_CACHE_CAPA]; +} rvalue_cache; + +static rb_encoding *enc_utf8; + +#define JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH 55 + +static inline VALUE build_interned_string(const char *str, const long length) +{ +# ifdef HAVE_RB_ENC_INTERNED_STR + return rb_enc_interned_str(str, length, enc_utf8); +# else + VALUE rstring = rb_utf8_str_new(str, length); + return rb_funcall(rb_str_freeze(rstring), i_uminus, 0); +# endif +} + +static inline VALUE build_symbol(const char *str, const long length) +{ + return rb_str_intern(build_interned_string(str, length)); +} + +static void rvalue_cache_insert_at(rvalue_cache *cache, int index, VALUE rstring) +{ + MEMMOVE(&cache->entries[index + 1], &cache->entries[index], VALUE, cache->length - index); + cache->length++; + cache->entries[index] = rstring; +} + +#define rstring_cache_memcmp memcmp + +#if JSON_CPU_LITTLE_ENDIAN_64BITS +#if __has_builtin(__builtin_bswap64) +#undef rstring_cache_memcmp +ALWAYS_INLINE(static) int rstring_cache_memcmp(const char *str, const char *rptr, const long length) +{ + // The libc memcmp has numerous complex optimizations, but in this particular case, + // we know the string is small (JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH), so being able to + // inline a simpler memcmp outperforms calling the libc version. + long i = 0; + + for (; i + 8 <= length; i += 8) { + uint64_t a, b; + memcpy(&a, str + i, 8); + memcpy(&b, rptr + i, 8); + if (a != b) { + a = __builtin_bswap64(a); + b = __builtin_bswap64(b); + return (a < b) ? -1 : 1; + } + } + + for (; i < length; i++) { + if (str[i] != rptr[i]) { + return (str[i] < rptr[i]) ? -1 : 1; + } + } + + return 0; +} +#endif +#endif + +ALWAYS_INLINE(static) int rstring_cache_cmp(const char *str, const long length, VALUE rstring) +{ + const char *rstring_ptr; + long rstring_length; + + RSTRING_GETMEM(rstring, rstring_ptr, rstring_length); + + if (length == rstring_length) { + return rstring_cache_memcmp(str, rstring_ptr, length); + } else { + return (int)(length - rstring_length); + } +} + +ALWAYS_INLINE(static) VALUE rstring_cache_fetch(rvalue_cache *cache, const char *str, const long length) +{ + int low = 0; + int high = cache->length - 1; + + while (low <= high) { + int mid = (high + low) >> 1; + VALUE entry = cache->entries[mid]; + int cmp = rstring_cache_cmp(str, length, entry); + + if (cmp == 0) { + return entry; + } else if (cmp > 0) { + low = mid + 1; + } else { + high = mid - 1; + } + } + + VALUE rstring = build_interned_string(str, length); + + if (cache->length < JSON_RVALUE_CACHE_CAPA) { + rvalue_cache_insert_at(cache, low, rstring); + } + return rstring; +} + +static VALUE rsymbol_cache_fetch(rvalue_cache *cache, const char *str, const long length) +{ + int low = 0; + int high = cache->length - 1; + + while (low <= high) { + int mid = (high + low) >> 1; + VALUE entry = cache->entries[mid]; + int cmp = rstring_cache_cmp(str, length, rb_sym2str(entry)); + + if (cmp == 0) { + return entry; + } else if (cmp > 0) { + low = mid + 1; + } else { + high = mid - 1; + } + } + + VALUE rsymbol = build_symbol(str, length); + + if (cache->length < JSON_RVALUE_CACHE_CAPA) { + rvalue_cache_insert_at(cache, low, rsymbol); + } + return rsymbol; +} + +/* rvalue stack */ + +#define RVALUE_STACK_INITIAL_CAPA 128 + +enum rvalue_stack_type { + RVALUE_STACK_HEAP_ALLOCATED = 0, + RVALUE_STACK_STACK_ALLOCATED = 1, +}; + +typedef struct rvalue_stack_struct { + enum rvalue_stack_type type; + long capa; + long head; + VALUE *ptr; +} rvalue_stack; + +static rvalue_stack *rvalue_stack_spill(rvalue_stack *old_stack, VALUE *handle, rvalue_stack **stack_ref); + +static rvalue_stack *rvalue_stack_grow(rvalue_stack *stack, VALUE *handle, rvalue_stack **stack_ref) +{ + long required = stack->capa * 2; + + if (stack->type == RVALUE_STACK_STACK_ALLOCATED) { + stack = rvalue_stack_spill(stack, handle, stack_ref); + } else { + REALLOC_N(stack->ptr, VALUE, required); + stack->capa = required; + } + return stack; +} + +static VALUE rvalue_stack_push(rvalue_stack *stack, VALUE value, VALUE *handle, rvalue_stack **stack_ref) +{ + if (RB_UNLIKELY(stack->head >= stack->capa)) { + stack = rvalue_stack_grow(stack, handle, stack_ref); + } + stack->ptr[stack->head] = value; + stack->head++; + return value; +} + +static inline VALUE *rvalue_stack_peek(rvalue_stack *stack, long count) +{ + return stack->ptr + (stack->head - count); +} + +static inline void rvalue_stack_pop(rvalue_stack *stack, long count) +{ + stack->head -= count; +} + +static void rvalue_stack_mark(void *ptr) +{ + rvalue_stack *stack = (rvalue_stack *)ptr; + long index; + for (index = 0; index < stack->head; index++) { + rb_gc_mark(stack->ptr[index]); + } +} + +static void rvalue_stack_free(void *ptr) +{ + rvalue_stack *stack = (rvalue_stack *)ptr; + if (stack) { + ruby_xfree(stack->ptr); + ruby_xfree(stack); + } +} + +static size_t rvalue_stack_memsize(const void *ptr) +{ + const rvalue_stack *stack = (const rvalue_stack *)ptr; + return sizeof(rvalue_stack) + sizeof(VALUE) * stack->capa; +} + +static const rb_data_type_t JSON_Parser_rvalue_stack_type = { + "JSON::Ext::Parser/rvalue_stack", + { + .dmark = rvalue_stack_mark, + .dfree = rvalue_stack_free, + .dsize = rvalue_stack_memsize, + }, + 0, 0, + RUBY_TYPED_FREE_IMMEDIATELY, +}; + +static rvalue_stack *rvalue_stack_spill(rvalue_stack *old_stack, VALUE *handle, rvalue_stack **stack_ref) +{ + rvalue_stack *stack; + *handle = TypedData_Make_Struct(0, rvalue_stack, &JSON_Parser_rvalue_stack_type, stack); + *stack_ref = stack; + MEMCPY(stack, old_stack, rvalue_stack, 1); + + stack->capa = old_stack->capa << 1; + stack->ptr = ALLOC_N(VALUE, stack->capa); + stack->type = RVALUE_STACK_HEAP_ALLOCATED; + MEMCPY(stack->ptr, old_stack->ptr, VALUE, old_stack->head); + return stack; +} + +static void rvalue_stack_eagerly_release(VALUE handle) +{ + if (handle) { + rvalue_stack *stack; + TypedData_Get_Struct(handle, rvalue_stack, &JSON_Parser_rvalue_stack_type, stack); + RTYPEDDATA_DATA(handle) = NULL; + rvalue_stack_free(stack); + } +} + +static int convert_UTF32_to_UTF8(char *buf, uint32_t ch) +{ + int len = 1; + if (ch <= 0x7F) { + buf[0] = (char) ch; + } else if (ch <= 0x07FF) { + buf[0] = (char) ((ch >> 6) | 0xC0); + buf[1] = (char) ((ch & 0x3F) | 0x80); + len++; + } else if (ch <= 0xFFFF) { + buf[0] = (char) ((ch >> 12) | 0xE0); + buf[1] = (char) (((ch >> 6) & 0x3F) | 0x80); + buf[2] = (char) ((ch & 0x3F) | 0x80); + len += 2; + } else if (ch <= 0x1fffff) { + buf[0] =(char) ((ch >> 18) | 0xF0); + buf[1] =(char) (((ch >> 12) & 0x3F) | 0x80); + buf[2] =(char) (((ch >> 6) & 0x3F) | 0x80); + buf[3] =(char) ((ch & 0x3F) | 0x80); + len += 3; + } else { + buf[0] = '?'; + } + return len; +} + +enum duplicate_key_action { + JSON_DEPRECATED = 0, + JSON_IGNORE, + JSON_RAISE, +}; + +typedef struct JSON_ParserStruct { + VALUE on_load_proc; + VALUE decimal_class; + ID decimal_method_id; + enum duplicate_key_action on_duplicate_key; + int max_nesting; + bool allow_nan; + bool allow_trailing_comma; + bool allow_control_characters; + bool symbolize_names; + bool freeze; +} JSON_ParserConfig; + +typedef struct JSON_ParserStateStruct { + VALUE stack_handle; + const char *start; + const char *cursor; + const char *end; + rvalue_stack *stack; + rvalue_cache name_cache; + int in_array; + int current_nesting; +} JSON_ParserState; + +static inline size_t rest(JSON_ParserState *state) { + return state->end - state->cursor; +} + +static inline bool eos(JSON_ParserState *state) { + return state->cursor >= state->end; +} + +static inline char peek(JSON_ParserState *state) +{ + if (RB_UNLIKELY(eos(state))) { + return 0; + } + return *state->cursor; +} + +static void cursor_position(JSON_ParserState *state, long *line_out, long *column_out) +{ + const char *cursor = state->cursor; + long column = 0; + long line = 1; + + while (cursor >= state->start) { + if (*cursor-- == '\n') { + break; + } + column++; + } + + while (cursor >= state->start) { + if (*cursor-- == '\n') { + line++; + } + } + *line_out = line; + *column_out = column; +} + +static void emit_parse_warning(const char *message, JSON_ParserState *state) +{ + long line, column; + cursor_position(state, &line, &column); + + VALUE warning = rb_sprintf("%s at line %ld column %ld", message, line, column); + rb_funcall(mJSON, rb_intern("deprecation_warning"), 1, warning); +} + +#define PARSE_ERROR_FRAGMENT_LEN 32 + +#ifdef RBIMPL_ATTR_NORETURN +RBIMPL_ATTR_NORETURN() +#endif +static void raise_parse_error(const char *format, JSON_ParserState *state) +{ + unsigned char buffer[PARSE_ERROR_FRAGMENT_LEN + 3]; + long line, column; + cursor_position(state, &line, &column); + + const char *ptr = "EOF"; + if (state->cursor && state->cursor < state->end) { + ptr = state->cursor; + size_t len = 0; + while (len < PARSE_ERROR_FRAGMENT_LEN) { + char ch = ptr[len]; + if (!ch || ch == '\n' || ch == ' ' || ch == '\t' || ch == '\r') { + break; + } + len++; + } + + if (len) { + buffer[0] = '\''; + MEMCPY(buffer + 1, ptr, char, len); + + while (buffer[len] >= 0x80 && buffer[len] < 0xC0) { // Is continuation byte + len--; + } + + if (buffer[len] >= 0xC0) { // multibyte character start + len--; + } + + buffer[len + 1] = '\''; + buffer[len + 2] = '\0'; + ptr = (const char *)buffer; + } + } + + VALUE msg = rb_sprintf(format, ptr); + VALUE message = rb_enc_sprintf(enc_utf8, "%s at line %ld column %ld", RSTRING_PTR(msg), line, column); + RB_GC_GUARD(msg); + + VALUE exc = rb_exc_new_str(rb_path2class("JSON::ParserError"), message); + rb_ivar_set(exc, rb_intern("@line"), LONG2NUM(line)); + rb_ivar_set(exc, rb_intern("@column"), LONG2NUM(column)); + rb_exc_raise(exc); +} + +#ifdef RBIMPL_ATTR_NORETURN +RBIMPL_ATTR_NORETURN() +#endif +static void raise_parse_error_at(const char *format, JSON_ParserState *state, const char *at) +{ + state->cursor = at; + raise_parse_error(format, state); +} + +/* unicode */ + +static const signed char digit_values[256] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, + -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1 +}; + +static uint32_t unescape_unicode(JSON_ParserState *state, const char *sp, const char *spe) +{ + if (RB_UNLIKELY(sp > spe - 4)) { + raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2); + } + + const unsigned char *p = (const unsigned char *)sp; + + const signed char b0 = digit_values[p[0]]; + const signed char b1 = digit_values[p[1]]; + const signed char b2 = digit_values[p[2]]; + const signed char b3 = digit_values[p[3]]; + + if (RB_UNLIKELY((signed char)(b0 | b1 | b2 | b3) < 0)) { + raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2); + } + + return ((uint32_t)b0 << 12) | ((uint32_t)b1 << 8) | ((uint32_t)b2 << 4) | (uint32_t)b3; +} + +#define GET_PARSER_CONFIG \ + JSON_ParserConfig *config; \ + TypedData_Get_Struct(self, JSON_ParserConfig, &JSON_ParserConfig_type, config) + +static const rb_data_type_t JSON_ParserConfig_type; + +static void +json_eat_comments(JSON_ParserState *state) +{ + const char *start = state->cursor; + state->cursor++; + + switch (peek(state)) { + case '/': { + state->cursor = memchr(state->cursor, '\n', state->end - state->cursor); + if (!state->cursor) { + state->cursor = state->end; + } else { + state->cursor++; + } + break; + } + case '*': { + state->cursor++; + + while (true) { + const char *next_match = memchr(state->cursor, '*', state->end - state->cursor); + if (!next_match) { + raise_parse_error_at("unterminated comment, expected closing '*/'", state, start); + } + + state->cursor = next_match + 1; + if (peek(state) == '/') { + state->cursor++; + break; + } + } + break; + } + default: + raise_parse_error_at("unexpected token %s", state, start); + break; + } +} + +ALWAYS_INLINE(static) void +json_eat_whitespace(JSON_ParserState *state) +{ + while (true) { + switch (peek(state)) { + case ' ': + state->cursor++; + break; + case '\n': + state->cursor++; + + // Heuristic: if we see a newline, there is likely consecutive spaces after it. +#if JSON_CPU_LITTLE_ENDIAN_64BITS + while (rest(state) > 8) { + uint64_t chunk; + memcpy(&chunk, state->cursor, sizeof(uint64_t)); + if (chunk == 0x2020202020202020) { + state->cursor += 8; + continue; + } + + uint32_t consecutive_spaces = trailing_zeros64(chunk ^ 0x2020202020202020) / CHAR_BIT; + state->cursor += consecutive_spaces; + break; + } +#endif + break; + case '\t': + case '\r': + state->cursor++; + break; + case '/': + json_eat_comments(state); + break; + + default: + return; + } + } +} + +static inline VALUE build_string(const char *start, const char *end, bool intern, bool symbolize) +{ + if (symbolize) { + intern = true; + } + VALUE result; +# ifdef HAVE_RB_ENC_INTERNED_STR + if (intern) { + result = rb_enc_interned_str(start, (long)(end - start), enc_utf8); + } else { + result = rb_utf8_str_new(start, (long)(end - start)); + } +# else + result = rb_utf8_str_new(start, (long)(end - start)); + if (intern) { + result = rb_funcall(rb_str_freeze(result), i_uminus, 0); + } +# endif + + if (symbolize) { + result = rb_str_intern(result); + } + + return result; +} + +static inline bool json_string_cacheable_p(const char *string, size_t length) +{ + // We mostly want to cache strings that are likely to be repeated. + // Simple heuristics: + // - Common names aren't likely to be very long. So we just don't cache names above an arbitrary threshold. + // - If the first character isn't a letter, we're much less likely to see this string again. + return length <= JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH && rb_isalpha(string[0]); +} + +static inline VALUE json_string_fastpath(JSON_ParserState *state, JSON_ParserConfig *config, const char *string, const char *stringEnd, bool is_name) +{ + bool intern = is_name || config->freeze; + bool symbolize = is_name && config->symbolize_names; + size_t bufferSize = stringEnd - string; + + if (is_name && state->in_array && RB_LIKELY(json_string_cacheable_p(string, bufferSize))) { + VALUE cached_key; + if (RB_UNLIKELY(symbolize)) { + cached_key = rsymbol_cache_fetch(&state->name_cache, string, bufferSize); + } else { + cached_key = rstring_cache_fetch(&state->name_cache, string, bufferSize); + } + + if (RB_LIKELY(cached_key)) { + return cached_key; + } + } + + return build_string(string, stringEnd, intern, symbolize); +} + +#define JSON_MAX_UNESCAPE_POSITIONS 16 +typedef struct _json_unescape_positions { + long size; + const char **positions; + unsigned long additional_backslashes; +} JSON_UnescapePositions; + +static inline const char *json_next_backslash(const char *pe, const char *stringEnd, JSON_UnescapePositions *positions) +{ + while (positions->size) { + positions->size--; + const char *next_position = positions->positions[0]; + positions->positions++; + if (next_position >= pe) { + return next_position; + } + } + + if (positions->additional_backslashes) { + positions->additional_backslashes--; + return memchr(pe, '\\', stringEnd - pe); + } + + return NULL; +} + +NOINLINE(static) VALUE json_string_unescape(JSON_ParserState *state, JSON_ParserConfig *config, const char *string, const char *stringEnd, bool is_name, JSON_UnescapePositions *positions) +{ + bool intern = is_name || config->freeze; + bool symbolize = is_name && config->symbolize_names; + size_t bufferSize = stringEnd - string; + const char *p = string, *pe = string, *bufferStart; + char *buffer; + + VALUE result = rb_str_buf_new(bufferSize); + rb_enc_associate_index(result, utf8_encindex); + buffer = RSTRING_PTR(result); + bufferStart = buffer; + +#define APPEND_CHAR(chr) *buffer++ = chr; p = ++pe; + + while (pe < stringEnd && (pe = json_next_backslash(pe, stringEnd, positions))) { + if (pe > p) { + MEMCPY(buffer, p, char, pe - p); + buffer += pe - p; + } + switch (*++pe) { + case '"': + case '/': + p = pe; // nothing to unescape just need to skip the backslash + break; + case '\\': + APPEND_CHAR('\\'); + break; + case 'n': + APPEND_CHAR('\n'); + break; + case 'r': + APPEND_CHAR('\r'); + break; + case 't': + APPEND_CHAR('\t'); + break; + case 'b': + APPEND_CHAR('\b'); + break; + case 'f': + APPEND_CHAR('\f'); + break; + case 'u': { + uint32_t ch = unescape_unicode(state, ++pe, stringEnd); + pe += 3; + /* To handle values above U+FFFF, we take a sequence of + * \uXXXX escapes in the U+D800..U+DBFF then + * U+DC00..U+DFFF ranges, take the low 10 bits from each + * to make a 20-bit number, then add 0x10000 to get the + * final codepoint. + * + * See Unicode 15: 3.8 "Surrogates", 5.3 "Handling + * Surrogate Pairs in UTF-16", and 23.6 "Surrogates + * Area". + */ + if ((ch & 0xFC00) == 0xD800) { + pe++; + if (RB_LIKELY((pe <= stringEnd - 6) && memcmp(pe, "\\u", 2) == 0)) { + uint32_t sur = unescape_unicode(state, pe + 2, stringEnd); + + if (RB_UNLIKELY((sur & 0xFC00) != 0xDC00)) { + raise_parse_error_at("invalid surrogate pair at %s", state, p); + } + + ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16) | (sur & 0x3FF)); + pe += 5; + } else { + raise_parse_error_at("incomplete surrogate pair at %s", state, p); + break; + } + } + + int unescape_len = convert_UTF32_to_UTF8(buffer, ch); + buffer += unescape_len; + p = ++pe; + break; + } + default: + if ((unsigned char)*pe < 0x20) { + if (!config->allow_control_characters) { + if (*pe == '\n') { + raise_parse_error_at("Invalid unescaped newline character (\\n) in string: %s", state, pe - 1); + } + raise_parse_error_at("invalid ASCII control character in string: %s", state, pe - 1); + } + } else { + raise_parse_error_at("invalid escape character in string: %s", state, pe - 1); + } + break; + } + } +#undef APPEND_CHAR + + if (stringEnd > p) { + MEMCPY(buffer, p, char, stringEnd - p); + buffer += stringEnd - p; + } + rb_str_set_len(result, buffer - bufferStart); + + if (symbolize) { + result = rb_str_intern(result); + } else if (intern) { + result = rb_str_to_interned_str(result); + } + + return result; +} + +#define MAX_FAST_INTEGER_SIZE 18 + +static VALUE json_decode_large_integer(const char *start, long len) +{ + VALUE buffer_v; + char *buffer = RB_ALLOCV_N(char, buffer_v, len + 1); + MEMCPY(buffer, start, char, len); + buffer[len] = '\0'; + VALUE number = rb_cstr2inum(buffer, 10); + RB_ALLOCV_END(buffer_v); + return number; +} + +static inline VALUE +json_decode_integer(uint64_t mantissa, int mantissa_digits, bool negative, const char *start, const char *end) +{ + if (RB_LIKELY(mantissa_digits < MAX_FAST_INTEGER_SIZE)) { + if (negative) { + return INT64T2NUM(-((int64_t)mantissa)); + } + return UINT64T2NUM(mantissa); + } + + return json_decode_large_integer(start, end - start); +} + +static VALUE json_decode_large_float(const char *start, long len) +{ + if (RB_LIKELY(len < 64)) { + char buffer[64]; + MEMCPY(buffer, start, char, len); + buffer[len] = '\0'; + return DBL2NUM(rb_cstr_to_dbl(buffer, 1)); + } + + VALUE buffer_v; + char *buffer = RB_ALLOCV_N(char, buffer_v, len + 1); + MEMCPY(buffer, start, char, len); + buffer[len] = '\0'; + VALUE number = DBL2NUM(rb_cstr_to_dbl(buffer, 1)); + RB_ALLOCV_END(buffer_v); + return number; +} + +/* Ruby JSON optimized float decoder using vendored Ryu algorithm + * Accepts pre-extracted mantissa and exponent from first-pass validation + */ +static inline VALUE json_decode_float(JSON_ParserConfig *config, uint64_t mantissa, int mantissa_digits, int32_t exponent, bool negative, + const char *start, const char *end) +{ + if (RB_UNLIKELY(config->decimal_class)) { + VALUE text = rb_str_new(start, end - start); + return rb_funcallv(config->decimal_class, config->decimal_method_id, 1, &text); + } + + // Fall back to rb_cstr_to_dbl for potential subnormals (rare edge case) + // Ryu has rounding issues with subnormals around 1e-310 (< 2.225e-308) + if (RB_UNLIKELY(mantissa_digits > 17 || mantissa_digits + exponent < -307)) { + return json_decode_large_float(start, end - start); + } + + return DBL2NUM(ryu_s2d_from_parts(mantissa, mantissa_digits, exponent, negative)); +} + +static inline VALUE json_decode_array(JSON_ParserState *state, JSON_ParserConfig *config, long count) +{ + VALUE array = rb_ary_new_from_values(count, rvalue_stack_peek(state->stack, count)); + rvalue_stack_pop(state->stack, count); + + if (config->freeze) { + RB_OBJ_FREEZE(array); + } + + return array; +} + +static VALUE json_find_duplicated_key(size_t count, const VALUE *pairs) +{ + VALUE set = rb_hash_new_capa(count / 2); + for (size_t index = 0; index < count; index += 2) { + size_t before = RHASH_SIZE(set); + VALUE key = pairs[index]; + rb_hash_aset(set, key, Qtrue); + if (RHASH_SIZE(set) == before) { + if (RB_SYMBOL_P(key)) { + return rb_sym2str(key); + } + return key; + } + } + return Qfalse; +} + +static void emit_duplicate_key_warning(JSON_ParserState *state, VALUE duplicate_key) +{ + VALUE message = rb_sprintf( + "detected duplicate key %"PRIsVALUE" in JSON object. This will raise an error in json 3.0 unless enabled via `allow_duplicate_key: true`", + rb_inspect(duplicate_key) + ); + + emit_parse_warning(RSTRING_PTR(message), state); + RB_GC_GUARD(message); +} + +#ifdef RBIMPL_ATTR_NORETURN +RBIMPL_ATTR_NORETURN() +#endif +static void raise_duplicate_key_error(JSON_ParserState *state, VALUE duplicate_key) +{ + VALUE message = rb_sprintf( + "duplicate key %"PRIsVALUE, + rb_inspect(duplicate_key) + ); + + raise_parse_error(RSTRING_PTR(message), state); + RB_GC_GUARD(message); +} + +static inline VALUE json_decode_object(JSON_ParserState *state, JSON_ParserConfig *config, size_t count) +{ + size_t entries_count = count / 2; + VALUE object = rb_hash_new_capa(entries_count); + const VALUE *pairs = rvalue_stack_peek(state->stack, count); + rb_hash_bulk_insert(count, pairs, object); + + if (RB_UNLIKELY(RHASH_SIZE(object) < entries_count)) { + switch (config->on_duplicate_key) { + case JSON_IGNORE: + break; + case JSON_DEPRECATED: + emit_duplicate_key_warning(state, json_find_duplicated_key(count, pairs)); + break; + case JSON_RAISE: + raise_duplicate_key_error(state, json_find_duplicated_key(count, pairs)); + break; + } + } + + rvalue_stack_pop(state->stack, count); + + if (config->freeze) { + RB_OBJ_FREEZE(object); + } + + return object; +} + +static inline VALUE json_push_value(JSON_ParserState *state, JSON_ParserConfig *config, VALUE value) +{ + if (RB_UNLIKELY(config->on_load_proc)) { + value = rb_proc_call_with_block(config->on_load_proc, 1, &value, Qnil); + } + rvalue_stack_push(state->stack, value, &state->stack_handle, &state->stack); + return value; +} + +static const bool string_scan_table[256] = { + // ASCII Control Characters + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + // ASCII Characters + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // '"' + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, // '\\' + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +#ifdef HAVE_SIMD +static SIMD_Implementation simd_impl = SIMD_NONE; +#endif /* HAVE_SIMD */ + +ALWAYS_INLINE(static) bool string_scan(JSON_ParserState *state) +{ +#ifdef HAVE_SIMD +#if defined(HAVE_SIMD_NEON) + + uint64_t mask = 0; + if (string_scan_simd_neon(&state->cursor, state->end, &mask)) { + state->cursor += trailing_zeros64(mask) >> 2; + return true; + } + +#elif defined(HAVE_SIMD_SSE2) + if (simd_impl == SIMD_SSE2) { + int mask = 0; + if (string_scan_simd_sse2(&state->cursor, state->end, &mask)) { + state->cursor += trailing_zeros(mask); + return true; + } + } +#endif /* HAVE_SIMD_NEON or HAVE_SIMD_SSE2 */ +#endif /* HAVE_SIMD */ + + while (!eos(state)) { + if (RB_UNLIKELY(string_scan_table[(unsigned char)*state->cursor])) { + return true; + } + state->cursor++; + } + return false; +} + +static VALUE json_parse_escaped_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name, const char *start) +{ + const char *backslashes[JSON_MAX_UNESCAPE_POSITIONS]; + JSON_UnescapePositions positions = { + .size = 0, + .positions = backslashes, + .additional_backslashes = 0, + }; + + do { + switch (*state->cursor) { + case '"': { + VALUE string = json_string_unescape(state, config, start, state->cursor, is_name, &positions); + state->cursor++; + return json_push_value(state, config, string); + } + case '\\': { + if (RB_LIKELY(positions.size < JSON_MAX_UNESCAPE_POSITIONS)) { + backslashes[positions.size] = state->cursor; + positions.size++; + } else { + positions.additional_backslashes++; + } + state->cursor++; + break; + } + default: + if (!config->allow_control_characters) { + raise_parse_error("invalid ASCII control character in string: %s", state); + } + break; + } + + state->cursor++; + } while (string_scan(state)); + + raise_parse_error("unexpected end of input, expected closing \"", state); + return Qfalse; +} + +ALWAYS_INLINE(static) VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name) +{ + state->cursor++; + const char *start = state->cursor; + + if (RB_UNLIKELY(!string_scan(state))) { + raise_parse_error("unexpected end of input, expected closing \"", state); + } + + if (RB_LIKELY(*state->cursor == '"')) { + VALUE string = json_string_fastpath(state, config, start, state->cursor, is_name); + state->cursor++; + return json_push_value(state, config, string); + } + return json_parse_escaped_string(state, config, is_name, start); +} + +#if JSON_CPU_LITTLE_ENDIAN_64BITS +// From: https://lemire.me/blog/2022/01/21/swar-explained-parsing-eight-digits/ +// Additional References: +// https://johnnylee-sde.github.io/Fast-numeric-string-to-int/ +// http://0x80.pl/notesen/2014-10-12-parsing-decimal-numbers-part-1-swar.html +static inline uint64_t decode_8digits_unrolled(uint64_t val) { + const uint64_t mask = 0x000000FF000000FF; + const uint64_t mul1 = 0x000F424000000064; // 100 + (1000000ULL << 32) + const uint64_t mul2 = 0x0000271000000001; // 1 + (10000ULL << 32) + val -= 0x3030303030303030; + val = (val * 10) + (val >> 8); // val = (val * 2561) >> 8; + val = (((val & mask) * mul1) + (((val >> 16) & mask) * mul2)) >> 32; + return val; +} + +static inline uint64_t decode_4digits_unrolled(uint32_t val) { + const uint32_t mask = 0x000000FF; + const uint32_t mul1 = 100; + val -= 0x30303030; + val = (val * 10) + (val >> 8); // val = (val * 2561) >> 8; + val = ((val & mask) * mul1) + (((val >> 16) & mask)); + return val; +} +#endif + +static inline int json_parse_digits(JSON_ParserState *state, uint64_t *accumulator) +{ + const char *start = state->cursor; + +#if JSON_CPU_LITTLE_ENDIAN_64BITS + while (rest(state) >= sizeof(uint64_t)) { + uint64_t next_8bytes; + memcpy(&next_8bytes, state->cursor, sizeof(uint64_t)); + + // From: https://github.com/simdjson/simdjson/blob/32b301893c13d058095a07d9868edaaa42ee07aa/include/simdjson/generic/numberparsing.h#L333 + // Branchless version of: http://0x80.pl/articles/swar-digits-validate.html + uint64_t match = (next_8bytes & 0xF0F0F0F0F0F0F0F0) | (((next_8bytes + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4); + + if (match == 0x3333333333333333) { // 8 consecutive digits + *accumulator = (*accumulator * 100000000) + decode_8digits_unrolled(next_8bytes); + state->cursor += 8; + continue; + } + + uint32_t consecutive_digits = trailing_zeros64(match ^ 0x3333333333333333) / CHAR_BIT; + + if (consecutive_digits >= 4) { + *accumulator = (*accumulator * 10000) + decode_4digits_unrolled((uint32_t)next_8bytes); + state->cursor += 4; + consecutive_digits -= 4; + } + + while (consecutive_digits) { + *accumulator = *accumulator * 10 + (*state->cursor - '0'); + consecutive_digits--; + state->cursor++; + } + + return (int)(state->cursor - start); + } +#endif + + char next_char; + while (rb_isdigit(next_char = peek(state))) { + *accumulator = *accumulator * 10 + (next_char - '0'); + state->cursor++; + } + return (int)(state->cursor - start); +} + +static inline VALUE json_parse_number(JSON_ParserState *state, JSON_ParserConfig *config, bool negative, const char *start) +{ + bool integer = true; + const char first_digit = *state->cursor; + + // Variables for Ryu optimization - extract digits during parsing + int32_t exponent = 0; + int decimal_point_pos = -1; + uint64_t mantissa = 0; + + // Parse integer part and extract mantissa digits + int mantissa_digits = json_parse_digits(state, &mantissa); + + if (RB_UNLIKELY((first_digit == '0' && mantissa_digits > 1) || (negative && mantissa_digits == 0))) { + raise_parse_error_at("invalid number: %s", state, start); + } + + // Parse fractional part + if (peek(state) == '.') { + integer = false; + decimal_point_pos = mantissa_digits; // Remember position of decimal point + state->cursor++; + + int fractional_digits = json_parse_digits(state, &mantissa); + mantissa_digits += fractional_digits; + + if (RB_UNLIKELY(!fractional_digits)) { + raise_parse_error_at("invalid number: %s", state, start); + } + } + + // Parse exponent + if (rb_tolower(peek(state)) == 'e') { + integer = false; + state->cursor++; + + bool negative_exponent = false; + const char next_char = peek(state); + if (next_char == '-' || next_char == '+') { + negative_exponent = next_char == '-'; + state->cursor++; + } + + uint64_t abs_exponent = 0; + int exponent_digits = json_parse_digits(state, &abs_exponent); + + if (RB_UNLIKELY(!exponent_digits)) { + raise_parse_error_at("invalid number: %s", state, start); + } + + exponent = negative_exponent ? -((int32_t)abs_exponent) : ((int32_t)abs_exponent); + } + + if (integer) { + return json_decode_integer(mantissa, mantissa_digits, negative, start, state->cursor); + } + + // Adjust exponent based on decimal point position + if (decimal_point_pos >= 0) { + exponent -= (mantissa_digits - decimal_point_pos); + } + + return json_decode_float(config, mantissa, mantissa_digits, exponent, negative, start, state->cursor); +} + +static inline VALUE json_parse_positive_number(JSON_ParserState *state, JSON_ParserConfig *config) +{ + return json_parse_number(state, config, false, state->cursor); +} + +static inline VALUE json_parse_negative_number(JSON_ParserState *state, JSON_ParserConfig *config) +{ + const char *start = state->cursor; + state->cursor++; + return json_parse_number(state, config, true, start); +} + +static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) +{ + json_eat_whitespace(state); + + switch (peek(state)) { + case 'n': + if (rest(state) >= 4 && (memcmp(state->cursor, "null", 4) == 0)) { + state->cursor += 4; + return json_push_value(state, config, Qnil); + } + + raise_parse_error("unexpected token %s", state); + break; + case 't': + if (rest(state) >= 4 && (memcmp(state->cursor, "true", 4) == 0)) { + state->cursor += 4; + return json_push_value(state, config, Qtrue); + } + + raise_parse_error("unexpected token %s", state); + break; + case 'f': + // Note: memcmp with a small power of two compile to an integer comparison + if (rest(state) >= 5 && (memcmp(state->cursor + 1, "alse", 4) == 0)) { + state->cursor += 5; + return json_push_value(state, config, Qfalse); + } + + raise_parse_error("unexpected token %s", state); + break; + case 'N': + // Note: memcmp with a small power of two compile to an integer comparison + if (config->allow_nan && rest(state) >= 3 && (memcmp(state->cursor + 1, "aN", 2) == 0)) { + state->cursor += 3; + return json_push_value(state, config, CNaN); + } + + raise_parse_error("unexpected token %s", state); + break; + case 'I': + if (config->allow_nan && rest(state) >= 8 && (memcmp(state->cursor, "Infinity", 8) == 0)) { + state->cursor += 8; + return json_push_value(state, config, CInfinity); + } + + raise_parse_error("unexpected token %s", state); + break; + case '-': { + // Note: memcmp with a small power of two compile to an integer comparison + if (rest(state) >= 9 && (memcmp(state->cursor + 1, "Infinity", 8) == 0)) { + if (config->allow_nan) { + state->cursor += 9; + return json_push_value(state, config, CMinusInfinity); + } else { + raise_parse_error("unexpected token %s", state); + } + } + return json_push_value(state, config, json_parse_negative_number(state, config)); + break; + } + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': + return json_push_value(state, config, json_parse_positive_number(state, config)); + break; + case '"': { + // %r{\A"[^"\\\t\n\x00]*(?:\\[bfnrtu\\/"][^"\\]*)*"} + return json_parse_string(state, config, false); + break; + } + case '[': { + state->cursor++; + json_eat_whitespace(state); + long stack_head = state->stack->head; + + if (peek(state) == ']') { + state->cursor++; + return json_push_value(state, config, json_decode_array(state, config, 0)); + } else { + state->current_nesting++; + if (RB_UNLIKELY(config->max_nesting && (config->max_nesting < state->current_nesting))) { + rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting); + } + state->in_array++; + json_parse_any(state, config); + } + + while (true) { + json_eat_whitespace(state); + + const char next_char = peek(state); + + if (RB_LIKELY(next_char == ',')) { + state->cursor++; + if (config->allow_trailing_comma) { + json_eat_whitespace(state); + if (peek(state) == ']') { + continue; + } + } + json_parse_any(state, config); + continue; + } + + if (next_char == ']') { + state->cursor++; + long count = state->stack->head - stack_head; + state->current_nesting--; + state->in_array--; + return json_push_value(state, config, json_decode_array(state, config, count)); + } + + raise_parse_error("expected ',' or ']' after array value", state); + } + break; + } + case '{': { + const char *object_start_cursor = state->cursor; + + state->cursor++; + json_eat_whitespace(state); + long stack_head = state->stack->head; + + if (peek(state) == '}') { + state->cursor++; + return json_push_value(state, config, json_decode_object(state, config, 0)); + } else { + state->current_nesting++; + if (RB_UNLIKELY(config->max_nesting && (config->max_nesting < state->current_nesting))) { + rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting); + } + + if (peek(state) != '"') { + raise_parse_error("expected object key, got %s", state); + } + json_parse_string(state, config, true); + + json_eat_whitespace(state); + if (peek(state) != ':') { + raise_parse_error("expected ':' after object key", state); + } + state->cursor++; + + json_parse_any(state, config); + } + + while (true) { + json_eat_whitespace(state); + + const char next_char = peek(state); + if (next_char == '}') { + state->cursor++; + state->current_nesting--; + size_t count = state->stack->head - stack_head; + + // Temporary rewind cursor in case an error is raised + const char *final_cursor = state->cursor; + state->cursor = object_start_cursor; + VALUE object = json_decode_object(state, config, count); + state->cursor = final_cursor; + + return json_push_value(state, config, object); + } + + if (next_char == ',') { + state->cursor++; + json_eat_whitespace(state); + + if (config->allow_trailing_comma) { + if (peek(state) == '}') { + continue; + } + } + + if (RB_UNLIKELY(peek(state) != '"')) { + raise_parse_error("expected object key, got: %s", state); + } + json_parse_string(state, config, true); + + json_eat_whitespace(state); + if (RB_UNLIKELY(peek(state) != ':')) { + raise_parse_error("expected ':' after object key, got: %s", state); + } + state->cursor++; + + json_parse_any(state, config); + + continue; + } + + raise_parse_error("expected ',' or '}' after object value, got: %s", state); + } + break; + } + + case 0: + raise_parse_error("unexpected end of input", state); + break; + + default: + raise_parse_error("unexpected character: %s", state); + break; + } + + raise_parse_error("unreachable: %s", state); + return Qundef; +} + +static void json_ensure_eof(JSON_ParserState *state) +{ + json_eat_whitespace(state); + if (!eos(state)) { + raise_parse_error("unexpected token at end of stream %s", state); + } +} + +/* + * Document-class: JSON::Ext::Parser + * + * This is the JSON parser implemented as a C extension. It can be configured + * to be used by setting + * + * JSON.parser = JSON::Ext::Parser + * + * with the method parser= in JSON. + * + */ + +static VALUE convert_encoding(VALUE source) +{ + int encindex = RB_ENCODING_GET(source); + + if (RB_LIKELY(encindex == utf8_encindex)) { + return source; + } + + if (encindex == binary_encindex) { + // For historical reason, we silently reinterpret binary strings as UTF-8 + return rb_enc_associate_index(rb_str_dup(source), utf8_encindex); + } + + return rb_funcall(source, i_encode, 1, Encoding_UTF_8); +} + +static int parser_config_init_i(VALUE key, VALUE val, VALUE data) +{ + JSON_ParserConfig *config = (JSON_ParserConfig *)data; + + if (key == sym_max_nesting) { config->max_nesting = RTEST(val) ? FIX2INT(val) : 0; } + else if (key == sym_allow_nan) { config->allow_nan = RTEST(val); } + else if (key == sym_allow_trailing_comma) { config->allow_trailing_comma = RTEST(val); } + else if (key == sym_allow_control_characters) { config->allow_control_characters = RTEST(val); } + else if (key == sym_symbolize_names) { config->symbolize_names = RTEST(val); } + else if (key == sym_freeze) { config->freeze = RTEST(val); } + else if (key == sym_on_load) { config->on_load_proc = RTEST(val) ? val : Qfalse; } + else if (key == sym_allow_duplicate_key) { config->on_duplicate_key = RTEST(val) ? JSON_IGNORE : JSON_RAISE; } + else if (key == sym_decimal_class) { + if (RTEST(val)) { + if (rb_respond_to(val, i_try_convert)) { + config->decimal_class = val; + config->decimal_method_id = i_try_convert; + } else if (rb_respond_to(val, i_new)) { + config->decimal_class = val; + config->decimal_method_id = i_new; + } else if (RB_TYPE_P(val, T_CLASS)) { + VALUE name = rb_class_name(val); + const char *name_cstr = RSTRING_PTR(name); + const char *last_colon = strrchr(name_cstr, ':'); + if (last_colon) { + const char *mod_path_end = last_colon - 1; + VALUE mod_path = rb_str_substr(name, 0, mod_path_end - name_cstr); + config->decimal_class = rb_path_to_class(mod_path); + + const char *method_name_beg = last_colon + 1; + long before_len = method_name_beg - name_cstr; + long len = RSTRING_LEN(name) - before_len; + VALUE method_name = rb_str_substr(name, before_len, len); + config->decimal_method_id = SYM2ID(rb_str_intern(method_name)); + } else { + config->decimal_class = rb_mKernel; + config->decimal_method_id = SYM2ID(rb_str_intern(name)); + } + } + } + } + + return ST_CONTINUE; +} + +static void parser_config_init(JSON_ParserConfig *config, VALUE opts) +{ + config->max_nesting = 100; + + if (!NIL_P(opts)) { + Check_Type(opts, T_HASH); + if (RHASH_SIZE(opts) > 0) { + // We assume in most cases few keys are set so it's faster to go over + // the provided keys than to check all possible keys. + rb_hash_foreach(opts, parser_config_init_i, (VALUE)config); + } + + } +} + +/* + * call-seq: new(opts => {}) + * + * Creates a new JSON::Ext::ParserConfig instance. + * + * It will be configured by the _opts_ hash. _opts_ can have the following + * keys: + * + * _opts_ can have the following keys: + * * *max_nesting*: The maximum depth of nesting allowed in the parsed data + * structures. Disable depth checking with :max_nesting => false|nil|0, it + * defaults to 100. + * * *allow_nan*: If set to true, allow NaN, Infinity and -Infinity in + * defiance of RFC 4627 to be parsed by the Parser. This option defaults to + * false. + * * *symbolize_names*: If set to true, returns symbols for the names + * (keys) in a JSON object. Otherwise strings are returned, which is + * also the default. It's not possible to use this option in + * conjunction with the *create_additions* option. + * * *decimal_class*: Specifies which class to use instead of the default + * (Float) when parsing decimal numbers. This class must accept a single + * string argument in its constructor. + */ +static VALUE cParserConfig_initialize(VALUE self, VALUE opts) +{ + rb_check_frozen(self); + GET_PARSER_CONFIG; + + parser_config_init(config, opts); + + RB_OBJ_WRITTEN(self, Qundef, config->decimal_class); + + return self; +} + +static VALUE cParser_parse(JSON_ParserConfig *config, VALUE Vsource) +{ + Vsource = convert_encoding(StringValue(Vsource)); + StringValue(Vsource); + + VALUE rvalue_stack_buffer[RVALUE_STACK_INITIAL_CAPA]; + rvalue_stack stack = { + .type = RVALUE_STACK_STACK_ALLOCATED, + .ptr = rvalue_stack_buffer, + .capa = RVALUE_STACK_INITIAL_CAPA, + }; + + long len; + const char *start; + RSTRING_GETMEM(Vsource, start, len); + + JSON_ParserState _state = { + .start = start, + .cursor = start, + .end = start + len, + .stack = &stack, + }; + JSON_ParserState *state = &_state; + + VALUE result = json_parse_any(state, config); + + // This may be skipped in case of exception, but + // it won't cause a leak. + rvalue_stack_eagerly_release(state->stack_handle); + + json_ensure_eof(state); + + return result; +} + +/* + * call-seq: parse(source) + * + * Parses the current JSON text _source_ and returns the complete data + * structure as a result. + * It raises JSON::ParserError if fail to parse. + */ +static VALUE cParserConfig_parse(VALUE self, VALUE Vsource) +{ + GET_PARSER_CONFIG; + return cParser_parse(config, Vsource); +} + +static VALUE cParser_m_parse(VALUE klass, VALUE Vsource, VALUE opts) +{ + Vsource = convert_encoding(StringValue(Vsource)); + StringValue(Vsource); + + JSON_ParserConfig _config = {0}; + JSON_ParserConfig *config = &_config; + parser_config_init(config, opts); + + return cParser_parse(config, Vsource); +} + +static void JSON_ParserConfig_mark(void *ptr) +{ + JSON_ParserConfig *config = ptr; + rb_gc_mark(config->on_load_proc); + rb_gc_mark(config->decimal_class); +} + +static void JSON_ParserConfig_free(void *ptr) +{ + JSON_ParserConfig *config = ptr; + ruby_xfree(config); +} + +static size_t JSON_ParserConfig_memsize(const void *ptr) +{ + return sizeof(JSON_ParserConfig); +} + +static const rb_data_type_t JSON_ParserConfig_type = { + "JSON::Ext::Parser/ParserConfig", + { + JSON_ParserConfig_mark, + JSON_ParserConfig_free, + JSON_ParserConfig_memsize, + }, + 0, 0, + RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FROZEN_SHAREABLE, +}; + +static VALUE cJSON_parser_s_allocate(VALUE klass) +{ + JSON_ParserConfig *config; + return TypedData_Make_Struct(klass, JSON_ParserConfig, &JSON_ParserConfig_type, config); +} + +void Init_parser(void) +{ +#ifdef HAVE_RB_EXT_RACTOR_SAFE + rb_ext_ractor_safe(true); +#endif + +#undef rb_intern + rb_require("json/common"); + mJSON = rb_define_module("JSON"); + VALUE mExt = rb_define_module_under(mJSON, "Ext"); + VALUE cParserConfig = rb_define_class_under(mExt, "ParserConfig", rb_cObject); + eNestingError = rb_path2class("JSON::NestingError"); + rb_gc_register_mark_object(eNestingError); + rb_define_alloc_func(cParserConfig, cJSON_parser_s_allocate); + rb_define_method(cParserConfig, "initialize", cParserConfig_initialize, 1); + rb_define_method(cParserConfig, "parse", cParserConfig_parse, 1); + + VALUE cParser = rb_define_class_under(mExt, "Parser", rb_cObject); + rb_define_singleton_method(cParser, "parse", cParser_m_parse, 2); + + CNaN = rb_const_get(mJSON, rb_intern("NaN")); + rb_gc_register_mark_object(CNaN); + + CInfinity = rb_const_get(mJSON, rb_intern("Infinity")); + rb_gc_register_mark_object(CInfinity); + + CMinusInfinity = rb_const_get(mJSON, rb_intern("MinusInfinity")); + rb_gc_register_mark_object(CMinusInfinity); + + rb_global_variable(&Encoding_UTF_8); + Encoding_UTF_8 = rb_const_get(rb_path2class("Encoding"), rb_intern("UTF_8")); + + sym_max_nesting = ID2SYM(rb_intern("max_nesting")); + sym_allow_nan = ID2SYM(rb_intern("allow_nan")); + sym_allow_trailing_comma = ID2SYM(rb_intern("allow_trailing_comma")); + sym_allow_control_characters = ID2SYM(rb_intern("allow_control_characters")); + sym_symbolize_names = ID2SYM(rb_intern("symbolize_names")); + sym_freeze = ID2SYM(rb_intern("freeze")); + sym_on_load = ID2SYM(rb_intern("on_load")); + sym_decimal_class = ID2SYM(rb_intern("decimal_class")); + sym_allow_duplicate_key = ID2SYM(rb_intern("allow_duplicate_key")); + + i_new = rb_intern("new"); + i_try_convert = rb_intern("try_convert"); + i_uminus = rb_intern("-@"); + i_encode = rb_intern("encode"); + + binary_encindex = rb_ascii8bit_encindex(); + utf8_encindex = rb_utf8_encindex(); + enc_utf8 = rb_utf8_encoding(); + +#ifdef HAVE_SIMD + simd_impl = find_simd_implementation(); +#endif +} |
