summaryrefslogtreecommitdiff
path: root/ext/json/parser
diff options
context:
space:
mode:
Diffstat (limited to 'ext/json/parser')
-rw-r--r--ext/json/parser/depend2
-rw-r--r--ext/json/parser/extconf.rb8
-rw-r--r--ext/json/parser/parser.c1832
3 files changed, 1200 insertions, 642 deletions
diff --git a/ext/json/parser/depend b/ext/json/parser/depend
index 1bb03d3517..d4737b1dfb 100644
--- a/ext/json/parser/depend
+++ b/ext/json/parser/depend
@@ -175,6 +175,8 @@ parser.o: $(hdrdir)/ruby/ruby.h
parser.o: $(hdrdir)/ruby/st.h
parser.o: $(hdrdir)/ruby/subst.h
parser.o: $(srcdir)/../fbuffer/fbuffer.h
+parser.o: $(srcdir)/../json.h
parser.o: $(srcdir)/../simd/simd.h
+parser.o: $(srcdir)/../vendor/ryu.h
parser.o: parser.c
# AUTOGENERATED DEPENDENCIES END
diff --git a/ext/json/parser/extconf.rb b/ext/json/parser/extconf.rb
index de5d5758b4..a9d740c755 100644
--- a/ext/json/parser/extconf.rb
+++ b/ext/json/parser/extconf.rb
@@ -1,10 +1,16 @@
# frozen_string_literal: true
require 'mkmf'
+$defs << "-DJSON_DEBUG" if ENV.fetch("JSON_DEBUG", "0") != "0"
have_func("rb_enc_interned_str", "ruby/encoding.h") # RUBY_VERSION >= 3.0
+have_func("rb_str_to_interned_str", "ruby.h") # RUBY_VERSION >= 3.0
have_func("rb_hash_new_capa", "ruby.h") # RUBY_VERSION >= 3.2
have_func("rb_hash_bulk_insert", "ruby.h") # Missing on TruffleRuby
-have_func("strnlen", "string.h") # Missing on Solaris 10
+have_func("ruby_xfree_sized", "ruby.h") # RUBY_VERSION >= 4.1
+
+if RUBY_ENGINE == "ruby"
+ have_const("RUBY_TYPED_EMBEDDABLE", "ruby.h") # RUBY_VERSION >= 3.3
+end
append_cflags("-std=c99")
diff --git a/ext/json/parser/parser.c b/ext/json/parser/parser.c
index 1e6ee753f0..c0631728c3 100644
--- a/ext/json/parser/parser.c
+++ b/ext/json/parser/parser.c
@@ -1,50 +1,22 @@
-#include "ruby.h"
-#include "ruby/encoding.h"
-
-/* shims */
-/* This is the fallback definition from Ruby 3.4 */
-
-#ifndef RBIMPL_STDBOOL_H
-#if defined(__cplusplus)
-# if defined(HAVE_STDBOOL_H) && (__cplusplus >= 201103L)
-# include <cstdbool>
-# endif
-#elif defined(HAVE_STDBOOL_H)
-# include <stdbool.h>
-#elif !defined(HAVE__BOOL)
-typedef unsigned char _Bool;
-# define bool _Bool
-# define true ((_Bool)+1)
-# define false ((_Bool)+0)
-# define __bool_true_false_are_defined
-#endif
-#endif
-
+#include "../json.h"
+#include "../vendor/ryu.h"
#include "../simd/simd.h"
-#ifndef RB_UNLIKELY
-#define RB_UNLIKELY(expr) expr
-#endif
-
-#ifndef RB_LIKELY
-#define RB_LIKELY(expr) expr
-#endif
-
static VALUE mJSON, eNestingError, Encoding_UTF_8;
static VALUE CNaN, CInfinity, CMinusInfinity;
-static ID i_chr, i_aset, i_aref,
- i_leftshift, i_new, i_try_convert, i_uminus, i_encode;
+static ID i_new, i_try_convert, i_uminus, i_encode;
-static VALUE sym_max_nesting, sym_allow_nan, sym_allow_trailing_comma, sym_symbolize_names, sym_freeze,
- sym_decimal_class, sym_on_load, sym_allow_duplicate_key;
+static VALUE sym_max_nesting, sym_allow_nan, sym_allow_trailing_comma, sym_allow_control_characters,
+ sym_allow_invalid_escape, sym_symbolize_names, sym_freeze, sym_decimal_class, sym_on_load,
+ sym_allow_duplicate_key;
static int binary_encindex;
static int utf8_encindex;
#ifndef HAVE_RB_HASH_BULK_INSERT
// For TruffleRuby
-void
+static void
rb_hash_bulk_insert(long count, const VALUE *pairs, VALUE hash)
{
long index = 0;
@@ -61,6 +33,12 @@ rb_hash_bulk_insert(long count, const VALUE *pairs, VALUE hash)
#define rb_hash_new_capa(n) rb_hash_new()
#endif
+#ifndef HAVE_RB_STR_TO_INTERNED_STR
+static VALUE rb_str_to_interned_str(VALUE str)
+{
+ return rb_funcall(rb_str_freeze(str), i_uminus, 0);
+}
+#endif
/* name cache */
@@ -106,116 +84,104 @@ static void rvalue_cache_insert_at(rvalue_cache *cache, int index, VALUE rstring
cache->entries[index] = rstring;
}
-static inline int rstring_cache_cmp(const char *str, const long length, VALUE rstring)
+#define rstring_cache_memcmp memcmp
+
+#if JSON_CPU_LITTLE_ENDIAN_64BITS
+#if __has_builtin(__builtin_bswap64)
+#undef rstring_cache_memcmp
+ALWAYS_INLINE(static) int rstring_cache_memcmp(const char *str, const char *rptr, const long length)
{
- long rstring_length = RSTRING_LEN(rstring);
- if (length == rstring_length) {
- return memcmp(str, RSTRING_PTR(rstring), length);
- } else {
- return (int)(length - rstring_length);
+ // The libc memcmp has numerous complex optimizations, but in this particular case,
+ // we know the string is small (JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH), so being able to
+ // inline a simpler memcmp outperforms calling the libc version.
+ long i = 0;
+
+ for (; i + 8 <= length; i += 8) {
+ uint64_t a, b;
+ memcpy(&a, str + i, 8);
+ memcpy(&b, rptr + i, 8);
+ if (a != b) {
+ a = __builtin_bswap64(a);
+ b = __builtin_bswap64(b);
+ return (a < b) ? -1 : 1;
+ }
}
+
+ for (; i < length; i++) {
+ if (str[i] != rptr[i]) {
+ return (str[i] < rptr[i]) ? -1 : 1;
+ }
+ }
+
+ return 0;
}
+#endif
+#endif
-static VALUE rstring_cache_fetch(rvalue_cache *cache, const char *str, const long length)
+ALWAYS_INLINE(static) int rstring_cache_cmp(const char *str, const long length, VALUE rstring)
{
- if (RB_UNLIKELY(length > JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH)) {
- // Common names aren't likely to be very long. So we just don't
- // cache names above an arbitrary threshold.
- return Qfalse;
- }
+ const char *rstring_ptr;
+ long rstring_length;
- if (RB_UNLIKELY(!isalpha((unsigned char)str[0]))) {
- // Simple heuristic, if the first character isn't a letter,
- // we're much less likely to see this string again.
- // We mostly want to cache strings that are likely to be repeated.
- return Qfalse;
+ RSTRING_GETMEM(rstring, rstring_ptr, rstring_length);
+
+ if (length == rstring_length) {
+ return rstring_cache_memcmp(str, rstring_ptr, length);
+ } else {
+ return (int)(length - rstring_length);
}
+}
+ALWAYS_INLINE(static) VALUE rstring_cache_fetch(rvalue_cache *cache, const char *str, const long length)
+{
int low = 0;
int high = cache->length - 1;
- int mid = 0;
- int last_cmp = 0;
while (low <= high) {
- mid = (high + low) >> 1;
+ int mid = (high + low) >> 1;
VALUE entry = cache->entries[mid];
- last_cmp = rstring_cache_cmp(str, length, entry);
+ int cmp = rstring_cache_cmp(str, length, entry);
- if (last_cmp == 0) {
+ if (cmp == 0) {
return entry;
- } else if (last_cmp > 0) {
+ } else if (cmp > 0) {
low = mid + 1;
} else {
high = mid - 1;
}
}
- if (RB_UNLIKELY(memchr(str, '\\', length))) {
- // We assume the overwhelming majority of names don't need to be escaped.
- // But if they do, we have to fallback to the slow path.
- return Qfalse;
- }
-
VALUE rstring = build_interned_string(str, length);
if (cache->length < JSON_RVALUE_CACHE_CAPA) {
- if (last_cmp > 0) {
- mid += 1;
- }
-
- rvalue_cache_insert_at(cache, mid, rstring);
+ rvalue_cache_insert_at(cache, low, rstring);
}
return rstring;
}
static VALUE rsymbol_cache_fetch(rvalue_cache *cache, const char *str, const long length)
{
- if (RB_UNLIKELY(length > JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH)) {
- // Common names aren't likely to be very long. So we just don't
- // cache names above an arbitrary threshold.
- return Qfalse;
- }
-
- if (RB_UNLIKELY(!isalpha((unsigned char)str[0]))) {
- // Simple heuristic, if the first character isn't a letter,
- // we're much less likely to see this string again.
- // We mostly want to cache strings that are likely to be repeated.
- return Qfalse;
- }
-
int low = 0;
int high = cache->length - 1;
- int mid = 0;
- int last_cmp = 0;
while (low <= high) {
- mid = (high + low) >> 1;
+ int mid = (high + low) >> 1;
VALUE entry = cache->entries[mid];
- last_cmp = rstring_cache_cmp(str, length, rb_sym2str(entry));
+ int cmp = rstring_cache_cmp(str, length, rb_sym2str(entry));
- if (last_cmp == 0) {
+ if (cmp == 0) {
return entry;
- } else if (last_cmp > 0) {
+ } else if (cmp > 0) {
low = mid + 1;
} else {
high = mid - 1;
}
}
- if (RB_UNLIKELY(memchr(str, '\\', length))) {
- // We assume the overwhelming majority of names don't need to be escaped.
- // But if they do, we have to fallback to the slow path.
- return Qfalse;
- }
-
VALUE rsymbol = build_symbol(str, length);
if (cache->length < JSON_RVALUE_CACHE_CAPA) {
- if (last_cmp > 0) {
- mid += 1;
- }
-
- rvalue_cache_insert_at(cache, mid, rsymbol);
+ rvalue_cache_insert_at(cache, low, rsymbol);
}
return rsymbol;
}
@@ -245,7 +211,7 @@ static rvalue_stack *rvalue_stack_grow(rvalue_stack *stack, VALUE *handle, rvalu
if (stack->type == RVALUE_STACK_STACK_ALLOCATED) {
stack = rvalue_stack_spill(stack, handle, stack_ref);
} else {
- REALLOC_N(stack->ptr, VALUE, required);
+ JSON_SIZED_REALLOC_N(stack->ptr, VALUE, required, stack->capa);
stack->capa = required;
}
return stack;
@@ -275,35 +241,62 @@ static void rvalue_stack_mark(void *ptr)
{
rvalue_stack *stack = (rvalue_stack *)ptr;
long index;
- for (index = 0; index < stack->head; index++) {
- rb_gc_mark(stack->ptr[index]);
+ if (stack && stack->ptr) {
+ for (index = 0; index < stack->head; index++) {
+ rb_gc_mark_movable(stack->ptr[index]);
+ }
}
}
+static void rvalue_stack_free_buffer(rvalue_stack *stack)
+{
+ JSON_SIZED_FREE_N(stack->ptr, stack->capa);
+ stack->ptr = NULL;
+}
+
static void rvalue_stack_free(void *ptr)
{
rvalue_stack *stack = (rvalue_stack *)ptr;
if (stack) {
- ruby_xfree(stack->ptr);
- ruby_xfree(stack);
+ rvalue_stack_free_buffer(stack);
+#ifndef HAVE_RUBY_TYPED_EMBEDDABLE
+ JSON_SIZED_FREE(stack);
+#endif
}
}
static size_t rvalue_stack_memsize(const void *ptr)
{
const rvalue_stack *stack = (const rvalue_stack *)ptr;
- return sizeof(rvalue_stack) + sizeof(VALUE) * stack->capa;
+ size_t memsize = sizeof(VALUE) * stack->capa;
+#ifndef HAVE_RUBY_TYPED_EMBEDDABLE
+ memsize += sizeof(rvalue_stack);
+#endif
+ return memsize;
+}
+
+static void rvalue_stack_compact(void *ptr)
+{
+ rvalue_stack *stack = (rvalue_stack *)ptr;
+ long index;
+ if (stack && stack->ptr) {
+ for (index = 0; index < stack->head; index++) {
+ stack->ptr[index] = rb_gc_location(stack->ptr[index]);
+ }
+ }
}
static const rb_data_type_t JSON_Parser_rvalue_stack_type = {
- "JSON::Ext::Parser/rvalue_stack",
- {
+ .wrap_struct_name = "JSON::Ext::Parser/rvalue_stack",
+ .function = {
.dmark = rvalue_stack_mark,
.dfree = rvalue_stack_free,
.dsize = rvalue_stack_memsize,
+ .dcompact = rvalue_stack_compact,
},
- 0, 0,
- RUBY_TYPED_FREE_IMMEDIATELY,
+ // We deliberately don't declare rvalue_stack as RUBY_TYPED_WB_PROTECTED
+ // because it churns a lot of values so trigering write barriers every time is very costly.
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_EMBEDDABLE,
};
static rvalue_stack *rvalue_stack_spill(rvalue_stack *old_stack, VALUE *handle, rvalue_stack **stack_ref)
@@ -325,19 +318,206 @@ static void rvalue_stack_eagerly_release(VALUE handle)
if (handle) {
rvalue_stack *stack;
TypedData_Get_Struct(handle, rvalue_stack, &JSON_Parser_rvalue_stack_type, stack);
- RTYPEDDATA_DATA(handle) = NULL;
+#ifdef HAVE_RUBY_TYPED_EMBEDDABLE
+ rvalue_stack_free_buffer(stack);
+#else
rvalue_stack_free(stack);
+ RTYPEDDATA_DATA(handle) = NULL;
+#endif
+ }
+}
+
+/* frame stack */
+
+// Iterative (non-recursive) parsing keeps an explicit stack of the containers
+// currently being built, instead of relying on the C call stack. Each frame
+// only needs enough bookkeeping to close its container: which kind it is, the
+// rvalue_stack position where its children start (so we know how many to pop),
+// and the cursor at its opening brace (used to rewind for duplicate key
+// errors). Frames hold no VALUEs, so this stack needs no GC marking; it reuses
+// the same stack-allocated-with-heap-spill strategy as the rvalue_stack so that
+// it's freed even if parsing raises.
+//
+// The lifecycle helpers below (grow/push/peek/pop/spill/free/eagerly_release
+// and the rb_data_type_t) deliberately mirror their rvalue_stack counterparts
+// -- the element type and the absence of a mark function are the only real
+// differences. Keep the two in sync: a fix to the spill/release or
+// HAVE_RUBY_TYPED_EMBEDDABLE handling in one almost certainly belongs in the
+// other.
+#define JSON_FRAME_STACK_INITIAL_CAPA 32
+
+enum json_frame_type {
+ JSON_FRAME_ROOT, // == JSON_PHASE_DONE
+ JSON_FRAME_ARRAY, // == JSON_PHASE_ARRAY_COMMA
+ JSON_FRAME_OBJECT, // = JSON_PHASE_OBJECT_COMMA
+};
+
+// Where a frame is within its container's grammar. This is the entirety of the
+// parser's "what to do next" state: json_parse_any dispatches on the top
+// frame's phase and holds no resume state in C locals, so a parse can stop at
+// any value boundary and be resumed purely from the (persistable) frame stack.
+//
+// The first three phases are deliberately equal to the corresponding json_frame_type
+// to simplify the transition of phase in json_value_completed.
+enum json_frame_phase {
+ JSON_PHASE_DONE = JSON_FRAME_ROOT, // root only: the document value has been parsed
+ JSON_PHASE_ARRAY_COMMA = JSON_FRAME_ARRAY, // after a value: expecting ',' or the closing ']'
+ JSON_PHASE_OBJECT_COMMA = JSON_FRAME_OBJECT, // after a value: expecting ',' or the closing '}'
+ JSON_PHASE_VALUE, // expecting a value (document root, array element, or object value after ':')
+ JSON_PHASE_OBJECT_KEY, // expecting a '"' key (after '{' or ',')
+ JSON_PHASE_OBJECT_COLON, // object only: after a key, expecting ':'
+};
+
+typedef struct json_frame_struct {
+ enum json_frame_type type;
+ enum json_frame_phase phase;
+ long value_stack_head; // rvalue_stack->head when this container opened
+ const char *start_cursor; // object frames only (the '{'); NULL otherwise
+} json_frame;
+
+typedef struct json_frame_stack_struct {
+ enum rvalue_stack_type type; // shared with rvalue_stack: is ptr stack- or heap-allocated
+ long capa;
+ long head;
+ json_frame *ptr;
+} json_frame_stack;
+
+enum duplicate_key_action {
+ JSON_DEPRECATED = 0,
+ JSON_IGNORE,
+ JSON_RAISE,
+};
+
+typedef struct JSON_ParserStruct {
+ VALUE on_load_proc;
+ VALUE decimal_class;
+ ID decimal_method_id;
+ enum duplicate_key_action on_duplicate_key;
+ int max_nesting;
+ bool allow_nan;
+ bool allow_trailing_comma;
+ bool allow_control_characters;
+ bool allow_invalid_escape;
+ bool symbolize_names;
+ bool freeze;
+} JSON_ParserConfig;
+
+typedef struct JSON_ParserStateStruct {
+ VALUE *value_stack_handle;
+ VALUE *frame_stack_handle;
+ const char *start;
+ const char *cursor;
+ const char *end;
+ rvalue_stack *value_stack;
+ json_frame_stack *frames;
+ rvalue_cache name_cache;
+ int in_array;
+ int current_nesting;
+ unsigned int emitted_deprecations;
+} JSON_ParserState;
+
+static json_frame_stack *json_frame_stack_spill(json_frame_stack *old_stack, VALUE *handle, json_frame_stack **stack_ref);
+
+static json_frame_stack *json_frame_stack_grow(json_frame_stack *stack, VALUE *handle, json_frame_stack **stack_ref)
+{
+ long required = stack->capa * 2;
+
+ if (stack->type == RVALUE_STACK_STACK_ALLOCATED) {
+ stack = json_frame_stack_spill(stack, handle, stack_ref);
+ } else {
+ JSON_SIZED_REALLOC_N(stack->ptr, json_frame, required, stack->capa);
+ stack->capa = required;
}
+ return stack;
}
+static json_frame *json_frame_stack_push(JSON_ParserState *state, json_frame frame)
+{
+ json_frame_stack *stack = state->frames;
+ if (RB_UNLIKELY(stack->head >= stack->capa)) {
+ stack = json_frame_stack_grow(stack, state->frame_stack_handle, &state->frames);
+ }
+
+ json_frame *frame_ptr = &stack->ptr[stack->head++];
+ *frame_ptr = frame;
+ return frame_ptr;
+}
-#ifndef HAVE_STRNLEN
-static size_t strnlen(const char *s, size_t maxlen)
+static inline json_frame *json_frame_stack_peek(json_frame_stack *stack)
{
- char *p;
- return ((p = memchr(s, '\0', maxlen)) ? p - s : maxlen);
+ return &stack->ptr[stack->head - 1];
}
+
+static inline void json_frame_stack_pop(json_frame_stack *stack)
+{
+ stack->head--;
+}
+
+static void json_frame_stack_free_buffer(json_frame_stack *stack)
+{
+ JSON_SIZED_FREE_N(stack->ptr, stack->capa);
+ stack->ptr = NULL;
+}
+
+static void json_frame_stack_free(void *ptr)
+{
+ json_frame_stack *stack = (json_frame_stack *)ptr;
+ if (stack) {
+ json_frame_stack_free_buffer(stack);
+#ifndef HAVE_RUBY_TYPED_EMBEDDABLE
+ JSON_SIZED_FREE(stack);
#endif
+ }
+}
+
+static size_t json_frame_stack_memsize(const void *ptr)
+{
+ const json_frame_stack *stack = (const json_frame_stack *)ptr;
+
+ size_t memsize = sizeof(json_frame) * stack->capa;
+#ifndef HAVE_RUBY_TYPED_EMBEDDABLE
+ memsize += sizeof(json_frame_stack);
+#endif
+ return memsize;
+}
+
+static const rb_data_type_t JSON_Parser_frame_stack_type = {
+ .wrap_struct_name = "JSON::Ext::Parser/frame_stack",
+ .function = {
+ .dmark = NULL,
+ .dfree = json_frame_stack_free,
+ .dsize = json_frame_stack_memsize,
+ },
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_EMBEDDABLE,
+};
+
+static json_frame_stack *json_frame_stack_spill(json_frame_stack *old_stack, VALUE *handle, json_frame_stack **stack_ref)
+{
+ json_frame_stack *stack;
+ *handle = TypedData_Make_Struct(0, json_frame_stack, &JSON_Parser_frame_stack_type, stack);
+ *stack_ref = stack;
+ MEMCPY(stack, old_stack, json_frame_stack, 1);
+
+ stack->capa = old_stack->capa << 1;
+ stack->ptr = ALLOC_N(json_frame, stack->capa);
+ stack->type = RVALUE_STACK_HEAP_ALLOCATED;
+ MEMCPY(stack->ptr, old_stack->ptr, json_frame, old_stack->head);
+ return stack;
+}
+
+static void json_frame_stack_eagerly_release(VALUE handle)
+{
+ if (handle) {
+ json_frame_stack *stack;
+ TypedData_Get_Struct(handle, json_frame_stack, &JSON_Parser_frame_stack_type, stack);
+#ifdef HAVE_RUBY_TYPED_EMBEDDABLE
+ json_frame_stack_free_buffer(stack);
+#else
+ json_frame_stack_free(stack);
+ RTYPEDDATA_DATA(handle) = NULL;
+#endif
+ }
+}
static int convert_UTF32_to_UTF8(char *buf, uint32_t ch)
{
@@ -365,38 +545,31 @@ static int convert_UTF32_to_UTF8(char *buf, uint32_t ch)
return len;
}
-enum duplicate_key_action {
- JSON_DEPRECATED = 0,
- JSON_IGNORE,
- JSON_RAISE,
-};
+static inline size_t rest(JSON_ParserState *state) {
+ return state->end - state->cursor;
+}
-typedef struct JSON_ParserStruct {
- VALUE on_load_proc;
- VALUE decimal_class;
- ID decimal_method_id;
- enum duplicate_key_action on_duplicate_key;
- int max_nesting;
- bool allow_nan;
- bool allow_trailing_comma;
- bool parsing_name;
- bool symbolize_names;
- bool freeze;
-} JSON_ParserConfig;
+static inline bool eos(JSON_ParserState *state) {
+ return state->cursor >= state->end;
+}
-typedef struct JSON_ParserStateStruct {
- VALUE stack_handle;
- const char *start;
- const char *cursor;
- const char *end;
- rvalue_stack *stack;
- rvalue_cache name_cache;
- int in_array;
- int current_nesting;
-} JSON_ParserState;
+static inline char peek(JSON_ParserState *state)
+{
+ if (RB_UNLIKELY(eos(state))) {
+ return 0;
+ }
+ return *state->cursor;
+}
static void cursor_position(JSON_ParserState *state, long *line_out, long *column_out)
{
+ JSON_ASSERT(state->cursor <= state->end);
+
+ // Redundant but helpful for hardening
+ if (RB_UNLIKELY(state->cursor > state->end)) {
+ state->cursor = state->end;
+ }
+
const char *cursor = state->cursor;
long column = 0;
long line = 1;
@@ -428,14 +601,9 @@ static void emit_parse_warning(const char *message, JSON_ParserState *state)
#define PARSE_ERROR_FRAGMENT_LEN 32
-#ifdef RBIMPL_ATTR_NORETURN
-RBIMPL_ATTR_NORETURN()
-#endif
-static void raise_parse_error(const char *format, JSON_ParserState *state)
+static VALUE build_parse_error_message(const char *format, JSON_ParserState *state, long line, long column)
{
unsigned char buffer[PARSE_ERROR_FRAGMENT_LEN + 3];
- long line, column;
- cursor_position(state, &line, &column);
const char *ptr = "EOF";
if (state->cursor && state->cursor < state->end) {
@@ -467,20 +635,28 @@ static void raise_parse_error(const char *format, JSON_ParserState *state)
}
}
- VALUE msg = rb_sprintf(format, ptr);
- VALUE message = rb_enc_sprintf(enc_utf8, "%s at line %ld column %ld", RSTRING_PTR(msg), line, column);
- RB_GC_GUARD(msg);
+ VALUE message = rb_enc_sprintf(enc_utf8, format, ptr);
+ rb_str_catf(message, " at line %ld column %ld", line, column);
+ return message;
+}
+static VALUE parse_error_new(VALUE message, long line, long column)
+{
VALUE exc = rb_exc_new_str(rb_path2class("JSON::ParserError"), message);
rb_ivar_set(exc, rb_intern("@line"), LONG2NUM(line));
rb_ivar_set(exc, rb_intern("@column"), LONG2NUM(column));
- rb_exc_raise(exc);
+ return exc;
}
-#ifdef RBIMPL_ATTR_NORETURN
-RBIMPL_ATTR_NORETURN()
-#endif
-static void raise_parse_error_at(const char *format, JSON_ParserState *state, const char *at)
+NORETURN(static) void raise_parse_error(const char *format, JSON_ParserState *state)
+{
+ long line, column;
+ cursor_position(state, &line, &column);
+ VALUE message = build_parse_error_message(format, state, line, column);
+ rb_exc_raise(parse_error_new(message, line, column));
+}
+
+NORETURN(static) void raise_parse_error_at(const char *format, JSON_ParserState *state, const char *at)
{
state->cursor = at;
raise_parse_error(format, state);
@@ -505,23 +681,24 @@ static const signed char digit_values[256] = {
-1, -1, -1, -1, -1, -1, -1
};
-static uint32_t unescape_unicode(JSON_ParserState *state, const unsigned char *p)
-{
- signed char b;
- uint32_t result = 0;
- b = digit_values[p[0]];
- if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
- result = (result << 4) | (unsigned char)b;
- b = digit_values[p[1]];
- if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
- result = (result << 4) | (unsigned char)b;
- b = digit_values[p[2]];
- if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
- result = (result << 4) | (unsigned char)b;
- b = digit_values[p[3]];
- if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
- result = (result << 4) | (unsigned char)b;
- return result;
+static uint32_t unescape_unicode(JSON_ParserState *state, const char *sp, const char *spe)
+{
+ if (RB_UNLIKELY(sp > spe - 4)) {
+ raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2);
+ }
+
+ const unsigned char *p = (const unsigned char *)sp;
+
+ const signed char b0 = digit_values[p[0]];
+ const signed char b1 = digit_values[p[1]];
+ const signed char b2 = digit_values[p[2]];
+ const signed char b3 = digit_values[p[3]];
+
+ if (RB_UNLIKELY((signed char)(b0 | b1 | b2 | b3) < 0)) {
+ raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2);
+ }
+
+ return ((uint32_t)b0 << 12) | ((uint32_t)b1 << 8) | ((uint32_t)b2 << 4) | (uint32_t)b3;
}
#define GET_PARSER_CONFIG \
@@ -530,61 +707,82 @@ static uint32_t unescape_unicode(JSON_ParserState *state, const unsigned char *p
static const rb_data_type_t JSON_ParserConfig_type;
-static const bool whitespace[256] = {
- [' '] = 1,
- ['\t'] = 1,
- ['\n'] = 1,
- ['\r'] = 1,
- ['/'] = 1,
-};
-
-static void
+NOINLINE(static) void
json_eat_comments(JSON_ParserState *state)
{
- if (state->cursor + 1 < state->end) {
- switch (state->cursor[1]) {
- case '/': {
- state->cursor = memchr(state->cursor, '\n', state->end - state->cursor);
- if (!state->cursor) {
- state->cursor = state->end;
- } else {
- state->cursor++;
- }
- break;
+ const char *start = state->cursor;
+ state->cursor++;
+
+ switch (peek(state)) {
+ case '/': {
+ state->cursor = memchr(state->cursor, '\n', state->end - state->cursor);
+ if (!state->cursor) {
+ state->cursor = state->end;
+ } else {
+ state->cursor++;
}
- case '*': {
- state->cursor += 2;
- while (true) {
- state->cursor = memchr(state->cursor, '*', state->end - state->cursor);
- if (!state->cursor) {
- raise_parse_error_at("unexpected end of input, expected closing '*/'", state, state->end);
- } else {
- state->cursor++;
- if (state->cursor < state->end && *state->cursor == '/') {
- state->cursor++;
- break;
- }
- }
+ break;
+ }
+ case '*': {
+ state->cursor++;
+
+ while (true) {
+ const char *next_match = memchr(state->cursor, '*', state->end - state->cursor);
+ if (!next_match) {
+ raise_parse_error_at("unterminated comment, expected closing '*/'", state, start);
+ }
+
+ state->cursor = next_match + 1;
+ if (peek(state) == '/') {
+ state->cursor++;
+ break;
}
- break;
}
- default:
- raise_parse_error("unexpected token %s", state);
- break;
+ break;
}
- } else {
- raise_parse_error("unexpected token %s", state);
+ default:
+ raise_parse_error_at("unexpected token %s", state, start);
+ break;
}
}
-static inline void
+ALWAYS_INLINE(static) void
json_eat_whitespace(JSON_ParserState *state)
{
- while (state->cursor < state->end && RB_UNLIKELY(whitespace[(unsigned char)*state->cursor])) {
- if (RB_LIKELY(*state->cursor != '/')) {
- state->cursor++;
- } else {
- json_eat_comments(state);
+ while (true) {
+ switch (peek(state)) {
+ case ' ':
+ state->cursor++;
+ break;
+ case '\n':
+ state->cursor++;
+
+ // Heuristic: if we see a newline, there is likely consecutive spaces after it.
+#if JSON_CPU_LITTLE_ENDIAN_64BITS
+ while (rest(state) > 8) {
+ uint64_t chunk;
+ memcpy(&chunk, state->cursor, sizeof(uint64_t));
+ if (chunk == 0x2020202020202020) {
+ state->cursor += 8;
+ continue;
+ }
+
+ uint32_t consecutive_spaces = trailing_zeros64(chunk ^ 0x2020202020202020) / CHAR_BIT;
+ state->cursor += consecutive_spaces;
+ break;
+ }
+#endif
+ break;
+ case '\t':
+ case '\r':
+ state->cursor++;
+ break;
+ case '/':
+ json_eat_comments(state);
+ break;
+
+ default:
+ return;
}
}
}
@@ -615,11 +813,22 @@ static inline VALUE build_string(const char *start, const char *end, bool intern
return result;
}
-static inline VALUE json_string_fastpath(JSON_ParserState *state, const char *string, const char *stringEnd, bool is_name, bool intern, bool symbolize)
+static inline bool json_string_cacheable_p(const char *string, size_t length)
{
+ // We mostly want to cache strings that are likely to be repeated.
+ // Simple heuristics:
+ // - Common names aren't likely to be very long. So we just don't cache names above an arbitrary threshold.
+ // - If the first character isn't a letter, we're much less likely to see this string again.
+ return length <= JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH && rb_isalpha(string[0]);
+}
+
+static inline VALUE json_string_fastpath(JSON_ParserState *state, JSON_ParserConfig *config, const char *string, const char *stringEnd, bool is_name)
+{
+ bool intern = is_name || config->freeze;
+ bool symbolize = is_name && config->symbolize_names;
size_t bufferSize = stringEnd - string;
- if (is_name && state->in_array) {
+ if (is_name && state->in_array && RB_LIKELY(json_string_cacheable_p(string, bufferSize))) {
VALUE cached_key;
if (RB_UNLIKELY(symbolize)) {
cached_key = rsymbol_cache_fetch(&state->name_cache, string, bufferSize);
@@ -635,104 +844,129 @@ static inline VALUE json_string_fastpath(JSON_ParserState *state, const char *st
return build_string(string, stringEnd, intern, symbolize);
}
-static VALUE json_string_unescape(JSON_ParserState *state, const char *string, const char *stringEnd, bool is_name, bool intern, bool symbolize)
-{
- size_t bufferSize = stringEnd - string;
- const char *p = string, *pe = string, *unescape, *bufferStart;
- char *buffer;
- int unescape_len;
- char buf[4];
+#define JSON_MAX_UNESCAPE_POSITIONS 16
+typedef struct _json_unescape_positions {
+ long size;
+ const char **positions;
+ unsigned long additional_backslashes;
+} JSON_UnescapePositions;
- if (is_name && state->in_array) {
- VALUE cached_key;
- if (RB_UNLIKELY(symbolize)) {
- cached_key = rsymbol_cache_fetch(&state->name_cache, string, bufferSize);
- } else {
- cached_key = rstring_cache_fetch(&state->name_cache, string, bufferSize);
+static inline const char *json_next_backslash(const char *pe, const char *stringEnd, JSON_UnescapePositions *positions)
+{
+ while (positions->size) {
+ positions->size--;
+ const char *next_position = positions->positions[0];
+ positions->positions++;
+ if (next_position >= pe) {
+ return next_position;
}
+ }
- if (RB_LIKELY(cached_key)) {
- return cached_key;
- }
+ if (positions->additional_backslashes) {
+ positions->additional_backslashes--;
+ return memchr(pe, '\\', stringEnd - pe);
}
+ return NULL;
+}
+
+NOINLINE(static) VALUE json_string_unescape(JSON_ParserState *state, JSON_ParserConfig *config, const char *string, const char *stringEnd, bool is_name, JSON_UnescapePositions *positions)
+{
+ bool intern = is_name || config->freeze;
+ bool symbolize = is_name && config->symbolize_names;
+ size_t bufferSize = stringEnd - string;
+ const char *p = string, *pe = string, *bufferStart;
+ char *buffer;
+
VALUE result = rb_str_buf_new(bufferSize);
rb_enc_associate_index(result, utf8_encindex);
buffer = RSTRING_PTR(result);
bufferStart = buffer;
- while (pe < stringEnd && (pe = memchr(pe, '\\', stringEnd - pe))) {
- unescape = (char *) "?";
- unescape_len = 1;
+#define APPEND_CHAR(chr) *buffer++ = chr; p = ++pe;
+
+ while (pe < stringEnd && (pe = json_next_backslash(pe, stringEnd, positions))) {
if (pe > p) {
MEMCPY(buffer, p, char, pe - p);
buffer += pe - p;
}
switch (*++pe) {
+ case '"':
+ case '/':
+ p = pe; // nothing to unescape just need to skip the backslash
+ break;
+ case '\\':
+ APPEND_CHAR('\\');
+ break;
case 'n':
- unescape = (char *) "\n";
+ APPEND_CHAR('\n');
break;
case 'r':
- unescape = (char *) "\r";
+ APPEND_CHAR('\r');
break;
case 't':
- unescape = (char *) "\t";
- break;
- case '"':
- unescape = (char *) "\"";
- break;
- case '\\':
- unescape = (char *) "\\";
+ APPEND_CHAR('\t');
break;
case 'b':
- unescape = (char *) "\b";
+ APPEND_CHAR('\b');
break;
case 'f':
- unescape = (char *) "\f";
+ APPEND_CHAR('\f');
break;
- case 'u':
- if (pe > stringEnd - 5) {
- raise_parse_error_at("incomplete unicode character escape sequence at %s", state, p);
- } else {
- uint32_t ch = unescape_unicode(state, (unsigned char *) ++pe);
- pe += 3;
- /* To handle values above U+FFFF, we take a sequence of
- * \uXXXX escapes in the U+D800..U+DBFF then
- * U+DC00..U+DFFF ranges, take the low 10 bits from each
- * to make a 20-bit number, then add 0x10000 to get the
- * final codepoint.
- *
- * See Unicode 15: 3.8 "Surrogates", 5.3 "Handling
- * Surrogate Pairs in UTF-16", and 23.6 "Surrogates
- * Area".
- */
- if ((ch & 0xFC00) == 0xD800) {
- pe++;
- if (pe > stringEnd - 6) {
- raise_parse_error_at("incomplete surrogate pair at %s", state, p);
- }
- if (pe[0] == '\\' && pe[1] == 'u') {
- uint32_t sur = unescape_unicode(state, (unsigned char *) pe + 2);
- ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16)
- | (sur & 0x3FF));
- pe += 5;
- } else {
- unescape = (char *) "?";
- break;
+ case 'u': {
+ uint32_t ch = unescape_unicode(state, ++pe, stringEnd);
+ pe += 3;
+ /* To handle values above U+FFFF, we take a sequence of
+ * \uXXXX escapes in the U+D800..U+DBFF then
+ * U+DC00..U+DFFF ranges, take the low 10 bits from each
+ * to make a 20-bit number, then add 0x10000 to get the
+ * final codepoint.
+ *
+ * See Unicode 15: 3.8 "Surrogates", 5.3 "Handling
+ * Surrogate Pairs in UTF-16", and 23.6 "Surrogates
+ * Area".
+ */
+ if ((ch & 0xFC00) == 0xD800) {
+ pe++;
+ if (RB_LIKELY((pe <= stringEnd - 6) && memcmp(pe, "\\u", 2) == 0)) {
+ uint32_t sur = unescape_unicode(state, pe + 2, stringEnd);
+
+ if (RB_UNLIKELY((sur & 0xFC00) != 0xDC00)) {
+ raise_parse_error_at("invalid surrogate pair at %s", state, p);
}
+
+ ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16) | (sur & 0x3FF));
+ pe += 5;
+ } else {
+ raise_parse_error_at("incomplete surrogate pair at %s", state, p);
+ break;
}
- unescape_len = convert_UTF32_to_UTF8(buf, ch);
- unescape = buf;
}
+
+ int unescape_len = convert_UTF32_to_UTF8(buffer, ch);
+ buffer += unescape_len;
+ p = ++pe;
break;
+ }
default:
- p = pe;
- continue;
+ if ((unsigned char)*pe < 0x20) {
+ if (!config->allow_control_characters) {
+ if (*pe == '\n') {
+ raise_parse_error_at("Invalid unescaped newline character (\\n) in string: %s", state, pe - 1);
+ }
+ raise_parse_error_at("invalid ASCII control character in string: %s", state, pe - 1);
+ }
+ }
+
+ if (config->allow_invalid_escape) {
+ APPEND_CHAR(*pe);
+ } else {
+ raise_parse_error_at("invalid escape character in string: %s", state, pe - 1);
+ }
+ break;
}
- MEMCPY(buffer, unescape, char, unescape_len);
- buffer += unescape_len;
- p = ++pe;
}
+#undef APPEND_CHAR
if (stringEnd > p) {
MEMCPY(buffer, p, char, stringEnd - p);
@@ -743,87 +977,99 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c
if (symbolize) {
result = rb_str_intern(result);
} else if (intern) {
- result = rb_funcall(rb_str_freeze(result), i_uminus, 0);
+ result = rb_str_to_interned_str(result);
}
return result;
}
#define MAX_FAST_INTEGER_SIZE 18
-static inline VALUE fast_decode_integer(const char *p, const char *pe)
-{
- bool negative = false;
- if (*p == '-') {
- negative = true;
- p++;
- }
+#define MAX_NUMBER_STACK_BUFFER 128
- long long memo = 0;
- while (p < pe) {
- memo *= 10;
- memo += *p - '0';
- p++;
- }
+typedef VALUE (*json_number_decode_func_t)(const char *ptr);
- if (negative) {
- memo = -memo;
+static inline VALUE json_decode_large_number(const char *start, long len, json_number_decode_func_t func)
+{
+ if (RB_LIKELY(len < MAX_NUMBER_STACK_BUFFER)) {
+ char buffer[MAX_NUMBER_STACK_BUFFER];
+ MEMCPY(buffer, start, char, len);
+ buffer[len] = '\0';
+ return func(buffer);
+ } else {
+ VALUE buffer_v = rb_str_tmp_new(len);
+ char *buffer = RSTRING_PTR(buffer_v);
+ MEMCPY(buffer, start, char, len);
+ buffer[len] = '\0';
+ VALUE number = func(buffer);
+ RB_GC_GUARD(buffer_v);
+ return number;
}
- return LL2NUM(memo);
}
-static VALUE json_decode_large_integer(const char *start, long len)
+static VALUE json_decode_inum(const char *buffer)
+{
+ return rb_cstr2inum(buffer, 10);
+}
+
+NOINLINE(static) VALUE json_decode_large_integer(const char *start, long len)
{
- VALUE buffer_v;
- char *buffer = RB_ALLOCV_N(char, buffer_v, len + 1);
- MEMCPY(buffer, start, char, len);
- buffer[len] = '\0';
- VALUE number = rb_cstr2inum(buffer, 10);
- RB_ALLOCV_END(buffer_v);
- return number;
+ return json_decode_large_number(start, len, json_decode_inum);
}
-static inline VALUE
-json_decode_integer(const char *start, const char *end)
+static inline VALUE json_decode_integer(uint64_t mantissa, int mantissa_digits, bool negative, const char *start, const char *end)
{
- long len = end - start;
- if (RB_LIKELY(len < MAX_FAST_INTEGER_SIZE)) {
- return fast_decode_integer(start, end);
+ if (RB_LIKELY(mantissa_digits < MAX_FAST_INTEGER_SIZE)) {
+ if (negative) {
+ return INT64T2NUM(-((int64_t)mantissa));
}
- return json_decode_large_integer(start, len);
+ return UINT64T2NUM(mantissa);
+ }
+
+ return json_decode_large_integer(start, end - start);
}
-static VALUE json_decode_large_float(const char *start, long len)
+static VALUE json_decode_dnum(const char *buffer)
{
- VALUE buffer_v;
- char *buffer = RB_ALLOCV_N(char, buffer_v, len + 1);
- MEMCPY(buffer, start, char, len);
- buffer[len] = '\0';
- VALUE number = DBL2NUM(rb_cstr_to_dbl(buffer, 1));
- RB_ALLOCV_END(buffer_v);
- return number;
+ return DBL2NUM(rb_cstr_to_dbl(buffer, 1));
}
-static VALUE json_decode_float(JSON_ParserConfig *config, const char *start, const char *end)
+NOINLINE(static) VALUE json_decode_large_float(const char *start, long len)
{
- long len = end - start;
+ return json_decode_large_number(start, len, json_decode_dnum);
+}
+/* Ruby JSON optimized float decoder using vendored Ryu algorithm
+ * Accepts pre-extracted mantissa and exponent from first-pass validation
+ */
+static inline VALUE json_decode_float(JSON_ParserConfig *config, uint64_t mantissa, int mantissa_digits, int64_t exponent, bool negative,
+ const char *start, const char *end)
+{
if (RB_UNLIKELY(config->decimal_class)) {
- VALUE text = rb_str_new(start, len);
+ VALUE text = rb_str_new(start, end - start);
return rb_funcallv(config->decimal_class, config->decimal_method_id, 1, &text);
- } else if (RB_LIKELY(len < 64)) {
- char buffer[64];
- MEMCPY(buffer, start, char, len);
- buffer[len] = '\0';
- return DBL2NUM(rb_cstr_to_dbl(buffer, 1));
- } else {
- return json_decode_large_float(start, len);
}
+
+ if (RB_UNLIKELY(exponent > INT32_MAX)) {
+ return negative ? CMinusInfinity : CInfinity;
+ }
+
+ if (RB_UNLIKELY(exponent < INT32_MIN)) {
+ return rb_float_new(negative ? -0.0 : 0.0);
+ }
+
+ // Fall back to rb_cstr_to_dbl for potential subnormals (rare edge case)
+ // Ryu has rounding issues with subnormals around 1e-310 (< 2.225e-308)
+ if (RB_UNLIKELY(mantissa_digits > 17 || mantissa_digits + exponent < -307)) {
+ return json_decode_large_float(start, end - start);
+ }
+
+ return DBL2NUM(ryu_s2d_from_parts(mantissa, mantissa_digits, (int32_t)exponent, negative));
}
static inline VALUE json_decode_array(JSON_ParserState *state, JSON_ParserConfig *config, long count)
{
- VALUE array = rb_ary_new_from_values(count, rvalue_stack_peek(state->stack, count));
- rvalue_stack_pop(state->stack, count);
+ VALUE array = rb_ary_new_from_values(count, rvalue_stack_peek(state->value_stack, count));
+ rvalue_stack_pop(state->value_stack, count);
if (config->freeze) {
RB_OBJ_FREEZE(array);
@@ -849,7 +1095,7 @@ static VALUE json_find_duplicated_key(size_t count, const VALUE *pairs)
return Qfalse;
}
-static void emit_duplicate_key_warning(JSON_ParserState *state, VALUE duplicate_key)
+NOINLINE(static) void emit_duplicate_key_warning(JSON_ParserState *state, VALUE duplicate_key)
{
VALUE message = rb_sprintf(
"detected duplicate key %"PRIsVALUE" in JSON object. This will raise an error in json 3.0 unless enabled via `allow_duplicate_key: true`",
@@ -860,41 +1106,52 @@ static void emit_duplicate_key_warning(JSON_ParserState *state, VALUE duplicate_
RB_GC_GUARD(message);
}
-#ifdef RBIMPL_ATTR_NORETURN
-RBIMPL_ATTR_NORETURN()
-#endif
-static void raise_duplicate_key_error(JSON_ParserState *state, VALUE duplicate_key)
+NORETURN(static) void raise_duplicate_key_error(JSON_ParserState *state, VALUE duplicate_key)
{
VALUE message = rb_sprintf(
"duplicate key %"PRIsVALUE,
rb_inspect(duplicate_key)
);
- raise_parse_error(RSTRING_PTR(message), state);
- RB_GC_GUARD(message);
+ long line, column;
+ cursor_position(state, &line, &column);
+ rb_str_concat(message, build_parse_error_message("", state, line, column)) ;
+ rb_exc_raise(parse_error_new(message, line, column));
+}
+
+NOINLINE(static) void json_on_duplicate_key(JSON_ParserState *state, JSON_ParserConfig *config, size_t count, const VALUE *pairs)
+{
+ switch (config->on_duplicate_key) {
+ case JSON_IGNORE:
+ return;
+
+ case JSON_DEPRECATED:
+ // Only emit the first few deprecations to avoid spamming.
+ if (state->emitted_deprecations < 5) {
+ emit_duplicate_key_warning(state, json_find_duplicated_key(count, pairs));
+ state->emitted_deprecations++;
+ }
+ return;
+
+ case JSON_RAISE:
+ raise_duplicate_key_error(state, json_find_duplicated_key(count, pairs));
+ return;
+ }
+ UNREACHABLE;
}
static inline VALUE json_decode_object(JSON_ParserState *state, JSON_ParserConfig *config, size_t count)
{
size_t entries_count = count / 2;
VALUE object = rb_hash_new_capa(entries_count);
- const VALUE *pairs = rvalue_stack_peek(state->stack, count);
+ const VALUE *pairs = rvalue_stack_peek(state->value_stack, count);
rb_hash_bulk_insert(count, pairs, object);
if (RB_UNLIKELY(RHASH_SIZE(object) < entries_count)) {
- switch (config->on_duplicate_key) {
- case JSON_IGNORE:
- break;
- case JSON_DEPRECATED:
- emit_duplicate_key_warning(state, json_find_duplicated_key(count, pairs));
- break;
- case JSON_RAISE:
- raise_duplicate_key_error(state, json_find_duplicated_key(count, pairs));
- break;
- }
+ json_on_duplicate_key(state, config, count, pairs);
}
- rvalue_stack_pop(state->stack, count);
+ rvalue_stack_pop(state->value_stack, count);
if (config->freeze) {
RB_OBJ_FREEZE(object);
@@ -903,26 +1160,12 @@ static inline VALUE json_decode_object(JSON_ParserState *state, JSON_ParserConfi
return object;
}
-static inline VALUE json_decode_string(JSON_ParserState *state, JSON_ParserConfig *config, const char *start, const char *end, bool escaped, bool is_name)
-{
- VALUE string;
- bool intern = is_name || config->freeze;
- bool symbolize = is_name && config->symbolize_names;
- if (escaped) {
- string = json_string_unescape(state, start, end, is_name, intern, symbolize);
- } else {
- string = json_string_fastpath(state, start, end, is_name, intern, symbolize);
- }
-
- return string;
-}
-
static inline VALUE json_push_value(JSON_ParserState *state, JSON_ParserConfig *config, VALUE value)
{
if (RB_UNLIKELY(config->on_load_proc)) {
value = rb_proc_call_with_block(config->on_load_proc, 1, &value, Qnil);
}
- rvalue_stack_push(state->stack, value, &state->stack_handle, &state->stack);
+ rvalue_stack_push(state->value_stack, value, state->value_stack_handle, &state->value_stack);
return value;
}
@@ -939,17 +1182,11 @@ static const bool string_scan_table[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
-#if (defined(__GNUC__ ) || defined(__clang__))
-#define FORCE_INLINE __attribute__((always_inline))
-#else
-#define FORCE_INLINE
-#endif
-
#ifdef HAVE_SIMD
static SIMD_Implementation simd_impl = SIMD_NONE;
#endif /* HAVE_SIMD */
-static inline bool FORCE_INLINE string_scan(JSON_ParserState *state)
+ALWAYS_INLINE(static) bool string_scan(JSON_ParserState *state)
{
#ifdef HAVE_SIMD
#if defined(HAVE_SIMD_NEON)
@@ -957,7 +1194,7 @@ static inline bool FORCE_INLINE string_scan(JSON_ParserState *state)
uint64_t mask = 0;
if (string_scan_simd_neon(&state->cursor, state->end, &mask)) {
state->cursor += trailing_zeros64(mask) >> 2;
- return 1;
+ return true;
}
#elif defined(HAVE_SIMD_SSE2)
@@ -965,313 +1202,574 @@ static inline bool FORCE_INLINE string_scan(JSON_ParserState *state)
int mask = 0;
if (string_scan_simd_sse2(&state->cursor, state->end, &mask)) {
state->cursor += trailing_zeros(mask);
- return 1;
+ return true;
}
}
#endif /* HAVE_SIMD_NEON or HAVE_SIMD_SSE2 */
#endif /* HAVE_SIMD */
- while (state->cursor < state->end) {
+ while (!eos(state)) {
if (RB_UNLIKELY(string_scan_table[(unsigned char)*state->cursor])) {
- return 1;
+ return true;
}
- *state->cursor++;
+ state->cursor++;
}
- return 0;
+
+ // If the string ended with an unterminated escape sequence, we might
+ // have gone past the end.
+ if (RB_UNLIKELY(state->cursor > state->end)) {
+ state->cursor = state->end;
+ }
+
+ return false;
}
-static inline VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name)
+static VALUE json_parse_escaped_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name, const char *start)
{
- state->cursor++;
- const char *start = state->cursor;
- bool escaped = false;
+ const char *backslashes[JSON_MAX_UNESCAPE_POSITIONS];
+ JSON_UnescapePositions positions = {
+ .size = 0,
+ .positions = backslashes,
+ .additional_backslashes = 0,
+ };
- while (RB_UNLIKELY(string_scan(state))) {
+ do {
switch (*state->cursor) {
case '"': {
- VALUE string = json_decode_string(state, config, start, state->cursor, escaped, is_name);
+ VALUE string = json_string_unescape(state, config, start, state->cursor, is_name, &positions);
state->cursor++;
- return json_push_value(state, config, string);
+ return string;
}
case '\\': {
- state->cursor++;
- escaped = true;
- if ((unsigned char)*state->cursor < 0x20) {
- raise_parse_error("invalid ASCII control character in string: %s", state);
+ if (RB_LIKELY(positions.size < JSON_MAX_UNESCAPE_POSITIONS)) {
+ backslashes[positions.size] = state->cursor;
+ positions.size++;
+ } else {
+ positions.additional_backslashes++;
}
+ state->cursor++;
break;
}
default:
- raise_parse_error("invalid ASCII control character in string: %s", state);
+ if (!config->allow_control_characters) {
+ raise_parse_error("invalid ASCII control character in string: %s", state);
+ }
break;
}
state->cursor++;
- }
+ } while (string_scan(state));
raise_parse_error("unexpected end of input, expected closing \"", state);
return Qfalse;
}
-static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
+ALWAYS_INLINE(static) VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name)
{
- json_eat_whitespace(state);
- if (state->cursor >= state->end) {
- raise_parse_error("unexpected end of input", state);
+ state->cursor++;
+ const char *start = state->cursor;
+
+ if (RB_UNLIKELY(!string_scan(state))) {
+ raise_parse_error("unexpected end of input, expected closing \"", state);
}
- switch (*state->cursor) {
- case 'n':
- if ((state->end - state->cursor >= 4) && (memcmp(state->cursor, "null", 4) == 0)) {
- state->cursor += 4;
- return json_push_value(state, config, Qnil);
- }
+ VALUE string;
+ if (RB_LIKELY(*state->cursor == '"')) {
+ string = json_string_fastpath(state, config, start, state->cursor, is_name);
+ state->cursor++;
+ }
+ else {
+ string = json_parse_escaped_string(state, config, is_name, start);
+ }
- raise_parse_error("unexpected token %s", state);
- break;
- case 't':
- if ((state->end - state->cursor >= 4) && (memcmp(state->cursor, "true", 4) == 0)) {
- state->cursor += 4;
- return json_push_value(state, config, Qtrue);
- }
+ return string;
+}
- raise_parse_error("unexpected token %s", state);
- break;
- case 'f':
- // Note: memcmp with a small power of two compile to an integer comparison
- if ((state->end - state->cursor >= 5) && (memcmp(state->cursor + 1, "alse", 4) == 0)) {
- state->cursor += 5;
- return json_push_value(state, config, Qfalse);
- }
+#if JSON_CPU_LITTLE_ENDIAN_64BITS
+// From: https://lemire.me/blog/2022/01/21/swar-explained-parsing-eight-digits/
+// Additional References:
+// https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
+// http://0x80.pl/notesen/2014-10-12-parsing-decimal-numbers-part-1-swar.html
+static inline uint64_t decode_8digits_unrolled(uint64_t val) {
+ const uint64_t mask = 0x000000FF000000FF;
+ const uint64_t mul1 = 0x000F424000000064; // 100 + (1000000ULL << 32)
+ const uint64_t mul2 = 0x0000271000000001; // 1 + (10000ULL << 32)
+ val -= 0x3030303030303030;
+ val = (val * 10) + (val >> 8); // val = (val * 2561) >> 8;
+ val = (((val & mask) * mul1) + (((val >> 16) & mask) * mul2)) >> 32;
+ return val;
+}
- raise_parse_error("unexpected token %s", state);
- break;
- case 'N':
- // Note: memcmp with a small power of two compile to an integer comparison
- if (config->allow_nan && (state->end - state->cursor >= 3) && (memcmp(state->cursor + 1, "aN", 2) == 0)) {
- state->cursor += 3;
- return json_push_value(state, config, CNaN);
- }
+static inline uint64_t decode_4digits_unrolled(uint32_t val) {
+ const uint32_t mask = 0x000000FF;
+ const uint32_t mul1 = 100;
+ val -= 0x30303030;
+ val = (val * 10) + (val >> 8); // val = (val * 2561) >> 8;
+ val = ((val & mask) * mul1) + (((val >> 16) & mask));
+ return val;
+}
+#endif
- raise_parse_error("unexpected token %s", state);
- break;
- case 'I':
- if (config->allow_nan && (state->end - state->cursor >= 8) && (memcmp(state->cursor, "Infinity", 8) == 0)) {
- state->cursor += 8;
- return json_push_value(state, config, CInfinity);
- }
+static inline int json_parse_digits(JSON_ParserState *state, uint64_t *accumulator)
+{
+ const char *start = state->cursor;
- raise_parse_error("unexpected token %s", state);
- break;
- case '-':
- // Note: memcmp with a small power of two compile to an integer comparison
- if ((state->end - state->cursor >= 9) && (memcmp(state->cursor + 1, "Infinity", 8) == 0)) {
- if (config->allow_nan) {
- state->cursor += 9;
- return json_push_value(state, config, CMinusInfinity);
- } else {
- raise_parse_error("unexpected token %s", state);
- }
- }
- // Fallthrough
- case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': {
- bool integer = true;
+#if JSON_CPU_LITTLE_ENDIAN_64BITS
+ while (rest(state) >= sizeof(uint64_t)) {
+ uint64_t next_8bytes;
+ memcpy(&next_8bytes, state->cursor, sizeof(uint64_t));
+
+ // From: https://github.com/simdjson/simdjson/blob/32b301893c13d058095a07d9868edaaa42ee07aa/include/simdjson/generic/numberparsing.h#L333
+ // Branchless version of: http://0x80.pl/articles/swar-digits-validate.html
+ uint64_t match = (next_8bytes & 0xF0F0F0F0F0F0F0F0) | (((next_8bytes + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4);
+
+ if (match == 0x3333333333333333) { // 8 consecutive digits
+ *accumulator = (*accumulator * 100000000) + decode_8digits_unrolled(next_8bytes);
+ state->cursor += 8;
+ continue;
+ }
+
+ uint32_t consecutive_digits = trailing_zeros64(match ^ 0x3333333333333333) / CHAR_BIT;
- // /\A-?(0|[1-9]\d*)(\.\d+)?([Ee][-+]?\d+)?/
- const char *start = state->cursor;
+ if (consecutive_digits >= 4) {
+ *accumulator = (*accumulator * 10000) + decode_4digits_unrolled((uint32_t)next_8bytes);
+ state->cursor += 4;
+ consecutive_digits -= 4;
+ }
+
+ while (consecutive_digits) {
+ *accumulator = *accumulator * 10 + (*state->cursor - '0');
+ consecutive_digits--;
state->cursor++;
+ }
- while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
- state->cursor++;
- }
+ return (int)(state->cursor - start);
+ }
+#endif
+
+ char next_char;
+ while (rb_isdigit(next_char = peek(state))) {
+ *accumulator = *accumulator * 10 + (next_char - '0');
+ state->cursor++;
+ }
+ return (int)(state->cursor - start);
+}
- long integer_length = state->cursor - start;
+static inline VALUE json_parse_number(JSON_ParserState *state, JSON_ParserConfig *config, bool negative, const char *start)
+{
+ bool integer = true;
+ const char first_digit = *state->cursor;
- if (RB_UNLIKELY(start[0] == '0' && integer_length > 1)) {
- raise_parse_error_at("invalid number: %s", state, start);
- } else if (RB_UNLIKELY(integer_length > 2 && start[0] == '-' && start[1] == '0')) {
- raise_parse_error_at("invalid number: %s", state, start);
- } else if (RB_UNLIKELY(integer_length == 1 && start[0] == '-')) {
- raise_parse_error_at("invalid number: %s", state, start);
- }
+ // Variables for Ryu optimization - extract digits during parsing
+ int64_t exponent = 0;
+ int decimal_point_pos = -1;
+ uint64_t mantissa = 0;
- if ((state->cursor < state->end) && (*state->cursor == '.')) {
- integer = false;
- state->cursor++;
+ // Parse integer part and extract mantissa digits
+ int mantissa_digits = json_parse_digits(state, &mantissa);
+
+ if (RB_UNLIKELY((first_digit == '0' && mantissa_digits > 1) || (negative && mantissa_digits == 0))) {
+ raise_parse_error_at("invalid number: %s", state, start);
+ }
+
+ // Parse fractional part
+ if (peek(state) == '.') {
+ integer = false;
+ decimal_point_pos = mantissa_digits; // Remember position of decimal point
+ state->cursor++;
+
+ int fractional_digits = json_parse_digits(state, &mantissa);
+ mantissa_digits += fractional_digits;
+
+ if (RB_UNLIKELY(!fractional_digits)) {
+ raise_parse_error_at("invalid number: %s", state, start);
+ }
+ }
+
+ // Parse exponent
+ if (rb_tolower(peek(state)) == 'e') {
+ integer = false;
+ state->cursor++;
+
+ bool negative_exponent = false;
+ const char next_char = peek(state);
+ if (next_char == '-' || next_char == '+') {
+ negative_exponent = next_char == '-';
+ state->cursor++;
+ }
+
+ uint64_t abs_exponent = 0;
+ int exponent_digits = json_parse_digits(state, &abs_exponent);
+
+ if (RB_UNLIKELY(!exponent_digits)) {
+ raise_parse_error_at("invalid number: %s", state, start);
+ }
+
+ if (RB_UNLIKELY(exponent_digits >= 20 || abs_exponent > (uint64_t)INT64_MAX)) {
+ exponent = negative_exponent ? INT64_MIN : INT64_MAX;
+ } else {
+ exponent = negative_exponent ? -(int64_t)abs_exponent : (int64_t)abs_exponent;
+ }
+ }
+
+ if (integer) {
+ return json_decode_integer(mantissa, mantissa_digits, negative, start, state->cursor);
+ }
+
+ // Adjust exponent based on decimal point position
+ if (decimal_point_pos >= 0) {
+ exponent -= (mantissa_digits - decimal_point_pos);
+ }
+
+ return json_decode_float(config, mantissa, mantissa_digits, exponent, negative, start, state->cursor);
+}
+
+static inline VALUE json_parse_positive_number(JSON_ParserState *state, JSON_ParserConfig *config)
+{
+ return json_parse_number(state, config, false, state->cursor);
+}
+
+static inline VALUE json_parse_negative_number(JSON_ParserState *state, JSON_ParserConfig *config)
+{
+ return json_parse_number(state, config, true, state->cursor - 1);
+}
- if (state->cursor == state->end || *state->cursor < '0' || *state->cursor > '9') {
- raise_parse_error("invalid number: %s", state);
+// How many values (array elements, or interleaved object keys+values) have been
+// pushed onto the rvalue stack since this container opened. Used to size the
+// bulk decode on close, and to tell the first key/colon from later ones.
+static inline long json_frame_entry_count(const json_frame *frame, const rvalue_stack *value_stack)
+{
+ return value_stack->head - frame->value_stack_head;
+}
+
+// A complete value now sits on top of the rvalue stack. Advance the frame that
+// was waiting for it: the root document is done, or the enclosing container
+// moves on to expecting a ',' or its closing bracket. The caller passes the
+// frame it already has in hand -- the one that was expecting the value -- which
+// after a container close is the freshly re-exposed parent.
+static inline void json_value_completed(json_frame *frame)
+{
+ JSON_ASSERT((int)JSON_PHASE_DONE == (int)JSON_FRAME_ROOT);
+ JSON_ASSERT((int)JSON_PHASE_ARRAY_COMMA == (int)JSON_FRAME_ARRAY);
+ JSON_ASSERT((int)JSON_PHASE_OBJECT_COMMA == (int)JSON_FRAME_OBJECT);
+
+ frame->phase = (enum json_frame_phase) frame->type;
+}
+
+ALWAYS_INLINE(static) bool json_match_keyword(JSON_ParserState *state, const char *keyword, size_t offset)
+{
+ // It is assumed that since `keyword` is always a literal, the compiler is able to constantize this
+ // `strlen` and several other computations in that routine, such as eliminating the `if (resumable)` branch.
+
+ size_t len = strlen(keyword);
+
+ // Note: memcmp with a small power of two and a literal string compile to an integer comparison /
+ // That's why we sometime compare starting from the first byte and sometimes from the second.
+ if (rest(state) >= len && (memcmp(state->cursor + offset, keyword + offset, len - offset) == 0)) {
+ state->cursor += len;
+ return true;
+ }
+ return false;
+}
+
+// Parse an arbitrary JSON value iteratively. This is a state machine driven
+// entirely by the top frame's phase so it can stop at any value boundary and
+// resume purely from the frame stack. A JSON_FRAME_ROOT frame sits at the
+// bottom of the stack, so the stack is never empty mid-parse and the document
+// itself is just another frame whose value, once parsed, leaves its phase DONE.
+static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
+{
+ json_frame *frame = json_frame_stack_peek(state->frames);
+
+ switch (frame->phase) {
+ case JSON_PHASE_DONE: goto JSON_PHASE_DONE;
+ case JSON_PHASE_ARRAY_COMMA: goto JSON_PHASE_ARRAY_COMMA;
+ case JSON_PHASE_OBJECT_COMMA: goto JSON_PHASE_OBJECT_COMMA;
+ case JSON_PHASE_VALUE: goto JSON_PHASE_VALUE;
+ case JSON_PHASE_OBJECT_KEY: goto JSON_PHASE_OBJECT_KEY;
+ case JSON_PHASE_OBJECT_COLON: goto JSON_PHASE_OBJECT_COLON;
+ }
+ UNREACHABLE_RETURN(Qundef);
+
+ JSON_PHASE_DONE: {
+ // The root document value is parsed; it is the lone survivor on
+ // the rvalue stack.
+ return *rvalue_stack_peek(state->value_stack, 1);
+ }
+
+ JSON_PHASE_VALUE: {
+ json_eat_whitespace(state);
+
+ VALUE value;
+ switch (peek(state)) {
+ case 'n':
+ if (json_match_keyword(state, "null", 0)) {
+ value = Qnil;
+ break;
}
+ raise_parse_error("unexpected token %s", state);
- while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
- state->cursor++;
+ case 't':
+ if (json_match_keyword(state, "true", 0)) {
+ value = Qtrue;
+ break;
}
- }
+ raise_parse_error("unexpected token %s", state);
- if ((state->cursor < state->end) && ((*state->cursor == 'e') || (*state->cursor == 'E'))) {
- integer = false;
- state->cursor++;
- if ((state->cursor < state->end) && ((*state->cursor == '+') || (*state->cursor == '-'))) {
- state->cursor++;
+ case 'f':
+ if (json_match_keyword(state, "false", 1)) {
+ value = Qfalse;
+ break;
}
+ raise_parse_error("unexpected token %s", state);
- if (state->cursor == state->end || *state->cursor < '0' || *state->cursor > '9') {
- raise_parse_error("invalid number: %s", state);
+ case 'N':
+ // Note: memcmp with a small power of two compile to an integer comparison
+ if (config->allow_nan && json_match_keyword(state, "NaN", 1)) {
+ value = CNaN;
+ break;
}
+ raise_parse_error("unexpected token %s", state);
- while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
- state->cursor++;
+ case 'I':
+ if (config->allow_nan && json_match_keyword(state, "Infinity", 0)) {
+ value = CInfinity;
+ break;
}
- }
+ raise_parse_error("unexpected token %s", state);
- if (integer) {
- return json_push_value(state, config, json_decode_integer(start, state->cursor));
+ case '-': {
+ state->cursor++;
+ if (config->allow_nan && json_match_keyword(state, "Infinity", 0)) {
+ value = CMinusInfinity;
+ } else {
+ value = json_parse_negative_number(state, config);
+ }
+ break;
}
- return json_push_value(state, config, json_decode_float(config, start, state->cursor));
- }
- case '"': {
- // %r{\A"[^"\\\t\n\x00]*(?:\\[bfnrtu\\/"][^"\\]*)*"}
- return json_parse_string(state, config, false);
- break;
- }
- case '[': {
- state->cursor++;
- json_eat_whitespace(state);
- long stack_head = state->stack->head;
- if ((state->cursor < state->end) && (*state->cursor == ']')) {
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
+ value = json_parse_positive_number(state, config);
+ break;
+
+ case '"':
+ // %r{\A"[^"\\\t\n\x00]*(?:\\[bfnrtu\\/"][^"\\]*)*"}
+ value = json_parse_string(state, config, false);
+ break;
+
+ case '[': {
state->cursor++;
- return json_push_value(state, config, json_decode_array(state, config, 0));
- } else {
+ json_eat_whitespace(state);
+
+ if (peek(state) == ']') {
+ state->cursor++;
+ value = json_decode_array(state, config, 0);
+ break;
+ }
+
state->current_nesting++;
if (RB_UNLIKELY(config->max_nesting && (config->max_nesting < state->current_nesting))) {
rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting);
}
state->in_array++;
- json_parse_any(state, config);
+
+ // Phase stays VALUE: the next iteration reads the first element.
+ frame = json_frame_stack_push(state, (json_frame){
+ .type = JSON_FRAME_ARRAY,
+ .phase = JSON_PHASE_VALUE,
+ .value_stack_head = state->value_stack->head,
+ });
+ goto JSON_PHASE_VALUE;
}
+ case '{': {
+ const char *object_start_cursor = state->cursor;
- while (true) {
+ state->cursor++;
json_eat_whitespace(state);
- if (state->cursor < state->end) {
- if (*state->cursor == ']') {
- state->cursor++;
- long count = state->stack->head - stack_head;
- state->current_nesting--;
- state->in_array--;
- return json_push_value(state, config, json_decode_array(state, config, count));
- }
+ if (peek(state) == '}') {
+ state->cursor++;
+ value = json_decode_object(state, config, 0);
+ break;
+ }
- if (*state->cursor == ',') {
- state->cursor++;
- if (config->allow_trailing_comma) {
- json_eat_whitespace(state);
- if ((state->cursor < state->end) && (*state->cursor == ']')) {
- continue;
- }
- }
- json_parse_any(state, config);
- continue;
- }
+ state->current_nesting++;
+ if (RB_UNLIKELY(config->max_nesting && (config->max_nesting < state->current_nesting))) {
+ rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting);
}
- raise_parse_error("expected ',' or ']' after array value", state);
+ // Phase KEY: the next iteration reads the first key.
+ frame = json_frame_stack_push(state, (json_frame){
+ .type = JSON_FRAME_OBJECT,
+ .phase = JSON_PHASE_OBJECT_KEY,
+ .value_stack_head = state->value_stack->head,
+ .start_cursor = object_start_cursor,
+ });
+ goto JSON_PHASE_OBJECT_KEY;
}
- break;
+
+ case 0:
+ raise_parse_error("unexpected end of input", state);
+
+ default:
+ raise_parse_error("unexpected character: %s", state);
}
- case '{': {
- const char *object_start_cursor = state->cursor;
- state->cursor++;
- json_eat_whitespace(state);
- long stack_head = state->stack->head;
+ json_push_value(state, config, value);
+ json_value_completed(frame);
- if ((state->cursor < state->end) && (*state->cursor == '}')) {
- state->cursor++;
- return json_push_value(state, config, json_decode_object(state, config, 0));
- } else {
- state->current_nesting++;
- if (RB_UNLIKELY(config->max_nesting && (config->max_nesting < state->current_nesting))) {
- rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting);
- }
+ switch (frame->phase) {
+ case JSON_PHASE_DONE: goto JSON_PHASE_DONE;
+ case JSON_PHASE_ARRAY_COMMA: goto JSON_PHASE_ARRAY_COMMA;
+ case JSON_PHASE_OBJECT_COMMA: goto JSON_PHASE_OBJECT_COMMA;
+ case JSON_PHASE_VALUE: goto JSON_PHASE_VALUE;
+ case JSON_PHASE_OBJECT_KEY: UNREACHABLE_RETURN(Qundef);
+ case JSON_PHASE_OBJECT_COLON: goto JSON_PHASE_OBJECT_COLON;
+ }
+ UNREACHABLE_RETURN(Qundef);
+ }
- if (*state->cursor != '"') {
- raise_parse_error("expected object key, got %s", state);
- }
- json_parse_string(state, config, true);
+ JSON_PHASE_OBJECT_KEY: {
+ JSON_ASSERT(frame->type == JSON_FRAME_OBJECT);
- json_eat_whitespace(state);
- if ((state->cursor >= state->end) || (*state->cursor != ':')) {
- raise_parse_error("expected ':' after object key", state);
- }
- state->cursor++;
+ json_eat_whitespace(state);
- json_parse_any(state, config);
+ if (RB_LIKELY(peek(state) == '"')) {
+ json_push_value(state, config, json_parse_string(state, config, true));
+ frame->phase = JSON_PHASE_OBJECT_COLON;
+ goto JSON_PHASE_OBJECT_COLON;
+ } else {
+ // The message differs for the first key vs. a key after a
+ // ',': the first is the only one reached with nothing pushed
+ // for this object yet.
+ if (json_frame_entry_count(frame, state->value_stack) == 0) {
+ raise_parse_error("expected object key, got %s", state);
+ } else {
+ raise_parse_error("expected object key, got: %s", state);
}
+ }
+ UNREACHABLE_RETURN(Qundef);
+ }
- while (true) {
- json_eat_whitespace(state);
+ JSON_PHASE_OBJECT_COLON: {
+ JSON_ASSERT(frame->type == JSON_FRAME_OBJECT);
- if (state->cursor < state->end) {
- if (*state->cursor == '}') {
- state->cursor++;
- state->current_nesting--;
- size_t count = state->stack->head - stack_head;
+ json_eat_whitespace(state);
- // Temporary rewind cursor in case an error is raised
- const char *final_cursor = state->cursor;
- state->cursor = object_start_cursor;
- VALUE object = json_decode_object(state, config, count);
- state->cursor = final_cursor;
+ if (RB_LIKELY(peek(state) == ':')) {
+ state->cursor++;
+ frame->phase = JSON_PHASE_VALUE;
+ goto JSON_PHASE_VALUE;
+ } else {
+ // First colon (only the first pair's key is pushed, nothing
+ // else) vs. a later one.
+ if (json_frame_entry_count(frame, state->value_stack) == 1) {
+ raise_parse_error("expected ':' after object key", state);
+ } else {
+ raise_parse_error("expected ':' after object key, got: %s", state);
+ }
+ }
+ UNREACHABLE_RETURN(Qundef);
+ }
- return json_push_value(state, config, object);
- }
+ JSON_PHASE_ARRAY_COMMA: {
+ JSON_ASSERT(frame->type == JSON_FRAME_ARRAY);
- if (*state->cursor == ',') {
- state->cursor++;
- json_eat_whitespace(state);
+ json_eat_whitespace(state);
- if (config->allow_trailing_comma) {
- if ((state->cursor < state->end) && (*state->cursor == '}')) {
- continue;
- }
- }
+ const char next_char = peek(state);
- if (*state->cursor != '"') {
- raise_parse_error("expected object key, got: %s", state);
- }
- json_parse_string(state, config, true);
+ if (RB_LIKELY(next_char == ',')) {
+ state->cursor++;
+ if (config->allow_trailing_comma) {
+ json_eat_whitespace(state);
+ if (peek(state) == ']') {
+ // Trailing comma: stay in COMMA to close on the next iteration.
+ goto JSON_PHASE_ARRAY_COMMA;
+ }
+ }
+ frame->phase = JSON_PHASE_VALUE;
+ goto JSON_PHASE_VALUE;
+ } else if (next_char == ']') {
+ state->cursor++;
+ long count = json_frame_entry_count(frame, state->value_stack);
+ state->current_nesting--;
+ state->in_array--;
+ json_frame_stack_pop(state->frames);
+ json_push_value(state, config, json_decode_array(state, config, count));
+ frame = json_frame_stack_peek(state->frames);
+ json_value_completed(frame);
+
+ switch (frame->phase) {
+ case JSON_PHASE_DONE: goto JSON_PHASE_DONE;
+ case JSON_PHASE_ARRAY_COMMA: goto JSON_PHASE_ARRAY_COMMA;
+ case JSON_PHASE_OBJECT_COMMA: goto JSON_PHASE_OBJECT_COMMA;
+ case JSON_PHASE_VALUE: goto JSON_PHASE_VALUE;
+ case JSON_PHASE_OBJECT_KEY: UNREACHABLE_RETURN(Qundef);
+ case JSON_PHASE_OBJECT_COLON: goto JSON_PHASE_OBJECT_COLON;
+ }
+ } else {
+ raise_parse_error("expected ',' or ']' after array value", state);
+ }
+ UNREACHABLE_RETURN(Qundef);
+ }
- json_eat_whitespace(state);
- if ((state->cursor >= state->end) || (*state->cursor != ':')) {
- raise_parse_error("expected ':' after object key, got: %s", state);
- }
- state->cursor++;
+ JSON_PHASE_OBJECT_COMMA: {
+ JSON_ASSERT(frame->type == JSON_FRAME_OBJECT);
- json_parse_any(state, config);
+ json_eat_whitespace(state);
+ const char next_char = peek(state);
- continue;
- }
+ if (RB_LIKELY(next_char == ',')) {
+ state->cursor++;
+
+ if (config->allow_trailing_comma) {
+ json_eat_whitespace(state);
+ if (peek(state) == '}') {
+ // Trailing comma: stay in COMMA to close on the next iteration.
+ goto JSON_PHASE_OBJECT_COMMA;
}
+ }
- raise_parse_error("expected ',' or '}' after object value, got: %s", state);
+ frame->phase = JSON_PHASE_OBJECT_KEY;
+ goto JSON_PHASE_OBJECT_KEY;
+ } else if (next_char == '}') {
+ state->cursor++;
+ state->current_nesting--;
+ size_t count = json_frame_entry_count(frame, state->value_stack);
+
+ // Temporary rewind cursor in case an error is raised
+ const char *final_cursor = state->cursor;
+ state->cursor = frame->start_cursor;
+ VALUE object = json_decode_object(state, config, count);
+ state->cursor = final_cursor;
+
+ json_push_value(state, config, object);
+ json_frame_stack_pop(state->frames);
+ frame = json_frame_stack_peek(state->frames);
+ json_value_completed(frame);
+
+ switch (frame->phase) {
+ case JSON_PHASE_DONE: goto JSON_PHASE_DONE;
+ case JSON_PHASE_ARRAY_COMMA: goto JSON_PHASE_ARRAY_COMMA;
+ case JSON_PHASE_OBJECT_COMMA: goto JSON_PHASE_OBJECT_COMMA;
+ case JSON_PHASE_VALUE: goto JSON_PHASE_VALUE;
+ case JSON_PHASE_OBJECT_KEY: UNREACHABLE_RETURN(Qundef);
+ case JSON_PHASE_OBJECT_COLON: goto JSON_PHASE_OBJECT_COLON;
}
- break;
+ } else {
+ raise_parse_error("expected ',' or '}' after object value, got: %s", state);
}
-
- default:
- raise_parse_error("unexpected character: %s", state);
- break;
+ UNREACHABLE_RETURN(Qundef);
}
- raise_parse_error("unreachable: %s", state);
+ UNREACHABLE_RETURN(Qundef);
}
static void json_ensure_eof(JSON_ParserState *state)
{
json_eat_whitespace(state);
- if (state->cursor != state->end) {
+ if (!eos(state)) {
raise_parse_error("unexpected token at end of stream %s", state);
}
}
@@ -1290,38 +1788,56 @@ static void json_ensure_eof(JSON_ParserState *state)
static VALUE convert_encoding(VALUE source)
{
- int encindex = RB_ENCODING_GET(source);
+ StringValue(source);
+ int encindex = RB_ENCODING_GET(source);
- if (RB_LIKELY(encindex == utf8_encindex)) {
+ if (RB_LIKELY(encindex == utf8_encindex)) {
+ return source;
+ }
+
+ if (encindex == binary_encindex) {
+ // For historical reason, we silently reinterpret binary strings as UTF-8
+ return rb_enc_associate_index(rb_str_dup(source), utf8_encindex);
+ }
+
+ source = rb_funcall(source, i_encode, 1, Encoding_UTF_8);
+ StringValue(source);
return source;
- }
+}
- if (encindex == binary_encindex) {
- // For historical reason, we silently reinterpret binary strings as UTF-8
- return rb_enc_associate_index(rb_str_dup(source), utf8_encindex);
- }
+struct parser_config_init_args {
+ JSON_ParserConfig *config;
+ VALUE self;
+};
- return rb_funcall(source, i_encode, 1, Encoding_UTF_8);
+static void parser_config_wb_write(VALUE self, VALUE *dest, VALUE val)
+{
+ *dest = val;
+ if (self) RB_OBJ_WRITTEN(self, Qundef, val);
}
static int parser_config_init_i(VALUE key, VALUE val, VALUE data)
{
- JSON_ParserConfig *config = (JSON_ParserConfig *)data;
-
- if (key == sym_max_nesting) { config->max_nesting = RTEST(val) ? FIX2INT(val) : 0; }
- else if (key == sym_allow_nan) { config->allow_nan = RTEST(val); }
- else if (key == sym_allow_trailing_comma) { config->allow_trailing_comma = RTEST(val); }
- else if (key == sym_symbolize_names) { config->symbolize_names = RTEST(val); }
- else if (key == sym_freeze) { config->freeze = RTEST(val); }
- else if (key == sym_on_load) { config->on_load_proc = RTEST(val) ? val : Qfalse; }
- else if (key == sym_allow_duplicate_key) { config->on_duplicate_key = RTEST(val) ? JSON_IGNORE : JSON_RAISE; }
- else if (key == sym_decimal_class) {
+ struct parser_config_init_args *args = (struct parser_config_init_args *)data;
+ JSON_ParserConfig *config = args->config;
+ VALUE self = args->self;
+
+ if (key == sym_max_nesting) { config->max_nesting = RTEST(val) ? FIX2INT(val) : 0; }
+ else if (key == sym_allow_nan) { config->allow_nan = RTEST(val); }
+ else if (key == sym_allow_trailing_comma) { config->allow_trailing_comma = RTEST(val); }
+ else if (key == sym_allow_control_characters) { config->allow_control_characters = RTEST(val); }
+ else if (key == sym_allow_invalid_escape) { config->allow_invalid_escape = RTEST(val); }
+ else if (key == sym_symbolize_names) { config->symbolize_names = RTEST(val); }
+ else if (key == sym_freeze) { config->freeze = RTEST(val); }
+ else if (key == sym_on_load) { parser_config_wb_write(self, &config->on_load_proc, RTEST(val) ? val : Qfalse); }
+ else if (key == sym_allow_duplicate_key) { config->on_duplicate_key = RTEST(val) ? JSON_IGNORE : JSON_RAISE; }
+ else if (key == sym_decimal_class) {
if (RTEST(val)) {
if (rb_respond_to(val, i_try_convert)) {
- config->decimal_class = val;
+ parser_config_wb_write(self, &config->decimal_class, val);
config->decimal_method_id = i_try_convert;
} else if (rb_respond_to(val, i_new)) {
- config->decimal_class = val;
+ parser_config_wb_write(self, &config->decimal_class, val);
config->decimal_method_id = i_new;
} else if (RB_TYPE_P(val, T_CLASS)) {
VALUE name = rb_class_name(val);
@@ -1330,7 +1846,7 @@ static int parser_config_init_i(VALUE key, VALUE val, VALUE data)
if (last_colon) {
const char *mod_path_end = last_colon - 1;
VALUE mod_path = rb_str_substr(name, 0, mod_path_end - name_cstr);
- config->decimal_class = rb_path_to_class(mod_path);
+ parser_config_wb_write(self, &config->decimal_class, rb_path_to_class(mod_path));
const char *method_name_beg = last_colon + 1;
long before_len = method_name_beg - name_cstr;
@@ -1338,7 +1854,7 @@ static int parser_config_init_i(VALUE key, VALUE val, VALUE data)
VALUE method_name = rb_str_substr(name, before_len, len);
config->decimal_method_id = SYM2ID(rb_str_intern(method_name));
} else {
- config->decimal_class = rb_mKernel;
+ parser_config_wb_write(self, &config->decimal_class, rb_mKernel);
config->decimal_method_id = SYM2ID(rb_str_intern(name));
}
}
@@ -1348,16 +1864,21 @@ static int parser_config_init_i(VALUE key, VALUE val, VALUE data)
return ST_CONTINUE;
}
-static void parser_config_init(JSON_ParserConfig *config, VALUE opts)
+static void parser_config_init(JSON_ParserConfig *config, VALUE opts, VALUE self)
{
config->max_nesting = 100;
+ struct parser_config_init_args args = {
+ .config = config,
+ .self = self,
+ };
+
if (!NIL_P(opts)) {
Check_Type(opts, T_HASH);
if (RHASH_SIZE(opts) > 0) {
// We assume in most cases few keys are set so it's faster to go over
// the provided keys than to check all possible keys.
- rb_hash_foreach(opts, parser_config_init_i, (VALUE)config);
+ rb_hash_foreach(opts, parser_config_init_i, (VALUE)&args);
}
}
@@ -1388,36 +1909,62 @@ static void parser_config_init(JSON_ParserConfig *config, VALUE opts)
*/
static VALUE cParserConfig_initialize(VALUE self, VALUE opts)
{
+ rb_check_frozen(self);
GET_PARSER_CONFIG;
- parser_config_init(config, opts);
-
- RB_OBJ_WRITTEN(self, Qundef, config->decimal_class);
+ parser_config_init(config, opts, self);
return self;
}
-static VALUE cParser_parse(JSON_ParserConfig *config, VALUE Vsource)
+static VALUE cParser_parse(JSON_ParserConfig *config, VALUE src)
{
- Vsource = convert_encoding(StringValue(Vsource));
- StringValue(Vsource);
+ VALUE Vsource = convert_encoding(src);
+
+ // Ensure the string isn't mutated under us.
+ // The classic API to use is `rb_str_locktmp`, but then we'd
+ // need to use `rb_protect` to make sure we always unlock.
+ if (Vsource == src) {
+ Vsource = rb_str_new_frozen(Vsource);
+ }
VALUE rvalue_stack_buffer[RVALUE_STACK_INITIAL_CAPA];
- rvalue_stack stack = {
+ rvalue_stack value_stack = {
.type = RVALUE_STACK_STACK_ALLOCATED,
.ptr = rvalue_stack_buffer,
.capa = RVALUE_STACK_INITIAL_CAPA,
};
+ // Seed the frame stack with the root frame, establishing the invariant that
+ // json_parse_any always has a top frame to dispatch on (so the stack is never
+ // empty mid-parse).
+ json_frame frame_stack_buffer[JSON_FRAME_STACK_INITIAL_CAPA];
+ frame_stack_buffer[0] = (json_frame){
+ .type = JSON_FRAME_ROOT,
+ .phase = JSON_PHASE_VALUE,
+ };
+ json_frame_stack frames = {
+ .type = RVALUE_STACK_STACK_ALLOCATED,
+ .ptr = frame_stack_buffer,
+ .capa = JSON_FRAME_STACK_INITIAL_CAPA,
+ .head = 1,
+ };
+
long len;
const char *start;
+
RSTRING_GETMEM(Vsource, start, len);
+ VALUE value_stack_handle = 0;
+ VALUE frame_stack_handle = 0;
JSON_ParserState _state = {
.start = start,
.cursor = start,
.end = start + len,
- .stack = &stack,
+ .value_stack = &value_stack,
+ .value_stack_handle = &value_stack_handle,
+ .frames = &frames,
+ .frame_stack_handle = &frame_stack_handle,
};
JSON_ParserState *state = &_state;
@@ -1425,8 +1972,11 @@ static VALUE cParser_parse(JSON_ParserConfig *config, VALUE Vsource)
// This may be skipped in case of exception, but
// it won't cause a leak.
- rvalue_stack_eagerly_release(state->stack_handle);
-
+ rvalue_stack_eagerly_release(value_stack_handle);
+ json_frame_stack_eagerly_release(frame_stack_handle);
+ RB_GC_GUARD(value_stack_handle);
+ RB_GC_GUARD(frame_stack_handle);
+ RB_GC_GUARD(Vsource);
json_ensure_eof(state);
return result;
@@ -1447,12 +1997,9 @@ static VALUE cParserConfig_parse(VALUE self, VALUE Vsource)
static VALUE cParser_m_parse(VALUE klass, VALUE Vsource, VALUE opts)
{
- Vsource = convert_encoding(StringValue(Vsource));
- StringValue(Vsource);
-
JSON_ParserConfig _config = {0};
JSON_ParserConfig *config = &_config;
- parser_config_init(config, opts);
+ parser_config_init(config, opts, false);
return cParser_parse(config, Vsource);
}
@@ -1460,30 +2007,35 @@ static VALUE cParser_m_parse(VALUE klass, VALUE Vsource, VALUE opts)
static void JSON_ParserConfig_mark(void *ptr)
{
JSON_ParserConfig *config = ptr;
- rb_gc_mark(config->on_load_proc);
- rb_gc_mark(config->decimal_class);
+ rb_gc_mark_movable(config->on_load_proc);
+ rb_gc_mark_movable(config->decimal_class);
}
-static void JSON_ParserConfig_free(void *ptr)
+static size_t JSON_ParserConfig_memsize(const void *ptr)
{
- JSON_ParserConfig *config = ptr;
- ruby_xfree(config);
+#ifdef HAVE_RUBY_TYPED_EMBEDDABLE
+ return 0;
+#else
+ return sizeof(JSON_ParserConfig);
+#endif
}
-static size_t JSON_ParserConfig_memsize(const void *ptr)
+static void JSON_ParserConfig_compact(void *ptr)
{
- return sizeof(JSON_ParserConfig);
+ JSON_ParserConfig *config = ptr;
+ config->on_load_proc = rb_gc_location(config->on_load_proc);
+ config->decimal_class = rb_gc_location(config->decimal_class);
}
static const rb_data_type_t JSON_ParserConfig_type = {
- "JSON::Ext::Parser/ParserConfig",
- {
- JSON_ParserConfig_mark,
- JSON_ParserConfig_free,
- JSON_ParserConfig_memsize,
+ .wrap_struct_name = "JSON::Ext::Parser/ParserConfig",
+ .function = {
+ .dmark = JSON_ParserConfig_mark,
+ .dfree = RUBY_DEFAULT_FREE,
+ .dsize = JSON_ParserConfig_memsize,
+ .dcompact = JSON_ParserConfig_compact,
},
- 0, 0,
- RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED,
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FROZEN_SHAREABLE | RUBY_TYPED_EMBEDDABLE,
};
static VALUE cJSON_parser_s_allocate(VALUE klass)
@@ -1527,16 +2079,14 @@ void Init_parser(void)
sym_max_nesting = ID2SYM(rb_intern("max_nesting"));
sym_allow_nan = ID2SYM(rb_intern("allow_nan"));
sym_allow_trailing_comma = ID2SYM(rb_intern("allow_trailing_comma"));
+ sym_allow_control_characters = ID2SYM(rb_intern("allow_control_characters"));
+ sym_allow_invalid_escape = ID2SYM(rb_intern("allow_invalid_escape"));
sym_symbolize_names = ID2SYM(rb_intern("symbolize_names"));
sym_freeze = ID2SYM(rb_intern("freeze"));
sym_on_load = ID2SYM(rb_intern("on_load"));
sym_decimal_class = ID2SYM(rb_intern("decimal_class"));
sym_allow_duplicate_key = ID2SYM(rb_intern("allow_duplicate_key"));
- i_chr = rb_intern("chr");
- i_aset = rb_intern("[]=");
- i_aref = rb_intern("[]");
- i_leftshift = rb_intern("<<");
i_new = rb_intern("new");
i_try_convert = rb_intern("try_convert");
i_uminus = rb_intern("-@");