summary refs log tree commit diff
diff options
context:
space:
mode:
author Jean Boussier <jean.boussier@gmail.com> 2025-01-31 20:09:12 +0100
committer Hiroshi SHIBATA <hsbt@ruby-lang.org> 2025-02-03 10:05:26 +0900
commit 98c56de8237cd760200a1e5f11025a5aee7ed15f (patch)
tree 246d0458715a0245e5f0f6337060990839df6b39
parent 98e1c2845a8361b69820c41b05eddbe5dbf8cf58 (diff)
[ruby/json] Refactor further to expose the simpler escape search possible
https://github.com/ruby/json/commit/e03515ac8b
-rw-r--r-- ext/json/generator/generator.c 240
1 files changed, 143 insertions, 97 deletions
diff --git a/ext/json/generator/generator.c b/ext/json/generator/generator.c
index 1bd6af6ed7..b2fcd2b294 100644
--- a/ext/json/generator/generator.c
+++ b/ext/json/generator/generator.c
@@ -103,20 +103,20 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...)
static const unsigned char CHAR_LENGTH_MASK = 7;
static const unsigned char ESCAPE_MASK = 8;
-static const unsigned char escape_table[256] = {
- // ASCII Control Characters
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- // ASCII Characters
- 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // '"'
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-};
+typedef struct _search_state {
+ const char *ptr;
+ const char *end;
+ const char *cursor;
+ FBuffer *buffer;
+} search_state;
-static const unsigned char ascii_only_escape_table[256] = {
+static inline void search_flush(search_state *search)
+{
+ fbuffer_append(search->buffer, search->cursor, search->ptr - search->cursor);
+ search->cursor = search->ptr;
+}
+
+static const unsigned char escape_table_basic[256] = {
// ASCII Control Characters
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
@@ -127,20 +127,105 @@ static const unsigned char ascii_only_escape_table[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- // Continuation byte
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- // First byte of a 2-byte code point
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- // First byte of a 3-byte code point
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- //First byte of a 4+ byte code point
- 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
};
+static inline unsigned char search_escape_basic(search_state *search)
+{
+ while (search->ptr < search->end) {
+ if (RB_UNLIKELY(escape_table_basic[(const unsigned char)*search->ptr])) {
+ search_flush(search);
+ return 1;
+ } else {
+ search->ptr++;
+ }
+ }
+ search_flush(search);
+ return 0;
+}
+
+static inline void escape_UTF8_char_basic(search_state *search) {
+ const unsigned char ch = (unsigned char)*search->ptr;
+ switch (ch) {
+ case '"': fbuffer_append(search->buffer, "\\\"", 2); break;
+ case '\\': fbuffer_append(search->buffer, "\\\\", 2); break;
+ case '/': fbuffer_append(search->buffer, "\\/", 2); break;
+ case '\b': fbuffer_append(search->buffer, "\\b", 2); break;
+ case '\f': fbuffer_append(search->buffer, "\\f", 2); break;
+ case '\n': fbuffer_append(search->buffer, "\\n", 2); break;
+ case '\r': fbuffer_append(search->buffer, "\\r", 2); break;
+ case '\t': fbuffer_append(search->buffer, "\\t", 2); break;
+ default: {
+ const char *hexdig = "0123456789abcdef";
+ char scratch[6] = { '\\', 'u', '0', '0', 0, 0 };
+ scratch[4] = hexdig[(ch >> 4) & 0xf];
+ scratch[5] = hexdig[ch & 0xf];
+ fbuffer_append(search->buffer, scratch, 6);
+ break;
+ }
+ }
+ search->ptr++;
+ search->cursor = search->ptr;
+}
+
+/* Converts in_string to a JSON string (without the wrapping '"'
+ * characters) in FBuffer out_buffer.
+ *
+ * Character are JSON-escaped according to:
+ *
+ * - Always: ASCII control characters (0x00-0x1F), dquote, and
+ * backslash.
+ *
+ * - If out_ascii_only: non-ASCII characters (>0x7F)
+ *
+ * - If script_safe: forwardslash (/), line separator (U+2028), and
+ * paragraph separator (U+2029)
+ *
+ * Everything else (should be UTF-8) is just passed through and
+ * appended to the result.
+ */
+static inline void convert_UTF8_to_JSON(search_state *search)
+{
+ while (search_escape_basic(search)) {
+ escape_UTF8_char_basic(search);
+ }
+}
+
+static inline void escape_UTF8_char(search_state *search, unsigned char ch_len) {
+ const unsigned char ch = (unsigned char)*search->ptr;
+ switch (ch_len) {
+ case 1: {
+ switch (ch) {
+ case '"': fbuffer_append(search->buffer, "\\\"", 2); break;
+ case '\\': fbuffer_append(search->buffer, "\\\\", 2); break;
+ case '/': fbuffer_append(search->buffer, "\\/", 2); break;
+ case '\b': fbuffer_append(search->buffer, "\\b", 2); break;
+ case '\f': fbuffer_append(search->buffer, "\\f", 2); break;
+ case '\n': fbuffer_append(search->buffer, "\\n", 2); break;
+ case '\r': fbuffer_append(search->buffer, "\\r", 2); break;
+ case '\t': fbuffer_append(search->buffer, "\\t", 2); break;
+ default: {
+ const char *hexdig = "0123456789abcdef";
+ char scratch[6] = { '\\', 'u', '0', '0', 0, 0 };
+ scratch[4] = hexdig[(ch >> 4) & 0xf];
+ scratch[5] = hexdig[ch & 0xf];
+ fbuffer_append(search->buffer, scratch, 6);
+ break;
+ }
+ }
+ break;
+ }
+ case 3: {
+ if (search->ptr[2] & 1) {
+ fbuffer_append(search->buffer, "\\u2029", 6);
+ } else {
+ fbuffer_append(search->buffer, "\\u2028", 6);
+ }
+ break;
+ }
+ }
+ search->cursor = (search->ptr += ch_len);
+}
+
static const unsigned char script_safe_escape_table[256] = {
// ASCII Control Characters
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
@@ -166,25 +251,11 @@ static const unsigned char script_safe_escape_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
};
-
-typedef struct _search_state {
- const char *ptr;
- const char *end;
- const char *cursor;
- FBuffer *buffer;
-} search_state;
-
-static inline void search_flush(search_state *search)
-{
- fbuffer_append(search->buffer, search->cursor, search->ptr - search->cursor);
- search->cursor = search->ptr;
-}
-
-static inline unsigned char search_escape(search_state *search, const unsigned char escape_table[256])
+static inline unsigned char search_script_safe_escape(search_state *search)
{
while (search->ptr < search->end) {
unsigned char ch = (unsigned char)*search->ptr;
- unsigned char ch_len = escape_table[ch];
+ unsigned char ch_len = script_safe_escape_table[ch];
if (RB_UNLIKELY(ch_len)) {
if (ch_len & ESCAPE_MASK) {
@@ -208,66 +279,39 @@ static inline unsigned char search_escape(search_state *search, const unsigned c
return 0;
}
-static inline void fast_escape_UTF8_char(search_state *search, unsigned char ch_len) {
- const unsigned char ch = (unsigned char)*search->ptr;
- switch (ch_len) {
- case 1: {
- switch (ch) {
- case '"': fbuffer_append(search->buffer, "\\\"", 2); break;
- case '\\': fbuffer_append(search->buffer, "\\\\", 2); break;
- case '/': fbuffer_append(search->buffer, "\\/", 2); break;
- case '\b': fbuffer_append(search->buffer, "\\b", 2); break;
- case '\f': fbuffer_append(search->buffer, "\\f", 2); break;
- case '\n': fbuffer_append(search->buffer, "\\n", 2); break;
- case '\r': fbuffer_append(search->buffer, "\\r", 2); break;
- case '\t': fbuffer_append(search->buffer, "\\t", 2); break;
- default: {
- const char *hexdig = "0123456789abcdef";
- char scratch[6] = { '\\', 'u', '0', '0', 0, 0 };
- scratch[4] = hexdig[(ch >> 4) & 0xf];
- scratch[5] = hexdig[ch & 0xf];
- fbuffer_append(search->buffer, scratch, 6);
- break;
- }
- }
- break;
- }
- case 3: {
- if (search->ptr[2] & 1) {
- fbuffer_append(search->buffer, "\\u2029", 6);
- } else {
- fbuffer_append(search->buffer, "\\u2028", 6);
- }
- break;
- }
- }
- search->cursor = (search->ptr += ch_len);
-}
-
-/* Converts in_string to a JSON string (without the wrapping '"'
- * characters) in FBuffer out_buffer.
- *
- * Character are JSON-escaped according to:
- *
- * - Always: ASCII control characters (0x00-0x1F), dquote, and
- * backslash.
- *
- * - If out_ascii_only: non-ASCII characters (>0x7F)
- *
- * - If script_safe: forwardslash (/), line separator (U+2028), and
- * paragraph separator (U+2029)
- *
- * Everything else (should be UTF-8) is just passed through and
- * appended to the result.
- */
-static inline void convert_UTF8_to_JSON(search_state *search, const unsigned char escape_table[256])
+static void convert_UTF8_to_script_safe_JSON(search_state *search)
{
unsigned char ch_len;
- while ((ch_len = search_escape(search, escape_table))) {
- fast_escape_UTF8_char(search, ch_len);
+ while ((ch_len = search_script_safe_escape(search))) {
+ escape_UTF8_char(search, ch_len);
}
}
+static const unsigned char ascii_only_escape_table[256] = {
+ // ASCII Control Characters
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ // ASCII Characters
+ 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // '"'
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ // Continuation byte
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ // First byte of a 2-byte code point
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ // First byte of a 3-byte code point
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ //First byte of a 4+ byte code point
+ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
+};
+
static inline unsigned char search_ascii_only_escape(search_state *search, const unsigned char escape_table[256])
{
while (search->ptr < search->end) {
@@ -934,8 +978,10 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat
case ENC_CODERANGE_VALID:
if (RB_UNLIKELY(state->ascii_only)) {
convert_UTF8_to_ASCII_only_JSON(&search, state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
+ } else if (RB_UNLIKELY(state->script_safe)) {
+ convert_UTF8_to_script_safe_JSON(&search);
} else {
- convert_UTF8_to_JSON(&search, state->script_safe ? script_safe_escape_table : escape_table);
+ convert_UTF8_to_JSON(&search);
}
break;
default: