summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Jean Boussier <jean.boussier@gmail.com> 2024-10-21 22:19:44 +0200
committer Hiroshi SHIBATA <hsbt@ruby-lang.org> 2024-10-26 18:44:15 +0900
commit fc9f0cb8c5ead74455370f2ae409bc4a6b1dedc0 (patch)
tree f289961afc78ca5903e306e7d17cad870f3e9a46
parent 2c6e3bc71e12e12ad6949502e2b161171ca56840 (diff)
[ruby/json] JSON.dump / String#to_json: raise on invalid encoding
This regressed since 2.7.2. https://github.com/ruby/json/commit/35407d6635
-rw-r--r-- ext/json/generator/generator.c 40
-rw-r--r-- ext/json/parser/parser.c 19
-rw-r--r-- ext/json/parser/parser.rl 3
-rwxr-xr-x test/json/json_generator_test.rb 14
4 files changed, 52 insertions, 24 deletions
diff --git a/ext/json/generator/generator.c b/ext/json/generator/generator.c
index cb05453244..c35e86d9b8 100644
--- a/ext/json/generator/generator.c
+++ b/ext/json/generator/generator.c
@@ -5,9 +5,9 @@
#define RB_UNLIKELY(cond) (cond)
#endif
-static VALUE mJSON, cState, mString_Extend, eGeneratorError, eNestingError;
+static VALUE mJSON, cState, mString_Extend, eGeneratorError, eNestingError, Encoding_UTF_8;
-static ID i_to_s, i_to_json, i_new, i_pack, i_unpack, i_create_id, i_extend;
+static ID i_to_s, i_to_json, i_new, i_pack, i_unpack, i_create_id, i_extend, i_encode;
/* Converts in_string to a JSON string (without the wrapping '"'
* characters) in FBuffer out_buffer.
@@ -735,20 +735,41 @@ static void generate_json_array(FBuffer *buffer, VALUE Vstate, JSON_Generator_St
fbuffer_append_char(buffer, ']');
}
-static int usascii_encindex, utf8_encindex;
+static int usascii_encindex, utf8_encindex, binary_encindex;
-static int enc_utf8_compatible_p(int enc_idx)
+static inline int enc_utf8_compatible_p(int enc_idx) // 1 when enc_idx is the US-ASCII or UTF-8 encoding index, otherwise 0
{
if (enc_idx == usascii_encindex) return 1;
if (enc_idx == utf8_encindex) return 1;
return 0;
}
+static inline VALUE ensure_valid_encoding(VALUE str) // Returns str itself, a UTF-8-labelled copy, or the result of str.encode(Encoding::UTF_8).
+{
+ int encindex = RB_ENCODING_GET(str); // encoding index of the incoming string
+ VALUE utf8_string;
+ if (RB_UNLIKELY(!enc_utf8_compatible_p(encindex))) { // neither US-ASCII nor UTF-8
+ if (encindex == binary_encindex) {
+ // For historical reasons, we silently reinterpret binary strings as UTF-8 if it would work.
+ // TODO: Deprecate in 2.8.0
+ // TODO: Remove in 3.0.0
+ utf8_string = rb_enc_associate_index(rb_str_dup(str), utf8_encindex); // relabel a duplicate as UTF-8 rather than str itself
+ switch (rb_enc_str_coderange(utf8_string)) {
+ case ENC_CODERANGE_7BIT:
+ case ENC_CODERANGE_VALID:
+ return utf8_string; // the duplicate's code range is acceptable under the UTF-8 label
+ break;
+ } // any other code range falls through to the encode call below
+ }
+
+ str = rb_funcall(str, i_encode, 1, Encoding_UTF_8); // str.encode(Encoding::UTF_8); the tests in this change expect Encoding::UndefinedConversionError for invalid binary input
+ }
+ return str; // returned as-is when the encoding was already US-ASCII or UTF-8
+}
+
static void generate_json_string(FBuffer *buffer, VALUE Vstate, JSON_Generator_State *state, VALUE obj)
{
- if (!enc_utf8_compatible_p(RB_ENCODING_GET(obj))) {
- obj = rb_str_export_to_enc(obj, rb_utf8_encoding());
- }
+ obj = ensure_valid_encoding(obj);
fbuffer_append_char(buffer, '"');
@@ -1462,6 +1483,9 @@ void Init_generator(void)
VALUE mNilClass = rb_define_module_under(mGeneratorMethods, "NilClass");
rb_define_method(mNilClass, "to_json", mNilClass_to_json, -1);
+ rb_global_variable(&Encoding_UTF_8);
+ Encoding_UTF_8 = rb_const_get(rb_path2class("Encoding"), rb_intern("UTF_8"));
+
i_to_s = rb_intern("to_s");
i_to_json = rb_intern("to_json");
i_new = rb_intern("new");
@@ -1469,7 +1493,9 @@ void Init_generator(void)
i_unpack = rb_intern("unpack");
i_create_id = rb_intern("create_id");
i_extend = rb_intern("extend");
+ i_encode = rb_intern("encode");
usascii_encindex = rb_usascii_encindex();
utf8_encindex = rb_utf8_encindex();
+ binary_encindex = rb_ascii8bit_encindex();
}
diff --git a/ext/json/parser/parser.c b/ext/json/parser/parser.c
index 62effbd0a5..cf0b3cefa4 100644
--- a/ext/json/parser/parser.c
+++ b/ext/json/parser/parser.c
@@ -1794,6 +1794,9 @@ static VALUE convert_encoding(VALUE source)
}
if (encindex == binary_encindex) {
+ // For historical reasons, we silently reinterpret binary strings as UTF-8 if it would work.
+ // TODO: Deprecate in 2.8.0
+ // TODO: Remove in 3.0.0
return rb_enc_associate_index(rb_str_dup(source), utf8_encindex);
}
@@ -1943,7 +1946,7 @@ static VALUE cParser_initialize(int argc, VALUE *argv, VALUE self)
}
-#line 1947 "parser.c"
+#line 1950 "parser.c"
enum {JSON_start = 1};
enum {JSON_first_final = 10};
enum {JSON_error = 0};
@@ -1951,7 +1954,7 @@ enum {JSON_error = 0};
enum {JSON_en_main = 1};
-#line 855 "parser.rl"
+#line 858 "parser.rl"
/*
@@ -1969,16 +1972,16 @@ static VALUE cParser_parse(VALUE self)
GET_PARSER;
-#line 1973 "parser.c"
+#line 1976 "parser.c"
{
cs = JSON_start;
}
-#line 872 "parser.rl"
+#line 875 "parser.rl"
p = json->source;
pe = p + json->len;
-#line 1982 "parser.c"
+#line 1985 "parser.c"
{
if ( p == pe )
goto _test_eof;
@@ -2012,7 +2015,7 @@ st0:
cs = 0;
goto _out;
tr2:
-#line 847 "parser.rl"
+#line 850 "parser.rl"
{
char *np = JSON_parse_value(json, p, pe, &result, 0);
if (np == NULL) { p--; {p++; cs = 10; goto _out;} } else {p = (( np))-1;}
@@ -2022,7 +2025,7 @@ st10:
if ( ++p == pe )
goto _test_eof10;
case 10:
-#line 2026 "parser.c"
+#line 2029 "parser.c"
switch( (*p) ) {
case 13: goto st10;
case 32: goto st10;
@@ -2111,7 +2114,7 @@ case 9:
_out: {}
}
-#line 875 "parser.rl"
+#line 878 "parser.rl"
if (cs >= JSON_first_final && p == pe) {
return result;
diff --git a/ext/json/parser/parser.rl b/ext/json/parser/parser.rl
index 6ebb2f6fac..73f81341ab 100644
--- a/ext/json/parser/parser.rl
+++ b/ext/json/parser/parser.rl
@@ -689,6 +689,9 @@ static VALUE convert_encoding(VALUE source)
}
if (encindex == binary_encindex) {
+ // For historical reasons, we silently reinterpret binary strings as UTF-8 if it would work.
+ // TODO: Deprecate in 2.8.0
+ // TODO: Remove in 3.0.0
return rb_enc_associate_index(rb_str_dup(source), utf8_encindex);
}
diff --git a/test/json/json_generator_test.rb b/test/json/json_generator_test.rb
index 6730898d4f..7dc45e3a52 100755
--- a/test/json/json_generator_test.rb
+++ b/test/json/json_generator_test.rb
@@ -449,16 +449,12 @@ EOT
end
assert_includes error.message, "source sequence is illegal/malformed utf-8"
- # These pass on the pure-Ruby generator but not with the native extension
- # https://github.com/ruby/json/issues/634
- if defined?(JSON::Pure)
- assert_raise(Encoding::UndefinedConversionError) do
- "\x82\xAC\xEF".b.to_json
- end
+ assert_raise(Encoding::UndefinedConversionError) do
+ "\x82\xAC\xEF".b.to_json
+ end
- assert_raise(Encoding::UndefinedConversionError) do
- JSON.dump("\x82\xAC\xEF".b)
- end
+ assert_raise(Encoding::UndefinedConversionError) do
+ JSON.dump("\x82\xAC\xEF".b)
end
end