diff options
-rw-r--r-- | ChangeLog | 8 | ||||
-rw-r--r-- | ext/psych/parser.c | 109 | ||||
-rw-r--r-- | test/psych/test_encoding.rb | 73 | ||||
-rw-r--r-- | test/psych/test_parser.rb | 1 | ||||
-rw-r--r-- | test/psych/test_tainted.rb | 4 |
5 files changed, 192 insertions, 3 deletions
@@ -1,3 +1,11 @@
+Fri Feb 24 08:08:38 2012  Aaron Patterson  <aaron@tenderlovemaking.com>
+
+	* ext/psych/parser.c: set parser encoding based on the YAML input
+	  rather than user configuration.
+	* test/psych/test_encoding.rb: corresponding tests.
+	* test/psych/test_parser.rb: ditto
+	* test/psych/test_tainted.rb: ditto
+
 Fri Feb 24 07:02:52 2012  Eric Hodel  <drbrain@segment7.net>
 
 	* hash.c (Init_Hash): Add section on how objects are used as Hash keys
diff --git a/ext/psych/parser.c b/ext/psych/parser.c
index b0f4d97916..9820686014 100644
--- a/ext/psych/parser.c
+++ b/ext/psych/parser.c
@@ -75,6 +75,103 @@ static VALUE make_exception(yaml_parser_t * parser, VALUE path)
             parser->context ? rb_usascii_str_new2(parser->context) : Qnil);
 }
 
+#ifdef HAVE_RUBY_ENCODING_H
+/*
+ * Pick the libyaml input encoding for the Ruby string +src+.
+ *
+ * A string tagged UTF-8, UTF-16LE or UTF-16BE is returned as-is and
+ * *parser_encoding is set to the matching YAML_*_ENCODING value.  Any other
+ * string is passed through rb_str_export_to_enc() targeting UTF-8; that
+ * result is returned and *parser_encoding is set to YAML_UTF8_ENCODING.
+ */
+static VALUE transcode_string(VALUE src, int * parser_encoding)
+{
+    int utf8 = rb_utf8_encindex();
+    /* Look the UTF-16 encodings up under the same names transcode_io() uses. */
+    int utf16le = rb_enc_find_index("UTF-16LE");
+    int utf16be = rb_enc_find_index("UTF-16BE");
+    int source_encoding = rb_enc_get_index(src);
+
+    if (source_encoding == utf8) {
+        *parser_encoding = YAML_UTF8_ENCODING;
+        return src;
+    }
+
+    if (source_encoding == utf16le) {
+        *parser_encoding = YAML_UTF16LE_ENCODING;
+        return src;
+    }
+
+    if (source_encoding == utf16be) {
+        *parser_encoding = YAML_UTF16BE_ENCODING;
+        return src;
+    }
+
+    src = rb_str_export_to_enc(src, rb_utf8_encoding());
+    RB_GC_GUARD(src);
+
+    *parser_encoding = YAML_UTF8_ENCODING;
+    return src;
+}
+
+/*
+ * Pick the libyaml input encoding for the IO-like object +src+.
+ *
+ * The encoding is taken from src.external_encoding, with nil treated as
+ * ASCII-8BIT.  US-ASCII and UTF-8 select YAML_UTF8_ENCODING, UTF-16LE and
+ * UTF-16BE select their own constants, and ASCII-8BIT selects
+ * YAML_ANY_ENCODING.  Any other encoding raises ArgumentError.  +src+ is
+ * always returned unchanged.
+ */
+static VALUE transcode_io(VALUE src, int * parser_encoding)
+{
+    VALUE io_external_encoding;
+    int io_external_enc_index;
+
+    io_external_encoding = rb_funcall(src, rb_intern("external_encoding"), 0);
+
+    /* if no encoding is returned, assume ascii8bit. */
+    if (NIL_P(io_external_encoding)) {
+        io_external_enc_index = rb_ascii8bit_encindex();
+    } else {
+        io_external_enc_index = rb_to_encoding_index(io_external_encoding);
+    }
+
+    /* Treat US-ASCII as utf_8 */
+    if (io_external_enc_index == rb_usascii_encindex()) {
+        *parser_encoding = YAML_UTF8_ENCODING;
+        return src;
+    }
+
+    if (io_external_enc_index == rb_utf8_encindex()) {
+        *parser_encoding = YAML_UTF8_ENCODING;
+        return src;
+    }
+
+    if (io_external_enc_index == rb_enc_find_index("UTF-16LE")) {
+        *parser_encoding = YAML_UTF16LE_ENCODING;
+        return src;
+    }
+
+    if (io_external_enc_index == rb_enc_find_index("UTF-16BE")) {
+        *parser_encoding = YAML_UTF16BE_ENCODING;
+        return src;
+    }
+
+    /* Just guess on ASCII-8BIT */
+    if (io_external_enc_index == rb_ascii8bit_encindex()) {
+        *parser_encoding = YAML_ANY_ENCODING;
+        return src;
+    }
+
+    rb_raise(rb_eArgError, "YAML file must be UTF-8, UTF-16LE, or UTF-16BE, not %s",
+        rb_enc_name(rb_enc_from_index(io_external_enc_index)));
+
+    return Qnil; /* not reached; rb_raise() does not return */
+}
+
+#endif
+
 /*
  * call-seq:
  *    parser.parse(yaml)
@@ -91,6 +188,7 @@ static VALUE parse(int argc, VALUE *argv, VALUE self)
     yaml_event_t event;
     int done = 0;
     int tainted = 0;
+    int parser_encoding = YAML_ANY_ENCODING;
 #ifdef HAVE_RUBY_ENCODING_H
     int encoding = rb_utf8_encindex();
     rb_encoding * internal_enc = rb_default_internal_encoding();
@@ -108,15 +206,22 @@ static VALUE parse(int argc, VALUE *argv, VALUE self)
 
     yaml_parser_delete(parser);
     yaml_parser_initialize(parser);
-    yaml_parser_set_encoding(parser, NUM2INT(rb_iv_get(self, "@external_encoding")));
 
     if (OBJ_TAINTED(yaml)) tainted = 1;
 
-    if(rb_respond_to(yaml, id_read)) {
+    if (rb_respond_to(yaml, id_read)) {
+#ifdef HAVE_RUBY_ENCODING_H
+        yaml = transcode_io(yaml, &parser_encoding);
+        yaml_parser_set_encoding(parser, parser_encoding);
+#endif
         yaml_parser_set_input(parser, io_reader, (void *)yaml);
         if (RTEST(rb_obj_is_kind_of(yaml, rb_cIO))) tainted = 1;
     } else {
         StringValue(yaml);
+#ifdef HAVE_RUBY_ENCODING_H
+        yaml = transcode_string(yaml, &parser_encoding);
+        yaml_parser_set_encoding(parser, parser_encoding);
+#endif
         yaml_parser_set_input_string(
                 parser,
                 (const unsigned char *)RSTRING_PTR(yaml),
diff --git a/test/psych/test_encoding.rb b/test/psych/test_encoding.rb
index a341c47859..8efb676d9a 100644
--- a/test/psych/test_encoding.rb
+++ b/test/psych/test_encoding.rb
@@ -31,6 +31,79 @@ module Psych
       @emitter = Psych::Emitter.new @buffer
     end
 
+    def test_transcode_shiftjis
+      str = "こんにちは!"
+      loaded = Psych.load("--- こんにちは!".encode('SHIFT_JIS'))
+      assert_equal str, loaded
+    end
+
+    def test_transcode_utf16le
+      str = "こんにちは!"
+      loaded = Psych.load("--- こんにちは!".encode('UTF-16LE'))
+      assert_equal str, loaded
+    end
+
+    def test_transcode_utf16be
+      str = "こんにちは!"
+      loaded = Psych.load("--- こんにちは!".encode('UTF-16BE'))
+      assert_equal str, loaded
+    end
+
+    def test_io_shiftjis
+      t = Tempfile.new(['shiftjis', 'yml'], :encoding => 'SHIFT_JIS')
+      t.write '--- こんにちは!'
+      t.close
+
+      # If the external encoding isn't utf8, utf16le, or utf16be, we cannot
+      # process the file.
+      File.open(t.path, 'r', :encoding => 'SHIFT_JIS') do |f|
+        assert_raises ArgumentError do
+          Psych.load(f)
+        end
+      end
+
+      t.close(true)
+    end
+
+    def test_io_utf16le
+      t = Tempfile.new(['utf16le', 'yml'])
+      t.binmode
+      t.write '--- こんにちは!'.encode('UTF-16LE')
+      t.close
+
+      File.open(t.path, 'rb', :encoding => 'UTF-16LE') do |f|
+        assert_equal "こんにちは!", Psych.load(f)
+      end
+
+      t.close(true)
+    end
+
+    def test_io_utf16be
+      t = Tempfile.new(['utf16be', 'yml'])
+      t.binmode
+      t.write '--- こんにちは!'.encode('UTF-16BE')
+      t.close
+
+      File.open(t.path, 'rb', :encoding => 'UTF-16BE') do |f|
+        assert_equal "こんにちは!", Psych.load(f)
+      end
+
+      t.close(true)
+    end
+
+    def test_io_utf8
+      t = Tempfile.new(['utf8', 'yml'])
+      t.binmode
+      t.write '--- こんにちは!'.encode('UTF-8')
+      t.close
+
+      File.open(t.path, 'rb', :encoding => 'UTF-8') do |f|
+        assert_equal "こんにちは!", Psych.load(f)
+      end
+
+      t.close(true)
+    end
+
     def test_emit_alias
       @emitter.start_stream Psych::Parser::UTF8
       @emitter.start_document [], [], true
diff --git a/test/psych/test_parser.rb b/test/psych/test_parser.rb
index a491d7fdd6..cfbfb61693 100644
--- a/test/psych/test_parser.rb
+++ b/test/psych/test_parser.rb
@@ -112,6 +112,7 @@ module Psych
 
     def test_bogus_io
       o = Object.new
+      def o.external_encoding; nil end
       def o.read len; self end
 
       assert_raises(TypeError) do
diff --git a/test/psych/test_tainted.rb b/test/psych/test_tainted.rb
index bf55d3b30e..00d220e825 100644
--- a/test/psych/test_tainted.rb
+++ b/test/psych/test_tainted.rb
@@ -121,7 +121,9 @@ module Psych
       t.binmode
       t.write string
       t.close
-      File.open(t.path) { |f| @parser.parse f }
+      File.open(t.path) { |f|
+        @parser.parse f
+      }
       t.close(true)
     end
   end