summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- ChangeLog 8
-rw-r--r-- ext/psych/parser.c 91
-rw-r--r-- test/psych/test_encoding.rb 73
-rw-r--r-- test/psych/test_parser.rb 1
-rw-r--r-- test/psych/test_tainted.rb 4
5 files changed, 174 insertions, 3 deletions
diff --git a/ChangeLog b/ChangeLog
index 5d91040cdd..82bb331117 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+Fri Feb 24 08:08:38 2012 Aaron Patterson <aaron@tenderlovemaking.com>
+
+ * ext/psych/parser.c: set parser encoding based on the YAML input
+ rather than user configuration.
+ * test/psych/test_encoding.rb: corresponding tests.
+ * test/psych/test_parser.rb: ditto
+ * test/psych/test_tainted.rb: ditto
+
Fri Feb 24 07:02:52 2012 Eric Hodel <drbrain@segment7.net>
* hash.c (Init_Hash): Add section on how objects are used as Hash keys
diff --git a/ext/psych/parser.c b/ext/psych/parser.c
index b0f4d97916..9820686014 100644
--- a/ext/psych/parser.c
+++ b/ext/psych/parser.c
@@ -75,6 +75,85 @@ static VALUE make_exception(yaml_parser_t * parser, VALUE path)
parser->context ? rb_usascii_str_new2(parser->context) : Qnil);
}
+#ifdef HAVE_RUBY_ENCODING_H
+/*
+ * Pick the libyaml input encoding for a String source.
+ *
+ * src             - Ruby String holding the YAML document.
+ * parser_encoding - out parameter; set to the YAML_*_ENCODING value that
+ *                   matches the string that is returned.
+ *
+ * Strings tagged UTF-8, UTF-16LE or UTF-16BE are returned unchanged with
+ * the corresponding parser encoding.  Any other string is converted to
+ * UTF-8 via rb_str_export_to_enc() and the converted string is returned
+ * with YAML_UTF8_ENCODING.
+ */
+static VALUE transcode_string(VALUE src, int * parser_encoding)
+{
+    int utf8 = rb_utf8_encindex();
+    /* Look the encodings up by their canonical names ("UTF-16LE" /
+     * "UTF-16BE"), the same spelling transcode_io() uses.  The previous
+     * "UTF16_LE" / "UTF16_BE" spellings do not appear to be registered
+     * encoding names, so the UTF-16 branches below could never match. */
+    int utf16le = rb_enc_find_index("UTF-16LE");
+    int utf16be = rb_enc_find_index("UTF-16BE");
+    int source_encoding = rb_enc_get_index(src);
+
+    if (source_encoding == utf8) {
+        *parser_encoding = YAML_UTF8_ENCODING;
+        return src;
+    }
+
+    if (source_encoding == utf16le) {
+        *parser_encoding = YAML_UTF16LE_ENCODING;
+        return src;
+    }
+
+    if (source_encoding == utf16be) {
+        *parser_encoding = YAML_UTF16BE_ENCODING;
+        return src;
+    }
+
+    /* Anything else: hand libyaml a UTF-8 copy of the document. */
+    src = rb_str_export_to_enc(src, rb_utf8_encoding());
+    RB_GC_GUARD(src);
+
+    *parser_encoding = YAML_UTF8_ENCODING;
+    return src;
+}
+
+/*
+ * Pick the libyaml input encoding for an IO-like source, based on the
+ * value returned by its external_encoding method.
+ *
+ * src             - object responding to external_encoding; it is returned
+ *                   as-is on every non-raising path.
+ * parser_encoding - out parameter; set to the chosen YAML_*_ENCODING.
+ *
+ * Raises ArgumentError when the external encoding is not US-ASCII, UTF-8,
+ * UTF-16LE, UTF-16BE or ASCII-8BIT.
+ */
+static VALUE transcode_io(VALUE src, int * parser_encoding)
+{
+    VALUE ext_enc = rb_funcall(src, rb_intern("external_encoding"), 0);
+    /* if no encoding is returned, assume ascii8bit. */
+    int enc_idx = NIL_P(ext_enc) ? rb_ascii8bit_encindex()
+                                 : rb_to_encoding_index(ext_enc);
+
+    if (enc_idx == rb_usascii_encindex() || enc_idx == rb_utf8_encindex()) {
+        /* Treat US-ASCII as utf_8 */
+        *parser_encoding = YAML_UTF8_ENCODING;
+    }
+    else if (enc_idx == rb_enc_find_index("UTF-16LE")) {
+        *parser_encoding = YAML_UTF16LE_ENCODING;
+    }
+    else if (enc_idx == rb_enc_find_index("UTF-16BE")) {
+        *parser_encoding = YAML_UTF16BE_ENCODING;
+    }
+    else if (enc_idx == rb_ascii8bit_encindex()) {
+        /* Just guess on ASCII-8BIT */
+        *parser_encoding = YAML_ANY_ENCODING;
+    }
+    else {
+        rb_raise(rb_eArgError, "YAML file must be UTF-8, UTF-16LE, or UTF-16BE, not %s",
+                 rb_enc_name(rb_enc_from_index(enc_idx)));
+    }
+
+    return src;
+}
+
+#endif
+
/*
* call-seq:
* parser.parse(yaml)
@@ -91,6 +170,7 @@ static VALUE parse(int argc, VALUE *argv, VALUE self)
yaml_event_t event;
int done = 0;
int tainted = 0;
+ int parser_encoding = YAML_ANY_ENCODING;
#ifdef HAVE_RUBY_ENCODING_H
int encoding = rb_utf8_encindex();
rb_encoding * internal_enc = rb_default_internal_encoding();
@@ -108,15 +188,22 @@ static VALUE parse(int argc, VALUE *argv, VALUE self)
yaml_parser_delete(parser);
yaml_parser_initialize(parser);
- yaml_parser_set_encoding(parser, NUM2INT(rb_iv_get(self, "@external_encoding")));
if (OBJ_TAINTED(yaml)) tainted = 1;
- if(rb_respond_to(yaml, id_read)) {
+ if (rb_respond_to(yaml, id_read)) {
+#ifdef HAVE_RUBY_ENCODING_H
+ yaml = transcode_io(yaml, &parser_encoding);
+ yaml_parser_set_encoding(parser, parser_encoding);
+#endif
yaml_parser_set_input(parser, io_reader, (void *)yaml);
if (RTEST(rb_obj_is_kind_of(yaml, rb_cIO))) tainted = 1;
} else {
StringValue(yaml);
+#ifdef HAVE_RUBY_ENCODING_H
+ yaml = transcode_string(yaml, &parser_encoding);
+ yaml_parser_set_encoding(parser, parser_encoding);
+#endif
yaml_parser_set_input_string(
parser,
(const unsigned char *)RSTRING_PTR(yaml),
diff --git a/test/psych/test_encoding.rb b/test/psych/test_encoding.rb
index a341c47859..8efb676d9a 100644
--- a/test/psych/test_encoding.rb
+++ b/test/psych/test_encoding.rb
@@ -31,6 +31,79 @@ module Psych
@emitter = Psych::Emitter.new @buffer
end
+ def test_transcode_shiftjis
+ str = "こんにちは!"
+ loaded = Psych.load("--- こんにちは!".encode('SHIFT_JIS'))
+ assert_equal str, loaded
+ end
+
+ def test_transcode_utf16le
+ str = "こんにちは!"
+ loaded = Psych.load("--- こんにちは!".encode('UTF-16LE'))
+ assert_equal str, loaded
+ end
+
+ def test_transcode_utf16be
+ str = "こんにちは!"
+ loaded = Psych.load("--- こんにちは!".encode('UTF-16BE'))
+ assert_equal str, loaded
+ end
+
+ def test_io_shiftjis
+ t = Tempfile.new(['shiftjis', 'yml'], :encoding => 'SHIFT_JIS')
+ t.write '--- こんにちは!'
+ t.close
+
+ # If the external encoding isn't utf8, utf16le, or utf16be, we cannot
+ # process the file.
+ File.open(t.path, 'r', :encoding => 'SHIFT_JIS') do |f|
+ assert_raises ArgumentError do
+ Psych.load(f)
+ end
+ end
+
+ t.close(true)
+ end
+
+ def test_io_utf16le
+ t = Tempfile.new(['utf16le', 'yml'])
+ t.binmode
+ t.write '--- こんにちは!'.encode('UTF-16LE')
+ t.close
+
+ File.open(t.path, 'rb', :encoding => 'UTF-16LE') do |f|
+ assert_equal "こんにちは!", Psych.load(f)
+ end
+
+ t.close(true)
+ end
+
+ def test_io_utf16be
+ t = Tempfile.new(['utf16be', 'yml'])
+ t.binmode
+ t.write '--- こんにちは!'.encode('UTF-16BE')
+ t.close
+
+ File.open(t.path, 'rb', :encoding => 'UTF-16BE') do |f|
+ assert_equal "こんにちは!", Psych.load(f)
+ end
+
+ t.close(true)
+ end
+
+ def test_io_utf8
+ t = Tempfile.new(['utf8', 'yml'])
+ t.binmode
+ t.write '--- こんにちは!'.encode('UTF-8')
+ t.close
+
+ File.open(t.path, 'rb', :encoding => 'UTF-8') do |f|
+ assert_equal "こんにちは!", Psych.load(f)
+ end
+
+ t.close(true)
+ end
+
def test_emit_alias
@emitter.start_stream Psych::Parser::UTF8
@emitter.start_document [], [], true
diff --git a/test/psych/test_parser.rb b/test/psych/test_parser.rb
index a491d7fdd6..cfbfb61693 100644
--- a/test/psych/test_parser.rb
+++ b/test/psych/test_parser.rb
@@ -112,6 +112,7 @@ module Psych
def test_bogus_io
o = Object.new
+ def o.external_encoding; nil end
def o.read len; self end
assert_raises(TypeError) do
diff --git a/test/psych/test_tainted.rb b/test/psych/test_tainted.rb
index bf55d3b30e..00d220e825 100644
--- a/test/psych/test_tainted.rb
+++ b/test/psych/test_tainted.rb
@@ -121,7 +121,9 @@ module Psych
t.binmode
t.write string
t.close
- File.open(t.path) { |f| @parser.parse f }
+ File.open(t.path) { |f|
+ @parser.parse f
+ }
t.close(true)
end
end