summaryrefslogtreecommitdiff
path: root/ext/psych/psych_parser.c
diff options
context:
space:
mode:
authortenderlove <tenderlove@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2012-11-28 04:34:41 +0000
committertenderlove <tenderlove@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2012-11-28 04:34:41 +0000
commit14099149f403f4cfbaa4cd673f04bf20cc3c5ac5 (patch)
treeff14c2027bb9d2ae751580452c8b266bc27fca65 /ext/psych/psych_parser.c
parent694c77633c80a106a1e526870ad5076bfac30640 (diff)
* ext/psych/extconf.rb: use embedded libyaml if no system libyaml is
found. [ruby-core:49463] * ext/psych/lib/psych.rb: updating to psych 2.0.0 * ext/psych/lib/psych/deprecated.rb: updated docs * ext/psych/psych.gemspec: updated to psych 2.0.0 * ext/psych/psych.h: fixing header file include for rename * ext/psych/psych_emitter.c: renamed to avoid libyaml conflict. * ext/psych/psych_emitter.h: ditto * ext/psych/psych_parser.c: ditto * ext/psych/psych_parser.h: ditto * ext/psych/psych_to_ruby.c: ditto * ext/psych/psych_to_ruby.h: ditto * ext/psych/psych_yaml_tree.c: ditto * ext/psych/psych_yaml_tree.h: ditto * ext/psych/yaml/LICENSE: embedding libyaml 0.1.4 * ext/psych/yaml/api.c: ditto * ext/psych/yaml/config.h: ditto * ext/psych/yaml/dumper.c: ditto * ext/psych/yaml/emitter.c: ditto * ext/psych/yaml/loader.c: ditto * ext/psych/yaml/parser.c: ditto * ext/psych/yaml/reader.c: ditto * ext/psych/yaml/scanner.c: ditto * ext/psych/yaml/writer.c: ditto * ext/psych/yaml/yaml.h: ditto * ext/psych/yaml/yaml_private.h: ditto git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@37919 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'ext/psych/psych_parser.c')
-rw-r--r--ext/psych/psych_parser.c579
1 files changed, 579 insertions, 0 deletions
diff --git a/ext/psych/psych_parser.c b/ext/psych/psych_parser.c
new file mode 100644
index 0000000000..8c65ce1307
--- /dev/null
+++ b/ext/psych/psych_parser.c
@@ -0,0 +1,579 @@
+#include <psych.h>
+
+VALUE cPsychParser;
+VALUE ePsychSyntaxError;
+
+static ID id_read;
+static ID id_path;
+static ID id_empty;
+static ID id_start_stream;
+static ID id_end_stream;
+static ID id_start_document;
+static ID id_end_document;
+static ID id_alias;
+static ID id_scalar;
+static ID id_start_sequence;
+static ID id_end_sequence;
+static ID id_start_mapping;
+static ID id_end_mapping;
+
+#define PSYCH_TRANSCODE(_str, _yaml_enc, _internal_enc) \
+ do { \
+ rb_enc_associate_index((_str), (_yaml_enc)); \
+ if(_internal_enc) \
+ (_str) = rb_str_export_to_enc((_str), (_internal_enc)); \
+ } while (0)
+
+static int io_reader(void * data, unsigned char *buf, size_t size, size_t *read)
+{
+ VALUE io = (VALUE)data;
+ VALUE string = rb_funcall(io, id_read, 1, INT2NUM(size));
+
+ *read = 0;
+
+ if(! NIL_P(string)) {
+ void * str = (void *)StringValuePtr(string);
+ *read = (size_t)RSTRING_LEN(string);
+ memcpy(buf, str, *read);
+ }
+
+ return 1;
+}
+
+static void dealloc(void * ptr)
+{
+ yaml_parser_t * parser;
+
+ parser = (yaml_parser_t *)ptr;
+ yaml_parser_delete(parser);
+ xfree(parser);
+}
+
+static VALUE allocate(VALUE klass)
+{
+ yaml_parser_t * parser;
+
+ parser = xmalloc(sizeof(yaml_parser_t));
+ yaml_parser_initialize(parser);
+
+ return Data_Wrap_Struct(klass, 0, dealloc, parser);
+}
+
+static VALUE make_exception(yaml_parser_t * parser, VALUE path)
+{
+ size_t line, column;
+
+ line = parser->context_mark.line + 1;
+ column = parser->context_mark.column + 1;
+
+ return rb_funcall(ePsychSyntaxError, rb_intern("new"), 6,
+ path,
+ INT2NUM(line),
+ INT2NUM(column),
+ INT2NUM(parser->problem_offset),
+ parser->problem ? rb_usascii_str_new2(parser->problem) : Qnil,
+ parser->context ? rb_usascii_str_new2(parser->context) : Qnil);
+}
+
+#ifdef HAVE_RUBY_ENCODING_H
+static VALUE transcode_string(VALUE src, int * parser_encoding)
+{
+ int utf8 = rb_utf8_encindex();
+ int utf16le = rb_enc_find_index("UTF-16LE");
+ int utf16be = rb_enc_find_index("UTF-16BE");
+ int source_encoding = rb_enc_get_index(src);
+
+ if (source_encoding == utf8) {
+ *parser_encoding = YAML_UTF8_ENCODING;
+ return src;
+ }
+
+ if (source_encoding == utf16le) {
+ *parser_encoding = YAML_UTF16LE_ENCODING;
+ return src;
+ }
+
+ if (source_encoding == utf16be) {
+ *parser_encoding = YAML_UTF16BE_ENCODING;
+ return src;
+ }
+
+ src = rb_str_export_to_enc(src, rb_utf8_encoding());
+ RB_GC_GUARD(src);
+
+ *parser_encoding = YAML_UTF8_ENCODING;
+ return src;
+}
+
+static VALUE transcode_io(VALUE src, int * parser_encoding)
+{
+ VALUE io_external_encoding;
+ int io_external_enc_index;
+
+ io_external_encoding = rb_funcall(src, rb_intern("external_encoding"), 0);
+
+ /* if no encoding is returned, assume ascii8bit. */
+ if (NIL_P(io_external_encoding)) {
+ io_external_enc_index = rb_ascii8bit_encindex();
+ } else {
+ io_external_enc_index = rb_to_encoding_index(io_external_encoding);
+ }
+
+ /* Treat US-ASCII as utf_8 */
+ if (io_external_enc_index == rb_usascii_encindex()) {
+ *parser_encoding = YAML_UTF8_ENCODING;
+ return src;
+ }
+
+ if (io_external_enc_index == rb_utf8_encindex()) {
+ *parser_encoding = YAML_UTF8_ENCODING;
+ return src;
+ }
+
+ if (io_external_enc_index == rb_enc_find_index("UTF-16LE")) {
+ *parser_encoding = YAML_UTF16LE_ENCODING;
+ return src;
+ }
+
+ if (io_external_enc_index == rb_enc_find_index("UTF-16BE")) {
+ *parser_encoding = YAML_UTF16BE_ENCODING;
+ return src;
+ }
+
+ /* Just guess on ASCII-8BIT */
+ if (io_external_enc_index == rb_ascii8bit_encindex()) {
+ *parser_encoding = YAML_ANY_ENCODING;
+ return src;
+ }
+
+ /* If the external encoding is something we don't know how to handle,
+ * fall back to YAML_ANY_ENCODING. */
+ *parser_encoding = YAML_ANY_ENCODING;
+
+ return src;
+}
+
+#endif
+
+static VALUE protected_start_stream(VALUE pointer)
+{
+ VALUE *args = (VALUE *)pointer;
+ return rb_funcall(args[0], id_start_stream, 1, args[1]);
+}
+
+static VALUE protected_start_document(VALUE pointer)
+{
+ VALUE *args = (VALUE *)pointer;
+ return rb_funcall3(args[0], id_start_document, 3, args + 1);
+}
+
+static VALUE protected_end_document(VALUE pointer)
+{
+ VALUE *args = (VALUE *)pointer;
+ return rb_funcall(args[0], id_end_document, 1, args[1]);
+}
+
+static VALUE protected_alias(VALUE pointer)
+{
+ VALUE *args = (VALUE *)pointer;
+ return rb_funcall(args[0], id_alias, 1, args[1]);
+}
+
+static VALUE protected_scalar(VALUE pointer)
+{
+ VALUE *args = (VALUE *)pointer;
+ return rb_funcall3(args[0], id_scalar, 6, args + 1);
+}
+
+static VALUE protected_start_sequence(VALUE pointer)
+{
+ VALUE *args = (VALUE *)pointer;
+ return rb_funcall3(args[0], id_start_sequence, 4, args + 1);
+}
+
+static VALUE protected_end_sequence(VALUE handler)
+{
+ return rb_funcall(handler, id_end_sequence, 0);
+}
+
+static VALUE protected_start_mapping(VALUE pointer)
+{
+ VALUE *args = (VALUE *)pointer;
+ return rb_funcall3(args[0], id_start_mapping, 4, args + 1);
+}
+
+static VALUE protected_end_mapping(VALUE handler)
+{
+ return rb_funcall(handler, id_end_mapping, 0);
+}
+
+static VALUE protected_empty(VALUE handler)
+{
+ return rb_funcall(handler, id_empty, 0);
+}
+
+static VALUE protected_end_stream(VALUE handler)
+{
+ return rb_funcall(handler, id_end_stream, 0);
+}
+
+/*
+ * call-seq:
+ * parser.parse(yaml)
+ *
+ * Parse the YAML document contained in +yaml+. Events will be called on
+ * the handler set on the parser instance.
+ *
+ * See Psych::Parser and Psych::Parser#handler
+ */
+static VALUE parse(int argc, VALUE *argv, VALUE self)
+{
+ VALUE yaml, path;
+ yaml_parser_t * parser;
+ yaml_event_t event;
+ int done = 0;
+ int tainted = 0;
+ int state = 0;
+ int parser_encoding = YAML_ANY_ENCODING;
+#ifdef HAVE_RUBY_ENCODING_H
+ int encoding = rb_utf8_encindex();
+ rb_encoding * internal_enc = rb_default_internal_encoding();
+#endif
+ VALUE handler = rb_iv_get(self, "@handler");
+
+ if (rb_scan_args(argc, argv, "11", &yaml, &path) == 1) {
+ if(rb_respond_to(yaml, id_path))
+ path = rb_funcall(yaml, id_path, 0);
+ else
+ path = rb_str_new2("<unknown>");
+ }
+
+ Data_Get_Struct(self, yaml_parser_t, parser);
+
+ yaml_parser_delete(parser);
+ yaml_parser_initialize(parser);
+
+ if (OBJ_TAINTED(yaml)) tainted = 1;
+
+ if (rb_respond_to(yaml, id_read)) {
+#ifdef HAVE_RUBY_ENCODING_H
+ yaml = transcode_io(yaml, &parser_encoding);
+ yaml_parser_set_encoding(parser, parser_encoding);
+#endif
+ yaml_parser_set_input(parser, io_reader, (void *)yaml);
+ if (RTEST(rb_obj_is_kind_of(yaml, rb_cIO))) tainted = 1;
+ } else {
+ StringValue(yaml);
+#ifdef HAVE_RUBY_ENCODING_H
+ yaml = transcode_string(yaml, &parser_encoding);
+ yaml_parser_set_encoding(parser, parser_encoding);
+#endif
+ yaml_parser_set_input_string(
+ parser,
+ (const unsigned char *)RSTRING_PTR(yaml),
+ (size_t)RSTRING_LEN(yaml)
+ );
+ }
+
+ while(!done) {
+ if(!yaml_parser_parse(parser, &event)) {
+ VALUE exception;
+
+ exception = make_exception(parser, path);
+ yaml_parser_delete(parser);
+ yaml_parser_initialize(parser);
+
+ rb_exc_raise(exception);
+ }
+
+ switch(event.type) {
+ case YAML_STREAM_START_EVENT:
+ {
+ VALUE args[2];
+
+ args[0] = handler;
+ args[1] = INT2NUM((long)event.data.stream_start.encoding);
+ rb_protect(protected_start_stream, (VALUE)args, &state);
+ }
+ break;
+ case YAML_DOCUMENT_START_EVENT:
+ {
+ VALUE args[4];
+ /* Get a list of tag directives (if any) */
+ VALUE tag_directives = rb_ary_new();
+ /* Grab the document version */
+ VALUE version = event.data.document_start.version_directive ?
+ rb_ary_new3(
+ (long)2,
+ INT2NUM((long)event.data.document_start.version_directive->major),
+ INT2NUM((long)event.data.document_start.version_directive->minor)
+ ) : rb_ary_new();
+
+ if(event.data.document_start.tag_directives.start) {
+ yaml_tag_directive_t *start =
+ event.data.document_start.tag_directives.start;
+ yaml_tag_directive_t *end =
+ event.data.document_start.tag_directives.end;
+ for(; start != end; start++) {
+ VALUE handle = Qnil;
+ VALUE prefix = Qnil;
+ if(start->handle) {
+ handle = rb_str_new2((const char *)start->handle);
+ if (tainted) OBJ_TAINT(handle);
+#ifdef HAVE_RUBY_ENCODING_H
+ PSYCH_TRANSCODE(handle, encoding, internal_enc);
+#endif
+ }
+
+ if(start->prefix) {
+ prefix = rb_str_new2((const char *)start->prefix);
+ if (tainted) OBJ_TAINT(prefix);
+#ifdef HAVE_RUBY_ENCODING_H
+ PSYCH_TRANSCODE(prefix, encoding, internal_enc);
+#endif
+ }
+
+ rb_ary_push(tag_directives, rb_ary_new3((long)2, handle, prefix));
+ }
+ }
+ args[0] = handler;
+ args[1] = version;
+ args[2] = tag_directives;
+ args[3] = event.data.document_start.implicit == 1 ? Qtrue : Qfalse;
+ rb_protect(protected_start_document, (VALUE)args, &state);
+ }
+ break;
+ case YAML_DOCUMENT_END_EVENT:
+ {
+ VALUE args[2];
+
+ args[0] = handler;
+ args[1] = event.data.document_end.implicit == 1 ? Qtrue : Qfalse;
+ rb_protect(protected_end_document, (VALUE)args, &state);
+ }
+ break;
+ case YAML_ALIAS_EVENT:
+ {
+ VALUE args[2];
+ VALUE alias = Qnil;
+ if(event.data.alias.anchor) {
+ alias = rb_str_new2((const char *)event.data.alias.anchor);
+ if (tainted) OBJ_TAINT(alias);
+#ifdef HAVE_RUBY_ENCODING_H
+ PSYCH_TRANSCODE(alias, encoding, internal_enc);
+#endif
+ }
+
+ args[0] = handler;
+ args[1] = alias;
+ rb_protect(protected_alias, (VALUE)args, &state);
+ }
+ break;
+ case YAML_SCALAR_EVENT:
+ {
+ VALUE args[7];
+ VALUE anchor = Qnil;
+ VALUE tag = Qnil;
+ VALUE plain_implicit, quoted_implicit, style;
+ VALUE val = rb_str_new(
+ (const char *)event.data.scalar.value,
+ (long)event.data.scalar.length
+ );
+ if (tainted) OBJ_TAINT(val);
+
+#ifdef HAVE_RUBY_ENCODING_H
+ PSYCH_TRANSCODE(val, encoding, internal_enc);
+#endif
+
+ if(event.data.scalar.anchor) {
+ anchor = rb_str_new2((const char *)event.data.scalar.anchor);
+ if (tainted) OBJ_TAINT(anchor);
+#ifdef HAVE_RUBY_ENCODING_H
+ PSYCH_TRANSCODE(anchor, encoding, internal_enc);
+#endif
+ }
+
+ if(event.data.scalar.tag) {
+ tag = rb_str_new2((const char *)event.data.scalar.tag);
+ if (tainted) OBJ_TAINT(tag);
+#ifdef HAVE_RUBY_ENCODING_H
+ PSYCH_TRANSCODE(tag, encoding, internal_enc);
+#endif
+ }
+
+ plain_implicit =
+ event.data.scalar.plain_implicit == 0 ? Qfalse : Qtrue;
+
+ quoted_implicit =
+ event.data.scalar.quoted_implicit == 0 ? Qfalse : Qtrue;
+
+ style = INT2NUM((long)event.data.scalar.style);
+
+ args[0] = handler;
+ args[1] = val;
+ args[2] = anchor;
+ args[3] = tag;
+ args[4] = plain_implicit;
+ args[5] = quoted_implicit;
+ args[6] = style;
+ rb_protect(protected_scalar, (VALUE)args, &state);
+ }
+ break;
+ case YAML_SEQUENCE_START_EVENT:
+ {
+ VALUE args[5];
+ VALUE anchor = Qnil;
+ VALUE tag = Qnil;
+ VALUE implicit, style;
+ if(event.data.sequence_start.anchor) {
+ anchor = rb_str_new2((const char *)event.data.sequence_start.anchor);
+ if (tainted) OBJ_TAINT(anchor);
+#ifdef HAVE_RUBY_ENCODING_H
+ PSYCH_TRANSCODE(anchor, encoding, internal_enc);
+#endif
+ }
+
+ tag = Qnil;
+ if(event.data.sequence_start.tag) {
+ tag = rb_str_new2((const char *)event.data.sequence_start.tag);
+ if (tainted) OBJ_TAINT(tag);
+#ifdef HAVE_RUBY_ENCODING_H
+ PSYCH_TRANSCODE(tag, encoding, internal_enc);
+#endif
+ }
+
+ implicit =
+ event.data.sequence_start.implicit == 0 ? Qfalse : Qtrue;
+
+ style = INT2NUM((long)event.data.sequence_start.style);
+
+ args[0] = handler;
+ args[1] = anchor;
+ args[2] = tag;
+ args[3] = implicit;
+ args[4] = style;
+
+ rb_protect(protected_start_sequence, (VALUE)args, &state);
+ }
+ break;
+ case YAML_SEQUENCE_END_EVENT:
+ rb_protect(protected_end_sequence, handler, &state);
+ break;
+ case YAML_MAPPING_START_EVENT:
+ {
+ VALUE args[5];
+ VALUE anchor = Qnil;
+ VALUE tag = Qnil;
+ VALUE implicit, style;
+ if(event.data.mapping_start.anchor) {
+ anchor = rb_str_new2((const char *)event.data.mapping_start.anchor);
+ if (tainted) OBJ_TAINT(anchor);
+#ifdef HAVE_RUBY_ENCODING_H
+ PSYCH_TRANSCODE(anchor, encoding, internal_enc);
+#endif
+ }
+
+ if(event.data.mapping_start.tag) {
+ tag = rb_str_new2((const char *)event.data.mapping_start.tag);
+ if (tainted) OBJ_TAINT(tag);
+#ifdef HAVE_RUBY_ENCODING_H
+ PSYCH_TRANSCODE(tag, encoding, internal_enc);
+#endif
+ }
+
+ implicit =
+ event.data.mapping_start.implicit == 0 ? Qfalse : Qtrue;
+
+ style = INT2NUM((long)event.data.mapping_start.style);
+
+ args[0] = handler;
+ args[1] = anchor;
+ args[2] = tag;
+ args[3] = implicit;
+ args[4] = style;
+
+ rb_protect(protected_start_mapping, (VALUE)args, &state);
+ }
+ break;
+ case YAML_MAPPING_END_EVENT:
+ rb_protect(protected_end_mapping, handler, &state);
+ break;
+ case YAML_NO_EVENT:
+ rb_protect(protected_empty, handler, &state);
+ break;
+ case YAML_STREAM_END_EVENT:
+ rb_protect(protected_end_stream, handler, &state);
+ done = 1;
+ break;
+ }
+ yaml_event_delete(&event);
+ if (state) rb_jump_tag(state);
+ }
+
+ return self;
+}
+
+/*
+ * call-seq:
+ * parser.mark # => #<Psych::Parser::Mark>
+ *
+ * Returns a Psych::Parser::Mark object that contains line, column, and index
+ * information.
+ */
+static VALUE mark(VALUE self)
+{
+ VALUE mark_klass;
+ VALUE args[3];
+ yaml_parser_t * parser;
+
+ Data_Get_Struct(self, yaml_parser_t, parser);
+ mark_klass = rb_const_get_at(cPsychParser, rb_intern("Mark"));
+ args[0] = INT2NUM(parser->mark.index);
+ args[1] = INT2NUM(parser->mark.line);
+ args[2] = INT2NUM(parser->mark.column);
+
+ return rb_class_new_instance(3, args, mark_klass);
+}
+
+void Init_psych_parser()
+{
+#if 0
+ mPsych = rb_define_module("Psych");
+#endif
+
+ cPsychParser = rb_define_class_under(mPsych, "Parser", rb_cObject);
+ rb_define_alloc_func(cPsychParser, allocate);
+
+ /* Any encoding: Let the parser choose the encoding */
+ rb_define_const(cPsychParser, "ANY", INT2NUM(YAML_ANY_ENCODING));
+
+ /* UTF-8 Encoding */
+ rb_define_const(cPsychParser, "UTF8", INT2NUM(YAML_UTF8_ENCODING));
+
+ /* UTF-16-LE Encoding with BOM */
+ rb_define_const(cPsychParser, "UTF16LE", INT2NUM(YAML_UTF16LE_ENCODING));
+
+ /* UTF-16-BE Encoding with BOM */
+ rb_define_const(cPsychParser, "UTF16BE", INT2NUM(YAML_UTF16BE_ENCODING));
+
+ rb_require("psych/syntax_error");
+ ePsychSyntaxError = rb_const_get(mPsych, rb_intern("SyntaxError"));
+
+ rb_define_method(cPsychParser, "parse", parse, -1);
+ rb_define_method(cPsychParser, "mark", mark, 0);
+
+ id_read = rb_intern("read");
+ id_path = rb_intern("path");
+ id_empty = rb_intern("empty");
+ id_start_stream = rb_intern("start_stream");
+ id_end_stream = rb_intern("end_stream");
+ id_start_document = rb_intern("start_document");
+ id_end_document = rb_intern("end_document");
+ id_alias = rb_intern("alias");
+ id_scalar = rb_intern("scalar");
+ id_start_sequence = rb_intern("start_sequence");
+ id_end_sequence = rb_intern("end_sequence");
+ id_start_mapping = rb_intern("start_mapping");
+ id_end_mapping = rb_intern("end_mapping");
+}
+/* vim: set noet sws=4 sw=4: */