[ruby/prism] Support parsing streams

https://github.com/ruby/prism/commit/efdc2b7222
author: Kevin Newton <kddnewton@gmail.com> 2024-03-07 15:24:43 -0500
committer: git <svn-admin@ruby-lang.org> 2024-03-07 20:40:39 +0000
commit: ec159fc8ba17cb70e34a5b62c1ef804e393b7b2f (patch)
tree: dba97733c5296ab09cf507b47dcbd6c327934328
parent: 76e11595e28e258f4a4187a6d3eaccc9ca752e10 (diff)
5 files changed, 342 insertions, 32 deletions
diff --git a/lib/prism/ffi.rb b/lib/prism/ffi.rb
index 2aecd4df86..0a064a5c94 100644
--- a/lib/prism/ffi.rb
+++ b/lib/prism/ffi.rb
@@ -23,15 +23,21 @@ module Prism
     #     size_t       -> :size_t
     #     void         -> :void
     #
-    def self.resolve_type(type)
+    def self.resolve_type(type, callbacks)
       type = type.strip
-      type.end_with?("*") ? :pointer : type.delete_prefix("const ").to_sym
+
+      if !type.end_with?("*")
+        type.delete_prefix("const ").to_sym
+      else
+        type = type.delete_suffix("*").rstrip
+        callbacks.include?(type.to_sym) ? type.to_sym : :pointer
+      end
     end
 
     # Read through the given header file and find the declaration of each of the
     # given functions. For each one, define a function with the same name and
     # signature as the C function.
-    def self.load_exported_functions_from(header, *functions)
+    def self.load_exported_functions_from(header, *functions, callbacks)
       File.foreach(File.expand_path("../../include/#{header}", __dir__)) do |line|
         # We only want to attempt to load exported functions.
         next unless line.start_with?("PRISM_EXPORTED_FUNCTION ")
@@ -55,24 +61,28 @@ module Prism
 
         # Resolve the type of the argument by dropping the name of the argument
         # first if it is present.
-        arg_types.map! { |type| resolve_type(type.sub(/\w+$/, "")) }
+        arg_types.map! { |type| resolve_type(type.sub(/\w+$/, ""), callbacks) }
 
         # Attach the function using the FFI library.
-        attach_function name, arg_types, resolve_type(return_type)
+        attach_function name, arg_types, resolve_type(return_type, [])
       end
 
       # If we didn't find all of the functions, raise an error.
       raise "Could not find functions #{functions.inspect}" unless functions.empty?
     end
 
+    callback :pm_parse_stream_fgets_t, [:pointer, :int, :pointer], :pointer
+
     load_exported_functions_from(
       "prism.h",
       "pm_version",
       "pm_serialize_parse",
+      "pm_serialize_parse_stream",
       "pm_serialize_parse_comments",
       "pm_serialize_lex",
       "pm_serialize_parse_lex",
-      "pm_parse_success_p"
+      "pm_parse_success_p",
+      [:pm_parse_stream_fgets_t]
     )
 
     load_exported_functions_from(
@@ -81,7 +91,8 @@ module Prism
       "pm_buffer_init",
       "pm_buffer_value",
       "pm_buffer_length",
-      "pm_buffer_free"
+      "pm_buffer_free",
+      []
     )
 
     load_exported_functions_from(
@@ -90,7 +101,8 @@ module Prism
       "pm_string_free",
       "pm_string_source",
       "pm_string_length",
-      "pm_string_sizeof"
+      "pm_string_sizeof",
+      []
     )
 
     # This object represents a pm_buffer_t. We only use it as an opaque pointer,
@@ -215,13 +227,36 @@ module Prism
     end
 
     # Mirror the Prism.parse_file API by using the serialization API. This uses
-    # native strings instead of Ruby strings because it allows us to use mmap when
-    # it is available.
+    # native strings instead of Ruby strings because it allows us to use mmap
+    # when it is available.
     def parse_file(filepath, **options)
       options[:filepath] = filepath
       LibRubyParser::PrismString.with_file(filepath) { |string| parse_common(string, string.read, options) }
     end
 
+    # Mirror the Prism.parse_stream API by using the serialization API.
+    def parse_stream(stream, **options)
+      LibRubyParser::PrismBuffer.with do |buffer|
+        source = +""
+        callback = -> (string, size, _) {
+          raise "Expected size to be > 0, got: #{size}" if size <= 0
+          # Request at most size - 1 bytes to leave room for the trailing NUL.
+          if !(line = stream.gets(size - 1)).nil?
+            source << line
+            string.write_string("#{line}\x00", line.bytesize + 1)
+          end
+        }
+
+        # In the pm_serialize_parse_stream function it accepts a pointer to the
+        # IO object as a void* and then passes it through to the callback as the
+        # third argument, but it never touches it itself. As such, since we have
+        # access to the IO object already through the closure of the lambda, we
+        # can pass a null pointer here and not worry.
+        LibRubyParser.pm_serialize_parse_stream(buffer.pointer, nil, callback, dump_options(options))
+        Prism.load(source, buffer.read)
+      end
+    end
+
     # Mirror the Prism.parse_comments API by using the serialization API.
     def parse_comments(code, **options)
       LibRubyParser::PrismString.with_string(code) { |string| parse_comments_common(string, code, options) }
diff --git a/prism/extension.c b/prism/extension.c
index 09ce6a1c0c..91daa2945e 100644
--- a/prism/extension.c
+++ b/prism/extension.c
@@ -504,6 +504,24 @@ parser_warnings(pm_parser_t *parser, rb_encoding *encoding, VALUE source) {
     return warnings;
 }
 
+/**
+ * Create a new parse result from the given parser, value, encoding, and source.
+ */
+static VALUE
+parse_result_create(pm_parser_t *parser, VALUE value, rb_encoding *encoding, VALUE source) {
+    VALUE result_argv[] = {
+        value,
+        parser_comments(parser, source),
+        parser_magic_comments(parser, source),
+        parser_data_loc(parser, source),
+        parser_errors(parser, encoding, source),
+        parser_warnings(parser, encoding, source),
+        source
+    };
+
+    return rb_class_new_instance(7, result_argv, rb_cPrismParseResult);
+}
+
 /******************************************************************************/
 /* Lexing Ruby code                                                           */
 /******************************************************************************/
@@ -610,19 +628,11 @@ parse_lex_input(pm_string_t *input, const pm_options_t *options, bool return_nod
         value = parse_lex_data.tokens;
     }
 
-    VALUE result_argv[] = {
-        value,
-        parser_comments(&parser, source),
-        parser_magic_comments(&parser, source),
-        parser_data_loc(&parser, source),
-        parser_errors(&parser, parse_lex_data.encoding, source),
-        parser_warnings(&parser, parse_lex_data.encoding, source),
-        source
-    };
-
+    VALUE result = parse_result_create(&parser, value, parse_lex_data.encoding, source);
     pm_node_destroy(&parser, node);
     pm_parser_free(&parser);
-    return rb_class_new_instance(7, result_argv, rb_cPrismParseResult);
+
+    return result;
 }
 
 /**
@@ -682,17 +692,8 @@ parse_input(pm_string_t *input, const pm_options_t *options) {
     rb_encoding *encoding = rb_enc_find(parser.encoding->name);
 
     VALUE source = pm_source_new(&parser, encoding);
-    VALUE result_argv[] = {
-        pm_ast_new(&parser, node, encoding, source),
-        parser_comments(&parser, source),
-        parser_magic_comments(&parser, source),
-        parser_data_loc(&parser, source),
-        parser_errors(&parser, encoding, source),
-        parser_warnings(&parser, encoding, source),
-        source
-    };
-
-    VALUE result = rb_class_new_instance(7, result_argv, rb_cPrismParseResult);
+    VALUE value = pm_ast_new(&parser, node, encoding, source);
+    VALUE result = parse_result_create(&parser, value, encoding, source) ;
 
     pm_node_destroy(&parser, node);
     pm_parser_free(&parser);
@@ -752,6 +753,60 @@ parse(int argc, VALUE *argv, VALUE self) {
 }
 
 /**
+ * An implementation of fgets that is suitable for use with Ruby IO objects.
+ */
+static char *
+parse_stream_fgets(char *restrict string, int size, void *restrict stream) {
+    RUBY_ASSERT(size > 0);
+
+    VALUE line = rb_funcall((VALUE) stream, rb_intern("gets"), 1, INT2FIX(size - 1));
+    if (NIL_P(line)) {
+        return NULL;
+    }
+
+    const char *cstr = StringValueCStr(line);
+    size_t length = strlen(cstr);
+    // NOTE(review): assumes gets returned at most size - 1 bytes — confirm.
+    memcpy(string, cstr, length);
+    string[length] = '\0';
+
+    return string;
+}
+
+/**
+ * call-seq:
+ *   Prism::parse_stream(stream, **options) -> ParseResult
+ *
+ * Parse the given object that responds to `gets` and return a ParseResult
+ * instance. The options that are supported are the same as Prism::parse.
+ */
+static VALUE
+parse_stream(int argc, VALUE *argv, VALUE self) {
+    VALUE stream;
+    VALUE keywords;
+    rb_scan_args(argc, argv, "1:", &stream, &keywords);
+
+    pm_options_t options = { 0 };
+    extract_options(&options, Qnil, keywords);
+
+    pm_parser_t parser;
+    pm_buffer_t buffer;
+    pm_node_t *node = pm_parse_stream(&parser, &buffer, (void *) stream, parse_stream_fgets, &options);
+    rb_encoding *encoding = rb_enc_find(parser.encoding->name);
+
+    VALUE source = pm_source_new(&parser, encoding);
+    VALUE value = pm_ast_new(&parser, node, encoding, source);
+    VALUE result = parse_result_create(&parser, value, encoding, source);
+    // Free the parse state and options, as pm_serialize_parse_stream does.
+    pm_node_destroy(&parser, node);
+    pm_buffer_free(&buffer);
+    pm_parser_free(&parser);
+    pm_options_free(&options);
+
+    return result;
+}
+
+/**
  * call-seq:
  *   Prism::parse_file(filepath, **options) -> ParseResult
  *
@@ -1271,6 +1326,7 @@ Init_prism(void) {
     rb_define_singleton_method(rb_cPrism, "lex", lex, -1);
     rb_define_singleton_method(rb_cPrism, "lex_file", lex_file, -1);
     rb_define_singleton_method(rb_cPrism, "parse", parse, -1);
+    rb_define_singleton_method(rb_cPrism, "parse_stream", parse_stream, -1);
     rb_define_singleton_method(rb_cPrism, "parse_file", parse_file, -1);
     rb_define_singleton_method(rb_cPrism, "parse_comments", parse_comments, -1);
     rb_define_singleton_method(rb_cPrism, "parse_file_comments", parse_file_comments, -1);
diff --git a/prism/prism.c b/prism/prism.c
index d7ee5ac7db..045fe63f06 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -18703,6 +18703,99 @@ pm_parse(pm_parser_t *parser) {
     return parse_program(parser);
 }
 
+/**
+ * Read lines from the stream into the buffer until the fgets callback returns
+ * NULL. If a line that was read matches an __END__ marker, then halt and return
+ * false, otherwise return true once the stream has been exhausted.
+ */
+static bool
+pm_parse_stream_read(pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *fgets) {
+#define LINE_SIZE 4096
+    char line[LINE_SIZE];
+
+    while (fgets(line, LINE_SIZE, stream) != NULL) {
+        size_t length = strlen(line);
+
+        if (length == LINE_SIZE - 1 && line[length - 1] != '\n') {
+            // fgets stores at most LINE_SIZE - 1 characters before the null
+            // terminator, so a full chunk with no trailing newline is only
+            // part of a line. Append it to the buffer and continue reading.
+            pm_buffer_append_string(buffer, line, length);
+            continue;
+        }
+
+        // Append the line to the buffer.
+        pm_buffer_append_string(buffer, line, length);
+
+        // Check if the line matches the __END__ marker. If it does, then stop
+        // reading and return false. In most circumstances, this means we should
+        // stop reading from the stream so that the DATA constant can pick it
+        // up.
+        switch (length) {
+            case 7:
+                if (strncmp(line, "__END__", 7) == 0) return false;
+                break;
+            case 8:
+                if (strncmp(line, "__END__\n", 8) == 0) return false;
+                break;
+            case 9:
+                if (strncmp(line, "__END__\r\n", 9) == 0) return false;
+                break;
+        }
+    }
+
+    return true;
+#undef LINE_SIZE
+}
+
+/**
+ * Determine if there was an unterminated heredoc at the end of the input, which
+ * would mean the stream isn't finished and we should keep reading. This scans
+ * the parser's error list for a PM_ERR_HEREDOC_TERM diagnostic, because unlike
+ * the other lex modes (where we can check if the lex mode has been closed), for
+ * heredocs when we hit EOF we close the lex mode and then go back to parse the
+ * rest of the line after the heredoc declaration so that we get more of the
+ * syntax tree.
+ */
+static bool
+pm_parse_stream_unterminated_heredoc_p(pm_parser_t *parser) {
+    pm_diagnostic_t *diagnostic = (pm_diagnostic_t *) parser->error_list.head;
+
+    for (; diagnostic != NULL; diagnostic = (pm_diagnostic_t *) diagnostic->node.next) {
+        if (diagnostic->diag_id == PM_ERR_HEREDOC_TERM) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+/**
+ * Parse a stream of Ruby source and return the tree. Prism needs the entire
+ * source in memory, so lines are accumulated into buffer until EOF or __END__.
+ * If parsing then fails with a lex mode still open or an unterminated heredoc,
+ * the __END__ was not a real marker: keep reading and parse the buffer again.
+ */
+PRISM_EXPORTED_FUNCTION pm_node_t *
+pm_parse_stream(pm_parser_t *parser, pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *fgets, const pm_options_t *options) {
+    pm_buffer_init(buffer);
+
+    bool eof = pm_parse_stream_read(buffer, stream, fgets);
+    pm_parser_init(parser, (const uint8_t *) pm_buffer_value(buffer), pm_buffer_length(buffer), options);
+    pm_node_t *node = pm_parse(parser);
+
+    while (!eof && parser->error_list.size > 0 && (parser->lex_modes.index > 0 || pm_parse_stream_unterminated_heredoc_p(parser))) {
+        pm_node_destroy(parser, node);
+        eof = pm_parse_stream_read(buffer, stream, fgets);
+
+        pm_parser_free(parser);
+        pm_parser_init(parser, (const uint8_t *) pm_buffer_value(buffer), pm_buffer_length(buffer), options);
+        node = pm_parse(parser);
+    }
+
+    return node;
+}
+
 static inline void
 pm_serialize_header(pm_buffer_t *buffer) {
     pm_buffer_append_string(buffer, "PRISM", 5);
@@ -18746,6 +18839,28 @@ pm_serialize_parse(pm_buffer_t *buffer, const uint8_t *source, size_t size, cons
 }
 
 /**
+ * Parse and serialize the AST represented by the source that is read out of the
+ * given stream into the given buffer.
+ */
+PRISM_EXPORTED_FUNCTION void
+pm_serialize_parse_stream(pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *fgets, const char *data) {
+    pm_parser_t parser;
+    pm_options_t options = { 0 };
+    pm_options_read(&options, data);
+
+    pm_buffer_t parser_buffer;
+    pm_node_t *node = pm_parse_stream(&parser, &parser_buffer, stream, fgets, &options);
+    pm_serialize_header(buffer);
+    pm_serialize_content(&parser, node, buffer);
+    pm_buffer_append_byte(buffer, '\0');
+    // The serialized output has been written to buffer; free the parse state.
+    pm_node_destroy(&parser, node);
+    pm_buffer_free(&parser_buffer);
+    pm_parser_free(&parser);
+    pm_options_free(&options);
+}
+
+/**
  * Parse and serialize the comments in the given source to the given buffer.
  */
 PRISM_EXPORTED_FUNCTION void
diff --git a/prism/prism.h b/prism/prism.h
index 7d9b96fa82..1e74461a90 100644
--- a/prism/prism.h
+++ b/prism/prism.h
@@ -80,6 +80,36 @@ PRISM_EXPORTED_FUNCTION void pm_parser_free(pm_parser_t *parser);
 PRISM_EXPORTED_FUNCTION pm_node_t * pm_parse(pm_parser_t *parser);
 
 /**
+ * This function is used in pm_parse_stream to retrieve a line of input from a
+ * stream. It closely mirrors that of fgets so that fgets can be used as the
+ * default implementation.
+ */
+typedef char * (pm_parse_stream_fgets_t)(char *restrict string, int size, void *restrict stream);
+
+/**
+ * Parse a stream of Ruby source and return the tree.
+ *
+ * @param parser The parser to use.
+ * @param buffer The buffer to use.
+ * @param stream The stream to parse.
+ * @param fgets The function to use to read from the stream.
+ * @param options The optional options to use when parsing.
+ * @return The AST representing the source.
+ */
+PRISM_EXPORTED_FUNCTION pm_node_t * pm_parse_stream(pm_parser_t *parser, pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *fgets, const pm_options_t *options);
+
+/**
+ * Parse and serialize the AST represented by the source that is read out of the
+ * given stream into the given buffer.
+ *
+ * @param buffer The buffer to serialize to.
+ * @param stream The stream to parse.
+ * @param fgets The function to use to read from the stream.
+ * @param data The optional data to pass to the parser.
+ */
+PRISM_EXPORTED_FUNCTION void pm_serialize_parse_stream(pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *fgets, const char *data);
+
+/**
  * Serialize the given list of comments to the given buffer.
  *
  * @param parser The parser to serialize.
diff --git a/test/prism/parse_stream_test.rb b/test/prism/parse_stream_test.rb
new file mode 100644
index 0000000000..9e6347b92b
--- /dev/null
+++ b/test/prism/parse_stream_test.rb
@@ -0,0 +1,74 @@
+# frozen_string_literal: true
+
+require_relative "test_helper"
+require "stringio"
+
+module Prism
+  class ParseStreamTest < TestCase
+    def test_single_line
+      io = StringIO.new("1 + 2")
+      result = Prism.parse_stream(io)
+
+      assert result.success?
+      assert_kind_of Prism::CallNode, result.value.statements.body.first
+    end
+
+    def test_multi_line
+      io = StringIO.new("1 + 2\n3 + 4")
+      result = Prism.parse_stream(io)
+
+      assert result.success?
+      assert_kind_of Prism::CallNode, result.value.statements.body.first
+      assert_kind_of Prism::CallNode, result.value.statements.body.last
+    end
+
+    def test_multi_read
+      io = StringIO.new("a" * 4096 * 4)
+      result = Prism.parse_stream(io)
+
+      assert result.success?
+      assert_kind_of Prism::CallNode, result.value.statements.body.first
+    end
+
+    def test___END__
+      io = StringIO.new("1 + 2\n3 + 4\n__END__\n5 + 6")
+      result = Prism.parse_stream(io)
+
+      assert result.success?
+      assert_equal 2, result.value.statements.body.length
+      assert_equal "5 + 6", io.read
+    end
+
+    def test_false___END___in_string
+      io = StringIO.new("1 + 2\n3 + 4\n\"\n__END__\n\"\n5 + 6")
+      result = Prism.parse_stream(io)
+
+      assert result.success?
+      assert_equal 4, result.value.statements.body.length
+    end
+
+    def test_false___END___in_regexp
+      io = StringIO.new("1 + 2\n3 + 4\n/\n__END__\n/\n5 + 6")
+      result = Prism.parse_stream(io)
+
+      assert result.success?
+      assert_equal 4, result.value.statements.body.length
+    end
+
+    def test_false___END___in_list
+      io = StringIO.new("1 + 2\n3 + 4\n%w[\n__END__\n]\n5 + 6")
+      result = Prism.parse_stream(io)
+
+      assert result.success?
+      assert_equal 4, result.value.statements.body.length
+    end
+
+    def test_false___END___in_heredoc
+      io = StringIO.new("1 + 2\n3 + 4\n<<-EOF\n__END__\nEOF\n5 + 6")
+      result = Prism.parse_stream(io)
+
+      assert result.success?
+      assert_equal 4, result.value.statements.body.length
+    end
+  end
+end
author	Kevin Newton <kddnewton@gmail.com>	2024-03-07 15:24:43 -0500
committer	git <svn-admin@ruby-lang.org>	2024-03-07 20:40:39 +0000
commit	ec159fc8ba17cb70e34a5b62c1ef804e393b7b2f (patch)
tree	dba97733c5296ab09cf507b47dcbd6c327934328
parent	76e11595e28e258f4a4187a6d3eaccc9ca752e10 (diff)