diff options
| author | Kevin Newton <kddnewton@gmail.com> | 2024-03-07 15:24:43 -0500 |
|---|---|---|
| committer | git <svn-admin@ruby-lang.org> | 2024-03-07 20:40:39 +0000 |
| commit | ec159fc8ba17cb70e34a5b62c1ef804e393b7b2f (patch) | |
| tree | dba97733c5296ab09cf507b47dcbd6c327934328 | |
| parent | 76e11595e28e258f4a4187a6d3eaccc9ca752e10 (diff) | |
[ruby/prism] Support parsing streams
https://github.com/ruby/prism/commit/efdc2b7222
| -rw-r--r-- | lib/prism/ffi.rb | 55 | ||||
| -rw-r--r-- | prism/extension.c | 100 | ||||
| -rw-r--r-- | prism/prism.c | 115 | ||||
| -rw-r--r-- | prism/prism.h | 30 | ||||
| -rw-r--r-- | test/prism/parse_stream_test.rb | 74 |
5 files changed, 342 insertions, 32 deletions
diff --git a/lib/prism/ffi.rb b/lib/prism/ffi.rb index 2aecd4df86..0a064a5c94 100644 --- a/lib/prism/ffi.rb +++ b/lib/prism/ffi.rb @@ -23,15 +23,21 @@ module Prism # size_t -> :size_t # void -> :void # - def self.resolve_type(type) + def self.resolve_type(type, callbacks) type = type.strip - type.end_with?("*") ? :pointer : type.delete_prefix("const ").to_sym + + if !type.end_with?("*") + type.delete_prefix("const ").to_sym + else + type = type.delete_suffix("*").rstrip + callbacks.include?(type.to_sym) ? type.to_sym : :pointer + end end # Read through the given header file and find the declaration of each of the # given functions. For each one, define a function with the same name and # signature as the C function. - def self.load_exported_functions_from(header, *functions) + def self.load_exported_functions_from(header, *functions, callbacks) File.foreach(File.expand_path("../../include/#{header}", __dir__)) do |line| # We only want to attempt to load exported functions. next unless line.start_with?("PRISM_EXPORTED_FUNCTION ") @@ -55,24 +61,28 @@ module Prism # Resolve the type of the argument by dropping the name of the argument # first if it is present. - arg_types.map! { |type| resolve_type(type.sub(/\w+$/, "")) } + arg_types.map! { |type| resolve_type(type.sub(/\w+$/, ""), callbacks) } # Attach the function using the FFI library. - attach_function name, arg_types, resolve_type(return_type) + attach_function name, arg_types, resolve_type(return_type, []) end # If we didn't find all of the functions, raise an error. raise "Could not find functions #{functions.inspect}" unless functions.empty? end + callback :pm_parse_stream_fgets_t, [:pointer, :int, :pointer], :pointer + load_exported_functions_from( "prism.h", "pm_version", "pm_serialize_parse", + "pm_serialize_parse_stream", "pm_serialize_parse_comments", "pm_serialize_lex", "pm_serialize_parse_lex", - "pm_parse_success_p" + "pm_parse_success_p", + [:pm_parse_stream_fgets_t] ) load_exported_functions_from( @@ -81,7 +91,8 @@ module Prism "pm_buffer_init", "pm_buffer_value", "pm_buffer_length", - "pm_buffer_free" + "pm_buffer_free", + [] ) load_exported_functions_from( @@ -90,7 +101,8 @@ module Prism "pm_string_free", "pm_string_source", "pm_string_length", - "pm_string_sizeof" + "pm_string_sizeof", + [] ) # This object represents a pm_buffer_t. We only use it as an opaque pointer, @@ -215,13 +227,36 @@ module Prism end # Mirror the Prism.parse_file API by using the serialization API. This uses - # native strings instead of Ruby strings because it allows us to use mmap when - # it is available. + # native strings instead of Ruby strings because it allows us to use mmap + # when it is available. def parse_file(filepath, **options) options[:filepath] = filepath LibRubyParser::PrismString.with_file(filepath) { |string| parse_common(string, string.read, options) } end + # Mirror the Prism.parse_stream API by using the serialization API. + def parse_stream(stream, **options) + LibRubyParser::PrismBuffer.with do |buffer| + source = +"" + callback = -> (string, size, _) { + raise "Expected size to be >= 0, got: #{size}" if size <= 0 + + if !(line = stream.gets(size - 1)).nil? + source << line + string.write_string("#{line}\x00", line.bytesize + 1) + end + } + + # In the pm_serialize_parse_stream function it accepts a pointer to the + # IO object as a void* and then passes it through to the callback as the + # third argument, but it never touches it itself. As such, since we have + # access to the IO object already through the closure of the lambda, we + # can pass a null pointer here and not worry. + LibRubyParser.pm_serialize_parse_stream(buffer.pointer, nil, callback, dump_options(options)) + Prism.load(source, buffer.read) + end + end + # Mirror the Prism.parse_comments API by using the serialization API. def parse_comments(code, **options) LibRubyParser::PrismString.with_string(code) { |string| parse_comments_common(string, code, options) } diff --git a/prism/extension.c b/prism/extension.c index 09ce6a1c0c..91daa2945e 100644 --- a/prism/extension.c +++ b/prism/extension.c @@ -504,6 +504,24 @@ parser_warnings(pm_parser_t *parser, rb_encoding *encoding, VALUE source) { return warnings; } +/** + * Create a new parse result from the given parser, value, encoding, and source. + */ +static VALUE +parse_result_create(pm_parser_t *parser, VALUE value, rb_encoding *encoding, VALUE source) { + VALUE result_argv[] = { + value, + parser_comments(parser, source), + parser_magic_comments(parser, source), + parser_data_loc(parser, source), + parser_errors(parser, encoding, source), + parser_warnings(parser, encoding, source), + source + }; + + return rb_class_new_instance(7, result_argv, rb_cPrismParseResult); +} + /******************************************************************************/ /* Lexing Ruby code */ /******************************************************************************/ @@ -610,19 +628,11 @@ parse_lex_input(pm_string_t *input, const pm_options_t *options, bool return_nod value = parse_lex_data.tokens; } - VALUE result_argv[] = { - value, - parser_comments(&parser, source), - parser_magic_comments(&parser, source), - parser_data_loc(&parser, source), - parser_errors(&parser, parse_lex_data.encoding, source), - parser_warnings(&parser, parse_lex_data.encoding, source), - source - }; - + VALUE result = parse_result_create(&parser, value, parse_lex_data.encoding, source); pm_node_destroy(&parser, node); pm_parser_free(&parser); - return rb_class_new_instance(7, result_argv, rb_cPrismParseResult); + + return result; } /** @@ -682,17 +692,8 @@ parse_input(pm_string_t *input, const pm_options_t *options) { rb_encoding *encoding = rb_enc_find(parser.encoding->name); VALUE source = pm_source_new(&parser, encoding); - VALUE result_argv[] = { - pm_ast_new(&parser, node, encoding, source), - parser_comments(&parser, source), - parser_magic_comments(&parser, source), - parser_data_loc(&parser, source), - parser_errors(&parser, encoding, source), - parser_warnings(&parser, encoding, source), - source - }; - - VALUE result = rb_class_new_instance(7, result_argv, rb_cPrismParseResult); + VALUE value = pm_ast_new(&parser, node, encoding, source); + VALUE result = parse_result_create(&parser, value, encoding, source) ; pm_node_destroy(&parser, node); pm_parser_free(&parser); @@ -752,6 +753,60 @@ parse(int argc, VALUE *argv, VALUE self) { } /** + * An implementation of fgets that is suitable for use with Ruby IO objects. + */ +static char * +parse_stream_fgets(char *restrict string, int size, void *restrict stream) { + RUBY_ASSERT(size > 0); + + VALUE line = rb_funcall((VALUE) stream, rb_intern("gets"), 1, INT2FIX(size - 1)); + if (NIL_P(line)) { + return NULL; + } + + const char *cstr = StringValueCStr(line); + size_t length = strlen(cstr); + + memcpy(string, cstr, length); + string[length] = '\0'; + + return string; +} + +/** + * call-seq: + * Prism::parse_stream(stream, **options) -> ParseResult + * + * Parse the given object that responds to `gets` and return a ParseResult + * instance. The options that are supported are the same as Prism::parse. + */ +static VALUE +parse_stream(int argc, VALUE *argv, VALUE self) { + VALUE stream; + VALUE keywords; + rb_scan_args(argc, argv, "1:", &stream, &keywords); + + pm_options_t options = { 0 }; + extract_options(&options, Qnil, keywords); + + pm_parser_t parser; + pm_buffer_t buffer; + + pm_node_t *node = pm_parse_stream(&parser, &buffer, (void *) stream, parse_stream_fgets, &options); + rb_encoding *encoding = rb_enc_find(parser.encoding->name); + + VALUE source = pm_source_new(&parser, encoding); + VALUE value = pm_ast_new(&parser, node, encoding, source); + VALUE result = parse_result_create(&parser, value, encoding, source); + + pm_node_destroy(&parser, node); + pm_buffer_free(&buffer); + pm_parser_free(&parser); + + return result; +} + +/** * call-seq: * Prism::parse_file(filepath, **options) -> ParseResult * @@ -1271,6 +1326,7 @@ Init_prism(void) { rb_define_singleton_method(rb_cPrism, "lex", lex, -1); rb_define_singleton_method(rb_cPrism, "lex_file", lex_file, -1); rb_define_singleton_method(rb_cPrism, "parse", parse, -1); + rb_define_singleton_method(rb_cPrism, "parse_stream", parse_stream, -1); rb_define_singleton_method(rb_cPrism, "parse_file", parse_file, -1); rb_define_singleton_method(rb_cPrism, "parse_comments", parse_comments, -1); rb_define_singleton_method(rb_cPrism, "parse_file_comments", parse_file_comments, -1); diff --git a/prism/prism.c b/prism/prism.c index d7ee5ac7db..045fe63f06 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -18703,6 +18703,99 @@ pm_parse(pm_parser_t *parser) { return parse_program(parser); } +/** + * Read into the stream until the gets callback returns false. If the last read + * line from the stream matches an __END__ marker, then halt and return false, + * otherwise return true. + */ +static bool +pm_parse_stream_read(pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *fgets) { +#define LINE_SIZE 4096 + char line[LINE_SIZE]; + + while (fgets(line, LINE_SIZE, stream) != NULL) { + size_t length = strlen(line); + + if (length == LINE_SIZE && line[length - 1] != '\n') { + // If we read a line that is the maximum size and it doesn't end + // with a newline, then we'll just append it to the buffer and + // continue reading. + pm_buffer_append_string(buffer, line, length); + continue; + } + + // Append the line to the buffer. + pm_buffer_append_string(buffer, line, length); + + // Check if the line matches the __END__ marker. If it does, then stop + // reading and return false. In most circumstances, this means we should + // stop reading from the stream so that the DATA constant can pick it + // up. + switch (length) { + case 7: + if (strncmp(line, "__END__", 7) == 0) return false; + break; + case 8: + if (strncmp(line, "__END__\n", 8) == 0) return false; + break; + case 9: + if (strncmp(line, "__END__\r\n", 9) == 0) return false; + break; + } + } + + return true; +#undef LINE_SIZE +} + +/** + * Determine if there was an unterminated heredoc at the end of the input, which + * would mean the stream isn't finished and we should keep reading. + * + * For the other lex modes we can check if the lex mode has been closed, but for + * heredocs when we hit EOF we close the lex mode and then go back to parse the + * rest of the line after the heredoc declaration so that we get more of the + * syntax tree. + */ +static bool +pm_parse_stream_unterminated_heredoc_p(pm_parser_t *parser) { + pm_diagnostic_t *diagnostic = (pm_diagnostic_t *) parser->error_list.head; + + for (; diagnostic != NULL; diagnostic = (pm_diagnostic_t *) diagnostic->node.next) { + if (diagnostic->diag_id == PM_ERR_HEREDOC_TERM) { + return true; + } + } + + return false; +} + +/** + * Parse a stream of Ruby source and return the tree. + * + * Prism is designed around having the entire source in memory at once, but you + * can stream stdin in to Ruby so we need to support a streaming API. + */ +PRISM_EXPORTED_FUNCTION pm_node_t * +pm_parse_stream(pm_parser_t *parser, pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *fgets, const pm_options_t *options) { + pm_buffer_init(buffer); + + bool eof = pm_parse_stream_read(buffer, stream, fgets); + pm_parser_init(parser, (const uint8_t *) pm_buffer_value(buffer), pm_buffer_length(buffer), options); + pm_node_t *node = pm_parse(parser); + + while (!eof && parser->error_list.size > 0 && (parser->lex_modes.index > 0 || pm_parse_stream_unterminated_heredoc_p(parser))) { + pm_node_destroy(parser, node); + eof = pm_parse_stream_read(buffer, stream, fgets); + + pm_parser_free(parser); + pm_parser_init(parser, (const uint8_t *) pm_buffer_value(buffer), pm_buffer_length(buffer), options); + node = pm_parse(parser); + } + + return node; +} + static inline void pm_serialize_header(pm_buffer_t *buffer) { pm_buffer_append_string(buffer, "PRISM", 5); @@ -18746,6 +18839,28 @@ pm_serialize_parse(pm_buffer_t *buffer, const uint8_t *source, size_t size, cons } /** + * Parse and serialize the AST represented by the source that is read out of the + * given stream into to the given buffer. + */ +PRISM_EXPORTED_FUNCTION void +pm_serialize_parse_stream(pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *fgets, const char *data) { + pm_parser_t parser; + pm_options_t options = { 0 }; + pm_options_read(&options, data); + + pm_buffer_t parser_buffer; + pm_node_t *node = pm_parse_stream(&parser, &parser_buffer, stream, fgets, &options); + pm_serialize_header(buffer); + pm_serialize_content(&parser, node, buffer); + pm_buffer_append_byte(buffer, '\0'); + + pm_node_destroy(&parser, node); + pm_buffer_free(&parser_buffer); + pm_parser_free(&parser); + pm_options_free(&options); +} + +/** * Parse and serialize the comments in the given source to the given buffer. */ PRISM_EXPORTED_FUNCTION void diff --git a/prism/prism.h b/prism/prism.h index 7d9b96fa82..1e74461a90 100644 --- a/prism/prism.h +++ b/prism/prism.h @@ -80,6 +80,36 @@ PRISM_EXPORTED_FUNCTION void pm_parser_free(pm_parser_t *parser); PRISM_EXPORTED_FUNCTION pm_node_t * pm_parse(pm_parser_t *parser); /** + * This function is used in pm_parse_stream to retrieve a line of input from a + * stream. It closely mirrors that of fgets so that fgets can be used as the + * default implementation. + */ +typedef char * (pm_parse_stream_fgets_t)(char *restrict string, int size, void *restrict stream); + +/** + * Parse a stream of Ruby source and return the tree. + * + * @param parser The parser to use. + * @param buffer The buffer to use. + * @param stream The stream to parse. + * @param fgets The function to use to read from the stream. + * @param options The optional options to use when parsing. + * @return The AST representing the source. + */ +PRISM_EXPORTED_FUNCTION pm_node_t * pm_parse_stream(pm_parser_t *parser, pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *fgets, const pm_options_t *options); + +/** + * Parse and serialize the AST represented by the source that is read out of the + * given stream into to the given buffer. + * + * @param buffer The buffer to serialize to. + * @param stream The stream to parse. + * @param fgets The function to use to read from the stream. + * @param data The optional data to pass to the parser. + */ +PRISM_EXPORTED_FUNCTION void pm_serialize_parse_stream(pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *fgets, const char *data); + +/** * Serialize the given list of comments to the given buffer. * * @param parser The parser to serialize. diff --git a/test/prism/parse_stream_test.rb b/test/prism/parse_stream_test.rb new file mode 100644 index 0000000000..9e6347b92b --- /dev/null +++ b/test/prism/parse_stream_test.rb @@ -0,0 +1,74 @@ +# frozen_string_literal: true + +require_relative "test_helper" +require "stringio" + +module Prism + class ParseStreamTest < TestCase + def test_single_line + io = StringIO.new("1 + 2") + result = Prism.parse_stream(io) + + assert result.success? + assert_kind_of Prism::CallNode, result.value.statements.body.first + end + + def test_multi_line + io = StringIO.new("1 + 2\n3 + 4") + result = Prism.parse_stream(io) + + assert result.success? + assert_kind_of Prism::CallNode, result.value.statements.body.first + assert_kind_of Prism::CallNode, result.value.statements.body.last + end + + def test_multi_read + io = StringIO.new("a" * 4096 * 4) + result = Prism.parse_stream(io) + + assert result.success? + assert_kind_of Prism::CallNode, result.value.statements.body.first + end + + def test___END__ + io = StringIO.new("1 + 2\n3 + 4\n__END__\n5 + 6") + result = Prism.parse_stream(io) + + assert result.success? + assert_equal 2, result.value.statements.body.length + assert_equal "5 + 6", io.read + end + + def test_false___END___in_string + io = StringIO.new("1 + 2\n3 + 4\n\"\n__END__\n\"\n5 + 6") + result = Prism.parse_stream(io) + + assert result.success? + assert_equal 4, result.value.statements.body.length + end + + def test_false___END___in_regexp + io = StringIO.new("1 + 2\n3 + 4\n/\n__END__\n/\n5 + 6") + result = Prism.parse_stream(io) + + assert result.success? + assert_equal 4, result.value.statements.body.length + end + + def test_false___END___in_list + io = StringIO.new("1 + 2\n3 + 4\n%w[\n__END__\n]\n5 + 6") + result = Prism.parse_stream(io) + + assert result.success? + assert_equal 4, result.value.statements.body.length + end + + def test_false___END___in_heredoc + io = StringIO.new("1 + 2\n3 + 4\n<<-EOF\n__END__\nEOF\n5 + 6") + result = Prism.parse_stream(io) + + assert result.success? + assert_equal 4, result.value.statements.body.length + end + end +end |
