diff options
author | Kevin Newton <kddnewton@gmail.com> | 2023-09-27 12:22:36 -0400 |
---|---|---|
committer | Kevin Newton <kddnewton@gmail.com> | 2023-09-27 13:57:38 -0400 |
commit | 8ab56869a64fdccc094f4a83c6367fb23b72d38b (patch) | |
tree | 46ef2bd5c51d5b7f923eda6a60edefc7a08200db /lib/prism | |
parent | 7e0971eb5d679bb6219abb0ec238139aa6502c5a (diff) |
Rename YARP filepaths to prism filepaths
Diffstat (limited to 'lib/prism')
-rw-r--r-- | lib/prism/debug.rb | 157 | ||||
-rw-r--r-- | lib/prism/desugar_compiler.rb | 206 | ||||
-rw-r--r-- | lib/prism/ffi.rb | 251 | ||||
-rw-r--r-- | lib/prism/language_server.rb | 166 | ||||
-rw-r--r-- | lib/prism/lex_compat.rb | 838 | ||||
-rw-r--r-- | lib/prism/node_ext.rb | 55 | ||||
-rw-r--r-- | lib/prism/node_inspector.rb | 68 | ||||
-rw-r--r-- | lib/prism/pack.rb | 185 | ||||
-rw-r--r-- | lib/prism/parse_result.rb | 266 | ||||
-rw-r--r-- | lib/prism/parse_result/comments.rb | 172 | ||||
-rw-r--r-- | lib/prism/parse_result/newlines.rb | 60 | ||||
-rw-r--r-- | lib/prism/pattern.rb | 239 | ||||
-rw-r--r-- | lib/prism/prism.gemspec | 113 | ||||
-rw-r--r-- | lib/prism/ripper_compat.rb | 174 | ||||
-rw-r--r-- | lib/prism/version.rb | 5 |
15 files changed, 2955 insertions, 0 deletions
diff --git a/lib/prism/debug.rb b/lib/prism/debug.rb new file mode 100644 index 0000000000..39df1e838c --- /dev/null +++ b/lib/prism/debug.rb @@ -0,0 +1,157 @@ +# frozen_string_literal: true + +module YARP + # This module is used for testing and debugging and is not meant to be used by + # consumers of this library. + module Debug + class ISeq + attr_reader :parts + + def initialize(parts) + @parts = parts + end + + def type + parts[0] + end + + def local_table + parts[10] + end + + def instructions + parts[13] + end + + def each_child + instructions.each do |instruction| + # Only look at arrays. Other instructions are line numbers or + # tracepoint events. + next unless instruction.is_a?(Array) + + instruction.each do |opnd| + # Only look at arrays. Other operands are literals. + next unless opnd.is_a?(Array) + + # Only look at instruction sequences. Other operands are literals. + next unless opnd[0] == "YARVInstructionSequence/SimpleDataFormat" + + yield ISeq.new(opnd) + end + end + end + end + + # For the given source, compiles with CRuby and returns a list of all of the + # sets of local variables that were encountered. + def self.cruby_locals(source) + verbose = $VERBOSE + $VERBOSE = nil + + begin + locals = [] + stack = [ISeq.new(RubyVM::InstructionSequence.compile(source).to_a)] + + while (iseq = stack.pop) + if iseq.type != :once + names = iseq.local_table + + # CRuby will push on a special local variable when there are keyword + # arguments. We get rid of that here. + names = names.grep_v(Integer) + + # For some reason, CRuby occasionally pushes this special local + # variable when there are splat arguments. We get rid of that here. + names = names.grep_v(:"#arg_rest") + + # Now push them onto the list of locals. 
+ locals << names + end + + iseq.each_child { |child| stack << child } + end + + locals + ensure + $VERBOSE = verbose + end + end + + # For the given source, parses with YARP and returns a list of all of the + # sets of local variables that were encountered. + def self.yarp_locals(source) + locals = [] + stack = [YARP.parse(source).value] + + while (node = stack.pop) + case node + when BlockNode, DefNode, LambdaNode + names = node.locals + + params = node.parameters + params = params&.parameters unless node.is_a?(DefNode) + + # YARP places parameters in the same order that they appear in the + # source. CRuby places them in the order that they need to appear + # according to their own internal calling convention. We mimic that + # order here so that we can compare properly. + if params + sorted = [ + *params.requireds.grep(RequiredParameterNode).map(&:name), + *params.optionals.map(&:name), + *((params.rest.name || :*) if params.rest && params.rest.operator != ","), + *params.posts.grep(RequiredParameterNode).map(&:name), + *params.keywords.reject(&:value).map(&:name), + *params.keywords.select(&:value).map(&:name) + ] + + # TODO: When we get a ... parameter, we should be pushing * and & + # onto the local list. We don't do that yet, so we need to add them + # in here. + if params.keyword_rest.is_a?(ForwardingParameterNode) + sorted.push(:*, :&, :"...") + end + + # Recurse down the parameter tree to find any destructured + # parameters and add them after the other parameters. 
+ param_stack = params.requireds.concat(params.posts).grep(RequiredDestructuredParameterNode).reverse + while (param = param_stack.pop) + case param + when RequiredDestructuredParameterNode + param_stack.concat(param.parameters.reverse) + when RequiredParameterNode + sorted << param.name + when SplatNode + sorted << param.expression.name if param.expression + end + end + + names = sorted.concat(names - sorted) + end + + locals << names + when ClassNode, ModuleNode, ProgramNode, SingletonClassNode + locals << node.locals + when ForNode + locals << [] + when PostExecutionNode + locals.push([], []) + when InterpolatedRegularExpressionNode + locals << [] if node.once? + end + + stack.concat(node.compact_child_nodes) + end + + locals + end + + def self.newlines(source) + YARP.parse(source).source.offsets + end + + def self.parse_serialize_file(filepath) + parse_serialize_file_metadata(filepath, [filepath.bytesize, filepath.b, 0].pack("LA*L")) + end + end +end diff --git a/lib/prism/desugar_compiler.rb b/lib/prism/desugar_compiler.rb new file mode 100644 index 0000000000..b86e8518c6 --- /dev/null +++ b/lib/prism/desugar_compiler.rb @@ -0,0 +1,206 @@ +# frozen_string_literal: true + +module YARP + # DesugarCompiler is a compiler that desugars Ruby code into a more primitive + # form. This is useful for consumers that want to deal with fewer node types. + class DesugarCompiler < MutationCompiler + # @@foo &&= bar + # + # becomes + # + # @@foo && @@foo = bar + def visit_class_variable_and_write_node(node) + desugar_and_write_node(node, ClassVariableReadNode, ClassVariableWriteNode, node.name) + end + + # @@foo ||= bar + # + # becomes + # + # defined?(@@foo) ? 
@@foo : @@foo = bar + def visit_class_variable_or_write_node(node) + desugar_or_write_defined_node(node, ClassVariableReadNode, ClassVariableWriteNode, node.name) + end + + # @@foo += bar + # + # becomes + # + # @@foo = @@foo + bar + def visit_class_variable_operator_write_node(node) + desugar_operator_write_node(node, ClassVariableReadNode, ClassVariableWriteNode, node.name) + end + + # Foo &&= bar + # + # becomes + # + # Foo && Foo = bar + def visit_constant_and_write_node(node) + desugar_and_write_node(node, ConstantReadNode, ConstantWriteNode, node.name) + end + + # Foo ||= bar + # + # becomes + # + # defined?(Foo) ? Foo : Foo = bar + def visit_constant_or_write_node(node) + desugar_or_write_defined_node(node, ConstantReadNode, ConstantWriteNode, node.name) + end + + # Foo += bar + # + # becomes + # + # Foo = Foo + bar + def visit_constant_operator_write_node(node) + desugar_operator_write_node(node, ConstantReadNode, ConstantWriteNode, node.name) + end + + # $foo &&= bar + # + # becomes + # + # $foo && $foo = bar + def visit_global_variable_and_write_node(node) + desugar_and_write_node(node, GlobalVariableReadNode, GlobalVariableWriteNode, node.name) + end + + # $foo ||= bar + # + # becomes + # + # defined?($foo) ? 
$foo : $foo = bar + def visit_global_variable_or_write_node(node) + desugar_or_write_defined_node(node, GlobalVariableReadNode, GlobalVariableWriteNode, node.name) + end + + # $foo += bar + # + # becomes + # + # $foo = $foo + bar + def visit_global_variable_operator_write_node(node) + desugar_operator_write_node(node, GlobalVariableReadNode, GlobalVariableWriteNode, node.name) + end + + # @foo &&= bar + # + # becomes + # + # @foo && @foo = bar + def visit_instance_variable_and_write_node(node) + desugar_and_write_node(node, InstanceVariableReadNode, InstanceVariableWriteNode, node.name) + end + + # @foo ||= bar + # + # becomes + # + # @foo || @foo = bar + def visit_instance_variable_or_write_node(node) + desugar_or_write_node(node, InstanceVariableReadNode, InstanceVariableWriteNode, node.name) + end + + # @foo += bar + # + # becomes + # + # @foo = @foo + bar + def visit_instance_variable_operator_write_node(node) + desugar_operator_write_node(node, InstanceVariableReadNode, InstanceVariableWriteNode, node.name) + end + + # foo &&= bar + # + # becomes + # + # foo && foo = bar + def visit_local_variable_and_write_node(node) + desugar_and_write_node(node, LocalVariableReadNode, LocalVariableWriteNode, node.name, node.depth) + end + + # foo ||= bar + # + # becomes + # + # foo || foo = bar + def visit_local_variable_or_write_node(node) + desugar_or_write_node(node, LocalVariableReadNode, LocalVariableWriteNode, node.name, node.depth) + end + + # foo += bar + # + # becomes + # + # foo = foo + bar + def visit_local_variable_operator_write_node(node) + desugar_operator_write_node(node, LocalVariableReadNode, LocalVariableWriteNode, node.name, node.depth) + end + + private + + # Desugar `x &&= y` to `x && x = y` + def desugar_and_write_node(node, read_class, write_class, *arguments) + AndNode.new( + read_class.new(*arguments, node.name_loc), + write_class.new(*arguments, node.name_loc, node.value, node.operator_loc, node.location), + node.operator_loc, + node.location + ) 
+ end + + # Desugar `x += y` to `x = x + y` + def desugar_operator_write_node(node, read_class, write_class, *arguments) + write_class.new( + *arguments, + node.name_loc, + CallNode.new( + read_class.new(*arguments, node.name_loc), + nil, + node.operator_loc.copy(length: node.operator_loc.length - 1), + nil, + ArgumentsNode.new([node.value], node.value.location), + nil, + nil, + 0, + node.operator_loc.slice.chomp("="), + node.location + ), + node.operator_loc.copy(start_offset: node.operator_loc.end_offset - 1, length: 1), + node.location + ) + end + + # Desugar `x ||= y` to `x || x = y` + def desugar_or_write_node(node, read_class, write_class, *arguments) + OrNode.new( + read_class.new(*arguments, node.name_loc), + write_class.new(*arguments, node.name_loc, node.value, node.operator_loc, node.location), + node.operator_loc, + node.location + ) + end + + # Desugar `x ||= y` to `defined?(x) ? x : x = y` + def desugar_or_write_defined_node(node, read_class, write_class, *arguments) + IfNode.new( + node.operator_loc, + DefinedNode.new(nil, read_class.new(*arguments, node.name_loc), nil, node.operator_loc, node.name_loc), + StatementsNode.new([read_class.new(*arguments, node.name_loc)], node.location), + ElseNode.new( + node.operator_loc, + StatementsNode.new( + [write_class.new(*arguments, node.name_loc, node.value, node.operator_loc, node.location)], + node.location + ), + node.operator_loc, + node.location + ), + node.operator_loc, + node.location + ) + end + end +end diff --git a/lib/prism/ffi.rb b/lib/prism/ffi.rb new file mode 100644 index 0000000000..82643be808 --- /dev/null +++ b/lib/prism/ffi.rb @@ -0,0 +1,251 @@ +# frozen_string_literal: true + +# This file is responsible for mirroring the API provided by the C extension by +# using FFI to call into the shared library. + +require "rbconfig" +require "ffi" + +module YARP + BACKEND = :FFI + + module LibRubyParser + extend FFI::Library + + # Define the library that we will be pulling functions from. 
Note that this + # must align with the build shared library from make/rake. + ffi_lib File.expand_path("../../build/librubyparser.#{RbConfig::CONFIG["SOEXT"]}", __dir__) + + # Convert a native C type declaration into a symbol that FFI understands. + # For example: + # + # const char * -> :pointer + # bool -> :bool + # size_t -> :size_t + # void -> :void + # + def self.resolve_type(type) + type = type.strip.delete_prefix("const ") + type.end_with?("*") ? :pointer : type.to_sym + end + + # Read through the given header file and find the declaration of each of the + # given functions. For each one, define a function with the same name and + # signature as the C function. + def self.load_exported_functions_from(header, *functions) + File.foreach(File.expand_path("../../include/#{header}", __dir__)) do |line| + # We only want to attempt to load exported functions. + next unless line.start_with?("YP_EXPORTED_FUNCTION ") + + # We only want to load the functions that we are interested in. + next unless functions.any? { |function| line.include?(function) } + + # Parse the function declaration. + unless /^YP_EXPORTED_FUNCTION (?<return_type>.+) (?<name>\w+)\((?<arg_types>.+)\);$/ =~ line + raise "Could not parse #{line}" + end + + # Delete the function from the list of functions we are looking for to + # mark it as having been found. + functions.delete(name) + + # Split up the argument types into an array, ensure we handle the case + # where there are no arguments (by explicit void). + arg_types = arg_types.split(",").map(&:strip) + arg_types = [] if arg_types == %w[void] + + # Resolve the type of the argument by dropping the name of the argument + # first if it is present. + arg_types.map! { |type| resolve_type(type.sub(/\w+$/, "")) } + + # Attach the function using the FFI library. + attach_function name, arg_types, resolve_type(return_type) + end + + # If we didn't find all of the functions, raise an error. 
+ raise "Could not find functions #{functions.inspect}" unless functions.empty? + end + + load_exported_functions_from( + "yarp.h", + "yp_version", + "yp_parse_serialize", + "yp_lex_serialize", + "yp_parse_lex_serialize" + ) + + load_exported_functions_from( + "yarp/util/yp_buffer.h", + "yp_buffer_sizeof", + "yp_buffer_init", + "yp_buffer_value", + "yp_buffer_length", + "yp_buffer_free" + ) + + load_exported_functions_from( + "yarp/util/yp_string.h", + "yp_string_mapped_init", + "yp_string_free", + "yp_string_source", + "yp_string_length", + "yp_string_sizeof" + ) + + # This object represents a yp_buffer_t. We only use it as an opaque pointer, + # so it doesn't need to know the fields of yp_buffer_t. + class YPBuffer + SIZEOF = LibRubyParser.yp_buffer_sizeof + + attr_reader :pointer + + def initialize(pointer) + @pointer = pointer + end + + def value + LibRubyParser.yp_buffer_value(pointer) + end + + def length + LibRubyParser.yp_buffer_length(pointer) + end + + def read + value.read_string(length) + end + + # Initialize a new buffer and yield it to the block. The buffer will be + # automatically freed when the block returns. + def self.with(&block) + pointer = FFI::MemoryPointer.new(SIZEOF) + + begin + raise unless LibRubyParser.yp_buffer_init(pointer) + yield new(pointer) + ensure + LibRubyParser.yp_buffer_free(pointer) + pointer.free + end + end + end + + # This object represents a yp_string_t. We only use it as an opaque pointer, + # so it doesn't have to be an FFI::Struct. + class YPString + SIZEOF = LibRubyParser.yp_string_sizeof + + attr_reader :pointer + + def initialize(pointer) + @pointer = pointer + end + + def source + LibRubyParser.yp_string_source(pointer) + end + + def length + LibRubyParser.yp_string_length(pointer) + end + + def read + source.read_string(length) + end + + # Yields a yp_string_t pointer to the given block. 
+ def self.with(filepath, &block) + pointer = FFI::MemoryPointer.new(SIZEOF) + + begin + raise unless LibRubyParser.yp_string_mapped_init(pointer, filepath) + yield new(pointer) + ensure + LibRubyParser.yp_string_free(pointer) + pointer.free + end + end + end + + def self.dump_internal(source, source_size, filepath) + YPBuffer.with do |buffer| + metadata = [filepath.bytesize, filepath.b, 0].pack("LA*L") if filepath + yp_parse_serialize(source, source_size, buffer.pointer, metadata) + buffer.read + end + end + end + + # Mark the LibRubyParser module as private as it should only be called through + # the YARP module. + private_constant :LibRubyParser + + # The version constant is set by reading the result of calling yp_version. + VERSION = LibRubyParser.yp_version.read_string + + # Mirror the YARP.dump API by using the serialization API. + def self.dump(code, filepath = nil) + LibRubyParser.dump_internal(code, code.bytesize, filepath) + end + + # Mirror the YARP.dump_file API by using the serialization API. + def self.dump_file(filepath) + LibRubyParser::YPString.with(filepath) do |string| + LibRubyParser.dump_internal(string.source, string.length, filepath) + end + end + + # Mirror the YARP.lex API by using the serialization API. + def self.lex(code, filepath = nil) + LibRubyParser::YPBuffer.with do |buffer| + LibRubyParser.yp_lex_serialize(code, code.bytesize, filepath, buffer.pointer) + Serialize.load_tokens(Source.new(code), buffer.read) + end + end + + # Mirror the YARP.lex_file API by using the serialization API. + def self.lex_file(filepath) + LibRubyParser::YPString.with(filepath) do |string| + lex(string.read, filepath) + end + end + + # Mirror the YARP.parse API by using the serialization API. + def self.parse(code, filepath = nil) + YARP.load(code, dump(code, filepath)) + end + + # Mirror the YARP.parse_file API by using the serialization API. This uses + # native strings instead of Ruby strings because it allows us to use mmap when + # it is available. 
+ def self.parse_file(filepath) + LibRubyParser::YPString.with(filepath) do |string| + parse(string.read, filepath) + end + end + + # Mirror the YARP.parse_lex API by using the serialization API. + def self.parse_lex(code, filepath = nil) + LibRubyParser::YPBuffer.with do |buffer| + metadata = [filepath.bytesize, filepath.b, 0].pack("LA*L") if filepath + LibRubyParser.yp_parse_lex_serialize(code, code.bytesize, buffer.pointer, metadata) + + source = Source.new(code) + loader = Serialize::Loader.new(source, buffer.read) + + tokens = loader.load_tokens + node, comments, errors, warnings = loader.load_nodes + + tokens.each { |token,| token.value.force_encoding(loader.encoding) } + + ParseResult.new([node, tokens], comments, errors, warnings, source) + end + end + + # Mirror the YARP.parse_lex_file API by using the serialization API. + def self.parse_lex_file(filepath) + LibRubyParser::YPString.with(filepath) do |string| + parse_lex(string.read, filepath) + end + end +end diff --git a/lib/prism/language_server.rb b/lib/prism/language_server.rb new file mode 100644 index 0000000000..5a10d484a1 --- /dev/null +++ b/lib/prism/language_server.rb @@ -0,0 +1,166 @@ +# frozen_string_literal: true + +require "cgi" +require "json" +require "uri" + +module YARP + # YARP additionally ships with a language server conforming to the + # language server protocol. It can be invoked by running the yarp-lsp + # bin script (bin/yarp-lsp) + class LanguageServer + GITHUB_TEMPLATE = <<~TEMPLATE + Reporting issue with error `%{error}`. + + ## Expected behavior + <!-- TODO: Briefly explain what the expected behavior should be on this example. --> + + ## Actual behavior + <!-- TODO: Describe here what actually happened. --> + + ## Steps to reproduce the problem + <!-- TODO: Describe how we can reproduce the problem. --> + + ## Additional information + <!-- TODO: Include any additional information, such as screenshots. 
--> + + TEMPLATE + + attr_reader :input, :output + + def initialize( + input: $stdin, + output: $stdout + ) + @input = input.binmode + @output = output.binmode + end + + # rubocop:disable Layout/LineLength + def run + store = + Hash.new do |hash, uri| + filepath = CGI.unescape(URI.parse(uri).path) + File.exist?(filepath) ? (hash[uri] = File.read(filepath)) : nil + end + + while (headers = input.gets("\r\n\r\n")) + source = input.read(headers[/Content-Length: (\d+)/i, 1].to_i) + request = JSON.parse(source, symbolize_names: true) + + # stree-ignore + case request + in { method: "initialize", id: } + store.clear + write(id: id, result: { capabilities: capabilities }) + in { method: "initialized" } + # ignored + in { method: "shutdown" } # tolerate missing ID to be a good citizen + store.clear + write(id: request[:id], result: {}) + in { method: "exit"} + return + in { method: "textDocument/didChange", params: { textDocument: { uri: }, contentChanges: [{ text: }, *] } } + store[uri] = text + in { method: "textDocument/didOpen", params: { textDocument: { uri:, text: } } } + store[uri] = text + in { method: "textDocument/didClose", params: { textDocument: { uri: } } } + store.delete(uri) + in { method: "textDocument/diagnostic", id:, params: { textDocument: { uri: } } } + contents = store[uri] + write(id: id, result: contents ? diagnostics(contents) : nil) + in { method: "textDocument/codeAction", id:, params: { textDocument: { uri: }, context: { diagnostics: }}} + contents = store[uri] + write(id: id, result: contents ? 
code_actions(contents, diagnostics) : nil) + in { method: %r{\$/.+} } + # ignored + end + end + end + # rubocop:enable Layout/LineLength + + private + + def capabilities + { + codeActionProvider: { + codeActionKinds: [ + 'quickfix', + ], + }, + diagnosticProvider: { + interFileDependencies: false, + workspaceDiagnostics: false, + }, + textDocumentSync: { + change: 1, + openClose: true + }, + } + end + + def code_actions(source, diagnostics) + diagnostics.map do |diagnostic| + message = diagnostic[:message] + issue_content = URI.encode_www_form_component(GITHUB_TEMPLATE % {error: message}) + issue_link = "https://github.com/ruby/yarp/issues/new?&labels=Bug&body=#{issue_content}" + + { + title: "Report incorrect error: `#{diagnostic[:message]}`", + kind: "quickfix", + diagnostics: [diagnostic], + command: { + title: "Report incorrect error", + command: "vscode.open", + arguments: [issue_link] + } + } + end + end + + def diagnostics(source) + offsets = Hash.new do |hash, key| + slice = source.byteslice(...key) + lineno = slice.count("\n") + + char = slice.length + newline = source.rindex("\n", [char - 1, 0].max) || -1 + hash[key] = { line: lineno, character: char - newline - 1 } + end + + parse_output = YARP.parse(source) + + { + kind: "full", + items: [ + *parse_output.errors.map do |error| + { + range: { + start: offsets[error.location.start_offset], + end: offsets[error.location.end_offset], + }, + message: error.message, + severity: 1, + } + end, + *parse_output.warnings.map do |warning| + { + range: { + start: offsets[warning.location.start_offset], + end: offsets[warning.location.end_offset], + }, + message: warning.message, + severity: 2, + } + end, + ] + } + end + + def write(value) + response = value.merge(jsonrpc: "2.0").to_json + output.print("Content-Length: #{response.bytesize}\r\n\r\n#{response}") + output.flush + end + end +end diff --git a/lib/prism/lex_compat.rb b/lib/prism/lex_compat.rb new file mode 100644 index 0000000000..720ac2b59b --- /dev/null 
+++ b/lib/prism/lex_compat.rb @@ -0,0 +1,838 @@ +# frozen_string_literal: true + +require "delegate" + +module YARP + # This class is responsible for lexing the source using YARP and then + # converting those tokens to be compatible with Ripper. In the vast majority + # of cases, this is a one-to-one mapping of the token type. Everything else + # generally lines up. However, there are a few cases that require special + # handling. + class LexCompat + # This is a mapping of YARP token types to Ripper token types. This is a + # many-to-one mapping because we split up our token types, whereas Ripper + # tends to group them. + RIPPER = { + AMPERSAND: :on_op, + AMPERSAND_AMPERSAND: :on_op, + AMPERSAND_AMPERSAND_EQUAL: :on_op, + AMPERSAND_DOT: :on_op, + AMPERSAND_EQUAL: :on_op, + BACK_REFERENCE: :on_backref, + BACKTICK: :on_backtick, + BANG: :on_op, + BANG_EQUAL: :on_op, + BANG_TILDE: :on_op, + BRACE_LEFT: :on_lbrace, + BRACE_RIGHT: :on_rbrace, + BRACKET_LEFT: :on_lbracket, + BRACKET_LEFT_ARRAY: :on_lbracket, + BRACKET_LEFT_RIGHT: :on_op, + BRACKET_LEFT_RIGHT_EQUAL: :on_op, + BRACKET_RIGHT: :on_rbracket, + CARET: :on_op, + CARET_EQUAL: :on_op, + CHARACTER_LITERAL: :on_CHAR, + CLASS_VARIABLE: :on_cvar, + COLON: :on_op, + COLON_COLON: :on_op, + COMMA: :on_comma, + COMMENT: :on_comment, + CONSTANT: :on_const, + DOT: :on_period, + DOT_DOT: :on_op, + DOT_DOT_DOT: :on_op, + EMBDOC_BEGIN: :on_embdoc_beg, + EMBDOC_END: :on_embdoc_end, + EMBDOC_LINE: :on_embdoc, + EMBEXPR_BEGIN: :on_embexpr_beg, + EMBEXPR_END: :on_embexpr_end, + EMBVAR: :on_embvar, + EOF: :on_eof, + EQUAL: :on_op, + EQUAL_EQUAL: :on_op, + EQUAL_EQUAL_EQUAL: :on_op, + EQUAL_GREATER: :on_op, + EQUAL_TILDE: :on_op, + FLOAT: :on_float, + FLOAT_IMAGINARY: :on_imaginary, + FLOAT_RATIONAL: :on_rational, + FLOAT_RATIONAL_IMAGINARY: :on_imaginary, + GREATER: :on_op, + GREATER_EQUAL: :on_op, + GREATER_GREATER: :on_op, + GREATER_GREATER_EQUAL: :on_op, + GLOBAL_VARIABLE: :on_gvar, + HEREDOC_END: :on_heredoc_end, + 
HEREDOC_START: :on_heredoc_beg, + IDENTIFIER: :on_ident, + IGNORED_NEWLINE: :on_ignored_nl, + INTEGER: :on_int, + INTEGER_IMAGINARY: :on_imaginary, + INTEGER_RATIONAL: :on_rational, + INTEGER_RATIONAL_IMAGINARY: :on_imaginary, + INSTANCE_VARIABLE: :on_ivar, + INVALID: :INVALID, + KEYWORD___ENCODING__: :on_kw, + KEYWORD___LINE__: :on_kw, + KEYWORD___FILE__: :on_kw, + KEYWORD_ALIAS: :on_kw, + KEYWORD_AND: :on_kw, + KEYWORD_BEGIN: :on_kw, + KEYWORD_BEGIN_UPCASE: :on_kw, + KEYWORD_BREAK: :on_kw, + KEYWORD_CASE: :on_kw, + KEYWORD_CLASS: :on_kw, + KEYWORD_DEF: :on_kw, + KEYWORD_DEFINED: :on_kw, + KEYWORD_DO: :on_kw, + KEYWORD_DO_LOOP: :on_kw, + KEYWORD_ELSE: :on_kw, + KEYWORD_ELSIF: :on_kw, + KEYWORD_END: :on_kw, + KEYWORD_END_UPCASE: :on_kw, + KEYWORD_ENSURE: :on_kw, + KEYWORD_FALSE: :on_kw, + KEYWORD_FOR: :on_kw, + KEYWORD_IF: :on_kw, + KEYWORD_IF_MODIFIER: :on_kw, + KEYWORD_IN: :on_kw, + KEYWORD_MODULE: :on_kw, + KEYWORD_NEXT: :on_kw, + KEYWORD_NIL: :on_kw, + KEYWORD_NOT: :on_kw, + KEYWORD_OR: :on_kw, + KEYWORD_REDO: :on_kw, + KEYWORD_RESCUE: :on_kw, + KEYWORD_RESCUE_MODIFIER: :on_kw, + KEYWORD_RETRY: :on_kw, + KEYWORD_RETURN: :on_kw, + KEYWORD_SELF: :on_kw, + KEYWORD_SUPER: :on_kw, + KEYWORD_THEN: :on_kw, + KEYWORD_TRUE: :on_kw, + KEYWORD_UNDEF: :on_kw, + KEYWORD_UNLESS: :on_kw, + KEYWORD_UNLESS_MODIFIER: :on_kw, + KEYWORD_UNTIL: :on_kw, + KEYWORD_UNTIL_MODIFIER: :on_kw, + KEYWORD_WHEN: :on_kw, + KEYWORD_WHILE: :on_kw, + KEYWORD_WHILE_MODIFIER: :on_kw, + KEYWORD_YIELD: :on_kw, + LABEL: :on_label, + LABEL_END: :on_label_end, + LAMBDA_BEGIN: :on_tlambeg, + LESS: :on_op, + LESS_EQUAL: :on_op, + LESS_EQUAL_GREATER: :on_op, + LESS_LESS: :on_op, + LESS_LESS_EQUAL: :on_op, + METHOD_NAME: :on_ident, + MINUS: :on_op, + MINUS_EQUAL: :on_op, + MINUS_GREATER: :on_tlambda, + NEWLINE: :on_nl, + NUMBERED_REFERENCE: :on_backref, + PARENTHESIS_LEFT: :on_lparen, + PARENTHESIS_LEFT_PARENTHESES: :on_lparen, + PARENTHESIS_RIGHT: :on_rparen, + PERCENT: :on_op, + PERCENT_EQUAL: :on_op, + 
PERCENT_LOWER_I: :on_qsymbols_beg, + PERCENT_LOWER_W: :on_qwords_beg, + PERCENT_LOWER_X: :on_backtick, + PERCENT_UPPER_I: :on_symbols_beg, + PERCENT_UPPER_W: :on_words_beg, + PIPE: :on_op, + PIPE_EQUAL: :on_op, + PIPE_PIPE: :on_op, + PIPE_PIPE_EQUAL: :on_op, + PLUS: :on_op, + PLUS_EQUAL: :on_op, + QUESTION_MARK: :on_op, + RATIONAL_FLOAT: :on_rational, + RATIONAL_INTEGER: :on_rational, + REGEXP_BEGIN: :on_regexp_beg, + REGEXP_END: :on_regexp_end, + SEMICOLON: :on_semicolon, + SLASH: :on_op, + SLASH_EQUAL: :on_op, + STAR: :on_op, + STAR_EQUAL: :on_op, + STAR_STAR: :on_op, + STAR_STAR_EQUAL: :on_op, + STRING_BEGIN: :on_tstring_beg, + STRING_CONTENT: :on_tstring_content, + STRING_END: :on_tstring_end, + SYMBOL_BEGIN: :on_symbeg, + TILDE: :on_op, + UAMPERSAND: :on_op, + UCOLON_COLON: :on_op, + UDOT_DOT: :on_op, + UDOT_DOT_DOT: :on_op, + UMINUS: :on_op, + UMINUS_NUM: :on_op, + UPLUS: :on_op, + USTAR: :on_op, + USTAR_STAR: :on_op, + WORDS_SEP: :on_words_sep, + "__END__": :on___end__ + }.freeze + + # When we produce tokens, we produce the same arrays that Ripper does. + # However, we add a couple of convenience methods onto them to make them a + # little easier to work with. We delegate all other methods to the array. + class Token < SimpleDelegator + def location + self[0] + end + + def event + self[1] + end + + def value + self[2] + end + + def state + self[3] + end + end + + # Ripper doesn't include the rest of the token in the event, so we need to + # trim it down to just the content on the first line when comparing. 
+ class EndContentToken < Token + def ==(other) + [self[0], self[1], self[2][0..self[2].index("\n")], self[3]] == other + end + end + + # Tokens where state should be ignored + # used for :on_comment, :on_heredoc_end, :on_embexpr_end + class IgnoreStateToken < Token + def ==(other) + self[0...-1] == other[0...-1] + end + end + + # Ident tokens for the most part are exactly the same, except sometimes we + # know an ident is a local when ripper doesn't (when they are introduced + # through named captures in regular expressions). In that case we don't + # compare the state. + class IdentToken < Token + def ==(other) + (self[0...-1] == other[0...-1]) && ( + (other[3] == Ripper::EXPR_LABEL | Ripper::EXPR_END) || + (other[3] & Ripper::EXPR_ARG_ANY != 0) + ) + end + end + + # Ignored newlines can occasionally have a LABEL state attached to them, so + # we compare the state differently here. + class IgnoredNewlineToken < Token + def ==(other) + return false unless self[0...-1] == other[0...-1] + + if self[4] == Ripper::EXPR_ARG | Ripper::EXPR_LABELED + other[4] & Ripper::EXPR_ARG | Ripper::EXPR_LABELED > 0 + else + self[4] == other[4] + end + end + end + + # If we have an identifier that follows a method name like: + # + # def foo bar + # + # then Ripper will mark bar as END|LABEL if there is a local in a parent + # scope named bar because it hasn't pushed the local table yet. We do this + # more accurately, so we need to allow comparing against both END and + # END|LABEL. + class ParamToken < Token + def ==(other) + (self[0...-1] == other[0...-1]) && ( + (other[3] == Ripper::EXPR_END) || + (other[3] == Ripper::EXPR_END | Ripper::EXPR_LABEL) + ) + end + end + + # A heredoc in this case is a list of tokens that belong to the body of the + # heredoc that should be appended onto the list of tokens when the heredoc + # closes. + module Heredoc + # Heredocs that are no dash or tilde heredocs are just a list of tokens. 
+ # We need to keep them around so that we can insert them in the correct + # order back into the token stream and set the state of the last token to + # the state that the heredoc was opened in. + class PlainHeredoc + attr_reader :tokens + + def initialize + @tokens = [] + end + + def <<(token) + tokens << token + end + + def to_a + tokens + end + end + + # Dash heredocs are a little more complicated. They are a list of tokens + # that need to be split on "\\\n" to mimic Ripper's behavior. We also need + # to keep track of the state that the heredoc was opened in. + class DashHeredoc + attr_reader :split, :tokens + + def initialize(split) + @split = split + @tokens = [] + end + + def <<(token) + tokens << token + end + + def to_a + embexpr_balance = 0 + + tokens.each_with_object([]) do |token, results| + case token.event + when :on_embexpr_beg + embexpr_balance += 1 + results << token + when :on_embexpr_end + embexpr_balance -= 1 + results << token + when :on_tstring_content + if embexpr_balance == 0 + lineno = token[0][0] + column = token[0][1] + + if split + # Split on "\\\n" to mimic Ripper's behavior. Use a lookbehind + # to keep the delimiter in the result. + token.value.split(/(?<=[^\\]\\\n)|(?<=[^\\]\\\r\n)/).each_with_index do |value, index| + column = 0 if index > 0 + results << Token.new([[lineno, column], :on_tstring_content, value, token.state]) + lineno += value.count("\n") + end + else + results << token + end + else + results << token + end + else + results << token + end + end + end + end + + # Heredocs that are dedenting heredocs are a little more complicated. + # Ripper outputs on_ignored_sp tokens for the whitespace that is being + # removed from the output. YARP only modifies the node itself and keeps + # the token the same. This simplifies YARP, but makes comparing against + # Ripper much harder because there is a length mismatch. 
#
# Fortunately, we already have to pull out the heredoc tokens in order to
# insert them into the stream in the correct order. As such, we can do
# some extra manipulation on the tokens to make them match Ripper's
# output by mirroring the dedent logic that Ripper uses.
# (Nested inside YARP::LexCompat::Heredoc in the original tree.)
class DedentingHeredoc
  # Ripper expands tabs to the next multiple of 8 columns when measuring
  # leading whitespace; mirror that here.
  TAB_WIDTH = 8

  attr_reader :tokens, :dedent_next, :dedent, :embexpr_balance

  def initialize
    @tokens = []
    @dedent_next = true
    @dedent = nil
    @embexpr_balance = 0
  end

  # As tokens are coming in, we track the minimum amount of common leading
  # whitespace on plain string content tokens. This allows us to later
  # remove that amount of whitespace from the beginning of each line.
  def <<(token)
    case token.event
    when :on_embexpr_beg, :on_heredoc_beg
      @embexpr_balance += 1
    when :on_embexpr_end, :on_heredoc_end
      @embexpr_balance -= 1
    when :on_tstring_content
      # Only measure plain content outside of interpolations/nested
      # heredocs, and only at the start of a line.
      if embexpr_balance == 0
        token.value.split(/(?<=\n)/).each_with_index do |line, index|
          # Entirely-blank lines do not participate in the common leading
          # whitespace calculation.
          next if line.strip.empty? && line.end_with?("\n")
          next if !(dedent_next || index > 0)

          leading = line[/\A(\s*)\n?/, 1]
          next_dedent = 0

          leading.each_char do |char|
            if char == "\t"
              next_dedent = next_dedent - (next_dedent % TAB_WIDTH) + TAB_WIDTH
            else
              next_dedent += 1
            end
          end

          # compact drops the initial nil so the first measurement wins
          # outright; afterwards we keep the minimum.
          @dedent = [dedent, next_dedent].compact.min
        end
      end
    end

    # The next chunk starts a fresh line only if this token was plain
    # string content at the top level.
    @dedent_next = token.event == :on_tstring_content && embexpr_balance == 0
    tokens << token
  end

  def to_a
    # If every line in the heredoc is blank, we still need to split up the
    # string content token into multiple tokens.
    if dedent.nil?
      results = []
      embexpr_balance = 0

      tokens.each do |token|
        case token.event
        when :on_embexpr_beg, :on_heredoc_beg
          embexpr_balance += 1
          results << token
        when :on_embexpr_end, :on_heredoc_end
          embexpr_balance -= 1
          results << token
        when :on_tstring_content
          if embexpr_balance == 0
            lineno = token[0][0]
            column = token[0][1]

            token.value.split(/(?<=\n)/).each_with_index do |value, index|
              column = 0 if index > 0
              results << Token.new([[lineno, column], :on_tstring_content, value, token.state])
              # NOTE(review): assumes each split piece spans exactly one
              # line (the lookbehind split keeps one "\n" per piece except
              # possibly the last) — confirm against Ripper's output.
              lineno += 1
            end
          else
            results << token
          end
        else
          results << token
        end
      end

      return results
    end

    # Otherwise, we're going to run through each token in the list and
    # insert on_ignored_sp tokens for the amount of dedent that we need to
    # perform. We also need to remove the dedent from the beginning of
    # each line of plain string content tokens.
    results = []
    dedent_next = true
    embexpr_balance = 0

    tokens.each do |token|
      # Notice that the structure of this conditional largely matches the
      # whitespace calculation we performed above. This is because
      # checking if the subsequent token needs to be dedented is common to
      # both the dedent calculation and the ignored_sp insertion.
      case token.event
      when :on_embexpr_beg
        embexpr_balance += 1
        results << token
      when :on_embexpr_end
        embexpr_balance -= 1
        results << token
      when :on_tstring_content
        if embexpr_balance == 0
          # Here we're going to split the string on newlines, but maintain
          # the newlines in the resulting array. We'll do that with a look
          # behind assertion.
          splits = token.value.split(/(?<=\n)/)
          index = 0

          while index < splits.length
            line = splits[index]
            lineno = token[0][0] + index
            column = token[0][1]

            # Blank lines do not count toward common leading whitespace
            # calculation and do not need to be dedented.
            if dedent_next || index > 0
              column = 0
            end

            # If the dedent is 0 and we're not supposed to dedent the next
            # line or this line doesn't start with whitespace, then we
            # should concatenate the rest of the string to match ripper.
            if dedent == 0 && (!dedent_next || !line.start_with?(/\s/))
              line = splits[index..].join
              index = splits.length
            end

            # If we are supposed to dedent this line or if this is not the
            # first line of the string and this line isn't entirely blank,
            # then we need to insert an on_ignored_sp token and remove the
            # dedent from the beginning of the line.
            if (dedent > 0) && (dedent_next || index > 0)
              deleting = 0
              deleted_chars = []

              # Gather up all of the characters that we're going to
              # delete, stopping when you hit a character that would put
              # you over the dedent amount.
              line.each_char.with_index do |char, i|
                case char
                when "\r"
                  # A "\r\n" pair terminates the line; a lone "\r" counts
                  # as an ordinary (deletable) character below.
                  if line.chars[i + 1] == "\n"
                    break
                  end
                when "\n"
                  break
                when "\t"
                  deleting = deleting - (deleting % TAB_WIDTH) + TAB_WIDTH
                else
                  deleting += 1
                end

                break if deleting > dedent
                deleted_chars << char
              end

              # If we have something to delete, then delete it from the
              # string and insert an on_ignored_sp token.
              if deleted_chars.any?
                ignored = deleted_chars.join
                line.delete_prefix!(ignored)

                results << Token.new([[lineno, 0], :on_ignored_sp, ignored, token[3]])
                column = ignored.length
              end
            end

            results << Token.new([[lineno, column], token[1], line, token[3]]) unless line.empty?
            index += 1
          end
        else
          results << token
        end
      else
        results << token
      end

      dedent_next =
        ((token.event == :on_tstring_content) || (token.event == :on_heredoc_end)) &&
        embexpr_balance == 0
    end

    results
  end
end

# Here we will split between the two types of heredocs and return the
# object that will store their tokens.
+ def self.build(opening) + case opening.value[2] + when "~" + DedentingHeredoc.new + when "-" + DashHeredoc.new(opening.value[3] != "'") + else + PlainHeredoc.new + end + end + end + + attr_reader :source, :filepath + + def initialize(source, filepath = "") + @source = source + @filepath = filepath || "" + end + + def result + tokens = [] + + state = :default + heredoc_stack = [[]] + + result = YARP.lex(source, @filepath) + result_value = result.value + previous_state = nil + + # In previous versions of Ruby, Ripper wouldn't flush the bom before the + # first token, so we had to have a hack in place to account for that. This + # checks for that behavior. + bom_flushed = Ripper.lex("\xEF\xBB\xBF# test")[0][0][1] == 0 + bom = source.byteslice(0..2) == "\xEF\xBB\xBF" + + result_value.each_with_index do |(token, lex_state), index| + lineno = token.location.start_line + column = token.location.start_column + + # If there's a UTF-8 byte-order mark as the start of the file, then for + # certain tokens ripper sets the first token back by 3 bytes. It also + # keeps the byte order mark in the first token's value. This is weird, + # and I don't want to mirror that in our parser. So instead, we'll match + # up the columns and values here. 
+ if bom && lineno == 1 + column -= 3 + + if index == 0 && column == 0 && !bom_flushed + flushed = + case token.type + when :BACK_REFERENCE, :INSTANCE_VARIABLE, :CLASS_VARIABLE, + :GLOBAL_VARIABLE, :NUMBERED_REFERENCE, :PERCENT_LOWER_I, + :PERCENT_LOWER_X, :PERCENT_LOWER_W, :PERCENT_UPPER_I, + :PERCENT_UPPER_W, :STRING_BEGIN + true + when :REGEXP_BEGIN, :SYMBOL_BEGIN + token.value.start_with?("%") + else + false + end + + unless flushed + column -= 3 + value = token.value + value.prepend(String.new("\xEF\xBB\xBF", encoding: value.encoding)) + end + end + end + + event = RIPPER.fetch(token.type) + value = token.value + lex_state = Ripper::Lexer::State.new(lex_state) + + token = + case event + when :on___end__ + EndContentToken.new([[lineno, column], event, value, lex_state]) + when :on_comment + IgnoreStateToken.new([[lineno, column], event, value, lex_state]) + when :on_heredoc_end + # Heredoc end tokens can be emitted in an odd order, so we don't + # want to bother comparing the state on them. + IgnoreStateToken.new([[lineno, column], event, value, lex_state]) + when :on_ident + if lex_state == Ripper::EXPR_END + # If we have an identifier that follows a method name like: + # + # def foo bar + # + # then Ripper will mark bar as END|LABEL if there is a local in a + # parent scope named bar because it hasn't pushed the local table + # yet. We do this more accurately, so we need to allow comparing + # against both END and END|LABEL. + ParamToken.new([[lineno, column], event, value, lex_state]) + elsif lex_state == Ripper::EXPR_END | Ripper::EXPR_LABEL + # In the event that we're comparing identifiers, we're going to + # allow a little divergence. Ripper doesn't account for local + # variables introduced through named captures in regexes, and we + # do, which accounts for this difference. 
+ IdentToken.new([[lineno, column], event, value, lex_state]) + else + Token.new([[lineno, column], event, value, lex_state]) + end + when :on_embexpr_end + IgnoreStateToken.new([[lineno, column], event, value, lex_state]) + when :on_ignored_nl + # Ignored newlines can occasionally have a LABEL state attached to + # them which doesn't actually impact anything. We don't mirror that + # state so we ignored it. + IgnoredNewlineToken.new([[lineno, column], event, value, lex_state]) + when :on_regexp_end + # On regex end, Ripper scans and then sets end state, so the ripper + # lexed output is begin, when it should be end. YARP sets lex state + # correctly to end state, but we want to be able to compare against + # Ripper's lexed state. So here, if it's a regexp end token, we + # output the state as the previous state, solely for the sake of + # comparison. + previous_token = result_value[index - 1][0] + lex_state = + if RIPPER.fetch(previous_token.type) == :on_embexpr_end + # If the previous token is embexpr_end, then we have to do even + # more processing. The end of an embedded expression sets the + # state to the state that it had at the beginning of the + # embedded expression. So we have to go and find that state and + # set it here. + counter = 1 + current_index = index - 1 + + until counter == 0 + current_index -= 1 + current_event = RIPPER.fetch(result_value[current_index][0].type) + counter += { on_embexpr_beg: -1, on_embexpr_end: 1 }[current_event] || 0 + end + + Ripper::Lexer::State.new(result_value[current_index][1]) + else + previous_state + end + + Token.new([[lineno, column], event, value, lex_state]) + when :on_eof + previous_token = result_value[index - 1][0] + + # If we're at the end of the file and the previous token was a + # comment and there is still whitespace after the comment, then + # Ripper will append a on_nl token (even though there isn't + # necessarily a newline). We mirror that here. 
+ start_offset = previous_token.location.end_offset + end_offset = token.location.start_offset + + if previous_token.type == :COMMENT && start_offset < end_offset + if bom + start_offset += 3 + end_offset += 3 + end + + tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state]) + end + + Token.new([[lineno, column], event, value, lex_state]) + else + Token.new([[lineno, column], event, value, lex_state]) + end + + previous_state = lex_state + + # The order in which tokens appear in our lexer is different from the + # order that they appear in Ripper. When we hit the declaration of a + # heredoc in YARP, we skip forward and lex the rest of the content of + # the heredoc before going back and lexing at the end of the heredoc + # identifier. + # + # To match up to ripper, we keep a small state variable around here to + # track whether we're in the middle of a heredoc or not. In this way we + # can shuffle around the token to match Ripper's output. + case state + when :default + # The default state is when there are no heredocs at all. In this + # state we can append the token to the list of tokens and move on. + tokens << token + + # If we get the declaration of a heredoc, then we open a new heredoc + # and move into the heredoc_opened state. + if event == :on_heredoc_beg + state = :heredoc_opened + heredoc_stack.last << Heredoc.build(token) + end + when :heredoc_opened + # The heredoc_opened state is when we've seen the declaration of a + # heredoc and are now lexing the body of the heredoc. In this state we + # push tokens onto the most recently created heredoc. + heredoc_stack.last.last << token + + case event + when :on_heredoc_beg + # If we receive a heredoc declaration while lexing the body of a + # heredoc, this means we have nested heredocs. In this case we'll + # push a new heredoc onto the stack and stay in the heredoc_opened + # state since we're now lexing the body of the new heredoc. 
+ heredoc_stack << [Heredoc.build(token)] + when :on_heredoc_end + # If we receive the end of a heredoc, then we're done lexing the + # body of the heredoc. In this case we now have a completed heredoc + # but need to wait for the next newline to push it into the token + # stream. + state = :heredoc_closed + end + when :heredoc_closed + if %i[on_nl on_ignored_nl on_comment].include?(event) || (event == :on_tstring_content && value.end_with?("\n")) + if heredoc_stack.size > 1 + flushing = heredoc_stack.pop + heredoc_stack.last.last << token + + flushing.each do |heredoc| + heredoc.to_a.each do |flushed_token| + heredoc_stack.last.last << flushed_token + end + end + + state = :heredoc_opened + next + end + elsif event == :on_heredoc_beg + tokens << token + state = :heredoc_opened + heredoc_stack.last << Heredoc.build(token) + next + elsif heredoc_stack.size > 1 + heredoc_stack[-2].last << token + next + end + + heredoc_stack.last.each do |heredoc| + tokens.concat(heredoc.to_a) + end + + heredoc_stack.last.clear + state = :default + + tokens << token + end + end + + # Drop the EOF token from the list + tokens = tokens[0...-1] + + # We sort by location to compare against Ripper's output + tokens.sort_by!(&:location) + + if result_value.size - 1 > tokens.size + raise StandardError, "Lost tokens when performing lex_compat" + end + + ParseResult.new(tokens, result.comments, result.errors, result.warnings, []) + end + end + + # This is a class that wraps the Ripper lexer to produce almost exactly the + # same tokens. 
+ class LexRipper + attr_reader :source + + def initialize(source) + @source = source + end + + def result + previous = [] + results = [] + + Ripper.lex(source, raise_errors: true).each do |token| + case token[1] + when :on_sp + # skip + when :on_tstring_content + if previous[1] == :on_tstring_content && (token[2].start_with?("\#$") || token[2].start_with?("\#@")) + previous[2] << token[2] + else + results << token + previous = token + end + when :on_words_sep + if previous[1] == :on_words_sep + previous[2] << token[2] + else + results << token + previous = token + end + else + results << token + previous = token + end + end + + results + end + end +end diff --git a/lib/prism/node_ext.rb b/lib/prism/node_ext.rb new file mode 100644 index 0000000000..760b3d75df --- /dev/null +++ b/lib/prism/node_ext.rb @@ -0,0 +1,55 @@ +# frozen_string_literal: true + +# Here we are reopening the YARP module to provide methods on nodes that aren't +# templated and are meant as convenience methods. +module YARP + class FloatNode < Node + # Returns the value of the node as a Ruby Float. + def value + Float(slice) + end + end + + class ImaginaryNode < Node + # Returns the value of the node as a Ruby Complex. + def value + Complex(0, numeric.value) + end + end + + class IntegerNode < Node + # Returns the value of the node as a Ruby Integer. + def value + Integer(slice) + end + end + + class InterpolatedRegularExpressionNode < Node + # Returns a numeric value that represents the flags that were used to create + # the regular expression. 
+ def options + o = flags & (RegularExpressionFlags::IGNORE_CASE | RegularExpressionFlags::EXTENDED | RegularExpressionFlags::MULTI_LINE) + o |= Regexp::FIXEDENCODING if flags.anybits?(RegularExpressionFlags::EUC_JP | RegularExpressionFlags::WINDOWS_31J | RegularExpressionFlags::UTF_8) + o |= Regexp::NOENCODING if flags.anybits?(RegularExpressionFlags::ASCII_8BIT) + o + end + end + + class RationalNode < Node + # Returns the value of the node as a Ruby Rational. + def value + Rational(slice.chomp("r")) + end + end + + class RegularExpressionNode < Node + # Returns a numeric value that represents the flags that were used to create + # the regular expression. + def options + o = flags & (RegularExpressionFlags::IGNORE_CASE | RegularExpressionFlags::EXTENDED | RegularExpressionFlags::MULTI_LINE) + o |= Regexp::FIXEDENCODING if flags.anybits?(RegularExpressionFlags::EUC_JP | RegularExpressionFlags::WINDOWS_31J | RegularExpressionFlags::UTF_8) + o |= Regexp::NOENCODING if flags.anybits?(RegularExpressionFlags::ASCII_8BIT) + o + end + end +end diff --git a/lib/prism/node_inspector.rb b/lib/prism/node_inspector.rb new file mode 100644 index 0000000000..c09840a471 --- /dev/null +++ b/lib/prism/node_inspector.rb @@ -0,0 +1,68 @@ +# frozen_string_literal: true + +module YARP + # This object is responsible for generating the output for the inspect method + # implementations of child nodes. + class NodeInspector + attr_reader :prefix, :output + + def initialize(prefix = "") + @prefix = prefix + @output = +"" + end + + # Appends a line to the output with the current prefix. + def <<(line) + output << "#{prefix}#{line}" + end + + # This generates a string that is used as the header of the inspect output + # for any given node. 
+ def header(node) + output = +"@ #{node.class.name.split("::").last} (" + output << "location: (#{node.location.start_line},#{node.location.start_column})-(#{node.location.end_line},#{node.location.end_column})" + output << ", newline: true" if node.newline? + output << ")\n" + output + end + + # Generates a string that represents a list of nodes. It handles properly + # using the box drawing characters to make the output look nice. + def list(prefix, nodes) + output = +"(length: #{nodes.length})\n" + last_index = nodes.length - 1 + + nodes.each_with_index do |node, index| + pointer, preadd = (index == last_index) ? ["└── ", " "] : ["├── ", "│ "] + node_prefix = "#{prefix}#{preadd}" + output << node.inspect(NodeInspector.new(node_prefix)).sub(node_prefix, "#{prefix}#{pointer}") + end + + output + end + + # Generates a string that represents a location field on a node. + def location(value) + if value + "(#{value.start_line},#{value.start_column})-(#{value.end_line},#{value.end_column}) = #{value.slice.inspect}" + else + "∅" + end + end + + # Generates a string that represents a child node. + def child_node(node, append) + node.inspect(child_inspector(append)).delete_prefix(prefix) + end + + # Returns a new inspector that can be used to inspect a child node. + def child_inspector(append) + NodeInspector.new("#{prefix}#{append}") + end + + # Returns the output as a string. 
+ def to_str + output + end + end +end diff --git a/lib/prism/pack.rb b/lib/prism/pack.rb new file mode 100644 index 0000000000..83f5569923 --- /dev/null +++ b/lib/prism/pack.rb @@ -0,0 +1,185 @@ +# frozen_string_literal: true + +module YARP + module Pack + %i[ + SPACE + COMMENT + INTEGER + UTF8 + BER + FLOAT + STRING_SPACE_PADDED + STRING_NULL_PADDED + STRING_NULL_TERMINATED + STRING_MSB + STRING_LSB + STRING_HEX_HIGH + STRING_HEX_LOW + STRING_UU + STRING_MIME + STRING_BASE64 + STRING_FIXED + STRING_POINTER + MOVE + BACK + NULL + + UNSIGNED + SIGNED + SIGNED_NA + + AGNOSTIC_ENDIAN + LITTLE_ENDIAN + BIG_ENDIAN + NATIVE_ENDIAN + ENDIAN_NA + + SIZE_SHORT + SIZE_INT + SIZE_LONG + SIZE_LONG_LONG + SIZE_8 + SIZE_16 + SIZE_32 + SIZE_64 + SIZE_P + SIZE_NA + + LENGTH_FIXED + LENGTH_MAX + LENGTH_RELATIVE + LENGTH_NA + ].each do |const| + const_set(const, const) + end + + class Directive + attr_reader :version, :variant, :source, :type, :signed, :endian, :size, :length_type, :length + + def initialize(version, variant, source, type, signed, endian, size, length_type, length) + @version = version + @variant = variant + @source = source + @type = type + @signed = signed + @endian = endian + @size = size + @length_type = length_type + @length = length + end + + ENDIAN_DESCRIPTIONS = { + AGNOSTIC_ENDIAN: 'agnostic', + LITTLE_ENDIAN: 'little-endian (VAX)', + BIG_ENDIAN: 'big-endian (network)', + NATIVE_ENDIAN: 'native-endian', + ENDIAN_NA: 'n/a' + } + + SIGNED_DESCRIPTIONS = { + UNSIGNED: 'unsigned', + SIGNED: 'signed', + SIGNED_NA: 'n/a' + } + + SIZE_DESCRIPTIONS = { + SIZE_SHORT: 'short', + SIZE_INT: 'int-width', + SIZE_LONG: 'long', + SIZE_LONG_LONG: 'long long', + SIZE_8: '8-bit', + SIZE_16: '16-bit', + SIZE_32: '32-bit', + SIZE_64: '64-bit', + SIZE_P: 'pointer-width' + } + + def describe + case type + when SPACE + 'whitespace' + when COMMENT + 'comment' + when INTEGER + if size == SIZE_8 + base = "#{SIGNED_DESCRIPTIONS[signed]} #{SIZE_DESCRIPTIONS[size]} integer" + else + 
base = "#{SIGNED_DESCRIPTIONS[signed]} #{SIZE_DESCRIPTIONS[size]} #{ENDIAN_DESCRIPTIONS[endian]} integer" + end + case length_type + when LENGTH_FIXED + if length > 1 + base + ", x#{length}" + else + base + end + when LENGTH_MAX + base + ', as many as possible' + end + when UTF8 + 'UTF-8 character' + when BER + 'BER-compressed integer' + when FLOAT + "#{SIZE_DESCRIPTIONS[size]} #{ENDIAN_DESCRIPTIONS[endian]} float" + when STRING_SPACE_PADDED + 'arbitrary binary string (space padded)' + when STRING_NULL_PADDED + 'arbitrary binary string (null padded, count is width)' + when STRING_NULL_TERMINATED + 'arbitrary binary string (null padded, count is width), except that null is added with *' + when STRING_MSB + 'bit string (MSB first)' + when STRING_LSB + 'bit string (LSB first)' + when STRING_HEX_HIGH + 'hex string (high nibble first)' + when STRING_HEX_LOW + 'hex string (low nibble first)' + when STRING_UU + 'UU-encoded string' + when STRING_MIME + 'quoted printable, MIME encoding' + when STRING_BASE64 + 'base64 encoded string' + when STRING_FIXED + 'pointer to a structure (fixed-length string)' + when STRING_POINTER + 'pointer to a null-terminated string' + when MOVE + 'move to absolute position' + when BACK + 'back up a byte' + when NULL + 'null byte' + else + raise + end + end + end + + class Format + attr_reader :directives, :encoding + + def initialize(directives, encoding) + @directives = directives + @encoding = encoding + end + + def describe + source_width = directives.map { |d| d.source.inspect.length }.max + directive_lines = directives.map do |directive| + if directive.type == SPACE + source = directive.source.inspect + else + source = directive.source + end + " #{source.ljust(source_width)} #{directive.describe}" + end + + (['Directives:'] + directive_lines + ['Encoding:', " #{encoding}"]).join("\n") + end + end + end +end diff --git a/lib/prism/parse_result.rb b/lib/prism/parse_result.rb new file mode 100644 index 0000000000..2d9d855b86 --- /dev/null +++ 
# frozen_string_literal: true

module YARP
  # This represents a source of Ruby code that has been parsed. It is used in
  # conjunction with locations to allow them to resolve line numbers and source
  # ranges.
  class Source
    attr_reader :source, :offsets

    # source is the raw source string; offsets is the list of byte offsets of
    # the start of every line (computed from the source by default).
    def initialize(source, offsets = compute_offsets(source))
      @source = source
      @offsets = offsets
    end

    # Returns the byteslice of the source for the given byte offset/length.
    def slice(offset, length)
      source.byteslice(offset, length)
    end

    # Returns the 1-indexed line number containing the given byte offset.
    # Offsets at or past the end of the source map to the last line.
    def line(value)
      offsets.bsearch_index { |offset| offset > value } || offsets.length
    end

    # Returns the byte offset of the start of the line containing the given
    # byte offset.
    def line_offset(value)
      offsets[line(value) - 1]
    end

    # Returns the byte column (0-indexed) of the given byte offset within its
    # line.
    def column(value)
      value - offsets[line(value) - 1]
    end

    private

    # Scans the source (as binary, so offsets are byte offsets) and records
    # the offset just past each newline as a line start.
    def compute_offsets(code)
      offsets = [0]
      code.b.scan("\n") { offsets << $~.end(0) }
      offsets
    end
  end

  # This represents a location in the source.
  class Location
    # A Source object that is used to determine more information from the given
    # offset and length.
    protected attr_reader :source

    # The byte offset from the beginning of the source where this location
    # starts.
    attr_reader :start_offset

    # The length of this location in bytes.
    attr_reader :length

    # The list of comments attached to this location
    attr_reader :comments

    def initialize(source, start_offset, length)
      @source = source
      @start_offset = start_offset
      @length = length
      @comments = []
    end

    # Create a new location object with the given options.
    def copy(**options)
      Location.new(
        options.fetch(:source) { source },
        options.fetch(:start_offset) { start_offset },
        options.fetch(:length) { length }
      )
    end

    # Returns a string representation of this location.
    def inspect
      "#<YARP::Location @start_offset=#{@start_offset} @length=#{@length} start_line=#{start_line}>"
    end

    # The source code that this location represents.
    def slice
      source.slice(start_offset, length)
    end

    # The byte offset from the beginning of the source where this location ends.
    def end_offset
      start_offset + length
    end

    # The line number where this location starts.
    def start_line
      source.line(start_offset)
    end

    # The content of the line where this location starts before this location.
    def start_line_slice
      offset = source.line_offset(start_offset)
      source.slice(offset, start_offset - offset)
    end

    # The line number where this location ends.
    def end_line
      source.line(end_offset - 1)
    end

    # The column number in bytes where this location starts from the start of
    # the line.
    def start_column
      source.column(start_offset)
    end

    # The column number in bytes where this location ends from the start of the
    # line.
    def end_column
      source.column(end_offset)
    end

    def deconstruct_keys(keys)
      { start_offset: start_offset, end_offset: end_offset }
    end

    def pretty_print(q)
      q.text("(#{start_line},#{start_column})-(#{end_line},#{end_column}))")
    end

    # Locations are value-compared by their byte range only; the source is
    # deliberately not part of the comparison.
    def ==(other)
      other.is_a?(Location) &&
        other.start_offset == start_offset &&
        other.end_offset == end_offset
    end

    # Returns a new location that stretches from this location to the given
    # other location. Raises an error if this location is not before the other
    # location or if they don't share the same source.
    def join(other)
      raise "Incompatible sources" if source != other.source
      raise "Incompatible locations" if start_offset > other.start_offset

      Location.new(source, start_offset, other.end_offset - start_offset)
    end

    # A null location for when no real location is available. The previous
    # implementation called new(0, 0), which always raised an ArgumentError
    # because #initialize takes three arguments (source, start_offset,
    # length); pass a nil source and a zero-width span instead.
    def self.null
      new(nil, 0, 0)
    end
  end
end

# This represents a comment that was encountered during parsing.
# This represents a comment that was encountered during parsing.
# (Nested inside module YARP in the original tree.)
class Comment
  TYPES = %i[inline embdoc __END__]

  attr_reader :type, :location

  def initialize(type, location)
    @type = type
    @location = location
  end

  def deconstruct_keys(keys) = { type: type, location: location }

  # Returns true if the comment happens on the same line as other code and false if the comment is by itself
  def trailing?
    return false unless type == :inline

    !location.start_line_slice.strip.empty?
  end

  def inspect
    "#<YARP::Comment @type=#{@type.inspect} @location=#{@location.inspect}>"
  end
end

# This represents an error that was encountered during parsing.
class ParseError
  attr_reader :message, :location

  def initialize(message, location)
    @message = message
    @location = location
  end

  def deconstruct_keys(keys) = { message: message, location: location }

  def inspect
    "#<YARP::ParseError @message=#{@message.inspect} @location=#{@location.inspect}>"
  end
end

# This represents a warning that was encountered during parsing.
class ParseWarning
  attr_reader :message, :location

  def initialize(message, location)
    @message = message
    @location = location
  end

  def deconstruct_keys(keys) = { message: message, location: location }

  def inspect
    "#<YARP::ParseWarning @message=#{@message.inspect} @location=#{@location.inspect}>"
  end
end

# This represents the result of a call to ::parse or ::parse_file. It contains
# the AST, any comments that were encounters, and any errors that were
# encountered.
class ParseResult
  attr_reader :value, :comments, :errors, :warnings, :source

  def initialize(value, comments, errors, warnings, source)
    @value = value
    @comments = comments
    @errors = errors
    @warnings = warnings
    @source = source
  end

  def deconstruct_keys(keys)
    { value: value, comments: comments, errors: errors, warnings: warnings }
  end

  # A parse is considered successful when no errors were produced.
  def success? = errors.empty?

  def failure? = !success?
end

# This represents a token from the Ruby source.
class Token
  attr_reader :type, :value, :location

  def initialize(type, value, location)
    @type = type
    @value = value
    @location = location
  end

  def deconstruct_keys(keys) = { type: type, value: value, location: location }

  def pretty_print(q)
    q.group do
      q.text(type.to_s)
      location.pretty_print(q)
      q.text("(")
      q.nest(2) do
        q.breakable("")
        q.pp(value)
      end
      q.breakable("")
      q.text(")")
    end
  end

  # Tokens are value-compared by type and value; the location is
  # deliberately not part of the comparison.
  def ==(other)
    return false unless other.is_a?(Token)

    type == other.type && value == other.value
  end
end

# When we've parsed the source, we have both the syntax tree and the list of
# comments that we found in the source. ParseResult::Comments (opened in the
# following section of the file) walks the tree and finds the nearest
# location to attach each comment, favoring preceding locations for trailing
# comments and following locations otherwise.
+ class NodeTarget + attr_reader :node + + def initialize(node) + @node = node + end + + def start_offset + node.location.start_offset + end + + def end_offset + node.location.end_offset + end + + def encloses?(comment) + start_offset <= comment.location.start_offset && + comment.location.end_offset <= end_offset + end + + def <<(comment) + node.location.comments << comment + end + end + + # A target for attaching comments that is based on a location field on a + # node. For example, the `end` token of a ClassNode. + class LocationTarget + attr_reader :location + + def initialize(location) + @location = location + end + + def start_offset + location.start_offset + end + + def end_offset + location.end_offset + end + + def encloses?(comment) + false + end + + def <<(comment) + location.comments << comment + end + end + + attr_reader :parse_result + + def initialize(parse_result) + @parse_result = parse_result + end + + def attach! + parse_result.comments.each do |comment| + preceding, enclosing, following = nearest_targets(parse_result.value, comment) + target = + if comment.trailing? + preceding || following || enclosing || NodeTarget.new(parse_result.value) + else + # If a comment exists on its own line, prefer a leading comment. + following || preceding || enclosing || NodeTarget.new(parse_result.value) + end + + target << comment + end + end + + private + + # Responsible for finding the nearest targets to the given comment within + # the context of the given encapsulating node. 
+ def nearest_targets(node, comment) + comment_start = comment.location.start_offset + comment_end = comment.location.end_offset + + targets = [] + node.comment_targets.map do |value| + case value + when StatementsNode + targets.concat(value.body.map { |node| NodeTarget.new(node) }) + when Node + targets << NodeTarget.new(value) + when Location + targets << LocationTarget.new(value) + end + end + + targets.sort_by!(&:start_offset) + preceding = nil + following = nil + + left = 0 + right = targets.length + + # This is a custom binary search that finds the nearest nodes to the + # given comment. When it finds a node that completely encapsulates the + # comment, it recurses downward into the tree. + while left < right + middle = (left + right) / 2 + target = targets[middle] + + target_start = target.start_offset + target_end = target.end_offset + + if target.encloses?(comment) + # The comment is completely contained by this target. Abandon the + # binary search at this level. + return nearest_targets(target.node, comment) + end + + if target_end <= comment_start + # This target falls completely before the comment. Because we will + # never consider this target or any targets before it again, this + # target must be the closest preceding target we have encountered so + # far. + preceding = target + left = middle + 1 + next + end + + if comment_end <= target_start + # This target falls completely after the comment. Because we will + # never consider this target or any targets after it again, this + # target must be the closest following target we have encountered so + # far. + following = target + right = middle + next + end + + # This should only happen if there is a bug in this parser. + raise "Comment location overlaps with a target location" + end + + [preceding, NodeTarget.new(node), following] + end + end + + private_constant :Comments + + # Attach the list of comments to their respective locations in the tree. + def attach_comments! + Comments.new(self).attach! 
+ end
+ end
+end
diff --git a/lib/prism/parse_result/newlines.rb b/lib/prism/parse_result/newlines.rb
new file mode 100644
index 0000000000..d16600afd0
--- /dev/null
+++ b/lib/prism/parse_result/newlines.rb
@@ -0,0 +1,60 @@
+# frozen_string_literal: true
+
+module YARP
+ class ParseResult
+ # The :line tracepoint event gets fired whenever the Ruby VM encounters an
+ # expression on a new line. The types of expressions that can trigger this
+ # event are:
+ #
+ # * if statements
+ # * unless statements
+ # * nodes that are children of statements lists
+ #
+ # In order to keep track of the newlines, we have a list of offsets that
+ # come back from the parser. We assign these offsets to the first nodes that
+ # we find in the tree that are on those lines.
+ #
+ # Note that the logic in this file should be kept in sync with the Java
+ # MarkNewlinesVisitor, since that visitor is responsible for marking the
+ # newlines for JRuby/TruffleRuby.
+ class Newlines < Visitor
+ # newline_marked is an array of booleans indexed by line number; true
+ # means a node on that line has already claimed the newline flag.
+ def initialize(newline_marked)
+ @newline_marked = newline_marked
+ end
+
+ # Blocks get a fresh, all-false set of marks while their bodies are
+ # visited, so a line inside a block can be flagged even if that line
+ # number was already marked outside of the block.
+ def visit_block_node(node)
+ old_newline_marked = @newline_marked
+ @newline_marked = Array.new(old_newline_marked.size, false)
+
+ begin
+ super(node)
+ ensure
+ # Always restore the enclosing scope's marks, even if a visit raises.
+ @newline_marked = old_newline_marked
+ end
+ end
+
+ alias_method :visit_lambda_node, :visit_block_node
+
+ def visit_if_node(node)
+ node.set_newline_flag(@newline_marked)
+ super(node)
+ end
+
+ alias_method :visit_unless_node, :visit_if_node
+
+ # Every direct child of a statements list is a candidate for a newline
+ # flag on its line.
+ def visit_statements_node(node)
+ node.body.each do |child|
+ child.set_newline_flag(@newline_marked)
+ end
+ super(node)
+ end
+ end
+
+ private_constant :Newlines
+
+ # Walk the tree and mark nodes that are on a new line.
+ def mark_newlines!
+ # Lines are 1-indexed, so allocate offsets.size + 1 slots, all false.
+ value.accept(Newlines.new(Array.new(1 + source.offsets.size, false)))
+ end
+ end
+end
diff --git a/lib/prism/pattern.rb b/lib/prism/pattern.rb
new file mode 100644
index 0000000000..f7519137e4
--- /dev/null
+++ b/lib/prism/pattern.rb
@@ -0,0 +1,239 @@
+# frozen_string_literal: true
+
+module YARP
+ # A pattern is an object that wraps a Ruby pattern matching expression. The
+ # expression would normally be passed to an `in` clause within a `case`
+ # expression or a rightward assignment expression. For example, in the
+ # following snippet:
+ #
+ # case node
+ # in ConstantPathNode[ConstantReadNode[name: :YARP], ConstantReadNode[name: :Pattern]]
+ # end
+ #
+ # the pattern is the `ConstantPathNode[...]` expression.
+ #
+ # The pattern gets compiled into an object that responds to #call by running
+ # the #compile method. This method itself will run back through YARP to
+ # parse the expression into a tree, then walk the tree to generate the
+ # necessary callable objects. For example, if you wanted to compile the
+ # expression above into a callable, you would:
+ #
+ # callable = YARP::Pattern.new("ConstantPathNode[ConstantReadNode[name: :YARP], ConstantReadNode[name: :Pattern]]").compile
+ # callable.call(node)
+ #
+ # The callable object returned by #compile is guaranteed to respond to #call
+ # with a single argument, which is the node to match against. It also is
+ # guaranteed to respond to #===, which means it itself can be used in a `case`
+ # expression, as in:
+ #
+ # case node
+ # when callable
+ # end
+ #
+ # If the query given to the initializer cannot be compiled into a valid
+ # matcher (either because of a syntax error or because it is using syntax we
+ # do not yet support) then a YARP::Pattern::CompilationError will be
+ # raised.
+ class Pattern
+ # Raised when the query given to a pattern is either invalid Ruby syntax or
+ # is using syntax that we don't yet support.
+ class CompilationError < StandardError
+ # repr is the #inspect output of the unsupported node.
+ # NOTE(review): "It failed on to understand" below reads like a typo for
+ # "It failed to understand" — this is runtime output, so it is left
+ # untouched here; confirm and fix at the source.
+ def initialize(repr)
+ super(<<~ERROR)
+ YARP was unable to compile the pattern you provided into a usable
+ expression. It failed on to understand the node represented by:
+
+ #{repr}
+
+ Note that not all syntax supported by Ruby's pattern matching syntax
+ is also supported by YARP's patterns. If you're using some syntax
+ that you believe should be supported, please open an issue on
+ GitHub at https://github.com/ruby/yarp/issues/new.
+ ERROR
+ end
+ end
+
+ attr_reader :query
+
+ def initialize(query)
+ @query = query
+ @compiled = nil
+ end
+
+ # Compile the query into a callable by parsing it inside a dummy
+ # `case nil\nin ...\nend` expression and walking the resulting pattern.
+ # NOTE(review): result.errors is never checked here, so a query with a
+ # syntax error will raise NoMethodError while digging into the tree
+ # rather than CompilationError — confirm whether that is intended.
+ def compile
+ result = YARP.parse("case nil\nin #{query}\nend")
+ compile_node(result.value.statements.body.last.conditions.last.pattern)
+ end
+
+ # Breadth-first search of the tree rooted at root, yielding every node
+ # that matches the compiled pattern. Returns an Enumerator when no block
+ # is given.
+ def scan(root)
+ return to_enum(__method__, root) unless block_given?
+
+ @compiled ||= compile
+ queue = [root]
+
+ while (node = queue.shift)
+ yield node if @compiled.call(node)
+ queue.concat(node.compact_child_nodes)
+ end
+ end
+
+ private
+
+ # Shortcut for combining two procs into one that returns true if both return
+ # true.
+ def combine_and(left, right)
+ ->(other) { left.call(other) && right.call(other) }
+ end
+
+ # Shortcut for combining two procs into one that returns true if either
+ # returns true.
+ def combine_or(left, right)
+ ->(other) { left.call(other) || right.call(other) }
+ end
+
+ # Raise an error because the given node is not supported.
+ def compile_error(node)
+ raise CompilationError, node.inspect
+ end
+
+ # in [foo, bar, baz]
+ # Splats (rest) and post-splat elements are not supported.
+ def compile_array_pattern_node(node)
+ compile_error(node) if !node.rest.nil? || node.posts.any?
+
+ constant = node.constant
+ compiled_constant = compile_node(constant) if constant
+
+ preprocessed = node.requireds.map { |required| compile_node(required) }
+
+ # Matches when the deconstructed value has exactly as many elements as
+ # required matchers and every element satisfies its matcher.
+ compiled_requireds = ->(other) do
+ deconstructed = other.deconstruct
+
+ deconstructed.length == preprocessed.length &&
+ preprocessed
+ .zip(deconstructed)
+ .all?
 { |(matcher, value)| matcher.call(value) }
+ end
+
+ if compiled_constant
+ combine_and(compiled_constant, compiled_requireds)
+ else
+ compiled_requireds
+ end
+ end
+
+ # in foo | bar
+ def compile_alternation_pattern_node(node)
+ combine_or(compile_node(node.left), compile_node(node.right))
+ end
+
+ # in YARP::ConstantReadNode
+ # Only constant paths whose parent is the literal YARP namespace are
+ # supported; anything else is a compilation error.
+ def compile_constant_path_node(node)
+ parent = node.parent
+
+ if parent.is_a?(ConstantReadNode) && parent.slice == "YARP"
+ compile_node(node.child)
+ else
+ compile_error(node)
+ end
+ end
+
+ # in ConstantReadNode
+ # in String
+ # Looks the constant up in YARP's namespace first, then the global
+ # namespace. The second argument of false keeps ancestors out of the
+ # const_defined? lookup.
+ def compile_constant_read_node(node)
+ value = node.slice
+
+ if YARP.const_defined?(value, false)
+ clazz = YARP.const_get(value)
+
+ ->(other) { clazz === other }
+ elsif Object.const_defined?(value, false)
+ clazz = Object.const_get(value)
+
+ ->(other) { clazz === other }
+ else
+ compile_error(node)
+ end
+ end
+
+ # in InstanceVariableReadNode[name: Symbol]
+ # in { name: Symbol }
+ # Keyword-rest (**) is not supported. Keys are symbolized from their
+ # unescaped source text.
+ def compile_hash_pattern_node(node)
+ compile_error(node) unless node.kwrest.nil?
+ compiled_constant = compile_node(node.constant) if node.constant
+
+ preprocessed =
+ node.assocs.to_h do |assoc|
+ [assoc.key.unescaped.to_sym, compile_node(assoc.value)]
+ end
+
+ # Matches when every listed key is present in the deconstructed hash
+ # and its value satisfies the corresponding matcher.
+ compiled_keywords = ->(other) do
+ deconstructed = other.deconstruct_keys(preprocessed.keys)
+
+ preprocessed.all? do |keyword, matcher|
+ deconstructed.key?(keyword) && matcher.call(deconstructed[keyword])
+ end
+ end
+
+ if compiled_constant
+ combine_and(compiled_constant, compiled_keywords)
+ else
+ compiled_keywords
+ end
+ end
+
+ # in nil
+ def compile_nil_node(node)
+ ->(attribute) { attribute.nil?
 }
+ end
+
+ # in /foo/
+ # The regexp flags are taken from whatever follows the closing delimiter
+ # in the source (node.closing[1..]).
+ def compile_regular_expression_node(node)
+ regexp = Regexp.new(node.unescaped, node.closing[1..])
+
+ ->(attribute) { regexp === attribute }
+ end
+
+ # in ""
+ # in "foo"
+ def compile_string_node(node)
+ string = node.unescaped
+
+ ->(attribute) { string === attribute }
+ end
+
+ # in :+
+ # in :foo
+ def compile_symbol_node(node)
+ symbol = node.unescaped.to_sym
+
+ ->(attribute) { symbol === attribute }
+ end
+
+ # Compile any kind of node. Dispatch out to the individual compilation
+ # methods based on the type of node. Unsupported node types raise
+ # CompilationError via compile_error.
+ def compile_node(node)
+ case node
+ when AlternationPatternNode
+ compile_alternation_pattern_node(node)
+ when ArrayPatternNode
+ compile_array_pattern_node(node)
+ when ConstantPathNode
+ compile_constant_path_node(node)
+ when ConstantReadNode
+ compile_constant_read_node(node)
+ when HashPatternNode
+ compile_hash_pattern_node(node)
+ when NilNode
+ compile_nil_node(node)
+ when RegularExpressionNode
+ compile_regular_expression_node(node)
+ when StringNode
+ compile_string_node(node)
+ when SymbolNode
+ compile_symbol_node(node)
+ else
+ compile_error(node)
+ end
+ end
+ end
+end
diff --git a/lib/prism/prism.gemspec b/lib/prism/prism.gemspec
new file mode 100644
index 0000000000..d1a7bbbbcf
--- /dev/null
+++ b/lib/prism/prism.gemspec
@@ -0,0 +1,113 @@
+# frozen_string_literal: true
+
+Gem::Specification.new do |spec|
+ spec.name = "yarp"
+ # NOTE(review): this commit only renames file paths, so the gem is still
+ # published as "yarp" at 0.12.0 — but lib/prism/version.rb in this same
+ # change declares VERSION = "0.8.0". Confirm which is authoritative.
+ spec.version = "0.12.0"
+ spec.authors = ["Shopify"]
+ spec.email = ["ruby@shopify.com"]
+
+ spec.summary = "Yet Another Ruby Parser"
+ spec.homepage = "https://github.com/ruby/yarp"
+ spec.license = "MIT"
+
+ spec.required_ruby_version = ">= 3.0.0"
+
+ spec.require_paths = ["lib"]
+ spec.files = [
+ "CHANGELOG.md",
+ "CODE_OF_CONDUCT.md",
+ "CONTRIBUTING.md",
+ "LICENSE.md",
+ "Makefile",
+ "README.md",
+ "config.yml",
+ # Documentation shipped with the gem.
+ "docs/build_system.md",
+ "docs/building.md",
+ "docs/configuration.md",
+ "docs/design.md",
+ "docs/encoding.md",
+ "docs/fuzzing.md",
+ 
"docs/heredocs.md",
+ "docs/mapping.md",
+ "docs/ripper.md",
+ "docs/ruby_api.md",
+ "docs/serialization.md",
+ "docs/testing.md",
+ # C extension entry points.
+ "ext/yarp/api_node.c",
+ "ext/yarp/api_pack.c",
+ "ext/yarp/extension.c",
+ "ext/yarp/extension.h",
+ # Public C headers.
+ "include/yarp.h",
+ "include/yarp/ast.h",
+ "include/yarp/defines.h",
+ "include/yarp/diagnostic.h",
+ "include/yarp/enc/yp_encoding.h",
+ "include/yarp/node.h",
+ "include/yarp/pack.h",
+ "include/yarp/parser.h",
+ "include/yarp/regexp.h",
+ "include/yarp/unescape.h",
+ "include/yarp/util/yp_buffer.h",
+ "include/yarp/util/yp_char.h",
+ "include/yarp/util/yp_constant_pool.h",
+ "include/yarp/util/yp_list.h",
+ "include/yarp/util/yp_memchr.h",
+ "include/yarp/util/yp_newline_list.h",
+ "include/yarp/util/yp_state_stack.h",
+ "include/yarp/util/yp_string.h",
+ "include/yarp/util/yp_string_list.h",
+ "include/yarp/util/yp_strpbrk.h",
+ "include/yarp/version.h",
+ # Ruby library files.
+ "lib/yarp.rb",
+ "lib/yarp/compiler.rb",
+ "lib/yarp/debug.rb",
+ "lib/yarp/desugar_compiler.rb",
+ "lib/yarp/dispatcher.rb",
+ "lib/yarp/dsl.rb",
+ "lib/yarp/ffi.rb",
+ "lib/yarp/lex_compat.rb",
+ "lib/yarp/mutation_compiler.rb",
+ "lib/yarp/node.rb",
+ "lib/yarp/node_ext.rb",
+ "lib/yarp/node_inspector.rb",
+ "lib/yarp/pack.rb",
+ "lib/yarp/parse_result.rb",
+ "lib/yarp/pattern.rb",
+ "lib/yarp/ripper_compat.rb",
+ "lib/yarp/serialize.rb",
+ "lib/yarp/parse_result/comments.rb",
+ "lib/yarp/parse_result/newlines.rb",
+ "lib/yarp/visitor.rb",
+ # Core C implementation.
+ "src/diagnostic.c",
+ "src/enc/yp_big5.c",
+ "src/enc/yp_euc_jp.c",
+ "src/enc/yp_gbk.c",
+ "src/enc/yp_shift_jis.c",
+ "src/enc/yp_tables.c",
+ "src/enc/yp_unicode.c",
+ "src/enc/yp_windows_31j.c",
+ "src/node.c",
+ "src/pack.c",
+ "src/prettyprint.c",
+ "src/regexp.c",
+ "src/serialize.c",
+ "src/token_type.c",
+ "src/unescape.c",
+ "src/util/yp_buffer.c",
+ "src/util/yp_char.c",
+ "src/util/yp_constant_pool.c",
+ "src/util/yp_list.c",
+ "src/util/yp_memchr.c",
+ "src/util/yp_newline_list.c",
+ "src/util/yp_state_stack.c",
+ 
"src/util/yp_string.c",
+ "src/util/yp_string_list.c",
+ "src/util/yp_strncasecmp.c",
+ "src/util/yp_strpbrk.c",
+ "src/yarp.c",
+ "yarp.gemspec",
+ ]
+
+ spec.extensions = ["ext/yarp/extconf.rb"]
+ spec.metadata["allowed_push_host"] = "https://rubygems.org"
+end
diff --git a/lib/prism/ripper_compat.rb b/lib/prism/ripper_compat.rb
new file mode 100644
index 0000000000..c76f3fd07a
--- /dev/null
+++ b/lib/prism/ripper_compat.rb
@@ -0,0 +1,174 @@
+# frozen_string_literal: true
+
+require "ripper"
+
+module YARP
+ # This class is meant to provide a compatibility layer between YARP and
+ # Ripper. It functions by parsing the entire tree first and then walking it
+ # and executing each of the Ripper callbacks as it goes.
+ #
+ # This class is going to necessarily be slower than the native Ripper API. It
+ # is meant as a stopgap until developers migrate to using YARP. It is also
+ # meant as a test harness for the YARP parser.
+ class RipperCompat
+ # This class mirrors the ::Ripper::SexpBuilder subclass of ::Ripper that
+ # returns the arrays of [type, *children].
+ class SexpBuilder < RipperCompat
+ private
+
+ # Handlers defined below via define_method pick up the private
+ # visibility declared above; they replace the no-op _dispatch aliases
+ # installed at the bottom of RipperCompat.
+ Ripper::PARSER_EVENTS.each do |event|
+ define_method(:"on_#{event}") do |*args|
+ [event, *args]
+ end
+ end
+
+ # Scanner events carry the token value plus its [lineno, column]
+ # position, mirroring Ripper's [:@event, value, [lineno, column]].
+ Ripper::SCANNER_EVENTS.each do |event|
+ define_method(:"on_#{event}") do |value|
+ [:"@#{event}", value, [lineno, column]]
+ end
+ end
+ end
+
+ # This class mirrors the ::Ripper::SexpBuilderPP subclass of ::Ripper that
+ # returns the same values as ::Ripper::SexpBuilder except with a couple of
+ # niceties that flatten linked lists into arrays.
+ class SexpBuilderPP < SexpBuilder
+ private
+
+ # Starts a fresh array where SexpBuilder would start a linked list
+ # (the xxx_new family of events).
+ def _dispatch_event_new
+ []
+ end
+
+ # Appends to the array where SexpBuilder would chain a linked list
+ # (the xxx_add family of events).
+ def _dispatch_event_push(list, item)
+ list << item
+ list
+ end
+
+ # Rewire every zero-arity *_new event and every *_add event to the
+ # flattening helpers above.
+ Ripper::PARSER_EVENT_TABLE.each do |event, arity|
+ case event
+ when /_new\z/
+ alias_method :"on_#{event}", :_dispatch_event_new if arity == 0
+ when /_add\z/
+ alias_method :"on_#{event}", :_dispatch_event_push
+ end
+ end
+ end
+
+ attr_reader :source, :lineno, :column
+
+ def initialize(source)
+ @source = source
+ @result = nil
+ @lineno = nil
+ @column = nil
+ end
+
+ ############################################################################
+ # Public interface
+ ############################################################################
+
+ def error?
+ result.errors.any?
+ end
+
+ # Parse the source and walk the tree, firing the on_* callbacks as we go.
+ # Does nothing (returns nil) when the source has errors.
+ def parse
+ result.value.accept(self) unless error?
+ end
+
+ ############################################################################
+ # Visitor methods
+ ############################################################################
+
+ def visit(node)
+ node&.accept(self)
+ end
+
+ # Only paren-less calls with exactly one argument (i.e. binary operator
+ # expressions like `1 + 2`) are handled; everything else raises
+ # NotImplementedError.
+ # NOTE(review): when a call has no arguments at all, node.arguments is
+ # nil and `node.arguments.arguments` raises NoMethodError before the
+ # NotImplementedError fallback is reached — confirm and guard with &.
+ def visit_call_node(node)
+ if !node.opening_loc && node.arguments.arguments.length == 1
+ bounds(node.receiver.location)
+ left = visit(node.receiver)
+
+ bounds(node.arguments.arguments.first.location)
+ right = visit(node.arguments.arguments.first)
+
+ on_binary(left, source[node.message_loc.start_offset...node.message_loc.end_offset].to_sym, right)
+ else
+ raise NotImplementedError
+ end
+ end
+
+ # Integers are re-sliced from the raw source so formatting (underscores,
+ # radix prefixes) is preserved, as Ripper does.
+ def visit_integer_node(node)
+ bounds(node.location)
+ on_int(source[node.location.start_offset...node.location.end_offset])
+ end
+
+ # Rebuilds Ripper's stmts_new/stmts_add chain from the statements list.
+ def visit_statements_node(node)
+ bounds(node.location)
+ node.body.inject(on_stmts_new) do |stmts, stmt|
+ on_stmts_add(stmts, visit(stmt))
+ end
+ end
+
+ def visit_token(node)
+ bounds(node.location)
+
+ case node.type
+ when :MINUS
+ on_op(node.value)
+ when :PLUS
+ on_op(node.value)
+ else
+ raise NotImplementedError, "Unknown token: #{node.type}"
+ end
+ end
+
+ def visit_program_node(node)
+ 
bounds(node.location)
+ on_program(visit(node.statements))
+ end
+
+ ############################################################################
+ # Entrypoints for subclasses
+ ############################################################################
+
+ # This is a convenience method that runs the SexpBuilder subclass parser.
+ def self.sexp_raw(source)
+ SexpBuilder.new(source).parse
+ end
+
+ # This is a convenience method that runs the SexpBuilderPP subclass parser.
+ def self.sexp(source)
+ SexpBuilderPP.new(source).parse
+ end
+
+ private
+
+ # This method is responsible for updating lineno and column information
+ # to reflect the current node.
+ #
+ # This method could be drastically improved with some caching on the start
+ # of every line, but for now it's good enough.
+ # NOTE(review): lineno counts the newlines in source[0..start_offset]
+ # (inclusive of the character at start_offset), and column is measured
+ # from the preceding "\n" character itself rather than the character
+ # after it — so columns on line 1 come out 0-based while columns on
+ # later lines come out 1-based. Confirm against Ripper, which reports
+ # 0-based columns everywhere.
+ def bounds(location)
+ start_offset = location.start_offset
+
+ @lineno = source[0..start_offset].count("\n") + 1
+ @column = start_offset - (source.rindex("\n", start_offset) || 0)
+ end
+
+ # Memoized parse of the source; parsing happens at most once per instance.
+ def result
+ @result ||= YARP.parse(source)
+ end
+
+ # Default no-op handlers, one per arity; every scanner and parser event
+ # is aliased to the stub matching its arity below, so subclasses only
+ # override the events they care about.
+ def _dispatch0; end
+ def _dispatch1(_); end
+ def _dispatch2(_, _); end
+ def _dispatch3(_, _, _); end
+ def _dispatch4(_, _, _, _); end
+ def _dispatch5(_, _, _, _, _); end
+ def _dispatch7(_, _, _, _, _, _, _); end
+
+ (Ripper::SCANNER_EVENT_TABLE.merge(Ripper::PARSER_EVENT_TABLE)).each do |event, arity|
+ alias_method :"on_#{event}", :"_dispatch#{arity}"
+ end
+ end
+end
diff --git a/lib/prism/version.rb b/lib/prism/version.rb
new file mode 100644
index 0000000000..e450bfb526
--- /dev/null
+++ b/lib/prism/version.rb
@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+
+module YARP
+ # NOTE(review): this disagrees with spec.version = "0.12.0" in
+ # lib/prism/prism.gemspec from this same change — confirm which is right.
+ VERSION = "0.8.0"
+end
|