1 files changed, 306 insertions, 0 deletions
diff --git a/lib/syntax_suggest/clean_document.rb b/lib/syntax_suggest/clean_document.rb
new file mode 100644
index 0000000000..2790ccae86
--- /dev/null
+++ b/lib/syntax_suggest/clean_document.rb
@@ -0,0 +1,306 @@
+# frozen_string_literal: true
+
+module SyntaxSuggest
+  # Parses and sanitizes source into a lexically aware document
+  #
+  # Internally the document is represented by an array with each
+  # index containing a CodeLine correlating to a line from the source code.
+  #
+  # There are three main phases in the algorithm:
+  #
+  # 1. Sanitize/format input source
+  # 2. Search for invalid blocks
+  # 3. Format invalid blocks into something meaningful
+  #
+  # This class handles the first part.
+  #
+  # The reason this class exists is to format input source
+  # for better/easier/cleaner exploration.
+  #
+  # The CodeSearch class operates at the line level so
+  # we must be careful to not introduce lines that look
+  # valid by themselves, but when removed will trigger syntax errors
+  # or strange behavior.
+  #
+  # ## Join Trailing slashes
+  #
+  # Code with a trailing slash (`\`) is logically treated as a single line:
+  #
+  #     1 it "code can be split" \
+  #     2    "across multiple lines" do
+  #
+  # In this case removing line 2 would add a syntax error. We get around
+  # this by internally joining the two lines into a single "line" object
+  #
+  # ## Logically Consecutive lines
+  #
+  # Code such as a chain of method calls can be
+  # broken over multiple lines:
+  #
+  #     1 User.
+  #     2   where(name: "schneems").
+  #     3   first
+  #
+  # Removing line 2 can introduce a syntax error. To fix this, all lines
+  # are joined into one.
+  #
+  # ## Heredocs
+  #
+  # A heredoc is a way of defining a multi-line string. They can cause many
+  # problems. If left as a single line, the parser would try to parse the contents
+  # as ruby code rather than as a string. Even without this problem, we still
+  # hit an issue with indentation:
+  #
+  #    1 foo = <<~HEREDOC
+  #    2  "Be yourself; everyone else is already taken."
+  #    3    ― Oscar Wilde
+  #    4      puts "I look like ruby code" # but i'm still a heredoc
+  #    5 HEREDOC
+  #
+  # If we didn't join these lines then our algorithm would think that line 4
+  # is separate from the rest, has a higher indentation, then look at it first
+  # and remove it.
+  #
+  # If the code evaluates line 5 by itself it will think line 5 is a constant,
+  # remove it, and introduce a syntax error.
+  #
+  # All of these problems are fixed by joining the whole heredoc into a single
+  # line.
+  #
+  # ## Comments and whitespace
+  #
+  # Comments can throw off the way the lexer tells us that the line
+  # logically belongs with the next line. This is valid ruby but
+  # results in a different lex output than before:
+  #
+  #     1 User.
+  #     2   where(name: "schneems").
+  #     3   # Comment here
+  #     4   first
+  #
+  # To handle this we can replace comment lines with empty lines
+  # and then re-lex the source. This removal and re-lexing preserves
+  # line index and document size, but generates an easier to work with
+  # document.
+  #
+  class CleanDocument
+    # Replaces comment-only lines in source (see #clean_sweep), then
+    # builds the document with CodeLine.from_source
+    def initialize(source:)
+      lines = clean_sweep(source: source)
+      @document = CodeLine.from_source(lines.join, lines: lines)
+    end
+
+    # Call all of the document "cleaners" (trailing slashes, then
+    # consecutive lines, then heredocs) and return self
+    def call
+      join_trailing_slash!
+      join_consecutive!
+      join_heredoc!
+
+      self
+    end
+
+    # Return an array of CodeLines in the
+    # document
+    def lines
+      @document
+    end
+
+    # Renders the document back to a string
+    def to_s
+      @document.join
+    end
+
+    # Remove comments
+    #
+    # Every comment-only line is replaced with an empty newline:
+    #
+    #     source = <<~'EOM'
+    #       # Comment 1
+    #       puts "hello"
+    #       # Comment 2
+    #       puts "world"
+    #     EOM
+    #
+    #     lines = CleanDocument.new(source: source).lines
+    #     expect(lines[0].to_s).to eq("\n")
+    #     expect(lines[1].to_s).to eq(%(puts "hello"\n))
+    #     expect(lines[2].to_s).to eq("\n")
+    #     expect(lines[3].to_s).to eq(%(puts "world"\n))
+    #
+    # Important: This must be done before lexing.
+    #
+    # After this change is made, we lex the document because
+    # removing comments can change how the doc is parsed.
+    #
+    # For example:
+    #
+    #     values = LexAll.new(source: <<~EOM)
+    #       User.
+    #         # comment
+    #         where(name: 'schneems')
+    #     EOM
+    #     expect(
+    #       values.count {|v| v.type == :on_ignored_nl}
+    #     ).to eq(1)
+    #
+    # After the comment is removed:
+    #
+    #     values = LexAll.new(source: <<~EOM)
+    #       User.
+    #
+    #         where(name: 'schneems')
+    #     EOM
+    #     expect(
+    #       values.count {|v| v.type == :on_ignored_nl}
+    #     ).to eq(2)
+    #
+    # Returns an array with one entry per line of source
+    def clean_sweep(source:)
+      # Match comments, but not HEREDOC strings with #{variable} interpolation
+      # https://rubular.com/r/HPwtW9OYxKUHXQ
+      #
+      # $/ is Ruby's input record separator ("\n" unless it was reassigned)
+      source.lines.map do |line|
+        line.match?(/^\s*#([^{].*|)$/) ? $/ : line
+      end
+    end
+
+    # Smushes all heredoc lines into one line and returns self
+    #
+    #     source = <<~'EOM'
+    #       foo = <<~HEREDOC
+    #          lol
+    #          hehehe
+    #       HEREDOC
+    #     EOM
+    #
+    #     lines = CleanDocument.new(source: source).join_heredoc!.lines
+    #     expect(lines[0].to_s).to eq(source)
+    #     expect(lines[1].to_s).to eq("")
+    def join_heredoc!
+      start_index_stack = []
+      heredoc_beg_end_index = []
+      lines.each do |line|
+        line.lex.each do |lex_value|
+          case lex_value.type
+          when :on_heredoc_beg
+            start_index_stack << line.index
+          when :on_heredoc_end
+            start_index = start_index_stack.pop
+            # An end with no recorded beginning would build a beginless
+            # range below and join everything from line 0; skip it
+            next if start_index.nil?
+            heredoc_beg_end_index << [start_index, line.index]
+          end
+        end
+      end
+
+      heredoc_groups = heredoc_beg_end_index.map { |beg_index, end_index| @document[beg_index..end_index] }
+      join_groups(heredoc_groups)
+      self
+    end
+
+    # Smushes logically "consecutive" lines
+    #
+    #     source = <<~'EOM'
+    #       User.
+    #         where(name: 'schneems').
+    #         first
+    #     EOM
+    #
+    #     lines = CleanDocument.new(source: source).join_consecutive!.lines
+    #     expect(lines[0].to_s).to eq(source)
+    #     expect(lines[1].to_s).to eq("")
+    #
+    # The one known case this doesn't handle is:
+    #
+    #     Ripper.lex <<~EOM
+    #       a &&
+    #        b ||
+    #        c
+    #     EOM
+    #
+    # For some reason this introduces `on_ignored_nl` but with BEG type
+    #
+    def join_consecutive!
+      consecutive_groups = @document.select(&:ignore_newline_not_beg?).map do |code_line|
+        take_while_including(code_line.index..) { |line| line.ignore_newline_not_beg? }
+      end
+
+      join_groups(consecutive_groups)
+      self
+    end
+
+    # Join lines with a trailing slash (a `\` at the end of the line)
+    #
+    #     source = <<~'EOM'
+    #       it "code can be split" \
+    #          "across multiple lines" do
+    #     EOM
+    #
+    #     lines = CleanDocument.new(source: source).join_trailing_slash!.lines
+    #     expect(lines[0].to_s).to eq(source)
+    #     expect(lines[1].to_s).to eq("")
+    def join_trailing_slash!
+      trailing_groups = @document.select(&:trailing_slash?).map do |code_line|
+        take_while_including(code_line.index..) { |line| line.trailing_slash? }
+      end
+      join_groups(trailing_groups)
+      self
+    end
+
+    # Helper method for joining "groups" of lines
+    #
+    # Input is expected to be type Array<Array<CodeLine>>
+    #
+    # The outer array holds the various "groups" while the
+    # inner array holds code lines.
+    #
+    # All code lines are "joined" into the first line in
+    # their group.
+    #
+    # To preserve document size, empty lines are placed
+    # in the place of the lines that were "joined". Returns self.
+    def join_groups(groups)
+      groups.each do |group|
+        first_line = group.first
+
+        # Skip empty groups, and handle the case of multiple groups in a
+        # row: if one is already replaced, move on
+        next if first_line.nil? || @document[first_line.index].empty?
+
+        # Join group into the first line
+        @document[first_line.index] = CodeLine.new(
+          lex: group.map(&:lex).flatten,
+          line: group.join,
+          index: first_line.index
+        )
+
+        # Hide the rest of the lines
+        group[1..].each do |hidden|
+          # The above lines already have newlines in them, if add more
+          # then there will be double newline, use an empty line instead
+          @document[hidden.index] = CodeLine.new(line: "", index: hidden.index, lex: [])
+        end
+      end
+      self
+    end
+
+    # Helper method for grabbing elements from document
+    #
+    # Like `take_while` over `@document[range]`, except that the first
+    # line the block rejects is also included in the returned array;
+    # nothing after that line is taken
+    def take_while_including(range = 0..)
+      take_next_and_stop = false
+      @document[range].take_while do |line|
+        # Returning false here is what ends the take_while
+        next false if take_next_and_stop
+        take_next_and_stop = !(yield line)
+        true
+      end
+    end
+  end
+end