1 files changed, 223 insertions, 0 deletions
diff --git a/lib/syntax_suggest/clean_document.rb b/lib/syntax_suggest/clean_document.rb
new file mode 100644
index 0000000000..94c68d8ad4
--- /dev/null
+++ b/lib/syntax_suggest/clean_document.rb
@@ -0,0 +1,223 @@
+# frozen_string_literal: true
+
+module SyntaxSuggest
+  # Parses and sanitizes source into a lexically aware document
+  #
+  # Internally the document is represented by an array with each
+  # index containing a CodeLine correlating to a line from the source code.
+  #
+  # There are three main phases in the algorithm:
+  #
+  # 1. Sanitize/format input source
+  # 2. Search for invalid blocks
+  # 3. Format invalid blocks into something meaningful
+  #
+  # This class handles the first part.
+  #
+  # The reason this class exists is to format input source
+  # for better/easier/cleaner exploration.
+  #
+  # The CodeSearch class operates at the line level so
+  # we must be careful to not introduce lines that look
+  # valid by themselves, but when removed will trigger syntax errors
+  # or strange behavior.
+  #
+  # ## Join Trailing slashes
+  #
+  # Lines joined by a trailing slash (`\`) are logically treated as a single line:
+  #
+  #     1 it "code can be split" \
+  #     2    "across multiple lines" do
+  #
+  # In this case removing line 2 would add a syntax error. We get around
+  # this by internally joining the two lines into a single "line" object
+  #
+  # ## Logically Consecutive lines
+  #
+  # Code such as a chain of method calls can be
+  # broken over multiple lines:
+  #
+  #     1 User.
+  #     2   where(name: "schneems").
+  #     3   first
+  #
+  # Removing line 2 can introduce a syntax error. To fix this, all lines
+  # are joined into one.
+  #
+  # ## Heredocs
+  #
+  # A heredoc is a way of defining a multi-line string. They can cause many
+  # problems. If left as a single line, the parser would try to parse the contents
+  # as ruby code rather than as a string. Even without this problem, we still
+  # hit an issue with indentation:
+  #
+  #    1 foo = <<~HEREDOC
+  #    2  "Be yourself; everyone else is already taken.""
+  #    3    ― Oscar Wilde
+  #    4      puts "I look like ruby code" # but i'm still a heredoc
+  #    5 HEREDOC
+  #
+  # If we didn't join these lines then our algorithm would think that line 4
+  # is separate from the rest, has a higher indentation, then look at it first
+  # and remove it.
+  #
+  # If the code evaluates line 5 by itself it will think line 5 is a constant,
+  # remove it, and introduce a syntax error.
+  #
+  # All of these problems are fixed by joining the whole heredoc into a single
+  # line.
+  class CleanDocument
+    def initialize(source:)
+      @document = CodeLine.from_source(source)
+    end
+
+    # Call all of the document "cleaners"
+    # and return self
+    def call
+      join_trailing_slash!
+      join_consecutive!
+      join_heredoc!
+
+      self
+    end
+
+    # Return an array of CodeLines in the
+    # document
+    def lines
+      @document
+    end
+
+    # Renders the document back to a string
+    def to_s
+      @document.join
+    end
+
+    # Smushes all heredoc lines into one line
+    #
+    #     source = <<~'EOM'
+    #       foo = <<~HEREDOC
+    #          lol
+    #          hehehe
+    #       HEREDOC
+    #     EOM
+    #
+    #     lines = CleanDocument.new(source: source).join_heredoc!.lines
+    #     expect(lines[0].to_s).to eq(source)
+    #     expect(lines[1].to_s).to eq("")
+    def join_heredoc!
+      start_index_stack = []
+      heredoc_beg_end_index = []
+      lines.each do |line|
+        line.tokens.each do |token|
+          case token.type
+          when :HEREDOC_START
+            start_index_stack << line.index
+          when :HEREDOC_END
+            start_index = start_index_stack.pop
+            end_index = line.index
+            heredoc_beg_end_index << [start_index, end_index]
+          end
+        end
+      end
+
+      heredoc_groups = heredoc_beg_end_index.map { |start_index, end_index| @document[start_index..end_index] }
+
+      join_groups(heredoc_groups)
+      self
+    end
+
+    # Smushes logically "consecutive" lines
+    #
+    #     source = <<~'EOM'
+    #       User.
+    #         where(name: 'schneems').
+    #         first
+    #     EOM
+    #
+    #     lines = CleanDocument.new(source: source).join_consecutive!.lines
+    #     expect(lines[0].to_s).to eq(source)
+    #     expect(lines[1].to_s).to eq("")
+    #
+    def join_consecutive!
+      consecutive_groups = @document.select(&:consecutive?).map do |code_line|
+        take_while_including(code_line.index..) do |line|
+          line.consecutive?
+        end
+      end
+
+      join_groups(consecutive_groups)
+      self
+    end
+
+    # Join lines with a trailing slash
+    #
+    #     source = <<~'EOM'
+    #       it "code can be split" \
+    #          "across multiple lines" do
+    #     EOM
+    #
+    #     lines = CleanDocument.new(source: source).join_consecutive!.lines
+    #     expect(lines[0].to_s).to eq(source)
+    #     expect(lines[1].to_s).to eq("")
+    def join_trailing_slash!
+      trailing_groups = @document.select(&:trailing_slash?).map do |code_line|
+        take_while_including(code_line.index..) { |x| x.trailing_slash? }
+      end
+      join_groups(trailing_groups)
+      self
+    end
+
+    # Helper method for joining "groups" of lines
+    #
+    # Input is expected to be type Array<Array<CodeLine>>
+    #
+    # The outer array holds the various "groups" while the
+    # inner array holds code lines.
+    #
+    # All code lines are "joined" into the first line in
+    # their group.
+    #
+    # To preserve document size, empty lines are placed
+    # in the place of the lines that were "joined"
+    def join_groups(groups)
+      groups.each do |lines|
+        line = lines.first
+
+        # Handle the case of multiple groups in a row
+        # if one is already replaced, move on
+        next if @document[line.index].empty?
+
+        # Join group into the first line
+        @document[line.index] = CodeLine.new(
+          tokens: lines.map(&:tokens).flatten,
+          line: lines.join,
+          index: line.index,
+          consecutive: false
+        )
+
+        # Hide the rest of the lines
+        lines[1..].each do |line|
+          # The above lines already have newlines in them, if add more
+          # then there will be double newline, use an empty line instead
+          @document[line.index] = CodeLine.new(line: "", index: line.index, tokens: [], consecutive: false)
+        end
+      end
+      self
+    end
+
+    # Helper method for grabbing elements from document
+    #
+    # Like `take_while` except when it stops
+    # iterating, it also returns the line
+    # that caused it to stop
+    def take_while_including(range = 0..)
+      take_next_and_stop = false
+      @document[range].take_while do |line|
+        next if take_next_and_stop
+
+        take_next_and_stop = !(yield line)
+        true
+      end
+    end
+  end
+end