lib/syntax_suggest/clean_document.rb


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306

# frozen_string_literal: true

module SyntaxSuggest
  # Parses and sanitizes source into a lexically aware document
  #
  # Internally the document is represented by an array with each
  # index containing a CodeLine correlating to a line from the source code.
  #
  # There are three main phases in the algorithm:
  #
  # 1. Sanitize/format input source
  # 2. Search for invalid blocks
  # 3. Format invalid blocks into something meaninful
  #
  # This class handles the first part.
  #
  # The reason this class exists is to format input source
  # for better/easier/cleaner exploration.
  #
  # The CodeSearch class operates at the line level so
  # we must be careful to not introduce lines that look
  # valid by themselves, but when removed will trigger syntax errors
  # or strange behavior.
  #
  # ## Join Trailing slashes
  #
  # Code with a trailing slash is logically treated as a single line:
  #
  #     1 it "code can be split" \
  #     2    "across multiple lines" do
  #
  # In this case removing line 2 would add a syntax error. We get around
  # this by internally joining the two lines into a single "line" object
  #
  # ## Logically Consecutive lines
  #
  # Code that can be broken over multiple
  # lines such as method calls are on different lines:
  #
  #     1 User.
  #     2   where(name: "schneems").
  #     3   first
  #
  # Removing line 2 can introduce a syntax error. To fix this, all lines
  # are joined into one.
  #
  # ## Heredocs
  #
  # A heredoc is an way of defining a multi-line string. They can cause many
  # problems. If left as a single line, the parser would try to parse the contents
  # as ruby code rather than as a string. Even without this problem, we still
  # hit an issue with indentation:
  #
  #    1 foo = <<~HEREDOC
  #    2  "Be yourself; everyone else is already taken.""
  #    3    ― Oscar Wilde
  #    4      puts "I look like ruby code" # but i'm still a heredoc
  #    5 HEREDOC
  #
  # If we didn't join these lines then our algorithm would think that line 4
  # is separate from the rest, has a higher indentation, then look at it first
  # and remove it.
  #
  # If the code evaluates line 5 by itself it will think line 5 is a constant,
  # remove it, and introduce a syntax errror.
  #
  # All of these problems are fixed by joining the whole heredoc into a single
  # line.
  #
  # ## Comments and whitespace
  #
  # Comments can throw off the way the lexer tells us that the line
  # logically belongs with the next line. This is valid ruby but
  # results in a different lex output than before:
  #
  #     1 User.
  #     2   where(name: "schneems").
  #     3   # Comment here
  #     4   first
  #
  # To handle this we can replace comment lines with empty lines
  # and then re-lex the source. This removal and re-lexing preserves
  # line index and document size, but generates an easier to work with
  # document.
  #
  class CleanDocument
    def initialize(source:)
      lines = clean_sweep(source: source)
      @document = CodeLine.from_source(lines.join, lines: lines)
    end

    # Call all of the document "cleaners"
    # and return self
    def call
      join_trailing_slash!
      join_consecutive!
      join_heredoc!

      self
    end

    # Return an array of CodeLines in the
    # document
    def lines
      @document
    end

    # Renders the document back to a string
    def to_s
      @document.join
    end

    # Remove comments
    #
    # replace with empty newlines
    #
    #     source = <<~'EOM'
    #       # Comment 1
    #       puts "hello"
    #       # Comment 2
    #       puts "world"
    #     EOM
    #
    #     lines = CleanDocument.new(source: source).lines
    #     expect(lines[0].to_s).to eq("\n")
    #     expect(lines[1].to_s).to eq("puts "hello")
    #     expect(lines[2].to_s).to eq("\n")
    #     expect(lines[3].to_s).to eq("puts "world")
    #
    # Important: This must be done before lexing.
    #
    # After this change is made, we lex the document because
    # removing comments can change how the doc is parsed.
    #
    # For example:
    #
    #     values = LexAll.new(source: <<~EOM))
    #       User.
    #         # comment
    #         where(name: 'schneems')
    #     EOM
    #     expect(
    #       values.count {|v| v.type == :on_ignored_nl}
    #     ).to eq(1)
    #
    # After the comment is removed:
    #
    #     values = LexAll.new(source: <<~EOM))
    #       User.
    #
    #         where(name: 'schneems')
    #     EOM
    #     expect(
    #      values.count {|v| v.type == :on_ignored_nl}
    #    ).to eq(2)
    #
    def clean_sweep(source:)
      # Match comments, but not HEREDOC strings with #{variable} interpolation
      # https://rubular.com/r/HPwtW9OYxKUHXQ
      source.lines.map do |line|
        if line.match?(/^\s*#([^{].*|)$/)
          $/
        else
          line
        end
      end
    end

    # Smushes all heredoc lines into one line
    #
    #     source = <<~'EOM'
    #       foo = <<~HEREDOC
    #          lol
    #          hehehe
    #       HEREDOC
    #     EOM
    #
    #     lines = CleanDocument.new(source: source).join_heredoc!.lines
    #     expect(lines[0].to_s).to eq(source)
    #     expect(lines[1].to_s).to eq("")
    def join_heredoc!
      start_index_stack = []
      heredoc_beg_end_index = []
      lines.each do |line|
        line.lex.each do |lex_value|
          case lex_value.type
          when :on_heredoc_beg
            start_index_stack << line.index
          when :on_heredoc_end
            start_index = start_index_stack.pop
            end_index = line.index
            heredoc_beg_end_index << [start_index, end_index]
          end
        end
      end

      heredoc_groups = heredoc_beg_end_index.map { |start_index, end_index| @document[start_index..end_index] }

      join_groups(heredoc_groups)
      self
    end

    # Smushes logically "consecutive" lines
    #
    #     source = <<~'EOM'
    #       User.
    #         where(name: 'schneems').
    #         first
    #     EOM
    #
    #     lines = CleanDocument.new(source: source).join_consecutive!.lines
    #     expect(lines[0].to_s).to eq(source)
    #     expect(lines[1].to_s).to eq("")
    #
    # The one known case this doesn't handle is:
    #
    #     Ripper.lex <<~EOM
    #       a &&
    #        b ||
    #        c
    #     EOM
    #
    # For some reason this introduces `on_ignore_newline` but with BEG type
    #
    def join_consecutive!
      consecutive_groups = @document.select(&:ignore_newline_not_beg?).map do |code_line|
        take_while_including(code_line.index..) do |line|
          line.ignore_newline_not_beg?
        end
      end

      join_groups(consecutive_groups)
      self
    end

    # Join lines with a trailing slash
    #
    #     source = <<~'EOM'
    #       it "code can be split" \
    #          "across multiple lines" do
    #     EOM
    #
    #     lines = CleanDocument.new(source: source).join_consecutive!.lines
    #     expect(lines[0].to_s).to eq(source)
    #     expect(lines[1].to_s).to eq("")
    def join_trailing_slash!
      trailing_groups = @document.select(&:trailing_slash?).map do |code_line|
        take_while_including(code_line.index..) { |x| x.trailing_slash? }
      end
      join_groups(trailing_groups)
      self
    end

    # Helper method for joining "groups" of lines
    #
    # Input is expected to be type Array<Array<CodeLine>>
    #
    # The outer array holds the various "groups" while the
    # inner array holds code lines.
    #
    # All code lines are "joined" into the first line in
    # their group.
    #
    # To preserve document size, empty lines are placed
    # in the place of the lines that were "joined"
    def join_groups(groups)
      groups.each do |lines|
        line = lines.first

        # Handle the case of multiple groups in a a row
        # if one is already replaced, move on
        next if @document[line.index].empty?

        # Join group into the first line
        @document[line.index] = CodeLine.new(
          lex: lines.map(&:lex).flatten,
          line: lines.join,
          index: line.index
        )

        # Hide the rest of the lines
        lines[1..].each do |line|
          # The above lines already have newlines in them, if add more
          # then there will be double newline, use an empty line instead
          @document[line.index] = CodeLine.new(line: "", index: line.index, lex: [])
        end
      end
      self
    end

    # Helper method for grabbing elements from document
    #
    # Like `take_while` except when it stops
    # iterating, it also returns the line
    # that caused it to stop
    def take_while_including(range = 0..)
      take_next_and_stop = false
      @document[range].take_while do |line|
        next if take_next_and_stop

        take_next_and_stop = !(yield line)
        true
      end
    end
  end
end