summary | refs | log | tree | commit | diff
path: root/lib/rdoc
diff options
context:
space:
mode:
author Yusuke Endoh <mame@ruby-lang.org> 2019-08-07 01:53:56 +0900
committer aycabta <aycabta@gmail.com> 2019-08-16 06:02:45 +0900
commit 0a0760aa632f05bc04df395d0173580042d9f730 (patch)
tree 2b287cf83c29bfbba5d517f9d14a63cfd73a0621 /lib/rdoc
parent 9d2fed2ccd1724d1cf42a3075c20dcc418082761 (diff)
Refactor and improve performance of RDoc::Markup::Parser
This change introduces a wrapper around StringScanner that keeps track of the current position (column and line number). It has two advantages: it is faster, and it is more modular. The old code frequently ran `@input.byteslice(0, byte_offset).length` to compute the current position, which was painfully slow. This change instead updates the position at each scan, which roughly halves the time of the "Generating RI format into ..." step in Ruby's `make rdoc` (5.5 sec -> 3.0 sec). In addition, the old code used four instance variables (`@input`, `@line`, `@line_pos`, and `@s`) to track the position. This change factors them out into MyStringScanner, so now only one variable (`@s`) is needed.
Diffstat (limited to 'lib/rdoc')
-rw-r--r-- lib/rdoc/markup/parser.rb 101
-rw-r--r-- lib/rdoc/tom_doc.rb 13
2 files changed, 65 insertions, 49 deletions
diff --git a/lib/rdoc/markup/parser.rb b/lib/rdoc/markup/parser.rb
index 14f1f6c719..600eb841ac 100644
--- a/lib/rdoc/markup/parser.rb
+++ b/lib/rdoc/markup/parser.rb
@@ -80,10 +80,6 @@ class RDoc::Markup::Parser
@binary_input = nil
@current_token = nil
@debug = false
- @input = nil
- @input_encoding = nil
- @line = 0
- @line_pos = 0
@s = nil
@tokens = []
end
@@ -320,13 +316,6 @@ class RDoc::Markup::Parser
end
##
- # The character offset for the input string at the given +byte_offset+
-
- def char_pos byte_offset
- @input.byteslice(0, byte_offset).length
- end
-
- ##
# Pulls the next token from the stream.
def get
@@ -425,14 +414,53 @@ class RDoc::Markup::Parser
end
##
+ # A simple wrapper of StringScanner that is aware of the current column and lineno
+
+ class MyStringScanner
+ def initialize(input)
+ @line = @column = 0
+ @s = StringScanner.new input
+ end
+
+ def scan(re)
+ prev_pos = @s.pos
+ ret = @s.scan(re)
+ @column += ret.length if ret
+ ret
+ end
+
+ def unscan(s)
+ @s.pos -= s.bytesize
+ @column -= s.length
+ end
+
+ def pos
+ [@column, @line]
+ end
+
+ def newline!
+ @column = 0
+ @line += 1
+ end
+
+ def eos?
+ @s.eos?
+ end
+
+ def matched
+ @s.matched
+ end
+
+ def [](i)
+ @s[i]
+ end
+ end
+
+ ##
# Creates the StringScanner
def setup_scanner input
- @line = 0
- @line_pos = 0
- @input = input.dup
-
- @s = StringScanner.new input
+ @s = MyStringScanner.new input
end
##
@@ -467,31 +495,30 @@ class RDoc::Markup::Parser
@tokens << case
# [CR]LF => :NEWLINE
when @s.scan(/\r?\n/) then
- token = [:NEWLINE, @s.matched, *token_pos(pos)]
- @line_pos = char_pos @s.pos
- @line += 1
+ token = [:NEWLINE, @s.matched, *pos]
+ @s.newline!
token
# === text => :HEADER then :TEXT
when @s.scan(/(=+)(\s*)/) then
level = @s[1].length
- header = [:HEADER, level, *token_pos(pos)]
+ header = [:HEADER, level, *pos]
if @s[2] =~ /^\r?\n/ then
- @s.pos -= @s[2].length
+ @s.unscan(@s[2])
header
else
pos = @s.pos
@s.scan(/.*/)
@tokens << header
- [:TEXT, @s.matched.sub(/\r$/, ''), *token_pos(pos)]
+ [:TEXT, @s.matched.sub(/\r$/, ''), *pos]
end
# --- (at least 3) and nothing else on the line => :RULE
when @s.scan(/(-{3,}) *\r?$/) then
- [:RULE, @s[1].length - 2, *token_pos(pos)]
+ [:RULE, @s[1].length - 2, *pos]
# * or - followed by white space and text => :BULLET
when @s.scan(/([*-]) +(\S)/) then
- @s.pos -= @s[2].bytesize # unget \S
- [:BULLET, @s[1], *token_pos(pos)]
+ @s.unscan(@s[2])
+ [:BULLET, @s[1], *pos]
# A. text, a. text, 12. text => :UALPHA, :LALPHA, :NUMBER
when @s.scan(/([a-z]|\d+)\. +(\S)/i) then
# FIXME if tab(s), the column will be wrong
@@ -500,7 +527,7 @@ class RDoc::Markup::Parser
# before (and provide a check for that at least in debug
# mode)
list_label = @s[1]
- @s.pos -= @s[2].bytesize # unget \S
+ @s.unscan(@s[2])
list_type =
case list_label
when /[a-z]/ then :LALPHA
@@ -509,24 +536,24 @@ class RDoc::Markup::Parser
else
raise ParseError, "BUG token #{list_label}"
end
- [list_type, list_label, *token_pos(pos)]
+ [list_type, list_label, *pos]
# [text] followed by spaces or end of line => :LABEL
when @s.scan(/\[(.*?)\]( +|\r?$)/) then
- [:LABEL, @s[1], *token_pos(pos)]
+ [:LABEL, @s[1], *pos]
# text:: followed by spaces or end of line => :NOTE
when @s.scan(/(.*?)::( +|\r?$)/) then
- [:NOTE, @s[1], *token_pos(pos)]
+ [:NOTE, @s[1], *pos]
# >>> followed by end of line => :BLOCKQUOTE
when @s.scan(/>>> *(\w+)?$/) then
- [:BLOCKQUOTE, @s[1], *token_pos(pos)]
+ [:BLOCKQUOTE, @s[1], *pos]
# anything else: :TEXT
else
@s.scan(/(.*?)( )?\r?$/)
- token = [:TEXT, @s[1], *token_pos(pos)]
+ token = [:TEXT, @s[1], *pos]
if @s[2] then
@tokens << token
- [:BREAK, @s[2], *token_pos(pos + @s[1].length)]
+ [:BREAK, @s[2], pos[0] + @s[1].length, pos[1]]
else
token
end
@@ -537,16 +564,6 @@ class RDoc::Markup::Parser
end
##
- # Calculates the column (by character) and line of the current token based
- # on +byte_offset+.
-
- def token_pos byte_offset
- offset = char_pos byte_offset
-
- [offset - @line_pos, @line]
- end
-
- ##
# Returns the current token to the token stream
def unget
diff --git a/lib/rdoc/tom_doc.rb b/lib/rdoc/tom_doc.rb
index 625a6b5cfa..e161fcf42f 100644
--- a/lib/rdoc/tom_doc.rb
+++ b/lib/rdoc/tom_doc.rb
@@ -242,19 +242,18 @@ class RDoc::TomDoc < RDoc::Markup::Parser
@tokens << case
when @s.scan(/\r?\n/) then
- token = [:NEWLINE, @s.matched, *token_pos(pos)]
- @line_pos = char_pos @s.pos
- @line += 1
+ token = [:NEWLINE, @s.matched, *pos]
+ @s.newline!
token
when @s.scan(/(Examples|Signature)$/) then
- @tokens << [:HEADER, 3, *token_pos(pos)]
+ @tokens << [:HEADER, 3, *pos]
- [:TEXT, @s[1], *token_pos(pos)]
+ [:TEXT, @s[1], *pos]
when @s.scan(/([:\w][\w\[\]]*)[ ]+- /) then
- [:NOTE, @s[1], *token_pos(pos)]
+ [:NOTE, @s[1], *pos]
else
@s.scan(/.*/)
- [:TEXT, @s.matched.sub(/\r$/, ''), *token_pos(pos)]
+ [:TEXT, @s.matched.sub(/\r$/, ''), *pos]
end
end