summaryrefslogtreecommitdiff
path: root/lib/rdoc/markup/parser.rb
diff options
context:
space:
mode:
authordrbrain <drbrain@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2010-04-01 07:45:16 +0000
committerdrbrain <drbrain@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2010-04-01 07:45:16 +0000
commit46580b51477355fece514573c88cb67030f4a502 (patch)
tree779c1a64466643461b3daa4cd9a3548b84f0fd55 /lib/rdoc/markup/parser.rb
parent9b40cdfe8c973a061c5683ad78c283b9ddb8b2e9 (diff)
Import RDoc 2.5
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@27147 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'lib/rdoc/markup/parser.rb')
-rw-r--r--lib/rdoc/markup/parser.rb528
1 files changed, 528 insertions, 0 deletions
diff --git a/lib/rdoc/markup/parser.rb b/lib/rdoc/markup/parser.rb
new file mode 100644
index 0000000000..c0d6519fd5
--- /dev/null
+++ b/lib/rdoc/markup/parser.rb
@@ -0,0 +1,528 @@
+require 'strscan'
+require 'rdoc/text'
+
+##
+# A recursive-descent parser for RDoc markup.
+#
+# The parser tokenizes an input string then parses the tokens into a Document.
+# Documents can be converted into output formats by writing a visitor like
+# RDoc::Markup::ToHTML.
+#
+# The parser only handles the block-level constructs Paragraph, List,
+# ListItem, Heading, Verbatim, BlankLine and Rule. Inline markup such as
+# <tt>\+blah\+</tt> is handled separately by RDoc::Markup::AttributeManager.
+#
+# To see what markup the Parser implements read RDoc. To see how to use
+# RDoc markup to format text in your program read RDoc::Markup.
+
+class RDoc::Markup::Parser
+
+ include RDoc::Text
+
+ ##
+ # Token types that may begin a list item; #build_list collects items
+ # while the next token is one of these
+
+ LIST_TOKENS = [
+ :BULLET,
+ :LABEL,
+ :LALPHA,
+ :NOTE,
+ :NUMBER,
+ :UALPHA,
+ ]
+
+ ##
+ # Parser error subclass
+
+ class Error < RuntimeError; end
+
+ ##
+ # Raised when the parser is unable to handle the given markup
+
+ class ParseError < Error; end
+
+ ##
+ # Enables display of debugging information
+
+ attr_accessor :debug
+
+ ##
+ # The token stream produced by #tokenize and consumed by #parse
+
+ attr_reader :tokens
+
+ ##
+ # Parses +str+ into a Document
+
+ def self.parse str
+ parser = new
+ #parser.debug = true
+ parser.tokenize str
+ # Parser#parse returns an Array of block elements; splat them into a
+ # single Document
+ RDoc::Markup::Document.new(*parser.parse)
+ end
+
+ ##
+ # Returns a token stream for +str+ without parsing it, for testing
+
+ def self.tokenize str
+ parser = new
+ parser.tokenize str
+ parser.tokens
+ end
+
+ ##
+ # Creates a new Parser.  See also ::parse
+
+ def initialize
+ # pending tokens, consumed by #get and restored by #unget
+ @tokens = []
+ # most recent token returned by #get; #unget pushes it back
+ @current_token = nil
+ @debug = false
+
+ # current line number and the scanner offset of the start of that
+ # line; maintained by #tokenize and read by #token_pos
+ @line = 0
+ @line_pos = 0
+ end
+
+ ##
+ # Builds a Heading of +level+ from the remaining tokens on the current
+ # line (see #text), then skips the trailing :NEWLINE
+
+ def build_heading level
+ heading = RDoc::Markup::Heading.new level, text
+ skip :NEWLINE
+
+ heading
+ end
+
+ ##
+ # Builds a List flush to +margin+.  Returns nil if no list items were
+ # collected.
+
+ def build_list margin
+ p :list_start => margin if @debug
+
+ list = RDoc::Markup::List.new
+
+ until @tokens.empty? do
+ type, data, column, = get
+
+ case type
+ when :BULLET, :LABEL, :LALPHA, :NOTE, :NUMBER, :UALPHA then
+ list_type = type
+
+ # a marker to the left of our margin ends this list
+ if column < margin then
+ unget
+ break
+ end
+
+ # a marker of a different type ends this list; the caller will
+ # start a new one
+ if list.type and list.type != list_type then
+ unget
+ break
+ end
+
+ list.type = list_type
+
+ case type
+ when :NOTE, :LABEL then
+ _, indent, = get # SPACE
+ # a label/note body may start on the following line; if so, use
+ # that line's INDENT (when at least as deep as the marker) as
+ # the item indent
+ if :NEWLINE == peek_token.first then
+ get
+ peek_type, new_indent, peek_column, = peek_token
+ indent = new_indent if
+ peek_type == :INDENT and peek_column >= column
+ unget
+ end
+ else
+ # bullet/alpha/number markers carry no label for the item
+ data = nil
+ _, indent, = get
+ end
+
+ list_item = build_list_item(margin + indent, data)
+
+ list << list_item if list_item
+ else
+ # not a list marker, so the list is done
+ unget
+ break
+ end
+ end
+
+ p :list_end => margin if @debug
+
+ return nil if list.empty?
+
+ list
+ end
+
+ ##
+ # Builds a ListItem that is flush to +indent+ with type +item_type+.
+ # Returns nil if the item ended up empty.
+
+ def build_list_item indent, item_type = nil
+ p :list_item_start => [indent, item_type] if @debug
+
+ list_item = RDoc::Markup::ListItem.new item_type
+
+ until @tokens.empty? do
+ type, data, column = get
+
+ # anything left of our indent ends the item, except blank lines and
+ # INDENT tokens at least as deep as the item
+ if column < indent and
+ not type == :NEWLINE and
+ (type != :INDENT or data < indent) then
+ unget
+ break
+ end
+
+ case type
+ when :INDENT then
+ # deeper-indented content is parsed recursively into the item
+ unget
+ list_item.push(*parse(indent))
+ when :TEXT then
+ unget
+ list_item << build_paragraph(indent)
+ when :HEADER then
+ list_item << build_heading(data)
+ when :NEWLINE then
+ list_item << RDoc::Markup::BlankLine.new
+ when *LIST_TOKENS then
+ unget
+ list_item << build_list(column)
+ else
+ raise ParseError, "Unhandled token #{@current_token.inspect}"
+ end
+ end
+
+ p :list_item_end => [indent, item_type] if @debug
+
+ return nil if list_item.empty?
+
+ # drop a leading blank line when the item has other content
+ list_item.parts.shift if
+ RDoc::Markup::BlankLine === list_item.parts.first and
+ list_item.length > 1
+
+ list_item
+ end
+
<parameter>+ ##
+ # Builds a Paragraph that is flush to +margin+
+
+ def build_paragraph margin
+ p :paragraph_start => margin if @debug
+
+ paragraph = RDoc::Markup::Paragraph.new
+
+ until @tokens.empty? do
+ type, data, column, = get
+
+ case type
+ when :INDENT then
+ # an INDENT exactly at our margin followed by TEXT is a
+ # continuation line; anything else ends the paragraph
+ next if data == margin and peek_token[0] == :TEXT
+
+ unget
+ break
+ when :TEXT then
+ # text at a different column belongs to something else
+ if column != margin then
+ unget
+ break
+ end
+
+ paragraph << data
+ skip :NEWLINE
+ else
+ unget
+ break
+ end
+ end
+
+ p :paragraph_end => margin if @debug
+
+ paragraph
+ end</parameter>
+
+ ##
+ # Builds a Verbatim that is flush to +margin+.  Tokens that the
+ # tokenizer recognized as markup (headers, rules, list markers) are
+ # turned back into their literal text since markup is not interpreted
+ # inside verbatim sections.
+
+ def build_verbatim margin
+ p :verbatim_begin => margin if @debug
+ verbatim = RDoc::Markup::Verbatim.new
+
+ until @tokens.empty? do
+ type, data, column, = get
+
+ case type
+ when :INDENT then
+ # indentation at or left of the margin ends the verbatim section
+ if margin >= data then
+ unget
+ break
+ end
+
+ # keep indentation beyond the margin as literal spaces
+ indent = data - margin
+
+ verbatim << ' ' * indent
+ when :HEADER then
+ # reconstruct the '=' run, then pad up to the following token
+ verbatim << '=' * data
+
+ _, _, peek_column, = peek_token
+ peek_column ||= column + data
+ verbatim << ' ' * (peek_column - column - data)
+ when :RULE then
+ # a RULE token's data is length - 2; reconstruct the '-' run
+ width = 2 + data
+ verbatim << '-' * width
+
+ _, _, peek_column, = peek_token
+ peek_column ||= column + data + 2
+ verbatim << ' ' * (peek_column - column - width)
+ when :TEXT then
+ verbatim << data
+ when *LIST_TOKENS then
+ # a list marker at or left of the margin ends the section
+ if column <= margin then
+ unget
+ break
+ end
+
+ # otherwise re-serialize the marker as literal text
+ list_marker = case type
+ when :BULLET then '*'
+ when :LABEL then "[#{data}]"
+ when :LALPHA, :NUMBER, :UALPHA then "#{data}."
+ when :NOTE then "#{data}::"
+ end
+
+ verbatim << list_marker
+
+ _, data, = get
+
+ verbatim << ' ' * (data - list_marker.length)
+ when :NEWLINE then
+ verbatim << data
+ # only further-indented or blank lines continue the section
+ break unless [:INDENT, :NEWLINE].include? peek_token[0]
+ else
+ unget
+ break
+ end
+ end
+
+ verbatim.normalize
+
+ p :verbatim_end => margin if @debug
+
+ verbatim
+ end
+
+ ##
+ # Pulls the next token from the stream, remembering it for #unget.
+ # Returns nil at end of stream.
+
+ def get
+ @current_token = @tokens.shift
+ p :get => @current_token if @debug
+ @current_token
+ end
+
+ ##
+ # Parses the tokens into an Array of block-level elements (::parse
+ # wraps the result in a Document).  +indent+ is the column this level
+ # of parsing is flush to.
+
+ def parse indent = 0
+ p :parse_start => indent if @debug
+
+ document = []
+
+ until @tokens.empty? do
+ type, data, column, = get
+
+ # anything left of our indent belongs to an outer level
+ if type != :INDENT and column < indent then
+ unget
+ break
+ end
+
+ case type
+ when :HEADER then
+ document << build_heading(data)
+ when :INDENT then
+ if indent > data then
+ unget
+ break
+ elsif indent == data then
+ # same-level indentation carries no structure here
+ next
+ end
+
+ # deeper indentation starts a verbatim section
+ unget
+ document << build_verbatim(indent)
+ when :NEWLINE then
+ # collapse runs of blank lines into a single BlankLine
+ document << RDoc::Markup::BlankLine.new
+ skip :NEWLINE, false
+ when :RULE then
+ document << RDoc::Markup::Rule.new(data)
+ skip :NEWLINE
+ when :TEXT then
+ unget
+ document << build_paragraph(indent)
+
+ # we're done with this paragraph (indent mismatch)
+ break if peek_token[0] == :TEXT
+ when *LIST_TOKENS then
+ unget
+
+ list = build_list(indent)
+
+ document << list if list
+
+ # we're done with this list (indent mismatch)
+ break if LIST_TOKENS.include? peek_token.first and indent > 0
+ else
+ type, data, column, line = @current_token
+ raise ParseError,
+ "Unhandled token #{type} (#{data.inspect}) at #{line}:#{column}"
+ end
+ end
+
+ p :parse_end => indent if @debug
+
+ document
+ end
+
+ ##
+ # Returns the next token on the stream without modifying the stream.
+ # Returns an empty Array at end of stream so callers may index into
+ # the result safely.
+
+ def peek_token
+ token = @tokens.first || []
+ p :peek => token if @debug
+ token
+ end
+
+ ##
+ # Skips a token of +token_type+ and returns it.  Returns nil at end of
+ # stream.  On a mismatch the token is pushed back and, when +error+ is
+ # true, a ParseError is raised.
+
+ def skip token_type, error = true
+ type, data, = get
+
+ return unless type # end of stream
+
+ return @current_token if token_type == type
+
+ unget
+
+ raise ParseError, "expected #{token_type} got #{@current_token.inspect}" if
+ error
+ end
+
+ ##
+ # Consumes tokens up to the next NEWLINE (or end of stream) and turns
+ # them back into their literal text, preserving original spacing via
+ # the SPACE tokens that follow list markers.  Used by #build_heading,
+ # where markers must not be interpreted.
+
+ def text
+ text = ''
+
+ loop do
+ type, data, = get
+
+ text << case type
+ when :BULLET then
+ _, space, = get # SPACE
+ "*#{' ' * (space - 1)}"
+ when :LABEL then
+ _, space, = get # SPACE
+ "[#{data}]#{' ' * (space - data.length - 2)}"
+ when :LALPHA, :NUMBER, :UALPHA then
+ _, space, = get # SPACE
+ "#{data}.#{' ' * (space - 2)}"
+ when :NOTE then
+ _, space = get # SPACE
+ "#{data}::#{' ' * (space - data.length - 2)}"
+ when :TEXT then
+ data
+ when :NEWLINE then
+ # leave the NEWLINE for the caller to consume
+ unget
+ break
+ when nil then
+ break
+ else
+ raise ParseError, "unhandled token #{@current_token.inspect}"
+ end
+ end
+
+ text
+ end
+
+ ##
+ # Calculates the position of the token starting at +offset+ from the
+ # line-tracking state kept by #tokenize.  Returns [column, line].
+
+ def token_pos offset
+ [offset - @line_pos, @line]
+ end
+
+ ##
+ # Turns text +input+ into a stream of tokens stored in @tokens.  Each
+ # token is an Array of [type, data, column, line].  Returns self.
+
+ def tokenize input
+ s = StringScanner.new input
+
+ @line = 0
+ @line_pos = 0
+
+ until s.eos? do
+ pos = s.pos
+
+ @tokens << case
+ # end of line; update the line-tracking state used by #token_pos
+ when s.scan(/\r?\n/) then
+ token = [:NEWLINE, s.matched, *token_pos(pos)]
+ @line_pos = s.pos
+ @line += 1
+ token
+ # leading/interior run of spaces
+ when s.scan(/ +/) then
+ [:INDENT, s.matched_size, *token_pos(pos)]
+ # heading: '=' run (capped at level 6) followed by the heading text
+ when s.scan(/(=+)\s+/) then
+ level = s[1].length
+ level = 6 if level > 6
+ @tokens << [:HEADER, level, *token_pos(pos)]
+
+ pos = s.pos
+ s.scan(/.*/)
+ [:TEXT, s.matched, *token_pos(pos)]
+ # horizontal rule; data is the length beyond the minimum of 2
+ when s.scan(/^(-{3,}) *$/) then
+ [:RULE, s[1].length - 2, *token_pos(pos)]
+ # bullet marker; a SPACE token records the marker+gap width
+ when s.scan(/([*-])\s+/) then
+ @tokens << [:BULLET, :BULLET, *token_pos(pos)]
+ [:SPACE, s.matched_size, *token_pos(pos)]
+ # alpha or numbered list marker such as 'a.' or '12.'
+ when s.scan(/([a-z]|\d+)\.[ \t]+\S/i) then
+ list_label = s[1]
+ width = s.matched_size - 1
+
+ s.pos -= 1 # unget \S
+
+ list_type = case list_label
+ when /[a-z]/ then :LALPHA
+ when /[A-Z]/ then :UALPHA
+ when /\d/ then :NUMBER
+ else
+ raise ParseError, "BUG token #{list_label}"
+ end
+
+ @tokens << [list_type, list_label, *token_pos(pos)]
+ [:SPACE, width, *token_pos(pos)]
+ # labeled list marker: [label]
+ when s.scan(/\[(.*?)\]( +|$)/) then
+ @tokens << [:LABEL, s[1], *token_pos(pos)]
+ [:SPACE, s.matched_size, *token_pos(pos)]
+ # note list marker: label::
+ when s.scan(/(.*?)::( +|$)/) then
+ @tokens << [:NOTE, s[1], *token_pos(pos)]
+ [:SPACE, s.matched_size, *token_pos(pos)]
+ # anything else on the line is plain text
+ else s.scan(/.*/)
+ [:TEXT, s.matched, *token_pos(pos)]
+ end
+ end
+
+ self
+ end
+
+ ##
+ # Returns the current token or +token+ to the token stream.  Raises
+ # Error if +token+ is already at the front of the stream (a double
+ # unget).
+
+ def unget token = @current_token
+ p :unget => token if @debug
+ raise Error, 'too many #ungets' if token == @tokens.first
+ @tokens.unshift token if token
+ end
+
+end
+
+require 'rdoc/markup/blank_line'
+require 'rdoc/markup/document'
+require 'rdoc/markup/heading'
+require 'rdoc/markup/list'
+require 'rdoc/markup/list_item'
+require 'rdoc/markup/paragraph'
+require 'rdoc/markup/rule'
+require 'rdoc/markup/verbatim'
+