require 'strscan'
require 'rdoc/text'

##
# A recursive-descent parser for RDoc markup.
#
# The parser tokenizes an input string then parses the tokens into a Document.
# Documents can be converted into output formats by writing a visitor like
# RDoc::Markup::ToHTML.
#
# The parser only handles the block-level constructs Paragraph, List,
# ListItem, Heading, Verbatim, BlankLine and Rule.  Inline markup such as
# \+blah\+ is handled separately by RDoc::Markup::AttributeManager.
#
# To see what markup the Parser implements read RDoc.  To see how to use
# RDoc markup to format text in your program read RDoc::Markup.
#
# Tokens are arrays of the form <tt>[type, data, column, line]</tt>.

class RDoc::Markup::Parser

  include RDoc::Text

  ##
  # List token types

  LIST_TOKENS = [
    :BULLET,
    :LABEL,
    :LALPHA,
    :NOTE,
    :NUMBER,
    :UALPHA,
  ]

  ##
  # Parser error subclass

  class Error < RuntimeError; end

  ##
  # Raised when the parser is unable to handle the given markup

  class ParseError < Error; end

  ##
  # Enables display of debugging information

  attr_accessor :debug

  ##
  # Token accessor

  attr_reader :tokens

  ##
  # Parses +str+ into a Document

  def self.parse str
    parser = new
    #parser.debug = true
    parser.tokenize str
    RDoc::Markup::Document.new(*parser.parse)
  end

  ##
  # Returns a token stream for +str+, for testing

  def self.tokenize str
    parser = new
    parser.tokenize str
    parser.tokens
  end

  ##
  # Creates a new Parser with an empty token stream.  See also ::parse

  def initialize
    @tokens = []
    @current_token = nil
    @debug = false

    @line = 0
    @line_pos = 0
  end

  ##
  # Builds a Heading of +level+ from the tokens up to the next NEWLINE, then
  # consumes that NEWLINE.

  def build_heading level
    heading = RDoc::Markup::Heading.new level, text
    skip :NEWLINE

    heading
  end

  ##
  # Builds a List flush to +margin+.  Stops at the first token that is not a
  # list token, is left of +margin+, or has a different list type than the
  # list being built.  Returns nil when no items were collected.

  def build_list margin
    p :list_start => margin if @debug

    list = RDoc::Markup::List.new

    until @tokens.empty? do
      type, data, column, = get

      case type
      when :BULLET, :LABEL, :LALPHA, :NOTE, :NUMBER, :UALPHA then
        list_type = type

        if column < margin then
          unget
          break
        end

        if list.type and list.type != list_type then
          unget
          break
        end

        list.type = list_type

        case type
        when :NOTE, :LABEL then
          _, indent, = get # SPACE

          # The label may sit on its own line; in that case the item body
          # takes its indent from the following INDENT token, provided it is
          # not left of the label.
          if :NEWLINE == peek_token.first then
            get
            peek_type, new_indent, peek_column, = peek_token
            indent = new_indent if
              peek_type == :INDENT and peek_column >= column
            unget
          end
        else
          # Only NOTE and LABEL items carry their marker text into the item.
          data = nil
          _, indent, = get
        end

        list_item = build_list_item(margin + indent, data)

        list << list_item if list_item
      else
        unget
        break
      end
    end

    p :list_end => margin if @debug

    return nil if list.empty?

    list
  end

  ##
  # Builds a ListItem that is flush to +indent+ with type +item_type+.
  # Returns nil when the item has no content.

  def build_list_item indent, item_type = nil
    p :list_item_start => [indent, item_type] if @debug

    list_item = RDoc::Markup::ListItem.new item_type

    until @tokens.empty? do
      type, data, column = get

      # A token left of the item's indent ends the item, except for NEWLINE
      # and for INDENT tokens that reach at least +indent+.
      if column < indent and
         not type == :NEWLINE and
         (type != :INDENT or data < indent) then
        unget
        break
      end

      case type
      when :INDENT then
        unget
        list_item.push(*parse(indent))
      when :TEXT then
        unget
        list_item << build_paragraph(indent)
      when :HEADER then
        list_item << build_heading(data)
      when :NEWLINE then
        list_item << RDoc::Markup::BlankLine.new
      when *LIST_TOKENS then
        unget
        list_item << build_list(column)
      else
        raise ParseError, "Unhandled token #{@current_token.inspect}"
      end
    end

    p :list_item_end => [indent, item_type] if @debug

    return nil if list_item.empty?

    # Drop a leading blank line when other content follows it.
    list_item.parts.shift if
      RDoc::Markup::BlankLine === list_item.parts.first and
      list_item.length > 1

    list_item
  end

  ##
  # Builds a Paragraph that is flush to +margin+ from consecutive TEXT
  # tokens starting at column +margin+.

  def build_paragraph margin
    p :paragraph_start => margin if @debug

    paragraph = RDoc::Markup::Paragraph.new

    until @tokens.empty? do
      type, data, column, = get

      case type
      when :INDENT then
        # An INDENT equal to the margin followed by TEXT continues the
        # paragraph; anything else ends it.
        next if data == margin and peek_token[0] == :TEXT

        unget
        break
      when :TEXT then
        if column != margin then
          unget
          break
        end

        paragraph << data
        skip :NEWLINE
      else
        unget
        break
      end
    end

    p :paragraph_end => margin if @debug

    paragraph
  end

  ##
  # Builds a Verbatim that is flush to +margin+.  Tokens inside the verbatim
  # section are turned back into their textual form.

  def build_verbatim margin
    p :verbatim_begin => margin if @debug

    verbatim = RDoc::Markup::Verbatim.new

    until @tokens.empty? do
      type, data, column, = get

      case type
      when :INDENT then
        if margin >= data then
          unget
          break
        end

        indent = data - margin

        verbatim << ' ' * indent
      when :HEADER then
        verbatim << '=' * data

        # Re-create the whitespace between the marker and the next token.
        _, _, peek_column, = peek_token
        peek_column ||= column + data
        verbatim << ' ' * (peek_column - column - data)
      when :RULE then
        # RULE data is the dash count minus two (see #tokenize).
        width = 2 + data
        verbatim << '-' * width

        _, _, peek_column, = peek_token
        peek_column ||= column + data + 2
        verbatim << ' ' * (peek_column - column - width)
      when :TEXT then
        verbatim << data
      when *LIST_TOKENS then
        if column <= margin then
          unget
          break
        end

        list_marker = case type
                      when :BULLET                   then '*'
                      when :LABEL                    then "[#{data}]"
                      when :LALPHA, :NUMBER, :UALPHA then "#{data}."
                      when :NOTE                     then "#{data}::"
                      end

        verbatim << list_marker

        # The following SPACE token holds the marker width plus padding.
        _, data, = get

        verbatim << ' ' * (data - list_marker.length)
      when :NEWLINE then
        verbatim << data
        break unless [:INDENT, :NEWLINE].include? peek_token[0]
      else
        unget
        break
      end
    end

    verbatim.normalize

    p :verbatim_end => margin if @debug

    verbatim
  end

  ##
  # Pulls the next token from the stream and remembers it as the current
  # token.  Returns nil when the stream is empty.

  def get
    @current_token = @tokens.shift
    p :get => @current_token if @debug
    @current_token
  end

  ##
  # Parses the tokens into an Array of document parts.  +indent+ is the
  # margin of the enclosing block; parsing stops at the first non-INDENT
  # token left of it.
  #
  # Raises ParseError for a token type it does not handle.

  def parse indent = 0
    p :parse_start => indent if @debug

    document = []

    until @tokens.empty? do
      type, data, column, = get

      if type != :INDENT and column < indent then
        unget
        break
      end

      case type
      when :HEADER then
        document << build_heading(data)
      when :INDENT then
        if indent > data then
          unget
          break
        elsif indent == data then
          next
        end

        # Deeper indentation than the current margin starts a verbatim.
        unget
        document << build_verbatim(indent)
      when :NEWLINE then
        document << RDoc::Markup::BlankLine.new
        skip :NEWLINE, false
      when :RULE then
        document << RDoc::Markup::Rule.new(data)
        skip :NEWLINE
      when :TEXT then
        unget
        document << build_paragraph(indent)

        # we're done with this paragraph (indent mismatch)
        break if peek_token[0] == :TEXT
      when *LIST_TOKENS then
        unget

        list = build_list(indent)

        document << list if list

        # we're done with this list (indent mismatch)
        break if LIST_TOKENS.include?(peek_token.first) and indent > 0
      else
        type, data, column, line = @current_token
        raise ParseError,
              "Unhandled token #{type} (#{data.inspect}) at #{line}:#{column}"
      end
    end

    p :parse_end => indent if @debug

    document
  end

  ##
  # Returns the next token on the stream without modifying the stream.
  # Returns an empty Array when the stream is empty.

  def peek_token
    token = @tokens.first || []
    p :peek => token if @debug
    token
  end

  ##
  # Skips a token of +token_type+, optionally raising an error.
  #
  # Returns the skipped token, or nil at the end of the stream.  When the
  # next token has a different type it is put back and, if +error+ is true,
  # a ParseError is raised.

  def skip token_type, error = true
    type, = get

    return unless type # end of stream

    return @current_token if token_type == type

    unget

    raise ParseError, "expected #{token_type} got #{@current_token.inspect}" if
      error
  end

  ##
  # Consumes tokens until NEWLINE and turns them back into text.  The
  # NEWLINE itself is left on the stream.
  #
  # Raises ParseError for a token type that cannot be turned back into text.

  def text
    text = ''

    loop do
      type, data, = get

      text << case type
              when :BULLET then
                _, space, = get # SPACE
                "*#{' ' * (space - 1)}"
              when :LABEL then
                _, space, = get # SPACE
                "[#{data}]#{' ' * (space - data.length - 2)}"
              when :LALPHA, :NUMBER, :UALPHA then
                _, space, = get # SPACE
                "#{data}.#{' ' * (space - 2)}"
              when :NOTE then
                _, space = get # SPACE
                "#{data}::#{' ' * (space - data.length - 2)}"
              when :TEXT then
                data
              when :NEWLINE then
                unget
                break
              when nil then
                break
              else
                raise ParseError, "unhandled token #{@current_token.inspect}"
              end
    end

    text
  end

  ##
  # Calculates the column and line of the current token based on +offset+.

  def token_pos offset
    [offset - @line_pos, @line]
  end

  ##
  # Turns text +input+ into a stream of tokens, appending them to #tokens.
  # Returns the parser.
  #
  # Both LF and CRLF line endings are accepted.  A carriage return that
  # precedes a line feed never ends up in TEXT data, and does not stop a
  # rule, label or note at the end of a line from being recognized.

  def tokenize input
    s = StringScanner.new input

    @line = 0
    @line_pos = 0

    until s.eos? do
      pos = s.pos

      @tokens << case
                 when s.scan(/\r?\n/) then
                   token = [:NEWLINE, s.matched, *token_pos(pos)]
                   @line_pos = s.pos
                   @line += 1
                   token
                 when s.scan(/ +/) then
                   [:INDENT, s.matched_size, *token_pos(pos)]
                 # NOTE(review): \s+ also matches line terminators, so a
                 # marker directly followed by a newline consumes it without
                 # advancing @line.  Left as-is; the same applies to the
                 # bullet pattern below.
                 when s.scan(/(=+)\s+/) then
                   level = s[1].length
                   level = 6 if level > 6
                   @tokens << [:HEADER, level, *token_pos(pos)]

                   pos = s.pos
                   s.scan(/.*/)
                   # "." matches "\r"; keep the CR of a CRLF out of the text
                   [:TEXT, s.matched.sub(/\r\z/, ''), *token_pos(pos)]
                 when s.scan(/^(-{3,}) *\r?$/) then
                   [:RULE, s[1].length - 2, *token_pos(pos)]
                 when s.scan(/([*-])\s+/) then
                   @tokens << [:BULLET, :BULLET, *token_pos(pos)]
                   [:SPACE, s.matched_size, *token_pos(pos)]
                 when s.scan(/([a-z]|\d+)\.[ \t]+\S/i) then
                   list_label = s[1]
                   width      = s.matched_size - 1

                   s.pos -= 1 # unget \S

                   list_type = case list_label
                               when /[a-z]/ then :LALPHA
                               when /[A-Z]/ then :UALPHA
                               when /\d/    then :NUMBER
                               else
                                 raise ParseError, "BUG token #{list_label}"
                               end

                   @tokens << [list_type, list_label, *token_pos(pos)]
                   [:SPACE, width, *token_pos(pos)]
                 when s.scan(/\[(.*?)\]( +|\r?$)/) then
                   @tokens << [:LABEL, s[1], *token_pos(pos)]
                   # exclude a trailing CR so CRLF and LF give equal widths
                   width = s.matched.sub(/\r\z/, '').bytesize
                   [:SPACE, width, *token_pos(pos)]
                 when s.scan(/(.*?)::( +|\r?$)/) then
                   @tokens << [:NOTE, s[1], *token_pos(pos)]
                   # exclude a trailing CR so CRLF and LF give equal widths
                   width = s.matched.sub(/\r\z/, '').bytesize
                   [:SPACE, width, *token_pos(pos)]
                 else
                   s.scan(/.*/)
                   # "." matches "\r"; keep the CR of a CRLF out of the text
                   [:TEXT, s.matched.sub(/\r\z/, ''), *token_pos(pos)]
                 end
    end

    self
  end

  ##
  # Returns the current token or +token+ to the token stream.
  #
  # Raises Error when +token+ equals the token already at the head of the
  # stream.

  def unget token = @current_token
    p :unget => token if @debug
    raise Error, 'too many #ungets' if token == @tokens.first
    @tokens.unshift token if token
  end

end

require 'rdoc/markup/blank_line'
require 'rdoc/markup/document'
require 'rdoc/markup/heading'
require 'rdoc/markup/list'
require 'rdoc/markup/list_item'
require 'rdoc/markup/paragraph'
require 'rdoc/markup/rule'
require 'rdoc/markup/verbatim'