Import RDoc 3

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@30249 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
author: drbrain <drbrain@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2010-12-20 03:22:49 +0000
committer: drbrain <drbrain@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2010-12-20 03:22:49 +0000
commit: 2ef9c50c6e405717d06362787c4549ca4f1c6485 (patch)
tree: ee99486567461dd5796f3d6edcc9e204187f2666 /lib/rdoc/markup/parser.rb
parent: d7effd506f5b91a636f2e6452ef1946b923007c7 (diff)
1 files changed, 190 insertions, 236 deletions
diff --git a/lib/rdoc/markup/parser.rb b/lib/rdoc/markup/parser.rb
index 9fba69dc29..ea02ee3c5b 100644
--- a/lib/rdoc/markup/parser.rb
+++ b/lib/rdoc/markup/parser.rb
@@ -52,13 +52,13 @@ class RDoc::Markup::Parser
   attr_reader :tokens
 
   ##
-  # Parsers +str+ into a Document
+  # Parses +str+ into a Document
 
   def self.parse str
     parser = new
-    #parser.debug = true
     parser.tokenize str
-    RDoc::Markup::Document.new(*parser.parse)
+    doc = RDoc::Markup::Document.new
+    parser.parse doc
   end
 
   ##
@@ -86,6 +86,7 @@ class RDoc::Markup::Parser
   # Builds a Heading of +level+
 
   def build_heading level
+    _, text, = get  # TEXT
     heading = RDoc::Markup::Heading.new level, text
     skip :NEWLINE
 
@@ -105,38 +106,69 @@ class RDoc::Markup::Parser
 
       case type
       when :BULLET, :LABEL, :LALPHA, :NOTE, :NUMBER, :UALPHA then
-        list_type = type
 
-        if column < margin then
+        if column < margin || (list.type && list.type != type) then
           unget
           break
         end
 
-        if list.type and list.type != list_type then
-          unget
-          break
-        end
-
-        list.type = list_type
+        list.type = type
+        peek_type, _, column, = peek_token
 
         case type
         when :NOTE, :LABEL then
-          _, indent, = get # SPACE
-          if :NEWLINE == peek_token.first then
-            get
-            peek_type, new_indent, peek_column, = peek_token
-            indent = new_indent if
-              peek_type == :INDENT and peek_column >= column
-            unget
+          if peek_type == :NEWLINE then
+            # description not on the same line as LABEL/NOTE
+            # skip the trailing newline & any blank lines below
+            while peek_type == :NEWLINE
+              get
+              peek_type, _, column, = peek_token
+            end
+
+            # we may be:
+            #   - at end of stream
+            #   - at a column < margin:
+            #         [text]
+            #       blah blah blah
+            #   - at the same column, but with a different type of list item
+            #       [text]
+            #       * blah blah
+            #   - at the same column, with the same type of list item
+            #       [one]
+            #       [two]
+            # In all cases, we have an empty description.
+            # In the last case only, we continue.
+            if peek_type.nil? || column < margin then
+              empty = 1
+            elsif column == margin then
+              case peek_type
+              when type
+                empty = 2 # continue
+              when *LIST_TOKENS
+                empty = 1
+              else
+                empty = 0
+              end
+            else
+              empty = 0
+            end
+
+            if empty > 0 then
+              item = RDoc::Markup::ListItem.new(data)
+              item << RDoc::Markup::BlankLine.new
+              list << item
+              break if empty == 1
+              next
+            end
           end
         else
           data = nil
-          _, indent, = get
         end
 
-        list_item = build_list_item(margin + indent, data)
+        list_item = RDoc::Markup::ListItem.new data
+        parse list_item, column
+        list << list_item
 
-        list << list_item if list_item
       else
         unget
         break
@@ -151,54 +183,6 @@ class RDoc::Markup::Parser
   end
 
   ##
-  # Builds a ListItem that is flush to +indent+ with type +item_type+
-
-  def build_list_item indent, item_type = nil
-    p :list_item_start => [indent, item_type] if @debug
-
-    list_item = RDoc::Markup::ListItem.new item_type
-
-    until @tokens.empty? do
-      type, data, column = get
-
-      if column < indent and
-         not type == :NEWLINE and
-         (type != :INDENT or data < indent) then
-        unget
-        break
-      end
-
-      case type
-      when :INDENT then
-        unget
-        list_item.push(*parse(indent))
-      when :TEXT then
-        unget
-        list_item << build_paragraph(indent)
-      when :HEADER then
-        list_item << build_heading(data)
-      when :NEWLINE then
-        list_item << RDoc::Markup::BlankLine.new
-      when *LIST_TOKENS then
-        unget
-        list_item << build_list(column)
-      else
-        raise ParseError, "Unhandled token #{@current_token.inspect}"
-      end
-    end
-
-    p :list_item_end => [indent, item_type] if @debug
-
-    return nil if list_item.empty?
-
-    list_item.parts.shift if
-      RDoc::Markup::BlankLine === list_item.parts.first and
-      list_item.length > 1
-
-    list_item
-  end
-
-  ##
   # Builds a Paragraph that is flush to +margin+
 
   def build_paragraph margin
@@ -209,18 +193,7 @@ class RDoc::Markup::Parser
     until @tokens.empty? do
       type, data, column, = get
 
-      case type
-      when :INDENT then
-        next if data == margin and peek_token[0] == :TEXT
-
-        unget
-        break
-      when :TEXT then
-        if column != margin then
-          unget
-          break
-        end
-
+      if type == :TEXT && column == margin then
         paragraph << data
         skip :NEWLINE
       else
@@ -235,67 +208,81 @@ class RDoc::Markup::Parser
   end
 
   ##
-  # Builds a Verbatim that is flush to +margin+
+  # Builds a Verbatim that is indented from +margin+.
+  #
+  # The verbatim block is shifted left (the least indented lines start in
+  # column 0).  Each part of the verbatim is one line of text, always
+  # terminated by a newline.  Blank lines always consist of a single newline
+  # character, and there is never a single newline at the end of the verbatim.
 
   def build_verbatim margin
     p :verbatim_begin => margin if @debug
     verbatim = RDoc::Markup::Verbatim.new
 
+    min_indent = nil
+    generate_leading_spaces = true
+    line = ''
+
     until @tokens.empty? do
       type, data, column, = get
 
-      case type
-      when :INDENT then
-        if margin >= data then
-          unget
-          break
-        end
+      if type == :NEWLINE then
+        line << data
+        verbatim << line
+        line = ''
+        generate_leading_spaces = true
+        next
+      end
 
-        indent = data - margin
+      if column <= margin
+        unget
+        break
+      end
 
-        verbatim << ' ' * indent
-      when :HEADER then
-        verbatim << '=' * data
+      if generate_leading_spaces then
+        indent = column - margin
+        line << ' ' * indent
+        min_indent = indent if min_indent.nil? || indent < min_indent
+        generate_leading_spaces = false
+      end
 
+      case type
+      when :HEADER then
+        line << '=' * data
         _, _, peek_column, = peek_token
         peek_column ||= column + data
-        verbatim << ' ' * (peek_column - column - data)
+        indent = peek_column - column - data
+        line << ' ' * indent
       when :RULE then
         width = 2 + data
-        verbatim << '-' * width
-
+        line << '-' * width
         _, _, peek_column, = peek_token
-        peek_column ||= column + data + 2
-        verbatim << ' ' * (peek_column - column - width)
+        peek_column ||= column + width
+        indent = peek_column - column - width
+        line << ' ' * indent
       when :TEXT then
-        verbatim << data
-      when *LIST_TOKENS then
-        if column <= margin then
-          unget
-          break
-        end
-
+        line << data
+      else # *LIST_TOKENS
         list_marker = case type
-                      when :BULLET                   then '*'
-                      when :LABEL                    then "[#{data}]"
-                      when :LALPHA, :NUMBER, :UALPHA then "#{data}."
-                      when :NOTE                     then "#{data}::"
+                      when :BULLET then data
+                      when :LABEL  then "[#{data}]"
+                      when :NOTE   then "#{data}::"
+                      else # :LALPHA, :NUMBER, :UALPHA
+                        "#{data}."
                       end
-
-        verbatim << list_marker
-
-        _, data, = get
-
-        verbatim << ' ' * (data - list_marker.length)
-      when :NEWLINE then
-        verbatim << data
-        break unless [:INDENT, :NEWLINE].include? peek_token[0]
-      else
-        unget
-        break
+        line << list_marker
+        peek_type, _, peek_column = peek_token
+        unless peek_type == :NEWLINE then
+          peek_column ||= column + list_marker.length
+          indent = peek_column - column - list_marker.length
+          line << ' ' * indent
+        end
       end
+
     end
 
+    verbatim << line << "\n" unless line.empty?
+    verbatim.parts.each { |p| p.slice!(0, min_indent) unless p == "\n" } if min_indent > 0
     verbatim.normalize
 
     p :verbatim_end => margin if @debug
@@ -313,65 +300,60 @@ class RDoc::Markup::Parser
   end
 
   ##
-  # Parses the tokens into a Document
-
-  def parse indent = 0
+  # Parses the tokens into an array of RDoc::Markup::XXX objects,
+  # and appends them to the passed +parent+ RDoc::Markup::YYY object.
+  #
+  # Exits at the end of the token stream, or when it encounters a token
+  # in a column less than +indent+ (unless it is a NEWLINE).
+  #
+  # Returns +parent+.
+
+  def parse parent, indent = 0
     p :parse_start => indent if @debug
 
-    document = []
-
     until @tokens.empty? do
       type, data, column, = get
 
-      if type != :INDENT and column < indent then
+      if type == :NEWLINE then
+        # trailing newlines are skipped below, so this is a blank line
+        parent << RDoc::Markup::BlankLine.new
+        skip :NEWLINE, false
+        next
+      end
+
+      # indentation change: break or verbatim
+      if column < indent then
         unget
         break
+      elsif column > indent then
+        unget
+        parent << build_verbatim(indent)
+        next
       end
 
+      # indentation is the same
       case type
       when :HEADER then
-        document << build_heading(data)
-      when :INDENT then
-        if indent > data then
-          unget
-          break
-        elsif indent == data then
-          next
-        end
-
-        unget
-        document << build_verbatim(indent)
-      when :NEWLINE then
-        document << RDoc::Markup::BlankLine.new
-        skip :NEWLINE, false
+        parent << build_heading(data)
       when :RULE then
-        document << RDoc::Markup::Rule.new(data)
+        parent << RDoc::Markup::Rule.new(data)
         skip :NEWLINE
       when :TEXT then
         unget
-        document << build_paragraph(indent)
-
-        # we're done with this paragraph (indent mismatch)
-        break if peek_token[0] == :TEXT
+        parent << build_paragraph(indent)
       when *LIST_TOKENS then
         unget
-
-        list = build_list(indent)
-
-        document << list if list
-
-        # we're done with this list (indent mismatch)
-        break if LIST_TOKENS.include? peek_token.first and indent > 0
+        parent << build_list(indent)
       else
         type, data, column, line = @current_token
-        raise ParseError,
-              "Unhandled token #{type} (#{data.inspect}) at #{line}:#{column}"
+        raise ParseError, "Unhandled token #{type} (#{data.inspect}) at #{line}:#{column}"
       end
     end
 
     p :parse_end => indent if @debug
 
-    document
+    parent
+
   end
 
   ##
@@ -384,63 +366,16 @@ class RDoc::Markup::Parser
   end
 
   ##
-  # Skips a token of +token_type+, optionally raising an error.
+  # Skips the next token if its type is +token_type+.
+  #
+  # Optionally raises an error if the next token is not of the expected type.
 
   def skip token_type, error = true
     type, = get
-
     return unless type # end of stream
-
     return @current_token if token_type == type
-
     unget
-
-    raise ParseError, "expected #{token_type} got #{@current_token.inspect}" if
-      error
-  end
-
-  ##
-  # Consumes tokens until NEWLINE and turns them back into text
-
-  def text
-    text = ''
-
-    loop do
-      type, data, = get
-
-      text << case type
-              when :BULLET then
-                _, space, = get # SPACE
-                "*#{' ' * (space - 1)}"
-              when :LABEL then
-                _, space, = get # SPACE
-                "[#{data}]#{' ' * (space - data.length - 2)}"
-              when :LALPHA, :NUMBER, :UALPHA then
-                _, space, = get # SPACE
-                "#{data}.#{' ' * (space - 2)}"
-              when :NOTE then
-                _, space = get # SPACE
-                "#{data}::#{' ' * (space - data.length - 2)}"
-              when :TEXT then
-                data
-              when :NEWLINE then
-                unget
-                break
-              when nil then
-                break
-              else
-                raise ParseError, "unhandled token #{@current_token.inspect}"
-              end
-    end
-
-    text
-  end
-
-  ##
-  # Calculates the column and line of the current token based on +offset+.
-
-  def token_pos offset
-    [offset - @line_pos, @line]
+    raise ParseError, "expected #{token_type} got #{@current_token.inspect}" if error
   end
 
   ##
@@ -455,51 +390,62 @@ class RDoc::Markup::Parser
     until s.eos? do
       pos = s.pos
 
+      # leading spaces will be reflected by the column of the next token
+      # the only thing we lose are trailing spaces at the end of the file
+      next if s.scan(/ +/)
+
+      # note: after BULLET, LABEL, etc.,
+      # indent will be the column of the next non-newline token
+
       @tokens << case
+                 # [CR]LF => :NEWLINE
                  when s.scan(/\r?\n/) then
                    token = [:NEWLINE, s.matched, *token_pos(pos)]
                    @line_pos = s.pos
                    @line += 1
                    token
-                 when s.scan(/ +/) then
-                   [:INDENT, s.matched_size, *token_pos(pos)]
+                 # === text => :HEADER then :TEXT
                  when s.scan(/(=+)\s*/) then
                    level = s[1].length
                    level = 6 if level > 6
                    @tokens << [:HEADER, level, *token_pos(pos)]
-
                    pos = s.pos
                    s.scan(/.*/)
-                   [:TEXT, s.matched, *token_pos(pos)]
-                 when s.scan(/^(-{3,}) *$/) then
+                   [:TEXT, s.matched.sub(/\r$/, ''), *token_pos(pos)]
+                 # --- (at least 3) and nothing else on the line => :RULE
+                 when s.scan(/(-{3,}) *$/) then
                    [:RULE, s[1].length - 2, *token_pos(pos)]
-                 when s.scan(/([*-])\s+/) then
-                   @tokens << [:BULLET, :BULLET, *token_pos(pos)]
-                   [:SPACE, s.matched_size, *token_pos(pos)]
-                 when s.scan(/([a-z]|\d+)\.[ \t]+\S/i) then
+                 # * or - followed by white space and text => :BULLET
+                 when s.scan(/([*-]) +(\S)/) then
+                   s.pos -= s[2].bytesize # unget \S
+                   [:BULLET, s[1], *token_pos(pos)]
+                 # A. text, a. text, 12. text => :UALPHA, :LALPHA, :NUMBER
+                 when s.scan(/([a-z]|\d+)\. +(\S)/i) then
+                   # FIXME if tab(s), the column will be wrong
+                   # either support tabs everywhere by first expanding them to
+                   # spaces, or assume that they will have been replaced
+                   # before (and provide a check for that at least in debug
+                   # mode)
                    list_label = s[1]
-                   width      = s.matched_size - 1
-
-                   s.pos -= 1 # unget \S
-
-                   list_type = case list_label
-                               when /[a-z]/ then :LALPHA
-                               when /[A-Z]/ then :UALPHA
-                               when /\d/    then :NUMBER
-                               else
-                                 raise ParseError, "BUG token #{list_label}"
-                               end
-
-                   @tokens << [list_type, list_label, *token_pos(pos)]
-                   [:SPACE, width, *token_pos(pos)]
+                   s.pos -= s[2].bytesize # unget \S
+                   list_type =
+                     case list_label
+                     when /[a-z]/ then :LALPHA
+                     when /[A-Z]/ then :UALPHA
+                     when /\d/    then :NUMBER
+                     else
+                       raise ParseError, "BUG token #{list_label}"
+                     end
+                   [list_type, list_label, *token_pos(pos)]
+                 # [text] followed by spaces or end of line => :LABEL
                  when s.scan(/\[(.*?)\]( +|$)/) then
-                   @tokens << [:LABEL, s[1], *token_pos(pos)]
-                   [:SPACE, s.matched_size, *token_pos(pos)]
+                   [:LABEL, s[1], *token_pos(pos)]
+                 # text:: followed by spaces or end of line => :NOTE
                  when s.scan(/(.*?)::( +|$)/) then
-                   @tokens << [:NOTE, s[1], *token_pos(pos)]
-                   [:SPACE, s.matched_size, *token_pos(pos)]
+                   [:NOTE, s[1], *token_pos(pos)]
+                 # anything else: :TEXT
                  else s.scan(/.*/)
-                   [:TEXT, s.matched, *token_pos(pos)]
+                   [:TEXT, s.matched.sub(/\r$/, ''), *token_pos(pos)]
                  end
     end
 
@@ -507,9 +453,17 @@ class RDoc::Markup::Parser
   end
 
   ##
-  # Returns the current token or +token+ to the token stream
+  # Calculates the column and line of the current token based on +offset+.
+
+  def token_pos offset
+    [offset - @line_pos, @line]
+  end
+
+  ##
+  # Returns the current token to the token stream
 
-  def unget token = @current_token
+  def unget
+    token = @current_token
     p :unget => token if @debug
     raise Error, 'too many #ungets' if token == @tokens.first
     @tokens.unshift token if token
author	drbrain <drbrain@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>	2010-12-20 03:22:49 +0000
committer	drbrain <drbrain@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>	2010-12-20 03:22:49 +0000
commit	2ef9c50c6e405717d06362787c4549ca4f1c6485 (patch)
tree	ee99486567461dd5796f3d6edcc9e204187f2666 /lib/rdoc/markup/parser.rb
parent	d7effd506f5b91a636f2e6452ef1946b923007c7 (diff)