1 files changed, 473 insertions, 0 deletions
diff --git a/trunk/lib/rdoc/markup.rb b/trunk/lib/rdoc/markup.rb
new file mode 100644
index 0000000000..0e1b596255
--- /dev/null
+++ b/trunk/lib/rdoc/markup.rb
@@ -0,0 +1,473 @@
+require 'rdoc'
+
+##
+# RDoc::Markup parses plain text documents and attempts to decompose them into
+# their constituent parts.  Some of these parts are high-level: paragraphs,
+# chunks of verbatim text, list entries and the like.  Other parts happen at
+# the character level: a piece of bold text, a word in code font.  This markup
+# is similar in spirit to that used on WikiWiki webs, where folks create web
+# pages using a simple set of formatting rules.
+#
+# RDoc::Markup itself does no output formatting: this is left to a different
+# set of classes.
+#
+# RDoc::Markup is extendable at runtime: you can add \new markup elements to
+# be recognised in the documents that RDoc::Markup parses.
+#
+# RDoc::Markup is intended to be the basis for a family of tools which share
+# the common requirement that simple, plain-text should be rendered in a
+# variety of different output formats and media.  It is envisaged that
+# RDoc::Markup could be the basis for formatting RDoc style comment blocks,
+# Wiki entries, and online FAQs.
+#
+# = Basic Formatting
+#
+# * RDoc::Markup looks for a document's natural left margin.  This is
+#   used as the initial margin for the document.
+#
+# * Consecutive lines starting at this margin are considered to be a
+#   paragraph.
+#
+# * If a paragraph starts with a "*", "-", or with "<digit>.", then it is
+#   taken to be the start of a list.  The margin is increased to be the first
+#   non-space following the list start flag.  Subsequent lines should be
+#   indented to this \new margin until the list ends.  For example:
+#
+#      * this is a list with three paragraphs in
+#        the first item.  This is the first paragraph.
+#
+#        And this is the second paragraph.
+#
+#        1. This is an indented, numbered list.
+#        2. This is the second item in that list
+#
+#        This is the third conventional paragraph in the
+#        first list item.
+#
+#      * This is the second item in the original list
+#
+# * You can also construct labeled lists, sometimes called description
+#   or definition lists.  Do this by putting the label in square brackets
+#   and indenting the list body:
+#
+#       [cat]  a small furry mammal
+#              that seems to sleep a lot
+#
+#       [ant]  a little insect that is known
+#              to enjoy picnics
+#
+#   A minor variation on labeled lists uses two colons to separate the
+#   label from the list body:
+#
+#       cat::  a small furry mammal
+#              that seems to sleep a lot
+#
+#       ant::  a little insect that is known
+#              to enjoy picnics
+#
+#   This latter style guarantees that the list bodies' left margins are
+#   aligned: think of them as a two column table.
+#
+# * Any line that starts to the right of the current margin is treated
+#   as verbatim text.  This is useful for code listings.  The example of a
+#   list above is also verbatim text.
+#
+# * A line starting with an equals sign (=) is treated as a
+#   heading.  Level one headings have one equals sign, level two headings
+#   have two, and so on.
+#
+# * A line starting with three or more hyphens (at the current indent)
+#   generates a horizontal rule.  The more hyphens, the thicker the rule
+#   (within reason, and if supported by the output device)
+#
+# * You can use markup within text (except verbatim) to change the
+#   appearance of parts of that text.  Out of the box, RDoc::Markup
+#   supports word-based and general markup.
+#
+#   Word-based markup uses flag characters around individual words:
+#
+#   [\*word*]  displays word in a *bold* font
+#   [\_word_]  displays word in an _emphasized_ font
+#   [\+word+]  displays word in a +code+ font
+#
+#   General markup affects text between a start delimiter and an end
+#   delimiter.  Not surprisingly, these delimiters look like HTML markup.
+#
+#   [\<b>text...</b>]    displays word in a *bold* font
+#   [\<em>text...</em>]  displays word in an _emphasized_ font
+#   [\<i>text...</i>]    displays word in an _emphasized_ font
+#   [\<tt>text...</tt>]  displays word in a +code+ font
+#
+#   Unlike conventional Wiki markup, general markup can cross line
+#   boundaries.  You can turn off the interpretation of markup by
+#   preceding the first character with a backslash, so \\\<b>bold
+#   text</b> and \\\*bold* produce \<b>bold text</b> and \*bold*
+#   respectively.
+#
+# * Hyperlinks to the web starting http:, mailto:, ftp:, or www. are
+#   recognized.  An HTTP url that references an external image file is
+#   converted into an inline <IMG..>.  Hyperlinks starting 'link:' are
+#   assumed to refer to local files whose path is relative to the --op
+#   directory.
+#
+#   Hyperlinks can also be of the form <tt>label</tt>[url], in which
+#   case the label is used in the displayed text, and <tt>url</tt> is
+#   used as the target.  If <tt>label</tt> contains multiple words,
+#   put it in braces: <em>{multi word label}[</em>url<em>]</em>.
+#
+# == Synopsis
+#
+# This code converts +input_string+ to HTML.  The conversion takes place in
+# the +convert+ method, so you can use the same RDoc::Markup converter to
+# convert multiple input strings.
+#
+#   require 'rdoc/markup/to_html'
+#   
+#   h = RDoc::Markup::ToHtml.new
+#   
+#   puts h.convert(input_string)
+#
+# You can extend the RDoc::Markup parser to recognise new markup
+# sequences, and to add special processing for text that matches a
+# regular expression.  Here we make WikiWords significant to the parser,
+# and also make the sequences {word} and \<no>text...</no> signify
+# strike-through text.  We then subclass the HTML output class to deal
+# with these:
+#
+#   require 'rdoc/markup'
+#   require 'rdoc/markup/to_html'
+#   
+#   class WikiHtml < RDoc::Markup::ToHtml
+#     def handle_special_WIKIWORD(special)
+#       "<font color=red>" + special.text + "</font>"
+#     end
+#   end
+#   
+#   m = RDoc::Markup.new
+#   m.add_word_pair("{", "}", :STRIKE)
+#   m.add_html("no", :STRIKE)
+#   
+#   m.add_special(/\b([A-Z][a-z]+[A-Z]\w+)/, :WIKIWORD)
+#   
+#   wh = WikiHtml.new
+#   wh.add_tag(:STRIKE, "<strike>", "</strike>")
+#   
+#   puts "<body>#{wh.convert ARGF.read}</body>"
+#
+#--
+# Author::   Dave Thomas,  dave@pragmaticprogrammer.com
+# License::  Ruby license
+
+class RDoc::Markup
+
+  SPACE = ?\s # the space character, compared against single characters of a line
+
+  # List entries look like:
+  #   * text   or  - text     (bullet, SIMPLE_LIST_RE)
+  #   1. text  or  a. text    (numbered or lettered, SIMPLE_LIST_RE)
+  #   [label] text            (labeled, LABEL_LIST_RE)
+  #   label:: text            (note, LABEL_LIST_RE)
+  #
+  # In both, $1 is the marker plus the space after it and $2 the marker alone
+
+  SIMPLE_LIST_RE = /^(
+                (  \*          (?# bullet)
+                  |-           (?# bullet)
+                  |\d+\.       (?# numbered )
+                  |[A-Za-z]\.  (?# alphabetically numbered )
+                )
+                \s+
+              )\S/x
+
+  LABEL_LIST_RE = /^(
+                      (  \[.*?\]    (?# labeled  )
+                        |\S.*::     (?# note     )
+                      )(?:\s+|$)
+                    )/x
+
+  ##
+  # Create a parser with its own AttributeManager for inline markup.  The
+  # parser takes a block of text and uses various heuristics to determine
+  # its structure (paragraphs, lists, and so on); see #convert.
+
+  def initialize
+    @am = RDoc::Markup::AttributeManager.new
+    @output = nil
+  end
+
+  ##
+  # Register +start+ and +stop+ as the delimiters of a word-based markup
+  # sequence (such as *bold*).  Matching entries will generate attributes
+  # that the output formatters can recognize by their +name+.
+
+  def add_word_pair(start, stop, name)
+    @am.add_word_pair(start, stop, name)
+  end
+
+  ##
+  # Add +tag+ to the sequences recognized as general markup, under +name+.
+
+  def add_html(tag, name)
+    @am.add_html(tag, name)
+  end
+
+  ##
+  # Add +pattern+ to the other inline sequences, registering it with the
+  # attribute manager as +name+.  For example, we could add WikiWords using:
+  #
+  #    parser.add_special(/\b([A-Z][a-z]+[A-Z]\w+)/, :WIKIWORD)
+  #
+  # Each wiki word will be presented to the output formatter via the
+  # accept_special method.
+
+  def add_special(pattern, name)
+    @am.add_special(pattern, name)
+  end
+
+  ##
+  # We take a string, split it into lines, work out the type of each line,
+  # and from there deduce groups of lines (for example all lines in a
+  # paragraph).  We then invoke the output formatter using a Visitor to
+  # display the result.
+
+  def convert(str, op)
+    lines = str.split(/\r?\n/).map { |line| Line.new line }
+    @lines = Lines.new lines
+
+    return "" if @lines.empty?
+    @lines.normalize
+    assign_types_to_lines
+    group = group_lines
+    # call the output formatter to handle the result
+    #group.each { |line| p line }
+    group.accept @am, op
+  end
+
+  private
+
+  ##
+  # Look through the text at line indentation.  We flag each line as being
+  # Blank, a paragraph, a list element, or verbatim text.
+
+  def assign_types_to_lines(margin = 0, level = 0)
+    while line = @lines.next
+      if line.blank? then
+        line.stamp :BLANK, level
+        next
+      end
+
+      # if a line contains non-blanks before the margin, then it must belong
+      # to an outer level
+
+      text = line.text
+
+      for i in 0...margin
+        if text[i] != SPACE
+          @lines.unget
+          return
+        end
+      end
+
+      active_line = text[margin..-1]
+
+      # Rules (horizontal lines) look like
+      #
+      #  ---   (three or more hyphens)
+      #
+      # The more hyphens, the thicker the rule
+      #
+
+      if /^(---+)\s*$/ =~ active_line
+        line.stamp :RULE, level, $1.length-2
+        next
+      end
+
+      # Then look for list entries.  First the ones that have to have
+      # text following them (* xxx, - xxx, and dd. xxx)
+
+      if SIMPLE_LIST_RE =~ active_line
+        offset = margin + $1.length
+        prefix = $2
+        prefix_length = prefix.length
+
+        flag = case prefix
+               when "*","-" then :BULLET
+               when /^\d/   then :NUMBER
+               when /^[A-Z]/ then :UPPERALPHA
+               when /^[a-z]/ then :LOWERALPHA
+               else raise "Invalid List Type: #{self.inspect}"
+               end
+
+        line.stamp :LIST, level+1, prefix, flag
+        text[margin, prefix_length] = " " * prefix_length
+        assign_types_to_lines(offset, level + 1)
+        next
+      end
+
+      if LABEL_LIST_RE =~ active_line
+        offset = margin + $1.length
+        prefix = $2
+        prefix_length = prefix.length
+
+        next if handled_labeled_list(line, level, margin, offset, prefix)
+      end
+
+      # Headings look like
+      # = Main heading
+      # == Second level
+      # === Third
+      #
+      # Headings reset the level to 0
+
+      if active_line[0] == ?= and active_line =~ /^(=+)\s*(.*)/
+        prefix_length = $1.length
+        prefix_length = 6 if prefix_length > 6
+        line.stamp :HEADING, 0, prefix_length
+        line.strip_leading(margin + prefix_length)
+        next
+      end
+
+      # If the character's a space, then we have verbatim text,
+      # otherwise
+
+      if active_line[0] == SPACE
+        line.strip_leading(margin) if margin > 0
+        line.stamp :VERBATIM, level
+      else
+        line.stamp :PARAGRAPH, level
+      end
+    end
+  end
+
+  ##
+  # Handle labeled list entries, We have a special case to deal with.
+  # Because the labels can be long, they force the remaining block of text
+  # over the to right:
+  #
+  #   this is a long label that I wrote:: and here is the
+  #                                       block of text with
+  #                                       a silly margin
+  #
+  # So we allow the special case.  If the label is followed by nothing, and
+  # if the following line is indented, then we take the indent of that line
+  # as the new margin.
+  #
+  #   this is a long label that I wrote::
+  #       here is a more reasonably indented block which
+  #       will be attached to the label.
+  #
+
+  def handled_labeled_list(line, level, margin, offset, prefix)
+    prefix_length = prefix.length
+    text = line.text
+    flag = nil
+
+    case prefix
+    when /^\[/ then
+      flag = :LABELED
+      prefix = prefix[1, prefix.length-2]
+    when /:$/ then
+      flag = :NOTE
+      prefix.chop!
+    else
+      raise "Invalid List Type: #{self.inspect}"
+    end
+
+    # body is on the next line
+    if text.length <= offset then
+      original_line = line
+      line = @lines.next
+      return false unless line
+      text = line.text
+
+      for i in 0..margin
+        if text[i] != SPACE
+          @lines.unget
+          return false
+        end
+      end
+
+      i = margin
+      i += 1 while text[i] == SPACE
+
+      if i >= text.length then
+        @lines.unget
+        return false
+      else
+        offset = i
+        prefix_length = 0
+
+        if text[offset..-1] =~ SIMPLE_LIST_RE then
+          @lines.unget
+          line = original_line
+          line.text = ''
+        else
+          @lines.delete original_line
+        end
+      end
+    end
+
+    line.stamp :LIST, level+1, prefix, flag
+    text[margin, prefix_length] = " " * prefix_length
+    assign_types_to_lines(offset, level + 1)
+    return true
+  end
+
+  ##
+  # Return a block consisting of fragments which are paragraphs, list
+  # entries or verbatim text.  We merge consecutive lines of the same type
+  # and level together.  We are also slightly tricky with lists: the lines
+  # following a list introduction look like paragraph lines at the next
+  # level, and we remap them into list entries instead.
+
+  def group_lines
+    @lines.rewind
+
+    in_list = false
+    wanted_type = wanted_level = nil
+
+    block = LineCollection.new
+    group = nil
+
+    while line = @lines.next
+      if line.level == wanted_level and line.type == wanted_type
+        group.add_text(line.text)
+      else
+        group = block.fragment_for(line)
+        block.add(group)
+
+        if line.type == :LIST
+          wanted_type = :PARAGRAPH
+        else
+          wanted_type = line.type
+        end
+
+        wanted_level = line.type == :HEADING ? line.param : line.level
+      end
+    end
+
+    block.normalize
+    block
+  end
+
+  ##
+  # For debugging, return our line contents as text.  Explicitly made public.
+
+  def content
+    @lines.as_text
+  end
+  public :content
+
+  ##
+  # For debugging, return the list of line types.  Explicitly made public.
+
+  def get_line_types
+    @lines.line_types
+  end
+  public :get_line_types
+
+end
+
+require 'rdoc/markup/fragments'
+require 'rdoc/markup/inline'
+require 'rdoc/markup/lines'