From c922e508049ebd8a36be0b86d512375c26b19ee7 Mon Sep 17 00:00:00 2001 From: nobu Date: Thu, 17 Sep 2009 04:12:38 +0000 Subject: * lib/rdoc/parser/ruby.rb (RDoc::Parser::Ruby): parse also rdoc files. * doc/re.rdoc: renamed from re.rb. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@24976 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- .document | 3 +- doc/re.rb | 583 ------------------------------------------------ doc/re.rdoc | 583 ++++++++++++++++++++++++++++++++++++++++++++++++ lib/rdoc/parser/ruby.rb | 2 +- 4 files changed, 586 insertions(+), 585 deletions(-) delete mode 100644 doc/re.rb create mode 100644 doc/re.rdoc diff --git a/.document b/.document index b41b4d08b0..746ca2929d 100644 --- a/.document +++ b/.document @@ -15,4 +15,5 @@ lib ext -doc/re.rb +# rdoc files +doc/*.rdoc diff --git a/doc/re.rb b/doc/re.rb deleted file mode 100644 index 9671a7bd0b..0000000000 --- a/doc/re.rb +++ /dev/null @@ -1,583 +0,0 @@ -# -*- coding: utf-8 -*- -# Regular expressions (regexps) are patterns which describe the -# contents of a string. They're used for testing whether a string contains a -# given pattern, or extracting the portions that match. They are created -# with the /pat/ and -# %r{pat} literals or the Regexp.new -# constructor. -# -# A regexp is usually delimited with forward slashes (/). For -# example: -# -# /hay/ =~ 'haystack' #=> 0 -# /y/.match('haystack') #=> # -# -# If a string contains the pattern it is said to match. A literal -# string matches itself. -# -# # 'haystack' does not contain the pattern 'needle', so doesn't match. -# /needle/.match('haystack') #=> nil -# # 'haystack' does contain the pattern 'hay', so it matches -# /hay/.match('haystack') #=> # -# -# Specifically, /st/ requires that the string contains the letter -# _s_ followed by the letter _t_, so it matches _haystack_, also. -# -# == Metacharacters and Escapes -# -# The following are metacharacters (, ), -# [, ], {, }, ., ?, -# +, *. 
They have a specific meaning when appearing in a -# pattern. To match them literally they must be backslash-escaped. To match -# a backslash literally backslash-escape that: \\\\\\. -# -# /1 \+ 2 = 3\?/.match('Does 1 + 2 = 3?') #=> # -# -# Patterns behave like double-quoted strings so can contain the same -# backslash escapes. -# -# /\s\u{6771 4eac 90fd}/.match("Go to 東京都") -# #=> # -# -# Arbitrary Ruby expressions can be embedded into patterns with the -# #{...} construct. -# -# place = "東京都" -# /#{place}/.match("Go to 東京都") -# #=> # -# -# == Character Classes -# -# A character class is delimited with square brackets ([, -# ]) and lists characters that may appear at that point in the -# match. /[ab]/ means _a_ or _b_, as opposed to /ab/ which -# means _a_ followed by _b_. -# -# /W[aeiou]rd/.match("Word") #=> # -# -# Within a character class the hyphen (-) is a metacharacter -# denoting an inclusive range of characters. [abcd] is equivalent -# to [a-d]. A range can be followed by another range, so -# [abcdwxyz] is equivalent to [a-dw-z]. The order in which -# ranges or individual characters appear inside a character class is -# irrelevant. -# -# /[0-9a-f]/.match('9f') #=> # -# /[9f]/.match('9f') #=> # -# -# If the first character of a character class is a caret (^) the -# class is inverted: it matches any character _except_ those named. -# -# /[^a-eg-z]/.match('f') #=> # -# -# A character class may contain another character class. By itself this -# isn't useful because [a-z[0-9]] describes the same set as -# [a-z0-9]. However, character classes also support the && -# operator which performs set intersection on its arguments. The two can be -# combined as follows: -# -# /[a-w&&[^c-g]z]/ # ([a-w] AND ([^c-g] OR z)) -# # This is equivalent to: -# /[abh-w]/ -# -# The following metacharacters also behave like character classes: -# -# * /./ - Any character except a newline. 
-# * /./m - Any character (the +m+ modifier enables multiline mode) -# * /\w/ - A word character ([a-zA-Z0-9_]) -# * /\W/ - A non-word character ([^a-zA-Z0-9_]) -# * /\d/ - A digit character ([0-9]) -# * /\D/ - A non-digit character ([^0-9]) -# * /\h/ - A hexdigit character ([0-9a-fA-F]) -# * /\H/ - A non-hexdigit character ([^0-9a-fA-F]) -# * /\s/ - A whitespace character: /[ \t\r\n\f]/ -# * /\S/ - A non-whitespace character: /[^ \t\r\n\f]/ -# -# POSIX bracket expressions are also similar to character classes. -# They provide a portable alternative to the above, with the added benefit -# that they encompass non-ASCII characters. For instance, /\d/ -# matches only the ASCII decimal digits (0-9); whereas /[[:digit:]]/ -# matches any character in the Unicode _Nd_ category. -# -# * /[[:alnum:]]/ - Alphabetic and numeric character -# * /[[:alpha:]]/ - Alphabetic character -# * /[[:blank:]]/ - Space or tab -# * /[[:cntrl:]]/ - Control character -# * /[[:digit:]]/ - Digit -# * /[[:graph:]]/ - Non-blank character (excludes spaces, control -# characters, and similar) -# * /[[:lower:]]/ - Lowercase alphabetical character -# * /[[:print:]]/ - Like [:graph:], but includes the space character -# * /[[:punct:]]/ - Punctuation character -# * /[[:space:]]/ - Whitespace character ([:blank:], newline, -# carriage return, etc.) -# * /[[:upper:]]/ - Uppercase alphabetical -# * /[[:xdigit:]]/ - Digit allowed in a hexadecimal number (i.e., -# 0-9a-fA-F) -# -# Ruby also supports the following non-POSIX character classes: -# -# * /[[:word:]]/ - A character in one of the following Unicode -# general categories _Letter_, _Mark_, _Number_, -# Connector_Punctuation -# * /[[:ascii:]]/ - A character in the ASCII character set -# -# # U+06F2 is "EXTENDED ARABIC-INDIC DIGIT TWO" -# /[[:digit:]]/.match("\u06F2") #=> # -# /[[:upper:]][[:lower:]]/.match("Hello") #=> # -# /[[:xdigit:]][[:xdigit:]]/.match("A6") #=> # -# -# == Repetition -# -# The constructs described so far match a single character. 
They can be -# followed by a repetition metacharacter to specify how many times they need -# to occur. Such metacharacters are called quantifiers. -# -# * * - Zero or more times -# * + - One or more times -# * ? - Zero or one times (optional) -# * {n} - Exactly n times -# * {n,} - n or more times -# * {,m} - m or less times -# * {n,m} - At least n and -# at most m times -# -# # At least one uppercase character ('H'), at least one lowercase -# # character ('e'), two 'l' characters, then one 'o' -# "Hello".match(/[[:upper:]]+[[:lower:]]+l{2}o/) #=> # -# -# Repetition is greedy by default: as many occurrences as possible -# are matched while still allowing the overall match to succeed. By -# contrast, lazy matching makes the minimal amount of matches -# necessary for overall success. A greedy metacharacter can be made lazy by -# following it with ?. -# -# # Both patterns below match the string. The fist uses a greedy -# # quantifier so '.+' matches ''; the second uses a lazy -# # quantifier so '.+?' matches ''. -# /<.+>/.match("") #=> #"> -# /<.+?>/.match("") #=> #"> -# -# A quantifier followed by + matches possessively: once it -# has matched it does not backtrack. They behave like greedy quantifiers, -# but having matched they refuse to "give up" their match even if this -# jeopardises the overall match. -# -# == Capturing -# -# Parentheses can be used for capturing. The text enclosed by the -# nth group of parentheses can be subsequently referred to -# with n. Within a pattern use the backreference -# \n; outside of the pattern use -# MatchData[n]. -# -# # 'at' is captured by the first group of parentheses, then referred to -# # later with \1 -# /[csh](..) [csh]\1 in/.match("The cat sat in the hat") -# #=> # -# # Regexp#match returns a MatchData object which makes the captured -# # text available with its #[] method. -# /[csh](..) 
[csh]\1 in/.match("The cat sat in the hat")[1] #=> 'at' -# -# Capture groups can be referred to by name when defined with the -# (?<name>) or (?'name') -# constructs. -# -# /\$(?\d+)\.(?\d+)/.match("$3.67") -# => # -# /\$(?\d+)\.(?\d+)/.match("$3.67")[:dollars] #=> "3" -# -# Named groups can be backreferenced with \k<name>, -# where _name_ is the group name. -# -# /(?[aeiou]).\k.\k/.match('ototomy') -# #=> # -# -# *Note*: A regexp can't use named backreferences and numbered -# backreferences simultaneously. -# -# When named capture groups are used with a literal regexp on the left-hand -# side of an expression and the =~ operator, the captured text is -# also assigned to local variables with corresponding names. -# -# /\$(?\d+)\.(?\d+)/ =~ "$3.67" #=> 0 -# dollars #=> "3" -# -# == Grouping -# -# Parentheses also group the terms they enclose, allowing them to be -# quantified as one atomic whole. -# -# # The pattern below matches a vowel followed by 2 word characters: -# # 'aen' -# /[aeiou]\w{2}/.match("Caenorhabditis elegans") #=> # -# # Whereas the following pattern matches a vowel followed by a word -# # character, twice, i.e. [aeiou]\w[aeiou]\w: 'enor'. -# /([aeiou]\w){2}/.match("Caenorhabditis elegans") -# #=> # -# -# The (?:...) construct provides grouping without -# capturing. That is, it combines the terms it contains into an atomic whole -# without creating a backreference. This benefits performance at the slight -# expense of readabilty. -# -# # The group of parentheses captures 'n' and the second 'ti'. The -# # second group is referred to later with the backreference \2 -# /I(n)ves(ti)ga\2ons/.match("Investigations") -# #=> # -# # The first group of parentheses is now made non-capturing with '?:', -# # so it still matches 'n', but doesn't create the backreference. Thus, -# # the backreference \1 now refers to 'ti'. -# /I(?:n)ves(ti)ga\1ons/.match("Investigations") -# #=> # -# -# === Atomic Grouping -# -# Grouping can be made atomic with -# (?>pat). 
This causes the subexpression pat -# to be matched independently of the rest of the expression such that what -# it matches becomes fixed for the remainder of the match, unless the entire -# subexpression must be abandoned and subsequently revisited. In this -# way pat is treated as a non-divisible whole. Atomic grouping is -# typically used to optimise patterns so as to prevent the regular -# expression engine from backtracking needlesly. -# -# # The " in the pattern below matches the first character of -# # the string, then .* matches Quote". This causes the -# # overall match to fail, so the text matched by .* is -# # backtracked by one position, which leaves the final character of the -# # string available to match " -# /".*"/.match('"Quote"') #=> # -# # If .* is grouped atomically, it refuses to backtrack -# # Quote", even though this means that the overall match fails -# /"(?>.*)"/.match('"Quote"') #=> nil -# -# == Subexpression Calls -# -# The \g<name> syntax matches the previous -# subexpression named _name_, which can be a group name or number, again. -# This differs from backreferences in that it re-executes the group rather -# than simply trying to re-match the same text. -# -# # Matches a ( character and assigns it to the paren -# # group, tries to call that the paren sub-expression again -# # but fails, then matches a literal ). -# /\A(?\(\g*\))*\z/ =~ '()' -# -# -# /\A(?\(\g*\))*\z/ =~ '(())' #=> 0 -# # ^1 -# # ^2 -# # ^3 -# # ^4 -# # ^5 -# # ^6 -# # ^7 -# # ^8 -# # ^9 -# # ^10 -# -# 1. Matches at the beginning of the string, i.e. before the first -# character. -# 2. Enters a named capture group called paren -# 3. Matches a literal (, the first character in the string -# 4. Calls the paren group again, i.e. recurses back to the -# second step -# 5. Re-enters the paren group -# 6. Matches a literal (, the second character in the -# string -# 7. Try to call paren a third time, but fail because -# doing so would prevent an overall successful match -# 8. 
Match a literal ), the third character in the string. -# Marks the end of the second recursive call -# 9. Match a literal ), the fourth character in the string -# 10. Match the end of the string -# -# == Alternation -# -# The vertical bar metacharacter (|) combines two expressions into -# a single one that matches either of the expressions. Each expression is an -# alternative. -# -# /\w(and|or)\w/.match("Feliformia") #=> # -# /\w(and|or)\w/.match("furandi") #=> # -# /\w(and|or)\w/.match("dissemblance") #=> nil -# -# == Character Properties -# -# The \p{} construct matches characters with the named property, -# much like POSIX bracket classes. -# -# * /\p{Alnum}/ - Alphabetic and numeric character -# * /\p{Alpha}/ - Alphabetic character -# * /\p{Blank}/ - Space or tab -# * /\p{Cntrl}/ - Control character -# * /\p{Digit}/ - Digit -# * /\p{Graph}/ - Non-blank character (excludes spaces, control -# characters, and similar) -# * /\p{Lower}/ - Lowercase alphabetical character -# * /\p{Print}/ - Like \p{Graph}, but includes the space character -# * /\p{Punct}/ - Punctuation character -# * /\p{Space}/ - Whitespace character ([:blank:], newline, -# carriage return, etc.) 
-# * /\p{Upper}/ - Uppercase alphabetical -# * /\p{XDigit}/ - Digit allowed in a hexadecimal number (i.e., 0-9a-fA-F) -# * /\p{Word}/ - A member of one of the following Unicode general -# category Letter, Mark, Number, -# Connector\_Punctuation -# * /\p{ASCII}/ - A character in the ASCII character set -# * /\p{Any}/ - Any Unicode character (including unassigned -# characters) -# * /\p{Assigned}/ - An assigned character -# -# A Unicode character's General Category value can also be matched -# with \p{Ab} where Ab is the category's -# abbreviation as described below: -# -# * /\p{L}/ - 'Letter' -# * /\p{Ll}/ - 'Letter: Lowercase' -# * /\p{Lm}/ - 'Letter: Mark' -# * /\p{Lo}/ - 'Letter: Other' -# * /\p{Lt}/ - 'Letter: Titlecase' -# * /\p{Lu}/ - 'Letter: Uppercase -# * /\p{Lo}/ - 'Letter: Other' -# * /\p{M}/ - 'Mark' -# * /\p{Mn}/ - 'Mark: Nonspacing' -# * /\p{Mc}/ - 'Mark: Spacing Combining' -# * /\p{Me}/ - 'Mark: Enclosing' -# * /\p{N}/ - 'Number' -# * /\p{Nd}/ - 'Number: Decimal Digit' -# * /\p{Nl}/ - 'Number: Letter' -# * /\p{No}/ - 'Number: Other' -# * /\p{P}/ - 'Punctuation' -# * /\p{Pc}/ - 'Punctuation: Connector' -# * /\p{Pd}/ - 'Punctuation: Dash' -# * /\p{Ps}/ - 'Punctuation: Open' -# * /\p{Pe}/ - 'Punctuation: Close' -# * /\p{Pi}/ - 'Punctuation: Initial Quote' -# * /\p{Pf}/ - 'Punctuation: Final Quote' -# * /\p{Po}/ - 'Punctuation: Other' -# * /\p{S}/ - 'Symbol' -# * /\p{Sm}/ - 'Symbol: Math' -# * /\p{Sc}/ - 'Symbol: Currency' -# * /\p{Sc}/ - 'Symbol: Currency' -# * /\p{Sk}/ - 'Symbol: Modifier' -# * /\p{So}/ - 'Symbol: Other' -# * /\p{Z}/ - 'Separator' -# * /\p{Zs}/ - 'Separator: Space' -# * /\p{Zl}/ - 'Separator: Line' -# * /\p{Zp}/ - 'Separator: Paragraph' -# * /\p{C}/ - 'Other' -# * /\p{Cc}/ - 'Other: Control' -# * /\p{Cf}/ - 'Other: Format' -# * /\p{Cn}/ - 'Other: Not Assigned' -# * /\p{Co}/ - 'Other: Private Use' -# * /\p{Cs}/ - 'Other: Surrogate' -# -# Lastly, \p{} matches a character's Unicode script. 
The -# following scripts are supported: Arabic, Armenian, -# Balinese, Bengali, Bopomofo, Braille, -# Buginese, Buhid, Canadian_Aboriginal, Carian, -# Cham, Cherokee, Common, Coptic, -# Cuneiform, Cypriot, Cyrillic, Deseret, -# Devanagari, Ethiopic, Georgian, Glagolitic, -# Gothic, Greek, Gujarati, Gurmukhi, Han, -# Hangul, Hanunoo, Hebrew, Hiragana, -# Inherited, Kannada, Katakana, Kayah_Li, -# Kharoshthi, Khmer, Lao, Latin, Lepcha, -# Limbu, Linear_B, Lycian, Lydian, -# Malayalam, Mongolian, Myanmar, New_Tai_Lue, -# Nko, Ogham, Ol_Chiki, Old_Italic, -# Old_Persian, Oriya, Osmanya, Phags_Pa, -# Phoenician, Rejang, Runic, Saurashtra, -# Shavian, Sinhala, Sundanese, Syloti_Nagri, -# Syriac, Tagalog, Tagbanwa, Tai_Le, -# Tamil, Telugu, Thaana, Thai, Tibetan, -# Tifinagh, Ugaritic, Vai, and Yi. -# -# # Unicode codepoint U+06E9 is named "ARABIC PLACE OF SAJDAH" and -# # belongs to the Arabic script. -# /\p{Arabic}/.match("\u06E9") #=> # -# -# All character properties can be inverted by prefixing their name with a -# caret (^). -# -# # Letter 'A' is not in the Unicode Ll (Letter; Lowercase) category, so -# # this match succeeds -# /\p{^Ll}/.match("A") #=> # -# -# == Anchors -# -# Anchors are metacharacter that match the zero-width positions between -# characters, anchoring the match to a specific position. -# -# * ^ - Matches beginning of line -# * $ - Matches end of line -# * \A - Matches beginning of string. -# * \Z - Matches end of string. 
If string ends with a newline, -# it matches just before newline -# * \z - Matches end of string -# * \G - Matches point where last match finished -# * \b - Matches word boundaries when outside brackets; backspace -# (0x08) inside brackets -# * \B - Matches non-word boundaries -# * (?=pat) - Positive lookahead assertion: -# ensures that the following characters match pat, but doesn't -# include those characters in the matched text -# * (?!pat) - Negative lookahead assertion: -# ensures that the following characters do not match pat, but -# doesn't include those characters in the matched text -# * (?<=pat) - Positive lookbehind -# assertion: ensures that the preceding characters match pat, but -# doesn't include those characters in the matched text -# * (?pat) - Negative lookbehind -# assertion: ensures that the preceding characters do not match -# pat, but doesn't include those characters in the matched text -# -# # If a pattern isn't anchored it can begin at any point in the string -# /real/.match("surrealist") #=> # -# # Anchoring the pattern to the beginning of the string forces the -# # match to start there. 'real' doesn't occur at the beginning of the -# # string, so now the match fails -# /\Areal/.match("surrealist") #=> nil -# # The match below fails because although 'Demand' contains 'and', the -# pattern does not occur at a word boundary. 
-# /\band/.match("Demand") -# # Whereas in the following example 'and' has been anchored to a -# # non-word boundary so instead of matching the first 'and' it matches -# # from the fourth letter of 'demand' instead -# /\Band.+/.match("Supply and demand curve") #=> # -# # The pattern below uses positive lookahead and positive lookbehind to -# # match text appearing in tags without including the tags in the -# # match -# /(?<=)\w+(?=<\/b>)/.match("Fortune favours the bold") -# #=> # -# -# == Options -# -# The end delimiter for a regexp can be followed by one or more single-letter -# options which control how the pattern can match. -# -# * /pat/i - Ignore case -# * /pat/m - Treat a newline as a character matched by . -# * /pat/x - Ignore whitespace and comments in the pattern -# * /pat/o - Perform #{} interpolation only once -# -# i, m, and x can also be applied on the -# subexpression level with the -# (?on-off) construct, which -# enables options on, and disables options off for the -# expression enclosed by the parentheses. -# -# /a(?i:b)c/.match('aBc') #=> # -# /a(?i:b)c/.match('abc') #=> # -# -# == Free-Spacing Mode and Comments -# -# As mentioned above, the x option enables free-spacing -# mode. Literal white space inside the pattern is ignored, and the -# octothorpe (#) character introduces a comment until the end of -# the line. This allows the components of the pattern to be organised in a -# potentially more readable fashion. -# -# # A contrived pattern to match a number with optional decimal places -# float_pat = /\A -# [[:digit:]]+ # 1 or more digits before the decimal point -# (\. # Decimal point -# [[:digit:]]+ # 1 or more digits after the decimal point -# )? # The decimal point and following digits are optional -# \Z/x -# float_pat.match('3.14') #=> # -# -# *Note*: To match whitespace in an x pattern use an escape such as -# \s or \p{Space}. 
-# -# Comments can be included in a non-x pattern with the -# (?#comment) construct, where comment is -# arbitrary text ignored by the regexp engine. -# -# == Encoding -# -# Regular expressions are assumed to use the source encoding. This can be -# overridden with one of the following modifiers. -# -# * /pat/u - UTF-8 -# * /pat/e - EUC-JP -# * /pat/s - Windows-31J -# * /pat/n - ASCII-8BIT -# -# A regexp can be matched against a string when they either share an -# encoding, or the regexp's encoding is _US-ASCII_ and the string's encoding -# is ASCII-compatible. -# -# If a match between incompatible encodings is attempted an -# Encoding::CompatibilityError exception is raised. -# -# The Regexp#fixed_encoding? predicate indicates whether the regexp -# has a fixed encoding, that is one incompatible with ASCII. A -# regexp's encoding can be explicitly fixed by supplying -# Regexp::FIXEDENCODING as the second argument of -# Regexp.new: -# -# r = Regexp.new("a".force_encoding("iso-8859-1"),Regexp::FIXEDENCODING) -# r =~"a\u3042" -# #=> Encoding::CompatibilityError: incompatible encoding regexp match -# (ISO-8859-1 regexp with UTF-8 string) -# -# == Performance -# -# Certain pathological combinations of constructs can lead to abysmally bad -# performance. -# -# Consider a string of 25 as, a d, 4 as, and a -# c. -# -# s = 'a' * 25 + 'd' 'a' * 4 + 'c' -# #=> "aaaaaaaaaaaaaaaaaaaaaaaaadadadadac" -# -# The following patterns match instantly as you would expect: -# -# /(b|a)/ =~ s #=> 0 -# /(b|a+)/ =~ s #=> 0 -# /(b|a+)*\/ =~ s #=> 0 -# -# However, the following pattern takes appreciably longer: -# -# /(b|a+)*c/ =~ s #=> 32 -# -# This happens because an atom in the regexp is quantified by both an -# immediate + and an enclosing * with nothing to -# differentiate which is in control of any particular character. The -# nondeterminism that results produces super-linear performance. 
(Consult -# Mastering Regular Expressions (3rd ed.), pp 222, by -# Jeffery Friedl, for an in-depth analysis). This particular case -# can be fixed by use of atomic grouping, which prevents the unnecessary -# backtracking: -# -# (start = Time.now) && /(b|a+)*c/ =~ s && (Time.now - start) -# #=> 24.702736882 -# (start = Time.now) && /(?>b|a+)*c/ =~ s && (Time.now - start) -# #=> 0.000166571 -# -# A similar case is typified by the following example, which takes -# approximately 60 seconds to execute for me: -# -# # Match a string of 29 as against a pattern of 29 optional -# # as followed by 29 mandatory as. -# Regexp.new('a?' * 29 + 'a' * 29) =~ 'a' * 29 -# -# The 29 optional as match the string, but this prevents the 29 -# mandatory as that follow from matching. Ruby must then backtrack -# repeatedly so as to satisfy as many of the optional matches as it can -# while still matching the mandatory 29. It is plain to us that none of the -# optional matches can succeed, but this fact unfortunately eludes Ruby. -# -# One approach for improving performance is to anchor the match to the -# beginning of the string, thus significantly reducing the amount of -# backtracking needed. -# -# Regexp.new('\A' 'a?' * 29 + 'a' * 29).match('a' * 29) -# #=> # -# -# -class Regexp; end diff --git a/doc/re.rdoc b/doc/re.rdoc new file mode 100644 index 0000000000..9671a7bd0b --- /dev/null +++ b/doc/re.rdoc @@ -0,0 +1,583 @@ +# -*- coding: utf-8 -*- +# Regular expressions (regexps) are patterns which describe the +# contents of a string. They're used for testing whether a string contains a +# given pattern, or extracting the portions that match. They are created +# with the /pat/ and +# %r{pat} literals or the Regexp.new +# constructor. +# +# A regexp is usually delimited with forward slashes (/). For +# example: +# +# /hay/ =~ 'haystack' #=> 0 +# /y/.match('haystack') #=> # +# +# If a string contains the pattern it is said to match. A literal +# string matches itself. 
+# +# # 'haystack' does not contain the pattern 'needle', so doesn't match. +# /needle/.match('haystack') #=> nil +# # 'haystack' does contain the pattern 'hay', so it matches +# /hay/.match('haystack') #=> # +# +# Specifically, /st/ requires that the string contains the letter +# _s_ followed by the letter _t_, so it matches _haystack_, also. +# +# == Metacharacters and Escapes +# +# The following are metacharacters (, ), +# [, ], {, }, ., ?, +# +, *. They have a specific meaning when appearing in a +# pattern. To match them literally they must be backslash-escaped. To match +# a backslash literally backslash-escape that: \\\\\\. +# +# /1 \+ 2 = 3\?/.match('Does 1 + 2 = 3?') #=> # +# +# Patterns behave like double-quoted strings so can contain the same +# backslash escapes. +# +# /\s\u{6771 4eac 90fd}/.match("Go to 東京都") +# #=> # +# +# Arbitrary Ruby expressions can be embedded into patterns with the +# #{...} construct. +# +# place = "東京都" +# /#{place}/.match("Go to 東京都") +# #=> # +# +# == Character Classes +# +# A character class is delimited with square brackets ([, +# ]) and lists characters that may appear at that point in the +# match. /[ab]/ means _a_ or _b_, as opposed to /ab/ which +# means _a_ followed by _b_. +# +# /W[aeiou]rd/.match("Word") #=> # +# +# Within a character class the hyphen (-) is a metacharacter +# denoting an inclusive range of characters. [abcd] is equivalent +# to [a-d]. A range can be followed by another range, so +# [abcdwxyz] is equivalent to [a-dw-z]. The order in which +# ranges or individual characters appear inside a character class is +# irrelevant. +# +# /[0-9a-f]/.match('9f') #=> # +# /[9f]/.match('9f') #=> # +# +# If the first character of a character class is a caret (^) the +# class is inverted: it matches any character _except_ those named. +# +# /[^a-eg-z]/.match('f') #=> # +# +# A character class may contain another character class. 
By itself this +# isn't useful because [a-z[0-9]] describes the same set as +# [a-z0-9]. However, character classes also support the && +# operator which performs set intersection on its arguments. The two can be +# combined as follows: +# +# /[a-w&&[^c-g]z]/ # ([a-w] AND ([^c-g] OR z)) +# # This is equivalent to: +# /[abh-w]/ +# +# The following metacharacters also behave like character classes: +# +# * /./ - Any character except a newline. +# * /./m - Any character (the +m+ modifier enables multiline mode) +# * /\w/ - A word character ([a-zA-Z0-9_]) +# * /\W/ - A non-word character ([^a-zA-Z0-9_]) +# * /\d/ - A digit character ([0-9]) +# * /\D/ - A non-digit character ([^0-9]) +# * /\h/ - A hexdigit character ([0-9a-fA-F]) +# * /\H/ - A non-hexdigit character ([^0-9a-fA-F]) +# * /\s/ - A whitespace character: /[ \t\r\n\f]/ +# * /\S/ - A non-whitespace character: /[^ \t\r\n\f]/ +# +# POSIX bracket expressions are also similar to character classes. +# They provide a portable alternative to the above, with the added benefit +# that they encompass non-ASCII characters. For instance, /\d/ +# matches only the ASCII decimal digits (0-9); whereas /[[:digit:]]/ +# matches any character in the Unicode _Nd_ category. +# +# * /[[:alnum:]]/ - Alphabetic and numeric character +# * /[[:alpha:]]/ - Alphabetic character +# * /[[:blank:]]/ - Space or tab +# * /[[:cntrl:]]/ - Control character +# * /[[:digit:]]/ - Digit +# * /[[:graph:]]/ - Non-blank character (excludes spaces, control +# characters, and similar) +# * /[[:lower:]]/ - Lowercase alphabetical character +# * /[[:print:]]/ - Like [:graph:], but includes the space character +# * /[[:punct:]]/ - Punctuation character +# * /[[:space:]]/ - Whitespace character ([:blank:], newline, +# carriage return, etc.) 
+# * /[[:upper:]]/ - Uppercase alphabetical +# * /[[:xdigit:]]/ - Digit allowed in a hexadecimal number (i.e., +# 0-9a-fA-F) +# +# Ruby also supports the following non-POSIX character classes: +# +# * /[[:word:]]/ - A character in one of the following Unicode +# general categories _Letter_, _Mark_, _Number_, +# Connector_Punctuation +# * /[[:ascii:]]/ - A character in the ASCII character set +# +# # U+06F2 is "EXTENDED ARABIC-INDIC DIGIT TWO" +# /[[:digit:]]/.match("\u06F2") #=> #<MatchData "\u{06F2}"> +# /[[:upper:]][[:lower:]]/.match("Hello") #=> #<MatchData "He"> +# /[[:xdigit:]][[:xdigit:]]/.match("A6") #=> #<MatchData "A6"> +# +# == Repetition +# +# The constructs described so far match a single character. They can be +# followed by a repetition metacharacter to specify how many times they need +# to occur. Such metacharacters are called quantifiers. +# +# * * - Zero or more times +# * + - One or more times +# * ? - Zero or one times (optional) +# * {n} - Exactly n times +# * {n,} - n or more times +# * {,m} - m or fewer times +# * {n,m} - At least n and +# at most m times +# +# # At least one uppercase character ('H'), at least one lowercase +# # character ('e'), two 'l' characters, then one 'o' +# "Hello".match(/[[:upper:]]+[[:lower:]]+l{2}o/) #=> #<MatchData "Hello"> +# +# Repetition is greedy by default: as many occurrences as possible +# are matched while still allowing the overall match to succeed. By +# contrast, lazy matching makes the minimal number of matches +# necessary for overall success. A greedy metacharacter can be made lazy by +# following it with ?. +# +# # Both patterns below match the string. The first uses a greedy +# # quantifier so '.+' matches '<a><b>'; the second uses a lazy +# # quantifier so '.+?' matches '<a>'. +# /<.+>/.match("<a><b>") #=> #<MatchData "<a><b>"> +# /<.+?>/.match("<a><b>") #=> #<MatchData "<a>"> +# +# A quantifier followed by + matches possessively: once it +# has matched it does not backtrack. They behave like greedy quantifiers, +# but having matched they refuse to "give up" their match even if this +# jeopardises the overall match. 
+# +# == Capturing +# +# Parentheses can be used for capturing. The text enclosed by the +# nth group of parentheses can be subsequently referred to +# with n. Within a pattern use the backreference +# \n; outside of the pattern use +# MatchData[n]. +# +# # 'at' is captured by the first group of parentheses, then referred to +# # later with \1 +# /[csh](..) [csh]\1 in/.match("The cat sat in the hat") +# #=> #<MatchData "cat sat in" 1:"at"> +# # Regexp#match returns a MatchData object which makes the captured +# # text available with its #[] method. +# /[csh](..) [csh]\1 in/.match("The cat sat in the hat")[1] #=> 'at' +# +# Capture groups can be referred to by name when defined with the +# (?<name>) or (?'name') +# constructs. +# +# /\$(?<dollars>\d+)\.(?<cents>\d+)/.match("$3.67") +# #=> #<MatchData "$3.67" dollars:"3" cents:"67"> +# /\$(?<dollars>\d+)\.(?<cents>\d+)/.match("$3.67")[:dollars] #=> "3" +# +# Named groups can be backreferenced with \k<name>, +# where _name_ is the group name. +# +# /(?<vowel>[aeiou]).\k<vowel>.\k<vowel>/.match('ototomy') +# #=> #<MatchData "ototo" vowel:"o"> +# +# *Note*: A regexp can't use named backreferences and numbered +# backreferences simultaneously. +# +# When named capture groups are used with a literal regexp on the left-hand +# side of an expression and the =~ operator, the captured text is +# also assigned to local variables with corresponding names. +# +# /\$(?<dollars>\d+)\.(?<cents>\d+)/ =~ "$3.67" #=> 0 +# dollars #=> "3" +# +# == Grouping +# +# Parentheses also group the terms they enclose, allowing them to be +# quantified as one atomic whole. +# +# # The pattern below matches a vowel followed by 2 word characters: +# # 'aen' +# /[aeiou]\w{2}/.match("Caenorhabditis elegans") #=> #<MatchData "aen"> +# # Whereas the following pattern matches a vowel followed by a word +# # character, twice, i.e. [aeiou]\w[aeiou]\w: 'enor'. +# /([aeiou]\w){2}/.match("Caenorhabditis elegans") +# #=> #<MatchData "enor" 1:"or"> +# +# The (?:...) construct provides grouping without +# capturing. That is, it combines the terms it contains into an atomic whole +# without creating a backreference. 
This benefits performance at the slight +# expense of readability. +# +# # The first group of parentheses captures 'n' and the second 'ti'. The +# # second group is referred to later with the backreference \2 +# /I(n)ves(ti)ga\2ons/.match("Investigations") +# #=> #<MatchData "Investigations" 1:"n" 2:"ti"> +# # The first group of parentheses is now made non-capturing with '?:', +# # so it still matches 'n', but doesn't create the backreference. Thus, +# # the backreference \1 now refers to 'ti'. +# /I(?:n)ves(ti)ga\1ons/.match("Investigations") +# #=> #<MatchData "Investigations" 1:"ti"> +# +# === Atomic Grouping +# +# Grouping can be made atomic with +# (?>pat). This causes the subexpression pat +# to be matched independently of the rest of the expression such that what +# it matches becomes fixed for the remainder of the match, unless the entire +# subexpression must be abandoned and subsequently revisited. In this +# way pat is treated as a non-divisible whole. Atomic grouping is +# typically used to optimise patterns so as to prevent the regular +# expression engine from backtracking needlessly. +# +# # The " in the pattern below matches the first character of +# # the string, then .* matches Quote". This causes the +# # overall match to fail, so the text matched by .* is +# # backtracked by one position, which leaves the final character of the +# # string available to match " +# /".*"/.match('"Quote"') #=> #<MatchData "\"Quote\""> +# # If .* is grouped atomically, it refuses to backtrack +# # Quote", even though this means that the overall match fails +# /"(?>.*)"/.match('"Quote"') #=> nil +# +# == Subexpression Calls +# +# The \g<name> syntax matches the previous +# subexpression named _name_, which can be a group name or number, again. +# This differs from backreferences in that it re-executes the group rather +# than simply trying to re-match the same text. +# +# # Matches a ( character and assigns it to the paren +# # group, tries to call the paren sub-expression again +# # but fails, then matches a literal ). 
+# /\A(?<paren>\(\g<paren>*\))*\z/ =~ '()' +# +# +# /\A(?<paren>\(\g<paren>*\))*\z/ =~ '(())' #=> 0 +# # ^1 +# # ^2 +# # ^3 +# # ^4 +# # ^5 +# # ^6 +# # ^7 +# # ^8 +# # ^9 +# # ^10 +# +# 1. Matches at the beginning of the string, i.e. before the first +# character. +# 2. Enters a named capture group called paren +# 3. Matches a literal (, the first character in the string +# 4. Calls the paren group again, i.e. recurses back to the +# second step +# 5. Re-enters the paren group +# 6. Matches a literal (, the second character in the +# string +# 7. Tries to call paren a third time, but fails because +# doing so would prevent an overall successful match +# 8. Matches a literal ), the third character in the string. +# Marks the end of the second recursive call +# 9. Matches a literal ), the fourth character in the string +# 10. Matches the end of the string +# +# == Alternation +# +# The vertical bar metacharacter (|) combines two expressions into +# a single one that matches either of the expressions. Each expression is an +# alternative. +# +# /\w(and|or)\w/.match("Feliformia") #=> #<MatchData "form" 1:"or"> +# /\w(and|or)\w/.match("furandi") #=> #<MatchData "randi" 1:"and"> +# /\w(and|or)\w/.match("dissemblance") #=> nil +# +# == Character Properties +# +# The \p{} construct matches characters with the named property, +# much like POSIX bracket classes. +# +# * /\p{Alnum}/ - Alphabetic and numeric character +# * /\p{Alpha}/ - Alphabetic character +# * /\p{Blank}/ - Space or tab +# * /\p{Cntrl}/ - Control character +# * /\p{Digit}/ - Digit +# * /\p{Graph}/ - Non-blank character (excludes spaces, control +# characters, and similar) +# * /\p{Lower}/ - Lowercase alphabetical character +# * /\p{Print}/ - Like \p{Graph}, but includes the space character +# * /\p{Punct}/ - Punctuation character +# * /\p{Space}/ - Whitespace character ([:blank:], newline, +# carriage return, etc.)
+# * /\p{Upper}/ - Uppercase alphabetical character +# * /\p{XDigit}/ - Digit allowed in a hexadecimal number (i.e., 0-9a-fA-F) +# * /\p{Word}/ - A member of one of the following Unicode general +# categories: Letter, Mark, Number, +# Connector\_Punctuation +# * /\p{ASCII}/ - A character in the ASCII character set +# * /\p{Any}/ - Any Unicode character (including unassigned +# characters) +# * /\p{Assigned}/ - An assigned character +# +# A Unicode character's General Category value can also be matched +# with \p{Ab} where Ab is the category's +# abbreviation as described below: +# +# * /\p{L}/ - 'Letter' +# * /\p{Ll}/ - 'Letter: Lowercase' +# * /\p{Lm}/ - 'Letter: Modifier' +# * /\p{Lo}/ - 'Letter: Other' +# * /\p{Lt}/ - 'Letter: Titlecase' +# * /\p{Lu}/ - 'Letter: Uppercase' +# * /\p{M}/ - 'Mark' +# * /\p{Mn}/ - 'Mark: Nonspacing' +# * /\p{Mc}/ - 'Mark: Spacing Combining' +# * /\p{Me}/ - 'Mark: Enclosing' +# * /\p{N}/ - 'Number' +# * /\p{Nd}/ - 'Number: Decimal Digit' +# * /\p{Nl}/ - 'Number: Letter' +# * /\p{No}/ - 'Number: Other' +# * /\p{P}/ - 'Punctuation' +# * /\p{Pc}/ - 'Punctuation: Connector' +# * /\p{Pd}/ - 'Punctuation: Dash' +# * /\p{Ps}/ - 'Punctuation: Open' +# * /\p{Pe}/ - 'Punctuation: Close' +# * /\p{Pi}/ - 'Punctuation: Initial Quote' +# * /\p{Pf}/ - 'Punctuation: Final Quote' +# * /\p{Po}/ - 'Punctuation: Other' +# * /\p{S}/ - 'Symbol' +# * /\p{Sm}/ - 'Symbol: Math' +# * /\p{Sc}/ - 'Symbol: Currency' +# * /\p{Sk}/ - 'Symbol: Modifier' +# * /\p{So}/ - 'Symbol: Other' +# * /\p{Z}/ - 'Separator' +# * /\p{Zs}/ - 'Separator: Space' +# * /\p{Zl}/ - 'Separator: Line' +# * /\p{Zp}/ - 'Separator: Paragraph' +# * /\p{C}/ - 'Other' +# * /\p{Cc}/ - 'Other: Control' +# * /\p{Cf}/ - 'Other: Format' +# * /\p{Cn}/ - 'Other: Not Assigned' +# * /\p{Co}/ - 'Other: Private Use' +# * /\p{Cs}/ - 'Other: Surrogate' +# +# Lastly, \p{} matches a character's Unicode script.
The +# following scripts are supported: Arabic, Armenian, +# Balinese, Bengali, Bopomofo, Braille, +# Buginese, Buhid, Canadian_Aboriginal, Carian, +# Cham, Cherokee, Common, Coptic, +# Cuneiform, Cypriot, Cyrillic, Deseret, +# Devanagari, Ethiopic, Georgian, Glagolitic, +# Gothic, Greek, Gujarati, Gurmukhi, Han, +# Hangul, Hanunoo, Hebrew, Hiragana, +# Inherited, Kannada, Katakana, Kayah_Li, +# Kharoshthi, Khmer, Lao, Latin, Lepcha, +# Limbu, Linear_B, Lycian, Lydian, +# Malayalam, Mongolian, Myanmar, New_Tai_Lue, +# Nko, Ogham, Ol_Chiki, Old_Italic, +# Old_Persian, Oriya, Osmanya, Phags_Pa, +# Phoenician, Rejang, Runic, Saurashtra, +# Shavian, Sinhala, Sundanese, Syloti_Nagri, +# Syriac, Tagalog, Tagbanwa, Tai_Le, +# Tamil, Telugu, Thaana, Thai, Tibetan, +# Tifinagh, Ugaritic, Vai, and Yi. +# +# # Unicode codepoint U+06E9 is named "ARABIC PLACE OF SAJDAH" and +# # belongs to the Arabic script. +# /\p{Arabic}/.match("\u06E9") #=> #<MatchData "\u06E9"> +# +# All character properties can be inverted by prefixing their name with a +# caret (^). +# +# # Letter 'A' is not in the Unicode Ll (Letter; Lowercase) category, so +# # this match succeeds +# /\p{^Ll}/.match("A") #=> #<MatchData "A"> +# +# == Anchors +# +# Anchors are metacharacters that match the zero-width positions between +# characters, anchoring the match to a specific position. +# +# * ^ - Matches beginning of line +# * $ - Matches end of line +# * \A - Matches beginning of string. +# * \Z - Matches end of string.
If string ends with a newline, +# it matches just before newline +# * \z - Matches end of string +# * \G - Matches point where last match finished +# * \b - Matches word boundaries when outside brackets; backspace +# (0x08) inside brackets +# * \B - Matches non-word boundaries +# * (?=pat) - Positive lookahead assertion: +# ensures that the following characters match pat, but doesn't +# include those characters in the matched text +# * (?!pat) - Negative lookahead assertion: +# ensures that the following characters do not match pat, but +# doesn't include those characters in the matched text +# * (?<=pat) - Positive lookbehind +# assertion: ensures that the preceding characters match pat, but +# doesn't include those characters in the matched text +# * (?<!pat) - Negative lookbehind +# assertion: ensures that the preceding characters do not match +# pat, but doesn't include those characters in the matched text +# +# # If a pattern isn't anchored it can begin at any point in the string +# /real/.match("surrealist") #=> #<MatchData "real"> +# # Anchoring the pattern to the beginning of the string forces the +# # match to start there. 'real' doesn't occur at the beginning of the +# # string, so now the match fails +# /\Areal/.match("surrealist") #=> nil +# # The match below fails because although 'Demand' contains 'and', the +# # pattern does not occur at a word boundary.
+# /\band/.match("Demand") #=> nil +# # Whereas in the following example 'and' has been anchored to a +# # non-word boundary so instead of matching the first 'and' it matches +# # from the fourth letter of 'demand' +# /\Band.+/.match("Supply and demand curve") #=> #<MatchData "and curve"> +# # The pattern below uses positive lookahead and positive lookbehind to +# # match text appearing in <b></b> tags without including the tags in the +# # match +# /(?<=<b>)\w+(?=<\/b>)/.match("Fortune favours the <b>bold</b>") +# #=> #<MatchData "bold"> +# +# == Options +# +# The end delimiter for a regexp can be followed by one or more single-letter +# options which control how the pattern can match. +# +# * /pat/i - Ignore case +# * /pat/m - Treat a newline as a character matched by . +# * /pat/x - Ignore whitespace and comments in the pattern +# * /pat/o - Perform #{} interpolation only once +# +# i, m, and x can also be applied on the +# subexpression level with the +# (?on-off) construct, which +# enables options on, and disables options off for the +# expression enclosed by the parentheses. +# +# /a(?i:b)c/.match('aBc') #=> #<MatchData "aBc"> +# /a(?i:b)c/.match('abc') #=> #<MatchData "abc"> +# +# == Free-Spacing Mode and Comments +# +# As mentioned above, the x option enables free-spacing +# mode. Literal white space inside the pattern is ignored, and the +# octothorpe (#) character introduces a comment until the end of +# the line. This allows the components of the pattern to be organised in a +# potentially more readable fashion. +# +# # A contrived pattern to match a number with optional decimal places +# float_pat = /\A +# [[:digit:]]+ # 1 or more digits before the decimal point +# (\. # Decimal point +# [[:digit:]]+ # 1 or more digits after the decimal point +# )? # The decimal point and following digits are optional +# \Z/x +# float_pat.match('3.14') #=> #<MatchData "3.14" 1:".14"> +# +# *Note*: To match whitespace in an x pattern use an escape such as +# \s or \p{Space}.
+# +# Comments can be included in a non-x pattern with the +# (?#comment) construct, where comment is +# arbitrary text ignored by the regexp engine. +# +# == Encoding +# +# Regular expressions are assumed to use the source encoding. This can be +# overridden with one of the following modifiers. +# +# * /pat/u - UTF-8 +# * /pat/e - EUC-JP +# * /pat/s - Windows-31J +# * /pat/n - ASCII-8BIT +# +# A regexp can be matched against a string when they either share an +# encoding, or the regexp's encoding is _US-ASCII_ and the string's encoding +# is ASCII-compatible. +# +# If a match between incompatible encodings is attempted an +# Encoding::CompatibilityError exception is raised. +# +# The Regexp#fixed_encoding? predicate indicates whether the regexp +# has a fixed encoding, that is, one incompatible with ASCII. A +# regexp's encoding can be explicitly fixed by supplying +# Regexp::FIXEDENCODING as the second argument of +# Regexp.new: +# +# r = Regexp.new("a".force_encoding("iso-8859-1"), Regexp::FIXEDENCODING) +# r =~ "a\u3042" +# #=> Encoding::CompatibilityError: incompatible encoding regexp match +# (ISO-8859-1 regexp with UTF-8 string) +# +# == Performance +# +# Certain pathological combinations of constructs can lead to abysmally bad +# performance. +# +# Consider a string of 25 as, a d, 4 as, and a +# c. +# +# s = 'a' * 25 + 'd' + 'a' * 4 + 'c' +# #=> "aaaaaaaaaaaaaaaaaaaaaaaaadaaaac" +# +# The following patterns match instantly as you would expect: +# +# /(b|a)/ =~ s #=> 0 +# /(b|a+)/ =~ s #=> 0 +# /(b|a+)*/ =~ s #=> 0 +# +# However, the following pattern takes appreciably longer: +# +# /(b|a+)*c/ =~ s #=> 26 +# +# This happens because an atom in the regexp is quantified by both an +# immediate + and an enclosing * with nothing to +# differentiate which is in control of any particular character. The +# nondeterminism that results produces super-linear performance.
(Consult +# Mastering Regular Expressions (3rd ed.), p. 222, by +# Jeffrey Friedl, for an in-depth analysis). This particular case +# can be fixed by use of atomic grouping, which prevents the unnecessary +# backtracking: +# +# (start = Time.now) && /(b|a+)*c/ =~ s && (Time.now - start) +# #=> 24.702736882 +# (start = Time.now) && /(?>b|a+)*c/ =~ s && (Time.now - start) +# #=> 0.000166571 +# +# A similar case is typified by the following example, which takes +# approximately 60 seconds to execute for me: +# +# # Match a string of 29 as against a pattern of 29 optional +# # as followed by 29 mandatory as. +# Regexp.new('a?' * 29 + 'a' * 29) =~ 'a' * 29 +# +# The 29 optional as match the string, but this prevents the 29 +# mandatory as that follow from matching. Ruby must then backtrack +# repeatedly so as to satisfy as many of the optional matches as it can +# while still matching the mandatory 29. It is plain to us that none of the +# optional matches can succeed, but this fact unfortunately eludes Ruby. +# +# One approach for improving performance is to anchor the match to the +# beginning of the string, thus significantly reducing the amount of +# backtracking needed. +# +# Regexp.new('\A' 'a?' * 29 + 'a' * 29).match('a' * 29) +# #=> #<MatchData "aaaaaaaaaaaaaaaaaaaaaaaaaaaaa"> +# +# +class Regexp; end diff --git a/lib/rdoc/parser/ruby.rb b/lib/rdoc/parser/ruby.rb index cf6c1ad221..fde3964a60 100644 --- a/lib/rdoc/parser/ruby.rb +++ b/lib/rdoc/parser/ruby.rb @@ -1458,7 +1458,7 @@ end class RDoc::Parser::Ruby < RDoc::Parser - parse_files_matching(/\.rbw?$/) + parse_files_matching(/\.(?:rbw?|rdoc)\z/) include RDoc::RubyToken include RDoc::TokenStream -- cgit v1.2.3