From 5196e8609ba576021305710610b071d31777e798 Mon Sep 17 00:00:00 2001 From: matz Date: Fri, 5 Jun 1998 09:54:28 +0000 Subject: regex git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/branches/v1_1r@234 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 8 + lib/date2.rb | 119 +++++++----- lib/tk.rb | 19 +- regex.c | 507 +++++++++++++++++++++++++++++++++++++++++++++++----- sample/ruby-mode.el | 2 +- string.c | 2 +- 6 files changed, 555 insertions(+), 102 deletions(-) diff --git a/ChangeLog b/ChangeLog index 517f389d19..3cf5ccf6ae 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +Wed Jun 3 18:07:54 1998 Yukihiro Matsumoto + + * re.c (reg_raise): check rb_in_compile, not rb_in_eval. + +Mon Jun 1 05:26:06 1998 WATANABE Tetsuya + + * string.c (trnext): casting to signed char* needed. + Tue Jun 2 16:00:12 1998 Yukihiro Matsumoto * ext/socket/socket.c (udp_addrsetup): error check enhanced. diff --git a/lib/date2.rb b/lib/date2.rb index 939e633fe0..50c2ccfbd9 100644 --- a/lib/date2.rb +++ b/lib/date2.rb @@ -1,5 +1,5 @@ # date.rb: Written by Tadayoshi Funaba 1998 -# $Id: date.rb,v 1.3 1998/03/08 09:43:54 tadf Exp $ +# $Id: date.rb,v 1.4 1998/06/01 12:52:33 tadf Exp $ class Date @@ -11,8 +11,8 @@ class Date DAYNAMES = [ 'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday' ] - GREGORY = 2299161 # Oct 15, 1582 - ENGLAND = 2361222 # Sept 14, 1752 + ITALY = 2299161 # Oct 15, 1582 + ENGLAND = 2361222 # Sept 14, 1752 def Date.civil_to_jd(y, m, d, gs = true) if m <= 2 then @@ -20,16 +20,15 @@ class Date m += 12 end a = (y / 100).to_i - b = (a / 4).to_i - c = 2 - a + b - e = (365.25 * (y + 4716)).to_i - f = (30.6001 * (m + 1)).to_i - jd = c + d + e + f - 1524 + b = 2 - a + (a / 4).to_i + jd = (365.25 * (y + 4716)).to_i + + (30.6001 * (m + 1)).to_i + + d + b - 1524 unless (if gs.kind_of? Numeric then jd >= gs else gs end) then - jd -= c + jd -= b end - return jd + jd end def Date.jd_to_civil(jd, gs = true) @@ -37,27 +36,22 @@ class Date (if gs.kind_of? Numeric then jd >= gs else gs end) then a = jd else - w = ((jd - 1867216.25) / 36524.25).to_i - x = (w / 4).to_i - a = jd + 1 + w - x + x = ((jd - 1867216.25) / 36524.25).to_i + a = jd + 1 + x - (x / 4).to_i end b = a + 1524 c = ((b - 122.1) / 365.25).to_i d = (365.25 * c).to_i e = ((b - d) / 30.6001).to_i - f = (30.6001 * e).to_i - day = b - d - f + dom = b - d - (30.6001 * e).to_i if e <= 13 then m = e - 1 + y = c - 4716 else m = e - 13 - end - if m <= 2 then y = c - 4715 - else - y = c - 4716 end - return y, m, day + return y, m, dom end def Date.mjd_to_jd(mjd) @@ -76,22 +70,38 @@ class Date jd - 2440000.5 end - def initialize(jd = 0, gs = GREGORY) - @jd = jd - @gs = gs + def Date.julian_leap? (y) + y % 4 == 0 + end + + def Date.gregorian_leap? (y) + y % 4 == 0 and y % 100 != 0 or y % 400 == 0 + end + + def Date.leap? (y) + Date.gregorian_leap?(y) end - def Date.new3(y = -4712, m = 1, d = 1, gs = GREGORY) + def initialize(jd = 0, gs = ITALY) + @jd, @gs = jd, gs + end + + def Date.exist? (y, m, d, gs = true) jd = Date.civil_to_jd(y, m, d, gs) - y2, m2, d2 = Date.jd_to_civil(jd, gs) - unless y == y2 and m == m2 and d == d2 then - raise ArgumentError, 'invalid date' + if [y, m, d] == Date.jd_to_civil(jd, gs) then + jd + end + end + + def Date.new3(y = -4712, m = 1, d = 1, gs = ITALY) + unless jd = Date.exist?(y, m, d, gs) then + fail ArgumentError, 'invalid date' end Date.new(jd, gs) end - def Date.today(gs = GREGORY) - Date.new3(*(Time.now.to_a[3..5].reverse << gs)) + def Date.today(gs = ITALY) + Date.new(Date.civil_to_jd(*(Time.now.to_a[3..5].reverse << gs)), gs) end def jd @@ -99,48 +109,63 @@ class Date end def mjd - Date.jd_to_mjd(@jd) + def self.mjd; @mjd end + @mjd = Date.jd_to_mjd(@jd) end def tjd - Date.jd_to_tjd(@jd) + def self.tjd; @tjd end + @tjd = Date.jd_to_tjd(@jd) + end + + def civil + def self.year; @year end + def self.mon; @mon end + def self.mday; @mday end + @year, @mon, @mday = Date.jd_to_civil(@jd, @gs) end + private :civil + def year - Date.jd_to_civil(@jd, @gs)[0] + civil + @year end def yday - gs = if @gs.kind_of? Numeric then @jd >= @gs else @gs end - jd = Date.civil_to_jd(year - 1, 12, 31, gs) - @jd - jd + def self.yday; @yday end + ns = if @gs.kind_of? Numeric then @jd >= @gs else @gs end + jd = Date.civil_to_jd(year - 1, 12, 31, ns) + @yday = @jd - jd end def mon - Date.jd_to_civil(@jd, @gs)[1] + civil + @mon end def mday - Date.jd_to_civil(@jd, @gs)[2] + civil + @mday end def wday - k = (@jd + 1) % 7 - k += 7 if k < 0 - k + def self.wday; @wday end + @wday = (@jd + 1) % 7 end def leap? - gs = if @gs.kind_of? Numeric then @jd >= @gs else @gs end - jd = Date.civil_to_jd(year, 2, 28, gs) - Date.jd_to_civil(jd + 1, gs)[1] == 2 + def self.leap?; @leap_p end + ns = if @gs.kind_of? Numeric then @jd >= @gs else @gs end + jd = Date.civil_to_jd(year, 2, 28, ns) + @leap_p = Date.jd_to_civil(jd + 1, ns)[1] == 2 end def + (other) if other.kind_of? Numeric then return Date.new(@jd + other, @gs) end - raise TypeError, 'expected numeric' + fail TypeError, 'expected numeric' end def - (other) @@ -149,7 +174,7 @@ class Date elsif other.kind_of? Date then return @jd - other.jd end - raise TypeError, 'expected numeric or date' + fail TypeError, 'expected numeric or date' end def <=> (other) @@ -158,7 +183,7 @@ class Date elsif other.kind_of? Date then return @jd <=> other.jd end - raise TypeError, 'expected numeric or date' + fail TypeError, 'expected numeric or date' end def downto(min) @@ -188,7 +213,7 @@ class Date end def to_s - format('%04d-%02d-%02d', *Date.jd_to_civil(@jd, @gs)) + format('%04d-%02d-%02d', year, mon, mday) end end diff --git a/lib/tk.rb b/lib/tk.rb index 213fc54de7..3891428ff7 100644 --- a/lib/tk.rb +++ b/lib/tk.rb @@ -910,15 +910,26 @@ class TkWindow, etc. */ @@ -244,6 +249,7 @@ enum regexpcode of string to be matched (if not). */ endbuf, /* Analogously, for end of buffer/string. */ jump, /* Followed by two bytes giving relative address to jump to. */ + jump_past_alt,/* Same as jump, but marks the end of an alternative. */ on_failure_jump, /* Followed by two bytes giving relative address of place to resume at in case of failure. */ finalize_jump, /* Throw away latest failure point and then jump to @@ -262,6 +268,8 @@ enum regexpcode makes this before the first repeat. Also use it as an intermediary kind of jump when compiling an or construct. */ + push_dummy_failure, /* Push a dummy failure point and continue. Used at the end of + alternatives. */ succeed_n, /* Used like on_failure_jump except has to succeed n times; then gets turned into an on_failure_jump. The relative address following it is useless until then. The @@ -431,8 +439,8 @@ long re_syntax_options = 0; if (bufp->buffer == 0) \ goto memory_exhausted; \ b = (b - old_buffer) + bufp->buffer; \ - if (fixup_jump) \ - fixup_jump = (fixup_jump - old_buffer) + bufp->buffer; \ + if (fixup_alt_jump) \ + fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer; \ if (laststart) \ laststart = (laststart - old_buffer) + bufp->buffer; \ begalt = (begalt - old_buffer) + bufp->buffer; \ @@ -652,12 +660,12 @@ print_partial_compiled_pattern(start, end) case start_memory: mcnt = *p++; - printf ("/start_memory/%d", mcnt); + printf ("/start_memory/%d/%d", mcnt, *p++); break; case stop_memory: mcnt = *p++; - printf ("/stop_memory/%d", mcnt); + printf ("/stop_memory/%d/%d", mcnt, *p++); break; case casefold_on: @@ -740,6 +748,10 @@ print_partial_compiled_pattern(start, end) printf ("/dummy_failure_jump//%d", mcnt); break; + case push_dummy_failure: + printf ("/push_dummy_failure"); + break; + case finalize_jump: EXTRACT_NUMBER_AND_INCR (mcnt, p); printf ("/finalize_jump//%d", mcnt); @@ -750,6 +762,11 @@ print_partial_compiled_pattern(start, end) printf ("/maybe_finalize_jump//%d", mcnt); break; + case jump_past_alt: + EXTRACT_NUMBER_AND_INCR (mcnt, p); + printf ("/jump_past_alt//%d", mcnt); + break; + case jump: EXTRACT_NUMBER_AND_INCR (mcnt, p); printf ("/jump//%d", mcnt); @@ -869,6 +886,9 @@ calculate_must_string(start, end) case start_memory: case stop_memory: + p += 2; + break; + case duplicate: p++; break; @@ -891,6 +911,7 @@ calculate_must_string(start, end) case notwordchar: case begbuf: case endbuf: + case push_dummy_failure: break; case charset: @@ -981,7 +1002,7 @@ re_compile_pattern(pattern, size, bufp) the containing expression. Each alternative of an `or', except the last, ends with a forward-jump of this sort. */ - char *fixup_jump = 0; + char *fixup_alt_jump = 0; /* Address of start of the most recently finished expression. This tells postfix * where to find the start of its operand. */ @@ -1017,7 +1038,7 @@ re_compile_pattern(pattern, size, bufp) /* Stack of information saved by \( and restored by \). Five stack elements are pushed by each \(: First, the value of b. - Second, the value of fixup_jump. + Second, the value of fixup_alt_jump. Third, the value of begalt. Fourth, the value of regnum. Fifth, the type of the paren. */ @@ -1487,7 +1508,7 @@ re_compile_pattern(pattern, size, bufp) c = '('; } if (c == '#') break; - if (stackp+7 >= stacke) { + if (stackp+8 >= stacke) { int *stackx; unsigned int len = stacke - stackb; @@ -1502,13 +1523,15 @@ re_compile_pattern(pattern, size, bufp) to push (unless the pattern has RE_NREGS or more ('s). */ /* obsolete: now RE_NREGS is just a default register size. */ *stackp++ = b - bufp->buffer; - *stackp++ = fixup_jump ? fixup_jump - bufp->buffer + 1 : 0; + *stackp++ = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; *stackp++ = begalt - bufp->buffer; switch (c) { case '(': BUFPUSH(start_memory); BUFPUSH(regnum); *stackp++ = regnum++; + *stackp++ = b - bufp->buffer; + BUFPUSH(0); /* too many ()'s to fit in a byte. (max 254) */ if (regnum >= RE_REG_MAX) goto too_big; break; @@ -1534,7 +1557,7 @@ re_compile_pattern(pattern, size, bufp) } *stackp++ = c; *stackp++ = options; - fixup_jump = 0; + fixup_alt_jump = 0; laststart = 0; begalt = b; break; @@ -1548,11 +1571,27 @@ re_compile_pattern(pattern, size, bufp) options = *--stackp; switch (c = *--stackp) { case '(': - if (fixup_jump) - store_jump(fixup_jump, jump, b); - BUFPUSH(stop_memory); - BUFPUSH(stackp[-1]); - stackp--; + case ':': + pending_exact = 0; + if (fixup_alt_jump) + { /* Push a dummy failure point at the end of the + alternative for a possible future + `finalize_jump' to pop. See comments at + `push_dummy_failure' in `re_match_2'. */ + BUFPUSH(push_dummy_failure); + + /* We allocated space for this jump when we assigned + to `fixup_alt_jump', in the `handle_alt' case below. */ + store_jump(fixup_alt_jump, jump, b); + } + if (c == '(') { + char *loc = bufp->buffer + *--stackp; + *loc = regnum - stackp[-1]; + BUFPUSH(stop_memory); + BUFPUSH(stackp[-1]); + BUFPUSH(regnum - stackp[-1]); + stackp--; + } break; case '!': @@ -1570,16 +1609,12 @@ re_compile_pattern(pattern, size, bufp) stackp--; break; - case ':': - if (fixup_jump) - store_jump(fixup_jump, jump, b); - pending_exact = 0; default: break; } begalt = *--stackp + bufp->buffer; stackp--; - fixup_jump = *stackp ? *stackp + bufp->buffer - 1 : 0; + fixup_alt_jump = *stackp ? *stackp + bufp->buffer - 1 : 0; laststart = *--stackp + bufp->buffer; if (c == '!' || c == '=') laststart = b; break; @@ -1591,19 +1626,28 @@ re_compile_pattern(pattern, size, bufp) insert_jump(on_failure_jump, begalt, b + 6, b); pending_exact = 0; b += 3; - /* The alternative before the previous alternative has a - jump after it which gets executed if it gets matched. - Adjust that jump so it will jump to the previous - alternative's analogous jump (put in below, which in - turn will jump to the next (if any) alternative's such - jump, etc.). The last such jump jumps to the correct - final destination. */ - if (fixup_jump) - store_jump(fixup_jump, jump, b); + /* The alternative before this one has a jump after it + which gets executed if it gets matched. Adjust that + jump so it will jump to this alternative's analogous + jump (put in below, which in turn will jump to the next + (if any) alternative's such jump, etc.). The last such + jump jumps to the correct final destination. A picture: + _____ _____ + | | | | + | v | v + a | b | c + + If we are at `b', then fixup_alt_jump right now points to a + three-byte space after `a'. We'll put in the jump, set + fixup_alt_jump to right after `b', and leave behind three + bytes which we'll fill in when we get to after `c'. */ + + if (fixup_alt_jump) + store_jump(fixup_alt_jump, jump_past_alt, b); /* Leave space for a jump after previous alternative---to be filled in later. */ - fixup_jump = b; + fixup_alt_jump = b; b += 3; laststart = 0; @@ -1947,8 +1991,8 @@ re_compile_pattern(pattern, size, bufp) } } - if (fixup_jump) - store_jump(fixup_jump, jump, b); + if (fixup_alt_jump) + store_jump(fixup_alt_jump, jump, b); if (stackp != stackb) FREE_AND_RETURN(stackb, "unmatched ("); @@ -2239,6 +2283,7 @@ re_compile_fastmap(bufp) case wordbeg: case wordend: case pop_and_fail: + case push_dummy_failure: continue; case casefold_on: @@ -2261,6 +2306,7 @@ re_compile_fastmap(bufp) case finalize_jump: case maybe_finalize_jump: case jump: + case jump_past_alt: case dummy_failure_jump: EXTRACT_NUMBER_AND_INCR(j, p); p += j; @@ -2335,7 +2381,7 @@ re_compile_fastmap(bufp) case start_memory: case stop_memory: - p++; + p += 2; continue; case duplicate: @@ -2763,7 +2809,9 @@ typedef union for (this_reg = 0; this_reg < num_regs; this_reg++) \ { \ if (IS_ACTIVE(reg_info[this_reg])) \ - MATCHED_SOMETHING(reg_info[this_reg]) = 1; \ + MATCHED_SOMETHING(reg_info[this_reg]) \ + = EVER_MATCHED_SOMETHING (reg_info[this_reg]) \ + = 1; \ else \ MATCHED_SOMETHING(reg_info[this_reg]) = 0; \ } \ @@ -2831,6 +2879,7 @@ re_match(bufp, string_arg, size, pos, regs) struct re_registers *regs; { register unsigned char *p = (unsigned char *) bufp->buffer; + unsigned char *p1; /* Pointer to beyond end of buffer. */ register unsigned char *pend = p + bufp->used; @@ -2870,6 +2919,14 @@ re_match(bufp, string_arg, size, pos, regs) unsigned char **regstart = RE_TALLOC(num_regs, unsigned char*); unsigned char **regend = RE_TALLOC(num_regs, unsigned char*); + /* If a group that's operated upon by a repetition operator fails to + match anything, then the register for its start will need to be + restored because it will have been set to wherever in the string we + are when we last see its open-group operator. Similarly for a + register's end. */ + unsigned char **old_regstart = RE_TALLOC(num_regs, unsigned char*); + unsigned char **old_regend = RE_TALLOC(num_regs, unsigned char*); + /* The is_active field of reg_info helps us keep track of which (possibly nested) subexpressions we are currently in. The matched_something field of reg_info[reg_num] helps us tell whether or not we have @@ -2906,11 +2963,13 @@ re_match(bufp, string_arg, size, pos, regs) inactive and mark them as not having matched anything or ever failed. */ for (mcnt = 0; mcnt < num_regs; mcnt++) { - regstart[mcnt] = regend[mcnt] - = best_regstart[mcnt] = best_regend[mcnt] = REG_UNSET_VALUE; - reg_info[mcnt].word = 0; - IS_ACTIVE (reg_info[mcnt]) = 0; - MATCHED_SOMETHING (reg_info[mcnt]) = 0; + regstart[mcnt] = regend[mcnt] + = old_regstart[mcnt] = old_regend[mcnt] + = best_regstart[mcnt] = best_regend[mcnt] = REG_UNSET_VALUE; + REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE; + IS_ACTIVE (reg_info[mcnt]) = 0; + MATCHED_SOMETHING (reg_info[mcnt]) = 0; + EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0; } /* Set up pointers to ends of strings. @@ -3016,13 +3075,36 @@ re_match(bufp, string_arg, size, pos, regs) a register number in the next byte. The text matched within the \( and \) is recorded under that number. */ case start_memory: + /* Find out if this group can match the empty string. */ + p1 = p; /* To send to group_match_null_string_p. */ + if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE) + REG_MATCH_NULL_STRING_P (reg_info[*p]) + = group_match_null_string_p (&p1, pend, reg_info); + + /* Save the position in the string where we were the last time + we were at this open-group operator in case the group is + operated upon by a repetition operator, e.g., with `(a*)*b' + against `ab'; then we want to ignore where we are now in + the string in case this attempt to match fails. */ + old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) + ? REG_UNSET (regstart[*p]) ? d : regstart[*p] + : regstart[*p]; regstart[*p] = d; IS_ACTIVE(reg_info[*p]) = 1; MATCHED_SOMETHING(reg_info[*p]) = 0; - p++; + p += 2; continue; case stop_memory: + /* We need to save the string position the last time we were at + this close-group operator in case the group is operated + upon by a repetition operator, e.g., with `((a*)*(b*)*)*' + against `aba'; then we want to ignore where we are now in + the string in case this attempt to match fails. */ + old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) + ? REG_UNSET (regend[*p]) ? d : regend[*p] + : regend[*p]; + regend[*p] = d; IS_ACTIVE(reg_info[*p]) = 0; @@ -3037,6 +3119,7 @@ re_match(bufp, string_arg, size, pos, regs) switch (*p2++) { case jump_n: + case finalize_push_n: is_a_jump_n = 1; case finalize_jump: case maybe_finalize_jump: @@ -3053,14 +3136,42 @@ re_match(bufp, string_arg, size, pos, regs) to an on_failure_jump, exit from the loop by forcing a failure after pushing on the stack the on_failure_jump's jump in the pattern, and d. */ - if (mcnt < 0 && (enum regexpcode) *p2++ == on_failure_jump) + if (mcnt < 0 && (enum regexpcode) *p2++ == on_failure_jump + && (enum regexpcode) p1[3] == start_memory && p1[4] == *p) { + /* If this group ever matched anything, then restore + what its registers were before trying this last + failed match, e.g., with `(a*)*b' against `ab' for + regstart[1], and, e.g., with `((a*)*(b*)*)*' + against `aba' for regend[3]. + + Also restore the registers for inner groups for, + e.g., `((a*)(b*))*' against `aba' (register 3 would + otherwise get trashed). */ + + if (EVER_MATCHED_SOMETHING (reg_info[*p])) + { + unsigned r; + + EVER_MATCHED_SOMETHING (reg_info[*p]) = 0; + + /* Restore this and inner groups' (if any) registers. */ + for (r = *p; r < *p + *(p + 1); r++) + { + regstart[r] = old_regstart[r]; + + /* xx why this test? */ + if ((int) old_regend[r] >= (int) regstart[r]) + regend[r] = old_regend[r]; + } + } + p2++; EXTRACT_NUMBER_AND_INCR(mcnt, p2); PUSH_FAILURE_POINT(p2 + mcnt, d); goto fail; } } - p++; + p += 2; continue; /* \ has been turned into a `duplicate' command which is @@ -3235,13 +3346,26 @@ re_match(bufp, string_arg, size, pos, regs) EXTRACT_NUMBER_AND_INCR(mcnt, p); { register unsigned char *p2 = p; - /* Compare what follows with the beginning of the repeat. - If we can establish that there is nothing that they would - both match, we can change to finalize_jump. */ - while (p2 + 1 != pend - && (*p2 == (unsigned char)stop_memory - || *p2 == (unsigned char)start_memory)) - p2 += 2; /* Skip over reg number. */ + + /* Compare the beginning of the repeat with what in the + pattern follows its end. If we can establish that there + is nothing that they would both match, i.e., that we + would have to backtrack because of (as in, e.g., `a*a') + then we can change to pop_failure_jump, because we'll + never have to backtrack. + + This is not true in the case of alternatives: in + `(a|ab)*' we do need to backtrack to the `ab' alternative + (e.g., if the string was `ab'). But instead of trying to + detect that here, the alternative has put on a dummy + failure point which is what we will end up popping. */ + + /* Skip over open/close-group commands. */ + while (p2 + 2 < pend + && ((enum regexpcode) *p2 == stop_memory + || (enum regexpcode) *p2 == start_memory)) + p2 += 3; /* Skip over args, too. */ + if (p2 == pend) p[-3] = (unsigned char)finalize_jump; else if (*p2 == (unsigned char)exactn @@ -3294,6 +3418,11 @@ re_match(bufp, string_arg, size, pos, regs) p += mcnt; continue; + /* We need this opcode so we can detect where alternatives end + in `group_match_null_string_p' et al. */ + case jump_past_alt: + goto nofinalize; + case dummy_failure_jump: /* Normally, the on_failure_jump pushes a failure point, which then gets popped at finalize_jump. We will end up at @@ -3303,6 +3432,17 @@ re_match(bufp, string_arg, size, pos, regs) PUSH_FAILURE_POINT(0, 0); goto nofinalize; + /* At the end of an alternative, we need to push a dummy failure + point in case we are followed by a `finalize_jump', because + we don't want the failure point for the alternative to be + popped. For example, matching `(a|ab)*' against `aab' + requires that we match the `ab' alternative. */ + case push_dummy_failure: + /* See comments just above at `dummy_failure_jump' about the + two zeroes. */ + PUSH_FAILURE_POINT(0, 0); + break; + /* Have to succeed matching what follows at least n times. Then just handle like an on_failure_jump. */ case succeed_n: @@ -3527,7 +3667,37 @@ re_match(bufp, string_arg, size, pos, regs) regend[this_reg] = *--stackp; regstart[this_reg] = *--stackp; } - } + + if (p < pend) + { + int is_a_jump_n = 0; + unsigned char *p1 = p; + + /* If failed to a backwards jump that's part of a repetition + loop, need to pop this failure point and use the next one. */ + switch ((enum regexpcode) *p1) + { + case jump_n: + case finalize_push_n: + is_a_jump_n = 1; + case maybe_finalize_jump: + case finalize_jump: + case finalize_push: + case jump: + p1 = p + 1; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + p1 += mcnt; + + if ((is_a_jump_n && (enum regexpcode) *p1 == succeed_n) + || (!is_a_jump_n + && (enum regexpcode) *p1 == on_failure_jump)) + goto fail; + break; + default: + /* do nothing */ ; + } + } + } else break; /* Matching at this starting point really fails. */ } @@ -3539,6 +3709,245 @@ re_match(bufp, string_arg, size, pos, regs) } +/* We are passed P pointing to a register number after a start_memory. + + Return true if the pattern up to the corresponding stop_memory can + match the empty string, and false otherwise. + + If we find the matching stop_memory, sets P to point to one past its number. + Otherwise, sets P to an undefined byte less than or equal to END. + + We don't handle duplicates properly (yet). */ + +static int +group_match_null_string_p (p, end, reg_info) + unsigned char **p, *end; + register_info_type *reg_info; +{ + int mcnt; + /* Point to after the args to the start_memory. */ + unsigned char *p1 = *p + 2; + + while (p1 < end) + { + /* Skip over opcodes that can match nothing, and return true or + false, as appropriate, when we get to one that can't, or to the + matching stop_memory. */ + + switch ((enum regexpcode) *p1) + { + /* Could be either a loop or a series of alternatives. */ + case on_failure_jump: + p1++; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + + /* If the next operation is not a jump backwards in the + pattern. */ + + if (mcnt >= 0) + { + /* Go through the on_failure_jumps of the alternatives, + seeing if any of the alternatives cannot match nothing. + The last alternative starts with only a jump, + whereas the rest start with on_failure_jump and end + with a jump, e.g., here is the pattern for `a|b|c': + + /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6 + /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3 + /exactn/1/c + + So, we have to first go through the first (n-1) + alternatives and then deal with the last one separately. */ + + + /* Deal with the first (n-1) alternatives, which start + with an on_failure_jump (see above) that jumps to right + past a jump_past_alt. */ + + while ((enum regexpcode) p1[mcnt-3] == jump_past_alt) + { + /* `mcnt' holds how many bytes long the alternative + is, including the ending `jump_past_alt' and + its number. */ + + if (!alt_match_null_string_p (p1, p1 + mcnt - 3, + reg_info)) + return 0; + + /* Move to right after this alternative, including the + jump_past_alt. */ + p1 += mcnt; + + /* Break if it's the beginning of an n-th alternative + that doesn't begin with an on_failure_jump. */ + if ((enum regexpcode) *p1 != on_failure_jump) + break; + + /* Still have to check that it's not an n-th + alternative that starts with an on_failure_jump. */ + p1++; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + if ((enum regexpcode) p1[mcnt-3] != jump_past_alt) + { + /* Get to the beginning of the n-th alternative. */ + p1 -= 3; + break; + } + } + + /* Deal with the last alternative: go back and get number + of the `jump_past_alt' just before it. `mcnt' contains + the length of the alternative. */ + EXTRACT_NUMBER (mcnt, p1 - 2); + + if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info)) + return 0; + + p1 += mcnt; /* Get past the n-th alternative. */ + } /* if mcnt > 0 */ + break; + + + case stop_memory: + *p = p1 + 2; + return 1; + + + default: + if (!common_op_match_null_string_p (&p1, end, reg_info)) + return 0; + } + } /* while p1 < end */ + + return 0; +} /* group_match_null_string_p */ + + +/* Similar to group_match_null_string_p, but doesn't deal with alternatives: + It expects P to be the first byte of a single alternative and END one + byte past the last. The alternative can contain groups. */ + +static int +alt_match_null_string_p (p, end, reg_info) + unsigned char *p, *end; + register_info_type *reg_info; +{ + int mcnt; + unsigned char *p1 = p; + + while (p1 < end) + { + /* Skip over opcodes that can match nothing, and break when we get + to one that can't. */ + + switch ((enum regexpcode) *p1) + { + /* It's a loop. */ + case on_failure_jump: + p1++; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + p1 += mcnt; + break; + + default: + if (!common_op_match_null_string_p (&p1, end, reg_info)) + return 0; + } + } /* while p1 < end */ + + return 1; +} /* alt_match_null_string_p */ + + +/* Deals with the ops common to group_match_null_string_p and + alt_match_null_string_p. + + Sets P to one after the op and its arguments, if any. */ + +static int +common_op_match_null_string_p (p, end, reg_info) + unsigned char **p, *end; + register_info_type *reg_info; +{ + int mcnt; + int ret; + int reg_no; + unsigned char *p1 = *p; + + switch ((enum regexpcode) *p1++) + { + case unused: + case begline: + case endline: + case begbuf: + case endbuf: + case wordbeg: + case wordend: + case wordbound: + case notwordbound: +#ifdef emacs + case before_dot: + case at_dot: + case after_dot: +#endif + break; + + case start_memory: + reg_no = *p1; + ret = group_match_null_string_p (&p1, end, reg_info); + + /* Have to set this here in case we're checking a group which + contains a group and a back reference to it. */ + + if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE) + REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret; + + if (!ret) + return 0; + break; + + /* If this is an optimized succeed_n for zero times, make the jump. */ + case jump: + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + if (mcnt >= 0) + p1 += mcnt; + else + return 0; + break; + + case succeed_n: + /* Get to the number of times to succeed. */ + p1 += 2; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + + if (mcnt == 0) + { + p1 -= 4; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + p1 += mcnt; + } + else + return 0; + break; + + case duplicate: + if (!REG_MATCH_NULL_STRING_P (reg_info[*p1])) + return 0; + break; + + case set_number_at: + p1 += 4; + + default: + /* All other opcodes mean we cannot match the empty string. */ + return 0; + } + + *p = p1; + return 1; +} /* common_op_match_null_string_p */ + + static int memcmp_translate(s1, s2, len) unsigned char *s1, *s2; diff --git a/sample/ruby-mode.el b/sample/ruby-mode.el index c2410f1fe6..7b9549612c 100644 --- a/sample/ruby-mode.el +++ b/sample/ruby-mode.el @@ -660,7 +660,7 @@ An end of a defun is found by moving forward from the beginning of one." '("\\(^\\|[^_]\\)\\b\\([A-Z]+[a-zA-Z0-9_]*\\)" 2 font-lock-type-face) ;; functions - '("^\\s *def[ \t]+[^ \t(]*$" + '("^\\s *def[ \t]+[^ \t(]*" 0 font-lock-function-name-face t)) "*Additional expressions to highlight in ruby mode.") (if (and (>= (string-to-int emacs-version) 19) diff --git a/string.c b/string.c index 5def049e35..5229236f3b 100644 --- a/string.c +++ b/string.c @@ -1708,7 +1708,7 @@ trnext(t) for (;;) { if (!t->gen) { if (t->p == t->pend) return -1; - t->now = *t->p++; + t->now = *(USTR)t->p++; if (t->p < t->pend && *t->p == '-') { t->p++; if (t->p < t->pend) { -- cgit v1.2.3