From f5da3b6746dba18ab86d11aa49caf97b37ecc6ac Mon Sep 17 00:00:00 2001 From: matz Date: Thu, 3 Sep 1998 07:43:53 +0000 Subject: 1.1c4 git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/branches/v1_1r@293 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- regex.c | 238 ++++++++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 142 insertions(+), 96 deletions(-) (limited to 'regex.c') diff --git a/regex.c b/regex.c index 2eefc9da6d..40b22bc3a8 100644 --- a/regex.c +++ b/regex.c @@ -143,7 +143,7 @@ static int group_match_null_string_p (); /* Define the syntax stuff, so we can do the \<, \>, etc. */ /* This must be nonzero for the wordchar and notwordchar pattern - commands in re_match_2. */ + commands in re_match. */ #ifndef Sword #define Sword 1 #endif @@ -153,6 +153,8 @@ static int group_match_null_string_p (); static char re_syntax_table[256]; static void init_syntax_once P((void)); static unsigned char *translate = 0; +static void init_regs P((struct re_registers*, unsigned int)); +static void bm_init_skip P((int *, unsigned char*, int, char *)); #undef P @@ -247,6 +249,7 @@ enum regexpcode begbuf, /* Succeeds if at beginning of buffer (if emacs) or at beginning of string to be matched (if not). */ endbuf, /* Analogously, for end of buffer/string. */ + endbuf2, /* End of buffer/string, or newline just before it. */ jump, /* Followed by two bytes giving relative address to jump to. */ jump_past_alt,/* Same as jump, but marks the end of an alternative. */ on_failure_jump, /* Followed by two bytes giving relative address of @@ -386,7 +389,7 @@ long re_syntax_options = 0; /* Macros for re_compile_pattern, which is found below these definitions. */ #define TRANSLATE_P() ((options&RE_OPTION_IGNORECASE) && translate) -#define TRY_TRANSLATE() ((bufp->options&(RE_OPTION_IGNORECASE|RE_MAY_IGNORECASE)) && translate) +#define MAY_TRANSLATE() ((bufp->options&(RE_OPTION_IGNORECASE|RE_MAY_IGNORECASE)) && translate) /* Fetch the next character in the uncompiled pattern---translating it if necessary. Also cast from a signed character in the constant string passed to us by the user to an unsigned char that we can use @@ -836,6 +839,10 @@ print_partial_compiled_pattern(start, end) printf ("/endbuf"); break; + case endbuf2: + printf ("/endbuf2"); + break; + default: printf ("?%d", *(p-1)); } @@ -908,6 +915,7 @@ calculate_must_string(start, end) case notwordchar: case begbuf: case endbuf: + case endbuf2: case push_dummy_failure: break; @@ -1060,6 +1068,7 @@ re_compile_pattern(pattern, size, bufp) bufp->fastmap_accurate = 0; bufp->must = 0; + bufp->must_skip = 0; /* Initialize the syntax table. */ init_syntax_once(); @@ -1449,41 +1458,49 @@ re_compile_pattern(pattern, size, bufp) case '(': PATFETCH(c); if (c == '?') { + int negative = 0; PATFETCH_RAW(c); switch (c) { - case 'x': case 'X': - case 'i': case 'I': + case 'x': case 'i': case '-': for (;;) { switch (c) { + case '-': + negative = 1; + break; + case ')': + case ':': break; case 'x': - options |= RE_OPTION_EXTENDED; - break; - case 'X': - options &= ~RE_OPTION_EXTENDED; + if (negative) + options &= ~RE_OPTION_EXTENDED; + else + options |= RE_OPTION_EXTENDED; break; case 'i': - if (!(options&RE_OPTION_IGNORECASE)) { + if (negative) { + if (options&RE_OPTION_IGNORECASE) { + options &= ~RE_OPTION_IGNORECASE; + BUFPUSH(casefold_off); + } + } + else if (!(options&RE_OPTION_IGNORECASE)) { options |= RE_OPTION_IGNORECASE; BUFPUSH(casefold_on); } break; - case 'I': - if (options&RE_OPTION_IGNORECASE) { - options &= ~RE_OPTION_IGNORECASE; - BUFPUSH(casefold_off); - } - break; default: FREE_AND_RETURN(stackb, "undefined (?...) inline option"); } - if (c == ')') break; + if (c == ')') { + c = '#'; /* read whole in-line options */ + break; + } + if (c == ':') break; PATFETCH_RAW(c); } - c = '#'; /* read whole in-line options */ break; case '#': @@ -1577,7 +1594,7 @@ re_compile_pattern(pattern, size, bufp) { /* Push a dummy failure point at the end of the alternative for a possible future `finalize_jump' to pop. See comments at - `push_dummy_failure' in `re_match_2'. */ + `push_dummy_failure' in `re_match'. */ BUFPUSH(push_dummy_failure); /* We allocated space for this jump when we assigned @@ -1881,6 +1898,10 @@ re_compile_pattern(pattern, size, bufp) break; case 'Z': + BUFPUSH(endbuf2); + break; + + case 'z': BUFPUSH(endbuf); break; @@ -2025,9 +2046,11 @@ re_compile_pattern(pattern, size, bufp) bufp->used = b - bufp->buffer; bufp->re_nsub = regnum; - if (*bufp->buffer == exactn) { + laststart = bufp->buffer; + if (*laststart == start_memory) laststart += 3; + if (*laststart == exactn) { bufp->options |= RE_OPTIMIZE_EXACTN; - bufp->must = bufp->buffer+1; + bufp->must = laststart+1; } else { bufp->must = calculate_must_string(bufp->buffer, b); @@ -2044,6 +2067,12 @@ re_compile_pattern(pattern, size, bufp) break; } } + if (!(bufp->options & RE_OPTIMIZE_NO_BM)) { + bufp->must_skip = (int *) xmalloc((1 << BYTEWIDTH)*sizeof(int)); + bm_init_skip(bufp->must_skip, bufp->must+1, + (unsigned char)bufp->must[0], + MAY_TRANSLATE()?translate:0); + } } FREE_AND_RETURN(stackb, 0); @@ -2064,6 +2093,15 @@ re_compile_pattern(pattern, size, bufp) FREE_AND_RETURN(stackb, "nested *?+ in regexp"); } +void +re_free_pattern(bufp) + struct re_pattern_buffer *bufp; +{ + free(bufp->buffer); + free(bufp->fastmap); + if (bufp->must_skip) free(bufp->must_skip); + free(bufp); +} /* Store a jump of the form . Store in the location FROM a jump operation to jump to relative @@ -2189,7 +2227,7 @@ insert_op_2(op, there, current_end, num_1, num_2) #define trans_eq(c1, c2, translate) (translate?(translate[c1]==translate[c2]):((c1)==(c2))) static int -must_match(little, lend, big, bend, translate) +slow_match(little, lend, big, bend, translate) unsigned char *little, *lend; unsigned char *big, *bend; unsigned char *translate; @@ -2198,10 +2236,8 @@ must_match(little, lend, big, bend, translate) while (little < lend && big < bend) { c = *little++; - if (c == 0xff) { - if (!trans_eq(*big++, *little++, translate)) break; - continue; - } + if (c == 0xff) + c = *little++; if (!trans_eq(*big++, c, translate)) break; } if (little == lend) return 1; @@ -2209,7 +2245,7 @@ must_match(little, lend, big, bend, translate) } static int -must_instr(little, llen, big, blen, translate) +slow_search(little, llen, big, blen, translate) unsigned char *little; int llen; unsigned char *big; @@ -2253,7 +2289,7 @@ must_instr(little, llen, big, blen, translate) } } - if (must_match(little, little+llen, big, bend, translate)) + if (slow_match(little, little+llen, big, bend, translate)) return big - bsave; if (ismbchar(*big)) big++; @@ -2263,84 +2299,69 @@ must_instr(little, llen, big, blen, translate) } static void -bm_init_skip(skip, pat, m) +bm_init_skip(skip, pat, m, translate) int *skip; unsigned char *pat; int m; + char *translate; { int j, c; for (c=0; c<256; c++) { skip[c] = m; } - for (j=0; j=0; s--) { - j = m; - while (j >= 0 && pat[j-s] == pat[j]) { - j--; - } - if (j > s) { - next[j] = m-j+s; - } - else { - while (j > 0) { - next[j] = m-j+s; - j--; - } - } + else { + for (j=0; j= 0 && translate[big[i]] == translate[little[j]]) { - i--; - j--; - } - if (j < 0) return i+1; - - i += skip[big[i]] > next[j] ? skip[big[i]] : next[j]; - } - return -1; - } + i = llen-1; + if (translate) { while (i < blen) { - j = llen-1; - while (j >= 0 && big[i] == little[j]) { - i--; - j--; - } - if (j < 0) return i+1; + k = i; + j = llen-1; + while (j >= 0 && translate[big[k]] == translate[little[j]]) { + k--; + j--; + } + if (j < 0) return k+1; - i += skip[big[i]] > next[j] ? skip[big[i]] : next[j]; + i += skip[translate[big[i]]]; } return -1; + } + while (i < blen) { + k = i; + j = llen-1; + while (j >= 0 && big[k] == little[j]) { + k--; + j--; + } + if (j < 0) return k+1; + + i += skip[big[i]]; + } + return -1; } /* Given a pattern, compute a fastmap from it. The fastmap records @@ -2402,6 +2423,7 @@ re_compile_fastmap(bufp) case begline: case begbuf: case endbuf: + case endbuf2: case wordbound: case notwordbound: case wordbeg: @@ -2602,18 +2624,20 @@ re_compile_fastmap(bufp) { unsigned short size; unsigned char c, beg; + int byte_match = 0; p += p[-1] + 2; size = EXTRACT_UNSIGNED(&p[-2]); if (size == 0) { - for (j = 0x80; j < (1 << BYTEWIDTH); j++) - if (ismbchar(j)) - fastmap[j] = 1; + for (j = 0x80; j < (1 << BYTEWIDTH); j++) + if (ismbchar(j)) + fastmap[j] = 1; } for (j = 0,c = 0x80;j < (int)size; j++) { if ((unsigned char)p[j*4] == 0xff) { + byte_match = 1; for (beg = (unsigned char)p[j*4+1]; c < beg; c++) - fastmap[c] = 2; + fastmap[c] = 1; c = (unsigned char)p[j*4+3] + 1; } else { @@ -2623,6 +2647,18 @@ re_compile_fastmap(bufp) c = (unsigned char)p[j*4 + 2] + 1; } } + if (byte_match) { + for (j = c; j < (1 << BYTEWIDTH); j++) + fastmap[j] = 1; + for (j = 0; j < (1 << BYTEWIDTH); j++) + if (fastmap[j]) + fastmap[j] = 2; + } + else { + for (j = c; j < (1 << BYTEWIDTH); j++) + if (ismbchar(j)) + fastmap[j] = 1; + } } break; @@ -2708,18 +2744,20 @@ re_search(bufp, string, size, startpos, range, regs) r = 0; } if (bufp->options & RE_OPTIMIZE_NO_BM) { - pos = must_instr(bufp->must+1, len, - string+startpos, size-startpos-r, - TRY_TRANSLATE()?translate:0); + pos = slow_search(bufp->must+1, len, + string+startpos, size-startpos-r, + MAY_TRANSLATE()?translate:0); } else { pos = bm_search(bufp->must+1, len, string+startpos, size-startpos-r, - TRY_TRANSLATE()?translate:0); + bufp->must_skip, + MAY_TRANSLATE()?translate:0); } if (pos == -1) return -1; - if (bufp->options & RE_OPTIMIZE_EXACTN) + if (bufp->options & RE_OPTIMIZE_EXACTN) { startpos += pos; + } } for (;;) @@ -2751,7 +2789,7 @@ re_search(bufp, string, size, startpos, range, regs) break; } else - if (fastmap[TRY_TRANSLATE() ? translate[c] : c]) + if (fastmap[MAY_TRANSLATE() ? translate[c] : c]) break; range--; } @@ -2763,7 +2801,7 @@ re_search(bufp, string, size, startpos, range, regs) c = string[startpos]; c &= 0xff; - if (TRY_TRANSLATE() ? !fastmap[translate[c]] : !fastmap[c]) + if (MAY_TRANSLATE() ? !fastmap[translate[c]] : !fastmap[c]) goto advance; } } @@ -2781,9 +2819,9 @@ re_search(bufp, string, size, startpos, range, regs) return -2; #ifndef NO_ALLOCA -#ifdef cALLOCA +#ifdef C_ALLOCA alloca(0); -#endif /* cALLOCA */ +#endif /* C_ALLOCA */ #endif /* NO_ALLOCA */ if (range > 0) { @@ -2807,7 +2845,7 @@ re_search(bufp, string, size, startpos, range, regs) if (fastmap[c] != 2) break; } else - if (!fastmap[TRY_TRANSLATE() ? translate[c] : c]) + if (!fastmap[MAY_TRANSLATE() ? translate[c] : c]) break; range--; } @@ -2997,7 +3035,7 @@ typedef union static void init_regs(regs, num_regs) struct re_registers *regs; - unsigned num_regs; + unsigned int num_regs; { int i; @@ -3476,6 +3514,12 @@ re_match(bufp, string_arg, size, pos, regs) /* Match at the very end of the data. */ case endbuf: + if (AT_STRINGS_END(d)) + break; + goto fail; + + /* Match at the very end of the data. */ + case endbuf2: if (AT_STRINGS_END(d)) break; /* .. or newline just before the end of the data. */ @@ -4047,6 +4091,7 @@ common_op_match_null_string_p (p, end, reg_info) case endline: case begbuf: case endbuf: + case endbuf2: case wordbeg: case wordend: case wordbound: @@ -4200,6 +4245,7 @@ static const unsigned char mbctab_euc[] = { /* 0xA1-0xFE */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -4231,7 +4277,7 @@ const unsigned char *mbctab = mbctab_ascii; int current_mbctype = MBCTYPE_ASCII; void -mbcinit(mbctype) +re_mbcinit(mbctype) int mbctype; { switch (mbctype) { -- cgit v1.2.3