diff options
author | matz <matz@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 1998-05-25 09:42:47 +0000 |
---|---|---|
committer | matz <matz@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 1998-05-25 09:42:47 +0000 |
commit | 8b1e7e34403977058d412931e31577df719fd2e4 (patch) | |
tree | 7743656749f91b601f3d025ea357e8a8043424d8 /regex.c | |
parent | 45c61f40b23b14e97428206e1b813eb813526e6f (diff) |
*** empty log message ***
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/branches/v1_1r@224 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'regex.c')
-rw-r--r-- | regex.c | 225 |
1 files changed, 133 insertions, 92 deletions
@@ -134,8 +134,7 @@ static void store_jump_n P((char *, int, char *, unsigned)); static void insert_jump_n P((int, char *, char *, char *, unsigned)); static void insert_op P((int, char *, char *)); static void insert_op_2 P((int, char *, char *, int, int)); -static int memcmp_translate P((unsigned char *, unsigned char *, - int, unsigned char *)); +static int memcmp_translate P((unsigned char *, unsigned char *, int)); /* Define the syntax stuff, so we can do the \<, \>, etc. */ @@ -149,6 +148,7 @@ static int memcmp_translate P((unsigned char *, unsigned char *, static char re_syntax_table[256]; static void init_syntax_once P((void)); +static char *translate = 0; #undef P @@ -185,6 +185,13 @@ init_syntax_once() done = 1; } +void +re_set_casetable(table) + char *table; +{ + translate = table; +} + /* Jim Meyering writes: "... Some ctype macros are valid only for character codes that @@ -287,6 +294,9 @@ enum regexpcode and store it in a memory register. Followed by one byte containing the register number. Register numbers must be in the range 0 through RE_NREGS. */ + start_paren, /* Just a mark for starting(?:). */ + start_casefold, /* Start casefold region. */ + stop_casefold, /* End casefold region. */ start_nowidth, /* Save string point to the stack. */ stop_nowidth, /* Restore string place at the point start_nowidth. */ pop_and_fail, /* Fail after popping nowidth entry from stack. */ @@ -369,6 +379,7 @@ long re_syntax_options = 0; /* Macros for re_compile_pattern, which is found below these definitions. */ +#define TRANSLATE_P() ((options&RE_OPTION_IGNORECASE) && translate) /* Fetch the next character in the uncompiled pattern---translating it if necessary. Also cast from a signed character in the constant string passed to us by the user to an unsigned char that we can use @@ -376,7 +387,7 @@ long re_syntax_options = 0; #define PATFETCH(c) \ do {if (p == pend) goto end_of_pattern; \ c = (unsigned char) *p++; \ - if (translate) c = (unsigned char)translate[c]; \ + if (TRANSLATE_P()) c = (unsigned char)translate[c]; \ } while (0) /* Fetch the next character in the uncompiled pattern, with no @@ -649,6 +660,18 @@ print_partial_compiled_pattern(start, end) printf ("/stop_memory/%d", mcnt); break; + case start_paren: + printf ("/start_paren"); + break; + + case start_casefold: + printf ("/start_casefold"); + break; + + case stop_casefold: + printf ("/stop_casefold"); + break; + case start_nowidth: EXTRACT_NUMBER_AND_INCR (mcnt, p); printf ("/start_nowidth//%d", mcnt); @@ -853,6 +876,9 @@ calculate_must_string(start, end) p++; break; + case start_casefold: + case stop_casefold: + case start_paren: case start_nowidth: case stop_nowidth: case pop_and_fail: @@ -1008,8 +1034,7 @@ re_compile_pattern(pattern, size, bufp) int range = 0; int had_char_class = 0; - /* How to translate the characters in the pattern. */ - char *translate = bufp->translate; + int options = bufp->options; bufp->fastmap_accurate = 0; @@ -1038,37 +1063,21 @@ re_compile_pattern(pattern, size, bufp) char *p1 = p; /* When testing what follows the $, look past the \-constructs that don't consume anything. */ - if (! (re_syntax_options & RE_CONTEXT_INDEP_OPS)) - while (p1 != pend) - { - if (*p1 == '\\' && p1 + 1 != pend - && (p1[1] == 'b' || p1[1] == 'B')) - p1 += 2; - else - break; - } - if (re_syntax_options & RE_TIGHT_VBAR) + + while (p1 != pend) { - if (! (re_syntax_options & RE_CONTEXT_INDEP_OPS) && p1 != pend) - goto normal_char; - /* Make operand of last vbar end before this `$'. */ - if (fixup_jump) - store_jump(fixup_jump, jump, b); - fixup_jump = 0; - BUFPUSH(endline); - break; + if (*p1 == '\\' && p1 + 1 != pend + && (p1[1] == 'b' || p1[1] == 'B')) + p1 += 2; + else + break; } /* $ means succeed if at end of line, but only in special contexts. If validly in the middle of a pattern, it is a normal character. */ if (p1 == pend || *p1 == '\n' - || (re_syntax_options & RE_CONTEXT_INDEP_OPS) - || (re_syntax_options & RE_NO_BK_PARENS - ? *p1 == ')' - : *p1 == '\\' && p1[1] == ')') - || (re_syntax_options & RE_NO_BK_VBAR - ? *p1 == '|' - : *p1 == '\\' && p1[1] == '|')) + || *p1 == ')' + || *p1 == '|') { BUFPUSH(endline); break; @@ -1079,36 +1088,19 @@ re_compile_pattern(pattern, size, bufp) /* ^ means succeed if at beg of line, but only if no preceding pattern. */ - if ((re_syntax_options & RE_CONTEXTUAL_INVALID_OPS) && laststart) + if (laststart) goto invalid_pattern; - if (laststart && p - 2 >= pattern && p[-2] != '\n' - && !(re_syntax_options & RE_CONTEXT_INDEP_OPS)) + if (laststart && p - 2 >= pattern && p[-2] != '\n') goto normal_char; - if (re_syntax_options & RE_TIGHT_VBAR) - { - if (p != pattern + 1 - && ! (re_syntax_options & RE_CONTEXT_INDEP_OPS)) - goto normal_char; - BUFPUSH(begline); - begalt = b; - } - else - { - BUFPUSH(begline); - } + BUFPUSH(begline); break; case '+': case '?': - if (re_syntax_options & RE_LIMITED_OPS) - goto normal_char; case '*': /* If there is no previous pattern, char not special. */ if (!laststart) { - if (re_syntax_options & RE_CONTEXTUAL_INVALID_OPS) - goto invalid_pattern; - else if (! (re_syntax_options & RE_CONTEXT_INDEP_OPS)) - goto normal_char; + goto invalid_pattern; } /* If there is a sequence of repetition chars, collapse it down to just one. */ @@ -1221,9 +1213,7 @@ re_compile_pattern(pattern, size, bufp) if (c == ']') { if (p == p0 + 1) { - /* If this is an empty bracket expression. */ - if ((re_syntax_options & RE_NO_EMPTY_BRACKETS) - && p == pend) + if (p == pend) goto invalid_pattern; } else @@ -1325,10 +1315,6 @@ re_compile_pattern(pattern, size, bufp) if (last > c) goto invalid_pattern; - if ((re_syntax_options & RE_NO_HYPHEN_RANGE_END) - && c == '-' && *p != ']') - goto invalid_pattern; - range = 0; if (last < 1 << BYTEWIDTH && c < 1 << BYTEWIDTH) { for (;last<=c;last++) @@ -1442,11 +1428,31 @@ re_compile_pattern(pattern, size, bufp) if (c == '?') { PATFETCH(c); switch (c) { - case '#': - case 'i': - case 'm': - case 's': case 'x': + case 'i': + for (;;) { + switch (c) { + case ')': + break; + + case 'x': + options |= RE_OPTION_EXTENDED; + break; + case 'i': + options |= RE_OPTION_IGNORECASE; + BUFPUSH(start_casefold); + break; + + default: + FREE_AND_RETURN(stackb, "undefined (?...) inline option"); + } + if (c == ')') break; + PATFETCH(c); + } + c = '#'; /* read whole in-line options */ + break; + + case '#': for (;;) { PATFETCH(c); if (c == ')') break; @@ -1468,7 +1474,7 @@ re_compile_pattern(pattern, size, bufp) c = '('; } if (c == '#') break; - if (stackp+6 >= stacke) { + if (stackp+7 >= stacke) { int *stackx; unsigned int len = stacke - stackb; @@ -1509,11 +1515,14 @@ re_compile_pattern(pattern, size, bufp) break; case ':': + if (b > bufp->buffer && b[-1] != start_paren) + BUFPUSH(start_paren); pending_exact = 0; default: break; } *stackp++ = c; + *stackp++ = options; fixup_jump = 0; laststart = 0; begalt = b; @@ -1521,6 +1530,10 @@ re_compile_pattern(pattern, size, bufp) case ')': if (stackp == stackb) goto unmatched_close; + if ((options ^ stackp[-1]) & RE_OPTION_IGNORECASE) { + BUFPUSH(stop_casefold); + } + options = *--stackp; switch (c = *--stackp) { case '(': if (fixup_jump) @@ -1589,32 +1602,27 @@ re_compile_pattern(pattern, size, bufp) /* If there is no previous pattern, this isn't an interval. */ if (!laststart) { - if (re_syntax_options & RE_CONTEXTUAL_INVALID_OPS) - goto invalid_pattern; - else goto normal_backsl; } /* It also isn't an interval if not preceded by an re matching a single character or subexpression, or if the current type of intervals can't handle back references and the previous thing is a back reference. */ + if (! (*laststart == anychar || *laststart == charset || *laststart == charset_not || *laststart == wordchar || *laststart == notwordchar || *laststart == start_memory + || *laststart == start_paren || (*laststart == exactn && (laststart[1] == 1 || laststart[1] == 2 && ismbchar(laststart[2]))) - || (! (re_syntax_options & RE_NO_BK_REFS) - && *laststart == duplicate))) + || *laststart == duplicate)) { /* Posix extended syntax is handled in previous statement; this is for Posix basic syntax. */ - if (re_syntax_options & RE_INTERVALS) - goto invalid_pattern; - goto normal_backsl; } lower_bound = -1; /* So can see if are set. */ @@ -1875,6 +1883,23 @@ re_compile_pattern(pattern, size, bufp) } break; + case '#': + if (options & RE_OPTION_EXTENDED) + { + while (p != pend) { + PATFETCH(c); + if (c == '\n') break; + } + break; + } + goto normal_char; + + case ' ': + case '\t': + case '\n': + if (options & RE_OPTION_EXTENDED) + break; + default: normal_char: /* Expects the character in `c'. */ c1 = 0; @@ -2160,12 +2185,12 @@ re_compile_fastmap(bufp) register unsigned char *p = pattern; register unsigned char *pend = pattern + size; register int j, k; - unsigned char *translate = (unsigned char *)bufp->translate; unsigned is_a_succeed_n; unsigned char **stackb = RE_TALLOC(NFAILURES, unsigned char*); unsigned char **stackp = stackb; unsigned char **stacke = stackb + NFAILURES; + int options = bufp->options; memset(fastmap, 0, (1 << BYTEWIDTH)); bufp->fastmap_accurate = 1; @@ -2187,12 +2212,12 @@ re_compile_fastmap(bufp) { case exactn: if (p[1] == 0xff) { - if (translate) + if (TRANSLATE_P()) fastmap[translate[p[2]]] = 2; else fastmap[p[2]] = 2; } - else if (translate) + else if (TRANSLATE_P()) fastmap[translate[p[1]]] = 1; else fastmap[p[1]] = 1; @@ -2206,10 +2231,13 @@ re_compile_fastmap(bufp) case wordbeg: case wordend: case pop_and_fail: + case start_paren: + case start_casefold: + case stop_casefold: continue; case endline: - if (translate) + if (TRANSLATE_P()) fastmap[translate['\n']] = 1; else fastmap['\n'] = 1; @@ -2333,7 +2361,7 @@ re_compile_fastmap(bufp) for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) { - if (translate) + if (TRANSLATE_P()) fastmap[translate[j]] = 1; else fastmap[j] = 1; @@ -2453,8 +2481,8 @@ re_search(bufp, string, size, startpos, range, regs) struct re_registers *regs; { register char *fastmap = bufp->fastmap; - register unsigned char *translate = (unsigned char *) bufp->translate; int val, anchor = 0; + int options = bufp->options; /* Check for out-of-range starting position. */ if (startpos < 0 || startpos > size) @@ -2490,7 +2518,7 @@ re_search(bufp, string, size, startpos, range, regs) && bufp->must && !must_instr(bufp->must+1, bufp->must[0], string+startpos, size-startpos, - translate)) { + (TRANSLATE_P())?translate:0)) { return -1; } #endif @@ -2528,7 +2556,7 @@ re_search(bufp, string, size, startpos, range, regs) break; } else - if (fastmap[translate ? translate[c] : c]) + if (fastmap[(TRANSLATE_P()) ? translate[c] : c]) break; range--; } @@ -2540,7 +2568,7 @@ re_search(bufp, string, size, startpos, range, regs) c = string[startpos]; c &= 0xff; - if (translate ? !fastmap[translate[c]] : !fastmap[c]) + if ((TRANSLATE_P()) ? !fastmap[translate[c]] : !fastmap[c]) goto advance; } } @@ -2787,7 +2815,7 @@ re_match(bufp, string_arg, size, pos, regs) register unsigned char *d, *dend; register int mcnt; /* Multipurpose. */ - unsigned char *translate = (unsigned char *) bufp->translate; + int options = bufp->options; unsigned is_a_jump_n = 0; /* Failure point stack. Each place that can handle a failure further @@ -3041,8 +3069,8 @@ re_match(bufp, string_arg, size, pos, regs) /* Compare that many; failure if mismatch, else move past them. */ - if (translate - ? memcmp_translate(d, d2, mcnt, translate) + if ((options & RE_OPTION_IGNORECASE) + ? memcmp_translate(d, d2, mcnt) : memcmp((char *)d, (char *)d2, mcnt)) goto fail; d += mcnt, d2 += mcnt; @@ -3079,9 +3107,7 @@ re_match(bufp, string_arg, size, pos, regs) d += 2; break; } - if ((translate ? translate[*d] : *d) == '\n' - || ((re_syntax_options & RE_DOT_NOT_NULL) - && (translate ? translate[*d] : *d) == '\000')) + if (((TRANSLATE_P()) ? translate[*d] : *d) == '\n') goto fail; SET_REGS_MATCHED; d++; @@ -3102,7 +3128,7 @@ re_match(bufp, string_arg, size, pos, regs) c |= (unsigned char)d[1]; } } - else if (translate) + else if (TRANSLATE_P()) c = (unsigned char)translate[c]; half = not = is_in_list(c, p); @@ -3308,11 +3334,16 @@ re_match(bufp, string_arg, size, pos, regs) continue; case finalize_push_n: - EXTRACT_NUMBER(mcnt, p + 2); - /* Originally, this is how many times we CAN jump. */ + EXTRACT_NUMBER(mcnt, p + 2); + /* Originally, this is how many times we CAN jump. */ if (mcnt) { + int pos, i; + mcnt--; STORE_NUMBER(p + 2, mcnt); + EXTRACT_NUMBER(pos, p); + EXTRACT_NUMBER(i, p+pos+5); + if (i > 0) goto nofinalize; POP_FAILURE_POINT(); EXTRACT_NUMBER_AND_INCR(mcnt, p); PUSH_FAILURE_POINT(p + mcnt, d); @@ -3329,6 +3360,17 @@ re_match(bufp, string_arg, size, pos, regs) case unused: continue; + case start_paren: + continue; + + case start_casefold: + options |= RE_OPTION_IGNORECASE; + continue; + + case stop_casefold: + options &= ~RE_OPTION_IGNORECASE; + continue; + case wordbound: if (AT_WORD_BOUNDARY(d)) break; @@ -3374,7 +3416,7 @@ re_match(bufp, string_arg, size, pos, regs) mcnt = *p++; /* This is written out as an if-else so we don't waste time testing `translate' inside the loop. */ - if (translate) + if (TRANSLATE_P()) { do { @@ -3469,10 +3511,9 @@ re_match(bufp, string_arg, size, pos, regs) static int -memcmp_translate(s1, s2, len, translate) +memcmp_translate(s1, s2, len) unsigned char *s1, *s2; register int len; - unsigned char *translate; { register unsigned char *p1 = s1, *p2 = s2, c; while (len) |