From d349889e770a2078c247d9d28070e86a54b856f4 Mon Sep 17 00:00:00 2001 From: Yukihiro Matsumoto Date: Wed, 15 Mar 1995 14:59:18 +0900 Subject: version 0.69 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit https://cache.ruby-lang.org/pub/ruby/1.0/ruby-0.69.tar.gz Wed Mar 15 14:59:18 1995 Yukihiro Matsumoto (matz@ix-02) * version 0.69 * eval.c(method_missing): unknownから名称変更. * eval.c(single_method_added): 特異メソッドが定義された時に呼ばれ るメソッド.hookとして使える.実際に定義される直前に呼ばれる. Tue Mar 14 14:46:44 1995 Yukihiro Matsumoto (matz@ix-02) * ruby.c(proc_options): 引数の解析を自分でやることにより引数指定の 方法がperlに近付いた.getopt_longはもう使わない. * dir.c(glob): `{}'のネストを許すようにした. Mon Mar 13 17:56:25 1995 Yukihiro Matsumoto (matz@ix-02) * glob.c: Glob(ワイルドカードオブジェクト)はなくなった.ワイルドカー ドの展開はDir.glob(文字列)を使う.ワイルドカードのマッチは正規表 現で代用. Fri Mar 10 18:35:46 1995 Yukihiro Matsumoto (matz@ix-02) * eval.c: Mathのようなモジュールは自分自身でextendする. * eval.c: クラスやモジュールを定義した既に同名のものがあれば追加定 義となるように.ただし.superクラスの違いなどはチェックする. * regex.c: debug. * math.c: 定数PIとEを定義. Thu Mar 9 21:35:12 1995 Yukihiro Matsumoto (matz@ix-02) * regex.c: EUC,SJISモードでは0x80以上の8進,16進リテラルを禁止. * regex.c: クラス内でも数値リテラル・文字クラスが使えるようした. Wed Mar 8 17:39:05 1995 Yukihiro Matsumoto (matz@ix-02) * regex.c: \200など括弧の数以上の表現は8進リテラルと解釈する.ただ し,\1から\9までは例外. * regex.c: \9以上のリファレンスも有効にした. Tue Mar 7 14:26:01 1995 Yukihiro Matsumoto (matz@ix-02) * eval.c(public/private): スコープ制御メソッドの名称変更.静的なア クセスも出来るようにしてみたが,不採用. Mon Mar 6 19:34:32 1995 Yukihiro Matsumoto (matz@ix-02) * eval.c(inlcude): メソッド化.動的にモジュールをインクルードでき るように.さらに任意のオブジェクトにもモジュールをインクルードで きるメソッド `extend'も用意した. * parse.y: 文法からincludeを削除.メソッド化. Tue Feb 28 15:35:10 1995 Yukihiro Matsumoto (matz@ix-02) * parse.y: 配列,連想配列の最後に`,'をおけるように. --- regex.c | 1329 ++++++++++++++++++++++----------------------------------------- 1 file changed, 453 insertions(+), 876 deletions(-) (limited to 'regex.c') diff --git a/regex.c b/regex.c index bfd604f99c..1a6d9fa97f 100644 --- a/regex.c +++ b/regex.c @@ -25,27 +25,15 @@ On the other hand, if you compile with both -Dtest and -Dcanned you can run some tests we've already thought of. */ -#include "config.h" - /* We write fatal error messages on standard error. */ #include /* isalpha(3) etc. are used for the character classes. */ #include +#include -#ifdef emacs - -/* The `emacs' switch turns on certain special matching commands - that make sense only in emacs. */ - -#include "lisp.h" -#include "buffer.h" -#include "syntax.h" - -#else /* not emacs */ - +#include "config.h" #include "defines.h" -#include #ifdef __STDC__ #define P(s) s @@ -74,6 +62,9 @@ char *alloca (); #endif #endif /* __GNUC__ */ +#define RE_ALLOCATE alloca +#define FREE_VARIABLES() alloca (0) + #define FREE_AND_RETURN_VOID(stackb) return #define FREE_AND_RETURN(stackb,val) return(val) #define DOUBLE_STACK(stackx,stackb,len) \ @@ -82,12 +73,38 @@ char *alloca (); /* Only copy what is in use. */ \ (unsigned char **) memcpy (stackx, stackb, len * sizeof (char *))) #else /* NO_ALLOCA defined */ + +#define RE_ALLOCATE malloc + +#define FREE_VAR(var) if (var) free (var); var = NULL +#define FREE_VARIABLES() \ + do { \ + FREE_VAR (regstart); \ + FREE_VAR (regend); \ + FREE_VAR (best_regstart); \ + FREE_VAR (best_regend); \ + FREE_VAR (reg_info); \ + } while (0) + #define FREE_AND_RETURN_VOID(stackb) free(stackb);return #define FREE_AND_RETURN(stackb,val) free(stackb);return(val) #define DOUBLE_STACK(stackx,stackb,len) \ (unsigned char **)xrealloc (stackb, 2 * len * sizeof (unsigned char *)) #endif /* NO_ALLOCA */ +#define RE_TALLOC(n,t) ((t*)RE_ALLOCATE((n)*sizeof(t))) +#define TMALLOC(n,t) ((t*)xmalloc((n)*sizeof(t))) +#define TREALLOC(s,n,t) (s=((t*)xrealloc(s,(n)*sizeof(t)))) + +/* Get the interface, including the syntax bits. */ +#include "regex.h" + +static int re_search_2 P((struct re_pattern_buffer *, char *, int, + char *, int, int, int, + struct re_registers *)); +static int re_match_2 P((struct re_pattern_buffer *, char *, int, + char *, int, int, struct re_registers *)); + static void store_jump P((char *, int, char *)); static void insert_jump P((int, char *, char *, char *)); static void store_jump_n P((char *, int, char *, unsigned)); @@ -95,7 +112,6 @@ static void insert_jump_n P((int, char *, char *, char *, unsigned)); static void insert_op_2 P((int, char *, char *, int, int )); static int memcmp_translate P((unsigned char *, unsigned char *, int, unsigned char *)); -long re_set_syntax P((long)); /* Define the syntax stuff, so we can do the \<, \>, etc. */ @@ -107,16 +123,12 @@ long re_set_syntax P((long)); #define SYNTAX(c) re_syntax_table[c] - -#ifdef SYNTAX_TABLE - -char *re_syntax_table; - -#else /* not SYNTAX_TABLE */ - static char re_syntax_table[256]; static void init_syntax_once P((void)); +#undef P + +#include "util.h" static void init_syntax_once () @@ -147,20 +159,11 @@ init_syntax_once () done = 1; } -#endif /* SYNTAX_TABLE */ -#undef P -#endif /* emacs */ - - /* Sequents are missing isgraph. */ #ifndef isgraph #define isgraph(c) (isprint((c)) && !isspace((c))) #endif -/* Get the interface, including the syntax bits. */ -#include "regex.h" - - /* These are the command codes that appear in compiled regular expressions, one per byte. Some command codes are followed by argument bytes. A command code can specify any interpretation @@ -226,22 +229,10 @@ enum regexpcode duplicate, /* Match a duplicate of something remembered. Followed by one byte containing the index of the memory register. */ - before_dot, /* Succeeds if before point. */ - at_dot, /* Succeeds if at point. */ - after_dot, /* Succeeds if after point. */ - begbuf, /* Succeeds if at beginning of buffer. */ - endbuf, /* Succeeds if at end of buffer. */ wordchar, /* Matches any word-constituent character. */ notwordchar, /* Matches any char that is not a word-constituent. */ - wordbeg, /* Succeeds if at word beginning. */ - wordend, /* Succeeds if at word end. */ wordbound, /* Succeeds if at a word boundary. */ notwordbound,/* Succeeds if not at a word boundary. */ - syntaxspec, /* Matches any character whose syntax is specified. - followed by a byte which contains a syntax code, - e.g., Sword. */ - notsyntaxspec /* Matches any character whose syntax differs from - that specified. */ }; @@ -301,39 +292,20 @@ re_set_syntax (syntax) { long ret; - ret = obscure_syntax; - obscure_syntax = syntax; + ret = re_syntax_options; + re_syntax_options = syntax; return ret; } /* Set by re_set_syntax to the current regexp syntax to recognize. */ -#ifdef EUC -#define DEFAULT_MBCTYPE RE_MBCTYPE_EUC -#else -#ifdef SJIS -#define DEFAULT_MBCTYPE RE_MBCTYPE_SJIS -#else -#define DEFAULT_MBCTYPE 0 -#endif -#endif -long obscure_syntax = DEFAULT_MBCTYPE; +long re_syntax_options = DEFAULT_MBCTYPE; /* Macros for re_compile_pattern, which is found below these definitions. */ #define CHAR_CLASS_MAX_LENGTH 6 - -/* Fetch the next character in the uncompiled pattern, translating it if - necessary. */ +/* Fetch the next character in the uncompiled pattern. */ #define PATFETCH(c) \ - {if (p == pend) goto end_of_pattern; \ - c = * (unsigned char *) p++; \ - if (translate && !ismbchar (c)) \ - c = (unsigned char) translate[(unsigned char) c]; } - -/* Fetch the next character in the uncompiled pattern, with no - translation. */ -#define PATFETCH_RAW(c) \ {if (p == pend) goto end_of_pattern; \ c = * (unsigned char *) p++; } @@ -436,20 +408,13 @@ long obscure_syntax = DEFAULT_MBCTYPE; }; */ static void -#ifdef __STDC__ -set_list_bits (unsigned short c1, unsigned short c2, - unsigned char *b, const unsigned char *translate) -#else -set_list_bits (c1, c2, b, translate) - unsigned short c1, c2; - unsigned char *b; - const unsigned char *translate; -#endif +set_list_bits (c1, c2, b) + unsigned short c1, c2; + unsigned char *b; { - enum regexpcode op = (enum regexpcode) b[-2]; unsigned char sbc_size = b[-1]; unsigned short mbc_size = EXTRACT_UNSIGNED (&b[sbc_size]); - unsigned short c, beg, end, upb; + unsigned short beg, end, upb; if (c1 > c2) return; @@ -464,15 +429,9 @@ set_list_bits (c1, c2, b, translate) memset (&b[sbc_size], 0, (1 << BYTEWIDTH) / BYTEWIDTH - sbc_size); b[-1] = sbc_size = (1 << BYTEWIDTH) / BYTEWIDTH; } - if (!translate) { - for (; c1 <= upb; c1++) - if (!ismbchar (c1)) - SET_LIST_BIT (c1); - } - else - for (; c1 <= upb; c1++) + for (; c1 <= upb; c1++) if (!ismbchar (c1)) - SET_LIST_BIT (translate[c1]); + SET_LIST_BIT (c1); if (c2 < 1 << BYTEWIDTH) return; c1 = 0x8000; /* The first wide char */ @@ -513,41 +472,37 @@ set_list_bits (c1, c2, b, translate) } static int -#ifdef __STDC__ -is_in_list (unsigned short c, const unsigned char *b) -#else is_in_list (c, b) - unsigned short c; - const unsigned char *b; -#endif + unsigned short c; + const unsigned char *b; { - unsigned short size; - int in = (enum regexpcode) b[-1] == charset_not; + unsigned short size; - size = *b++; - if (c < 1 << BYTEWIDTH) { - if (c / BYTEWIDTH < size && b[c / BYTEWIDTH] & 1 << c % BYTEWIDTH) - in = !in; - } - else { - unsigned short i, j; + size = *b++; + if (c < 1 << BYTEWIDTH) { + if (c / BYTEWIDTH < size && b[c / BYTEWIDTH] & 1 << c % BYTEWIDTH) { + return 1; + } + } + else { + unsigned short i, j; - b += size + 2; - size = EXTRACT_UNSIGNED (&b[-2]); + b += size + 2; + size = EXTRACT_UNSIGNED (&b[-2]); - for (i = 0, j = size; i < j; ) { - unsigned short k = (i + j) >> 1; + for (i = 0, j = size; i < j; ) { + unsigned short k = (i + j) >> 1; - if (c > EXTRACT_MBC (&b[k*4 + 2])) - i = k + 1; - else - j = k; + if (c > EXTRACT_MBC (&b[k*4 + 2])) + i = k + 1; + else + j = k; + } + if (i < size && EXTRACT_MBC (&b[i*4]) <= c + && ((unsigned char) c != '\n' && (unsigned char) c != '\0')) + return 1; } - if (i < size && EXTRACT_MBC (&b[i*4]) <= c - && ((unsigned char) c != '\n' && (unsigned char) c != '\0')) - in = !in; - } - return in; + return 0; } /* re_compile_pattern takes a regular-expression string @@ -571,88 +526,85 @@ re_compile_pattern (pattern, size, bufp) size_t size; struct re_pattern_buffer *bufp; { - register char *b = bufp->buffer; - register char *p = pattern; - char *pend = pattern + size; - register unsigned c, c1; - char *p0; - unsigned char *translate = (unsigned char *) bufp->translate; - - /* Address of the count-byte of the most recently inserted `exactn' - command. This makes it possible to tell whether a new exact-match - character can be added to that command or requires a new `exactn' - command. */ - - char *pending_exact = 0; - - /* Address of the place where a forward-jump should go to the end of - the containing expression. Each alternative of an `or', except the - last, ends with a forward-jump of this sort. */ - - char *fixup_jump = 0; - - /* Address of start of the most recently finished expression. - This tells postfix * where to find the start of its operand. */ - - char *laststart = 0; - - /* In processing a repeat, 1 means zero matches is allowed. */ - - char zero_times_ok; - - /* In processing a repeat, 1 means many matches is allowed. */ - - char many_times_ok; - - /* Address of beginning of regexp, or inside of last \(. */ - - char *begalt = b; - - /* In processing an interval, at least this many matches must be made. */ - int lower_bound; - - /* In processing an interval, at most this many matches can be made. */ - int upper_bound; - - /* Place in pattern (i.e., the {) to which to go back if the interval - is invalid. */ - char *beg_interval = 0; - - /* Stack of information saved by \( and restored by \). - Four stack elements are pushed by each \(: + register char *b = bufp->buffer; + register char *p = pattern; + char *pend = pattern + size; + register unsigned c, c1; + char *p0; + int numlen; + + /* Address of the count-byte of the most recently inserted `exactn' + command. This makes it possible to tell whether a new exact-match + character can be added to that command or requires a new `exactn' + command. */ + + char *pending_exact = 0; + + /* Address of the place where a forward-jump should go to the end of + the containing expression. Each alternative of an `or', except the + last, ends with a forward-jump of this sort. */ + + char *fixup_jump = 0; + + /* Address of start of the most recently finished expression. + This tells postfix * where to find the start of its operand. */ + + char *laststart = 0; + + /* In processing a repeat, 1 means zero matches is allowed. */ + + char zero_times_ok; + + /* In processing a repeat, 1 means many matches is allowed. */ + + char many_times_ok; + + /* Address of beginning of regexp, or inside of last \(. */ + + char *begalt = b; + + /* In processing an interval, at least this many matches must be made. */ + int lower_bound; + + /* In processing an interval, at most this many matches can be made. */ + int upper_bound; + + /* Place in pattern (i.e., the {) to which to go back if the interval + is invalid. */ + char *beg_interval = 0; + + /* Stack of information saved by \( and restored by \). + Four stack elements are pushed by each \(: First, the value of b. Second, the value of fixup_jump. Third, the value of regnum. Fourth, the value of begalt. */ + + int stackb[40]; + int *stackp = stackb; + int *stacke = stackb + 40; + int *stackt; + + /* Counts \('s as they are encountered. Remembered for the matching \), + where it becomes the register number to put in the stop_memory + command. */ + + int regnum = 1; + int range = 0; - int stackb[40]; - int *stackp = stackb; - int *stacke = stackb + 40; - int *stackt; - - /* Counts \('s as they are encountered. Remembered for the matching \), - where it becomes the register number to put in the stop_memory - command. */ - - int regnum = 1; - - bufp->fastmap_accurate = 0; - -#ifndef emacs -#ifndef SYNTAX_TABLE - /* Initialize the syntax table. */ - init_syntax_once(); -#endif -#endif - - if (bufp->allocated == 0) - { - bufp->allocated = INIT_BUF_SIZE; - if (bufp->buffer) - /* EXTEND_BUFFER loses when bufp->allocated is 0. */ - bufp->buffer = (char *) xrealloc (bufp->buffer, INIT_BUF_SIZE); - else - /* Caller did not allocate a buffer. Do it for them. */ + bufp->fastmap_accurate = 0; + + /* Initialize the syntax table. */ + init_syntax_once(); + + if (bufp->allocated == 0) + { + bufp->allocated = INIT_BUF_SIZE; + if (bufp->buffer) + /* EXTEND_BUFFER loses when bufp->allocated is 0. */ + bufp->buffer = (char *) xrealloc (bufp->buffer, INIT_BUF_SIZE); + else + /* Caller did not allocate a buffer. Do it for them. */ bufp->buffer = (char *) xmalloc (INIT_BUF_SIZE); if (!bufp->buffer) goto memory_exhausted; begalt = b = bufp->buffer; @@ -669,23 +621,18 @@ re_compile_pattern (pattern, size, bufp) char *p1 = p; /* When testing what follows the $, look past the \-constructs that don't consume anything. */ - if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS)) + if (! (re_syntax_options & RE_CONTEXT_INDEP_OPS)) while (p1 != pend) { if (*p1 == '\\' && p1 + 1 != pend - && (p1[1] == '<' || p1[1] == '>' - || p1[1] == '`' || p1[1] == '\'' -#ifdef emacs - || p1[1] == '=' -#endif - || p1[1] == 'b' || p1[1] == 'B')) + && (p1[1] == 'b' || p1[1] == 'B')) p1 += 2; else break; } - if (obscure_syntax & RE_TIGHT_VBAR) + if (re_syntax_options & RE_TIGHT_VBAR) { - if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS) && p1 != pend) + if (! (re_syntax_options & RE_CONTEXT_INDEP_OPS) && p1 != pend) goto normal_char; /* Make operand of last vbar end before this `$'. */ if (fixup_jump) @@ -697,14 +644,14 @@ re_compile_pattern (pattern, size, bufp) /* $ means succeed if at end of line, but only in special contexts. If validly in the middle of a pattern, it is a normal character. */ - if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS) && p1 != pend) + if ((re_syntax_options & RE_CONTEXTUAL_INVALID_OPS) && p1 != pend) goto invalid_pattern; if (p1 == pend || *p1 == '\n' - || (obscure_syntax & RE_CONTEXT_INDEP_OPS) - || (obscure_syntax & RE_NO_BK_PARENS + || (re_syntax_options & RE_CONTEXT_INDEP_OPS) + || (re_syntax_options & RE_NO_BK_PARENS ? *p1 == ')' : *p1 == '\\' && p1[1] == ')') - || (obscure_syntax & RE_NO_BK_VBAR + || (re_syntax_options & RE_NO_BK_VBAR ? *p1 == '|' : *p1 == '\\' && p1[1] == '|')) { @@ -717,15 +664,15 @@ re_compile_pattern (pattern, size, bufp) /* ^ means succeed if at beg of line, but only if no preceding pattern. */ - if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS) && laststart) + if ((re_syntax_options & RE_CONTEXTUAL_INVALID_OPS) && laststart) goto invalid_pattern; if (laststart && p - 2 >= pattern && p[-2] != '\n' - && !(obscure_syntax & RE_CONTEXT_INDEP_OPS)) + && !(re_syntax_options & RE_CONTEXT_INDEP_OPS)) goto normal_char; - if (obscure_syntax & RE_TIGHT_VBAR) + if (re_syntax_options & RE_TIGHT_VBAR) { if (p != pattern + 1 - && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS)) + && ! (re_syntax_options & RE_CONTEXT_INDEP_OPS)) goto normal_char; BUFPUSH (begline); begalt = b; @@ -736,17 +683,17 @@ re_compile_pattern (pattern, size, bufp) case '+': case '?': - if ((obscure_syntax & RE_BK_PLUS_QM) - || (obscure_syntax & RE_LIMITED_OPS)) + if ((re_syntax_options & RE_BK_PLUS_QM) + || (re_syntax_options & RE_LIMITED_OPS)) goto normal_char; handle_plus: case '*': /* If there is no previous pattern, char not special. */ if (!laststart) { - if (obscure_syntax & RE_CONTEXTUAL_INVALID_OPS) + if (re_syntax_options & RE_CONTEXTUAL_INVALID_OPS) goto invalid_pattern; - else if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS)) + else if (! (re_syntax_options & RE_CONTEXT_INDEP_OPS)) goto normal_char; } /* If there is a sequence of repetition chars, @@ -762,10 +709,10 @@ re_compile_pattern (pattern, size, bufp) PATFETCH (c); if (c == '*') ; - else if (!(obscure_syntax & RE_BK_PLUS_QM) + else if (!(re_syntax_options & RE_BK_PLUS_QM) && (c == '+' || c == '?')) ; - else if ((obscure_syntax & RE_BK_PLUS_QM) + else if ((re_syntax_options & RE_BK_PLUS_QM) && c == '\\') { /* int c1; */ @@ -846,7 +793,7 @@ re_compile_pattern (pattern, size, bufp) /* Clear the whole map */ memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH + 2); - if ((obscure_syntax & RE_HAT_NOT_NEWLINE) && b[-2] == charset_not) + if ((re_syntax_options & RE_HAT_NOT_NEWLINE) && b[-2] == charset_not) SET_LIST_BIT ('\n'); @@ -854,6 +801,7 @@ re_compile_pattern (pattern, size, bufp) while (1) { int size; + unsigned last = -1; if ((size = EXTRACT_UNSIGNED (&b[(1 << BYTEWIDTH) / BYTEWIDTH]))) { /* Ensure the space is enough to hold another interval @@ -862,150 +810,131 @@ re_compile_pattern (pattern, size, bufp) while (b + size + 1 > bufp->buffer + bufp->allocated) EXTEND_BUFFER; } - /* Don't translate while fetching, in case it's a range bound. - When we set the bit for the character, we translate it. */ - PATFETCH_RAW (c); - - /* If set, \ escapes characters when inside [...]. */ - if ((obscure_syntax & RE_AWK_CLASS_HACK) && c == '\\') - { - PATFETCH(c1); - if (ismbchar (c1)) { - unsigned char c2; + range_retry: + PATFETCH (c); - PATFETCH_RAW (c2); - c1 = c1 << 8 | c2; - set_list_bits (c1, c1, (unsigned char *) b, translate); - continue; - } - SET_LIST_BIT (c1); - continue; - } - if (c == ']') - { - if (p == p0 + 1) - { + if (c == ']') { + if (p == p0 + 1) { /* If this is an empty bracket expression. */ - if ((obscure_syntax & RE_NO_EMPTY_BRACKETS) + if ((re_syntax_options & RE_NO_EMPTY_BRACKETS) && p == pend) - goto invalid_pattern; - } + goto invalid_pattern; + } else /* Stop if this isn't merely a ] inside a bracket expression, but rather the end of a bracket expression. */ - break; - } + break; + } if (ismbchar (c)) { - unsigned char c2; - - PATFETCH_RAW (c2); - c = c << 8 | c2; + PATFETCH (c1); + c = c << 8 | c1; } - /* Get a range. */ - if (p[0] == '-' && p[1] != ']') - { - PATFETCH_RAW (c1); - /* Don't translate the range bounds while fetching them. */ - PATFETCH_RAW (c1); - if (ismbchar (c)) { - unsigned char c2; - PATFETCH_RAW (c2); - c1 = c1 << 8 | c2; + /* \ escapes characters when inside [...]. */ + if (c == '\\') { + PATFETCH(c); + switch (c) { + case 'w': + for (c = 0; c < 256; c++) + if (isalnum(c)) + SET_LIST_BIT (c); + last = -1; + continue; + + case 'W': + if (re_syntax_options & RE_MBCTYPE_MASK) + goto invalid_char; + for (c = 0; c < 256; c++) + if (!isalnum(c)) + SET_LIST_BIT (c); + last = -1; + continue; + + case 's': + for (c = 0; c < 256; c++) + if (isspace(c)) + SET_LIST_BIT (c); + last = -1; + continue; + + case 'S': + if (re_syntax_options & RE_MBCTYPE_MASK) + goto invalid_char; + for (c = 0; c < 256; c++) + if (!isspace(c)) + SET_LIST_BIT (c); + last = -1; + continue; + + case 'd': + for (c = '0'; c <= '9'; c++) + SET_LIST_BIT (c); + last = -1; + continue; + + case 'D': + if (re_syntax_options & RE_MBCTYPE_MASK) + goto invalid_char; + for (c = 0; c < '0'; c++) + SET_LIST_BIT (c); + for (c = '9' + 1; c < 256; c++) + SET_LIST_BIT (c); + last = -1; + continue; + + case 'x': + c = scan_hex(p, 2, &numlen); + if ((re_syntax_options & RE_MBCTYPE_MASK) && (c > 0x7f)) + goto invalid_char; + p += numlen; + break; + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + PATUNFETCH; + c = scan_oct(p, 3, &numlen); + if ((re_syntax_options & RE_MBCTYPE_MASK) && (c > 0x7f)) + goto invalid_char; + p += numlen; + break; + + default: + if (ismbchar (c)) { + PATFETCH(c1); + c = c << 8 | c1; + } + break; } - - if ((obscure_syntax & RE_NO_EMPTY_RANGES) && c > c1) - goto invalid_pattern; - - if ((obscure_syntax & RE_NO_HYPHEN_RANGE_END) - && c1 == '-' && *p != ']') + } + + /* Get a range. */ + if (range) { + if (last > c) goto invalid_pattern; - - set_list_bits (c, c1, (unsigned char *) b, translate); - } - else if ((obscure_syntax & RE_CHAR_CLASSES) - && c == '[' && p[0] == ':') - { - /* Longest valid character class word has six characters. */ - char str[CHAR_CLASS_MAX_LENGTH]; - PATFETCH_RAW (c); - c1 = 0; - /* If no ] at end. */ - if (p == pend) + + if ((re_syntax_options & RE_NO_HYPHEN_RANGE_END) + && c == '-' && *p != ']') goto invalid_pattern; - while (1) - { - /* Don't translate the ``character class'' characters. */ - PATFETCH_RAW (c); - if (c == ':' || c == ']' || p == pend - || c1 == CHAR_CLASS_MAX_LENGTH) - break; - str[c1++] = c; - } - str[c1] = '\0'; - if (p == pend - || c == ']' /* End of the bracket expression. */ - || p[0] != ']' - || p + 1 == pend - || (strcmp (str, "alpha") != 0 - && strcmp (str, "upper") != 0 - && strcmp (str, "lower") != 0 - && strcmp (str, "digit") != 0 - && strcmp (str, "alnum") != 0 - && strcmp (str, "xdigit") != 0 - && strcmp (str, "space") != 0 - && strcmp (str, "print") != 0 - && strcmp (str, "punct") != 0 - && strcmp (str, "graph") != 0 - && strcmp (str, "cntrl") != 0)) - { - /* Undo the ending character, the letters, and leave - the leading : and [ (but set bits for them). */ - c1++; - while (c1--) - PATUNFETCH; -#if 1 /* The original was: */ - SET_LIST_BIT ('['); - SET_LIST_BIT (':'); -#else /* I think this is the right way. */ - if (translate) { - SET_LIST_BIT (translate['[']); - SET_LIST_BIT (trasnlate[':']); - } - else { - SET_LIST_BIT ('['); - SET_LIST_BIT (':'); - } -#endif - } - else - { - /* The ] at the end of the character class. */ - PATFETCH (c); - if (c != ']') - goto invalid_pattern; - for (c = 0; c < (1 << BYTEWIDTH); c++) - { - if ((strcmp (str, "alpha") == 0 && isalpha (c)) - || (strcmp (str, "upper") == 0 && isupper (c)) - || (strcmp (str, "lower") == 0 && islower (c)) - || (strcmp (str, "digit") == 0 && isdigit (c)) - || (strcmp (str, "alnum") == 0 && isalnum (c)) - || (strcmp (str, "xdigit") == 0 && isxdigit (c)) - || (strcmp (str, "space") == 0 && isspace (c)) - || (strcmp (str, "print") == 0 && isprint (c)) - || (strcmp (str, "punct") == 0 && ispunct (c)) - || (strcmp (str, "graph") == 0 && isgraph (c)) - || (strcmp (str, "cntrl") == 0 && iscntrl (c))) - SET_LIST_BIT (c); - } - } - } - else if (translate && c < 1 << BYTEWIDTH) - SET_LIST_BIT (translate[c]); + + range = 0; + if (last < 1 << BYTEWIDTH && c < 1 << BYTEWIDTH) { + for (;last<=c;last++) + SET_LIST_BIT (last); + } + else + set_list_bits (last, c, (unsigned char*)b); + } + else if (p[0] == '-' && p[1] != ']') { + last = c; + PATFETCH(c1); + range = 1; + goto range_retry; + } + else if (c < 1 << BYTEWIDTH) + SET_LIST_BIT (c); else - set_list_bits (c, c, (unsigned char *) b, translate); + set_list_bits (c, c, (unsigned char*)b); } /* Discard any character set/class bitmap bytes that are all @@ -1019,46 +948,46 @@ re_compile_pattern (pattern, size, bufp) break; case '(': - if (! (obscure_syntax & RE_NO_BK_PARENS)) + if (! (re_syntax_options & RE_NO_BK_PARENS)) goto normal_char; else goto handle_open; case ')': - if (! (obscure_syntax & RE_NO_BK_PARENS)) + if (! (re_syntax_options & RE_NO_BK_PARENS)) goto normal_char; else goto handle_close; case '\n': - if (! (obscure_syntax & RE_NEWLINE_OR)) + if (! (re_syntax_options & RE_NEWLINE_OR)) goto normal_char; else goto handle_bar; case '|': - if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS) + if ((re_syntax_options & RE_CONTEXTUAL_INVALID_OPS) && (! laststart || p == pend)) goto invalid_pattern; - else if (! (obscure_syntax & RE_NO_BK_VBAR)) + else if (! (re_syntax_options & RE_NO_BK_VBAR)) goto normal_char; else goto handle_bar; case '{': - if (! ((obscure_syntax & RE_NO_BK_CURLY_BRACES) - && (obscure_syntax & RE_INTERVALS))) + if (! ((re_syntax_options & RE_NO_BK_CURLY_BRACES) + && (re_syntax_options & RE_INTERVALS))) goto normal_char; else goto handle_interval; case '\\': if (p == pend) goto invalid_pattern; - PATFETCH_RAW (c); + PATFETCH(c); switch (c) { case '(': - if (obscure_syntax & RE_NO_BK_PARENS) + if (re_syntax_options & RE_NO_BK_PARENS) goto normal_backsl; handle_open: if (stackp == stacke) goto nesting_too_deep; @@ -1066,11 +995,8 @@ re_compile_pattern (pattern, size, bufp) /* Laststart should point to the start_memory that we are about to push (unless the pattern has RE_NREGS or more ('s). */ *stackp++ = b - bufp->buffer; - if (regnum < RE_NREGS) - { - BUFPUSH (start_memory); - BUFPUSH (regnum); - } + BUFPUSH (start_memory); + BUFPUSH (regnum); *stackp++ = fixup_jump ? fixup_jump - bufp->buffer + 1 : 0; *stackp++ = regnum++; *stackp++ = begalt - bufp->buffer; @@ -1080,29 +1006,26 @@ re_compile_pattern (pattern, size, bufp) break; case ')': - if (obscure_syntax & RE_NO_BK_PARENS) + if (re_syntax_options & RE_NO_BK_PARENS) goto normal_backsl; handle_close: if (stackp == stackb) goto unmatched_close; begalt = *--stackp + bufp->buffer; if (fixup_jump) store_jump (fixup_jump, jump, b); - if (stackp[-1] < RE_NREGS) - { - BUFPUSH (stop_memory); - BUFPUSH (stackp[-1]); - } + BUFPUSH (stop_memory); + BUFPUSH (stackp[-1]); stackp -= 2; fixup_jump = *stackp ? *stackp + bufp->buffer - 1 : 0; laststart = *--stackp + bufp->buffer; break; case '|': - if ((obscure_syntax & RE_LIMITED_OPS) - || (obscure_syntax & RE_NO_BK_VBAR)) + if ((re_syntax_options & RE_LIMITED_OPS) + || (re_syntax_options & RE_NO_BK_VBAR)) goto normal_backsl; handle_bar: - if (obscure_syntax & RE_LIMITED_OPS) + if (re_syntax_options & RE_LIMITED_OPS) goto normal_char; /* Insert before the previous alternative a jump which jumps to this alternative if the former fails. */ @@ -1130,10 +1053,10 @@ re_compile_pattern (pattern, size, bufp) break; case '{': - if (! (obscure_syntax & RE_INTERVALS) + if (! (re_syntax_options & RE_INTERVALS) /* Let \{ be a literal. */ - || ((obscure_syntax & RE_INTERVALS) - && (obscure_syntax & RE_NO_BK_CURLY_BRACES)) + || ((re_syntax_options & RE_INTERVALS) + && (re_syntax_options & RE_NO_BK_CURLY_BRACES)) /* If it's the string "\{". */ || (p - 2 == pattern && p == pend)) goto normal_backsl; @@ -1142,7 +1065,7 @@ re_compile_pattern (pattern, size, bufp) /* If there is no previous pattern, this isn't an interval. */ if (!laststart) { - if (obscure_syntax & RE_CONTEXTUAL_INVALID_OPS) + if (re_syntax_options & RE_CONTEXTUAL_INVALID_OPS) goto invalid_pattern; else goto normal_backsl; @@ -1156,15 +1079,15 @@ re_compile_pattern (pattern, size, bufp) || *laststart == charset_not || *laststart == start_memory || (*laststart == exactn && laststart[1] == 1) - || (! (obscure_syntax & RE_NO_BK_REFS) + || (! (re_syntax_options & RE_NO_BK_REFS) && *laststart == duplicate))) { - if (obscure_syntax & RE_NO_BK_CURLY_BRACES) + if (re_syntax_options & RE_NO_BK_CURLY_BRACES) goto normal_char; /* Posix extended syntax is handled in previous statement; this is for Posix basic syntax. */ - if (obscure_syntax & RE_INTERVALS) + if (re_syntax_options & RE_INTERVALS) goto invalid_pattern; goto normal_backsl; @@ -1180,7 +1103,7 @@ re_compile_pattern (pattern, size, bufp) } if (upper_bound < 0) upper_bound = lower_bound; - if (! (obscure_syntax & RE_NO_BK_CURLY_BRACES)) + if (! (re_syntax_options & RE_NO_BK_CURLY_BRACES)) { if (c != '\\') goto invalid_pattern; @@ -1188,10 +1111,10 @@ re_compile_pattern (pattern, size, bufp) } if (c != '}' || lower_bound < 0 || upper_bound > RE_DUP_MAX || lower_bound > upper_bound - || ((obscure_syntax & RE_NO_BK_CURLY_BRACES) + || ((re_syntax_options & RE_NO_BK_CURLY_BRACES) && p != pend && *p == '{')) { - if (obscure_syntax & RE_NO_BK_CURLY_BRACES) + if (re_syntax_options & RE_NO_BK_CURLY_BRACES) goto unfetch_interval; else goto invalid_pattern; @@ -1273,27 +1196,6 @@ re_compile_pattern (pattern, size, bufp) goto normal_char; break; -#ifdef emacs - case '=': - BUFPUSH (at_dot); - break; - - case 's': - laststart = b; - BUFPUSH (syntaxspec); - PATFETCH (c); - BUFPUSH (syntax_spec_code[c]); - break; - - case 'S': - laststart = b; - BUFPUSH (notsyntaxspec); - PATFETCH (c); - BUFPUSH (syntax_spec_code[c]); - break; -#endif /* emacs */ - -#ifdef RUBY case 's': case 'S': case 'd': @@ -1334,7 +1236,6 @@ re_compile_pattern (pattern, size, bufp) 2 + EXTRACT_UNSIGNED (&b[(1 << BYTEWIDTH) / BYTEWIDTH])*4); b += b[-1] + 2 + EXTRACT_UNSIGNED (&b[b[-1]])*4; break; -#endif /* RUBY */ case 'w': laststart = b; @@ -1345,15 +1246,7 @@ re_compile_pattern (pattern, size, bufp) laststart = b; BUFPUSH (notwordchar); break; -#ifndef RUBY - case '<': - BUFPUSH (wordbeg); - break; - case '>': - BUFPUSH (wordend); - break; -#endif /* RUBY */ case 'b': BUFPUSH (wordbound); break; @@ -1362,33 +1255,50 @@ re_compile_pattern (pattern, size, bufp) BUFPUSH (notwordbound); break; - case '`': - BUFPUSH (begbuf); - break; + /* hex */ + case 'x': + c1 = 0; + c = scan_hex(p, 2, &numlen); + if ((re_syntax_options & RE_MBCTYPE_MASK) && (c > 0x7f)) + goto invalid_char; + p += numlen; + goto numeric_char; + + /* octal */ + case '0': + c1 = 0; + c = scan_oct(p, 3, &numlen); + p += numlen; + goto numeric_char; + + /* back-ref or octal */ + case '1': case '2': case '3': + case '4': case '5': case '6': + case '7': case '8': case '9': + { + char *p_save; - case '\'': - BUFPUSH (endbuf); - break; + PATUNFETCH; + p_save = p; + + c1 = 0; + GET_UNSIGNED_NUMBER(c1); + + if (c1 >= regnum) { + if (c1 < RE_NREGS) + goto invalid_pattern; + + /* need to get octal */ + p = p_save; + c = scan_oct(p_save, 3, &numlen); + if ((re_syntax_options & RE_MBCTYPE_MASK) && (c > 0x7f)) + goto invalid_char; + p = p_save + numlen; + c1 = 0; + goto numeric_char; + } + } - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - if (obscure_syntax & RE_NO_BK_REFS) - goto normal_char; - c1 = c - '0'; - if (c1 >= regnum) - { - if (obscure_syntax & RE_NO_EMPTY_BK_REF) - goto invalid_pattern; - else - goto normal_char; - } /* Can't back reference to a subexpression if inside of it. */ for (stackt = stackp - 2; stackt > stackb; stackt -= 4) if (*stackt == c1) @@ -1400,7 +1310,7 @@ re_compile_pattern (pattern, size, bufp) case '+': case '?': - if (obscure_syntax & RE_BK_PLUS_QM) + if (re_syntax_options & RE_BK_PLUS_QM) goto handle_plus; else goto normal_backsl; @@ -1408,10 +1318,6 @@ re_compile_pattern (pattern, size, bufp) default: normal_backsl: - /* You might think it would be useful for \ to mean - not to translate; but if we don't translate it - it will never match anything. */ - if (translate && !ismbchar (c)) c = (unsigned char) translate[c]; goto normal_char; } break; @@ -1421,16 +1327,17 @@ re_compile_pattern (pattern, size, bufp) c1 = 0; if (ismbchar (c)) { c1 = c; - PATFETCH_RAW (c); + PATFETCH(c); } + numeric_char: if (!pending_exact || pending_exact + *pending_exact + 1 != b || *pending_exact >= (c1 ? 0176 : 0177) || *p == '*' || *p == '^' - || ((obscure_syntax & RE_BK_PLUS_QM) + || ((re_syntax_options & RE_BK_PLUS_QM) ? *p == '\\' && (p[1] == '+' || p[1] == '?') : (*p == '+' || *p == '?')) - || ((obscure_syntax & RE_INTERVALS) - && ((obscure_syntax & RE_NO_BK_CURLY_BRACES) + || ((re_syntax_options & RE_INTERVALS) + && ((re_syntax_options & RE_NO_BK_CURLY_BRACES) ? *p == '{' : (p[0] == '\\' && p[1] == '{')))) { @@ -1454,16 +1361,20 @@ re_compile_pattern (pattern, size, bufp) if (stackp != stackb) goto unmatched_open; bufp->used = b - bufp->buffer; + bufp->re_nsub = regnum; return 0; + invalid_char: + return "Invalid character in regular expression"; + invalid_pattern: return "Invalid regular expression"; unmatched_open: - return "Unmatched \\("; + return "Unmatched ("; unmatched_close: - return "Unmatched \\)"; + return "Unmatched )"; end_of_pattern: return "Premature end of regular expression"; @@ -1600,20 +1511,14 @@ re_compile_fastmap (bufp) register unsigned char *p = pattern; register unsigned char *pend = pattern + size; register int j, k; - unsigned char *translate = (unsigned char *) bufp->translate; + unsigned char *translate = (unsigned char *)bufp->translate; unsigned is_a_succeed_n; -#ifndef NO_ALLOCA - unsigned char *stackb[NFAILURES]; - unsigned char **stackp = stackb; - -#else unsigned char **stackb; unsigned char **stackp; - stackb = (unsigned char **) xmalloc (NFAILURES * sizeof (unsigned char *)); + stackb = RE_TALLOC(NFAILURES, unsigned char*); stackp = stackb; -#endif /* NO_ALLOCA */ memset (fastmap, 0, (1 << BYTEWIDTH)); bufp->fastmap_accurate = 1; bufp->can_be_null = 0; @@ -1633,26 +1538,15 @@ re_compile_fastmap (bufp) #endif { case exactn: -#if 1 /* The original was: */ if (translate) fastmap[translate[p[1]]] = 1; else fastmap[p[1]] = 1; -#else /* The compiled pattern has already been translated. (not true for ruby) */ - fastmap[p[1]] = 1; -#endif break; case begline: - case before_dot: - case at_dot: - case after_dot: - case begbuf: - case endbuf: case wordbound: case notwordbound: - case wordbeg: - case wordend: continue; case endline: @@ -1748,41 +1642,16 @@ re_compile_fastmap (bufp) fastmap[j] = 1; break; -#ifdef emacs - case syntaxspec: - k = *p++; - for (j = 0; j < (1 << BYTEWIDTH); j++) - if (SYNTAX (j) == (enum syntaxcode) k) - fastmap[j] = 1; - break; - - case notsyntaxspec: - k = *p++; - for (j = 0; j < (1 << BYTEWIDTH); j++) - if (SYNTAX (j) != (enum syntaxcode) k) - fastmap[j] = 1; - break; - -#else /* not emacs */ - case syntaxspec: - case notsyntaxspec: - break; -#endif /* not emacs */ - case charset: /* NOTE: Charset for single-byte chars never contain multi-byte char. See set_list_bits(). */ for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) { -#if 1 /* The original was: */ if (translate) fastmap[translate[j]] = 1; else fastmap[j] = 1; -#else /* The compiled pattern has already been translated. */ - fastmap[j] = 1; -#endif } { unsigned short size; @@ -1830,7 +1699,7 @@ re_compile_fastmap (bufp) p += p[-1] + 2; size = EXTRACT_UNSIGNED (&p[-2]); - c = 0x8000; + c = 0x80; for (j = 0; j < size; j++) { for (beg = (unsigned char) p[j*4 + 0]; c < beg; c++) if (ismbchar (c)) @@ -1868,8 +1737,7 @@ re_search (pbufp, string, size, startpos, range, regs) int size, startpos, range; struct re_registers *regs; { - return re_search_2 (pbufp, (char *) 0, 0, string, size, startpos, range, - regs, size); + return re_search_2 (pbufp, (char*)0, 0, string, size, startpos, range, regs); } @@ -1881,23 +1749,20 @@ re_search (pbufp, string, size, startpos, range, regs) - 1, etc. STRING1 and STRING2 are of SIZE1 and SIZE2, respectively. In REGS, return the indices of the virtual concatenation of STRING1 and STRING2 that matched the entire PBUFP->buffer and its contained - subexpressions. Do not consider matching one past the index MSTOP in - the virtual concatenation of STRING1 and STRING2. + subexpressions. The value returned is the position in the strings at which the match was found, or -1 if no match was found, or -2 if error (such as failure stack overflow). */ -int -re_search_2 (pbufp, string1, size1, string2, size2, startpos, range, - regs, mstop) +static int +re_search_2 (pbufp, string1, size1, string2, size2, startpos, range, regs) struct re_pattern_buffer *pbufp; char *string1, *string2; int size1, size2; int startpos; register int range; struct re_registers *regs; - int mstop; { register char *fastmap = pbufp->fastmap; register unsigned char *translate = (unsigned char *) pbufp->translate; @@ -1917,19 +1782,9 @@ re_search_2 (pbufp, string1, size1, string2, size2, startpos, range, range = total_size - startpos; /* Update the fastmap now if not correct already. */ - if (fastmap && !pbufp->fastmap_accurate) - re_compile_fastmap (pbufp); - - /* If the search isn't to be a backwards one, don't waste time in a - long search for a pattern that says it is anchored. */ - if (pbufp->used > 0 && (enum regexpcode) pbufp->buffer[0] == begbuf - && range > 0) - { - if (startpos > 0) - return -1; - else - range = 1; - } + if (fastmap && !pbufp->fastmap_accurate) { + re_compile_fastmap (pbufp); + } while (1) { @@ -1987,7 +1842,7 @@ re_search_2 (pbufp, string1, size1, string2, size2, startpos, range, return -1; val = re_match_2 (pbufp, string1, size1, string2, size2, startpos, - regs, mstop); + regs); if (val >= 0) return startpos; if (val == -2) @@ -2039,7 +1894,6 @@ re_search_2 (pbufp, string1, size1, string2, size2, startpos, range, -#ifndef emacs /* emacs never uses this. */ int re_match (pbufp, string, size, pos, regs) struct re_pattern_buffer *pbufp; @@ -2049,7 +1903,6 @@ re_match (pbufp, string, size, pos, regs) { return re_match_2 (pbufp, (char *) 0, 0, string, size, pos, regs, size); } -#endif /* not emacs */ /* The following are used for re_match_2, defined below: */ @@ -2086,7 +1939,7 @@ struct register_info fail. The `+ 2' refers to PATTERN_PLACE and STRING_PLACE, which are arguments to the PUSH_FAILURE_POINT macro. */ -#define MAX_NUM_FAILURE_ITEMS (RE_NREGS * NUM_REG_ITEMS + 2) +#define MAX_NUM_FAILURE_ITEMS (num_regs * NUM_REG_ITEMS + 2) /* We push this many things on the stack whenever we fail. */ @@ -2102,8 +1955,8 @@ struct register_info long last_used_reg, this_reg; \ \ /* Find out how many registers are active or have been matched. \ - (Aside from register zero, which is only set at the end.) */ \ - for (last_used_reg = RE_NREGS - 1; last_used_reg > 0; last_used_reg--)\ + (Aside from register zero, which is only set at the end.) */ \ + for (last_used_reg = num_regs - 1; last_used_reg > 0; last_used_reg--)\ if (regstart[last_used_reg] != (unsigned char *)(-1L)) \ break; \ \ @@ -2113,6 +1966,7 @@ struct register_info unsigned int len = stacke - stackb; \ if (len > re_max_failures * MAX_NUM_FAILURE_ITEMS) \ { \ + FREE_VARIABLES(); \ FREE_AND_RETURN(stackb,(-2)); \ } \ \ @@ -2179,8 +2033,8 @@ struct register_info registers corresponding to the subexpressions of which we currently are inside. */ #define SET_REGS_MATCHED \ - { unsigned this_reg; \ - for (this_reg = 0; this_reg < RE_NREGS; this_reg++) \ + { unsigned this_reg; \ + for (this_reg = 0; this_reg < num_regs; this_reg++) \ { \ if (IS_ACTIVE(reg_info[this_reg])) \ MATCHED_SOMETHING(reg_info[this_reg]) = 1; \ @@ -2209,15 +2063,37 @@ struct register_info (SYNTAX ((d) == end1 ? *string2 : (d) == string2 - 1 ? *(end1 - 1) : *(d))\ == Sword) +static void +init_regs(regs, num_regs) + struct re_registers *regs; + unsigned num_regs; +{ + int i; + + regs->num_regs = num_regs; + if (num_regs < RE_NREGS) + num_regs = RE_NREGS; + + if (regs->allocated == 0) { + regs->beg = TMALLOC(num_regs, int); + regs->end = TMALLOC(num_regs, int); + regs->allocated = num_regs; + } + else if (regs->allocated < num_regs) { + TREALLOC(regs->beg, num_regs, int); + TREALLOC(regs->end, num_regs, int); + } + for (i=0; ibeg[i] = regs->end[i] = -1; + } +} /* Match the pattern described by PBUFP against the virtual concatenation of STRING1 and STRING2, which are of SIZE1 and SIZE2, respectively. Start the match at index POS in the virtual concatenation of STRING1 and STRING2. In REGS, return the indices of the virtual concatenation of STRING1 and STRING2 that matched the - entire PBUFP->buffer and its contained subexpressions. Do not - consider matching one past the index MSTOP in the virtual - concatenation of STRING1 and STRING2. + entire PBUFP->buffer and its contained subexpressions. If pbufp->fastmap is nonzero, then it had better be up to date. @@ -2229,20 +2105,21 @@ struct register_info error (such as match stack overflow). Otherwise the value is the length of the substring which was matched. */ -int -re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop) +static int +re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs) struct re_pattern_buffer *pbufp; char *string1_arg, *string2_arg; int size1, size2; int pos; struct re_registers *regs; - int mstop; { register unsigned char *p = (unsigned char *) pbufp->buffer; /* Pointer to beyond end of buffer. */ register unsigned char *pend = p + pbufp->used; + unsigned num_regs = pbufp->re_nsub + 1; + unsigned char *string1 = (unsigned char *) string1_arg; unsigned char *string2 = (unsigned char *) string2_arg; unsigned char *end1; /* Just past end of first string. */ @@ -2267,9 +2144,6 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop) ``dummy''; if a failure happens and the failure point is a dummy, it gets discarded and the next next one is tried. */ -#ifndef NO_ALLOCA - unsigned char *initial_stack[MAX_NUM_FAILURE_ITEMS * NFAILURES]; -#endif unsigned char **stackb; unsigned char **stackp; unsigned char **stacke; @@ -2283,8 +2157,8 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop) stopped matching the regnum-th subexpression. (The zeroth register keeps track of what the whole pattern matches.) */ - unsigned char *regstart[RE_NREGS]; - unsigned char *regend[RE_NREGS]; + unsigned char **regstart = RE_TALLOC(num_regs, unsigned char*); + unsigned char **regend = RE_TALLOC(num_regs, unsigned char*); /* The is_active field of reg_info helps us keep track of which (possibly nested) subexpressions we are currently in. The matched_something @@ -2293,8 +2167,7 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop) subexpression. These two fields get reset each time through any loop their register is in. */ - struct register_info reg_info[RE_NREGS]; - + struct register_info *reg_info = RE_TALLOC(num_regs, struct register_info); /* The following record the register info as found in the above variables when we find a match better than any we've seen before. @@ -2302,15 +2175,16 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop) turn happens only if we have not yet matched the entire string. */ unsigned best_regs_set = 0; - unsigned char *best_regstart[RE_NREGS]; - unsigned char *best_regend[RE_NREGS]; + unsigned char **best_regstart = RE_TALLOC(num_regs, unsigned char*); + unsigned char **best_regend = RE_TALLOC(num_regs, unsigned char*); + + if (regs) { + init_regs(regs, num_regs); + } + /* Initialize the stack. */ -#ifdef NO_ALLOCA - stackb = (unsigned char **) xmalloc (MAX_NUM_FAILURE_ITEMS * NFAILURES * sizeof (char *)); -#else - stackb = initial_stack; -#endif + stackb = RE_TALLOC(MAX_NUM_FAILURE_ITEMS * NFAILURES, unsigned char*); stackp = stackb; stacke = &stackb[MAX_NUM_FAILURE_ITEMS * NFAILURES]; @@ -2321,41 +2195,27 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop) /* Initialize subexpression text positions to -1 to mark ones that no \( or ( and \) or ) has been seen for. Also set all registers to inactive and mark them as not having matched anything or ever - failed. */ - for (mcnt = 0; mcnt < RE_NREGS; mcnt++) - { + failed. */ + for (mcnt = 0; mcnt < num_regs; mcnt++) { regstart[mcnt] = regend[mcnt] = (unsigned char *) (-1L); IS_ACTIVE (reg_info[mcnt]) = 0; MATCHED_SOMETHING (reg_info[mcnt]) = 0; - } + } - if (regs) - for (mcnt = 0; mcnt < RE_NREGS; mcnt++) - regs->start[mcnt] = regs->end[mcnt] = -1; - /* Set up pointers to ends of strings. Don't allow the second string to be empty unless both are empty. */ - if (size2 == 0) - { + if (size2 == 0) { string2 = string1; size2 = size1; string1 = 0; size1 = 0; - } + } end1 = string1 + size1; end2 = string2 + size2; /* Compute where to stop matching, within the two strings. */ - if (mstop <= size1) - { - end_match_1 = string1 + mstop; - end_match_2 = string2; - } - else - { - end_match_1 = end1; - end_match_2 = string2 + mstop - size1; - } + end_match_1 = end1; + end_match_2 = string2 + size2 - size1; /* `p' scans through the pattern as `d' scans through the data. `dend' is the end of the input string that `d' points within. `d' is @@ -2405,7 +2265,7 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop) best_regs_set = 1; best_regend[0] = d; /* Never use regstart[0]. */ - for (mcnt = 1; mcnt < RE_NREGS; mcnt++) + for (mcnt = 1; mcnt < num_regs; mcnt++) { best_regstart[mcnt] = regstart[mcnt]; best_regend[mcnt] = regend[mcnt]; @@ -2420,7 +2280,7 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop) /* Restore best match. */ d = best_regend[0]; - for (mcnt = 0; mcnt < RE_NREGS; mcnt++) + for (mcnt = 0; mcnt < num_regs; mcnt++) { regstart[mcnt] = best_regstart[mcnt]; regend[mcnt] = best_regend[mcnt]; @@ -2432,23 +2292,23 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop) to indices. */ if (regs) { - regs->start[0] = pos; + regs->beg[0] = pos; if (MATCHING_IN_FIRST_STRING) regs->end[0] = d - string1; else regs->end[0] = d - string2 + size1; - for (mcnt = 1; mcnt < RE_NREGS; mcnt++) + for (mcnt = 1; mcnt < num_regs; mcnt++) { if (regend[mcnt] == (unsigned char *)(-1L)) { - regs->start[mcnt] = -1; + regs->beg[mcnt] = -1; regs->end[mcnt] = -1; continue; } if (IS_IN_FIRST_STRING (regstart[mcnt])) - regs->start[mcnt] = regstart[mcnt] - string1; + regs->beg[mcnt] = regstart[mcnt] - string1; else - regs->start[mcnt] = regstart[mcnt] - string2 + size1; + regs->beg[mcnt] = regstart[mcnt] - string2 + size1; if (IS_IN_FIRST_STRING (regend[mcnt])) regs->end[mcnt] = regend[mcnt] - string1; @@ -2456,6 +2316,7 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop) regs->end[mcnt] = regend[mcnt] - string2 + size1; } } + FREE_VARIABLES(); FREE_AND_RETURN(stackb, (d - pos - (MATCHING_IN_FIRST_STRING ? string1 : @@ -2586,7 +2447,7 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop) break; } if ((translate ? translate[*d] : *d) == '\n' - || ((obscure_syntax & RE_DOT_NOT_NULL) + || ((re_syntax_options & RE_DOT_NOT_NULL) && (translate ? translate[*d] : *d) == '\000')) goto fail; SET_REGS_MATCHED; @@ -2597,9 +2458,7 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop) case charset_not: { int not; /* Nonzero for charset_not. */ - register int c; - if (*(p - 1) == (unsigned char) charset_not) - not = 1; + int c; PREFETCH; /* Fetch a data character. */ @@ -2613,14 +2472,17 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop) c = (unsigned char) translate[c]; not = is_in_list (c, p); + if (*(p - 1) == (unsigned char) charset_not) + not = !not; p += 1 + *p + 2 + EXTRACT_UNSIGNED (&p[1 + *p])*4; if (!not) goto fail; SET_REGS_MATCHED; + d++; if (d != dend && c >= 1 << BYTEWIDTH) - d++; + d++; break; } @@ -2686,14 +2548,17 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop) if (p1[3] == (unsigned char) exactn && p1[5] != c) p[-3] = (unsigned char) finalize_jump; else if (p1[3] == (unsigned char) charset - || p1[3] == (unsigned char) charset_not) - { + || p1[3] == (unsigned char) charset_not) { + int not; if (ismbchar (c)) c = c << 8 | p2[3]; /* `is_in_list()' is TRUE if c would match */ /* That means it is not safe to finalize. */ - if (!is_in_list (c, p1 + 4)) - p[-3] = (unsigned char) finalize_jump; + not = is_in_list (c, p1 + 4); + if (p1[3] == (unsigned char) charset_not) + not = !not; + if (!not) + p[-3] = (unsigned char) finalize_jump; } } } @@ -2798,60 +2663,6 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop) goto fail; break; - case wordbeg: - if (IS_A_LETTER (d) && (AT_STRINGS_BEG || !IS_A_LETTER (d - 1))) - break; - goto fail; - - case wordend: - /* Have to check if AT_STRINGS_BEG before looking at d - 1. */ - if (!AT_STRINGS_BEG && IS_A_LETTER (d - 1) - && (!IS_A_LETTER (d) || AT_STRINGS_END)) - break; - goto fail; - -#ifdef emacs - case before_dot: - if (PTR_CHAR_POS (d) >= point) - goto fail; - break; - - case at_dot: - if (PTR_CHAR_POS (d) != point) - goto fail; - break; - - case after_dot: - if (PTR_CHAR_POS (d) <= point) - goto fail; - break; - - case wordchar: - mcnt = (int) Sword; - goto matchsyntax; - - case syntaxspec: - mcnt = *p++; - matchsyntax: - PREFETCH; - if (SYNTAX (*d++) != (enum syntaxcode) mcnt) goto fail; - SET_REGS_MATCHED; - break; - - case notwordchar: - mcnt = (int) Sword; - goto matchnotsyntax; - - case notsyntaxspec: - mcnt = *p++; - matchnotsyntax: - PREFETCH; - if (SYNTAX (*d++) == (enum syntaxcode) mcnt) goto fail; - SET_REGS_MATCHED; - break; - -#else /* not emacs */ - case wordchar: PREFETCH; if (!IS_A_LETTER (d)) @@ -2868,25 +2679,6 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop) SET_REGS_MATCHED; break; - case before_dot: - case at_dot: - case after_dot: - case syntaxspec: - case notsyntaxspec: - break; - -#endif /* not emacs */ - - case begbuf: - if (AT_STRINGS_BEG) - break; - goto fail; - - case endbuf: - if (AT_STRINGS_END) - break; - goto fail; - case exactn: /* Match the next few pattern characters exactly. mcnt is how many characters to match. */ @@ -2922,27 +2714,8 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop) { do { -#if 0 - /* this code suppose that multi-byte chars are not splited - in string1 and string2. If you want to check this with - speed cost, change `#if 0' here and next to `#if 1'. */ - unsigned char c; - -#endif PREFETCH; -#if 0 - c = *d++; - if (ismbchar (c)) { - if (c != (unsigned char) *p++ - || !--mcnt - || d == dend) - goto fail; - c = *d++; - } - if (c != (unsigned char) *p++) goto fail; -#else if (*d++ != *p++) goto fail; -#endif } while (--mcnt); } @@ -2973,8 +2746,8 @@ re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop) /* Restore register info. */ last_used_reg = (long) *--stackp; - /* Make the ones that weren't saved -1 or 0 again. */ - for (this_reg = RE_NREGS - 1; this_reg > last_used_reg; this_reg--) + /* Make the ones that weren't saved -1 or 0 again. */ + for (this_reg = num_regs - 1; this_reg > last_used_reg; this_reg--) { regend[this_reg] = (unsigned char *) (-1L); regstart[this_reg] = (unsigned char *) (-1L); @@ -3023,218 +2796,22 @@ memcmp_translate (s1, s2, len, translate) return 0; } - - -/* Entry points compatible with 4.2 BSD regex library. */ - -#if !defined(emacs) && !defined(GAWK) && !defined(RUBY) - -static struct re_pattern_buffer re_comp_buf; - -char * -re_comp (s) - char *s; -{ - if (!s) - { - if (!re_comp_buf.buffer) - return "No previous regular expression"; - return 0; - } - - if (!re_comp_buf.buffer) - { - if (!(re_comp_buf.buffer = (char *) xmalloc (200))) - return "Memory exhausted"; - re_comp_buf.allocated = 200; - if (!(re_comp_buf.fastmap = (char *) xmalloc (1 << BYTEWIDTH))) - return "Memory exhausted"; - } - return re_compile_pattern (s, strlen (s), &re_comp_buf); -} - -int -re_exec (s) - char *s; -{ - int len = strlen (s); - return 0 <= re_search (&re_comp_buf, s, len, 0, len, - (struct re_registers *) 0); -} -#endif /* not emacs && not GAWK && not RUBY */ - - - -#ifdef test - -#ifdef atarist -long _stksize = 2L; /* reserve memory for stack */ -#endif -#include - -/* Indexed by a character, gives the upper case equivalent of the - character. */ - -char upcase[0400] = - { 000, 001, 002, 003, 004, 005, 006, 007, - 010, 011, 012, 013, 014, 015, 016, 017, - 020, 021, 022, 023, 024, 025, 026, 027, - 030, 031, 032, 033, 034, 035, 036, 037, - 040, 041, 042, 043, 044, 045, 046, 047, - 050, 051, 052, 053, 054, 055, 056, 057, - 060, 061, 062, 063, 064, 065, 066, 067, - 070, 071, 072, 073, 074, 075, 076, 077, - 0100, 0101, 0102, 0103, 0104, 0105, 0106, 0107, - 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117, - 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127, - 0130, 0131, 0132, 0133, 0134, 0135, 0136, 0137, - 0140, 0101, 0102, 0103, 0104, 0105, 0106, 0107, - 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117, - 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127, - 0130, 0131, 0132, 0173, 0174, 0175, 0176, 0177, - 0200, 0201, 0202, 0203, 0204, 0205, 0206, 0207, - 0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217, - 0220, 0221, 0222, 0223, 0224, 0225, 0226, 0227, - 0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237, - 0240, 0241, 0242, 0243, 0244, 0245, 0246, 0247, - 0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257, - 0260, 0261, 0262, 0263, 0264, 0265, 0266, 0267, - 0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277, - 0300, 0301, 0302, 0303, 0304, 0305, 0306, 0307, - 0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317, - 0320, 0321, 0322, 0323, 0324, 0325, 0326, 0327, - 0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337, - 0340, 0341, 0342, 0343, 0344, 0345, 0346, 0347, - 0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357, - 0360, 0361, 0362, 0363, 0364, 0365, 0366, 0367, - 0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377 - }; - -#ifdef canned - -#include "tests.h" - -typedef enum { extended_test, basic_test } test_type; - -/* Use this to run the tests we've thought of. */ - void -main () +re_copy_registers(regs1, regs2) + struct re_registers *regs1, *regs2; { - test_type t = extended_test; + int i; - if (t == basic_test) - { - printf ("Running basic tests:\n\n"); - test_posix_basic (); - } - else if (t == extended_test) - { - printf ("Running extended tests:\n\n"); - test_posix_extended (); + if (regs1->allocated == 0) { + regs1->beg = TMALLOC(regs2->num_regs, int); + regs1->end = TMALLOC(regs2->num_regs, int); } -} - -#else /* not canned */ - -/* Use this to run interactive tests. */ - -int -main (argc, argv) - int argc; - char **argv; -{ - char pat[80]; - struct re_pattern_buffer buf; - int i; - char c; - char fastmap[(1 << BYTEWIDTH)]; - - /* Allow a command argument to specify the style of syntax. */ - if (argc > 1) - obscure_syntax = atol (argv[1]); - - buf.allocated = 40; - buf.buffer = (char *) xmalloc (buf.allocated); - buf.fastmap = fastmap; - buf.translate = upcase; - - while (1) - { - gets (pat); - - if (*pat) - { - re_compile_pattern (pat, strlen(pat), &buf); - - for (i = 0; i < buf.used; i++) - printchar (buf.buffer[i]); - - putchar ('\n'); - - printf ("%d allocated, %d used.\n", buf.allocated, buf.used); - - re_compile_fastmap (&buf); - printf ("Allowed by fastmap: "); - for (i = 0; i < (1 << BYTEWIDTH); i++) - if (fastmap[i]) printchar (i); - putchar ('\n'); - } - - gets (pat); /* Now read the string to match against */ - - i = re_match (&buf, pat, strlen (pat), 0, 0); - printf ("Match value %d.\n", i); + else if (regs1->allocated < regs2->num_regs) { + TREALLOC(regs1->beg, regs2->num_regs, int); + TREALLOC(regs1->end, regs2->num_regs, int); } -} - -#endif - - -#ifdef NOTDEF -print_buf (bufp) - struct re_pattern_buffer *bufp; -{ - int i; - - printf ("buf is :\n----------------\n"); - for (i = 0; i < bufp->used; i++) - printchar (bufp->buffer[i]); - - printf ("\n%d allocated, %d used.\n", bufp->allocated, bufp->used); - - printf ("Allowed by fastmap: "); - for (i = 0; i < (1 << BYTEWIDTH); i++) - if (bufp->fastmap[i]) - printchar (i); - printf ("\nAllowed by translate: "); - if (bufp->translate) - for (i = 0; i < (1 << BYTEWIDTH); i++) - if (bufp->translate[i]) - printchar (i); - printf ("\nfastmap is%s accurate\n", bufp->fastmap_accurate ? "" : "n't"); - printf ("can %s be null\n----------", bufp->can_be_null ? "" : "not"); -} -#endif /* NOTDEF */ - -printchar (c) - char c; -{ - if (c < 040 || c >= 0177) - { - putchar ('\\'); - putchar (((c >> 6) & 3) + '0'); - putchar (((c >> 3) & 7) + '0'); - putchar ((c & 7) + '0'); + for (i=0; inum_regs; i++) { + regs1->beg[i] = regs2->beg[i]; + regs1->end[i] = regs2->end[i]; } - else - putchar (c); -} - -error (string) - char *string; -{ - puts (string); - exit (1); } -#endif /* test */ -- cgit v1.2.3