From 0424e152c684a85f4b0691f1e84aec203115333d Mon Sep 17 00:00:00 2001 From: naruse Date: Fri, 17 Feb 2012 07:42:23 +0000 Subject: * Merge Onigmo-5.13.1. [ruby-dev:45057] [Feature #5820] https://github.com/k-takata/Onigmo cp reg{comp,enc,error,exec,parse,syntax}.c reg{enc,int,parse}.h cp oniguruma.h cp tool/enc-unicode.rb cp -r enc/ git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@34663 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- regparse.c | 1053 ++++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 840 insertions(+), 213 deletions(-) (limited to 'regparse.c') diff --git a/regparse.c b/regparse.c index 7a58fe14c8..bc1343481d 100644 --- a/regparse.c +++ b/regparse.c @@ -1,9 +1,9 @@ -/* -*- mode:c; c-file-style:"gnu" -*- */ /********************************************************************** - regparse.c - Oniguruma (regular expression library) + regparse.c - Onigmo (Oniguruma-mod) (regular expression library) **********************************************************************/ /*- * Copyright (c) 2002-2008 K.Kosako + * Copyright (c) 2011-2012 K.Takata * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -50,7 +50,11 @@ const OnigSyntaxType OnigSyntaxRuby = { ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT | ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL | ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB | - ONIG_SYN_OP2_ESC_H_XDIGIT ) + ONIG_SYN_OP2_ESC_H_XDIGIT | + ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER | + ONIG_SYN_OP2_QMARK_LPAREN_CONDITION | + ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK | + ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP ) , ( SYN_GNU_REGEX_BV | ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV | ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND | @@ -60,7 +64,8 @@ const OnigSyntaxType OnigSyntaxRuby = { ONIG_SYN_WARN_CC_OP_NOT_ESCAPED | ONIG_SYN_WARN_CC_DUP | ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT ) - , ONIG_OPTION_NONE + , ( ONIG_OPTION_ASCII_RANGE | ONIG_OPTION_POSIX_BRACKET_ALL_RANGE | + ONIG_OPTION_WORD_BOUND_ALL_RANGE ) , { (OnigCodePoint )'\\' /* esc */ @@ -133,7 +138,7 @@ bbuf_clone(BBuf** rto, BBuf* from) (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80) #define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \ - add_code_range_to_buf(pbuf, env, MBCODE_START_POS(enc), ~((OnigCodePoint )0)) + add_code_range_to_buf(pbuf, env, MBCODE_START_POS(enc), ONIG_LAST_CODE_POINT) #define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\ if (! 
ONIGENC_IS_SINGLEBYTE(enc)) {\ @@ -677,7 +682,7 @@ onig_names_free(regex_t* reg) } static NameEntry* -name_find(regex_t* reg, UChar* name, UChar* name_end) +name_find(regex_t* reg, const UChar* name, const UChar* name_end) { int i, len; NameEntry* e; @@ -781,10 +786,12 @@ name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) } else if (t->num == t->alloc) { int i; + NameEntry* p; alloc = t->alloc * 2; - t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc); - CHECK_NULL_RETURN_MEMERR(t->e); + p = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc); + CHECK_NULL_RETURN_MEMERR(p); + t->e = p; t->alloc = alloc; clear: @@ -826,9 +833,11 @@ name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) } else { if (e->back_num > e->back_alloc) { + int* p; alloc = e->back_alloc * 2; - e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc); - CHECK_NULL_RETURN_MEMERR(e->back_refs); + p = (int* )xrealloc(e->back_refs, sizeof(int) * alloc); + CHECK_NULL_RETURN_MEMERR(p); + e->back_refs = p; e->back_alloc = alloc; } e->back_refs[e->back_num - 1] = backref; @@ -1209,7 +1218,7 @@ node_new_cclass_by_codepoint_range(int not, OnigCodePoint sb_out, } static Node* -node_new_ctype(int type, int not) +node_new_ctype(int type, int not, int ascii_range) { Node* node = node_new(); CHECK_NULL_RETURN(node); @@ -1217,6 +1226,7 @@ node_new_ctype(int type, int not) SET_NTYPE(node, NT_CTYPE); NCTYPE(node)->ctype = type; NCTYPE(node)->not = not; + NCTYPE(node)->ascii_range = ascii_range; return node; } @@ -1288,6 +1298,7 @@ onig_node_new_anchor(int type) NANCHOR(node)->type = type; NANCHOR(node)->target = NULL; NANCHOR(node)->char_len = -1; + NANCHOR(node)->ascii_range = 0; return node; } @@ -1455,7 +1466,7 @@ onig_node_str_cat(Node* node, const UChar* s, const UChar* end) CHECK_NULL_RETURN_MEMERR(p); NSTR(node)->s = p; - NSTR(node)->capa = (int)capa; + NSTR(node)->capa = (int )capa; } } else { @@ -1483,6 +1494,15 @@ 
node_str_cat_char(Node* node, UChar c) return onig_node_str_cat(node, s, s + 1); } +static int +node_str_cat_codepoint(Node* node, OnigEncoding enc, OnigCodePoint c) +{ + UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; + int num = ONIGENC_CODE_TO_MBC(enc, c, buf); + if (num < 0) return num; + return onig_node_str_cat(node, buf, buf + num); +} + extern void onig_node_conv_to_str_node(Node* node, int flag) { @@ -1535,7 +1555,8 @@ static Node* node_new_str_raw(UChar* s, UChar* end) { Node* node = node_new_str(s, end); - NSTRING_SET_RAW(node); + if (IS_NOT_NULL(node)) + NSTRING_SET_RAW(node); return node; } @@ -1564,7 +1585,7 @@ str_node_split_last_char(StrNode* sn, OnigEncoding enc) p = onigenc_get_prev_char_head(enc, sn->s, sn->end, sn->end); if (p && p > sn->s) { /* can be splitted. */ n = node_new_str(p, sn->end); - if ((sn->flag & NSTR_RAW) != 0) + if (IS_NOT_NULL(n) && (sn->flag & NSTR_RAW) != 0) NSTRING_SET_RAW(n); sn->end = (UChar* )p; } @@ -1627,14 +1648,16 @@ onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc) } static int -scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen, - OnigEncoding enc) +scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int minlen, + int maxlen, OnigEncoding enc) { OnigCodePoint c; unsigned int num, val; + int restlen; UChar* p = *src; PFETCH_READY; + restlen = maxlen - minlen; num = 0; while (!PEND && maxlen-- != 0) { PFETCH(c); @@ -1650,6 +1673,8 @@ scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen, break; } } + if (maxlen > restlen) + return -2; /* not enough digits */ *src = p; return num; } @@ -1734,17 +1759,19 @@ add_code_range_to_buf0(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePo data = (OnigCodePoint* )(bbuf->p); data++; - for (low = 0, bound = n; low < bound; ) { + bound = (from == 0) ? 
0 : n; + for (low = 0; low < bound; ) { x = (low + bound) >> 1; - if (from > data[x*2 + 1]) + if (from - 1 > data[x*2 + 1]) low = x + 1; else bound = x; } - for (high = low, bound = n; high < bound; ) { + high = (to == ONIG_LAST_CODE_POINT) ? n : low; + for (bound = n; high < bound; ) { x = (high + bound) >> 1; - if (to >= data[x*2] - 1) + if (to + 1 >= data[x*2]) high = x + 1; else bound = x; @@ -1762,13 +1789,15 @@ add_code_range_to_buf0(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePo to = data[(high - 1)*2 + 1]; } - if (inc_n != 0 && (OnigCodePoint )high < n) { + if (inc_n != 0) { int from_pos = SIZE_CODE_POINT * (1 + high * 2); int to_pos = SIZE_CODE_POINT * (1 + (low + 1) * 2); - int size = (n - high) * 2 * SIZE_CODE_POINT; if (inc_n > 0) { - BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size); + if ((OnigCodePoint )high < n) { + int size = (n - high) * 2 * SIZE_CODE_POINT; + BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size); + } } else { BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos); @@ -1836,11 +1865,11 @@ not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf, ScanEnv* env) r = add_code_range_to_buf(pbuf, env, pre, from - 1); if (r != 0) return r; } - if (to == ~((OnigCodePoint )0)) break; + if (to == ONIG_LAST_CODE_POINT) break; pre = to + 1; } - if (to < ~((OnigCodePoint )0)) { - r = add_code_range_to_buf(pbuf, env, to + 1, ~((OnigCodePoint )0)); + if (to < ONIG_LAST_CODE_POINT) { + r = add_code_range_to_buf(pbuf, env, to + 1, ONIG_LAST_CODE_POINT); } return r; } @@ -2053,8 +2082,8 @@ and_cclass(CClassNode* dest, CClassNode* cc, ScanEnv* env) } } if (r != 0) { - bbuf_free(pbuf); - return r; + bbuf_free(pbuf); + return r; } dest->mbuf = pbuf; @@ -2111,8 +2140,8 @@ or_cclass(CClassNode* dest, CClassNode* cc, ScanEnv* env) } } if (r != 0) { - bbuf_free(pbuf); - return r; + bbuf_free(pbuf); + return r; } dest->mbuf = pbuf; @@ -2151,7 +2180,9 @@ conv_backslash_value(int c, ScanEnv* env) return c; } -#if 0 /* no invalid quantifier */ +#ifdef 
USE_NO_INVALID_QUANTIFIER +#define is_invalid_quantifier_target(node) 0 +#else static int is_invalid_quantifier_target(Node* node) { @@ -2183,8 +2214,6 @@ is_invalid_quantifier_target(Node* node) } return 0; } -#else -#define is_invalid_quantifier_target(node) 0 #endif /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */ @@ -2303,6 +2332,9 @@ enum TokenSyms { TK_CC_OPEN, TK_QUOTE_OPEN, TK_CHAR_PROPERTY, /* \p{...}, \P{...} */ + TK_LINEBREAK, + TK_EXTENDED_GRAPHEME_CLUSTER, + TK_KEEP, /* in cc */ TK_CC_CLOSE, TK_CC_RANGE, @@ -2320,8 +2352,10 @@ typedef struct { UChar* s; int c; OnigCodePoint code; - int anchor; - int subtype; + struct { + int subtype; + int ascii_range; + } anchor; struct { int lower; int upper; @@ -2342,6 +2376,7 @@ typedef struct { UChar* name; UChar* name_end; int gnum; + int rel; } call; struct { int ctype; @@ -2523,6 +2558,8 @@ get_name_end_code_point(OnigCodePoint start) switch (start) { case '<': return (OnigCodePoint )'>'; break; case '\'': return (OnigCodePoint )'\''; break; + case '(': return (OnigCodePoint )')'; break; + case '{': return (OnigCodePoint )'}'; break; default: break; } @@ -2648,7 +2685,7 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, #endif /* USE_BACKREF_WITH_LEVEL */ /* - def: 0 -> define name (don't allow number name) + ref: 0 -> define name (don't allow number name) 1 -> reference name (allow number name) */ static int @@ -2883,8 +2920,8 @@ CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c) { if (onig_warn == onig_null_warn) return ; - if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) { - onig_syntax_warn(env, "regular expression has '%s' without escape", c); + if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) { + onig_syntax_warn(env, "regular expression has '%s' without escape", c); } } @@ -2893,9 +2930,9 @@ CC_DUP_WARN(ScanEnv *env) { if (onig_warn == onig_null_warn || !RTEST(ruby_verbose)) return ; - if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_DUP) && - 
!((env)->warnings_flag & ONIG_SYN_WARN_CC_DUP)) { - (env)->warnings_flag |= ONIG_SYN_WARN_CC_DUP; + if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_DUP) && + !(env->warnings_flag & ONIG_SYN_WARN_CC_DUP)) { + env->warnings_flag |= ONIG_SYN_WARN_CC_DUP; onig_syntax_warn(env, "character class has duplicated range"); } } @@ -3014,32 +3051,32 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) switch (c) { case 'w': tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_W; + tok->u.prop.ctype = ONIGENC_CTYPE_WORD; tok->u.prop.not = 0; break; case 'W': tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_W; + tok->u.prop.ctype = ONIGENC_CTYPE_WORD; tok->u.prop.not = 1; break; case 'd': tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_D; + tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; tok->u.prop.not = 0; break; case 'D': tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_D; + tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; tok->u.prop.not = 1; break; case 's': tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_S; + tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; tok->u.prop.not = 0; break; case 'S': tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_S; + tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; tok->u.prop.not = 1; break; case 'h': @@ -3074,7 +3111,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } } else { - onig_syntax_warn(env, "invalid Unicode Property \\%c", c); + onig_syntax_warn(env, "invalid Unicode Property \\%c", c); } break; @@ -3084,7 +3121,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { PINC; - num = scan_unsigned_hexadecimal_number(&p, end, 8, enc); + num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc); if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; if (!PEND) { c2 = PPEEK; @@ -3104,7 +3141,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, 
UChar* end, ScanEnv* env) } } else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { - num = scan_unsigned_hexadecimal_number(&p, end, 2, enc); + num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ num = 0; /* but, it's not error */ @@ -3120,8 +3157,9 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { - num = scan_unsigned_hexadecimal_number(&p, end, 4, enc); - if (num < 0) return ONIGERR_TOO_BIG_NUMBER; + num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc); + if (num < -1) return ONIGERR_TOO_SHORT_DIGITS; + else if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ num = 0; /* but, it's not error */ } @@ -3195,6 +3233,82 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) return tok->type; } +#ifdef USE_NAMED_GROUP +static int +fetch_named_backref_token(OnigCodePoint c, OnigToken* tok, UChar** src, + UChar* end, ScanEnv* env) +{ + int r, num; + OnigEncoding enc = env->enc; + const OnigSyntaxType* syn = env->syntax; + UChar* prev; + UChar* p = *src; + UChar* name_end; + int* backs; + int back_num; + + prev = p; + +#ifdef USE_BACKREF_WITH_LEVEL + name_end = NULL_UCHARP; /* no need. escape gcc warning. 
*/ + r = fetch_name_with_level(c, &p, end, &name_end, + env, &back_num, &tok->u.backref.level); + if (r == 1) tok->u.backref.exist_level = 1; + else tok->u.backref.exist_level = 0; +#else + r = fetch_name(&p, end, &name_end, env, &back_num, 1); +#endif + if (r < 0) return r; + + if (back_num != 0) { + if (back_num < 0) { + back_num = BACKREF_REL_TO_ABS(back_num, env); + if (back_num <= 0) + return ONIGERR_INVALID_BACKREF; + } + + if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { + if (back_num > env->num_mem || + IS_NULL(SCANENV_MEM_NODES(env)[back_num])) + return ONIGERR_INVALID_BACKREF; + } + tok->type = TK_BACKREF; + tok->u.backref.by_name = 0; + tok->u.backref.num = 1; + tok->u.backref.ref1 = back_num; + } + else { + num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs); + if (num <= 0) { + onig_scan_env_set_error_string(env, + ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end); + return ONIGERR_UNDEFINED_NAME_REFERENCE; + } + if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { + int i; + for (i = 0; i < num; i++) { + if (backs[i] > env->num_mem || + IS_NULL(SCANENV_MEM_NODES(env)[backs[i]])) + return ONIGERR_INVALID_BACKREF; + } + } + + tok->type = TK_BACKREF; + tok->u.backref.by_name = 1; + if (num == 1) { + tok->u.backref.num = 1; + tok->u.backref.ref1 = backs[0]; + } + else { + tok->u.backref.num = num; + tok->u.backref.refs = backs; + } + } + *src = p; + return 0; +} +#endif + static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) { @@ -3304,68 +3418,74 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case 'w': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break; tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_W; + tok->u.prop.ctype = ONIGENC_CTYPE_WORD; tok->u.prop.not = 0; break; case 'W': if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break; tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_W; + tok->u.prop.ctype = ONIGENC_CTYPE_WORD; tok->u.prop.not = 1; break; case 'b': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break; tok->type = TK_ANCHOR; - tok->u.anchor = ANCHOR_WORD_BOUND; + tok->u.anchor.subtype = ANCHOR_WORD_BOUND; + tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option) + && ! IS_WORD_BOUND_ALL_RANGE(env->option); break; case 'B': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break; tok->type = TK_ANCHOR; - tok->u.anchor = ANCHOR_NOT_WORD_BOUND; + tok->u.anchor.subtype = ANCHOR_NOT_WORD_BOUND; + tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option) + && ! IS_WORD_BOUND_ALL_RANGE(env->option); break; #ifdef USE_WORD_BEGIN_END case '<': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break; tok->type = TK_ANCHOR; - tok->u.anchor = ANCHOR_WORD_BEGIN; + tok->u.anchor.subtype = ANCHOR_WORD_BEGIN; + tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option); break; case '>': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break; tok->type = TK_ANCHOR; - tok->u.anchor = ANCHOR_WORD_END; + tok->u.anchor.subtype = ANCHOR_WORD_END; + tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option); break; #endif case 's': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break; tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_S; + tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; tok->u.prop.not = 0; break; case 'S': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break; tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_S; + tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; tok->u.prop.not = 1; break; case 'd': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break; tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_D; + tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; tok->u.prop.not = 0; break; case 'D': if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break; tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_D; + tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; tok->u.prop.not = 1; break; @@ -3387,26 +3507,26 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; begin_buf: tok->type = TK_ANCHOR; - tok->u.subtype = ANCHOR_BEGIN_BUF; + tok->u.anchor.subtype = ANCHOR_BEGIN_BUF; break; case 'Z': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; tok->type = TK_ANCHOR; - tok->u.subtype = ANCHOR_SEMI_END_BUF; + tok->u.anchor.subtype = ANCHOR_SEMI_END_BUF; break; case 'z': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; end_buf: tok->type = TK_ANCHOR; - tok->u.subtype = ANCHOR_END_BUF; + tok->u.anchor.subtype = ANCHOR_END_BUF; break; case 'G': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break; tok->type = TK_ANCHOR; - tok->u.subtype = ANCHOR_BEGIN_POSITION; + tok->u.anchor.subtype = ANCHOR_BEGIN_POSITION; break; case '`': @@ -3425,7 +3545,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { PINC; - num = scan_unsigned_hexadecimal_number(&p, end, 8, enc); + num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc); if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; if (!PEND) { if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK)) @@ -3443,7 +3563,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } } else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { - num = scan_unsigned_hexadecimal_number(&p, end, 2, enc); + num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc); if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. 
*/ num = 0; /* but, it's not error */ @@ -3459,8 +3579,9 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { - num = scan_unsigned_hexadecimal_number(&p, end, 4, enc); - if (num < 0) return ONIGERR_TOO_BIG_NUMBER; + num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc); + if (num < -1) return ONIGERR_TOO_SHORT_DIGITS; + else if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ num = 0; /* but, it's not error */ } @@ -3527,98 +3648,69 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) { PFETCH(c); if (c == '<' || c == '\'') { - UChar* name_end; - int* backs; - int back_num; - - prev = p; - -#ifdef USE_BACKREF_WITH_LEVEL - name_end = NULL_UCHARP; /* no need. escape gcc warning. */ - r = fetch_name_with_level((OnigCodePoint )c, &p, end, &name_end, - env, &back_num, &tok->u.backref.level); - if (r == 1) tok->u.backref.exist_level = 1; - else tok->u.backref.exist_level = 0; -#else - r = fetch_name(&p, end, &name_end, env, &back_num, 1); -#endif + r = fetch_named_backref_token(c, tok, &p, end, env); if (r < 0) return r; - - if (back_num != 0) { - if (back_num < 0) { - back_num = BACKREF_REL_TO_ABS(back_num, env); - if (back_num <= 0) - return ONIGERR_INVALID_BACKREF; - } - - if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { - if (back_num > env->num_mem || - IS_NULL(SCANENV_MEM_NODES(env)[back_num])) - return ONIGERR_INVALID_BACKREF; - } - tok->type = TK_BACKREF; - tok->u.backref.by_name = 0; - tok->u.backref.num = 1; - tok->u.backref.ref1 = back_num; - } - else { - num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs); - if (num <= 0) { - onig_scan_env_set_error_string(env, - ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end); - return ONIGERR_UNDEFINED_NAME_REFERENCE; - } - if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { - int i; - for (i = 0; i < num; 
i++) { - if (backs[i] > env->num_mem || - IS_NULL(SCANENV_MEM_NODES(env)[backs[i]])) - return ONIGERR_INVALID_BACKREF; - } - } - - tok->type = TK_BACKREF; - tok->u.backref.by_name = 1; - if (num == 1) { - tok->u.backref.num = 1; - tok->u.backref.ref1 = backs[0]; - } - else { - tok->u.backref.num = num; - tok->u.backref.refs = backs; - } - } } else { - PUNFETCH; - onig_syntax_warn(env, "invalid back reference"); + PUNFETCH; + onig_syntax_warn(env, "invalid back reference"); } } break; #endif -#ifdef USE_SUBEXP_CALL +#if defined(USE_SUBEXP_CALL) || defined(USE_NAMED_GROUP) case 'g': +#ifdef USE_NAMED_GROUP + if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_BRACE_BACKREF)) { + PFETCH(c); + if (c == '{') { + r = fetch_named_backref_token(c, tok, &p, end, env); + if (r < 0) return r; + } + else + PUNFETCH; + } +#endif +#ifdef USE_SUBEXP_CALL if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) { PFETCH(c); if (c == '<' || c == '\'') { - int gnum; + int gnum = -1, rel = 0; UChar* name_end; - + OnigCodePoint cnext; + + cnext = PPEEK; + if (cnext == '0') { + PINC; + if (PPEEK_IS(get_name_end_code_point(c))) { // \g<0>, \g'0' + PINC; + name_end = p; + gnum = 0; + } + } + else if (cnext == '+') { + PINC; + rel = 1; + } prev = p; - r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1); - if (r < 0) return r; + if (gnum < 0) { + r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1); + if (r < 0) return r; + } tok->type = TK_CALL; tok->u.call.name = prev; tok->u.call.name_end = name_end; tok->u.call.gnum = gnum; + tok->u.call.rel = rel; } else { - onig_syntax_warn(env, "invalid subexp call"); - PUNFETCH; + onig_syntax_warn(env, "invalid subexp call"); + PUNFETCH; } } +#endif break; #endif @@ -3646,7 +3738,25 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } } else { - onig_syntax_warn(env, "invalid Unicode Property \\%c", c); + onig_syntax_warn(env, "invalid Unicode Property \\%c", c); + } + break; + + case 'R': + if 
(IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK)) { + tok->type = TK_LINEBREAK; + } + break; + + case 'X': + if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER)) { + tok->type = TK_EXTENDED_GRAPHEME_CLUSTER; + } + break; + + case 'K': + if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP)) { + tok->type = TK_KEEP; } break; @@ -3766,6 +3876,97 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } goto start; } +#ifdef USE_PERL_SUBEXP_CALL + /* (?&name), (?n), (?R), (?0), (?+n), (?-n) */ + c = PPEEK; + if ((c == '&' || c == 'R' || ONIGENC_IS_CODE_DIGIT(enc, c)) && + IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_SUBEXP_CALL)) { + /* (?&name), (?n), (?R), (?0) */ + int gnum; + UChar *name; + UChar *name_end; + + if (c == 'R' || c == '0') { + PINC; /* skip 'R' / '0' */ + if (!PPEEK_IS(')')) return ONIGERR_INVALID_GROUP_NAME; + PINC; /* skip ')' */ + name_end = name = p; + gnum = 0; + } + else { + int numref = 1; + if (c == '&') { /* (?&name) */ + PINC; + numref = 0; /* don't allow number name */ + } + name = p; + r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, numref); + if (r < 0) return r; + } + + tok->type = TK_CALL; + tok->u.call.name = name; + tok->u.call.name_end = name_end; + tok->u.call.gnum = gnum; + tok->u.call.rel = 0; + break; + } + else if ((c == '-' || c == '+') && + IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_SUBEXP_CALL)) { + /* (?+n), (?-n) */ + int gnum; + UChar *name; + UChar *name_end; + OnigCodePoint cnext; + PFETCH_READY; + + PINC; /* skip '-' / '+' */ + cnext = PPEEK; + if (ONIGENC_IS_CODE_DIGIT(enc, cnext)) { + if (c == '-') PUNFETCH; + name = p; + r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, 1); + if (r < 0) return r; + + tok->type = TK_CALL; + tok->u.call.name = name; + tok->u.call.name_end = name_end; + tok->u.call.gnum = gnum; + tok->u.call.rel = 1; + break; + } + } +#endif /* USE_PERL_SUBEXP_CALL */ +#ifdef USE_CAPITAL_P_NAMED_GROUP + if 
(PPEEK_IS('P') && + IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP)) { + int gnum; + UChar *name; + UChar *name_end; + PFETCH_READY; + + PINC; /* skip 'P' */ + PFETCH(c); + if (c == '=') { /* (?P=name): backref */ + r = fetch_named_backref_token((OnigCodePoint )'(', tok, &p, end, env); + if (r < 0) return r; + break; + } + else if (c == '>') { /* (?P>name): subexp call */ + name = p; + r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, 0); + if (r < 0) return r; + + tok->type = TK_CALL; + tok->u.call.name = name; + tok->u.call.name_end = name_end; + tok->u.call.gnum = gnum; + tok->u.call.rel = 0; + break; + } + PUNFETCH; + } +#endif /* USE_CAPITAL_P_NAMED_GROUP */ PUNFETCH; } @@ -3781,15 +3982,15 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case '^': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break; tok->type = TK_ANCHOR; - tok->u.subtype = (IS_SINGLELINE(env->option) - ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE); + tok->u.anchor.subtype = (IS_SINGLELINE(env->option) + ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE); break; case '$': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break; tok->type = TK_ANCHOR; - tok->u.subtype = (IS_SINGLELINE(env->option) - ? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE); + tok->u.anchor.subtype = (IS_SINGLELINE(env->option) + ? 
ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE); break; case '[': @@ -3906,43 +4107,44 @@ add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not, } static int -add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) +add_ctype_to_cc(CClassNode* cc, int ctype, int not, int char_prop, ScanEnv* env) { + int maxcode, ascii_range; int c, r; const OnigCodePoint *ranges; OnigCodePoint sb_out; OnigEncoding enc = env->enc; + OnigOptionType option = env->option; - switch (ctype) { - case ONIGENC_CTYPE_D: - case ONIGENC_CTYPE_S: - case ONIGENC_CTYPE_W: - ctype ^= ONIGENC_CTYPE_SPECIAL_MASK; - if (not != 0) { - for (c = 0; c < SINGLE_BYTE_SIZE; c++) { - if (! ONIGENC_IS_ASCII_CODE_CTYPE((OnigCodePoint )c, ctype)) - BITSET_SET_BIT_CHKDUP(cc->bs, c); - } - ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); - } - else { - for (c = 0; c < SINGLE_BYTE_SIZE; c++) { - if (ONIGENC_IS_ASCII_CODE_CTYPE((OnigCodePoint )c, ctype)) - BITSET_SET_BIT_CHKDUP(cc->bs, c); - } - } - return 0; - break; - } + ascii_range = IS_ASCII_RANGE(option) && (char_prop == 0); r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges); if (r == 0) { - return add_ctype_to_cc_by_range(cc, ctype, not, env, sb_out, ranges); + r = add_ctype_to_cc_by_range(cc, ctype, not, env, sb_out, ranges); + if ((r == 0) && ascii_range) { + if (not != 0) { + r = add_code_range_to_buf(&(cc->mbuf), env, 0x80, ONIG_LAST_CODE_POINT); + } + else { + CClassNode ccascii; + initialize_cclass(&ccascii); + if (ONIGENC_MBC_MINLEN(env->enc) > 1) { + add_code_range(&(ccascii.mbuf), env, 0x00, 0x7F); + } + else { + bitset_set_range(env, ccascii.bs, 0x00, 0x7F); + } + r = and_cclass(cc, &ccascii, env); + if (IS_NOT_NULL(ccascii.mbuf)) bbuf_free(ccascii.mbuf); + } + } + return r; } else if (r != ONIG_NO_SUPPORT_CONFIG) { return r; } + maxcode = ascii_range ? 
0x80 : SINGLE_BYTE_SIZE; r = 0; switch (ctype) { case ONIGENC_CTYPE_ALPHA: @@ -3975,32 +4177,39 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) case ONIGENC_CTYPE_PRINT: if (not != 0) { for (c = 0; c < SINGLE_BYTE_SIZE; c++) { - if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) + if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype) + || c >= maxcode) BITSET_SET_BIT_CHKDUP(cc->bs, c); } + if (ascii_range) + ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); } else { - for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + for (c = 0; c < maxcode; c++) { if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) BITSET_SET_BIT_CHKDUP(cc->bs, c); } - ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); + if (! ascii_range) + ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); } break; case ONIGENC_CTYPE_WORD: if (not == 0) { - for (c = 0; c < SINGLE_BYTE_SIZE; c++) { - if (IS_CODE_SB_WORD(enc, c)) BITSET_SET_BIT_CHKDUP(cc->bs, c); + for (c = 0; c < maxcode; c++) { + if (ONIGENC_IS_CODE_WORD(enc, c)) BITSET_SET_BIT_CHKDUP(cc->bs, c); } - ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); + if (! ascii_range) + ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); } else { for (c = 0; c < SINGLE_BYTE_SIZE; c++) { if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* check invalid code point */ - && ! ONIGENC_IS_CODE_WORD(enc, c)) + && (! 
ONIGENC_IS_CODE_WORD(enc, c) || c >= maxcode)) BITSET_SET_BIT_CHKDUP(cc->bs, c); } + if (ascii_range) + ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); } break; @@ -4059,7 +4268,9 @@ parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0) return ONIGERR_INVALID_POSIX_BRACKET_TYPE; - r = add_ctype_to_cc(cc, pb->ctype, not, env); + r = add_ctype_to_cc(cc, pb->ctype, not, + IS_POSIX_BRACKET_ALL_RANGE(env->option), + env); if (r != 0) return r; PINC; PINC; @@ -4132,7 +4343,7 @@ parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end, *np = node_new_cclass(); CHECK_NULL_RETURN_MEMERR(*np); cc = NCCLASS(*np); - r = add_ctype_to_cc(cc, ctype, 0, env); + r = add_ctype_to_cc(cc, ctype, 0, 1, env); if (r != 0) return r; if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); @@ -4424,7 +4635,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, break; case TK_CHAR_TYPE: - r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env); + r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, 0, env); if (r != 0) return r; next_class: @@ -4438,7 +4649,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, ctype = fetch_char_property_to_ctype(&p, end, env); if (ctype < 0) return ctype; - r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env); + r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, 1, env); if (r != 0) return r; goto next_class; } @@ -4586,8 +4797,10 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) { if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1) BITSET_SET_BIT_CHKDUP(cc->bs, NEWLINE_CODE); - else - add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE); + else { + r = add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE); + if (r < 0) goto err; + } } } } @@ -4607,8 +4820,8 @@ static int parse_enclose(Node** np, OnigToken* tok, int term, 
UChar** src, UChar* end, ScanEnv* env) { - int r, num; - Node *target; + int r = 0, num; + Node *target, *work1 = NULL, *work2 = NULL; OnigOptionType option; OnigCodePoint c; OnigEncoding enc = env->enc; @@ -4644,10 +4857,10 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, case '=': *np = onig_node_new_anchor(ANCHOR_PREC_READ); break; - case '!': /* preceding read */ + case '!': /* preceding read */ *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT); break; - case '>': /* (?>...) stop backtrack */ + case '>': /* (?>...) stop backtrack */ *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK); break; @@ -4659,6 +4872,16 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, else return ONIGERR_UNDEFINED_GROUP_OPTION; break; + +#ifdef USE_CAPITAL_P_NAMED_GROUP + case 'P': /* (?P...) */ + if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP)) { + PFETCH(c); + if (c == '<') goto named_group1; + } + return ONIGERR_UNDEFINED_GROUP_OPTION; + break; +#endif #endif case '<': /* look behind (?<=...), (?...) 
*/ if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { UChar *name; UChar *name_end; @@ -4740,10 +4963,104 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, } break; + case '(': /* conditional expression: (?(cond)yes), (?(cond)yes|no) */ + if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LPAREN_CONDITION)) { + UChar *name = NULL; + UChar *name_end; + PFETCH(c); + if (ONIGENC_IS_CODE_DIGIT(enc, c)) { /* (n) */ + PUNFETCH; + r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &num, 1); + if (r < 0) return r; + if (num < 0) { + num = BACKREF_REL_TO_ABS(num, env); + if (num <= 0) + return ONIGERR_INVALID_BACKREF; + } + if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) { + if (num > env->num_mem || + IS_NULL(SCANENV_MEM_NODES(env)[num])) + return ONIGERR_INVALID_BACKREF; + } + } +#ifdef USE_NAMED_GROUP + else if (c == '<' || c == '\'') { /* (<name>), ('name') */ + int nums; + int *backs; + + name = p; + r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0); + if (r < 0) return r; + PFETCH(c); + if (c != ')') return ONIGERR_UNDEFINED_GROUP_OPTION; + + nums = onig_name_to_group_numbers(env->reg, name, name_end, &backs); + if (nums <= 0) { + onig_scan_env_set_error_string(env, + ONIGERR_UNDEFINED_NAME_REFERENCE, name, name_end); + return ONIGERR_UNDEFINED_NAME_REFERENCE; + } + if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) { + int i; + for (i = 0; i < nums; i++) { + if (backs[i] > env->num_mem || + IS_NULL(SCANENV_MEM_NODES(env)[backs[i]])) + return ONIGERR_INVALID_BACKREF; + } + } + num = backs[0]; /* XXX: use left most named group as Perl */ + } +#endif + *np = node_new_enclose(ENCLOSE_CONDITION); + CHECK_NULL_RETURN_MEMERR(*np); + NENCLOSE(*np)->regnum = num; + if (IS_NOT_NULL(name)) NENCLOSE(*np)->state |= NST_NAME_REF; + } + else + return ONIGERR_UNDEFINED_GROUP_OPTION; + break; + +#if 0 + case '|': /* branch reset: (?|...) 
*/ + if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_VBAR_BRANCH_RESET)) { + /* TODO */ + } + else + return ONIGERR_UNDEFINED_GROUP_OPTION; + break; +#endif + + case '^': /* loads default options */ + if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { + /* d-imsx */ + ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1); + ONOFF(option, ONIG_OPTION_IGNORECASE, 1); + ONOFF(option, ONIG_OPTION_SINGLELINE, 0); + ONOFF(option, ONIG_OPTION_MULTILINE, 1); + ONOFF(option, ONIG_OPTION_EXTEND, 1); + PFETCH(c); + } +#if 0 + else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) { + /* d-imx */ + ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0); + ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0); + ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 0); + ONOFF(option, ONIG_OPTION_IGNORECASE, 1); + ONOFF(option, ONIG_OPTION_MULTILINE, 1); + ONOFF(option, ONIG_OPTION_EXTEND, 1); + PFETCH(c); + } +#endif + else { + return ONIGERR_UNDEFINED_GROUP_OPTION; + } + /* fall through */ #ifdef USE_POSIXLINE_OPTION case 'p': #endif case '-': case 'i': case 'm': case 's': case 'x': + case 'a': case 'd': case 'l': case 'u': { int neg = 0; @@ -4779,6 +5096,54 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg); break; #endif + + case 'a': /* limits \d, \s, \w and POSIX brackets to ASCII range */ + if ((IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) || + IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) && + (neg == 0)) { + ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0); + ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1); + ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1); + } + else + return ONIGERR_UNDEFINED_GROUP_OPTION; + break; + + case 'u': + if ((IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) || + IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) && + (neg == 0)) { + ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1); + ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1); + 
ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1); + } + else + return ONIGERR_UNDEFINED_GROUP_OPTION; + break; + + case 'd': + if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) && + (neg == 0)) { + ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1); + } + else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY) && + (neg == 0)) { + ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0); + ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0); + ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 0); + } + else + return ONIGERR_UNDEFINED_GROUP_OPTION; + break; + + case 'l': + if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) && (neg == 0)) { + ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1); + } + else + return ONIGERR_UNDEFINED_GROUP_OPTION; + break; + default: return ONIGERR_UNDEFINED_GROUP_OPTION; } @@ -4844,10 +5209,29 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np); if (r != 0) return r; } + else if (NENCLOSE(*np)->type == ENCLOSE_CONDITION) { + if (NTYPE(target) != NT_ALT) { + /* convert (?(cond)yes) to (?(cond)yes|empty) */ + work1 = node_new_empty(); + if (IS_NULL(work1)) goto err; + work2 = onig_node_new_alt(work1, NULL_NODE); + if (IS_NULL(work2)) goto err; + work1 = onig_node_new_alt(target, work2); + if (IS_NULL(work1)) goto err; + NENCLOSE(*np)->target = work1; + } + } } *src = p; return 0; + + err: + onig_node_free(work1); + onig_node_free(work2); + onig_node_free(*np); + *np = NULL; + return ONIGERR_MEMORY; } static const char* const PopularQStr[] = { @@ -5140,6 +5524,209 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], return 0; } +static int +node_linebreak(Node** np, ScanEnv* env) +{ + /* same as (?>\x0D\x0A|[\x0A-\x0D\x{85}\x{2028}\x{2029}]) */ + Node* left = NULL; + Node* right = NULL; + Node* target1 = NULL; + Node* target2 = NULL; + CClassNode* cc; + int num1, num2; + UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2]; + + /* \x0D\x0A */ + num1 = 
ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf); + if (num1 < 0) return num1; + num2 = ONIGENC_CODE_TO_MBC(env->enc, 0x0A, buf + num1); + if (num2 < 0) return num2; + left = node_new_str_raw(buf, buf + num1 + num2); + if (IS_NULL(left)) goto err; + + /* [\x0A-\x0D] or [\x0A-\x0D\x{85}\x{2028}\x{2029}] */ + right = node_new_cclass(); + if (IS_NULL(right)) goto err; + cc = NCCLASS(right); + if (ONIGENC_MBC_MINLEN(env->enc) > 1) { + add_code_range(&(cc->mbuf), env, 0x0A, 0x0D); + } + else { + bitset_set_range(env, cc->bs, 0x0A, 0x0D); + } + + /* TODO: move this block to enc/unicode.c */ + if (ONIGENC_IS_UNICODE(env->enc)) { + /* UTF-8, UTF-16BE/LE, UTF-32BE/LE */ + add_code_range(&(cc->mbuf), env, 0x85, 0x85); + add_code_range(&(cc->mbuf), env, 0x2028, 0x2029); + } + + /* ...|... */ + target1 = onig_node_new_alt(right, NULL_NODE); + if (IS_NULL(target1)) goto err; + right = NULL; + target2 = onig_node_new_alt(left, target1); + if (IS_NULL(target2)) goto err; + left = NULL; + target1 = NULL; + + /* (?>...) 
*/ + *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK); + if (IS_NULL(*np)) goto err; + NENCLOSE(*np)->target = target2; + return ONIG_NORMAL; + + err: + onig_node_free(left); + onig_node_free(right); + onig_node_free(target1); + onig_node_free(target2); + return ONIGERR_MEMORY; +} + +static int +node_extended_grapheme_cluster(Node** np, ScanEnv* env) +{ + /* same as (?>\P{M}\p{M}*) */ + Node* np1 = NULL; + Node* np2 = NULL; + Node* qn = NULL; + Node* list1 = NULL; + Node* list2 = NULL; + int r = 0; + +#ifdef USE_UNICODE_PROPERTIES + if (ONIGENC_IS_UNICODE(env->enc)) { + /* UTF-8, UTF-16BE/LE, UTF-32BE/LE */ + CClassNode* cc1; + CClassNode* cc2; + UChar* propname = (UChar* )"M"; + int ctype = env->enc->property_name_to_ctype(ONIG_ENCODING_ASCII, + propname, propname + 1); + if (ctype >= 0) { + /* \P{M} */ + np1 = node_new_cclass(); + if (IS_NULL(np1)) goto err; + cc1 = NCCLASS(np1); + r = add_ctype_to_cc(cc1, ctype, 0, 1, env); + if (r != 0) goto err; + NCCLASS_SET_NOT(cc1); + + /* \p{M}* */ + np2 = node_new_cclass(); + if (IS_NULL(np2)) goto err; + cc2 = NCCLASS(np2); + r = add_ctype_to_cc(cc2, ctype, 0, 1, env); + if (r != 0) goto err; + + qn = node_new_quantifier(0, REPEAT_INFINITE, 0); + if (IS_NULL(qn)) goto err; + NQTFR(qn)->target = np2; + np2 = NULL; + + /* \P{M}\p{M}* */ + list2 = node_new_list(qn, NULL_NODE); + if (IS_NULL(list2)) goto err; + qn = NULL; + list1 = node_new_list(np1, list2); + if (IS_NULL(list1)) goto err; + np1 = NULL; + list2 = NULL; + + /* (?>...) */ + *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK); + if (IS_NULL(*np)) goto err; + NENCLOSE(*np)->target = list1; + return ONIG_NORMAL; + } + } +#endif /* USE_UNICODE_PROPERTIES */ + if (IS_NULL(*np)) { + /* PerlSyntax: (?s:.), RubySyntax: (?m:.) 
*/ + OnigOptionType option; + np1 = node_new_anychar(); + if (IS_NULL(np1)) goto err; + + option = env->option; + ONOFF(option, ONIG_OPTION_MULTILINE, 0); + *np = node_new_option(option); + if (IS_NULL(*np)) goto err; + NENCLOSE(*np)->target = np1; + } + return ONIG_NORMAL; + + err: + onig_node_free(np1); + onig_node_free(np2); + onig_node_free(qn); + onig_node_free(list1); + onig_node_free(list2); + return (r == 0) ? ONIGERR_MEMORY : r; +} + +static int +countbits(unsigned int bits) { + bits = (bits & 0x55555555) + ((bits >> 1) & 0x55555555); + bits = (bits & 0x33333333) + ((bits >> 2) & 0x33333333); + bits = (bits & 0x0f0f0f0f) + ((bits >> 4) & 0x0f0f0f0f); + bits = (bits & 0x00ff00ff) + ((bits >> 8) & 0x00ff00ff); + return (bits & 0x0000ffff) + ((bits >>16) & 0x0000ffff); +} + +static int +is_onechar_cclass(CClassNode* cc, OnigCodePoint* code) +{ + OnigCodePoint c; + int found = 0; + int i, j = -1; + Bits b1, b2; + BBuf *bbuf = cc->mbuf; + + if (IS_NCCLASS_NOT(cc)) return 0; + + /* check bbuf */ + if (IS_NOT_NULL(bbuf)) { + OnigCodePoint n, *data; + GET_CODE_POINT(n, bbuf->p); + data = (OnigCodePoint* )(bbuf->p) + 1; + if ((n == 1) && (data[0] == data[1])) { + /* only one char found in the bbuf, save the code point. */ + found = 1; + c = data[0]; + } + else { + return 0; /* the bbuf contains multiple chars */ + } + } + + if (found && (c < SINGLE_BYTE_SIZE) && BITSET_AT(cc->bs, c)) { + /* c is included in the bitset, ignore the result of bbuf. */ + found = 0; + } + + /* check bitset */ + for (i = 0; i < (int )BITSET_SIZE; i++) { + b1 = cc->bs[i]; + if (b1 != 0) { + if (((b1 & (b1 - 1)) == 0) && (found == 0)) { + found = 1; + j = i; + b2 = b1; + } else { + return 0; /* the character class contains multiple chars */ + } + } + } + if (j >= 0) { + /* only one char found in the bitset, calculate the code point. 
*/ + c = BITS_IN_ROOM * j + (countbits(b2 - 1) & 0x1f); + } + *code = c; + return 1; +} + + static int parse_exp(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, ScanEnv* env) @@ -5155,9 +5742,10 @@ parse_exp(Node** np, OnigToken* tok, int term, switch (tok->type) { case TK_ALT: case TK_EOT: - end_of_token: + end_of_token: *np = node_new_empty(); return tok->type; + break; case TK_SUBEXP_OPEN: r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env); @@ -5189,18 +5777,42 @@ parse_exp(Node** np, OnigToken* tok, int term, else goto tk_byte; break; + case TK_LINEBREAK: + r = node_linebreak(np, env); + if (r < 0) return r; + break; + + case TK_EXTENDED_GRAPHEME_CLUSTER: + r = node_extended_grapheme_cluster(np, env); + if (r < 0) return r; + break; + + case TK_KEEP: + *np = onig_node_new_anchor(ANCHOR_KEEP); + CHECK_NULL_RETURN_MEMERR(*np); + break; + case TK_STRING: tk_byte: { *np = node_new_str(tok->backp, *src); CHECK_NULL_RETURN_MEMERR(*np); + string_loop: while (1) { r = fetch_token(tok, src, end, env); if (r < 0) return r; - if (r != TK_STRING) break; - - r = onig_node_str_cat(*np, tok->backp, *src); + if (r == TK_STRING) { + r = onig_node_str_cat(*np, tok->backp, *src); + } +#ifndef NUMBERED_CHAR_IS_NOT_CASE_AMBIG + else if (r == TK_CODE_POINT) { + r = node_str_cat_codepoint(*np, env->enc, tok->u.code); + } +#endif + else { + break; + } if (r < 0) return r; } @@ -5253,15 +5865,15 @@ parse_exp(Node** np, OnigToken* tok, int term, case TK_CODE_POINT: { - UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; - int num = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf); - if (num < 0) return num; + *np = node_new_empty(); + CHECK_NULL_RETURN_MEMERR(*np); + r = node_str_cat_codepoint(*np, env->enc, tok->u.code); + if (r != 0) return r; #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG - *np = node_new_str_raw(buf, buf + num); + NSTRING_SET_RAW(*np); #else - *np = node_new_str(buf, buf + num); + goto string_loop; #endif - CHECK_NULL_RETURN_MEMERR(*np); } break; @@ -5286,21 +5898,9 
@@ parse_exp(Node** np, OnigToken* tok, int term, case TK_CHAR_TYPE: { switch (tok->u.prop.ctype) { - case ONIGENC_CTYPE_D: - case ONIGENC_CTYPE_S: - case ONIGENC_CTYPE_W: - { - CClassNode* cc; - *np = node_new_cclass(); - CHECK_NULL_RETURN_MEMERR(*np); - cc = NCCLASS(*np); - add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env); - if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); - } - break; - case ONIGENC_CTYPE_WORD: - *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not); + *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not, + IS_ASCII_RANGE(env->option)); CHECK_NULL_RETURN_MEMERR(*np); break; @@ -5317,6 +5917,7 @@ parse_exp(Node** np, OnigToken* tok, int term, r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, tok->u.prop.ctype, &sb_out, &mbr); if (r == 0 && + ! IS_ASCII_RANGE(env->option) && ONIGENC_CODE_RANGE_NUM(mbr) >= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) { type_cclass_key key; @@ -5365,7 +5966,7 @@ parse_exp(Node** np, OnigToken* tok, int term, *np = node_new_cclass(); CHECK_NULL_RETURN_MEMERR(*np); cc = NCCLASS(*np); - add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env); + add_ctype_to_cc(cc, tok->u.prop.ctype, 0, 0, env); if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); #ifdef USE_SHARED_CCLASS_TABLE } @@ -5388,11 +5989,19 @@ parse_exp(Node** np, OnigToken* tok, int term, case TK_CC_OPEN: { CClassNode* cc; + OnigCodePoint code; r = parse_char_class(np, tok, src, end, env); if (r != 0) return r; cc = NCCLASS(*np); + if (is_onechar_cclass(cc, &code)) { + *np = node_new_empty(); + CHECK_NULL_RETURN_MEMERR(*np); + r = node_str_cat_codepoint(*np, env->enc, code); + if (r != 0) return r; + goto string_loop; + } if (IS_IGNORECASE(env->option)) { IApplyCaseFoldArg iarg; @@ -5451,7 +6060,8 @@ parse_exp(Node** np, OnigToken* tok, int term, { int gnum = tok->u.call.gnum; - if (gnum < 0) { + if (gnum < 0 || tok->u.call.rel != 0) { + if (gnum > 0) gnum--; gnum = BACKREF_REL_TO_ABS(gnum, env); if (gnum <= 0) return ONIGERR_INVALID_BACKREF; @@ -5464,7 +6074,9 @@ parse_exp(Node** np, 
OnigToken* tok, int term, #endif case TK_ANCHOR: - *np = onig_node_new_anchor(tok->u.anchor); + *np = onig_node_new_anchor(tok->u.anchor.subtype); + CHECK_NULL_RETURN_MEMERR(*np); + NANCHOR(*np)->ascii_range = tok->u.anchor.ascii_range; break; case TK_OP_REPEAT: @@ -5522,7 +6134,7 @@ parse_exp(Node** np, OnigToken* tok, int term, *targetp = qn; } else if (r == 1) { - onig_node_free(qn); + onig_node_free(qn); } else if (r == 2) { /* split case: /abc+/ */ Node *tmp; @@ -5647,6 +6259,21 @@ parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env) if (r < 0) return r; r = parse_subexp(top, &tok, TK_EOT, src, end, env); if (r < 0) return r; + +#ifdef USE_SUBEXP_CALL + if (env->num_call > 0) { + /* Capture the pattern itself. It is used for (?R), (?0) and \g<0>. */ + const int num = 0; + Node* np; + np = node_new_enclose_memory(env->option, 0); + CHECK_NULL_RETURN_MEMERR(np); + NENCLOSE(np)->regnum = num; + NENCLOSE(np)->target = *top; + r = scan_env_set_mem_node(env, num, np); + if (r != 0) return r; + *top = np; + } +#endif return 0; } -- cgit v1.2.3