summaryrefslogtreecommitdiff
path: root/regparse.c
diff options
context:
space:
mode:
authornaruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2012-02-17 07:42:23 +0000
committernaruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2012-02-17 07:42:23 +0000
commit0424e152c684a85f4b0691f1e84aec203115333d (patch)
treeccb4af1c7fa12a09934f315edba0b9d839e57ceb /regparse.c
parent5362e7fcddf70d16fc4261383f99175c90881353 (diff)
* Merge Onigmo-5.13.1. [ruby-dev:45057] [Feature #5820]
  https://github.com/k-takata/Onigmo
  cp reg{comp,enc,error,exec,parse,syntax}.c reg{enc,int,parse}.h
  cp oniguruma.h
  cp tool/enc-unicode.rb
  cp -r enc/

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@34663 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'regparse.c')
-rw-r--r--regparse.c1053
1 files changed, 840 insertions, 213 deletions
diff --git a/regparse.c b/regparse.c
index 7a58fe14c8..bc1343481d 100644
--- a/regparse.c
+++ b/regparse.c
@@ -1,9 +1,9 @@
-/* -*- mode:c; c-file-style:"gnu" -*- */
/**********************************************************************
- regparse.c - Oniguruma (regular expression library)
+ regparse.c - Onigmo (Oniguruma-mod) (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
+ * Copyright (c) 2011-2012 K.Takata <kentkt AT csc DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -50,7 +50,11 @@ const OnigSyntaxType OnigSyntaxRuby = {
ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
- ONIG_SYN_OP2_ESC_H_XDIGIT )
+ ONIG_SYN_OP2_ESC_H_XDIGIT |
+ ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER |
+ ONIG_SYN_OP2_QMARK_LPAREN_CONDITION |
+ ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK |
+ ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP )
, ( SYN_GNU_REGEX_BV |
ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
@@ -60,7 +64,8 @@ const OnigSyntaxType OnigSyntaxRuby = {
ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
ONIG_SYN_WARN_CC_DUP |
ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
- , ONIG_OPTION_NONE
+ , ( ONIG_OPTION_ASCII_RANGE | ONIG_OPTION_POSIX_BRACKET_ALL_RANGE |
+ ONIG_OPTION_WORD_BOUND_ALL_RANGE )
,
{
(OnigCodePoint )'\\' /* esc */
@@ -133,7 +138,7 @@ bbuf_clone(BBuf** rto, BBuf* from)
(OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)
#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
- add_code_range_to_buf(pbuf, env, MBCODE_START_POS(enc), ~((OnigCodePoint )0))
+ add_code_range_to_buf(pbuf, env, MBCODE_START_POS(enc), ONIG_LAST_CODE_POINT)
#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
@@ -677,7 +682,7 @@ onig_names_free(regex_t* reg)
}
static NameEntry*
-name_find(regex_t* reg, UChar* name, UChar* name_end)
+name_find(regex_t* reg, const UChar* name, const UChar* name_end)
{
int i, len;
NameEntry* e;
@@ -781,10 +786,12 @@ name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
}
else if (t->num == t->alloc) {
int i;
+ NameEntry* p;
alloc = t->alloc * 2;
- t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
- CHECK_NULL_RETURN_MEMERR(t->e);
+ p = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
+ CHECK_NULL_RETURN_MEMERR(p);
+ t->e = p;
t->alloc = alloc;
clear:
@@ -826,9 +833,11 @@ name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
}
else {
if (e->back_num > e->back_alloc) {
+ int* p;
alloc = e->back_alloc * 2;
- e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
- CHECK_NULL_RETURN_MEMERR(e->back_refs);
+ p = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
+ CHECK_NULL_RETURN_MEMERR(p);
+ e->back_refs = p;
e->back_alloc = alloc;
}
e->back_refs[e->back_num - 1] = backref;
@@ -1209,7 +1218,7 @@ node_new_cclass_by_codepoint_range(int not, OnigCodePoint sb_out,
}
static Node*
-node_new_ctype(int type, int not)
+node_new_ctype(int type, int not, int ascii_range)
{
Node* node = node_new();
CHECK_NULL_RETURN(node);
@@ -1217,6 +1226,7 @@ node_new_ctype(int type, int not)
SET_NTYPE(node, NT_CTYPE);
NCTYPE(node)->ctype = type;
NCTYPE(node)->not = not;
+ NCTYPE(node)->ascii_range = ascii_range;
return node;
}
@@ -1288,6 +1298,7 @@ onig_node_new_anchor(int type)
NANCHOR(node)->type = type;
NANCHOR(node)->target = NULL;
NANCHOR(node)->char_len = -1;
+ NANCHOR(node)->ascii_range = 0;
return node;
}
@@ -1455,7 +1466,7 @@ onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
CHECK_NULL_RETURN_MEMERR(p);
NSTR(node)->s = p;
- NSTR(node)->capa = (int)capa;
+ NSTR(node)->capa = (int )capa;
}
}
else {
@@ -1483,6 +1494,15 @@ node_str_cat_char(Node* node, UChar c)
return onig_node_str_cat(node, s, s + 1);
}
+/*
+ * Append code point `c` to the string node `node`.
+ *
+ * The code point is converted to bytes with ONIGENC_CODE_TO_MBC() for
+ * encoding `enc` into a local buffer of ONIGENC_CODE_TO_MBC_MAXLEN bytes,
+ * and those bytes are appended with onig_node_str_cat().
+ *
+ * Returns the (negative) value produced by ONIGENC_CODE_TO_MBC() unchanged
+ * when it is below zero; otherwise returns whatever onig_node_str_cat()
+ * returns.
+ */
+static int
+node_str_cat_codepoint(Node* node, OnigEncoding enc, OnigCodePoint c)
+{
+ UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
+ int num = ONIGENC_CODE_TO_MBC(enc, c, buf);
+ if (num < 0) return num; /* conversion reported an error: pass it on */
+ return onig_node_str_cat(node, buf, buf + num);
+}
+
extern void
onig_node_conv_to_str_node(Node* node, int flag)
{
@@ -1535,7 +1555,8 @@ static Node*
node_new_str_raw(UChar* s, UChar* end)
{
Node* node = node_new_str(s, end);
- NSTRING_SET_RAW(node);
+ if (IS_NOT_NULL(node))
+ NSTRING_SET_RAW(node);
return node;
}
@@ -1564,7 +1585,7 @@ str_node_split_last_char(StrNode* sn, OnigEncoding enc)
p = onigenc_get_prev_char_head(enc, sn->s, sn->end, sn->end);
if (p && p > sn->s) { /* can be splitted. */
n = node_new_str(p, sn->end);
- if ((sn->flag & NSTR_RAW) != 0)
+ if (IS_NOT_NULL(n) && (sn->flag & NSTR_RAW) != 0)
NSTRING_SET_RAW(n);
sn->end = (UChar* )p;
}
@@ -1627,14 +1648,16 @@ onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)
}
static int
-scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen,
- OnigEncoding enc)
+scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int minlen,
+ int maxlen, OnigEncoding enc)
{
OnigCodePoint c;
unsigned int num, val;
+ int restlen;
UChar* p = *src;
PFETCH_READY;
+ restlen = maxlen - minlen;
num = 0;
while (!PEND && maxlen-- != 0) {
PFETCH(c);
@@ -1650,6 +1673,8 @@ scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen,
break;
}
}
+ if (maxlen > restlen)
+ return -2; /* not enough digits */
*src = p;
return num;
}
@@ -1734,17 +1759,19 @@ add_code_range_to_buf0(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePo
data = (OnigCodePoint* )(bbuf->p);
data++;
- for (low = 0, bound = n; low < bound; ) {
+ bound = (from == 0) ? 0 : n;
+ for (low = 0; low < bound; ) {
x = (low + bound) >> 1;
- if (from > data[x*2 + 1])
+ if (from - 1 > data[x*2 + 1])
low = x + 1;
else
bound = x;
}
- for (high = low, bound = n; high < bound; ) {
+ high = (to == ONIG_LAST_CODE_POINT) ? n : low;
+ for (bound = n; high < bound; ) {
x = (high + bound) >> 1;
- if (to >= data[x*2] - 1)
+ if (to + 1 >= data[x*2])
high = x + 1;
else
bound = x;
@@ -1762,13 +1789,15 @@ add_code_range_to_buf0(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePo
to = data[(high - 1)*2 + 1];
}
- if (inc_n != 0 && (OnigCodePoint )high < n) {
+ if (inc_n != 0) {
int from_pos = SIZE_CODE_POINT * (1 + high * 2);
int to_pos = SIZE_CODE_POINT * (1 + (low + 1) * 2);
- int size = (n - high) * 2 * SIZE_CODE_POINT;
if (inc_n > 0) {
- BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
+ if ((OnigCodePoint )high < n) {
+ int size = (n - high) * 2 * SIZE_CODE_POINT;
+ BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
+ }
}
else {
BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
@@ -1836,11 +1865,11 @@ not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf, ScanEnv* env)
r = add_code_range_to_buf(pbuf, env, pre, from - 1);
if (r != 0) return r;
}
- if (to == ~((OnigCodePoint )0)) break;
+ if (to == ONIG_LAST_CODE_POINT) break;
pre = to + 1;
}
- if (to < ~((OnigCodePoint )0)) {
- r = add_code_range_to_buf(pbuf, env, to + 1, ~((OnigCodePoint )0));
+ if (to < ONIG_LAST_CODE_POINT) {
+ r = add_code_range_to_buf(pbuf, env, to + 1, ONIG_LAST_CODE_POINT);
}
return r;
}
@@ -2053,8 +2082,8 @@ and_cclass(CClassNode* dest, CClassNode* cc, ScanEnv* env)
}
}
if (r != 0) {
- bbuf_free(pbuf);
- return r;
+ bbuf_free(pbuf);
+ return r;
}
dest->mbuf = pbuf;
@@ -2111,8 +2140,8 @@ or_cclass(CClassNode* dest, CClassNode* cc, ScanEnv* env)
}
}
if (r != 0) {
- bbuf_free(pbuf);
- return r;
+ bbuf_free(pbuf);
+ return r;
}
dest->mbuf = pbuf;
@@ -2151,7 +2180,9 @@ conv_backslash_value(int c, ScanEnv* env)
return c;
}
-#if 0 /* no invalid quantifier */
+#ifdef USE_NO_INVALID_QUANTIFIER
+#define is_invalid_quantifier_target(node) 0
+#else
static int
is_invalid_quantifier_target(Node* node)
{
@@ -2183,8 +2214,6 @@ is_invalid_quantifier_target(Node* node)
}
return 0;
}
-#else
-#define is_invalid_quantifier_target(node) 0
#endif
/* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */
@@ -2303,6 +2332,9 @@ enum TokenSyms {
TK_CC_OPEN,
TK_QUOTE_OPEN,
TK_CHAR_PROPERTY, /* \p{...}, \P{...} */
+ TK_LINEBREAK,
+ TK_EXTENDED_GRAPHEME_CLUSTER,
+ TK_KEEP,
/* in cc */
TK_CC_CLOSE,
TK_CC_RANGE,
@@ -2320,8 +2352,10 @@ typedef struct {
UChar* s;
int c;
OnigCodePoint code;
- int anchor;
- int subtype;
+ struct {
+ int subtype;
+ int ascii_range;
+ } anchor;
struct {
int lower;
int upper;
@@ -2342,6 +2376,7 @@ typedef struct {
UChar* name;
UChar* name_end;
int gnum;
+ int rel;
} call;
struct {
int ctype;
@@ -2523,6 +2558,8 @@ get_name_end_code_point(OnigCodePoint start)
switch (start) {
case '<': return (OnigCodePoint )'>'; break;
case '\'': return (OnigCodePoint )'\''; break;
+ case '(': return (OnigCodePoint )')'; break;
+ case '{': return (OnigCodePoint )'}'; break;
default:
break;
}
@@ -2648,7 +2685,7 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
#endif /* USE_BACKREF_WITH_LEVEL */
/*
- def: 0 -> define name (don't allow number name)
+ ref: 0 -> define name (don't allow number name)
1 -> reference name (allow number name)
*/
static int
@@ -2883,8 +2920,8 @@ CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
{
if (onig_warn == onig_null_warn) return ;
- if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
- onig_syntax_warn(env, "regular expression has '%s' without escape", c);
+ if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
+ onig_syntax_warn(env, "regular expression has '%s' without escape", c);
}
}
@@ -2893,9 +2930,9 @@ CC_DUP_WARN(ScanEnv *env)
{
if (onig_warn == onig_null_warn || !RTEST(ruby_verbose)) return ;
- if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_DUP) &&
- !((env)->warnings_flag & ONIG_SYN_WARN_CC_DUP)) {
- (env)->warnings_flag |= ONIG_SYN_WARN_CC_DUP;
+ if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_DUP) &&
+ !(env->warnings_flag & ONIG_SYN_WARN_CC_DUP)) {
+ env->warnings_flag |= ONIG_SYN_WARN_CC_DUP;
onig_syntax_warn(env, "character class has duplicated range");
}
}
@@ -3014,32 +3051,32 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
switch (c) {
case 'w':
tok->type = TK_CHAR_TYPE;
- tok->u.prop.ctype = ONIGENC_CTYPE_W;
+ tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
tok->u.prop.not = 0;
break;
case 'W':
tok->type = TK_CHAR_TYPE;
- tok->u.prop.ctype = ONIGENC_CTYPE_W;
+ tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
tok->u.prop.not = 1;
break;
case 'd':
tok->type = TK_CHAR_TYPE;
- tok->u.prop.ctype = ONIGENC_CTYPE_D;
+ tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
tok->u.prop.not = 0;
break;
case 'D':
tok->type = TK_CHAR_TYPE;
- tok->u.prop.ctype = ONIGENC_CTYPE_D;
+ tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
tok->u.prop.not = 1;
break;
case 's':
tok->type = TK_CHAR_TYPE;
- tok->u.prop.ctype = ONIGENC_CTYPE_S;
+ tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
tok->u.prop.not = 0;
break;
case 'S':
tok->type = TK_CHAR_TYPE;
- tok->u.prop.ctype = ONIGENC_CTYPE_S;
+ tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
tok->u.prop.not = 1;
break;
case 'h':
@@ -3074,7 +3111,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
}
}
else {
- onig_syntax_warn(env, "invalid Unicode Property \\%c", c);
+ onig_syntax_warn(env, "invalid Unicode Property \\%c", c);
}
break;
@@ -3084,7 +3121,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
prev = p;
if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
PINC;
- num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
+ num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc);
if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
if (!PEND) {
c2 = PPEEK;
@@ -3104,7 +3141,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
}
}
else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
- num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
+ num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc);
if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
if (p == prev) { /* can't read nothing. */
num = 0; /* but, it's not error */
@@ -3120,8 +3157,9 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
prev = p;
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
- num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
- if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
+ num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc);
+ if (num < -1) return ONIGERR_TOO_SHORT_DIGITS;
+ else if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
if (p == prev) { /* can't read nothing. */
num = 0; /* but, it's not error */
}
@@ -3195,6 +3233,82 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
return tok->type;
}
+#ifdef USE_NAMED_GROUP
+/*
+ * Parse the body of a back reference and fill `tok` as a TK_BACKREF token.
+ *
+ *   c    - start code point handed to the name fetcher; the callers in
+ *          this file pass '<', '\'', '{' or '(' after consuming it
+ *   tok  - token to fill (type and the u.backref fields)
+ *   src  - in: current scan position; out: advanced past the reference,
+ *          written only on success
+ *   end  - end of the pattern text
+ *   env  - scan environment (encoding, syntax, regex object, group table)
+ *
+ * A non-zero number from the name fetcher is treated as a numbered
+ * reference (negative values are converted with BACKREF_REL_TO_ABS);
+ * zero means a name, which is resolved through
+ * onig_name_to_group_numbers().
+ *
+ * Returns 0 on success, or a negative error code (from the name fetcher,
+ * ONIGERR_INVALID_BACKREF or ONIGERR_UNDEFINED_NAME_REFERENCE).  *src is
+ * left untouched on failure.
+ */
+static int
+fetch_named_backref_token(OnigCodePoint c, OnigToken* tok, UChar** src,
+                          UChar* end, ScanEnv* env)
+{
+  int r, num;
+  OnigEncoding enc = env->enc;
+  const OnigSyntaxType* syn = env->syntax;
+  UChar* prev;
+  UChar* p = *src;
+  UChar* name_end;
+  int* backs;
+  int back_num;
+
+  prev = p;  /* start of the name/number, used for lookup and error text */
+
+#ifdef USE_BACKREF_WITH_LEVEL
+  name_end = NULL_UCHARP; /* no need. escape gcc warning. */
+  r = fetch_name_with_level(c, &p, end, &name_end,
+                            env, &back_num, &tok->u.backref.level);
+  if (r == 1) tok->u.backref.exist_level = 1;
+  else        tok->u.backref.exist_level = 0;
+#else
+  /* fetch_name() takes the start code point as its first argument, as in
+     every other call in this file; it was missing here. */
+  r = fetch_name(c, &p, end, &name_end, env, &back_num, 1);
+#endif
+  if (r < 0) return r;
+
+  if (back_num != 0) {
+    /* numbered reference */
+    if (back_num < 0) {
+      back_num = BACKREF_REL_TO_ABS(back_num, env);
+      if (back_num <= 0)
+        return ONIGERR_INVALID_BACKREF;
+    }
+
+    if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
+      if (back_num > env->num_mem ||
+          IS_NULL(SCANENV_MEM_NODES(env)[back_num]))
+        return ONIGERR_INVALID_BACKREF;
+    }
+    tok->type = TK_BACKREF;
+    tok->u.backref.by_name = 0;
+    tok->u.backref.num  = 1;
+    tok->u.backref.ref1 = back_num;
+  }
+  else {
+    /* named reference: one name may map to several group numbers */
+    num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
+    if (num <= 0) {
+      onig_scan_env_set_error_string(env,
+                     ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
+      return ONIGERR_UNDEFINED_NAME_REFERENCE;
+    }
+    if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
+      int i;
+      for (i = 0; i < num; i++) {
+        if (backs[i] > env->num_mem ||
+            IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
+          return ONIGERR_INVALID_BACKREF;
+      }
+    }
+
+    tok->type = TK_BACKREF;
+    tok->u.backref.by_name = 1;
+    if (num == 1) {
+      tok->u.backref.num  = 1;
+      tok->u.backref.ref1 = backs[0];
+    }
+    else {
+      tok->u.backref.num  = num;
+      tok->u.backref.refs = backs;
+    }
+  }
+  *src = p;  /* commit the consumed input only on success */
+  return 0;
+}
+#endif
+
static int
fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
{
@@ -3304,68 +3418,74 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
case 'w':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
tok->type = TK_CHAR_TYPE;
- tok->u.prop.ctype = ONIGENC_CTYPE_W;
+ tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
tok->u.prop.not = 0;
break;
case 'W':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
tok->type = TK_CHAR_TYPE;
- tok->u.prop.ctype = ONIGENC_CTYPE_W;
+ tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
tok->u.prop.not = 1;
break;
case 'b':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
tok->type = TK_ANCHOR;
- tok->u.anchor = ANCHOR_WORD_BOUND;
+ tok->u.anchor.subtype = ANCHOR_WORD_BOUND;
+ tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option)
+ && ! IS_WORD_BOUND_ALL_RANGE(env->option);
break;
case 'B':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
tok->type = TK_ANCHOR;
- tok->u.anchor = ANCHOR_NOT_WORD_BOUND;
+ tok->u.anchor.subtype = ANCHOR_NOT_WORD_BOUND;
+ tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option)
+ && ! IS_WORD_BOUND_ALL_RANGE(env->option);
break;
#ifdef USE_WORD_BEGIN_END
case '<':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
tok->type = TK_ANCHOR;
- tok->u.anchor = ANCHOR_WORD_BEGIN;
+ tok->u.anchor.subtype = ANCHOR_WORD_BEGIN;
+ tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option);
break;
case '>':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
tok->type = TK_ANCHOR;
- tok->u.anchor = ANCHOR_WORD_END;
+ tok->u.anchor.subtype = ANCHOR_WORD_END;
+ tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option);
break;
#endif
case 's':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
tok->type = TK_CHAR_TYPE;
- tok->u.prop.ctype = ONIGENC_CTYPE_S;
+ tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
tok->u.prop.not = 0;
break;
case 'S':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
tok->type = TK_CHAR_TYPE;
- tok->u.prop.ctype = ONIGENC_CTYPE_S;
+ tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
tok->u.prop.not = 1;
break;
case 'd':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
tok->type = TK_CHAR_TYPE;
- tok->u.prop.ctype = ONIGENC_CTYPE_D;
+ tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
tok->u.prop.not = 0;
break;
case 'D':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
tok->type = TK_CHAR_TYPE;
- tok->u.prop.ctype = ONIGENC_CTYPE_D;
+ tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
tok->u.prop.not = 1;
break;
@@ -3387,26 +3507,26 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
begin_buf:
tok->type = TK_ANCHOR;
- tok->u.subtype = ANCHOR_BEGIN_BUF;
+ tok->u.anchor.subtype = ANCHOR_BEGIN_BUF;
break;
case 'Z':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
tok->type = TK_ANCHOR;
- tok->u.subtype = ANCHOR_SEMI_END_BUF;
+ tok->u.anchor.subtype = ANCHOR_SEMI_END_BUF;
break;
case 'z':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
end_buf:
tok->type = TK_ANCHOR;
- tok->u.subtype = ANCHOR_END_BUF;
+ tok->u.anchor.subtype = ANCHOR_END_BUF;
break;
case 'G':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
tok->type = TK_ANCHOR;
- tok->u.subtype = ANCHOR_BEGIN_POSITION;
+ tok->u.anchor.subtype = ANCHOR_BEGIN_POSITION;
break;
case '`':
@@ -3425,7 +3545,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
prev = p;
if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
PINC;
- num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
+ num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc);
if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
if (!PEND) {
if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK))
@@ -3443,7 +3563,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
}
}
else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
- num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
+ num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc);
if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
if (p == prev) { /* can't read nothing. */
num = 0; /* but, it's not error */
@@ -3459,8 +3579,9 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
prev = p;
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
- num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
- if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
+ num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc);
+ if (num < -1) return ONIGERR_TOO_SHORT_DIGITS;
+ else if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
if (p == prev) { /* can't read nothing. */
num = 0; /* but, it's not error */
}
@@ -3527,98 +3648,69 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
PFETCH(c);
if (c == '<' || c == '\'') {
- UChar* name_end;
- int* backs;
- int back_num;
-
- prev = p;
-
-#ifdef USE_BACKREF_WITH_LEVEL
- name_end = NULL_UCHARP; /* no need. escape gcc warning. */
- r = fetch_name_with_level((OnigCodePoint )c, &p, end, &name_end,
- env, &back_num, &tok->u.backref.level);
- if (r == 1) tok->u.backref.exist_level = 1;
- else tok->u.backref.exist_level = 0;
-#else
- r = fetch_name(&p, end, &name_end, env, &back_num, 1);
-#endif
+ r = fetch_named_backref_token(c, tok, &p, end, env);
if (r < 0) return r;
-
- if (back_num != 0) {
- if (back_num < 0) {
- back_num = BACKREF_REL_TO_ABS(back_num, env);
- if (back_num <= 0)
- return ONIGERR_INVALID_BACKREF;
- }
-
- if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
- if (back_num > env->num_mem ||
- IS_NULL(SCANENV_MEM_NODES(env)[back_num]))
- return ONIGERR_INVALID_BACKREF;
- }
- tok->type = TK_BACKREF;
- tok->u.backref.by_name = 0;
- tok->u.backref.num = 1;
- tok->u.backref.ref1 = back_num;
- }
- else {
- num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
- if (num <= 0) {
- onig_scan_env_set_error_string(env,
- ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
- return ONIGERR_UNDEFINED_NAME_REFERENCE;
- }
- if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
- int i;
- for (i = 0; i < num; i++) {
- if (backs[i] > env->num_mem ||
- IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
- return ONIGERR_INVALID_BACKREF;
- }
- }
-
- tok->type = TK_BACKREF;
- tok->u.backref.by_name = 1;
- if (num == 1) {
- tok->u.backref.num = 1;
- tok->u.backref.ref1 = backs[0];
- }
- else {
- tok->u.backref.num = num;
- tok->u.backref.refs = backs;
- }
- }
}
else {
- PUNFETCH;
- onig_syntax_warn(env, "invalid back reference");
+ PUNFETCH;
+ onig_syntax_warn(env, "invalid back reference");
}
}
break;
#endif
-#ifdef USE_SUBEXP_CALL
+#if defined(USE_SUBEXP_CALL) || defined(USE_NAMED_GROUP)
case 'g':
+#ifdef USE_NAMED_GROUP
+ if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_BRACE_BACKREF)) {
+ PFETCH(c);
+ if (c == '{') {
+ r = fetch_named_backref_token(c, tok, &p, end, env);
+ if (r < 0) return r;
+ }
+ else
+ PUNFETCH;
+ }
+#endif
+#ifdef USE_SUBEXP_CALL
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
PFETCH(c);
if (c == '<' || c == '\'') {
- int gnum;
+ int gnum = -1, rel = 0;
UChar* name_end;
-
+ OnigCodePoint cnext;
+
+ cnext = PPEEK;
+ if (cnext == '0') {
+ PINC;
+ if (PPEEK_IS(get_name_end_code_point(c))) { // \g<0>, \g'0'
+ PINC;
+ name_end = p;
+ gnum = 0;
+ }
+ }
+ else if (cnext == '+') {
+ PINC;
+ rel = 1;
+ }
prev = p;
- r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1);
- if (r < 0) return r;
+ if (gnum < 0) {
+ r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1);
+ if (r < 0) return r;
+ }
tok->type = TK_CALL;
tok->u.call.name = prev;
tok->u.call.name_end = name_end;
tok->u.call.gnum = gnum;
+ tok->u.call.rel = rel;
}
else {
- onig_syntax_warn(env, "invalid subexp call");
- PUNFETCH;
+ onig_syntax_warn(env, "invalid subexp call");
+ PUNFETCH;
}
}
+#endif
break;
#endif
@@ -3646,7 +3738,25 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
}
}
else {
- onig_syntax_warn(env, "invalid Unicode Property \\%c", c);
+ onig_syntax_warn(env, "invalid Unicode Property \\%c", c);
+ }
+ break;
+
+ case 'R':
+ if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK)) {
+ tok->type = TK_LINEBREAK;
+ }
+ break;
+
+ case 'X':
+ if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER)) {
+ tok->type = TK_EXTENDED_GRAPHEME_CLUSTER;
+ }
+ break;
+
+ case 'K':
+ if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP)) {
+ tok->type = TK_KEEP;
}
break;
@@ -3766,6 +3876,97 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
}
goto start;
}
+#ifdef USE_PERL_SUBEXP_CALL
+ /* (?&name), (?n), (?R), (?0), (?+n), (?-n) */
+ c = PPEEK;
+ if ((c == '&' || c == 'R' || ONIGENC_IS_CODE_DIGIT(enc, c)) &&
+ IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_SUBEXP_CALL)) {
+ /* (?&name), (?n), (?R), (?0) */
+ int gnum;
+ UChar *name;
+ UChar *name_end;
+
+ if (c == 'R' || c == '0') {
+ PINC; /* skip 'R' / '0' */
+ if (!PPEEK_IS(')')) return ONIGERR_INVALID_GROUP_NAME;
+ PINC; /* skip ')' */
+ name_end = name = p;
+ gnum = 0;
+ }
+ else {
+ int numref = 1;
+ if (c == '&') { /* (?&name) */
+ PINC;
+ numref = 0; /* don't allow number name */
+ }
+ name = p;
+ r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, numref);
+ if (r < 0) return r;
+ }
+
+ tok->type = TK_CALL;
+ tok->u.call.name = name;
+ tok->u.call.name_end = name_end;
+ tok->u.call.gnum = gnum;
+ tok->u.call.rel = 0;
+ break;
+ }
+ else if ((c == '-' || c == '+') &&
+ IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_SUBEXP_CALL)) {
+ /* (?+n), (?-n) */
+ int gnum;
+ UChar *name;
+ UChar *name_end;
+ OnigCodePoint cnext;
+ PFETCH_READY;
+
+ PINC; /* skip '-' / '+' */
+ cnext = PPEEK;
+ if (ONIGENC_IS_CODE_DIGIT(enc, cnext)) {
+ if (c == '-') PUNFETCH;
+ name = p;
+ r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, 1);
+ if (r < 0) return r;
+
+ tok->type = TK_CALL;
+ tok->u.call.name = name;
+ tok->u.call.name_end = name_end;
+ tok->u.call.gnum = gnum;
+ tok->u.call.rel = 1;
+ break;
+ }
+ }
+#endif /* USE_PERL_SUBEXP_CALL */
+#ifdef USE_CAPITAL_P_NAMED_GROUP
+ if (PPEEK_IS('P') &&
+ IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP)) {
+ int gnum;
+ UChar *name;
+ UChar *name_end;
+ PFETCH_READY;
+
+ PINC; /* skip 'P' */
+ PFETCH(c);
+ if (c == '=') { /* (?P=name): backref */
+ r = fetch_named_backref_token((OnigCodePoint )'(', tok, &p, end, env);
+ if (r < 0) return r;
+ break;
+ }
+ else if (c == '>') { /* (?P>name): subexp call */
+ name = p;
+ r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, 0);
+ if (r < 0) return r;
+
+ tok->type = TK_CALL;
+ tok->u.call.name = name;
+ tok->u.call.name_end = name_end;
+ tok->u.call.gnum = gnum;
+ tok->u.call.rel = 0;
+ break;
+ }
+ PUNFETCH;
+ }
+#endif /* USE_CAPITAL_P_NAMED_GROUP */
PUNFETCH;
}
@@ -3781,15 +3982,15 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
case '^':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
tok->type = TK_ANCHOR;
- tok->u.subtype = (IS_SINGLELINE(env->option)
- ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE);
+ tok->u.anchor.subtype = (IS_SINGLELINE(env->option)
+ ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE);
break;
case '$':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
tok->type = TK_ANCHOR;
- tok->u.subtype = (IS_SINGLELINE(env->option)
- ? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE);
+ tok->u.anchor.subtype = (IS_SINGLELINE(env->option)
+ ? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE);
break;
case '[':
@@ -3906,43 +4107,44 @@ add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not,
}
static int
-add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
+add_ctype_to_cc(CClassNode* cc, int ctype, int not, int char_prop, ScanEnv* env)
{
+ int maxcode, ascii_range;
int c, r;
const OnigCodePoint *ranges;
OnigCodePoint sb_out;
OnigEncoding enc = env->enc;
+ OnigOptionType option = env->option;
- switch (ctype) {
- case ONIGENC_CTYPE_D:
- case ONIGENC_CTYPE_S:
- case ONIGENC_CTYPE_W:
- ctype ^= ONIGENC_CTYPE_SPECIAL_MASK;
- if (not != 0) {
- for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
- if (! ONIGENC_IS_ASCII_CODE_CTYPE((OnigCodePoint )c, ctype))
- BITSET_SET_BIT_CHKDUP(cc->bs, c);
- }
- ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
- }
- else {
- for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
- if (ONIGENC_IS_ASCII_CODE_CTYPE((OnigCodePoint )c, ctype))
- BITSET_SET_BIT_CHKDUP(cc->bs, c);
- }
- }
- return 0;
- break;
- }
+ ascii_range = IS_ASCII_RANGE(option) && (char_prop == 0);
r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
if (r == 0) {
- return add_ctype_to_cc_by_range(cc, ctype, not, env, sb_out, ranges);
+ r = add_ctype_to_cc_by_range(cc, ctype, not, env, sb_out, ranges);
+ if ((r == 0) && ascii_range) {
+ if (not != 0) {
+ r = add_code_range_to_buf(&(cc->mbuf), env, 0x80, ONIG_LAST_CODE_POINT);
+ }
+ else {
+ CClassNode ccascii;
+ initialize_cclass(&ccascii);
+ if (ONIGENC_MBC_MINLEN(env->enc) > 1) {
+ add_code_range(&(ccascii.mbuf), env, 0x00, 0x7F);
+ }
+ else {
+ bitset_set_range(env, ccascii.bs, 0x00, 0x7F);
+ }
+ r = and_cclass(cc, &ccascii, env);
+ if (IS_NOT_NULL(ccascii.mbuf)) bbuf_free(ccascii.mbuf);
+ }
+ }
+ return r;
}
else if (r != ONIG_NO_SUPPORT_CONFIG) {
return r;
}
+ maxcode = ascii_range ? 0x80 : SINGLE_BYTE_SIZE;
r = 0;
switch (ctype) {
case ONIGENC_CTYPE_ALPHA:
@@ -3975,32 +4177,39 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
case ONIGENC_CTYPE_PRINT:
if (not != 0) {
for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
- if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
+ if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)
+ || c >= maxcode)
BITSET_SET_BIT_CHKDUP(cc->bs, c);
}
+ if (ascii_range)
+ ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
}
else {
- for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
+ for (c = 0; c < maxcode; c++) {
if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
BITSET_SET_BIT_CHKDUP(cc->bs, c);
}
- ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
+ if (! ascii_range)
+ ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
}
break;
case ONIGENC_CTYPE_WORD:
if (not == 0) {
- for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
- if (IS_CODE_SB_WORD(enc, c)) BITSET_SET_BIT_CHKDUP(cc->bs, c);
+ for (c = 0; c < maxcode; c++) {
+ if (ONIGENC_IS_CODE_WORD(enc, c)) BITSET_SET_BIT_CHKDUP(cc->bs, c);
}
- ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
+ if (! ascii_range)
+ ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
}
else {
for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* check invalid code point */
- && ! ONIGENC_IS_CODE_WORD(enc, c))
+ && (! ONIGENC_IS_CODE_WORD(enc, c) || c >= maxcode))
BITSET_SET_BIT_CHKDUP(cc->bs, c);
}
+ if (ascii_range)
+ ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
}
break;
@@ -4059,7 +4268,9 @@ parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
- r = add_ctype_to_cc(cc, pb->ctype, not, env);
+ r = add_ctype_to_cc(cc, pb->ctype, not,
+ IS_POSIX_BRACKET_ALL_RANGE(env->option),
+ env);
if (r != 0) return r;
PINC; PINC;
@@ -4132,7 +4343,7 @@ parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end,
*np = node_new_cclass();
CHECK_NULL_RETURN_MEMERR(*np);
cc = NCCLASS(*np);
- r = add_ctype_to_cc(cc, ctype, 0, env);
+ r = add_ctype_to_cc(cc, ctype, 0, 1, env);
if (r != 0) return r;
if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
@@ -4424,7 +4635,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
break;
case TK_CHAR_TYPE:
- r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env);
+ r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, 0, env);
if (r != 0) return r;
next_class:
@@ -4438,7 +4649,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
ctype = fetch_char_property_to_ctype(&p, end, env);
if (ctype < 0) return ctype;
- r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env);
+ r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, 1, env);
if (r != 0) return r;
goto next_class;
}
@@ -4586,8 +4797,10 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
BITSET_SET_BIT_CHKDUP(cc->bs, NEWLINE_CODE);
- else
- add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
+ else {
+ r = add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
+ if (r < 0) goto err;
+ }
}
}
}
@@ -4607,8 +4820,8 @@ static int
parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
ScanEnv* env)
{
- int r, num;
- Node *target;
+ int r = 0, num;
+ Node *target, *work1 = NULL, *work2 = NULL;
OnigOptionType option;
OnigCodePoint c;
OnigEncoding enc = env->enc;
@@ -4644,10 +4857,10 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
case '=':
*np = onig_node_new_anchor(ANCHOR_PREC_READ);
break;
- case '!': /* preceding read */
+ case '!': /* preceding read */
*np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT);
break;
- case '>': /* (?>...) stop backtrack */
+ case '>': /* (?>...) stop backtrack */
*np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
break;
@@ -4659,6 +4872,16 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
else
return ONIGERR_UNDEFINED_GROUP_OPTION;
break;
+
+#ifdef USE_CAPITAL_P_NAMED_GROUP
+ case 'P': /* (?P<name>...) */
+ if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP)) {
+ PFETCH(c);
+ if (c == '<') goto named_group1;
+ }
+ return ONIGERR_UNDEFINED_GROUP_OPTION;
+ break;
+#endif
#endif
case '<': /* look behind (?<=...), (?<!...) */
@@ -4668,7 +4891,7 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
else if (c == '!')
*np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT);
#ifdef USE_NAMED_GROUP
- else {
+ else { /* (?<name>...) */
if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
UChar *name;
UChar *name_end;
@@ -4740,10 +4963,104 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
}
break;
+ case '(': /* conditional expression: (?(cond)yes), (?(cond)yes|no) */
+ if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LPAREN_CONDITION)) {
+ UChar *name = NULL;
+ UChar *name_end;
+ PFETCH(c);
+ if (ONIGENC_IS_CODE_DIGIT(enc, c)) { /* (n) */
+ PUNFETCH;
+ r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &num, 1);
+ if (r < 0) return r;
+ if (num < 0) {
+ num = BACKREF_REL_TO_ABS(num, env);
+ if (num <= 0)
+ return ONIGERR_INVALID_BACKREF;
+ }
+ if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) {
+ if (num > env->num_mem ||
+ IS_NULL(SCANENV_MEM_NODES(env)[num]))
+ return ONIGERR_INVALID_BACKREF;
+ }
+ }
+#ifdef USE_NAMED_GROUP
+ else if (c == '<' || c == '\'') { /* (<name>), ('name') */
+ int nums;
+ int *backs;
+
+ name = p;
+ r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0);
+ if (r < 0) return r;
+ PFETCH(c);
+ if (c != ')') return ONIGERR_UNDEFINED_GROUP_OPTION;
+
+ nums = onig_name_to_group_numbers(env->reg, name, name_end, &backs);
+ if (nums <= 0) {
+ onig_scan_env_set_error_string(env,
+ ONIGERR_UNDEFINED_NAME_REFERENCE, name, name_end);
+ return ONIGERR_UNDEFINED_NAME_REFERENCE;
+ }
+ if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) {
+ int i;
+ for (i = 0; i < nums; i++) {
+ if (backs[i] > env->num_mem ||
+ IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
+ return ONIGERR_INVALID_BACKREF;
+ }
+ }
+ num = backs[0]; /* XXX: use left most named group as Perl */
+ }
+#endif
+ *np = node_new_enclose(ENCLOSE_CONDITION);
+ CHECK_NULL_RETURN_MEMERR(*np);
+ NENCLOSE(*np)->regnum = num;
+ if (IS_NOT_NULL(name)) NENCLOSE(*np)->state |= NST_NAME_REF;
+ }
+ else
+ return ONIGERR_UNDEFINED_GROUP_OPTION;
+ break;
+
+#if 0
+ case '|': /* branch reset: (?|...) */
+ if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_VBAR_BRANCH_RESET)) {
+ /* TODO */
+ }
+ else
+ return ONIGERR_UNDEFINED_GROUP_OPTION;
+ break;
+#endif
+
+ case '^': /* loads default options */
+ if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
+ /* d-imsx */
+ ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
+ ONOFF(option, ONIG_OPTION_IGNORECASE, 1);
+ ONOFF(option, ONIG_OPTION_SINGLELINE, 0);
+ ONOFF(option, ONIG_OPTION_MULTILINE, 1);
+ ONOFF(option, ONIG_OPTION_EXTEND, 1);
+ PFETCH(c);
+ }
+#if 0
+ else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
+ /* d-imx */
+ ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0);
+ ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0);
+ ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 0);
+ ONOFF(option, ONIG_OPTION_IGNORECASE, 1);
+ ONOFF(option, ONIG_OPTION_MULTILINE, 1);
+ ONOFF(option, ONIG_OPTION_EXTEND, 1);
+ PFETCH(c);
+ }
+#endif
+ else {
+ return ONIGERR_UNDEFINED_GROUP_OPTION;
+ }
+ /* fall through */
#ifdef USE_POSIXLINE_OPTION
case 'p':
#endif
case '-': case 'i': case 'm': case 's': case 'x':
+ case 'a': case 'd': case 'l': case 'u':
{
int neg = 0;
@@ -4779,6 +5096,54 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
break;
#endif
+
+ case 'a': /* limits \d, \s, \w and POSIX brackets to ASCII range */
+ if ((IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) ||
+ IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) &&
+ (neg == 0)) {
+ ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0);
+ ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1);
+ ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1);
+ }
+ else
+ return ONIGERR_UNDEFINED_GROUP_OPTION;
+ break;
+
+ case 'u':
+ if ((IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) ||
+ IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) &&
+ (neg == 0)) {
+ ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
+ ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1);
+ ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1);
+ }
+ else
+ return ONIGERR_UNDEFINED_GROUP_OPTION;
+ break;
+
+ case 'd':
+ if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) &&
+ (neg == 0)) {
+ ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
+ }
+ else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY) &&
+ (neg == 0)) {
+ ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0);
+ ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0);
+ ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 0);
+ }
+ else
+ return ONIGERR_UNDEFINED_GROUP_OPTION;
+ break;
+
+ case 'l':
+ if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) && (neg == 0)) {
+ ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
+ }
+ else
+ return ONIGERR_UNDEFINED_GROUP_OPTION;
+ break;
+
default:
return ONIGERR_UNDEFINED_GROUP_OPTION;
}
@@ -4844,10 +5209,29 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np);
if (r != 0) return r;
}
+ else if (NENCLOSE(*np)->type == ENCLOSE_CONDITION) {
+ if (NTYPE(target) != NT_ALT) {
+ /* convert (?(cond)yes) to (?(cond)yes|empty) */
+ work1 = node_new_empty();
+ if (IS_NULL(work1)) goto err;
+ work2 = onig_node_new_alt(work1, NULL_NODE);
+ if (IS_NULL(work2)) goto err;
+ work1 = onig_node_new_alt(target, work2);
+ if (IS_NULL(work1)) goto err;
+ NENCLOSE(*np)->target = work1;
+ }
+ }
}
*src = p;
return 0;
+
+ err:
+ onig_node_free(work1);
+ onig_node_free(work2);
+ onig_node_free(*np);
+ *np = NULL;
+ return ONIGERR_MEMORY;
}
static const char* const PopularQStr[] = {
@@ -5141,6 +5525,209 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
}
static int
+node_linebreak(Node** np, ScanEnv* env)
+{
+  /*
+   * Builds the node tree for a generic line break; the result is the
+   * same as (?>\x0D\x0A|[\x0A-\x0D\x{85}\x{2028}\x{2029}]).
+   *
+   * np:  receives the new ENCLOSE_STOP_BACKTRACK node on success.
+   * env: supplies the encoding used to encode CR LF and to choose
+   *      between the bitset and the multibyte range buffer.
+   *
+   * Returns ONIG_NORMAL on success.  On failure returns the error code
+   * of the failing helper, or ONIGERR_MEMORY when a node could not be
+   * allocated; every node built so far is freed.
+   */
+  Node* left = NULL;
+  Node* right = NULL;
+  Node* target1 = NULL;
+  Node* target2 = NULL;
+  CClassNode* cc;
+  int num1, num2;
+  int r = 0;
+  UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2];
+
+  /* \x0D\x0A */
+  num1 = ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf);
+  if (num1 < 0) return num1;
+  num2 = ONIGENC_CODE_TO_MBC(env->enc, 0x0A, buf + num1);
+  if (num2 < 0) return num2;
+  left = node_new_str_raw(buf, buf + num1 + num2);
+  if (IS_NULL(left)) goto err;
+
+  /* [\x0A-\x0D] or [\x0A-\x0D\x{85}\x{2028}\x{2029}] */
+  right = node_new_cclass();
+  if (IS_NULL(right)) goto err;
+  cc = NCCLASS(right);
+  if (ONIGENC_MBC_MINLEN(env->enc) > 1) {
+    /* every character is multibyte: the range goes to the range buffer */
+    r = add_code_range(&(cc->mbuf), env, 0x0A, 0x0D);
+    if (r != 0) goto err;
+  }
+  else {
+    bitset_set_range(env, cc->bs, 0x0A, 0x0D);
+  }
+
+  /* TODO: move this block to enc/unicode.c */
+  if (ONIGENC_IS_UNICODE(env->enc)) {
+    /* UTF-8, UTF-16BE/LE, UTF-32BE/LE */
+    r = add_code_range(&(cc->mbuf), env, 0x85, 0x85);
+    if (r != 0) goto err;
+    r = add_code_range(&(cc->mbuf), env, 0x2028, 0x2029);
+    if (r != 0) goto err;
+  }
+
+  /* ...|... */
+  target1 = onig_node_new_alt(right, NULL_NODE);
+  if (IS_NULL(target1)) goto err;
+  right = NULL;     /* now owned by target1 */
+  target2 = onig_node_new_alt(left, target1);
+  if (IS_NULL(target2)) goto err;
+  left = NULL;      /* now owned by target2 */
+  target1 = NULL;   /* now owned by target2 */
+
+  /* (?>...) */
+  *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
+  if (IS_NULL(*np)) goto err;
+  NENCLOSE(*np)->target = target2;
+  return ONIG_NORMAL;
+
+ err:
+  /* r is still 0 when we got here because an allocation returned NULL */
+  onig_node_free(left);
+  onig_node_free(right);
+  onig_node_free(target1);
+  onig_node_free(target2);
+  return (r == 0) ? ONIGERR_MEMORY : r;
+}
+
+static int
+node_extended_grapheme_cluster(Node** np, ScanEnv* env)
+{
+  /*
+   * Builds the node for an extended grapheme cluster.
+   *   - Unicode encoding whose "M" property resolves: (?>\P{M}\p{M}*)
+   *   - anything else, when *np is still NULL: one any-char node under
+   *     an option node, i.e. PerlSyntax: (?s:.), RubySyntax: (?m:.)
+   * Returns ONIG_NORMAL on success.  On failure the partially built
+   * nodes are freed and the add_ctype_to_cc() error, or ONIGERR_MEMORY
+   * for a failed allocation, is returned.
+   */
+  Node* head = NULL;    /* \P{M}, or the any-char node of the fallback */
+  Node* marks = NULL;   /* \p{M} */
+  Node* star = NULL;    /* quantifier placed around marks */
+  Node* seq = NULL;     /* list cell: head followed by rest */
+  Node* rest = NULL;    /* list cell holding star */
+  int r = 0;
+
+#ifdef USE_UNICODE_PROPERTIES
+  if (ONIGENC_IS_UNICODE(env->enc)) {
+    /* UTF-8, UTF-16BE/LE, UTF-32BE/LE */
+    CClassNode* cc;
+    UChar* mark_name = (UChar* )"M";
+    int ctype = env->enc->property_name_to_ctype(ONIG_ENCODING_ASCII,
+                                                 mark_name, mark_name + 1);
+    if (ctype >= 0) {
+      /* \P{M}: fill the class with M, then flag it as negated */
+      head = node_new_cclass();
+      if (IS_NULL(head)) goto err;
+      cc = NCCLASS(head);
+      if ((r = add_ctype_to_cc(cc, ctype, 0, 1, env)) != 0) goto err;
+      NCCLASS_SET_NOT(cc);
+
+      /* \p{M}* */
+      marks = node_new_cclass();
+      if (IS_NULL(marks)) goto err;
+      cc = NCCLASS(marks);
+      if ((r = add_ctype_to_cc(cc, ctype, 0, 1, env)) != 0) goto err;
+
+      star = node_new_quantifier(0, REPEAT_INFINITE, 0);
+      if (IS_NULL(star)) goto err;
+      NQTFR(star)->target = marks;
+      marks = NULL;   /* handed over to star */
+
+      /* \P{M}\p{M}* */
+      rest = node_new_list(star, NULL_NODE);
+      if (IS_NULL(rest)) goto err;
+      star = NULL;    /* handed over to rest */
+      seq = node_new_list(head, rest);
+      if (IS_NULL(seq)) goto err;
+      head = NULL;    /* handed over to seq */
+      rest = NULL;    /* handed over to seq */
+
+      /* (?>...) */
+      *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
+      if (IS_NULL(*np)) goto err;
+      NENCLOSE(*np)->target = seq;
+      return ONIG_NORMAL;
+    }
+  }
+#endif /* USE_UNICODE_PROPERTIES */
+  if (IS_NULL(*np)) {
+    /* PerlSyntax: (?s:.), RubySyntax: (?m:.) */
+    OnigOptionType opts = env->option;
+
+    head = node_new_anychar();
+    if (IS_NULL(head)) goto err;
+
+    ONOFF(opts, ONIG_OPTION_MULTILINE, 0);
+    *np = node_new_option(opts);
+    if (IS_NULL(*np)) goto err;
+    NENCLOSE(*np)->target = head;
+  }
+  return ONIG_NORMAL;
+
+ err:
+  /* pointers already handed to a parent were reset to NULL above */
+  onig_node_free(head);
+  onig_node_free(marks);
+  onig_node_free(star);
+  onig_node_free(seq);
+  onig_node_free(rest);
+  return (r != 0) ? r : ONIGERR_MEMORY;
+}
+
+static int
+countbits(unsigned int bits) {
+  /*
+   * Population count of the low 32 bits of `bits`.  Each pass adds
+   * neighbouring fields of width 1, 2, 4, 8 and 16 bits, so after the
+   * last pass the value itself is the number of set bits.
+   */
+  static const unsigned int field_mask[] = {
+    0x55555555, 0x33333333, 0x0f0f0f0f, 0x00ff00ff, 0x0000ffff
+  };
+  int pass;
+
+  for (pass = 0; pass < 5; pass++) {
+    bits = (bits & field_mask[pass]) + ((bits >> (1 << pass)) & field_mask[pass]);
+  }
+  return bits;
+}
+
+static int
+is_onechar_cclass(CClassNode* cc, OnigCodePoint* code)
+{
+  /*
+   * Tests whether a character class matches exactly one code point.
+   *
+   * cc:   class to inspect; both its bitset (cc->bs) and its optional
+   *       multibyte range buffer (cc->mbuf) are examined.
+   * code: receives the code point when 1 is returned.
+   *
+   * Returns 1 when the class holds exactly one code point.  Returns 0
+   * when the class is negated, holds several characters, or holds no
+   * character at all; *code is left untouched in those cases.
+   */
+  OnigCodePoint c = 0;
+  int found = 0;
+  int i, j = -1;
+  Bits b1, b2 = 0;
+  BBuf *bbuf = cc->mbuf;
+
+  if (IS_NCCLASS_NOT(cc)) return 0;
+
+  /* check bbuf */
+  if (IS_NOT_NULL(bbuf)) {
+    /* NOTE(review): layout appears to be a range count followed by
+       (from, to) pairs -- confirm against add_code_range_to_buf(). */
+    OnigCodePoint n, *data;
+    GET_CODE_POINT(n, bbuf->p);
+    data = (OnigCodePoint* )(bbuf->p) + 1;
+    if ((n == 1) && (data[0] == data[1])) {
+      /* only one char found in the bbuf, save the code point. */
+      found = 1;
+      c = data[0];
+    }
+    else {
+      return 0;  /* the bbuf contains multiple chars */
+    }
+  }
+
+  if (found && (c < SINGLE_BYTE_SIZE) && BITSET_AT(cc->bs, c)) {
+    /* c is included in the bitset, ignore the result of bbuf. */
+    found = 0;
+  }
+
+  /* check bitset */
+  for (i = 0; i < (int )BITSET_SIZE; i++) {
+    b1 = cc->bs[i];
+    if (b1 != 0) {
+      /* (b1 & (b1 - 1)) == 0 holds only when a single bit is set */
+      if (((b1 & (b1 - 1)) == 0) && (found == 0)) {
+        found = 1;
+        j = i;
+        b2 = b1;
+      } else {
+        return 0;  /* the character class contains multiple chars */
+      }
+    }
+  }
+  if (found == 0) {
+    /* the character class contains no char: there is no code point to
+       report, and c was never assigned a meaningful value. */
+    return 0;
+  }
+  if (j >= 0) {
+    /* only one char found in the bitset, calculate the code point:
+       the bits below the single set bit give its index in the word. */
+    c = BITS_IN_ROOM * j + (countbits(b2 - 1) & 0x1f);
+  }
+  *code = c;
+  return 1;
+}
+
+
+static int
parse_exp(Node** np, OnigToken* tok, int term,
UChar** src, UChar* end, ScanEnv* env)
{
@@ -5155,9 +5742,10 @@ parse_exp(Node** np, OnigToken* tok, int term,
switch (tok->type) {
case TK_ALT:
case TK_EOT:
- end_of_token:
+ end_of_token:
*np = node_new_empty();
return tok->type;
+ break;
case TK_SUBEXP_OPEN:
r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env);
@@ -5189,18 +5777,42 @@ parse_exp(Node** np, OnigToken* tok, int term,
else goto tk_byte;
break;
+ case TK_LINEBREAK:
+ r = node_linebreak(np, env);
+ if (r < 0) return r;
+ break;
+
+ case TK_EXTENDED_GRAPHEME_CLUSTER:
+ r = node_extended_grapheme_cluster(np, env);
+ if (r < 0) return r;
+ break;
+
+ case TK_KEEP:
+ *np = onig_node_new_anchor(ANCHOR_KEEP);
+ CHECK_NULL_RETURN_MEMERR(*np);
+ break;
+
case TK_STRING:
tk_byte:
{
*np = node_new_str(tok->backp, *src);
CHECK_NULL_RETURN_MEMERR(*np);
+ string_loop:
while (1) {
r = fetch_token(tok, src, end, env);
if (r < 0) return r;
- if (r != TK_STRING) break;
-
- r = onig_node_str_cat(*np, tok->backp, *src);
+ if (r == TK_STRING) {
+ r = onig_node_str_cat(*np, tok->backp, *src);
+ }
+#ifndef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
+ else if (r == TK_CODE_POINT) {
+ r = node_str_cat_codepoint(*np, env->enc, tok->u.code);
+ }
+#endif
+ else {
+ break;
+ }
if (r < 0) return r;
}
@@ -5253,15 +5865,15 @@ parse_exp(Node** np, OnigToken* tok, int term,
case TK_CODE_POINT:
{
- UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
- int num = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf);
- if (num < 0) return num;
+ *np = node_new_empty();
+ CHECK_NULL_RETURN_MEMERR(*np);
+ r = node_str_cat_codepoint(*np, env->enc, tok->u.code);
+ if (r != 0) return r;
#ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
- *np = node_new_str_raw(buf, buf + num);
+ NSTRING_SET_RAW(*np);
#else
- *np = node_new_str(buf, buf + num);
+ goto string_loop;
#endif
- CHECK_NULL_RETURN_MEMERR(*np);
}
break;
@@ -5286,21 +5898,9 @@ parse_exp(Node** np, OnigToken* tok, int term,
case TK_CHAR_TYPE:
{
switch (tok->u.prop.ctype) {
- case ONIGENC_CTYPE_D:
- case ONIGENC_CTYPE_S:
- case ONIGENC_CTYPE_W:
- {
- CClassNode* cc;
- *np = node_new_cclass();
- CHECK_NULL_RETURN_MEMERR(*np);
- cc = NCCLASS(*np);
- add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env);
- if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
- }
- break;
-
case ONIGENC_CTYPE_WORD:
- *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not);
+ *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not,
+ IS_ASCII_RANGE(env->option));
CHECK_NULL_RETURN_MEMERR(*np);
break;
@@ -5317,6 +5917,7 @@ parse_exp(Node** np, OnigToken* tok, int term,
r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, tok->u.prop.ctype,
&sb_out, &mbr);
if (r == 0 &&
+ ! IS_ASCII_RANGE(env->option) &&
ONIGENC_CODE_RANGE_NUM(mbr)
>= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) {
type_cclass_key key;
@@ -5365,7 +5966,7 @@ parse_exp(Node** np, OnigToken* tok, int term,
*np = node_new_cclass();
CHECK_NULL_RETURN_MEMERR(*np);
cc = NCCLASS(*np);
- add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env);
+ add_ctype_to_cc(cc, tok->u.prop.ctype, 0, 0, env);
if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
#ifdef USE_SHARED_CCLASS_TABLE
}
@@ -5388,11 +5989,19 @@ parse_exp(Node** np, OnigToken* tok, int term,
case TK_CC_OPEN:
{
CClassNode* cc;
+ OnigCodePoint code;
r = parse_char_class(np, tok, src, end, env);
if (r != 0) return r;
cc = NCCLASS(*np);
+ if (is_onechar_cclass(cc, &code)) {
+ *np = node_new_empty();
+ CHECK_NULL_RETURN_MEMERR(*np);
+ r = node_str_cat_codepoint(*np, env->enc, code);
+ if (r != 0) return r;
+ goto string_loop;
+ }
if (IS_IGNORECASE(env->option)) {
IApplyCaseFoldArg iarg;
@@ -5451,7 +6060,8 @@ parse_exp(Node** np, OnigToken* tok, int term,
{
int gnum = tok->u.call.gnum;
- if (gnum < 0) {
+ if (gnum < 0 || tok->u.call.rel != 0) {
+ if (gnum > 0) gnum--;
gnum = BACKREF_REL_TO_ABS(gnum, env);
if (gnum <= 0)
return ONIGERR_INVALID_BACKREF;
@@ -5464,7 +6074,9 @@ parse_exp(Node** np, OnigToken* tok, int term,
#endif
case TK_ANCHOR:
- *np = onig_node_new_anchor(tok->u.anchor);
+ *np = onig_node_new_anchor(tok->u.anchor.subtype);
+ CHECK_NULL_RETURN_MEMERR(*np);
+ NANCHOR(*np)->ascii_range = tok->u.anchor.ascii_range;
break;
case TK_OP_REPEAT:
@@ -5522,7 +6134,7 @@ parse_exp(Node** np, OnigToken* tok, int term,
*targetp = qn;
}
else if (r == 1) {
- onig_node_free(qn);
+ onig_node_free(qn);
}
else if (r == 2) { /* split case: /abc+/ */
Node *tmp;
@@ -5647,6 +6259,21 @@ parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
if (r < 0) return r;
r = parse_subexp(top, &tok, TK_EOT, src, end, env);
if (r < 0) return r;
+
+#ifdef USE_SUBEXP_CALL
+ if (env->num_call > 0) {
+ /* Capture the pattern itself. It is used for (?R), (?0) and \g<0>. */
+ const int num = 0;
+ Node* np;
+ np = node_new_enclose_memory(env->option, 0);
+ CHECK_NULL_RETURN_MEMERR(np);
+ NENCLOSE(np)->regnum = num;
+ NENCLOSE(np)->target = *top;
+ r = scan_env_set_mem_node(env, num, np);
+ if (r != 0) return r;
+ *top = np;
+ }
+#endif
return 0;
}