From 0424e152c684a85f4b0691f1e84aec203115333d Mon Sep 17 00:00:00 2001 From: naruse Date: Fri, 17 Feb 2012 07:42:23 +0000 Subject: * Merge Onigmo-5.13.1. [ruby-dev:45057] [Feature #5820] https://github.com/k-takata/Onigmo cp reg{comp,enc,error,exec,parse,syntax}.c reg{enc,int,parse}.h cp oniguruma.h cp tool/enc-unicode.rb cp -r enc/ git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@34663 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- regexec.c | 827 ++++++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 671 insertions(+), 156 deletions(-) (limited to 'regexec.c') diff --git a/regexec.c b/regexec.c index ee0a5fa2af..2c16dafda5 100644 --- a/regexec.c +++ b/regexec.c @@ -1,8 +1,9 @@ /********************************************************************** - regexec.c - Oniguruma (regular expression library) + regexec.c - Onigmo (Oniguruma-mod) (regular expression library) **********************************************************************/ /*- * Copyright (c) 2002-2008 K.Kosako + * Copyright (c) 2011-2012 K.Takata * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -34,8 +35,44 @@ #ifdef USE_CRNL_AS_LINE_TERMINATOR #define ONIGENC_IS_MBC_CRNL(enc,p,end) \ (ONIGENC_MBC_TO_CODE(enc,p,end) == 13 && \ - ONIGENC_IS_MBC_NEWLINE(enc,(p+enclen(enc,p)),end)) -#endif + ONIGENC_MBC_TO_CODE(enc,(p+enclen(enc,p,end)),end) == 10) +#define ONIGENC_IS_MBC_NEWLINE_EX(enc,p,start,end,option,check_prev) \ + is_mbc_newline_ex((enc),(p),(start),(end),(option),(check_prev)) +static int +is_mbc_newline_ex(OnigEncoding enc, const UChar *p, const UChar *start, + const UChar *end, OnigOptionType option, int check_prev) +{ + if (IS_NEWLINE_CRLF(option)) { + if (ONIGENC_MBC_TO_CODE(enc, p, end) == 0x0a) { + if (check_prev) { + const UChar *prev = onigenc_get_prev_char_head(enc, start, p, end); + if ((prev != NULL) && ONIGENC_MBC_TO_CODE(enc, prev, end) == 0x0d) + return 0; + else + return 1; + } + else + return 1; + } + else { + const UChar *pnext = p + enclen(enc, p, end); + if (pnext < end && + ONIGENC_MBC_TO_CODE(enc, p, end) == 0x0d && + ONIGENC_MBC_TO_CODE(enc, pnext, end) == 0x0a) + return 1; + if (ONIGENC_IS_MBC_NEWLINE(enc, p, end)) + return 1; + return 0; + } + } + else { + return ONIGENC_IS_MBC_NEWLINE(enc, p, end); + } +} +#else /* USE_CRNL_AS_LINE_TERMINATOR */ +#define ONIGENC_IS_MBC_NEWLINE_EX(enc,p,start,end,option,check_prev) \ + ONIGENC_IS_MBC_NEWLINE((enc), (p), (end)) +#endif /* USE_CRNL_AS_LINE_TERMINATOR */ #ifdef USE_CAPTURE_HISTORY static void history_tree_free(OnigCaptureTreeNode* node); @@ -58,6 +95,8 @@ history_tree_clear(OnigCaptureTreeNode* node) node->beg = ONIG_REGION_NOTPOS; node->end = ONIG_REGION_NOTPOS; node->group = -1; + xfree(node->childs); + node->childs = (OnigCaptureTreeNode** )0; } } @@ -106,14 +145,20 @@ history_tree_add_child(OnigCaptureTreeNode* parent, OnigCaptureTreeNode* child) n = HISTORY_TREE_INIT_ALLOC_SIZE; parent->childs = (OnigCaptureTreeNode** )xmalloc(sizeof(OnigCaptureTreeNode*) * n); + CHECK_NULL_RETURN_MEMERR(parent->childs); } else { + OnigCaptureTreeNode** tmp; n = parent->allocated * 2; - parent->childs = + tmp = (OnigCaptureTreeNode** )xrealloc(parent->childs, sizeof(OnigCaptureTreeNode*) * n); + if (tmp == 0) { + history_tree_clear(parent); + return ONIGERR_MEMORY; + } + parent->childs = tmp; } - CHECK_NULL_RETURN_MEMERR(parent->childs); for (i = parent->allocated; i < n; i++) { parent->childs[i] = (OnigCaptureTreeNode* )0; } @@ -128,7 +173,7 @@ history_tree_add_child(OnigCaptureTreeNode* parent, OnigCaptureTreeNode* child) static OnigCaptureTreeNode* history_tree_clone(OnigCaptureTreeNode* node) { - int i; + int i, r; OnigCaptureTreeNode *clone, *child; clone = history_node_new(); @@ -142,7 +187,12 @@ history_tree_clone(OnigCaptureTreeNode* node) history_tree_free(clone); return (OnigCaptureTreeNode* )0; } - history_tree_add_child(clone, child); + r = history_tree_add_child(clone, child); + if (r != 0) { + history_tree_free(child); + history_tree_free(clone); + return (OnigCaptureTreeNode* )0; + } } return clone; @@ -177,11 +227,11 @@ onig_region_resize(OnigRegion* region, int n) n = ONIG_NREGION; if (region->allocated == 0) { - region->beg = (int* )xmalloc(n * sizeof(int)); + region->beg = (OnigPosition* )xmalloc(n * sizeof(OnigPosition)); if (region->beg == 0) return ONIGERR_MEMORY; - region->end = (int* )xmalloc(n * sizeof(int)); + region->end = (OnigPosition* )xmalloc(n * sizeof(OnigPosition)); if (region->end == 0) { xfree(region->beg); return ONIGERR_MEMORY; @@ -190,26 +240,24 @@ onig_region_resize(OnigRegion* region, int n) region->allocated = n; } else if (region->allocated < n) { - int *tmp; + OnigPosition *tmp; region->allocated = 0; - tmp = (int* )xrealloc(region->beg, n * sizeof(int)); + tmp = (OnigPosition* )xrealloc(region->beg, n * sizeof(OnigPosition)); if (tmp == 0) { xfree(region->beg); xfree(region->end); return ONIGERR_MEMORY; } region->beg = tmp; - tmp = (int* )xrealloc(region->end, n * sizeof(int)); + tmp = (OnigPosition* )xrealloc(region->end, n * sizeof(OnigPosition)); if (tmp == 0) { xfree(region->beg); + xfree(region->end); return ONIGERR_MEMORY; } region->end = tmp; - if (region->beg == 0 || region->end == 0) - return ONIGERR_MEMORY; - region->allocated = n; } @@ -247,8 +295,8 @@ onig_region_init(OnigRegion* region) { region->num_regs = 0; region->allocated = 0; - region->beg = (int* )0; - region->end = (int* )0; + region->beg = (OnigPosition* )0; + region->end = (OnigPosition* )0; region->history_root = (OnigCaptureTreeNode* )0; } @@ -283,11 +331,13 @@ extern void onig_region_copy(OnigRegion* to, OnigRegion* from) { #define RREGC_SIZE (sizeof(int) * from->num_regs) - int i; + int i, r; if (to == from) return; - onig_region_resize(to, from->num_regs); + r = onig_region_resize(to, from->num_regs); + if (r) return; + for (i = 0; i < from->num_regs; i++) { to->beg[i] = from->beg[i]; to->end[i] = from->end[i]; @@ -334,19 +384,21 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) #define STK_MASK_MEM_END_OR_MARK 0x8000 /* MEM_END or MEM_END_MARK */ #ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE -#define MATCH_ARG_INIT(msa, arg_option, arg_region, arg_start) do {\ +#define MATCH_ARG_INIT(msa, arg_option, arg_region, arg_start, arg_gpos) do {\ (msa).stack_p = (void* )0;\ (msa).options = (arg_option);\ (msa).region = (arg_region);\ (msa).start = (arg_start);\ + (msa).gpos = (arg_gpos);\ (msa).best_len = ONIG_MISMATCH;\ } while(0) #else -#define MATCH_ARG_INIT(msa, arg_option, arg_region, arg_start) do {\ +#define MATCH_ARG_INIT(msa, arg_option, arg_region, arg_start, arg_gpos) do {\ (msa).stack_p = (void* )0;\ (msa).options = (arg_option);\ (msa).region = (arg_region);\ (msa).start = (arg_start);\ + (msa).gpos = (arg_gpos);\ } while(0) #endif @@ -386,9 +438,9 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) if ((msa).state_check_buff) xfree((msa).state_check_buff);\ }\ } while(0) -#else +#else /* USE_COMBINATION_EXPLOSION_CHECK */ #define MATCH_ARG_FREE(msa) if ((msa).stack_p) xfree((msa).stack_p) -#endif +#endif /* USE_COMBINATION_EXPLOSION_CHECK */ @@ -510,13 +562,14 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, state_check_buff[x/8] |= (1<<(x%8)); \ } -#define STACK_PUSH(stack_type,pat,s,sprev) do {\ +#define STACK_PUSH(stack_type,pat,s,sprev,keep) do {\ STACK_ENSURE(1);\ stk->type = (stack_type);\ stk->u.state.pcode = (pat);\ stk->u.state.pstr = (s);\ stk->u.state.pstr_prev = (sprev);\ stk->u.state.state_check = 0;\ + stk->u.state.pkeep = (keep);\ STACK_INC;\ } while(0) @@ -527,13 +580,14 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, STACK_INC;\ } while(0) -#define STACK_PUSH_ALT_WITH_STATE_CHECK(pat,s,sprev,snum) do {\ +#define STACK_PUSH_ALT_WITH_STATE_CHECK(pat,s,sprev,snum,keep) do {\ STACK_ENSURE(1);\ stk->type = STK_ALT;\ stk->u.state.pcode = (pat);\ stk->u.state.pstr = (s);\ stk->u.state.pstr_prev = (sprev);\ stk->u.state.state_check = ((state_check_buff != NULL) ? (snum) : 0);\ + stk->u.state.pkeep = (keep);\ STACK_INC;\ } while(0) @@ -551,12 +605,13 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, #define ELSE_IF_STATE_CHECK_MARK(stk) -#define STACK_PUSH(stack_type,pat,s,sprev) do {\ +#define STACK_PUSH(stack_type,pat,s,sprev,keep) do {\ STACK_ENSURE(1);\ stk->type = (stack_type);\ stk->u.state.pcode = (pat);\ stk->u.state.pstr = (s);\ stk->u.state.pstr_prev = (sprev);\ + stk->u.state.pkeep = (keep);\ STACK_INC;\ } while(0) @@ -567,12 +622,12 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, } while(0) #endif /* USE_COMBINATION_EXPLOSION_CHECK */ -#define STACK_PUSH_ALT(pat,s,sprev) STACK_PUSH(STK_ALT,pat,s,sprev) -#define STACK_PUSH_POS(s,sprev) STACK_PUSH(STK_POS,NULL_UCHARP,s,sprev) -#define STACK_PUSH_POS_NOT(pat,s,sprev) STACK_PUSH(STK_POS_NOT,pat,s,sprev) +#define STACK_PUSH_ALT(pat,s,sprev,keep) STACK_PUSH(STK_ALT,pat,s,sprev,keep) +#define STACK_PUSH_POS(s,sprev,keep) STACK_PUSH(STK_POS,NULL_UCHARP,s,sprev,keep) +#define STACK_PUSH_POS_NOT(pat,s,sprev,keep) STACK_PUSH(STK_POS_NOT,pat,s,sprev,keep) #define STACK_PUSH_STOP_BT STACK_PUSH_TYPE(STK_STOP_BT) -#define STACK_PUSH_LOOK_BEHIND_NOT(pat,s,sprev) \ - STACK_PUSH(STK_LOOK_BEHIND_NOT,pat,s,sprev) +#define STACK_PUSH_LOOK_BEHIND_NOT(pat,s,sprev,keep) \ + STACK_PUSH(STK_LOOK_BEHIND_NOT,pat,s,sprev,keep) #define STACK_PUSH_REPEAT(id, pat) do {\ STACK_ENSURE(1);\ @@ -879,7 +934,7 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, }\ k++;\ }\ - break;\ + break;\ }\ }\ }\ @@ -920,7 +975,7 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, }\ k++;\ }\ - break;\ + break;\ }\ }\ else {\ @@ -983,7 +1038,7 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, } while(0) static int string_cmp_ic(OnigEncoding enc, int case_fold_flag, - UChar* s1, UChar** ps2, int mblen, const UChar* text_end) + UChar* s1, UChar** ps2, OnigDistance mblen, const UChar* text_end) { UChar buf1[ONIGENC_MBC_CASE_FOLD_MAXLEN]; UChar buf2[ONIGENC_MBC_CASE_FOLD_MAXLEN]; @@ -1057,20 +1112,23 @@ make_capture_history_tree(OnigCaptureTreeNode* node, OnigStackType** kp, child = history_node_new(); CHECK_NULL_RETURN_MEMERR(child); child->group = n; - child->beg = (int )(k->u.mem.pstr - str); + child->beg = k->u.mem.pstr - str; r = history_tree_add_child(node, child); - if (r != 0) return r; + if (r != 0) { + history_tree_free(child); + return r; + } *kp = (k + 1); r = make_capture_history_tree(child, kp, stk_top, str, reg); if (r != 0) return r; k = *kp; - child->end = (int )(k->u.mem.pstr - str); + child->end = k->u.mem.pstr - str; } } else if (k->type == STK_MEM_END) { if (k->u.mem.num == node->group) { - node->end = (int )(k->u.mem.pstr - str); + node->end = k->u.mem.pstr - str; *kp = k; return 0; } @@ -1080,7 +1138,7 @@ make_capture_history_tree(OnigCaptureTreeNode* node, OnigStackType** kp, return 1; /* 1: root node ending. */ } -#endif +#endif /* USE_CAPTURE_HISTORY */ #ifdef USE_BACKREF_WITH_LEVEL static int mem_is_in_memp(int mem, int num, UChar* memp) @@ -1125,7 +1183,7 @@ static int backref_match_at_nested_level(regex_t* reg if (ignore_case != 0) { if (string_cmp_ic(reg->enc, case_fold_flag, - pstart, &ss, (int )(pend - pstart), send) == 0) + pstart, &ss, pend - pstart, send) == 0) return 0; /* or goto next_mem; */ } else { @@ -1168,14 +1226,14 @@ static struct timeval ts, te; #define GETTIME(t) gettimeofday(&(t), (struct timezone* )0) #define TIMEDIFF(te,ts) (((te).tv_usec - (ts).tv_usec) + \ (((te).tv_sec - (ts).tv_sec)*1000000)) -#else +#else /* USE_TIMEOFDAY */ #ifdef HAVE_SYS_TIMES_H #include #endif static struct tms ts, te; #define GETTIME(t) times(&(t)) #define TIMEDIFF(te,ts) ((te).tms_utime - (ts).tms_utime) -#endif +#endif /* USE_TIMEOFDAY */ static int OpCounter[256]; static int OpPrevCounter[256]; @@ -1224,12 +1282,13 @@ onig_print_statistics(FILE* f) MaxStackDepth = stk - stk_base;\ } while(0) -#else +#else /* ONIG_DEBUG_STATISTICS */ #define STACK_INC stk++ #define MOP_IN(opcode) #define MOP_OUT -#endif +#endif /* ONIG_DEBUG_STATISTICS */ + /* matching region of POSIX API */ @@ -1245,7 +1304,7 @@ void onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar* bpend, UChar** nex /* match data(str - end) from position (sstart). */ /* if sstart == str then set sprev to NULL. */ -static long +static OnigPosition match_at(regex_t* reg, const UChar* str, const UChar* end, #ifdef USE_MATCH_RANGE_MUST_BE_INSIDE_OF_SPECIFIED_RANGE const UChar* right_range, @@ -1254,8 +1313,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, { static const UChar FinishCode[] = { OP_FINISH }; - int i, num_mem, best_len, pop_level; - ptrdiff_t n; + int i, num_mem, pop_level; + ptrdiff_t n, best_len; LengthType tlen, tlen2; MemNumType mem; RelAddrType addr; @@ -1264,6 +1323,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, OnigCaseFoldType case_fold_flag = reg->case_fold_flag; UChar *s, *q, *sbegin; UChar *p = reg->p; + UChar *pkeep; char *alloca_base; OnigStackType *stk_alloc, *stk_base, *stk, *stk_end; OnigStackType *stkp; /* used as any purpose. */ @@ -1275,6 +1335,23 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, unsigned char* state_check_buff = msa->state_check_buff; int num_comb_exp_check = reg->num_comb_exp_check; #endif + +#ifdef USE_SUBEXP_CALL + /* Stack #0 is used to store the pattern itself and used for (?R), \g<0>, etc. */ + n = reg->num_repeat + (reg->num_mem + 1) * 2; + + STACK_INIT(alloca_base, n, INIT_MATCH_STACK_SIZE); + pop_level = reg->stack_pop_level; + num_mem = reg->num_mem; + repeat_stk = (OnigStackIndex* )alloca_base; + + mem_start_stk = (OnigStackIndex* )(repeat_stk + reg->num_repeat); + mem_end_stk = mem_start_stk + (num_mem + 1); + for (i = 0; i <= num_mem; i++) { + mem_start_stk[i] = mem_end_stk[i] = INVALID_STACK_INDEX; + } +#else /* USE_SUBEXP_CALL */ + /* Stack #0 not is used. */ n = reg->num_repeat + reg->num_mem * 2; STACK_INIT(alloca_base, n, INIT_MATCH_STACK_SIZE); @@ -1291,10 +1368,11 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, for (i = 1; i <= num_mem; i++) { mem_start_stk[i] = mem_end_stk[i] = INVALID_STACK_INDEX; } +#endif /* USE_SUBEXP_CALL */ #ifdef ONIG_DEBUG_MATCH - fprintf(stderr, "match_at: str: %"PRIdPTR", end: %"PRIdPTR", start: %"PRIdPTR", sprev: %"PRIdPTR"\n", - (intptr_t)str, (intptr_t)end, (intptr_t)sstart, (intptr_t)sprev); + fprintf(stderr, "match_at: str: %"PRIdPTR" (%p), end: %"PRIdPTR" (%p), start: %"PRIdPTR" (%p), sprev: %"PRIdPTR" (%p)\n", + (intptr_t)str, str, (intptr_t)end, end, (intptr_t)sstart, sstart, (intptr_t)sprev, sprev); fprintf(stderr, "size: %d, start offset: %d\n", (int )(end - str), (int )(sstart - str)); #endif @@ -1302,6 +1380,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, STACK_PUSH_ENSURED(STK_ALT, (UChar *)FinishCode); /* bottom stack */ best_len = ONIG_MISMATCH; s = (UChar* )sstart; + pkeep = (UChar* )sstart; while (1) { #ifdef ONIG_DEBUG_MATCH if (s) { @@ -1309,9 +1388,11 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, int len; fprintf(stderr, "%4d> \"", (int )(s - str)); bp = buf; - for (i = 0, q = s; i < 7 && q < end; i++) { - len = enclen(encode, q, end); - while (len-- > 0) *bp++ = *q++; + if (*p != OP_FINISH) { /* s may not be a valid pointer if OP_FINISH. */ + for (i = 0, q = s; i < 7 && q < end; i++) { + len = enclen(encode, q, end); + while (len-- > 0) *bp++ = *q++; + } } if (q < end) { xmemcpy(bp, "...\"", 4); bp += 4; } else { xmemcpy(bp, "\"", 1); bp += 1; } @@ -1332,32 +1413,32 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE if (IS_FIND_LONGEST(option)) { if (n > msa->best_len) { - msa->best_len = (int)n; + msa->best_len = n; msa->best_s = (UChar* )sstart; } else goto end_best_len; } #endif - best_len = (int)n; + best_len = n; region = msa->region; if (region) { #ifdef USE_POSIX_API_REGION_OPTION if (IS_POSIX_REGION(msa->options)) { posix_regmatch_t* rmt = (posix_regmatch_t* )region; - rmt[0].rm_so = sstart - str; - rmt[0].rm_eo = s - str; + rmt[0].rm_so = (regoff_t )(((pkeep > s) ? s : pkeep) - str); + rmt[0].rm_eo = (regoff_t )(s - str); for (i = 1; i <= num_mem; i++) { if (mem_end_stk[i] != INVALID_STACK_INDEX) { if (BIT_STATUS_AT(reg->bt_mem_start, i)) - rmt[i].rm_so = STACK_AT(mem_start_stk[i])->u.mem.pstr - str; + rmt[i].rm_so = (regoff_t )(STACK_AT(mem_start_stk[i])->u.mem.pstr - str); else - rmt[i].rm_so = (UChar* )((void* )(mem_start_stk[i])) - str; + rmt[i].rm_so = (regoff_t )((UChar* )((void* )(mem_start_stk[i])) - str); - rmt[i].rm_eo = (BIT_STATUS_AT(reg->bt_mem_end, i) + rmt[i].rm_eo = (regoff_t )((BIT_STATUS_AT(reg->bt_mem_end, i) ? STACK_AT(mem_end_stk[i])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[i])) - str; + : (UChar* )((void* )mem_end_stk[i])) - str); } else { rmt[i].rm_so = rmt[i].rm_eo = ONIG_REGION_NOTPOS; @@ -1366,16 +1447,18 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } else { #endif /* USE_POSIX_API_REGION_OPTION */ - region->beg[0] = (int)(sstart - str); - region->end[0] = (int)(s - str); + region->beg[0] = ((pkeep > s) ? s : pkeep) - str; + region->end[0] = s - str; for (i = 1; i <= num_mem; i++) { if (mem_end_stk[i] != INVALID_STACK_INDEX) { - region->beg[i] = (int)((BIT_STATUS_AT(reg->bt_mem_start, i)) - ? STACK_AT(mem_start_stk[i])->u.mem.pstr - str - : (UChar* )((void* )mem_start_stk[i]) - str); - region->end[i] = (int)(BIT_STATUS_AT(reg->bt_mem_end, i) - ? STACK_AT(mem_end_stk[i])->u.mem.pstr - str - : (UChar* )((void* )mem_end_stk[i]) - str); + if (BIT_STATUS_AT(reg->bt_mem_start, i)) + region->beg[i] = STACK_AT(mem_start_stk[i])->u.mem.pstr - str; + else + region->beg[i] = (UChar* )((void* )mem_start_stk[i]) - str; + + region->end[i] = (BIT_STATUS_AT(reg->bt_mem_end, i) + ? STACK_AT(mem_end_stk[i])->u.mem.pstr + : (UChar* )((void* )mem_end_stk[i])) - str; } else { region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; @@ -1397,8 +1480,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } node->group = 0; - node->beg = sstart - str; - node->end = s - str; + node->beg = ((pkeep > s) ? s : pkeep) - str; + node->end = s - str; stkp = stk_base; r = make_capture_history_tree(region->history_root, &stkp, @@ -1795,7 +1878,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, DATA_ENSURE(1); n = enclen(encode, s, end); DATA_ENSURE(n); - if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail; + if (ONIGENC_IS_MBC_NEWLINE_EX(encode, s, str, end, option, 0)) goto fail; s += n; MOP_OUT; break; @@ -1810,10 +1893,10 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_ANYCHAR_STAR: MOP_IN(OP_ANYCHAR_STAR); while (DATA_ENSURE_CHECK1) { - STACK_PUSH_ALT(p, s, sprev); + STACK_PUSH_ALT(p, s, sprev, pkeep); n = enclen(encode, s, end); DATA_ENSURE(n); - if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail; + if (ONIGENC_IS_MBC_NEWLINE_EX(encode, s, str, end, option, 0)) goto fail; sprev = s; s += n; } @@ -1822,7 +1905,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_ANYCHAR_ML_STAR: MOP_IN(OP_ANYCHAR_ML_STAR); while (DATA_ENSURE_CHECK1) { - STACK_PUSH_ALT(p, s, sprev); + STACK_PUSH_ALT(p, s, sprev, pkeep); n = enclen(encode, s, end); if (n > 1) { DATA_ENSURE(n); @@ -1840,11 +1923,11 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_ANYCHAR_STAR_PEEK_NEXT: MOP_IN(OP_ANYCHAR_STAR_PEEK_NEXT); while (DATA_ENSURE_CHECK1) { if (*p == *s) { - STACK_PUSH_ALT(p + 1, s, sprev); + STACK_PUSH_ALT(p + 1, s, sprev, pkeep); } n = enclen(encode, s, end); DATA_ENSURE(n); - if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail; + if (ONIGENC_IS_MBC_NEWLINE_EX(encode, s, str, end, option, 0)) goto fail; sprev = s; s += n; } @@ -1855,7 +1938,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_ANYCHAR_ML_STAR_PEEK_NEXT:MOP_IN(OP_ANYCHAR_ML_STAR_PEEK_NEXT); while (DATA_ENSURE_CHECK1) { if (*p == *s) { - STACK_PUSH_ALT(p + 1, s, sprev); + STACK_PUSH_ALT(p + 1, s, sprev, pkeep); } n = enclen(encode, s, end); if (n > 1) { @@ -1879,10 +1962,10 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, STATE_CHECK_VAL(scv, mem); if (scv) goto fail; - STACK_PUSH_ALT_WITH_STATE_CHECK(p, s, sprev, mem); + STACK_PUSH_ALT_WITH_STATE_CHECK(p, s, sprev, mem, pkeep); n = enclen(encode, s, end); DATA_ENSURE(n); - if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail; + if (ONIGENC_IS_MBC_NEWLINE_EX(encode, s, str, end, option, 0)) goto fail; sprev = s; s += n; } @@ -1897,7 +1980,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, STATE_CHECK_VAL(scv, mem); if (scv) goto fail; - STACK_PUSH_ALT_WITH_STATE_CHECK(p, s, sprev, mem); + STACK_PUSH_ALT_WITH_STATE_CHECK(p, s, sprev, mem, pkeep); n = enclen(encode, s, end); if (n > 1) { DATA_ENSURE(n); @@ -1922,6 +2005,15 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, MOP_OUT; break; + case OP_ASCII_WORD: MOP_IN(OP_ASCII_WORD); + DATA_ENSURE(1); + if (! ONIGENC_IS_MBC_ASCII_WORD(encode, s, end)) + goto fail; + + s += enclen(encode, s, end); + MOP_OUT; + break; + case OP_NOT_WORD: MOP_IN(OP_NOT_WORD); DATA_ENSURE(1); if (ONIGENC_IS_MBC_WORD(encode, s, end)) @@ -1931,6 +2023,15 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, MOP_OUT; break; + case OP_NOT_ASCII_WORD: MOP_IN(OP_NOT_ASCII_WORD); + DATA_ENSURE(1); + if (ONIGENC_IS_MBC_ASCII_WORD(encode, s, end)) + goto fail; + + s += enclen(encode, s, end); + MOP_OUT; + break; + case OP_WORD_BOUND: MOP_IN(OP_WORD_BOUND); if (ON_STR_BEGIN(s)) { DATA_ENSURE(1); @@ -1950,6 +2051,25 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, continue; break; + case OP_ASCII_WORD_BOUND: MOP_IN(OP_ASCII_WORD_BOUND); + if (ON_STR_BEGIN(s)) { + DATA_ENSURE(1); + if (! ONIGENC_IS_MBC_ASCII_WORD(encode, s, end)) + goto fail; + } + else if (ON_STR_END(s)) { + if (! ONIGENC_IS_MBC_ASCII_WORD(encode, sprev, end)) + goto fail; + } + else { + if (ONIGENC_IS_MBC_ASCII_WORD(encode, s, end) + == ONIGENC_IS_MBC_ASCII_WORD(encode, sprev, end)) + goto fail; + } + MOP_OUT; + continue; + break; + case OP_NOT_WORD_BOUND: MOP_IN(OP_NOT_WORD_BOUND); if (ON_STR_BEGIN(s)) { if (DATA_ENSURE_CHECK1 && ONIGENC_IS_MBC_WORD(encode, s, end)) @@ -1968,6 +2088,24 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, continue; break; + case OP_NOT_ASCII_WORD_BOUND: MOP_IN(OP_NOT_ASCII_WORD_BOUND); + if (ON_STR_BEGIN(s)) { + if (DATA_ENSURE_CHECK1 && ONIGENC_IS_MBC_ASCII_WORD(encode, s, end)) + goto fail; + } + else if (ON_STR_END(s)) { + if (ONIGENC_IS_MBC_ASCII_WORD(encode, sprev, end)) + goto fail; + } + else { + if (ONIGENC_IS_MBC_ASCII_WORD(encode, s, end) + != ONIGENC_IS_MBC_ASCII_WORD(encode, sprev, end)) + goto fail; + } + MOP_OUT; + continue; + break; + #ifdef USE_WORD_BEGIN_END case OP_WORD_BEGIN: MOP_IN(OP_WORD_BEGIN); if (DATA_ENSURE_CHECK1 && ONIGENC_IS_MBC_WORD(encode, s, end)) { @@ -1979,6 +2117,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, goto fail; break; + case OP_ASCII_WORD_BEGIN: MOP_IN(OP_ASCII_WORD_BEGIN); + if (DATA_ENSURE_CHECK1 && ONIGENC_IS_MBC_ASCII_WORD(encode, s, end)) { + if (ON_STR_BEGIN(s) || !ONIGENC_IS_MBC_ASCII_WORD(encode, sprev, end)) { + MOP_OUT; + continue; + } + } + goto fail; + break; + case OP_WORD_END: MOP_IN(OP_WORD_END); if (!ON_STR_BEGIN(s) && ONIGENC_IS_MBC_WORD(encode, sprev, end)) { if (ON_STR_END(s) || !ONIGENC_IS_MBC_WORD(encode, s, end)) { @@ -1988,6 +2136,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } goto fail; break; + + case OP_ASCII_WORD_END: MOP_IN(OP_ASCII_WORD_END); + if (!ON_STR_BEGIN(s) && ONIGENC_IS_MBC_ASCII_WORD(encode, sprev, end)) { + if (ON_STR_END(s) || !ONIGENC_IS_MBC_ASCII_WORD(encode, s, end)) { + MOP_OUT; + continue; + } + } + goto fail; + break; #endif case OP_BEGIN_BUF: MOP_IN(OP_BEGIN_BUF); @@ -2005,12 +2163,18 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, break; case OP_BEGIN_LINE: MOP_IN(OP_BEGIN_LINE); + op_begin_line: if (ON_STR_BEGIN(s)) { if (IS_NOTBOL(msa->options)) goto fail; MOP_OUT; continue; } - else if (ONIGENC_IS_MBC_NEWLINE(encode, sprev, end) && !ON_STR_END(s)) { + else if (ONIGENC_IS_MBC_NEWLINE(encode, sprev, end) +#ifdef USE_CRNL_AS_LINE_TERMINATOR + && !(IS_NEWLINE_CRLF(option) + && ONIGENC_IS_MBC_CRNL(encode, sprev, end)) +#endif + && !ON_STR_END(s)) { MOP_OUT; continue; } @@ -2020,7 +2184,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_END_LINE: MOP_IN(OP_END_LINE); if (ON_STR_END(s)) { #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE - if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) { + if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE_EX(encode, sprev, str, end, option, 1)) { #endif if (IS_NOTEOL(msa->options)) goto fail; MOP_OUT; @@ -2029,23 +2193,17 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } #endif } - else if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) { - MOP_OUT; - continue; - } -#ifdef USE_CRNL_AS_LINE_TERMINATOR - else if (ONIGENC_IS_MBC_CRNL(encode, s, end)) { + else if (ONIGENC_IS_MBC_NEWLINE_EX(encode, s, str, end, option, 1)) { MOP_OUT; continue; } -#endif goto fail; break; case OP_SEMI_END_BUF: MOP_IN(OP_SEMI_END_BUF); if (ON_STR_END(s)) { #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE - if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) { + if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE_EX(encode, sprev, str, end, option, 1)) { #endif if (IS_NOTEOL(msa->options)) goto fail; MOP_OUT; @@ -2054,32 +2212,41 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } #endif } - else if (ONIGENC_IS_MBC_NEWLINE(encode, s, end) && - ON_STR_END(s + enclen(encode, s, end))) { - MOP_OUT; - continue; - } + else if (ONIGENC_IS_MBC_NEWLINE_EX(encode, s, str, end, option, 1)) { + UChar* ss = s + enclen(encode, s, end); + if (ON_STR_END(ss)) { + MOP_OUT; + continue; + } #ifdef USE_CRNL_AS_LINE_TERMINATOR - else if (ONIGENC_IS_MBC_CRNL(encode, s, end)) { - UChar* ss = s + enclen(encode, s); - ss += enclen(encode, ss); - if (ON_STR_END(ss)) { - MOP_OUT; - continue; - } - } + else if (ss < end) { + ss += enclen(encode, ss, end); + if (ON_STR_END(ss)) { + MOP_OUT; + continue; + } + } #endif + } goto fail; break; case OP_BEGIN_POSITION: MOP_IN(OP_BEGIN_POSITION); - if (s != msa->start) + if (s != msa->gpos) goto fail; MOP_OUT; continue; break; + case OP_BEGIN_POS_OR_LINE: MOP_IN(OP_BEGIN_POS_OR_LINE); + if (s != msa->gpos) + goto op_begin_line; + + MOP_OUT; + continue; + break; + case OP_MEMORY_START_PUSH: MOP_IN(OP_MEMORY_START_PUSH); GET_MEMNUM_INC(mem, p); STACK_PUSH_MEM_START(mem, s); @@ -2108,6 +2275,12 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, continue; break; + case OP_KEEP: MOP_IN(OP_KEEP); + pkeep = s; + MOP_OUT; + continue; + break; + #ifdef USE_SUBEXP_CALL case OP_MEMORY_END_PUSH_REC: MOP_IN(OP_MEMORY_END_PUSH_REC); GET_MEMNUM_INC(mem, p); @@ -2272,7 +2445,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, DATA_ENSURE(n); sprev = s; swork = s; - STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, (int)n, end, is_fail); + STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, end, is_fail); if (is_fail) continue; s = swork; while (sprev + (len = enclen(encode, sprev, end)) < s) @@ -2319,7 +2492,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #if 0 /* no need: IS_DYNAMIC_OPTION() == 0 */ case OP_SET_OPTION_PUSH: MOP_IN(OP_SET_OPTION_PUSH); GET_OPTION_INC(option, p); - STACK_PUSH_ALT(p, s, sprev); + STACK_PUSH_ALT(p, s, sprev, pkeep); p += SIZE_OP_SET_OPTION + SIZE_OP_FAIL; MOP_OUT; continue; @@ -2347,8 +2520,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, STACK_NULL_CHECK(isnull, mem, s); if (isnull) { #ifdef ONIG_DEBUG_MATCH - fprintf(stderr, "NULL_CHECK_END: skip id:%d, s:%"PRIdPTR"\n", - (int )mem, (intptr_t )s); + fprintf(stderr, "NULL_CHECK_END: skip id:%d, s:%"PRIdPTR" (%p)\n", + (int )mem, (intptr_t )s, s); #endif null_check_found: /* empty loop founded, skip next instruction */ @@ -2382,8 +2555,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, STACK_NULL_CHECK_MEMST(isnull, mem, s, reg); if (isnull) { #ifdef ONIG_DEBUG_MATCH - fprintf(stderr, "NULL_CHECK_END_MEMST: skip id:%d, s:%"PRIdPTR"\n", - (int )mem, (intptr_t )s); + fprintf(stderr, "NULL_CHECK_END_MEMST: skip id:%d, s:%"PRIdPTR" (%p)\n", + (int )mem, (intptr_t )s, s); #endif if (isnull == -1) goto fail; goto null_check_found; @@ -2408,8 +2581,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #endif if (isnull) { #ifdef ONIG_DEBUG_MATCH - fprintf(stderr, "NULL_CHECK_END_MEMST_PUSH: skip id:%d, s:%"PRIdPTR"\n", - (int )mem, (intptr_t )s); + fprintf(stderr, "NULL_CHECK_END_MEMST_PUSH: skip id:%d, s:%"PRIdPTR" (%p)\n", + (int )mem, (intptr_t )s, s); #endif if (isnull == -1) goto fail; goto null_check_found; @@ -2433,7 +2606,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_PUSH: MOP_IN(OP_PUSH); GET_RELADDR_INC(addr, p); - STACK_PUSH_ALT(p + addr, s, sprev); + STACK_PUSH_ALT(p + addr, s, sprev, pkeep); MOP_OUT; continue; break; @@ -2445,7 +2618,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (scv) goto fail; GET_RELADDR_INC(addr, p); - STACK_PUSH_ALT_WITH_STATE_CHECK(p + addr, s, sprev, mem); + STACK_PUSH_ALT_WITH_STATE_CHECK(p + addr, s, sprev, mem, pkeep); MOP_OUT; continue; break; @@ -2458,7 +2631,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, p += addr; } else { - STACK_PUSH_ALT_WITH_STATE_CHECK(p + addr, s, sprev, mem); + STACK_PUSH_ALT_WITH_STATE_CHECK(p + addr, s, sprev, mem, pkeep); } MOP_OUT; continue; @@ -2485,7 +2658,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, GET_RELADDR_INC(addr, p); if (*p == *s && DATA_ENSURE_CHECK1) { p++; - STACK_PUSH_ALT(p + addr, s, sprev); + STACK_PUSH_ALT(p + addr, s, sprev, pkeep); MOP_OUT; continue; } @@ -2498,7 +2671,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, GET_RELADDR_INC(addr, p); if (*p == *s) { p++; - STACK_PUSH_ALT(p + addr, s, sprev); + STACK_PUSH_ALT(p + addr, s, sprev, pkeep); MOP_OUT; continue; } @@ -2517,7 +2690,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, STACK_PUSH_REPEAT(mem, p); if (reg->repeat_range[mem].lower == 0) { - STACK_PUSH_ALT(p + addr, s, sprev); + STACK_PUSH_ALT(p + addr, s, sprev, pkeep); } } MOP_OUT; @@ -2534,7 +2707,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, STACK_PUSH_REPEAT(mem, p); if (reg->repeat_range[mem].lower == 0) { - STACK_PUSH_ALT(p, s, sprev); + STACK_PUSH_ALT(p, s, sprev, pkeep); p += addr; } } @@ -2553,7 +2726,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, /* end of repeat. Nothing to do. */ } else if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { - STACK_PUSH_ALT(p, s, sprev); + STACK_PUSH_ALT(p, s, sprev, pkeep); p = STACK_AT(si)->u.repeat.pcode; /* Don't use stkp after PUSH. */ } else { @@ -2584,7 +2757,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, UChar* pcode = stkp->u.repeat.pcode; STACK_PUSH_REPEAT_INC(si); - STACK_PUSH_ALT(pcode, s, sprev); + STACK_PUSH_ALT(pcode, s, sprev, pkeep); } else { p = stkp->u.repeat.pcode; @@ -2607,7 +2780,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, break; case OP_PUSH_POS: MOP_IN(OP_PUSH_POS); - STACK_PUSH_POS(s, sprev); + STACK_PUSH_POS(s, sprev, pkeep); MOP_OUT; continue; break; @@ -2624,7 +2797,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_PUSH_POS_NOT: MOP_IN(OP_PUSH_POS_NOT); GET_RELADDR_INC(addr, p); - STACK_PUSH_POS_NOT(p + addr, s, sprev); + STACK_PUSH_POS_NOT(p + addr, s, sprev, pkeep); MOP_OUT; continue; break; @@ -2666,7 +2839,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, /* goto fail; */ } else { - STACK_PUSH_LOOK_BEHIND_NOT(p + addr, s, sprev); + STACK_PUSH_LOOK_BEHIND_NOT(p + addr, s, sprev, pkeep); s = q; sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s, end); } @@ -2696,6 +2869,18 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, break; #endif + case OP_CONDITION: MOP_IN(OP_CONDITION); + GET_MEMNUM_INC(mem, p); + GET_RELADDR_INC(addr, p); + if ((mem > num_mem) || + (mem_end_stk[mem] == INVALID_STACK_INDEX) || + (mem_start_stk[mem] == INVALID_STACK_INDEX)) { + p += addr; + } + MOP_OUT; + continue; + break; + case OP_FINISH: goto finish; break; @@ -2708,6 +2893,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, p = stk->u.state.pcode; s = stk->u.state.pstr; sprev = stk->u.state.pstr_prev; + pkeep = stk->u.state.pkeep; #ifdef USE_COMBINATION_EXPLOSION_CHECK if (stk->u.state.state_check != 0) { @@ -2772,7 +2958,7 @@ slow_search(OnigEncoding enc, UChar* target, UChar* target_end, } s += n; } - return (UChar*)NULL; + return (UChar* )NULL; } while (s < end) { if (*s == *target) { @@ -2890,6 +3076,8 @@ slow_search_backward_ic(OnigEncoding enc, int case_fold_flag, return (UChar* )NULL; } +#ifndef USE_SUNDAY_QUICK_SEARCH +/* Boyer-Moore-Horspool search applied to a multibyte string */ static UChar* bm_search_notrev(regex_t* reg, const UChar* target, const UChar* target_end, const UChar* text, const UChar* text_end, @@ -2900,8 +3088,8 @@ bm_search_notrev(regex_t* reg, const UChar* target, const UChar* target_end, ptrdiff_t skip, tlen1; #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "bm_search_notrev: text: %"PRIuPTR", text_end: %"PRIuPTR", text_range: %"PRIuPTR"\n", - text, text_end, text_range); + fprintf(stderr, "bm_search_notrev: text: %"PRIuPTR" (%p), text_end: %"PRIuPTR" (%p), text_range: %"PRIuPTR" (%p)\n", + text, text, text_end, text_end, text_range, text_range); #endif tail = target_end - 1; @@ -2946,6 +3134,7 @@ bm_search_notrev(regex_t* reg, const UChar* target, const UChar* target_end, return (UChar* )NULL; } +/* Boyer-Moore-Horspool search */ static UChar* bm_search(regex_t* reg, const UChar* target, const UChar* target_end, const UChar* text, const UChar* text_end, const UChar* text_range) @@ -2993,10 +3182,315 @@ bm_search(regex_t* reg, const UChar* target, const UChar* target_end, return (UChar* )NULL; } +/* Boyer-Moore-Horspool search applied to a multibyte string (ignore case) */ +static UChar* +bm_search_notrev_ic(regex_t* reg, const UChar* target, const UChar* target_end, + const UChar* text, const UChar* text_end, + const UChar* text_range) +{ + const UChar *s, *se, *t, *end; + const UChar *tail; + ptrdiff_t skip, tlen1; + OnigEncoding enc = reg->enc; + int case_fold_flag = reg->case_fold_flag; + +#ifdef ONIG_DEBUG_SEARCH + fprintf(stderr, "bm_search_notrev_ic: text: %d (%p), text_end: %d (%p), text_range: %d (%p)\n", + (int )text, text, (int )text_end, text_end, (int )text_range, text_range); +#endif + + tail = target_end - 1; + tlen1 = tail - target; + end = text_range; + if (end + tlen1 > text_end) + end = text_end - tlen1; + + s = text; + + if (IS_NULL(reg->int_map)) { + while (s < end) { + se = s + tlen1; + if (str_lower_case_match(enc, case_fold_flag, target, target_end, + s, se + 1)) + return (UChar* )s; + skip = reg->map[*se]; + t = s; + do { + s += enclen(reg->enc, s, end); + } while ((s - t) < skip && s < end); + } + } + else { + while (s < end) { + se = s + tlen1; + if (str_lower_case_match(enc, case_fold_flag, target, target_end, + s, se + 1)) + return (UChar* )s; + skip = reg->int_map[*se]; + t = s; + do { + s += enclen(reg->enc, s, end); + } while ((s - t) < skip && s < end); + } + } + + return (UChar* )NULL; +} + +/* Boyer-Moore-Horspool search (ignore case) */ +static UChar* +bm_search_ic(regex_t* reg, const UChar* target, const UChar* target_end, + const UChar* text, const UChar* text_end, const UChar* text_range) +{ + const UChar *s, *p, *end; + const UChar *tail; + OnigEncoding enc = reg->enc; + int case_fold_flag = reg->case_fold_flag; + +#ifdef ONIG_DEBUG_SEARCH + fprintf(stderr, "bm_search_ic: text: %d (%p), text_end: %d (%p), text_range: %d (%p)\n", + (int )text, text, (int )text_end, text_end, (int )text_range, text_range); +#endif + + end = text_range + (target_end - target) - 1; + if (end > text_end) + end = text_end; + + tail = target_end - 1; + s = text + (target_end - target) - 1; + if (IS_NULL(reg->int_map)) { + while (s < end) { + p = s - (target_end - target) + 1; + if (str_lower_case_match(enc, case_fold_flag, target, target_end, + p, s + 1)) + return (UChar* )p; + s += reg->map[*s]; + } + } + else { /* see int_map[] */ + while (s < end) { + p = s - (target_end - target) + 1; + if (str_lower_case_match(enc, case_fold_flag, target, target_end, + p, s + 1)) + return (UChar* )p; + s += reg->int_map[*s]; + } + } + return (UChar* )NULL; +} + +#else /* USE_SUNDAY_QUICK_SEARCH */ + +/* Sunday's quick search applied to a multibyte string */ +static UChar* +bm_search_notrev(regex_t* reg, const UChar* target, const UChar* target_end, + const UChar* text, const UChar* text_end, + const UChar* text_range) +{ + const UChar *s, *se, *t, *p, *end; + const UChar *tail; + ptrdiff_t skip, tlen1; + OnigEncoding enc = reg->enc; + +#ifdef ONIG_DEBUG_SEARCH + fprintf(stderr, "bm_search_notrev: text: %d (%p), text_end: %d (%p), text_range: %d (%p)\n", + (int )text, text, (int )text_end, text_end, (int )text_range, text_range); +#endif + + tail = target_end - 1; + tlen1 = tail - target; + end = text_range; + if (end + tlen1 > text_end) + end = text_end - tlen1; + + s = text; + + if (IS_NULL(reg->int_map)) { + while (s < end) { + p = se = s + tlen1; + t = tail; + while (*p == *t) { + if (t == target) return (UChar* )s; + p--; t--; + } + if (s + 1 >= end) break; + skip = reg->map[se[1]]; + t = s; + do { + s += enclen(enc, s, end); + } while ((s - t) < skip && s < end); + } + } + else { + while (s < end) { + p = se = s + tlen1; + t = tail; + while (*p == *t) { + if (t == target) return (UChar* )s; + p--; t--; + } + if (s + 1 >= end) break; + skip = reg->int_map[se[1]]; + t = s; + do { + s += enclen(enc, s, end); + } while ((s - t) < skip && s < end); + } + } + + return (UChar* )NULL; +} + +/* Sunday's quick search */ +static UChar* +bm_search(regex_t* reg, const UChar* target, const UChar* target_end, + const UChar* text, const UChar* text_end, const UChar* text_range) +{ + const UChar *s, *t, *p, *end; + const UChar *tail; + ptrdiff_t tlen1; + + tail = target_end - 1; + tlen1 = tail - target; + end = text_range + tlen1; + if (end > text_end) + end = text_end; + + s = text + tlen1; + if (IS_NULL(reg->int_map)) { + while (s < end) { + p = s; + t = tail; + while (*p == *t) { + if (t == target) return (UChar* )p; + p--; t--; + } + if (s + 1 >= end) break; + s += reg->map[s[1]]; + } + } + else { /* see int_map[] */ + while (s < end) { + p = s; + t = tail; + while (*p == *t) { + if (t == target) return (UChar* )p; + p--; t--; + } + if (s + 1 >= end) break; + s += reg->int_map[s[1]]; + } + } + return (UChar* )NULL; +} + +/* Sunday's quick search applied to a multibyte string (ignore case) */ +static UChar* +bm_search_notrev_ic(regex_t* reg, const UChar* target, const UChar* target_end, + const UChar* text, const UChar* text_end, + const UChar* text_range) +{ + const UChar *s, *se, *t, *end; + const UChar *tail; + ptrdiff_t skip, tlen1; + OnigEncoding enc = reg->enc; + int case_fold_flag = reg->case_fold_flag; + +#ifdef ONIG_DEBUG_SEARCH + fprintf(stderr, "bm_search_notrev_ic: text: %d (%p), text_end: %d (%p), text_range: %d (%p)\n", + (int )text, text, (int )text_end, text_end, (int )text_range, text_range); +#endif + + tail = target_end - 1; + tlen1 = tail - target; + end = text_range; + if (end + tlen1 > text_end) + end = text_end - tlen1; + + s = text; + + if (IS_NULL(reg->int_map)) { + while (s < end) { + se = s + tlen1; + if (str_lower_case_match(enc, case_fold_flag, target, target_end, + s, se + 1)) + return (UChar* )s; + if (s + 1 >= end) break; + skip = reg->map[se[1]]; + t = s; + do { + s += enclen(enc, s, end); + } while ((s - t) < skip && s < end); + } + } + else { + while (s < end) { + se = s + tlen1; + if (str_lower_case_match(enc, case_fold_flag, target, target_end, + s, se + 1)) + return (UChar* )s; + if (s + 1 >= end) break; + skip = reg->int_map[se[1]]; + t = s; + do { + s += enclen(enc, s, end); + } while ((s - t) < skip && s < end); + } + } + + return (UChar* )NULL; +} + +/* Sunday's quick search (ignore case) */ +static UChar* +bm_search_ic(regex_t* reg, const UChar* target, const UChar* target_end, + const UChar* text, const UChar* text_end, const UChar* text_range) +{ + const UChar *s, *p, *end; + const UChar *tail; + ptrdiff_t tlen1; + OnigEncoding enc = reg->enc; + int case_fold_flag = reg->case_fold_flag; + +#ifdef ONIG_DEBUG_SEARCH + fprintf(stderr, "bm_search_ic: text: %d (%p), text_end: %d (%p), text_range: %d (%p)\n", + (int )text, text, (int )text_end, text_end, (int )text_range, text_range); +#endif + + tail = target_end - 1; + tlen1 = tail - target; + end = text_range + tlen1; + if (end > text_end) + end = text_end; + + s = text + tlen1; + if (IS_NULL(reg->int_map)) { + while (s < end) { + p = s - tlen1; + if (str_lower_case_match(enc, case_fold_flag, target, target_end, + p, s + 1)) + return (UChar* )p; + if (s + 1 >= end) break; + s += reg->map[s[1]]; + } + } + else { /* see int_map[] */ + while (s < end) { + p = s - tlen1; + if (str_lower_case_match(enc, case_fold_flag, target, target_end, + p, s + 1)) + return (UChar* )p; + if (s + 1 >= end) break; + s += reg->int_map[s[1]]; + } + } + return (UChar* )NULL; +} +#endif /* USE_SUNDAY_QUICK_SEARCH */ + static int set_bm_backward_skip(UChar* s, UChar* end, OnigEncoding enc ARG_UNUSED, int** skip) - { int i, len; @@ -3005,7 +3499,7 @@ set_bm_backward_skip(UChar* s, UChar* end, OnigEncoding enc ARG_UNUSED, if (IS_NULL(*skip)) return ONIGERR_MEMORY; } - len = (int)(end - s); + len = (int )(end - s); for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*skip)[i] = len; @@ -3073,11 +3567,11 @@ map_search_backward(OnigEncoding enc, UChar map[], return (UChar* )NULL; } -extern long +extern OnigPosition onig_match(regex_t* reg, const UChar* str, const UChar* end, const UChar* at, OnigRegion* region, OnigOptionType option) { - long r; + ptrdiff_t r; UChar *prev; OnigMatchArg msa; @@ -3106,7 +3600,7 @@ onig_match(regex_t* reg, const UChar* str, const UChar* end, const UChar* at, On THREAD_ATOMIC_END; #endif /* USE_RECOMPILE_API && USE_MULTI_THREAD_SYSTEM */ - MATCH_ARG_INIT(msa, option, region, at); + MATCH_ARG_INIT(msa, option, region, at, at); #ifdef USE_COMBINATION_EXPLOSION_CHECK { int offset = at - str; @@ -3145,8 +3639,8 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, UChar *p, *pprev = (UChar* )NULL; #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "forward_search_range: str: %"PRIuPTR", end: %"PRIuPTR", s: %"PRIuPTR", range: %"PRIuPTR"\n", - str, end, s, range); + fprintf(stderr, "forward_search_range: str: %"PRIuPTR" (%p), end: %"PRIuPTR" (%p), s: %"PRIuPTR" (%p), range: %"PRIuPTR" (%p)\n", + str, str, end, end, s, s, range, range); #endif p = s; @@ -3178,6 +3672,14 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, p = bm_search_notrev(reg, reg->exact, reg->exact_end, p, end, range); break; + case ONIG_OPTIMIZE_EXACT_BM_IC: + p = bm_search_ic(reg, reg->exact, reg->exact_end, p, end, range); + break; + + case ONIG_OPTIMIZE_EXACT_BM_NOT_REV_IC: + p = bm_search_notrev_ic(reg, reg->exact, reg->exact_end, p, end, range); + break; + case ONIG_OPTIMIZE_MAP: p = map_search(reg->enc, reg->map, p, range, end); break; @@ -3199,7 +3701,7 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, if (!ON_STR_BEGIN(p)) { prev = onigenc_get_prev_char_head(reg->enc, (pprev ? pprev : str), p, end); - if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) + if (!ONIGENC_IS_MBC_NEWLINE_EX(reg->enc, prev, str, end, reg->options, 0)) goto retry_gate; } break; @@ -3209,15 +3711,11 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE prev = (UChar* )onigenc_get_prev_char_head(reg->enc, (pprev ? pprev : str), p); - if (prev && ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) + if (prev && ONIGENC_IS_MBC_NEWLINE_EX(reg->enc, prev, str, end, reg->options, 1)) goto retry_gate; #endif } - else if (! ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end) -#ifdef USE_CRNL_AS_LINE_TERMINATOR - && ! ONIGENC_IS_MBC_CRNL(reg->enc, p, end) -#endif - ) + else if (! ONIGENC_IS_MBC_NEWLINE_EX(reg->enc, p, str, end, reg->options, 1)) goto retry_gate; break; } @@ -3269,7 +3767,7 @@ static int set_bm_backward_skip P_((UChar* s, UChar* end, OnigEncoding enc, #define BM_BACKWARD_SEARCH_LENGTH_THRESHOLD 100 -static long +static int backward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, const UChar* range, UChar* adjrange, UChar** low, UChar** high) @@ -3289,6 +3787,8 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, break; case ONIG_OPTIMIZE_EXACT_IC: + case ONIG_OPTIMIZE_EXACT_BM_IC: + case ONIG_OPTIMIZE_EXACT_BM_NOT_REV_IC: p = slow_search_backward_ic(reg->enc, reg->case_fold_flag, reg->exact, reg->exact_end, range, adjrange, end, p); @@ -3321,7 +3821,7 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, case ANCHOR_BEGIN_LINE: if (!ON_STR_BEGIN(p)) { prev = onigenc_get_prev_char_head(reg->enc, str, p, end); - if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) { + if (!ONIGENC_IS_MBC_NEWLINE_EX(reg->enc, prev, str, end, reg->options, 0)) { p = prev; goto retry; } @@ -3333,17 +3833,13 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE prev = onigenc_get_prev_char_head(reg->enc, adjrange, p); if (IS_NULL(prev)) goto fail; - if (ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) { + if (ONIGENC_IS_MBC_NEWLINE_EX(reg->enc, prev, str, end, reg->options, 1)) { p = prev; goto retry; } #endif } - else if (! ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end) -#ifdef USE_CRNL_AS_LINE_TERMINATOR - && ! ONIGENC_IS_MBC_CRNL(reg->enc, p, end) -#endif - ) { + else if (! ONIGENC_IS_MBC_NEWLINE_EX(reg->enc, p, str, end, reg->options, 1)) { p = onigenc_get_prev_char_head(reg->enc, adjrange, p, end); if (IS_NULL(p)) goto fail; goto retry; @@ -3374,11 +3870,19 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, } -extern long +extern OnigPosition onig_search(regex_t* reg, const UChar* str, const UChar* end, const UChar* start, const UChar* range, OnigRegion* region, OnigOptionType option) { - long r; + return onig_search_gpos(reg, str, end, start, start, range, region, option); +} + +extern OnigPosition +onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, + const UChar* global_pos, + const UChar* start, const UChar* range, OnigRegion* region, OnigOptionType option) +{ + ptrdiff_t r; UChar *s, *prev; OnigMatchArg msa; const UChar *orig_start = start; @@ -3413,8 +3917,8 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, #ifdef ONIG_DEBUG_SEARCH fprintf(stderr, - "onig_search (entry point): str: %"PRIuPTR", end: %"PRIuPTR", start: %"PRIuPTR", range: %"PRIuPTR"\n", - str, end - str, start - str, range - str); + "onig_search (entry point): str: %"PRIuPTR" (%p), end: %"PRIuPTR", start: %"PRIuPTR", range: %"PRIuPTR"\n", + str, str, end - str, start - str, range - str); #endif if (region @@ -3546,6 +4050,7 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, #ifdef USE_CRNL_AS_LINE_TERMINATOR pre_end = ONIGENC_STEP_BACK(reg->enc, str, pre_end, end, 1); if (IS_NOT_NULL(pre_end) && + IS_NEWLINE_CRLF(reg->options) && ONIGENC_IS_MBC_CRNL(reg->enc, pre_end, end)) { min_semi_end = pre_end; } @@ -3575,7 +4080,7 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, s = (UChar* )start; prev = (UChar* )NULL; - MATCH_ARG_INIT(msa, option, region, start); + MATCH_ARG_INIT(msa, option, region, start, start); #ifdef USE_COMBINATION_EXPLOSION_CHECK msa.state_check_buff = (void* )0; msa.state_check_buff_size = 0; /* NO NEED, for valgrind */ @@ -3591,7 +4096,7 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, (int )(end - str), (int )(start - str), (int )(range - str)); #endif - MATCH_ARG_INIT(msa, option, region, orig_start); + MATCH_ARG_INIT(msa, option, region, start, global_pos); #ifdef USE_COMBINATION_EXPLOSION_CHECK { int offset = (MIN(start, range) - str); @@ -3644,9 +4149,19 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, if ((reg->anchor & ANCHOR_ANYCHAR_STAR) != 0) { do { + if ((reg->anchor & ANCHOR_BEGIN_POSITION) == 0) + msa.gpos = s; /* move \G position */ MATCH_AND_RETURN_CHECK(orig_range); prev = s; s += enclen(reg->enc, s, end); + + if ((reg->anchor & ANCHOR_LOOK_BEHIND) == 0) { + while (!ONIGENC_IS_MBC_NEWLINE_EX(reg->enc, prev, str, end, reg->options, 0) + && s < range) { + prev = s; + s += enclen(reg->enc, s, end); + } + } } while (s < range); goto mismatch; } @@ -3666,7 +4181,7 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, else { /* backward search */ #ifdef USE_MATCH_RANGE_MUST_BE_INSIDE_OF_SPECIFIED_RANGE if (orig_start < end) - orig_start += enclen(reg->enc, orig_start, end); /* is upper range */ + orig_start += enclen(reg->enc, orig_start, end); /* is upper range */ #endif if (reg->optimize != ONIG_OPTIMIZE_NONE) { -- cgit v1.2.3