summaryrefslogtreecommitdiff
path: root/regexec.c
diff options
context:
space:
mode:
author akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2008-09-16 16:48:05 +0000
committer akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2008-09-16 16:48:05 +0000
commit a67d4fa01c43124048be957f7d8e4090b7255393 (patch)
tree 98c3fdd7beee9a446971cd1cb4be9086bde3ae60 /regexec.c
parent bad9a9ad0d211f9c0a5c0eb12bdb57e49fa900fd (diff)
* include/ruby/oniguruma.h (OnigEncodingTypeST): add precise_ret
argument for mbc_to_code. (ONIGENC_MBC_TO_CODE): provide NULL for precise_ret. (ONIGENC_MBC_PRECISE_CODEPOINT): defined. * include/ruby/encoding.h (rb_enc_mbc_precise_codepoint): defined. * regenc.h (onigenc_single_byte_mbc_to_code): precise_ret argument added. (onigenc_mbn_mbc_to_code): ditto. * regenc.c (onigenc_single_byte_mbc_to_code): precise_ret argument added. (onigenc_mbn_mbc_to_code): ditto. * string.c (count_utf8_lead_bytes_with_word): removed. (str_utf8_nth): removed. (str_utf8_offset): removed. (str_strlen): UTF-8 codepoint oriented optimization removed. (rb_str_substr): ditto. (enc_succ_char): use rb_enc_mbc_precise_codepoint. (enc_pred_char): ditto. (rb_str_succ): ditto. * encoding.c (rb_enc_ascget): check length with rb_enc_mbc_precise_codepoint. (rb_enc_codepoint): use rb_enc_mbc_precise_codepoint. * regexec.c (string_cmp_ic): add text_end argument. (match_at): check end of character after exact string matches. * enc/utf_8.c (graphme_table): defined for extended grapheme cluster boundary. (grapheme_cmp): defined. (get_grapheme_properties): defined. (grapheme_boundary_p): defined. (MAX_BYTES_LENGTH): defined. (comb_char_enc_len): defined. (mbc_to_code0): extracted from mbc_to_code. (mbc_to_code): use mbc_to_code0. (left_adjust_combchar_head): defined. (utf_8): use an extended grapheme cluster as a unit. * enc/unicode.c (onigenc_unicode_mbc_case_fold): use ONIGENC_MBC_PRECISE_CODEPOINT to extract codepoints. (onigenc_unicode_get_case_fold_codes_by_str): ditto. * enc/euc_jp.c (mbc_to_code): follow mbc_to_code field change. use onigenc_mbn_mbc_to_code. * enc/shift_jis.c (mbc_to_code): ditto. * enc/emacs_mule.c (mbc_to_code): ditto. * enc/gbk.c (gbk_mbc_to_code): follow mbc_to_code field and onigenc_mbn_mbc_to_code change. * enc/cp949.c (cp949_mbc_to_code): ditto. * enc/big5.c (big5_mbc_to_code): ditto. * enc/euc_tw.c (euctw_mbc_to_code): ditto. * enc/euc_kr.c (euckr_mbc_to_code): ditto. * enc/gb18030.c (gb18030_mbc_to_code): ditto. 
* enc/utf_32be.c (utf32be_mbc_to_code): follow mbc_to_code field change. * enc/utf_16be.c (utf16be_mbc_to_code): ditto. * enc/utf_32le.c (utf32le_mbc_to_code): ditto. * enc/utf_16le.c (utf16le_mbc_to_code): ditto. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@19389 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'regexec.c')
-rw-r--r--regexec.c52
1 files changed, 39 insertions, 13 deletions
diff --git a/regexec.c b/regexec.c
index c936a60352..93240dae81 100644
--- a/regexec.c
+++ b/regexec.c
@@ -977,25 +977,24 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
}\
} while(0)
-#define STRING_CMP_IC(case_fold_flag,s1,ps2,len) do {\
- if (string_cmp_ic(encode, case_fold_flag, s1, ps2, len) == 0) \
+#define STRING_CMP_IC(case_fold_flag,s1,ps2,len,text_end) do {\
+ if (string_cmp_ic(encode, case_fold_flag, s1, ps2, len, text_end) == 0) \
goto fail; \
} while(0)
static int string_cmp_ic(OnigEncoding enc, int case_fold_flag,
- UChar* s1, UChar** ps2, int mblen)
+ UChar* s1, UChar** ps2, int mblen, const UChar* text_end)
{
UChar buf1[ONIGENC_MBC_CASE_FOLD_MAXLEN];
UChar buf2[ONIGENC_MBC_CASE_FOLD_MAXLEN];
- UChar *p1, *p2, *end1, *s2, *end2;
+ UChar *p1, *p2, *end1, *s2;
int len1, len2;
s2 = *ps2;
end1 = s1 + mblen;
- end2 = s2 + mblen;
while (s1 < end1) {
len1 = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &s1, end1, buf1);
- len2 = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &s2, end2, buf2);
+ len2 = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &s2, text_end, buf2);
if (len1 != len2) return 0;
p1 = buf1;
p2 = buf2;
@@ -1019,8 +1018,8 @@ static int string_cmp_ic(OnigEncoding enc, int case_fold_flag,
}\
} while(0)
-#define STRING_CMP_VALUE_IC(case_fold_flag,s1,ps2,len,is_fail) do {\
- if (string_cmp_ic(encode, case_fold_flag, s1, ps2, len) == 0) \
+#define STRING_CMP_VALUE_IC(case_fold_flag,s1,ps2,len,text_end,is_fail) do {\
+ if (string_cmp_ic(encode, case_fold_flag, s1, ps2, len, text_end) == 0) \
is_fail = 1; \
else \
is_fail = 0; \
@@ -1126,7 +1125,7 @@ static int backref_match_at_nested_level(regex_t* reg
if (ignore_case != 0) {
if (string_cmp_ic(reg->enc, case_fold_flag,
- pstart, &ss, (int )(pend - pstart)) == 0)
+ pstart, &ss, (int )(pend - pstart), send) == 0)
return 0; /* or goto next_mem; */
}
else {
@@ -1442,6 +1441,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
if (*p != *s++) goto fail;
DATA_ENSURE(0);
p++;
+ if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+ goto fail;
MOP_OUT;
break;
@@ -1464,6 +1465,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
p++; q++;
}
}
+ if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+ goto fail;
MOP_OUT;
break;
@@ -1474,6 +1477,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
if (*p != *s) goto fail;
sprev = s;
p++; s++;
+ if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+ goto fail;
MOP_OUT;
continue;
break;
@@ -1487,6 +1492,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
if (*p != *s) goto fail;
sprev = s;
p++; s++;
+ if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+ goto fail;
MOP_OUT;
continue;
break;
@@ -1502,6 +1509,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
if (*p != *s) goto fail;
sprev = s;
p++; s++;
+ if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+ goto fail;
MOP_OUT;
continue;
break;
@@ -1519,6 +1528,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
if (*p != *s) goto fail;
sprev = s;
p++; s++;
+ if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+ goto fail;
MOP_OUT;
continue;
break;
@@ -1530,6 +1541,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
if (*p++ != *s++) goto fail;
}
sprev = s - 1;
+ if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+ goto fail;
MOP_OUT;
continue;
break;
@@ -1557,7 +1570,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
}
}
}
-
+ if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+ goto fail;
MOP_OUT;
continue;
break;
@@ -1568,6 +1582,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
p++; s++;
if (*p != *s) goto fail;
p++; s++;
+ if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+ goto fail;
MOP_OUT;
break;
@@ -1582,6 +1598,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
p++; s++;
if (*p != *s) goto fail;
p++; s++;
+ if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+ goto fail;
MOP_OUT;
continue;
break;
@@ -1601,6 +1619,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
p++; s++;
if (*p != *s) goto fail;
p++; s++;
+ if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+ goto fail;
MOP_OUT;
continue;
break;
@@ -1615,6 +1635,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
p++; s++;
}
sprev = s - 2;
+ if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+ goto fail;
MOP_OUT;
continue;
break;
@@ -1631,6 +1653,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
p++; s++;
}
sprev = s - 3;
+ if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+ goto fail;
MOP_OUT;
continue;
break;
@@ -1645,6 +1669,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
p++; s++;
}
sprev = s - tlen;
+ if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+ goto fail;
MOP_OUT;
continue;
break;
@@ -2199,7 +2225,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
n = pend - pstart;
DATA_ENSURE(n);
sprev = s;
- STRING_CMP_IC(case_fold_flag, pstart, &s, n);
+ STRING_CMP_IC(case_fold_flag, pstart, &s, n, end);
while (sprev + (len = enclen(encode, sprev, end)) < s)
sprev += len;
@@ -2271,7 +2297,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
DATA_ENSURE(n);
sprev = s;
swork = s;
- STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, is_fail);
+ STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, end, is_fail);
if (is_fail) continue;
s = swork;
while (sprev + (len = enclen(encode, sprev, end)) < s)
@@ -2780,7 +2806,7 @@ slow_search(OnigEncoding enc, UChar* target, UChar* target_end,
if (target_end == t || memcmp(t, p, target_end - t) == 0)
return s;
}
- s += enclen(enc, s, end);
+ s += enclen(enc, s, text_end);
}
return (UChar* )NULL;