diff options
author | akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2008-09-16 16:48:05 +0000 |
---|---|---|
committer | akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2008-09-16 16:48:05 +0000 |
commit | a67d4fa01c43124048be957f7d8e4090b7255393 (patch) | |
tree | 98c3fdd7beee9a446971cd1cb4be9086bde3ae60 /regexec.c | |
parent | bad9a9ad0d211f9c0a5c0eb12bdb57e49fa900fd (diff) |
* include/ruby/oniguruma.h (OnigEncodingTypeST): add precise_ret
argument for mbc_to_code.
(ONIGENC_MBC_TO_CODE): provide NULL for precise_ret.
(ONIGENC_MBC_PRECISE_CODEPOINT): defined.
* include/ruby/encoding.h (rb_enc_mbc_precise_codepoint): defined.
* regenc.h (onigenc_single_byte_mbc_to_code): precise_ret argument
added.
(onigenc_mbn_mbc_to_code): ditto.
* regenc.c (onigenc_single_byte_mbc_to_code): precise_ret argument
added.
(onigenc_mbn_mbc_to_code): ditto.
* string.c (count_utf8_lead_bytes_with_word): removed.
(str_utf8_nth): removed.
(str_utf8_offset): removed.
(str_strlen): UTF-8 codepoint oriented optimization removed.
(rb_str_substr): ditto.
(enc_succ_char): use rb_enc_mbc_precise_codepoint.
(enc_pred_char): ditto.
(rb_str_succ): ditto.
* encoding.c (rb_enc_ascget): check length with
rb_enc_mbc_precise_codepoint.
(rb_enc_codepoint): use rb_enc_mbc_precise_codepoint.
* regexec.c (string_cmp_ic): add text_end argument.
(match_at): check end of character after exact string matches.
* enc/utf_8.c (graphme_table): defined for extended grapheme cluster
boundary.
(grapheme_cmp): defined.
(get_grapheme_properties): defined.
(grapheme_boundary_p): defined.
(MAX_BYTES_LENGTH): defined.
(comb_char_enc_len): defined.
(mbc_to_code0): extracted from mbc_to_code.
(mbc_to_code): use mbc_to_code0.
(left_adjust_combchar_head): defined.
(utf_8): use an extended grapheme cluster as a unit.
* enc/unicode.c (onigenc_unicode_mbc_case_fold): use
ONIGENC_MBC_PRECISE_CODEPOINT to extract codepoints.
(onigenc_unicode_get_case_fold_codes_by_str): ditto.
* enc/euc_jp.c (mbc_to_code): follow mbc_to_code field change.
use onigenc_mbn_mbc_to_code.
* enc/shift_jis.c (mbc_to_code): ditto.
* enc/emacs_mule.c (mbc_to_code): ditto.
* enc/gbk.c (gbk_mbc_to_code): follow mbc_to_code field and
onigenc_mbn_mbc_to_code change.
* enc/cp949.c (cp949_mbc_to_code): ditto.
* enc/big5.c (big5_mbc_to_code): ditto.
* enc/euc_tw.c (euctw_mbc_to_code): ditto.
* enc/euc_kr.c (euckr_mbc_to_code): ditto.
* enc/gb18030.c (gb18030_mbc_to_code): ditto.
* enc/utf_32be.c (utf32be_mbc_to_code): follow mbc_to_code field
change.
* enc/utf_16be.c (utf16be_mbc_to_code): ditto.
* enc/utf_32le.c (utf32le_mbc_to_code): ditto.
* enc/utf_16le.c (utf16le_mbc_to_code): ditto.
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@19389 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'regexec.c')
-rw-r--r-- | regexec.c | 52 |
1 files changed, 39 insertions, 13 deletions
@@ -977,25 +977,24 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, }\ } while(0) -#define STRING_CMP_IC(case_fold_flag,s1,ps2,len) do {\ - if (string_cmp_ic(encode, case_fold_flag, s1, ps2, len) == 0) \ +#define STRING_CMP_IC(case_fold_flag,s1,ps2,len,text_end) do {\ + if (string_cmp_ic(encode, case_fold_flag, s1, ps2, len, text_end) == 0) \ goto fail; \ } while(0) static int string_cmp_ic(OnigEncoding enc, int case_fold_flag, - UChar* s1, UChar** ps2, int mblen) + UChar* s1, UChar** ps2, int mblen, const UChar* text_end) { UChar buf1[ONIGENC_MBC_CASE_FOLD_MAXLEN]; UChar buf2[ONIGENC_MBC_CASE_FOLD_MAXLEN]; - UChar *p1, *p2, *end1, *s2, *end2; + UChar *p1, *p2, *end1, *s2; int len1, len2; s2 = *ps2; end1 = s1 + mblen; - end2 = s2 + mblen; while (s1 < end1) { len1 = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &s1, end1, buf1); - len2 = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &s2, end2, buf2); + len2 = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &s2, text_end, buf2); if (len1 != len2) return 0; p1 = buf1; p2 = buf2; @@ -1019,8 +1018,8 @@ static int string_cmp_ic(OnigEncoding enc, int case_fold_flag, }\ } while(0) -#define STRING_CMP_VALUE_IC(case_fold_flag,s1,ps2,len,is_fail) do {\ - if (string_cmp_ic(encode, case_fold_flag, s1, ps2, len) == 0) \ +#define STRING_CMP_VALUE_IC(case_fold_flag,s1,ps2,len,text_end,is_fail) do {\ + if (string_cmp_ic(encode, case_fold_flag, s1, ps2, len, text_end) == 0) \ is_fail = 1; \ else \ is_fail = 0; \ @@ -1126,7 +1125,7 @@ static int backref_match_at_nested_level(regex_t* reg if (ignore_case != 0) { if (string_cmp_ic(reg->enc, case_fold_flag, - pstart, &ss, (int )(pend - pstart)) == 0) + pstart, &ss, (int )(pend - pstart), send) == 0) return 0; /* or goto next_mem; */ } else { @@ -1442,6 +1441,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (*p != *s++) goto fail; DATA_ENSURE(0); p++; + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; break; 
@@ -1464,6 +1465,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, p++; q++; } } + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; break; @@ -1474,6 +1477,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (*p != *s) goto fail; sprev = s; p++; s++; + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; continue; break; @@ -1487,6 +1492,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (*p != *s) goto fail; sprev = s; p++; s++; + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; continue; break; @@ -1502,6 +1509,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (*p != *s) goto fail; sprev = s; p++; s++; + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; continue; break; @@ -1519,6 +1528,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (*p != *s) goto fail; sprev = s; p++; s++; + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; continue; break; @@ -1530,6 +1541,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (*p++ != *s++) goto fail; } sprev = s - 1; + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; continue; break; @@ -1557,7 +1570,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } } } - + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; continue; break; @@ -1568,6 +1582,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, p++; s++; if (*p != *s) goto fail; p++; s++; + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; break; @@ -1582,6 +1598,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, p++; s++; if (*p != *s) goto fail; p++; s++; + if (s != end && 
ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; continue; break; @@ -1601,6 +1619,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, p++; s++; if (*p != *s) goto fail; p++; s++; + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; continue; break; @@ -1615,6 +1635,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, p++; s++; } sprev = s - 2; + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; continue; break; @@ -1631,6 +1653,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, p++; s++; } sprev = s - 3; + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; continue; break; @@ -1645,6 +1669,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, p++; s++; } sprev = s - tlen; + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; continue; break; @@ -2199,7 +2225,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, n = pend - pstart; DATA_ENSURE(n); sprev = s; - STRING_CMP_IC(case_fold_flag, pstart, &s, n); + STRING_CMP_IC(case_fold_flag, pstart, &s, n, end); while (sprev + (len = enclen(encode, sprev, end)) < s) sprev += len; @@ -2271,7 +2297,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, DATA_ENSURE(n); sprev = s; swork = s; - STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, is_fail); + STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, end, is_fail); if (is_fail) continue; s = swork; while (sprev + (len = enclen(encode, sprev, end)) < s) @@ -2780,7 +2806,7 @@ slow_search(OnigEncoding enc, UChar* target, UChar* target_end, if (target_end == t || memcmp(t, p, target_end - t) == 0) return s; } - s += enclen(enc, s, end); + s += enclen(enc, s, text_end); } return (UChar* )NULL; |