* include/ruby/oniguruma.h (OnigEncodingTypeST): add precise_ret

argument for mbc_to_code. (ONIGENC_MBC_TO_CODE): provide NULL for precise_ret. (ONIGENC_MBC_PRECISE_CODEPOINT): defined. * include/ruby/encoding.h (rb_enc_mbc_precise_codepoint): defined. * regenc.h (onigenc_single_byte_mbc_to_code): precise_ret argument added. (onigenc_mbn_mbc_to_code): ditto. * regenc.c (onigenc_single_byte_mbc_to_code): precise_ret argument added. (onigenc_mbn_mbc_to_code): ditto. * string.c (count_utf8_lead_bytes_with_word): removed. (str_utf8_nth): removed. (str_utf8_offset): removed. (str_strlen): UTF-8 codepoint-oriented optimization removed. (rb_str_substr): ditto. (enc_succ_char): use rb_enc_mbc_precise_codepoint. (enc_pred_char): ditto. (rb_str_succ): ditto. * encoding.c (rb_enc_ascget): check length with rb_enc_mbc_precise_codepoint. (rb_enc_codepoint): use rb_enc_mbc_precise_codepoint. * regexec.c (string_cmp_ic): add text_end argument. (match_at): check end of character after exact string matches. * enc/utf_8.c (graphme_table): defined for extended grapheme cluster boundary. (grapheme_cmp): defined. (get_grapheme_properties): defined. (grapheme_boundary_p): defined. (MAX_BYTES_LENGTH): defined. (comb_char_enc_len): defined. (mbc_to_code0): extracted from mbc_to_code. (mbc_to_code): use mbc_to_code0. (left_adjust_combchar_head): defined. (utf_8): use an extended grapheme cluster as a unit. * enc/unicode.c (onigenc_unicode_mbc_case_fold): use ONIGENC_MBC_PRECISE_CODEPOINT to extract codepoints. (onigenc_unicode_get_case_fold_codes_by_str): ditto. * enc/euc_jp.c (mbc_to_code): follow mbc_to_code field change. use onigenc_mbn_mbc_to_code. * enc/shift_jis.c (mbc_to_code): ditto. * enc/emacs_mule.c (mbc_to_code): ditto. * enc/gbk.c (gbk_mbc_to_code): follow mbc_to_code field and onigenc_mbn_mbc_to_code change. * enc/cp949.c (cp949_mbc_to_code): ditto. * enc/big5.c (big5_mbc_to_code): ditto. * enc/euc_tw.c (euctw_mbc_to_code): ditto. * enc/euc_kr.c (euckr_mbc_to_code): ditto. * enc/gb18030.c (gb18030_mbc_to_code): ditto. 
* enc/utf_32be.c (utf32be_mbc_to_code): follow mbc_to_code field change. * enc/utf_16be.c (utf16be_mbc_to_code): ditto. * enc/utf_32le.c (utf32le_mbc_to_code): ditto. * enc/utf_16le.c (utf16le_mbc_to_code): ditto. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@19389 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
author: akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2008-09-16 16:48:05 +0000
committer: akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2008-09-16 16:48:05 +0000
commit: a67d4fa01c43124048be957f7d8e4090b7255393 (patch)
tree: 98c3fdd7beee9a446971cd1cb4be9086bde3ae60 /regexec.c
parent: bad9a9ad0d211f9c0a5c0eb12bdb57e49fa900fd (diff)
1 files changed, 39 insertions, 13 deletions
diff --git a/regexec.c b/regexec.c
index c936a60352..93240dae81 100644
--- a/regexec.c
+++ b/regexec.c
@@ -977,25 +977,24 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
   }\
 } while(0)
 
-#define STRING_CMP_IC(case_fold_flag,s1,ps2,len) do {\
-  if (string_cmp_ic(encode, case_fold_flag, s1, ps2, len) == 0) \
+#define STRING_CMP_IC(case_fold_flag,s1,ps2,len,text_end) do {\
+  if (string_cmp_ic(encode, case_fold_flag, s1, ps2, len, text_end) == 0) \
     goto fail; \
 } while(0)
 
 static int string_cmp_ic(OnigEncoding enc, int case_fold_flag,
-			 UChar* s1, UChar** ps2, int mblen)
+			 UChar* s1, UChar** ps2, int mblen, const UChar* text_end)
 {
   UChar buf1[ONIGENC_MBC_CASE_FOLD_MAXLEN];
   UChar buf2[ONIGENC_MBC_CASE_FOLD_MAXLEN];
-  UChar *p1, *p2, *end1, *s2, *end2;
+  UChar *p1, *p2, *end1, *s2;
   int len1, len2;
 
   s2   = *ps2;
   end1 = s1 + mblen;
-  end2 = s2 + mblen;
   while (s1 < end1) {
     len1 = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &s1, end1, buf1);
-    len2 = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &s2, end2, buf2);
+    len2 = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &s2, text_end, buf2);
     if (len1 != len2) return 0;
     p1 = buf1;
     p2 = buf2;
@@ -1019,8 +1018,8 @@ static int string_cmp_ic(OnigEncoding enc, int case_fold_flag,
   }\
 } while(0)
 
-#define STRING_CMP_VALUE_IC(case_fold_flag,s1,ps2,len,is_fail) do {\
-  if (string_cmp_ic(encode, case_fold_flag, s1, ps2, len) == 0) \
+#define STRING_CMP_VALUE_IC(case_fold_flag,s1,ps2,len,text_end,is_fail) do {\
+  if (string_cmp_ic(encode, case_fold_flag, s1, ps2, len, text_end) == 0) \
     is_fail = 1; \
   else \
     is_fail = 0; \
@@ -1126,7 +1125,7 @@ static int backref_match_at_nested_level(regex_t* reg
 
 	    if (ignore_case != 0) {
 	      if (string_cmp_ic(reg->enc, case_fold_flag,
-				pstart, &ss, (int )(pend - pstart)) == 0)
+				pstart, &ss, (int )(pend - pstart), send) == 0)
 		return 0; /* or goto next_mem; */
 	    }
 	    else {
@@ -1442,6 +1441,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
       if (*p != *s++) goto fail;
       DATA_ENSURE(0);
       p++;
+      if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+        goto fail;
       MOP_OUT;
       break;
 
@@ -1464,6 +1465,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
 	  p++; q++;
 	}
       }
+      if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+        goto fail;
       MOP_OUT;
       break;
 
@@ -1474,6 +1477,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
       if (*p != *s) goto fail;
       sprev = s;
       p++; s++;
+      if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+        goto fail;
       MOP_OUT;
       continue;
       break;
@@ -1487,6 +1492,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
       if (*p != *s) goto fail;
       sprev = s;
       p++; s++;
+      if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+        goto fail;
       MOP_OUT;
       continue;
       break;
@@ -1502,6 +1509,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
       if (*p != *s) goto fail;
       sprev = s;
       p++; s++;
+      if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+        goto fail;
       MOP_OUT;
       continue;
       break;
@@ -1519,6 +1528,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
       if (*p != *s) goto fail;
       sprev = s;
       p++; s++;
+      if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+        goto fail;
       MOP_OUT;
       continue;
       break;
@@ -1530,6 +1541,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
 	if (*p++ != *s++) goto fail;
       }
       sprev = s - 1;
+      if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+        goto fail;
       MOP_OUT;
       continue;
       break;
@@ -1557,7 +1570,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
 	  }
 	}
       }
-
+      if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+        goto fail;
       MOP_OUT;
       continue;
       break;
@@ -1568,6 +1582,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
       p++; s++;
       if (*p != *s) goto fail;
       p++; s++;
+      if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+        goto fail;
       MOP_OUT;
       break;
 
@@ -1582,6 +1598,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
       p++; s++;
       if (*p != *s) goto fail;
       p++; s++;
+      if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+        goto fail;
       MOP_OUT;
       continue;
       break;
@@ -1601,6 +1619,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
       p++; s++;
       if (*p != *s) goto fail;
       p++; s++;
+      if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+        goto fail;
       MOP_OUT;
       continue;
       break;
@@ -1615,6 +1635,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
 	p++; s++;
       }
       sprev = s - 2;
+      if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+        goto fail;
       MOP_OUT;
       continue;
       break;
@@ -1631,6 +1653,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
 	p++; s++;
       }
       sprev = s - 3;
+      if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+        goto fail;
       MOP_OUT;
       continue;
       break;
@@ -1645,6 +1669,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
 	p++; s++;
       }
       sprev = s - tlen;
+      if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s)
+        goto fail;
       MOP_OUT;
       continue;
       break;
@@ -2199,7 +2225,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
 	n = pend - pstart;
 	DATA_ENSURE(n);
 	sprev = s;
-	STRING_CMP_IC(case_fold_flag, pstart, &s, n);
+	STRING_CMP_IC(case_fold_flag, pstart, &s, n, end);
 	while (sprev + (len = enclen(encode, sprev, end)) < s)
 	  sprev += len;
 
@@ -2271,7 +2297,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
 	  DATA_ENSURE(n);
 	  sprev = s;
 	  swork = s;
-	  STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, is_fail);
+	  STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, end, is_fail);
 	  if (is_fail) continue;
 	  s = swork;
 	  while (sprev + (len = enclen(encode, sprev, end)) < s)
@@ -2780,7 +2806,7 @@ slow_search(OnigEncoding enc, UChar* target, UChar* target_end,
       if (target_end == t || memcmp(t, p, target_end - t) == 0)
 	return s;
     }
-    s += enclen(enc, s, end);
+    s += enclen(enc, s, text_end);
   }
 
   return (UChar* )NULL;
author	akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>	2008-09-16 16:48:05 +0000
committer	akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>	2008-09-16 16:48:05 +0000
commit	a67d4fa01c43124048be957f7d8e4090b7255393 (patch)
tree	98c3fdd7beee9a446971cd1cb4be9086bde3ae60 /regexec.c
parent	bad9a9ad0d211f9c0a5c0eb12bdb57e49fa900fd (diff)