diff options
-rw-r--r-- | ChangeLog | 6 | ||||
-rw-r--r-- | string.c | 59 |
2 files changed, 64 insertions, 1 deletions
@@ -1,3 +1,7 @@ +Sat Feb 16 20:49:34 2008 NARUSE, Yui <naruse@ruby-lang.org> + + * string.c (rb_str_substr): optimized for UTF-8. + Sat Feb 16 18:13:53 2008 Tanaka Akira <akr@fsij.org> * encoding.c (rb_enc_compatible): check encoding incapable arguments. @@ -12,7 +16,7 @@ Sat Feb 16 19:04:17 2008 NARUSE, Yui <naruse@ruby-lang.org> Sat Feb 16 18:25:14 2008 NARUSE, Yui <naruse@ruby-lang.org> - * string.c (str_strlen): little more optimize. + * string.c (str_strlen): little more optimization. (rb_enc_nth): remove needless variable 'c'. Sat Feb 16 18:00:13 2008 Tanaka Akira <akr@fsij.org> @@ -1011,6 +1011,58 @@ str_offset(const char *p, const char *e, int nth, rb_encoding *enc, int singleby return pp - p; } +#ifdef NONASCII_MASK +static char * +str_utf8_nth(const char *p, const char *e, int nth) +{ + if (sizeof(long) * 2 < nth) { + const unsigned long *s, *t; + const VALUE lowbits = sizeof(unsigned long) - 1; + s = (const unsigned long*)(~lowbits & ((VALUE)p + lowbits)); + t = (const unsigned long*)(~lowbits & (VALUE)e); + for (; p<(const char *)s && 0<nth; p++) { + if (((*p)&0xC0) != 0x80) nth--; + } + while (s < t) { + unsigned long d = *s++; + d = ~d | (d<<1); + d &= NONASCII_MASK; + d >>= 7; + d += (d>>8); + d += (d>>16); +#if NONASCII_MASK == 0x8080808080808080UL + d += (d>>32); +#endif + nth -= (long)(d&0xF); + if (nth < 8) { + t = s; + break; + } + } + p = (char *)t; + } + if (0 < nth) { + while (p < e) { + if (((*p)&0xC0) != 0x80) { + nth--; + if (nth < 0) + break; + } + p++; + } + } + return (char *)p; +} + +static int +str_utf8_offset(const char *p, const char *e, int nth) +{ + const char *pp = str_utf8_nth(p, e, nth); + if (!pp) return e - p; + return pp - p; +} +#endif + static long str_sublen(VALUE str, long pos, rb_encoding *enc) { @@ -1082,6 +1134,13 @@ rb_str_substr(VALUE str, long beg, long len) if (len == 0) { p = 0; } +#ifdef NONASCII_MASK + else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID && + enc == rb_utf8_encoding()) { + p = str_utf8_nth(s, e, beg); + len = str_utf8_offset(p, e, len); + } +#endif else if ((p = str_nth(s, e, beg, enc, singlebyte)) == e) { len = 0; } |