summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- ChangeLog 6
-rw-r--r-- string.c 59
2 files changed, 64 insertions, 1 deletions
diff --git a/ChangeLog b/ChangeLog
index 35a49c22ed..3979cf1ce8 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+Sat Feb 16 20:49:34 2008 NARUSE, Yui <naruse@ruby-lang.org>
+
+ * string.c (rb_str_substr): optimized for UTF-8.
+
Sat Feb 16 18:13:53 2008 Tanaka Akira <akr@fsij.org>
* encoding.c (rb_enc_compatible): check encoding incapable arguments.
@@ -12,7 +16,7 @@ Sat Feb 16 19:04:17 2008 NARUSE, Yui <naruse@ruby-lang.org>
Sat Feb 16 18:25:14 2008 NARUSE, Yui <naruse@ruby-lang.org>
- * string.c (str_strlen): little more optimize.
+ * string.c (str_strlen): little more optimization.
(rb_enc_nth): remove needless variable 'c'.
Sat Feb 16 18:00:13 2008 Tanaka Akira <akr@fsij.org>
diff --git a/string.c b/string.c
index 341bfd7463..3a78f66a9a 100644
--- a/string.c
+++ b/string.c
@@ -1011,6 +1011,58 @@ str_offset(const char *p, const char *e, int nth, rb_encoding *enc, int singleby
return pp - p;
}
+#ifdef NONASCII_MASK
+static char *
+str_utf8_nth(const char *p, const char *e, int nth)
+{
+ if (sizeof(long) * 2 < nth) {
+ const unsigned long *s, *t;
+ const VALUE lowbits = sizeof(unsigned long) - 1;
+ s = (const unsigned long*)(~lowbits & ((VALUE)p + lowbits));
+ t = (const unsigned long*)(~lowbits & (VALUE)e);
+ for (; p<(const char *)s && 0<nth; p++) {
+ if (((*p)&0xC0) != 0x80) nth--;
+ }
+ while (s < t) {
+ unsigned long d = *s++;
+ d = ~d | (d<<1);
+ d &= NONASCII_MASK;
+ d >>= 7;
+ d += (d>>8);
+ d += (d>>16);
+#if NONASCII_MASK == 0x8080808080808080UL
+ d += (d>>32);
+#endif
+ nth -= (long)(d&0xF);
+ if (nth < 8) {
+ t = s;
+ break;
+ }
+ }
+ p = (char *)t;
+ }
+ if (0 < nth) {
+ while (p < e) {
+ if (((*p)&0xC0) != 0x80) {
+ nth--;
+ if (nth < 0)
+ break;
+ }
+ p++;
+ }
+ }
+ return (char *)p;
+}
+
+static int
+str_utf8_offset(const char *p, const char *e, int nth)
+{
+ const char *pp = str_utf8_nth(p, e, nth);
+ if (!pp) return e - p;
+ return pp - p;
+}
+#endif
+
static long
str_sublen(VALUE str, long pos, rb_encoding *enc)
{
@@ -1082,6 +1134,13 @@ rb_str_substr(VALUE str, long beg, long len)
if (len == 0) {
p = 0;
}
+#ifdef NONASCII_MASK
+ else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
+ enc == rb_utf8_encoding()) {
+ p = str_utf8_nth(s, e, beg);
+ len = str_utf8_offset(p, e, len);
+ }
+#endif
else if ((p = str_nth(s, e, beg, enc, singlebyte)) == e) {
len = 0;
}