summaryrefslogtreecommitdiff
path: root/re.c
diff options
context:
space:
mode:
authornaruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2008-03-17 19:04:29 +0000
committernaruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2008-03-17 19:04:29 +0000
commite58adeae0f384a562bbcb295b0a25026842625ca (patch)
tree161176e4eef5bfe5a88d0177728dcd8780899337 /re.c
parent89afc5431b10e20689bcaac33a9e81a12d1b2b95 (diff)
* re.c (rb_memsearch_ss): simple shift search.
* re.c (rb_memsearch_qs): quick search. * re.c (rb_memsearch_qs_utf8): quick search for UTF-8 string. * re.c (rb_memsearch_qs_utf8_hash): hash functions for above. * re.c (rb_memsearch): use above functions. * string.c (rb_str_index): give enc to rb_memsearch. * include/ruby/intern.h (rb_memsearch): move to encoding.h. * include/ruby/encoding.h (rb_memsearch): move from intern.h. * common.mk (PREP): add dependency. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@15792 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 're.c')
-rw-r--r--re.c156
1 files changed, 130 insertions, 26 deletions
diff --git a/re.c b/re.c
index 58f72fbf80..7ab7ad211b 100644
--- a/re.c
+++ b/re.c
@@ -95,41 +95,145 @@ rb_memcmp(const void *p1, const void *p2, long len)
return memcmp(p1, p2, len);
}
-long
-rb_memsearch(const void *x0, long m, const void *y0, long n)
-{
- const unsigned char *x = x0, *y = y0;
- const unsigned char *s, *e;
- long i;
- int d;
- unsigned long hx, hy;
+static inline long
+rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n)
+{
+ const unsigned char *x = xs, *xe = xs + m;
+ const unsigned char *y = ys, *ye = ys + n;
+#ifndef VALUE_MAX
+# if SIZEOF_VALUE == 8
+# define VALUE_MAX 0xFFFFFFFFFFFFFFFFULL
+# elif SIZEOF_VALUE == 4
+# define VALUE_MAX 0xFFFFFFFFUL
+# endif
+#endif
+ VALUE hx, hy, mask = VALUE_MAX >> ((SIZEOF_VALUE - m) * CHAR_BIT);
-#define KR_REHASH(a, b, h) (((h) << 1) - (((unsigned long)(a))<<d) + (b))
+ if (m > SIZEOF_VALUE)
+ rb_bug("!!too long pattern string!!");
- if (m > n) return -1;
- s = y; e = s + n - m;
+ /* Prepare hash value */
+ for (hx = *x++, hy = *y++; x < xe; ++x, ++y) {
+ hx <<= CHAR_BIT;
+ hy <<= CHAR_BIT;
+ hx |= *x;
+ hy |= *y;
+ }
+ /* Searching */
+ while (hx != hy) {
+ if (y == ye)
+ return -1;
+ hy <<= CHAR_BIT;
+ hy |= *y;
+ hy &= mask;
+ y++;
+ }
+ return y - ys - m;
+}
+
+static inline long
+rb_memsearch_qs(const unsigned char *xs, long m, const unsigned char *ys, long n)
+{
+ const unsigned char *x = xs, *xe = xs + m;
+ const unsigned char *y = ys, *ye = ys + n;
+ VALUE i, qstable[256];
/* Preprocessing */
- /* computes d = 2^(m-1) with
- the left-shift operator */
- d = sizeof(hx) * CHAR_BIT - 1;
- if (d > m) d = m;
+ for (i = 0; i < 256; ++i)
+ qstable[i] = m + 1;
+ for (; x < xe; ++x)
+ qstable[*x] = xe - x;
+ /* Searching */
+ for (; y < ye; y += *(qstable + y[m])) {
+ if (*xs == *y && memcmp(xs, y, m) == 0)
+ return y - ys;
+ }
+ return -1;
+}
+
+static inline unsigned int
+rb_memsearch_qs_utf8_hash(const unsigned char *x)
+{
+ register const unsigned int mix = 8353;
+ register unsigned int h = *x;
+ if (h < 0xC0) {
+ return h + 256;
+ }
+ else if (h < 0xE0) {
+ h *= mix;
+ h += x[1];
+ }
+ else if (h < 0xF0) {
+ h *= mix;
+ h += x[1];
+ h *= mix;
+ h += x[2];
+ }
+ else if (h < 0xF5) {
+ h *= mix;
+ h += x[1];
+ h *= mix;
+ h += x[2];
+ h *= mix;
+ h += x[3];
+ }
+ else {
+ return h + 256;
+ }
+ return (unsigned char)h;
+}
+
+static inline long
+rb_memsearch_qs_utf8(const unsigned char *xs, long m, const unsigned char *ys, long n)
+{
+ const unsigned char *x = xs, *xe = xs + m;
+ const unsigned char *y = ys, *ye = ys + n;
+ VALUE i, qstable[512];
- if (n == m) {
- return memcmp(x, s, m) == 0 ? 0 : -1;
+ /* Preprocessing */
+ for (i = 0; i < 512; ++i) {
+ qstable[i] = m + 1;
}
- /* Prepare hash value */
- for (hy = hx = i = 0; i < d; ++i) {
- hx = KR_REHASH(0, x[i], hx);
- hy = KR_REHASH(0, s[i], hy);
+ for (; x < xe; ++x) {
+ qstable[rb_memsearch_qs_utf8_hash(x)] = xe - x;
}
/* Searching */
- while (hx != hy || memcmp(x, s, m)) {
- if (s >= e) return -1;
- hy = KR_REHASH(*s, *(s+d), hy);
- s++;
+ for (; y < ye; y += qstable[rb_memsearch_qs_utf8_hash(y+m)]) {
+ if (*xs == *y && memcmp(xs, y, m) == 0)
+ return y - ys;
+ }
+ return -1;
+}
+
+long
+rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc)
+{
+ const unsigned char *x = x0, *y = y0;
+
+ if (m > n) return -1;
+ else if (m == n) {
+ return memcmp(x0, y0, m) == 0 ? 0 : -1;
+ }
+ else if (m < 1) {
+ return 0;
+ }
+ else if (m == 1) {
+ const unsigned char *ys = y, *ye = ys + n;
+ for (; y < ye; ++y) {
+ if (*x == *y)
+ return y - ys;
+ }
+ return -1;
+ }
+ else if (m <= SIZEOF_VALUE) {
+ return rb_memsearch_ss(x0, m, y0, n);
+ }
+ else if (enc == rb_utf8_encoding()){
+ return rb_memsearch_qs_utf8(x0, m, y0, n);
+ }
+ else {
+ return rb_memsearch_qs(x0, m, y0, n);
}
- return s-y;
}
#define REG_LITERAL FL_USER5