diff options
author | naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2008-03-17 19:04:29 +0000 |
---|---|---|
committer | naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2008-03-17 19:04:29 +0000 |
commit | e58adeae0f384a562bbcb295b0a25026842625ca (patch) | |
tree | 161176e4eef5bfe5a88d0177728dcd8780899337 /re.c | |
parent | 89afc5431b10e20689bcaac33a9e81a12d1b2b95 (diff) |
* re.c (rb_memsearch_ss): simple shift search.
* re.c (rb_memsearch_qs): quick search.
* re.c (rb_memsearch_qs_utf8): quick search for UTF-8 string.
* re.c (rb_memsearch_qs_utf8_hash): hash functions for above.
* re.c (rb_memsearch): use above functions.
* string.c (rb_str_index): give enc to rb_memsearch.
* include/ruby/intern.h (rb_memsearch): move to encoding.h.
* include/ruby/encoding.h (rb_memsearch): move from intern.h.
* common.mk (PREP): add dependency.
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@15792 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 're.c')
-rw-r--r-- | re.c | 156 |
1 files changed, 130 insertions, 26 deletions
@@ -95,41 +95,145 @@ rb_memcmp(const void *p1, const void *p2, long len) return memcmp(p1, p2, len); } -long -rb_memsearch(const void *x0, long m, const void *y0, long n) -{ - const unsigned char *x = x0, *y = y0; - const unsigned char *s, *e; - long i; - int d; - unsigned long hx, hy; +static inline long +rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n) +{ + const unsigned char *x = xs, *xe = xs + m; + const unsigned char *y = ys, *ye = ys + n; +#ifndef VALUE_MAX +# if SIZEOF_VALUE == 8 +# define VALUE_MAX 0xFFFFFFFFFFFFFFFFULL +# elif SIZEOF_VALUE == 4 +# define VALUE_MAX 0xFFFFFFFFUL +# endif +#endif + VALUE hx, hy, mask = VALUE_MAX >> ((SIZEOF_VALUE - m) * CHAR_BIT); -#define KR_REHASH(a, b, h) (((h) << 1) - (((unsigned long)(a))<<d) + (b)) + if (m > SIZEOF_VALUE) + rb_bug("!!too long pattern string!!"); - if (m > n) return -1; - s = y; e = s + n - m; + /* Prepare hash value */ + for (hx = *x++, hy = *y++; x < xe; ++x, ++y) { + hx <<= CHAR_BIT; + hy <<= CHAR_BIT; + hx |= *x; + hy |= *y; + } + /* Searching */ + while (hx != hy) { + if (y == ye) + return -1; + hy <<= CHAR_BIT; + hy |= *y; + hy &= mask; + y++; + } + return y - ys - m; +} + +static inline long +rb_memsearch_qs(const unsigned char *xs, long m, const unsigned char *ys, long n) +{ + const unsigned char *x = xs, *xe = xs + m; + const unsigned char *y = ys, *ye = ys + n; + VALUE i, qstable[256]; /* Preprocessing */ - /* computes d = 2^(m-1) with - the left-shift operator */ - d = sizeof(hx) * CHAR_BIT - 1; - if (d > m) d = m; + for (i = 0; i < 256; ++i) + qstable[i] = m + 1; + for (; x < xe; ++x) + qstable[*x] = xe - x; + /* Searching */ + for (; y < ye; y += *(qstable + y[m])) { + if (*xs == *y && memcmp(xs, y, m) == 0) + return y - ys; + } + return -1; +} + +static inline unsigned int +rb_memsearch_qs_utf8_hash(const unsigned char *x) +{ + register const unsigned int mix = 8353; + register unsigned int h = *x; + if (h < 0xC0) { + return h + 256; + } + else if (h < 0xE0) { + h *= mix; + h += x[1]; + } + else if (h < 0xF0) { + h *= mix; + h += x[1]; + h *= mix; + h += x[2]; + } + else if (h < 0xF5) { + h *= mix; + h += x[1]; + h *= mix; + h += x[2]; + h *= mix; + h += x[3]; + } + else { + return h + 256; + } + return (unsigned char)h; +} + +static inline long +rb_memsearch_qs_utf8(const unsigned char *xs, long m, const unsigned char *ys, long n) +{ + const unsigned char *x = xs, *xe = xs + m; + const unsigned char *y = ys, *ye = ys + n; + VALUE i, qstable[512]; - if (n == m) { - return memcmp(x, s, m) == 0 ? 0 : -1; + /* Preprocessing */ + for (i = 0; i < 512; ++i) { + qstable[i] = m + 1; } - /* Prepare hash value */ - for (hy = hx = i = 0; i < d; ++i) { - hx = KR_REHASH(0, x[i], hx); - hy = KR_REHASH(0, s[i], hy); + for (; x < xe; ++x) { + qstable[rb_memsearch_qs_utf8_hash(x)] = xe - x; } /* Searching */ - while (hx != hy || memcmp(x, s, m)) { - if (s >= e) return -1; - hy = KR_REHASH(*s, *(s+d), hy); - s++; + for (; y < ye; y += qstable[rb_memsearch_qs_utf8_hash(y+m)]) { + if (*xs == *y && memcmp(xs, y, m) == 0) + return y - ys; + } + return -1; +} + +long +rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc) +{ + const unsigned char *x = x0, *y = y0; + + if (m > n) return -1; + else if (m == n) { + return memcmp(x0, y0, m) == 0 ? 0 : -1; + } + else if (m < 1) { + return 0; + } + else if (m == 1) { + const unsigned char *ys = y, *ye = ys + n; + for (; y < ye; ++y) { + if (*x == *y) + return y - ys; + } + return -1; + } + else if (m <= SIZEOF_VALUE) { + return rb_memsearch_ss(x0, m, y0, n); + } + else if (enc == rb_utf8_encoding()){ + return rb_memsearch_qs_utf8(x0, m, y0, n); + } + else { + return rb_memsearch_qs(x0, m, y0, n); } - return s-y; } #define REG_LITERAL FL_USER5 |