summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- ChangeLog 33
-rw-r--r-- common.mk 2
-rw-r--r-- encoding.c 50
-rw-r--r-- include/ruby/encoding.h 27
-rw-r--r-- re.c 41
-rw-r--r-- regenc.h 2
-rw-r--r-- regexec.c 21
-rw-r--r-- regparse.c 6
-rw-r--r-- string.c 2
9 files changed, 115 insertions, 69 deletions
diff --git a/ChangeLog b/ChangeLog
index fc406cd..397a4a5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,36 @@
+Mon May 19 17:23:55 2008 Yukihiro Matsumoto <matz@ruby-lang.org>
+
+ * regparse.c (PINC): use optimized enclen() instead of
+ ONIGENC_MBC_ENC_LEN().
+
+ * regparse.c (PFETCH): ditto.
+
+ * regparse.c (PFETCH): small optimization.
+
+ * regexec.c (slow_search): single byte encoding optimization.
+
+ * regenc.h (enclen): avoid calling function when encoding's
+ min_len == max_len.
+
+ * re.c (rb_reg_regsub): rb_enc_ascget() optimization for single
+ byte encoding.
+
+ * re.c (rb_reg_search): avoid allocating new re_registers if we
+ already have MatchData.
+
+ * re.c (match_init_copy): avoid unnecessary onig_region_free()
+ before onig_region_copy.
+
+ * encoding.c (rb_enc_get_index): remove implicit enc_capable check
+ each time.
+
+ * encoding.c (rb_enc_set_index): ditto.
+
+ * encoding.c (enc_compatible_p): small refactoring.
+
+ * include/ruby/encoding.h (rb_enc_dummy_p): inline
+ rb_enc_dummy_p() and export related code.
+
Mon May 19 14:32:03 2008 Koichi Sasada <ko1@atdot.net>
* version.h: fix strange change by version.h update tool.
diff --git a/common.mk b/common.mk
index 2a76c7b..8e62efa 100644
--- a/common.mk
+++ b/common.mk
@@ -615,7 +615,7 @@ cont.$(OBJEXT): {$(VPATH)}cont.c {$(VPATH)}ruby.h {$(VPATH)}config.h \
{$(VPATH)}eval_intern.h {$(VPATH)}util.h {$(VPATH)}dln.h
time.$(OBJEXT): {$(VPATH)}time.c {$(VPATH)}ruby.h {$(VPATH)}config.h \
{$(VPATH)}defines.h {$(VPATH)}missing.h {$(VPATH)}intern.h \
- {$(VPATH)}st.h
+ {$(VPATH)}st.h {$(VPATH)}encoding.h
util.$(OBJEXT): {$(VPATH)}util.c {$(VPATH)}ruby.h {$(VPATH)}config.h \
{$(VPATH)}defines.h {$(VPATH)}missing.h {$(VPATH)}intern.h \
{$(VPATH)}st.h {$(VPATH)}util.h
diff --git a/encoding.c b/encoding.c
index e243451..bb400e1 100644
--- a/encoding.c
+++ b/encoding.c
@@ -18,7 +18,7 @@
#endif
static ID id_encoding, id_base_encoding;
-static VALUE rb_cEncoding;
+VALUE rb_cEncoding;
struct rb_encoding_entry {
const char *name;
@@ -38,14 +38,6 @@ void rb_enc_init(void);
#define enc_autoload_p(enc) (!rb_enc_mbmaxlen(enc))
-#define ENC_UNINITIALIZED (&rb_cEncoding)
-#define enc_initialized_p(enc) ((enc)->auxiliary_data != &rb_cEncoding)
-#define ENC_FROM_ENCODING(enc) ((VALUE)(enc)->auxiliary_data)
-
-#define ENC_DUMMY_FLAG FL_USER2
-#define ENC_DUMMY_P(enc) (RBASIC(enc)->flags & ENC_DUMMY_FLAG)
-#define ENC_SET_DUMMY(enc) (RBASIC(enc)->flags |= ENC_DUMMY_FLAG)
-
static int load_encoding(const char *name);
static VALUE enc_base_encoding(VALUE self);
@@ -318,15 +310,6 @@ rb_encdb_dummy(const char *name)
return index;
}
-int
-rb_enc_dummy_p(rb_encoding *enc)
-{
- VALUE encoding;
- if (!enc_initialized_p(enc)) return Qfalse;
- encoding = rb_enc_from_encoding(enc);
- return ENC_DUMMY_P(encoding);
-}
-
/*
* call-seq:
* enc.dummy? => true or false
@@ -343,7 +326,7 @@ rb_enc_dummy_p(rb_encoding *enc)
static VALUE
enc_dummy_p(VALUE enc)
{
- return rb_enc_dummy_p(rb_to_encoding(enc)) ? Qtrue : Qfalse;
+ return ENC_DUMMY_P(enc) ? Qtrue : Qfalse;
}
static int
@@ -555,7 +538,7 @@ rb_id_encoding(void)
}
int
-rb_enc_internal_get_index(VALUE obj)
+rb_enc_get_index(VALUE obj)
{
int i;
@@ -570,7 +553,7 @@ rb_enc_internal_get_index(VALUE obj)
}
void
-rb_enc_internal_set_index(VALUE obj, int idx)
+rb_enc_set_index(VALUE obj, int idx)
{
if (idx < ENCODING_INLINE_MAX) {
ENCODING_SET_INLINED(obj, idx);
@@ -584,14 +567,14 @@ rb_enc_internal_set_index(VALUE obj, int idx)
void
rb_enc_associate_index(VALUE obj, int idx)
{
- enc_check_capable(obj);
- if (rb_enc_internal_get_index(obj) == idx)
+// enc_check_capable(obj);
+ if (rb_enc_get_index(obj) == idx)
return;
if (!ENC_CODERANGE_ASCIIONLY(obj) ||
!rb_enc_asciicompat(rb_enc_from_index(idx))) {
ENC_CODERANGE_CLEAR(obj);
}
- rb_enc_internal_set_index(obj, idx);
+ rb_enc_set_index(obj, idx);
}
void
@@ -600,13 +583,6 @@ rb_enc_associate(VALUE obj, rb_encoding *enc)
rb_enc_associate_index(obj, rb_enc_to_index(enc));
}
-int
-rb_enc_get_index(VALUE obj)
-{
- if (!enc_capable(obj)) return -1;
- return rb_enc_internal_get_index(obj);
-}
-
rb_encoding*
rb_enc_get(VALUE obj)
{
@@ -906,11 +882,13 @@ enc_find(VALUE klass, VALUE enc)
static VALUE
enc_compatible_p(VALUE klass, VALUE str1, VALUE str2)
{
- rb_encoding *enc = rb_enc_compatible(str1, str2);
- VALUE encoding = Qnil;
- if (!enc || !(encoding = rb_enc_from_encoding(enc)))
- encoding = Qnil;
- return encoding;
+ rb_encoding *enc;
+
+ if (!enc_capable(str1)) return Qnil;
+ if (!enc_capable(str2)) return Qnil;
+ enc = rb_enc_compatible(str1, str2);
+ if (!enc) return Qnil;
+ return rb_enc_from_encoding(enc);
}
/* :nodoc: */
diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h
index 2dd2f93..0a6b7c1 100644
--- a/include/ruby/encoding.h
+++ b/include/ruby/encoding.h
@@ -33,14 +33,14 @@
if (encoding_set_enc_index < ENCODING_INLINE_MAX) \
ENCODING_SET_INLINED(rb_encoding_set_obj, encoding_set_enc_index); \
else \
- rb_enc_internal_set_index(rb_encoding_set_obj, encoding_set_enc_index); \
+ rb_enc_set_index(rb_encoding_set_obj, encoding_set_enc_index); \
} while (0)
#define ENCODING_GET_INLINED(obj) ((RBASIC(obj)->flags & ENCODING_MASK)>>ENCODING_SHIFT)
#define ENCODING_GET(obj) \
(ENCODING_GET_INLINED(obj) != ENCODING_INLINE_MAX ? \
ENCODING_GET_INLINED(obj) : \
- rb_enc_internal_get_index(obj))
+ rb_enc_get_index(obj))
#define ENCODING_IS_ASCII8BIT(obj) (ENCODING_GET_INLINED(obj) == 0)
@@ -74,9 +74,9 @@ typedef OnigEncodingType rb_encoding;
int rb_enc_replicate(const char *, rb_encoding *);
int rb_define_dummy_encoding(const char *);
-int rb_enc_dummy_p(rb_encoding *);
#define rb_enc_to_index(enc) ((enc) ? ((enc)->ruby_encoding_index) : 0)
int rb_enc_get_index(VALUE obj);
+void rb_enc_set_index(VALUE obj, int encindex);
int rb_enc_find_index(const char *name);
int rb_to_encoding_index(VALUE);
rb_encoding* rb_to_encoding(VALUE);
@@ -86,8 +86,6 @@ rb_encoding* rb_enc_check(VALUE,VALUE);
void rb_enc_associate_index(VALUE, int);
void rb_enc_associate(VALUE, rb_encoding*);
void rb_enc_copy(VALUE dst, VALUE src);
-int rb_enc_internal_get_index(VALUE obj);
-void rb_enc_internal_set_index(VALUE obj, int encindex);
VALUE rb_enc_str_new(const char*, long, rb_encoding*);
VALUE rb_enc_reg_new(const char*, long, rb_encoding*, int);
@@ -154,7 +152,7 @@ int rb_enc_codelen(int code, rb_encoding *enc);
#define rb_enc_isspace(c,enc) ONIGENC_IS_CODE_SPACE(enc,c)
#define rb_enc_isdigit(c,enc) ONIGENC_IS_CODE_DIGIT(enc,c)
-#define rb_enc_asciicompat(enc) (!rb_enc_dummy_p(enc) && rb_enc_mbminlen(enc)==1)
+#define rb_enc_asciicompat(enc) (rb_enc_mbminlen(enc)==1 && !rb_enc_dummy_p(enc))
int rb_enc_casefold(char *to, const char *p, const char *e, rb_encoding *enc);
int rb_enc_toupper(int c, rb_encoding *enc);
@@ -178,4 +176,21 @@ void rb_enc_set_default_external(VALUE encoding);
VALUE rb_locale_charmap(VALUE klass);
long rb_memsearch(const void*,long,const void*,long,rb_encoding*);
+RUBY_EXTERN VALUE rb_cEncoding;
+
+#define ENC_UNINITIALIZED (&rb_cEncoding)
+#define enc_initialized_p(enc) ((enc)->auxiliary_data != &rb_cEncoding)
+#define ENC_FROM_ENCODING(enc) ((VALUE)(enc)->auxiliary_data)
+
+#define ENC_DUMMY_FLAG FL_USER2
+#define ENC_DUMMY_P(enc) (RBASIC(enc)->flags & ENC_DUMMY_FLAG)
+#define ENC_SET_DUMMY(enc) (RBASIC(enc)->flags |= ENC_DUMMY_FLAG)
+
+static inline int
+rb_enc_dummy_p(rb_encoding *enc)
+{
+ if (!enc_initialized_p(enc)) return Qfalse;
+ return ENC_DUMMY_P(ENC_FROM_ENCODING(enc));
+}
+
#endif /* RUBY_ENCODING_H */
diff --git a/re.c b/re.c
index 4d9a263..c5b47c4 100644
--- a/re.c
+++ b/re.c
@@ -881,9 +881,6 @@ match_init_copy(VALUE obj, VALUE orig)
RMATCH(obj)->regexp = RMATCH(orig)->regexp;
rm = RMATCH(obj)->rmatch;
- onig_region_free(&rm->regs, 0);
- rm->regs.allocated = 0;
-
onig_region_copy(&rm->regs, RMATCH_REGS(orig));
if (!RMATCH(orig)->rmatch->char_offset_updated) {
@@ -1265,7 +1262,7 @@ rb_reg_search(VALUE re, VALUE str, int pos, int reverse)
{
int result;
VALUE match;
- struct re_registers regs;
+ struct re_registers *regs, regi;
char *range = RSTRING_PTR(str);
regex_t *reg0 = RREGEXP(re)->ptr, *reg;
int busy = FL_TEST(re, REG_BUSY);
@@ -1277,17 +1274,29 @@ rb_reg_search(VALUE re, VALUE str, int pos, int reverse)
reg = rb_reg_prepare_re(re, str);
+ match = rb_backref_get();
+ if (!NIL_P(match)) {
+ if (FL_TEST(match, MATCH_BUSY)) {
+ match = Qnil;
+ }
+ else {
+ regs = RMATCH_REGS(match);
+ }
+ }
+ if (NIL_P(match)) {
+ regs = &regi;
+ MEMZERO(regs, struct re_registers, 1);
+ }
FL_SET(re, REG_BUSY);
if (!reverse) {
range += RSTRING_LEN(str);
}
- MEMZERO(&regs, struct re_registers, 1);
result = onig_search(reg,
(UChar*)(RSTRING_PTR(str)),
((UChar*)(RSTRING_PTR(str)) + RSTRING_LEN(str)),
((UChar*)(RSTRING_PTR(str)) + pos),
((UChar*)range),
- &regs, ONIG_OPTION_NONE);
+ regs, ONIG_OPTION_NONE);
if (RREGEXP(re)->ptr != reg) {
if (busy) {
@@ -1300,7 +1309,8 @@ rb_reg_search(VALUE re, VALUE str, int pos, int reverse)
}
if (!busy) FL_UNSET(re, REG_BUSY);
if (result < 0) {
- onig_region_free(&regs, 0);
+ if (regs == &regi)
+ onig_region_free(regs, 0);
if (result == ONIG_MISMATCH) {
rb_backref_set(Qnil);
return result;
@@ -1312,9 +1322,10 @@ rb_reg_search(VALUE re, VALUE str, int pos, int reverse)
}
}
- match = rb_backref_get();
- if (NIL_P(match) || FL_TEST(match, MATCH_BUSY)) {
+ if (NIL_P(match)) {
match = match_alloc(rb_cMatch);
+ onig_region_copy(RMATCH_REGS(match), regs);
+ onig_region_free(regs, 0);
}
else {
if (rb_safe_level() >= 3)
@@ -1323,8 +1334,6 @@ rb_reg_search(VALUE re, VALUE str, int pos, int reverse)
FL_UNSET(match, FL_TAINT);
}
- onig_region_copy(RMATCH_REGS(match), &regs);
- onig_region_free(&regs, 0);
RMATCH(match)->str = rb_str_new4(str);
RMATCH(match)->regexp = re;
RMATCH(match)->rmatch->char_offset_updated = 0;
@@ -3088,12 +3097,14 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
int no, clen;
rb_encoding *str_enc = rb_enc_get(str);
rb_encoding *src_enc = rb_enc_get(src);
+ int acompat = rb_enc_asciicompat(str_enc);
+#define ASCGET(s,e,cl) (acompat ? (*cl=1,s[0]) : rb_enc_ascget(s, e, cl, str_enc))
p = s = RSTRING_PTR(str);
e = s + RSTRING_LEN(str);
while (s < e) {
- int c = rb_enc_ascget(s, e, &clen, str_enc);
+ int c = ASCGET(s, e, &clen);
char *ss;
if (c == -1) {
@@ -3110,7 +3121,7 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
}
rb_enc_str_buf_cat(val, p, ss-p, str_enc);
- c = rb_enc_ascget(s, e, &clen, str_enc);
+ c = ASCGET(s, e, &clen);
if (c == -1) {
s += mbclen(s, e, str_enc);
rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
@@ -3132,12 +3143,12 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
break;
case 'k':
- if (s < e && rb_enc_ascget(s, e, &clen, str_enc) == '<') {
+ if (s < e && ASCGET(s, e, &clen) == '<') {
char *name, *name_end;
name_end = name = s + clen;
while (name_end < e) {
- c = rb_enc_ascget(name_end, e, &clen, str_enc);
+ c = ASCGET(name_end, e, &clen);
if (c == '>') break;
name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen;
}
diff --git a/regenc.h b/regenc.h
index 09c6da4..317175f 100644
--- a/regenc.h
+++ b/regenc.h
@@ -70,7 +70,7 @@ typedef struct {
#define ONIG_CHECK_NULL_RETURN(p) if (ONIG_IS_NULL(p)) return NULL
#define ONIG_CHECK_NULL_RETURN_VAL(p,val) if (ONIG_IS_NULL(p)) return (val)
-#define enclen(enc,p,e) ONIGENC_MBC_ENC_LEN(enc,p,e)
+#define enclen(enc,p,e) ((enc->max_enc_len == enc->min_enc_len) ? enc->min_enc_len : ONIGENC_MBC_ENC_LEN(enc,p,e))
/* character types bit flag */
#define BIT_CTYPE_NEWLINE (1<< ONIGENC_CTYPE_NEWLINE)
diff --git a/regexec.c b/regexec.c
index b9947d8..a2d6993 100644
--- a/regexec.c
+++ b/regexec.c
@@ -2758,16 +2758,25 @@ slow_search(OnigEncoding enc, UChar* target, UChar* target_end,
s = (UChar* )text;
+ if (enc->max_enc_len == enc->min_enc_len) {
+ int n = enc->max_enc_len;
+
+ while (s < end) {
+ if (*s == *target) {
+ p = s + 1;
+ t = target + 1;
+ if (memcmp(t, p, target_end - t) == 0)
+ return s;
+ }
+ s += n;
+ }
+ return (UChar*)NULL;
+ }
while (s < end) {
if (*s == *target) {
p = s + 1;
t = target + 1;
- while (t < target_end) {
- if (*t != *p++)
- break;
- t++;
- }
- if (t == target_end)
+ if (memcmp(t, p, target_end - t) == 0)
return s;
}
s += enclen(enc, s, end);
diff --git a/regparse.c b/regparse.c
index 1b2a083..8d74efa 100644
--- a/regparse.c
+++ b/regparse.c
@@ -253,12 +253,12 @@ strdup_with_null(OnigEncoding enc, UChar* s, UChar* end)
#define PUNFETCH p = pfetch_prev
#define PINC do { \
pfetch_prev = p; \
- p += ONIGENC_MBC_ENC_LEN(enc, p, end); \
+ p += enclen(enc, p, end); \
} while (0)
#define PFETCH(c) do { \
- c = ONIGENC_MBC_TO_CODE(enc, p, end); \
+ c = ((enc->max_enc_len == 1) ? *p : ONIGENC_MBC_TO_CODE(enc, p, end)); \
pfetch_prev = p; \
- p += ONIGENC_MBC_ENC_LEN(enc, p, end); \
+ p += enclen(enc, p, end); \
} while (0)
#define PPEEK (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
diff --git a/string.c b/string.c
index 67f1e33..a80f605 100644
--- a/string.c
+++ b/string.c
@@ -256,7 +256,7 @@ rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc
static inline void
str_enc_copy(VALUE str1, VALUE str2)
{
- rb_enc_internal_set_index(str1, ENCODING_GET(str2));
+ rb_enc_set_index(str1, ENCODING_GET(str2));
}
static void