From 44cd8e457b808173147c499408ffc5e908f236dc Mon Sep 17 00:00:00 2001 From: matz Date: Mon, 19 May 2008 08:25:03 +0000 Subject: * regparse.c (PINC): use optimized enclen() instead of ONIGENC_MBC_ENC_LEN(). * regparse.c (PFETCH): ditto. * regparse.c (PFETCH): small optimization. * regexec.c (slow_search): single byte encoding optimization. * regenc.h (enclen): avoid calling function when encoding's min_len == max_len. * re.c (rb_reg_regsub): rb_enc_ascget() optimization for single byte encoding. * re.c (rb_reg_search): avoid allocating new re_registers if we already have MatchData. * re.c (match_init_copy): avoid unnecessary onig_region_free() before onig_region_copy. * encoding.c (rb_enc_get_index): remove implicit enc_capable check each time. * encoding.c (rb_enc_set_index): ditto. * encoding.c (enc_compatible_p): small refactoring. * include/ruby/encoding.h (rb_enc_dummy_p): inline rb_enc_dummy_p() and export related code. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@16477 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 33 ++++++++++++++++++++++++++++++++ common.mk | 2 +- encoding.c | 50 ++++++++++++++----------------------------------- include/ruby/encoding.h | 27 ++++++++++++++++++++------ re.c | 41 +++++++++++++++++++++++++--------------- regenc.h | 2 +- regexec.c | 21 +++++++++++++++------ regparse.c | 6 +++--- string.c | 2 +- 9 files changed, 115 insertions(+), 69 deletions(-) diff --git a/ChangeLog b/ChangeLog index fc406cd112..397a4a59b0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,36 @@ +Mon May 19 17:23:55 2008 Yukihiro Matsumoto + + * regparse.c (PINC): use optimized enclen() instead of + ONIGENC_MBC_ENC_LEN(). + + * regparse.c (PFETCH): ditto. + + * regparse.c (PFETCH): small optimization. + + * regexec.c (slow_search): single byte encoding optimization. + + * regenc.h (enclen): avoid calling function when encoding's + min_len == max_len. + + * re.c (rb_reg_regsub): rb_enc_ascget() optimization for single + byte encoding. 
+ + * re.c (rb_reg_search): avoid allocating new re_registers if we + already have MatchData. + + * re.c (match_init_copy): avoid unnecessary onig_region_free() + before onig_region_copy. + + * encoding.c (rb_enc_get_index): remove implicit enc_capable check + each time. + + * encoding.c (rb_enc_set_index): ditto. + + * encoding.c (enc_compatible_p): small refactoring. + + * include/ruby/encoding.h (rb_enc_dummy_p): inline + rb_enc_dummy_p() and export related code. + Mon May 19 14:32:03 2008 Koichi Sasada * version.h: fix strange change by version.h update tool. diff --git a/common.mk b/common.mk index 2a76c7bfa2..8e62efae2e 100644 --- a/common.mk +++ b/common.mk @@ -615,7 +615,7 @@ cont.$(OBJEXT): {$(VPATH)}cont.c {$(VPATH)}ruby.h {$(VPATH)}config.h \ {$(VPATH)}eval_intern.h {$(VPATH)}util.h {$(VPATH)}dln.h time.$(OBJEXT): {$(VPATH)}time.c {$(VPATH)}ruby.h {$(VPATH)}config.h \ {$(VPATH)}defines.h {$(VPATH)}missing.h {$(VPATH)}intern.h \ - {$(VPATH)}st.h + {$(VPATH)}st.h {$(VPATH)}encoding.h util.$(OBJEXT): {$(VPATH)}util.c {$(VPATH)}ruby.h {$(VPATH)}config.h \ {$(VPATH)}defines.h {$(VPATH)}missing.h {$(VPATH)}intern.h \ {$(VPATH)}st.h {$(VPATH)}util.h diff --git a/encoding.c b/encoding.c index e243451569..bb400e1ef0 100644 --- a/encoding.c +++ b/encoding.c @@ -18,7 +18,7 @@ #endif static ID id_encoding, id_base_encoding; -static VALUE rb_cEncoding; +VALUE rb_cEncoding; struct rb_encoding_entry { const char *name; @@ -38,14 +38,6 @@ void rb_enc_init(void); #define enc_autoload_p(enc) (!rb_enc_mbmaxlen(enc)) -#define ENC_UNINITIALIZED (&rb_cEncoding) -#define enc_initialized_p(enc) ((enc)->auxiliary_data != &rb_cEncoding) -#define ENC_FROM_ENCODING(enc) ((VALUE)(enc)->auxiliary_data) - -#define ENC_DUMMY_FLAG FL_USER2 -#define ENC_DUMMY_P(enc) (RBASIC(enc)->flags & ENC_DUMMY_FLAG) -#define ENC_SET_DUMMY(enc) (RBASIC(enc)->flags |= ENC_DUMMY_FLAG) - static int load_encoding(const char *name); static VALUE enc_base_encoding(VALUE self); @@ -318,15 +310,6 @@ 
rb_encdb_dummy(const char *name) return index; } -int -rb_enc_dummy_p(rb_encoding *enc) -{ - VALUE encoding; - if (!enc_initialized_p(enc)) return Qfalse; - encoding = rb_enc_from_encoding(enc); - return ENC_DUMMY_P(encoding); -} - /* * call-seq: * enc.dummy? => true or false @@ -343,7 +326,7 @@ rb_enc_dummy_p(rb_encoding *enc) static VALUE enc_dummy_p(VALUE enc) { - return rb_enc_dummy_p(rb_to_encoding(enc)) ? Qtrue : Qfalse; + return ENC_DUMMY_P(enc) ? Qtrue : Qfalse; } static int @@ -555,7 +538,7 @@ rb_id_encoding(void) } int -rb_enc_internal_get_index(VALUE obj) +rb_enc_get_index(VALUE obj) { int i; @@ -570,7 +553,7 @@ rb_enc_internal_get_index(VALUE obj) } void -rb_enc_internal_set_index(VALUE obj, int idx) +rb_enc_set_index(VALUE obj, int idx) { if (idx < ENCODING_INLINE_MAX) { ENCODING_SET_INLINED(obj, idx); @@ -584,14 +567,14 @@ rb_enc_internal_set_index(VALUE obj, int idx) void rb_enc_associate_index(VALUE obj, int idx) { - enc_check_capable(obj); - if (rb_enc_internal_get_index(obj) == idx) +// enc_check_capable(obj); + if (rb_enc_get_index(obj) == idx) return; if (!ENC_CODERANGE_ASCIIONLY(obj) || !rb_enc_asciicompat(rb_enc_from_index(idx))) { ENC_CODERANGE_CLEAR(obj); } - rb_enc_internal_set_index(obj, idx); + rb_enc_set_index(obj, idx); } void @@ -600,13 +583,6 @@ rb_enc_associate(VALUE obj, rb_encoding *enc) rb_enc_associate_index(obj, rb_enc_to_index(enc)); } -int -rb_enc_get_index(VALUE obj) -{ - if (!enc_capable(obj)) return -1; - return rb_enc_internal_get_index(obj); -} - rb_encoding* rb_enc_get(VALUE obj) { @@ -906,11 +882,13 @@ enc_find(VALUE klass, VALUE enc) static VALUE enc_compatible_p(VALUE klass, VALUE str1, VALUE str2) { - rb_encoding *enc = rb_enc_compatible(str1, str2); - VALUE encoding = Qnil; - if (!enc || !(encoding = rb_enc_from_encoding(enc))) - encoding = Qnil; - return encoding; + rb_encoding *enc; + + if (!enc_capable(str1)) return Qnil; + if (!enc_capable(str2)) return Qnil; + enc = rb_enc_compatible(str1, str2); + if (!enc) 
return Qnil; + return rb_enc_from_encoding(enc); } /* :nodoc: */ diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h index 2dd2f93b18..0a6b7c18e2 100644 --- a/include/ruby/encoding.h +++ b/include/ruby/encoding.h @@ -33,14 +33,14 @@ if (encoding_set_enc_index < ENCODING_INLINE_MAX) \ ENCODING_SET_INLINED(rb_encoding_set_obj, encoding_set_enc_index); \ else \ - rb_enc_internal_set_index(rb_encoding_set_obj, encoding_set_enc_index); \ + rb_enc_set_index(rb_encoding_set_obj, encoding_set_enc_index); \ } while (0) #define ENCODING_GET_INLINED(obj) ((RBASIC(obj)->flags & ENCODING_MASK)>>ENCODING_SHIFT) #define ENCODING_GET(obj) \ (ENCODING_GET_INLINED(obj) != ENCODING_INLINE_MAX ? \ ENCODING_GET_INLINED(obj) : \ - rb_enc_internal_get_index(obj)) + rb_enc_get_index(obj)) #define ENCODING_IS_ASCII8BIT(obj) (ENCODING_GET_INLINED(obj) == 0) @@ -74,9 +74,9 @@ typedef OnigEncodingType rb_encoding; int rb_enc_replicate(const char *, rb_encoding *); int rb_define_dummy_encoding(const char *); -int rb_enc_dummy_p(rb_encoding *); #define rb_enc_to_index(enc) ((enc) ? 
((enc)->ruby_encoding_index) : 0) int rb_enc_get_index(VALUE obj); +void rb_enc_set_index(VALUE obj, int encindex); int rb_enc_find_index(const char *name); int rb_to_encoding_index(VALUE); rb_encoding* rb_to_encoding(VALUE); @@ -86,8 +86,6 @@ rb_encoding* rb_enc_check(VALUE,VALUE); void rb_enc_associate_index(VALUE, int); void rb_enc_associate(VALUE, rb_encoding*); void rb_enc_copy(VALUE dst, VALUE src); -int rb_enc_internal_get_index(VALUE obj); -void rb_enc_internal_set_index(VALUE obj, int encindex); VALUE rb_enc_str_new(const char*, long, rb_encoding*); VALUE rb_enc_reg_new(const char*, long, rb_encoding*, int); @@ -154,7 +152,7 @@ int rb_enc_codelen(int code, rb_encoding *enc); #define rb_enc_isspace(c,enc) ONIGENC_IS_CODE_SPACE(enc,c) #define rb_enc_isdigit(c,enc) ONIGENC_IS_CODE_DIGIT(enc,c) -#define rb_enc_asciicompat(enc) (!rb_enc_dummy_p(enc) && rb_enc_mbminlen(enc)==1) +#define rb_enc_asciicompat(enc) (rb_enc_mbminlen(enc)==1 && !rb_enc_dummy_p(enc)) int rb_enc_casefold(char *to, const char *p, const char *e, rb_encoding *enc); int rb_enc_toupper(int c, rb_encoding *enc); @@ -178,4 +176,21 @@ void rb_enc_set_default_external(VALUE encoding); VALUE rb_locale_charmap(VALUE klass); long rb_memsearch(const void*,long,const void*,long,rb_encoding*); +RUBY_EXTERN VALUE rb_cEncoding; + +#define ENC_UNINITIALIZED (&rb_cEncoding) +#define enc_initialized_p(enc) ((enc)->auxiliary_data != &rb_cEncoding) +#define ENC_FROM_ENCODING(enc) ((VALUE)(enc)->auxiliary_data) + +#define ENC_DUMMY_FLAG FL_USER2 +#define ENC_DUMMY_P(enc) (RBASIC(enc)->flags & ENC_DUMMY_FLAG) +#define ENC_SET_DUMMY(enc) (RBASIC(enc)->flags |= ENC_DUMMY_FLAG) + +static inline int +rb_enc_dummy_p(rb_encoding *enc) +{ + if (!enc_initialized_p(enc)) return Qfalse; + return ENC_DUMMY_P(ENC_FROM_ENCODING(enc)); +} + #endif /* RUBY_ENCODING_H */ diff --git a/re.c b/re.c index 4d9a26363f..c5b47c46b7 100644 --- a/re.c +++ b/re.c @@ -881,9 +881,6 @@ match_init_copy(VALUE obj, VALUE orig) 
RMATCH(obj)->regexp = RMATCH(orig)->regexp; rm = RMATCH(obj)->rmatch; - onig_region_free(&rm->regs, 0); - rm->regs.allocated = 0; - onig_region_copy(&rm->regs, RMATCH_REGS(orig)); if (!RMATCH(orig)->rmatch->char_offset_updated) { @@ -1265,7 +1262,7 @@ rb_reg_search(VALUE re, VALUE str, int pos, int reverse) { int result; VALUE match; - struct re_registers regs; + struct re_registers *regs, regi; char *range = RSTRING_PTR(str); regex_t *reg0 = RREGEXP(re)->ptr, *reg; int busy = FL_TEST(re, REG_BUSY); @@ -1277,17 +1274,29 @@ rb_reg_search(VALUE re, VALUE str, int pos, int reverse) reg = rb_reg_prepare_re(re, str); + match = rb_backref_get(); + if (!NIL_P(match)) { + if (FL_TEST(match, MATCH_BUSY)) { + match = Qnil; + } + else { + regs = RMATCH_REGS(match); + } + } + if (NIL_P(match)) { + regs = &regi; + MEMZERO(regs, struct re_registers, 1); + } FL_SET(re, REG_BUSY); if (!reverse) { range += RSTRING_LEN(str); } - MEMZERO(&regs, struct re_registers, 1); result = onig_search(reg, (UChar*)(RSTRING_PTR(str)), ((UChar*)(RSTRING_PTR(str)) + RSTRING_LEN(str)), ((UChar*)(RSTRING_PTR(str)) + pos), ((UChar*)range), - &regs, ONIG_OPTION_NONE); + regs, ONIG_OPTION_NONE); if (RREGEXP(re)->ptr != reg) { if (busy) { @@ -1300,7 +1309,8 @@ rb_reg_search(VALUE re, VALUE str, int pos, int reverse) } if (!busy) FL_UNSET(re, REG_BUSY); if (result < 0) { - onig_region_free(&regs, 0); + if (regs == &regi) + onig_region_free(regs, 0); if (result == ONIG_MISMATCH) { rb_backref_set(Qnil); return result; @@ -1312,9 +1322,10 @@ rb_reg_search(VALUE re, VALUE str, int pos, int reverse) } } - match = rb_backref_get(); - if (NIL_P(match) || FL_TEST(match, MATCH_BUSY)) { + if (NIL_P(match)) { match = match_alloc(rb_cMatch); + onig_region_copy(RMATCH_REGS(match), regs); + onig_region_free(regs, 0); } else { if (rb_safe_level() >= 3) @@ -1323,8 +1334,6 @@ rb_reg_search(VALUE re, VALUE str, int pos, int reverse) FL_UNSET(match, FL_TAINT); } - onig_region_copy(RMATCH_REGS(match), &regs); - onig_region_free(&regs, 0); 
RMATCH(match)->str = rb_str_new4(str); RMATCH(match)->regexp = re; RMATCH(match)->rmatch->char_offset_updated = 0; @@ -3088,12 +3097,14 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp) int no, clen; rb_encoding *str_enc = rb_enc_get(str); rb_encoding *src_enc = rb_enc_get(src); + int acompat = rb_enc_asciicompat(str_enc); +#define ASCGET(s,e,cl) (acompat ? (*cl=1,s[0]) : rb_enc_ascget(s, e, cl, str_enc)) p = s = RSTRING_PTR(str); e = s + RSTRING_LEN(str); while (s < e) { - int c = rb_enc_ascget(s, e, &clen, str_enc); + int c = ASCGET(s, e, &clen); char *ss; if (c == -1) { @@ -3110,7 +3121,7 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp) } rb_enc_str_buf_cat(val, p, ss-p, str_enc); - c = rb_enc_ascget(s, e, &clen, str_enc); + c = ASCGET(s, e, &clen); if (c == -1) { s += mbclen(s, e, str_enc); rb_enc_str_buf_cat(val, ss, s-ss, str_enc); @@ -3132,12 +3143,12 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp) break; case 'k': - if (s < e && rb_enc_ascget(s, e, &clen, str_enc) == '<') { + if (s < e && ASCGET(s, e, &clen) == '<') { char *name, *name_end; name_end = name = s + clen; while (name_end < e) { - c = rb_enc_ascget(name_end, e, &clen, str_enc); + c = ASCGET(name_end, e, &clen); if (c == '>') break; name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen; } diff --git a/regenc.h b/regenc.h index 09c6da4783..317175f5df 100644 --- a/regenc.h +++ b/regenc.h @@ -70,7 +70,7 @@ typedef struct { #define ONIG_CHECK_NULL_RETURN(p) if (ONIG_IS_NULL(p)) return NULL #define ONIG_CHECK_NULL_RETURN_VAL(p,val) if (ONIG_IS_NULL(p)) return (val) -#define enclen(enc,p,e) ONIGENC_MBC_ENC_LEN(enc,p,e) +#define enclen(enc,p,e) ((enc->max_enc_len == enc->min_enc_len) ? 
enc->min_enc_len : ONIGENC_MBC_ENC_LEN(enc,p,e)) /* character types bit flag */ #define BIT_CTYPE_NEWLINE (1<< ONIGENC_CTYPE_NEWLINE) diff --git a/regexec.c b/regexec.c index b9947d8f93..a2d6993d08 100644 --- a/regexec.c +++ b/regexec.c @@ -2758,16 +2758,25 @@ slow_search(OnigEncoding enc, UChar* target, UChar* target_end, s = (UChar* )text; + if (enc->max_enc_len == enc->min_enc_len) { + int n = enc->max_enc_len; + + while (s < end) { + if (*s == *target) { + p = s + 1; + t = target + 1; + if (memcmp(t, p, target_end - t) == 0) + return s; + } + s += n; + } + return (UChar*)NULL; + } while (s < end) { if (*s == *target) { p = s + 1; t = target + 1; - while (t < target_end) { - if (*t != *p++) - break; - t++; - } - if (t == target_end) + if (memcmp(t, p, target_end - t) == 0) return s; } s += enclen(enc, s, end); diff --git a/regparse.c b/regparse.c index 1b2a0830ae..8d74efafea 100644 --- a/regparse.c +++ b/regparse.c @@ -253,12 +253,12 @@ strdup_with_null(OnigEncoding enc, UChar* s, UChar* end) #define PUNFETCH p = pfetch_prev #define PINC do { \ pfetch_prev = p; \ - p += ONIGENC_MBC_ENC_LEN(enc, p, end); \ + p += enclen(enc, p, end); \ } while (0) #define PFETCH(c) do { \ - c = ONIGENC_MBC_TO_CODE(enc, p, end); \ + c = ((enc->max_enc_len == 1) ? *p : ONIGENC_MBC_TO_CODE(enc, p, end)); \ pfetch_prev = p; \ - p += ONIGENC_MBC_ENC_LEN(enc, p, end); \ + p += enclen(enc, p, end); \ } while (0) #define PPEEK (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE) diff --git a/string.c b/string.c index 67f1e33b17..a80f60555f 100644 --- a/string.c +++ b/string.c @@ -256,7 +256,7 @@ rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc static inline void str_enc_copy(VALUE str1, VALUE str2) { - rb_enc_internal_set_index(str1, ENCODING_GET(str2)); + rb_enc_set_index(str1, ENCODING_GET(str2)); } static void -- cgit v1.2.3