From 71c5e485989cfead51f62f36d0d694cab65f855f Mon Sep 17 00:00:00 2001 From: akr Date: Sat, 16 Feb 2008 20:08:35 +0000 Subject: * include/ruby/re.h (struct rmatch_offset): new struct for character offsets. (struct rmatch): new struct. (struct RMatch): reference struct rmatch. (RMATCH_REGS): new macro. * re.c (match_alloc): initialize struct rmatch. (pair_byte_cmp): new function. (update_char_offset): update character offsets. (match_init_copy): copy regexp and character offsets. (match_sublen): removed. (match_offset): use update_char_offset. (match_begin): ditto. (match_end): ditto. (rb_reg_search): make character offset updated flag false. (match_size): use RMATCH_REGS. (match_backref_number): ditto. (rb_reg_nth_defined): ditto. (rb_reg_nth_match): ditto. (rb_reg_match_pre): ditto. (rb_reg_match_post): ditto. (rb_reg_match_last): ditto. (match_array): ditto. (match_aref): ditto. (match_values_at): ditto. (match_inspect): ditto. * string.c (rb_str_subpat_set): use RMATCH_REGS. (rb_str_sub_bang): ditto. (str_gsub): ditto. (rb_str_split_m): ditto. (scan_once): ditto. * gc.c (obj_free): free character offsets. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@15513 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 37 ++++++++++ gc.c | 9 ++- include/ruby/re.h | 16 ++++- re.c | 202 ++++++++++++++++++++++++++++++++++++++++++------------ string.c | 20 +++--- 5 files changed, 226 insertions(+), 58 deletions(-) diff --git a/ChangeLog b/ChangeLog index 7a2fe8e652..f00aca1f4b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,40 @@ +Sun Feb 17 03:37:01 2008 Tanaka Akira + + * include/ruby/re.h (struct rmatch_offset): new struct for character + offsets. + (struct rmatch): new struct. + (struct RMatch): reference struct rmatch. + (RMATCH_REGS): new macro. + + * re.c (match_alloc): initialize struct rmatch. + (pair_byte_cmp): new function. + (update_char_offset): update character offsets. + (match_init_copy): copy regexp and character offsets. + (match_sublen): removed. + (match_offset): use update_char_offset. + (match_begin): ditto. + (match_end): ditto. + (rb_reg_search): make character offset updated flag false. + (match_size): use RMATCH_REGS. + (match_backref_number): ditto. + (rb_reg_nth_defined): ditto. + (rb_reg_nth_match): ditto. + (rb_reg_match_pre): ditto. + (rb_reg_match_post): ditto. + (rb_reg_match_last): ditto. + (match_array): ditto. + (match_aref): ditto. + (match_values_at): ditto. + (match_inspect): ditto. + + * string.c (rb_str_subpat_set): use RMATCH_REGS. + (rb_str_sub_bang): ditto. + (str_gsub): ditto. + (rb_str_split_m): ditto. + (scan_once): ditto. + + * gc.c (obj_free): free character offsets. + Sun Feb 17 03:13:40 2008 NAKAMURA Usaku * win32/resource.rb: made version infos confirm to OS spec. diff --git a/gc.c b/gc.c index fec5802e71..4ef74f6e36 100644 --- a/gc.c +++ b/gc.c @@ -1322,9 +1322,12 @@ obj_free(VALUE obj) } break; case T_MATCH: - if (RANY(obj)->as.match.regs) { - onig_region_free(RANY(obj)->as.match.regs, 0); - RUBY_CRITICAL(free(RANY(obj)->as.match.regs)); + if (RANY(obj)->as.match.rmatch) { + struct rmatch *rm = RANY(obj)->as.match.rmatch; + onig_region_free(&rm->regs, 0); + if (rm->char_offset) + RUBY_CRITICAL(free(rm->char_offset)); + RUBY_CRITICAL(free(rm)); } break; case T_FILE: diff --git a/include/ruby/re.h b/include/ruby/re.h index 07c08614da..cc7f6025a7 100644 --- a/include/ruby/re.h +++ b/include/ruby/re.h @@ -26,14 +26,28 @@ extern "C" { typedef struct re_pattern_buffer Regexp; +struct rmatch_offset { + int beg; + int end; +}; + +struct rmatch { + struct re_registers regs; + + int char_offset_updated; + int char_offset_num_allocated; + struct rmatch_offset *char_offset; +}; + struct RMatch { struct RBasic basic; VALUE str; - struct re_registers *regs; + struct rmatch *rmatch; VALUE regexp; /* RRegexp */ }; #define RMATCH(obj) (R_CAST(RMatch)(obj)) +#define RMATCH_REGS(obj) (&(R_CAST(RMatch)(obj))->rmatch->regs) VALUE rb_reg_regcomp(VALUE); int rb_reg_search(VALUE, VALUE, int, int); diff --git a/re.c b/re.c index 90dae9ba9e..f6bc8a205a 100644 --- a/re.c +++ b/re.c @@ -670,27 +670,129 @@ match_alloc(VALUE klass) OBJSETUP(match, klass, T_MATCH); match->str = 0; - match->regs = 0; + match->rmatch = 0; match->regexp = 0; - match->regs = ALLOC(struct re_registers); - MEMZERO(match->regs, struct re_registers, 1); + match->rmatch = ALLOC(struct rmatch); + MEMZERO(match->rmatch, struct rmatch, 1); return (VALUE)match; } +typedef struct { + int byte_pos; + int char_pos; +} pair_t; + +static int +pair_byte_cmp(const void *pair1, const void *pair2) +{ + return ((pair_t*)pair1)->byte_pos - ((pair_t*)pair2)->byte_pos; +} + +static void +update_char_offset(VALUE match) +{ + struct rmatch *rm = RMATCH(match)->rmatch; + struct re_registers *regs; + int num_regs; + int i, num_pos, c; + char *s, *p, *q, *e; + rb_encoding *enc; + pair_t *pairs; + + if (rm->char_offset_updated) + return; + + regs = &rm->regs; + num_regs = rm->regs.num_regs; + + if (rm->char_offset_num_allocated < num_regs) { + REALLOC_N(rm->char_offset, struct rmatch_offset, num_regs); + rm->char_offset_num_allocated = num_regs; + } + + enc = rb_enc_get(RMATCH(match)->str); + if (rb_enc_mbmaxlen(enc) == 1) { + for (i = 0; i < num_regs; i++) { + rm->char_offset[i].beg = BEG(i); + rm->char_offset[i].end = END(i); + } + rm->char_offset_updated = 1; + return; + } + + pairs = ALLOCA_N(pair_t, num_regs*2); + num_pos = 0; + for (i = 0; i < num_regs; i++) { + if (BEG(i) < 0) + continue; + pairs[num_pos++].byte_pos = BEG(i); + pairs[num_pos++].byte_pos = END(i); + } + qsort(pairs, num_pos, sizeof(pair_t), pair_byte_cmp); + + s = p = RSTRING_PTR(RMATCH(match)->str); + e = s + RSTRING_LEN(RMATCH(match)->str); + c = 0; + for (i = 0; i < num_pos; i++) { + q = s + pairs[i].byte_pos; + c += rb_enc_strlen(p, q, enc); + pairs[i].char_pos = c; + p = q; + } + + for (i = 0; i < num_regs; i++) { + pair_t key, *found; + if (BEG(i) < 0) { + rm->char_offset[i].beg = -1; + rm->char_offset[i].end = -1; + continue; + } + + key.byte_pos = BEG(i); + found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp); + rm->char_offset[i].beg = found->char_pos; + + key.byte_pos = END(i); + found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp); + rm->char_offset[i].end = found->char_pos; + } + + rm->char_offset_updated = 1; +} + /* :nodoc: */ static VALUE match_init_copy(VALUE obj, VALUE orig) { + struct rmatch *rm; + if (obj == orig) return obj; if (!rb_obj_is_instance_of(orig, rb_obj_class(obj))) { rb_raise(rb_eTypeError, "wrong argument class"); } RMATCH(obj)->str = RMATCH(orig)->str; - onig_region_free(RMATCH(obj)->regs, 0); - RMATCH(obj)->regs->allocated = 0; - onig_region_copy(RMATCH(obj)->regs, RMATCH(orig)->regs); + RMATCH(obj)->regexp = RMATCH(orig)->regexp; + + rm = RMATCH(obj)->rmatch; + onig_region_free(&rm->regs, 0); + rm->regs.allocated = 0; + + onig_region_copy(&rm->regs, RMATCH_REGS(orig)); + + if (!RMATCH(orig)->rmatch->char_offset_updated) { + rm->char_offset_updated = 0; + } + else { + if (rm->char_offset_num_allocated < rm->regs.num_regs) { + REALLOC_N(rm->char_offset, struct rmatch_offset, rm->regs.num_regs); + rm->char_offset_num_allocated = rm->regs.num_regs; + } + MEMCPY(rm->char_offset, RMATCH(orig)->rmatch->char_offset, + struct rmatch_offset, rm->regs.num_regs); + rm->char_offset_updated = 1; + } return obj; } @@ -747,16 +849,7 @@ match_names(VALUE match) static VALUE match_size(VALUE match) { - return INT2FIX(RMATCH(match)->regs->num_regs); -} - -static VALUE -match_sublen(VALUE str, int offset) -{ - int i; - - i = rb_str_sublen(str, offset); - return INT2FIX(i); + return INT2FIX(RMATCH_REGS(match)->num_regs); } static int @@ -765,7 +858,7 @@ match_backref_number(VALUE match, VALUE backref) const char *name; int num; - struct re_registers *regs = RMATCH(match)->regs; + struct re_registers *regs = RMATCH_REGS(match); VALUE regexp = RMATCH(match)->regexp; switch(TYPE(backref)) { @@ -816,15 +909,17 @@ static VALUE match_offset(VALUE match, VALUE n) { int i = match_backref_number(match, n); + struct re_registers *regs = RMATCH_REGS(match); - if (i < 0 || RMATCH(match)->regs->num_regs <= i) + if (i < 0 || regs->num_regs <= i) rb_raise(rb_eIndexError, "index %d out of matches", i); - if (RMATCH(match)->regs->beg[i] < 0) + if (BEG(i) < 0) return rb_assoc_new(Qnil, Qnil); - return rb_assoc_new(match_sublen(RMATCH(match)->str, RMATCH(match)->regs->beg[i]), - match_sublen(RMATCH(match)->str, RMATCH(match)->regs->end[i])); + update_char_offset(match); + return rb_assoc_new(INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg), + INT2FIX(RMATCH(match)->rmatch->char_offset[i].end)); } @@ -849,14 +944,16 @@ static VALUE match_begin(VALUE match, VALUE n) { int i = match_backref_number(match, n); + struct re_registers *regs = RMATCH_REGS(match); - if (i < 0 || RMATCH(match)->regs->num_regs <= i) + if (i < 0 || regs->num_regs <= i) rb_raise(rb_eIndexError, "index %d out of matches", i); - if (RMATCH(match)->regs->beg[i] < 0) + if (BEG(i) < 0) return Qnil; - return match_sublen(RMATCH(match)->str, RMATCH(match)->regs->beg[i]); + update_char_offset(match); + return INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg); } @@ -881,14 +978,16 @@ static VALUE match_end(VALUE match, VALUE n) { int i = match_backref_number(match, n); + struct re_registers *regs = RMATCH_REGS(match); - if (i < 0 || RMATCH(match)->regs->num_regs <= i) + if (i < 0 || regs->num_regs <= i) rb_raise(rb_eIndexError, "index %d out of matches", i); - if (RMATCH(match)->regs->beg[i] < 0) + if (BEG(i) < 0) return Qnil; - return match_sublen(RMATCH(match)->str, RMATCH(match)->regs->end[i]); + update_char_offset(match); + return INT2FIX(RMATCH(match)->rmatch->char_offset[i].end); } #define MATCH_BUSY FL_USER2 @@ -1094,9 +1193,10 @@ rb_reg_search(VALUE re, VALUE str, int pos, int reverse) FL_UNSET(match, FL_TAINT); } - onig_region_copy(RMATCH(match)->regs, ®s); + onig_region_copy(RMATCH_REGS(match), ®s); RMATCH(match)->str = rb_str_new4(str); RMATCH(match)->regexp = re; + RMATCH(match)->rmatch->char_offset_updated = 0; rb_backref_set(match); OBJ_INFECT(match, re); @@ -1108,15 +1208,17 @@ rb_reg_search(VALUE re, VALUE str, int pos, int reverse) VALUE rb_reg_nth_defined(int nth, VALUE match) { + struct re_registers *regs; if (NIL_P(match)) return Qnil; - if (nth >= RMATCH(match)->regs->num_regs) { + regs = RMATCH_REGS(match); + if (nth >= regs->num_regs) { return Qnil; } if (nth < 0) { - nth += RMATCH(match)->regs->num_regs; + nth += regs->num_regs; if (nth <= 0) return Qnil; } - if (RMATCH(match)->BEG(nth) == -1) return Qfalse; + if (BEG(nth) == -1) return Qfalse; return Qtrue; } @@ -1125,18 +1227,20 @@ rb_reg_nth_match(int nth, VALUE match) { VALUE str; long start, end, len; + struct re_registers *regs; if (NIL_P(match)) return Qnil; - if (nth >= RMATCH(match)->regs->num_regs) { + regs = RMATCH_REGS(match); + if (nth >= regs->num_regs) { return Qnil; } if (nth < 0) { - nth += RMATCH(match)->regs->num_regs; + nth += regs->num_regs; if (nth <= 0) return Qnil; } - start = RMATCH(match)->BEG(nth); + start = BEG(nth); if (start == -1) return Qnil; - end = RMATCH(match)->END(nth); + end = END(nth); len = end - start; str = rb_str_subseq(RMATCH(match)->str, start, len); OBJ_INFECT(str, match); @@ -1165,10 +1269,12 @@ VALUE rb_reg_match_pre(VALUE match) { VALUE str; + struct re_registers *regs; if (NIL_P(match)) return Qnil; - if (RMATCH(match)->BEG(0) == -1) return Qnil; - str = rb_str_subseq(RMATCH(match)->str, 0, RMATCH(match)->BEG(0)); + regs = RMATCH_REGS(match); + if (BEG(0) == -1) return Qnil; + str = rb_str_subseq(RMATCH(match)->str, 0, BEG(0)); if (OBJ_TAINTED(match)) OBJ_TAINT(str); return str; } @@ -1190,11 +1296,13 @@ rb_reg_match_post(VALUE match) { VALUE str; long pos; + struct re_registers *regs; if (NIL_P(match)) return Qnil; - if (RMATCH(match)->BEG(0) == -1) return Qnil; + regs = RMATCH_REGS(match); + if (BEG(0) == -1) return Qnil; str = RMATCH(match)->str; - pos = RMATCH(match)->END(0); + pos = END(0); str = rb_str_subseq(str, pos, RSTRING_LEN(str) - pos); if (OBJ_TAINTED(match)) OBJ_TAINT(str); return str; @@ -1204,11 +1312,13 @@ VALUE rb_reg_match_last(VALUE match) { int i; + struct re_registers *regs; if (NIL_P(match)) return Qnil; - if (RMATCH(match)->BEG(0) == -1) return Qnil; + regs = RMATCH_REGS(match); + if (BEG(0) == -1) return Qnil; - for (i=RMATCH(match)->regs->num_regs-1; RMATCH(match)->BEG(i) == -1 && i > 0; i--) + for (i=regs->num_regs-1; BEG(i) == -1 && i > 0; i--) ; if (i == 0) return Qnil; return rb_reg_nth_match(i, match); @@ -1241,7 +1351,7 @@ last_paren_match_getter(void) static VALUE match_array(VALUE match, int start) { - struct re_registers *regs = RMATCH(match)->regs; + struct re_registers *regs = RMATCH_REGS(match); VALUE ary = rb_ary_new2(regs->num_regs); VALUE target = RMATCH(match)->str; int i; @@ -1381,7 +1491,7 @@ match_aref(int argc, VALUE *argv, VALUE match) p = StringValuePtr(idx); name_to_backref: - num = name_to_backref_number(RMATCH(match)->regs, + num = name_to_backref_number(RMATCH_REGS(match), RMATCH(match)->regexp, p, p + strlen(p)); return rb_reg_nth_match(num, match); break; @@ -1419,7 +1529,8 @@ match_entry(VALUE match, long n) static VALUE match_values_at(int argc, VALUE *argv, VALUE match) { - return rb_get_values_at(match, RMATCH(match)->regs->num_regs, argc, argv, match_entry); + struct re_registers *regs = RMATCH_REGS(match); + return rb_get_values_at(match, regs->num_regs, argc, argv, match_entry); } @@ -1506,7 +1617,8 @@ match_inspect(VALUE match) char *cname = rb_obj_classname(match); VALUE str; int i; - int num_regs = RMATCH(match)->regs->num_regs; + struct re_registers *regs = RMATCH_REGS(match); + int num_regs = regs->num_regs; struct backref_name_tag *names; VALUE regexp = RMATCH(match)->regexp; diff --git a/string.c b/string.c index 3a78f66a9a..109b2f846d 100644 --- a/string.c +++ b/string.c @@ -2716,27 +2716,29 @@ rb_str_subpat_set(VALUE str, VALUE re, int nth, VALUE val) VALUE match; long start, end, len; rb_encoding *enc; + struct re_registers *regs; if (rb_reg_search(re, str, 0, 0) < 0) { rb_raise(rb_eIndexError, "regexp not matched"); } match = rb_backref_get(); - if (nth >= RMATCH(match)->regs->num_regs) { + regs = RMATCH_REGS(match); + if (nth >= regs->num_regs) { out_of_range: rb_raise(rb_eIndexError, "index %d out of regexp", nth); } if (nth < 0) { - if (-nth >= RMATCH(match)->regs->num_regs) { + if (-nth >= regs->num_regs) { goto out_of_range; } - nth += RMATCH(match)->regs->num_regs; + nth += regs->num_regs; } - start = RMATCH(match)->BEG(nth); + start = BEG(nth); if (start == -1) { rb_raise(rb_eIndexError, "regexp group %d not matched", nth); } - end = RMATCH(match)->END(nth); + end = END(nth); len = end - start; StringValue(val); enc = rb_enc_check(str, val); @@ -2967,7 +2969,7 @@ rb_str_sub_bang(int argc, VALUE *argv, VALUE str) int cr = ENC_CODERANGE(str); match = rb_backref_get(); - regs = RMATCH(match)->regs; + regs = RMATCH_REGS(match); if (iter || !NIL_P(hash)) { char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str); @@ -3114,7 +3116,7 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang) do { n++; match = rb_backref_get(); - regs = RMATCH(match)->regs; + regs = RMATCH_REGS(match); if (iter || !NIL_P(hash)) { if (iter) { rb_match_busy(match); @@ -4751,7 +4753,7 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) struct re_registers *regs; while ((end = rb_reg_search(spat, str, start, 0)) >= 0) { - regs = RMATCH(rb_backref_get())->regs; + regs = RMATCH_REGS(rb_backref_get()); if (start == end && BEG(0) == END(0)) { if (!RSTRING_PTR(str)) { rb_ary_push(result, rb_str_new("", 0)); @@ -5397,7 +5399,7 @@ scan_once(VALUE str, VALUE pat, long *start) enc = STR_ENC_GET(str); if (rb_reg_search(pat, str, *start, 0) >= 0) { match = rb_backref_get(); - regs = RMATCH(match)->regs; + regs = RMATCH_REGS(match); if (BEG(0) == END(0)) { /* * Always consume at least one character of the input string -- cgit v1.2.3