From 8987b97ca9d3eca076a489fbf6eb9c1dfbf72df2 Mon Sep 17 00:00:00 2001 From: akr Date: Sun, 6 Jan 2008 09:25:09 +0000 Subject: * include/ruby/encoding.h (rb_enc_str_buf_cat): declared. * string.c (coderange_scan): extracted from rb_enc_str_coderange. (rb_enc_str_coderange): use coderange_scan. (rb_str_shared_replace): copy encoding and coderange. (rb_enc_str_buf_cat): new function for linear complexity string accumulation with encoding. (rb_str_sub_bang): don't conflict substituted part and replacement. (str_gsub): use rb_enc_str_buf_cat. (rb_str_clear): clear coderange. * re.c (rb_reg_regsub): use rb_enc_str_buf_cat. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@14910 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 15 +++ include/ruby/encoding.h | 1 + re.c | 36 +++---- string.c | 232 +++++++++++++++++++++++++++++--------------- test/ruby/test_m17n_comb.rb | 5 +- 5 files changed, 189 insertions(+), 100 deletions(-) diff --git a/ChangeLog b/ChangeLog index 9e9e67cb67..2576b21495 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,18 @@ +Sun Jan 6 18:19:12 2008 Tanaka Akira + + * include/ruby/encoding.h (rb_enc_str_buf_cat): declared. + + * string.c (coderange_scan): extracted from rb_enc_str_coderange. + (rb_enc_str_coderange): use coderange_scan. + (rb_str_shared_replace): copy encoding and coderange. + (rb_enc_str_buf_cat): new function for linear complexity string + accumulation with encoding. + (rb_str_sub_bang): don't conflict substituted part and replacement. + (str_gsub): use rb_enc_str_buf_cat. + (rb_str_clear): clear coderange. + + * re.c (rb_reg_regsub): use rb_enc_str_buf_cat. + Sun Jan 6 17:55:44 2008 Technorama Ltd. * lib/securerandom.rb: Add Win32 support. diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h index 2c247c99e9..bb29e7c46f 100644 --- a/include/ruby/encoding.h +++ b/include/ruby/encoding.h @@ -65,6 +65,7 @@ VALUE rb_enc_vsprintf(rb_encoding *, const char*, va_list); long rb_enc_strlen(const char*, const char*, rb_encoding*); char* rb_enc_nth(const char*, const char*, int, rb_encoding*); VALUE rb_obj_encoding(VALUE); +VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc); /* index -> rb_encoding */ rb_encoding* rb_enc_from_index(int idx); diff --git a/re.c b/re.c index 50fa341d15..8fd80628b8 100644 --- a/re.c +++ b/re.c @@ -2795,17 +2795,18 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp) VALUE val = 0; char *p, *s, *e; int no, clen; - rb_encoding *enc = rb_enc_check(str, src); + rb_encoding *str_enc = rb_enc_get(str); + rb_encoding *src_enc = rb_enc_get(src); p = s = RSTRING_PTR(str); e = s + RSTRING_LEN(str); while (s < e) { - int c = rb_enc_ascget(s, e, &clen, enc); + int c = rb_enc_ascget(s, e, &clen, str_enc); char *ss; if (c == -1) { - s += mbclen(s, e, enc); + s += mbclen(s, e, str_enc); continue; } ss = s; @@ -2816,12 +2817,12 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp) if (!val) { val = rb_str_buf_new(ss-p); } - rb_str_buf_cat(val, p, ss-p); + rb_enc_str_buf_cat(val, p, ss-p, str_enc); - c = rb_enc_ascget(s, e, &clen, enc); + c = rb_enc_ascget(s, e, &clen, str_enc); if (c == -1) { - s += mbclen(s, e, enc); - rb_str_buf_cat(val, ss, s-ss); + s += mbclen(s, e, str_enc); + rb_enc_str_buf_cat(val, ss, s-ss, str_enc); continue; } s += clen; @@ -2839,14 +2840,14 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp) break; case 'k': - if (s < e && rb_enc_ascget(s, e, &clen, enc) == '<') { + if (s < e && rb_enc_ascget(s, e, &clen, str_enc) == '<') { char *name, *name_end; name_end = name = s + clen; while (name_end < e) { - c = rb_enc_ascget(name_end, e, &clen, enc); + c = rb_enc_ascget(name_end, e, &clen, str_enc); if (c == '>') break; - name_end += c == -1 ? mbclen(name_end, e, enc) : clen; + name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen; } if (name_end < e) { no = name_to_backref_number(regs, regexp, name, name_end); @@ -2858,7 +2859,7 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp) } } - rb_str_buf_cat(val, ss, s-ss); + rb_enc_str_buf_cat(val, ss, s-ss, str_enc); continue; case '0': @@ -2867,11 +2868,11 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp) break; case '`': - rb_str_buf_cat(val, RSTRING_PTR(src), BEG(0)); + rb_enc_str_buf_cat(val, RSTRING_PTR(src), BEG(0), src_enc); continue; case '\'': - rb_str_buf_cat(val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0)); + rb_enc_str_buf_cat(val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0), src_enc); continue; case '+': @@ -2881,26 +2882,25 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp) break; case '\\': - rb_str_buf_cat(val, s-clen, clen); + rb_enc_str_buf_cat(val, s-clen, clen, str_enc); continue; default: - rb_str_buf_cat(val, ss, s-ss); + rb_enc_str_buf_cat(val, ss, s-ss, str_enc); continue; } if (no >= 0) { if (no >= regs->num_regs) continue; if (BEG(no) == -1) continue; - rb_str_buf_cat(val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no)); + rb_enc_str_buf_cat(val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no), src_enc); } } if (!val) return str; if (p < e) { - rb_str_buf_cat(val, p, e-p); + rb_enc_str_buf_cat(val, p, e-p, str_enc); } - rb_enc_associate(val, enc); return val; } diff --git a/string.c b/string.c index ccebadb29d..b5e57afedf 100644 --- a/string.c +++ b/string.c @@ -114,6 +114,31 @@ single_byte_optimizable(VALUE str) VALUE rb_fs; +static int +coderange_scan(const char *p, long len, rb_encoding *enc) +{ + const char *e = p + len; + int cr; + + cr = rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID; + while (p < e) { + int ret = rb_enc_precise_mbclen(p, e, enc); + int len = MBCLEN_CHARFOUND(ret); + + if (len) { + if (len != 1 || !ISASCII((unsigned char)*p)) { + cr = ENC_CODERANGE_VALID; + } + p += len; + } + else { + cr = ENC_CODERANGE_BROKEN; + break; + } + } + return cr; +} + int rb_enc_str_coderange(VALUE str) { @@ -121,26 +146,7 @@ rb_enc_str_coderange(VALUE str) if (cr == ENC_CODERANGE_UNKNOWN) { rb_encoding *enc = rb_enc_get(str); - - const char *p = RSTRING_PTR(str); - const char *e = p + RSTRING_LEN(str); - - cr = rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID; - while (p < e) { - int ret = rb_enc_precise_mbclen(p, e, enc); - int len = MBCLEN_CHARFOUND(ret); - - if (len) { - if (len != 1 || !rb_enc_isascii((unsigned char)*p, enc)) { - cr = ENC_CODERANGE_VALID; - } - p += len; - } - else { - cr = ENC_CODERANGE_BROKEN; - break; - } - } + cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc); ENC_CODERANGE_SET(str, cr); } return cr; @@ -404,13 +410,19 @@ rb_str_to_str(VALUE str) void rb_str_shared_replace(VALUE str, VALUE str2) { + rb_encoding *enc; + int cr; if (str == str2) return; + enc = rb_enc_get(str2); + cr = ENC_CODERANGE(str2); rb_str_modify(str); if (OBJ_TAINTED(str2)) OBJ_TAINT(str); if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) { STR_SET_EMBED(str); memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1); STR_SET_EMBED_LEN(str, RSTRING_LEN(str2)); + rb_enc_associate(str, enc); + ENC_CODERANGE_SET(str, cr); return; } if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) { @@ -431,6 +443,8 @@ rb_str_shared_replace(VALUE str, VALUE str2) RSTRING(str2)->as.heap.len = 0; RSTRING(str2)->as.heap.aux.capa = 0; STR_UNSET_NOCAPA(str2); + rb_enc_associate(str, enc); + ENC_CODERANGE_SET(str, cr); } static ID id_to_s; @@ -1031,6 +1045,94 @@ rb_str_cat2(VALUE str, const char *ptr) return rb_str_cat(str, ptr, strlen(ptr)); } +VALUE +rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc) +{ + long capa, total; + + rb_encoding *str_enc = rb_enc_get(str); + rb_encoding *res_enc; + int str_cr, ptr_cr, res_cr; + int str_a8 = rb_enc_to_index(str_enc) == 0; + int ptr_a8 = rb_enc_to_index(ptr_enc) == 0; + + str_cr = ENC_CODERANGE(str); + ptr_cr = coderange_scan(ptr, len, ptr_enc); + + if (str_cr == ENC_CODERANGE_UNKNOWN) { + if (str_a8 ? !ptr_a8 + : (str_enc != ptr_enc && ptr_cr != ENC_CODERANGE_7BIT)) { + str_cr = rb_enc_str_coderange(str); + } + } + + if (str_enc != ptr_enc && + str_cr != ENC_CODERANGE_7BIT && + ptr_cr != ENC_CODERANGE_7BIT) { + rb_raise(rb_eArgError, "append incompatible encoding strings: %s and %s", + rb_enc_name(str_enc), + rb_enc_name(ptr_enc)); + } + + if (str_cr == ENC_CODERANGE_UNKNOWN) { + res_enc = str_enc; + res_cr = ENC_CODERANGE_UNKNOWN; + } + else if (str_cr == ENC_CODERANGE_7BIT) { + if (ptr_cr == ENC_CODERANGE_7BIT) { + res_enc = !str_a8 ? str_enc : ptr_enc; + res_cr = ENC_CODERANGE_7BIT; + } + else { + res_enc = ptr_enc; + res_cr = ptr_cr; + } + } + else if (str_cr == ENC_CODERANGE_VALID) { + res_enc = str_enc; + res_cr = str_cr; + } + else { /* str_cr == ENC_CODERANGE_BROKEN */ + res_enc = str_enc; + res_cr = str_cr; + if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN; + } + + if (len < 0) { + rb_raise(rb_eArgError, "negative string size (or size too big)"); + } + rb_str_modify(str); + if (len == 0) { + rb_enc_associate(str, res_enc); + ENC_CODERANGE_SET(str, res_cr); + return str; + } + if (STR_ASSOC_P(str)) { + FL_UNSET(str, STR_ASSOC); + capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str); + } + else if (STR_EMBED_P(str)) { + capa = RSTRING_EMBED_LEN_MAX; + } + else { + capa = RSTRING(str)->as.heap.aux.capa; + } + total = RSTRING_LEN(str)+len; + if (capa <= total) { + while (total > capa) { + capa = (capa + 1) * 2; + } + RESIZE_CAPA(str, capa); + } + memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len); + STR_SET_LEN(str, total); + RSTRING_PTR(str)[total] = '\0'; // sentinel + + rb_enc_associate(str, res_enc); + ENC_CODERANGE_SET(str, res_cr); + return str; +} + VALUE rb_str_buf_append(VALUE str, VALUE str2) { @@ -2463,15 +2565,27 @@ rb_str_sub_bang(int argc, VALUE *argv, VALUE str) rb_match_busy(match); repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match))); - enc = rb_enc_check(str, repl); str_mod_check(str, p, len); str_frozen_check(str); rb_backref_set(match); } else { repl = rb_reg_regsub(repl, str, regs, pat); - enc = rb_enc_check(str, repl); } + enc = rb_enc_compatible(str, repl); + if (!enc) { + rb_encoding *str_enc = rb_enc_get(str); + if (coderange_scan(RSTRING_PTR(str), BEG(0), str_enc) != + ENC_CODERANGE_7BIT || + coderange_scan(RSTRING_PTR(str)+END(0), + RSTRING_LEN(str)-END(0), str_enc) != + ENC_CODERANGE_7BIT) { + rb_raise(rb_eArgError, "character encodings differ: %s and %s", + rb_enc_name(str_enc), + rb_enc_name(rb_enc_get(repl))); + } + enc = rb_enc_get(repl); + } rb_str_modify(str); rb_enc_associate(str, enc); if (OBJ_TAINTED(repl)) tainted = 1; @@ -2548,9 +2662,8 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang) long beg, n; long offset, blen, slen, len; int iter = 0; - char *buf, *bp, *sp, *cp; + char *sp, *cp; int tainted = 0; - int cr; switch (argc) { case 1: @@ -2575,25 +2688,18 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang) } blen = RSTRING_LEN(str) + 30; /* len + margin */ - dest = str_new(0, 0, blen); - buf = RSTRING_PTR(dest); - bp = buf; - sp = cp = RSTRING_PTR(str); + dest = rb_str_buf_new(blen); + sp = RSTRING_PTR(str); slen = RSTRING_LEN(str); - cr = ENC_CODERANGE(str); + cp = sp; - rb_str_locktmp(dest); do { - rb_encoding *enc; - n++; match = rb_backref_get(); regs = RMATCH(match)->regs; if (iter) { - rb_match_busy(match); val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match))); - enc = rb_enc_check(str, val); str_mod_check(str, sp, slen); if (bang) str_frozen_check(str); if (val == dest) { /* paranoid check [ruby-dev:24827] */ @@ -2603,28 +2709,16 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang) } else { val = rb_reg_regsub(repl, str, regs, pat); - enc = rb_enc_check(str, val); } - rb_enc_associate(str, enc); + if (OBJ_TAINTED(val)) tainted = 1; - if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) { - int cr2 = ENC_CODERANGE(val); - if (cr2 == ENC_CODERANGE_UNKNOWN || cr2 > cr) cr = cr2; - } - len = (bp - buf) + (beg - offset) + RSTRING_LEN(val) + 3; - if (blen < len) { - while (blen < len) blen *= 2; - len = bp - buf; - RESIZE_CAPA(dest, blen); - STR_SET_LEN(dest, blen); - buf = RSTRING_PTR(dest); - bp = buf + len; - } + len = beg - offset; /* copy pre-match substr */ - memcpy(bp, cp, len); - bp += len; - memcpy(bp, RSTRING_PTR(val), RSTRING_LEN(val)); - bp += RSTRING_LEN(val); + rb_enc_str_buf_cat(dest, cp, len, rb_enc_get(str)); + + rb_enc_str_buf_cat(dest, RSTRING_PTR(val), RSTRING_LEN(val), rb_enc_get(val)); + RB_GC_GUARD(val); + offset = END(0); if (BEG(0) == END(0)) { /* @@ -2632,9 +2726,8 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang) * in order to prevent infinite loops. */ if (RSTRING_LEN(str) <= END(0)) break; - len = rb_enc_mbclen(RSTRING_PTR(str)+END(0), RSTRING_END(str), enc); - memcpy(bp, RSTRING_PTR(str)+END(0), len); - bp += len; + len = rb_enc_mbclen(RSTRING_PTR(str)+END(0), RSTRING_END(str), rb_enc_get(str)); + rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+END(0), len, rb_enc_get(str)); offset = END(0) + len; } cp = RSTRING_PTR(str) + offset; @@ -2642,39 +2735,18 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang) beg = rb_reg_search(pat, str, offset, 0); } while (beg >= 0); if (RSTRING_LEN(str) > offset) { - len = bp - buf; - if (blen - len < RSTRING_LEN(str) - offset) { - blen = len + RSTRING_LEN(str) - offset; - RESIZE_CAPA(dest, blen); - buf = RSTRING_PTR(dest); - bp = buf + len; - } - memcpy(bp, cp, RSTRING_LEN(str) - offset); - bp += RSTRING_LEN(str) - offset; + rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, rb_enc_get(str)); } rb_backref_set(match); - *bp = '\0'; - rb_str_unlocktmp(dest); if (bang) { - if (str_independent(str) && !STR_EMBED_P(str)) { - free(RSTRING_PTR(str)); - } - STR_SET_NOEMBED(str); - STR_UNSET_NOCAPA(str); - RSTRING(str)->as.heap.ptr = buf; - RSTRING(str)->as.heap.aux.capa = blen; - RSTRING(dest)->as.heap.ptr = 0; - RSTRING(dest)->as.heap.len = 0; + rb_str_shared_replace(str, dest); } else { RBASIC(dest)->klass = rb_obj_class(str); OBJ_INFECT(dest, str); - rb_enc_copy(dest, str); str = dest; } - STR_SET_LEN(str, bp - buf); - ENC_CODERANGE_SET(str, cr); if (tainted) OBJ_TAINT(str); return str; } @@ -2782,6 +2854,7 @@ rb_str_replace(VALUE str, VALUE str2) OBJ_INFECT(str, str2); rb_enc_copy(str, str2); + ENC_CODERANGE_SET(str, ENC_CODERANGE(str2)); return str; } @@ -2805,6 +2878,7 @@ rb_str_clear(VALUE str) STR_SET_EMBED(str); STR_SET_EMBED_LEN(str, 0); RSTRING_PTR(str)[0] = 0; + ENC_CODERANGE_CLEAR(str); return str; } diff --git a/test/ruby/test_m17n_comb.rb b/test/ruby/test_m17n_comb.rb index b1928f3cd5..951961e09b 100644 --- a/test/ruby/test_m17n_comb.rb +++ b/test/ruby/test_m17n_comb.rb @@ -335,7 +335,6 @@ class TestM17NComb < Test::Unit::TestCase end def test_str_aref_substr - combination(STRINGS, STRINGS) {|s1, s2| if s1.ascii_only? || s2.ascii_only? || s1.encoding == s2.encoding t = s1[s2] @@ -1359,7 +1358,7 @@ class TestM17NComb < Test::Unit::TestCase assert_equal(s1, doit.call) next end - if !str_enc_compatible?(s1, s3) + if !str_enc_compatible?(s1.gsub(r2, ''), s3) assert_raise(ArgumentError, desc) { doit.call } next end @@ -1413,7 +1412,7 @@ class TestM17NComb < Test::Unit::TestCase assert_equal([s1, nil], doit.call) next end - if !str_enc_compatible?(s1, s3) + if !str_enc_compatible?(s1.gsub(r2, ''), s3) assert_raise(ArgumentError, desc) { doit.call } next end -- cgit v1.2.3