diff options
Diffstat (limited to 'string.c')
-rw-r--r-- | string.c | 1065 |
1 files changed, 674 insertions, 391 deletions
@@ -78,24 +78,41 @@ VALUE rb_cString; VALUE rb_cSymbol; -/* FLAGS of RString +/* Flags of RString * * 1: RSTRING_NOEMBED - * 2: STR_SHARED (== ELTS_SHARED) - * 5: STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be - * other strings that rely on this string's buffer) - * 6: STR_BORROWED (when RSTRING_NOEMBED==1 && klass==0, unsafe to recycle - * early, specific to rb_str_tmp_frozen_{acquire,release}) - * 7: STR_TMPLOCK (set when a pointer to the buffer is passed to syscall - * such as read(2). Any modification and realloc is prohibited) - * - * 8-9: ENC_CODERANGE (2 bits) - * 10-16: ENCODING (7 bits == 128) + * The string is not embedded. When a string is embedded, the contents + * follow the header. When a string is not embedded, the contents is + * on a separately allocated buffer. + * 2: STR_SHARED (equal to ELTS_SHARED) + * The string is shared. The buffer this string points to is owned by + * another string (the shared root). + * 3: STR_CHILLED (will be frozen in a future version) + * The string appears frozen but can be mutated with a warning. + * 5: STR_SHARED_ROOT + * Other strings may point to the contents of this string. When this + * flag is set, STR_SHARED must not be set. + * 6: STR_BORROWED + * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe + * to be unshared by rb_str_tmp_frozen_release. + * 7: STR_TMPLOCK + * The pointer to the buffer is passed to a system call such as + * read(2). Any modification and realloc is prohibited. + * 8-9: ENC_CODERANGE + * Stores the coderange of the string. + * 10-16: ENCODING + * Stores the encoding of the string. * 17: RSTRING_FSTR - * 18: STR_NOFREE (do not free this string's buffer when a String is freed. - * used for a string object based on C string literal) - * 19: STR_FAKESTR (when RVALUE is not managed by GC. Typically, the string - * object header is temporarily allocated on C stack) + * The string is a fstring. The string is deduplicated in the fstring + * table. 
+ * 18: STR_NOFREE + * Do not free this string's buffer when the string is reclaimed + * by the garbage collector. Used for when the string buffer is a C + * string literal. + * 19: STR_FAKESTR + * The string is not allocated or managed by the garbage collector. + * Typically, the string object header (struct RString) is temporarily + * allocated on C stack. */ #define RUBY_MAX_CHAR_LEN 16 @@ -109,19 +126,10 @@ VALUE rb_cSymbol; FL_SET((str), STR_NOEMBED);\ FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\ } while (0) -#define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE)) -# define STR_SET_EMBED_LEN(str, n) do { \ - assert(str_embed_capa(str) > (n));\ - RSTRING(str)->as.embed.len = (n);\ -} while (0) +#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE) #define STR_SET_LEN(str, n) do { \ - if (STR_EMBED_P(str)) {\ - STR_SET_EMBED_LEN((str), (n));\ - }\ - else {\ - RSTRING(str)->as.heap.len = (n);\ - }\ + RSTRING(str)->len = (n); \ } while (0) static inline bool @@ -158,13 +166,13 @@ str_enc_fastpath(VALUE str) const long tlen = RSTRING_LEN(str);\ memcpy(tmp, RSTRING_PTR(str), tlen);\ RSTRING(str)->as.heap.ptr = tmp;\ - RSTRING(str)->as.heap.len = tlen;\ + RSTRING(str)->len = tlen;\ STR_SET_NOEMBED(str);\ RSTRING(str)->as.heap.aux.capa = (capacity);\ }\ }\ else {\ - assert(!FL_TEST((str), STR_SHARED)); \ + RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \ SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \ (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \ RSTRING(str)->as.heap.aux.capa = (capacity);\ @@ -173,8 +181,8 @@ str_enc_fastpath(VALUE str) #define STR_SET_SHARED(str, shared_str) do { \ if (!FL_TEST(str, STR_FAKESTR)) { \ - assert(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \ - assert(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \ + RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \ + RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); 
\ RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \ FL_SET((str), STR_SHARED); \ FL_SET((shared_str), STR_SHARED_ROOT); \ @@ -222,7 +230,7 @@ rb_str_size_as_embedded(VALUE str) { size_t real_size; if (STR_EMBED_P(str)) { - real_size = rb_str_embed_size(RSTRING(str)->as.embed.len) + TERM_LEN(str); + real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str); } /* if the string is not currently embedded, but it can be embedded, how * much space would it require */ @@ -275,10 +283,10 @@ rb_str_make_embedded(VALUE str) RUBY_ASSERT(!STR_EMBED_P(str)); char *buf = RSTRING(str)->as.heap.ptr; - long len = RSTRING(str)->as.heap.len; + long len = RSTRING(str)->len; STR_SET_EMBED(str); - STR_SET_EMBED_LEN(str, len); + STR_SET_LEN(str, len); if (len > 0) { memcpy(RSTRING_PTR(str), buf, len); @@ -289,26 +297,6 @@ rb_str_make_embedded(VALUE str) } void -rb_str_update_shared_ary(VALUE str, VALUE old_root, VALUE new_root) -{ - // if the root location hasn't changed, we don't need to update - if (new_root == old_root) { - return; - } - - // if the root string isn't embedded, we don't need to touch the pointer. - // it already points to the shame shared buffer - if (!STR_EMBED_P(new_root)) { - return; - } - - size_t offset = (size_t)((uintptr_t)RSTRING(str)->as.heap.ptr - (uintptr_t)RSTRING(old_root)->as.embed.ary); - - RUBY_ASSERT(RSTRING(str)->as.heap.ptr >= RSTRING(old_root)->as.embed.ary); - RSTRING(str)->as.heap.ptr = RSTRING(new_root)->as.embed.ary + offset; -} - -void rb_debug_rstring_null_ptr(const char *func) { fprintf(stderr, "%s is returning NULL!! 
" @@ -382,24 +370,25 @@ fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int exist else { if (FL_TEST_RAW(str, STR_FAKESTR)) { if (arg->copy) { - VALUE new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->as.heap.len); + VALUE new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len); rb_enc_copy(new_str, str); str = new_str; } else { str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr, - RSTRING(str)->as.heap.len, + RSTRING(str)->len, ENCODING_GET(str)); } - OBJ_FREEZE_RAW(str); + OBJ_FREEZE(str); } else { - if (!OBJ_FROZEN(str)) + if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) { str = str_new_frozen(rb_cString, str); + } if (STR_SHARED_P(str)) { /* str should not be shared */ /* shared substring */ str_make_independent(str); - assert(OBJ_FROZEN(str)); + RUBY_ASSERT(OBJ_FROZEN(str)); } if (!BARE_STRING_P(str)) { str = str_new_frozen(rb_cString, str); @@ -412,7 +401,6 @@ fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int exist } } -RUBY_FUNC_EXPORTED VALUE rb_fstring(VALUE str) { @@ -427,23 +415,24 @@ rb_fstring(VALUE str) bare = BARE_STRING_P(str); if (!bare) { if (STR_EMBED_P(str)) { - OBJ_FREEZE_RAW(str); + OBJ_FREEZE(str); return str; } - if (FL_TEST_RAW(str, STR_NOEMBED|STR_SHARED_ROOT|STR_SHARED) == (STR_NOEMBED|STR_SHARED_ROOT)) { - assert(OBJ_FROZEN(str)); + + if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) { + RUBY_ASSERT(OBJ_FROZEN(str)); return str; } } - if (!OBJ_FROZEN(str)) + if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED)) rb_str_resize(str, RSTRING_LEN(str)); fstr = register_fstring(str, FALSE); if (!bare) { str_replace_shared_without_enc(str, fstr); - OBJ_FREEZE_RAW(str); + OBJ_FREEZE(str); return str; } return fstr; @@ -465,10 +454,11 @@ register_fstring(VALUE str, bool copy) } RB_VM_LOCK_LEAVE(); - assert(OBJ_FROZEN(args.fstr)); - assert(!FL_TEST_RAW(args.fstr, STR_FAKESTR)); - assert(!FL_TEST_RAW(args.fstr, FL_EXIVAR)); - 
assert(RBASIC_CLASS(args.fstr) == rb_cString); + RUBY_ASSERT(OBJ_FROZEN(args.fstr)); + RUBY_ASSERT(!FL_TEST_RAW(args.fstr, STR_FAKESTR)); + RUBY_ASSERT(!FL_TEST_RAW(args.fstr, FL_EXIVAR)); + RUBY_ASSERT(RBASIC_CLASS(args.fstr) == rb_cString); + return args.fstr; } @@ -486,7 +476,7 @@ setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx) ENCODING_SET_INLINED((VALUE)fake_str, encidx); RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString); - fake_str->as.heap.len = len; + fake_str->len = len; fake_str->as.heap.ptr = (char *)name; fake_str->as.heap.aux.capa = len; return (VALUE)fake_str; @@ -832,7 +822,7 @@ str_capacity(VALUE str, const int termlen) return str_embed_capa(str) - termlen; } else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) { - return RSTRING(str)->as.heap.len; + return RSTRING(str)->len; } else { return RSTRING(str)->as.heap.aux.capa; @@ -857,8 +847,8 @@ static inline VALUE str_alloc_embed(VALUE klass, size_t capa) { size_t size = rb_str_embed_size(capa); - assert(size > 0); - assert(rb_gc_size_allocatable_p(size)); + RUBY_ASSERT(size > 0); + RUBY_ASSERT(rb_gc_size_allocatable_p(size)); NEWOBJ_OF(str, struct RString, klass, T_STRING | (RGENGC_WB_PROTECTED_STRING ? 
FL_WB_PROTECTED : 0), size, 0); @@ -1012,7 +1002,7 @@ str_new_static(VALUE klass, const char *ptr, long len, int encindex) else { RUBY_DTRACE_CREATE_HOOK(STRING, len); str = str_alloc_heap(klass); - RSTRING(str)->as.heap.len = len; + RSTRING(str)->len = len; RSTRING(str)->as.heap.ptr = (char *)ptr; RSTRING(str)->as.heap.aux.capa = len; RBASIC(str)->flags |= STR_NOFREE; @@ -1160,6 +1150,7 @@ str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len, rb_str_resize(newstr, olen); } DATA_PTR(econv_wrapper) = 0; + RB_GC_GUARD(econv_wrapper); rb_econv_close(ec); switch (ret) { case econv_finished: @@ -1296,7 +1287,6 @@ str_replace_shared_without_enc(VALUE str2, VALUE str) char *ptr2 = RSTRING(str2)->as.embed.ary; STR_SET_EMBED(str2); memcpy(ptr2, RSTRING_PTR(str), len); - STR_SET_EMBED_LEN(str2, len); TERM_FILL(ptr2+len, termlen); } else { @@ -1309,7 +1299,8 @@ str_replace_shared_without_enc(VALUE str2, VALUE str) root = rb_str_new_frozen(str); RSTRING_GETMEM(root, ptr, len); } - assert(OBJ_FROZEN(root)); + RUBY_ASSERT(OBJ_FROZEN(root)); + if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) { if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) { rb_fatal("about to free a possible shared root"); @@ -1320,10 +1311,12 @@ str_replace_shared_without_enc(VALUE str2, VALUE str) } } FL_SET(str2, STR_NOEMBED); - RSTRING(str2)->as.heap.len = len; RSTRING(str2)->as.heap.ptr = ptr; STR_SET_SHARED(str2, root); } + + STR_SET_LEN(str2, len); + return str2; } @@ -1350,7 +1343,7 @@ rb_str_new_shared(VALUE str) VALUE rb_str_new_frozen(VALUE orig) { - if (OBJ_FROZEN(orig)) return orig; + if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig; return str_new_frozen(rb_obj_class(orig), orig); } @@ -1368,6 +1361,42 @@ rb_str_tmp_frozen_acquire(VALUE orig) return str_new_frozen_buffer(0, orig, FALSE); } +VALUE +rb_str_tmp_frozen_no_embed_acquire(VALUE orig) +{ + if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig; 
+ if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig); + + VALUE str = str_alloc_heap(0); + OBJ_FREEZE(str); + /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */ + FL_SET(str, STR_SHARED_ROOT); + + size_t capa = str_capacity(orig, TERM_LEN(orig)); + + /* If the string is embedded then we want to create a copy that is heap + * allocated. If the string is shared then the shared root must be + * embedded, so we want to create a copy. If the string is a shared root + * then it must be embedded, so we want to create a copy. */ + if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) { + RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig)); + memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa); + } + else { + /* orig must be heap allocated and not shared, so we can safely transfer + * the pointer to str. */ + RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr; + RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE; + RBASIC(orig)->flags &= ~STR_NOFREE; + STR_SET_SHARED(orig, str); + } + + RSTRING(str)->len = RSTRING(orig)->len; + RSTRING(str)->as.heap.aux.capa = capa; + + return str; +} + void rb_str_tmp_frozen_release(VALUE orig, VALUE tmp) { @@ -1375,25 +1404,25 @@ rb_str_tmp_frozen_release(VALUE orig, VALUE tmp) return; if (STR_EMBED_P(tmp)) { - assert(OBJ_FROZEN_RAW(tmp)); + RUBY_ASSERT(OBJ_FROZEN_RAW(tmp)); } else if (FL_TEST_RAW(orig, STR_SHARED) && !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) { VALUE shared = RSTRING(orig)->as.heap.aux.shared; if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) { - assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr); - assert(RSTRING(orig)->as.heap.len == RSTRING(tmp)->as.heap.len); + RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr); + RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp)); /* Unshare orig since the root (tmp) only 
has this one child. */ FL_UNSET_RAW(orig, STR_SHARED); RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa; RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE; - assert(OBJ_FROZEN_RAW(tmp)); + RUBY_ASSERT(OBJ_FROZEN_RAW(tmp)); /* Make tmp embedded and empty so it is safe for sweeping. */ STR_SET_EMBED(tmp); - STR_SET_EMBED_LEN(tmp, 0); + STR_SET_LEN(tmp, 0); } } } @@ -1407,11 +1436,11 @@ str_new_frozen(VALUE klass, VALUE orig) static VALUE heap_str_make_shared(VALUE klass, VALUE orig) { - assert(!STR_EMBED_P(orig)); - assert(!STR_SHARED_P(orig)); + RUBY_ASSERT(!STR_EMBED_P(orig)); + RUBY_ASSERT(!STR_SHARED_P(orig)); VALUE str = str_alloc_heap(klass); - RSTRING(str)->as.heap.len = RSTRING_LEN(orig); + STR_SET_LEN(str, RSTRING_LEN(orig)); RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig); RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa; RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE; @@ -1432,25 +1461,25 @@ str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding) if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) { str = str_new0(klass, RSTRING_PTR(orig), len, termlen); - assert(STR_EMBED_P(str)); + RUBY_ASSERT(STR_EMBED_P(str)); } else { if (FL_TEST_RAW(orig, STR_SHARED)) { VALUE shared = RSTRING(orig)->as.heap.aux.shared; long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared); - long rest = RSTRING_LEN(shared) - ofs - RSTRING(orig)->as.heap.len; - assert(ofs >= 0); - assert(rest >= 0); - assert(ofs + rest <= RSTRING_LEN(shared)); - assert(OBJ_FROZEN(shared)); + long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig); + RUBY_ASSERT(ofs >= 0); + RUBY_ASSERT(rest >= 0); + RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared)); + RUBY_ASSERT(OBJ_FROZEN(shared)); if ((ofs > 0) || (rest > 0) || (klass != RBASIC(shared)->klass) || ENCODING_GET(shared) != ENCODING_GET(orig)) { str = str_new_shared(klass, shared); - assert(!STR_EMBED_P(str)); + RUBY_ASSERT(!STR_EMBED_P(str)); RSTRING(str)->as.heap.ptr += ofs; - 
RSTRING(str)->as.heap.len -= ofs + rest; + STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest)); } else { if (RBASIC_CLASS(shared) == 0) @@ -1462,7 +1491,7 @@ str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding) str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig)); STR_SET_EMBED(str); memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig)); - STR_SET_EMBED_LEN(str, RSTRING_LEN(orig)); + STR_SET_LEN(str, RSTRING_LEN(orig)); TERM_FILL(RSTRING_END(str), TERM_LEN(orig)); } else { @@ -1552,7 +1581,7 @@ rb_str_free(VALUE str) } } -RUBY_FUNC_EXPORTED size_t +size_t rb_str_memsize(VALUE str) { if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) { @@ -1591,23 +1620,24 @@ str_shared_replace(VALUE str, VALUE str2) str_discard(str); termlen = rb_enc_mbminlen(enc); + STR_SET_LEN(str, RSTRING_LEN(str2)); + if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) { STR_SET_EMBED(str); memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen); - STR_SET_EMBED_LEN(str, RSTRING_LEN(str2)); rb_enc_associate(str, enc); ENC_CODERANGE_SET(str, cr); } else { if (STR_EMBED_P(str2)) { - assert(!FL_TEST(str2, STR_SHARED)); - long len = RSTRING(str2)->as.embed.len; - assert(len + termlen <= str_embed_capa(str2)); + RUBY_ASSERT(!FL_TEST(str2, STR_SHARED)); + long len = RSTRING_LEN(str2); + RUBY_ASSERT(len + termlen <= str_embed_capa(str2)); char *new_ptr = ALLOC_N(char, len + termlen); memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen); RSTRING(str2)->as.heap.ptr = new_ptr; - RSTRING(str2)->as.heap.len = len; + STR_SET_LEN(str2, len); RSTRING(str2)->as.heap.aux.capa = len; STR_SET_NOEMBED(str2); } @@ -1615,7 +1645,6 @@ str_shared_replace(VALUE str, VALUE str2) STR_SET_NOEMBED(str); FL_UNSET(str, STR_SHARED); RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2); - RSTRING(str)->as.heap.len = RSTRING_LEN(str2); if (FL_TEST(str2, STR_SHARED)) { VALUE shared = RSTRING(str2)->as.heap.aux.shared; @@ -1628,7 +1657,7 @@ 
str_shared_replace(VALUE str, VALUE str2) /* abandon str2 */ STR_SET_EMBED(str2); RSTRING_PTR(str2)[0] = 0; - STR_SET_EMBED_LEN(str2, 0); + STR_SET_LEN(str2, 0); rb_enc_associate(str, enc); ENC_CODERANGE_SET(str, cr); } @@ -1662,9 +1691,9 @@ str_replace(VALUE str, VALUE str2) len = RSTRING_LEN(str2); if (STR_SHARED_P(str2)) { VALUE shared = RSTRING(str2)->as.heap.aux.shared; - assert(OBJ_FROZEN(shared)); + RUBY_ASSERT(OBJ_FROZEN(shared)); STR_SET_NOEMBED(str); - RSTRING(str)->as.heap.len = len; + STR_SET_LEN(str, len); RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2); STR_SET_SHARED(str, shared); rb_enc_cr_str_exact_copy(str, str2); @@ -1680,8 +1709,8 @@ static inline VALUE ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa) { size_t size = rb_str_embed_size(capa); - assert(size > 0); - assert(rb_gc_size_allocatable_p(size)); + RUBY_ASSERT(size > 0); + RUBY_ASSERT(rb_gc_size_allocatable_p(size)); NEWOBJ_OF(str, struct RString, klass, T_STRING | (RGENGC_WB_PROTECTED_STRING ? 
FL_WB_PROTECTED : 0), size, ec); @@ -1708,11 +1737,10 @@ str_duplicate_setup(VALUE klass, VALUE str, VALUE dup) VALUE flags = FL_TEST_RAW(str, flag_mask); int encidx = 0; if (STR_EMBED_P(str)) { - long len = RSTRING_EMBED_LEN(str); + long len = RSTRING_LEN(str); - assert(STR_EMBED_P(dup)); - assert(str_embed_capa(dup) >= len + 1); - STR_SET_EMBED_LEN(dup, len); + RUBY_ASSERT(STR_EMBED_P(dup)); + RUBY_ASSERT(str_embed_capa(dup) >= len + 1); MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1); } else { @@ -1724,16 +1752,17 @@ str_duplicate_setup(VALUE klass, VALUE str, VALUE dup) root = str = str_new_frozen(klass, str); flags = FL_TEST_RAW(str, flag_mask); } - assert(!STR_SHARED_P(root)); - assert(RB_OBJ_FROZEN_RAW(root)); + RUBY_ASSERT(!STR_SHARED_P(root)); + RUBY_ASSERT(RB_OBJ_FROZEN_RAW(root)); - RSTRING(dup)->as.heap.len = RSTRING_LEN(str); RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str); FL_SET(root, STR_SHARED_ROOT); RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root); flags |= RSTRING_NOEMBED | STR_SHARED; } + STR_SET_LEN(dup, RSTRING_LEN(str)); + if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) { encidx = rb_enc_get_index(str); flags &= ~ENCODING_MASK; @@ -1747,11 +1776,11 @@ static inline VALUE ec_str_duplicate(struct rb_execution_context_struct *ec, VALUE klass, VALUE str) { VALUE dup; - if (FL_TEST(str, STR_NOEMBED)) { - dup = ec_str_alloc_heap(ec, klass); + if (STR_EMBED_P(str)) { + dup = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str)); } else { - dup = ec_str_alloc_embed(ec, klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str)); + dup = ec_str_alloc_heap(ec, klass); } return str_duplicate_setup(klass, str, dup); @@ -1761,11 +1790,11 @@ static inline VALUE str_duplicate(VALUE klass, VALUE str) { VALUE dup; - if (FL_TEST(str, STR_NOEMBED)) { - dup = str_alloc_heap(klass); + if (STR_EMBED_P(str)) { + dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str)); } else { - dup = 
str_alloc_embed(klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str)); + dup = str_alloc_heap(klass); } return str_duplicate_setup(klass, str, dup); @@ -1777,6 +1806,18 @@ rb_str_dup(VALUE str) return str_duplicate(rb_obj_class(str), str); } +/* :nodoc: */ +VALUE +rb_str_dup_m(VALUE str) +{ + if (LIKELY(BARE_STRING_P(str))) { + return str_duplicate(rb_obj_class(str), str); + } + else { + return rb_obj_dup(str); + } +} + VALUE rb_str_resurrect(VALUE str) { @@ -1785,10 +1826,20 @@ rb_str_resurrect(VALUE str) } VALUE -rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str) +rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled) { RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str)); - return ec_str_duplicate(ec, rb_cString, str); + VALUE new_str = ec_str_duplicate(ec, rb_cString, str); + if (chilled) { + STR_CHILL_RAW(new_str); + } + return new_str; +} + +bool +rb_str_chilled_p(VALUE str) +{ + return CHILLED_STRING_P(str); } /* @@ -1839,17 +1890,13 @@ rb_str_init(int argc, VALUE *argv, VALUE str) if (orig == str) n = 0; } str_modifiable(str); - if (STR_EMBED_P(str)) { /* make noembed always */ - char *new_ptr = ALLOC_N(char, (size_t)capa + termlen); - assert(RSTRING(str)->as.embed.len + 1 <= str_embed_capa(str)); - memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING(str)->as.embed.len + 1); - RSTRING(str)->as.heap.ptr = new_ptr; - } - else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) { + if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) { + /* make noembed always */ const size_t size = (size_t)capa + termlen; const char *const old_ptr = RSTRING_PTR(str); - const size_t osize = RSTRING(str)->as.heap.len + TERM_LEN(str); - char *new_ptr = ALLOC_N(char, (size_t)capa + termlen); + const size_t osize = RSTRING_LEN(str) + TERM_LEN(str); + char *new_ptr = ALLOC_N(char, size); + if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str)); memcpy(new_ptr, old_ptr, osize < size ? 
osize : size); FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE); RSTRING(str)->as.heap.ptr = new_ptr; @@ -1858,7 +1905,7 @@ rb_str_init(int argc, VALUE *argv, VALUE str) SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, (size_t)capa + termlen, STR_HEAP_SIZE(str)); } - RSTRING(str)->as.heap.len = len; + STR_SET_LEN(str, len); TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen); if (n == 1) { memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len); @@ -1881,6 +1928,98 @@ rb_str_init(int argc, VALUE *argv, VALUE str) return str; } +/* :nodoc: */ +static VALUE +rb_str_s_new(int argc, VALUE *argv, VALUE klass) +{ + if (klass != rb_cString) { + return rb_class_new_instance_pass_kw(argc, argv, klass); + } + + static ID keyword_ids[2]; + VALUE orig, opt, encoding = Qnil, capacity = Qnil; + VALUE kwargs[2]; + rb_encoding *enc = NULL; + + int n = rb_scan_args(argc, argv, "01:", &orig, &opt); + if (NIL_P(opt)) { + return rb_class_new_instance_pass_kw(argc, argv, klass); + } + + keyword_ids[0] = rb_id_encoding(); + CONST_ID(keyword_ids[1], "capacity"); + rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs); + encoding = kwargs[0]; + capacity = kwargs[1]; + + int termlen = 1; + + if (n == 1) { + orig = StringValue(orig); + } + else { + orig = Qnil; + } + + if (UNDEF_P(encoding)) { + if (!NIL_P(orig)) { + encoding = rb_obj_encoding(orig); + } + } + + if (!UNDEF_P(encoding)) { + enc = rb_to_encoding(encoding); + termlen = rb_enc_mbminlen(enc); + } + + // If capacity is nil, we're basically just duping `orig`. 
+ if (UNDEF_P(capacity)) { + if (NIL_P(orig)) { + VALUE empty_str = str_new(klass, "", 0); + if (enc) { + rb_enc_associate(empty_str, enc); + } + return empty_str; + } + VALUE copy = str_duplicate(klass, orig); + rb_enc_associate(copy, enc); + ENC_CODERANGE_CLEAR(copy); + return copy; + } + + long capa = 0; + capa = NUM2LONG(capacity); + if (capa < 0) { + capa = 0; + } + + if (!NIL_P(orig)) { + long orig_capa = rb_str_capacity(orig); + if (orig_capa > capa) { + capa = orig_capa; + } + } + + long fake_len = capa - termlen; + if (fake_len < 0) { + fake_len = 0; + } + + VALUE str = str_new0(klass, NULL, fake_len, termlen); + STR_SET_LEN(str, 0); + TERM_FILL(RSTRING_PTR(str), termlen); + + if (enc) { + rb_enc_associate(str, enc); + } + + if (!NIL_P(orig)) { + rb_str_buf_append(str, orig); + } + + return str; +} + #ifdef NONASCII_MASK #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80) @@ -2138,7 +2277,7 @@ rb_str_empty(VALUE str) * call-seq: * string + other_string -> new_string * - * Returns a new \String containing +other_string+ concatenated to +self+: + * Returns a new +String+ containing +other_string+ concatenated to +self+: * * "Hello from " + self.to_s # => "Hello from main" * @@ -2178,8 +2317,8 @@ rb_str_plus(VALUE str1, VALUE str2) VALUE rb_str_opt_plus(VALUE str1, VALUE str2) { - assert(RBASIC_CLASS(str1) == rb_cString); - assert(RBASIC_CLASS(str2) == rb_cString); + RUBY_ASSERT(RBASIC_CLASS(str1) == rb_cString); + RUBY_ASSERT(RBASIC_CLASS(str2) == rb_cString); long len1, len2; MAYBE_UNUSED(char) *ptr1, *ptr2; RSTRING_GETMEM(str1, ptr1, len1); @@ -2209,7 +2348,7 @@ rb_str_opt_plus(VALUE str1, VALUE str2) * call-seq: * string * integer -> new_string * - * Returns a new \String containing +integer+ copies of +self+: + * Returns a new +String+ containing +integer+ copies of +self+: * * "Ho! " * 3 # => "Ho! Ho! Ho! " * "Ho! 
" * 0 # => "" @@ -2284,7 +2423,7 @@ rb_str_times(VALUE str, VALUE times) * "%05d" % 123 # => "00123" * * If +self+ contains multiple substitutions, +object+ must be - * an \Array or \Hash containing the values to be substituted: + * an Array or Hash containing the values to be substituted: * * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168" * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar" @@ -2350,7 +2489,7 @@ str_make_independent_expand(VALUE str, long len, long expand, const int termlen) STR_SET_EMBED(str); memcpy(RSTRING(str)->as.embed.ary, ptr, len); TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen); - STR_SET_EMBED_LEN(str, len); + STR_SET_LEN(str, len); return; } @@ -2366,7 +2505,7 @@ str_make_independent_expand(VALUE str, long len, long expand, const int termlen) FL_UNSET(str, STR_SHARED|STR_NOFREE); TERM_FILL(ptr + len, termlen); RSTRING(str)->as.heap.ptr = ptr; - RSTRING(str)->as.heap.len = len; + STR_SET_LEN(str, len); RSTRING(str)->as.heap.aux.capa = capa; } @@ -2418,7 +2557,7 @@ str_discard(VALUE str) if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) { ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str)); RSTRING(str)->as.heap.ptr = 0; - RSTRING(str)->as.heap.len = 0; + STR_SET_LEN(str, 0); } } @@ -2495,7 +2634,7 @@ rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int terml long capa = str_capacity(str, oldtermlen) + oldtermlen; long len = RSTRING_LEN(str); - assert(capa >= len); + RUBY_ASSERT(capa >= len); if (capa - len < termlen) { rb_check_lockedtmp(str); str_make_independent_expand(str, len, 0L, termlen); @@ -2507,7 +2646,7 @@ rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int terml else { if (!STR_EMBED_P(str)) { /* modify capa instead of realloc */ - assert(!FL_TEST((str), STR_SHARED)); + RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); RSTRING(str)->as.heap.aux.capa = capa - termlen; } if (termlen > oldtermlen) { @@ -2584,14 +2723,14 @@ rb_check_string_type(VALUE 
str) * call-seq: * String.try_convert(object) -> object, new_string, or nil * - * If +object+ is a \String object, returns +object+. + * If +object+ is a +String+ object, returns +object+. * * Otherwise if +object+ responds to <tt>:to_str</tt>, * calls <tt>object.to_str</tt> and returns the result. * * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>. * - * Raises an exception unless <tt>object.to_str</tt> returns a \String object. + * Raises an exception unless <tt>object.to_str</tt> returns a +String+ object. */ static VALUE rb_str_s_try_convert(VALUE dummy, VALUE str) @@ -2740,19 +2879,34 @@ str_subseq(VALUE str, long beg, long len) { VALUE str2; - const long rstring_embed_capa_max = ((sizeof(struct RString) - offsetof(struct RString, as.embed.ary)) / sizeof(char)) - 1; + RUBY_ASSERT(beg >= 0); + RUBY_ASSERT(len >= 0); + RUBY_ASSERT(beg+len <= RSTRING_LEN(str)); - if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str)) || - len <= rstring_embed_capa_max) { + const int termlen = TERM_LEN(str); + if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) { str2 = rb_str_new(RSTRING_PTR(str) + beg, len); RB_GC_GUARD(str); + return str2; + } + + str2 = str_alloc_heap(rb_cString); + if (str_embed_capa(str2) >= len + termlen) { + char *ptr2 = RSTRING(str2)->as.embed.ary; + STR_SET_EMBED(str2); + memcpy(ptr2, RSTRING_PTR(str) + beg, len); + TERM_FILL(ptr2+len, termlen); + + STR_SET_LEN(str2, len); + RB_GC_GUARD(str); } else { - str2 = str_new_shared(rb_cString, str); + str_replace_shared(str2, str); + RUBY_ASSERT(!STR_EMBED_P(str2)); ENC_CODERANGE_CLEAR(str2); RSTRING(str2)->as.heap.ptr += beg; - if (RSTRING(str2)->as.heap.len > len) { - RSTRING(str2)->as.heap.len = len; + if (RSTRING_LEN(str2) > len) { + STR_SET_LEN(str2, len); } } @@ -2879,12 +3033,15 @@ str_substr(VALUE str, long beg, long len, int empty) VALUE rb_str_freeze(VALUE str) { + if (CHILLED_STRING_P(str)) { + FL_UNSET_RAW(str, STR_CHILLED); + } + if (OBJ_FROZEN(str)) return str; rb_str_resize(str, 
RSTRING_LEN(str)); return rb_obj_freeze(str); } - /* * call-seq: * +string -> new_string or self @@ -2911,7 +3068,7 @@ str_uplus(VALUE str) * * Returns a frozen, possibly pre-existing copy of the string. * - * The returned \String will be deduplicated as long as it does not have + * The returned +String+ will be deduplicated as long as it does not have * any instance variables set on it and is not a String subclass. * * Note that <tt>-string</tt> variant is more convenient for defining @@ -2957,7 +3114,7 @@ rb_str_unlocktmp(VALUE str) return str; } -RUBY_FUNC_EXPORTED VALUE +VALUE rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg) { rb_str_locktmp(str); @@ -2977,6 +3134,33 @@ rb_str_set_len(VALUE str, long len) if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) { rb_bug("probable buffer overflow: %ld for %ld", len, capa); } + + int cr = ENC_CODERANGE(str); + if (cr == ENC_CODERANGE_UNKNOWN) { + /* Leave unknown. */ + } + else if (len > RSTRING_LEN(str)) { + if (ENC_CODERANGE_CLEAN_P(cr)) { + /* Update the coderange regarding the extended part. */ + const char *const prev_end = RSTRING_END(str); + const char *const new_end = RSTRING_PTR(str) + len; + rb_encoding *enc = rb_enc_get(str); + rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr); + ENC_CODERANGE_SET(str, cr); + } + else if (cr == ENC_CODERANGE_BROKEN) { + /* May be valid now, by appended part. */ + ENC_CODERANGE_SET(str, ENC_CODERANGE_UNKNOWN); + } + } + else if (len < RSTRING_LEN(str)) { + if (cr != ENC_CODERANGE_7BIT) { + /* ASCII-only string is keeping after truncated. Valid + * and broken may be invalid or valid, leave unknown. 
*/ + ENC_CODERANGE_SET(str, ENC_CODERANGE_UNKNOWN); + } + } + STR_SET_LEN(str, len); TERM_FILL(&RSTRING_PTR(str)[len], termlen); } @@ -3001,7 +3185,7 @@ rb_str_resize(VALUE str, long len) if (STR_EMBED_P(str)) { if (len == slen) return str; if (str_embed_capa(str) >= len + termlen) { - STR_SET_EMBED_LEN(str, len); + STR_SET_LEN(str, len); TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen); return str; } @@ -3013,7 +3197,7 @@ rb_str_resize(VALUE str, long len) if (slen > len) slen = len; if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen); TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen); - STR_SET_EMBED_LEN(str, len); + STR_SET_LEN(str, len); if (independent) ruby_xfree(ptr); return str; } @@ -3028,7 +3212,7 @@ rb_str_resize(VALUE str, long len) RSTRING(str)->as.heap.aux.capa = len; } else if (len == slen) return str; - RSTRING(str)->as.heap.len = len; + STR_SET_LEN(str, len); TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */ } return str; @@ -3190,7 +3374,7 @@ rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len, incompatible: rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s", - rb_enc_name(str_enc), rb_enc_name(ptr_enc)); + rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc)); UNREACHABLE_RETURN(Qundef); } @@ -3235,6 +3419,7 @@ rb_str_buf_append(VALUE str, VALUE str2) case ENC_CODERANGE_7BIT: // If RHS is 7bit we can do simple concatenation str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true); + RB_GC_GUARD(str2); return str; case ENC_CODERANGE_VALID: // If RHS is valid, we can do simple concatenation if encodings are the same @@ -3244,6 +3429,7 @@ rb_str_buf_append(VALUE str, VALUE str2) if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) { ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr)); } + RB_GC_GUARD(str2); return str; } } @@ -3301,7 +3487,7 @@ rb_str_concat_literals(size_t num, const VALUE *strary) * s.concat('bar', 'baz') # => "foobarbaz" * s # => "foobarbaz" * - * 
For each given object +object+ that is an \Integer, + * For each given object +object+ that is an Integer, * the value is considered a codepoint and converted to a character before concatenation: * * s = 'foo' @@ -3340,7 +3526,7 @@ rb_str_concat_multi(int argc, VALUE *argv, VALUE str) * s << 'bar' # => "foobar" * s # => "foobar" * - * If +object+ is an \Integer, + * If +object+ is an Integer, * the value is considered a codepoint and converted to a character before concatenation: * * s = 'foo' @@ -3401,8 +3587,12 @@ rb_str_concat(VALUE str1, VALUE str2) } rb_str_resize(str1, pos+len); memcpy(RSTRING_PTR(str1) + pos, buf, len); - if (cr == ENC_CODERANGE_7BIT && code > 127) + if (cr == ENC_CODERANGE_7BIT && code > 127) { cr = ENC_CODERANGE_VALID; + } + else if (cr == ENC_CODERANGE_BROKEN) { + cr = ENC_CODERANGE_UNKNOWN; + } ENC_CODERANGE_SET(str1, cr); } return str1; @@ -3465,11 +3655,12 @@ rb_str_prepend_multi(int argc, VALUE *argv, VALUE str) st_index_t rb_str_hash(VALUE str) { - int e = ENCODING_GET(str); - if (e && is_ascii_string(str)) { - e = 0; + st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)); + int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0; + if (e && !is_ascii_string(str)) { + h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e)); } - return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e; + return h; } int @@ -3570,7 +3761,7 @@ rb_str_cmp(VALUE str1, VALUE str2) * Returns +false+ if the two strings' encodings are not compatible: * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false * - * If +object+ is not an instance of \String but responds to +to_str+, then the + * If +object+ is not an instance of +String+ but responds to +to_str+, then the * two strings are compared using <code>object.==</code>. 
*/ @@ -3815,7 +4006,9 @@ strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len, return pos + offset; } +/* found index in byte */ #define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0) +#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1) static long rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte) @@ -3869,34 +4062,28 @@ rb_str_index_m(int argc, VALUE *argv, VALUE str) { VALUE sub; VALUE initpos; + rb_encoding *enc = STR_ENC_GET(str); long pos; if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) { + long slen = str_strlen(str, enc); /* str's enc */ pos = NUM2LONG(initpos); - } - else { - pos = 0; - } - if (pos < 0) { - pos += str_strlen(str, NULL); - if (pos < 0) { + if (pos < 0 ? (pos += slen) < 0 : pos > slen) { if (RB_TYPE_P(sub, T_REGEXP)) { rb_backref_set(Qnil); } return Qnil; } } + else { + pos = 0; + } if (RB_TYPE_P(sub, T_REGEXP)) { - if (pos > str_strlen(str, NULL)) - return Qnil; pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos, - rb_enc_check(str, sub), single_byte_optimizable(str)); + enc, single_byte_optimizable(str)); - if (rb_reg_search(sub, str, pos, 0) < 0) { - return Qnil; - } - else { + if (rb_reg_search(sub, str, pos, 0) >= 0) { VALUE match = rb_backref_get(); struct re_registers *regs = RMATCH_REGS(match); pos = rb_str_sublen(str, BEG(0)); @@ -3906,25 +4093,28 @@ rb_str_index_m(int argc, VALUE *argv, VALUE str) else { StringValue(sub); pos = rb_str_index(str, sub, pos); - pos = rb_str_sublen(str, pos); + if (pos >= 0) { + pos = rb_str_sublen(str, pos); + return LONG2NUM(pos); + } } - - if (pos == -1) return Qnil; - return LONG2NUM(pos); + return Qnil; } -/* whether given pos is valid character boundary or not +/* Ensure that the given pos is a valid character boundary. * Note that in this function, "character" means a code point * (Unicode scalar value), not a grapheme cluster. 
*/ -static bool -str_check_byte_pos(VALUE str, long pos) +static void +str_ensure_byte_pos(VALUE str, long pos) { const char *s = RSTRING_PTR(str); const char *e = RSTRING_END(str); const char *p = s + pos; - const char *pp = rb_enc_left_char_head(s, p, e, rb_enc_get(str)); - return p == pp; + if (!at_char_boundary(s, p, e, rb_enc_get(str))) { + rb_raise(rb_eIndexError, + "offset %ld does not land on character boundary", pos); + } } /* @@ -3932,7 +4122,7 @@ str_check_byte_pos(VALUE str, long pos) * byteindex(substring, offset = 0) -> integer or nil * byteindex(regexp, offset = 0) -> integer or nil * - * Returns the \Integer byte-based index of the first occurrence of the given +substring+, + * Returns the Integer byte-based index of the first occurrence of the given +substring+, * or +nil+ if none found: * * 'foo'.byteindex('f') # => 0 @@ -3940,7 +4130,7 @@ str_check_byte_pos(VALUE str, long pos) * 'foo'.byteindex('oo') # => 1 * 'foo'.byteindex('ooo') # => nil * - * Returns the \Integer byte-based index of the first match for the given \Regexp +regexp+, + * Returns the Integer byte-based index of the first match for the given Regexp +regexp+, * or +nil+ if none found: * * 'foo'.byteindex(/f/) # => 0 @@ -3948,7 +4138,7 @@ str_check_byte_pos(VALUE str, long pos) * 'foo'.byteindex(/oo/) # => 1 * 'foo'.byteindex(/ooo/) # => nil * - * \Integer argument +offset+, if given, specifies the byte-based position in the + * Integer argument +offset+, if given, specifies the byte-based position in the * string to begin the search: * * 'foo'.byteindex('o', 1) # => 1 @@ -3976,33 +4166,23 @@ rb_str_byteindex_m(int argc, VALUE *argv, VALUE str) long pos; if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) { + long slen = RSTRING_LEN(str); pos = NUM2LONG(initpos); - } - else { - pos = 0; - } - if (pos < 0) { - pos += RSTRING_LEN(str); - if (pos < 0) { + if (pos < 0 ? 
(pos += slen) < 0 : pos > slen) { if (RB_TYPE_P(sub, T_REGEXP)) { rb_backref_set(Qnil); } return Qnil; } } - - if (!str_check_byte_pos(str, pos)) { - rb_raise(rb_eIndexError, - "offset %ld does not land on character boundary", pos); + else { + pos = 0; } + str_ensure_byte_pos(str, pos); + if (RB_TYPE_P(sub, T_REGEXP)) { - if (pos > RSTRING_LEN(str)) - return Qnil; - if (rb_reg_search(sub, str, pos, 0) < 0) { - return Qnil; - } - else { + if (rb_reg_search(sub, str, pos, 0) >= 0) { VALUE match = rb_backref_get(); struct re_registers *regs = RMATCH_REGS(match); pos = BEG(0); @@ -4011,11 +4191,10 @@ rb_str_byteindex_m(int argc, VALUE *argv, VALUE str) } else { StringValue(sub); - pos = rb_strseq_index(str, sub, pos, 1); + pos = rb_str_byteindex(str, sub, pos); + if (pos >= 0) return LONG2NUM(pos); } - - if (pos == -1) return Qnil; - return LONG2NUM(pos); + return Qnil; } #ifdef HAVE_MEMRCHR @@ -4074,6 +4253,7 @@ str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc) } #endif +/* found index in byte */ static long rb_str_rindex(VALUE str, VALUE sub, long pos) { @@ -4103,7 +4283,7 @@ rb_str_rindex(VALUE str, VALUE sub, long pos) } s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte); - return rb_str_sublen(str, str_rindex(str, sub, s, enc)); + return str_rindex(str, sub, s, enc); } /* @@ -4111,7 +4291,7 @@ rb_str_rindex(VALUE str, VALUE sub, long pos) * rindex(substring, offset = self.length) -> integer or nil * rindex(regexp, offset = self.length) -> integer or nil * - * Returns the \Integer index of the _last_ occurrence of the given +substring+, + * Returns the Integer index of the _last_ occurrence of the given +substring+, * or +nil+ if none found: * * 'foo'.rindex('f') # => 0 @@ -4119,7 +4299,7 @@ rb_str_rindex(VALUE str, VALUE sub, long pos) * 'foo'.rindex('oo') # => 1 * 'foo'.rindex('ooo') # => nil * - * Returns the \Integer index of the _last_ match for the given \Regexp +regexp+, + * Returns the Integer index of the _last_ match for the given 
Regexp +regexp+, * or +nil+ if none found: * * 'foo'.rindex(/f/) # => 0 @@ -4144,7 +4324,7 @@ rb_str_rindex(VALUE str, VALUE sub, long pos) * 'foo'.index(/o+(?!.*o)/) # => 1 * $~ #=> #<MatchData "oo"> * - * \Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the + * Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the * string to _end_ the search: * * 'foo'.rindex('o', 0) # => nil @@ -4152,7 +4332,7 @@ rb_str_rindex(VALUE str, VALUE sub, long pos) * 'foo'.rindex('o', 2) # => 2 * 'foo'.rindex('o', 3) # => 2 * - * If +offset+ is a negative \Integer, the maximum starting position in the + * If +offset+ is a negative Integer, the maximum starting position in the * string to _end_ the search is the sum of the string's length and +offset+: * * 'foo'.rindex('o', -1) # => 2 @@ -4167,20 +4347,17 @@ static VALUE rb_str_rindex_m(int argc, VALUE *argv, VALUE str) { VALUE sub; - VALUE vpos; + VALUE initpos; rb_encoding *enc = STR_ENC_GET(str); long pos, len = str_strlen(str, enc); /* str's enc */ - if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) { - pos = NUM2LONG(vpos); - if (pos < 0) { - pos += len; - if (pos < 0) { - if (RB_TYPE_P(sub, T_REGEXP)) { - rb_backref_set(Qnil); - } - return Qnil; + if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) { + pos = NUM2LONG(initpos); + if (pos < 0 && (pos += len) < 0) { + if (RB_TYPE_P(sub, T_REGEXP)) { + rb_backref_set(Qnil); } + return Qnil; } if (pos > len) pos = len; } @@ -4189,7 +4366,7 @@ rb_str_rindex_m(int argc, VALUE *argv, VALUE str) } if (RB_TYPE_P(sub, T_REGEXP)) { - /* enc = rb_get_check(str, sub); */ + /* enc = rb_enc_check(str, sub); */ pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos, enc, single_byte_optimizable(str)); @@ -4203,7 +4380,10 @@ rb_str_rindex_m(int argc, VALUE *argv, VALUE str) else { StringValue(sub); pos = rb_str_rindex(str, sub, pos); - if (pos >= 0) return LONG2NUM(pos); + if (pos >= 0) { + 
pos = rb_str_sublen(str, pos); + return LONG2NUM(pos); + } } return Qnil; } @@ -4244,7 +4424,7 @@ rb_str_byterindex(VALUE str, VALUE sub, long pos) * byterindex(substring, offset = self.bytesize) -> integer or nil * byterindex(regexp, offset = self.bytesize) -> integer or nil * - * Returns the \Integer byte-based index of the _last_ occurrence of the given +substring+, + * Returns the Integer byte-based index of the _last_ occurrence of the given +substring+, * or +nil+ if none found: * * 'foo'.byterindex('f') # => 0 @@ -4252,7 +4432,7 @@ rb_str_byterindex(VALUE str, VALUE sub, long pos) * 'foo'.byterindex('oo') # => 1 * 'foo'.byterindex('ooo') # => nil * - * Returns the \Integer byte-based index of the _last_ match for the given \Regexp +regexp+, + * Returns the Integer byte-based index of the _last_ match for the given Regexp +regexp+, * or +nil+ if none found: * * 'foo'.byterindex(/f/) # => 0 @@ -4277,7 +4457,7 @@ rb_str_byterindex(VALUE str, VALUE sub, long pos) * 'foo'.byteindex(/o+(?!.*o)/) # => 1 * $~ #=> #<MatchData "oo"> * - * \Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the + * Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the * string to _end_ the search: * * 'foo'.byterindex('o', 0) # => nil @@ -4285,7 +4465,7 @@ rb_str_byterindex(VALUE str, VALUE sub, long pos) * 'foo'.byterindex('o', 2) # => 2 * 'foo'.byterindex('o', 3) # => 2 * - * If +offset+ is a negative \Integer, the maximum starting position in the + * If +offset+ is a negative Integer, the maximum starting position in the * string to _end_ the search is the sum of the string's length and +offset+: * * 'foo'.byterindex('o', -1) # => 2 @@ -4303,19 +4483,16 @@ static VALUE rb_str_byterindex_m(int argc, VALUE *argv, VALUE str) { VALUE sub; - VALUE vpos; + VALUE initpos; long pos, len = RSTRING_LEN(str); - if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) { - pos = 
NUM2LONG(vpos); - if (pos < 0) { - pos += len; - if (pos < 0) { - if (RB_TYPE_P(sub, T_REGEXP)) { - rb_backref_set(Qnil); - } - return Qnil; + if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) { + pos = NUM2LONG(initpos); + if (pos < 0 && (pos += len) < 0) { + if (RB_TYPE_P(sub, T_REGEXP)) { + rb_backref_set(Qnil); } + return Qnil; } if (pos > len) pos = len; } @@ -4323,10 +4500,7 @@ rb_str_byterindex_m(int argc, VALUE *argv, VALUE str) pos = len; } - if (!str_check_byte_pos(str, pos)) { - rb_raise(rb_eIndexError, - "offset %ld does not land on character boundary", pos); - } + str_ensure_byte_pos(str, pos); if (RB_TYPE_P(sub, T_REGEXP)) { if (rb_reg_search(sub, str, pos, 1) >= 0) { @@ -4349,16 +4523,16 @@ rb_str_byterindex_m(int argc, VALUE *argv, VALUE str) * string =~ regexp -> integer or nil * string =~ object -> integer or nil * - * Returns the \Integer index of the first substring that matches + * Returns the Integer index of the first substring that matches * the given +regexp+, or +nil+ if no match found: * * 'foo' =~ /f/ # => 0 * 'foo' =~ /o/ # => 1 * 'foo' =~ /x/ # => nil * - * Note: also updates Regexp@Special+global+variables. + * Note: also updates Regexp@Global+Variables. * - * If the given +object+ is not a \Regexp, returns the value + * If the given +object+ is not a Regexp, returns the value * returned by <tt>object =~ self</tt>. * * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt> @@ -4396,13 +4570,13 @@ static VALUE get_pat(VALUE); * match(pattern, offset = 0) -> matchdata or nil * match(pattern, offset = 0) {|matchdata| ... } -> object * - * Returns a \MatchData object (or +nil+) based on +self+ and the given +pattern+. + * Returns a MatchData object (or +nil+) based on +self+ and the given +pattern+. * - * Note: also updates Regexp@Special+global+variables. + * Note: also updates Regexp@Global+Variables. * - * - Computes +regexp+ by converting +pattern+ (if not already a \Regexp). 
+ * - Computes +regexp+ by converting +pattern+ (if not already a Regexp). * regexp = Regexp.new(pattern) - * - Computes +matchdata+, which will be either a \MatchData object or +nil+ + * - Computes +matchdata+, which will be either a MatchData object or +nil+ * (see Regexp#match): * matchdata = <tt>regexp.match(self) * @@ -4412,7 +4586,7 @@ static VALUE get_pat(VALUE); * 'foo'.match('o') # => #<MatchData "o"> * 'foo'.match('x') # => nil * - * If \Integer argument +offset+ is given, the search begins at index +offset+: + * If Integer argument +offset+ is given, the search begins at index +offset+: * * 'foo'.match('f', 1) # => nil * 'foo'.match('o', 1) # => #<MatchData "o"> @@ -4447,19 +4621,19 @@ rb_str_match_m(int argc, VALUE *argv, VALUE str) * * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+. * - * Note: does not update Regexp@Special+global+variables. + * Note: does not update Regexp@Global+Variables. * - * Computes +regexp+ by converting +pattern+ (if not already a \Regexp). + * Computes +regexp+ by converting +pattern+ (if not already a Regexp). 
* regexp = Regexp.new(pattern) * - * Returns +true+ if <tt>self+.match(regexp)</tt> returns a \MatchData object, + * Returns +true+ if <tt>self+.match(regexp)</tt> returns a MatchData object, * +false+ otherwise: * * 'foo'.match?(/o/) # => true * 'foo'.match?('o') # => true * 'foo'.match?(/x/) # => false * - * If \Integer argument +offset+ is given, the search begins at index +offset+: + * If Integer argument +offset+ is given, the search begins at index +offset+: * 'foo'.match?('f', 1) # => false * 'foo'.match?('o', 1) # => true * @@ -4711,7 +4885,7 @@ static VALUE str_succ(VALUE str); * s = '99zz99zz' * s.succ # => "100aa00aa" * - * The successor to an empty \String is a new empty \String: + * The successor to an empty +String+ is a new empty +String+: * * ''.succ # => "" * @@ -4851,7 +5025,7 @@ str_upto_i(VALUE str, VALUE arg) * upto(other_string, exclusive = false) {|string| ... } -> self * upto(other_string, exclusive = false) -> new_enumerator * - * With a block given, calls the block with each \String value + * With a block given, calls the block with each +String+ value * returned by successive calls to String#succ; * the first value is +self+, the next is <tt>self.succ</tt>, and so on; * the sequence terminates when value +other_string+ is reached; @@ -4875,7 +5049,7 @@ str_upto_i(VALUE str, VALUE arg) * '25'.upto('5') {|s| fail s } * 'aa'.upto('a') {|s| fail s } * - * With no block given, returns a new \Enumerator: + * With no block given, returns a new Enumerator: * * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")> * @@ -5158,7 +5332,6 @@ rb_str_drop_bytes(VALUE str, long len) char *oldptr = ptr; int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE)); STR_SET_EMBED(str); - STR_SET_EMBED_LEN(str, nlen); ptr = RSTRING(str)->as.embed.ary; memmove(ptr, oldptr + len, nlen); if (fl == STR_NOEMBED) xfree(oldptr); @@ -5170,9 +5343,12 @@ rb_str_drop_bytes(VALUE str, long len) OBJ_FREEZE(shared); } ptr = RSTRING(str)->as.heap.ptr += len; - 
RSTRING(str)->as.heap.len = nlen; } - ptr[nlen] = 0; + STR_SET_LEN(str, nlen); + + if (!SHARABLE_MIDDLE_SUBSTRING) { + TERM_FILL(ptr + nlen, TERM_LEN(str)); + } ENC_CODERANGE_CLEAR(str); return str; } @@ -5246,8 +5422,9 @@ rb_str_update(VALUE str, long beg, long len, VALUE val) if (beg < 0) { beg += slen; } - assert(beg >= 0); - assert(beg <= slen); + RUBY_ASSERT(beg >= 0); + RUBY_ASSERT(beg <= slen); + if (len > slen - beg) { len = slen - beg; } @@ -5385,11 +5562,11 @@ rb_str_aset_m(int argc, VALUE *argv, VALUE str) * * Inserts the given +other_string+ into +self+; returns +self+. * - * If the \Integer +index+ is positive, inserts +other_string+ at offset +index+: + * If the Integer +index+ is positive, inserts +other_string+ at offset +index+: * * 'foo'.insert(1, 'bar') # => "fbaroo" * - * If the \Integer +index+ is negative, counts backward from the end of +self+ + * If the Integer +index+ is negative, counts backward from the end of +self+ * and inserts +other_string+ at offset <tt>index+1</tt> * (that is, _after_ <tt>self[index]</tt>): * @@ -5574,7 +5751,7 @@ static long rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str) { if (BUILTIN_TYPE(pat) == T_STRING) { - pos = rb_strseq_index(str, pat, pos, 1); + pos = rb_str_byteindex(str, pat, pos); if (set_backref_str) { if (pos >= 0) { str = rb_str_new_frozen_String(str); @@ -5677,8 +5854,8 @@ rb_str_sub_bang(int argc, VALUE *argv, VALUE str) if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT || coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) { rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s", - rb_enc_name(str_enc), - rb_enc_name(STR_ENC_GET(repl))); + rb_enc_inspect_name(str_enc), + rb_enc_inspect_name(STR_ENC_GET(repl))); } enc = STR_ENC_GET(repl); } @@ -5709,6 +5886,8 @@ rb_str_sub_bang(int argc, VALUE *argv, VALUE str) TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str)); ENC_CODERANGE_SET(str, cr); + RB_GC_GUARD(match); + return str; } return Qnil; 
@@ -5740,8 +5919,7 @@ rb_str_sub(int argc, VALUE *argv, VALUE str) static VALUE str_gsub(int argc, VALUE *argv, VALUE str, int bang) { - VALUE pat, val = Qnil, repl, match, match0 = Qnil, dest, hash = Qnil; - struct re_registers *regs; + VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil; long beg, beg0, end0; long offset, blen, slen, len, last; enum {STR, ITER, MAP} mode = STR; @@ -5786,8 +5964,8 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang) ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID); do { - match = rb_backref_get(); - regs = RMATCH_REGS(match); + VALUE match = rb_backref_get(); + struct re_registers *regs = RMATCH_REGS(match); if (RB_TYPE_P(pat, T_STRING)) { beg0 = beg; end0 = beg0 + RSTRING_LEN(pat); @@ -5844,6 +6022,8 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang) cp = RSTRING_PTR(str) + offset; if (offset > RSTRING_LEN(str)) break; beg = rb_pat_search(pat, str, offset, need_backref); + + RB_GC_GUARD(match); } while (beg >= 0); if (RSTRING_LEN(str) > offset) { rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc); @@ -5946,7 +6126,7 @@ rb_str_clear(VALUE str) { str_discard(str); STR_SET_EMBED(str); - STR_SET_EMBED_LEN(str, 0); + STR_SET_LEN(str, 0); RSTRING_PTR(str)[0] = 0; if (rb_enc_asciicompat(STR_ENC_GET(str))) ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT); @@ -6010,7 +6190,7 @@ rb_str_getbyte(VALUE str, VALUE index) * * Related: String#getbyte. 
*/ -static VALUE +VALUE rb_str_setbyte(VALUE str, VALUE index, VALUE value) { long pos = NUM2LONG(index); @@ -6106,6 +6286,12 @@ str_byte_substr(VALUE str, long beg, long len, int empty) return str2; } +VALUE +rb_str_byte_substr(VALUE str, VALUE beg, VALUE len) +{ + return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE); +} + static VALUE str_byte_aref(VALUE str, VALUE indx) { @@ -6198,20 +6384,15 @@ str_check_beg_len(VALUE str, long *beg, long *len) if (*beg < 0) { *beg += slen; } - assert(*beg >= 0); - assert(*beg <= slen); + RUBY_ASSERT(*beg >= 0); + RUBY_ASSERT(*beg <= slen); + if (*len > slen - *beg) { *len = slen - *beg; } end = *beg + *len; - if (!str_check_byte_pos(str, *beg)) { - rb_raise(rb_eIndexError, - "offset %ld does not land on character boundary", *beg); - } - if (!str_check_byte_pos(str, end)) { - rb_raise(rb_eIndexError, - "offset %ld does not land on character boundary", end); - } + str_ensure_byte_pos(str, *beg); + str_ensure_byte_pos(str, end); } /* @@ -6397,7 +6578,7 @@ rb_str_reverse_bang(VALUE str) /* * call-seq: - * include? other_string -> true or false + * include?(other_string) -> true or false * * Returns +true+ if +self+ contains +other_string+, +false+ otherwise: * @@ -6471,7 +6652,7 @@ rb_str_to_i(int argc, VALUE *argv, VALUE str) * Returns the result of interpreting leading characters in +self+ as a Float: * * '3.14159'.to_f # => 3.14159 - '1.234e-2'.to_f # => 0.01234 + * '1.234e-2'.to_f # => 0.01234 * * Characters past a leading valid number (in the given +base+) are ignored: * @@ -6494,8 +6675,8 @@ rb_str_to_f(VALUE str) * call-seq: * to_s -> self or string * - * Returns +self+ if +self+ is a \String, - * or +self+ converted to a \String if +self+ is a subclass of \String. + * Returns +self+ if +self+ is a +String+, + * or +self+ converted to a +String+ if +self+ is a subclass of +String+. 
*/ static VALUE @@ -7130,6 +7311,8 @@ str_undump(VALUE str) } } + RB_GC_GUARD(str); + return undumped; invalid_format: rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form"); @@ -7744,8 +7927,10 @@ trnext(struct tr *t, rb_encoding *enc) } continue; /* not reached */ } - t->gen = 1; - t->max = c; + else if (t->now < c) { + t->gen = 1; + t->max = c; + } } } return t->now; @@ -7875,7 +8060,14 @@ tr_trans(VALUE str, VALUE src, VALUE repl, int sflag) while (s < send) { int may_modify = 0; - c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1); + int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1); + if (!MBCLEN_CHARFOUND_P(r)) { + xfree(buf); + rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1)); + } + clen = MBCLEN_CHARFOUND_LEN(r); + c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1); + tlen = enc == e1 ? clen : rb_enc_codelen(c, enc); s += clen; @@ -7926,7 +8118,7 @@ tr_trans(VALUE str, VALUE src, VALUE repl, int sflag) } TERM_FILL((char *)t, termlen); RSTRING(str)->as.heap.ptr = (char *)buf; - RSTRING(str)->as.heap.len = t - buf; + STR_SET_LEN(str, t - buf); STR_SET_NOEMBED(str); RSTRING(str)->as.heap.aux.capa = max; } @@ -7955,7 +8147,15 @@ tr_trans(VALUE str, VALUE src, VALUE repl, int sflag) while (s < send) { int may_modify = 0; - c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1); + + int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1); + if (!MBCLEN_CHARFOUND_P(r)) { + xfree(buf); + rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1)); + } + clen = MBCLEN_CHARFOUND_LEN(r); + c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1); + tlen = enc == e1 ? 
clen : rb_enc_codelen(c, enc); if (c < 256) { @@ -8002,7 +8202,7 @@ tr_trans(VALUE str, VALUE src, VALUE repl, int sflag) } TERM_FILL((char *)t, termlen); RSTRING(str)->as.heap.ptr = (char *)buf; - RSTRING(str)->as.heap.len = t - buf; + STR_SET_LEN(str, t - buf); STR_SET_NOEMBED(str); RSTRING(str)->as.heap.aux.capa = max; } @@ -8670,7 +8870,6 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) #define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count)) - if (result) result = rb_ary_new(); beg = 0; char *ptr = RSTRING_PTR(str); char *eptr = RSTRING_END(str); @@ -8679,6 +8878,7 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) int skip = 1; unsigned int c; + if (result) result = rb_ary_new(); end = beg; if (is_ascii_string(str)) { while (ptr < eptr) { @@ -8738,6 +8938,7 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) char *sptr = RSTRING_PTR(spat); long slen = RSTRING_LEN(spat); + if (result) result = rb_ary_new(); mustnot_broken(str); enc = rb_enc_check(str, spat); while (ptr < eptr && @@ -8759,6 +8960,7 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) char *str_start = ptr; int n; + if (result) result = rb_ary_new_capa(RSTRING_LEN(str)); mustnot_broken(str); enc = rb_enc_get(str); while (ptr < eptr && @@ -8770,6 +8972,7 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) beg = ptr - str_start; } else { + if (result) result = rb_ary_new(); long len = RSTRING_LEN(str); long start = beg; long idx; @@ -9233,56 +9436,65 @@ static regex_t * get_reg_grapheme_cluster(rb_encoding *enc) { int encidx = rb_enc_to_index(enc); - regex_t *reg_grapheme_cluster = NULL; - static regex_t *reg_grapheme_cluster_utf8 = NULL; - /* synchronize */ - if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) { - reg_grapheme_cluster = reg_grapheme_cluster_utf8; - } - if (!reg_grapheme_cluster) { - const OnigUChar source_ascii[] = "\\X"; - OnigErrorInfo einfo; - const OnigUChar *source = source_ascii; - size_t source_len = sizeof(source_ascii) - 
1; - switch (encidx) { + const OnigUChar source_ascii[] = "\\X"; + const OnigUChar *source = source_ascii; + size_t source_len = sizeof(source_ascii) - 1; + + switch (encidx) { #define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x) #define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8) #define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x) #define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16) #define CASE_UTF(e) \ - case ENCINDEX_UTF_##e: { \ - static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \ - source = source_UTF_##e; \ - source_len = sizeof(source_UTF_##e); \ - break; \ - } - CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE); + case ENCINDEX_UTF_##e: { \ + static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \ + source = source_UTF_##e; \ + source_len = sizeof(source_UTF_##e); \ + break; \ + } + CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE); #undef CASE_UTF #undef CHARS_16BE #undef CHARS_16LE #undef CHARS_32BE #undef CHARS_32LE - } - int r = onig_new(&reg_grapheme_cluster, source, source + source_len, - ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo); - if (r) { - UChar message[ONIG_MAX_ERROR_MESSAGE_LEN]; - onig_error_code_to_str(message, r, &einfo); - rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message); - } - if (encidx == rb_utf8_encindex()) { - reg_grapheme_cluster_utf8 = reg_grapheme_cluster; - } } + + regex_t *reg_grapheme_cluster; + OnigErrorInfo einfo; + int r = onig_new(&reg_grapheme_cluster, source, source + source_len, + ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo); + if (r) { + UChar message[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str(message, r, &einfo); + rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message); + } + return reg_grapheme_cluster; } +static regex_t * +get_cached_reg_grapheme_cluster(rb_encoding *enc) +{ + int encidx = rb_enc_to_index(enc); + static regex_t *reg_grapheme_cluster_utf8 = 
NULL; + + if (encidx == rb_utf8_encindex()) { + if (!reg_grapheme_cluster_utf8) { + reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc); + } + + return reg_grapheme_cluster_utf8; + } + + return NULL; +} + static VALUE rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj) { size_t grapheme_cluster_count = 0; - regex_t *reg_grapheme_cluster = NULL; rb_encoding *enc = get_encoding(str); const char *ptr, *end; @@ -9290,7 +9502,13 @@ rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj) return rb_str_length(str); } - reg_grapheme_cluster = get_reg_grapheme_cluster(enc); + bool cached_reg_grapheme_cluster = true; + regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc); + if (!reg_grapheme_cluster) { + reg_grapheme_cluster = get_reg_grapheme_cluster(enc); + cached_reg_grapheme_cluster = false; + } + ptr = RSTRING_PTR(str); end = RSTRING_END(str); @@ -9303,6 +9521,10 @@ rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj) ptr += len; } + if (!cached_reg_grapheme_cluster) { + onig_free(reg_grapheme_cluster); + } + return SIZET2NUM(grapheme_cluster_count); } @@ -9310,7 +9532,6 @@ static VALUE rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary) { VALUE orig = str; - regex_t *reg_grapheme_cluster = NULL; rb_encoding *enc = get_encoding(str); const char *ptr0, *ptr, *end; @@ -9319,7 +9540,14 @@ rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary) } if (!ary) str = rb_str_new_frozen(str); - reg_grapheme_cluster = get_reg_grapheme_cluster(enc); + + bool cached_reg_grapheme_cluster = true; + regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc); + if (!reg_grapheme_cluster) { + reg_grapheme_cluster = get_reg_grapheme_cluster(enc); + cached_reg_grapheme_cluster = false; + } + ptr0 = ptr = RSTRING_PTR(str); end = RSTRING_END(str); @@ -9331,6 +9559,11 @@ rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary) ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len)); ptr += len; } + + if 
(!cached_reg_grapheme_cluster) { + onig_free(reg_grapheme_cluster); + } + RB_GC_GUARD(str); if (ary) return ary; @@ -9526,7 +9759,7 @@ chompped_length(VALUE str, VALUE rs) if (p[len-1] == newline && (rslen <= 1 || memcmp(rsptr, pp, rslen) == 0)) { - if (rb_enc_left_char_head(p, pp, e, enc) == pp) + if (at_char_boundary(p, pp, e, enc)) return len - rslen; RB_GC_GUARD(rs); } @@ -9581,7 +9814,7 @@ rb_str_chomp_bang(int argc, VALUE *argv, VALUE str) { VALUE rs; str_modifiable(str); - if (RSTRING_LEN(str) == 0) return Qnil; + if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil; rs = chomp_rs(argc, argv); if (NIL_P(rs)) return Qnil; return rb_str_chomp_string(str, rs); @@ -9848,11 +10081,11 @@ rb_str_strip(VALUE str) static VALUE scan_once(VALUE str, VALUE pat, long *start, int set_backref_str) { - VALUE result, match; - struct re_registers *regs; - int i; + VALUE result = Qnil; long end, pos = rb_pat_search(pat, str, *start, set_backref_str); if (pos >= 0) { + VALUE match; + struct re_registers *regs; if (BUILTIN_TYPE(pat) == T_STRING) { regs = NULL; end = pos + RSTRING_LEN(pat); @@ -9863,6 +10096,7 @@ scan_once(VALUE str, VALUE pat, long *start, int set_backref_str) pos = BEG(0); end = END(0); } + if (pos == end) { rb_encoding *enc = STR_ENC_GET(str); /* @@ -9877,22 +10111,27 @@ scan_once(VALUE str, VALUE pat, long *start, int set_backref_str) else { *start = end; } + if (!regs || regs->num_regs == 1) { result = rb_str_subseq(str, pos, end - pos); return result; } - result = rb_ary_new2(regs->num_regs); - for (i=1; i < regs->num_regs; i++) { - VALUE s = Qnil; - if (BEG(i) >= 0) { - s = rb_str_subseq(str, BEG(i), END(i)-BEG(i)); + else { + result = rb_ary_new2(regs->num_regs); + for (int i = 1; i < regs->num_regs; i++) { + VALUE s = Qnil; + if (BEG(i) >= 0) { + s = rb_str_subseq(str, BEG(i), END(i)-BEG(i)); + } + + rb_ary_push(result, s); } - rb_ary_push(result, s); } - return result; + RB_GC_GUARD(match); } - return Qnil; + + return result; } @@ -10437,7 +10676,6 @@ 
rb_str_rpartition(VALUE str, VALUE sep) if (pos < 0) { goto failed; } - pos = rb_str_offset(str, pos); } return rb_ary_new3(3, rb_str_subseq(str, 0, pos), @@ -10468,10 +10706,20 @@ rb_str_start_with(int argc, VALUE *argv, VALUE str) return Qtrue; } else { + const char *p, *s, *e; + long slen, tlen; + rb_encoding *enc; + StringValue(tmp); - rb_enc_check(str, tmp); - if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue; - if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0) + enc = rb_enc_check(str, tmp); + if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue; + if ((slen = RSTRING_LEN(str)) < tlen) continue; + p = RSTRING_PTR(str); + e = p + slen; + s = p + tlen; + if (!at_char_right_boundary(p, s, e, enc)) + continue; + if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0) return Qtrue; } } @@ -10490,12 +10738,13 @@ static VALUE rb_str_end_with(int argc, VALUE *argv, VALUE str) { int i; - char *p, *s, *e; - rb_encoding *enc; for (i=0; i<argc; i++) { VALUE tmp = argv[i]; + const char *p, *s, *e; long slen, tlen; + rb_encoding *enc; + StringValue(tmp); enc = rb_enc_check(str, tmp); if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue; @@ -10503,9 +10752,9 @@ rb_str_end_with(int argc, VALUE *argv, VALUE str) p = RSTRING_PTR(str); e = p + slen; s = e - tlen; - if (rb_enc_left_char_head(p, s, e, enc) != s) + if (!at_char_boundary(p, s, e, enc)) continue; - if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0) + if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0) return Qtrue; } return Qfalse; @@ -10523,12 +10772,17 @@ rb_str_end_with(int argc, VALUE *argv, VALUE str) static long deleted_prefix_length(VALUE str, VALUE prefix) { - char *strptr, *prefixptr; + const char *strptr, *prefixptr; long olen, prefixlen; + rb_encoding *enc = rb_enc_get(str); StringValue(prefix); - if (is_broken_string(prefix)) return 0; - rb_enc_check(str, prefix); + + if (!is_broken_string(prefix) || + !rb_enc_asciicompat(enc) || + !rb_enc_asciicompat(rb_enc_get(prefix))) { + enc = rb_enc_check(str, 
prefix); + } /* return 0 if not start with prefix */ prefixlen = RSTRING_LEN(prefix); @@ -10538,6 +10792,19 @@ deleted_prefix_length(VALUE str, VALUE prefix) strptr = RSTRING_PTR(str); prefixptr = RSTRING_PTR(prefix); if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0; + if (is_broken_string(prefix)) { + if (!is_broken_string(str)) { + /* prefix in a valid string cannot be broken */ + return 0; + } + const char *strend = strptr + olen; + const char *after_prefix = strptr + prefixlen; + if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) { + /* prefix does not end at char-boundary */ + return 0; + } + } + /* prefix part in `str` also should be valid. */ return prefixlen; } @@ -10594,7 +10861,7 @@ rb_str_delete_prefix(VALUE str, VALUE prefix) static long deleted_suffix_length(VALUE str, VALUE suffix) { - char *strptr, *suffixptr, *s; + const char *strptr, *suffixptr; long olen, suffixlen; rb_encoding *enc; @@ -10609,9 +10876,10 @@ deleted_suffix_length(VALUE str, VALUE suffix) if (olen < suffixlen) return 0; strptr = RSTRING_PTR(str); suffixptr = RSTRING_PTR(suffix); - s = strptr + olen - suffixlen; - if (memcmp(s, suffixptr, suffixlen) != 0) return 0; - if (rb_enc_left_char_head(strptr, s, strptr + olen, enc) != s) return 0; + const char *strend = strptr + olen; + const char *before_suffix = strend - suffixlen; + if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0; + if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0; return suffixlen; } @@ -10683,7 +10951,7 @@ rb_fs_setter(VALUE val, ID id, VALUE *var) rb_id2str(id)); } if (!NIL_P(val)) { - rb_warn_deprecated("`$;'", NULL); + rb_warn_deprecated("'$;'", NULL); } *var = val; } @@ -10701,7 +10969,23 @@ static VALUE rb_str_force_encoding(VALUE str, VALUE enc) { str_modifiable(str); - rb_enc_associate(str, rb_to_encoding(enc)); + + rb_encoding *encoding = rb_to_encoding(enc); + int idx = rb_enc_to_index(encoding); + + // If the encoding is unchanged, we do nothing. 
+ if (ENCODING_GET(str) == idx) { + return str; + } + + rb_enc_associate_index(str, idx); + + // If the coderange was 7bit and the new encoding is ASCII-compatible + // we can keep the coderange. + if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) { + return str; + } + ENC_CODERANGE_CLEAR(str); return str; } @@ -10718,11 +11002,11 @@ static VALUE rb_str_b(VALUE str) { VALUE str2; - if (FL_TEST(str, STR_NOEMBED)) { - str2 = str_alloc_heap(rb_cString); + if (STR_EMBED_P(str)) { + str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str)); } else { - str2 = str_alloc_embed(rb_cString, RSTRING_EMBED_LEN(str) + TERM_LEN(str)); + str2 = str_alloc_heap(rb_cString); } str_replace_shared_without_enc(str2, str); @@ -10836,7 +11120,7 @@ str_compat_and_valid(VALUE str, rb_encoding *enc) rb_encoding *e = STR_ENC_GET(str); if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) { rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s", - rb_enc_name(enc), rb_enc_name(e)); + rb_enc_inspect_name(enc), rb_enc_inspect_name(e)); } } return str; @@ -11245,17 +11529,17 @@ rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str) /********************************************************************** * Document-class: Symbol * - * Symbol objects represent named identifiers inside the Ruby interpreter. + * A +Symbol+ object represents a named identifier inside the Ruby interpreter. * - * You can create a \Symbol object explicitly with: + * You can create a +Symbol+ object explicitly with: * * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals]. * - * The same Symbol object will be + * The same +Symbol+ object will be * created for a given name or string for the duration of a program's * execution, regardless of the context or meaning of that name. 
Thus * if <code>Fred</code> is a constant in one context, a method in - * another, and a class in a third, the Symbol <code>:Fred</code> + * another, and a class in a third, the +Symbol+ <code>:Fred</code> * will be the same object in all three contexts. * * module One @@ -11298,18 +11582,18 @@ rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str) * local_variables * # => [:seven] * - * Symbol objects are different from String objects in that - * Symbol objects represent identifiers, while String objects - * represent text or data. + * A +Symbol+ object differs from a String object in that + * a +Symbol+ object represents an identifier, while a String object + * represents text or data. * * == What's Here * - * First, what's elsewhere. \Class \Symbol: + * First, what's elsewhere. \Class +Symbol+: * * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here]. * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here]. * - * Here, class \Symbol provides methods that are useful for: + * Here, class +Symbol+ provides methods that are useful for: * * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying] * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing] @@ -11468,26 +11752,25 @@ sym_inspect(VALUE sym) } else { rb_encoding *enc = STR_ENC_GET(str); - RSTRING_GETMEM(str, ptr, len); + VALUE orig_str = str; + + len = RSTRING_LEN(orig_str); str = rb_enc_str_new(0, len + 1, enc); + + // Get data pointer after allocation + ptr = RSTRING_PTR(orig_str); dest = RSTRING_PTR(str); memcpy(dest + 1, ptr, len); + + RB_GC_GUARD(orig_str); } dest[0] = ':'; + + RUBY_ASSERT_BUILTIN_TYPE(str, T_STRING); + return str; } -/* - * call-seq: - * to_s -> string - * - * Returns a string representation of +self+ (not including the leading colon): - * - * :foo.to_s # => "foo" - * - * Related: Symbol#inspect, Symbol#name. 
- */ - VALUE rb_sym_to_s(VALUE sym) { @@ -11859,7 +12142,7 @@ rb_interned_str_cstr(const char *ptr) VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc) { - if (UNLIKELY(rb_enc_autoload_p(enc))) { + if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) { rb_enc_autoload(enc); } @@ -11877,10 +12160,11 @@ void Init_String(void) { rb_cString = rb_define_class("String", rb_cObject); - assert(rb_vm_fstring_table()); + RUBY_ASSERT(rb_vm_fstring_table()); st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString); rb_include_module(rb_cString, rb_mComparable); rb_define_alloc_func(rb_cString, empty_str_alloc); + rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1); rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1); rb_define_method(rb_cString, "initialize", rb_str_init, -1); rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1); @@ -11925,6 +12209,7 @@ Init_String(void) rb_define_method(rb_cString, "freeze", rb_str_freeze, 0); rb_define_method(rb_cString, "+@", str_uplus, 0); rb_define_method(rb_cString, "-@", str_uminus, 0); + rb_define_method(rb_cString, "dup", rb_str_dup_m, 0); rb_define_alias(rb_cString, "dedup", "-@"); rb_define_method(rb_cString, "to_i", rb_str_to_i, -1); @@ -12052,8 +12337,6 @@ Init_String(void) rb_define_method(rb_cSymbol, "==", sym_equal, 1); rb_define_method(rb_cSymbol, "===", sym_equal, 1); rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0); - rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0); - rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0); rb_define_method(rb_cSymbol, "name", rb_sym2str, 0); /* in symbol.c */ rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */ rb_define_method(rb_cSymbol, "succ", sym_succ, 0); |