diff options
Diffstat (limited to 'string.c')
| -rw-r--r-- | string.c | 6868 |
1 files changed, 5036 insertions, 1832 deletions
@@ -11,17 +11,21 @@ **********************************************************************/ -#include "ruby/ruby.h" -#include "ruby/re.h" #include "ruby/encoding.h" -#include "node.h" -#include "eval_intern.h" +#include "ruby/re.h" #include "internal.h" -#include <assert.h> +#include "encindex.h" +#include "probes.h" +#include "gc.h" +#include "ruby_assert.h" +#include "id.h" +#include "debug_counter.h" +#include "ruby/util.h" #define BEG(no) (regs->beg[(no)]) #define END(no) (regs->end[(no)]) +#include <errno.h> #include <math.h> #include <ctype.h> @@ -29,50 +33,70 @@ #include <unistd.h> #endif -#define numberof(array) (int)(sizeof(array) / sizeof((array)[0])) +#if defined HAVE_CRYPT_R +# if defined HAVE_CRYPT_H +# include <crypt.h> +# endif +#elif !defined HAVE_CRYPT +# include "missing/crypt.h" +# define HAVE_CRYPT_R 1 +#endif +#define STRING_ENUMERATORS_WANTARRAY 0 /* next major */ + +#undef rb_str_new +#undef rb_usascii_str_new +#undef rb_utf8_str_new +#undef rb_enc_str_new #undef rb_str_new_cstr #undef rb_tainted_str_new_cstr #undef rb_usascii_str_new_cstr +#undef rb_utf8_str_new_cstr +#undef rb_enc_str_new_cstr #undef rb_external_str_new_cstr #undef rb_locale_str_new_cstr -#undef rb_str_new2 -#undef rb_str_new3 -#undef rb_str_new4 -#undef rb_str_new5 -#undef rb_tainted_str_new2 -#undef rb_usascii_str_new2 #undef rb_str_dup_frozen #undef rb_str_buf_new_cstr -#undef rb_str_buf_new2 +#undef rb_str_buf_cat #undef rb_str_buf_cat2 #undef rb_str_cat2 +#undef rb_str_cat_cstr +#undef rb_fstring_cstr +#undef rb_fstring_enc_cstr static VALUE rb_str_clear(VALUE str); VALUE rb_cString; VALUE rb_cSymbol; +/* FLAGS of RString + * + * 1: RSTRING_NOEMBED + * 2: STR_SHARED (== ELTS_SHARED) + * 2-6: RSTRING_EMBED_LEN (5 bits == 32) + * 5: STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be + * other strings that rely on this string's buffer) + * 6: STR_IS_SHARED_M (shared, when RSTRING_NOEMBED==1 && klass==0) + * 7: STR_TMPLOCK + * 8-9: ENC_CODERANGE (2 bits) 
+ * 10-16: ENCODING (7 bits == 128) + * 17: RSTRING_FSTR + * 18: STR_NOFREE + * 19: STR_FAKESTR + */ + #define RUBY_MAX_CHAR_LEN 16 +#define STR_SHARED_ROOT FL_USER5 +#define STR_IS_SHARED_M FL_USER6 #define STR_TMPLOCK FL_USER7 -#define STR_NOEMBED FL_USER1 -#define STR_SHARED FL_USER2 /* = ELTS_SHARED */ -#define STR_ASSOC FL_USER3 -#define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED) -#define STR_ASSOC_P(s) FL_ALL((s), STR_NOEMBED|STR_ASSOC) -#define STR_NOCAPA (STR_NOEMBED|ELTS_SHARED|STR_ASSOC) -#define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC)) -#define STR_UNSET_NOCAPA(s) do {\ - if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\ -} while (0) - +#define STR_NOFREE FL_USER18 +#define STR_FAKESTR FL_USER19 #define STR_SET_NOEMBED(str) do {\ FL_SET((str), STR_NOEMBED);\ STR_SET_EMBED_LEN((str), 0);\ } while (0) -#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED) -#define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED)) +#define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE)) #define STR_SET_EMBED_LEN(str, n) do { \ long tmp_n = (n);\ RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\ @@ -99,28 +123,319 @@ VALUE rb_cSymbol; }\ } while (0) +#define TERM_LEN(str) rb_enc_mbminlen(rb_enc_get(str)) +#define TERM_FILL(ptr, termlen) do {\ + char *const term_fill_ptr = (ptr);\ + const int term_fill_len = (termlen);\ + *term_fill_ptr = '\0';\ + if (UNLIKELY(term_fill_len > 1))\ + memset(term_fill_ptr, 0, term_fill_len);\ +} while (0) + #define RESIZE_CAPA(str,capacity) do {\ + const int termlen = TERM_LEN(str);\ + RESIZE_CAPA_TERM(str,capacity,termlen);\ +} while (0) +#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\ if (STR_EMBED_P(str)) {\ - if ((capacity) > RSTRING_EMBED_LEN_MAX) {\ - char *tmp = ALLOC_N(char, (capacity)+1);\ - memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\ + if (!STR_EMBEDDABLE_P(capacity, termlen)) {\ + char *const tmp = ALLOC_N(char, (size_t)(capacity) + 
(termlen));\ + const long tlen = RSTRING_LEN(str);\ + memcpy(tmp, RSTRING_PTR(str), tlen);\ RSTRING(str)->as.heap.ptr = tmp;\ - RSTRING(str)->as.heap.len = RSTRING_LEN(str);\ + RSTRING(str)->as.heap.len = tlen;\ STR_SET_NOEMBED(str);\ RSTRING(str)->as.heap.aux.capa = (capacity);\ }\ }\ else {\ - REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\ - if (!STR_NOCAPA_P(str))\ - RSTRING(str)->as.heap.aux.capa = (capacity);\ + assert(!FL_TEST((str), STR_SHARED)); \ + SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \ + (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \ + RSTRING(str)->as.heap.aux.capa = (capacity);\ }\ } while (0) -#define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) -#define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) +#define STR_SET_SHARED(str, shared_str) do { \ + if (!FL_TEST(str, STR_FAKESTR)) { \ + RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \ + FL_SET((str), STR_SHARED); \ + FL_SET((shared_str), STR_SHARED_ROOT); \ + if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \ + FL_SET_RAW((shared_str), STR_IS_SHARED_M); \ + } \ +} while (0) + +#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr) +#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str)) + +#define STR_ENC_GET(str) get_encoding(str) + +#if !defined SHARABLE_MIDDLE_SUBSTRING +# define SHARABLE_MIDDLE_SUBSTRING 0 +#endif +#if !SHARABLE_MIDDLE_SUBSTRING +#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end)) +#else +#define SHARABLE_SUBSTRING_P(beg, len, end) 1 +#endif + +#define STR_EMBEDDABLE_P(len, termlen) \ + ((len) <= RSTRING_EMBED_LEN_MAX + 1 - (termlen)) + +static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str); +static VALUE str_new_shared(VALUE klass, VALUE str); +static VALUE str_new_frozen(VALUE klass, VALUE orig); +static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex); +static void 
str_make_independent_expand(VALUE str, long len, long expand, const int termlen); +static inline void str_modifiable(VALUE str); +static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str); + +static inline void +str_make_independent(VALUE str) +{ + long len = RSTRING_LEN(str); + int termlen = TERM_LEN(str); + str_make_independent_expand((str), len, 0L, termlen); +} + +/* symbols for [up|down|swap]case/capitalize options */ +static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold; + +static rb_encoding * +get_actual_encoding(const int encidx, VALUE str) +{ + const unsigned char *q; + + switch (encidx) { + case ENCINDEX_UTF_16: + if (RSTRING_LEN(str) < 2) break; + q = (const unsigned char *)RSTRING_PTR(str); + if (q[0] == 0xFE && q[1] == 0xFF) { + return rb_enc_get_from_index(ENCINDEX_UTF_16BE); + } + if (q[0] == 0xFF && q[1] == 0xFE) { + return rb_enc_get_from_index(ENCINDEX_UTF_16LE); + } + return rb_ascii8bit_encoding(); + case ENCINDEX_UTF_32: + if (RSTRING_LEN(str) < 4) break; + q = (const unsigned char *)RSTRING_PTR(str); + if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) { + return rb_enc_get_from_index(ENCINDEX_UTF_32BE); + } + if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) { + return rb_enc_get_from_index(ENCINDEX_UTF_32LE); + } + return rb_ascii8bit_encoding(); + } + return rb_enc_from_index(encidx); +} + +static rb_encoding * +get_encoding(VALUE str) +{ + return get_actual_encoding(ENCODING_GET(str), str); +} + +static void +mustnot_broken(VALUE str) +{ + if (is_broken_string(str)) { + rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str))); + } +} + +static void +mustnot_wchar(VALUE str) +{ + rb_encoding *enc = STR_ENC_GET(str); + if (rb_enc_mbminlen(enc) > 1) { + rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc)); + } +} + +static int fstring_cmp(VALUE a, VALUE b); + +static VALUE register_fstring(VALUE str); + +const struct st_hash_type rb_fstring_hash_type = { + fstring_cmp, 
+ rb_str_hash, +}; + +#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_TAINT|FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString) + +static int +fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t arg, int existing) +{ + VALUE *fstr = (VALUE *)arg; + VALUE str = (VALUE)*key; + + if (existing) { + /* because of lazy sweep, str may be unmarked already and swept + * at next time */ + + if (rb_objspace_garbage_object_p(str)) { + *fstr = Qundef; + return ST_DELETE; + } -#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str)) + *fstr = str; + return ST_STOP; + } + else { + if (FL_TEST_RAW(str, STR_FAKESTR)) { + str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr, + RSTRING(str)->as.heap.len, + ENCODING_GET(str)); + OBJ_FREEZE_RAW(str); + } + else { + str = str_new_frozen(rb_cString, str); + if (STR_SHARED_P(str)) { /* str should not be shared */ + /* shared substring */ + str_make_independent(str); + assert(OBJ_FROZEN(str)); + } + if (!BARE_STRING_P(str)) { + str = str_new_frozen(rb_cString, str); + } + } + RBASIC(str)->flags |= RSTRING_FSTR; + + *key = *value = *fstr = str; + return ST_CONTINUE; + } +} + +RUBY_FUNC_EXPORTED +VALUE +rb_fstring(VALUE str) +{ + VALUE fstr; + int bare; + + Check_Type(str, T_STRING); + + if (FL_TEST(str, RSTRING_FSTR)) + return str; + + bare = BARE_STRING_P(str); + if (!bare) { + if (STR_EMBED_P(str)) { + OBJ_FREEZE_RAW(str); + return str; + } + if (FL_TEST_RAW(str, STR_NOEMBED|STR_SHARED_ROOT|STR_SHARED) == (STR_NOEMBED|STR_SHARED_ROOT)) { + assert(OBJ_FROZEN(str)); + return str; + } + } + + fstr = register_fstring(str); + + if (!bare) { + str_replace_shared_without_enc(str, fstr); + OBJ_FREEZE_RAW(str); + return str; + } + return fstr; +} + +static VALUE +register_fstring(VALUE str) +{ + VALUE ret; + st_table *frozen_strings = rb_vm_fstring_table(); + + do { + ret = str; + st_update(frozen_strings, (st_data_t)str, + fstr_update_callback, (st_data_t)&ret); + } while (ret == Qundef); + + assert(OBJ_FROZEN(ret)); + 
assert(!FL_TEST_RAW(ret, STR_FAKESTR)); + assert(!FL_TEST_RAW(ret, FL_EXIVAR)); + assert(!FL_TEST_RAW(ret, FL_TAINT)); + assert(RBASIC_CLASS(ret) == rb_cString); + return ret; +} + +static VALUE +setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx) +{ + fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR; + /* SHARED to be allocated by the callback */ + + ENCODING_SET_INLINED((VALUE)fake_str, encidx); + + RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString); + fake_str->as.heap.len = len; + fake_str->as.heap.ptr = (char *)name; + fake_str->as.heap.aux.capa = len; + return (VALUE)fake_str; +} + +/* + * set up a fake string which refers a static string literal. + */ +VALUE +rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc) +{ + return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc)); +} + +/* + * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen + * shared string which refers a static string literal. `ptr` must + * point a constant string. 
+ */ +MJIT_FUNC_EXPORTED VALUE +rb_fstring_new(const char *ptr, long len) +{ + struct RString fake_str; + return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII)); +} + +VALUE +rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc) +{ + struct RString fake_str; + return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc)); +} + +VALUE +rb_fstring_cstr(const char *ptr) +{ + return rb_fstring_new(ptr, strlen(ptr)); +} + +VALUE +rb_fstring_enc_cstr(const char *ptr, rb_encoding *enc) +{ + return rb_fstring_enc_new(ptr, strlen(ptr), enc); +} + +static int +fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg) +{ + RBASIC_SET_CLASS((VALUE)key, (VALUE)arg); + return ST_CONTINUE; +} + +static int +fstring_cmp(VALUE a, VALUE b) +{ + long alen, blen; + const char *aptr, *bptr; + RSTRING_GETMEM(a, aptr, alen); + RSTRING_GETMEM(b, bptr, blen); + return (alen != blen || + ENCODING_GET(a) != ENCODING_GET(b) || + memcmp(aptr, bptr, alen) != 0); +} static inline int single_byte_optimizable(VALUE str) @@ -145,38 +460,80 @@ VALUE rb_fs; static inline const char * search_nonascii(const char *p, const char *e) { -#if SIZEOF_VALUE == 8 -# define NONASCII_MASK 0x8080808080808080ULL -#elif SIZEOF_VALUE == 4 -# define NONASCII_MASK 0x80808080UL + const uintptr_t *s, *t; + +#if defined(__STDC_VERSION) && (__STDC_VERSION__ >= 199901L) +# if SIZEOF_UINTPTR_T == 8 +# define NONASCII_MASK UINT64_C(0x8080808080808080) +# elif SIZEOF_UINTPTR_T == 4 +# define NONASCII_MASK UINT32_C(0x80808080) +# else +# error "don't know what to do." +# endif +#else +# if SIZEOF_UINTPTR_T == 8 +# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL) +# elif SIZEOF_UINTPTR_T == 4 +# define NONASCII_MASK 0x80808080UL /* or...? */ +# else +# error "don't know what to do." 
+# endif #endif -#ifdef NONASCII_MASK - if ((int)sizeof(VALUE) * 2 < e - p) { - const VALUE *s, *t; - const VALUE lowbits = sizeof(VALUE) - 1; - s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits)); - while (p < (const char *)s) { - if (!ISASCII(*p)) - return p; - p++; - } - t = (const VALUE*)(~lowbits & (VALUE)e); - while (s < t) { - if (*s & NONASCII_MASK) { - t = s; - break; - } - s++; - } - p = (const char *)t; + + if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) { +#if !UNALIGNED_WORD_ACCESS + if ((uintptr_t)p % SIZEOF_VOIDP) { + int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP; + p += l; + switch (l) { + default: UNREACHABLE; +#if SIZEOF_VOIDP > 4 + case 7: if (p[-7]&0x80) return p-7; + case 6: if (p[-6]&0x80) return p-6; + case 5: if (p[-5]&0x80) return p-5; + case 4: if (p[-4]&0x80) return p-4; +#endif + case 3: if (p[-3]&0x80) return p-3; + case 2: if (p[-2]&0x80) return p-2; + case 1: if (p[-1]&0x80) return p-1; + case 0: break; + } + } +#endif +#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! 
UNALIGNED_WORD_ACCESS +#define aligned_ptr(value) \ + __builtin_assume_aligned((value), sizeof(uintptr_t)) +#else +#define aligned_ptr(value) (uintptr_t *)(value) +#endif + s = aligned_ptr(p); + t = aligned_ptr(e - (SIZEOF_VOIDP-1)); +#undef aligned_ptr + for (;s < t; s++) { + if (*s & NONASCII_MASK) { +#ifdef WORDS_BIGENDIAN + return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3); +#else + return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3); +#endif + } + } + p = (const char *)s; } + + switch (e - p) { + default: UNREACHABLE; +#if SIZEOF_VOIDP > 4 + case 7: if (e[-7]&0x80) return e-7; + case 6: if (e[-6]&0x80) return e-6; + case 5: if (e[-5]&0x80) return e-5; + case 4: if (e[-4]&0x80) return e-4; #endif - while (p < e) { - if (!ISASCII(*p)) - return p; - p++; + case 3: if (e[-3]&0x80) return e-3; + case 2: if (e[-2]&0x80) return e-2; + case 1: if (e[-1]&0x80) return e-1; + case 0: return NULL; } - return NULL; } static int @@ -184,7 +541,7 @@ coderange_scan(const char *p, long len, rb_encoding *enc) { const char *e = p + len; - if (rb_enc_to_index(enc) == 0) { + if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) { /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */ p = search_nonascii(p, e); return p ? 
ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT; @@ -192,38 +549,22 @@ coderange_scan(const char *p, long len, rb_encoding *enc) if (rb_enc_asciicompat(enc)) { p = search_nonascii(p, e); - if (!p) { - return ENC_CODERANGE_7BIT; - } - while (p < e) { + if (!p) return ENC_CODERANGE_7BIT; + for (;;) { int ret = rb_enc_precise_mbclen(p, e, enc); - if (!MBCLEN_CHARFOUND_P(ret)) { - return ENC_CODERANGE_BROKEN; - } + if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN; p += MBCLEN_CHARFOUND_LEN(ret); - if (p < e) { - p = search_nonascii(p, e); - if (!p) { - return ENC_CODERANGE_VALID; - } - } - } - if (e < p) { - return ENC_CODERANGE_BROKEN; + if (p == e) break; + p = search_nonascii(p, e); + if (!p) break; } - return ENC_CODERANGE_VALID; } - - while (p < e) { - int ret = rb_enc_precise_mbclen(p, e, enc); - - if (!MBCLEN_CHARFOUND_P(ret)) { - return ENC_CODERANGE_BROKEN; + else { + while (p < e) { + int ret = rb_enc_precise_mbclen(p, e, enc); + if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN; + p += MBCLEN_CHARFOUND_LEN(ret); } - p += MBCLEN_CHARFOUND_LEN(ret); - } - if (e < p) { - return ENC_CODERANGE_BROKEN; } return ENC_CODERANGE_VALID; } @@ -236,10 +577,11 @@ rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc if (*cr == ENC_CODERANGE_BROKEN) return e - s; - if (rb_enc_to_index(enc) == 0) { + if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) { /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */ + if (*cr == ENC_CODERANGE_VALID) return e - s; p = search_nonascii(p, e); - *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID; + *cr = p ? 
ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT; return e - s; } else if (rb_enc_asciicompat(enc)) { @@ -248,23 +590,17 @@ rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT; return e - s; } - while (p < e) { + for (;;) { int ret = rb_enc_precise_mbclen(p, e, enc); if (!MBCLEN_CHARFOUND_P(ret)) { *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN; return p - s; } p += MBCLEN_CHARFOUND_LEN(ret); - if (p < e) { - p = search_nonascii(p, e); - if (!p) { - *cr = ENC_CODERANGE_VALID; - return e - s; - } - } + if (p == e) break; + p = search_nonascii(p, e); + if (!p) break; } - *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID; - return p - s; } else { while (p < e) { @@ -275,9 +611,9 @@ rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc } p += MBCLEN_CHARFOUND_LEN(ret); } - *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID; - return p - s; } + *cr = ENC_CODERANGE_VALID; + return e - s; } static inline void @@ -293,6 +629,13 @@ rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src) * from src to new string "dest" which is made from the part of src. 
*/ str_enc_copy(dest, src); + if (RSTRING_LEN(dest) == 0) { + if (!rb_enc_asciicompat(STR_ENC_GET(src))) + ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID); + else + ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT); + return; + } switch (ENC_CODERANGE(src)) { case ENC_CODERANGE_7BIT: ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT); @@ -305,12 +648,6 @@ rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src) ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT); break; default: - if (RSTRING_LEN(dest) == 0) { - if (!rb_enc_asciicompat(STR_ENC_GET(src))) - ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID); - else - ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT); - } break; } } @@ -328,8 +665,16 @@ rb_enc_str_coderange(VALUE str) int cr = ENC_CODERANGE(str); if (cr == ENC_CODERANGE_UNKNOWN) { - rb_encoding *enc = STR_ENC_GET(str); - cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc); + int encidx = ENCODING_GET(str); + rb_encoding *enc = rb_enc_from_index(encidx); + if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc) && + rb_enc_mbminlen(enc = get_actual_encoding(encidx, str)) == 1) { + cr = ENC_CODERANGE_BROKEN; + } + else { + cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), + enc); + } ENC_CODERANGE_SET(str, cr); } return cr; @@ -355,13 +700,13 @@ str_mod_check(VALUE s, const char *p, long len) } } -size_t -rb_str_capacity(VALUE str) +static size_t +str_capacity(VALUE str, const int termlen) { if (STR_EMBED_P(str)) { - return RSTRING_EMBED_LEN_MAX; + return (RSTRING_EMBED_LEN_MAX + 1 - termlen); } - else if (STR_NOCAPA_P(str)) { + else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) { return RSTRING(str)->as.heap.len; } else { @@ -369,20 +714,36 @@ rb_str_capacity(VALUE str) } } -static inline VALUE -str_alloc(VALUE klass) +size_t +rb_str_capacity(VALUE str) { - NEWOBJ_OF(str, struct RString, klass, T_STRING); + return str_capacity(str, TERM_LEN(str)); +} - str->as.heap.ptr = 0; - str->as.heap.len = 0; - str->as.heap.aux.capa = 0; +static inline void +must_not_null(const char *ptr) +{ + if 
(!ptr) { + rb_raise(rb_eArgError, "NULL pointer given"); + } +} +static inline VALUE +str_alloc(VALUE klass) +{ + NEWOBJ_OF(str, struct RString, klass, T_STRING | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0)); return (VALUE)str; } +static inline VALUE +empty_str_alloc(VALUE klass) +{ + RUBY_DTRACE_CREATE_HOOK(STRING, 0); + return str_alloc(klass); +} + static VALUE -str_new(VALUE klass, const char *ptr, long len) +str_new0(VALUE klass, const char *ptr, long len, int termlen) { VALUE str; @@ -390,10 +751,12 @@ str_new(VALUE klass, const char *ptr, long len) rb_raise(rb_eArgError, "negative string size (or size too big)"); } + RUBY_DTRACE_CREATE_HOOK(STRING, len); + str = str_alloc(klass); - if (len > RSTRING_EMBED_LEN_MAX) { + if (!STR_EMBEDDABLE_P(len, termlen)) { RSTRING(str)->as.heap.aux.capa = len; - RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1); + RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)len + termlen); STR_SET_NOEMBED(str); } else if (len == 0) { @@ -403,10 +766,16 @@ str_new(VALUE klass, const char *ptr, long len) memcpy(RSTRING_PTR(str), ptr, len); } STR_SET_LEN(str, len); - RSTRING_PTR(str)[len] = '\0'; + TERM_FILL(RSTRING_PTR(str) + len, termlen); return str; } +static VALUE +str_new(VALUE klass, const char *ptr, long len) +{ + return str_new0(klass, ptr, len, 1); +} + VALUE rb_str_new(const char *ptr, long len) { @@ -422,9 +791,21 @@ rb_usascii_str_new(const char *ptr, long len) } VALUE +rb_utf8_str_new(const char *ptr, long len) +{ + VALUE str = str_new(rb_cString, ptr, len); + rb_enc_associate_index(str, rb_utf8_encindex()); + return str; +} + +VALUE rb_enc_str_new(const char *ptr, long len, rb_encoding *enc) { - VALUE str = rb_str_new(ptr, len); + VALUE str; + + if (!enc) return rb_str_new(ptr, len); + + str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc)); rb_enc_associate(str, enc); return str; } @@ -432,25 +813,90 @@ rb_enc_str_new(const char *ptr, long len, rb_encoding *enc) VALUE rb_str_new_cstr(const char *ptr) { - if 
(!ptr) { - rb_raise(rb_eArgError, "NULL pointer given"); - } + must_not_null(ptr); + /* rb_str_new_cstr() can take pointer from non-malloc-generated + * memory regions, and that cannot be detected by the MSAN. Just + * trust the programmer that the argument passed here is a sane C + * string. */ + __msan_unpoison_string(ptr); return rb_str_new(ptr, strlen(ptr)); } -RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr)) -#define rb_str_new2 rb_str_new_cstr - VALUE rb_usascii_str_new_cstr(const char *ptr) { - VALUE str = rb_str_new2(ptr); + VALUE str = rb_str_new_cstr(ptr); ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT); return str; } -RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr)) -#define rb_usascii_str_new2 rb_usascii_str_new_cstr +VALUE +rb_utf8_str_new_cstr(const char *ptr) +{ + VALUE str = rb_str_new_cstr(ptr); + rb_enc_associate_index(str, rb_utf8_encindex()); + return str; +} + +VALUE +rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc) +{ + must_not_null(ptr); + if (rb_enc_mbminlen(enc) != 1) { + rb_raise(rb_eArgError, "wchar encoding given"); + } + return rb_enc_str_new(ptr, strlen(ptr), enc); +} + +static VALUE +str_new_static(VALUE klass, const char *ptr, long len, int encindex) +{ + VALUE str; + + if (len < 0) { + rb_raise(rb_eArgError, "negative string size (or size too big)"); + } + + if (!ptr) { + rb_encoding *enc = rb_enc_get_from_index(encindex); + str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc)); + } + else { + RUBY_DTRACE_CREATE_HOOK(STRING, len); + str = str_alloc(klass); + RSTRING(str)->as.heap.len = len; + RSTRING(str)->as.heap.ptr = (char *)ptr; + RSTRING(str)->as.heap.aux.capa = len; + STR_SET_NOEMBED(str); + RBASIC(str)->flags |= STR_NOFREE; + } + rb_enc_associate_index(str, encindex); + return str; +} + +VALUE +rb_str_new_static(const char *ptr, long len) +{ + return str_new_static(rb_cString, ptr, len, 0); +} + +VALUE 
+rb_usascii_str_new_static(const char *ptr, long len) +{ + return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII); +} + +VALUE +rb_utf8_str_new_static(const char *ptr, long len) +{ + return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8); +} + +VALUE +rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc) +{ + return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc)); +} VALUE rb_tainted_str_new(const char *ptr, long len) @@ -461,32 +907,39 @@ rb_tainted_str_new(const char *ptr, long len) return str; } +static VALUE +rb_tainted_str_new_with_enc(const char *ptr, long len, rb_encoding *enc) +{ + VALUE str = rb_enc_str_new(ptr, len, enc); + + OBJ_TAINT(str); + return str; +} + VALUE rb_tainted_str_new_cstr(const char *ptr) { - VALUE str = rb_str_new2(ptr); + VALUE str = rb_str_new_cstr(ptr); OBJ_TAINT(str); return str; } -RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr)) -#define rb_tainted_str_new2 rb_tainted_str_new_cstr +static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len, + rb_encoding *from, rb_encoding *to, + int ecflags, VALUE ecopts); VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts) { - rb_econv_t *ec; - rb_econv_result_t ret; long len; + const char *ptr; VALUE newstr; - const unsigned char *sp; - unsigned char *dp; if (!to) return str; if (!from) from = rb_enc_get(str); if (from == to) return str; - if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) || + if ((rb_enc_asciicompat(to) && is_ascii_string(str)) || to == rb_ascii8bit_encoding()) { if (STR_ENC_GET(str) != to) { str = rb_str_dup(str); @@ -495,25 +948,93 @@ rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, return str; } - len = RSTRING_LEN(str); - newstr = rb_str_new(0, len); + RSTRING_GETMEM(str, ptr, len); + newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len, + from, to, ecflags, 
ecopts); + if (NIL_P(newstr)) { + /* some error, return original */ + return str; + } + OBJ_INFECT(newstr, str); + return newstr; +} - retry: - ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts); - if (!ec) return str; +VALUE +rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len, + rb_encoding *from, int ecflags, VALUE ecopts) +{ + long olen; + + olen = RSTRING_LEN(newstr); + if (ofs < -olen || olen < ofs) + rb_raise(rb_eIndexError, "index %ld out of string", ofs); + if (ofs < 0) ofs += olen; + if (!from) { + STR_SET_LEN(newstr, ofs); + return rb_str_cat(newstr, ptr, len); + } + + rb_str_modify(newstr); + return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from, + rb_enc_get(newstr), + ecflags, ecopts); +} - sp = (unsigned char*)RSTRING_PTR(str); - dp = (unsigned char*)RSTRING_PTR(newstr); - ret = rb_econv_convert(ec, &sp, (unsigned char*)RSTRING_END(str), - &dp, (unsigned char*)RSTRING_END(newstr), 0); +VALUE +rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc) +{ + STR_SET_LEN(str, 0); + rb_enc_associate(str, enc); + rb_str_cat(str, ptr, len); + return str; +} + +static VALUE +str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len, + rb_encoding *from, rb_encoding *to, + int ecflags, VALUE ecopts) +{ + rb_econv_t *ec; + rb_econv_result_t ret; + long olen; + VALUE econv_wrapper; + const unsigned char *start, *sp; + unsigned char *dest, *dp; + size_t converted_output = (size_t)ofs; + + olen = rb_str_capacity(newstr); + + econv_wrapper = rb_obj_alloc(rb_cEncodingConverter); + RBASIC_CLEAR_CLASS(econv_wrapper); + ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts); + if (!ec) return Qnil; + DATA_PTR(econv_wrapper) = ec; + + sp = (unsigned char*)ptr; + start = sp; + while ((dest = (unsigned char*)RSTRING_PTR(newstr)), + (dp = dest + converted_output), + (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)), + ret == econv_destination_buffer_full) { + /* destination buffer 
short */ + size_t converted_input = sp - start; + size_t rest = len - converted_input; + converted_output = dp - dest; + rb_str_set_len(newstr, converted_output); + if (converted_input && converted_output && + rest < (LONG_MAX / converted_output)) { + rest = (rest * converted_output) / converted_input; + } + else { + rest = olen; + } + olen += rest < 2 ? 2 : rest; + rb_str_resize(newstr, olen); + } + DATA_PTR(econv_wrapper) = 0; rb_econv_close(ec); + rb_gc_force_recycle(econv_wrapper); switch (ret) { - case econv_destination_buffer_full: - /* destination buffer short */ - len = len < 2 ? 2 : len * 2; - rb_str_resize(newstr, len); - goto retry; - case econv_finished: len = dp - (unsigned char*)RSTRING_PTR(newstr); rb_str_set_len(newstr, len); @@ -521,8 +1042,7 @@ rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, return newstr; default: - /* some error, return original */ - return str; + return Qnil; } } @@ -535,15 +1055,51 @@ rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to) VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc) { + rb_encoding *ienc; VALUE str; + const int eidx = rb_enc_to_index(eenc); - str = rb_tainted_str_new(ptr, len); - if (eenc == rb_usascii_encoding() && + if (!ptr) { + return rb_tainted_str_new_with_enc(ptr, len, eenc); + } + + /* ASCII-8BIT case, no conversion */ + if ((eidx == rb_ascii8bit_encindex()) || + (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) { + return rb_tainted_str_new(ptr, len); + } + /* no default_internal or same encoding, no conversion */ + ienc = rb_default_internal_encoding(); + if (!ienc || eenc == ienc) { + return rb_tainted_str_new_with_enc(ptr, len, eenc); + } + /* ASCII compatible, and ASCII only string, no conversion in + * default_internal */ + if ((eidx == rb_ascii8bit_encindex()) || + (eidx == rb_usascii_encindex()) || + (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) { + return 
rb_tainted_str_new_with_enc(ptr, len, ienc); + } + /* convert from the given encoding to default_internal */ + str = rb_tainted_str_new_with_enc(NULL, 0, ienc); + /* when the conversion failed for some reason, just ignore the + * default_internal and result in the given encoding as-is. */ + if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) { + rb_str_initialize(str, ptr, len, eenc); + } + return str; +} + +VALUE +rb_external_str_with_enc(VALUE str, rb_encoding *eenc) +{ + int eidx = rb_enc_to_index(eenc); + if (eidx == rb_usascii_encindex() && rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) { - rb_enc_associate(str, rb_ascii8bit_encoding()); + rb_enc_associate_index(str, rb_ascii8bit_encindex()); return str; } - rb_enc_associate(str, eenc); + rb_enc_associate_index(str, eidx); return rb_str_conv_enc(str, eenc, rb_default_internal_encoding()); } @@ -602,137 +1158,194 @@ rb_str_export_to_enc(VALUE str, rb_encoding *enc) } static VALUE -str_replace_shared(VALUE str2, VALUE str) +str_replace_shared_without_enc(VALUE str2, VALUE str) { - if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) { + const int termlen = TERM_LEN(str); + char *ptr; + long len; + + RSTRING_GETMEM(str, ptr, len); + if (STR_EMBEDDABLE_P(len, termlen)) { + char *ptr2 = RSTRING(str2)->as.ary; STR_SET_EMBED(str2); - memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1); - STR_SET_EMBED_LEN(str2, RSTRING_LEN(str)); + memcpy(ptr2, RSTRING_PTR(str), len); + STR_SET_EMBED_LEN(str2, len); + TERM_FILL(ptr2+len, termlen); } else { - str = rb_str_new_frozen(str); + VALUE root; + if (STR_SHARED_P(str)) { + root = RSTRING(str)->as.heap.aux.shared; + RSTRING_GETMEM(str, ptr, len); + } + else { + root = rb_str_new_frozen(str); + RSTRING_GETMEM(root, ptr, len); + } + if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) { + if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) { + rb_fatal("about to free a possible shared root"); + } + char *ptr2 = STR_HEAP_PTR(str2); + if (ptr2 != ptr) 
{ + ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2)); + } + } FL_SET(str2, STR_NOEMBED); - RSTRING(str2)->as.heap.len = RSTRING_LEN(str); - RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str); - RSTRING(str2)->as.heap.aux.shared = str; - FL_SET(str2, ELTS_SHARED); + RSTRING(str2)->as.heap.len = len; + RSTRING(str2)->as.heap.ptr = ptr; + STR_SET_SHARED(str2, root); } - rb_enc_cr_str_exact_copy(str2, str); - return str2; } static VALUE -str_new_shared(VALUE klass, VALUE str) +str_replace_shared(VALUE str2, VALUE str) { - return str_replace_shared(str_alloc(klass), str); + str_replace_shared_without_enc(str2, str); + rb_enc_cr_str_exact_copy(str2, str); + return str2; } static VALUE -str_new3(VALUE klass, VALUE str) +str_new_shared(VALUE klass, VALUE str) { - return str_new_shared(klass, str); + return str_replace_shared(str_alloc(klass), str); } VALUE rb_str_new_shared(VALUE str) { - VALUE str2 = str_new3(rb_obj_class(str), str); + VALUE str2 = str_new_shared(rb_obj_class(str), str); OBJ_INFECT(str2, str); return str2; } -RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str)) -#define rb_str_new3 rb_str_new_shared +VALUE +rb_str_new_frozen(VALUE orig) +{ + VALUE str; + + if (OBJ_FROZEN(orig)) return orig; + + str = str_new_frozen(rb_obj_class(orig), orig); + OBJ_INFECT(str, orig); + return str; +} + +VALUE +rb_str_tmp_frozen_acquire(VALUE orig) +{ + VALUE tmp; -static VALUE -str_new4(VALUE klass, VALUE str) + if (OBJ_FROZEN_RAW(orig)) return orig; + + tmp = str_new_frozen(0, orig); + OBJ_INFECT(tmp, orig); + + return tmp; +} + +void +rb_str_tmp_frozen_release(VALUE orig, VALUE tmp) { - VALUE str2; + if (RBASIC_CLASS(tmp) != 0) + return; - str2 = str_alloc(klass); - STR_SET_NOEMBED(str2); - RSTRING(str2)->as.heap.len = RSTRING_LEN(str); - RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str); - if (STR_SHARED_P(str)) { - VALUE shared = RSTRING(str)->as.heap.aux.shared; - assert(OBJ_FROZEN(shared)); - FL_SET(str2, ELTS_SHARED); - RSTRING(str2)->as.heap.aux.shared = shared; 
+ if (STR_EMBED_P(tmp)) { + assert(OBJ_FROZEN_RAW(tmp)); + rb_gc_force_recycle(tmp); } - else { - FL_SET(str, ELTS_SHARED); - RSTRING(str)->as.heap.aux.shared = str2; + else if (FL_TEST_RAW(orig, STR_SHARED) && + !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) { + VALUE shared = RSTRING(orig)->as.heap.aux.shared; + + if (shared == tmp && !FL_TEST_RAW(tmp, STR_IS_SHARED_M)) { + FL_UNSET_RAW(orig, STR_SHARED); + assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr); + assert(RSTRING(orig)->as.heap.len == RSTRING(tmp)->as.heap.len); + RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa; + RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE; + assert(OBJ_FROZEN_RAW(tmp)); + rb_gc_force_recycle(tmp); + } } - rb_enc_cr_str_exact_copy(str2, str); - OBJ_INFECT(str2, str); - return str2; } -VALUE -rb_str_new_frozen(VALUE orig) +static VALUE +str_new_frozen(VALUE klass, VALUE orig) { - VALUE klass, str; + VALUE str; - if (OBJ_FROZEN(orig)) return orig; - klass = rb_obj_class(orig); - if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) { - long ofs; - assert(OBJ_FROZEN(str)); - ofs = RSTRING_LEN(str) - RSTRING_LEN(orig); - if ((ofs > 0) || (klass != RBASIC(str)->klass) || - ((RBASIC(str)->flags ^ RBASIC(orig)->flags) & (FL_TAINT|FL_UNTRUSTED)) || - ENCODING_GET(str) != ENCODING_GET(orig)) { - str = str_new3(klass, str); - RSTRING(str)->as.heap.ptr += ofs; - RSTRING(str)->as.heap.len -= ofs; - rb_enc_cr_str_exact_copy(str, orig); - OBJ_INFECT(str, orig); - } - } - else if (STR_EMBED_P(orig)) { + if (STR_EMBED_P(orig)) { str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig)); - rb_enc_cr_str_exact_copy(str, orig); - OBJ_INFECT(str, orig); - } - else if (STR_ASSOC_P(orig)) { - VALUE assoc = RSTRING(orig)->as.heap.aux.shared; - FL_UNSET(orig, STR_ASSOC); - str = str_new4(klass, orig); - FL_SET(str, STR_ASSOC); - RSTRING(str)->as.heap.aux.shared = assoc; } else { - str = str_new4(klass, orig); + if (FL_TEST_RAW(orig, STR_SHARED)) { + 
VALUE shared = RSTRING(orig)->as.heap.aux.shared; + long ofs = RSTRING(orig)->as.heap.ptr - RSTRING(shared)->as.heap.ptr; + long rest = RSTRING(shared)->as.heap.len - ofs - RSTRING(orig)->as.heap.len; + assert(!STR_EMBED_P(shared)); + assert(OBJ_FROZEN(shared)); + + if ((ofs > 0) || (rest > 0) || + (klass != RBASIC(shared)->klass) || + ((RBASIC(shared)->flags ^ RBASIC(orig)->flags) & FL_TAINT) || + ENCODING_GET(shared) != ENCODING_GET(orig)) { + str = str_new_shared(klass, shared); + RSTRING(str)->as.heap.ptr += ofs; + RSTRING(str)->as.heap.len -= ofs + rest; + } + else { + if (RBASIC_CLASS(shared) == 0) + FL_SET_RAW(shared, STR_IS_SHARED_M); + return shared; + } + } + else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) { + str = str_alloc(klass); + STR_SET_EMBED(str); + memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig)); + STR_SET_EMBED_LEN(str, RSTRING_LEN(orig)); + TERM_FILL(RSTRING_END(str), TERM_LEN(orig)); + } + else { + str = str_alloc(klass); + STR_SET_NOEMBED(str); + RSTRING(str)->as.heap.len = RSTRING_LEN(orig); + RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig); + RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa; + RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE; + RBASIC(orig)->flags &= ~STR_NOFREE; + STR_SET_SHARED(orig, str); + if (klass == 0) + FL_UNSET_RAW(str, STR_IS_SHARED_M); + } } + + rb_enc_cr_str_exact_copy(str, orig); OBJ_FREEZE(str); return str; } -RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig)) -#define rb_str_new4 rb_str_new_frozen - VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len) { - return str_new(rb_obj_class(obj), ptr, len); + return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj)); } -RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len), - rb_str_new_with_class, (obj, ptr, len)) -#define rb_str_new5 rb_str_new_with_class - static VALUE str_new_empty(VALUE str) { - VALUE v = rb_str_new5(str, 0, 0); + VALUE v = 
rb_str_new_with_class(str, 0, 0); rb_enc_copy(v, str); OBJ_INFECT(v, str); return v; } -#define STR_BUF_MIN_SIZE 128 +#define STR_BUF_MIN_SIZE 127 +STATIC_ASSERT(STR_BUF_MIN_SIZE, STR_BUF_MIN_SIZE > RSTRING_EMBED_LEN_MAX); VALUE rb_str_buf_new(long capa) @@ -744,7 +1357,7 @@ rb_str_buf_new(long capa) } FL_SET(str, STR_NOEMBED); RSTRING(str)->as.heap.aux.capa = capa; - RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1); + RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1); RSTRING(str)->as.heap.ptr[0] = '\0'; return str; @@ -762,44 +1375,39 @@ rb_str_buf_new_cstr(const char *ptr) return str; } -RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr)) -#define rb_str_buf_new2 rb_str_buf_new_cstr - VALUE rb_str_tmp_new(long len) { return str_new(0, 0, len); } -void * -rb_alloc_tmp_buffer(volatile VALUE *store, long len) -{ - VALUE s = rb_str_tmp_new(len); - *store = s; - return RSTRING_PTR(s); -} - -void -rb_free_tmp_buffer(volatile VALUE *store) -{ - VALUE s = *store; - *store = 0; - if (s) rb_str_clear(s); -} - void rb_str_free(VALUE str) { - if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) { - xfree(RSTRING(str)->as.heap.ptr); + if (FL_TEST(str, RSTRING_FSTR)) { + st_data_t fstr = (st_data_t)str; + st_delete(rb_vm_fstring_table(), &fstr, NULL); + RB_DEBUG_COUNTER_INC(obj_str_fstr); + } + + if (STR_EMBED_P(str)) { + RB_DEBUG_COUNTER_INC(obj_str_embed); + } + else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) { + (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED)); + (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE)); + } + else { + RB_DEBUG_COUNTER_INC(obj_str_ptr); + ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str)); } } RUBY_FUNC_EXPORTED size_t rb_str_memsize(VALUE str) { - if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) { - return RSTRING(str)->as.heap.aux.capa; + if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) { + return STR_HEAP_SIZE(str); } else { return 0; @@ -809,49 
+1417,62 @@ rb_str_memsize(VALUE str) VALUE rb_str_to_str(VALUE str) { - return rb_convert_type(str, T_STRING, "String", "to_str"); + return rb_convert_type_with_id(str, T_STRING, "String", idTo_str); } static inline void str_discard(VALUE str); +static void str_shared_replace(VALUE str, VALUE str2); void rb_str_shared_replace(VALUE str, VALUE str2) { + if (str != str2) str_shared_replace(str, str2); +} + +static void +str_shared_replace(VALUE str, VALUE str2) +{ rb_encoding *enc; int cr; - if (str == str2) return; + int termlen; + + RUBY_ASSERT(str2 != str); enc = STR_ENC_GET(str2); cr = ENC_CODERANGE(str2); str_discard(str); OBJ_INFECT(str, str2); - if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) { + termlen = rb_enc_mbminlen(enc); + + if (STR_EMBEDDABLE_P(RSTRING_LEN(str2), termlen)) { STR_SET_EMBED(str); - memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1); + memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen); STR_SET_EMBED_LEN(str, RSTRING_LEN(str2)); rb_enc_associate(str, enc); ENC_CODERANGE_SET(str, cr); - return; - } - STR_SET_NOEMBED(str); - STR_UNSET_NOCAPA(str); - RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2); - RSTRING(str)->as.heap.len = RSTRING_LEN(str2); - if (STR_NOCAPA_P(str2)) { - FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA); - RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared; } else { - RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa; + STR_SET_NOEMBED(str); + FL_UNSET(str, STR_SHARED); + RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2); + RSTRING(str)->as.heap.len = RSTRING_LEN(str2); + + if (FL_TEST(str2, STR_SHARED)) { + VALUE shared = RSTRING(str2)->as.heap.aux.shared; + STR_SET_SHARED(str, shared); + } + else { + RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa; + } + + /* abandon str2 */ + STR_SET_EMBED(str2); + RSTRING_PTR(str2)[0] = 0; + STR_SET_EMBED_LEN(str2, 0); + rb_enc_associate(str, enc); + ENC_CODERANGE_SET(str, cr); } - 
STR_SET_EMBED(str2); /* abandon str2 */ - RSTRING_PTR(str2)[0] = 0; - STR_SET_EMBED_LEN(str2, 0); - rb_enc_associate(str, enc); - ENC_CODERANGE_SET(str, cr); } -static ID id_to_s; - VALUE rb_obj_as_string(VALUE obj) { @@ -860,10 +1481,18 @@ rb_obj_as_string(VALUE obj) if (RB_TYPE_P(obj, T_STRING)) { return obj; } - str = rb_funcall(obj, id_to_s, 0); + str = rb_funcall(obj, idTo_s, 0); + return rb_obj_as_string_result(str, obj); +} + +MJIT_FUNC_EXPORTED VALUE +rb_obj_as_string_result(VALUE str, VALUE obj) +{ if (!RB_TYPE_P(str, T_STRING)) return rb_any_to_s(obj); - if (OBJ_TAINTED(obj)) OBJ_TAINT(str); + if (!FL_TEST_RAW(str, RSTRING_FSTR) && FL_ABLE(obj)) + /* fstring must not be tainted, at least */ + OBJ_INFECT_RAW(str, obj); return str; } @@ -873,33 +1502,55 @@ str_replace(VALUE str, VALUE str2) long len; len = RSTRING_LEN(str2); - if (STR_ASSOC_P(str2)) { - str2 = rb_str_new4(str2); - } if (STR_SHARED_P(str2)) { VALUE shared = RSTRING(str2)->as.heap.aux.shared; assert(OBJ_FROZEN(shared)); STR_SET_NOEMBED(str); RSTRING(str)->as.heap.len = len; RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2); - FL_SET(str, ELTS_SHARED); - FL_UNSET(str, STR_ASSOC); - RSTRING(str)->as.heap.aux.shared = shared; + STR_SET_SHARED(str, shared); + rb_enc_cr_str_exact_copy(str, str2); } else { str_replace_shared(str, str2); } OBJ_INFECT(str, str2); - rb_enc_cr_str_exact_copy(str, str2); return str; } -static VALUE +static inline VALUE str_duplicate(VALUE klass, VALUE str) { + enum {embed_size = RSTRING_EMBED_LEN_MAX + 1}; + const VALUE flag_mask = + RSTRING_NOEMBED | RSTRING_EMBED_LEN_MASK | + ENC_CODERANGE_MASK | ENCODING_MASK | + FL_TAINT | FL_FREEZE + ; + VALUE flags = FL_TEST_RAW(str, flag_mask); VALUE dup = str_alloc(klass); - str_replace(dup, str); + MEMCPY(RSTRING(dup)->as.ary, RSTRING(str)->as.ary, + char, embed_size); + if (flags & STR_NOEMBED) { + if (FL_TEST_RAW(str, STR_SHARED)) { + str = RSTRING(str)->as.heap.aux.shared; + } + else if (UNLIKELY(!(flags & FL_FREEZE))) { + str = 
str_new_frozen(klass, str); + FL_SET_RAW(str, flags & FL_TAINT); + flags = FL_TEST_RAW(str, flag_mask); + } + if (flags & STR_NOEMBED) { + RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, str); + flags |= STR_SHARED; + } + else { + MEMCPY(RSTRING(dup)->as.ary, RSTRING(str)->as.ary, + char, embed_size); + } + } + FL_SET_RAW(dup, flags & ~FL_FREEZE); return dup; } @@ -912,26 +1563,149 @@ rb_str_dup(VALUE str) VALUE rb_str_resurrect(VALUE str) { - return str_replace(str_alloc(rb_cString), str); + RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str)); + return str_duplicate(rb_cString, str); } /* * call-seq: - * String.new(str="") -> new_str + * String.new(str="") -> new_str + * String.new(str="", encoding: enc) -> new_str + * String.new(str="", capacity: size) -> new_str * * Returns a new string object containing a copy of <i>str</i>. + * + * The optional <i>encoding</i> keyword argument specifies the encoding + * of the new string. + * If not specified, the encoding of <i>str</i> is used + * (or ASCII-8BIT, if <i>str</i> is not specified). + * + * The optional <i>capacity</i> keyword argument specifies the size + * of the internal buffer. + * This may improve performance, when the string will be concatenated many + * times (causing many realloc calls). */ static VALUE rb_str_init(int argc, VALUE *argv, VALUE str) { - VALUE orig; + static ID keyword_ids[2]; + VALUE orig, opt, venc, vcapa; + VALUE kwargs[2]; + rb_encoding *enc = 0; + int n; + + if (!keyword_ids[0]) { + keyword_ids[0] = rb_id_encoding(); + CONST_ID(keyword_ids[1], "capacity"); + } + + n = rb_scan_args(argc, argv, "01:", &orig, &opt); + if (!NIL_P(opt)) { + rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs); + venc = kwargs[0]; + vcapa = kwargs[1]; + if (venc != Qundef && !NIL_P(venc)) { + enc = rb_to_encoding(venc); + } + if (vcapa != Qundef && !NIL_P(vcapa)) { + long capa = NUM2LONG(vcapa); + long len = 0; + int termlen = enc ? 
rb_enc_mbminlen(enc) : 1; - if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1) + if (capa < STR_BUF_MIN_SIZE) { + capa = STR_BUF_MIN_SIZE; + } + if (n == 1) { + StringValue(orig); + len = RSTRING_LEN(orig); + if (capa < len) { + capa = len; + } + if (orig == str) n = 0; + } + str_modifiable(str); + if (STR_EMBED_P(str)) { /* make noembed always */ + char *new_ptr = ALLOC_N(char, (size_t)capa + termlen); + memcpy(new_ptr, RSTRING(str)->as.ary, RSTRING_EMBED_LEN_MAX + 1); + RSTRING(str)->as.heap.ptr = new_ptr; + } + else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) { + const size_t size = (size_t)capa + termlen; + const char *const old_ptr = RSTRING_PTR(str); + const size_t osize = RSTRING(str)->as.heap.len + TERM_LEN(str); + char *new_ptr = ALLOC_N(char, (size_t)capa + termlen); + memcpy(new_ptr, old_ptr, osize < size ? osize : size); + FL_UNSET_RAW(str, STR_SHARED); + RSTRING(str)->as.heap.ptr = new_ptr; + } + else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) { + SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, + (size_t)capa + termlen, STR_HEAP_SIZE(str)); + } + RSTRING(str)->as.heap.len = len; + TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen); + if (n == 1) { + memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len); + rb_enc_cr_str_exact_copy(str, orig); + } + FL_SET(str, STR_NOEMBED); + RSTRING(str)->as.heap.aux.capa = capa; + } + else if (n == 1) { + rb_str_replace(str, orig); + } + if (enc) { + rb_enc_associate(str, enc); + ENC_CODERANGE_CLEAR(str); + } + } + else if (n == 1) { rb_str_replace(str, orig); + } return str; } +#ifdef NONASCII_MASK +#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80) + +/* + * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx + * bit representation. (see http://en.wikipedia.org/wiki/UTF-8) + * Therefore, the following pseudocode can detect UTF-8 leading bytes. + * + * if (!(byte & 0x80)) + * byte |= 0x40; // turn on bit6 + * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not. 
+ * + * This function calculates whether a byte is leading or not for all bytes + * in the argument word by concurrently using the above logic, and then + * adds up the number of leading bytes in the word. + */ +static inline uintptr_t +count_utf8_lead_bytes_with_word(const uintptr_t *s) +{ + uintptr_t d = *s; + + /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */ + d = (d>>6) | (~d>>7); + d &= NONASCII_MASK >> 7; + + /* Gather all bytes. */ +#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__) + /* use only if it can use POPCNT */ + return rb_popcount_intptr(d); +#else + d += (d>>8); + d += (d>>16); +# if SIZEOF_VOIDP == 8 + d += (d>>32); +# endif + return (d&0xF); +#endif +} +#endif + static inline long enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr) { @@ -939,11 +1713,37 @@ enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr) const char *q; if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { - return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc); + long diff = (long)(e - p); + return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc)); + } +#ifdef NONASCII_MASK + else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) { + uintptr_t len = 0; + if ((int)sizeof(uintptr_t) * 2 < e - p) { + const uintptr_t *s, *t; + const uintptr_t lowbits = sizeof(uintptr_t) - 1; + s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits)); + t = (const uintptr_t*)(~lowbits & (uintptr_t)e); + while (p < (const char *)s) { + if (is_utf8_lead_byte(*p)) len++; + p++; + } + while (s < t) { + len += count_utf8_lead_bytes_with_word(s); + s++; + } + p = (const char *)s; + } + while (p < e) { + if (is_utf8_lead_byte(*p)) len++; + p++; + } + return (long)len; } +#endif else if (rb_enc_asciicompat(enc)) { c = 0; - if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) { + if (ENC_CODERANGE_CLEAN_P(cr)) { while (p < e) { if (ISASCII(*p)) { q = search_nonascii(p, e); @@ -984,6 
+1784,9 @@ rb_enc_strlen(const char *p, const char *e, rb_encoding *enc) return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN); } +/* To get strlen with cr + * Note that given cr is not used. + */ long rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr) { @@ -993,7 +1796,8 @@ rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr) *cr = 0; if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { - return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc); + long diff = (long)(e - p); + return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc)); } else if (rb_enc_asciicompat(enc)) { c = 0; @@ -1040,46 +1844,11 @@ rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr) return c; } -#ifdef NONASCII_MASK -#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80) - -/* - * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx - * bit represention. (see http://en.wikipedia.org/wiki/UTF-8) - * Therefore, following pseudo code can detect UTF-8 leading byte. - * - * if (!(byte & 0x80)) - * byte |= 0x40; // turn on bit6 - * return ((byte>>6) & 1); // bit6 represent it's leading byte or not. - * - * This function calculate every bytes in the argument word `s' - * using the above logic concurrently. and gather every bytes result. - */ -static inline VALUE -count_utf8_lead_bytes_with_word(const VALUE *s) -{ - VALUE d = *s; - - /* Transform into bit0 represent UTF-8 leading or not. */ - d |= ~(d>>1); - d >>= 6; - d &= NONASCII_MASK >> 7; - - /* Gather every bytes. 
*/ - d += (d>>8); - d += (d>>16); -#if SIZEOF_VALUE == 8 - d += (d>>32); -#endif - return (d&0xF); -} -#endif - +/* enc must be str's enc or rb_enc_check(str, str2) */ static long str_strlen(VALUE str, rb_encoding *enc) { const char *p, *e; - long n; int cr; if (single_byte_optimizable(str)) return RSTRING_LEN(str); @@ -1087,44 +1856,21 @@ str_strlen(VALUE str, rb_encoding *enc) p = RSTRING_PTR(str); e = RSTRING_END(str); cr = ENC_CODERANGE(str); -#ifdef NONASCII_MASK - if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID && - enc == rb_utf8_encoding()) { - VALUE len = 0; - if ((int)sizeof(VALUE) * 2 < e - p) { - const VALUE *s, *t; - const VALUE lowbits = sizeof(VALUE) - 1; - s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits)); - t = (const VALUE*)(~lowbits & (VALUE)e); - while (p < (const char *)s) { - if (is_utf8_lead_byte(*p)) len++; - p++; - } - while (s < t) { - len += count_utf8_lead_bytes_with_word(s); - s++; - } - p = (const char *)s; - } - while (p < e) { - if (is_utf8_lead_byte(*p)) len++; - p++; - } - return (long)len; + if (cr == ENC_CODERANGE_UNKNOWN) { + long n = rb_enc_strlen_cr(p, e, enc, &cr); + if (cr) ENC_CODERANGE_SET(str, cr); + return n; } -#endif - n = rb_enc_strlen_cr(p, e, enc, &cr); - if (cr) { - ENC_CODERANGE_SET(str, cr); + else { + return enc_strlen(p, e, enc, cr); } - return n; } long rb_str_strlen(VALUE str) { - return str_strlen(str, STR_ENC_GET(str)); + return str_strlen(str, NULL); } /* @@ -1138,10 +1884,7 @@ rb_str_strlen(VALUE str) VALUE rb_str_length(VALUE str) { - long len; - - len = str_strlen(str, STR_ENC_GET(str)); - return LONG2NUM(len); + return LONG2NUM(str_strlen(str, NULL)); } /* @@ -1194,19 +1937,29 @@ rb_str_plus(VALUE str1, VALUE str2) { VALUE str3; rb_encoding *enc; + char *ptr1, *ptr2, *ptr3; + long len1, len2; + int termlen; StringValue(str2); - enc = rb_enc_check(str1, str2); - str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2)); - memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1)); - 
memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1), - RSTRING_PTR(str2), RSTRING_LEN(str2)); - RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0'; - - if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2)) - OBJ_TAINT(str3); + enc = rb_enc_check_str(str1, str2); + RSTRING_GETMEM(str1, ptr1, len1); + RSTRING_GETMEM(str2, ptr2, len2); + termlen = rb_enc_mbminlen(enc); + if (len1 > LONG_MAX - len2) { + rb_raise(rb_eArgError, "string size too big"); + } + str3 = str_new0(rb_cString, 0, len1+len2, termlen); + ptr3 = RSTRING_PTR(str3); + memcpy(ptr3, ptr1, len1); + memcpy(ptr3+len1, ptr2, len2); + TERM_FILL(&ptr3[len1+len2], termlen); + + FL_SET_RAW(str3, OBJ_TAINTED_RAW(str1) | OBJ_TAINTED_RAW(str2)); ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc), ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2))); + RB_GC_GUARD(str1); + RB_GC_GUARD(str2); return str3; } @@ -1227,16 +1980,40 @@ rb_str_times(VALUE str, VALUE times) VALUE str2; long n, len; char *ptr2; + int termlen; + if (times == INT2FIX(1)) { + return rb_str_dup(str); + } + if (times == INT2FIX(0)) { + str2 = str_alloc(rb_obj_class(str)); + rb_enc_copy(str2, str); + OBJ_INFECT(str2, str); + return str2; + } len = NUM2LONG(times); if (len < 0) { rb_raise(rb_eArgError, "negative argument"); } + if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) { + str2 = str_alloc(rb_obj_class(str)); + if (!STR_EMBEDDABLE_P(len, 1)) { + RSTRING(str2)->as.heap.aux.capa = len; + RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1); + STR_SET_NOEMBED(str2); + } + STR_SET_LEN(str2, len); + rb_enc_copy(str2, str); + OBJ_INFECT(str2, str); + return str2; + } if (len && LONG_MAX/len < RSTRING_LEN(str)) { rb_raise(rb_eArgError, "argument too big"); } - str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str)); + len *= RSTRING_LEN(str); + termlen = TERM_LEN(str); + str2 = str_new0(rb_obj_class(str), 0, len, termlen); ptr2 = RSTRING_PTR(str2); if (len) { n = RSTRING_LEN(str); @@ -1247,7 +2024,8 @@ rb_str_times(VALUE str, VALUE times) } memcpy(ptr2 + 
n, ptr2, len-n); } - ptr2[RSTRING_LEN(str2)] = '\0'; + STR_SET_LEN(str2, len); + TERM_FILL(&ptr2[len], termlen); OBJ_INFECT(str2, str); rb_enc_cr_str_copy_for_substr(str2, str); @@ -1265,63 +2043,88 @@ rb_str_times(VALUE str, VALUE times) * details of the format string. * * "%05d" % 123 #=> "00123" - * "%-5s: %08x" % [ "ID", self.object_id ] #=> "ID : 200e14d6" + * "%-5s: %016x" % [ "ID", self.object_id ] #=> "ID : 00002b054ec93168" * "foo = %{foo}" % { :foo => 'bar' } #=> "foo = bar" */ static VALUE rb_str_format_m(VALUE str, VALUE arg) { - volatile VALUE tmp = rb_check_array_type(arg); + VALUE tmp = rb_check_array_type(arg); if (!NIL_P(tmp)) { - return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str); + return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str); } return rb_str_format(1, &arg, str); } static inline void -str_modifiable(VALUE str) +rb_check_lockedtmp(VALUE str) { if (FL_TEST(str, STR_TMPLOCK)) { rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked"); } +} + +static inline void +str_modifiable(VALUE str) +{ + rb_check_lockedtmp(str); rb_check_frozen(str); - if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4) - rb_raise(rb_eSecurityError, "Insecure: can't modify string"); +} + +static inline int +str_dependent_p(VALUE str) +{ + if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) { + return 0; + } + else { + return 1; + } } static inline int str_independent(VALUE str) { str_modifiable(str); - if (!STR_SHARED_P(str)) return 1; - if (STR_EMBED_P(str)) return 1; - return 0; + return !str_dependent_p(str); } static void -str_make_independent_expand(VALUE str, long expand) +str_make_independent_expand(VALUE str, long len, long expand, const int termlen) { char *ptr; - long len = RSTRING_LEN(str); + char *oldptr; long capa = len + expand; if (len > capa) len = capa; - ptr = ALLOC_N(char, capa + 1); - if (RSTRING_PTR(str)) { - memcpy(ptr, RSTRING_PTR(str), len); + + if (!STR_EMBED_P(str) && STR_EMBEDDABLE_P(capa, 
termlen)) { + ptr = RSTRING(str)->as.heap.ptr; + STR_SET_EMBED(str); + memcpy(RSTRING(str)->as.ary, ptr, len); + TERM_FILL(RSTRING(str)->as.ary + len, termlen); + STR_SET_EMBED_LEN(str, len); + return; + } + + ptr = ALLOC_N(char, (size_t)capa + termlen); + oldptr = RSTRING_PTR(str); + if (oldptr) { + memcpy(ptr, oldptr, len); + } + if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) { + xfree(oldptr); } STR_SET_NOEMBED(str); - STR_UNSET_NOCAPA(str); - ptr[len] = 0; + FL_UNSET(str, STR_SHARED|STR_NOFREE); + TERM_FILL(ptr + len, termlen); RSTRING(str)->as.heap.ptr = ptr; RSTRING(str)->as.heap.len = len; RSTRING(str)->as.heap.aux.capa = capa; } -#define str_make_independent(str) str_make_independent_expand((str), 0L) - void rb_str_modify(VALUE str) { @@ -1333,22 +2136,21 @@ rb_str_modify(VALUE str) void rb_str_modify_expand(VALUE str, long expand) { + int termlen = TERM_LEN(str); + long len = RSTRING_LEN(str); + if (expand < 0) { rb_raise(rb_eArgError, "negative expanding string size"); } + if (expand > LONG_MAX - len) { + rb_raise(rb_eArgError, "string size too big"); + } + if (!str_independent(str)) { - str_make_independent_expand(str, expand); + str_make_independent_expand(str, len, expand, termlen); } else if (expand > 0) { - long len = RSTRING_LEN(str); - long capa = len + expand; - if (!STR_EMBED_P(str)) { - REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1); - RSTRING(str)->as.heap.aux.capa = capa; - } - else if (capa > RSTRING_EMBED_LEN_MAX) { - str_make_independent_expand(str, expand); - } + RESIZE_CAPA_TERM(str, len + expand, termlen); } ENC_CODERANGE_CLEAR(str); } @@ -1368,55 +2170,14 @@ static inline void str_discard(VALUE str) { str_modifiable(str); - if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) { - xfree(RSTRING_PTR(str)); + if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) { + ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str)); RSTRING(str)->as.heap.ptr = 0; RSTRING(str)->as.heap.len = 0; } } void 
-rb_str_associate(VALUE str, VALUE add) -{ - /* sanity check */ - rb_check_frozen(str); - if (STR_ASSOC_P(str)) { - /* already associated */ - rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add); - } - else { - if (STR_SHARED_P(str)) { - VALUE assoc = RSTRING(str)->as.heap.aux.shared; - str_make_independent(str); - if (STR_ASSOC_P(assoc)) { - assoc = RSTRING(assoc)->as.heap.aux.shared; - rb_ary_concat(assoc, add); - add = assoc; - } - } - else if (STR_EMBED_P(str)) { - str_make_independent(str); - } - else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) { - RESIZE_CAPA(str, RSTRING_LEN(str)); - } - FL_SET(str, STR_ASSOC); - RBASIC(add)->klass = 0; - RSTRING(str)->as.heap.aux.shared = add; - } -} - -VALUE -rb_str_associated(VALUE str) -{ - if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared; - if (STR_ASSOC_P(str)) { - return RSTRING(str)->as.heap.aux.shared; - } - return Qfalse; -} - -void rb_must_asciicompat(VALUE str) { rb_encoding *enc = rb_enc_get(str); @@ -1443,28 +2204,131 @@ rb_string_value_ptr(volatile VALUE *ptr) return RSTRING_PTR(str); } -char * -rb_string_value_cstr(volatile VALUE *ptr) +static int +zero_filled(const char *s, int n) +{ + for (; n > 0; --n) { + if (*s++) return 0; + } + return 1; +} + +static const char * +str_null_char(const char *s, long len, const int minlen, rb_encoding *enc) +{ + const char *e = s + len; + + for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) { + if (zero_filled(s, minlen)) return s; + } + return 0; +} + +static char * +str_fill_term(VALUE str, char *s, long len, int termlen) +{ + /* This function assumes that (capa + termlen) bytes of memory + * is allocated, like many other functions in this file. 
+ */ + if (str_dependent_p(str)) { + if (!zero_filled(s + len, termlen)) + str_make_independent_expand(str, len, 0L, termlen); + } + else { + TERM_FILL(s + len, termlen); + return s; + } + return RSTRING_PTR(str); +} + +void +rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen) +{ + long capa = str_capacity(str, oldtermlen) + oldtermlen; + long len = RSTRING_LEN(str); + + assert(capa >= len); + if (capa - len < termlen) { + rb_check_lockedtmp(str); + str_make_independent_expand(str, len, 0L, termlen); + } + else if (str_dependent_p(str)) { + if (termlen > oldtermlen) + str_make_independent_expand(str, len, 0L, termlen); + } + else { + if (!STR_EMBED_P(str)) { + /* modify capa instead of realloc */ + assert(!FL_TEST((str), STR_SHARED)); + RSTRING(str)->as.heap.aux.capa = capa - termlen; + } + if (termlen > oldtermlen) { + TERM_FILL(RSTRING_PTR(str) + len, termlen); + } + } + + return; +} + +static char * +str_null_check(VALUE str, int *w) { - VALUE str = rb_string_value(ptr); char *s = RSTRING_PTR(str); long len = RSTRING_LEN(str); + rb_encoding *enc = rb_enc_get(str); + const int minlen = rb_enc_mbminlen(enc); + if (minlen > 1) { + *w = 1; + if (str_null_char(s, len, minlen, enc)) { + return NULL; + } + return str_fill_term(str, s, len, minlen); + } + *w = 0; if (!s || memchr(s, 0, len)) { - rb_raise(rb_eArgError, "string contains null byte"); + return NULL; } if (s[len]) { - rb_str_modify(str); - s = RSTRING_PTR(str); - s[RSTRING_LEN(str)] = 0; + s = str_fill_term(str, s, len, minlen); + } + return s; +} + +char * +rb_str_to_cstr(VALUE str) +{ + int w; + return str_null_check(str, &w); +} + +char * +rb_string_value_cstr(volatile VALUE *ptr) +{ + VALUE str = rb_string_value(ptr); + int w; + char *s = str_null_check(str, &w); + if (!s) { + if (w) { + rb_raise(rb_eArgError, "string contains null char"); + } + rb_raise(rb_eArgError, "string contains null byte"); } return s; } +char * +rb_str_fill_terminator(VALUE str, const int 
newminlen) +{ + char *s = RSTRING_PTR(str); + long len = RSTRING_LEN(str); + return str_fill_term(str, s, len, newminlen); +} + VALUE rb_check_string_type(VALUE str) { - str = rb_check_convert_type(str, T_STRING, "String", "to_str"); + str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str); return str; } @@ -1575,11 +2439,11 @@ static char * str_utf8_nth(const char *p, const char *e, long *nthp) { long nth = *nthp; - if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) { - const VALUE *s, *t; - const VALUE lowbits = sizeof(VALUE) - 1; - s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits)); - t = (const VALUE*)(~lowbits & (VALUE)e); + if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) { + const uintptr_t *s, *t; + const uintptr_t lowbits = SIZEOF_VOIDP - 1; + s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits)); + t = (const uintptr_t*)(~lowbits & (uintptr_t)e); while (p < (const char *)s) { if (is_utf8_lead_byte(*p)) nth--; p++; @@ -1587,7 +2451,7 @@ str_utf8_nth(const char *p, const char *e, long *nthp) do { nth -= count_utf8_lead_bytes_with_word(s); s++; - } while (s < t && (int)sizeof(VALUE) <= nth); + } while (s < t && (int)SIZEOF_VOIDP <= nth); p = (char *)s; } while (p < e) { @@ -1626,13 +2490,16 @@ rb_str_subseq(VALUE str, long beg, long len) { VALUE str2; - if (RSTRING_LEN(str) == beg + len && - RSTRING_EMBED_LEN_MAX < len) { - str2 = rb_str_new_shared(rb_str_new_frozen(str)); - rb_str_drop_bytes(str2, beg); + if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) && + SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) { + long olen; + str2 = rb_str_new_shared(rb_str_new_frozen(str)); + RSTRING(str2)->as.heap.ptr += beg; + olen = RSTRING(str2)->as.heap.len; + if (olen > len) RSTRING(str2)->as.heap.len = len; } else { - str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len); + str2 = rb_str_new_with_class(str, RSTRING_PTR(str)+beg, len); RB_GC_GUARD(str); } @@ -1642,7 +2509,7 @@ rb_str_subseq(VALUE str, long beg, long len) 
return str2; } -static char * +char * rb_str_subpos(VALUE str, long beg, long *lenp) { long len = *lenp; @@ -1661,7 +2528,7 @@ rb_str_subpos(VALUE str, long beg, long *lenp) beg += blen; if (beg < 0) return 0; } - if (beg + len > blen) + if (len > blen - beg) len = blen - beg; if (len < 0) return 0; p = s + beg; @@ -1691,7 +2558,7 @@ rb_str_subpos(VALUE str, long beg, long *lenp) return 0; } if (len == 0) { - if (beg > str_strlen(str, enc)) return 0; + if (beg > str_strlen(str, enc)) return 0; /* str's enc */ p = s + beg; } #ifdef NONASCII_MASK @@ -1727,25 +2594,37 @@ rb_str_subpos(VALUE str, long beg, long *lenp) return p; } +static VALUE str_substr(VALUE str, long beg, long len, int empty); + VALUE rb_str_substr(VALUE str, long beg, long len) { + return str_substr(str, beg, len, TRUE); +} + +static VALUE +str_substr(VALUE str, long beg, long len, int empty) +{ VALUE str2; char *p = rb_str_subpos(str, beg, &len); if (!p) return Qnil; - if (len > RSTRING_EMBED_LEN_MAX && p + len == RSTRING_END(str)) { - str2 = rb_str_new4(str); - str2 = str_new3(rb_obj_class(str2), str2); - RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len; + if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) && + SHARABLE_SUBSTRING_P(p, len, RSTRING_END(str))) { + long ofs = p - RSTRING_PTR(str); + str2 = rb_str_new_frozen(str); + str2 = str_new_shared(rb_obj_class(str2), str2); + RSTRING(str2)->as.heap.ptr += ofs; RSTRING(str2)->as.heap.len = len; + ENC_CODERANGE_CLEAR(str2); } else { - str2 = rb_str_new5(str, p, len); - rb_enc_cr_str_copy_for_substr(str2, str); + if (!len && !empty) return Qnil; + str2 = rb_str_new_with_class(str, p, len); OBJ_INFECT(str2, str); RB_GC_GUARD(str); } + rb_enc_cr_str_copy_for_substr(str2, str); return str2; } @@ -1753,13 +2632,46 @@ rb_str_substr(VALUE str, long beg, long len) VALUE rb_str_freeze(VALUE str) { - if (STR_ASSOC_P(str)) { - VALUE ary = RSTRING(str)->as.heap.aux.shared; - OBJ_FREEZE(ary); - } + if (OBJ_FROZEN(str)) return str; + rb_str_resize(str, 
RSTRING_LEN(str)); return rb_obj_freeze(str); } + +/* + * call-seq: + * +str -> str (mutable) + * + * If the string is frozen, then return duplicated mutable string. + * + * If the string is not frozen, then return the string itself. + */ +static VALUE +str_uplus(VALUE str) +{ + if (OBJ_FROZEN(str)) { + return rb_str_dup(str); + } + else { + return str; + } +} + +/* + * call-seq: + * -str -> str (frozen) + * + * Returns a frozen, possibly pre-existing copy of the string. + * + * The string will be deduplicated as long as it is not tainted, + * or has any instance variables set on it. + */ +static VALUE +str_uminus(VALUE str) +{ + return rb_fstring(str); +} + RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str)) #define rb_str_dup_frozen rb_str_new_frozen @@ -1783,20 +2695,28 @@ rb_str_unlocktmp(VALUE str) return str; } +RUBY_FUNC_EXPORTED VALUE +rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg) +{ + rb_str_locktmp(str); + return rb_ensure(func, arg, rb_str_unlocktmp, str); +} + void rb_str_set_len(VALUE str, long len) { long capa; + const int termlen = TERM_LEN(str); str_modifiable(str); if (STR_SHARED_P(str)) { rb_raise(rb_eRuntimeError, "can't set length of shared string"); } - if (len > (capa = (long)rb_str_capacity(str))) { + if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) { rb_bug("probable buffer overflow: %ld for %ld", len, capa); } STR_SET_LEN(str, len); - RSTRING_PTR(str)[len] = '\0'; + TERM_FILL(&RSTRING_PTR(str)[len], termlen); } VALUE @@ -1812,37 +2732,42 @@ rb_str_resize(VALUE str, long len) independent = str_independent(str); ENC_CODERANGE_CLEAR(str); slen = RSTRING_LEN(str); - if (len != slen) { + + { + long capa; + const int termlen = TERM_LEN(str); if (STR_EMBED_P(str)) { - if (len <= RSTRING_EMBED_LEN_MAX) { + if (len == slen) return str; + if (STR_EMBEDDABLE_P(len, termlen)) { STR_SET_EMBED_LEN(str, len); - RSTRING(str)->as.ary[len] = '\0'; + TERM_FILL(RSTRING(str)->as.ary + len, termlen); 
return str; } - str_make_independent_expand(str, len - slen); - STR_SET_NOEMBED(str); + str_make_independent_expand(str, slen, len - slen, termlen); } - else if (len <= RSTRING_EMBED_LEN_MAX) { - char *ptr = RSTRING(str)->as.heap.ptr; + else if (STR_EMBEDDABLE_P(len, termlen)) { + char *ptr = STR_HEAP_PTR(str); STR_SET_EMBED(str); if (slen > len) slen = len; if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen); - RSTRING(str)->as.ary[len] = '\0'; + TERM_FILL(RSTRING(str)->as.ary + len, termlen); STR_SET_EMBED_LEN(str, len); - if (independent) xfree(ptr); + if (independent) ruby_xfree(ptr); return str; } else if (!independent) { - str_make_independent_expand(str, len - slen); + if (len == slen) return str; + str_make_independent_expand(str, slen, len - slen, termlen); } - else if (slen < len || slen - len > 1024) { - REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1); - } - if (!STR_NOCAPA_P(str)) { + else if ((capa = RSTRING(str)->as.heap.aux.capa) < len || + (capa - len) > (len < 1024 ? 
len : 1024)) { + SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, + (size_t)len + termlen, STR_HEAP_SIZE(str)); RSTRING(str)->as.heap.aux.capa = len; } + else if (len == slen) return str; RSTRING(str)->as.heap.len = len; - RSTRING(str)->as.heap.ptr[len] = '\0'; /* sentinel */ + TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */ } return str; } @@ -1850,43 +2775,47 @@ rb_str_resize(VALUE str, long len) static VALUE str_buf_cat(VALUE str, const char *ptr, long len) { - long capa, total, off = -1; + long capa, total, olen, off = -1; + char *sptr; + const int termlen = TERM_LEN(str); + assert(termlen < RSTRING_EMBED_LEN_MAX + 1); /* < (LONG_MAX/2) */ - if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) { - off = ptr - RSTRING_PTR(str); + RSTRING_GETMEM(str, sptr, olen); + if (ptr >= sptr && ptr <= sptr + olen) { + off = ptr - sptr; } rb_str_modify(str); if (len == 0) return 0; - if (STR_ASSOC_P(str)) { - FL_UNSET(str, STR_ASSOC); - capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str); - } - else if (STR_EMBED_P(str)) { - capa = RSTRING_EMBED_LEN_MAX; + if (STR_EMBED_P(str)) { + capa = RSTRING_EMBED_LEN_MAX + 1 - termlen; + sptr = RSTRING(str)->as.ary; + olen = RSTRING_EMBED_LEN(str); } else { capa = RSTRING(str)->as.heap.aux.capa; + sptr = RSTRING(str)->as.heap.ptr; + olen = RSTRING(str)->as.heap.len; } - if (RSTRING_LEN(str) >= LONG_MAX - len) { + if (olen > LONG_MAX - len) { rb_raise(rb_eArgError, "string sizes too big"); } - total = RSTRING_LEN(str)+len; - if (capa <= total) { + total = olen + len; + if (capa < total) { + if (total >= LONG_MAX / 2) { + capa = total; + } while (total > capa) { - if (capa + 1 >= LONG_MAX / 2) { - capa = (total + 4095) / 4096; - break; - } - capa = (capa + 1) * 2; + capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */ } - RESIZE_CAPA(str, capa); + RESIZE_CAPA_TERM(str, capa, termlen); + sptr = RSTRING_PTR(str); } if (off != -1) { - ptr = RSTRING_PTR(str) + off; + ptr = sptr + off; } - 
memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len); + memcpy(sptr + olen, ptr, len); STR_SET_LEN(str, total); - RSTRING_PTR(str)[total] = '\0'; /* sentinel */ + TERM_FILL(sptr + total, termlen); /* sentinel */ return str; } @@ -1894,7 +2823,7 @@ str_buf_cat(VALUE str, const char *ptr, long len) #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr)) VALUE -rb_str_buf_cat(VALUE str, const char *ptr, long len) +rb_str_cat(VALUE str, const char *ptr, long len) { if (len == 0) return str; if (len < 0) { @@ -1904,35 +2833,15 @@ rb_str_buf_cat(VALUE str, const char *ptr, long len) } VALUE -rb_str_buf_cat2(VALUE str, const char *ptr) +rb_str_cat_cstr(VALUE str, const char *ptr) { + must_not_null(ptr); return rb_str_buf_cat(str, ptr, strlen(ptr)); } -VALUE -rb_str_cat(VALUE str, const char *ptr, long len) -{ - if (len < 0) { - rb_raise(rb_eArgError, "negative string size (or size too big)"); - } - if (STR_ASSOC_P(str)) { - char *p; - rb_str_modify_expand(str, len); - p = RSTRING(str)->as.heap.ptr; - memcpy(p + RSTRING(str)->as.heap.len, ptr, len); - len = RSTRING(str)->as.heap.len += len; - p[len] = '\0'; /* sentinel */ - return str; - } - - return rb_str_buf_cat(str, ptr, len); -} - -VALUE -rb_str_cat2(VALUE str, const char *ptr) -{ - return rb_str_cat(str, ptr, strlen(ptr)); -} +RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len)) +RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr)) +RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr)) static VALUE rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len, @@ -1941,19 +2850,18 @@ rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len, int str_encindex = ENCODING_GET(str); int res_encindex; int str_cr, res_cr; + rb_encoding *str_enc, *ptr_enc; - str_cr = ENC_CODERANGE(str); + str_cr = RSTRING_LEN(str) ? 
ENC_CODERANGE(str) : ENC_CODERANGE_7BIT; if (str_encindex == ptr_encindex) { - if (str_cr == ENC_CODERANGE_UNKNOWN) - ptr_cr = ENC_CODERANGE_UNKNOWN; - else if (ptr_cr == ENC_CODERANGE_UNKNOWN) { + if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) { ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex)); } } else { - rb_encoding *str_enc = rb_enc_from_index(str_encindex); - rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex); + str_enc = rb_enc_from_index(str_encindex); + ptr_enc = rb_enc_from_index(ptr_encindex); if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) { if (len == 0) return str; @@ -1979,10 +2887,11 @@ rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len, if (str_encindex != ptr_encindex && str_cr != ENC_CODERANGE_7BIT && ptr_cr != ENC_CODERANGE_7BIT) { + str_enc = rb_enc_from_index(str_encindex); + ptr_enc = rb_enc_from_index(ptr_encindex); incompatible: rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s", - rb_enc_name(rb_enc_from_index(str_encindex)), - rb_enc_name(rb_enc_from_index(ptr_encindex))); + rb_enc_name(str_enc), rb_enc_name(ptr_enc)); } if (str_cr == ENC_CODERANGE_UNKNOWN) { @@ -2001,7 +2910,7 @@ rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len, } else if (str_cr == ENC_CODERANGE_VALID) { res_encindex = str_encindex; - if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID) + if (ENC_CODERANGE_CLEAN_P(ptr_cr)) res_cr = str_cr; else res_cr = ptr_cr; @@ -2070,51 +2979,112 @@ rb_str_buf_append(VALUE str, VALUE str2) VALUE rb_str_append(VALUE str, VALUE str2) { - rb_encoding *enc; - int cr, cr2; - long len2; - StringValue(str2); - if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) { - long len = RSTRING_LEN(str) + len2; - enc = rb_enc_check(str, str2); - cr = ENC_CODERANGE(str); - if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2; - rb_str_modify_expand(str, len2); - memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len, - 
RSTRING_PTR(str2), len2+1); - RSTRING(str)->as.heap.len = len; - rb_enc_associate(str, enc); - ENC_CODERANGE_SET(str, cr); - OBJ_INFECT(str, str2); - return str; - } return rb_str_buf_append(str, str2); } +#define MIN_PRE_ALLOC_SIZE 48 + +MJIT_FUNC_EXPORTED VALUE +rb_str_concat_literals(size_t num, const VALUE *strary) +{ + VALUE str; + size_t i, s; + long len = 1; + + if (UNLIKELY(!num)) return rb_str_new(0, 0); + if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]); + + for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); } + if (LIKELY(len < MIN_PRE_ALLOC_SIZE)) { + str = rb_str_resurrect(strary[0]); + s = 1; + } + else { + str = rb_str_buf_new(len); + rb_enc_copy(str, strary[0]); + s = 0; + } + + for (i = s; i < num; ++i) { + const VALUE v = strary[i]; + int encidx = ENCODING_GET(v); + + rb_enc_cr_str_buf_cat(str, RSTRING_PTR(v), RSTRING_LEN(v), + encidx, ENC_CODERANGE(v), NULL); + OBJ_INFECT_RAW(str, v); + if (encidx != ENCINDEX_US_ASCII) { + if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII) + rb_enc_set_index(str, encidx); + } + } + return str; +} + /* * call-seq: - * str << integer -> str - * str.concat(integer) -> str - * str << obj -> str - * str.concat(obj) -> str + * str.concat(obj1, obj2, ...) -> str * - * Append---Concatenates the given object to <i>str</i>. If the object is a - * <code>Integer</code>, it is considered as a codepoint, and is converted + * Concatenates the given object(s) to <i>str</i>. If an object is an + * <code>Integer</code>, it is considered a codepoint and converted * to a character before concatenation. * + * +concat+ can take multiple arguments, and all the arguments are + * concatenated in order. + * * a = "hello " - * a << "world" #=> "hello world" - * a.concat(33) #=> "hello world!" + * a.concat("world", 33) #=> "hello world!" + * a #=> "hello world!" + * + * b = "sn" + * b.concat("_", b, "_", b) #=> "sn_sn_sn" + * + * See also String#<<, which takes a single argument. 
*/ +static VALUE +rb_str_concat_multi(int argc, VALUE *argv, VALUE str) +{ + str_modifiable(str); + + if (argc == 1) { + return rb_str_concat(str, argv[0]); + } + else if (argc > 1) { + int i; + VALUE arg_str = rb_str_tmp_new(0); + rb_enc_copy(arg_str, str); + for (i = 0; i < argc; i++) { + rb_str_concat(arg_str, argv[i]); + } + rb_str_buf_append(str, arg_str); + } + return str; +} + +/* + * call-seq: + * str << obj -> str + * str << integer -> str + * + * Appends the given object to <i>str</i>. If the object is an + * <code>Integer</code>, it is considered a codepoint and converted + * to a character before being appended. + * + * a = "hello " + * a << "world" #=> "hello world" + * a << 33 #=> "hello world!" + * + * See also String#concat, which takes multiple arguments. + */ VALUE rb_str_concat(VALUE str1, VALUE str2) { unsigned int code; rb_encoding *enc = STR_ENC_GET(str1); + int encidx; - if (FIXNUM_P(str2) || RB_TYPE_P(str2, T_BIGNUM)) { + if (RB_INTEGER_TYPE_P(str2)) { if (rb_num_to_uint(str2, &code) == 0) { } else if (FIXNUM_P(str2)) { @@ -2128,7 +3098,8 @@ rb_str_concat(VALUE str1, VALUE str2) return rb_str_append(str1, str2); } - if (enc == rb_usascii_encoding()) { + encidx = rb_enc_to_index(enc); + if (encidx == ENCINDEX_ASCII || encidx == ENCINDEX_US_ASCII) { /* US-ASCII automatically extended to ASCII-8BIT */ char buf[1]; buf[0] = (char)code; @@ -2136,8 +3107,8 @@ rb_str_concat(VALUE str1, VALUE str2) rb_raise(rb_eRangeError, "%u out of char range", code); } rb_str_cat(str1, buf, 1); - if (code > 127) { - rb_enc_associate(str1, rb_ascii8bit_encoding()); + if (encidx == ENCINDEX_US_ASCII && code > 127) { + rb_enc_associate_index(str1, ENCINDEX_ASCII); ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID); } } @@ -2172,28 +3143,36 @@ rb_str_concat(VALUE str1, VALUE str2) /* * call-seq: - * str.prepend(other_str) -> str + * str.prepend(other_str1, other_str2, ...) -> str * - * Prepend---Prepend the given string to <i>str</i>. 
+ * Prepend---Prepend the given strings to <i>str</i>. * - * a = "world" - * a.prepend("hello ") #=> "hello world" - * a #=> "hello world" + * a = "!" + * a.prepend("hello ", "world") #=> "hello world!" + * a #=> "hello world!" + * + * See also String#concat. */ static VALUE -rb_str_prepend(VALUE str, VALUE str2) +rb_str_prepend_multi(int argc, VALUE *argv, VALUE str) { - StringValue(str2); - StringValue(str); - rb_str_update(str, 0L, 0L, str2); - return str; -} + str_modifiable(str); -st_index_t -rb_memhash(const void *ptr, long len) -{ - return st_hash(ptr, len, rb_hash_start((st_index_t)len)); + if (argc == 1) { + rb_str_update(str, 0L, 0L, argv[0]); + } + else if (argc > 1) { + int i; + VALUE arg_str = rb_str_tmp_new(0); + rb_enc_copy(arg_str, str); + for (i = 0; i < argc; i++) { + rb_str_append(arg_str, argv[i]); + } + rb_str_update(str, 0L, 0L, arg_str); + } + + return str; } st_index_t @@ -2209,28 +3188,29 @@ rb_str_hash(VALUE str) int rb_str_hash_cmp(VALUE str1, VALUE str2) { - long len; - - if (!rb_str_comparable(str1, str2)) return 1; - if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) && - memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) { - return 0; - } - return 1; + long len1, len2; + const char *ptr1, *ptr2; + RSTRING_GETMEM(str1, ptr1, len1); + RSTRING_GETMEM(str2, ptr2, len2); + return (len1 != len2 || + !rb_str_comparable(str1, str2) || + memcmp(ptr1, ptr2, len1) != 0); } /* * call-seq: - * str.hash -> fixnum + * str.hash -> integer + * + * Returns a hash based on the string's length, content and encoding. * - * Return a hash based on the string's length and content. + * See also Object#hash. 
*/ static VALUE rb_str_hash_m(VALUE str) { st_index_t hval = rb_str_hash(str); - return INT2FIX(hval); + return ST2FIX(hval); } #define lesser(a,b) (((a)>(b))?(b):(a)) @@ -2301,13 +3281,18 @@ str_eql(const VALUE str1, const VALUE str2) return Qtrue; return Qfalse; } + /* * call-seq: - * str == obj -> true or false + * str == obj -> true or false + * str === obj -> true or false + * + * Equality---Returns whether +str+ == +obj+, similar to Object#==. * - * Equality---If <i>obj</i> is not a <code>String</code>, returns - * <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i> - * <code><=></code> <i>obj</i> returns zero. + * If +obj+ is not an instance of String but responds to +to_str+, then the + * two strings are compared using <code>obj.==</code>. + * + * Otherwise, returns similarly to String#eql?, comparing length and content. */ VALUE @@ -2315,7 +3300,7 @@ rb_str_equal(VALUE str1, VALUE str2) { if (str1 == str2) return Qtrue; if (!RB_TYPE_P(str2, T_STRING)) { - if (!rb_respond_to(str2, rb_intern("to_str"))) { + if (!rb_respond_to(str2, idTo_str)) { return Qfalse; } return rb_equal(str2, str1); @@ -2330,7 +3315,7 @@ rb_str_equal(VALUE str1, VALUE str2) * Two strings are equal if they have the same length and content. */ -static VALUE +MJIT_FUNC_EXPORTED VALUE rb_str_eql(VALUE str1, VALUE str2) { if (str1 == str2) return Qtrue; @@ -2340,75 +3325,81 @@ rb_str_eql(VALUE str1, VALUE str2) /* * call-seq: - * str <=> other_str -> -1, 0, +1 or nil + * string <=> other_string -> -1, 0, +1, or nil + * + * Comparison---Returns -1, 0, +1, or +nil+ depending on whether +string+ is + * less than, equal to, or greater than +other_string+. * - * Comparison---Returns -1 if <i>other_str</i> is greater than, 0 if - * <i>other_str</i> is equal to, and +1 if <i>other_str</i> is less than - * <i>str</i>. 
If the strings are of different lengths, and the strings are - * equal when compared up to the shortest length, then the longer string is - * considered greater than the shorter one. In older versions of Ruby, setting - * <code>$=</code> allowed case-insensitive comparisons; this is now deprecated - * in favor of using <code>String#casecmp</code>. + * +nil+ is returned if the two values are incomparable. + * + * If the strings are of different lengths, and the strings are equal when + * compared up to the shortest length, then the longer string is considered + * greater than the shorter one. * * <code><=></code> is the basis for the methods <code><</code>, - * <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>, - * included from module <code>Comparable</code>. The method - * <code>String#==</code> does not use <code>Comparable#==</code>. + * <code><=</code>, <code>></code>, <code>>=</code>, and + * <code>between?</code>, included from module Comparable. The method + * String#== does not use Comparable#==. 
* * "abcdef" <=> "abcde" #=> 1 * "abcdef" <=> "abcdef" #=> 0 * "abcdef" <=> "abcdefg" #=> -1 * "abcdef" <=> "ABCDEF" #=> 1 + * "abcdef" <=> 1 #=> nil */ static VALUE rb_str_cmp_m(VALUE str1, VALUE str2) { - long result; - - if (!RB_TYPE_P(str2, T_STRING)) { - if (!rb_respond_to(str2, rb_intern("to_str"))) { - return Qnil; - } - else if (!rb_respond_to(str2, rb_intern("<=>"))) { - return Qnil; - } - else { - VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1); - - if (NIL_P(tmp)) return Qnil; - if (!FIXNUM_P(tmp)) { - return rb_funcall(LONG2FIX(0), '-', 1, tmp); - } - result = -FIX2LONG(tmp); - } + int result; + VALUE s = rb_check_string_type(str2); + if (NIL_P(s)) { + return rb_invcmp(str1, str2); } - else { - result = rb_str_cmp(str1, str2); - } - return LONG2NUM(result); + result = rb_str_cmp(str1, s); + return INT2FIX(result); } +static VALUE str_casecmp(VALUE str1, VALUE str2); +static VALUE str_casecmp_p(VALUE str1, VALUE str2); + /* * call-seq: - * str.casecmp(other_str) -> -1, 0, +1 or nil + * str.casecmp(other_str) -> -1, 0, +1, or nil * * Case-insensitive version of <code>String#<=></code>. + * Currently, case-insensitivity only works on characters A-Z/a-z, + * not all of Unicode. This is different from String#casecmp?. * - * "abcdef".casecmp("abcde") #=> 1 + * "aBcDeF".casecmp("abcde") #=> 1 * "aBcDeF".casecmp("abcdef") #=> 0 - * "abcdef".casecmp("abcdefg") #=> -1 + * "aBcDeF".casecmp("abcdefg") #=> -1 * "abcdef".casecmp("ABCDEF") #=> 0 + * + * +nil+ is returned if the two strings have incompatible encodings, + * or if +other_str+ is not a string. 
+ * + * "foo".casecmp(2) #=> nil + * "\u{e4 f6 fc}".encode("ISO-8859-1").casecmp("\u{c4 d6 dc}") #=> nil */ static VALUE rb_str_casecmp(VALUE str1, VALUE str2) { + VALUE s = rb_check_string_type(str2); + if (NIL_P(s)) { + return Qnil; + } + return str_casecmp(str1, s); +} + +static VALUE +str_casecmp(VALUE str1, VALUE str2) +{ long len; rb_encoding *enc; char *p1, *p1end, *p2, *p2end; - StringValue(str2); enc = rb_enc_compatible(str1, str2); if (!enc) { return Qnil; @@ -2459,54 +3450,119 @@ rb_str_casecmp(VALUE str1, VALUE str2) return INT2FIX(-1); } -static long -rb_str_index(VALUE str, VALUE sub, long offset) +/* + * call-seq: + * str.casecmp?(other_str) -> true, false, or nil + * + * Returns +true+ if +str+ and +other_str+ are equal after + * Unicode case folding, +false+ if they are not equal. + * + * "aBcDeF".casecmp?("abcde") #=> false + * "aBcDeF".casecmp?("abcdef") #=> true + * "aBcDeF".casecmp?("abcdefg") #=> false + * "abcdef".casecmp?("ABCDEF") #=> true + * "\u{e4 f6 fc}".casecmp?("\u{c4 d6 dc}") #=> true + * + * +nil+ is returned if the two strings have incompatible encodings, + * or if +other_str+ is not a string. 
+ * + * "foo".casecmp?(2) #=> nil + * "\u{e4 f6 fc}".encode("ISO-8859-1").casecmp?("\u{c4 d6 dc}") #=> nil + */ + +static VALUE +rb_str_casecmp_p(VALUE str1, VALUE str2) +{ + VALUE s = rb_check_string_type(str2); + if (NIL_P(s)) { + return Qnil; + } + return str_casecmp_p(str1, s); +} + +static VALUE +str_casecmp_p(VALUE str1, VALUE str2) { - long pos; - char *s, *sptr, *e; - long len, slen; rb_encoding *enc; + VALUE folded_str1, folded_str2; + VALUE fold_opt = sym_fold; - enc = rb_enc_check(str, sub); - if (is_broken_string(sub)) { - return -1; - } - len = str_strlen(str, enc); - slen = str_strlen(sub, enc); - if (offset < 0) { - offset += len; - if (offset < 0) return -1; - } - if (len - offset < slen) return -1; - s = RSTRING_PTR(str); - e = s + RSTRING_LEN(str); - if (offset) { - offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str)); - s += offset; + enc = rb_enc_compatible(str1, str2); + if (!enc) { + return Qnil; } - if (slen == 0) return offset; - /* need proceed one character at a time */ - sptr = RSTRING_PTR(sub); - slen = RSTRING_LEN(sub); - len = RSTRING_LEN(str) - offset; + + folded_str1 = rb_str_downcase(1, &fold_opt, str1); + folded_str2 = rb_str_downcase(1, &fold_opt, str2); + + return rb_str_eql(folded_str1, folded_str2); +} + +static long +strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len, + const char *sub_ptr, long sub_len, long offset, rb_encoding *enc) +{ + const char *search_start = str_ptr; + long pos, search_len = str_len - offset; + for (;;) { - char *t; - pos = rb_memsearch(sptr, slen, s, len, enc); + const char *t; + pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc); if (pos < 0) return pos; - t = rb_enc_right_char_head(s, s+pos, e, enc); - if (t == s + pos) break; - if ((len -= t - s) <= 0) return -1; - offset += t - s; - s = t; + t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc); + if (t == search_start + pos) break; + search_len -= t - 
search_start; + if (search_len <= 0) return -1; + offset += t - search_start; + search_start = t; } return pos + offset; } +#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0) + +static long +rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte) +{ + const char *str_ptr, *str_ptr_end, *sub_ptr; + long str_len, sub_len; + int single_byte = single_byte_optimizable(str); + rb_encoding *enc; + + enc = rb_enc_check(str, sub); + if (is_broken_string(sub)) return -1; + + str_ptr = RSTRING_PTR(str); + str_ptr_end = RSTRING_END(str); + str_len = RSTRING_LEN(str); + sub_ptr = RSTRING_PTR(sub); + sub_len = RSTRING_LEN(sub); + + if (str_len < sub_len) return -1; + + if (offset != 0) { + long str_len_char, sub_len_char; + str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc); + sub_len_char = in_byte ? sub_len : str_strlen(sub, enc); + if (offset < 0) { + offset += str_len_char; + if (offset < 0) return -1; + } + if (str_len_char - offset < sub_len_char) return -1; + if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte); + str_ptr += offset; + } + if (sub_len == 0) return offset; + + /* need proceed one character at a time */ + return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc); +} + /* * call-seq: - * str.index(substring [, offset]) -> fixnum or nil - * str.index(regexp [, offset]) -> fixnum or nil + * str.index(substring [, offset]) -> integer or nil + * str.index(regexp [, offset]) -> integer or nil * * Returns the index of the first occurrence of the given <i>substring</i> or * pattern (<i>regexp</i>) in <i>str</i>. 
Returns <code>nil</code> if not @@ -2534,7 +3590,7 @@ rb_str_index_m(int argc, VALUE *argv, VALUE str) pos = 0; } if (pos < 0) { - pos += str_strlen(str, STR_ENC_GET(str)); + pos += str_strlen(str, NULL); if (pos < 0) { if (RB_TYPE_P(sub, T_REGEXP)) { rb_backref_set(Qnil); @@ -2546,7 +3602,7 @@ rb_str_index_m(int argc, VALUE *argv, VALUE str) if (SPECIAL_CONST_P(sub)) goto generic; switch (BUILTIN_TYPE(sub)) { case T_REGEXP: - if (pos > str_strlen(str, STR_ENC_GET(str))) + if (pos > str_strlen(str, NULL)) return Qnil; pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos, rb_enc_check(str, sub), single_byte_optimizable(str)); @@ -2577,33 +3633,50 @@ rb_str_index_m(int argc, VALUE *argv, VALUE str) return LONG2NUM(pos); } +#ifdef HAVE_MEMRCHR static long -rb_str_rindex(VALUE str, VALUE sub, long pos) +str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc) { - long len, slen; - char *s, *sbeg, *e, *t; - rb_encoding *enc; - int singlebyte = single_byte_optimizable(str); + char *hit, *adjusted; + int c; + long slen, searchlen; + char *sbeg, *e, *t; + + slen = RSTRING_LEN(sub); + if (slen == 0) return pos; + sbeg = RSTRING_PTR(str); + e = RSTRING_END(str); + t = RSTRING_PTR(sub); + c = *t & 0xff; + searchlen = s - sbeg + 1; + + do { + hit = memrchr(sbeg, c, searchlen); + if (!hit) break; + adjusted = rb_enc_left_char_head(sbeg, hit, e, enc); + if (hit != adjusted) { + searchlen = adjusted - sbeg; + continue; + } + if (memcmp(hit, t, slen) == 0) + return rb_str_sublen(str, hit - sbeg); + searchlen = adjusted - sbeg; + } while (searchlen > 0); + + return -1; +} +#else +static long +str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc) +{ + long slen; + char *sbeg, *e, *t; - enc = rb_enc_check(str, sub); - if (is_broken_string(sub)) { - return -1; - } - len = str_strlen(str, enc); - slen = str_strlen(sub, enc); - /* substring longer than string */ - if (len < slen) return -1; - if (len - pos < slen) { - pos = len - slen; - } - 
if (len == 0) { - return pos; - } sbeg = RSTRING_PTR(str); e = RSTRING_END(str); t = RSTRING_PTR(sub); slen = RSTRING_LEN(sub); - s = str_nth(sbeg, e, pos, enc, singlebyte); + while (s) { if (memcmp(s, t, slen) == 0) { return pos; @@ -2612,14 +3685,48 @@ rb_str_rindex(VALUE str, VALUE sub, long pos) pos--; s = rb_enc_prev_char(sbeg, s, e, enc); } + return -1; } +#endif + +static long +rb_str_rindex(VALUE str, VALUE sub, long pos) +{ + long len, slen; + char *sbeg, *s; + rb_encoding *enc; + int singlebyte; + + enc = rb_enc_check(str, sub); + if (is_broken_string(sub)) return -1; + singlebyte = single_byte_optimizable(str); + len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */ + slen = str_strlen(sub, enc); /* rb_enc_check */ + + /* substring longer than string */ + if (len < slen) return -1; + if (len - pos < slen) pos = len - slen; + if (len == 0) return pos; + + sbeg = RSTRING_PTR(str); + + if (pos == 0) { + if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0) + return 0; + else + return -1; + } + + s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte); + return str_rindex(str, sub, s, pos, enc); +} /* * call-seq: - * str.rindex(substring [, fixnum]) -> fixnum or nil - * str.rindex(regexp [, fixnum]) -> fixnum or nil + * str.rindex(substring [, integer]) -> integer or nil + * str.rindex(regexp [, integer]) -> integer or nil * * Returns the index of the last occurrence of the given <i>substring</i> or * pattern (<i>regexp</i>) in <i>str</i>. 
Returns <code>nil</code> if not @@ -2640,7 +3747,7 @@ rb_str_rindex_m(int argc, VALUE *argv, VALUE str) VALUE sub; VALUE vpos; rb_encoding *enc = STR_ENC_GET(str); - long pos, len = str_strlen(str, enc); + long pos, len = str_strlen(str, enc); /* str's enc */ if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) { pos = NUM2LONG(vpos); @@ -2664,12 +3771,10 @@ rb_str_rindex_m(int argc, VALUE *argv, VALUE str) case T_REGEXP: /* enc = rb_get_check(str, sub); */ pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos, - STR_ENC_GET(str), single_byte_optimizable(str)); + enc, single_byte_optimizable(str)); - if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) { - pos = rb_reg_search(sub, str, pos, 1); - pos = rb_str_sublen(str, pos); - } + pos = rb_reg_search(sub, str, pos, 1); + pos = rb_str_sublen(str, pos); if (pos >= 0) return LONG2NUM(pos); break; @@ -2695,7 +3800,7 @@ rb_str_rindex_m(int argc, VALUE *argv, VALUE str) /* * call-seq: - * str =~ obj -> fixnum or nil + * str =~ obj -> integer or nil * * Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match * against <i>str</i>,and returns the position the match starts, or @@ -2724,12 +3829,12 @@ rb_str_match(VALUE x, VALUE y) generic: default: - return rb_funcall(y, rb_intern("=~"), 1, x); + return rb_funcall(y, idEqTilde, 1, x); } } -static VALUE get_pat(VALUE, int); +static VALUE get_pat(VALUE); /* @@ -2745,6 +3850,7 @@ static VALUE get_pat(VALUE, int); * 'hello'.match('(.)\1') #=> #<MatchData "ll" 1:"l"> * 'hello'.match('(.)\1')[0] #=> "ll" * 'hello'.match(/(.)\1/)[0] #=> "ll" + * 'hello'.match(/(.)\1/, 3) #=> nil * 'hello'.match('xx') #=> nil * * If a block is given, invoke the block with MatchData if match succeed, so @@ -2769,13 +3875,39 @@ rb_str_match_m(int argc, VALUE *argv, VALUE str) rb_check_arity(argc, 1, 2); re = argv[0]; argv[0] = str; - result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv); + result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv); if 
(!NIL_P(result) && rb_block_given_p()) { return rb_yield(result); } return result; } +/* + * call-seq: + * str.match?(pattern) -> true or false + * str.match?(pattern, pos) -> true or false + * + * Converts _pattern_ to a +Regexp+ (if it isn't already one), then + * returns a +true+ or +false+ indicates whether the regexp is + * matched _str_ or not without updating <code>$~</code> and other + * related variables. If the second parameter is present, it + * specifies the position in the string to begin the search. + * + * "Ruby".match?(/R.../) #=> true + * "Ruby".match?(/R.../, 1) #=> false + * "Ruby".match?(/P.../) #=> false + * $& #=> nil + */ + +static VALUE +rb_str_match_m_p(int argc, VALUE *argv, VALUE str) +{ + VALUE re; + rb_check_arity(argc, 1, 2); + re = get_pat(argv[0]); + return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0); +} + enum neighbor_char { NEIGHBOR_NOT_CHAR, NEIGHBOR_FOUND, @@ -2787,6 +3919,24 @@ enc_succ_char(char *p, long len, rb_encoding *enc) { long i; int l; + + if (rb_enc_mbminlen(enc) > 1) { + /* wchar, trivial case */ + int r = rb_enc_precise_mbclen(p, p + len, enc), c; + if (!MBCLEN_CHARFOUND_P(r)) { + return NEIGHBOR_NOT_CHAR; + } + c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1; + l = rb_enc_code_to_mbclen(c, enc); + if (!l) return NEIGHBOR_NOT_CHAR; + if (l != len) return NEIGHBOR_WRAPPED; + rb_enc_mbcput(c, p, enc); + r = rb_enc_precise_mbclen(p, p + len, enc); + if (!MBCLEN_CHARFOUND_P(r)) { + return NEIGHBOR_NOT_CHAR; + } + return NEIGHBOR_FOUND; + } while (1) { for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--) p[i] = '\0'; @@ -2821,6 +3971,25 @@ enc_pred_char(char *p, long len, rb_encoding *enc) { long i; int l; + if (rb_enc_mbminlen(enc) > 1) { + /* wchar, trivial case */ + int r = rb_enc_precise_mbclen(p, p + len, enc), c; + if (!MBCLEN_CHARFOUND_P(r)) { + return NEIGHBOR_NOT_CHAR; + } + c = rb_enc_mbc_to_codepoint(p, p + len, enc); + if (!c) return NEIGHBOR_NOT_CHAR; + --c; + l = 
rb_enc_code_to_mbclen(c, enc); + if (!l) return NEIGHBOR_NOT_CHAR; + if (l != len) return NEIGHBOR_WRAPPED; + rb_enc_mbcput(c, p, enc); + r = rb_enc_precise_mbclen(p, p + len, enc); + if (!MBCLEN_CHARFOUND_P(r)) { + return NEIGHBOR_NOT_CHAR; + } + return NEIGHBOR_FOUND; + } while (1) { for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--) p[i] = '\xff'; @@ -2868,6 +4037,10 @@ enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry) int range; char save[ONIGENC_CODE_TO_MBC_MAXLEN]; + /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */ + int try; + const int max_gaps = 1; + c = rb_enc_mbc_to_codepoint(p, p+len, enc); if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc)) ctype = ONIGENC_CTYPE_DIGIT; @@ -2877,11 +4050,13 @@ enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry) return NEIGHBOR_NOT_CHAR; MEMCPY(save, p, char, len); - ret = enc_succ_char(p, len, enc); - if (ret == NEIGHBOR_FOUND) { - c = rb_enc_mbc_to_codepoint(p, p+len, enc); - if (rb_enc_isctype(c, ctype, enc)) - return NEIGHBOR_FOUND; + for (try = 0; try <= max_gaps; ++try) { + ret = enc_succ_char(p, len, enc); + if (ret == NEIGHBOR_FOUND) { + c = rb_enc_mbc_to_codepoint(p, p+len, enc); + if (rb_enc_isctype(c, ctype, enc)) + return NEIGHBOR_FOUND; + } } MEMCPY(p, save, char, len); range = 1; @@ -2916,6 +4091,8 @@ enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry) } +static VALUE str_succ(VALUE str); + /* * call-seq: * str.succ -> new_str @@ -2944,23 +4121,30 @@ enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry) VALUE rb_str_succ(VALUE orig) { - rb_encoding *enc; VALUE str; + str = rb_str_new_with_class(orig, RSTRING_PTR(orig), RSTRING_LEN(orig)); + rb_enc_cr_str_copy_for_substr(str, orig); + OBJ_INFECT(str, orig); + return str_succ(str); +} + +static VALUE +str_succ(VALUE str) +{ + rb_encoding *enc; char *sbeg, *s, *e, *last_alnum = 0; int c = -1; - long l; + long l, slen; char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1"; long 
carry_pos = 0, carry_len = 1; enum neighbor_char neighbor = NEIGHBOR_FOUND; - str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig)); - rb_enc_cr_str_copy_for_substr(str, orig); - OBJ_INFECT(str, orig); - if (RSTRING_LEN(str) == 0) return str; + slen = RSTRING_LEN(str); + if (slen == 0) return str; - enc = STR_ENC_GET(orig); + enc = STR_ENC_GET(str); sbeg = RSTRING_PTR(str); - s = e = sbeg + RSTRING_LEN(str); + s = e = sbeg + slen; while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) { if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) { @@ -2970,7 +4154,9 @@ rb_str_succ(VALUE orig) break; } } - if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue; + l = rb_enc_precise_mbclen(s, e, enc); + if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue; + l = ONIGENC_MBCLEN_CHARFOUND_LEN(l); neighbor = enc_succ_alnum_char(s, l, enc, carry); switch (neighbor) { case NEIGHBOR_NOT_CHAR: @@ -2989,10 +4175,23 @@ rb_str_succ(VALUE orig) s = e; while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) { enum neighbor_char neighbor; - if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue; - neighbor = enc_succ_char(s, l, enc); - if (neighbor == NEIGHBOR_FOUND) + char tmp[ONIGENC_CODE_TO_MBC_MAXLEN]; + l = rb_enc_precise_mbclen(s, e, enc); + if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue; + l = ONIGENC_MBCLEN_CHARFOUND_LEN(l); + MEMCPY(tmp, s, char, l); + neighbor = enc_succ_char(tmp, l, enc); + switch (neighbor) { + case NEIGHBOR_FOUND: + MEMCPY(s, tmp, char, l); return str; + break; + case NEIGHBOR_WRAPPED: + MEMCPY(s, tmp, char, l); + break; + case NEIGHBOR_NOT_CHAR: + break; + } if (rb_enc_precise_mbclen(s, s+l, enc) != l) { /* wrapped to \0...\0. search next valid char. 
*/ enc_succ_char(s, l, enc); @@ -3003,13 +4202,16 @@ rb_str_succ(VALUE orig) } carry_pos = s - sbeg; } + ENC_CODERANGE_SET(str, ENC_CODERANGE_UNKNOWN); } - RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len); - s = RSTRING_PTR(str) + carry_pos; - memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos); + RESIZE_CAPA(str, slen + carry_len); + sbeg = RSTRING_PTR(str); + s = sbeg + carry_pos; + memmove(s + carry_len, s, slen - carry_pos); memmove(s, carry, carry_len); - STR_SET_LEN(str, RSTRING_LEN(str) + carry_len); - RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; + slen += carry_len; + STR_SET_LEN(str, slen); + TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc)); rb_enc_str_coderange(str); return str; } @@ -3027,11 +4229,27 @@ rb_str_succ(VALUE orig) static VALUE rb_str_succ_bang(VALUE str) { - rb_str_shared_replace(str, rb_str_succ(str)); - + rb_str_modify(str); + str_succ(str); return str; } +static int +all_digits_p(const char *s, long len) +{ + while (len-- > 0) { + if (!ISDIGIT(*s)) return 0; + s++; + } + return 1; +} + +static int +str_upto_i(VALUE str, VALUE arg) +{ + rb_yield(str); + return 0; +} /* * call-seq: @@ -3069,14 +4287,20 @@ static VALUE rb_str_upto(int argc, VALUE *argv, VALUE beg) { VALUE end, exclusive; + + rb_scan_args(argc, argv, "11", &end, &exclusive); + RETURN_ENUMERATOR(beg, argc, argv); + return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil); +} + +VALUE +rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg) +{ VALUE current, after_end; ID succ; - int n, excl, ascii; + int n, ascii; rb_encoding *enc; - rb_scan_args(argc, argv, "11", &end, &exclusive); - RETURN_ENUMERATOR(beg, argc, argv); - excl = RTEST(exclusive); CONST_ID(succ, "succ"); StringValue(end); enc = rb_enc_check(beg, end); @@ -3088,7 +4312,7 @@ rb_str_upto(int argc, VALUE *argv, VALUE beg) if (c > e || (excl && c == e)) return beg; for (;;) { - rb_yield(rb_enc_str_new(&c, 1, enc)); + if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break; if 
(!excl && c == e) break; c++; if (excl && c == e) break; @@ -3096,22 +4320,13 @@ rb_str_upto(int argc, VALUE *argv, VALUE beg) return beg; } /* both edges are all digits */ - if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) { - char *s, *send; + if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) && + all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) && + all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) { VALUE b, e; int width; - s = RSTRING_PTR(beg); send = RSTRING_END(beg); - width = rb_long2int(send - s); - while (s < send) { - if (!ISDIGIT(*s)) goto no_digits; - s++; - } - s = RSTRING_PTR(end); send = RSTRING_END(end); - while (s < send) { - if (!ISDIGIT(*s)) goto no_digits; - s++; - } + width = RSTRING_LENINT(beg); b = rb_str_to_inum(beg, 10, FALSE); e = rb_str_to_inum(end, 10, FALSE); if (FIXNUM_P(b) && FIXNUM_P(e)) { @@ -3121,35 +4336,34 @@ rb_str_upto(int argc, VALUE *argv, VALUE beg) while (bi <= ei) { if (excl && bi == ei) break; - rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi)); + if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break; bi++; } } else { - ID op = excl ? '<' : rb_intern("<="); - VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d")); + ID op = excl ? 
'<' : idLE; + VALUE args[2], fmt = rb_fstring_lit("%.*d"); args[0] = INT2FIX(width); while (rb_funcall(b, op, 1, e)) { args[1] = b; - rb_yield(rb_str_format(numberof(args), args, fmt)); - b = rb_funcall(b, succ, 0, 0); + if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break; + b = rb_funcallv(b, succ, 0, 0); } } return beg; } /* normal case */ - no_digits: n = rb_str_cmp(beg, end); if (n > 0 || (excl && n == 0)) return beg; - after_end = rb_funcall(end, succ, 0, 0); + after_end = rb_funcallv(end, succ, 0, 0); current = rb_str_dup(beg); while (!rb_str_equal(current, after_end)) { VALUE next = Qnil; if (excl || !rb_str_equal(current, end)) - next = rb_funcall(current, succ, 0, 0); - rb_yield(current); + next = rb_funcallv(current, succ, 0, 0); + if ((*each)(current, arg)) break; if (NIL_P(next)) break; current = next; StringValue(current); @@ -3161,6 +4375,103 @@ rb_str_upto(int argc, VALUE *argv, VALUE beg) return beg; } +VALUE +rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg) +{ + VALUE current; + ID succ; + + CONST_ID(succ, "succ"); + /* both edges are all digits */ + if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) && + all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) { + VALUE b, args[2], fmt = rb_fstring_lit("%.*d"); + int width = RSTRING_LENINT(beg); + b = rb_str_to_inum(beg, 10, FALSE); + if (FIXNUM_P(b)) { + long bi = FIX2LONG(b); + rb_encoding *usascii = rb_usascii_encoding(); + + while (FIXABLE(bi)) { + if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break; + bi++; + } + b = LONG2NUM(bi); + } + args[0] = INT2FIX(width); + while (1) { + args[1] = b; + if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break; + b = rb_funcallv(b, succ, 0, 0); + } + } + /* normal case */ + current = rb_str_dup(beg); + while (1) { + VALUE next = rb_funcallv(current, succ, 0, 0); + if ((*each)(current, arg)) break; + current = next; + StringValue(current); + if (RSTRING_LEN(current) == 0) + break; + } + + 
return beg; +} + +static int +include_range_i(VALUE str, VALUE arg) +{ + VALUE *argp = (VALUE *)arg; + if (!rb_equal(str, *argp)) return 0; + *argp = Qnil; + return 1; +} + +VALUE +rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive) +{ + beg = rb_str_new_frozen(beg); + StringValue(end); + end = rb_str_new_frozen(end); + if (NIL_P(val)) return Qfalse; + val = rb_check_string_type(val); + if (NIL_P(val)) return Qfalse; + if (rb_enc_asciicompat(STR_ENC_GET(beg)) && + rb_enc_asciicompat(STR_ENC_GET(end)) && + rb_enc_asciicompat(STR_ENC_GET(val))) { + const char *bp = RSTRING_PTR(beg); + const char *ep = RSTRING_PTR(end); + const char *vp = RSTRING_PTR(val); + if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) { + if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1) + return Qfalse; + else { + char b = *bp; + char e = *ep; + char v = *vp; + + if (ISASCII(b) && ISASCII(e) && ISASCII(v)) { + if (b <= v && v < e) return Qtrue; + if (!RTEST(exclusive) && v == e) return Qtrue; + return Qfalse; + } + } + } +#if 0 + /* both edges are all digits */ + if (ISDIGIT(*bp) && ISDIGIT(*ep) && + all_digits_p(bp, RSTRING_LEN(beg)) && + all_digits_p(ep, RSTRING_LEN(end))) { + /* TODO */ + } +#endif + } + rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val); + + return NIL_P(val) ? 
Qtrue : Qfalse; +} + static VALUE rb_str_subpat(VALUE str, VALUE re, VALUE backref) { @@ -3179,46 +4490,30 @@ rb_str_aref(VALUE str, VALUE indx) if (FIXNUM_P(indx)) { idx = FIX2LONG(indx); - - num_index: - str = rb_str_substr(str, idx, 1); - if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil; - return str; } - - if (SPECIAL_CONST_P(indx)) goto generic; - switch (BUILTIN_TYPE(indx)) { - case T_REGEXP: + else if (RB_TYPE_P(indx, T_REGEXP)) { return rb_str_subpat(str, indx, INT2FIX(0)); - - case T_STRING: + } + else if (RB_TYPE_P(indx, T_STRING)) { if (rb_str_index(str, indx, 0) != -1) return rb_str_dup(indx); return Qnil; - - generic: - default: + } + else { /* check if indx is Range */ - { - long beg, len; - VALUE tmp; - - len = str_strlen(str, STR_ENC_GET(str)); - switch (rb_range_beg_len(indx, &beg, &len, len, 0)) { - case Qfalse: - break; - case Qnil: - return Qnil; - default: - tmp = rb_str_substr(str, beg, len); - return tmp; - } + long beg, len = str_strlen(str, NULL); + switch (rb_range_beg_len(indx, &beg, &len, len, 0)) { + case Qfalse: + break; + case Qnil: + return Qnil; + default: + return rb_str_substr(str, beg, len); } idx = NUM2LONG(indx); - goto num_index; } - UNREACHABLE; + return str_substr(str, idx, 1, FALSE); } @@ -3240,7 +4535,7 @@ rb_str_aref(VALUE str, VALUE indx) * Element Reference --- If passed a single +index+, returns a substring of * one character at that index. If passed a +start+ index and a +length+, * returns a substring containing +length+ characters starting at the - * +index+. If passed a +range+, its beginning and end are interpreted as + * +start+ index. If passed a +range+, its beginning and end are interpreted as * offsets delimiting the substring to be returned. 
* * In these three cases, if an index is negative, it is counted from the end @@ -3298,7 +4593,11 @@ rb_str_aref_m(int argc, VALUE *argv, VALUE str) if (RB_TYPE_P(argv[0], T_REGEXP)) { return rb_str_subpat(str, argv[0], argv[1]); } - return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1])); + else { + long beg = NUM2LONG(argv[0]); + long len = NUM2LONG(argv[1]); + return rb_str_substr(str, beg, len); + } } rb_check_arity(argc, 1, 2); return rb_str_aref(str, argv[0]); @@ -3313,9 +4612,9 @@ rb_str_drop_bytes(VALUE str, long len) str_modifiable(str); if (len > olen) len = olen; nlen = olen - len; - if (nlen <= RSTRING_EMBED_LEN_MAX) { + if (STR_EMBEDDABLE_P(nlen, TERM_LEN(str))) { char *oldptr = ptr; - int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED)); + int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE)); STR_SET_EMBED(str); STR_SET_EMBED_LEN(str, nlen); ptr = RSTRING(str)->as.ary; @@ -3323,7 +4622,7 @@ rb_str_drop_bytes(VALUE str, long len) if (fl == STR_NOEMBED) xfree(oldptr); } else { - if (!STR_SHARED_P(str)) rb_str_new4(str); + if (!STR_SHARED_P(str)) rb_str_new_frozen(str); ptr = RSTRING(str)->as.heap.ptr += len; RSTRING(str)->as.heap.len = nlen; } @@ -3335,38 +4634,49 @@ rb_str_drop_bytes(VALUE str, long len) static void rb_str_splice_0(VALUE str, long beg, long len, VALUE val) { - if (beg == 0 && RSTRING_LEN(val) == 0) { + char *sptr; + long slen, vlen = RSTRING_LEN(val); + int cr; + + if (beg == 0 && vlen == 0) { rb_str_drop_bytes(str, len); OBJ_INFECT(str, val); return; } - rb_str_modify(str); - if (len < RSTRING_LEN(val)) { + str_modify_keep_cr(str); + RSTRING_GETMEM(str, sptr, slen); + if (len < vlen) { /* expand string */ - RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1); + RESIZE_CAPA(str, slen + vlen - len); + sptr = RSTRING_PTR(str); } - if (RSTRING_LEN(val) != len) { - memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val), - RSTRING_PTR(str) + beg + len, - RSTRING_LEN(str) - (beg + len)); - } - if 
(RSTRING_LEN(val) < beg && len < 0) { - MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len); + if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) + cr = rb_enc_str_coderange(val); + else + cr = ENC_CODERANGE_UNKNOWN; + + if (vlen != len) { + memmove(sptr + beg + vlen, + sptr + beg + len, + slen - (beg + len)); } - if (RSTRING_LEN(val) > 0) { - memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val)); + if (vlen < beg && len < 0) { + MEMZERO(sptr + slen, char, -len); } - STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len); - if (RSTRING_PTR(str)) { - RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; + if (vlen > 0) { + memmove(sptr + beg, RSTRING_PTR(val), vlen); } + slen += vlen - len; + STR_SET_LEN(str, slen); + TERM_FILL(&sptr[slen], TERM_LEN(str)); OBJ_INFECT(str, val); + ENC_CODERANGE_SET(str, cr); } -static void -rb_str_splice(VALUE str, long beg, long len, VALUE val) +void +rb_str_update(VALUE str, long beg, long len, VALUE val) { long slen; char *p, *e; @@ -3378,19 +4688,21 @@ rb_str_splice(VALUE str, long beg, long len, VALUE val) StringValue(val); enc = rb_enc_check(str, val); - slen = str_strlen(str, enc); + slen = str_strlen(str, enc); /* rb_enc_check */ if (slen < beg) { out_of_range: rb_raise(rb_eIndexError, "index %ld out of string", beg); } if (beg < 0) { - if (-beg > slen) { + if (beg + slen < 0) { goto out_of_range; } beg += slen; } - if (slen < len || slen < beg + len) { + assert(beg >= 0); + assert(beg <= slen); + if (len > slen - beg) { len = slen - beg; } str_modify_keep_cr(str); @@ -3408,11 +4720,7 @@ rb_str_splice(VALUE str, long beg, long len, VALUE val) ENC_CODERANGE_SET(str, cr); } -void -rb_str_update(VALUE str, long beg, long len, VALUE val) -{ - rb_str_splice(str, beg, len, val); -} +#define rb_str_splice(str, beg, len, val) rb_str_update(str, beg, len, val) static void rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val) @@ -3447,7 +4755,7 @@ rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val) end = 
END(nth); len = end - start; StringValue(val); - enc = rb_enc_check(str, val); + enc = rb_enc_check_str(str, val); rb_str_splice_0(str, start, len, val); rb_enc_associate(str, enc); } @@ -3465,7 +4773,7 @@ rb_str_aset(VALUE str, VALUE indx, VALUE val) } if (SPECIAL_CONST_P(indx)) goto generic; - switch (TYPE(indx)) { + switch (BUILTIN_TYPE(indx)) { case T_REGEXP: rb_str_subpat_set(str, indx, INT2FIX(0), val); return val; @@ -3476,7 +4784,7 @@ rb_str_aset(VALUE str, VALUE indx, VALUE val) rb_raise(rb_eIndexError, "string not matched"); } beg = rb_str_sublen(str, beg); - rb_str_splice(str, beg, str_strlen(indx, 0), val); + rb_str_splice(str, beg, str_strlen(indx, NULL), val); return val; generic: @@ -3484,7 +4792,7 @@ rb_str_aset(VALUE str, VALUE indx, VALUE val) /* check if indx is Range */ { long beg, len; - if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) { + if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) { rb_str_splice(str, beg, len, val); return val; } @@ -3496,11 +4804,11 @@ rb_str_aset(VALUE str, VALUE indx, VALUE val) /* * call-seq: - * str[fixnum] = new_str - * str[fixnum, fixnum] = new_str + * str[integer] = new_str + * str[integer, integer] = new_str * str[range] = aString * str[regexp] = new_str - * str[regexp, fixnum] = new_str + * str[regexp, integer] = new_str * str[regexp, name] = new_str * str[other_str] = new_str * @@ -3510,13 +4818,13 @@ rb_str_aset(VALUE str, VALUE indx, VALUE val) * the text it is replacing, the string will be adjusted accordingly. If the * regular expression or string is used as the index doesn't match a position * in the string, <code>IndexError</code> is raised. If the regular expression - * form is used, the optional second <code>Fixnum</code> allows you to specify + * form is used, the optional second <code>Integer</code> allows you to specify * which portion of the match to replace (effectively using the - * <code>MatchData</code> indexing rules. 
The forms that take a - * <code>Fixnum</code> will raise an <code>IndexError</code> if the value is + * <code>MatchData</code> indexing rules. The forms that take an + * <code>Integer</code> will raise an <code>IndexError</code> if the value is * out of range; the <code>Range</code> form will raise a * <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code> - * forms will silently ignore the assignment. + * will raise an <code>IndexError</code> on negative match. */ static VALUE @@ -3570,8 +4878,8 @@ rb_str_insert(VALUE str, VALUE idx, VALUE str2) /* * call-seq: - * str.slice!(fixnum) -> fixnum or nil - * str.slice!(fixnum, fixnum) -> new_str or nil + * str.slice!(integer) -> new_str or nil + * str.slice!(integer, integer) -> new_str or nil * str.slice!(range) -> new_str or nil * str.slice!(regexp) -> new_str or nil * str.slice!(other_str) -> new_str or nil @@ -3608,11 +4916,12 @@ rb_str_slice_bang(int argc, VALUE *argv, VALUE str) } static VALUE -get_pat(VALUE pat, int quote) +get_pat(VALUE pat) { VALUE val; - switch (TYPE(pat)) { + if (SPECIAL_CONST_P(pat)) goto to_string; + switch (BUILTIN_TYPE(pat)) { case T_REGEXP: return pat; @@ -3620,6 +4929,7 @@ get_pat(VALUE pat, int quote) break; default: + to_string: val = rb_check_string_type(pat); if (NIL_P(val)) { Check_Type(pat, T_REGEXP); @@ -3627,11 +4937,58 @@ get_pat(VALUE pat, int quote) pat = val; } - if (quote) { - pat = rb_reg_quote(pat); + return rb_reg_regcomp(pat); +} + +static VALUE +get_pat_quoted(VALUE pat, int check) +{ + VALUE val; + + if (SPECIAL_CONST_P(pat)) goto to_string; + switch (BUILTIN_TYPE(pat)) { + case T_REGEXP: + return pat; + + case T_STRING: + break; + + default: + to_string: + val = rb_check_string_type(pat); + if (NIL_P(val)) { + Check_Type(pat, T_REGEXP); + } + pat = val; + } + if (check && is_broken_string(pat)) { + rb_exc_raise(rb_reg_check_preprocess(pat)); } + return pat; +} - return rb_reg_regcomp(pat); +static long +rb_pat_search(VALUE pat, VALUE str, long 
pos, int set_backref_str) +{ + if (BUILTIN_TYPE(pat) == T_STRING) { + pos = rb_strseq_index(str, pat, pos, 1); + if (set_backref_str) { + if (pos >= 0) { + VALUE match; + str = rb_str_new_frozen(str); + rb_backref_set_string(str, pos, RSTRING_LEN(pat)); + match = rb_backref_get(); + OBJ_INFECT(match, pat); + } + else { + rb_backref_set(Qnil); + } + } + return pos; + } + else { + return rb_reg_search0(pat, str, pos, 0, set_backref_str); + } } @@ -3652,9 +5009,9 @@ rb_str_sub_bang(int argc, VALUE *argv, VALUE str) VALUE pat, repl, hash = Qnil; int iter = 0; int tainted = 0; - int untrusted = 0; long plen; int min_arity = rb_block_given_p() ? 1 : 2; + long beg; rb_check_arity(argc, min_arity, 2); if (argc == 1) { @@ -3666,27 +5023,40 @@ rb_str_sub_bang(int argc, VALUE *argv, VALUE str) if (NIL_P(hash)) { StringValue(repl); } - if (OBJ_TAINTED(repl)) tainted = 1; - if (OBJ_UNTRUSTED(repl)) untrusted = 1; + tainted = OBJ_TAINTED_RAW(repl); } - pat = get_pat(argv[0], 1); + pat = get_pat_quoted(argv[0], 1); + str_modifiable(str); - if (rb_reg_search(pat, str, 0, 0) >= 0) { + beg = rb_pat_search(pat, str, 0, 1); + if (beg >= 0) { rb_encoding *enc; int cr = ENC_CODERANGE(str); - VALUE match = rb_backref_get(); - struct re_registers *regs = RMATCH_REGS(match); - long beg0 = BEG(0); - long end0 = END(0); + long beg0, end0; + VALUE match, match0 = Qnil; + struct re_registers *regs; char *p, *rp; long len, rlen; + match = rb_backref_get(); + regs = RMATCH_REGS(match); + if (RB_TYPE_P(pat, T_STRING)) { + beg0 = beg; + end0 = beg0 + RSTRING_LEN(pat); + match0 = pat; + } + else { + beg0 = BEG(0); + end0 = END(0); + if (iter) match0 = rb_reg_nth_match(0, match); + } + if (iter || !NIL_P(hash)) { p = RSTRING_PTR(str); len = RSTRING_LEN(str); if (iter) { - repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match))); + repl = rb_obj_as_string(rb_yield(match0)); } else { repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0)); @@ -3696,8 +5066,9 @@ rb_str_sub_bang(int argc, 
VALUE *argv, VALUE str) rb_check_frozen(str); } else { - repl = rb_reg_regsub(repl, str, regs, pat); + repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat); } + enc = rb_enc_compatible(str, repl); if (!enc) { rb_encoding *str_enc = STR_ENC_GET(str); @@ -3712,8 +5083,7 @@ rb_str_sub_bang(int argc, VALUE *argv, VALUE str) } rb_str_modify(str); rb_enc_associate(str, enc); - if (OBJ_TAINTED(repl)) tainted = 1; - if (OBJ_UNTRUSTED(repl)) untrusted = 1; + tainted |= OBJ_TAINTED_RAW(repl); if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) { int cr2 = ENC_CODERANGE(repl); if (cr2 == ENC_CODERANGE_BROKEN || @@ -3723,7 +5093,7 @@ rb_str_sub_bang(int argc, VALUE *argv, VALUE str) cr = cr2; } plen = end0 - beg0; - rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl); + rlen = RSTRING_LEN(repl); len = RSTRING_LEN(str); if (rlen > plen) { RESIZE_CAPA(str, len + rlen - plen); @@ -3732,13 +5102,13 @@ rb_str_sub_bang(int argc, VALUE *argv, VALUE str) if (rlen != plen) { memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen); } - memcpy(p + beg0, rp, rlen); + rp = RSTRING_PTR(repl); + memmove(p + beg0, rp, rlen); len += rlen - plen; STR_SET_LEN(str, len); - RSTRING_PTR(str)[len] = '\0'; + TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str)); ENC_CODERANGE_SET(str, cr); - if (tainted) OBJ_TAINT(str); - if (untrusted) OBJ_UNTRUST(str); + FL_SET_RAW(str, tainted); return str; } @@ -3755,7 +5125,7 @@ rb_str_sub_bang(int argc, VALUE *argv, VALUE str) * Returns a copy of +str+ with the _first_ occurrence of +pattern+ * replaced by the second argument. The +pattern+ is typically a Regexp; if * given as a String, any regular expression metacharacters it contains will - * be interpreted literally, e.g. <code>'\\\d'</code> will match a backlash + * be interpreted literally, e.g. <code>'\\\d'</code> will match a backslash * followed by 'd', instead of a digit. * * If +replacement+ is a String it will be substituted for the matched text. 
@@ -3764,7 +5134,10 @@ rb_str_sub_bang(int argc, VALUE *argv, VALUE str) * <code>"\\k<n>"</code>, where <i>n</i> is a group name. If it is a * double-quoted string, both back-references must be preceded by an * additional backslash. However, within +replacement+ the special match - * variables, such as <code>&$</code>, will not refer to the current match. + * variables, such as <code>$&</code>, will not refer to the current match. + * If +replacement+ is a String that looks like a pattern's capture group but + * is actually not a pattern capture group e.g. <code>"\\'"</code>, then it + * will have to be preceded by two backslashes like so <code>"\\\\'"</code>. * * If the second argument is a Hash, and the matched text is one of its keys, * the corresponding value is the replacement string. @@ -3796,20 +5169,20 @@ rb_str_sub(int argc, VALUE *argv, VALUE str) static VALUE str_gsub(int argc, VALUE *argv, VALUE str, int bang) { - VALUE pat, val, repl, match, dest, hash = Qnil; + VALUE pat, val = Qnil, repl, match, match0 = Qnil, dest, hash = Qnil; struct re_registers *regs; - long beg, n; - long beg0, end0; + long beg, beg0, end0; long offset, blen, slen, len, last; - int iter = 0; + enum {STR, ITER, MAP} mode = STR; char *sp, *cp; int tainted = 0; + int need_backref = -1; rb_encoding *str_enc; switch (argc) { case 1: RETURN_ENUMERATOR(str, argc, argv); - iter = 1; + mode = ITER; break; case 2: repl = argv[1]; @@ -3817,21 +5190,23 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang) if (NIL_P(hash)) { StringValue(repl); } - if (OBJ_TAINTED(repl)) tainted = 1; + else { + mode = MAP; + } + tainted = OBJ_TAINTED_RAW(repl); break; default: rb_check_arity(argc, 1, 2); } - pat = get_pat(argv[0], 1); - beg = rb_reg_search(pat, str, 0, 0); + pat = get_pat_quoted(argv[0], 1); + beg = rb_pat_search(pat, str, 0, need_backref); if (beg < 0) { if (bang) return Qnil; /* no match, no substitution */ return rb_str_dup(str); } offset = 0; - n = 0; blen = RSTRING_LEN(str) + 30; /* 
len + margin */ dest = rb_str_buf_new(blen); sp = RSTRING_PTR(str); @@ -3842,17 +5217,25 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang) ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID); do { - n++; match = rb_backref_get(); regs = RMATCH_REGS(match); - beg0 = BEG(0); - end0 = END(0); - if (iter || !NIL_P(hash)) { - if (iter) { - val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match))); + if (RB_TYPE_P(pat, T_STRING)) { + beg0 = beg; + end0 = beg0 + RSTRING_LEN(pat); + match0 = pat; + } + else { + beg0 = BEG(0); + end0 = END(0); + if (mode == ITER) match0 = rb_reg_nth_match(0, match); + } + + if (mode) { + if (mode == ITER) { + val = rb_obj_as_string(rb_yield(match0)); } else { - val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0))); + val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0)); val = rb_obj_as_string(val); } str_mod_check(str, sp, slen); @@ -3860,13 +5243,19 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang) rb_raise(rb_eRuntimeError, "block should not cheat"); } } + else if (need_backref) { + val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? 
Qnil : pat); + if (need_backref < 0) { + need_backref = val != repl; + } + } else { - val = rb_reg_regsub(repl, str, regs, pat); + val = repl; } - if (OBJ_TAINTED(val)) tainted = 1; + tainted |= OBJ_TAINTED_RAW(val); - len = beg - offset; /* copy pre-match substr */ + len = beg0 - offset; /* copy pre-match substr */ if (len) { rb_enc_str_buf_cat(dest, cp, len, str_enc); } @@ -3887,22 +5276,22 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang) } cp = RSTRING_PTR(str) + offset; if (offset > RSTRING_LEN(str)) break; - beg = rb_reg_search(pat, str, offset, 0); + beg = rb_pat_search(pat, str, offset, need_backref); } while (beg >= 0); if (RSTRING_LEN(str) > offset) { rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc); } - rb_reg_search(pat, str, last, 0); + rb_pat_search(pat, str, last, 1); if (bang) { - rb_str_shared_replace(str, dest); + str_shared_replace(str, dest); } else { - RBASIC(dest)->klass = rb_obj_class(str); - OBJ_INFECT(dest, str); + RBASIC_SET_CLASS(dest, rb_obj_class(str)); + tainted |= OBJ_TAINTED_RAW(str); str = dest; } - if (tainted) OBJ_TAINT(str); + FL_SET_RAW(str, tainted); return str; } @@ -3910,6 +5299,7 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang) /* * call-seq: * str.gsub!(pattern, replacement) -> str or nil + * str.gsub!(pattern, hash) -> str or nil * str.gsub!(pattern) {|match| block } -> str or nil * str.gsub!(pattern) -> an_enumerator * @@ -3933,11 +5323,11 @@ rb_str_gsub_bang(int argc, VALUE *argv, VALUE str) * str.gsub(pattern) {|match| block } -> new_str * str.gsub(pattern) -> enumerator * - * Returns a copy of <i>str</i> with the <em>all</em> occurrences of + * Returns a copy of <i>str</i> with <em>all</em> occurrences of * <i>pattern</i> substituted for the second argument. The <i>pattern</i> is * typically a <code>Regexp</code>; if given as a <code>String</code>, any * regular expression metacharacters it contains will be interpreted - * literally, e.g. 
<code>'\\\d'</code> will match a backlash followed by 'd', + * literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd', * instead of a digit. * * If <i>replacement</i> is a <code>String</code> it will be substituted for @@ -3946,7 +5336,7 @@ rb_str_gsub_bang(int argc, VALUE *argv, VALUE str) * <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a * double-quoted string, both back-references must be preceded by an * additional backslash. However, within <i>replacement</i> the special match - * variables, such as <code>&$</code>, will not refer to the current match. + * variables, such as <code>$&</code>, will not refer to the current match. * * If the second argument is a <code>Hash</code>, and the matched text is one * of its keys, the corresponding value is the replacement string. @@ -4059,30 +5449,68 @@ rb_str_getbyte(VALUE str, VALUE index) /* * call-seq: - * str.setbyte(index, int) -> int + * str.setbyte(index, integer) -> integer * - * modifies the <i>index</i>th byte as <i>int</i>. + * modifies the <i>index</i>th byte as <i>integer</i>. 
*/ static VALUE rb_str_setbyte(VALUE str, VALUE index, VALUE value) { long pos = NUM2LONG(index); - int byte = NUM2INT(value); - - rb_str_modify(str); + long len = RSTRING_LEN(str); + char *head, *left = 0; + unsigned char *ptr; + rb_encoding *enc; + int cr = ENC_CODERANGE_UNKNOWN, width, nlen; - if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos) + if (pos < -len || len <= pos) rb_raise(rb_eIndexError, "index %ld out of string", pos); if (pos < 0) - pos += RSTRING_LEN(str); + pos += len; - RSTRING_PTR(str)[pos] = byte; + VALUE v = rb_to_int(value); + VALUE w = rb_int_modulo(v, INT2FIX(256)); + unsigned char byte = NUM2INT(w) & 0xFF; + if (!str_independent(str)) + str_make_independent(str); + enc = STR_ENC_GET(str); + head = RSTRING_PTR(str); + ptr = (unsigned char *)&head[pos]; + if (!STR_EMBED_P(str)) { + cr = ENC_CODERANGE(str); + switch (cr) { + case ENC_CODERANGE_7BIT: + left = (char *)ptr; + *ptr = byte; + if (ISASCII(byte)) goto end; + nlen = rb_enc_precise_mbclen(left, head+len, enc); + if (!MBCLEN_CHARFOUND_P(nlen)) + ENC_CODERANGE_SET(str, ENC_CODERANGE_BROKEN); + else + ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID); + goto end; + case ENC_CODERANGE_VALID: + left = rb_enc_left_char_head(head, ptr, head+len, enc); + width = rb_enc_precise_mbclen(left, head+len, enc); + *ptr = byte; + nlen = rb_enc_precise_mbclen(left, head+len, enc); + if (!MBCLEN_CHARFOUND_P(nlen)) + ENC_CODERANGE_SET(str, ENC_CODERANGE_BROKEN); + else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte)) + ENC_CODERANGE_CLEAR(str); + goto end; + } + } + ENC_CODERANGE_CLEAR(str); + *ptr = byte; + + end: return value; } static VALUE -str_byte_substr(VALUE str, long beg, long len) +str_byte_substr(VALUE str, long beg, long len, int empty) { char *p, *s = RSTRING_PTR(str); long n = RSTRING_LEN(str); @@ -4093,26 +5521,46 @@ str_byte_substr(VALUE str, long beg, long len) beg += n; if (beg < 0) return Qnil; } - if (beg + len > n) + if (len > n - beg) len = n - beg; if (len <= 0) { + if 
(!empty) return Qnil; len = 0; p = 0; } else p = s + beg; - if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) { - str2 = rb_str_new4(str); - str2 = str_new3(rb_obj_class(str2), str2); - RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len; + if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) && SHARABLE_SUBSTRING_P(beg, len, n)) { + str2 = rb_str_new_frozen(str); + str2 = str_new_shared(rb_obj_class(str2), str2); + RSTRING(str2)->as.heap.ptr += beg; RSTRING(str2)->as.heap.len = len; } else { - str2 = rb_str_new5(str, p, len); - rb_enc_cr_str_copy_for_substr(str2, str); - OBJ_INFECT(str2, str); + str2 = rb_str_new_with_class(str, p, len); + } + + str_enc_copy(str2, str); + + if (RSTRING_LEN(str2) == 0) { + if (!rb_enc_asciicompat(STR_ENC_GET(str))) + ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID); + else + ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT); } + else { + switch (ENC_CODERANGE(str)) { + case ENC_CODERANGE_7BIT: + ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT); + break; + default: + ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN); + break; + } + } + + OBJ_INFECT_RAW(str2, str); return str2; } @@ -4121,44 +5569,35 @@ static VALUE str_byte_aref(VALUE str, VALUE indx) { long idx; - switch (TYPE(indx)) { - case T_FIXNUM: + if (FIXNUM_P(indx)) { idx = FIX2LONG(indx); - - num_index: - str = str_byte_substr(str, idx, 1); - if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil; - return str; - - default: + } + else { /* check if indx is Range */ - { - long beg, len = RSTRING_LEN(str); + long beg, len = RSTRING_LEN(str); - switch (rb_range_beg_len(indx, &beg, &len, len, 0)) { - case Qfalse: - break; - case Qnil: - return Qnil; - default: - return str_byte_substr(str, beg, len); - } + switch (rb_range_beg_len(indx, &beg, &len, len, 0)) { + case Qfalse: + break; + case Qnil: + return Qnil; + default: + return str_byte_substr(str, beg, len, TRUE); } + idx = NUM2LONG(indx); - goto num_index; } - - UNREACHABLE; + return str_byte_substr(str, idx, 1, FALSE); } /* * call-seq: - * 
str.byteslice(fixnum) -> new_str or nil - * str.byteslice(fixnum, fixnum) -> new_str or nil + * str.byteslice(integer) -> new_str or nil + * str.byteslice(integer, integer) -> new_str or nil * str.byteslice(range) -> new_str or nil * - * Byte Reference---If passed a single <code>Fixnum</code>, returns a - * substring of one byte at that position. If passed two <code>Fixnum</code> + * Byte Reference---If passed a single <code>Integer</code>, returns a + * substring of one byte at that position. If passed two <code>Integer</code> * objects, returns a substring starting at the offset given by the first, and * a length given by the second. If given a <code>Range</code>, a substring containing * bytes at offsets given by the range is returned. In all three cases, if @@ -4178,7 +5617,9 @@ static VALUE rb_str_byteslice(int argc, VALUE *argv, VALUE str) { if (argc == 2) { - return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1])); + long beg = NUM2LONG(argv[0]); + long end = NUM2LONG(argv[1]); + return str_byte_substr(str, beg, end, TRUE); } rb_check_arity(argc, 1, 2); return str_byte_aref(str, argv[0]); @@ -4199,13 +5640,14 @@ rb_str_reverse(VALUE str) rb_encoding *enc; VALUE rev; char *s, *e, *p; - int single = 1; + int cr; if (RSTRING_LEN(str) <= 1) return rb_str_dup(str); enc = STR_ENC_GET(str); - rev = rb_str_new5(str, 0, RSTRING_LEN(str)); + rev = rb_str_new_with_class(str, 0, RSTRING_LEN(str)); s = RSTRING_PTR(str); e = RSTRING_END(str); p = RSTRING_END(rev); + cr = ENC_CODERANGE(str); if (RSTRING_LEN(str) > 1) { if (single_byte_optimizable(str)) { @@ -4213,21 +5655,22 @@ rb_str_reverse(VALUE str) *--p = *s++; } } - else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) { + else if (cr == ENC_CODERANGE_VALID) { while (s < e) { int clen = rb_enc_fast_mbclen(s, e, enc); - if (clen > 1 || (*s & 0x80)) single = 0; p -= clen; memcpy(p, s, clen); s += clen; } } else { + cr = rb_enc_asciicompat(enc) ? 
+ ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID; while (s < e) { int clen = rb_enc_mbclen(s, e, enc); - if (clen > 1 || (*s & 0x80)) single = 0; + if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN; p -= clen; memcpy(p, s, clen); s += clen; @@ -4235,16 +5678,9 @@ rb_str_reverse(VALUE str) } } STR_SET_LEN(rev, RSTRING_LEN(str)); - OBJ_INFECT(rev, str); - if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) { - if (single) { - ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT); - } - else { - ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID); - } - } - rb_enc_cr_str_copy_for_substr(rev, str); + OBJ_INFECT_RAW(rev, str); + str_enc_copy(rev, str); + ENC_CODERANGE_SET(rev, cr); return rev; } @@ -4274,7 +5710,7 @@ rb_str_reverse_bang(VALUE str) } } else { - rb_str_shared_replace(str, rb_str_reverse(str)); + str_shared_replace(str, rb_str_reverse(str)); } } else { @@ -4333,16 +5769,9 @@ rb_str_include(VALUE str, VALUE arg) static VALUE rb_str_to_i(int argc, VALUE *argv, VALUE str) { - int base; + int base = 10; - if (argc == 0) base = 10; - else { - VALUE b; - - rb_scan_args(argc, argv, "01", &b); - base = NUM2INT(b); - } - if (base < 0) { + if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) { rb_raise(rb_eArgError, "invalid radix %d", base); } return rb_str_to_inum(str, base, FALSE); @@ -4375,7 +5804,9 @@ rb_str_to_f(VALUE str) * str.to_s -> str * str.to_str -> str * - * Returns the receiver. + * Returns +self+. + * + * If called on a subclass of String, converts the receiver to a String object. 
*/ static VALUE @@ -4434,6 +5865,70 @@ rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p) return l; } +VALUE +rb_str_escape(VALUE str) +{ + int encidx = ENCODING_GET(str); + rb_encoding *enc = rb_enc_from_index(encidx); + const char *p = RSTRING_PTR(str); + const char *pend = RSTRING_END(str); + const char *prev = p; + char buf[CHAR_ESC_LEN + 1]; + VALUE result = rb_str_buf_new(0); + int unicode_p = rb_enc_unicode_p(enc); + int asciicompat = rb_enc_asciicompat(enc); + + while (p < pend) { + unsigned int c, cc; + int n = rb_enc_precise_mbclen(p, pend, enc); + if (!MBCLEN_CHARFOUND_P(n)) { + if (p > prev) str_buf_cat(result, prev, p - prev); + n = rb_enc_mbminlen(enc); + if (pend < p + n) + n = (int)(pend - p); + while (n--) { + snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377); + str_buf_cat(result, buf, strlen(buf)); + prev = ++p; + } + continue; + } + n = MBCLEN_CHARFOUND_LEN(n); + c = rb_enc_mbc_to_codepoint(p, pend, enc); + p += n; + switch (c) { + case '\n': cc = 'n'; break; + case '\r': cc = 'r'; break; + case '\t': cc = 't'; break; + case '\f': cc = 'f'; break; + case '\013': cc = 'v'; break; + case '\010': cc = 'b'; break; + case '\007': cc = 'a'; break; + case 033: cc = 'e'; break; + default: cc = 0; break; + } + if (cc) { + if (p - n > prev) str_buf_cat(result, prev, p - n - prev); + buf[0] = '\\'; + buf[1] = (char)cc; + str_buf_cat(result, buf, 2); + prev = p; + } + else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) { + } + else { + if (p - n > prev) str_buf_cat(result, prev, p - n - prev); + rb_str_buf_cat_escaped_char(result, c, unicode_p); + prev = p; + } + } + if (p > prev) str_buf_cat(result, prev, p - prev); + ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT); + + OBJ_INFECT_RAW(result, str); + return result; +} + /* * call-seq: * str.inspect -> string @@ -4449,17 +5944,15 @@ rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p) VALUE rb_str_inspect(VALUE str) { - 
rb_encoding *enc = STR_ENC_GET(str); + int encidx = ENCODING_GET(str); + rb_encoding *enc = rb_enc_from_index(encidx), *actenc; const char *p, *pend, *prev; char buf[CHAR_ESC_LEN + 1]; VALUE result = rb_str_buf_new(0); rb_encoding *resenc = rb_default_internal_encoding(); int unicode_p = rb_enc_unicode_p(enc); int asciicompat = rb_enc_asciicompat(enc); - static rb_encoding *utf16, *utf32; - if (!utf16) utf16 = rb_enc_find("UTF-16"); - if (!utf32) utf32 = rb_enc_find("UTF-32"); if (resenc == NULL) resenc = rb_default_external_encoding(); if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding(); rb_enc_associate(result, resenc); @@ -4467,23 +5960,10 @@ rb_str_inspect(VALUE str) p = RSTRING_PTR(str); pend = RSTRING_END(str); prev = p; - if (enc == utf16) { - const unsigned char *q = (const unsigned char *)p; - if (q[0] == 0xFE && q[1] == 0xFF) - enc = rb_enc_find("UTF-16BE"); - else if (q[0] == 0xFF && q[1] == 0xFE) - enc = rb_enc_find("UTF-16LE"); - else - unicode_p = 0; - } - else if (enc == utf32) { - const unsigned char *q = (const unsigned char *)p; - if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) - enc = rb_enc_find("UTF-32BE"); - else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) - enc = rb_enc_find("UTF-32LE"); - else - unicode_p = 0; + actenc = get_actual_encoding(encidx, str); + if (actenc != enc) { + enc = actenc; + if (unicode_p) unicode_p = rb_enc_unicode_p(enc); } while (p < pend) { unsigned int c, cc; @@ -4552,7 +6032,7 @@ rb_str_inspect(VALUE str) if (p > prev) str_buf_cat(result, prev, p - prev); str_buf_cat2(result, "\""); - OBJ_INFECT(result, str); + OBJ_INFECT_RAW(result, str); return result; } @@ -4565,61 +6045,75 @@ rb_str_inspect(VALUE str) * Produces a version of +str+ with all non-printing characters replaced by * <code>\nnn</code> notation and all special characters escaped. 
* - * "hello \n ''".dump #=> "\"hello \\n ''\" + * "hello \n ''".dump #=> "\"hello \\n ''\"" */ VALUE rb_str_dump(VALUE str) { - rb_encoding *enc = rb_enc_get(str); + int encidx = rb_enc_get_index(str); + rb_encoding *enc = rb_enc_from_index(encidx); long len; const char *p, *pend; char *q, *qend; VALUE result; - int u8 = (enc == rb_utf8_encoding()); + int u8 = (encidx == rb_utf8_encindex()); + static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")"; len = 2; /* "" */ + if (!rb_enc_asciicompat(enc)) { + len += strlen(nonascii_suffix) - rb_strlen_lit("%s"); + len += strlen(enc->name); + } + p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str); while (p < pend) { + int clen; unsigned char c = *p++; + switch (c) { case '"': case '\\': case '\n': case '\r': case '\t': case '\f': case '\013': case '\010': case '\007': case '\033': - len += 2; + clen = 2; break; case '#': - len += IS_EVSTR(p, pend) ? 2 : 1; + clen = IS_EVSTR(p, pend) ? 2 : 1; break; default: if (ISPRINT(c)) { - len++; + clen = 1; } else { - if (u8) { /* \u{NN} */ + if (u8 && c > 0x7F) { /* \u notation */ int n = rb_enc_precise_mbclen(p-1, pend, enc); - if (MBCLEN_CHARFOUND_P(n-1)) { + if (MBCLEN_CHARFOUND_P(n)) { unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc); - while (cc >>= 4) len++; - len += 5; + if (cc <= 0xFFFF) + clen = 6; /* \uXXXX */ + else if (cc <= 0xFFFFF) + clen = 9; /* \u{XXXXX} */ + else + clen = 10; /* \u{XXXXXX} */ p += MBCLEN_CHARFOUND_LEN(n)-1; break; } } - len += 4; /* \xNN */ + clen = 4; /* \xNN */ } break; } - } - if (!rb_enc_asciicompat(enc)) { - len += 19; /* ".force_encoding('')" */ - len += strlen(enc->name); + + if (clen > LONG_MAX - len) { + rb_raise(rb_eRuntimeError, "string size too big"); + } + len += clen; } - result = rb_str_new5(str, 0, len); + result = rb_str_new_with_class(str, 0, len); p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str); q = RSTRING_PTR(result); qend = q + len + 1; @@ -4677,7 +6171,10 @@ rb_str_dump(VALUE str) if 
(MBCLEN_CHARFOUND_P(n)) { int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc); p += n; - snprintf(q, qend-q, "u{%x}", cc); + if (cc <= 0xFFFF) + snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */ + else + snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */ q += strlen(q); continue; } @@ -4689,16 +6186,249 @@ rb_str_dump(VALUE str) *q++ = '"'; *q = '\0'; if (!rb_enc_asciicompat(enc)) { - snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name); - enc = rb_ascii8bit_encoding(); + snprintf(q, qend-q, nonascii_suffix, enc->name); + encidx = rb_ascii8bit_encindex(); } - OBJ_INFECT(result, str); + OBJ_INFECT_RAW(result, str); /* result from dump is ASCII */ - rb_enc_associate(result, enc); + rb_enc_associate_index(result, encidx); ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT); return result; } +static int +unescape_ascii(unsigned int c) +{ + switch (c) { + case 'n': + return '\n'; + case 'r': + return '\r'; + case 't': + return '\t'; + case 'f': + return '\f'; + case 'v': + return '\13'; + case 'b': + return '\010'; + case 'a': + return '\007'; + case 'e': + return 033; + default: + UNREACHABLE; + } +} + +static void +undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary) +{ + const char *s = *ss; + unsigned int c; + int codelen; + size_t hexlen; + unsigned char buf[6]; + static rb_encoding *enc_utf8 = NULL; + + switch (*s) { + case '\\': + case '"': + case '#': + rb_str_cat(undumped, s, 1); /* cat itself */ + s++; + break; + case 'n': + case 'r': + case 't': + case 'f': + case 'v': + case 'b': + case 'a': + case 'e': + *buf = unescape_ascii(*s); + rb_str_cat(undumped, (char *)buf, 1); + s++; + break; + case 'u': + if (*binary) { + rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed"); + } + *utf8 = true; + if (++s >= s_end) { + rb_raise(rb_eRuntimeError, "invalid Unicode escape"); + } + if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding(); + if (*penc != enc_utf8) { + *penc = 
enc_utf8; + rb_enc_associate(undumped, enc_utf8); + } + if (*s == '{') { /* handle \u{...} form */ + s++; + for (;;) { + if (s >= s_end) { + rb_raise(rb_eRuntimeError, "unterminated Unicode escape"); + } + if (*s == '}') { + s++; + break; + } + if (ISSPACE(*s)) { + s++; + continue; + } + c = scan_hex(s, s_end-s, &hexlen); + if (hexlen == 0 || hexlen > 6) { + rb_raise(rb_eRuntimeError, "invalid Unicode escape"); + } + if (c > 0x10ffff) { + rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)"); + } + if (0xd800 <= c && c <= 0xdfff) { + rb_raise(rb_eRuntimeError, "invalid Unicode codepoint"); + } + codelen = rb_enc_mbcput(c, (char *)buf, *penc); + rb_str_cat(undumped, (char *)buf, codelen); + s += hexlen; + } + } + else { /* handle \uXXXX form */ + c = scan_hex(s, 4, &hexlen); + if (hexlen != 4) { + rb_raise(rb_eRuntimeError, "invalid Unicode escape"); + } + if (0xd800 <= c && c <= 0xdfff) { + rb_raise(rb_eRuntimeError, "invalid Unicode codepoint"); + } + codelen = rb_enc_mbcput(c, (char *)buf, *penc); + rb_str_cat(undumped, (char *)buf, codelen); + s += hexlen; + } + break; + case 'x': + if (*utf8) { + rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed"); + } + *binary = true; + if (++s >= s_end) { + rb_raise(rb_eRuntimeError, "invalid hex escape"); + } + *buf = scan_hex(s, 2, &hexlen); + if (hexlen != 2) { + rb_raise(rb_eRuntimeError, "invalid hex escape"); + } + rb_str_cat(undumped, (char *)buf, 1); + s += hexlen; + break; + default: + rb_str_cat(undumped, s-1, 2); + s++; + } + + *ss = s; +} + +static VALUE rb_str_is_ascii_only_p(VALUE str); + +/* + * call-seq: + * str.undump -> new_str + * + * Produces unescaped version of +str+. + * See also String#dump because String#undump does inverse of String#dump. 
+ * + * "\"hello \\n ''\"".undump #=> "hello \n ''" + */ + +static VALUE +str_undump(VALUE str) +{ + const char *s = RSTRING_PTR(str); + const char *s_end = RSTRING_END(str); + rb_encoding *enc = rb_enc_get(str); + VALUE undumped = rb_enc_str_new(s, 0L, enc); + bool utf8 = false; + bool binary = false; + int w; + + rb_must_asciicompat(str); + if (rb_str_is_ascii_only_p(str) == Qfalse) { + rb_raise(rb_eRuntimeError, "non-ASCII character detected"); + } + if (!str_null_check(str, &w)) { + rb_raise(rb_eRuntimeError, "string contains null byte"); + } + if (RSTRING_LEN(str) < 2) goto invalid_format; + if (*s != '"') goto invalid_format; + + /* strip '"' at the start */ + s++; + + for (;;) { + if (s >= s_end) { + rb_raise(rb_eRuntimeError, "unterminated dumped string"); + } + + if (*s == '"') { + /* epilogue */ + s++; + if (s == s_end) { + /* ascii compatible dumped string */ + break; + } + else { + static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */ + static const char dup_suffix[] = ".dup"; + const char *encname; + int encidx; + ptrdiff_t size; + + /* check separately for strings dumped by older versions */ + size = sizeof(dup_suffix) - 1; + if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size; + + size = sizeof(force_encoding_suffix) - 1; + if (s_end - s <= size) goto invalid_format; + if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format; + s += size; + + if (utf8) { + rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding"); + } + + encname = s; + s = memchr(s, '"', s_end-s); + size = s - encname; + if (!s) goto invalid_format; + if (s_end - s != 2) goto invalid_format; + if (s[0] != '"' || s[1] != ')') goto invalid_format; + + encidx = rb_enc_find_index2(encname, (long)size); + if (encidx < 0) { + rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name"); + } + rb_enc_associate_index(undumped, encidx); + } + break; + } + + if (*s == '\\') { + s++; + if (s >= 
s_end) { + rb_raise(rb_eRuntimeError, "invalid escape"); + } + undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary); + } + else { + rb_str_cat(undumped, s++, 1); + } + } + + OBJ_INFECT(undumped, str); + return undumped; +invalid_format: + rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form"); +} static void rb_str_check_dummy_enc(rb_encoding *enc) @@ -4709,179 +6439,358 @@ rb_str_check_dummy_enc(rb_encoding *enc) } } +static OnigCaseFoldType +check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags) +{ + if (argc==0) + return flags; + if (argc>2) + rb_raise(rb_eArgError, "too many options"); + if (argv[0]==sym_turkic) { + flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI; + if (argc==2) { + if (argv[1]==sym_lithuanian) + flags |= ONIGENC_CASE_FOLD_LITHUANIAN; + else + rb_raise(rb_eArgError, "invalid second option"); + } + } + else if (argv[0]==sym_lithuanian) { + flags |= ONIGENC_CASE_FOLD_LITHUANIAN; + if (argc==2) { + if (argv[1]==sym_turkic) + flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI; + else + rb_raise(rb_eArgError, "invalid second option"); + } + } + else if (argc>1) + rb_raise(rb_eArgError, "too many options"); + else if (argv[0]==sym_ascii) + flags |= ONIGENC_CASE_ASCII_ONLY; + else if (argv[0]==sym_fold) { + if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE) + flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE; + else + rb_raise(rb_eArgError, "option :fold only allowed for downcasing"); + } + else + rb_raise(rb_eArgError, "invalid option"); + return flags; +} + +/* 16 should be long enough to absorb any kind of single character length increase */ +#define CASE_MAPPING_ADDITIONAL_LENGTH 20 +#ifndef CASEMAP_DEBUG +# define CASEMAP_DEBUG 0 +#endif + +struct mapping_buffer; +typedef struct mapping_buffer { + size_t capa; + size_t used; + struct mapping_buffer *next; + OnigUChar space[FLEX_ARY_LEN]; +} mapping_buffer; + +static void +mapping_buffer_free(void 
*p) +{ + mapping_buffer *previous_buffer; + mapping_buffer *current_buffer = p; + while (current_buffer) { + previous_buffer = current_buffer; + current_buffer = current_buffer->next; + ruby_sized_xfree(previous_buffer, previous_buffer->capa); + } +} + +static const rb_data_type_t mapping_buffer_type = { + "mapping_buffer", + {0, mapping_buffer_free,} +}; + +static VALUE +rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc) +{ + VALUE target; + + OnigUChar *source_current, *source_end; + int target_length = 0; + VALUE buffer_anchor; + mapping_buffer *current_buffer = 0; + mapping_buffer **pre_buffer; + size_t buffer_count = 0; + int buffer_length_or_invalid; + + if (RSTRING_LEN(source) == 0) return rb_str_dup(source); + + source_current = (OnigUChar*)RSTRING_PTR(source); + source_end = (OnigUChar*)RSTRING_END(source); + + buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0); + pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor); + while (source_current < source_end) { + /* increase multiplier using buffer count to converge quickly */ + size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH; + if (CASEMAP_DEBUG) { + fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */ + } + current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa); + *pre_buffer = current_buffer; + pre_buffer = &current_buffer->next; + current_buffer->next = NULL; + current_buffer->capa = capa; + buffer_length_or_invalid = enc->case_map(flags, + (const OnigUChar**)&source_current, source_end, + current_buffer->space, + current_buffer->space+current_buffer->capa, + enc); + if (buffer_length_or_invalid < 0) { + current_buffer = DATA_PTR(buffer_anchor); + DATA_PTR(buffer_anchor) = 0; + mapping_buffer_free(current_buffer); + rb_raise(rb_eArgError, "input string invalid"); + } + target_length += current_buffer->used = buffer_length_or_invalid; + } + if (CASEMAP_DEBUG) { + fprintf(stderr, 
"Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */ + } + + if (buffer_count==1) { + target = rb_str_new_with_class(source, (const char*)current_buffer->space, target_length); + } + else { + char *target_current; + + target = rb_str_new_with_class(source, 0, target_length); + target_current = RSTRING_PTR(target); + current_buffer = DATA_PTR(buffer_anchor); + while (current_buffer) { + memcpy(target_current, current_buffer->space, current_buffer->used); + target_current += current_buffer->used; + current_buffer = current_buffer->next; + } + } + current_buffer = DATA_PTR(buffer_anchor); + DATA_PTR(buffer_anchor) = 0; + mapping_buffer_free(current_buffer); + + /* TODO: check about string terminator character */ + OBJ_INFECT_RAW(target, source); + str_enc_copy(target, source); + /*ENC_CODERANGE_SET(mapped, cr);*/ + + return target; +} + +static void +rb_str_ascii_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc) +{ + OnigUChar *source_current, *source_end; + long old_length = RSTRING_LEN(source); + int length_or_invalid; + + if (old_length == 0) return; + + source_current = (OnigUChar*)RSTRING_PTR(source); + source_end = (OnigUChar*)RSTRING_END(source); + + length_or_invalid = onigenc_ascii_only_case_map(flags, + (const OnigUChar**)&source_current, source_end, + source_current, source_end, enc); + if (length_or_invalid < 0) + rb_raise(rb_eArgError, "input string invalid"); + if (CASEMAP_DEBUG && length_or_invalid != old_length) { + fprintf(stderr, "problem with rb_str_ascii_casemap" + "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid); + rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap" + "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid); + } +} + /* * call-seq: - * str.upcase! -> str or nil + * str.upcase! -> str or nil + * str.upcase!([options]) -> str or nil * * Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes * were made. 
- * Note: case replacement is effective only in ASCII region. + * + * See String#downcase for meaning of +options+ and use with different encodings. */ static VALUE -rb_str_upcase_bang(VALUE str) +rb_str_upcase_bang(int argc, VALUE *argv, VALUE str) { rb_encoding *enc; - char *s, *send; - int modify = 0; - int n; + OnigCaseFoldType flags = ONIGENC_CASE_UPCASE; + flags = check_case_options(argc, argv, flags); str_modify_keep_cr(str); enc = STR_ENC_GET(str); rb_str_check_dummy_enc(enc); - s = RSTRING_PTR(str); send = RSTRING_END(str); - if (single_byte_optimizable(str)) { + if (((flags&ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc)==1)) + || (!(flags&ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str)==ENC_CODERANGE_7BIT)) { + char *s = RSTRING_PTR(str), *send = RSTRING_END(str); + while (s < send) { unsigned int c = *(unsigned char*)s; if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') { *s = 'A' + (c - 'a'); - modify = 1; + flags |= ONIGENC_CASE_MODIFIED; } s++; } } - else { - int ascompat = rb_enc_asciicompat(enc); - - while (s < send) { - unsigned int c; - - if (ascompat && (c = *(unsigned char*)s) < 0x80) { - if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') { - *s = 'A' + (c - 'a'); - modify = 1; - } - s++; - } - else { - c = rb_enc_codepoint_len(s, send, &n, enc); - if (rb_enc_islower(c, enc)) { - /* assuming toupper returns codepoint with same size */ - rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); - modify = 1; - } - s += n; - } - } - } + else if (flags&ONIGENC_CASE_ASCII_ONLY) + rb_str_ascii_casemap(str, &flags, enc); + else + str_shared_replace(str, rb_str_casemap(str, &flags, enc)); - if (modify) return str; + if (ONIGENC_CASE_MODIFIED&flags) return str; return Qnil; } /* * call-seq: - * str.upcase -> new_str + * str.upcase -> new_str + * str.upcase([options]) -> new_str * * Returns a copy of <i>str</i> with all lowercase letters replaced with their - * uppercase counterparts. 
The operation is locale insensitive---only - * characters ``a'' to ``z'' are affected. - * Note: case replacement is effective only in ASCII region. + * uppercase counterparts. + * + * See String#downcase for meaning of +options+ and use with different encodings. * * "hEllO".upcase #=> "HELLO" */ static VALUE -rb_str_upcase(VALUE str) +rb_str_upcase(int argc, VALUE *argv, VALUE str) { str = rb_str_dup(str); - rb_str_upcase_bang(str); + rb_str_upcase_bang(argc, argv, str); return str; } - /* * call-seq: - * str.downcase! -> str or nil + * str.downcase! -> str or nil + * str.downcase!([options]) -> str or nil * * Downcases the contents of <i>str</i>, returning <code>nil</code> if no * changes were made. - * Note: case replacement is effective only in ASCII region. + * + * See String#downcase for meaning of +options+ and use with different encodings. */ static VALUE -rb_str_downcase_bang(VALUE str) +rb_str_downcase_bang(int argc, VALUE *argv, VALUE str) { rb_encoding *enc; - char *s, *send; - int modify = 0; + OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE; + flags = check_case_options(argc, argv, flags); str_modify_keep_cr(str); enc = STR_ENC_GET(str); rb_str_check_dummy_enc(enc); - s = RSTRING_PTR(str); send = RSTRING_END(str); - if (single_byte_optimizable(str)) { + if (((flags&ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc)==1)) + || (!(flags&ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str)==ENC_CODERANGE_7BIT)) { + char *s = RSTRING_PTR(str), *send = RSTRING_END(str); + while (s < send) { unsigned int c = *(unsigned char*)s; if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') { *s = 'a' + (c - 'A'); - modify = 1; + flags |= ONIGENC_CASE_MODIFIED; } s++; } } - else { - int ascompat = rb_enc_asciicompat(enc); - - while (s < send) { - unsigned int c; - int n; - - if (ascompat && (c = *(unsigned char*)s) < 0x80) { - if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') { - *s = 'a' + (c - 'A'); - modify = 1; - } - s++; - } - else { - 
c = rb_enc_codepoint_len(s, send, &n, enc); - if (rb_enc_isupper(c, enc)) { - /* assuming toupper returns codepoint with same size */ - rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); - modify = 1; - } - s += n; - } - } - } + else if (flags&ONIGENC_CASE_ASCII_ONLY) + rb_str_ascii_casemap(str, &flags, enc); + else + str_shared_replace(str, rb_str_casemap(str, &flags, enc)); - if (modify) return str; + if (ONIGENC_CASE_MODIFIED&flags) return str; return Qnil; } /* * call-seq: - * str.downcase -> new_str + * str.downcase -> new_str + * str.downcase([options]) -> new_str * * Returns a copy of <i>str</i> with all uppercase letters replaced with their - * lowercase counterparts. The operation is locale insensitive---only - * characters ``A'' to ``Z'' are affected. - * Note: case replacement is effective only in ASCII region. + * lowercase counterparts. Which letters exactly are replaced, and by which + * other letters, depends on the presence or absence of options, and on the + * +encoding+ of the string. + * + * The meaning of the +options+ is as follows: + * + * No option :: + * Full Unicode case mapping, suitable for most languages + * (see :turkic and :lithuanian options below for exceptions). + * Context-dependent case mapping as described in Table 3-14 of the + * Unicode standard is currently not supported. + * :ascii :: + * Only the ASCII region, i.e. the characters ``A'' to ``Z'' and + * ``a'' to ``z'', are affected. + * This option cannot be combined with any other option. + * :turkic :: + * Full Unicode case mapping, adapted for Turkic languages + * (Turkish, Azerbaijani, ...). This means that upper case I is mapped to + * lower case dotless i, and so on. + * :lithuanian :: + * Currently, just full Unicode case mapping. In the future, full Unicode + * case mapping adapted for Lithuanian (keeping the dot on the lower case + * i even if there is an accent on top). + * :fold :: + * Only available on +downcase+ and +downcase!+. 
Unicode case <b>folding</b>, + * which is more far-reaching than Unicode case mapping. + * This option currently cannot be combined with any other option + * (i.e. there is currently no variant for turkic languages). + * + * Please note that several assumptions that are valid for ASCII-only case + * conversions do not hold for more general case conversions. For example, + * the length of the result may not be the same as the length of the input + * (neither in characters nor in bytes), some roundtrip assumptions + * (e.g. str.downcase == str.upcase.downcase) may not apply, and Unicode + * normalization (i.e. String#unicode_normalize) is not necessarily maintained + * by case mapping operations. + * + * Non-ASCII case mapping/folding is currently supported for UTF-8, + * UTF-16BE/LE, UTF-32BE/LE, and ISO-8859-1~16 Strings/Symbols. + * This support will be extended to other encodings. * * "hEllO".downcase #=> "hello" */ static VALUE -rb_str_downcase(VALUE str) +rb_str_downcase(int argc, VALUE *argv, VALUE str) { str = rb_str_dup(str); - rb_str_downcase_bang(str); + rb_str_downcase_bang(argc, argv, str); return str; } /* * call-seq: - * str.capitalize! -> str or nil + * str.capitalize! -> str or nil + * str.capitalize!([options]) -> str or nil * * Modifies <i>str</i> by converting the first character to uppercase and the * remainder to lowercase. Returns <code>nil</code> if no changes are made. - * Note: case conversion is effective only in ASCII region. + * There is an exception for modern Georgian (mkhedruli/MTAVRULI), where + * the result is the same as for String#downcase, to avoid mixed case. + * + * See String#downcase for meaning of +options+ and use with different encodings. * * a = "hello" * a.capitalize! 
#=> "Hello" @@ -4890,47 +6799,35 @@ rb_str_downcase(VALUE str) */ static VALUE -rb_str_capitalize_bang(VALUE str) +rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str) { rb_encoding *enc; - char *s, *send; - int modify = 0; - unsigned int c; - int n; + OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE; + flags = check_case_options(argc, argv, flags); str_modify_keep_cr(str); enc = STR_ENC_GET(str); rb_str_check_dummy_enc(enc); if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil; - s = RSTRING_PTR(str); send = RSTRING_END(str); - - c = rb_enc_codepoint_len(s, send, &n, enc); - if (rb_enc_islower(c, enc)) { - rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); - modify = 1; - } - s += n; - while (s < send) { - c = rb_enc_codepoint_len(s, send, &n, enc); - if (rb_enc_isupper(c, enc)) { - rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); - modify = 1; - } - s += n; - } + if (flags&ONIGENC_CASE_ASCII_ONLY) + rb_str_ascii_casemap(str, &flags, enc); + else + str_shared_replace(str, rb_str_casemap(str, &flags, enc)); - if (modify) return str; + if (ONIGENC_CASE_MODIFIED&flags) return str; return Qnil; } /* * call-seq: - * str.capitalize -> new_str + * str.capitalize -> new_str + * str.capitalize([options]) -> new_str * * Returns a copy of <i>str</i> with the first character converted to uppercase * and the remainder to lowercase. - * Note: case conversion is effective only in ASCII region. + * + * See String#downcase for meaning of +options+ and use with different encodings. * * "hello".capitalize #=> "Hello" * "HELLO".capitalize #=> "Hello" @@ -4938,73 +6835,64 @@ rb_str_capitalize_bang(VALUE str) */ static VALUE -rb_str_capitalize(VALUE str) +rb_str_capitalize(int argc, VALUE *argv, VALUE str) { str = rb_str_dup(str); - rb_str_capitalize_bang(str); + rb_str_capitalize_bang(argc, argv, str); return str; } /* * call-seq: - * str.swapcase! -> str or nil + * str.swapcase! 
-> str or nil + * str.swapcase!([options]) -> str or nil * * Equivalent to <code>String#swapcase</code>, but modifies the receiver in * place, returning <i>str</i>, or <code>nil</code> if no changes were made. - * Note: case conversion is effective only in ASCII region. + * + * See String#downcase for meaning of +options+ and use with different encodings. */ static VALUE -rb_str_swapcase_bang(VALUE str) +rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str) { rb_encoding *enc; - char *s, *send; - int modify = 0; - int n; + OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE; + flags = check_case_options(argc, argv, flags); str_modify_keep_cr(str); enc = STR_ENC_GET(str); rb_str_check_dummy_enc(enc); - s = RSTRING_PTR(str); send = RSTRING_END(str); - while (s < send) { - unsigned int c = rb_enc_codepoint_len(s, send, &n, enc); - - if (rb_enc_isupper(c, enc)) { - /* assuming toupper returns codepoint with same size */ - rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); - modify = 1; - } - else if (rb_enc_islower(c, enc)) { - /* assuming tolower returns codepoint with same size */ - rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); - modify = 1; - } - s += n; - } + if (flags&ONIGENC_CASE_ASCII_ONLY) + rb_str_ascii_casemap(str, &flags, enc); + else + str_shared_replace(str, rb_str_casemap(str, &flags, enc)); - if (modify) return str; + if (ONIGENC_CASE_MODIFIED&flags) return str; return Qnil; } /* * call-seq: - * str.swapcase -> new_str + * str.swapcase -> new_str + * str.swapcase([options]) -> new_str * * Returns a copy of <i>str</i> with uppercase alphabetic characters converted * to lowercase and lowercase characters converted to uppercase. - * Note: case conversion is effective only in ASCII region. + * + * See String#downcase for meaning of +options+ and use with different encodings. 
* * "Hello".swapcase #=> "hELLO" * "cYbEr_PuNk11".swapcase #=> "CyBeR_pUnK11" */ static VALUE -rb_str_swapcase(VALUE str) +rb_str_swapcase(int argc, VALUE *argv, VALUE str) { str = rb_str_dup(str); - rb_str_swapcase_bang(str); + rb_str_swapcase_bang(argc, argv, str); return str; } @@ -5082,9 +6970,10 @@ tr_trans(VALUE str, VALUE src, VALUE repl, int sflag) int cflag = 0; unsigned int c, c0, last = 0; int modify = 0, i, l; - char *s, *send; + unsigned char *s, *send; VALUE hash = 0; int singlebyte = single_byte_optimizable(str); + int termlen; int cr; #define CHECK_IF_ASCII(c) \ @@ -5162,20 +7051,21 @@ tr_trans(VALUE str, VALUE src, VALUE repl, int sflag) } } - if (cr == ENC_CODERANGE_VALID) + if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1)) cr = ENC_CODERANGE_7BIT; str_modify_keep_cr(str); - s = RSTRING_PTR(str); send = RSTRING_END(str); + s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str); + termlen = rb_enc_mbminlen(enc); if (sflag) { int clen, tlen; long offset, max = RSTRING_LEN(str); unsigned int save = -1; - char *buf = ALLOC_N(char, max), *t = buf; + unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf; while (s < send) { int may_modify = 0; - c0 = c = rb_enc_codepoint_len(s, send, &clen, e1); + c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1); tlen = enc == e1 ? 
clen : rb_enc_codelen(c, enc); s += clen; @@ -5208,10 +7098,10 @@ tr_trans(VALUE str, VALUE src, VALUE repl, int sflag) c = c0; if (enc != e1) may_modify = 1; } - while (t - buf + tlen >= max) { - offset = t - buf; - max *= 2; - REALLOC_N(buf, char, max); + if ((offset = t - buf) + tlen > max) { + size_t MAYBE_UNUSED(old) = max + termlen; + max = offset + tlen + (send - s); + SIZED_REALLOC_N(buf, unsigned char, max + termlen, old); t = buf + offset; } rb_enc_mbcput(c, t, enc); @@ -5222,10 +7112,10 @@ tr_trans(VALUE str, VALUE src, VALUE repl, int sflag) t += tlen; } if (!STR_EMBED_P(str)) { - xfree(RSTRING(str)->as.heap.ptr); + ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str)); } - *t = '\0'; - RSTRING(str)->as.heap.ptr = buf; + TERM_FILL((char *)t, termlen); + RSTRING(str)->as.heap.ptr = (char *)buf; RSTRING(str)->as.heap.len = t - buf; STR_SET_NOEMBED(str); RSTRING(str)->as.heap.aux.capa = max; @@ -5249,13 +7139,13 @@ tr_trans(VALUE str, VALUE src, VALUE repl, int sflag) } } else { - int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2); - long offset; - char *buf = ALLOC_N(char, max), *t = buf; + int clen, tlen; + long offset, max = (long)((send - s) * 1.2); + unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf; while (s < send) { int may_modify = 0; - c0 = c = rb_enc_codepoint_len(s, send, &clen, e1); + c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1); tlen = enc == e1 ? 
clen : rb_enc_codelen(c, enc); if (c < 256) { @@ -5281,10 +7171,10 @@ tr_trans(VALUE str, VALUE src, VALUE repl, int sflag) c = c0; if (enc != e1) may_modify = 1; } - while (t - buf + tlen >= max) { - offset = t - buf; - max *= 2; - REALLOC_N(buf, char, max); + if ((offset = t - buf) + tlen > max) { + size_t MAYBE_UNUSED(old) = max + termlen; + max = offset + tlen + (long)((send - s) * 1.2); + SIZED_REALLOC_N(buf, unsigned char, max + termlen, old); t = buf + offset; } if (s != t) { @@ -5298,10 +7188,10 @@ tr_trans(VALUE str, VALUE src, VALUE repl, int sflag) t += tlen; } if (!STR_EMBED_P(str)) { - xfree(RSTRING(str)->as.heap.ptr); + ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str)); } - *t = '\0'; - RSTRING(str)->as.heap.ptr = buf; + TERM_FILL((char *)t, termlen); + RSTRING(str)->as.heap.ptr = (char *)buf; RSTRING(str)->as.heap.len = t - buf; STR_SET_NOEMBED(str); RSTRING(str)->as.heap.aux.capa = max; @@ -5353,7 +7243,7 @@ rb_str_tr_bang(VALUE str, VALUE src, VALUE repl) * "hello".tr('a-y', 'b-z') #=> "ifmmp" * "hello".tr('^aeiou', '*') #=> "*e**o" * - * The backslash character <code>\</code> can be used to escape + * The backslash character <code>\\</code> can be used to escape * <code>^</code> or <code>-</code> and is otherwise ignored unless it * appears at the end of a range or the end of the +from_str+ or +to_str+: * @@ -5442,7 +7332,7 @@ tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first, static int -tr_find(unsigned int c, char table[TR_TABLE_SIZE], VALUE del, VALUE nodel) +tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel) { if (c < 256) { return table[c] != 0; @@ -5524,7 +7414,7 @@ rb_str_delete_bang(int argc, VALUE *argv, VALUE str) s += clen; } } - *t = '\0'; + TERM_FILL(t, TERM_LEN(str)); STR_SET_LEN(str, t - RSTRING_PTR(str)); ENC_CODERANGE_SET(str, cr); @@ -5570,7 +7460,7 @@ rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str) char squeez[TR_TABLE_SIZE]; rb_encoding *enc = 0; VALUE del = 0, nodel = 0; 
- char *s, *send, *t; + unsigned char *s, *send, *t; int i, modify = 0; int ascompat, singlebyte = single_byte_optimizable(str); unsigned int save; @@ -5591,32 +7481,33 @@ rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str) } str_modify_keep_cr(str); - s = t = RSTRING_PTR(str); + s = t = (unsigned char *)RSTRING_PTR(str); if (!s || RSTRING_LEN(str) == 0) return Qnil; - send = RSTRING_END(str); + send = (unsigned char *)RSTRING_END(str); save = -1; ascompat = rb_enc_asciicompat(enc); if (singlebyte) { while (s < send) { - unsigned int c = *(unsigned char*)s++; + unsigned int c = *s++; if (c != save || (argc > 0 && !squeez[c])) { *t++ = save = c; } } - } else { + } + else { while (s < send) { unsigned int c; int clen; - if (ascompat && (c = *(unsigned char*)s) < 0x80) { + if (ascompat && (c = *s) < 0x80) { if (c != save || (argc > 0 && !squeez[c])) { *t++ = save = c; } s++; } else { - c = rb_enc_codepoint_len(s, send, &clen, enc); + c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc); if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) { if (t != s) rb_enc_mbcput(c, t, enc); @@ -5628,9 +7519,9 @@ rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str) } } - *t = '\0'; - if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) { - STR_SET_LEN(str, t - RSTRING_PTR(str)); + TERM_FILL((char *)t, TERM_LEN(str)); + if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) { + STR_SET_LEN(str, (char *)t - RSTRING_PTR(str)); modify = 1; } @@ -5702,13 +7593,13 @@ rb_str_tr_s(VALUE str, VALUE src, VALUE repl) /* * call-seq: - * str.count([other_str]+) -> fixnum + * str.count([other_str]+) -> integer * * Each +other_str+ parameter defines a set of characters to count. The * intersection of these sets defines the characters to count in +str+. Any * +other_str+ that starts with a caret <code>^</code> is negated. The * sequence <code>c1-c2</code> means all characters between c1 and c2. 
The - * backslash character <code>\</code> can be used to escape <code>^</code> or + * backslash character <code>\\</code> can be used to escape <code>^</code> or * <code>-</code> and is otherwise ignored unless it appears at the end of a * sequence or the end of a +other_str+. * @@ -5732,21 +7623,25 @@ rb_str_count(int argc, VALUE *argv, VALUE str) { char table[TR_TABLE_SIZE]; rb_encoding *enc = 0; - VALUE del = 0, nodel = 0; + VALUE del = 0, nodel = 0, tstr; char *s, *send; int i; int ascompat; rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS); - for (i=0; i<argc; i++) { - VALUE tstr = argv[i]; - unsigned char c; - StringValue(tstr); - enc = rb_enc_check(str, tstr); - if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) && - (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) { + tstr = argv[0]; + StringValue(tstr); + enc = rb_enc_check(str, tstr); + if (argc == 1) { + const char *ptstr; + if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) && + (ptstr = RSTRING_PTR(tstr), + ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) && + !is_broken_string(str)) { int n = 0; + int clen; + unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc); s = RSTRING_PTR(str); if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0); @@ -5756,7 +7651,14 @@ rb_str_count(int argc, VALUE *argv, VALUE str) } return INT2NUM(n); } - tr_setup_table(tstr, table, i==0, &del, &nodel, enc); + } + + tr_setup_table(tstr, table, TRUE, &del, &nodel, enc); + for (i=1; i<argc; i++) { + tstr = argv[i]; + StringValue(tstr); + enc = rb_enc_check(str, tstr); + tr_setup_table(tstr, table, FALSE, &del, &nodel, enc); } s = RSTRING_PTR(str); @@ -5786,6 +7688,16 @@ rb_str_count(int argc, VALUE *argv, VALUE str) return INT2NUM(i); } +static VALUE +rb_fs_check(VALUE val) +{ + if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) { + val = rb_check_string_type(val); + if (NIL_P(val)) return 0; + } + return val; +} + 
static const char isspacetable[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -5807,39 +7719,71 @@ static const char isspacetable[256] = { #define ascii_isspace(c) isspacetable[(unsigned char)(c)] +static long +split_string(VALUE result, VALUE str, long beg, long len, long empty_count) +{ + if (empty_count >= 0 && len == 0) { + return empty_count + 1; + } + if (empty_count > 0) { + /* make different substrings */ + if (result) { + do { + rb_ary_push(result, str_new_empty(str)); + } while (--empty_count > 0); + } + else { + do { + rb_yield(str_new_empty(str)); + } while (--empty_count > 0); + } + } + str = rb_str_subseq(str, beg, len); + if (result) { + rb_ary_push(result, str); + } + else { + rb_yield(str); + } + return empty_count; +} + /* * call-seq: - * str.split(pattern=$;, [limit]) -> anArray + * str.split(pattern=nil, [limit]) -> an_array + * str.split(pattern=nil, [limit]) {|sub| block } -> str * * Divides <i>str</i> into substrings based on a delimiter, returning an array * of these substrings. * * If <i>pattern</i> is a <code>String</code>, then its contents are used as * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single - * space, <i>str</i> is split on whitespace, with leading whitespace and runs - * of contiguous whitespace characters ignored. + * space, <i>str</i> is split on whitespace, with leading and trailing + * whitespace and runs of contiguous whitespace characters ignored. * * If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the * pattern matches. Whenever the pattern matches a zero-length string, * <i>str</i> is split into individual characters. If <i>pattern</i> contains * groups, the respective matches will be returned in the array as well. * - * If <i>pattern</i> is omitted, the value of <code>$;</code> is used. 
If - * <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is - * split on whitespace as if ` ' were specified. + * If <i>pattern</i> is <code>nil</code>, the value of <code>$;</code> is used. + * If <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is + * split on whitespace as if ' ' were specified. * * If the <i>limit</i> parameter is omitted, trailing null fields are - * suppressed. If <i>limit</i> is a positive number, at most that number of - * fields will be returned (if <i>limit</i> is <code>1</code>, the entire - * string is returned as the only entry in an array). If negative, there is no + * suppressed. If <i>limit</i> is a positive number, at most that number + * of split substrings will be returned (captured groups will be returned + * as well, but are not counted towards the limit). + * If <i>limit</i> is <code>1</code>, the entire + * string is returned as the only entry in an array. If negative, there is no * limit to the number of fields returned, and trailing null fields are not * suppressed. * * When the input +str+ is empty an empty Array is returned as the string is * considered to have no fields to split. * - * " now's the time".split #=> ["now's", "the", "time"] - * " now's the time".split(' ') #=> ["now's", "the", "time"] + * " now's the time ".split #=> ["now's", "the", "time"] + * " now's the time ".split(' ') #=> ["now's", "the", "time"] * " now's the time".split(/ /) #=> ["", "now's", "", "the", "time"] * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"] * "hello".split(//) #=> ["h", "e", "l", "l", "o"] @@ -5851,7 +7795,12 @@ static const char isspacetable[256] = { * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"] * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""] * + * "1:2:3".split(/(:)()()/, 2) #=> ["1", ":", "", "", "2:3"] + * * "".split(',', -1) #=> [] + * + * If a block is given, invoke the block with each split substring. 
+ * */ static VALUE @@ -5861,34 +7810,44 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) VALUE spat; VALUE limit; enum {awk, string, regexp} split_type; - long beg, end, i = 0; + long beg, end, i = 0, empty_count = -1; int lim = 0; VALUE result, tmp; + result = rb_block_given_p() ? Qfalse : Qnil; if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) { lim = NUM2INT(limit); if (lim <= 0) limit = Qnil; else if (lim == 1) { if (RSTRING_LEN(str) == 0) - return rb_ary_new2(0); - return rb_ary_new3(1, str); + return result ? rb_ary_new2(0) : str; + tmp = rb_str_dup(str); + if (!result) { + rb_yield(tmp); + return str; + } + return rb_ary_new3(1, tmp); } i = 1; } + if (NIL_P(limit) && !lim) empty_count = 0; enc = STR_ENC_GET(str); - if (NIL_P(spat)) { - if (!NIL_P(rb_fs)) { - spat = rb_fs; - goto fs_set; - } + split_type = regexp; + if (!NIL_P(spat)) { + spat = get_pat_quoted(spat, 0); + } + else if (NIL_P(spat = rb_fs)) { split_type = awk; } - else { - fs_set: - if (RB_TYPE_P(spat, T_STRING)) { + else if (!(spat = rb_fs_check(spat))) { + rb_raise(rb_eTypeError, "value of $; must be String or Regexp"); + } + if (split_type != awk) { + if (BUILTIN_TYPE(spat) == T_STRING) { rb_encoding *enc2 = STR_ENC_GET(spat); + mustnot_broken(spat); split_type = string; if (RSTRING_LEN(spat) == 0) { /* Special case - split into chars */ @@ -5896,7 +7855,7 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) split_type = regexp; } else if (rb_enc_asciicompat(enc2) == 1) { - if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){ + if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' ') { split_type = awk; } } @@ -5908,13 +7867,11 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) } } } - else { - spat = get_pat(spat, 1); - split_type = regexp; - } } - result = rb_ary_new(); +#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count)) + + if (result) result = rb_ary_new(); beg = 0; if (split_type == awk) { char *ptr = RSTRING_PTR(str); @@ 
-5938,7 +7895,7 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) } } else if (ascii_isspace(c)) { - rb_ary_push(result, rb_str_subseq(str, beg, end-beg)); + SPLIT_STR(beg, end-beg); skip = 1; beg = ptr - bptr; if (!NIL_P(limit)) ++i; @@ -5965,7 +7922,7 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) } } else if (rb_isspace(c)) { - rb_ary_push(result, rb_str_subseq(str, beg, end-beg)); + SPLIT_STR(beg, end-beg); skip = 1; beg = ptr - bptr; if (!NIL_P(limit)) ++i; @@ -5978,17 +7935,13 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) } else if (split_type == string) { char *ptr = RSTRING_PTR(str); - char *temp = ptr; + char *str_start = ptr; + char *substr_start = ptr; char *eptr = RSTRING_END(str); char *sptr = RSTRING_PTR(spat); long slen = RSTRING_LEN(spat); - if (is_broken_string(str)) { - rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str))); - } - if (is_broken_string(spat)) { - rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat))); - } + mustnot_broken(str); enc = rb_enc_check(str, spat); while (ptr < eptr && (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) { @@ -5998,11 +7951,12 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) ptr = t; continue; } - rb_ary_push(result, rb_str_subseq(str, ptr - temp, end)); + SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start); ptr += end + slen; + substr_start = ptr; if (!NIL_P(limit) && lim <= ++i) break; } - beg = ptr - temp; + beg = ptr - str_start; } else { char *ptr = RSTRING_PTR(str); @@ -6011,23 +7965,24 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) long idx; int last_null = 0; struct re_registers *regs; + VALUE match = 0; - while ((end = rb_reg_search(spat, str, start, 0)) >= 0) { - regs = RMATCH_REGS(rb_backref_get()); + for (; (end = rb_reg_search(spat, str, start, 0)) >= 0; + (match ? 
(rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) { + match = rb_backref_get(); + if (!result) rb_match_busy(match); + regs = RMATCH_REGS(match); if (start == end && BEG(0) == END(0)) { if (!ptr) { - rb_ary_push(result, str_new_empty(str)); + SPLIT_STR(0, 0); break; } else if (last_null == 1) { - rb_ary_push(result, rb_str_subseq(str, beg, - rb_enc_fast_mbclen(ptr+beg, - ptr+len, - enc))); + SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, ptr+len, enc)); beg = start; } else { - if (ptr+start == ptr+len) + if (start == len) start++; else start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc); @@ -6036,37 +7991,24 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) } } else { - rb_ary_push(result, rb_str_subseq(str, beg, end-beg)); + SPLIT_STR(beg, end-beg); beg = start = END(0); } last_null = 0; for (idx=1; idx < regs->num_regs; idx++) { if (BEG(idx) == -1) continue; - if (BEG(idx) == END(idx)) - tmp = str_new_empty(str); - else - tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx)); - rb_ary_push(result, tmp); + SPLIT_STR(BEG(idx), END(idx)-BEG(idx)); } if (!NIL_P(limit) && lim <= ++i) break; } + if (match) rb_match_unbusy(match); } if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) { - if (RSTRING_LEN(str) == beg) - tmp = str_new_empty(str); - else - tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg); - rb_ary_push(result, tmp); - } - if (NIL_P(limit) && lim == 0) { - long len; - while ((len = RARRAY_LEN(result)) > 0 && - (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0)) - rb_ary_pop(result); + SPLIT_STR(beg, RSTRING_LEN(str)-beg); } - return result; + return result ? 
result : str; } VALUE @@ -6075,23 +8017,202 @@ rb_str_split(VALUE str, const char *sep0) VALUE sep; StringValue(str); - sep = rb_str_new2(sep0); + sep = rb_str_new_cstr(sep0); return rb_str_split_m(1, &sep, str); } +static int +enumerator_wantarray(const char *method) +{ + if (rb_block_given_p()) { +#if STRING_ENUMERATORS_WANTARRAY + rb_warn("given block not used"); +#else + rb_warning("passing a block to String#%s is deprecated", method); + return 0; +#endif + } + return 1; +} + +#define WANTARRAY(m, size) \ + (enumerator_wantarray(m) ? rb_ary_new_capa(size) : 0) + +static inline int +enumerator_element(VALUE ary, VALUE e) +{ + if (ary) { + rb_ary_push(ary, e); + return 0; + } + else { + rb_yield(e); + return 1; + } +} + +#define ENUM_ELEM(ary, e) enumerator_element(ary, e) + +static const char * +chomp_newline(const char *p, const char *e, rb_encoding *enc) +{ + const char *prev = rb_enc_prev_char(p, e, e, enc); + if (rb_enc_is_newline(prev, e, enc)) { + e = prev; + prev = rb_enc_prev_char(p, e, e, enc); + if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r') + e = prev; + } + return e; +} + +static VALUE +rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary) +{ + rb_encoding *enc; + VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse; + const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted; + long pos, len, rslen; + int rsnewline = 0; + + if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0) + rs = rb_rs; + if (!NIL_P(opts)) { + static ID keywords[1]; + if (!keywords[0]) { + keywords[0] = rb_intern_const("chomp"); + } + rb_get_kwargs(opts, keywords, 0, 1, &chomp); + chomp = (chomp != Qundef && RTEST(chomp)); + } + + if (NIL_P(rs)) { + if (!ENUM_ELEM(ary, str)) { + return ary; + } + else { + return orig; + } + } + + if (!RSTRING_LEN(str)) goto end; + str = rb_str_new_frozen(str); + ptr = subptr = RSTRING_PTR(str); + pend = RSTRING_END(str); + len = RSTRING_LEN(str); + StringValue(rs); + rslen = RSTRING_LEN(rs); + + if (rs == 
rb_default_rs) + enc = rb_enc_get(str); + else + enc = rb_enc_check(str, rs); + + if (rslen == 0) { + /* paragraph mode */ + int n; + const char *eol = NULL; + subend = subptr; + while (subend < pend) { + do { + if (rb_enc_ascget(subend, pend, &n, enc) != '\r') + n = 0; + rslen = n + rb_enc_mbclen(subend + n, pend, enc); + if (rb_enc_is_newline(subend + n, pend, enc)) { + if (eol == subend) break; + subend += rslen; + if (subptr) eol = subend; + } + else { + if (!subptr) subptr = subend; + subend += rslen; + } + rslen = 0; + } while (subend < pend); + if (!subptr) break; + line = rb_str_subseq(str, subptr - ptr, + subend - subptr + (chomp ? 0 : rslen)); + if (ENUM_ELEM(ary, line)) { + str_mod_check(str, ptr, len); + } + subptr = eol = NULL; + } + goto end; + } + else { + rsptr = RSTRING_PTR(rs); + if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) && + rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) { + rsnewline = 1; + } + } + + if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) { + rs = rb_str_new(rsptr, rslen); + rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil); + rsptr = RSTRING_PTR(rs); + rslen = RSTRING_LEN(rs); + } + + while (subptr < pend) { + pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc); + if (pos < 0) break; + hit = subptr + pos; + adjusted = rb_enc_right_char_head(subptr, hit, pend, enc); + if (hit != adjusted) { + subptr = adjusted; + continue; + } + subend = hit += rslen; + if (chomp) { + if (rsnewline) { + subend = chomp_newline(subptr, subend, enc); + } + else { + subend -= rslen; + } + } + line = rb_str_subseq(str, subptr - ptr, subend - subptr); + if (ENUM_ELEM(ary, line)) { + str_mod_check(str, ptr, len); + } + subptr = hit; + } + + if (subptr != pend) { + if (chomp) { + if (rsnewline) { + pend = chomp_newline(subptr, pend, enc); + } + else if (pend - subptr >= rslen && + memcmp(pend - rslen, rsptr, rslen) == 0) { + pend -= rslen; + } + } + line = rb_str_subseq(str, subptr - ptr, pend - subptr); + ENUM_ELEM(ary, 
line); + RB_GC_GUARD(str); + } + + end: + if (ary) + return ary; + else + return orig; +} /* * call-seq: - * str.each_line(separator=$/) {|substr| block } -> str - * str.each_line(separator=$/) -> an_enumerator + * str.each_line(separator=$/ [, getline_args]) {|substr| block } -> str + * str.each_line(separator=$/ [, getline_args]) -> an_enumerator * - * str.lines(separator=$/) {|substr| block } -> str - * str.lines(separator=$/) -> an_enumerator + * Splits <i>str</i> using the supplied parameter as the record + * separator (<code>$/</code> by default), passing each substring in + * turn to the supplied block. If a zero-length record separator is + * supplied, the string is split into paragraphs delimited by + * multiple successive newlines. * - * Splits <i>str</i> using the supplied parameter as the record separator - * (<code>$/</code> by default), passing each substring in turn to the supplied - * block. If a zero-length record separator is supplied, the string is split - * into paragraphs delimited by multiple successive newlines. + * See IO.readlines for details about getline_args. * * If no block is given, an enumerator is returned instead. 
* @@ -6113,120 +8234,69 @@ rb_str_split(VALUE str, const char *sep0) * "o\nworl" * "d" * Example three - * "hello\n\n\n" + * "hello\n\n" * "world" */ static VALUE rb_str_each_line(int argc, VALUE *argv, VALUE str) { - rb_encoding *enc; - VALUE rs; - unsigned int newline; - const char *p, *pend, *s, *ptr; - long len, rslen; - VALUE line; - int n; - VALUE orig = str; - - if (argc == 0) { - rs = rb_rs; - } - else { - rb_scan_args(argc, argv, "01", &rs); - } - RETURN_ENUMERATOR(str, argc, argv); - if (NIL_P(rs)) { - rb_yield(str); - return orig; - } - str = rb_str_new4(str); - ptr = p = s = RSTRING_PTR(str); - pend = p + RSTRING_LEN(str); - len = RSTRING_LEN(str); - StringValue(rs); - if (rs == rb_default_rs) { - enc = rb_enc_get(str); - while (p < pend) { - char *p0; + RETURN_SIZED_ENUMERATOR(str, argc, argv, 0); + return rb_str_enumerate_lines(argc, argv, str, 0); +} - p = memchr(p, '\n', pend - p); - if (!p) break; - p0 = rb_enc_left_char_head(s, p, pend, enc); - if (!rb_enc_is_newline(p0, pend, enc)) { - p++; - continue; - } - p = p0 + rb_enc_mbclen(p0, pend, enc); - line = rb_str_new5(str, s, p - s); - OBJ_INFECT(line, str); - rb_enc_cr_str_copy_for_substr(line, str); - rb_yield(line); - str_mod_check(str, ptr, len); - s = p; - } - goto finish; - } +/* + * call-seq: + * str.lines(separator=$/ [, getline_args]) -> an_array + * + * Returns an array of lines in <i>str</i> split using the supplied + * record separator (<code>$/</code> by default). This is a + * shorthand for <code>str.each_line(separator, getline_args).to_a</code>. + * + * See IO.readlines for details about getline_args. + * + * "hello\nworld\n".lines #=> ["hello\n", "world\n"] + * "hello world".lines(' ') #=> ["hello ", " ", "world"] + * "hello\nworld\n".lines(chomp: true) #=> ["hello", "world"] + * + * If a block is given, which is a deprecated form, works the same as + * <code>each_line</code>. 
+ */ - enc = rb_enc_check(str, rs); - rslen = RSTRING_LEN(rs); - if (rslen == 0) { - newline = '\n'; - } - else { - newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc); - } +static VALUE +rb_str_lines(int argc, VALUE *argv, VALUE str) +{ + VALUE ary = WANTARRAY("lines", 0); + return rb_str_enumerate_lines(argc, argv, str, ary); +} - while (p < pend) { - unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc); +static VALUE +rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj) +{ + return LONG2FIX(RSTRING_LEN(str)); +} - again: - if (rslen == 0 && c == newline) { - p += n; - if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) { - goto again; - } - while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) { - p += n; - } - p -= n; - } - if (c == newline && - (rslen <= 1 || - (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) { - line = rb_str_new5(str, s, p - s + (rslen ? rslen : n)); - OBJ_INFECT(line, str); - rb_enc_cr_str_copy_for_substr(line, str); - rb_yield(line); - str_mod_check(str, ptr, len); - s = p + (rslen ? rslen : n); - } - p += n; - } +static VALUE +rb_str_enumerate_bytes(VALUE str, VALUE ary) +{ + long i; - finish: - if (s != pend) { - line = rb_str_new5(str, s, pend - s); - OBJ_INFECT(line, str); - rb_enc_cr_str_copy_for_substr(line, str); - rb_yield(line); - RB_GC_GUARD(str); + for (i=0; i<RSTRING_LEN(str); i++) { + ENUM_ELEM(ary, INT2FIX(RSTRING_PTR(str)[i] & 0xff)); } - - return orig; + if (ary) + return ary; + else + return str; } - /* * call-seq: - * str.bytes {|fixnum| block } -> str - * str.bytes -> an_enumerator - * - * str.each_byte {|fixnum| block } -> str + * str.each_byte {|integer| block } -> str * str.each_byte -> an_enumerator * - * Passes each byte in <i>str</i> to the given block, or returns - * an enumerator if no block is given. + * Passes each byte in <i>str</i> to the given block, or returns an + * enumerator if no block is given. 
* * "hello".each_byte {|c| print c, ' ' } * @@ -6238,21 +8308,68 @@ rb_str_each_line(int argc, VALUE *argv, VALUE str) static VALUE rb_str_each_byte(VALUE str) { - long i; + RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size); + return rb_str_enumerate_bytes(str, 0); +} - RETURN_ENUMERATOR(str, 0, 0); - for (i=0; i<RSTRING_LEN(str); i++) { - rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff)); - } - return str; +/* + * call-seq: + * str.bytes -> an_array + * + * Returns an array of bytes in <i>str</i>. This is a shorthand for + * <code>str.each_byte.to_a</code>. + * + * If a block is given, which is a deprecated form, works the same as + * <code>each_byte</code>. + */ + +static VALUE +rb_str_bytes(VALUE str) +{ + VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str)); + return rb_str_enumerate_bytes(str, ary); } +static VALUE +rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj) +{ + return rb_str_length(str); +} + +static VALUE +rb_str_enumerate_chars(VALUE str, VALUE ary) +{ + VALUE orig = str; + long i, len, n; + const char *ptr; + rb_encoding *enc; + + str = rb_str_new_frozen(str); + ptr = RSTRING_PTR(str); + len = RSTRING_LEN(str); + enc = rb_enc_get(str); + + if (ENC_CODERANGE_CLEAN_P(ENC_CODERANGE(str))) { + for (i = 0; i < len; i += n) { + n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc); + ENUM_ELEM(ary, rb_str_subseq(str, i, n)); + } + } + else { + for (i = 0; i < len; i += n) { + n = rb_enc_mbclen(ptr + i, ptr + len, enc); + ENUM_ELEM(ary, rb_str_subseq(str, i, n)); + } + } + RB_GC_GUARD(str); + if (ary) + return ary; + else + return orig; +} /* * call-seq: - * str.chars {|cstr| block } -> str - * str.chars -> an_enumerator - * * str.each_char {|cstr| block } -> str * str.each_char -> an_enumerator * @@ -6269,44 +8386,67 @@ rb_str_each_byte(VALUE str) static VALUE rb_str_each_char(VALUE str) { + RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size); + return rb_str_enumerate_chars(str, 0); +} + +/* + * call-seq: + * str.chars -> an_array + * + * Returns 
an array of characters in <i>str</i>. This is a shorthand + * for <code>str.each_char.to_a</code>. + * + * If a block is given, which is a deprecated form, works the same as + * <code>each_char</code>. + */ + +static VALUE +rb_str_chars(VALUE str) +{ + VALUE ary = WANTARRAY("chars", rb_str_strlen(str)); + return rb_str_enumerate_chars(str, ary); +} + +static VALUE +rb_str_enumerate_codepoints(VALUE str, VALUE ary) +{ VALUE orig = str; - long i, len, n; - const char *ptr; + int n; + unsigned int c; + const char *ptr, *end; rb_encoding *enc; - RETURN_ENUMERATOR(str, 0, 0); - str = rb_str_new4(str); + if (single_byte_optimizable(str)) + return rb_str_enumerate_bytes(str, ary); + + str = rb_str_new_frozen(str); ptr = RSTRING_PTR(str); - len = RSTRING_LEN(str); - enc = rb_enc_get(str); - switch (ENC_CODERANGE(str)) { - case ENC_CODERANGE_VALID: - case ENC_CODERANGE_7BIT: - for (i = 0; i < len; i += n) { - n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc); - rb_yield(rb_str_subseq(str, i, n)); - } - break; - default: - for (i = 0; i < len; i += n) { - n = rb_enc_mbclen(ptr + i, ptr + len, enc); - rb_yield(rb_str_subseq(str, i, n)); - } + end = RSTRING_END(str); + enc = STR_ENC_GET(str); + + while (ptr < end) { + c = rb_enc_codepoint_len(ptr, end, &n, enc); + ENUM_ELEM(ary, UINT2NUM(c)); + ptr += n; } - return orig; + RB_GC_GUARD(str); + if (ary) + return ary; + else + return orig; } /* * call-seq: - * str.codepoints {|integer| block } -> str - * str.codepoints -> an_enumerator - * * str.each_codepoint {|integer| block } -> str * str.each_codepoint -> an_enumerator * * Passes the <code>Integer</code> ordinal of each character in <i>str</i>, * also known as a <i>codepoint</i> when applied to Unicode strings to the - * given block. + * given block. For encodings other than UTF-8/UTF-16(BE|LE)/UTF-32(BE|LE), + * values are directly derived from the binary representation + * of each character. * * If no block is given, an enumerator is returned instead. 
* @@ -6320,25 +8460,176 @@ rb_str_each_char(VALUE str) static VALUE rb_str_each_codepoint(VALUE str) { - VALUE orig = str; - int n; - unsigned int c; + RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size); + return rb_str_enumerate_codepoints(str, 0); +} + +/* + * call-seq: + * str.codepoints -> an_array + * + * Returns an array of the <code>Integer</code> ordinals of the + * characters in <i>str</i>. This is a shorthand for + * <code>str.each_codepoint.to_a</code>. + * + * If a block is given, which is a deprecated form, works the same as + * <code>each_codepoint</code>. + */ + +static VALUE +rb_str_codepoints(VALUE str) +{ + VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str)); + return rb_str_enumerate_codepoints(str, ary); +} + +static regex_t * +get_reg_grapheme_cluster(rb_encoding *enc) +{ + int encidx = rb_enc_to_index(enc); + regex_t *reg_grapheme_cluster = NULL; + static regex_t *reg_grapheme_cluster_utf8 = NULL; + + /* synchronize */ + if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) { + reg_grapheme_cluster = reg_grapheme_cluster_utf8; + } + if (!reg_grapheme_cluster) { + const OnigUChar source_ascii[] = "\\X"; + OnigErrorInfo einfo; + const OnigUChar *source = source_ascii; + size_t source_len = sizeof(source_ascii) - 1; + switch (encidx) { +#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x) +#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8) +#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x) +#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16) +#define CASE_UTF(e) \ + case ENCINDEX_UTF_##e: { \ + static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \ + source = source_UTF_##e; \ + source_len = sizeof(source_UTF_##e); \ + break; \ + } + CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE); +#undef CASE_UTF +#undef CHARS_16BE +#undef CHARS_16LE +#undef CHARS_32BE +#undef CHARS_32LE + } + int r = onig_new(®_grapheme_cluster, source, source + source_len, + 
ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo); + if (r) { + UChar message[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str(message, r, &einfo); + rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message); + } + if (encidx == rb_utf8_encindex()) { + reg_grapheme_cluster_utf8 = reg_grapheme_cluster; + } + } + return reg_grapheme_cluster; +} + +static VALUE +rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj) +{ + size_t grapheme_cluster_count = 0; + regex_t *reg_grapheme_cluster = NULL; + rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str)); const char *ptr, *end; - rb_encoding *enc; - if (single_byte_optimizable(str)) return rb_str_each_byte(str); - RETURN_ENUMERATOR(str, 0, 0); - str = rb_str_new4(str); + if (!rb_enc_unicode_p(enc)) { + return rb_str_length(str); + } + + reg_grapheme_cluster = get_reg_grapheme_cluster(enc); ptr = RSTRING_PTR(str); end = RSTRING_END(str); - enc = STR_ENC_GET(str); + while (ptr < end) { - c = rb_enc_codepoint_len(ptr, end, &n, enc); - rb_yield(UINT2NUM(c)); - ptr += n; + OnigPosition len = onig_match(reg_grapheme_cluster, + (const OnigUChar *)ptr, (const OnigUChar *)end, + (const OnigUChar *)ptr, NULL, 0); + if (len <= 0) break; + grapheme_cluster_count++; + ptr += len; + } + + return SIZET2NUM(grapheme_cluster_count); +} + +static VALUE +rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary) +{ + VALUE orig = str; + regex_t *reg_grapheme_cluster = NULL; + rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str)); + const char *ptr0, *ptr, *end; + + if (!rb_enc_unicode_p(enc)) { + return rb_str_enumerate_chars(str, ary); + } + + if (!ary) str = rb_str_new_frozen(str); + reg_grapheme_cluster = get_reg_grapheme_cluster(enc); + ptr0 = ptr = RSTRING_PTR(str); + end = RSTRING_END(str); + + while (ptr < end) { + OnigPosition len = onig_match(reg_grapheme_cluster, + (const OnigUChar *)ptr, (const OnigUChar *)end, + (const OnigUChar *)ptr, NULL, 0); + if (len <= 0) break; + ENUM_ELEM(ary, 
rb_str_subseq(str, ptr-ptr0, len)); + ptr += len; } RB_GC_GUARD(str); - return orig; + if (ary) + return ary; + else + return orig; +} + +/* + * call-seq: + * str.each_grapheme_cluster {|cstr| block } -> str + * str.each_grapheme_cluster -> an_enumerator + * + * Passes each grapheme cluster in <i>str</i> to the given block, or returns + * an enumerator if no block is given. + * Unlike String#each_char, this enumerates by grapheme clusters defined by + * Unicode Standard Annex #29 http://unicode.org/reports/tr29/ + * + * "a\u0300".each_char.to_a.size #=> 2 + * "a\u0300".each_grapheme_cluster.to_a.size #=> 1 + * + */ + +static VALUE +rb_str_each_grapheme_cluster(VALUE str) +{ + RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size); + return rb_str_enumerate_grapheme_clusters(str, 0); +} + +/* + * call-seq: + * str.grapheme_clusters -> an_array + * + * Returns an array of grapheme clusters in <i>str</i>. This is a shorthand + * for <code>str.each_grapheme_cluster.to_a</code>. + * + * If a block is given, which is a deprecated form, works the same as + * <code>each_grapheme_cluster</code>. 
+ */ + +static VALUE +rb_str_grapheme_clusters(VALUE str) +{ + VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str)); + return rb_str_enumerate_grapheme_clusters(str, ary); } static long @@ -6376,7 +8667,7 @@ rb_str_chop_bang(VALUE str) long len; len = chopped_length(str); STR_SET_LEN(str, len); - RSTRING_PTR(str)[len] = '\0'; + TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str)); if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) { ENC_CODERANGE_CLEAR(str); } @@ -6406,119 +8697,160 @@ rb_str_chop_bang(VALUE str) static VALUE rb_str_chop(VALUE str) { - VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str)); - rb_enc_cr_str_copy_for_substr(str2, str); - OBJ_INFECT(str2, str); - return str2; + return rb_str_subseq(str, 0, chopped_length(str)); } -/* - * call-seq: - * str.chomp!(separator=$/) -> str or nil - * - * Modifies <i>str</i> in place as described for <code>String#chomp</code>, - * returning <i>str</i>, or <code>nil</code> if no modifications were made. - */ - -static VALUE -rb_str_chomp_bang(int argc, VALUE *argv, VALUE str) +static long +chompped_length(VALUE str, VALUE rs) { rb_encoding *enc; - VALUE rs; int newline; - char *p, *pp, *e; - long len, rslen; + char *pp, *e, *rsptr; + long rslen; + char *const p = RSTRING_PTR(str); + long len = RSTRING_LEN(str); - str_modify_keep_cr(str); - len = RSTRING_LEN(str); - if (len == 0) return Qnil; - p = RSTRING_PTR(str); + if (len == 0) return 0; e = p + len; - if (argc == 0) { - rs = rb_rs; - if (rs == rb_default_rs) { - smart_chomp: - enc = rb_enc_get(str); - if (rb_enc_mbminlen(enc) > 1) { - pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc); - if (rb_enc_is_newline(pp, e, enc)) { + if (rs == rb_default_rs) { + smart_chomp: + enc = rb_enc_get(str); + if (rb_enc_mbminlen(enc) > 1) { + pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc); + if (rb_enc_is_newline(pp, e, enc)) { + e = pp; + } + pp = e - rb_enc_mbminlen(enc); + if (pp >= p) { + pp = rb_enc_left_char_head(p, pp, e, 
enc); + if (rb_enc_ascget(pp, e, 0, enc) == '\r') { e = pp; } - pp = e - rb_enc_mbminlen(enc); + } + } + else { + switch (*(e-1)) { /* not e[-1] to get rid of VC bug */ + case '\n': + if (--e > p && *(e-1) == '\r') { + --e; + } + break; + case '\r': + --e; + break; + } + } + return e - p; + } + + enc = rb_enc_get(str); + RSTRING_GETMEM(rs, rsptr, rslen); + if (rslen == 0) { + if (rb_enc_mbminlen(enc) > 1) { + while (e > p) { + pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc); + if (!rb_enc_is_newline(pp, e, enc)) break; + e = pp; + pp -= rb_enc_mbminlen(enc); if (pp >= p) { pp = rb_enc_left_char_head(p, pp, e, enc); if (rb_enc_ascget(pp, e, 0, enc) == '\r') { e = pp; } } - if (e == RSTRING_END(str)) { - return Qnil; - } - len = e - RSTRING_PTR(str); - STR_SET_LEN(str, len); } - else { - if (RSTRING_PTR(str)[len-1] == '\n') { - STR_DEC_LEN(str); - if (RSTRING_LEN(str) > 0 && - RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') { - STR_DEC_LEN(str); - } - } - else if (RSTRING_PTR(str)[len-1] == '\r') { - STR_DEC_LEN(str); - } - else { - return Qnil; - } + } + else { + while (e > p && *(e-1) == '\n') { + --e; + if (e > p && *(e-1) == '\r') + --e; } - RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; - return str; } + return e - p; } - else { - rb_scan_args(argc, argv, "01", &rs); - } - if (NIL_P(rs)) return Qnil; - StringValue(rs); - rslen = RSTRING_LEN(rs); - if (rslen == 0) { - while (len>0 && p[len-1] == '\n') { - len--; - if (len>0 && p[len-1] == '\r') - len--; - } - if (len < RSTRING_LEN(str)) { - STR_SET_LEN(str, len); - RSTRING_PTR(str)[len] = '\0'; - return str; + if (rslen > len) return len; + + enc = rb_enc_get(rs); + newline = rsptr[rslen-1]; + if (rslen == rb_enc_mbminlen(enc)) { + if (rslen == 1) { + if (newline == '\n') + goto smart_chomp; + } + else { + if (rb_enc_is_newline(rsptr, rsptr+rslen, enc)) + goto smart_chomp; } - return Qnil; } - if (rslen > len) return Qnil; - newline = RSTRING_PTR(rs)[rslen-1]; - if (rslen == 1 && newline == '\n') - goto 
smart_chomp; enc = rb_enc_check(str, rs); if (is_broken_string(rs)) { - return Qnil; + return len; } pp = e - rslen; if (p[len-1] == newline && (rslen <= 1 || - memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) { - if (rb_enc_left_char_head(p, pp, e, enc) != pp) - return Qnil; - if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) { - ENC_CODERANGE_CLEAR(str); - } - STR_SET_LEN(str, RSTRING_LEN(str) - rslen); - RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; - return str; + memcmp(rsptr, pp, rslen) == 0)) { + if (rb_enc_left_char_head(p, pp, e, enc) == pp) + return len - rslen; + RB_GC_GUARD(rs); + } + return len; +} + +/*! + * Returns the separator for arguments of rb_str_chomp. + * + * @return returns rb_ps ($/) as default, the default value of rb_ps ($/) is "\n". + */ +static VALUE +chomp_rs(int argc, const VALUE *argv) +{ + rb_check_arity(argc, 0, 1); + if (argc > 0) { + VALUE rs = argv[0]; + if (!NIL_P(rs)) StringValue(rs); + return rs; + } + else { + return rb_rs; } - return Qnil; +} + +VALUE +rb_str_chomp_string(VALUE str, VALUE rs) +{ + long olen = RSTRING_LEN(str); + long len = chompped_length(str, rs); + if (len >= olen) return Qnil; + str_modify_keep_cr(str); + STR_SET_LEN(str, len); + TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str)); + if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) { + ENC_CODERANGE_CLEAR(str); + } + return str; +} + +/* + * call-seq: + * str.chomp!(separator=$/) -> str or nil + * + * Modifies <i>str</i> in place as described for <code>String#chomp</code>, + * returning <i>str</i>, or <code>nil</code> if no modifications were made. + */ + +static VALUE +rb_str_chomp_bang(int argc, VALUE *argv, VALUE str) +{ + VALUE rs; + str_modifiable(str); + if (RSTRING_LEN(str) == 0) return Qnil; + rs = chomp_rs(argc, argv); + if (NIL_P(rs)) return Qnil; + return rb_str_chomp_string(str, rs); } @@ -6530,34 +8862,63 @@ rb_str_chomp_bang(int argc, VALUE *argv, VALUE str) * from the end of <i>str</i> (if present). 
If <code>$/</code> has not been * changed from the default Ruby record separator, then <code>chomp</code> also * removes carriage return characters (that is it will remove <code>\n</code>, - * <code>\r</code>, and <code>\r\n</code>). - * - * "hello".chomp #=> "hello" - * "hello\n".chomp #=> "hello" - * "hello\r\n".chomp #=> "hello" - * "hello\n\r".chomp #=> "hello\n" - * "hello\r".chomp #=> "hello" - * "hello \n there".chomp #=> "hello \n there" - * "hello".chomp("llo") #=> "he" + * <code>\r</code>, and <code>\r\n</code>). If <code>$/</code> is an empty string, + * it will remove all trailing newlines from the string. + * + * "hello".chomp #=> "hello" + * "hello\n".chomp #=> "hello" + * "hello\r\n".chomp #=> "hello" + * "hello\n\r".chomp #=> "hello\n" + * "hello\r".chomp #=> "hello" + * "hello \n there".chomp #=> "hello \n there" + * "hello".chomp("llo") #=> "he" + * "hello\r\n\r\n".chomp('') #=> "hello" + * "hello\r\n\r\r\n".chomp('') #=> "hello\r\n\r" */ static VALUE rb_str_chomp(int argc, VALUE *argv, VALUE str) { - str = rb_str_dup(str); - rb_str_chomp_bang(argc, argv, str); - return str; + VALUE rs = chomp_rs(argc, argv); + if (NIL_P(rs)) return rb_str_dup(str); + return rb_str_subseq(str, 0, chompped_length(str, rs)); +} + +static long +lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc) +{ + const char *const start = s; + + if (!s || s >= e) return 0; + + /* remove spaces at head */ + if (single_byte_optimizable(str)) { + while (s < e && ascii_isspace(*s)) s++; + } + else { + while (s < e) { + int n; + unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc); + + if (!rb_isspace(cc)) break; + s += n; + } + } + return s - start; } /* * call-seq: * str.lstrip! -> self or nil * - * Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no - * change was made. See also <code>String#rstrip!</code> and - * <code>String#strip!</code>. + * Removes leading whitespace from the receiver. 
+ * Returns the altered receiver, or +nil+ if no change was made. + * See also String#rstrip! and String#strip!. * - * " hello ".lstrip #=> "hello " + * Refer to String#strip for the definition of whitespace. + * + * " hello ".lstrip! #=> "hello " + * "hello ".lstrip! #=> nil * "hello".lstrip! #=> nil */ @@ -6565,26 +8926,21 @@ static VALUE rb_str_lstrip_bang(VALUE str) { rb_encoding *enc; - char *s, *t, *e; + char *start, *s; + long olen, loffset; str_modify_keep_cr(str); enc = STR_ENC_GET(str); - s = RSTRING_PTR(str); - if (!s || RSTRING_LEN(str) == 0) return Qnil; - e = t = RSTRING_END(str); - /* remove spaces at head */ - while (s < e) { - int n; - unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc); - - if (!rb_isspace(cc)) break; - s += n; - } - - if (s > RSTRING_PTR(str)) { - STR_SET_LEN(str, t-s); - memmove(RSTRING_PTR(str), s, RSTRING_LEN(str)); - RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; + RSTRING_GETMEM(str, start, olen); + loffset = lstrip_offset(str, start, start+olen, enc); + if (loffset > 0) { + long len = olen-loffset; + s = start + loffset; + memmove(start, s, len); + STR_SET_LEN(str, len); +#if !SHARABLE_MIDDLE_SUBSTRING + TERM_FILL(start+len, rb_enc_mbminlen(enc)); +#endif return str; } return Qnil; @@ -6595,8 +8951,10 @@ rb_str_lstrip_bang(VALUE str) * call-seq: * str.lstrip -> new_str * - * Returns a copy of <i>str</i> with leading whitespace removed. See also - * <code>String#rstrip</code> and <code>String#strip</code>. + * Returns a copy of the receiver with leading whitespace removed. + * See also String#rstrip and String#strip. + * + * Refer to String#strip for the definition of whitespace. 
* * " hello ".lstrip #=> "hello " * "hello".lstrip #=> "hello" @@ -6605,36 +8963,22 @@ rb_str_lstrip_bang(VALUE str) static VALUE rb_str_lstrip(VALUE str) { - str = rb_str_dup(str); - rb_str_lstrip_bang(str); - return str; + char *start; + long len, loffset; + RSTRING_GETMEM(str, start, len); + loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str)); + if (loffset <= 0) return rb_str_dup(str); + return rb_str_subseq(str, loffset, len - loffset); } - -/* - * call-seq: - * str.rstrip! -> self or nil - * - * Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if - * no change was made. See also <code>String#lstrip!</code> and - * <code>String#strip!</code>. - * - * " hello ".rstrip #=> " hello" - * "hello".rstrip! #=> nil - */ - -static VALUE -rb_str_rstrip_bang(VALUE str) +static long +rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc) { - rb_encoding *enc; - char *s, *t, *e; + const char *t; - str_modify_keep_cr(str); - enc = STR_ENC_GET(str); rb_str_check_dummy_enc(enc); - s = RSTRING_PTR(str); - if (!s || RSTRING_LEN(str) == 0) return Qnil; - t = e = RSTRING_END(str); + if (!s || s >= e) return 0; + t = e; /* remove trailing spaces or '\0's */ if (single_byte_optimizable(str)) { @@ -6650,11 +8994,42 @@ rb_str_rstrip_bang(VALUE str) t = tp; } } - if (t < e) { - long len = t-RSTRING_PTR(str); + return e - t; +} + +/* + * call-seq: + * str.rstrip! -> self or nil + * + * Removes trailing whitespace from the receiver. + * Returns the altered receiver, or +nil+ if no change was made. + * See also String#lstrip! and String#strip!. + * + * Refer to String#strip for the definition of whitespace. + * + * " hello ".rstrip! #=> " hello" + * " hello".rstrip! #=> nil + * "hello".rstrip! 
#=> nil + */ + +static VALUE +rb_str_rstrip_bang(VALUE str) +{ + rb_encoding *enc; + char *start; + long olen, roffset; + + str_modify_keep_cr(str); + enc = STR_ENC_GET(str); + RSTRING_GETMEM(str, start, olen); + roffset = rstrip_offset(str, start, start+olen, enc); + if (roffset > 0) { + long len = olen - roffset; STR_SET_LEN(str, len); - RSTRING_PTR(str)[len] = '\0'; +#if !SHARABLE_MIDDLE_SUBSTRING + TERM_FILL(start+len, rb_enc_mbminlen(enc)); +#endif return str; } return Qnil; @@ -6665,8 +9040,10 @@ rb_str_rstrip_bang(VALUE str) * call-seq: * str.rstrip -> new_str * - * Returns a copy of <i>str</i> with trailing whitespace removed. See also - * <code>String#lstrip</code> and <code>String#strip</code>. + * Returns a copy of the receiver with trailing whitespace removed. + * See also String#lstrip and String#strip. + * + * Refer to String#strip for the definition of whitespace. * * " hello ".rstrip #=> " hello" * "hello".rstrip #=> "hello" @@ -6675,28 +9052,58 @@ rb_str_rstrip_bang(VALUE str) static VALUE rb_str_rstrip(VALUE str) { - str = rb_str_dup(str); - rb_str_rstrip_bang(str); - return str; + rb_encoding *enc; + char *start; + long olen, roffset; + + enc = STR_ENC_GET(str); + RSTRING_GETMEM(str, start, olen); + roffset = rstrip_offset(str, start, start+olen, enc); + + if (roffset <= 0) return rb_str_dup(str); + return rb_str_subseq(str, 0, olen-roffset); } /* * call-seq: - * str.strip! -> str or nil + * str.strip! -> self or nil + * + * Removes leading and trailing whitespace from the receiver. + * Returns the altered receiver, or +nil+ if there was no change. * - * Removes leading and trailing whitespace from <i>str</i>. Returns - * <code>nil</code> if <i>str</i> was not altered. + * Refer to String#strip for the definition of whitespace. + * + * " hello ".strip! #=> "hello" + * "hello".strip! 
#=> nil */ static VALUE rb_str_strip_bang(VALUE str) { - VALUE l = rb_str_lstrip_bang(str); - VALUE r = rb_str_rstrip_bang(str); + char *start; + long olen, loffset, roffset; + rb_encoding *enc; - if (NIL_P(l) && NIL_P(r)) return Qnil; - return str; + str_modify_keep_cr(str); + enc = STR_ENC_GET(str); + RSTRING_GETMEM(str, start, olen); + loffset = lstrip_offset(str, start, start+olen, enc); + roffset = rstrip_offset(str, start+loffset, start+olen, enc); + + if (loffset > 0 || roffset > 0) { + long len = olen-roffset; + if (loffset > 0) { + len -= loffset; + memmove(start, start + loffset, len); + } + STR_SET_LEN(str, len); +#if !SHARABLE_MIDDLE_SUBSTRING + TERM_FILL(start+len, rb_enc_mbminlen(enc)); +#endif + return str; + } + return Qnil; } @@ -6704,50 +9111,77 @@ rb_str_strip_bang(VALUE str) * call-seq: * str.strip -> new_str * - * Returns a copy of <i>str</i> with leading and trailing whitespace removed. + * Returns a copy of the receiver with leading and trailing whitespace removed. + * + * Whitespace is defined as any of the following characters: + * null, horizontal tab, line feed, vertical tab, form feed, carriage return, space. 
* * " hello ".strip #=> "hello" * "\tgoodbye\r\n".strip #=> "goodbye" + * "\x00\t\n\v\f\r ".strip #=> "" + * "hello".strip #=> "hello" */ static VALUE rb_str_strip(VALUE str) { - str = rb_str_dup(str); - rb_str_strip_bang(str); - return str; + char *start; + long olen, loffset, roffset; + rb_encoding *enc = STR_ENC_GET(str); + + RSTRING_GETMEM(str, start, olen); + loffset = lstrip_offset(str, start, start+olen, enc); + roffset = rstrip_offset(str, start+loffset, start+olen, enc); + + if (loffset <= 0 && roffset <= 0) return rb_str_dup(str); + return rb_str_subseq(str, loffset, olen-loffset-roffset); } static VALUE -scan_once(VALUE str, VALUE pat, long *start) +scan_once(VALUE str, VALUE pat, long *start, int set_backref_str) { VALUE result, match; struct re_registers *regs; int i; - - if (rb_reg_search(pat, str, *start, 0) >= 0) { - match = rb_backref_get(); - regs = RMATCH_REGS(match); - if (BEG(0) == END(0)) { + long end, pos = rb_pat_search(pat, str, *start, set_backref_str); + if (pos >= 0) { + if (BUILTIN_TYPE(pat) == T_STRING) { + regs = NULL; + end = pos + RSTRING_LEN(pat); + } + else { + match = rb_backref_get(); + regs = RMATCH_REGS(match); + pos = BEG(0); + end = END(0); + } + if (pos == end) { rb_encoding *enc = STR_ENC_GET(str); /* * Always consume at least one character of the input string */ - if (RSTRING_LEN(str) > END(0)) - *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0), - RSTRING_END(str), enc); + if (RSTRING_LEN(str) > end) + *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end, + RSTRING_END(str), enc); else - *start = END(0)+1; + *start = end + 1; } else { - *start = END(0); + *start = end; } - if (regs->num_regs == 1) { - return rb_reg_nth_match(0, match); + if (!regs || regs->num_regs == 1) { + result = rb_str_subseq(str, pos, end - pos); + OBJ_INFECT(result, pat); + return result; } result = rb_ary_new2(regs->num_regs); for (i=1; i < regs->num_regs; i++) { - rb_ary_push(result, rb_reg_nth_match(i, match)); + VALUE s = Qnil; + 
if (BEG(i) >= 0) { + s = rb_str_subseq(str, BEG(i), END(i)-BEG(i)); + OBJ_INFECT(s, pat); + } + rb_ary_push(result, s); } return result; @@ -6795,26 +9229,28 @@ rb_str_scan(VALUE str, VALUE pat) long last = -1, prev = 0; char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str); - pat = get_pat(pat, 1); + pat = get_pat_quoted(pat, 1); + mustnot_broken(str); if (!rb_block_given_p()) { VALUE ary = rb_ary_new(); - while (!NIL_P(result = scan_once(str, pat, &start))) { + while (!NIL_P(result = scan_once(str, pat, &start, 0))) { last = prev; prev = start; rb_ary_push(ary, result); } - if (last >= 0) rb_reg_search(pat, str, last, 0); + if (last >= 0) rb_pat_search(pat, str, last, 1); + else rb_backref_set(Qnil); return ary; } - while (!NIL_P(result = scan_once(str, pat, &start))) { + while (!NIL_P(result = scan_once(str, pat, &start, 1))) { last = prev; prev = start; rb_yield(result); str_mod_check(str, p, len); } - if (last >= 0) rb_reg_search(pat, str, last, 0); + if (last >= 0) rb_pat_search(pat, str, last, 1); return str; } @@ -6852,6 +9288,9 @@ rb_str_hex(VALUE str) * "-377".oct #=> -255 * "bad".oct #=> 0 * "0377bad".oct #=> 255 + * + * If +str+ starts with <code>0</code>, radix indicators are honored. + * See Kernel#Integer. */ static VALUE @@ -6865,36 +9304,91 @@ rb_str_oct(VALUE str) * call-seq: * str.crypt(salt_str) -> new_str * - * Applies a one-way cryptographic hash to <i>str</i> by invoking the - * standard library function <code>crypt(3)</code> with the given - * salt string. While the format and the result are system and - * implementation dependent, using a salt matching the regular - * expression <code>\A[a-zA-Z0-9./]{2}</code> should be valid and - * safe on any platform, in which only the first two characters are - * significant. - * - * This method is for use in system specific scripts, so if you want - * a cross-platform hash function consider using Digest or OpenSSL - * instead. 
+ * Returns the string generated by calling <code>crypt(3)</code> + * standard library function with <code>str</code> and + * <code>salt_str</code>, in this order, as its arguments. Please do + * not use this method any longer. It is legacy; provided only for + * backward compatibility with ruby scripts in earlier days. It is + * bad to use in contemporary programs for several reasons: + * + * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is + * run. The generated string lacks data portability. + * + * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails + * (i.e. silently ends up in unexpected results). + * + * * On some OSes such as Mac OS, <code>crypt(3)</code> is not + * thread safe. + * + * * So-called "traditional" usage of <code>crypt(3)</code> is very + * very very weak. According to its manpage, Linux's traditional + * <code>crypt(3)</code> output has only 2**56 variations; too + * easy to brute force today. And this is the default behaviour. + * + * * In order to make things robust some OSes implement so-called + * "modular" usage. To go through, you have to do a complex + * build-up of the <code>salt_str</code> parameter, by hand. + * Failure in generation of a proper salt string tends not to + * yield any errors; typos in parameters are normally not + * detectable. + * + * * For instance, in the following example, the second invocation + * of <code>String#crypt</code> is wrong; it has a typo in + * "round=" (lacks "s"). However the call does not fail and + * something unexpected is generated. + * + * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage + * "foo".crypt("$5$round=1000$salt$") # Typo not detected + * + * * Even in the "modular" mode, some hash functions are considered + * archaic and no longer recommended at all; for instance module + * <code>$1$</code> is officially abandoned by its author: see + * http://phk.freebsd.dk/sagas/md5crypt_eol.html . 
For another + * instance module <code>$3$</code> is considered completely + * broken: see the manpage of FreeBSD. + * + * * On some OS such as Mac OS, there is no modular mode. Yet, as + * written above, <code>crypt(3)</code> on Mac OS never fails. + * This means even if you build up a proper salt string it + * generates a traditional DES hash anyways, and there is no way + * for you to be aware of. + * + * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6." + * + * If for some reason you cannot migrate to other secure contemporary + * password hashing algorithms, install the string-crypt gem and + * <code>require 'string/crypt'</code> to continue using it. */ static VALUE rb_str_crypt(VALUE str, VALUE salt) { +#ifdef HAVE_CRYPT_R + VALUE databuf; + struct crypt_data *data; +# define CRYPT_END() ALLOCV_END(databuf) +#else extern char *crypt(const char *, const char *); +# define CRYPT_END() (void)0 +#endif VALUE result; const char *s, *saltp; + char *res; #ifdef BROKEN_CRYPT char salt_8bit_clean[3]; #endif StringValue(salt); - if (RSTRING_LEN(salt) < 2) + mustnot_wchar(str); + mustnot_wchar(salt); + if (RSTRING_LEN(salt) < 2) { + short_salt: rb_raise(rb_eArgError, "salt too short (need >=2 bytes)"); + } - s = RSTRING_PTR(str); - if (!s) s = ""; + s = StringValueCStr(str); saltp = RSTRING_PTR(salt); + if (!saltp[0] || !saltp[1]) goto short_salt; #ifdef BROKEN_CRYPT if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) { salt_8bit_clean[0] = saltp[0] & 0x7f; @@ -6903,49 +9397,32 @@ rb_str_crypt(VALUE str, VALUE salt) saltp = salt_8bit_clean; } #endif - result = rb_str_new2(crypt(s, saltp)); - OBJ_INFECT(result, str); - OBJ_INFECT(result, salt); +#ifdef HAVE_CRYPT_R + data = ALLOCV(databuf, sizeof(struct crypt_data)); +# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED + data->initialized = 0; +# endif + res = crypt_r(s, saltp, data); +#else + res = crypt(s, saltp); +#endif + if (!res) { + int err = errno; + CRYPT_END(); + rb_syserr_fail(err, 
"crypt"); + } + result = rb_str_new_cstr(res); + CRYPT_END(); + FL_SET_RAW(result, OBJ_TAINTED_RAW(str) | OBJ_TAINTED_RAW(salt)); return result; } /* * call-seq: - * str.intern -> symbol - * str.to_sym -> symbol - * - * Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the - * symbol if it did not previously exist. See <code>Symbol#id2name</code>. - * - * "Koala".intern #=> :Koala - * s = 'cat'.to_sym #=> :cat - * s == :cat #=> true - * s = '@cat'.to_sym #=> :@cat - * s == :@cat #=> true - * - * This can also be used to create symbols that cannot be represented using the - * <code>:xxx</code> notation. - * - * 'cat and dog'.to_sym #=> :"cat and dog" - */ - -VALUE -rb_str_intern(VALUE s) -{ - VALUE str = RB_GC_GUARD(s); - ID id; - - id = rb_intern_str(str); - return ID2SYM(id); -} - - -/* - * call-seq: * str.ord -> integer * - * Return the <code>Integer</code> ordinal of a one-character string. + * Returns the <code>Integer</code> ordinal of a one-character string. * * "a".ord #=> 97 */ @@ -6963,8 +9440,8 @@ rb_str_ord(VALUE s) * str.sum(n=16) -> integer * * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>, - * where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting - * to 16. The result is simply the sum of the binary value of each character in + * where <em>n</em> is the optional <code>Integer</code> parameter, defaulting + * to 16. The result is simply the sum of the binary value of each byte in * <i>str</i> modulo <code>2**n - 1</code>. This is not a particularly good * checksum. 
*/ @@ -6972,19 +9449,14 @@ rb_str_ord(VALUE s) static VALUE rb_str_sum(int argc, VALUE *argv, VALUE str) { - VALUE vbits; - int bits; + int bits = 16; char *ptr, *p, *pend; long len; VALUE sum = INT2FIX(0); unsigned long sum0 = 0; - if (argc == 0) { - bits = 16; - } - else { - rb_scan_args(argc, argv, "01", &vbits); - bits = NUM2INT(vbits); + if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) { + bits = 0; } ptr = p = RSTRING_PTR(str); len = RSTRING_LEN(str); @@ -7019,7 +9491,7 @@ rb_str_sum(int argc, VALUE *argv, VALUE str) sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0)); } - mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits)); + mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits)); mod = rb_funcall(mod, '-', 1, INT2FIX(1)); sum = rb_funcall(sum, '&', 1, mod); } @@ -7037,24 +9509,26 @@ rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag) char *p; const char *f = " "; long n, size, llen, rlen, llen2 = 0, rlen2 = 0; - volatile VALUE pad; + VALUE pad; int singlebyte = 1, cr; + int termlen; rb_scan_args(argc, argv, "11", &w, &pad); enc = STR_ENC_GET(str); + termlen = rb_enc_mbminlen(enc); width = NUM2LONG(w); if (argc == 2) { StringValue(pad); enc = rb_enc_check(str, pad); f = RSTRING_PTR(pad); flen = RSTRING_LEN(pad); - fclen = str_strlen(pad, enc); + fclen = str_strlen(pad, enc); /* rb_enc_check */ singlebyte = single_byte_optimizable(pad); if (flen == 0 || fclen == 0) { rb_raise(rb_eArgError, "zero width padding"); } } - len = str_strlen(str, enc); + len = str_strlen(str, enc); /* rb_enc_check */ if (width < 0 || len >= width) return rb_str_dup(str); n = width - len; llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? 
n : n/2); @@ -7071,7 +9545,7 @@ rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag) rb_raise(rb_eArgError, "argument too big"); } len += size; - res = rb_str_new5(str, 0, len); + res = str_new0(rb_obj_class(str), 0, len, termlen); p = RSTRING_PTR(res); if (flen <= 1) { memset(p, *f, llen); @@ -7105,15 +9579,17 @@ rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag) p += rlen2; } } - *p = '\0'; + TERM_FILL(p, termlen); STR_SET_LEN(res, p-RSTRING_PTR(res)); - OBJ_INFECT(res, str); - if (!NIL_P(pad)) OBJ_INFECT(res, pad); + OBJ_INFECT_RAW(res, str); + if (!NIL_P(pad)) OBJ_INFECT_RAW(res, pad); rb_enc_associate(res, enc); if (argc == 2) cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad)); if (cr != ENC_CODERANGE_BROKEN) ENC_CODERANGE_SET(res, cr); + + RB_GC_GUARD(pad); return res; } @@ -7196,31 +9672,21 @@ static VALUE rb_str_partition(VALUE str, VALUE sep) { long pos; - int regex = FALSE; + sep = get_pat_quoted(sep, 0); if (RB_TYPE_P(sep, T_REGEXP)) { pos = rb_reg_search(sep, str, 0, 0); - regex = TRUE; - } - else { - VALUE tmp; - - tmp = rb_check_string_type(sep); - if (NIL_P(tmp)) { - rb_raise(rb_eTypeError, "type mismatch: %s given", - rb_obj_classname(sep)); + if (pos < 0) { + failed: + return rb_ary_new3(3, rb_str_dup(str), str_new_empty(str), str_new_empty(str)); } - sep = tmp; - pos = rb_str_index(str, sep, 0); - } - if (pos < 0) { - failed: - return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str)); - } - if (regex) { sep = rb_str_subpat(str, sep, INT2FIX(0)); if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed; } + else { + pos = rb_str_index(str, sep, 0); + if (pos < 0) goto failed; + } return rb_ary_new3(3, rb_str_subseq(str, 0, pos), sep, rb_str_subseq(str, pos+RSTRING_LEN(sep), @@ -7265,14 +9731,18 @@ rb_str_rpartition(VALUE str, VALUE sep) pos = rb_str_rindex(str, sep, pos); } if (pos < 0) { - return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str); + return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), 
rb_str_dup(str)); } if (regex) { sep = rb_reg_nth_match(0, rb_backref_get()); } - return rb_ary_new3(3, rb_str_substr(str, 0, pos), + else { + pos = rb_str_offset(str, pos); + } + return rb_ary_new3(3, rb_str_subseq(str, 0, pos), sep, - rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str))); + rb_str_subseq(str, pos+RSTRING_LEN(sep), + RSTRING_LEN(str)-pos-RSTRING_LEN(sep))); } /* @@ -7280,8 +9750,10 @@ rb_str_rpartition(VALUE str, VALUE sep) * str.start_with?([prefixes]+) -> true or false * * Returns true if +str+ starts with one of the +prefixes+ given. + * Each of the +prefixes+ should be a String or a Regexp. * * "hello".start_with?("hell") #=> true + * "hello".start_with?(/H/i) #=> true * * # returns true if one of the prefixes matches. * "hello".start_with?("heaven", "hell") #=> true @@ -7295,11 +9767,17 @@ rb_str_start_with(int argc, VALUE *argv, VALUE str) for (i=0; i<argc; i++) { VALUE tmp = argv[i]; - StringValue(tmp); - rb_enc_check(str, tmp); - if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue; - if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0) - return Qtrue; + if (RB_TYPE_P(tmp, T_REGEXP)) { + if (rb_reg_start_with_p(tmp, str)) + return Qtrue; + } + else { + StringValue(tmp); + rb_enc_check(str, tmp); + if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue; + if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0) + return Qtrue; + } } return Qfalse; } @@ -7309,6 +9787,12 @@ rb_str_start_with(int argc, VALUE *argv, VALUE str) * str.end_with?([suffixes]+) -> true or false * * Returns true if +str+ ends with one of the +suffixes+ given. + * + * "hello".end_with?("ello") #=> true + * + * # returns true if one of the +suffixes+ matches. + * "hello".end_with?("heaven", "ello") #=> true + * "hello".end_with?("heaven", "paradise") #=> false */ static VALUE @@ -7334,11 +9818,184 @@ rb_str_end_with(int argc, VALUE *argv, VALUE str) return Qfalse; } +/*! 
+ * Returns the length of the <i>prefix</i> to be deleted in the given <i>str</i>, + * returning 0 if <i>str</i> does not start with the <i>prefix</i>. + * + * @param str the target + * @param prefix the prefix + * @retval 0 if the given <i>str</i> does not start with the given <i>prefix</i> + * @retval Positive-Integer otherwise + */ +static long +deleted_prefix_length(VALUE str, VALUE prefix) +{ + char *strptr, *prefixptr; + long olen, prefixlen; + + StringValue(prefix); + if (is_broken_string(prefix)) return 0; + rb_enc_check(str, prefix); + + /* return 0 if not start with prefix */ + prefixlen = RSTRING_LEN(prefix); + if (prefixlen <= 0) return 0; + olen = RSTRING_LEN(str); + if (olen < prefixlen) return 0; + strptr = RSTRING_PTR(str); + prefixptr = RSTRING_PTR(prefix); + if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0; + + return prefixlen; +} + +/* + * call-seq: + * str.delete_prefix!(prefix) -> self or nil + * + * Deletes leading <code>prefix</code> from <i>str</i>, returning + * <code>nil</code> if no change was made. + * + * "hello".delete_prefix!("hel") #=> "lo" + * "hello".delete_prefix!("llo") #=> nil + */ + +static VALUE +rb_str_delete_prefix_bang(VALUE str, VALUE prefix) +{ + long prefixlen; + str_modify_keep_cr(str); + + prefixlen = deleted_prefix_length(str, prefix); + if (prefixlen <= 0) return Qnil; + + return rb_str_drop_bytes(str, prefixlen); +} + +/* + * call-seq: + * str.delete_prefix(prefix) -> new_str + * + * Returns a copy of <i>str</i> with leading <code>prefix</code> deleted. + * + * "hello".delete_prefix("hel") #=> "lo" + * "hello".delete_prefix("llo") #=> "hello" + */ + +static VALUE +rb_str_delete_prefix(VALUE str, VALUE prefix) +{ + long prefixlen; + + prefixlen = deleted_prefix_length(str, prefix); + if (prefixlen <= 0) return rb_str_dup(str); + + return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen); +} + +/*! 
+ * Returns the length of the <i>suffix</i> to be deleted in the given <i>str</i>, + * returning 0 if <i>str</i> does not end with the <i>suffix</i>. + * + * @param str the target + * @param suffix the suffix + * @retval 0 if the given <i>str</i> does not end with the given <i>suffix</i> + * @retval Positive-Integer otherwise + */ +static long +deleted_suffix_length(VALUE str, VALUE suffix) +{ + char *strptr, *suffixptr, *s; + long olen, suffixlen; + rb_encoding *enc; + + StringValue(suffix); + if (is_broken_string(suffix)) return 0; + enc = rb_enc_check(str, suffix); + + /* return 0 if not start with suffix */ + suffixlen = RSTRING_LEN(suffix); + if (suffixlen <= 0) return 0; + olen = RSTRING_LEN(str); + if (olen < suffixlen) return 0; + strptr = RSTRING_PTR(str); + suffixptr = RSTRING_PTR(suffix); + s = strptr + olen - suffixlen; + if (memcmp(s, suffixptr, suffixlen) != 0) return 0; + if (rb_enc_left_char_head(strptr, s, strptr + olen, enc) != s) return 0; + + return suffixlen; +} + +/* + * call-seq: + * str.delete_suffix!(suffix) -> self or nil + * + * Deletes trailing <code>suffix</code> from <i>str</i>, returning + * <code>nil</code> if no change was made. + * + * "hello".delete_suffix!("llo") #=> "he" + * "hello".delete_suffix!("hel") #=> nil + */ + +static VALUE +rb_str_delete_suffix_bang(VALUE str, VALUE suffix) +{ + long olen, suffixlen, len; + str_modifiable(str); + + suffixlen = deleted_suffix_length(str, suffix); + if (suffixlen <= 0) return Qnil; + + olen = RSTRING_LEN(str); + str_modify_keep_cr(str); + len = olen - suffixlen; + STR_SET_LEN(str, len); + TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str)); + if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) { + ENC_CODERANGE_CLEAR(str); + } + return str; +} + +/* + * call-seq: + * str.delete_suffix(suffix) -> new_str + * + * Returns a copy of <i>str</i> with trailing <code>suffix</code> deleted. 
+ * + * "hello".delete_suffix("llo") #=> "he" + * "hello".delete_suffix("hel") #=> "hello" + */ + +static VALUE +rb_str_delete_suffix(VALUE str, VALUE suffix) +{ + long suffixlen; + + suffixlen = deleted_suffix_length(str, suffix); + if (suffixlen <= 0) return rb_str_dup(str); + + return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen); +} + void rb_str_setter(VALUE val, ID id, VALUE *var) { if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) { - rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id)); + rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id)); + } + *var = val; +} + +static void +rb_fs_setter(VALUE val, ID id, VALUE *var) +{ + val = rb_fs_check(val); + if (!val) { + rb_raise(rb_eTypeError, + "value of %"PRIsVALUE" must be String or Regexp", + rb_id2str(id)); } *var = val; } @@ -7362,9 +10019,26 @@ rb_str_force_encoding(VALUE str, VALUE enc) /* * call-seq: + * str.b -> str + * + * Returns a copied string whose encoding is ASCII-8BIT. + */ + +static VALUE +rb_str_b(VALUE str) +{ + VALUE str2 = str_alloc(rb_cString); + str_replace_shared_without_enc(str2, str); + OBJ_INFECT_RAW(str2, str); + ENC_CODERANGE_CLEAR(str2); + return str2; +} + +/* + * call-seq: * str.valid_encoding? -> true or false * - * Returns true for a string which encoded correctly. + * Returns true for a string which is encoded correctly. * * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false @@ -7407,7 +10081,7 @@ rb_str_is_ascii_only_p(VALUE str) * \pre _len_ must not be negative. * \post the length of the returned string in characters is less than or equal to _len_. * \post If the length of _str_ is less than or equal _len_, returns _str_ itself. - * \post the encoded of returned string is equal to the encoded of _str_. + * \post the encoding of returned string is equal to the encoding of _str_. * \post the class of returned string is equal to the class of _str_. 
* \note the length is counted in characters. */ @@ -7448,6 +10122,423 @@ rb_str_ellipsize(VALUE str, long len) return ret; } +static VALUE +str_compat_and_valid(VALUE str, rb_encoding *enc) +{ + int cr; + str = StringValue(str); + cr = rb_enc_str_coderange(str); + if (cr == ENC_CODERANGE_BROKEN) { + rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str); + } + else { + rb_encoding *e = STR_ENC_GET(str); + if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) { + rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s", + rb_enc_name(enc), rb_enc_name(e)); + } + } + return str; +} + +static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr); + +/** + * @param str the string to be scrubbed + * @param repl the replacement character + * @return If given string is invalid, returns a new string. Otherwise, returns Qnil. + */ +VALUE +rb_str_scrub(VALUE str, VALUE repl) +{ + rb_encoding *enc = STR_ENC_GET(str); + return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str)); +} + +VALUE +rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl) +{ + int cr = ENC_CODERANGE_UNKNOWN; + if (enc == STR_ENC_GET(str)) { + /* cached coderange makes sense only when enc equals the + * actual encoding of str */ + cr = ENC_CODERANGE(str); + } + return enc_str_scrub(enc, str, repl, cr); +} + +static VALUE +enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr) +{ + int encidx; + VALUE buf = Qnil; + const char *rep; + long replen = -1; + int tainted = 0; + + if (rb_block_given_p()) { + if (!NIL_P(repl)) + rb_raise(rb_eArgError, "both of block and replacement given"); + replen = 0; + } + + if (ENC_CODERANGE_CLEAN_P(cr)) + return Qnil; + + if (!NIL_P(repl)) { + repl = str_compat_and_valid(repl, enc); + tainted = OBJ_TAINTED_RAW(repl); + } + + if (rb_enc_dummy_p(enc)) { + return Qnil; + } + encidx = rb_enc_to_index(enc); + +#define DEFAULT_REPLACE_CHAR(str) do { \ + static const char 
replace[sizeof(str)-1] = str; \ + rep = replace; replen = (int)sizeof(replace); \ + } while (0) + + if (rb_enc_asciicompat(enc)) { + const char *p = RSTRING_PTR(str); + const char *e = RSTRING_END(str); + const char *p1 = p; + int rep7bit_p; + if (!replen) { + rep = NULL; + rep7bit_p = FALSE; + } + else if (!NIL_P(repl)) { + rep = RSTRING_PTR(repl); + replen = RSTRING_LEN(repl); + rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT); + } + else if (encidx == rb_utf8_encindex()) { + DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD"); + rep7bit_p = FALSE; + } + else { + DEFAULT_REPLACE_CHAR("?"); + rep7bit_p = TRUE; + } + cr = ENC_CODERANGE_7BIT; + + p = search_nonascii(p, e); + if (!p) { + p = e; + } + while (p < e) { + int ret = rb_enc_precise_mbclen(p, e, enc); + if (MBCLEN_NEEDMORE_P(ret)) { + break; + } + else if (MBCLEN_CHARFOUND_P(ret)) { + cr = ENC_CODERANGE_VALID; + p += MBCLEN_CHARFOUND_LEN(ret); + } + else if (MBCLEN_INVALID_P(ret)) { + /* + * p1~p: valid ascii/multibyte chars + * p ~e: invalid bytes + unknown bytes + */ + long clen = rb_enc_mbmaxlen(enc); + if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str)); + if (p > p1) { + rb_str_buf_cat(buf, p1, p - p1); + } + + if (e - p < clen) clen = e - p; + if (clen <= 2) { + clen = 1; + } + else { + const char *q = p; + clen--; + for (; clen > 1; clen--) { + ret = rb_enc_precise_mbclen(q, q + clen, enc); + if (MBCLEN_NEEDMORE_P(ret)) break; + if (MBCLEN_INVALID_P(ret)) continue; + UNREACHABLE; + } + } + if (rep) { + rb_str_buf_cat(buf, rep, replen); + if (!rep7bit_p) cr = ENC_CODERANGE_VALID; + } + else { + repl = rb_yield(rb_enc_str_new(p, clen, enc)); + repl = str_compat_and_valid(repl, enc); + tainted |= OBJ_TAINTED_RAW(repl); + rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl)); + if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID) + cr = ENC_CODERANGE_VALID; + } + p += clen; + p1 = p; + p = search_nonascii(p, e); + if (!p) { + p = e; + break; + } + } + else { + UNREACHABLE; + } + } + if (NIL_P(buf)) { + if (p 
== e) { + ENC_CODERANGE_SET(str, cr); + return Qnil; + } + buf = rb_str_buf_new(RSTRING_LEN(str)); + } + if (p1 < p) { + rb_str_buf_cat(buf, p1, p - p1); + } + if (p < e) { + if (rep) { + rb_str_buf_cat(buf, rep, replen); + if (!rep7bit_p) cr = ENC_CODERANGE_VALID; + } + else { + repl = rb_yield(rb_enc_str_new(p, e-p, enc)); + repl = str_compat_and_valid(repl, enc); + tainted |= OBJ_TAINTED_RAW(repl); + rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl)); + if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID) + cr = ENC_CODERANGE_VALID; + } + } + } + else { + /* ASCII incompatible */ + const char *p = RSTRING_PTR(str); + const char *e = RSTRING_END(str); + const char *p1 = p; + long mbminlen = rb_enc_mbminlen(enc); + if (!replen) { + rep = NULL; + } + else if (!NIL_P(repl)) { + rep = RSTRING_PTR(repl); + replen = RSTRING_LEN(repl); + } + else if (encidx == ENCINDEX_UTF_16BE) { + DEFAULT_REPLACE_CHAR("\xFF\xFD"); + } + else if (encidx == ENCINDEX_UTF_16LE) { + DEFAULT_REPLACE_CHAR("\xFD\xFF"); + } + else if (encidx == ENCINDEX_UTF_32BE) { + DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD"); + } + else if (encidx == ENCINDEX_UTF_32LE) { + DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00"); + } + else { + DEFAULT_REPLACE_CHAR("?"); + } + + while (p < e) { + int ret = rb_enc_precise_mbclen(p, e, enc); + if (MBCLEN_NEEDMORE_P(ret)) { + break; + } + else if (MBCLEN_CHARFOUND_P(ret)) { + p += MBCLEN_CHARFOUND_LEN(ret); + } + else if (MBCLEN_INVALID_P(ret)) { + const char *q = p; + long clen = rb_enc_mbmaxlen(enc); + if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str)); + if (p > p1) rb_str_buf_cat(buf, p1, p - p1); + + if (e - p < clen) clen = e - p; + if (clen <= mbminlen * 2) { + clen = mbminlen; + } + else { + clen -= mbminlen; + for (; clen > mbminlen; clen-=mbminlen) { + ret = rb_enc_precise_mbclen(q, q + clen, enc); + if (MBCLEN_NEEDMORE_P(ret)) break; + if (MBCLEN_INVALID_P(ret)) continue; + UNREACHABLE; + } + } + if (rep) { + rb_str_buf_cat(buf, rep, replen); + } + else { + repl 
= rb_yield(rb_enc_str_new(p, clen, enc)); + repl = str_compat_and_valid(repl, enc); + tainted |= OBJ_TAINTED_RAW(repl); + rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl)); + } + p += clen; + p1 = p; + } + else { + UNREACHABLE; + } + } + if (NIL_P(buf)) { + if (p == e) { + ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID); + return Qnil; + } + buf = rb_str_buf_new(RSTRING_LEN(str)); + } + if (p1 < p) { + rb_str_buf_cat(buf, p1, p - p1); + } + if (p < e) { + if (rep) { + rb_str_buf_cat(buf, rep, replen); + } + else { + repl = rb_yield(rb_enc_str_new(p, e-p, enc)); + repl = str_compat_and_valid(repl, enc); + tainted |= OBJ_TAINTED_RAW(repl); + rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl)); + } + } + cr = ENC_CODERANGE_VALID; + } + FL_SET_RAW(buf, tainted|OBJ_TAINTED_RAW(str)); + ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr); + return buf; +} + +/* + * call-seq: + * str.scrub -> new_str + * str.scrub(repl) -> new_str + * str.scrub{|bytes|} -> new_str + * + * If the string is invalid byte sequence then replace invalid bytes with given replacement + * character, else returns self. + * If block is given, replace invalid bytes with returned value of the block. + * + * "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD" + * "abc\u3042\x81".scrub("*") #=> "abc\u3042*" + * "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack('H*')[0]+'>' } #=> "abc\u3042<e380>" + */ +static VALUE +str_scrub(int argc, VALUE *argv, VALUE str) +{ + VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil; + VALUE new = rb_str_scrub(str, repl); + return NIL_P(new) ? rb_str_dup(str): new; +} + +/* + * call-seq: + * str.scrub! -> str + * str.scrub!(repl) -> str + * str.scrub!{|bytes|} -> str + * + * If the string is invalid byte sequence then replace invalid bytes with given replacement + * character, else returns self. + * If block is given, replace invalid bytes with returned value of the block. + * + * "abc\u3042\x81".scrub! 
#=> "abc\u3042\uFFFD" + * "abc\u3042\x81".scrub!("*") #=> "abc\u3042*" + * "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack('H*')[0]+'>' } #=> "abc\u3042<e380>" + */ +static VALUE +str_scrub_bang(int argc, VALUE *argv, VALUE str) +{ + VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil; + VALUE new = rb_str_scrub(str, repl); + if (!NIL_P(new)) rb_str_replace(str, new); + return str; +} + +static ID id_normalize; +static ID id_normalized_p; +static VALUE mUnicodeNormalize; + +static VALUE +unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id) +{ + static int UnicodeNormalizeRequired = 0; + VALUE argv2[2]; + + if (!UnicodeNormalizeRequired) { + rb_require("unicode_normalize/normalize.rb"); + UnicodeNormalizeRequired = 1; + } + argv2[0] = str; + if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0]; + return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2); +} + +/* + * call-seq: + * str.unicode_normalize(form=:nfc) + * + * Unicode Normalization---Returns a normalized form of +str+, + * using Unicode normalizations NFC, NFD, NFKC, or NFKD. + * The normalization form used is determined by +form+, which can + * be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+. + * The default is +:nfc+. + * + * If the string is not in a Unicode Encoding, then an Exception is raised. + * In this context, 'Unicode Encoding' means any of UTF-8, UTF-16BE/LE, + * and UTF-32BE/LE, as well as GB18030, UCS_2BE, and UCS_4BE. + * Anything other than UTF-8 is implemented by converting to UTF-8, + * which makes it slower than UTF-8. 
+ * + * "a\u0300".unicode_normalize #=> "\u00E0" + * "a\u0300".unicode_normalize(:nfc) #=> "\u00E0" + * "\u00E0".unicode_normalize(:nfd) #=> "a\u0300" + * "\xE0".force_encoding('ISO-8859-1').unicode_normalize(:nfd) + * #=> Encoding::CompatibilityError raised + */ +static VALUE +rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str) +{ + return unicode_normalize_common(argc, argv, str, id_normalize); +} + +/* + * call-seq: + * str.unicode_normalize!(form=:nfc) + * + * Destructive version of String#unicode_normalize, doing Unicode + * normalization in place. + */ +static VALUE +rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str) +{ + return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize)); +} + +/* call-seq: + * str.unicode_normalized?(form=:nfc) + * + * Checks whether +str+ is in Unicode normalization form +form+, + * which can be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+. + * The default is +:nfc+. + * + * If the string is not in a Unicode Encoding, then an Exception is raised. + * For details, see String#unicode_normalize. + * + * "a\u0300".unicode_normalized? #=> false + * "a\u0300".unicode_normalized?(:nfd) #=> true + * "\u00E0".unicode_normalized? #=> true + * "\u00E0".unicode_normalized?(:nfd) #=> false + * "\xE0".force_encoding('ISO-8859-1').unicode_normalized? + * #=> Encoding::CompatibilityError raised + */ +static VALUE +rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str) +{ + return unicode_normalize_common(argc, argv, str, id_normalized_p); +} + /********************************************************************** * Document-class: Symbol * @@ -7490,21 +10581,18 @@ rb_str_ellipsize(VALUE str, long len) * symbol, returns <code>true</code>. 
*/ -static VALUE -sym_equal(VALUE sym1, VALUE sym2) -{ - if (sym1 == sym2) return Qtrue; - return Qfalse; -} - +#define sym_equal rb_obj_equal static int sym_printable(const char *s, const char *send, rb_encoding *enc) { while (s < send) { int n; - int c = rb_enc_codepoint_len(s, send, &n, enc); + int c = rb_enc_precise_mbclen(s, send, enc); + if (!MBCLEN_CHARFOUND_P(c)) return FALSE; + n = MBCLEN_CHARFOUND_LEN(c); + c = rb_enc_mbc_to_codepoint(s, send, enc); if (!rb_enc_isprint(c, enc)) return FALSE; s += n; } @@ -7524,12 +10612,43 @@ rb_str_symname_p(VALUE sym) ptr = RSTRING_PTR(sym); len = RSTRING_LEN(sym); if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) || - !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) { + !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) { return FALSE; } return TRUE; } +VALUE +rb_str_quote_unprintable(VALUE str) +{ + rb_encoding *enc; + const char *ptr; + long len; + rb_encoding *resenc; + + Check_Type(str, T_STRING); + resenc = rb_default_internal_encoding(); + if (resenc == NULL) resenc = rb_default_external_encoding(); + enc = STR_ENC_GET(str); + ptr = RSTRING_PTR(str); + len = RSTRING_LEN(str); + if ((resenc != enc && !rb_str_is_ascii_only_p(str)) || + !sym_printable(ptr, ptr + len, enc)) { + return rb_str_inspect(str); + } + return str; +} + +MJIT_FUNC_EXPORTED VALUE +rb_id_quote_unprintable(ID id) +{ + VALUE str = rb_id2str(id); + if (!rb_str_symname_p(str)) { + return rb_str_inspect(str); + } + return str; +} + /* * call-seq: * sym.inspect -> string @@ -7542,30 +10661,26 @@ rb_str_symname_p(VALUE sym) static VALUE sym_inspect(VALUE sym) { - VALUE str; + VALUE str = rb_sym2str(sym); const char *ptr; long len; - ID id = SYM2ID(sym); char *dest; - sym = rb_id2str(id); - if (!rb_str_symname_p(sym)) { - str = rb_str_inspect(sym); + if (!rb_str_symname_p(str)) { + str = rb_str_inspect(str); len = RSTRING_LEN(str); rb_str_resize(str, len + 1); dest = 
RSTRING_PTR(str); memmove(dest + 1, dest, len); - dest[0] = ':'; } else { - rb_encoding *enc = STR_ENC_GET(sym); - ptr = RSTRING_PTR(sym); - len = RSTRING_LEN(sym); + rb_encoding *enc = STR_ENC_GET(str); + RSTRING_GETMEM(str, ptr, len); str = rb_enc_str_new(0, len + 1, enc); dest = RSTRING_PTR(str); - dest[0] = ':'; memcpy(dest + 1, ptr, len); } + dest[0] = ':'; return str; } @@ -7578,15 +10693,14 @@ sym_inspect(VALUE sym) * Returns the name or string corresponding to <i>sym</i>. * * :fred.id2name #=> "fred" + * :ginger.to_s #=> "ginger" */ VALUE rb_sym_to_s(VALUE sym) { - ID id = SYM2ID(sym); - - return str_new3(rb_cString, rb_id2str(id)); + return str_new_shared(rb_cString, rb_sym2str(sym)); } @@ -7606,66 +10720,33 @@ sym_to_sym(VALUE sym) return sym; } -static VALUE -sym_call(VALUE args, VALUE p, int argc, VALUE *argv) +MJIT_FUNC_EXPORTED VALUE +rb_sym_proc_call(ID mid, int argc, const VALUE *argv, VALUE passed_proc) { VALUE obj; - NODE *memo = RNODE(p); if (argc < 1) { rb_raise(rb_eArgError, "no receiver given"); } obj = argv[0]; - return rb_funcall_passing_block_with_refinements(obj, (ID) memo->u1.id, - argc - 1, argv + 1, - memo->u2.value); + return rb_funcall_with_block(obj, mid, argc - 1, argv + 1, passed_proc); } +#if 0 /* * call-seq: * sym.to_proc * - * Returns a _Proc_ object which respond to the given method by _sym_. + * Returns a _Proc_ object which responds to the given method by _sym_. 
* * (1..3).collect(&:to_s) #=> ["1", "2", "3"] */ -static VALUE -sym_to_proc(VALUE sym) +VALUE +rb_sym_to_proc(VALUE sym) { - static VALUE sym_proc_cache = Qfalse; - enum {SYM_PROC_CACHE_SIZE = 67}; - VALUE proc; - long id, index; - VALUE *aryp; - const NODE *cref = rb_vm_cref(); - - id = SYM2ID(sym); - if (NIL_P(cref->nd_refinements)) { - if (!sym_proc_cache) { - sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2); - rb_gc_register_mark_object(sym_proc_cache); - rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil); - } - - index = (id % SYM_PROC_CACHE_SIZE) << 1; - aryp = RARRAY_PTR(sym_proc_cache); - if (aryp[index] == sym) { - return aryp[index + 1]; - } - else { - proc = rb_proc_new(sym_call, - (VALUE) NEW_MEMO(id, Qnil, 0)); - aryp[index] = sym; - aryp[index + 1] = proc; - return proc; - } - } - else { - return rb_proc_new(sym_call, - (VALUE) NEW_MEMO(id, cref->nd_refinements, 0)); - } } +#endif /* * call-seq: @@ -7678,15 +10759,21 @@ sym_to_proc(VALUE sym) static VALUE sym_succ(VALUE sym) { - return rb_str_intern(rb_str_succ(rb_sym_to_s(sym))); + return rb_str_intern(rb_str_succ(rb_sym2str(sym))); } /* * call-seq: * - * str <=> other -> -1, 0, +1 or nil + * symbol <=> other_symbol -> -1, 0, +1, or nil * - * Compares _sym_ with _other_ in string form. + * Compares +symbol+ with +other_symbol+ after calling #to_s on each of the + * symbols. Returns -1, 0, +1, or +nil+ depending on whether +symbol+ is + * less than, equal to, or greater than +other_symbol+. + * + * +nil+ is returned if the two values are incomparable. + * + * See String#<=> for more information. 
*/ static VALUE @@ -7695,15 +10782,27 @@ sym_cmp(VALUE sym, VALUE other) if (!SYMBOL_P(other)) { return Qnil; } - return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other)); + return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other)); } /* * call-seq: - * - * sym.casecmp(other) -> -1, 0, +1 or nil + * sym.casecmp(other_symbol) -> -1, 0, +1, or nil * * Case-insensitive version of <code>Symbol#<=></code>. + * Currently, case-insensitivity only works on characters A-Z/a-z, + * not all of Unicode. This is different from Symbol#casecmp?. + * + * :aBcDeF.casecmp(:abcde) #=> 1 + * :aBcDeF.casecmp(:abcdef) #=> 0 + * :aBcDeF.casecmp(:abcdefg) #=> -1 + * :abcdef.casecmp(:ABCDEF) #=> 0 + * + * +nil+ is returned if the two symbols have incompatible encodings, + * or if +other_symbol+ is not a symbol. + * + * :foo.casecmp(2) #=> nil + * "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym.casecmp(:"\u{c4 d6 dc}") #=> nil */ static VALUE @@ -7712,12 +10811,41 @@ sym_casecmp(VALUE sym, VALUE other) if (!SYMBOL_P(other)) { return Qnil; } - return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other)); + return str_casecmp(rb_sym2str(sym), rb_sym2str(other)); } /* * call-seq: - * sym =~ obj -> fixnum or nil + * sym.casecmp?(other_symbol) -> true, false, or nil + * + * Returns +true+ if +sym+ and +other_symbol+ are equal after + * Unicode case folding, +false+ if they are not equal. + * + * :aBcDeF.casecmp?(:abcde) #=> false + * :aBcDeF.casecmp?(:abcdef) #=> true + * :aBcDeF.casecmp?(:abcdefg) #=> false + * :abcdef.casecmp?(:ABCDEF) #=> true + * :"\u{e4 f6 fc}".casecmp?(:"\u{c4 d6 dc}") #=> true + * + * +nil+ is returned if the two symbols have incompatible encodings, + * or if +other_symbol+ is not a symbol. 
+ * + * :foo.casecmp?(2) #=> nil + * "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym.casecmp?(:"\u{c4 d6 dc}") #=> nil + */ + +static VALUE +sym_casecmp_p(VALUE sym, VALUE other) +{ + if (!SYMBOL_P(other)) { + return Qnil; + } + return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other)); +} + +/* + * call-seq: + * sym =~ obj -> integer or nil * * Returns <code>sym.to_s =~ obj</code>. */ @@ -7725,13 +10853,43 @@ sym_casecmp(VALUE sym, VALUE other) static VALUE sym_match(VALUE sym, VALUE other) { - return rb_str_match(rb_sym_to_s(sym), other); + return rb_str_match(rb_sym2str(sym), other); +} + +/* + * call-seq: + * sym.match(pattern) -> matchdata or nil + * sym.match(pattern, pos) -> matchdata or nil + * + * Returns <code>sym.to_s.match</code>. + */ + +static VALUE +sym_match_m(int argc, VALUE *argv, VALUE sym) +{ + return rb_str_match_m(argc, argv, rb_sym2str(sym)); +} + +/* + * call-seq: + * sym.match?(pattern) -> true or false + * sym.match?(pattern, pos) -> true or false + * + * Returns <code>sym.to_s.match?</code>. + */ + +static VALUE +sym_match_m_p(int argc, VALUE *argv, VALUE sym) +{ + return rb_str_match_m_p(argc, argv, sym); } /* * call-seq: * sym[idx] -> char - * sym[b, n] -> char + * sym[b, n] -> string + * sym.slice(idx) -> char + * sym.slice(b, n) -> string * * Returns <code>sym.to_s[]</code>. */ @@ -7739,12 +10897,13 @@ sym_match(VALUE sym, VALUE other) static VALUE sym_aref(int argc, VALUE *argv, VALUE sym) { - return rb_str_aref_m(argc, argv, rb_sym_to_s(sym)); + return rb_str_aref_m(argc, argv, rb_sym2str(sym)); } /* * call-seq: - * sym.length -> integer + * sym.length -> integer + * sym.size -> integer * * Same as <code>sym.to_s.length</code>. */ @@ -7752,72 +10911,76 @@ sym_aref(int argc, VALUE *argv, VALUE sym) static VALUE sym_length(VALUE sym) { - return rb_str_length(rb_id2str(SYM2ID(sym))); + return rb_str_length(rb_sym2str(sym)); } /* * call-seq: * sym.empty? -> true or false * - * Returns that _sym_ is :"" or not. 
+ * Returns whether _sym_ is :"" or not. */ static VALUE sym_empty(VALUE sym) { - return rb_str_empty(rb_id2str(SYM2ID(sym))); + return rb_str_empty(rb_sym2str(sym)); } /* * call-seq: - * sym.upcase -> symbol + * sym.upcase -> symbol + * sym.upcase([options]) -> symbol * * Same as <code>sym.to_s.upcase.intern</code>. */ static VALUE -sym_upcase(VALUE sym) +sym_upcase(int argc, VALUE *argv, VALUE sym) { - return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym)))); + return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym))); } /* * call-seq: - * sym.downcase -> symbol + * sym.downcase -> symbol + * sym.downcase([options]) -> symbol * * Same as <code>sym.to_s.downcase.intern</code>. */ static VALUE -sym_downcase(VALUE sym) +sym_downcase(int argc, VALUE *argv, VALUE sym) { - return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym)))); + return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym))); } /* * call-seq: - * sym.capitalize -> symbol + * sym.capitalize -> symbol + * sym.capitalize([options]) -> symbol * * Same as <code>sym.to_s.capitalize.intern</code>. */ static VALUE -sym_capitalize(VALUE sym) +sym_capitalize(int argc, VALUE *argv, VALUE sym) { - return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym)))); + return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym))); } /* * call-seq: - * sym.swapcase -> symbol + * sym.swapcase -> symbol + * sym.swapcase([options]) -> symbol * * Same as <code>sym.to_s.swapcase.intern</code>. 
*/ static VALUE -sym_swapcase(VALUE sym) +sym_swapcase(int argc, VALUE *argv, VALUE sym) { - return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym)))); + return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym))); } /* @@ -7830,32 +10993,41 @@ sym_swapcase(VALUE sym) static VALUE sym_encoding(VALUE sym) { - return rb_obj_encoding(rb_id2str(SYM2ID(sym))); + return rb_obj_encoding(rb_sym2str(sym)); } -ID -rb_to_id(VALUE name) +static VALUE +string_for_symbol(VALUE name) { - VALUE tmp; - - switch (TYPE(name)) { - default: - tmp = rb_check_string_type(name); + if (!RB_TYPE_P(name, T_STRING)) { + VALUE tmp = rb_check_string_type(name); if (NIL_P(tmp)) { - tmp = rb_inspect(name); - rb_raise(rb_eTypeError, "%s is not a symbol", - RSTRING_PTR(tmp)); + rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol", + name); } name = tmp; - /* fall through */ - case T_STRING: - name = rb_str_intern(name); - /* fall through */ - case T_SYMBOL: + } + return name; +} + +ID +rb_to_id(VALUE name) +{ + if (SYMBOL_P(name)) { return SYM2ID(name); } + name = string_for_symbol(name); + return rb_intern_str(name); +} - UNREACHABLE; +VALUE +rb_to_symbol(VALUE name) +{ + if (SYMBOL_P(name)) { + return name; + } + name = string_for_symbol(name); + return rb_str_intern(name); } /* @@ -7878,8 +11050,10 @@ Init_String(void) #define rb_intern(str) rb_intern_const(str) rb_cString = rb_define_class("String", rb_cObject); + assert(rb_vm_fstring_table()); + st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString); rb_include_module(rb_cString, rb_mComparable); - rb_define_alloc_func(rb_cString, str_alloc); + rb_define_alloc_func(rb_cString, empty_str_alloc); rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1); rb_define_method(rb_cString, "initialize", rb_str_init, -1); rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1); @@ -7889,6 +11063,7 @@ Init_String(void) rb_define_method(rb_cString, "eql?", rb_str_eql, 1); 
rb_define_method(rb_cString, "hash", rb_str_hash_m, 0); rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1); + rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1); rb_define_method(rb_cString, "+", rb_str_plus, 1); rb_define_method(rb_cString, "*", rb_str_times, 1); rb_define_method(rb_cString, "%", rb_str_format_m, 1); @@ -7901,6 +11076,7 @@ Init_String(void) rb_define_method(rb_cString, "empty?", rb_str_empty, 0); rb_define_method(rb_cString, "=~", rb_str_match, 1); rb_define_method(rb_cString, "match", rb_str_match_m, -1); + rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1); rb_define_method(rb_cString, "succ", rb_str_succ, 0); rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0); rb_define_method(rb_cString, "next", rb_str_succ, 0); @@ -7914,6 +11090,11 @@ Init_String(void) rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1); rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2); rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1); + rb_define_method(rb_cString, "scrub", str_scrub, -1); + rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1); + rb_define_method(rb_cString, "freeze", rb_str_freeze, 0); + rb_define_method(rb_cString, "+@", str_uplus, 0); + rb_define_method(rb_cString, "-@", str_uminus, 0); rb_define_method(rb_cString, "to_i", rb_str_to_i, -1); rb_define_method(rb_cString, "to_f", rb_str_to_f, 0); @@ -7921,32 +11102,39 @@ Init_String(void) rb_define_method(rb_cString, "to_str", rb_str_to_s, 0); rb_define_method(rb_cString, "inspect", rb_str_inspect, 0); rb_define_method(rb_cString, "dump", rb_str_dump, 0); + rb_define_method(rb_cString, "undump", str_undump, 0); - rb_define_method(rb_cString, "upcase", rb_str_upcase, 0); - rb_define_method(rb_cString, "downcase", rb_str_downcase, 0); - rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0); - rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0); + sym_ascii = ID2SYM(rb_intern("ascii")); + sym_turkic = 
ID2SYM(rb_intern("turkic")); + sym_lithuanian = ID2SYM(rb_intern("lithuanian")); + sym_fold = ID2SYM(rb_intern("fold")); - rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0); - rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0); - rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0); - rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0); + rb_define_method(rb_cString, "upcase", rb_str_upcase, -1); + rb_define_method(rb_cString, "downcase", rb_str_downcase, -1); + rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1); + rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1); + + rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1); + rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1); + rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1); + rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1); rb_define_method(rb_cString, "hex", rb_str_hex, 0); rb_define_method(rb_cString, "oct", rb_str_oct, 0); rb_define_method(rb_cString, "split", rb_str_split_m, -1); - rb_define_method(rb_cString, "lines", rb_str_each_line, -1); - rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0); - rb_define_method(rb_cString, "chars", rb_str_each_char, 0); - rb_define_method(rb_cString, "codepoints", rb_str_each_codepoint, 0); + rb_define_method(rb_cString, "lines", rb_str_lines, -1); + rb_define_method(rb_cString, "bytes", rb_str_bytes, 0); + rb_define_method(rb_cString, "chars", rb_str_chars, 0); + rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0); + rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0); rb_define_method(rb_cString, "reverse", rb_str_reverse, 0); rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0); - rb_define_method(rb_cString, "concat", rb_str_concat, 1); + rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1); rb_define_method(rb_cString, "<<", rb_str_concat, 
1); - rb_define_method(rb_cString, "prepend", rb_str_prepend, 1); + rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1); rb_define_method(rb_cString, "crypt", rb_str_crypt, 1); - rb_define_method(rb_cString, "intern", rb_str_intern, 0); - rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); + rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */ + rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */ rb_define_method(rb_cString, "ord", rb_str_ord, 0); rb_define_method(rb_cString, "include?", rb_str_include, 1); @@ -7966,6 +11154,8 @@ Init_String(void) rb_define_method(rb_cString, "strip", rb_str_strip, 0); rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0); rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0); + rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1); + rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1); rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1); rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1); @@ -7974,6 +11164,8 @@ Init_String(void) rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0); rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0); rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0); + rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1); + rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1); rb_define_method(rb_cString, "tr", rb_str_tr, 2); rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2); @@ -7990,6 +11182,7 @@ Init_String(void) rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0); rb_define_method(rb_cString, "each_char", rb_str_each_char, 0); rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0); + rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0); rb_define_method(rb_cString, "sum", rb_str_sum, -1); @@ -8001,20 +11194,29 @@ Init_String(void) 
rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */ rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1); + rb_define_method(rb_cString, "b", rb_str_b, 0); rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0); rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0); - id_to_s = rb_intern("to_s"); + /* define UnicodeNormalize module here so that we don't have to look it up */ + mUnicodeNormalize = rb_define_module("UnicodeNormalize"); + id_normalize = rb_intern("normalize"); + id_normalized_p = rb_intern("normalized?"); + + rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1); + rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1); + rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1); rb_fs = Qnil; - rb_define_variable("$;", &rb_fs); - rb_define_variable("$-F", &rb_fs); + rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter); + rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter); + rb_gc_register_address(&rb_fs); rb_cSymbol = rb_define_class("Symbol", rb_cObject); rb_include_module(rb_cSymbol, rb_mComparable); rb_undef_alloc_func(rb_cSymbol); rb_undef_method(CLASS_OF(rb_cSymbol), "new"); - rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */ + rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in symbol.c */ rb_define_method(rb_cSymbol, "==", sym_equal, 1); rb_define_method(rb_cSymbol, "===", sym_equal, 1); @@ -8023,12 +11225,13 @@ Init_String(void) rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0); rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0); rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0); - rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0); + rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); rb_define_method(rb_cSymbol, "succ", sym_succ, 0); 
rb_define_method(rb_cSymbol, "next", sym_succ, 0); rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1); rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1); + rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1); rb_define_method(rb_cSymbol, "=~", sym_match, 1); rb_define_method(rb_cSymbol, "[]", sym_aref, -1); @@ -8036,12 +11239,13 @@ Init_String(void) rb_define_method(rb_cSymbol, "length", sym_length, 0); rb_define_method(rb_cSymbol, "size", sym_length, 0); rb_define_method(rb_cSymbol, "empty?", sym_empty, 0); - rb_define_method(rb_cSymbol, "match", sym_match, 1); + rb_define_method(rb_cSymbol, "match", sym_match_m, -1); + rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1); - rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0); - rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0); - rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0); - rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0); + rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1); + rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1); + rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1); + rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1); rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0); } |
