diff options
Diffstat (limited to 'transcode.c')
-rw-r--r-- | transcode.c | 696 |
1 files changed, 363 insertions, 333 deletions
diff --git a/transcode.c b/transcode.c index bc985992a1..892814ee3b 100644 --- a/transcode.c +++ b/transcode.c @@ -47,6 +47,7 @@ static VALUE sym_xml, sym_text, sym_attr; static VALUE sym_universal_newline; static VALUE sym_crlf_newline; static VALUE sym_cr_newline; +static VALUE sym_lf_newline; #ifdef ENABLE_ECONV_NEWLINE_OPTION static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf; #endif @@ -180,6 +181,28 @@ typedef struct { static st_table *transcoder_table; +static int +free_inner_transcode_i(st_data_t key, st_data_t val, st_data_t arg) +{ + xfree((void *)val); + return ST_DELETE; +} + +static int +free_transcode_i(st_data_t key, st_data_t val, st_data_t arg) +{ + st_foreach((void *)val, free_inner_transcode_i, 0); + st_free_table((void *)val); + return ST_DELETE; +} + +void +rb_free_transcoder_table(void) +{ + st_foreach(transcoder_table, free_transcode_i, 0); + st_free_table(transcoder_table); +} + static transcoder_entry_t * make_transcoder_entry(const char *sname, const char *dname) { @@ -229,8 +252,8 @@ rb_register_transcoder(const rb_transcoder *tr) entry = make_transcoder_entry(sname, dname); if (entry->transcoder) { - rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered", - sname, dname); + rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered", + sname, dname); } entry->transcoder = tr; @@ -251,7 +274,7 @@ void rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib) { if (!lib) { - rb_raise(rb_eArgError, "invalid library name - (null)"); + rb_raise(rb_eArgError, "invalid library name - (null)"); } declare_transcoder(enc1, enc2, lib); } @@ -376,6 +399,8 @@ transcode_search_path(const char *sname, const char *dname, return pathlen; /* is -1 if not found */ } +int rb_require_internal_silent(VALUE fname); + static const rb_transcoder * load_transcoder_entry(transcoder_entry_t *entry) { @@ -393,7 +418,7 @@ load_transcoder_entry(transcoder_entry_t *entry) memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len); rb_str_set_len(fn, total_len); OBJ_FREEZE(fn); - rb_require_string(fn); + rb_require_internal_silent(fn); } if (entry->transcoder) @@ -533,7 +558,7 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, while (1) { inchar_start = in_p; tc->recognized_len = 0; - next_table = tr->conv_tree_start; + next_table = tr->conv_tree_start; SUSPEND_AFTER_OUTPUT(24); @@ -553,7 +578,7 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, #define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE]) #define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))]) - next_byte = (unsigned char)*in_p++; + next_byte = (unsigned char)*in_p++; follow_byte: if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte) next_info = INVALID; @@ -561,8 +586,8 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, next_info = (VALUE)BL_ACTION(next_byte); } follow_info: - switch (next_info & 0x1F) { - case NOMAP: + switch (next_info & 0x1F) { + case NOMAP: { const unsigned char *p = inchar_start; writebuf_off = 0; @@ -577,43 +602,43 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, } } continue; - case 0x00: case 0x04: case 0x08: case 0x0C: - case 0x10: case 0x14: case 0x18: case 0x1C: + case 0x00: case 0x04: case 0x08: case 0x0C: + case 0x10: case 0x14: case 0x18: case 0x1C: SUSPEND_AFTER_OUTPUT(25); - while (in_p >= in_stop) { + while (in_p >= in_stop) { if (!(opt & ECONV_PARTIAL_INPUT)) goto incomplete; SUSPEND(econv_source_buffer_empty, 5); - } - next_byte = (unsigned char)*in_p++; - next_table = (unsigned int)next_info; - goto follow_byte; - case ZERObt: /* drop input */ - continue; - case ONEbt: + } + next_byte = (unsigned char)*in_p++; + next_table = (unsigned int)next_info; + goto follow_byte; + case ZERObt: /* drop input */ + continue; + case ONEbt: SUSPEND_OBUF(9); *out_p++ = getBT1(next_info); - continue; - case TWObt: + continue; + case TWObt: SUSPEND_OBUF(10); *out_p++ = getBT1(next_info); SUSPEND_OBUF(21); *out_p++ = getBT2(next_info); - continue; - case THREEbt: + continue; + case THREEbt: SUSPEND_OBUF(11); *out_p++ = getBT1(next_info); SUSPEND_OBUF(15); *out_p++ = getBT2(next_info); SUSPEND_OBUF(16); *out_p++ = getBT3(next_info); - continue; - case FOURbt: + continue; + case FOURbt: SUSPEND_OBUF(12); *out_p++ = getBT0(next_info); SUSPEND_OBUF(17); *out_p++ = getBT1(next_info); SUSPEND_OBUF(18); *out_p++ = getBT2(next_info); SUSPEND_OBUF(19); *out_p++ = getBT3(next_info); - continue; - case GB4bt: + continue; + case GB4bt: SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info); SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info); SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info); SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info); - continue; + continue; case STR1: tc->output_index = 0; while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) { @@ -621,10 +646,10 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, tc->output_index++; } continue; - case FUNii: - next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info); - goto follow_info; - case FUNsi: + case FUNii: + next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info); + goto follow_info; + case FUNsi: { const unsigned char *char_start; size_t char_len; @@ -632,7 +657,7 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len); goto follow_info; } - case FUNio: + case FUNio: SUSPEND_OBUF(13); if (tr->max_output <= out_stop - out_p) out_p += tr->func_io(TRANSCODING_STATE(tc), @@ -647,8 +672,8 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++]; } } - break; - case FUNso: + break; + case FUNso: { const unsigned char *char_start; size_t char_len; @@ -672,7 +697,7 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, } break; } - case FUNsio: + case FUNsio: { const unsigned char *char_start; size_t char_len; @@ -696,7 +721,7 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, } break; } - case INVALID: + case INVALID: if (tc->recognized_len + (in_p - inchar_start) <= unitlen) { if (tc->recognized_len + (in_p - inchar_start) < unitlen) SUSPEND_AFTER_OUTPUT(26); @@ -719,12 +744,12 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, readagain_len = invalid_len - discard_len; } goto invalid; - case UNDEF: - goto undef; - default: - rb_raise(rb_eRuntimeError, "unknown transcoding instruction"); - } - continue; + case UNDEF: + goto undef; + default: + rb_raise(rb_eRuntimeError, "unknown transcoding instruction"); + } + continue; invalid: SUSPEND(econv_invalid_byte_sequence, 1); @@ -854,13 +879,13 @@ rb_transcoding_memsize(rb_transcoding *tc) const rb_transcoder *tr = tc->transcoder; if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) { - size += tr->state_size; + size += tr->state_size; } if ((int)sizeof(tc->readbuf.ary) < tr->max_input) { - size += tr->max_input; + size += tr->max_input; } if ((int)sizeof(tc->writebuf.ary) < tr->max_output) { - size += tr->max_output; + size += tr->max_output; } return size; } @@ -1000,7 +1025,7 @@ rb_econv_open0(const char *sname, const char *dname, int ecflags) if (*sname == '\0' && *dname == '\0') { num_trans = 0; entries = NULL; - sname = dname = ""; + sname = dname = ""; } else { struct trans_open_t toarg; @@ -1037,8 +1062,9 @@ decorator_names(int ecflags, const char **decorators_ret) case ECONV_UNIVERSAL_NEWLINE_DECORATOR: case ECONV_CRLF_NEWLINE_DECORATOR: case ECONV_CR_NEWLINE_DECORATOR: + case ECONV_LF_NEWLINE_DECORATOR: case 0: - break; + break; default: return -1; } @@ -1060,6 +1086,8 @@ decorator_names(int ecflags, const char **decorators_ret) decorators_ret[num_decorators++] = "crlf_newline"; if (ecflags & ECONV_CR_NEWLINE_DECORATOR) decorators_ret[num_decorators++] = "cr_newline"; + if (ecflags & ECONV_LF_NEWLINE_DECORATOR) + decorators_ret[num_decorators++] = "lf_newline"; if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) decorators_ret[num_decorators++] = "universal_newline"; @@ -1471,22 +1499,22 @@ rb_econv_convert(rb_econv_t *ec, if (ret == econv_invalid_byte_sequence || ret == econv_incomplete_input) { - /* deal with invalid byte sequence */ - /* todo: add more alternative behaviors */ + /* deal with invalid byte sequence */ + /* todo: add more alternative behaviors */ switch (ec->flags & ECONV_INVALID_MASK) { case ECONV_INVALID_REPLACE: - if (output_replacement_character(ec) == 0) + if (output_replacement_character(ec) == 0) goto resume; - } + } } if (ret == econv_undefined_conversion) { - /* valid character in source encoding - * but no related character(s) in destination encoding */ - /* todo: add more alternative behaviors */ + /* valid character in source encoding + * but no related character(s) in destination encoding */ + /* todo: add more alternative behaviors */ switch (ec->flags & ECONV_UNDEF_MASK) { case ECONV_UNDEF_REPLACE: - if (output_replacement_character(ec) == 0) + if (output_replacement_character(ec) == 0) goto resume; break; @@ -1709,8 +1737,7 @@ rb_econv_close(rb_econv_t *ec) } for (i = 0; i < ec->num_trans; i++) { rb_transcoding_close(ec->elems[i].tc); - if (ec->elems[i].out_buf_start) - xfree(ec->elems[i].out_buf_start); + xfree(ec->elems[i].out_buf_start); } xfree(ec->in_buf_start); xfree(ec->elems); @@ -1724,14 +1751,14 @@ rb_econv_memsize(rb_econv_t *ec) int i; if (ec->replacement_allocated) { - size += ec->replacement_len; + size += ec->replacement_len; } for (i = 0; i < ec->num_trans; i++) { - size += rb_transcoding_memsize(ec->elems[i].tc); + size += rb_transcoding_memsize(ec->elems[i].tc); - if (ec->elems[i].out_buf_start) { + if (ec->elems[i].out_buf_start) { size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start; - } + } } size += ec->in_buf_end - ec->in_buf_start; size += sizeof(rb_econv_elem_t) * ec->num_allocated; @@ -1810,6 +1837,12 @@ rb_econv_asciicompat_encoding(const char *ascii_incompat_name) return data.ascii_compat_name; } +/* + * Append `len` bytes pointed by `ss` to `dst` with converting with `ec`. + * + * If the result of the conversion is not compatible with the encoding of + * `dst`, `dst` may not be valid encoding. + */ VALUE rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags) { @@ -1817,11 +1850,19 @@ rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags) unsigned char *ds, *dp, *de; rb_econv_result_t res; int max_output; + enum ruby_coderange_type coderange; + rb_encoding *dst_enc = ec->destination_encoding; if (NIL_P(dst)) { dst = rb_str_buf_new(len); - if (ec->destination_encoding) - rb_enc_associate(dst, ec->destination_encoding); + if (dst_enc) { + rb_enc_associate(dst, dst_enc); + } + coderange = ENC_CODERANGE_7BIT; // scan from the start + } + else { + dst_enc = rb_enc_get(dst); + coderange = rb_enc_str_coderange(dst); } if (ec->last_tc) @@ -1830,13 +1871,13 @@ rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags) max_output = 1; do { + int cr; long dlen = RSTRING_LEN(dst); if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) { unsigned long new_capa = (unsigned long)dlen + len + max_output; if (LONG_MAX < new_capa) rb_raise(rb_eArgError, "too long string"); - rb_str_resize(dst, new_capa); - rb_str_set_len(dst, dlen); + rb_str_modify_expand(dst, new_capa - dlen); } sp = (const unsigned char *)ss; se = sp + len; @@ -1844,6 +1885,18 @@ rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags) de = ds + rb_str_capacity(dst); dp = ds += dlen; res = rb_econv_convert(ec, &sp, se, &dp, de, flags); + switch (coderange) { + case ENC_CODERANGE_7BIT: + case ENC_CODERANGE_VALID: + cr = (int)coderange; + rb_str_coderange_scan_restartable((char *)ds, (char *)dp, dst_enc, &cr); + coderange = cr; + ENC_CODERANGE_SET(dst, coderange); + break; + case ENC_CODERANGE_UNKNOWN: + case ENC_CODERANGE_BROKEN: + break; + } len -= (const char *)sp - ss; ss = (const char *)sp; rb_str_set_len(dst, dlen + (dp - ds)); @@ -1946,30 +1999,33 @@ rb_econv_binmode(rb_econv_t *ec) switch (ec->flags & ECONV_NEWLINE_DECORATOR_MASK) { case ECONV_UNIVERSAL_NEWLINE_DECORATOR: - dname = "universal_newline"; - break; + dname = "universal_newline"; + break; case ECONV_CRLF_NEWLINE_DECORATOR: - dname = "crlf_newline"; - break; + dname = "crlf_newline"; + break; case ECONV_CR_NEWLINE_DECORATOR: - dname = "cr_newline"; - break; + dname = "cr_newline"; + break; + case ECONV_LF_NEWLINE_DECORATOR: + dname = "lf_newline"; + break; } if (dname) { const rb_transcoder *transcoder = get_transcoder_entry("", dname)->transcoder; int num_trans = ec->num_trans; - int i, j = 0; + int i, j = 0; - for (i=0; i < num_trans; i++) { - if (transcoder == ec->elems[i].tc->transcoder) { - rb_transcoding_close(ec->elems[i].tc); - xfree(ec->elems[i].out_buf_start); - ec->num_trans--; - } - else - ec->elems[j++] = ec->elems[i]; - } + for (i=0; i < num_trans; i++) { + if (transcoder == ec->elems[i].tc->transcoder) { + rb_transcoding_close(ec->elems[i].tc); + xfree(ec->elems[i].out_buf_start); + ec->num_trans--; + } + else + ec->elems[j++] = ec->elems[i]; + } } ec->flags &= ~ECONV_NEWLINE_DECORATOR_MASK; @@ -2012,6 +2068,10 @@ econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg) rb_str_cat2(mesg, pre); pre = ","; rb_str_cat2(mesg, "cr_newline"); } + if (ecflags & ECONV_LF_NEWLINE_DECORATOR) { + rb_str_cat2(mesg, pre); pre = ","; + rb_str_cat2(mesg, "lf_newline"); + } if (ecflags & ECONV_XML_TEXT_DECORATOR) { rb_str_cat2(mesg, pre); pre = ","; rb_str_cat2(mesg, "xml_text"); @@ -2100,7 +2160,7 @@ make_econv_exception(rb_econv_t *ec) dumped = rb_sprintf("U+%04X", cc); } } - if (dumped == Qnil) + if (NIL_P(dumped)) dumped = rb_str_dump(bytes); if (strcmp(ec->last_error.source_encoding, ec->source_encoding_name) == 0 && @@ -2265,7 +2325,7 @@ aref_fallback(VALUE fallback, VALUE c) static void transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, - const unsigned char *in_stop, unsigned char *out_stop, + const unsigned char *in_stop, unsigned char *out_stop, VALUE destination, unsigned char *(*resize_destination)(VALUE, size_t, size_t), const char *src_encoding, @@ -2287,19 +2347,19 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags)); if (!NIL_P(ecopts) && RB_TYPE_P(ecopts, T_HASH)) { - fallback = rb_hash_aref(ecopts, sym_fallback); - if (RB_TYPE_P(fallback, T_HASH)) { - fallback_func = hash_fallback; - } - else if (rb_obj_is_proc(fallback)) { - fallback_func = proc_fallback; - } - else if (rb_obj_is_method(fallback)) { - fallback_func = method_fallback; - } - else { - fallback_func = aref_fallback; - } + fallback = rb_hash_aref(ecopts, sym_fallback); + if (RB_TYPE_P(fallback, T_HASH)) { + fallback_func = hash_fallback; + } + else if (rb_obj_is_proc(fallback)) { + fallback_func = proc_fallback; + } + else if (rb_obj_is_method(fallback)) { + fallback_func = method_fallback; + } + else { + fallback_func = aref_fallback; + } } last_tc = ec->last_tc; max_output = last_tc ? last_tc->transcoder->max_output : 1; @@ -2308,20 +2368,20 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0); if (!NIL_P(fallback) && ret == econv_undefined_conversion) { - VALUE rep = rb_enc_str_new( - (const char *)ec->last_error.error_bytes_start, - ec->last_error.error_bytes_len, - rb_enc_find(ec->last_error.source_encoding)); - rep = (*fallback_func)(fallback, rep); - if (rep != Qundef && !NIL_P(rep)) { - StringValue(rep); - ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep), - RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep))); - if ((int)ret == -1) { - rb_raise(rb_eArgError, "too big fallback string"); - } - goto resume; - } + VALUE rep = rb_enc_str_new( + (const char *)ec->last_error.error_bytes_start, + ec->last_error.error_bytes_len, + rb_enc_find(ec->last_error.source_encoding)); + rep = (*fallback_func)(fallback, rep); + if (!UNDEF_P(rep) && !NIL_P(rep)) { + StringValue(rep); + ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep), + RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep))); + if ((int)ret == -1) { + rb_raise(rb_eArgError, "too big fallback string"); + } + goto resume; + } } if (ret == econv_invalid_byte_sequence || @@ -2329,7 +2389,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, ret == econv_undefined_conversion) { exc = make_econv_exception(ec); rb_econv_close(ec); - rb_exc_raise(exc); + rb_exc_raise(exc); } if (ret == econv_destination_buffer_full) { @@ -2344,7 +2404,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, /* sample transcode_loop implementation in byte-by-byte stream style */ static void transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, - const unsigned char *in_stop, unsigned char *out_stop, + const unsigned char *in_stop, unsigned char *out_stop, VALUE destination, unsigned char *(*resize_destination)(VALUE, size_t, size_t), const char *src_encoding, @@ -2476,56 +2536,61 @@ econv_opts(VALUE opt, int ecflags) v = rb_hash_aref(opt, sym_newline); if (!NIL_P(v)) { newlineflag = 2; - ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK; - if (v == sym_universal) { - ecflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR; - } - else if (v == sym_crlf) { - ecflags |= ECONV_CRLF_NEWLINE_DECORATOR; - } - else if (v == sym_cr) { - ecflags |= ECONV_CR_NEWLINE_DECORATOR; - } - else if (v == sym_lf) { - /* ecflags |= ECONV_LF_NEWLINE_DECORATOR; */ - } - else if (SYMBOL_P(v)) { - rb_raise(rb_eArgError, "unexpected value for newline option: %"PRIsVALUE, - rb_sym2str(v)); - } - else { - rb_raise(rb_eArgError, "unexpected value for newline option"); - } + ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK; + if (v == sym_universal) { + ecflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR; + } + else if (v == sym_crlf) { + ecflags |= ECONV_CRLF_NEWLINE_DECORATOR; + } + else if (v == sym_cr) { + ecflags |= ECONV_CR_NEWLINE_DECORATOR; + } + else if (v == sym_lf) { + ecflags |= ECONV_LF_NEWLINE_DECORATOR; + } + else if (SYMBOL_P(v)) { + rb_raise(rb_eArgError, "unexpected value for newline option: %"PRIsVALUE, + rb_sym2str(v)); + } + else { + rb_raise(rb_eArgError, "unexpected value for newline option"); + } } #endif { int setflags = 0; - v = rb_hash_aref(opt, sym_universal_newline); - if (RTEST(v)) - setflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR; - newlineflag |= !NIL_P(v); + v = rb_hash_aref(opt, sym_universal_newline); + if (RTEST(v)) + setflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR; + newlineflag |= !NIL_P(v); - v = rb_hash_aref(opt, sym_crlf_newline); - if (RTEST(v)) - setflags |= ECONV_CRLF_NEWLINE_DECORATOR; - newlineflag |= !NIL_P(v); + v = rb_hash_aref(opt, sym_crlf_newline); + if (RTEST(v)) + setflags |= ECONV_CRLF_NEWLINE_DECORATOR; + newlineflag |= !NIL_P(v); - v = rb_hash_aref(opt, sym_cr_newline); - if (RTEST(v)) - setflags |= ECONV_CR_NEWLINE_DECORATOR; - newlineflag |= !NIL_P(v); + v = rb_hash_aref(opt, sym_cr_newline); + if (RTEST(v)) + setflags |= ECONV_CR_NEWLINE_DECORATOR; + newlineflag |= !NIL_P(v); + + v = rb_hash_aref(opt, sym_lf_newline); + if (RTEST(v)) + setflags |= ECONV_LF_NEWLINE_DECORATOR; + newlineflag |= !NIL_P(v); switch (newlineflag) { case 1: - ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK; - ecflags |= setflags; + ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK; + ecflags |= setflags; break; case 3: rb_warning(":newline option precedes other newline options"); break; - } + } } return ecflags; @@ -2545,28 +2610,28 @@ rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags) v = rb_hash_aref(opthash, sym_replace); if (!NIL_P(v)) { - StringValue(v); - if (rb_enc_str_coderange(v) == ENC_CODERANGE_BROKEN) { - VALUE dumped = rb_str_dump(v); - rb_raise(rb_eArgError, "replacement string is broken: %s as %s", - StringValueCStr(dumped), - rb_enc_name(rb_enc_get(v))); - } - v = rb_str_new_frozen(v); - newhash = rb_hash_new(); - rb_hash_aset(newhash, sym_replace, v); + StringValue(v); + if (is_broken_string(v)) { + VALUE dumped = rb_str_dump(v); + rb_raise(rb_eArgError, "replacement string is broken: %s as %s", + StringValueCStr(dumped), + rb_enc_name(rb_enc_get(v))); + } + v = rb_str_new_frozen(v); + newhash = rb_hash_new(); + rb_hash_aset(newhash, sym_replace, v); } v = rb_hash_aref(opthash, sym_fallback); if (!NIL_P(v)) { - VALUE h = rb_check_hash_type(v); - if (NIL_P(h) - ? (rb_obj_is_proc(v) || rb_obj_is_method(v) || rb_respond_to(v, idAREF)) - : (v = h, 1)) { - if (NIL_P(newhash)) - newhash = rb_hash_new(); - rb_hash_aset(newhash, sym_fallback, v); - } + VALUE h = rb_check_hash_type(v); + if (NIL_P(h) + ? (rb_obj_is_proc(v) || rb_obj_is_method(v) || rb_respond_to(v, idAREF)) + : (v = h, 1)) { + if (NIL_P(newhash)) + newhash = rb_hash_new(); + rb_hash_aset(newhash, sym_fallback, v); + } } if (!NIL_P(newhash)) @@ -2626,13 +2691,13 @@ enc_arg(VALUE *arg, const char **name_p, rb_encoding **enc_p) VALUE encval; if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) || - !(enc = rb_enc_from_index(encidx))) { - enc = NULL; - encidx = 0; - n = StringValueCStr(*arg); + !(enc = rb_enc_from_index(encidx))) { + enc = NULL; + encidx = 0; + n = StringValueCStr(*arg); } else { - n = rb_enc_name(enc); + n = rb_enc_name(enc); } *name_p = n; @@ -2653,9 +2718,9 @@ str_transcode_enc_args(VALUE str, VALUE *arg1, VALUE *arg2, dencidx = enc_arg(arg1, &dname, &denc); if (NIL_P(*arg2)) { - sencidx = rb_enc_get_index(str); - senc = rb_enc_from_index(sencidx); - sname = rb_enc_name(senc); + sencidx = rb_enc_get_index(str); + senc = rb_enc_from_index(sencidx); + sname = rb_enc_name(senc); } else { sencidx = enc_arg(arg2, &sname, &senc); @@ -2685,18 +2750,18 @@ str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts) rb_check_arity(argc, 0, 2); if (argc == 0) { - arg1 = rb_enc_default_internal(); - if (NIL_P(arg1)) { - if (!ecflags) return -1; - arg1 = rb_obj_encoding(str); - } - if (!(ecflags & ECONV_INVALID_MASK)) { - explicitly_invalid_replace = FALSE; - } - ecflags |= ECONV_INVALID_REPLACE | ECONV_UNDEF_REPLACE; + arg1 = rb_enc_default_internal(); + if (NIL_P(arg1)) { + if (!ecflags) return -1; + arg1 = rb_obj_encoding(str); + } + if (!(ecflags & ECONV_INVALID_MASK)) { + explicitly_invalid_replace = FALSE; + } + ecflags |= ECONV_INVALID_REPLACE | ECONV_UNDEF_REPLACE; } else { - arg1 = argv[0]; + arg1 = argv[0]; } arg2 = argc<=1 ? Qnil : argv[1]; dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc); @@ -2706,20 +2771,20 @@ str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts) ECONV_XML_ATTR_CONTENT_DECORATOR| ECONV_XML_ATTR_QUOTE_DECORATOR)) == 0) { if (senc && senc == denc) { - if ((ecflags & ECONV_INVALID_MASK) && explicitly_invalid_replace) { - VALUE rep = Qnil; - if (!NIL_P(ecopts)) { - rep = rb_hash_aref(ecopts, sym_replace); - } - dest = rb_enc_str_scrub(senc, str, rep); - if (NIL_P(dest)) dest = str; - *self = dest; - return dencidx; - } + if ((ecflags & ECONV_INVALID_MASK) && explicitly_invalid_replace) { + VALUE rep = Qnil; + if (!NIL_P(ecopts)) { + rep = rb_hash_aref(ecopts, sym_replace); + } + dest = rb_enc_str_scrub(senc, str, rep); + if (NIL_P(dest)) dest = str; + *self = dest; + return dencidx; + } return NIL_P(arg2) ? -1 : dencidx; } if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) { - if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) { + if (is_ascii_string(str)) { return dencidx; } } @@ -2756,9 +2821,9 @@ str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts) /* set encoding */ if (!denc) { - dencidx = rb_define_dummy_encoding(dname); - RB_GC_GUARD(arg1); - RB_GC_GUARD(arg2); + dencidx = rb_define_dummy_encoding(dname); + RB_GC_GUARD(arg1); + RB_GC_GUARD(arg2); } *self = dest; @@ -2774,7 +2839,7 @@ str_transcode(int argc, VALUE *argv, VALUE *self) argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt); if (!NIL_P(opt)) { - ecflags = rb_econv_prepare_opts(opt, &ecopts); + ecflags = rb_econv_prepare_opts(opt, &ecopts); } return str_transcode0(argc, argv, self, ecflags, ecopts); } @@ -2788,10 +2853,10 @@ str_encode_associate(VALUE str, int encidx) /* transcoded string never be broken. */ if (rb_enc_asciicompat(rb_enc_from_index(encidx))) { - rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr); + rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr); } else { - cr = ENC_CODERANGE_VALID; + cr = ENC_CODERANGE_VALID; } ENC_CODERANGE_SET(str, cr); return str; @@ -2799,16 +2864,11 @@ str_encode_associate(VALUE str, int encidx) /* * call-seq: - * str.encode!(encoding, **options) -> str - * str.encode!(dst_encoding, src_encoding, **options) -> str + * encode!(dst_encoding = Encoding.default_internal, **enc_opts) -> self + * encode!(dst_encoding, src_encoding, **enc_opts) -> self + * + * Like #encode, but applies encoding changes to +self+; returns +self+. * - * The first form transcodes the contents of <i>str</i> from - * str.encoding to +encoding+. - * The second form transcodes the contents of <i>str</i> from - * src_encoding to dst_encoding. - * The +options+ keyword arguments give details for conversion. See String#encode - * for details. - * Returns the string even if no changes were made. */ static VALUE @@ -2824,8 +2884,8 @@ str_encode_bang(int argc, VALUE *argv, VALUE str) if (encidx < 0) return str; if (newstr == str) { - rb_enc_associate_index(str, encidx); - return str; + rb_enc_associate_index(str, encidx); + return str; } rb_str_shared_replace(str, newstr); return str_encode_associate(str, encidx); @@ -2835,58 +2895,11 @@ static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx); /* * call-seq: - * str.encode(encoding, **options) -> str - * str.encode(dst_encoding, src_encoding, **options) -> str - * str.encode(**options) -> str - * - * The first form returns a copy of +str+ transcoded - * to encoding +encoding+. - * The second form returns a copy of +str+ transcoded - * from src_encoding to dst_encoding. - * The last form returns a copy of +str+ transcoded to - * <tt>Encoding.default_internal</tt>. + * encode(dst_encoding = Encoding.default_internal, **enc_opts) -> string + * encode(dst_encoding, src_encoding, **enc_opts) -> string * - * By default, the first and second form raise - * Encoding::UndefinedConversionError for characters that are - * undefined in the destination encoding, and - * Encoding::InvalidByteSequenceError for invalid byte sequences - * in the source encoding. The last form by default does not raise - * exceptions but uses replacement strings. + * :include: doc/string/encode.rdoc * - * The +options+ keyword arguments give details for conversion. - * The arguments are: - * - * :invalid :: - * If the value is +:replace+, #encode replaces invalid byte sequences in - * +str+ with the replacement character. The default is to raise the - * Encoding::InvalidByteSequenceError exception - * :undef :: - * If the value is +:replace+, #encode replaces characters which are - * undefined in the destination encoding with the replacement character. - * The default is to raise the Encoding::UndefinedConversionError. - * :replace :: - * Sets the replacement string to the given value. The default replacement - * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise. - * :fallback :: - * Sets the replacement string by the given object for undefined - * character. The object should be a Hash, a Proc, a Method, or an - * object which has [] method. - * Its key is an undefined character encoded in the source encoding - * of current transcoder. Its value can be any encoding until it - * can be converted into the destination encoding of the transcoder. - * :xml :: - * The value must be +:text+ or +:attr+. - * If the value is +:text+ #encode replaces undefined characters with their - * (upper-case hexadecimal) numeric character references. '&', '<', and '>' - * are converted to "&", "<", and ">", respectively. - * If the value is +:attr+, #encode also quotes the replacement result - * (using '"'), and replaces '"' with """. - * :cr_newline :: - * Replaces LF ("\n") with CR ("\r") if value is true. - * :crlf_newline :: - * Replaces LF ("\n") with CRLF ("\r\n") if value is true. - * :universal_newline :: - * Replaces CRLF ("\r\n") and CR ("\r") with LF ("\n") if value is true. */ static VALUE @@ -2912,12 +2925,12 @@ encoded_dup(VALUE newstr, VALUE str, int encidx) { if (encidx < 0) return rb_str_dup(str); if (newstr == str) { - newstr = rb_str_dup(str); - rb_enc_associate_index(newstr, encidx); - return newstr; + newstr = rb_str_dup(str); + rb_enc_associate_index(newstr, encidx); + return newstr; } else { - RBASIC_SET_CLASS(newstr, rb_obj_class(str)); + RBASIC_SET_CLASS(newstr, rb_obj_class(str)); } return str_encode_associate(newstr, encidx); } @@ -3031,9 +3044,9 @@ econv_args(int argc, VALUE *argv, argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt); if (!NIL_P(flags_v)) { - if (!NIL_P(opt)) { - rb_error_arity(argc + 1, 2, 3); - } + if (!NIL_P(opt)) { + rb_error_arity(argc + 1, 2, 3); + } ecflags = NUM2INT(rb_to_int(flags_v)); ecopts = Qnil; } @@ -3089,22 +3102,22 @@ decorate_convpath(VALUE convpath, int ecflags) len = n = RARRAY_LENINT(convpath); if (n != 0) { VALUE pair = RARRAY_AREF(convpath, n-1); - if (RB_TYPE_P(pair, T_ARRAY)) { - const char *sname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 0))); - const char *dname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 1))); - transcoder_entry_t *entry = get_transcoder_entry(sname, dname); - const rb_transcoder *tr = load_transcoder_entry(entry); - if (!tr) - return -1; - if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) && - tr->asciicompat_type == asciicompat_encoder) { - n--; - rb_ary_store(convpath, len + num_decorators - 1, pair); - } - } - else { - rb_ary_store(convpath, len + num_decorators - 1, pair); - } + if (RB_TYPE_P(pair, T_ARRAY)) { + const char *sname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 0))); + const char *dname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 1))); + transcoder_entry_t *entry = get_transcoder_entry(sname, dname); + const rb_transcoder *tr = load_transcoder_entry(entry); + if (!tr) + return -1; + if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) && + tr->asciicompat_type == asciicompat_encoder) { + n--; + rb_ary_store(convpath, len + num_decorators - 1, pair); + } + } + else { + rb_ary_store(convpath, len + num_decorators - 1, pair); + } } for (i = 0; i < num_decorators; i++) @@ -3119,7 +3132,7 @@ search_convpath_i(const char *sname, const char *dname, int depth, void *arg) VALUE *ary_p = arg; VALUE v; - if (*ary_p == Qnil) { + if (NIL_P(*ary_p)) { *ary_p = rb_ary_new(); } @@ -3180,10 +3193,10 @@ econv_s_search_convpath(int argc, VALUE *argv, VALUE klass) } if (decorate_convpath(convpath, ecflags) == -1) { - VALUE exc = rb_econv_open_exc(sname, dname, ecflags); - RB_GC_GUARD(snamev); - RB_GC_GUARD(dnamev); - rb_exc_raise(exc); + VALUE exc = rb_econv_open_exc(sname, dname, ecflags); + RB_GC_GUARD(snamev); + RB_GC_GUARD(dnamev); + rb_exc_raise(exc); } return convpath; @@ -3199,7 +3212,7 @@ rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding) { VALUE convpath = Qnil; transcode_search_path(from_encoding, to_encoding, search_convpath_i, - &convpath); + &convpath); return RTEST(convpath); } @@ -3257,12 +3270,12 @@ rb_econv_init_by_convpath(VALUE self, VALUE convpath, } if (DECORATOR_P(sname, dname)) { ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans); - if (ret == -1) { - VALUE msg = rb_sprintf("decoration failed: %s", dname); - RB_GC_GUARD(snamev); - RB_GC_GUARD(dnamev); - rb_exc_raise(rb_exc_new_str(rb_eArgError, msg)); - } + if (ret == -1) { + VALUE msg = rb_sprintf("decoration failed: %s", dname); + RB_GC_GUARD(snamev); + RB_GC_GUARD(dnamev); + rb_exc_raise(rb_exc_new_str(rb_eArgError, msg)); + } } else { int j = ec->num_trans; @@ -3271,12 +3284,12 @@ rb_econv_init_by_convpath(VALUE self, VALUE convpath, arg.index = ec->num_trans; arg.ret = 0; ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg); - if (ret == -1 || arg.ret == -1) { - VALUE msg = rb_sprintf("adding conversion failed: %s to %s", sname, dname); - RB_GC_GUARD(snamev); - RB_GC_GUARD(dnamev); + if (ret == -1 || arg.ret == -1) { + VALUE msg = rb_sprintf("adding conversion failed: %s to %s", sname, dname); + RB_GC_GUARD(snamev); + RB_GC_GUARD(dnamev); rb_exc_raise(rb_exc_new_str(rb_eArgError, msg)); - } + } if (first) { first = 0; *senc_p = senc; @@ -3288,10 +3301,10 @@ rb_econv_init_by_convpath(VALUE self, VALUE convpath, } if (first) { - *senc_p = NULL; - *denc_p = NULL; - *sname_p = ""; - *dname_p = ""; + *senc_p = NULL; + *denc_p = NULL; + *sname_p = ""; + *dname_p = ""; } ec->source_encoding_name = *sname_p; @@ -3314,11 +3327,13 @@ rb_econv_init_by_convpath(VALUE self, VALUE convpath, * :undef => :replace # replace undefined conversion * :replace => string # replacement string ("?" or "\uFFFD" if not specified) * :newline => :universal # decorator for converting CRLF and CR to LF + * :newline => :lf # decorator for converting CRLF and CR to LF when writing * :newline => :crlf # decorator for converting LF to CRLF * :newline => :cr # decorator for converting LF to CR * :universal_newline => true # decorator for converting CRLF and CR to LF * :crlf_newline => true # decorator for converting LF to CRLF * :cr_newline => true # decorator for converting LF to CR + * :lf_newline => true # decorator for converting CRLF and CR to LF when writing * :xml => :text # escape as XML CharData. * :xml => :attr # escape as XML AttValue * integer form: @@ -3326,6 +3341,7 @@ rb_econv_init_by_convpath(VALUE self, VALUE convpath, * Encoding::Converter::UNDEF_REPLACE * Encoding::Converter::UNDEF_HEX_CHARREF * Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR + * Encoding::Converter::LF_NEWLINE_DECORATOR * Encoding::Converter::CRLF_NEWLINE_DECORATOR * Encoding::Converter::CR_NEWLINE_DECORATOR * Encoding::Converter::XML_TEXT_DECORATOR @@ -3368,6 +3384,8 @@ rb_econv_init_by_convpath(VALUE self, VALUE convpath, * Convert LF to CRLF. * [:cr_newline => true] * Convert LF to CR. + * [:lf_newline => true] + * Convert CRLF and CR to LF (when writing). * [:xml => :text] * Escape as XML CharData. * This form can be used as an HTML 4.0 #PCDATA. @@ -3432,10 +3450,10 @@ econv_init(int argc, VALUE *argv, VALUE self) } if (!ec) { - VALUE exc = rb_econv_open_exc(sname, dname, ecflags); - RB_GC_GUARD(snamev); - RB_GC_GUARD(dnamev); - rb_exc_raise(exc); + VALUE exc = rb_econv_open_exc(sname, dname, ecflags); + RB_GC_GUARD(snamev); + RB_GC_GUARD(dnamev); + rb_exc_raise(exc); } if (!DECORATOR_P(sname, dname)) { @@ -3443,8 +3461,8 @@ econv_init(int argc, VALUE *argv, VALUE self) senc = make_dummy_encoding(sname); if (!denc) denc = make_dummy_encoding(dname); - RB_GC_GUARD(snamev); - RB_GC_GUARD(dnamev); + RB_GC_GUARD(snamev); + RB_GC_GUARD(dnamev); } ec->source_encoding = senc; @@ -3497,6 +3515,14 @@ check_econv(VALUE self) return ec; } +static VALUE +econv_get_encoding(rb_encoding *encoding) +{ + if (!encoding) + return Qnil; + return rb_enc_from_encoding(encoding); +} + /* * call-seq: * ec.source_encoding -> encoding @@ -3507,9 +3533,7 @@ static VALUE econv_source_encoding(VALUE self) { rb_econv_t *ec = check_econv(self); - if (!ec->source_encoding) - return Qnil; - return rb_enc_from_encoding(ec->source_encoding); + return econv_get_encoding(ec->source_encoding); } /* @@ -3522,9 +3546,7 @@ static VALUE econv_destination_encoding(VALUE self) { rb_econv_t *ec = check_econv(self); - if (!ec->destination_encoding) - return Qnil; - return rb_enc_from_encoding(ec->destination_encoding); + return econv_get_encoding(ec->destination_encoding); } /* @@ -3581,29 +3603,29 @@ econv_equal(VALUE self, VALUE other) int i; if (!rb_typeddata_is_kind_of(other, &econv_data_type)) { - return Qnil; + return Qnil; } ec2 = DATA_PTR(other); if (!ec2) return Qfalse; if (ec1->source_encoding_name != ec2->source_encoding_name && - strcmp(ec1->source_encoding_name, ec2->source_encoding_name)) - return Qfalse; + strcmp(ec1->source_encoding_name, ec2->source_encoding_name)) + return Qfalse; if (ec1->destination_encoding_name != ec2->destination_encoding_name && - strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name)) - return Qfalse; + strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name)) + return Qfalse; if (ec1->flags != ec2->flags) return Qfalse; if (ec1->replacement_enc != ec2->replacement_enc && - strcmp(ec1->replacement_enc, ec2->replacement_enc)) - return Qfalse; + strcmp(ec1->replacement_enc, ec2->replacement_enc)) + return Qfalse; if (ec1->replacement_len != ec2->replacement_len) return Qfalse; if (ec1->replacement_str != ec2->replacement_str && - memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len)) - return Qfalse; + memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len)) + return Qfalse; if (ec1->num_trans != ec2->num_trans) return Qfalse; for (i = 0; i < ec1->num_trans; i++) { if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder) - return Qfalse; + return Qfalse; } return Qtrue; } @@ -3742,10 +3764,10 @@ econv_primitive_convert(int argc, VALUE *argv, VALUE self) output_bytesize = NUM2LONG(output_bytesize_v); if (!NIL_P(flags_v)) { - if (!NIL_P(opt)) { - rb_error_arity(argc + 1, 2, 5); - } - flags = NUM2INT(rb_to_int(flags_v)); + if (!NIL_P(opt)) { + rb_error_arity(argc + 1, 2, 5); + } + flags = NUM2INT(rb_to_int(flags_v)); } else if (!NIL_P(opt)) { VALUE v; @@ -3767,7 +3789,8 @@ econv_primitive_convert(int argc, VALUE *argv, VALUE self) rb_str_modify(output); if (NIL_P(output_bytesize_v)) { - output_bytesize = RSTRING_EMBED_LEN_MAX; + output_bytesize = rb_str_capacity(output); + if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input)) output_bytesize = RSTRING_LEN(input); } @@ -4093,7 +4116,7 @@ econv_insert_output(VALUE self, VALUE string) ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc); if (ret == -1) { - rb_raise(rb_eArgError, "too big string"); + rb_raise(rb_eArgError, "too big string"); } return Qnil; @@ -4462,6 +4485,7 @@ Init_transcode(void) sym_universal_newline = ID2SYM(rb_intern_const("universal_newline")); sym_crlf_newline = ID2SYM(rb_intern_const("crlf_newline")); sym_cr_newline = ID2SYM(rb_intern_const("cr_newline")); + sym_lf_newline = ID2SYM(rb_intern("lf_newline")); sym_partial_input = ID2SYM(rb_intern_const("partial_input")); #ifdef ENABLE_ECONV_NEWLINE_OPTION @@ -4558,6 +4582,12 @@ InitVM_transcode(void) */ rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR)); + /* Document-const: LF_NEWLINE_DECORATOR + * + * Decorator for converting CRLF and CR to LF when writing + */ + rb_define_const(rb_cEncodingConverter, "LF_NEWLINE_DECORATOR", INT2FIX(ECONV_LF_NEWLINE_DECORATOR)); + /* Document-const: CRLF_NEWLINE_DECORATOR * * Decorator for converting LF to CRLF |