diff options
Diffstat (limited to 'transcode.c')
| -rw-r--r-- | transcode.c | 1999 |
1 files changed, 1340 insertions, 659 deletions
diff --git a/transcode.c b/transcode.c index dd75a5e303..86e828c479 100644 --- a/transcode.c +++ b/transcode.c @@ -9,22 +9,50 @@ **********************************************************************/ -#include "ruby/ruby.h" +#include "ruby/internal/config.h" + +#include <ctype.h> + +#include "internal.h" +#include "internal/array.h" +#include "internal/inits.h" +#include "internal/object.h" +#include "internal/string.h" +#include "internal/transcode.h" +#include "internal/encoding.h" #include "ruby/encoding.h" +#include "vm_sync.h" + #include "transcode_data.h" -#include <ctype.h> +#include "id.h" + +#define ENABLE_ECONV_NEWLINE_OPTION 1 -VALUE rb_eConversionUndefined; -VALUE rb_eInvalidByteSequence; -VALUE rb_eNoConverter; +/* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */ +static VALUE rb_eUndefinedConversionError; +static VALUE rb_eInvalidByteSequenceError; +static VALUE rb_eConverterNotFoundError; VALUE rb_cEncodingConverter; -static VALUE sym_invalid, sym_undef, sym_ignore, sym_replace; +static ID id_destination_encoding; +static ID id_destination_encoding_name; +static ID id_error_bytes; +static ID id_error_char; +static ID id_incomplete_input; +static ID id_readagain_bytes; +static ID id_source_encoding; +static ID id_source_encoding_name; + +static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback; static VALUE sym_xml, sym_text, sym_attr; static VALUE sym_universal_newline; static VALUE sym_crlf_newline; static VALUE sym_cr_newline; +static VALUE sym_lf_newline; +#ifdef ENABLE_ECONV_NEWLINE_OPTION +static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf; +#endif static VALUE sym_partial_input; static VALUE sym_invalid_byte_sequence; @@ -52,16 +80,17 @@ typedef struct rb_transcoding { unsigned int next_table; VALUE next_info; unsigned char next_byte; + unsigned int output_index; - int recognized_len; /* already interpreted */ - int readagain_len; /* not yet interpreted */ + ssize_t recognized_len; /* already interpreted */ 
+ ssize_t readagain_len; /* not yet interpreted */ union { unsigned char ary[8]; /* max_input <= sizeof(ary) */ unsigned char *ptr; /* length: max_input */ } readbuf; /* recognized_len + readagain_len used */ - int writebuf_off; - int writebuf_len; + ssize_t writebuf_off; + ssize_t writebuf_len; union { unsigned char ary[8]; /* max_output <= sizeof(ary) */ unsigned char *ptr; /* length: max_output */ @@ -69,25 +98,26 @@ typedef struct rb_transcoding { union rb_transcoding_state_t { /* opaque data for stateful encoding */ void *ptr; + char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)]; double dummy_for_alignment; } state; } rb_transcoding; #define TRANSCODING_READBUF(tc) \ - ((tc)->transcoder->max_input <= sizeof((tc)->readbuf.ary) ? \ + ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \ (tc)->readbuf.ary : \ (tc)->readbuf.ptr) #define TRANSCODING_WRITEBUF(tc) \ - ((tc)->transcoder->max_output <= sizeof((tc)->writebuf.ary) ? \ + ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \ (tc)->writebuf.ary : \ (tc)->writebuf.ptr) #define TRANSCODING_WRITEBUF_SIZE(tc) \ - ((tc)->transcoder->max_output <= sizeof((tc)->writebuf.ary) ? \ + ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \ sizeof((tc)->writebuf.ary) : \ - (tc)->transcoder->max_output) -#define TRANSCODING_STATE_EMBED_MAX sizeof(union rb_transcoding_state_t) + (size_t)(tc)->transcoder->max_output) +#define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t)) #define TRANSCODING_STATE(tc) \ - ((tc)->transcoder->state_size <= sizeof((tc)->state) ? \ - (void *)&(tc)->state : \ + ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? 
\ + (tc)->state.ary : \ (tc)->state.ptr) typedef struct { @@ -101,21 +131,21 @@ typedef struct { struct rb_econv_t { int flags; + int started; /* bool */ + const char *source_encoding_name; const char *destination_encoding_name; - int started; - const unsigned char *replacement_str; size_t replacement_len; const char *replacement_enc; - int replacement_allocated; unsigned char *in_buf_start; unsigned char *in_data_start; unsigned char *in_data_end; unsigned char *in_buf_end; rb_econv_elem_t *elems; + int replacement_allocated; /* bool */ int num_allocated; int num_trans; int num_finished; @@ -147,31 +177,55 @@ struct rb_econv_t { typedef struct { const char *sname; const char *dname; - const char *lib; /* maybe null. it means that don't load the library. */ + const char *lib; /* null means no need to load a library */ const rb_transcoder *transcoder; } transcoder_entry_t; static st_table *transcoder_table; +static int +free_inner_transcode_i(st_data_t key, st_data_t val, st_data_t arg) +{ + xfree((void *)val); + return ST_DELETE; +} + +static int +free_transcode_i(st_data_t key, st_data_t val, st_data_t arg) +{ + st_foreach((void *)val, free_inner_transcode_i, 0); + st_free_table((void *)val); + return ST_DELETE; +} + +void +rb_free_transcoder_table(void) +{ + st_foreach(transcoder_table, free_transcode_i, 0); + st_free_table(transcoder_table); +} + static transcoder_entry_t * make_transcoder_entry(const char *sname, const char *dname) { st_data_t val; st_table *table2; - if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) { - val = (st_data_t)st_init_strcasetable(); - st_add_direct(transcoder_table, (st_data_t)sname, val); - } - table2 = (st_table *)val; - if (!st_lookup(table2, (st_data_t)dname, &val)) { - transcoder_entry_t *entry = ALLOC(transcoder_entry_t); - entry->sname = sname; - entry->dname = dname; - entry->lib = NULL; - entry->transcoder = NULL; - val = (st_data_t)entry; - st_add_direct(table2, (st_data_t)dname, val); + RB_VM_LOCKING() { + if 
(!st_lookup(transcoder_table, (st_data_t)sname, &val)) { + val = (st_data_t)st_init_strcasetable(); + st_add_direct(transcoder_table, (st_data_t)sname, val); + } + table2 = (st_table *)val; + if (!st_lookup(table2, (st_data_t)dname, &val)) { + transcoder_entry_t *entry = ALLOC(transcoder_entry_t); + entry->sname = sname; + entry->dname = dname; + entry->lib = NULL; + entry->transcoder = NULL; + val = (st_data_t)entry; + st_add_direct(table2, (st_data_t)dname, val); + } } return (transcoder_entry_t *)val; } @@ -179,15 +233,15 @@ make_transcoder_entry(const char *sname, const char *dname) static transcoder_entry_t * get_transcoder_entry(const char *sname, const char *dname) { - st_data_t val; + st_data_t val = 0; st_table *table2; - - if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) { - return NULL; - } - table2 = (st_table *)val; - if (!st_lookup(table2, (st_data_t)dname, &val)) { - return NULL; + RB_VM_LOCKING() { + if (st_lookup(transcoder_table, (st_data_t)sname, &val)) { + table2 = (st_table *)val; + if (!st_lookup(table2, (st_data_t)dname, &val)) { + val = 0; + } + } } return (transcoder_entry_t *)val; } @@ -200,13 +254,14 @@ rb_register_transcoder(const rb_transcoder *tr) transcoder_entry_t *entry; - entry = make_transcoder_entry(sname, dname); - if (entry->transcoder) { - rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered", - sname, dname); + RB_VM_LOCKING() { + entry = make_transcoder_entry(sname, dname); + if (entry->transcoder) { + rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered", + sname, dname); + } + entry->transcoder = tr; } - - entry->transcoder = tr; } static void @@ -218,20 +273,18 @@ declare_transcoder(const char *sname, const char *dname, const char *lib) entry->lib = lib; } -#define MAX_TRANSCODER_LIBNAME_LEN 64 static const char transcoder_lib_prefix[] = "enc/trans/"; void rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib) { - if (!lib || strlen(lib) > 
MAX_TRANSCODER_LIBNAME_LEN) { - rb_raise(rb_eArgError, "invalid library name - %s", - lib ? lib : "(null)"); + if (!lib) { + rb_raise(rb_eArgError, "invalid library name - (null)"); } declare_transcoder(enc1, enc2, lib); } -#define encoding_equal(enc1, enc2) (STRCASECMP(enc1, enc2) == 0) +#define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0) typedef struct search_path_queue_tag { struct search_path_queue_tag *next; @@ -275,8 +328,9 @@ transcode_search_path(const char *sname, const char *dname, search_path_queue_t *q; st_data_t val; st_table *table2; - int found; - int pathlen; + int pathlen = -1; + bool found = false; + bool lookup_res; if (encoding_equal(sname, dname)) return -1; @@ -287,37 +341,39 @@ transcode_search_path(const char *sname, const char *dname, bfs.queue_last_ptr = &q->next; bfs.queue = q; - bfs.visited = st_init_strcasetable(); + bfs.visited = st_init_strcasetable(); // due to base encodings, we need to do search in a loop st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL); - while (bfs.queue) { - q = bfs.queue; - bfs.queue = q->next; - if (!bfs.queue) - bfs.queue_last_ptr = &bfs.queue; + RB_VM_LOCKING() { + while (bfs.queue) { + q = bfs.queue; + bfs.queue = q->next; + if (!bfs.queue) { + bfs.queue_last_ptr = &bfs.queue; + } - if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) { - xfree(q); - continue; - } - table2 = (st_table *)val; + lookup_res = st_lookup(transcoder_table, (st_data_t)q->enc, &val); // src => table2 + if (!lookup_res) { + xfree(q); + continue; + } + table2 = (st_table *)val; - if (st_lookup(table2, (st_data_t)dname, &val)) { - st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc); - xfree(q); - found = 1; - goto cleanup; - } + if (st_lookup(table2, (st_data_t)dname, &val)) { // dest => econv + st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc); + xfree(q); + found = true; + break; + } - bfs.base_enc = q->enc; - st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs); 
- bfs.base_enc = NULL; + bfs.base_enc = q->enc; + st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs); - xfree(q); + bfs.base_enc = NULL; + xfree(q); + } } - found = 0; - cleanup: while (bfs.queue) { q = bfs.queue; bfs.queue = q->next; @@ -348,31 +404,30 @@ transcode_search_path(const char *sname, const char *dname, st_free_table(bfs.visited); - if (found) - return pathlen; - else - return -1; + return pathlen; /* is -1 if not found */ } +int rb_require_internal_silent(VALUE fname); + static const rb_transcoder * load_transcoder_entry(transcoder_entry_t *entry) { + ASSERT_vm_unlocking(); if (entry->transcoder) return entry->transcoder; if (entry->lib) { - const char *lib = entry->lib; - int len = strlen(lib); - char path[sizeof(transcoder_lib_prefix) + MAX_TRANSCODER_LIBNAME_LEN]; - - entry->lib = NULL; + const char *const lib = entry->lib; + const size_t len = strlen(lib); + const size_t total_len = sizeof(transcoder_lib_prefix) - 1 + len; + const VALUE fn = rb_str_new(0, total_len); + char *const path = RSTRING_PTR(fn); - if (len > MAX_TRANSCODER_LIBNAME_LEN) - return NULL; memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1); - memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len + 1); - if (!rb_require(path)) - return NULL; + memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len); + rb_str_set_len(fn, total_len); + OBJ_FREEZE(fn); + rb_require_internal_silent(fn); // Sets entry->transcoder } if (entry->transcoder) @@ -425,30 +480,16 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, const unsigned char *in_stop, unsigned char *out_stop, rb_transcoding *tc, const int opt) - { const rb_transcoder *tr = tc->transcoder; int unitlen = tr->input_unit_length; - int readagain_len = 0; + ssize_t readagain_len = 0; const unsigned char *inchar_start; const unsigned char *in_p; unsigned char *out_p; - unsigned char empty_buf; - unsigned char *empty_ptr = &empty_buf; - - if (!in_pos) { - in_pos = (const 
unsigned char **)&empty_ptr; - in_stop = empty_ptr; - } - - if (!out_pos) { - out_pos = &empty_ptr; - out_stop = empty_ptr; - } - in_p = inchar_start = *in_pos; out_p = *out_pos; @@ -466,7 +507,7 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, tc->recognized_len -= readagain_len; \ tc->readagain_len = readagain_len; \ } \ - return ret; \ + return (ret); \ resume_label ## num:; \ } while (0) #define SUSPEND_OBUF(num) \ @@ -515,12 +556,18 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, case 26: goto resume_label26; case 27: goto resume_label27; case 28: goto resume_label28; + case 29: goto resume_label29; + case 30: goto resume_label30; + case 31: goto resume_label31; + case 32: goto resume_label32; + case 33: goto resume_label33; + case 34: goto resume_label34; } while (1) { inchar_start = in_p; tc->recognized_len = 0; - next_table = tr->conv_tree_start; + next_table = tr->conv_tree_start; SUSPEND_AFTER_OUTPUT(24); @@ -540,7 +587,7 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, #define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE]) #define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))]) - next_byte = (unsigned char)*in_p++; + next_byte = (unsigned char)*in_p++; follow_byte: if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte) next_info = INVALID; @@ -548,52 +595,70 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, next_info = (VALUE)BL_ACTION(next_byte); } follow_info: - switch (next_info & 0x1F) { - case NOMAP: /* xxx: copy last byte only? 
*/ - SUSPEND_OBUF(3); *out_p++ = next_byte; - continue; - case 0x00: case 0x04: case 0x08: case 0x0C: - case 0x10: case 0x14: case 0x18: case 0x1C: + switch (next_info & 0x1F) { + case NOMAP: + { + const unsigned char *p = inchar_start; + writebuf_off = 0; + while (p < in_p) { + TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++; + } + writebuf_len = writebuf_off; + writebuf_off = 0; + while (writebuf_off < writebuf_len) { + SUSPEND_OBUF(3); + *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++]; + } + } + continue; + case 0x00: case 0x04: case 0x08: case 0x0C: + case 0x10: case 0x14: case 0x18: case 0x1C: SUSPEND_AFTER_OUTPUT(25); - while (in_p >= in_stop) { + while (in_p >= in_stop) { if (!(opt & ECONV_PARTIAL_INPUT)) goto incomplete; SUSPEND(econv_source_buffer_empty, 5); - } - next_byte = (unsigned char)*in_p++; - next_table = next_info; - goto follow_byte; - case ZERObt: /* drop input */ - continue; - case ONEbt: + } + next_byte = (unsigned char)*in_p++; + next_table = (unsigned int)next_info; + goto follow_byte; + case ZERObt: /* drop input */ + continue; + case ONEbt: SUSPEND_OBUF(9); *out_p++ = getBT1(next_info); - continue; - case TWObt: + continue; + case TWObt: SUSPEND_OBUF(10); *out_p++ = getBT1(next_info); SUSPEND_OBUF(21); *out_p++ = getBT2(next_info); - continue; - case THREEbt: + continue; + case THREEbt: SUSPEND_OBUF(11); *out_p++ = getBT1(next_info); SUSPEND_OBUF(15); *out_p++ = getBT2(next_info); SUSPEND_OBUF(16); *out_p++ = getBT3(next_info); - continue; - case FOURbt: + continue; + case FOURbt: SUSPEND_OBUF(12); *out_p++ = getBT0(next_info); SUSPEND_OBUF(17); *out_p++ = getBT1(next_info); SUSPEND_OBUF(18); *out_p++ = getBT2(next_info); SUSPEND_OBUF(19); *out_p++ = getBT3(next_info); - continue; + continue; + case GB4bt: + SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info); + SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info); + SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info); + SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info); + 
continue; case STR1: - next_byte = 0; /* index */ - while (next_byte < BYTE_ADDR(STR1_BYTEINDEX(next_info))[0]) { - SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+next_byte]; - next_byte++; + tc->output_index = 0; + while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) { + SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index]; + tc->output_index++; } continue; - case FUNii: - next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info); - goto follow_info; - case FUNsi: + case FUNii: + next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info); + goto follow_info; + case FUNsi: { const unsigned char *char_start; size_t char_len; @@ -601,7 +666,7 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len); goto follow_info; } - case FUNio: + case FUNio: SUSPEND_OBUF(13); if (tr->max_output <= out_stop - out_p) out_p += tr->func_io(TRANSCODING_STATE(tc), @@ -616,8 +681,8 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++]; } } - break; - case FUNso: + break; + case FUNso: { const unsigned char *char_start; size_t char_len; @@ -641,7 +706,31 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, } break; } - case INVALID: + case FUNsio: + { + const unsigned char *char_start; + size_t char_len; + SUSPEND_OBUF(33); + if (tr->max_output <= out_stop - out_p) { + char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len); + out_p += tr->func_sio(TRANSCODING_STATE(tc), + char_start, (size_t)char_len, next_info, + out_p, out_stop - out_p); + } + else { + char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len); + writebuf_len = tr->func_sio(TRANSCODING_STATE(tc), + char_start, (size_t)char_len, next_info, + TRANSCODING_WRITEBUF(tc), 
TRANSCODING_WRITEBUF_SIZE(tc)); + writebuf_off = 0; + while (writebuf_off < writebuf_len) { + SUSPEND_OBUF(34); + *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++]; + } + } + break; + } + case INVALID: if (tc->recognized_len + (in_p - inchar_start) <= unitlen) { if (tc->recognized_len + (in_p - inchar_start) < unitlen) SUSPEND_AFTER_OUTPUT(26); @@ -657,17 +746,19 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, } } else { - int invalid_len; /* including the last byte which causes invalid */ - int discard_len; + ssize_t invalid_len; /* including the last byte which causes invalid */ + ssize_t discard_len; invalid_len = tc->recognized_len + (in_p - inchar_start); discard_len = ((invalid_len - 1) / unitlen) * unitlen; readagain_len = invalid_len - discard_len; } goto invalid; - case UNDEF: - goto undef; - } - continue; + case UNDEF: + goto undef; + default: + rb_raise(rb_eRuntimeError, "unknown transcoding instruction"); + } + continue; invalid: SUSPEND(econv_invalid_byte_sequence, 1); @@ -753,10 +844,10 @@ rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags) tc->readagain_len = 0; tc->writebuf_len = 0; tc->writebuf_off = 0; - if (sizeof(tc->readbuf.ary) < tr->max_input) { + if ((int)sizeof(tc->readbuf.ary) < tr->max_input) { tc->readbuf.ptr = xmalloc(tr->max_input); } - if (sizeof(tc->writebuf.ary) < tr->max_output) { + if ((int)sizeof(tc->writebuf.ary) < tr->max_output) { tc->writebuf.ptr = xmalloc(tr->max_output); } return tc; @@ -783,13 +874,31 @@ rb_transcoding_close(rb_transcoding *tc) } if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) xfree(tc->state.ptr); - if (sizeof(tc->readbuf.ary) < tr->max_input) + if ((int)sizeof(tc->readbuf.ary) < tr->max_input) xfree(tc->readbuf.ptr); - if (sizeof(tc->writebuf.ary) < tr->max_output) + if ((int)sizeof(tc->writebuf.ary) < tr->max_output) xfree(tc->writebuf.ptr); xfree(tc); } +static size_t +rb_transcoding_memsize(rb_transcoding *tc) +{ + size_t size = 
sizeof(rb_transcoding); + const rb_transcoder *tr = tc->transcoder; + + if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) { + size += tr->state_size; + } + if ((int)sizeof(tc->readbuf.ary) < tr->max_input) { + size += tr->max_input; + } + if ((int)sizeof(tc->writebuf.ary) < tr->max_output) { + size += tr->max_output; + } + return size; +} + static rb_econv_t * rb_econv_alloc(int n_hint) { @@ -917,28 +1026,14 @@ rb_econv_open0(const char *sname, const char *dname, int ecflags) int num_trans; rb_econv_t *ec; - rb_encoding *senc, *denc; - int sidx, didx; - - senc = NULL; - if (*sname) { - sidx = rb_enc_find_index(sname); - if (0 <= sidx) { - senc = rb_enc_from_index(sidx); - } - } - - denc = NULL; - if (*dname) { - didx = rb_enc_find_index(dname); - if (0 <= didx) { - denc = rb_enc_from_index(didx); - } - } + // loads encodings if not loaded already + if (*sname) rb_enc_find_index(sname); + if (*dname) rb_enc_find_index(dname); if (*sname == '\0' && *dname == '\0') { num_trans = 0; entries = NULL; + sname = dname = ""; } else { struct trans_open_t toarg; @@ -971,13 +1066,16 @@ decorator_names(int ecflags, const char **decorators_ret) { int num_decorators; - if ((ecflags & ECONV_CRLF_NEWLINE_DECORATOR) && - (ecflags & ECONV_CR_NEWLINE_DECORATOR)) - return -1; - - if ((ecflags & (ECONV_CRLF_NEWLINE_DECORATOR|ECONV_CR_NEWLINE_DECORATOR)) && - (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR)) + switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) { + case ECONV_UNIVERSAL_NEWLINE_DECORATOR: + case ECONV_CRLF_NEWLINE_DECORATOR: + case ECONV_CR_NEWLINE_DECORATOR: + case ECONV_LF_NEWLINE_DECORATOR: + case 0: + break; + default: return -1; + } if ((ecflags & ECONV_XML_TEXT_DECORATOR) && (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR)) @@ -996,6 +1094,8 @@ decorator_names(int ecflags, const char **decorators_ret) decorators_ret[num_decorators++] = "crlf_newline"; if (ecflags & ECONV_CR_NEWLINE_DECORATOR) decorators_ret[num_decorators++] = "cr_newline"; + if (ecflags & 
ECONV_LF_NEWLINE_DECORATOR) + decorators_ret[num_decorators++] = "lf_newline"; if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) decorators_ret[num_decorators++] = "universal_newline"; @@ -1015,18 +1115,20 @@ rb_econv_open(const char *sname, const char *dname, int ecflags) return NULL; ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK); - if (!ec) - return NULL; - - for (i = 0; i < num_decorators; i++) - if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) { - rb_econv_close(ec); - return NULL; + if (ec) { + for (i = 0; i < num_decorators; i++) { + if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) { + rb_econv_close(ec); + ec = NULL; + break; + } } + } - ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK; - - return ec; + if (ec) { + ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK; + } + return ec; // can be NULL } static int @@ -1065,8 +1167,8 @@ trans_sweep(rb_econv_t *ec, } else { if (te->out_buf_start != te->out_data_start) { - int len = te->out_data_end - te->out_data_start; - int off = te->out_data_start - te->out_buf_start; + ssize_t len = te->out_data_end - te->out_data_start; + ssize_t off = te->out_data_start - te->out_buf_start; MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len); te->out_data_start = te->out_buf_start; te->out_data_end -= off; @@ -1137,7 +1239,6 @@ rb_trans_conv(rb_econv_t *ec, if (ec->elems[0].last_result == econv_after_output) ec->elems[0].last_result = econv_source_buffer_empty; - needreport_index = -1; for (i = ec->num_trans-1; 0 <= i; i--) { switch (ec->elems[i].last_result) { case econv_invalid_byte_sequence: @@ -1146,7 +1247,6 @@ rb_trans_conv(rb_econv_t *ec, case econv_after_output: case econv_finished: sweep_start = i+1; - needreport_index = i; goto found_needreport; case econv_destination_buffer_full: @@ -1273,13 +1373,13 @@ rb_econv_convert0(rb_econv_t *ec, memcpy(*output_ptr, data_start, len); *output_ptr += len; ec->elems[ec->num_trans-1].out_data_start = - 
ec->elems[ec->num_trans-1].out_data_end = + ec->elems[ec->num_trans-1].out_data_end = ec->elems[ec->num_trans-1].out_buf_start; has_output = 1; } } - if (ec->in_buf_start && + if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) { res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop, (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position); @@ -1409,22 +1509,22 @@ rb_econv_convert(rb_econv_t *ec, if (ret == econv_invalid_byte_sequence || ret == econv_incomplete_input) { - /* deal with invalid byte sequence */ - /* todo: add more alternative behaviors */ + /* deal with invalid byte sequence */ + /* todo: add more alternative behaviors */ switch (ec->flags & ECONV_INVALID_MASK) { case ECONV_INVALID_REPLACE: - if (output_replacement_character(ec) == 0) + if (output_replacement_character(ec) == 0) goto resume; - } + } } if (ret == econv_undefined_conversion) { - /* valid character in source encoding - * but no related character(s) in destination encoding */ - /* todo: add more alternative behaviors */ + /* valid character in source encoding + * but no related character(s) in destination encoding */ + /* todo: add more alternative behaviors */ switch (ec->flags & ECONV_UNDEF_MASK) { case ECONV_UNDEF_REPLACE: - if (output_replacement_character(ec) == 0) + if (output_replacement_character(ec) == 0) goto resume; break; @@ -1523,7 +1623,7 @@ allocate_converted_string(const char *sname, const char *dname, /* result: 0:success -1:failure */ int -rb_econv_insert_output(rb_econv_t *ec, +rb_econv_insert_output(rb_econv_t *ec, const unsigned char *str, size_t len, const char *str_encoding) { const char *insert_encoding = rb_econv_encoding_to_insert_output(ec); @@ -1602,11 +1702,11 @@ rb_econv_insert_output(rb_econv_t *ec, *data_end_p = buf; *buf_end_p = buf+need; } - else if (*buf_end_p - *data_end_p < need) { + else if ((size_t)(*buf_end_p - *data_end_p) < need) { MEMMOVE(*buf_start_p, *data_start_p, 
unsigned char, *data_end_p - *data_start_p); *data_end_p = *buf_start_p + (*data_end_p - *data_start_p); *data_start_p = *buf_start_p; - if (*buf_end_p - *data_end_p < need) { + if ((size_t)(*buf_end_p - *data_end_p) < need) { unsigned char *buf; size_t s = (*data_end_p - *buf_start_p) + need; if (s < need) @@ -1647,20 +1747,44 @@ rb_econv_close(rb_econv_t *ec) } for (i = 0; i < ec->num_trans; i++) { rb_transcoding_close(ec->elems[i].tc); - if (ec->elems[i].out_buf_start) - xfree(ec->elems[i].out_buf_start); + xfree(ec->elems[i].out_buf_start); } xfree(ec->in_buf_start); xfree(ec->elems); xfree(ec); } +size_t +rb_econv_memsize(rb_econv_t *ec) +{ + size_t size = sizeof(rb_econv_t); + int i; + + if (ec->replacement_allocated) { + size += ec->replacement_len; + } + for (i = 0; i < ec->num_trans; i++) { + size += rb_transcoding_memsize(ec->elems[i].tc); + + if (ec->elems[i].out_buf_start) { + size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start; + } + } + size += ec->in_buf_end - ec->in_buf_start; + size += sizeof(rb_econv_elem_t) * ec->num_allocated; + + return size; +} + int rb_econv_putbackable(rb_econv_t *ec) { if (ec->num_trans == 0) return 0; - return ec->elems[0].tc->readagain_len; +#if SIZEOF_SIZE_T > SIZEOF_INT + if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX; +#endif + return (int)ec->elems[0].tc->readagain_len; } void @@ -1701,40 +1825,72 @@ rb_econv_asciicompat_encoding(const char *ascii_incompat_name) { st_data_t v; st_table *table2; - struct asciicompat_encoding_t data; - - if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v)) - return NULL; - table2 = (st_table *)v; + struct asciicompat_encoding_t data = {0}; + + unsigned int lev; + RB_VM_LOCK_ENTER_LEV(&lev); + { + if (st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v)) { + table2 = (st_table *)v; + /* + * Assumption: + * There is at most one transcoder for + * converting from ASCII incompatible encoding. 
+ * + * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others. + */ + if (table2->num_entries == 1) { + data.ascii_incompat_name = ascii_incompat_name; + data.ascii_compat_name = NULL; + if (rb_multi_ractor_p()) { + /* + * We need to unlock in case `load_transcoder_entry` actually loads the encoding + * and table2 could be inserted into when we unlock. + */ + st_table *dup_table2 = st_copy(table2); + RB_VM_LOCK_LEAVE_LEV(&lev); + st_foreach(dup_table2, asciicompat_encoding_i, (st_data_t)&data); + st_free_table(dup_table2); + RB_VM_LOCK_ENTER_LEV(&lev); + } + else { + st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data); + } + } - /* - * Assumption: - * There are at most one transcoder for - * converting from ASCII incompatible encoding. - * - * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others. - */ - if (table2->num_entries != 1) - return NULL; + } + } + RB_VM_LOCK_LEAVE_LEV(&lev); - data.ascii_incompat_name = ascii_incompat_name; - data.ascii_compat_name = NULL; - st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data); - return data.ascii_compat_name; + return data.ascii_compat_name; // can be NULL } +/* + * Append `len` bytes pointed by `ss` to `dst` with converting with `ec`. + * + * If the result of the conversion is not compatible with the encoding of + * `dst`, `dst` may not be valid encoding. 
+ */ VALUE -rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags) +rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags) { - unsigned const char *ss, *sp, *se; + unsigned const char *sp, *se; unsigned char *ds, *dp, *de; rb_econv_result_t res; int max_output; + enum ruby_coderange_type coderange; + rb_encoding *dst_enc = ec->destination_encoding; if (NIL_P(dst)) { dst = rb_str_buf_new(len); - if (ec->destination_encoding) - rb_enc_associate(dst, ec->destination_encoding); + if (dst_enc) { + rb_enc_associate(dst, dst_enc); + } + coderange = ENC_CODERANGE_7BIT; // scan from the start + } + else { + dst_enc = rb_enc_get(dst); + coderange = rb_enc_str_coderange(dst); } if (ec->last_tc) @@ -1742,28 +1898,48 @@ rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, else max_output = 1; - res = econv_destination_buffer_full; - while (res == econv_destination_buffer_full) { + do { + int cr; long dlen = RSTRING_LEN(dst); if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) { unsigned long new_capa = (unsigned long)dlen + len + max_output; if (LONG_MAX < new_capa) rb_raise(rb_eArgError, "too long string"); - rb_str_resize(dst, new_capa); - rb_str_set_len(dst, dlen); + rb_str_modify_expand(dst, new_capa - dlen); } - ss = sp = (const unsigned char *)RSTRING_PTR(src) + off; - se = ss + len; + sp = (const unsigned char *)ss; + se = sp + len; ds = (unsigned char *)RSTRING_PTR(dst); de = ds + rb_str_capacity(dst); dp = ds += dlen; res = rb_econv_convert(ec, &sp, se, &dp, de, flags); - off += sp - ss; - len -= sp - ss; + switch (coderange) { + case ENC_CODERANGE_7BIT: + case ENC_CODERANGE_VALID: + cr = (int)coderange; + rb_str_coderange_scan_restartable((char *)ds, (char *)dp, dst_enc, &cr); + coderange = cr; + ENC_CODERANGE_SET(dst, coderange); + break; + case ENC_CODERANGE_UNKNOWN: + case ENC_CODERANGE_BROKEN: + break; + } + len -= (const char *)sp - ss; + ss = (const char *)sp; 
rb_str_set_len(dst, dlen + (dp - ds)); rb_econv_check_error(ec); - } + } while (res == econv_destination_buffer_full); + + return dst; +} +VALUE +rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags) +{ + src = rb_str_new_frozen(src); + dst = rb_econv_append(ec, RSTRING_PTR(src) + off, len, dst, flags); + RB_GC_GUARD(src); return dst; } @@ -1789,20 +1965,17 @@ static int rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n) { transcoder_entry_t *entry; - const rb_transcoder *tr; + const rb_transcoder *tr = NULL; if (ec->started != 0) return -1; entry = get_transcoder_entry(sname, dname); - if (!entry) - return -1; - - tr = load_transcoder_entry(entry); - if (!entry) - return -1; + if (entry) { + tr = load_transcoder_entry(entry); + } - return rb_econv_add_transcoder_at(ec, tr, n); + return tr ? rb_econv_add_transcoder_at(ec, tr, n) : -1; } static int @@ -1848,48 +2021,40 @@ rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name) void rb_econv_binmode(rb_econv_t *ec) { - const rb_transcoder *trs[3]; - int n, i, j; - transcoder_entry_t *entry; - int num_trans; - - n = 0; - if (ec->flags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) { - entry = get_transcoder_entry("", "universal_newline"); - if (entry->transcoder) - trs[n++] = entry->transcoder; - } - if (ec->flags & ECONV_CRLF_NEWLINE_DECORATOR) { - entry = get_transcoder_entry("", "crlf_newline"); - if (entry->transcoder) - trs[n++] = entry->transcoder; - } - if (ec->flags & ECONV_CR_NEWLINE_DECORATOR) { - entry = get_transcoder_entry("", "cr_newline"); - if (entry->transcoder) - trs[n++] = entry->transcoder; - } - - num_trans = ec->num_trans; - j = 0; - for (i = 0; i < num_trans; i++) { - int k; - for (k = 0; k < n; k++) - if (trs[k] == ec->elems[i].tc->transcoder) - break; - if (k == n) { - ec->elems[j] = ec->elems[i]; - j++; - } - else { - rb_transcoding_close(ec->elems[i].tc); - xfree(ec->elems[i].out_buf_start); - ec->num_trans--; + const 
char *dname = 0; + + switch (ec->flags & ECONV_NEWLINE_DECORATOR_MASK) { + case ECONV_UNIVERSAL_NEWLINE_DECORATOR: + dname = "universal_newline"; + break; + case ECONV_CRLF_NEWLINE_DECORATOR: + dname = "crlf_newline"; + break; + case ECONV_CR_NEWLINE_DECORATOR: + dname = "cr_newline"; + break; + case ECONV_LF_NEWLINE_DECORATOR: + dname = "lf_newline"; + break; + } + + if (dname) { + const rb_transcoder *transcoder = get_transcoder_entry("", dname)->transcoder; + int num_trans = ec->num_trans; + int i, j = 0; + + for (i=0; i < num_trans; i++) { + if (transcoder == ec->elems[i].tc->transcoder) { + rb_transcoding_close(ec->elems[i].tc); + xfree(ec->elems[i].out_buf_start); + ec->num_trans--; + } + else + ec->elems[j++] = ec->elems[i]; } } - ec->flags &= ~(ECONV_UNIVERSAL_NEWLINE_DECORATOR|ECONV_CRLF_NEWLINE_DECORATOR|ECONV_CR_NEWLINE_DECORATOR); - + ec->flags &= ~ECONV_NEWLINE_DECORATOR_MASK; } static VALUE @@ -1910,9 +2075,7 @@ econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg) has_description = 1; } - if (ecflags & (ECONV_UNIVERSAL_NEWLINE_DECORATOR| - ECONV_CRLF_NEWLINE_DECORATOR| - ECONV_CR_NEWLINE_DECORATOR| + if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK| ECONV_XML_TEXT_DECORATOR| ECONV_XML_ATTR_CONTENT_DECORATOR| ECONV_XML_ATTR_QUOTE_DECORATOR)) { @@ -1931,6 +2094,10 @@ econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg) rb_str_cat2(mesg, pre); pre = ","; rb_str_cat2(mesg, "cr_newline"); } + if (ecflags & ECONV_LF_NEWLINE_DECORATOR) { + rb_str_cat2(mesg, pre); pre = ","; + rb_str_cat2(mesg, "lf_newline"); + } if (ecflags & ECONV_XML_TEXT_DECORATOR) { rb_str_cat2(mesg, pre); pre = ","; rb_str_cat2(mesg, "xml_text"); @@ -1959,7 +2126,7 @@ rb_econv_open_exc(const char *sname, const char *dname, int ecflags) mesg = rb_str_new_cstr("code converter not found ("); econv_description(sname, dname, ecflags, mesg); rb_str_cat2(mesg, ")"); - exc = rb_exc_new3(rb_eNoConverter, mesg); + exc = 
rb_exc_new3(rb_eConverterNotFoundError, mesg); return exc; } @@ -1976,7 +2143,6 @@ make_econv_exception(rb_econv_t *ec) size_t readagain_len = ec->last_error.readagain_len; VALUE bytes2 = Qnil; VALUE dumped2; - int idx; if (ec->last_error.result == econv_incomplete_input) { mesg = rb_sprintf("incomplete %s on %s", StringValueCStr(dumped), @@ -1996,46 +2162,79 @@ make_econv_exception(rb_econv_t *ec) ec->last_error.source_encoding); } - exc = rb_exc_new3(rb_eInvalidByteSequence, mesg); - rb_ivar_set(exc, rb_intern("error_bytes"), bytes); - rb_ivar_set(exc, rb_intern("readagain_bytes"), bytes2); - rb_ivar_set(exc, rb_intern("incomplete_input"), ec->last_error.result == econv_incomplete_input ? Qtrue : Qfalse); - - set_encs: - rb_ivar_set(exc, rb_intern("source_encoding_name"), rb_str_new2(ec->last_error.source_encoding)); - rb_ivar_set(exc, rb_intern("destination_encoding_name"), rb_str_new2(ec->last_error.destination_encoding)); - idx = rb_enc_find_index(ec->last_error.source_encoding); - if (0 <= idx) - rb_ivar_set(exc, rb_intern("source_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx))); - idx = rb_enc_find_index(ec->last_error.destination_encoding); - if (0 <= idx) - rb_ivar_set(exc, rb_intern("destination_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx))); - return exc; + exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg); + rb_ivar_set(exc, id_error_bytes, bytes); + rb_ivar_set(exc, id_readagain_bytes, bytes2); + rb_ivar_set(exc, id_incomplete_input, RBOOL(ec->last_error.result == econv_incomplete_input)); + goto set_encs; } if (ec->last_error.result == econv_undefined_conversion) { VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len); - VALUE dumped; + VALUE dumped = Qnil; int idx; - dumped = rb_str_dump(bytes); - mesg = rb_sprintf("%s from %s to %s", - StringValueCStr(dumped), - ec->last_error.source_encoding, - ec->last_error.destination_encoding); - exc = 
rb_exc_new3(rb_eConversionUndefined, mesg); + if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) { + rb_encoding *utf8 = rb_utf8_encoding(); + const char *start, *end; + int n; + start = (const char *)ec->last_error.error_bytes_start; + end = start + ec->last_error.error_bytes_len; + n = rb_enc_precise_mbclen(start, end, utf8); + if (MBCLEN_CHARFOUND_P(n) && + (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) { + unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8); + dumped = rb_sprintf("U+%04X", cc); + } + } + if (NIL_P(dumped)) + dumped = rb_str_dump(bytes); + if (strcmp(ec->last_error.source_encoding, + ec->source_encoding_name) == 0 && + strcmp(ec->last_error.destination_encoding, + ec->destination_encoding_name) == 0) { + mesg = rb_sprintf("%s from %s to %s", + StringValueCStr(dumped), + ec->last_error.source_encoding, + ec->last_error.destination_encoding); + } + else { + int i; + mesg = rb_sprintf("%s to %s in conversion from %s", + StringValueCStr(dumped), + ec->last_error.destination_encoding, + ec->source_encoding_name); + for (i = 0; i < ec->num_trans; i++) { + const rb_transcoder *tr = ec->elems[i].tc->transcoder; + if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding)) + rb_str_catf(mesg, " to %s", + ec->elems[i].tc->transcoder->dst_encoding); + } + } + exc = rb_exc_new3(rb_eUndefinedConversionError, mesg); idx = rb_enc_find_index(ec->last_error.source_encoding); if (0 <= idx) rb_enc_associate_index(bytes, idx); - rb_ivar_set(exc, rb_intern("error_char"), bytes); + rb_ivar_set(exc, id_error_char, bytes); goto set_encs; } return Qnil; + + set_encs: + rb_ivar_set(exc, id_source_encoding_name, rb_str_new2(ec->last_error.source_encoding)); + rb_ivar_set(exc, id_destination_encoding_name, rb_str_new2(ec->last_error.destination_encoding)); + int idx = rb_enc_find_index(ec->last_error.source_encoding); + if (0 <= idx) + rb_ivar_set(exc, id_source_encoding, rb_enc_from_encoding(rb_enc_from_index(idx))); + idx = 
rb_enc_find_index(ec->last_error.destination_encoding); + if (0 <= idx) + rb_ivar_set(exc, id_destination_encoding, rb_enc_from_encoding(rb_enc_from_index(idx))); + return exc; } static void more_output_buffer( VALUE destination, - unsigned char *(*resize_destination)(VALUE, int, int), + unsigned char *(*resize_destination)(VALUE, size_t, size_t), int max_output, unsigned char **out_start_ptr, unsigned char **out_pos, @@ -2053,7 +2252,6 @@ make_replacement(rb_econv_t *ec) { rb_transcoding *tc; const rb_transcoder *tr; - rb_encoding *enc; const unsigned char *replacement; const char *repl_enc; const char *ins_enc; @@ -2067,7 +2265,7 @@ make_replacement(rb_econv_t *ec) tc = ec->last_tc; if (*ins_enc) { tr = tc->transcoder; - enc = rb_enc_find(tr->dst_encoding); + rb_enc_find(tr->dst_encoding); replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc); } else { @@ -2093,7 +2291,7 @@ rb_econv_set_replacement(rb_econv_t *ec, encname2 = rb_econv_encoding_to_insert_output(ec); - if (encoding_equal(encname, encname2)) { + if (!*encname2 || encoding_equal(encname, encname2)) { str2 = xmalloc(len); MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */ len2 = len; @@ -2131,11 +2329,51 @@ output_replacement_character(rb_econv_t *ec) } #if 1 +#define hash_fallback rb_hash_aref + +static VALUE +proc_fallback(VALUE fallback, VALUE c) +{ + return rb_proc_call(fallback, rb_ary_new4(1, &c)); +} + +static VALUE +method_fallback(VALUE fallback, VALUE c) +{ + return rb_method_call(1, &c, fallback); +} + +static VALUE +aref_fallback(VALUE fallback, VALUE c) +{ + return rb_funcallv_public(fallback, idAREF, 1, &c); +} + +struct transcode_loop_fallback_args { + VALUE (*fallback_func)(VALUE, VALUE); + VALUE fallback; + VALUE rep; +}; + +static VALUE +transcode_loop_fallback_try(VALUE a) +{ + struct transcode_loop_fallback_args *args = (struct transcode_loop_fallback_args *)a; + + VALUE ret = args->fallback_func(args->fallback, args->rep); + + if 
(!UNDEF_P(ret) && !NIL_P(ret)) { + StringValue(ret); + } + + return ret; +} + static void transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, - const unsigned char *in_stop, unsigned char *out_stop, + const unsigned char *in_stop, unsigned char *out_stop, VALUE destination, - unsigned char *(*resize_destination)(VALUE, int, int), + unsigned char *(*resize_destination)(VALUE, size_t, size_t), const char *src_encoding, const char *dst_encoding, int ecflags, @@ -2147,23 +2385,71 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, unsigned char *out_start = *out_pos; int max_output; VALUE exc; + VALUE fallback = Qnil; + VALUE (*fallback_func)(VALUE, VALUE) = 0; ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts); if (!ec) rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags)); + if (!NIL_P(ecopts) && RB_TYPE_P(ecopts, T_HASH)) { + fallback = rb_hash_aref(ecopts, sym_fallback); + if (RB_TYPE_P(fallback, T_HASH)) { + fallback_func = hash_fallback; + } + else if (rb_obj_is_proc(fallback)) { + fallback_func = proc_fallback; + } + else if (rb_obj_is_method(fallback)) { + fallback_func = method_fallback; + } + else { + fallback_func = aref_fallback; + } + } last_tc = ec->last_tc; max_output = last_tc ? 
last_tc->transcoder->max_output : 1; resume: ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0); + if (!NIL_P(fallback) && ret == econv_undefined_conversion) { + VALUE rep = rb_enc_str_new( + (const char *)ec->last_error.error_bytes_start, + ec->last_error.error_bytes_len, + rb_enc_find(ec->last_error.source_encoding)); + + + struct transcode_loop_fallback_args args = { + .fallback_func = fallback_func, + .fallback = fallback, + .rep = rep, + }; + + int state; + rep = rb_protect(transcode_loop_fallback_try, (VALUE)&args, &state); + if (state) { + rb_econv_close(ec); + rb_jump_tag(state); + } + + if (!UNDEF_P(rep) && !NIL_P(rep)) { + ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep), + RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep))); + if ((int)ret == -1) { + rb_econv_close(ec); + rb_raise(rb_eArgError, "too big fallback string"); + } + goto resume; + } + } + if (ret == econv_invalid_byte_sequence || ret == econv_incomplete_input || ret == econv_undefined_conversion) { exc = make_econv_exception(ec); rb_econv_close(ec); - rb_exc_raise(exc); + rb_exc_raise(exc); } if (ret == econv_destination_buffer_full) { @@ -2178,9 +2464,9 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, /* sample transcode_loop implementation in byte-by-byte stream style */ static void transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, - const unsigned char *in_stop, unsigned char *out_stop, + const unsigned char *in_stop, unsigned char *out_stop, VALUE destination, - unsigned char *(*resize_destination)(VALUE, int, int), + unsigned char *(*resize_destination)(VALUE, size_t, size_t), const char *src_encoding, const char *dst_encoding, int ecflags, @@ -2253,17 +2539,17 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, */ static unsigned char * -str_transcoding_resize(VALUE destination, int len, int new_len) +str_transcoding_resize(VALUE destination, size_t len, size_t new_len) { 
rb_str_resize(destination, new_len); return (unsigned char *)RSTRING_PTR(destination); } static int -econv_opts(VALUE opt) +econv_opts(VALUE opt, int ecflags) { VALUE v; - int ecflags = 0; + int newlineflag = 0; v = rb_hash_aref(opt, sym_invalid); if (NIL_P(v)) { @@ -2285,62 +2571,129 @@ econv_opts(VALUE opt) rb_raise(rb_eArgError, "unknown value for undefined character option"); } + v = rb_hash_aref(opt, sym_replace); + if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) { + ecflags |= ECONV_UNDEF_REPLACE; + } + v = rb_hash_aref(opt, sym_xml); if (!NIL_P(v)) { - v = rb_convert_type(v, T_SYMBOL, "Symbol", "to_sym"); if (v==sym_text) { ecflags |= ECONV_XML_TEXT_DECORATOR|ECONV_UNDEF_HEX_CHARREF; } else if (v==sym_attr) { ecflags |= ECONV_XML_ATTR_CONTENT_DECORATOR|ECONV_XML_ATTR_QUOTE_DECORATOR|ECONV_UNDEF_HEX_CHARREF; } + else if (SYMBOL_P(v)) { + rb_raise(rb_eArgError, "unexpected value for xml option: %"PRIsVALUE, rb_sym2str(v)); + } + else { + rb_raise(rb_eArgError, "unexpected value for xml option"); + } + } + +#ifdef ENABLE_ECONV_NEWLINE_OPTION + v = rb_hash_aref(opt, sym_newline); + if (!NIL_P(v)) { + newlineflag = 2; + ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK; + if (v == sym_universal) { + ecflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR; + } + else if (v == sym_crlf) { + ecflags |= ECONV_CRLF_NEWLINE_DECORATOR; + } + else if (v == sym_cr) { + ecflags |= ECONV_CR_NEWLINE_DECORATOR; + } + else if (v == sym_lf) { + ecflags |= ECONV_LF_NEWLINE_DECORATOR; + } + else if (SYMBOL_P(v)) { + rb_raise(rb_eArgError, "unexpected value for newline option: %"PRIsVALUE, + rb_sym2str(v)); + } else { - rb_raise(rb_eArgError, "unexpected value for xml option: %s", rb_id2name(SYM2ID(v))); + rb_raise(rb_eArgError, "unexpected value for newline option"); } } +#endif + { + int setflags = 0; + + v = rb_hash_aref(opt, sym_universal_newline); + if (RTEST(v)) + setflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR; + newlineflag |= !NIL_P(v); + + v = rb_hash_aref(opt, sym_crlf_newline); + if 
(RTEST(v)) + setflags |= ECONV_CRLF_NEWLINE_DECORATOR; + newlineflag |= !NIL_P(v); - v = rb_hash_aref(opt, sym_universal_newline); - if (RTEST(v)) - ecflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR; + v = rb_hash_aref(opt, sym_cr_newline); + if (RTEST(v)) + setflags |= ECONV_CR_NEWLINE_DECORATOR; + newlineflag |= !NIL_P(v); + + v = rb_hash_aref(opt, sym_lf_newline); + if (RTEST(v)) + setflags |= ECONV_LF_NEWLINE_DECORATOR; + newlineflag |= !NIL_P(v); - v = rb_hash_aref(opt, sym_crlf_newline); - if (RTEST(v)) - ecflags |= ECONV_CRLF_NEWLINE_DECORATOR; + switch (newlineflag) { + case 1: + ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK; + ecflags |= setflags; + break; - v = rb_hash_aref(opt, sym_cr_newline); - if (RTEST(v)) - ecflags |= ECONV_CR_NEWLINE_DECORATOR; + case 3: + rb_warning(":newline option precedes other newline options"); + break; + } + } return ecflags; } int -rb_econv_prepare_opts(VALUE opthash, VALUE *opts) +rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags) { - int ecflags; VALUE newhash = Qnil; + VALUE v; + if (NIL_P(opthash)) { *opts = Qnil; - return 0; + return ecflags; } - ecflags = econv_opts(opthash); - - if ((ecflags & ECONV_INVALID_MASK) == ECONV_INVALID_REPLACE || - (ecflags & ECONV_UNDEF_MASK) == ECONV_UNDEF_REPLACE) { - VALUE v = rb_hash_aref(opthash, sym_replace); - if (!NIL_P(v)) { - StringValue(v); - if (rb_enc_str_coderange(v) == ENC_CODERANGE_BROKEN) { - VALUE dumped = rb_str_dump(v); - rb_raise(rb_eArgError, "replacement string is broken: %s as %s", - StringValueCStr(dumped), - rb_enc_name(rb_enc_get(v))); - } - v = rb_str_new_frozen(v); - newhash = rb_hash_new(); - rb_hash_aset(newhash, sym_replace, v); + ecflags = econv_opts(opthash, ecflags); + + v = rb_hash_aref(opthash, sym_replace); + if (!NIL_P(v)) { + StringValue(v); + if (is_broken_string(v)) { + VALUE dumped = rb_str_dump(v); + rb_raise(rb_eArgError, "replacement string is broken: %s as %s", + StringValueCStr(dumped), + rb_enc_name(rb_enc_get(v))); + } + v = 
rb_str_new_frozen(v); + newhash = rb_hash_new(); + rb_hash_aset(newhash, sym_replace, v); + } + + v = rb_hash_aref(opthash, sym_fallback); + if (!NIL_P(v)) { + VALUE h = rb_check_hash_type(v); + if (NIL_P(h) + ? (rb_obj_is_proc(v) || rb_obj_is_method(v) || rb_respond_to(v, idAREF)) + : (v = h, 1)) { + if (NIL_P(newhash)) + newhash = rb_hash_new(); + rb_hash_aset(newhash, sym_fallback, v); } } + if (!NIL_P(newhash)) rb_hash_freeze(newhash); *opts = newhash; @@ -2348,6 +2701,12 @@ rb_econv_prepare_opts(VALUE opthash, VALUE *opts) return ecflags; } +int +rb_econv_prepare_opts(VALUE opthash, VALUE *opts) +{ + return rb_econv_prepare_options(opthash, opts, 0); +} + rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash) { @@ -2358,47 +2717,46 @@ rb_econv_open_opts(const char *source_encoding, const char *destination_encoding replacement = Qnil; } else { - if (TYPE(opthash) != T_HASH || !OBJ_FROZEN(opthash)) + if (!RB_TYPE_P(opthash, T_HASH) || !OBJ_FROZEN(opthash)) rb_bug("rb_econv_open_opts called with invalid opthash"); replacement = rb_hash_aref(opthash, sym_replace); } ec = rb_econv_open(source_encoding, destination_encoding, ecflags); - if (!ec) - return ec; - - if (!NIL_P(replacement)) { - int ret; - rb_encoding *enc = rb_enc_get(replacement); - - ret = rb_econv_set_replacement(ec, - (const unsigned char *)RSTRING_PTR(replacement), - RSTRING_LEN(replacement), - rb_enc_name(enc)); - if (ret == -1) { - rb_econv_close(ec); - return NULL; + if (ec) { + if (!NIL_P(replacement)) { + int ret; + rb_encoding *enc = rb_enc_get(replacement); + + ret = rb_econv_set_replacement(ec, + (const unsigned char *)RSTRING_PTR(replacement), + RSTRING_LEN(replacement), + rb_enc_name(enc)); + if (ret == -1) { + rb_econv_close(ec); + ec = NULL; + } } } - return ec; + return ec; // can be NULL } static int -enc_arg(volatile VALUE *arg, const char **name_p, rb_encoding **enc_p) +enc_arg(VALUE *arg, const char **name_p, 
rb_encoding **enc_p) { rb_encoding *enc; const char *n; int encidx; VALUE encval; - if ((encidx = rb_to_encoding_index(encval = *arg)) < 0) { - enc = NULL; - encidx = 0; - n = StringValueCStr(*arg); + if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) || + !(enc = rb_enc_from_index(encidx))) { + enc = NULL; + encidx = 0; + n = StringValueCStr(*arg); } else { - enc = rb_enc_from_index(encidx); - n = rb_enc_name(enc); + n = rb_enc_name(enc); } *name_p = n; @@ -2408,7 +2766,7 @@ enc_arg(volatile VALUE *arg, const char **name_p, rb_encoding **enc_p) } static int -str_transcode_enc_args(VALUE str, volatile VALUE *arg1, volatile VALUE *arg2, +str_transcode_enc_args(VALUE str, VALUE *arg1, VALUE *arg2, const char **sname_p, rb_encoding **senc_p, const char **dname_p, rb_encoding **denc_p) { @@ -2419,9 +2777,9 @@ str_transcode_enc_args(VALUE str, volatile VALUE *arg1, volatile VALUE *arg2, dencidx = enc_arg(arg1, &dname, &denc); if (NIL_P(*arg2)) { - sencidx = rb_enc_get_index(str); - senc = rb_enc_from_index(sencidx); - sname = rb_enc_name(senc); + sencidx = rb_enc_get_index(str); + senc = rb_enc_from_index(sencidx); + sname = rb_enc_name(senc); } else { sencidx = enc_arg(arg2, &sname, &senc); @@ -2439,41 +2797,67 @@ str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts) { VALUE dest; VALUE str = *self; - volatile VALUE arg1, arg2; + VALUE arg1, arg2; long blen, slen; unsigned char *buf, *bp, *sp; const unsigned char *fromp; rb_encoding *senc, *denc; const char *sname, *dname; int dencidx; + int explicitly_invalid_replace = TRUE; - if (argc < 1 || argc > 2) { - rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc); - } + rb_check_arity(argc, 0, 2); - arg1 = argv[0]; - arg2 = argc==1 ? 
Qnil : argv[1]; + if (argc == 0) { + arg1 = rb_enc_default_internal(); + if (NIL_P(arg1)) { + if (!ecflags) return -1; + arg1 = rb_obj_encoding(str); + } + if (!(ecflags & ECONV_INVALID_MASK)) { + explicitly_invalid_replace = FALSE; + } + ecflags |= ECONV_INVALID_REPLACE | ECONV_UNDEF_REPLACE; + } + else { + arg1 = argv[0]; + } + arg2 = argc<=1 ? Qnil : argv[1]; dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc); - if ((ecflags & (ECONV_UNIVERSAL_NEWLINE_DECORATOR| - ECONV_CRLF_NEWLINE_DECORATOR| - ECONV_CR_NEWLINE_DECORATOR| + if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK| ECONV_XML_TEXT_DECORATOR| ECONV_XML_ATTR_CONTENT_DECORATOR| ECONV_XML_ATTR_QUOTE_DECORATOR)) == 0) { if (senc && senc == denc) { - return -1; + if ((ecflags & ECONV_INVALID_MASK) && explicitly_invalid_replace) { + VALUE rep = Qnil; + if (!NIL_P(ecopts)) { + rep = rb_hash_aref(ecopts, sym_replace); + } + dest = rb_enc_str_scrub(senc, str, rep); + if (NIL_P(dest)) dest = str; + *self = dest; + return dencidx; + } + return NIL_P(arg2) ? -1 : dencidx; } if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) { - if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) { + if (is_ascii_string(str)) { return dencidx; } } if (encoding_equal(sname, dname)) { - return -1; + return NIL_P(arg2) ? 
-1 : dencidx; } } else { + if (senc && denc && !rb_enc_asciicompat(senc) && !rb_enc_asciicompat(denc)) { + rb_encoding *utf8 = rb_utf8_encoding(); + str = rb_str_conv_enc(str, senc, utf8); + senc = utf8; + sname = "UTF-8"; + } if (encoding_equal(sname, dname)) { sname = ""; dname = ""; @@ -2496,7 +2880,9 @@ str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts) /* set encoding */ if (!denc) { - dencidx = rb_define_dummy_encoding(dname); + dencidx = rb_define_dummy_encoding(dname); + RB_GC_GUARD(arg1); + RB_GC_GUARD(arg2); } *self = dest; @@ -2510,12 +2896,9 @@ str_transcode(int argc, VALUE *argv, VALUE *self) int ecflags = 0; VALUE ecopts = Qnil; - if (0 < argc) { - opt = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash"); - if (!NIL_P(opt)) { - argc--; - ecflags = rb_econv_prepare_opts(opt, &ecopts); - } + argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt); + if (!NIL_P(opt)) { + ecflags = rb_econv_prepare_opts(opt, &ecopts); } return str_transcode0(argc, argv, self, ecflags, ecopts); } @@ -2529,10 +2912,10 @@ str_encode_associate(VALUE str, int encidx) /* transcoded string never be broken. */ if (rb_enc_asciicompat(rb_enc_from_index(encidx))) { - rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr); + rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr); } else { - cr = ENC_CODERANGE_VALID; + cr = ENC_CODERANGE_VALID; } ENC_CODERANGE_SET(str, cr); return str; @@ -2540,40 +2923,43 @@ str_encode_associate(VALUE str, int encidx) /* * call-seq: - * str.encode!(encoding [, options] ) => str - * str.encode!(dst_encoding, src_encoding [, options] ) => str - * - * The first form transcodes the contents of <i>str</i> from - * str.encoding to +encoding+. - * The second form transcodes the contents of <i>str</i> from - * src_encoding to dst_encoding. - * The options Hash gives details for conversion. See String#encode - * for details. 
- * Returns the string even if no changes were made. + * encode!(dst_encoding = Encoding.default_internal, **enc_opts) -> self + * encode!(dst_encoding, src_encoding, **enc_opts) -> self + * + * Like #encode, but applies encoding changes to +self+; returns +self+. + * + * Related: see {Modifying}[rdoc-ref:String@Modifying]. */ static VALUE str_encode_bang(int argc, VALUE *argv, VALUE str) { - VALUE newstr = str; - int encidx = str_transcode(argc, argv, &newstr); + VALUE newstr; + int encidx; + + rb_check_frozen(str); + + newstr = str; + encidx = str_transcode(argc, argv, &newstr); if (encidx < 0) return str; + if (newstr == str) { + rb_enc_associate_index(str, encidx); + return str; + } rb_str_shared_replace(str, newstr); return str_encode_associate(str, encidx); } +static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx); + /* * call-seq: - * str.encode(encoding [, options] ) => str - * str.encode(dst_encoding, src_encoding [, options] ) => str - * - * The first form returns a copy of <i>str</i> transcoded - * to encoding +encoding+. - * The second form returns a copy of <i>str</i> transcoded - * from src_encoding to dst_encoding. - * The options Hash gives details for conversion. Details - * to be added. 
+ * encode(dst_encoding = Encoding.default_internal, **enc_opts) -> string + * encode(dst_encoding, src_encoding, **enc_opts) -> string + * + * :include: doc/string/encode.rdoc + * */ static VALUE @@ -2581,40 +2967,62 @@ str_encode(int argc, VALUE *argv, VALUE str) { VALUE newstr = str; int encidx = str_transcode(argc, argv, &newstr); - - if (encidx < 0) return rb_str_dup(str); - if (newstr == str) { - newstr = rb_str_dup(str); - } - else { - RBASIC(newstr)->klass = rb_obj_class(str); - } - return str_encode_associate(newstr, encidx); + return encoded_dup(newstr, str, encidx); } VALUE -rb_str_transcode(VALUE str, VALUE to, int ecflags, VALUE ecopts) +rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts) { int argc = 1; VALUE *argv = &to; VALUE newstr = str; int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts); + return encoded_dup(newstr, str, encidx); +} +static VALUE +encoded_dup(VALUE newstr, VALUE str, int encidx) +{ if (encidx < 0) return rb_str_dup(str); - RBASIC(newstr)->klass = rb_obj_class(str); + if (newstr == str) { + newstr = rb_str_dup(str); + rb_enc_associate_index(newstr, encidx); + return newstr; + } + else { + RBASIC_SET_CLASS(newstr, rb_obj_class(str)); + } return str_encode_associate(newstr, encidx); } +/* + * Document-class: Encoding::Converter + * + * Encoding conversion class. 
+ */ static void -econv_free(rb_econv_t *ec) +econv_free(void *ptr) { + rb_econv_t *ec = ptr; rb_econv_close(ec); } +static size_t +econv_memsize(const void *ptr) +{ + return sizeof(rb_econv_t); +} + +static const rb_data_type_t econv_data_type = { + "econv", + {0, econv_free, econv_memsize,}, + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY +}; + static VALUE econv_s_allocate(VALUE klass) { - return Data_Wrap_Struct(klass, NULL, econv_free, NULL); + return TypedData_Wrap_Struct(klass, &econv_data_type, NULL); } static rb_encoding * @@ -2632,8 +3040,16 @@ make_encoding(const char *name) { rb_encoding *enc; enc = rb_enc_find(name); - if (!enc) - enc = make_dummy_encoding(name); + if (!enc) { + RB_VM_LOCKING() { + if (rb_enc_registered(name)) { + enc = NULL; + } + else { + enc = make_dummy_encoding(name); + } + } + } return enc; } @@ -2645,21 +3061,20 @@ make_encobj(const char *name) /* * call-seq: - * Encoding::Converter.asciicompat_encoding(string) => encoding or nil - * Encoding::Converter.asciicompat_encoding(encoding) => encoding or nil - * - * returns the corresponding ASCII compatible encoding. + * Encoding::Converter.asciicompat_encoding(string) -> encoding or nil + * Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil * - * It returns nil if the argument is an ASCII compatible encoding. + * Returns the corresponding ASCII compatible encoding. * - * "corresponding ASCII compatible encoding" is a ASCII compatible encoding which - * represents same characters in the given ASCII incompatible encoding. + * Returns nil if the argument is an ASCII compatible encoding. * - * So, no conversion undefined error occur between the ASCII compatible and incompatible encoding. + * "corresponding ASCII compatible encoding" is an ASCII compatible encoding which + * can represents exactly the same characters as the given ASCII incompatible encoding. + * So, no conversion undefined error occurs when converting between the two encodings. 
* - * Encoding::Converter.stateless_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP> - * Encoding::Converter.stateless_encoding("UTF-16BE") #=> #<Encoding:UTF-8> - * Encoding::Converter.stateless_encoding("UTF-8") #=> nil + * Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP> + * Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8> + * Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil * */ static VALUE @@ -2667,46 +3082,46 @@ econv_s_asciicompat_encoding(VALUE klass, VALUE arg) { const char *arg_name, *result_name; rb_encoding *arg_enc, *result_enc; + VALUE enc = Qnil; enc_arg(&arg, &arg_name, &arg_enc); - result_name = rb_econv_asciicompat_encoding(arg_name); - - if (result_name == NULL) - return Qnil; - - result_enc = make_encoding(result_name); - - return rb_enc_from_encoding(result_enc); + if (result_name) { + result_enc = make_encoding(result_name); + enc = rb_enc_from_encoding(result_enc); + } + return enc; } static void econv_args(int argc, VALUE *argv, - volatile VALUE *snamev_p, volatile VALUE *dnamev_p, + VALUE *snamev_p, VALUE *dnamev_p, const char **sname_p, const char **dname_p, rb_encoding **senc_p, rb_encoding **denc_p, int *ecflags_p, VALUE *ecopts_p) { - VALUE opt, opthash, flags_v, ecopts; + VALUE opt, flags_v, ecopts; int sidx, didx; const char *sname, *dname; rb_encoding *senc, *denc; int ecflags; - rb_scan_args(argc, argv, "21", snamev_p, dnamev_p, &opt); + argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt); - if (NIL_P(opt)) { - ecflags = 0; + if (!NIL_P(flags_v)) { + if (!NIL_P(opt)) { + rb_error_arity(argc + 1, 2, 3); + } + ecflags = NUM2INT(rb_to_int(flags_v)); ecopts = Qnil; } - else if (!NIL_P(flags_v = rb_check_to_integer(opt, "to_int"))) { - ecflags = NUM2INT(flags_v); - ecopts = Qnil; + else if (!NIL_P(opt)) { + ecflags = rb_econv_prepare_opts(opt, &ecopts); } else { - opthash = rb_convert_type(opt, T_HASH, "Hash", "to_hash"); 
- ecflags = rb_econv_prepare_opts(opthash, &ecopts); + ecflags = 0; + ecopts = Qnil; } senc = NULL; @@ -2750,18 +3165,25 @@ decorate_convpath(VALUE convpath, int ecflags) if (num_decorators == -1) return -1; - len = n = RARRAY_LEN(convpath); + len = n = RARRAY_LENINT(convpath); if (n != 0) { - VALUE pair = RARRAY_PTR(convpath)[n-1]; - const char *sname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[0])); - const char *dname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[1])); - transcoder_entry_t *entry = get_transcoder_entry(sname, dname); - const rb_transcoder *tr = load_transcoder_entry(entry); - if (!tr) - return -1; - if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) && - tr->asciicompat_type == asciicompat_encoder) { - n--; + VALUE pair = RARRAY_AREF(convpath, n-1); + if (RB_TYPE_P(pair, T_ARRAY)) { + const char *sname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 0))); + const char *dname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 1))); + transcoder_entry_t *entry; + const rb_transcoder *tr; + entry = get_transcoder_entry(sname, dname); + tr = load_transcoder_entry(entry); + if (!tr) + return -1; + if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) && + tr->asciicompat_type == asciicompat_encoder) { + n--; + rb_ary_store(convpath, len + num_decorators - 1, pair); + } + } + else { rb_ary_store(convpath, len + num_decorators - 1, pair); } } @@ -2778,7 +3200,7 @@ search_convpath_i(const char *sname, const char *dname, int depth, void *arg) VALUE *ary_p = arg; VALUE v; - if (*ary_p == Qnil) { + if (NIL_P(*ary_p)) { *ary_p = rb_ary_new(); } @@ -2796,18 +3218,22 @@ search_convpath_i(const char *sname, const char *dname, int depth, void *arg) * Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary * Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary * - * returns the conversion path. + * Returns a conversion path. 
* * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP") * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]] * * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true) + * or + * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal) * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>], * # "universal_newline"] * * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true) + * or + * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal) * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], * # "universal_newline", * # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]] @@ -2815,7 +3241,7 @@ search_convpath_i(const char *sname, const char *dname, int depth, void *arg) static VALUE econv_s_search_convpath(int argc, VALUE *argv, VALUE klass) { - volatile VALUE snamev, dnamev; + VALUE snamev, dnamev; const char *sname, *dname; rb_encoding *senc, *denc; int ecflags; @@ -2827,22 +3253,45 @@ econv_s_search_convpath(int argc, VALUE *argv, VALUE klass) convpath = Qnil; transcode_search_path(sname, dname, search_convpath_i, &convpath); - if (NIL_P(convpath)) - rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags)); + if (NIL_P(convpath)) { + VALUE exc = rb_econv_open_exc(sname, dname, ecflags); + RB_GC_GUARD(snamev); + RB_GC_GUARD(dnamev); + rb_exc_raise(exc); + } - if (decorate_convpath(convpath, ecflags) == -1) - rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags)); + if (decorate_convpath(convpath, ecflags) == -1) { + VALUE exc = rb_econv_open_exc(sname, dname, ecflags); + RB_GC_GUARD(snamev); + RB_GC_GUARD(dnamev); + rb_exc_raise(exc); + } return convpath; } +/* + * Check the existence of a conversion path. + * Returns the number of converters in the conversion path. 
+ * result: >=0:success -1:failure + */ +int +rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding) +{ + VALUE convpath = Qnil; + transcode_search_path(from_encoding, to_encoding, search_convpath_i, + &convpath); + return RTEST(convpath); +} + struct rb_econv_init_by_convpath_t { rb_econv_t *ec; int index; int ret; }; -void rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg) +static void +rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg) { struct rb_econv_init_by_convpath_t *a = (struct rb_econv_init_by_convpath_t *)arg; int ret; @@ -2865,14 +3314,14 @@ rb_econv_init_by_convpath(VALUE self, VALUE convpath, long i; int ret, first=1; VALUE elt; - rb_encoding *senc, *denc; + rb_encoding *senc = 0, *denc = 0; const char *sname, *dname; - ec = rb_econv_alloc(RARRAY_LEN(convpath)); + ec = rb_econv_alloc(RARRAY_LENINT(convpath)); DATA_PTR(self) = ec; for (i = 0; i < RARRAY_LEN(convpath); i++) { - volatile VALUE snamev, dnamev; + VALUE snamev, dnamev; VALUE pair; elt = rb_ary_entry(convpath, i); if (!NIL_P(pair = rb_check_array_type(elt))) { @@ -2889,8 +3338,12 @@ rb_econv_init_by_convpath(VALUE self, VALUE convpath, } if (DECORATOR_P(sname, dname)) { ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans); - if (ret == -1) - rb_raise(rb_eArgError, "decoration failed: %s", dname); + if (ret == -1) { + VALUE msg = rb_sprintf("decoration failed: %s", dname); + RB_GC_GUARD(snamev); + RB_GC_GUARD(dnamev); + rb_exc_raise(rb_exc_new_str(rb_eArgError, msg)); + } } else { int j = ec->num_trans; @@ -2899,8 +3352,12 @@ rb_econv_init_by_convpath(VALUE self, VALUE convpath, arg.index = ec->num_trans; arg.ret = 0; ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg); - if (ret == -1 || arg.ret == -1) - rb_raise(rb_eArgError, "conversion add failed: %s to %s", sname, dname); + if (ret == -1 || arg.ret == -1) { + VALUE msg = rb_sprintf("adding conversion failed: 
%s to %s", sname, dname); + RB_GC_GUARD(snamev); + RB_GC_GUARD(dnamev); + rb_exc_raise(rb_exc_new_str(rb_eArgError, msg)); + } if (first) { first = 0; *senc_p = senc; @@ -2912,10 +3369,10 @@ rb_econv_init_by_convpath(VALUE self, VALUE convpath, } if (first) { - *senc_p = NULL; - *denc_p = NULL; - *sname_p = ""; - *dname_p = ""; + *senc_p = NULL; + *denc_p = NULL; + *sname_p = ""; + *dname_p = ""; } ec->source_encoding_name = *sname_p; @@ -2932,47 +3389,104 @@ rb_econv_init_by_convpath(VALUE self, VALUE convpath, * * possible options elements: * hash form: - * :invalid => nil # error on invalid byte sequence (default) - * :invalid => :replace # replace invalid byte sequence - * :undef => nil # error on undefined conversion (default) - * :undef => :replace # replace undefined conversion - * :replace => string # replacement string ("?" or "\uFFFD" if not specified) - * :universal_newline => true # decorator for converting CRLF and CR to LF - * :crlf_newline => true # decorator for converting LF to CRLF - * :cr_newline => true # decorator for converting LF to CR + * :invalid => nil # raise error on invalid byte sequence (default) + * :invalid => :replace # replace invalid byte sequence + * :undef => nil # raise error on undefined conversion (default) + * :undef => :replace # replace undefined conversion + * :replace => string # replacement string ("?" or "\uFFFD" if not specified) + * :newline => :universal # decorator for converting CRLF and CR to LF + * :newline => :lf # decorator for converting CRLF and CR to LF when writing + * :newline => :crlf # decorator for converting LF to CRLF + * :newline => :cr # decorator for converting LF to CR + * :universal_newline => true # decorator for converting CRLF and CR to LF + * :crlf_newline => true # decorator for converting LF to CRLF + * :cr_newline => true # decorator for converting LF to CR + * :lf_newline => true # decorator for converting CRLF and CR to LF when writing + * :xml => :text # escape as XML CharData. 
+ * :xml => :attr # escape as XML AttValue * integer form: + * Encoding::Converter::INVALID_REPLACE + * Encoding::Converter::UNDEF_REPLACE + * Encoding::Converter::UNDEF_HEX_CHARREF * Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR + * Encoding::Converter::LF_NEWLINE_DECORATOR * Encoding::Converter::CRLF_NEWLINE_DECORATOR * Encoding::Converter::CR_NEWLINE_DECORATOR + * Encoding::Converter::XML_TEXT_DECORATOR + * Encoding::Converter::XML_ATTR_CONTENT_DECORATOR + * Encoding::Converter::XML_ATTR_QUOTE_DECORATOR * * Encoding::Converter.new creates an instance of Encoding::Converter. * - * source_encoding and destination_encoding should be a string or + * Source_encoding and destination_encoding should be a string or * Encoding object. * * opt should be nil, a hash or an integer. * * convpath should be an array. - * convpath should contains - * - two-element array which contains encoding or encoding name, or - * - a string of decorator name. + * convpath may contain + * - two-element arrays which contain encodings or encoding names, or + * - strings representing decorator names. * - * example: + * Encoding::Converter.new optionally takes an option. + * The option should be a hash or an integer. + * The option hash can contain :invalid => nil, etc. + * The option integer should be logical-or of constants such as + * Encoding::Converter::INVALID_REPLACE, etc. + * + * [:invalid => nil] + * Raise error on invalid byte sequence. This is a default behavior. + * [:invalid => :replace] + * Replace invalid byte sequence by replacement string. + * [:undef => nil] + * Raise an error if a character in source_encoding is not defined in destination_encoding. + * This is a default behavior. + * [:undef => :replace] + * Replace undefined character in destination_encoding with replacement string. + * [:replace => string] + * Specify the replacement string. + * If not specified, "\uFFFD" is used for Unicode encodings and "?" for others. 
+ * [:universal_newline => true] + * Convert CRLF and CR to LF. + * [:crlf_newline => true] + * Convert LF to CRLF. + * [:cr_newline => true] + * Convert LF to CR. + * [:lf_newline => true] + * Convert CRLF and CR to LF (when writing). + * [:xml => :text] + * Escape as XML CharData. + * This form can be used as an HTML 4.0 #PCDATA. + * - '&' -> '&amp;' + * - '<' -> '&lt;' + * - '>' -> '&gt;' + * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH; + * [:xml => :attr] + * Escape as XML AttValue. + * The converted result is quoted as "...". + * This form can be used as an HTML 4.0 attribute value. + * - '&' -> '&amp;' + * - '<' -> '&lt;' + * - '>' -> '&gt;' + * - '"' -> '&quot;' + * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH; + * + * Examples: * # UTF-16BE to UTF-8 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8") * - * # Usually, decorators such as newline conversion are inserted at last. + * # Usually, decorators such as newline conversion are inserted last. * ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true) * p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>], * # "universal_newline"] * - * # But, if the last encoding is ASCII incompatible, + * # But, if the last encoding is ASCII incompatible, * # decorators are inserted before the last conversion. * ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true) * p ec.convpath #=> ["crlf_newline", * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]] * - * # conversion path can be specified directly. + * # Conversion path can be specified directly.
* ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]]) * p ec.convpath #=> ["universal_newline", * # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>], @@ -2982,14 +3496,14 @@ static VALUE econv_init(int argc, VALUE *argv, VALUE self) { VALUE ecopts; - volatile VALUE snamev, dnamev; + VALUE snamev, dnamev; const char *sname, *dname; rb_encoding *senc, *denc; rb_econv_t *ec; int ecflags; VALUE convpath; - if (DATA_PTR(self)) { + if (rb_check_typeddata(self, &econv_data_type)) { rb_raise(rb_eTypeError, "already initialized"); } @@ -3004,7 +3518,10 @@ econv_init(int argc, VALUE *argv, VALUE self) } if (!ec) { - rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags)); + VALUE exc = rb_econv_open_exc(sname, dname, ecflags); + RB_GC_GUARD(snamev); + RB_GC_GUARD(dnamev); + rb_exc_raise(exc); } if (!DECORATOR_P(sname, dname)) { @@ -3012,6 +3529,8 @@ econv_init(int argc, VALUE *argv, VALUE self) senc = make_dummy_encoding(sname); if (!denc) denc = make_dummy_encoding(dname); + RB_GC_GUARD(snamev); + RB_GC_GUARD(dnamev); } ec->source_encoding = senc; @@ -3036,8 +3555,9 @@ static VALUE econv_inspect(VALUE self) { const char *cname = rb_obj_classname(self); - rb_econv_t *ec = DATA_PTR(self); + rb_econv_t *ec; + TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec); if (!ec) return rb_sprintf("#<%s: uninitialized>", cname); else { @@ -3051,69 +3571,69 @@ econv_inspect(VALUE self) } } -#define IS_ECONV(obj) (RDATA(obj)->dfree == (RUBY_DATA_FUNC)econv_free) - static rb_econv_t * check_econv(VALUE self) { - Check_Type(self, T_DATA); - if (!IS_ECONV(self)) { - rb_raise(rb_eTypeError, "wrong argument type %s (expected Encoding::Converter)", - rb_class2name(CLASS_OF(self))); - } - if (!DATA_PTR(self)) { + rb_econv_t *ec; + + TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec); + if (!ec) { rb_raise(rb_eTypeError, "uninitialized encoding converter"); } - return DATA_PTR(self); + return ec; +} + +static VALUE 
+econv_get_encoding(rb_encoding *encoding) +{ + if (!encoding) + return Qnil; + return rb_enc_from_encoding(encoding); } /* * call-seq: * ec.source_encoding -> encoding * - * returns the source encoding as an Encoding object. + * Returns the source encoding as an Encoding object. */ static VALUE econv_source_encoding(VALUE self) { rb_econv_t *ec = check_econv(self); - if (!ec->source_encoding) - return Qnil; - return rb_enc_from_encoding(ec->source_encoding); + return econv_get_encoding(ec->source_encoding); } /* * call-seq: * ec.destination_encoding -> encoding * - * returns the destination encoding as an Encoding object. + * Returns the destination encoding as an Encoding object. */ static VALUE econv_destination_encoding(VALUE self) { rb_econv_t *ec = check_econv(self); - if (!ec->destination_encoding) - return Qnil; - return rb_enc_from_encoding(ec->destination_encoding); + return econv_get_encoding(ec->destination_encoding); } /* * call-seq: * ec.convpath -> ary * - * returns the conversion path of ec. + * Returns the conversion path of ec. * * The result is an array of conversions. * - * ec = Encoding::Converter.new("ISo-8859-1", "EUC-JP", crlf_newline: true) + * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true) * p ec.convpath * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>], * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>], * # "crlf_newline"] * - * A element of the array is a pair of encodings or a string. - * The pair means encoding conversion. - * The string means decorator. + * Each element of the array is a pair of encodings or a string. + * A pair means an encoding conversion. + * A string means a decorator. * * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means * a converter from ISO-8859-1 to UTF-8. 
@@ -3139,6 +3659,45 @@ econv_convpath(VALUE self) return result; } +/* + * call-seq: + * ec == other -> true or false + */ +static VALUE +econv_equal(VALUE self, VALUE other) +{ + rb_econv_t *ec1 = check_econv(self); + rb_econv_t *ec2; + int i; + + if (!rb_typeddata_is_kind_of(other, &econv_data_type)) { + return Qnil; + } + ec2 = DATA_PTR(other); + if (!ec2) return Qfalse; + if (ec1->source_encoding_name != ec2->source_encoding_name && + strcmp(ec1->source_encoding_name, ec2->source_encoding_name)) + return Qfalse; + if (ec1->destination_encoding_name != ec2->destination_encoding_name && + strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name)) + return Qfalse; + if (ec1->flags != ec2->flags) return Qfalse; + if (ec1->replacement_enc != ec2->replacement_enc && + strcmp(ec1->replacement_enc, ec2->replacement_enc)) + return Qfalse; + if (ec1->replacement_len != ec2->replacement_len) return Qfalse; + if (ec1->replacement_str != ec2->replacement_str && + memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len)) + return Qfalse; + + if (ec1->num_trans != ec2->num_trans) return Qfalse; + for (i = 0; i < ec1->num_trans; i++) { + if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder) + return Qfalse; + } + return Qtrue; +} + static VALUE econv_result_to_symbol(rb_econv_result_t res) { @@ -3181,7 +3740,7 @@ econv_result_to_symbol(rb_econv_result_t res) * primitive_convert converts source_buffer into destination_buffer. * * source_buffer should be a string or nil. - * nil means a empty string. + * nil means an empty string. * * destination_buffer should be a string. * @@ -3218,9 +3777,12 @@ econv_result_to_symbol(rb_econv_result_t res) * * primitive_convert stops conversion when one of following condition met. * - invalid byte sequence found in source buffer (:invalid_byte_sequence) + * +primitive_errinfo+ and +last_error+ methods returns the detail of the error. 
* - unexpected end of source buffer (:incomplete_input) * this occur only when :partial_input is not specified. + * +primitive_errinfo+ and +last_error+ methods returns the detail of the error. * - character not representable in output encoding (:undefined_conversion) + * +primitive_errinfo+ and +last_error+ methods returns the detail of the error. * - after some output is generated, before input is done (:after_output) * this occur only when :after_output is specified. * - destination buffer is full (:destination_buffer_full) @@ -3231,17 +3793,17 @@ econv_result_to_symbol(rb_econv_result_t res) * * example: * ec = Encoding::Converter.new("UTF-8", "UTF-16BE") - * ret = ec.primitive_convert(src="pi", dst="", 100) + * ret = ec.primitive_convert(src="pi", dst="", nil, 100) * p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"] * * ec = Encoding::Converter.new("UTF-8", "UTF-16BE") - * ret = ec.primitive_convert(src="pi", dst="", 1) + * ret = ec.primitive_convert(src="pi", dst="", nil, 1) * p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"] - * ret = ec.primitive_convert(src, dst="", 1) + * ret = ec.primitive_convert(src, dst="", nil, 1) * p [ret, src, dst] #=> [:destination_buffer_full, "", "p"] - * ret = ec.primitive_convert(src, dst="", 1) + * ret = ec.primitive_convert(src, dst="", nil, 1) * p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"] - * ret = ec.primitive_convert(src, dst="", 1) + * ret = ec.primitive_convert(src, dst="", nil, 1) * p [ret, src, dst] #=> [:finished, "", "i"] * */ @@ -3257,7 +3819,7 @@ econv_primitive_convert(int argc, VALUE *argv, VALUE self) unsigned long output_byteend; int flags; - rb_scan_args(argc, argv, "23", &input, &output, &output_byteoffset_v, &output_bytesize_v, &opt); + argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt); if (NIL_P(output_byteoffset_v)) output_byteoffset = 0; /* dummy */ @@ -3269,15 +3831,14 @@ econv_primitive_convert(int argc, 
VALUE *argv, VALUE self) else output_bytesize = NUM2LONG(output_bytesize_v); - if (NIL_P(opt)) { - flags = 0; - } - else if (!NIL_P(flags_v = rb_check_to_integer(opt, "to_int"))) { - flags = NUM2INT(flags_v); + if (!NIL_P(flags_v)) { + if (!NIL_P(opt)) { + rb_error_arity(argc + 1, 2, 5); + } + flags = NUM2INT(rb_to_int(flags_v)); } - else { + else if (!NIL_P(opt)) { VALUE v; - opt = rb_convert_type(opt, T_HASH, "Hash", "to_hash"); flags = 0; v = rb_hash_aref(opt, sym_partial_input); if (RTEST(v)) @@ -3286,6 +3847,9 @@ econv_primitive_convert(int argc, VALUE *argv, VALUE self) if (RTEST(v)) flags |= ECONV_AFTER_OUTPUT; } + else { + flags = 0; + } StringValue(output); if (!NIL_P(input)) @@ -3293,7 +3857,8 @@ econv_primitive_convert(int argc, VALUE *argv, VALUE self) rb_str_modify(output); if (NIL_P(output_bytesize_v)) { - output_bytesize = RSTRING_EMBED_LEN_MAX; + output_bytesize = rb_str_capacity(output); + if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input)) output_bytesize = RSTRING_LEN(input); } @@ -3335,8 +3900,9 @@ econv_primitive_convert(int argc, VALUE *argv, VALUE self) res = rb_econv_convert(ec, &ip, is, &op, os, flags); rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output)); - if (!NIL_P(input)) + if (!NIL_P(input)) { rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input)); + } if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) { if (LONG_MAX / 2 < output_bytesize) @@ -3357,11 +3923,11 @@ econv_primitive_convert(int argc, VALUE *argv, VALUE self) * call-seq: * ec.convert(source_string) -> destination_string * - * convert source_string and return destination_string. + * Convert source_string and return destination_string. * * source_string is assumed as a part of source. * i.e. :partial_input=>true is specified internally. - * finish method should be used at last. + * finish method should be used last. 
* * ec = Encoding::Converter.new("utf-8", "euc-jp") * puts ec.convert("\u3042").dump #=> "\xA4\xA2" @@ -3379,8 +3945,12 @@ econv_primitive_convert(int argc, VALUE *argv, VALUE self) * puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP") * * If a conversion error occur, - * Encoding::ConversionUndefined or - * Encoding::InvalidByteSequence is raised. + * Encoding::UndefinedConversionError or + * Encoding::InvalidByteSequenceError is raised. + * Encoding::Converter#convert doesn't supply methods to recover or restart + * from these exceptions. + * When you want to handle these conversion errors, + * use Encoding::Converter#primitive_convert. * */ static VALUE @@ -3426,8 +3996,8 @@ econv_convert(VALUE self, VALUE source_string) * call-seq: * ec.finish -> string * - * finishes the converter. - * It returns the last part of converted string. + * Finishes the converter. + * It returns the last part of the converted string. * * ec = Encoding::Converter.new("utf-8", "iso-2022-jp") * p ec.convert("\u3042") #=> "\e$B$\"" @@ -3447,7 +4017,7 @@ econv_finish(VALUE self) av[1] = dst; av[2] = Qnil; av[3] = Qnil; - av[4] = INT2NUM(0); + av[4] = INT2FIX(0); ac = 5; ret = econv_primitive_convert(ac, av, self); @@ -3470,8 +4040,8 @@ econv_finish(VALUE self) * call-seq: * ec.primitive_errinfo -> array * - * primitive_errinfo returns a precious information of the last error result - * as a 5-elements array: + * primitive_errinfo returns important information regarding the last error + * as a 5-element array: * * [result, enc1, enc2, error_bytes, readagain_bytes] * @@ -3480,12 +4050,12 @@ econv_finish(VALUE self) * Other elements are only meaningful when result is * :invalid_byte_sequence, :incomplete_input or :undefined_conversion. * - * enc1 and enc2 indicates a conversion step as pair of strings. - * For example, a converter from EUC-JP to ISO-8859-1 converters - * a string as EUC-JP -> UTF-8 -> ISO-8859-1. 
- * So [enc1, enc2] is ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"]. + * enc1 and enc2 indicate a conversion step as a pair of strings. + * For example, a converter from EUC-JP to ISO-8859-1 converts + * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1. + * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"]. * - * error_bytes and readagain_bytes indicates the byte sequences which causes the error. + * error_bytes and readagain_bytes indicate the byte sequences which caused the error. * error_bytes is discarded portion. * readagain_bytes is buffered portion which is read again on next conversion. * @@ -3493,9 +4063,9 @@ econv_finish(VALUE self) * * # \xff is invalid as EUC-JP. * ec = Encoding::Converter.new("EUC-JP", "Shift_JIS") - * ec.primitive_convert(src="\xff", dst="", nil, 10) + * ec.primitive_convert(src="\xff", dst="", nil, 10) * p ec.primitive_errinfo - * #=> [:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xFF", ""] + * #=> [:invalid_byte_sequence, "EUC-JP", "Shift_JIS", "\xFF", ""] * * # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1. * # Since this error is occur in UTF-8 to ISO-8859-1 conversion, @@ -3514,7 +4084,7 @@ econv_finish(VALUE self) * # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by * # partial characters. * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") - * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT) + * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT) * p ec.primitive_errinfo * #=> [:source_buffer_empty, nil, nil, nil, nil] * @@ -3571,14 +4141,14 @@ econv_primitive_errinfo(VALUE self) * call-seq: * ec.insert_output(string) -> nil * - * inserts string into the encoding converter. - * The string will be converted into the destination encoding and - * outputed on later conversions. + * Inserts string into the encoding converter. 
+ * The string will be converted to the destination encoding and + * output on later conversions. * * If the destination encoding is stateful, - * string is converted according to the state and update the state. + * string is converted according to the state and the state is updated. * - * This method should be used only when a conversion error is occur. + * This method should be used only when a conversion error occurs. * * ec = Encoding::Converter.new("utf-8", "iso-8859-1") * src = "HIRAGANA LETTER A is \u{3042}." @@ -3610,29 +4180,29 @@ econv_insert_output(VALUE self, VALUE string) StringValue(string); insert_enc = rb_econv_encoding_to_insert_output(ec); - string = rb_str_transcode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil); + string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil); ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc); if (ret == -1) { - rb_raise(rb_eArgError, "too big string"); + rb_raise(rb_eArgError, "too big string"); } return Qnil; } /* - * call-seq - * ec.putback => string - * ec.putback(max_numbytes) => string + * call-seq: + * ec.putback -> string + * ec.putback(max_numbytes) -> string * - * put back the bytes which will be converted. + * Put back the bytes which will be converted. * * The bytes are caused by invalid_byte_sequence error. * When invalid_byte_sequence error, some bytes are discarded and * some bytes are buffered to be converted later. * The latter bytes can be put back. * It can be observed by - * Encoding::InvalidByteSequence#readagain_bytes and + * Encoding::InvalidByteSequenceError#readagain_bytes and * Encoding::Converter#primitive_errinfo. 
* * ec = Encoding::Converter.new("utf-16le", "iso-8859-1") @@ -3652,10 +4222,9 @@ econv_putback(int argc, VALUE *argv, VALUE self) int putbackable; VALUE str, max; - rb_scan_args(argc, argv, "01", &max); - - if (NIL_P(max)) + if (!rb_check_arity(argc, 0, 1) || NIL_P(max = argv[0])) { n = rb_econv_putbackable(ec); + } else { n = NUM2INT(max); putbackable = rb_econv_putbackable(ec); @@ -3677,18 +4246,18 @@ econv_putback(int argc, VALUE *argv, VALUE self) * call-seq: * ec.last_error -> exception or nil * - * returns an exception object for the last conversion. - * It returns nil if the last conversion is not an error. + * Returns an exception object for the last conversion. + * Returns nil if the last conversion did not produce an error. * * "error" means that - * Encoding::InvalidByteSequence and Encoding::ConversionUndefined for + * Encoding::InvalidByteSequenceError and Encoding::UndefinedConversionError for * Encoding::Converter#convert and * :invalid_byte_sequence, :incomplete_input and :undefined_conversion for * Encoding::Converter#primitive_convert. * * ec = Encoding::Converter.new("utf-8", "iso-8859-1") * p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence - * p ec.last_error #=> #<Encoding::InvalidByteSequence: "\xF1" followed by "a" on UTF-8> + * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8> * p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full * p ec.last_error #=> nil * @@ -3709,7 +4278,7 @@ econv_last_error(VALUE self) * call-seq: * ec.replacement -> string * - * returns the replacement string. + * Returns the replacement string. * * ec = Encoding::Converter.new("euc-jp", "us-ascii") * p ec.replacement #=> "?" 
@@ -3726,7 +4295,7 @@ econv_get_replacement(VALUE self) ret = make_replacement(ec); if (ret == -1) { - rb_raise(rb_eConversionUndefined, "replacement character setup failed"); + rb_raise(rb_eUndefinedConversionError, "replacement character setup failed"); } enc = rb_enc_find(ec->replacement_enc); @@ -3737,7 +4306,7 @@ econv_get_replacement(VALUE self) * call-seq: * ec.replacement = string * - * sets the replacement string. + * Sets the replacement string. * * ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace) * ec.replacement = "<undef>" @@ -3760,13 +4329,19 @@ econv_set_replacement(VALUE self, VALUE arg) rb_enc_name(enc)); if (ret == -1) { - /* xxx: rb_eInvalidByteSequence? */ - rb_raise(rb_eConversionUndefined, "replacement character setup failed"); + /* xxx: rb_eInvalidByteSequenceError? */ + rb_raise(rb_eUndefinedConversionError, "replacement character setup failed"); } return arg; } +VALUE +rb_econv_make_exception(rb_econv_t *ec) +{ + return make_econv_exception(ec); +} + void rb_econv_check_error(rb_econv_t *ec) { @@ -3782,19 +4357,19 @@ rb_econv_check_error(rb_econv_t *ec) * call-seq: * ecerr.source_encoding_name -> string * - * returns the source encoding name as a string. + * Returns the source encoding name as a string. */ static VALUE ecerr_source_encoding_name(VALUE self) { - return rb_attr_get(self, rb_intern("source_encoding_name")); + return rb_attr_get(self, id_source_encoding_name); } /* * call-seq: * ecerr.source_encoding -> encoding * - * returns the source encoding as an encoding object. + * Returns the source encoding as an encoding object. * * Note that the result may not be equal to the source encoding of * the encoding converter if the conversion has multiple steps. @@ -3802,7 +4377,7 @@ ecerr_source_encoding_name(VALUE self) * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP * begin * ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP. 
- * rescue Encoding::ConversionUndefined + * rescue Encoding::UndefinedConversionError * p $!.source_encoding #=> #<Encoding:UTF-8> * p $!.destination_encoding #=> #<Encoding:EUC-JP> * p $!.source_encoding_name #=> "UTF-8" @@ -3813,43 +4388,43 @@ ecerr_source_encoding_name(VALUE self) static VALUE ecerr_source_encoding(VALUE self) { - return rb_attr_get(self, rb_intern("source_encoding")); + return rb_attr_get(self, id_source_encoding); } /* * call-seq: * ecerr.destination_encoding_name -> string * - * returns the destination encoding name as a string. + * Returns the destination encoding name as a string. */ static VALUE ecerr_destination_encoding_name(VALUE self) { - return rb_attr_get(self, rb_intern("destination_encoding_name")); + return rb_attr_get(self, id_destination_encoding_name); } /* * call-seq: * ecerr.destination_encoding -> string * - * returns the destination encoding as an encoding object. + * Returns the destination encoding as an encoding object. */ static VALUE ecerr_destination_encoding(VALUE self) { - return rb_attr_get(self, rb_intern("destination_encoding")); + return rb_attr_get(self, id_destination_encoding); } /* * call-seq: * ecerr.error_char -> string * - * returns the one-character string which cause Encoding::ConversionUndefined. + * Returns the one-character string which cause Encoding::UndefinedConversionError. * * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") * begin * ec.convert("\xa0") - * rescue Encoding::ConversionUndefined + * rescue Encoding::UndefinedConversionError * puts $!.error_char.dump #=> "\xC2\xA0" * p $!.error_char.encoding #=> #<Encoding:UTF-8> * end @@ -3858,20 +4433,20 @@ ecerr_destination_encoding(VALUE self) static VALUE ecerr_error_char(VALUE self) { - return rb_attr_get(self, rb_intern("error_char")); + return rb_attr_get(self, id_error_char); } /* * call-seq: * ecerr.error_bytes -> string * - * returns the discarded bytes when Encoding::InvalidByteSequence occur. 
+ * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs. * * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") * begin * ec.convert("abc\xA1\xFFdef") - * rescue Encoding::InvalidByteSequence - * p $! #=> #<Encoding::InvalidByteSequence: "\xA1" followed by "\xFF" on EUC-JP> + * rescue Encoding::InvalidByteSequenceError + * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP> * puts $!.error_bytes.dump #=> "\xA1" * puts $!.readagain_bytes.dump #=> "\xFF" * end @@ -3879,86 +4454,130 @@ ecerr_error_char(VALUE self) static VALUE ecerr_error_bytes(VALUE self) { - return rb_attr_get(self, rb_intern("error_bytes")); + return rb_attr_get(self, id_error_bytes); } /* * call-seq: * ecerr.readagain_bytes -> string * - * returns the bytes to be read again when Encoding::InvalidByteSequence occur. + * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs. */ static VALUE ecerr_readagain_bytes(VALUE self) { - return rb_attr_get(self, rb_intern("readagain_bytes")); + return rb_attr_get(self, id_readagain_bytes); } /* * call-seq: * ecerr.incomplete_input? -> true or false * - * returns true if the invalid byte sequence error is caused by + * Returns true if the invalid byte sequence error is caused by * premature end of string. * * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") * * begin * ec.convert("abc\xA1z") - * rescue Encoding::InvalidByteSequence - * p $! #=> #<Encoding::InvalidByteSequence: "\xA1" followed by "z" on EUC-JP> + * rescue Encoding::InvalidByteSequenceError + * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP> * p $!.incomplete_input? #=> false * end * * begin * ec.convert("abc\xA1") * ec.finish - * rescue Encoding::InvalidByteSequence - * p $! #=> #<Encoding::InvalidByteSequence: incomplete "\xA1" on EUC-JP> + * rescue Encoding::InvalidByteSequenceError + * p $! 
#=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP> * p $!.incomplete_input? #=> true * end */ static VALUE ecerr_incomplete_input(VALUE self) { - return rb_attr_get(self, rb_intern("incomplete_input")); + return rb_attr_get(self, id_incomplete_input); } -extern void Init_newline(void); +/* + * Document-class: Encoding::UndefinedConversionError + * + * Raised by Encoding and String methods when a transcoding operation + * fails. + */ + +/* + * Document-class: Encoding::InvalidByteSequenceError + * + * Raised by Encoding and String methods when the string being + * transcoded contains a byte invalid for the either the source or + * target encoding. + */ + +/* + * Document-class: Encoding::ConverterNotFoundError + * + * Raised by transcoding methods when a named encoding does not + * correspond with a known converter. + */ void Init_transcode(void) { - rb_eConversionUndefined = rb_define_class_under(rb_cEncoding, "ConversionUndefined", rb_eStandardError); - rb_eInvalidByteSequence = rb_define_class_under(rb_cEncoding, "InvalidByteSequence", rb_eStandardError); - rb_eNoConverter = rb_define_class_under(rb_cEncoding, "NoConverter", rb_eStandardError); - transcoder_table = st_init_strcasetable(); - sym_invalid = ID2SYM(rb_intern("invalid")); - sym_undef = ID2SYM(rb_intern("undef")); - sym_ignore = ID2SYM(rb_intern("ignore")); - sym_replace = ID2SYM(rb_intern("replace")); - sym_xml = ID2SYM(rb_intern("xml")); - sym_text = ID2SYM(rb_intern("text")); - sym_attr = ID2SYM(rb_intern("attr")); - - sym_invalid_byte_sequence = ID2SYM(rb_intern("invalid_byte_sequence")); - sym_undefined_conversion = ID2SYM(rb_intern("undefined_conversion")); - sym_destination_buffer_full = ID2SYM(rb_intern("destination_buffer_full")); - sym_source_buffer_empty = ID2SYM(rb_intern("source_buffer_empty")); - sym_finished = ID2SYM(rb_intern("finished")); - sym_after_output = ID2SYM(rb_intern("after_output")); - sym_incomplete_input = ID2SYM(rb_intern("incomplete_input")); - 
sym_universal_newline = ID2SYM(rb_intern("universal_newline")); - sym_crlf_newline = ID2SYM(rb_intern("crlf_newline")); - sym_cr_newline = ID2SYM(rb_intern("cr_newline")); - sym_partial_input = ID2SYM(rb_intern("partial_input")); + id_destination_encoding = rb_intern_const("destination_encoding"); + id_destination_encoding_name = rb_intern_const("destination_encoding_name"); + id_error_bytes = rb_intern_const("error_bytes"); + id_error_char = rb_intern_const("error_char"); + id_incomplete_input = rb_intern_const("incomplete_input"); + id_readagain_bytes = rb_intern_const("readagain_bytes"); + id_source_encoding = rb_intern_const("source_encoding"); + id_source_encoding_name = rb_intern_const("source_encoding_name"); + + sym_invalid = ID2SYM(rb_intern_const("invalid")); + sym_undef = ID2SYM(rb_intern_const("undef")); + sym_replace = ID2SYM(rb_intern_const("replace")); + sym_fallback = ID2SYM(rb_intern_const("fallback")); + sym_xml = ID2SYM(rb_intern_const("xml")); + sym_text = ID2SYM(rb_intern_const("text")); + sym_attr = ID2SYM(rb_intern_const("attr")); + + sym_invalid_byte_sequence = ID2SYM(rb_intern_const("invalid_byte_sequence")); + sym_undefined_conversion = ID2SYM(rb_intern_const("undefined_conversion")); + sym_destination_buffer_full = ID2SYM(rb_intern_const("destination_buffer_full")); + sym_source_buffer_empty = ID2SYM(rb_intern_const("source_buffer_empty")); + sym_finished = ID2SYM(rb_intern_const("finished")); + sym_after_output = ID2SYM(rb_intern_const("after_output")); + sym_incomplete_input = ID2SYM(rb_intern_const("incomplete_input")); + sym_universal_newline = ID2SYM(rb_intern_const("universal_newline")); + sym_crlf_newline = ID2SYM(rb_intern_const("crlf_newline")); + sym_cr_newline = ID2SYM(rb_intern_const("cr_newline")); + sym_lf_newline = ID2SYM(rb_intern("lf_newline")); + sym_partial_input = ID2SYM(rb_intern_const("partial_input")); + +#ifdef ENABLE_ECONV_NEWLINE_OPTION + sym_newline = ID2SYM(rb_intern_const("newline")); + sym_universal = 
ID2SYM(rb_intern_const("universal")); + sym_crlf = ID2SYM(rb_intern_const("crlf")); + sym_cr = ID2SYM(rb_intern_const("cr")); + sym_lf = ID2SYM(rb_intern_const("lf")); +#endif + + InitVM(transcode); +} + +void +InitVM_transcode(void) +{ + rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError); + rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError); + rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError); rb_define_method(rb_cString, "encode", str_encode, -1); rb_define_method(rb_cString, "encode!", str_encode_bang, -1); - rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cData); + rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cObject); rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate); rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1); rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1); @@ -3976,34 +4595,96 @@ Init_transcode(void) rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0); rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0); rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1); + rb_define_method(rb_cEncodingConverter, "==", econv_equal, 1); + /* + * Mask for invalid byte sequences + */ rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK)); + + /* + * Replace invalid byte sequences + */ rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE)); + + /* + * Mask for a valid character in the source encoding but no related + * character(s) in destination encoding. 
+ */ rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK)); + + /* + * Replace byte sequences that are undefined in the destination encoding. + */ rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE)); + + /* + * Replace byte sequences that are undefined in the destination encoding + * with an XML hexadecimal character reference. This is valid for XML + * conversion. + */ rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF)); + + /* + * Indicates the source may be part of a larger string. See + * primitive_convert for an example. + */ rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT)); + + /* + * Stop converting after some output is complete but before all of the + * input was consumed. See primitive_convert for an example. + */ rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT)); + + /* + * Decorator for converting CRLF and CR to LF + */ rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR)); + + /* + * Decorator for converting CRLF and CR to LF when writing + */ + rb_define_const(rb_cEncodingConverter, "LF_NEWLINE_DECORATOR", INT2FIX(ECONV_LF_NEWLINE_DECORATOR)); + + /* + * Decorator for converting LF to CRLF + */ rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR)); + + /* + * Decorator for converting LF to CR + */ rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR)); + + /* + * Escape as XML CharData + */ rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR)); + + /* + * Escape as XML AttValue + */ rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR)); + + /* + * Escape as XML AttValue + */ rb_define_const(rb_cEncodingConverter, 
"XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR)); - rb_define_method(rb_eConversionUndefined, "source_encoding_name", ecerr_source_encoding_name, 0); - rb_define_method(rb_eConversionUndefined, "destination_encoding_name", ecerr_destination_encoding_name, 0); - rb_define_method(rb_eConversionUndefined, "source_encoding", ecerr_source_encoding, 0); - rb_define_method(rb_eConversionUndefined, "destination_encoding", ecerr_destination_encoding, 0); - rb_define_method(rb_eConversionUndefined, "error_char", ecerr_error_char, 0); - - rb_define_method(rb_eInvalidByteSequence, "source_encoding_name", ecerr_source_encoding_name, 0); - rb_define_method(rb_eInvalidByteSequence, "destination_encoding_name", ecerr_destination_encoding_name, 0); - rb_define_method(rb_eInvalidByteSequence, "source_encoding", ecerr_source_encoding, 0); - rb_define_method(rb_eInvalidByteSequence, "destination_encoding", ecerr_destination_encoding, 0); - rb_define_method(rb_eInvalidByteSequence, "error_bytes", ecerr_error_bytes, 0); - rb_define_method(rb_eInvalidByteSequence, "readagain_bytes", ecerr_readagain_bytes, 0); - rb_define_method(rb_eInvalidByteSequence, "incomplete_input?", ecerr_incomplete_input, 0); + rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0); + rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0); + rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0); + rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0); + rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0); + + rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0); + rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0); + rb_define_method(rb_eInvalidByteSequenceError, 
"source_encoding", ecerr_source_encoding, 0); + rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0); + rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0); + rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0); + rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0); Init_newline(); } |
