summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ChangeLog18
-rw-r--r--enc/trans/newline.trans56
-rw-r--r--test/ruby/test_econv.rb18
-rw-r--r--transcode.c67
-rw-r--r--transcode_data.h1
5 files changed, 137 insertions, 23 deletions
diff --git a/ChangeLog b/ChangeLog
index 68baa0bc71..4c03b183bb 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,21 @@
+Wed Aug 13 14:22:16 2008 Tanaka Akira <akr@fsij.org>
+
+ * enc/trans/newline.trans: new file.
+
+ * transcode_data.h (rb_trans_t): add last_tc field.
+
+ * transcode.c (UNIVERSAL_NEWLINE): defined.
+ (CRLF_NEWLINE): defined.
+ (CR_NEWLINE): defined.
+ (rb_trans_open_by_transcoder_entries): initialize last_tc.
+ (trans_open_i): allocate one more room for newline converter.
+ (rb_trans_open): universal newline implemented.
+ (more_output_buffer): take max_output argument instead of ts.
+ (output_replacement_character): take tc argument instead of ts.
+ (transcode_loop): use last_tc field.
+ (econv_init): add flags argument for rb_trans_open.
+ (Init_transcode): Encoding::Converter::UNIVERSAL_NEWLINE defined.
+
Wed Aug 13 14:00:19 2008 Nobuyoshi Nakada <nobu@ruby-lang.org>
* common.mk (parse.c): generates parse.h together.
diff --git a/enc/trans/newline.trans b/enc/trans/newline.trans
new file mode 100644
index 0000000000..3d33a1e323
--- /dev/null
+++ b/enc/trans/newline.trans
@@ -0,0 +1,56 @@
+#include "transcode_data.h"
+
+<%
+ map_normalize = {}
+ map_normalize["{00-ff}"] = :func_so
+%>
+
+<%= transcode_generate_node(ActionMap.parse(map_normalize), "universal_newline") %>
+
+static int
+fun_so_universal_newline(rb_transcoding* t, const unsigned char* s, size_t l, unsigned char* o)
+{
+ int len;
+ /* Maps one input byte s[0] to zero or one output byte in o[0]; returns the count written.
+ t->stateful[0] == 0 : normal
+ t->stateful[0] == 1 : just after '\r' (its '\n' was already emitted, so a directly following '\n' is dropped)
+ The parameter l is not used here. */
+ if (s[0] == '\n') {
+ if (t->stateful[0] == 0) {
+ o[0] = '\n';
+ len = 1;
+ }
+ else {
+ len = 0; /* second half of "\r\n": the '\n' was already written for the '\r' */
+ }
+ t->stateful[0] = 0;
+ }
+ else if (s[0] == '\r') {
+ o[0] = '\n'; /* a '\r' is turned into '\n' immediately, without waiting for the next byte */
+ len = 1;
+ t->stateful[0] = 1;
+ }
+ else {
+ o[0] = s[0]; /* any other byte is copied through unchanged */
+ len = 1;
+ t->stateful[0] = 0;
+ }
+ return len;
+}
+
+static const rb_transcoder
+rb_universal_newline = {
+ "universal_newline", "", &universal_newline, /* NOTE(review): assumed to be from-name, to-name, conversion table; confirm field order in transcode_data.h */
+ 1, /* input_unit_length */
+ 1, /* max_input */
+ 1, /* max_output */
+ NULL, NULL, NULL, fun_so_universal_newline /* NOTE(review): presumably the last slot is the handler for :func_so entries; confirm against rb_transcoder */
+};
+
+
+void
+Init_newline(void)
+{
+ rb_register_transcoder(&rb_universal_newline); /* NOTE(review): presumably makes the table findable under the names "universal_newline" / ""; confirm in transcode.c */
+}
+
diff --git a/test/ruby/test_econv.rb b/test/ruby/test_econv.rb
index a4f86d1c93..8a09f1e9bb 100644
--- a/test/ruby/test_econv.rb
+++ b/test/ruby/test_econv.rb
@@ -4,7 +4,7 @@ class TestEncodingConverter < Test::Unit::TestCase
def assert_econv(ret_expected, dst_expected, src_expected, to, from, src, opt={})
opt[:obuf_len] ||= 100
src = src.dup
- ec = Encoding::Converter.new(from, to)
+ ec = Encoding::Converter.new(from, to, 0)
dst = ''
while true
ret = ec.primitive_convert(src, dst2="", opt[:obuf_len], 0)
@@ -35,7 +35,7 @@ class TestEncodingConverter < Test::Unit::TestCase
end
def test_errors
- ec = Encoding::Converter.new("UTF-16BE", "EUC-JP")
+ ec = Encoding::Converter.new("UTF-16BE", "EUC-JP", 0)
src = "\xFF\xFE\x00A\xDC\x00"
ret = ec.primitive_convert(src, dst="", 10, 0)
assert_equal("", src)
@@ -50,4 +50,18 @@ class TestEncodingConverter < Test::Unit::TestCase
assert_equal("", dst)
assert_equal(:finished, ret)
end
+
+ def test_universal_newline # feeds input in chunks (PARTIAL_INPUT) and expects "\r\n", "\n" and "\r" to all come out as "\n"
+ ec = Encoding::Converter.new("UTF-8", "EUC-JP", Encoding::Converter::UNIVERSAL_NEWLINE)
+ ret = ec.primitive_convert(src="abc\r\ndef", dst="", 50, Encoding::Converter::PARTIAL_INPUT)
+ assert_equal([:ibuf_empty, "", "abc\ndef"], [ret, src, dst]) # "\r\n" collapses to a single "\n"
+ ret = ec.primitive_convert(src="ghi\njkl", dst="", 50, Encoding::Converter::PARTIAL_INPUT)
+ assert_equal([:ibuf_empty, "", "ghi\njkl"], [ret, src, dst]) # a bare "\n" is left as it is
+ ret = ec.primitive_convert(src="mno\rpqr", dst="", 50, Encoding::Converter::PARTIAL_INPUT)
+ assert_equal([:ibuf_empty, "", "mno\npqr"], [ret, src, dst]) # a bare "\r" becomes "\n"
+ ret = ec.primitive_convert(src="stu\r", dst="", 50, Encoding::Converter::PARTIAL_INPUT)
+ assert_equal([:ibuf_empty, "", "stu\n"], [ret, src, dst]) # a "\r" at the end of a chunk is emitted as "\n" right away
+ ret = ec.primitive_convert(src="\nvwx", dst="", 50, Encoding::Converter::PARTIAL_INPUT)
+ assert_equal([:ibuf_empty, "", "vwx"], [ret, src, dst]) # the "\n" opening the next chunk completes that "\r\n" and is dropped
+ end
end
diff --git a/transcode.c b/transcode.c
index 3fe8af7ed4..507e1b0b9e 100644
--- a/transcode.c
+++ b/transcode.c
@@ -25,7 +25,10 @@ static VALUE sym_invalid, sym_undef, sym_ignore, sym_replace;
#define INVALID_REPLACE 0x2
#define UNDEF_IGNORE 0x10
#define UNDEF_REPLACE 0x20
-#define PARTIAL_INPUT 0x100
+#define PARTIAL_INPUT 0x100
+#define UNIVERSAL_NEWLINE 0x200
+#define CRLF_NEWLINE 0x400
+#define CR_NEWLINE 0x800
/*
* Dispatch data and logic
@@ -646,6 +649,7 @@ rb_trans_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
ts->num_trans = n;
ts->elems = ALLOC_N(rb_trans_elem_t, ts->num_trans);
ts->num_finished = 0;
+ ts->last_tc = NULL;
for (i = 0; i < ts->num_trans; i++) {
const rb_transcoder *tr = load_transcoder_entry(entries[i]);
ts->elems[i].from = tr->from_encoding;
@@ -657,6 +661,7 @@ rb_trans_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
ts->elems[i].out_buf_end = NULL;
ts->elems[i].last_result = transcode_ibuf_empty;
}
+ ts->last_tc = ts->elems[ts->num_trans-1].tc;
for (i = 0; i < ts->num_trans-1; i++) {
int bufsize = 4096;
@@ -678,7 +683,7 @@ trans_open_i(const char *from, const char *to, int depth, void *arg)
transcoder_entry_t **entries;
if (!*entries_ptr) {
- entries = ALLOC_N(transcoder_entry_t *, depth+1);
+ entries = ALLOC_N(transcoder_entry_t *, depth+1+1);
*entries_ptr = entries;
}
else {
@@ -699,7 +704,19 @@ rb_trans_open(const char *from, const char *to, int flags)
if (num_trans < 0 || !entries)
return NULL;
+ if (flags & UNIVERSAL_NEWLINE) {
+ transcoder_entry_t *e = get_transcoder_entry("universal_newline", "");
+ if (!e)
+ return NULL;
+ entries[num_trans++] = e;
+ }
+
ts = rb_trans_open_by_transcoder_entries(num_trans, entries);
+
+ if (flags & UNIVERSAL_NEWLINE) {
+ ts->last_tc = ts->elems[ts->num_trans-2].tc;
+ }
+
return ts;
}
@@ -840,13 +857,13 @@ static void
more_output_buffer(
VALUE destination,
unsigned char *(*resize_destination)(VALUE, int, int),
- rb_trans_t *ts,
+ int max_output,
unsigned char **out_start_ptr,
unsigned char **out_pos,
unsigned char **out_stop_ptr)
{
size_t len = (*out_pos - *out_start_ptr);
- size_t new_len = (len + ts->elems[ts->num_trans-1].tc->transcoder->max_output) * 2;
+ size_t new_len = (len + max_output) * 2;
*out_start_ptr = resize_destination(destination, len, new_len);
*out_pos = *out_start_ptr + len;
*out_stop_ptr = *out_start_ptr + new_len;
@@ -856,20 +873,18 @@ static void
output_replacement_character(
VALUE destination,
unsigned char *(*resize_destination)(VALUE, int, int),
- rb_trans_t *ts,
+ rb_transcoding *tc,
unsigned char **out_start_ptr,
unsigned char **out_pos,
unsigned char **out_stop_ptr)
{
- rb_transcoding *tc;
const rb_transcoder *tr;
int max_output;
rb_encoding *enc;
const char *replacement;
int len;
- tc = ts->elems[ts->num_trans-1].tc;
tr = tc->transcoder;
max_output = tr->max_output;
enc = rb_enc_find(tr->to_encoding);
@@ -893,12 +908,12 @@ output_replacement_character(
if (tr->resetstate_func) {
if (*out_stop_ptr - *out_pos < max_output)
- more_output_buffer(destination, resize_destination, ts, out_start_ptr, out_pos, out_stop_ptr);
+ more_output_buffer(destination, resize_destination, max_output, out_start_ptr, out_pos, out_stop_ptr);
*out_pos += tr->resetstate_func(tc, *out_pos);
}
if (*out_stop_ptr - *out_pos < max_output)
- more_output_buffer(destination, resize_destination, ts, out_start_ptr, out_pos, out_stop_ptr);
+ more_output_buffer(destination, resize_destination, max_output, out_start_ptr, out_pos, out_stop_ptr);
replacement = get_replacement_character(enc, &len);
@@ -919,6 +934,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
const int opt)
{
rb_trans_t *ts;
+ rb_transcoding *last_tc;
rb_trans_result_t ret;
unsigned char *out_start = *out_pos;
int max_output;
@@ -927,7 +943,8 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
if (!ts)
rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_encoding, to_encoding);
- max_output = ts->elems[ts->num_trans-1].tc->transcoder->max_output;
+ last_tc = ts->last_tc;
+ max_output = last_tc->transcoder->max_output;
resume:
ret = rb_trans_conv(ts, in_pos, in_stop, out_pos, out_stop, opt);
@@ -938,7 +955,7 @@ resume:
goto resume;
}
else if (opt&INVALID_REPLACE) {
- output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
+ output_replacement_character(destination, resize_destination, last_tc, &out_start, out_pos, &out_stop);
goto resume;
}
rb_trans_close(ts);
@@ -952,14 +969,14 @@ resume:
goto resume;
}
else if (opt&UNDEF_REPLACE) {
- output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
+ output_replacement_character(destination, resize_destination, last_tc, &out_start, out_pos, &out_stop);
goto resume;
}
rb_trans_close(ts);
rb_raise(rb_eConversionUndefined, "conversion undefined for byte sequence (maybe invalid byte sequence)");
}
if (ret == transcode_obuf_full) {
- more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
+ more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
goto resume;
}
@@ -978,6 +995,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
const int opt)
{
rb_trans_t *ts;
+ rb_transcoding *last_tc;
rb_trans_result_t ret;
unsigned char *out_start = *out_pos;
const unsigned char *ptr;
@@ -987,6 +1005,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
if (!ts)
rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_encoding, to_encoding);
+ last_tc = ts->last_tc;
max_output = ts->elems[ts->num_trans-1].tc->transcoder->max_output;
ret = transcode_ibuf_empty;
@@ -1017,7 +1036,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
break;
}
else if (opt&INVALID_REPLACE) {
- output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
+ output_replacement_character(destination, resize_destination, last_tc, &out_start, out_pos, &out_stop);
break;
}
rb_trans_close(ts);
@@ -1032,7 +1051,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
break;
}
else if (opt&UNDEF_REPLACE) {
- output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
+ output_replacement_character(destination, resize_destination, last_tc, &out_start, out_pos, &out_stop);
break;
}
rb_trans_close(ts);
@@ -1040,7 +1059,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
break;
case transcode_obuf_full:
- more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
+ more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
break;
case transcode_ibuf_empty:
@@ -1261,19 +1280,24 @@ econv_s_allocate(VALUE klass)
}
static VALUE
-econv_init(VALUE self, VALUE from_encoding, VALUE to_encoding)
+econv_init(VALUE self, VALUE from_encoding, VALUE to_encoding, VALUE flags_v)
{
const char *from_e, *to_e;
rb_trans_t *ts;
+ int flags;
+
+ StringValue(from_encoding);
+ StringValue(to_encoding);
+ flags = NUM2INT(flags_v);
- from_e = StringValueCStr(from_encoding);
- to_e = StringValueCStr(to_encoding);
+ from_e = RSTRING_PTR(from_encoding);
+ to_e = RSTRING_PTR(to_encoding);
if (DATA_PTR(self)) {
rb_raise(rb_eTypeError, "already initialized");
}
- ts = rb_trans_open(from_e, to_e, 0);
+ ts = rb_trans_open(from_e, to_e, flags);
if (!ts) {
rb_raise(rb_eArgError, "encoding convewrter not supported (from %s to %s)", from_e, to_e);
}
@@ -1363,8 +1387,9 @@ Init_transcode(void)
rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cData);
rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate);
- rb_define_method(rb_cEncodingConverter, "initialize", econv_init, 2);
+ rb_define_method(rb_cEncodingConverter, "initialize", econv_init, 3);
rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, 4);
rb_define_method(rb_cEncodingConverter, "max_output", econv_max_output, 0);
rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(PARTIAL_INPUT));
+ rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE", INT2FIX(UNIVERSAL_NEWLINE));
}
diff --git a/transcode_data.h b/transcode_data.h
index 6b3ded4955..f49f796f3d 100644
--- a/transcode_data.h
+++ b/transcode_data.h
@@ -122,6 +122,7 @@ typedef struct {
rb_trans_elem_t *elems;
int num_trans;
int num_finished;
+ rb_transcoding *last_tc;
} rb_trans_t;
void rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib);