summaryrefslogtreecommitdiff
path: root/io.c
diff options
context:
space:
mode:
authornaruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2009-07-09 14:47:48 +0000
committernaruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2009-07-09 14:47:48 +0000
commit634cfdc6e675e0643df699780dbbf44ff4854640 (patch)
treee22f62dae700ab86062b08ae8215a2cbbc97c0bb /io.c
parenta113d4f843ec2524010b06379d69b428061dd7f9 (diff)
Set encoding and strip bom when modeenc string is "r:foo-bom" [ruby-dev:37236]
* include/ruby/io.h (FMODE_STRIP_BOM): new constant. * io.c (io_encname_bom_p): judge whether the encoding name is *-bom or not. (parse_mode_enc): drop "-bom". (rb_io_modestr_fmode): set FMODE_STRIP_BOM if needed. (rb_io_extract_modeenc): ditto. (io_strip_bom): strip bom if exists. (io_set_encoding_by_bom): set encoding if there is bom. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@24001 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'io.c')
-rw-r--r--io.c114
1 files changed, 112 insertions, 2 deletions
diff --git a/io.c b/io.c
index 7f8733f031..d3174eef09 100644
--- a/io.c
+++ b/io.c
@@ -3934,11 +3934,26 @@ rb_io_fmode_modestr(int fmode)
return NULL; /* not reached */
}
+static int
+io_encname_bom_p(const char *name, long len) {
+ if (len) {
+ if (len > 4 && strncasecmp(name + len - 4, "-bom", 4) == 0)
+ return 1;
+ }
+ else {
+ const char *p = strchr(name, ':');
+ if (!p) p = name + strlen(name);
+ if (p - name > 4 && strncasecmp(p - 4, "-bom", 4) == 0)
+ return 1;
+ }
+ return 0;
+}
+
int
rb_io_modestr_fmode(const char *modestr)
{
int fmode = 0;
- const char *m = modestr;
+ const char *m = modestr, *p = NULL;
switch (*m++) {
case 'r':
@@ -3969,6 +3984,7 @@ rb_io_modestr_fmode(const char *modestr)
default:
goto error;
case ':':
+ p = m;
goto finished;
}
}
@@ -3976,6 +3992,8 @@ rb_io_modestr_fmode(const char *modestr)
finished:
if ((fmode & FMODE_BINMODE) && (fmode & FMODE_TEXTMODE))
goto error;
+ if (p && io_encname_bom_p(p, 0))
+ fmode |= FMODE_STRIP_BOM;
return fmode;
}
@@ -4126,14 +4144,24 @@ parse_mode_enc(const char *estr, rb_encoding **enc_p, rb_encoding **enc2_p)
if (len == 0 || len > ENCODING_MAXNAMELEN)
idx = -1;
else {
+ if (io_encname_bom_p(estr, len))
+ len -= 4;
memcpy(encname, estr, len);
encname[len] = '\0';
estr = encname;
idx = rb_enc_find_index(encname);
}
}
- else
+ else {
+ long len = strlen(estr);
+ if (io_encname_bom_p(estr, len)) {
+ len -= 4;
+ memcpy(encname, estr, len);
+ encname[len] = '\0';
+ estr = encname;
+ }
idx = rb_enc_find_index(estr);
+ }
if (idx >= 0)
ext_enc = rb_enc_from_index(idx);
@@ -4309,6 +4337,8 @@ rb_io_extract_modeenc(VALUE *vmode_p, VALUE *vperm_p, VALUE opthash,
if (p) {
has_enc = 1;
parse_mode_enc(p+1, &enc, &enc2);
+ if (io_encname_bom_p(p+1, 0))
+ fmode |= FMODE_STRIP_BOM;
}
else {
rb_encoding *e;
@@ -4493,6 +4523,84 @@ io_check_tty(rb_io_t *fptr)
fptr->mode |= FMODE_TTY|FMODE_DUPLEX;
}
+static VALUE rb_io_internal_encoding(VALUE);
+static void io_encoding_set(rb_io_t *, VALUE, VALUE, VALUE);
+
+static int
+io_strip_bom(VALUE io) {
+ int b1, b2, b3, b4;
+ switch (b1 = FIX2INT(rb_io_getbyte(io))) {
+ case 0xEF:
+ b2 = FIX2INT(rb_io_getbyte(io));
+ if (b2 == 0xBB) {
+ b3 = FIX2INT(rb_io_getbyte(io));
+ if (b3 == 0xBF) {
+ return rb_utf8_encindex();
+ }
+ rb_io_ungetbyte(io, INT2FIX(b3));
+ }
+ rb_io_ungetbyte(io, INT2FIX(b2));
+ break;
+
+ case 0xFE:
+ b2 = FIX2INT(rb_io_getbyte(io));
+ if (b2 == 0xFF) {
+ return rb_enc_find_index("UTF-16BE");
+ }
+ rb_io_ungetbyte(io, INT2FIX(b2));
+ break;
+
+ case 0xFF:
+ b2 = FIX2INT(rb_io_getbyte(io));
+ if (b2 == 0xFF) {
+ b3 = FIX2INT(rb_io_getbyte(io));
+ if (b3 == 0) {
+ b4 = FIX2INT(rb_io_getbyte(io));
+ if (b4 == 0) {
+ return rb_enc_find_index("UTF-32LE");
+ }
+ rb_io_ungetbyte(io, INT2FIX(b4));
+ }
+ else {
+ return rb_enc_find_index("UTF-16LE");
+ }
+ rb_io_ungetbyte(io, INT2FIX(b3));
+ }
+ rb_io_ungetbyte(io, INT2FIX(b2));
+ break;
+
+ case 0:
+ b2 = FIX2INT(rb_io_getbyte(io));
+ if (b2 == 0) {
+ b3 = FIX2INT(rb_io_getbyte(io));
+ if (b3 == 0xFE) {
+ b4 = FIX2INT(rb_io_getbyte(io));
+ if (b4 == 0xFF) {
+ return rb_enc_find_index("UTF-32BE");
+ }
+ rb_io_ungetbyte(io, INT2FIX(b4));
+ }
+ rb_io_ungetbyte(io, INT2FIX(b3));
+ }
+ rb_io_ungetbyte(io, INT2FIX(b2));
+ break;
+ }
+ rb_io_ungetbyte(io, INT2FIX(b1));
+ return 0;
+}
+
+static void
+io_set_encoding_by_bom(VALUE io) {
+ int idx = io_strip_bom(io);
+
+ if (idx) {
+ rb_io_t *fptr;
+ GetOpenFile(io, fptr);
+ io_encoding_set(fptr, rb_enc_from_encoding(rb_enc_from_index(idx)),
+ rb_io_internal_encoding(io), Qnil);
+ }
+}
+
static VALUE
rb_file_open_generic(VALUE io, VALUE filename, int oflags, int fmode, convconfig_t *convconfig, mode_t perm)
{
@@ -4513,6 +4621,7 @@ rb_file_open_generic(VALUE io, VALUE filename, int oflags, int fmode, convconfig
fptr->pathv = rb_str_new_frozen(filename);
fptr->fd = rb_sysopen(fptr->pathv, oflags, perm);
io_check_tty(fptr);
+ if (fmode & FMODE_STRIP_BOM) io_set_encoding_by_bom(io);
return io;
}
@@ -6250,6 +6359,7 @@ rb_io_initialize(int argc, VALUE *argv, VALUE io)
else if (fileno(stderr) == fd)
fp->stdio_file = stderr;
+ if (fmode & FMODE_STRIP_BOM) io_set_encoding_by_bom(io);
return io;
}