summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- ChangeLog 16
-rw-r--r-- enc/euc_jp.c 7
-rw-r--r-- enc/shift_jis.c 4
-rw-r--r-- numeric.c 11
-rw-r--r-- regenc.c 5
-rw-r--r-- string.c 43
-rw-r--r-- test/ruby/enc/test_shift_jis.rb 2
-rw-r--r-- test/ruby/test_m17n.rb 9
-rw-r--r-- test/ruby/test_regexp.rb 2
-rw-r--r-- version.h 2
10 files changed, 80 insertions, 21 deletions
diff --git a/ChangeLog b/ChangeLog
index ab140541b0..37d4fc788c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,19 @@
+Thu Feb 9 07:32:40 2012 NARUSE, Yui <naruse@ruby-lang.org>
+
+ * numeric.c (rb_enc_uint_chr): raise RangeError when added codepoint
+ is invalid. [Feature #5855] [Bug #5863] [Bug #5864]
+
+ * string.c (rb_str_concat): ditto.
+
+ * string.c (rb_str_concat): set encoding as ASCII-8BIT when the string
+ is US-ASCII and the argument is an integer greater than 127.
+
+ * regenc.c (onigenc_mb2_code_to_mbclen): rearrange error code.
+
+ * enc/euc_jp.c (code_to_mbclen): ditto.
+
+ * enc/shift_jis.c (code_to_mbclen): ditto.
+
Thu Feb 9 07:28:43 2012 NARUSE, Yui <naruse@ruby-lang.org>
* test/pathname/test_pathname.rb: not read but binread.
diff --git a/enc/euc_jp.c b/enc/euc_jp.c
index 2666e60ae0..7667c5800e 100644
--- a/enc/euc_jp.c
+++ b/enc/euc_jp.c
@@ -154,9 +154,10 @@ static int
code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
{
if (ONIGENC_IS_CODE_ASCII(code)) return 1;
- else if (code > 0xffffff) return 0;
- else if ((code & 0xff0000) >= 0x800000) return 3;
- else if ((code & 0xff00) >= 0x8000) return 2;
+ else if (code > 0xffffff)
+ return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
+ else if (code & 0x800000) return 3;
+ else if (code & 0x8000) return 2;
else
return ONIGERR_INVALID_CODE_POINT_VALUE;
}
diff --git a/enc/shift_jis.c b/enc/shift_jis.c
index d1357b3212..9dcacb584d 100644
--- a/enc/shift_jis.c
+++ b/enc/shift_jis.c
@@ -135,13 +135,13 @@ code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
if (EncLen_SJIS[(int )code] == 1)
return 1;
else
- return 0;
+ return ONIGERR_INVALID_CODE_POINT_VALUE;
}
else if (code <= 0xffff) {
return 2;
}
else
- return ONIGERR_INVALID_CODE_POINT_VALUE;
+ return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
}
static OnigCodePoint
diff --git a/numeric.c b/numeric.c
index baa6a178e4..c05aaee8fd 100644
--- a/numeric.c
+++ b/numeric.c
@@ -2196,11 +2196,20 @@ rb_enc_uint_chr(unsigned int code, rb_encoding *enc)
{
int n;
VALUE str;
- if ((n = rb_enc_codelen(code, enc)) <= 0) {
+ switch (n = rb_enc_codelen(code, enc)) {
+ case ONIGERR_INVALID_CODE_POINT_VALUE:
+ rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
+ break;
+ case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
+ case 0:
rb_raise(rb_eRangeError, "%u out of char range", code);
+ break;
}
str = rb_enc_str_new(0, n, enc);
rb_enc_mbcput(code, RSTRING_PTR(str), enc);
+ if (rb_enc_precise_mbclen(RSTRING_PTR(str), RSTRING_END(str), enc) != n) {
+ rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
+ }
return str;
}
diff --git a/regenc.c b/regenc.c
index 32d24e76d9..70b56ef727 100644
--- a/regenc.c
+++ b/regenc.c
@@ -732,8 +732,9 @@ onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigCaseFoldType flag,
extern int
onigenc_mb2_code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
{
- if ((code & 0xff00) != 0) return 2;
- else return 1;
+ if (code <= 0xff) return 1;
+ if (code <= 0xffff) return 2;
+ return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
}
extern int
diff --git a/string.c b/string.c
index d2737045e1..22f7aaf614 100644
--- a/string.c
+++ b/string.c
@@ -2081,10 +2081,11 @@ rb_str_append(VALUE str, VALUE str2)
VALUE
rb_str_concat(VALUE str1, VALUE str2)
{
- unsigned int lc;
+ unsigned int code;
+ rb_encoding *enc = STR_ENC_GET(str1);
if (FIXNUM_P(str2) || TYPE(str2) == T_BIGNUM) {
- if (rb_num_to_uint(str2, &lc) == 0) {
+ if (rb_num_to_uint(str2, &code) == 0) {
}
else if (FIXNUM_P(str2)) {
rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
@@ -2096,22 +2097,46 @@ rb_str_concat(VALUE str1, VALUE str2)
else {
return rb_str_append(str1, str2);
}
- {
- rb_encoding *enc = STR_ENC_GET(str1);
+
+ if (enc == rb_usascii_encoding()) {
+ /* US-ASCII automatically extended to ASCII-8BIT */
+ char buf[1] = {(char)code};
+ if (code > 0xFF) {
+ rb_raise(rb_eRangeError, "%u out of char range", code);
+ }
+ rb_str_cat(str1, buf, 1);
+ if (code > 127) {
+ rb_enc_associate(str1, rb_ascii8bit_encoding());
+ ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
+ }
+ }
+ else {
long pos = RSTRING_LEN(str1);
int cr = ENC_CODERANGE(str1);
int len;
+ char *buf;
- if ((len = rb_enc_codelen(lc, enc)) <= 0) {
- rb_raise(rb_eRangeError, "%u invalid char", lc);
+ switch (len = rb_enc_codelen(code, enc)) {
+ case ONIGERR_INVALID_CODE_POINT_VALUE:
+ rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
+ break;
+ case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
+ case 0:
+ rb_raise(rb_eRangeError, "%u out of char range", code);
+ break;
+ }
+ buf = ALLOCA_N(char, len + 1);
+ rb_enc_mbcput(code, buf, enc);
+ if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
+ rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
}
rb_str_resize(str1, pos+len);
- rb_enc_mbcput(lc, RSTRING_PTR(str1)+pos, enc);
- if (cr == ENC_CODERANGE_7BIT && lc > 127)
+ strncpy(RSTRING_PTR(str1) + pos, buf, len);
+ if (cr == ENC_CODERANGE_7BIT && code > 127)
cr = ENC_CODERANGE_VALID;
ENC_CODERANGE_SET(str1, cr);
- return str1;
}
+ return str1;
}
/*
diff --git a/test/ruby/enc/test_shift_jis.rb b/test/ruby/enc/test_shift_jis.rb
index f81cb7801c..54ef67dd44 100644
--- a/test/ruby/enc/test_shift_jis.rb
+++ b/test/ruby/enc/test_shift_jis.rb
@@ -22,6 +22,6 @@ class TestShiftJIS < Test::Unit::TestCase
s = "あいうえお"
s << 0x82a9
assert_equal("あいうえおか", s)
- assert_raise(ArgumentError) { s << 0x82 }
+ assert_raise(RangeError) { s << 0x82 }
end
end
diff --git a/test/ruby/test_m17n.rb b/test/ruby/test_m17n.rb
index f4d3dcbdad..3553f3a0a4 100644
--- a/test/ruby/test_m17n.rb
+++ b/test/ruby/test_m17n.rb
@@ -1137,6 +1137,7 @@ class TestM17N < Test::Unit::TestCase
def test_str_concat
assert_equal(1, "".concat(0xA2).size)
+ assert_equal(Encoding::ASCII_8BIT, "".force_encoding("US-ASCII").concat(0xA2).encoding)
assert_equal("A\x84\x31\xA4\x39".force_encoding("GB18030"),
"A".force_encoding("GB18030") << 0x8431A439)
end
@@ -1196,6 +1197,14 @@ class TestM17N < Test::Unit::TestCase
2206368128.chr(Encoding::UTF_8)
}
assert_not_match(/-\d+ out of char range/, e.message)
+
+ assert_raise(RangeError){ 0x80.chr("US-ASCII") }
+ assert_raise(RangeError){ 0x80.chr("SHIFT_JIS") }
+ assert_raise(RangeError){ 0xE0.chr("SHIFT_JIS") }
+ assert_raise(RangeError){ 0x100.chr("SHIFT_JIS") }
+ assert_raise(RangeError){ 0xA0.chr("EUC-JP") }
+ assert_raise(RangeError){ 0x100.chr("EUC-JP") }
+ assert_raise(RangeError){ 0xA1A0.chr("EUC-JP") }
end
def test_marshal
diff --git a/test/ruby/test_regexp.rb b/test/ruby/test_regexp.rb
index 1bc2d0a79f..9626f61eb3 100644
--- a/test/ruby/test_regexp.rb
+++ b/test/ruby/test_regexp.rb
@@ -803,8 +803,6 @@ class TestRegexp < Test::Unit::TestCase
#assert_match(/^(\ufb05)\1\1$/i, "\ufb05\ufb06st") # this must be bug...
assert_match(/^\ufb05{3}$/i, "\ufb05\ufb06st")
assert_match(/^\u03b9\u0308\u0301$/i, "\u0390")
- assert_nothing_raised { 0x03ffffff.chr("utf-8").size }
- assert_nothing_raised { 0x7fffffff.chr("utf-8").size }
end
def test_unicode_age
diff --git a/version.h b/version.h
index 0e486b8b5f..5ce46a1eec 100644
--- a/version.h
+++ b/version.h
@@ -1,5 +1,5 @@
#define RUBY_VERSION "1.9.3"
-#define RUBY_PATCHLEVEL 70
+#define RUBY_PATCHLEVEL 71
#define RUBY_RELEASE_DATE "2012-02-09"
#define RUBY_RELEASE_YEAR 2012