From d403591b34e204a5937241025c62c877e579fbaf Mon Sep 17 00:00:00 2001 From: Lars Kanis Date: Sun, 22 Nov 2020 14:23:40 +0100 Subject: Add string encoding IBM720 alias CP720 (#3803) The mapping table is generated from the ICU project: https://github.com/unicode-org/icu/blob/master/icu4c/source/data/mappings/ibm-720_P100-1997.ucm Fixes bug 16233 : https://bugs.ruby-lang.org/issues/16233 --- enc/ascii.c | 2 + enc/trans/ibm720-tbl.rb | 122 +++++++++++++++++++++++++++ enc/trans/single_byte.trans | 3 +- ext/win32ole/win32ole.c | 1 + spec/ruby/core/string/valid_encoding_spec.rb | 8 ++ test/ruby/test_transcode.rb | 19 +++++ 6 files changed, 154 insertions(+), 1 deletion(-) create mode 100644 enc/trans/ibm720-tbl.rb diff --git a/enc/ascii.c b/enc/ascii.c index 8b32c414fe..a2fef2f879 100644 --- a/enc/ascii.c +++ b/enc/ascii.c @@ -61,6 +61,8 @@ OnigEncodingDefine(ascii, ASCII) = { ENC_ALIAS("BINARY", "ASCII-8BIT") ENC_REPLICATE("IBM437", "ASCII-8BIT") ENC_ALIAS("CP437", "IBM437") +ENC_REPLICATE("IBM720", "ASCII-8BIT") +ENC_ALIAS("CP720", "IBM720") ENC_REPLICATE("IBM737", "ASCII-8BIT") ENC_ALIAS("CP737", "IBM737") ENC_REPLICATE("IBM775", "ASCII-8BIT") diff --git a/enc/trans/ibm720-tbl.rb b/enc/trans/ibm720-tbl.rb new file mode 100644 index 0000000000..558684d649 --- /dev/null +++ b/enc/trans/ibm720-tbl.rb @@ -0,0 +1,122 @@ +IBM720_TO_UCS_TBL = [ + ["FF",0xA0], + ["9C",0xA3], + ["94",0xA4], + ["AE",0xAB], + ["F8",0xB0], + ["FD",0xB2], + ["E6",0xB5], + ["FA",0xB7], + ["AF",0xBB], + ["85",0xE0], + ["83",0xE2], + ["87",0xE7], + ["8A",0xE8], + ["82",0xE9], + ["88",0xEA], + ["89",0xEB], + ["8C",0xEE], + ["8B",0xEF], + ["93",0xF4], + ["97",0xF9], + ["96",0xFB], + ["98",0x621], + ["99",0x622], + ["9A",0x623], + ["9B",0x624], + ["9D",0x625], + ["9E",0x626], + ["9F",0x627], + ["A0",0x628], + ["A1",0x629], + ["A2",0x62A], + ["A3",0x62B], + ["A4",0x62C], + ["A5",0x62D], + ["A6",0x62E], + ["A7",0x62F], + ["A8",0x630], + ["A9",0x631], + ["AA",0x632], + ["AB",0x633], + ["AC",0x634], + ["AD",0x635], + ["E0",0x636], + ["E1",0x637], + ["E2",0x638], + ["E3",0x639], + ["E4",0x63A], + ["95",0x640], + ["E5",0x641], + ["E7",0x642], + ["E8",0x643], + ["E9",0x644], + ["EA",0x645], + ["EB",0x646], + ["EC",0x647], + ["ED",0x648], + ["EE",0x649], + ["EF",0x64A], + ["F1",0x64B], + ["F2",0x64C], + ["F3",0x64D], + ["F4",0x64E], + ["F5",0x64F], + ["F6",0x650], + ["91",0x651], + ["92",0x652], + ["FC",0x207F], + ["F9",0x2219], + ["FB",0x221A], + ["F7",0x2248], + ["F0",0x2261], + ["C4",0x2500], + ["B3",0x2502], + ["DA",0x250C], + ["BF",0x2510], + ["C0",0x2514], + ["D9",0x2518], + ["C3",0x251C], + ["B4",0x2524], + ["C2",0x252C], + ["C1",0x2534], + ["C5",0x253C], + ["CD",0x2550], + ["BA",0x2551], + ["D5",0x2552], + ["D6",0x2553], + ["C9",0x2554], + ["B8",0x2555], + ["B7",0x2556], + ["BB",0x2557], + ["D4",0x2558], + ["D3",0x2559], + ["C8",0x255A], + ["BE",0x255B], + ["BD",0x255C], + ["BC",0x255D], + ["C6",0x255E], + ["C7",0x255F], + ["CC",0x2560], + ["B5",0x2561], + ["B6",0x2562], + ["B9",0x2563], + ["D1",0x2564], + ["D2",0x2565], + ["CB",0x2566], + ["CF",0x2567], + ["D0",0x2568], + ["CA",0x2569], + ["D8",0x256A], + ["D7",0x256B], + ["CE",0x256C], + ["DF",0x2580], + ["DC",0x2584], + ["DB",0x2588], + ["DD",0x258C], + ["DE",0x2590], + ["B0",0x2591], + ["B1",0x2592], + ["B2",0x2593], + ["FE",0x25A0], +] diff --git a/enc/trans/single_byte.trans b/enc/trans/single_byte.trans index cf521bed38..0d5407b918 100644 --- a/enc/trans/single_byte.trans +++ b/enc/trans/single_byte.trans @@ -51,8 +51,9 @@ transcode_tblgen_singlebyte "WINDOWS-1256" transcode_tblgen_singlebyte "WINDOWS-1257" transcode_tblgen_singlebyte "IBM437" - transcode_tblgen_singlebyte "IBM775" + transcode_tblgen_singlebyte "IBM720" transcode_tblgen_singlebyte "IBM737" + transcode_tblgen_singlebyte "IBM775" transcode_tblgen_singlebyte "IBM852" transcode_tblgen_singlebyte "IBM855" transcode_tblgen_singlebyte "IBM857" diff --git a/ext/win32ole/win32ole.c b/ext/win32ole/win32ole.c index 4f4550c5b9..59bae0e774 100644 --- a/ext/win32ole/win32ole.c +++ b/ext/win32ole/win32ole.c @@ -507,6 +507,7 @@ static UINT ole_encoding2cp(rb_encoding *enc) ENC_MACHING_CP(enc, "GB2312", 20936); ENC_MACHING_CP(enc, "GBK", 936); ENC_MACHING_CP(enc, "IBM437", 437); + ENC_MACHING_CP(enc, "IBM720", 720); ENC_MACHING_CP(enc, "IBM737", 737); ENC_MACHING_CP(enc, "IBM775", 775); ENC_MACHING_CP(enc, "IBM852", 852); diff --git a/spec/ruby/core/string/valid_encoding_spec.rb b/spec/ruby/core/string/valid_encoding_spec.rb index 09916df079..3e820a5c6d 100644 --- a/spec/ruby/core/string/valid_encoding_spec.rb +++ b/spec/ruby/core/string/valid_encoding_spec.rb @@ -100,6 +100,14 @@ describe "String#valid_encoding?" do str.force_encoding('UTF8-MAC').valid_encoding?.should be_true end + ruby_version_is '3.0' do + it "returns true for IBM720 encoding self is valid in" do + str = "\u{6754}" + str.force_encoding('IBM720').valid_encoding?.should be_true + str.force_encoding('CP720').valid_encoding?.should be_true + end + end + it "returns false if self is valid in one encoding, but invalid in the one it's tagged with" do str = "\u{8765}" str.valid_encoding?.should be_true diff --git a/test/ruby/test_transcode.rb b/test/ruby/test_transcode.rb index 0f6b787fa1..04c8248697 100644 --- a/test/ruby/test_transcode.rb +++ b/test/ruby/test_transcode.rb @@ -469,6 +469,25 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u00A0", "\xFF", 'IBM437') # non-breaking space end + def test_IBM720 + assert_raise(Encoding::UndefinedConversionError) { "\x80".encode("utf-8", 'IBM720') } + assert_raise(Encoding::UndefinedConversionError) { "\x8F".encode("utf-8", 'IBM720') } + assert_raise(Encoding::UndefinedConversionError) { "\x90".encode("utf-8", 'IBM720') } + check_both_ways("\u0627", "\x9F", 'IBM720') # ا + check_both_ways("\u0628", "\xA0", 'IBM720') # ب + check_both_ways("\u00BB", "\xAF", 'IBM720') # » + check_both_ways("\u2591", "\xB0", 'IBM720') # ░ + check_both_ways("\u2510", "\xBF", 'IBM720') # ┐ + check_both_ways("\u2514", "\xC0", 'IBM720') # └ + check_both_ways("\u2567", "\xCF", 'IBM720') # ╧ + check_both_ways("\u2568", "\xD0", 'IBM720') # ╨ + check_both_ways("\u2580", "\xDF", 'IBM720') # ▀ + check_both_ways("\u0636", "\xE0", 'IBM720') # ض + check_both_ways("\u064A", "\xEF", 'IBM720') # ي + check_both_ways("\u2261", "\xF0", 'IBM720') # ≡ + check_both_ways("\u00A0", "\xFF", 'IBM720') # non-breaking space + end + def test_IBM775 check_both_ways("\u0106", "\x80", 'IBM775') # Ć check_both_ways("\u00C5", "\x8F", 'IBM775') # Å -- cgit v1.2.3