From 90229f89a53f942b2aa3a2e583ab717b1e831aa7 Mon Sep 17 00:00:00 2001 From: yugui Date: Thu, 30 Oct 2008 12:57:28 +0000 Subject: merges r20054 and r20055 from trunk into ruby_1_9_1. * enc/trans/single_byte.trans: refactoring to make it easier to add more transcodings (with Yoshihiro Kambayashi) * enc/trans/iso-8859-1-tbl.rb: new file to avoid having to treat ISO-8859-1 as special git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/branches/ruby_1_9_1@20063 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- enc/trans/iso-8859-1-tbl.rb | 98 +++++++++++++++++++++++++++++++++++++++++++++ enc/trans/single_byte.trans | 91 +++++++++++++++++------------------------ 2 files changed, 135 insertions(+), 54 deletions(-) create mode 100644 enc/trans/iso-8859-1-tbl.rb (limited to 'enc') diff --git a/enc/trans/iso-8859-1-tbl.rb b/enc/trans/iso-8859-1-tbl.rb new file mode 100644 index 0000000000..05397e6417 --- /dev/null +++ b/enc/trans/iso-8859-1-tbl.rb @@ -0,0 +1,98 @@ +ISO_8859_1_TO_UCS_TBL = [ + ["A0",0xA0], + ["A1",0xA1], + ["A2",0xA2], + ["A3",0xA3], + ["A4",0xA4], + ["A5",0xA5], + ["A6",0xA6], + ["A7",0xA7], + ["A8",0xA8], + ["A9",0xA9], + ["AA",0xAA], + ["AB",0xAB], + ["AC",0xAC], + ["AD",0xAD], + ["AE",0xAE], + ["AF",0xAF], + ["B0",0xB0], + ["B1",0xB1], + ["B2",0xB2], + ["B3",0xB3], + ["B4",0xB4], + ["B5",0xB5], + ["B6",0xB6], + ["B7",0xB7], + ["B8",0xB8], + ["B9",0xB9], + ["BA",0xBA], + ["BB",0xBB], + ["BC",0xBC], + ["BD",0xBD], + ["BE",0xBE], + ["BF",0xBF], + ["C0",0xC0], + ["C1",0xC1], + ["C2",0xC2], + ["C3",0xC3], + ["C4",0xC4], + ["C5",0xC5], + ["C6",0xC6], + ["C7",0xC7], + ["C8",0xC8], + ["C9",0xC9], + ["CA",0xCA], + ["CB",0xCB], + ["CC",0xCC], + ["CD",0xCD], + ["CE",0xCE], + ["CF",0xCF], + ["D0",0xD0], + ["D1",0xD1], + ["D2",0xD2], + ["D3",0xD3], + ["D4",0xD4], + ["D5",0xD5], + ["D6",0xD6], + ["D7",0xD7], + ["D8",0xD8], + ["D9",0xD9], + ["DA",0xDA], + ["DB",0xDB], + ["DC",0xDC], + ["DD",0xDD], + ["DE",0xDE], + ["DF",0xDF], + ["E0",0xE0], + ["E1",0xE1], + ["E2",0xE2], + ["E3",0xE3], + ["E4",0xE4], + ["E5",0xE5], + ["E6",0xE6], + ["E7",0xE7], + ["E8",0xE8], + ["E9",0xE9], + ["EA",0xEA], + ["EB",0xEB], + ["EC",0xEC], + ["ED",0xED], + ["EE",0xEE], + ["EF",0xEF], + ["F0",0xF0], + ["F1",0xF1], + ["F2",0xF2], + ["F3",0xF3], + ["F4",0xF4], + ["F5",0xF5], + ["F6",0xF6], + ["F7",0xF7], + ["F8",0xF8], + ["F9",0xF9], + ["FA",0xFA], + ["FB",0xFB], + ["FC",0xFC], + ["FD",0xFD], + ["FE",0xFE], + ["FF",0xFF], +] diff --git a/enc/trans/single_byte.trans b/enc/trans/single_byte.trans index d445c8e130..b49bc779a1 100644 --- a/enc/trans/single_byte.trans +++ b/enc/trans/single_byte.trans @@ -3,38 +3,25 @@ <% us_ascii_map = [["{00-7f}", :nomap]] - ISO_8859_1_TO_UCS_TBL = (0x80..0xff).map {|c| ["%02X" % c, c] } - CONTROL1_TO_UCS_TBL = (0x80..0x9f).map {|c| ["%02X" % c, c] } - - require 'iso-8859-2-tbl' - require 'iso-8859-3-tbl' - require 'iso-8859-4-tbl' - require 'iso-8859-5-tbl' - require 'iso-8859-6-tbl' - require 'iso-8859-7-tbl' - require 'iso-8859-8-tbl' - require 'iso-8859-9-tbl' - require 'iso-8859-10-tbl' - require 'iso-8859-11-tbl' - require 'iso-8859-13-tbl' - require 'iso-8859-14-tbl' - require 'iso-8859-15-tbl' - require 'windows-874-tbl' - require 'windows-1250-tbl' - require 'windows-1251-tbl' - require 'windows-1252-tbl' - require 'windows-1253-tbl' - require 'windows-1254-tbl' - require 'windows-1255-tbl' - require 'windows-1256-tbl' - require 'windows-1257-tbl' - transcode_tblgen "US-ASCII", "UTF-8", us_ascii_map transcode_tblgen "UTF-8", "US-ASCII", us_ascii_map transcode_tblgen "ASCII-8BIT", "UTF-8", us_ascii_map transcode_tblgen "UTF-8", "ASCII-8BIT", us_ascii_map - def transcode_tblgen_singlebyte(name, tbl_to_ucs) + CONTROL1_TO_UCS_TBL = (0x80..0x9f).map {|c| ["%02X" % c, c] } + + # Generate transcoding tables for single byte encoding from + # encoding name using table file. + # + # Conventions: + # name: encoding name as string, UPPER case, hyphens (e.g. 'ISO-8859-3') + # file name: lower case, hyphens, -tbl.rb suffix (e.g. iso-8859-3-tbl.rb) + # variable name: UPPER case, underscores, _TO_UCS_TBL suffix (e.g. ISO_8859_3_TO_UCS_TBL) + # If the name starts with "ISO-8859", the C1 control code area is added automatically. + def transcode_tblgen_singlebyte (name) + require(name.downcase + "-tbl") + control1_if_needed = (name =~ /^ISO-8859/) ? CONTROL1_TO_UCS_TBL : [] + tbl_to_ucs = control1_if_needed + eval(name.gsub(/-/, '_') + "_TO_UCS_TBL") set_valid_byte_pattern(name, '1byte') code = '' code << transcode_tblgen(name, "UTF-8", [["{00-7f}", :nomap], *tbl_to_ucs]) @@ -43,33 +30,29 @@ code end - def transcode_tblgen_iso8859(name, tbl_to_ucs) - transcode_tblgen_singlebyte(name, CONTROL1_TO_UCS_TBL + tbl_to_ucs) - end - - transcode_tblgen_iso8859("ISO-8859-1", ISO_8859_1_TO_UCS_TBL) - transcode_tblgen_iso8859("ISO-8859-2", ISO_8859_2_TO_UCS_TBL) - transcode_tblgen_iso8859("ISO-8859-3", ISO_8859_3_TO_UCS_TBL) - transcode_tblgen_iso8859("ISO-8859-4", ISO_8859_4_TO_UCS_TBL) - transcode_tblgen_iso8859("ISO-8859-5", ISO_8859_5_TO_UCS_TBL) - transcode_tblgen_iso8859("ISO-8859-6", ISO_8859_6_TO_UCS_TBL) - transcode_tblgen_iso8859("ISO-8859-7", ISO_8859_7_TO_UCS_TBL) - transcode_tblgen_iso8859("ISO-8859-8", ISO_8859_8_TO_UCS_TBL) - transcode_tblgen_iso8859("ISO-8859-9", ISO_8859_9_TO_UCS_TBL) - transcode_tblgen_iso8859("ISO-8859-10", ISO_8859_10_TO_UCS_TBL) - transcode_tblgen_iso8859("ISO-8859-11", ISO_8859_11_TO_UCS_TBL) - transcode_tblgen_iso8859("ISO-8859-13", ISO_8859_13_TO_UCS_TBL) - transcode_tblgen_iso8859("ISO-8859-14", ISO_8859_14_TO_UCS_TBL) - transcode_tblgen_iso8859("ISO-8859-15", ISO_8859_15_TO_UCS_TBL) - transcode_tblgen_singlebyte("WINDOWS-874", WINDOWS_874_TO_UCS_TBL) - transcode_tblgen_singlebyte("WINDOWS-1250", WINDOWS_1250_TO_UCS_TBL) - transcode_tblgen_singlebyte("WINDOWS-1251", WINDOWS_1251_TO_UCS_TBL) - transcode_tblgen_singlebyte("WINDOWS-1252", WINDOWS_1252_TO_UCS_TBL) - transcode_tblgen_singlebyte("WINDOWS-1253", WINDOWS_1253_TO_UCS_TBL) - transcode_tblgen_singlebyte("WINDOWS-1254", WINDOWS_1254_TO_UCS_TBL) - transcode_tblgen_singlebyte("WINDOWS-1255", WINDOWS_1255_TO_UCS_TBL) - transcode_tblgen_singlebyte("WINDOWS-1256", WINDOWS_1256_TO_UCS_TBL) - transcode_tblgen_singlebyte("WINDOWS-1257", WINDOWS_1257_TO_UCS_TBL) + transcode_tblgen_singlebyte "ISO-8859-1" + transcode_tblgen_singlebyte "ISO-8859-2" + transcode_tblgen_singlebyte "ISO-8859-3" + transcode_tblgen_singlebyte "ISO-8859-4" + transcode_tblgen_singlebyte "ISO-8859-5" + transcode_tblgen_singlebyte "ISO-8859-6" + transcode_tblgen_singlebyte "ISO-8859-7" + transcode_tblgen_singlebyte "ISO-8859-8" + transcode_tblgen_singlebyte "ISO-8859-9" + transcode_tblgen_singlebyte "ISO-8859-10" + transcode_tblgen_singlebyte "ISO-8859-11" + transcode_tblgen_singlebyte "ISO-8859-13" + transcode_tblgen_singlebyte "ISO-8859-14" + transcode_tblgen_singlebyte "ISO-8859-15" + transcode_tblgen_singlebyte "WINDOWS-874" + transcode_tblgen_singlebyte "WINDOWS-1250" + transcode_tblgen_singlebyte "WINDOWS-1251" + transcode_tblgen_singlebyte "WINDOWS-1252" + transcode_tblgen_singlebyte "WINDOWS-1253" + transcode_tblgen_singlebyte "WINDOWS-1254" + transcode_tblgen_singlebyte "WINDOWS-1255" + transcode_tblgen_singlebyte "WINDOWS-1256" + transcode_tblgen_singlebyte "WINDOWS-1257" %> <%= transcode_generated_code %> -- cgit v1.2.3