diff options
| author | Kevin Newton <kddnewton@gmail.com> | 2023-11-29 21:01:10 -0500 |
|---|---|---|
| committer | Kevin Newton <kddnewton@gmail.com> | 2023-11-30 21:37:56 -0500 |
| commit | a9162a44c59d85a56930e78cf1801558984db4a7 (patch) | |
| tree | c7b1521dc74122b91d66d4b816cb17956ade73c7 | |
| parent | 10d3897e13f87569d6a682336141330371a902e3 (diff) | |
[ruby/prism] Emacs MULE encodings
https://github.com/ruby/prism/commit/4c06b6c42e
| -rw-r--r-- | prism/enc/pm_big5.c | 102 | ||||
| -rw-r--r-- | prism/enc/pm_encoding.h | 3 | ||||
| -rw-r--r-- | prism/prism.c | 3 | ||||
| -rw-r--r-- | test/prism/encoding_test.rb | 36 |
4 files changed, 116 insertions, 28 deletions
diff --git a/prism/enc/pm_big5.c b/prism/enc/pm_big5.c index a97574a173..948cfc4b11 100644 --- a/prism/enc/pm_big5.c +++ b/prism/enc/pm_big5.c @@ -21,29 +21,17 @@ pm_encoding_big5_char_width(const uint8_t *b, ptrdiff_t n) { static size_t pm_encoding_big5_alpha_char(const uint8_t *b, ptrdiff_t n) { - if (pm_encoding_big5_char_width(b, n) == 1) { - return pm_encoding_ascii_alpha_char(b, n); - } else { - return 0; - } + return (pm_encoding_big5_char_width(b, n) == 1) ? pm_encoding_ascii_alpha_char(b, n) : 0; } static size_t pm_encoding_big5_alnum_char(const uint8_t *b, ptrdiff_t n) { - if (pm_encoding_big5_char_width(b, n) == 1) { - return pm_encoding_ascii_alnum_char(b, n); - } else { - return 0; - } + return (pm_encoding_big5_char_width(b, n) == 1) ? pm_encoding_ascii_alnum_char(b, n) : 0; } static bool pm_encoding_big5_isupper_char(const uint8_t *b, ptrdiff_t n) { - if (pm_encoding_big5_char_width(b, n) == 1) { - return pm_encoding_ascii_isupper_char(b, n); - } else { - return false; - } + return (pm_encoding_big5_char_width(b, n) == 1) && pm_encoding_ascii_isupper_char(b, n); } /** Big5 encoding */ @@ -95,3 +83,87 @@ pm_encoding_t pm_encoding_big5_uao = { .isupper_char = pm_encoding_big5_isupper_char, .multibyte = true }; + +static size_t +pm_encoding_emacs_mule_char_width(const uint8_t *b, ptrdiff_t n) { + // These are the 1 byte characters. + if (*b < 0x80) { + return 1; + } + + // These are the 2 byte characters. + if ((n > 1) && (b[0] >= 0x81 && b[0] <= 0x8F) && (b[1] >= 0xA0)) { + return 2; + } + + // These are the 3 byte characters. + if ( + (n > 2) && + ( + ((b[0] >= 0x90 && b[0] <= 0x99) && (b[1] >= 0xA0)) || + ((b[0] == 0x9A || b[0] == 0x9B) && (b[1] >= 0xE0 && b[1] <= 0xEF)) + ) && + (b[2] >= 0xA0) + ) { + return 3; + } + + // These are the 4 byte characters. + if ( + (n > 3) && + ( + ((b[0] == 0x9C) && (b[1] >= 0xF0) && (b[1] <= 0xF4)) || + ((b[0] == 0x9D) && (b[1] >= 0xF5) && (b[1] <= 0xFE)) + ) && + (b[2] >= 0xA0) && (b[3] >= 0xA0) + ) { + return 4; + } + + return 0; +} + +static size_t +pm_encoding_emacs_mule_alpha_char(const uint8_t *b, ptrdiff_t n) { + return (pm_encoding_emacs_mule_char_width(b, n) == 1) ? pm_encoding_ascii_alpha_char(b, n) : 0; +} + +static size_t +pm_encoding_emacs_mule_alnum_char(const uint8_t *b, ptrdiff_t n) { + return (pm_encoding_emacs_mule_char_width(b, n) == 1) ? pm_encoding_ascii_alnum_char(b, n) : 0; +} + +static bool +pm_encoding_emacs_mule_isupper_char(const uint8_t *b, ptrdiff_t n) { + return (pm_encoding_emacs_mule_char_width(b, n) == 1) && pm_encoding_ascii_isupper_char(b, n); +} + +/** Emacs-Mule encoding */ +pm_encoding_t pm_encoding_emacs_mule = { + .name = "Emacs-Mule", + .char_width = pm_encoding_emacs_mule_char_width, + .alnum_char = pm_encoding_emacs_mule_alnum_char, + .alpha_char = pm_encoding_emacs_mule_alpha_char, + .isupper_char = pm_encoding_emacs_mule_isupper_char, + .multibyte = true +}; + +/** stateless-ISO-2022-JP encoding */ +pm_encoding_t pm_encoding_stateless_iso_2022_jp = { + .name = "stateless-ISO-2022-JP", + .char_width = pm_encoding_emacs_mule_char_width, + .alnum_char = pm_encoding_emacs_mule_alnum_char, + .alpha_char = pm_encoding_emacs_mule_alpha_char, + .isupper_char = pm_encoding_emacs_mule_isupper_char, + .multibyte = true +}; + +/** stateless-ISO-2022-JP-KDDI encoding */ +pm_encoding_t pm_encoding_stateless_iso_2022_jp_kddi = { + .name = "stateless-ISO-2022-JP-KDDI", + .char_width = pm_encoding_emacs_mule_char_width, + .alnum_char = pm_encoding_emacs_mule_alnum_char, + .alpha_char = pm_encoding_emacs_mule_alpha_char, + .isupper_char = pm_encoding_emacs_mule_isupper_char, + .multibyte = true +}; diff --git a/prism/enc/pm_encoding.h b/prism/enc/pm_encoding.h index 937170d7d0..e14d4f6f2c 100644 --- a/prism/enc/pm_encoding.h +++ b/prism/enc/pm_encoding.h @@ -167,6 +167,7 @@ extern pm_encoding_t pm_encoding_cp855; extern pm_encoding_t pm_encoding_cp949; extern pm_encoding_t pm_encoding_cp950; extern pm_encoding_t pm_encoding_cp951; +extern pm_encoding_t pm_encoding_emacs_mule; extern pm_encoding_t pm_encoding_euc_jp; extern pm_encoding_t pm_encoding_euc_jp_ms; extern pm_encoding_t pm_encoding_euc_jis_2004; @@ -222,6 +223,8 @@ extern pm_encoding_t pm_encoding_shift_jis; extern pm_encoding_t pm_encoding_sjis_docomo; extern pm_encoding_t pm_encoding_sjis_kddi; extern pm_encoding_t pm_encoding_sjis_softbank; +extern pm_encoding_t pm_encoding_stateless_iso_2022_jp; +extern pm_encoding_t pm_encoding_stateless_iso_2022_jp_kddi; extern pm_encoding_t pm_encoding_tis_620; extern pm_encoding_t pm_encoding_utf_8; extern pm_encoding_t pm_encoding_utf8_mac; diff --git a/prism/prism.c b/prism/prism.c index 379d8b0ea9..e32b444619 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -6253,6 +6253,7 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star ENCODING2("EUC-JIS-2004", "EUC-JISX0213", pm_encoding_euc_jis_2004); ENCODING2("EUC-KR", "eucKR", pm_encoding_euc_kr); ENCODING2("EUC-CN", "eucCN", pm_encoding_gb2312); + ENCODING1("Emacs-Mule", pm_encoding_emacs_mule); ENCODING1("external", pm_encoding_utf_8); break; case 'F': case 'f': @@ -6327,6 +6328,8 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star ENCODING1("SJIS-DoCoMo", pm_encoding_sjis_docomo); ENCODING1("SJIS-KDDI", pm_encoding_sjis_kddi); ENCODING1("SJIS-SoftBank", pm_encoding_sjis_softbank); + ENCODING1("stateless-ISO-2022-JP", pm_encoding_stateless_iso_2022_jp); + ENCODING1("stateless-ISO-2022-JP-KDDI", pm_encoding_stateless_iso_2022_jp_kddi); break; case 'T': case 't': ENCODING1("TIS-620", pm_encoding_tis_620); diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb index 67652625cb..aa114b669b 100644 --- a/test/prism/encoding_test.rb +++ b/test/prism/encoding_test.rb @@ -6,8 +6,8 @@ require_relative "test_helper" module Prism class EncodingTest < TestCase - codepoints_1byte = 0x00...0x100 - codepoints_2bytes = 0x00...0x10000 + codepoints_1byte = 0...0x100 + codepoints_2bytes = 0...0x10000 encodings = { Encoding::ASCII => codepoints_1byte, @@ -91,19 +91,29 @@ module Prism # are 3 and 4 byte representations so it can drastically slow down the test # suite. if ENV["PRISM_TEST_ALL_ENCODINGS"] - codepoints_eucjp = [*(0x00...0x10000), *(0x00...0x10000).map { |bytes| bytes | 0x8F0000 }] - codepoints_unicode = 0x00...0x110000 + codepoints_eucjp = [*(0...0x10000), *(0...0x10000).map { |bytes| bytes | 0x8F0000 }] + codepoints_unicode = 0...0x110000 + codepoints_emacs_mule = [ + *(0...0x80), + *((0x81...0x90).flat_map { |byte1| (0x90...0x100).map { |byte2| byte1 << 8 | byte2 } }), + *((0x90...0x9C).flat_map { |byte1| (0xA0...0x100).flat_map { |byte2| (0xA0...0x100).flat_map { |byte3| byte1 << 16 | byte2 << 8 | byte3 } } }), + *((0xF0...0xF5).flat_map { |byte2| (0xA0...0x100).flat_map { |byte3| (0xA0...0x100).flat_map { |byte4| 0x9C << 24 | byte3 << 16 | byte3 << 8 | byte4 } } }), + ] + encodings.clear encodings.merge!( - Encoding::CP51932 => codepoints_eucjp, - Encoding::EUC_JP => codepoints_eucjp, - Encoding::EUCJP_MS => codepoints_eucjp, - Encoding::EUC_JIS_2004 => codepoints_eucjp, - Encoding::UTF_8 => codepoints_unicode, - Encoding::UTF8_MAC => codepoints_unicode, - Encoding::UTF8_DoCoMo => codepoints_unicode, - Encoding::UTF8_KDDI => codepoints_unicode, - Encoding::UTF8_SoftBank => codepoints_unicode + Encoding::CP51932 => codepoints_eucjp, + Encoding::EUC_JP => codepoints_eucjp, + Encoding::EUCJP_MS => codepoints_eucjp, + Encoding::EUC_JIS_2004 => codepoints_eucjp, + Encoding::UTF_8 => codepoints_unicode, + Encoding::UTF8_MAC => codepoints_unicode, + Encoding::UTF8_DoCoMo => codepoints_unicode, + Encoding::UTF8_KDDI => codepoints_unicode, + Encoding::UTF8_SoftBank => codepoints_unicode, + Encoding::EMACS_MULE => codepoints_emacs_mule, + Encoding::STATELESS_ISO_2022_JP => codepoints_emacs_mule, + Encoding::STATELESS_ISO_2022_JP_KDDI => codepoints_emacs_mule, ) end |
