summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Kevin Newton <kddnewton@gmail.com> 2023-11-29 21:01:10 -0500
committer Kevin Newton <kddnewton@gmail.com> 2023-11-30 21:37:56 -0500
commit a9162a44c59d85a56930e78cf1801558984db4a7 (patch)
tree c7b1521dc74122b91d66d4b816cb17956ade73c7
parent 10d3897e13f87569d6a682336141330371a902e3 (diff)
[ruby/prism] Emacs MULE encodings
https://github.com/ruby/prism/commit/4c06b6c42e
-rw-r--r-- prism/enc/pm_big5.c 102
-rw-r--r-- prism/enc/pm_encoding.h 3
-rw-r--r-- prism/prism.c 3
-rw-r--r-- test/prism/encoding_test.rb 36
4 files changed, 116 insertions, 28 deletions
diff --git a/prism/enc/pm_big5.c b/prism/enc/pm_big5.c
index a97574a173..948cfc4b11 100644
--- a/prism/enc/pm_big5.c
+++ b/prism/enc/pm_big5.c
@@ -21,29 +21,17 @@ pm_encoding_big5_char_width(const uint8_t *b, ptrdiff_t n) {
static size_t
pm_encoding_big5_alpha_char(const uint8_t *b, ptrdiff_t n) {
- if (pm_encoding_big5_char_width(b, n) == 1) {
- return pm_encoding_ascii_alpha_char(b, n);
- } else {
- return 0;
- }
+ return (pm_encoding_big5_char_width(b, n) == 1) ? pm_encoding_ascii_alpha_char(b, n) : 0;
}
static size_t
pm_encoding_big5_alnum_char(const uint8_t *b, ptrdiff_t n) {
- if (pm_encoding_big5_char_width(b, n) == 1) {
- return pm_encoding_ascii_alnum_char(b, n);
- } else {
- return 0;
- }
+ return (pm_encoding_big5_char_width(b, n) == 1) ? pm_encoding_ascii_alnum_char(b, n) : 0;
}
static bool
pm_encoding_big5_isupper_char(const uint8_t *b, ptrdiff_t n) {
- if (pm_encoding_big5_char_width(b, n) == 1) {
- return pm_encoding_ascii_isupper_char(b, n);
- } else {
- return false;
- }
+ return (pm_encoding_big5_char_width(b, n) == 1) && pm_encoding_ascii_isupper_char(b, n);
}
/** Big5 encoding */
@@ -95,3 +83,87 @@ pm_encoding_t pm_encoding_big5_uao = {
.isupper_char = pm_encoding_big5_isupper_char,
.multibyte = true
};
+
+static size_t
+pm_encoding_emacs_mule_char_width(const uint8_t *b, ptrdiff_t n) { // Returns the byte width (1-4) of the character starting at b, or 0 when none of the patterns below match.
+ // 1 byte: lead byte below 0x80. b[0] is read without consulting n; callers presumably guarantee n >= 1 (TODO confirm).
+ if (*b < 0x80) {
+ return 1;
+ }
+
+ // 2 bytes: lead byte 0x81-0x8F followed by a byte >= 0xA0; only matched when n > 1.
+ if ((n > 1) && (b[0] >= 0x81 && b[0] <= 0x8F) && (b[1] >= 0xA0)) {
+ return 2;
+ }
+
+ // 3 bytes: lead 0x90-0x99 with second byte >= 0xA0, or lead 0x9A/0x9B with second byte 0xE0-0xEF; third byte >= 0xA0; only matched when n > 2.
+ if (
+ (n > 2) &&
+ (
+ ((b[0] >= 0x90 && b[0] <= 0x99) && (b[1] >= 0xA0)) ||
+ ((b[0] == 0x9A || b[0] == 0x9B) && (b[1] >= 0xE0 && b[1] <= 0xEF))
+ ) &&
+ (b[2] >= 0xA0)
+ ) {
+ return 3;
+ }
+
+ // 4 bytes: lead 0x9C with second byte 0xF0-0xF4, or lead 0x9D with second byte 0xF5-0xFE; third and fourth bytes >= 0xA0; only matched when n > 3.
+ if (
+ (n > 3) &&
+ (
+ ((b[0] == 0x9C) && (b[1] >= 0xF0) && (b[1] <= 0xF4)) ||
+ ((b[0] == 0x9D) && (b[1] >= 0xF5) && (b[1] <= 0xFE))
+ ) &&
+ (b[2] >= 0xA0) && (b[3] >= 0xA0)
+ ) {
+ return 4;
+ }
+
+ return 0; // No pattern matched, including a multi-byte lead byte with too few bytes available in n.
+}
+
+static size_t
+pm_encoding_emacs_mule_alpha_char(const uint8_t *b, ptrdiff_t n) {
+ return (pm_encoding_emacs_mule_char_width(b, n) == 1) ? pm_encoding_ascii_alpha_char(b, n) : 0; // Width-1 characters defer to the ASCII alpha check; every other width (including 0) yields 0.
+}
+
+static size_t
+pm_encoding_emacs_mule_alnum_char(const uint8_t *b, ptrdiff_t n) {
+ return (pm_encoding_emacs_mule_char_width(b, n) == 1) ? pm_encoding_ascii_alnum_char(b, n) : 0; // Width-1 characters defer to the ASCII alnum check; every other width (including 0) yields 0.
+}
+
+static bool
+pm_encoding_emacs_mule_isupper_char(const uint8_t *b, ptrdiff_t n) {
+ return (pm_encoding_emacs_mule_char_width(b, n) == 1) && pm_encoding_ascii_isupper_char(b, n); // True only for width-1 characters that pass the ASCII uppercase check; the ASCII check is skipped for other widths.
+}
+
+/** Emacs-Mule encoding: named "Emacs-Mule", flagged multibyte, and wired to the pm_encoding_emacs_mule_* callbacks. */
+pm_encoding_t pm_encoding_emacs_mule = {
+ .name = "Emacs-Mule",
+ .char_width = pm_encoding_emacs_mule_char_width,
+ .alnum_char = pm_encoding_emacs_mule_alnum_char,
+ .alpha_char = pm_encoding_emacs_mule_alpha_char,
+ .isupper_char = pm_encoding_emacs_mule_isupper_char,
+ .multibyte = true
+};
+
+/** stateless-ISO-2022-JP encoding: reuses the Emacs-Mule char_width/alnum/alpha/isupper callbacks; only the name differs. */
+pm_encoding_t pm_encoding_stateless_iso_2022_jp = {
+ .name = "stateless-ISO-2022-JP",
+ .char_width = pm_encoding_emacs_mule_char_width,
+ .alnum_char = pm_encoding_emacs_mule_alnum_char,
+ .alpha_char = pm_encoding_emacs_mule_alpha_char,
+ .isupper_char = pm_encoding_emacs_mule_isupper_char,
+ .multibyte = true
+};
+
+/** stateless-ISO-2022-JP-KDDI encoding: reuses the Emacs-Mule char_width/alnum/alpha/isupper callbacks; only the name differs. */
+pm_encoding_t pm_encoding_stateless_iso_2022_jp_kddi = {
+ .name = "stateless-ISO-2022-JP-KDDI",
+ .char_width = pm_encoding_emacs_mule_char_width,
+ .alnum_char = pm_encoding_emacs_mule_alnum_char,
+ .alpha_char = pm_encoding_emacs_mule_alpha_char,
+ .isupper_char = pm_encoding_emacs_mule_isupper_char,
+ .multibyte = true
+};
diff --git a/prism/enc/pm_encoding.h b/prism/enc/pm_encoding.h
index 937170d7d0..e14d4f6f2c 100644
--- a/prism/enc/pm_encoding.h
+++ b/prism/enc/pm_encoding.h
@@ -167,6 +167,7 @@ extern pm_encoding_t pm_encoding_cp855;
extern pm_encoding_t pm_encoding_cp949;
extern pm_encoding_t pm_encoding_cp950;
extern pm_encoding_t pm_encoding_cp951;
+extern pm_encoding_t pm_encoding_emacs_mule;
extern pm_encoding_t pm_encoding_euc_jp;
extern pm_encoding_t pm_encoding_euc_jp_ms;
extern pm_encoding_t pm_encoding_euc_jis_2004;
@@ -222,6 +223,8 @@ extern pm_encoding_t pm_encoding_shift_jis;
extern pm_encoding_t pm_encoding_sjis_docomo;
extern pm_encoding_t pm_encoding_sjis_kddi;
extern pm_encoding_t pm_encoding_sjis_softbank;
+extern pm_encoding_t pm_encoding_stateless_iso_2022_jp;
+extern pm_encoding_t pm_encoding_stateless_iso_2022_jp_kddi;
extern pm_encoding_t pm_encoding_tis_620;
extern pm_encoding_t pm_encoding_utf_8;
extern pm_encoding_t pm_encoding_utf8_mac;
diff --git a/prism/prism.c b/prism/prism.c
index 379d8b0ea9..e32b444619 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -6253,6 +6253,7 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
ENCODING2("EUC-JIS-2004", "EUC-JISX0213", pm_encoding_euc_jis_2004);
ENCODING2("EUC-KR", "eucKR", pm_encoding_euc_kr);
ENCODING2("EUC-CN", "eucCN", pm_encoding_gb2312);
+ ENCODING1("Emacs-Mule", pm_encoding_emacs_mule);
ENCODING1("external", pm_encoding_utf_8);
break;
case 'F': case 'f':
@@ -6327,6 +6328,8 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
ENCODING1("SJIS-DoCoMo", pm_encoding_sjis_docomo);
ENCODING1("SJIS-KDDI", pm_encoding_sjis_kddi);
ENCODING1("SJIS-SoftBank", pm_encoding_sjis_softbank);
+ ENCODING1("stateless-ISO-2022-JP", pm_encoding_stateless_iso_2022_jp);
+ ENCODING1("stateless-ISO-2022-JP-KDDI", pm_encoding_stateless_iso_2022_jp_kddi);
break;
case 'T': case 't':
ENCODING1("TIS-620", pm_encoding_tis_620);
diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb
index 67652625cb..aa114b669b 100644
--- a/test/prism/encoding_test.rb
+++ b/test/prism/encoding_test.rb
@@ -6,8 +6,8 @@ require_relative "test_helper"
module Prism
class EncodingTest < TestCase
- codepoints_1byte = 0x00...0x100
- codepoints_2bytes = 0x00...0x10000
+ codepoints_1byte = 0...0x100
+ codepoints_2bytes = 0...0x10000
encodings = {
Encoding::ASCII => codepoints_1byte,
@@ -91,19 +91,29 @@ module Prism
# are 3 and 4 byte representations so it can drastically slow down the test
# suite.
if ENV["PRISM_TEST_ALL_ENCODINGS"]
- codepoints_eucjp = [*(0x00...0x10000), *(0x00...0x10000).map { |bytes| bytes | 0x8F0000 }]
- codepoints_unicode = 0x00...0x110000
+ codepoints_eucjp = [*(0...0x10000), *(0...0x10000).map { |bytes| bytes | 0x8F0000 }]
+ codepoints_unicode = 0...0x110000
+ codepoints_emacs_mule = [
+ *(0...0x80),
+ *((0x81...0x90).flat_map { |byte1| (0x90...0x100).map { |byte2| byte1 << 8 | byte2 } }),
+ *((0x90...0x9C).flat_map { |byte1| (0xA0...0x100).flat_map { |byte2| (0xA0...0x100).flat_map { |byte3| byte1 << 16 | byte2 << 8 | byte3 } } }),
+ *((0xF0...0xF5).flat_map { |byte2| (0xA0...0x100).flat_map { |byte3| (0xA0...0x100).flat_map { |byte4| 0x9C << 24 | byte3 << 16 | byte3 << 8 | byte4 } } }),
+ ]
+ encodings.clear
encodings.merge!(
- Encoding::CP51932 => codepoints_eucjp,
- Encoding::EUC_JP => codepoints_eucjp,
- Encoding::EUCJP_MS => codepoints_eucjp,
- Encoding::EUC_JIS_2004 => codepoints_eucjp,
- Encoding::UTF_8 => codepoints_unicode,
- Encoding::UTF8_MAC => codepoints_unicode,
- Encoding::UTF8_DoCoMo => codepoints_unicode,
- Encoding::UTF8_KDDI => codepoints_unicode,
- Encoding::UTF8_SoftBank => codepoints_unicode
+ Encoding::CP51932 => codepoints_eucjp,
+ Encoding::EUC_JP => codepoints_eucjp,
+ Encoding::EUCJP_MS => codepoints_eucjp,
+ Encoding::EUC_JIS_2004 => codepoints_eucjp,
+ Encoding::UTF_8 => codepoints_unicode,
+ Encoding::UTF8_MAC => codepoints_unicode,
+ Encoding::UTF8_DoCoMo => codepoints_unicode,
+ Encoding::UTF8_KDDI => codepoints_unicode,
+ Encoding::UTF8_SoftBank => codepoints_unicode,
+ Encoding::EMACS_MULE => codepoints_emacs_mule,
+ Encoding::STATELESS_ISO_2022_JP => codepoints_emacs_mule,
+ Encoding::STATELESS_ISO_2022_JP_KDDI => codepoints_emacs_mule,
)
end