summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Matt Boldt <me@mattboldt.com> 2023-11-28 09:19:25 -0600
committer: Kevin Newton <kddnewton@gmail.com> 2023-11-29 12:08:15 -0500
commit: 9fc40d2b26fad25960eff6dd4d35d6592f6faaef (patch)
tree: 5aac44364f0d18ed99744e5ff75cd8a276d47f20
parent: 86d9a6dcb61b47bcacfe98200cb6d47da6bb1134 (diff)
[ruby/prism] Add MacJapanese encoding
MacJapanese (also aliased as MacJapan) is a modified Shift_JIS encoding, but is implemented identically in Ruby. Upstream commit: https://github.com/ruby/prism/commit/9e0a097699
-rw-r--r-- lib/prism/prism.gemspec 1
-rw-r--r-- prism/enc/pm_encoding.h 1
-rw-r--r-- prism/enc/pm_mac_japanese.c 57
-rw-r--r-- prism/prism.c 2
-rw-r--r-- test/prism/encoding_test.rb 1
5 files changed, 62 insertions, 0 deletions
diff --git a/lib/prism/prism.gemspec b/lib/prism/prism.gemspec
index 80cfa8ab76..381ecfea5b 100644
--- a/lib/prism/prism.gemspec
+++ b/lib/prism/prism.gemspec
@@ -91,6 +91,7 @@ Gem::Specification.new do |spec|
"src/enc/pm_cp950.c",
"src/enc/pm_euc_jp.c",
"src/enc/pm_gbk.c",
+ "src/enc/pm_mac_japanese.c",
"src/enc/pm_shift_jis.c",
"src/enc/pm_tables.c",
"src/enc/pm_unicode.c",
diff --git a/prism/enc/pm_encoding.h b/prism/enc/pm_encoding.h
index 5b79902389..797029365c 100644
--- a/prism/enc/pm_encoding.h
+++ b/prism/enc/pm_encoding.h
@@ -206,6 +206,7 @@ extern pm_encoding_t pm_encoding_mac_croatian;
extern pm_encoding_t pm_encoding_mac_cyrillic;
extern pm_encoding_t pm_encoding_mac_greek;
extern pm_encoding_t pm_encoding_mac_iceland;
+extern pm_encoding_t pm_encoding_mac_japanese;
extern pm_encoding_t pm_encoding_mac_roman;
extern pm_encoding_t pm_encoding_mac_romania;
extern pm_encoding_t pm_encoding_mac_thai;
diff --git a/prism/enc/pm_mac_japanese.c b/prism/enc/pm_mac_japanese.c
new file mode 100644
index 0000000000..a5185f0e55
--- /dev/null
+++ b/prism/enc/pm_mac_japanese.c
@@ -0,0 +1,57 @@
+#include "prism/enc/pm_encoding.h"
+
+static size_t
+pm_encoding_mac_japanese_char_width(const uint8_t *b, ptrdiff_t n) {
+ // These are the single byte characters.
+ if (*b < 0x80 || (*b >= 0xA1 && *b <= 0xDF)) {
+ return 1;
+ }
+
+ // These are the double byte characters.
+ if (
+ (n > 1) &&
+ ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) &&
+ (b[1] >= 0x40 && b[1] <= 0xFC)
+ ) {
+ return 2;
+ }
+
+ return 0;
+}
+
+static size_t
+pm_encoding_mac_japanese_alpha_char(const uint8_t *b, ptrdiff_t n) {
+ if (pm_encoding_mac_japanese_char_width(b, n) == 1) {
+ return pm_encoding_ascii_alpha_char(b, n);
+ } else {
+ return 0;
+ }
+}
+
+static size_t
+pm_encoding_mac_japanese_alnum_char(const uint8_t *b, ptrdiff_t n) {
+ if (pm_encoding_mac_japanese_char_width(b, n) == 1) {
+ return pm_encoding_ascii_alnum_char(b, n);
+ } else {
+ return 0;
+ }
+}
+
+static bool
+pm_encoding_mac_japanese_isupper_char(const uint8_t *b, ptrdiff_t n) {
+ if (pm_encoding_mac_japanese_char_width(b, n) == 1) {
+ return pm_encoding_ascii_isupper_char(b, n);
+ } else {
+ return 0;
+ }
+}
+
+/** MacJapanese encoding */
+pm_encoding_t pm_encoding_mac_japanese = {
+ .name = "MacJapanese",
+ .char_width = pm_encoding_mac_japanese_char_width,
+ .alnum_char = pm_encoding_mac_japanese_alnum_char,
+ .alpha_char = pm_encoding_mac_japanese_alpha_char,
+ .isupper_char = pm_encoding_mac_japanese_isupper_char,
+ .multibyte = true
+};
diff --git a/prism/prism.c b/prism/prism.c
index 960b652db8..154d8ea6b2 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -6303,6 +6303,8 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
ENCODING1("macCyrillic", pm_encoding_mac_cyrillic);
ENCODING1("macGreek", pm_encoding_mac_greek);
ENCODING1("macIceland", pm_encoding_mac_iceland);
+ ENCODING1("MacJapanese", pm_encoding_mac_japanese);
+ ENCODING1("MacJapan", pm_encoding_mac_japanese);
ENCODING1("macRoman", pm_encoding_mac_roman);
ENCODING1("macRomania", pm_encoding_mac_romania);
ENCODING1("macThai", pm_encoding_mac_thai);
diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb
index 28992fcf1b..76162bec1e 100644
--- a/test/prism/encoding_test.rb
+++ b/test/prism/encoding_test.rb
@@ -72,6 +72,7 @@ module Prism
Encoding::CP950 => 0x00...0x10000,
Encoding::CP51932 => 0x00...0x10000,
Encoding::GBK => 0x00...0x10000,
+ Encoding::MACJAPANESE => 0x00...0x10000,
Encoding::Shift_JIS => 0x00...0x10000,
Encoding::Windows_31J => 0x00...0x10000
}