summary refs log tree commit diff
diff options
context:
space:
mode:
author Kevin Newton <kddnewton@gmail.com> 2023-11-29 14:10:20 -0500
committer Kevin Newton <kddnewton@gmail.com> 2023-11-30 21:37:56 -0500
commit 700e172a501d96ed3efe627b6a91b7890b32d2f4 (patch)
tree 8ecdfe06f40fae764810a97e909f4858e132454e
parent ddaa0730588a2c2947f6e59e91ca88b37e97f42a (diff)
[ruby/prism] EUC-KR encodings
https://github.com/ruby/prism/commit/ba5218385a
-rw-r--r-- prism/enc/pm_encoding.h 3
-rw-r--r-- prism/enc/pm_euc_jp.c 78
-rw-r--r-- prism/prism.c 6
-rw-r--r-- test/prism/encoding_test.rb 3
4 files changed, 89 insertions, 1 deletions
diff --git a/prism/enc/pm_encoding.h b/prism/enc/pm_encoding.h
index e81ecad25b..3bea57fbf9 100644
--- a/prism/enc/pm_encoding.h
+++ b/prism/enc/pm_encoding.h
@@ -169,7 +169,10 @@ extern pm_encoding_t pm_encoding_cp950;
extern pm_encoding_t pm_encoding_euc_jp;
extern pm_encoding_t pm_encoding_euc_jp_ms;
extern pm_encoding_t pm_encoding_euc_jis_2004;
+extern pm_encoding_t pm_encoding_euc_kr;
+extern pm_encoding_t pm_encoding_gb12345;
extern pm_encoding_t pm_encoding_gb1988;
+extern pm_encoding_t pm_encoding_gb2312;
extern pm_encoding_t pm_encoding_gbk;
extern pm_encoding_t pm_encoding_ibm437;
extern pm_encoding_t pm_encoding_ibm720;
diff --git a/prism/enc/pm_euc_jp.c b/prism/enc/pm_euc_jp.c
index 9bee6a1292..a33ba4f0a5 100644
--- a/prism/enc/pm_euc_jp.c
+++ b/prism/enc/pm_euc_jp.c
@@ -97,3 +97,81 @@ pm_encoding_t pm_encoding_cp51932 = {
.isupper_char = pm_encoding_euc_jp_isupper_char,
.multibyte = true
};
+
+static size_t
+pm_encoding_euc_kr_char_width(const uint8_t *b, ptrdiff_t n) {
+ // Bytes below 0x80 are single-byte characters, so the width is 1.
+ if (*b < 0x80) { // NOTE(review): b[0] is read before n is consulted; presumably callers pass n >= 1 - confirm
+ return 1;
+ }
+
+ // Double-byte characters: at least two bytes must be available (n > 1) and both the lead and trail byte must lie in 0xA1-0xFE.
+ if (
+ (n > 1) &&
+ (
+ (b[0] >= 0xA1 && b[0] <= 0xFE) &&
+ (b[1] >= 0xA1 && b[1] <= 0xFE)
+ )
+ ) {
+ return 2;
+ }
+
+ return 0; // neither the single-byte nor the double-byte form matched
+}
+
+static size_t
+pm_encoding_euc_kr_alpha_char(const uint8_t *b, ptrdiff_t n) {
+ if (pm_encoding_euc_kr_char_width(b, n) == 1) { // only single-byte (b[0] < 0x80) characters are classified
+ return pm_encoding_ascii_alpha_char(b, n); // defer to the ASCII alphabetic check and pass its result through
+ } else {
+ return 0; // double-byte or unrecognized sequences are never reported as alphabetic
+ }
+}
+
+static size_t
+pm_encoding_euc_kr_alnum_char(const uint8_t *b, ptrdiff_t n) {
+ if (pm_encoding_euc_kr_char_width(b, n) == 1) { // only single-byte (b[0] < 0x80) characters are classified
+ return pm_encoding_ascii_alnum_char(b, n); // defer to the ASCII alphanumeric check and pass its result through
+ } else {
+ return 0; // double-byte or unrecognized sequences are never reported as alphanumeric
+ }
+}
+
+static bool
+pm_encoding_euc_kr_isupper_char(const uint8_t *b, ptrdiff_t n) {
+ if (pm_encoding_euc_kr_char_width(b, n) == 1) { // only single-byte (b[0] < 0x80) characters are classified
+ return pm_encoding_ascii_isupper_char(b, n); // defer to the ASCII uppercase check
+ } else {
+ return 0; // i.e. false: double-byte or unrecognized sequences are never reported as uppercase
+ }
+}
+
+/** EUC-KR encoding: bytes below 0x80 are one-byte characters, 0xA1-0xFE pairs are two-byte characters; character classes are ASCII-only. */
+pm_encoding_t pm_encoding_euc_kr = {
+ .name = "EUC-KR",
+ .char_width = pm_encoding_euc_kr_char_width,
+ .alnum_char = pm_encoding_euc_kr_alnum_char,
+ .alpha_char = pm_encoding_euc_kr_alpha_char,
+ .isupper_char = pm_encoding_euc_kr_isupper_char,
+ .multibyte = true
+};
+
+/** GB2312 encoding. Reuses the EUC-KR width and character-class callbacks; only the name differs. */
+pm_encoding_t pm_encoding_gb2312 = {
+ .name = "GB2312",
+ .char_width = pm_encoding_euc_kr_char_width,
+ .alnum_char = pm_encoding_euc_kr_alnum_char,
+ .alpha_char = pm_encoding_euc_kr_alpha_char,
+ .isupper_char = pm_encoding_euc_kr_isupper_char,
+ .multibyte = true
+};
+
+/** GB12345 encoding. Reuses the EUC-KR width and character-class callbacks; only the name differs. */
+pm_encoding_t pm_encoding_gb12345 = {
+ .name = "GB12345",
+ .char_width = pm_encoding_euc_kr_char_width,
+ .alnum_char = pm_encoding_euc_kr_alnum_char,
+ .alpha_char = pm_encoding_euc_kr_alpha_char,
+ .isupper_char = pm_encoding_euc_kr_isupper_char,
+ .multibyte = true
+};
diff --git a/prism/prism.c b/prism/prism.c
index 4679ebb4ed..e18244ddf4 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -6250,14 +6250,18 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
ENCODING2("EUC-JP", "eucJP", pm_encoding_euc_jp);
ENCODING2("eucJP-ms", "euc-jp-ms", pm_encoding_euc_jp_ms);
ENCODING2("EUC-JIS-2004", "EUC-JISX0213", pm_encoding_euc_jis_2004);
+ ENCODING2("EUC-KR", "eucKR", pm_encoding_euc_kr);
+ ENCODING2("EUC-CN", "eucCN", pm_encoding_gb2312);
ENCODING1("external", pm_encoding_utf_8);
break;
case 'F': case 'f':
ENCODING1("filesystem", pm_encoding_utf_8);
break;
case 'G': case 'g':
- ENCODING1("GB1988", pm_encoding_gb1988);
ENCODING1("GBK", pm_encoding_gbk);
+ ENCODING1("GB12345", pm_encoding_gb12345);
+ ENCODING1("GB1988", pm_encoding_gb1988);
+ ENCODING1("GB2312", pm_encoding_gb2312);
break;
case 'I': case 'i':
ENCODING1("IBM437", pm_encoding_ibm437);
diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb
index b206ab20e5..eddb0b294d 100644
--- a/test/prism/encoding_test.rb
+++ b/test/prism/encoding_test.rb
@@ -74,7 +74,10 @@ module Prism
Encoding::Big5_UAO => codepoints_2bytes,
Encoding::CP949 => codepoints_2bytes,
Encoding::CP950 => codepoints_2bytes,
+ Encoding::EUC_KR => codepoints_2bytes,
Encoding::GBK => codepoints_2bytes,
+ Encoding::GB12345 => codepoints_2bytes,
+ Encoding::GB2312 => codepoints_2bytes,
Encoding::MACJAPANESE => codepoints_2bytes,
Encoding::Shift_JIS => codepoints_2bytes,
Encoding::SJIS_DoCoMo => codepoints_2bytes,