diff options
| author | Kevin Newton <kddnewton@gmail.com> | 2023-11-29 14:10:20 -0500 |
|---|---|---|
| committer | Kevin Newton <kddnewton@gmail.com> | 2023-11-30 21:37:56 -0500 |
| commit | 700e172a501d96ed3efe627b6a91b7890b32d2f4 (patch) | |
| tree | 8ecdfe06f40fae764810a97e909f4858e132454e | |
| parent | ddaa0730588a2c2947f6e59e91ca88b37e97f42a (diff) | |
[ruby/prism] EUC-KR encodings
https://github.com/ruby/prism/commit/ba5218385a
| -rw-r--r-- | prism/enc/pm_encoding.h | 3 | ||||
| -rw-r--r-- | prism/enc/pm_euc_jp.c | 78 | ||||
| -rw-r--r-- | prism/prism.c | 6 | ||||
| -rw-r--r-- | test/prism/encoding_test.rb | 3 |
4 files changed, 89 insertions, 1 deletions
diff --git a/prism/enc/pm_encoding.h b/prism/enc/pm_encoding.h
index e81ecad25b..3bea57fbf9 100644
--- a/prism/enc/pm_encoding.h
+++ b/prism/enc/pm_encoding.h
@@ -169,7 +169,10 @@ extern pm_encoding_t pm_encoding_cp950;
 extern pm_encoding_t pm_encoding_euc_jp;
 extern pm_encoding_t pm_encoding_euc_jp_ms;
 extern pm_encoding_t pm_encoding_euc_jis_2004;
+extern pm_encoding_t pm_encoding_euc_kr;
+extern pm_encoding_t pm_encoding_gb12345;
 extern pm_encoding_t pm_encoding_gb1988;
+extern pm_encoding_t pm_encoding_gb2312;
 extern pm_encoding_t pm_encoding_gbk;
 extern pm_encoding_t pm_encoding_ibm437;
 extern pm_encoding_t pm_encoding_ibm720;
diff --git a/prism/enc/pm_euc_jp.c b/prism/enc/pm_euc_jp.c
index 9bee6a1292..a33ba4f0a5 100644
--- a/prism/enc/pm_euc_jp.c
+++ b/prism/enc/pm_euc_jp.c
@@ -97,3 +97,81 @@ pm_encoding_t pm_encoding_cp51932 = {
     .isupper_char = pm_encoding_euc_jp_isupper_char,
     .multibyte = true
 };
+
+static size_t
+pm_encoding_euc_kr_char_width(const uint8_t *b, ptrdiff_t n) {
+    // These are the single byte characters.
+    if (*b < 0x80) {
+        return 1;
+    }
+
+    // These are the double byte characters.
+    if (
+        (n > 1) &&
+        (
+            (b[0] >= 0xA1 && b[0] <= 0xFE) &&
+            (b[1] >= 0xA1 && b[1] <= 0xFE)
+        )
+    ) {
+        return 2;
+    }
+
+    return 0;
+}
+
+static size_t
+pm_encoding_euc_kr_alpha_char(const uint8_t *b, ptrdiff_t n) {
+    if (pm_encoding_euc_kr_char_width(b, n) == 1) {
+        return pm_encoding_ascii_alpha_char(b, n);
+    } else {
+        return 0;
+    }
+}
+
+static size_t
+pm_encoding_euc_kr_alnum_char(const uint8_t *b, ptrdiff_t n) {
+    if (pm_encoding_euc_kr_char_width(b, n) == 1) {
+        return pm_encoding_ascii_alnum_char(b, n);
+    } else {
+        return 0;
+    }
+}
+
+static bool
+pm_encoding_euc_kr_isupper_char(const uint8_t *b, ptrdiff_t n) {
+    if (pm_encoding_euc_kr_char_width(b, n) == 1) {
+        return pm_encoding_ascii_isupper_char(b, n);
+    } else {
+        return 0;
+    }
+}
+
+/** EUC-KR encoding */
+pm_encoding_t pm_encoding_euc_kr = {
+    .name = "EUC-KR",
+    .char_width = pm_encoding_euc_kr_char_width,
+    .alnum_char = pm_encoding_euc_kr_alnum_char,
+    .alpha_char = pm_encoding_euc_kr_alpha_char,
+    .isupper_char = pm_encoding_euc_kr_isupper_char,
+    .multibyte = true
+};
+
+/** GB2312 encoding */
+pm_encoding_t pm_encoding_gb2312 = {
+    .name = "GB2312",
+    .char_width = pm_encoding_euc_kr_char_width,
+    .alnum_char = pm_encoding_euc_kr_alnum_char,
+    .alpha_char = pm_encoding_euc_kr_alpha_char,
+    .isupper_char = pm_encoding_euc_kr_isupper_char,
+    .multibyte = true
+};
+
+/** GB12345 encoding */
+pm_encoding_t pm_encoding_gb12345 = {
+    .name = "GB12345",
+    .char_width = pm_encoding_euc_kr_char_width,
+    .alnum_char = pm_encoding_euc_kr_alnum_char,
+    .alpha_char = pm_encoding_euc_kr_alpha_char,
+    .isupper_char = pm_encoding_euc_kr_isupper_char,
+    .multibyte = true
+};
diff --git a/prism/prism.c b/prism/prism.c
index 4679ebb4ed..e18244ddf4 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -6250,14 +6250,18 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
                 ENCODING2("EUC-JP", "eucJP", pm_encoding_euc_jp);
                 ENCODING2("eucJP-ms", "euc-jp-ms", pm_encoding_euc_jp_ms);
                 ENCODING2("EUC-JIS-2004", "EUC-JISX0213", pm_encoding_euc_jis_2004);
+                ENCODING2("EUC-KR", "eucKR", pm_encoding_euc_kr);
+                ENCODING2("EUC-CN", "eucCN", pm_encoding_gb2312);
                 ENCODING1("external", pm_encoding_utf_8);
                 break;
             case 'F': case 'f':
                 ENCODING1("filesystem", pm_encoding_utf_8);
                 break;
             case 'G': case 'g':
-                ENCODING1("GB1988", pm_encoding_gb1988);
                 ENCODING1("GBK", pm_encoding_gbk);
+                ENCODING1("GB12345", pm_encoding_gb12345);
+                ENCODING1("GB1988", pm_encoding_gb1988);
+                ENCODING1("GB2312", pm_encoding_gb2312);
                 break;
             case 'I': case 'i':
                 ENCODING1("IBM437", pm_encoding_ibm437);
diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb
index b206ab20e5..eddb0b294d 100644
--- a/test/prism/encoding_test.rb
+++ b/test/prism/encoding_test.rb
@@ -74,7 +74,10 @@ module Prism
       Encoding::Big5_UAO => codepoints_2bytes,
       Encoding::CP949 => codepoints_2bytes,
       Encoding::CP950 => codepoints_2bytes,
+      Encoding::EUC_KR => codepoints_2bytes,
       Encoding::GBK => codepoints_2bytes,
+      Encoding::GB12345 => codepoints_2bytes,
+      Encoding::GB2312 => codepoints_2bytes,
       Encoding::MACJAPANESE => codepoints_2bytes,
       Encoding::Shift_JIS => codepoints_2bytes,
       Encoding::SJIS_DoCoMo => codepoints_2bytes,
