diff options
| author | Kevin Newton <kddnewton@gmail.com> | 2023-11-30 09:21:48 -0500 |
|---|---|---|
| committer | Kevin Newton <kddnewton@gmail.com> | 2023-11-30 21:37:56 -0500 |
| commit | ca26e0e34bcf41b5ce60f15dc5db5bae8bb36305 (patch) | |
| tree | 5e72681c2e8649c06aa7cf10e875fa898c5c5a2e /prism | |
| parent | dde0abb29ea119e5ae5ce7a01b9d007ce5403a37 (diff) | |
[ruby/prism] EUC-TW encoding
https://github.com/ruby/prism/commit/edfb54f039
Diffstat (limited to 'prism')
| -rw-r--r-- | prism/enc/pm_encoding.h | 1 |
| -rw-r--r-- | prism/enc/pm_euc_jp.c | 172 |
| -rw-r--r-- | prism/prism.c | 1 |
3 files changed, 75 insertions, 99 deletions
diff --git a/prism/enc/pm_encoding.h b/prism/enc/pm_encoding.h index 49dace45c7..59d31e26b4 100644 --- a/prism/enc/pm_encoding.h +++ b/prism/enc/pm_encoding.h @@ -172,6 +172,7 @@ extern pm_encoding_t pm_encoding_euc_jp; extern pm_encoding_t pm_encoding_euc_jp_ms; extern pm_encoding_t pm_encoding_euc_jis_2004; extern pm_encoding_t pm_encoding_euc_kr; +extern pm_encoding_t pm_encoding_euc_tw; extern pm_encoding_t pm_encoding_gb12345; extern pm_encoding_t pm_encoding_gb18030; extern pm_encoding_t pm_encoding_gb1988; diff --git a/prism/enc/pm_euc_jp.c b/prism/enc/pm_euc_jp.c index a33ba4f0a5..ec3d84aa8e 100644 --- a/prism/enc/pm_euc_jp.c +++ b/prism/enc/pm_euc_jp.c @@ -1,6 +1,21 @@ #include "prism/enc/pm_encoding.h" static size_t +pm_encoding_ascii_alpha_char_lt_0x80(const uint8_t *b, ptrdiff_t n) { + return (*b < 0x80) ? pm_encoding_ascii_alpha_char(b, n) : 0; +} + +static size_t +pm_encoding_ascii_alnum_char_lt_0x80(const uint8_t *b, ptrdiff_t n) { + return (*b < 0x80) ? pm_encoding_ascii_alnum_char(b, n) : 0; +} + +static bool +pm_encoding_ascii_isupper_char_lt_0x80(const uint8_t *b, ptrdiff_t n) { + return (*b < 0x80) && pm_encoding_ascii_isupper_char(b, n); +} + +static size_t pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) { // These are the single byte characters. if (*b < 0x80) { @@ -8,23 +23,12 @@ pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) { } // These are the double byte characters. - if ( - (n > 1) && - ( - ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) && - (b[1] >= 0xA1 && b[1] <= 0xFE) - ) - ) { + if ((n > 1) && ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) && (b[1] >= 0xA1 && b[1] <= 0xFE)) { return 2; } // These are the triple byte characters. 
- if ( - (n > 2) && - (b[0] == 0x8F) && - (b[1] >= 0xA1 && b[2] <= 0xFE) && - (b[2] >= 0xA1 && b[2] <= 0xFE) - ) { + if ((n > 2) && (b[0] == 0x8F) && (b[1] >= 0xA1 && b[2] <= 0xFE) && (b[2] >= 0xA1 && b[2] <= 0xFE)) { return 3; } @@ -32,39 +36,47 @@ pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) { } static size_t -pm_encoding_euc_jp_alpha_char(const uint8_t *b, ptrdiff_t n) { - if (pm_encoding_euc_jp_char_width(b, n) == 1) { - return pm_encoding_ascii_alpha_char(b, n); - } else { - return 0; +pm_encoding_euc_kr_char_width(const uint8_t *b, ptrdiff_t n) { + // These are the single byte characters. + if (*b < 0x80) { + return 1; + } + + // These are the double byte characters. + if ((n > 1) && (b[0] >= 0xA1 && b[0] <= 0xFE) && (b[1] >= 0xA1 && b[1] <= 0xFE)) { + return 2; } + + return 0; } static size_t -pm_encoding_euc_jp_alnum_char(const uint8_t *b, ptrdiff_t n) { - if (pm_encoding_euc_jp_char_width(b, n) == 1) { - return pm_encoding_ascii_alnum_char(b, n); - } else { - return 0; +pm_encoding_euc_tw_char_width(const uint8_t *b, ptrdiff_t n) { + // These are the single byte characters. + if (*b < 0x80) { + return 1; } -} -static bool -pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) { - if (pm_encoding_euc_jp_char_width(b, n) == 1) { - return pm_encoding_ascii_isupper_char(b, n); - } else { - return 0; + // These are the double byte characters. + if ((n > 1) && (b[0] >= 0xA1) && (b[0] <= 0xFE) && (b[1] >= 0xA1) && (b[1] <= 0xFE)) { + return 2; } + + // These are the quadruple byte characters. 
+ if ((n > 3) && (b[0] == 0x8E) && (b[1] >= 0xA1) && (b[1] <= 0xB0) && (b[2] >= 0xA1) && (b[2] <= 0xFE) && (b[3] >= 0xA1) && (b[3] <= 0xFE)) { + return 4; + } + + return 0; } /** EUC-JP encoding */ pm_encoding_t pm_encoding_euc_jp = { .name = "EUC-JP", .char_width = pm_encoding_euc_jp_char_width, - .alnum_char = pm_encoding_euc_jp_alnum_char, - .alpha_char = pm_encoding_euc_jp_alpha_char, - .isupper_char = pm_encoding_euc_jp_isupper_char, + .alnum_char = pm_encoding_ascii_alnum_char_lt_0x80, + .alpha_char = pm_encoding_ascii_alpha_char_lt_0x80, + .isupper_char = pm_encoding_ascii_isupper_char_lt_0x80, .multibyte = true }; @@ -72,9 +84,9 @@ pm_encoding_t pm_encoding_euc_jp = { pm_encoding_t pm_encoding_euc_jp_ms = { .name = "eucJP-ms", .char_width = pm_encoding_euc_jp_char_width, - .alnum_char = pm_encoding_euc_jp_alnum_char, - .alpha_char = pm_encoding_euc_jp_alpha_char, - .isupper_char = pm_encoding_euc_jp_isupper_char, + .alnum_char = pm_encoding_ascii_alnum_char_lt_0x80, + .alpha_char = pm_encoding_ascii_alpha_char_lt_0x80, + .isupper_char = pm_encoding_ascii_isupper_char_lt_0x80, .multibyte = true }; @@ -82,9 +94,9 @@ pm_encoding_t pm_encoding_euc_jp_ms = { pm_encoding_t pm_encoding_euc_jis_2004 = { .name = "EUC-JIS-2004", .char_width = pm_encoding_euc_jp_char_width, - .alnum_char = pm_encoding_euc_jp_alnum_char, - .alpha_char = pm_encoding_euc_jp_alpha_char, - .isupper_char = pm_encoding_euc_jp_isupper_char, + .alnum_char = pm_encoding_ascii_alnum_char_lt_0x80, + .alpha_char = pm_encoding_ascii_alpha_char_lt_0x80, + .isupper_char = pm_encoding_ascii_isupper_char_lt_0x80, .multibyte = true }; @@ -92,67 +104,19 @@ pm_encoding_t pm_encoding_euc_jis_2004 = { pm_encoding_t pm_encoding_cp51932 = { .name = "CP51932", .char_width = pm_encoding_euc_jp_char_width, - .alnum_char = pm_encoding_euc_jp_alnum_char, - .alpha_char = pm_encoding_euc_jp_alpha_char, - .isupper_char = pm_encoding_euc_jp_isupper_char, + .alnum_char = pm_encoding_ascii_alnum_char_lt_0x80, + 
.alpha_char = pm_encoding_ascii_alpha_char_lt_0x80, + .isupper_char = pm_encoding_ascii_isupper_char_lt_0x80, .multibyte = true }; -static size_t -pm_encoding_euc_kr_char_width(const uint8_t *b, ptrdiff_t n) { - // These are the single byte characters. - if (*b < 0x80) { - return 1; - } - - // These are the double byte characters. - if ( - (n > 1) && - ( - (b[0] >= 0xA1 && b[0] <= 0xFE) && - (b[1] >= 0xA1 && b[1] <= 0xFE) - ) - ) { - return 2; - } - - return 0; -} - -static size_t -pm_encoding_euc_kr_alpha_char(const uint8_t *b, ptrdiff_t n) { - if (pm_encoding_euc_kr_char_width(b, n) == 1) { - return pm_encoding_ascii_alpha_char(b, n); - } else { - return 0; - } -} - -static size_t -pm_encoding_euc_kr_alnum_char(const uint8_t *b, ptrdiff_t n) { - if (pm_encoding_euc_kr_char_width(b, n) == 1) { - return pm_encoding_ascii_alnum_char(b, n); - } else { - return 0; - } -} - -static bool -pm_encoding_euc_kr_isupper_char(const uint8_t *b, ptrdiff_t n) { - if (pm_encoding_euc_kr_char_width(b, n) == 1) { - return pm_encoding_ascii_isupper_char(b, n); - } else { - return 0; - } -} - /** EUC-KR encoding */ pm_encoding_t pm_encoding_euc_kr = { .name = "EUC-KR", .char_width = pm_encoding_euc_kr_char_width, - .alnum_char = pm_encoding_euc_kr_alnum_char, - .alpha_char = pm_encoding_euc_kr_alpha_char, - .isupper_char = pm_encoding_euc_kr_isupper_char, + .alnum_char = pm_encoding_ascii_alnum_char_lt_0x80, + .alpha_char = pm_encoding_ascii_alpha_char_lt_0x80, + .isupper_char = pm_encoding_ascii_isupper_char_lt_0x80, .multibyte = true }; @@ -160,9 +124,9 @@ pm_encoding_t pm_encoding_euc_kr = { pm_encoding_t pm_encoding_gb2312 = { .name = "GB2312", .char_width = pm_encoding_euc_kr_char_width, - .alnum_char = pm_encoding_euc_kr_alnum_char, - .alpha_char = pm_encoding_euc_kr_alpha_char, - .isupper_char = pm_encoding_euc_kr_isupper_char, + .alnum_char = pm_encoding_ascii_alnum_char_lt_0x80, + .alpha_char = pm_encoding_ascii_alpha_char_lt_0x80, + .isupper_char = 
pm_encoding_ascii_isupper_char_lt_0x80, .multibyte = true }; @@ -170,8 +134,18 @@ pm_encoding_t pm_encoding_gb2312 = { pm_encoding_t pm_encoding_gb12345 = { .name = "GB12345", .char_width = pm_encoding_euc_kr_char_width, - .alnum_char = pm_encoding_euc_kr_alnum_char, - .alpha_char = pm_encoding_euc_kr_alpha_char, - .isupper_char = pm_encoding_euc_kr_isupper_char, + .alnum_char = pm_encoding_ascii_alnum_char_lt_0x80, + .alpha_char = pm_encoding_ascii_alpha_char_lt_0x80, + .isupper_char = pm_encoding_ascii_isupper_char_lt_0x80, + .multibyte = true +}; + +/** EUC-TW encoding */ +pm_encoding_t pm_encoding_euc_tw = { + .name = "EUC-TW", + .char_width = pm_encoding_euc_tw_char_width, + .alnum_char = pm_encoding_ascii_alnum_char_lt_0x80, + .alpha_char = pm_encoding_ascii_alpha_char_lt_0x80, + .isupper_char = pm_encoding_ascii_isupper_char_lt_0x80, .multibyte = true }; diff --git a/prism/prism.c b/prism/prism.c index 7ede34bea0..cdada34a73 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -6253,6 +6253,7 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star ENCODING2("EUC-JIS-2004", "EUC-JISX0213", pm_encoding_euc_jis_2004); ENCODING2("EUC-KR", "eucKR", pm_encoding_euc_kr); ENCODING2("EUC-CN", "eucCN", pm_encoding_gb2312); + ENCODING2("EUC-TW", "eucTW", pm_encoding_euc_tw); ENCODING1("Emacs-Mule", pm_encoding_emacs_mule); ENCODING1("external", pm_encoding_utf_8); break; |
