summary refs log tree commit diff
path: root/prism
diff options
context:
space:
mode:
author Kevin Newton <kddnewton@gmail.com> 2023-11-30 09:21:48 -0500
committer Kevin Newton <kddnewton@gmail.com> 2023-11-30 21:37:56 -0500
commit ca26e0e34bcf41b5ce60f15dc5db5bae8bb36305 (patch)
tree 5e72681c2e8649c06aa7cf10e875fa898c5c5a2e /prism
parent dde0abb29ea119e5ae5ce7a01b9d007ce5403a37 (diff)
[ruby/prism] EUC-TW encoding
https://github.com/ruby/prism/commit/edfb54f039
Diffstat (limited to 'prism')
-rw-r--r-- prism/enc/pm_encoding.h 1
-rw-r--r-- prism/enc/pm_euc_jp.c 172
-rw-r--r-- prism/prism.c 1
3 files changed, 75 insertions, 99 deletions
diff --git a/prism/enc/pm_encoding.h b/prism/enc/pm_encoding.h
index 49dace45c7..59d31e26b4 100644
--- a/prism/enc/pm_encoding.h
+++ b/prism/enc/pm_encoding.h
@@ -172,6 +172,7 @@ extern pm_encoding_t pm_encoding_euc_jp;
extern pm_encoding_t pm_encoding_euc_jp_ms;
extern pm_encoding_t pm_encoding_euc_jis_2004;
extern pm_encoding_t pm_encoding_euc_kr;
+extern pm_encoding_t pm_encoding_euc_tw;
extern pm_encoding_t pm_encoding_gb12345;
extern pm_encoding_t pm_encoding_gb18030;
extern pm_encoding_t pm_encoding_gb1988;
diff --git a/prism/enc/pm_euc_jp.c b/prism/enc/pm_euc_jp.c
index a33ba4f0a5..ec3d84aa8e 100644
--- a/prism/enc/pm_euc_jp.c
+++ b/prism/enc/pm_euc_jp.c
@@ -1,6 +1,21 @@
#include "prism/enc/pm_encoding.h"
static size_t
+pm_encoding_ascii_alpha_char_lt_0x80(const uint8_t *b, ptrdiff_t n) {
+ return (*b < 0x80) ? pm_encoding_ascii_alpha_char(b, n) : 0;
+}
+
+static size_t
+pm_encoding_ascii_alnum_char_lt_0x80(const uint8_t *b, ptrdiff_t n) {
+ return (*b < 0x80) ? pm_encoding_ascii_alnum_char(b, n) : 0;
+}
+
+static bool
+pm_encoding_ascii_isupper_char_lt_0x80(const uint8_t *b, ptrdiff_t n) {
+ return (*b < 0x80) && pm_encoding_ascii_isupper_char(b, n);
+}
+
+static size_t
pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) {
// These are the single byte characters.
if (*b < 0x80) {
@@ -8,23 +23,12 @@ pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) {
}
// These are the double byte characters.
- if (
- (n > 1) &&
- (
- ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) &&
- (b[1] >= 0xA1 && b[1] <= 0xFE)
- )
- ) {
+ if ((n > 1) && ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) && (b[1] >= 0xA1 && b[1] <= 0xFE)) {
return 2;
}
// These are the triple byte characters.
- if (
- (n > 2) &&
- (b[0] == 0x8F) &&
- (b[1] >= 0xA1 && b[2] <= 0xFE) &&
- (b[2] >= 0xA1 && b[2] <= 0xFE)
- ) {
+ if ((n > 2) && (b[0] == 0x8F) && (b[1] >= 0xA1 && b[2] <= 0xFE) && (b[2] >= 0xA1 && b[2] <= 0xFE)) {
return 3;
}
@@ -32,39 +36,47 @@ pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) {
}
static size_t
-pm_encoding_euc_jp_alpha_char(const uint8_t *b, ptrdiff_t n) {
- if (pm_encoding_euc_jp_char_width(b, n) == 1) {
- return pm_encoding_ascii_alpha_char(b, n);
- } else {
- return 0;
+pm_encoding_euc_kr_char_width(const uint8_t *b, ptrdiff_t n) {
+ // These are the single byte characters.
+ if (*b < 0x80) {
+ return 1;
+ }
+
+ // These are the double byte characters.
+ if ((n > 1) && (b[0] >= 0xA1 && b[0] <= 0xFE) && (b[1] >= 0xA1 && b[1] <= 0xFE)) {
+ return 2;
}
+
+ return 0;
}
static size_t
-pm_encoding_euc_jp_alnum_char(const uint8_t *b, ptrdiff_t n) {
- if (pm_encoding_euc_jp_char_width(b, n) == 1) {
- return pm_encoding_ascii_alnum_char(b, n);
- } else {
- return 0;
+pm_encoding_euc_tw_char_width(const uint8_t *b, ptrdiff_t n) {
+ // These are the single byte characters.
+ if (*b < 0x80) {
+ return 1;
}
-}
-static bool
-pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) {
- if (pm_encoding_euc_jp_char_width(b, n) == 1) {
- return pm_encoding_ascii_isupper_char(b, n);
- } else {
- return 0;
+ // These are the double byte characters.
+ if ((n > 1) && (b[0] >= 0xA1) && (b[0] <= 0xFE) && (b[1] >= 0xA1) && (b[1] <= 0xFE)) {
+ return 2;
}
+
+ // These are the quadruple byte characters.
+ if ((n > 3) && (b[0] == 0x8E) && (b[1] >= 0xA1) && (b[1] <= 0xB0) && (b[2] >= 0xA1) && (b[2] <= 0xFE) && (b[3] >= 0xA1) && (b[3] <= 0xFE)) {
+ return 4;
+ }
+
+ return 0;
}
/** EUC-JP encoding */
pm_encoding_t pm_encoding_euc_jp = {
.name = "EUC-JP",
.char_width = pm_encoding_euc_jp_char_width,
- .alnum_char = pm_encoding_euc_jp_alnum_char,
- .alpha_char = pm_encoding_euc_jp_alpha_char,
- .isupper_char = pm_encoding_euc_jp_isupper_char,
+ .alnum_char = pm_encoding_ascii_alnum_char_lt_0x80,
+ .alpha_char = pm_encoding_ascii_alpha_char_lt_0x80,
+ .isupper_char = pm_encoding_ascii_isupper_char_lt_0x80,
.multibyte = true
};
@@ -72,9 +84,9 @@ pm_encoding_t pm_encoding_euc_jp = {
pm_encoding_t pm_encoding_euc_jp_ms = {
.name = "eucJP-ms",
.char_width = pm_encoding_euc_jp_char_width,
- .alnum_char = pm_encoding_euc_jp_alnum_char,
- .alpha_char = pm_encoding_euc_jp_alpha_char,
- .isupper_char = pm_encoding_euc_jp_isupper_char,
+ .alnum_char = pm_encoding_ascii_alnum_char_lt_0x80,
+ .alpha_char = pm_encoding_ascii_alpha_char_lt_0x80,
+ .isupper_char = pm_encoding_ascii_isupper_char_lt_0x80,
.multibyte = true
};
@@ -82,9 +94,9 @@ pm_encoding_t pm_encoding_euc_jp_ms = {
pm_encoding_t pm_encoding_euc_jis_2004 = {
.name = "EUC-JIS-2004",
.char_width = pm_encoding_euc_jp_char_width,
- .alnum_char = pm_encoding_euc_jp_alnum_char,
- .alpha_char = pm_encoding_euc_jp_alpha_char,
- .isupper_char = pm_encoding_euc_jp_isupper_char,
+ .alnum_char = pm_encoding_ascii_alnum_char_lt_0x80,
+ .alpha_char = pm_encoding_ascii_alpha_char_lt_0x80,
+ .isupper_char = pm_encoding_ascii_isupper_char_lt_0x80,
.multibyte = true
};
@@ -92,67 +104,19 @@ pm_encoding_t pm_encoding_euc_jis_2004 = {
pm_encoding_t pm_encoding_cp51932 = {
.name = "CP51932",
.char_width = pm_encoding_euc_jp_char_width,
- .alnum_char = pm_encoding_euc_jp_alnum_char,
- .alpha_char = pm_encoding_euc_jp_alpha_char,
- .isupper_char = pm_encoding_euc_jp_isupper_char,
+ .alnum_char = pm_encoding_ascii_alnum_char_lt_0x80,
+ .alpha_char = pm_encoding_ascii_alpha_char_lt_0x80,
+ .isupper_char = pm_encoding_ascii_isupper_char_lt_0x80,
.multibyte = true
};
-static size_t
-pm_encoding_euc_kr_char_width(const uint8_t *b, ptrdiff_t n) {
- // These are the single byte characters.
- if (*b < 0x80) {
- return 1;
- }
-
- // These are the double byte characters.
- if (
- (n > 1) &&
- (
- (b[0] >= 0xA1 && b[0] <= 0xFE) &&
- (b[1] >= 0xA1 && b[1] <= 0xFE)
- )
- ) {
- return 2;
- }
-
- return 0;
-}
-
-static size_t
-pm_encoding_euc_kr_alpha_char(const uint8_t *b, ptrdiff_t n) {
- if (pm_encoding_euc_kr_char_width(b, n) == 1) {
- return pm_encoding_ascii_alpha_char(b, n);
- } else {
- return 0;
- }
-}
-
-static size_t
-pm_encoding_euc_kr_alnum_char(const uint8_t *b, ptrdiff_t n) {
- if (pm_encoding_euc_kr_char_width(b, n) == 1) {
- return pm_encoding_ascii_alnum_char(b, n);
- } else {
- return 0;
- }
-}
-
-static bool
-pm_encoding_euc_kr_isupper_char(const uint8_t *b, ptrdiff_t n) {
- if (pm_encoding_euc_kr_char_width(b, n) == 1) {
- return pm_encoding_ascii_isupper_char(b, n);
- } else {
- return 0;
- }
-}
-
/** EUC-KR encoding */
pm_encoding_t pm_encoding_euc_kr = {
.name = "EUC-KR",
.char_width = pm_encoding_euc_kr_char_width,
- .alnum_char = pm_encoding_euc_kr_alnum_char,
- .alpha_char = pm_encoding_euc_kr_alpha_char,
- .isupper_char = pm_encoding_euc_kr_isupper_char,
+ .alnum_char = pm_encoding_ascii_alnum_char_lt_0x80,
+ .alpha_char = pm_encoding_ascii_alpha_char_lt_0x80,
+ .isupper_char = pm_encoding_ascii_isupper_char_lt_0x80,
.multibyte = true
};
@@ -160,9 +124,9 @@ pm_encoding_t pm_encoding_euc_kr = {
pm_encoding_t pm_encoding_gb2312 = {
.name = "GB2312",
.char_width = pm_encoding_euc_kr_char_width,
- .alnum_char = pm_encoding_euc_kr_alnum_char,
- .alpha_char = pm_encoding_euc_kr_alpha_char,
- .isupper_char = pm_encoding_euc_kr_isupper_char,
+ .alnum_char = pm_encoding_ascii_alnum_char_lt_0x80,
+ .alpha_char = pm_encoding_ascii_alpha_char_lt_0x80,
+ .isupper_char = pm_encoding_ascii_isupper_char_lt_0x80,
.multibyte = true
};
@@ -170,8 +134,18 @@ pm_encoding_t pm_encoding_gb2312 = {
pm_encoding_t pm_encoding_gb12345 = {
.name = "GB12345",
.char_width = pm_encoding_euc_kr_char_width,
- .alnum_char = pm_encoding_euc_kr_alnum_char,
- .alpha_char = pm_encoding_euc_kr_alpha_char,
- .isupper_char = pm_encoding_euc_kr_isupper_char,
+ .alnum_char = pm_encoding_ascii_alnum_char_lt_0x80,
+ .alpha_char = pm_encoding_ascii_alpha_char_lt_0x80,
+ .isupper_char = pm_encoding_ascii_isupper_char_lt_0x80,
+ .multibyte = true
+};
+
+/** EUC-TW encoding */
+pm_encoding_t pm_encoding_euc_tw = {
+ .name = "EUC-TW",
+ .char_width = pm_encoding_euc_tw_char_width,
+ .alnum_char = pm_encoding_ascii_alnum_char_lt_0x80,
+ .alpha_char = pm_encoding_ascii_alpha_char_lt_0x80,
+ .isupper_char = pm_encoding_ascii_isupper_char_lt_0x80,
.multibyte = true
};
diff --git a/prism/prism.c b/prism/prism.c
index 7ede34bea0..cdada34a73 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -6253,6 +6253,7 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
ENCODING2("EUC-JIS-2004", "EUC-JISX0213", pm_encoding_euc_jis_2004);
ENCODING2("EUC-KR", "eucKR", pm_encoding_euc_kr);
ENCODING2("EUC-CN", "eucCN", pm_encoding_gb2312);
+ ENCODING2("EUC-TW", "eucTW", pm_encoding_euc_tw);
ENCODING1("Emacs-Mule", pm_encoding_emacs_mule);
ENCODING1("external", pm_encoding_utf_8);
break;