summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Kevin Newton <kddnewton@gmail.com> 2023-11-11 22:52:23 -0500
committer: git <svn-admin@ruby-lang.org> 2023-11-16 17:39:30 +0000
commit: ca789e7232f95f485198a459fdf23bff16a2c370 (patch)
tree: aa6ee8a033794c9fade46b04da9baab5cab540a2
parent: aebc6e8b8db259b7eeee203f6ec4137019081d70 (diff)
[ruby/prism] Add windows-1250 encoding
https://github.com/ruby/prism/commit/a362535ca4
-rw-r--r-- prism/enc/pm_encoding.h 3
-rw-r--r-- prism/enc/pm_tables.c 35
-rw-r--r-- prism/prism.c 92
-rw-r--r-- test/prism/encoding_test.rb 82
4 files changed, 135 insertions, 77 deletions
diff --git a/prism/enc/pm_encoding.h b/prism/enc/pm_encoding.h
index 7d73576665..97ebc68a4d 100644
--- a/prism/enc/pm_encoding.h
+++ b/prism/enc/pm_encoding.h
@@ -180,8 +180,9 @@ extern pm_encoding_t pm_encoding_koi8_r;
extern pm_encoding_t pm_encoding_shift_jis;
extern pm_encoding_t pm_encoding_utf_8;
extern pm_encoding_t pm_encoding_utf8_mac;
-extern pm_encoding_t pm_encoding_windows_31j;
+extern pm_encoding_t pm_encoding_windows_1250;
extern pm_encoding_t pm_encoding_windows_1251;
extern pm_encoding_t pm_encoding_windows_1252;
+extern pm_encoding_t pm_encoding_windows_31j;
#endif
diff --git a/prism/enc/pm_tables.c b/prism/enc/pm_tables.c
index 7b840acfaa..3ed5523c7f 100644
--- a/prism/enc/pm_tables.c
+++ b/prism/enc/pm_tables.c
@@ -410,6 +410,30 @@ static uint8_t pm_encoding_koi8_r_table[256] = {
/**
* Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding windows-1250 character.
+ */
+static uint8_t pm_encoding_windows_1250_table[256] = {
+// 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x
+ 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x
+ 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 7, 7, 7, 7, // 8x
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 3, 3, 3, // 9x
+ 0, 0, 0, 7, 0, 7, 0, 0, 0, 0, 7, 0, 0, 0, 0, 7, // Ax
+ 0, 0, 0, 3, 0, 3, 0, 0, 0, 3, 3, 0, 7, 0, 3, 3, // Bx
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx
+ 7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
+ 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
+};
+
+/**
+ * Each element of the following table contains a bitfield that indicates a
* piece of information about the corresponding windows-1251 character.
*/
static uint8_t pm_encoding_windows_1251_table[256] = {
@@ -537,6 +561,7 @@ PRISM_ENCODING_TABLE(iso_8859_14)
PRISM_ENCODING_TABLE(iso_8859_15)
PRISM_ENCODING_TABLE(iso_8859_16)
PRISM_ENCODING_TABLE(koi8_r)
+PRISM_ENCODING_TABLE(windows_1250)
PRISM_ENCODING_TABLE(windows_1251)
PRISM_ENCODING_TABLE(windows_1252)
@@ -722,6 +747,16 @@ pm_encoding_t pm_encoding_koi8_r = {
.multibyte = false
};
+/** Windows-1250 */
+pm_encoding_t pm_encoding_windows_1250 = {
+ .name = "windows-1250",
+ .char_width = pm_encoding_single_char_width,
+ .alnum_char = pm_encoding_windows_1250_alnum_char,
+ .alpha_char = pm_encoding_windows_1250_alpha_char,
+ .isupper_char = pm_encoding_windows_1250_isupper_char,
+ .multibyte = false
+};
+
/** Windows-1251 */
pm_encoding_t pm_encoding_windows_1251 = {
.name = "windows-1251",
diff --git a/prism/prism.c b/prism/prism.c
index 572dc1f146..33d50acc3d 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -6022,10 +6022,18 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
}
// Next, we're going to check for UTF-8. This is the most common encoding.
- // Extensions like utf-8 can contain extra encoding details like,
- // utf-8-dos, utf-8-linux, utf-8-mac. We treat these all as utf-8 should
- // treat any encoding starting utf-8 as utf-8.
+ // utf-8 can contain extra information at the end about the platform it is
+ // encoded on, such as utf-8-mac or utf-8-unix. We'll ignore those suffixes.
if ((start + 5 <= end) && (pm_strncasecmp(start, (const uint8_t *) "utf-8", 5) == 0)) {
+ // We need to explicitly handle utf-8-hfs, as that one needs to switch
+ // over to being utf8-mac.
+ if (width == 9 && (pm_strncasecmp(start + 5, (const uint8_t *) "-hfs", 4) == 0)) {
+ parser->encoding = pm_encoding_utf8_mac;
+ parser->encoding_changed = true;
+ if (parser->encoding_changed_callback != NULL) parser->encoding_changed_callback(parser);
+ return true;
+ }
+
// We don't need to do anything here because the default encoding is
// already UTF-8. We'll just return.
return true;
@@ -6036,48 +6044,58 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
#define ENCODING(value, prebuilt) \
if (width == sizeof(value) - 1 && start + width <= end && pm_strncasecmp(start, (const uint8_t *) value, width) == 0) { \
parser->encoding = prebuilt; \
- parser->encoding_changed |= true; \
+ parser->encoding_changed = true; \
if (parser->encoding_changed_callback != NULL) parser->encoding_changed_callback(parser); \
return true; \
}
+ // Build convenience macros to compare aliases for the same encoding.
+#define ENCODING2(value1, value2, prebuilt) ENCODING(value1, prebuilt) ENCODING(value2, prebuilt)
+#define ENCODING3(value1, value2, value3, prebuilt) ENCODING2(value1, value2, prebuilt) ENCODING(value3, prebuilt)
+#define ENCODING4(value1, value2, value3, value4, prebuilt) ENCODING3(value1, value2, value3, prebuilt) ENCODING(value4, prebuilt)
+#define ENCODING5(value1, value2, value3, value4, value5, prebuilt) ENCODING4(value1, value2, value3, value4, prebuilt) ENCODING(value5, prebuilt)
+
// Check most common first. (This is pretty arbitrary.)
- ENCODING("ascii", pm_encoding_ascii);
- ENCODING("ascii-8bit", pm_encoding_ascii_8bit);
- ENCODING("us-ascii", pm_encoding_ascii);
- ENCODING("binary", pm_encoding_ascii_8bit);
- ENCODING("shift_jis", pm_encoding_shift_jis);
- ENCODING("euc-jp", pm_encoding_euc_jp);
+ ENCODING("ASCII", pm_encoding_ascii);
+ ENCODING("ASCII-8BIT", pm_encoding_ascii_8bit);
+ ENCODING("US-ASCII", pm_encoding_ascii);
+ ENCODING("BINARY", pm_encoding_ascii_8bit);
+ ENCODING("Shift_JIS", pm_encoding_shift_jis);
+ ENCODING("EUC-JP", pm_encoding_euc_jp);
// Then check all the others.
- ENCODING("big5", pm_encoding_big5);
+ ENCODING2("ANSI_X3.4-1968", "646", pm_encoding_ascii);
ENCODING("cp51932", pm_encoding_cp51932);
- ENCODING("gbk", pm_encoding_gbk);
- ENCODING("iso-8859-1", pm_encoding_iso_8859_1);
- ENCODING("iso-8859-2", pm_encoding_iso_8859_2);
- ENCODING("iso-8859-3", pm_encoding_iso_8859_3);
- ENCODING("iso-8859-4", pm_encoding_iso_8859_4);
- ENCODING("iso-8859-5", pm_encoding_iso_8859_5);
- ENCODING("iso-8859-6", pm_encoding_iso_8859_6);
- ENCODING("iso-8859-7", pm_encoding_iso_8859_7);
- ENCODING("iso-8859-8", pm_encoding_iso_8859_8);
- ENCODING("iso-8859-9", pm_encoding_iso_8859_9);
- ENCODING("iso-8859-10", pm_encoding_iso_8859_10);
- ENCODING("iso-8859-11", pm_encoding_iso_8859_11);
- ENCODING("iso-8859-13", pm_encoding_iso_8859_13);
- ENCODING("iso-8859-14", pm_encoding_iso_8859_14);
- ENCODING("iso-8859-15", pm_encoding_iso_8859_15);
- ENCODING("iso-8859-16", pm_encoding_iso_8859_16);
- ENCODING("koi8-r", pm_encoding_koi8_r);
- ENCODING("windows-31j", pm_encoding_windows_31j);
- ENCODING("windows-1251", pm_encoding_windows_1251);
- ENCODING("windows-1252", pm_encoding_windows_1252);
- ENCODING("cp1251", pm_encoding_windows_1251);
- ENCODING("cp1252", pm_encoding_windows_1252);
- ENCODING("cp932", pm_encoding_windows_31j);
- ENCODING("sjis", pm_encoding_windows_31j);
- ENCODING("utf8-mac", pm_encoding_utf8_mac);
-
+ ENCODING("eucJP", pm_encoding_euc_jp);
+ ENCODING("Big5", pm_encoding_big5);
+ ENCODING2("GBK", "CP936", pm_encoding_gbk);
+ ENCODING2("ISO-8859-1", "ISO8859-1", pm_encoding_iso_8859_1);
+ ENCODING2("ISO-8859-2", "ISO8859-2", pm_encoding_iso_8859_2);
+ ENCODING2("ISO-8859-3", "ISO8859-3", pm_encoding_iso_8859_3);
+ ENCODING2("ISO-8859-4", "ISO8859-4", pm_encoding_iso_8859_4);
+ ENCODING2("ISO-8859-5", "ISO8859-5", pm_encoding_iso_8859_5);
+ ENCODING2("ISO-8859-6", "ISO8859-6", pm_encoding_iso_8859_6);
+ ENCODING2("ISO-8859-7", "ISO8859-7", pm_encoding_iso_8859_7);
+ ENCODING2("ISO-8859-8", "ISO8859-8", pm_encoding_iso_8859_8);
+ ENCODING2("ISO-8859-9", "ISO8859-9", pm_encoding_iso_8859_9);
+ ENCODING2("ISO-8859-10", "ISO8859-10", pm_encoding_iso_8859_10);
+ ENCODING2("ISO-8859-11", "ISO8859-11", pm_encoding_iso_8859_11);
+ ENCODING2("ISO-8859-13", "ISO8859-13", pm_encoding_iso_8859_13);
+ ENCODING2("ISO-8859-14", "ISO8859-14", pm_encoding_iso_8859_14);
+ ENCODING2("ISO-8859-15", "ISO8859-15", pm_encoding_iso_8859_15);
+ ENCODING2("ISO-8859-16", "ISO8859-16", pm_encoding_iso_8859_16);
+ ENCODING2("KOI8-R", "CP878", pm_encoding_koi8_r);
+ ENCODING4("CP65001", "locale", "external", "filesystem", pm_encoding_utf_8);
+ ENCODING3("UTF8-MAC", "UTF-8-MAC", "UTF-8-HFS", pm_encoding_utf8_mac);
+ ENCODING2("Windows-1250", "CP1250", pm_encoding_windows_1250);
+ ENCODING2("Windows-1251", "CP1251", pm_encoding_windows_1251);
+ ENCODING2("Windows-1252", "CP1252", pm_encoding_windows_1252);
+ ENCODING5("Windows-31J", "CP932", "csWindows31J", "SJIS", "PCK", pm_encoding_windows_31j);
+
+#undef ENCODING2
+#undef ENCODING3
+#undef ENCODING4
+#undef ENCODING5
#undef ENCODING
return false;
diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb
index f3a24fa9dc..9e18989ad3 100644
--- a/test/prism/encoding_test.rb
+++ b/test/prism/encoding_test.rb
@@ -4,45 +4,49 @@ require_relative "test_helper"
module Prism
class EncodingTest < TestCase
- %w[
- ascii
- ascii-8bit
- big5
- binary
- euc-jp
- gbk
- iso-8859-1
- iso-8859-2
- iso-8859-3
- iso-8859-4
- iso-8859-5
- iso-8859-6
- iso-8859-7
- iso-8859-8
- iso-8859-9
- iso-8859-10
- iso-8859-11
- iso-8859-13
- iso-8859-14
- iso-8859-15
- iso-8859-16
- koi8-r
- shift_jis
- sjis
- us-ascii
- utf-8
- utf8-mac
- windows-31j
- windows-1251
- windows-1252
- CP1251
- CP1252
- CP51932
- ].each do |encoding|
- define_method "test_encoding_#{encoding}" do
- result = Prism.parse("# encoding: #{encoding}\n'string'")
- actual = result.value.statements.body.first.unescaped.encoding
- assert_equal Encoding.find(encoding), actual
+ [
+ "US-ASCII",
+ "ASCII-8BIT",
+ "Big5",
+ "CP51932",
+ "EUC-JP",
+ "GBK",
+ "ISO-8859-1",
+ "ISO-8859-2",
+ "ISO-8859-3",
+ "ISO-8859-4",
+ "ISO-8859-5",
+ "ISO-8859-6",
+ "ISO-8859-7",
+ "ISO-8859-8",
+ "ISO-8859-9",
+ "ISO-8859-10",
+ "ISO-8859-11",
+ "ISO-8859-13",
+ "ISO-8859-14",
+ "ISO-8859-15",
+ "ISO-8859-16",
+ "KOI8-R",
+ "Shift_JIS",
+ "Windows-31J",
+ "UTF-8",
+ "UTF8-MAC",
+ "Windows-1250",
+ "Windows-1251",
+ "Windows-1252",
+ ].each do |canonical_name|
+ encoding = Encoding.find(canonical_name)
+
+ encoding.names.each do |name|
+ # Even though UTF-8-MAC is an alias for UTF8-MAC, CRuby treats it as
+ # UTF-8. So we'll skip this test.
+ next if name == "UTF-8-MAC"
+
+ define_method "test_encoding_#{name}" do
+ result = Prism.parse("# encoding: #{name}\n'string'")
+ actual = result.value.statements.body.first.unescaped.encoding
+ assert_equal encoding, actual
+ end
end
end