From 63c490dc4efea44483d0f5b4e3a6b47a29aa8e84 Mon Sep 17 00:00:00 2001
From: Kevin Newton
Date: Sat, 11 Nov 2023 23:00:18 -0500
Subject: [ruby/prism] Add remaining windows encodings

https://github.com/ruby/prism/commit/e77b549a59
---
 prism/enc/pm_encoding.h     |   6 ++
 prism/enc/pm_tables.c       | 210 ++++++++++++++++++++++++++++++++++++++++++++
 prism/prism.c               |   6 ++
 test/prism/encoding_test.rb |   8 +-
 4 files changed, 229 insertions(+), 1 deletion(-)

diff --git a/prism/enc/pm_encoding.h b/prism/enc/pm_encoding.h
index 97ebc68a4d..9d8c1ac445 100644
--- a/prism/enc/pm_encoding.h
+++ b/prism/enc/pm_encoding.h
@@ -183,6 +183,12 @@ extern pm_encoding_t pm_encoding_utf8_mac;
 extern pm_encoding_t pm_encoding_windows_1250;
 extern pm_encoding_t pm_encoding_windows_1251;
 extern pm_encoding_t pm_encoding_windows_1252;
+extern pm_encoding_t pm_encoding_windows_1253;
+extern pm_encoding_t pm_encoding_windows_1254;
+extern pm_encoding_t pm_encoding_windows_1255;
+extern pm_encoding_t pm_encoding_windows_1256;
+extern pm_encoding_t pm_encoding_windows_1257;
+extern pm_encoding_t pm_encoding_windows_1258;
 extern pm_encoding_t pm_encoding_windows_31j;
 
 #endif
diff --git a/prism/enc/pm_tables.c b/prism/enc/pm_tables.c
index 3ed5523c7f..569128373d 100644
--- a/prism/enc/pm_tables.c
+++ b/prism/enc/pm_tables.c
@@ -480,6 +480,150 @@ static uint8_t pm_encoding_windows_1252_table[256] = {
     3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
 };
 
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding windows-1253 character.
+ */
+static uint8_t pm_encoding_windows_1253_table[256] = {
+//  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x
+    0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x
+    0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x
+    0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax
+    0, 0, 0, 0, 0, 3, 7, 0, 7, 7, 7, 0, 7, 0, 7, 7, // Bx
+    3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx
+    7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3, // Dx
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
+};
+
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding windows-1254 character.
+ */
+static uint8_t pm_encoding_windows_1254_table[256] = {
+//  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x
+    0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x
+    0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 7, 0, 0, 0, // 8x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 0, 0, 7, // 9x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, // Ax
+    0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, // Bx
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx
+    7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
+    3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
+};
+
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding windows-1255 character.
+ */
+static uint8_t pm_encoding_windows_1255_table[256] = {
+//  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x
+    0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x
+    0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax
+    0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // Fx
+};
+
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding windows-1256 character.
+ */
+static uint8_t pm_encoding_windows_1256_table[256] = {
+//  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x
+    0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x
+    0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx
+    0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Cx
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // Dx
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
+    3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
+};
+
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding windows-1257 character.
+ */
+static uint8_t pm_encoding_windows_1257_table[256] = {
+//  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x
+    0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x
+    0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x
+    0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 7, 0, 0, 0, 0, 7, // Ax
+    0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 3, 0, 0, 0, 0, 3, // Bx
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx
+    7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
+    3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
+};
+
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding windows-1258 character.
+ */
+static uint8_t pm_encoding_windows_1258_table[256] = {
+//  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x
+    0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x
+    0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
+};
+
 /**
  * Returns the size of the next character in the ASCII encoding. This basically
  * means that if the top bit is not set, the character is 1 byte long.
@@ -564,6 +708,12 @@ PRISM_ENCODING_TABLE(koi8_r)
 PRISM_ENCODING_TABLE(windows_1250)
 PRISM_ENCODING_TABLE(windows_1251)
 PRISM_ENCODING_TABLE(windows_1252)
+PRISM_ENCODING_TABLE(windows_1253)
+PRISM_ENCODING_TABLE(windows_1254)
+PRISM_ENCODING_TABLE(windows_1255)
+PRISM_ENCODING_TABLE(windows_1256)
+PRISM_ENCODING_TABLE(windows_1257)
+PRISM_ENCODING_TABLE(windows_1258)
 
 #undef PRISM_ENCODING_TABLE
 
@@ -776,3 +926,63 @@ pm_encoding_t pm_encoding_windows_1252 = {
     .isupper_char = pm_encoding_windows_1252_isupper_char,
     .multibyte = false
 };
+
+/** Windows-1253 */
+pm_encoding_t pm_encoding_windows_1253 = {
+    .name = "windows-1253",
+    .char_width = pm_encoding_single_char_width,
+    .alnum_char = pm_encoding_windows_1253_alnum_char,
+    .alpha_char = pm_encoding_windows_1253_alpha_char,
+    .isupper_char = pm_encoding_windows_1253_isupper_char,
+    .multibyte = false
+};
+
+/** Windows-1254 */
+pm_encoding_t pm_encoding_windows_1254 = {
+    .name = "windows-1254",
+    .char_width = pm_encoding_single_char_width,
+    .alnum_char = pm_encoding_windows_1254_alnum_char,
+    .alpha_char = pm_encoding_windows_1254_alpha_char,
+    .isupper_char = pm_encoding_windows_1254_isupper_char,
+    .multibyte = false
+};
+
+/** Windows-1255 */
+pm_encoding_t pm_encoding_windows_1255 = {
+    .name = "windows-1255",
+    .char_width = pm_encoding_single_char_width,
+    .alnum_char = pm_encoding_windows_1255_alnum_char,
+    .alpha_char = pm_encoding_windows_1255_alpha_char,
+    .isupper_char = pm_encoding_windows_1255_isupper_char,
+    .multibyte = false
+};
+
+/** Windows-1256 */
+pm_encoding_t pm_encoding_windows_1256 = {
+    .name = "windows-1256",
+    .char_width = pm_encoding_single_char_width,
+    .alnum_char = pm_encoding_windows_1256_alnum_char,
+    .alpha_char = pm_encoding_windows_1256_alpha_char,
+    .isupper_char = pm_encoding_windows_1256_isupper_char,
+    .multibyte = false
+};
+
+/** Windows-1257 */
+pm_encoding_t pm_encoding_windows_1257 = {
+    .name = "windows-1257",
+    .char_width = pm_encoding_single_char_width,
+    .alnum_char = pm_encoding_windows_1257_alnum_char,
+    .alpha_char = pm_encoding_windows_1257_alpha_char,
+    .isupper_char = pm_encoding_windows_1257_isupper_char,
+    .multibyte = false
+};
+
+/** Windows-1258 */
+pm_encoding_t pm_encoding_windows_1258 = {
+    .name = "windows-1258",
+    .char_width = pm_encoding_single_char_width,
+    .alnum_char = pm_encoding_windows_1258_alnum_char,
+    .alpha_char = pm_encoding_windows_1258_alpha_char,
+    .isupper_char = pm_encoding_windows_1258_isupper_char,
+    .multibyte = false
+};
diff --git a/prism/prism.c b/prism/prism.c
index 33d50acc3d..75b972407b 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -6090,6 +6090,12 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
     ENCODING2("Windows-1250", "CP1250", pm_encoding_windows_1250);
     ENCODING2("Windows-1251", "CP1251", pm_encoding_windows_1251);
     ENCODING2("Windows-1252", "CP1252", pm_encoding_windows_1252);
+    ENCODING2("Windows-1253", "CP1253", pm_encoding_windows_1253);
+    ENCODING2("Windows-1254", "CP1254", pm_encoding_windows_1254);
+    ENCODING2("Windows-1255", "CP1255", pm_encoding_windows_1255);
+    ENCODING2("Windows-1256", "CP1256", pm_encoding_windows_1256);
+    ENCODING2("Windows-1257", "CP1257", pm_encoding_windows_1257);
+    ENCODING2("Windows-1258", "CP1258", pm_encoding_windows_1258);
     ENCODING5("Windows-31J", "CP932", "csWindows31J", "SJIS", "PCK", pm_encoding_windows_31j);
 
 #undef ENCODING2
diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb
index 9e18989ad3..264dbba119 100644
--- a/test/prism/encoding_test.rb
+++ b/test/prism/encoding_test.rb
@@ -28,12 +28,18 @@ module Prism
       "ISO-8859-16",
       "KOI8-R",
       "Shift_JIS",
-      "Windows-31J",
       "UTF-8",
       "UTF8-MAC",
       "Windows-1250",
       "Windows-1251",
       "Windows-1252",
+      "Windows-1253",
+      "Windows-1254",
+      "Windows-1255",
+      "Windows-1256",
+      "Windows-1257",
+      "Windows-1258",
+      "Windows-31J"
     ].each do |canonical_name|
       encoding = Encoding.find(canonical_name)
 
-- 
cgit v1.2.3