diff options
| author | Patrick O'Grady <ogradypatrickj@gmail.com> | 2023-11-22 08:21:11 -0500 |
|---|---|---|
| committer | git <svn-admin@ruby-lang.org> | 2023-11-22 13:21:15 +0000 |
| commit | e9ccceab1106fe887665646f2035f7005fc2a5a5 (patch) | |
| tree | fb244d4c6a91ebf5d019df32677f56a353c9905a | |
| parent | c05495530e0308d82a39bb5b248f711f0a485921 (diff) | |
[ruby/prism] Add KOI8-U encoding
(https://github.com/ruby/prism/pull/1906)
* Add test for KOI8-U
* Rename koi8 char_width function
- Rename function for use with any KOI8-based encoding
* Add KOI8-U encoding
* Add encoding to encoding.md
https://github.com/ruby/prism/commit/6cad4552f7
| -rw-r--r-- | prism/enc/pm_encoding.h | 1 | ||||
| -rw-r--r-- | prism/enc/pm_tables.c | 39 | ||||
| -rw-r--r-- | prism/prism.c | 1 | ||||
| -rw-r--r-- | test/prism/encoding_test.rb | 1 |
4 files changed, 40 insertions, 2 deletions
```diff
diff --git a/prism/enc/pm_encoding.h b/prism/enc/pm_encoding.h
index 16deaefcb3..51227b9c96 100644
--- a/prism/enc/pm_encoding.h
+++ b/prism/enc/pm_encoding.h
@@ -198,6 +198,7 @@ extern pm_encoding_t pm_encoding_iso_8859_14;
 extern pm_encoding_t pm_encoding_iso_8859_15;
 extern pm_encoding_t pm_encoding_iso_8859_16;
 extern pm_encoding_t pm_encoding_koi8_r;
+extern pm_encoding_t pm_encoding_koi8_u;
 extern pm_encoding_t pm_encoding_mac_cent_euro;
 extern pm_encoding_t pm_encoding_mac_croatian;
 extern pm_encoding_t pm_encoding_mac_cyrillic;
diff --git a/prism/enc/pm_tables.c b/prism/enc/pm_tables.c
index 6a163315be..6eede35e32 100644
--- a/prism/enc/pm_tables.c
+++ b/prism/enc/pm_tables.c
@@ -866,6 +866,30 @@ static uint8_t pm_encoding_koi8_r_table[256] = {
 
 /**
  * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding KOI8-U character.
+ */
+static uint8_t pm_encoding_koi8_u_table[256] = {
+//  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x
+    0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x
+    0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x
+    0, 0, 0, 3, 3, 0, 3, 3, 0, 0, 0, 0, 0, 3, 0, 0, // Ax
+    0, 0, 0, 7, 7, 0, 7, 7, 0, 0, 0, 0, 0, 7, 0, 0, // Bx
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Cx
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Dx
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Ex
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Fx
+};
+
+/**
+ * Each element of the following table contains a bitfield that indicates a
  * piece of information about the corresponding macCentEuro character.
  */
 static uint8_t pm_encoding_mac_cent_euro_table[256] = {
@@ -1418,7 +1442,7 @@ pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATT
  * checking if it's a valid codepoint in KOI-8 and if it is returning 1.
  */
 static size_t
-pm_encoding_koi8_r_char_width(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
+pm_encoding_koi8_char_width(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
     return ((*b >= 0x20 && *b <= 0x7E) || (*b >= 0x80)) ? 1 : 0;
 }
 
@@ -1468,6 +1492,7 @@ PRISM_ENCODING_TABLE(iso_8859_14)
 PRISM_ENCODING_TABLE(iso_8859_15)
 PRISM_ENCODING_TABLE(iso_8859_16)
 PRISM_ENCODING_TABLE(koi8_r)
+PRISM_ENCODING_TABLE(koi8_u)
 PRISM_ENCODING_TABLE(mac_cent_euro)
 PRISM_ENCODING_TABLE(mac_croatian)
 PRISM_ENCODING_TABLE(mac_cyrillic)
@@ -1855,13 +1880,23 @@ pm_encoding_t pm_encoding_iso_8859_16 = {
 /** KOI8-R */
 pm_encoding_t pm_encoding_koi8_r = {
     .name = "KOI8-R",
-    .char_width = pm_encoding_koi8_r_char_width,
+    .char_width = pm_encoding_koi8_char_width,
     .alnum_char = pm_encoding_koi8_r_alnum_char,
     .alpha_char = pm_encoding_koi8_r_alpha_char,
     .isupper_char = pm_encoding_koi8_r_isupper_char,
     .multibyte = false
 };
 
+/** KOI8-U */
+pm_encoding_t pm_encoding_koi8_u = {
+    .name = "KOI8-U",
+    .char_width = pm_encoding_koi8_char_width,
+    .alnum_char = pm_encoding_koi8_u_alnum_char,
+    .alpha_char = pm_encoding_koi8_u_alpha_char,
+    .isupper_char = pm_encoding_koi8_u_isupper_char,
+    .multibyte = false
+};
+
 /** macCentEuro */
 pm_encoding_t pm_encoding_mac_cent_euro = {
     .name = "macCentEuro",
diff --git a/prism/prism.c b/prism/prism.c
index 6892643495..e7848f0e04 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -6223,6 +6223,7 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
             break;
         case 'K': case 'k':
             ENCODING1("KOI8-R", pm_encoding_koi8_r);
+            ENCODING1("KOI8-U", pm_encoding_koi8_u);
             break;
         case 'L': case 'l':
             ENCODING1("locale", pm_encoding_utf_8);
```
```diff
diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb
index dfbbff0a47..5a0a671b5b 100644
--- a/test/prism/encoding_test.rb
+++ b/test/prism/encoding_test.rb
@@ -44,6 +44,7 @@ module Prism
       Encoding::ISO_8859_15 => 0x00...0x100,
       Encoding::ISO_8859_16 => 0x00...0x100,
       Encoding::KOI8_R => 0x00...0x100,
+      Encoding::KOI8_U => 0x00...0x100,
       Encoding::MACCENTEURO => 0x00...0x100,
       Encoding::MACCROATIAN => 0x00...0x100,
       Encoding::MACCYRILLIC => 0x00...0x100,
```
