diff options
Diffstat (limited to 'prism/encoding.c')
| -rw-r--r-- | prism/encoding.c | 560 |
1 files changed, 354 insertions, 206 deletions
diff --git a/prism/encoding.c b/prism/encoding.c index dc63cccc2d..c9c2e13056 100644 --- a/prism/encoding.c +++ b/prism/encoding.c @@ -1,8 +1,13 @@ -#include "prism/encoding.h" +#include "prism/internal/encoding.h" + +#include "prism/compiler/unused.h" +#include "prism/internal/strncasecmp.h" + +#include <assert.h> typedef uint32_t pm_unicode_codepoint_t; -#define UNICODE_ALPHA_CODEPOINTS_LENGTH 1450 +#define UNICODE_ALPHA_CODEPOINTS_LENGTH 1508 static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEPOINTS_LENGTH] = { 0x100, 0x2C1, 0x2C6, 0x2D1, @@ -10,7 +15,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x2EC, 0x2EC, 0x2EE, 0x2EE, 0x345, 0x345, - 0x370, 0x374, + 0x363, 0x374, 0x376, 0x377, 0x37A, 0x37D, 0x37F, 0x37F, @@ -50,7 +55,8 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x840, 0x858, 0x860, 0x86A, 0x870, 0x887, - 0x889, 0x88E, + 0x889, 0x88F, + 0x897, 0x897, 0x8A0, 0x8C9, 0x8D4, 0x8DF, 0x8E3, 0x8E9, @@ -140,7 +146,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0xC4A, 0xC4C, 0xC55, 0xC56, 0xC58, 0xC5A, - 0xC5D, 0xC5D, + 0xC5C, 0xC5D, 0xC60, 0xC63, 0xC80, 0xC83, 0xC85, 0xC8C, @@ -152,7 +158,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0xCC6, 0xCC8, 0xCCA, 0xCCC, 0xCD5, 0xCD6, - 0xCDD, 0xCDE, + 0xCDC, 0xCDE, 0xCE0, 0xCE3, 0xCF1, 0xCF3, 0xD00, 0xD0C, @@ -264,7 +270,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x1C00, 0x1C36, 0x1C4D, 0x1C4F, 0x1C5A, 0x1C7D, - 0x1C80, 0x1C88, + 0x1C80, 0x1C8A, 0x1C90, 0x1CBA, 0x1CBD, 0x1CBF, 0x1CE9, 0x1CEC, @@ -272,7 +278,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x1CF5, 0x1CF6, 0x1CFA, 0x1CFA, 0x1D00, 0x1DBF, - 0x1DE7, 0x1DF4, + 0x1DD3, 0x1DF4, 0x1E00, 0x1F15, 0x1F18, 0x1F1D, 0x1F20, 0x1F45, @@ -352,11 +358,8 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0xA67F, 0xA6EF, 0xA717, 0xA71F, 0xA722, 0xA788, - 0xA78B, 0xA7CA, - 0xA7D0, 0xA7D1, - 0xA7D3, 0xA7D3, - 0xA7D5, 0xA7D9, - 0xA7F2, 0xA805, + 0xA78B, 0xA7DC, + 0xA7F1, 0xA805, 0xA807, 0xA827, 0xA840, 0xA873, 0xA880, 0xA8C3, @@ -446,6 +449,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x105A3, 0x105B1, 0x105B3, 0x105B9, 0x105BB, 0x105BC, + 0x105C0, 0x105F3, 0x10600, 0x10736, 0x10740, 0x10755, 0x10760, 0x10767, @@ -464,6 +468,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x108F4, 0x108F5, 0x10900, 0x10915, 0x10920, 0x10939, + 0x10940, 0x10959, 0x10980, 0x109B7, 0x109BE, 0x109BF, 0x10A00, 0x10A03, @@ -483,9 +488,14 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x10C80, 0x10CB2, 0x10CC0, 0x10CF2, 0x10D00, 0x10D27, + 0x10D4A, 0x10D65, + 0x10D69, 0x10D69, + 0x10D6F, 0x10D85, 0x10E80, 0x10EA9, 0x10EAB, 0x10EAC, 0x10EB0, 0x10EB1, + 0x10EC2, 0x10EC7, + 0x10EFA, 0x10EFC, 0x10F00, 0x10F1C, 0x10F27, 0x10F27, 0x10F30, 0x10F45, @@ -529,6 +539,17 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x11350, 0x11350, 0x11357, 0x11357, 0x1135D, 0x11363, + 0x11380, 0x11389, + 0x1138B, 0x1138B, + 0x1138E, 0x1138E, + 0x11390, 0x113B5, + 0x113B7, 0x113C0, + 0x113C2, 0x113C2, + 0x113C5, 0x113C5, + 0x113C7, 0x113CA, + 0x113CC, 0x113CD, + 0x113D1, 0x113D1, + 0x113D3, 0x113D3, 0x11400, 0x11441, 0x11443, 0x11445, 0x11447, 0x1144A, @@ -567,6 +588,8 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x11A50, 0x11A97, 0x11A9D, 0x11A9D, 0x11AB0, 0x11AF8, + 0x11B60, 0x11B67, + 0x11BC0, 0x11BE0, 0x11C00, 0x11C08, 0x11C0A, 0x11C36, 0x11C38, 0x11C3E, @@ -588,6 +611,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x11D90, 0x11D91, 0x11D93, 0x11D96, 0x11D98, 0x11D98, + 0x11DB0, 0x11DDB, 0x11EE0, 0x11EF6, 0x11F00, 0x11F10, 0x11F12, 0x11F3A, @@ -599,7 +623,9 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x12F90, 0x12FF0, 0x13000, 0x1342F, 0x13441, 0x13446, + 0x13460, 0x143FA, 0x14400, 0x14646, + 0x16100, 0x1612E, 0x16800, 0x16A38, 0x16A40, 0x16A5E, 0x16A70, 0x16ABE, @@ -608,16 +634,19 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x16B40, 0x16B43, 0x16B63, 0x16B77, 0x16B7D, 0x16B8F, + 0x16D40, 0x16D6C, 0x16E40, 0x16E7F, + 0x16EA0, 0x16EB8, + 0x16EBB, 0x16ED3, 0x16F00, 0x16F4A, 0x16F4F, 0x16F87, 0x16F8F, 0x16F9F, 0x16FE0, 0x16FE1, 0x16FE3, 0x16FE3, - 0x16FF0, 0x16FF1, - 0x17000, 0x187F7, - 0x18800, 0x18CD5, - 0x18D00, 0x18D08, + 0x16FF0, 0x16FF6, + 0x17000, 0x18CD5, + 0x18CFF, 0x18D1E, + 0x18D80, 0x18DF2, 0x1AFF0, 0x1AFF3, 0x1AFF5, 0x1AFFB, 0x1AFFD, 0x1AFFE, @@ -677,6 +706,11 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x1E290, 0x1E2AD, 0x1E2C0, 0x1E2EB, 0x1E4D0, 0x1E4EB, + 0x1E5D0, 0x1E5ED, + 0x1E5F0, 0x1E5F0, + 0x1E6C0, 0x1E6DE, + 0x1E6E0, 0x1E6F5, + 0x1E6FE, 0x1E6FF, 0x1E7E0, 0x1E7E6, 0x1E7E8, 0x1E7EB, 0x1E7ED, 0x1E7EE, @@ -722,16 +756,16 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x1F150, 0x1F169, 0x1F170, 0x1F189, 0x20000, 0x2A6DF, - 0x2A700, 0x2B739, - 0x2B740, 0x2B81D, - 0x2B820, 0x2CEA1, + 0x2A700, 0x2B81D, + 0x2B820, 0x2CEAD, 0x2CEB0, 0x2EBE0, + 0x2EBF0, 0x2EE5D, 0x2F800, 0x2FA1D, 0x30000, 0x3134A, - 0x31350, 0x323AF, + 0x31350, 0x33479, }; -#define UNICODE_ALNUM_CODEPOINTS_LENGTH 1528 +#define UNICODE_ALNUM_CODEPOINTS_LENGTH 1598 static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEPOINTS_LENGTH] = { 0x100, 0x2C1, 0x2C6, 0x2D1, @@ -739,7 +773,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x2EC, 0x2EC, 0x2EE, 0x2EE, 0x345, 0x345, - 0x370, 0x374, + 0x363, 0x374, 0x376, 0x377, 0x37A, 0x37D, 0x37F, 0x37F, @@ -778,7 +812,8 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x840, 0x858, 0x860, 0x86A, 0x870, 0x887, - 0x889, 0x88E, + 0x889, 0x88F, + 0x897, 0x897, 0x8A0, 0x8C9, 0x8D4, 0x8DF, 0x8E3, 0x8E9, @@ -872,7 +907,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0xC4A, 0xC4C, 0xC55, 0xC56, 0xC58, 0xC5A, - 0xC5D, 0xC5D, + 0xC5C, 0xC5D, 0xC60, 0xC63, 0xC66, 0xC6F, 0xC80, 0xC83, @@ -885,7 +920,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0xCC6, 0xCC8, 0xCCA, 0xCCC, 0xCD5, 0xCD6, - 0xCDD, 0xCDE, + 0xCDC, 0xCDE, 0xCE0, 0xCE3, 0xCE6, 0xCEF, 0xCF1, 0xCF3, @@ -1007,7 +1042,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x1C00, 0x1C36, 0x1C40, 0x1C49, 0x1C4D, 0x1C7D, - 0x1C80, 0x1C88, + 0x1C80, 0x1C8A, 0x1C90, 0x1CBA, 0x1CBD, 0x1CBF, 0x1CE9, 0x1CEC, @@ -1015,7 +1050,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x1CF5, 0x1CF6, 0x1CFA, 0x1CFA, 0x1D00, 0x1DBF, - 0x1DE7, 0x1DF4, + 0x1DD3, 0x1DF4, 0x1E00, 0x1F15, 0x1F18, 0x1F1D, 0x1F20, 0x1F45, @@ -1094,11 +1129,8 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0xA67F, 0xA6EF, 0xA717, 0xA71F, 0xA722, 0xA788, - 0xA78B, 0xA7CA, - 0xA7D0, 0xA7D1, - 0xA7D3, 0xA7D3, - 0xA7D5, 0xA7D9, - 0xA7F2, 0xA805, + 0xA78B, 0xA7DC, + 0xA7F1, 0xA805, 0xA807, 0xA827, 0xA840, 0xA873, 0xA880, 0xA8C3, @@ -1191,6 +1223,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x105A3, 0x105B1, 0x105B3, 0x105B9, 0x105BB, 0x105BC, + 0x105C0, 0x105F3, 0x10600, 0x10736, 0x10740, 0x10755, 0x10760, 0x10767, @@ -1209,6 +1242,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x108F4, 0x108F5, 0x10900, 0x10915, 0x10920, 0x10939, + 0x10940, 0x10959, 0x10980, 0x109B7, 0x109BE, 0x109BF, 0x10A00, 0x10A03, @@ -1229,9 +1263,14 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x10CC0, 0x10CF2, 0x10D00, 0x10D27, 0x10D30, 0x10D39, + 0x10D40, 0x10D65, + 0x10D69, 0x10D69, + 0x10D6F, 0x10D85, 0x10E80, 0x10EA9, 0x10EAB, 0x10EAC, 0x10EB0, 0x10EB1, + 0x10EC2, 0x10EC7, + 0x10EFA, 0x10EFC, 0x10F00, 0x10F1C, 0x10F27, 0x10F27, 0x10F30, 0x10F45, @@ -1278,6 +1317,17 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x11350, 0x11350, 0x11357, 0x11357, 0x1135D, 0x11363, + 0x11380, 0x11389, + 0x1138B, 0x1138B, + 0x1138E, 0x1138E, + 0x11390, 0x113B5, + 0x113B7, 0x113C0, + 0x113C2, 0x113C2, + 0x113C5, 0x113C5, + 0x113C7, 0x113CA, + 0x113CC, 0x113CD, + 0x113D1, 0x113D1, + 0x113D3, 0x113D3, 0x11400, 0x11441, 0x11443, 0x11445, 0x11447, 0x1144A, @@ -1297,6 +1347,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x11680, 0x116B5, 0x116B8, 0x116B8, 0x116C0, 0x116C9, + 0x116D0, 0x116E3, 0x11700, 0x1171A, 0x1171D, 0x1172A, 0x11730, 0x11739, @@ -1322,6 +1373,9 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x11A50, 0x11A97, 0x11A9D, 0x11A9D, 0x11AB0, 0x11AF8, + 0x11B60, 0x11B67, + 0x11BC0, 0x11BE0, + 0x11BF0, 0x11BF9, 0x11C00, 0x11C08, 0x11C0A, 0x11C36, 0x11C38, 0x11C3E, @@ -1346,6 +1400,8 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x11D93, 0x11D96, 0x11D98, 0x11D98, 0x11DA0, 0x11DA9, + 0x11DB0, 0x11DDB, + 0x11DE0, 0x11DE9, 0x11EE0, 0x11EF6, 0x11F00, 0x11F10, 0x11F12, 0x11F3A, @@ -1358,7 +1414,10 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x12F90, 0x12FF0, 0x13000, 0x1342F, 0x13441, 0x13446, + 0x13460, 0x143FA, 0x14400, 0x14646, + 0x16100, 0x1612E, + 0x16130, 0x16139, 0x16800, 0x16A38, 0x16A40, 0x16A5E, 0x16A60, 0x16A69, @@ -1370,16 +1429,20 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x16B50, 0x16B59, 0x16B63, 0x16B77, 0x16B7D, 0x16B8F, + 0x16D40, 0x16D6C, + 0x16D70, 0x16D79, 0x16E40, 0x16E7F, + 0x16EA0, 0x16EB8, + 0x16EBB, 0x16ED3, 0x16F00, 0x16F4A, 0x16F4F, 0x16F87, 0x16F8F, 0x16F9F, 0x16FE0, 0x16FE1, 0x16FE3, 0x16FE3, - 0x16FF0, 0x16FF1, - 0x17000, 0x187F7, - 0x18800, 0x18CD5, - 0x18D00, 0x18D08, + 0x16FF0, 0x16FF6, + 0x17000, 0x18CD5, + 0x18CFF, 0x18D1E, + 0x18D80, 0x18DF2, 0x1AFF0, 0x1AFF3, 0x1AFF5, 0x1AFFB, 0x1AFFD, 0x1AFFE, @@ -1394,6 +1457,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x1BC80, 0x1BC88, 0x1BC90, 0x1BC99, 0x1BC9E, 0x1BC9E, + 0x1CCF0, 0x1CCF9, 0x1D400, 0x1D454, 0x1D456, 0x1D49C, 0x1D49E, 0x1D49F, @@ -1443,6 +1507,11 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x1E2F0, 0x1E2F9, 0x1E4D0, 0x1E4EB, 0x1E4F0, 0x1E4F9, + 0x1E5D0, 0x1E5ED, + 0x1E5F0, 0x1E5FA, + 0x1E6C0, 0x1E6DE, + 0x1E6E0, 0x1E6F5, + 0x1E6FE, 0x1E6FF, 0x1E7E0, 0x1E7E6, 0x1E7E8, 0x1E7EB, 0x1E7ED, 0x1E7EE, @@ -1490,16 +1559,16 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x1F170, 0x1F189, 0x1FBF0, 0x1FBF9, 0x20000, 0x2A6DF, - 0x2A700, 0x2B739, - 0x2B740, 0x2B81D, - 0x2B820, 0x2CEA1, + 0x2A700, 0x2B81D, + 0x2B820, 0x2CEAD, 0x2CEB0, 0x2EBE0, + 0x2EBF0, 0x2EE5D, 0x2F800, 0x2FA1D, 0x30000, 0x3134A, - 0x31350, 0x323AF, + 0x31350, 0x33479, }; -#define UNICODE_ISUPPER_CODEPOINTS_LENGTH 1302 +#define UNICODE_ISUPPER_CODEPOINTS_LENGTH 1320 static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_CODEPOINTS_LENGTH] = { 0x100, 0x100, 0x102, 0x102, @@ -1774,6 +1843,7 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C 0x10C7, 0x10C7, 0x10CD, 0x10CD, 0x13A0, 0x13F5, + 0x1C89, 0x1C89, 0x1C90, 0x1CBA, 0x1CBD, 0x1CBF, 0x1E00, 0x1E00, @@ -2103,9 +2173,15 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C 0xA7C2, 0xA7C2, 0xA7C4, 0xA7C7, 0xA7C9, 0xA7C9, + 0xA7CB, 0xA7CC, + 0xA7CE, 0xA7CE, 0xA7D0, 0xA7D0, + 0xA7D2, 0xA7D2, + 0xA7D4, 0xA7D4, 0xA7D6, 0xA7D6, 0xA7D8, 0xA7D8, + 0xA7DA, 0xA7DA, + 0xA7DC, 0xA7DC, 0xA7F5, 0xA7F5, 0xFF21, 0xFF3A, 0x10400, 0x10427, @@ -2115,8 +2191,10 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C 0x1058C, 0x10592, 0x10594, 0x10595, 0x10C80, 0x10CB2, + 0x10D50, 0x10D65, 0x118A0, 0x118BF, 0x16E40, 0x16E5F, + 0x16EA0, 0x16EB8, 0x1D400, 0x1D419, 0x1D434, 0x1D44D, 0x1D468, 0x1D481, @@ -2304,6 +2382,10 @@ pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n) { */ size_t pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n) { + if (n == 0) { + return 0; + } + if (*b < 0x80) { return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT) ? 1 : 0; } @@ -2324,6 +2406,10 @@ pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n) { */ size_t pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n) { + if (n == 0) { + return 0; + } + if (*b < 0x80) { return (pm_encoding_unicode_table[*b] & (PRISM_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0; } @@ -2344,6 +2430,10 @@ pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n) { */ bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) { + if (n == 0) { + return 0; + } + if (*b < 0x80) { return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_UPPERCASE_BIT) ? true : false; } @@ -2358,9 +2448,12 @@ pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) { } } +#ifndef PRISM_ENCODING_EXCLUDE_FULL + static pm_unicode_codepoint_t pm_cesu_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) { - if (b[0] < 0x80) { + + if ((n > 0) && (b[0] < 0x80)) { *width = 1; return (pm_unicode_codepoint_t) b[0]; } @@ -2399,6 +2492,10 @@ pm_cesu_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) { static size_t pm_encoding_cesu_8_char_width(const uint8_t *b, ptrdiff_t n) { + if (n == 0) { + return 0; + } + size_t width; pm_cesu_8_codepoint(b, n, &width); return width; @@ -2406,6 +2503,10 @@ pm_encoding_cesu_8_char_width(const uint8_t *b, ptrdiff_t n) { static size_t pm_encoding_cesu_8_alpha_char(const uint8_t *b, ptrdiff_t n) { + if (n == 0) { + return 0; + } + if (*b < 0x80) { return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT) ? 1 : 0; } @@ -2422,6 +2523,10 @@ pm_encoding_cesu_8_alpha_char(const uint8_t *b, ptrdiff_t n) { static size_t pm_encoding_cesu_8_alnum_char(const uint8_t *b, ptrdiff_t n) { + if (n == 0) { + return 0; + } + if (*b < 0x80) { return (pm_encoding_unicode_table[*b] & (PRISM_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0; } @@ -2438,6 +2543,10 @@ pm_encoding_cesu_8_alnum_char(const uint8_t *b, ptrdiff_t n) { static bool pm_encoding_cesu_8_isupper_char(const uint8_t *b, ptrdiff_t n) { + if (n == 0) { + return 0; + } + if (*b < 0x80) { return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_UPPERCASE_BIT) ? true : false; } @@ -2452,6 +2561,8 @@ pm_encoding_cesu_8_isupper_char(const uint8_t *b, ptrdiff_t n) { } } +#endif + #undef UNICODE_ALPHA_CODEPOINTS_LENGTH #undef UNICODE_ALNUM_CODEPOINTS_LENGTH #undef UNICODE_ISUPPER_CODEPOINTS_LENGTH @@ -2480,6 +2591,8 @@ static const uint8_t pm_encoding_ascii_table[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx }; +#ifndef PRISM_ENCODING_EXCLUDE_FULL + /** * Each element of the following table contains a bitfield that indicates a * piece of information about the corresponding CP850 character. @@ -3849,14 +3962,14 @@ static const uint8_t pm_encoding_windows_874_table[256] = { }; #define PRISM_ENCODING_TABLE(name) \ - static size_t pm_encoding_ ##name ## _alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \ - return (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_ALPHABETIC_BIT); \ + static size_t pm_encoding_ ##name ## _alpha_char(const uint8_t *b, ptrdiff_t n) { \ + return ((n > 0) && (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_ALPHABETIC_BIT)); \ } \ - static size_t pm_encoding_ ##name ## _alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \ - return (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT) ? 1 : 0; \ + static size_t pm_encoding_ ##name ## _alnum_char(const uint8_t *b, ptrdiff_t n) { \ + return ((n > 0) && (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0; \ } \ - static bool pm_encoding_ ##name ## _isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \ - return (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_UPPERCASE_BIT); \ + static bool pm_encoding_ ##name ## _isupper_char(const uint8_t *b, ptrdiff_t n) { \ + return ((n > 0) && (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_UPPERCASE_BIT)); \ } PRISM_ENCODING_TABLE(cp850) @@ -3918,14 +4031,15 @@ PRISM_ENCODING_TABLE(windows_1258) PRISM_ENCODING_TABLE(windows_874) #undef PRISM_ENCODING_TABLE +#endif /** * Returns the size of the next character in the ASCII encoding. This basically * means that if the top bit is not set, the character is 1 byte long. */ static size_t -pm_encoding_ascii_char_width(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { - return *b < 0x80 ? 1 : 0; +pm_encoding_ascii_char_width(const uint8_t *b, ptrdiff_t n) { + return ((n > 0) && (*b < 0x80)) ? 1 : 0; } /** @@ -3933,8 +4047,8 @@ pm_encoding_ascii_char_width(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t * alphabetical character. */ static size_t -pm_encoding_ascii_alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { - return (pm_encoding_ascii_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT); +pm_encoding_ascii_alpha_char(const uint8_t *b, ptrdiff_t n) { + return (n > 0) ? (pm_encoding_ascii_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT) : 0; } /** @@ -3944,7 +4058,7 @@ pm_encoding_ascii_alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t */ static size_t pm_encoding_ascii_alpha_char_7bit(const uint8_t *b, ptrdiff_t n) { - return (*b < 0x80) ? pm_encoding_ascii_alpha_char(b, n) : 0; + return ((n > 0) && (*b < 0x80)) ? pm_encoding_ascii_alpha_char(b, n) : 0; } /** @@ -3952,8 +4066,8 @@ pm_encoding_ascii_alpha_char_7bit(const uint8_t *b, ptrdiff_t n) { * alphanumeric character. */ static size_t -pm_encoding_ascii_alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { - return (pm_encoding_ascii_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT) ? 1 : 0; +pm_encoding_ascii_alnum_char(const uint8_t *b, ptrdiff_t n) { + return ((n > 0) && (pm_encoding_ascii_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0; } /** @@ -3963,7 +4077,7 @@ pm_encoding_ascii_alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t */ static size_t pm_encoding_ascii_alnum_char_7bit(const uint8_t *b, ptrdiff_t n) { - return (*b < 0x80) ? pm_encoding_ascii_alnum_char(b, n) : 0; + return ((n > 0) && (*b < 0x80)) ? pm_encoding_ascii_alnum_char(b, n) : 0; } /** @@ -3971,27 +4085,137 @@ pm_encoding_ascii_alnum_char_7bit(const uint8_t *b, ptrdiff_t n) { * character. */ static bool -pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { - return (pm_encoding_ascii_table[*b] & PRISM_ENCODING_UPPERCASE_BIT); +pm_encoding_ascii_isupper_char(const uint8_t *b, ptrdiff_t n) { + return (n > 0) && (pm_encoding_ascii_table[*b] & PRISM_ENCODING_UPPERCASE_BIT); } /** - * Certain encodings are equivalent to ASCII below 0x80, so it works for our - * purposes to have a function here that first checks the bounds and then falls - * back to checking the ASCII lookup table. + * For a lot of encodings the default is that they are a single byte long no + * matter what the codepoint, so this function is shared between them. + */ +static size_t +pm_encoding_single_char_width(PRISM_UNUSED const uint8_t *b, PRISM_UNUSED ptrdiff_t n) { + return 1; +} + +/** + * Returns the size of the next character in the EUC-JP encoding, or 0 if a + * character cannot be decoded from the given bytes. + */ +static size_t +pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) { + // These are the single byte characters. + if ((n > 0) && (*b < 0x80)) { + return 1; + } + + // These are the double byte characters. + if ((n > 1) && ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) && (b[1] >= 0xA1 && b[1] <= 0xFE)) { + return 2; + } + + // These are the triple byte characters. + if ((n > 2) && (b[0] == 0x8F) && (b[1] >= 0xA1 && b[2] <= 0xFE) && (b[2] >= 0xA1 && b[2] <= 0xFE)) { + return 3; + } + + return 0; +} + +/** + * Returns the size of the next character in the EUC-JP encoding if it is an + * uppercase character. */ static bool -pm_encoding_ascii_isupper_char_7bit(const uint8_t *b, ptrdiff_t n) { - return (*b < 0x80) && pm_encoding_ascii_isupper_char(b, n); +pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) { + size_t width = pm_encoding_euc_jp_char_width(b, n); + + if (width == 1) { + return pm_encoding_ascii_isupper_char(b, n); + } else if (width == 2) { + return ( + (b[0] == 0xA3 && b[1] >= 0xC1 && b[1] <= 0xDA) || + (b[0] == 0xA6 && b[1] >= 0xA1 && b[1] <= 0xB8) || + (b[0] == 0xA7 && b[1] >= 0xA1 && b[1] <= 0xC1) + ); + } else { + return false; + } } /** - * For a lot of encodings the default is that they are a single byte long no - * matter what the codepoint, so this function is shared between them. + * Returns the size of the next character in the Shift_JIS encoding, or 0 if a + * character cannot be decoded from the given bytes. */ static size_t -pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { - return 1; +pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) { + if (n == 0) { + return 0; + } + // These are the single byte characters. + if (b[0] < 0x80 || (b[0] >= 0xA1 && b[0] <= 0xDF)) { + return 1; + } + + // These are the double byte characters. + if ((n > 1) && ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) && (b[1] >= 0x40 && b[1] <= 0xFC && b[1] != 0x7F)) { + return 2; + } + + return 0; +} + +/** + * Returns the size of the next character in the Shift_JIS encoding if it is an + * alphanumeric character. + */ +static size_t +pm_encoding_shift_jis_alnum_char(const uint8_t *b, ptrdiff_t n) { + size_t width = pm_encoding_shift_jis_char_width(b, n); + return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alnum_char(b, n)) : width; +} + +/** + * Returns the size of the next character in the Shift_JIS encoding if it is an + * alphabetical character. + */ +static size_t +pm_encoding_shift_jis_alpha_char(const uint8_t *b, ptrdiff_t n) { + size_t width = pm_encoding_shift_jis_char_width(b, n); + return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alpha_char(b, n)) : width; +} + +/** + * Returns the size of the next character in the Shift_JIS encoding if it is an + * uppercase character. + */ +static bool +pm_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) { + size_t width = pm_encoding_shift_jis_char_width(b, n); + + if (width == 1) { + return pm_encoding_ascii_isupper_char(b, n); + } else if (width == 2) { + return ( + ((b[0] == 0x82) && (b[1] >= 0x60 && b[1] <= 0x79)) || + ((b[0] == 0x83) && (b[1] >= 0x9F && b[1] <= 0xB6)) || + ((b[0] == 0x84) && (b[1] >= 0x40 && b[1] <= 0x60)) + ); + } else { + return width; + } +} + +#ifndef PRISM_ENCODING_EXCLUDE_FULL + +/** + * Certain encodings are equivalent to ASCII below 0x80, so it works for our + * purposes to have a function here that first checks the bounds and then falls + * back to checking the ASCII lookup table. + */ +static bool +pm_encoding_ascii_isupper_char_7bit(const uint8_t *b, ptrdiff_t n) { + return (n > 0) && (*b < 0x80) && pm_encoding_ascii_isupper_char(b, n); } /** @@ -4001,7 +4225,7 @@ pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATT static size_t pm_encoding_big5_char_width(const uint8_t *b, ptrdiff_t n) { // These are the single byte characters. - if (*b < 0x80) { + if ((n > 0) && (*b < 0x80)) { return 1; } @@ -4020,7 +4244,7 @@ pm_encoding_big5_char_width(const uint8_t *b, ptrdiff_t n) { static size_t pm_encoding_cp949_char_width(const uint8_t *b, ptrdiff_t n) { // These are the single byte characters - if (*b <= 0x80) { + if ((n > 0) && (*b <= 0x80)) { return 1; } @@ -4039,7 +4263,7 @@ pm_encoding_cp949_char_width(const uint8_t *b, ptrdiff_t n) { static size_t pm_encoding_emacs_mule_char_width(const uint8_t *b, ptrdiff_t n) { // These are the 1 byte characters. - if (*b < 0x80) { + if ((n > 0) && (*b < 0x80)) { return 1; } @@ -4076,58 +4300,13 @@ pm_encoding_emacs_mule_char_width(const uint8_t *b, ptrdiff_t n) { } /** - * Returns the size of the next character in the EUC-JP encoding, or 0 if a - * character cannot be decoded from the given bytes. - */ -static size_t -pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) { - // These are the single byte characters. - if (*b < 0x80) { - return 1; - } - - // These are the double byte characters. - if ((n > 1) && ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) && (b[1] >= 0xA1 && b[1] <= 0xFE)) { - return 2; - } - - // These are the triple byte characters. - if ((n > 2) && (b[0] == 0x8F) && (b[1] >= 0xA1 && b[2] <= 0xFE) && (b[2] >= 0xA1 && b[2] <= 0xFE)) { - return 3; - } - - return 0; -} - -/** - * Returns the size of the next character in the EUC-JP encoding if it is an - * uppercase character. - */ -static bool -pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) { - size_t width = pm_encoding_euc_jp_char_width(b, n); - - if (width == 1) { - return pm_encoding_ascii_isupper_char(b, n); - } else if (width == 2) { - return ( - (b[0] == 0xA3 && b[1] >= 0xC1 && b[1] <= 0xDA) || - (b[0] == 0xA6 && b[1] >= 0xA1 && b[1] <= 0xB8) || - (b[0] == 0xA7 && b[1] >= 0xA1 && b[1] <= 0xC1) - ); - } else { - return false; - } -} - -/** * Returns the size of the next character in the EUC-KR encoding, or 0 if a * character cannot be decoded from the given bytes. */ static size_t pm_encoding_euc_kr_char_width(const uint8_t *b, ptrdiff_t n) { // These are the single byte characters. - if (*b < 0x80) { + if ((n > 0) && (*b < 0x80)) { return 1; } @@ -4146,7 +4325,7 @@ pm_encoding_euc_kr_char_width(const uint8_t *b, ptrdiff_t n) { static size_t pm_encoding_euc_tw_char_width(const uint8_t *b, ptrdiff_t n) { // These are the single byte characters. - if (*b < 0x80) { + if ((n > 0) && (*b < 0x80)) { return 1; } @@ -4170,7 +4349,7 @@ pm_encoding_euc_tw_char_width(const uint8_t *b, ptrdiff_t n) { static size_t pm_encoding_gb18030_char_width(const uint8_t *b, ptrdiff_t n) { // These are the 1 byte characters. - if (*b < 0x80) { + if ((n > 0) && (*b < 0x80)) { return 1; } @@ -4194,7 +4373,7 @@ pm_encoding_gb18030_char_width(const uint8_t *b, ptrdiff_t n) { static size_t pm_encoding_gbk_char_width(const uint8_t *b, ptrdiff_t n) { // These are the single byte characters. - if (*b <= 0x80) { + if ((n > 0) && (*b <= 0x80)) { return 1; } @@ -4218,65 +4397,7 @@ pm_encoding_gbk_char_width(const uint8_t *b, ptrdiff_t n) { return 0; } -/** - * Returns the size of the next character in the Shift_JIS encoding, or 0 if a - * character cannot be decoded from the given bytes. - */ -static size_t -pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) { - // These are the single byte characters. - if (b[0] < 0x80 || (b[0] >= 0xA1 && b[0] <= 0xDF)) { - return 1; - } - - // These are the double byte characters. - if ((n > 1) && ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) && (b[1] >= 0x40 && b[1] <= 0xFC && b[1] != 0x7F)) { - return 2; - } - - return 0; -} - -/** - * Returns the size of the next character in the Shift_JIS encoding if it is an - * alphanumeric character. - */ -static size_t -pm_encoding_shift_jis_alnum_char(const uint8_t *b, ptrdiff_t n) { - size_t width = pm_encoding_shift_jis_char_width(b, n); - return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alnum_char(b, n)) : width; -} - -/** - * Returns the size of the next character in the Shift_JIS encoding if it is an - * alphabetical character. - */ -static size_t -pm_encoding_shift_jis_alpha_char(const uint8_t *b, ptrdiff_t n) { - size_t width = pm_encoding_shift_jis_char_width(b, n); - return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alpha_char(b, n)) : width; -} - -/** - * Returns the size of the next character in the Shift_JIS encoding if it is an - * uppercase character. - */ -static bool -pm_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) { - size_t width = pm_encoding_shift_jis_char_width(b, n); - - if (width == 1) { - return pm_encoding_ascii_isupper_char(b, n); - } else if (width == 2) { - return ( - ((b[0] == 0x82) && (b[1] >= 0x60 && b[1] <= 0x79)) || - ((b[0] == 0x83) && (b[1] >= 0x9F && b[1] <= 0xB6)) || - ((b[0] == 0x84) && (b[1] >= 0x40 && b[1] <= 0x60)) - ); - } else { - return width; - } -} +#endif /** * This is the table of all of the encodings that prism supports. @@ -4290,6 +4411,14 @@ const pm_encoding_t pm_encodings[] = { .isupper_char = pm_encoding_utf_8_isupper_char, .multibyte = true }, + [PM_ENCODING_US_ASCII] = { + .name = "US-ASCII", + .char_width = pm_encoding_ascii_char_width, + .alnum_char = pm_encoding_ascii_alnum_char, + .alpha_char = pm_encoding_ascii_alpha_char, + .isupper_char = pm_encoding_ascii_isupper_char, + .multibyte = false + }, [PM_ENCODING_ASCII_8BIT] = { .name = "ASCII-8BIT", .char_width = pm_encoding_single_char_width, @@ -4298,6 +4427,24 @@ const pm_encoding_t pm_encodings[] = { .isupper_char = pm_encoding_ascii_isupper_char, .multibyte = false }, + [PM_ENCODING_EUC_JP] = { + .name = "EUC-JP", + .char_width = pm_encoding_euc_jp_char_width, + .alnum_char = pm_encoding_ascii_alnum_char_7bit, + .alpha_char = pm_encoding_ascii_alpha_char_7bit, + .isupper_char = pm_encoding_euc_jp_isupper_char, + .multibyte = true + }, + [PM_ENCODING_WINDOWS_31J] = { + .name = "Windows-31J", + .char_width = pm_encoding_shift_jis_char_width, + .alnum_char = pm_encoding_shift_jis_alnum_char, + .alpha_char = pm_encoding_shift_jis_alpha_char, + .isupper_char = pm_encoding_shift_jis_isupper_char, + .multibyte = true + }, + +#ifndef PRISM_ENCODING_EXCLUDE_FULL [PM_ENCODING_BIG5] = { .name = "Big5", .char_width = pm_encoding_big5_char_width, @@ -4394,14 +4541,6 @@ const pm_encoding_t pm_encodings[] = { .isupper_char = pm_encoding_ascii_isupper_char_7bit, .multibyte = true }, - [PM_ENCODING_EUC_JP] = { - .name = "EUC-JP", - .char_width = pm_encoding_euc_jp_char_width, - .alnum_char = pm_encoding_ascii_alnum_char_7bit, - .alpha_char = pm_encoding_ascii_alpha_char_7bit, - .isupper_char = pm_encoding_euc_jp_isupper_char, - .multibyte = true - }, [PM_ENCODING_EUC_JP_MS] = { .name = "eucJP-ms", .char_width = pm_encoding_euc_jp_char_width, @@ -4874,14 +5013,6 @@ const pm_encoding_t pm_encodings[] = { .isupper_char = pm_encoding_tis_620_isupper_char, .multibyte = false }, - [PM_ENCODING_US_ASCII] = { - .name = "US-ASCII", - .char_width = pm_encoding_ascii_char_width, - .alnum_char = pm_encoding_ascii_alnum_char, - .alpha_char = pm_encoding_ascii_alpha_char, - .isupper_char = pm_encoding_ascii_isupper_char, - .multibyte = false - }, [PM_ENCODING_UTF8_MAC] = { .name = "UTF8-MAC", .char_width = pm_encoding_utf_8_char_width, @@ -4986,14 +5117,6 @@ const pm_encoding_t pm_encodings[] = { .isupper_char = pm_encoding_windows_1258_isupper_char, .multibyte = false }, - [PM_ENCODING_WINDOWS_31J] = { - .name = "Windows-31J", - .char_width = pm_encoding_shift_jis_char_width, - .alnum_char = pm_encoding_shift_jis_alnum_char, - .alpha_char = pm_encoding_shift_jis_alpha_char, - .isupper_char = pm_encoding_shift_jis_isupper_char, - .multibyte = true - }, [PM_ENCODING_WINDOWS_874] = { .name = "Windows-874", .char_width = pm_encoding_single_char_width, @@ -5002,6 +5125,7 @@ const pm_encoding_t pm_encodings[] = { .isupper_char = pm_encoding_windows_874_isupper_char, .multibyte = false } +#endif }; /** @@ -5016,11 +5140,13 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) { // UTF-8 can contain extra information at the end about the platform it is // encoded on, such as UTF-8-MAC or UTF-8-UNIX. We'll ignore those suffixes. if ((start + 5 <= end) && (pm_strncasecmp(start, (const uint8_t *) "UTF-8", 5) == 0)) { +#ifndef PRISM_ENCODING_EXCLUDE_FULL // We need to explicitly handle UTF-8-HFS, as that one needs to switch // over to being UTF8-MAC. if (width == 9 && (pm_strncasecmp(start + 5, (const uint8_t *) "-HFS", 4) == 0)) { return &pm_encodings[PM_ENCODING_UTF8_MAC]; } +#endif // Otherwise we'll return the default UTF-8 encoding. return PM_ENCODING_UTF_8_ENTRY; @@ -5040,11 +5166,16 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) { break; case 'B': case 'b': ENCODING1("BINARY", PM_ENCODING_ASCII_8BIT); +#ifndef PRISM_ENCODING_EXCLUDE_FULL ENCODING1("Big5", PM_ENCODING_BIG5); ENCODING2("Big5-HKSCS", "Big5-HKSCS:2008", PM_ENCODING_BIG5_HKSCS); ENCODING1("Big5-UAO", PM_ENCODING_BIG5_UAO); +#endif break; case 'C': case 'c': + ENCODING1("CP65001", PM_ENCODING_UTF_8); + ENCODING2("CP932", "csWindows31J", PM_ENCODING_WINDOWS_31J); +#ifndef PRISM_ENCODING_EXCLUDE_FULL ENCODING1("CESU-8", PM_ENCODING_CESU_8); ENCODING1("CP437", PM_ENCODING_IBM437); ENCODING1("CP720", PM_ENCODING_IBM720); @@ -5064,7 +5195,6 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) { ENCODING1("CP874", PM_ENCODING_WINDOWS_874); ENCODING1("CP878", PM_ENCODING_KOI8_R); ENCODING1("CP863", PM_ENCODING_IBM863); - ENCODING2("CP932", "csWindows31J", PM_ENCODING_WINDOWS_31J); ENCODING1("CP936", PM_ENCODING_GBK); ENCODING1("CP949", PM_ENCODING_CP949); ENCODING1("CP950", PM_ENCODING_CP950); @@ -5079,25 +5209,30 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) { ENCODING1("CP1257", PM_ENCODING_WINDOWS_1257); ENCODING1("CP1258", PM_ENCODING_WINDOWS_1258); ENCODING1("CP51932", PM_ENCODING_CP51932); - ENCODING1("CP65001", PM_ENCODING_UTF_8); +#endif break; case 'E': case 'e': ENCODING2("EUC-JP", "eucJP", PM_ENCODING_EUC_JP); +#ifndef PRISM_ENCODING_EXCLUDE_FULL ENCODING2("eucJP-ms", "euc-jp-ms", PM_ENCODING_EUC_JP_MS); ENCODING2("EUC-JIS-2004", "EUC-JISX0213", PM_ENCODING_EUC_JIS_2004); ENCODING2("EUC-KR", "eucKR", PM_ENCODING_EUC_KR); ENCODING2("EUC-CN", "eucCN", PM_ENCODING_GB2312); ENCODING2("EUC-TW", "eucTW", PM_ENCODING_EUC_TW); ENCODING1("Emacs-Mule", PM_ENCODING_EMACS_MULE); +#endif break; case 'G': case 'g': +#ifndef PRISM_ENCODING_EXCLUDE_FULL ENCODING1("GBK", PM_ENCODING_GBK); ENCODING1("GB12345", PM_ENCODING_GB12345); ENCODING1("GB18030", PM_ENCODING_GB18030); ENCODING1("GB1988", PM_ENCODING_GB1988); ENCODING1("GB2312", PM_ENCODING_GB2312); +#endif break; case 'I': case 'i': +#ifndef PRISM_ENCODING_EXCLUDE_FULL ENCODING1("IBM437", PM_ENCODING_IBM437); ENCODING1("IBM720", PM_ENCODING_IBM720); ENCODING1("IBM737", PM_ENCODING_IBM737); @@ -5129,12 +5264,16 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) { ENCODING2("ISO-8859-14", "ISO8859-14", PM_ENCODING_ISO_8859_14); ENCODING2("ISO-8859-15", "ISO8859-15", PM_ENCODING_ISO_8859_15); ENCODING2("ISO-8859-16", "ISO8859-16", PM_ENCODING_ISO_8859_16); +#endif break; case 'K': case 'k': +#ifndef PRISM_ENCODING_EXCLUDE_FULL ENCODING1("KOI8-R", PM_ENCODING_KOI8_R); ENCODING1("KOI8-U", PM_ENCODING_KOI8_U); +#endif break; case 'M': case 'm': +#ifndef PRISM_ENCODING_EXCLUDE_FULL ENCODING1("macCentEuro", PM_ENCODING_MAC_CENT_EURO); ENCODING1("macCroatian", PM_ENCODING_MAC_CROATIAN); ENCODING1("macCyrillic", PM_ENCODING_MAC_CYRILLIC); @@ -5147,31 +5286,39 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) { ENCODING1("macThai", PM_ENCODING_MAC_THAI); ENCODING1("macTurkish", PM_ENCODING_MAC_TURKISH); ENCODING1("macUkraine", PM_ENCODING_MAC_UKRAINE); +#endif break; case 'P': case 'p': ENCODING1("PCK", PM_ENCODING_WINDOWS_31J); break; case 'S': case 's': - ENCODING1("Shift_JIS", PM_ENCODING_SHIFT_JIS); ENCODING1("SJIS", PM_ENCODING_WINDOWS_31J); +#ifndef PRISM_ENCODING_EXCLUDE_FULL + ENCODING1("Shift_JIS", PM_ENCODING_SHIFT_JIS); ENCODING1("SJIS-DoCoMo", PM_ENCODING_SJIS_DOCOMO); ENCODING1("SJIS-KDDI", PM_ENCODING_SJIS_KDDI); ENCODING1("SJIS-SoftBank", PM_ENCODING_SJIS_SOFTBANK); ENCODING1("stateless-ISO-2022-JP", PM_ENCODING_STATELESS_ISO_2022_JP); ENCODING1("stateless-ISO-2022-JP-KDDI", PM_ENCODING_STATELESS_ISO_2022_JP_KDDI); +#endif break; case 'T': case 't': +#ifndef PRISM_ENCODING_EXCLUDE_FULL ENCODING1("TIS-620", PM_ENCODING_TIS_620); +#endif break; case 'U': case 'u': ENCODING1("US-ASCII", PM_ENCODING_US_ASCII); +#ifndef PRISM_ENCODING_EXCLUDE_FULL ENCODING2("UTF8-MAC", "UTF-8-HFS", PM_ENCODING_UTF8_MAC); ENCODING1("UTF8-DoCoMo", PM_ENCODING_UTF8_DOCOMO); ENCODING1("UTF8-KDDI", PM_ENCODING_UTF8_KDDI); ENCODING1("UTF8-SoftBank", PM_ENCODING_UTF8_SOFTBANK); +#endif break; case 'W': case 'w': ENCODING1("Windows-31J", PM_ENCODING_WINDOWS_31J); +#ifndef PRISM_ENCODING_EXCLUDE_FULL ENCODING1("Windows-874", PM_ENCODING_WINDOWS_874); ENCODING1("Windows-1250", PM_ENCODING_WINDOWS_1250); ENCODING1("Windows-1251", PM_ENCODING_WINDOWS_1251); @@ -5182,6 +5329,7 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) { ENCODING1("Windows-1256", PM_ENCODING_WINDOWS_1256); ENCODING1("Windows-1257", PM_ENCODING_WINDOWS_1257); ENCODING1("Windows-1258", PM_ENCODING_WINDOWS_1258); +#endif break; case '6': ENCODING1("646", PM_ENCODING_US_ASCII); |
