summaryrefslogtreecommitdiff
path: root/prism/encoding.c
diff options
context:
space:
mode:
Diffstat (limited to 'prism/encoding.c')
-rw-r--r--prism/encoding.c592
1 files changed, 401 insertions, 191 deletions
diff --git a/prism/encoding.c b/prism/encoding.c
index 2210d71411..d7e5616840 100644
--- a/prism/encoding.c
+++ b/prism/encoding.c
@@ -2,7 +2,7 @@
typedef uint32_t pm_unicode_codepoint_t;
-#define UNICODE_ALPHA_CODEPOINTS_LENGTH 1450
+#define UNICODE_ALPHA_CODEPOINTS_LENGTH 1508
static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEPOINTS_LENGTH] = {
0x100, 0x2C1,
0x2C6, 0x2D1,
@@ -10,7 +10,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
0x2EC, 0x2EC,
0x2EE, 0x2EE,
0x345, 0x345,
- 0x370, 0x374,
+ 0x363, 0x374,
0x376, 0x377,
0x37A, 0x37D,
0x37F, 0x37F,
@@ -50,7 +50,8 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
0x840, 0x858,
0x860, 0x86A,
0x870, 0x887,
- 0x889, 0x88E,
+ 0x889, 0x88F,
+ 0x897, 0x897,
0x8A0, 0x8C9,
0x8D4, 0x8DF,
0x8E3, 0x8E9,
@@ -140,7 +141,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
0xC4A, 0xC4C,
0xC55, 0xC56,
0xC58, 0xC5A,
- 0xC5D, 0xC5D,
+ 0xC5C, 0xC5D,
0xC60, 0xC63,
0xC80, 0xC83,
0xC85, 0xC8C,
@@ -152,7 +153,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
0xCC6, 0xCC8,
0xCCA, 0xCCC,
0xCD5, 0xCD6,
- 0xCDD, 0xCDE,
+ 0xCDC, 0xCDE,
0xCE0, 0xCE3,
0xCF1, 0xCF3,
0xD00, 0xD0C,
@@ -264,7 +265,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
0x1C00, 0x1C36,
0x1C4D, 0x1C4F,
0x1C5A, 0x1C7D,
- 0x1C80, 0x1C88,
+ 0x1C80, 0x1C8A,
0x1C90, 0x1CBA,
0x1CBD, 0x1CBF,
0x1CE9, 0x1CEC,
@@ -272,7 +273,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
0x1CF5, 0x1CF6,
0x1CFA, 0x1CFA,
0x1D00, 0x1DBF,
- 0x1DE7, 0x1DF4,
+ 0x1DD3, 0x1DF4,
0x1E00, 0x1F15,
0x1F18, 0x1F1D,
0x1F20, 0x1F45,
@@ -352,11 +353,8 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
0xA67F, 0xA6EF,
0xA717, 0xA71F,
0xA722, 0xA788,
- 0xA78B, 0xA7CA,
- 0xA7D0, 0xA7D1,
- 0xA7D3, 0xA7D3,
- 0xA7D5, 0xA7D9,
- 0xA7F2, 0xA805,
+ 0xA78B, 0xA7DC,
+ 0xA7F1, 0xA805,
0xA807, 0xA827,
0xA840, 0xA873,
0xA880, 0xA8C3,
@@ -446,6 +444,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
0x105A3, 0x105B1,
0x105B3, 0x105B9,
0x105BB, 0x105BC,
+ 0x105C0, 0x105F3,
0x10600, 0x10736,
0x10740, 0x10755,
0x10760, 0x10767,
@@ -464,6 +463,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
0x108F4, 0x108F5,
0x10900, 0x10915,
0x10920, 0x10939,
+ 0x10940, 0x10959,
0x10980, 0x109B7,
0x109BE, 0x109BF,
0x10A00, 0x10A03,
@@ -483,9 +483,14 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
0x10C80, 0x10CB2,
0x10CC0, 0x10CF2,
0x10D00, 0x10D27,
+ 0x10D4A, 0x10D65,
+ 0x10D69, 0x10D69,
+ 0x10D6F, 0x10D85,
0x10E80, 0x10EA9,
0x10EAB, 0x10EAC,
0x10EB0, 0x10EB1,
+ 0x10EC2, 0x10EC7,
+ 0x10EFA, 0x10EFC,
0x10F00, 0x10F1C,
0x10F27, 0x10F27,
0x10F30, 0x10F45,
@@ -529,6 +534,17 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
0x11350, 0x11350,
0x11357, 0x11357,
0x1135D, 0x11363,
+ 0x11380, 0x11389,
+ 0x1138B, 0x1138B,
+ 0x1138E, 0x1138E,
+ 0x11390, 0x113B5,
+ 0x113B7, 0x113C0,
+ 0x113C2, 0x113C2,
+ 0x113C5, 0x113C5,
+ 0x113C7, 0x113CA,
+ 0x113CC, 0x113CD,
+ 0x113D1, 0x113D1,
+ 0x113D3, 0x113D3,
0x11400, 0x11441,
0x11443, 0x11445,
0x11447, 0x1144A,
@@ -567,6 +583,8 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
0x11A50, 0x11A97,
0x11A9D, 0x11A9D,
0x11AB0, 0x11AF8,
+ 0x11B60, 0x11B67,
+ 0x11BC0, 0x11BE0,
0x11C00, 0x11C08,
0x11C0A, 0x11C36,
0x11C38, 0x11C3E,
@@ -588,6 +606,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
0x11D90, 0x11D91,
0x11D93, 0x11D96,
0x11D98, 0x11D98,
+ 0x11DB0, 0x11DDB,
0x11EE0, 0x11EF6,
0x11F00, 0x11F10,
0x11F12, 0x11F3A,
@@ -599,7 +618,9 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
0x12F90, 0x12FF0,
0x13000, 0x1342F,
0x13441, 0x13446,
+ 0x13460, 0x143FA,
0x14400, 0x14646,
+ 0x16100, 0x1612E,
0x16800, 0x16A38,
0x16A40, 0x16A5E,
0x16A70, 0x16ABE,
@@ -608,16 +629,19 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
0x16B40, 0x16B43,
0x16B63, 0x16B77,
0x16B7D, 0x16B8F,
+ 0x16D40, 0x16D6C,
0x16E40, 0x16E7F,
+ 0x16EA0, 0x16EB8,
+ 0x16EBB, 0x16ED3,
0x16F00, 0x16F4A,
0x16F4F, 0x16F87,
0x16F8F, 0x16F9F,
0x16FE0, 0x16FE1,
0x16FE3, 0x16FE3,
- 0x16FF0, 0x16FF1,
- 0x17000, 0x187F7,
- 0x18800, 0x18CD5,
- 0x18D00, 0x18D08,
+ 0x16FF0, 0x16FF6,
+ 0x17000, 0x18CD5,
+ 0x18CFF, 0x18D1E,
+ 0x18D80, 0x18DF2,
0x1AFF0, 0x1AFF3,
0x1AFF5, 0x1AFFB,
0x1AFFD, 0x1AFFE,
@@ -677,6 +701,11 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
0x1E290, 0x1E2AD,
0x1E2C0, 0x1E2EB,
0x1E4D0, 0x1E4EB,
+ 0x1E5D0, 0x1E5ED,
+ 0x1E5F0, 0x1E5F0,
+ 0x1E6C0, 0x1E6DE,
+ 0x1E6E0, 0x1E6F5,
+ 0x1E6FE, 0x1E6FF,
0x1E7E0, 0x1E7E6,
0x1E7E8, 0x1E7EB,
0x1E7ED, 0x1E7EE,
@@ -722,16 +751,16 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
0x1F150, 0x1F169,
0x1F170, 0x1F189,
0x20000, 0x2A6DF,
- 0x2A700, 0x2B739,
- 0x2B740, 0x2B81D,
- 0x2B820, 0x2CEA1,
+ 0x2A700, 0x2B81D,
+ 0x2B820, 0x2CEAD,
0x2CEB0, 0x2EBE0,
+ 0x2EBF0, 0x2EE5D,
0x2F800, 0x2FA1D,
0x30000, 0x3134A,
- 0x31350, 0x323AF,
+ 0x31350, 0x33479,
};
-#define UNICODE_ALNUM_CODEPOINTS_LENGTH 1528
+#define UNICODE_ALNUM_CODEPOINTS_LENGTH 1598
static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEPOINTS_LENGTH] = {
0x100, 0x2C1,
0x2C6, 0x2D1,
@@ -739,7 +768,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
0x2EC, 0x2EC,
0x2EE, 0x2EE,
0x345, 0x345,
- 0x370, 0x374,
+ 0x363, 0x374,
0x376, 0x377,
0x37A, 0x37D,
0x37F, 0x37F,
@@ -778,7 +807,8 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
0x840, 0x858,
0x860, 0x86A,
0x870, 0x887,
- 0x889, 0x88E,
+ 0x889, 0x88F,
+ 0x897, 0x897,
0x8A0, 0x8C9,
0x8D4, 0x8DF,
0x8E3, 0x8E9,
@@ -872,7 +902,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
0xC4A, 0xC4C,
0xC55, 0xC56,
0xC58, 0xC5A,
- 0xC5D, 0xC5D,
+ 0xC5C, 0xC5D,
0xC60, 0xC63,
0xC66, 0xC6F,
0xC80, 0xC83,
@@ -885,7 +915,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
0xCC6, 0xCC8,
0xCCA, 0xCCC,
0xCD5, 0xCD6,
- 0xCDD, 0xCDE,
+ 0xCDC, 0xCDE,
0xCE0, 0xCE3,
0xCE6, 0xCEF,
0xCF1, 0xCF3,
@@ -1007,7 +1037,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
0x1C00, 0x1C36,
0x1C40, 0x1C49,
0x1C4D, 0x1C7D,
- 0x1C80, 0x1C88,
+ 0x1C80, 0x1C8A,
0x1C90, 0x1CBA,
0x1CBD, 0x1CBF,
0x1CE9, 0x1CEC,
@@ -1015,7 +1045,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
0x1CF5, 0x1CF6,
0x1CFA, 0x1CFA,
0x1D00, 0x1DBF,
- 0x1DE7, 0x1DF4,
+ 0x1DD3, 0x1DF4,
0x1E00, 0x1F15,
0x1F18, 0x1F1D,
0x1F20, 0x1F45,
@@ -1094,11 +1124,8 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
0xA67F, 0xA6EF,
0xA717, 0xA71F,
0xA722, 0xA788,
- 0xA78B, 0xA7CA,
- 0xA7D0, 0xA7D1,
- 0xA7D3, 0xA7D3,
- 0xA7D5, 0xA7D9,
- 0xA7F2, 0xA805,
+ 0xA78B, 0xA7DC,
+ 0xA7F1, 0xA805,
0xA807, 0xA827,
0xA840, 0xA873,
0xA880, 0xA8C3,
@@ -1191,6 +1218,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
0x105A3, 0x105B1,
0x105B3, 0x105B9,
0x105BB, 0x105BC,
+ 0x105C0, 0x105F3,
0x10600, 0x10736,
0x10740, 0x10755,
0x10760, 0x10767,
@@ -1209,6 +1237,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
0x108F4, 0x108F5,
0x10900, 0x10915,
0x10920, 0x10939,
+ 0x10940, 0x10959,
0x10980, 0x109B7,
0x109BE, 0x109BF,
0x10A00, 0x10A03,
@@ -1229,9 +1258,14 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
0x10CC0, 0x10CF2,
0x10D00, 0x10D27,
0x10D30, 0x10D39,
+ 0x10D40, 0x10D65,
+ 0x10D69, 0x10D69,
+ 0x10D6F, 0x10D85,
0x10E80, 0x10EA9,
0x10EAB, 0x10EAC,
0x10EB0, 0x10EB1,
+ 0x10EC2, 0x10EC7,
+ 0x10EFA, 0x10EFC,
0x10F00, 0x10F1C,
0x10F27, 0x10F27,
0x10F30, 0x10F45,
@@ -1278,6 +1312,17 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
0x11350, 0x11350,
0x11357, 0x11357,
0x1135D, 0x11363,
+ 0x11380, 0x11389,
+ 0x1138B, 0x1138B,
+ 0x1138E, 0x1138E,
+ 0x11390, 0x113B5,
+ 0x113B7, 0x113C0,
+ 0x113C2, 0x113C2,
+ 0x113C5, 0x113C5,
+ 0x113C7, 0x113CA,
+ 0x113CC, 0x113CD,
+ 0x113D1, 0x113D1,
+ 0x113D3, 0x113D3,
0x11400, 0x11441,
0x11443, 0x11445,
0x11447, 0x1144A,
@@ -1297,6 +1342,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
0x11680, 0x116B5,
0x116B8, 0x116B8,
0x116C0, 0x116C9,
+ 0x116D0, 0x116E3,
0x11700, 0x1171A,
0x1171D, 0x1172A,
0x11730, 0x11739,
@@ -1322,6 +1368,9 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
0x11A50, 0x11A97,
0x11A9D, 0x11A9D,
0x11AB0, 0x11AF8,
+ 0x11B60, 0x11B67,
+ 0x11BC0, 0x11BE0,
+ 0x11BF0, 0x11BF9,
0x11C00, 0x11C08,
0x11C0A, 0x11C36,
0x11C38, 0x11C3E,
@@ -1346,6 +1395,8 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
0x11D93, 0x11D96,
0x11D98, 0x11D98,
0x11DA0, 0x11DA9,
+ 0x11DB0, 0x11DDB,
+ 0x11DE0, 0x11DE9,
0x11EE0, 0x11EF6,
0x11F00, 0x11F10,
0x11F12, 0x11F3A,
@@ -1358,7 +1409,10 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
0x12F90, 0x12FF0,
0x13000, 0x1342F,
0x13441, 0x13446,
+ 0x13460, 0x143FA,
0x14400, 0x14646,
+ 0x16100, 0x1612E,
+ 0x16130, 0x16139,
0x16800, 0x16A38,
0x16A40, 0x16A5E,
0x16A60, 0x16A69,
@@ -1370,16 +1424,20 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
0x16B50, 0x16B59,
0x16B63, 0x16B77,
0x16B7D, 0x16B8F,
+ 0x16D40, 0x16D6C,
+ 0x16D70, 0x16D79,
0x16E40, 0x16E7F,
+ 0x16EA0, 0x16EB8,
+ 0x16EBB, 0x16ED3,
0x16F00, 0x16F4A,
0x16F4F, 0x16F87,
0x16F8F, 0x16F9F,
0x16FE0, 0x16FE1,
0x16FE3, 0x16FE3,
- 0x16FF0, 0x16FF1,
- 0x17000, 0x187F7,
- 0x18800, 0x18CD5,
- 0x18D00, 0x18D08,
+ 0x16FF0, 0x16FF6,
+ 0x17000, 0x18CD5,
+ 0x18CFF, 0x18D1E,
+ 0x18D80, 0x18DF2,
0x1AFF0, 0x1AFF3,
0x1AFF5, 0x1AFFB,
0x1AFFD, 0x1AFFE,
@@ -1394,6 +1452,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
0x1BC80, 0x1BC88,
0x1BC90, 0x1BC99,
0x1BC9E, 0x1BC9E,
+ 0x1CCF0, 0x1CCF9,
0x1D400, 0x1D454,
0x1D456, 0x1D49C,
0x1D49E, 0x1D49F,
@@ -1443,6 +1502,11 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
0x1E2F0, 0x1E2F9,
0x1E4D0, 0x1E4EB,
0x1E4F0, 0x1E4F9,
+ 0x1E5D0, 0x1E5ED,
+ 0x1E5F0, 0x1E5FA,
+ 0x1E6C0, 0x1E6DE,
+ 0x1E6E0, 0x1E6F5,
+ 0x1E6FE, 0x1E6FF,
0x1E7E0, 0x1E7E6,
0x1E7E8, 0x1E7EB,
0x1E7ED, 0x1E7EE,
@@ -1490,16 +1554,16 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
0x1F170, 0x1F189,
0x1FBF0, 0x1FBF9,
0x20000, 0x2A6DF,
- 0x2A700, 0x2B739,
- 0x2B740, 0x2B81D,
- 0x2B820, 0x2CEA1,
+ 0x2A700, 0x2B81D,
+ 0x2B820, 0x2CEAD,
0x2CEB0, 0x2EBE0,
+ 0x2EBF0, 0x2EE5D,
0x2F800, 0x2FA1D,
0x30000, 0x3134A,
- 0x31350, 0x323AF,
+ 0x31350, 0x33479,
};
-#define UNICODE_ISUPPER_CODEPOINTS_LENGTH 1296
+#define UNICODE_ISUPPER_CODEPOINTS_LENGTH 1320
static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_CODEPOINTS_LENGTH] = {
0x100, 0x100,
0x102, 0x102,
@@ -1582,9 +1646,9 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
0x1B5, 0x1B5,
0x1B7, 0x1B8,
0x1BC, 0x1BC,
- 0x1C4, 0x1C4,
- 0x1C7, 0x1C7,
- 0x1CA, 0x1CA,
+ 0x1C4, 0x1C5,
+ 0x1C7, 0x1C8,
+ 0x1CA, 0x1CB,
0x1CD, 0x1CD,
0x1CF, 0x1CF,
0x1D1, 0x1D1,
@@ -1602,7 +1666,7 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
0x1EA, 0x1EA,
0x1EC, 0x1EC,
0x1EE, 0x1EE,
- 0x1F1, 0x1F1,
+ 0x1F1, 0x1F2,
0x1F4, 0x1F4,
0x1F6, 0x1F8,
0x1FA, 0x1FA,
@@ -1774,6 +1838,7 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
0x10C7, 0x10C7,
0x10CD, 0x10CD,
0x13A0, 0x13F5,
+ 0x1C89, 0x1C89,
0x1C90, 0x1CBA,
0x1CBD, 0x1CBF,
0x1E00, 0x1E00,
@@ -1910,11 +1975,14 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
0x1F5D, 0x1F5D,
0x1F5F, 0x1F5F,
0x1F68, 0x1F6F,
- 0x1FB8, 0x1FBB,
- 0x1FC8, 0x1FCB,
+ 0x1F88, 0x1F8F,
+ 0x1F98, 0x1F9F,
+ 0x1FA8, 0x1FAF,
+ 0x1FB8, 0x1FBC,
+ 0x1FC8, 0x1FCC,
0x1FD8, 0x1FDB,
0x1FE8, 0x1FEC,
- 0x1FF8, 0x1FFB,
+ 0x1FF8, 0x1FFC,
0x2102, 0x2102,
0x2107, 0x2107,
0x210B, 0x210D,
@@ -2100,9 +2168,15 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
0xA7C2, 0xA7C2,
0xA7C4, 0xA7C7,
0xA7C9, 0xA7C9,
+ 0xA7CB, 0xA7CC,
+ 0xA7CE, 0xA7CE,
0xA7D0, 0xA7D0,
+ 0xA7D2, 0xA7D2,
+ 0xA7D4, 0xA7D4,
0xA7D6, 0xA7D6,
0xA7D8, 0xA7D8,
+ 0xA7DA, 0xA7DA,
+ 0xA7DC, 0xA7DC,
0xA7F5, 0xA7F5,
0xFF21, 0xFF3A,
0x10400, 0x10427,
@@ -2112,8 +2186,10 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
0x1058C, 0x10592,
0x10594, 0x10595,
0x10C80, 0x10CB2,
+ 0x10D50, 0x10D65,
0x118A0, 0x118BF,
0x16E40, 0x16E5F,
+ 0x16EA0, 0x16EB8,
0x1D400, 0x1D419,
0x1D434, 0x1D44D,
0x1D468, 0x1D481,
@@ -2252,13 +2328,13 @@ static const uint8_t pm_utf_8_dfa[] = {
*/
static pm_unicode_codepoint_t
pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
- assert(n >= 1);
- size_t maximum = (size_t) n;
+ assert(n >= 0);
+ size_t maximum = (n > 4) ? 4 : ((size_t) n);
uint32_t codepoint;
uint32_t state = 0;
- for (size_t index = 0; index < 4 && index < maximum; index++) {
+ for (size_t index = 0; index < maximum; index++) {
uint32_t byte = b[index];
uint32_t type = pm_utf_8_dfa[byte];
@@ -2267,7 +2343,7 @@ pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
(0xffu >> type) & (byte);
state = pm_utf_8_dfa[256 + (state * 16) + type];
- if (!state) {
+ if (state == 0) {
*width = index + 1;
return (pm_unicode_codepoint_t) codepoint;
}
@@ -2277,11 +2353,22 @@ pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
return 0;
}
-static size_t
+/**
+ * Return the size of the next character in the UTF-8 encoding.
+ */
+size_t
pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n) {
- size_t width;
- pm_utf_8_codepoint(b, n, &width);
- return width;
+ assert(n >= 0);
+
+ size_t maximum = (n > 4) ? 4 : ((size_t) n);
+ uint32_t state = 0;
+
+ for (size_t index = 0; index < maximum; index++) {
+ state = pm_utf_8_dfa[256 + (state * 16) + pm_utf_8_dfa[b[index]]];
+ if (state == 0) return index + 1;
+ }
+
+ return 0;
}
/**
@@ -2290,6 +2377,10 @@ pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n) {
*/
size_t
pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n) {
+ if (n == 0) {
+ return 0;
+ }
+
if (*b < 0x80) {
return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT) ? 1 : 0;
}
@@ -2310,6 +2401,10 @@ pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n) {
*/
size_t
pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n) {
+ if (n == 0) {
+ return 0;
+ }
+
if (*b < 0x80) {
return (pm_encoding_unicode_table[*b] & (PRISM_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0;
}
@@ -2330,6 +2425,10 @@ pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n) {
*/
bool
pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
+ if (n == 0) {
+ return 0;
+ }
+
if (*b < 0x80) {
return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_UPPERCASE_BIT) ? true : false;
}
@@ -2344,9 +2443,12 @@ pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
}
}
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
+
static pm_unicode_codepoint_t
pm_cesu_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
- if (b[0] < 0x80) {
+
+ if ((n > 0) && (b[0] < 0x80)) {
*width = 1;
return (pm_unicode_codepoint_t) b[0];
}
@@ -2385,6 +2487,10 @@ pm_cesu_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
static size_t
pm_encoding_cesu_8_char_width(const uint8_t *b, ptrdiff_t n) {
+ if (n == 0) {
+ return 0;
+ }
+
size_t width;
pm_cesu_8_codepoint(b, n, &width);
return width;
@@ -2392,6 +2498,10 @@ pm_encoding_cesu_8_char_width(const uint8_t *b, ptrdiff_t n) {
static size_t
pm_encoding_cesu_8_alpha_char(const uint8_t *b, ptrdiff_t n) {
+ if (n == 0) {
+ return 0;
+ }
+
if (*b < 0x80) {
return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT) ? 1 : 0;
}
@@ -2408,6 +2518,10 @@ pm_encoding_cesu_8_alpha_char(const uint8_t *b, ptrdiff_t n) {
static size_t
pm_encoding_cesu_8_alnum_char(const uint8_t *b, ptrdiff_t n) {
+ if (n == 0) {
+ return 0;
+ }
+
if (*b < 0x80) {
return (pm_encoding_unicode_table[*b] & (PRISM_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0;
}
@@ -2424,6 +2538,10 @@ pm_encoding_cesu_8_alnum_char(const uint8_t *b, ptrdiff_t n) {
static bool
pm_encoding_cesu_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
+ if (n == 0) {
+ return 0;
+ }
+
if (*b < 0x80) {
return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_UPPERCASE_BIT) ? true : false;
}
@@ -2438,13 +2556,15 @@ pm_encoding_cesu_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
}
}
+#endif
+
#undef UNICODE_ALPHA_CODEPOINTS_LENGTH
#undef UNICODE_ALNUM_CODEPOINTS_LENGTH
#undef UNICODE_ISUPPER_CODEPOINTS_LENGTH
/**
* Each element of the following table contains a bitfield that indicates a
- * piece of information about the corresponding ASCII character.
+ * piece of information about the corresponding US-ASCII character.
*/
static const uint8_t pm_encoding_ascii_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
@@ -2466,6 +2586,8 @@ static const uint8_t pm_encoding_ascii_table[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
};
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
+
/**
* Each element of the following table contains a bitfield that indicates a
* piece of information about the corresponding CP850 character.
@@ -3613,7 +3735,7 @@ static const uint8_t pm_encoding_windows_1250_table[256] = {
0, 0, 0, 7, 0, 7, 0, 0, 0, 0, 7, 0, 0, 0, 0, 7, // Ax
0, 0, 0, 3, 0, 3, 0, 0, 0, 3, 3, 0, 7, 0, 3, 3, // Bx
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx
- 7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
};
@@ -3661,7 +3783,7 @@ static const uint8_t pm_encoding_windows_1252_table[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, // Ax
0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, // Bx
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx
- 7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
};
@@ -3835,14 +3957,14 @@ static const uint8_t pm_encoding_windows_874_table[256] = {
};
#define PRISM_ENCODING_TABLE(name) \
- static size_t pm_encoding_ ##name ## _alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \
- return (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_ALPHABETIC_BIT); \
+ static size_t pm_encoding_ ##name ## _alpha_char(const uint8_t *b, ptrdiff_t n) { \
+ return ((n > 0) && (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_ALPHABETIC_BIT)); \
} \
- static size_t pm_encoding_ ##name ## _alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \
- return (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT) ? 1 : 0; \
+ static size_t pm_encoding_ ##name ## _alnum_char(const uint8_t *b, ptrdiff_t n) { \
+ return ((n > 0) && (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0; \
} \
- static bool pm_encoding_ ##name ## _isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \
- return (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_UPPERCASE_BIT); \
+ static bool pm_encoding_ ##name ## _isupper_char(const uint8_t *b, ptrdiff_t n) { \
+ return ((n > 0) && (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_UPPERCASE_BIT)); \
}
PRISM_ENCODING_TABLE(cp850)
@@ -3904,14 +4026,15 @@ PRISM_ENCODING_TABLE(windows_1258)
PRISM_ENCODING_TABLE(windows_874)
#undef PRISM_ENCODING_TABLE
+#endif
/**
* Returns the size of the next character in the ASCII encoding. This basically
* means that if the top bit is not set, the character is 1 byte long.
*/
static size_t
-pm_encoding_ascii_char_width(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
- return *b < 0x80 ? 1 : 0;
+pm_encoding_ascii_char_width(const uint8_t *b, ptrdiff_t n) {
+ return ((n > 0) && (*b < 0x80)) ? 1 : 0;
}
/**
@@ -3919,8 +4042,8 @@ pm_encoding_ascii_char_width(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t
* alphabetical character.
*/
static size_t
-pm_encoding_ascii_alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
- return (pm_encoding_ascii_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT);
+pm_encoding_ascii_alpha_char(const uint8_t *b, ptrdiff_t n) {
+ return (n > 0) ? (pm_encoding_ascii_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT) : 0;
}
/**
@@ -3930,7 +4053,7 @@ pm_encoding_ascii_alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t
*/
static size_t
pm_encoding_ascii_alpha_char_7bit(const uint8_t *b, ptrdiff_t n) {
- return (*b < 0x80) ? pm_encoding_ascii_alpha_char(b, n) : 0;
+ return ((n > 0) && (*b < 0x80)) ? pm_encoding_ascii_alpha_char(b, n) : 0;
}
/**
@@ -3938,8 +4061,8 @@ pm_encoding_ascii_alpha_char_7bit(const uint8_t *b, ptrdiff_t n) {
* alphanumeric character.
*/
static size_t
-pm_encoding_ascii_alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
- return (pm_encoding_ascii_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT) ? 1 : 0;
+pm_encoding_ascii_alnum_char(const uint8_t *b, ptrdiff_t n) {
+ return ((n > 0) && (pm_encoding_ascii_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0;
}
/**
@@ -3949,7 +4072,7 @@ pm_encoding_ascii_alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t
*/
static size_t
pm_encoding_ascii_alnum_char_7bit(const uint8_t *b, ptrdiff_t n) {
- return (*b < 0x80) ? pm_encoding_ascii_alnum_char(b, n) : 0;
+ return ((n > 0) && (*b < 0x80)) ? pm_encoding_ascii_alnum_char(b, n) : 0;
}
/**
@@ -3957,27 +4080,137 @@ pm_encoding_ascii_alnum_char_7bit(const uint8_t *b, ptrdiff_t n) {
* character.
*/
static bool
-pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
- return (pm_encoding_ascii_table[*b] & PRISM_ENCODING_UPPERCASE_BIT);
+pm_encoding_ascii_isupper_char(const uint8_t *b, ptrdiff_t n) {
+ return (n > 0) && (pm_encoding_ascii_table[*b] & PRISM_ENCODING_UPPERCASE_BIT);
}
/**
- * Certain encodings are equivalent to ASCII below 0x80, so it works for our
- * purposes to have a function here that first checks the bounds and then falls
- * back to checking the ASCII lookup table.
+ * For a lot of encodings the default is that they are a single byte long no
+ * matter what the codepoint, so this function is shared between them.
+ */
+static size_t
+pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
+ return 1;
+}
+
+/**
+ * Returns the byte width (1, 2 or 3) of the EUC-JP character at b, reading at
+ * most n bytes, or 0 if a character cannot be decoded from the given bytes.
+ */
+static size_t
+pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) {
+    // Single byte characters: anything below 0x80.
+    if ((n > 0) && (*b < 0x80)) {
+        return 1;
+    }
+
+    // Double byte characters: 0x8E or 0xA1-0xFE lead, then one 0xA1-0xFE byte.
+    if ((n > 1) && ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) && (b[1] >= 0xA1 && b[1] <= 0xFE)) {
+        return 2;
+    }
+
+    // Triple byte characters: 0x8F lead, then two bytes each in 0xA1-0xFE.
+    if ((n > 2) && (b[0] == 0x8F) && (b[1] >= 0xA1 && b[1] <= 0xFE) && (b[2] >= 0xA1 && b[2] <= 0xFE)) {
+        return 3;
+    }
+
+    return 0;
+}
+
+/**
+ * Returns true if the next character in the EUC-JP encoding is an uppercase
+ * character, or false otherwise.
*/
static bool
-pm_encoding_ascii_isupper_char_7bit(const uint8_t *b, ptrdiff_t n) {
- return (*b < 0x80) && pm_encoding_ascii_isupper_char(b, n);
+pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) {
+ size_t width = pm_encoding_euc_jp_char_width(b, n);
+
+ if (width == 1) {
+ return pm_encoding_ascii_isupper_char(b, n);
+ } else if (width == 2) {
+ return (
+ (b[0] == 0xA3 && b[1] >= 0xC1 && b[1] <= 0xDA) ||
+ (b[0] == 0xA6 && b[1] >= 0xA1 && b[1] <= 0xB8) ||
+ (b[0] == 0xA7 && b[1] >= 0xA1 && b[1] <= 0xC1)
+ );
+ } else {
+ return false;
+ }
}
/**
- * For a lot of encodings the default is that they are a single byte long no
- * matter what the codepoint, so this function is shared between them.
+ * Returns the size of the next character in the Shift_JIS encoding, or 0 if a
+ * character cannot be decoded from the given bytes.
*/
static size_t
-pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
- return 1;
+pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) {
+ if (n == 0) {
+ return 0;
+ }
+ // These are the single byte characters.
+ if (b[0] < 0x80 || (b[0] >= 0xA1 && b[0] <= 0xDF)) {
+ return 1;
+ }
+
+ // These are the double byte characters.
+ if ((n > 1) && ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) && (b[1] >= 0x40 && b[1] <= 0xFC && b[1] != 0x7F)) {
+ return 2;
+ }
+
+ return 0;
+}
+
+/**
+ * Returns the width of the next Shift_JIS character if it is alphanumeric, or 0
+ * if not. Decodable single bytes >= 0x80 and double-byte characters qualify.
+ */
+static size_t
+pm_encoding_shift_jis_alnum_char(const uint8_t *b, ptrdiff_t n) {
+ size_t width = pm_encoding_shift_jis_char_width(b, n);
+ return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alnum_char(b, n)) : width;
+}
+
+/**
+ * Returns the width of the next Shift_JIS character if it is alphabetical, or 0
+ * if not. Decodable single bytes >= 0x80 and double-byte characters qualify.
+ */
+static size_t
+pm_encoding_shift_jis_alpha_char(const uint8_t *b, ptrdiff_t n) {
+ size_t width = pm_encoding_shift_jis_char_width(b, n);
+ return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alpha_char(b, n)) : width;
+}
+
+/**
+ * Returns true if the next character in the Shift_JIS encoding is an uppercase
+ * character, or false otherwise (including when no character can be decoded).
+ */
+static bool
+pm_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) {
+    size_t width = pm_encoding_shift_jis_char_width(b, n);
+
+    if (width == 1) {
+        return pm_encoding_ascii_isupper_char(b, n);
+    } else if (width == 2) {
+        return (
+            ((b[0] == 0x82) && (b[1] >= 0x60 && b[1] <= 0x79)) ||
+            ((b[0] == 0x83) && (b[1] >= 0x9F && b[1] <= 0xB6)) ||
+            ((b[0] == 0x84) && (b[1] >= 0x40 && b[1] <= 0x60))
+        );
+    } else {
+        return false; // width is 0 here: nothing could be decoded
+    }
+}
+
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
+
+/**
+ * Certain encodings are equivalent to ASCII below 0x80, so it works for our
+ * purposes to have a function here that first checks the bounds and then falls
+ * back to checking the ASCII lookup table.
+ */
+static bool
+pm_encoding_ascii_isupper_char_7bit(const uint8_t *b, ptrdiff_t n) {
+ return (n > 0) && (*b < 0x80) && pm_encoding_ascii_isupper_char(b, n);
}
/**
@@ -3987,7 +4220,7 @@ pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATT
static size_t
pm_encoding_big5_char_width(const uint8_t *b, ptrdiff_t n) {
// These are the single byte characters.
- if (*b < 0x80) {
+ if ((n > 0) && (*b < 0x80)) {
return 1;
}
@@ -4006,12 +4239,12 @@ pm_encoding_big5_char_width(const uint8_t *b, ptrdiff_t n) {
static size_t
pm_encoding_cp949_char_width(const uint8_t *b, ptrdiff_t n) {
// These are the single byte characters
- if (*b <= 0x80) {
+ if ((n > 0) && (*b <= 0x80)) {
return 1;
}
// These are the double byte characters
- if ((n > 1) && (b[0] >= 0x81 && b[0] <= 0xfe) && (b[1] >= 0x41 && b[1] <= 0xfe)) {
+ if ((n > 1) && (b[0] >= 0x81 && b[0] <= 0xFE) && ((b[1] >= 0x41 && b[1] <= 0x5A) || (b[1] >= 0x61 && b[1] <= 0x7A) || (b[1] >= 0x81 && b[1] <= 0xFE))) {
return 2;
}
@@ -4025,7 +4258,7 @@ pm_encoding_cp949_char_width(const uint8_t *b, ptrdiff_t n) {
static size_t
pm_encoding_emacs_mule_char_width(const uint8_t *b, ptrdiff_t n) {
// These are the 1 byte characters.
- if (*b < 0x80) {
+ if ((n > 0) && (*b < 0x80)) {
return 1;
}
@@ -4062,37 +4295,13 @@ pm_encoding_emacs_mule_char_width(const uint8_t *b, ptrdiff_t n) {
}
/**
- * Returns the size of the next character in the EUC-JP encoding, or 0 if a
- * character cannot be decoded from the given bytes.
- */
-static size_t
-pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) {
- // These are the single byte characters.
- if (*b < 0x80) {
- return 1;
- }
-
- // These are the double byte characters.
- if ((n > 1) && ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) && (b[1] >= 0xA1 && b[1] <= 0xFE)) {
- return 2;
- }
-
- // These are the triple byte characters.
- if ((n > 2) && (b[0] == 0x8F) && (b[1] >= 0xA1 && b[2] <= 0xFE) && (b[2] >= 0xA1 && b[2] <= 0xFE)) {
- return 3;
- }
-
- return 0;
-}
-
-/**
* Returns the size of the next character in the EUC-KR encoding, or 0 if a
* character cannot be decoded from the given bytes.
*/
static size_t
pm_encoding_euc_kr_char_width(const uint8_t *b, ptrdiff_t n) {
// These are the single byte characters.
- if (*b < 0x80) {
+ if ((n > 0) && (*b < 0x80)) {
return 1;
}
@@ -4111,7 +4320,7 @@ pm_encoding_euc_kr_char_width(const uint8_t *b, ptrdiff_t n) {
static size_t
pm_encoding_euc_tw_char_width(const uint8_t *b, ptrdiff_t n) {
// These are the single byte characters.
- if (*b < 0x80) {
+ if ((n > 0) && (*b < 0x80)) {
return 1;
}
@@ -4135,7 +4344,7 @@ pm_encoding_euc_tw_char_width(const uint8_t *b, ptrdiff_t n) {
static size_t
pm_encoding_gb18030_char_width(const uint8_t *b, ptrdiff_t n) {
// These are the 1 byte characters.
- if (*b < 0x80) {
+ if ((n > 0) && (*b < 0x80)) {
return 1;
}
@@ -4159,7 +4368,7 @@ pm_encoding_gb18030_char_width(const uint8_t *b, ptrdiff_t n) {
static size_t
pm_encoding_gbk_char_width(const uint8_t *b, ptrdiff_t n) {
// These are the single byte characters.
- if (*b <= 0x80) {
+ if ((n > 0) && (*b <= 0x80)) {
return 1;
}
@@ -4183,33 +4392,7 @@ pm_encoding_gbk_char_width(const uint8_t *b, ptrdiff_t n) {
return 0;
}
-/**
- * Returns the size of the next character in the KOI-8 encoding. This means
- * checking if it's a valid codepoint in KOI-8 and if it is returning 1.
- */
-static size_t
-pm_encoding_koi8_char_width(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
- return ((*b >= 0x20 && *b <= 0x7E) || (*b >= 0x80)) ? 1 : 0;
-}
-
-/**
- * Returns the size of the next character in the Shift_JIS encoding, or 0 if a
- * character cannot be decoded from the given bytes.
- */
-static size_t
-pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) {
- // These are the single byte characters.
- if (*b < 0x80 || (*b >= 0xA1 && *b <= 0xDF)) {
- return 1;
- }
-
- // These are the double byte characters.
- if ((n > 1) && ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) && (b[1] >= 0x40 && b[1] <= 0xFC)) {
- return 2;
- }
-
- return 0;
-}
+#endif
/**
* This is the table of all of the encodings that prism supports.
@@ -4223,6 +4406,14 @@ const pm_encoding_t pm_encodings[] = {
.isupper_char = pm_encoding_utf_8_isupper_char,
.multibyte = true
},
+ [PM_ENCODING_US_ASCII] = {
+ .name = "US-ASCII",
+ .char_width = pm_encoding_ascii_char_width,
+ .alnum_char = pm_encoding_ascii_alnum_char,
+ .alpha_char = pm_encoding_ascii_alpha_char,
+ .isupper_char = pm_encoding_ascii_isupper_char,
+ .multibyte = false
+ },
[PM_ENCODING_ASCII_8BIT] = {
.name = "ASCII-8BIT",
.char_width = pm_encoding_single_char_width,
@@ -4231,6 +4422,24 @@ const pm_encoding_t pm_encodings[] = {
.isupper_char = pm_encoding_ascii_isupper_char,
.multibyte = false
},
+ [PM_ENCODING_EUC_JP] = {
+ .name = "EUC-JP",
+ .char_width = pm_encoding_euc_jp_char_width,
+ .alnum_char = pm_encoding_ascii_alnum_char_7bit,
+ .alpha_char = pm_encoding_ascii_alpha_char_7bit,
+ .isupper_char = pm_encoding_euc_jp_isupper_char,
+ .multibyte = true
+ },
+ [PM_ENCODING_WINDOWS_31J] = {
+ .name = "Windows-31J",
+ .char_width = pm_encoding_shift_jis_char_width,
+ .alnum_char = pm_encoding_shift_jis_alnum_char,
+ .alpha_char = pm_encoding_shift_jis_alpha_char,
+ .isupper_char = pm_encoding_shift_jis_isupper_char,
+ .multibyte = true
+ },
+
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
[PM_ENCODING_BIG5] = {
.name = "Big5",
.char_width = pm_encoding_big5_char_width,
@@ -4268,7 +4477,7 @@ const pm_encoding_t pm_encodings[] = {
.char_width = pm_encoding_euc_jp_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
.alpha_char = pm_encoding_ascii_alpha_char_7bit,
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
+ .isupper_char = pm_encoding_euc_jp_isupper_char,
.multibyte = true
},
[PM_ENCODING_CP850] = {
@@ -4327,20 +4536,12 @@ const pm_encoding_t pm_encodings[] = {
.isupper_char = pm_encoding_ascii_isupper_char_7bit,
.multibyte = true
},
- [PM_ENCODING_EUC_JP] = {
- .name = "EUC-JP",
- .char_width = pm_encoding_euc_jp_char_width,
- .alnum_char = pm_encoding_ascii_alnum_char_7bit,
- .alpha_char = pm_encoding_ascii_alpha_char_7bit,
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
- .multibyte = true
- },
[PM_ENCODING_EUC_JP_MS] = {
.name = "eucJP-ms",
.char_width = pm_encoding_euc_jp_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
.alpha_char = pm_encoding_ascii_alpha_char_7bit,
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
+ .isupper_char = pm_encoding_euc_jp_isupper_char,
.multibyte = true
},
[PM_ENCODING_EUC_JIS_2004] = {
@@ -4348,7 +4549,7 @@ const pm_encoding_t pm_encodings[] = {
.char_width = pm_encoding_euc_jp_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
.alpha_char = pm_encoding_ascii_alpha_char_7bit,
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
+ .isupper_char = pm_encoding_euc_jp_isupper_char,
.multibyte = true
},
[PM_ENCODING_EUC_KR] = {
@@ -4649,7 +4850,7 @@ const pm_encoding_t pm_encodings[] = {
},
[PM_ENCODING_KOI8_R] = {
.name = "KOI8-R",
- .char_width = pm_encoding_koi8_char_width,
+ .char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_koi8_r_alnum_char,
.alpha_char = pm_encoding_koi8_r_alpha_char,
.isupper_char = pm_encoding_koi8_r_isupper_char,
@@ -4657,7 +4858,7 @@ const pm_encoding_t pm_encodings[] = {
},
[PM_ENCODING_KOI8_U] = {
.name = "KOI8-U",
- .char_width = pm_encoding_koi8_char_width,
+ .char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_koi8_u_alnum_char,
.alpha_char = pm_encoding_koi8_u_alpha_char,
.isupper_char = pm_encoding_koi8_u_isupper_char,
@@ -4706,9 +4907,9 @@ const pm_encoding_t pm_encodings[] = {
[PM_ENCODING_MAC_JAPANESE] = {
.name = "MacJapanese",
.char_width = pm_encoding_shift_jis_char_width,
- .alnum_char = pm_encoding_ascii_alnum_char_7bit,
- .alpha_char = pm_encoding_ascii_alpha_char_7bit,
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
+ .alnum_char = pm_encoding_shift_jis_alnum_char,
+ .alpha_char = pm_encoding_shift_jis_alpha_char,
+ .isupper_char = pm_encoding_shift_jis_isupper_char,
.multibyte = true
},
[PM_ENCODING_MAC_ROMAN] = {
@@ -4754,33 +4955,33 @@ const pm_encoding_t pm_encodings[] = {
[PM_ENCODING_SHIFT_JIS] = {
.name = "Shift_JIS",
.char_width = pm_encoding_shift_jis_char_width,
- .alnum_char = pm_encoding_ascii_alnum_char_7bit,
- .alpha_char = pm_encoding_ascii_alpha_char_7bit,
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
+ .alnum_char = pm_encoding_shift_jis_alnum_char,
+ .alpha_char = pm_encoding_shift_jis_alpha_char,
+ .isupper_char = pm_encoding_shift_jis_isupper_char,
.multibyte = true
},
[PM_ENCODING_SJIS_DOCOMO] = {
.name = "SJIS-DoCoMo",
.char_width = pm_encoding_shift_jis_char_width,
- .alnum_char = pm_encoding_ascii_alnum_char_7bit,
- .alpha_char = pm_encoding_ascii_alpha_char_7bit,
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
+ .alnum_char = pm_encoding_shift_jis_alnum_char,
+ .alpha_char = pm_encoding_shift_jis_alpha_char,
+ .isupper_char = pm_encoding_shift_jis_isupper_char,
.multibyte = true
},
[PM_ENCODING_SJIS_KDDI] = {
.name = "SJIS-KDDI",
.char_width = pm_encoding_shift_jis_char_width,
- .alnum_char = pm_encoding_ascii_alnum_char_7bit,
- .alpha_char = pm_encoding_ascii_alpha_char_7bit,
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
+ .alnum_char = pm_encoding_shift_jis_alnum_char,
+ .alpha_char = pm_encoding_shift_jis_alpha_char,
+ .isupper_char = pm_encoding_shift_jis_isupper_char,
.multibyte = true
},
[PM_ENCODING_SJIS_SOFTBANK] = {
.name = "SJIS-SoftBank",
.char_width = pm_encoding_shift_jis_char_width,
- .alnum_char = pm_encoding_ascii_alnum_char_7bit,
- .alpha_char = pm_encoding_ascii_alpha_char_7bit,
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
+ .alnum_char = pm_encoding_shift_jis_alnum_char,
+ .alpha_char = pm_encoding_shift_jis_alpha_char,
+ .isupper_char = pm_encoding_shift_jis_isupper_char,
.multibyte = true
},
[PM_ENCODING_STATELESS_ISO_2022_JP] = {
@@ -4807,14 +5008,6 @@ const pm_encoding_t pm_encodings[] = {
.isupper_char = pm_encoding_tis_620_isupper_char,
.multibyte = false
},
- [PM_ENCODING_US_ASCII] = {
- .name = "US-ASCII",
- .char_width = pm_encoding_ascii_char_width,
- .alnum_char = pm_encoding_ascii_alnum_char,
- .alpha_char = pm_encoding_ascii_alpha_char,
- .isupper_char = pm_encoding_ascii_isupper_char,
- .multibyte = false
- },
[PM_ENCODING_UTF8_MAC] = {
.name = "UTF8-MAC",
.char_width = pm_encoding_utf_8_char_width,
@@ -4919,14 +5112,6 @@ const pm_encoding_t pm_encodings[] = {
.isupper_char = pm_encoding_windows_1258_isupper_char,
.multibyte = false
},
- [PM_ENCODING_WINDOWS_31J] = {
- .name = "Windows-31J",
- .char_width = pm_encoding_shift_jis_char_width,
- .alnum_char = pm_encoding_ascii_alnum_char_7bit,
- .alpha_char = pm_encoding_ascii_alpha_char_7bit,
- .isupper_char = pm_encoding_ascii_isupper_char_7bit,
- .multibyte = true
- },
[PM_ENCODING_WINDOWS_874] = {
.name = "Windows-874",
.char_width = pm_encoding_single_char_width,
@@ -4935,6 +5120,7 @@ const pm_encoding_t pm_encodings[] = {
.isupper_char = pm_encoding_windows_874_isupper_char,
.multibyte = false
}
+#endif
};
/**
@@ -4949,11 +5135,13 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
// UTF-8 can contain extra information at the end about the platform it is
// encoded on, such as UTF-8-MAC or UTF-8-UNIX. We'll ignore those suffixes.
if ((start + 5 <= end) && (pm_strncasecmp(start, (const uint8_t *) "UTF-8", 5) == 0)) {
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
// We need to explicitly handle UTF-8-HFS, as that one needs to switch
// over to being UTF8-MAC.
if (width == 9 && (pm_strncasecmp(start + 5, (const uint8_t *) "-HFS", 4) == 0)) {
return &pm_encodings[PM_ENCODING_UTF8_MAC];
}
+#endif
// Otherwise we'll return the default UTF-8 encoding.
return PM_ENCODING_UTF_8_ENTRY;
@@ -4973,11 +5161,16 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
break;
case 'B': case 'b':
ENCODING1("BINARY", PM_ENCODING_ASCII_8BIT);
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
ENCODING1("Big5", PM_ENCODING_BIG5);
ENCODING2("Big5-HKSCS", "Big5-HKSCS:2008", PM_ENCODING_BIG5_HKSCS);
ENCODING1("Big5-UAO", PM_ENCODING_BIG5_UAO);
+#endif
break;
case 'C': case 'c':
+ ENCODING1("CP65001", PM_ENCODING_UTF_8);
+ ENCODING2("CP932", "csWindows31J", PM_ENCODING_WINDOWS_31J);
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
ENCODING1("CESU-8", PM_ENCODING_CESU_8);
ENCODING1("CP437", PM_ENCODING_IBM437);
ENCODING1("CP720", PM_ENCODING_IBM720);
@@ -4997,7 +5190,6 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
ENCODING1("CP874", PM_ENCODING_WINDOWS_874);
ENCODING1("CP878", PM_ENCODING_KOI8_R);
ENCODING1("CP863", PM_ENCODING_IBM863);
- ENCODING2("CP932", "csWindows31J", PM_ENCODING_WINDOWS_31J);
ENCODING1("CP936", PM_ENCODING_GBK);
ENCODING1("CP949", PM_ENCODING_CP949);
ENCODING1("CP950", PM_ENCODING_CP950);
@@ -5012,25 +5204,30 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
ENCODING1("CP1257", PM_ENCODING_WINDOWS_1257);
ENCODING1("CP1258", PM_ENCODING_WINDOWS_1258);
ENCODING1("CP51932", PM_ENCODING_CP51932);
- ENCODING1("CP65001", PM_ENCODING_UTF_8);
+#endif
break;
case 'E': case 'e':
ENCODING2("EUC-JP", "eucJP", PM_ENCODING_EUC_JP);
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
ENCODING2("eucJP-ms", "euc-jp-ms", PM_ENCODING_EUC_JP_MS);
ENCODING2("EUC-JIS-2004", "EUC-JISX0213", PM_ENCODING_EUC_JIS_2004);
ENCODING2("EUC-KR", "eucKR", PM_ENCODING_EUC_KR);
ENCODING2("EUC-CN", "eucCN", PM_ENCODING_GB2312);
ENCODING2("EUC-TW", "eucTW", PM_ENCODING_EUC_TW);
ENCODING1("Emacs-Mule", PM_ENCODING_EMACS_MULE);
+#endif
break;
case 'G': case 'g':
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
ENCODING1("GBK", PM_ENCODING_GBK);
ENCODING1("GB12345", PM_ENCODING_GB12345);
ENCODING1("GB18030", PM_ENCODING_GB18030);
ENCODING1("GB1988", PM_ENCODING_GB1988);
ENCODING1("GB2312", PM_ENCODING_GB2312);
+#endif
break;
case 'I': case 'i':
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
ENCODING1("IBM437", PM_ENCODING_IBM437);
ENCODING1("IBM720", PM_ENCODING_IBM720);
ENCODING1("IBM737", PM_ENCODING_IBM737);
@@ -5062,12 +5259,16 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
ENCODING2("ISO-8859-14", "ISO8859-14", PM_ENCODING_ISO_8859_14);
ENCODING2("ISO-8859-15", "ISO8859-15", PM_ENCODING_ISO_8859_15);
ENCODING2("ISO-8859-16", "ISO8859-16", PM_ENCODING_ISO_8859_16);
+#endif
break;
case 'K': case 'k':
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
ENCODING1("KOI8-R", PM_ENCODING_KOI8_R);
ENCODING1("KOI8-U", PM_ENCODING_KOI8_U);
+#endif
break;
case 'M': case 'm':
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
ENCODING1("macCentEuro", PM_ENCODING_MAC_CENT_EURO);
ENCODING1("macCroatian", PM_ENCODING_MAC_CROATIAN);
ENCODING1("macCyrillic", PM_ENCODING_MAC_CYRILLIC);
@@ -5080,31 +5281,39 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
ENCODING1("macThai", PM_ENCODING_MAC_THAI);
ENCODING1("macTurkish", PM_ENCODING_MAC_TURKISH);
ENCODING1("macUkraine", PM_ENCODING_MAC_UKRAINE);
+#endif
break;
case 'P': case 'p':
ENCODING1("PCK", PM_ENCODING_WINDOWS_31J);
break;
case 'S': case 's':
- ENCODING1("Shift_JIS", PM_ENCODING_SHIFT_JIS);
ENCODING1("SJIS", PM_ENCODING_WINDOWS_31J);
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
+ ENCODING1("Shift_JIS", PM_ENCODING_SHIFT_JIS);
ENCODING1("SJIS-DoCoMo", PM_ENCODING_SJIS_DOCOMO);
ENCODING1("SJIS-KDDI", PM_ENCODING_SJIS_KDDI);
ENCODING1("SJIS-SoftBank", PM_ENCODING_SJIS_SOFTBANK);
ENCODING1("stateless-ISO-2022-JP", PM_ENCODING_STATELESS_ISO_2022_JP);
ENCODING1("stateless-ISO-2022-JP-KDDI", PM_ENCODING_STATELESS_ISO_2022_JP_KDDI);
+#endif
break;
case 'T': case 't':
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
ENCODING1("TIS-620", PM_ENCODING_TIS_620);
+#endif
break;
case 'U': case 'u':
ENCODING1("US-ASCII", PM_ENCODING_US_ASCII);
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
ENCODING2("UTF8-MAC", "UTF-8-HFS", PM_ENCODING_UTF8_MAC);
ENCODING1("UTF8-DoCoMo", PM_ENCODING_UTF8_DOCOMO);
ENCODING1("UTF8-KDDI", PM_ENCODING_UTF8_KDDI);
ENCODING1("UTF8-SoftBank", PM_ENCODING_UTF8_SOFTBANK);
+#endif
break;
case 'W': case 'w':
ENCODING1("Windows-31J", PM_ENCODING_WINDOWS_31J);
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
ENCODING1("Windows-874", PM_ENCODING_WINDOWS_874);
ENCODING1("Windows-1250", PM_ENCODING_WINDOWS_1250);
ENCODING1("Windows-1251", PM_ENCODING_WINDOWS_1251);
@@ -5115,6 +5324,7 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
ENCODING1("Windows-1256", PM_ENCODING_WINDOWS_1256);
ENCODING1("Windows-1257", PM_ENCODING_WINDOWS_1257);
ENCODING1("Windows-1258", PM_ENCODING_WINDOWS_1258);
+#endif
break;
case '6':
ENCODING1("646", PM_ENCODING_US_ASCII);