1 files changed, 354 insertions, 206 deletions
diff --git a/prism/encoding.c b/prism/encoding.c
index dc63cccc2d..c9c2e13056 100644
--- a/prism/encoding.c
+++ b/prism/encoding.c
@@ -1,8 +1,13 @@
-#include "prism/encoding.h"
+#include "prism/internal/encoding.h"
+
+#include "prism/compiler/unused.h"
+#include "prism/internal/strncasecmp.h"
+
+#include <assert.h>
 
 typedef uint32_t pm_unicode_codepoint_t;
 
-#define UNICODE_ALPHA_CODEPOINTS_LENGTH 1450
+#define UNICODE_ALPHA_CODEPOINTS_LENGTH 1508
 static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEPOINTS_LENGTH] = {
     0x100, 0x2C1,
     0x2C6, 0x2D1,
@@ -10,7 +15,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
     0x2EC, 0x2EC,
     0x2EE, 0x2EE,
     0x345, 0x345,
-    0x370, 0x374,
+    0x363, 0x374,
     0x376, 0x377,
     0x37A, 0x37D,
     0x37F, 0x37F,
@@ -50,7 +55,8 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
     0x840, 0x858,
     0x860, 0x86A,
     0x870, 0x887,
-    0x889, 0x88E,
+    0x889, 0x88F,
+    0x897, 0x897,
     0x8A0, 0x8C9,
     0x8D4, 0x8DF,
     0x8E3, 0x8E9,
@@ -140,7 +146,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
     0xC4A, 0xC4C,
     0xC55, 0xC56,
     0xC58, 0xC5A,
-    0xC5D, 0xC5D,
+    0xC5C, 0xC5D,
     0xC60, 0xC63,
     0xC80, 0xC83,
     0xC85, 0xC8C,
@@ -152,7 +158,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
     0xCC6, 0xCC8,
     0xCCA, 0xCCC,
     0xCD5, 0xCD6,
-    0xCDD, 0xCDE,
+    0xCDC, 0xCDE,
     0xCE0, 0xCE3,
     0xCF1, 0xCF3,
     0xD00, 0xD0C,
@@ -264,7 +270,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
     0x1C00, 0x1C36,
     0x1C4D, 0x1C4F,
     0x1C5A, 0x1C7D,
-    0x1C80, 0x1C88,
+    0x1C80, 0x1C8A,
     0x1C90, 0x1CBA,
     0x1CBD, 0x1CBF,
     0x1CE9, 0x1CEC,
@@ -272,7 +278,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
     0x1CF5, 0x1CF6,
     0x1CFA, 0x1CFA,
     0x1D00, 0x1DBF,
-    0x1DE7, 0x1DF4,
+    0x1DD3, 0x1DF4,
     0x1E00, 0x1F15,
     0x1F18, 0x1F1D,
     0x1F20, 0x1F45,
@@ -352,11 +358,8 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
     0xA67F, 0xA6EF,
     0xA717, 0xA71F,
     0xA722, 0xA788,
-    0xA78B, 0xA7CA,
-    0xA7D0, 0xA7D1,
-    0xA7D3, 0xA7D3,
-    0xA7D5, 0xA7D9,
-    0xA7F2, 0xA805,
+    0xA78B, 0xA7DC,
+    0xA7F1, 0xA805,
     0xA807, 0xA827,
     0xA840, 0xA873,
     0xA880, 0xA8C3,
@@ -446,6 +449,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
     0x105A3, 0x105B1,
     0x105B3, 0x105B9,
     0x105BB, 0x105BC,
+    0x105C0, 0x105F3,
     0x10600, 0x10736,
     0x10740, 0x10755,
     0x10760, 0x10767,
@@ -464,6 +468,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
     0x108F4, 0x108F5,
     0x10900, 0x10915,
     0x10920, 0x10939,
+    0x10940, 0x10959,
     0x10980, 0x109B7,
     0x109BE, 0x109BF,
     0x10A00, 0x10A03,
@@ -483,9 +488,14 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
     0x10C80, 0x10CB2,
     0x10CC0, 0x10CF2,
     0x10D00, 0x10D27,
+    0x10D4A, 0x10D65,
+    0x10D69, 0x10D69,
+    0x10D6F, 0x10D85,
     0x10E80, 0x10EA9,
     0x10EAB, 0x10EAC,
     0x10EB0, 0x10EB1,
+    0x10EC2, 0x10EC7,
+    0x10EFA, 0x10EFC,
     0x10F00, 0x10F1C,
     0x10F27, 0x10F27,
     0x10F30, 0x10F45,
@@ -529,6 +539,17 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
     0x11350, 0x11350,
     0x11357, 0x11357,
     0x1135D, 0x11363,
+    0x11380, 0x11389,
+    0x1138B, 0x1138B,
+    0x1138E, 0x1138E,
+    0x11390, 0x113B5,
+    0x113B7, 0x113C0,
+    0x113C2, 0x113C2,
+    0x113C5, 0x113C5,
+    0x113C7, 0x113CA,
+    0x113CC, 0x113CD,
+    0x113D1, 0x113D1,
+    0x113D3, 0x113D3,
     0x11400, 0x11441,
     0x11443, 0x11445,
     0x11447, 0x1144A,
@@ -567,6 +588,8 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
     0x11A50, 0x11A97,
     0x11A9D, 0x11A9D,
     0x11AB0, 0x11AF8,
+    0x11B60, 0x11B67,
+    0x11BC0, 0x11BE0,
     0x11C00, 0x11C08,
     0x11C0A, 0x11C36,
     0x11C38, 0x11C3E,
@@ -588,6 +611,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
     0x11D90, 0x11D91,
     0x11D93, 0x11D96,
     0x11D98, 0x11D98,
+    0x11DB0, 0x11DDB,
     0x11EE0, 0x11EF6,
     0x11F00, 0x11F10,
     0x11F12, 0x11F3A,
@@ -599,7 +623,9 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
     0x12F90, 0x12FF0,
     0x13000, 0x1342F,
     0x13441, 0x13446,
+    0x13460, 0x143FA,
     0x14400, 0x14646,
+    0x16100, 0x1612E,
     0x16800, 0x16A38,
     0x16A40, 0x16A5E,
     0x16A70, 0x16ABE,
@@ -608,16 +634,19 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
     0x16B40, 0x16B43,
     0x16B63, 0x16B77,
     0x16B7D, 0x16B8F,
+    0x16D40, 0x16D6C,
     0x16E40, 0x16E7F,
+    0x16EA0, 0x16EB8,
+    0x16EBB, 0x16ED3,
     0x16F00, 0x16F4A,
     0x16F4F, 0x16F87,
     0x16F8F, 0x16F9F,
     0x16FE0, 0x16FE1,
     0x16FE3, 0x16FE3,
-    0x16FF0, 0x16FF1,
-    0x17000, 0x187F7,
-    0x18800, 0x18CD5,
-    0x18D00, 0x18D08,
+    0x16FF0, 0x16FF6,
+    0x17000, 0x18CD5,
+    0x18CFF, 0x18D1E,
+    0x18D80, 0x18DF2,
     0x1AFF0, 0x1AFF3,
     0x1AFF5, 0x1AFFB,
     0x1AFFD, 0x1AFFE,
@@ -677,6 +706,11 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
     0x1E290, 0x1E2AD,
     0x1E2C0, 0x1E2EB,
     0x1E4D0, 0x1E4EB,
+    0x1E5D0, 0x1E5ED,
+    0x1E5F0, 0x1E5F0,
+    0x1E6C0, 0x1E6DE,
+    0x1E6E0, 0x1E6F5,
+    0x1E6FE, 0x1E6FF,
     0x1E7E0, 0x1E7E6,
     0x1E7E8, 0x1E7EB,
     0x1E7ED, 0x1E7EE,
@@ -722,16 +756,16 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP
     0x1F150, 0x1F169,
     0x1F170, 0x1F189,
     0x20000, 0x2A6DF,
-    0x2A700, 0x2B739,
-    0x2B740, 0x2B81D,
-    0x2B820, 0x2CEA1,
+    0x2A700, 0x2B81D,
+    0x2B820, 0x2CEAD,
     0x2CEB0, 0x2EBE0,
+    0x2EBF0, 0x2EE5D,
     0x2F800, 0x2FA1D,
     0x30000, 0x3134A,
-    0x31350, 0x323AF,
+    0x31350, 0x33479,
 };
 
-#define UNICODE_ALNUM_CODEPOINTS_LENGTH 1528
+#define UNICODE_ALNUM_CODEPOINTS_LENGTH 1598
 static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEPOINTS_LENGTH] = {
     0x100, 0x2C1,
     0x2C6, 0x2D1,
@@ -739,7 +773,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
     0x2EC, 0x2EC,
     0x2EE, 0x2EE,
     0x345, 0x345,
-    0x370, 0x374,
+    0x363, 0x374,
     0x376, 0x377,
     0x37A, 0x37D,
     0x37F, 0x37F,
@@ -778,7 +812,8 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
     0x840, 0x858,
     0x860, 0x86A,
     0x870, 0x887,
-    0x889, 0x88E,
+    0x889, 0x88F,
+    0x897, 0x897,
     0x8A0, 0x8C9,
     0x8D4, 0x8DF,
     0x8E3, 0x8E9,
@@ -872,7 +907,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
     0xC4A, 0xC4C,
     0xC55, 0xC56,
     0xC58, 0xC5A,
-    0xC5D, 0xC5D,
+    0xC5C, 0xC5D,
     0xC60, 0xC63,
     0xC66, 0xC6F,
     0xC80, 0xC83,
@@ -885,7 +920,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
     0xCC6, 0xCC8,
     0xCCA, 0xCCC,
     0xCD5, 0xCD6,
-    0xCDD, 0xCDE,
+    0xCDC, 0xCDE,
     0xCE0, 0xCE3,
     0xCE6, 0xCEF,
     0xCF1, 0xCF3,
@@ -1007,7 +1042,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
     0x1C00, 0x1C36,
     0x1C40, 0x1C49,
     0x1C4D, 0x1C7D,
-    0x1C80, 0x1C88,
+    0x1C80, 0x1C8A,
     0x1C90, 0x1CBA,
     0x1CBD, 0x1CBF,
     0x1CE9, 0x1CEC,
@@ -1015,7 +1050,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
     0x1CF5, 0x1CF6,
     0x1CFA, 0x1CFA,
     0x1D00, 0x1DBF,
-    0x1DE7, 0x1DF4,
+    0x1DD3, 0x1DF4,
     0x1E00, 0x1F15,
     0x1F18, 0x1F1D,
     0x1F20, 0x1F45,
@@ -1094,11 +1129,8 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
     0xA67F, 0xA6EF,
     0xA717, 0xA71F,
     0xA722, 0xA788,
-    0xA78B, 0xA7CA,
-    0xA7D0, 0xA7D1,
-    0xA7D3, 0xA7D3,
-    0xA7D5, 0xA7D9,
-    0xA7F2, 0xA805,
+    0xA78B, 0xA7DC,
+    0xA7F1, 0xA805,
     0xA807, 0xA827,
     0xA840, 0xA873,
     0xA880, 0xA8C3,
@@ -1191,6 +1223,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
     0x105A3, 0x105B1,
     0x105B3, 0x105B9,
     0x105BB, 0x105BC,
+    0x105C0, 0x105F3,
     0x10600, 0x10736,
     0x10740, 0x10755,
     0x10760, 0x10767,
@@ -1209,6 +1242,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
     0x108F4, 0x108F5,
     0x10900, 0x10915,
     0x10920, 0x10939,
+    0x10940, 0x10959,
     0x10980, 0x109B7,
     0x109BE, 0x109BF,
     0x10A00, 0x10A03,
@@ -1229,9 +1263,14 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
     0x10CC0, 0x10CF2,
     0x10D00, 0x10D27,
     0x10D30, 0x10D39,
+    0x10D40, 0x10D65,
+    0x10D69, 0x10D69,
+    0x10D6F, 0x10D85,
     0x10E80, 0x10EA9,
     0x10EAB, 0x10EAC,
     0x10EB0, 0x10EB1,
+    0x10EC2, 0x10EC7,
+    0x10EFA, 0x10EFC,
     0x10F00, 0x10F1C,
     0x10F27, 0x10F27,
     0x10F30, 0x10F45,
@@ -1278,6 +1317,17 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
     0x11350, 0x11350,
     0x11357, 0x11357,
     0x1135D, 0x11363,
+    0x11380, 0x11389,
+    0x1138B, 0x1138B,
+    0x1138E, 0x1138E,
+    0x11390, 0x113B5,
+    0x113B7, 0x113C0,
+    0x113C2, 0x113C2,
+    0x113C5, 0x113C5,
+    0x113C7, 0x113CA,
+    0x113CC, 0x113CD,
+    0x113D1, 0x113D1,
+    0x113D3, 0x113D3,
     0x11400, 0x11441,
     0x11443, 0x11445,
     0x11447, 0x1144A,
@@ -1297,6 +1347,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
     0x11680, 0x116B5,
     0x116B8, 0x116B8,
     0x116C0, 0x116C9,
+    0x116D0, 0x116E3,
     0x11700, 0x1171A,
     0x1171D, 0x1172A,
     0x11730, 0x11739,
@@ -1322,6 +1373,9 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
     0x11A50, 0x11A97,
     0x11A9D, 0x11A9D,
     0x11AB0, 0x11AF8,
+    0x11B60, 0x11B67,
+    0x11BC0, 0x11BE0,
+    0x11BF0, 0x11BF9,
     0x11C00, 0x11C08,
     0x11C0A, 0x11C36,
     0x11C38, 0x11C3E,
@@ -1346,6 +1400,8 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
     0x11D93, 0x11D96,
     0x11D98, 0x11D98,
     0x11DA0, 0x11DA9,
+    0x11DB0, 0x11DDB,
+    0x11DE0, 0x11DE9,
     0x11EE0, 0x11EF6,
     0x11F00, 0x11F10,
     0x11F12, 0x11F3A,
@@ -1358,7 +1414,10 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
     0x12F90, 0x12FF0,
     0x13000, 0x1342F,
     0x13441, 0x13446,
+    0x13460, 0x143FA,
     0x14400, 0x14646,
+    0x16100, 0x1612E,
+    0x16130, 0x16139,
     0x16800, 0x16A38,
     0x16A40, 0x16A5E,
     0x16A60, 0x16A69,
@@ -1370,16 +1429,20 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
     0x16B50, 0x16B59,
     0x16B63, 0x16B77,
     0x16B7D, 0x16B8F,
+    0x16D40, 0x16D6C,
+    0x16D70, 0x16D79,
     0x16E40, 0x16E7F,
+    0x16EA0, 0x16EB8,
+    0x16EBB, 0x16ED3,
     0x16F00, 0x16F4A,
     0x16F4F, 0x16F87,
     0x16F8F, 0x16F9F,
     0x16FE0, 0x16FE1,
     0x16FE3, 0x16FE3,
-    0x16FF0, 0x16FF1,
-    0x17000, 0x187F7,
-    0x18800, 0x18CD5,
-    0x18D00, 0x18D08,
+    0x16FF0, 0x16FF6,
+    0x17000, 0x18CD5,
+    0x18CFF, 0x18D1E,
+    0x18D80, 0x18DF2,
     0x1AFF0, 0x1AFF3,
     0x1AFF5, 0x1AFFB,
     0x1AFFD, 0x1AFFE,
@@ -1394,6 +1457,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
     0x1BC80, 0x1BC88,
     0x1BC90, 0x1BC99,
     0x1BC9E, 0x1BC9E,
+    0x1CCF0, 0x1CCF9,
     0x1D400, 0x1D454,
     0x1D456, 0x1D49C,
     0x1D49E, 0x1D49F,
@@ -1443,6 +1507,11 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
     0x1E2F0, 0x1E2F9,
     0x1E4D0, 0x1E4EB,
     0x1E4F0, 0x1E4F9,
+    0x1E5D0, 0x1E5ED,
+    0x1E5F0, 0x1E5FA,
+    0x1E6C0, 0x1E6DE,
+    0x1E6E0, 0x1E6F5,
+    0x1E6FE, 0x1E6FF,
     0x1E7E0, 0x1E7E6,
     0x1E7E8, 0x1E7EB,
     0x1E7ED, 0x1E7EE,
@@ -1490,16 +1559,16 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
     0x1F170, 0x1F189,
     0x1FBF0, 0x1FBF9,
     0x20000, 0x2A6DF,
-    0x2A700, 0x2B739,
-    0x2B740, 0x2B81D,
-    0x2B820, 0x2CEA1,
+    0x2A700, 0x2B81D,
+    0x2B820, 0x2CEAD,
     0x2CEB0, 0x2EBE0,
+    0x2EBF0, 0x2EE5D,
     0x2F800, 0x2FA1D,
     0x30000, 0x3134A,
-    0x31350, 0x323AF,
+    0x31350, 0x33479,
 };
 
-#define UNICODE_ISUPPER_CODEPOINTS_LENGTH 1302
+#define UNICODE_ISUPPER_CODEPOINTS_LENGTH 1320
 static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_CODEPOINTS_LENGTH] = {
     0x100, 0x100,
     0x102, 0x102,
@@ -1774,6 +1843,7 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
     0x10C7, 0x10C7,
     0x10CD, 0x10CD,
     0x13A0, 0x13F5,
+    0x1C89, 0x1C89,
     0x1C90, 0x1CBA,
     0x1CBD, 0x1CBF,
     0x1E00, 0x1E00,
@@ -2103,9 +2173,15 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
     0xA7C2, 0xA7C2,
     0xA7C4, 0xA7C7,
     0xA7C9, 0xA7C9,
+    0xA7CB, 0xA7CC,
+    0xA7CE, 0xA7CE,
     0xA7D0, 0xA7D0,
+    0xA7D2, 0xA7D2,
+    0xA7D4, 0xA7D4,
     0xA7D6, 0xA7D6,
     0xA7D8, 0xA7D8,
+    0xA7DA, 0xA7DA,
+    0xA7DC, 0xA7DC,
     0xA7F5, 0xA7F5,
     0xFF21, 0xFF3A,
     0x10400, 0x10427,
@@ -2115,8 +2191,10 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
     0x1058C, 0x10592,
     0x10594, 0x10595,
     0x10C80, 0x10CB2,
+    0x10D50, 0x10D65,
     0x118A0, 0x118BF,
     0x16E40, 0x16E5F,
+    0x16EA0, 0x16EB8,
     0x1D400, 0x1D419,
     0x1D434, 0x1D44D,
     0x1D468, 0x1D481,
@@ -2304,6 +2382,10 @@ pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n) {
  */
 size_t
 pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n) {
+    if (n == 0) {
+        return 0;
+    }
+
     if (*b < 0x80) {
         return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT) ? 1 : 0;
     }
@@ -2324,6 +2406,10 @@ pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n) {
  */
 size_t
 pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n) {
+    if (n == 0) {
+        return 0;
+    }
+
     if (*b < 0x80) {
         return (pm_encoding_unicode_table[*b] & (PRISM_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0;
     }
@@ -2344,6 +2430,10 @@ pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n) {
  */
 bool
 pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
+    if (n == 0) {
+        return 0;
+    }
+
     if (*b < 0x80) {
         return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_UPPERCASE_BIT) ? true : false;
     }
@@ -2358,9 +2448,12 @@ pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
     }
 }
 
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
+
 static pm_unicode_codepoint_t
 pm_cesu_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
-    if (b[0] < 0x80) {
+
+    if ((n > 0) && (b[0] < 0x80)) {
         *width = 1;
         return (pm_unicode_codepoint_t) b[0];
     }
@@ -2399,6 +2492,10 @@ pm_cesu_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
 
 static size_t
 pm_encoding_cesu_8_char_width(const uint8_t *b, ptrdiff_t n) {
+    if (n == 0) {
+        return 0;
+    }
+
     size_t width;
     pm_cesu_8_codepoint(b, n, &width);
     return width;
@@ -2406,6 +2503,10 @@ pm_encoding_cesu_8_char_width(const uint8_t *b, ptrdiff_t n) {
 
 static size_t
 pm_encoding_cesu_8_alpha_char(const uint8_t *b, ptrdiff_t n) {
+    if (n == 0) {
+        return 0;
+    }
+
     if (*b < 0x80) {
         return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT) ? 1 : 0;
     }
@@ -2422,6 +2523,10 @@ pm_encoding_cesu_8_alpha_char(const uint8_t *b, ptrdiff_t n) {
 
 static size_t
 pm_encoding_cesu_8_alnum_char(const uint8_t *b, ptrdiff_t n) {
+    if (n == 0) {
+        return 0;
+    }
+
     if (*b < 0x80) {
         return (pm_encoding_unicode_table[*b] & (PRISM_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0;
     }
@@ -2438,6 +2543,10 @@ pm_encoding_cesu_8_alnum_char(const uint8_t *b, ptrdiff_t n) {
 
 static bool
 pm_encoding_cesu_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
+    if (n == 0) {
+        return 0;
+    }
+
     if (*b < 0x80) {
         return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_UPPERCASE_BIT) ? true : false;
     }
@@ -2452,6 +2561,8 @@ pm_encoding_cesu_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
     }
 }
 
+#endif
+
 #undef UNICODE_ALPHA_CODEPOINTS_LENGTH
 #undef UNICODE_ALNUM_CODEPOINTS_LENGTH
 #undef UNICODE_ISUPPER_CODEPOINTS_LENGTH
@@ -2480,6 +2591,8 @@ static const uint8_t pm_encoding_ascii_table[256] = {
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
 };
 
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
+
 /**
  * Each element of the following table contains a bitfield that indicates a
  * piece of information about the corresponding CP850 character.
@@ -3849,14 +3962,14 @@ static const uint8_t pm_encoding_windows_874_table[256] = {
 };
 
 #define PRISM_ENCODING_TABLE(name) \
-    static size_t pm_encoding_ ##name ## _alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {           \
-        return (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_ALPHABETIC_BIT);           \
+    static size_t pm_encoding_ ##name ## _alpha_char(const uint8_t *b, ptrdiff_t n) {           \
+        return ((n > 0) && (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_ALPHABETIC_BIT));           \
     }                                                                                                         \
-    static size_t pm_encoding_ ##name ## _alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {           \
-        return (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT) ? 1 : 0; \
+    static size_t pm_encoding_ ##name ## _alnum_char(const uint8_t *b, ptrdiff_t n) {           \
+        return ((n > 0) && (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0; \
     }                                                                                                         \
-    static bool pm_encoding_ ##name ## _isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {           \
-        return (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_UPPERCASE_BIT);            \
+    static bool pm_encoding_ ##name ## _isupper_char(const uint8_t *b, ptrdiff_t n) {           \
+        return ((n > 0) && (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_UPPERCASE_BIT));            \
     }
 
 PRISM_ENCODING_TABLE(cp850)
@@ -3918,14 +4031,15 @@ PRISM_ENCODING_TABLE(windows_1258)
 PRISM_ENCODING_TABLE(windows_874)
 
 #undef PRISM_ENCODING_TABLE
+#endif
 
 /**
  * Returns the size of the next character in the ASCII encoding. This basically
  * means that if the top bit is not set, the character is 1 byte long.
  */
 static size_t
-pm_encoding_ascii_char_width(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
-    return *b < 0x80 ? 1 : 0;
+pm_encoding_ascii_char_width(const uint8_t *b, ptrdiff_t n) {
+    return ((n > 0) && (*b < 0x80)) ? 1 : 0;
 }
 
 /**
@@ -3933,8 +4047,8 @@ pm_encoding_ascii_char_width(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t
  * alphabetical character.
  */
 static size_t
-pm_encoding_ascii_alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
-    return (pm_encoding_ascii_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT);
+pm_encoding_ascii_alpha_char(const uint8_t *b, ptrdiff_t n) {
+    return (n > 0) ? (pm_encoding_ascii_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT) : 0;
 }
 
 /**
@@ -3944,7 +4058,7 @@ pm_encoding_ascii_alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t
  */
 static size_t
 pm_encoding_ascii_alpha_char_7bit(const uint8_t *b, ptrdiff_t n) {
-    return (*b < 0x80) ? pm_encoding_ascii_alpha_char(b, n) : 0;
+    return ((n > 0) && (*b < 0x80)) ? pm_encoding_ascii_alpha_char(b, n) : 0;
 }
 
 /**
@@ -3952,8 +4066,8 @@ pm_encoding_ascii_alpha_char_7bit(const uint8_t *b, ptrdiff_t n) {
  * alphanumeric character.
  */
 static size_t
-pm_encoding_ascii_alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
-    return (pm_encoding_ascii_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT) ? 1 : 0;
+pm_encoding_ascii_alnum_char(const uint8_t *b, ptrdiff_t n) {
+    return ((n > 0) && (pm_encoding_ascii_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0;
 }
 
 /**
@@ -3963,7 +4077,7 @@ pm_encoding_ascii_alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t
  */
 static size_t
 pm_encoding_ascii_alnum_char_7bit(const uint8_t *b, ptrdiff_t n) {
-    return (*b < 0x80) ? pm_encoding_ascii_alnum_char(b, n) : 0;
+    return ((n > 0) && (*b < 0x80)) ? pm_encoding_ascii_alnum_char(b, n) : 0;
 }
 
 /**
@@ -3971,27 +4085,137 @@ pm_encoding_ascii_alnum_char_7bit(const uint8_t *b, ptrdiff_t n) {
  * character.
  */
 static bool
-pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
-    return (pm_encoding_ascii_table[*b] & PRISM_ENCODING_UPPERCASE_BIT);
+pm_encoding_ascii_isupper_char(const uint8_t *b, ptrdiff_t n) {
+    return (n > 0) && (pm_encoding_ascii_table[*b] & PRISM_ENCODING_UPPERCASE_BIT);
 }
 
 /**
- * Certain encodings are equivalent to ASCII below 0x80, so it works for our
- * purposes to have a function here that first checks the bounds and then falls
- * back to checking the ASCII lookup table.
+ * For a lot of encodings the default is that they are a single byte long no
+ * matter what the codepoint, so this function is shared between them.
+ */
+static size_t
+pm_encoding_single_char_width(PRISM_UNUSED const uint8_t *b, PRISM_UNUSED ptrdiff_t n) {
+    return 1;
+}
+
+/**
+ * Returns the byte width (1-3) of the EUC-JP character at the start of b, or 0
+ * if the first n bytes of b do not begin with a decodable character.
+ */
+static size_t
+pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) {
+    // Single byte: 7-bit ASCII.
+    if ((n > 0) && (*b < 0x80)) {
+        return 1;
+    }
+
+    // Double byte: lead 0x8E or 0xA1-0xFE, then a trail byte in 0xA1-0xFE.
+    if ((n > 1) && ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) && (b[1] >= 0xA1 && b[1] <= 0xFE)) {
+        return 2;
+    }
+
+    // Triple byte: lead 0x8F, then two trail bytes that are each in 0xA1-0xFE.
+    if ((n > 2) && (b[0] == 0x8F) && (b[1] >= 0xA1 && b[1] <= 0xFE) && (b[2] >= 0xA1 && b[2] <= 0xFE)) {
+        return 3;
+    }
+
+    return 0;
+}
+
+/**
+ * Returns true if the next EUC-JP character in b is an uppercase character, or
+ * false otherwise (including when no character can be decoded from n bytes).
  */
 static bool
-pm_encoding_ascii_isupper_char_7bit(const uint8_t *b, ptrdiff_t n) {
-    return (*b < 0x80) && pm_encoding_ascii_isupper_char(b, n);
+pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) {
+    size_t width = pm_encoding_euc_jp_char_width(b, n);
+
+    if (width == 1) {
+        return pm_encoding_ascii_isupper_char(b, n);
+    } else if (width == 2) {
+        return (
+            (b[0] == 0xA3 && b[1] >= 0xC1 && b[1] <= 0xDA) || // presumably fullwidth Latin capitals - TODO confirm
+            (b[0] == 0xA6 && b[1] >= 0xA1 && b[1] <= 0xB8) || // presumably Greek capitals - TODO confirm
+            (b[0] == 0xA7 && b[1] >= 0xA1 && b[1] <= 0xC1) // presumably Cyrillic capitals - TODO confirm
+        );
+    } else { // width is 0 (undecodable) or 3: never reported as uppercase
+        return false;
+    }
 }
 
 /**
- * For a lot of encodings the default is that they are a single byte long no
- * matter what the codepoint, so this function is shared between them.
+ * Returns the byte width (1 or 2) of the Shift_JIS character at the start of b,
+ * or 0 if n is 0 or the bytes do not begin with a decodable character.
  */
 static size_t
-pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
-    return 1;
+pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) {
+    if (n == 0) { // empty input: b[0] must not be read below
+        return 0;
+    }
+    // Single byte: ASCII (< 0x80) or a byte in 0xA1-0xDF.
+    if (b[0] < 0x80 || (b[0] >= 0xA1 && b[0] <= 0xDF)) {
+        return 1;
+    }
+
+    // Double byte: lead in 0x81-0x9F or 0xE0-0xFC; trail in 0x40-0xFC, not 0x7F.
+    if ((n > 1) && ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) && (b[1] >= 0x40 && b[1] <= 0xFC && b[1] != 0x7F)) {
+        return 2;
+    }
+
+    return 0;
+}
+
+/**
+ * Shift_JIS alphanumeric check. Double-byte characters and single bytes >= 0x80
+ * always count; ASCII bytes use the ASCII table. Yields the width, or 0 if not.
+ */
+static size_t
+pm_encoding_shift_jis_alnum_char(const uint8_t *b, ptrdiff_t n) {
+    const size_t decoded = pm_encoding_shift_jis_char_width(b, n);
+    return (decoded != 1) ? decoded : (size_t) ((b[0] >= 0x80) || pm_encoding_ascii_alnum_char(b, n));
+}
+
+/**
+ * Shift_JIS alphabetic check. Double-byte characters and single bytes >= 0x80
+ * always count; ASCII bytes use the ASCII table. Yields the width, or 0 if not.
+ */
+static size_t
+pm_encoding_shift_jis_alpha_char(const uint8_t *b, ptrdiff_t n) {
+    const size_t decoded = pm_encoding_shift_jis_char_width(b, n);
+    return (decoded != 1) ? decoded : (size_t) ((b[0] >= 0x80) || pm_encoding_ascii_alpha_char(b, n));
+}
+
+/**
+ * Returns true if the next Shift_JIS character in b is an uppercase character,
+ * or false otherwise (including when no character can be decoded from n bytes).
+ */
+static bool
+pm_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) {
+    size_t width = pm_encoding_shift_jis_char_width(b, n);
+
+    if (width == 1) {
+        return pm_encoding_ascii_isupper_char(b, n);
+    } else if (width == 2) {
+        return (
+            ((b[0] == 0x82) && (b[1] >= 0x60 && b[1] <= 0x79)) ||
+            ((b[0] == 0x83) && (b[1] >= 0x9F && b[1] <= 0xB6)) ||
+            ((b[0] == 0x84) && (b[1] >= 0x40 && b[1] <= 0x60))
+        );
+    } else {
+        return false; // width is 0 here: nothing could be decoded
+    }
+}
+
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
+
+/**
+ * Certain encodings are equivalent to ASCII below 0x80, so it works for our
+ * purposes to have a function here that first checks the bounds and then falls
+ * back to checking the ASCII lookup table.
+ */
+static bool
+pm_encoding_ascii_isupper_char_7bit(const uint8_t *b, ptrdiff_t n) {
+    return (n > 0) && (*b < 0x80) && pm_encoding_ascii_isupper_char(b, n);
 }
 
 /**
@@ -4001,7 +4225,7 @@ pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATT
 static size_t
 pm_encoding_big5_char_width(const uint8_t *b, ptrdiff_t n) {
     // These are the single byte characters.
-    if (*b < 0x80) {
+    if ((n > 0) && (*b < 0x80)) {
         return 1;
     }
 
@@ -4020,7 +4244,7 @@ pm_encoding_big5_char_width(const uint8_t *b, ptrdiff_t n) {
 static size_t
 pm_encoding_cp949_char_width(const uint8_t *b, ptrdiff_t n) {
     // These are the single byte characters
-    if (*b <= 0x80) {
+    if ((n > 0) && (*b <= 0x80)) {
         return 1;
     }
 
@@ -4039,7 +4263,7 @@ pm_encoding_cp949_char_width(const uint8_t *b, ptrdiff_t n) {
 static size_t
 pm_encoding_emacs_mule_char_width(const uint8_t *b, ptrdiff_t n) {
     // These are the 1 byte characters.
-    if (*b < 0x80) {
+    if ((n > 0) && (*b < 0x80)) {
         return 1;
     }
 
@@ -4076,58 +4300,13 @@ pm_encoding_emacs_mule_char_width(const uint8_t *b, ptrdiff_t n) {
 }
 
 /**
- * Returns the size of the next character in the EUC-JP encoding, or 0 if a
- * character cannot be decoded from the given bytes.
- */
-static size_t
-pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) {
-    // These are the single byte characters.
-    if (*b < 0x80) {
-        return 1;
-    }
-
-    // These are the double byte characters.
-    if ((n > 1) && ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) && (b[1] >= 0xA1 && b[1] <= 0xFE)) {
-        return 2;
-    }
-
-    // These are the triple byte characters.
-    if ((n > 2) && (b[0] == 0x8F) && (b[1] >= 0xA1 && b[2] <= 0xFE) && (b[2] >= 0xA1 && b[2] <= 0xFE)) {
-        return 3;
-    }
-
-    return 0;
-}
-
-/**
- * Returns the size of the next character in the EUC-JP encoding if it is an
- * uppercase character.
- */
-static bool
-pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) {
-    size_t width = pm_encoding_euc_jp_char_width(b, n);
-
-    if (width == 1) {
-        return pm_encoding_ascii_isupper_char(b, n);
-    } else if (width == 2) {
-        return (
-            (b[0] == 0xA3 && b[1] >= 0xC1 && b[1] <= 0xDA) ||
-            (b[0] == 0xA6 && b[1] >= 0xA1 && b[1] <= 0xB8) ||
-            (b[0] == 0xA7 && b[1] >= 0xA1 && b[1] <= 0xC1)
-        );
-    } else {
-        return false;
-    }
-}
-
-/**
  * Returns the size of the next character in the EUC-KR encoding, or 0 if a
  * character cannot be decoded from the given bytes.
  */
 static size_t
 pm_encoding_euc_kr_char_width(const uint8_t *b, ptrdiff_t n) {
     // These are the single byte characters.
-    if (*b < 0x80) {
+    if ((n > 0) && (*b < 0x80)) {
         return 1;
     }
 
@@ -4146,7 +4325,7 @@ pm_encoding_euc_kr_char_width(const uint8_t *b, ptrdiff_t n) {
 static size_t
 pm_encoding_euc_tw_char_width(const uint8_t *b, ptrdiff_t n) {
     // These are the single byte characters.
-    if (*b < 0x80) {
+    if ((n > 0) && (*b < 0x80)) {
         return 1;
     }
 
@@ -4170,7 +4349,7 @@ pm_encoding_euc_tw_char_width(const uint8_t *b, ptrdiff_t n) {
 static size_t
 pm_encoding_gb18030_char_width(const uint8_t *b, ptrdiff_t n) {
     // These are the 1 byte characters.
-    if (*b < 0x80) {
+    if ((n > 0) && (*b < 0x80)) {
         return 1;
     }
 
@@ -4194,7 +4373,7 @@ pm_encoding_gb18030_char_width(const uint8_t *b, ptrdiff_t n) {
 static size_t
 pm_encoding_gbk_char_width(const uint8_t *b, ptrdiff_t n) {
     // These are the single byte characters.
-    if (*b <= 0x80) {
+    if ((n > 0) && (*b <= 0x80)) {
         return 1;
     }
 
@@ -4218,65 +4397,7 @@ pm_encoding_gbk_char_width(const uint8_t *b, ptrdiff_t n) {
     return 0;
 }
 
-/**
- * Returns the size of the next character in the Shift_JIS encoding, or 0 if a
- * character cannot be decoded from the given bytes.
- */
-static size_t
-pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) {
-    // These are the single byte characters.
-    if (b[0] < 0x80 || (b[0] >= 0xA1 && b[0] <= 0xDF)) {
-        return 1;
-    }
-
-    // These are the double byte characters.
-    if ((n > 1) && ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) && (b[1] >= 0x40 && b[1] <= 0xFC && b[1] != 0x7F)) {
-        return 2;
-    }
-
-    return 0;
-}
-
-/**
- * Returns the size of the next character in the Shift_JIS encoding if it is an
- * alphanumeric character.
- */
-static size_t
-pm_encoding_shift_jis_alnum_char(const uint8_t *b, ptrdiff_t n) {
-    size_t width = pm_encoding_shift_jis_char_width(b, n);
-    return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alnum_char(b, n)) : width;
-}
-
-/**
- * Returns the size of the next character in the Shift_JIS encoding if it is an
- * alphabetical character.
- */
-static size_t
-pm_encoding_shift_jis_alpha_char(const uint8_t *b, ptrdiff_t n) {
-    size_t width = pm_encoding_shift_jis_char_width(b, n);
-    return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alpha_char(b, n)) : width;
-}
-
-/**
- * Returns the size of the next character in the Shift_JIS encoding if it is an
- * uppercase character.
- */
-static bool
-pm_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) {
-    size_t width = pm_encoding_shift_jis_char_width(b, n);
-
-    if (width == 1) {
-        return pm_encoding_ascii_isupper_char(b, n);
-    } else if (width == 2) {
-        return (
-            ((b[0] == 0x82) && (b[1] >= 0x60 && b[1] <= 0x79)) ||
-            ((b[0] == 0x83) && (b[1] >= 0x9F && b[1] <= 0xB6)) ||
-            ((b[0] == 0x84) && (b[1] >= 0x40 && b[1] <= 0x60))
-        );
-    } else {
-        return width;
-    }
-}
+#endif
 
 /**
  * This is the table of all of the encodings that prism supports.
@@ -4290,6 +4411,14 @@ const pm_encoding_t pm_encodings[] = {
         .isupper_char = pm_encoding_utf_8_isupper_char,
         .multibyte = true
     },
+    [PM_ENCODING_US_ASCII] = {
+        .name = "US-ASCII",
+        .char_width = pm_encoding_ascii_char_width,
+        .alnum_char = pm_encoding_ascii_alnum_char,
+        .alpha_char = pm_encoding_ascii_alpha_char,
+        .isupper_char = pm_encoding_ascii_isupper_char,
+        .multibyte = false
+    },
     [PM_ENCODING_ASCII_8BIT] = {
         .name = "ASCII-8BIT",
         .char_width = pm_encoding_single_char_width,
@@ -4298,6 +4427,24 @@ const pm_encoding_t pm_encodings[] = {
         .isupper_char = pm_encoding_ascii_isupper_char,
         .multibyte = false
     },
+    [PM_ENCODING_EUC_JP] = {
+        .name = "EUC-JP",
+        .char_width = pm_encoding_euc_jp_char_width,
+        .alnum_char = pm_encoding_ascii_alnum_char_7bit,
+        .alpha_char = pm_encoding_ascii_alpha_char_7bit,
+        .isupper_char = pm_encoding_euc_jp_isupper_char,
+        .multibyte = true
+    },
+    [PM_ENCODING_WINDOWS_31J] = {
+        .name = "Windows-31J",
+        .char_width = pm_encoding_shift_jis_char_width,
+        .alnum_char = pm_encoding_shift_jis_alnum_char,
+        .alpha_char = pm_encoding_shift_jis_alpha_char,
+        .isupper_char = pm_encoding_shift_jis_isupper_char,
+        .multibyte = true
+    },
+
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
     [PM_ENCODING_BIG5] = {
         .name = "Big5",
         .char_width = pm_encoding_big5_char_width,
@@ -4394,14 +4541,6 @@ const pm_encoding_t pm_encodings[] = {
         .isupper_char = pm_encoding_ascii_isupper_char_7bit,
         .multibyte = true
     },
-    [PM_ENCODING_EUC_JP] = {
-        .name = "EUC-JP",
-        .char_width = pm_encoding_euc_jp_char_width,
-        .alnum_char = pm_encoding_ascii_alnum_char_7bit,
-        .alpha_char = pm_encoding_ascii_alpha_char_7bit,
-        .isupper_char = pm_encoding_euc_jp_isupper_char,
-        .multibyte = true
-    },
     [PM_ENCODING_EUC_JP_MS] = {
         .name = "eucJP-ms",
         .char_width = pm_encoding_euc_jp_char_width,
@@ -4874,14 +5013,6 @@ const pm_encoding_t pm_encodings[] = {
         .isupper_char = pm_encoding_tis_620_isupper_char,
         .multibyte = false
     },
-    [PM_ENCODING_US_ASCII] = {
-        .name = "US-ASCII",
-        .char_width = pm_encoding_ascii_char_width,
-        .alnum_char = pm_encoding_ascii_alnum_char,
-        .alpha_char = pm_encoding_ascii_alpha_char,
-        .isupper_char = pm_encoding_ascii_isupper_char,
-        .multibyte = false
-    },
     [PM_ENCODING_UTF8_MAC] = {
         .name = "UTF8-MAC",
         .char_width = pm_encoding_utf_8_char_width,
@@ -4986,14 +5117,6 @@ const pm_encoding_t pm_encodings[] = {
         .isupper_char = pm_encoding_windows_1258_isupper_char,
         .multibyte = false
     },
-    [PM_ENCODING_WINDOWS_31J] = {
-        .name = "Windows-31J",
-        .char_width = pm_encoding_shift_jis_char_width,
-        .alnum_char = pm_encoding_shift_jis_alnum_char,
-        .alpha_char = pm_encoding_shift_jis_alpha_char,
-        .isupper_char = pm_encoding_shift_jis_isupper_char,
-        .multibyte = true
-    },
     [PM_ENCODING_WINDOWS_874] = {
         .name = "Windows-874",
         .char_width = pm_encoding_single_char_width,
@@ -5002,6 +5125,7 @@ const pm_encoding_t pm_encodings[] = {
         .isupper_char = pm_encoding_windows_874_isupper_char,
         .multibyte = false
     }
+#endif
 };
 
 /**
@@ -5016,11 +5140,13 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
     // UTF-8 can contain extra information at the end about the platform it is
     // encoded on, such as UTF-8-MAC or UTF-8-UNIX. We'll ignore those suffixes.
     if ((start + 5 <= end) && (pm_strncasecmp(start, (const uint8_t *) "UTF-8", 5) == 0)) {
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
         // We need to explicitly handle UTF-8-HFS, as that one needs to switch
         // over to being UTF8-MAC.
         if (width == 9 && (pm_strncasecmp(start + 5, (const uint8_t *) "-HFS", 4) == 0)) {
             return &pm_encodings[PM_ENCODING_UTF8_MAC];
         }
+#endif
 
         // Otherwise we'll return the default UTF-8 encoding.
         return PM_ENCODING_UTF_8_ENTRY;
@@ -5040,11 +5166,16 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
                 break;
             case 'B': case 'b':
                 ENCODING1("BINARY", PM_ENCODING_ASCII_8BIT);
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                 ENCODING1("Big5", PM_ENCODING_BIG5);
                 ENCODING2("Big5-HKSCS", "Big5-HKSCS:2008", PM_ENCODING_BIG5_HKSCS);
                 ENCODING1("Big5-UAO", PM_ENCODING_BIG5_UAO);
+#endif
                 break;
             case 'C': case 'c':
+                ENCODING1("CP65001", PM_ENCODING_UTF_8);
+                ENCODING2("CP932", "csWindows31J", PM_ENCODING_WINDOWS_31J);
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                 ENCODING1("CESU-8", PM_ENCODING_CESU_8);
                 ENCODING1("CP437", PM_ENCODING_IBM437);
                 ENCODING1("CP720", PM_ENCODING_IBM720);
@@ -5064,7 +5195,6 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
                 ENCODING1("CP874", PM_ENCODING_WINDOWS_874);
                 ENCODING1("CP878", PM_ENCODING_KOI8_R);
                 ENCODING1("CP863", PM_ENCODING_IBM863);
-                ENCODING2("CP932", "csWindows31J", PM_ENCODING_WINDOWS_31J);
                 ENCODING1("CP936", PM_ENCODING_GBK);
                 ENCODING1("CP949", PM_ENCODING_CP949);
                 ENCODING1("CP950", PM_ENCODING_CP950);
@@ -5079,25 +5209,30 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
                 ENCODING1("CP1257", PM_ENCODING_WINDOWS_1257);
                 ENCODING1("CP1258", PM_ENCODING_WINDOWS_1258);
                 ENCODING1("CP51932", PM_ENCODING_CP51932);
-                ENCODING1("CP65001", PM_ENCODING_UTF_8);
+#endif
                 break;
             case 'E': case 'e':
                 ENCODING2("EUC-JP", "eucJP", PM_ENCODING_EUC_JP);
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                 ENCODING2("eucJP-ms", "euc-jp-ms", PM_ENCODING_EUC_JP_MS);
                 ENCODING2("EUC-JIS-2004", "EUC-JISX0213", PM_ENCODING_EUC_JIS_2004);
                 ENCODING2("EUC-KR", "eucKR", PM_ENCODING_EUC_KR);
                 ENCODING2("EUC-CN", "eucCN", PM_ENCODING_GB2312);
                 ENCODING2("EUC-TW", "eucTW", PM_ENCODING_EUC_TW);
                 ENCODING1("Emacs-Mule", PM_ENCODING_EMACS_MULE);
+#endif
                 break;
             case 'G': case 'g':
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                 ENCODING1("GBK", PM_ENCODING_GBK);
                 ENCODING1("GB12345", PM_ENCODING_GB12345);
                 ENCODING1("GB18030", PM_ENCODING_GB18030);
                 ENCODING1("GB1988", PM_ENCODING_GB1988);
                 ENCODING1("GB2312", PM_ENCODING_GB2312);
+#endif
                 break;
             case 'I': case 'i':
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                 ENCODING1("IBM437", PM_ENCODING_IBM437);
                 ENCODING1("IBM720", PM_ENCODING_IBM720);
                 ENCODING1("IBM737", PM_ENCODING_IBM737);
@@ -5129,12 +5264,16 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
                 ENCODING2("ISO-8859-14", "ISO8859-14", PM_ENCODING_ISO_8859_14);
                 ENCODING2("ISO-8859-15", "ISO8859-15", PM_ENCODING_ISO_8859_15);
                 ENCODING2("ISO-8859-16", "ISO8859-16", PM_ENCODING_ISO_8859_16);
+#endif
                 break;
             case 'K': case 'k':
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                 ENCODING1("KOI8-R", PM_ENCODING_KOI8_R);
                 ENCODING1("KOI8-U", PM_ENCODING_KOI8_U);
+#endif
                 break;
             case 'M': case 'm':
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                 ENCODING1("macCentEuro", PM_ENCODING_MAC_CENT_EURO);
                 ENCODING1("macCroatian", PM_ENCODING_MAC_CROATIAN);
                 ENCODING1("macCyrillic", PM_ENCODING_MAC_CYRILLIC);
@@ -5147,31 +5286,39 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
                 ENCODING1("macThai", PM_ENCODING_MAC_THAI);
                 ENCODING1("macTurkish", PM_ENCODING_MAC_TURKISH);
                 ENCODING1("macUkraine", PM_ENCODING_MAC_UKRAINE);
+#endif
                 break;
             case 'P': case 'p':
                 ENCODING1("PCK", PM_ENCODING_WINDOWS_31J);
                 break;
             case 'S': case 's':
-                ENCODING1("Shift_JIS", PM_ENCODING_SHIFT_JIS);
                 ENCODING1("SJIS", PM_ENCODING_WINDOWS_31J);
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
+                ENCODING1("Shift_JIS", PM_ENCODING_SHIFT_JIS);
                 ENCODING1("SJIS-DoCoMo", PM_ENCODING_SJIS_DOCOMO);
                 ENCODING1("SJIS-KDDI", PM_ENCODING_SJIS_KDDI);
                 ENCODING1("SJIS-SoftBank", PM_ENCODING_SJIS_SOFTBANK);
                 ENCODING1("stateless-ISO-2022-JP", PM_ENCODING_STATELESS_ISO_2022_JP);
                 ENCODING1("stateless-ISO-2022-JP-KDDI", PM_ENCODING_STATELESS_ISO_2022_JP_KDDI);
+#endif
                 break;
             case 'T': case 't':
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                 ENCODING1("TIS-620", PM_ENCODING_TIS_620);
+#endif
                 break;
             case 'U': case 'u':
                 ENCODING1("US-ASCII", PM_ENCODING_US_ASCII);
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                 ENCODING2("UTF8-MAC", "UTF-8-HFS", PM_ENCODING_UTF8_MAC);
                 ENCODING1("UTF8-DoCoMo", PM_ENCODING_UTF8_DOCOMO);
                 ENCODING1("UTF8-KDDI", PM_ENCODING_UTF8_KDDI);
                 ENCODING1("UTF8-SoftBank", PM_ENCODING_UTF8_SOFTBANK);
+#endif
                 break;
             case 'W': case 'w':
                 ENCODING1("Windows-31J", PM_ENCODING_WINDOWS_31J);
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                 ENCODING1("Windows-874", PM_ENCODING_WINDOWS_874);
                 ENCODING1("Windows-1250", PM_ENCODING_WINDOWS_1250);
                 ENCODING1("Windows-1251", PM_ENCODING_WINDOWS_1251);
@@ -5182,6 +5329,7 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
                 ENCODING1("Windows-1256", PM_ENCODING_WINDOWS_1256);
                 ENCODING1("Windows-1257", PM_ENCODING_WINDOWS_1257);
                 ENCODING1("Windows-1258", PM_ENCODING_WINDOWS_1258);
+#endif
                 break;
             case '6':
                 ENCODING1("646", PM_ENCODING_US_ASCII);