[ruby/prism] Documentation for the encodings

https://github.com/ruby/prism/commit/52a0d80a15
author: Kevin Newton <kddnewton@gmail.com> 2023-10-31 08:54:52 -0400
committer: Kevin Newton <kddnewton@gmail.com> 2023-11-01 13:10:29 -0400
commit: 87c6fb85483b63cf976d7738eb5a2b55e6f2b1d7 (patch)
tree: f53768e37ffe9b6cac3a9092f94689b6f3fc9de3 /prism
parent: 493439c9ce8d298f3fbd2c9c01d35fcc9add6d49 (diff)
8 files changed, 513 insertions, 154 deletions
diff --git a/prism/enc/pm_big5.c b/prism/enc/pm_big5.c
index deaa3afb3f..6d4fefcf2b 100644
--- a/prism/enc/pm_big5.c
+++ b/prism/enc/pm_big5.c
@@ -42,7 +42,8 @@ pm_encoding_big5_isupper_char(const uint8_t *b, ptrdiff_t n) {
     }
 }
 
-pm_encoding_t pm_encoding_big5 = {
+/** Big5 encoding */
+const pm_encoding_t pm_encoding_big5 = {
     .name = "big5",
     .char_width = pm_encoding_big5_char_width,
     .alnum_char = pm_encoding_big5_alnum_char,
diff --git a/prism/enc/pm_encoding.h b/prism/enc/pm_encoding.h
index 232bc97dd4..28b9f02281 100644
--- a/prism/enc/pm_encoding.h
+++ b/prism/enc/pm_encoding.h
@@ -8,36 +8,50 @@
 #include <stddef.h>
 #include <stdint.h>
 
-// This struct defines the functions necessary to implement the encoding
-// interface so we can determine how many bytes the subsequent character takes.
-// Each callback should return the number of bytes, or 0 if the next bytes are
-// invalid for the encoding and type.
+/**
+ * This struct defines the functions necessary to implement the encoding
+ * interface so we can determine how many bytes the subsequent character takes.
+ * Each callback should return the number of bytes, or 0 if the next bytes are
+ * invalid for the encoding and type.
+ */
 typedef struct {
-    // Return the number of bytes that the next character takes if it is valid
-    // in the encoding. Does not read more than n bytes. It is assumed that n is
-    // at least 1.
+    /**
+     * Return the number of bytes that the next character takes if it is valid
+     * in the encoding. Does not read more than n bytes. It is assumed that n is
+     * at least 1.
+     */
     size_t (*char_width)(const uint8_t *b, ptrdiff_t n);
 
-    // Return the number of bytes that the next character takes if it is valid
-    // in the encoding and is alphabetical. Does not read more than n bytes. It
-    // is assumed that n is at least 1.
+    /**
+     * Return the number of bytes that the next character takes if it is valid
+     * in the encoding and is alphabetical. Does not read more than n bytes. It
+     * is assumed that n is at least 1.
+     */
     size_t (*alpha_char)(const uint8_t *b, ptrdiff_t n);
 
-    // Return the number of bytes that the next character takes if it is valid
-    // in the encoding and is alphanumeric. Does not read more than n bytes. It
-    // is assumed that n is at least 1.
+    /**
+     * Return the number of bytes that the next character takes if it is valid
+     * in the encoding and is alphanumeric. Does not read more than n bytes. It
+     * is assumed that n is at least 1.
+     */
     size_t (*alnum_char)(const uint8_t *b, ptrdiff_t n);
 
-    // Return true if the next character is valid in the encoding and is an
-    // uppercase character. Does not read more than n bytes. It is assumed that
-    // n is at least 1.
+    /**
+     * Return true if the next character is valid in the encoding and is an
+     * uppercase character. Does not read more than n bytes. It is assumed that
+     * n is at least 1.
+     */
     bool (*isupper_char)(const uint8_t *b, ptrdiff_t n);
 
-    // The name of the encoding. This should correspond to a value that can be
-    // passed to Encoding.find in Ruby.
+    /**
+     * The name of the encoding. This should correspond to a value that can be
+     * passed to Encoding.find in Ruby.
+     */
     const char *name;
 
-    // Return true if the encoding is a multibyte encoding.
+    /**
+     * True if the encoding is a multibyte encoding, false otherwise.
+     */
     bool multibyte;
 } pm_encoding_t;
 
@@ -47,50 +61,109 @@ typedef struct {
 #define PRISM_ENCODING_ALPHANUMERIC_BIT 1 << 1
 #define PRISM_ENCODING_UPPERCASE_BIT 1 << 2
 
-// These functions are reused by some other encodings, so they are defined here
-// so they can be shared.
+/**
+ * Return the size of the next character in the ASCII encoding if it is an
+ * alphabetical character.
+ *
+ * @param b The bytes to read.
+ * @param n The number of bytes that can be read.
+ * @returns The number of bytes that the next character takes if it is valid in
+ *     the encoding, or 0 if it is not.
+ */
 size_t pm_encoding_ascii_alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);
+
+/**
+ * Return the size of the next character in the ASCII encoding if it is an
+ * alphanumeric character.
+ *
+ * @param b The bytes to read.
+ * @param n The number of bytes that can be read.
+ * @returns The number of bytes that the next character takes if it is valid in
+ *     the encoding, or 0 if it is not.
+ */
 size_t pm_encoding_ascii_alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);
+
+/**
+ * Return true if the next character in the ASCII encoding is an uppercase
+ * character.
+ *
+ * @param b The bytes to read.
+ * @param n The number of bytes that can be read.
+ * @returns True if the next character is valid in the encoding and is an
+ *     uppercase character, or false if it is not.
+ */
 bool pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);
 
-// These functions are shared between the actual encoding and the fast path in
-// the parser so they need to be internally visible.
+/**
+ * Return the size of the next character in the UTF-8 encoding if it is an
+ * alphabetical character.
+ *
+ * @param b The bytes to read.
+ * @param n The number of bytes that can be read.
+ * @returns The number of bytes that the next character takes if it is valid in
+ *     the encoding, or 0 if it is not.
+ */
 size_t pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n);
+
+/**
+ * Return the size of the next character in the UTF-8 encoding if it is an
+ * alphanumeric character.
+ *
+ * @param b The bytes to read.
+ * @param n The number of bytes that can be read.
+ * @returns The number of bytes that the next character takes if it is valid in
+ *     the encoding, or 0 if it is not.
+ */
 size_t pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n);
+
+/**
+ * Return true if the next character in the UTF-8 encoding is an uppercase
+ * character.
+ *
+ * @param b The bytes to read.
+ * @param n The number of bytes that can be read.
+ * @returns True if the next character is valid in the encoding and is an
+ *     uppercase character, or false if it is not.
+ */
 bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n);
 
-// This lookup table is referenced in both the UTF-8 encoding file and the
-// parser directly in order to speed up the default encoding processing.
+/**
+ * This lookup table is referenced in both the UTF-8 encoding file and the
+ * parser directly in order to speed up the default encoding processing. It is
+ * used to indicate whether a character is alphabetical, alphanumeric, or
+ * uppercase in unicode mappings.
+ */
 extern const uint8_t pm_encoding_unicode_table[256];
 
-// These are the encodings that are supported by the parser. They are defined in
+// Below are the encodings that are supported by the parser. They are defined in
 // their own files in the src/enc directory.
-extern pm_encoding_t pm_encoding_ascii;
-extern pm_encoding_t pm_encoding_ascii_8bit;
-extern pm_encoding_t pm_encoding_big5;
-extern pm_encoding_t pm_encoding_euc_jp;
-extern pm_encoding_t pm_encoding_gbk;
-extern pm_encoding_t pm_encoding_iso_8859_1;
-extern pm_encoding_t pm_encoding_iso_8859_2;
-extern pm_encoding_t pm_encoding_iso_8859_3;
-extern pm_encoding_t pm_encoding_iso_8859_4;
-extern pm_encoding_t pm_encoding_iso_8859_5;
-extern pm_encoding_t pm_encoding_iso_8859_6;
-extern pm_encoding_t pm_encoding_iso_8859_7;
-extern pm_encoding_t pm_encoding_iso_8859_8;
-extern pm_encoding_t pm_encoding_iso_8859_9;
-extern pm_encoding_t pm_encoding_iso_8859_10;
-extern pm_encoding_t pm_encoding_iso_8859_11;
-extern pm_encoding_t pm_encoding_iso_8859_13;
-extern pm_encoding_t pm_encoding_iso_8859_14;
-extern pm_encoding_t pm_encoding_iso_8859_15;
-extern pm_encoding_t pm_encoding_iso_8859_16;
-extern pm_encoding_t pm_encoding_koi8_r;
-extern pm_encoding_t pm_encoding_shift_jis;
-extern pm_encoding_t pm_encoding_utf_8;
-extern pm_encoding_t pm_encoding_utf8_mac;
-extern pm_encoding_t pm_encoding_windows_31j;
-extern pm_encoding_t pm_encoding_windows_1251;
-extern pm_encoding_t pm_encoding_windows_1252;
+
+const extern pm_encoding_t pm_encoding_ascii;
+const extern pm_encoding_t pm_encoding_ascii_8bit;
+const extern pm_encoding_t pm_encoding_big5;
+const extern pm_encoding_t pm_encoding_euc_jp;
+const extern pm_encoding_t pm_encoding_gbk;
+const extern pm_encoding_t pm_encoding_iso_8859_1;
+const extern pm_encoding_t pm_encoding_iso_8859_2;
+const extern pm_encoding_t pm_encoding_iso_8859_3;
+const extern pm_encoding_t pm_encoding_iso_8859_4;
+const extern pm_encoding_t pm_encoding_iso_8859_5;
+const extern pm_encoding_t pm_encoding_iso_8859_6;
+const extern pm_encoding_t pm_encoding_iso_8859_7;
+const extern pm_encoding_t pm_encoding_iso_8859_8;
+const extern pm_encoding_t pm_encoding_iso_8859_9;
+const extern pm_encoding_t pm_encoding_iso_8859_10;
+const extern pm_encoding_t pm_encoding_iso_8859_11;
+const extern pm_encoding_t pm_encoding_iso_8859_13;
+const extern pm_encoding_t pm_encoding_iso_8859_14;
+const extern pm_encoding_t pm_encoding_iso_8859_15;
+const extern pm_encoding_t pm_encoding_iso_8859_16;
+const extern pm_encoding_t pm_encoding_koi8_r;
+const extern pm_encoding_t pm_encoding_shift_jis;
+const extern pm_encoding_t pm_encoding_utf_8;
+const extern pm_encoding_t pm_encoding_utf8_mac;
+const extern pm_encoding_t pm_encoding_windows_31j;
+const extern pm_encoding_t pm_encoding_windows_1251;
+const extern pm_encoding_t pm_encoding_windows_1252;
 
 #endif
diff --git a/prism/enc/pm_euc_jp.c b/prism/enc/pm_euc_jp.c
index 13d3662455..cd0b1a7910 100644
--- a/prism/enc/pm_euc_jp.c
+++ b/prism/enc/pm_euc_jp.c
@@ -48,7 +48,8 @@ pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) {
     }
 }
 
-pm_encoding_t pm_encoding_euc_jp = {
+/** EUC-JP encoding */
+const pm_encoding_t pm_encoding_euc_jp = {
     .name = "euc-jp",
     .char_width = pm_encoding_euc_jp_char_width,
     .alnum_char = pm_encoding_euc_jp_alnum_char,
diff --git a/prism/enc/pm_gbk.c b/prism/enc/pm_gbk.c
index 2fc67b47a4..3dcf41fb99 100644
--- a/prism/enc/pm_gbk.c
+++ b/prism/enc/pm_gbk.c
@@ -51,7 +51,8 @@ pm_encoding_gbk_isupper_char(const uint8_t *b, ptrdiff_t n) {
     }
 }
 
-pm_encoding_t pm_encoding_gbk = {
+/** GBK encoding */
+const pm_encoding_t pm_encoding_gbk = {
     .name = "gbk",
     .char_width = pm_encoding_gbk_char_width,
     .alnum_char = pm_encoding_gbk_alnum_char,
diff --git a/prism/enc/pm_shift_jis.c b/prism/enc/pm_shift_jis.c
index 3c93937efc..ecc3d51b87 100644
--- a/prism/enc/pm_shift_jis.c
+++ b/prism/enc/pm_shift_jis.c
@@ -46,7 +46,8 @@ pm_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) {
     }
 }
 
-pm_encoding_t pm_encoding_shift_jis = {
+/** Shift_JIS encoding */
+const pm_encoding_t pm_encoding_shift_jis = {
     .name = "shift_jis",
     .char_width = pm_encoding_shift_jis_char_width,
     .alnum_char = pm_encoding_shift_jis_alnum_char,
diff --git a/prism/enc/pm_tables.c b/prism/enc/pm_tables.c
index c6bb4dce65..c2133649a4 100644
--- a/prism/enc/pm_tables.c
+++ b/prism/enc/pm_tables.c
@@ -1,7 +1,9 @@
 #include "prism/enc/pm_encoding.h"
 
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ASCII character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ASCII character.
+ */
 static uint8_t pm_encoding_ascii_table[256] = {
 //  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -22,8 +24,10 @@ static uint8_t pm_encoding_ascii_table[256] = {
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
 };
 
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-1 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-1 character.
+ */
 static uint8_t pm_encoding_iso_8859_1_table[256] = {
 //  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -44,8 +48,10 @@ static uint8_t pm_encoding_iso_8859_1_table[256] = {
     3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
 };
 
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-2 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-2 character.
+ */
 static uint8_t pm_encoding_iso_8859_2_table[256] = {
 //  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -66,8 +72,10 @@ static uint8_t pm_encoding_iso_8859_2_table[256] = {
     3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
 };
 
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-3 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-3 character.
+ */
 static uint8_t pm_encoding_iso_8859_3_table[256] = {
 //  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -88,8 +96,10 @@ static uint8_t pm_encoding_iso_8859_3_table[256] = {
     0, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
 };
 
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-4 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-4 character.
+ */
 static uint8_t pm_encoding_iso_8859_4_table[256] = {
 //  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -110,8 +120,10 @@ static uint8_t pm_encoding_iso_8859_4_table[256] = {
     3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
 };
 
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-5 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-5 character.
+ */
 static uint8_t pm_encoding_iso_8859_5_table[256] = {
 //  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -132,8 +144,10 @@ static uint8_t pm_encoding_iso_8859_5_table[256] = {
     0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, // Fx
 };
 
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-6 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-6 character.
+ */
 static uint8_t pm_encoding_iso_8859_6_table[256] = {
 //  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -154,8 +168,10 @@ static uint8_t pm_encoding_iso_8859_6_table[256] = {
     3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
 };
 
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-7 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-7 character.
+ */
 static uint8_t pm_encoding_iso_8859_7_table[256] = {
 //  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -176,8 +192,10 @@ static uint8_t pm_encoding_iso_8859_7_table[256] = {
     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
 };
 
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-8 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-8 character.
+ */
 static uint8_t pm_encoding_iso_8859_8_table[256] = {
 //  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -198,8 +216,10 @@ static uint8_t pm_encoding_iso_8859_8_table[256] = {
     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // Fx
 };
 
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-9 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-9 character.
+ */
 static uint8_t pm_encoding_iso_8859_9_table[256] = {
 //  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -220,8 +240,10 @@ static uint8_t pm_encoding_iso_8859_9_table[256] = {
     3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
 };
 
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-10 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-10 character.
+ */
 static uint8_t pm_encoding_iso_8859_10_table[256] = {
 //  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -242,8 +264,10 @@ static uint8_t pm_encoding_iso_8859_10_table[256] = {
     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
 };
 
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-11 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-11 character.
+ */
 static uint8_t pm_encoding_iso_8859_11_table[256] = {
 //  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -264,8 +288,10 @@ static uint8_t pm_encoding_iso_8859_11_table[256] = {
     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, // Fx
 };
 
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-13 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-13 character.
+ */
 static uint8_t pm_encoding_iso_8859_13_table[256] = {
 //  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -286,8 +312,10 @@ static uint8_t pm_encoding_iso_8859_13_table[256] = {
     3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
 };
 
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-14 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-14 character.
+ */
 static uint8_t pm_encoding_iso_8859_14_table[256] = {
 //  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -308,8 +336,10 @@ static uint8_t pm_encoding_iso_8859_14_table[256] = {
     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
 };
 
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-15 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-15 character.
+ */
 static uint8_t pm_encoding_iso_8859_15_table[256] = {
 //  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -330,8 +360,10 @@ static uint8_t pm_encoding_iso_8859_15_table[256] = {
     3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
 };
 
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-16 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-16 character.
+ */
 static uint8_t pm_encoding_iso_8859_16_table[256] = {
 //  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -352,8 +384,10 @@ static uint8_t pm_encoding_iso_8859_16_table[256] = {
     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
 };
 
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding KOI8-R character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding KOI8-R character.
+ */
 static uint8_t pm_encoding_koi8_r_table[256] = {
 //  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -374,8 +408,10 @@ static uint8_t pm_encoding_koi8_r_table[256] = {
     7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Fx
 };
 
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding windows-1251 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding windows-1251 character.
+ */
 static uint8_t pm_encoding_windows_1251_table[256] = {
 //  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -396,8 +432,10 @@ static uint8_t pm_encoding_windows_1251_table[256] = {
     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
 };
 
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding windows-1252 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding windows-1252 character.
+ */
 static uint8_t pm_encoding_windows_1252_table[256] = {
 //  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -418,37 +456,94 @@ static uint8_t pm_encoding_windows_1252_table[256] = {
     3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
 };
 
+/**
+ * Returns the size of the next character in the ASCII encoding. This basically
+ * means that if the top bit is not set, the character is 1 byte long.
+ */
 static size_t
 pm_encoding_ascii_char_width(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
     return *b < 0x80 ? 1 : 0;
 }
 
+/**
+ * Return the size of the next character in the ASCII encoding if it is an
+ * alphabetical character.
+ */
 size_t
 pm_encoding_ascii_alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
     return (pm_encoding_ascii_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT);
 }
 
+/**
+ * Return the size of the next character in the ASCII encoding if it is an
+ * alphanumeric character.
+ */
 size_t
 pm_encoding_ascii_alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
     return (pm_encoding_ascii_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT) ? 1 : 0;
 }
 
+/**
+ * Return true if the next character in the ASCII encoding is an uppercase
+ * character.
+ */
 bool
 pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
     return (pm_encoding_ascii_table[*b] & PRISM_ENCODING_UPPERCASE_BIT);
 }
 
+/**
+ * For a lot of encodings the default is that they are a single byte long no
+ * matter what the codepoint, so this function is shared between them.
+ */
 static size_t
-pm_encoding_koi8_r_char_width(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
-    return ((*b >= 0x20 && *b <= 0x7E) || (*b >= 0x80)) ? 1 : 0;
+pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
+    return 1;
 }
 
+/**
+ * Returns the size of the next character in the KOI8-R encoding. This means
+ * checking if it's a valid codepoint in KOI8-R and, if it is, returning 1.
+ */
 static size_t
-pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
-    return 1;
+pm_encoding_koi8_r_char_width(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
+    return ((*b >= 0x20 && *b <= 0x7E) || (*b >= 0x80)) ? 1 : 0;
 }
 
-pm_encoding_t pm_encoding_ascii = {
+#define PRISM_ENCODING_TABLE(name) \
+    static size_t pm_encoding_ ##name ## _alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {           \
+        return (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_ALPHABETIC_BIT);           \
+    }                                                                                                         \
+    static size_t pm_encoding_ ##name ## _alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {           \
+        return (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT) ? 1 : 0; \
+    }                                                                                                         \
+    static bool pm_encoding_ ##name ## _isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {           \
+        return (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_UPPERCASE_BIT);            \
+    }
+
+PRISM_ENCODING_TABLE(iso_8859_1)
+PRISM_ENCODING_TABLE(iso_8859_2)
+PRISM_ENCODING_TABLE(iso_8859_3)
+PRISM_ENCODING_TABLE(iso_8859_4)
+PRISM_ENCODING_TABLE(iso_8859_5)
+PRISM_ENCODING_TABLE(iso_8859_6)
+PRISM_ENCODING_TABLE(iso_8859_7)
+PRISM_ENCODING_TABLE(iso_8859_8)
+PRISM_ENCODING_TABLE(iso_8859_9)
+PRISM_ENCODING_TABLE(iso_8859_10)
+PRISM_ENCODING_TABLE(iso_8859_11)
+PRISM_ENCODING_TABLE(iso_8859_13)
+PRISM_ENCODING_TABLE(iso_8859_14)
+PRISM_ENCODING_TABLE(iso_8859_15)
+PRISM_ENCODING_TABLE(iso_8859_16)
+PRISM_ENCODING_TABLE(koi8_r)
+PRISM_ENCODING_TABLE(windows_1251)
+PRISM_ENCODING_TABLE(windows_1252)
+
+#undef PRISM_ENCODING_TABLE
+
+/** ASCII encoding */
+const pm_encoding_t pm_encoding_ascii = {
     .name = "ascii",
     .char_width = pm_encoding_ascii_char_width,
     .alnum_char = pm_encoding_ascii_alnum_char,
@@ -457,7 +552,8 @@ pm_encoding_t pm_encoding_ascii = {
     .multibyte = false
 };
 
-pm_encoding_t pm_encoding_ascii_8bit = {
+/** ASCII-8BIT encoding */
+const pm_encoding_t pm_encoding_ascii_8bit = {
     .name = "ascii-8bit",
     .char_width = pm_encoding_single_char_width,
     .alnum_char = pm_encoding_ascii_alnum_char,
@@ -466,42 +562,182 @@ pm_encoding_t pm_encoding_ascii_8bit = {
     .multibyte = false
 };
 
-#define PRISM_ENCODING_TABLE(s, i, w) \
-    static size_t pm_encoding_ ##i ## _alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {           \
-        return (pm_encoding_ ##i ## _table[*b] & PRISM_ENCODING_ALPHABETIC_BIT);           \
-    }                                                                                                         \
-    static size_t pm_encoding_ ##i ## _alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {           \
-        return (pm_encoding_ ##i ## _table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT) ? 1 : 0; \
-    }                                                                                                         \
-    static bool pm_encoding_ ##i ## _isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {           \
-        return (pm_encoding_ ##i ## _table[*b] & PRISM_ENCODING_UPPERCASE_BIT);            \
-    }                                                                                                         \
-    pm_encoding_t pm_encoding_ ##i = {                                                                        \
-        .name = s,                                                                                            \
-        .char_width = w,                                                                                      \
-        .alnum_char = pm_encoding_ ##i ## _alnum_char,                                                        \
-        .alpha_char = pm_encoding_ ##i ## _alpha_char,                                                        \
-        .isupper_char = pm_encoding_ ##i ## _isupper_char,                                                    \
-        .multibyte = false,                                                                                   \
-    };
-
-PRISM_ENCODING_TABLE("iso-8859-1", iso_8859_1, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-2", iso_8859_2, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-3", iso_8859_3, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-4", iso_8859_4, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-5", iso_8859_5, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-6", iso_8859_6, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-7", iso_8859_7, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-8", iso_8859_8, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-9", iso_8859_9, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-10", iso_8859_10, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-11", iso_8859_11, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-13", iso_8859_13, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-14", iso_8859_14, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-15", iso_8859_15, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-16", iso_8859_16, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("koi8-r", koi8_r, pm_encoding_koi8_r_char_width)
-PRISM_ENCODING_TABLE("windows-1251", windows_1251, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("windows-1252", windows_1252, pm_encoding_single_char_width)
+/** ISO-8859-1 */
+const pm_encoding_t pm_encoding_iso_8859_1 = {
+    .name = "iso-8859-1",
+    .char_width = pm_encoding_single_char_width,
+    .alnum_char = pm_encoding_iso_8859_1_alnum_char,
+    .alpha_char = pm_encoding_iso_8859_1_alpha_char,
+    .isupper_char = pm_encoding_iso_8859_1_isupper_char,
+    .multibyte = false
+};
 
-#undef PRISM_ENCODING_TABLE
+/** ISO-8859-2 */
+const pm_encoding_t pm_encoding_iso_8859_2 = {
+    .name = "iso-8859-2",
+    .char_width = pm_encoding_single_char_width,
+    .alnum_char = pm_encoding_iso_8859_2_alnum_char,
+    .alpha_char = pm_encoding_iso_8859_2_alpha_char,
+    .isupper_char = pm_encoding_iso_8859_2_isupper_char,
+    .multibyte = false
+};
+
+/** ISO-8859-3 */
+const pm_encoding_t pm_encoding_iso_8859_3 = {
+    .name = "iso-8859-3",
+    .char_width = pm_encoding_single_char_width,
+    .alnum_char = pm_encoding_iso_8859_3_alnum_char,
+    .alpha_char = pm_encoding_iso_8859_3_alpha_char,
+    .isupper_char = pm_encoding_iso_8859_3_isupper_char,
+    .multibyte = false
+};
+
+/** ISO-8859-4 */
+const pm_encoding_t pm_encoding_iso_8859_4 = {
+    .name = "iso-8859-4",
+    .char_width = pm_encoding_single_char_width,
+    .alnum_char = pm_encoding_iso_8859_4_alnum_char,
+    .alpha_char = pm_encoding_iso_8859_4_alpha_char,
+    .isupper_char = pm_encoding_iso_8859_4_isupper_char,
+    .multibyte = false
+};
+
+/** ISO-8859-5 */
+const pm_encoding_t pm_encoding_iso_8859_5 = {
+    .name = "iso-8859-5",
+    .char_width = pm_encoding_single_char_width,
+    .alnum_char = pm_encoding_iso_8859_5_alnum_char,
+    .alpha_char = pm_encoding_iso_8859_5_alpha_char,
+    .isupper_char = pm_encoding_iso_8859_5_isupper_char,
+    .multibyte = false
+};
+
+/** ISO-8859-6 */
+const pm_encoding_t pm_encoding_iso_8859_6 = {
+    .name = "iso-8859-6",
+    .char_width = pm_encoding_single_char_width,
+    .alnum_char = pm_encoding_iso_8859_6_alnum_char,
+    .alpha_char = pm_encoding_iso_8859_6_alpha_char,
+    .isupper_char = pm_encoding_iso_8859_6_isupper_char,
+    .multibyte = false
+};
+
+/** ISO-8859-7 */
+const pm_encoding_t pm_encoding_iso_8859_7 = {
+    .name = "iso-8859-7",
+    .char_width = pm_encoding_single_char_width,
+    .alnum_char = pm_encoding_iso_8859_7_alnum_char,
+    .alpha_char = pm_encoding_iso_8859_7_alpha_char,
+    .isupper_char = pm_encoding_iso_8859_7_isupper_char,
+    .multibyte = false
+};
+
+/** ISO-8859-8 */
+const pm_encoding_t pm_encoding_iso_8859_8 = {
+    .name = "iso-8859-8",
+    .char_width = pm_encoding_single_char_width,
+    .alnum_char = pm_encoding_iso_8859_8_alnum_char,
+    .alpha_char = pm_encoding_iso_8859_8_alpha_char,
+    .isupper_char = pm_encoding_iso_8859_8_isupper_char,
+    .multibyte = false
+};
+
+/** ISO-8859-9 */
+const pm_encoding_t pm_encoding_iso_8859_9 = {
+    .name = "iso-8859-9",
+    .char_width = pm_encoding_single_char_width,
+    .alnum_char = pm_encoding_iso_8859_9_alnum_char,
+    .alpha_char = pm_encoding_iso_8859_9_alpha_char,
+    .isupper_char = pm_encoding_iso_8859_9_isupper_char,
+    .multibyte = false
+};
+
+/** ISO-8859-10 */
+const pm_encoding_t pm_encoding_iso_8859_10 = {
+    .name = "iso-8859-10",
+    .char_width = pm_encoding_single_char_width,
+    .alnum_char = pm_encoding_iso_8859_10_alnum_char,
+    .alpha_char = pm_encoding_iso_8859_10_alpha_char,
+    .isupper_char = pm_encoding_iso_8859_10_isupper_char,
+    .multibyte = false
+};
+
+/** ISO-8859-11 */
+const pm_encoding_t pm_encoding_iso_8859_11 = {
+    .name = "iso-8859-11",
+    .char_width = pm_encoding_single_char_width,
+    .alnum_char = pm_encoding_iso_8859_11_alnum_char,
+    .alpha_char = pm_encoding_iso_8859_11_alpha_char,
+    .isupper_char = pm_encoding_iso_8859_11_isupper_char,
+    .multibyte = false
+};
+
+/** ISO-8859-13 */
+const pm_encoding_t pm_encoding_iso_8859_13 = {
+    .name = "iso-8859-13",
+    .char_width = pm_encoding_single_char_width,
+    .alnum_char = pm_encoding_iso_8859_13_alnum_char,
+    .alpha_char = pm_encoding_iso_8859_13_alpha_char,
+    .isupper_char = pm_encoding_iso_8859_13_isupper_char,
+    .multibyte = false
+};
+
+/** ISO-8859-14 */
+const pm_encoding_t pm_encoding_iso_8859_14 = {
+    .name = "iso-8859-14",
+    .char_width = pm_encoding_single_char_width,
+    .alnum_char = pm_encoding_iso_8859_14_alnum_char,
+    .alpha_char = pm_encoding_iso_8859_14_alpha_char,
+    .isupper_char = pm_encoding_iso_8859_14_isupper_char,
+    .multibyte = false
+};
+
+/** ISO-8859-15 */
+const pm_encoding_t pm_encoding_iso_8859_15 = {
+    .name = "iso-8859-15",
+    .char_width = pm_encoding_single_char_width,
+    .alnum_char = pm_encoding_iso_8859_15_alnum_char,
+    .alpha_char = pm_encoding_iso_8859_15_alpha_char,
+    .isupper_char = pm_encoding_iso_8859_15_isupper_char,
+    .multibyte = false
+};
+
+/** ISO-8859-16 */
+const pm_encoding_t pm_encoding_iso_8859_16 = {
+    .name = "iso-8859-16",
+    .char_width = pm_encoding_single_char_width,
+    .alnum_char = pm_encoding_iso_8859_16_alnum_char,
+    .alpha_char = pm_encoding_iso_8859_16_alpha_char,
+    .isupper_char = pm_encoding_iso_8859_16_isupper_char,
+    .multibyte = false
+};
+
+/** KOI8-R */
+const pm_encoding_t pm_encoding_koi8_r = {
+    .name = "koi8-r",
+    .char_width = pm_encoding_koi8_r_char_width,
+    .alnum_char = pm_encoding_koi8_r_alnum_char,
+    .alpha_char = pm_encoding_koi8_r_alpha_char,
+    .isupper_char = pm_encoding_koi8_r_isupper_char,
+    .multibyte = false
+};
+
+/** Windows-1251 */
+const pm_encoding_t pm_encoding_windows_1251 = {
+    .name = "windows-1251",
+    .char_width = pm_encoding_single_char_width,
+    .alnum_char = pm_encoding_windows_1251_alnum_char,
+    .alpha_char = pm_encoding_windows_1251_alpha_char,
+    .isupper_char = pm_encoding_windows_1251_isupper_char,
+    .multibyte = false
+};
+
+/** Windows-1252 */
+const pm_encoding_t pm_encoding_windows_1252 = {
+    .name = "windows-1252",
+    .char_width = pm_encoding_single_char_width,
+    .alnum_char = pm_encoding_windows_1252_alnum_char,
+    .alpha_char = pm_encoding_windows_1252_alpha_char,
+    .isupper_char = pm_encoding_windows_1252_isupper_char,
+    .multibyte = false
+};
diff --git a/prism/enc/pm_unicode.c b/prism/enc/pm_unicode.c
index ee776fa2ad..d021894c1e 100644
--- a/prism/enc/pm_unicode.c
+++ b/prism/enc/pm_unicode.c
@@ -1,15 +1,14 @@
-// Note that the UTF-8 decoding code is based on Bjoern Hoehrmann's UTF-8 DFA
-// decoder. See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
-
 #include "prism/enc/pm_encoding.h"
 
 typedef uint32_t pm_unicode_codepoint_t;
 
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding unicode codepoint. Note that
-// this table is different from other encodings where we used a lookup table
-// because the indices of those tables are the byte representations, not the
-// codepoints themselves.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding unicode codepoint. Note that
+ * this table is different from other encodings where we used a lookup table
+ * because the indices of those tables are the byte representations, not the
+ * codepoints themselves.
+ */
 const uint8_t pm_encoding_unicode_table[256] = {
 //  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -2179,8 +2178,12 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
     0x1F170, 0x1F189,
 };
 
+/**
+ * Binary search through the given list of codepoints to see if the given
+ * codepoint is in the list.
+ */
 static bool
-pm_unicode_codepoint_match(pm_unicode_codepoint_t codepoint, const pm_unicode_codepoint_t *codepoints, size_t size) {
+pm_unicode_codepoint_match(pm_unicode_codepoint_t codepoint, size_t size, const pm_unicode_codepoint_t codepoints[size]) {
     size_t start = 0;
     size_t end = size;
 
@@ -2202,6 +2205,29 @@ pm_unicode_codepoint_match(pm_unicode_codepoint_t codepoint, const pm_unicode_co
     return false;
 }
 
+/**
+ * A state transition table for decoding UTF-8.
+ *
+ * Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
 static const uint8_t pm_utf_8_dfa[] = {
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
@@ -2219,6 +2245,11 @@ static const uint8_t pm_utf_8_dfa[] = {
     1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
 };
 
+/**
+ * Given a pointer to a string and the number of bytes remaining in the string,
+ * decode the next UTF-8 codepoint and return it. The number of bytes consumed
+ * is returned in the width out parameter.
+ */
 static pm_unicode_codepoint_t
 pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
     assert(n >= 1);
@@ -2253,6 +2284,10 @@ pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n) {
     return width;
 }
 
+/**
+ * Return the size of the next character in the UTF-8 encoding if it is an
+ * alphabetical character.
+ */
 size_t
 pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n) {
     if (*b < 0x80) {
@@ -2265,10 +2300,14 @@ pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n) {
     if (codepoint <= 0xFF) {
         return (pm_encoding_unicode_table[(uint8_t) codepoint] & PRISM_ENCODING_ALPHABETIC_BIT) ? width : 0;
     } else {
-        return pm_unicode_codepoint_match(codepoint, unicode_alpha_codepoints, UNICODE_ALPHA_CODEPOINTS_LENGTH) ? width : 0;
+        return pm_unicode_codepoint_match(codepoint, UNICODE_ALPHA_CODEPOINTS_LENGTH, unicode_alpha_codepoints) ? width : 0;
     }
 }
 
+/**
+ * Return the size of the next character in the UTF-8 encoding if it is an
+ * alphanumeric character.
+ */
 size_t
 pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n) {
     if (*b < 0x80) {
@@ -2281,10 +2320,14 @@ pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n) {
     if (codepoint <= 0xFF) {
         return (pm_encoding_unicode_table[(uint8_t) codepoint] & (PRISM_ENCODING_ALPHANUMERIC_BIT)) ? width : 0;
     } else {
-        return pm_unicode_codepoint_match(codepoint, unicode_alnum_codepoints, UNICODE_ALNUM_CODEPOINTS_LENGTH) ? width : 0;
+        return pm_unicode_codepoint_match(codepoint, UNICODE_ALNUM_CODEPOINTS_LENGTH, unicode_alnum_codepoints) ? width : 0;
     }
 }
 
+/**
+ * Return true if the next character in the UTF-8 encoding is an uppercase
+ * character.
+ */
 bool
 pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
     if (*b < 0x80) {
@@ -2297,7 +2340,7 @@ pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
     if (codepoint <= 0xFF) {
         return (pm_encoding_unicode_table[(uint8_t) codepoint] & PRISM_ENCODING_UPPERCASE_BIT) ? true : false;
     } else {
-        return pm_unicode_codepoint_match(codepoint, unicode_isupper_codepoints, UNICODE_ISUPPER_CODEPOINTS_LENGTH) ? true : false;
+        return pm_unicode_codepoint_match(codepoint, UNICODE_ISUPPER_CODEPOINTS_LENGTH, unicode_isupper_codepoints) ? true : false;
     }
 }
 
@@ -2305,7 +2348,8 @@ pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
 #undef UNICODE_ALNUM_CODEPOINTS_LENGTH
 #undef UNICODE_ISUPPER_CODEPOINTS_LENGTH
 
-pm_encoding_t pm_encoding_utf_8 = {
+/** UTF-8 */
+const pm_encoding_t pm_encoding_utf_8 = {
     .name = "utf-8",
     .char_width = pm_encoding_utf_8_char_width,
     .alnum_char = pm_encoding_utf_8_alnum_char,
@@ -2314,7 +2358,8 @@ pm_encoding_t pm_encoding_utf_8 = {
     .multibyte = true
 };
 
-pm_encoding_t pm_encoding_utf8_mac = {
+/** UTF8-mac */
+const pm_encoding_t pm_encoding_utf8_mac = {
     .name = "utf8-mac",
     .char_width = pm_encoding_utf_8_char_width,
     .alnum_char = pm_encoding_utf_8_alnum_char,
diff --git a/prism/enc/pm_windows_31j.c b/prism/enc/pm_windows_31j.c
index cf7eb46864..ce67cfb04e 100644
--- a/prism/enc/pm_windows_31j.c
+++ b/prism/enc/pm_windows_31j.c
@@ -46,7 +46,8 @@ pm_encoding_windows_31j_isupper_char(const uint8_t *b, ptrdiff_t n) {
     }
 }
 
-pm_encoding_t pm_encoding_windows_31j = {
+/** Windows-31J */
+const pm_encoding_t pm_encoding_windows_31j = {
     .name = "windows-31j",
     .char_width = pm_encoding_windows_31j_char_width,
     .alnum_char = pm_encoding_windows_31j_alnum_char,
author	Kevin Newton <kddnewton@gmail.com>	2023-10-31 08:54:52 -0400
committer	Kevin Newton <kddnewton@gmail.com>	2023-11-01 13:10:29 -0400
commit	87c6fb85483b63cf976d7738eb5a2b55e6f2b1d7 (patch)
tree	f53768e37ffe9b6cac3a9092f94689b6f3fc9de3 /prism
parent	493439c9ce8d298f3fbd2c9c01d35fcc9add6d49 (diff)