summaryrefslogtreecommitdiff
path: root/prism
diff options
context:
space:
mode:
author Kevin Newton <kddnewton@gmail.com> 2023-10-31 08:54:52 -0400
committer Kevin Newton <kddnewton@gmail.com> 2023-11-01 13:10:29 -0400
commit 87c6fb85483b63cf976d7738eb5a2b55e6f2b1d7 (patch)
tree f53768e37ffe9b6cac3a9092f94689b6f3fc9de3 /prism
parent 493439c9ce8d298f3fbd2c9c01d35fcc9add6d49 (diff)
[ruby/prism] Documentation for the encodings
https://github.com/ruby/prism/commit/52a0d80a15
Diffstat (limited to 'prism')
-rw-r--r-- prism/enc/pm_big5.c 3
-rw-r--r-- prism/enc/pm_encoding.h 179
-rw-r--r-- prism/enc/pm_euc_jp.c 3
-rw-r--r-- prism/enc/pm_gbk.c 3
-rw-r--r-- prism/enc/pm_shift_jis.c 3
-rw-r--r-- prism/enc/pm_tables.c 400
-rw-r--r-- prism/enc/pm_unicode.c 73
-rw-r--r-- prism/enc/pm_windows_31j.c 3
8 files changed, 513 insertions, 154 deletions
diff --git a/prism/enc/pm_big5.c b/prism/enc/pm_big5.c
index deaa3afb3f..6d4fefcf2b 100644
--- a/prism/enc/pm_big5.c
+++ b/prism/enc/pm_big5.c
@@ -42,7 +42,8 @@ pm_encoding_big5_isupper_char(const uint8_t *b, ptrdiff_t n) {
}
}
-pm_encoding_t pm_encoding_big5 = {
+/** Big5 encoding */
+const pm_encoding_t pm_encoding_big5 = {
.name = "big5",
.char_width = pm_encoding_big5_char_width,
.alnum_char = pm_encoding_big5_alnum_char,
diff --git a/prism/enc/pm_encoding.h b/prism/enc/pm_encoding.h
index 232bc97dd4..28b9f02281 100644
--- a/prism/enc/pm_encoding.h
+++ b/prism/enc/pm_encoding.h
@@ -8,36 +8,50 @@
#include <stddef.h>
#include <stdint.h>
-// This struct defines the functions necessary to implement the encoding
-// interface so we can determine how many bytes the subsequent character takes.
-// Each callback should return the number of bytes, or 0 if the next bytes are
-// invalid for the encoding and type.
+/**
+ * This struct defines the functions necessary to implement the encoding
+ * interface so we can determine how many bytes the subsequent character takes.
+ * Each callback should return the number of bytes, or 0 if the next bytes are
+ * invalid for the encoding and type.
+ */
typedef struct {
- // Return the number of bytes that the next character takes if it is valid
- // in the encoding. Does not read more than n bytes. It is assumed that n is
- // at least 1.
+ /**
+ * Return the number of bytes that the next character takes if it is valid
+ * in the encoding. Does not read more than n bytes. It is assumed that n is
+ * at least 1.
+ */
size_t (*char_width)(const uint8_t *b, ptrdiff_t n);
- // Return the number of bytes that the next character takes if it is valid
- // in the encoding and is alphabetical. Does not read more than n bytes. It
- // is assumed that n is at least 1.
+ /**
+ * Return the number of bytes that the next character takes if it is valid
+ * in the encoding and is alphabetical. Does not read more than n bytes. It
+ * is assumed that n is at least 1.
+ */
size_t (*alpha_char)(const uint8_t *b, ptrdiff_t n);
- // Return the number of bytes that the next character takes if it is valid
- // in the encoding and is alphanumeric. Does not read more than n bytes. It
- // is assumed that n is at least 1.
+ /**
+ * Return the number of bytes that the next character takes if it is valid
+ * in the encoding and is alphanumeric. Does not read more than n bytes. It
+ * is assumed that n is at least 1.
+ */
size_t (*alnum_char)(const uint8_t *b, ptrdiff_t n);
- // Return true if the next character is valid in the encoding and is an
- // uppercase character. Does not read more than n bytes. It is assumed that
- // n is at least 1.
+ /**
+ * Return true if the next character is valid in the encoding and is an
+ * uppercase character. Does not read more than n bytes. It is assumed that
+ * n is at least 1.
+ */
bool (*isupper_char)(const uint8_t *b, ptrdiff_t n);
- // The name of the encoding. This should correspond to a value that can be
- // passed to Encoding.find in Ruby.
+ /**
+ * The name of the encoding. This should correspond to a value that can be
+ * passed to Encoding.find in Ruby.
+ */
const char *name;
- // Return true if the encoding is a multibyte encoding.
+ /**
+ * True if the encoding is a multibyte encoding.
+ */
bool multibyte;
} pm_encoding_t;
@@ -47,50 +61,109 @@ typedef struct {
#define PRISM_ENCODING_ALPHANUMERIC_BIT 1 << 1
#define PRISM_ENCODING_UPPERCASE_BIT 1 << 2
-// These functions are reused by some other encodings, so they are defined here
-// so they can be shared.
+/**
+ * Return the size of the next character in the ASCII encoding if it is an
+ * alphabetical character.
+ *
+ * @param b The bytes to read.
+ * @param n The number of bytes that can be read.
+ * @returns The number of bytes that the next character takes if it is valid in
+ * the encoding, or 0 if it is not.
+ */
size_t pm_encoding_ascii_alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);
+
+/**
+ * Return the size of the next character in the ASCII encoding if it is an
+ * alphanumeric character.
+ *
+ * @param b The bytes to read.
+ * @param n The number of bytes that can be read.
+ * @returns The number of bytes that the next character takes if it is valid in
+ * the encoding, or 0 if it is not.
+ */
size_t pm_encoding_ascii_alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);
+
+/**
+ * Return true if the next character in the ASCII encoding is an uppercase
+ * character.
+ *
+ * @param b The bytes to read.
+ * @param n The number of bytes that can be read.
+ * @returns True if the next character is valid in the encoding and is an
+ * uppercase character, or false if it is not.
+ */
bool pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n);
-// These functions are shared between the actual encoding and the fast path in
-// the parser so they need to be internally visible.
+/**
+ * Return the size of the next character in the UTF-8 encoding if it is an
+ * alphabetical character.
+ *
+ * @param b The bytes to read.
+ * @param n The number of bytes that can be read.
+ * @returns The number of bytes that the next character takes if it is valid in
+ * the encoding, or 0 if it is not.
+ */
size_t pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n);
+
+/**
+ * Return the size of the next character in the UTF-8 encoding if it is an
+ * alphanumeric character.
+ *
+ * @param b The bytes to read.
+ * @param n The number of bytes that can be read.
+ * @returns The number of bytes that the next character takes if it is valid in
+ * the encoding, or 0 if it is not.
+ */
size_t pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n);
+
+/**
+ * Return true if the next character in the UTF-8 encoding is an uppercase
+ * character.
+ *
+ * @param b The bytes to read.
+ * @param n The number of bytes that can be read.
+ * @returns True if the next character is valid in the encoding and is an
+ * uppercase character, or false if it is not.
+ */
bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n);
-// This lookup table is referenced in both the UTF-8 encoding file and the
-// parser directly in order to speed up the default encoding processing.
+/**
+ * This lookup table is referenced in both the UTF-8 encoding file and the
+ * parser directly in order to speed up the default encoding processing. It is
+ * used to indicate whether a character is alphabetical, alphanumeric, or
+ * uppercase in unicode mappings.
+ */
extern const uint8_t pm_encoding_unicode_table[256];
-// These are the encodings that are supported by the parser. They are defined in
+// Below are the encodings that are supported by the parser. They are defined in
// their own files in the src/enc directory.
-extern pm_encoding_t pm_encoding_ascii;
-extern pm_encoding_t pm_encoding_ascii_8bit;
-extern pm_encoding_t pm_encoding_big5;
-extern pm_encoding_t pm_encoding_euc_jp;
-extern pm_encoding_t pm_encoding_gbk;
-extern pm_encoding_t pm_encoding_iso_8859_1;
-extern pm_encoding_t pm_encoding_iso_8859_2;
-extern pm_encoding_t pm_encoding_iso_8859_3;
-extern pm_encoding_t pm_encoding_iso_8859_4;
-extern pm_encoding_t pm_encoding_iso_8859_5;
-extern pm_encoding_t pm_encoding_iso_8859_6;
-extern pm_encoding_t pm_encoding_iso_8859_7;
-extern pm_encoding_t pm_encoding_iso_8859_8;
-extern pm_encoding_t pm_encoding_iso_8859_9;
-extern pm_encoding_t pm_encoding_iso_8859_10;
-extern pm_encoding_t pm_encoding_iso_8859_11;
-extern pm_encoding_t pm_encoding_iso_8859_13;
-extern pm_encoding_t pm_encoding_iso_8859_14;
-extern pm_encoding_t pm_encoding_iso_8859_15;
-extern pm_encoding_t pm_encoding_iso_8859_16;
-extern pm_encoding_t pm_encoding_koi8_r;
-extern pm_encoding_t pm_encoding_shift_jis;
-extern pm_encoding_t pm_encoding_utf_8;
-extern pm_encoding_t pm_encoding_utf8_mac;
-extern pm_encoding_t pm_encoding_windows_31j;
-extern pm_encoding_t pm_encoding_windows_1251;
-extern pm_encoding_t pm_encoding_windows_1252;
+
+extern const pm_encoding_t pm_encoding_ascii;
+extern const pm_encoding_t pm_encoding_ascii_8bit;
+extern const pm_encoding_t pm_encoding_big5;
+extern const pm_encoding_t pm_encoding_euc_jp;
+extern const pm_encoding_t pm_encoding_gbk;
+extern const pm_encoding_t pm_encoding_iso_8859_1;
+extern const pm_encoding_t pm_encoding_iso_8859_2;
+extern const pm_encoding_t pm_encoding_iso_8859_3;
+extern const pm_encoding_t pm_encoding_iso_8859_4;
+extern const pm_encoding_t pm_encoding_iso_8859_5;
+extern const pm_encoding_t pm_encoding_iso_8859_6;
+extern const pm_encoding_t pm_encoding_iso_8859_7;
+extern const pm_encoding_t pm_encoding_iso_8859_8;
+extern const pm_encoding_t pm_encoding_iso_8859_9;
+extern const pm_encoding_t pm_encoding_iso_8859_10;
+extern const pm_encoding_t pm_encoding_iso_8859_11;
+extern const pm_encoding_t pm_encoding_iso_8859_13;
+extern const pm_encoding_t pm_encoding_iso_8859_14;
+extern const pm_encoding_t pm_encoding_iso_8859_15;
+extern const pm_encoding_t pm_encoding_iso_8859_16;
+extern const pm_encoding_t pm_encoding_koi8_r;
+extern const pm_encoding_t pm_encoding_shift_jis;
+extern const pm_encoding_t pm_encoding_utf_8;
+extern const pm_encoding_t pm_encoding_utf8_mac;
+extern const pm_encoding_t pm_encoding_windows_31j;
+extern const pm_encoding_t pm_encoding_windows_1251;
+extern const pm_encoding_t pm_encoding_windows_1252;
#endif
diff --git a/prism/enc/pm_euc_jp.c b/prism/enc/pm_euc_jp.c
index 13d3662455..cd0b1a7910 100644
--- a/prism/enc/pm_euc_jp.c
+++ b/prism/enc/pm_euc_jp.c
@@ -48,7 +48,8 @@ pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) {
}
}
-pm_encoding_t pm_encoding_euc_jp = {
+/** EUC-JP encoding */
+const pm_encoding_t pm_encoding_euc_jp = {
.name = "euc-jp",
.char_width = pm_encoding_euc_jp_char_width,
.alnum_char = pm_encoding_euc_jp_alnum_char,
diff --git a/prism/enc/pm_gbk.c b/prism/enc/pm_gbk.c
index 2fc67b47a4..3dcf41fb99 100644
--- a/prism/enc/pm_gbk.c
+++ b/prism/enc/pm_gbk.c
@@ -51,7 +51,8 @@ pm_encoding_gbk_isupper_char(const uint8_t *b, ptrdiff_t n) {
}
}
-pm_encoding_t pm_encoding_gbk = {
+/** GBK encoding */
+const pm_encoding_t pm_encoding_gbk = {
.name = "gbk",
.char_width = pm_encoding_gbk_char_width,
.alnum_char = pm_encoding_gbk_alnum_char,
diff --git a/prism/enc/pm_shift_jis.c b/prism/enc/pm_shift_jis.c
index 3c93937efc..ecc3d51b87 100644
--- a/prism/enc/pm_shift_jis.c
+++ b/prism/enc/pm_shift_jis.c
@@ -46,7 +46,8 @@ pm_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) {
}
}
-pm_encoding_t pm_encoding_shift_jis = {
+/** Shift_JIS encoding */
+const pm_encoding_t pm_encoding_shift_jis = {
.name = "shift_jis",
.char_width = pm_encoding_shift_jis_char_width,
.alnum_char = pm_encoding_shift_jis_alnum_char,
diff --git a/prism/enc/pm_tables.c b/prism/enc/pm_tables.c
index c6bb4dce65..c2133649a4 100644
--- a/prism/enc/pm_tables.c
+++ b/prism/enc/pm_tables.c
@@ -1,7 +1,9 @@
#include "prism/enc/pm_encoding.h"
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ASCII character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ASCII character.
+ */
static uint8_t pm_encoding_ascii_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -22,8 +24,10 @@ static uint8_t pm_encoding_ascii_table[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
};
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-1 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-1 character.
+ */
static uint8_t pm_encoding_iso_8859_1_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -44,8 +48,10 @@ static uint8_t pm_encoding_iso_8859_1_table[256] = {
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
};
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-2 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-2 character.
+ */
static uint8_t pm_encoding_iso_8859_2_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -66,8 +72,10 @@ static uint8_t pm_encoding_iso_8859_2_table[256] = {
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
};
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-3 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-3 character.
+ */
static uint8_t pm_encoding_iso_8859_3_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -88,8 +96,10 @@ static uint8_t pm_encoding_iso_8859_3_table[256] = {
0, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
};
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-4 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-4 character.
+ */
static uint8_t pm_encoding_iso_8859_4_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -110,8 +120,10 @@ static uint8_t pm_encoding_iso_8859_4_table[256] = {
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
};
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-5 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-5 character.
+ */
static uint8_t pm_encoding_iso_8859_5_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -132,8 +144,10 @@ static uint8_t pm_encoding_iso_8859_5_table[256] = {
0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, // Fx
};
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-6 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-6 character.
+ */
static uint8_t pm_encoding_iso_8859_6_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -154,8 +168,10 @@ static uint8_t pm_encoding_iso_8859_6_table[256] = {
3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
};
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-7 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-7 character.
+ */
static uint8_t pm_encoding_iso_8859_7_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -176,8 +192,10 @@ static uint8_t pm_encoding_iso_8859_7_table[256] = {
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
};
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-8 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-8 character.
+ */
static uint8_t pm_encoding_iso_8859_8_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -198,8 +216,10 @@ static uint8_t pm_encoding_iso_8859_8_table[256] = {
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // Fx
};
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-9 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-9 character.
+ */
static uint8_t pm_encoding_iso_8859_9_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -220,8 +240,10 @@ static uint8_t pm_encoding_iso_8859_9_table[256] = {
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
};
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-10 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-10 character.
+ */
static uint8_t pm_encoding_iso_8859_10_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -242,8 +264,10 @@ static uint8_t pm_encoding_iso_8859_10_table[256] = {
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
};
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-11 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-11 character.
+ */
static uint8_t pm_encoding_iso_8859_11_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -264,8 +288,10 @@ static uint8_t pm_encoding_iso_8859_11_table[256] = {
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, // Fx
};
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-13 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-13 character.
+ */
static uint8_t pm_encoding_iso_8859_13_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -286,8 +312,10 @@ static uint8_t pm_encoding_iso_8859_13_table[256] = {
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
};
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-14 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-14 character.
+ */
static uint8_t pm_encoding_iso_8859_14_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -308,8 +336,10 @@ static uint8_t pm_encoding_iso_8859_14_table[256] = {
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
};
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-15 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-15 character.
+ */
static uint8_t pm_encoding_iso_8859_15_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -330,8 +360,10 @@ static uint8_t pm_encoding_iso_8859_15_table[256] = {
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
};
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding ISO-8859-16 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding ISO-8859-16 character.
+ */
static uint8_t pm_encoding_iso_8859_16_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -352,8 +384,10 @@ static uint8_t pm_encoding_iso_8859_16_table[256] = {
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
};
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding KOI8-R character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding KOI8-R character.
+ */
static uint8_t pm_encoding_koi8_r_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -374,8 +408,10 @@ static uint8_t pm_encoding_koi8_r_table[256] = {
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Fx
};
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding windows-1251 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding windows-1251 character.
+ */
static uint8_t pm_encoding_windows_1251_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -396,8 +432,10 @@ static uint8_t pm_encoding_windows_1251_table[256] = {
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
};
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding windows-1252 character.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding windows-1252 character.
+ */
static uint8_t pm_encoding_windows_1252_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -418,37 +456,94 @@ static uint8_t pm_encoding_windows_1252_table[256] = {
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
};
+/**
+ * Returns the size of the next character in the ASCII encoding. This basically
+ * means that if the top bit is not set, the character is 1 byte long.
+ */
static size_t
pm_encoding_ascii_char_width(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
return *b < 0x80 ? 1 : 0;
}
+/**
+ * Return the size of the next character in the ASCII encoding if it is an
+ * alphabetical character.
+ */
size_t
pm_encoding_ascii_alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
return (pm_encoding_ascii_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT);
}
+/**
+ * Return the size of the next character in the ASCII encoding if it is an
+ * alphanumeric character.
+ */
size_t
pm_encoding_ascii_alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
return (pm_encoding_ascii_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT) ? 1 : 0;
}
+/**
+ * Return true if the next character in the ASCII encoding is an uppercase
+ * character.
+ */
bool
pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
return (pm_encoding_ascii_table[*b] & PRISM_ENCODING_UPPERCASE_BIT);
}
+/**
+ * For a lot of encodings the default is that they are a single byte long no
+ * matter what the codepoint, so this function is shared between them.
+ */
static size_t
-pm_encoding_koi8_r_char_width(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
- return ((*b >= 0x20 && *b <= 0x7E) || (*b >= 0x80)) ? 1 : 0;
+pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
+ return 1;
}
+/**
+ * Returns the size of the next character in the KOI8-R encoding. This means
+ * checking if it's a valid codepoint in KOI8-R and, if it is, returning 1.
+ */
static size_t
-pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
- return 1;
+pm_encoding_koi8_r_char_width(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
+ return ((*b >= 0x20 && *b <= 0x7E) || (*b >= 0x80)) ? 1 : 0;
}
-pm_encoding_t pm_encoding_ascii = {
+#define PRISM_ENCODING_TABLE(name) \
+ static size_t pm_encoding_ ##name ## _alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \
+ return (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_ALPHABETIC_BIT); \
+ } \
+ static size_t pm_encoding_ ##name ## _alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \
+ return (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT) ? 1 : 0; \
+ } \
+ static bool pm_encoding_ ##name ## _isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \
+ return (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_UPPERCASE_BIT); \
+ }
+
+PRISM_ENCODING_TABLE(iso_8859_1)
+PRISM_ENCODING_TABLE(iso_8859_2)
+PRISM_ENCODING_TABLE(iso_8859_3)
+PRISM_ENCODING_TABLE(iso_8859_4)
+PRISM_ENCODING_TABLE(iso_8859_5)
+PRISM_ENCODING_TABLE(iso_8859_6)
+PRISM_ENCODING_TABLE(iso_8859_7)
+PRISM_ENCODING_TABLE(iso_8859_8)
+PRISM_ENCODING_TABLE(iso_8859_9)
+PRISM_ENCODING_TABLE(iso_8859_10)
+PRISM_ENCODING_TABLE(iso_8859_11)
+PRISM_ENCODING_TABLE(iso_8859_13)
+PRISM_ENCODING_TABLE(iso_8859_14)
+PRISM_ENCODING_TABLE(iso_8859_15)
+PRISM_ENCODING_TABLE(iso_8859_16)
+PRISM_ENCODING_TABLE(koi8_r)
+PRISM_ENCODING_TABLE(windows_1251)
+PRISM_ENCODING_TABLE(windows_1252)
+
+#undef PRISM_ENCODING_TABLE
+
+/** ASCII encoding */
+const pm_encoding_t pm_encoding_ascii = {
.name = "ascii",
.char_width = pm_encoding_ascii_char_width,
.alnum_char = pm_encoding_ascii_alnum_char,
@@ -457,7 +552,8 @@ pm_encoding_t pm_encoding_ascii = {
.multibyte = false
};
-pm_encoding_t pm_encoding_ascii_8bit = {
+/** ASCII-8BIT encoding */
+const pm_encoding_t pm_encoding_ascii_8bit = {
.name = "ascii-8bit",
.char_width = pm_encoding_single_char_width,
.alnum_char = pm_encoding_ascii_alnum_char,
@@ -466,42 +562,182 @@ pm_encoding_t pm_encoding_ascii_8bit = {
.multibyte = false
};
-#define PRISM_ENCODING_TABLE(s, i, w) \
- static size_t pm_encoding_ ##i ## _alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \
- return (pm_encoding_ ##i ## _table[*b] & PRISM_ENCODING_ALPHABETIC_BIT); \
- } \
- static size_t pm_encoding_ ##i ## _alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \
- return (pm_encoding_ ##i ## _table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT) ? 1 : 0; \
- } \
- static bool pm_encoding_ ##i ## _isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \
- return (pm_encoding_ ##i ## _table[*b] & PRISM_ENCODING_UPPERCASE_BIT); \
- } \
- pm_encoding_t pm_encoding_ ##i = { \
- .name = s, \
- .char_width = w, \
- .alnum_char = pm_encoding_ ##i ## _alnum_char, \
- .alpha_char = pm_encoding_ ##i ## _alpha_char, \
- .isupper_char = pm_encoding_ ##i ## _isupper_char, \
- .multibyte = false, \
- };
-
-PRISM_ENCODING_TABLE("iso-8859-1", iso_8859_1, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-2", iso_8859_2, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-3", iso_8859_3, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-4", iso_8859_4, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-5", iso_8859_5, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-6", iso_8859_6, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-7", iso_8859_7, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-8", iso_8859_8, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-9", iso_8859_9, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-10", iso_8859_10, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-11", iso_8859_11, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-13", iso_8859_13, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-14", iso_8859_14, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-15", iso_8859_15, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("iso-8859-16", iso_8859_16, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("koi8-r", koi8_r, pm_encoding_koi8_r_char_width)
-PRISM_ENCODING_TABLE("windows-1251", windows_1251, pm_encoding_single_char_width)
-PRISM_ENCODING_TABLE("windows-1252", windows_1252, pm_encoding_single_char_width)
+/** ISO-8859-1 */
+const pm_encoding_t pm_encoding_iso_8859_1 = {
+ .name = "iso-8859-1",
+ .char_width = pm_encoding_single_char_width,
+ .alnum_char = pm_encoding_iso_8859_1_alnum_char,
+ .alpha_char = pm_encoding_iso_8859_1_alpha_char,
+ .isupper_char = pm_encoding_iso_8859_1_isupper_char,
+ .multibyte = false
+};
-#undef PRISM_ENCODING_TABLE
+/** ISO-8859-2 */
+const pm_encoding_t pm_encoding_iso_8859_2 = {
+ .name = "iso-8859-2",
+ .char_width = pm_encoding_single_char_width,
+ .alnum_char = pm_encoding_iso_8859_2_alnum_char,
+ .alpha_char = pm_encoding_iso_8859_2_alpha_char,
+ .isupper_char = pm_encoding_iso_8859_2_isupper_char,
+ .multibyte = false
+};
+
+/** ISO-8859-3 */
+const pm_encoding_t pm_encoding_iso_8859_3 = {
+ .name = "iso-8859-3",
+ .char_width = pm_encoding_single_char_width,
+ .alnum_char = pm_encoding_iso_8859_3_alnum_char,
+ .alpha_char = pm_encoding_iso_8859_3_alpha_char,
+ .isupper_char = pm_encoding_iso_8859_3_isupper_char,
+ .multibyte = false
+};
+
+/** ISO-8859-4 */
+const pm_encoding_t pm_encoding_iso_8859_4 = {
+ .name = "iso-8859-4",
+ .char_width = pm_encoding_single_char_width,
+ .alnum_char = pm_encoding_iso_8859_4_alnum_char,
+ .alpha_char = pm_encoding_iso_8859_4_alpha_char,
+ .isupper_char = pm_encoding_iso_8859_4_isupper_char,
+ .multibyte = false
+};
+
+/** ISO-8859-5 */
+const pm_encoding_t pm_encoding_iso_8859_5 = {
+ .name = "iso-8859-5",
+ .char_width = pm_encoding_single_char_width,
+ .alnum_char = pm_encoding_iso_8859_5_alnum_char,
+ .alpha_char = pm_encoding_iso_8859_5_alpha_char,
+ .isupper_char = pm_encoding_iso_8859_5_isupper_char,
+ .multibyte = false
+};
+
+/** ISO-8859-6 */
+const pm_encoding_t pm_encoding_iso_8859_6 = {
+ .name = "iso-8859-6",
+ .char_width = pm_encoding_single_char_width,
+ .alnum_char = pm_encoding_iso_8859_6_alnum_char,
+ .alpha_char = pm_encoding_iso_8859_6_alpha_char,
+ .isupper_char = pm_encoding_iso_8859_6_isupper_char,
+ .multibyte = false
+};
+
+/** ISO-8859-7 */
+const pm_encoding_t pm_encoding_iso_8859_7 = {
+ .name = "iso-8859-7",
+ .char_width = pm_encoding_single_char_width,
+ .alnum_char = pm_encoding_iso_8859_7_alnum_char,
+ .alpha_char = pm_encoding_iso_8859_7_alpha_char,
+ .isupper_char = pm_encoding_iso_8859_7_isupper_char,
+ .multibyte = false
+};
+
+/** ISO-8859-8 */
+const pm_encoding_t pm_encoding_iso_8859_8 = {
+ .name = "iso-8859-8",
+ .char_width = pm_encoding_single_char_width,
+ .alnum_char = pm_encoding_iso_8859_8_alnum_char,
+ .alpha_char = pm_encoding_iso_8859_8_alpha_char,
+ .isupper_char = pm_encoding_iso_8859_8_isupper_char,
+ .multibyte = false
+};
+
+/** ISO-8859-9 */
+const pm_encoding_t pm_encoding_iso_8859_9 = {
+ .name = "iso-8859-9",
+ .char_width = pm_encoding_single_char_width,
+ .alnum_char = pm_encoding_iso_8859_9_alnum_char,
+ .alpha_char = pm_encoding_iso_8859_9_alpha_char,
+ .isupper_char = pm_encoding_iso_8859_9_isupper_char,
+ .multibyte = false
+};
+
+/** ISO-8859-10 */
+const pm_encoding_t pm_encoding_iso_8859_10 = {
+ .name = "iso-8859-10",
+ .char_width = pm_encoding_single_char_width,
+ .alnum_char = pm_encoding_iso_8859_10_alnum_char,
+ .alpha_char = pm_encoding_iso_8859_10_alpha_char,
+ .isupper_char = pm_encoding_iso_8859_10_isupper_char,
+ .multibyte = false
+};
+
+/** ISO-8859-11 */
+const pm_encoding_t pm_encoding_iso_8859_11 = {
+ .name = "iso-8859-11",
+ .char_width = pm_encoding_single_char_width,
+ .alnum_char = pm_encoding_iso_8859_11_alnum_char,
+ .alpha_char = pm_encoding_iso_8859_11_alpha_char,
+ .isupper_char = pm_encoding_iso_8859_11_isupper_char,
+ .multibyte = false
+};
+
+/** ISO-8859-13 */
+const pm_encoding_t pm_encoding_iso_8859_13 = {
+ .name = "iso-8859-13",
+ .char_width = pm_encoding_single_char_width,
+ .alnum_char = pm_encoding_iso_8859_13_alnum_char,
+ .alpha_char = pm_encoding_iso_8859_13_alpha_char,
+ .isupper_char = pm_encoding_iso_8859_13_isupper_char,
+ .multibyte = false
+};
+
+/** ISO-8859-14 */
+const pm_encoding_t pm_encoding_iso_8859_14 = {
+ .name = "iso-8859-14",
+ .char_width = pm_encoding_single_char_width,
+ .alnum_char = pm_encoding_iso_8859_14_alnum_char,
+ .alpha_char = pm_encoding_iso_8859_14_alpha_char,
+ .isupper_char = pm_encoding_iso_8859_14_isupper_char,
+ .multibyte = false
+};
+
+/** ISO-8859-15 */
+const pm_encoding_t pm_encoding_iso_8859_15 = {
+ .name = "iso-8859-15",
+ .char_width = pm_encoding_single_char_width,
+ .alnum_char = pm_encoding_iso_8859_15_alnum_char,
+ .alpha_char = pm_encoding_iso_8859_15_alpha_char,
+ .isupper_char = pm_encoding_iso_8859_15_isupper_char,
+ .multibyte = false
+};
+
+/** ISO-8859-16 */
+const pm_encoding_t pm_encoding_iso_8859_16 = {
+ .name = "iso-8859-16",
+ .char_width = pm_encoding_single_char_width,
+ .alnum_char = pm_encoding_iso_8859_16_alnum_char,
+ .alpha_char = pm_encoding_iso_8859_16_alpha_char,
+ .isupper_char = pm_encoding_iso_8859_16_isupper_char,
+ .multibyte = false
+};
+
+/** KOI8-R */
+const pm_encoding_t pm_encoding_koi8_r = {
+ .name = "koi8-r",
+ .char_width = pm_encoding_koi8_r_char_width,
+ .alnum_char = pm_encoding_koi8_r_alnum_char,
+ .alpha_char = pm_encoding_koi8_r_alpha_char,
+ .isupper_char = pm_encoding_koi8_r_isupper_char,
+ .multibyte = false
+};
+
+/** Windows-1251 */
+const pm_encoding_t pm_encoding_windows_1251 = {
+ .name = "windows-1251",
+ .char_width = pm_encoding_single_char_width,
+ .alnum_char = pm_encoding_windows_1251_alnum_char,
+ .alpha_char = pm_encoding_windows_1251_alpha_char,
+ .isupper_char = pm_encoding_windows_1251_isupper_char,
+ .multibyte = false
+};
+
+/** Windows-1252 */
+const pm_encoding_t pm_encoding_windows_1252 = {
+ .name = "windows-1252",
+ .char_width = pm_encoding_single_char_width,
+ .alnum_char = pm_encoding_windows_1252_alnum_char,
+ .alpha_char = pm_encoding_windows_1252_alpha_char,
+ .isupper_char = pm_encoding_windows_1252_isupper_char,
+ .multibyte = false
+};
diff --git a/prism/enc/pm_unicode.c b/prism/enc/pm_unicode.c
index ee776fa2ad..d021894c1e 100644
--- a/prism/enc/pm_unicode.c
+++ b/prism/enc/pm_unicode.c
@@ -1,15 +1,14 @@
-// Note that the UTF-8 decoding code is based on Bjoern Hoehrmann's UTF-8 DFA
-// decoder. See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
-
#include "prism/enc/pm_encoding.h"
typedef uint32_t pm_unicode_codepoint_t;
-// Each element of the following table contains a bitfield that indicates a
-// piece of information about the corresponding unicode codepoint. Note that
-// this table is different from other encodings where we used a lookup table
-// because the indices of those tables are the byte representations, not the
-// codepoints themselves.
+/**
+ * Each element of the following table contains a bitfield that indicates a
+ * piece of information about the corresponding unicode codepoint. Note that
+ * this table is different from other encodings where we used a lookup table
+ * because the indices of those tables are the byte representations, not the
+ * codepoints themselves.
+ */
const uint8_t pm_encoding_unicode_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
@@ -2179,8 +2178,12 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
0x1F170, 0x1F189,
};
+/**
+ * Binary search through the given list of codepoints to see if the given
+ * codepoint is in the list.
+ */
static bool
-pm_unicode_codepoint_match(pm_unicode_codepoint_t codepoint, const pm_unicode_codepoint_t *codepoints, size_t size) {
+pm_unicode_codepoint_match(pm_unicode_codepoint_t codepoint, size_t size, const pm_unicode_codepoint_t codepoints[size]) {
size_t start = 0;
size_t end = size;
@@ -2202,6 +2205,29 @@ pm_unicode_codepoint_match(pm_unicode_codepoint_t codepoint, const pm_unicode_co
return false;
}
+/**
+ * A state transition table for decoding UTF-8.
+ *
+ * Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
static const uint8_t pm_utf_8_dfa[] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
@@ -2219,6 +2245,11 @@ static const uint8_t pm_utf_8_dfa[] = {
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};
+/**
+ * Given a pointer to a string and the number of bytes remaining in the string,
+ * decode the next UTF-8 codepoint and return it. The number of bytes consumed
+ * is returned in the width out parameter.
+ */
static pm_unicode_codepoint_t
pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
assert(n >= 1);
@@ -2253,6 +2284,10 @@ pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n) {
return width;
}
+/**
+ * Return the size of the next character in the UTF-8 encoding if it is an
+ * alphabetical character.
+ */
size_t
pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n) {
if (*b < 0x80) {
@@ -2265,10 +2300,14 @@ pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n) {
if (codepoint <= 0xFF) {
return (pm_encoding_unicode_table[(uint8_t) codepoint] & PRISM_ENCODING_ALPHABETIC_BIT) ? width : 0;
} else {
- return pm_unicode_codepoint_match(codepoint, unicode_alpha_codepoints, UNICODE_ALPHA_CODEPOINTS_LENGTH) ? width : 0;
+ return pm_unicode_codepoint_match(codepoint, UNICODE_ALPHA_CODEPOINTS_LENGTH, unicode_alpha_codepoints) ? width : 0;
}
}
+/**
+ * Return the size of the next character in the UTF-8 encoding if it is an
+ * alphanumeric character.
+ */
size_t
pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n) {
if (*b < 0x80) {
@@ -2281,10 +2320,14 @@ pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n) {
if (codepoint <= 0xFF) {
return (pm_encoding_unicode_table[(uint8_t) codepoint] & (PRISM_ENCODING_ALPHANUMERIC_BIT)) ? width : 0;
} else {
- return pm_unicode_codepoint_match(codepoint, unicode_alnum_codepoints, UNICODE_ALNUM_CODEPOINTS_LENGTH) ? width : 0;
+ return pm_unicode_codepoint_match(codepoint, UNICODE_ALNUM_CODEPOINTS_LENGTH, unicode_alnum_codepoints) ? width : 0;
}
}
+/**
+ * Return true if the next character in the UTF-8 encoding is an uppercase
+ * character.
+ */
bool
pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
if (*b < 0x80) {
@@ -2297,7 +2340,7 @@ pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
if (codepoint <= 0xFF) {
return (pm_encoding_unicode_table[(uint8_t) codepoint] & PRISM_ENCODING_UPPERCASE_BIT) ? true : false;
} else {
- return pm_unicode_codepoint_match(codepoint, unicode_isupper_codepoints, UNICODE_ISUPPER_CODEPOINTS_LENGTH) ? true : false;
+ return pm_unicode_codepoint_match(codepoint, UNICODE_ISUPPER_CODEPOINTS_LENGTH, unicode_isupper_codepoints) ? true : false;
}
}
@@ -2305,7 +2348,8 @@ pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
#undef UNICODE_ALNUM_CODEPOINTS_LENGTH
#undef UNICODE_ISUPPER_CODEPOINTS_LENGTH
-pm_encoding_t pm_encoding_utf_8 = {
+/** UTF-8 */
+const pm_encoding_t pm_encoding_utf_8 = {
.name = "utf-8",
.char_width = pm_encoding_utf_8_char_width,
.alnum_char = pm_encoding_utf_8_alnum_char,
@@ -2314,7 +2358,8 @@ pm_encoding_t pm_encoding_utf_8 = {
.multibyte = true
};
-pm_encoding_t pm_encoding_utf8_mac = {
+/** UTF8-mac */
+const pm_encoding_t pm_encoding_utf8_mac = {
.name = "utf8-mac",
.char_width = pm_encoding_utf_8_char_width,
.alnum_char = pm_encoding_utf_8_alnum_char,
diff --git a/prism/enc/pm_windows_31j.c b/prism/enc/pm_windows_31j.c
index cf7eb46864..ce67cfb04e 100644
--- a/prism/enc/pm_windows_31j.c
+++ b/prism/enc/pm_windows_31j.c
@@ -46,7 +46,8 @@ pm_encoding_windows_31j_isupper_char(const uint8_t *b, ptrdiff_t n) {
}
}
-pm_encoding_t pm_encoding_windows_31j = {
+/** Windows-31J */
+const pm_encoding_t pm_encoding_windows_31j = {
.name = "windows-31j",
.char_width = pm_encoding_windows_31j_char_width,
.alnum_char = pm_encoding_windows_31j_alnum_char,