summary refs log tree commit diff
path: root/enc/unicode.c
diff options
context:
space:
mode:
Diffstat (limited to 'enc/unicode.c')
-rw-r--r-- enc/unicode.c 494
1 files changed, 316 insertions, 178 deletions
diff --git a/enc/unicode.c b/enc/unicode.c
index 20990c1e54..5bc806863e 100644
--- a/enc/unicode.c
+++ b/enc/unicode.c
@@ -2,7 +2,7 @@
unicode.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
+ * Copyright (c) 2002-2013 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -71,8 +71,6 @@ static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable[256] = {
0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2
};
-#include "enc/unicode/name2ctype.h"
-
typedef struct {
int n;
OnigCodePoint code[3];
@@ -103,10 +101,81 @@ typedef struct {
CodePointList2 to;
} CaseUnfold_13_Type;
-#include "enc/unicode/casefold.h"
+static inline int
+bits_of(const OnigCodePoint c, const int n) /* 7-bit group number n of c; n must be 0, 1 or 2 */
+{
+ return (c >> (2 - n) * 7) & 127; /* shift is 14, 7, 0 for n = 0, 1, 2, so n = 0 is the highest group */
+}
+static inline int
+bits_at(const OnigCodePoint *c, const int n) /* 7-bit group number n of array c, three groups per element */
+{
+ return bits_of(c[n / 3], n % 3); /* element n / 3, then group n % 3 inside that element */
+}
+
+static int
+code1_equal(const OnigCodePoint x, const OnigCodePoint y) /* 1 when x and y are the same code point, else 0 */
+{
+ if (x != y) return 0;
+ return 1;
+}
+
+static int
+code2_equal(const OnigCodePoint *x, const OnigCodePoint *y) /* 1 when x[0..1] and y[0..1] match element by element, else 0 */
+{
+ if (x[0] != y[0]) return 0;
+ if (x[1] != y[1]) return 0;
+ return 1;
+}
+
+static int
+code3_equal(const OnigCodePoint *x, const OnigCodePoint *y) /* 1 when x[0..2] and y[0..2] match element by element, else 0 */
+{
+ if (x[0] != y[0]) return 0;
+ if (x[1] != y[1]) return 0;
+ if (x[2] != y[2]) return 0;
+ return 1;
+}
+
+/* macros related to ONIGENC_CASE flags */
+/* defined here because not used in other files */
+#define ONIGENC_CASE_SPECIALS (ONIGENC_CASE_TITLECASE | ONIGENC_CASE_IS_TITLECASE | ONIGENC_CASE_UP_SPECIAL | ONIGENC_CASE_DOWN_SPECIAL) /* union of the four "special" flag bits */
+
+/* macros for length in CaseMappingSpecials array in enc/unicode/casefold.h */
+#define SpecialsLengthOffset 25 /* needs to be higher than the 22 bits used for Unicode codepoints */
+#define SpecialsLengthExtract(n) ((n) >> SpecialsLengthOffset) /* the bits at and above SpecialsLengthOffset: the length field */
+#define SpecialsCodepointExtract(n) ((n) & ((1 << SpecialsLengthOffset) - 1)) /* the bits below SpecialsLengthOffset: the code point */
+#define SpecialsLengthEncode(n) ((n) << SpecialsLengthOffset) /* moves a length into the field read by SpecialsLengthExtract */
+
+#define OnigSpecialIndexMask (((1 << OnigSpecialIndexWidth) - 1) << OnigSpecialIndexShift) /* OnigSpecialIndexWidth one-bits starting at bit OnigSpecialIndexShift */
+#define OnigSpecialIndexEncode(n) ((n) << OnigSpecialIndexShift) /* moves an index into the masked field */
+#define OnigSpecialIndexDecode(n) (((n) & OnigSpecialIndexMask) >> OnigSpecialIndexShift) /* reads the index back out of the masked field */
+
+/* macros to shorten "enc/unicode/casefold.h", undefined immediately after including the file */
+#define U ONIGENC_CASE_UPCASE
+#define D ONIGENC_CASE_DOWNCASE
+#define F ONIGENC_CASE_FOLD
+#define ST ONIGENC_CASE_TITLECASE
+#define SU ONIGENC_CASE_UP_SPECIAL
+#define SL ONIGENC_CASE_DOWN_SPECIAL
+#define IT ONIGENC_CASE_IS_TITLECASE
+#define I(n) OnigSpecialIndexEncode(n)
+#define L(n) SpecialsLengthEncode(n)
+
+#include "casefold.h"
+
+#undef U
+#undef D
+#undef F
+#undef ST
+#undef SU
+#undef SL
+#undef IT
+#undef I
+#undef L
+
+#include "name2ctype.h"
-#define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
#define CODE_RANGES_NUM numberof(CodeRanges)
extern int
@@ -143,23 +212,21 @@ onigenc_unicode_ctype_code_range(int ctype, const OnigCodePoint* ranges[])
extern int
onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
const OnigCodePoint* ranges[],
- struct OnigEncodingTypeST* enc ARG_UNUSED)
+ OnigEncoding enc ARG_UNUSED)
{
*sb_out = 0x00;
return onigenc_unicode_ctype_code_range(ctype, ranges);
}
-#include "ruby/st.h"
-
#define PROPERTY_NAME_MAX_SIZE (MAX_WORD_LENGTH + 1)
extern int
-onigenc_unicode_property_name_to_ctype(OnigEncoding enc, UChar* name, UChar* end)
+onigenc_unicode_property_name_to_ctype(OnigEncoding enc, const UChar* name, const UChar* end)
{
int len;
int ctype;
UChar buf[PROPERTY_NAME_MAX_SIZE];
- UChar *p;
+ const UChar *p;
OnigCodePoint code;
len = 0;
@@ -184,137 +251,44 @@ onigenc_unicode_property_name_to_ctype(OnigEncoding enc, UChar* name, UChar* end
return ctype;
}
+#define onigenc_unicode_fold_lookup onigenc_unicode_CaseFold_11_lookup /* short local aliases for the four *_lookup functions used below */
+#define onigenc_unicode_unfold1_lookup onigenc_unicode_CaseUnfold_11_lookup /* key: one code point */
+#define onigenc_unicode_unfold2_lookup onigenc_unicode_CaseUnfold_12_lookup /* key: array of two code points */
+#define onigenc_unicode_unfold3_lookup onigenc_unicode_CaseUnfold_13_lookup /* key: array of three code points */
-static int
-code2_cmp(OnigCodePoint* x, OnigCodePoint* y)
-{
- if (x[0] == y[0] && x[1] == y[1]) return 0;
- return 1;
-}
-
-static st_index_t
-code2_hash(OnigCodePoint* x)
-{
- return (st_index_t )(x[0] + x[1]);
-}
-
-static const struct st_hash_type type_code2_hash = {
- code2_cmp,
- code2_hash,
-};
-
-static int
-code3_cmp(OnigCodePoint* x, OnigCodePoint* y)
-{
- if (x[0] == y[0] && x[1] == y[1] && x[2] == y[2]) return 0;
- return 1;
-}
-
-static st_index_t
-code3_hash(OnigCodePoint* x)
-{
- return (st_index_t )(x[0] + x[1] + x[2]);
-}
-
-static const struct st_hash_type type_code3_hash = {
- code3_cmp,
- code3_hash,
+enum {
+ I_WITH_DOT_ABOVE = 0x0130, /* U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE */
+ DOTLESS_i = 0x0131, /* U+0131 LATIN SMALL LETTER DOTLESS I */
+ DOT_ABOVE = 0x0307 /* U+0307 COMBINING DOT ABOVE */
};
-
-static st_table* FoldTable; /* fold-1, fold-2, fold-3 */
-static st_table* Unfold1Table;
-static st_table* Unfold2Table;
-static st_table* Unfold3Table;
-static int CaseFoldInited = 0;
-
-static int init_case_fold_table(void)
-{
- const CaseFold_11_Type *p;
- const CaseUnfold_11_Type *p1;
- const CaseUnfold_12_Type *p2;
- const CaseUnfold_13_Type *p3;
- int i;
-
- THREAD_ATOMIC_START;
-
- FoldTable = st_init_numtable_with_size(FOLD_TABLE_SIZE);
- if (ONIG_IS_NULL(FoldTable)) return ONIGERR_MEMORY;
- for (i = 0; i < numberof(CaseFold); i++) {
- p = &CaseFold[i];
- st_add_direct(FoldTable, (st_data_t )p->from, (st_data_t )&(p->to));
- }
- for (i = 0; i < numberof(CaseFold_Locale); i++) {
- p = &CaseFold_Locale[i];
- st_add_direct(FoldTable, (st_data_t )p->from, (st_data_t )&(p->to));
- }
-
- Unfold1Table = st_init_numtable_with_size(UNFOLD1_TABLE_SIZE);
- if (ONIG_IS_NULL(Unfold1Table)) return ONIGERR_MEMORY;
-
- for (i = 0; i < numberof(CaseUnfold_11); i++) {
- p1 = &CaseUnfold_11[i];
- st_add_direct(Unfold1Table, (st_data_t )p1->from, (st_data_t )&(p1->to));
- }
- for (i = 0; i < numberof(CaseUnfold_11_Locale); i++) {
- p1 = &CaseUnfold_11_Locale[i];
- st_add_direct(Unfold1Table, (st_data_t )p1->from, (st_data_t )&(p1->to));
- }
-
- Unfold2Table = st_init_table_with_size(&type_code2_hash, UNFOLD2_TABLE_SIZE);
- if (ONIG_IS_NULL(Unfold2Table)) return ONIGERR_MEMORY;
-
- for (i = 0; i < numberof(CaseUnfold_12); i++) {
- p2 = &CaseUnfold_12[i];
- st_add_direct(Unfold2Table, (st_data_t )p2->from, (st_data_t )(&p2->to));
- }
- for (i = 0; i < numberof(CaseUnfold_12_Locale); i++) {
- p2 = &CaseUnfold_12_Locale[i];
- st_add_direct(Unfold2Table, (st_data_t )p2->from, (st_data_t )(&p2->to));
- }
-
- Unfold3Table = st_init_table_with_size(&type_code3_hash, UNFOLD3_TABLE_SIZE);
- if (ONIG_IS_NULL(Unfold3Table)) return ONIGERR_MEMORY;
-
- for (i = 0; i < numberof(CaseUnfold_13); i++) {
- p3 = &CaseUnfold_13[i];
- st_add_direct(Unfold3Table, (st_data_t )p3->from, (st_data_t )(&p3->to));
- }
-
- CaseFoldInited = 1;
- THREAD_ATOMIC_END;
- return 0;
-}
-
extern int
onigenc_unicode_mbc_case_fold(OnigEncoding enc,
OnigCaseFoldType flag ARG_UNUSED, const UChar** pp, const UChar* end,
UChar* fold)
{
- CodePointList3 *to;
+ const CodePointList3 *to;
OnigCodePoint code;
int i, len, rlen;
const UChar *p = *pp;
- if (CaseFoldInited == 0) init_case_fold_table();
-
code = ONIGENC_MBC_TO_CODE(enc, p, end);
len = enclen(enc, p, end);
*pp += len;
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
- if (code == 0x0049) {
- return ONIGENC_CODE_TO_MBC(enc, 0x0131, fold);
+ if (code == 'I') {
+ return ONIGENC_CODE_TO_MBC(enc, DOTLESS_i, fold);
}
- else if (code == 0x0130) {
- return ONIGENC_CODE_TO_MBC(enc, 0x0069, fold);
+ else if (code == I_WITH_DOT_ABOVE) {
+ return ONIGENC_CODE_TO_MBC(enc, 'i', fold);
}
}
#endif
- if (onig_st_lookup(FoldTable, (st_data_t )code, (void* )&to) != 0) {
- if (to->n == 1) {
+ if ((to = onigenc_unicode_fold_lookup(code)) != 0) {
+ if (OnigCodePointCount(to->n) == 1) {
return ONIGENC_CODE_TO_MBC(enc, to->code[0], fold);
}
#if 0
@@ -325,7 +299,7 @@ onigenc_unicode_mbc_case_fold(OnigEncoding enc,
#endif
{
rlen = 0;
- for (i = 0; i < to->n; i++) {
+ for (i = 0; i < OnigCodePointCount(to->n); i++) {
len = ONIGENC_CODE_TO_MBC(enc, to->code[i], fold);
fold += len;
rlen += len;
@@ -349,11 +323,9 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,
OnigCodePoint code;
int i, j, k, r;
- /* if (CaseFoldInited == 0) init_case_fold_table(); */
-
for (i = 0; i < numberof(CaseUnfold_11); i++) {
p11 = &CaseUnfold_11[i];
- for (j = 0; j < p11->to.n; j++) {
+ for (j = 0; j < OnigCodePointCount(p11->to.n); j++) {
code = p11->from;
r = (*f)(p11->to.code[j], &code, 1, arg);
if (r != 0) return r;
@@ -374,25 +346,25 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
- code = 0x0131;
- r = (*f)(0x0049, &code, 1, arg);
+ code = DOTLESS_i;
+ r = (*f)('I', &code, 1, arg);
if (r != 0) return r;
- code = 0x0049;
- r = (*f)(0x0131, &code, 1, arg);
+ code = 'I';
+ r = (*f)(DOTLESS_i, &code, 1, arg);
if (r != 0) return r;
- code = 0x0130;
- r = (*f)(0x0069, &code, 1, arg);
+ code = I_WITH_DOT_ABOVE;
+ r = (*f)('i', &code, 1, arg);
if (r != 0) return r;
- code = 0x0069;
- r = (*f)(0x0130, &code, 1, arg);
+ code = 'i';
+ r = (*f)(I_WITH_DOT_ABOVE, &code, 1, arg);
if (r != 0) return r;
}
else {
#endif
for (i = 0; i < numberof(CaseUnfold_11_Locale); i++) {
p11 = &CaseUnfold_11_Locale[i];
- for (j = 0; j < p11->to.n; j++) {
+ for (j = 0; j < OnigCodePointCount(p11->to.n); j++) {
code = p11->from;
r = (*f)(p11->to.code[j], &code, 1, arg);
if (r != 0) return r;
@@ -418,12 +390,12 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,
if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
for (i = 0; i < numberof(CaseUnfold_12); i++) {
- for (j = 0; j < CaseUnfold_12[i].to.n; j++) {
+ for (j = 0; j < OnigCodePointCount(CaseUnfold_12[i].to.n); j++) {
r = (*f)(CaseUnfold_12[i].to.code[j],
(OnigCodePoint* )CaseUnfold_12[i].from, 2, arg);
if (r != 0) return r;
- for (k = 0; k < CaseUnfold_12[i].to.n; k++) {
+ for (k = 0; k < OnigCodePointCount(CaseUnfold_12[i].to.n); k++) {
if (k == j) continue;
r = (*f)(CaseUnfold_12[i].to.code[j],
@@ -437,12 +409,12 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,
if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) == 0) {
#endif
for (i = 0; i < numberof(CaseUnfold_12_Locale); i++) {
- for (j = 0; j < CaseUnfold_12_Locale[i].to.n; j++) {
+ for (j = 0; j < OnigCodePointCount(CaseUnfold_12_Locale[i].to.n); j++) {
r = (*f)(CaseUnfold_12_Locale[i].to.code[j],
(OnigCodePoint* )CaseUnfold_12_Locale[i].from, 2, arg);
if (r != 0) return r;
- for (k = 0; k < CaseUnfold_12_Locale[i].to.n; k++) {
+ for (k = 0; k < OnigCodePointCount(CaseUnfold_12_Locale[i].to.n); k++) {
if (k == j) continue;
r = (*f)(CaseUnfold_12_Locale[i].to.code[j],
@@ -457,12 +429,12 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,
#endif
for (i = 0; i < numberof(CaseUnfold_13); i++) {
- for (j = 0; j < CaseUnfold_13[i].to.n; j++) {
+ for (j = 0; j < OnigCodePointCount(CaseUnfold_13[i].to.n); j++) {
r = (*f)(CaseUnfold_13[i].to.code[j],
(OnigCodePoint* )CaseUnfold_13[i].from, 3, arg);
if (r != 0) return r;
- for (k = 0; k < CaseUnfold_13[i].to.n; k++) {
+ for (k = 0; k < OnigCodePointCount(CaseUnfold_13[i].to.n); k++) {
if (k == j) continue;
r = (*f)(CaseUnfold_13[i].to.code[j],
@@ -476,6 +448,8 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,
return 0;
}
+#define CodePointListValidP(x) (OnigCodePointCount((x)->n) <= numberof((x)->code)) /* true when the decoded count does not exceed the size of the list's code[] array */
+
extern int
onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end,
@@ -483,10 +457,8 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
{
int n, i, j, k, len;
OnigCodePoint code, codes[3];
- CodePointList3 *to, *z3;
- CodePointList2 *z2;
-
- if (CaseFoldInited == 0) init_case_fold_table();
+ const CodePointList3 *to, *z3;
+ const CodePointList2 *z2;
n = 0;
@@ -495,35 +467,37 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
- if (code == 0x0049) {
+ switch (code) {
+ case 'I':
items[0].byte_len = len;
items[0].code_len = 1;
- items[0].code[0] = 0x0131;
+ items[0].code[0] = DOTLESS_i;
return 1;
- }
- else if (code == 0x0130) {
+ case I_WITH_DOT_ABOVE:
items[0].byte_len = len;
items[0].code_len = 1;
- items[0].code[0] = 0x0069;
+ items[0].code[0] = 'i';
return 1;
- }
- else if (code == 0x0131) {
+ case DOTLESS_i:
items[0].byte_len = len;
items[0].code_len = 1;
- items[0].code[0] = 0x0049;
+ items[0].code[0] = 'I';
return 1;
- }
- else if (code == 0x0069) {
+ case 'i':
items[0].byte_len = len;
items[0].code_len = 1;
- items[0].code[0] = 0x0130;
+ items[0].code[0] = I_WITH_DOT_ABOVE;
return 1;
}
}
#endif
- if (onig_st_lookup(FoldTable, (st_data_t )code, (void* )&to) != 0) {
- if (to->n == 1) {
+ if ((to = onigenc_unicode_fold_lookup(code)) != 0) {
+ if (OnigCodePointCount(to->n) == 0) {
+ /* any codepoint should not be empty */
+ UNREACHABLE_RETURN(0);
+ }
+ if (OnigCodePointCount(to->n) == 1) {
OnigCodePoint orig_code = code;
items[0].byte_len = len;
@@ -532,8 +506,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
n++;
code = to->code[0];
- if (onig_st_lookup(Unfold1Table, (st_data_t )code, (void* )&to) != 0) {
- for (i = 0; i < to->n; i++) {
+ if ((to = onigenc_unicode_unfold1_lookup(code)) != 0 &&
+ CodePointListValidP(to)) {
+ for (i = 0; i < OnigCodePointCount(to->n); i++) {
if (to->code[i] != orig_code) {
items[n].byte_len = len;
items[n].code_len = 1;
@@ -547,14 +522,13 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
OnigCodePoint cs[3][4];
int fn, ncs[3];
- for (fn = 0; fn < to->n; fn++) {
+ for (fn = 0; fn < OnigCodePointCount(to->n); fn++) {
cs[fn][0] = to->code[fn];
- if (onig_st_lookup(Unfold1Table, (st_data_t )cs[fn][0],
- (void* )&z3) != 0) {
- for (i = 0; i < z3->n; i++) {
+ if ((z3 = onigenc_unicode_unfold1_lookup(cs[fn][0])) != 0) {
+ for (i = 0; i < OnigCodePointCount(z3->n); i++) {
cs[fn][i+1] = z3->code[i];
}
- ncs[fn] = z3->n + 1;
+ ncs[fn] = OnigCodePointCount(z3->n) + 1;
}
else
ncs[fn] = 1;
@@ -571,9 +545,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
}
}
- if (onig_st_lookup(Unfold2Table, (st_data_t )to->code,
- (void* )&z2) != 0) {
- for (i = 0; i < z2->n; i++) {
+ if ((z2 = onigenc_unicode_unfold2_lookup(to->code)) != 0 &&
+ CodePointListValidP(z2)) {
+ for (i = 0; i < OnigCodePointCount(z2->n); i++) {
if (z2->code[i] == code) continue;
items[n].byte_len = len;
@@ -597,9 +571,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
}
}
- if (onig_st_lookup(Unfold3Table, (st_data_t )to->code,
- (void* )&z2) != 0) {
- for (i = 0; i < z2->n; i++) {
+ if ((z2 = onigenc_unicode_unfold3_lookup(to->code)) != 0 &&
+ CodePointListValidP(z2)) {
+ for (i = 0; i < OnigCodePointCount(z2->n); i++) {
if (z2->code[i] == code) continue;
items[n].byte_len = len;
@@ -615,8 +589,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
}
}
else {
- if (onig_st_lookup(Unfold1Table, (st_data_t )code, (void* )&to) != 0) {
- for (i = 0; i < to->n; i++) {
+ if ((to = onigenc_unicode_unfold1_lookup(code)) != 0 &&
+ CodePointListValidP(to)) {
+ for (i = 0; i < OnigCodePointCount(to->n); i++) {
items[n].byte_len = len;
items[n].code_len = 1;
items[n].code[0] = to->code[i];
@@ -633,8 +608,8 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
codes[0] = code;
code = ONIGENC_MBC_TO_CODE(enc, p, end);
- if (onig_st_lookup(FoldTable, (st_data_t )code, (void* )&to) != 0
- && to->n == 1) {
+ if ((to = onigenc_unicode_fold_lookup(code)) != 0
+ && OnigCodePointCount(to->n) == 1) {
codes[1] = to->code[0];
}
else
@@ -642,8 +617,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
clen = enclen(enc, p, end);
len += clen;
- if (onig_st_lookup(Unfold2Table, (st_data_t )codes, (void* )&z2) != 0) {
- for (i = 0; i < z2->n; i++) {
+ if ((z2 = onigenc_unicode_unfold2_lookup(codes)) != 0 &&
+ CodePointListValidP(z2)) {
+ for (i = 0; i < OnigCodePointCount(z2->n); i++) {
items[n].byte_len = len;
items[n].code_len = 1;
items[n].code[0] = z2->code[i];
@@ -654,8 +630,8 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
p += clen;
if (p < end) {
code = ONIGENC_MBC_TO_CODE(enc, p, end);
- if (onig_st_lookup(FoldTable, (st_data_t )code, (void* )&to) != 0
- && to->n == 1) {
+ if ((to = onigenc_unicode_fold_lookup(code)) != 0
+ && OnigCodePointCount(to->n) == 1) {
codes[2] = to->code[0];
}
else
@@ -663,9 +639,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
clen = enclen(enc, p, end);
len += clen;
- if (onig_st_lookup(Unfold3Table, (st_data_t )codes,
- (void* )&z2) != 0) {
- for (i = 0; i < z2->n; i++) {
+ if ((z2 = onigenc_unicode_unfold3_lookup(codes)) != 0 &&
+ CodePointListValidP(z2)) {
+ for (i = 0; i < OnigCodePointCount(z2->n); i++) {
items[n].byte_len = len;
items[n].code_len = 1;
items[n].code[0] = z2->code[i];
@@ -678,3 +654,165 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
return n;
}
+
+#ifdef USE_CASE_MAP_API
+/* length in bytes for three characters in UTF-32; e.g. needed for ffi (U+FB03) */
+#define CASE_MAPPING_SLACK 12
+#define MODIFIED (flags |= ONIGENC_CASE_MODIFIED)
+extern int /* case-maps the text at *pp into the buffer at to; returns the number of OnigUChar units written, or a negative value for invalid input */
+onigenc_unicode_case_map(OnigCaseFoldType* flagP, /* in/out: requested flags; the working copy is stored back on normal return only */
+ const OnigUChar** pp, const OnigUChar* end, /* in/out: *pp is advanced past every character consumed; end bounds the input */
+ OnigUChar* to, OnigUChar* to_end, /* output buffer start and end */
+ const struct OnigEncodingTypeST* enc) /* encoding used to decode the input and encode the output */
+{
+ OnigCodePoint code;
+ OnigUChar *to_start = to;
+ OnigCaseFoldType flags = *flagP;
+ int codepoint_length;
+
+ to_end -= CASE_MAPPING_SLACK; /* keep CASE_MAPPING_SLACK units in reserve for the output of one iteration */
+ /* copy flags ONIGENC_CASE_UPCASE and ONIGENC_CASE_DOWNCASE over to
+ * ONIGENC_CASE_UP_SPECIAL and ONIGENC_CASE_DOWN_SPECIAL */
+ flags |= (flags & (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) << ONIGENC_CASE_SPECIAL_OFFSET;
+
+ while (*pp < end && to <= to_end) { /* stops at end of input, or once less than the reserved slack remains in the output */
+ codepoint_length = ONIGENC_PRECISE_MBC_ENC_LEN(enc, *pp, end);
+ if (codepoint_length < 0)
+ return codepoint_length; /* encoding invalid */
+ code = ONIGENC_MBC_TO_CODE(enc, *pp, end);
+ *pp += codepoint_length;
+
+ if (code <= 'z') { /* ASCII comes first */
+ if (code >= 'a' /*&& code <= 'z'*/) {
+ if (flags & ONIGENC_CASE_UPCASE) {
+ MODIFIED;
+ if (flags & ONIGENC_CASE_FOLD_TURKISH_AZERI && code == 'i')
+ code = I_WITH_DOT_ABOVE;
+ else
+ code -= 'a' - 'A'; /* ASCII lowercase letter to its uppercase counterpart */
+ }
+ }
+ else if (code >= 'A' && code <= 'Z') {
+ if (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD)) {
+ MODIFIED;
+ if (flags & ONIGENC_CASE_FOLD_TURKISH_AZERI && code == 'I')
+ code = DOTLESS_i;
+ else
+ code += 'a' - 'A'; /* ASCII uppercase letter to its lowercase counterpart */
+ }
+ }
+ }
+ else if (!(flags & ONIGENC_CASE_ASCII_ONLY) && code >= 0x00B5) { /* deal with non-ASCII; micro sign (U+00B5) is lowest affected */
+ const CodePointList3 *folded;
+
+ if (code == I_WITH_DOT_ABOVE) {
+ if (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD)) {
+ MODIFIED;
+ code = 'i';
+ if (!(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI)) { /* make dot above explicit */
+ to += ONIGENC_CODE_TO_MBC(enc, code, to);
+ code = DOT_ABOVE;
+ }
+ }
+ }
+ else if (code == DOTLESS_i) { /* handle this manually, because it isn't involved in folding */
+ if (flags & ONIGENC_CASE_UPCASE) {
+ MODIFIED;
+ code = 'I';
+ }
+ }
+ else if ((folded = onigenc_unicode_fold_lookup(code)) != 0) { /* data about character found in CaseFold_11_Table */
+ if ((flags & ONIGENC_CASE_TITLECASE) && code>=0x1C90 && code<=0x1CBF) { /* Georgian MTAVRULI */
+ MODIFIED;
+ code += 0x10D0 - 0x1C90; /* same offset, rebased from U+1C90 to U+10D0 */
+ }
+ else if ((flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, */
+ && (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE)) { /* but already Titlecase */
+ /* already Titlecase, no changes needed */
+ }
+ else if (flags & OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */
+ const OnigCodePoint *next;
+ int count;
+
+ MODIFIED;
+ if (flags & OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_SPECIALS) { /* special */
+ const OnigCodePoint *SpecialsStart = CaseMappingSpecials + OnigSpecialIndexDecode(folded->n);
+
+ if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE) { /* swapCASE available */
+ if ((flags & (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE))
+ == (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) /* swapCASE needed */
+ goto SpecialsCopy;
+ else /* swapCASE not needed */
+ SpecialsStart += SpecialsLengthExtract(*SpecialsStart); /* skip this entry: advance by its encoded length */
+ }
+ if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_TITLECASE) { /* Titlecase available */
+ if (flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, but not yet Titlecase */
+ goto SpecialsCopy;
+ else /* Titlecase not needed */
+ SpecialsStart += SpecialsLengthExtract(*SpecialsStart);
+ }
+ if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_DOWN_SPECIAL) {
+ if (!(flags & ONIGENC_CASE_DOWN_SPECIAL))
+ SpecialsStart += SpecialsLengthExtract(*SpecialsStart);
+ }
+ /* here, we know we use ONIGENC_CASE_UP_SPECIAL, and the position is right */
+SpecialsCopy:
+ count = SpecialsLengthExtract(*SpecialsStart);
+ next = SpecialsStart;
+ code = SpecialsCodepointExtract(*next++); /* first element carries the length field too, so mask it off */
+ }
+ else { /* no specials */
+ count = OnigCodePointCount(folded->n);
+ next = folded->code;
+ code = *next++;
+ }
+ if (count == 1)
+ ; /* nothing extra to emit: code is written by the common call after the if/else chain */
+ else if (count == 2) {
+ to += ONIGENC_CODE_TO_MBC(enc, code, to);
+ code = *next;
+ }
+ else { /* count == 3 */
+ to += ONIGENC_CODE_TO_MBC(enc, code, to);
+ to += ONIGENC_CODE_TO_MBC(enc, *next++, to);
+ code = *next;
+ }
+ }
+ }
+ else if ((folded = onigenc_unicode_unfold1_lookup(code)) != 0) { /* data about character found in CaseUnfold_11_Table */
+ if ((flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, */
+ && (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE)) { /* but already Titlecase */
+ /* already Titlecase, no changes needed */
+ }
+ else if (flags & OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */
+ MODIFIED;
+ code = folded->code[(flags & OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_TITLECASE) ? 1 : 0]; /* code[1] when the matching flag is TITLECASE, otherwise code[0] */
+ }
+ }
+ }
+ to += ONIGENC_CODE_TO_MBC(enc, code, to); /* common output step: writes the last (or only) code point for this character */
+ /* switch from titlecase to lowercase for capitalize */
+ if (flags & ONIGENC_CASE_TITLECASE)
+ flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE |
+ ONIGENC_CASE_UP_SPECIAL | ONIGENC_CASE_DOWN_SPECIAL);
+ }
+ *flagP = flags;
+ return (int )(to - to_start); /* distance the output pointer advanced */
+}
+#endif
+
+const char onigenc_unicode_version_string[] = /* text of ONIG_UNICODE_VERSION_STRING when that macro is defined, otherwise the empty string */
+#ifdef ONIG_UNICODE_VERSION_STRING
+ ONIG_UNICODE_VERSION_STRING
+#endif
+ ""; /* assumes the macro expands to a string literal, so this adjacent "" concatenates without adding text */
+
+const int onigenc_unicode_version_number[3] = { /* { major, minor, teeny } */
+#ifdef ONIG_UNICODE_VERSION_MAJOR
+ ONIG_UNICODE_VERSION_MAJOR,
+ ONIG_UNICODE_VERSION_MINOR,
+ ONIG_UNICODE_VERSION_TEENY,
+#else
+ 0 /* fallback: first element is 0 and the remaining two are zero-initialized */
+#endif
+};