diff options
Diffstat (limited to 'enc/unicode.c')
| -rw-r--r-- | enc/unicode.c | 323 |
1 files changed, 268 insertions, 55 deletions
diff --git a/enc/unicode.c b/enc/unicode.c index 12d3a5414b..5bc806863e 100644 --- a/enc/unicode.c +++ b/enc/unicode.c @@ -2,7 +2,7 @@ unicode.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2013 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -137,9 +137,44 @@ code3_equal(const OnigCodePoint *x, const OnigCodePoint *y) return 1; } -#include "enc/unicode/casefold.h" - -#include "enc/unicode/name2ctype.h" +/* macros related to ONIGENC_CASE flags */ +/* defined here because not used in other files */ +#define ONIGENC_CASE_SPECIALS (ONIGENC_CASE_TITLECASE | ONIGENC_CASE_IS_TITLECASE | ONIGENC_CASE_UP_SPECIAL | ONIGENC_CASE_DOWN_SPECIAL) + +/* macros for length in CaseMappingSpecials array in enc/unicode/casefold.h */ +#define SpecialsLengthOffset 25 /* needs to be higher than the 22 bits used for Unicode codepoints */ +#define SpecialsLengthExtract(n) ((n) >> SpecialsLengthOffset) +#define SpecialsCodepointExtract(n) ((n) & ((1 << SpecialsLengthOffset) - 1)) +#define SpecialsLengthEncode(n) ((n) << SpecialsLengthOffset) + +#define OnigSpecialIndexMask (((1 << OnigSpecialIndexWidth) - 1) << OnigSpecialIndexShift) +#define OnigSpecialIndexEncode(n) ((n) << OnigSpecialIndexShift) +#define OnigSpecialIndexDecode(n) (((n) & OnigSpecialIndexMask) >> OnigSpecialIndexShift) + +/* macros to shorten "enc/unicode/casefold.h", undefined immediately after including the file */ +#define U ONIGENC_CASE_UPCASE +#define D ONIGENC_CASE_DOWNCASE +#define F ONIGENC_CASE_FOLD +#define ST ONIGENC_CASE_TITLECASE +#define SU ONIGENC_CASE_UP_SPECIAL +#define SL ONIGENC_CASE_DOWN_SPECIAL +#define IT ONIGENC_CASE_IS_TITLECASE +#define I(n) OnigSpecialIndexEncode(n) +#define L(n) SpecialsLengthEncode(n) + +#include "casefold.h" + +#undef U +#undef D +#undef F +#undef ST +#undef SU +#undef SL +#undef IT +#undef I +#undef L + +#include "name2ctype.h" #define CODE_RANGES_NUM numberof(CodeRanges) @@ -221,6 +256,12 @@ onigenc_unicode_property_name_to_ctype(OnigEncoding enc, const UChar* name, cons #define onigenc_unicode_unfold2_lookup onigenc_unicode_CaseUnfold_12_lookup #define onigenc_unicode_unfold3_lookup onigenc_unicode_CaseUnfold_13_lookup +enum { + I_WITH_DOT_ABOVE = 0x0130, + DOTLESS_i = 0x0131, + DOT_ABOVE = 0x0307 +}; + extern int onigenc_unicode_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED, const UChar** pp, const UChar* end, @@ -237,17 +278,17 @@ onigenc_unicode_mbc_case_fold(OnigEncoding enc, #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { - if (code == 0x0049) { - return ONIGENC_CODE_TO_MBC(enc, 0x0131, fold); + if (code == 'I') { + return ONIGENC_CODE_TO_MBC(enc, DOTLESS_i, fold); } - else if (code == 0x0130) { - return ONIGENC_CODE_TO_MBC(enc, 0x0069, fold); + else if (code == I_WITH_DOT_ABOVE) { + return ONIGENC_CODE_TO_MBC(enc, 'i', fold); } } #endif if ((to = onigenc_unicode_fold_lookup(code)) != 0) { - if (to->n == 1) { + if (OnigCodePointCount(to->n) == 1) { return ONIGENC_CODE_TO_MBC(enc, to->code[0], fold); } #if 0 @@ -258,7 +299,7 @@ onigenc_unicode_mbc_case_fold(OnigEncoding enc, #endif { rlen = 0; - for (i = 0; i < to->n; i++) { + for (i = 0; i < OnigCodePointCount(to->n); i++) { len = ONIGENC_CODE_TO_MBC(enc, to->code[i], fold); fold += len; rlen += len; @@ -284,7 +325,7 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, for (i = 0; i < numberof(CaseUnfold_11); i++) { p11 = &CaseUnfold_11[i]; - for (j = 0; j < p11->to.n; j++) { + for (j = 0; j < OnigCodePointCount(p11->to.n); j++) { code = p11->from; r = (*f)(p11->to.code[j], &code, 1, arg); if (r != 0) return r; @@ -305,25 +346,25 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { - code = 0x0131; - r = (*f)(0x0049, &code, 1, arg); + code = DOTLESS_i; + r = (*f)('I', &code, 1, arg); if (r != 0) return r; - code = 0x0049; - r = (*f)(0x0131, &code, 1, arg); + code = 'I'; + r = (*f)(DOTLESS_i, &code, 1, arg); if (r != 0) return r; - code = 0x0130; - r = (*f)(0x0069, &code, 1, arg); + code = I_WITH_DOT_ABOVE; + r = (*f)('i', &code, 1, arg); if (r != 0) return r; - code = 0x0069; - r = (*f)(0x0130, &code, 1, arg); + code = 'i'; + r = (*f)(I_WITH_DOT_ABOVE, &code, 1, arg); if (r != 0) return r; } else { #endif for (i = 0; i < numberof(CaseUnfold_11_Locale); i++) { p11 = &CaseUnfold_11_Locale[i]; - for (j = 0; j < p11->to.n; j++) { + for (j = 0; j < OnigCodePointCount(p11->to.n); j++) { code = p11->from; r = (*f)(p11->to.code[j], &code, 1, arg); if (r != 0) return r; @@ -349,12 +390,12 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { for (i = 0; i < numberof(CaseUnfold_12); i++) { - for (j = 0; j < CaseUnfold_12[i].to.n; j++) { + for (j = 0; j < OnigCodePointCount(CaseUnfold_12[i].to.n); j++) { r = (*f)(CaseUnfold_12[i].to.code[j], (OnigCodePoint* )CaseUnfold_12[i].from, 2, arg); if (r != 0) return r; - for (k = 0; k < CaseUnfold_12[i].to.n; k++) { + for (k = 0; k < OnigCodePointCount(CaseUnfold_12[i].to.n); k++) { if (k == j) continue; r = (*f)(CaseUnfold_12[i].to.code[j], @@ -368,12 +409,12 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) == 0) { #endif for (i = 0; i < numberof(CaseUnfold_12_Locale); i++) { - for (j = 0; j < CaseUnfold_12_Locale[i].to.n; j++) { + for (j = 0; j < OnigCodePointCount(CaseUnfold_12_Locale[i].to.n); j++) { r = (*f)(CaseUnfold_12_Locale[i].to.code[j], (OnigCodePoint* )CaseUnfold_12_Locale[i].from, 2, arg); if (r != 0) return r; - for (k = 0; k < CaseUnfold_12_Locale[i].to.n; k++) { + for (k = 0; k < OnigCodePointCount(CaseUnfold_12_Locale[i].to.n); k++) { if (k == j) continue; r = (*f)(CaseUnfold_12_Locale[i].to.code[j], @@ -388,12 +429,12 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, #endif for (i = 0; i < numberof(CaseUnfold_13); i++) { - for (j = 0; j < CaseUnfold_13[i].to.n; j++) { + for (j = 0; j < OnigCodePointCount(CaseUnfold_13[i].to.n); j++) { r = (*f)(CaseUnfold_13[i].to.code[j], (OnigCodePoint* )CaseUnfold_13[i].from, 3, arg); if (r != 0) return r; - for (k = 0; k < CaseUnfold_13[i].to.n; k++) { + for (k = 0; k < OnigCodePointCount(CaseUnfold_13[i].to.n); k++) { if (k == j) continue; r = (*f)(CaseUnfold_13[i].to.code[j], @@ -407,6 +448,8 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, return 0; } +#define CodePointListValidP(x) (OnigCodePointCount((x)->n) <= numberof((x)->code)) + extern int onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, @@ -424,35 +467,37 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { - if (code == 0x0049) { + switch (code) { + case 'I': items[0].byte_len = len; items[0].code_len = 1; - items[0].code[0] = 0x0131; + items[0].code[0] = DOTLESS_i; return 1; - } - else if (code == 0x0130) { + case I_WITH_DOT_ABOVE: items[0].byte_len = len; items[0].code_len = 1; - items[0].code[0] = 0x0069; + items[0].code[0] = 'i'; return 1; - } - else if (code == 0x0131) { + case DOTLESS_i: items[0].byte_len = len; items[0].code_len = 1; - items[0].code[0] = 0x0049; + items[0].code[0] = 'I'; return 1; - } - else if (code == 0x0069) { + case 'i': items[0].byte_len = len; items[0].code_len = 1; - items[0].code[0] = 0x0130; + items[0].code[0] = I_WITH_DOT_ABOVE; return 1; } } #endif if ((to = onigenc_unicode_fold_lookup(code)) != 0) { - if (to->n == 1) { + if (OnigCodePointCount(to->n) == 0) { + /* any codepoint should not be empty */ + UNREACHABLE_RETURN(0); + } + if (OnigCodePointCount(to->n) == 1) { OnigCodePoint orig_code = code; items[0].byte_len = len; @@ -461,8 +506,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, n++; code = to->code[0]; - if ((to = onigenc_unicode_unfold1_lookup(code)) != 0) { - for (i = 0; i < to->n; i++) { + if ((to = onigenc_unicode_unfold1_lookup(code)) != 0 && + CodePointListValidP(to)) { + for (i = 0; i < OnigCodePointCount(to->n); i++) { if (to->code[i] != orig_code) { items[n].byte_len = len; items[n].code_len = 1; @@ -476,13 +522,13 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, OnigCodePoint cs[3][4]; int fn, ncs[3]; - for (fn = 0; fn < to->n; fn++) { + for (fn = 0; fn < OnigCodePointCount(to->n); fn++) { cs[fn][0] = to->code[fn]; if ((z3 = onigenc_unicode_unfold1_lookup(cs[fn][0])) != 0) { - for (i = 0; i < z3->n; i++) { + for (i = 0; i < OnigCodePointCount(z3->n); i++) { cs[fn][i+1] = z3->code[i]; } - ncs[fn] = z3->n + 1; + ncs[fn] = OnigCodePointCount(z3->n) + 1; } else ncs[fn] = 1; @@ -499,8 +545,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, } } - if ((z2 = onigenc_unicode_unfold2_lookup(to->code)) != 0) { - for (i = 0; i < z2->n; i++) { + if ((z2 = onigenc_unicode_unfold2_lookup(to->code)) != 0 && + CodePointListValidP(z2)) { + for (i = 0; i < OnigCodePointCount(z2->n); i++) { if (z2->code[i] == code) continue; items[n].byte_len = len; @@ -524,8 +571,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, } } - if ((z2 = onigenc_unicode_unfold3_lookup(to->code)) != 0) { - for (i = 0; i < z2->n; i++) { + if ((z2 = onigenc_unicode_unfold3_lookup(to->code)) != 0 && + CodePointListValidP(z2)) { + for (i = 0; i < OnigCodePointCount(z2->n); i++) { if (z2->code[i] == code) continue; items[n].byte_len = len; @@ -541,8 +589,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, } } else { - if ((to = onigenc_unicode_unfold1_lookup(code)) != 0) { - for (i = 0; i < to->n; i++) { + if ((to = onigenc_unicode_unfold1_lookup(code)) != 0 && + CodePointListValidP(to)) { + for (i = 0; i < OnigCodePointCount(to->n); i++) { items[n].byte_len = len; items[n].code_len = 1; items[n].code[0] = to->code[i]; @@ -560,7 +609,7 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, codes[0] = code; code = ONIGENC_MBC_TO_CODE(enc, p, end); if ((to = onigenc_unicode_fold_lookup(code)) != 0 - && to->n == 1) { + && OnigCodePointCount(to->n) == 1) { codes[1] = to->code[0]; } else @@ -568,8 +617,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, clen = enclen(enc, p, end); len += clen; - if ((z2 = onigenc_unicode_unfold2_lookup(codes)) != 0) { - for (i = 0; i < z2->n; i++) { + if ((z2 = onigenc_unicode_unfold2_lookup(codes)) != 0 && + CodePointListValidP(z2)) { + for (i = 0; i < OnigCodePointCount(z2->n); i++) { items[n].byte_len = len; items[n].code_len = 1; items[n].code[0] = z2->code[i]; @@ -581,7 +631,7 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, if (p < end) { code = ONIGENC_MBC_TO_CODE(enc, p, end); if ((to = onigenc_unicode_fold_lookup(code)) != 0 - && to->n == 1) { + && OnigCodePointCount(to->n) == 1) { codes[2] = to->code[0]; } else @@ -589,8 +639,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, clen = enclen(enc, p, end); len += clen; - if ((z2 = onigenc_unicode_unfold3_lookup(codes)) != 0) { - for (i = 0; i < z2->n; i++) { + if ((z2 = onigenc_unicode_unfold3_lookup(codes)) != 0 && + CodePointListValidP(z2)) { + for (i = 0; i < OnigCodePointCount(z2->n); i++) { items[n].byte_len = len; items[n].code_len = 1; items[n].code[0] = z2->code[i]; @@ -603,3 +654,165 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, return n; } + +#ifdef USE_CASE_MAP_API +/* length in bytes for three characters in UTF-32; e.g. needed for ffi (U+FB03) */ +#define CASE_MAPPING_SLACK 12 +#define MODIFIED (flags |= ONIGENC_CASE_MODIFIED) +extern int +onigenc_unicode_case_map(OnigCaseFoldType* flagP, + const OnigUChar** pp, const OnigUChar* end, + OnigUChar* to, OnigUChar* to_end, + const struct OnigEncodingTypeST* enc) +{ + OnigCodePoint code; + OnigUChar *to_start = to; + OnigCaseFoldType flags = *flagP; + int codepoint_length; + + to_end -= CASE_MAPPING_SLACK; + /* copy flags ONIGENC_CASE_UPCASE and ONIGENC_CASE_DOWNCASE over to + * ONIGENC_CASE_UP_SPECIAL and ONIGENC_CASE_DOWN_SPECIAL */ + flags |= (flags & (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) << ONIGENC_CASE_SPECIAL_OFFSET; + + while (*pp < end && to <= to_end) { + codepoint_length = ONIGENC_PRECISE_MBC_ENC_LEN(enc, *pp, end); + if (codepoint_length < 0) + return codepoint_length; /* encoding invalid */ + code = ONIGENC_MBC_TO_CODE(enc, *pp, end); + *pp += codepoint_length; + + if (code <= 'z') { /* ASCII comes first */ + if (code >= 'a' /*&& code <= 'z'*/) { + if (flags & ONIGENC_CASE_UPCASE) { + MODIFIED; + if (flags & ONIGENC_CASE_FOLD_TURKISH_AZERI && code == 'i') + code = I_WITH_DOT_ABOVE; + else + code -= 'a' - 'A'; + } + } + else if (code >= 'A' && code <= 'Z') { + if (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD)) { + MODIFIED; + if (flags & ONIGENC_CASE_FOLD_TURKISH_AZERI && code == 'I') + code = DOTLESS_i; + else + code += 'a' - 'A'; + } + } + } + else if (!(flags & ONIGENC_CASE_ASCII_ONLY) && code >= 0x00B5) { /* deal with non-ASCII; micron sign (U+00B5) is lowest affected */ + const CodePointList3 *folded; + + if (code == I_WITH_DOT_ABOVE) { + if (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD)) { + MODIFIED; + code = 'i'; + if (!(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI)) { /* make dot above explicit */ + to += ONIGENC_CODE_TO_MBC(enc, code, to); + code = DOT_ABOVE; + } + } + } + else if (code == DOTLESS_i) { /* handle this manually, because it isn't involved in folding */ + if (flags & ONIGENC_CASE_UPCASE) { + MODIFIED; + code = 'I'; + } + } + else if ((folded = onigenc_unicode_fold_lookup(code)) != 0) { /* data about character found in CaseFold_11_Table */ + if ((flags & ONIGENC_CASE_TITLECASE) && code>=0x1C90 && code<=0x1CBF) { /* Georgian MTAVRULI */ + MODIFIED; + code += 0x10D0 - 0x1C90; + } + else if ((flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, */ + && (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE)) { /* but already Titlecase */ + /* already Titlecase, no changes needed */ + } + else if (flags & OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */ + const OnigCodePoint *next; + int count; + + MODIFIED; + if (flags & OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_SPECIALS) { /* special */ + const OnigCodePoint *SpecialsStart = CaseMappingSpecials + OnigSpecialIndexDecode(folded->n); + + if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE) { /* swapCASE available */ + if ((flags & (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) + == (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) /* swapCASE needed */ + goto SpecialsCopy; + else /* swapCASE not needed */ + SpecialsStart += SpecialsLengthExtract(*SpecialsStart); + } + if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_TITLECASE) { /* Titlecase available */ + if (flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, but not yet Titlecase */ + goto SpecialsCopy; + else /* Titlecase not needed */ + SpecialsStart += SpecialsLengthExtract(*SpecialsStart); + } + if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_DOWN_SPECIAL) { + if (!(flags & ONIGENC_CASE_DOWN_SPECIAL)) + SpecialsStart += SpecialsLengthExtract(*SpecialsStart); + } + /* here, we know we use ONIGENC_CASE_UP_SPECIAL, and the position is right */ +SpecialsCopy: + count = SpecialsLengthExtract(*SpecialsStart); + next = SpecialsStart; + code = SpecialsCodepointExtract(*next++); + } + else { /* no specials */ + count = OnigCodePointCount(folded->n); + next = folded->code; + code = *next++; + } + if (count == 1) + ; + else if (count == 2) { + to += ONIGENC_CODE_TO_MBC(enc, code, to); + code = *next; + } + else { /* count == 3 */ + to += ONIGENC_CODE_TO_MBC(enc, code, to); + to += ONIGENC_CODE_TO_MBC(enc, *next++, to); + code = *next; + } + } + } + else if ((folded = onigenc_unicode_unfold1_lookup(code)) != 0) { /* data about character found in CaseUnfold_11_Table */ + if ((flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, */ + && (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE)) { /* but already Titlecase */ + /* already Titlecase, no changes needed */ + } + else if (flags & OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */ + MODIFIED; + code = folded->code[(flags & OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_TITLECASE) ? 1 : 0]; + } + } + } + to += ONIGENC_CODE_TO_MBC(enc, code, to); + /* switch from titlecase to lowercase for capitalize */ + if (flags & ONIGENC_CASE_TITLECASE) + flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE | + ONIGENC_CASE_UP_SPECIAL | ONIGENC_CASE_DOWN_SPECIAL); + } + *flagP = flags; + return (int )(to - to_start); +} +#endif + +const char onigenc_unicode_version_string[] = +#ifdef ONIG_UNICODE_VERSION_STRING + ONIG_UNICODE_VERSION_STRING +#endif + ""; + +const int onigenc_unicode_version_number[3] = { +#ifdef ONIG_UNICODE_VERSION_MAJOR + ONIG_UNICODE_VERSION_MAJOR, + ONIG_UNICODE_VERSION_MINOR, + ONIG_UNICODE_VERSION_TEENY, +#else + 0 +#endif +}; |
