diff options
Diffstat (limited to 'enc/unicode.c')
| -rw-r--r-- | enc/unicode.c | 494 |
1 file changed, 316 insertions, 178 deletions
diff --git a/enc/unicode.c b/enc/unicode.c index 20990c1e54..5bc806863e 100644 --- a/enc/unicode.c +++ b/enc/unicode.c @@ -2,7 +2,7 @@ unicode.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2013 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -71,8 +71,6 @@ static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable[256] = { 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2 }; -#include "enc/unicode/name2ctype.h" - typedef struct { int n; OnigCodePoint code[3]; @@ -103,10 +101,81 @@ typedef struct { CodePointList2 to; } CaseUnfold_13_Type; -#include "enc/unicode/casefold.h" +static inline int +bits_of(const OnigCodePoint c, const int n) +{ + return (c >> (2 - n) * 7) & 127; +} +static inline int +bits_at(const OnigCodePoint *c, const int n) +{ + return bits_of(c[n / 3], n % 3); +} + +static int +code1_equal(const OnigCodePoint x, const OnigCodePoint y) +{ + if (x != y) return 0; + return 1; +} + +static int +code2_equal(const OnigCodePoint *x, const OnigCodePoint *y) +{ + if (x[0] != y[0]) return 0; + if (x[1] != y[1]) return 0; + return 1; +} + +static int +code3_equal(const OnigCodePoint *x, const OnigCodePoint *y) +{ + if (x[0] != y[0]) return 0; + if (x[1] != y[1]) return 0; + if (x[2] != y[2]) return 0; + return 1; +} + +/* macros related to ONIGENC_CASE flags */ +/* defined here because not used in other files */ +#define ONIGENC_CASE_SPECIALS (ONIGENC_CASE_TITLECASE | ONIGENC_CASE_IS_TITLECASE | ONIGENC_CASE_UP_SPECIAL | ONIGENC_CASE_DOWN_SPECIAL) + +/* macros for length in CaseMappingSpecials array in enc/unicode/casefold.h */ +#define SpecialsLengthOffset 25 /* needs to be higher than the 22 bits used for Unicode codepoints */ +#define SpecialsLengthExtract(n) ((n) >> SpecialsLengthOffset) 
+#define SpecialsCodepointExtract(n) ((n) & ((1 << SpecialsLengthOffset) - 1)) +#define SpecialsLengthEncode(n) ((n) << SpecialsLengthOffset) + +#define OnigSpecialIndexMask (((1 << OnigSpecialIndexWidth) - 1) << OnigSpecialIndexShift) +#define OnigSpecialIndexEncode(n) ((n) << OnigSpecialIndexShift) +#define OnigSpecialIndexDecode(n) (((n) & OnigSpecialIndexMask) >> OnigSpecialIndexShift) + +/* macros to shorten "enc/unicode/casefold.h", undefined immediately after including the file */ +#define U ONIGENC_CASE_UPCASE +#define D ONIGENC_CASE_DOWNCASE +#define F ONIGENC_CASE_FOLD +#define ST ONIGENC_CASE_TITLECASE +#define SU ONIGENC_CASE_UP_SPECIAL +#define SL ONIGENC_CASE_DOWN_SPECIAL +#define IT ONIGENC_CASE_IS_TITLECASE +#define I(n) OnigSpecialIndexEncode(n) +#define L(n) SpecialsLengthEncode(n) + +#include "casefold.h" + +#undef U +#undef D +#undef F +#undef ST +#undef SU +#undef SL +#undef IT +#undef I +#undef L + +#include "name2ctype.h" -#define numberof(array) (int)(sizeof(array) / sizeof((array)[0])) #define CODE_RANGES_NUM numberof(CodeRanges) extern int @@ -143,23 +212,21 @@ onigenc_unicode_ctype_code_range(int ctype, const OnigCodePoint* ranges[]) extern int onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out, const OnigCodePoint* ranges[], - struct OnigEncodingTypeST* enc ARG_UNUSED) + OnigEncoding enc ARG_UNUSED) { *sb_out = 0x00; return onigenc_unicode_ctype_code_range(ctype, ranges); } -#include "ruby/st.h" - #define PROPERTY_NAME_MAX_SIZE (MAX_WORD_LENGTH + 1) extern int -onigenc_unicode_property_name_to_ctype(OnigEncoding enc, UChar* name, UChar* end) +onigenc_unicode_property_name_to_ctype(OnigEncoding enc, const UChar* name, const UChar* end) { int len; int ctype; UChar buf[PROPERTY_NAME_MAX_SIZE]; - UChar *p; + const UChar *p; OnigCodePoint code; len = 0; @@ -184,137 +251,44 @@ onigenc_unicode_property_name_to_ctype(OnigEncoding enc, UChar* name, UChar* end return ctype; } +#define onigenc_unicode_fold_lookup 
onigenc_unicode_CaseFold_11_lookup +#define onigenc_unicode_unfold1_lookup onigenc_unicode_CaseUnfold_11_lookup +#define onigenc_unicode_unfold2_lookup onigenc_unicode_CaseUnfold_12_lookup +#define onigenc_unicode_unfold3_lookup onigenc_unicode_CaseUnfold_13_lookup -static int -code2_cmp(OnigCodePoint* x, OnigCodePoint* y) -{ - if (x[0] == y[0] && x[1] == y[1]) return 0; - return 1; -} - -static st_index_t -code2_hash(OnigCodePoint* x) -{ - return (st_index_t )(x[0] + x[1]); -} - -static const struct st_hash_type type_code2_hash = { - code2_cmp, - code2_hash, -}; - -static int -code3_cmp(OnigCodePoint* x, OnigCodePoint* y) -{ - if (x[0] == y[0] && x[1] == y[1] && x[2] == y[2]) return 0; - return 1; -} - -static st_index_t -code3_hash(OnigCodePoint* x) -{ - return (st_index_t )(x[0] + x[1] + x[2]); -} - -static const struct st_hash_type type_code3_hash = { - code3_cmp, - code3_hash, +enum { + I_WITH_DOT_ABOVE = 0x0130, + DOTLESS_i = 0x0131, + DOT_ABOVE = 0x0307 }; - -static st_table* FoldTable; /* fold-1, fold-2, fold-3 */ -static st_table* Unfold1Table; -static st_table* Unfold2Table; -static st_table* Unfold3Table; -static int CaseFoldInited = 0; - -static int init_case_fold_table(void) -{ - const CaseFold_11_Type *p; - const CaseUnfold_11_Type *p1; - const CaseUnfold_12_Type *p2; - const CaseUnfold_13_Type *p3; - int i; - - THREAD_ATOMIC_START; - - FoldTable = st_init_numtable_with_size(FOLD_TABLE_SIZE); - if (ONIG_IS_NULL(FoldTable)) return ONIGERR_MEMORY; - for (i = 0; i < numberof(CaseFold); i++) { - p = &CaseFold[i]; - st_add_direct(FoldTable, (st_data_t )p->from, (st_data_t )&(p->to)); - } - for (i = 0; i < numberof(CaseFold_Locale); i++) { - p = &CaseFold_Locale[i]; - st_add_direct(FoldTable, (st_data_t )p->from, (st_data_t )&(p->to)); - } - - Unfold1Table = st_init_numtable_with_size(UNFOLD1_TABLE_SIZE); - if (ONIG_IS_NULL(Unfold1Table)) return ONIGERR_MEMORY; - - for (i = 0; i < numberof(CaseUnfold_11); i++) { - p1 = &CaseUnfold_11[i]; - 
st_add_direct(Unfold1Table, (st_data_t )p1->from, (st_data_t )&(p1->to)); - } - for (i = 0; i < numberof(CaseUnfold_11_Locale); i++) { - p1 = &CaseUnfold_11_Locale[i]; - st_add_direct(Unfold1Table, (st_data_t )p1->from, (st_data_t )&(p1->to)); - } - - Unfold2Table = st_init_table_with_size(&type_code2_hash, UNFOLD2_TABLE_SIZE); - if (ONIG_IS_NULL(Unfold2Table)) return ONIGERR_MEMORY; - - for (i = 0; i < numberof(CaseUnfold_12); i++) { - p2 = &CaseUnfold_12[i]; - st_add_direct(Unfold2Table, (st_data_t )p2->from, (st_data_t )(&p2->to)); - } - for (i = 0; i < numberof(CaseUnfold_12_Locale); i++) { - p2 = &CaseUnfold_12_Locale[i]; - st_add_direct(Unfold2Table, (st_data_t )p2->from, (st_data_t )(&p2->to)); - } - - Unfold3Table = st_init_table_with_size(&type_code3_hash, UNFOLD3_TABLE_SIZE); - if (ONIG_IS_NULL(Unfold3Table)) return ONIGERR_MEMORY; - - for (i = 0; i < numberof(CaseUnfold_13); i++) { - p3 = &CaseUnfold_13[i]; - st_add_direct(Unfold3Table, (st_data_t )p3->from, (st_data_t )(&p3->to)); - } - - CaseFoldInited = 1; - THREAD_ATOMIC_END; - return 0; -} - extern int onigenc_unicode_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED, const UChar** pp, const UChar* end, UChar* fold) { - CodePointList3 *to; + const CodePointList3 *to; OnigCodePoint code; int i, len, rlen; const UChar *p = *pp; - if (CaseFoldInited == 0) init_case_fold_table(); - code = ONIGENC_MBC_TO_CODE(enc, p, end); len = enclen(enc, p, end); *pp += len; #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { - if (code == 0x0049) { - return ONIGENC_CODE_TO_MBC(enc, 0x0131, fold); + if (code == 'I') { + return ONIGENC_CODE_TO_MBC(enc, DOTLESS_i, fold); } - else if (code == 0x0130) { - return ONIGENC_CODE_TO_MBC(enc, 0x0069, fold); + else if (code == I_WITH_DOT_ABOVE) { + return ONIGENC_CODE_TO_MBC(enc, 'i', fold); } } #endif - if (onig_st_lookup(FoldTable, (st_data_t )code, (void* )&to) != 0) { - if (to->n == 1) { + if ((to = 
onigenc_unicode_fold_lookup(code)) != 0) { + if (OnigCodePointCount(to->n) == 1) { return ONIGENC_CODE_TO_MBC(enc, to->code[0], fold); } #if 0 @@ -325,7 +299,7 @@ onigenc_unicode_mbc_case_fold(OnigEncoding enc, #endif { rlen = 0; - for (i = 0; i < to->n; i++) { + for (i = 0; i < OnigCodePointCount(to->n); i++) { len = ONIGENC_CODE_TO_MBC(enc, to->code[i], fold); fold += len; rlen += len; @@ -349,11 +323,9 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, OnigCodePoint code; int i, j, k, r; - /* if (CaseFoldInited == 0) init_case_fold_table(); */ - for (i = 0; i < numberof(CaseUnfold_11); i++) { p11 = &CaseUnfold_11[i]; - for (j = 0; j < p11->to.n; j++) { + for (j = 0; j < OnigCodePointCount(p11->to.n); j++) { code = p11->from; r = (*f)(p11->to.code[j], &code, 1, arg); if (r != 0) return r; @@ -374,25 +346,25 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { - code = 0x0131; - r = (*f)(0x0049, &code, 1, arg); + code = DOTLESS_i; + r = (*f)('I', &code, 1, arg); if (r != 0) return r; - code = 0x0049; - r = (*f)(0x0131, &code, 1, arg); + code = 'I'; + r = (*f)(DOTLESS_i, &code, 1, arg); if (r != 0) return r; - code = 0x0130; - r = (*f)(0x0069, &code, 1, arg); + code = I_WITH_DOT_ABOVE; + r = (*f)('i', &code, 1, arg); if (r != 0) return r; - code = 0x0069; - r = (*f)(0x0130, &code, 1, arg); + code = 'i'; + r = (*f)(I_WITH_DOT_ABOVE, &code, 1, arg); if (r != 0) return r; } else { #endif for (i = 0; i < numberof(CaseUnfold_11_Locale); i++) { p11 = &CaseUnfold_11_Locale[i]; - for (j = 0; j < p11->to.n; j++) { + for (j = 0; j < OnigCodePointCount(p11->to.n); j++) { code = p11->from; r = (*f)(p11->to.code[j], &code, 1, arg); if (r != 0) return r; @@ -418,12 +390,12 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { for (i = 0; i < numberof(CaseUnfold_12); i++) { - for (j = 0; j < 
CaseUnfold_12[i].to.n; j++) { + for (j = 0; j < OnigCodePointCount(CaseUnfold_12[i].to.n); j++) { r = (*f)(CaseUnfold_12[i].to.code[j], (OnigCodePoint* )CaseUnfold_12[i].from, 2, arg); if (r != 0) return r; - for (k = 0; k < CaseUnfold_12[i].to.n; k++) { + for (k = 0; k < OnigCodePointCount(CaseUnfold_12[i].to.n); k++) { if (k == j) continue; r = (*f)(CaseUnfold_12[i].to.code[j], @@ -437,12 +409,12 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) == 0) { #endif for (i = 0; i < numberof(CaseUnfold_12_Locale); i++) { - for (j = 0; j < CaseUnfold_12_Locale[i].to.n; j++) { + for (j = 0; j < OnigCodePointCount(CaseUnfold_12_Locale[i].to.n); j++) { r = (*f)(CaseUnfold_12_Locale[i].to.code[j], (OnigCodePoint* )CaseUnfold_12_Locale[i].from, 2, arg); if (r != 0) return r; - for (k = 0; k < CaseUnfold_12_Locale[i].to.n; k++) { + for (k = 0; k < OnigCodePointCount(CaseUnfold_12_Locale[i].to.n); k++) { if (k == j) continue; r = (*f)(CaseUnfold_12_Locale[i].to.code[j], @@ -457,12 +429,12 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, #endif for (i = 0; i < numberof(CaseUnfold_13); i++) { - for (j = 0; j < CaseUnfold_13[i].to.n; j++) { + for (j = 0; j < OnigCodePointCount(CaseUnfold_13[i].to.n); j++) { r = (*f)(CaseUnfold_13[i].to.code[j], (OnigCodePoint* )CaseUnfold_13[i].from, 3, arg); if (r != 0) return r; - for (k = 0; k < CaseUnfold_13[i].to.n; k++) { + for (k = 0; k < OnigCodePointCount(CaseUnfold_13[i].to.n); k++) { if (k == j) continue; r = (*f)(CaseUnfold_13[i].to.code[j], @@ -476,6 +448,8 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, return 0; } +#define CodePointListValidP(x) (OnigCodePointCount((x)->n) <= numberof((x)->code)) + extern int onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, @@ -483,10 +457,8 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, { int n, i, j, k, len; OnigCodePoint 
code, codes[3]; - CodePointList3 *to, *z3; - CodePointList2 *z2; - - if (CaseFoldInited == 0) init_case_fold_table(); + const CodePointList3 *to, *z3; + const CodePointList2 *z2; n = 0; @@ -495,35 +467,37 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { - if (code == 0x0049) { + switch (code) { + case 'I': items[0].byte_len = len; items[0].code_len = 1; - items[0].code[0] = 0x0131; + items[0].code[0] = DOTLESS_i; return 1; - } - else if (code == 0x0130) { + case I_WITH_DOT_ABOVE: items[0].byte_len = len; items[0].code_len = 1; - items[0].code[0] = 0x0069; + items[0].code[0] = 'i'; return 1; - } - else if (code == 0x0131) { + case DOTLESS_i: items[0].byte_len = len; items[0].code_len = 1; - items[0].code[0] = 0x0049; + items[0].code[0] = 'I'; return 1; - } - else if (code == 0x0069) { + case 'i': items[0].byte_len = len; items[0].code_len = 1; - items[0].code[0] = 0x0130; + items[0].code[0] = I_WITH_DOT_ABOVE; return 1; } } #endif - if (onig_st_lookup(FoldTable, (st_data_t )code, (void* )&to) != 0) { - if (to->n == 1) { + if ((to = onigenc_unicode_fold_lookup(code)) != 0) { + if (OnigCodePointCount(to->n) == 0) { + /* any codepoint should not be empty */ + UNREACHABLE_RETURN(0); + } + if (OnigCodePointCount(to->n) == 1) { OnigCodePoint orig_code = code; items[0].byte_len = len; @@ -532,8 +506,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, n++; code = to->code[0]; - if (onig_st_lookup(Unfold1Table, (st_data_t )code, (void* )&to) != 0) { - for (i = 0; i < to->n; i++) { + if ((to = onigenc_unicode_unfold1_lookup(code)) != 0 && + CodePointListValidP(to)) { + for (i = 0; i < OnigCodePointCount(to->n); i++) { if (to->code[i] != orig_code) { items[n].byte_len = len; items[n].code_len = 1; @@ -547,14 +522,13 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, OnigCodePoint cs[3][4]; int fn, ncs[3]; - for (fn = 0; fn < to->n; fn++) { + 
for (fn = 0; fn < OnigCodePointCount(to->n); fn++) { cs[fn][0] = to->code[fn]; - if (onig_st_lookup(Unfold1Table, (st_data_t )cs[fn][0], - (void* )&z3) != 0) { - for (i = 0; i < z3->n; i++) { + if ((z3 = onigenc_unicode_unfold1_lookup(cs[fn][0])) != 0) { + for (i = 0; i < OnigCodePointCount(z3->n); i++) { cs[fn][i+1] = z3->code[i]; } - ncs[fn] = z3->n + 1; + ncs[fn] = OnigCodePointCount(z3->n) + 1; } else ncs[fn] = 1; @@ -571,9 +545,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, } } - if (onig_st_lookup(Unfold2Table, (st_data_t )to->code, - (void* )&z2) != 0) { - for (i = 0; i < z2->n; i++) { + if ((z2 = onigenc_unicode_unfold2_lookup(to->code)) != 0 && + CodePointListValidP(z2)) { + for (i = 0; i < OnigCodePointCount(z2->n); i++) { if (z2->code[i] == code) continue; items[n].byte_len = len; @@ -597,9 +571,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, } } - if (onig_st_lookup(Unfold3Table, (st_data_t )to->code, - (void* )&z2) != 0) { - for (i = 0; i < z2->n; i++) { + if ((z2 = onigenc_unicode_unfold3_lookup(to->code)) != 0 && + CodePointListValidP(z2)) { + for (i = 0; i < OnigCodePointCount(z2->n); i++) { if (z2->code[i] == code) continue; items[n].byte_len = len; @@ -615,8 +589,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, } } else { - if (onig_st_lookup(Unfold1Table, (st_data_t )code, (void* )&to) != 0) { - for (i = 0; i < to->n; i++) { + if ((to = onigenc_unicode_unfold1_lookup(code)) != 0 && + CodePointListValidP(to)) { + for (i = 0; i < OnigCodePointCount(to->n); i++) { items[n].byte_len = len; items[n].code_len = 1; items[n].code[0] = to->code[i]; @@ -633,8 +608,8 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, codes[0] = code; code = ONIGENC_MBC_TO_CODE(enc, p, end); - if (onig_st_lookup(FoldTable, (st_data_t )code, (void* )&to) != 0 - && to->n == 1) { + if ((to = onigenc_unicode_fold_lookup(code)) != 0 + && OnigCodePointCount(to->n) == 1) { codes[1] = to->code[0]; } else @@ -642,8 
+617,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, clen = enclen(enc, p, end); len += clen; - if (onig_st_lookup(Unfold2Table, (st_data_t )codes, (void* )&z2) != 0) { - for (i = 0; i < z2->n; i++) { + if ((z2 = onigenc_unicode_unfold2_lookup(codes)) != 0 && + CodePointListValidP(z2)) { + for (i = 0; i < OnigCodePointCount(z2->n); i++) { items[n].byte_len = len; items[n].code_len = 1; items[n].code[0] = z2->code[i]; @@ -654,8 +630,8 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, p += clen; if (p < end) { code = ONIGENC_MBC_TO_CODE(enc, p, end); - if (onig_st_lookup(FoldTable, (st_data_t )code, (void* )&to) != 0 - && to->n == 1) { + if ((to = onigenc_unicode_fold_lookup(code)) != 0 + && OnigCodePointCount(to->n) == 1) { codes[2] = to->code[0]; } else @@ -663,9 +639,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, clen = enclen(enc, p, end); len += clen; - if (onig_st_lookup(Unfold3Table, (st_data_t )codes, - (void* )&z2) != 0) { - for (i = 0; i < z2->n; i++) { + if ((z2 = onigenc_unicode_unfold3_lookup(codes)) != 0 && + CodePointListValidP(z2)) { + for (i = 0; i < OnigCodePointCount(z2->n); i++) { items[n].byte_len = len; items[n].code_len = 1; items[n].code[0] = z2->code[i]; @@ -678,3 +654,165 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, return n; } + +#ifdef USE_CASE_MAP_API +/* length in bytes for three characters in UTF-32; e.g. 
needed for ffi (U+FB03) */ +#define CASE_MAPPING_SLACK 12 +#define MODIFIED (flags |= ONIGENC_CASE_MODIFIED) +extern int +onigenc_unicode_case_map(OnigCaseFoldType* flagP, + const OnigUChar** pp, const OnigUChar* end, + OnigUChar* to, OnigUChar* to_end, + const struct OnigEncodingTypeST* enc) +{ + OnigCodePoint code; + OnigUChar *to_start = to; + OnigCaseFoldType flags = *flagP; + int codepoint_length; + + to_end -= CASE_MAPPING_SLACK; + /* copy flags ONIGENC_CASE_UPCASE and ONIGENC_CASE_DOWNCASE over to + * ONIGENC_CASE_UP_SPECIAL and ONIGENC_CASE_DOWN_SPECIAL */ + flags |= (flags & (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) << ONIGENC_CASE_SPECIAL_OFFSET; + + while (*pp < end && to <= to_end) { + codepoint_length = ONIGENC_PRECISE_MBC_ENC_LEN(enc, *pp, end); + if (codepoint_length < 0) + return codepoint_length; /* encoding invalid */ + code = ONIGENC_MBC_TO_CODE(enc, *pp, end); + *pp += codepoint_length; + + if (code <= 'z') { /* ASCII comes first */ + if (code >= 'a' /*&& code <= 'z'*/) { + if (flags & ONIGENC_CASE_UPCASE) { + MODIFIED; + if (flags & ONIGENC_CASE_FOLD_TURKISH_AZERI && code == 'i') + code = I_WITH_DOT_ABOVE; + else + code -= 'a' - 'A'; + } + } + else if (code >= 'A' && code <= 'Z') { + if (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD)) { + MODIFIED; + if (flags & ONIGENC_CASE_FOLD_TURKISH_AZERI && code == 'I') + code = DOTLESS_i; + else + code += 'a' - 'A'; + } + } + } + else if (!(flags & ONIGENC_CASE_ASCII_ONLY) && code >= 0x00B5) { /* deal with non-ASCII; micron sign (U+00B5) is lowest affected */ + const CodePointList3 *folded; + + if (code == I_WITH_DOT_ABOVE) { + if (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD)) { + MODIFIED; + code = 'i'; + if (!(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI)) { /* make dot above explicit */ + to += ONIGENC_CODE_TO_MBC(enc, code, to); + code = DOT_ABOVE; + } + } + } + else if (code == DOTLESS_i) { /* handle this manually, because it isn't involved in folding */ + if (flags & 
ONIGENC_CASE_UPCASE) { + MODIFIED; + code = 'I'; + } + } + else if ((folded = onigenc_unicode_fold_lookup(code)) != 0) { /* data about character found in CaseFold_11_Table */ + if ((flags & ONIGENC_CASE_TITLECASE) && code>=0x1C90 && code<=0x1CBF) { /* Georgian MTAVRULI */ + MODIFIED; + code += 0x10D0 - 0x1C90; + } + else if ((flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, */ + && (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE)) { /* but already Titlecase */ + /* already Titlecase, no changes needed */ + } + else if (flags & OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */ + const OnigCodePoint *next; + int count; + + MODIFIED; + if (flags & OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_SPECIALS) { /* special */ + const OnigCodePoint *SpecialsStart = CaseMappingSpecials + OnigSpecialIndexDecode(folded->n); + + if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE) { /* swapCASE available */ + if ((flags & (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) + == (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) /* swapCASE needed */ + goto SpecialsCopy; + else /* swapCASE not needed */ + SpecialsStart += SpecialsLengthExtract(*SpecialsStart); + } + if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_TITLECASE) { /* Titlecase available */ + if (flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, but not yet Titlecase */ + goto SpecialsCopy; + else /* Titlecase not needed */ + SpecialsStart += SpecialsLengthExtract(*SpecialsStart); + } + if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_DOWN_SPECIAL) { + if (!(flags & ONIGENC_CASE_DOWN_SPECIAL)) + SpecialsStart += SpecialsLengthExtract(*SpecialsStart); + } + /* here, we know we use ONIGENC_CASE_UP_SPECIAL, and the position is right */ +SpecialsCopy: + count = SpecialsLengthExtract(*SpecialsStart); + next = SpecialsStart; + code = SpecialsCodepointExtract(*next++); + } + else { /* no specials */ + count = OnigCodePointCount(folded->n); + next = folded->code; + code = *next++; + } + if 
(count == 1) + ; + else if (count == 2) { + to += ONIGENC_CODE_TO_MBC(enc, code, to); + code = *next; + } + else { /* count == 3 */ + to += ONIGENC_CODE_TO_MBC(enc, code, to); + to += ONIGENC_CODE_TO_MBC(enc, *next++, to); + code = *next; + } + } + } + else if ((folded = onigenc_unicode_unfold1_lookup(code)) != 0) { /* data about character found in CaseUnfold_11_Table */ + if ((flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, */ + && (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE)) { /* but already Titlecase */ + /* already Titlecase, no changes needed */ + } + else if (flags & OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */ + MODIFIED; + code = folded->code[(flags & OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_TITLECASE) ? 1 : 0]; + } + } + } + to += ONIGENC_CODE_TO_MBC(enc, code, to); + /* switch from titlecase to lowercase for capitalize */ + if (flags & ONIGENC_CASE_TITLECASE) + flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE | + ONIGENC_CASE_UP_SPECIAL | ONIGENC_CASE_DOWN_SPECIAL); + } + *flagP = flags; + return (int )(to - to_start); +} +#endif + +const char onigenc_unicode_version_string[] = +#ifdef ONIG_UNICODE_VERSION_STRING + ONIG_UNICODE_VERSION_STRING +#endif + ""; + +const int onigenc_unicode_version_number[3] = { +#ifdef ONIG_UNICODE_VERSION_MAJOR + ONIG_UNICODE_VERSION_MAJOR, + ONIG_UNICODE_VERSION_MINOR, + ONIG_UNICODE_VERSION_TEENY, +#else + 0 +#endif +}; |
