diff options
Diffstat (limited to 'enc/unicode.c')
| -rw-r--r-- | enc/unicode.c | 274 |
1 file changed, 140 insertions, 134 deletions
diff --git a/enc/unicode.c b/enc/unicode.c index e72b2e64b2..5bc806863e 100644 --- a/enc/unicode.c +++ b/enc/unicode.c @@ -139,17 +139,17 @@ code3_equal(const OnigCodePoint *x, const OnigCodePoint *y) /* macros related to ONIGENC_CASE flags */ /* defined here because not used in other files */ -#define ONIGENC_CASE_SPECIALS (ONIGENC_CASE_TITLECASE|ONIGENC_CASE_IS_TITLECASE|ONIGENC_CASE_UP_SPECIAL|ONIGENC_CASE_DOWN_SPECIAL) +#define ONIGENC_CASE_SPECIALS (ONIGENC_CASE_TITLECASE | ONIGENC_CASE_IS_TITLECASE | ONIGENC_CASE_UP_SPECIAL | ONIGENC_CASE_DOWN_SPECIAL) /* macros for length in CaseMappingSpecials array in enc/unicode/casefold.h */ #define SpecialsLengthOffset 25 /* needs to be higher than the 22 bits used for Unicode codepoints */ -#define SpecialsLengthExtract(n) ((n)>>SpecialsLengthOffset) -#define SpecialsCodepointExtract(n) ((n)&((1<<SpecialsLengthOffset)-1)) -#define SpecialsLengthEncode(n) ((n)<<SpecialsLengthOffset) +#define SpecialsLengthExtract(n) ((n) >> SpecialsLengthOffset) +#define SpecialsCodepointExtract(n) ((n) & ((1 << SpecialsLengthOffset) - 1)) +#define SpecialsLengthEncode(n) ((n) << SpecialsLengthOffset) -#define OnigSpecialIndexMask (((1<<OnigSpecialIndexWidth)-1)<<OnigSpecialIndexShift) -#define OnigSpecialIndexEncode(n) ((n)<<OnigSpecialIndexShift) -#define OnigSpecialIndexDecode(n) (((n)&OnigSpecialIndexMask)>>OnigSpecialIndexShift) +#define OnigSpecialIndexMask (((1 << OnigSpecialIndexWidth) - 1) << OnigSpecialIndexShift) +#define OnigSpecialIndexEncode(n) ((n) << OnigSpecialIndexShift) +#define OnigSpecialIndexDecode(n) (((n) & OnigSpecialIndexMask) >> OnigSpecialIndexShift) /* macros to shorten "enc/unicode/casefold.h", undefined immediately after including the file */ #define U ONIGENC_CASE_UPCASE @@ -493,6 +493,10 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, #endif if ((to = onigenc_unicode_fold_lookup(code)) != 0) { + if (OnigCodePointCount(to->n) == 0) { + /* any codepoint should not be empty */ + 
UNREACHABLE_RETURN(0); + } if (OnigCodePointCount(to->n) == 1) { OnigCodePoint orig_code = code; @@ -651,6 +655,7 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, return n; } +#ifdef USE_CASE_MAP_API /* length in bytes for three characters in UTF-32; e.g. needed for ffi (U+FB03) */ #define CASE_MAPPING_SLACK 12 #define MODIFIED (flags |= ONIGENC_CASE_MODIFIED) @@ -660,140 +665,142 @@ onigenc_unicode_case_map(OnigCaseFoldType* flagP, OnigUChar* to, OnigUChar* to_end, const struct OnigEncodingTypeST* enc) { - OnigCodePoint code; - OnigUChar *to_start = to; - OnigCaseFoldType flags = *flagP; - int codepoint_length; - - to_end -= CASE_MAPPING_SLACK; - /* copy flags ONIGENC_CASE_UPCASE and ONIGENC_CASE_DOWNCASE over to - * ONIGENC_CASE_UP_SPECIAL and ONIGENC_CASE_DOWN_SPECIAL */ - flags |= (flags&(ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE))<<ONIGENC_CASE_SPECIAL_OFFSET; - - while (*pp<end && to<=to_end) { - codepoint_length = ONIGENC_PRECISE_MBC_ENC_LEN(enc, *pp, end); - if (codepoint_length < 0) - return codepoint_length; /* encoding invalid */ - code = ONIGENC_MBC_TO_CODE(enc, *pp, end); - *pp += codepoint_length; - - if (code<='z') { /* ASCII comes first */ - if (code>='a' && code<='z') { - if (flags&ONIGENC_CASE_UPCASE) { - MODIFIED; - if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI && code=='i') - code = I_WITH_DOT_ABOVE; - else - code += 'A'-'a'; - } - } - else if (code>='A' && code<='Z') { - if (flags&(ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD)) { - MODIFIED; - if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI && code=='I') - code = DOTLESS_i; - else - code += 'a'-'A'; - } - } + OnigCodePoint code; + OnigUChar *to_start = to; + OnigCaseFoldType flags = *flagP; + int codepoint_length; + + to_end -= CASE_MAPPING_SLACK; + /* copy flags ONIGENC_CASE_UPCASE and ONIGENC_CASE_DOWNCASE over to + * ONIGENC_CASE_UP_SPECIAL and ONIGENC_CASE_DOWN_SPECIAL */ + flags |= (flags & (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) << ONIGENC_CASE_SPECIAL_OFFSET; + + while (*pp < end 
&& to <= to_end) { + codepoint_length = ONIGENC_PRECISE_MBC_ENC_LEN(enc, *pp, end); + if (codepoint_length < 0) + return codepoint_length; /* encoding invalid */ + code = ONIGENC_MBC_TO_CODE(enc, *pp, end); + *pp += codepoint_length; + + if (code <= 'z') { /* ASCII comes first */ + if (code >= 'a' /*&& code <= 'z'*/) { + if (flags & ONIGENC_CASE_UPCASE) { + MODIFIED; + if (flags & ONIGENC_CASE_FOLD_TURKISH_AZERI && code == 'i') + code = I_WITH_DOT_ABOVE; + else + code -= 'a' - 'A'; } - else if (!(flags&ONIGENC_CASE_ASCII_ONLY) && code>=0x00B5) { /* deal with non-ASCII; micron sign (U+00B5) is lowest affected */ - const CodePointList3 *folded; - - if (code==I_WITH_DOT_ABOVE) { - if (flags&(ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD)) { - MODIFIED; - code = 'i'; - if (!(flags&ONIGENC_CASE_FOLD_TURKISH_AZERI)) { /* make dot above explicit */ - to += ONIGENC_CODE_TO_MBC(enc, code, to); - code = DOT_ABOVE; - } - } - } - else if (code==DOTLESS_i) { /* handle this manually, because it isn't involved in folding */ - if (flags&ONIGENC_CASE_UPCASE) - MODIFIED, code = 'I'; + } + else if (code >= 'A' && code <= 'Z') { + if (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD)) { + MODIFIED; + if (flags & ONIGENC_CASE_FOLD_TURKISH_AZERI && code == 'I') + code = DOTLESS_i; + else + code += 'a' - 'A'; + } + } + } + else if (!(flags & ONIGENC_CASE_ASCII_ONLY) && code >= 0x00B5) { /* deal with non-ASCII; micron sign (U+00B5) is lowest affected */ + const CodePointList3 *folded; + + if (code == I_WITH_DOT_ABOVE) { + if (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD)) { + MODIFIED; + code = 'i'; + if (!(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI)) { /* make dot above explicit */ + to += ONIGENC_CODE_TO_MBC(enc, code, to); + code = DOT_ABOVE; + } + } + } + else if (code == DOTLESS_i) { /* handle this manually, because it isn't involved in folding */ + if (flags & ONIGENC_CASE_UPCASE) { + MODIFIED; + code = 'I'; + } + } + else if ((folded = onigenc_unicode_fold_lookup(code)) != 0) { /* 
data about character found in CaseFold_11_Table */ + if ((flags & ONIGENC_CASE_TITLECASE) && code>=0x1C90 && code<=0x1CBF) { /* Georgian MTAVRULI */ + MODIFIED; + code += 0x10D0 - 0x1C90; + } + else if ((flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, */ + && (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE)) { /* but already Titlecase */ + /* already Titlecase, no changes needed */ + } + else if (flags & OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */ + const OnigCodePoint *next; + int count; + + MODIFIED; + if (flags & OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_SPECIALS) { /* special */ + const OnigCodePoint *SpecialsStart = CaseMappingSpecials + OnigSpecialIndexDecode(folded->n); + + if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE) { /* swapCASE available */ + if ((flags & (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) + == (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) /* swapCASE needed */ + goto SpecialsCopy; + else /* swapCASE not needed */ + SpecialsStart += SpecialsLengthExtract(*SpecialsStart); } - else if ((folded = onigenc_unicode_fold_lookup(code)) != 0) { /* data about character found in CaseFold_11_Table */ - if ((flags&ONIGENC_CASE_TITLECASE) /* Titlecase needed, */ - && (OnigCaseFoldFlags(folded->n)&ONIGENC_CASE_IS_TITLECASE)) { /* but already Titlecase */ - /* already Titlecase, no changes needed */ - } - else if (flags&OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */ - const OnigCodePoint *next; - int count; - - MODIFIED; - if (flags&OnigCaseFoldFlags(folded->n)&ONIGENC_CASE_SPECIALS) { /* special */ - OnigCodePoint *SpecialsStart = CaseMappingSpecials + OnigSpecialIndexDecode(folded->n); - - if (OnigCaseFoldFlags(folded->n)&ONIGENC_CASE_IS_TITLECASE) { /* swapCASE available */ - if ((flags&(ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) - == (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) /* swapCASE needed */ - goto SpecialsCopy; - else /* swapCASE not needed */ - 
SpecialsStart += SpecialsLengthExtract(*SpecialsStart); - } - if (OnigCaseFoldFlags(folded->n)&ONIGENC_CASE_TITLECASE) { /* Titlecase available */ - if (flags&ONIGENC_CASE_TITLECASE) /* Titlecase needed, but not yet Titlecase */ - goto SpecialsCopy; - else /* Titlecase not needed */ - SpecialsStart += SpecialsLengthExtract(*SpecialsStart); - } - if (OnigCaseFoldFlags(folded->n)&ONIGENC_CASE_DOWN_SPECIAL) { - if (!(flags&ONIGENC_CASE_DOWN_SPECIAL)) - SpecialsStart += SpecialsLengthExtract(*SpecialsStart); - } - /* here, we know we use ONIGENC_CASE_UP_SPECIAL, and the position is right */ - SpecialsCopy: - count = SpecialsLengthExtract(*SpecialsStart); - next = SpecialsStart; - code = SpecialsCodepointExtract(*next++); - } - else { /* no specials */ - count = OnigCodePointCount(folded->n); - next = folded->code; - code = *next++; - } - if (count==1) - ; - else if (count==2) { - to += ONIGENC_CODE_TO_MBC(enc, code, to); - code = *next; - } - else { /* count == 3 */ - to += ONIGENC_CODE_TO_MBC(enc, code, to); - to += ONIGENC_CODE_TO_MBC(enc, *next++, to); - code = *next; - } - } + if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_TITLECASE) { /* Titlecase available */ + if (flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, but not yet Titlecase */ + goto SpecialsCopy; + else /* Titlecase not needed */ + SpecialsStart += SpecialsLengthExtract(*SpecialsStart); } - else if ((folded = onigenc_unicode_unfold1_lookup(code)) != 0) { /* data about character found in CaseUnfold_11_Table */ - if (flags&OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */ - MODIFIED; - if (flags&OnigCaseFoldFlags(folded->n)&ONIGENC_CASE_TITLECASE) - code = folded->code[1]; - else - code = folded->code[0]; - } - else if ((flags&(ONIGENC_CASE_UPCASE)) - && (code==0x03B9||code==0x03BC)) { /* GREEK SMALL LETTERs IOTA/MU */ - MODIFIED; - code = folded->code[1]; - } + if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_DOWN_SPECIAL) { + if (!(flags & ONIGENC_CASE_DOWN_SPECIAL)) + 
SpecialsStart += SpecialsLengthExtract(*SpecialsStart); } + /* here, we know we use ONIGENC_CASE_UP_SPECIAL, and the position is right */ +SpecialsCopy: + count = SpecialsLengthExtract(*SpecialsStart); + next = SpecialsStart; + code = SpecialsCodepointExtract(*next++); + } + else { /* no specials */ + count = OnigCodePointCount(folded->n); + next = folded->code; + code = *next++; + } + if (count == 1) + ; + else if (count == 2) { + to += ONIGENC_CODE_TO_MBC(enc, code, to); + code = *next; + } + else { /* count == 3 */ + to += ONIGENC_CODE_TO_MBC(enc, code, to); + to += ONIGENC_CODE_TO_MBC(enc, *next++, to); + code = *next; + } + } + } + else if ((folded = onigenc_unicode_unfold1_lookup(code)) != 0) { /* data about character found in CaseUnfold_11_Table */ + if ((flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, */ + && (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE)) { /* but already Titlecase */ + /* already Titlecase, no changes needed */ } - to += ONIGENC_CODE_TO_MBC(enc, code, to); - /* switch from titlecase to lowercase for capitalize */ - if (flags & ONIGENC_CASE_TITLECASE) - flags ^= (ONIGENC_CASE_UPCASE |ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE| - ONIGENC_CASE_UP_SPECIAL|ONIGENC_CASE_DOWN_SPECIAL); + else if (flags & OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */ + MODIFIED; + code = folded->code[(flags & OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_TITLECASE) ? 
1 : 0]; + } + } } - *flagP = flags; - return (int)(to-to_start); + to += ONIGENC_CODE_TO_MBC(enc, code, to); + /* switch from titlecase to lowercase for capitalize */ + if (flags & ONIGENC_CASE_TITLECASE) + flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE | + ONIGENC_CASE_UP_SPECIAL | ONIGENC_CASE_DOWN_SPECIAL); + } + *flagP = flags; + return (int )(to - to_start); } +#endif -#if 0 const char onigenc_unicode_version_string[] = #ifdef ONIG_UNICODE_VERSION_STRING ONIG_UNICODE_VERSION_STRING @@ -809,4 +816,3 @@ const int onigenc_unicode_version_number[3] = { 0 #endif }; -#endif |
