summaryrefslogtreecommitdiff
path: root/enc/unicode.c
diff options
context:
space:
mode:
Diffstat (limited to 'enc/unicode.c')
-rw-r--r--enc/unicode.c274
1 files changed, 140 insertions, 134 deletions
diff --git a/enc/unicode.c b/enc/unicode.c
index e72b2e64b2..5bc806863e 100644
--- a/enc/unicode.c
+++ b/enc/unicode.c
@@ -139,17 +139,17 @@ code3_equal(const OnigCodePoint *x, const OnigCodePoint *y)
/* macros related to ONIGENC_CASE flags */
/* defined here because not used in other files */
-#define ONIGENC_CASE_SPECIALS (ONIGENC_CASE_TITLECASE|ONIGENC_CASE_IS_TITLECASE|ONIGENC_CASE_UP_SPECIAL|ONIGENC_CASE_DOWN_SPECIAL)
+#define ONIGENC_CASE_SPECIALS (ONIGENC_CASE_TITLECASE | ONIGENC_CASE_IS_TITLECASE | ONIGENC_CASE_UP_SPECIAL | ONIGENC_CASE_DOWN_SPECIAL)
/* macros for length in CaseMappingSpecials array in enc/unicode/casefold.h */
#define SpecialsLengthOffset 25 /* needs to be higher than the 22 bits used for Unicode codepoints */
-#define SpecialsLengthExtract(n) ((n)>>SpecialsLengthOffset)
-#define SpecialsCodepointExtract(n) ((n)&((1<<SpecialsLengthOffset)-1))
-#define SpecialsLengthEncode(n) ((n)<<SpecialsLengthOffset)
+#define SpecialsLengthExtract(n) ((n) >> SpecialsLengthOffset)
+#define SpecialsCodepointExtract(n) ((n) & ((1 << SpecialsLengthOffset) - 1))
+#define SpecialsLengthEncode(n) ((n) << SpecialsLengthOffset)
-#define OnigSpecialIndexMask (((1<<OnigSpecialIndexWidth)-1)<<OnigSpecialIndexShift)
-#define OnigSpecialIndexEncode(n) ((n)<<OnigSpecialIndexShift)
-#define OnigSpecialIndexDecode(n) (((n)&OnigSpecialIndexMask)>>OnigSpecialIndexShift)
+#define OnigSpecialIndexMask (((1 << OnigSpecialIndexWidth) - 1) << OnigSpecialIndexShift)
+#define OnigSpecialIndexEncode(n) ((n) << OnigSpecialIndexShift)
+#define OnigSpecialIndexDecode(n) (((n) & OnigSpecialIndexMask) >> OnigSpecialIndexShift)
/* macros to shorten "enc/unicode/casefold.h", undefined immediately after including the file */
#define U ONIGENC_CASE_UPCASE
@@ -493,6 +493,10 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
#endif
if ((to = onigenc_unicode_fold_lookup(code)) != 0) {
+ if (OnigCodePointCount(to->n) == 0) {
+ /* any codepoint should not be empty */
+ UNREACHABLE_RETURN(0);
+ }
if (OnigCodePointCount(to->n) == 1) {
OnigCodePoint orig_code = code;
@@ -651,6 +655,7 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
return n;
}
+#ifdef USE_CASE_MAP_API
/* length in bytes for three characters in UTF-32; e.g. needed for ffi (U+FB03) */
#define CASE_MAPPING_SLACK 12
#define MODIFIED (flags |= ONIGENC_CASE_MODIFIED)
@@ -660,140 +665,142 @@ onigenc_unicode_case_map(OnigCaseFoldType* flagP,
OnigUChar* to, OnigUChar* to_end,
const struct OnigEncodingTypeST* enc)
{
- OnigCodePoint code;
- OnigUChar *to_start = to;
- OnigCaseFoldType flags = *flagP;
- int codepoint_length;
-
- to_end -= CASE_MAPPING_SLACK;
- /* copy flags ONIGENC_CASE_UPCASE and ONIGENC_CASE_DOWNCASE over to
- * ONIGENC_CASE_UP_SPECIAL and ONIGENC_CASE_DOWN_SPECIAL */
- flags |= (flags&(ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE))<<ONIGENC_CASE_SPECIAL_OFFSET;
-
- while (*pp<end && to<=to_end) {
- codepoint_length = ONIGENC_PRECISE_MBC_ENC_LEN(enc, *pp, end);
- if (codepoint_length < 0)
- return codepoint_length; /* encoding invalid */
- code = ONIGENC_MBC_TO_CODE(enc, *pp, end);
- *pp += codepoint_length;
-
- if (code<='z') { /* ASCII comes first */
- if (code>='a' && code<='z') {
- if (flags&ONIGENC_CASE_UPCASE) {
- MODIFIED;
- if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI && code=='i')
- code = I_WITH_DOT_ABOVE;
- else
- code += 'A'-'a';
- }
- }
- else if (code>='A' && code<='Z') {
- if (flags&(ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD)) {
- MODIFIED;
- if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI && code=='I')
- code = DOTLESS_i;
- else
- code += 'a'-'A';
- }
- }
+ OnigCodePoint code;
+ OnigUChar *to_start = to;
+ OnigCaseFoldType flags = *flagP;
+ int codepoint_length;
+
+ to_end -= CASE_MAPPING_SLACK;
+ /* copy flags ONIGENC_CASE_UPCASE and ONIGENC_CASE_DOWNCASE over to
+ * ONIGENC_CASE_UP_SPECIAL and ONIGENC_CASE_DOWN_SPECIAL */
+ flags |= (flags & (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) << ONIGENC_CASE_SPECIAL_OFFSET;
+
+ while (*pp < end && to <= to_end) {
+ codepoint_length = ONIGENC_PRECISE_MBC_ENC_LEN(enc, *pp, end);
+ if (codepoint_length < 0)
+ return codepoint_length; /* encoding invalid */
+ code = ONIGENC_MBC_TO_CODE(enc, *pp, end);
+ *pp += codepoint_length;
+
+ if (code <= 'z') { /* ASCII comes first */
+ if (code >= 'a' /*&& code <= 'z'*/) {
+ if (flags & ONIGENC_CASE_UPCASE) {
+ MODIFIED;
+ if (flags & ONIGENC_CASE_FOLD_TURKISH_AZERI && code == 'i')
+ code = I_WITH_DOT_ABOVE;
+ else
+ code -= 'a' - 'A';
}
- else if (!(flags&ONIGENC_CASE_ASCII_ONLY) && code>=0x00B5) { /* deal with non-ASCII; micron sign (U+00B5) is lowest affected */
- const CodePointList3 *folded;
-
- if (code==I_WITH_DOT_ABOVE) {
- if (flags&(ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD)) {
- MODIFIED;
- code = 'i';
- if (!(flags&ONIGENC_CASE_FOLD_TURKISH_AZERI)) { /* make dot above explicit */
- to += ONIGENC_CODE_TO_MBC(enc, code, to);
- code = DOT_ABOVE;
- }
- }
- }
- else if (code==DOTLESS_i) { /* handle this manually, because it isn't involved in folding */
- if (flags&ONIGENC_CASE_UPCASE)
- MODIFIED, code = 'I';
+ }
+ else if (code >= 'A' && code <= 'Z') {
+ if (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD)) {
+ MODIFIED;
+ if (flags & ONIGENC_CASE_FOLD_TURKISH_AZERI && code == 'I')
+ code = DOTLESS_i;
+ else
+ code += 'a' - 'A';
+ }
+ }
+ }
+ else if (!(flags & ONIGENC_CASE_ASCII_ONLY) && code >= 0x00B5) { /* deal with non-ASCII; micron sign (U+00B5) is lowest affected */
+ const CodePointList3 *folded;
+
+ if (code == I_WITH_DOT_ABOVE) {
+ if (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD)) {
+ MODIFIED;
+ code = 'i';
+ if (!(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI)) { /* make dot above explicit */
+ to += ONIGENC_CODE_TO_MBC(enc, code, to);
+ code = DOT_ABOVE;
+ }
+ }
+ }
+ else if (code == DOTLESS_i) { /* handle this manually, because it isn't involved in folding */
+ if (flags & ONIGENC_CASE_UPCASE) {
+ MODIFIED;
+ code = 'I';
+ }
+ }
+ else if ((folded = onigenc_unicode_fold_lookup(code)) != 0) { /* data about character found in CaseFold_11_Table */
+ if ((flags & ONIGENC_CASE_TITLECASE) && code>=0x1C90 && code<=0x1CBF) { /* Georgian MTAVRULI */
+ MODIFIED;
+ code += 0x10D0 - 0x1C90;
+ }
+ else if ((flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, */
+ && (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE)) { /* but already Titlecase */
+ /* already Titlecase, no changes needed */
+ }
+ else if (flags & OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */
+ const OnigCodePoint *next;
+ int count;
+
+ MODIFIED;
+ if (flags & OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_SPECIALS) { /* special */
+ const OnigCodePoint *SpecialsStart = CaseMappingSpecials + OnigSpecialIndexDecode(folded->n);
+
+ if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE) { /* swapCASE available */
+ if ((flags & (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE))
+ == (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) /* swapCASE needed */
+ goto SpecialsCopy;
+ else /* swapCASE not needed */
+ SpecialsStart += SpecialsLengthExtract(*SpecialsStart);
}
- else if ((folded = onigenc_unicode_fold_lookup(code)) != 0) { /* data about character found in CaseFold_11_Table */
- if ((flags&ONIGENC_CASE_TITLECASE) /* Titlecase needed, */
- && (OnigCaseFoldFlags(folded->n)&ONIGENC_CASE_IS_TITLECASE)) { /* but already Titlecase */
- /* already Titlecase, no changes needed */
- }
- else if (flags&OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */
- const OnigCodePoint *next;
- int count;
-
- MODIFIED;
- if (flags&OnigCaseFoldFlags(folded->n)&ONIGENC_CASE_SPECIALS) { /* special */
- OnigCodePoint *SpecialsStart = CaseMappingSpecials + OnigSpecialIndexDecode(folded->n);
-
- if (OnigCaseFoldFlags(folded->n)&ONIGENC_CASE_IS_TITLECASE) { /* swapCASE available */
- if ((flags&(ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE))
- == (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) /* swapCASE needed */
- goto SpecialsCopy;
- else /* swapCASE not needed */
- SpecialsStart += SpecialsLengthExtract(*SpecialsStart);
- }
- if (OnigCaseFoldFlags(folded->n)&ONIGENC_CASE_TITLECASE) { /* Titlecase available */
- if (flags&ONIGENC_CASE_TITLECASE) /* Titlecase needed, but not yet Titlecase */
- goto SpecialsCopy;
- else /* Titlecase not needed */
- SpecialsStart += SpecialsLengthExtract(*SpecialsStart);
- }
- if (OnigCaseFoldFlags(folded->n)&ONIGENC_CASE_DOWN_SPECIAL) {
- if (!(flags&ONIGENC_CASE_DOWN_SPECIAL))
- SpecialsStart += SpecialsLengthExtract(*SpecialsStart);
- }
- /* here, we know we use ONIGENC_CASE_UP_SPECIAL, and the position is right */
- SpecialsCopy:
- count = SpecialsLengthExtract(*SpecialsStart);
- next = SpecialsStart;
- code = SpecialsCodepointExtract(*next++);
- }
- else { /* no specials */
- count = OnigCodePointCount(folded->n);
- next = folded->code;
- code = *next++;
- }
- if (count==1)
- ;
- else if (count==2) {
- to += ONIGENC_CODE_TO_MBC(enc, code, to);
- code = *next;
- }
- else { /* count == 3 */
- to += ONIGENC_CODE_TO_MBC(enc, code, to);
- to += ONIGENC_CODE_TO_MBC(enc, *next++, to);
- code = *next;
- }
- }
+ if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_TITLECASE) { /* Titlecase available */
+ if (flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, but not yet Titlecase */
+ goto SpecialsCopy;
+ else /* Titlecase not needed */
+ SpecialsStart += SpecialsLengthExtract(*SpecialsStart);
}
- else if ((folded = onigenc_unicode_unfold1_lookup(code)) != 0) { /* data about character found in CaseUnfold_11_Table */
- if (flags&OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */
- MODIFIED;
- if (flags&OnigCaseFoldFlags(folded->n)&ONIGENC_CASE_TITLECASE)
- code = folded->code[1];
- else
- code = folded->code[0];
- }
- else if ((flags&(ONIGENC_CASE_UPCASE))
- && (code==0x03B9||code==0x03BC)) { /* GREEK SMALL LETTERs IOTA/MU */
- MODIFIED;
- code = folded->code[1];
- }
+ if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_DOWN_SPECIAL) {
+ if (!(flags & ONIGENC_CASE_DOWN_SPECIAL))
+ SpecialsStart += SpecialsLengthExtract(*SpecialsStart);
}
+ /* here, we know we use ONIGENC_CASE_UP_SPECIAL, and the position is right */
+SpecialsCopy:
+ count = SpecialsLengthExtract(*SpecialsStart);
+ next = SpecialsStart;
+ code = SpecialsCodepointExtract(*next++);
+ }
+ else { /* no specials */
+ count = OnigCodePointCount(folded->n);
+ next = folded->code;
+ code = *next++;
+ }
+ if (count == 1)
+ ;
+ else if (count == 2) {
+ to += ONIGENC_CODE_TO_MBC(enc, code, to);
+ code = *next;
+ }
+ else { /* count == 3 */
+ to += ONIGENC_CODE_TO_MBC(enc, code, to);
+ to += ONIGENC_CODE_TO_MBC(enc, *next++, to);
+ code = *next;
+ }
+ }
+ }
+ else if ((folded = onigenc_unicode_unfold1_lookup(code)) != 0) { /* data about character found in CaseUnfold_11_Table */
+ if ((flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, */
+ && (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE)) { /* but already Titlecase */
+ /* already Titlecase, no changes needed */
}
- to += ONIGENC_CODE_TO_MBC(enc, code, to);
- /* switch from titlecase to lowercase for capitalize */
- if (flags & ONIGENC_CASE_TITLECASE)
- flags ^= (ONIGENC_CASE_UPCASE |ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE|
- ONIGENC_CASE_UP_SPECIAL|ONIGENC_CASE_DOWN_SPECIAL);
+ else if (flags & OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */
+ MODIFIED;
+ code = folded->code[(flags & OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_TITLECASE) ? 1 : 0];
+ }
+ }
}
- *flagP = flags;
- return (int)(to-to_start);
+ to += ONIGENC_CODE_TO_MBC(enc, code, to);
+ /* switch from titlecase to lowercase for capitalize */
+ if (flags & ONIGENC_CASE_TITLECASE)
+ flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE |
+ ONIGENC_CASE_UP_SPECIAL | ONIGENC_CASE_DOWN_SPECIAL);
+ }
+ *flagP = flags;
+ return (int )(to - to_start);
}
+#endif
-#if 0
const char onigenc_unicode_version_string[] =
#ifdef ONIG_UNICODE_VERSION_STRING
ONIG_UNICODE_VERSION_STRING
@@ -809,4 +816,3 @@ const int onigenc_unicode_version_number[3] = {
0
#endif
};
-#endif