* enc/unicode.c: Removed artificial expansion for Turkic, added hand-coded support for Turkic, fixed logic for swapcase.
* string.c: Made use of new case mapping code possible from upcase, capitalize, and swapcase (with :lithuanian as a guard).
* test/ruby/enc/test_case_mapping.rb: Adjusted for above.
(with Kimihito Matsui)

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53562 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
author: duerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2016-01-17 08:42:16 +0000
committer: duerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2016-01-17 08:42:16 +0000
commit: 959bbb6f7202676f2da1ef5e134e6152e8613b54 (patch)
tree: 4a37adcb5edd3d2cc60a0e9a87ac107614babd79 /enc/unicode.c
parent: 0bc53416909fe4470b9cac34072b0b3c555218a3 (diff)
1 files changed, 41 insertions, 20 deletions
diff --git a/enc/unicode.c b/enc/unicode.c
index e877c99925..e61611801c 100644
--- a/enc/unicode.c
+++ b/enc/unicode.c
@@ -606,9 +606,7 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
 
 /* length in bytes for three characters in UTF-32; e.g. needed for ffi (U+FB03) */
 #define CASE_MAPPING_SLACK 12
-/* The following declaration should be moved to an include file rather than
-   be duplicated here (and in string.c), but we'll wait for this because we
-   want this to become a primitive anyway. */
+#define MODIFIED (flags |= ONIGENC_CASE_MODIFIED)
 extern int
 onigenc_unicode_case_map(OnigCaseFoldType* flagP,
     const OnigUChar** pp, const OnigUChar* end,
@@ -620,29 +618,52 @@ onigenc_unicode_case_map(OnigCaseFoldType* flagP,
     OnigCaseFoldType flags = *flagP;
     to_end -= CASE_MAPPING_SLACK;
 
-    /* hopelessly preliminary implementation, just dealing with ASCII,
-     * and just for downcase */
+    /* hopelessly preliminary implementation, just dealing with ASCII and Turkic */
     while (*pp<end && to<=to_end) {
 	code = ONIGENC_MBC_TO_CODE(enc, *pp, end);
 	*pp += enclen(enc, *pp, end);
-	/* using :turcic to test buffer expansion */
-	if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI && code==0x0049) { /* I */
-	    to += ONIGENC_CODE_TO_MBC(enc, 'T', to);
-	    to += ONIGENC_CODE_TO_MBC(enc, 'U', to);
-	    to += ONIGENC_CODE_TO_MBC(enc, 'R', to);
-	    to += ONIGENC_CODE_TO_MBC(enc, 'K', to);
-	    to += ONIGENC_CODE_TO_MBC(enc, 'I', to);
-	    to += ONIGENC_CODE_TO_MBC(enc, 'S', to);
-	    to += ONIGENC_CODE_TO_MBC(enc, 'H', to);
-	    to += ONIGENC_CODE_TO_MBC(enc, '*', to);
-	    code = 0x0131;
-	    flags |= ONIGENC_CASE_MODIFIED;
+	if (code<='z') { /* ASCII comes first */
+	    if (code>='a' && code<='z') {
+	        if (flags&ONIGENC_CASE_UPCASE) {
+		    if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI && code==0x0069) /* i → İ */
+			code = 0x0130;
+		    else
+			code += 'A'-'a';
+		    MODIFIED;
+		}
+	    }
+	    else if (code>='A' && code<='Z') {
+		if (flags&ONIGENC_CASE_DOWNCASE) {
+		    if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI && code==0x0049) /* I → ı */
+			code = 0x0131;
+		    else
+			code += 'a'-'A';
+		    MODIFIED;
+		}
+	    }
 	}
-	else if (code>='A' && code<='Z') {
-	    code += 'a'-'A';
-	    flags |= ONIGENC_CASE_MODIFIED;
+	else if (code>=0x00C0) { /* deal with non-ASCII; nothing relevant below U+00C0 */
+	    if (code==0x0130) { /* İ → i */
+		if (flags&ONIGENC_CASE_UPCASE) {
+		    if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI)
+			code = 0x0069;
+		    else { /* make dot above explicit */
+			to += ONIGENC_CODE_TO_MBC(enc, 0x0069, to);
+			code = 0x0307; /* dot above */
+		    }
+		    MODIFIED;
+		}
+	    }
+	    /* the following case can be removed once we rely on data,
+	     * because the mapping is always the same */
+	    else if (code==0x0131 && flags&ONIGENC_CASE_UPCASE) { /* ı → I */
+		code = 0x0049; MODIFIED;
+	    }
 	}
 	to += ONIGENC_CODE_TO_MBC(enc, code, to);
+	/* switch from titlecase to lowercase for capitalize */
+	if (flags & ONIGENC_CASE_TITLECASE)
+	    flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_TITLECASE|ONIGENC_CASE_DOWNCASE);
     }
     *flagP = flags;
     return (int)(to-to_start);
author	duerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>	2016-01-17 08:42:16 +0000
committer	duerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>	2016-01-17 08:42:16 +0000
commit	959bbb6f7202676f2da1ef5e134e6152e8613b54 (patch)
tree	4a37adcb5edd3d2cc60a0e9a87ac107614babd79 /enc/unicode.c
parent	0bc53416909fe4470b9cac34072b0b3c555218a3 (diff)