summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: duerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2018-12-09 23:14:29 +0000
committer: duerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2018-12-09 23:14:29 +0000
commit: 3628eae2e754a7489feebc6f41371d42d2efcf3c (patch)
tree: af9f1aea84a5c0d0a6d8cc48b67409324220676d
parent: 596d7bdec4c389090afcddbd448282ec4bcfdf86 (diff)
implement special behavior for Georgian for String#capitalize
The modern Georgian script is special in that it has an 'uppercase' variant called MTAVRULI, which can be used for emphasis of whole words, for screamy headlines, and so on. However, in contrast to all other bicameral scripts, there is no practice of capitalizing only the first letter of a word or a sentence. Words with mixed capitalization are not used at all.

We therefore implement special behavior for String#capitalize. Formally, we define String#capitalize as first applying String#downcase to the whole string, then applying titlecase to the first letter. Because Georgian defines titlecase as the identity function both for MTAVRULI ('uppercase') and Mkhedruli (lowercase), this results in String#capitalize being equivalent to String#downcase for Georgian. This avoids undesirable mixed case.

* enc/unicode.c: Actual implementation.
* string.c: Add mention of this special case to the documentation.
* test/ruby/enc/test_case_mapping.rb: Add two tests: a general one that uses String#capitalize on some (including nonsensical) combinations of MTAVRULI and Mkhedruli, and a canary test to detect the potential assignment of characters to the currently open slots (holes) at U+1CBB and U+1CBC.
* test/ruby/enc/test_case_comprehensive.rb: Tweak generation of expectation data.

Together with r65933, this closes issue #14839.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@66300 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r--  enc/unicode.c                             6
-rw-r--r--  string.c                                  2
-rw-r--r--  test/ruby/enc/test_case_comprehensive.rb  6
-rw-r--r--  test/ruby/enc/test_case_mapping.rb        17
4 files changed, 29 insertions, 2 deletions
diff --git a/enc/unicode.c b/enc/unicode.c
index b3dbd55..6e8c3d8 100644
--- a/enc/unicode.c
+++ b/enc/unicode.c
@@ -719,7 +719,11 @@ onigenc_unicode_case_map(OnigCaseFoldType* flagP,
}
}
else if ((folded = onigenc_unicode_fold_lookup(code)) != 0) { /* data about character found in CaseFold_11_Table */
- if ((flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, */
+ if ((flags & ONIGENC_CASE_TITLECASE) && code>=0x1C90 && code<=0x1CBF) { /* Georgian MTAVRULI */
+ MODIFIED;
+ code += 0x10D0 - 0x1C90;
+ }
+ else if ((flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, */
&& (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE)) { /* but already Titlecase */
/* already Titlecase, no changes needed */
}
diff --git a/string.c b/string.c
index 55a3043..4fb2c04 100644
--- a/string.c
+++ b/string.c
@@ -6727,6 +6727,8 @@ rb_str_downcase(int argc, VALUE *argv, VALUE str)
*
* Modifies <i>str</i> by converting the first character to uppercase and the
* remainder to lowercase. Returns <code>nil</code> if no changes are made.
+ * There is an exception for modern Georgian (mkhedruli/MTAVRULI), where
+ * the result is the same as for String#downcase, to avoid mixed case.
*
* See String#downcase for meaning of +options+ and use with different encodings.
*
diff --git a/test/ruby/enc/test_case_comprehensive.rb b/test/ruby/enc/test_case_comprehensive.rb
index cd6447e..bde4701 100644
--- a/test/ruby/enc/test_case_comprehensive.rb
+++ b/test/ruby/enc/test_case_comprehensive.rb
@@ -73,7 +73,11 @@ TestComprehensiveCaseMapping.data_files_available? and class TestComprehensiveC
@@codepoints << code
upcase[code] = hex2utf8 data[12] unless data[12].empty?
downcase[code] = hex2utf8 data[13] unless data[13].empty?
- titlecase[code] = hex2utf8 data[14] unless data[14].empty?
+ if code>="\u1C90" and code<="\u1CBF" # exception for Georgian: use lowercase for titlecase
+ titlecase[code] = hex2utf8(data[13]) unless data[13].empty?
+ else
+ titlecase[code] = hex2utf8 data[14] unless data[14].empty?
+ end
end
read_data_file('CaseFolding') do |code, data|
casefold[code] = hex2utf8(data[2]) if data[1] =~ /^[CF]$/
diff --git a/test/ruby/enc/test_case_mapping.rb b/test/ruby/enc/test_case_mapping.rb
index d095cd5..984fd5d 100644
--- a/test/ruby/enc/test_case_mapping.rb
+++ b/test/ruby/enc/test_case_mapping.rb
@@ -187,6 +187,23 @@ class TestCaseMappingPreliminary < Test::Unit::TestCase
assert_equal 0, "\ua64A" =~ /\uA64B/i
end
+ def test_georgian_canary
+ message = "Reexamine implementation of Georgian in String#capitalize"
+ assert_equal false, "\u1CBB".match?(/\p{assigned}/), message
+ assert_equal false, "\u1CBC".match?(/\p{assigned}/), message
+ end
+
+ def test_georgian_capitalize
+ assert_equal "\u10D0\u10D1\u10D2", "\u1C90\u1C91\u1C92".capitalize
+ assert_equal "\u10D0\u10D1\u10D2", "\u1C90\u1C91\u10D2".capitalize
+ assert_equal "\u10D0\u10D1\u10D2", "\u1C90\u10D1\u1C92".capitalize
+ assert_equal "\u10D0\u10D1\u10D2", "\u1C90\u10D1\u10D2".capitalize
+ assert_equal "\u10D0\u10D1\u10D2", "\u10D0\u1C91\u1C92".capitalize
+ assert_equal "\u10D0\u10D1\u10D2", "\u10D0\u1C91\u10D2".capitalize
+ assert_equal "\u10D0\u10D1\u10D2", "\u10D0\u10D1\u1C92".capitalize
+ assert_equal "\u10D0\u10D1\u10D2", "\u10D0\u10D1\u10D2".capitalize
+ end
+
def no_longer_a_test_buffer_allocations
assert_equal 'TURKISH*ı'*10, ('I'*10).downcase(:turkic)
assert_equal 'TURKISH*ı'*100, ('I'*100).downcase(:turkic)