From 66a6073859ac6ae2143a9d72162efedece7e1348 Mon Sep 17 00:00:00 2001 From: duerst Date: Wed, 5 Dec 2018 08:10:24 +0000 Subject: update to Unicode 11.0.0 (main step, not complete yet) - common.mk: Change Unicode version to 11.0.0, and Emoji version to 11.0 - test/ruby/enc/test_emoji_breaks.rb: update hard-coded Emoji version - enc/unicode/11.0.0, enc/unicode/11.0.0/casefold.h, enc/unicode/name2ctype.h: Add generated files. Files for Unicode 10.0.0 will be removed once we are sure 11.0.0 works. - lib/unicode_normalize/tables.rb: Updated table. - regparse.c: Almost completely reimplement grapheme cluster detection in function node_extended_grapheme_cluster(). git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@66213 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- regparse.c | 483 ++++++++++++++++++++++--------------------------------------- 1 file changed, 174 insertions(+), 309 deletions(-) (limited to 'regparse.c') diff --git a/regparse.c b/regparse.c index 77876b0023..8bf122b995 100644 --- a/regparse.c +++ b/regparse.c @@ -5831,6 +5831,7 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env) Node* list2 = NULL; Node* alt = NULL; Node* alt2 = NULL; + Node* top_alt = NULL; BBuf *pbuf1 = NULL; int r = 0; int num1; @@ -5845,9 +5846,9 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env) if (ONIGENC_IS_UNICODE(env->enc)) { /* UTF-8, UTF-16BE/LE, UTF-32BE/LE */ CClassNode* cc; - OnigCodePoint sb_out = (ONIGENC_MBC_MINLEN(env->enc) > 1) ? 0x00 : 0x80; - Node **seq = node_array; /* seq[5] */ - Node **alts = node_array+5; /* alts[4] */ + /* OnigCodePoint sb_out = (ONIGENC_MBC_MINLEN(env->enc) > 1) ? 0x00 : 0x80; */ + /* Node **seq = node_array; * seq[5] */ + /* Node **alts = node_array+5; * alts[4] */ for (i=0; imbuf), env, 0x200D, 0x200D)); - - R_ERR(quantify_node(&np1, 0, REPEAT_INFINITE)); - - tmp = node_new_list(np1, NULL_NODE); - if (IS_NULL(tmp)) goto err; - list = tmp; - np1 = NULL; - - /* Unicode 10.0.0 */ - /* ( RI-sequence | Hangul-Syllable | !Control ) */ - /* !Control */ - np1 = node_new_cclass(); - if (IS_NULL(np1)) goto err; - cc = NCCLASS(np1); - R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 1, env)); - if (! (ONIGENC_MBC_MINLEN(env->enc) > 1)) { - BITSET_CLEAR_BIT(cc->bs, 0x0a); - BITSET_CLEAR_BIT(cc->bs, 0x0d); - } - - tmp = onig_node_new_alt(np1, NULL_NODE); - if (IS_NULL(tmp)) goto err; - alt = tmp; - np1 = NULL; - - /* Unicode 10.0.0 */ - /* Hangul-Syllable - * := L* V+ T* - * | L* LV V* T* - * | L* LVT T* - * | L+ - * | T+ */ - /* Unicode 11.0.0 */ - /* Hangul-Syllable - * := L* (V+ | LV V* | LVT) T* - * | L+ - * | T+ */ - /* these are equivalent, so we leave things as is for the moment */ - - /* T+ */ - R_ERR(quantify_property_node(&np1, env, "Grapheme_Cluster_Break=T", '+')); - - tmp = onig_node_new_alt(np1, alt); - if (IS_NULL(tmp)) goto err; - alt = tmp; - np1 = NULL; - - /* L+ */ - R_ERR(quantify_property_node(&np1, env, "Grapheme_Cluster_Break=L", '+')); - - tmp = onig_node_new_alt(np1, alt); - if (IS_NULL(tmp)) goto err; - alt = tmp; - np1 = NULL; - - /* L* LVT T* */ - { - R_ERR(quantify_property_node(seq+0, env, "Grapheme_Cluster_Break=L", '*')); - R_ERR(create_property_node(seq+1, env, "Grapheme_Cluster_Break=LVT")); - R_ERR(quantify_property_node(seq+2, env, "Grapheme_Cluster_Break=T", '*')); - - R_ERR(create_sequence_node(&list2, seq)); - } - - tmp = onig_node_new_alt(list2, alt); - if (IS_NULL(tmp)) goto err; - alt = tmp; - list2 = NULL; - - /* L* LV V* T* */ - { - R_ERR(quantify_property_node(seq+0, env, "Grapheme_Cluster_Break=L", '*')); - R_ERR(create_property_node(seq+1, env, "Grapheme_Cluster_Break=LV")); - R_ERR(quantify_property_node(seq+2, env, "Grapheme_Cluster_Break=V", '*')); - R_ERR(quantify_property_node(seq+3, env, "Grapheme_Cluster_Break=T", '*')); - - R_ERR(create_sequence_node(&list2, seq)); - } - - tmp = onig_node_new_alt(list2, alt); - if (IS_NULL(tmp)) goto err; - alt = tmp; - list2 = NULL; - - /* L* V+ T* */ - { - R_ERR(quantify_property_node(seq+0, env, "Grapheme_Cluster_Break=L", '*')); - R_ERR(quantify_property_node(seq+1, env, "Grapheme_Cluster_Break=V", '+')); - R_ERR(quantify_property_node(seq+2, env, "Grapheme_Cluster_Break=T", '*')); - - R_ERR(create_sequence_node(&list2, seq)); - } - - tmp = onig_node_new_alt(list2, alt); - if (IS_NULL(tmp)) goto err; - alt = tmp; - list2 = NULL; - /* end of Hangul-Syllable */ - - /* Unicode 10.0.0 */ - /* Emoji sequence := (E_Base | EBG) Extend* E_Modifier? - * (ZWJ (Glue_After_Zwj | EBG Extend* E_Modifier?) )* */ - /* ZWJ (Glue_After_Zwj | E_Base_GAZ Extend* E_Modifier?) */ + /* Unicode 11.0.0 + * CRLF (this is added last because it is common with non-Unicode encodings) + * | [Control CR LF] + * | precore* core postcore* + * | . (to catch invalid stuff, because this seems to be spec for String#grapheme_clusters) */ { - /* Unicode 10.0.0 */ - /* Emoji variation sequence - * http://unicode.org/Public/emoji/4.0/emoji-zwj-sequences.txt - */ - /* Emoji U+FE0F */ - { - seq[0] = node_new_cclass(); - if (IS_NULL(seq[0])) goto err; - cc = NCCLASS(seq[0]); - R_ERR(add_ctype_to_cc_by_range(cc, -1, 0, env, sb_out, onigenc_unicode_GCB_ranges_Emoji)); + Node *alts[4]; - r = ONIGENC_CODE_TO_MBC(env->enc, 0xfe0f, buf); /* VARIATION SELECTOR-16 */ - if (r < 0) goto err; - seq[1] = node_new_str_raw(buf, buf + r); - if (IS_NULL(seq[1])) goto err; - R_ERR(quantify_node(seq+1, 0, 1)); - - R_ERR(create_sequence_node(alts+0, seq)); + /* [Control CR LF] (CR and LF are not in the spec, but this is a conformed fix) */ + alts[0] = node_new_cclass(); + if (IS_NULL(alts[0])) goto err; + cc = NCCLASS(alts[0]); + R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 0, env)); + if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */ + R_ERR(add_code_range(&(cc->mbuf), env, 0x000A, 0x000A)); /* CR */ + R_ERR(add_code_range(&(cc->mbuf), env, 0x000D, 0x000D)); /* LF */ + } + else { + BITSET_SET_BIT(cc->bs, 0x0a); + BITSET_SET_BIT(cc->bs, 0x0d); } - /* Unicode 10.0.0 */ - /* Glue_After_Zwj */ + /* precore* core postcore* */ { - seq[0] = node_new_cclass(); - if (IS_NULL(seq[0])) goto err; - cc = NCCLASS(seq[0]); - R_ERR(add_ctype_to_cc_by_range(cc, -1, 0, env, sb_out, onigenc_unicode_GCB_ranges_GAZ)); - R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Glue_After_Zwj", 0, env)); + Node *seq[4]; - R_ERR(quantify_property_node(seq+1, env, "Grapheme_Cluster_Break=Extend", '*')); + /* precore*; precore := Prepend */ + R_ERR(quantify_property_node(seq+0, env, "Grapheme_Cluster_Break=Prepend", '*')); + /* core := hangul-syllable + * | ri-sequence + * | xpicto-sequence + * | [^Control CR LF] */ + { + Node *core_alts[7]; + + /* hangul-syllable := + * L* (V+ | LV V* | LVT) T* + * | L+ + * | T+ */ + /* hangul-syllable is an alternative (would be called H_alt) + * inside an alternative, but we flatten it into core_alts */ + + /* L* (V+ | LV V* | LVT) T* */ + { + Node *H_seq[4]; + + R_ERR(quantify_property_node(H_seq+0, env, "Grapheme_Cluster_Break=L", '*')); + + /* V+ | LV V* | LVT */ + { + Node *H_alt2[4]; + + R_ERR(quantify_property_node(H_alt2+0, env, "Grapheme_Cluster_Break=V", '+')); + + /* LV V* */ + { + Node *H_seq2[3]; + + R_ERR(create_property_node(H_seq2+0, env, "Grapheme_Cluster_Break=LV")); + R_ERR(quantify_property_node(H_seq2+1, env, "Grapheme_Cluster_Break=V", '*')); + + H_seq2[2] = NULL_NODE; + R_ERR(create_sequence_node(H_alt2+1, H_seq2)); + } + + R_ERR(create_property_node(H_alt2+2, env, "Grapheme_Cluster_Break=LVT")); + + H_alt2[3] = NULL_NODE; + R_ERR(create_alternate_node(H_seq+1, H_alt2)); + } + + R_ERR(quantify_property_node(H_seq+2, env, "Grapheme_Cluster_Break=T", '*')); + + H_seq[3] = NULL_NODE; + R_ERR(create_sequence_node(core_alts+0, H_seq)); + } + /* end of L* (V+ | LV V* | LVT) T*, result is in core_alts[0] */ + + /* L+ */ + R_ERR(quantify_property_node(core_alts+1, env, "Grapheme_Cluster_Break=L", '+')); + + /* T+ */ + R_ERR(quantify_property_node(core_alts+2, env, "Grapheme_Cluster_Break=T", '+')); + /* end of hangul-syllable */ + + /* ri-sequence := RI RI */ + R_ERR(quantify_property_node(core_alts+3, env, "Regional_Indicator", '2')); + + /* xpicto-sequence := \p{Extended_Pictographic} (Extend* ZWJ \p{Extended_Pictographic})* */ + { + Node *XP_seq[3]; + + R_ERR(create_property_node(XP_seq+0, env, "Extended_Pictographic")); + + /* (Extend* ZWJ \p{Extended_Pictographic})* */ + { + Node *Ex_seq[4]; + + R_ERR(quantify_property_node(Ex_seq+0, env, "Grapheme_Cluster_Break=Extend", '*')); + + /* ZWJ (ZERO WIDTH JOINER) */ + r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf); + if (r < 0) goto err; + Ex_seq[1] = node_new_str_raw(buf, buf + r); + if (IS_NULL(Ex_seq[1])) goto err; + + R_ERR(create_property_node(Ex_seq+2, env, "Extended_Pictographic")); + + Ex_seq[3] = NULL_NODE; + R_ERR(create_sequence_node(XP_seq+1, Ex_seq)); + } + R_ERR(quantify_node(XP_seq+1, 0, REPEAT_INFINITE)); /* TODO: Check about node freeing */ + /* end of (Extend* ZWJ \p{Extended_Pictographic})* */ + + XP_seq[2] = NULL_NODE; + R_ERR(create_sequence_node(core_alts+4, XP_seq)); + } + /* end of xpicto-sequence, result is in core_alts[4] */ + + /* [^Control CR LF] */ + core_alts[5] = node_new_cclass(); + if (IS_NULL(core_alts[5])) goto err; + cc = NCCLASS(core_alts[5]); + if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */ + BBuf *inverted_buf = NULL; + + /* Start with a positive buffer and invert at the end, + * because otherwise adding single-character ranges works the wrong way. */ + R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 0, env)); + R_ERR(add_code_range(&(cc->mbuf), env, 0x000A, 0x000A)); /* CR */ + R_ERR(add_code_range(&(cc->mbuf), env, 0x000D, 0x000D)); /* LF */ + R_ERR(not_code_range_buf(env->enc, cc->mbuf, &inverted_buf, env)); + cc->mbuf = inverted_buf; /* TODO: check what to do with buffer before inversion */ + } + else { + R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 1, env)); + BITSET_CLEAR_BIT(cc->bs, 0x0a); + BITSET_CLEAR_BIT(cc->bs, 0x0d); + } + /* end of [^Control CR LF], result in core_alts[5] */ + + core_alts[6] = NULL_NODE; + R_ERR(create_alternate_node(seq+1, core_alts)); + } + /* end of core := hangul-syllable | ri-sequence | xpicto-sequence | [^Control CR LF], + * result is in seq[1] */ + + /* postcore*; postcore = [Extend ZWJ SpacingMark] */ + R_ERR(create_property_node(seq+2, env, "Grapheme_Cluster_Break=Extend")); + cc = NCCLASS(seq[2]); + R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=SpacingMark", 0, env)); + R_ERR(add_code_range(&(cc->mbuf), env, 0x200D, 0x200D)); + R_ERR(quantify_node(seq+2, 0, REPEAT_INFINITE)); + + seq[3] = NULL_NODE; R_ERR(create_sequence_node(alts+1, seq)); } + /* end of (precore* core postcore*), result is in alts[1] */ - /* E_Base_GAZ Extend* E_Modifier? */ - { - R_ERR(create_property_node(seq+0, env, "Grapheme_Cluster_Break=E_Base_GAZ")); - R_ERR(quantify_property_node(seq+1, env, "Grapheme_Cluster_Break=Extend", '*')); - R_ERR(quantify_property_node(seq+2, env, "Grapheme_Cluster_Break=E_Modifier", '?')); - - R_ERR(create_sequence_node(alts+2, seq)); - } - - R_ERR(create_alternate_node(&alt2, alts)); - } - - tmp = node_new_list(alt2, NULL_NODE); - if (IS_NULL(tmp)) goto err; - list2 = tmp; - alt2 = NULL; - - /* ZWJ */ - r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf); /* ZERO WIDTH JOINER (ZWJ) */ - if (r < 0) goto err; - np1 = node_new_str_raw(buf, buf + r); - if (IS_NULL(np1)) goto err; - - tmp = node_new_list(np1, list2); - if (IS_NULL(tmp)) goto err; - list2 = tmp; - np1 = NULL; - - R_ERR(quantify_node(&list2, 0, REPEAT_INFINITE)); - np1 = list2; - list2 = NULL; - - tmp = node_new_list(np1, NULL_NODE); - if (IS_NULL(tmp)) goto err; - list2 = tmp; - np1 = NULL; - - /* E_Modifier? */ - R_ERR(quantify_property_node(&np1, env, "Grapheme_Cluster_Break=E_Modifier", '?')); - - tmp = node_new_list(np1, list2); - if (IS_NULL(tmp)) goto err; - list2 = tmp; - np1 = NULL; - - /* Extend* */ - R_ERR(quantify_property_node(&np1, env, "Grapheme_Cluster_Break=Extend", '*')); - - tmp = node_new_list(np1, list2); - if (IS_NULL(tmp)) goto err; - list2 = tmp; - np1 = NULL; - - /* (E_Base | EBG) */ - np1 = node_new_cclass(); - if (IS_NULL(np1)) goto err; - cc = NCCLASS(np1); - R_ERR(add_ctype_to_cc_by_range(cc, -1, 0, env, sb_out, onigenc_unicode_GCB_ranges_E_Base)); - R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=E_Base", 0, env)); - R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=E_Base_GAZ", 0, env)); - - tmp = node_new_list(np1, list2); - if (IS_NULL(tmp)) goto err; - list2 = tmp; - np1 = NULL; - - tmp = onig_node_new_alt(list2, alt); - if (IS_NULL(tmp)) goto err; - alt = tmp; - list2 = NULL; - - /* Unicode 10.0.0 */ - /* a sequence starting with ZWJ seems artificial, but GraphemeBreakTest - * has such examples. - * http://www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakTest.html - */ - /* ZWJ (E_Base_GAZ | Glue_After_Zwj) E_Modifier? */ - { - r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf); /* ZERO WIDTH JOINER (ZWJ) */ - if (r < 0) goto err; - seq[0] = node_new_str_raw(buf, buf + r); - if (IS_NULL(seq[0])) goto err; - - seq[1] = node_new_cclass(); - if (IS_NULL(seq[1])) goto err; - cc = NCCLASS(seq[1]); - R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Glue_After_Zwj", 0, env)); - R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=E_Base_GAZ", 0, env)); - - R_ERR(quantify_property_node(seq+2, env, "Grapheme_Cluster_Break=E_Modifier", '?')); - - R_ERR(create_sequence_node(&list2, seq)); - } /* End of ZWJ (E_Base_GAZ | Glue_After_Zwj) E_Modifier? */ - - tmp = onig_node_new_alt(list2, alt); - if (IS_NULL(tmp)) goto err; - alt = tmp; - list2 = NULL; - - /* Unicode 10.0.0/11.0.0 */ - /* this is Regional_Indicator+ in the Unicode 10.0.0 regular expression, - * but the segmentation rules and Unicode 11.0.0 use Regional_Indicator{2}, so no need to fix */ - /* RI-Sequence := Regional_Indicator{2} */ - R_ERR(quantify_property_node(&np1, env, "Regional_Indicator", '2')); - - tmp = onig_node_new_alt(np1, alt); - if (IS_NULL(tmp)) goto err; - alt = tmp; - np1 = NULL; + /* PerlSyntax: (?s:.), RubySyntax: (?m:.) */ + /* Not in spec, but added to catch invalid stuff, + * because this is spec for String#grapheme_clusters. */ + np1 = node_new_anychar(); + if (IS_NULL(np1)) goto err; - tmp = node_new_list(alt, list); - if (IS_NULL(tmp)) goto err; - list = tmp; - alt = NULL; - - /* Prepend* */ - R_ERR(quantify_property_node(&np1, env, "Grapheme_Cluster_Break=Prepend", '*')); - - tmp = node_new_list(np1, list); - if (IS_NULL(tmp)) goto err; - list = tmp; - np1 = NULL; - - /* PerlSyntax: (?s:.), RubySyntax: (?m:.) */ - np1 = node_new_anychar(); - if (IS_NULL(np1)) goto err; - - option = env->option; - ONOFF(option, ONIG_OPTION_MULTILINE, 0); - tmp = node_new_option(option); - if (IS_NULL(tmp)) goto err; - NENCLOSE(tmp)->target = np1; - np1 = tmp; - - tmp = onig_node_new_alt(np1, NULL_NODE); - if (IS_NULL(tmp)) goto err; - alt = tmp; - np1 = NULL; - - /* Prepend+ ZWJ* */ - { - R_ERR(quantify_property_node(seq+0, env, "Grapheme_Cluster_Break=Prepend", '+')); + option = env->option; + ONOFF(option, ONIG_OPTION_MULTILINE, 0); + tmp = node_new_option(option); + if (IS_NULL(tmp)) goto err; + NENCLOSE(tmp)->target = np1; + alts[2] = tmp; - r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf); /* does this belong to Prepend?? */ - if (r < 0) goto err; - seq[1] = node_new_str_raw(buf, buf + r); - if (IS_NULL(seq[1])) goto err; - R_ERR(quantify_node(seq+1, 0, 1)); - - R_ERR(create_sequence_node(&list2, seq)); - } - - tmp = onig_node_new_alt(list2, alt); - if (IS_NULL(tmp)) goto err; - alt = tmp; - list2 = NULL; - - tmp = onig_node_new_alt(list, alt); - if (IS_NULL(tmp)) goto err; - alt = tmp; - list = NULL; + alts[3] = NULL_NODE; + R_ERR(create_alternate_node(&top_alt, alts)); + } /* end of (CRLF | Control | precore* core postcore*) (without CRLF!), result is in top_alt */ } else #endif /* USE_UNICODE_PROPERTIES */ @@ -6186,11 +6050,12 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env) NENCLOSE(tmp)->target = np1; np1 = tmp; - alt = onig_node_new_alt(np1, NULL_NODE); - if (IS_NULL(alt)) goto err; + top_alt = onig_node_new_alt(np1, NULL_NODE); + if (IS_NULL(top_alt)) goto err; np1 = NULL; } + /* add in CRLF to complete (CRLF | Control | precore* core postcore*) */ /* \x0D\x0A */ r = ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf); if (r < 0) goto err; @@ -6200,15 +6065,15 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env) np1 = node_new_str_raw(buf, buf + num1 + r); if (IS_NULL(np1)) goto err; - tmp = onig_node_new_alt(np1, alt); + tmp = onig_node_new_alt(np1, top_alt); if (IS_NULL(tmp)) goto err; - alt = tmp; + top_alt = tmp; np1 = NULL; /* (?>\x0D\x0A|...) */ tmp = node_new_enclose(ENCLOSE_STOP_BACKTRACK); if (IS_NULL(tmp)) goto err; - NENCLOSE(tmp)->target = alt; + NENCLOSE(tmp)->target = top_alt; np1 = tmp; #ifdef USE_UNICODE_PROPERTIES -- cgit v1.2.3