From c898839035b6fe74f8281fb918c13f856690bfde Mon Sep 17 00:00:00 2001 From: yugui Date: Fri, 27 Nov 2009 02:54:10 +0000 Subject: merges r24544 from trunk into ruby_1_9_1. -- \d, \s and \w are now non Unicode class. [ruby-dev:39026] * include/ruby/oniguruma.h (ONIGENC_CTYPE_SPECIAL_MASK): added. (ONIGENC_CTYPE_D): ditto. (ONIGENC_CTYPE_S): ditto. (ONIGENC_CTYPE_W): ditto. * regparse.c: \d, \s and \w are now non Unicode class. [ruby-dev:39026] (fetch_token_in_cc): use ONIGENC_CTYPE_[DSW] for \d/\s/\w. (fetch_token): ditto. (add_ctype_to_cc): add routines for ONIGENC_CTYPE_[DSW]. (parse_exp): ditto. * test/ruby/test_regexp.rb (TestRegexp#test_char_class): add tests for above. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/branches/ruby_1_9_1@25941 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- regparse.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 12 deletions(-) (limited to 'regparse.c') diff --git a/regparse.c b/regparse.c index 1fc0459fa6..75ad24feda 100644 --- a/regparse.c +++ b/regparse.c @@ -2946,32 +2946,32 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) switch (c) { case 'w': tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_WORD; + tok->u.prop.ctype = ONIGENC_CTYPE_W; tok->u.prop.not = 0; break; case 'W': tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_WORD; + tok->u.prop.ctype = ONIGENC_CTYPE_W; tok->u.prop.not = 1; break; case 'd': tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; + tok->u.prop.ctype = ONIGENC_CTYPE_D; tok->u.prop.not = 0; break; case 'D': tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; + tok->u.prop.ctype = ONIGENC_CTYPE_D; tok->u.prop.not = 1; break; case 's': tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; + tok->u.prop.ctype = ONIGENC_CTYPE_S; tok->u.prop.not = 0; break; case 'S': tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; + tok->u.prop.ctype = ONIGENC_CTYPE_S; tok->u.prop.not = 1; break; case 'h': @@ -3233,14 +3233,14 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case 'w': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break; tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_WORD; + tok->u.prop.ctype = ONIGENC_CTYPE_W; tok->u.prop.not = 0; break; case 'W': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break; tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_WORD; + tok->u.prop.ctype = ONIGENC_CTYPE_W; tok->u.prop.not = 1; break; @@ -3273,28 +3273,28 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case 's': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break; tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; + tok->u.prop.ctype = ONIGENC_CTYPE_S; tok->u.prop.not = 0; break; case 'S': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break; tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; + tok->u.prop.ctype = ONIGENC_CTYPE_S; tok->u.prop.not = 1; break; case 'd': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break; tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; + tok->u.prop.ctype = ONIGENC_CTYPE_D; tok->u.prop.not = 0; break; case 'D': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break; tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; + tok->u.prop.ctype = ONIGENC_CTYPE_D; tok->u.prop.not = 1; break; @@ -3835,6 +3835,28 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) OnigCodePoint sb_out; OnigEncoding enc = env->enc; + switch (ctype) { + case ONIGENC_CTYPE_D: + case ONIGENC_CTYPE_S: + case ONIGENC_CTYPE_W: + ctype ^= ONIGENC_CTYPE_SPECIAL_MASK; + if (not != 0) { + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (! ONIGENC_IS_ASCII_CODE_CTYPE((OnigCodePoint )c, ctype)) + BITSET_SET_BIT(cc->bs, c); + } + ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); + } + else { + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (ONIGENC_IS_ASCII_CODE_CTYPE((OnigCodePoint )c, ctype)) + BITSET_SET_BIT(cc->bs, c); + } + } + return 0; + break; + } + r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges); if (r == 0) { return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sb_out, ranges); @@ -5186,6 +5208,19 @@ parse_exp(Node** np, OnigToken* tok, int term, case TK_CHAR_TYPE: { switch (tok->u.prop.ctype) { + case ONIGENC_CTYPE_D: + case ONIGENC_CTYPE_S: + case ONIGENC_CTYPE_W: + { + CClassNode* cc; + *np = node_new_cclass(); + CHECK_NULL_RETURN_MEMERR(*np); + cc = NCCLASS(*np); + add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env); + if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); + } + break; + case ONIGENC_CTYPE_WORD: *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not); CHECK_NULL_RETURN_MEMERR(*np); -- cgit v1.2.3