From c898839035b6fe74f8281fb918c13f856690bfde Mon Sep 17 00:00:00 2001
From: yugui <yugui@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>
Date: Fri, 27 Nov 2009 02:54:10 +0000
Subject: merges r24544 from trunk into ruby_1_9_1. -- \d, \s and \w are now
 non-Unicode classes. [ruby-dev:39026]

* include/ruby/oniguruma.h
  (ONIGENC_CTYPE_SPECIAL_MASK): added.
  (ONIGENC_CTYPE_D): ditto.
  (ONIGENC_CTYPE_S): ditto.
  (ONIGENC_CTYPE_W): ditto.

* regparse.c: \d, \s and \w are now non-Unicode classes (they match
  ASCII characters only). [ruby-dev:39026]
  (fetch_token_in_cc): use ONIGENC_CTYPE_[DSW] for \d/\s/\w.
  (fetch_token): ditto.
  (add_ctype_to_cc): add routines for ONIGENC_CTYPE_[DSW].
  (parse_exp): ditto.

* test/ruby/test_regexp.rb (TestRegexp#test_char_class):
  add tests for the above.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/branches/ruby_1_9_1@25941 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
---
 regparse.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 47 insertions(+), 12 deletions(-)

(limited to 'regparse.c')

diff --git a/regparse.c b/regparse.c
index 1fc0459fa6..75ad24feda 100644
--- a/regparse.c
+++ b/regparse.c
@@ -2946,32 +2946,32 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
     switch (c) {
     case 'w':
       tok->type = TK_CHAR_TYPE;
-      tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
+      tok->u.prop.ctype = ONIGENC_CTYPE_W;
       tok->u.prop.not   = 0;
       break;
     case 'W':
       tok->type = TK_CHAR_TYPE;
-      tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
+      tok->u.prop.ctype = ONIGENC_CTYPE_W;
       tok->u.prop.not   = 1;
       break;
     case 'd':
       tok->type = TK_CHAR_TYPE;
-      tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
+      tok->u.prop.ctype = ONIGENC_CTYPE_D;
       tok->u.prop.not   = 0;
       break;
     case 'D':
       tok->type = TK_CHAR_TYPE;
-      tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
+      tok->u.prop.ctype = ONIGENC_CTYPE_D;
       tok->u.prop.not   = 1;
       break;
     case 's':
       tok->type = TK_CHAR_TYPE;
-      tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
+      tok->u.prop.ctype = ONIGENC_CTYPE_S;
       tok->u.prop.not   = 0;
       break;
     case 'S':
       tok->type = TK_CHAR_TYPE;
-      tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
+      tok->u.prop.ctype = ONIGENC_CTYPE_S;
       tok->u.prop.not   = 1;
       break;
     case 'h':
@@ -3233,14 +3233,14 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
     case 'w':
       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
       tok->type = TK_CHAR_TYPE;
-      tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
+      tok->u.prop.ctype = ONIGENC_CTYPE_W;
       tok->u.prop.not   = 0;
       break;
 
     case 'W':
       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
       tok->type = TK_CHAR_TYPE;
-      tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
+      tok->u.prop.ctype = ONIGENC_CTYPE_W;
       tok->u.prop.not   = 1;
       break;
 
@@ -3273,28 +3273,28 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
     case 's':
       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
       tok->type = TK_CHAR_TYPE;
-      tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
+      tok->u.prop.ctype = ONIGENC_CTYPE_S;
       tok->u.prop.not   = 0;
       break;
 
     case 'S':
       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
       tok->type = TK_CHAR_TYPE;
-      tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
+      tok->u.prop.ctype = ONIGENC_CTYPE_S;
       tok->u.prop.not   = 1;
       break;
 
     case 'd':
       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
       tok->type = TK_CHAR_TYPE;
-      tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
+      tok->u.prop.ctype = ONIGENC_CTYPE_D;
       tok->u.prop.not   = 0;
       break;
 
     case 'D':
       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
       tok->type = TK_CHAR_TYPE;
-      tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
+      tok->u.prop.ctype = ONIGENC_CTYPE_D;
       tok->u.prop.not   = 1;
       break;
 
@@ -3835,6 +3835,28 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
   OnigCodePoint sb_out;
   OnigEncoding enc = env->enc;
 
+  switch (ctype) {
+  case ONIGENC_CTYPE_D:
+  case ONIGENC_CTYPE_S:
+  case ONIGENC_CTYPE_W:
+    ctype ^= ONIGENC_CTYPE_SPECIAL_MASK;
+    if (not != 0) {
+      for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
+	if (! ONIGENC_IS_ASCII_CODE_CTYPE((OnigCodePoint )c, ctype))
+	  BITSET_SET_BIT(cc->bs, c);
+      }
+      ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
+    }
+    else {
+      for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
+	if (ONIGENC_IS_ASCII_CODE_CTYPE((OnigCodePoint )c, ctype))
+	  BITSET_SET_BIT(cc->bs, c);
+      }
+    }
+    return 0;
+    break;
+  }
+
   r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
   if (r == 0) {
     return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sb_out, ranges);
@@ -5186,6 +5208,19 @@ parse_exp(Node** np, OnigToken* tok, int term,
   case TK_CHAR_TYPE:
     {
       switch (tok->u.prop.ctype) {
+      case ONIGENC_CTYPE_D:
+      case ONIGENC_CTYPE_S:
+      case ONIGENC_CTYPE_W:
+	{
+	    CClassNode* cc;
+	    *np = node_new_cclass();
+	    CHECK_NULL_RETURN_MEMERR(*np);
+	    cc = NCCLASS(*np);
+	    add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env);
+	    if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
+	}
+	break;
+
       case ONIGENC_CTYPE_WORD:
 	*np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not);
 	CHECK_NULL_RETURN_MEMERR(*np);
-- 
cgit v1.2.3