summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- ChangeLog 12
-rw-r--r-- parse.y 40
-rw-r--r-- re.c 56
3 files changed, 71 insertions, 37 deletions
diff --git a/ChangeLog b/ChangeLog
index 53091f0c2a..8f2ea9cb4d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,15 @@
+Fri Oct 19 16:41:00 2007 Nobuyoshi Nakada <nobu@ruby-lang.org>
+
+ * parse.y (parser_regx_options, reg_compile_gen): relaxed encoding
+ matching rule.
+
+ * re.c (rb_reg_initialize): always set encoding of Regexp.
+
+ * re.c (rb_reg_initialize_str): fix encoding for non 7bit-clean
+ strings.
+
+ * re.c (rb_reg_initialize_m): use ascii encoding for 'n' option.
+
Fri Oct 19 11:09:56 2007 Nobuyoshi Nakada <nobu@ruby-lang.org>
* ruby.c (process_options): set primary encoding from the parser
diff --git a/parse.y b/parse.y
index a6e8637f8e..8653fed1d8 100644
--- a/parse.y
+++ b/parse.y
@@ -261,7 +261,7 @@ struct parser_params {
};
#define STR_NEW(p,n) rb_enc_str_new((p),(n),parser->enc)
-#define STR_NEW0() rb_enc_str_new(0,0,rb_enc_from_index(0))
+#define STR_NEW0() rb_str_new(0,0)
#define STR_NEW2(p) rb_enc_str_new((p),strlen(p),parser->enc)
#define STR_NEW3(p,n,m) parser_str_new((p),(n),STR_ENC(!ENC_SINGLE(m)),(m))
#define STR_ENC(m) ((m)?parser->enc:rb_enc_from_index(0))
@@ -443,6 +443,10 @@ static int lvar_defined_gen(struct parser_params*, ID);
#define lvar_defined(id) lvar_defined_gen(parser, id)
#define RE_OPTION_ONCE (1<<16)
+#define RE_OPTION_ENCODING_SHIFT 8
+#define RE_OPTION_ENCODING(e) (((e)&0xff)<<RE_OPTION_ENCODING_SHIFT)
+#define RE_OPTION_ENCODING_IDX(o) (((o)>>RE_OPTION_ENCODING_SHIFT)&0xff)
+#define RE_OPTION_MASK 0xff
#define NODE_STRTERM NODE_ZARRAY /* nothing to gc */
#define NODE_HEREDOC NODE_ARRAY /* 1, 3 to gc */
@@ -3639,14 +3643,14 @@ regexp : tREGEXP_BEG xstring_contents tREGEXP_END
int options = $3;
NODE *node = $2;
if (!node) {
- node = NEW_LIT(reg_compile(0, options & ~RE_OPTION_ONCE));
+ node = NEW_LIT(reg_compile(STR_NEW0(), options));
}
else switch (nd_type(node)) {
case NODE_STR:
{
VALUE src = node->nd_lit;
nd_set_type(node, NODE_LIT);
- node->nd_lit = reg_compile(src, options&~RE_OPTION_ONCE);
+ node->nd_lit = reg_compile(src, options);
}
break;
default:
@@ -3658,7 +3662,7 @@ regexp : tREGEXP_BEG xstring_contents tREGEXP_END
else {
nd_set_type(node, NODE_DREGX);
}
- node->nd_cflag = options & ~RE_OPTION_ONCE;
+ node->nd_cflag = options & RE_OPTION_MASK;
break;
}
$$ = node;
@@ -5110,11 +5114,12 @@ parser_tokadd_escape(struct parser_params *parser, int term, int *mb)
return 0;
}
+extern int rb_char_to_option_kcode(int c, int *option, int *kcode);
+
static int
parser_regx_options(struct parser_params *parser)
{
- extern int rb_char_to_option_kcode(int c, int *option, int *kcode);
-
+ int kcode = 0;
int options = 0;
int c, opt, kc;
@@ -5125,11 +5130,7 @@ parser_regx_options(struct parser_params *parser)
}
else if (rb_char_to_option_kcode(c, &opt, &kc)) {
options |= opt;
- if (kc != 0 && rb_enc_from_index(kc) != parser->enc) {
- compile_error(PARSER_ARG
- "regexp encoding option '%c' mismatch to %s",
- c, rb_enc_name(parser->enc));
- }
+ if (kc >= 0) kcode = c;
}
else {
tokadd(c);
@@ -5141,7 +5142,7 @@ parser_regx_options(struct parser_params *parser)
compile_error(PARSER_ARG "unknown regexp option%s - %s",
toklen() > 1 ? "s" : "", tok());
}
- return options;
+ return options | RE_OPTION_ENCODING(kcode);
}
#define STR_FUNC_ESCAPE 0x01
@@ -8212,8 +8213,21 @@ VALUE rb_reg_compile(VALUE str, int options);
static VALUE
reg_compile_gen(struct parser_params* parser, VALUE str, int options)
{
- VALUE re = rb_reg_compile(str, (options) & ~RE_OPTION_ONCE);
+ VALUE re;
+ int c = RE_OPTION_ENCODING_IDX(options);
+ if (c) {
+ int opt, idx;
+ rb_char_to_option_kcode(c, &opt, &idx);
+ if (idx != ENCODING_GET(str) && ENCODING_GET(str) &&
+ rb_enc_str_coderange(str) != ENC_CODERANGE_SINGLE) {
+ compile_error(PARSER_ARG
+ "regexp encoding option '%c' differs from source encoding '%s'",
+ c, rb_enc_name(rb_enc_get(str)));
+ }
+ ENCODING_SET(str, idx);
+ }
+ re = rb_reg_compile(str, options & RE_OPTION_MASK);
if (NIL_P(re)) {
RB_GC_GUARD(re) = rb_obj_as_string(rb_errinfo());
compile_error(PARSER_ARG "%s", RSTRING_PTR(re));
diff --git a/re.c b/re.c
index 9501595bfb..8acf438cb2 100644
--- a/re.c
+++ b/re.c
@@ -136,8 +136,11 @@ rb_memsearch(const void *x0, long m, const void *y0, long n)
#define KCODE_FIXED FL_USER4
-#define ARG_REG_OPTION_MASK 0x0f
-#define ARG_KCODE_NONE 0x10
+#define ARG_REG_OPTION_MASK \
+ (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND)
+#define ARG_ENCODING_FIXED 16
+
+#define ARG_KCODE_NONE 0
#define ARG_KCODE_EUC 1
#define ARG_KCODE_SJIS 2
#define ARG_KCODE_UTF8 3
@@ -157,9 +160,6 @@ char_to_option(int c)
case 'm':
val = ONIG_OPTION_MULTILINE;
break;
- case 'n':
- val = ARG_KCODE_NONE;
- break;
default:
val = 0;
break;
@@ -184,19 +184,24 @@ rb_char_to_option_kcode(int c, int *option, int *kcode)
*option = 0;
switch (c) {
+ case 'n':
+ *kcode = ARG_KCODE_NONE;
+ break;
case 'e':
*kcode = ARG_KCODE_EUC;
- return 1;
+ break;
case 's':
*kcode = ARG_KCODE_SJIS;
- return 1;
+ break;
case 'u':
*kcode = ARG_KCODE_UTF8;
- return 1;
+ break;
default:
- *kcode = 0;
+ *kcode = -1;
return (*option = char_to_option(c));
}
+ *option = ARG_ENCODING_FIXED;
+ return 1;
}
static void
@@ -1227,14 +1232,10 @@ rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc,
re->ptr = 0;
re->str = 0;
- if (options & ARG_KCODE_NONE) {
- rb_enc_associate_index((VALUE)re, 0);
- enc = rb_enc_from_index(0);
+ rb_enc_associate((VALUE)re, enc);
+ if (options & ARG_ENCODING_FIXED) {
re->basic.flags |= KCODE_FIXED;
}
- else {
- rb_enc_associate((VALUE)re, enc);
- }
re->ptr = make_regexp(s, len, enc, options & ARG_REG_OPTION_MASK, err);
if (!re->ptr) return -1;
re->str = ALLOC_N(char, len+1);
@@ -1247,6 +1248,9 @@ rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc,
static int
rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err)
{
+ if (rb_enc_str_coderange(str) != ENC_CODERANGE_SINGLE) {
+ options |= ARG_ENCODING_FIXED;
+ }
return rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), rb_enc_get(str),
options, err);
}
@@ -1573,21 +1577,21 @@ rb_reg_initialize_m(int argc, VALUE *argv, VALUE self)
onig_errmsg_buffer err;
int flags = 0;
VALUE str;
+ rb_encoding *enc;
+ const char *ptr;
+ long len;
if (argc == 0 || argc > 3) {
rb_raise(rb_eArgError, "wrong number of arguments");
}
if (TYPE(argv[0]) == T_REGEXP) {
VALUE re = argv[0];
- const char *ptr;
- long len;
- rb_encoding *enc;
if (argc > 1) {
rb_warn("flags ignored");
}
rb_reg_check(re);
- flags = RREGEXP(argv[0])->ptr->options & ARG_REG_OPTION_MASK;
+ flags = rb_reg_options(re);
ptr = RREGEXP(re)->str;
len = RREGEXP(re)->len;
enc = rb_enc_get(re);
@@ -1601,18 +1605,22 @@ rb_reg_initialize_m(int argc, VALUE *argv, VALUE self)
if (FIXNUM_P(argv[1])) flags = FIX2INT(argv[1]);
else if (RTEST(argv[1])) flags = ONIG_OPTION_IGNORECASE;
}
+ enc = 0;
if (argc == 3 && !NIL_P(argv[2])) {
char *kcode = StringValuePtr(argv[2]);
if (kcode[0] == 'n' || kcode[1] == 'N') {
- flags |= ARG_KCODE_NONE;
+ enc = rb_enc_from_index(0);
+ flags |= ARG_ENCODING_FIXED;
}
else {
rb_warning("encoding option is obsolete - %s", kcode);
}
}
str = argv[0];
- StringValueCStr(str);
- if (rb_reg_initialize_str(self, str, flags, err)) {
+ ptr = StringValueCStr(str);
+ if (enc
+ ? rb_reg_initialize(self, ptr, RSTRING_LEN(str), enc, flags, err)
+ : rb_reg_initialize_str(self, str, flags, err)) {
rb_reg_raise_str(str, flags, err);
}
}
@@ -1731,8 +1739,8 @@ rb_reg_options(VALUE re)
int options;
rb_reg_check(re);
- options = RREGEXP(re)->ptr->options &
- (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND);
+ options = RREGEXP(re)->ptr->options & ARG_REG_OPTION_MASK;
+ if (RBASIC(re)->flags & KCODE_FIXED) options |= ARG_ENCODING_FIXED;
return options;
}