From a25fbe3b3e531bbe479f344af24eaf9d2eeae6ea Mon Sep 17 00:00:00 2001 From: matz Date: Sat, 25 Aug 2007 03:29:39 +0000 Subject: * encoding.c: provide basic features for M17N. * parse.y: encoding aware parsing. * parse.y (pragma_encoding): encoding specification pragma. * parse.y (rb_intern3): encoding specified symbols. * string.c (rb_str_length): length based on characters. for older behavior, bytesize method added. * string.c (rb_str_index_m): index based on characters. rindex as well. * string.c (succ_char): encoding aware succeeding string. * string.c (rb_str_reverse): reverse based on characters. * string.c (rb_str_inspect): encoding aware string description. * string.c (rb_str_upcase_bang): encoding aware case conversion. downcase, capitalize, swapcase as well. * string.c (rb_str_tr_bang): tr based on characters. delete, squeeze, tr_s, count as well. * string.c (rb_str_split_m): split based on characters. * string.c (rb_str_each_line): encoding aware each_line. * string.c (rb_str_each_char): added. iteration based on characters. * string.c (rb_str_strip_bang): encoding aware whitespace stripping. lstrip, rstrip as well. * string.c (rb_str_justify): encoding aware justifying (ljust, rjust, center). * string.c (str_encoding): get encoding attribute from a string. * re.c (rb_reg_initialize): encoding aware regular expression * sprintf.c (rb_str_format): formatting (i.e. length count) based on characters. * io.c (rb_io_getc): getc to return one-character string. for older behavior, getbyte method added. * ext/stringio/stringio.c (strio_getc): ditto. * io.c (rb_io_ungetc): allow pushing arbitrary string at the current reading point. * ext/stringio/stringio.c (strio_ungetc): ditto. * ext/strscan/strscan.c: encoding support. 
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@13261 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- re.c | 145 ++++++++++++++++++++++++++++++++++++------------------------------- 1 file changed, 78 insertions(+), 67 deletions(-) (limited to 're.c') diff --git a/re.c b/re.c index 824d56210c..d44f274197 100644 --- a/re.c +++ b/re.c @@ -5,12 +5,13 @@ $Author$ created at: Mon Aug 9 18:24:49 JST 1993 - Copyright (C) 1993-2006 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto **********************************************************************/ #include "ruby/ruby.h" #include "ruby/re.h" +#include "ruby/encoding.h" #include "regint.h" #include <ctype.h> @@ -289,23 +290,27 @@ kcode_to_arg_value(unsigned int kcode) static void set_re_kcode_by_option(struct RRegexp *re, int options) { + rb_encoding *enc = 0; + + FL_UNSET(re, KCODE_MASK); switch (options & ARG_KCODE_MASK) { case ARG_KCODE_NONE: - FL_UNSET(re, KCODE_MASK); + enc = rb_enc_from_index(0); + FL_SET(re, KCODE_NONE); FL_SET(re, KCODE_FIXED); break; case ARG_KCODE_EUC: - FL_UNSET(re, KCODE_MASK); + enc = rb_enc_find("euc-jp"); FL_SET(re, KCODE_EUC); FL_SET(re, KCODE_FIXED); break; case ARG_KCODE_SJIS: - FL_UNSET(re, KCODE_MASK); - FL_SET(re, KCODE_SJIS); + enc = rb_enc_find("sjis"); FL_SET(re, KCODE_FIXED); + FL_SET(re, KCODE_SJIS); break; case ARG_KCODE_UTF8: - FL_UNSET(re, KCODE_MASK); + enc = rb_enc_find("utf-8"); FL_SET(re, KCODE_UTF8); FL_SET(re, KCODE_FIXED); break; @@ -315,6 +320,9 @@ set_re_kcode_by_option(struct RRegexp *re, int options) FL_SET(re, reg_kcode); break; } + if (enc) { + rb_enc_associate((VALUE)re, enc); + } } static int @@ -371,15 +379,9 @@ kcode_reset_option(void) int rb_reg_mbclen2(unsigned int c, VALUE re) { - int len; unsigned char uc = (unsigned char)c; - if (!FL_TEST(re, KCODE_FIXED)) - return mbclen(uc); - kcode_set_option(re); - len = mbclen(uc); - kcode_reset_option(); - return len; + return rb_enc_mbclen(&uc, rb_enc_get(re)); } static void @@ -393,16 +395,17 @@ rb_reg_check(VALUE
re) static void rb_reg_expr_str(VALUE str, const char *s, long len) { + rb_encoding *enc = rb_enc_get(str); const char *p, *pend; int need_escape = 0; p = s; pend = p + len; while (p true or false @@ -1489,7 +1484,7 @@ match_inspect(VALUE match) VALUE rb_cRegexp; static int -rb_reg_initialize(VALUE obj, const char *s, long len, +rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc, int options, onig_errmsg_buffer err) { struct RRegexp *re = RREGEXP(obj); @@ -1504,7 +1499,12 @@ rb_reg_initialize(VALUE obj, const char *s, long len, re->ptr = 0; re->str = 0; - set_re_kcode_by_option(re, options); + if (options & ARG_KCODE_MASK) { + set_re_kcode_by_option(re, options); + } + else { + rb_enc_associate((VALUE)re, enc); + } if (options & ARG_KCODE_MASK) { kcode_set_option((VALUE)re); @@ -1525,6 +1525,13 @@ rb_reg_initialize(VALUE obj, const char *s, long len, return 0; } +static int +rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err) +{ + return rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), rb_enc_get(str), + options, err); +} + static VALUE rb_reg_s_alloc(VALUE klass) { @@ -1539,27 +1546,35 @@ rb_reg_s_alloc(VALUE klass) } VALUE -rb_reg_new(const char *s, long len, int options) +rb_reg_new(VALUE s, int options) { VALUE re = rb_reg_s_alloc(rb_cRegexp); onig_errmsg_buffer err; - if (rb_reg_initialize(re, s, len, options, err) != 0) { - rb_exc_raise(rb_reg_error_desc(s, len, options, err)); + if (rb_reg_initialize_str(re, s, options, err) != 0) { + rb_reg_raise_str(s, err, re); } return re; } VALUE -rb_reg_compile(const char *s, long len, int options) +rb_reg_compile(VALUE str, int options) { VALUE re = rb_reg_s_alloc(rb_cRegexp); onig_errmsg_buffer err; - if (rb_reg_initialize(re, s, len, options, err) != 0) { - rb_set_errinfo(rb_reg_error_desc(s, len, options, err)); - return Qnil; + if (!str) str = rb_str_new(0,0); + if (rb_reg_initialize_str(re, str, options, err) != 0) { + char opts[6]; + VALUE desc = 
rb_str_buf_new2(err); + + rb_str_buf_cat2(desc, ": /"); + rb_reg_expr_str(desc, RSTRING_PTR(str), RSTRING_LEN(str)); + opts[0] = '/'; + option_to_str(opts + 1, options); + strlcat(opts, arg_kcode(options), sizeof(opts)); + return rb_str_buf_cat2(desc, opts); } FL_SET(re, REG_LITERAL); return re; @@ -1581,8 +1596,7 @@ rb_reg_regcomp(VALUE str) case_cache = ruby_ignorecase; kcode_cache = reg_kcode; - return reg_cache = rb_reg_new(RSTRING_PTR(save_str), RSTRING_LEN(save_str), - ruby_ignorecase); + return reg_cache = rb_reg_new(save_str, ruby_ignorecase); } static int @@ -1843,9 +1857,8 @@ static VALUE rb_reg_initialize_m(int argc, VALUE *argv, VALUE self) { onig_errmsg_buffer err; - const char *s; - long len; int flags = 0; + VALUE str; if (argc == 0 || argc > 3) { rb_raise(rb_eArgError, "wrong number of arguments"); @@ -1859,8 +1872,8 @@ rb_reg_initialize_m(int argc, VALUE *argv, VALUE self) if (FL_TEST(argv[0], KCODE_FIXED)) { flags |= re_to_kcode_arg_value(argv[0]); } - s = RREGEXP(argv[0])->str; - len = RREGEXP(argv[0])->len; + str = rb_enc_str_new(RREGEXP(argv[0])->str, RREGEXP(argv[0])->len, + rb_enc_get(argv[0])); } else { if (argc >= 2) { @@ -1873,11 +1886,10 @@ rb_reg_initialize_m(int argc, VALUE *argv, VALUE self) flags &= ~ARG_KCODE_MASK; flags |= char_to_arg_kcode((int )kcode[0]); } - s = StringValuePtr(argv[0]); - len = RSTRING_LEN(argv[0]); + str = argv[0]; } - if (rb_reg_initialize(self, s, len, flags, err) != 0) { - rb_exc_raise(rb_reg_error_desc(s, len, flags, err)); + if (rb_reg_initialize_str(self, str, flags, err) != 0) { + rb_reg_raise_str(str, err, self); } return self; } @@ -1885,6 +1897,7 @@ rb_reg_initialize_m(int argc, VALUE *argv, VALUE self) VALUE rb_reg_quote(VALUE str) { + rb_encoding *enc = rb_enc_get(str); char *s, *send, *t; VALUE tmp; int c; @@ -1893,8 +1906,8 @@ rb_reg_quote(VALUE str) send = s + RSTRING_LEN(str); for (; s < send; s++) { c = *s; - if (ismbchar(*s)) { - int n = mbclen(*s); + if (ismbchar(s, enc)) { + int n = mbclen(s, 
enc); while (n-- && s < send) s++; @@ -1922,8 +1935,8 @@ rb_reg_quote(VALUE str) for (; s < send; s++) { c = *s; - if (ismbchar(*s)) { - int n = mbclen(*s); + if (ismbchar(s, enc)) { + int n = mbclen(s, enc); while (n-- && s < send) *t++ = *s++; @@ -2146,9 +2159,8 @@ rb_reg_init_copy(VALUE copy, VALUE re) rb_reg_check(re); s = RREGEXP(re)->str; len = RREGEXP(re)->len; - options = rb_reg_options(re); - if (rb_reg_initialize(copy, s, len, options, err) != 0) { - rb_exc_raise(rb_reg_error_desc(s, len, options, err)); + if (rb_reg_initialize(copy, s, len, rb_enc_get(re), rb_reg_options(re), err) != 0) { + rb_reg_raise(s, len, err, copy); } return copy; } @@ -2160,20 +2172,20 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp) char *p, *s, *e; unsigned char uc; int no; + rb_encoding *enc = rb_enc_check(str, src); - + rb_enc_check(str, regexp); p = s = RSTRING_PTR(str); e = s + RSTRING_LEN(str); while (s < e) { - char *ss = s; + char *ss = s++; - uc = (unsigned char)*s++; - if (ismbchar(uc)) { - s += mbclen(uc) - 1; + if (ismbchar(ss, enc)) { + s += mbclen(ss, enc) - 1; continue; } - if (uc != '\\' || s == e) continue; + if (*ss != '\\' || s == e) continue; if (!val) { val = rb_str_buf_new(ss-p); @@ -2203,8 +2215,7 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp) name_end = name = s + 1; while (name_end < e) { if (*name_end == '>') break; - uc = (unsigned char)*name_end; - name_end += mbclen(uc); + name_end += mbclen(name_end, enc); } if (name_end < e) { no = name_to_backref_number(regs, regexp, name, name_end); -- cgit v1.2.3