summary | refs | log | tree | commit | diff
path: root/re.c
diff options
context:
space:
mode:
author: akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2007-12-08 02:50:43 +0000
committer: akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2007-12-08 02:50:43 +0000
commit: f1b7e60cb90a7e1a392d4ffccd07dd06eeff5345 (patch)
tree: 8135b8dc1f1ef8a6bcd08a86c4106c83941780fa /re.c
parent: 990bec97020bfabd09ebfd92581f505b4f09a78a (diff)
* encoding.c (rb_enc_mbclen): make it never fail.
  (rb_enc_nth): don't check the return value of rb_enc_mbclen.
  (rb_enc_strlen): ditto.
  (rb_enc_precise_mbclen): return needmore(1) if e <= p.
  (rb_enc_get_ascii): new function for extracting ASCII character.

* include/ruby/encoding.h (rb_enc_get_ascii): declared.

* include/ruby/regex.h (ismbchar): removed.

* re.c (rb_reg_expr_str): use rb_enc_get_ascii.
  (unescape_escaped_nonascii): use rb_enc_precise_mbclen to determine
  the termination of escaped non-ASCII character.
  (unescape_nonascii): use rb_enc_precise_mbclen.
  (rb_reg_quote): use rb_enc_get_ascii.
  (rb_reg_regsub): use rb_enc_get_ascii.

* string.c (rb_str_reverse): don't check the return value of
  rb_enc_mbclen.
  (rb_str_split_m): don't call rb_enc_mbclen with e <= p.

* parse.y (is_identchar): use ISASCII.
  (parser_ismbchar): removed.
  (parser_precise_mbclen): new macro.
  (parser_isascii): new macro.
  (parser_tokadd_mbchar): use parser_precise_mbclen to check invalid
  character precisely.
  (parser_tokadd_string): use parser_isascii.
  (parser_yylex): ditto.
  (is_special_global_name): don't call is_identchar with e <= p.
  (rb_enc_symname_p): ditto.
  [ruby-dev:32455]

* ext/tk/sample/tkextlib/vu/canvSticker2.rb: remove coding cookie
  because the encoding is not UTF-8. [ruby-dev:32475]

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@14131 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 're.c')
-rw-r--r-- re.c 54
1 files changed, 30 insertions, 24 deletions
diff --git a/re.c b/re.c
index fca7f3a791..c30453591f 100644
--- a/re.c
+++ b/re.c
@@ -218,10 +218,12 @@ rb_reg_expr_str(VALUE str, const char *s, long len)
rb_encoding *enc = rb_enc_get(str);
const char *p, *pend;
int need_escape = 0;
+ int c;
p = s; pend = p + len;
while (p<pend) {
- if (*p == '/' || (!rb_enc_isprint(*p, enc) && !ismbchar(p, pend, enc))) {
+ c = rb_enc_get_ascii(p, pend, enc);
+ if (c == '/' || (c != -1 && !rb_enc_isprint(c, enc))) {
need_escape = 1;
break;
}
@@ -233,29 +235,31 @@ rb_reg_expr_str(VALUE str, const char *s, long len)
else {
p = s;
while (p<pend) {
- if (*p == '\\') {
+ c = rb_enc_get_ascii(p, pend, enc);
+ if (c == '\\') {
int n = mbclen(p+1, pend, enc) + 1;
rb_str_buf_cat(str, p, n);
p += n;
continue;
}
- else if (*p == '/') {
+ else if (c == '/') {
char c = '\\';
rb_str_buf_cat(str, &c, 1);
rb_str_buf_cat(str, p, 1);
}
- else if (ismbchar(p, pend, enc)) {
- rb_str_buf_cat(str, p, mbclen(p, pend, enc));
- p += mbclen(p, pend, enc);
+ else if (c == -1) {
+ int l = mbclen(p, pend, enc);
+ rb_str_buf_cat(str, p, l);
+ p += l;
continue;
}
- else if (rb_enc_isprint(*p, enc)) {
+ else if (rb_enc_isprint(c, enc)) {
rb_str_buf_cat(str, p, 1);
}
- else if (!rb_enc_isspace(*p, enc)) {
+ else if (!rb_enc_isspace(c, enc)) {
char b[8];
- sprintf(b, "\\%03o", *p & 0377);
+ sprintf(b, "\\%03o", c);
rb_str_buf_cat(str, b, 4);
}
else {
@@ -1377,6 +1381,7 @@ unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
char *chbuf = ALLOCA_N(char, chmaxlen);
int chlen = 0;
int byte;
+ int l;
memset(chbuf, 0, chmaxlen);
@@ -1386,7 +1391,8 @@ unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
}
chbuf[chlen++] = byte;
- while (chlen < chmaxlen && chlen != mbclen(chbuf, chbuf+chlen, enc)) {
+ while (chlen < chmaxlen &&
+ MBCLEN_NEEDMORE(rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) {
byte = read_escaped_byte(&p, end, err);
if (byte == -1) {
return -1;
@@ -1394,11 +1400,11 @@ unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
chbuf[chlen++] = byte;
}
- if (chlen != mbclen(chbuf, chbuf+chlen, enc)) {
+ l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc);
+ if (MBCLEN_INVALID(l)) {
strcpy(err, "invalid multibyte escape");
return -1;
}
-
if (1 < chlen || (chbuf[0] & 0x80)) {
rb_str_buf_cat(buf, chbuf, chlen);
@@ -1515,13 +1521,12 @@ unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
char smallbuf[2];
while (p < end) {
- int chlen = mbclen(p, end, enc);
+ int chlen = rb_enc_precise_mbclen(p, end, enc);
+ if (!MBCLEN_CHARFOUND(chlen)) {
+ strcpy(err, "invalid multibyte character");
+ return -1;
+ }
if (1 < chlen || (*p & 0x80)) {
- if (end < p + chlen) {
- strcpy(err, "too short multibyte character");
- return -1;
- }
- /* xxx: validate the non-ascii character */
rb_str_buf_cat(buf, p, chlen);
p += chlen;
if (*encp == 0)
@@ -2093,8 +2098,8 @@ rb_reg_quote(VALUE str)
s = RSTRING_PTR(str);
send = s + RSTRING_LEN(str);
for (; s < send; s++) {
- c = *s;
- if (ismbchar(s, send, enc)) {
+ c = rb_enc_get_ascii(s, send, enc);
+ if (c == -1) {
int n = mbclen(s, send, enc);
while (n-- && s < send)
@@ -2129,8 +2134,8 @@ rb_reg_quote(VALUE str)
t += s - RSTRING_PTR(str);
for (; s < send; s++) {
- c = *s;
- if (ismbchar(s, send, enc)) {
+ c = rb_enc_get_ascii(s, send, enc);
+ if (c == -1) {
int n = mbclen(s, send, enc);
while (n-- && s < send)
@@ -2397,13 +2402,14 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
e = s + RSTRING_LEN(str);
while (s < e) {
+ int c = rb_enc_get_ascii(s, e, enc);
char *ss = s++;
- if (ismbchar(ss, e, enc)) {
+ if (c == -1) {
s += mbclen(ss, e, enc) - 1;
continue;
}
- if (*ss != '\\' || s == e) continue;
+ if (c != '\\' || s == e) continue;
if (!val) {
val = rb_str_buf_new(ss-p);