summaryrefslogtreecommitdiff
path: root/re.c
diff options
context:
space:
mode:
Diffstat (limited to 're.c')
-rw-r--r--re.c446
1 files changed, 431 insertions, 15 deletions
diff --git a/re.c b/re.c
index 398e748f3a..78cfd018f5 100644
--- a/re.c
+++ b/re.c
@@ -12,6 +12,7 @@
#include "ruby/ruby.h"
#include "ruby/re.h"
#include "ruby/encoding.h"
+#include "ruby/util.h"
#include "regint.h"
#include <ctype.h>
@@ -715,6 +716,10 @@ rb_reg_fixed_encoding_p(VALUE re)
return Qfalse;
}
+static VALUE
+rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
+ rb_encoding **fixed_enc, onig_errmsg_buffer err);
+
static void
rb_reg_prepare_re(VALUE re, VALUE str)
{
@@ -740,13 +745,19 @@ rb_reg_prepare_re(VALUE re, VALUE str)
OnigErrorInfo einfo;
regex_t *reg, *reg2;
UChar *pattern;
+ VALUE unescaped;
+ rb_encoding *fixed_enc = 0;
rb_reg_check(re);
reg = RREGEXP(re)->ptr;
pattern = ((UChar*)RREGEXP(re)->str);
- r = onig_new(&reg2, (UChar* )pattern,
- (UChar* )(pattern + RREGEXP(re)->len),
+ unescaped = rb_reg_preprocess(
+ RREGEXP(re)->str, RREGEXP(re)->str + RREGEXP(re)->len, enc,
+ &fixed_enc, err);
+
+ r = onig_new(&reg2, (UChar* )RSTRING_PTR(unescaped),
+ (UChar* )(RSTRING_PTR(unescaped) + RSTRING_LEN(unescaped)),
reg->options, enc,
OnigDefaultSyntax, &einfo);
if (r) {
@@ -756,6 +767,7 @@ rb_reg_prepare_re(VALUE re, VALUE str)
RREGEXP(re)->ptr = reg2;
onig_free(reg);
+ RB_GC_GUARD(unescaped);
}
}
@@ -1236,12 +1248,407 @@ match_inspect(VALUE match)
VALUE rb_cRegexp;
static int
+read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err)
+{
+ const char *p = *pp;
+ int code;
+ int meta_prefix = 0, ctrl_prefix = 0;
+ int len;
+ int retbyte;
+
+ retbyte = -1;
+ if (p == end || *p++ != '\\') {
+ strcpy(err, "too short escaped multibyte character");
+ return -1;
+ }
+
+again:
+ if (p == end) {
+ strcpy(err, "too short escape sequence");
+ return -1;
+ }
+ switch (*p++) {
+ case '\\': code = '\\'; break;
+ case 'n': code = '\n'; break;
+ case 't': code = '\t'; break;
+ case 'r': code = '\r'; break;
+ case 'f': code = '\f'; break;
+ case 'v': code = '\013'; break;
+ case 'a': code = '\007'; break;
+ case 'e': code = '\033'; break;
+
+ /* \OOO */
+ case '0': case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7':
+ p--;
+ code = ruby_scan_oct(p, end < p+3 ? end-p : 3, &len);
+ p += len;
+ break;
+
+ case 'x': /* \xHH */
+ code = ruby_scan_hex(p, end < p+2 ? end-p : 2, &len);
+ if (len < 1) {
+ strcpy(err, "invalid hex escape");
+ return -1;
+ }
+ p += len;
+ break;
+
+ case 'M': /* \M-X, \M-\C-X, \M-\cX */
+ if (meta_prefix) {
+ strcpy(err, "duplicate meta escape");
+ return -1;
+ }
+ meta_prefix = 1;
+ if (p+1 < end && *p++ == '-' && (*p & 0x80) == 0) {
+ if (*p == '\\') {
+ p++;
+ goto again;
+ }
+ else {
+ code = *p++;
+ break;
+ }
+ }
+ strcpy(err, "too short meta escape");
+ return -1;
+
+ case 'C': /* \C-X, \C-\M-X */
+ if (p == end || *p++ != '-') {
+ strcpy(err, "too short control escape");
+ return -1;
+ }
+ case 'c': /* \cX, \c\M-X */
+ if (ctrl_prefix) {
+ strcpy(err, "duplicate control escape");
+ return -1;
+ }
+ ctrl_prefix = 1;
+ if (p < end && (*p & 0x80) == 0) {
+ if (*p == '\\') {
+ p++;
+ goto again;
+ }
+ else {
+ code = *p++;
+ break;
+ }
+ }
+ strcpy(err, "too short control escape");
+ return -1;
+
+ default:
+ strcpy(err, "unexpected escape sequence");
+ return -1;
+ }
+ if (code < 0 || 0xff < code) {
+ strcpy(err, "invalid escape code");
+ return -1;
+ }
+
+ if (ctrl_prefix)
+ code &= 0x1f;
+ if (meta_prefix)
+ code |= 0x80;
+
+ *pp = p;
+ return code;
+}
+
+static int
+unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
+ VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
+{
+ const char *p = *pp;
+ int chmaxlen = rb_enc_mbmaxlen(enc);
+ char *chbuf = ALLOCA_N(char, chmaxlen);
+ int chlen = 0;
+ int byte;
+
+ memset(chbuf, 0, chmaxlen);
+
+ byte = read_escaped_byte(&p, end, err);
+ if (byte == -1) {
+ return -1;
+ }
+
+ chbuf[chlen++] = byte;
+ while (chlen < chmaxlen && chlen != mbclen(chbuf, chbuf+chmaxlen, enc)) {
+ byte = read_escaped_byte(&p, end, err);
+ if (byte == -1) {
+ return -1;
+ }
+ chbuf[chlen++] = byte;
+ }
+
+ if (chlen != mbclen(chbuf, chbuf+chmaxlen, enc)) {
+ strcpy(err, "invalid multibyte escape");
+ return -1;
+ }
+
+ if (1 < chlen || (chbuf[0] & 0x80)) {
+ rb_str_buf_cat(buf, chbuf, chlen);
+
+ if (*encp == 0)
+ *encp = enc;
+ else if (*encp != enc) {
+ strcpy(err, "character encodings differ");
+ return -1;
+ }
+ }
+ else {
+ char escbuf[5];
+ snprintf(escbuf, sizeof(escbuf), "\\x%02x", chbuf[0]&0xff);
+ rb_str_buf_cat(buf, escbuf, 4);
+ }
+ *pp = p;
+ return 0;
+}
+
+static int
+append_utf8(unsigned long uv,
+ VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
+{
+ if (uv < 0x80) {
+ char escbuf[5];
+ snprintf(escbuf, sizeof(escbuf), "\\x%02x", (int)uv);
+ rb_str_buf_cat(buf, escbuf, 4);
+ }
+ else {
+ int len;
+ char utf8buf[6];
+ len = rb_uv_to_utf8(utf8buf, uv);
+ rb_str_buf_cat(buf, utf8buf, len);
+
+ if (*encp == 0)
+ *encp = rb_enc_find("utf-8");
+ else if (*encp != rb_enc_find("utf-8")) {
+ strcpy(err, "character encodings differ");
+ return -1;
+ }
+ }
+ return 0;
+}
+
+static int
+unescape_unicode_list(const char **pp, const char *end,
+ VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
+{
+ const char *p = *pp;
+ int has_unicode = 0;
+ unsigned long code;
+ int len;
+
+ while (p < end && ISSPACE(*p)) p++;
+
+ while (1) {
+ code = ruby_scan_hex(p, end-p, &len);
+ if (len == 0)
+ break;
+ if (6 < len) { /* max 10FFFF */
+ strcpy(err, "invalid unicode range");
+ return -1;
+ }
+ if (0x10ffff < code) {
+ strcpy(err, "invalid unicode range");
+ return -1;
+ }
+ p += len;
+ if (append_utf8(code, buf, encp, err) != 0)
+ return -1;
+ has_unicode = 1;
+
+ while (p < end && ISSPACE(*p)) p++;
+ }
+
+ if (has_unicode == 0) {
+ strcpy(err, "invalid unicode list");
+ return -1;
+ }
+
+ *pp = p;
+
+ return 0;
+}
+
+static int
+unescape_unicode_bmp(const char **pp, const char *end,
+ VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
+{
+ const char *p = *pp;
+ int len;
+ unsigned long code;
+
+ if (end < p+4) {
+ strcpy(err, "invalid unicode escape");
+ return -1;
+ }
+ code = ruby_scan_hex(p, 4, &len);
+ if (len != 4) {
+ strcpy(err, "invalid unicode escape");
+ return -1;
+ }
+ if (append_utf8(code, buf, encp, err) != 0)
+ return -1;
+ *pp = p + 4;
+ return 0;
+}
+
+static int
+unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
+ VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
+{
+ char c;
+ char smallbuf[2];
+
+ while (p < end) {
+ int chlen = mbclen(p, end, enc);
+ if (1 < chlen || (*p & 0x80)) {
+ if (end < p + chlen) {
+ strcpy(err, "too short multibyte character");
+ return -1;
+ }
+ /* xxx: validate the non-ascii character */
+ rb_str_buf_cat(buf, p, chlen);
+ p += chlen;
+ if (*encp == 0)
+ *encp = enc;
+ else if (*encp != enc) {
+ strcpy(err, "character encodings differ");
+ return -1;
+ }
+ continue;
+ }
+
+ switch (c = *p++) {
+ case '\\':
+ if (p == end) {
+ strcpy(err, "too short escape sequence");
+ return -1;
+ }
+ switch (c = *p++) {
+ case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */
+ {
+ int octlen;
+ if (ruby_scan_oct(p-1, end-(p-1), &octlen) <= 0177) {
+ /* backref or 7bit octal.
+ no need to unescape anyway.
+ re-escaping may break backref */
+ goto escape_asis;
+ }
+ }
+ /* xxx: How about more than 199 subexpressions? */
+
+ case '0': /* \0, \0O, \0OO */
+
+ case 'x': /* \xHH */
+ case 'c': /* \cX, \c\M-X */
+ case 'C': /* \C-X, \C-\M-X */
+ case 'M': /* \M-X, \M-\C-X, \M-\cX */
+ p = p-2;
+ if (unescape_escaped_nonascii(&p, end, enc, buf, encp, err) != 0)
+ return -1;
+ break;
+
+ case 'u':
+ if (p == end) {
+ strcpy(err, "too short escape sequence");
+ return -1;
+ }
+ if (*p == '{') {
+ /* \u{H HH HHH HHHH HHHHH HHHHHH ...} */
+ p++;
+ if (unescape_unicode_list(&p, end, buf, encp, err) != 0)
+ return -1;
+ if (p == end || *p++ != '}') {
+ strcpy(err, "invalid unicode list");
+ return -1;
+ }
+ break;
+ }
+ else {
+ /* \uHHHH */
+ if (unescape_unicode_bmp(&p, end, buf, encp, err) != 0)
+ return -1;
+ break;
+ }
+
+ default: /* \n, \\, \d, \9, etc. */
+escape_asis:
+ smallbuf[0] = '\\';
+ smallbuf[1] = c;
+ rb_str_buf_cat(buf, smallbuf, 2);
+ break;
+ }
+ break;
+
+ default:
+ rb_str_buf_cat(buf, &c, 1);
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static VALUE
+rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
+ rb_encoding **fixed_enc, onig_errmsg_buffer err)
+{
+ VALUE buf;
+
+ buf = rb_str_buf_new(0);
+
+ *fixed_enc = 0;
+ if (unescape_nonascii(p, end, enc, buf, fixed_enc, err) != 0)
+ return Qnil;
+
+ if (fixed_enc) {
+ rb_enc_associate(buf, *fixed_enc);
+ }
+
+ return buf;
+}
+
+#if 0
+static VALUE
+rb_reg_preprocess_obj(VALUE str,
+ rb_encoding **fixed_enc, onig_errmsg_buffer err)
+{
+ VALUE buf;
+ char *p, *end;
+ rb_encoding *enc;
+
+ StringValue(str);
+ p = RSTRING_PTR(str);
+ end = p + RSTRING_LEN(str);
+ enc = rb_enc_get(str);
+
+ buf = rb_reg_preprocess(p, end, enc, fixed_enc, err);
+ RB_GC_GUARD(str);
+ return buf;
+}
+
+static VALUE
+rb_reg_preprocess_m(VALUE klass, VALUE obj)
+{
+ rb_encoding *fixed_enc = 0;
+ onig_errmsg_buffer err;
+ VALUE str = rb_reg_preprocess_obj(obj, &fixed_enc, err);
+ if (str == Qnil)
+ rb_raise(rb_eArgError, "%s", err);
+ return rb_assoc_new(str, fixed_enc ? Qtrue : Qfalse);
+}
+#endif
+
+static int
rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc,
int options, onig_errmsg_buffer err)
{
struct RRegexp *re = RREGEXP(obj);
- int raw8bit;
- long i;
+ VALUE unescaped;
+ rb_encoding *fixed_enc = 0;
if (!OBJ_TAINTED(obj) && rb_safe_level() >= 4)
rb_raise(rb_eSecurityError, "Insecure: can't modify regexp");
@@ -1253,33 +1660,38 @@ rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc,
re->ptr = 0;
re->str = 0;
- raw8bit = 0;
- for (i = 0; i < len; i++) {
- if (s[i] & 0x80) {
- raw8bit = 1;
- break;
- }
+ unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err);
+ if (unescaped == Qnil)
+ return -1;
+
+ if (fixed_enc && (options & ARG_ENCODING_FIXED) && fixed_enc != enc) {
+ strcpy(err, "character encodings differ");
+ return -1;
}
+ if (fixed_enc)
+ enc = fixed_enc;
+ else if (!(options & ARG_ENCODING_FIXED))
+ enc = rb_default_encoding();
+
rb_enc_associate((VALUE)re, enc);
- if (options & ARG_ENCODING_FIXED || raw8bit) {
+ if ((options & ARG_ENCODING_FIXED) || fixed_enc) {
re->basic.flags |= KCODE_FIXED;
}
- re->ptr = make_regexp(s, len, enc, options & ARG_REG_OPTION_MASK, err);
+ re->ptr = make_regexp(RSTRING_PTR(unescaped), RSTRING_LEN(unescaped), enc,
+ options & ARG_REG_OPTION_MASK, err);
if (!re->ptr) return -1;
re->str = ALLOC_N(char, len+1);
memcpy(re->str, s, len);
re->str[len] = '\0';
re->len = len;
+ RB_GC_GUARD(unescaped);
return 0;
}
static int
rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err)
{
- if (!rb_enc_str_asciionly_p(str)) {
- options |= ARG_ENCODING_FIXED;
- }
return rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), rb_enc_get(str),
options, err);
}
@@ -2183,6 +2595,10 @@ Init_Regexp(void)
rb_define_singleton_method(rb_cRegexp, "last_match", rb_reg_s_last_match, -1);
rb_define_singleton_method(rb_cRegexp, "try_convert", rb_reg_s_try_convert, 1);
+#if 0
+ rb_define_singleton_method(rb_cRegexp, "preprocess", rb_reg_preprocess_m, 1);
+#endif
+
rb_define_method(rb_cRegexp, "initialize", rb_reg_initialize_m, -1);
rb_define_method(rb_cRegexp, "initialize_copy", rb_reg_init_copy, 1);
rb_define_method(rb_cRegexp, "hash", rb_reg_hash, 0);