summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2007-12-01 16:56:19 +0000
committer: akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2007-12-01 16:56:19 +0000
commit: 7ff702406a324ed12c69bc23a7cfaf066e401547 (patch)
tree: af1c1a9ee2d96bcc009c639271ef5031fd6a05dd
parent: d92b461dd9f5ff0eae7d15435aab1507dfb53496 (diff)
* include/ruby/intern.h (rb_uv_to_utf8): declared.

* re.c (rb_reg_preprocess): new function for dynamic regexp with
  \u{} such as Regexp.new("\\u{6666}").
  (rb_reg_prepare_re): preprocess regexp for recompiling.
  (read_escaped_byte): new function.
  (unescape_escaped_nonascii): new function.
  (append_utf8): new function.
  (unescape_unicode_list): new function.
  (unescape_unicode_bmp): new function.
  (unescape_nonascii): new function.
  (rb_reg_initialize): preprocess regexp.

* pack.c (rb_uv_to_utf8): renamed from uv_to_utf8.

* parse.y (STR_NEW3): take func instead of has8 and hasmb.
  (parser_str_new): use default coderange mechanism except for regexp.
  (parser_tokadd_utf8): copy regexp source as-is.
  (parser_read_escape): UTF-8 stuff removed.
  (parser_tokadd_escape): has8bit and hasmb removed.
  (parser_tokadd_string): fix 8-bit single byte character with \u.
  (parser_parse_string): has8bit and hasmb removed.
  (parser_here_document): has8bit and hasmb removed.
  (parser_yylex): call parser_tokadd_utf8 instead of read_escape for
  UTF-8 character.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@14072 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r-- ChangeLog 28
-rw-r--r-- include/ruby/intern.h 1
-rw-r--r-- pack.c 7
-rw-r--r-- parse.y 209
-rw-r--r-- re.c 446
-rw-r--r-- test/ruby/test_m17n.rb 27
-rw-r--r-- test/ruby/test_unicode_escape.rb 91
7 files changed, 635 insertions, 174 deletions
diff --git a/ChangeLog b/ChangeLog
index 84f9c1e43b..cce0ff4ccd 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,31 @@
+Sun Dec 2 01:39:51 2007 Tanaka Akira <akr@fsij.org>
+
+ * include/ruby/intern.h (rb_uv_to_utf8): declared.
+
+ * re.c (rb_reg_preprocess): new function for dynamic regexp with
+ \u{} such as Regexp.new("\\u{6666}").
+ (rb_reg_prepare_re): preprocess regexp for recompiling.
+ (read_escaped_byte): new function.
+ (unescape_escaped_nonascii): new function.
+ (append_utf8): new function.
+ (unescape_unicode_list): new function.
+ (unescape_unicode_bmp): new function.
+ (unescape_nonascii): new function.
+ (rb_reg_initialize): preprocess regexp.
+
+ * pack.c (rb_uv_to_utf8): renamed from uv_to_utf8.
+
+ * parse.y (STR_NEW3): take func instead of has8 and hasmb.
+ (parser_str_new): use default coderange mechanism except for regexp.
+ (parser_tokadd_utf8): copy regexp source as-is.
+ (parser_read_escape): UTF-8 stuff removed.
+ (parser_tokadd_escape): has8bit and hasmb removed.
+ (parser_tokadd_string): fix 8-bit single byte character with \u.
+ (parser_parse_string): has8bit and hasmb removed.
+ (parser_here_document): has8bit and hasmb removed.
+ (parser_yylex): call parser_tokadd_utf8 instead of read_escape for
+ UTF-8 character.
+
Wed Dec 2 01:00:07 2007 James Edward Gray II <jeg2@ruby-lang.org>
* lib/xmlrpc/server.rb (XMLRPC::Server#server): Improve signal handling so
diff --git a/include/ruby/intern.h b/include/ruby/intern.h
index a43d4d31ab..42f61a91bc 100644
--- a/include/ruby/intern.h
+++ b/include/ruby/intern.h
@@ -101,6 +101,7 @@ unsigned LONG_LONG rb_big2ull(VALUE);
#endif /* HAVE_LONG_LONG */
void rb_quad_pack(char*,VALUE);
VALUE rb_quad_unpack(const char*,int);
+int rb_uv_to_utf8(char[6],unsigned long);
VALUE rb_dbl2big(double);
double rb_big2dbl(VALUE);
VALUE rb_big_cmp(VALUE, VALUE);
diff --git a/pack.c b/pack.c
index 4cab476bd2..b1d7268850 100644
--- a/pack.c
+++ b/pack.c
@@ -365,7 +365,6 @@ static const char toofew[] = "too few arguments";
static void encodes(VALUE,const char*,long,int);
static void qpencode(VALUE,VALUE,long);
-static int uv_to_utf8(char*,unsigned long);
static unsigned long utf8_to_uv(const char*,long*);
/*
@@ -872,7 +871,7 @@ pack_pack(VALUE ary, VALUE fmt)
if (l < 0) {
rb_raise(rb_eRangeError, "pack(U): value out of range");
}
- le = uv_to_utf8(buf, l);
+ le = rb_uv_to_utf8(buf, l);
rb_str_buf_cat(res, (char*)buf, le);
}
break;
@@ -1991,8 +1990,8 @@ pack_unpack(VALUE str, VALUE fmt)
#define BYTEWIDTH 8
-static int
-uv_to_utf8(char *buf, unsigned long uv)
+int
+rb_uv_to_utf8(char buf[6], unsigned long uv)
{
if (uv <= 0x7f) {
buf[0] = (char)uv;
diff --git a/parse.y b/parse.y
index 7f5af4e4e6..264a3d2d63 100644
--- a/parse.y
+++ b/parse.y
@@ -269,7 +269,7 @@ struct parser_params {
#define STR_NEW(p,n) rb_enc_str_new((p),(n),parser->enc)
#define STR_NEW0() rb_str_new(0,0)
#define STR_NEW2(p) rb_enc_str_new((p),strlen(p),parser->enc)
-#define STR_NEW3(p,n,e,has8,hasmb) parser_str_new2((p),(n),(e),(has8),(hasmb))
+#define STR_NEW3(p,n,e,func) parser_str_new((p),(n),(e),(func))
#define STR_ENC(m) ((m)?parser->enc:rb_enc_from_index(0))
#define ENC_SINGLE(cr) ((cr)==ENC_CODERANGE_7BIT)
#define TOK_INTERN(mb) rb_intern3(tok(), toklen(), STR_ENC(mb))
@@ -4488,7 +4488,7 @@ none : /* none */
# define yylval (*((YYSTYPE*)(parser->parser_yylval)))
static int parser_regx_options(struct parser_params*);
-static int parser_tokadd_string(struct parser_params*,int,int,int,long*,int*,int*,rb_encoding**);
+static int parser_tokadd_string(struct parser_params*,int,int,int,long*,rb_encoding**);
static void parser_tokaddmbc(struct parser_params *parser, int c, rb_encoding *enc);
static int parser_parse_string(struct parser_params*,NODE*);
static int parser_here_document(struct parser_params*,NODE*);
@@ -4500,11 +4500,10 @@ static int parser_here_document(struct parser_params*,NODE*);
# define tokspace(n) parser_tokspace(parser, n)
# define tokadd(c) parser_tokadd(parser, c)
# define tok_hex(numlen) parser_tok_hex(parser, numlen)
-# define tok_utf8(numlen,e) parser_tok_utf8(parser, numlen, e)
-# define read_escape(flags,has8,hasmb,e) parser_read_escape(parser, flags, has8, hasmb, e)
-# define tokadd_escape(t,has8,hasmb,e) parser_tokadd_escape(parser, t, has8,hasmb, e)
+# define read_escape(flags,e) parser_read_escape(parser, flags, e)
+# define tokadd_escape(t,e) parser_tokadd_escape(parser, t, e)
# define regx_options() parser_regx_options(parser)
-# define tokadd_string(f,t,p,n,has8bit,hasmb,e) parser_tokadd_string(parser,f,t,p,n,has8bit,hasmb,e)
+# define tokadd_string(f,t,p,n,e) parser_tokadd_string(parser,f,t,p,n,e)
# define parse_string(n) parser_parse_string(parser,n)
# define tokaddmbc(c, enc) parser_tokaddmbc(parser, c, enc)
# define here_document(n) parser_here_document(parser,n)
@@ -4821,35 +4820,37 @@ rb_parser_compile_file(volatile VALUE vparser, const char *f, VALUE file, int st
}
#endif /* !RIPPER */
-static VALUE
-parser_str_new(const char *p, long n, rb_encoding *enc, int coderange)
-{
- VALUE str = rb_enc_str_new(p, n, enc);
- ENC_CODERANGE_SET(str, coderange);
- return str;
-}
+#define STR_FUNC_ESCAPE 0x01
+#define STR_FUNC_EXPAND 0x02
+#define STR_FUNC_REGEXP 0x04
+#define STR_FUNC_QWORDS 0x08
+#define STR_FUNC_SYMBOL 0x10
+#define STR_FUNC_INDENT 0x20
+
+enum string_type {
+ str_squote = (0),
+ str_dquote = (STR_FUNC_EXPAND),
+ str_xquote = (STR_FUNC_EXPAND),
+ str_regexp = (STR_FUNC_REGEXP|STR_FUNC_ESCAPE|STR_FUNC_EXPAND),
+ str_sword = (STR_FUNC_QWORDS),
+ str_dword = (STR_FUNC_QWORDS|STR_FUNC_EXPAND),
+ str_ssym = (STR_FUNC_SYMBOL),
+ str_dsym = (STR_FUNC_SYMBOL|STR_FUNC_EXPAND),
+};
static VALUE
-parser_str_new2(const char *p, long n, rb_encoding *enc, int has8bit,int hasmb)
+parser_str_new(const char *p, long n, rb_encoding *enc, int func)
{
- /*
- * Set coderange bit flags based on the presence of 8-bit and
- * multi-byte characters in the string
- */
- int coderange = ENC_CODERANGE_7BIT;
- if (hasmb) coderange = ENC_CODERANGE_8BIT;
- else if (has8bit) coderange = ENC_CODERANGE_UNKNOWN;
+ VALUE str;
- /*
- * If it is all single byte characters with the 8th bit clear,
- * and if the specified encoding is ASCII-compatible, then this
- * string is in the ASCII subset, and we just use the ASCII encoding
- * instead.
- */
- if ((coderange == ENC_CODERANGE_7BIT) && rb_enc_asciicompat(enc))
- enc = rb_default_encoding();
+ str = rb_enc_str_new(p, n, enc);
+ if (!(func & STR_FUNC_REGEXP) &&
+ rb_enc_asciicompat(enc) &&
+ rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
+ rb_enc_associate(str, rb_default_encoding());
+ }
- return parser_str_new(p, n, enc, coderange);
+ return str;
}
static inline int
@@ -4979,9 +4980,11 @@ parser_tok_hex(struct parser_params *parser, int *numlen)
return c;
}
+#define tokcopy(n) memcpy(tokspace(n), lex_p - (n), (n))
+
static int
-parser_tokadd_utf8(struct parser_params *parser, int *hasmb,
- rb_encoding **encp, int string_literal, int symbol_literal)
+parser_tokadd_utf8(struct parser_params *parser, rb_encoding **encp,
+ int string_literal, int symbol_literal, int regexp_literal)
{
/*
* If string_literal is true, then we allow multiple codepoints
@@ -4993,8 +4996,11 @@ parser_tokadd_utf8(struct parser_params *parser, int *hasmb,
int codepoint;
int numlen;
+ if (regexp_literal) { tokadd('\\'); tokadd('u'); }
+
if (peek('{')) { /* handle \u{...} form */
do {
+ if (regexp_literal) { tokadd(*lex_p); }
nextc();
codepoint = scan_hex(lex_p, 6, &numlen);
if (numlen == 0) {
@@ -5006,8 +5012,10 @@ parser_tokadd_utf8(struct parser_params *parser, int *hasmb,
return 0;
}
lex_p += numlen;
- if (codepoint >= 0x80) {
- *hasmb = 1;
+ if (regexp_literal) {
+ tokcopy(numlen);
+ }
+ else if (codepoint >= 0x80) {
*encp = UTF8_ENC();
if (string_literal) tokaddmbc(codepoint, *encp);
}
@@ -5026,6 +5034,7 @@ parser_tokadd_utf8(struct parser_params *parser, int *hasmb,
return 0;
}
+ if (regexp_literal) { tokadd('}'); }
nextc();
}
else { /* handle \uxxxx form */
@@ -5035,8 +5044,10 @@ parser_tokadd_utf8(struct parser_params *parser, int *hasmb,
return 0;
}
lex_p += 4;
- if (codepoint >= 0x80) {
- *hasmb = 1;
+ if (regexp_literal) {
+ tokcopy(4);
+ }
+ else if (codepoint >= 0x80) {
*encp = UTF8_ENC();
if (string_literal) tokaddmbc(codepoint, *encp);
}
@@ -5058,7 +5069,7 @@ parser_tokadd_utf8(struct parser_params *parser, int *hasmb,
static int
parser_read_escape(struct parser_params *parser, int flags,
- int *has8bit, int *hasmb, rb_encoding **encp)
+ rb_encoding **encp)
{
int c;
int numlen;
@@ -5098,19 +5109,12 @@ parser_read_escape(struct parser_params *parser, int flags,
c = scan_oct(lex_p, 3, &numlen);
lex_p += numlen;
}
- if (c >= 0200) *has8bit = 1;
return c;
case 'x': /* hex constant */
if (flags & (ESCAPE_CONTROL|ESCAPE_META)) goto eof;
c = tok_hex(&numlen);
if (numlen == 0) return 0;
- if (c >= 0x80) *has8bit = 1;
- return c;
-
- case 'u': /* unicode constant: here only for char literal */
- if (flags & (ESCAPE_CONTROL|ESCAPE_META)) goto eof;
- c = parser_tokadd_utf8(parser, hasmb, encp, 0, 0);
return c;
case 'b': /* backspace */
@@ -5126,13 +5130,10 @@ parser_read_escape(struct parser_params *parser, int flags,
goto eof;
}
if ((c = nextc()) == '\\') {
- int tmp;
- *has8bit = 1;
- return read_escape(flags|ESCAPE_META, &tmp, &tmp, encp) | 0x80;
+ return read_escape(flags|ESCAPE_META, encp) | 0x80;
}
else if (c == -1 || !ISASCII(c)) goto eof;
else {
- *has8bit = 1;
return ((c & 0xff) | 0x80);
}
@@ -5144,8 +5145,7 @@ parser_read_escape(struct parser_params *parser, int flags,
case 'c':
if (flags & ESCAPE_CONTROL) goto eof;
if ((c = nextc())== '\\') {
- int tmp;
- c = read_escape(flags|ESCAPE_CONTROL, has8bit, &tmp, encp);
+ c = read_escape(flags|ESCAPE_CONTROL, encp);
}
else if (c == '?')
return 0177;
@@ -5162,8 +5162,6 @@ parser_read_escape(struct parser_params *parser, int flags,
}
}
-#define tokcopy(n) memcpy(tokspace(n), lex_p - (n), (n))
-
static void
parser_tokaddmbc(struct parser_params *parser, int c, rb_encoding *enc)
{
@@ -5173,7 +5171,7 @@ parser_tokaddmbc(struct parser_params *parser, int c, rb_encoding *enc)
static int
parser_tokadd_escape(struct parser_params *parser, int term,
- int *has8bit, int *hasmb, rb_encoding **encp)
+ rb_encoding **encp)
{
int c;
int flags = 0;
@@ -5194,7 +5192,6 @@ parser_tokadd_escape(struct parser_params *parser, int term,
if (numlen == 0) goto eof;
lex_p += numlen;
tokcopy(numlen + 1);
- if (oct >= 0200) *has8bit = 1;
}
return 0;
@@ -5207,7 +5204,6 @@ parser_tokadd_escape(struct parser_params *parser, int term,
hex = tok_hex(&numlen);
if (numlen == 0) goto eof;
tokcopy(numlen + 2);
- if (hex >= 0x80) *has8bit = 1;
}
return 0;
@@ -5218,7 +5214,6 @@ parser_tokadd_escape(struct parser_params *parser, int term,
goto eof;
}
tokcopy(3);
- *has8bit = 1;
flags |= ESCAPE_META;
goto escaped;
@@ -5287,24 +5282,6 @@ parser_regx_options(struct parser_params *parser)
return options | RE_OPTION_ENCODING(kcode);
}
-#define STR_FUNC_ESCAPE 0x01
-#define STR_FUNC_EXPAND 0x02
-#define STR_FUNC_REGEXP 0x04
-#define STR_FUNC_QWORDS 0x08
-#define STR_FUNC_SYMBOL 0x10
-#define STR_FUNC_INDENT 0x20
-
-enum string_type {
- str_squote = (0),
- str_dquote = (STR_FUNC_EXPAND),
- str_xquote = (STR_FUNC_EXPAND),
- str_regexp = (STR_FUNC_REGEXP|STR_FUNC_ESCAPE|STR_FUNC_EXPAND),
- str_sword = (STR_FUNC_QWORDS),
- str_dword = (STR_FUNC_QWORDS|STR_FUNC_EXPAND),
- str_ssym = (STR_FUNC_SYMBOL),
- str_dsym = (STR_FUNC_SYMBOL|STR_FUNC_EXPAND),
-};
-
static void
dispose_string(VALUE str)
{
@@ -5328,10 +5305,10 @@ parser_tokadd_mbchar(struct parser_params *parser, int c)
static int
parser_tokadd_string(struct parser_params *parser,
int func, int term, int paren, long *nest,
- int *has8bit, int *hasmb, rb_encoding **encp)
+ rb_encoding **encp)
{
int c;
- int has_mb = 0;
+ int has_nonascii = 0;
rb_encoding *enc = *encp;
char *errbuf = 0;
static const char mixed_msg[] = "%s mixed within %s source";
@@ -5390,9 +5367,10 @@ parser_tokadd_string(struct parser_params *parser,
tokadd('\\');
break;
}
- parser_tokadd_utf8(parser, hasmb, &enc, 1,
- func & STR_FUNC_SYMBOL);
- if (has_mb && enc != *encp) {
+ parser_tokadd_utf8(parser, &enc, 1,
+ func & STR_FUNC_SYMBOL,
+ func & STR_FUNC_REGEXP);
+ if (has_nonascii && enc != *encp) {
mixed_escape(beg, enc, *encp);
}
continue;
@@ -5400,28 +5378,17 @@ parser_tokadd_string(struct parser_params *parser,
default:
if (func & STR_FUNC_REGEXP) {
pushback(c);
- if ((c = tokadd_escape(term, has8bit, hasmb, &enc)) < 0)
+ if ((c = tokadd_escape(term, &enc)) < 0)
return -1;
- if (has_mb && enc != *encp) {
+ if (has_nonascii && enc != *encp) {
mixed_escape(beg, enc, *encp);
}
continue;
}
else if (func & STR_FUNC_EXPAND) {
- int tmb = 0;
pushback(c);
if (func & STR_FUNC_ESCAPE) tokadd('\\');
- c = read_escape(0, has8bit, &tmb, &enc);
- if (tmb) {
- *hasmb = tmb;
- if (has_mb && enc != *encp) {
- mixed_escape(beg, enc, *encp);
- }
- else {
- tokaddmbc(c, enc);
- }
- continue;
- }
+ c = read_escape(0, &enc);
}
else if ((func & STR_FUNC_QWORDS) && ISSPACE(c)) {
/* ignore backslashed spaces in %w */
@@ -5432,13 +5399,12 @@ parser_tokadd_string(struct parser_params *parser,
}
}
else if (parser_ismbchar()) {
- has_mb = 1;
+ has_nonascii = 1;
if (enc != *encp) {
mixed_error(enc, *encp);
continue;
}
tokadd_mbchar(c);
- if (hasmb) *hasmb = 1;
continue;
}
else if ((func & STR_FUNC_QWORDS) && ISSPACE(c)) {
@@ -5450,6 +5416,13 @@ parser_tokadd_string(struct parser_params *parser,
compile_error(PARSER_ARG "symbol cannot contain '\\0'");
continue;
}
+ if (c & 0x80) {
+ has_nonascii = 1;
+ if (enc != *encp) {
+ mixed_error(enc, *encp);
+ continue;
+ }
+ }
tokadd(c);
}
*encp = enc;
@@ -5465,7 +5438,7 @@ parser_parse_string(struct parser_params *parser, NODE *quote)
int func = quote->nd_func;
int term = nd_term(quote);
int paren = nd_paren(quote);
- int c, space = 0, has8bit=0, hasmb=0;
+ int c, space = 0;
rb_encoding *enc = parser->enc;
if (func == -1) return tSTRING_END;
@@ -5501,7 +5474,7 @@ parser_parse_string(struct parser_params *parser, NODE *quote)
}
pushback(c);
if (tokadd_string(func, term, paren, &quote->nd_nest,
- &has8bit, &hasmb, &enc) == -1) {
+ &enc) == -1) {
ruby_sourceline = nd_line(quote);
if (func & STR_FUNC_REGEXP) {
compile_error(PARSER_ARG "unterminated regexp meets end of file");
@@ -5514,7 +5487,7 @@ parser_parse_string(struct parser_params *parser, NODE *quote)
}
tokfix();
- set_yylval_str(STR_NEW3(tok(), toklen(), enc, has8bit, hasmb));
+ set_yylval_str(STR_NEW3(tok(), toklen(), enc, func));
return tSTRING_CONTENT;
}
@@ -5678,7 +5651,6 @@ parser_here_document(struct parser_params *parser, NODE *here)
}
else {
/* int mb = ENC_CODERANGE_7BIT, *mbp = &mb;*/
- int has8bit=0, hasmb=0;
rb_encoding *enc = parser->enc;
newtok();
if (c == '#') {
@@ -5695,16 +5667,16 @@ parser_here_document(struct parser_params *parser, NODE *here)
do {
pushback(c);
if ((c = tokadd_string(func, '\n', 0, NULL,
- &has8bit, &hasmb, &enc)) == -1) goto error;
+ &enc)) == -1) goto error;
if (c != '\n') {
- set_yylval_str(STR_NEW3(tok(), toklen(), enc, has8bit,hasmb));
+ set_yylval_str(STR_NEW3(tok(), toklen(), enc, func));
return tSTRING_CONTENT;
}
tokadd(nextc());
/* if (mbp && mb == ENC_CODERANGE_UNKNOWN) mbp = 0;*/
if ((c = nextc()) == -1) goto error;
} while (!whole_match_p(eos, len, indent));
- str = STR_NEW3(tok(), toklen(), enc, has8bit,hasmb);
+ str = STR_NEW3(tok(), toklen(), enc, func);
}
heredoc_restore(lex_strterm);
lex_strterm = NEW_STRTERM(-1, 0, 0);
@@ -5966,7 +5938,6 @@ parser_yylex(struct parser_params *parser)
int cmd_state;
enum lex_state_e last_state;
rb_encoding *enc;
- int has8bit = 0, hasmb = 0;
int mb;
#ifdef RIPPER
int fallthru = Qfalse;
@@ -6317,26 +6288,33 @@ parser_yylex(struct parser_params *parser)
newtok();
enc = parser->enc;
if (parser_ismbchar()) {
- hasmb = 1;
tokadd_mbchar(c);
}
else if ((rb_enc_isalnum(c, parser->enc) || c == '_') &&
lex_p < lex_pend && is_identchar(lex_p, lex_pend, parser->enc)) {
goto ternary;
}
- else if (c == '\\' && (c = read_escape(0, &has8bit, &hasmb, &enc)) >= 0x80) {
- if (hasmb) {
- tokaddmbc(c, enc);
- }
- else {
- tokadd(c);
- }
- }
- else {
+ else if (c == '\\') {
+ if (peek('u')) {
+ nextc();
+ c = parser_tokadd_utf8(parser, &enc, 0, 0, 0);
+ if (0x80 <= c) {
+ tokaddmbc(c, enc);
+ }
+ else {
+ tokadd(c);
+ }
+ }
+ else {
+ c = read_escape(0, &enc);
+ tokadd(c);
+ }
+ }
+ else {
tokadd(c);
- }
+ }
tokfix();
- set_yylval_str(STR_NEW3(tok(), toklen(), enc, has8bit, hasmb));
+ set_yylval_str(STR_NEW3(tok(), toklen(), enc, 0));
lex_state = EXPR_ENDARG;
return tCHAR;
@@ -8481,7 +8459,6 @@ reg_compile_gen(struct parser_params* parser, VALUE str, int options)
compile_error(PARSER_ARG "%s", RSTRING_PTR(re));
return Qnil;
}
- if (str) rb_enc_copy(re, str);
return re;
}
diff --git a/re.c b/re.c
index 398e748f3a..78cfd018f5 100644
--- a/re.c
+++ b/re.c
@@ -12,6 +12,7 @@
#include "ruby/ruby.h"
#include "ruby/re.h"
#include "ruby/encoding.h"
+#include "ruby/util.h"
#include "regint.h"
#include <ctype.h>
@@ -715,6 +716,10 @@ rb_reg_fixed_encoding_p(VALUE re)
return Qfalse;
}
+static VALUE
+rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
+ rb_encoding **fixed_enc, onig_errmsg_buffer err);
+
static void
rb_reg_prepare_re(VALUE re, VALUE str)
{
@@ -740,13 +745,19 @@ rb_reg_prepare_re(VALUE re, VALUE str)
OnigErrorInfo einfo;
regex_t *reg, *reg2;
UChar *pattern;
+ VALUE unescaped;
+ rb_encoding *fixed_enc = 0;
rb_reg_check(re);
reg = RREGEXP(re)->ptr;
pattern = ((UChar*)RREGEXP(re)->str);
- r = onig_new(&reg2, (UChar* )pattern,
- (UChar* )(pattern + RREGEXP(re)->len),
+ unescaped = rb_reg_preprocess(
+ RREGEXP(re)->str, RREGEXP(re)->str + RREGEXP(re)->len, enc,
+ &fixed_enc, err);
+
+ r = onig_new(&reg2, (UChar* )RSTRING_PTR(unescaped),
+ (UChar* )(RSTRING_PTR(unescaped) + RSTRING_LEN(unescaped)),
reg->options, enc,
OnigDefaultSyntax, &einfo);
if (r) {
@@ -756,6 +767,7 @@ rb_reg_prepare_re(VALUE re, VALUE str)
RREGEXP(re)->ptr = reg2;
onig_free(reg);
+ RB_GC_GUARD(unescaped);
}
}
@@ -1236,12 +1248,407 @@ match_inspect(VALUE match)
VALUE rb_cRegexp;
+/*
+ * Decode one backslash escape that denotes a single byte.
+ *
+ * *pp must point at the backslash.  Recognized forms are the simple
+ * escapes (\\ \n \t \r \f \v \a \e), octal (up to three digits),
+ * hex (\xH or \xHH) and the prefixes \M-, \C- and \c.  A prefix may be
+ * followed by a plain 7-bit character or by another escape, but each
+ * kind of prefix may appear only once.
+ *
+ * A control prefix masks the value with 0x1f; a meta prefix sets bit 0x80.
+ *
+ * Returns the byte value (0..0xff) and advances *pp past the escape.
+ * On failure a message is copied into err, -1 is returned and *pp is
+ * left unchanged.
+ */
static int
+read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err)
+{
+    const char *p = *pp;
+    int code;
+    int meta_prefix = 0, ctrl_prefix = 0;
+    int len;
+
+    if (p == end || *p++ != '\\') {
+        strcpy(err, "too short escaped multibyte character");
+        return -1;
+    }
+
+again:
+    if (p == end) {
+        strcpy(err, "too short escape sequence");
+        return -1;
+    }
+    switch (*p++) {
+      case '\\': code = '\\'; break;
+      case 'n': code = '\n'; break;
+      case 't': code = '\t'; break;
+      case 'r': code = '\r'; break;
+      case 'f': code = '\f'; break;
+      case 'v': code = '\013'; break;
+      case 'a': code = '\007'; break;
+      case 'e': code = '\033'; break;
+
+      /* \OOO */
+      case '0': case '1': case '2': case '3':
+      case '4': case '5': case '6': case '7':
+        p--;    /* the digit just consumed is part of the number */
+        code = ruby_scan_oct(p, end < p+3 ? end-p : 3, &len);
+        p += len;
+        break;
+
+      case 'x': /* \xHH */
+        code = ruby_scan_hex(p, end < p+2 ? end-p : 2, &len);
+        if (len < 1) {
+            strcpy(err, "invalid hex escape");
+            return -1;
+        }
+        p += len;
+        break;
+
+      case 'M': /* \M-X, \M-\C-X, \M-\cX */
+        if (meta_prefix) {
+            strcpy(err, "duplicate meta escape");
+            return -1;
+        }
+        meta_prefix = 1;
+        if (p+1 < end && *p++ == '-' && (*p & 0x80) == 0) {
+            if (*p == '\\') {
+                /* the prefix applies to a nested escape */
+                p++;
+                goto again;
+            }
+            else {
+                code = *p++;
+                break;
+            }
+        }
+        strcpy(err, "too short meta escape");
+        return -1;
+
+      case 'C': /* \C-X, \C-\M-X */
+        if (p == end || *p++ != '-') {
+            strcpy(err, "too short control escape");
+            return -1;
+        }
+        /* fall through: after the '-', \C- is handled exactly like \c */
+      case 'c': /* \cX, \c\M-X */
+        if (ctrl_prefix) {
+            strcpy(err, "duplicate control escape");
+            return -1;
+        }
+        ctrl_prefix = 1;
+        if (p < end && (*p & 0x80) == 0) {
+            if (*p == '\\') {
+                /* the prefix applies to a nested escape */
+                p++;
+                goto again;
+            }
+            else {
+                code = *p++;
+                break;
+            }
+        }
+        strcpy(err, "too short control escape");
+        return -1;
+
+      default:
+        strcpy(err, "unexpected escape sequence");
+        return -1;
+    }
+    /* a three digit octal escape can exceed one byte */
+    if (code < 0 || 0xff < code) {
+        strcpy(err, "invalid escape code");
+        return -1;
+    }
+
+    if (ctrl_prefix)
+        code &= 0x1f;
+    if (meta_prefix)
+        code |= 0x80;
+
+    *pp = p;
+    return code;
+}
+
+/*
+ * Decode a run of byte escapes, starting at *pp, that together spell one
+ * character of encoding enc, and append the result to buf.
+ *
+ * A non-ASCII character is appended as raw bytes and fixes *encp to enc
+ * (it is an error if *encp is already set to another encoding).  A single
+ * 7-bit byte is written back in the form \xHH.
+ *
+ * Returns 0 and advances *pp on success; returns -1 with a message in err
+ * on failure, leaving *pp unchanged.
+ */
+static int
+unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
+        VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
+{
+    const char *cur = *pp;
+    int maxlen = rb_enc_mbmaxlen(enc);
+    char *bytes = ALLOCA_N(char, maxlen);
+    int nbytes = 0;
+    int b;
+
+    memset(bytes, 0, maxlen);
+
+    /* collect escaped bytes until they form a complete character or the
+     * longest possible character of enc has been read */
+    do {
+        b = read_escaped_byte(&cur, end, err);
+        if (b == -1)
+            return -1;
+        bytes[nbytes++] = b;
+    } while (nbytes < maxlen && nbytes != mbclen(bytes, bytes+maxlen, enc));
+
+    if (nbytes != mbclen(bytes, bytes+maxlen, enc)) {
+        strcpy(err, "invalid multibyte escape");
+        return -1;
+    }
+
+    if (nbytes <= 1 && !(bytes[0] & 0x80)) {
+        /* plain 7-bit byte: keep it escaped */
+        char escbuf[5];
+        snprintf(escbuf, sizeof(escbuf), "\\x%02x", bytes[0]&0xff);
+        rb_str_buf_cat(buf, escbuf, 4);
+    }
+    else {
+        rb_str_buf_cat(buf, bytes, nbytes);
+
+        if (*encp == 0) {
+            *encp = enc;
+        }
+        else if (*encp != enc) {
+            strcpy(err, "character encodings differ");
+            return -1;
+        }
+    }
+
+    *pp = cur;
+    return 0;
+}
+
+/*
+ * Append the code point uv to buf.
+ *
+ * Values below 0x80 are written in the escaped form \xHH and leave *encp
+ * alone.  Larger values are appended as the bytes produced by
+ * rb_uv_to_utf8() and fix *encp to the "utf-8" encoding; it is an error
+ * if *encp is already set to a different encoding.
+ *
+ * Returns 0 on success, -1 with a message in err on failure.
+ */
+static int
+append_utf8(unsigned long uv,
+        VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
+{
+    char utf8buf[6];
+    rb_encoding *utf8;
+    int len;
+
+    if (uv < 0x80) {
+        char escbuf[5];
+        snprintf(escbuf, sizeof(escbuf), "\\x%02x", (int)uv);
+        rb_str_buf_cat(buf, escbuf, 4);
+        return 0;
+    }
+
+    len = rb_uv_to_utf8(utf8buf, uv);
+    rb_str_buf_cat(buf, utf8buf, len);
+
+    utf8 = rb_enc_find("utf-8");
+    if (*encp == 0) {
+        *encp = utf8;
+    }
+    else if (*encp != utf8) {
+        strcpy(err, "character encodings differ");
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Decode the inside of a \u{...} escape: one or more hexadecimal code
+ * points separated by whitespace.  *pp points just after the '{'.
+ *
+ * Each code point may have at most six hex digits and must not exceed
+ * 0x10ffff; each one is passed to append_utf8().  At least one code
+ * point is required.
+ *
+ * On success returns 0 with *pp advanced to the first character that is
+ * neither whitespace nor a hex digit (the caller checks for '}').
+ * On failure returns -1 with a message in err; *pp is left unchanged.
+ */
+static int
+unescape_unicode_list(const char **pp, const char *end,
+        VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
+{
+    const char *cur = *pp;
+    unsigned long code;
+    int ndigits;
+    int seen = 0;
+
+    for (;;) {
+        while (cur < end && ISSPACE(*cur))
+            cur++;
+
+        code = ruby_scan_hex(cur, end-cur, &ndigits);
+        if (ndigits == 0)
+            break;
+        if (6 < ndigits || 0x10ffff < code) { /* max 10FFFF */
+            strcpy(err, "invalid unicode range");
+            return -1;
+        }
+        cur += ndigits;
+
+        if (append_utf8(code, buf, encp, err) != 0)
+            return -1;
+        seen = 1;
+    }
+
+    if (!seen) {
+        strcpy(err, "invalid unicode list");
+        return -1;
+    }
+
+    *pp = cur;
+    return 0;
+}
+
+/*
+ * Decode the four hex digits of a \uHHHH escape.  *pp points just after
+ * the 'u'.
+ *
+ * Exactly four hexadecimal digits are required; the resulting code point
+ * is passed to append_utf8().
+ *
+ * Returns 0 and advances *pp by four on success; returns -1 with a
+ * message in err on failure, leaving *pp unchanged.
+ */
+static int
+unescape_unicode_bmp(const char **pp, const char *end,
+        VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
+{
+    const char *start = *pp;
+    unsigned long code;
+    int ndigits;
+
+    if (end < start+4)
+        goto invalid;
+
+    code = ruby_scan_hex(start, 4, &ndigits);
+    if (ndigits != 4)
+        goto invalid;
+
+    if (append_utf8(code, buf, encp, err) != 0)
+        return -1;
+
+    *pp = start + 4;
+    return 0;
+
+invalid:
+    strcpy(err, "invalid unicode escape");
+    return -1;
+}
+
+static int
+unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
+ VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
+{ /* Copy the regexp source [p, end) into buf, resolving the escapes that can
+ * denote a non-ASCII character.
+ *
+ * - A raw non-ASCII character is copied unchanged and fixes *encp to enc.
+ * - \1..\7 whose octal value is at most 0177 are copied as is.
+ * - Other octal escapes, \0, \x, \c, \C and \M are handed to
+ *   unescape_escaped_nonascii().
+ * - \u{...} and \uHHHH are handed to unescape_unicode_list() and
+ *   unescape_unicode_bmp().
+ * - Every other escape and every other ASCII byte is copied as is.
+ *
+ * Returns 0 on success; on failure returns -1 with a message in err.
+ * A raw non-ASCII character while *encp is already set to an encoding
+ * other than enc is reported as an error.
+ */
+ char c;
+ char smallbuf[2];
+
+ while (p < end) {
+ int chlen = mbclen(p, end, enc);
+ if (1 < chlen || (*p & 0x80)) { /* raw non-ASCII character */
+ if (end < p + chlen) {
+ strcpy(err, "too short multibyte character");
+ return -1;
+ }
+ /* xxx: validate the non-ascii character */
+ rb_str_buf_cat(buf, p, chlen);
+ p += chlen;
+ if (*encp == 0)
+ *encp = enc;
+ else if (*encp != enc) {
+ strcpy(err, "character encodings differ");
+ return -1;
+ }
+ continue;
+ }
+
+ switch (c = *p++) {
+ case '\\':
+ if (p == end) {
+ strcpy(err, "too short escape sequence");
+ return -1;
+ }
+ switch (c = *p++) {
+ case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */
+ {
+ int octlen;
+ if (ruby_scan_oct(p-1, end-(p-1), &octlen) <= 0177) {
+ /* backref or 7bit octal.
+ no need to unescape anyway.
+ re-escaping may break backref */
+ goto escape_asis;
+ }
+ } /* value above 0177: fall through and decode it as a byte escape */
+ /* xxx: How about more than 199 subexpressions? */
+
+ case '0': /* \0, \0O, \0OO */
+
+ case 'x': /* \xHH */
+ case 'c': /* \cX, \c\M-X */
+ case 'C': /* \C-X, \C-\M-X */
+ case 'M': /* \M-X, \M-\C-X, \M-\cX */
+ p = p-2; /* back to the backslash, where the helper starts reading */
+ if (unescape_escaped_nonascii(&p, end, enc, buf, encp, err) != 0)
+ return -1;
+ break;
+
+ case 'u':
+ if (p == end) {
+ strcpy(err, "too short escape sequence");
+ return -1;
+ }
+ if (*p == '{') {
+ /* \u{H HH HHH HHHH HHHHH HHHHHH ...} */
+ p++; /* skip the '{' */
+ if (unescape_unicode_list(&p, end, buf, encp, err) != 0)
+ return -1;
+ if (p == end || *p++ != '}') {
+ strcpy(err, "invalid unicode list");
+ return -1;
+ }
+ break;
+ }
+ else {
+ /* \uHHHH */
+ if (unescape_unicode_bmp(&p, end, buf, encp, err) != 0)
+ return -1;
+ break;
+ }
+
+ default: /* \n, \\, \d, \9, etc. */
+escape_asis:
+ smallbuf[0] = '\\';
+ smallbuf[1] = c;
+ rb_str_buf_cat(buf, smallbuf, 2);
+ break;
+ }
+ break;
+
+ default:
+ rb_str_buf_cat(buf, &c, 1); /* plain ASCII byte outside any escape */
+ break;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Build the pattern text that is handed to the regexp compiler from the
+ * regexp source [p, end), which is written in encoding enc.
+ *
+ * The work is done by unescape_nonascii().  *fixed_enc is reset to 0
+ * first; the unescaping sets it when the source contains a non-ASCII
+ * character, and it stays 0 otherwise.  The returned string is tagged
+ * with *fixed_enc only when one was found.
+ *
+ * Returns the new string, or Qnil with a message in err on failure.
+ */
+static VALUE
+rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
+        rb_encoding **fixed_enc, onig_errmsg_buffer err)
+{
+    VALUE buf;
+
+    buf = rb_str_buf_new(0);
+
+    *fixed_enc = 0;
+    if (unescape_nonascii(p, end, enc, buf, fixed_enc, err) != 0)
+        return Qnil;
+
+    /* Test the encoding that was found, not the out-parameter itself:
+     * fixed_enc was just written through and is never null here, so
+     * checking the pointer would tag buf even with a null encoding. */
+    if (*fixed_enc) {
+        rb_enc_associate(buf, *fixed_enc);
+    }
+
+    return buf;
+}
+
+#if 0 /* disabled: wrappers behind the (also disabled) Regexp.preprocess method */
+static VALUE
+rb_reg_preprocess_obj(VALUE str,
+ rb_encoding **fixed_enc, onig_errmsg_buffer err)
+{ /* Run rb_reg_preprocess() over the bytes of the string object str,
+ * using the encoding reported by rb_enc_get(str).
+ */
+ VALUE buf;
+ char *p, *end;
+ rb_encoding *enc;
+
+ StringValue(str);
+ p = RSTRING_PTR(str);
+ end = p + RSTRING_LEN(str);
+ enc = rb_enc_get(str);
+
+ buf = rb_reg_preprocess(p, end, enc, fixed_enc, err);
+ RB_GC_GUARD(str); /* p and end point into str, which must outlive the call above */
+ return buf;
+}
+
+static VALUE
+rb_reg_preprocess_m(VALUE klass, VALUE obj)
+{ /* Method body: returns the pair [preprocessed string, true/false], where
+ * the second element tells whether an encoding was fixed; raises
+ * ArgumentError with the message from err when preprocessing fails.
+ */
+ rb_encoding *fixed_enc = 0;
+ onig_errmsg_buffer err;
+ VALUE str = rb_reg_preprocess_obj(obj, &fixed_enc, err);
+ if (str == Qnil)
+ rb_raise(rb_eArgError, "%s", err);
+ return rb_assoc_new(str, fixed_enc ? Qtrue : Qfalse);
+}
+#endif
+
+static int
rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc,
int options, onig_errmsg_buffer err)
{
struct RRegexp *re = RREGEXP(obj);
- int raw8bit;
- long i;
+ VALUE unescaped;
+ rb_encoding *fixed_enc = 0;
if (!OBJ_TAINTED(obj) && rb_safe_level() >= 4)
rb_raise(rb_eSecurityError, "Insecure: can't modify regexp");
@@ -1253,33 +1660,38 @@ rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc,
re->ptr = 0;
re->str = 0;
- raw8bit = 0;
- for (i = 0; i < len; i++) {
- if (s[i] & 0x80) {
- raw8bit = 1;
- break;
- }
+ unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err);
+ if (unescaped == Qnil)
+ return -1;
+
+ if (fixed_enc && (options & ARG_ENCODING_FIXED) && fixed_enc != enc) {
+ strcpy(err, "character encodings differ");
+ return -1;
}
+ if (fixed_enc)
+ enc = fixed_enc;
+ else if (!(options & ARG_ENCODING_FIXED))
+ enc = rb_default_encoding();
+
rb_enc_associate((VALUE)re, enc);
- if (options & ARG_ENCODING_FIXED || raw8bit) {
+ if ((options & ARG_ENCODING_FIXED) || fixed_enc) {
re->basic.flags |= KCODE_FIXED;
}
- re->ptr = make_regexp(s, len, enc, options & ARG_REG_OPTION_MASK, err);
+ re->ptr = make_regexp(RSTRING_PTR(unescaped), RSTRING_LEN(unescaped), enc,
+ options & ARG_REG_OPTION_MASK, err);
if (!re->ptr) return -1;
re->str = ALLOC_N(char, len+1);
memcpy(re->str, s, len);
re->str[len] = '\0';
re->len = len;
+ RB_GC_GUARD(unescaped);
return 0;
}
static int
rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err)
{
- if (!rb_enc_str_asciionly_p(str)) {
- options |= ARG_ENCODING_FIXED;
- }
return rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), rb_enc_get(str),
options, err);
}
@@ -2183,6 +2595,10 @@ Init_Regexp(void)
rb_define_singleton_method(rb_cRegexp, "last_match", rb_reg_s_last_match, -1);
rb_define_singleton_method(rb_cRegexp, "try_convert", rb_reg_s_try_convert, 1);
+#if 0
+ rb_define_singleton_method(rb_cRegexp, "preprocess", rb_reg_preprocess_m, 1);
+#endif
+
rb_define_method(rb_cRegexp, "initialize", rb_reg_initialize_m, -1);
rb_define_method(rb_cRegexp, "initialize_copy", rb_reg_init_copy, 1);
rb_define_method(rb_cRegexp, "hash", rb_reg_hash, 0);
diff --git a/test/ruby/test_m17n.rb b/test/ruby/test_m17n.rb
index fd183967a3..bb0a8a5010 100644
--- a/test/ruby/test_m17n.rb
+++ b/test/ruby/test_m17n.rb
@@ -25,6 +25,17 @@ class TestM17N < Test::Unit::TestCase
assert_encoding("EUC-JP", eval(e(%{"\\x80"})).encoding)
end
+ def test_string_mixed_unicode
+ assert_raise(SyntaxError) { eval(a(%{"\xc0\xa0\\u{6666}"})) }
+ assert_raise(SyntaxError) { eval(e(%{"\xc0\xa0\\u{6666}"})) }
+ assert_raise(SyntaxError) { eval(s(%{"\xc0\xa0\\u{6666}"})) }
+ assert_nothing_raised { eval(u(%{"\xc0\xa0\\u{6666}"})) }
+ assert_raise(SyntaxError) { eval(a(%{"\\u{6666}\xc0\xa0"})) }
+ assert_raise(SyntaxError) { eval(e(%{"\\u{6666}\xc0\xa0"})) }
+ assert_raise(SyntaxError) { eval(s(%{"\\u{6666}\xc0\xa0"})) }
+ assert_nothing_raised { eval(u(%{"\\u{6666}\xc0\xa0"})) }
+ end
+
def test_regexp_too_short_multibyte_character
assert_raise(SyntaxError) { eval('/\xfe/e') }
assert_raise(SyntaxError) { eval('/\x8e/e') }
@@ -38,11 +49,12 @@ class TestM17N < Test::Unit::TestCase
assert_raise(SyntaxError) { eval('/\xfc\x80\x80\x80\x80/u') }
# raw 8bit
- #assert_raise(SyntaxError) { eval("/\xfe/e") }
- #assert_raise(SyntaxError) { eval("/\xc0/u") }
+ assert_raise(SyntaxError) { eval("/\xfe/e") }
+ assert_raise(SyntaxError) { eval("/\xc0/u") }
# invalid suffix
- #assert_raise(SyntaxError) { eval('/\xc0\xff/u') }
+ assert_raise(SyntaxError) { eval('/\xc0\xff/u') }
+ assert_raise(SyntaxError) { eval('/\xc0 /u') }
#assert_raise(SyntaxError) { eval('/\xc0\x20/u') }
end
@@ -94,6 +106,9 @@ class TestM17N < Test::Unit::TestCase
def test_regexp_generic
assert_regexp_generic_ascii(/a/)
assert_regexp_generic_ascii(Regexp.new(a("a")))
+ assert_regexp_generic_ascii(Regexp.new(e("a")))
+ assert_regexp_generic_ascii(Regexp.new(s("a")))
+ assert_regexp_generic_ascii(Regexp.new(u("a")))
[/a/, Regexp.new(a("a"))].each {|r|
assert_equal(0, r =~ a("a"))
@@ -112,7 +127,7 @@ class TestM17N < Test::Unit::TestCase
assert_regexp_fixed_ascii8bit(/\xc0\xa1/n)
assert_regexp_fixed_ascii8bit(eval(a(%{/\xc0\xa1/})))
assert_regexp_fixed_ascii8bit(eval(a(%{/\xc0\xa1/n})))
- # assert_regexp_fixed_ascii8bit(eval(a(%q{/\xc0\xa1/})))
+ assert_regexp_fixed_ascii8bit(eval(a(%q{/\xc0\xa1/})))
[/a/n].each {|r|
assert_equal(0, r =~ a("a"))
@@ -139,12 +154,11 @@ class TestM17N < Test::Unit::TestCase
def test_regexp_euc
assert_regexp_fixed_eucjp(/a/e)
- assert_regexp_fixed_eucjp(Regexp.new(e("a")))
assert_regexp_fixed_eucjp(/\xc0\xa1/e)
assert_regexp_fixed_eucjp(eval(e(%{/\xc0\xa1/})))
assert_regexp_fixed_eucjp(eval(e(%q{/\xc0\xa1/})))
- [/a/e, Regexp.new(e("a"))].each {|r|
+ [/a/e].each {|r|
assert_equal(0, r =~ a("a"))
assert_equal(0, r =~ e("a"))
assert_equal(0, r =~ s("a"))
@@ -169,7 +183,6 @@ class TestM17N < Test::Unit::TestCase
def test_regexp_sjis
assert_regexp_fixed_sjis(/a/s)
- assert_regexp_fixed_sjis(Regexp.new(s("a")))
assert_regexp_fixed_sjis(/\xc0\xa1/s)
assert_regexp_fixed_sjis(eval(s(%{/\xc0\xa1/})))
assert_regexp_fixed_sjis(eval(s(%q{/\xc0\xa1/})))
diff --git a/test/ruby/test_unicode_escape.rb b/test/ruby/test_unicode_escape.rb
index 46413cdcdb..a1800c66e6 100644
--- a/test/ruby/test_unicode_escape.rb
+++ b/test/ruby/test_unicode_escape.rb
@@ -68,47 +68,74 @@ EOS
def test_regexp
# Compare regexps to regexps
- assert_equal(/Yukihiro Matsumoto - 松本行弘/,
+ assert_not_equal(/Yukihiro Matsumoto - 松本行弘/,
/Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18/)
- assert_equal(/Yukihiro Matsumoto - 松本行弘/,
- /Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/)
- assert_equal(/Matz - まつもと ゆきひろ/,
+ assert_not_equal(/Yukihiro Matsumoto - 松本行弘/,
+ /Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/)
+ assert_not_equal(/Matz - まつもと ゆきひろ/,
/Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D/)
- assert_equal(/Aoyama Gakuin University - 青山学院大学/,
+ assert_not_equal(/Aoyama Gakuin University - 青山学院大学/,
/Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66/)
- assert_equal(/青山学院大学/, /\u9752\u5C71\u5B66\u9662\u5927\u5B66/)
- assert_equal(/Martin Dürst/, /Martin D\u00FCrst/)
- assert_equal(/ü/, /\u00FC/)
- assert_equal(/Martin Dürst/, /Martin D\u{FC}rst/)
- assert_equal(/ü/, /\u{FC}/)
- assert_equal(/ü/, %r{\u{FC}})
- assert_equal(/ü/i, %r{\u00FC}i)
+ assert_not_equal(/青山学院大学/, /\u9752\u5C71\u5B66\u9662\u5927\u5B66/)
+ assert_not_equal(/Martin Dürst/, /Martin D\u00FCrst/)
+ assert_not_equal(/ü/, /\u00FC/)
+ assert_not_equal(/Martin Dürst/, /Martin D\u{FC}rst/)
+ assert_not_equal(/ü/, /\u{FC}/)
+ assert_not_equal(/ü/, %r{\u{FC}})
+ assert_not_equal(/ü/i, %r{\u00FC}i)
+
+ assert_equal('Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18',
+ /Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18/.source)
+ assert_equal('Yukihiro Matsumoto - \u{677E 672C 884C 5F18}',
+ /Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/.source)
+ assert_equal('Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D',
+ /Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D/.source)
+ assert_equal('Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66',
+ /Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66/.source)
+ assert_equal('\u9752\u5C71\u5B66\u9662\u5927\u5B66',
+ /\u9752\u5C71\u5B66\u9662\u5927\u5B66/.source)
+ assert_equal('Martin D\u00FCrst', /Martin D\u00FCrst/.source)
+ assert_equal('\u00FC', /\u00FC/.source)
+ assert_equal('Martin D\u{FC}rst', /Martin D\u{FC}rst/.source)
+ assert_equal('\u{FC}', /\u{FC}/.source)
+ assert_equal('\u{FC}', %r{\u{FC}}.source)
+ assert_equal('\u00FC', %r{\u00FC}i.source)
# match strings to regexps
- assert_equal("Yukihiro Matsumoto - 松本行弘" =~ /Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18/, 0)
- assert_equal("Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18" =~ /Yukihiro Matsumoto - \u677E\u672C\u884C/, 0)
- assert_equal("Yukihiro Matsumoto - 松本行弘" =~ /Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/, 0)
- assert_equal(%Q{Yukihiro Matsumoto - \u{677E 672C 884C 5F18}} =~ /Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/, 0)
- assert_equal("Matz - まつもと ゆきひろ" =~ /Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D/, 0)
- assert_equal("Aoyama Gakuin University - 青山学院大学" =~ /Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66/, 0)
- assert_equal("青山学院大学" =~ /\u9752\u5C71\u5B66\u9662\u5927\u5B66/, 0)
- assert_equal("Martin Dürst" =~ /Martin D\u00FCrst/, 0)
- assert_equal("ü" =~ /\u00FC/, 0)
- assert_equal("Martin Dürst" =~ /Martin D\u{FC}rst/, 0)
- assert_equal("ü" =~ %r{\u{FC}}, 0)
- assert_equal("ü" =~ %r{\u00FC}i, 0)
+ assert_equal(0, "Yukihiro Matsumoto - 松本行弘" =~ /Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18/)
+ assert_equal(0, "Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18" =~ /Yukihiro Matsumoto - \u677E\u672C\u884C/)
+ assert_equal(0, "Yukihiro Matsumoto - 松本行弘" =~ /Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/)
+ assert_equal(0, %Q{Yukihiro Matsumoto - \u{677E 672C 884C 5F18}} =~ /Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/)
+ assert_equal(0, "Matz - まつもと ゆきひろ" =~ /Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D/)
+ assert_equal(0, "Aoyama Gakuin University - 青山学院大学" =~ /Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66/)
+ assert_equal(0, "青山学院大学" =~ /\u9752\u5C71\u5B66\u9662\u5927\u5B66/)
+ assert_equal(0, "Martin Dürst" =~ /Martin D\u00FCrst/)
+ assert_equal(0, "ü" =~ /\u00FC/)
+ assert_equal(0, "Martin Dürst" =~ /Martin D\u{FC}rst/)
+ assert_equal(0, "ü" =~ %r{\u{FC}})
+ assert_equal(0, "ü" =~ %r{\u00FC}i)
# Flip order of the two operands
- assert_equal(/Martin D\u00FCrst/ =~ "Martin Dürst", 0)
- assert_equal(/\u00FC/ =~ "testü", 4)
- assert_equal(/Martin D\u{FC}rst/ =~ "fooMartin Dürstbar", 3)
- assert_equal(%r{\u{FC}} =~ "fooübar", 3)
+ assert_equal(0, /Martin D\u00FCrst/ =~ "Martin Dürst")
+ assert_equal(4, /\u00FC/ =~ "testü")
+ assert_equal(3, /Martin D\u{FC}rst/ =~ "fooMartin Dürstbar")
+ assert_equal(3, %r{\u{FC}} =~ "fooübar")
# Put \u in strings, literal character in regexp
- assert_equal("Martin D\u00FCrst" =~ /Martin Dürst/, 0)
- assert_equal("test\u00FC" =~ /ü/, 4)
- assert_equal("fooMartin D\u{FC}rstbar" =~ /Martin Dürst/, 3)
- assert_equal(%Q{foo\u{FC}bar} =~ %r<ü>, 3)
+ assert_equal(0, "Martin D\u00FCrst" =~ /Martin Dürst/)
+ assert_equal(4, "test\u00FC" =~ /ü/)
+ assert_equal(3, "fooMartin D\u{FC}rstbar" =~ /Martin Dürst/)
+ assert_equal(3, %Q{foo\u{FC}bar} =~ %r<ü>)
+
+ assert_match(eval('/\u{2a}/'), "*")
+ assert_raise(SyntaxError) { eval('/\u{6666}/n') }
+ assert_raise(SyntaxError) { eval('/\u{6666}/e') }
+ assert_raise(SyntaxError) { eval('/\u{6666}/s') }
+ assert_nothing_raised { eval('/\u{6666}/u') }
+ end
+
+ def test_dynamic_regexp
+ assert_match(Regexp.new("Martin D\\u{FC}rst"), "Martin Dürst")
end
def test_syntax_variants