summaryrefslogtreecommitdiff
path: root/string.c
diff options
context:
space:
mode:
author tadd <tadd@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2017-12-14 08:47:13 +0000
committer tadd <tadd@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2017-12-14 08:47:13 +0000
commit bbec11d329f5a72fe6151ec9fb0e25ff255f2eed (patch)
tree 6ea1a7056b7905c0f7a8b3ee26e3df7857ad4ee7 /string.c
parent 4abc1a24af2541ce6fd823ae4c99c1c18c748984 (diff)
Implement String#undump to unescape String#dump-ed string
[Feature #12275] [close GH-1765] git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@61228 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'string.c')
-rw-r--r-- string.c 301
1 files changed, 284 insertions, 17 deletions
diff --git a/string.c b/string.c
index 56b6f641c6..6a8e24a1bf 100644
--- a/string.c
+++ b/string.c
@@ -19,6 +19,7 @@
#include "ruby_assert.h"
#include "id.h"
#include "debug_counter.h"
+#include "ruby/util.h"
#define BEG(no) (regs->beg[(no)])
#define END(no) (regs->end[(no)])
@@ -3422,13 +3423,34 @@ str_casecmp_p(VALUE str1, VALUE str2)
return rb_str_eql(folded_str1, folded_str2);
}
+/*
+ * Byte-searches for sub_ptr[0, sub_len) in the text starting at str_ptr and
+ * accepts only a hit that sits exactly where rb_enc_right_char_head()
+ * reports a character head.
+ *
+ * The search window is (str_len - offset) bytes long and starts at str_ptr;
+ * str_ptr_end bounds the character-head lookup.  A rejected hit moves the
+ * window start to the reported head and the search is retried.
+ *
+ * Returns the position of the accepted hit relative to str_ptr plus the
+ * initial offset, or a negative value when nothing is found.
+ */
+static long
+strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
+            const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
+{
+    const char *cursor = str_ptr;
+    long remaining = str_len - offset;
+    long hit;
+
+    while ((hit = rb_memsearch(sub_ptr, sub_len, cursor, remaining, enc)) >= 0) {
+        const char *head = rb_enc_right_char_head(cursor, cursor + hit, str_ptr_end, enc);
+        long skipped;
+
+        /* the hit starts on a character head: done */
+        if (head == cursor + hit) return hit + offset;
+
+        /* otherwise retry from the reported head */
+        skipped = head - cursor;
+        remaining -= skipped;
+        if (remaining <= 0) return -1;
+        offset += skipped;
+        cursor = head;
+    }
+    return hit; /* negative: rb_memsearch found nothing */
+}
+
#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
static long
rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
{
- const char *str_ptr, *str_ptr_end, *sub_ptr, *search_start;
- long pos, str_len, sub_len, search_len;
+ const char *str_ptr, *str_ptr_end, *sub_ptr;
+ long str_len, sub_len;
int single_byte = single_byte_optimizable(str);
rb_encoding *enc;
@@ -3458,21 +3480,7 @@ rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
if (sub_len == 0) return offset;
/* need proceed one character at a time */
-
- search_start = str_ptr;
- search_len = RSTRING_LEN(str) - offset;
- for (;;) {
- const char *t;
- pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
- if (pos < 0) return pos;
- t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
- if (t == search_start + pos) break;
- search_len -= t - search_start;
- if (search_len <= 0) return -1;
- offset += t - search_start;
- search_start = t;
- }
- return pos + offset;
+ return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
}
@@ -6073,6 +6081,264 @@ rb_str_dump(VALUE str)
return result;
}
+enum undump_source_format { /* input shapes reported by check_undump_source_format() */
+ UNDUMP_SOURCE_SIMPLE, /* "..." */
+ UNDUMP_SOURCE_FORCE_ENCODING, /* "...".force_encoding("...") */
+ UNDUMP_SOURCE_INVALID /* neither of the two shapes above */
+};
+
+/*
+ * Classifies the text [s, s_end) of byte length len as one of the shapes
+ * listed in enum undump_source_format.
+ *
+ * For the "...".force_encoding("...") shape, the text between the quotes of
+ * the force_encoding argument is handed back as a new String through
+ * *forced_enc_str and its byte length through *forced_enc_str_len.  Both
+ * out-parameters are left untouched for every other result.
+ */
+static enum undump_source_format
+check_undump_source_format(const char *s, const char *s_end, long len, rb_encoding *enc,
+                           VALUE *forced_enc_str, long *forced_enc_str_len)
+{
+    static const char middle[] = "\".force_encoding(\"";
+    static const long middle_len = rb_strlen_lit("\".force_encoding(\"");
+    static const char tail[] = "\")";
+    static const long tail_len = rb_strlen_lit("\")");
+    static const long shortest_forced_len = rb_strlen_lit("\"\".force_encoding(\"\")");
+    const char *last_char;
+    unsigned int first, last;
+    long middle_at, name_at, name_len;
+
+    if (len < 2) return UNDUMP_SOURCE_INVALID;
+
+    /* every accepted shape opens with a double quote */
+    first = rb_enc_mbc_to_codepoint(s, s_end, enc);
+    if (first != '"') return UNDUMP_SOURCE_INVALID;
+
+    /* the final character tells the two shapes apart */
+    last_char = rb_enc_prev_char(s, s_end, s_end, enc);
+    last = rb_enc_mbc_to_codepoint(last_char, s_end, enc);
+    if (last == '"') return UNDUMP_SOURCE_SIMPLE;
+    if (last != ')') return UNDUMP_SOURCE_INVALID;
+    if (len < shortest_forced_len) return UNDUMP_SOURCE_INVALID;
+
+    /* find '".force_encoding("' */
+    middle_at = strseq_core(s, s_end, len, middle, middle_len, 0, enc);
+    if (middle_at <= 0) return UNDUMP_SOURCE_INVALID;
+
+    /* find '")', which has to be the very end of the text */
+    name_at = middle_at + middle_len;
+    name_len = strseq_core(s + name_at, s_end, len - name_at, tail, tail_len, 0, enc);
+    if (name_len < 0) return UNDUMP_SOURCE_INVALID;
+    if (name_at + name_len + tail_len != len) return UNDUMP_SOURCE_INVALID;
+
+    *forced_enc_str_len = name_len;
+    *forced_enc_str = rb_str_new(s + name_at, *forced_enc_str_len);
+    return UNDUMP_SOURCE_FORCE_ENCODING;
+}
+
+/*
+ * Maps the letter of a single-character backslash escape (n, r, t, f, v,
+ * b, a, e) to the character code it stands for.  Any other letter is
+ * marked UNREACHABLE, so callers must pass one of the letters listed.
+ */
+static int
+unescape_ascii(unsigned int c)
+{
+    switch (c) {
+      case 'n': return '\n';
+      case 'r': return '\r';
+      case 't': return '\t';
+      case 'f': return '\f';
+      case 'v': return '\v';
+      case 'b': return '\b';
+      case 'a': return '\a';
+      case 'e': return 033; /* ESC */
+      default:
+        UNREACHABLE;
+    }
+}
+
+/*
+ * Decodes the escape sequence that follows a backslash in String#dump
+ * output and appends the decoded bytes to +undumped+.
+ *
+ * s     - points at the character right after the backslash
+ * s_end - end of the text being decoded
+ * penc  - in/out: encoding used to read the text; a Unicode escape
+ *         (\uXXXX or \u{...}) switches both *penc and the encoding of
+ *         +undumped+ to UTF-8
+ *
+ * Returns the number of bytes the caller has to advance, counted from the
+ * backslash.  For an unrecognized escape only a backslash is appended and
+ * the byte width of the following character is returned (1 for the
+ * ASCII-only text str_undump() passes in), so that character is then
+ * handled as ordinary text by the caller.
+ *
+ * Raises RuntimeError for malformed \x and \u escapes, for code points
+ * above U+10FFFF and for surrogate code points (U+D800..U+DFFF).
+ */
+static int
+undump_after_backslash(VALUE undumped, const char *s, const char *s_end, rb_encoding **penc)
+{
+    unsigned int c, c2;
+    int n, codelen;
+    size_t hexlen;
+    char buf[6];
+    static rb_encoding *enc_utf8 = NULL;
+
+    c = rb_enc_codepoint_len(s, s_end, &n, *penc);
+    switch (c) {
+      case '\\':
+      case '"':
+      case '#':
+        rb_str_cat(undumped, s, n); /* cat itself */
+        n++;
+        break;
+      case 'n':
+      case 'r':
+      case 't':
+      case 'f':
+      case 'v':
+      case 'b':
+      case 'a':
+      case 'e':
+        *buf = (char)unescape_ascii(c);
+        rb_str_cat(undumped, buf, n);
+        n++;
+        break;
+      case 'u':
+        if (s+1 >= s_end) {
+            rb_raise(rb_eRuntimeError, "invalid Unicode escape");
+        }
+        if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
+        if (*penc != enc_utf8) {
+            *penc = enc_utf8;
+            rb_enc_associate(undumped, enc_utf8);
+            ENC_CODERANGE_CLEAR(undumped);
+        }
+        c2 = rb_enc_codepoint_len(s+1, s_end, NULL, *penc);
+        if (c2 == '{') { /* handle \u{...} form */
+            const char *hexstr = s + 2;
+            unsigned int hex;
+            static const char* const close_brace = "}";
+            long pos;
+
+            if (hexstr >= s_end) {
+                rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
+            }
+            /* find close brace */
+            pos = strseq_core(hexstr, s_end, s_end - hexstr, close_brace, 1, 0, *penc);
+            if (pos < 0) {
+                rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
+            }
+            hex = scan_hex(hexstr, pos, &hexlen);
+            /*
+             * The braces must hold 1..6 hex digits and nothing else.  The
+             * advance below is computed from hexlen, so accepting trailing
+             * non-hex characters would leave them (and the close brace)
+             * to be copied as literal text.
+             */
+            if (hexlen == 0 || hexlen > 6 || hexlen != (size_t)pos) {
+                rb_raise(rb_eRuntimeError, "invalid Unicode escape");
+            }
+            if (hex > 0x10ffff) {
+                rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
+            }
+            if ((hex & 0xfffff800) == 0xd800) {
+                rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
+            }
+            codelen = rb_enc_codelen(hex, *penc);
+            rb_enc_mbcput(hex, buf, *penc);
+            rb_str_cat(undumped, buf, codelen);
+            n += rb_strlen_lit("u{}") + hexlen;
+        }
+        else { /* handle \uXXXX form */
+            unsigned int hex = scan_hex(s+1, 4, &hexlen);
+            if (hexlen != 4) {
+                rb_raise(rb_eRuntimeError, "invalid Unicode escape");
+            }
+            /* surrogates are rejected here just as in the \u{...} form */
+            if ((hex & 0xfffff800) == 0xd800) {
+                rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
+            }
+            codelen = rb_enc_codelen(hex, *penc);
+            rb_enc_mbcput(hex, buf, *penc);
+            rb_str_cat(undumped, buf, codelen);
+            n += rb_strlen_lit("uXXXX");
+        }
+        break;
+      case 'x':
+        if (s+1 >= s_end) {
+            rb_raise(rb_eRuntimeError, "invalid hex escape");
+        }
+        c2 = scan_hex(s+1, 2, &hexlen);
+        if (hexlen != 2) {
+            rb_raise(rb_eRuntimeError, "invalid hex escape");
+        }
+        *buf = (char)c2;
+        rb_str_cat(undumped, buf, 1L);
+        n += rb_strlen_lit("xXX");
+        break;
+      default:
+        rb_str_cat(undumped, "\\", 1L); /* keep backslash */
+    }
+
+    return n;
+}
+
+static VALUE rb_str_is_ascii_only_p(VALUE str);
+
+/*
+ *  call-seq:
+ *     str.undump   -> new_str
+ *
+ *  Returns a new string in which the escaping of +str+ is reverted.
+ *  String#undump is the inverse of String#dump, so see String#dump for
+ *  the escaped form.
+ *
+ *  Raises RuntimeError when +str+ contains a non-ASCII character or a
+ *  null byte, when it is not of the form <code>"..."</code> or
+ *  <code>"...".force_encoding("...")</code>, when the encoding name is
+ *  unknown, or when an escape inside it is malformed.
+ *
+ *    "\"hello \\n ''\"".undump #=> "hello \n ''"
+ */
+
+static VALUE
+str_undump(VALUE str)
+{
+    const char *cur = RSTRING_PTR(str);
+    const char *limit = RSTRING_END(str);
+    long total_len = RSTRING_LEN(str);
+    rb_encoding *enc = rb_enc_get(str);
+    rb_encoding *forced_enc;
+    VALUE result = rb_enc_str_new(cur, 0L, enc);
+    VALUE forced_enc_name;
+    long forced_enc_name_len;
+    enum undump_source_format format;
+    int null_check_width;
+
+    rb_must_asciicompat(str);
+    if (rb_str_is_ascii_only_p(str) == Qfalse) {
+        rb_raise(rb_eRuntimeError, "non-ASCII character detected");
+    }
+    if (!str_null_check(str, &null_check_width)) {
+        rb_raise(rb_eRuntimeError, "string contains null byte");
+    }
+
+    format = check_undump_source_format(cur, limit, total_len, enc,
+                                        &forced_enc_name, &forced_enc_name_len);
+    if (format == UNDUMP_SOURCE_INVALID) {
+        rb_raise(rb_eRuntimeError, "not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
+    }
+    if (format == UNDUMP_SOURCE_FORCE_ENCODING) {
+        forced_enc = rb_find_encoding(forced_enc_name);
+        if (forced_enc == NULL) {
+            rb_raise(rb_eRuntimeError, "unknown encoding name - %"PRIsVALUE, forced_enc_name);
+        }
+    }
+
+    /* narrow [cur, limit) down to the escaped payload between the quotes */
+    cur++;
+    if (format == UNDUMP_SOURCE_FORCE_ENCODING) {
+        /* drop the trailing '".force_encoding("...")' */
+        limit -= rb_strlen_lit("\".force_encoding(\"\")") + forced_enc_name_len;
+    }
+    else {
+        /* drop the closing '"' */
+        limit--;
+    }
+
+    while (cur < limit) {
+        int step;
+        unsigned int ch = rb_enc_codepoint_len(cur, limit, &step, enc);
+
+        if (ch == '\\') {
+            if (cur+1 >= limit) {
+                rb_raise(rb_eRuntimeError, "invalid escape");
+            }
+            /* may switch enc to UTF-8 when it meets a Unicode escape */
+            step = undump_after_backslash(result, cur+1, limit, &enc);
+        }
+        else if (ch == '"') {
+            rb_raise(rb_eRuntimeError, "non-escaped double quote detected");
+        }
+        else {
+            rb_str_cat(result, cur, step);
+        }
+        cur += step;
+    }
+
+    if (format == UNDUMP_SOURCE_FORCE_ENCODING) {
+        rb_enc_associate(result, forced_enc);
+        ENC_CODERANGE_CLEAR(result);
+    }
+    OBJ_INFECT(result, str);
+    return result;
+}
static void
rb_str_check_dummy_enc(rb_encoding *enc)
@@ -10586,6 +10852,7 @@ Init_String(void)
rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
rb_define_method(rb_cString, "dump", rb_str_dump, 0);
+ rb_define_method(rb_cString, "undump", str_undump, 0);
sym_ascii = ID2SYM(rb_intern("ascii"));
sym_turkic = ID2SYM(rb_intern("turkic"));