summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Kevin Newton <kddnewton@gmail.com> 2023-08-24 11:09:17 -0400
committer git <svn-admin@ruby-lang.org> 2023-08-24 21:30:01 +0000
commit 0e3dc5a056abf51363070ad94de4a8097bc80197 (patch)
tree e2bf91984c5aaf0d5157863b9e5c196c5489c0da
parent 90048241cad97573d830e86222ca4826a32da13e (diff)
[ruby/yarp] Fix lex compat with BOM
* BOM should not impact looking for the encoding string
* We should re-encode tokens when the encoding changes
* BOM should change the column of comments only

https://github.com/ruby/yarp/commit/119fc2d7b2
-rw-r--r-- lib/yarp/lex_compat.rb 45
-rw-r--r-- test/bom_test.rb 57
-rw-r--r-- yarp/extension.c 14
-rw-r--r-- yarp/yarp.c 15
4 files changed, 116 insertions, 15 deletions
diff --git a/lib/yarp/lex_compat.rb b/lib/yarp/lex_compat.rb
index 8362b9063a..a9867737c2 100644
--- a/lib/yarp/lex_compat.rb
+++ b/lib/yarp/lex_compat.rb
@@ -574,19 +574,41 @@ module YARP
result = YARP.lex(source, @filepath)
result_value = result.value
previous_state = nil
-
- # If there's a UTF-8 byte-order mark as the start of the file, then ripper
- # sets every token's on the first line back by 6 bytes. It also keeps the
- # byte order mark in the first token's value. This is weird, and I don't
- # want to mirror that in our parser. So instead, we'll match up the values
- # here, and then match up the locations as we process the tokens.
- bom = source.bytes[0..2] == [0xEF, 0xBB, 0xBF]
- result_value[0][0].value.prepend("\xEF\xBB\xBF") if bom
+ bom = source.byteslice(0..2) == "\xEF\xBB\xBF"
result_value.each_with_index do |(token, lex_state), index|
lineno = token.location.start_line
column = token.location.start_column
- column -= index == 0 ? 6 : 3 if bom && lineno == 1
+
+ # If there's a UTF-8 byte-order mark at the start of the file, then ripper
+ # sets the tokens on the first line back by 3 bytes. For certain first
+ # tokens it goes back 3 more bytes and keeps the byte order mark in the
+ # token's value. This is weird, and I don't want to mirror that in our
+ # parser. So instead, we'll match up the columns and values here.
+ if bom && lineno == 1
+ column -= 3
+
+ if index == 0 && column == 0
+ flushed =
+ case token.type
+ when :BACK_REFERENCE, :INSTANCE_VARIABLE, :CLASS_VARIABLE,
+ :GLOBAL_VARIABLE, :NUMBERED_REFERENCE, :PERCENT_LOWER_I,
+ :PERCENT_LOWER_X, :PERCENT_LOWER_W, :PERCENT_UPPER_I,
+ :PERCENT_UPPER_W, :STRING_BEGIN
+ true
+ when :REGEXP_BEGIN, :SYMBOL_BEGIN
+ token.value.start_with?("%")
+ else
+ false
+ end
+
+ unless flushed
+ column -= 3
+ value = token.value
+ value.prepend(String.new("\xEF\xBB\xBF", encoding: value.encoding))
+ end
+ end
+ end
event = RIPPER.fetch(token.type)
value = token.value
@@ -668,6 +690,11 @@ module YARP
end_offset = token.location.start_offset
if previous_token.type == :COMMENT && start_offset < end_offset
+ if bom
+ start_offset += 3
+ end_offset += 3
+ end
+
tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
end
diff --git a/test/bom_test.rb b/test/bom_test.rb
new file mode 100644
index 0000000000..7dc7eabe92
--- /dev/null
+++ b/test/bom_test.rb
@@ -0,0 +1,57 @@
+# frozen_string_literal: true
+
+# Don't bother checking this on these engines, this is such a specific Ripper
+# test.
+return if RUBY_ENGINE == "jruby" || RUBY_ENGINE == "truffleruby"
+
+require "yarp_test_helper"
+
+class BOMTest < Test::Unit::TestCase
+ def test_ident
+ assert_bom("foo")
+ end
+
+ def test_back_reference
+ assert_bom("$+")
+ end
+
+ def test_instance_variable
+ assert_bom("@foo")
+ end
+
+ def test_class_variable
+ assert_bom("@@foo")
+ end
+
+ def test_global_variable
+ assert_bom("$foo")
+ end
+
+ def test_numbered_reference
+ assert_bom("$1")
+ end
+
+ def test_percents
+ assert_bom("%i[]")
+ assert_bom("%r[]")
+ assert_bom("%s[]")
+ assert_bom("%q{}")
+ assert_bom("%w[]")
+ assert_bom("%x[]")
+ assert_bom("%I[]")
+ assert_bom("%W[]")
+ assert_bom("%Q{}")
+ end
+
+ def test_string
+ assert_bom("\"\"")
+ assert_bom("''")
+ end
+
+ private
+
+ def assert_bom(source)
+ bommed = "\xEF\xBB\xBF#{source}"
+ assert_equal YARP.lex_ripper(bommed), YARP.lex_compat(bommed).value
+ end
+end
diff --git a/yarp/extension.c b/yarp/extension.c
index 455cdcadcc..8aef456c00 100644
--- a/yarp/extension.c
+++ b/yarp/extension.c
@@ -221,6 +221,20 @@ static void
lex_encoding_changed_callback(yp_parser_t *parser) {
lex_data_t *lex_data = (lex_data_t *) parser->lex_callback->data;
lex_data->encoding = rb_enc_find(parser->encoding.name);
+
+ // Since we got a new encoding, we need to go back and change the encoding
+ // of the tokens that we've already lexed. This should be a tiny amount
+ // since encoding magic comments need to be the first or second line of the
+ // file.
+ VALUE tokens = lex_data->tokens;
+ for (long index = 0; index < RARRAY_LEN(tokens); index++) {
+ VALUE yields = rb_ary_entry(tokens, index);
+ VALUE token = rb_ary_entry(yields, 0);
+
+ VALUE value = rb_ivar_get(token, rb_intern("@value"));
+ rb_enc_associate(value, lex_data->encoding);
+ ENC_CODERANGE_CLEAR(value);
+ }
}
// Return an array of tokens corresponding to the given source.
diff --git a/yarp/yarp.c b/yarp/yarp.c
index 2a5a923c76..4de3338dc1 100644
--- a/yarp/yarp.c
+++ b/yarp/yarp.c
@@ -12876,6 +12876,8 @@ yp_parser_metadata(yp_parser_t *parser, const char *metadata) {
// Initialize a parser with the given start and end pointers.
YP_EXPORTED_FUNCTION void
yp_parser_init(yp_parser_t *parser, const char *source, size_t size, const char *filepath) {
+ assert(source != NULL);
+
// Set filepath to the file that was passed
if (!filepath) filepath = "";
yp_string_t filepath_string;
@@ -12944,14 +12946,15 @@ yp_parser_init(yp_parser_t *parser, const char *source, size_t size, const char
size_t newline_size = size / 22;
yp_newline_list_init(&parser->newline_list, source, newline_size < 4 ? 4 : newline_size);
- assert(source != NULL);
+ // Skip past the UTF-8 BOM if it exists.
if (size >= 3 && (unsigned char) source[0] == 0xef && (unsigned char) source[1] == 0xbb && (unsigned char) source[2] == 0xbf) {
- // If the first three bytes of the source are the UTF-8 BOM, then we'll skip
- // over them.
parser->current.end += 3;
- } else if (size >= 2 && source[0] == '#' && source[1] == '!') {
- // If the first two bytes of the source are a shebang, then we'll indicate
- // that the encoding comment is at the end of the shebang.
+ parser->encoding_comment_start += 3;
+ }
+
+ // If the first two bytes of the source are a shebang, then we'll indicate
+ // that the encoding comment is at the end of the shebang.
+ if (peek(parser) == '#' && peek_offset(parser, 1) == '!') {
const char *encoding_comment_start = next_newline(source, (ptrdiff_t) size);
if (encoding_comment_start) {
parser->encoding_comment_start = encoding_comment_start + 1;