From 0e3dc5a056abf51363070ad94de4a8097bc80197 Mon Sep 17 00:00:00 2001
From: Kevin Newton <kddnewton@gmail.com>
Date: Thu, 24 Aug 2023 11:09:17 -0400
Subject: [ruby/yarp] Fix lex compat with BOM

* BOM should not impact looking for the encoding string
* We should re-encode tokens when the encoding changes
* BOM should change the column of comments only

https://github.com/ruby/yarp/commit/119fc2d7b2
---
 yarp/extension.c | 14 ++++++++++++++
 yarp/yarp.c      | 15 +++++++++------
 2 files changed, 23 insertions(+), 6 deletions(-)

(limited to 'yarp')

diff --git a/yarp/extension.c b/yarp/extension.c
index 455cdcadcc..8aef456c00 100644
--- a/yarp/extension.c
+++ b/yarp/extension.c
@@ -221,6 +221,20 @@ static void
 lex_encoding_changed_callback(yp_parser_t *parser) {
     lex_data_t *lex_data = (lex_data_t *) parser->lex_callback->data;
     lex_data->encoding = rb_enc_find(parser->encoding.name);
+
+    // Since we got a new encoding, we need to go back and change the encoding
+    // of the tokens that we've already lexed. This should be a tiny amount
+    // since encoding magic comments need to be the first or second line of the
+    // file.
+    VALUE tokens = lex_data->tokens;
+    for (long index = 0; index < RARRAY_LEN(tokens); index++) {
+        VALUE yields = rb_ary_entry(tokens, index);
+        VALUE token = rb_ary_entry(yields, 0);
+
+        VALUE value = rb_ivar_get(token, rb_intern("@value"));
+        rb_enc_associate(value, lex_data->encoding);
+        ENC_CODERANGE_CLEAR(value);
+    }
 }
 
 // Return an array of tokens corresponding to the given source.
diff --git a/yarp/yarp.c b/yarp/yarp.c
index 2a5a923c76..4de3338dc1 100644
--- a/yarp/yarp.c
+++ b/yarp/yarp.c
@@ -12876,6 +12876,8 @@ yp_parser_metadata(yp_parser_t *parser, const char *metadata) {
 // Initialize a parser with the given start and end pointers.
 YP_EXPORTED_FUNCTION void
 yp_parser_init(yp_parser_t *parser, const char *source, size_t size, const char *filepath) {
+    assert(source != NULL);
+
     // Set filepath to the file that was passed
     if (!filepath) filepath = "";
     yp_string_t filepath_string;
@@ -12944,14 +12946,15 @@ yp_parser_init(yp_parser_t *parser, const char *source, size_t size, const char
     size_t newline_size = size / 22;
     yp_newline_list_init(&parser->newline_list, source, newline_size < 4 ? 4 : newline_size);
 
-    assert(source != NULL);
+    // Skip past the UTF-8 BOM if it exists.
     if (size >= 3 && (unsigned char) source[0] == 0xef && (unsigned char) source[1] == 0xbb && (unsigned char) source[2] == 0xbf) {
-        // If the first three bytes of the source are the UTF-8 BOM, then we'll skip
-        // over them.
         parser->current.end += 3;
-    } else if (size >= 2 && source[0] == '#' && source[1] == '!') {
-        // If the first two bytes of the source are a shebang, then we'll indicate
-        // that the encoding comment is at the end of the shebang.
+        parser->encoding_comment_start += 3;
+    }
+
+    // If the first two bytes of the source are a shebang, then we'll indicate
+    // that the encoding comment is at the end of the shebang.
+    if (peek(parser) == '#' && peek_offset(parser, 1) == '!') {
         const char *encoding_comment_start = next_newline(source, (ptrdiff_t) size);
         if (encoding_comment_start) {
             parser->encoding_comment_start = encoding_comment_start + 1;
-- 
cgit v1.2.3