summaryrefslogtreecommitdiff
path: root/yarp
diff options
context:
space:
mode:
author: Kevin Newton <kddnewton@gmail.com> 2023-08-24 11:09:17 -0400
committer: git <svn-admin@ruby-lang.org> 2023-08-24 21:30:01 +0000
commit: 0e3dc5a056abf51363070ad94de4a8097bc80197 (patch)
tree: e2bf91984c5aaf0d5157863b9e5c196c5489c0da /yarp
parent: 90048241cad97573d830e86222ca4826a32da13e (diff)
[ruby/yarp] Fix lex compat with BOM
* BOM should not impact looking for the encoding string
* We should re-encode tokens when the encoding changes
* BOM should change the column of comments only

https://github.com/ruby/yarp/commit/119fc2d7b2
Diffstat (limited to 'yarp')
-rw-r--r-- yarp/extension.c 14
-rw-r--r-- yarp/yarp.c 15
2 files changed, 23 insertions, 6 deletions
diff --git a/yarp/extension.c b/yarp/extension.c
index 455cdcadcc..8aef456c00 100644
--- a/yarp/extension.c
+++ b/yarp/extension.c
@@ -221,6 +221,20 @@ static void
lex_encoding_changed_callback(yp_parser_t *parser) {
lex_data_t *lex_data = (lex_data_t *) parser->lex_callback->data;
lex_data->encoding = rb_enc_find(parser->encoding.name);
+
+ // Since we got a new encoding, we need to go back and change the encoding
+ // of the tokens that we've already lexed. This should be a tiny amount
+ // since encoding magic comments need to be the first or second line of the
+ // file.
+ VALUE tokens = lex_data->tokens;
+ for (long index = 0; index < RARRAY_LEN(tokens); index++) {
+ VALUE yields = rb_ary_entry(tokens, index);
+ VALUE token = rb_ary_entry(yields, 0);
+
+ VALUE value = rb_ivar_get(token, rb_intern("@value"));
+ rb_enc_associate(value, lex_data->encoding);
+ ENC_CODERANGE_CLEAR(value);
+ }
}
// Return an array of tokens corresponding to the given source.
diff --git a/yarp/yarp.c b/yarp/yarp.c
index 2a5a923c76..4de3338dc1 100644
--- a/yarp/yarp.c
+++ b/yarp/yarp.c
@@ -12876,6 +12876,8 @@ yp_parser_metadata(yp_parser_t *parser, const char *metadata) {
// Initialize a parser with the given start and end pointers.
YP_EXPORTED_FUNCTION void
yp_parser_init(yp_parser_t *parser, const char *source, size_t size, const char *filepath) {
+ assert(source != NULL);
+
// Set filepath to the file that was passed
if (!filepath) filepath = "";
yp_string_t filepath_string;
@@ -12944,14 +12946,15 @@ yp_parser_init(yp_parser_t *parser, const char *source, size_t size, const char
size_t newline_size = size / 22;
yp_newline_list_init(&parser->newline_list, source, newline_size < 4 ? 4 : newline_size);
- assert(source != NULL);
+ // Skip past the UTF-8 BOM if it exists.
if (size >= 3 && (unsigned char) source[0] == 0xef && (unsigned char) source[1] == 0xbb && (unsigned char) source[2] == 0xbf) {
- // If the first three bytes of the source are the UTF-8 BOM, then we'll skip
- // over them.
parser->current.end += 3;
- } else if (size >= 2 && source[0] == '#' && source[1] == '!') {
- // If the first two bytes of the source are a shebang, then we'll indicate
- // that the encoding comment is at the end of the shebang.
+ parser->encoding_comment_start += 3;
+ }
+
+ // If the first two bytes of the source are a shebang, then we'll indicate
+ // that the encoding comment is at the end of the shebang.
+ if (peek(parser) == '#' && peek_offset(parser, 1) == '!') {
const char *encoding_comment_start = next_newline(source, (ptrdiff_t) size);
if (encoding_comment_start) {
parser->encoding_comment_start = encoding_comment_start + 1;