From 0e3dc5a056abf51363070ad94de4a8097bc80197 Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Thu, 24 Aug 2023 11:09:17 -0400 Subject: [ruby/yarp] Fix lex compat with BOM * BOM should not impact looking for the encoding string * We should re-encode tokens when the encoding changes * BOM should change the column of comments only https://github.com/ruby/yarp/commit/119fc2d7b2 --- yarp/extension.c | 14 ++++++++++++++ yarp/yarp.c | 15 +++++++++------ 2 files changed, 23 insertions(+), 6 deletions(-) (limited to 'yarp') diff --git a/yarp/extension.c b/yarp/extension.c index 455cdcadcc..8aef456c00 100644 --- a/yarp/extension.c +++ b/yarp/extension.c @@ -221,6 +221,20 @@ static void lex_encoding_changed_callback(yp_parser_t *parser) { lex_data_t *lex_data = (lex_data_t *) parser->lex_callback->data; lex_data->encoding = rb_enc_find(parser->encoding.name); + + // Since we got a new encoding, we need to go back and change the encoding + // of the tokens that we've already lexed. This should be a tiny amount + // since encoding magic comments need to be the first or second line of the + // file. + VALUE tokens = lex_data->tokens; + for (long index = 0; index < RARRAY_LEN(tokens); index++) { + VALUE yields = rb_ary_entry(tokens, index); + VALUE token = rb_ary_entry(yields, 0); + + VALUE value = rb_ivar_get(token, rb_intern("@value")); + rb_enc_associate(value, lex_data->encoding); + ENC_CODERANGE_CLEAR(value); + } } // Return an array of tokens corresponding to the given source. diff --git a/yarp/yarp.c b/yarp/yarp.c index 2a5a923c76..4de3338dc1 100644 --- a/yarp/yarp.c +++ b/yarp/yarp.c @@ -12876,6 +12876,8 @@ yp_parser_metadata(yp_parser_t *parser, const char *metadata) { // Initialize a parser with the given start and end pointers. YP_EXPORTED_FUNCTION void yp_parser_init(yp_parser_t *parser, const char *source, size_t size, const char *filepath) { + assert(source != NULL); + // Set filepath to the file that was passed if (!filepath) filepath = ""; yp_string_t filepath_string; @@ -12944,14 +12946,15 @@ yp_parser_init(yp_parser_t *parser, const char *source, size_t size, const char size_t newline_size = size / 22; yp_newline_list_init(&parser->newline_list, source, newline_size < 4 ? 4 : newline_size); - assert(source != NULL); + // Skip past the UTF-8 BOM if it exists. if (size >= 3 && (unsigned char) source[0] == 0xef && (unsigned char) source[1] == 0xbb && (unsigned char) source[2] == 0xbf) { - // If the first three bytes of the source are the UTF-8 BOM, then we'll skip - // over them. parser->current.end += 3; - } else if (size >= 2 && source[0] == '#' && source[1] == '!') { - // If the first two bytes of the source are a shebang, then we'll indicate - // that the encoding comment is at the end of the shebang. + parser->encoding_comment_start += 3; + } + + // If the first two bytes of the source are a shebang, then we'll indicate + // that the encoding comment is at the end of the shebang. + if (peek(parser) == '#' && peek_offset(parser, 1) == '!') { const char *encoding_comment_start = next_newline(source, (ptrdiff_t) size); if (encoding_comment_start) { parser->encoding_comment_start = encoding_comment_start + 1; -- cgit v1.2.3