diff options
| field | value | date |
|---|---|---|
| author | Kevin Newton <kddnewton@gmail.com> | 2023-10-13 13:34:56 -0400 |
| committer | Jemma Issroff <jemmaissroff@gmail.com> | 2023-10-16 15:40:19 -0700 |
| commit | 9f16f07cf1e340acd9c41acaf8d46394353a0cea (patch) | |
| tree | d4b6a8e83ba4ca75f7203da8ed05f93a3284861e | |
| parent | 5523a23469987f92e38d52d4332bde09bdd8896c (diff) | |
[ruby/prism] Additionally handle encoding comments in vim mode
https://github.com/ruby/prism/commit/bf9bdb9d82
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | prism/prism.c | 93 |
| -rw-r--r-- | test/prism/magic_comment_test.rb | 3 |
| -rw-r--r-- | test/prism/parse_test.rb | 26 |
3 files changed, 105 insertions, 17 deletions
diff --git a/prism/prism.c b/prism/prism.c index c0f726e796..47b84a70dc 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -5221,7 +5221,7 @@ next_newline(const uint8_t *cursor, ptrdiff_t length) { // Here we're going to check if this is a "magic" comment, and perform whatever // actions are necessary for it here. static void -parser_lex_magic_comment_encoding(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) { +parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) { size_t width = (size_t) (end - start); // First, we're going to call out to a user-defined callback if one was @@ -5301,10 +5301,58 @@ parser_lex_magic_comment_encoding(pm_parser_t *parser, const uint8_t *start, con pm_parser_err(parser, start, end, PM_ERR_INVALID_ENCODING_MAGIC_COMMENT); } +// Look for a specific pattern of "coding" and potentially set the encoding on +// the parser. +static void +parser_lex_magic_comment_encoding(pm_parser_t *parser) { + const uint8_t *cursor = parser->current.start + 1; + const uint8_t *end = parser->current.end; + + bool separator = false; + while (true) { + if (end - cursor <= 6) return; + switch (cursor[6]) { + case 'C': case 'c': cursor += 6; continue; + case 'O': case 'o': cursor += 5; continue; + case 'D': case 'd': cursor += 4; continue; + case 'I': case 'i': cursor += 3; continue; + case 'N': case 'n': cursor += 2; continue; + case 'G': case 'g': cursor += 1; continue; + case '=': case ':': + separator = true; + cursor += 6; + break; + default: + cursor += 6; + if (pm_char_is_whitespace(*cursor)) break; + continue; + } + if (pm_strncasecmp(cursor - 6, (const uint8_t *) "coding", 6) == 0) break; + separator = false; + } + + while (true) { + do { + if (++cursor >= end) return; + } while (pm_char_is_whitespace(*cursor)); + + if (separator) break; + if (*cursor != '=' && *cursor != ':') return; + + separator = true; + cursor++; + } + + const uint8_t *value_start = cursor; + while ((*cursor == '-' || 
*cursor == '_' || parser->encoding.alnum_char(cursor, 1)) && ++cursor < end); + + parser_lex_magic_comment_encoding_value(parser, value_start, cursor); +} + // Check if this is a magic comment that includes the frozen_string_literal // pragma. If it does, set that field on the parser. static void -parser_lex_magic_comment_frozen_string_literal(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) { +parser_lex_magic_comment_frozen_string_literal_value(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) { if (start + 4 <= end && pm_strncasecmp(start, (const uint8_t *) "true", 4) == 0) { parser->frozen_string_literal = true; } @@ -5335,10 +5383,13 @@ parser_lex_magic_comment_emacs_marker(pm_parser_t *parser, const uint8_t *cursor // // %r"([^\\s\'\":;]+)\\s*:\\s*(\"(?:\\\\.|[^\"])*\"|[^\"\\s;]+)[\\s;]*" // -static inline void +// It returns true if it consumes the entire comment. Otherwise it returns +// false. +static inline bool parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) { const uint8_t *start = parser->current.start + 1; const uint8_t *end = parser->current.end; + if (end - start <= 7) return false; const uint8_t *cursor; bool indicator = false; @@ -5352,7 +5403,7 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) { } else { // If we have a start marker but not an end marker, then we cannot // have a magic comment. 
- return; + return false; } } @@ -5365,17 +5416,17 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) { const uint8_t *key_end = cursor; while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++; - if (cursor == end) return; + if (cursor == end) break; if (*cursor == ':') { cursor++; } else { - if (!indicator) return; + if (!indicator) return false; continue; } while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++; - if (cursor == end) return; + if (cursor == end) break; const uint8_t *value_start; const uint8_t *value_end; @@ -5396,7 +5447,7 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) { while (cursor < end && (*cursor == ';' || pm_char_is_whitespace(*cursor))) cursor++; } else { while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++; - if (cursor != end) return; + if (cursor != end) return false; } // Here, we need to do some processing on the key to swap out dashes for @@ -5409,7 +5460,7 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) { } else { size_t width = (size_t) (key_end - key_start); uint8_t *buffer = malloc(width); - if (buffer == NULL) return; + if (buffer == NULL) break; memcpy(buffer, key_start, width); buffer[dash - key_start] = '_'; @@ -5423,25 +5474,25 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) { // Finally, we can start checking the key against the list of known // magic comment keys, and potentially change state based on that. - const char *key_source = (const char *) pm_string_source(&key); + const uint8_t *key_source = pm_string_source(&key); const size_t key_length = pm_string_length(&key); // We only want to attempt to compare against encoding comments if it's // the first line in the file (or the second in the case of a shebang). 
if (parser->current.start == parser->encoding_comment_start) { if ( - (key_length == 8 && strncasecmp(key_source, "encoding", 8) == 0) || - (key_length == 6 && strncasecmp(key_source, "coding", 6) == 0) + (key_length == 8 && pm_strncasecmp(key_source, (const uint8_t *) "encoding", 8) == 0) || + (key_length == 6 && pm_strncasecmp(key_source, (const uint8_t *) "coding", 6) == 0) ) { - parser_lex_magic_comment_encoding(parser, value_start, value_end); + parser_lex_magic_comment_encoding_value(parser, value_start, value_end); } } // We only want to handle frozen string literal comments if it's before // any semantic tokens have been seen. if (!semantic_token_seen) { - if (key_length == 21 && strncasecmp(key_source, "frozen_string_literal", 21) == 0) { - parser_lex_magic_comment_frozen_string_literal(parser, value_start, value_end); + if (key_length == 21 && pm_strncasecmp(key_source, (const uint8_t *) "frozen_string_literal", 21) == 0) { + parser_lex_magic_comment_frozen_string_literal_value(parser, value_start, value_end); } } @@ -5459,6 +5510,8 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) { pm_list_append(&parser->magic_comment_list, (pm_list_node_t *) magic_comment); } } + + return true; } /******************************************************************************/ @@ -7061,7 +7114,15 @@ parser_lex(pm_parser_t *parser) { // Here, parse the comment to see if it's a magic comment // and potentially change state on the parser. - parser_lex_magic_comment(parser, semantic_token_seen); + if (!parser_lex_magic_comment(parser, semantic_token_seen) && (parser->current.start == parser->encoding_comment_start)) { + ptrdiff_t length = parser->current.end - parser->current.start; + + // If we didn't find a magic comment within the first + // pass and we're at the start of the file, then we need + // to do another pass to potentially find other patterns + // for encoding comments. 
+ if (length >= 10) parser_lex_magic_comment_encoding(parser); + } lexed_comment = true; } diff --git a/test/prism/magic_comment_test.rb b/test/prism/magic_comment_test.rb index c40364ccfa..76c4fcb71f 100644 --- a/test/prism/magic_comment_test.rb +++ b/test/prism/magic_comment_test.rb @@ -16,7 +16,8 @@ module Prism "# -*- CoDiNg: ascii -*-", "# -*- \s\t\v encoding \s\t\v : \s\t\v ascii \s\t\v -*-", "# -*- foo: bar; encoding: ascii -*-", - "# coding \t \r \v : \t \v \r ascii-8bit\n" + "# coding \t \r \v : \t \v \r ascii-8bit\n", + "# vim: filetype=ruby, fileencoding=big5, tabsize=3, shiftwidth=3\n" ] examples.each do |example| diff --git a/test/prism/parse_test.rb b/test/prism/parse_test.rb index aed41b5f84..eada2952df 100644 --- a/test/prism/parse_test.rb +++ b/test/prism/parse_test.rb @@ -4,6 +4,21 @@ require_relative "test_helper" module Prism class ParseTest < TestCase + # A subclass of Ripper that extracts out magic comments. + class MagicCommentRipper < Ripper + attr_reader :magic_comments + + def initialize(*) + super + @magic_comments = [] + end + + def on_magic_comment(key, value) + @magic_comments << [key, value] + super + end + end + # When we pretty-print the trees to compare against the snapshots, we want to # be certain that we print with the same external encoding. This is because # methods like Symbol#inspect take into account external encoding and it could @@ -159,6 +174,17 @@ module Prism rescue SyntaxError raise ArgumentError, "Test file has invalid syntax #{filepath}" end + + # Next, check that we get the correct number of magic comments when + # lexing with ripper. + expected = MagicCommentRipper.new(source).tap(&:parse).magic_comments + actual = result.magic_comments + + assert_equal expected.length, actual.length + expected.zip(actual).each do |(expected_key, expected_value), magic_comment| + assert_equal expected_key, magic_comment.key + assert_equal expected_value, magic_comment.value + end end end end |
