summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKevin Newton <kddnewton@gmail.com>2023-10-13 13:34:56 -0400
committerJemma Issroff <jemmaissroff@gmail.com>2023-10-16 15:40:19 -0700
commit9f16f07cf1e340acd9c41acaf8d46394353a0cea (patch)
treed4b6a8e83ba4ca75f7203da8ed05f93a3284861e
parent5523a23469987f92e38d52d4332bde09bdd8896c (diff)
[ruby/prism] Additionally handle encoding comments in vim mode
https://github.com/ruby/prism/commit/bf9bdb9d82
-rw-r--r--prism/prism.c93
-rw-r--r--test/prism/magic_comment_test.rb3
-rw-r--r--test/prism/parse_test.rb26
3 files changed, 105 insertions, 17 deletions
diff --git a/prism/prism.c b/prism/prism.c
index c0f726e796..47b84a70dc 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -5221,7 +5221,7 @@ next_newline(const uint8_t *cursor, ptrdiff_t length) {
// Here we're going to check if this is a "magic" comment, and perform whatever
// actions are necessary for it here.
static void
-parser_lex_magic_comment_encoding(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
+parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
size_t width = (size_t) (end - start);
// First, we're going to call out to a user-defined callback if one was
@@ -5301,10 +5301,58 @@ parser_lex_magic_comment_encoding(pm_parser_t *parser, const uint8_t *start, con
pm_parser_err(parser, start, end, PM_ERR_INVALID_ENCODING_MAGIC_COMMENT);
}
+// Look for a specific pattern of "coding" and potentially set the encoding on
+// the parser.
+static void
+parser_lex_magic_comment_encoding(pm_parser_t *parser) {
+ const uint8_t *cursor = parser->current.start + 1;
+ const uint8_t *end = parser->current.end;
+
+ bool separator = false;
+ while (true) {
+ if (end - cursor <= 6) return;
+ switch (cursor[6]) {
+ case 'C': case 'c': cursor += 6; continue;
+ case 'O': case 'o': cursor += 5; continue;
+ case 'D': case 'd': cursor += 4; continue;
+ case 'I': case 'i': cursor += 3; continue;
+ case 'N': case 'n': cursor += 2; continue;
+ case 'G': case 'g': cursor += 1; continue;
+ case '=': case ':':
+ separator = true;
+ cursor += 6;
+ break;
+ default:
+ cursor += 6;
+ if (pm_char_is_whitespace(*cursor)) break;
+ continue;
+ }
+ if (pm_strncasecmp(cursor - 6, (const uint8_t *) "coding", 6) == 0) break;
+ separator = false;
+ }
+
+ while (true) {
+ do {
+ if (++cursor >= end) return;
+ } while (pm_char_is_whitespace(*cursor));
+
+ if (separator) break;
+ if (*cursor != '=' && *cursor != ':') return;
+
+ separator = true;
+ cursor++;
+ }
+
+ const uint8_t *value_start = cursor;
+ while ((*cursor == '-' || *cursor == '_' || parser->encoding.alnum_char(cursor, 1)) && ++cursor < end);
+
+ parser_lex_magic_comment_encoding_value(parser, value_start, cursor);
+}
+
// Check if this is a magic comment that includes the frozen_string_literal
// pragma. If it does, set that field on the parser.
static void
-parser_lex_magic_comment_frozen_string_literal(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
+parser_lex_magic_comment_frozen_string_literal_value(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
if (start + 4 <= end && pm_strncasecmp(start, (const uint8_t *) "true", 4) == 0) {
parser->frozen_string_literal = true;
}
@@ -5335,10 +5383,13 @@ parser_lex_magic_comment_emacs_marker(pm_parser_t *parser, const uint8_t *cursor
//
// %r"([^\\s\'\":;]+)\\s*:\\s*(\"(?:\\\\.|[^\"])*\"|[^\"\\s;]+)[\\s;]*"
//
-static inline void
+// It returns true if it consumes the entire comment. Otherwise it returns
+// false.
+static inline bool
parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
const uint8_t *start = parser->current.start + 1;
const uint8_t *end = parser->current.end;
+ if (end - start <= 7) return false;
const uint8_t *cursor;
bool indicator = false;
@@ -5352,7 +5403,7 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
} else {
// If we have a start marker but not an end marker, then we cannot
// have a magic comment.
- return;
+ return false;
}
}
@@ -5365,17 +5416,17 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
const uint8_t *key_end = cursor;
while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++;
- if (cursor == end) return;
+ if (cursor == end) break;
if (*cursor == ':') {
cursor++;
} else {
- if (!indicator) return;
+ if (!indicator) return false;
continue;
}
while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++;
- if (cursor == end) return;
+ if (cursor == end) break;
const uint8_t *value_start;
const uint8_t *value_end;
@@ -5396,7 +5447,7 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
while (cursor < end && (*cursor == ';' || pm_char_is_whitespace(*cursor))) cursor++;
} else {
while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++;
- if (cursor != end) return;
+ if (cursor != end) return false;
}
// Here, we need to do some processing on the key to swap out dashes for
@@ -5409,7 +5460,7 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
} else {
size_t width = (size_t) (key_end - key_start);
uint8_t *buffer = malloc(width);
- if (buffer == NULL) return;
+ if (buffer == NULL) break;
memcpy(buffer, key_start, width);
buffer[dash - key_start] = '_';
@@ -5423,25 +5474,25 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
// Finally, we can start checking the key against the list of known
// magic comment keys, and potentially change state based on that.
- const char *key_source = (const char *) pm_string_source(&key);
+ const uint8_t *key_source = pm_string_source(&key);
const size_t key_length = pm_string_length(&key);
// We only want to attempt to compare against encoding comments if it's
// the first line in the file (or the second in the case of a shebang).
if (parser->current.start == parser->encoding_comment_start) {
if (
- (key_length == 8 && strncasecmp(key_source, "encoding", 8) == 0) ||
- (key_length == 6 && strncasecmp(key_source, "coding", 6) == 0)
+ (key_length == 8 && pm_strncasecmp(key_source, (const uint8_t *) "encoding", 8) == 0) ||
+ (key_length == 6 && pm_strncasecmp(key_source, (const uint8_t *) "coding", 6) == 0)
) {
- parser_lex_magic_comment_encoding(parser, value_start, value_end);
+ parser_lex_magic_comment_encoding_value(parser, value_start, value_end);
}
}
// We only want to handle frozen string literal comments if it's before
// any semantic tokens have been seen.
if (!semantic_token_seen) {
- if (key_length == 21 && strncasecmp(key_source, "frozen_string_literal", 21) == 0) {
- parser_lex_magic_comment_frozen_string_literal(parser, value_start, value_end);
+ if (key_length == 21 && pm_strncasecmp(key_source, (const uint8_t *) "frozen_string_literal", 21) == 0) {
+ parser_lex_magic_comment_frozen_string_literal_value(parser, value_start, value_end);
}
}
@@ -5459,6 +5510,8 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
pm_list_append(&parser->magic_comment_list, (pm_list_node_t *) magic_comment);
}
}
+
+ return true;
}
/******************************************************************************/
@@ -7061,7 +7114,15 @@ parser_lex(pm_parser_t *parser) {
// Here, parse the comment to see if it's a magic comment
// and potentially change state on the parser.
- parser_lex_magic_comment(parser, semantic_token_seen);
+ if (!parser_lex_magic_comment(parser, semantic_token_seen) && (parser->current.start == parser->encoding_comment_start)) {
+ ptrdiff_t length = parser->current.end - parser->current.start;
+
+ // If we didn't find a magic comment within the first
+ // pass and we're at the start of the file, then we need
+ // to do another pass to potentially find other patterns
+ // for encoding comments.
+ if (length >= 10) parser_lex_magic_comment_encoding(parser);
+ }
lexed_comment = true;
}
diff --git a/test/prism/magic_comment_test.rb b/test/prism/magic_comment_test.rb
index c40364ccfa..76c4fcb71f 100644
--- a/test/prism/magic_comment_test.rb
+++ b/test/prism/magic_comment_test.rb
@@ -16,7 +16,8 @@ module Prism
"# -*- CoDiNg: ascii -*-",
"# -*- \s\t\v encoding \s\t\v : \s\t\v ascii \s\t\v -*-",
"# -*- foo: bar; encoding: ascii -*-",
- "# coding \t \r \v : \t \v \r ascii-8bit\n"
+ "# coding \t \r \v : \t \v \r ascii-8bit\n",
+ "# vim: filetype=ruby, fileencoding=big5, tabsize=3, shiftwidth=3\n"
]
examples.each do |example|
diff --git a/test/prism/parse_test.rb b/test/prism/parse_test.rb
index aed41b5f84..eada2952df 100644
--- a/test/prism/parse_test.rb
+++ b/test/prism/parse_test.rb
@@ -4,6 +4,21 @@ require_relative "test_helper"
module Prism
class ParseTest < TestCase
+ # A subclass of Ripper that extracts out magic comments.
+ class MagicCommentRipper < Ripper
+ attr_reader :magic_comments
+
+ def initialize(*)
+ super
+ @magic_comments = []
+ end
+
+ def on_magic_comment(key, value)
+ @magic_comments << [key, value]
+ super
+ end
+ end
+
# When we pretty-print the trees to compare against the snapshots, we want to
# be certain that we print with the same external encoding. This is because
# methods like Symbol#inspect take into account external encoding and it could
@@ -159,6 +174,17 @@ module Prism
rescue SyntaxError
raise ArgumentError, "Test file has invalid syntax #{filepath}"
end
+
+ # Next, check that we get the correct number of magic comments when
+ # lexing with ripper.
+ expected = MagicCommentRipper.new(source).tap(&:parse).magic_comments
+ actual = result.magic_comments
+
+ assert_equal expected.length, actual.length
+ expected.zip(actual).each do |(expected_key, expected_value), magic_comment|
+ assert_equal expected_key, magic_comment.key
+ assert_equal expected_value, magic_comment.value
+ end
end
end
end