From dc90c26a103ad62df73464cc1896edbcc90bd0c7 Mon Sep 17 00:00:00 2001 From: Earlopain <14981592+Earlopain@users.noreply.github.com> Date: Fri, 8 May 2026 21:04:08 +0200 Subject: [ruby/prism] Respect `encoding` option in `Prism.lex` and friends utf-8 is the default for source files but can be overridden via options https://github.com/ruby/prism/commit/355f451528 --- prism/extension.c | 2 +- test/prism/lex_test.rb | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/prism/extension.c b/prism/extension.c index 9f9169cfff..27df8dac50 100644 --- a/prism/extension.c +++ b/prism/extension.c @@ -793,7 +793,7 @@ parse_lex_input(const uint8_t *input, size_t input_length, const pm_options_t *o parse_lex_data_t parse_lex_data = { .source = source, .tokens = rb_ary_new(), - .encoding = rb_utf8_encoding(), + .encoding = rb_enc_find(pm_parser_encoding_name(parser)), .freeze = pm_options_freeze(options), }; diff --git a/test/prism/lex_test.rb b/test/prism/lex_test.rb index 8ea7ce7e9b..1e06d52184 100644 --- a/test/prism/lex_test.rb +++ b/test/prism/lex_test.rb @@ -47,6 +47,24 @@ module Prism end end + def test_lex_encoding + tokens = Prism.lex('"わたし"', encoding: Encoding::Windows_31J).value + tokens.each do |t| + assert_equal(Encoding::Windows_31J, t[0].value.encoding) + end + + # Shebangs must appear on the first line. For these cases, the encoding + # comment may appear second, but it should still change encoding. + tokens = Prism.lex(<<~RUBY, encoding: Encoding::Windows_31J).value + #! /usr/bin/env ruby + # encoding: utf-8 + "わたし" + RUBY + tokens.each do |t| + assert_equal(Encoding::UTF_8, t[0].value.encoding) + end + end + if RUBY_VERSION >= "3.3" def test_lex_compat source = "foo bar" -- cgit v1.2.3