summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Takashi Kokubun <takashikkbn@gmail.com> 2026-02-09 13:44:42 -0800
committer Takashi Kokubun <takashikkbn@gmail.com> 2026-02-09 13:44:42 -0800
commit 306930ae1ac62fb3b7f96581f4a6e9ab4c083e84 (patch)
tree ba2fa6f33143ef066aa871f8ba27771eec278b46
parent c6d9ba58c50fd9c07023453d71cb55b4b9c36957 (diff)
merge revision(s) 78b7646bdb91285873ac26bca060591e06c45afe, b4a62a1ca949d93332ad8bce0fcc273581160cc5: [Backport #21842]
[PATCH] [Bug #21842] Let `rb_interned_str` return US-ASCII if possible [PATCH] [DOC] Update docs for rb_interned_str and related functions (#15897) Related to [Bug #21842]. * rb_interned_str: document what decides whether the returned string is in US-ASCII or BINARY encoding. * rb_interned_str_cstr: include the same description as rb_interned_str for the encoding. This one was still missing the update for US-ASCII and erroneously said the returned string was always in BINARY encoding. * rb_str_to_interned_str: document how the encoding of the result is defined. Co-authored-by: Herwin <herwinw@users.noreply.github.com>
-rw-r--r-- include/ruby/internal/intern/string.h 14
-rw-r--r-- string.c 10
-rw-r--r-- test/-ext-/string/test_interned_str.rb 5
3 files changed, 22 insertions, 7 deletions
diff --git a/include/ruby/internal/intern/string.h b/include/ruby/internal/intern/string.h
index 75a28143fb..8bd1ffcfb4 100644
--- a/include/ruby/internal/intern/string.h
+++ b/include/ruby/internal/intern/string.h
@@ -412,8 +412,8 @@ VALUE rb_utf8_str_new_static(const char *ptr, long len);
/**
* Identical to rb_interned_str(), except it takes a Ruby's string instead of
- * C's. It can also be seen as a routine identical to rb_str_new_shared(),
- * except it returns an infamous "f"string.
+ * C's and preserves its encoding. It can also be seen as a routine identical
+ * to rb_str_new_shared(), except it returns an infamous "f"string.
*
* @param[in] str An object of ::RString.
* @return An instance of ::rb_cString, either cached or allocated, which
@@ -444,8 +444,9 @@ VALUE rb_str_to_interned_str(VALUE str);
* terminating NUL character.
* @exception rb_eArgError `len` is negative.
* @return A found or created instance of ::rb_cString, of `len` bytes
- * length, of "binary" encoding, whose contents are identical to
- * that of `ptr`.
+ * length, whose contents are identical to that of `ptr`. Its
+ * encoding will be US-ASCII if all bytes are lower ASCII, BINARY
+ * otherwise.
* @pre At least `len` bytes of continuous memory region shall be
* accessible via `ptr`.
*/
@@ -461,8 +462,9 @@ RBIMPL_ATTR_NONNULL(())
*
* @param[in] ptr A C string.
* @exception rb_eNoMemError Failed to allocate memory.
- * @return An instance of ::rb_cString, of "binary" encoding, whose
- * contents are verbatim copy of `ptr`.
+ * @return An instance of ::rb_cString, whose contents are verbatim copy
+ * of `ptr`. Its encoding will be US-ASCII if all bytes are lower
+ * ASCII, BINARY otherwise.
* @pre `ptr` must not be a null pointer.
*/
VALUE rb_interned_str_cstr(const char *ptr);
diff --git a/string.c b/string.c
index b70cee020d..516fc81b93 100644
--- a/string.c
+++ b/string.c
@@ -12705,7 +12705,15 @@ VALUE
rb_interned_str(const char *ptr, long len)
{
struct RString fake_str = {RBASIC_INIT};
- return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
+ int encidx = ENCINDEX_US_ASCII;
+ int coderange = ENC_CODERANGE_7BIT;
+ if (len > 0 && search_nonascii(ptr, ptr + len)) {
+ encidx = ENCINDEX_ASCII_8BIT;
+ coderange = ENC_CODERANGE_VALID;
+ }
+ VALUE str = setup_fake_str(&fake_str, ptr, len, encidx);
+ ENC_CODERANGE_SET(str, coderange);
+ return register_fstring(str, true, false);
}
VALUE
diff --git a/test/-ext-/string/test_interned_str.rb b/test/-ext-/string/test_interned_str.rb
index 340dba41e8..a81cb59aa5 100644
--- a/test/-ext-/string/test_interned_str.rb
+++ b/test/-ext-/string/test_interned_str.rb
@@ -9,4 +9,9 @@ class Test_RbInternedStr < Test::Unit::TestCase
src << "b" * 20
assert_equal "a" * 20, interned_str
end
+
+ def test_interned_str_encoding
+ src = :ascii.name
+ assert_equal Encoding::US_ASCII, Bug::String.rb_interned_str_dup(src).encoding
+ end
end