diff options
| author | Jean Boussier <jean.boussier@gmail.com> | 2026-01-18 10:33:54 +0100 |
|---|---|---|
| committer | Jean Boussier <jean.boussier@gmail.com> | 2026-01-18 16:31:31 +0100 |
| commit | 6cd4549060a608d8a7e5ee0dde2c4b69b08d7f6e (patch) | |
| tree | 17cd606e1d3ecd918d00c515126f29b0b3456a3e /string.c | |
| parent | d1dc4bdb2fe7f16e6da78c0930353e4a5031465a (diff) | |
Optimize File.join common use case
`File.join` is a hotspot for common libraries such as Zeitwerk
and Bootsnap. It has a fairly flexible signature, but 99% of
the time it's called with just two (or a small number of) UTF-8 strings.
If we optimistically optimize for that use case we can cut down a large
number of type and encoding checks, significantly speeding up the method.
The one remaining expensive check we could try to optimize is `str_null_check`.
Given it's common to use the same base string for joining, we could memoize its result.
Also, we could precompute it for literal strings.
```
compare-ruby: ruby 4.1.0dev (2026-01-17T14:40:03Z master 00a3b71eaf) +PRISM [arm64-darwin25]
built-ruby: ruby 4.1.0dev (2026-01-18T12:10:38Z spedup-file-join 069bab58d4) +PRISM [arm64-darwin25]
warming up....
| |compare-ruby|built-ruby|
|:-------------|-----------:|---------:|
|two_strings | 2.475M| 9.444M|
| | -| 3.82x|
|many_strings | 551.975k| 2.346M|
| | -| 4.25x|
|array | 514.946k| 522.034k|
| | -| 1.01x|
|mixed | 621.236k| 633.189k|
| | -| 1.02x|
```
Diffstat (limited to 'string.c')
| -rw-r--r-- | string.c | 47 |
1 file changed, 16 insertions, 31 deletions
```diff
@@ -146,27 +146,7 @@ VALUE rb_cSymbol;
     RSTRING(str)->len = (n); \
 } while (0)
 
-static inline bool
-str_encindex_fastpath(int encindex)
-{
-    // The overwhelming majority of strings are in one of these 3 encodings.
-    switch (encindex) {
-      case ENCINDEX_ASCII_8BIT:
-      case ENCINDEX_UTF_8:
-      case ENCINDEX_US_ASCII:
-        return true;
-      default:
-        return false;
-    }
-}
-
-static inline bool
-str_enc_fastpath(VALUE str)
-{
-    return str_encindex_fastpath(ENCODING_GET_INLINED(str));
-}
-
-#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
+#define TERM_LEN(str) (rb_str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
 #define TERM_FILL(ptr, termlen) do {\
     char *const term_fill_ptr = (ptr);\
     const int term_fill_len = (termlen);\
@@ -960,7 +940,7 @@ static inline bool
 rb_enc_str_asciicompat(VALUE str)
 {
     int encindex = ENCODING_GET_INLINED(str);
-    return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
+    return rb_str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
 }
 
 int
@@ -2796,7 +2776,7 @@ rb_must_asciicompat(VALUE str)
         rb_raise(rb_eTypeError, "not encoding capable object");
     }
 
-    if (RB_LIKELY(str_encindex_fastpath(encindex))) {
+    if (RB_LIKELY(rb_str_encindex_fastpath(encindex))) {
         return;
     }
 
@@ -2897,16 +2877,21 @@ str_null_check(VALUE str, int *w)
 {
     char *s = RSTRING_PTR(str);
     long len = RSTRING_LEN(str);
-    rb_encoding *enc = rb_enc_get(str);
-    const int minlen = rb_enc_mbminlen(enc);
+    int minlen = 1;
+
+    if (RB_UNLIKELY(!rb_str_enc_fastpath(str))) {
+        rb_encoding *enc = rb_enc_get(str);
+        minlen = rb_enc_mbminlen(enc);
 
-    if (minlen > 1) {
-        *w = 1;
-        if (str_null_char(s, len, minlen, enc)) {
-            return NULL;
+        if (minlen > 1) {
+            *w = 1;
+            if (str_null_char(s, len, minlen, enc)) {
+                return NULL;
+            }
+            return str_fill_term(str, s, len, minlen);
         }
-        return str_fill_term(str, s, len, minlen);
     }
+
     *w = 0;
     if (!s || memchr(s, 0, len)) {
         return NULL;
@@ -3765,7 +3750,7 @@ rb_str_buf_append(VALUE str, VALUE str2)
 {
     int str2_cr = rb_enc_str_coderange(str2);
 
-    if (str_enc_fastpath(str)) {
+    if (rb_str_enc_fastpath(str)) {
         switch (str2_cr) {
           case ENC_CODERANGE_7BIT:
             // If RHS is 7bit we can do simple concatenation
```
