diff options
| author | Jean Boussier <jean.boussier@gmail.com> | 2026-01-20 17:59:15 +0100 |
|---|---|---|
| committer | Jean Boussier <jean.boussier@gmail.com> | 2026-01-21 11:23:01 +0100 |
| commit | 519a4bdbc19693be9020419eb4dea9e66a256a41 (patch) | |
| tree | a801eb1c39de0922c529fead5e4d545606d80250 | |
| parent | 0f1eea09490287cf9ed907f7cd84398b03074d98 (diff) | |
Optimize File.basename
The actual algorithm is largely unchanged, just allowed to use
singlebyte checks for common encodings.
It could certainly be optimized much further, as here again it often
scans from the front of the string when we're interested in the back of
it. But the algorithm as many Windows only corner cases so I'd rather
ship a good improvement now and eventually come back to it later.
Most of improvement here is from the reduced setup cost (avodi double
null checks, avoid duping the argument, etc), and skipping the multi-byte
checks.
```
compare-ruby: ruby 4.1.0dev (2026-01-19T03:51:30Z master 631bf19b37) +PRISM [arm64-darwin25]
built-ruby: ruby 4.1.0dev (2026-01-21T08:21:05Z opt-basename 7eb11745b2) +PRISM [arm64-darwin25]
```
| |compare-ruby|built-ruby|
|:----------|-----------:|---------:|
|long | 3.412M| 18.158M|
| | -| 5.32x|
|long_name | 1.981M| 8.580M|
| | -| 4.33x|
|withext | 3.200M| 12.986M|
| | -| 4.06x|
| -rw-r--r-- | benchmark/file_basename.yml | 6 | ||||
| -rw-r--r-- | file.c | 88 | ||||
| -rw-r--r-- | internal/string.h | 8 | ||||
| -rw-r--r-- | spec/ruby/core/file/basename_spec.rb | 26 |
4 files changed, 97 insertions, 31 deletions
diff --git a/benchmark/file_basename.yml b/benchmark/file_basename.yml new file mode 100644 index 0000000000..fbd78785aa --- /dev/null +++ b/benchmark/file_basename.yml @@ -0,0 +1,6 @@ +prelude: | + # frozen_string_literal: true +benchmark: + long: File.basename("/Users/george/src/github.com/ruby/ruby/benchmark/file_dirname.yml") + long_name: File.basename("Users_george_src_github.com_ruby_ruby_benchmark_file_dirname.yml") + withext: File.basename("/Users/george/src/github.com/ruby/ruby/benchmark/file_dirname.yml", ".yml") @@ -3749,7 +3749,7 @@ strrdirsep(const char *path, const char *end, bool mb_enc, rb_encoding *enc) } static char * -chompdirsep(const char *path, const char *end, rb_encoding *enc) +chompdirsep(const char *path, const char *end, bool mb_enc, rb_encoding *enc) { while (path < end) { if (isdirsep(*path)) { @@ -3758,7 +3758,7 @@ chompdirsep(const char *path, const char *end, rb_encoding *enc) if (path >= end) return (char *)last; } else { - Inc(path, end, true, enc); + Inc(path, end, mb_enc, enc); } } return (char *)path; @@ -3768,7 +3768,7 @@ char * rb_enc_path_end(const char *path, const char *end, rb_encoding *enc) { if (path < end && isdirsep(*path)) path++; - return chompdirsep(path, end, enc); + return chompdirsep(path, end, true, enc); } static rb_encoding * @@ -4088,7 +4088,7 @@ rb_file_expand_path_internal(VALUE fname, VALUE dname, int abs_mode, int long_na rb_enc_associate(result, enc = fs_enc_check(result, fname)); p = pend; } - p = chompdirsep(skiproot(buf, p), p, enc); + p = chompdirsep(skiproot(buf, p), p, true, enc); s += 2; } } @@ -4113,7 +4113,7 @@ rb_file_expand_path_internal(VALUE fname, VALUE dname, int abs_mode, int long_na } else #endif /* defined DOSISH || defined __CYGWIN__ */ - p = chompdirsep(skiproot(buf, p), p, enc); + p = chompdirsep(skiproot(buf, p), p, true, enc); } else { size_t len; @@ -4656,7 +4656,7 @@ rb_check_realpath_emulate(VALUE basedir, VALUE path, rb_encoding *origenc, enum root_found: RSTRING_GETMEM(resolved, prefixptr, prefixlen); pend = prefixptr + prefixlen; - ptr = chompdirsep(prefixptr, pend, enc); + ptr = chompdirsep(prefixptr, pend, true, enc); if (ptr < pend) { prefixlen = ++ptr - prefixptr; rb_str_set_len(resolved, prefixlen); @@ -4910,8 +4910,8 @@ rmext(const char *p, long l0, long l1, const char *e, long l2, rb_encoding *enc) return 0; } -const char * -ruby_enc_find_basename(const char *name, long *baselen, long *alllen, rb_encoding *enc) +static inline const char * +enc_find_basename(const char *name, long *baselen, long *alllen, bool mb_enc, rb_encoding *enc) { const char *p, *q, *e, *end; #if defined DOSISH_DRIVE_LETTER || defined DOSISH_UNC @@ -4919,13 +4919,22 @@ ruby_enc_find_basename(const char *name, long *baselen, long *alllen, rb_encodin #endif long f = 0, n = -1; - end = name + (alllen ? (size_t)*alllen : strlen(name)); - name = skipprefix(name, end, true, enc); + long len = (alllen ? (size_t)*alllen : strlen(name)); + + if (len <= 0) { + return name; + } + + end = name + len; + name = skipprefix(name, end, mb_enc, enc); #if defined DOSISH_DRIVE_LETTER || defined DOSISH_UNC root = name; #endif - while (isdirsep(*name)) + + while (isdirsep(*name)) { name++; + } + if (!*name) { p = name - 1; f = 1; @@ -4947,32 +4956,47 @@ ruby_enc_find_basename(const char *name, long *baselen, long *alllen, rb_encodin #endif /* defined DOSISH_DRIVE_LETTER || defined DOSISH_UNC */ } else { - if (!(p = strrdirsep(name, end, true, enc))) { + p = strrdirsep(name, end, mb_enc, enc); + if (!p) { p = name; } else { - while (isdirsep(*p)) p++; /* skip last / */ + while (isdirsep(*p)) { + p++; /* skip last / */ + } } #if USE_NTFS n = ntfs_tail(p, end, enc) - p; #else - n = chompdirsep(p, end, enc) - p; + n = chompdirsep(p, end, mb_enc, enc) - p; #endif for (q = p; q - p < n && *q == '.'; q++); - for (e = 0; q - p < n; Inc(q, end, true, enc)) { + for (e = 0; q - p < n; Inc(q, end, mb_enc, enc)) { if (*q == '.') e = q; } - if (e) f = e - p; - else f = n; + if (e) { + f = e - p; + } + else { + f = n; + } } - if (baselen) + if (baselen) { *baselen = f; - if (alllen) + } + if (alllen) { *alllen = n; + } return p; } +const char * +ruby_enc_find_basename(const char *name, long *baselen, long *alllen, rb_encoding *enc) +{ + return enc_find_basename(name, baselen, alllen, true, enc); +} + /* * call-seq: * File.basename(file_name [, suffix] ) -> base_name @@ -4993,7 +5017,7 @@ ruby_enc_find_basename(const char *name, long *baselen, long *alllen, rb_encodin static VALUE rb_file_s_basename(int argc, VALUE *argv, VALUE _) { - VALUE fname, fext, basename; + VALUE fname, fext; const char *name, *p; long f, n; rb_encoding *enc; @@ -5006,15 +5030,19 @@ rb_file_s_basename(int argc, VALUE *argv, VALUE _) enc = rb_str_enc_get(fext); } fname = argv[0]; - FilePathStringValue(fname); + CheckPath(fname, name); if (NIL_P(fext) || !(enc = rb_enc_compatible(fname, fext))) { - enc = rb_enc_get(fname); + enc = rb_str_enc_get(fname); fext = Qnil; } - if ((n = RSTRING_LEN(fname)) == 0 || !*(name = RSTRING_PTR(fname))) - return rb_str_new_shared(fname); - p = ruby_enc_find_basename(name, &f, &n, enc); + n = RSTRING_LEN(fname); + if (n == 0 || !*name) { + rb_enc_str_new(0, 0, enc); + } + + bool mb_enc = !rb_str_encindex_fastpath(rb_enc_to_index(enc)); + p = enc_find_basename(name, &f, &n, mb_enc, enc); if (n >= 0) { if (NIL_P(fext)) { f = n; @@ -5027,12 +5055,12 @@ rb_file_s_basename(int argc, VALUE *argv, VALUE _) } RB_GC_GUARD(fext); } - if (f == RSTRING_LEN(fname)) return rb_str_new_shared(fname); + if (f == RSTRING_LEN(fname)) { + return rb_str_new_shared(fname); + } } - basename = rb_str_new(p, f); - rb_enc_copy(basename, fname); - return basename; + return rb_enc_str_new(p, f, enc); } static VALUE rb_file_dirname_n(VALUE fname, int n); @@ -5350,7 +5378,7 @@ rb_file_join_ary(VALUE ary) rb_enc_copy(result, tmp); } else { - tail = chompdirsep(name, name + len, rb_enc_get(result)); + tail = chompdirsep(name, name + len, true, rb_enc_get(result)); if (RSTRING_PTR(tmp) && isdirsep(RSTRING_PTR(tmp)[0])) { rb_str_set_len(result, tail - name); } diff --git a/internal/string.h b/internal/string.h index dd5e20c0c6..9212ce8986 100644 --- a/internal/string.h +++ b/internal/string.h @@ -33,7 +33,13 @@ enum ruby_rstring_private_flags { static inline bool rb_str_encindex_fastpath(int encindex) { - // The overwhelming majority of strings are in one of these 3 encodings. + // The overwhelming majority of strings are in one of these 3 encodings, + // which are all either ASCII or perfect ASCII supersets. + // Hence you can use fast, single byte algorithms on them, such as `memchr` etc, + // without all the overhead of fetching the rb_encoding and using functions such as + // rb_enc_mbminlen etc. + // Many other encodings could qualify, but they are expected to be rare occurences, + // so it's better to keep that list small. switch (encindex) { case ENCINDEX_ASCII_8BIT: case ENCINDEX_UTF_8: diff --git a/spec/ruby/core/file/basename_spec.rb b/spec/ruby/core/file/basename_spec.rb index 989409d76b..87695ab97b 100644 --- a/spec/ruby/core/file/basename_spec.rb +++ b/spec/ruby/core/file/basename_spec.rb @@ -151,8 +151,34 @@ describe "File.basename" do File.basename("c:\\bar.txt", ".*").should == "bar" File.basename("c:\\bar.txt.exe", ".*").should == "bar.txt" end + + it "handles Shift JIS 0x5C (\\) as second byte of a multi-byte sequence" do + # dir\fileソname.txt + path = "dir\\file\x83\x5cname.txt".b.force_encoding(Encoding::SHIFT_JIS) + path.valid_encoding?.should be_true + File.basename(path).should == "file\x83\x5cname.txt".b.force_encoding(Encoding::SHIFT_JIS) + end end + it "rejects strings encoded with non ASCII-compatible encodings" do + Encoding.list.reject(&:ascii_compatible?).reject(&:dummy?).each do |enc| + begin + path = "/foo/bar".encode(enc) + rescue Encoding::ConverterNotFoundError + next + end + + -> { + File.basename(path) + }.should raise_error(Encoding::CompatibilityError) + end + end + + it "works with all ASCII-compatible encodings" do + Encoding.list.select(&:ascii_compatible?).each do |enc| + File.basename("/foo/bar".encode(enc)).should == "bar".encode(enc) + end + end it "returns the extension for a multibyte filename" do File.basename('/path/Офис.m4a').should == "Офис.m4a" |
