summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJean Boussier <jean.boussier@gmail.com>2026-01-20 17:59:15 +0100
committerJean Boussier <jean.boussier@gmail.com>2026-01-21 11:23:01 +0100
commit519a4bdbc19693be9020419eb4dea9e66a256a41 (patch)
treea801eb1c39de0922c529fead5e4d545606d80250
parent0f1eea09490287cf9ed907f7cd84398b03074d98 (diff)
Optimize File.basename
The actual algorithm is largely unchanged, just allowed to use singlebyte checks for common encodings. It could certainly be optimized much further, as here again it often scans from the front of the string when we're interested in the back of it. But the algorithm as many Windows only corner cases so I'd rather ship a good improvement now and eventually come back to it later. Most of improvement here is from the reduced setup cost (avodi double null checks, avoid duping the argument, etc), and skipping the multi-byte checks. ``` compare-ruby: ruby 4.1.0dev (2026-01-19T03:51:30Z master 631bf19b37) +PRISM [arm64-darwin25] built-ruby: ruby 4.1.0dev (2026-01-21T08:21:05Z opt-basename 7eb11745b2) +PRISM [arm64-darwin25] ``` | |compare-ruby|built-ruby| |:----------|-----------:|---------:| |long | 3.412M| 18.158M| | | -| 5.32x| |long_name | 1.981M| 8.580M| | | -| 4.33x| |withext | 3.200M| 12.986M| | | -| 4.06x|
-rw-r--r--benchmark/file_basename.yml6
-rw-r--r--file.c88
-rw-r--r--internal/string.h8
-rw-r--r--spec/ruby/core/file/basename_spec.rb26
4 files changed, 97 insertions, 31 deletions
diff --git a/benchmark/file_basename.yml b/benchmark/file_basename.yml
new file mode 100644
index 0000000000..fbd78785aa
--- /dev/null
+++ b/benchmark/file_basename.yml
@@ -0,0 +1,6 @@
+prelude: |
+ # frozen_string_literal: true
+benchmark:
+ long: File.basename("/Users/george/src/github.com/ruby/ruby/benchmark/file_dirname.yml")
+ long_name: File.basename("Users_george_src_github.com_ruby_ruby_benchmark_file_dirname.yml")
+ withext: File.basename("/Users/george/src/github.com/ruby/ruby/benchmark/file_dirname.yml", ".yml")
diff --git a/file.c b/file.c
index a98bb9728e..c23e92d22e 100644
--- a/file.c
+++ b/file.c
@@ -3749,7 +3749,7 @@ strrdirsep(const char *path, const char *end, bool mb_enc, rb_encoding *enc)
}
static char *
-chompdirsep(const char *path, const char *end, rb_encoding *enc)
+chompdirsep(const char *path, const char *end, bool mb_enc, rb_encoding *enc)
{
while (path < end) {
if (isdirsep(*path)) {
@@ -3758,7 +3758,7 @@ chompdirsep(const char *path, const char *end, rb_encoding *enc)
if (path >= end) return (char *)last;
}
else {
- Inc(path, end, true, enc);
+ Inc(path, end, mb_enc, enc);
}
}
return (char *)path;
@@ -3768,7 +3768,7 @@ char *
rb_enc_path_end(const char *path, const char *end, rb_encoding *enc)
{
if (path < end && isdirsep(*path)) path++;
- return chompdirsep(path, end, enc);
+ return chompdirsep(path, end, true, enc);
}
static rb_encoding *
@@ -4088,7 +4088,7 @@ rb_file_expand_path_internal(VALUE fname, VALUE dname, int abs_mode, int long_na
rb_enc_associate(result, enc = fs_enc_check(result, fname));
p = pend;
}
- p = chompdirsep(skiproot(buf, p), p, enc);
+ p = chompdirsep(skiproot(buf, p), p, true, enc);
s += 2;
}
}
@@ -4113,7 +4113,7 @@ rb_file_expand_path_internal(VALUE fname, VALUE dname, int abs_mode, int long_na
}
else
#endif /* defined DOSISH || defined __CYGWIN__ */
- p = chompdirsep(skiproot(buf, p), p, enc);
+ p = chompdirsep(skiproot(buf, p), p, true, enc);
}
else {
size_t len;
@@ -4656,7 +4656,7 @@ rb_check_realpath_emulate(VALUE basedir, VALUE path, rb_encoding *origenc, enum
root_found:
RSTRING_GETMEM(resolved, prefixptr, prefixlen);
pend = prefixptr + prefixlen;
- ptr = chompdirsep(prefixptr, pend, enc);
+ ptr = chompdirsep(prefixptr, pend, true, enc);
if (ptr < pend) {
prefixlen = ++ptr - prefixptr;
rb_str_set_len(resolved, prefixlen);
@@ -4910,8 +4910,8 @@ rmext(const char *p, long l0, long l1, const char *e, long l2, rb_encoding *enc)
return 0;
}
-const char *
-ruby_enc_find_basename(const char *name, long *baselen, long *alllen, rb_encoding *enc)
+static inline const char *
+enc_find_basename(const char *name, long *baselen, long *alllen, bool mb_enc, rb_encoding *enc)
{
const char *p, *q, *e, *end;
#if defined DOSISH_DRIVE_LETTER || defined DOSISH_UNC
@@ -4919,13 +4919,22 @@ ruby_enc_find_basename(const char *name, long *baselen, long *alllen, rb_encodin
#endif
long f = 0, n = -1;
- end = name + (alllen ? (size_t)*alllen : strlen(name));
- name = skipprefix(name, end, true, enc);
+ long len = (alllen ? (size_t)*alllen : strlen(name));
+
+ if (len <= 0) {
+ return name;
+ }
+
+ end = name + len;
+ name = skipprefix(name, end, mb_enc, enc);
#if defined DOSISH_DRIVE_LETTER || defined DOSISH_UNC
root = name;
#endif
- while (isdirsep(*name))
+
+ while (isdirsep(*name)) {
name++;
+ }
+
if (!*name) {
p = name - 1;
f = 1;
@@ -4947,32 +4956,47 @@ ruby_enc_find_basename(const char *name, long *baselen, long *alllen, rb_encodin
#endif /* defined DOSISH_DRIVE_LETTER || defined DOSISH_UNC */
}
else {
- if (!(p = strrdirsep(name, end, true, enc))) {
+ p = strrdirsep(name, end, mb_enc, enc);
+ if (!p) {
p = name;
}
else {
- while (isdirsep(*p)) p++; /* skip last / */
+ while (isdirsep(*p)) {
+ p++; /* skip last / */
+ }
}
#if USE_NTFS
n = ntfs_tail(p, end, enc) - p;
#else
- n = chompdirsep(p, end, enc) - p;
+ n = chompdirsep(p, end, mb_enc, enc) - p;
#endif
for (q = p; q - p < n && *q == '.'; q++);
- for (e = 0; q - p < n; Inc(q, end, true, enc)) {
+ for (e = 0; q - p < n; Inc(q, end, mb_enc, enc)) {
if (*q == '.') e = q;
}
- if (e) f = e - p;
- else f = n;
+ if (e) {
+ f = e - p;
+ }
+ else {
+ f = n;
+ }
}
- if (baselen)
+ if (baselen) {
*baselen = f;
- if (alllen)
+ }
+ if (alllen) {
*alllen = n;
+ }
return p;
}
+const char *
+ruby_enc_find_basename(const char *name, long *baselen, long *alllen, rb_encoding *enc)
+{
+ return enc_find_basename(name, baselen, alllen, true, enc);
+}
+
/*
* call-seq:
* File.basename(file_name [, suffix] ) -> base_name
@@ -4993,7 +5017,7 @@ ruby_enc_find_basename(const char *name, long *baselen, long *alllen, rb_encodin
static VALUE
rb_file_s_basename(int argc, VALUE *argv, VALUE _)
{
- VALUE fname, fext, basename;
+ VALUE fname, fext;
const char *name, *p;
long f, n;
rb_encoding *enc;
@@ -5006,15 +5030,19 @@ rb_file_s_basename(int argc, VALUE *argv, VALUE _)
enc = rb_str_enc_get(fext);
}
fname = argv[0];
- FilePathStringValue(fname);
+ CheckPath(fname, name);
if (NIL_P(fext) || !(enc = rb_enc_compatible(fname, fext))) {
- enc = rb_enc_get(fname);
+ enc = rb_str_enc_get(fname);
fext = Qnil;
}
- if ((n = RSTRING_LEN(fname)) == 0 || !*(name = RSTRING_PTR(fname)))
- return rb_str_new_shared(fname);
- p = ruby_enc_find_basename(name, &f, &n, enc);
+ n = RSTRING_LEN(fname);
+ if (n == 0 || !*name) {
+ rb_enc_str_new(0, 0, enc);
+ }
+
+ bool mb_enc = !rb_str_encindex_fastpath(rb_enc_to_index(enc));
+ p = enc_find_basename(name, &f, &n, mb_enc, enc);
if (n >= 0) {
if (NIL_P(fext)) {
f = n;
@@ -5027,12 +5055,12 @@ rb_file_s_basename(int argc, VALUE *argv, VALUE _)
}
RB_GC_GUARD(fext);
}
- if (f == RSTRING_LEN(fname)) return rb_str_new_shared(fname);
+ if (f == RSTRING_LEN(fname)) {
+ return rb_str_new_shared(fname);
+ }
}
- basename = rb_str_new(p, f);
- rb_enc_copy(basename, fname);
- return basename;
+ return rb_enc_str_new(p, f, enc);
}
static VALUE rb_file_dirname_n(VALUE fname, int n);
@@ -5350,7 +5378,7 @@ rb_file_join_ary(VALUE ary)
rb_enc_copy(result, tmp);
}
else {
- tail = chompdirsep(name, name + len, rb_enc_get(result));
+ tail = chompdirsep(name, name + len, true, rb_enc_get(result));
if (RSTRING_PTR(tmp) && isdirsep(RSTRING_PTR(tmp)[0])) {
rb_str_set_len(result, tail - name);
}
diff --git a/internal/string.h b/internal/string.h
index dd5e20c0c6..9212ce8986 100644
--- a/internal/string.h
+++ b/internal/string.h
@@ -33,7 +33,13 @@ enum ruby_rstring_private_flags {
static inline bool
rb_str_encindex_fastpath(int encindex)
{
- // The overwhelming majority of strings are in one of these 3 encodings.
+ // The overwhelming majority of strings are in one of these 3 encodings,
+ // which are all either ASCII or perfect ASCII supersets.
+ // Hence you can use fast, single byte algorithms on them, such as `memchr` etc,
+ // without all the overhead of fetching the rb_encoding and using functions such as
+ // rb_enc_mbminlen etc.
+ // Many other encodings could qualify, but they are expected to be rare occurences,
+ // so it's better to keep that list small.
switch (encindex) {
case ENCINDEX_ASCII_8BIT:
case ENCINDEX_UTF_8:
diff --git a/spec/ruby/core/file/basename_spec.rb b/spec/ruby/core/file/basename_spec.rb
index 989409d76b..87695ab97b 100644
--- a/spec/ruby/core/file/basename_spec.rb
+++ b/spec/ruby/core/file/basename_spec.rb
@@ -151,8 +151,34 @@ describe "File.basename" do
File.basename("c:\\bar.txt", ".*").should == "bar"
File.basename("c:\\bar.txt.exe", ".*").should == "bar.txt"
end
+
+ it "handles Shift JIS 0x5C (\\) as second byte of a multi-byte sequence" do
+ # dir\fileソname.txt
+ path = "dir\\file\x83\x5cname.txt".b.force_encoding(Encoding::SHIFT_JIS)
+ path.valid_encoding?.should be_true
+ File.basename(path).should == "file\x83\x5cname.txt".b.force_encoding(Encoding::SHIFT_JIS)
+ end
end
+ it "rejects strings encoded with non ASCII-compatible encodings" do
+ Encoding.list.reject(&:ascii_compatible?).reject(&:dummy?).each do |enc|
+ begin
+ path = "/foo/bar".encode(enc)
+ rescue Encoding::ConverterNotFoundError
+ next
+ end
+
+ -> {
+ File.basename(path)
+ }.should raise_error(Encoding::CompatibilityError)
+ end
+ end
+
+ it "works with all ASCII-compatible encodings" do
+ Encoding.list.select(&:ascii_compatible?).each do |enc|
+ File.basename("/foo/bar".encode(enc)).should == "bar".encode(enc)
+ end
+ end
it "returns the extension for a multibyte filename" do
File.basename('/path/Офис.m4a').should == "Офис.m4a"