summary | refs | log | tree | commit | diff
path: root/lib
diff options
context:
space:
mode:
author: Martin Emde <me@martinemde.com> 2024-05-30 21:37:24 -0700
committer: git <svn-admin@ruby-lang.org> 2024-05-31 20:20:03 +0000
commit: bc40d0609177cd60ba1bc4fd915e03e917c0b79c (patch)
tree: 1dd1b74d97bb52f0a98109e6272557ff337dc187 /lib
parent: b5c8fb9a3f1ee8a38a04a3c96b7da5bf6c4c23ac (diff)
[rubygems/rubygems] String search based parsing of compact index versions
This significantly reduces memory usage. https://github.com/rubygems/rubygems/commit/8a76506c90
Diffstat (limited to 'lib')
-rw-r--r-- lib/bundler/compact_index_client.rb | 23
-rw-r--r-- lib/bundler/compact_index_client/parser.rb | 75
2 files changed, 74 insertions, 24 deletions
diff --git a/lib/bundler/compact_index_client.rb b/lib/bundler/compact_index_client.rb
index d53f3ca605..692d68e579 100644
--- a/lib/bundler/compact_index_client.rb
+++ b/lib/bundler/compact_index_client.rb
@@ -4,6 +4,29 @@ require "pathname"
require "set"
module Bundler
+ # The CompactIndexClient is responsible for fetching and parsing the compact index.
+ #
+ # The compact index is a set of caching optimized files that are used to fetch gem information.
+ # The files are:
+ # - names: a list of all gem names
+ # - versions: a list of all gem versions
+ # - info/[gem]: a list of all versions of a gem
+ #
+ # The client is instantiated with:
+ # - `directory`: the root directory where the cache files are stored.
+ # - `fetcher`: (optional) an object that responds to #call(uri_path, headers) and returns an http response.
+ # If the `fetcher` is not provided, the client will only read cached files from disk.
+ #
+ # The client is organized into:
+ # - `Updater`: updates the cached files on disk using the fetcher.
+ # - `Cache`: calls the updater, caches files, and reads and returns them from disk
+ # - `Parser`: parses the compact index file data
+ # - `CacheFile`: a concurrency safe file reader/writer that verifies checksums
+ #
+ # The client is intended to optimize memory usage and performance.
+ # It is called 100s or 1000s of times, parsing files with hundreds of thousands of lines.
+ # It may be called concurrently without a global interpreter lock in some Rubies.
+ # As a result, some methods may look more complex than necessary to save memory or time.
class CompactIndexClient
# NOTE: MD5 is here not because we expect a server to respond with it, but
# because we use it to generate the etag on first request during the upgrade
diff --git a/lib/bundler/compact_index_client/parser.rb b/lib/bundler/compact_index_client/parser.rb
index a227bc2cfd..3a0dec4907 100644
--- a/lib/bundler/compact_index_client/parser.rb
+++ b/lib/bundler/compact_index_client/parser.rb
@@ -10,6 +10,8 @@ module Bundler
@info_checksums = nil
@versions_by_name = nil
@available = nil
+ @gem_parser = nil
+ @versions_data = nil
end
def names
@@ -38,46 +40,71 @@ module Bundler
end
def info(name)
- data = @compact_index.info(name, info_checksums[name])
+ data = @compact_index.info(name, info_checksum(name))
lines(data).map {|line| gem_parser.parse(line).unshift(name) }
end
+ # parse the last, most recently updated line of the versions file to determine availability
def available?
return @available unless @available.nil?
- @available = !info_checksums.empty?
+ return @available = false unless versions_data&.size&.nonzero?
+
+ line_end = versions_data.size - 1
+ return @available = false if versions_data[line_end] != "\n"
+
+ line_start = versions_data.rindex("\n", line_end - 1)
+ line_start ||= -1 # allow a single line versions file
+
+ @available = !split_last_word(versions_data, line_start + 1, line_end).nil?
end
private
- def info_checksums
- @info_checksums ||= lines(@compact_index.versions).each_with_object({}) do |line, checksums|
- parse_version_checksum(line, checksums)
+ # Search for a line starting with gem name, then return last space-separated word (the checksum)
+ def info_checksum(name)
+ return unless versions_data
+ return unless (line_start = rindex_of_gem(name))
+ return unless (line_end = versions_data.index("\n", line_start))
+ split_last_word(versions_data, line_start, line_end)
+ end
+
+ def gem_parser
+ @gem_parser ||= GemParser.new
+ end
+
+ def versions_data
+ @versions_data ||= begin
+ data = @compact_index.versions
+ strip_header!(data) if data
+ data.freeze
end
end
- def lines(data)
- return [] if data.nil? || data.empty?
- lines = data.split("\n")
- header = lines.index("---")
- header ? lines[header + 1..-1] : lines
+ def rindex_of_gem(name)
+ if (pos = versions_data.rindex("\n#{name} "))
+ pos + 1
+ elsif versions_data.start_with?("#{name} ")
+ 0
+ end
end
- def gem_parser
- @gem_parser ||= GemParser.new
+ # This is similar to `string.split(" ").last` but it avoids allocating extra objects.
+ def split_last_word(string, line_start, line_end)
+ return unless line_start < line_end && line_start >= 0
+ word_start = string.rindex(" ", line_end).to_i + 1
+ return if word_start < line_start
+ string[word_start, line_end - word_start]
+ end
+
+ def lines(string)
+ return [] if string.nil? || string.empty?
+ strip_header!(string)
+ string.split("\n")
end
- # This is mostly the same as `split(" ", 3)` but it avoids allocating extra objects.
- # This method gets called at least once for every gem when parsing versions.
- def parse_version_checksum(line, checksums)
- line.freeze # allows slicing into the string to not allocate a copy of the line
- name_end = line.index(" ")
- checksum_start = line.index(" ", name_end + 1) + 1
- checksum_end = line.size - checksum_start
- # freeze name since it is used as a hash key
- # pre-freezing means a frozen copy isn't created
- name = line[0, name_end].freeze
- checksum = line[checksum_start, checksum_end]
- checksums[name] = checksum
+ def strip_header!(string)
+ header_end = string.index("---\n")
+ string.slice!(0, header_end + 4) if header_end
end
end
end