From 4a0072d5f29befde814ea0d9a83c711e1f049564 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Chris=20Hasi=C5=84ski?= Date: Fri, 8 May 2026 19:10:12 +0200 Subject: Speed up Integer#to_s with a two digit lookup table (#16719) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * numeric: emit two decimal digits per iteration in rb_fix2str Replace the digit-at-a-time loop in rb_fix2str with the standard itoa 2-digit lookup table for base 10. Each iteration now writes two digits using a single (u % 100, u / 100) pair, so the number of loop iterations is halved for multi-digit integers. The classic per-digit loop is kept for non-base-10 conversion. Benchmark (Apple M-series, 5M-10M ops, best of 3 runs): case base patch delta --------- ----- ----- ----- 1-digit (5) 64 ns/op 64 ns/op -0% 2-digit (42) 64 ns/op 65 ns/op +2% (noise) 3-digit (400) 66 ns/op 64 ns/op -3% 5-digit (12345) 69 ns/op 67 ns/op -3% 10-digit (1234567890) 77 ns/op 67 ns/op -13% 19-digit (2^62-1) 111 ns/op 75 ns/op -33% The crossover is at ~3 digits: below that the constant setup dominates and the benefit is within noise, above that the halved iteration count shows up linearly. Typical Rails payloads mix short IDs (1-5 digits) and longer values (timestamps, nanos, large counts), so the win is workload-dependent but strictly non-negative for real code. Correctness: 100k random fuzz across the full fixnum range plus targeted edges (0, ±1, ±99, ±100, 2^30-1, 2^62-1, etc.) all pass. make test-all shows 34694 tests, 7325860 assertions, 0 new failures (same pre-existing TestArgf#test_puts flake as on master) — test_integer.rb alone runs 38 tests / 421628 assertions of which Integer#to_s exercises the bulk, all pass. The 200-byte lookup table sits in .rodata and fits in a single cache line of its own (3 lines for the whole table). No change to public API, no change to bignum conversion, no change to non-base-10 conversion paths. * bignum: emit two decimal digits per iteration in big2str_2bdigits Extend the 2-digit lookup-table itoa optimisation from rb_fix2str to the inner conversion loop used by Bignum#to_s. big2str_2bdigits has two code paths — a leading-chunk path that emits variable-length digits, and a recursive-chunk path that emits a fixed-width zero- padded block — and both gain from the halved division count. The classic per-digit loop is preserved for non-base-10 conversion. Moves the ruby_decimal_digit_pairs table from a file-static in numeric.c to bignum.c next to ruby_digitmap, and exposes it through internal/bignum.h so both files share the same 200-byte .rodata instance. Benchmark (Apple M-series, best of 3 runs, measures bignum-only speedup against the preceding fixnum commit): case base patch delta --------- ----- ----- ----- big_20dig 10^19+... 146 ns/op 124 ns/op -15% big_40dig 10^39+... 174 ns/op 152 ns/op -13% big_100dig 10^99+42 236 ns/op 213 ns/op -10% big_500dig 10^499+7 1119 ns/op 1086 ns/op -3% big_1000dig 10^999 3490 ns/op 3459 ns/op -1% fix_19dig 2^62-1 76 ns/op 76 ns/op 0% (unchanged path) Wins concentrate in the 20-100 digit range where big2str_2bdigits is the dominant cost. Above ~500 digits the Karatsuba divmod recursion dominates and the digit-emission saving shrinks to the noise floor. The 20-100 range is what actual Ruby code exercises (financial high-precision sums, nanosecond timestamps, large counters); crypto-size (1000+ digit) bignums are rare in to_s paths. Correctness: 100k random fixnum fuzz unchanged, 500 random bignum fuzz up to 2^256 with cross-check against sprintf("%d"), bases 2/8/16/36 round-trip, plus edge cases (0, just-above-fixnum, ±2^100, 20-digit strings near the fixnum boundary). test/ruby/test_integer.rb stays at 38 tests / 421628 assertions / 0 failures, test_bignum.rb passes 74 / 607 / 0 failures, full make test-all reports 34694 tests / 0 new failures (same TestArgf#test_puts pre-existing flake as master). * benchmark: add int_to_s yaml for Integer#to_s Reproducible benchmark for the two preceding commits. Covers: - 1/2/3/5/10/19-digit positive fixnums (spans the break-even point and the two large-number wins at the top) - A negative fixnum (exercises the minus-sign prepend path) - 20/40/100-digit bignums (spans the big2str_2bdigits win range) - Two string-interpolation scenarios, so reviewers can see how much of the Integer#to_s speedup reaches real code that allocates the result string too Intended to be consumed by benchmark-driver against master vs int-to-s-twodigit for A/B comparison. Matches the numbers in the commit messages of 5bfb7e02a2 and c5df6de835. --------- Co-authored-by: tomoya ishida --- benchmark/int_to_s.yml | 25 +++++++++++++++ bignum.c | 86 ++++++++++++++++++++++++++++++++++++++++++++------ internal/bignum.h | 1 + numeric.c | 36 +++++++++++++++++++-- 4 files changed, 135 insertions(+), 13 deletions(-) create mode 100644 benchmark/int_to_s.yml diff --git a/benchmark/int_to_s.yml b/benchmark/int_to_s.yml new file mode 100644 index 0000000000..000dae9612 --- /dev/null +++ b/benchmark/int_to_s.yml @@ -0,0 +1,25 @@ +prelude: | + # frozen_string_literal: true + N1 = 5 + N2 = 42 + N3 = 400 + N5 = 12345 + N10 = 1_234_567_890 + N19 = 4_611_686_018_427_387_903 + NEG = -1_234_567_890 + BIG20 = 10 ** 19 + 12_345_678_901_234_567 + BIG40 = 10 ** 39 + 123_456_789_012_345 + BIG100 = 10 ** 99 + 42 +benchmark: + fix_1digit: "N1.to_s" + fix_2digit: "N2.to_s" + fix_3digit: "N3.to_s" + fix_5digit: "N5.to_s" + fix_10digit: "N10.to_s" + fix_19digit: "N19.to_s" + fix_negative: "NEG.to_s" + big_20digit: "BIG20.to_s" + big_40digit: "BIG40.to_s" + big_100digit: "BIG100.to_s" + interp_id: '"id=#{N10}"' + interp_mixed: '"a=#{N2},b=#{N5},c=#{N10}"' diff --git a/bignum.c b/bignum.c index e4af035cac..28924b4eb9 100644 --- a/bignum.c +++ b/bignum.c @@ -64,6 +64,21 @@ static const bool debug_integer_pack = ( const char ruby_digitmap[] = "0123456789abcdefghijklmnopqrstuvwxyz"; +/* Two-digit decimal lookup table. Offset 2*n holds the ASCII pair for + * n in the range 0..99. Used by both rb_fix2str in numeric.c and + * big2str_2bdigits below to emit two base-10 digits per iteration. */ +const char ruby_decimal_digit_pairs[201] = + "00010203040506070809" + "10111213141516171819" + "20212223242526272829" + "30313233343536373839" + "40414243444546474849" + "50515253545556575859" + "60616263646566676869" + "70717273747576777879" + "80818283848586878889" + "90919293949596979899"; + #ifndef SIZEOF_BDIGIT_DBL # if SIZEOF_INT*2 <= SIZEOF_LONG_LONG # define SIZEOF_BDIGIT_DBL SIZEOF_LONG_LONG @@ -4811,11 +4826,34 @@ big2str_2bdigits(struct big2str_struct *b2s, BDIGIT *xds, size_t xn, size_t tail return; p = buf; j = sizeof(buf); - do { - BDIGIT_DBL idx = num % b2s->base; - num /= b2s->base; - p[--j] = ruby_digitmap[idx]; - } while (num); + if (b2s->base == 10) { + /* Emit two decimal digits per iteration from ruby_decimal_digit_pairs. + * See the comment on the table in bignum.c near ruby_digitmap. */ + while (num >= 100) { + BDIGIT_DBL idx = (num % 100) * 2; + num /= 100; + j -= 2; + p[j] = ruby_decimal_digit_pairs[idx]; + p[j + 1] = ruby_decimal_digit_pairs[idx + 1]; + } + if (num >= 10) { + BDIGIT_DBL idx = num * 2; + j -= 2; + p[j] = ruby_decimal_digit_pairs[idx]; + p[j + 1] = ruby_decimal_digit_pairs[idx + 1]; + } + else { + /* num is 1..9 here (0 was handled above) */ + p[--j] = (char)('0' + num); + } + } + else { + do { + BDIGIT_DBL idx = num % b2s->base; + num /= b2s->base; + p[--j] = ruby_digitmap[idx]; + } while (num); + } len = sizeof(buf) - j; big2str_alloc(b2s, len + taillen); MEMCPY(b2s->ptr, buf + j, char, len); @@ -4823,11 +4861,39 @@ big2str_2bdigits(struct big2str_struct *b2s, BDIGIT *xds, size_t xn, size_t tail else { p = b2s->ptr; j = b2s->hbase2_numdigits; - do { - BDIGIT_DBL idx = num % b2s->base; - num /= b2s->base; - p[--j] = ruby_digitmap[idx]; - } while (j); + if (b2s->base == 10) { + /* Non-beginning chunks must emit EXACTLY hbase2_numdigits, + * zero-padded on the left. Consume num in 2-digit groups, + * handle the odd trailing digit, then memset remaining + * positions with '0'. */ + while (num >= 100) { + BDIGIT_DBL idx = (num % 100) * 2; + num /= 100; + j -= 2; + p[j] = ruby_decimal_digit_pairs[idx]; + p[j + 1] = ruby_decimal_digit_pairs[idx + 1]; + } + if (num >= 10) { + BDIGIT_DBL idx = num * 2; + j -= 2; + p[j] = ruby_decimal_digit_pairs[idx]; + p[j + 1] = ruby_decimal_digit_pairs[idx + 1]; + } + else if (num > 0) { + p[--j] = (char)('0' + num); + } + if (j > 0) { + memset(p, '0', j); + j = 0; + } + } + else { + do { + BDIGIT_DBL idx = num % b2s->base; + num /= b2s->base; + p[--j] = ruby_digitmap[idx]; + } while (j); + } len = b2s->hbase2_numdigits; } b2s->ptr += len; diff --git a/internal/bignum.h b/internal/bignum.h index f11fbd3a4d..7389a17c74 100644 --- a/internal/bignum.h +++ b/internal/bignum.h @@ -107,6 +107,7 @@ struct RBignum { /* bignum.c */ extern const char ruby_digitmap[]; +extern const char ruby_decimal_digit_pairs[]; double rb_big_fdiv_double(VALUE x, VALUE y); VALUE rb_big_uminus(VALUE x); VALUE rb_big_hash(VALUE); diff --git a/numeric.c b/numeric.c index 40b7bfc0f8..175bd7cfa0 100644 --- a/numeric.c +++ b/numeric.c @@ -4040,6 +4040,11 @@ rb_int_uminus(VALUE num) } } +/* ruby_decimal_digit_pairs is defined in bignum.c and declared in + * internal/bignum.h. See there for the rationale of the 2-digit + * lookup-table itoa optimisation; both rb_fix2str here and big2str_2bdigits + * in bignum.c consume it. */ + VALUE rb_fix2str(VALUE x, int base) { @@ -4072,9 +4077,34 @@ rb_fix2str(VALUE x, int base) else { u = val; } - do { - *--b = ruby_digitmap[(int)(u % base)]; - } while (u /= base); + if (base == 10) { + /* Emit two digits per iteration from a precomputed table. The + * compiler lowers `u % 100` and `u / 100` to a single multiply + + * shift, so each iteration costs roughly one multiply, one shift, + * and two stores. About 2x fewer iterations than the classic + * per-digit loop for multi-digit inputs. */ + while (u >= 100) { + unsigned long idx = (u % 100) * 2; + u /= 100; + b -= 2; + b[0] = ruby_decimal_digit_pairs[idx]; + b[1] = ruby_decimal_digit_pairs[idx + 1]; + } + if (u >= 10) { + unsigned long idx = u * 2; + b -= 2; + b[0] = ruby_decimal_digit_pairs[idx]; + b[1] = ruby_decimal_digit_pairs[idx + 1]; + } + else { + *--b = (char)('0' + u); + } + } + else { + do { + *--b = ruby_digitmap[(int)(u % base)]; + } while (u /= base); + } if (neg) { *--b = '-'; } -- cgit v1.2.3