From 4a0072d5f29befde814ea0d9a83c711e1f049564 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Chris=20Hasi=C5=84ski?= <krzysztof.hasinski@gmail.com>
Date: Fri, 8 May 2026 19:10:12 +0200
Subject: Speed up Integer#to_s with a two digit lookup table (#16719)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* numeric: emit two decimal digits per iteration in rb_fix2str

Replace the digit-at-a-time loop in rb_fix2str with the standard
itoa 2-digit lookup table for base 10.  Each iteration now
writes two digits using a single (u % 100, u / 100) pair, so the
number of loop iterations is halved for multi-digit integers.
The classic per-digit loop is kept for non-base-10 conversion.

Benchmark (Apple M-series, 5M-10M ops, best of 3 runs):

  case            base       patch      delta
  ---------       -----      -----      -----
  1-digit   (5)   64 ns/op   64 ns/op    -0%
  2-digit   (42)  64 ns/op   65 ns/op    +2%  (noise)
  3-digit   (400) 66 ns/op   64 ns/op    -3%
  5-digit   (12345)          69 ns/op   67 ns/op    -3%
  10-digit  (1234567890)     77 ns/op   67 ns/op   -13%
  19-digit  (2^62-1)        111 ns/op   75 ns/op   -33%

The crossover is at ~3 digits: below that the constant setup
dominates and the benefit is within noise, above that the halved
iteration count shows up linearly.  Typical Rails payloads mix
short IDs (1-5 digits) and longer values (timestamps, nanos,
large counts), so the win is workload-dependent but strictly
non-negative for real code.

Correctness: 100k random fuzz across the full fixnum range plus
targeted edges (0, ±1, ±99, ±100, 2^30-1, 2^62-1, etc.) all pass.
make test-all shows 34694 tests, 7325860 assertions, 0 new
failures (same pre-existing TestArgf#test_puts flake as on
master) — test_integer.rb alone runs 38 tests / 421628 assertions
of which Integer#to_s exercises the bulk, all pass.

The 200-byte lookup table sits in .rodata and fits in a single
cache line of its own (3 lines for the whole table).  No change
to public API, no change to bignum conversion, no change to
non-base-10 conversion paths.

* bignum: emit two decimal digits per iteration in big2str_2bdigits

Extend the 2-digit lookup-table itoa optimisation from rb_fix2str to
the inner conversion loop used by Bignum#to_s.  big2str_2bdigits has
two code paths — a leading-chunk path that emits variable-length
digits, and a recursive-chunk path that emits a fixed-width zero-
padded block — and both gain from the halved division count.  The
classic per-digit loop is preserved for non-base-10 conversion.

Moves the ruby_decimal_digit_pairs table from a file-static in
numeric.c to bignum.c next to ruby_digitmap, and exposes it through
internal/bignum.h so both files share the same 200-byte .rodata
instance.

Benchmark (Apple M-series, best of 3 runs, measures bignum-only
speedup against the preceding fixnum commit):

  case            base      patch     delta
  ---------       -----     -----     -----
  big_20dig   10^19+...  146 ns/op 124 ns/op  -15%
  big_40dig   10^39+...  174 ns/op 152 ns/op  -13%
  big_100dig  10^99+42   236 ns/op 213 ns/op  -10%
  big_500dig  10^499+7  1119 ns/op 1086 ns/op  -3%
  big_1000dig 10^999    3490 ns/op 3459 ns/op  -1%
  fix_19dig   2^62-1      76 ns/op   76 ns/op   0% (unchanged path)

Wins concentrate in the 20-100 digit range where big2str_2bdigits
is the dominant cost.  Above ~500 digits the Karatsuba divmod
recursion dominates and the digit-emission saving shrinks to the
noise floor.  The 20-100 range is what actual Ruby code exercises
(financial high-precision sums, nanosecond timestamps, large
counters); crypto-size (1000+ digit) bignums are rare in to_s paths.

Correctness: 100k random fixnum fuzz unchanged, 500 random bignum
fuzz up to 2^256 with cross-check against sprintf("%d"), bases
2/8/16/36 round-trip, plus edge cases (0, just-above-fixnum, ±2^100,
20-digit strings near the fixnum boundary).  test/ruby/test_integer.rb
stays at 38 tests / 421628 assertions / 0 failures, test_bignum.rb
passes 74 / 607 / 0 failures, full make test-all reports 34694
tests / 0 new failures (same TestArgf#test_puts pre-existing flake
as master).

* benchmark: add int_to_s yaml for Integer#to_s

Reproducible benchmark for the two preceding commits.  Covers:

- 1/2/3/5/10/19-digit positive fixnums (spans the break-even point
  and the two large-number wins at the top)
- A negative fixnum (exercises the minus-sign prepend path)
- 20/40/100-digit bignums (spans the big2str_2bdigits win range)
- Two string-interpolation scenarios, so reviewers can see how much
  of the Integer#to_s speedup reaches real code that allocates the
  result string too

Intended to be consumed by benchmark-driver against master vs
int-to-s-twodigit for A/B comparison.  Matches the numbers in the
commit messages of 5bfb7e02a2 and c5df6de835.

---------

Co-authored-by: tomoya ishida <tomoyapenguin@gmail.com>
---
 benchmark/int_to_s.yml | 25 +++++++++++++++
 bignum.c               | 86 ++++++++++++++++++++++++++++++++++++++++++++------
 internal/bignum.h      |  1 +
 numeric.c              | 36 +++++++++++++++++++--
 4 files changed, 135 insertions(+), 13 deletions(-)
 create mode 100644 benchmark/int_to_s.yml

diff --git a/benchmark/int_to_s.yml b/benchmark/int_to_s.yml
new file mode 100644
index 0000000000..000dae9612
--- /dev/null
+++ b/benchmark/int_to_s.yml
@@ -0,0 +1,25 @@
+prelude: |
+  # frozen_string_literal: true
+  N1   = 5
+  N2   = 42
+  N3   = 400
+  N5   = 12345
+  N10  = 1_234_567_890
+  N19  = 4_611_686_018_427_387_903
+  NEG  = -1_234_567_890
+  BIG20  = 10 ** 19 + 12_345_678_901_234_567
+  BIG40  = 10 ** 39 + 123_456_789_012_345
+  BIG100 = 10 ** 99 + 42
+benchmark:
+  fix_1digit:   "N1.to_s"
+  fix_2digit:   "N2.to_s"
+  fix_3digit:   "N3.to_s"
+  fix_5digit:   "N5.to_s"
+  fix_10digit:  "N10.to_s"
+  fix_19digit:  "N19.to_s"
+  fix_negative: "NEG.to_s"
+  big_20digit:  "BIG20.to_s"
+  big_40digit:  "BIG40.to_s"
+  big_100digit: "BIG100.to_s"
+  interp_id:    '"id=#{N10}"'
+  interp_mixed: '"a=#{N2},b=#{N5},c=#{N10}"'
diff --git a/bignum.c b/bignum.c
index e4af035cac..28924b4eb9 100644
--- a/bignum.c
+++ b/bignum.c
@@ -64,6 +64,21 @@ static const bool debug_integer_pack = (
 
 const char ruby_digitmap[] = "0123456789abcdefghijklmnopqrstuvwxyz";
 
+/* Two-digit decimal lookup table.  Offset 2*n holds the ASCII pair for
+ * n in the range 0..99.  Used by both rb_fix2str in numeric.c and
+ * big2str_2bdigits below to emit two base-10 digits per iteration. */
+const char ruby_decimal_digit_pairs[201] =
+    "00010203040506070809"
+    "10111213141516171819"
+    "20212223242526272829"
+    "30313233343536373839"
+    "40414243444546474849"
+    "50515253545556575859"
+    "60616263646566676869"
+    "70717273747576777879"
+    "80818283848586878889"
+    "90919293949596979899";
+
 #ifndef SIZEOF_BDIGIT_DBL
 # if SIZEOF_INT*2 <= SIZEOF_LONG_LONG
 #  define SIZEOF_BDIGIT_DBL SIZEOF_LONG_LONG
@@ -4811,11 +4826,34 @@ big2str_2bdigits(struct big2str_struct *b2s, BDIGIT *xds, size_t xn, size_t tail
             return;
         p = buf;
         j = sizeof(buf);
-        do {
-            BDIGIT_DBL idx = num % b2s->base;
-            num /= b2s->base;
-            p[--j] = ruby_digitmap[idx];
-        } while (num);
+        if (b2s->base == 10) {
+            /* Emit two decimal digits per iteration from ruby_decimal_digit_pairs.
+             * See the comment on the table in bignum.c near ruby_digitmap. */
+            while (num >= 100) {
+                BDIGIT_DBL idx = (num % 100) * 2;
+                num /= 100;
+                j -= 2;
+                p[j]     = ruby_decimal_digit_pairs[idx];
+                p[j + 1] = ruby_decimal_digit_pairs[idx + 1];
+            }
+            if (num >= 10) {
+                BDIGIT_DBL idx = num * 2;
+                j -= 2;
+                p[j]     = ruby_decimal_digit_pairs[idx];
+                p[j + 1] = ruby_decimal_digit_pairs[idx + 1];
+            }
+            else {
+                /* num is 1..9 here (0 was handled above) */
+                p[--j] = (char)('0' + num);
+            }
+        }
+        else {
+            do {
+                BDIGIT_DBL idx = num % b2s->base;
+                num /= b2s->base;
+                p[--j] = ruby_digitmap[idx];
+            } while (num);
+        }
         len = sizeof(buf) - j;
         big2str_alloc(b2s, len + taillen);
         MEMCPY(b2s->ptr, buf + j, char, len);
@@ -4823,11 +4861,39 @@ big2str_2bdigits(struct big2str_struct *b2s, BDIGIT *xds, size_t xn, size_t tail
     else {
         p = b2s->ptr;
         j = b2s->hbase2_numdigits;
-        do {
-            BDIGIT_DBL idx = num % b2s->base;
-            num /= b2s->base;
-            p[--j] = ruby_digitmap[idx];
-        } while (j);
+        if (b2s->base == 10) {
+            /* Non-beginning chunks must emit EXACTLY hbase2_numdigits,
+             * zero-padded on the left.  Consume num in 2-digit groups,
+             * handle the odd trailing digit, then memset remaining
+             * positions with '0'. */
+            while (num >= 100) {
+                BDIGIT_DBL idx = (num % 100) * 2;
+                num /= 100;
+                j -= 2;
+                p[j]     = ruby_decimal_digit_pairs[idx];
+                p[j + 1] = ruby_decimal_digit_pairs[idx + 1];
+            }
+            if (num >= 10) {
+                BDIGIT_DBL idx = num * 2;
+                j -= 2;
+                p[j]     = ruby_decimal_digit_pairs[idx];
+                p[j + 1] = ruby_decimal_digit_pairs[idx + 1];
+            }
+            else if (num > 0) {
+                p[--j] = (char)('0' + num);
+            }
+            if (j > 0) {
+                memset(p, '0', j);
+                j = 0;
+            }
+        }
+        else {
+            do {
+                BDIGIT_DBL idx = num % b2s->base;
+                num /= b2s->base;
+                p[--j] = ruby_digitmap[idx];
+            } while (j);
+        }
         len = b2s->hbase2_numdigits;
     }
     b2s->ptr += len;
diff --git a/internal/bignum.h b/internal/bignum.h
index f11fbd3a4d..7389a17c74 100644
--- a/internal/bignum.h
+++ b/internal/bignum.h
@@ -107,6 +107,7 @@ struct RBignum {
 
 /* bignum.c */
 extern const char ruby_digitmap[];
+extern const char ruby_decimal_digit_pairs[];
 double rb_big_fdiv_double(VALUE x, VALUE y);
 VALUE rb_big_uminus(VALUE x);
 VALUE rb_big_hash(VALUE);
diff --git a/numeric.c b/numeric.c
index 40b7bfc0f8..175bd7cfa0 100644
--- a/numeric.c
+++ b/numeric.c
@@ -4040,6 +4040,11 @@ rb_int_uminus(VALUE num)
     }
 }
 
+/* ruby_decimal_digit_pairs is defined in bignum.c and declared in
+ * internal/bignum.h.  See there for the rationale of the 2-digit
+ * lookup-table itoa optimisation; both rb_fix2str here and big2str_2bdigits
+ * in bignum.c consume it. */
+
 VALUE
 rb_fix2str(VALUE x, int base)
 {
@@ -4072,9 +4077,34 @@ rb_fix2str(VALUE x, int base)
     else {
         u = val;
     }
-    do {
-        *--b = ruby_digitmap[(int)(u % base)];
-    } while (u /= base);
+    if (base == 10) {
+        /* Emit two digits per iteration from a precomputed table.  The
+         * compiler lowers `u % 100` and `u / 100` to a single multiply +
+         * shift, so each iteration costs roughly one multiply, one shift,
+         * and two stores.  About 2x fewer iterations than the classic
+         * per-digit loop for multi-digit inputs. */
+        while (u >= 100) {
+            unsigned long idx = (u % 100) * 2;
+            u /= 100;
+            b -= 2;
+            b[0] = ruby_decimal_digit_pairs[idx];
+            b[1] = ruby_decimal_digit_pairs[idx + 1];
+        }
+        if (u >= 10) {
+            unsigned long idx = u * 2;
+            b -= 2;
+            b[0] = ruby_decimal_digit_pairs[idx];
+            b[1] = ruby_decimal_digit_pairs[idx + 1];
+        }
+        else {
+            *--b = (char)('0' + u);
+        }
+    }
+    else {
+        do {
+            *--b = ruby_digitmap[(int)(u % base)];
+        } while (u /= base);
+    }
     if (neg) {
         *--b = '-';
     }
-- 
cgit v1.2.3