From 19f45f853c125ff744c53bd548f29f891c76c28b Mon Sep 17 00:00:00 2001
From: mame <mame@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>
Date: Sun, 14 Dec 2008 03:59:02 +0000
Subject: * bignum.c (rb_big_mul): faster multiplication by Karatsuba method
 and   twice faster square than normal multiplication.

* random.c (rb_rand_internal): used by Bignum#*.

* test/ruby/test_bignum.rb: add some tests for above.


git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@20733 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
---
 bignum.c | 395 ++++++++++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 327 insertions(+), 68 deletions(-)

(limited to 'bignum.c')

diff --git a/bignum.c b/bignum.c
index 35db091454..bcac99309b 100644
--- a/bignum.c
+++ b/bignum.c
@@ -17,6 +17,7 @@
 #ifdef HAVE_IEEEFP_H
 #include <ieeefp.h>
 #endif
+#include <assert.h>
 
 VALUE rb_cBignum;
 
@@ -1380,12 +1381,36 @@ rb_big_neg(VALUE x)
     return bignorm(z);
 }
 
+static void
+bigsub_core(BDIGIT *xds, long xn, BDIGIT *yds, long yn, BDIGIT *zds, long zn)
+{
+    BDIGIT_DBL_SIGNED num;
+    long i;
+
+    for (i = 0, num = 0; i < yn; i++) {
+	num += (BDIGIT_DBL_SIGNED)xds[i] - yds[i];
+	zds[i] = BIGLO(num);
+	num = BIGDN(num);
+    }
+    while (num && i < xn) {
+	num += xds[i];
+	zds[i++] = BIGLO(num);
+	num = BIGDN(num);
+    }
+    while (i < xn) {
+	zds[i] = xds[i];
+	i++;
+    }
+    assert(i <= zn);
+    while (i < zn) {
+	zds[i++] = 0;
+    }
+}
+
 static VALUE
 bigsub(VALUE x, VALUE y)
 {
     VALUE z = 0;
-    BDIGIT *zds;
-    BDIGIT_DBL_SIGNED num;
     long i = RBIGNUM_LEN(x);
 
     /* if x is larger than y, swap */
@@ -1406,32 +1431,52 @@ bigsub(VALUE x, VALUE y)
     }
 
     z = bignew(RBIGNUM_LEN(x), z==0);
-    zds = BDIGITS(z);
+    bigsub_core(BDIGITS(x), RBIGNUM_LEN(x),
+		BDIGITS(y), RBIGNUM_LEN(y),
+		BDIGITS(z), RBIGNUM_LEN(z));
 
-    for (i = 0, num = 0; i < RBIGNUM_LEN(y); i++) {
-	num += (BDIGIT_DBL_SIGNED)BDIGITS(x)[i] - BDIGITS(y)[i];
-	zds[i] = BIGLO(num);
+    return z;
+}
+
+static void
+bigadd_core(BDIGIT *xds, long xn, BDIGIT *yds, long yn, BDIGIT *zds, long zn)
+{
+    BDIGIT_DBL num = 0;
+    long i;
+
+    if (xn > yn) {
+	BDIGIT *tds;
+	tds = xds; xds = yds; yds = tds;
+	i = xn; xn = yn; yn = i;
+    }
+
+    i = 0;
+    while (i < xn) {
+	num += (BDIGIT_DBL)xds[i] + yds[i];
+	zds[i++] = BIGLO(num);
 	num = BIGDN(num);
     }
-    while (num && i < RBIGNUM_LEN(x)) {
-	num += BDIGITS(x)[i];
+    while (num && i < yn) {
+	num += yds[i];
 	zds[i++] = BIGLO(num);
 	num = BIGDN(num);
     }
-    while (i < RBIGNUM_LEN(x)) {
-	zds[i] = BDIGITS(x)[i];
+    while (i < yn) {
+	zds[i] = yds[i];
 	i++;
     }
-
-    return z;
+    if (num) zds[i++] = (BDIGIT)num;
+    assert(i <= zn);
+    while (i < zn) {
+	zds[i++] = 0;
+    }
 }
 
 static VALUE
 bigadd(VALUE x, VALUE y, int sign)
 {
     VALUE z;
-    BDIGIT_DBL num;
-    long i, len;
+    long len;
 
     sign = (sign == RBIGNUM_SIGN(y));
     if (RBIGNUM_SIGN(x) != sign) {
@@ -1441,30 +1486,15 @@ bigadd(VALUE x, VALUE y, int sign)
 
     if (RBIGNUM_LEN(x) > RBIGNUM_LEN(y)) {
 	len = RBIGNUM_LEN(x) + 1;
-	z = x; x = y; y = z;
     }
     else {
 	len = RBIGNUM_LEN(y) + 1;
     }
     z = bignew(len, sign);
 
-    len = RBIGNUM_LEN(x);
-    for (i = 0, num = 0; i < len; i++) {
-	num += (BDIGIT_DBL)BDIGITS(x)[i] + BDIGITS(y)[i];
-	BDIGITS(z)[i] = BIGLO(num);
-	num = BIGDN(num);
-    }
-    len = RBIGNUM_LEN(y);
-    while (num && i < len) {
-	num += BDIGITS(y)[i];
-	BDIGITS(z)[i++] = BIGLO(num);
-	num = BIGDN(num);
-    }
-    while (i < len) {
-	BDIGITS(z)[i] = BDIGITS(y)[i];
-	i++;
-    }
-    BDIGITS(z)[i] = (BDIGIT)num;
+    bigadd_core(BDIGITS(x), RBIGNUM_LEN(x),
+		BDIGITS(y), RBIGNUM_LEN(y),
+		BDIGITS(z), RBIGNUM_LEN(z));
 
     return z;
 }
@@ -1519,24 +1549,20 @@ rb_big_minus(VALUE x, VALUE y)
     }
 }
 
-static void
-rb_big_stop(void *ptr)
+static long
+big_real_len(VALUE x)
 {
-    VALUE *stop = (VALUE*)ptr;
-    *stop = Qtrue;
+    long i = RBIGNUM_LEN(x);
+    while (--i && !BDIGITS(x)[i]);
+    return i + 1;
 }
 
-struct big_mul_struct {
-    VALUE x, y, z, stop;
-};
-
 static VALUE
-bigmul1(void *ptr)
+bigmul1_normal(VALUE x, VALUE y)
 {
-    struct big_mul_struct *bms = (struct big_mul_struct*)ptr;
     long i, j;
     BDIGIT_DBL n = 0;
-    VALUE x = bms->x, y = bms->y, z = bms->z;
+    VALUE z = bignew(RBIGNUM_LEN(x) + RBIGNUM_LEN(y) + 1, RBIGNUM_SIGN(x)==RBIGNUM_SIGN(y));
     BDIGIT *zds;
 
     j = RBIGNUM_LEN(x) + RBIGNUM_LEN(y) + 1;
@@ -1544,7 +1570,6 @@ bigmul1(void *ptr)
     while (j--) zds[j] = 0;
     for (i = 0; i < RBIGNUM_LEN(x); i++) {
 	BDIGIT_DBL dd;
-	if (bms->stop) return Qnil;
 	dd = BDIGITS(x)[i];
 	if (dd == 0) continue;
 	n = 0;
@@ -1558,45 +1583,257 @@ bigmul1(void *ptr)
 	    zds[i + j] = n;
 	}
     }
+    rb_thread_check_ints();
     return z;
 }
 
+static VALUE bigmul0(VALUE x, VALUE y);
+
+/* balancing multiplication by slicing larger argument */
 static VALUE
-rb_big_mul0(VALUE x, VALUE y)
+bigmul1_balance(VALUE x, VALUE y)
 {
-    struct big_mul_struct bms;
-    volatile VALUE z;
+    VALUE z, t1, t2;
+    long i, xn, yn, r, n;
 
-    switch (TYPE(y)) {
-      case T_FIXNUM:
-	y = rb_int2big(FIX2LONG(y));
-	break;
+    xn = RBIGNUM_LEN(x);
+    yn = RBIGNUM_LEN(y);
+    assert(2 * xn <= yn);
 
-      case T_BIGNUM:
-	break;
+    z = bignew(xn + yn, RBIGNUM_SIGN(x)==RBIGNUM_SIGN(y));
+    t1 = bignew(xn, 1);
 
-      case T_FLOAT:
-	return DBL2NUM(rb_big2dbl(x) * RFLOAT_VALUE(y));
+    for (i = 0; i < xn + yn; i++) BDIGITS(z)[i] = 0;
 
-      default:
-	return rb_num_coerce_bin(x, y, '*');
+    n = 0;
+    while (yn > 0) {
+	r = xn > yn ? yn : xn;
+	MEMCPY(BDIGITS(t1), BDIGITS(y) + n, BDIGIT, r);
+	RBIGNUM_SET_LEN(t1, r);
+	t2 = bigmul0(x, t1);
+	bigadd_core(BDIGITS(z) + n, RBIGNUM_LEN(z) - n,
+		    BDIGITS(t2), big_real_len(t2),
+		    BDIGITS(z) + n, RBIGNUM_LEN(z) - n);
+	yn -= r;
+	n += r;
     }
+    rb_gc_force_recycle(t1);
+
+    return z;
+}
+
+/* split a bignum into high and low bignums */
+static void
+big_split(VALUE v, long n, VALUE *ph, VALUE *pl)
+{
+    long hn, ln;
+    VALUE h, l;
+
+    ln = RBIGNUM_LEN(v) > n ? n : RBIGNUM_LEN(v);
+    hn = RBIGNUM_LEN(v) - ln;
+
+    while (--hn && !BDIGITS(v)[hn + ln]);
+    h = bignew(++hn, 1);
+    MEMCPY(BDIGITS(h), BDIGITS(v) + ln, BDIGIT, hn);
+
+    while (--ln && !BDIGITS(v)[ln]);
+    l = bignew(++ln, 1);
+    MEMCPY(BDIGITS(l), BDIGITS(v), BDIGIT, ln);
 
-    bms.x = x;
-    bms.y = y;
-    bms.z = bignew(RBIGNUM_LEN(x) + RBIGNUM_LEN(y) + 1, RBIGNUM_SIGN(x)==RBIGNUM_SIGN(y));
-    bms.stop = Qfalse;
+    *pl = l;
+    *ph = h;
+}
+
+/* multiplication by karatsuba method */
+static VALUE
+bigmul1_karatsuba(VALUE x, VALUE y)
+{
+    long i, n, xn, yn, t1n, t2n;
+    VALUE xh, xl, yh, yl, z, t1, t2, t3;
+    BDIGIT *zds;
+
+    xn = RBIGNUM_LEN(x);
+    yn = RBIGNUM_LEN(y);
+    n = yn / 2;
+    big_split(x, n, &xh, &xl);
+    if (x == y) {
+	yh = xh; yl = xl;
+    }
+    else big_split(y, n, &yh, &yl);
+
+    /* x = xh * b + xl
+     * y = yh * b + yl
+     *
+     * Karatsuba method:
+     *   x * y = z2 * b^2 + z1 * b + z0
+     *   where
+     *     z2 = xh * yh
+     *     z0 = xl * yl
+     *     z1 = (xh + xl) * (yh + yl) - x2 - x0
+     *
+     *  ref: http://en.wikipedia.org/wiki/Karatsuba_algorithm
+     */
+
+    /* allocate a result bignum */
+    z = bignew(xn + yn, RBIGNUM_SIGN(x)==RBIGNUM_SIGN(y));
+    zds = BDIGITS(z);
+
+    /* t1 <- xh * yh */
+    t1 = bigmul0(xh, yh);
+    t1n = big_real_len(t1);
+
+    /* copy t1 into high bytes of the result (z2) */
+    MEMCPY(zds + 2 * n, BDIGITS(t1), BDIGIT, t1n);
+    for (i = 2 * n + t1n; i < xn + yn; i++) BDIGITS(z)[i] = 0;
+
+    if (!BIGZEROP(xl) && !BIGZEROP(yl)) {
+	/* t2 <- xl * yl */
+    	t2 = bigmul0(xl, yl);
+    	t2n = big_real_len(t2);
 
-    if (RBIGNUM_LEN(x) + RBIGNUM_LEN(y) > 10000) {
-	z = rb_thread_blocking_region(bigmul1, &bms, rb_big_stop, &bms.stop);
+	/* copy t2 into low bytes of the result (z0) */
+	MEMCPY(zds, BDIGITS(t2), BDIGIT, t2n);
+	for (i = t2n; i < 2 * n; i++) BDIGITS(z)[i] = 0;
+
+	/* subtract t2 from middle bytes of the result (z1) */
+	i = xn + yn - n;
+	bigsub_core(zds + n, i, BDIGITS(t2), t2n, zds + n, i);
+	rb_gc_force_recycle(t2);
     }
     else {
-	z = bigmul1(&bms);
+	/* copy 0 into low bytes of the result (z0) */
+	for (i = 0; i < 2 * n; i++) BDIGITS(z)[i] = 0;
     }
 
+    /* subtract t1 from middle bytes of the result (z1) */
+    i = xn + yn - n;
+    bigsub_core(zds + n, i, BDIGITS(t1), t1n, zds + n, i);
+    rb_gc_force_recycle(t1);
+
+    /* t1 <- xh + xl */
+    t1 = bigadd(xh, xl, 1);
+    if (xh != yh) rb_gc_force_recycle(xh);
+    if (xl != yl) rb_gc_force_recycle(xl);
+
+    /* t2 <- yh + yl */
+    t2 = (x == y) ? t1 : bigadd(yh, yl, 1);
+    rb_gc_force_recycle(yh);
+    rb_gc_force_recycle(yl);
+
+    /* t3 <- t1 * t2 */
+    t3 = bigmul0(t1, t2);
+    rb_gc_force_recycle(t1);
+    if (t1 != t2) rb_gc_force_recycle(t2);
+
+    /* add t3 to middle bytes of the result (z1) */
+    bigadd_core(zds + n, i, BDIGITS(t3), big_real_len(t3), zds + n, i);
+    rb_gc_force_recycle(t3);
+
     return z;
 }
 
+/* efficient squaring (2 times faster than normal multiplication)
+ * ref: Handbook of Applied Cryptography, Algorithm 14.16
+ *      http://www.cacr.math.uwaterloo.ca/hac/about/chap14.pdf
+ */
+static VALUE
+bigsqr_fast(VALUE x)
+{
+    long len = RBIGNUM_LEN(x), i, j;
+    VALUE z = bignew(2 * len + 1, 1);
+    BDIGIT *xds = BDIGITS(x), *zds = BDIGITS(z);
+    BDIGIT_DBL c, v, w;
+
+    for (i = 2 * len + 1; i--; ) zds[i] = 0;
+    for (i = 0; i < len; i++) {
+	v = (BDIGIT_DBL)xds[i];
+	if (!v) continue;
+	c = (BDIGIT_DBL)zds[i + i] + v * v;
+	zds[i + i] = BIGLO(c);
+	c = BIGDN(c);
+	v *= 2;
+	for (j = i + 1; j < len; j++) {
+	    w = (BDIGIT_DBL)xds[j];
+	    c += (BDIGIT_DBL)zds[i + j] + BIGLO(v) * w;
+	    zds[i + j] = BIGLO(c);
+	    c = BIGDN(c);
+	    if (BIGDN(v)) c += w;
+	}
+	if (c) {
+	    c += (BDIGIT_DBL)zds[i + len];
+	    zds[i + len] = BIGLO(c);
+	    c = BIGDN(c);
+	}
+	if (c) zds[i + len + 1] += c;
+    }
+    return z;
+}
+
+#define KARATSUBA_MUL_DIGITS 70
+
+
+/* determine whether a bignum is sparse or not by random sampling */
+static inline VALUE
+big_sparse_p(VALUE x)
+{
+    long c = 0, n = RBIGNUM_LEN(x);
+    unsigned long rb_rand_internal(unsigned long i);
+
+    if (          BDIGITS(x)[rb_rand_internal(n / 2) + n / 4]) c++;
+    if (c <= 1 && BDIGITS(x)[rb_rand_internal(n / 2) + n / 4]) c++;
+    if (c <= 1 && BDIGITS(x)[rb_rand_internal(n / 2) + n / 4]) c++;
+
+    return (c <= 1) ? Qtrue : Qfalse;
+}
+
+#if 0
+static void
+dump_bignum(VALUE x)
+{
+    long i;
+    printf("0x0");
+    for (i = RBIGNUM_LEN(x); i--; ) {
+    	printf("_%08x", BDIGITS(x)[i]);
+    }
+    puts("");
+}
+#endif
+
+static VALUE
+bigmul0(VALUE x, VALUE y)
+{
+    long xn, yn;
+
+    xn = RBIGNUM_LEN(x);
+    yn = RBIGNUM_LEN(y);
+
+    /* make sure that y is longer than x */
+    if (xn > yn) {
+	VALUE t;
+	long tn;
+	t = x; x = y; y = t;
+	tn = xn; xn = yn; yn = tn;
+    }
+    assert(xn <= yn);
+
+    /* normal multiplication when x is small */
+    if (xn < KARATSUBA_MUL_DIGITS) {
+      normal:
+	if (x == y) return bigsqr_fast(x);
+    	return bigmul1_normal(x, y);
+    }
+
+    /* normal multiplication when x or y is a sparse bignum */
+    if (big_sparse_p(x)) goto normal;
+    if (big_sparse_p(y)) return bigmul1_normal(y, x);
+
+    /* balance multiplication by slicing y when x is much smaller than y */
+    if (2 * xn <= yn) return bigmul1_balance(x, y);
+
+    /* multiplication by karatsuba method */
+    return bigmul1_karatsuba(x, y);
+}
+
 /*
  *  call-seq:
  *     big * other  => Numeric
@@ -1607,7 +1844,22 @@ rb_big_mul0(VALUE x, VALUE y)
 VALUE
 rb_big_mul(VALUE x, VALUE y)
 {
-    return bignorm(rb_big_mul0(x, y));
+    switch (TYPE(y)) {
+      case T_FIXNUM:
+	y = rb_int2big(FIX2LONG(y));
+	break;
+
+      case T_BIGNUM:
+	break;
+
+      case T_FLOAT:
+	return DBL2NUM(rb_big2dbl(x) * RFLOAT_VALUE(y));
+
+      default:
+	return rb_num_coerce_bin(x, y, '*');
+    }
+
+    return bignorm(bigmul0(x, y));
 }
 
 struct big_div_struct {
@@ -1661,6 +1913,13 @@ bigdivrem1(void *ptr)
     return Qnil;
 }
 
+static void
+rb_big_stop(void *ptr)
+{
+    VALUE *stop = (VALUE*)ptr;
+    *stop = Qtrue;
+}
+
 static VALUE
 bigdivrem(VALUE x, VALUE y, VALUE *divp, VALUE *modp)
 {
@@ -2037,7 +2296,7 @@ bigsqr(VALUE x)
     BDIGIT_DBL num;
 
     if (len < 4000 / BITSPERDIG) {
-	return bigtrunc(rb_big_mul0(x, x));
+	return bigtrunc(bigmul0(x, x));
     }
 
     a = bignew(len - k, 1);
@@ -2054,7 +2313,7 @@ bigsqr(VALUE x)
     }
     MEMCPY(BDIGITS(z) + 2 * k, BDIGITS(a2), BDIGIT, RBIGNUM_LEN(a2));
     RBIGNUM_SET_LEN(z, len);
-    a2 = bigtrunc(rb_big_mul0(a, b));
+    a2 = bigtrunc(bigmul0(a, b));
     len = RBIGNUM_LEN(a2);
     for (i = 0, num = 0; i < len; i++) {
 	num += (BDIGIT_DBL)BDIGITS(z)[i + k] + ((BDIGIT_DBL)BDIGITS(a2)[i] << 1);
@@ -2125,7 +2384,7 @@ rb_big_pow(VALUE x, VALUE y)
 	    for (mask = FIXNUM_MAX + 1; mask; mask >>= 1) {
 		if (z) z = bigtrunc(bigsqr(z));
 		if (yy & mask) {
-		    z = z ? bigtrunc(rb_big_mul0(z, x)) : x;
+		    z = z ? bigtrunc(bigmul0(z, x)) : x;
 		}
 	    }
 	    return bignorm(z);
-- 
cgit v1.2.3