Faster digits and fromdigits in base 2^k, and FlxqM_mul

Peter Bruin on Thu, 08 Oct 2015 14:52:36 +0200

[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Date Index] [Thread Index]

Faster digits and fromdigits in base 2^k, and FlxqM_mul_Kronecker

To: pari-dev@pari.math.u-bordeaux1.fr
Subject: Faster digits and fromdigits in base 2^k, and FlxqM_mul_Kronecker
From: Peter Bruin <P.J.Bruin@math.leidenuniv.nl>
Date: Thu, 08 Oct 2015 14:53:30 +0200
Cancel-lock: sha1:jMVObKcndS+6Iagy/GjpEjOzLDk=
Delivery-date: Thu, 08 Oct 2015 14:52:36 +0200
User-agent: Gnus/5.13 (Gnus v5.13) Emacs/24.5 (gnu/linux)

Bonjour,

The functions digits(x,b) and fromdigits(x,b) are currently not very
fast when b is a power of 2.  I am attaching a patch (digits_2k.patch)
that contains

- reimplementations of binary_2k, binary_2k_zv
- new functions fromdigits_2k, fromdigits_2k_zv
- a 2-line addition to fromdigits to use the new fromdigits_2k

Here is one example of the speed-up obtained by this patch:

Before:
gp> n = random(2^1000);
gp> for(i=1,1000000,digits(n,2^10));
time = 1,961 ms.
gp> v = vector(100,i,random(2^10));
gp> for(i=1,1000000,fromdigits(v,2^10));
time = 3,480 ms.

After:
gp> n = random(2^1000);
gp> for(i=1,1000000,digits(n,2^10));
time = 436 ms.
gp> v = vector(100,i,random(2^10));
gp> for(i=1,1000000,fromdigits(v,2^10));
time = 482 ms.

My motivation for this was mostly to speed up multiplication of matrices
over non-prime finite fields of small characteristic (FlxqM).  I am
attaching a second patch (FlxqM_mul_Kronecker.patch) that enhances
Kronecker multiplication for FlxqM to use the above functions for
packing into and unpacking from integer matrices in cases where it
allows us to pack the entries in fewer words per entry than the
(half-)word-aligned packing that is used now.

In experiments with many different finite fields and matrix sizes, I get
an average speedup of about 17%, i.e. if r(F,n) = (new time)/(old time)
for the multiplication of two n × n matrices over F, then the average
value of r(F,n) over many fields F and sizes n is about 0.83.  For some
F and n, the new time is less than 0.4 times the old one, and it is
rarely more than 1.1 times the old one.  I obtained these benchmarks
using the attached file FlxqM_mul_test.c.

The only thing that is still missing is the documentation for the new
functions fromdigits_2k and fromdigits_2k_zv.  If the new functions are
accepted into PARI, I can write this documentation.  Also, I am open to
suggestions for renaming some of the (exported or auxiliary) functions.

Thanks,

Peter

diff --git a/src/basemath/arith2.c b/src/basemath/arith2.c
index 01726a0..1935132 100644
--- a/src/basemath/arith2.c
+++ b/src/basemath/arith2.c
@@ -1501,6 +1501,8 @@ fromdigits(GEN x, GEN B)
   if (typ(x)!=t_VEC || !RgV_is_ZV(x)) pari_err_TYPE("fromdigits",x);
   B = check_basis(B);
   if (lg(x)==1) { avma = av; return gen_0; }
+  if (Z_ispow2(B))
+    return fromdigits_2k(x, expi(B));
   x = vecreverse(x);
   return gerepileupto(av, gen_fromdigits(x, B, NULL, &Z_ring));
 }
diff --git a/src/basemath/bit.c b/src/basemath/bit.c
index 6e9e253..9cdfc8d 100644
--- a/src/basemath/bit.c
+++ b/src/basemath/bit.c
@@ -81,6 +81,117 @@ bits_to_u(GEN v, long l)
   return u;
 }
 
+/* set BITS_IN_LONG bits starting at word *w plus *r bits,
+ * clearing subsequent bits in the last word touched */
+INLINE void
+int_set_ulong(ulong a, GEN *w, long *r)
+{
+  if (*r) {
+    **w |= (a << *r);
+    *w = int_nextW(*w);
+    **w = (a >> (BITS_IN_LONG - *r));
+  } else {
+    **w = a;
+    *w = int_nextW(*w);
+  }
+}
+
+/* set k bits starting at word *w plus *r bits,
+ * clearing subsequent bits in the last word touched */
+INLINE void
+int_set_bits(ulong a, long k, GEN *w, long *r)
+{
+  if (*r) {
+    **w |= a << *r;
+    a >>= BITS_IN_LONG - *r;
+  } else {
+    **w = a;
+    a = 0;
+  }
+  *r += k;
+  if (*r >= BITS_IN_LONG) {
+    *w = int_nextW(*w);
+    *r -= BITS_IN_LONG;
+    for (; *r >= BITS_IN_LONG; *r -= BITS_IN_LONG) {
+      **w = a;
+      a = 0;
+      *w = int_nextW(*w);
+    }
+    if (*r)
+      **w = a;
+  }
+}
+
+/* set k bits from z (t_INT) starting at word *w plus *r bits,
+ * clearing subsequent bits in the last word touched */
+INLINE void
+int_set_int(GEN z, long k, GEN *w, long *r)
+{
+  long l = lgefint(z) - 2;
+  GEN y;
+  if (!l) {
+    int_set_bits(0, k, w, r);
+    return;
+  }
+  y = int_LSW(z);
+  for (; l > 1; l--) {
+    int_set_ulong((ulong) *y, w, r);
+    y = int_nextW(y);
+    k -= BITS_IN_LONG;
+  }
+  if (k)
+    int_set_bits((ulong) *y, k, w, r);
+}
+
+GEN
+fromdigits_2k_zv(GEN x, long k)
+{
+  long l = lg(x) - 1, r;
+  GEN w, z;
+  if (!l) return gen_0;
+  z = cgetipos(nbits2lg(k * l));
+  w = int_LSW(z);
+  r = 0;
+  for (; l; l--)
+    int_set_bits(uel(x, l), k, &w, &r);
+  return int_normalize(z, 0);
+}
+
+GEN
+fromdigits_2k(GEN x, long k)
+{
+  long l, m;
+  GEN w, y, z;
+  for (l = lg(x) - 1; l && !signe(gel(x, 1)); x++, l--);
+  if (k == 1) return bits_to_int(x, l);
+  if (!l) return gen_0;
+  m = expi(gel(x, 1)) + 1;
+  z = cgetipos(nbits2lg(k * (l - 1) + m));
+  w = int_LSW(z);
+  if (!(k & (BITS_IN_LONG - 1))) {
+    long i, j, t = k >> TWOPOTBITS_IN_LONG;
+    for (; l; l--) {
+      j = lgefint(gel(x, l)) - 2;
+      y = int_LSW(gel(x, l));
+      for (i = 0; i < j; i++) {
+	*w = *y;
+	y = int_nextW(y);
+	w = int_nextW(w);
+      }
+      for (; i < t; i++) {
+	*w = 0;
+	w = int_nextW(w);
+      }
+    }
+  } else {
+    long r = 0;
+    for (; l > 1; l--)
+      int_set_int(gel(x, l), k, &w, &r);
+    int_set_int(gel(x, 1), m, &w, &r);
+  }
+  return int_normalize(z, 0);
+}
+
 GEN
 binaire(GEN x)
 {
@@ -139,41 +250,101 @@ binaire(GEN x)
   return y;
 }
 
+/* extract k bits (as ulong) starting at word *w plus *r bits */
+INLINE ulong
+int_get_bits(long k, GEN *w, long *r)
+{
+  ulong mask = (1UL << k) - 1;
+  ulong a = (((ulong) **w) >> *r) & mask;
+  *r += k;
+  if (*r >= BITS_IN_LONG) {
+    *r -= BITS_IN_LONG;
+    *w = int_nextW(*w);
+    if (*r)
+      a |= (**w << (k - *r)) & mask;
+  }
+  return a;
+}
+
+/* extract BITS_IN_LONG bits starting at word *w plus *r bits */
+INLINE ulong
+int_get_ulong(GEN *w, long *r)
+{
+  ulong a = ((ulong) **w) >> *r;
+  *w = int_nextW(*w);
+  if (*r)
+    a |= (**w << (BITS_IN_LONG - *r));
+  return a;
+}
+
+/* extract k bits (as t_INT) starting at word *w plus *r bits */
+INLINE GEN
+int_get_int(long k, GEN *w, long *r)
+{
+  GEN z = cgetipos(nbits2lg(k));
+  GEN y = int_LSW(z);
+  for (; k >= BITS_IN_LONG; k -= BITS_IN_LONG) {
+    *y = int_get_ulong(w, r);
+    y = int_nextW(y);
+  }
+  if (k)
+    *y = int_get_bits(k, w, r);
+  return int_normalize(z, 0);
+}
+
 /* assume k < BITS_IN_LONG */
 GEN
 binary_2k_zv(GEN x, long k)
 {
-  long iv, j, n, nmodk, nk;
-  GEN v, vk;
+  long l, n, r;
+  GEN v, w;
   if (k == 1) return binary_zv(x);
-  if (!signe(x)) return cgetg(1,t_VECSMALL);
-  v = binary_zv(x);
-  n = lg(v)-1;
-  nk = n / k; nmodk = n % k;
-  if (nmodk) nk++;
-  vk = cgetg(nk+1, t_VECSMALL);
-  iv = n - k;
-  if (!nmodk) nmodk = k;
-  for (j = nk; j >= 2; j--,iv-=k) vk[j] = bits_to_u(v+iv, k);
-  vk[1] = bits_to_u(v,nmodk);
-  return vk;
+  if (!signe(x)) return cgetg(1, t_VECSMALL);
+  n = expi(x) + 1;
+  l = (n + k - 1) / k;
+  v = cgetg(l + 1, t_VECSMALL);
+  w = int_LSW(x);
+  r = 0;
+  for (; l > 1; l--) {
+    uel(v, l) = int_get_bits(k, &w, &r);
+    n -= k;
+  }
+  uel(v, 1) = int_get_bits(n, &w, &r);
+  return v;
 }
+
 GEN
 binary_2k(GEN x, long k)
 {
-  long iv, j, n, nmodk, nk;
-  GEN v, vk;
-  if (!signe(x)) return cgetg(1,t_VEC);
-  v = binary_zv(x);
-  n = lg(v)-1;
-  nk = n / k; nmodk = n % k;
-  if (nmodk) nk++;
-  vk = cgetg(nk+1, t_VEC);
-  iv = n - k;
-  if (!nmodk) nmodk = k;
-  for (j = nk; j >= 2; j--,iv-=k) gel(vk,j) = bits_to_int(v+iv, k);
-  gel(vk,1) = bits_to_int(v, nmodk);
-  return vk;
+  long l, n;
+  GEN v, w, y, z;
+  if (k == 1) return binaire(x);
+  if (!signe(x)) return cgetg(1, t_VEC);
+  n = expi(x) + 1;
+  l = (n + k - 1) / k;
+  v = cgetg(l + 1, t_VEC);
+  w = int_LSW(x);
+  if (!(k & (BITS_IN_LONG - 1))) {
+    long m, t = k >> TWOPOTBITS_IN_LONG, u = lgefint(x) - 2;
+    for (; l; l--) {
+      m = minss(t, u);
+      z = cgetipos(m + 2);
+      y = int_LSW(z);
+      for (; m; m--) {
+	*y = *w;
+	y = int_nextW(y);
+	w = int_nextW(w);
+      }
+      gel(v, l) = int_normalize(z, 0);
+      u -= t;
+    }
+  } else {
+    long r = 0;
+    for (; l > 1; l--, n -= k)
+      gel(v, l) = int_get_int(k, &w, &r);
+    gel(v, 1) = int_get_int(n, &w, &r);
+  }
+  return v;
 }
 
 /* return 1 if bit n of x is set, 0 otherwise */
diff --git a/src/headers/paridecl.h b/src/headers/paridecl.h
index 83c5285..77752b3 100644
--- a/src/headers/paridecl.h
+++ b/src/headers/paridecl.h
@@ -2100,6 +2100,8 @@ GEN     binary_zv(GEN x);
 GEN     binary_2k(GEN x, long k);
 GEN     binary_2k_zv(GEN x, long k);
 long    bittest(GEN x, long n);
+GEN     fromdigits_2k(GEN x, long k);
+GEN     fromdigits_2k_zv(GEN x, long k);
 GEN     gbitand(GEN x, GEN y);
 GEN     gbitneg(GEN x, long n);
 GEN     gbitnegimply(GEN x, GEN y);

diff --git a/src/basemath/Flx.c b/src/basemath/Flx.c
index 391dee9..c7eaab5 100644
--- a/src/basemath/Flx.c
+++ b/src/basemath/Flx.c
@@ -4142,6 +4142,18 @@ kron_pack_Flx_spec_3(GEN x, long l) {
 }
 
 static GEN
+kron_pack_Flx_spec_bits(GEN x, long b, long l) {
+  GEN y;
+  long i;
+  if (l == 0)
+    return gen_0;
+  y = cgetg(l + 1, t_VECSMALL);
+  for(i = 1; i <= l; i++)
+    y[i] = x[l - i];
+  return fromdigits_2k_zv(y, b);
+}
+
+static GEN
 kron_unpack_Flx(GEN z, ulong p)
 {
   long i, l = lgefint(z);
@@ -4163,6 +4175,37 @@ kron_unpack_Flx_3(GEN x, ulong p) {
   return Z_mod2BIL_Flx_3(x, d, p);
 }
 
+/* assume b < BITS_IN_LONG */
+static GEN
+kron_unpack_Flx_bits_narrow(GEN z, long b, ulong p) {
+  GEN v = binary_2k_zv(z, b), x;
+  long i, l = lg(v) + 1;
+  x = cgetg(l, t_VECSMALL);
+  for (i = 2; i < l; i++)
+    x[i] = v[l - i] % p;
+  return Flx_renormalize(x, l);
+}
+
+static GEN
+kron_unpack_Flx_bits_wide(GEN z, long b, ulong p, ulong pi) {
+  GEN v = binary_2k(z, b), x, y;
+  long i, l = lg(v) + 1, ly;
+  x = cgetg(l, t_VECSMALL);
+  for (i = 2; i < l; i++) {
+    y = gel(v, l - i);
+    ly = lgefint(y);
+    switch (ly) {
+    case 2: x[i] = 0; break;
+    case 3: x[i] = *int_W_lg(y, 0, ly) % p; break;
+    case 4: x[i] = remll_pre(*int_W_lg(y, 1, ly), *int_W_lg(y, 0, ly), p, pi); break;
+    case 5: x[i] = remlll_pre(*int_W_lg(y, 2, ly), *int_W_lg(y, 1, ly),
+			      *int_W_lg(y, 0, ly), p, pi); break;
+    default: x[i] = umodiu(gel(v, l - i), p);
+    }
+  }
+  return Flx_renormalize(x, l);
+}
+
 static GEN
 FlxM_pack_ZM(GEN M, GEN (*pack)(GEN, long)) {
   long i, j, l, lc;
@@ -4181,6 +4224,24 @@ FlxM_pack_ZM(GEN M, GEN (*pack)(GEN, long)) {
 }
 
 static GEN
+FlxM_pack_ZM_bits(GEN M, long b)
+{
+  long i, j, l, lc;
+  GEN N = cgetg_copy(M, &l), x;
+  if (l == 1)
+    return N;
+  lc = lgcols(M);
+  for (j = 1; j < l; j++) {
+    gel(N, j) = cgetg(lc, t_COL);
+    for (i = 1; i < lc; i++) {
+      x = gcoeff(M, i, j);
+      gcoeff(N, i, j) = kron_pack_Flx_spec_bits(x + 2, b, lgpol(x));
+    }
+  }
+  return N;
+}
+
+static GEN
 ZM_unpack_FlxqM(GEN M, GEN T, ulong p, GEN (*unpack)(GEN, ulong))
 {
   long i, j, l, lc, sv = get_Flx_var(T);
@@ -4199,44 +4260,82 @@ ZM_unpack_FlxqM(GEN M, GEN T, ulong p, GEN (*unpack)(GEN, ulong))
   return N;
 }
 
+static GEN
+ZM_unpack_FlxqM_bits(GEN M, long b, GEN T, ulong p)
+{
+  long i, j, l, lc, sv = get_Flx_var(T);
+  GEN N = cgetg_copy(M, &l), x;
+  if (l == 1)
+    return N;
+  lc = lgcols(M);
+  if (b < BITS_IN_LONG) {
+    for (j = 1; j < l; j++) {
+      gel(N, j) = cgetg(lc, t_COL);
+      for (i = 1; i < lc; i++) {
+	x = kron_unpack_Flx_bits_narrow(gcoeff(M, i, j), b, p);
+	x[1] = sv;
+	gcoeff(N, i, j) = Flx_rem(x, T, p);
+      }
+    }
+  } else {
+    ulong pi = get_Fl_red(p);
+    for (j = 1; j < l; j++) {
+      gel(N, j) = cgetg(lc, t_COL);
+      for (i = 1; i < lc; i++) {
+	x = kron_unpack_Flx_bits_wide(gcoeff(M, i, j), b, p, pi);
+	x[1] = sv;
+	gcoeff(N, i, j) = Flx_rem(x, T, p);
+      }
+    }
+  }
+  return N;
+}
+
 GEN
 FlxqM_mul_Kronecker(GEN A, GEN B, GEN T, ulong p)
 {
   pari_sp av = avma;
-  long l, n = lg(A) - 1;
+  long b, d = degpol(T), n = lg(A) - 1;
   GEN C, D, z;
   GEN (*pack)(GEN, long), (*unpack)(GEN, ulong);
   int is_sqr = A==B;
 
-  /*
-    cf. Flx_mulspec(), maxlengthcoeffpol()
-    z  can be 1, 2, 3 or (theoretically) 4 words long
-  */
-  z = muliu(muliu(sqru(p - 1), degpol(T)), n);
-  l = lgefint(z);
+  z = muliu(muliu(sqru(p - 1), d), n);
+  b = expi(z) + 1;
+  /* only do expensive bit-packing if it saves at least 1 limb */
+  if (b <= BITS_IN_HALFULONG) {
+    if (nbits2lg(d*b) - 2 == (d + 1)/2)
+      b = BITS_IN_HALFULONG;
+  } else {
+    long l = lgefint(z) - 2;
+    if (nbits2lg(d*b) - 2 == d*l)
+      b = l*BITS_IN_LONG;
+  }
   avma = av;
 
-  switch (l - 2) {
-  case 1:
-    if (HIGHWORD(z[2]) == 0) {
-      pack = kron_pack_Flx_spec_half;
-      unpack = int_to_Flx_half;
-    } else {
-      pack = kron_pack_Flx_spec;
-      unpack = kron_unpack_Flx;
-    }
+  switch (b) {
+  case BITS_IN_HALFULONG:
+    pack = kron_pack_Flx_spec_half;
+    unpack = int_to_Flx_half;
     break;
-  case 2:
+  case BITS_IN_LONG:
+    pack = kron_pack_Flx_spec;
+    unpack = kron_unpack_Flx;
+    break;
+  case 2*BITS_IN_LONG:
     pack = kron_pack_Flx_spec_2;
     unpack = kron_unpack_Flx_2;
     break;
-  case 3:
+  case 3*BITS_IN_LONG:
     pack = kron_pack_Flx_spec_3;
     unpack = kron_unpack_Flx_3;
     break;
   default:
-    /* not implemented, probably won't occur in practice */
-    return NULL;
+    A = FlxM_pack_ZM_bits(A, b);
+    B = is_sqr? A: FlxM_pack_ZM_bits(B, b);
+    C = ZM_mul(A, B);
+    D = ZM_unpack_FlxqM_bits(C, b, T, p);
+    return gerepilecopy(av, D);
   }
   A = FlxM_pack_ZM(A, pack);
   B = is_sqr? A: FlxM_pack_ZM(B, pack);
diff --git a/src/basemath/alglin1.c b/src/basemath/alglin1.c
index 5ebdf91..ae1b1e2 100644
--- a/src/basemath/alglin1.c
+++ b/src/basemath/alglin1.c
@@ -981,11 +981,8 @@ FlxqM_mul(GEN A, GEN B, GEN T, ulong p) {
 
   if (n == 0)
     return cgetg(1, t_MAT);
-  if (n > 1) {
-    GEN C = FlxqM_mul_Kronecker(A, B, T, p);
-    if (C != NULL)
-      return C;
-  }
+  if (n > 1)
+    return FlxqM_mul_Kronecker(A, B, T, p);
   ff = get_Flxq_field(&E, T, p);
   return gen_matmul(A, B, E, ff);
 }

#include <stdio.h>
#include <time.h>
#include <pari/pari.h>

static time_t t_tot = 0;

static GEN
random_FlxqC(long m, GEN T, ulong p) {
  GEN C = cgetg(m + 1, t_COL);
  long i, d = get_Flx_degree(T), v = get_Flx_var(T);
  for (i = 1; i <= m; i++)
    gel(C, i) = random_Flx(d, v, p);
  return C;
}

static GEN
random_FlxqM(long m, long n, GEN T, ulong p) {
  GEN M = cgetg(n + 1, t_MAT);
  long i;
  for (i = 1; i <= n; i++)
    gel(M, i) = random_FlxqC(m, T, p);
  return M;
}

static void
test(ulong p, long d, long m, long n, long k) {
  long v = 0;
  pari_sp av = avma;
  GEN T = ZX_to_Flx(liftint(ffinit(stoi(p), d, v)), p);
  GEN A = random_FlxqM(m, n, T, p);
  GEN B = random_FlxqM(n, k, T, p);
  time_t t0, t1;
  int i, N = 60 / n;
  pari_sp av1 = avma;

  printf("\\\\ %li^%li, %li, %li, %li\n", p, d, m, n, k);
  fflush(stdout);
  t0 = clock();
  for(i = 0; i < N; i++) {
    (void) FlxqM_mul(A, B, T, p);
    avma = av1;
  }
  t1 = clock();
  t_tot += (t1 - t0);
  printf("%f\n", ((double) (t1 - t0)) / CLOCKS_PER_SEC);
  fflush(stdout);
  avma = av;
}

int main () {
  ulong i, j, m, n, p, d;
  ulong P[] = {
    3,
    5,
    7,
    11,
    13,
    17,
    67,
    257,
    1031,
    4099,  /* nextprime(2^12) */
    16777259,  /* nextprime(2^24) */
    4294967311,  /* nextprime(2^32) */
    1099511627791,  /* nextprime(2^40) */
    281474976710677,  /* nextprime(2^48) */
    4611686018427388039,  /* nextprime(2^62) */
    0 };
  ulong D[] = { 2, 4, 6, 8, 10, 12,
		14, 16, 20, 24, 36,
		48, 60, 72, 84, 96,
		0 };
  ulong N[] = { 2, 4, 6, 8, 10, 20, 40, 60, 0 };

  pari_init(1 << 30, 500000);
  for (m = 0; (n = N[m]) != 0; m++) {
    for (i = 0; (p = P[i]) != 0; i++) {
      for (j = 0; (d = D[j]) != 0; j++)
	test(p, d, n, n, n);
    }
  }
  printf("\\\\ t_tot = %f\n", ((double) t_tot) / CLOCKS_PER_SEC);
  return 0;
}

Follow-Ups:
- Re: Faster digits and fromdigits in base 2^k, and FlxqM_mul_Kronecker
  - From: Bill Allombert <Bill.Allombert@math.u-bordeaux.fr>
- Re: Faster digits and fromdigits in base 2^k, and FlxqM_mul_Kronecker
  - From: Peter Bruin <P.J.Bruin@math.leidenuniv.nl>

Prev by Date: faster factorcantor
Next by Date: Re: Faster digits and fromdigits in base 2^k, and FlxqM_mul_Kronecker
Previous by thread: faster factorcantor
Next by thread: Re: Faster digits and fromdigits in base 2^k, and FlxqM_mul_Kronecker
Index(es):
- Date
- Thread