Re: Faster digits and fromdigits in base 2^k, and FlxqM_mul

Peter Bruin on Thu, 29 Oct 2015 13:40:49 +0100

[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Date Index] [Thread Index]

Re: Faster digits and fromdigits in base 2^k, and FlxqM_mul_Kronecker

To: pari-dev@pari.math.u-bordeaux1.fr
Subject: Re: Faster digits and fromdigits in base 2^k, and FlxqM_mul_Kronecker
From: Peter Bruin <P.J.Bruin@math.leidenuniv.nl>
Date: Thu, 29 Oct 2015 13:40:20 +0100
References: <874mi134j9.fsf@dds.nl> <87vbaf5gge.fsf@dds.nl> <20151024121646.GB17984@yellowpig> <87mvv3gljw.fsf@dds.nl> <20151029113326.GA12822@yellowpig>

Hi Bill,

> However 
> make test-digits
> fails
> and
> ? fromdigits([1,0,1],2)
> %1 = 7
> is not good.
>
> the line 166 in fromdigits_2k
>   if (k == 1) return bits_to_int(x, l);
>
> does not seem correct.

You are right; I added this line at a late stage, and somehow
misunderstood bits_to_int.  It should be in nv_fromdigits_2k instead
of fromdigits_2k.

> PS: next time, you might want to look at 'git format-patch' to
> generate the patches.
> Also, you should try 'make checkspaces'.

Thanks for the hints.  I replaced the tabs by spaces and made new
patches using git format-patch.  The difference with respect to the
previous ones is just the whitespace fixes and moving the call to
bits_to_int.

Cheers,

Peter

>From 1aff932dddab902558eb1fb1051f3a2c4b8005c4 Mon Sep 17 00:00:00 2001
From: Peter Bruin <P.J.Bruin@math.leidenuniv.nl>
Date: Fri, 11 Sep 2015 12:56:58 +0200
Subject: [PATCH 1/2] improve digits and fromdigits in base 2^k

---
 doc/usersch5.tex       |   13 ++-
 src/basemath/arith2.c  |    2 +
 src/basemath/bit.c     |  223 ++++++++++++++++++++++++++++++++++++++++++------
 src/headers/paridecl.h |    2 +
 4 files changed, 213 insertions(+), 27 deletions(-)

diff --git a/doc/usersch5.tex b/doc/usersch5.tex
index 78c4497..069a1a4 100644
--- a/doc/usersch5.tex
+++ b/doc/usersch5.tex
@@ -2347,7 +2347,7 @@ shifts the mantissa
 $$f, s[m], s[m+1],\ldots s[M]$$
 right by $n$ bits.
 
-\subsec{From \typ{INT} to bits}
+\subsec{From \typ{INT} to bits or digits in base $2^k$ and back}
 
 \fun{GEN}{binary_zv}{GEN x} given a \typ{INT} $x$, return a \typ{VECSMALL} of
 bits, from most significant to least significant.
@@ -2368,6 +2368,17 @@ proper \kbd{GEN}), return the integer $\sum_{i = 1}^l x[i] 2^{l-i}$, as a
 \fun{ulong}{bits_to_u}{GEN v, long l} same as \tet{bits_to_int}, where
 $l < \tet{BITS_IN_LONG}$, so we can return an \kbd{ulong}.
 
+\fun{GEN}{fromdigits_2k}{GEN x, long k} converse of \tet{binary_2k};
+given a \typ{VEC} $x$ of length $l$ and a positive \kbd{long} $k$,
+where each $x[i]$ is a \typ{INT} with $0\leq x[i] < 2^k$, return the
+integer $\sum_{i = 1}^l x[i] 2^{k(l-i)}$, as a \typ{INT}.
+
+\fun{GEN}{nv_fromdigits_2k}{GEN x, long k} as \tet{fromdigits_2k}, but
+with $x$ being a \typ{VECSMALL} and each $x[i]$ being a \kbd{ulong}
+with $0\leq x[i] < 2^{\min\{k,\tet{BITS_IN_LONG}\}}$.  Here $k$ may be
+any positive \kbd{long}, and the $x[i]$ are regarded as $k$-bit
+integers by truncating or extending with zeroes.
+
 \subsec{Integer valuation}
 For integers $x$ and $p$, such that $x\neq 0$ and $|p| > 1$, we define
 $v_p(x)$ to be the largest integer exponent $e$ such that $p^e$ divides $x$.
diff --git a/src/basemath/arith2.c b/src/basemath/arith2.c
index 01726a0..1935132 100644
--- a/src/basemath/arith2.c
+++ b/src/basemath/arith2.c
@@ -1501,6 +1501,8 @@ fromdigits(GEN x, GEN B)
   if (typ(x)!=t_VEC || !RgV_is_ZV(x)) pari_err_TYPE("fromdigits",x);
   B = check_basis(B);
   if (lg(x)==1) { avma = av; return gen_0; }
+  if (Z_ispow2(B))
+    return fromdigits_2k(x, expi(B));
   x = vecreverse(x);
   return gerepileupto(av, gen_fromdigits(x, B, NULL, &Z_ring));
 }
diff --git a/src/basemath/bit.c b/src/basemath/bit.c
index 6e9e253..5c0ff08 100644
--- a/src/basemath/bit.c
+++ b/src/basemath/bit.c
@@ -81,6 +81,117 @@ bits_to_u(GEN v, long l)
   return u;
 }
 
+/* set BITS_IN_LONG bits starting at word *w plus *r bits,
+ * clearing subsequent bits in the last word touched */
+INLINE void
+int_set_ulong(ulong a, GEN *w, long *r)
+{
+  if (*r) {
+    **w |= (a << *r);
+    *w = int_nextW(*w);
+    **w = (a >> (BITS_IN_LONG - *r));
+  } else {
+    **w = a;
+    *w = int_nextW(*w);
+  }
+}
+
+/* set k bits starting at word *w plus *r bits,
+ * clearing subsequent bits in the last word touched */
+INLINE void
+int_set_bits(ulong a, long k, GEN *w, long *r)
+{
+  if (*r) {
+    **w |= a << *r;
+    a >>= BITS_IN_LONG - *r;
+  } else {
+    **w = a;
+    a = 0;
+  }
+  *r += k;
+  if (*r >= BITS_IN_LONG) {
+    *w = int_nextW(*w);
+    *r -= BITS_IN_LONG;
+    for (; *r >= BITS_IN_LONG; *r -= BITS_IN_LONG) {
+      **w = a;
+      a = 0;
+      *w = int_nextW(*w);
+    }
+    if (*r)
+      **w = a;
+  }
+}
+
+/* set k bits from z (t_INT) starting at word *w plus *r bits,
+ * clearing subsequent bits in the last word touched */
+INLINE void
+int_set_int(GEN z, long k, GEN *w, long *r)
+{
+  long l = lgefint(z) - 2;
+  GEN y;
+  if (!l) {
+    int_set_bits(0, k, w, r);
+    return;
+  }
+  y = int_LSW(z);
+  for (; l > 1; l--) {
+    int_set_ulong((ulong) *y, w, r);
+    y = int_nextW(y);
+    k -= BITS_IN_LONG;
+  }
+  if (k)
+    int_set_bits((ulong) *y, k, w, r);
+}
+
+GEN
+nv_fromdigits_2k(GEN x, long k)
+{
+  long l = lg(x) - 1, r;
+  GEN w, z;
+  if (k == 1) return bits_to_int(x, l);
+  if (!l) return gen_0;
+  z = cgetipos(nbits2lg(k * l));
+  w = int_LSW(z);
+  r = 0;
+  for (; l; l--)
+    int_set_bits(uel(x, l), k, &w, &r);
+  return int_normalize(z, 0);
+}
+
+GEN
+fromdigits_2k(GEN x, long k)
+{
+  long l, m;
+  GEN w, y, z;
+  for (l = lg(x) - 1; l && !signe(gel(x, 1)); x++, l--);
+  if (!l) return gen_0;
+  m = expi(gel(x, 1)) + 1;
+  z = cgetipos(nbits2lg(k * (l - 1) + m));
+  w = int_LSW(z);
+  if (!(k & (BITS_IN_LONG - 1))) {
+    long i, j, t = k >> TWOPOTBITS_IN_LONG;
+    for (; l; l--) {
+      j = lgefint(gel(x, l)) - 2;
+      y = int_LSW(gel(x, l));
+      for (i = 0; i < j; i++) {
+        *w = *y;
+        y = int_nextW(y);
+        w = int_nextW(w);
+      }
+      for (; i < t; i++) {
+        *w = 0;
+        w = int_nextW(w);
+      }
+    }
+  } else {
+    long r = 0;
+    for (; l > 1; l--)
+      int_set_int(gel(x, l), k, &w, &r);
+    int_set_int(gel(x, 1), m, &w, &r);
+  }
+  return int_normalize(z, 0);
+}
+
 GEN
 binaire(GEN x)
 {
@@ -139,41 +250,101 @@ binaire(GEN x)
   return y;
 }
 
+/* extract k bits (as ulong) starting at word *w plus *r bits */
+INLINE ulong
+int_get_bits(long k, GEN *w, long *r)
+{
+  ulong mask = (1UL << k) - 1;
+  ulong a = (((ulong) **w) >> *r) & mask;
+  *r += k;
+  if (*r >= BITS_IN_LONG) {
+    *r -= BITS_IN_LONG;
+    *w = int_nextW(*w);
+    if (*r)
+      a |= (**w << (k - *r)) & mask;
+  }
+  return a;
+}
+
+/* extract BITS_IN_LONG bits starting at word *w plus *r bits */
+INLINE ulong
+int_get_ulong(GEN *w, long *r)
+{
+  ulong a = ((ulong) **w) >> *r;
+  *w = int_nextW(*w);
+  if (*r)
+    a |= (**w << (BITS_IN_LONG - *r));
+  return a;
+}
+
+/* extract k bits (as t_INT) starting at word *w plus *r bits */
+INLINE GEN
+int_get_int(long k, GEN *w, long *r)
+{
+  GEN z = cgetipos(nbits2lg(k));
+  GEN y = int_LSW(z);
+  for (; k >= BITS_IN_LONG; k -= BITS_IN_LONG) {
+    *y = int_get_ulong(w, r);
+    y = int_nextW(y);
+  }
+  if (k)
+    *y = int_get_bits(k, w, r);
+  return int_normalize(z, 0);
+}
+
 /* assume k < BITS_IN_LONG */
 GEN
 binary_2k_zv(GEN x, long k)
 {
-  long iv, j, n, nmodk, nk;
-  GEN v, vk;
+  long l, n, r;
+  GEN v, w;
   if (k == 1) return binary_zv(x);
-  if (!signe(x)) return cgetg(1,t_VECSMALL);
-  v = binary_zv(x);
-  n = lg(v)-1;
-  nk = n / k; nmodk = n % k;
-  if (nmodk) nk++;
-  vk = cgetg(nk+1, t_VECSMALL);
-  iv = n - k;
-  if (!nmodk) nmodk = k;
-  for (j = nk; j >= 2; j--,iv-=k) vk[j] = bits_to_u(v+iv, k);
-  vk[1] = bits_to_u(v,nmodk);
-  return vk;
+  if (!signe(x)) return cgetg(1, t_VECSMALL);
+  n = expi(x) + 1;
+  l = (n + k - 1) / k;
+  v = cgetg(l + 1, t_VECSMALL);
+  w = int_LSW(x);
+  r = 0;
+  for (; l > 1; l--) {
+    uel(v, l) = int_get_bits(k, &w, &r);
+    n -= k;
+  }
+  uel(v, 1) = int_get_bits(n, &w, &r);
+  return v;
 }
+
 GEN
 binary_2k(GEN x, long k)
 {
-  long iv, j, n, nmodk, nk;
-  GEN v, vk;
-  if (!signe(x)) return cgetg(1,t_VEC);
-  v = binary_zv(x);
-  n = lg(v)-1;
-  nk = n / k; nmodk = n % k;
-  if (nmodk) nk++;
-  vk = cgetg(nk+1, t_VEC);
-  iv = n - k;
-  if (!nmodk) nmodk = k;
-  for (j = nk; j >= 2; j--,iv-=k) gel(vk,j) = bits_to_int(v+iv, k);
-  gel(vk,1) = bits_to_int(v, nmodk);
-  return vk;
+  long l, n;
+  GEN v, w, y, z;
+  if (k == 1) return binaire(x);
+  if (!signe(x)) return cgetg(1, t_VEC);
+  n = expi(x) + 1;
+  l = (n + k - 1) / k;
+  v = cgetg(l + 1, t_VEC);
+  w = int_LSW(x);
+  if (!(k & (BITS_IN_LONG - 1))) {
+    long m, t = k >> TWOPOTBITS_IN_LONG, u = lgefint(x) - 2;
+    for (; l; l--) {
+      m = minss(t, u);
+      z = cgetipos(m + 2);
+      y = int_LSW(z);
+      for (; m; m--) {
+        *y = *w;
+        y = int_nextW(y);
+        w = int_nextW(w);
+      }
+      gel(v, l) = int_normalize(z, 0);
+      u -= t;
+    }
+  } else {
+    long r = 0;
+    for (; l > 1; l--, n -= k)
+      gel(v, l) = int_get_int(k, &w, &r);
+    gel(v, 1) = int_get_int(n, &w, &r);
+  }
+  return v;
 }
 
 /* return 1 if bit n of x is set, 0 otherwise */
diff --git a/src/headers/paridecl.h b/src/headers/paridecl.h
index d4a4743..514745b 100644
--- a/src/headers/paridecl.h
+++ b/src/headers/paridecl.h
@@ -2100,6 +2100,8 @@ GEN     binary_zv(GEN x);
 GEN     binary_2k(GEN x, long k);
 GEN     binary_2k_zv(GEN x, long k);
 long    bittest(GEN x, long n);
+GEN     fromdigits_2k(GEN x, long k);
+GEN     nv_fromdigits_2k(GEN x, long k);
 GEN     gbitand(GEN x, GEN y);
 GEN     gbitneg(GEN x, long n);
 GEN     gbitnegimply(GEN x, GEN y);
-- 
1.7.9.5

>From ed63bd98566d8e9a06d17983191d966f45f5445e Mon Sep 17 00:00:00 2001
From: Peter Bruin <P.J.Bruin@math.leidenuniv.nl>
Date: Sun, 4 Oct 2015 23:19:12 +0200
Subject: [PATCH 2/2] improve FlxqM_mul_Kronecker using nv_fromdigits_2k and
 binary_2k_zv

---
 src/basemath/Flx.c     |  139 +++++++++++++++++++++++++++++++++++++++++-------
 src/basemath/alglin1.c |    7 +--
 2 files changed, 121 insertions(+), 25 deletions(-)

diff --git a/src/basemath/Flx.c b/src/basemath/Flx.c
index 2fb5d56..51e01ff 100644
--- a/src/basemath/Flx.c
+++ b/src/basemath/Flx.c
@@ -4229,6 +4229,18 @@ kron_pack_Flx_spec_3(GEN x, long l) {
 }
 
 static GEN
+kron_pack_Flx_spec_bits(GEN x, long b, long l) {
+  GEN y;
+  long i;
+  if (l == 0)
+    return gen_0;
+  y = cgetg(l + 1, t_VECSMALL);
+  for(i = 1; i <= l; i++)
+    y[i] = x[l - i];
+  return nv_fromdigits_2k(y, b);
+}
+
+static GEN
 kron_unpack_Flx(GEN z, ulong p)
 {
   long i, l = lgefint(z);
@@ -4250,6 +4262,37 @@ kron_unpack_Flx_3(GEN x, ulong p) {
   return Z_mod2BIL_Flx_3(x, d, p);
 }
 
+/* assume b < BITS_IN_LONG */
+static GEN
+kron_unpack_Flx_bits_narrow(GEN z, long b, ulong p) {
+  GEN v = binary_2k_zv(z, b), x;
+  long i, l = lg(v) + 1;
+  x = cgetg(l, t_VECSMALL);
+  for (i = 2; i < l; i++)
+    x[i] = v[l - i] % p;
+  return Flx_renormalize(x, l);
+}
+
+static GEN
+kron_unpack_Flx_bits_wide(GEN z, long b, ulong p, ulong pi) {
+  GEN v = binary_2k(z, b), x, y;
+  long i, l = lg(v) + 1, ly;
+  x = cgetg(l, t_VECSMALL);
+  for (i = 2; i < l; i++) {
+    y = gel(v, l - i);
+    ly = lgefint(y);
+    switch (ly) {
+    case 2: x[i] = 0; break;
+    case 3: x[i] = *int_W_lg(y, 0, ly) % p; break;
+    case 4: x[i] = remll_pre(*int_W_lg(y, 1, ly), *int_W_lg(y, 0, ly), p, pi); break;
+    case 5: x[i] = remlll_pre(*int_W_lg(y, 2, ly), *int_W_lg(y, 1, ly),
+                              *int_W_lg(y, 0, ly), p, pi); break;
+    default: x[i] = umodiu(gel(v, l - i), p);
+    }
+  }
+  return Flx_renormalize(x, l);
+}
+
 static GEN
 FlxM_pack_ZM(GEN M, GEN (*pack)(GEN, long)) {
   long i, j, l, lc;
@@ -4268,6 +4311,24 @@ FlxM_pack_ZM(GEN M, GEN (*pack)(GEN, long)) {
 }
 
 static GEN
+FlxM_pack_ZM_bits(GEN M, long b)
+{
+  long i, j, l, lc;
+  GEN N = cgetg_copy(M, &l), x;
+  if (l == 1)
+    return N;
+  lc = lgcols(M);
+  for (j = 1; j < l; j++) {
+    gel(N, j) = cgetg(lc, t_COL);
+    for (i = 1; i < lc; i++) {
+      x = gcoeff(M, i, j);
+      gcoeff(N, i, j) = kron_pack_Flx_spec_bits(x + 2, b, lgpol(x));
+    }
+  }
+  return N;
+}
+
+static GEN
 ZM_unpack_FlxqM(GEN M, GEN T, ulong p, GEN (*unpack)(GEN, ulong))
 {
   long i, j, l, lc, sv = get_Flx_var(T);
@@ -4286,44 +4347,82 @@ ZM_unpack_FlxqM(GEN M, GEN T, ulong p, GEN (*unpack)(GEN, ulong))
   return N;
 }
 
+static GEN
+ZM_unpack_FlxqM_bits(GEN M, long b, GEN T, ulong p)
+{
+  long i, j, l, lc, sv = get_Flx_var(T);
+  GEN N = cgetg_copy(M, &l), x;
+  if (l == 1)
+    return N;
+  lc = lgcols(M);
+  if (b < BITS_IN_LONG) {
+    for (j = 1; j < l; j++) {
+      gel(N, j) = cgetg(lc, t_COL);
+      for (i = 1; i < lc; i++) {
+        x = kron_unpack_Flx_bits_narrow(gcoeff(M, i, j), b, p);
+        x[1] = sv;
+        gcoeff(N, i, j) = Flx_rem(x, T, p);
+      }
+    }
+  } else {
+    ulong pi = get_Fl_red(p);
+    for (j = 1; j < l; j++) {
+      gel(N, j) = cgetg(lc, t_COL);
+      for (i = 1; i < lc; i++) {
+        x = kron_unpack_Flx_bits_wide(gcoeff(M, i, j), b, p, pi);
+        x[1] = sv;
+        gcoeff(N, i, j) = Flx_rem(x, T, p);
+      }
+    }
+  }
+  return N;
+}
+
 GEN
 FlxqM_mul_Kronecker(GEN A, GEN B, GEN T, ulong p)
 {
   pari_sp av = avma;
-  long l, n = lg(A) - 1;
+  long b, d = degpol(T), n = lg(A) - 1;
   GEN C, D, z;
   GEN (*pack)(GEN, long), (*unpack)(GEN, ulong);
   int is_sqr = A==B;
 
-  /*
-    cf. Flx_mulspec(), maxlengthcoeffpol()
-    z  can be 1, 2, 3 or (theoretically) 4 words long
-  */
-  z = muliu(muliu(sqru(p - 1), degpol(T)), n);
-  l = lgefint(z);
+  z = muliu(muliu(sqru(p - 1), d), n);
+  b = expi(z) + 1;
+  /* only do expensive bit-packing if it saves at least 1 limb */
+  if (b <= BITS_IN_HALFULONG) {
+    if (nbits2lg(d*b) - 2 == (d + 1)/2)
+      b = BITS_IN_HALFULONG;
+  } else {
+    long l = lgefint(z) - 2;
+    if (nbits2lg(d*b) - 2 == d*l)
+      b = l*BITS_IN_LONG;
+  }
   avma = av;
 
-  switch (l - 2) {
-  case 1:
-    if (HIGHWORD(z[2]) == 0) {
-      pack = kron_pack_Flx_spec_half;
-      unpack = int_to_Flx_half;
-    } else {
-      pack = kron_pack_Flx_spec;
-      unpack = kron_unpack_Flx;
-    }
+  switch (b) {
+  case BITS_IN_HALFULONG:
+    pack = kron_pack_Flx_spec_half;
+    unpack = int_to_Flx_half;
     break;
-  case 2:
+  case BITS_IN_LONG:
+    pack = kron_pack_Flx_spec;
+    unpack = kron_unpack_Flx;
+    break;
+  case 2*BITS_IN_LONG:
     pack = kron_pack_Flx_spec_2;
     unpack = kron_unpack_Flx_2;
     break;
-  case 3:
+  case 3*BITS_IN_LONG:
     pack = kron_pack_Flx_spec_3;
     unpack = kron_unpack_Flx_3;
     break;
   default:
-    /* not implemented, probably won't occur in practice */
-    return NULL;
+    A = FlxM_pack_ZM_bits(A, b);
+    B = is_sqr? A: FlxM_pack_ZM_bits(B, b);
+    C = ZM_mul(A, B);
+    D = ZM_unpack_FlxqM_bits(C, b, T, p);
+    return gerepilecopy(av, D);
   }
   A = FlxM_pack_ZM(A, pack);
   B = is_sqr? A: FlxM_pack_ZM(B, pack);
diff --git a/src/basemath/alglin1.c b/src/basemath/alglin1.c
index 383f2eb..e3bab48 100644
--- a/src/basemath/alglin1.c
+++ b/src/basemath/alglin1.c
@@ -981,11 +981,8 @@ FlxqM_mul(GEN A, GEN B, GEN T, ulong p) {
 
   if (n == 0)
     return cgetg(1, t_MAT);
-  if (n > 1) {
-    GEN C = FlxqM_mul_Kronecker(A, B, T, p);
-    if (C != NULL)
-      return C;
-  }
+  if (n > 1)
+    return FlxqM_mul_Kronecker(A, B, T, p);
   ff = get_Flxq_field(&E, T, p);
   return gen_matmul(A, B, E, ff);
 }
-- 
1.7.9.5

Follow-Ups:
- Re: Faster digits and fromdigits in base 2^k, and FlxqM_mul_Kronecker
  - From: Bill Allombert <Bill.Allombert@math.u-bordeaux.fr>

References:
- Faster digits and fromdigits in base 2^k, and FlxqM_mul_Kronecker
  - From: Peter Bruin <P.J.Bruin@math.leidenuniv.nl>
- Re: Faster digits and fromdigits in base 2^k, and FlxqM_mul_Kronecker
  - From: Peter Bruin <P.J.Bruin@math.leidenuniv.nl>
- Re: Faster digits and fromdigits in base 2^k, and FlxqM_mul_Kronecker
  - From: Bill Allombert <Bill.Allombert@math.u-bordeaux.fr>
- Re: Faster digits and fromdigits in base 2^k, and FlxqM_mul_Kronecker
  - From: Peter Bruin <P.J.Bruin@math.leidenuniv.nl>
- Re: Faster digits and fromdigits in base 2^k, and FlxqM_mul_Kronecker
  - From: Bill Allombert <Bill.Allombert@math.u-bordeaux.fr>

Prev by Date: Re: Faster digits and fromdigits in base 2^k, and FlxqM_mul_Kronecker
Next by Date: Re: Faster digits and fromdigits in base 2^k, and FlxqM_mul_Kronecker
Previous by thread: Re: Faster digits and fromdigits in base 2^k, and FlxqM_mul_Kronecker
Next by thread: Re: Faster digits and fromdigits in base 2^k, and FlxqM_mul_Kronecker
Index(es):
- Date
- Thread