Peter Bruin on Wed, 28 Oct 2015 14:37:44 +0100


[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Date Index] [Thread Index]

Re: Faster digits and fromdigits in base 2^k, and FlxqM_mul_Kronecker


Hi Bill,

> as I understand, x[i] is treated as a ulong, and not as a long.  The
> meaning of "here $k$ may be any positive \kbd{long}." should be
> clarified to be more explicit when k is equal or larger than
> BITS_IN_LONG
>
> But more importantly, the fromdigits_2k_zv should be called
> zv_fromdigits_2k, because the input is a zv and not the output.
>
> And in fact, more accurately, it should be nv_fromdigits_2k because
> the entries are treated as unsigned.  (the original functions
> binary_2k_zv and binary_zv have the same problem)

Here are updated patches (the first for the fromdigits_2k and binary_2k
functions, the second to use them in FlxqM_mul_Kronecker).  Apart from
simply renaming fromdigits_2k_zv to nv_fromdigits_2k, I clarified the
documentation as follows:

--- a/doc/usersch5.tex
+++ b/doc/usersch5.tex
@@ -2373,9 +2373,11 @@
 where each $x[i]$ is a \typ{INT} with $0\leq x[i] < 2^k$, return the
 integer $\sum_{i = 1}^l x[i] 2^{k(l-i)}$, as a \typ{INT}.
 
-\fun{GEN}{fromdigits_2k_zv}{GEN x, long k} as \tet{fromdigits_2k}, but
-with $x$ being a \typ{VECSMALL} and each $x[i]$ being a \kbd{long}
-with $0\leq x[i] < 2^k$; here $k$ may be any positive \kbd{long}.
+\fun{GEN}{nv_fromdigits_2k}{GEN x, long k} as \tet{fromdigits_2k}, but
+with $x$ being a \typ{VECSMALL} and each $x[i]$ being a \kbd{ulong}
+with $0\leq x[i] < 2^{\min\{k,\tet{BITS_IN_LONG}\}}$.  Here $k$ may be
+any positive \kbd{long}, and the $x[i]$ are regarded as $k$-bit
+integers by truncating or extending with zeroes.
 
 \subsec{Integer valuation}
 For integers $x$ and $p$, such that $x\neq 0$ and $|p| > 1$, we define

I hope this addresses your point.

Thanks,

Peter

diff --git a/doc/usersch5.tex b/doc/usersch5.tex
index 78c4497..069a1a4 100644
--- a/doc/usersch5.tex
+++ b/doc/usersch5.tex
@@ -2347,7 +2347,7 @@ shifts the mantissa
 $$f, s[m], s[m+1],\ldots s[M]$$
 right by $n$ bits.
 
-\subsec{From \typ{INT} to bits}
+\subsec{From \typ{INT} to bits or digits in base $2^k$ and back}
 
 \fun{GEN}{binary_zv}{GEN x} given a \typ{INT} $x$, return a \typ{VECSMALL} of
 bits, from most significant to least significant.
@@ -2368,6 +2368,17 @@ proper \kbd{GEN}), return the integer $\sum_{i = 1}^l x[i] 2^{l-i}$, as a
 \fun{ulong}{bits_to_u}{GEN v, long l} same as \tet{bits_to_int}, where
 $l < \tet{BITS_IN_LONG}$, so we can return an \kbd{ulong}.
 
+\fun{GEN}{fromdigits_2k}{GEN x, long k} converse of \tet{binary_2k};
+given a \typ{VEC} $x$ of length $l$ and a positive \kbd{long} $k$,
+where each $x[i]$ is a \typ{INT} with $0\leq x[i] < 2^k$, return the
+integer $\sum_{i = 1}^l x[i] 2^{k(l-i)}$, as a \typ{INT}.
+
+\fun{GEN}{nv_fromdigits_2k}{GEN x, long k} as \tet{fromdigits_2k}, but
+with $x$ being a \typ{VECSMALL} and each $x[i]$ being a \kbd{ulong}
+with $0\leq x[i] < 2^{\min\{k,\tet{BITS_IN_LONG}\}}$.  Here $k$ may be
+any positive \kbd{long}, and the $x[i]$ are regarded as $k$-bit
+integers by truncating or extending with zeroes.
+
 \subsec{Integer valuation}
 For integers $x$ and $p$, such that $x\neq 0$ and $|p| > 1$, we define
 $v_p(x)$ to be the largest integer exponent $e$ such that $p^e$ divides $x$.
diff --git a/src/basemath/arith2.c b/src/basemath/arith2.c
index 01726a0..1935132 100644
--- a/src/basemath/arith2.c
+++ b/src/basemath/arith2.c
@@ -1501,6 +1501,8 @@ fromdigits(GEN x, GEN B)
   if (typ(x)!=t_VEC || !RgV_is_ZV(x)) pari_err_TYPE("fromdigits",x);
   B = check_basis(B);
   if (lg(x)==1) { avma = av; return gen_0; }
+  if (Z_ispow2(B))
+    return fromdigits_2k(x, expi(B));
   x = vecreverse(x);
   return gerepileupto(av, gen_fromdigits(x, B, NULL, &Z_ring));
 }
diff --git a/src/basemath/bit.c b/src/basemath/bit.c
index 6e9e253..183472b 100644
--- a/src/basemath/bit.c
+++ b/src/basemath/bit.c
@@ -81,6 +81,117 @@ bits_to_u(GEN v, long l)
   return u;
 }
 
+/* set BITS_IN_LONG bits starting at word *w plus *r bits,
+ * clearing subsequent bits in the last word touched */
+INLINE void
+int_set_ulong(ulong a, GEN *w, long *r)
+{
+  if (*r) {
+    **w |= (a << *r);
+    *w = int_nextW(*w);
+    **w = (a >> (BITS_IN_LONG - *r));
+  } else {
+    **w = a;
+    *w = int_nextW(*w);
+  }
+}
+
+/* set k bits starting at word *w plus *r bits,
+ * clearing subsequent bits in the last word touched */
+INLINE void
+int_set_bits(ulong a, long k, GEN *w, long *r)
+{
+  if (*r) {
+    **w |= a << *r;
+    a >>= BITS_IN_LONG - *r;
+  } else {
+    **w = a;
+    a = 0;
+  }
+  *r += k;
+  if (*r >= BITS_IN_LONG) {
+    *w = int_nextW(*w);
+    *r -= BITS_IN_LONG;
+    for (; *r >= BITS_IN_LONG; *r -= BITS_IN_LONG) {
+      **w = a;
+      a = 0;
+      *w = int_nextW(*w);
+    }
+    if (*r)
+      **w = a;
+  }
+}
+
+/* set k bits from z (t_INT) starting at word *w plus *r bits,
+ * clearing subsequent bits in the last word touched */
+INLINE void
+int_set_int(GEN z, long k, GEN *w, long *r)
+{
+  long l = lgefint(z) - 2;
+  GEN y;
+  if (!l) {
+    int_set_bits(0, k, w, r);
+    return;
+  }
+  y = int_LSW(z);
+  for (; l > 1; l--) {
+    int_set_ulong((ulong) *y, w, r);
+    y = int_nextW(y);
+    k -= BITS_IN_LONG;
+  }
+  if (k)
+    int_set_bits((ulong) *y, k, w, r);
+}
+
+GEN
+nv_fromdigits_2k(GEN x, long k)
+{
+  long l = lg(x) - 1, r;
+  GEN w, z;
+  if (!l) return gen_0;
+  z = cgetipos(nbits2lg(k * l));
+  w = int_LSW(z);
+  r = 0;
+  for (; l; l--)
+    int_set_bits(uel(x, l), k, &w, &r);
+  return int_normalize(z, 0);
+}
+
+GEN
+fromdigits_2k(GEN x, long k)
+{
+  long l, m;
+  GEN w, y, z;
+  for (l = lg(x) - 1; l && !signe(gel(x, 1)); x++, l--);
+  if (k == 1) return bits_to_int(x, l);
+  if (!l) return gen_0;
+  m = expi(gel(x, 1)) + 1;
+  z = cgetipos(nbits2lg(k * (l - 1) + m));
+  w = int_LSW(z);
+  if (!(k & (BITS_IN_LONG - 1))) {
+    long i, j, t = k >> TWOPOTBITS_IN_LONG;
+    for (; l; l--) {
+      j = lgefint(gel(x, l)) - 2;
+      y = int_LSW(gel(x, l));
+      for (i = 0; i < j; i++) {
+	*w = *y;
+	y = int_nextW(y);
+	w = int_nextW(w);
+      }
+      for (; i < t; i++) {
+	*w = 0;
+	w = int_nextW(w);
+      }
+    }
+  } else {
+    long r = 0;
+    for (; l > 1; l--)
+      int_set_int(gel(x, l), k, &w, &r);
+    int_set_int(gel(x, 1), m, &w, &r);
+  }
+  return int_normalize(z, 0);
+}
+
 GEN
 binaire(GEN x)
 {
@@ -139,41 +250,101 @@ binaire(GEN x)
   return y;
 }
 
+/* extract k bits (as ulong) starting at word *w plus *r bits */
+INLINE ulong
+int_get_bits(long k, GEN *w, long *r)
+{
+  ulong mask = (1UL << k) - 1;
+  ulong a = (((ulong) **w) >> *r) & mask;
+  *r += k;
+  if (*r >= BITS_IN_LONG) {
+    *r -= BITS_IN_LONG;
+    *w = int_nextW(*w);
+    if (*r)
+      a |= (**w << (k - *r)) & mask;
+  }
+  return a;
+}
+
+/* extract BITS_IN_LONG bits starting at word *w plus *r bits */
+INLINE ulong
+int_get_ulong(GEN *w, long *r)
+{
+  ulong a = ((ulong) **w) >> *r;
+  *w = int_nextW(*w);
+  if (*r)
+    a |= (**w << (BITS_IN_LONG - *r));
+  return a;
+}
+
+/* extract k bits (as t_INT) starting at word *w plus *r bits */
+INLINE GEN
+int_get_int(long k, GEN *w, long *r)
+{
+  GEN z = cgetipos(nbits2lg(k));
+  GEN y = int_LSW(z);
+  for (; k >= BITS_IN_LONG; k -= BITS_IN_LONG) {
+    *y = int_get_ulong(w, r);
+    y = int_nextW(y);
+  }
+  if (k)
+    *y = int_get_bits(k, w, r);
+  return int_normalize(z, 0);
+}
+
 /* assume k < BITS_IN_LONG */
 GEN
 binary_2k_zv(GEN x, long k)
 {
-  long iv, j, n, nmodk, nk;
-  GEN v, vk;
+  long l, n, r;
+  GEN v, w;
   if (k == 1) return binary_zv(x);
-  if (!signe(x)) return cgetg(1,t_VECSMALL);
-  v = binary_zv(x);
-  n = lg(v)-1;
-  nk = n / k; nmodk = n % k;
-  if (nmodk) nk++;
-  vk = cgetg(nk+1, t_VECSMALL);
-  iv = n - k;
-  if (!nmodk) nmodk = k;
-  for (j = nk; j >= 2; j--,iv-=k) vk[j] = bits_to_u(v+iv, k);
-  vk[1] = bits_to_u(v,nmodk);
-  return vk;
+  if (!signe(x)) return cgetg(1, t_VECSMALL);
+  n = expi(x) + 1;
+  l = (n + k - 1) / k;
+  v = cgetg(l + 1, t_VECSMALL);
+  w = int_LSW(x);
+  r = 0;
+  for (; l > 1; l--) {
+    uel(v, l) = int_get_bits(k, &w, &r);
+    n -= k;
+  }
+  uel(v, 1) = int_get_bits(n, &w, &r);
+  return v;
 }
+
 GEN
 binary_2k(GEN x, long k)
 {
-  long iv, j, n, nmodk, nk;
-  GEN v, vk;
-  if (!signe(x)) return cgetg(1,t_VEC);
-  v = binary_zv(x);
-  n = lg(v)-1;
-  nk = n / k; nmodk = n % k;
-  if (nmodk) nk++;
-  vk = cgetg(nk+1, t_VEC);
-  iv = n - k;
-  if (!nmodk) nmodk = k;
-  for (j = nk; j >= 2; j--,iv-=k) gel(vk,j) = bits_to_int(v+iv, k);
-  gel(vk,1) = bits_to_int(v, nmodk);
-  return vk;
+  long l, n;
+  GEN v, w, y, z;
+  if (k == 1) return binaire(x);
+  if (!signe(x)) return cgetg(1, t_VEC);
+  n = expi(x) + 1;
+  l = (n + k - 1) / k;
+  v = cgetg(l + 1, t_VEC);
+  w = int_LSW(x);
+  if (!(k & (BITS_IN_LONG - 1))) {
+    long m, t = k >> TWOPOTBITS_IN_LONG, u = lgefint(x) - 2;
+    for (; l; l--) {
+      m = minss(t, u);
+      z = cgetipos(m + 2);
+      y = int_LSW(z);
+      for (; m; m--) {
+	*y = *w;
+	y = int_nextW(y);
+	w = int_nextW(w);
+      }
+      gel(v, l) = int_normalize(z, 0);
+      u -= t;
+    }
+  } else {
+    long r = 0;
+    for (; l > 1; l--, n -= k)
+      gel(v, l) = int_get_int(k, &w, &r);
+    gel(v, 1) = int_get_int(n, &w, &r);
+  }
+  return v;
 }
 
 /* return 1 if bit n of x is set, 0 otherwise */
diff --git a/src/headers/paridecl.h b/src/headers/paridecl.h
index d4a4743..514745b 100644
--- a/src/headers/paridecl.h
+++ b/src/headers/paridecl.h
@@ -2100,6 +2100,8 @@ GEN     binary_zv(GEN x);
 GEN     binary_2k(GEN x, long k);
 GEN     binary_2k_zv(GEN x, long k);
 long    bittest(GEN x, long n);
+GEN     fromdigits_2k(GEN x, long k);
+GEN     nv_fromdigits_2k(GEN x, long k);
 GEN     gbitand(GEN x, GEN y);
 GEN     gbitneg(GEN x, long n);
 GEN     gbitnegimply(GEN x, GEN y);
diff --git a/src/basemath/Flx.c b/src/basemath/Flx.c
index 2fb5d56..d5eb2be 100644
--- a/src/basemath/Flx.c
+++ b/src/basemath/Flx.c
@@ -4229,6 +4229,18 @@ kron_pack_Flx_spec_3(GEN x, long l) {
 }
 
 static GEN
+kron_pack_Flx_spec_bits(GEN x, long b, long l) {
+  GEN y;
+  long i;
+  if (l == 0)
+    return gen_0;
+  y = cgetg(l + 1, t_VECSMALL);
+  for(i = 1; i <= l; i++)
+    y[i] = x[l - i];
+  return nv_fromdigits_2k(y, b);
+}
+
+static GEN
 kron_unpack_Flx(GEN z, ulong p)
 {
   long i, l = lgefint(z);
@@ -4250,6 +4262,37 @@ kron_unpack_Flx_3(GEN x, ulong p) {
   return Z_mod2BIL_Flx_3(x, d, p);
 }
 
+/* assume b < BITS_IN_LONG */
+static GEN
+kron_unpack_Flx_bits_narrow(GEN z, long b, ulong p) {
+  GEN v = binary_2k_zv(z, b), x;
+  long i, l = lg(v) + 1;
+  x = cgetg(l, t_VECSMALL);
+  for (i = 2; i < l; i++)
+    x[i] = v[l - i] % p;
+  return Flx_renormalize(x, l);
+}
+
+static GEN
+kron_unpack_Flx_bits_wide(GEN z, long b, ulong p, ulong pi) {
+  GEN v = binary_2k(z, b), x, y;
+  long i, l = lg(v) + 1, ly;
+  x = cgetg(l, t_VECSMALL);
+  for (i = 2; i < l; i++) {
+    y = gel(v, l - i);
+    ly = lgefint(y);
+    switch (ly) {
+    case 2: x[i] = 0; break;
+    case 3: x[i] = *int_W_lg(y, 0, ly) % p; break;
+    case 4: x[i] = remll_pre(*int_W_lg(y, 1, ly), *int_W_lg(y, 0, ly), p, pi); break;
+    case 5: x[i] = remlll_pre(*int_W_lg(y, 2, ly), *int_W_lg(y, 1, ly),
+			      *int_W_lg(y, 0, ly), p, pi); break;
+    default: x[i] = umodiu(gel(v, l - i), p);
+    }
+  }
+  return Flx_renormalize(x, l);
+}
+
 static GEN
 FlxM_pack_ZM(GEN M, GEN (*pack)(GEN, long)) {
   long i, j, l, lc;
@@ -4268,6 +4311,24 @@ FlxM_pack_ZM(GEN M, GEN (*pack)(GEN, long)) {
 }
 
 static GEN
+FlxM_pack_ZM_bits(GEN M, long b)
+{
+  long i, j, l, lc;
+  GEN N = cgetg_copy(M, &l), x;
+  if (l == 1)
+    return N;
+  lc = lgcols(M);
+  for (j = 1; j < l; j++) {
+    gel(N, j) = cgetg(lc, t_COL);
+    for (i = 1; i < lc; i++) {
+      x = gcoeff(M, i, j);
+      gcoeff(N, i, j) = kron_pack_Flx_spec_bits(x + 2, b, lgpol(x));
+    }
+  }
+  return N;
+}
+
+static GEN
 ZM_unpack_FlxqM(GEN M, GEN T, ulong p, GEN (*unpack)(GEN, ulong))
 {
   long i, j, l, lc, sv = get_Flx_var(T);
@@ -4286,44 +4347,82 @@ ZM_unpack_FlxqM(GEN M, GEN T, ulong p, GEN (*unpack)(GEN, ulong))
   return N;
 }
 
+static GEN
+ZM_unpack_FlxqM_bits(GEN M, long b, GEN T, ulong p)
+{
+  long i, j, l, lc, sv = get_Flx_var(T);
+  GEN N = cgetg_copy(M, &l), x;
+  if (l == 1)
+    return N;
+  lc = lgcols(M);
+  if (b < BITS_IN_LONG) {
+    for (j = 1; j < l; j++) {
+      gel(N, j) = cgetg(lc, t_COL);
+      for (i = 1; i < lc; i++) {
+	x = kron_unpack_Flx_bits_narrow(gcoeff(M, i, j), b, p);
+	x[1] = sv;
+	gcoeff(N, i, j) = Flx_rem(x, T, p);
+      }
+    }
+  } else {
+    ulong pi = get_Fl_red(p);
+    for (j = 1; j < l; j++) {
+      gel(N, j) = cgetg(lc, t_COL);
+      for (i = 1; i < lc; i++) {
+	x = kron_unpack_Flx_bits_wide(gcoeff(M, i, j), b, p, pi);
+	x[1] = sv;
+	gcoeff(N, i, j) = Flx_rem(x, T, p);
+      }
+    }
+  }
+  return N;
+}
+
 GEN
 FlxqM_mul_Kronecker(GEN A, GEN B, GEN T, ulong p)
 {
   pari_sp av = avma;
-  long l, n = lg(A) - 1;
+  long b, d = degpol(T), n = lg(A) - 1;
   GEN C, D, z;
   GEN (*pack)(GEN, long), (*unpack)(GEN, ulong);
   int is_sqr = A==B;
 
-  /*
-    cf. Flx_mulspec(), maxlengthcoeffpol()
-    z  can be 1, 2, 3 or (theoretically) 4 words long
-  */
-  z = muliu(muliu(sqru(p - 1), degpol(T)), n);
-  l = lgefint(z);
+  z = muliu(muliu(sqru(p - 1), d), n);
+  b = expi(z) + 1;
+  /* only do expensive bit-packing if it saves at least 1 limb */
+  if (b <= BITS_IN_HALFULONG) {
+    if (nbits2lg(d*b) - 2 == (d + 1)/2)
+      b = BITS_IN_HALFULONG;
+  } else {
+    long l = lgefint(z) - 2;
+    if (nbits2lg(d*b) - 2 == d*l)
+      b = l*BITS_IN_LONG;
+  }
   avma = av;
 
-  switch (l - 2) {
-  case 1:
-    if (HIGHWORD(z[2]) == 0) {
-      pack = kron_pack_Flx_spec_half;
-      unpack = int_to_Flx_half;
-    } else {
-      pack = kron_pack_Flx_spec;
-      unpack = kron_unpack_Flx;
-    }
+  switch (b) {
+  case BITS_IN_HALFULONG:
+    pack = kron_pack_Flx_spec_half;
+    unpack = int_to_Flx_half;
     break;
-  case 2:
+  case BITS_IN_LONG:
+    pack = kron_pack_Flx_spec;
+    unpack = kron_unpack_Flx;
+    break;
+  case 2*BITS_IN_LONG:
     pack = kron_pack_Flx_spec_2;
     unpack = kron_unpack_Flx_2;
     break;
-  case 3:
+  case 3*BITS_IN_LONG:
     pack = kron_pack_Flx_spec_3;
     unpack = kron_unpack_Flx_3;
     break;
   default:
-    /* not implemented, probably won't occur in practice */
-    return NULL;
+    A = FlxM_pack_ZM_bits(A, b);
+    B = is_sqr? A: FlxM_pack_ZM_bits(B, b);
+    C = ZM_mul(A, B);
+    D = ZM_unpack_FlxqM_bits(C, b, T, p);
+    return gerepilecopy(av, D);
   }
   A = FlxM_pack_ZM(A, pack);
   B = is_sqr? A: FlxM_pack_ZM(B, pack);
diff --git a/src/basemath/alglin1.c b/src/basemath/alglin1.c
index 383f2eb..e3bab48 100644
--- a/src/basemath/alglin1.c
+++ b/src/basemath/alglin1.c
@@ -981,11 +981,8 @@ FlxqM_mul(GEN A, GEN B, GEN T, ulong p) {
 
   if (n == 0)
     return cgetg(1, t_MAT);
-  if (n > 1) {
-    GEN C = FlxqM_mul_Kronecker(A, B, T, p);
-    if (C != NULL)
-      return C;
-  }
+  if (n > 1)
+    return FlxqM_mul_Kronecker(A, B, T, p);
   ff = get_Flxq_field(&E, T, p);
   return gen_matmul(A, B, E, ff);
 }