Peter Bruin on Sat, 07 Feb 2015 13:22:20 +0100


[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Date Index] [Thread Index]

FlxqM_mul_Kronecker


Bonjour,

Here is a patch to add a new (private) function FlxqM_mul_Kronecker that
I have been working on since the Atelier PARI/GP in January.  It uses
Kronecker substitution to multiply matrices over non-prime finite fields
of small characteristics.  Over the range of matrices I tested it on, it
is on average 60%-80% faster than the existing FlxqM_mul; for some
examples, the speedup is up to 4-5 times.

When multiplying m x n matrices by n x k matrices, the speedup is
already noticeable (up to 20%) when n >= 2 (again on average over a
range of reasonable field sizes; in a small number of "unlucky" cases,
it is up to 20% slower because it has to use larger integers than
Flx_mulspec because several products must be added).

The new FlxqM_mul_Kronecker is called by FlxqM_mul (and hence by FFM_mul
and gmul) precisely when n >= 2.  All new functions are covered by the
new tests.

Thanks,

Peter


diff --git a/src/basemath/Flx.c b/src/basemath/Flx.c
index f0ddc04..0b7e536 100644
--- a/src/basemath/Flx.c
+++ b/src/basemath/Flx.c
@@ -645,9 +645,32 @@ Flx_mulspec_basecase(GEN x, GEN y, ulong p, long nx, long ny)
 }
 
 static GEN
+Flx_to_int_spec(GEN x, long l) {
+  long i;
+  GEN w, y;
+  if (l == 0)
+    return gen_0;
+  y = cgetipos(l + 2);
+  for (i = 0, w = int_LSW(y); i < l; i++, w = int_nextW(w))
+    *w = x[i];
+  return y;
+}
+
+static GEN
 int_to_Flx(GEN z, ulong p)
 {
   long i, l = lgefint(z);
+  GEN x = cgetg(l, t_VECSMALL), w;
+  for (w = int_LSW(z), i = 2; i < l; w = int_nextW(w), i++)
+    x[i] = ((ulong) *w) % p;
+  return Flx_renormalize(x, l);
+}
+
+/* z has least significant word first */
+static GEN
+int_LSWfirst_to_Flx(GEN z, ulong p)
+{
+  long i, l = lgefint(z);
   GEN x = cgetg(l, t_VECSMALL);
   for (i=2; i<l; i++) x[i] = uel(z,i)%p;
   return Flx_renormalize(x, l);
@@ -657,7 +680,7 @@ INLINE GEN
 Flx_mulspec_mulii(GEN a, GEN b, ulong p, long na, long nb)
 {
   GEN z=muliispec(a,b,na,nb);
-  return int_to_Flx(z,p);
+  return int_LSWfirst_to_Flx(z,p);
 }
 
 static GEN
@@ -888,7 +911,7 @@ static GEN
 Flx_sqrspec_sqri(GEN a, ulong p, long na)
 {
   GEN z=sqrispec(a,na);
-  return int_to_Flx(z,p);
+  return int_LSWfirst_to_Flx(z,p);
 }
 
 static GEN
@@ -3968,6 +3991,113 @@ FlxqV_roots_to_pol(GEN V, GEN T, ulong p, long v)
   return gerepileupto(ltop, FlxqXV_prod(W, T, p));
 }
 
+/*** FlxqM ***/
+
+static GEN
+zx_to_int_halfspec(GEN x, long l) {
+  if (l == 0)
+    return gen_0;
+  return Flx_to_int_halfspec(x, l);
+}
+
+static GEN
+zx_to_int_spec_2(GEN x, long l) {
+  return Flx_eval2BILspec(x, 2, l);
+}
+
+static GEN
+zx_to_int_spec_3(GEN x, long l) {
+  return Flx_eval2BILspec(x, 3, l);
+}
+
+static GEN
+int_to_Flx_2(GEN x, ulong p) {
+  long d = (lgefint(x)-1)/2 - 1;
+  return Z_mod2BIL_Flx_2(x, d, p);
+}
+
+static GEN
+int_to_Flx_3(GEN x, ulong p) {
+  long d = lgefint(x)/3 - 1;
+  return Z_mod2BIL_Flx_3(x, d, p);
+}
+
+static GEN
+FlxM_to_ZM(GEN M, GEN (*pack)(GEN, long)) {
+  long i, j, l, lc;
+  GEN N = cgetg_copy(M, &l), x;
+  if (l == 1)
+    return N;
+  lc = lgcols(M);
+  for (j = 1; j < l; j++) {
+    gel(N, j) = cgetg(lc, t_COL);
+    for (i = 1; i < lc; i++) {
+      x = gcoeff(M, i, j);
+      gcoeff(N, i, j) = pack(x + 2, lgpol(x));
+    }
+  }
+  return N;
+}
+
+static GEN
+ZM_to_FlxqM(GEN M, GEN T, ulong p,
+	    GEN (*unpack)(GEN, ulong)) {
+  long i, j, l, lc, sv = evalvarn(get_Flx_var(T));
+  GEN N = cgetg_copy(M, &l), x;
+  if (l == 1)
+    return N;
+  lc = lgcols(M);
+  for (j = 1; j < l; j++) {
+    gel(N, j) = cgetg(lc, t_COL);
+    for (i = 1; i < lc; i++) {
+      x = unpack(gcoeff(M, i, j), p);
+      x[1] = sv;
+      gcoeff(N, i, j) = Flx_rem(x, T, p);
+    }
+  }
+  return N;
+}
+
+GEN
+FlxqM_mul_Kronecker(GEN A, GEN B, GEN T, ulong p) {
+  pari_sp av = avma;
+  long l, n = lg(A) - 1;
+  GEN C, D, z;
+  GEN (*pack)(GEN, long), (*unpack)(GEN, ulong);
+
+  /*
+    cf. Flx_mulspec(), maxlengthcoeffpol()
+    z  can be 1, 2, 3 or (theoretically) 4 words long
+  */
+  z = muliu(muliu(sqru(p - 1), degpol(T)), n);
+  l = lgefint(z);
+  avma = av;
+
+  if (l == 3) {
+    if (HIGHWORD(z[2]) == 0) {
+      pack = zx_to_int_halfspec;
+      unpack = int_to_Flx_half;
+    } else {
+      pack = Flx_to_int_spec;
+      unpack = int_to_Flx;
+    }
+  } else if (l == 4) {
+    pack = zx_to_int_spec_2;
+    unpack = int_to_Flx_2;
+  } else if (l == 5) {
+    pack = zx_to_int_spec_3;
+    unpack = int_to_Flx_3;
+  } else {
+    /* not implemented, probably won't occur in practice */
+    return NULL;
+  }
+  A = FlxM_to_ZM(A, pack);
+  B = FlxM_to_ZM(B, pack);
+  C = ZM_mul(A, B);
+  D = ZM_to_FlxqM(C, T, p, unpack);
+  return gerepilecopy(av, D);
+}
+
 /*******************************************************************/
 /*                                                                 */
 /*                       (Fl[X]/T(X))[Y] / S(Y)                    */
diff --git a/src/basemath/alglin1.c b/src/basemath/alglin1.c
index f2976cb..229f6f4 100644
--- a/src/basemath/alglin1.c
+++ b/src/basemath/alglin1.c
@@ -976,7 +976,17 @@ FlxqM_FlxqC_mul(GEN A, GEN B, GEN T, unsigned long p) {
 GEN
 FlxqM_mul(GEN A, GEN B, GEN T, unsigned long p) {
   void *E;
-  const struct bb_field *ff = get_Flxq_field(&E, T, p);
+  const struct bb_field *ff;
+  long n = lg(A) - 1;
+
+  if (n == 0)
+    return cgetg(1, t_MAT);
+  if (n > 1) {
+    GEN C = FlxqM_mul_Kronecker(A, B, T, p);
+    if (C != NULL)
+      return C;
+  }
+  ff = get_Flxq_field(&E, T, p);
   return gen_matmul(A, B, E, ff);
 }
 
diff --git a/src/headers/paripriv.h b/src/headers/paripriv.h
index 4b0677e..dbcfc10 100644
--- a/src/headers/paripriv.h
+++ b/src/headers/paripriv.h
@@ -562,6 +562,10 @@ void    pari_close_files(void);
 int     popinfile(void);
 pariFILE* try_pipe(const char *cmd, int flag);
 
+/* Flx.c */
+
+GEN FlxqM_mul_Kronecker(GEN A, GEN B, GEN T, ulong p);
+
 /* Flxq_log.c */
 
 GEN Flxq_log_index(GEN a0, GEN b0, GEN m, GEN T0, ulong p);
diff --git a/src/test/32/ff b/src/test/32/ff
index 2d90cb3..8a57055 100644
--- a/src/test/32/ff
+++ b/src/test/32/ff
@@ -319,6 +319,15 @@ t^4 + t^2
 [3*t^2 + 3*t, 18446744073709551628*t^4 + 5*t^3 + 4*t^2 + 1844674407370955162
 7*t + 18446744073709551628, 18446744073709551628*t^4 + 5*t^3 + 4*t^2 + 18446
 744073709551628*t]~
+? test(q)=my(t=ffgen(q,'t),M=matrix(10,10,i,j,random(t)));subst(charpoly(M),'x,M)==0;
+? test(nextprime(2^7)^5)
+1
+? test(nextprime(2^15)^5)
+1
+? test(nextprime(2^31)^5)
+1
+? test(nextprime(2^63)^5)
+1
 ? p=2^64+13;g=ffprimroot(ffgen(p^2),&o);a=2*g^0;
 ? v=[I,-1,Mat(1),matid(2)/2];
 ? for(i=1,#v,print(iferr(fflog(a,g,v[i]),E,E)));
diff --git a/src/test/in/ff b/src/test/in/ff
index 6b47f60..c4b5b28 100644
--- a/src/test/in/ff
+++ b/src/test/in/ff
@@ -106,6 +106,15 @@ test(2^5)
 test(7^5)
 test((2^64+13)^5)
 
+test(q)={
+  my(t = ffgen(q, 't), M = matrix(10, 10, i, j, random(t)));
+  subst(charpoly(M), 'x, M) == 0;
+}
+test(nextprime(2^7)^5)
+test(nextprime(2^15)^5)
+test(nextprime(2^31)^5)
+test(nextprime(2^63)^5)
+
 p=2^64+13; g=ffprimroot(ffgen(p^2), &o); a=2*g^0;
 v=[I,-1,Mat(1),matid(2)/2];
 for(i=1,#v, print(iferr(fflog(a,g,v[i]),E,E)));