Peter Bruin on Tue, 08 Sep 2015 11:47:12 +0200


[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Date Index] [Thread Index]

Speed up {Flx,FpX,FpXQX}_divrem_basecase for suitable polynomials


Bonjour,

When computing with non-prime finite fields, one is often free to choose
a defining polynomial f (of some given degree n) over the prime field.
If we write f in the form c*X^n + f1 with deg(f1) < n, then for division
with remainder modulo f, it "should be" best to take deg(f1) as small as
possible.  However, the current code for division with remainder in PARI
does not yet give an advantage in that case.

The attached patch modifies the functions {Flx,FpX,FpXQX}_divrem_basecase
and Flx_rem_basecase in order to reduce the complexity of computing the
(quotient and) remainder of g by f from O((deg g - deg f)(deg f)) to
O((deg g - deg f)(deg f1)).

It passes the test suite and gives a noticeable speedup in my code for
computing with curves over finite fields.  For example, with this patch,
multiplying two 10 x 10 matrices over a field of size 59^100 becomes
about 25% faster when choosing a polynomial of the form x^100 + O(x^3)
instead of an arbitrary polynomial of degree 100 over F_59; previously
there was no visible difference.

Thanks,

Peter


diff --git a/src/basemath/Flx.c b/src/basemath/Flx.c
index 9387df7..8d7fe24 100644
--- a/src/basemath/Flx.c
+++ b/src/basemath/Flx.c
@@ -1179,7 +1179,7 @@ Flx_rem_basecase(GEN x, GEN y, ulong p)
 {
   pari_sp av;
   GEN z, c;
-  long dx,dy,dz,i,j;
+  long dx,dy,dy1,dz,i,j;
   ulong p1,inv;
   long vs=x[1];
 
@@ -1189,6 +1189,7 @@ Flx_rem_basecase(GEN x, GEN y, ulong p)
   x += 2; y += 2;
   inv = y[dy];
   if (inv != 1UL) inv = Fl_inv(inv,p);
+  for (dy1=dy-1; dy1>=0 && !y[dy1]; dy1--);
 
   c = cgetg(dy+3, t_VECSMALL); c[1]=vs; c += 2; av=avma;
   z = cgetg(dz+3, t_VECSMALL); z[1]=vs; z += 2;
@@ -1199,7 +1200,7 @@ Flx_rem_basecase(GEN x, GEN y, ulong p)
     for (i=dx-1; i>=dy; --i)
     {
       p1 = p - x[i]; /* compute -p1 instead of p1 (pb with ulongs otherwise) */
-      for (j=i-dy+1; j<=i && j<=dz; j++)
+      for (j=i-dy1; j<=i && j<=dz; j++)
       {
         p1 += z[j]*y[i-j];
         if (p1 & HIGHBIT) p1 %= p;
@@ -1210,7 +1211,7 @@ Flx_rem_basecase(GEN x, GEN y, ulong p)
     for (i=0; i<dy; i++)
     {
       p1 = z[0]*y[i];
-      for (j=1; j<=i && j<=dz; j++)
+      for (j=maxss(1,i-dy1); j<=i && j<=dz; j++)
       {
         p1 += z[j]*y[i-j];
         if (p1 & HIGHBIT) p1 %= p;
@@ -1225,14 +1226,14 @@ Flx_rem_basecase(GEN x, GEN y, ulong p)
     for (i=dx-1; i>=dy; --i)
     {
       p1 = p - x[i]; /* compute -p1 instead of p1 (pb with ulongs otherwise) */
-      for (j=i-dy+1; j<=i && j<=dz; j++)
+      for (j=i-dy1; j<=i && j<=dz; j++)
         p1 = Fl_addmul_pre(z[j], y[i-j], p1, p, pi);
       z[i-dy] = p1? Fl_mul_pre(p - p1, inv, p, pi): 0;
     }
     for (i=0; i<dy; i++)
     {
       p1 = Fl_mul_pre(z[0],y[i],p,pi);
-      for (j=1; j<=i && j<=dz; j++)
+      for (j=maxss(1,i-dy1); j<=i && j<=dz; j++)
         p1 = Fl_addmul_pre(z[j],y[i-j],p1, p,pi);
       c[i] = Fl_sub(x[i], p1, p);
     }
@@ -1248,7 +1249,7 @@ static GEN
 Flx_divrem_basecase(GEN x, GEN y, ulong p, GEN *pr)
 {
   GEN z,q,c;
-  long dx,dy,dz,i,j;
+  long dx,dy,dy1,dz,i,j;
   ulong p1,inv;
   long sv=x[1];
 
@@ -1274,6 +1275,7 @@ Flx_divrem_basecase(GEN x, GEN y, ulong p, GEN *pr)
   z = cgetg(dz + 3, t_VECSMALL); z[1] = sv; z += 2;
   inv = uel(y, dy);
   if (inv != 1UL) inv = Fl_inv(inv,p);
+  for (dy1=dy-1; dy1>=0 && !y[dy1]; dy1--);
 
   if (SMALL_ULONG(p))
   {
@@ -1281,7 +1283,7 @@ Flx_divrem_basecase(GEN x, GEN y, ulong p, GEN *pr)
     for (i=dx-1; i>=dy; --i)
     {
       p1 = p - x[i]; /* compute -p1 instead of p1 (pb with ulongs otherwise) */
-      for (j=i-dy+1; j<=i && j<=dz; j++)
+      for (j=i-dy1; j<=i && j<=dz; j++)
       {
         p1 += z[j]*y[i-j];
         if (p1 & HIGHBIT) p1 %= p;
@@ -1296,7 +1298,7 @@ Flx_divrem_basecase(GEN x, GEN y, ulong p, GEN *pr)
     for (i=dx-1; i>=dy; --i)
     { /* compute -p1 instead of p1 (pb with ulongs otherwise) */
       p1 = p - uel(x,i);
-      for (j=i-dy+1; j<=i && j<=dz; j++)
+      for (j=i-dy1; j<=i && j<=dz; j++)
         p1 = Fl_add(p1, Fl_mul(z[j],y[i-j],p), p);
       z[i-dy] = p1? Fl_mul(p - p1, inv, p): 0;
     }
@@ -1310,7 +1312,7 @@ Flx_divrem_basecase(GEN x, GEN y, ulong p, GEN *pr)
     for (i=0; i<dy; i++)
     {
       p1 = (ulong)z[0]*y[i];
-      for (j=1; j<=i && j<=dz; j++)
+      for (j=maxss(1,i-dy1); j<=i && j<=dz; j++)
       {
         p1 += (ulong)z[j]*y[i-j];
         if (p1 & HIGHBIT) p1 %= p;
@@ -1323,7 +1325,7 @@ Flx_divrem_basecase(GEN x, GEN y, ulong p, GEN *pr)
     for (i=0; i<dy; i++)
     {
       p1 = Fl_mul(z[0],y[i],p);
-      for (j=1; j<=i && j<=dz; j++)
+      for (j=maxss(1,i-dy1); j<=i && j<=dz; j++)
         p1 = Fl_add(p1, Fl_mul(z[j],y[i-j],p), p);
       c[i] = Fl_sub(x[i], p1, p);
     }
diff --git a/src/basemath/FpX.c b/src/basemath/FpX.c
index f601efe..1c5a1dc 100644
--- a/src/basemath/FpX.c
+++ b/src/basemath/FpX.c
@@ -350,7 +350,7 @@ FpX_halve(GEN y, GEN p)
 static GEN
 FpX_divrem_basecase(GEN x, GEN y, GEN p, GEN *pr)
 {
-  long vx, dx, dy, dz, i, j, sx, lr;
+  long vx, dx, dy, dy1, dz, i, j, sx, lr;
   pari_sp av0, av;
   GEN z,p1,rem,lead;
 
@@ -400,13 +400,14 @@ FpX_divrem_basecase(GEN x, GEN y, GEN p, GEN *pr)
   avma = av0;
   z=cgetg(dz+3,t_POL); z[1] = x[1];
   x += 2; y += 2; z += 2;
+  for (dy1=dy-1; dy1>=0 && !signe(gel(y, dy1)); dy1--);
 
   p1 = gel(x,dx); av = avma;
   gel(z,dz) = lead? gerepileuptoint(av, Fp_mul(p1,lead, p)): icopy(p1);
   for (i=dx-1; i>=dy; i--)
   {
     av=avma; p1=gel(x,i);
-    for (j=i-dy+1; j<=i && j<=dz; j++)
+    for (j=i-dy1; j<=i && j<=dz; j++)
       p1 = subii(p1, mulii(gel(z,j),gel(y,i-j)));
     if (lead) p1 = mulii(p1,lead);
     gel(z,i-dy) = gerepileuptoint(av,modii(p1, p));
@@ -417,7 +418,7 @@ FpX_divrem_basecase(GEN x, GEN y, GEN p, GEN *pr)
   for (sx=0; ; i--)
   {
     p1 = gel(x,i);
-    for (j=0; j<=i && j<=dz; j++)
+    for (j=maxss(0,i-dy1); j<=i && j<=dz; j++)
       p1 = subii(p1, mulii(gel(z,j),gel(y,i-j)));
     p1 = modii(p1,p); if (signe(p1)) { sx = 1; break; }
     if (!i) break;
@@ -437,7 +438,7 @@ FpX_divrem_basecase(GEN x, GEN y, GEN p, GEN *pr)
   for (i--; i>=0; i--)
   {
     av=avma; p1 = gel(x,i);
-    for (j=0; j<=i && j<=dz; j++)
+    for (j=maxss(0,i-dy1); j<=i && j<=dz; j++)
       p1 = subii(p1, mulii(gel(z,j),gel(y,i-j)));
     gel(rem,i) = gerepileuptoint(av, modii(p1,p));
   }
diff --git a/src/basemath/FpXX.c b/src/basemath/FpXX.c
index ee6b8f3..2c17c4f 100644
--- a/src/basemath/FpXX.c
+++ b/src/basemath/FpXX.c
@@ -321,7 +321,7 @@ FpXQX_FpXQ_mul(GEN P, GEN U, GEN T, GEN p)
 static GEN
 FpXQX_divrem_basecase(GEN x, GEN y, GEN T, GEN p, GEN *pr)
 {
-  long vx, dx, dy, dz, i, j, sx, lr;
+  long vx, dx, dy, dy1, dz, i, j, sx, lr;
   pari_sp av0, av, tetpil;
   GEN z,p1,rem,lead;
 
@@ -373,13 +373,14 @@ FpXQX_divrem_basecase(GEN x, GEN y, GEN T, GEN p, GEN *pr)
   avma = av0;
   z = cgetg(dz+3,t_POL); z[1] = x[1];
   x += 2; y += 2; z += 2;
+  for (dy1=dy-1; dy1>=0 && !signe(gel(y, dy1)); dy1--);
 
   p1 = gel(x,dx); av = avma;
   gel(z,dz) = lead? gerepileupto(av, Fq_mul(p1,lead, T, p)): gcopy(p1);
   for (i=dx-1; i>=dy; i--)
   {
     av=avma; p1=gel(x,i);
-    for (j=i-dy+1; j<=i && j<=dz; j++)
+    for (j=i-dy1; j<=i && j<=dz; j++)
       p1 = Fq_sub(p1, Fq_mul(gel(z,j),gel(y,i-j),NULL,p),NULL,p);
     if (lead) p1 = Fq_mul(p1, lead, NULL,p);
     tetpil=avma; gel(z,i-dy) = gerepile(av,tetpil,Fq_red(p1,T,p));
@@ -390,7 +391,7 @@ FpXQX_divrem_basecase(GEN x, GEN y, GEN T, GEN p, GEN *pr)
   for (sx=0; ; i--)
   {
     p1 = gel(x,i);
-    for (j=0; j<=i && j<=dz; j++)
+    for (j=maxss(0,i-dy1); j<=i && j<=dz; j++)
       p1 = Fq_sub(p1, Fq_mul(gel(z,j),gel(y,i-j),NULL,p),NULL,p);
     tetpil=avma; p1 = Fq_red(p1, T, p); if (signe(p1)) { sx = 1; break; }
     if (!i) break;
@@ -410,7 +411,7 @@ FpXQX_divrem_basecase(GEN x, GEN y, GEN T, GEN p, GEN *pr)
   for (i--; i>=0; i--)
   {
     av=avma; p1 = gel(x,i);
-    for (j=0; j<=i && j<=dz; j++)
+    for (j=maxss(0,i-dy1); j<=i && j<=dz; j++)
       p1 = Fq_sub(p1, Fq_mul(gel(z,j),gel(y,i-j), NULL,p), NULL,p);
     tetpil=avma; gel(rem,i) = gerepile(av,tetpil, Fq_red(p1, T, p));
   }