Jean-Pierre Flori on Thu, 14 Jan 2016 13:33:11 +0100


[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Date Index] [Thread Index]

Re: Faster exponentiation in some extensions of finite fields of small characteristics


Should be better now.

2016-01-14 13:22 GMT+01:00 Jean-Pierre Flori <jpflori@gmail.com>:
> It seems it was an old buggy version of the patch.
> I'm trying to relocate the right commit.
>
> 2016-01-14 11:58 GMT+01:00 Jean-Pierre Flori <jpflori@gmail.com>:
>> Dear all,
>>
>> Here is a preliminary patch to speed up exponentiation in some
>> extensions of finite fields of small characteristics (< size of
>> machine word/2).
>>
>> It packs more coefficients of the polynomial representation over the
>> prime field into machine words when the finite field element is mapped
>> to a multiprecision integer (i.e. KS).
>> Two functions are added:
>> * one function which packs 4 coeffs into a machine word,
>> * one generic funciton which packs the coeffs as much as possible
>> possibly crossing machine words boundaries.
>>
>> I did not take care of adding new tuning parameters or smartly
>> choosing between the different functions, e.g. calling the
>> 4-coeffs-in-1-word function when the (product) coeff size is
>> BITS_IN_LONG/4-\epsilon might be more efficient than using the generic
>> function which does more complicated packing when the polynomial
>> degree is not large.
>>
>> I did not add (yet) optimal packing when the product coeffs are larger
>> than a machine word.
>>
>> I did not really check it is bug free.
>>
>> Best,
>> --
>> Jean-Pierre Flori
>
>
>
> --
> Jean-Pierre Flori



-- 
Jean-Pierre Flori
diff --git a/src/basemath/Flx.c b/src/basemath/Flx.c
index 715cb02..db56a8a 100644
--- a/src/basemath/Flx.c
+++ b/src/basemath/Flx.c
@@ -594,15 +594,11 @@ Flx_shiftip(pari_sp av, GEN x, long v)
   avma = (pari_sp)y; return y;
 }
 
+/* Return the maximal bitsize of coeffs. */
 INLINE long
 maxlengthcoeffpol(ulong p, long n)
 {
-  pari_sp ltop = avma;
-  GEN z = muliu(sqru(p-1), n);
-  long l = lgefint(z);
-  avma = ltop;
-  if (l==3 && HIGHWORD(z[2])==0) return 0;
-  return l-2;
+  return ceil((2*log(p-1)+log(n))/log(2));
 }
 
 INLINE ulong
@@ -708,6 +704,135 @@ Flx_mulspec_halfmulii(GEN a, GEN b, ulong p, long na, long nb)
   return int_to_Flx_half(z,p);
 }
 
+static GEN
+Flx_to_int_bitpack_word(GEN x, long s, long l)
+{
+  long i;long j=0;
+  long shift = 0;
+  long n = (l*s-1)/BITS_IN_LONG+1;
+  GEN V = cgetipos(2+n);
+  GEN w = int_LSW(V);
+  *w = 0;
+  for (i=0; i<l-1; i++)
+  {
+    *w |= x[i] << shift;
+    shift += s;
+    if (shift >= BITS_IN_LONG)
+    {
+      w=int_nextW(w);j++;
+      shift -= BITS_IN_LONG;
+      if (shift != 0)
+        *w = x[i] >> (s-shift);
+      else
+        *w = 0;
+    }
+  }
+  {
+    *w |= x[i] << shift;
+    shift += s;
+    if (shift > BITS_IN_LONG)
+    {
+      w=int_nextW(w);j++;
+      shift -= BITS_IN_LONG;
+      *w = x[i] >> (s-shift);
+    }
+  }
+  return V;
+}
+
+static GEN
+int_to_Flx_bitunpack_word(GEN x, long s, ulong p)
+{
+  long i; long j = 0;
+  long lz = ((lgefint(x)-2)*BITS_IN_LONG-1)/s+3;
+  long shift = 0;
+  long mask = (1UL<<s)-1UL;
+  GEN w, z = cgetg(lz, t_VECSMALL);
+  w = int_LSW(x); j++;
+  for (i=2; i<lz-1; i++)
+  {
+    z[i] = (((ulong) *w) >> shift) & mask;
+    shift += s;
+    if (shift >= BITS_IN_LONG)
+    {
+      shift -= BITS_IN_LONG;
+      w = int_nextW(w); j++;
+      if (shift != 0)
+        z[i] |= (((ulong) *w) << (s-shift)) & mask;
+    }
+    z[i] %= p;
+  }
+  {
+    z[i] = (((ulong) *w) >> shift) & mask;
+    z[i] %= p;
+  }
+  return Flx_renormalize(z, lz);
+}
+
+static GEN
+Flx_mulspec_mulii_bitpack(GEN x, GEN y, long s, ulong p, long nx, long ny)
+{
+  GEN z = mulii(Flx_to_int_bitpack_word(x,s,nx), Flx_to_int_bitpack_word(y,s,ny));
+  return int_to_Flx_bitunpack_word(z, s, p);
+}
+
+#define BITS_IN_QUARTULONG (BITS_IN_HALFULONG >> 1)
+static GEN
+Flx_to_int_quartspec(GEN a, long na)
+{
+  long j;
+  long n = (na+3)>>2UL;
+  GEN V = cgetipos(2+n);
+  GEN w;
+  for (w = int_LSW(V), j=0; j+3<na; j+=4, w=int_nextW(w))
+    *w = a[j]|(a[j+1]<<BITS_IN_QUARTULONG)|(a[j+2]<<(2*BITS_IN_QUARTULONG))|(a[j+3]<<(3*BITS_IN_QUARTULONG));
+  switch (na-j)
+  {
+  case 3:
+    *w = a[j]|(a[j+1]<<BITS_IN_QUARTULONG)|(a[j+2]<<(2*BITS_IN_QUARTULONG));
+    break;
+  case 2:
+    *w = a[j]|(a[j+1]<<BITS_IN_QUARTULONG);
+    break;
+  case 1:
+    *w = a[j];
+    break;
+  case 0:
+    break;
+  }
+  return V;
+}
+
+#define QUARTMASK ((1UL<<BITS_IN_QUARTULONG)-1UL)
+#define LLQUARTWORD(x) ((x) & QUARTMASK)
+#define HLQUARTWORD(x) (((x) >> BITS_IN_QUARTULONG) & QUARTMASK)
+#define LHQUARTWORD(x) (((x) >> (2*BITS_IN_QUARTULONG)) & QUARTMASK)
+#define HHQUARTWORD(x) (((x) >> (3*BITS_IN_QUARTULONG)) & QUARTMASK)
+static GEN
+int_to_Flx_quart(GEN z, ulong p)
+{
+  long i;
+  long lx = (lgefint(z)-2)*4+2;
+  GEN w, x = cgetg(lx, t_VECSMALL);
+  for (w = int_LSW(z), i=2; i<lx; i+=4, w=int_nextW(w))
+  {
+    x[i]   = LLQUARTWORD((ulong)*w)%p;
+    x[i+1] = HLQUARTWORD((ulong)*w)%p;
+    x[i+2] = LHQUARTWORD((ulong)*w)%p;
+    x[i+3] = HHQUARTWORD((ulong)*w)%p;
+  }
+  return Flx_renormalize(x, lx);
+}
+
+static GEN
+Flx_mulspec_quartmulii(GEN a, GEN b, ulong p, long na, long nb)
+{
+  GEN A = Flx_to_int_quartspec(a,na);
+  GEN B = Flx_to_int_quartspec(b,nb);
+  GEN z = mulii(A,B);
+  return int_to_Flx_quart(z,p);
+}
+
 /*Eval x in 2^(k*BIL) in linear time, k==2 or 3*/
 static GEN
 Flx_eval2BILspec(GEN x, long k, long l)
@@ -774,7 +899,7 @@ static GEN
 Flx_mulspec(GEN a, GEN b, ulong p, long na, long nb)
 {
   GEN a0,c,c0;
-  long n0, n0a, i, v = 0;
+  long n0, n0a, i, v = 0, s;
   pari_sp av;
 
   while (na && !a[0]) { a++; na--; v++; }
@@ -783,24 +908,28 @@ Flx_mulspec(GEN a, GEN b, ulong p, long na, long nb)
   if (!nb) return pol0_Flx(0);
 
   av = avma;
-  switch (maxlengthcoeffpol(p,nb))
+  s = maxlengthcoeffpol(p,nb);
+  switch (s)
   {
-  case 0:
+  case BITS_IN_QUARTULONG:
+    if (na>=Flx_MUL_HALFMULII_LIMIT)
+      return Flx_shiftip(av,Flx_mulspec_quartmulii(a,b,p,na,nb), v);
+    break;
+  case BITS_IN_HALFULONG:
     if (na>=Flx_MUL_HALFMULII_LIMIT)
       return Flx_shiftip(av,Flx_mulspec_halfmulii(a,b,p,na,nb), v);
     break;
-  case 1:
+  case BITS_IN_LONG:
     if (na>=Flx_MUL_MULII_LIMIT)
       return Flx_shiftip(av,Flx_mulspec_mulii(a,b,p,na,nb), v);
     break;
-  case 2:
-    if (na>=Flx_MUL_MULII2_LIMIT)
+  default:
+    if (s<BITS_IN_LONG && na>=Flx_MUL_HALFMULII_LIMIT)
+      return Flx_shiftip(av,Flx_mulspec_mulii_bitpack(a,b,s,p,na,nb), v);
+    if (s<=(BITS_IN_LONG<<1) && na>=Flx_MUL_MULII2_LIMIT)
       return Flx_shiftip(av,Flx_mulspec_mulii_inflate(a,b,2,p,na,nb), v);
-    break;
-  case 3:
     if (na>70)
       return Flx_shiftip(av,Flx_mulspec_mulii_inflate(a,b,3,p,na,nb), v);
-    break;
   }
   if (nb < Flx_MUL_KARATSUBA_LIMIT)
     return Flx_shiftip(av,Flx_mulspec_basecase(a,b,p,na,nb), v);
@@ -910,6 +1039,20 @@ Flx_sqrspec_halfsqri(GEN a, ulong p, long na)
 }
 
 static GEN
+Flx_sqrspec_sqri_bitpack(GEN x, long s, ulong p, long nx)
+{
+  GEN z = sqri(Flx_to_int_bitpack_word(x,s,nx));
+  return int_to_Flx_bitunpack_word(z, s, p);
+}
+
+static GEN
+Flx_sqrspec_quartsqri(GEN a, ulong p, long na)
+{
+  GEN z = sqri(Flx_to_int_quartspec(a,na));
+  return int_to_Flx_quart(z,p);
+}
+
+static GEN
 Flx_sqrspec_sqri_inflate(GEN x, long N, ulong p, long nx)
 {
   pari_sp av = avma;
@@ -921,31 +1064,35 @@ static GEN
 Flx_sqrspec(GEN a, ulong p, long na)
 {
   GEN a0, c, c0;
-  long n0, n0a, i, v = 0;
+  long n0, n0a, i, v = 0, s;
   pari_sp av;
 
   while (na && !a[0]) { a++; na--; v += 2; }
   if (!na) return pol0_Flx(0);
 
   av = avma;
-  switch(maxlengthcoeffpol(p,na))
+  s = maxlengthcoeffpol(p,na);
+  switch (s)
   {
-  case 0:
+  case BITS_IN_QUARTULONG:
     if (na>=Flx_SQR_HALFSQRI_LIMIT)
-      return Flx_shiftip(av, Flx_sqrspec_halfsqri(a,p,na), v);
+      return Flx_shiftip(av,Flx_sqrspec_quartsqri(a,p,na), v);
     break;
-  case 1:
-    if (na>=Flx_SQR_SQRI_LIMIT)
-      return Flx_shiftip(av, Flx_sqrspec_sqri(a,p,na), v);
+  case BITS_IN_HALFULONG:
+    if (na>=Flx_SQR_HALFSQRI_LIMIT)
+      return Flx_shiftip(av,Flx_sqrspec_halfsqri(a,p,na), v);
     break;
-  case 2:
-    if (na>=Flx_SQR_SQRI2_LIMIT)
-      return Flx_shiftip(av, Flx_sqrspec_sqri_inflate(a,2,p,na), v);
+  case BITS_IN_LONG:
+    if (na>=Flx_SQR_SQRI_LIMIT)
+      return Flx_shiftip(av,Flx_sqrspec_sqri(a,p,na), v);
     break;
-  case 3:
+  default:
+    if (s<BITS_IN_LONG && na>=Flx_SQR_HALFSQRI_LIMIT)
+      return Flx_shiftip(av,Flx_sqrspec_sqri_bitpack(a,s,p,na), v);
+    if (s<=(BITS_IN_LONG<<1) && na>=Flx_SQR_SQRI2_LIMIT)
+      return Flx_shiftip(av,Flx_sqrspec_sqri_inflate(a,2,p,na), v);
     if (na>70)
-      return Flx_shiftip(av, Flx_sqrspec_sqri_inflate(a,3,p,na), v);
-    break;
+      return Flx_shiftip(av,Flx_sqrspec_sqri_inflate(a,3,p,na), v);
   }
   if (na < Flx_SQR_KARATSUBA_LIMIT)
     return Flx_shiftip(av, Flx_sqrspec_basecase(a,p,na), v);
@@ -1031,24 +1178,23 @@ static long
 Flx_multhreshold(GEN T, ulong p, long half, long mul, long mul2, long kara)
 {
   long na = lgpol(T);
-  switch (maxlengthcoeffpol(p,na))
+  long s = maxlengthcoeffpol(p,na);
+  switch (s)
   {
-  case 0:
+  case BITS_IN_QUARTULONG:
+  case BITS_IN_HALFULONG:
     if (na>=Flx_MUL_HALFMULII_LIMIT)
       return na>=half;
-    break;
-  case 1:
+  case BITS_IN_LONG:
     if (na>=Flx_MUL_MULII_LIMIT)
       return na>=mul;
-    break;
-  case 2:
-    if (na>=Flx_MUL_MULII2_LIMIT)
+  default:
+    if (s<BITS_IN_LONG && na>=Flx_MUL_HALFMULII_LIMIT)
+      return na>=half;
+    if (s<=(BITS_IN_LONG<<1) && na>=Flx_MUL_MULII2_LIMIT)
       return na>=mul2;
-    break;
-  case 3:
-    if (na>=70)
-      return na>=70;
-    break;
+    if (na>70)
+      return na>70;
   }
   return na>=kara;
 }