Jean-Pierre Flori on Thu, 14 Jan 2016 13:33:11 +0100 |
[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Date Index] [Thread Index]
Re: Faster exponentiation in some extensions of finite fields of small characteristics |
Should be better now. 2016-01-14 13:22 GMT+01:00 Jean-Pierre Flori <jpflori@gmail.com>: > It seems it was an old buggy version of the patch. > I'm trying to relocate the right commit. > > 2016-01-14 11:58 GMT+01:00 Jean-Pierre Flori <jpflori@gmail.com>: >> Dear all, >> >> Here is a preliminary patch to speed up exponentiation in some >> extensions of finite fields of small characteristics (< size of >> machine word/2). >> >> It packs more coefficients of the polynomial representation over the >> prime field into machine words when the finite field element is mapped >> to a multiprecision integer (i.e. KS). >> Two functions are added: >> * one function which packs 4 coeffs into a machine word, >> * one generic funciton which packs the coeffs as much as possible >> possibly crossing machine words boundaries. >> >> I did not take care of adding new tuning parameters or smartly >> choosing between the different functions, e.g. calling the >> 4-coeffs-in-1-word function when the (product) coeff size is >> BITS_IN_LONG/4-\epsilon might be more efficient than using the generic >> function which does more complicated packing when the polynomial >> degree is not large. >> >> I did not add (yet) optimal packing when the product coeffs are larger >> than a machine word. >> >> I did not really check it is bug free. >> >> Best, >> -- >> Jean-Pierre Flori > > > > -- > Jean-Pierre Flori -- Jean-Pierre Flori
diff --git a/src/basemath/Flx.c b/src/basemath/Flx.c index 715cb02..db56a8a 100644 --- a/src/basemath/Flx.c +++ b/src/basemath/Flx.c @@ -594,15 +594,11 @@ Flx_shiftip(pari_sp av, GEN x, long v) avma = (pari_sp)y; return y; } +/* Return the maximal bitsize of coeffs. */ INLINE long maxlengthcoeffpol(ulong p, long n) { - pari_sp ltop = avma; - GEN z = muliu(sqru(p-1), n); - long l = lgefint(z); - avma = ltop; - if (l==3 && HIGHWORD(z[2])==0) return 0; - return l-2; + return ceil((2*log(p-1)+log(n))/log(2)); } INLINE ulong @@ -708,6 +704,135 @@ Flx_mulspec_halfmulii(GEN a, GEN b, ulong p, long na, long nb) return int_to_Flx_half(z,p); } +static GEN +Flx_to_int_bitpack_word(GEN x, long s, long l) +{ + long i;long j=0; + long shift = 0; + long n = (l*s-1)/BITS_IN_LONG+1; + GEN V = cgetipos(2+n); + GEN w = int_LSW(V); + *w = 0; + for (i=0; i<l-1; i++) + { + *w |= x[i] << shift; + shift += s; + if (shift >= BITS_IN_LONG) + { + w=int_nextW(w);j++; + shift -= BITS_IN_LONG; + if (shift != 0) + *w = x[i] >> (s-shift); + else + *w = 0; + } + } + { + *w |= x[i] << shift; + shift += s; + if (shift > BITS_IN_LONG) + { + w=int_nextW(w);j++; + shift -= BITS_IN_LONG; + *w = x[i] >> (s-shift); + } + } + return V; +} + +static GEN +int_to_Flx_bitunpack_word(GEN x, long s, ulong p) +{ + long i; long j = 0; + long lz = ((lgefint(x)-2)*BITS_IN_LONG-1)/s+3; + long shift = 0; + long mask = (1UL<<s)-1UL; + GEN w, z = cgetg(lz, t_VECSMALL); + w = int_LSW(x); j++; + for (i=2; i<lz-1; i++) + { + z[i] = (((ulong) *w) >> shift) & mask; + shift += s; + if (shift >= BITS_IN_LONG) + { + shift -= BITS_IN_LONG; + w = int_nextW(w); j++; + if (shift != 0) + z[i] |= (((ulong) *w) << (s-shift)) & mask; + } + z[i] %= p; + } + { + z[i] = (((ulong) *w) >> shift) & mask; + z[i] %= p; + } + return Flx_renormalize(z, lz); +} + +static GEN +Flx_mulspec_mulii_bitpack(GEN x, GEN y, long s, ulong p, long nx, long ny) +{ + GEN z = mulii(Flx_to_int_bitpack_word(x,s,nx), Flx_to_int_bitpack_word(y,s,ny)); + return int_to_Flx_bitunpack_word(z, s, p); +} + +#define BITS_IN_QUARTULONG (BITS_IN_HALFULONG >> 1) +static GEN +Flx_to_int_quartspec(GEN a, long na) +{ + long j; + long n = (na+3)>>2UL; + GEN V = cgetipos(2+n); + GEN w; + for (w = int_LSW(V), j=0; j+3<na; j+=4, w=int_nextW(w)) + *w = a[j]|(a[j+1]<<BITS_IN_QUARTULONG)|(a[j+2]<<(2*BITS_IN_QUARTULONG))|(a[j+3]<<(3*BITS_IN_QUARTULONG)); + switch (na-j) + { + case 3: + *w = a[j]|(a[j+1]<<BITS_IN_QUARTULONG)|(a[j+2]<<(2*BITS_IN_QUARTULONG)); + break; + case 2: + *w = a[j]|(a[j+1]<<BITS_IN_QUARTULONG); + break; + case 1: + *w = a[j]; + break; + case 0: + break; + } + return V; +} + +#define QUARTMASK ((1UL<<BITS_IN_QUARTULONG)-1UL) +#define LLQUARTWORD(x) ((x) & QUARTMASK) +#define HLQUARTWORD(x) (((x) >> BITS_IN_QUARTULONG) & QUARTMASK) +#define LHQUARTWORD(x) (((x) >> (2*BITS_IN_QUARTULONG)) & QUARTMASK) +#define HHQUARTWORD(x) (((x) >> (3*BITS_IN_QUARTULONG)) & QUARTMASK) +static GEN +int_to_Flx_quart(GEN z, ulong p) +{ + long i; + long lx = (lgefint(z)-2)*4+2; + GEN w, x = cgetg(lx, t_VECSMALL); + for (w = int_LSW(z), i=2; i<lx; i+=4, w=int_nextW(w)) + { + x[i] = LLQUARTWORD((ulong)*w)%p; + x[i+1] = HLQUARTWORD((ulong)*w)%p; + x[i+2] = LHQUARTWORD((ulong)*w)%p; + x[i+3] = HHQUARTWORD((ulong)*w)%p; + } + return Flx_renormalize(x, lx); +} + +static GEN +Flx_mulspec_quartmulii(GEN a, GEN b, ulong p, long na, long nb) +{ + GEN A = Flx_to_int_quartspec(a,na); + GEN B = Flx_to_int_quartspec(b,nb); + GEN z = mulii(A,B); + return int_to_Flx_quart(z,p); +} + /*Eval x in 2^(k*BIL) in linear time, k==2 or 3*/ static GEN Flx_eval2BILspec(GEN x, long k, long l) @@ -774,7 +899,7 @@ static GEN Flx_mulspec(GEN a, GEN b, ulong p, long na, long nb) { GEN a0,c,c0; - long n0, n0a, i, v = 0; + long n0, n0a, i, v = 0, s; pari_sp av; while (na && !a[0]) { a++; na--; v++; } @@ -783,24 +908,28 @@ Flx_mulspec(GEN a, GEN b, ulong p, long na, long nb) if (!nb) return pol0_Flx(0); av = avma; - switch (maxlengthcoeffpol(p,nb)) + s = maxlengthcoeffpol(p,nb); + switch (s) { - case 0: + case BITS_IN_QUARTULONG: + if (na>=Flx_MUL_HALFMULII_LIMIT) + return Flx_shiftip(av,Flx_mulspec_quartmulii(a,b,p,na,nb), v); + break; + case BITS_IN_HALFULONG: if (na>=Flx_MUL_HALFMULII_LIMIT) return Flx_shiftip(av,Flx_mulspec_halfmulii(a,b,p,na,nb), v); break; - case 1: + case BITS_IN_LONG: if (na>=Flx_MUL_MULII_LIMIT) return Flx_shiftip(av,Flx_mulspec_mulii(a,b,p,na,nb), v); break; - case 2: - if (na>=Flx_MUL_MULII2_LIMIT) + default: + if (s<BITS_IN_LONG && na>=Flx_MUL_HALFMULII_LIMIT) + return Flx_shiftip(av,Flx_mulspec_mulii_bitpack(a,b,s,p,na,nb), v); + if (s<=(BITS_IN_LONG<<1) && na>=Flx_MUL_MULII2_LIMIT) return Flx_shiftip(av,Flx_mulspec_mulii_inflate(a,b,2,p,na,nb), v); - break; - case 3: if (na>70) return Flx_shiftip(av,Flx_mulspec_mulii_inflate(a,b,3,p,na,nb), v); - break; } if (nb < Flx_MUL_KARATSUBA_LIMIT) return Flx_shiftip(av,Flx_mulspec_basecase(a,b,p,na,nb), v); @@ -910,6 +1039,20 @@ Flx_sqrspec_halfsqri(GEN a, ulong p, long na) } static GEN +Flx_sqrspec_sqri_bitpack(GEN x, long s, ulong p, long nx) +{ + GEN z = sqri(Flx_to_int_bitpack_word(x,s,nx)); + return int_to_Flx_bitunpack_word(z, s, p); +} + +static GEN +Flx_sqrspec_quartsqri(GEN a, ulong p, long na) +{ + GEN z = sqri(Flx_to_int_quartspec(a,na)); + return int_to_Flx_quart(z,p); +} + +static GEN Flx_sqrspec_sqri_inflate(GEN x, long N, ulong p, long nx) { pari_sp av = avma; @@ -921,31 +1064,35 @@ static GEN Flx_sqrspec(GEN a, ulong p, long na) { GEN a0, c, c0; - long n0, n0a, i, v = 0; + long n0, n0a, i, v = 0, s; pari_sp av; while (na && !a[0]) { a++; na--; v += 2; } if (!na) return pol0_Flx(0); av = avma; - switch(maxlengthcoeffpol(p,na)) + s = maxlengthcoeffpol(p,na); + switch (s) { - case 0: + case BITS_IN_QUARTULONG: if (na>=Flx_SQR_HALFSQRI_LIMIT) - return Flx_shiftip(av, Flx_sqrspec_halfsqri(a,p,na), v); + return Flx_shiftip(av,Flx_sqrspec_quartsqri(a,p,na), v); break; - case 1: - if (na>=Flx_SQR_SQRI_LIMIT) - return Flx_shiftip(av, Flx_sqrspec_sqri(a,p,na), v); + case BITS_IN_HALFULONG: + if (na>=Flx_SQR_HALFSQRI_LIMIT) + return Flx_shiftip(av,Flx_sqrspec_halfsqri(a,p,na), v); break; - case 2: - if (na>=Flx_SQR_SQRI2_LIMIT) - return Flx_shiftip(av, Flx_sqrspec_sqri_inflate(a,2,p,na), v); + case BITS_IN_LONG: + if (na>=Flx_SQR_SQRI_LIMIT) + return Flx_shiftip(av,Flx_sqrspec_sqri(a,p,na), v); break; - case 3: + default: + if (s<BITS_IN_LONG && na>=Flx_SQR_HALFSQRI_LIMIT) + return Flx_shiftip(av,Flx_sqrspec_sqri_bitpack(a,s,p,na), v); + if (s<=(BITS_IN_LONG<<1) && na>=Flx_SQR_SQRI2_LIMIT) + return Flx_shiftip(av,Flx_sqrspec_sqri_inflate(a,2,p,na), v); if (na>70) - return Flx_shiftip(av, Flx_sqrspec_sqri_inflate(a,3,p,na), v); - break; + return Flx_shiftip(av,Flx_sqrspec_sqri_inflate(a,3,p,na), v); } if (na < Flx_SQR_KARATSUBA_LIMIT) return Flx_shiftip(av, Flx_sqrspec_basecase(a,p,na), v); @@ -1031,24 +1178,23 @@ static long Flx_multhreshold(GEN T, ulong p, long half, long mul, long mul2, long kara) { long na = lgpol(T); - switch (maxlengthcoeffpol(p,na)) + long s = maxlengthcoeffpol(p,na); + switch (s) { - case 0: + case BITS_IN_QUARTULONG: + case BITS_IN_HALFULONG: if (na>=Flx_MUL_HALFMULII_LIMIT) return na>=half; - break; - case 1: + case BITS_IN_LONG: if (na>=Flx_MUL_MULII_LIMIT) return na>=mul; - break; - case 2: - if (na>=Flx_MUL_MULII2_LIMIT) + default: + if (s<BITS_IN_LONG && na>=Flx_MUL_HALFMULII_LIMIT) + return na>=half; + if (s<=(BITS_IN_LONG<<1) && na>=Flx_MUL_MULII2_LIMIT) return na>=mul2; - break; - case 3: - if (na>=70) - return na>=70; - break; + if (na>70) + return na>70; } return na>=kara; }