Peter Bruin on Thu, 08 Oct 2015 14:52:36 +0200 |
[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Date Index] [Thread Index]
Faster digits and fromdigits in base 2^k, and FlxqM_mul_Kronecker |
Bonjour, The functions digits(x,b) and fromdigits(x,b) are currently not very fast when b is a power of 2. I am attaching a patch (digits_2k.patch) that contains - reimplementations of binary_2k, binary_2k_zv - new functions fromdigits_2k, fromdigits_2k_zv - a 2-line addition to fromdigits to use the new fromdigits_2k Here is one example of the speed-up obtained by this patch: Before: gp> n = random(2^1000); gp> for(i=1,1000000,digits(n,2^10)); time = 1,961 ms. gp> v = vector(100,i,random(2^10)); gp> for(i=1,1000000,fromdigits(v,2^10)); time = 3,480 ms. After: gp> n = random(2^1000); gp> for(i=1,1000000,digits(n,2^10)); time = 436 ms. gp> v = vector(100,i,random(2^10)); gp> for(i=1,1000000,fromdigits(v,2^10)); time = 482 ms. My motivation for this was mostly to speed up multiplication of matrices over non-prime finite fields of small characteristic (FlxqM). I am attaching a second patch (FlxqM_mul_Kronecker.patch) that enhances Kronecker multiplication for FlxqM to use the above functions for packing into and unpacking from integer matrices in cases where it allows us to pack the entries in fewer words per entry than the (half-)word-aligned packing that is used now. In experiments with many different finite fields and matrix sizes, I get an average speedup of about 17%, i.e. if r(F,n) = (new time)/(old time) for the multiplication of two n × n matrices over F, then the average value of r(F,n) over many fields F and sizes n is about 0.83. For some F and n, the new time is less than 0.4 times the old one, and it is rarely more than 1.1 times the old one. I obtained these benchmarks using the attached file FlxqM_mul_test.c. The only thing that is still missing is the documentation for the new functions fromdigits_2k and fromdigits_2k_zv. If the new functions are accepted into PARI, I can write this documentation. Also, I am open to suggestions for renaming some of the (exported or auxiliary) functions. Thanks, Peter
diff --git a/src/basemath/arith2.c b/src/basemath/arith2.c index 01726a0..1935132 100644 --- a/src/basemath/arith2.c +++ b/src/basemath/arith2.c @@ -1501,6 +1501,8 @@ fromdigits(GEN x, GEN B) if (typ(x)!=t_VEC || !RgV_is_ZV(x)) pari_err_TYPE("fromdigits",x); B = check_basis(B); if (lg(x)==1) { avma = av; return gen_0; } + if (Z_ispow2(B)) + return fromdigits_2k(x, expi(B)); x = vecreverse(x); return gerepileupto(av, gen_fromdigits(x, B, NULL, &Z_ring)); } diff --git a/src/basemath/bit.c b/src/basemath/bit.c index 6e9e253..9cdfc8d 100644 --- a/src/basemath/bit.c +++ b/src/basemath/bit.c @@ -81,6 +81,117 @@ bits_to_u(GEN v, long l) return u; } +/* set BITS_IN_LONG bits starting at word *w plus *r bits, + * clearing subsequent bits in the last word touched */ +INLINE void +int_set_ulong(ulong a, GEN *w, long *r) +{ + if (*r) { + **w |= (a << *r); + *w = int_nextW(*w); + **w = (a >> (BITS_IN_LONG - *r)); + } else { + **w = a; + *w = int_nextW(*w); + } +} + +/* set k bits starting at word *w plus *r bits, + * clearing subsequent bits in the last word touched */ +INLINE void +int_set_bits(ulong a, long k, GEN *w, long *r) +{ + if (*r) { + **w |= a << *r; + a >>= BITS_IN_LONG - *r; + } else { + **w = a; + a = 0; + } + *r += k; + if (*r >= BITS_IN_LONG) { + *w = int_nextW(*w); + *r -= BITS_IN_LONG; + for (; *r >= BITS_IN_LONG; *r -= BITS_IN_LONG) { + **w = a; + a = 0; + *w = int_nextW(*w); + } + if (*r) + **w = a; + } +} + +/* set k bits from z (t_INT) starting at word *w plus *r bits, + * clearing subsequent bits in the last word touched */ +INLINE void +int_set_int(GEN z, long k, GEN *w, long *r) +{ + long l = lgefint(z) - 2; + GEN y; + if (!l) { + int_set_bits(0, k, w, r); + return; + } + y = int_LSW(z); + for (; l > 1; l--) { + int_set_ulong((ulong) *y, w, r); + y = int_nextW(y); + k -= BITS_IN_LONG; + } + if (k) + int_set_bits((ulong) *y, k, w, r); +} + +GEN +fromdigits_2k_zv(GEN x, long k) +{ + long l = lg(x) - 1, r; + GEN w, z; + if (!l) return gen_0; + z = cgetipos(nbits2lg(k * l)); + w = int_LSW(z); + r = 0; + for (; l; l--) + int_set_bits(uel(x, l), k, &w, &r); + return int_normalize(z, 0); +} + +GEN +fromdigits_2k(GEN x, long k) +{ + long l, m; + GEN w, y, z; + for (l = lg(x) - 1; l && !signe(gel(x, 1)); x++, l--); + if (k == 1) return bits_to_int(x, l); + if (!l) return gen_0; + m = expi(gel(x, 1)) + 1; + z = cgetipos(nbits2lg(k * (l - 1) + m)); + w = int_LSW(z); + if (!(k & (BITS_IN_LONG - 1))) { + long i, j, t = k >> TWOPOTBITS_IN_LONG; + for (; l; l--) { + j = lgefint(gel(x, l)) - 2; + y = int_LSW(gel(x, l)); + for (i = 0; i < j; i++) { + *w = *y; + y = int_nextW(y); + w = int_nextW(w); + } + for (; i < t; i++) { + *w = 0; + w = int_nextW(w); + } + } + } else { + long r = 0; + for (; l > 1; l--) + int_set_int(gel(x, l), k, &w, &r); + int_set_int(gel(x, 1), m, &w, &r); + } + return int_normalize(z, 0); +} + GEN binaire(GEN x) { @@ -139,41 +250,101 @@ binaire(GEN x) return y; } +/* extract k bits (as ulong) starting at word *w plus *r bits */ +INLINE ulong +int_get_bits(long k, GEN *w, long *r) +{ + ulong mask = (1UL << k) - 1; + ulong a = (((ulong) **w) >> *r) & mask; + *r += k; + if (*r >= BITS_IN_LONG) { + *r -= BITS_IN_LONG; + *w = int_nextW(*w); + if (*r) + a |= (**w << (k - *r)) & mask; + } + return a; +} + +/* extract BITS_IN_LONG bits starting at word *w plus *r bits */ +INLINE ulong +int_get_ulong(GEN *w, long *r) +{ + ulong a = ((ulong) **w) >> *r; + *w = int_nextW(*w); + if (*r) + a |= (**w << (BITS_IN_LONG - *r)); + return a; +} + +/* extract k bits (as t_INT) starting at word *w plus *r bits */ +INLINE GEN +int_get_int(long k, GEN *w, long *r) +{ + GEN z = cgetipos(nbits2lg(k)); + GEN y = int_LSW(z); + for (; k >= BITS_IN_LONG; k -= BITS_IN_LONG) { + *y = int_get_ulong(w, r); + y = int_nextW(y); + } + if (k) + *y = int_get_bits(k, w, r); + return int_normalize(z, 0); +} + /* assume k < BITS_IN_LONG */ GEN binary_2k_zv(GEN x, long k) { - long iv, j, n, nmodk, nk; - GEN v, vk; + long l, n, r; + GEN v, w; if (k == 1) return binary_zv(x); - if (!signe(x)) return cgetg(1,t_VECSMALL); - v = binary_zv(x); - n = lg(v)-1; - nk = n / k; nmodk = n % k; - if (nmodk) nk++; - vk = cgetg(nk+1, t_VECSMALL); - iv = n - k; - if (!nmodk) nmodk = k; - for (j = nk; j >= 2; j--,iv-=k) vk[j] = bits_to_u(v+iv, k); - vk[1] = bits_to_u(v,nmodk); - return vk; + if (!signe(x)) return cgetg(1, t_VECSMALL); + n = expi(x) + 1; + l = (n + k - 1) / k; + v = cgetg(l + 1, t_VECSMALL); + w = int_LSW(x); + r = 0; + for (; l > 1; l--) { + uel(v, l) = int_get_bits(k, &w, &r); + n -= k; + } + uel(v, 1) = int_get_bits(n, &w, &r); + return v; } + GEN binary_2k(GEN x, long k) { - long iv, j, n, nmodk, nk; - GEN v, vk; - if (!signe(x)) return cgetg(1,t_VEC); - v = binary_zv(x); - n = lg(v)-1; - nk = n / k; nmodk = n % k; - if (nmodk) nk++; - vk = cgetg(nk+1, t_VEC); - iv = n - k; - if (!nmodk) nmodk = k; - for (j = nk; j >= 2; j--,iv-=k) gel(vk,j) = bits_to_int(v+iv, k); - gel(vk,1) = bits_to_int(v, nmodk); - return vk; + long l, n; + GEN v, w, y, z; + if (k == 1) return binaire(x); + if (!signe(x)) return cgetg(1, t_VEC); + n = expi(x) + 1; + l = (n + k - 1) / k; + v = cgetg(l + 1, t_VEC); + w = int_LSW(x); + if (!(k & (BITS_IN_LONG - 1))) { + long m, t = k >> TWOPOTBITS_IN_LONG, u = lgefint(x) - 2; + for (; l; l--) { + m = minss(t, u); + z = cgetipos(m + 2); + y = int_LSW(z); + for (; m; m--) { + *y = *w; + y = int_nextW(y); + w = int_nextW(w); + } + gel(v, l) = int_normalize(z, 0); + u -= t; + } + } else { + long r = 0; + for (; l > 1; l--, n -= k) + gel(v, l) = int_get_int(k, &w, &r); + gel(v, 1) = int_get_int(n, &w, &r); + } + return v; } /* return 1 if bit n of x is set, 0 otherwise */ diff --git a/src/headers/paridecl.h b/src/headers/paridecl.h index 83c5285..77752b3 100644 --- a/src/headers/paridecl.h +++ b/src/headers/paridecl.h @@ -2100,6 +2100,8 @@ GEN binary_zv(GEN x); GEN binary_2k(GEN x, long k); GEN binary_2k_zv(GEN x, long k); long bittest(GEN x, long n); +GEN fromdigits_2k(GEN x, long k); +GEN fromdigits_2k_zv(GEN x, long k); GEN gbitand(GEN x, GEN y); GEN gbitneg(GEN x, long n); GEN gbitnegimply(GEN x, GEN y);
diff --git a/src/basemath/Flx.c b/src/basemath/Flx.c index 391dee9..c7eaab5 100644 --- a/src/basemath/Flx.c +++ b/src/basemath/Flx.c @@ -4142,6 +4142,18 @@ kron_pack_Flx_spec_3(GEN x, long l) { } static GEN +kron_pack_Flx_spec_bits(GEN x, long b, long l) { + GEN y; + long i; + if (l == 0) + return gen_0; + y = cgetg(l + 1, t_VECSMALL); + for(i = 1; i <= l; i++) + y[i] = x[l - i]; + return fromdigits_2k_zv(y, b); +} + +static GEN kron_unpack_Flx(GEN z, ulong p) { long i, l = lgefint(z); @@ -4163,6 +4175,37 @@ kron_unpack_Flx_3(GEN x, ulong p) { return Z_mod2BIL_Flx_3(x, d, p); } +/* assume b < BITS_IN_LONG */ +static GEN +kron_unpack_Flx_bits_narrow(GEN z, long b, ulong p) { + GEN v = binary_2k_zv(z, b), x; + long i, l = lg(v) + 1; + x = cgetg(l, t_VECSMALL); + for (i = 2; i < l; i++) + x[i] = v[l - i] % p; + return Flx_renormalize(x, l); +} + +static GEN +kron_unpack_Flx_bits_wide(GEN z, long b, ulong p, ulong pi) { + GEN v = binary_2k(z, b), x, y; + long i, l = lg(v) + 1, ly; + x = cgetg(l, t_VECSMALL); + for (i = 2; i < l; i++) { + y = gel(v, l - i); + ly = lgefint(y); + switch (ly) { + case 2: x[i] = 0; break; + case 3: x[i] = *int_W_lg(y, 0, ly) % p; break; + case 4: x[i] = remll_pre(*int_W_lg(y, 1, ly), *int_W_lg(y, 0, ly), p, pi); break; + case 5: x[i] = remlll_pre(*int_W_lg(y, 2, ly), *int_W_lg(y, 1, ly), + *int_W_lg(y, 0, ly), p, pi); break; + default: x[i] = umodiu(gel(v, l - i), p); + } + } + return Flx_renormalize(x, l); +} + static GEN FlxM_pack_ZM(GEN M, GEN (*pack)(GEN, long)) { long i, j, l, lc; @@ -4181,6 +4224,24 @@ FlxM_pack_ZM(GEN M, GEN (*pack)(GEN, long)) { } static GEN +FlxM_pack_ZM_bits(GEN M, long b) +{ + long i, j, l, lc; + GEN N = cgetg_copy(M, &l), x; + if (l == 1) + return N; + lc = lgcols(M); + for (j = 1; j < l; j++) { + gel(N, j) = cgetg(lc, t_COL); + for (i = 1; i < lc; i++) { + x = gcoeff(M, i, j); + gcoeff(N, i, j) = kron_pack_Flx_spec_bits(x + 2, b, lgpol(x)); + } + } + return N; +} + +static GEN ZM_unpack_FlxqM(GEN M, GEN T, ulong p, GEN (*unpack)(GEN, ulong)) { long i, j, l, lc, sv = get_Flx_var(T); @@ -4199,44 +4260,82 @@ ZM_unpack_FlxqM(GEN M, GEN T, ulong p, GEN (*unpack)(GEN, ulong)) return N; } +static GEN +ZM_unpack_FlxqM_bits(GEN M, long b, GEN T, ulong p) +{ + long i, j, l, lc, sv = get_Flx_var(T); + GEN N = cgetg_copy(M, &l), x; + if (l == 1) + return N; + lc = lgcols(M); + if (b < BITS_IN_LONG) { + for (j = 1; j < l; j++) { + gel(N, j) = cgetg(lc, t_COL); + for (i = 1; i < lc; i++) { + x = kron_unpack_Flx_bits_narrow(gcoeff(M, i, j), b, p); + x[1] = sv; + gcoeff(N, i, j) = Flx_rem(x, T, p); + } + } + } else { + ulong pi = get_Fl_red(p); + for (j = 1; j < l; j++) { + gel(N, j) = cgetg(lc, t_COL); + for (i = 1; i < lc; i++) { + x = kron_unpack_Flx_bits_wide(gcoeff(M, i, j), b, p, pi); + x[1] = sv; + gcoeff(N, i, j) = Flx_rem(x, T, p); + } + } + } + return N; +} + GEN FlxqM_mul_Kronecker(GEN A, GEN B, GEN T, ulong p) { pari_sp av = avma; - long l, n = lg(A) - 1; + long b, d = degpol(T), n = lg(A) - 1; GEN C, D, z; GEN (*pack)(GEN, long), (*unpack)(GEN, ulong); int is_sqr = A==B; - /* - cf. Flx_mulspec(), maxlengthcoeffpol() - z can be 1, 2, 3 or (theoretically) 4 words long - */ - z = muliu(muliu(sqru(p - 1), degpol(T)), n); - l = lgefint(z); + z = muliu(muliu(sqru(p - 1), d), n); + b = expi(z) + 1; + /* only do expensive bit-packing if it saves at least 1 limb */ + if (b <= BITS_IN_HALFULONG) { + if (nbits2lg(d*b) - 2 == (d + 1)/2) + b = BITS_IN_HALFULONG; + } else { + long l = lgefint(z) - 2; + if (nbits2lg(d*b) - 2 == d*l) + b = l*BITS_IN_LONG; + } avma = av; - switch (l - 2) { - case 1: - if (HIGHWORD(z[2]) == 0) { - pack = kron_pack_Flx_spec_half; - unpack = int_to_Flx_half; - } else { - pack = kron_pack_Flx_spec; - unpack = kron_unpack_Flx; - } + switch (b) { + case BITS_IN_HALFULONG: + pack = kron_pack_Flx_spec_half; + unpack = int_to_Flx_half; break; - case 2: + case BITS_IN_LONG: + pack = kron_pack_Flx_spec; + unpack = kron_unpack_Flx; + break; + case 2*BITS_IN_LONG: pack = kron_pack_Flx_spec_2; unpack = kron_unpack_Flx_2; break; - case 3: + case 3*BITS_IN_LONG: pack = kron_pack_Flx_spec_3; unpack = kron_unpack_Flx_3; break; default: - /* not implemented, probably won't occur in practice */ - return NULL; + A = FlxM_pack_ZM_bits(A, b); + B = is_sqr? A: FlxM_pack_ZM_bits(B, b); + C = ZM_mul(A, B); + D = ZM_unpack_FlxqM_bits(C, b, T, p); + return gerepilecopy(av, D); } A = FlxM_pack_ZM(A, pack); B = is_sqr? A: FlxM_pack_ZM(B, pack); diff --git a/src/basemath/alglin1.c b/src/basemath/alglin1.c index 5ebdf91..ae1b1e2 100644 --- a/src/basemath/alglin1.c +++ b/src/basemath/alglin1.c @@ -981,11 +981,8 @@ FlxqM_mul(GEN A, GEN B, GEN T, ulong p) { if (n == 0) return cgetg(1, t_MAT); - if (n > 1) { - GEN C = FlxqM_mul_Kronecker(A, B, T, p); - if (C != NULL) - return C; - } + if (n > 1) + return FlxqM_mul_Kronecker(A, B, T, p); ff = get_Flxq_field(&E, T, p); return gen_matmul(A, B, E, ff); }
#include <stdio.h> #include <time.h> #include <pari/pari.h> static time_t t_tot = 0; static GEN random_FlxqC(long m, GEN T, ulong p) { GEN C = cgetg(m + 1, t_COL); long i, d = get_Flx_degree(T), v = get_Flx_var(T); for (i = 1; i <= m; i++) gel(C, i) = random_Flx(d, v, p); return C; } static GEN random_FlxqM(long m, long n, GEN T, ulong p) { GEN M = cgetg(n + 1, t_MAT); long i; for (i = 1; i <= n; i++) gel(M, i) = random_FlxqC(m, T, p); return M; } static void test(ulong p, long d, long m, long n, long k) { long v = 0; pari_sp av = avma; GEN T = ZX_to_Flx(liftint(ffinit(stoi(p), d, v)), p); GEN A = random_FlxqM(m, n, T, p); GEN B = random_FlxqM(n, k, T, p); time_t t0, t1; int i, N = 60 / n; pari_sp av1 = avma; printf("\\\\ %li^%li, %li, %li, %li\n", p, d, m, n, k); fflush(stdout); t0 = clock(); for(i = 0; i < N; i++) { (void) FlxqM_mul(A, B, T, p); avma = av1; } t1 = clock(); t_tot += (t1 - t0); printf("%f\n", ((double) (t1 - t0)) / CLOCKS_PER_SEC); fflush(stdout); avma = av; } int main () { ulong i, j, m, n, p, d; ulong P[] = { 3, 5, 7, 11, 13, 17, 67, 257, 1031, 4099, /* nextprime(2^12) */ 16777259, /* nextprime(2^24) */ 4294967311, /* nextprime(2^32) */ 1099511627791, /* nextprime(2^40) */ 281474976710677, /* nextprime(2^48) */ 4611686018427388039, /* nextprime(2^62) */ 0 }; ulong D[] = { 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 36, 48, 60, 72, 84, 96, 0 }; ulong N[] = { 2, 4, 6, 8, 10, 20, 40, 60, 0 }; pari_init(1 << 30, 500000); for (m = 0; (n = N[m]) != 0; m++) { for (i = 0; (p = P[i]) != 0; i++) { for (j = 0; (d = D[j]) != 0; j++) test(p, d, n, n, n); } } printf("\\\\ t_tot = %f\n", ((double) t_tot) / CLOCKS_PER_SEC); return 0; }