Ilya Zakharevich on Thu, 12 Dec 2002 15:20:20 -0800 |
[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Date Index] [Thread Index]
[PATCH oldish CVS] primelimit above 1<<31 |
Actually, the bug in initprimes0() has a one-line fix, but this patch also fixes other places where the limit was signed, and adds some minimal docs. Some minimal cache optimizations are also performed. Now sieving up to 4e9 takes less than 2min on Athlon/850 or Ultra Sparc/333. Enjoy, Ilya --- ./src/basemath/arith2.c-pre Wed Oct 23 14:58:22 2002 +++ ./src/basemath/arith2.c Thu Dec 12 14:52:30 2002 @@ -116,7 +116,145 @@ initprimes1(ulong size, long *lenp, long return (byteptr) gprealloc(p,r-p); } -#define PRIME_ARENA (200 * 1024) /* No slowdown even with 256K level-2 cache */ +/* Timing in ms (Athlon/850; reports 512M of secondary cache; looks + like there is 64K of quickier cache too). + + The algorithm of allocation starts to work regularly from + 2pi(sqrt(lim)); we skip or double-quote what is less than this. + + + arena| 10m 30m 100m 300m 1000m 2000m 4000m + ================================================================= + 1K 360 + 1.5 250 1150 + 2K 210 840 + 3K 180 660 3160 + 4K 160 580 2530 11930 "36680" "84730" "262850" + 12K 150 460 1730 5590 26010 65500 "184520" + 20K 150 430 1500 4900 25450 56420 143220 + ... + 64K 140 410 1400 4360 22150 32850 122820 + + Timing relative to 64K: + + 20K 1.071 1.049 1.071 1.124 1.149 1.718 1.166 + 28K 1.071 1.000 1.043 1.067 1.115 1.256 1.326 + 36K 1.000 1.024 1.029 1.034 0.946 1.075 1.087 + 44K 1.000 1.049 1.007 1.016 1.006 1.056 0.974 + 52K 1.000 0.976 1.007 1.009 1.023 1.035 0.910 + 56K 0.929 1.024 1.000 1.002 1.023 1.022 0.893 + 58K 1.143 0.976 1.000 1.002 0.995 1.016 0.888 + 60K 1.000 1.024 1.000 0.998 1.016 1.010 0.865 + 62K 1.000 1.000 1.000 0.995 1.023 1.004 0.922 + 64K 1.000 1.000 1.000 1.000 1.000 1.000 1.000 + 66K 1.071 1.049 1.014 0.993 1.025 0.995 0.938 + 68K 1.071 1.098 1.086 1.050 0.939 0.991 0.926 + 70K 1.214 1.171 1.164 1.124 1.061 1.023 0.860 + 72K 1.214 1.244 1.221 1.181 1.070 1.081 0.852 + 76K 1.357 1.415 1.371 1.305 1.154 1.260 0.841 + 84K 1.643 1.610 1.614 1.546 1.340 1.426 0.954 + 92K 1.857 1.878 1.850 1.773 1.475 1.640 1.063 + 96K 2.000 2.073 2.057 2.016 1.380 1.710 1.127 + 128K 2.214 2.341 2.371 2.328 1.624 2.118 1.429 + 192K 2.500 2.707 2.743 2.729 1.901 2.495 1.626 + 256K 2.786 2.902 2.979 2.977 2.077 2.680 1.791 + 384K 3.143 3.293 3.371 3.392 2.344 3.050 1.906 + 512K 3.286 3.488 3.521 3.537 2.445 3.441 2.130 + 768K 3.857 4.171 4.264 4.319 2.977 4.045 2.490 + 1024K 4.429 4.707 4.879 4.947 3.435 4.900 2.941 + 1536K 5.071 5.659 6.029 6.197 4.393 6.237 3.662 + 2048K 5.357 6.195 6.593 6.995 4.857 6.886 4.039 + 3072K 5.786 6.488 7.029 7.356 5.264 7.425 4.402 + + Values after 92K are from different run... Matters much for 4000m + one, when swapping starts; also 1000m run looks not very much + reproducible... + + The stats for small arena lead to the value of ARENA_IN_ROOTS for i386. + + Similar data for Sparc leads to 10. + + TODO: One needs to create macro for (small) OVERHEAD_100_IN_ROOTS where the + overhead of subdivision is 100%; likewise one needs to estimate + the overhead of having the arena larger than the cache size. Then + one can intelligently optimize the arena size taking both effects + into the account... +*/ + +#ifndef ARENA_IN_ROOTS +# ifdef i386 /* gcc defines this? */ +# define ARENA_IN_ROOTS 1.5 +# else +# define ARENA_IN_ROOTS 10 +# endif +#endif +#ifndef PRIME_ARENA +# ifdef i386 /* gcc defines this? */ + /* Due to smaller ARENA_IN_ROOTS, smaller arena is OK; fit level-1 cache */ +# define PRIME_ARENA (63 * 1024) /* No slowdown even with 64K level-1 cache */ +# else +# define PRIME_ARENA (200 * 1024) /* No slowdown even with 256K level-2 cache */ +# endif +#endif + +static long prime_arena = PRIME_ARENA; +static double arena_in_roots = ARENA_IN_ROOTS; + +long +good_arena_size(long rootnum, ulong remains, ulong primes) +{ + /* ARENA_IN_ROOTS below 12: some slowdown starts to be noticable + * when things fit into the cache. + * XXX The choice of 10 gives a slowdown of 1-2% on UltraSparcII, + * but makes calculations even for (the current) maximum of 436273009 + * fit into 256K cache (still common for some architectures). + * + * One may change it when small caches become uncommon, but the gain + * is not going to be very noticable... */ + + long asize = arena_in_roots * rootnum; /* Make % overhead negligeable. */ + if (asize < prime_arena) asize = prime_arena - 1; + if (asize > remains) asize = remains; /* + room for a sentinel byte */ + if (2 * primes < asize) /* XXXX Better substitutes for 2? */ + asize -= primes; /* We access arena AND primes */ + return asize; +} + +/* Use as in + install(set_internal,lLDG) \\ Through some M too? + set_internal(2,100) \\ disable dependence on limit + + { time_primes_arena(ar,limit) = + set_internal(1,floor(ar*1024)); + default(primelimit, 200 000); \\ 100000 results in *larger* malloc()! + gettime; + default(primelimit, limit); + if(ar >= 1, ar=floor(ar)); + print("arena "ar"K => "gettime"ms"); + } +*/ + +long +set_internal(long what, GEN g) +{ + long ret = 0; + + if (what == 1) + ret = prime_arena; + else if (what == 2) + ret = arena_in_roots * 1000; + else + err(talker, "panic: set_internal"); + if (g != NULL) { + long val = itos(g); + + if (what == 1) + prime_arena = val; + else if (what == 2) + arena_in_roots = val / 1000.; + } + return ret; +} /* Here's the workhorse. This is recursive, although normally the first recursive call will bottom out and invoke initprimes1() at once. @@ -151,26 +289,15 @@ initprimes0(ulong maxnum, long *lenp, ul fin1 = p1 + psize - 1; remains = (maxnum - rootnum) >> 1; /* number of odd numbers to check */ - /* ARENA_IN_ROOTS below 12: some slowdown starts to be noticable - * when things fit into the cache. - * XXX The choice of 10 gives a slowdown of 1-2% on UltraSparcII, - * but makes calculations even for (the current) maximum of 436273009 - * fit into 256K cache (still common for some architectures). - * - * One may change it when small caches become uncommon, but the gain - * is not going to be very noticable... */ -#define ARENA_IN_ROOTS 10 - - asize = ARENA_IN_ROOTS * rootnum; /* Make % overhead negligeable. */ - if (asize < PRIME_ARENA) asize = PRIME_ARENA; - if (asize > remains) asize = remains + 1;/* + room for a sentinel byte */ - alloced = (avma - bot < (size_t)asize>>1); /* enough room on the stack ? */ + asize = good_arena_size(rootnum, remains, psize); + /* enough room on the stack ? */ + alloced = (((byteptr)avma) - ((byteptr)bot) <= (size_t)asize); if (alloced) - p = (byteptr) gpmalloc(asize); + p = (byteptr) gpmalloc(asize + 1); else p = (byteptr) bot; - fin = p + asize - 1; /* the 0 sentinel goes at fin. */ - curlow = rootnum + 2; /* We know all primes up to rootnum (odd). */ + fin = p + asize; /* the 0 sentinel goes at fin. */ + curlow = rootnum + 2; /* First candidate: know primes up to rootnum (odd). */ curdiff = fin1; /* During each iteration p..fin-1 represents a range of odd @@ -181,10 +308,10 @@ initprimes0(ulong maxnum, long *lenp, ul { if (asize > remains) { - asize = remains + 1; - fin = p + asize - 1; + asize = remains; + fin = p + asize; } - memset(p, 0, asize); + memset(p, 0, asize + 1); /* p corresponds to curlow. q runs over primediffs. */ /* Don't care about DIFFPTR_SKIP: false positives provide no problem */ for (q = p1+2, k = 3; q <= fin1; k += *q++) @@ -201,12 +328,29 @@ initprimes0(ulong maxnum, long *lenp, ul */ long k2 = k*k - curlow; - if (k2 > 0) - { - r = p + (k2 >>= 1); - if (k2 > remains) break; /* Guard against an address wrap. */ - } else +#if 0 /* XXXX Check which one is quickier! */ + if (k2 > 0) { /* May be due to ulong==>long wrap */ + k2 >>= 1; + if (k2 >= asize) { + if (k2 > remains) { + /* Can happen due to a conversion wrap only, so . */ + goto small_k; + } else + break; /* All primes up to sqrt(top) checked */ + } + r = p + k2; + } else { + small_k: r = p - (((curlow+k-2) % (2*k)) >> 1) + k - 1; + } +#else + if (k2 > 0) { + r = p + (k2 >>= 1); + if (k2 <= remains) goto finish; /* Guard against an address wrap. */ + } + r = p - (((curlow+k-2) % (2*k)) >> 1) + k - 1; + finish: +#endif for (s = r; s < fin; s += k) *s = 1; } /* now q runs over addresses corresponding to primes */ @@ -221,9 +365,9 @@ initprimes0(ulong maxnum, long *lenp, ul *curdiff++ = DIFFPTR_SKIP, d -= DIFFPTR_SKIP; *curdiff++ = d; } - plast -= asize - 1; - remains -= asize - 1; - curlow += ((asize - 1)<<1); + plast -= asize; + remains -= asize; + curlow += (asize<<1); } /* while (remains) */ last = curlow - ((p - plast) << 1); *curdiff++ = 0; /* sentinel */ --- ./src/headers/paricom.h-pre Thu Oct 31 00:54:14 2002 +++ ./src/headers/paricom.h Tue Nov 12 22:01:04 2002 @@ -287,6 +287,8 @@ extern int new_galois_format; #define mod4(x) (modBIL(x) & 3) #define mod2(x) (modBIL(x) & 1) #define is_pm1(n) ((lgefint(n)==3) && (((GEN)(n))[2]==1)) +#define is_biguint(n) ((lgefint(n)>3) || \ + ((lgefint(n)==3) && signe(n) < 0)) #define is_bigint(n) ((lgefint(n)>3) || \ ((lgefint(n)==3) && ((((GEN)(n))[2]) < 0))) --- ./src/language/sumiter.c-pre Tue Oct 15 17:34:06 2002 +++ ./src/language/sumiter.c Tue Nov 12 22:00:08 2002 @@ -99,8 +99,8 @@ forstep(entree *ep, GEN a, GEN b, GEN s, /* assume ptr is the address of a diffptr containing the succesive * differences between primes, and c = current prime (up to *p excluded) * return smallest prime >= a, update ptr */ -static long -sinitp(long a, long c, byteptr *ptr) +static ulong +sinitp(ulong a, ulong c, byteptr *ptr) { byteptr p = *ptr; if (a <= 0) a = 2; @@ -113,14 +113,14 @@ sinitp(long a, long c, byteptr *ptr) /* value changed during the loop, replace by the first prime whose value is strictly larger than new value */ static void -update_p(entree *ep, byteptr *ptr, long prime[]) +update_p(entree *ep, byteptr *ptr, ulong prime[]) { GEN z = (GEN)ep->value; - long a, c; + ulong a, c; if (typ(z) == t_INT) a = 1; else { z = gceil(z); a = 0; } - if (is_bigint(z)) { prime[2] = -1; /* = infinity */ return; } - a += itos(z); c = prime[2]; + if (is_biguint(z)) { prime[2] = -1; /* = infinity */ return; } + a += z[2]; c = prime[2]; if (c < a) prime[2] = sinitp(a, c, ptr); /* increased */ else if (c > a) @@ -128,25 +128,29 @@ update_p(entree *ep, byteptr *ptr, long *ptr = diffptr; prime[2] = sinitp(a, 0, ptr); } - changevalue_p(ep, prime); + changevalue_p(ep, (GEN)prime); } static byteptr -prime_loop_init(GEN ga, GEN gb, long *a, long *b, long prime[]) +prime_loop_init(GEN ga, GEN gb, ulong *a, ulong *b, ulong prime[]) { byteptr d = diffptr; ga = gceil(ga); gb = gfloor(gb); if (typ(ga) != t_INT || typ(gb) != t_INT) err(typeer,"prime_loop_init"); - if (is_bigint(ga) || is_bigint(gb)) + if (signe(gb) < 0) + return NULL; + if (signe(ga) < 0) + ga = gun; + if (is_biguint(ga) || is_biguint(gb)) { if (cmpii(ga, gb) > 0) return NULL; err(primer1); } - *a = itos(ga); if (*a <= 0) *a = 1; - *b = itos(gb); if (*a > *b) return NULL; - maxprime_check((ulong)*b); + *a = (ulong)ga[2]; + *b = (ulong)gb[2]; if (*a > *b) return NULL; + maxprime_check(*b); prime[2] = sinitp(*a, 0, &d); return d; } @@ -154,8 +158,9 @@ prime_loop_init(GEN ga, GEN gb, long *a, void forprime(entree *ep, GEN ga, GEN gb, char *ch) { - long prime[] = {evaltyp(t_INT)|_evallg(3), evalsigne(1)|evallgefint(3), 0}; - long a, b; + long p[] = {evaltyp(t_INT)|_evallg(3), evalsigne(1)|evallgefint(3), 0}; + ulong *prime = (ulong*)p; + ulong a, b; pari_sp av = avma; byteptr d; @@ -163,7 +168,7 @@ forprime(entree *ep, GEN ga, GEN gb, cha if (!d) { avma = av; return; } avma = av; push_val(ep, prime); - while ((ulong)prime[2] < (ulong)b) + while (prime[2] < b) { (void)lisseq(ch); if (loop_break()) break; if (ep->value == prime) --- ./src/gp/gp.c-pre Wed Oct 23 14:58:24 2002 +++ ./src/gp/gp.c Sat Dec 7 00:22:24 2002 @@ -423,7 +423,7 @@ sd_ulong(char *v, int flag, char *s, ulo if (*v == 0) n = *ptn; else { - n = (ulong)get_int(v,0); + n = get_uint(v); if (*ptn == n) return gnil; if (n > Max || n < Min) { @@ -766,7 +766,7 @@ static GEN sd_primelimit(char *v, int flag) { ulong n = primelimit; - GEN r = sd_ulong(v,flag,"primelimit",&n, 0,VERYBIGINT,NULL); + GEN r = sd_ulong(v,flag,"primelimit",&n, 0,2*(ulong)VERYBIGINT,NULL); if (n != primelimit) { if (flag != d_INITRC)