@@ -82,63 +82,136 @@ static int u32_is_base2_pseudoprime(uint32_t x)
8282/* Faster arithmetic for 32-bit probable prime test on 64-bit machines. */
8383#if FLINT_BITS == 64
8484
85- static uint32_t
86- u32_addmod (uint32_t x , uint32_t y , uint32_t n )
85+ /* Use the division routines currently defined internally in radix.h;
86+ todo - move to ulong_extras */
87+ #include "radix.h"
88+
89+ static ulong
90+ n_rem_precomp_c0 (ulong x , ulong d , const n_div_precomp_t pre )
8791{
88- return (n - y > x ? x + y : x + y - n );
92+ ulong q = n_div_precomp_c0 (x , pre );
93+ return x - d * q ;
8994}
9095
91- static uint32_t
92- u32_rem_u64_shoup ( uint64_t x , uint32_t n , uint64_t ninv )
96+ static ulong
97+ n_rem_precomp_c1_unsafe ( ulong x , ulong d , const n_div_precomp_t pre )
9398{
94- return n_mulmod_shoup (1 , x , ninv , n );
99+ ulong q = n_div_precomp_c1_unsafe (x , pre );
100+ return x - d * q ;
95101}
96102
97- static uint32_t
98- u32_2_powmod (uint32_t exp , uint32_t n , uint64_t ninv )
103+ static ulong
104+ n_2_powmod_c0 (uint32_t exp , ulong n , const n_div_precomp_t pre )
99105{
100- uint32_t x , bit ;
101- unsigned int ebits ;
106+ ulong x ;
107+ int k , ebits ;
102108
103109 if (exp < FLINT_BITS )
104- return u32_rem_u64_shoup (UWORD (1 ) << exp , n , ninv );
110+ return n_rem_precomp_c0 (UWORD (1 ) << exp , n , pre );
105111
106112 ebits = FLINT_BITS - flint_clz (exp );
107- bit = UWORD (1 ) << (ebits - 6 );
113+ x = n_rem_precomp_c0 ( UWORD (1 ) << (exp >> ( ebits - 6 )), n , pre );
108114
109- x = u32_rem_u64_shoup (((uint64_t ) 1 ) << (exp >> (ebits - 6 )), n , ninv );
115+ if (n < UWORD (1 ) << (FLINT_BITS / 2 - 1 ))
116+ {
117+ for (k = ebits - 7 ; k >= 0 ; k -- )
118+ {
119+ x = n_rem_precomp_c0 ((ulong ) x * x , n , pre );
120+ x <<= ((exp >> k ) & 1 );
121+ }
110122
111- while (bit >>= 1 )
123+ if (x >= n )
124+ x -= n ;
125+ }
126+ else
112127 {
113- x = u32_rem_u64_shoup ((uint64_t ) x * x , n , ninv );
128+ for (k = ebits - 7 ; k >= 0 ; k -- )
129+ {
130+ x = n_rem_precomp_c0 ((ulong ) x * x , n , pre );
131+ x <<= ((exp >> k ) & 1 );
132+ if (x >= n )
133+ x -= n ;
134+ }
135+ }
136+
137+ return x ;
138+ }
139+
140+ static ulong
141+ n_2_powmod_c1 (uint32_t exp , ulong n , const n_div_precomp_t pre )
142+ {
143+ ulong x ;
144+ int k , ebits ;
145+
146+ if (exp < FLINT_BITS )
147+ return n_rem_precomp_c1_unsafe (UWORD (1 ) << exp , n , pre );
114148
115- if (bit & exp )
116- x = u32_addmod (x , x , n );
149+ ebits = FLINT_BITS - flint_clz (exp );
150+ x = n_rem_precomp_c1_unsafe (UWORD (1 ) << (exp >> (ebits - 6 )), n , pre );
151+
152+ if (n < UWORD (1 ) << (FLINT_BITS / 2 - 1 ))
153+ {
154+ for (k = ebits - 7 ; k >= 0 ; k -- )
155+ {
156+ x = n_rem_precomp_c1_unsafe (x * x , n , pre );
157+ x <<= ((exp >> k ) & 1 );
158+ }
159+
160+ if (x >= n )
161+ x -= n ;
162+ }
163+ else
164+ {
165+ for (k = ebits - 7 ; k >= 0 ; k -- )
166+ {
167+ x = n_rem_precomp_c1_unsafe (x * x , n , pre );
168+ x <<= ((exp >> k ) & 1 );
169+ if (x >= n )
170+ x -= n ;
171+ }
117172 }
118173
119174 return x ;
120175}
121176
122177static int u32_is_base2_probabprime (uint32_t n )
123178{
124- uint32_t d = n - 1 , s = 0 ;
125- uint64_t ninv = n_mulmod_precomp_shoup (UWORD (1 ), n );
179+ ulong d = n - 1 , s = 0 ;
180+
181+ n_div_precomp_t pre ;
182+ n_div_precomp_init (pre , n );
126183
127184 s = flint_ctz (d );
128185 d >>= s ;
129186
130- uint64_t t = d ;
131- uint64_t y ;
187+ ulong t = d ;
188+ ulong y ;
132189
133- y = u32_2_powmod (t , n , ninv );
134- if (y == 1 )
135- return 1 ;
190+ if (pre -> c == 0 )
191+ {
192+ y = n_2_powmod_c0 (t , n , pre );
193+ if (y == 1 )
194+ return 1 ;
136195
137- t <<= 1 ;
138- while ((t != n - 1 ) && (y != n - 1 ))
196+ t <<= 1 ;
197+ while ((t != n - 1 ) && (y != n - 1 ))
198+ {
199+ y = n_rem_precomp_c0 (y * y , n , pre );
200+ t <<= 1 ;
201+ }
202+ }
203+ else
139204 {
140- y = u32_rem_u64_shoup (y * y , n , ninv );
205+ y = n_2_powmod_c1 (t , n , pre );
206+ if (y == 1 )
207+ return 1 ;
208+
141209 t <<= 1 ;
210+ while ((t != n - 1 ) && (y != n - 1 ))
211+ {
212+ y = n_rem_precomp_c1_unsafe (y * y , n , pre );
213+ t <<= 1 ;
214+ }
142215 }
143216
144217 return y == n - 1 ;
@@ -280,34 +353,129 @@ n_is_prime_odd_no_trial(ulong n)
280353 }
281354}
282355
356+ /* 3, 5 (checked separately), and 32 more primes which may be vectorized */
357+ #define NUM_ODD_TRIAL_PRIMES 34
358+
359+ static const uint32_t trial_inv_32 [2 * NUM_ODD_TRIAL_PRIMES ] = {
360+ UWORD (0xaaaaaaab ), UWORD (0x55555555 ), /* 3 */
361+ UWORD (0xcccccccd ), UWORD (0x33333333 ), /* 5 */
362+ UWORD (0xb6db6db7 ), UWORD (0x24924924 ), /* 7 */
363+ UWORD (0xba2e8ba3 ), UWORD (0x1745d174 ), /* 11 */
364+ UWORD (0xc4ec4ec5 ), UWORD (0x13b13b13 ), /* 13 */
365+ UWORD (0xf0f0f0f1 ), UWORD (0xf0f0f0f ), /* 17 */
366+ UWORD (0x286bca1b ), UWORD (0xd79435e ), /* 19 */
367+ UWORD (0xe9bd37a7 ), UWORD (0xb21642c ), /* 23 */
368+ UWORD (0x4f72c235 ), UWORD (0x8d3dcb0 ), /* 29 */
369+ UWORD (0xbdef7bdf ), UWORD (0x8421084 ), /* 31 */
370+ UWORD (0x914c1bad ), UWORD (0x6eb3e45 ), /* 37 */
371+ UWORD (0xc18f9c19 ), UWORD (0x63e7063 ), /* 41 */
372+ UWORD (0x2fa0be83 ), UWORD (0x5f417d0 ), /* 43 */
373+ UWORD (0x677d46cf ), UWORD (0x572620a ), /* 47 */
374+ UWORD (0x8c13521d ), UWORD (0x4d4873e ), /* 53 */
375+ UWORD (0xa08ad8f3 ), UWORD (0x456c797 ), /* 59 */
376+ UWORD (0xc10c9715 ), UWORD (0x4325c53 ), /* 61 */
377+ UWORD (0x7a44c6b ), UWORD (0x3d22635 ), /* 67 */
378+ UWORD (0xe327a977 ), UWORD (0x39b0ad1 ), /* 71 */
379+ UWORD (0xc7e3f1f9 ), UWORD (0x381c0e0 ), /* 73 */
380+ UWORD (0x613716af ), UWORD (0x33d91d2 ), /* 79 */
381+ UWORD (0x2b2e43db ), UWORD (0x3159721 ), /* 83 */
382+ UWORD (0xfa3f47e9 ), UWORD (0x2e05c0b ), /* 89 */
383+ UWORD (0x5f02a3a1 ), UWORD (0x2a3a0fd ), /* 97 */
384+ UWORD (0x7c32b16d ), UWORD (0x288df0c ), /* 101 */
385+ UWORD (0xd3431b57 ), UWORD (0x27c4597 ), /* 103 */
386+ UWORD (0x8d28ac43 ), UWORD (0x2647c69 ), /* 107 */
387+ UWORD (0xda6c0965 ), UWORD (0x2593f69 ), /* 109 */
388+ UWORD (0xfdbc091 ), UWORD (0x243f6f0 ), /* 113 */
389+ UWORD (0xefdfbf7f ), UWORD (0x2040810 ), /* 127 */
390+ UWORD (0xc9484e2b ), UWORD (0x1f44659 ), /* 131 */
391+ UWORD (0x77975b9 ), UWORD (0x1de5d6e ), /* 137 */
392+ UWORD (0x70586723 ), UWORD (0x1d77b65 ), /* 139 */
393+ UWORD (0x8ce2cabd ), UWORD (0x1b7d6c3 ), /* 149 */
394+ };
395+
396+ static const uint64_t trial_inv_64 [2 * NUM_ODD_TRIAL_PRIMES ] = {
397+ UWORD (0xaaaaaaaaaaaaaaab ), UWORD (0x5555555555555555 ), /* 3 */
398+ UWORD (0xcccccccccccccccd ), UWORD (0x3333333333333333 ), /* 5 */
399+ UWORD (0x6db6db6db6db6db7 ), UWORD (0x2492492492492492 ), /* 7 */
400+ UWORD (0x2e8ba2e8ba2e8ba3 ), UWORD (0x1745d1745d1745d1 ), /* 11 */
401+ UWORD (0x4ec4ec4ec4ec4ec5 ), UWORD (0x13b13b13b13b13b1 ), /* 13 */
402+ UWORD (0xf0f0f0f0f0f0f0f1 ), UWORD (0xf0f0f0f0f0f0f0f ), /* 17 */
403+ UWORD (0x86bca1af286bca1b ), UWORD (0xd79435e50d79435 ), /* 19 */
404+ UWORD (0xd37a6f4de9bd37a7 ), UWORD (0xb21642c8590b216 ), /* 23 */
405+ UWORD (0x34f72c234f72c235 ), UWORD (0x8d3dcb08d3dcb08 ), /* 29 */
406+ UWORD (0xef7bdef7bdef7bdf ), UWORD (0x842108421084210 ), /* 31 */
407+ UWORD (0x14c1bacf914c1bad ), UWORD (0x6eb3e45306eb3e4 ), /* 37 */
408+ UWORD (0x8f9c18f9c18f9c19 ), UWORD (0x63e7063e7063e70 ), /* 41 */
409+ UWORD (0x82fa0be82fa0be83 ), UWORD (0x5f417d05f417d05 ), /* 43 */
410+ UWORD (0x51b3bea3677d46cf ), UWORD (0x572620ae4c415c9 ), /* 47 */
411+ UWORD (0x21cfb2b78c13521d ), UWORD (0x4d4873ecade304d ), /* 53 */
412+ UWORD (0xcbeea4e1a08ad8f3 ), UWORD (0x456c797dd49c341 ), /* 59 */
413+ UWORD (0x4fbcda3ac10c9715 ), UWORD (0x4325c53ef368eb0 ), /* 61 */
414+ UWORD (0xf0b7672a07a44c6b ), UWORD (0x3d226357e16ece5 ), /* 67 */
415+ UWORD (0x193d4bb7e327a977 ), UWORD (0x39b0ad12073615a ), /* 71 */
416+ UWORD (0x7e3f1f8fc7e3f1f9 ), UWORD (0x381c0e070381c0e ), /* 73 */
417+ UWORD (0x9b8b577e613716af ), UWORD (0x33d91d2a2067b23 ), /* 79 */
418+ UWORD (0xa3784a062b2e43db ), UWORD (0x3159721ed7e7534 ), /* 83 */
419+ UWORD (0xf47e8fd1fa3f47e9 ), UWORD (0x2e05c0b81702e05 ), /* 89 */
420+ UWORD (0xa3a0fd5c5f02a3a1 ), UWORD (0x2a3a0fd5c5f02a3 ), /* 97 */
421+ UWORD (0x3a4c0a237c32b16d ), UWORD (0x288df0cac5b3f5d ), /* 101 */
422+ UWORD (0xdab7ec1dd3431b57 ), UWORD (0x27c45979c95204f ), /* 103 */
423+ UWORD (0x77a04c8f8d28ac43 ), UWORD (0x2647c69456217ec ), /* 107 */
424+ UWORD (0xa6c0964fda6c0965 ), UWORD (0x2593f69b02593f6 ), /* 109 */
425+ UWORD (0x90fdbc090fdbc091 ), UWORD (0x243f6f0243f6f02 ), /* 113 */
426+ UWORD (0x7efdfbf7efdfbf7f ), UWORD (0x204081020408102 ), /* 127 */
427+ UWORD (0x3e88cb3c9484e2b ), UWORD (0x1f44659e4a42715 ), /* 131 */
428+ UWORD (0xe21a291c077975b9 ), UWORD (0x1de5d6e3f8868a4 ), /* 137 */
429+ UWORD (0x3aef6ca970586723 ), UWORD (0x1d77b654b82c339 ), /* 139 */
430+ UWORD (0xdf5b0f768ce2cabd ), UWORD (0x1b7d6c3dda338b2 ), /* 149 */
431+ };
432+
433+ #define TRIAL32 (n , ii ) (((uint32_t) (n) * trial_inv_32[2 * ii]) <= trial_inv_32[2 * ii + 1])
434+ #define TRIAL64 (n , ii ) (((uint64_t) (n) * trial_inv_64[2 * ii]) <= trial_inv_64[2 * ii + 1])
435+
283436int
284437n_is_prime (ulong n )
285438{
286439 if (n < SMALL_ODDPRIME_LIMIT )
287440 return (n % 2 ? n_is_oddprime_small2 (n ) : n == 2 );
288441
289- if (!(n % 2 ) || !(n % 3 ) || !(n % 5 ))
442+ /* Trial division by 2, 3 and 5 */
443+ #if FLINT_BITS == 64
444+ if (!(n % 2 ) || TRIAL64 (n , 0 ) || TRIAL64 (n , 1 ))
445+ #else
446+ if (!(n % 2 ) || TRIAL32 (n , 0 ) || TRIAL32 (n , 1 ))
447+ #endif
290448 return 0 ;
291449
292450 if (n < (1 << 20 ))
293451 return n_is_mod30prime_small (n );
294452
295- if (n > (1 << 20 ) &&
296- (!(n % 7 ) || !(n % 11 ) || !(n % 13 ) || !(n % 17 ) || !(n % 19 ) ||
297- !(n % 23 ) || !(n % 29 ) || !(n % 31 ) || !(n % 37 ) ||
298- !(n % 41 ) || !(n % 43 ) || !(n % 47 ) || !(n % 53 )))
299- return 0 ;
300-
301- if (n > UINT32_MAX &&
302- (!(n % 59 ) || !(n % 61 ) || !(n % 67 ) || !(n % 71 ) || !(n % 73 ) ||
303- !(n % 79 ) || !(n % 83 ) || !(n % 89 ) || !(n % 97 ) || !(n %101 ) ||
304- !(n %103 ) || !(n % 107 ) || !(n %109 ) || !(n %113 ) || !(n % 127 ) ||
305- !(n %131 ) || !(n % 137 ) || !(n %139 ) || !(n %149 )))
306- return 0 ;
453+ if (n <= UINT32_MAX )
454+ {
455+ int i , c ;
456+ /* The compiler should unroll and convert this to a few vector
457+ instructions when e.g. AVX2 is available; branchy return isn't
458+ worth it. */
459+ c = 0 ;
460+ for (i = 2 ; i < NUM_ODD_TRIAL_PRIMES ; i ++ )
461+ c |= TRIAL32 (n , i );
462+
463+ if (c != 0 )
464+ return 0 ;
465+ }
466+ else
467+ {
468+ int i ;
469+ for (i = 2 ; i < NUM_ODD_TRIAL_PRIMES ; i ++ )
470+ if (TRIAL64 (n , i ))
471+ return 0 ;
472+ }
307473
308474 return n_is_prime_odd_no_trial (n );
309475}
310476
477+
478+
311479/* first 64 primes used for modular arithmetic */
312480#define N_MODULUS (UWORD(1) << (FLINT_BITS - 1))
313481#define N_MOD_TAB 64
0 commit comments