Skip to content

Commit 9a8f553

Browse files
Optimize trial division and half-limb probable prime test in n_is_prime
1 parent 9946b40 commit 9a8f553

File tree

1 file changed

+208
-40
lines changed

1 file changed

+208
-40
lines changed

src/ulong_extras/is_prime.c

Lines changed: 208 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -82,63 +82,136 @@ static int u32_is_base2_pseudoprime(uint32_t x)
8282
/* Faster arithmetic for 32-bit probable prime test on 64-bit machines. */
8383
#if FLINT_BITS == 64
8484

85-
static uint32_t
86-
u32_addmod(uint32_t x, uint32_t y, uint32_t n)
85+
/* Use the division routines currently defined internally in radix.h;
86+
todo - move to ulong_extras */
87+
#include "radix.h"
88+
89+
static ulong
90+
n_rem_precomp_c0(ulong x, ulong d, const n_div_precomp_t pre)
8791
{
88-
return (n - y > x ? x + y : x + y - n);
92+
ulong q = n_div_precomp_c0(x, pre);
93+
return x - d * q;
8994
}
9095

91-
static uint32_t
92-
u32_rem_u64_shoup(uint64_t x, uint32_t n, uint64_t ninv)
96+
static ulong
97+
n_rem_precomp_c1_unsafe(ulong x, ulong d, const n_div_precomp_t pre)
9398
{
94-
return n_mulmod_shoup(1, x, ninv, n);
99+
ulong q = n_div_precomp_c1_unsafe(x, pre);
100+
return x - d * q;
95101
}
96102

97-
static uint32_t
98-
u32_2_powmod(uint32_t exp, uint32_t n, uint64_t ninv)
103+
static ulong
104+
n_2_powmod_c0(uint32_t exp, ulong n, const n_div_precomp_t pre)
99105
{
100-
uint32_t x, bit;
101-
unsigned int ebits;
106+
ulong x;
107+
int k, ebits;
102108

103109
if (exp < FLINT_BITS)
104-
return u32_rem_u64_shoup(UWORD(1) << exp, n, ninv);
110+
return n_rem_precomp_c0(UWORD(1) << exp, n, pre);
105111

106112
ebits = FLINT_BITS - flint_clz(exp);
107-
bit = UWORD(1) << (ebits - 6);
113+
x = n_rem_precomp_c0(UWORD(1) << (exp >> (ebits - 6)), n, pre);
108114

109-
x = u32_rem_u64_shoup(((uint64_t) 1) << (exp >> (ebits - 6)), n, ninv);
115+
if (n < UWORD(1) << (FLINT_BITS / 2 - 1))
116+
{
117+
for (k = ebits - 7; k >= 0; k--)
118+
{
119+
x = n_rem_precomp_c0((ulong) x * x, n, pre);
120+
x <<= ((exp >> k) & 1);
121+
}
110122

111-
while (bit >>= 1)
123+
if (x >= n)
124+
x -= n;
125+
}
126+
else
112127
{
113-
x = u32_rem_u64_shoup((uint64_t) x * x, n, ninv);
128+
for (k = ebits - 7; k >= 0; k--)
129+
{
130+
x = n_rem_precomp_c0((ulong) x * x, n, pre);
131+
x <<= ((exp >> k) & 1);
132+
if (x >= n)
133+
x -= n;
134+
}
135+
}
136+
137+
return x;
138+
}
139+
140+
static ulong
141+
n_2_powmod_c1(uint32_t exp, ulong n, const n_div_precomp_t pre)
142+
{
143+
ulong x;
144+
int k, ebits;
145+
146+
if (exp < FLINT_BITS)
147+
return n_rem_precomp_c1_unsafe(UWORD(1) << exp, n, pre);
114148

115-
if (bit & exp)
116-
x = u32_addmod(x, x, n);
149+
ebits = FLINT_BITS - flint_clz(exp);
150+
x = n_rem_precomp_c1_unsafe(UWORD(1) << (exp >> (ebits - 6)), n, pre);
151+
152+
if (n < UWORD(1) << (FLINT_BITS / 2 - 1))
153+
{
154+
for (k = ebits - 7; k >= 0; k--)
155+
{
156+
x = n_rem_precomp_c1_unsafe(x * x, n, pre);
157+
x <<= ((exp >> k) & 1);
158+
}
159+
160+
if (x >= n)
161+
x -= n;
162+
}
163+
else
164+
{
165+
for (k = ebits - 7; k >= 0; k--)
166+
{
167+
x = n_rem_precomp_c1_unsafe(x * x, n, pre);
168+
x <<= ((exp >> k) & 1);
169+
if (x >= n)
170+
x -= n;
171+
}
117172
}
118173

119174
return x;
120175
}
121176

122177
static int u32_is_base2_probabprime(uint32_t n)
123178
{
124-
uint32_t d = n-1, s = 0;
125-
uint64_t ninv = n_mulmod_precomp_shoup(UWORD(1), n);
179+
ulong d = n-1, s = 0;
180+
181+
n_div_precomp_t pre;
182+
n_div_precomp_init(pre, n);
126183

127184
s = flint_ctz(d);
128185
d >>= s;
129186

130-
uint64_t t = d;
131-
uint64_t y;
187+
ulong t = d;
188+
ulong y;
132189

133-
y = u32_2_powmod(t, n, ninv);
134-
if (y == 1)
135-
return 1;
190+
if (pre->c == 0)
191+
{
192+
y = n_2_powmod_c0(t, n, pre);
193+
if (y == 1)
194+
return 1;
136195

137-
t <<= 1;
138-
while ((t != n - 1) && (y != n - 1))
196+
t <<= 1;
197+
while ((t != n - 1) && (y != n - 1))
198+
{
199+
y = n_rem_precomp_c0(y * y, n, pre);
200+
t <<= 1;
201+
}
202+
}
203+
else
139204
{
140-
y = u32_rem_u64_shoup(y * y, n, ninv);
205+
y = n_2_powmod_c1(t, n, pre);
206+
if (y == 1)
207+
return 1;
208+
141209
t <<= 1;
210+
while ((t != n - 1) && (y != n - 1))
211+
{
212+
y = n_rem_precomp_c1_unsafe(y * y, n, pre);
213+
t <<= 1;
214+
}
142215
}
143216

144217
return y == n - 1;
@@ -280,34 +353,129 @@ n_is_prime_odd_no_trial(ulong n)
280353
}
281354
}
282355

356+
/* 3, 5 (checked separately), and 32 more primes which may be vectorized */
357+
#define NUM_ODD_TRIAL_PRIMES 34
358+
359+
static const uint32_t trial_inv_32[2 * NUM_ODD_TRIAL_PRIMES] = {
360+
UWORD(0xaaaaaaab), UWORD(0x55555555), /* 3 */
361+
UWORD(0xcccccccd), UWORD(0x33333333), /* 5 */
362+
UWORD(0xb6db6db7), UWORD(0x24924924), /* 7 */
363+
UWORD(0xba2e8ba3), UWORD(0x1745d174), /* 11 */
364+
UWORD(0xc4ec4ec5), UWORD(0x13b13b13), /* 13 */
365+
UWORD(0xf0f0f0f1), UWORD(0xf0f0f0f), /* 17 */
366+
UWORD(0x286bca1b), UWORD(0xd79435e), /* 19 */
367+
UWORD(0xe9bd37a7), UWORD(0xb21642c), /* 23 */
368+
UWORD(0x4f72c235), UWORD(0x8d3dcb0), /* 29 */
369+
UWORD(0xbdef7bdf), UWORD(0x8421084), /* 31 */
370+
UWORD(0x914c1bad), UWORD(0x6eb3e45), /* 37 */
371+
UWORD(0xc18f9c19), UWORD(0x63e7063), /* 41 */
372+
UWORD(0x2fa0be83), UWORD(0x5f417d0), /* 43 */
373+
UWORD(0x677d46cf), UWORD(0x572620a), /* 47 */
374+
UWORD(0x8c13521d), UWORD(0x4d4873e), /* 53 */
375+
UWORD(0xa08ad8f3), UWORD(0x456c797), /* 59 */
376+
UWORD(0xc10c9715), UWORD(0x4325c53), /* 61 */
377+
UWORD(0x7a44c6b), UWORD(0x3d22635), /* 67 */
378+
UWORD(0xe327a977), UWORD(0x39b0ad1), /* 71 */
379+
UWORD(0xc7e3f1f9), UWORD(0x381c0e0), /* 73 */
380+
UWORD(0x613716af), UWORD(0x33d91d2), /* 79 */
381+
UWORD(0x2b2e43db), UWORD(0x3159721), /* 83 */
382+
UWORD(0xfa3f47e9), UWORD(0x2e05c0b), /* 89 */
383+
UWORD(0x5f02a3a1), UWORD(0x2a3a0fd), /* 97 */
384+
UWORD(0x7c32b16d), UWORD(0x288df0c), /* 101 */
385+
UWORD(0xd3431b57), UWORD(0x27c4597), /* 103 */
386+
UWORD(0x8d28ac43), UWORD(0x2647c69), /* 107 */
387+
UWORD(0xda6c0965), UWORD(0x2593f69), /* 109 */
388+
UWORD(0xfdbc091), UWORD(0x243f6f0), /* 113 */
389+
UWORD(0xefdfbf7f), UWORD(0x2040810), /* 127 */
390+
UWORD(0xc9484e2b), UWORD(0x1f44659), /* 131 */
391+
UWORD(0x77975b9), UWORD(0x1de5d6e), /* 137 */
392+
UWORD(0x70586723), UWORD(0x1d77b65), /* 139 */
393+
UWORD(0x8ce2cabd), UWORD(0x1b7d6c3), /* 149 */
394+
};
395+
396+
static const uint64_t trial_inv_64[2 * NUM_ODD_TRIAL_PRIMES] = {
397+
UWORD(0xaaaaaaaaaaaaaaab), UWORD(0x5555555555555555), /* 3 */
398+
UWORD(0xcccccccccccccccd), UWORD(0x3333333333333333), /* 5 */
399+
UWORD(0x6db6db6db6db6db7), UWORD(0x2492492492492492), /* 7 */
400+
UWORD(0x2e8ba2e8ba2e8ba3), UWORD(0x1745d1745d1745d1), /* 11 */
401+
UWORD(0x4ec4ec4ec4ec4ec5), UWORD(0x13b13b13b13b13b1), /* 13 */
402+
UWORD(0xf0f0f0f0f0f0f0f1), UWORD(0xf0f0f0f0f0f0f0f), /* 17 */
403+
UWORD(0x86bca1af286bca1b), UWORD(0xd79435e50d79435), /* 19 */
404+
UWORD(0xd37a6f4de9bd37a7), UWORD(0xb21642c8590b216), /* 23 */
405+
UWORD(0x34f72c234f72c235), UWORD(0x8d3dcb08d3dcb08), /* 29 */
406+
UWORD(0xef7bdef7bdef7bdf), UWORD(0x842108421084210), /* 31 */
407+
UWORD(0x14c1bacf914c1bad), UWORD(0x6eb3e45306eb3e4), /* 37 */
408+
UWORD(0x8f9c18f9c18f9c19), UWORD(0x63e7063e7063e70), /* 41 */
409+
UWORD(0x82fa0be82fa0be83), UWORD(0x5f417d05f417d05), /* 43 */
410+
UWORD(0x51b3bea3677d46cf), UWORD(0x572620ae4c415c9), /* 47 */
411+
UWORD(0x21cfb2b78c13521d), UWORD(0x4d4873ecade304d), /* 53 */
412+
UWORD(0xcbeea4e1a08ad8f3), UWORD(0x456c797dd49c341), /* 59 */
413+
UWORD(0x4fbcda3ac10c9715), UWORD(0x4325c53ef368eb0), /* 61 */
414+
UWORD(0xf0b7672a07a44c6b), UWORD(0x3d226357e16ece5), /* 67 */
415+
UWORD(0x193d4bb7e327a977), UWORD(0x39b0ad12073615a), /* 71 */
416+
UWORD(0x7e3f1f8fc7e3f1f9), UWORD(0x381c0e070381c0e), /* 73 */
417+
UWORD(0x9b8b577e613716af), UWORD(0x33d91d2a2067b23), /* 79 */
418+
UWORD(0xa3784a062b2e43db), UWORD(0x3159721ed7e7534), /* 83 */
419+
UWORD(0xf47e8fd1fa3f47e9), UWORD(0x2e05c0b81702e05), /* 89 */
420+
UWORD(0xa3a0fd5c5f02a3a1), UWORD(0x2a3a0fd5c5f02a3), /* 97 */
421+
UWORD(0x3a4c0a237c32b16d), UWORD(0x288df0cac5b3f5d), /* 101 */
422+
UWORD(0xdab7ec1dd3431b57), UWORD(0x27c45979c95204f), /* 103 */
423+
UWORD(0x77a04c8f8d28ac43), UWORD(0x2647c69456217ec), /* 107 */
424+
UWORD(0xa6c0964fda6c0965), UWORD(0x2593f69b02593f6), /* 109 */
425+
UWORD(0x90fdbc090fdbc091), UWORD(0x243f6f0243f6f02), /* 113 */
426+
UWORD(0x7efdfbf7efdfbf7f), UWORD(0x204081020408102), /* 127 */
427+
UWORD(0x3e88cb3c9484e2b), UWORD(0x1f44659e4a42715), /* 131 */
428+
UWORD(0xe21a291c077975b9), UWORD(0x1de5d6e3f8868a4), /* 137 */
429+
UWORD(0x3aef6ca970586723), UWORD(0x1d77b654b82c339), /* 139 */
430+
UWORD(0xdf5b0f768ce2cabd), UWORD(0x1b7d6c3dda338b2), /* 149 */
431+
};
432+
433+
#define TRIAL32(n, ii) (((uint32_t) (n) * trial_inv_32[2 * ii]) <= trial_inv_32[2 * ii + 1])
434+
#define TRIAL64(n, ii) (((uint64_t) (n) * trial_inv_64[2 * ii]) <= trial_inv_64[2 * ii + 1])
435+
283436
int
284437
n_is_prime(ulong n)
285438
{
286439
if (n < SMALL_ODDPRIME_LIMIT)
287440
return (n % 2 ? n_is_oddprime_small2(n) : n == 2);
288441

289-
if (!(n % 2) || !(n % 3) || !(n % 5))
442+
/* Trial division by 2, 3 and 5 */
443+
#if FLINT_BITS == 64
444+
if (!(n % 2) || TRIAL64(n, 0) || TRIAL64(n, 1))
445+
#else
446+
if (!(n % 2) || TRIAL32(n, 0) || TRIAL32(n, 1))
447+
#endif
290448
return 0;
291449

292450
if (n < (1 << 20))
293451
return n_is_mod30prime_small(n);
294452

295-
if (n > (1 << 20) &&
296-
(!(n % 7) || !(n % 11) || !(n % 13) || !(n % 17) || !(n % 19) ||
297-
!(n % 23) || !(n % 29) || !(n % 31) || !(n % 37) ||
298-
!(n % 41) || !(n % 43) || !(n % 47) || !(n % 53)))
299-
return 0;
300-
301-
if (n > UINT32_MAX &&
302-
(!(n % 59) || !(n % 61) || !(n % 67) || !(n % 71) || !(n % 73) ||
303-
!(n % 79) || !(n % 83) || !(n % 89) || !(n % 97) || !(n %101) ||
304-
!(n %103) || !(n % 107) || !(n %109) || !(n %113) || !(n % 127) ||
305-
!(n %131) || !(n % 137) || !(n %139) || !(n %149)))
306-
return 0;
453+
if (n <= UINT32_MAX)
454+
{
455+
int i, c;
456+
/* The compiler should unroll and convert this to a few vector
457+
instructions when e.g. AVX2 is available; branchy return isn't
458+
worth it. */
459+
c = 0;
460+
for (i = 2; i < NUM_ODD_TRIAL_PRIMES; i++)
461+
c |= TRIAL32(n, i);
462+
463+
if (c != 0)
464+
return 0;
465+
}
466+
else
467+
{
468+
int i;
469+
for (i = 2; i < NUM_ODD_TRIAL_PRIMES; i++)
470+
if (TRIAL64(n, i))
471+
return 0;
472+
}
307473

308474
return n_is_prime_odd_no_trial(n);
309475
}
310476

477+
478+
311479
/* first 64 primes used for modular arithmetic */
312480
#define N_MODULUS (UWORD(1) << (FLINT_BITS - 1))
313481
#define N_MOD_TAB 64

0 commit comments

Comments
 (0)