@@ -83,10 +83,10 @@ POINTS_TO_AFFINE_IMPL(blst_p2, POINTonE2, 384x, fp2)
/* The intermediate infinity points are encoded as [0, 0, 1]. */
8484
8585#define PRECOMPUTE_WBITS_IMPL (prefix , ptype , bits , field , one ) \
86- static void ptype##_precompute_row_wbits (ptype row[], size_t wbits , \
87- const ptype##_affine *point) \
86+ static void ptype##_precompute_row (ptype row[], size_t n , \
87+ const ptype##_affine *point) \
8888{ \
89- size_t i, j, n = (size_t)1 << (wbits-1) ; \
89+ size_t i, j; \
9090 bool_t inf = vec_is_zero(point, sizeof(*point)); \
9191 /* row[-1] is implicit infinity */ \
9292 vec_copy (& row [0 ], point , sizeof (* point )); /* row[0]=p*1 */ \
@@ -153,16 +153,44 @@ static void ptype##s_precompute_wbits(ptype##_affine table[], size_t wbits, \
153153 rows = row = (ptype *)(&table[top]); \
154154 for (i = 0; i < stride; i++, row += nwin) \
155155 point = *points ? *points++ : point+1, \
156- ptype##_precompute_row_wbits (row, wbits , point); \
156+ ptype##_precompute_row (row, nwin , point); \
157157 ptype##s_to_affine_row_wbits(&table[top], rows, wbits, stride); \
158158 top += stride << (wbits-1); \
159159 npoints -= stride; \
160160 } \
161- rows = row = alloca(2*sizeof(ptype##_affine) * npoints * nwin); \
162- for (i = 0; i < npoints; i++, row += nwin) \
163- point = *points ? *points++ : point+1, \
164- ptype##_precompute_row_wbits(row, wbits, point); \
165- ptype##s_to_affine_row_wbits(&table[top], rows, wbits, npoints); \
161+ if ((i = 2*sizeof(ptype##_affine)*npoints*nwin) <= SCRATCH_LIMIT) { \
162+ rows = row = alloca(i); \
163+ for (i = 0; i < npoints; i++, row += nwin) \
164+ point = *points ? *points++ : point+1, \
165+ ptype##_precompute_row(row, nwin, point); \
166+ ptype##s_to_affine_row_wbits(&table[top], rows, wbits, npoints); \
167+ } else { \
168+ const ptype *pp[2]; \
169+ \
170+ stride = SCRATCH_LIMIT / sizeof(ptype); \
171+ stride -= stride % 2; \
172+ if (stride > nwin) stride = nwin; \
173+ \
174+ pp[0] = row = alloca(stride * sizeof(ptype)); \
175+ pp[1] = NULL; \
176+ for (i = 0; i < npoints; i++, top += nwin) { \
177+ size_t j, k, n; \
178+ \
179+ point = *points ? *points++ : point+1; \
180+ ptype##_precompute_row(row, stride, point); \
181+ ptype##s_to_affine(&table[top], pp, stride); \
182+ for (j = stride; j < nwin; j += stride) { \
183+ n = (j+stride) <= nwin ? stride : nwin-j; \
184+ for (k = 0; k < n-1; k++) \
185+ ptype##_add_affine(&row[k], &row[stride-1], &table[top+k]); \
186+ if (j == stride) \
187+ ptype##_double(&row[k], &row[stride-1]); \
188+ else \
189+ ptype##_add_affine(&row[k], &row[stride-1], &table[top+k]); \
190+ ptype##s_to_affine(&table[top+j], pp, n); \
191+ } \
192+ } \
193+ } \
166194} \
167195\
168196size_t prefix##s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints) \
0 commit comments