@@ -139,22 +139,42 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
139139! storage scheme, and radix8_ditN_cy_dif1 for details on the reduced-length weights array scheme.
140140*/
141141 const char func [] = "radix320_ditN_cy_dif1" ;
142+ #ifdef USE_SSE2
142143 static int thr_id = 0 ; // Master thread gets this special id
144+ #endif
145+ #if !defined(MULTITHREAD ) && defined(USE_SSE2 )
143146 const int pfetch_dist = PFETCH_DIST ;
147+ #endif
144148 const int stride = (int )RE_IM_STRIDE << 1 ; // main-array loop stride = 2*RE_IM_STRIDE
145149 static double wts_mult [2 ], inv_mult [2 ]; // Const wts-multiplier and 2*(its multiplicative inverse)
150+ #if !defined(MULTITHREAD ) && !defined(USE_SSE2 )
146151 double wt_re ,wt_im , wi_re ,wi_im ; // Fermat-mod/LOACC weights stuff, used in both scalar and SIMD mode
152+ #endif
147153 // Cleanup loop assumes carryins propagate at most 4 words up, but need at least 1 vec_cmplx
148154 // (2 vec_dbl)'s worth of doubles in wraparound step, hence AVX-512 needs value bumped up:
149155 #ifdef USE_AVX512
150156 const int jhi_wrap = 15 ;
151157 #else
152158 const int jhi_wrap = 7 ;
153159 #endif
154- int NDIVR ,i ,incr ,j ,j1 ,j2 ,jt ,jp ,jstart ,jhi ,full_pass ,k ,khi ,l ,ntmp ,outer ,nbytes , ilo ,ihi ,hi_neg ,k0 ,k1 ,k2 ,k3 ,k4 ,nshift ;
160+ int NDIVR ,i ,j ,j1 ,jt ,jhi ,full_pass ,khi ,l ,outer ;
161+ #if !defined(MULTITHREAD ) && !defined(USE_SSE2 )
162+ int j2 ,jp ,ntmp ;
163+ #endif
164+ #ifndef MULTITHREAD
165+ int jstart , ilo ,ihi ,hi_neg ;;
166+ #endif
167+ #ifdef USE_SSE2
168+ int nbytes ;
169+ #endif
170+ #ifndef MULTITHREAD
171+ int k0 ,k1 ,k2 ,k3 ,k4 ,nshift ;
172+ #endif
173+ #if !defined(MULTITHREAD ) && defined(USE_SSE2 )
155174 // incr = Carry-chain wts-multipliers recurrence length, which must divide
156175 // RADIX/[n-wayness of carry macro], e.g. RADIX/[16|8|4] = 20|40|80 for avx512,avx,sse, respectively:
157176 // Note: Not able to get away with as large a setting of incr for radix 320 as we did for 160.
177+ int incr ;
158178 #ifdef USE_AVX512
159179 const int incr_long = 10 ,incr_med = 5 ,incr_short = 5 ;
160180 #elif defined(USE_AVX2 )
@@ -177,6 +197,7 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
177197 incr = incr_short ;
178198 else
179199 incr = incr_hiacc ;
200+ #endif // !MULTITHREAD && USE_SSE2
180201
181202 // Jun 2018: Add support for residue shift. (Only LL-test needs intervention at carry-loop level).
182203 int target_idx = -1 , target_set = 0 ,tidx_mod_stride ;
@@ -189,7 +210,9 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
189210 static int poff [RADIX >>2 ]; // Store mults of p4 offset for loop control
190211#ifndef MULTITHREAD
191212// Shared DIF+DIT:
213+ #ifndef USE_SSE2
192214 double rt ,it ;
215+ #endif
193216 static int t_offsets [64 ];
194217 static int plo [16 ],phi [20 ];
195218 // Need storage for 4 circular-shifts perms of a basic 5-vector, with shift count in [0,4] that means 4*9 elts:
@@ -230,14 +253,25 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
230253 // Local storage: We must use an array here because scalars have no guarantees about relative address offsets
231254 // [and even if those are contiguous-as-hoped-for, they may run in reverse]; Make array type (struct complex)
232255 // to allow us to use the same offset-indexing as in the original radix-32 in-place DFT macros:
256+ #ifndef MULTITHREAD
233257 double * addr ;
234- struct complex t [RADIX ], * tptr ;
235- int * itmp ,* itm2 ; // Pointer into the bjmodn array
258+ #endif
259+ struct complex t [RADIX ];
260+ #if !defined(MULTITHREAD ) && !defined(USE_SSE2 )
261+ struct complex * tptr ;
262+ #endif
263+ #ifndef MULTITHREAD
264+ int * itmp ; // Pointer into the bjmodn array
265+ #endif
266+ #if !defined(MULTITHREAD ) && defined(USE_AVX ) && !defined(USE_AVX512 )
267+ int * itm2 ; // Pointer into the bjmodn array
268+ #endif
236269 int err ;
237270 static int first_entry = TRUE;
238271
239272/*...stuff for the reduced-length DWT weights array is here: */
240273 int n_div_nwt ;
274+ #ifndef MULTITHREAD
241275 int col ,co2 ,co3 ;
242276 #ifdef USE_AVX512
243277 double t0 ,t1 ,t2 ,t3 ;
@@ -252,6 +286,7 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
252286 int n_minus_sil ,n_minus_silp1 ,sinwt ,sinwtm1 ;
253287 double wtl ,wtlp1 ,wtn ,wtnm1 ; /* Mersenne-mod weights stuff */
254288 #endif
289+ #endif // !MULTITHREAD
255290
256291#ifdef USE_SSE2
257292
@@ -270,20 +305,37 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
270305 double * add0 ,* add1 ,* add2 ,* add3 ; /* Addresses into array sections */
271306 #endif
272307
308+ #ifndef MULTITHREAD
273309 static int * bjmodn ; // Alloc mem for this along with other SIMD stuff
310+ #endif
311+ #ifndef USE_AVX512
274312 const double crnd = 3.0 * 0x4000000 * 0x2000000 ;
313+ #endif
314+ #ifndef USE_AVX
275315 struct complex * ctmp ; // Hybrid AVX-DFT/SSE2-carry scheme used for Mersenne-mod needs a 2-word-double pointer
316+ #endif
276317 vec_dbl
277318 #ifndef MULTITHREAD
278319 * va0 ,* va1 ,* va2 ,* va3 ,* va4 , * vb0 ,* vb1 ,* vb2 ,* vb3 ,* vb4 ,
320+ #if defined(USE_AVX2 ) && !defined (USE_AVX512 )
279321 * wa0 ,* wa1 ,* wa2 ,* wa3 ,* wa4 , * wb0 ,* wb1 ,* wb2 ,* wb3 ,* wb4 ,
322+ #endif
280323 #endif
281- * tmp ,* tm0 ,* tm1 ,* tm2 ; // Non-static utility ptrs
324+ * tmp ,* tm2 // Non-static utility ptrs
325+ #ifndef USE_SSE2
326+ ,* tm0 // Non-static utility ptrs
327+ #endif
328+ #if !defined (MULTITHREAD ) && defined (USE_SSE2 )
329+ ,* tm1 // Non-static utility ptrs
330+ #endif
331+ ;
282332 static vec_dbl * two ,
283333 * ycc1 ,* yss1 ,* ycc2 ,* yss2 ,* yss3 , // radiy-5 DFT trig consts
284334 * max_err , * sse2_rnd , * half_arr ,
285335 * r00 , // Head of RADIX*vec_cmplx-sized local store #1
336+ #if !defined(MULTITHREAD ) && defined(USE_SSE2 )
286337 * s1p00 , // Head of RADIX*vec_cmplx-sized local store #2
338+ #endif
287339 * cy ; // Need RADIX/2 slots for sse2 carries, RADIX/4 for avx
288340#else
289341 static int p0123 [4 ];
@@ -293,7 +345,10 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
293345
294346 static struct cy_thread_data_t * tdat = 0x0 ;
295347 // Threadpool-based dispatch stuff:
296- static int main_work_units = 0 , pool_work_units = 0 ;
348+ #if 0 //def OS_TYPE_MACOSX
349+ static int main_work_units = 0 ;
350+ #endif
351+ static int pool_work_units = 0 ;
297352 static struct threadpool * tpool = 0x0 ;
298353 static int task_is_blocking = TRUE;
299354 static thread_control_t thread_control = {0 ,0 ,0 };
@@ -327,8 +382,10 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
327382 ASSERT (0 , "Fermat-mod only available for radices 7,8,9,15 and their multiples!" );
328383 }
329384
385+ #ifndef MULTITHREAD
330386 // Init these to get rid of GCC "may be used uninitialized in this function" warnings:
331387 col = co2 = co3 = -1 ;
388+ #endif
332389 // Jan 2018: To support PRP-testing, read the LR-modpow-scalar-multiply-needed bit for the current iteration from the global array:
333390 double prp_mult = 1.0 ;
334391 if ((TEST_TYPE & 0xfffffffe ) == TEST_TYPE_PRP ) { // Mask off low bit to lump together PRP and PRP-C tests
@@ -507,7 +564,10 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
507564 __r0 = sc_ptr ;
508565 #endif
509566 tmp = sc_ptr ; r00 = tmp ; // Head of RADIX*vec_cmplx-sized local store #1
510- tmp += 0x280 ; s1p00 = tmp ; // Head of RADIX*vec_cmplx-sized local store #2
567+ tmp += 0x280 ; // Head of RADIX*vec_cmplx-sized local store #2
568+ #if !defined(MULTITHREAD ) && defined(USE_SSE2 )
569+ s1p00 = tmp ; // Head of RADIX*vec_cmplx-sized local store #2
570+ #endif
511571 tmp += 0x280 ;
512572 two = tmp + 0x0 ;
513573 // DFT-64 roots all def'd locally in the DFT-64 functions, only need the radix-5 roots here:
@@ -789,23 +849,29 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
789849
790850 #ifdef USE_AVX512
791851 #ifdef CARRY_16_WAY
852+ #ifndef MULTITHREAD
792853 n_minus_sil = (struct uint32x16 * )sse_n + 1 ;
793854 n_minus_silp1 = (struct uint32x16 * )sse_n + 2 ;
794855 sinwt = (struct uint32x16 * )sse_n + 3 ;
795856 sinwtm1 = (struct uint32x16 * )sse_n + 4 ;
857+ #endif
796858 nbytes += 256 ;
797859 #else
860+ #ifndef MULTITHREAD
798861 n_minus_sil = (struct uint32x8 * )sse_n + 1 ;
799862 n_minus_silp1 = (struct uint32x8 * )sse_n + 2 ;
800863 sinwt = (struct uint32x8 * )sse_n + 3 ;
801864 sinwtm1 = (struct uint32x8 * )sse_n + 4 ;
865+ #endif
802866 nbytes += 128 ;
803867 #endif
804868 #elif defined(USE_AVX )
869+ #ifndef MULTITHREAD
805870 n_minus_sil = (struct uint32x4 * )sse_n + 1 ;
806871 n_minus_silp1 = (struct uint32x4 * )sse_n + 2 ;
807872 sinwt = (struct uint32x4 * )sse_n + 3 ;
808873 sinwtm1 = (struct uint32x4 * )sse_n + 4 ;
874+ #endif
809875 nbytes += 64 ;
810876 #endif
811877
@@ -817,12 +883,14 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
817883 tmp = tm2 ; tm2 += cslots_in_local_store ;
818884 }
819885
886+ #ifndef MULTITHREAD
820887 // For large radices, array-access to bjmodn means only init base-ptr here:
821888 #ifdef USE_AVX
822889 bjmodn = (int * )(sinwtm1 + RE_IM_STRIDE );
823890 #else
824891 bjmodn = (int * )(sse_n + RE_IM_STRIDE );
825892 #endif
893+ #endif
826894
827895 #endif // USE_SSE2
828896
@@ -2528,18 +2596,21 @@ void radix320_dit_pass1(double a[], int n)
25282596 void *
25292597 cy320_process_chunk (void * targ ) // Thread-arg pointer *must* be cast to void and specialized inside the function
25302598 {
2531- const char func [] = "radix320_ditN_cy_dif1" ;
25322599 struct cy_thread_data_t * thread_arg = targ ; // Move to top because scalar-mode carry pointers taken directly from it
25332600 double * addr ;
2601+ #ifdef USE_SSE2
25342602 const int pfetch_dist = PFETCH_DIST ;
2603+ #endif
25352604 const int stride = (int )RE_IM_STRIDE << 1 ; // main-array loop stride = 2*RE_IM_STRIDE
25362605 uint32 p1 ,p2 ,p3 ,p4 ,p5 ,p6 ,p7 ,p8 ,p9 ,pa ,pb ,pc ,pd ,pe ,pf
2537- ,p10 ,p20 ,p30 ,p40 ,p50 ,p60 ,p70 ,p80 ,p90 ,pa0 ,pb0 ,pc0 ,pd0 ,pe0 ,pf0 ,pg0 ,ph0 ,pi0 ,pj0 , nsave = 0 ;
2606+ ,p10 ,p20 ,p30 ,p40 ,p50 ,p60 ,p70 ,p80 ,p90 ,pa0 ,pb0 ,pc0 ,pd0 ,pe0 ,pf0 ,pg0 ,ph0 ,pi0 ,pj0 ;
25382607 // Shared DIF+DIT:
2608+ #ifndef USE_SSE2
25392609 double rt ,it ;
25402610 double wt_re ,wt_im , wi_re ,wi_im ; // Fermat-mod/LOACC weights stuff, used in both scalar and SIMD mode
2611+ #endif
25412612 int poff [RADIX >>2 ]; // Store [RADIX/4] mults of p04 offset for loop control
2542- int ilo ,ihi ,idx , jdx , hi_neg ,k0 ,k1 ,k2 ,k3 ,k4 ,nshift , * iptr ;
2613+ int ilo ,ihi ,hi_neg ,k0 ,k1 ,k2 ,k3 ,k4 ,nshift ;
25432614 int plo [16 ],phi [20 ], t_offsets [64 ];
25442615 uint64 i0 ,i1 ,i2 ,i3 ;
25452616 // DIF:
@@ -2570,10 +2641,15 @@ void radix320_dit_pass1(double a[], int n)
25702641 0x4576013289bafedcull // [I]
25712642 };
25722643
2573- int incr ,j ,j1 ,j2 ,jt ,jp ,k ,l ,ntmp ;
2644+ int j ,j1 ,jt ,l ;
2645+ #ifndef USE_SSE2
2646+ int j2 ,jp ,ntmp ;
2647+ #endif
2648+ #ifdef USE_SSE2
25742649 // incr = Carry-chain wts-multipliers recurrence length, which must divide
25752650 // RADIX/[n-wayness of carry macro], e.g. RADIX/[16|8|4] = 20|40|80 for avx512,avx,sse, respectively:
25762651 // Note: Not able to get away with as large a setting of incr for radix 320 as we did for 160.
2652+ int incr ;
25772653 #ifdef USE_AVX512
25782654 const int incr_long = 10 ,incr_med = 5 ,incr_short = 5 ;
25792655 #elif defined(USE_AVX2 )
@@ -2596,6 +2672,7 @@ void radix320_dit_pass1(double a[], int n)
25962672 incr = incr_short ;
25972673 else
25982674 incr = incr_hiacc ;
2675+ #endif // USE_SSE2
25992676
26002677 #ifdef USE_AVX512
26012678 double t0 ,t1 ,t2 ,t3 ;
@@ -2613,21 +2690,40 @@ void radix320_dit_pass1(double a[], int n)
26132690
26142691 #ifdef USE_SSE2
26152692
2693+ #ifndef USE_AVX512
26162694 const double crnd = 3.0 * 0x4000000 * 0x2000000 ;
2617- int * itmp ,* itm2 ; // Pointer into the bjmodn array
2695+ #endif
2696+ int * itmp ; // Pointer into the bjmodn array
2697+ #if defined(USE_AVX ) && !defined(USE_AVX512 )
2698+ int * itm2 ; // Pointer into the bjmodn array
2699+ #endif
2700+ #ifndef USE_AVX
26182701 struct complex * ctmp ; // Hybrid AVX-DFT/SSE2-carry scheme used for Mersenne-mod needs a 2-word-double pointer
2702+ #endif
26192703 double * add0 ,* add1 ,* add2 ,* add3 ;
26202704 int * bjmodn ; // Alloc mem for this along with other SIMD stuff
26212705 vec_dbl * tmp ,* tm1 ,* tm2 , // Non-static utility ptrs
2622- * va0 ,* va1 ,* va2 ,* va3 ,* va4 , * vb0 ,* vb1 ,* vb2 ,* vb3 ,* vb4 ,
2623- * wa0 ,* wa1 ,* wa2 ,* wa3 ,* wa4 , * wb0 ,* wb1 ,* wb2 ,* wb3 ,* wb4 ;
2624- vec_dbl * two ,
2625- * ycc1 ,* yss1 ,* ycc2 ,* yss2 ,* yss3 , // radiy-5 DFT trig consts
2626- * max_err , * sse2_rnd , * half_arr ,
2706+ * va0 ,* va1 ,* va2 ,* va3 ,* va4 , * vb0 ,* vb1 ,* vb2 ,* vb3 ,* vb4
2707+ #if defined(USE_AVX2 ) && !defined(USE_AVX512 )
2708+ ,* wa0 ,* wa1 ,* wa2 ,* wa3 ,* wa4 , * wb0 ,* wb1 ,* wb2 ,* wb3 ,* wb4
2709+ #endif
2710+ ;
2711+ vec_dbl
2712+ #if defined(USE_AVX2 ) && !defined(USE_AVX512 )
2713+ * two ,
2714+ #endif
2715+ * ycc1 ,/* *yss1,*ycc2,*yss2,*yss3, */ // radiy-5 DFT trig consts
2716+ * max_err ,
2717+ #ifndef USE_AVX512
2718+ * sse2_rnd ,
2719+ #endif
2720+ * half_arr ,
26272721 * r00 , // Head of RADIX*vec_cmplx-sized local store #1
26282722 * s1p00 , // Head of RADIX*vec_cmplx-sized local store #2
26292723 * cy ; // Need RADIX/2 slots for sse2 carries, RADIX/4 for avx
2724+ #ifndef USE_AVX512
26302725 double dtmp ;
2726+ #endif
26312727 uint64 * sign_mask , * sse_bw , * sse_sw , * sse_n ;
26322728
26332729 #else
@@ -2650,8 +2746,9 @@ void radix320_dit_pass1(double a[], int n)
26502746 #endif
26512747
26522748 // int data:
2749+ #ifdef USE_SSE2
26532750 int thr_id = thread_arg -> tid ;
2654- int iter = thread_arg -> iter ;
2751+ #endif
26552752 int NDIVR = thread_arg -> ndivr ;
26562753 int n = NDIVR * RADIX ;
26572754 int target_idx = thread_arg -> target_idx ;
@@ -2669,7 +2766,7 @@ void radix320_dit_pass1(double a[], int n)
26692766
26702767 // double data:
26712768 double maxerr = thread_arg -> maxerr ;
2672- double scale = thread_arg -> scale ; int full_pass = scale < 0.5 ;
2769+ double scale = thread_arg -> scale ;
26732770 double prp_mult = thread_arg -> prp_mult ;
26742771
26752772 // pointer data:
@@ -2881,18 +2978,20 @@ void radix320_dit_pass1(double a[], int n)
28812978 tmp = r00 = thread_arg -> r00 ; // Head of RADIX*vec_cmplx-sized local store #1
28822979 tmp += 0x280 ; s1p00 = tmp ; // Head of RADIX*vec_cmplx-sized local store #2
28832980 tmp += 0x280 ;
2981+ #if defined(USE_AVX2 ) && !defined(USE_AVX512 )
28842982 two = tmp + 0x0 ;
2983+ #endif
28852984 // DFT-64 roots all def'd locally in the DFT-64 functions, only need the radix-5 roots here:
28862985 ycc1 = tmp + 0x1 ; // radix-5 DFT trig consts
2887- ycc2 = tmp + 0x2 ;
2888- yss1 = tmp + 0x3 ;
2889- yss2 = tmp + 0x4 ;
2890- yss3 = tmp + 0x5 ;
2986+ // ycc2 = tmp + 0x2;
2987+ // yss1 = tmp + 0x3;
2988+ // yss2 = tmp + 0x4;
2989+ // yss3 = tmp + 0x5;
28912990 tmp += 0x6 ; // sc_ptr += 0x506; add a pad so the tmp += 2 below gives half_arr = 0x508 + (RADIX/vec_dbl-length)
28922991 #ifdef USE_AVX512
28932992 cy = tmp ; tmp += 0x28 ; // RADIX/8 vec_dbl slots for carry sub-array
28942993 max_err = tmp + 0x00 ;
2895- sse2_rnd = tmp + 0x01 ;
2994+ // sse2_rnd= tmp + 0x01;
28962995 half_arr = tmp + 0x02 ; // sc_ptr += 0x(508 + 28) = 0x538; This is where the value of half_arr_offset320 comes from
28972996 #elif defined(USE_AVX )
28982997 cy = tmp ; tmp += 0x50 ; // RADIX/4 vec_dbl slots
0 commit comments