@@ -172,7 +172,9 @@ int radix64_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
172172! storage scheme, and radix16_ditN_cy_dif1 for details on the reduced-length weights array scheme.
173173*/
174174 const char func [] = "radix64_ditN_cy_dif1" ;
175+ #if !USE_SCALAR_DFT_MACRO && !defined(USE_SSE2 )
175176 static int thr_id = 0 ; // Master thread gets this special id
177+ #endif
176178#if USE_SCALAR_DFT_MACRO
177179 static int dft_offsets [RADIX ], c_offsets [RADIX ];
178180#endif
@@ -267,11 +269,13 @@ int radix64_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
267269 * r20 ,* r22 ,* r24 ,* r26 ,* r28 ,* r2A ,* r2C ,* r2E ,* r30 ,* r32 ,* r34 ,* r36 ,* r38 ,* r3A ,* r3C ,* r3E ,
268270 * r40 ,* r42 ,* r44 ,* r46 ,* r48 ,* r4A ,* r4C ,* r4E ,* r50 ,* r52 ,* r54 ,* r56 ,* r58 ,* r5A ,* r5C ,* r5E ,
269271 * r60 ,* r62 ,* r64 ,* r66 ,* r68 ,* r6A ,* r6C ,* r6E ,* r70 ,* r72 ,* r74 ,* r76 ,* r78 ,* r7A ,* r7C ,* r7E ,
272+ #ifndef MULTITHREAD
270273 // ...and s's as pointers-to-complex-SIMD; thus the r-indices run 2x faster than the s-ones:
271- * s1p00 ,* s1p01 ,* s1p02 ,* s1p03 ,* s1p04 ,* s1p05 ,* s1p06 ,* s1p07 ,* s1p08 ,* s1p09 ,* s1p0a ,* s1p0b ,* s1p0c ,* s1p0d ,* s1p0e ,* s1p0f ,
272- * s1p10 ,* s1p11 ,* s1p12 ,* s1p13 ,* s1p14 ,* s1p15 ,* s1p16 ,* s1p17 ,* s1p18 ,* s1p19 ,* s1p1a ,* s1p1b ,* s1p1c ,* s1p1d ,* s1p1e ,* s1p1f ,
273- * s1p20 ,* s1p21 ,* s1p22 ,* s1p23 ,* s1p24 ,* s1p25 ,* s1p26 ,* s1p27 ,* s1p28 ,* s1p29 ,* s1p2a ,* s1p2b ,* s1p2c ,* s1p2d ,* s1p2e ,* s1p2f ,
274- * s1p30 ,* s1p31 ,* s1p32 ,* s1p33 ,* s1p34 ,* s1p35 ,* s1p36 ,* s1p37 ,* s1p38 ,* s1p39 ,* s1p3a ,* s1p3b ,* s1p3c ,* s1p3d ,* s1p3e ,* s1p3f ,
274+ * s1p00 ,* s1p01 ,* s1p02 ,* s1p03 ,* s1p04 ,* s1p05 ,* s1p06 ,* s1p07 ,* s1p08 ,/* *s1p09,*s1p0a,*s1p0b,*s1p0c,*s1p0d,*s1p0e,*s1p0f, */
275+ * s1p10 ,/* *s1p11,*s1p12,*s1p13,*s1p14,*s1p15,*s1p16,*s1p17, */ * s1p18 ,/* *s1p19,*s1p1a,*s1p1b,*s1p1c,*s1p1d,*s1p1e,*s1p1f, */
276+ * s1p20 ,/* *s1p21,*s1p22,*s1p23,*s1p24,*s1p25,*s1p26,*s1p27, */ * s1p28 ,/* *s1p29,*s1p2a,*s1p2b,*s1p2c,*s1p2d,*s1p2e,*s1p2f, */
277+ * s1p30 ,/* *s1p31,*s1p32,*s1p33,*s1p34,*s1p35,*s1p36,*s1p37, */ * s1p38 ,/* *s1p39,*s1p3a,*s1p3b,*s1p3c,*s1p3d,*s1p3e,*s1p3f, */
278+ #endif
275279 * cy_r ,* cy_i ; // Need RADIX slots for sse2 carries, RADIX/2 for avx
276280 #ifdef USE_AVX
277281 static vec_dbl * base_negacyclic_root ;
@@ -287,7 +291,10 @@ int radix64_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
287291
288292 static struct cy_thread_data_t * tdat = 0x0 ;
289293 // Threadpool-based dispatch stuff:
290- static int main_work_units = 0 , pool_work_units = 0 ;
294+ #if 0 //def OS_TYPE_MACOSX
295+ static int main_work_units = 0 ;
296+ #endif
297+ static int pool_work_units = 0 ;
291298 static struct threadpool * tpool = 0x0 ;
292299 static int task_is_blocking = TRUE;
293300 static thread_control_t thread_control = {0 ,0 ,0 };
@@ -556,38 +563,40 @@ int radix64_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
556563 r3C = tmp + 0x3c ; r7C = tmp + 0x7c ;
557564 r3E = tmp + 0x3e ; r7E = tmp + 0x7e ;
558565 tmp += 0x80 ;
566+ #ifndef MULTITHREAD
559567 s1p00 = tmp + 0x00 ; s1p20 = tmp + 0x40 ;
560- s1p01 = tmp + 0x02 ; s1p21 = tmp + 0x42 ;
561- s1p02 = tmp + 0x04 ; s1p22 = tmp + 0x44 ;
562- s1p03 = tmp + 0x06 ; s1p23 = tmp + 0x46 ;
563- s1p04 = tmp + 0x08 ; s1p24 = tmp + 0x48 ;
564- s1p05 = tmp + 0x0a ; s1p25 = tmp + 0x4a ;
565- s1p06 = tmp + 0x0c ; s1p26 = tmp + 0x4c ;
566- s1p07 = tmp + 0x0e ; s1p27 = tmp + 0x4e ;
568+ s1p01 = tmp + 0x02 ; // s1p21 = tmp + 0x42;
569+ s1p02 = tmp + 0x04 ; // s1p22 = tmp + 0x44;
570+ s1p03 = tmp + 0x06 ; // s1p23 = tmp + 0x46;
571+ s1p04 = tmp + 0x08 ; // s1p24 = tmp + 0x48;
572+ s1p05 = tmp + 0x0a ; // s1p25 = tmp + 0x4a;
573+ s1p06 = tmp + 0x0c ; // s1p26 = tmp + 0x4c;
574+ s1p07 = tmp + 0x0e ; // s1p27 = tmp + 0x4e;
567575 s1p08 = tmp + 0x10 ; s1p28 = tmp + 0x50 ;
568- s1p09 = tmp + 0x12 ; s1p29 = tmp + 0x52 ;
569- s1p0a = tmp + 0x14 ; s1p2a = tmp + 0x54 ;
570- s1p0b = tmp + 0x16 ; s1p2b = tmp + 0x56 ;
571- s1p0c = tmp + 0x18 ; s1p2c = tmp + 0x58 ;
572- s1p0d = tmp + 0x1a ; s1p2d = tmp + 0x5a ;
573- s1p0e = tmp + 0x1c ; s1p2e = tmp + 0x5c ;
574- s1p0f = tmp + 0x1e ; s1p2f = tmp + 0x5e ;
576+ // s1p09 = tmp + 0x12; s1p29 = tmp + 0x52;
577+ // s1p0a = tmp + 0x14; s1p2a = tmp + 0x54;
578+ // s1p0b = tmp + 0x16; s1p2b = tmp + 0x56;
579+ // s1p0c = tmp + 0x18; s1p2c = tmp + 0x58;
580+ // s1p0d = tmp + 0x1a; s1p2d = tmp + 0x5a;
581+ // s1p0e = tmp + 0x1c; s1p2e = tmp + 0x5c;
582+ // s1p0f = tmp + 0x1e; s1p2f = tmp + 0x5e;
575583 s1p10 = tmp + 0x20 ; s1p30 = tmp + 0x60 ;
576- s1p11 = tmp + 0x22 ; s1p31 = tmp + 0x62 ;
577- s1p12 = tmp + 0x24 ; s1p32 = tmp + 0x64 ;
578- s1p13 = tmp + 0x26 ; s1p33 = tmp + 0x66 ;
579- s1p14 = tmp + 0x28 ; s1p34 = tmp + 0x68 ;
580- s1p15 = tmp + 0x2a ; s1p35 = tmp + 0x6a ;
581- s1p16 = tmp + 0x2c ; s1p36 = tmp + 0x6c ;
582- s1p17 = tmp + 0x2e ; s1p37 = tmp + 0x6e ;
584+ // s1p11 = tmp + 0x22; s1p31 = tmp + 0x62;
585+ // s1p12 = tmp + 0x24; s1p32 = tmp + 0x64;
586+ // s1p13 = tmp + 0x26; s1p33 = tmp + 0x66;
587+ // s1p14 = tmp + 0x28; s1p34 = tmp + 0x68;
588+ // s1p15 = tmp + 0x2a; s1p35 = tmp + 0x6a;
589+ // s1p16 = tmp + 0x2c; s1p36 = tmp + 0x6c;
590+ // s1p17 = tmp + 0x2e; s1p37 = tmp + 0x6e;
583591 s1p18 = tmp + 0x30 ; s1p38 = tmp + 0x70 ;
584- s1p19 = tmp + 0x32 ; s1p39 = tmp + 0x72 ;
585- s1p1a = tmp + 0x34 ; s1p3a = tmp + 0x74 ;
586- s1p1b = tmp + 0x36 ; s1p3b = tmp + 0x76 ;
587- s1p1c = tmp + 0x38 ; s1p3c = tmp + 0x78 ;
588- s1p1d = tmp + 0x3a ; s1p3d = tmp + 0x7a ;
589- s1p1e = tmp + 0x3c ; s1p3e = tmp + 0x7c ;
590- s1p1f = tmp + 0x3e ; s1p3f = tmp + 0x7e ;
592+ //s1p19 = tmp + 0x32; s1p39 = tmp + 0x72;
593+ //s1p1a = tmp + 0x34; s1p3a = tmp + 0x74;
594+ //s1p1b = tmp + 0x36; s1p3b = tmp + 0x76;
595+ //s1p1c = tmp + 0x38; s1p3c = tmp + 0x78;
596+ //s1p1d = tmp + 0x3a; s1p3d = tmp + 0x7a;
597+ //s1p1e = tmp + 0x3c; s1p3e = tmp + 0x7c;
598+ //s1p1f = tmp + 0x3e; s1p3f = tmp + 0x7e;
599+ #endif
591600 tmp += 0x80 ;
592601 #if !USE_SCALAR_DFT_MACRO
593602 two = tmp + 0 ; // AVX+ versions of various DFT macros need consts [2,1,sqrt2,isrt2] quartet laid out thusly
@@ -2463,10 +2472,10 @@ void radix64_dit_pass1(double a[], int n)
24632472 * r40 ,* r42 ,* r44 ,* r46 ,* r48 ,* r4A ,* r4C ,* r4E ,* r50 ,* r52 ,* r54 ,* r56 ,* r58 ,* r5A ,* r5C ,* r5E ,
24642473 * r60 ,* r62 ,* r64 ,* r66 ,* r68 ,* r6A ,* r6C ,* r6E ,* r70 ,* r72 ,* r74 ,* r76 ,* r78 ,* r7A ,* r7C ,* r7E ,
24652474 // ...and s's as pointers-to-complex-SIMD; thus the r-indices run 2x faster than the s-ones:
2466- * s1p00 ,* s1p01 ,* s1p02 ,* s1p03 ,* s1p04 ,* s1p05 ,* s1p06 ,* s1p07 ,* s1p08 ,* s1p09 ,* s1p0a ,* s1p0b ,* s1p0c ,* s1p0d ,* s1p0e ,* s1p0f ,
2467- * s1p10 ,* s1p11 ,* s1p12 ,* s1p13 ,* s1p14 ,* s1p15 ,* s1p16 ,* s1p17 ,* s1p18 ,* s1p19 ,* s1p1a ,* s1p1b ,* s1p1c ,* s1p1d ,* s1p1e ,* s1p1f ,
2468- * s1p20 ,* s1p21 ,* s1p22 ,* s1p23 ,* s1p24 ,* s1p25 ,* s1p26 ,* s1p27 ,* s1p28 ,* s1p29 ,* s1p2a ,* s1p2b ,* s1p2c ,* s1p2d ,* s1p2e ,* s1p2f ,
2469- * s1p30 ,* s1p31 ,* s1p32 ,* s1p33 ,* s1p34 ,* s1p35 ,* s1p36 ,* s1p37 ,* s1p38 ,* s1p39 ,* s1p3a ,* s1p3b ,* s1p3c ,* s1p3d ,* s1p3e ,* s1p3f ,
2475+ * s1p00 ,* s1p01 ,* s1p02 ,* s1p03 ,* s1p04 ,* s1p05 ,* s1p06 ,* s1p07 ,* s1p08 ,/* * s1p09,*s1p0a,*s1p0b,*s1p0c,*s1p0d,*s1p0e,*s1p0f, */
2476+ * s1p10 ,/* * s1p11,*s1p12,*s1p13,*s1p14,*s1p15,*s1p16,*s1p17,*/ * s1p18 ,/* * s1p19,*s1p1a,*s1p1b,*s1p1c,*s1p1d,*s1p1e,*s1p1f, */
2477+ * s1p20 ,/* * s1p21,*s1p22,*s1p23,*s1p24,*s1p25,*s1p26,*s1p27,*/ * s1p28 ,/* * s1p29,*s1p2a,*s1p2b,*s1p2c,*s1p2d,*s1p2e,*s1p2f, */
2478+ * s1p30 ,/* * s1p31,*s1p32,*s1p33,*s1p34,*s1p35,*s1p36,*s1p37,*/ * s1p38 ,/* * s1p39,*s1p3a,*s1p3b,*s1p3c,*s1p3d,*s1p3e,*s1p3f, */
24702479 * cy_r ,* cy_i ; // Need RADIX slots for sse2 carries, RADIX/2 for avx
24712480 #ifdef USE_AVX
24722481 vec_dbl * base_negacyclic_root ;
@@ -2496,7 +2505,6 @@ void radix64_dit_pass1(double a[], int n)
24962505 #endif
24972506
24982507 // int data:
2499- int thr_id = thread_arg -> tid ;
25002508 int iter = thread_arg -> iter ;
25012509 int NDIVR = thread_arg -> ndivr ;
25022510 int n = NDIVR * RADIX , nm1 = n - 1 ;
@@ -2648,37 +2656,37 @@ void radix64_dit_pass1(double a[], int n)
26482656 r3E = tmp + 0x3e ; r7E = tmp + 0x7e ;
26492657 tmp += 0x80 ;
26502658 s1p00 = tmp + 0x00 ; s1p20 = tmp + 0x40 ;
2651- s1p01 = tmp + 0x02 ; s1p21 = tmp + 0x42 ;
2652- s1p02 = tmp + 0x04 ; s1p22 = tmp + 0x44 ;
2653- s1p03 = tmp + 0x06 ; s1p23 = tmp + 0x46 ;
2654- s1p04 = tmp + 0x08 ; s1p24 = tmp + 0x48 ;
2655- s1p05 = tmp + 0x0a ; s1p25 = tmp + 0x4a ;
2656- s1p06 = tmp + 0x0c ; s1p26 = tmp + 0x4c ;
2657- s1p07 = tmp + 0x0e ; s1p27 = tmp + 0x4e ;
2659+ s1p01 = tmp + 0x02 ; // s1p21 = tmp + 0x42;
2660+ s1p02 = tmp + 0x04 ; // s1p22 = tmp + 0x44;
2661+ s1p03 = tmp + 0x06 ; // s1p23 = tmp + 0x46;
2662+ s1p04 = tmp + 0x08 ; // s1p24 = tmp + 0x48;
2663+ s1p05 = tmp + 0x0a ; // s1p25 = tmp + 0x4a;
2664+ s1p06 = tmp + 0x0c ; // s1p26 = tmp + 0x4c;
2665+ s1p07 = tmp + 0x0e ; // s1p27 = tmp + 0x4e;
26582666 s1p08 = tmp + 0x10 ; s1p28 = tmp + 0x50 ;
2659- s1p09 = tmp + 0x12 ; s1p29 = tmp + 0x52 ;
2660- s1p0a = tmp + 0x14 ; s1p2a = tmp + 0x54 ;
2661- s1p0b = tmp + 0x16 ; s1p2b = tmp + 0x56 ;
2662- s1p0c = tmp + 0x18 ; s1p2c = tmp + 0x58 ;
2663- s1p0d = tmp + 0x1a ; s1p2d = tmp + 0x5a ;
2664- s1p0e = tmp + 0x1c ; s1p2e = tmp + 0x5c ;
2665- s1p0f = tmp + 0x1e ; s1p2f = tmp + 0x5e ;
2667+ // s1p09 = tmp + 0x12; s1p29 = tmp + 0x52;
2668+ // s1p0a = tmp + 0x14; s1p2a = tmp + 0x54;
2669+ // s1p0b = tmp + 0x16; s1p2b = tmp + 0x56;
2670+ // s1p0c = tmp + 0x18; s1p2c = tmp + 0x58;
2671+ // s1p0d = tmp + 0x1a; s1p2d = tmp + 0x5a;
2672+ // s1p0e = tmp + 0x1c; s1p2e = tmp + 0x5c;
2673+ // s1p0f = tmp + 0x1e; s1p2f = tmp + 0x5e;
26662674 s1p10 = tmp + 0x20 ; s1p30 = tmp + 0x60 ;
2667- s1p11 = tmp + 0x22 ; s1p31 = tmp + 0x62 ;
2668- s1p12 = tmp + 0x24 ; s1p32 = tmp + 0x64 ;
2669- s1p13 = tmp + 0x26 ; s1p33 = tmp + 0x66 ;
2670- s1p14 = tmp + 0x28 ; s1p34 = tmp + 0x68 ;
2671- s1p15 = tmp + 0x2a ; s1p35 = tmp + 0x6a ;
2672- s1p16 = tmp + 0x2c ; s1p36 = tmp + 0x6c ;
2673- s1p17 = tmp + 0x2e ; s1p37 = tmp + 0x6e ;
2675+ // s1p11 = tmp + 0x22; s1p31 = tmp + 0x62;
2676+ // s1p12 = tmp + 0x24; s1p32 = tmp + 0x64;
2677+ // s1p13 = tmp + 0x26; s1p33 = tmp + 0x66;
2678+ // s1p14 = tmp + 0x28; s1p34 = tmp + 0x68;
2679+ // s1p15 = tmp + 0x2a; s1p35 = tmp + 0x6a;
2680+ // s1p16 = tmp + 0x2c; s1p36 = tmp + 0x6c;
2681+ // s1p17 = tmp + 0x2e; s1p37 = tmp + 0x6e;
26742682 s1p18 = tmp + 0x30 ; s1p38 = tmp + 0x70 ;
2675- s1p19 = tmp + 0x32 ; s1p39 = tmp + 0x72 ;
2676- s1p1a = tmp + 0x34 ; s1p3a = tmp + 0x74 ;
2677- s1p1b = tmp + 0x36 ; s1p3b = tmp + 0x76 ;
2678- s1p1c = tmp + 0x38 ; s1p3c = tmp + 0x78 ;
2679- s1p1d = tmp + 0x3a ; s1p3d = tmp + 0x7a ;
2680- s1p1e = tmp + 0x3c ; s1p3e = tmp + 0x7c ;
2681- s1p1f = tmp + 0x3e ; s1p3f = tmp + 0x7e ;
2683+ // s1p19 = tmp + 0x32; s1p39 = tmp + 0x72;
2684+ // s1p1a = tmp + 0x34; s1p3a = tmp + 0x74;
2685+ // s1p1b = tmp + 0x36; s1p3b = tmp + 0x76;
2686+ // s1p1c = tmp + 0x38; s1p3c = tmp + 0x78;
2687+ // s1p1d = tmp + 0x3a; s1p3d = tmp + 0x7a;
2688+ // s1p1e = tmp + 0x3c; s1p3e = tmp + 0x7c;
2689+ // s1p1f = tmp + 0x3e; s1p3f = tmp + 0x7e;
26822690 tmp += 0x80 ;
26832691 #if !USE_SCALAR_DFT_MACRO
26842692 // To support FMA versions of the radix-8 macros used to build radix-64 we insert a standalone copy of the [2,1,sqrt2,isrt2] quartet:
0 commit comments