Skip to content

Commit 1d5b98b

Browse files
committed
Remove warnings from radix320_ditN_cy_dif1.c.
1 parent ecc3ab9 commit 1d5b98b

File tree

2 files changed

+130
-26
lines changed

2 files changed

+130
-26
lines changed

src/radix320_ditN_cy_dif1.c

Lines changed: 122 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -139,22 +139,42 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
139139
! storage scheme, and radix8_ditN_cy_dif1 for details on the reduced-length weights array scheme.
140140
*/
141141
const char func[] = "radix320_ditN_cy_dif1";
142+
#ifdef USE_SSE2
142143
static int thr_id = 0; // Master thread gets this special id
144+
#endif
145+
#if !defined(MULTITHREAD) && defined(USE_SSE2)
143146
const int pfetch_dist = PFETCH_DIST;
147+
#endif
144148
const int stride = (int)RE_IM_STRIDE << 1; // main-array loop stride = 2*RE_IM_STRIDE
145149
static double wts_mult[2], inv_mult[2]; // Const wts-multiplier and 2*(its multiplicative inverse)
150+
#if !defined(MULTITHREAD) && !defined(USE_SSE2)
146151
double wt_re,wt_im, wi_re,wi_im; // Fermat-mod/LOACC weights stuff, used in both scalar and SIMD mode
152+
#endif
147153
// Cleanup loop assumes carryins propagate at most 4 words up, but need at least 1 vec_cmplx
148154
// (2 vec_dbl)'s worth of doubles in wraparound step, hence AVX-512 needs value bumped up:
149155
#ifdef USE_AVX512
150156
const int jhi_wrap = 15;
151157
#else
152158
const int jhi_wrap = 7;
153159
#endif
154-
int NDIVR,i,incr,j,j1,j2,jt,jp,jstart,jhi,full_pass,k,khi,l,ntmp,outer,nbytes, ilo,ihi,hi_neg,k0,k1,k2,k3,k4,nshift;
160+
int NDIVR,i,j,j1,jt,jhi,full_pass,khi,l,outer;
161+
#if !defined(MULTITHREAD) && !defined(USE_SSE2)
162+
int j2,jp,ntmp;
163+
#endif
164+
#ifndef MULTITHREAD
165+
int jstart, ilo,ihi,hi_neg;;
166+
#endif
167+
#ifdef USE_SSE2
168+
int nbytes;
169+
#endif
170+
#ifndef MULTITHREAD
171+
int k0,k1,k2,k3,k4,nshift;
172+
#endif
173+
#if !defined(MULTITHREAD) && defined(USE_SSE2)
155174
// incr = Carry-chain wts-multipliers recurrence length, which must divide
156175
// RADIX/[n-wayness of carry macro], e.g. RADIX/[16|8|4] = 20|40|80 for avx512,avx,sse, respectively:
157176
// Note: Not able to get away with as large a setting of incr for radix 320 as we did for 160.
177+
int incr;
158178
#ifdef USE_AVX512
159179
const int incr_long = 10,incr_med = 5,incr_short = 5;
160180
#elif defined(USE_AVX2)
@@ -177,6 +197,7 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
177197
incr = incr_short;
178198
else
179199
incr = incr_hiacc;
200+
#endif // !MULTITHREAD && USE_SSE2
180201

181202
// Jun 2018: Add support for residue shift. (Only LL-test needs intervention at carry-loop level).
182203
int target_idx = -1, target_set = 0,tidx_mod_stride;
@@ -189,7 +210,9 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
189210
static int poff[RADIX>>2]; // Store mults of p4 offset for loop control
190211
#ifndef MULTITHREAD
191212
// Shared DIF+DIT:
213+
#ifndef USE_SSE2
192214
double rt,it;
215+
#endif
193216
static int t_offsets[64];
194217
static int plo[16],phi[20];
195218
// Need storage for 4 circular-shifts perms of a basic 5-vector, with shift count in [0,4] that means 4*9 elts:
@@ -230,14 +253,25 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
230253
// Local storage: We must use an array here because scalars have no guarantees about relative address offsets
231254
// [and even if those are contiguous-as-hoped-for, they may run in reverse]; Make array type (struct complex)
232255
// to allow us to use the same offset-indexing as in the original radix-32 in-place DFT macros:
256+
#ifndef MULTITHREAD
233257
double *addr;
234-
struct complex t[RADIX], *tptr;
235-
int *itmp,*itm2; // Pointer into the bjmodn array
258+
#endif
259+
struct complex t[RADIX];
260+
#if !defined(MULTITHREAD) && !defined(USE_SSE2)
261+
struct complex *tptr;
262+
#endif
263+
#ifndef MULTITHREAD
264+
int *itmp; // Pointer into the bjmodn array
265+
#endif
266+
#if !defined(MULTITHREAD) && defined(USE_AVX) && !defined(USE_AVX512)
267+
int *itm2; // Pointer into the bjmodn array
268+
#endif
236269
int err;
237270
static int first_entry=TRUE;
238271

239272
/*...stuff for the reduced-length DWT weights array is here: */
240273
int n_div_nwt;
274+
#ifndef MULTITHREAD
241275
int col,co2,co3;
242276
#ifdef USE_AVX512
243277
double t0,t1,t2,t3;
@@ -252,6 +286,7 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
252286
int n_minus_sil,n_minus_silp1,sinwt,sinwtm1;
253287
double wtl,wtlp1,wtn,wtnm1; /* Mersenne-mod weights stuff */
254288
#endif
289+
#endif // !MULTITHREAD
255290

256291
#ifdef USE_SSE2
257292

@@ -270,20 +305,37 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
270305
double *add0,*add1,*add2,*add3; /* Addresses into array sections */
271306
#endif
272307

308+
#ifndef MULTITHREAD
273309
static int *bjmodn; // Alloc mem for this along with other SIMD stuff
310+
#endif
311+
#ifndef USE_AVX512
274312
const double crnd = 3.0*0x4000000*0x2000000;
313+
#endif
314+
#ifndef USE_AVX
275315
struct complex *ctmp; // Hybrid AVX-DFT/SSE2-carry scheme used for Mersenne-mod needs a 2-word-double pointer
316+
#endif
276317
vec_dbl
277318
#ifndef MULTITHREAD
278319
*va0,*va1,*va2,*va3,*va4, *vb0,*vb1,*vb2,*vb3,*vb4,
320+
#if defined(USE_AVX2) && !defined(USE_AVX512)
279321
*wa0,*wa1,*wa2,*wa3,*wa4, *wb0,*wb1,*wb2,*wb3,*wb4,
322+
#endif
280323
#endif
281-
*tmp,*tm0,*tm1,*tm2; // Non-static utility ptrs
324+
*tmp,*tm2 // Non-static utility ptrs
325+
#ifndef USE_SSE2
326+
,*tm0 // Non-static utility ptrs
327+
#endif
328+
#if !defined(MULTITHREAD) && defined(USE_SSE2)
329+
,*tm1 // Non-static utility ptrs
330+
#endif
331+
;
282332
static vec_dbl *two,
283333
*ycc1,*yss1,*ycc2,*yss2,*yss3, // radiy-5 DFT trig consts
284334
*max_err, *sse2_rnd, *half_arr,
285335
*r00, // Head of RADIX*vec_cmplx-sized local store #1
336+
#if !defined(MULTITHREAD) && defined(USE_SSE2)
286337
*s1p00, // Head of RADIX*vec_cmplx-sized local store #2
338+
#endif
287339
*cy; // Need RADIX/2 slots for sse2 carries, RADIX/4 for avx
288340
#else
289341
static int p0123[4];
@@ -293,7 +345,10 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
293345

294346
static struct cy_thread_data_t *tdat = 0x0;
295347
// Threadpool-based dispatch stuff:
296-
static int main_work_units = 0, pool_work_units = 0;
348+
#if 0//def OS_TYPE_MACOSX
349+
static int main_work_units = 0;
350+
#endif
351+
static int pool_work_units = 0;
297352
static struct threadpool *tpool = 0x0;
298353
static int task_is_blocking = TRUE;
299354
static thread_control_t thread_control = {0,0,0};
@@ -327,8 +382,10 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
327382
ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
328383
}
329384

385+
#ifndef MULTITHREAD
330386
// Init these to get rid of GCC "may be used uninitialized in this function" warnings:
331387
col=co2=co3=-1;
388+
#endif
332389
// Jan 2018: To support PRP-testing, read the LR-modpow-scalar-multiply-needed bit for the current iteration from the global array:
333390
double prp_mult = 1.0;
334391
if((TEST_TYPE & 0xfffffffe) == TEST_TYPE_PRP) { // Mask off low bit to lump together PRP and PRP-C tests
@@ -507,7 +564,10 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
507564
__r0 = sc_ptr;
508565
#endif
509566
tmp = sc_ptr; r00 = tmp; // Head of RADIX*vec_cmplx-sized local store #1
510-
tmp += 0x280; s1p00 = tmp; // Head of RADIX*vec_cmplx-sized local store #2
567+
tmp += 0x280; // Head of RADIX*vec_cmplx-sized local store #2
568+
#if !defined(MULTITHREAD) && defined(USE_SSE2)
569+
s1p00 = tmp; // Head of RADIX*vec_cmplx-sized local store #2
570+
#endif
511571
tmp += 0x280;
512572
two = tmp + 0x0;
513573
// DFT-64 roots all def'd locally in the DFT-64 functions, only need the radix-5 roots here:
@@ -789,23 +849,29 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
789849

790850
#ifdef USE_AVX512
791851
#ifdef CARRY_16_WAY
852+
#ifndef MULTITHREAD
792853
n_minus_sil = (struct uint32x16*)sse_n + 1;
793854
n_minus_silp1 = (struct uint32x16*)sse_n + 2;
794855
sinwt = (struct uint32x16*)sse_n + 3;
795856
sinwtm1 = (struct uint32x16*)sse_n + 4;
857+
#endif
796858
nbytes += 256;
797859
#else
860+
#ifndef MULTITHREAD
798861
n_minus_sil = (struct uint32x8 *)sse_n + 1;
799862
n_minus_silp1 = (struct uint32x8 *)sse_n + 2;
800863
sinwt = (struct uint32x8 *)sse_n + 3;
801864
sinwtm1 = (struct uint32x8 *)sse_n + 4;
865+
#endif
802866
nbytes += 128;
803867
#endif
804868
#elif defined(USE_AVX)
869+
#ifndef MULTITHREAD
805870
n_minus_sil = (struct uint32x4 *)sse_n + 1;
806871
n_minus_silp1 = (struct uint32x4 *)sse_n + 2;
807872
sinwt = (struct uint32x4 *)sse_n + 3;
808873
sinwtm1 = (struct uint32x4 *)sse_n + 4;
874+
#endif
809875
nbytes += 64;
810876
#endif
811877

@@ -817,12 +883,14 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
817883
tmp = tm2; tm2 += cslots_in_local_store;
818884
}
819885

886+
#ifndef MULTITHREAD
820887
// For large radices, array-access to bjmodn means only init base-ptr here:
821888
#ifdef USE_AVX
822889
bjmodn = (int*)(sinwtm1 + RE_IM_STRIDE);
823890
#else
824891
bjmodn = (int*)(sse_n + RE_IM_STRIDE);
825892
#endif
893+
#endif
826894

827895
#endif // USE_SSE2
828896

@@ -2528,18 +2596,21 @@ void radix320_dit_pass1(double a[], int n)
25282596
void*
25292597
cy320_process_chunk(void*targ) // Thread-arg pointer *must* be cast to void and specialized inside the function
25302598
{
2531-
const char func[] = "radix320_ditN_cy_dif1";
25322599
struct cy_thread_data_t* thread_arg = targ; // Move to top because scalar-mode carry pointers taken directly from it
25332600
double *addr;
2601+
#ifdef USE_SSE2
25342602
const int pfetch_dist = PFETCH_DIST;
2603+
#endif
25352604
const int stride = (int)RE_IM_STRIDE << 1; // main-array loop stride = 2*RE_IM_STRIDE
25362605
uint32 p1,p2,p3,p4,p5,p6,p7,p8,p9,pa,pb,pc,pd,pe,pf
2537-
,p10,p20,p30,p40,p50,p60,p70,p80,p90,pa0,pb0,pc0,pd0,pe0,pf0,pg0,ph0,pi0,pj0, nsave = 0;
2606+
,p10,p20,p30,p40,p50,p60,p70,p80,p90,pa0,pb0,pc0,pd0,pe0,pf0,pg0,ph0,pi0,pj0;
25382607
// Shared DIF+DIT:
2608+
#ifndef USE_SSE2
25392609
double rt,it;
25402610
double wt_re,wt_im, wi_re,wi_im; // Fermat-mod/LOACC weights stuff, used in both scalar and SIMD mode
2611+
#endif
25412612
int poff[RADIX>>2]; // Store [RADIX/4] mults of p04 offset for loop control
2542-
int ilo,ihi,idx,jdx,hi_neg,k0,k1,k2,k3,k4,nshift, *iptr;
2613+
int ilo,ihi,hi_neg,k0,k1,k2,k3,k4,nshift;
25432614
int plo[16],phi[20], t_offsets[64];
25442615
uint64 i0,i1,i2,i3;
25452616
// DIF:
@@ -2570,10 +2641,15 @@ void radix320_dit_pass1(double a[], int n)
25702641
0x4576013289bafedcull // [I]
25712642
};
25722643

2573-
int incr,j,j1,j2,jt,jp,k,l,ntmp;
2644+
int j,j1,jt,l;
2645+
#ifndef USE_SSE2
2646+
int j2,jp,ntmp;
2647+
#endif
2648+
#ifdef USE_SSE2
25742649
// incr = Carry-chain wts-multipliers recurrence length, which must divide
25752650
// RADIX/[n-wayness of carry macro], e.g. RADIX/[16|8|4] = 20|40|80 for avx512,avx,sse, respectively:
25762651
// Note: Not able to get away with as large a setting of incr for radix 320 as we did for 160.
2652+
int incr;
25772653
#ifdef USE_AVX512
25782654
const int incr_long = 10,incr_med = 5,incr_short = 5;
25792655
#elif defined(USE_AVX2)
@@ -2596,6 +2672,7 @@ void radix320_dit_pass1(double a[], int n)
25962672
incr = incr_short;
25972673
else
25982674
incr = incr_hiacc;
2675+
#endif // USE_SSE2
25992676

26002677
#ifdef USE_AVX512
26012678
double t0,t1,t2,t3;
@@ -2613,21 +2690,40 @@ void radix320_dit_pass1(double a[], int n)
26132690

26142691
#ifdef USE_SSE2
26152692

2693+
#ifndef USE_AVX512
26162694
const double crnd = 3.0*0x4000000*0x2000000;
2617-
int *itmp,*itm2; // Pointer into the bjmodn array
2695+
#endif
2696+
int *itmp; // Pointer into the bjmodn array
2697+
#if defined(USE_AVX) && !defined(USE_AVX512)
2698+
int *itm2; // Pointer into the bjmodn array
2699+
#endif
2700+
#ifndef USE_AVX
26182701
struct complex *ctmp; // Hybrid AVX-DFT/SSE2-carry scheme used for Mersenne-mod needs a 2-word-double pointer
2702+
#endif
26192703
double *add0,*add1,*add2,*add3;
26202704
int *bjmodn; // Alloc mem for this along with other SIMD stuff
26212705
vec_dbl *tmp,*tm1,*tm2, // Non-static utility ptrs
2622-
*va0,*va1,*va2,*va3,*va4, *vb0,*vb1,*vb2,*vb3,*vb4,
2623-
*wa0,*wa1,*wa2,*wa3,*wa4, *wb0,*wb1,*wb2,*wb3,*wb4;
2624-
vec_dbl *two,
2625-
*ycc1,*yss1,*ycc2,*yss2,*yss3, // radiy-5 DFT trig consts
2626-
*max_err, *sse2_rnd, *half_arr,
2706+
*va0,*va1,*va2,*va3,*va4, *vb0,*vb1,*vb2,*vb3,*vb4
2707+
#if defined(USE_AVX2) && !defined(USE_AVX512)
2708+
,*wa0,*wa1,*wa2,*wa3,*wa4, *wb0,*wb1,*wb2,*wb3,*wb4
2709+
#endif
2710+
;
2711+
vec_dbl
2712+
#if defined(USE_AVX2) && !defined(USE_AVX512)
2713+
*two,
2714+
#endif
2715+
*ycc1,/* *yss1,*ycc2,*yss2,*yss3, */ // radiy-5 DFT trig consts
2716+
*max_err,
2717+
#ifndef USE_AVX512
2718+
*sse2_rnd,
2719+
#endif
2720+
*half_arr,
26272721
*r00, // Head of RADIX*vec_cmplx-sized local store #1
26282722
*s1p00, // Head of RADIX*vec_cmplx-sized local store #2
26292723
*cy; // Need RADIX/2 slots for sse2 carries, RADIX/4 for avx
2724+
#ifndef USE_AVX512
26302725
double dtmp;
2726+
#endif
26312727
uint64 *sign_mask, *sse_bw, *sse_sw, *sse_n;
26322728

26332729
#else
@@ -2650,8 +2746,9 @@ void radix320_dit_pass1(double a[], int n)
26502746
#endif
26512747

26522748
// int data:
2749+
#ifdef USE_SSE2
26532750
int thr_id = thread_arg->tid;
2654-
int iter = thread_arg->iter;
2751+
#endif
26552752
int NDIVR = thread_arg->ndivr;
26562753
int n = NDIVR*RADIX;
26572754
int target_idx = thread_arg->target_idx;
@@ -2669,7 +2766,7 @@ void radix320_dit_pass1(double a[], int n)
26692766

26702767
// double data:
26712768
double maxerr = thread_arg->maxerr;
2672-
double scale = thread_arg->scale; int full_pass = scale < 0.5;
2769+
double scale = thread_arg->scale;
26732770
double prp_mult = thread_arg->prp_mult;
26742771

26752772
// pointer data:
@@ -2881,18 +2978,20 @@ void radix320_dit_pass1(double a[], int n)
28812978
tmp = r00 = thread_arg->r00; // Head of RADIX*vec_cmplx-sized local store #1
28822979
tmp += 0x280; s1p00 = tmp; // Head of RADIX*vec_cmplx-sized local store #2
28832980
tmp += 0x280;
2981+
#if defined(USE_AVX2) && !defined(USE_AVX512)
28842982
two = tmp + 0x0;
2983+
#endif
28852984
// DFT-64 roots all def'd locally in the DFT-64 functions, only need the radix-5 roots here:
28862985
ycc1 = tmp + 0x1; // radix-5 DFT trig consts
2887-
ycc2 = tmp + 0x2;
2888-
yss1 = tmp + 0x3;
2889-
yss2 = tmp + 0x4;
2890-
yss3 = tmp + 0x5;
2986+
//ycc2 = tmp + 0x2;
2987+
//yss1 = tmp + 0x3;
2988+
//yss2 = tmp + 0x4;
2989+
//yss3 = tmp + 0x5;
28912990
tmp += 0x6; // sc_ptr += 0x506; add a pad so the tmp += 2 below gives half_arr = 0x508 + (RADIX/vec_dbl-length)
28922991
#ifdef USE_AVX512
28932992
cy = tmp; tmp += 0x28; // RADIX/8 vec_dbl slots for carry sub-array
28942993
max_err = tmp + 0x00;
2895-
sse2_rnd= tmp + 0x01;
2994+
//sse2_rnd= tmp + 0x01;
28962995
half_arr= tmp + 0x02; // sc_ptr += 0x(508 + 28) = 0x538; This is where the value of half_arr_offset320 comes from
28972996
#elif defined(USE_AVX)
28982997
cy = tmp; tmp += 0x50; // RADIX/4 vec_dbl slots

src/radix320_main_carry_loop.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,15 @@
2323
// This main loop is same for un-and-multithreaded, so stick into a header file
2424
// (can't use a macro because of the #if-enclosed stuff).
2525

26-
for(k=1; k <= khi; k++) /* Do n/(radix(1)*nwt) outer loop executions... */
26+
for(int k=1; k <= khi; k++) /* Do n/(radix(1)*nwt) outer loop executions... */
2727
{
2828
for(j = jstart; j < jhi; j += stride)
2929
{
3030
j1 = j;
3131
j1 = j1 + ( (j1 >> DAT_BITS) << PAD_BITS ); /* padded-array fetch index is here */
32+
#ifndef USE_SSE2
3233
j2 = j1 + RE_IM_STRIDE;
34+
#endif
3335

3436
/*...The radix-320 DIT pass is here: */
3537
#ifdef USE_SSE2
@@ -237,7 +239,10 @@ for(k=1; k <= khi; k++) /* Do n/(radix(1)*nwt) outer loop executions... */
237239
#endif
238240
i = (!j); // Need this to force 0-wod to be bigword
239241
addr = &prp_mult;
240-
tmp = s1p00; tm1 = cy; tm2 = cy+1; itmp = bjmodn; itm2 = bjmodn+4; // tm2,itm2 not used in AVX-512 mode
242+
tmp = s1p00; tm1 = cy; itmp = bjmodn;
243+
#ifndef USE_AVX512
244+
tm2 = cy+1; itm2 = bjmodn+4; // tm2,itm2 not used in AVX-512 mode
245+
#endif
241246
for(loop = 0; loop < nloop; loop += incr)
242247
{
243248
co2 = co2save; // Need this for all wts-inits beynd the initial set, due to the co2 = co3 preceding the (j+2) data
@@ -626,4 +631,4 @@ for(k=1; k <= khi; k++) /* Do n/(radix(1)*nwt) outer loop executions... */
626631
col += RADIX;
627632
co3 -= RADIX;
628633
}
629-
} /* end for(k=1; k <= khi; k++) */
634+
} /* end for(int k=1; k <= khi; k++) */

0 commit comments

Comments
 (0)