@@ -43,7 +43,7 @@ __device__ auto get_kerval_and_local_start(
4343 cuda::std::array<cuda::std::array<T, ns>, ndim> ker;
4444 cuda::std::array<int , ndim> start;
4545 for (int idim = 0 ; idim < ndim; ++idim) {
46- const auto rescaled = fold_rescale (xyz[idim][ idx] , nf[idim]);
46+ const auto rescaled = fold_rescale (loadReadOnly ( xyz[idim] + idx) , nf[idim]);
4747 const auto s = int (std::ceil (rescaled - ns_2f));
4848 if constexpr (KEREVALMETH == 1 ) {
4949 eval_kernel_vec_horner<T, ns>(&ker[idim][0 ], T (s) - rescaled, sigma);
@@ -95,7 +95,7 @@ __device__ int compute_bin_index(
9595 int binidx = 0 ;
9696 int stride = 1 ;
9797 for (int idim = 0 ; idim < ndim; ++idim) {
98- const T rescaled = fold_rescale (xyz[idim][ idx] , nf[idim]);
98+ const T rescaled = fold_rescale (loadReadOnly ( xyz[idim] + idx) , nf[idim]);
9999 int bin = floor (rescaled / binsizes[idim]);
100100 bin = bin >= nbins[idim] ? bin - 1 : bin;
101101 bin = bin < 0 ? 0 : bin;
@@ -140,9 +140,10 @@ __device__ int output_index_from_flat_local_index(
140140
141141template <typename T, int KEREVALMETH, int ndim, int ns>
142142__global__ FINUFFT_FLATTEN void interp_nupts_driven (
143- const cuda::std::array<const T *, 3 > xyz, cuda_complex<T> *c,
144- const cuda_complex<T> *fw, int M, const cuda::std::array<int , 3 > nf, T es_c,
145- T es_beta, T sigma, const int *idxnupts) {
143+ const cuda::std::array<const T *, 3 > xyz, cuda_complex<T> *FINUFFT_RESTRICT c,
144+ const cuda_complex<T> *FINUFFT_RESTRICT fw, const int M,
145+ const cuda::std::array<int , 3 > nf, const T es_c, const T es_beta, const T sigma,
146+ const int *FINUFFT_RESTRICT idxnupts) {
146147
147148 for (int i = blockDim.x * blockIdx.x + threadIdx.x ; i < M;
148149 i += blockDim.x * gridDim.x ) {
@@ -154,15 +155,15 @@ __global__ FINUFFT_FLATTEN void interp_nupts_driven(
154155 cuda_complex<T> cnow{0 , 0 };
155156 if constexpr (ndim == 1 ) {
156157 for (int x0 = 0 , ix = start[0 ]; x0 < ns; ++x0, ix = (ix + 1 >= nf[0 ]) ? 0 : ix + 1 )
157- cnow += fw[ix] * ker[0 ][x0];
158+ cnow += loadReadOnly (fw + ix) * ker[0 ][x0];
158159 } else if constexpr (ndim == 2 ) {
159160 for (int y0 = 0 , iy = start[1 ]; y0 < ns;
160161 ++y0, iy = (iy + 1 >= nf[1 ]) ? 0 : iy + 1 ) {
161162 const auto inidx0 = iy * nf[0 ];
162163 cuda_complex<T> cnowx{0 , 0 };
163164 for (int x0 = 0 , ix = start[0 ]; x0 < ns;
164165 ++x0, ix = (ix + 1 >= nf[0 ]) ? 0 : ix + 1 )
165- cnowx += fw[ inidx0 + ix] * ker[0 ][x0];
166+ cnowx += loadReadOnly (fw + inidx0 + ix) * ker[0 ][x0];
166167 cnow += cnowx * ker[1 ][y0];
167168 }
168169 } else {
@@ -177,13 +178,14 @@ __global__ FINUFFT_FLATTEN void interp_nupts_driven(
177178 ++y0, iy = (iy + 1 >= nf[1 ]) ? 0 : iy + 1 ) {
178179 const auto inidx1 = inidx0 + iy * nf[0 ];
179180 cuda_complex<T> cnowx{0 , 0 };
180- for (int x0 = 0 ; x0 < ns; ++x0) cnowx += fw[inidx1 + xidx[x0]] * ker[0 ][x0];
181+ for (int x0 = 0 ; x0 < ns; ++x0)
182+ cnowx += loadReadOnly (fw + inidx1 + xidx[x0]) * ker[0 ][x0];
181183 cnowy += cnowx * ker[1 ][y0];
182184 }
183185 cnow += cnowy * ker[2 ][z0];
184186 }
185187 }
186- c[idxnupts[i]] = cnow;
188+ storeCacheStreaming (c + nuptsidx, cnow) ;
187189 }
188190}
189191
@@ -241,35 +243,40 @@ __device__ void shared_mem_copy_helper(cuda::std::array<int, 3> binsizes,
241243
242244template <typename T, int KEREVALMETH, int ndim, int ns>
243245__global__ FINUFFT_FLATTEN void interp_subprob (
244- cuda::std::array<const T *, 3 > xyz, cuda_complex<T> *c, const cuda_complex<T> *fw,
245- int M, cuda::std::array<int , 3 > nf, T es_c, T es_beta, T sigma,
246- const int *binstartpts, const int *bin_size, cuda::std::array<int , 3 > binsizes,
247- const int *subprob_to_bin, const int *subprobstartpts, const int *numsubprob,
248- int maxsubprobsize, cuda::std::array<int , 3 > nbins, const int *idxnupts) {
246+ const cuda::std::array<const T *, 3 > xyz, cuda_complex<T> *FINUFFT_RESTRICT c,
247+ const cuda_complex<T> *FINUFFT_RESTRICT fw, const int M,
248+ const cuda::std::array<int , 3 > nf, const T es_c, const T es_beta, const T sigma,
249+ const int *FINUFFT_RESTRICT binstartpts, const int *FINUFFT_RESTRICT bin_size,
250+ const cuda::std::array<int , 3 > binsizes, const int *FINUFFT_RESTRICT subprob_to_bin,
251+ const int *FINUFFT_RESTRICT subprobstartpts, const int *FINUFFT_RESTRICT numsubprob,
252+ const int maxsubprobsize, const cuda::std::array<int , 3 > nbins,
253+ const int *FINUFFT_RESTRICT idxnupts) {
249254 extern __shared__ char sharedbuf[];
250255 auto fwshared = (cuda_complex<T> *)sharedbuf;
251256
252257 const auto subpidx = blockIdx.x ;
253- const auto bidx = subprob_to_bin[subpidx];
254- const auto binsubp_idx = subpidx - subprobstartpts[bidx];
255- const auto ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize;
256- const auto nupts = min (maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize);
258+ const auto bidx = loadReadOnly (subprob_to_bin + subpidx);
259+ const auto binsubp_idx = subpidx - loadReadOnly (subprobstartpts + bidx);
260+ const auto ptstart = loadReadOnly (binstartpts + bidx) + binsubp_idx * maxsubprobsize;
261+ const auto nupts =
262+ min (maxsubprobsize, loadReadOnly (bin_size + bidx) - binsubp_idx * maxsubprobsize);
257263
258264 auto offset = compute_offset<ndim>(bidx, nbins, binsizes);
259265
260266 constexpr auto ns_2 = (ns + 1 ) / 2 ;
261267 constexpr auto rounded_ns = ns_2 * 2 ;
262268
263- shared_mem_copy_helper<T, ndim, ns>(binsizes, offset, nf,
264- [fw, fwshared](int idx_shared, int idx_global) {
265- fwshared[idx_shared] = fw[ idx_global] ;
266- });
269+ shared_mem_copy_helper<T, ndim, ns>(
270+ binsizes, offset, nf, [fw, fwshared](int idx_shared, int idx_global) {
271+ fwshared[idx_shared] = loadReadOnly (fw + idx_global) ;
272+ });
267273 __syncthreads ();
268274
269275 for (int i = threadIdx.x ; i < nupts; i += blockDim.x ) {
270- const int idx = ptstart + i;
271- auto [ker, start] = get_kerval_and_local_start<T, KEREVALMETH, ndim, ns>(
272- idxnupts[idx], xyz, nf, offset, sigma, es_c, es_beta);
276+ const int idx = ptstart + i;
277+ const auto nuptsidx = loadReadOnly (idxnupts + idx);
278+ auto [ker, start] = get_kerval_and_local_start<T, KEREVALMETH, ndim, ns>(
279+ nuptsidx, xyz, nf, offset, sigma, es_c, es_beta);
273280
274281 cuda_complex<T> cnow{0 , 0 };
275282 if constexpr (ndim == 1 ) {
@@ -307,7 +314,7 @@ __global__ FINUFFT_FLATTEN void interp_subprob(
307314 cnow += cnowz * ker[2 ][zz];
308315 }
309316 }
310- c[idxnupts[idx]] = cnow;
317+ storeCacheStreaming (c + nuptsidx, cnow) ;
311318 }
312319}
313320
@@ -351,16 +358,17 @@ void cuinterp_subprob(const cufinufft_plan_t<T> &d_plan, int blksize) {
351358
352359template <typename T, int KEREVALMETH, int ndim, int ns>
353360__global__ FINUFFT_FLATTEN void spread_nupts_driven (
354- cuda::std::array<const T *, 3 > xyz, const cuda_complex<T> *c, cuda_complex<T> *fw,
355- int M, cuda::std::array<int , 3 > nf, T es_c, T es_beta, T sigma, const int *idxnupts) {
361+ const cuda::std::array<const T *, 3 > xyz, const cuda_complex<T> *FINUFFT_RESTRICT c,
362+ cuda_complex<T> *FINUFFT_RESTRICT fw, const int M, const cuda::std::array<int , 3 > nf,
363+ const T es_c, const T es_beta, const T sigma, const int *FINUFFT_RESTRICT idxnupts) {
356364
357365 for (int i = blockDim.x * blockIdx.x + threadIdx.x ; i < M;
358366 i += blockDim.x * gridDim.x ) {
359367 const auto nuptsidx = loadReadOnly (idxnupts + i);
360368 auto [ker, start] = get_kerval_and_startpos_nuptsdriven<T, KEREVALMETH, ndim, ns>(
361369 nuptsidx, xyz, nf, sigma, es_c, es_beta);
362370
363- cuda_complex<T> val = c[idxnupts[i]] ;
371+ const auto val = loadReadOnly (c + nuptsidx) ;
364372 if constexpr (ndim == 1 ) {
365373 for (int x0 = 0 , ix = start[0 ]; x0 < ns; ++x0, ix = (ix + 1 >= nf[0 ]) ? 0 : ix + 1 )
366374 atomicAddComplexGlobal<T>(fw + ix, ker[0 ][x0] * val);
@@ -418,26 +426,29 @@ void cuspread_nupts_driven(const cufinufft_plan_t<T> &d_plan, int blksize) {
418426// FIXME unify the next two functions and templatize on a lambda?
419427template <typename T, int ndim>
420428__global__ FINUFFT_FLATTEN void calc_bin_size_noghost (
421- int M, cuda::std::array<int , 3 > nf, cuda::std::array<int , 3 > binsizes,
422- cuda::std::array<int , 3 > nbins, int *bin_size, cuda::std::array<const T *, 3 > xyz,
423- int *sortidx) {
429+ const int M, const cuda::std::array<int , 3 > nf,
430+ const cuda::std::array<int , 3 > binsizes, const cuda::std::array<int , 3 > nbins,
431+ int *FINUFFT_RESTRICT bin_size, const cuda::std::array<const T *, 3 > xyz,
432+ int *FINUFFT_RESTRICT sortidx) {
424433 for (int i = threadIdx.x + blockIdx.x * blockDim.x ; i < M;
425434 i += gridDim.x * blockDim.x ) {
426435 const int binidx = compute_bin_index<ndim>(i, nf, binsizes, nbins, xyz);
427- int oldidx = atomicAdd (&bin_size[binidx], 1 );
428- sortidx[i] = oldidx;
436+ const int oldidx = atomicAdd (&bin_size[binidx], 1 );
437+ storeCacheStreaming ( sortidx + i, oldidx) ;
429438 }
430439}
431440
432441template <typename T, int ndim>
433442__global__ FINUFFT_FLATTEN void calc_inverse_of_global_sort_idx (
434- int M, cuda::std::array<int , 3 > binsizes, cuda::std::array<int , 3 > nbins,
435- const int *bin_startpts, const int *sortidx, cuda::std::array<const T *, 3 > xyz,
436- int *index, cuda::std::array<int , 3 > nf) {
443+ const int M, const cuda::std::array<int , 3 > binsizes,
444+ const cuda::std::array<int , 3 > nbins, const int *FINUFFT_RESTRICT bin_startpts,
445+ const int *FINUFFT_RESTRICT sortidx, const cuda::std::array<const T *, 3 > xyz,
446+ int *FINUFFT_RESTRICT index, const cuda::std::array<int , 3 > nf) {
437447 for (int i = threadIdx.x + blockIdx.x * blockDim.x ; i < M;
438448 i += gridDim.x * blockDim.x ) {
439449 const int binidx = compute_bin_index<ndim>(i, nf, binsizes, nbins, xyz);
440- index[bin_startpts[binidx] + sortidx[i]] = i;
450+ storeCacheStreaming (
451+ index + loadReadOnly (bin_startpts + binidx) + loadReadOnly (sortidx + i), i);
441452 }
442453}
443454
@@ -477,19 +488,23 @@ void cuspread_nuptsdriven_prop(cufinufft_plan_t<T> &d_plan) {
477488
478489template <typename T, int KEREVALMETH, int ndim, int ns>
479490__global__ FINUFFT_FLATTEN void spread_subprob (
480- cuda::std::array<const T *, 3 > xyz, const cuda_complex<T> *c, cuda_complex<T> *fw,
481- int M, cuda::std::array<int , 3 > nf, T sigma, T es_c, T es_beta,
482- const int *binstartpts, const int *bin_size, cuda::std::array<int , 3 > binsizes,
483- const int *subprob_to_bin, const int *subprobstartpts, const int *numsubprob,
484- int maxsubprobsize, cuda::std::array<int , 3 > nbins, const int *idxnupts) {
491+ const cuda::std::array<const T *, 3 > xyz, const cuda_complex<T> *FINUFFT_RESTRICT c,
492+ cuda_complex<T> *FINUFFT_RESTRICT fw, const int M, const cuda::std::array<int , 3 > nf,
493+ const T sigma, const T es_c, const T es_beta, const int *FINUFFT_RESTRICT binstartpts,
494+ const int *FINUFFT_RESTRICT bin_size, const cuda::std::array<int , 3 > binsizes,
495+ const int *FINUFFT_RESTRICT subprob_to_bin,
496+ const int *FINUFFT_RESTRICT subprobstartpts, const int *FINUFFT_RESTRICT numsubprob,
497+ const int maxsubprobsize, const cuda::std::array<int , 3 > nbins,
498+ const int *FINUFFT_RESTRICT idxnupts) {
485499 extern __shared__ char sharedbuf[];
486500 auto fwshared = (cuda_complex<T> *)sharedbuf;
487501
488502 const auto subpidx = blockIdx.x ;
489- const auto bidx = subprob_to_bin[subpidx];
490- const auto binsubp_idx = subpidx - subprobstartpts[bidx];
491- const auto ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize;
492- const auto nupts = min (maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize);
503+ const auto bidx = loadReadOnly (subprob_to_bin + subpidx);
504+ const auto binsubp_idx = subpidx - loadReadOnly (subprobstartpts + bidx);
505+ const auto ptstart = loadReadOnly (binstartpts + bidx) + binsubp_idx * maxsubprobsize;
506+ const auto nupts =
507+ min (maxsubprobsize, loadReadOnly (bin_size + bidx) - binsubp_idx * maxsubprobsize);
493508
494509 auto offset = compute_offset<ndim>(bidx, nbins, binsizes);
495510
@@ -506,11 +521,12 @@ __global__ FINUFFT_FLATTEN void spread_subprob(
506521 __syncthreads ();
507522
508523 for (int i = threadIdx.x ; i < nupts; i += blockDim.x ) {
509- const int idx = ptstart + i;
510- auto [ker, start] = get_kerval_and_local_start<T, KEREVALMETH, ndim, ns>(
511- idxnupts[idx], xyz, nf, offset, sigma, es_c, es_beta);
524+ const int idx = ptstart + i;
525+ const auto nuptsidx = loadReadOnly (idxnupts + idx);
526+ auto [ker, start] = get_kerval_and_local_start<T, KEREVALMETH, ndim, ns>(
527+ nuptsidx, xyz, nf, offset, sigma, es_c, es_beta);
512528
513- const auto cnow = c[idxnupts[idx]] ;
529+ const auto cnow = loadReadOnly (c + nuptsidx) ;
514530 if constexpr (ndim == 1 ) {
515531 const auto ofs = start[0 ] + ns_2;
516532 for (int xx = 0 ; xx < ns; ++xx) {
@@ -587,20 +603,21 @@ static void cuspread_subprob(const cufinufft_plan_t<T> &d_plan, int blksize) {
587603 launch (spread_subprob<T, 0 , ndim, ns>);
588604}
589605
590- static __global__ void calc_subprob (const int *bin_size, int *num_subprob,
591- int maxsubprobsize, int numbins) {
606+ static __global__ void calc_subprob (const int *FINUFFT_RESTRICT bin_size,
607+ int *FINUFFT_RESTRICT num_subprob,
608+ const int maxsubprobsize, const int numbins) {
592609 for (int i = threadIdx.x + blockIdx.x * blockDim.x ; i < numbins;
593610 i += gridDim.x * blockDim.x ) {
594- num_subprob[i] = (bin_size[i] + maxsubprobsize - 1 ) / maxsubprobsize;
611+ num_subprob[i] = (loadReadOnly ( bin_size + i) + maxsubprobsize - 1 ) / maxsubprobsize;
595612 }
596613}
597- static __global__ void map_b_into_subprob (int *d_subprob_to_bin,
598- const int *d_subprobstartpts,
599- const int *d_numsubprob, int numbins) {
614+ static __global__ void map_b_into_subprob (
615+ int *FINUFFT_RESTRICT d_subprob_to_bin, const int *FINUFFT_RESTRICT d_subprobstartpts,
616+ const int *FINUFFT_RESTRICT d_numsubprob, const int numbins) {
600617 for (int i = threadIdx.x + blockIdx.x * blockDim.x ; i < numbins;
601618 i += gridDim.x * blockDim.x ) {
602- for (int j = 0 ; j < d_numsubprob[i] ; j++) {
603- d_subprob_to_bin[d_subprobstartpts[i] + j] = i;
619+ for (int j = 0 ; j < loadReadOnly ( d_numsubprob + i) ; j++) {
620+ d_subprob_to_bin[loadReadOnly ( d_subprobstartpts + i) + j] = i;
604621 }
605622 }
606623}
@@ -662,11 +679,14 @@ static void cuspread_subprob_prop(cufinufft_plan_t<T> &d_plan) {
662679
663680template <typename T, int KEREVALMETH, int ndim, int ns>
664681__global__ FINUFFT_FLATTEN void spread_output_driven (
665- cuda::std::array<const T *, 3 > xyz, const cuda_complex<T> *c, cuda_complex<T> *fw,
666- int M, cuda::std::array<int , 3 > nf, T sigma, T es_c, T es_beta,
667- const int *binstartpts, const int *bin_size, cuda::std::array<int , 3 > binsizes,
668- const int *subprob_to_bin, const int *subprobstartpts, const int *numsubprob,
669- int maxsubprobsize, cuda::std::array<int , 3 > nbins, const int *idxnupts, int np) {
682+ const cuda::std::array<const T *, 3 > xyz, const cuda_complex<T> *FINUFFT_RESTRICT c,
683+ cuda_complex<T> *FINUFFT_RESTRICT fw, const int M, const cuda::std::array<int , 3 > nf,
684+ const T sigma, const T es_c, const T es_beta, const int *FINUFFT_RESTRICT binstartpts,
685+ const int *FINUFFT_RESTRICT bin_size, const cuda::std::array<int , 3 > binsizes,
686+ const int *FINUFFT_RESTRICT subprob_to_bin,
687+ const int *FINUFFT_RESTRICT subprobstartpts, const int *FINUFFT_RESTRICT numsubprob,
688+ const int maxsubprobsize, const cuda::std::array<int , 3 > nbins,
689+ const int *FINUFFT_RESTRICT idxnupts, const int np) {
670690 extern __shared__ char sharedbuf[];
671691
672692 static constexpr auto ns_2 = (ns + 1 ) / 2 ;
@@ -675,10 +695,11 @@ __global__ FINUFFT_FLATTEN void spread_output_driven(
675695
676696 auto [padded_size, local_subgrid_size] = get_padded_subgrid_info<ndim, ns>(binsizes);
677697
678- const int bidx = subprob_to_bin[blockIdx.x ];
679- const int binsubp_idx = blockIdx.x - subprobstartpts[bidx];
680- const int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize;
681- const int nupts = min (maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize);
698+ const int bidx = loadReadOnly (subprob_to_bin + blockIdx.x );
699+ const int binsubp_idx = blockIdx.x - loadReadOnly (subprobstartpts + bidx);
700+ const int ptstart = loadReadOnly (binstartpts + bidx) + binsubp_idx * maxsubprobsize;
701+ const int nupts =
702+ min (maxsubprobsize, loadReadOnly (bin_size + bidx) - binsubp_idx * maxsubprobsize);
682703
683704 auto offset = compute_offset<ndim>(bidx, nbins, binsizes);
684705
@@ -705,7 +726,7 @@ __global__ FINUFFT_FLATTEN void spread_output_driven(
705726 const auto batch_size = min (np, nupts - batch_begin);
706727 for (int i = threadIdx.x ; i < batch_size; i += blockDim.x ) {
707728 const int nuptsidx = loadReadOnly (idxnupts + ptstart + i + batch_begin);
708- nupts_sm[i] = c[ nuptsidx] ;
729+ nupts_sm[i] = loadReadOnly (c + nuptsidx) ;
709730 auto [ker, local_shift] = get_kerval_and_local_start<T, KEREVALMETH, ndim, ns>(
710731 nuptsidx, xyz, nf, offset, sigma, es_c, es_beta);
711732 kerevals[i] = ker;
0 commit comments