@@ -253,25 +253,27 @@ __global__ FINUFFT_FLATTEN void interp_subprob(
253253 const auto subpidx = blockIdx.x ;
254254 const auto bidx = loadReadOnly (p.subprob_to_bin + subpidx);
255255 const auto binsubp_idx = subpidx - loadReadOnly (p.subprobstartpts + bidx);
256- const auto ptstart = loadReadOnly (p.binstartpts + bidx) + binsubp_idx * p.opts .gpu_maxsubprobsize ;
257- const auto nupts = min (p.opts .gpu_maxsubprobsize ,
258- loadReadOnly (p.binsize + bidx) - binsubp_idx * p.opts .gpu_maxsubprobsize );
256+ const auto ptstart =
257+ loadReadOnly (p.binstartpts + bidx) + binsubp_idx * p.opts .gpu_maxsubprobsize ;
258+ const auto nupts =
259+ min (p.opts .gpu_maxsubprobsize ,
260+ loadReadOnly (p.binsize + bidx) - binsubp_idx * p.opts .gpu_maxsubprobsize );
259261
260262 auto offset = compute_offset<ndim>(bidx, nbins, binsizes);
261263
262264 constexpr auto ns_2 = (ns + 1 ) / 2 ;
263265 constexpr auto rounded_ns = ns_2 * 2 ;
264266
265- shared_mem_copy_helper<T, ndim, ns>(binsizes, offset, p. nf123 ,
266- [fw, fwshared](int idx_shared, int idx_global) {
267- fwshared[idx_shared] = loadReadOnly (fw + idx_global);
268- });
267+ shared_mem_copy_helper<T, ndim, ns>(
268+ binsizes, offset, p. nf123 , [fw, fwshared](int idx_shared, int idx_global) {
269+ fwshared[idx_shared] = loadReadOnly (fw + idx_global);
270+ });
269271 __syncthreads ();
270272
271273 for (int i = threadIdx.x ; i < nupts; i += blockDim.x ) {
272- const int idx = ptstart + i;
274+ const int idx = ptstart + i;
273275 const auto nuptsidx = loadReadOnly (p.idxnupts + idx);
274- auto [ker, start] = get_kerval_and_local_start<T, KEREVALMETH, ndim, ns>(
276+ auto [ker, start] = get_kerval_and_local_start<T, KEREVALMETH, ndim, ns>(
275277 nuptsidx, p.xyz , p.nf123 , offset, sigma, es_c, es_beta);
276278
277279 cuda_complex<T> cnow{0 , 0 };
@@ -488,9 +490,11 @@ __global__ FINUFFT_FLATTEN void spread_subprob(
488490 const auto subpidx = blockIdx.x ;
489491 const auto bidx = loadReadOnly (p.subprob_to_bin + subpidx);
490492 const auto binsubp_idx = subpidx - loadReadOnly (p.subprobstartpts + bidx);
491- const auto ptstart = loadReadOnly (p.binstartpts + bidx) + binsubp_idx * p.opts .gpu_maxsubprobsize ;
492- const auto nupts = min (p.opts .gpu_maxsubprobsize ,
493- loadReadOnly (p.binsize + bidx) - binsubp_idx * p.opts .gpu_maxsubprobsize );
493+ const auto ptstart =
494+ loadReadOnly (p.binstartpts + bidx) + binsubp_idx * p.opts .gpu_maxsubprobsize ;
495+ const auto nupts =
496+ min (p.opts .gpu_maxsubprobsize ,
497+ loadReadOnly (p.binsize + bidx) - binsubp_idx * p.opts .gpu_maxsubprobsize );
494498
495499 auto offset = compute_offset<ndim>(bidx, nbins, binsizes);
496500
@@ -507,9 +511,9 @@ __global__ FINUFFT_FLATTEN void spread_subprob(
507511 __syncthreads ();
508512
509513 for (int i = threadIdx.x ; i < nupts; i += blockDim.x ) {
510- const int idx = ptstart + i;
514+ const int idx = ptstart + i;
511515 const auto nuptsidx = loadReadOnly (p.idxnupts + idx);
512- auto [ker, start] = get_kerval_and_local_start<T, KEREVALMETH, ndim, ns>(
516+ auto [ker, start] = get_kerval_and_local_start<T, KEREVALMETH, ndim, ns>(
513517 nuptsidx, p.xyz , p.nf123 , offset, sigma, es_c, es_beta);
514518
515519 const auto cnow = loadReadOnly (c + nuptsidx);
@@ -677,9 +681,11 @@ __global__ FINUFFT_FLATTEN void spread_output_driven(
677681
678682 const int bidx = loadReadOnly (p.subprob_to_bin + blockIdx.x );
679683 const int binsubp_idx = blockIdx.x - loadReadOnly (p.subprobstartpts + bidx);
680- const int ptstart = loadReadOnly (p.binstartpts + bidx) + binsubp_idx * p.opts .gpu_maxsubprobsize ;
681- const int nupts = min (p.opts .gpu_maxsubprobsize ,
682- loadReadOnly (p.binsize + bidx) - binsubp_idx * p.opts .gpu_maxsubprobsize );
684+ const int ptstart =
685+ loadReadOnly (p.binstartpts + bidx) + binsubp_idx * p.opts .gpu_maxsubprobsize ;
686+ const int nupts =
687+ min (p.opts .gpu_maxsubprobsize ,
688+ loadReadOnly (p.binsize + bidx) - binsubp_idx * p.opts .gpu_maxsubprobsize );
683689
684690 auto offset = compute_offset<ndim>(bidx, nbins, binsizes);
685691
0 commit comments