Commit 0d58ca5

Using intrinsics to improve performance where possible
1 parent 84670f0 commit 0d58ca5

File tree

1 file changed: +90 −69 lines changed

include/cufinufft/common_kernels.hpp

Lines changed: 90 additions & 69 deletions
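
The diff routes read-only global loads through a loadReadOnly helper and write-once global stores through a storeCacheStreaming helper. Neither is defined in this file; a minimal sketch of what such wrappers typically look like, assuming they map onto the standard CUDA cache-hint intrinsics __ldg and __stcs (which are defined only for built-in scalar and vector types such as int, float, double, float2, and double2):

// Hypothetical definitions for illustration only -- the real cufinufft
// helpers may differ.
template<typename T>
__device__ __forceinline__ T loadReadOnly(const T *ptr) {
  // __ldg loads through the read-only data cache; safe because the data is
  // never written during the kernel's lifetime.
  return __ldg(ptr);
}

template<typename T>
__device__ __forceinline__ void storeCacheStreaming(T *ptr, T value) {
  // __stcs issues a streaming (evict-first) store, so values written once
  // and never re-read do not displace useful cache lines.
  __stcs(ptr, value);
}

Both hints fit spreading and interpolation kernels well: coordinates, index arrays, and (during interpolation) the fw grid are pure inputs, while per-point outputs are written exactly once.
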
@@ -43,7 +43,7 @@ __device__ auto get_kerval_and_local_start(
   cuda::std::array<cuda::std::array<T, ns>, ndim> ker;
   cuda::std::array<int, ndim> start;
   for (int idim = 0; idim < ndim; ++idim) {
-    const auto rescaled = fold_rescale(xyz[idim][idx], nf[idim]);
+    const auto rescaled = fold_rescale(loadReadOnly(xyz[idim] + idx), nf[idim]);
     const auto s = int(std::ceil(rescaled - ns_2f));
     if constexpr (KEREVALMETH == 1) {
       eval_kernel_vec_horner<T, ns>(&ker[idim][0], T(s) - rescaled, sigma);
@@ -95,7 +95,7 @@ __device__ int compute_bin_index(
   int binidx = 0;
   int stride = 1;
   for (int idim = 0; idim < ndim; ++idim) {
-    const T rescaled = fold_rescale(xyz[idim][idx], nf[idim]);
+    const T rescaled = fold_rescale(loadReadOnly(xyz[idim] + idx), nf[idim]);
     int bin = floor(rescaled / binsizes[idim]);
     bin = bin >= nbins[idim] ? bin - 1 : bin;
     bin = bin < 0 ? 0 : bin;
@@ -140,9 +140,10 @@ __device__ int output_index_from_flat_local_index(
 
 template<typename T, int KEREVALMETH, int ndim, int ns>
 __global__ FINUFFT_FLATTEN void interp_nupts_driven(
-    const cuda::std::array<const T *, 3> xyz, cuda_complex<T> *c,
-    const cuda_complex<T> *fw, int M, const cuda::std::array<int, 3> nf, T es_c,
-    T es_beta, T sigma, const int *idxnupts) {
+    const cuda::std::array<const T *, 3> xyz, cuda_complex<T> *FINUFFT_RESTRICT c,
+    const cuda_complex<T> *FINUFFT_RESTRICT fw, const int M,
+    const cuda::std::array<int, 3> nf, const T es_c, const T es_beta, const T sigma,
+    const int *FINUFFT_RESTRICT idxnupts) {
 
   for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M;
        i += blockDim.x * gridDim.x) {
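
The new signature also marks every pointer parameter with FINUFFT_RESTRICT and makes scalar parameters const; the same treatment is applied to every kernel below. The macro's definition lives elsewhere in the finufft headers; a plausible sketch, assuming it simply spells the non-standard restrict qualifier portably:

// Assumed definition, for illustration only -- check finufft's own headers.
#if defined(__CUDACC__) || defined(__GNUC__)
#define FINUFFT_RESTRICT __restrict__
#else
#define FINUFFT_RESTRICT
#endif

Promising the compiler that c, fw, and idxnupts never alias lets nvcc keep loaded values in registers across stores and reorder memory operations more freely than the default aliasing rules allow.
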
@@ -154,15 +155,15 @@ __global__ FINUFFT_FLATTEN void interp_nupts_driven(
     cuda_complex<T> cnow{0, 0};
     if constexpr (ndim == 1) {
       for (int x0 = 0, ix = start[0]; x0 < ns; ++x0, ix = (ix + 1 >= nf[0]) ? 0 : ix + 1)
-        cnow += fw[ix] * ker[0][x0];
+        cnow += loadReadOnly(fw + ix) * ker[0][x0];
     } else if constexpr (ndim == 2) {
       for (int y0 = 0, iy = start[1]; y0 < ns;
            ++y0, iy = (iy + 1 >= nf[1]) ? 0 : iy + 1) {
         const auto inidx0 = iy * nf[0];
         cuda_complex<T> cnowx{0, 0};
         for (int x0 = 0, ix = start[0]; x0 < ns;
              ++x0, ix = (ix + 1 >= nf[0]) ? 0 : ix + 1)
-          cnowx += fw[inidx0 + ix] * ker[0][x0];
+          cnowx += loadReadOnly(fw + inidx0 + ix) * ker[0][x0];
         cnow += cnowx * ker[1][y0];
       }
     } else {
@@ -177,13 +178,14 @@ __global__ FINUFFT_FLATTEN void interp_nupts_driven(
              ++y0, iy = (iy + 1 >= nf[1]) ? 0 : iy + 1) {
           const auto inidx1 = inidx0 + iy * nf[0];
           cuda_complex<T> cnowx{0, 0};
-          for (int x0 = 0; x0 < ns; ++x0) cnowx += fw[inidx1 + xidx[x0]] * ker[0][x0];
+          for (int x0 = 0; x0 < ns; ++x0)
+            cnowx += loadReadOnly(fw + inidx1 + xidx[x0]) * ker[0][x0];
           cnowy += cnowx * ker[1][y0];
         }
         cnow += cnowy * ker[2][z0];
       }
     }
-    c[idxnupts[i]] = cnow;
+    storeCacheStreaming(c + nuptsidx, cnow);
   }
 }
 
@@ -241,35 +243,40 @@ __device__ void shared_mem_copy_helper(cuda::std::array<int, 3> binsizes,
 
 template<typename T, int KEREVALMETH, int ndim, int ns>
 __global__ FINUFFT_FLATTEN void interp_subprob(
-    cuda::std::array<const T *, 3> xyz, cuda_complex<T> *c, const cuda_complex<T> *fw,
-    int M, cuda::std::array<int, 3> nf, T es_c, T es_beta, T sigma,
-    const int *binstartpts, const int *bin_size, cuda::std::array<int, 3> binsizes,
-    const int *subprob_to_bin, const int *subprobstartpts, const int *numsubprob,
-    int maxsubprobsize, cuda::std::array<int, 3> nbins, const int *idxnupts) {
+    const cuda::std::array<const T *, 3> xyz, cuda_complex<T> *FINUFFT_RESTRICT c,
+    const cuda_complex<T> *FINUFFT_RESTRICT fw, const int M,
+    const cuda::std::array<int, 3> nf, const T es_c, const T es_beta, const T sigma,
+    const int *FINUFFT_RESTRICT binstartpts, const int *FINUFFT_RESTRICT bin_size,
+    const cuda::std::array<int, 3> binsizes, const int *FINUFFT_RESTRICT subprob_to_bin,
+    const int *FINUFFT_RESTRICT subprobstartpts, const int *FINUFFT_RESTRICT numsubprob,
+    const int maxsubprobsize, const cuda::std::array<int, 3> nbins,
+    const int *FINUFFT_RESTRICT idxnupts) {
   extern __shared__ char sharedbuf[];
   auto fwshared = (cuda_complex<T> *)sharedbuf;
 
   const auto subpidx = blockIdx.x;
-  const auto bidx = subprob_to_bin[subpidx];
-  const auto binsubp_idx = subpidx - subprobstartpts[bidx];
-  const auto ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize;
-  const auto nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize);
+  const auto bidx = loadReadOnly(subprob_to_bin + subpidx);
+  const auto binsubp_idx = subpidx - loadReadOnly(subprobstartpts + bidx);
+  const auto ptstart = loadReadOnly(binstartpts + bidx) + binsubp_idx * maxsubprobsize;
+  const auto nupts =
+      min(maxsubprobsize, loadReadOnly(bin_size + bidx) - binsubp_idx * maxsubprobsize);
 
   auto offset = compute_offset<ndim>(bidx, nbins, binsizes);
 
   constexpr auto ns_2 = (ns + 1) / 2;
   constexpr auto rounded_ns = ns_2 * 2;
 
-  shared_mem_copy_helper<T, ndim, ns>(binsizes, offset, nf,
-                                      [fw, fwshared](int idx_shared, int idx_global) {
-                                        fwshared[idx_shared] = fw[idx_global];
-                                      });
+  shared_mem_copy_helper<T, ndim, ns>(
+      binsizes, offset, nf, [fw, fwshared](int idx_shared, int idx_global) {
+        fwshared[idx_shared] = loadReadOnly(fw + idx_global);
+      });
   __syncthreads();
 
   for (int i = threadIdx.x; i < nupts; i += blockDim.x) {
-    const int idx = ptstart + i;
-    auto [ker, start] = get_kerval_and_local_start<T, KEREVALMETH, ndim, ns>(
-        idxnupts[idx], xyz, nf, offset, sigma, es_c, es_beta);
+    const int idx = ptstart + i;
+    const auto nuptsidx = loadReadOnly(idxnupts + idx);
+    auto [ker, start] = get_kerval_and_local_start<T, KEREVALMETH, ndim, ns>(
+        nuptsidx, xyz, nf, offset, sigma, es_c, es_beta);
 
     cuda_complex<T> cnow{0, 0};
     if constexpr (ndim == 1) {
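
The lambda passed to shared_mem_copy_helper now stages the block's tile of fw into shared memory through the read-only path before any thread interpolates from it. The underlying pattern, in a generic illustrative form (not cufinufft code):

// Cooperatively copy a tile of a read-only global array into shared memory
// via the read-only cache, then compute from the fast on-chip copy.
template<typename T>
__device__ void stage_tile(const T *__restrict__ global_src, T *shared_dst,
                           int tile_origin, int tile_size) {
  for (int j = threadIdx.x; j < tile_size; j += blockDim.x)
    shared_dst[j] = __ldg(global_src + tile_origin + j);
  __syncthreads(); // every thread sees the complete tile past this barrier
}
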
@@ -307,7 +314,7 @@ __global__ FINUFFT_FLATTEN void interp_subprob(
         cnow += cnowz * ker[2][zz];
       }
     }
-    c[idxnupts[idx]] = cnow;
+    storeCacheStreaming(c + nuptsidx, cnow);
   }
 }
 
@@ -351,16 +358,17 @@ void cuinterp_subprob(const cufinufft_plan_t<T> &d_plan, int blksize) {
 
 template<typename T, int KEREVALMETH, int ndim, int ns>
 __global__ FINUFFT_FLATTEN void spread_nupts_driven(
-    cuda::std::array<const T *, 3> xyz, const cuda_complex<T> *c, cuda_complex<T> *fw,
-    int M, cuda::std::array<int, 3> nf, T es_c, T es_beta, T sigma, const int *idxnupts) {
+    const cuda::std::array<const T *, 3> xyz, const cuda_complex<T> *FINUFFT_RESTRICT c,
+    cuda_complex<T> *FINUFFT_RESTRICT fw, const int M, const cuda::std::array<int, 3> nf,
+    const T es_c, const T es_beta, const T sigma, const int *FINUFFT_RESTRICT idxnupts) {
 
   for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M;
        i += blockDim.x * gridDim.x) {
     const auto nuptsidx = loadReadOnly(idxnupts + i);
     auto [ker, start] = get_kerval_and_startpos_nuptsdriven<T, KEREVALMETH, ndim, ns>(
         nuptsidx, xyz, nf, sigma, es_c, es_beta);
 
-    cuda_complex<T> val = c[idxnupts[i]];
+    const auto val = loadReadOnly(c + nuptsidx);
     if constexpr (ndim == 1) {
       for (int x0 = 0, ix = start[0]; x0 < ns; ++x0, ix = (ix + 1 >= nf[0]) ? 0 : ix + 1)
         atomicAddComplexGlobal<T>(fw + ix, ker[0][x0] * val);
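
Spreading is a scatter: several nonuniform points may hit the same grid cell, so accumulation into fw goes through atomicAddComplexGlobal rather than a plain store. A minimal sketch of such a helper (the actual cufinufft implementation may differ), assuming cuda_complex<T> has the usual float2/double2 layout:

// Two independent scalar atomics suffice because the spreader only ever
// accumulates sums; atomicAdd on double needs compute capability 6.0+.
template<typename T>
__device__ __forceinline__ void atomicAddComplexGlobal(cuda_complex<T> *addr,
                                                       cuda_complex<T> val) {
  atomicAdd(&addr->x, val.x);
  atomicAdd(&addr->y, val.y);
}
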
@@ -418,26 +426,29 @@ void cuspread_nupts_driven(const cufinufft_plan_t<T> &d_plan, int blksize) {
 // FIXME unify the next two functions and templatize on a lambda?
 template<typename T, int ndim>
 __global__ FINUFFT_FLATTEN void calc_bin_size_noghost(
-    int M, cuda::std::array<int, 3> nf, cuda::std::array<int, 3> binsizes,
-    cuda::std::array<int, 3> nbins, int *bin_size, cuda::std::array<const T *, 3> xyz,
-    int *sortidx) {
+    const int M, const cuda::std::array<int, 3> nf,
+    const cuda::std::array<int, 3> binsizes, const cuda::std::array<int, 3> nbins,
+    int *FINUFFT_RESTRICT bin_size, const cuda::std::array<const T *, 3> xyz,
+    int *FINUFFT_RESTRICT sortidx) {
   for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M;
        i += gridDim.x * blockDim.x) {
     const int binidx = compute_bin_index<ndim>(i, nf, binsizes, nbins, xyz);
-    int oldidx = atomicAdd(&bin_size[binidx], 1);
-    sortidx[i] = oldidx;
+    const int oldidx = atomicAdd(&bin_size[binidx], 1);
+    storeCacheStreaming(sortidx + i, oldidx);
   }
 }
 
 template<typename T, int ndim>
 __global__ FINUFFT_FLATTEN void calc_inverse_of_global_sort_idx(
-    int M, cuda::std::array<int, 3> binsizes, cuda::std::array<int, 3> nbins,
-    const int *bin_startpts, const int *sortidx, cuda::std::array<const T *, 3> xyz,
-    int *index, cuda::std::array<int, 3> nf) {
+    const int M, const cuda::std::array<int, 3> binsizes,
+    const cuda::std::array<int, 3> nbins, const int *FINUFFT_RESTRICT bin_startpts,
+    const int *FINUFFT_RESTRICT sortidx, const cuda::std::array<const T *, 3> xyz,
+    int *FINUFFT_RESTRICT index, const cuda::std::array<int, 3> nf) {
   for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M;
        i += gridDim.x * blockDim.x) {
     const int binidx = compute_bin_index<ndim>(i, nf, binsizes, nbins, xyz);
-    index[bin_startpts[binidx] + sortidx[i]] = i;
+    storeCacheStreaming(
+        index + loadReadOnly(bin_startpts + binidx) + loadReadOnly(sortidx + i), i);
   }
 }
 
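
Together the two kernels above build a bin-sorted point index via counting sort: calc_bin_size_noghost counts points per bin, with the value returned by atomicAdd doubling as each point's rank within its bin, and after an exclusive scan of the counts (done elsewhere in plan setup) calc_inverse_of_global_sort_idx scatters each point id to bin start plus rank. A stripped-down illustration of the same pattern, with hypothetical names:

// Phase 1: count bin occupancy; the old count returned by atomicAdd is this
// point's rank within its bin.
__global__ void count_and_rank(const int M, const int *bin_of_point,
                               int *bin_size, int *rank_in_bin) {
  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M;
       i += gridDim.x * blockDim.x)
    rank_in_bin[i] = atomicAdd(&bin_size[bin_of_point[i]], 1);
}

// Phase 2 is an exclusive scan of bin_size into bin_startpts (e.g. with
// thrust::exclusive_scan). Phase 3: scatter each point id so that index
// visits the points bin by bin.
__global__ void scatter_index(const int M, const int *bin_of_point,
                              const int *bin_startpts, const int *rank_in_bin,
                              int *index) {
  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M;
       i += gridDim.x * blockDim.x)
    index[bin_startpts[bin_of_point[i]] + rank_in_bin[i]] = i;
}
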
@@ -477,19 +488,23 @@ void cuspread_nuptsdriven_prop(cufinufft_plan_t<T> &d_plan) {
 
 template<typename T, int KEREVALMETH, int ndim, int ns>
 __global__ FINUFFT_FLATTEN void spread_subprob(
-    cuda::std::array<const T *, 3> xyz, const cuda_complex<T> *c, cuda_complex<T> *fw,
-    int M, cuda::std::array<int, 3> nf, T sigma, T es_c, T es_beta,
-    const int *binstartpts, const int *bin_size, cuda::std::array<int, 3> binsizes,
-    const int *subprob_to_bin, const int *subprobstartpts, const int *numsubprob,
-    int maxsubprobsize, cuda::std::array<int, 3> nbins, const int *idxnupts) {
+    const cuda::std::array<const T *, 3> xyz, const cuda_complex<T> *FINUFFT_RESTRICT c,
+    cuda_complex<T> *FINUFFT_RESTRICT fw, const int M, const cuda::std::array<int, 3> nf,
+    const T sigma, const T es_c, const T es_beta, const int *FINUFFT_RESTRICT binstartpts,
+    const int *FINUFFT_RESTRICT bin_size, const cuda::std::array<int, 3> binsizes,
+    const int *FINUFFT_RESTRICT subprob_to_bin,
+    const int *FINUFFT_RESTRICT subprobstartpts, const int *FINUFFT_RESTRICT numsubprob,
+    const int maxsubprobsize, const cuda::std::array<int, 3> nbins,
+    const int *FINUFFT_RESTRICT idxnupts) {
   extern __shared__ char sharedbuf[];
   auto fwshared = (cuda_complex<T> *)sharedbuf;
 
   const auto subpidx = blockIdx.x;
-  const auto bidx = subprob_to_bin[subpidx];
-  const auto binsubp_idx = subpidx - subprobstartpts[bidx];
-  const auto ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize;
-  const auto nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize);
+  const auto bidx = loadReadOnly(subprob_to_bin + subpidx);
+  const auto binsubp_idx = subpidx - loadReadOnly(subprobstartpts + bidx);
+  const auto ptstart = loadReadOnly(binstartpts + bidx) + binsubp_idx * maxsubprobsize;
+  const auto nupts =
+      min(maxsubprobsize, loadReadOnly(bin_size + bidx) - binsubp_idx * maxsubprobsize);
 
   auto offset = compute_offset<ndim>(bidx, nbins, binsizes);
 
@@ -506,11 +521,12 @@ __global__ FINUFFT_FLATTEN void spread_subprob(
   __syncthreads();
 
   for (int i = threadIdx.x; i < nupts; i += blockDim.x) {
-    const int idx = ptstart + i;
-    auto [ker, start] = get_kerval_and_local_start<T, KEREVALMETH, ndim, ns>(
-        idxnupts[idx], xyz, nf, offset, sigma, es_c, es_beta);
+    const int idx = ptstart + i;
+    const auto nuptsidx = loadReadOnly(idxnupts + idx);
+    auto [ker, start] = get_kerval_and_local_start<T, KEREVALMETH, ndim, ns>(
+        nuptsidx, xyz, nf, offset, sigma, es_c, es_beta);
 
-    const auto cnow = c[idxnupts[idx]];
+    const auto cnow = loadReadOnly(c + nuptsidx);
     if constexpr (ndim == 1) {
       const auto ofs = start[0] + ns_2;
       for (int xx = 0; xx < ns; ++xx) {
@@ -587,20 +603,21 @@ static void cuspread_subprob(const cufinufft_plan_t<T> &d_plan, int blksize) {
     launch(spread_subprob<T, 0, ndim, ns>);
 }
 
-static __global__ void calc_subprob(const int *bin_size, int *num_subprob,
-                                    int maxsubprobsize, int numbins) {
+static __global__ void calc_subprob(const int *FINUFFT_RESTRICT bin_size,
+                                    int *FINUFFT_RESTRICT num_subprob,
+                                    const int maxsubprobsize, const int numbins) {
   for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins;
        i += gridDim.x * blockDim.x) {
-    num_subprob[i] = (bin_size[i] + maxsubprobsize - 1) / maxsubprobsize;
+    num_subprob[i] = (loadReadOnly(bin_size + i) + maxsubprobsize - 1) / maxsubprobsize;
   }
 }
-static __global__ void map_b_into_subprob(int *d_subprob_to_bin,
-                                          const int *d_subprobstartpts,
-                                          const int *d_numsubprob, int numbins) {
+static __global__ void map_b_into_subprob(
+    int *FINUFFT_RESTRICT d_subprob_to_bin, const int *FINUFFT_RESTRICT d_subprobstartpts,
+    const int *FINUFFT_RESTRICT d_numsubprob, const int numbins) {
   for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins;
        i += gridDim.x * blockDim.x) {
-    for (int j = 0; j < d_numsubprob[i]; j++) {
-      d_subprob_to_bin[d_subprobstartpts[i] + j] = i;
+    for (int j = 0; j < loadReadOnly(d_numsubprob + i); j++) {
+      d_subprob_to_bin[loadReadOnly(d_subprobstartpts + i) + j] = i;
     }
   }
 }
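
The two small kernels above split oversized bins into fixed-size subproblems and record which bin each subproblem serves. A host-side reference of the mapping, with hypothetical numbers, for illustration only:

// With bin_size = {5, 12, 0, 3} and maxsubprobsize = 4:
//   num_subprob     = {2, 3, 0, 1}        (ceil-divide per bin)
//   subprobstartpts = {0, 2, 5, 5}        (exclusive scan, done elsewhere)
//   subprob_to_bin  = {0, 0, 1, 1, 1, 3}  (owning bin of each subproblem)
// so CUDA block b handles at most maxsubprobsize points of bin
// subprob_to_bin[b].
#include <vector>
std::vector<int> map_bins_to_subprobs(const std::vector<int> &bin_size,
                                      const int maxsubprobsize) {
  std::vector<int> subprob_to_bin;
  for (int b = 0; b < (int)bin_size.size(); ++b) {
    const int nsub = (bin_size[b] + maxsubprobsize - 1) / maxsubprobsize;
    for (int j = 0; j < nsub; ++j) subprob_to_bin.push_back(b);
  }
  return subprob_to_bin;
}
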
@@ -662,11 +679,14 @@ static void cuspread_subprob_prop(cufinufft_plan_t<T> &d_plan) {
 
 template<typename T, int KEREVALMETH, int ndim, int ns>
 __global__ FINUFFT_FLATTEN void spread_output_driven(
-    cuda::std::array<const T *, 3> xyz, const cuda_complex<T> *c, cuda_complex<T> *fw,
-    int M, cuda::std::array<int, 3> nf, T sigma, T es_c, T es_beta,
-    const int *binstartpts, const int *bin_size, cuda::std::array<int, 3> binsizes,
-    const int *subprob_to_bin, const int *subprobstartpts, const int *numsubprob,
-    int maxsubprobsize, cuda::std::array<int, 3> nbins, const int *idxnupts, int np) {
+    const cuda::std::array<const T *, 3> xyz, const cuda_complex<T> *FINUFFT_RESTRICT c,
+    cuda_complex<T> *FINUFFT_RESTRICT fw, const int M, const cuda::std::array<int, 3> nf,
+    const T sigma, const T es_c, const T es_beta, const int *FINUFFT_RESTRICT binstartpts,
+    const int *FINUFFT_RESTRICT bin_size, const cuda::std::array<int, 3> binsizes,
+    const int *FINUFFT_RESTRICT subprob_to_bin,
+    const int *FINUFFT_RESTRICT subprobstartpts, const int *FINUFFT_RESTRICT numsubprob,
+    const int maxsubprobsize, const cuda::std::array<int, 3> nbins,
+    const int *FINUFFT_RESTRICT idxnupts, const int np) {
   extern __shared__ char sharedbuf[];
 
   static constexpr auto ns_2 = (ns + 1) / 2;
@@ -675,10 +695,11 @@ __global__ FINUFFT_FLATTEN void spread_output_driven(
 
   auto [padded_size, local_subgrid_size] = get_padded_subgrid_info<ndim, ns>(binsizes);
 
-  const int bidx = subprob_to_bin[blockIdx.x];
-  const int binsubp_idx = blockIdx.x - subprobstartpts[bidx];
-  const int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize;
-  const int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize);
+  const int bidx = loadReadOnly(subprob_to_bin + blockIdx.x);
+  const int binsubp_idx = blockIdx.x - loadReadOnly(subprobstartpts + bidx);
+  const int ptstart = loadReadOnly(binstartpts + bidx) + binsubp_idx * maxsubprobsize;
+  const int nupts =
+      min(maxsubprobsize, loadReadOnly(bin_size + bidx) - binsubp_idx * maxsubprobsize);
 
   auto offset = compute_offset<ndim>(bidx, nbins, binsizes);
 
@@ -705,7 +726,7 @@ __global__ FINUFFT_FLATTEN void spread_output_driven(
     const auto batch_size = min(np, nupts - batch_begin);
     for (int i = threadIdx.x; i < batch_size; i += blockDim.x) {
       const int nuptsidx = loadReadOnly(idxnupts + ptstart + i + batch_begin);
-      nupts_sm[i] = c[nuptsidx];
+      nupts_sm[i] = loadReadOnly(c + nuptsidx);
       auto [ker, local_shift] = get_kerval_and_local_start<T, KEREVALMETH, ndim, ns>(
           nuptsidx, xyz, nf, offset, sigma, es_c, es_beta);
       kerevals[i] = ker;
