|
4 | 4 | #include <cufft.h> |
5 | 5 | #include <cufinufft/types.h> |
6 | 6 | #include <cufinufft_opts.h> |
| 7 | +#include <finufft_errors.h> |
7 | 8 | #include <finufft_spread_opts.h> |
8 | 9 |
|
9 | 10 | #include <complex.h> |
@@ -32,6 +33,38 @@ template<typename T> |
32 | 33 | void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a, |
33 | 34 | T *fwkerhalf, finufft_spread_opts opts); |
34 | 35 |
|
// Declaration only; the definition is not in this header.
// cufinufft_set_shared_memory (below) calls this with
// (dim, spopts.nspread, opts.gpu_binsizex, opts.gpu_binsizey, opts.gpu_binsizez)
// and treats the result as a per-block shared-memory size in bytes: it is
// compared against the device's opt-in per-block limit and then used as the
// kernel's maximum dynamic shared-memory size.
// NOTE(review): how T influences the result is not visible here - confirm
// against the definition.
| 36 | +template<typename T> |
| 37 | +std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y, |
| 38 | + int bin_size_z); |
| 39 | + |
// Declaration only; the definition is not in this header.
// Takes the options struct by mutable pointer, so it can modify *opts.
// NOTE(review): presumably fills in the gpu_binsize{x,y,z} fields from
// (type, ns, dim) - TODO confirm against the definition.
| 40 | +template<typename T> |
| 41 | +void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts); |
| 42 | + |
| 43 | +template<typename T, typename V> |
| 44 | +auto cufinufft_set_shared_memory(V *kernel, const int dim, |
| 45 | + const cufinufft_plan_t<T> &d_plan) { |
| 46 | + /** |
| 47 | + * WARNING: this function does not handle cuda errors. The caller should check them. |
| 48 | + */ |
| 49 | + int device_id{}, shared_mem_per_block{}; |
| 50 | + cudaGetDevice(&device_id); |
| 51 | + const auto shared_mem_required = |
| 52 | + shared_memory_required<T>(dim, d_plan.spopts.nspread, d_plan.opts.gpu_binsizex, |
| 53 | + d_plan.opts.gpu_binsizey, d_plan.opts.gpu_binsizez); |
| 54 | + cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, |
| 55 | + device_id); |
| 56 | + if (shared_mem_required > shared_mem_per_block) { |
| 57 | + fprintf(stderr, |
| 58 | + "Error: Shared memory required per block is %zu bytes, but the device " |
| 59 | + "supports only %d bytes.\n", |
| 60 | + shared_mem_required, shared_mem_per_block); |
| 61 | + return FINUFFT_ERR_INSUFFICIENT_SHMEM; |
| 62 | + } |
| 63 | + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, |
| 64 | + shared_mem_required); |
| 65 | + return 0; |
| 66 | +} |
| 67 | + |
35 | 68 | } // namespace common |
36 | 69 | } // namespace cufinufft |
37 | 70 | #endif |
0 commit comments