Skip to content

Commit b3c2be7

Browse files
authored
GPU performance improvements (#488)
* cuFINUFFT binsize is now a function of the shared memory available where possible. * cuFINUFFT GM 1D sorts using thrust::sort instead of bin-sort. * cuFINUFFT using the new normalized Horner coefficients and added support for 1.25. * cuFINUFFT new compile flags for extra-vectorization, flushing single precision denormals to 0 and using fma where possible. * cuFINUFFT using intrinsics in foldrescale and other places to increase performance * cuFINUFFT using SM90 float2 vector atomicAdd where supported * cuFINUFFT making default binsize = 0
1 parent b81c86f commit b3c2be7

38 files changed

+2220
-1464
lines changed

CHANGELOG

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,17 @@ V 2.3.0beta (7/24/24)
5454
* cmake adding nvcc and msvc optimization flags
5555
* cmake supports sphinx
5656
* updated install docs
57+
* cuFINUFFT binsize is now a function of the shared memory available where
58+
possible.
59+
* cuFINUFFT GM 1D sorts using thrust::sort instead of bin-sort.
60+
* cuFINUFFT using the new normalized Horner coefficients and added support
61+
for 1.25.
62+
* cuFINUFFT new compile flags for extra-vectorization, flushing single
63+
precision denormals to 0 and using fma where possible.
64+
* cuFINUFFT using intrinsics in foldrescale and other places to increase
65+
performance
66+
* cuFINUFFT using SM90 float2 vector atomicAdd where supported
67+
* cuFINUFFT making default binsize = 0
5768

5869
V 2.2.0 (12/12/23)
5970

devel/CMakeLists.txt

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,23 +2,25 @@ project(finufft_devel)
22
# Set the minimum required version of CMake
33
cmake_minimum_required(VERSION 3.5)
44

5-
65
# include cpm cmake, downloading it
7-
CPMAddPackage(
8-
NAME benchmark
9-
GITHUB_REPOSITORY google/benchmark
10-
VERSION 1.8.3
11-
OPTIONS "BENCHMARK_ENABLE_TESTING OFF"
12-
13-
)
6+
cpmaddpackage(
7+
NAME
8+
benchmark
9+
GITHUB_REPOSITORY
10+
google/benchmark
11+
VERSION
12+
1.8.3
13+
OPTIONS
14+
"BENCHMARK_ENABLE_TESTING OFF")
1415

15-
if (benchmark_ADDED)
16-
# patch benchmark target
17-
set_target_properties(benchmark PROPERTIES CXX_STANDARD 17)
16+
if(benchmark_ADDED)
17+
# patch benchmark target
18+
set_target_properties(benchmark PROPERTIES CXX_STANDARD 17)
1819
endif()
1920

2021
add_executable(foldrescale foldrescale.cpp)
2122
target_link_libraries(foldrescale finufft benchmark xsimd)
2223
add_executable(padding padding.cpp)
24+
target_compile_features(padding PRIVATE cxx_std_17)
2325
target_link_libraries(padding finufft xsimd)
2426
target_compile_options(padding PRIVATE -march=native)

devel/gen_all_horner_C_code.m

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,12 @@
1212

1313
for upsampfac = [2.0, 1.25]; % sigma: either 2 (default) or low (eg 5/4)
1414
fprintf('upsampfac = %g...\n',upsampfac)
15-
15+
1616
ws = 2:16;
17-
opts.wpad = true; % pad kernel eval to multiple of 4
17+
opts.wpad = false; % pad kernel eval to multiple of 4
1818

19-
if upsampfac==2, fid = fopen('../src/ker_horner_allw_loop_constexpr.c','w');
20-
else, fid = fopen('../src/ker_lowupsampfac_horner_allw_loop_constexpr.c','w');
19+
if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc','w');
20+
else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc','w');
2121
end
2222
fwrite(fid,sprintf('// Code generated by gen_all_horner_C_code.m in finufft/devel\n'));
2323
fwrite(fid,sprintf('// Authors: Alex Barnett & Ludvig af Klinteberg.\n// (C) The Simons Foundation, Inc.\n'));
@@ -27,9 +27,9 @@
2727
fprintf('w=%d\td=%d\tbeta=%.3g\n',w,d,beta);
2828
str = gen_ker_horner_loop_C_code(w,d,beta,opts);
2929
if j==1 % write switch statement
30-
fwrite(fid,sprintf(' if constexpr(w==%d) {\n',w));
30+
fwrite(fid,sprintf(' if (w==%d) {\n',w));
3131
else
32-
fwrite(fid,sprintf(' } else if constexpr(w==%d) {\n',w));
32+
fwrite(fid,sprintf(' } else if (w==%d) {\n',w));
3333
end
3434
for i=1:numel(str); fwrite(fid,[' ',str{i}]); end
3535
end

devel/gen_ker_horner_loop_C_code.m

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,9 @@
3838
width = w;
3939
end
4040
for n=1:d+1 % loop over poly coeff powers
41-
s = sprintf('FLT c%d[] = {%.16E',n-1, C(n,1));
41+
s = sprintf('constexpr FLT c%d[] = {%.16E',n-1, C(n,1));
4242
for i=2:width % loop over segments
43-
s = sprintf('%s, %.16E', s, C(n,i));
43+
s = sprintf('%s, %.16E', s, C(n,i));
4444
end
4545
str{n} = [s sprintf('};\n')];
4646
end

examples/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,20 +11,23 @@ set(EXAMPLES_C guru1d1c simple1d1c simple1d1cf)
1111

1212
foreach(EXAMPLE ${EXAMPLES})
1313
add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
14+
target_compile_features(${EXAMPLE} PRIVATE cxx_std_17)
1415
target_link_libraries(${EXAMPLE} PRIVATE finufft)
1516
enable_asan(${EXAMPLE})
1617
endforeach()
1718

1819
foreach(EXAMPLE ${EXAMPLES_C})
1920
add_executable(${EXAMPLE} ${EXAMPLE}.c)
2021
target_link_libraries(${EXAMPLE} PRIVATE finufft)
22+
target_compile_features(${EXAMPLE} PRIVATE cxx_std_17)
2123
enable_asan(${EXAMPLE})
2224
endforeach()
2325

2426
if(FINUFFT_USE_OPENMP)
2527
foreach(EXAMPLE ${EXAMPLES_OPENMP})
2628
add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
2729
target_link_libraries(${EXAMPLE} PRIVATE finufft OpenMP::OpenMP_CXX)
30+
target_compile_features(${EXAMPLE} PRIVATE cxx_std_17)
2831
enable_asan(${EXAMPLE})
2932
endforeach()
3033
endif()

examples/cuda/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
file(GLOB example_src "*.cpp")
32

43
foreach(srcfile ${example_src})
@@ -7,4 +6,5 @@ foreach(srcfile ${example_src})
76
add_executable(${executable} ${srcfile})
87
target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
98
target_link_libraries(${executable} cufinufft)
9+
target_compile_features(${executable} PRIVATE cxx_std_17)
1010
endforeach()

include/cufinufft/common.h

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include <cufft.h>
55
#include <cufinufft/types.h>
66
#include <cufinufft_opts.h>
7+
#include <finufft_errors.h>
78
#include <finufft_spread_opts.h>
89

910
#include <complex.h>
@@ -32,6 +33,38 @@ template<typename T>
3233
void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a,
3334
T *fwkerhalf, finufft_spread_opts opts);
3435

36+
template<typename T>
37+
std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y,
38+
int bin_size_z);
39+
40+
template<typename T>
41+
void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts);
42+
43+
template<typename T, typename V>
44+
auto cufinufft_set_shared_memory(V *kernel, const int dim,
45+
const cufinufft_plan_t<T> &d_plan) {
46+
/**
47+
* WARNING: this function does not handle cuda errors. The caller should check them.
48+
*/
49+
int device_id{}, shared_mem_per_block{};
50+
cudaGetDevice(&device_id);
51+
const auto shared_mem_required =
52+
shared_memory_required<T>(dim, d_plan.spopts.nspread, d_plan.opts.gpu_binsizex,
53+
d_plan.opts.gpu_binsizey, d_plan.opts.gpu_binsizez);
54+
cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin,
55+
device_id);
56+
if (shared_mem_required > shared_mem_per_block) {
57+
fprintf(stderr,
58+
"Error: Shared memory required per block is %zu bytes, but the device "
59+
"supports only %d bytes.\n",
60+
shared_mem_required, shared_mem_per_block);
61+
return FINUFFT_ERR_INSUFFICIENT_SHMEM;
62+
}
63+
cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
64+
shared_mem_required);
65+
return 0;
66+
}
67+
3568
} // namespace common
3669
} // namespace cufinufft
3770
#endif

include/cufinufft/contrib/helper_cuda.h

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -58,13 +58,14 @@ static inline cudaError_t cudaFreeWrapper(T *devPtr, cudaStream_t stream,
5858
return pool_supported ? cudaFreeAsync(devPtr, stream) : cudaFree(devPtr);
5959
}
6060

61-
#define RETURN_IF_CUDA_ERROR \
62-
{ \
63-
cudaError_t err = cudaGetLastError(); \
64-
if (err != cudaSuccess) { \
65-
printf("[%s] Error: %s\n", __func__, cudaGetErrorString(err)); \
66-
return FINUFFT_ERR_CUDA_FAILURE; \
67-
} \
61+
#define RETURN_IF_CUDA_ERROR \
62+
{ \
63+
cudaError_t err = cudaGetLastError(); \
64+
if (err != cudaSuccess) { \
65+
printf("[%s] Error: %s in %s at line %d\n", __func__, cudaGetErrorString(err), \
66+
__FILE__, __LINE__); \
67+
return FINUFFT_ERR_CUDA_FAILURE; \
68+
} \
6869
}
6970

7071
#define CUDA_FREE_AND_NULL(val, stream, pool_supported) \

0 commit comments

Comments (0)