Skip to content

Commit c28ca33

Browse files
committed
Conflicts resolved
2 parents a7bb9b4 + 1d9c6e9 commit c28ca33

File tree

10 files changed

+782
-310
lines changed

10 files changed

+782
-310
lines changed

CHANGELOG

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ V 2.3.0beta (6/21/24)
77
kernel evaluation, templating by ns with AVX-width-dependent decisions.
88
Up to 80% faster, depending on compiler. (Marco Barbone with help from Libin Lu).
99
NOTE: introduces new dependency (XSIMD), added to CMake and makefile.
10+
* new test/finufft3dkernel_test checks kerevalmeth=0,1 agree to tolerance (M Barbone).
1011
* new perftest/compare_spreads.jl compares two spreadinterp libs (A Barnett).
1112
* new benchmarker perftest/spreadtestndall sweeps all kernel widths (M Barbone).
1213
* cufinufft now supports modeord(type 1,2 only): 0 CMCL-style increasing mode
@@ -26,15 +27,15 @@ V 2.3.0beta (6/21/24)
2627
* improved GPU python docs: migration guide; usage from cupy, numba, torch,
2728
pycuda. PyPI pkg still at 2.2.0beta.
2829
* Added a clang-format pre-commit hook to ensure consistent code style.
29-
Created a .clang-format file to define the style similar to the existing style.
30+
Created a .clang-format file to define a style similar to the existing style.
3031
Applied clang-format to all cmake, C, C++, and CUDA code. Ignored the blame
3132
using .git-blame-ignore-revs. Added a contributing.md for developers.
32-
* cuFINUFFT interface update: number of nonuniform points M is now a 64-bit integer
33-
as opposed to 32-bit. While this does modify the ABI, most code will just need to
34-
recompile against the new library as compilers will silently upcast any 32-bit
35-
integers to 64-bit when calling cufinufft(f)_setpts. Note that internally, 32-bit
36-
integers are still used, so calling cufinufft with more than 2e9 points will fail.
37-
This restriction may be lifted in the future.
33+
* cuFINUFFT interface update: number of nonuniform points M is now a 64-bit int
34+
as opposed to 32-bit. While this does modify the ABI, most code will just
35+
need to recompile against the new library as compilers will silently upcast
36+
any 32-bit integers to 64-bit when calling cufinufft(f)_setpts. Note that
37+
internally, 32-bit integers are still used, so calling cufinufft with more
38+
than 2e9 points will fail. This restriction may be lifted in the future.
3839

3940
V 2.2.0 (12/12/23)
4041

@@ -52,7 +53,7 @@ V 2.2.0 (12/12/23)
5253
* CMake build structure (thanks: Wenda Zhou, Marco Barbone, Libin Lu)
5354
- Note: the plan is to continue to support GNU makefile and make.inc.* but
5455
to transition to CMake as the main build system.
55-
- CI workflow using CMake on 3 OSes, 2 compilers each, PR #382 (Libin Lu)
56+
- CI workflow using CMake on 3 OSes, 2 compilers each, PR #382 (Libin Lu)
5657
* Docs: new tutorial content on iterative inverse NUFFTs; troubleshooting.
5758
* GitHub-facing badges
5859
* include/finufft/finufft_eitherprec.h moved up directory to be public (bea316c)

CMakeLists.txt

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,27 @@ project(finufft VERSION 2.2.0 LANGUAGES C CXX)
44

55
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
66

7+
include(CheckCXXCompilerFlag)
8+
79
set(GNU_LIKE_FRONTENDS AppleClang Clang GNU)
810
if (CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS)
911
# Set custom compiler flags for gcc-compatible compilers
10-
set(FINUFFT_CXX_FLAGS_RELEASE -O3 -funroll-loops -ffp-contract=fast)
12+
set(FINUFFT_CXX_FLAGS_RELEASE -O3
13+
-funroll-loops
14+
-ffp-contract=fast
15+
-fno-math-errno
16+
-fno-signed-zeros
17+
-fno-trapping-math
18+
-fassociative-math
19+
-freciprocal-math
20+
-fmerge-all-constants
21+
-ftree-vectorize
22+
)
23+
# if -fimplicit-constexpr is supported, add it to the list of flags
24+
check_cxx_compiler_flag("-fimplicit-constexpr" COMPILER_SUPPORTS_FIMPLICIT_CONSTEXPR)
25+
if (COMPILER_SUPPORTS_FIMPLICIT_CONSTEXPR)
26+
list(APPEND FINUFFT_CXX_FLAGS_RELEASE -fimplicit-constexpr)
27+
endif ()
1128
set(FINUFFT_CXX_FLAGS_RELWITHDEBINFO -g ${FINUFFT_CXX_FLAGS_RELEASE})
1229
endif ()
1330

@@ -100,6 +117,7 @@ function(enable_asan target)
100117
target_compile_options(${target} PRIVATE $<$<CONFIG:DEBUG>:-fsanitize=address -fsanitize=undefined -fsanitize=bounds-strict>)
101118
target_link_options(${target} PRIVATE $<$<CONFIG:DEBUG>:-fsanitize=address -fsanitize=undefined -fsanitize=bounds-strict>)
102119
endif ()
120+
target_compile_options(${target} PRIVATE $<$<CONFIG:DEBUG>:-Wall -Wno-sign-compare>)
103121
endfunction()
104122

105123
# Utility function to link static/dynamic lib

include/finufft/defs.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,21 +40,26 @@
4040
// inline macro, to force inlining of small functions
4141
// this avoids the use of macros to implement functions
4242
#if defined(_MSC_VER)
43-
#define FINUFFT_ALWAYS_INLINE __forceinline
43+
#define FINUFFT_ALWAYS_INLINE __forceinline inline
4444
#define FINUFFT_NEVER_INLINE __declspec(noinline)
4545
#define FINUFFT_RESTRICT __restrict
4646
#define FINUFFT_UNREACHABLE __assume(0)
47-
47+
#define FINUFFT_UNLIKELY(x) (x)
48+
#define FINUFFT_LIKELY(x) (x)
4849
#elif defined(__GNUC__) || defined(__clang__)
4950
#define FINUFFT_ALWAYS_INLINE __attribute__((always_inline)) inline
5051
#define FINUFFT_NEVER_INLINE __attribute__((noinline))
5152
#define FINUFFT_RESTRICT __restrict__
5253
#define FINUFFT_UNREACHABLE __builtin_unreachable()
54+
#define FINUFFT_UNLIKELY(x) __builtin_expect(!!(x), 0)
55+
#define FINUFFT_LIKELY(x) __builtin_expect(!!(x), 1)
5356
#else
5457
#define FINUFFT_ALWAYS_INLINE inline
5558
#define FINUFFT_NEVER_INLINE
5659
#define FINUFFT_RESTRICT
5760
#define FINUFFT_UNREACHABLE
61+
#define FINUFFT_UNLIKELY(x) (x)
62+
#define FINUFFT_LIKELY(x) (x)
5863
#endif
5964

6065
// ------------- Library-wide algorithm parameter settings ----------------

include/finufft/spreadinterp.h

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -30,26 +30,26 @@ namespace spreadinterp {
3030

3131
// things external (spreadinterp) interface needs...
3232
FINUFFT_EXPORT int FINUFFT_CDECL spreadinterp(
33-
BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M, FLT *kx, FLT *ky,
33+
UBIGINT N1, UBIGINT N2, UBIGINT N3, FLT *data_uniform, UBIGINT N, FLT *kx, FLT *ky,
3434
FLT *kz, FLT *data_nonuniform, const finufft_spread_opts &opts);
35-
FINUFFT_EXPORT int FINUFFT_CDECL spreadcheck(BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M,
36-
FLT *kx, FLT *ky, FLT *kz,
35+
FINUFFT_EXPORT int FINUFFT_CDECL spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3,
36+
UBIGINT N, FLT *kx, FLT *ky, FLT *kz,
3737
const finufft_spread_opts &opts);
38-
FINUFFT_EXPORT int FINUFFT_CDECL indexSort(BIGINT *sort_indices, BIGINT N1, BIGINT N2,
39-
BIGINT N3, BIGINT M, FLT *kx, FLT *ky, FLT *kz,
40-
const finufft_spread_opts &opts);
38+
FINUFFT_EXPORT int FINUFFT_CDECL indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2,
39+
UBIGINT N3, UBIGINT N, FLT *kx, FLT *ky,
40+
FLT *kz, const finufft_spread_opts &opts);
4141
FINUFFT_EXPORT int FINUFFT_CDECL interpSorted(
42-
const BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3,
43-
FLT *FINUFFT_RESTRICT data_uniform, BIGINT M, FLT *FINUFFT_RESTRICT kx,
42+
const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3,
43+
FLT *FINUFFT_RESTRICT data_uniform, UBIGINT N, FLT *FINUFFT_RESTRICT kx,
4444
FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz,
4545
FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts);
4646
FINUFFT_EXPORT int FINUFFT_CDECL spreadSorted(
47-
const BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform,
48-
BIGINT M, FLT *kx, FLT *ky, FLT *kz, const FLT *data_nonuniform,
47+
const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, FLT *data_uniform,
48+
UBIGINT N, FLT *kx, FLT *ky, FLT *kz, const FLT *data_nonuniform,
4949
const finufft_spread_opts &opts, int did_sort);
5050
FINUFFT_EXPORT int FINUFFT_CDECL spreadinterpSorted(
51-
const BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3,
52-
FLT *FINUFFT_RESTRICT data_uniform, BIGINT M, FLT *FINUFFT_RESTRICT kx,
51+
const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3,
52+
FLT *FINUFFT_RESTRICT data_uniform, UBIGINT N, FLT *FINUFFT_RESTRICT kx,
5353
FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz,
5454
FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts, int did_sort);
5555
FINUFFT_EXPORT FLT FINUFFT_CDECL evaluate_kernel(FLT x, const finufft_spread_opts &opts);

makefile

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,11 @@ PYTHON = python3
2727
# Notes: 1) -Ofast breaks isfinite() & isnan(), so use -O3 which now is as fast
2828
# 2) -fcx-limited-range for fortran-speed complex arith in C++
2929
# 3) we use simply-expanded (:=) makefile variables, otherwise confusing
30-
CFLAGS := -O3 -funroll-loops -march=native -fcx-limited-range -ffp-contract=fast $(CFLAGS)
30+
# 4) the extra math flags are for speed; results may differ at rounding level
31+
# they allow gcc to vectorize the code more effectively
32+
CFLAGS := -O3 -funroll-loops -march=native -fcx-limited-range -ffp-contract=fast\
33+
-fno-math-errno -fno-signed-zeros -fno-trapping-math -fassociative-math\
34+
-freciprocal-math -fmerge-all-constants -ftree-vectorize $(CFLAGS)
3135
FFLAGS := $(CFLAGS) $(FFLAGS)
3236
CXXFLAGS := $(CFLAGS) $(CXXFLAGS)
3337
# FFTW base name, and math linking...

0 commit comments

Comments
 (0)