Skip to content

Commit 8d5afee

Browse files
committed
fix: avoid unnecessary 2GB allocation in makeplan for FFTW_ESTIMATE
Since d4d08a4 (PR #633, between v2.3.0 and v2.4.1), makeplan allocates a temporary std::vector for FFTW planning that is never read. For large 3D grids (e.g. 250^3 with upsamp=2.0 → 500^3 = 2GB), this causes ~250ms of page faults for a buffer that neither FFTW_ESTIMATE nor DUCC0 touches. Fix: remove the ptr parameter from plan() and let each backend manage its own buffer internally. For FFTW_ESTIMATE (the default), a small dummy is used since FFTW never touches the buffer. For FFTW_MEASURE etc., a full buffer is allocated for trial FFTs. The DUCC0 stub remains a no-op.
1 parent 8b7ac66 commit 8d5afee

File tree

2 files changed

+32
-26
lines changed

2 files changed

+32
-26
lines changed

include/finufft/fft.h

Lines changed: 31 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,7 @@ template<typename T> class Finufft_FFT_plan {
1414
Finufft_FFT_plan(const Finufft_FFT_plan &) = delete;
1515
Finufft_FFT_plan &operator=(const Finufft_FFT_plan &) = delete;
1616
[[maybe_unused]] void plan(const std::vector<int> & /*dims*/, size_t /*batchSize*/,
17-
std::complex<T> * /*ptr*/, int /*sign*/, int /*options*/,
18-
int /*nthreads*/) {}
17+
int /*sign*/, int /*options*/, int /*nthreads*/) {}
1918

2019
[[maybe_unused]] static void forget_wisdom() {}
2120
[[maybe_unused]] static void cleanup() {}
@@ -28,6 +27,7 @@ template<typename T> class Finufft_FFT_plan {
2827
#include <complex>
2928
#include <fftw3.h> // (after complex) needed so can typedef FFTW_CPX
3029
//clang-format on
30+
#include <array>
3131
#include <mutex>
3232

3333
template<typename T> class Finufft_FFT_plan {};
@@ -73,11 +73,17 @@ template<> struct Finufft_FFT_plan<float> {
7373
}
7474
Finufft_FFT_plan &operator=(const Finufft_FFT_plan &) = delete;
7575

76-
void plan
77-
[[maybe_unused]] (const std::vector<int> &dims, size_t batchSize,
78-
std::complex<float> *ptr, int sign, int options, int nthreads) {
76+
void plan [[maybe_unused]] (const std::vector<int> &dims, size_t batchSize, int sign,
77+
int options, int nthreads) {
7978
uint64_t nf = 1;
8079
for (auto i : dims) nf *= i;
80+
// FFTW_ESTIMATE never touches the buffer; FFTW_MEASURE etc. run trial FFTs.
81+
// Use a 1-element dummy for ESTIMATE, full aligned buffer otherwise.
82+
using cpxf = std::complex<float>;
83+
std::array<cpxf, 1> dummy{};
84+
std::vector<cpxf> buf(options & FFTW_ESTIMATE ? 0 : nf * batchSize);
85+
auto *ptr =
86+
reinterpret_cast<fftwf_complex *>(buf.empty() ? dummy.data() : buf.data());
8187
lock();
8288
// Destroy existing plans before creating new ones (handles re-planning)
8389
if (plan_) {
@@ -91,14 +97,12 @@ template<> struct Finufft_FFT_plan<float> {
9197
#ifdef _OPENMP
9298
fftwf_plan_with_nthreads(nthreads);
9399
#endif
94-
plan_ = fftwf_plan_many_dft(int(dims.size()), dims.data(), int(batchSize),
95-
reinterpret_cast<fftwf_complex *>(ptr), nullptr, 1,
96-
int(nf), reinterpret_cast<fftwf_complex *>(ptr), nullptr,
97-
1, int(nf), sign, unsigned(options));
98-
plan_adj_ = fftwf_plan_many_dft(int(dims.size()), dims.data(), int(batchSize),
99-
reinterpret_cast<fftwf_complex *>(ptr), nullptr, 1,
100-
int(nf), reinterpret_cast<fftwf_complex *>(ptr),
101-
nullptr, 1, int(nf), -sign, unsigned(options));
100+
plan_ = fftwf_plan_many_dft(int(dims.size()), dims.data(), int(batchSize), ptr,
101+
nullptr, 1, int(nf), ptr, nullptr, 1, int(nf), sign,
102+
unsigned(options));
103+
plan_adj_ = fftwf_plan_many_dft(int(dims.size()), dims.data(), int(batchSize), ptr,
104+
nullptr, 1, int(nf), ptr, nullptr, 1, int(nf), -sign,
105+
unsigned(options));
102106
unlock();
103107
}
104108
void execute [[maybe_unused]] (std::complex<float> *data) const {
@@ -158,11 +162,16 @@ template<> struct Finufft_FFT_plan<double> {
158162
}
159163
Finufft_FFT_plan &operator=(const Finufft_FFT_plan &) = delete;
160164

161-
void plan
162-
[[maybe_unused]] (const std::vector<int> &dims, size_t batchSize,
163-
std::complex<double> *ptr, int sign, int options, int nthreads) {
165+
void plan [[maybe_unused]] (const std::vector<int> &dims, size_t batchSize, int sign,
166+
int options, int nthreads) {
164167
uint64_t nf = 1;
165168
for (auto i : dims) nf *= i;
169+
// FFTW_ESTIMATE never touches the buffer; FFTW_MEASURE etc. run trial FFTs.
170+
// Use a 1-element dummy for ESTIMATE, full aligned buffer otherwise.
171+
using cpxd = std::complex<double>;
172+
std::array<cpxd, 1> dummy{};
173+
std::vector<cpxd> buf(options & FFTW_ESTIMATE ? 0 : nf * batchSize);
174+
auto *ptr = reinterpret_cast<fftw_complex *>(buf.empty() ? dummy.data() : buf.data());
166175
lock();
167176
// Destroy existing plans before creating new ones (handles re-planning)
168177
if (plan_) {
@@ -176,14 +185,12 @@ template<> struct Finufft_FFT_plan<double> {
176185
#ifdef _OPENMP
177186
fftw_plan_with_nthreads(nthreads);
178187
#endif
179-
plan_ = fftw_plan_many_dft(int(dims.size()), dims.data(), int(batchSize),
180-
reinterpret_cast<fftw_complex *>(ptr), nullptr, 1, int(nf),
181-
reinterpret_cast<fftw_complex *>(ptr), nullptr, 1, int(nf),
182-
sign, unsigned(options));
183-
plan_adj_ = fftw_plan_many_dft(int(dims.size()), dims.data(), int(batchSize),
184-
reinterpret_cast<fftw_complex *>(ptr), nullptr, 1,
185-
int(nf), reinterpret_cast<fftw_complex *>(ptr),
186-
nullptr, 1, int(nf), -sign, unsigned(options));
188+
plan_ =
189+
fftw_plan_many_dft(int(dims.size()), dims.data(), int(batchSize), ptr, nullptr, 1,
190+
int(nf), ptr, nullptr, 1, int(nf), sign, unsigned(options));
191+
plan_adj_ =
192+
fftw_plan_many_dft(int(dims.size()), dims.data(), int(batchSize), ptr, nullptr, 1,
193+
int(nf), ptr, nullptr, 1, int(nf), -sign, unsigned(options));
187194
unlock();
188195
}
189196
void execute [[maybe_unused]] (std::complex<double> *data) const {

src/finufft_core.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -764,8 +764,7 @@ template<typename TF> int FINUFFT_PLAN_T<TF>::init_grid_kerFT_FFT() {
764764
timer.restart(); // plan the FFTW (to act in-place on the workspace fwBatch)
765765
int nthr_fft = opts.nthreads;
766766
const auto ns = gridsize_for_fft(*this);
767-
std::vector<TC, xsimd::aligned_allocator<TC, 64>> fwBatch(nf() * batchSize);
768-
fftPlan->plan(ns, batchSize, fwBatch.data(), fftSign, opts.fftw, nthr_fft);
767+
fftPlan->plan(ns, batchSize, fftSign, opts.fftw, nthr_fft);
769768
if (opts.debug)
770769
printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, opts.fftw, nthr_fft,
771770
timer.elapsedsec());

0 commit comments

Comments (0)