Skip to content

Commit 02de82d

Browse files
committed
feat: mmap-based reclaimable memory for thread-safe execute
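Use a thread-local mmap + MADV_FREE buffer (VirtualAlloc + MEM_RESET on Windows) for the fallback fwBatch workspace in execute, so concurrent execute calls on the same plan are thread-safe without reallocating the buffer on every call. Also defers fwBatch allocation from makeplan to execute and fixes Windows min/max macro collisions.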
1 parent d360f7f commit 02de82d

File tree

9 files changed

+216
-12
lines changed

9 files changed

+216
-12
lines changed

CHANGELOG

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,12 @@ If not stated, FINUFFT is assumed (old cuFINUFFT <=1.3 is listed separately).
33

44
v2.6.0-dev
55

6-
* Added `threadsafe_execute` regression test verifying concurrent `execute()`
7-
calls on the same plan produce correct results. Added sanitizer mode selection
8-
via `FINUFFT_USE_SANITIZERS=OFF|ON|MEMSAN|TSAN`, and extended the sanitizer
6+
* Fixed CPU fallback execute scratch handling to use thread-local reclaimable
7+
buffers when caller scratch is not provided, so concurrent `execute()` calls
8+
on the same plan no longer share internal workspace. Added unit coverage for
9+
`ReclaimableMemory` and a `threadsafe_execute` regression test based on
10+
direct 1D reference evaluation. Added sanitizer mode selection via
11+
`FINUFFT_USE_SANITIZERS=OFF|ON|MEMSAN|TSAN`, and extended the sanitizer
912
GitHub workflow to run a focused Linux TSAN job. (Barbone)
1013
* SIMD-vectorized bin sort with parallel prefix sum: uint32_t bin counts,
1114
ndims dispatch for vectorized coordinate binning, std::exclusive_scan for

include/finufft/execute.hpp

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,11 @@
55
#include <cstdio>
66
#include <vector>
77

8+
#include <finufft/memory.hpp>
89
#include <finufft/plan.hpp>
9-
#include <finufft/utils.hpp>
10-
#include <finufft/spreadinterp.hpp>
1110
#include <finufft/simd.hpp>
11+
#include <finufft/spreadinterp.hpp>
12+
#include <finufft/utils.hpp>
1213

1314
/* Computational core for FINUFFT.
1415
@@ -363,11 +364,17 @@ int FINUFFT_PLAN_T<TF>::execute_internal(TC *cj, TC *fk, bool adjoint, int ntran
363364
if (opts.debug)
364365
printf("[%s] start%s ntrans=%d (%d batches, bsize=%d)...\n", "execute",
365366
adjoint ? " adjoint" : "", ntrans_actual, nbatch, batchSize);
366-
// allocate temporary buffers
367+
// Use caller-provided scratch, or fall back to a thread-local reclaimable
368+
// buffer so concurrent execute() calls on the same plan do not share state.
367369
bool scratch_provided = scratch_size >= size_t(nf() * batchSize);
368-
std::vector<TC, xsimd::aligned_allocator<TC, 64>> fwBatch_(
369-
scratch_provided ? 0 : nf() * batchSize);
370-
TC *fwBatch = scratch_provided ? aligned_scratch : fwBatch_.data();
370+
static thread_local finufft::ReclaimableMemory tls_fwBatchBuf;
371+
TC *fwBatch = aligned_scratch;
372+
if (!scratch_provided) {
373+
const size_t fwBatchBytes = size_t(nf()) * batchSize * sizeof(TC);
374+
if (!tls_fwBatchBuf.allocate(fwBatchBytes))
375+
throw finufft::exception(FINUFFT_ERR_ALLOC);
376+
fwBatch = static_cast<TC *>(tls_fwBatchBuf.data());
377+
}
371378
for (int b = 0; b * batchSize < ntrans_actual; b++) { // .....loop b over batches
372379

373380
// current batch is either batchSize, or possibly truncated if last one
@@ -411,6 +418,11 @@ int FINUFFT_PLAN_T<TF>::execute_internal(TC *cj, TC *fk, bool adjoint, int ntran
411418
}
412419
} // ........end b loop
413420

421+
// Mark thread-local pages as reclaimable so the OS can reclaim physical
422+
// memory between execute calls if under pressure. Pages stay resident
423+
// otherwise, and virtual addresses remain stable for reuse.
424+
if (!scratch_provided) tls_fwBatchBuf.mark_reclaimable();
425+
414426
if (opts.debug) { // report total times in their natural order...
415427
if ((type == 1) != adjoint) {
416428
printf("[%s] done. tot spread:\t\t%.3g s\n", "execute", t_sprint);

include/finufft/memory.hpp

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
#pragma once
2+
3+
// Cross-platform RAII wrapper for large temporary buffers.
4+
//
5+
// Uses mmap/VirtualAlloc for large allocations with two key features:
6+
// 1. Page-aligned allocation that keeps a stable virtual address range for
7+
// reuse across calls (no munmap/VirtualFree between uses).
8+
// 2. mark_reclaimable() issues MADV_FREE (Linux/macOS) or MEM_RESET (Windows)
9+
// to tell the OS the physical pages may be reclaimed under memory pressure,
10+
// without releasing the virtual address range. If pages are not reclaimed,
11+
// the next use is essentially free; if they are, the OS transparently
12+
// supplies fresh pages on the next fault — no user-visible error. Either way
// the old contents must be treated as undefined after mark_reclaimable().
13+
//
14+
// Platform support:
15+
// Linux / macOS — mmap + MADV_FREE
16+
// Windows — VirtualAlloc + MEM_RESET
17+
// Other — std::aligned_alloc fallback (no reclaimable support)
18+
//
19+
// This relies on POSIX mmap (available since 4.4BSD / POSIX.1-2001) and Win32
20+
// VirtualAlloc, both long-stable OS-level APIs. MADV_FREE itself is a
// non-POSIX extension, so it is only a best-effort hint where available.
21+
22+
#include <cstddef>
23+
#include <cstdio>
24+
#include <cstdlib>
25+
26+
#if defined(_WIN32)
27+
#ifndef WIN32_LEAN_AND_MEAN
28+
#define WIN32_LEAN_AND_MEAN // Exclude rarely-used Windows headers to speed compilation
29+
#endif
30+
#ifndef NOMINMAX
31+
#define NOMINMAX // Prevent Windows <windows.h> from defining min/max macros
32+
#endif
33+
#include <windows.h>
34+
#elif defined(__unix__) || defined(__APPLE__)
35+
#include <sys/mman.h>
36+
#include <unistd.h>
37+
#else
38+
#include <cstring> // memset fallback
39+
#endif
40+
41+
namespace finufft {
42+
43+
// Owning handle for one large, page-aligned scratch buffer.
//
// The buffer is obtained from VirtualAlloc (Windows), an anonymous private
// mmap (Unix-like systems and macOS), or std::aligned_alloc (anything else).
// It is released in the destructor. Between uses, mark_reclaimable() may be
// called to hint that the physical pages can be dropped while the address
// range stays valid.
//
// The class is move-only. It performs no internal locking.
class ReclaimableMemory {
public:
  ReclaimableMemory() = default;

  // Non-copyable: exactly one object may release a given buffer.
  ReclaimableMemory(const ReclaimableMemory &)            = delete;
  ReclaimableMemory &operator=(const ReclaimableMemory &) = delete;

  // Moving transfers the buffer and leaves the source empty (null, size 0).
  ReclaimableMemory(ReclaimableMemory &&o) noexcept : ptr_(o.ptr_), nbytes_(o.nbytes_) {
    o.ptr_    = nullptr;
    o.nbytes_ = 0;
  }
  ReclaimableMemory &operator=(ReclaimableMemory &&o) noexcept {
    if (this != &o) {
      deallocate(); // drop whatever we currently own before adopting o's buffer
      ptr_      = o.ptr_;
      nbytes_   = o.nbytes_;
      o.ptr_    = nullptr;
      o.nbytes_ = 0;
    }
    return *this;
  }

  ~ReclaimableMemory() { deallocate(); }

  // Make the buffer exactly nbytes long. Returns true on success.
  //
  //  * nbytes == 0 is a successful no-op (any existing buffer is kept).
  //  * If a buffer of exactly nbytes is already held, it is reused as-is and
  //    its address does not change.
  //  * Otherwise the old buffer is released and a new one is obtained. On
  //    failure the object is left empty (data() == nullptr, size() == 0).
  bool allocate(size_t nbytes) {
    if (nbytes == 0) return true;
    if (ptr_ && nbytes_ == nbytes) return true; // already the right size
    deallocate();

    void *fresh = nullptr;
#if defined(_WIN32)
    fresh = VirtualAlloc(nullptr, nbytes, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
#elif defined(__unix__) || defined(__APPLE__)
    // Same guard as the <sys/mman.h> include and as deallocate(), so the three
    // always agree on which platforms use mmap.
    fresh =
        mmap(nullptr, nbytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (fresh == MAP_FAILED) fresh = nullptr; // mmap signals failure with MAP_FAILED
#else
    // Fallback: aligned allocation. std::aligned_alloc wants a size that is a
    // multiple of the alignment, so round up; refuse sizes whose rounding
    // would wrap around.
    constexpr size_t kAlign = 4096;
    if (nbytes > static_cast<size_t>(-1) - (kAlign - 1)) return false;
    fresh = std::aligned_alloc(kAlign, ((nbytes + kAlign - 1) / kAlign) * kAlign);
    if (fresh) std::memset(fresh, 0, nbytes);
#endif
    if (!fresh) return false;

    // Commit to the members only once the allocation is known to be good.
    ptr_    = fresh;
    nbytes_ = nbytes;
    return true;
  }

  // Mark pages as reclaimable by the OS. The virtual address range is kept,
  // and pages may remain resident if there is no memory pressure.
  // After this call, the contents are undefined until the next write.
  // This is only a hint: failures of the underlying call are ignored.
  void mark_reclaimable() {
    if (!ptr_ || !nbytes_) return;
#if defined(_WIN32)
    // MEM_RESET tells Windows the pages are no longer needed.
    // Pages remain committed but can be discarded under pressure.
    (void)VirtualAlloc(ptr_, nbytes_, MEM_RESET, PAGE_READWRITE);
#elif (defined(__unix__) || defined(__APPLE__)) && defined(MADV_FREE)
    // MADV_FREE is not part of POSIX and is missing from older system headers,
    // so only use it where the macro exists.
    (void)madvise(ptr_, nbytes_, MADV_FREE);
#endif
    // Other platforms: no-op, pages stay resident
  }

  // Start of the buffer, or nullptr when nothing is allocated.
  void *data() const { return ptr_; }
  // Size in bytes requested by the last successful non-zero allocate(), or 0.
  size_t size() const { return nbytes_; }

private:
  // Release the buffer (if any) with the API matching how allocate() obtained
  // it, and reset to the empty state.
  void deallocate() {
    if (!ptr_) return;
#if defined(_WIN32)
    VirtualFree(ptr_, 0, MEM_RELEASE);
#elif defined(__unix__) || defined(__APPLE__)
    munmap(ptr_, nbytes_);
#else
    std::free(ptr_);
#endif
    ptr_    = nullptr;
    nbytes_ = 0;
  }

  void *ptr_     = nullptr; // start of the owned buffer, nullptr when empty
  size_t nbytes_ = 0;       // requested size of the owned buffer in bytes
};
146+
147+
} // namespace finufft

include/finufft/plan.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44
#include <complex>
55
#include <cstdint>
66
#include <memory>
7+
#include <vector>
78

89
#include "finufft_common/common.h"
9-
#include "finufft_errors.h"
1010

1111
// All indexing in library that potentially can exceed 2^31 uses 64-bit signed.
1212
// This includes all calling arguments (eg M,N) that could be huge someday.

include/finufft_common/kernel.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,10 +78,12 @@ void set_kernel_shape_given_ns(finufft_spread_opts &opts, int debug);
7878
// Since for low upsampfacs, ns=16 can need only nc~12, allow such low nc here.
7979
// Note: spreadinterp.cpp compilation time grows with the gap between these bounds...
8080
inline constexpr int min_nc_given_ns(int ns) {
81-
return std::max(common::MIN_NC, ns - 4); // note must stay in bounds from constants.h
81+
// Parens around (std::max) prevent expansion of Windows min/max macros
82+
// that are defined in <windows.h> when NOMINMAX is not set.
83+
return (std::max)(common::MIN_NC, ns - 4); // note must stay in bounds from constants.h
8284
}
8385
inline constexpr int max_nc_given_ns(int ns) {
84-
return std::min(common::MAX_NC, ns + 3); // "
86+
return (std::min)(common::MAX_NC, ns + 3); // (std::min) for same Windows macro reason
8587
}
8688

8789
template<int NS, int NC> inline constexpr bool ValidKernelParams() noexcept {

src/fft.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -458,6 +458,12 @@ template<typename TF> void FINUFFT_PLAN_T<TF>::init_grid_kerFT_FFT() {
458458
if (opts.debug)
459459
printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, opts.fftw, nthr_fft,
460460
timer.elapsedsec());
461+
462+
if (opts.debug) {
463+
const size_t fwBatchBytes = size_t(nf()) * batchSize * sizeof(TC);
464+
printf("[%s] fwBatch alloc (%.3g GB):\tdeferred to execute\n", __func__,
465+
fwBatchBytes / 1e9);
466+
}
461467
}
462468
}
463469
template void FINUFFT_PLAN_T<float>::init_grid_kerFT_FFT();

src/utils.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515

1616
#if defined(_WIN32)
1717
#include <vector>
18+
#ifndef NOMINMAX
19+
#define NOMINMAX
20+
#endif
1821
#include <windows.h>
1922
#elif defined(__APPLE__)
2023
#include <sys/sysctl.h>

test/testutils.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,10 @@
1818

1919
// This switches FLT macro from double to float if SINGLE is defined, etc...
2020

21+
#include "finufft/memory.hpp"
2122
#include "finufft/utils.hpp"
2223
#include "utils/norms.hpp"
24+
#include <cstdint>
2325
#include <finufft/test_defs.hpp>
2426

2527
namespace finufft::common {
@@ -93,6 +95,24 @@ int main(int argc, char *argv[]) {
9395
if (std::abs(errtwonorm(M, &a[0], &b[0]) - 1.0) > relerr) return 1;
9496
if (std::abs(std::sqrt((FLT)M) * relerrtwonorm(M, &a[0], &b[0]) - 1.0) > relerr) return 1;
9597

98+
// test reclaimable workspace allocator...
99+
finufft::ReclaimableMemory buf;
100+
buf.mark_reclaimable(); // no-op before allocation
101+
if (!buf.allocate(0) || buf.size() != 0) return 1;
102+
103+
constexpr size_t nbytes = 8192;
104+
if (!buf.allocate(nbytes) || buf.data() == nullptr || buf.size() != nbytes) return 1;
105+
if ((reinterpret_cast<std::uintptr_t>(buf.data()) & 4095u) != 0u) return 1;
106+
107+
void *ptr = buf.data();
108+
if (!buf.allocate(nbytes) || buf.data() != ptr) return 1; // same-size reuse
109+
buf.mark_reclaimable(); // should be safe after alloc
110+
111+
finufft::ReclaimableMemory moved = std::move(buf);
112+
if (buf.data() != nullptr || buf.size() != 0) return 1;
113+
if (moved.data() != ptr || moved.size() != nbytes) return 1;
114+
moved.mark_reclaimable();
115+
96116
#if defined(__cpp_lib_math_special_functions)
97117
// std::cyl_bessel_i present: compare std vs custom series
98118
for (double x = 0.0; x <= 42.0; x += 0.5) {

test/threadsafe_execute.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,14 @@
1+
/* Regression test for thread-safe concurrent execute() on the same plan.
2+
3+
Creates a single 1D type-1 plan and then runs finufft_execute from
4+
multiple threads simultaneously, each into its own output array.
5+
Correctness is verified against a direct (slow) Fourier transform.
6+
This catches data races in internal scratch workspace allocation.
7+
8+
Usage: ./threadsafe_execute (exit 0 = pass, >0 = fail)
9+
Barbone, Mar 2026.
10+
*/
11+
112
#include <finufft.h>
213
#include <finufft_common/constants.h>
314
#include <finufft_opts.h>

0 commit comments

Comments
 (0)