Skip to content

Commit 7bd8adc

Browse files
authored
Fix getOptimalThreadCount segfault (#714)
This uses `/proc` instead of `cpuinfo` as it seems more stable across old kernels.
1 parent d4d08a4 commit 7bd8adc

File tree

3 files changed

+109
-105
lines changed

3 files changed

+109
-105
lines changed

include/finufft/finufft_utils.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,9 +61,9 @@ class FINUFFT_EXPORT CNTime {
6161
double initial;
6262
};
6363

64-
FINUFFT_NEVER_INLINE int getOptimalThreadCount();
65-
66-
FINUFFT_NEVER_INLINE int get_num_threads_parallel_block();
64+
#ifdef _OPENMP
65+
FINUFFT_NEVER_INLINE unsigned getOptimalThreadCount();
66+
#endif
6767

6868
} // namespace finufft::utils
6969

src/finufft_core.cpp

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -548,12 +548,6 @@ void finufft_default_opts_t(finufft_opts *o)
548548
// sphinx tag (don't remove): @defopts_end
549549
}
550550

551-
// Wrapper to cache the optimal thread count using a static variable.
552-
int getCachedOptimalThreadCount() {
553-
static const int cached_value = getOptimalThreadCount();
554-
return cached_value;
555-
}
556-
557551
// PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP
558552
template<typename TF>
559553
FINUFFT_PLAN_T<TF>::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, int iflag,
@@ -600,9 +594,13 @@ FINUFFT_PLAN_T<TF>::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, i
600594
// get stuff from args...
601595
fftSign = (iflag >= 0) ? 1 : -1; // clean up flag input
602596

597+
CNTime timer{};
598+
if (opts.debug > 1) {
599+
timer.start();
600+
}
603601
#ifdef _OPENMP
604602
// choose overall # threads...
605-
int ompmaxnthr = getCachedOptimalThreadCount();
603+
int ompmaxnthr = static_cast<int>(getOptimalThreadCount());
606604
int nthr = ompmaxnthr; // default: use as many physical cores as possible
607605
// (the above could be set, or suggested set, to 1 for small enough problems...)
608606
if (opts.nthreads > 0) {
@@ -632,6 +630,11 @@ FINUFFT_PLAN_T<TF>::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, i
632630
__func__);
633631
throw int(FINUFFT_ERR_NTHREADS_NOTVALID);
634632
}
633+
if (opts.debug > 1) {
634+
const auto sec = timer.elapsedsec();
635+
fprintf(stdout, "[%s] detected %d threads in %.3g sec.\n", __func__, nthr, sec);
636+
}
637+
635638
// (this sets/limits all downstream spread/interp, 1dkernel, and FFT thread counts...)
636639

637640
// choose batchSize for types 1,2 or 3... (uses int ceil(b/a)=1+(b-1)/a trick)
@@ -728,8 +731,7 @@ FINUFFT_PLAN_T<TF>::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, i
728731
}
729732

730733
// STEP 0: get Fourier coeffs of spreading kernel along each fine grid dim
731-
CNTime timer;
732-
timer.start();
734+
timer.restart();
733735
for (int idim = 0; idim < dim; ++idim)
734736
onedim_fseries_kernel(nfdim[idim], phiHat[idim], spopts);
735737
if (opts.debug)

src/finufft_utils.cpp

Lines changed: 95 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -135,10 +135,12 @@ double CNTime::elapsedsec() const
135135
return nowsec - initial;
136136
}
137137

138+
#ifdef _OPENMP
138139
namespace {
139140
#if defined(_WIN32)
140141
// Returns the number of physical CPU cores on Windows (excluding hyper-threaded cores)
141-
static int getPhysicalCoreCount() {
142+
unsigned getPhysicalCoreCount() {
143+
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
142144
int physicalCoreCount = 0;
143145

144146
// Determine the required buffer size.
@@ -161,9 +163,13 @@ static int getPhysicalCoreCount() {
161163
return MY_OMP_GET_MAX_THREADS();
162164
}
163165
return physicalCoreCount;
166+
#else
167+
// On non-x86 architectures, there should be no hyper-threading
168+
return MY_OMP_GET_MAX_THREADS();
169+
#endif
164170
}
165171

166-
static int getAllowedCoreCount() {
172+
unsigned getAllowedCoreCount() {
167173
DWORD_PTR processMask, systemMask;
168174
if (!GetProcessAffinityMask(GetCurrentProcess(), &processMask, &systemMask)) {
169175
return 0; // API call failed (should rarely happen for the current process)
@@ -180,7 +186,7 @@ static int getAllowedCoreCount() {
180186
#elif defined(__APPLE__)
181187

182188
// Returns the number of physical CPU cores on macOS (excluding hyper-threaded cores)
183-
static int getPhysicalCoreCount() {
189+
unsigned getPhysicalCoreCount() {
184190
int physicalCoreCount = 0;
185191
int cores = 0;
186192
size_t size = sizeof(cores);
@@ -194,81 +200,76 @@ static int getPhysicalCoreCount() {
194200
return physicalCoreCount;
195201
}
196202

197-
static int getAllowedCoreCount() {
203+
unsigned getAllowedCoreCount() {
198204
// MacOS does not support CPU affinity, so we return the maximum number of threads.
199205
return MY_OMP_GET_MAX_THREADS();
200206
}
201207

202208
#elif defined(__linux__)
203209
// Returns the number of physical CPU cores on Linux (excluding hyper-threaded cores)
204-
static int getPhysicalCoreCount() {
205-
int physicalCoreCount = 0;
206-
std::ifstream cpuinfo("/proc/cpuinfo");
207-
if (!cpuinfo.is_open()) return MY_OMP_GET_MAX_THREADS();
208-
209-
std::set<std::string> coreSet;
210-
std::string line;
211-
int physicalId = -1, coreId = -1;
212-
bool foundPhysical = false, foundCore = false;
213-
214-
while (std::getline(cpuinfo, line)) {
215-
// An empty line indicates the end of a processor block.
216-
if (line.empty()) {
217-
if (foundPhysical && foundCore)
218-
coreSet.insert(std::to_string(physicalId) + "-" + std::to_string(coreId));
219-
// Reset for the next processor block.
220-
foundPhysical = foundCore = false;
221-
physicalId = coreId = -1;
222-
} else {
223-
auto colonPos = line.find(':');
224-
if (colonPos == std::string::npos) continue;
225-
std::string key = line.substr(0, colonPos);
226-
std::string value = line.substr(colonPos + 1);
227-
// Trim whitespace.
228-
key.erase(key.find_last_not_of(" \t") + 1);
229-
value.erase(0, value.find_first_not_of(" \t"));
230-
231-
if (key == "physical id") {
232-
physicalId = std::stoi(value);
233-
foundPhysical = true;
234-
} else if (key == "core id") {
235-
coreId = std::stoi(value);
236-
foundCore = true;
210+
// Compatibility:
211+
// - Linux kernels 2.6 and later (provides /sys/devices/system/cpu topology interface)
212+
// - Any CPU architecture supported by the kernel (Intel, AMD, ARM, POWER, etc.)
213+
// - Works in containers or cgroups (reflects host topology)
214+
unsigned getPhysicalCoreCount() {
215+
// only x86_64 and x86_32 architectures support HT (hyper-threading)
216+
// in all other cases, we assume no HT and return MY_OMP_GET_MAX_THREADS()
217+
#if defined(__i386__) || defined(__x86_64__)
218+
// Parse strings like "0-3,5,7-9" → {0,1,2,3,5,7,8,9}
219+
auto parseCpuList = [](const std::string &s) {
220+
std::vector<int> cpus;
221+
std::istringstream iss(s);
222+
for (std::string tok; std::getline(iss, tok, ',');) {
223+
auto dash = tok.find('-');
224+
if (dash == std::string::npos) {
225+
cpus.push_back(std::stoi(tok));
226+
} else {
227+
int start = std::stoi(tok.substr(0, dash));
228+
int end = std::stoi(tok.substr(dash + 1));
229+
for (int i = start; i <= end; ++i) cpus.push_back(i);
237230
}
238231
}
232+
return cpus;
233+
};
234+
235+
// Read a single integer from the given sysfs file
236+
auto readInt = [&](const std::string &path, int &out) -> bool {
237+
std::ifstream f(path);
238+
if (!f.is_open()) return false;
239+
f >> out;
240+
return !f.fail();
241+
};
242+
243+
// 1) Read list of present CPUs
244+
std::ifstream presentF("/sys/devices/system/cpu/present");
245+
if (!presentF.is_open()) {
246+
return MY_OMP_GET_MAX_THREADS();
239247
}
240-
// In case the file doesn't end with an empty line.
241-
if (foundPhysical && foundCore)
242-
coreSet.insert(std::to_string(physicalId) + "-" + std::to_string(coreId));
243-
244-
if (!coreSet.empty()) {
245-
physicalCoreCount = static_cast<int>(coreSet.size());
246-
} else {
247-
// Fallback: try reading "cpu cores" from the first processor block.
248-
cpuinfo.clear();
249-
cpuinfo.seekg(0, std::ios::beg);
250-
while (std::getline(cpuinfo, line)) {
251-
auto colonPos = line.find(':');
252-
if (colonPos != std::string::npos) {
253-
std::string key = line.substr(0, colonPos);
254-
std::string value = line.substr(colonPos + 1);
255-
key.erase(key.find_last_not_of(" \t") + 1);
256-
value.erase(0, value.find_first_not_of(" \t"));
257-
if (key == "cpu cores") {
258-
physicalCoreCount = std::stoi(value);
259-
break;
260-
}
261-
}
262-
}
248+
std::string presentLine;
249+
std::getline(presentF, presentLine);
250+
auto cpus = parseCpuList(presentLine);
251+
252+
// 2) For each CPU, read its package & core IDs
253+
std::set<std::pair<int, int>> physicalCores;
254+
for (int cpu : cpus) {
255+
std::string topoBase =
256+
"/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/topology/";
257+
int pkg = -1, core = -1;
258+
if (!readInt(topoBase + "physical_package_id", pkg)) continue;
259+
if (!readInt(topoBase + "core_id", core)) continue;
260+
physicalCores.emplace(pkg, core);
263261
}
264262

265-
if (physicalCoreCount == 0) {
266-
return MY_OMP_GET_MAX_THREADS();
263+
// 3) Return count if successful, else fallback
264+
if (!physicalCores.empty()) {
265+
return static_cast<unsigned>(physicalCores.size());
267266
}
268-
return physicalCoreCount;
267+
#endif
268+
// in ARM and RISKV we only need this
269+
return MY_OMP_GET_MAX_THREADS();
269270
}
270271

271-
static int getAllowedCoreCount() {
272+
unsigned getAllowedCoreCount() {
272273
cpu_set_t cpuSet;
273274
CPU_ZERO(&cpuSet);
274275
if (sched_getaffinity(0, sizeof(cpu_set_t), &cpuSet) != 0) {
@@ -287,44 +288,45 @@ static int getAllowedCoreCount() {
287288

288289
#warning "Unknown platform. Impossible to detect the number of physical cores."
289290
// Fallback version if none of the above platforms is detected.
290-
static int getPhysicalCoreCount() { return MY_OMP_GET_MAX_THREADS(); }
291-
static int getAllowedCoreCount() { return MY_OMP_GET_MAX_THREADS(); }
291+
unsigned getPhysicalCoreCount() { return MY_OMP_GET_MAX_THREADS(); }
292+
unsigned getAllowedCoreCount() { return MY_OMP_GET_MAX_THREADS(); }
292293

293294
#endif
294295

295296
} // namespace
296297

297-
int getOptimalThreadCount() {
298+
unsigned getOptimalThreadCount() {
298299
// if the user has set the OMP_NUM_THREADS environment variable, use that value
299-
const auto OMP_THREADS = std::getenv("OMP_NUM_THREADS");
300-
if (OMP_THREADS) {
301-
return std::stoi(OMP_THREADS);
302-
}
303-
// otherwise, use the min between number of physical cores or the number of allowed
304-
// cores (e.g. by taskset)
305-
const auto physicalCores = getPhysicalCoreCount();
306-
const auto allowedCores = getAllowedCoreCount();
307-
if (physicalCores < allowedCores) {
308-
return physicalCores;
309-
}
310-
return allowedCores;
311-
}
300+
static const auto cached_threads = []() -> unsigned {
301+
const auto OMP_THREADS = std::getenv("OMP_NUM_THREADS");
302+
if (OMP_THREADS) {
303+
try {
304+
return std::stoi(OMP_THREADS);
305+
} catch (...) {
306+
std::cerr << "Invalid OMP_NUM_THREADS value: " << OMP_THREADS
307+
<< ". using default thread count." << std::endl;
308+
}
309+
}
310+
// otherwise, use the min between number of physical cores or the number of allowed
311+
// cores (e.g. by taskset)
312+
try {
313+
const auto physicalCores = getPhysicalCoreCount();
314+
const auto allowedCores = getAllowedCoreCount();
315+
if (physicalCores < allowedCores) {
316+
return physicalCores;
317+
}
318+
return allowedCores;
319+
} catch (const std::exception &e) {
320+
std::cerr << "Error determining optimal thread count: " << e.what()
321+
<< ". Using OpenMP default thread count." << std::endl;
322+
}
323+
return MY_OMP_GET_MAX_THREADS();
324+
}();
312325

313-
// -------------------------- openmp helpers -------------------------------
314-
int get_num_threads_parallel_block()
315-
// return how many threads an omp parallel block would use.
316-
// omp_get_max_threads() does not report this; consider case of NESTED=0.
317-
// Why is there no such routine? Barnett 5/22/20
318-
{
319-
int nth_used;
320-
#pragma omp parallel
321-
{
322-
#pragma omp single
323-
nth_used = MY_OMP_GET_NUM_THREADS();
324-
}
325-
return nth_used;
326+
return cached_threads;
326327
}
327328

329+
#endif // _OPENMP
328330
// ---------- thread-safe rand number generator for Windows platform ---------
329331
// (note this is used by macros in test_defs.h, and supplied in linux/macosx)
330332
#ifdef _WIN32

0 commit comments

Comments
 (0)