Skip to content

Commit 86c6764

Browse files
authored
Merge pull request #3701 from Flamefire/increase_max_cpu_count_support
enhance sched_getaffinity function to avoid early crash when counting available cores on systems with more than 1024 cores
2 parents 584de26 + 3c1279e commit 86c6764

File tree

2 files changed

+26
-14
lines changed

2 files changed

+26
-14
lines changed

easybuild/tools/systemtools.py

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
@author: Ward Poelmans (Ghent University)
3030
"""
3131
import ctypes
32+
import errno
3233
import fcntl
3334
import grp # @UnresolvedImport
3435
import os
@@ -160,24 +161,35 @@ class SystemToolsException(Exception):
160161
def sched_getaffinity():
161162
"""Determine list of available cores for current process."""
162163
cpu_mask_t = ctypes.c_ulong
163-
cpu_setsize = 1024
164164
n_cpu_bits = 8 * ctypes.sizeof(cpu_mask_t)
165-
n_mask_bits = cpu_setsize // n_cpu_bits
166-
167-
class cpu_set_t(ctypes.Structure):
168-
"""Class that implements the cpu_set_t struct."""
169-
_fields_ = [('bits', cpu_mask_t * n_mask_bits)]
170165

171166
_libc_lib = find_library('c')
172-
_libc = ctypes.cdll.LoadLibrary(_libc_lib)
167+
_libc = ctypes.CDLL(_libc_lib, use_errno=True)
173168

174169
pid = os.getpid()
175-
cs = cpu_set_t()
176-
ec = _libc.sched_getaffinity(os.getpid(), ctypes.sizeof(cpu_set_t), ctypes.pointer(cs))
177-
if ec == 0:
178-
_log.debug("sched_getaffinity for pid %s successful", pid)
179-
else:
180-
raise EasyBuildError("sched_getaffinity failed for pid %s ec %s", pid, ec)
170+
171+
cpu_setsize = 1024 # Max number of CPUs currently detectable
172+
max_cpu_setsize = cpu_mask_t(-1).value // 4 # (ULONG_MAX / 4, i.e. LONG_MAX / 2)
173+
# Limit it to something reasonable but still big enough
174+
max_cpu_setsize = min(max_cpu_setsize, 1e9)
175+
while cpu_setsize < max_cpu_setsize:
176+
n_mask_bits = cpu_setsize // n_cpu_bits
177+
178+
class cpu_set_t(ctypes.Structure):
179+
"""Class that implements the cpu_set_t struct."""
180+
_fields_ = [('bits', cpu_mask_t * n_mask_bits)]
181+
182+
cs = cpu_set_t()
183+
ec = _libc.sched_getaffinity(pid, ctypes.sizeof(cpu_set_t), ctypes.pointer(cs))
184+
if ec == 0:
185+
_log.debug("sched_getaffinity for pid %s successful", pid)
186+
break
187+
elif ctypes.get_errno() != errno.EINVAL:
188+
raise EasyBuildError("sched_getaffinity failed for pid %s errno %s", pid, ctypes.get_errno())
189+
cpu_setsize *= 2
190+
191+
if ec != 0:
192+
raise EasyBuildError("sched_getaffinity failed finding a large enough cpuset for pid %s", pid)
181193

182194
cpus = []
183195
for bitmask in cs.bits:

test/framework/systemtools.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -454,7 +454,7 @@ def test_cpu_speed_native(self):
454454
"""Test getting CPU speed."""
455455
cpu_speed = get_cpu_speed()
456456
self.assertTrue(isinstance(cpu_speed, float) or cpu_speed is None)
457-
self.assertTrue(cpu_speed > 0.0 or cpu_speed is None)
457+
self.assertTrue(cpu_speed is None or cpu_speed > 0.0)
458458

459459
def test_cpu_speed_linux(self):
460460
"""Test getting CPU speed (mocked for Linux)."""

0 commit comments

Comments
 (0)