Skip to content

Commit ba93361

Browse files
author
Samuel Moors
committed
set srun --cpus-per-task in custom eessi-srun launcher
1 parent b6446a2 commit ba93361

File tree

3 files changed

+39
-12
lines changed

3 files changed

+39
-12
lines changed

eessi/testsuite/common_config.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import os
22

3+
from reframe.core.backends import register_launcher
4+
from reframe.core.launchers.mpi import SrunLauncher
35
import reframe.core.logging as rflog
46

57
perflog_format = '|'.join([
@@ -143,3 +145,24 @@ def get_sbatch_account():
143145
err_msg += " It is required to set `SBATCH_ACCOUNT` to run on this system."
144146
raise ValueError(err_msg)
145147
return sbatch_account
148+
149+
150+
@register_launcher('eessi-srun')
class EESSISrunLauncher(SrunLauncher):
    """
    srun launcher tailored to the EESSI test suite.

    Passes ``--cpus-per-task`` to srun based on the job's ``used_cpus_per_task`` when that is set,
    and falls back to the job's ``num_cpus_per_task`` otherwise.
    """

    def command(self, job):
        """
        Build the srun command line for the given job.

        :param job: job object; ``used_cpus_per_task`` is read when present, else ``num_cpus_per_task``
        :return: list of command-line tokens, starting with ``srun``
        """
        # Prefer the cpus actually used per task; the attribute may be absent from the job object.
        cpus_per_task = getattr(job, 'used_cpus_per_task', None)
        if not cpus_per_task and self.use_cpus_per_task:
            # Fall back to the requested cpus per task, only when the launcher allows passing it.
            cpus_per_task = job.num_cpus_per_task

        cmd = ['srun']
        if cpus_per_task:
            cmd.append(f'--cpus-per-task={cpus_per_task}')

        if self.env_vars:
            # Forward the launcher's environment variables as a single comma-separated --export option.
            exported = ','.join(f'{name}={value}' for name, value in self.env_vars.items())
            cmd.append(f'--export={exported}')

        return cmd

eessi/testsuite/eessi_mixin.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ class EESSI_Mixin(RegressionTestPlugin):
5858
bench_name = None
5959
is_ci_test = False
6060
num_tasks_per_compute_unit = 1
61-
used_cpus_per_task = None # effectively used cpus per task
61+
used_cpus_per_task = None # actually used cpus per task
6262
always_request_gpus = None
6363
require_buildenv_module = False
6464
require_internet = False
@@ -229,6 +229,14 @@ def EESSI_mixin_validate_setup(self):
229229
# i.e. exists in their respective dict from eessi.testsuite.constants
230230
self.EESSI_mixin_validate_item_in_list('compute_unit', COMPUTE_UNITS[:])
231231

232+
# Check that default ReFrame srun launcher is not used
233+
launcher = self.current_partition.launcher_type().registered_name
234+
if launcher == 'srun':
235+
msg = ('The default ReFrame srun launcher is not fully supported by the EESSI test suite.'
236+
' Please use `eessi.testsuite.common_config.eessi-srun` instead.')
237+
log_once(self, msg, msg_id='3', level='warning')
238+
239+
232240
@run_after('setup')
233241
def EESSI_mixin_assign_tasks_per_compute_unit(self):
234242
"""Call hooks to assign tasks per compute unit, set OMP_NUM_THREADS, and set compact process binding"""

eessi/testsuite/hooks.py

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ def _set_job_resources(test: rfm.RegressionTest):
3030
test.job.num_tasks_per_socket = test.num_tasks_per_socket
3131
test.job.num_cpus_per_task = test.num_cpus_per_task
3232
test.job.use_smt = test.use_multithreading
33+
test.job.used_cpus_per_task = test.used_cpus_per_task
3334

3435

3536
def _assign_default_num_cpus_per_node(test: rfm.RegressionTest):
@@ -161,12 +162,7 @@ def assign_tasks_per_compute_unit(test: rfm.RegressionTest):
161162
if not test.used_cpus_per_task:
162163
test.used_cpus_per_task = test.num_cpus_per_task
163164

164-
# Tell srun to use the number of actually used cpus per tasks, not the number of allocated cpus per task.
165-
# With the default srun launcher in ReFrame this doesn’t work because it sets `srun --cpus-per-task`.
166-
# To make it work, a custom srun launcher should be used that doesn’t set `srun --cpus-per-task`.
167-
test.env_vars['SLURM_CPUS_PER_TASK'] = test.used_cpus_per_task
168-
169-
if test.current_partition.launcher_type().registered_name == 'srun':
165+
if test.current_partition.launcher_type().registered_name in ['eessi-srun', 'srun']:
170166
# Make sure srun inherits --cpus-per-task from the job environment for Slurm versions >= 22.05 < 23.11,
171167
# ensuring the same task binding across all Slurm versions.
172168
# https://bugs.schedmd.com/show_bug.cgi?id=13351
@@ -705,8 +701,9 @@ def set_compact_process_binding(test: rfm.RegressionTest):
705701
check_proc_attribute_defined(test, 'num_cpus_per_core')
706702
num_cpus_per_core = test.current_partition.processor.num_cpus_per_core
707703
physical_cpus_per_task = int(test.used_cpus_per_task / num_cpus_per_core)
704+
launcher = test.current_partition.launcher_type().registered_name
708705

709-
if test.current_partition.launcher_type().registered_name == 'mpirun':
706+
if launcher == 'mpirun':
710707
# Do binding for intel and OpenMPI's mpirun, and srun
711708
env_vars = {
712709
'I_MPI_PIN_CELL': 'core', # Don't bind to hyperthreads, only to physcial cores
@@ -725,17 +722,16 @@ def set_compact_process_binding(test: rfm.RegressionTest):
725722
if any(re.search(pattern, x) for x in test.modules):
726723
test.job.launcher.options.append(f'--map-by slot:PE={physical_cpus_per_task} --report-bindings')
727724
log(f'Set launcher command to {test.job.launcher.run_command(test.job)}')
728-
elif test.current_partition.launcher_type().registered_name == 'srun':
725+
elif launcher in ['eessi-srun', 'srun']:
729726
# Set compact binding for SLURM. Only effective if the task/affinity plugin is enabled
730727
# and when number of tasks times cpus per task equals either socket, core or thread count
731728
env_vars = {
732729
'SLURM_DISTRIBUTION': 'block:block',
733-
'SLURM_CPU_BIND': 'verbose',
730+
'SLURM_CPU_BIND': 'verbose,cores',
734731
}
735732
else:
736733
env_vars = {}
737-
msg = "hooks.set_compact_process_binding does not support the current launcher"
738-
msg += f" ({test.current_partition.launcher_type().registered_name})."
734+
msg = f"hooks.set_compact_process_binding does not support the current launcher ({launcher})."
739735
msg += " The test will run, but using the default binding strategy of your parallel launcher."
740736
msg += " This may lead to suboptimal performance."
741737
msg += " Please expand the functionality of hooks.set_compact_process_binding for your parallel launcher."

0 commit comments

Comments
 (0)