Skip to content

Commit ca811d8

Browse files
author
Vasileios Karakasis
authored
Merge pull request #1834 from jjotero/test/affinity-sockets
[test] Adjust affinity tests to new Slurm config on Eiger
2 parents dce8940 + 2451d6f commit ca811d8

File tree

1 file changed

+126
-88
lines changed

1 file changed

+126
-88
lines changed

cscs-checks/prgenv/affinity_check.py

Lines changed: 126 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -41,40 +41,57 @@ class to figure out the processor's topology. The content of this reference
4141
# }
4242
system = variable(dict, value={})
4343

44+
valid_systems = [
45+
'daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc',
46+
'eiger:mc', 'pilatus:mc',
47+
'ault:amdv100'
48+
]
49+
valid_prog_environs = [
50+
'PrgEnv-gnu', 'PrgEnv-cray', 'PrgEnv-intel', 'PrgEnv-pgi'
51+
]
52+
build_system = 'Make'
53+
54+
# The github URL cannot be specified as `self.sourcedir` as that
55+
# would prevent the src folder from being copied to stage which is
56+
# necessary since these tests need files from it.
57+
sourcesdir = os.path.join('src/affinity_ref')
58+
prebuild_cmds = ['git clone https://github.com/vkarak/affinity']
59+
60+
# Dict with the partition's topology - output of "lscpu -e"
61+
topology = variable(dict, value={
62+
'dom:gpu': 'topo_dom_gpu.json',
63+
'dom:mc': 'topo_dom_mc.json',
64+
'daint:gpu': 'topo_dom_gpu.json',
65+
'daint:mc': 'topo_dom_mc.json',
66+
'eiger:mc': 'topo_eiger_mc.json',
67+
'pilatus:mc': 'topo_eiger_mc.json',
68+
'ault:amdv100': 'topo_ault_amdv100.json',
69+
})
70+
71+
# Reference topology file as required variable
72+
topo_file = variable(str)
73+
74+
maintainers = ['RS', 'SK']
75+
tags = {'production', 'scs', 'maintenance', 'craype'}
76+
4477
def __init__(self):
45-
self.valid_systems = ['daint:gpu', 'daint:mc',
46-
'dom:gpu', 'dom:mc', 'eiger:mc',
47-
'ault:amdv100']
48-
self.valid_prog_environs = [
49-
'PrgEnv-gnu', 'PrgEnv-cray', 'PrgEnv-intel', 'PrgEnv-pgi'
50-
]
51-
self.build_system = 'Make'
78+
# FIXME: These two right now cannot be set in the class body.
79+
self.executable = './affinity/affinity'
5280
self.build_system.options = ['-C affinity', 'MPI=1']
5381

54-
# The github URL cannot be specified as `self.sourcedir` as that
55-
# would prevent the src folder from being copied to stage which is
56-
# necessary since these tests need files from it.
57-
self.sourcesdir = os.path.join('src/affinity_ref')
58-
self.prebuild_cmds = ['git clone https://github.com/vkarak/affinity']
59-
self.executable = './affinity/affinity'
82+
@rfm.run_before('sanity')
83+
def set_sanity(self):
6084
self.sanity_patterns = self.assert_consumed_cpu_set()
61-
self.maintainers = ['RS', 'SK']
62-
self.tags = {'production', 'scs', 'maintenance', 'craype'}
63-
64-
# Dict with the partition's topology - output of "lscpu -e"
65-
self.topology = {
66-
'dom:gpu': 'topo_dom_gpu.json',
67-
'dom:mc': 'topo_dom_mc.json',
68-
'daint:gpu': 'topo_dom_gpu.json',
69-
'daint:mc': 'topo_dom_mc.json',
70-
'eiger:mc': 'topo_eiger_mc.json',
71-
'ault:amdv100': 'topo_ault_amdv100.json',
72-
}
7385

7486
@rfm.run_before('compile')
7587
def set_topo_file(self):
88+
'''Set the topo_file variable.
89+
90+
If not present in the topology dict, leave it as required.
91+
'''
7692
cp = self.current_partition.fullname
77-
self.topo_file = self.topology[cp]
93+
if cp in self.topology:
94+
self.topo_file = self.topology[cp]
7895

7996
# FIXME: Update the hook below once the PR #1773 is merged.
8097
@rfm.run_after('compile')
@@ -171,6 +188,7 @@ def parse_output(self):
171188
'''Extract the data from the affinity tool.'''
172189

173190
re_aff_cpus = r'CPU affinity: \[\s+(?P<cpus>[\d+\s+]+)\]'
191+
174192
def parse_cpus(x):
175193
return sorted([int(xi) for xi in x.split()])
176194

@@ -219,10 +237,7 @@ class AffinityOpenMPBase(AffinityTestBase):
219237

220238
omp_bind = variable(str)
221239
omp_proc_bind = variable(str, value='spread')
222-
223-
def __init__(self):
224-
super().__init__()
225-
self.num_tasks = 1
240+
num_tasks = 1
226241

227242
@property
228243
def ncpus_per_task(self):
@@ -251,10 +266,7 @@ class OneThreadPerLogicalCoreOpenMP(AffinityOpenMPBase):
251266
'''Pin each OMP thread to a different logical core.'''
252267

253268
omp_bind = 'threads'
254-
255-
def __init__(self):
256-
super().__init__()
257-
self.descr = 'Pin one OMP thread per CPU.'
269+
descr = 'Pin one OMP thread per CPU.'
258270

259271
@property
260272
def num_omp_threads(self):
@@ -280,10 +292,7 @@ class OneThreadPerPhysicalCoreOpenMP(AffinityOpenMPBase):
280292
'''Pin each OMP thread to a different physical core.'''
281293

282294
omp_bind = 'cores'
283-
284-
def __init__(self):
285-
super().__init__()
286-
self.descr = 'Pin one OMP thread per core.'
295+
descr = 'Pin one OMP thread per core.'
287296

288297
@property
289298
def num_omp_threads(self):
@@ -311,10 +320,8 @@ def consume_cpu_set(self):
311320
class OneThreadPerPhysicalCoreOpenMPnomt(OneThreadPerPhysicalCoreOpenMP):
312321
'''Only one cpu per core booked without multithread.'''
313322

314-
def __init__(self):
315-
super().__init__()
316-
self.descr = 'Pin one OMP thread per core wo. multithreading.'
317-
self.use_multithreading = False
323+
use_multithreading = False
324+
descr = 'Pin one OMP thread per core wo. multithreading.'
318325

319326
@property
320327
def ncpus_per_task(self):
@@ -332,10 +339,7 @@ class OneThreadPerSocketOpenMP(AffinityOpenMPBase):
332339
'''Pin each OMP thread to a different socket.'''
333340

334341
omp_bind = 'sockets'
335-
336-
def __init__(self):
337-
super().__init__()
338-
self.descr = 'Pin one OMP thread per socket.'
342+
descr = 'Pin one OMP thread per socket.'
339343

340344
@property
341345
def num_omp_threads(self):
@@ -366,11 +370,8 @@ class OneTaskPerSocketOpenMPnomt(AffinityOpenMPBase):
366370

367371
omp_bind = 'sockets'
368372
omp_proc_bind = 'close'
369-
370-
def __init__(self):
371-
super().__init__()
372-
self.descr = 'One task per socket - wo. multithreading.'
373-
self.use_multithreading = False
373+
descr = 'One task per socket - wo. multithreading.'
374+
use_multithreading = False
374375

375376
@property
376377
def num_omp_threads(self):
@@ -423,29 +424,24 @@ class OneTaskPerSocketOpenMP(OneTaskPerSocketOpenMPnomt):
423424
and the number of OMP threads.
424425
'''
425426

426-
def __init__(self):
427-
super().__init__()
428-
self.descr = 'One task per socket - w. multithreading.'
429-
self.use_multithreading = True
427+
descr = 'One task per socket - w. multithreading.'
428+
use_multithreading = True
430429

431430
@property
432431
def num_omp_threads(self):
433432
return int(self.num_cpus/self.num_sockets)
434433

435434

436435
@rfm.simple_test
437-
class ConsecutiveNumaFilling(AffinityTestBase):
438-
'''Fill the NUMA nodes with the tasks in consecutive order.
436+
class ConsecutiveSocketFilling(AffinityTestBase):
437+
'''Fill the sockets with the tasks in consecutive order.
439438
440439
This test uses as many tasks as physical cores available in a node.
441440
Multithreading is disabled.
442441
'''
443442

444443
cpu_bind = 'rank'
445-
446-
def __init__(self):
447-
super().__init__()
448-
self.use_multithreading = False
444+
use_multithreading = False
449445

450446
@rfm.run_before('run')
451447
def set_tasks(self):
@@ -456,10 +452,10 @@ def set_tasks(self):
456452
def consume_cpu_set(self):
457453
'''Check that all physical cores have been used in the right order.'''
458454
task_count = 0
459-
for numa_number in range(self.num_numa_nodes):
460-
# Keep track of the CPUs present in this NUMA node
455+
for socket_number in range(self.num_sockets):
456+
# Keep track of the CPUs present in this socket
461457
cpus_present = set()
462-
for task_number in range(int(self.num_tasks/self.num_numa_nodes)):
458+
for task_number in range(int(self.num_tasks/self.num_sockets)):
463459
# Get the list of CPUs with affinity
464460
affinity_set = self.aff_cpus[task_count]
465461

@@ -477,14 +473,14 @@ def consume_cpu_set(self):
477473

478474
task_count += 1
479475

480-
# Ensure all CPUs belong to the same NUMA node
481-
cpuset_by_numa = self.get_sibling_cpus(
482-
next(iter(cpus_present)), by='node'
476+
# Ensure all CPUs belong to the same socket
477+
cpuset_by_socket = self.get_sibling_cpus(
478+
next(iter(cpus_present)), by='socket'
483479
)
484-
if (not all(cpu in cpuset_by_numa for cpu in cpus_present) and
485-
len(cpuset_by_numa)==len(cpus_present)):
480+
if (not all(cpu in cpuset_by_socket for cpu in cpus_present) and
481+
len(cpuset_by_socket) == len(cpus_present)):
486482
raise SanityError(
487-
f'numa node {numa_number} not filled in order'
483+
f'socket {socket_number} not filled in order'
488484
)
489485

490486
else:
@@ -493,54 +489,96 @@ def consume_cpu_set(self):
493489

494490

495491
@rfm.simple_test
496-
class AlternateNumaFilling(AffinityTestBase):
497-
'''Numa nodes are filled in a round-robin fashion.
492+
class AlternateSocketFilling(AffinityTestBase):
493+
'''Sockets are filled in a round-robin fashion.
498494
499495
This test uses as many tasks as physical cores available in a node.
500496
Multithreading is disabled.
501497
'''
502498

503-
def __init__(self):
504-
super().__init__()
505-
self.use_multithreading = False
499+
use_multithreading = False
506500

507501
@rfm.run_before('run')
508502
def set_tasks(self):
509503
self.num_tasks = int(self.num_cpus/self.num_cpus_per_core)
510504
self.num_cpus_per_task = 1
511-
self.num_tasks_per_numa = int(self.num_tasks/self.num_numa_nodes)
505+
self.num_tasks_per_socket = int(self.num_tasks/self.num_sockets)
512506

513507
@rfm.run_before('sanity')
514508
def consume_cpu_set(self):
515-
'''Check that consecutive tasks are round-robin pinned to numa nodes.'''
509+
'''Check that consecutive tasks are round-robin pinned to sockets.'''
516510

517-
# Get a set per numa node to keep track of the CPUs
518-
numa_nodes = [set() for s in range(self.num_numa_nodes)]
511+
# Get a set per socket to keep track of the CPUs
512+
sockets = [set() for s in range(self.num_sockets)]
519513
task_count = 0
520-
for task in range(self.num_tasks_per_numa):
521-
for s in range(self.num_numa_nodes):
514+
for task in range(self.num_tasks_per_socket):
515+
for s in range(self.num_sockets):
522516
# Get the list of CPUs with affinity
523517
affinity_set = self.aff_cpus[task_count]
524518

525519
# Only 1 CPU per affinity set is allowed
526520
if ((len(affinity_set) > 1) or
527-
(any(cpu in numa_nodes[s] for cpu in affinity_set)) or
528-
(any(cpu not in self.numa_nodes[s] for cpu in affinity_set))):
521+
(any(cpu in sockets[s] for cpu in affinity_set)) or
522+
(any(cpu not in self.sockets[s] for cpu in affinity_set))):
529523
raise SanityError(
530524
f'incorrect affinity set for task {task_count}'
531525
)
532526

533527
else:
534-
numa_nodes[s].update(
528+
sockets[s].update(
535529
self.get_sibling_cpus(affinity_set[0], by='core')
536530
)
537531

538532
task_count += 1
539533

540-
# Check that all numa nodes have the same CPU count
541-
if not all(len(s) == (task+1)*2 for s in numa_nodes):
534+
# Check that all sockets have the same CPU count
535+
if not all(len(s) == (task+1)*2 for s in sockets):
542536
self.cpu_set.add(-1)
543537

544-
# Decrement the NUMA nodes from the CPU set
545-
for s in numa_nodes:
538+
# Decrement the socket set from the CPU set
539+
for s in sockets:
546540
self.cpu_set -= s
541+
542+
543+
@rfm.simple_test
544+
class OneTaskPerNumaNode(AffinityTestBase):
545+
'''Place a task on each NUMA node.
546+
547+
The trick here is to "pad" the tasks with --cpus-per-task.
548+
The same could be done to target any cache level instead.
549+
Multithreading is disabled.
550+
'''
551+
552+
valid_systems = ['eiger:mc', 'pilatus:mc']
553+
use_multithreading = False
554+
num_cpus_per_task = required
555+
556+
@rfm.run_before('compile')
557+
def build_settings(self):
558+
self.build_system.options += ['OPENMP=0']
559+
560+
@rfm.run_before('run')
561+
def set_tasks(self):
562+
self.num_tasks = self.num_numa_nodes
563+
if self.current_partition.fullname in {'eiger:mc', 'pilatus:mc'}:
564+
self.num_cpus_per_task = 16
565+
566+
@rfm.run_before('sanity')
567+
def consume_cpu_set(self):
568+
'''Check that each task lives in a different NUMA node.'''
569+
570+
if len(self.aff_cpus) != self.num_numa_nodes:
571+
raise SanityError(
572+
'number of tasks does not match the number of numa nodes'
573+
)
574+
575+
for numa_node, aff_set in enumerate(self.aff_cpus):
576+
cpuset_by_numa = self.get_sibling_cpus(aff_set[0], by='node')
577+
if (len(aff_set) != self.num_cpus_per_task or
578+
any(cpu not in cpuset_by_numa for cpu in aff_set)):
579+
raise SanityError(
580+
f'incorrect affinity set for numa node {numa_node}'
581+
)
582+
else:
583+
# Decrement the current NUMA node from the available CPU set
584+
self.cpu_set -= cpuset_by_numa

0 commit comments

Comments
 (0)