Skip to content

Commit 7d6b831

Browse files
author
Vasileios Karakasis
authored
Merge pull request #2148 from rsarm/new-syntax/slurm
[test] Modernize Slurm test
2 parents a40ac60 + be91017 commit 7d6b831

File tree

1 file changed

+143
-141
lines changed

1 file changed

+143
-141
lines changed

cscs-checks/system/slurm/slurm.py

Lines changed: 143 additions & 141 deletions
Original file line numberDiff line numberDiff line change
@@ -11,57 +11,54 @@
1111
class SlurmSimpleBaseCheck(rfm.RunOnlyRegressionTest):
1212
'''Base class for Slurm simple binary tests'''
1313

14-
def __init__(self):
15-
self.valid_systems = ['daint:gpu', 'daint:mc',
16-
'dom:gpu', 'dom:mc',
17-
'arolla:cn', 'arolla:pn',
18-
'tsa:cn', 'tsa:pn',
19-
'daint:xfer', 'eiger:mc',
20-
'pilatus:mc']
21-
self.valid_prog_environs = ['PrgEnv-cray']
22-
self.tags = {'slurm', 'maintenance', 'ops',
23-
'production', 'single-node'}
24-
self.num_tasks_per_node = 1
14+
valid_systems = ['daint:gpu', 'daint:mc',
15+
'dom:gpu', 'dom:mc',
16+
'arolla:cn', 'arolla:pn',
17+
'tsa:cn', 'tsa:pn',
18+
'daint:xfer', 'eiger:mc',
19+
'pilatus:mc']
20+
valid_prog_environs = ['PrgEnv-cray']
21+
tags = {'slurm', 'maintenance', 'ops',
22+
'production', 'single-node'}
23+
num_tasks_per_node = 1
24+
maintainers = ['RS', 'VH']
25+
26+
@run_after('init')
27+
def customize_systems(self):
2528
if self.current_system.name in ['arolla', 'tsa']:
2629
self.valid_prog_environs = ['PrgEnv-gnu', 'PrgEnv-pgi']
2730
self.exclusive_access = True
2831

29-
self.maintainers = ['RS', 'VH']
30-
3132

3233
class SlurmCompiledBaseCheck(rfm.RegressionTest):
3334
'''Base class for Slurm tests that require compiling some code'''
3435

35-
def __init__(self):
36-
self.valid_systems = ['daint:gpu', 'daint:mc',
37-
'dom:gpu', 'dom:mc']
38-
self.valid_prog_environs = ['PrgEnv-cray']
39-
self.tags = {'slurm', 'maintenance', 'ops',
40-
'production', 'single-node'}
41-
self.num_tasks_per_node = 1
42-
43-
self.maintainers = ['RS', 'VH']
36+
valid_systems = ['daint:gpu', 'daint:mc',
37+
'dom:gpu', 'dom:mc']
38+
valid_prog_environs = ['PrgEnv-cray']
39+
tags = {'slurm', 'maintenance', 'ops',
40+
'production', 'single-node'}
41+
num_tasks_per_node = 1
42+
maintainers = ['RS', 'VH']
4443

4544

4645
@rfm.simple_test
4746
class HostnameCheck(SlurmSimpleBaseCheck):
48-
def __init__(self):
49-
super().__init__()
50-
self.executable = '/bin/hostname'
51-
self.valid_prog_environs = ['builtin']
52-
self.hostname_patt = {
53-
'arolla:cn': r'^arolla-cn\d{3}$',
54-
'arolla:pn': r'^arolla-pp\d{3}$',
55-
'tsa:cn': r'^tsa-cn\d{3}$',
56-
'tsa:pn': r'^tsa-pp\d{3}$',
57-
'daint:gpu': r'^nid\d{5}$',
58-
'daint:mc': r'^nid\d{5}$',
59-
'daint:xfer': r'^datamover\d{2}.cscs.ch$',
60-
'dom:gpu': r'^nid\d{5}$',
61-
'dom:mc': r'^nid\d{5}$',
62-
'eiger:mc': r'^nid\d{6}$',
63-
'pilatus:mc': r'^nid\d{6}$'
64-
}
47+
executable = '/bin/hostname'
48+
valid_prog_environs = ['builtin']
49+
hostname_patt = {
50+
'arolla:cn': r'^arolla-cn\d{3}$',
51+
'arolla:pn': r'^arolla-pp\d{3}$',
52+
'tsa:cn': r'^tsa-cn\d{3}$',
53+
'tsa:pn': r'^tsa-pp\d{3}$',
54+
'daint:gpu': r'^nid\d{5}$',
55+
'daint:mc': r'^nid\d{5}$',
56+
'daint:xfer': r'^datamover\d{2}.cscs.ch$',
57+
'dom:gpu': r'^nid\d{5}$',
58+
'dom:mc': r'^nid\d{5}$',
59+
'eiger:mc': r'^nid\d{6}$',
60+
'pilatus:mc': r'^nid\d{6}$'
61+
}
6562

6663
@run_before('sanity')
6764
def set_sanity_patterns(self):
@@ -74,45 +71,48 @@ def set_sanity_patterns(self):
7471

7572
@rfm.simple_test
7673
class EnvironmentVariableCheck(SlurmSimpleBaseCheck):
77-
def __init__(self):
78-
super().__init__()
79-
self.num_tasks = 2
80-
self.valid_systems = ['daint:gpu', 'daint:mc',
81-
'dom:gpu', 'dom:mc',
82-
'arolla:cn', 'arolla:pn',
83-
'tsa:cn', 'tsa:pn',
84-
'eiger:mc', 'pilatus:mc']
85-
self.executable = '/bin/echo'
86-
self.executable_opts = ['$MY_VAR']
87-
self.variables = {'MY_VAR': 'TEST123456!'}
88-
self.tags.remove('single-node')
74+
num_tasks = 2
75+
valid_systems = ['daint:gpu', 'daint:mc',
76+
'dom:gpu', 'dom:mc',
77+
'arolla:cn', 'arolla:pn',
78+
'tsa:cn', 'tsa:pn',
79+
'eiger:mc', 'pilatus:mc']
80+
executable = '/bin/echo'
81+
executable_opts = ['$MY_VAR']
82+
variables = {'MY_VAR': 'TEST123456!'}
83+
tags.remove('single-node')
84+
85+
@sanity_function
86+
def assert_num_tasks(self):
8987
num_matches = sn.count(sn.findall(r'TEST123456!', self.stdout))
90-
self.sanity_patterns = sn.assert_eq(self.num_tasks, num_matches)
88+
return sn.assert_eq(self.num_tasks, num_matches)
9189

9290

9391
@rfm.simple_test
9492
class RequiredConstraintCheck(SlurmSimpleBaseCheck):
95-
def __init__(self):
96-
super().__init__()
97-
self.valid_systems = ['daint:login', 'dom:login']
98-
self.executable = 'srun'
99-
self.executable_opts = ['-A', osext.osgroup(), 'hostname']
100-
self.sanity_patterns = sn.assert_found(
93+
valid_systems = ['daint:login', 'dom:login']
94+
executable = 'srun'
95+
executable_opts = ['-A', osext.osgroup(), 'hostname']
96+
97+
@sanity_function
98+
def assert_found_missing_constraint(self):
99+
return sn.assert_found(
101100
r'ERROR: you must specify -C with one of the following: mc,gpu',
102101
self.stderr
103102
)
104103

105104

106105
@rfm.simple_test
107106
class RequestLargeMemoryNodeCheck(SlurmSimpleBaseCheck):
108-
def __init__(self):
109-
super().__init__()
110-
self.valid_systems = ['daint:mc']
111-
self.executable = '/usr/bin/free'
112-
self.executable_opts = ['-h']
107+
valid_systems = ['daint:mc']
108+
executable = '/usr/bin/free'
109+
executable_opts = ['-h']
110+
111+
@sanity_function
112+
def assert_memory_is_bounded(self):
113113
mem_obtained = sn.extractsingle(r'Mem:\s+(?P<mem>\S+)G',
114114
self.stdout, 'mem', float)
115-
self.sanity_patterns = sn.assert_bounded(mem_obtained, 122.0, 128.0)
115+
return sn.assert_bounded(mem_obtained, 122.0, 128.0)
116116

117117
@run_before('run')
118118
def set_memory_limit(self):
@@ -121,55 +121,56 @@ def set_memory_limit(self):
121121

122122
@rfm.simple_test
123123
class DefaultRequestGPU(SlurmSimpleBaseCheck):
124-
def __init__(self):
125-
super().__init__()
126-
self.valid_systems = ['daint:gpu', 'dom:gpu',
127-
'arolla:cn', 'tsa:cn']
128-
self.executable = 'nvidia-smi'
129-
self.sanity_patterns = sn.assert_found(
130-
r'NVIDIA-SMI.*Driver Version.*', self.stdout)
124+
valid_systems = ['daint:gpu', 'dom:gpu',
125+
'arolla:cn', 'tsa:cn']
126+
executable = 'nvidia-smi'
127+
128+
@sanity_function
129+
def asser_found_nvidia_driver_version(self):
130+
return sn.assert_found(r'NVIDIA-SMI.*Driver Version.*',
131+
self.stdout)
131132

132133

133134
@rfm.simple_test
134135
class DefaultRequestGPUSetsGRES(SlurmSimpleBaseCheck):
135-
def __init__(self):
136-
super().__init__()
137-
self.valid_systems = ['daint:gpu', 'dom:gpu']
138-
self.executable = 'scontrol show job ${SLURM_JOB_ID}'
139-
self.sanity_patterns = sn.assert_found(
140-
r'.*(TresPerNode|Gres)=.*gpu:1.*', self.stdout)
136+
valid_systems = ['daint:gpu', 'dom:gpu']
137+
executable = 'scontrol show job ${SLURM_JOB_ID}'
138+
139+
@sanity_function
140+
def assert_found_resources(self):
141+
return sn.assert_found(r'.*(TresPerNode|Gres)=.*gpu:1.*', self.stdout)
141142

142143

143144
@rfm.simple_test
144145
class DefaultRequestMC(SlurmSimpleBaseCheck):
145-
def __init__(self):
146-
super().__init__()
147-
self.valid_systems = ['daint:mc', 'dom:mc']
148-
# This is a basic test that should return the number of CPUs on the
149-
# system, which on an MC node should be 72
150-
self.executable = 'lscpu -p |grep -v "^#" -c'
151-
self.sanity_patterns = sn.assert_found(r'72', self.stdout)
146+
valid_systems = ['daint:mc', 'dom:mc']
147+
# This is a basic test that should return the number of CPUs on the
148+
# system, which on an MC node should be 72
149+
executable = 'lscpu -p |grep -v "^#" -c'
150+
151+
@sanity_function
152+
def assert_found_num_cpus(self):
153+
return sn.assert_found(r'72', self.stdout)
152154

153155

154156
@rfm.simple_test
155157
class ConstraintRequestCabinetGrouping(SlurmSimpleBaseCheck):
156-
def __init__(self):
157-
super().__init__()
158-
self.valid_systems = ['daint:gpu', 'daint:mc',
159-
'dom:gpu', 'dom:mc']
160-
self.executable = 'cat /proc/cray_xt/cname'
161-
self.cabinets = {
162-
'daint:gpu': 'c0-1',
163-
'daint:mc': 'c1-0',
164-
165-
# Numbering is inverse in Dom
166-
'dom:gpu': 'c0-0',
167-
'dom:mc': 'c0-1',
168-
}
158+
valid_systems = ['daint:gpu', 'daint:mc',
159+
'dom:gpu', 'dom:mc']
160+
executable = 'cat /proc/cray_xt/cname'
161+
cabinets = {
162+
'daint:gpu': 'c0-1',
163+
'daint:mc': 'c1-0',
164+
# Numbering is inverse in Dom
165+
'dom:gpu': 'c0-0',
166+
'dom:mc': 'c0-1',
167+
}
169168

169+
@sanity_function
170+
def assert_found_cabinet(self):
170171
# We choose a default pattern that will cause assert_found() to fail
171172
cabinet = self.cabinets.get(self.current_system.name, r'$^')
172-
self.sanity_patterns = sn.assert_found(fr'{cabinet}.*', self.stdout)
173+
return sn.assert_found(fr'{cabinet}.*', self.stdout)
173174

174175
@run_before('run')
175176
def set_slurm_constraint(self):
@@ -180,16 +181,16 @@ def set_slurm_constraint(self):
180181

181182
@rfm.simple_test
182183
class MemoryOverconsumptionCheck(SlurmCompiledBaseCheck):
183-
def __init__(self):
184-
super().__init__()
185-
self.time_limit = '1m'
186-
self.valid_systems += ['eiger:mc', 'pilatus:mc']
187-
self.sourcepath = 'eatmemory.c'
188-
self.tags.add('mem')
189-
self.executable_opts = ['4000M']
190-
self.sanity_patterns = sn.assert_found(
191-
r'(exceeded memory limit)|(Out Of Memory)', self.stderr
192-
)
184+
time_limit = '1m'
185+
valid_systems += ['eiger:mc', 'pilatus:mc']
186+
tags.add('mem')
187+
sourcepath = 'eatmemory.c'
188+
executable_opts = ['4000M']
189+
190+
@sanity_function
191+
def assert_found_exceeded_memory(self):
192+
return sn.assert_found(r'(exceeded memory limit)|(Out Of Memory)',
193+
self.stderr)
193194

194195
@run_before('run')
195196
def set_memory_limit(self):
@@ -198,38 +199,41 @@ def set_memory_limit(self):
198199

199200
@rfm.simple_test
200201
class MemoryOverconsumptionMpiCheck(SlurmCompiledBaseCheck):
201-
def __init__(self):
202-
super().__init__()
203-
self.maintainers = ['JG']
204-
self.valid_systems += ['eiger:mc', 'pilatus:mc']
205-
self.time_limit = '5m'
206-
self.sourcepath = 'eatmemory_mpi.c'
207-
self.tags.add('mem')
208-
self.executable_opts = ['100%']
209-
self.sanity_patterns = sn.assert_found(r'(oom-kill)|(Killed)',
210-
self.stderr)
211-
# {{{ perf
212-
regex = (r'^Eating \d+ MB\/mpi \*\d+mpi = -\d+ MB memory from \/proc\/'
213-
r'meminfo: total: \d+ GB, free: \d+ GB, avail: \d+ GB, using:'
214-
r' (\d+) GB')
215-
self.perf_patterns = {
216-
'max_cn_memory': sn.getattr(self, 'reference_meminfo'),
217-
'max_allocated_memory': sn.max(
218-
sn.extractall(regex, self.stdout, 1, int)
219-
),
220-
}
202+
maintainers = ['JG']
203+
valid_systems += ['eiger:mc', 'pilatus:mc']
204+
time_limit = '5m'
205+
sourcepath = 'eatmemory_mpi.c'
206+
tags.add('mem')
207+
executable_opts = ['100%']
208+
209+
@sanity_function
210+
def assert_found_oom(self):
211+
return sn.assert_found(r'(oom-kill)|(Killed)',
212+
self.stderr)
213+
214+
@run_before('performance')
215+
def set_references(self):
221216
no_limit = (0, None, None, 'GB')
222217
self.reference = {
223218
'*': {
224219
'max_cn_memory': no_limit,
225220
'max_allocated_memory': (
226-
sn.getattr(self, 'reference_meminfo'), -0.05, None, 'GB'
221+
self.reference_meminfo(), -0.05, None, 'GB'
227222
),
228223
}
229224
}
230-
# }}}
231225

232-
# {{{ hooks
226+
@performance_function('GB')
227+
def max_cn_memory(self):
228+
return self.reference_meminfo()
229+
230+
@performance_function('GB')
231+
def max_allocated_memory(self):
232+
regex = (r'^Eating \d+ MB\/mpi \*\d+mpi = -\d+ MB memory from \/proc\/'
233+
r'meminfo: total: \d+ GB, free: \d+ GB, avail: \d+ GB, using:'
234+
r' (\d+) GB')
235+
return sn.max(sn.extractall(regex, self.stdout, 1, int))
236+
233237
@run_before('run')
234238
def set_tasks(self):
235239
tasks_per_node = {
@@ -244,10 +248,7 @@ def set_tasks(self):
244248
self.num_tasks_per_node = tasks_per_node[partname]
245249
self.num_tasks = self.num_tasks_per_node
246250
self.job.launcher.options = ['-u']
247-
# }}}
248251

249-
@property
250-
@sn.sanity_function
251252
def reference_meminfo(self):
252253
reference_meminfo = {
253254
'dom:gpu': 62,
@@ -285,10 +286,11 @@ class slurm_response_check(rfm.RunOnlyRegressionTest):
285286
def set_exec_opts(self):
286287
self.executable_opts = [self.command]
287288

288-
@run_before('sanity')
289-
def set_sanity(self):
290-
self.sanity_patterns = sn.assert_eq(self.job.exitcode, 0)
291-
self.perf_patterns = {
292-
'real_time': sn.extractsingle(r'real (?P<real_time>\S+)',
293-
self.stderr, 'real_time', float)
294-
}
289+
@sanity_function
290+
def assert_exitcode_zero(self):
291+
return sn.assert_eq(self.job.exitcode, 0)
292+
293+
@performance_function('s')
294+
def real_time(self):
295+
return sn.extractsingle(r'real (?P<real_time>\S+)', self.stderr,
296+
'real_time', float)

0 commit comments

Comments
 (0)