Skip to content

Commit 2f5f2e1

Browse files
committed
modernize slurm test
1 parent da18b68 commit 2f5f2e1

File tree

1 file changed

+132
-128
lines changed

1 file changed

+132
-128
lines changed

cscs-checks/system/slurm/slurm.py

Lines changed: 132 additions & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -11,57 +11,54 @@
1111
class SlurmSimpleBaseCheck(rfm.RunOnlyRegressionTest):
1212
'''Base class for Slurm simple binary tests'''
1313

14-
def __init__(self):
15-
self.valid_systems = ['daint:gpu', 'daint:mc',
16-
'dom:gpu', 'dom:mc',
17-
'arolla:cn', 'arolla:pn',
18-
'tsa:cn', 'tsa:pn',
19-
'daint:xfer', 'eiger:mc',
20-
'pilatus:mc']
21-
self.valid_prog_environs = ['PrgEnv-cray']
22-
self.tags = {'slurm', 'maintenance', 'ops',
23-
'production', 'single-node'}
24-
self.num_tasks_per_node = 1
14+
valid_systems = ['daint:gpu', 'daint:mc',
15+
'dom:gpu', 'dom:mc',
16+
'arolla:cn', 'arolla:pn',
17+
'tsa:cn', 'tsa:pn',
18+
'daint:xfer', 'eiger:mc',
19+
'pilatus:mc']
20+
valid_prog_environs = ['PrgEnv-cray']
21+
tags = {'slurm', 'maintenance', 'ops',
22+
'production', 'single-node'}
23+
num_tasks_per_node = 1
24+
maintainers = ['RS', 'VH']
25+
26+
@run_after('init')
27+
def customize_systems(self):
2528
if self.current_system.name in ['arolla', 'tsa']:
2629
self.valid_prog_environs = ['PrgEnv-gnu', 'PrgEnv-pgi']
2730
self.exclusive_access = True
2831

29-
self.maintainers = ['RS', 'VH']
30-
3132

3233
class SlurmCompiledBaseCheck(rfm.RegressionTest):
3334
'''Base class for Slurm tests that require compiling some code'''
3435

35-
def __init__(self):
36-
self.valid_systems = ['daint:gpu', 'daint:mc',
37-
'dom:gpu', 'dom:mc']
38-
self.valid_prog_environs = ['PrgEnv-cray']
39-
self.tags = {'slurm', 'maintenance', 'ops',
40-
'production', 'single-node'}
41-
self.num_tasks_per_node = 1
42-
43-
self.maintainers = ['RS', 'VH']
36+
valid_systems = ['daint:gpu', 'daint:mc',
37+
'dom:gpu', 'dom:mc']
38+
valid_prog_environs = ['PrgEnv-cray']
39+
tags = {'slurm', 'maintenance', 'ops',
40+
'production', 'single-node'}
41+
num_tasks_per_node = 1
42+
maintainers = ['RS', 'VH']
4443

4544

4645
@rfm.simple_test
4746
class HostnameCheck(SlurmSimpleBaseCheck):
48-
def __init__(self):
49-
super().__init__()
50-
self.executable = '/bin/hostname'
51-
self.valid_prog_environs = ['builtin']
52-
self.hostname_patt = {
53-
'arolla:cn': r'^arolla-cn\d{3}$',
54-
'arolla:pn': r'^arolla-pp\d{3}$',
55-
'tsa:cn': r'^tsa-cn\d{3}$',
56-
'tsa:pn': r'^tsa-pp\d{3}$',
57-
'daint:gpu': r'^nid\d{5}$',
58-
'daint:mc': r'^nid\d{5}$',
59-
'daint:xfer': r'^datamover\d{2}.cscs.ch$',
60-
'dom:gpu': r'^nid\d{5}$',
61-
'dom:mc': r'^nid\d{5}$',
62-
'eiger:mc': r'^nid\d{6}$',
63-
'pilatus:mc': r'^nid\d{6}$'
64-
}
47+
executable = '/bin/hostname'
48+
valid_prog_environs = ['builtin']
49+
hostname_patt = {
50+
'arolla:cn': r'^arolla-cn\d{3}$',
51+
'arolla:pn': r'^arolla-pp\d{3}$',
52+
'tsa:cn': r'^tsa-cn\d{3}$',
53+
'tsa:pn': r'^tsa-pp\d{3}$',
54+
'daint:gpu': r'^nid\d{5}$',
55+
'daint:mc': r'^nid\d{5}$',
56+
'daint:xfer': r'^datamover\d{2}.cscs.ch$',
57+
'dom:gpu': r'^nid\d{5}$',
58+
'dom:mc': r'^nid\d{5}$',
59+
'eiger:mc': r'^nid\d{6}$',
60+
'pilatus:mc': r'^nid\d{6}$'
61+
}
6562

6663
@run_before('sanity')
6764
def set_sanity_patterns(self):
@@ -74,45 +71,48 @@ def set_sanity_patterns(self):
7471

7572
@rfm.simple_test
7673
class EnvironmentVariableCheck(SlurmSimpleBaseCheck):
77-
def __init__(self):
78-
super().__init__()
79-
self.num_tasks = 2
80-
self.valid_systems = ['daint:gpu', 'daint:mc',
81-
'dom:gpu', 'dom:mc',
82-
'arolla:cn', 'arolla:pn',
83-
'tsa:cn', 'tsa:pn',
84-
'eiger:mc', 'pilatus:mc']
85-
self.executable = '/bin/echo'
86-
self.executable_opts = ['$MY_VAR']
87-
self.variables = {'MY_VAR': 'TEST123456!'}
88-
self.tags.remove('single-node')
74+
num_tasks = 2
75+
valid_systems = ['daint:gpu', 'daint:mc',
76+
'dom:gpu', 'dom:mc',
77+
'arolla:cn', 'arolla:pn',
78+
'tsa:cn', 'tsa:pn',
79+
'eiger:mc', 'pilatus:mc']
80+
executable = '/bin/echo'
81+
executable_opts = ['$MY_VAR']
82+
variables = {'MY_VAR': 'TEST123456!'}
83+
tags.remove('single-node')
84+
85+
@sanity_function
86+
def assert_num_tasks(self):
8987
num_matches = sn.count(sn.findall(r'TEST123456!', self.stdout))
90-
self.sanity_patterns = sn.assert_eq(self.num_tasks, num_matches)
88+
return sn.assert_eq(self.num_tasks, num_matches)
9189

9290

9391
@rfm.simple_test
9492
class RequiredConstraintCheck(SlurmSimpleBaseCheck):
95-
def __init__(self):
96-
super().__init__()
97-
self.valid_systems = ['daint:login', 'dom:login']
98-
self.executable = 'srun'
99-
self.executable_opts = ['-A', osext.osgroup(), 'hostname']
100-
self.sanity_patterns = sn.assert_found(
93+
valid_systems = ['daint:login', 'dom:login']
94+
executable = 'srun'
95+
executable_opts = ['-A', osext.osgroup(), 'hostname']
96+
97+
@sanity_function
98+
def assert_found_missing_constraint(self):
99+
return sn.assert_found(
101100
r'ERROR: you must specify -C with one of the following: mc,gpu',
102101
self.stderr
103102
)
104103

105104

106105
@rfm.simple_test
107106
class RequestLargeMemoryNodeCheck(SlurmSimpleBaseCheck):
108-
def __init__(self):
109-
super().__init__()
110-
self.valid_systems = ['daint:mc']
111-
self.executable = '/usr/bin/free'
112-
self.executable_opts = ['-h']
107+
valid_systems = ['daint:mc']
108+
executable = '/usr/bin/free'
109+
executable_opts = ['-h']
110+
111+
@sanity_function
112+
def assert_memory_is_bounded(self):
113113
mem_obtained = sn.extractsingle(r'Mem:\s+(?P<mem>\S+)G',
114114
self.stdout, 'mem', float)
115-
self.sanity_patterns = sn.assert_bounded(mem_obtained, 122.0, 128.0)
115+
return sn.assert_bounded(mem_obtained, 122.0, 128.0)
116116

117117
@run_before('run')
118118
def set_memory_limit(self):
@@ -121,55 +121,57 @@ def set_memory_limit(self):
121121

122122
@rfm.simple_test
123123
class DefaultRequestGPU(SlurmSimpleBaseCheck):
124-
def __init__(self):
125-
super().__init__()
126-
self.valid_systems = ['daint:gpu', 'dom:gpu',
127-
'arolla:cn', 'tsa:cn']
128-
self.executable = 'nvidia-smi'
129-
self.sanity_patterns = sn.assert_found(
130-
r'NVIDIA-SMI.*Driver Version.*', self.stdout)
124+
valid_systems = ['daint:gpu', 'dom:gpu',
125+
'arolla:cn', 'tsa:cn']
126+
executable = 'nvidia-smi'
127+
128+
@sanity_function
129+
def asser_found_nvidia_driver_version(self):
130+
return sn.assert_found(r'NVIDIA-SMI.*Driver Version.*',
131+
self.stdout)
131132

132133

133134
@rfm.simple_test
134135
class DefaultRequestGPUSetsGRES(SlurmSimpleBaseCheck):
135-
def __init__(self):
136-
super().__init__()
137-
self.valid_systems = ['daint:gpu', 'dom:gpu']
138-
self.executable = 'scontrol show job ${SLURM_JOB_ID}'
139-
self.sanity_patterns = sn.assert_found(
136+
valid_systems = ['daint:gpu', 'dom:gpu']
137+
executable = 'scontrol show job ${SLURM_JOB_ID}'
138+
139+
@sanity_function
140+
def assert_found_resources(self):
141+
return sn.assert_found(
140142
r'.*(TresPerNode|Gres)=.*gpu:1.*', self.stdout)
141143

142144

143145
@rfm.simple_test
144146
class DefaultRequestMC(SlurmSimpleBaseCheck):
145-
def __init__(self):
146-
super().__init__()
147-
self.valid_systems = ['daint:mc', 'dom:mc']
148-
# This is a basic test that should return the number of CPUs on the
149-
# system which, on a MC node should be 72
150-
self.executable = 'lscpu -p |grep -v "^#" -c'
151-
self.sanity_patterns = sn.assert_found(r'72', self.stdout)
147+
valid_systems = ['daint:mc', 'dom:mc']
148+
# This is a basic test that should return the number of CPUs on the
149+
# system which, on a MC node should be 72
150+
executable = 'lscpu -p |grep -v "^#" -c'
151+
152+
@sanity_function
153+
def assert_found_num_cpus(self):
154+
return sn.assert_found(r'72', self.stdout)
152155

153156

154157
@rfm.simple_test
155158
class ConstraintRequestCabinetGrouping(SlurmSimpleBaseCheck):
156-
def __init__(self):
157-
super().__init__()
158-
self.valid_systems = ['daint:gpu', 'daint:mc',
159-
'dom:gpu', 'dom:mc']
160-
self.executable = 'cat /proc/cray_xt/cname'
161-
self.cabinets = {
162-
'daint:gpu': 'c0-1',
163-
'daint:mc': 'c1-0',
164-
165-
# Numbering is inverse in Dom
166-
'dom:gpu': 'c0-0',
167-
'dom:mc': 'c0-1',
168-
}
159+
valid_systems = ['daint:gpu', 'daint:mc',
160+
'dom:gpu', 'dom:mc']
161+
executable = 'cat /proc/cray_xt/cname'
162+
cabinets = {
163+
'daint:gpu': 'c0-1',
164+
'daint:mc': 'c1-0',
165+
# Numbering is inverse in Dom
166+
'dom:gpu': 'c0-0',
167+
'dom:mc': 'c0-1',
168+
}
169169

170+
@sanity_function
171+
def assert_found_cabinet(self):
170172
# We choose a default pattern that will cause assert_found() to fail
171173
cabinet = self.cabinets.get(self.current_system.name, r'$^')
172-
self.sanity_patterns = sn.assert_found(fr'{cabinet}.*', self.stdout)
174+
return sn.assert_found(fr'{cabinet}.*', self.stdout)
173175

174176
@run_before('run')
175177
def set_slurm_constraint(self):
@@ -180,16 +182,16 @@ def set_slurm_constraint(self):
180182

181183
@rfm.simple_test
182184
class MemoryOverconsumptionCheck(SlurmCompiledBaseCheck):
183-
def __init__(self):
184-
super().__init__()
185-
self.time_limit = '1m'
186-
self.valid_systems += ['eiger:mc', 'pilatus:mc']
187-
self.sourcepath = 'eatmemory.c'
188-
self.tags.add('mem')
189-
self.executable_opts = ['4000M']
190-
self.sanity_patterns = sn.assert_found(
191-
r'(exceeded memory limit)|(Out Of Memory)', self.stderr
192-
)
185+
time_limit = '1m'
186+
valid_systems += ['eiger:mc', 'pilatus:mc']
187+
tags.add('mem')
188+
sourcepath = 'eatmemory.c'
189+
executable_opts = ['4000M']
190+
191+
@sanity_function
192+
def assert_found_exceeded_memory(self):
193+
return sn.assert_found(r'(exceeded memory limit)|(Out Of Memory)',
194+
self.stderr)
193195

194196
@run_before('run')
195197
def set_memory_limit(self):
@@ -198,25 +200,27 @@ def set_memory_limit(self):
198200

199201
@rfm.simple_test
200202
class MemoryOverconsumptionMpiCheck(SlurmCompiledBaseCheck):
201-
def __init__(self):
202-
super().__init__()
203-
self.maintainers = ['JG']
204-
self.valid_systems += ['eiger:mc', 'pilatus:mc']
205-
self.time_limit = '5m'
206-
self.sourcepath = 'eatmemory_mpi.c'
207-
self.tags.add('mem')
208-
self.executable_opts = ['100%']
209-
self.sanity_patterns = sn.assert_found(r'(oom-kill)|(Killed)',
210-
self.stderr)
211-
# {{{ perf
203+
maintainers = ['JG']
204+
valid_systems += ['eiger:mc', 'pilatus:mc']
205+
time_limit = '5m'
206+
sourcepath = 'eatmemory_mpi.c'
207+
tags.add('mem')
208+
executable_opts = ['100%']
209+
210+
@sanity_function
211+
def assert_found_oom(self):
212+
return sn.assert_found(r'(oom-kill)|(Killed)',
213+
self.stderr)
214+
215+
@run_before('performance')
216+
def set_perf_patterns(self):
212217
regex = (r'^Eating \d+ MB\/mpi \*\d+mpi = -\d+ MB memory from \/proc\/'
213218
r'meminfo: total: \d+ GB, free: \d+ GB, avail: \d+ GB, using:'
214219
r' (\d+) GB')
215220
self.perf_patterns = {
216221
'max_cn_memory': sn.getattr(self, 'reference_meminfo'),
217-
'max_allocated_memory': sn.max(
218-
sn.extractall(regex, self.stdout, 1, int)
219-
),
222+
'max_allocated_memory': sn.max(sn.extractall(regex, self.stdout,
223+
1, int)),
220224
}
221225
no_limit = (0, None, None, 'GB')
222226
self.reference = {
@@ -227,9 +231,7 @@ def __init__(self):
227231
),
228232
}
229233
}
230-
# }}}
231234

232-
# {{{ hooks
233235
@run_before('run')
234236
def set_tasks(self):
235237
tasks_per_node = {
@@ -244,10 +246,9 @@ def set_tasks(self):
244246
self.num_tasks_per_node = tasks_per_node[partname]
245247
self.num_tasks = self.num_tasks_per_node
246248
self.job.launcher.options = ['-u']
247-
# }}}
248249

249250
@property
250-
@sn.sanity_function
251+
@deferrable
251252
def reference_meminfo(self):
252253
reference_meminfo = {
253254
'dom:gpu': 62,
@@ -285,9 +286,12 @@ class slurm_response_check(rfm.RunOnlyRegressionTest):
285286
def set_exec_opts(self):
286287
self.executable_opts = [self.command]
287288

288-
@run_before('sanity')
289-
def set_sanity(self):
290-
self.sanity_patterns = sn.assert_eq(self.job.exitcode, 0)
289+
@sanity_function
290+
def assert_exitcode_zero(self):
291+
return sn.assert_eq(self.job.exitcode, 0)
292+
293+
@run_before('performance')
294+
def set_perf_patterns(self):
291295
self.perf_patterns = {
292296
'real_time': sn.extractsingle(r'real (?P<real_time>\S+)',
293297
self.stderr, 'real_time', float)

0 commit comments

Comments
 (0)