Skip to content

Commit f9d69c8

Browse files
author
Vasileios Karakasis
authored
Merge pull request #2516 from teojgo/test/align_job_report_gpu_burn
[test] Align the job report test to the new `gpu_burn_check` library test
2 parents b1b962d + 6f843c0 commit f9d69c8

File tree

1 file changed

+18
-44
lines changed

1 file changed

+18
-44
lines changed

cscs-checks/system/jobreport/gpu_report.py

Lines changed: 18 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -8,11 +8,11 @@
88
import time
99

1010
from reframe.core.exceptions import SanityError
11-
from hpctestlib.microbenchmarks.gpu.gpu_burn import GpuBurn
11+
from hpctestlib.microbenchmarks.gpu.gpu_burn import gpu_burn_check
1212

1313

1414
@rfm.simple_test
15-
class gpu_usage_report_check(GpuBurn):
15+
class gpu_usage_report_check(gpu_burn_check):
1616
'''Check the output from the job report.
1717
1818
This check uses the gpu burn app and checks that the job report produces
@@ -23,14 +23,10 @@ class gpu_usage_report_check(GpuBurn):
2323
'''
2424

2525
valid_systems = ['daint:gpu', 'dom:gpu']
26-
valid_prog_environs = ['PrgEnv-gnu']
2726
descr = 'Check GPU usage from job report'
2827
gpu_build = 'cuda'
29-
modules = ['craype-accel-nvidia60', 'cdt-cuda']
30-
num_tasks = 0
28+
num_tasks = 2
3129
num_gpus_per_node = 1
32-
burn_time = variable(int, value=10)
33-
executable_opts = ['-d', f'{burn_time}']
3430
perf_floor = variable(float, value=-0.2)
3531
tags = {'production'}
3632

@@ -44,7 +40,7 @@ def set_launcher_opts(self):
4440
self.job.launcher.options = ['-u']
4541

4642
@sanity_function
47-
def set_sanity_patterns(self):
43+
def assert_jobreport_success(self):
4844
'''Extend sanity and wait for the jobreport.
4945
5046
If a large number of nodes is used, the final jobreport output happens
@@ -59,57 +55,35 @@ def set_sanity_patterns(self):
5955
except SanityError:
6056
time.sleep(25)
6157

62-
return sn.all([
63-
self.count_successful_burns(), self.gpu_usage_sanity()
64-
])
58+
return self.assert_successful_burn_count(), self.gpu_usage_sanity()
6559

6660
@deferrable
6761
def gpu_usage_sanity(self):
6862
'''Verify that the jobreport output has sensible numbers.
6963
70-
This function asserts that the nodes reported are at least a subset of
71-
all nodes used by the gpu burn app. Also, the GPU usage is verified by
72-
assuming that in the worst case scenario, the usage is near 100% during
73-
the burn, and 0% outside the burn period. Lastly, the GPU usage time
74-
for each node is also asserted to be greater or equal than the burn
75-
time.
64+
The GPU usage is verified by assuming that in the worst case scenario,
65+
the usage is near 100% during the burn, and 0% outside the burn period.
66+
Lastly, the GPU usage time for each node is also asserted to be greater
67+
than or equal to the burn time.
7668
'''
7769

78-
# Get set with all nodes
79-
patt = r'^\s*\[([^\]]*)\]\s*GPU\s*\d+\(OK\)'
80-
full_node_set = set(sn.extractall(patt, self.stdout, 1))
81-
8270
# Parse job report data
8371
patt = r'^\s*(\w*)\s*(\d+)\s*%\s*\d+\s*MiB\s*\d+:\d+:(\d+)'
8472
self.nodes_reported = sn.extractall(patt, self.stdout, 1)
73+
self.num_tasks_assigned = self.num_tasks * self.num_gpus_per_node
8574
usage = sn.extractall(patt, self.stdout, 2, int)
8675
time_reported = sn.extractall(patt, self.stdout, 3, int)
8776
return sn.all([
8877
sn.assert_ge(sn.count(self.nodes_reported), 1),
89-
set(self.nodes_reported).issubset(full_node_set),
9078
sn.all(
91-
map(lambda x, y: self.burn_time/x <= y, time_reported, usage)
79+
map(lambda x, y: self.duration/x <= y/100, time_reported, usage)
9280
),
93-
sn.assert_ge(sn.min(time_reported), self.burn_time)
81+
sn.assert_ge(sn.min(time_reported), self.duration)
9482
])
9583

96-
@performance_function('nodes')
97-
def total_nodes_reported(self):
98-
return sn.count(self.nodes_reported)
99-
100-
@run_before('performance')
101-
def set_perf_variables(self):
102-
'''The number of reported nodes can be used as a perf metric.
103-
104-
For now, the low limit can go to zero, but this can be set to a more
105-
restrictive value.
106-
'''
107-
108-
self.reference = {
109-
'*': {
110-
'nodes_reported': (self.num_tasks, self.perf_floor, 0)
111-
},
112-
}
113-
self.perf_variables = {
114-
'nodes_reported': self.total_nodes_reported()
115-
}
84+
@deferrable
85+
def assert_successful_burn_count(self):
86+
'''Assert that the expected successful burn count is reported.'''
87+
return sn.assert_eq(sn.count(sn.findall(r'^GPU\s*\d+\(OK\)',
88+
self.stdout)),
89+
self.num_tasks_assigned)

0 commit comments

Comments
 (0)