Skip to content

Commit b516a9e

Browse files
author
Theofilos Manitaras
committed
Align the job report test to the gpu_burn one
1 parent b1b962d commit b516a9e

File tree

1 file changed

+18
-20
lines changed

1 file changed

+18
-20
lines changed

cscs-checks/system/jobreport/gpu_report.py

Lines changed: 18 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@
88
import time
99

1010
from reframe.core.exceptions import SanityError
11-
from hpctestlib.microbenchmarks.gpu.gpu_burn import GpuBurn
11+
from hpctestlib.microbenchmarks.gpu.gpu_burn import gpu_burn_check
1212

1313

1414
@rfm.simple_test
15-
class gpu_usage_report_check(GpuBurn):
15+
class gpu_usage_report_check(gpu_burn_check):
1616
'''Check the output from the job report.
1717
1818
This check uses the gpu burn app and checks that the job report produces
@@ -23,14 +23,11 @@ class gpu_usage_report_check(GpuBurn):
2323
'''
2424

2525
valid_systems = ['daint:gpu', 'dom:gpu']
26-
valid_prog_environs = ['PrgEnv-gnu']
26+
valid_prog_environs = ['PrgEnv-nvidia']
2727
descr = 'Check GPU usage from job report'
2828
gpu_build = 'cuda'
29-
modules = ['craype-accel-nvidia60', 'cdt-cuda']
3029
num_tasks = 0
3130
num_gpus_per_node = 1
32-
burn_time = variable(int, value=10)
33-
executable_opts = ['-d', f'{burn_time}']
3431
perf_floor = variable(float, value=-0.2)
3532
tags = {'production'}
3633

@@ -44,7 +41,7 @@ def set_launcher_opts(self):
4441
self.job.launcher.options = ['-u']
4542

4643
@sanity_function
47-
def set_sanity_patterns(self):
44+
def assert_jobreport_success(self):
4845
'''Extend sanity and wait for the jobreport.
4946
5047
If a large number of nodes is used, the final jobreport output happens
@@ -67,32 +64,33 @@ def set_sanity_patterns(self):
6764
def gpu_usage_sanity(self):
6865
'''Verify that the jobreport output has sensible numbers.
6966
70-
This function asserts that the nodes reported are at least a subset of
71-
all nodes used by the gpu burn app. Also, the GPU usage is verified by
72-
assuming that in the worst case scenario, the usage is near 100% during
73-
the burn, and 0% outside the burn period. Lastly, the GPU usage time
74-
for each node is also asserted to be greater or equal than the burn
75-
time.
67+
The GPU usage is verified by assuming that in the worst case scenario,
68+
the usage is near 100% during the burn, and 0% outside the burn period.
69+
Lastly, the GPU usage time for each node is also asserted to be greater
70+
than or equal to the burn time.
7671
'''
7772

78-
# Get set with all nodes
79-
patt = r'^\s*\[([^\]]*)\]\s*GPU\s*\d+\(OK\)'
80-
full_node_set = set(sn.extractall(patt, self.stdout, 1))
81-
8273
# Parse job report data
8374
patt = r'^\s*(\w*)\s*(\d+)\s*%\s*\d+\s*MiB\s*\d+:\d+:(\d+)'
8475
self.nodes_reported = sn.extractall(patt, self.stdout, 1)
76+
self.num_tasks_assigned = self.job.num_tasks * self.num_gpus_per_node
8577
usage = sn.extractall(patt, self.stdout, 2, int)
8678
time_reported = sn.extractall(patt, self.stdout, 3, int)
8779
return sn.all([
8880
sn.assert_ge(sn.count(self.nodes_reported), 1),
89-
set(self.nodes_reported).issubset(full_node_set),
9081
sn.all(
91-
map(lambda x, y: self.burn_time/x <= y, time_reported, usage)
82+
map(lambda x, y: self.duration/x <= y/100, time_reported, usage)
9283
),
93-
sn.assert_ge(sn.min(time_reported), self.burn_time)
84+
sn.assert_ge(sn.min(time_reported), self.duration)
9485
])
9586

87+
@deferrable
88+
def count_successful_burns(self):
89+
'''Assert that the successful burn count equals num_tasks_assigned.'''
90+
return sn.assert_eq(sn.count(sn.findall(r'^GPU\s*\d+\(OK\)',
91+
self.stdout)),
92+
self.num_tasks_assigned)
93+
9694
@performance_function('nodes')
9795
def total_nodes_reported(self):
9896
return sn.count(self.nodes_reported)

0 commit comments

Comments
 (0)