@@ -23,10 +23,9 @@ class gpu_usage_report_check(gpu_burn_check):
2323 '''
2424
2525 valid_systems = ['daint:gpu' , 'dom:gpu' ]
26- valid_prog_environs = ['PrgEnv-nvidia' ]
2726 descr = 'Check GPU usage from job report'
2827 gpu_build = 'cuda'
29- num_tasks = 0
28+ num_tasks = 2
3029 num_gpus_per_node = 1
3130 perf_floor = variable (float , value = - 0.2 )
3231 tags = {'production' }
@@ -57,7 +56,7 @@ def assert_jobreport_success(self):
5756 time .sleep (25 )
5857
5958 return sn .all ([
60- self .count_successful_burns (), self .gpu_usage_sanity ()
59+ self .assert_successful_burn_count (), self .gpu_usage_sanity ()
6160 ])
6261
6362 @deferrable
@@ -73,7 +72,7 @@ def gpu_usage_sanity(self):
7372 # Parse job report data
7473 patt = r'^\s*(\w*)\s*(\d+)\s*%\s*\d+\s*MiB\s*\d+:\d+:(\d+)'
7574 self .nodes_reported = sn .extractall (patt , self .stdout , 1 )
76- self .num_tasks_assigned = self .job . num_tasks * self .num_gpus_per_node
75+ self .num_tasks_assigned = self .num_tasks * self .num_gpus_per_node
7776 usage = sn .extractall (patt , self .stdout , 2 , int )
7877 time_reported = sn .extractall (patt , self .stdout , 3 , int )
7978 return sn .all ([
@@ -85,29 +84,8 @@ def gpu_usage_sanity(self):
8584 ])
8685
8786 @deferrable
88- def count_successful_burns (self ):
89- '''Set the sanity patterns to count the number of successful burns .'''
87+ def assert_successful_burn_count (self ):
88+ '''Assert that the expected successful burn count is reported .'''
9089 return sn .assert_eq (sn .count (sn .findall (r'^GPU\s*\d+\(OK\)' ,
9190 self .stdout )),
9291 self .num_tasks_assigned )
93-
94- @performance_function ('nodes' )
95- def total_nodes_reported (self ):
96- return sn .count (self .nodes_reported )
97-
98- @run_before ('performance' )
99- def set_perf_variables (self ):
100- '''The number of reported nodes can be used as a perf metric.
101-
102- For now, the low limit can go to zero, but this can be set to a more
103- restrictive value.
104- '''
105-
106- self .reference = {
107- '*' : {
108- 'nodes_reported' : (self .num_tasks , self .perf_floor , 0 )
109- },
110- }
111- self .perf_variables = {
112- 'nodes_reported' : self .total_nodes_reported ()
113- }
0 commit comments