88import time
99
1010from reframe .core .exceptions import SanityError
11- from hpctestlib .microbenchmarks .gpu .gpu_burn import GpuBurn
11+ from hpctestlib .microbenchmarks .gpu .gpu_burn import gpu_burn_check
1212
1313
1414@rfm .simple_test
15- class gpu_usage_report_check (GpuBurn ):
15+ class gpu_usage_report_check (gpu_burn_check ):
1616 '''Check the output from the job report.
1717
1818 This check uses the gpu burn app and checks that the job report produces
@@ -23,14 +23,10 @@ class gpu_usage_report_check(GpuBurn):
2323 '''
2424
2525 valid_systems = ['daint:gpu' , 'dom:gpu' ]
26- valid_prog_environs = ['PrgEnv-gnu' ]
2726 descr = 'Check GPU usage from job report'
2827 gpu_build = 'cuda'
29- modules = ['craype-accel-nvidia60' , 'cdt-cuda' ]
30- num_tasks = 0
28+ num_tasks = 2
3129 num_gpus_per_node = 1
32- burn_time = variable (int , value = 10 )
33- executable_opts = ['-d' , f'{ burn_time } ' ]
3430 perf_floor = variable (float , value = - 0.2 )
3531 tags = {'production' }
3632
@@ -44,7 +40,7 @@ def set_launcher_opts(self):
4440 self .job .launcher .options = ['-u' ]
4541
4642 @sanity_function
47- def set_sanity_patterns (self ):
43+ def assert_jobreport_success (self ):
4844 '''Extend sanity and wait for the jobreport.
4945
5046 If a large number of nodes is used, the final jobreport output happens
@@ -59,57 +55,35 @@ def set_sanity_patterns(self):
5955 except SanityError :
6056 time .sleep (25 )
6157
62- return sn .all ([
63- self .count_successful_burns (), self .gpu_usage_sanity ()
64- ])
58+ return self .assert_successful_burn_count (), self .gpu_usage_sanity ()
6559
@deferrable
def gpu_usage_sanity(self):
    '''Verify that the jobreport output has sensible numbers.

    Every job report line matched by the pattern below contributes a node
    name, a GPU usage percentage and a time field. The returned expression
    asserts that:

    - at least one node is reported;
    - for every reported line, the usage is not lower than the worst case
      scenario, where the usage is near 100% during the burn and 0%
      outside the burn period, i.e.
      ``duration / time_reported <= usage / 100``;
    - the smallest reported time is greater than or equal to the burn
      duration.

    As a side effect, this sets ``self.nodes_reported`` and
    ``self.num_tasks_assigned``.
    '''

    # Parse job report data
    # NOTE(review): only the last colon-separated field of the time column
    # is captured (group 3). If that column is H:M:S, this is the seconds
    # component alone -- confirm against the job report format.
    patt = r'^\s*(\w*)\s*(\d+)\s*%\s*\d+\s*MiB\s*\d+:\d+:(\d+)'
    self.nodes_reported = sn.extractall(patt, self.stdout, 1)
    self.num_tasks_assigned = self.num_tasks * self.num_gpus_per_node
    usage = sn.extractall(patt, self.stdout, 2, int)
    time_reported = sn.extractall(patt, self.stdout, 3, int)

    def usage_covers_burn(elapsed, percent):
        # Cross-multiplied form of ``duration / elapsed <= percent / 100``.
        # It is equivalent for any positive ``elapsed``, but a reported
        # time of zero now fails the check instead of raising
        # ZeroDivisionError, and no float rounding is involved.
        return 100 * self.duration <= elapsed * percent

    return sn.all([
        sn.assert_ge(sn.count(self.nodes_reported), 1),
        sn.all(map(usage_covers_burn, time_reported, usage)),
        sn.assert_ge(sn.min(time_reported), self.duration)
    ])
9583
96- @performance_function ('nodes' )
97- def total_nodes_reported (self ):
98- return sn .count (self .nodes_reported )
99-
100- @run_before ('performance' )
101- def set_perf_variables (self ):
102- '''The number of reported nodes can be used as a perf metric.
103-
104- For now, the low limit can go to zero, but this can be set to a more
105- restrictive value.
106- '''
107-
108- self .reference = {
109- '*' : {
110- 'nodes_reported' : (self .num_tasks , self .perf_floor , 0 )
111- },
112- }
113- self .perf_variables = {
114- 'nodes_reported' : self .total_nodes_reported ()
115- }
@deferrable
def assert_successful_burn_count(self):
    '''Assert that the expected successful burn count is reported.

    The number of ``GPU <n>(OK)`` matches found in the standard output must
    be equal to ``num_tasks * num_gpus_per_node``.
    '''

    # Derive the expected count here instead of reading
    # ``self.num_tasks_assigned``: that attribute is only created as a side
    # effect of evaluating ``gpu_usage_sanity``, so relying on it makes this
    # check depend on the evaluation order of the sanity expressions.
    expected_burns = self.num_tasks * self.num_gpus_per_node
    successful_burns = sn.findall(r'^GPU\s*\d+\(OK\)', self.stdout)
    return sn.assert_eq(sn.count(successful_burns), expected_burns)
0 commit comments