88import time
99
1010from reframe .core .exceptions import SanityError
11- from hpctestlib .microbenchmarks .gpu .gpu_burn import GpuBurn
11+ from hpctestlib .microbenchmarks .gpu .gpu_burn import gpu_burn_check
1212
1313
1414@rfm .simple_test
15- class gpu_usage_report_check (GpuBurn ):
15+ class gpu_usage_report_check (gpu_burn_check ):
1616 '''Check the output from the job report.
1717
1818 This check uses the gpu burn app and checks that the job report produces
@@ -23,14 +23,11 @@ class gpu_usage_report_check(GpuBurn):
2323 '''
2424
2525 valid_systems = ['daint:gpu' , 'dom:gpu' ]
26- valid_prog_environs = ['PrgEnv-gnu ' ]
26+ valid_prog_environs = ['PrgEnv-nvidia ' ]
2727 descr = 'Check GPU usage from job report'
2828 gpu_build = 'cuda'
29- modules = ['craype-accel-nvidia60' , 'cdt-cuda' ]
3029 num_tasks = 0
3130 num_gpus_per_node = 1
32- burn_time = variable (int , value = 10 )
33- executable_opts = ['-d' , f'{ burn_time } ' ]
3431 perf_floor = variable (float , value = - 0.2 )
3532 tags = {'production' }
3633
@@ -44,7 +41,7 @@ def set_launcher_opts(self):
4441 self .job .launcher .options = ['-u' ]
4542
4643 @sanity_function
47- def set_sanity_patterns (self ):
44+ def assert_jobreport_success (self ):
4845 '''Extend sanity and wait for the jobreport.
4946
5047 If a large number of nodes is used, the final jobreport output happens
@@ -67,32 +64,33 @@ def set_sanity_patterns(self):
6764 def gpu_usage_sanity (self ):
6865 '''Verify that the jobreport output has sensible numbers.
6966
70- This function asserts that the nodes reported are at least a subset of
71- all nodes used by the gpu burn app. Also, the GPU usage is verified by
72- assuming that in the worst case scenario, the usage is near 100% during
73- the burn, and 0% outside the burn period. Lastly, the GPU usage time
74- for each node is also asserted to be greater or equal than the burn
75- time.
67+ The GPU usage is verified by assuming that in the worst case scenario,
68+ the usage is near 100% during the burn, and 0% outside the burn period.
69+ Lastly, the GPU usage time for each node is also asserted to be greater
70+ or equal than the burn time.
7671 '''
7772
78- # Get set with all nodes
79- patt = r'^\s*\[([^\]]*)\]\s*GPU\s*\d+\(OK\)'
80- full_node_set = set (sn .extractall (patt , self .stdout , 1 ))
81-
8273 # Parse job report data
8374 patt = r'^\s*(\w*)\s*(\d+)\s*%\s*\d+\s*MiB\s*\d+:\d+:(\d+)'
8475 self .nodes_reported = sn .extractall (patt , self .stdout , 1 )
76+ self .num_tasks_assigned = self .job .num_tasks * self .num_gpus_per_node
8577 usage = sn .extractall (patt , self .stdout , 2 , int )
8678 time_reported = sn .extractall (patt , self .stdout , 3 , int )
8779 return sn .all ([
8880 sn .assert_ge (sn .count (self .nodes_reported ), 1 ),
89- set (self .nodes_reported ).issubset (full_node_set ),
9081 sn .all (
91- map (lambda x , y : self .burn_time / x <= y , time_reported , usage )
82+ map (lambda x , y : self .duration / x <= y / 100 , time_reported , usage )
9283 ),
93- sn .assert_ge (sn .min (time_reported ), self .burn_time )
84+ sn .assert_ge (sn .min (time_reported ), self .duration )
9485 ])
9586
87+ @deferrable
88+ def count_successful_burns (self ):
89+ '''Set the sanity patterns to count the number of successful burns.'''
90+ return sn .assert_eq (sn .count (sn .findall (r'^GPU\s*\d+\(OK\)' ,
91+ self .stdout )),
92+ self .num_tasks_assigned )
93+
9694 @performance_function ('nodes' )
9795 def total_nodes_reported (self ):
9896 return sn .count (self .nodes_reported )
0 commit comments