Merge pull request #1653 from jgphpc/UES-1279

Vasileios Karakasis · web-flow · commit 8134945105f0 · 2020-12-16T22:49:25.000+01:00
[test] Extend GPU burn test to report GPU node with smallest flops
diff --git a/cscs-checks/microbenchmarks/gpu/gpu_burn/gpu_burn_test.py b/cscs-checks/microbenchmarks/gpu/gpu_burn/gpu_burn_test.py
@@ -3,6 +3,8 @@
 #
 # SPDX-License-Identifier: BSD-3-Clause
 
+import os
+
 import reframe as rfm
 import reframe.utility.sanity as sn
 
@@ -31,41 +33,35 @@ def __init__(self):
                 r'(?P<temp>\S*) Celsius')
         self.perf_patterns = {
             'perf': sn.min(sn.extractall(patt, self.stdout, 'perf', float)),
+            'temp': sn.max(sn.extractall(patt, self.stdout, 'temp', float)),
         }
 
         self.reference = {
             'dom:gpu': {
                 'perf': (4115, -0.10, None, 'Gflop/s'),
-                'max_temp': (0, None, None, 'Celsius')
             },
             'daint:gpu': {
                 'perf': (4115, -0.10, None, 'Gflop/s'),
-                'max_temp': (0, None, None, 'Celsius')
             },
             'arolla:cn': {
                 'perf': (5861, -0.10, None, 'Gflop/s'),
-                'max_temp': (0, None, None, 'Celsius')
             },
             'tsa:cn': {
                 'perf': (5861, -0.10, None, 'Gflop/s'),
-                'max_temp': (0, None, None, 'Celsius')
             },
             'ault:amda100': {
                 'perf': (15000, -0.10, None, 'Gflop/s'),
-                'max_temp': (0, None, None, 'Celsius')
             },
             'ault:amdv100': {
                 'perf': (5500, -0.10, None, 'Gflop/s'),
-                'max_temp': (0, None, None, 'Celsius')
             },
             'ault:intelv100': {
                 'perf': (5500, -0.10, None, 'Gflop/s'),
-                'max_temp': (0, None, None, 'Celsius')
             },
             'ault:amdvega': {
                 'perf': (3450, -0.10, None, 'Gflop/s'),
-                'max_temp': (0, None, None, 'Celsius')
             },
+            '*': {'temp': (0, None, None, 'degC')}
         }
 
         self.maintainers = ['AJ', 'TM']
@@ -133,3 +129,18 @@ def set_gpus_per_node(self):
             self.num_gpus_per_node = 3
         else:
             self.num_gpus_per_node = 1
+
+    @rfm.run_before('performance')
+    def report_nid_with_smallest_flops(self):
+        regex = r'\[(\S+)\] GPU\s+\d\(OK\): (\d+) GF/s'
+        rptf = os.path.join(self.stagedir, sn.evaluate(self.stdout))
+        self.nids = sn.extractall(regex, rptf, 1)
+        self.flops = sn.extractall(regex, rptf, 2, float)
+
+        # Find the index of the smallest flops and patch the unit of every
+        # non-temp reference so that it also reports the nid at that index
+        index = self.flops.evaluate().index(min(self.flops))
+        unit = f'GF/s ({self.nids[index]})'
+        for key, ref in self.reference.items():
+            if not key.endswith(':temp'):
+                self.reference[key] = (*ref[:3], unit)