Base CSCS TF2 test on the library

Vasileios Karakasis · Vasileios Karakasis · commit 6237d2f7b1f8 · 2021-11-16T18:01:56.000+01:00
diff --git a/cscs-checks/apps/tensorflow/tf2_horovod_check.py b/cscs-checks/apps/tensorflow/tf2_horovod_check.py
@@ -3,69 +3,58 @@
 #
 # SPDX-License-Identifier: BSD-3-Clause
 
+import contextlib
 import reframe as rfm
 import reframe.utility.osext as osext
 
-from hpctestlib.apps.tensorflow.base_check import TensorFlow2Horovod_BaseTest
-
-
-REFERENCE_SMALL_PERFORMANCE = {
-    'dom:gpu': {
-        'throughput': (1712, -0.05, None, 'images/s'),
-        'throughput_per_gpu': (214, -0.05, None, 'images/s'),
-    },
-    'daint:gpu': {
-        'throughput': (1712, -0.05, None, 'images/s'),
-        'throughput_per_gpu': (214, -0.05, None, 'images/s')
-    },
-}
-
-REFERENCE_LARGE_PERFORMANCE = {
-    'daint:gpu': {
-        'throughput': (6848, -0.05, None, 'images/s'),
-        'throughput_per_gpu': (214, -0.05, None, 'images/s')
-    },
-}
+from hpctestlib.ml.tensorflow.horovod import tensorflow_cnn_check
 
 
 @rfm.simple_test
-class TensorFlow2HorovodTestCSCS(TensorFlow2Horovod_BaseTest):
-    variant = parameter(['small', 'large'])
-    sourcesdir = None
+class cscs_tensorflow_horovod_check(tensorflow_cnn_check):
+    num_nodes = parameter([8, 32])  # each value needs a matching allref key
     num_tasks_per_node = 1
-    num_cpus_per_task = 12
     batch_size = 64
-    tags = {'production'}
-    maintainers = ['RS', 'TR']
-    valid_prog_environs = ['builtin']
     valid_systems = ['daint:gpu']
+    valid_prog_environs = ['builtin']
     modules = [
         f'Horovod/0.21.0-CrayGNU-{osext.cray_cdt_version()}-tf-2.4.0'
     ]
+    tags |= {'production'}
+    maintainers = ['sarafael', 'henrique']
+    allref = {  # looked up in setup_run as allref[num_nodes]['sm_60']
+        8: {
+            'sm_60': {
+                'throughput_total': (1712, -0.05, None, 'images/s'),
+                'throughput_iteration': (214, -0.05, None, 'images/s')
+            }
+        },
+        32: {  # was 16: never matched a num_nodes value, lookup was skipped
+            'sm_60': {
+                'throughput_total': (6848, -0.05, None, 'images/s'),
+                'throughput_iteration': (214, -0.05, None, 'images/s')
+            }
+        },
+    }
 
     @run_after('init')
-    def set_num_task(self):
-        if self.variant == 'small':
+    def setup_filtering_criteria(self):  # extends valid_systems per variant
+        if self.num_nodes == 8:  # formerly the 'small' (8-task) variant
             self.valid_systems += ['dom:gpu']
-            self.num_tasks = 8
-            self.reference = REFERENCE_SMALL_PERFORMANCE
-        else:
-            self.num_tasks = 32
-            self.reference = REFERENCE_LARGE_PERFORMANCE
 
-    @run_after('init')
-    def set_executable_opts(self):
+    @run_before('run')
+    def setup_run(self):  # derives task/CPU counts, reference and env vars
+        proc = self.current_partition.processor
+        self.num_tasks = self.num_nodes * self.num_tasks_per_node
+        self.num_cpus_per_task = proc.num_cores  # also feeds OMP_NUM_THREADS
+        with contextlib.suppress(KeyError):  # missing entry: reference kept
+            self.reference = {
+                '*': self.allref[self.num_nodes]['sm_60']
+            }
+
         self.variables = {
             'NCCL_DEBUG': 'INFO',
             'NCCL_IB_HCA': 'ipogif0',
             'NCCL_IB_CUDA_SUPPORT': '1',
-            'OMP_NUM_THREADS': '$SLURM_CPUS_PER_TASK',
+            'OMP_NUM_THREADS': str(self.num_cpus_per_task)
         }
-        self.executable_opts = [
-            f'{self.script}',
-            f'--model {self.model}',
-            f'--batch-size {self.batch_size}',
-            '--num-iters 5',
-            '--num-batches-per-iter 5',
-            '--num-warmup-batches 5',
-        ]
diff --git a/hpctestlib/ml/tensorflow/horovod.py b/hpctestlib/ml/tensorflow/horovod.py
@@ -43,7 +43,6 @@ class tensorflow_cnn_check(rfm.RunOnlyRegressionTest, pin_prefix=True):
 
     executable = 'python tensorflow2_synthetic_benchmark.py'
     tags = {'ml', 'cnn', 'horovod'}
-    maintainers = ['sarafael', 'henrique']
 
     @run_after('init')
     def prepare_test(self):