Merge pull request #2182 from hurricane642/tensorflow_test

Vasileios Karakasis · web-flow · commit 2b1d388d2a78 · 2021-11-16T21:46:33.000+01:00
[testlib] Add a generic test for TensorFlow with Horovod
diff --git a/cscs-checks/apps/tensorflow/tf2_horovod_check.py b/cscs-checks/apps/tensorflow/tf2_horovod_check.py
@@ -3,81 +3,58 @@
 #
 # SPDX-License-Identifier: BSD-3-Clause
 
+import contextlib
 import reframe as rfm
-import reframe.utility.sanity as sn
 import reframe.utility.osext as osext
 
+from hpctestlib.ml.tensorflow.horovod import tensorflow_cnn_check
+
 
 @rfm.simple_test
-class TensorFlow2HorovodTest(rfm.RunOnlyRegressionTest):
-    variant = parameter(['small', 'large'])
+class cscs_tensorflow_horovod_check(tensorflow_cnn_check):
+    num_nodes = parameter([8, 32])
+    num_tasks_per_node = 1
+    batch_size = 64
+    valid_systems = ['daint:gpu']
+    valid_prog_environs = ['builtin']
+    modules = [
+        f'Horovod/0.21.0-CrayGNU-{osext.cray_cdt_version()}-tf-2.4.0'
+    ]
+    tags |= {'production'}
+    maintainers = ['sarafael', 'henrique']
+    allref = {  # reference tuples, looked up by num_nodes in setup_run()
+        8: {
+            'sm_60': {
+                'throughput_total': (1712, -0.05, None, 'images/s'),
+                'throughput_iteration': (214, -0.05, None, 'images/s')
+            }
+        },
+        32: {  # 32 x 214 = 6848; key must match a num_nodes parameter value
+            'sm_60': {
+                'throughput_total': (6848, -0.05, None, 'images/s'),
+                'throughput_iteration': (214, -0.05, None, 'images/s')
+            }
+        },
+    }
 
-    def __init__(self):
-        self.descr = 'Distributed training with TensorFlow2 and Horovod'
-        self.valid_systems = ['daint:gpu']
-        self.valid_prog_environs = ['builtin']
-        self.modules = [
-            f'Horovod/0.21.0-CrayGNU-{osext.cray_cdt_version()}-tf-2.4.0'
-        ]
-        self.sourcesdir = None
-        self.num_tasks_per_node = 1
-        self.num_cpus_per_task = 12
-        if self.variant == 'small':
+    @run_after('init')
+    def setup_filtering_criteria(self):
+        if self.num_nodes == 8:  # small variant also runs on Dom
             self.valid_systems += ['dom:gpu']
-            self.num_tasks = 8
-            self.reference = {
-                'dom:gpu': {
-                    'throughput': (1712, -0.05, None, 'images/s'),
-                    'throughput_per_gpu': (214, -0.05, None, 'images/s'),
-                },
-                'daint:gpu': {
-                    'throughput': (1712, -0.05, None, 'images/s'),
-                    'throughput_per_gpu': (214, -0.05, None, 'images/s')
-                },
-            }
-        else:
-            self.num_tasks = 32
+
+    @run_before('run')
+    def setup_run(self):
+        proc = self.current_partition.processor
+        self.num_tasks = self.num_nodes * self.num_tasks_per_node
+        self.num_cpus_per_task = proc.num_cores
+        with contextlib.suppress(KeyError):
             self.reference = {
-                'daint:gpu': {
-                    'throughput': (6848, -0.05, None, 'images/s'),
-                    'throughput_per_gpu': (214, -0.05, None, 'images/s')
-                },
+                '*': self.allref[self.num_nodes]['sm_60']
             }
-        self.perf_patterns = {
-            'throughput': sn.extractsingle(
-                rf'Total img/sec on {self.num_tasks} GPU\(s\): '
-                rf'(?P<throughput>\S+) \S+',
-                self.stdout, 'throughput', float),
-            'throughput_per_gpu': sn.extractsingle(
-                r'Img/sec per GPU: (?P<throughput_per_gpu>\S+) \S+',
-                self.stdout, 'throughput_per_gpu', float)
-        }
-        model = 'InceptionV3'
-        batch_size = 64
-        self.sanity_patterns = sn.all([
-            sn.assert_found(rf'Model: {model}', self.stdout),
-            sn.assert_found(rf'Batch size: {batch_size}', self.stdout)
-        ])
+
         self.variables = {
             'NCCL_DEBUG': 'INFO',
             'NCCL_IB_HCA': 'ipogif0',
             'NCCL_IB_CUDA_SUPPORT': '1',
-            'OMP_NUM_THREADS': '$SLURM_CPUS_PER_TASK',
+            'OMP_NUM_THREADS': str(self.num_cpus_per_task)
         }
-        script = 'tensorflow2_synthetic_benchmark.py'
-        self.prerun_cmds = ['wget https://raw.githubusercontent.com/horovod/'
-                            'horovod/842d1075e8440f15e84364f494645c28bf20c3ae/'
-                            'examples/tensorflow2_synthetic_benchmark.py',
-                            'sed -i "s/weights=None/weights=None, '
-                            f'input_shape=(224, 224, 3)/g" {script}']
-        self.executable = 'python'
-        self.executable_opts = [
-            f'{script}',
-            f'--model {model}',
-            f'--batch-size {batch_size}',
-            '--num-iters 5',
-            '--num-batches-per-iter 5',
-            '--num-warmup-batches 5',
-        ]
-        self.tags = {'production'}
-        self.maintainers = ['RS', 'TR']
diff --git a/docs/hpctestlib.rst b/docs/hpctestlib.rst
@@ -34,3 +34,11 @@ Interactive Computing
 .. automodule:: hpctestlib.interactive.jupyter.ipcmagic
    :members:
    :show-inheritance:
+
+
+Machine Learning
+----------------
+
+.. automodule:: hpctestlib.ml.tensorflow.horovod
+   :members:
+   :show-inheritance:
diff --git a/hpctestlib/ml/tensorflow/horovod.py b/hpctestlib/ml/tensorflow/horovod.py
@@ -0,0 +1,107 @@
+# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
+# ReFrame Project Developers. See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+
+@rfm.simple_test
+class tensorflow_cnn_check(rfm.RunOnlyRegressionTest, pin_prefix=True):
+    '''Run a synthetic CNN benchmark with TensorFlow2 and Horovod.
+
+    TensorFlow is an end-to-end open source platform for machine learning. It
+    has a comprehensive, flexible ecosystem of tools, libraries and community
+    resources that lets researchers push the state-of-the-art in ML and
+    developers easily build and deploy ML powered applications. For more
+    information, refer to `<https://www.tensorflow.org/>`__.
+
+    Horovod is a distributed deep learning training framework for TensorFlow,
+    Keras, PyTorch, and Apache MXNet. The goal of Horovod is to make
+    distributed deep learning fast and easy to use. For more information refer
+    to `<https://github.com/horovod/horovod>`__.
+
+    This test runs the Horovod ``tensorflow2_synthetic_benchmark.py``
+    example, checks its sanity and extracts the GPU performance.
+    '''
+
+    #: The Horovod version (Git ref) to fetch the benchmark script from.
+    #:
+    #: :type: :class:`str`
+    #: :default: ``'v0.21.0'``
+    benchmark_version = variable(str, value='v0.21.0')
+
+    #: The name of the model to use for this benchmark.
+    #:
+    #: :type: :class:`str`
+    #: :default: ``'InceptionV3'``
+    model = variable(str, value='InceptionV3')
+
+    #: The batch size to use when training the model.
+    #:
+    #: :type: :class:`int`
+    #: :default: ``32``
+    batch_size = variable(int, value=32)
+
+    #: The number of iterations.
+    #:
+    #: :type: :class:`int`
+    #: :default: ``5``
+    num_iters = variable(int, value=5)
+
+    #: The number of batches per iteration.
+    #:
+    #: :type: :class:`int`
+    #: :default: ``5``
+    num_batches_per_iter = variable(int, value=5)
+
+    #: The number of warmup batches.
+    #:
+    #: :type: :class:`int`
+    #: :default: ``5``
+    num_warmup_batches = variable(int, value=5)
+
+    executable = 'python tensorflow2_synthetic_benchmark.py'
+    tags = {'ml', 'cnn', 'horovod'}
+
+    @run_after('init')
+    def prepare_test(self):
+        # Get the python script
+        script = self.executable.split()[1]
+
+        self.descr = (f'Distributed CNN training with TensorFlow2 and Horovod '
+                      f'(model: {self.model})')
+        self.prerun_cmds = [
+            f'curl -LJO https://raw.githubusercontent.com/horovod/horovod/{self.benchmark_version}/examples/tensorflow2/{script}',  # noqa: E501
+            f'sed -i "s/weights=None/weights=None, input_shape=(224, 224, 3)/g" {script}'   # noqa: E501
+        ]
+        self.executable_opts = [
+            f'--model {self.model}',
+            f'--batch-size {self.batch_size}',
+            f'--num-iters {self.num_iters}',
+            f'--num-batches-per-iter {self.num_batches_per_iter}',
+            f'--num-warmup-batches {self.num_warmup_batches}'
+        ]
+
+    @performance_function('images/s')
+    def throughput_iteration(self):
+        '''The average GPU throughput per iteration in ``images/s``.'''
+        return sn.avg(
+            sn.extractall(r'Img/sec per GPU: (\S+) \S+', self.stdout, 1, float)
+        )
+
+    @performance_function('images/s')
+    def throughput_total(self):
+        '''The total GPU throughput of the benchmark in ``images/s``.'''
+        return sn.extractsingle(
+            rf'Total img/sec on {self.num_tasks} GPU\(s\): (\S+) \S+',
+            self.stdout, 1, float
+        )
+
+    @sanity_function
+    def validate_run(self):
+        return sn.all([
+            sn.assert_found(rf'Model: {self.model}', self.stdout),
+            sn.assert_found(rf'Batch size: {self.batch_size}', self.stdout)
+        ])