|
| 1 | +# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) |
| 2 | +# ReFrame Project Developers. See the top-level LICENSE file for details. |
| 3 | +# |
| 4 | +# SPDX-License-Identifier: BSD-3-Clause |
| 5 | + |
| 6 | +import reframe as rfm |
| 7 | +import reframe.utility.sanity as sn |
| 8 | + |
| 9 | + |
@rfm.simple_test
class tensorflow_cnn_check(rfm.RunOnlyRegressionTest, pin_prefix=True):
    '''Synthetic CNN training benchmark with TensorFlow2 and Horovod.

    TensorFlow is an end-to-end open source platform for machine
    learning. It has a comprehensive, flexible ecosystem of tools,
    libraries and community resources that lets researchers push the
    state-of-the-art in ML and developers easily build and deploy ML
    powered applications (see tensorflow.org).

    Horovod is a distributed deep learning training framework for
    TensorFlow, Keras, PyTorch, and Apache MXNet. The goal of Horovod
    is to make distributed deep learning fast and easy to use
    (see github.com/horovod/horovod).

    This test measures the performance of TensorFlow2 and Horovod by
    running Horovod's ``tensorflow2_synthetic_benchmark.py`` example
    with a classic deep learning model (Inception v3 by default). The
    sanity check verifies that the run used the requested model and
    batch size and that it reached the final throughput summary. The
    default assumption is that TensorFlow2 and Horovod are already
    installed on the system under test.
    '''

    # Git reference of the Horovod repository from which the benchmark
    # script is downloaded (interpolated into the download URL).
    benchmark_version = variable(str, value='v0.21.0')

    # Name of the model used for the testing (passed as ``--model``).
    model = variable(str, value='InceptionV3')

    # Size of the batch used during the learning of models
    # (passed as ``--batch-size``).
    batch_size = variable(int, value=32)

    # Values forwarded verbatim to the benchmark script's
    # ``--num-iters``, ``--num-batches-per-iter`` and
    # ``--num-warmup-batches`` options, respectively.
    num_iters = variable(int, value=5)
    num_batches_per_iter = variable(int, value=5)
    num_warmup_batches = variable(int, value=5)

    executable = 'python tensorflow2_synthetic_benchmark.py'
    tags = {'ml', 'cnn', 'horovod'}
    maintainers = ['sarafael', 'henrique']

    @run_after('init')
    def prepare_test(self):
        '''Set the description, pre-run commands and executable options.

        The pre-run commands download the benchmark script for the
        selected :attr:`benchmark_version` and patch it in place; the
        executable options forward the test variables to the script.
        '''

        # The script name is the second word of ``executable``
        # (``python <script>``).
        script = self.executable.split()[1]

        self.descr = ('Distributed CNN training with TensorFlow2 and Horovod '
                      f'(model: {self.model})')
        self.prerun_cmds = [
            # Fetch the benchmark script matching `benchmark_version`.
            f'curl -LJO https://raw.githubusercontent.com/horovod/horovod/{self.benchmark_version}/examples/tensorflow2/{script}',  # noqa: E501
            # Rewrite every `weights=None` in the script so that it also
            # passes an explicit `input_shape=(224, 224, 3)`.
            f'sed -i "s/weights=None/weights=None, input_shape=(224, 224, 3)/g" {script}'  # noqa: E501
        ]
        self.executable_opts = [
            f'--model {self.model}',
            f'--batch-size {self.batch_size}',
            f'--num-iters {self.num_iters}',
            f'--num-batches-per-iter {self.num_batches_per_iter}',
            f'--num-warmup-batches {self.num_warmup_batches}'
        ]

    @performance_function('images/s')
    def throughput_iteration(self):
        '''Average of all ``Img/sec per GPU`` values found in stdout.'''

        return sn.avg(
            sn.extractall(r'Img/sec per GPU: (\S+) \S+', self.stdout, 1, float)
        )

    @performance_function('images/s')
    def throughput_total(self):
        '''Total throughput reported in stdout for ``num_tasks`` GPUs.'''

        return sn.extractsingle(
            rf'Total img/sec on {self.num_tasks} GPU\(s\): (\S+) \S+',
            self.stdout, 1, float
        )

    @sanity_function
    def validate_run(self):
        '''Check that the benchmark ran as configured and ran to the end.

        The following must all be found in stdout:

        - the ``Model:`` line naming exactly :attr:`model`,
        - the ``Batch size:`` line showing exactly :attr:`batch_size`,
        - the total-throughput summary line.
        '''

        # Local import: the module-level imports of this file are not
        # touched by this block.
        import re

        # Escape the model name so that it is matched literally, and use
        # negative lookaheads so that a value which is merely a prefix of
        # the reported one (e.g. batch size 3 vs. 32) is not accepted.
        model_patt = rf'Model: {re.escape(self.model)}(?!\w)'
        batch_patt = rf'Batch size: {self.batch_size}(?!\d)'

        # The two lines above only echo the configuration; also require
        # the summary line that `throughput_total` parses, so that a run
        # which stopped before reporting its results fails the check.
        summary_patt = r'Total img/sec on \d+ \w+\(s\):'

        return sn.all([
            sn.assert_found(model_patt, self.stdout),
            sn.assert_found(batch_patt, self.stdout),
            sn.assert_found(summary_patt, self.stdout)
        ])
0 commit comments