Skip to content

Commit 554c0c6

Browse files
author
Vasileios Karakasis
committed
Add TensorFlow+Horovod library test
1 parent e1e22bc commit 554c0c6

File tree

2 files changed

+85
-72
lines changed

2 files changed

+85
-72
lines changed

hpctestlib/apps/tensorflow/base_check.py

Lines changed: 0 additions & 72 deletions
This file was deleted.
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
2+
# ReFrame Project Developers. See the top-level LICENSE file for details.
3+
#
4+
# SPDX-License-Identifier: BSD-3-Clause
5+
6+
import reframe as rfm
7+
import reframe.utility.sanity as sn
8+
9+
10+
@rfm.simple_test
class tensorflow_cnn_check(rfm.RunOnlyRegressionTest, pin_prefix=True):
    '''Synthetic CNN training benchmark with TensorFlow2 and Horovod.

    TensorFlow is an end-to-end open source platform for machine
    learning. It has a comprehensive, flexible ecosystem of tools,
    libraries and community resources that lets researchers push the
    state-of-the-art in ML and developers easily build and deploy ML
    powered applications (see tensorflow.org).

    Horovod is a distributed deep learning training framework for
    TensorFlow, Keras, PyTorch, and Apache MXNet. The goal of Horovod is
    to make distributed deep learning fast and easy to use
    (see github.com/horovod/horovod).

    This test measures the performance of TensorFlow2 and Horovod using
    the classic deep learning model Inception v3 (configurable through
    the ``model`` variable). The sanity check verifies that the benchmark
    echoes the requested model and batch size and that it runs to
    completion, i.e., that the final throughput summary is printed.

    The test downloads the benchmark script from the Horovod repository
    before running, so the node executing the pre-run commands needs
    network access. The default assumption is that TensorFlow2 and
    Horovod are already installed on the system under test.
    '''

    # Git ref of the Horovod repository from which the benchmark script
    # is downloaded
    benchmark_version = variable(str, value='v0.21.0')

    # Name of the model used for the testing
    model = variable(str, value='InceptionV3')

    # Size of the batch used during the learning of models
    batch_size = variable(int, value=32)

    # Values forwarded verbatim to the benchmark script as `--num-iters`,
    # `--num-batches-per-iter` and `--num-warmup-batches`, respectively
    num_iters = variable(int, value=5)
    num_batches_per_iter = variable(int, value=5)
    num_warmup_batches = variable(int, value=5)

    executable = 'python tensorflow2_synthetic_benchmark.py'
    tags = {'ml', 'cnn', 'horovod'}
    maintainers = ['sarafael', 'henrique']

    @run_after('init')
    def prepare_test(self):
        '''Set the description, pre-run commands and executable options.

        The benchmark script is fetched at run time from the Horovod
        repository at the ref given by ``benchmark_version`` and is then
        patched in place with ``sed``.
        '''

        # Get the python script: `executable` has the form
        # `python <script>`, so the script name is its second word
        script = self.executable.split()[1]

        self.descr = (f'Distributed CNN training with TensorFlow2 and Horovod '
                      f'(model: {self.model})')

        # The `sed` command adds an explicit `input_shape` wherever the
        # script passes `weights=None`.
        # NOTE(review): presumably this pins the model input size to the
        # size of the synthetic images the script generates -- confirm
        # against the benchmark script.
        self.prerun_cmds = [
            f'curl -LJO https://raw.githubusercontent.com/horovod/horovod/{self.benchmark_version}/examples/tensorflow2/{script}',  # noqa: E501
            f'sed -i "s/weights=None/weights=None, input_shape=(224, 224, 3)/g" {script}'  # noqa: E501
        ]
        self.executable_opts = [
            f'--model {self.model}',
            f'--batch-size {self.batch_size}',
            f'--num-iters {self.num_iters}',
            f'--num-batches-per-iter {self.num_batches_per_iter}',
            f'--num-warmup-batches {self.num_warmup_batches}'
        ]

    @performance_function('images/s')
    def throughput_iteration(self):
        '''Average of all ``Img/sec per GPU`` values found in the output.'''

        return sn.avg(
            sn.extractall(r'Img/sec per GPU: (\S+) \S+', self.stdout, 1, float)
        )

    @performance_function('images/s')
    def throughput_total(self):
        '''Total throughput reported for ``num_tasks`` GPUs.

        The pattern embeds ``self.num_tasks``, so it only matches when the
        GPU count printed by the benchmark equals the number of tasks of
        the test.
        '''

        return sn.extractsingle(
            rf'Total img/sec on {self.num_tasks} GPU\(s\): (\S+) \S+',
            self.stdout, 1, float
        )

    @sanity_function
    def validate_run(self):
        '''Check the run configuration and that the benchmark completed.

        The first two assertions verify that the benchmark echoes the
        requested model and batch size. These lines alone do not prove
        that training ran, so the summary line that ``throughput_total``
        parses is required as well; a run that dies during warm-up or
        training therefore fails sanity instead of passing it.
        '''

        return sn.all([
            sn.assert_found(rf'Model: {self.model}', self.stdout),
            sn.assert_found(rf'Batch size: {self.batch_size}', self.stdout),
            sn.assert_found(
                r'Total img/sec on \d+', self.stdout,
                msg='benchmark did not run to completion: '
                    'final throughput summary not found'
            )
        ])

0 commit comments

Comments
 (0)