Skip to content

Commit 2b1d388

Browse files
author
Vasileios Karakasis
authored
Merge pull request #2182 from hurricane642/tensorflow_test
[testlib] Add a generic test for TensorFlow with Horovod
2 parents 82a6bc8 + 6bcdb55 commit 2b1d388

File tree

3 files changed

+156
-64
lines changed

3 files changed

+156
-64
lines changed

cscs-checks/apps/tensorflow/tf2_horovod_check.py

Lines changed: 41 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -3,81 +3,58 @@
33
#
44
# SPDX-License-Identifier: BSD-3-Clause
55

6+
import contextlib
67
import reframe as rfm
7-
import reframe.utility.sanity as sn
88
import reframe.utility.osext as osext
99

10+
from hpctestlib.ml.tensorflow.horovod import tensorflow_cnn_check
11+
1012

1113
@rfm.simple_test
class cscs_tensorflow_horovod_check(tensorflow_cnn_check):
    '''CSCS site-specific instance of the TensorFlow2/Horovod CNN benchmark.

    The test is parameterized over the number of nodes and runs one task per
    node with a batch size of 64. Performance references are looked up in
    :attr:`allref` by node count.
    '''

    num_nodes = parameter([8, 32])
    num_tasks_per_node = 1
    batch_size = 64
    valid_systems = ['daint:gpu']
    valid_prog_environs = ['builtin']
    modules = [
        f'Horovod/0.21.0-CrayGNU-{osext.cray_cdt_version()}-tf-2.4.0'
    ]
    tags |= {'production'}
    maintainers = ['sarafael', 'henrique']

    # Performance references, keyed first by node count and then by 'sm_60'
    # (presumably the GPU architecture of the valid systems -- confirm).
    # The outer keys must match the values of the `num_nodes` parameter,
    # otherwise the lookup in `setup_run()` finds nothing and the variant
    # runs without any reference.
    allref = {
        8: {
            'sm_60': {
                'throughput_total': (1712, -0.05, None, 'images/s'),
                'throughput_iteration': (214, -0.05, None, 'images/s')
            }
        },
        32: {
            'sm_60': {
                'throughput_total': (6848, -0.05, None, 'images/s'),
                'throughput_iteration': (214, -0.05, None, 'images/s')
            }
        },
    }

    @run_after('init')
    def setup_filtering_criteria(self):
        '''Additionally enable the 8-node variant on ``dom:gpu``.

        Before this test was ported to the library base class, only its
        8-task ("small") variant was valid on ``dom:gpu``; the 32-task one
        ran on ``daint:gpu`` exclusively.
        '''
        if self.num_nodes == 8:
            self.valid_systems += ['dom:gpu']

    @run_before('run')
    def setup_run(self):
        '''Set the job geometry, the references and the run environment.'''
        # NOTE(review): assumes the processor topology of the partition is
        # configured, so that `num_cores` is set -- confirm.
        proc = self.current_partition.processor
        self.num_tasks = self.num_nodes * self.num_tasks_per_node
        self.num_cpus_per_task = proc.num_cores

        # Best effort: a node count without an entry in `allref` simply runs
        # without performance references.
        with contextlib.suppress(KeyError):
            self.reference = {
                '*': self.allref[self.num_nodes]['sm_60']
            }

        self.variables = {
            'NCCL_DEBUG': 'INFO',
            'NCCL_IB_HCA': 'ipogif0',
            'NCCL_IB_CUDA_SUPPORT': '1',
            'OMP_NUM_THREADS': str(self.num_cpus_per_task)
        }

docs/hpctestlib.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,11 @@ Interactive Computing
3434
.. automodule:: hpctestlib.interactive.jupyter.ipcmagic
3535
:members:
3636
:show-inheritance:
37+
38+
39+
Machine Learning
40+
----------------
41+
42+
.. automodule:: hpctestlib.ml.tensorflow.horovod
43+
:members:
44+
:show-inheritance:
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
2+
# ReFrame Project Developers. See the top-level LICENSE file for details.
3+
#
4+
# SPDX-License-Identifier: BSD-3-Clause
5+
6+
import reframe as rfm
7+
import reframe.utility.sanity as sn
8+
9+
10+
@rfm.simple_test
11+
class tensorflow_cnn_check(rfm.RunOnlyRegressionTest, pin_prefix=True):
12+
'''Run a synthetic CNN benchmark with TensorFlow2 and Horovod.
13+
14+
TensorFlow is an end-to-end open source platform for machine learning. It
15+
has a comprehensive, flexible ecosystem of tools, libraries and community
16+
resources that lets researchers push the state-of-the-art in ML and
17+
developers easily build and deploy ML powered applications. For more
18+
information, refer to `<https://www.tensorflow.org/>`__.
19+
20+
Horovod is a distributed deep learning training framework for TensorFlow,
21+
Keras, PyTorch, and Apache MXNet. The goal of Horovod is to make
22+
distributed deep learning fast and easy to use. For more information refer
23+
to `<https://github.com/horovod/horovod>`__.
24+
25+
This test runs the Horovod ``tensorflow2_synthentic_benchmark.py``
26+
example, checks its sanity and extracts the GPU performance.
27+
'''
28+
29+
#: The version of Horovod to use.
30+
#:
31+
#: :type: :class:`str`
32+
#: :default: ``'v0.21.0'``
33+
benchmark_version = variable(str, value='v0.21.0')
34+
35+
#: The name of the model to use for this benchmark.
36+
#:
37+
#: :type: :class:`str`
38+
#: :default: ``'InceptionV3'``
39+
model = variable(str, value='InceptionV3')
40+
41+
#: The size of the batch used during the learning of models.
42+
#:
43+
#: :type: :class:`int`
44+
#: :default: ``32``
45+
batch_size = variable(int, value=32)
46+
47+
#: The number of iterations.
48+
#:
49+
#: :type: :class:`int`
50+
#: :default: ``5``
51+
num_iters = variable(int, value=5)
52+
53+
#: The number of batches per iteration.
54+
#:
55+
#: :type: :class:`int`
56+
#: :default: ``5``
57+
num_batches_per_iter = variable(int, value=5)
58+
59+
#: The number of warmup batches
60+
#:
61+
#: :type: :class:`int`
62+
#: :default: ``5``
63+
num_warmup_batches = variable(int, value=5)
64+
65+
executable = 'python tensorflow2_synthetic_benchmark.py'
66+
tags = {'ml', 'cnn', 'horovod'}
67+
68+
@run_after('init')
69+
def prepare_test(self):
70+
# Get the python script
71+
script = self.executable.split()[1]
72+
73+
self.descr = (f'Distributed CNN training with TensorFlow2 and Horovod '
74+
f'(model: {self.model})')
75+
self.prerun_cmds = [
76+
f'curl -LJO https://raw.githubusercontent.com/horovod/horovod/{self.benchmark_version}/examples/tensorflow2/{script}', # noqa: E501
77+
f'sed -i "s/weights=None/weights=None, input_shape=(224, 224, 3)/g" {script}' # noqa: E501
78+
]
79+
self.executable_opts = [
80+
f'--model {self.model}',
81+
f'--batch-size {self.batch_size}',
82+
f'--num-iters {self.num_iters}',
83+
f'--num-batches-per-iter {self.num_batches_per_iter}',
84+
f'--num-warmup-batches {self.num_warmup_batches}'
85+
]
86+
87+
@performance_function('images/s')
88+
def throughput_iteration(self):
89+
'''The average GPU throughput per iteration in ``images/s``.'''
90+
return sn.avg(
91+
sn.extractall(r'Img/sec per GPU: (\S+) \S+', self.stdout, 1, float)
92+
)
93+
94+
@performance_function('images/s')
95+
def throughput_total(self):
96+
'''The total GPU throughput of the benchmark in ``images/s``.'''
97+
return sn.extractsingle(
98+
rf'Total img/sec on {self.num_tasks} GPU\(s\): (\S+) \S+',
99+
self.stdout, 1, float
100+
)
101+
102+
@sanity_function
103+
def validate_run(self):
104+
return sn.all([
105+
sn.assert_found(rf'Model: {self.model}', self.stdout),
106+
sn.assert_found(rf'Batch size: {self.batch_size}', self.stdout)
107+
])

0 commit comments

Comments
 (0)