Skip to content

Commit 2b2b499

Browse files
author
Vasileios Karakasis
authored
Merge pull request #2183 from hurricane642/pytorch
[testlib] Add PyTorch library test and base CSCS test on top of it
2 parents bd478fe + b2082db commit 2b2b499

File tree

3 files changed

+161
-68
lines changed

3 files changed

+161
-68
lines changed

cscs-checks/apps/pytorch/pytorch_horovod_check.py

Lines changed: 45 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -3,82 +3,59 @@
33
#
44
# SPDX-License-Identifier: BSD-3-Clause
55

6+
import contextlib
67
import reframe as rfm
7-
import reframe.utility.sanity as sn
88

9+
from hpctestlib.ml.pytorch.horovod import pytorch_cnn_check
910

10-
@rfm.parameterized_test(*[[model, mpi_task]
11-
for mpi_task in [32, 8, 1]
12-
for model in ['inception_v3', 'resnet50']])
13-
class PytorchHorovodTest(rfm.RunOnlyRegressionTest):
14-
def __init__(self, model, mpi_task):
15-
self.descr = 'Distributed training with Pytorch and Horovod'
16-
self.valid_systems = ['daint:gpu']
17-
if mpi_task < 20:
11+
12+
@rfm.simple_test
13+
class cscs_pytorch_horovod_check(pytorch_cnn_check):
14+
num_nodes = parameter([1, 8, 32])
15+
model_name = parameter(['inception_v3', 'resnet50'])
16+
num_tasks_per_node = 1
17+
batch_size = 64
18+
valid_systems = ['daint:gpu']
19+
valid_prog_environs = ['builtin']
20+
modules = ['PyTorch']
21+
tags |= {'production'}
22+
maintainers = ['sarafael', 'henrique']
23+
allref = {
24+
'sm_60': {
25+
'inception_v3': {
26+
'throughput_per_gpu': (131, -0.05, None, 'images/s'),
27+
},
28+
'resnet50': {
29+
'throughput_per_gpu': (201, -0.05, None, 'images/s'),
30+
}
31+
}
32+
}
33+
34+
@run_after('init')
35+
def setup_filtering_criteria(self):
36+
self.model = self.model_name
37+
if self.num_nodes == 8:
1838
self.valid_systems += ['dom:gpu']
1939

20-
self.valid_prog_environs = ['builtin']
21-
self.modules = ['PyTorch']
22-
self.num_tasks_per_node = 1
23-
self.num_cpus_per_task = 12
24-
self.num_tasks = mpi_task
25-
batch_size = 64
40+
@run_before('run')
41+
def setup_run(self):
42+
self.skip_if_no_procinfo()
43+
proc = self.current_partition.processor
44+
self.num_tasks = self.num_nodes * self.num_tasks_per_node
45+
self.num_cpus_per_task = proc.num_cores
2646
self.variables = {
2747
'NCCL_DEBUG': 'INFO',
2848
'NCCL_IB_HCA': 'ipogif0',
2949
'NCCL_IB_CUDA_SUPPORT': '1',
30-
'OMP_NUM_THREADS': '$SLURM_CPUS_PER_TASK',
50+
'OMP_NUM_THREADS': str(self.num_cpus_per_task)
3151
}
32-
hash = 'master'
33-
git_url = f'https://raw.githubusercontent.com/horovod/horovod/{hash}/examples/pytorch' # noqa: E501
34-
git_src = 'pytorch_synthetic_benchmark.py'
35-
self.prerun_cmds = [f'wget {git_url}/{git_src}']
36-
37-
if model == 'inception_v3':
38-
self.prerun_cmds += [
39-
'python3 -m venv --system-site-packages myvenv',
40-
'source myvenv/bin/activate',
41-
'pip install scipy',
42-
'sed -i "s-output = model(data)-output, aux = model(data)-"'
43-
f' {git_src}',
44-
'sed -i "s-data = torch.randn(args.batch_size, 3, 224, 224)-'
45-
f'data = torch.randn(args.batch_size, 3, 299, 299)-"'
46-
f' {git_src}'
47-
]
48-
49-
self.executable = 'python'
50-
self.executable_opts = [
51-
git_src,
52-
f'--model {model}',
53-
f'--batch-size {batch_size}',
54-
'--num-iters 5',
55-
'--num-batches-per-iter 5'
56-
]
57-
self.tags = {'production'}
58-
self.maintainers = ['RS', 'HM']
59-
self.sanity_patterns = sn.all([
60-
sn.assert_found(rf'Model: {model}', self.stdout),
61-
sn.assert_found(rf'Batch size: {batch_size}', self.stdout)
62-
])
63-
self.perf_patterns = {
64-
'throughput_per_gpu': sn.extractsingle(
65-
r'Img/sec per GPU: (?P<throughput_per_gpu>\S+) \S+',
66-
self.stdout, 'throughput_per_gpu', float
67-
),
68-
'throughput_per_job': sn.extractsingle(
69-
r'Total img/sec on \d+ GPU\(s\): (?P<throughput>\S+) \S+',
70-
self.stdout, 'throughput', float
71-
),
72-
}
73-
ref_per_gpu = 131 if model == 'inception_v3' else 201
74-
ref_per_job = ref_per_gpu * mpi_task
75-
self.reference = {
76-
'dom:gpu': {
77-
'throughput_per_gpu': (ref_per_gpu, -0.1, None, 'images/s'),
78-
'throughput_per_job': (ref_per_job, -0.1, None, 'images/s'),
79-
},
80-
'daint:gpu': {
81-
'throughput_per_gpu': (ref_per_gpu, -0.1, None, 'images/s'),
82-
'throughput_per_job': (ref_per_job, -0.1, None, 'images/s'),
52+
with contextlib.suppress(KeyError):
53+
ref_vars = self.allref['sm_60'][self.model]
54+
ref_per_gpu = ref_vars['throughput_per_gpu'][0]
55+
ref_total = ref_per_gpu * self.num_nodes
56+
self.reference = {
57+
'*': {
58+
**ref_vars,
59+
'throughput_total': (ref_total, -0.05, None, 'images/s'),
60+
}
8361
}
84-
}

docs/hpctestlib.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,7 @@ Machine Learning
4545
.. automodule:: hpctestlib.ml.tensorflow.horovod
4646
:members:
4747
:show-inheritance:
48+
49+
.. automodule:: hpctestlib.ml.pytorch.horovod
50+
:members:
51+
:show-inheritance:

hpctestlib/ml/pytorch/horovod.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
2+
# ReFrame Project Developers. See the top-level LICENSE file for details.
3+
#
4+
# SPDX-License-Identifier: BSD-3-Clause
5+
6+
import reframe as rfm
7+
import reframe.utility.sanity as sn
8+
9+
10+
@rfm.simple_test
class pytorch_cnn_check(rfm.RunOnlyRegressionTest, pin_prefix=True):
    '''Run a synthetic CNN benchmark with PyTorch and Horovod.

    PyTorch is a Python package that provides tensor computation like
    NumPy with strong GPU acceleration and deep neural networks built
    on a tape-based autograd system. For more information, refer to
    `<https://pytorch.org/>`__.

    Horovod is a distributed deep learning training framework for TensorFlow,
    Keras, PyTorch, and Apache MXNet. The goal of Horovod is to make
    distributed deep learning fast and easy to use. For more information refer
    to `<https://github.com/horovod/horovod>`__.

    This test runs the Horovod ``pytorch_synthetic_benchmark.py``
    example, checks its sanity and extracts the GPU performance.
    '''

    #: The version of Horovod to use.
    #:
    #: This is the Git reference of the Horovod repository from which the
    #: benchmark script is downloaded before the test runs.
    #:
    #: :type: :class:`str`
    #: :default: ``'v0.21.0'``
    benchmark_version = variable(str, value='v0.21.0')

    #: The name of the model to use for this benchmark.
    #:
    #: :type: :class:`str`
    #: :default: ``'inception_v3'``
    model = variable(str, value='inception_v3')

    #: The size of the batch used during the learning of models.
    #:
    #: :type: :class:`int`
    #: :default: ``64``
    batch_size = variable(int, value=64)

    #: The number of iterations.
    #:
    #: :type: :class:`int`
    #: :default: ``5``
    num_iters = variable(int, value=5)

    #: The number of batches per iteration.
    #:
    #: :type: :class:`int`
    #: :default: ``5``
    num_batches_per_iter = variable(int, value=5)

    #: The number of warmup batches
    #:
    #: :type: :class:`int`
    #: :default: ``5``
    num_warmup_batches = variable(int, value=5)

    # The second word of ``executable`` is the script name; ``prepare_run``
    # relies on this to know which file to download and patch.
    executable = 'python pytorch_synthetic_benchmark.py'
    tags = {'ml', 'cnn', 'horovod'}

    @run_after('init')
    def update_descr(self):
        '''Set the test description, including the selected model.'''
        self.descr = (f'Distributed CNN training with PyTorch and Horovod '
                      f'(model: {self.model})')

    @run_before('run')
    def prepare_run(self):
        '''Set the commands that fetch the script and the benchmark options.

        The benchmark script is downloaded from the Horovod repository at
        the Git reference given by :attr:`benchmark_version`. When the model
        is ``inception_v3``, the downloaded script is additionally patched
        in place with ``sed``.
        '''
        # Get the python script name from the executable
        script = self.executable.split()[1]
        self.prerun_cmds = [
            f'curl -LJO https://raw.githubusercontent.com/horovod/horovod/{self.benchmark_version}/examples/pytorch/{script}',  # noqa: E501
        ]
        if self.model == 'inception_v3':
            # Patch the script so that the model call unpacks two return
            # values and the ``224, 224`` sizes become ``299, 299``.
            # NOTE(review): presumably needed because Inception v3 returns
            # auxiliary outputs and expects 299x299 inputs -- confirm.
            self.prerun_cmds += [
                f'sed -i "s/output = model/output, aux = model/g" {script}',
                f'sed -i "s/224, 224/299, 299/g" {script}'
            ]

        self.executable_opts = [
            f'--model {self.model}',
            f'--batch-size {self.batch_size}',
            f'--num-iters {self.num_iters}',
            f'--num-batches-per-iter {self.num_batches_per_iter}',
            f'--num-warmup-batches {self.num_warmup_batches}'
        ]

    # NOTE(review): the performance variable is presumably named after this
    # function, so references set by derived tests would need to use the
    # ``throughput_iteration`` key to be checked -- confirm in derived checks.
    @performance_function('images/s')
    def throughput_iteration(self):
        '''The average GPU throughput per iteration in ``images/s``.'''
        return sn.avg(
            sn.extractall(r'Img/sec per GPU: (\S+) \S+', self.stdout, 1, float)
        )

    @performance_function('images/s')
    def throughput_total(self):
        '''The total GPU throughput of the benchmark in ``images/s``.'''
        # The pattern embeds ``num_tasks``, so the reported GPU count must
        # match the number of tasks for the value to be extracted.
        return sn.extractsingle(
            rf'Total img/sec on {self.num_tasks} GPU\(s\): (\S+) \S+',
            self.stdout, 1, float
        )

    @sanity_function
    def validate_run(self):
        '''Check that the output reports the requested model and batch size.'''
        return sn.all([
            sn.assert_found(rf'Model: {self.model}', self.stdout),
            sn.assert_found(rf'Batch size: {self.batch_size}', self.stdout)
        ])

0 commit comments

Comments
 (0)