|
3 | 3 | # |
4 | 4 | # SPDX-License-Identifier: BSD-3-Clause |
5 | 5 |
|
| 6 | +import contextlib |
6 | 7 | import reframe as rfm |
7 | | -import reframe.utility.sanity as sn |
8 | 8 | import reframe.utility.osext as osext |
9 | 9 |
|
| 10 | +from hpctestlib.ml.tensorflow.horovod import tensorflow_cnn_check |
| 11 | + |
10 | 12 |
|
@rfm.simple_test
class cscs_tensorflow_horovod_check(tensorflow_cnn_check):
    """Site configuration of the library ``tensorflow_cnn_check`` test.

    The test is parameterised over the number of nodes and runs one task
    per node. Performance references are selected per node count from
    ``allref`` right before the run phase.
    """

    num_nodes = parameter([8, 32])
    num_tasks_per_node = 1
    batch_size = 64
    valid_systems = ['daint:gpu']
    valid_prog_environs = ['builtin']
    modules = [
        f'Horovod/0.21.0-CrayGNU-{osext.cray_cdt_version()}-tf-2.4.0'
    ]
    tags |= {'production'}
    maintainers = ['sarafael', 'henrique']

    # Performance references, keyed first by node count and then by the
    # 'sm_60' label (presumably the GPU architecture -- confirm).
    # Every value of ``num_nodes`` needs an entry here: a missing key is
    # silently ignored in ``setup_run`` and the variant then runs without
    # any reference. The second entry used to be keyed by 16, which no
    # variant uses; 6848 images/s is 32 x 214, i.e. the 32-node figure.
    allref = {
        8: {
            'sm_60': {
                'throughput_total': (1712, -0.05, None, 'images/s'),
                'throughput_iteration': (214, -0.05, None, 'images/s')
            }
        },
        32: {
            'sm_60': {
                'throughput_total': (6848, -0.05, None, 'images/s'),
                'throughput_iteration': (214, -0.05, None, 'images/s')
            }
        },
    }

    @run_after('init')
    def setup_filtering_criteria(self):
        """Extend the valid systems for the small variant."""
        # NOTE(review): the pre-refactoring test enabled 'dom:gpu' only for
        # its 8-task ("small") variant; the 32-task one was daint-only.
        # The condition is restored to match that -- confirm with the
        # maintainers that dom is not meant to run the 32-node variant.
        if self.num_nodes == 8:
            self.valid_systems += ['dom:gpu']

    @run_before('run')
    def setup_run(self):
        """Set task geometry, references and environment for the run."""
        proc = self.current_partition.processor
        self.num_tasks = self.num_nodes * self.num_tasks_per_node
        self.num_cpus_per_task = proc.num_cores

        # Best effort: a node count without an ``allref`` entry leaves the
        # reference untouched instead of failing the test.
        with contextlib.suppress(KeyError):
            self.reference = {
                '*': self.allref[self.num_nodes]['sm_60']
            }

        self.variables = {
            'NCCL_DEBUG': 'INFO',
            'NCCL_IB_HCA': 'ipogif0',
            'NCCL_IB_CUDA_SUPPORT': '1',
            'OMP_NUM_THREADS': str(self.num_cpus_per_task)
        }
0 commit comments