|
3 | 3 | # |
4 | 4 | # SPDX-License-Identifier: BSD-3-Clause |
5 | 5 |
|
| 6 | +import contextlib |
6 | 7 | import reframe as rfm |
7 | 8 | import reframe.utility.osext as osext |
8 | 9 |
|
9 | | -from hpctestlib.apps.tensorflow.base_check import TensorFlow2Horovod_BaseTest |
10 | | - |
11 | | - |
12 | | -REFERENCE_SMALL_PERFORMANCE = { |
13 | | - 'dom:gpu': { |
14 | | - 'throughput': (1712, -0.05, None, 'images/s'), |
15 | | - 'throughput_per_gpu': (214, -0.05, None, 'images/s'), |
16 | | - }, |
17 | | - 'daint:gpu': { |
18 | | - 'throughput': (1712, -0.05, None, 'images/s'), |
19 | | - 'throughput_per_gpu': (214, -0.05, None, 'images/s') |
20 | | - }, |
21 | | -} |
22 | | - |
23 | | -REFERENCE_LARGE_PERFORMANCE = { |
24 | | - 'daint:gpu': { |
25 | | - 'throughput': (6848, -0.05, None, 'images/s'), |
26 | | - 'throughput_per_gpu': (214, -0.05, None, 'images/s') |
27 | | - }, |
28 | | -} |
| 10 | +from hpctestlib.ml.tensorflow.horovod import tensorflow_cnn_check |
29 | 11 |
|
30 | 12 |
|
31 | 13 | @rfm.simple_test |
32 | | -class TensorFlow2HorovodTestCSCS(TensorFlow2Horovod_BaseTest): |
33 | | - variant = parameter(['small', 'large']) |
34 | | - sourcesdir = None |
| 14 | +class cscs_tensorflow_horovod_check(tensorflow_cnn_check): |
| 15 | + num_nodes = parameter([8, 32]) |
35 | 16 | num_tasks_per_node = 1 |
36 | | - num_cpus_per_task = 12 |
37 | 17 | batch_size = 64 |
38 | | - tags = {'production'} |
39 | | - maintainers = ['RS', 'TR'] |
40 | | - valid_prog_environs = ['builtin'] |
41 | 18 | valid_systems = ['daint:gpu'] |
| 19 | + valid_prog_environs = ['builtin'] |
42 | 20 | modules = [ |
43 | 21 | f'Horovod/0.21.0-CrayGNU-{osext.cray_cdt_version()}-tf-2.4.0' |
44 | 22 | ] |
| 23 | + tags |= {'production'} |
| 24 | + maintainers = ['sarafael', 'henrique'] |
| 25 | + allref = { |
| 26 | + 8: { |
| 27 | + 'sm_60': { |
| 28 | + 'throughput_total': (1712, -0.05, None, 'images/s'), |
| 29 | + 'throughput_iteration': (214, -0.05, None, 'images/s') |
| 30 | + } |
| 31 | + }, |
| 32 | + 32: { |
| 33 | + 'sm_60': { |
| 34 | + 'throughput_total': (6848, -0.05, None, 'images/s'), |
| 35 | + 'throughput_iteration': (214, -0.05, None, 'images/s') |
| 36 | + } |
| 37 | + }, |
| 38 | + } |
45 | 39 |
|
46 | 40 | @run_after('init') |
47 | | - def set_num_task(self): |
48 | | - if self.variant == 'small': |
| 41 | + def setup_filtering_criteria(self): |
| 42 | + if self.num_nodes == 8: |
49 | 43 | self.valid_systems += ['dom:gpu'] |
50 | | - self.num_tasks = 8 |
51 | | - self.reference = REFERENCE_SMALL_PERFORMANCE |
52 | | - else: |
53 | | - self.num_tasks = 32 |
54 | | - self.reference = REFERENCE_LARGE_PERFORMANCE |
55 | 44 |
|
56 | | - @run_after('init') |
57 | | - def set_executable_opts(self): |
| 45 | + @run_before('run') |
| 46 | + def setup_run(self): |
| 47 | + proc = self.current_partition.processor |
| 48 | + self.num_tasks = self.num_nodes * self.num_tasks_per_node |
| 49 | + self.num_cpus_per_task = proc.num_cores |
| 50 | + with contextlib.suppress(KeyError): |
| 51 | + self.reference = { |
| 52 | + '*': self.allref[self.num_nodes]['sm_60'] |
| 53 | + } |
| 54 | + |
58 | 55 | self.variables = { |
59 | 56 | 'NCCL_DEBUG': 'INFO', |
60 | 57 | 'NCCL_IB_HCA': 'ipogif0', |
61 | 58 | 'NCCL_IB_CUDA_SUPPORT': '1', |
62 | | - 'OMP_NUM_THREADS': '$SLURM_CPUS_PER_TASK', |
| 59 | + 'OMP_NUM_THREADS': str(self.num_cpus_per_task) |
63 | 60 | } |
64 | | - self.executable_opts = [ |
65 | | - f'{self.script}', |
66 | | - f'--model {self.model}', |
67 | | - f'--batch-size {self.batch_size}', |
68 | | - '--num-iters 5', |
69 | | - '--num-batches-per-iter 5', |
70 | | - '--num-warmup-batches 5', |
71 | | - ] |
0 commit comments