#
# SPDX-License-Identifier: BSD-3-Clause

import contextlib
import reframe as rfm

from hpctestlib.ml.pytorch.horovod import pytorch_cnn_check
@rfm.simple_test
class cscs_pytorch_horovod_check(pytorch_cnn_check):
    """CSCS site configuration of the Horovod PyTorch CNN benchmark.

    Specializes the generic ``pytorch_cnn_check`` from ``hpctestlib`` for
    the ``daint:gpu`` (and, for the 8-node variant, ``dom:gpu``)
    partitions, scaling over node counts and CNN models.
    """

    # Run over 1, 8 or 32 nodes; one MPI rank per node (see below).
    num_nodes = parameter([1, 8, 32])
    # CNN architectures exercised by the benchmark.
    model_name = parameter(['inception_v3', 'resnet50'])
    num_tasks_per_node = 1
    batch_size = 64
    valid_systems = ['daint:gpu']
    valid_prog_environs = ['builtin']
    modules = ['PyTorch']
    tags |= {'production'}
    maintainers = ['sarafael', 'henrique']
    # Per-GPU throughput references keyed by GPU architecture, then model.
    # Tuple format: (value, lower threshold, upper threshold, unit).
    allref = {
        'sm_60': {
            'inception_v3': {
                'throughput_per_gpu': (131, -0.05, None, 'images/s'),
            },
            'resnet50': {
                'throughput_per_gpu': (201, -0.05, None, 'images/s'),
            }
        }
    }

    @run_after('init')
    def setup_filtering_criteria(self):
        # Map the site-level parameter onto the attribute the base check
        # reads for selecting the model.
        self.model = self.model_name
        # Only the 8-node variant also runs on Dom.
        if self.num_nodes == 8:
            self.valid_systems += ['dom:gpu']

    @run_before('run')
    def setup_run(self):
        # The core count of the current partition is needed below, so skip
        # this test if processor information is unavailable.
        self.skip_if_no_procinfo()
        proc = self.current_partition.processor
        self.num_tasks = self.num_nodes * self.num_tasks_per_node
        self.num_cpus_per_task = proc.num_cores
        self.variables = {
            'NCCL_DEBUG': 'INFO',
            'NCCL_IB_HCA': 'ipogif0',
            'NCCL_IB_CUDA_SUPPORT': '1',
            'OMP_NUM_THREADS': str(self.num_cpus_per_task)
        }

        # Set performance references only when the model is known for this
        # GPU architecture; an unknown combination is silently skipped
        # (no reference -> no performance check), not treated as an error.
        with contextlib.suppress(KeyError):
            ref_vars = self.allref['sm_60'][self.model]
            ref_per_gpu = ref_vars['throughput_per_gpu'][0]
            # Total throughput is expected to scale linearly with the node
            # count (one rank/GPU per node).
            ref_total = ref_per_gpu * self.num_nodes
            self.reference = {
                '*': {
                    **ref_vars,
                    'throughput_total': (ref_total, -0.05, None, 'images/s'),
                }
            }