Skip to content

Commit 6237d2f

Browse files
author
Vasileios Karakasis
committed
Base CSCS TF2 test on the library
1 parent 554c0c6 commit 6237d2f

File tree

2 files changed

+34
-46
lines changed

2 files changed

+34
-46
lines changed

cscs-checks/apps/tensorflow/tf2_horovod_check.py

Lines changed: 34 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -3,69 +3,58 @@
33
#
44
# SPDX-License-Identifier: BSD-3-Clause
55

6+
import contextlib
67
import reframe as rfm
78
import reframe.utility.osext as osext
89

9-
from hpctestlib.apps.tensorflow.base_check import TensorFlow2Horovod_BaseTest
10-
11-
12-
REFERENCE_SMALL_PERFORMANCE = {
13-
'dom:gpu': {
14-
'throughput': (1712, -0.05, None, 'images/s'),
15-
'throughput_per_gpu': (214, -0.05, None, 'images/s'),
16-
},
17-
'daint:gpu': {
18-
'throughput': (1712, -0.05, None, 'images/s'),
19-
'throughput_per_gpu': (214, -0.05, None, 'images/s')
20-
},
21-
}
22-
23-
REFERENCE_LARGE_PERFORMANCE = {
24-
'daint:gpu': {
25-
'throughput': (6848, -0.05, None, 'images/s'),
26-
'throughput_per_gpu': (214, -0.05, None, 'images/s')
27-
},
28-
}
10+
from hpctestlib.ml.tensorflow.horovod import tensorflow_cnn_check
2911

3012

3113
@rfm.simple_test
32-
class TensorFlow2HorovodTestCSCS(TensorFlow2Horovod_BaseTest):
33-
variant = parameter(['small', 'large'])
34-
sourcesdir = None
14+
class cscs_tensorflow_horovod_check(tensorflow_cnn_check):
15+
num_nodes = parameter([8, 32])
3516
num_tasks_per_node = 1
36-
num_cpus_per_task = 12
3717
batch_size = 64
38-
tags = {'production'}
39-
maintainers = ['RS', 'TR']
40-
valid_prog_environs = ['builtin']
4118
valid_systems = ['daint:gpu']
19+
valid_prog_environs = ['builtin']
4220
modules = [
4321
f'Horovod/0.21.0-CrayGNU-{osext.cray_cdt_version()}-tf-2.4.0'
4422
]
23+
tags |= {'production'}
24+
maintainers = ['sarafael', 'henrique']
25+
allref = {
26+
8: {
27+
'sm_60': {
28+
'throughput_total': (1712, -0.05, None, 'images/s'),
29+
'throughput_iteration': (214, -0.05, None, 'images/s')
30+
}
31+
},
32+
32: {
33+
'sm_60': {
34+
'throughput_total': (6848, -0.05, None, 'images/s'),
35+
'throughput_iteration': (214, -0.05, None, 'images/s')
36+
}
37+
},
38+
}
4539

4640
@run_after('init')
47-
def set_num_task(self):
48-
if self.variant == 'small':
41+
def setup_filtering_criteria(self):
42+
if self.num_nodes == 8:
4943
self.valid_systems += ['dom:gpu']
50-
self.num_tasks = 8
51-
self.reference = REFERENCE_SMALL_PERFORMANCE
52-
else:
53-
self.num_tasks = 32
54-
self.reference = REFERENCE_LARGE_PERFORMANCE
5544

56-
@run_after('init')
57-
def set_executable_opts(self):
45+
@run_before('run')
46+
def setup_run(self):
47+
proc = self.current_partition.processor
48+
self.num_tasks = self.num_nodes * self.num_tasks_per_node
49+
self.num_cpus_per_task = proc.num_cores
50+
with contextlib.suppress(KeyError):
51+
self.reference = {
52+
'*': self.allref[self.num_nodes]['sm_60']
53+
}
54+
5855
self.variables = {
5956
'NCCL_DEBUG': 'INFO',
6057
'NCCL_IB_HCA': 'ipogif0',
6158
'NCCL_IB_CUDA_SUPPORT': '1',
62-
'OMP_NUM_THREADS': '$SLURM_CPUS_PER_TASK',
59+
'OMP_NUM_THREADS': str(self.num_cpus_per_task)
6360
}
64-
self.executable_opts = [
65-
f'{self.script}',
66-
f'--model {self.model}',
67-
f'--batch-size {self.batch_size}',
68-
'--num-iters 5',
69-
'--num-batches-per-iter 5',
70-
'--num-warmup-batches 5',
71-
]

hpctestlib/ml/tensorflow/horovod.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ class tensorflow_cnn_check(rfm.RunOnlyRegressionTest, pin_prefix=True):
4343

4444
executable = 'python tensorflow2_synthetic_benchmark.py'
4545
tags = {'ml', 'cnn', 'horovod'}
46-
maintainers = ['sarafael', 'henrique']
4746

4847
@run_after('init')
4948
def prepare_test(self):

0 commit comments

Comments
 (0)