Skip to content

Commit 28fbb8f

Browse files
authored
Merge pull request #686 from ajocksch/benchmarks/osu_allreduce
[test] Add OSU Allreduce test
2 parents 7c63333 + d669518 commit 28fbb8f

File tree

3 files changed

+231
-27
lines changed

3 files changed

+231
-27
lines changed

cscs-checks/microbenchmarks/osu/osu_tests.py

Lines changed: 71 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import reframe.utility.sanity as sn
33

44

5-
@rfm.required_version('>=2.14')
5+
@rfm.required_version('>=2.16')
66
@rfm.parameterized_test(['production'])
77
class AlltoallTest(rfm.RegressionTest):
88
def __init__(self, variant):
@@ -21,16 +21,16 @@ def __init__(self, variant):
2121
self.maintainers = ['RS', 'VK']
2222
self.sanity_patterns = sn.assert_found(r'^8', self.stdout)
2323
self.perf_patterns = {
24-
'perf': sn.extractsingle(r'^8\s+(?P<perf>\S+)',
25-
self.stdout, 'perf', float)
24+
'latency': sn.extractsingle(r'^8\s+(?P<latency>\S+)',
25+
self.stdout, 'latency', float)
2626
}
2727
self.tags = {variant}
2828
self.reference = {
2929
'dom:gpu': {
30-
'perf': (8.23, None, 0.1)
30+
'latency': (8.23, None, 0.1, 'us')
3131
},
3232
'daint:gpu': {
33-
'perf': (20.73, None, 2.0)
33+
'latency': (20.73, None, 2.0, 'us')
3434
},
3535
}
3636
self.num_tasks_per_node = 1
@@ -72,6 +72,50 @@ def __init__(self):
7272
self.tags = {'diagnostic', 'ops'}
7373

7474

75+
@rfm.required_version('>=2.16')
@rfm.simple_test
class AllreduceTest(rfm.RegressionTest):
    """OSU Allreduce microbenchmark.

    Builds ``osu_allreduce`` from the bundled sources via
    ``Makefile_allreduce`` and measures the latency of an 8-byte
    ``MPI_Allreduce`` across the allocated ranks.
    """

    def __init__(self):
        super().__init__()
        self.descr = 'Allreduce OSU microbenchmark'
        self.strict_check = False
        self.valid_systems = ['daint:gpu', 'dom:gpu']
        self.valid_prog_environs = ['PrgEnv-gnu']

        self.build_system = 'Make'
        self.build_system.makefile = 'Makefile_allreduce'
        self.executable = './osu_allreduce'
        # -m: max message size, -x: warm-up iterations, -i: timed iterations
        self.executable_opts = ['-m', '8', '-x', '1000', '-i', '20000']

        # The benchmark prints one line per message size; match the
        # 8-byte row and extract its latency column.
        self.sanity_patterns = sn.assert_found(r'^8', self.stdout)
        self.perf_patterns = {
            'latency': sn.extractsingle(r'^8\s+(?P<latency>\S+)',
                                        self.stdout, 'latency', float)
        }
        self.reference = {
            'dom:gpu': {'latency': (6.0, None, 0.1, 'us')},
            'daint:gpu': {'latency': (20.5, None, 2.0, 'us')},
        }

        self.num_tasks_per_node = 1
        self.num_gpus_per_node = 1
        # One task per node; scale the node count with the system size.
        tasks_by_system = {'dom': 6, 'daint': 16}
        sysname = self.current_system.name
        if sysname in tasks_by_system:
            self.num_tasks = tasks_by_system[sysname]

        self.extra_resources = {
            'switches': {
                'num_switches': 1
            }
        }

        self.maintainers = ['RS', 'VK']
        self.tags = {'production'}
117+
118+
75119
# FIXME: This test is obsolete; it is kept only for reference.
76120
@rfm.parameterized_test(*({'num_tasks': i} for i in range(2, 10, 2)))
77121
class AlltoallMonchAcceptanceTest(AlltoallTest):
@@ -125,7 +169,7 @@ def __init__(self):
125169
}
126170

127171

128-
@rfm.required_version('>=2.14')
172+
@rfm.required_version('>=2.16')
129173
@rfm.simple_test
130174
class P2PCPUBandwidthTest(P2PBaseTest):
131175
def __init__(self):
@@ -137,22 +181,22 @@ def __init__(self):
137181

138182
self.reference = {
139183
'daint:gpu': {
140-
'bw': (9798.29, -0.1, None)
184+
'bw': (9798.29, -0.1, None, 'MB/s')
141185
},
142186
'daint:mc': {
143-
'bw': (9865.00, -0.2, None)
187+
'bw': (9865.00, -0.2, None, 'MB/s')
144188
},
145189
'dom:gpu': {
146-
'bw': (9815.66, -0.1, None)
190+
'bw': (9815.66, -0.1, None, 'MB/s')
147191
},
148192
'dom:mc': {
149-
'bw': (9472.59, -0.20, None)
193+
'bw': (9472.59, -0.20, None, 'MB/s')
150194
},
151195
'monch:compute': {
152-
'bw': (6317.84, -0.15, None)
196+
'bw': (6317.84, -0.15, None, 'MB/s')
153197
},
154198
'kesch:cn': {
155-
'bw': (6311.48, -0.15, None)
199+
'bw': (6311.48, -0.15, None, 'MB/s')
156200
}
157201
}
158202
self.perf_patterns = {
@@ -162,7 +206,7 @@ def __init__(self):
162206
self.tags |= {'monch_acceptance'}
163207

164208

165-
@rfm.required_version('>=2.14')
209+
@rfm.required_version('>=2.16')
166210
@rfm.simple_test
167211
class P2PCPULatencyTest(P2PBaseTest):
168212
def __init__(self):
@@ -174,22 +218,22 @@ def __init__(self):
174218
self.executable = './p2p_osu_latency'
175219
self.reference = {
176220
'daint:gpu': {
177-
'latency': (1.16, None, 1.0)
221+
'latency': (1.16, None, 1.0, 'us')
178222
},
179223
'daint:mc': {
180-
'latency': (1.15, None, 0.6)
224+
'latency': (1.15, None, 0.6, 'us')
181225
},
182226
'dom:gpu': {
183-
'latency': (1.13, None, 0.1)
227+
'latency': (1.13, None, 0.1, 'us')
184228
},
185229
'dom:mc': {
186-
'latency': (1.27, None, 0.2)
230+
'latency': (1.27, None, 0.2, 'us')
187231
},
188232
'monch:compute': {
189-
'latency': (1.27, None, 0.1)
233+
'latency': (1.27, None, 0.1, 'us')
190234
},
191235
'kesch:cn': {
192-
'latency': (1.17, None, 0.1)
236+
'latency': (1.17, None, 0.1, 'us')
193237
}
194238
}
195239
self.perf_patterns = {
@@ -199,7 +243,7 @@ def __init__(self):
199243
self.tags |= {'monch_acceptance'}
200244

201245

202-
@rfm.required_version('>=2.14')
246+
@rfm.required_version('>=2.16')
203247
@rfm.simple_test
204248
class G2GBandwidthTest(P2PBaseTest):
205249
def __init__(self):
@@ -212,13 +256,13 @@ def __init__(self):
212256

213257
self.reference = {
214258
'dom:gpu': {
215-
'bw': (8897.86, -0.1, None)
259+
'bw': (8897.86, -0.1, None, 'MB/s')
216260
},
217261
'daint:gpu': {
218-
'bw': (8765.65, -0.1, None)
262+
'bw': (8765.65, -0.1, None, 'MB/s')
219263
},
220264
'kesch:cn': {
221-
'bw': (6288.98, -0.1, None)
265+
'bw': (6288.98, -0.1, None, 'MB/s')
222266
},
223267
}
224268
self.perf_patterns = {
@@ -236,7 +280,7 @@ def __init__(self):
236280
self.build_system.cppflags = ['-D_ENABLE_CUDA_']
237281

238282

239-
@rfm.required_version('>=2.14')
283+
@rfm.required_version('>=2.16')
240284
@rfm.simple_test
241285
class G2GLatencyTest(P2PBaseTest):
242286
def __init__(self):
@@ -249,13 +293,13 @@ def __init__(self):
249293

250294
self.reference = {
251295
'dom:gpu': {
252-
'latency': (5.49, None, 0.1)
296+
'latency': (5.49, None, 0.1, 'us')
253297
},
254298
'daint:gpu': {
255-
'latency': (5.73, None, 1.0)
299+
'latency': (5.73, None, 1.0, 'us')
256300
},
257301
'kesch:cn': {
258-
'latency': (23.09, None, 0.1)
302+
'latency': (23.09, None, 0.1, 'us')
259303
},
260304
}
261305
self.perf_patterns = {
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Build the OSU allreduce microbenchmark from the bundled sources.
EXECUTABLE := osu_allreduce

all: $(EXECUTABLE)

SRCS += osu_util.c \
	osu_allreduce.c

OBJS := $(SRCS:.c=.o)

# Pattern rule with an explicit source prerequisite: the original
# `$(OBJS):` rule had no prerequisites, so objects were never rebuilt
# after a .c file changed.
%.o: %.c
	$(CC) $(CPPFLAGS) $(CFLAGS) -I. -o $@ -c $<

$(EXECUTABLE): $(OBJS)
	$(CC) $(CPPFLAGS) $(CFLAGS) -o $@ $(OBJS) $(LDFLAGS)

# Declare phony targets so files named 'all' or 'clean' can't shadow them.
.PHONY: all clean

clean:
	rm -f $(OBJS) $(EXECUTABLE)
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
#define BENCHMARK "OSU MPI%s Allreduce Latency Test"
2+
/*
3+
* Copyright (C) 2002-2018 the Network-Based Computing Laboratory
4+
* (NBCL), The Ohio State University.
5+
*
6+
* Contact: Dr. D. K. Panda ([email protected])
7+
*
8+
* For detailed copyright and licensing information, please refer to the
9+
* copyright file COPYRIGHT in the top level OMB directory.
10+
*/
11+
#include <osu_util.h>
12+
13+
/*
 * Benchmark driver: times MPI_Allreduce (MPI_SUM over MPI_FLOAT) for
 * message sizes from options.min_message_size up to
 * options.max_message_size, doubling each step, and prints per-size
 * min/max/avg latency in microseconds.  Option parsing, buffer helpers
 * and the `options` global come from osu_util.h (OMB support code).
 */
int main(int argc, char *argv[])
{
    int i, numprocs, rank, size;
    double latency = 0.0, t_start = 0.0, t_stop = 0.0;
    double timer=0.0;
    double avg_time = 0.0, max_time = 0.0, min_time = 0.0;
    float *sendbuf, *recvbuf;
    int po_ret;
    size_t bufsize;
    /* Configure the shared OMB options struct before parsing argv. */
    options.bench = COLLECTIVE;
    options.subtype = LAT;

    set_header(HEADER);
    set_benchmark_name("osu_allreduce");
    po_ret = process_options(argc, argv);

    /* Initialize the accelerator (if one was requested via options)
     * before MPI_Init, matching the upstream OMB flow. */
    if (PO_OKAY == po_ret && NONE != options.accel) {
        if (init_accel()) {
            fprintf(stderr, "Error initializing device\n");
            exit(EXIT_FAILURE);
        }
    }

    MPI_CHECK(MPI_Init(&argc, &argv));
    MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
    MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &numprocs));

    /* Handle usage/help/version outcomes of option parsing; only rank 0
     * prints (handled inside the print_* helpers). */
    switch (po_ret) {
        case PO_BAD_USAGE:
            print_bad_usage_message(rank);
            MPI_CHECK(MPI_Finalize());
            exit(EXIT_FAILURE);
        case PO_HELP_MESSAGE:
            print_help_message(rank);
            MPI_CHECK(MPI_Finalize());
            exit(EXIT_SUCCESS);
        case PO_VERSION_MESSAGE:
            print_version_message(rank);
            MPI_CHECK(MPI_Finalize());
            exit(EXIT_SUCCESS);
        case PO_OKAY:
            break;
    }

    /* A collective latency test is meaningless on a single rank. */
    if(numprocs < 2) {
        if (rank == 0) {
            fprintf(stderr, "This test requires at least two processes\n");
        }

        MPI_CHECK(MPI_Finalize());
        exit(EXIT_FAILURE);
    }

    /* Clamp the largest message to the configured memory limit. */
    if (options.max_message_size > options.max_mem_limit) {
        if (rank == 0) {
            fprintf(stderr, "Warning! Increase the Max Memory Limit to be able to run up to %ld bytes.\n"
                    "Continuing with max message size of %ld bytes\n",
                    options.max_message_size, options.max_mem_limit);
        }
        options.max_message_size = options.max_mem_limit;
    }

    /* min_message_size is reinterpreted as an element count (floats),
     * since `size` below counts MPI_FLOAT elements, not bytes. */
    options.min_message_size /= sizeof(float);
    if (options.min_message_size < MIN_MESSAGE_SIZE) {
        options.min_message_size = MIN_MESSAGE_SIZE;
    }

    /* Round the buffer size down to a whole number of floats. */
    bufsize = sizeof(float)*(options.max_message_size/sizeof(float));
    if (allocate_memory_coll((void**)&sendbuf, bufsize, options.accel)) {
        fprintf(stderr, "Could Not Allocate Memory [rank %d]\n", rank);
        MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE));
    }
    set_buffer(sendbuf, options.accel, 1, bufsize);

    bufsize = sizeof(float)*(options.max_message_size/sizeof(float));
    if (allocate_memory_coll((void**)&recvbuf, bufsize, options.accel)) {
        fprintf(stderr, "Could Not Allocate Memory [rank %d]\n", rank);
        MPI_CHECK(MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE));
    }
    set_buffer(recvbuf, options.accel, 0, bufsize);

    print_preamble(rank);

    /* Sweep message sizes: `size` is a float count, doubled each pass. */
    for(size=options.min_message_size; size*sizeof(float) <= options.max_message_size; size *= 2) {

        /* Large messages use fewer (but separately configured)
         * warm-up/timed iterations. */
        if(size > LARGE_MESSAGE_SIZE) {
            options.skip = options.skip_large;
            options.iterations = options.iterations_large;
        }

        MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

        timer=0.0;
        for(i=0; i < options.iterations + options.skip ; i++) {
            t_start = MPI_Wtime();
            MPI_CHECK(MPI_Allreduce(sendbuf, recvbuf, size, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD ));
            t_stop=MPI_Wtime();
            /* Discard the first `skip` iterations as warm-up. */
            if(i>=options.skip){

                timer+=t_stop-t_start;
            }
            /* Re-synchronize so each timed iteration starts together. */
            MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
        }
        /* Per-rank average latency in microseconds (MPI_Wtime is in
         * seconds; 1e6 converts to us). */
        latency = (double)(timer * 1e6) / options.iterations;

        /* Reduce the per-rank latencies to min/max/avg on rank 0. */
        MPI_CHECK(MPI_Reduce(&latency, &min_time, 1, MPI_DOUBLE, MPI_MIN, 0,
                MPI_COMM_WORLD));
        MPI_CHECK(MPI_Reduce(&latency, &max_time, 1, MPI_DOUBLE, MPI_MAX, 0,
                MPI_COMM_WORLD));
        MPI_CHECK(MPI_Reduce(&latency, &avg_time, 1, MPI_DOUBLE, MPI_SUM, 0,
                MPI_COMM_WORLD));
        avg_time = avg_time/numprocs;

        /* Report the message size in bytes, as the other OMB tests do. */
        print_stats(rank, size * sizeof(float), avg_time, min_time, max_time);
        MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
    }

    free_buffer(sendbuf, options.accel);
    free_buffer(recvbuf, options.accel);

    MPI_CHECK(MPI_Finalize());

    if (NONE != options.accel) {
        if (cleanup_accel()) {
            fprintf(stderr, "Error cleaning up device\n");
            exit(EXIT_FAILURE);
        }
    }

    return EXIT_SUCCESS;
}

0 commit comments

Comments
 (0)