Skip to content

Commit 40e2fbb

Browse files
authored
Merge pull request #7114 from brminich/topic/mlx_scat_tuning
COLL/TUNED: Add linear scatter using isend for mlnx platform
2 parents 6fc5a4e + f2cbd48 commit 40e2fbb

File tree

7 files changed

+181
-2
lines changed

7 files changed

+181
-2
lines changed

contrib/platform/mellanox/optimized.conf

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
# Copyright (c) 2004-2005 The Regents of the University of California.
1111
# All rights reserved.
1212
# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
13+
# Copyright (c) 2019 Mellanox Technologies. All rights reserved.
1314
# $COPYRIGHT$
1415
#
1516
# Additional copyrights may follow
@@ -84,4 +85,8 @@ bml_r2_show_unreach_errors = 0
8485
coll_tuned_alltoall_large_msg = 250000
8586
coll_tuned_alltoall_min_procs = 2048
8687
coll_tuned_alltoall_algorithm_max_requests = 8
88+
coll_tuned_scatter_intermediate_msg = 8192
89+
coll_tuned_scatter_large_msg = 250000
90+
coll_tuned_scatter_min_procs = 1048510
91+
coll_tuned_scatter_algorithm_max_requests = 64
8792

ompi/mca/coll/base/coll_base_functions.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
* and Technology (RIST). All rights reserved.
1919
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
2020
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
21+
* Copyright (c) 2019 Mellanox Technologies. All rights reserved.
2122
* $COPYRIGHT$
2223
*
2324
* Additional copyrights may follow
@@ -291,6 +292,7 @@ int ompi_coll_base_scan_intra_recursivedoubling(SCAN_ARGS);
291292
/* Scatter */
292293
int ompi_coll_base_scatter_intra_basic_linear(SCATTER_ARGS);
293294
int ompi_coll_base_scatter_intra_binomial(SCATTER_ARGS);
295+
int ompi_coll_base_scatter_intra_linear_nb(SCATTER_ARGS, int max_reqs);
294296

295297
/* ScatterV */
296298

ompi/mca/coll/base/coll_base_scatter.c

Lines changed: 111 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
* reserved.
1515
* Copyright (c) 2015-2016 Research Organization for Information Science
1616
* and Technology (RIST). All rights reserved.
17+
* Copyright (c) 2019 Mellanox Technologies. All rights reserved.
1718
* $COPYRIGHT$
1819
*
1920
* Additional copyrights may follow
@@ -273,5 +274,114 @@ ompi_coll_base_scatter_intra_basic_linear(const void *sbuf, int scount,
273274
return MPI_SUCCESS;
274275
}
275276

276-
277277
/* copied function (with appropriate renaming) ends here */
278+
279+
/*
280+
* Use isends for distributing the data with periodic sync by blocking send.
281+
* Blocking send acts like a local resources flush, because it ensures
282+
* progression until the message is sent/(copied to some sort of transmit buffer).
283+
*/
284+
int
285+
ompi_coll_base_scatter_intra_linear_nb(const void *sbuf, int scount,
286+
struct ompi_datatype_t *sdtype,
287+
void *rbuf, int rcount,
288+
struct ompi_datatype_t *rdtype,
289+
int root,
290+
struct ompi_communicator_t *comm,
291+
mca_coll_base_module_t *module,
292+
int max_reqs)
293+
{
294+
int i, rank, size, err, line, nreqs;
295+
ptrdiff_t incr;
296+
char *ptmp;
297+
ompi_request_t **reqs = NULL, **preq;
298+
299+
rank = ompi_comm_rank(comm);
300+
size = ompi_comm_size(comm);
301+
302+
/* If not root, receive data. */
303+
if (rank != root) {
304+
err = MCA_PML_CALL(recv(rbuf, rcount, rdtype, root,
305+
MCA_COLL_BASE_TAG_SCATTER,
306+
comm, MPI_STATUS_IGNORE));
307+
if (MPI_SUCCESS != err) {
308+
line = __LINE__; goto err_hndl;
309+
}
310+
311+
return MPI_SUCCESS;
312+
}
313+
314+
if (max_reqs <= 1) {
315+
max_reqs = 0;
316+
nreqs = size - 1; /* no send for myself */
317+
} else {
318+
/* We use blocking MPI_Send (which does not need a request)
319+
* every max_reqs send operation (which is size/max_reqs at most),
320+
* therefore no need to allocate requests for these sends. */
321+
nreqs = size - (size / max_reqs);
322+
}
323+
324+
reqs = ompi_coll_base_comm_get_reqs(module->base_data, nreqs);
325+
if (NULL == reqs) {
326+
err = OMPI_ERR_OUT_OF_RESOURCE;
327+
line = __LINE__; goto err_hndl;
328+
}
329+
330+
err = ompi_datatype_type_extent(sdtype, &incr);
331+
if (OMPI_SUCCESS != err) {
332+
line = __LINE__; goto err_hndl;
333+
}
334+
incr *= scount;
335+
336+
/* I am the root, loop sending data. */
337+
for (i = 0, ptmp = (char *)sbuf, preq = reqs; i < size; ++i, ptmp += incr) {
338+
/* simple optimization */
339+
if (i == rank) {
340+
if (MPI_IN_PLACE != rbuf) {
341+
err = ompi_datatype_sndrcv(ptmp, scount, sdtype, rbuf, rcount,
342+
rdtype);
343+
}
344+
} else {
345+
if (!max_reqs || (i % max_reqs)) {
346+
err = MCA_PML_CALL(isend(ptmp, scount, sdtype, i,
347+
MCA_COLL_BASE_TAG_SCATTER,
348+
MCA_PML_BASE_SEND_STANDARD,
349+
comm, preq++));
350+
} else {
351+
err = MCA_PML_CALL(send(ptmp, scount, sdtype, i,
352+
MCA_COLL_BASE_TAG_SCATTER,
353+
MCA_PML_BASE_SEND_STANDARD,
354+
comm));
355+
}
356+
}
357+
if (MPI_SUCCESS != err) {
358+
line = __LINE__; goto err_hndl;
359+
}
360+
}
361+
362+
err = ompi_request_wait_all(preq - reqs, reqs, MPI_STATUSES_IGNORE);
363+
if (MPI_SUCCESS != err) {
364+
line = __LINE__; goto err_hndl;
365+
}
366+
367+
return MPI_SUCCESS;
368+
369+
err_hndl:
370+
if (NULL != reqs) {
371+
/* find a real error code */
372+
if (MPI_ERR_IN_STATUS == err) {
373+
for (i = 0; i < nreqs; i++) {
374+
if (MPI_REQUEST_NULL == reqs[i]) continue;
375+
if (MPI_ERR_PENDING == reqs[i]->req_status.MPI_ERROR) continue;
376+
err = reqs[i]->req_status.MPI_ERROR;
377+
break;
378+
}
379+
}
380+
ompi_coll_base_free_reqs(reqs, nreqs);
381+
}
382+
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
383+
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank));
384+
(void)line; /* silence compiler warning */
385+
return err;
386+
}
387+

ompi/mca/coll/tuned/coll_tuned.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
* reserved.
66
* Copyright (c) 2015-2018 Research Organization for Information Science
77
* and Technology (RIST). All rights reserved.
8+
* Copyright (c) 2019 Mellanox Technologies. All rights reserved.
89
* $COPYRIGHT$
910
*
1011
* Additional copyrights may follow
@@ -41,6 +42,10 @@ extern int ompi_coll_tuned_alltoall_intermediate_msg;
4142
extern int ompi_coll_tuned_alltoall_large_msg;
4243
extern int ompi_coll_tuned_alltoall_min_procs;
4344
extern int ompi_coll_tuned_alltoall_max_requests;
45+
extern int ompi_coll_tuned_scatter_intermediate_msg;
46+
extern int ompi_coll_tuned_scatter_large_msg;
47+
extern int ompi_coll_tuned_scatter_min_procs;
48+
extern int ompi_coll_tuned_scatter_blocking_send_ratio;
4449

4550
/* forced algorithm choices */
4651
/* this structure is for storing the indexes to the forced algorithm mca params... */

ompi/mca/coll/tuned/coll_tuned_component.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
* reserved.
1717
* Copyright (c) 2015-2018 Research Organization for Information Science
1818
* and Technology (RIST). All rights reserved.
19+
* Copyright (c) 2019 Mellanox Technologies. All rights reserved.
1920
* $COPYRIGHT$
2021
*
2122
* Additional copyrights may follow
@@ -64,6 +65,12 @@ int ompi_coll_tuned_alltoall_large_msg = 3000;
6465
int ompi_coll_tuned_alltoall_min_procs = 0; /* disable by default */
6566
int ompi_coll_tuned_alltoall_max_requests = 0; /* no limit for alltoall by default */
6667

68+
/* Disable by default */
69+
int ompi_coll_tuned_scatter_intermediate_msg = 0;
70+
int ompi_coll_tuned_scatter_large_msg = 0;
71+
int ompi_coll_tuned_scatter_min_procs = 0;
72+
int ompi_coll_tuned_scatter_blocking_send_ratio = 0;
73+
6774
/* forced algorithm variables */
6875
/* indices for the MCA parameters */
6976
coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COLLCOUNT] = {{0}};

ompi/mca/coll/tuned/coll_tuned_decision_fixed.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
* reserved.
1616
* Copyright (c) 2015-2018 Research Organization for Information Science
1717
* and Technology (RIST). All rights reserved.
18+
* Copyright (c) 2019 Mellanox Technologies. All rights reserved.
1819
* $COPYRIGHT$
1920
*
2021
* Additional copyrights may follow
@@ -780,6 +781,7 @@ int ompi_coll_tuned_scatter_intra_dec_fixed(const void *sbuf, int scount,
780781
{
781782
const size_t small_block_size = 300;
782783
const int small_comm_size = 10;
784+
const int intermediate_comm_size = 64;
783785
int communicator_size, rank;
784786
size_t dsize, block_size;
785787

@@ -802,7 +804,16 @@ int ompi_coll_tuned_scatter_intra_dec_fixed(const void *sbuf, int scount,
802804
return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype,
803805
rbuf, rcount, rdtype,
804806
root, comm, module);
807+
} else if ((communicator_size < ompi_coll_tuned_scatter_min_procs) &&
808+
(communicator_size > intermediate_comm_size) &&
809+
(block_size >= ompi_coll_tuned_scatter_intermediate_msg) &&
810+
(block_size < ompi_coll_tuned_scatter_large_msg)) {
811+
return ompi_coll_base_scatter_intra_linear_nb(sbuf, scount, sdtype,
812+
rbuf, rcount, rdtype,
813+
root, comm, module,
814+
ompi_coll_tuned_scatter_blocking_send_ratio);
805815
}
816+
806817
return ompi_coll_base_scatter_intra_basic_linear(sbuf, scount, sdtype,
807818
rbuf, rcount, rdtype,
808819
root, comm, module);

ompi/mca/coll/tuned/coll_tuned_scatter_decision.c

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
* reserved.
66
* Copyright (c) 2015 Research Organization for Information Science
77
* and Technology (RIST). All rights reserved.
8+
* Copyright (c) 2019 Mellanox Technologies. All rights reserved.
89
* $COPYRIGHT$
910
*
1011
* Additional copyrights may follow
@@ -36,6 +37,7 @@ static mca_base_var_enum_value_t scatter_algorithms[] = {
3637
{0, "ignore"},
3738
{1, "basic_linear"},
3839
{2, "binomial"},
40+
{3, "linear_nb"},
3941
{0, NULL}
4042
};
4143

@@ -74,7 +76,7 @@ ompi_coll_tuned_scatter_intra_check_forced_init(coll_tuned_force_algorithm_mca_p
7476
mca_param_indices->algorithm_param_index =
7577
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
7678
"scatter_algorithm",
77-
"Which scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial.",
79+
"Which scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial, 3 non-blocking linear.",
7880
MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE,
7981
OPAL_INFO_LVL_5,
8082
MCA_BASE_VAR_SCOPE_ALL,
@@ -114,6 +116,38 @@ ompi_coll_tuned_scatter_intra_check_forced_init(coll_tuned_force_algorithm_mca_p
114116
MCA_BASE_VAR_SCOPE_ALL,
115117
&coll_tuned_scatter_chain_fanout);
116118

119+
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
120+
"scatter_min_procs",
121+
"use basic linear algorithm for communicators larger than this value",
122+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
123+
OPAL_INFO_LVL_6,
124+
MCA_BASE_VAR_SCOPE_READONLY,
125+
&ompi_coll_tuned_scatter_min_procs);
126+
127+
(void)mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
128+
"scatter_algorithm_max_requests",
129+
"Issue a blocking send every this many non-blocking requests. Only has meaning for non-blocking linear algorithm.",
130+
MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
131+
OPAL_INFO_LVL_5,
132+
MCA_BASE_VAR_SCOPE_ALL,
133+
&ompi_coll_tuned_scatter_blocking_send_ratio);
134+
135+
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
136+
"scatter_intermediate_msg",
137+
"use non-blocking linear algorithm for messages larger than this value",
138+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
139+
OPAL_INFO_LVL_6,
140+
MCA_BASE_VAR_SCOPE_READONLY,
141+
&ompi_coll_tuned_scatter_intermediate_msg);
142+
143+
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
144+
"scatter_large_msg",
145+
"use linear algorithm for messages larger than this value",
146+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
147+
OPAL_INFO_LVL_6,
148+
MCA_BASE_VAR_SCOPE_READONLY,
149+
&ompi_coll_tuned_scatter_large_msg);
150+
117151
return (MPI_SUCCESS);
118152
}
119153

@@ -144,6 +178,11 @@ ompi_coll_tuned_scatter_intra_do_this(const void *sbuf, int scount,
144178
return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype,
145179
rbuf, rcount, rdtype,
146180
root, comm, module);
181+
case (3):
182+
return ompi_coll_base_scatter_intra_linear_nb(sbuf, scount, sdtype,
183+
rbuf, rcount, rdtype,
184+
root, comm, module,
185+
ompi_coll_tuned_scatter_blocking_send_ratio);
147186
} /* switch */
148187
OPAL_OUTPUT((ompi_coll_tuned_stream,
149188
"coll:tuned:scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",

0 commit comments

Comments
 (0)