Skip to content

Commit f533a28

Browse files
authored
Merge pull request open-mpi#4706 from matcabral/osc_rdma_new_mtl_param_20x
osc/rdma: add an mca parameter to list MTLs for which osc pt2pt should have higher priority than rdma and default to psm2
2 parents aaae77c + ebce5f1 commit f533a28

File tree

1 file changed

+75
-22
lines changed

1 file changed

+75
-22
lines changed

ompi/mca/osc/rdma/osc_rdma_component.c

Lines changed: 75 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
1616
* Copyright (c) 2012-2015 Sandia National Laboratories. All rights reserved.
1717
* Copyright (c) 2015 NVIDIA Corporation. All rights reserved.
18-
* Copyright (c) 2015 Intel, Inc. All rights reserved.
18+
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
19+
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
1920
* $COPYRIGHT$
2021
*
2122
* Additional copyrights may follow
@@ -53,6 +54,7 @@
5354
#include "opal/mca/btl/base/base.h"
5455
#include "opal/mca/base/mca_base_pvar.h"
5556
#include "ompi/mca/bml/base/base.h"
57+
#include "ompi/mca/mtl/base/base.h"
5658

5759
static int ompi_osc_rdma_component_register (void);
5860
static int ompi_osc_rdma_component_init (bool enable_progress_threads, bool enable_mpi_threads);
@@ -68,8 +70,10 @@ static int ompi_osc_rdma_set_info (struct ompi_win_t *win, struct ompi_info_t *i
6870
static int ompi_osc_rdma_get_info (struct ompi_win_t *win, struct ompi_info_t **info_used);
6971

7072
static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, struct mca_btl_base_module_t **btl);
73+
static int ompi_osc_rdma_query_mtls (void);
7174

7275
static char *ompi_osc_rdma_btl_names;
76+
static char *ompi_osc_rdma_mtl_names;
7377

7478
ompi_osc_rdma_component_t mca_osc_rdma_component = {
7579
.super = {
@@ -166,62 +170,88 @@ static int ompi_osc_rdma_pvar_read (const struct mca_base_pvar_t *pvar, void *va
166170

167171
static int ompi_osc_rdma_component_register (void)
168172
{
173+
char *description_str;
169174
mca_osc_rdma_component.no_locks = false;
175+
asprintf(&description_str, "Enable optimizations available only if MPI_LOCK is "
176+
"not used. Info key of same name overrides this value (default: %s)",
177+
mca_osc_rdma_component.no_locks ? "true" : "false");
170178
(void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version,
171-
"no_locks", "Enable optimizations available only if MPI_LOCK is "
172-
"not used. Info key of same name overrides this value (default: false)",
179+
"no_locks", description_str,
173180
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5,
174181
MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.no_locks);
182+
free(description_str);
175183

176184
mca_osc_rdma_component.acc_single_intrinsic = false;
185+
asprintf(&description_str, "Enable optimizations for MPI_Fetch_and_op, MPI_Accumulate, etc for codes "
186+
"that will not use anything more than a single predefined datatype (default: %s)",
187+
mca_osc_rdma_component.acc_single_intrinsic ? "true" : "false");
177188
(void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, "acc_single_intrinsic",
178-
"Enable optimizations for MPI_Fetch_and_op, MPI_Accumulate, etc for codes "
179-
"that will not use anything more than a single predefined datatype (default: false)",
189+
description_str,
180190
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5,
181191
MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.acc_single_intrinsic);
192+
free(description_str);
182193

183194
mca_osc_rdma_component.acc_use_amo = true;
195+
asprintf(&description_str, "Enable the use of network atomic memory operations when using single "
196+
"intrinsic optimizations. If not set network compare-and-swap will be "
197+
"used instread (default: %s)", mca_osc_rdma_component.acc_use_amo ? "true" : "false");
184198
(void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, "acc_use_amo",
185-
"Enable the use of network atomic memory operations when using single "
186-
"intrinsic optimizations. If not set network compare-and-swap will be "
187-
"used instread (default: true)", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5,
199+
description_str, MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5,
188200
MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.acc_use_amo);
201+
free(description_str);
189202

190203
mca_osc_rdma_component.buffer_size = 32768;
204+
asprintf(&description_str, "Size of temporary buffers (default: %d)", mca_osc_rdma_component.buffer_size);
191205
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "buffer_size",
192-
"Size of temporary buffers (default: 32k)", MCA_BASE_VAR_TYPE_UNSIGNED_INT,
206+
description_str, MCA_BASE_VAR_TYPE_UNSIGNED_INT,
193207
NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
194208
&mca_osc_rdma_component.buffer_size);
209+
free(description_str);
195210

196211
mca_osc_rdma_component.max_attach = 32;
212+
asprintf(&description_str, "Maximum number of buffers that can be attached to a dynamic window. "
213+
"Keep in mind that each attached buffer will use a potentially limited "
214+
"resource (default: %d)", mca_osc_rdma_component.max_attach);
197215
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "max_attach",
198-
"Maximum number of buffers that can be attached to a dynamic window. "
199-
"Keep in mind that each attached buffer will use a potentially limited "
200-
"resource (default: 32)", MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0,
216+
description_str , MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0,
201217
OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.max_attach);
218+
free(description_str);
202219

203220
mca_osc_rdma_component.aggregation_limit = 1024;
221+
asprintf(&description_str, "Maximum size of an aggregated put/get. Messages are aggregated for consecutive"
222+
"put and get operations. In some cases this may lead to higher latency but "
223+
"should also lead to higher bandwidth utilization. Set to 0 to disable (default: %d)",
224+
mca_osc_rdma_component.aggregation_limit);
204225
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "aggregation_limit",
205-
"Maximum size of an aggregated put/get. Messages are aggregated for consecutive"
206-
"put and get operations. In some cases this may lead to higher latency but "
207-
"should also lead to higher bandwidth utilization. Set to 0 to disable (default:"
208-
" 1k)", MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
226+
description_str, MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
209227
MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.aggregation_limit);
228+
free(description_str);
210229

211230
mca_osc_rdma_component.priority = 90;
231+
asprintf(&description_str, "Priority of the osc/rdma component (default: %d)",
232+
mca_osc_rdma_component.priority);
212233
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "priority",
213-
"Priority of the osc/rdma component (default: 90)",
214-
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
234+
description_str, MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
215235
MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.priority);
236+
free(description_str);
216237

217238
ompi_osc_rdma_btl_names = "openib,ugni";
239+
asprintf(&description_str, "Comma-delimited list of BTL component names to allow without verifying "
240+
"connectivity. Do not add a BTL to to this list unless it can reach all "
241+
"processes in any communicator used with an MPI window (default: %s)",
242+
ompi_osc_rdma_btl_names);
218243
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "btls",
219-
"Comma-delimited list of BTL component names to allow without verifying "
220-
"connectivity. Do not add a BTL to to this list unless it can reach all "
221-
"processes in any communicator used with an MPI window (default: openib,ugni)",
222-
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3,
244+
description_str, MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3,
223245
MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_btl_names);
246+
free(description_str);
224247

248+
ompi_osc_rdma_mtl_names = "psm2";
249+
asprintf(&description_str, "Comma-delimited list of MTL component names to lower the priority of rdma "
250+
"osc component favoring pt2pt osc (default: %s)", ompi_osc_rdma_mtl_names);
251+
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "mtls",
252+
description_str, MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3,
253+
MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_mtl_names);
254+
free(description_str);
225255

226256
/* register performance variables */
227257

@@ -339,6 +369,10 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s
339369
}
340370
#endif /* OPAL_CUDA_SUPPORT */
341371

372+
if (OMPI_SUCCESS == ompi_osc_rdma_query_mtls ()) {
373+
return 5; /* this has to be lower than osc pt2pt default priority */
374+
}
375+
342376
if (OMPI_SUCCESS != ompi_osc_rdma_query_btls (comm, NULL)) {
343377
return -1;
344378
}
@@ -703,6 +737,25 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
703737
return ret;
704738
}
705739

740+
/**
 * Check whether the currently selected MTL component is named in the
 * comma-delimited list held in ompi_osc_rdma_mtl_names (the "mtls" MCA
 * variable registered by this component).
 *
 * @return OMPI_SUCCESS if the selected MTL's component name matches one of
 *         the list entries; OMPI_ERR_NOT_FOUND if nothing matches, if the
 *         split yields no list, or if no MTL component is selected.
 *
 * The component query path uses an OMPI_SUCCESS result to return a low
 * fixed priority for osc/rdma.
 */
static int ompi_osc_rdma_query_mtls (void)
741+
{
742+
char **mtls_to_use;
743+
bool mtl_match = false;
744+
745+
/* break the comma-delimited name list into an array of strings; the loop
 * below relies on the array being terminated by a NULL entry */
mtls_to_use = opal_argv_split (ompi_osc_rdma_mtl_names, ',');
746+
/* only search when there is both a list and a selected MTL component */
if (mtls_to_use && ompi_mtl_base_selected_component) {
747+
for (int i = 0 ; mtls_to_use[i] ; ++i) {
748+
/* exact, case-sensitive comparison against the selected component's name */
if (0 == strcmp (mtls_to_use[i], ompi_mtl_base_selected_component->mtl_version.mca_component_name)) {
749+
mtl_match = true;
750+
break;
751+
}
752+
}
753+
}
754+
755+
/* released on every path, including when mtls_to_use is NULL
 * NOTE(review): presumably opal_argv_free() accepts NULL -- confirm */
opal_argv_free (mtls_to_use);
756+
return mtl_match ? OMPI_SUCCESS : OMPI_ERR_NOT_FOUND;
757+
}
758+
706759
static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, struct mca_btl_base_module_t **btl)
707760
{
708761
struct mca_btl_base_module_t **possible_btls = NULL;

0 commit comments

Comments
 (0)