Skip to content

Commit 9f60cd0

Browse files
author
Ralph Castain
committed
Update the connect/accept support so we check to see if we have the proper infrastructure and RTE support, including whether we have ompi-server available if the connect/accept spans multiple applications. Print pretty help messages in all cases where we do not have support.
Signed-off-by: Ralph Castain <[email protected]>
1 parent a6f6113 commit 9f60cd0

File tree

14 files changed

+115
-20
lines changed

14 files changed

+115
-20
lines changed

ompi/dpm/dpm.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
1616
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
1717
* reserved.
18-
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
18+
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
1919
* Copyright (c) 2014-2017 Research Organization for Information Science
2020
* and Technology (RIST). All rights reserved.
2121
* $COPYRIGHT$
@@ -40,6 +40,7 @@
4040
#include "opal/util/argv.h"
4141
#include "opal/util/opal_getcwd.h"
4242
#include "opal/util/proc.h"
43+
#include "opal/util/show_help.h"
4344
#include "opal/dss/dss.h"
4445
#include "opal/mca/hwloc/base/base.h"
4546
#include "opal/mca/pmix/pmix.h"
@@ -112,6 +113,12 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
112113
if (NULL == opal_pmix.publish || NULL == opal_pmix.connect ||
113114
NULL == opal_pmix.unpublish ||
114115
(NULL == opal_pmix.lookup && NULL == opal_pmix.lookup_nb)) {
116+
/* print a nice message explaining we don't have support */
117+
opal_show_help("help-mpi-runtime.txt", "noconxcpt", true);
118+
return OMPI_ERR_NOT_SUPPORTED;
119+
}
120+
if (!ompi_rte_connect_accept_support(port_string)) {
121+
/* the RTE layer has already printed the help message */
115122
return OMPI_ERR_NOT_SUPPORTED;
116123
}
117124

ompi/mca/rte/orte/rte_orte.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*
22
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
33
* All rights reserved.
4-
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
4+
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
55
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
66
* Copyright (c) 2014-2016 Research Organization for Information Science
77
* and Technology (RIST). All rights reserved.
@@ -116,6 +116,9 @@ static inline orte_process_name_t * OMPI_CAST_RTE_NAME(opal_process_name_t * nam
116116
}
117117
#endif
118118

119+
/* check whether the RTE can support a connect/accept (dynamics) operation on the given port */
120+
OMPI_DECLSPEC bool ompi_rte_connect_accept_support(const char *port);
121+
119122
END_C_DECLS
120123

121124
#endif /* MCA_OMPI_RTE_ORTE_H */

ompi/mca/rte/orte/rte_orte_module.c

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
#include "orte/mca/routed/routed.h"
4040
#include "orte/util/name_fns.h"
4141
#include "orte/util/session_dir.h"
42+
#include "orte/util/show_help.h"
4243
#include "orte/runtime/orte_globals.h"
4344
#include "orte/runtime/orte_wait.h"
4445
#include "orte/runtime/orte_data_server.h"
@@ -198,3 +199,47 @@ void ompi_rte_wait_for_debugger(void)
198199
opal_pmix.deregister_evhandler(handler, NULL, NULL);
199200
}
200201
}
202+
203+
bool ompi_rte_connect_accept_support(const char *port)
204+
{
205+
char *ptr, *tmp;
206+
orte_process_name_t name;
207+
208+
/* were we launched by mpirun, or are we calling
209+
* without a defined port? */
210+
if (NULL == orte_process_info.my_hnp_uri ||
211+
0 == strlen(port)) {
212+
return true;
213+
}
214+
215+
/* is the job family in the port different than my own? */
216+
tmp = strdup(port); // protect input
217+
if (NULL == (ptr = strchr(tmp, ':'))) {
218+
/* this port didn't come from us! */
219+
orte_show_help("help-orterun.txt", "orterun:malformedport", true);
220+
free(tmp);
221+
return false;
222+
}
223+
*ptr = '\0';
224+
if (ORTE_SUCCESS != orte_util_convert_string_to_process_name(&name, tmp)) {
225+
free(tmp);
226+
orte_show_help("help-orterun.txt", "orterun:malformedport", true);
227+
return false;
228+
}
229+
free(tmp);
230+
if (ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) == ORTE_JOB_FAMILY(name.jobid)) {
231+
/* same job family, so our infrastructure is adequate */
232+
return true;
233+
}
234+
235+
/* if the job family of the port is different than our own
236+
* and we were launched by mpirun, then we require ompi-server
237+
* support */
238+
if (NULL == orte_data_server_uri) {
239+
/* print a pretty help message */
240+
orte_show_help("help-orterun.txt", "orterun:server-unavailable", true);
241+
return false;
242+
}
243+
244+
return true;
245+
}

ompi/runtime/help-mpi-runtime.txt

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# All rights reserved.
1313
# Copyright (c) 2007-2015 Cisco Systems, Inc. All rights reserved.
1414
# Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
15+
# Copyright (c) 2017 Intel, Inc. All rights reserved.
1516
# $COPYRIGHT$
1617
#
1718
# Additional copyrights may follow
@@ -93,3 +94,13 @@ Open MPI with --enable-heterogeneous.
9394
[no cuda support]
9495
The user requested CUDA support with the --mca mpi_cuda_support 1 flag
9596
but the library was not compiled with any support.
97+
#
98+
[noconxcpt]
99+
The user has called an operation involving MPI_Comm_connect and/or MPI_Comm_accept,
100+
but this environment lacks the necessary infrastructure support for
101+
that operation. Open MPI relies on the PMIx_Publish/Lookup (or one of
102+
its predecessors) APIs for this operation.
103+
104+
This typically happens when launching outside of mpirun where the underlying
105+
resource manager does not provide publish/lookup support. One way of solving
106+
the problem is to simply use mpirun to start the application.

opal/mca/pmix/base/base.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
2+
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
33
* $COPYRIGHT$
44
*
55
* Additional copyrights may follow
@@ -57,6 +57,7 @@ OPAL_DECLSPEC void opal_pmix_base_set_evbase(opal_event_base_t *evbase);
5757

5858
/* State for the PMIx base framework; a single instance is declared
 * as the extern variable opal_pmix_base. */
typedef struct {
5959
opal_event_base_t *evbase; /* event base; presumably the one installed by opal_pmix_base_set_evbase() - TODO confirm */
60+
int timeout; /* data exchange timeout in seconds (MCA var "exchange_timeout"); initialized to -1, and only values > 0 override the timeout passed to opal_pmix_base_exchange() */
6061
} opal_pmix_base_t;
6162

6263
extern opal_pmix_base_t opal_pmix_base;

opal/mca/pmix/base/pmix_base_fns.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,12 @@ int opal_pmix_base_exchange(opal_value_t *indat,
193193
info = OBJ_NEW(opal_value_t);
194194
info->key = strdup(OPAL_PMIX_TIMEOUT);
195195
info->type = OPAL_INT;
196-
info->data.integer = timeout;
196+
if (0 < opal_pmix_base.timeout) {
197+
/* the user has overridden the default */
198+
info->data.integer = opal_pmix_base.timeout;
199+
} else {
200+
info->data.integer = timeout;
201+
}
197202
opal_list_append(&mlist, &info->super);
198203

199204
/* if a non-blocking version of lookup isn't

opal/mca/pmix/base/pmix_base_frame.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,12 @@ static int opal_pmix_base_frame_register(mca_base_register_flag_t flags)
4747
(void) mca_base_var_register("opal", "pmix", "base", "collect_data", "Collect all data during modex",
4848
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9,
4949
MCA_BASE_VAR_SCOPE_READONLY, &opal_pmix_collect_all_data);
50+
51+
opal_pmix_base.timeout = -1;
52+
(void) mca_base_var_register("opal", "pmix", "base", "exchange_timeout",
53+
"Time (in seconds) to wait for a data exchange to complete",
54+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
55+
MCA_BASE_VAR_SCOPE_READONLY, &opal_pmix_base.timeout);
5056
return OPAL_SUCCESS;
5157
}
5258

orte/orted/help-orted.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,9 @@ A request has timed out and will therefore fail:
6767
Operation: %s
6868

6969
Your job may terminate as a result of this problem. You may want to
70-
adjust the MCA parameter pmix_server_max_wait and try again.
70+
adjust the MCA parameter pmix_server_max_wait and try again. If this
71+
occurred during a connect/accept operation, you can adjust the wait time
72+
using the pmix_base_exchange_timeout parameter.
7173
#
7274
[noroom]
7375
A request for an asynchronous runtime operation cannot be fulfilled

orte/orted/pmix/pmix_server.c

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -138,14 +138,6 @@ void pmix_server_register_params(void)
138138
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL,
139139
&orte_pmix_server_globals.timeout);
140140

141-
/* register the URI of the UNIVERSAL data server */
142-
orte_pmix_server_globals.server_uri = NULL;
143-
(void) mca_base_var_register ("orte", "pmix", NULL, "server_uri",
144-
"URI of a session-level keyval server for publish/lookup operations",
145-
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
146-
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL,
147-
&orte_pmix_server_globals.server_uri);
148-
149141
/* whether or not to wait for the universal server */
150142
orte_pmix_server_globals.wait_for_server = false;
151143
(void) mca_base_var_register ("orte", "pmix", NULL, "wait_for_server",

orte/orted/pmix/pmix_server_pub.c

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -59,13 +59,13 @@ static int init_server(void)
5959

6060
/* if the universal server wasn't specified, then we use
6161
* our own HNP for that purpose */
62-
if (NULL == orte_pmix_server_globals.server_uri) {
62+
if (NULL == orte_data_server_uri) {
6363
orte_pmix_server_globals.server = *ORTE_PROC_MY_HNP;
6464
} else {
65-
if (0 == strncmp(orte_pmix_server_globals.server_uri, "file", strlen("file")) ||
66-
0 == strncmp(orte_pmix_server_globals.server_uri, "FILE", strlen("FILE"))) {
65+
if (0 == strncmp(orte_data_server_uri, "file", strlen("file")) ||
66+
0 == strncmp(orte_data_server_uri, "FILE", strlen("FILE"))) {
6767
/* it is a file - get the filename */
68-
filename = strchr(orte_pmix_server_globals.server_uri, ':');
68+
filename = strchr(orte_data_server_uri, ':');
6969
if (NULL == filename) {
7070
/* filename is not correctly formatted */
7171
orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad", true,
@@ -121,7 +121,6 @@ static int init_server(void)
121121
* as a background job - e.g., in scripts
122122
*/
123123
if (orte_pmix_server_globals.wait_for_server) {
124-
opal_output(0, "WAIT");
125124
/* ping the server */
126125
struct timeval timeout;
127126
timeout.tv_sec = orte_pmix_server_globals.timeout;
@@ -141,8 +140,6 @@ static int init_server(void)
141140
}
142141
}
143142

144-
opal_output(0, "SERVER READY");
145-
146143
return ORTE_SUCCESS;
147144
}
148145

0 commit comments

Comments
 (0)