Skip to content

Commit 7cbea77

Browse files
author
Ralph Castain
authored
Merge pull request #3778 from rhc54/topic/warn
Attempt to detect when we are direct-launched without the necessary P…
2 parents cb19296 + bd4a6fe commit 7cbea77

File tree

7 files changed

+109
-7
lines changed

7 files changed

+109
-7
lines changed

ompi/runtime/ompi_mpi_init.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -990,7 +990,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
990990
error:
991991
if (ret != OMPI_SUCCESS) {
992992
/* Only print a message if one was not already printed */
993-
if (NULL != error) {
993+
if (NULL != error && OMPI_ERR_SILENT != ret) {
994994
const char *err_msg = opal_strerror(ret);
995995
opal_show_help("help-mpi-runtime.txt",
996996
"mpi_init:startup:internal-failure", true,

orte/mca/ess/base/help-ess-base.txt

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,3 +49,43 @@ MCA parameter:
4949
param: %s
5050

5151
This is not a recognized signal value. Please fix or remove it.
52+
#
53+
[slurm-error]
54+
The application appears to have been direct launched using "srun",
55+
but OMPI was not built with SLURM's PMI support and therefore cannot
56+
execute. There are several options for building PMI support under
57+
SLURM, depending upon the SLURM version you are using:
58+
59+
Version 16.05 or later: you can use SLURM's PMIx support. This
60+
requires that you configure and build SLURM --with-pmix.
61+
62+
Versions earlier than 16.05: you must use either SLURM's PMI-1 or
63+
PMI-2 support. SLURM builds PMI-1 by default, or you can manually
64+
install PMI-2. You must then build Open MPI using --with-pmi pointing
65+
to the SLURM PMI library location.
66+
67+
Please configure as appropriate and try again.
68+
#
69+
[slurm-error2]
70+
The application appears to have been direct launched using "srun",
71+
but OMPI was not built with SLURM support. This usually happens
72+
when OMPI was not configured --with-slurm and we weren't able
73+
to discover a SLURM installation in the usual places.
74+
75+
Please configure as appropriate and try again.
76+
#
77+
[alps-error]
78+
The application appears to have been direct launched using "aprun",
79+
but OMPI was not built with ALPS PMI support and therefore cannot
80+
execute. You must build Open MPI using --with-pmi pointing
81+
to the ALPS PMI library location.
82+
83+
Please configure as appropriate and try again.
84+
#
85+
[alps-error2]
86+
The application appears to have been direct launched using "aprun",
87+
but OMPI was not built with ALPS support. This usually happens
88+
when OMPI was not configured --with-alps and we weren't able
89+
to discover an ALPS installation in the usual places.
90+
91+
Please configure as appropriate and try again.

orte/mca/ess/pmi/ess_pmi_module.c

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
#include "orte/mca/errmgr/errmgr.h"
5353
#include "orte/mca/grpcomm/grpcomm.h"
5454
#include "orte/mca/rml/rml.h"
55+
#include "orte/mca/schizo/schizo.h"
5556
#include "orte/util/proc_info.h"
5657
#include "orte/util/show_help.h"
5758
#include "orte/util/name_fns.h"
@@ -125,7 +126,24 @@ static int rte_init(void)
125126
opal_pmix_base_set_evbase(orte_event_base);
126127
/* initialize the selected module */
127128
if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init(NULL)))) {
128-
/* we cannot run */
129+
/* we cannot run - this could be due to being direct launched
130+
* without the required PMI support being built. Try to detect
131+
* that scenario and warn the user */
132+
if (ORTE_SCHIZO_DIRECT_LAUNCHED == orte_schizo.check_launch_environment() &&
133+
NULL != (envar = getenv("ORTE_SCHIZO_DETECTION"))) {
134+
if (0 == strcmp(envar, "SLURM")) {
135+
/* yes to both - so emit a hopefully helpful
136+
* error message and abort */
137+
orte_show_help_finalize();
138+
orte_show_help("help-ess-base.txt", "slurm-error", true);
139+
return ORTE_ERR_SILENT;
140+
} else if (0 == strcmp(envar, "ALPS")) {
141+
/* we were direct launched by ALPS */
142+
orte_show_help_finalize();
143+
orte_show_help("help-ess-base.txt", "alps-error", true);
144+
return ORTE_ERR_SILENT;
145+
}
146+
}
129147
error = "pmix init";
130148
goto error;
131149
}

orte/mca/ess/singleton/ess_singleton_component.c

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* All rights reserved.
1313
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
1414
* reserved.
15-
* Copyright (c) 2016 Intel, Inc. All rights reserved.
15+
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
1616
* $COPYRIGHT$
1717
*
1818
* Additional copyrights may follow
@@ -33,6 +33,7 @@
3333
#include "opal/mca/pmix/base/base.h"
3434

3535
#include "orte/util/proc_info.h"
36+
#include "orte/util/show_help.h"
3637
#include "orte/mca/schizo/schizo.h"
3738

3839
#include "orte/mca/ess/ess.h"
@@ -131,6 +132,32 @@ static int component_query(mca_base_module_t **module, int *priority)
131132
return ORTE_ERROR;
132133
}
133134

135+
/* we may be incorrectly trying to run as a singleton - e.g.,
136+
* someone direct-launched us under SLURM without building
137+
* ORTE --with-slurm or in a slurm environment (so we didn't
138+
* autodetect slurm). Try to detect that here. Sadly, we
139+
* cannot just use the schizo framework to help us here as
140+
* the corresponding schizo component may not have even
141+
* been built. So we have to do things a little uglier */
142+
143+
if (ORTE_SCHIZO_UNMANAGED_SINGLETON == ret) {
144+
/* see if we are in a SLURM allocation */
145+
if (NULL != getenv("SLURM_NODELIST")) {
146+
/* emit a hopefully helpful error message and abort */
147+
orte_show_help("help-ess-base.txt", "slurm-error2", true);
148+
*module = NULL;
149+
*priority = 0;
150+
return ORTE_ERR_SILENT;
151+
}
152+
/* see if we are under ALPS */
153+
if (NULL != getenv("ALPS_APP_ID")) {
154+
orte_show_help("help-ess-base.txt", "alps-error2", true);
155+
*module = NULL;
156+
*priority = 0;
157+
return ORTE_ERR_SILENT;
158+
}
159+
}
160+
134161
/* okay, we want to be selected as we must be a singleton */
135162
*priority = 100;
136163
*module = (mca_base_module_t *)&orte_ess_singleton_module;
@@ -142,4 +169,3 @@ static int component_close(void)
142169
{
143170
return ORTE_SUCCESS;
144171
}
145-

orte/mca/schizo/alps/schizo_alps.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2016 Intel, Inc. All rights reserved.
2+
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
33
* $COPYRIGHT$
44
*
55
* Additional copyrights may follow
@@ -65,9 +65,16 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
6565
* launch performance penalty for hwloc at high ppn on knl */
6666
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX "orte_bound_at_launch");
6767
opal_argv_append_nosize(&pushed_vals, "true");
68+
/* mark that we are native */
69+
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
70+
opal_argv_append_nosize(&pushed_vals, "NATIVE");
6871
goto setup;
6972
}
7073

74+
/* mark that we are on ALPS */
75+
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
76+
opal_argv_append_nosize(&pushed_vals, "ALPS");
77+
7178
/* see if we are running in a Cray PAGG container */
7279
fd = fopen(proc_job_file, "r");
7380
if (NULL == fd) {

orte/mca/schizo/orte/schizo_orte.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2016 Intel, Inc. All rights reserved.
2+
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
33
* $COPYRIGHT$
44
*
55
* Additional copyrights may follow
@@ -53,7 +53,7 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
5353
* so no need to further check that here. Instead,
5454
* see if we were direct launched vs launched via mpirun */
5555
if (NULL != orte_process_info.my_daemon_uri) {
56-
/* nope */
56+
/* we have a daemon URI, so we were launched via mpirun */
5757
myenv = ORTE_SCHIZO_NATIVE_LAUNCHED;
5858
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess");
5959
opal_argv_append_nosize(&pushed_vals, "pmi");
@@ -65,6 +65,10 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
6565
myenv = ORTE_SCHIZO_UNMANAGED_SINGLETON;
6666
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess");
6767
opal_argv_append_nosize(&pushed_vals, "singleton");
68+
/* mark that we are in ORTE */
69+
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
70+
opal_argv_append_nosize(&pushed_vals, "ORTE");
71+
6872

6973
setup:
7074
opal_output_verbose(1, orte_schizo_base_framework.framework_output,

orte/mca/schizo/slurm/schizo_slurm.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,9 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
6262
myenv = ORTE_SCHIZO_NATIVE_LAUNCHED;
6363
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess");
6464
opal_argv_append_nosize(&pushed_vals, "pmi");
65+
/* mark that we are native */
66+
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
67+
opal_argv_append_nosize(&pushed_vals, "NATIVE");
6568
goto setup;
6669
}
6770

@@ -72,6 +75,10 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
7275
return myenv;
7376
}
7477

78+
/* mark that we are in SLURM */
79+
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
80+
opal_argv_append_nosize(&pushed_vals, "SLURM");
81+
7582
/* we are in an allocation, but were we direct launched
7683
* or are we a singleton? */
7784
if (NULL == getenv("SLURM_STEP_ID")) {

0 commit comments

Comments
 (0)