Commit 666386f

Author: Ralph Castain
Merge pull request #3294 from rhc54/topic/modx

Enable SLURM on Cray with constraints and fix bug in nidmap

2 parents: d7f283c + a29ca2b

9 files changed: +73 additions, -32 deletions

orte/mca/plm/alps/help-plm-alps.txt

Lines changed: 1 addition & 4 deletions

@@ -10,6 +10,7 @@
 # University of Stuttgart. All rights reserved.
 # Copyright (c) 2004-2005 The Regents of the University of California.
 # All rights reserved.
+# Copyright (c) 2017 Intel, Inc. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
@@ -39,7 +40,3 @@ the map for this application. This can be caused by a lack of
 an allocation, or by an error in the Open MPI code. Please check
 to ensure you have a ALPS allocation. If you do, then please pass
 the error to the Open MPI user's mailing list for assistance.
-#
-[slurm-not-supported]
-mpirun is not a supported launcher on Cray XC using Native SLURM.
-srun must be used to launch jobs on these systems.

orte/mca/plm/alps/plm_alps.h

Lines changed: 1 addition & 1 deletion

@@ -9,6 +9,7 @@
 * University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 * All rights reserved.
+* Copyright (c) 2017 Intel, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -46,7 +47,6 @@ ORTE_MODULE_DECLSPEC extern orte_plm_alps_component_t
     mca_plm_alps_component;
 ORTE_DECLSPEC extern orte_plm_base_module_t
     orte_plm_alps_module;
-extern bool mca_plm_alps_using_aprun;
 
 END_C_DECLS

orte/mca/plm/alps/plm_alps_component.c

Lines changed: 6 additions & 2 deletions

@@ -12,6 +12,7 @@
 * All rights reserved.
 * Copyright (c) 2015 Los Alamos National Security, LLC. All rights
 * reserved.
+* Copyright (c) 2017 Intel, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -43,7 +44,6 @@
 */
 const char *mca_plm_alps_component_version_string =
     "Open MPI alps plm MCA component version " ORTE_VERSION;
-bool mca_plm_alps_using_aprun = {true};
 
 
 /*
@@ -158,7 +158,11 @@ static int orte_plm_alps_component_query(mca_base_module_t **module, int *priority)
     }
 
     if ((NULL != wlm_detected) && !strcmp(slurm, wlm_detected)) {
-        mca_plm_alps_using_aprun = false;
+        /* we are in a Cray SLURM environment, so we don't want
+         * this plm component */
+        *priority = 0;
+        *module = NULL;
+        return ORTE_ERROR;
     }
 #endif
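The query-hook change above follows the standard MCA component-selection pattern: instead of recording an aprun flag for later use, the alps component now disqualifies itself outright when Cray's native SLURM is detected, by zeroing its priority, returning no module, and reporting an error. A minimal self-contained sketch of that pattern, with hypothetical stand-ins for the ORTE types, detection logic, and return codes:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Hypothetical stand-ins for the ORTE module type and return codes */
    typedef struct { const char *name; } module_t;
    #define MY_SUCCESS  0
    #define MY_ERROR   -1

    static module_t alps_module = { "alps" };

    /* A component opts out of selection by zeroing its priority,
     * returning no module, and signaling an error from its query hook,
     * the same shape as the change to orte_plm_alps_component_query */
    static int alps_component_query(module_t **module, int *priority)
    {
        /* stand-in for the real workload-manager detection */
        const char *wlm_detected = getenv("WLM");

        if (NULL != wlm_detected && 0 == strcmp("slurm", wlm_detected)) {
            /* Cray SLURM environment: disqualify this component */
            *priority = 0;
            *module = NULL;
            return MY_ERROR;
        }
        *priority = 75;   /* arbitrary illustrative priority */
        *module = &alps_module;
        return MY_SUCCESS;
    }

    int main(void)
    {
        module_t *m = NULL;
        int prio = -1;
        int rc = alps_component_query(&m, &prio);
        printf("rc=%d module=%s priority=%d\n",
               rc, m ? m->name : "(none)", prio);
        return 0;
    }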

orte/mca/plm/alps/plm_alps_module.c

Lines changed: 0 additions & 17 deletions

@@ -121,23 +121,6 @@ static int plm_alps_init(void)
         return rc;
     }
 
-    /*
-     * owing to way the SLURM PLM component works, we can't use
-     * it on Cray XC systems as currently designed. The problem
-     * is the MPI processes launched on the head node (where the
-     * ORTE_PROC_IS_HNP evalues to true) get launched by a daemon
-     * (mpirun) which is not a child of a slurmd daemon. This
-     * means that any RDMA credentials obtained via the odls/alps
-     * local launcher are incorrect.
-     *
-     * So for now, we just don't support mpirun launched jobs
-     * on Cray XC systems using Native SLURM.
-     */
-    if (false == mca_plm_alps_using_aprun) {
-        orte_show_help("help-plm-alps.txt", "slurm-not-supported", true);
-        exit(-1);
-    }
-
     if (orte_do_not_launch) {
         /* must map daemons since we won't be launching them */
         orte_plm_globals.daemon_nodes_assigned_at_launch = true;

orte/mca/plm/slurm/help-plm-slurm.txt

Lines changed: 16 additions & 1 deletion

@@ -10,7 +10,7 @@
 # University of Stuttgart. All rights reserved.
 # Copyright (c) 2004-2005 The Regents of the University of California.
 # All rights reserved.
-# Copyright (c) 2014 Intel, Inc. All rights reserved.
+# Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
@@ -49,3 +49,18 @@ are running.
 
 Please consult with your system administrator about obtaining
 such support.
+[no-local-support]
+The SLURM process starter cannot start processes local to
+mpirun when executing under a Cray environment. The problem
+is that mpirun is not itself a child of a slurmd daemon. Thus,
+any processes mpirun itself starts will inherit incorrect
+RDMA credentials.
+
+Your application will be mapped and run (assuming adequate
+resources) on the remaining allocated nodes. If adequate
+resources are not available, you will need to exit and obtain
+a larger allocation.
+
+This situation will be fixed in a future release. Meantime,
+you can turn "off" this warning by setting the plm_slurm_warning
+MCA param to 0.
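As the new help text notes, the warning is controlled by an MCA parameter. Using the standard Open MPI command-line syntax for setting MCA parameters (the application name and process count here are placeholders), it could be silenced with:

    mpirun --mca plm_slurm_warning 0 -n 4 ./my_app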

orte/mca/plm/slurm/plm_slurm.h

Lines changed: 2 additions & 0 deletions

@@ -9,6 +9,7 @@
 * University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 * All rights reserved.
+* Copyright (c) 2017 Intel, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -29,6 +30,7 @@ BEGIN_C_DECLS
 struct orte_plm_slurm_component_t {
     orte_plm_base_component_t super;
     char *custom_args;
+    bool slurm_warning_msg;
 };
 typedef struct orte_plm_slurm_component_t orte_plm_slurm_component_t;

orte/mca/plm/slurm/plm_slurm_component.c

Lines changed: 10 additions & 0 deletions

@@ -12,6 +12,7 @@
 * All rights reserved.
 * Copyright (c) 2015 Los Alamos National Security, LLC. All rights
 * reserved.
+* Copyright (c) 2017 Intel, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -28,7 +29,9 @@
 #include "orte_config.h"
 #include "orte/constants.h"
 
+#include "opal/util/opal_environ.h"
 #include "orte/util/name_fns.h"
+#include "orte/util/show_help.h"
 #include "orte/runtime/orte_globals.h"
 
 #include "orte/mca/plm/plm.h"
@@ -99,6 +102,13 @@ static int plm_slurm_register(void)
                                             MCA_BASE_VAR_SCOPE_READONLY,
                                             &mca_plm_slurm_component.custom_args);
 
+    mca_plm_slurm_component.slurm_warning_msg = true;
+    (void) mca_base_component_var_register (comp, "warning", "Turn off warning message",
+                                            MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
+                                            OPAL_INFO_LVL_9,
+                                            MCA_BASE_VAR_SCOPE_READONLY,
+                                            &mca_plm_slurm_component.slurm_warning_msg);
+
     return ORTE_SUCCESS;
 }
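A note on the parameter name: mca_base_component_var_register() composes the externally visible name from framework, component, and variable, so the "warning" variable registered here on the plm/slurm component surfaces as plm_slurm_warning, which matches the name cited in the help text above.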

orte/mca/plm/slurm/plm_slurm_module.c

Lines changed: 20 additions & 1 deletion

@@ -65,7 +65,7 @@
 #include "orte/runtime/orte_wait.h"
 #include "orte/runtime/orte_quit.h"
 #include "orte/mca/errmgr/errmgr.h"
-#include "orte/mca/rmaps/rmaps.h"
+#include "orte/mca/rmaps/base/base.h"
 #include "orte/mca/state/state.h"
 
 #include "orte/orted/orted.h"
@@ -193,6 +193,25 @@ static void launch_daemons(int fd, short args, void *cbdata)
                          "%s plm:slurm: LAUNCH DAEMONS CALLED",
                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 
+#if SLURM_CRAY_ENV
+    /* if we are in a Cray-SLURM environment, then we cannot
+     * launch procs local to the HNP. The problem
+     * is the MPI processes launched on the head node (where the
+     * ORTE_PROC_IS_HNP evalues to true) get launched by a daemon
+     * (mpirun) which is not a child of a slurmd daemon. This
+     * means that any RDMA credentials obtained via the odls/alps
+     * local launcher are incorrect. So warn the user and set
+     * the envar for no_schedule_local if mpirun is not on a
+     * system management node (i.e. is part of the allocation)
+     * and the "no_use_local" flag hasn't been set */
+    if (mca_plm_slurm_component.slurm_warning_msg &&
+        (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL))) {
+        orte_show_help("help-plm-slurm.txt", "no-local-support", true);
+        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL);
+        mca_plm_slurm_component.slurm_warning_msg = false;  // only do this once
+    }
+#endif
+
     /* if we are launching debugger daemons, then just go
      * do it - no new daemons will be launched
      */
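The ORTE_GET/SET_MAPPING_DIRECTIVE macros used above manipulate a bitmask of mapping directives. A simplified, runnable model of this one-shot warn-and-restrict branch (the flag values and macro shapes are illustrative assumptions, not the real ORTE definitions):

    #include <stdint.h>
    #include <stdio.h>

    /* hypothetical directive bits; the real ORTE_MAPPING_* values differ */
    typedef uint16_t mapping_policy_t;
    #define MAPPING_NO_USE_LOCAL   0x0100
    #define MAPPING_DIRECTIVE_MASK 0xff00

    #define GET_MAPPING_DIRECTIVE(m)    ((m) & MAPPING_DIRECTIVE_MASK)
    #define SET_MAPPING_DIRECTIVE(m, f) ((m) |= (f))

    int main(void)
    {
        mapping_policy_t mapping = 0;
        int warn_enabled = 1;  /* models slurm_warning_msg */

        /* mirror of the guarded branch: warn once, then forbid placing
         * application procs on the node hosting mpirun */
        if (warn_enabled &&
            !(GET_MAPPING_DIRECTIVE(mapping) & MAPPING_NO_USE_LOCAL)) {
            puts("warning: local procs would inherit bad RDMA credentials");
            SET_MAPPING_DIRECTIVE(mapping, MAPPING_NO_USE_LOCAL);
            warn_enabled = 0;  /* only do this once */
        }
        printf("no-use-local now set: %s\n",
               (GET_MAPPING_DIRECTIVE(mapping) & MAPPING_NO_USE_LOCAL)
                   ? "yes" : "no");
        return 0;
    }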

orte/util/nidmap.c

Lines changed: 17 additions & 6 deletions

@@ -618,14 +618,25 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)
 
     /* handle the topologies - as the most common case by far
      * is to have homogeneous topologies, we only send them
-     * if something is different */
-    if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) {
-        ui8 = 2;
-    } else {
-        ui8 = 1;
+     * if something is different. We know that the HNP is
+     * the first topology, and that any differing topology
+     * on the compute nodes must follow. So send the topologies
+     * if and only if:
+     *
+     * (a) the HNP is being used to house application procs and
+     *     there is more than one topology on our list; or
+     *
+     * (b) the HNP is not being used, but there are more than
+     *     two topologies on our list, thus indicating that
+     *     there are multiple topologies on the compute nodes
+     */
+    if (!orte_hnp_is_allocated || (ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) {
+        /* remove the first topo on the list */
+        item = opal_list_remove_first(&topos);
+        OBJ_RELEASE(item);
     }
     tmp = NULL;
-    if (ui8 < opal_list_get_size(&topos)) {
+    if (1 < opal_list_get_size(&topos)) {
         opal_buffer_t bucket, *bptr;
         OBJ_CONSTRUCT(&bucket, opal_buffer_t);
         while (NULL != (item = opal_list_remove_first(&topos))) {
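The rewritten comment spells out the two send conditions; the code implements them uniformly by first dropping the HNP's topology (always first on the list) when the HNP hosts no application procs, then sending only if more than one topology remains. The old ui8 threshold did not match those stated conditions, which appears to be the nidmap bug the commit message refers to. A small self-contained model of the corrected decision, with a plain count standing in for the real opal_list of topologies:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    /* simplified model of the fixed decision in orte_util_encode_nodemap */
    static bool must_send_topologies(size_t ntopos, bool hnp_hosts_procs)
    {
        if (!hnp_hosts_procs && ntopos > 0) {
            /* the HNP topology is first on the list; when the HNP hosts
             * no application procs, drop it before counting (the diff
             * does this with opal_list_remove_first + OBJ_RELEASE) */
            ntopos -= 1;
        }
        /* homogeneous compute nodes contribute a single topology, so a
         * send is only needed when more than one remains */
        return 1 < ntopos;
    }

    int main(void)
    {
        /* HNP topology differs but HNP hosts no procs: nothing to send */
        printf("%d\n", must_send_topologies(2, false)); /* prints 0 */
        /* HNP hosts procs and differs from the compute nodes: must send */
        printf("%d\n", must_send_topologies(2, true));  /* prints 1 */
        return 0;
    }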
