
Commit a29ca2b
Parent: bf668ad
Author: Ralph Castain (committed)

Enable slurm operations on Cray with constraints

Clean up some errors in the nidmap code that caused us to send
unnecessary topologies.

Signed-off-by: Ralph Castain <[email protected]>

9 files changed: +73, -32 lines changed


orte/mca/plm/alps/help-plm-alps.txt
Lines changed: 1 addition & 4 deletions

@@ -10,6 +10,7 @@
 # University of Stuttgart. All rights reserved.
 # Copyright (c) 2004-2005 The Regents of the University of California.
 # All rights reserved.
+# Copyright (c) 2017 Intel, Inc. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
@@ -39,7 +40,3 @@ the map for this application. This can be caused by a lack of
 an allocation, or by an error in the Open MPI code. Please check
 to ensure you have a ALPS allocation. If you do, then please pass
 the error to the Open MPI user's mailing list for assistance.
-#
-[slurm-not-supported]
-mpirun is not a supported launcher on Cray XC using Native SLURM.
-srun must be used to launch jobs on these systems.

orte/mca/plm/alps/plm_alps.h
Lines changed: 1 addition & 1 deletion

@@ -9,6 +9,7 @@
 * University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 * All rights reserved.
+* Copyright (c) 2017 Intel, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -46,7 +47,6 @@ ORTE_MODULE_DECLSPEC extern orte_plm_alps_component_t
     mca_plm_alps_component;
 ORTE_DECLSPEC extern orte_plm_base_module_t
     orte_plm_alps_module;
-extern bool mca_plm_alps_using_aprun;
 
 END_C_DECLS

orte/mca/plm/alps/plm_alps_component.c
Lines changed: 6 additions & 2 deletions

@@ -12,6 +12,7 @@
 * All rights reserved.
 * Copyright (c) 2015 Los Alamos National Security, LLC. All rights
 * reserved.
+* Copyright (c) 2017 Intel, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -43,7 +44,6 @@
 */
 const char *mca_plm_alps_component_version_string =
     "Open MPI alps plm MCA component version " ORTE_VERSION;
-bool mca_plm_alps_using_aprun = {true};
 
 
 /*
@@ -158,7 +158,11 @@ static int orte_plm_alps_component_query(mca_base_module_t **module, int *priority)
     }
 
     if((NULL != wlm_detected) && !strcmp(slurm, wlm_detected)) {
-        mca_plm_alps_using_aprun = false;
+        /* we are in a Cray SLURM environment, so we don't want
+         * this plm component */
+        *priority = 0;
+        *module = NULL;
+        return ORTE_ERROR;
     }
 #endif
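With mca_plm_alps_using_aprun gone, the alps component now withdraws from selection outright whenever Cray's wlm_detect facility reports SLURM as the active workload manager. The following is a minimal, self-contained sketch of that MCA opt-out idiom (zero priority, NULL module, error return); the stand-in types, the stubbed wlm_detect_get_active(), and the fallback priority of 75 are assumptions made for the sketch, not code from this commit.

#include <stdio.h>
#include <string.h>

/* stand-ins so the sketch compiles alone; the real definitions live
 * in the ORTE/OPAL MCA headers */
typedef struct { int dummy; } mca_base_module_t;
#define ORTE_SUCCESS  0
#define ORTE_ERROR   -1
static mca_base_module_t orte_plm_alps_module;

/* stub standing in for Cray's wlm_detect_get_active(); here it simply
 * pretends that SLURM is the active workload manager */
static char *wlm_detect_get_active(void) { return "SLURM"; }

static int orte_plm_alps_component_query(mca_base_module_t **module,
                                         int *priority)
{
    char slurm[] = "SLURM";
    char *wlm_detected = wlm_detect_get_active();

    if ((NULL != wlm_detected) && !strcmp(slurm, wlm_detected)) {
        /* we are in a Cray SLURM environment, so we don't want
         * this plm component - withdraw from selection */
        *priority = 0;
        *module = NULL;
        return ORTE_ERROR;
    }
    *priority = 75;   /* placeholder priority, not the real default */
    *module = &orte_plm_alps_module;
    return ORTE_SUCCESS;
}

int main(void)
{
    mca_base_module_t *module;
    int priority, rc;

    rc = orte_plm_alps_component_query(&module, &priority);
    printf("query returned %d (priority %d)\n", rc, priority);
    return 0;
}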

orte/mca/plm/alps/plm_alps_module.c
Lines changed: 0 additions & 17 deletions

@@ -121,23 +121,6 @@ static int plm_alps_init(void)
         return rc;
     }
 
-    /*
-     * owing to way the SLURM PLM component works, we can't use
-     * it on Cray XC systems as currently designed. The problem
-     * is the MPI processes launched on the head node (where the
-     * ORTE_PROC_IS_HNP evalues to true) get launched by a daemon
-     * (mpirun) which is not a child of a slurmd daemon. This
-     * means that any RDMA credentials obtained via the odls/alps
-     * local launcher are incorrect.
-     *
-     * So for now, we just don't support mpirun launched jobs
-     * on Cray XC systems using Native SLURM.
-     */
-    if (false == mca_plm_alps_using_aprun) {
-        orte_show_help("help-plm-alps.txt", "slurm-not-supported", true);
-        exit(-1);
-    }
-
     if (orte_do_not_launch) {
         /* must map daemons since we won't be launching them */
         orte_plm_globals.daemon_nodes_assigned_at_launch = true;

orte/mca/plm/slurm/help-plm-slurm.txt
Lines changed: 16 additions & 1 deletion

@@ -10,7 +10,7 @@
 # University of Stuttgart. All rights reserved.
 # Copyright (c) 2004-2005 The Regents of the University of California.
 # All rights reserved.
-# Copyright (c) 2014 Intel, Inc. All rights reserved.
+# Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
@@ -49,3 +49,18 @@ are running.
 
 Please consult with your system administrator about obtaining
 such support.
+[no-local-support]
+The SLURM process starter cannot start processes local to
+mpirun when executing under a Cray environment. The problem
+is that mpirun is not itself a child of a slurmd daemon. Thus,
+any processes mpirun itself starts will inherit incorrect
+RDMA credentials.
+
+Your application will be mapped and run (assuming adequate
+resources) on the remaining allocated nodes. If adequate
+resources are not available, you will need to exit and obtain
+a larger allocation.
+
+This situation will be fixed in a future release. Meantime,
+you can turn "off" this warning by setting the plm_slurm_warning
+MCA param to 0.
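The [no-local-support] tag above is what the orte_show_help() call in plm_slurm_module.c (below) references. Since plm_slurm_warning is registered as an ordinary MCA parameter (see plm_slurm_component.c below), the usual Open MPI mechanisms should apply for silencing the message: something like "mpirun --mca plm_slurm_warning 0 ..." on the command line, or "export OMPI_MCA_plm_slurm_warning=0" in the environment. The spelling follows the standard framework_component_variable naming convention; the invocations shown are illustrative, not taken from the commit.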

orte/mca/plm/slurm/plm_slurm.h
Lines changed: 2 additions & 0 deletions

@@ -9,6 +9,7 @@
 * University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 * All rights reserved.
+* Copyright (c) 2017 Intel, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -29,6 +30,7 @@ BEGIN_C_DECLS
 struct orte_plm_slurm_component_t {
     orte_plm_base_component_t super;
     char *custom_args;
+    bool slurm_warning_msg;
 };
 typedef struct orte_plm_slurm_component_t orte_plm_slurm_component_t;

orte/mca/plm/slurm/plm_slurm_component.c
Lines changed: 10 additions & 0 deletions

@@ -12,6 +12,7 @@
 * All rights reserved.
 * Copyright (c) 2015 Los Alamos National Security, LLC. All rights
 * reserved.
+* Copyright (c) 2017 Intel, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -28,7 +29,9 @@
 #include "orte_config.h"
 #include "orte/constants.h"
 
+#include "opal/util/opal_environ.h"
 #include "orte/util/name_fns.h"
+#include "orte/util/show_help.h"
 #include "orte/runtime/orte_globals.h"
 
 #include "orte/mca/plm/plm.h"
@@ -99,6 +102,13 @@ static int plm_slurm_register(void)
                                             MCA_BASE_VAR_SCOPE_READONLY,
                                             &mca_plm_slurm_component.custom_args);
 
+    mca_plm_slurm_component.slurm_warning_msg = true;
+    (void) mca_base_component_var_register (comp, "warning", "Turn off warning message",
+                                            MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
+                                            OPAL_INFO_LVL_9,
+                                            MCA_BASE_VAR_SCOPE_READONLY,
+                                            &mca_plm_slurm_component.slurm_warning_msg);
+
     return ORTE_SUCCESS;
 }
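A note on visibility: the variable is registered at OPAL_INFO_LVL_9, the least-visible info level, so it will not appear in a default ompi_info listing. Assuming the standard ompi_info options, something like "ompi_info --param plm slurm --level 9" should be needed to display plm_slurm_warning and its current value.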

orte/mca/plm/slurm/plm_slurm_module.c
Lines changed: 20 additions & 1 deletion

@@ -65,7 +65,7 @@
 #include "orte/runtime/orte_wait.h"
 #include "orte/runtime/orte_quit.h"
 #include "orte/mca/errmgr/errmgr.h"
-#include "orte/mca/rmaps/rmaps.h"
+#include "orte/mca/rmaps/base/base.h"
 #include "orte/mca/state/state.h"
 
 #include "orte/orted/orted.h"
@@ -193,6 +193,25 @@ static void launch_daemons(int fd, short args, void *cbdata)
                          "%s plm:slurm: LAUNCH DAEMONS CALLED",
                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 
+#if SLURM_CRAY_ENV
+    /* if we are in a Cray-SLURM environment, then we cannot
+     * launch procs local to the HNP. The problem is that the
+     * MPI processes launched on the head node (where
+     * ORTE_PROC_IS_HNP evaluates to true) get launched by a
+     * daemon (mpirun) which is not a child of a slurmd daemon.
+     * This means that any RDMA credentials obtained via the
+     * odls/alps local launcher are incorrect. So warn the user
+     * and set the envar for no_schedule_local if mpirun is not
+     * on a system management node (i.e. is part of the
+     * allocation) and the "no_use_local" flag hasn't been set */
+    if (mca_plm_slurm_component.slurm_warning_msg &&
+        (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL))) {
+        orte_show_help("help-plm-slurm.txt", "no-local-support", true);
+        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL);
+        mca_plm_slurm_component.slurm_warning_msg = false;  // only do this once
+    }
+#endif
+
 /* if we are launching debugger daemons, then just go
  * do it - no new daemons will be launched
  */
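The guard above works entirely through ORTE's mapping directives, a bit field carried in orte_rmaps_base.mapping. A minimal self-contained sketch of that flag arithmetic follows; the type and flag values are invented for illustration (the real definitions live in ORTE's rmaps headers and differ), but the test-then-set sequence mirrors the hunk.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* illustrative stand-ins for the ORTE definitions */
typedef uint16_t orte_mapping_policy_t;
#define ORTE_MAPPING_NO_USE_LOCAL        0x0100
#define ORTE_GET_MAPPING_DIRECTIVE(p)    ((p) & 0xff00)
#define ORTE_SET_MAPPING_DIRECTIVE(p, d) ((p) |= (d))

int main(void)
{
    orte_mapping_policy_t mapping = 0;
    bool hnp_is_allocated = true;   /* mpirun's node is part of the allocation */
    bool warn = true;               /* models slurm_warning_msg */

    /* same shape as the guard in launch_daemons(): warn and disable
     * local placement only if local placement is still permitted */
    if (warn && hnp_is_allocated &&
        !(ORTE_GET_MAPPING_DIRECTIVE(mapping) & ORTE_MAPPING_NO_USE_LOCAL)) {
        ORTE_SET_MAPPING_DIRECTIVE(mapping, ORTE_MAPPING_NO_USE_LOCAL);
        warn = false;               /* only do this once */
        printf("local placement disabled: mapping = 0x%04x\n",
               (unsigned)mapping);
    }
    (void)warn;
    return 0;
}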

orte/util/nidmap.c
Lines changed: 17 additions & 6 deletions

@@ -618,14 +618,25 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)
 
     /* handle the topologies - as the most common case by far
      * is to have homogeneous topologies, we only send them
-     * if something is different */
-    if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) {
-        ui8 = 2;
-    } else {
-        ui8 = 1;
+     * if something is different. We know that the HNP is
+     * the first topology, and that any differing topology
+     * on the compute nodes must follow. So send the topologies
+     * if and only if:
+     *
+     * (a) the HNP is being used to house application procs and
+     *     there is more than one topology on our list; or
+     *
+     * (b) the HNP is not being used, but there are more than
+     *     two topologies on our list, thus indicating that
+     *     there are multiple topologies on the compute nodes
+     */
+    if (!orte_hnp_is_allocated || (ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) {
+        /* remove the first topo on the list */
+        item = opal_list_remove_first(&topos);
+        OBJ_RELEASE(item);
     }
     tmp = NULL;
-    if (ui8 < opal_list_get_size(&topos)) {
+    if (1 < opal_list_get_size(&topos)) {
         opal_buffer_t bucket, *bptr;
         OBJ_CONSTRUCT(&bucket, opal_buffer_t);
         while (NULL != (item = opal_list_remove_first(&topos))) {
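The reworked decision is easier to see with the topology list reduced to a count. Here is a minimal sketch, assuming (as the new comment states) that the HNP's topology is always first on the list; it models the commit's rule rather than reproducing the ORTE code.

#include <assert.h>
#include <stdbool.h>

/* ntopos: number of distinct topologies discovered, the HNP's first */
static bool must_send_topologies(int ntopos, bool hnp_hosts_procs)
{
    if (!hnp_hosts_procs) {
        /* the HNP's topology is irrelevant to the compute nodes; models
         * opal_list_remove_first(&topos) + OBJ_RELEASE(item) */
        --ntopos;
    }
    /* exactly one remaining topology is the homogeneous common case
     * and need not be sent; models 1 < opal_list_get_size(&topos) */
    return 1 < ntopos;
}

int main(void)
{
    /* homogeneous compute nodes, HNP excluded from the map: no send */
    assert(!must_send_topologies(2, false));
    /* HNP hosting procs with one differing compute-node topology: send */
    assert(must_send_topologies(2, true));
    /* heterogeneous compute nodes even without the HNP: send */
    assert(must_send_topologies(3, false));
    return 0;
}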
