From 78b59a5daba4c7297fc5c47ebf8e862c6ae27d7f Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 5 Apr 2017 17:32:39 -0700 Subject: [PATCH] Enable slurm operations on Cray with constraints Signed-off-by: Ralph Castain (cherry picked from commit a29ca2bb0d7e0a1f4749fe88c1aa6bd09837d0a0) --- orte/mca/plm/alps/help-plm-alps.txt | 5 +---- orte/mca/plm/alps/plm_alps.h | 2 +- orte/mca/plm/alps/plm_alps_component.c | 8 ++++++-- orte/mca/plm/alps/plm_alps_module.c | 19 +------------------ orte/mca/plm/slurm/help-plm-slurm.txt | 17 ++++++++++++++++- orte/mca/plm/slurm/plm_slurm.h | 2 ++ orte/mca/plm/slurm/plm_slurm_component.c | 10 ++++++++++ orte/mca/plm/slurm/plm_slurm_module.c | 23 +++++++++++++++++++++-- 8 files changed, 58 insertions(+), 28 deletions(-) diff --git a/orte/mca/plm/alps/help-plm-alps.txt b/orte/mca/plm/alps/help-plm-alps.txt index f109299a862..c0e3d0470fb 100644 --- a/orte/mca/plm/alps/help-plm-alps.txt +++ b/orte/mca/plm/alps/help-plm-alps.txt @@ -10,6 +10,7 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -39,7 +40,3 @@ the map for this application. This can be caused by a lack of an allocation, or by an error in the Open MPI code. Please check to ensure you have a ALPS allocation. If you do, then please pass the error to the Open MPI user's mailing list for assistance. -# -[slurm-not-supported] -mpirun is not a supported launcher on Cray XC using Native SLURM. -srun must be used to launch jobs on these systems. diff --git a/orte/mca/plm/alps/plm_alps.h b/orte/mca/plm/alps/plm_alps.h index d15ae07ffa0..bdc039fedaf 100644 --- a/orte/mca/plm/alps/plm_alps.h +++ b/orte/mca/plm/alps/plm_alps.h @@ -9,6 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. 
+ * Copyright (c) 2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -46,7 +47,6 @@ ORTE_MODULE_DECLSPEC extern orte_plm_alps_component_t mca_plm_alps_component; ORTE_DECLSPEC extern orte_plm_base_module_t orte_plm_alps_module; -extern bool mca_plm_alps_using_aprun; END_C_DECLS diff --git a/orte/mca/plm/alps/plm_alps_component.c b/orte/mca/plm/alps/plm_alps_component.c index e474cd59130..f906a5cb1be 100644 --- a/orte/mca/plm/alps/plm_alps_component.c +++ b/orte/mca/plm/alps/plm_alps_component.c @@ -12,6 +12,7 @@ * All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -43,7 +44,6 @@ */ const char *mca_plm_alps_component_version_string = "Open MPI alps plm MCA component version " ORTE_VERSION; -bool mca_plm_alps_using_aprun = {true}; /* @@ -158,7 +158,11 @@ static int orte_plm_alps_component_query(mca_base_module_t **module, int *priori } if((NULL != wlm_detected) && !strcmp(slurm, wlm_detected)) { - mca_plm_alps_using_aprun = false; + /* we are in a Cray SLURM environment, so we don't want + * this plm component */ + *priority = 0; + *module = NULL; + return ORTE_ERROR; } #endif diff --git a/orte/mca/plm/alps/plm_alps_module.c b/orte/mca/plm/alps/plm_alps_module.c index 8cf9c287fe8..93f86114773 100644 --- a/orte/mca/plm/alps/plm_alps_module.c +++ b/orte/mca/plm/alps/plm_alps_module.c @@ -13,7 +13,7 @@ * Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -121,23 +121,6 @@ static int plm_alps_init(void) return rc; } - /* - * owing to way the SLURM PLM component works, we can't use - * it on Cray XC systems as currently designed. The problem - * is the MPI processes launched on the head node (where the - * ORTE_PROC_IS_HNP evalues to true) get launched by a daemon - * (mpirun) which is not a child of a slurmd daemon. This - * means that any RDMA credentials obtained via the odls/alps - * local launcher are incorrect. - * - * So for now, we just don't support mpirun launched jobs - * on Cray XC systems using Native SLURM. - */ - if (false == mca_plm_alps_using_aprun) { - orte_show_help("help-plm-alps.txt", "slurm-not-supported", true); - exit(-1); - } - if (orte_do_not_launch) { /* must map daemons since we won't be launching them */ orte_plm_globals.daemon_nodes_assigned_at_launch = true; diff --git a/orte/mca/plm/slurm/help-plm-slurm.txt b/orte/mca/plm/slurm/help-plm-slurm.txt index 8c450c0a283..837c3e88a89 100644 --- a/orte/mca/plm/slurm/help-plm-slurm.txt +++ b/orte/mca/plm/slurm/help-plm-slurm.txt @@ -10,7 +10,7 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. -# Copyright (c) 2014 Intel, Inc. All rights reserved. +# Copyright (c) 2014-2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -49,3 +49,18 @@ are running. Please consult with your system administrator about obtaining such support. +[no-local-support] +The SLURM process starter cannot start processes local to +mpirun when executing under a Cray environment. The problem +is that mpirun is not itself a child of a slurmd daemon. Thus, +any processes mpirun itself starts will inherit incorrect +RDMA credentials. + +Your application will be mapped and run (assuming adequate +resources) on the remaining allocated nodes. 
If adequate +resources are not available, you will need to exit and obtain +a larger allocation. + +This situation will be fixed in a future release. Meantime, +you can turn "off" this warning by setting the plm_slurm_warning +MCA param to 0. diff --git a/orte/mca/plm/slurm/plm_slurm.h b/orte/mca/plm/slurm/plm_slurm.h index eae239edf07..1e88ef60a84 100644 --- a/orte/mca/plm/slurm/plm_slurm.h +++ b/orte/mca/plm/slurm/plm_slurm.h @@ -9,6 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -29,6 +30,7 @@ BEGIN_C_DECLS struct orte_plm_slurm_component_t { orte_plm_base_component_t super; char *custom_args; + bool slurm_warning_msg; }; typedef struct orte_plm_slurm_component_t orte_plm_slurm_component_t; diff --git a/orte/mca/plm/slurm/plm_slurm_component.c b/orte/mca/plm/slurm/plm_slurm_component.c index 90d14dd24c7..3e29bd46231 100644 --- a/orte/mca/plm/slurm/plm_slurm_component.c +++ b/orte/mca/plm/slurm/plm_slurm_component.c @@ -12,6 +12,7 @@ * All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -28,7 +29,9 @@ #include "orte_config.h" #include "orte/constants.h" +#include "opal/util/opal_environ.h" #include "orte/util/name_fns.h" +#include "orte/util/show_help.h" #include "orte/runtime/orte_globals.h" #include "orte/mca/plm/plm.h" @@ -99,6 +102,13 @@ static int plm_slurm_register(void) MCA_BASE_VAR_SCOPE_READONLY, &mca_plm_slurm_component.custom_args); + mca_plm_slurm_component.slurm_warning_msg = true; + (void) mca_base_component_var_register (comp, "warning", "Turn off warning message", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_plm_slurm_component.slurm_warning_msg); + return ORTE_SUCCESS; } diff --git a/orte/mca/plm/slurm/plm_slurm_module.c b/orte/mca/plm/slurm/plm_slurm_module.c index dac5efde34a..354ec363481 100644 --- a/orte/mca/plm/slurm/plm_slurm_module.c +++ b/orte/mca/plm/slurm/plm_slurm_module.c @@ -12,7 +12,7 @@ * Copyright (c) 2006-2014 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -65,7 +65,7 @@ #include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_quit.h" #include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/rmaps/rmaps.h" +#include "orte/mca/rmaps/base/base.h" #include "orte/mca/state/state.h" #include "orte/orted/orted.h" @@ -193,6 +193,25 @@ static void launch_daemons(int fd, short args, void *cbdata) "%s plm:slurm: LAUNCH DAEMONS CALLED", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); +#if SLURM_CRAY_ENV + /* if we are in a Cray-SLURM environment, then we cannot + * launch procs local to the HNP. 
The problem + * is the MPI processes launched on the head node (where the + * ORTE_PROC_IS_HNP evaluates to true) get launched by a daemon + * (mpirun) which is not a child of a slurmd daemon. This + * means that any RDMA credentials obtained via the odls/alps + * local launcher are incorrect. So warn the user and set + * the envar for no_schedule_local if mpirun is not on a + * system management node (i.e. is part of the allocation) + * and the "no_use_local" flag hasn't been set */ + if (mca_plm_slurm_component.slurm_warning_msg && + (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL))) { + orte_show_help("help-plm-slurm.txt", "no-local-support", true); + ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL); + mca_plm_slurm_component.slurm_warning_msg = false; // only do this once + } +#endif + /* if we are launching debugger daemons, then just go * do it - no new daemons will be launched */