Skip to content

Commit 95dacd2

Browse files
committed
Fix singletons and ensure adequate PMIx version
OMPI can only support PMIx v3 and above. PRRTE requires at least PMIx v4, so protect against the case where OMPI is built against an external PMIx v3. Fix check of PMIx_Init return code for singleton operations. Ensure that the PMIx framework gets properly opened. Signed-off-by: Ralph Castain <[email protected]>
1 parent 973d101 commit 95dacd2

File tree

6 files changed

+50
-6
lines changed

6 files changed

+50
-6
lines changed

config/ompi_setup_prrte.m4

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,15 @@ AC_DEFUN([OMPI_SETUP_PRRTE],[
4646
[AC_HELP_STRING([--enable-prte-prefix-by-default],
4747
[Make "mpirun ..." behave exactly the same as "mpirun --prefix \$prefix" (where \$prefix is the value given to --prefix in configure) (default:enabled)])])
4848

49+
AS_IF([test "$opal_external_pmix_happy" = "yes" && test $opal_numerical_pmix_version -lt 4 && test "$enable_internal_rte" != "no"],
50+
[AC_MSG_WARN([OMPI's internal runtime environment "PRRTE" does not support])
51+
AC_MSG_WARN([PMIx versions less than v4.x as they lack adequate tool])
52+
AC_MSG_WARN([support. You can, if desired, build OMPI against an earlier])
53+
AC_MSG_WARN([version of PMIx for strictly direct-launch purposes - e.g., using])
54+
AC_MSG_WARN([Slurm's srun to launch the job - by configuring with the])
55+
AC_MSG_WARN([--disable-internal-rte option.])
56+
AC_MSG_ERROR([Cannot continue])])
57+
4958
AC_MSG_CHECKING([if RTE support is enabled])
5059
if test "$enable_internal_rte" != "no"; then
5160
AC_MSG_RESULT([yes])
@@ -81,7 +90,7 @@ AC_DEFUN([OMPI_SETUP_PRRTE],[
8190
opal_prrte_prefix_arg=
8291
fi
8392

84-
opal_prrte_args="--prefix=$prefix --disable-dlopen $opal_prrte_prefix_arg $opal_prrte_libevent_arg $opal_prrte_hwloc_arg $opal_prrte_pmix_arg"
93+
opal_prrte_args="--prefix=$prefix $opal_prrte_prefix_arg $opal_prrte_libevent_arg $opal_prrte_hwloc_arg $opal_prrte_pmix_arg"
8594
AS_IF([test "$enable_debug" = "yes"],
8695
[opal_prrte_args="--enable-debug $opal_prrte_args"
8796
CFLAGS="$OPAL_CFLAGS_BEFORE_PICKY $OPAL_VISIBILITY_CFLAGS -g"],

config/opal_check_pmi.m4

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ AC_DEFUN([OPAL_CHECK_PMIX_LIB],[
125125
], [])],
126126
[AC_MSG_RESULT([found])
127127
opal_external_pmix_version=4x
128+
opal_numerical_pmix_version=4
128129
opal_external_pmix_version_found=1
129130
opal_external_pmix_happy=yes],
130131
[AC_MSG_RESULT([not found])])])
@@ -139,6 +140,7 @@ AC_DEFUN([OPAL_CHECK_PMIX_LIB],[
139140
], [])],
140141
[AC_MSG_RESULT([found])
141142
opal_external_pmix_version=3x
143+
opal_numerical_pmix_version=3
142144
opal_external_pmix_version_found=1
143145
opal_external_pmix_happy=yes],
144146
[AC_MSG_RESULT([not found])])])
@@ -153,6 +155,7 @@ AC_DEFUN([OPAL_CHECK_PMIX_LIB],[
153155
], [])],
154156
[AC_MSG_RESULT([found])
155157
opal_external_pmix_version=2x
158+
opal_numerical_pmix_version=2
156159
opal_external_pmix_version_found=1
157160
opal_external_pmix_happy=yes],
158161
[AC_MSG_RESULT([not found])])])
@@ -167,6 +170,7 @@ AC_DEFUN([OPAL_CHECK_PMIX_LIB],[
167170
], [])],
168171
[AC_MSG_RESULT([found])
169172
opal_external_pmix_version=1x
173+
opal_numerical_pmix_version=1
170174
opal_external_pmix_version_found=1
171175
opal_external_have_pmix1=1
172176
opal_external_pmix_happy=yes],
@@ -179,6 +183,12 @@ AC_DEFUN([OPAL_CHECK_PMIX_LIB],[
179183
opal_external_pmix_happy=no])
180184
181185
])
186+
AS_IF([test "$opal_external_pmix_happy" = "yes" && test $opal_numerical_pmix_version -lt 3],
187+
[AC_MSG_WARN([OMPI no longer supports PMIx versions prior to v3])
188+
AC_MSG_WARN([Please direct us to a more current PMIx release or])
189+
AC_MSG_WARN([use the internally provided one])
190+
AC_MSG_ERROR([Cannot continue])])
191+
182192
AS_IF([test "$opal_external_pmix_happy" = "yes"],
183193
[$3
184194
# add the new flags to our wrapper compilers

ompi/interlib/interlib.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,10 @@ int ompi_interlib_declare(int threadlevel, char *version)
121121
PMIX_INFO_DESTRUCT(&info[3]);
122122
/* account for our refcount on pmix_init */
123123
PMIx_Finalize(NULL, 0);
124-
ret = opal_pmix_convert_status(rc);
124+
if (ompi_singleton && PMIX_ERR_UNREACH == rc) {
125+
ret = OMPI_SUCCESS;
126+
} else {
127+
ret = opal_pmix_convert_status(rc);
128+
}
125129
return ret;
126130
}

ompi/runtime/ompi_mpi_abort.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,8 @@ static void try_kill_peers(ompi_communicator_t *comm,
8585

8686
procs = (ompi_process_name_t*) calloc(nprocs, sizeof(ompi_process_name_t));
8787
if (NULL == procs) {
88-
/* quick clean orte and get out */
89-
ompi_rte_abort(errno, "Abort: unable to alloc memory to kill procs");
88+
/* quick clean RTE and get out */
89+
ompi_rte_abort(errcode, "Abort: unable to alloc memory to kill procs");
9090
}
9191

9292
/* put all the local group procs in the abort list */

ompi/runtime/ompi_rte.c

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,27 @@
6161
opal_process_name_t pmix_name_wildcard = {UINT32_MAX-1, UINT32_MAX-1};
6262
opal_process_name_t pmix_name_invalid = {UINT32_MAX, UINT32_MAX};
6363
hwloc_cpuset_t ompi_proc_applied_binding = NULL;
64-
pmix_process_info_t pmix_process_info = {0};
64+
pmix_process_info_t pmix_process_info = {
65+
.my_name = {OPAL_JOBID_INVALID, OPAL_VPID_INVALID},
66+
.nodename = NULL,
67+
.pid = 0,
68+
.top_session_dir = NULL,
69+
.job_session_dir = NULL,
70+
.proc_session_dir = NULL,
71+
.my_local_rank = 0,
72+
.my_node_rank = 0,
73+
.num_local_peers = 0,
74+
.num_procs = 0,
75+
.app_num = 0,
76+
.univ_size = 0,
77+
.app_sizes = NULL,
78+
.app_ldrs = NULL,
79+
.cpuset = NULL,
80+
.command = NULL,
81+
.num_apps = 0,
82+
.initial_wdir = NULL,
83+
.reincarnation = 0
84+
};
6585
bool pmix_proc_is_bound = false;
6686
bool ompi_singleton = false;
6787

opal/runtime/opal_init.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
#include "opal/mca/installdirs/base/base.h"
5555
#include "opal/mca/memory/base/base.h"
5656
#include "opal/mca/patcher/base/base.h"
57+
#include "opal/mca/pmix/base/base.h"
5758
#include "opal/mca/memcpy/base/base.h"
5859
#include "opal/mca/hwloc/base/base.h"
5960
#include "opal/mca/reachable/base/base.h"
@@ -630,7 +631,7 @@ opal_init_util(int* pargc, char*** pargv)
630631
static mca_base_framework_t *opal_init_frameworks[] = {
631632
&opal_hwloc_base_framework, &opal_memcpy_base_framework, &opal_memchecker_base_framework,
632633
&opal_backtrace_base_framework, &opal_timer_base_framework, &opal_event_base_framework,
633-
&opal_shmem_base_framework, &opal_reachable_base_framework,
634+
&opal_shmem_base_framework, &opal_reachable_base_framework, &opal_pmix_base_framework,
634635
NULL,
635636
};
636637

0 commit comments

Comments
 (0)