Skip to content

Commit f9643b8

Browse files
authored
Merge pull request #7441 from rhc54/topic/hack
Create a hack to protect against non-integer jobids
2 parents 6d34b06 + 829fd47 commit f9643b8

File tree

6 files changed

+111
-16
lines changed

6 files changed

+111
-16
lines changed

ompi/runtime/ompi_rte.c

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ pmix_process_info_t pmix_process_info = {0};
6565
bool pmix_proc_is_bound = false;
6666
bool ompi_singleton = false;
6767

68+
static pmix_proc_t myprocid;
69+
6870
static bool added_transport_keys = false;
6971
static bool added_num_procs = false;
7072
static bool added_app_ctx = false;
@@ -498,7 +500,7 @@ int ompi_rte_init(int *pargc, char ***pargv)
498500
int ret;
499501
char *error = NULL;
500502
opal_process_name_t pname;
501-
pmix_proc_t myproc, rproc;
503+
pmix_proc_t rproc;
502504
int u32, *u32ptr;
503505
uint16_t u16, *u16ptr;
504506
char **peers=NULL;
@@ -530,24 +532,28 @@ int ompi_rte_init(int *pargc, char ***pargv)
530532
goto error;
531533
}
532534

535+
/* setup our internal nspace hack */
536+
opal_pmix_setup_nspace_tracker();
537+
533538
/* initialize the selected module */
534-
if (!PMIx_Initialized() && (PMIX_SUCCESS != (ret = PMIx_Init(&myproc, NULL, 0)))) {
539+
if (!PMIx_Initialized() && (PMIX_SUCCESS != (ret = PMIx_Init(&myprocid, NULL, 0)))) {
535540
/* we cannot run - this could be due to being direct launched
536541
* without the required PMI support being built, so print
537542
* out a help message indicating it */
538543
opal_show_help("help-mpi-runtime.txt", "no-pmi", true, PMIx_Error_string(ret));
539544
return OPAL_ERR_SILENT;
540545
}
541546

542-
/* setup the process name fields */
543-
OPAL_PMIX_CONVERT_PROCT(rc, &pname, &myproc);
547+
/* setup the process name fields - also registers the new nspace */
548+
OPAL_PMIX_CONVERT_PROCT(rc, &pname, &myprocid);
544549
if (OPAL_SUCCESS != rc) {
545550
return rc;
546551
}
547552
OPAL_PROC_MY_NAME.jobid = pname.jobid;
548553
OPAL_PROC_MY_NAME.vpid = pname.vpid;
549554
pmix_process_info.my_name.jobid = OPAL_PROC_MY_NAME.jobid;
550555
pmix_process_info.my_name.vpid = OPAL_PROC_MY_NAME.vpid;
556+
551557
/* set our hostname */
552558
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, PMIX_HOSTNAME, &OPAL_PROC_MY_NAME,
553559
(char**)&ev1, PMIX_STRING);
@@ -828,6 +834,10 @@ int ompi_rte_finalize(void)
828834
free (pmix_process_info.cpuset);
829835
pmix_process_info.cpuset = NULL;
830836

837+
/* cleanup our internal nspace hack */
838+
opal_pmix_finalize_nspace_tracker();
839+
840+
831841
return OMPI_SUCCESS;
832842
}
833843

opal/mca/pmix/base/pmix_base_fns.c

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,83 @@ int opal_pmix_base_exchange(pmix_info_t *indat,
7777
return opal_pmix_convert_status(rc);
7878
}
7979

80+
/* One entry of the local nspace tracker: pairs a PMIx nspace with the
 * OPAL jobid that stands for it inside this process. */
typedef struct {
81+
opal_list_item_t super;   /* list linkage; entries are appended to localnspaces */
82+
pmix_nspace_t nspace;     /* nspace as loaded by PMIX_LOAD_NSPACE */
83+
opal_jobid_t jobid;       /* jobid paired with that nspace */
84+
} opal_nptr_t;
85+
/* NOTE(review): the two NULL arguments are presumably the constructor and
 * destructor slots of the OPAL class system - confirm against opal_object.h */
static OBJ_CLASS_INSTANCE(opal_nptr_t,
86+
opal_list_item_t,
87+
NULL, NULL);
88+
89+
/* List of opal_nptr_t entries. Constructed in opal_pmix_setup_nspace_tracker(),
 * destructed in opal_pmix_finalize_nspace_tracker(), searched by the two
 * conversion functions and appended to by opal_pmix_convert_nspace(). */
static opal_list_t localnspaces;
90+
91+
/* Prepare the nspace tracker: record whether PRRTE launched this process and
 * construct the (initially empty) localnspaces list. */
void opal_pmix_setup_nspace_tracker(void)
92+
{
93+
/* check if we were launched by PRRTE - only the presence of the
 * PRRTE_LAUNCHED environment variable is tested, not its value */
94+
if (NULL != getenv("PRRTE_LAUNCHED")) {
95+
opal_process_info.nativelaunch = true;
96+
}
97+
98+
OBJ_CONSTRUCT(&localnspaces, opal_list_t);
99+
}
100+
101+
/* Tear down the localnspaces list built by opal_pmix_setup_nspace_tracker().
 * NOTE(review): OPAL_LIST_DESTRUCT presumably also releases every entry still
 * on the list - confirm against opal_list.h */
void opal_pmix_finalize_nspace_tracker(void)
102+
{
103+
OPAL_LIST_DESTRUCT(&localnspaces);
104+
}
105+
106+
/* Convert an OPAL jobid into a PMIx nspace.
 *
 * nspace  output; cleared first, then filled on success
 * jobid   jobid to convert
 *
 * Returns OPAL_SUCCESS when the nspace was filled in, or OPAL_ERR_NOT_FOUND
 * when the process was not natively launched and no entry of localnspaces
 * carries this jobid. This function only looks entries up; it never adds one. */
int opal_pmix_convert_jobid(pmix_nspace_t nspace, opal_jobid_t jobid)
107+
{
108+
opal_nptr_t *nptr;
109+
110+
/* zero out the nspace so it is in a known state on every return path */
111+
PMIX_LOAD_NSPACE(nspace, NULL);
112+
113+
if (opal_process_info.nativelaunch) {
114+
/* native launch: format the jobid straight into the nspace; the
 * return value of opal_snprintf_jobid is not checked here */
opal_snprintf_jobid(nspace, PMIX_MAX_NSLEN, jobid);
115+
return OPAL_SUCCESS;
116+
} else {
117+
/* cycle across our list of known jobids; the first match wins */
118+
OPAL_LIST_FOREACH(nptr, &localnspaces, opal_nptr_t) {
119+
if (jobid == nptr->jobid) {
120+
PMIX_LOAD_NSPACE(nspace, nptr->nspace);
121+
return OPAL_SUCCESS;
122+
}
123+
}
124+
}
125+
return OPAL_ERR_NOT_FOUND;
126+
}
127+
128+
/* Convert a PMIx nspace into an OPAL jobid, registering the nspace in
 * localnspaces if it has not been seen before.
 *
 * jobid   output; set to OPAL_JOBID_INVALID before any conversion is tried
 * nspace  nspace to convert
 *
 * Native launch: returns whatever opal_convert_string_to_jobid() returns.
 * Otherwise: returns OPAL_SUCCESS, either with the jobid of the matching
 * entry or with a jobid hashed from the nspace string and newly recorded. */
int opal_pmix_convert_nspace(opal_jobid_t *jobid, pmix_nspace_t nspace)
129+
{
130+
opal_nptr_t *nptr;
131+
opal_jobid_t jid;
132+
133+
/* set a default so the caller sees an invalid jobid if conversion fails */
134+
*jobid = OPAL_JOBID_INVALID;
135+
136+
if (opal_process_info.nativelaunch) {
137+
return opal_convert_string_to_jobid(jobid, nspace);
138+
} else {
139+
/* cycle across our list of known jobids */
140+
OPAL_LIST_FOREACH(nptr, &localnspaces, opal_nptr_t) {
141+
if (PMIX_CHECK_NSPACE(nspace, nptr->nspace)) {
142+
*jobid = nptr->jobid;
143+
return OPAL_SUCCESS;
144+
}
145+
}
146+
/* if we get here, we don't know this nspace - derive a jobid by
 * hashing the string and remember the pairing.
 * NOTE(review): nothing here checks whether the hashed jid is already
 * paired with a different nspace; opal_pmix_convert_jobid() returns
 * the first entry whose jobid matches. */
147+
OPAL_HASH_STR(nspace, jid);
148+
*jobid = jid;
149+
/* NOTE(review): the OBJ_NEW result is dereferenced below without a
 * NULL check */
nptr = OBJ_NEW(opal_nptr_t);
150+
nptr->jobid = jid;
151+
PMIX_LOAD_NSPACE(nptr->nspace, nspace);
152+
opal_list_append(&localnspaces, &nptr->super);
153+
}
154+
return OPAL_SUCCESS;
155+
}
156+
80157
pmix_status_t opal_pmix_convert_rc(int rc)
81158
{
82159
switch (rc) {

opal/mca/pmix/pmix-internal.h

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
22
/*
3-
* Copyright (c) 2014-2019 Intel, Inc. All rights reserved.
3+
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
44
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
55
* reserved.
66
* Copyright (c) 2019 Research Organization for Information Science
@@ -442,9 +442,13 @@ OPAL_DECLSPEC pmix_proc_state_t opal_pmix_convert_state(int state);
442442
OPAL_DECLSPEC int opal_pmix_convert_pstate(pmix_proc_state_t);
443443
OPAL_DECLSPEC pmix_status_t opal_pmix_convert_rc(int rc);
444444
OPAL_DECLSPEC int opal_pmix_convert_status(pmix_status_t status);
445+
OPAL_DECLSPEC int opal_pmix_convert_jobid(pmix_nspace_t nspace, opal_jobid_t jobid);
446+
OPAL_DECLSPEC int opal_pmix_convert_nspace(opal_jobid_t *jobid, pmix_nspace_t nspace);
447+
OPAL_DECLSPEC void opal_pmix_setup_nspace_tracker(void);
448+
OPAL_DECLSPEC void opal_pmix_finalize_nspace_tracker(void);
445449

446450
#define OPAL_PMIX_CONVERT_JOBID(n, j) \
447-
(void)opal_snprintf_jobid((n), PMIX_MAX_NSLEN, (j))
451+
opal_pmix_convert_jobid((n), (j))
448452

449453
#define OPAL_PMIX_CONVERT_VPID(r, v) \
450454
do { \
@@ -454,6 +458,7 @@ OPAL_DECLSPEC int opal_pmix_convert_status(pmix_status_t status);
454458
(r) = (v); \
455459
} \
456460
} while(0)
461+
457462
#define OPAL_PMIX_CONVERT_NAME(p, n) \
458463
do { \
459464
OPAL_PMIX_CONVERT_JOBID((p)->nspace, (n)->jobid); \
@@ -462,15 +467,17 @@ OPAL_DECLSPEC int opal_pmix_convert_status(pmix_status_t status);
462467

463468

464469
#define OPAL_PMIX_CONVERT_NSPACE(r, j, n) \
465-
(r) = opal_convert_string_to_jobid((j), (n))
470+
(r) = opal_pmix_convert_nspace((j), (n))
466471

467-
#define OPAL_PMIX_CONVERT_RANK(v, r) \
468-
do { \
469-
if (PMIX_RANK_WILDCARD == (r)) { \
470-
(v) = OPAL_VPID_WILDCARD; \
471-
} else { \
472-
(v) = (r); \
473-
} \
472+
#define OPAL_PMIX_CONVERT_RANK(v, r) \
473+
do { \
474+
if (PMIX_RANK_WILDCARD == (r)) { \
475+
(v) = OPAL_VPID_WILDCARD; \
476+
} else if (PMIX_RANK_INVALID == (r)) { \
477+
(v) = OPAL_VPID_INVALID; \
478+
} else { \
479+
(v) = (r); \
480+
} \
474481
} while(0)
475482

476483
#define OPAL_PMIX_CONVERT_PROCT(r, n, p) \

opal/util/proc.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ opal_process_name_t opal_name_wildcard = {OPAL_JOBID_WILDCARD, OPAL_VPID_WILDCAR
2828
opal_process_name_t opal_name_invalid = {OPAL_JOBID_INVALID, OPAL_VPID_INVALID};
2929

3030
opal_process_info_t opal_process_info = {
31+
.nativelaunch = false,
3132
.nodename = NULL,
3233
.top_session_dir = NULL,
3334
.job_session_dir = NULL,

opal/util/proc.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
#include "opal/types.h"
2424
#include "opal/dss/dss.h"
2525

26-
2726
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
2827
#include <arpa/inet.h>
2928
#endif
@@ -105,6 +104,7 @@ typedef struct {
105104
OBJ_CLASS_DECLARATION(opal_namelist_t);
106105

107106
typedef struct opal_process_info_t {
107+
bool nativelaunch; /**< launched by mpirun */
108108
char *nodename; /**< string name for this node */
109109
char *top_session_dir; /**< Top-level session directory */
110110
char *job_session_dir; /**< Session directory for job */

prrte

Submodule prrte updated 173 files

0 commit comments

Comments
 (0)