Skip to content

Commit 1cf972d

Browse files
committed
Update PMIx and PRRTE
Deprecate the --am and --amca options.

Avoid default param files on backend nodes: any parameters in the PRRTE default or user param files will have been picked up by prte and included in the environment sent to the prted, so don't open those files on the backend.

Avoid picking up MCA param file info on the backend: avoid the scaling problem at PRRTE startup by only reading the system and user param files on the frontend.

Complete revisions to the cmd line parser for OMPI. Per specification, enforce the following precedence order:

1. system-level default parameter file
2. user-level default parameter file
3. anything found in the environment
4. "--tune" files. Note that "--amca" goes away and becomes equivalent to "--tune". It is okay if it is provided more than once on a cmd line (we will aggregate the list of files, retaining order), but it is an error if a parameter is referenced in more than one file with a different value
5. "--mca" options. Again, it is an error if the same option appears more than once with a different value. Allowed to override a parameter referenced in a "tune" file
6. "-x" options. Allowed to overwrite options given in a "tune" file, but cannot conflict with an explicit "--mca" option
7. all other options

Fix special handling of "-np".

Get agreement on jobid across the layers: all three pieces (PRRTE, PMIx, and OPAL) need to agree on the method used to convert an nspace to a jobid.

Ensure prte show_help messages get output.

Print abnormal termination messages.

Clean up error reporting in persistent operations.

Signed-off-by: Ralph Castain <[email protected]>
Signed-off-by: Ralph Castain <[email protected]>
1 parent f9575ed commit 1cf972d

File tree

4 files changed

+57
-29
lines changed

4 files changed

+57
-29
lines changed

opal/mca/pmix/base/pmix_base_fns.c

Lines changed: 49 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -110,48 +110,70 @@ int opal_pmix_convert_jobid(pmix_nspace_t nspace, opal_jobid_t jobid)
110110
/* zero out the nspace */
111111
PMIX_LOAD_NSPACE(nspace, NULL);
112112

113-
if (opal_process_info.nativelaunch) {
114-
opal_snprintf_jobid(nspace, PMIX_MAX_NSLEN, jobid);
115-
return OPAL_SUCCESS;
116-
} else {
117-
/* cycle across our list of known jobids */
118-
OPAL_LIST_FOREACH(nptr, &localnspaces, opal_nptr_t) {
119-
if (jobid == nptr->jobid) {
120-
PMIX_LOAD_NSPACE(nspace, nptr->nspace);
121-
return OPAL_SUCCESS;
122-
}
113+
/* cycle across our list of known jobids */
114+
OPAL_LIST_FOREACH(nptr, &localnspaces, opal_nptr_t) {
115+
if (jobid == nptr->jobid) {
116+
PMIX_LOAD_NSPACE(nspace, nptr->nspace);
117+
return OPAL_SUCCESS;
123118
}
124119
}
120+
125121
return OPAL_ERR_NOT_FOUND;
126122
}
127123

128124
int opal_pmix_convert_nspace(opal_jobid_t *jobid, pmix_nspace_t nspace)
129125
{
130126
opal_nptr_t *nptr;
131127
opal_jobid_t jid;
128+
uint16_t jobfam;
129+
uint32_t hash32, localjob = 0;
130+
char *p = NULL;
132131

133132
/* set a default */
134133
*jobid = OPAL_JOBID_INVALID;
135134

136-
if (opal_process_info.nativelaunch) {
137-
return opal_convert_string_to_jobid(jobid, nspace);
138-
} else {
139-
/* cycle across our list of known jobids */
140-
OPAL_LIST_FOREACH(nptr, &localnspaces, opal_nptr_t) {
141-
if (PMIX_CHECK_NSPACE(nspace, nptr->nspace)) {
142-
*jobid = nptr->jobid;
143-
return OPAL_SUCCESS;
144-
}
135+
/* if the nspace is empty, there is nothing more to do */
136+
if (0 == strlen(nspace)) {
137+
return OPAL_SUCCESS;
138+
}
139+
if (NULL != strstr(nspace, "JOBID_WILDCARD")) {
140+
*jobid = OPAL_JOBID_WILDCARD;
141+
return OPAL_SUCCESS;
142+
}
143+
if (NULL != strstr(nspace, "JOBID_INVALID")) {
144+
*jobid = OPAL_JOBID_INVALID;
145+
return OPAL_SUCCESS;
146+
}
147+
148+
/* cycle across our list of known jobids */
149+
OPAL_LIST_FOREACH(nptr, &localnspaces, opal_nptr_t) {
150+
if (PMIX_CHECK_NSPACE(nspace, nptr->nspace)) {
151+
*jobid = nptr->jobid;
152+
return OPAL_SUCCESS;
145153
}
146-
/* if we get here, we don't know this nspace */
147-
OPAL_HASH_STR(nspace, jid);
148-
jid &= ~(0x8000);
149-
*jobid = jid;
150-
nptr = OBJ_NEW(opal_nptr_t);
151-
nptr->jobid = jid;
152-
PMIX_LOAD_NSPACE(nptr->nspace, nspace);
153-
opal_list_append(&localnspaces, &nptr->super);
154154
}
155+
/* if we get here, we don't know this nspace */
156+
/* find the "." at the end that indicates the child job */
157+
if (NULL != (p = strrchr(nspace, '.'))) {
158+
*p = '\0';
159+
}
160+
OPAL_HASH_STR(nspace, hash32);
161+
if (NULL != p) {
162+
*p = '.';
163+
++p;
164+
localjob = strtoul(p, NULL, 10);
165+
}
166+
167+
/* now compress to 16-bits */
168+
jobfam = (uint16_t)(((0x0000ffff & (0xffff0000 & hash32) >> 16)) ^ (0x0000ffff & hash32));
169+
jid = (0xffff0000 & ((uint32_t)jobfam << 16)) | (0x0000ffff & localjob);
170+
*jobid = jid;
171+
/* save this jobid/nspace pair */
172+
nptr = OBJ_NEW(opal_nptr_t);
173+
nptr->jobid = jid;
174+
PMIX_LOAD_NSPACE(nptr->nspace, nspace);
175+
opal_list_append(&localnspaces, &nptr->super);
176+
155177
return OPAL_SUCCESS;
156178
}
157179

opal/mca/pmix/pmix-internal.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -595,9 +595,11 @@ OPAL_DECLSPEC int opal_pmix_convert_nspace(opal_jobid_t *jobid, pmix_nspace_t ns
595595
OPAL_DECLSPEC void opal_pmix_setup_nspace_tracker(void);
596596
OPAL_DECLSPEC void opal_pmix_finalize_nspace_tracker(void);
597597

598+
/* convert jobid to nspace */
598599
#define OPAL_PMIX_CONVERT_JOBID(n, j) \
599600
opal_pmix_convert_jobid((n), (j))
600601

602+
/* convert vpid to rank */
601603
#define OPAL_PMIX_CONVERT_VPID(r, v) \
602604
do { \
603605
if (OPAL_VPID_WILDCARD == (v)) { \
@@ -607,16 +609,19 @@ OPAL_DECLSPEC void opal_pmix_finalize_nspace_tracker(void);
607609
} \
608610
} while(0)
609611

612+
/* convert opal_process_name_t to pmix_proc_t */
610613
#define OPAL_PMIX_CONVERT_NAME(p, n) \
611614
do { \
612615
OPAL_PMIX_CONVERT_JOBID((p)->nspace, (n)->jobid); \
613616
OPAL_PMIX_CONVERT_VPID((p)->rank, (n)->vpid); \
614617
} while(0)
615618

616619

620+
/* convert nspace to jobid */
617621
#define OPAL_PMIX_CONVERT_NSPACE(r, j, n) \
618622
(r) = opal_pmix_convert_nspace((j), (n))
619623

624+
/* convert pmix rank to opal vpid */
620625
#define OPAL_PMIX_CONVERT_RANK(v, r) \
621626
do { \
622627
if (PMIX_RANK_WILDCARD == (r)) { \
@@ -628,6 +633,7 @@ OPAL_DECLSPEC void opal_pmix_finalize_nspace_tracker(void);
628633
} \
629634
} while(0)
630635

636+
/* convert pmix_proc_t to opal_process_name_t */
631637
#define OPAL_PMIX_CONVERT_PROCT(r, n, p) \
632638
do { \
633639
OPAL_PMIX_CONVERT_NSPACE((r), &(n)->jobid, (p)->nspace); \

0 commit comments

Comments
 (0)