Skip to content

Commit fb5907b

Browse files
author
Ralph Castain
committed
Clean up the code a bit by simply adding our nspace to the top of the list of jobid <-> nspace correlations. Add two new APIs to opal_pmix, register_jobid and get_nspace, for registering new jobid/nspace pairs and retrieving an nspace given a jobid - these are required to support connect/accept. No impact on the PMIx library.
(cherry picked from commit a4a3dfd)
1 parent ca60b28 commit fb5907b

File tree

9 files changed

+349
-199
lines changed

9 files changed

+349
-199
lines changed

ompi/dpm/dpm.c

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,13 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
139139
opal_argv_append_nosize(&members, nstring);
140140
free(nstring);
141141
/* have to add the number of procs in the job so the remote side
142-
* can correctly add the procs by computing their names */
142+
* can correctly add the procs by computing their names, and our nspace
143+
* so they can update their records */
144+
if (NULL == (nstring = (char*)opal_pmix.get_nspace(OMPI_PROC_MY_NAME->jobid))) {
145+
opal_argv_free(members);
146+
return OMPI_ERR_NOT_SUPPORTED;
147+
}
148+
opal_argv_append_nosize(&members, nstring);
143149
(void)asprintf(&nstring, "%d", size);
144150
opal_argv_append_nosize(&members, nstring);
145151
free(nstring);
@@ -171,6 +177,11 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
171177
}
172178
opal_argv_append_nosize(&members, nstring);
173179
free(nstring);
180+
if (NULL == (nstring = (char*)opal_pmix.get_nspace(proc_list[i]->super.proc_name.jobid))) {
181+
opal_argv_free(members);
182+
return OMPI_ERR_NOT_SUPPORTED;
183+
}
184+
opal_argv_append_nosize(&members, nstring);
174185
}
175186
if (!dense) {
176187
free(proc_list);
@@ -246,6 +257,17 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
246257
OPAL_LIST_DESTRUCT(&mlist);
247258
goto exit;
248259
}
260+
/* step over the nspace */
261+
++i;
262+
if (NULL == members[i]) {
263+
/* this shouldn't happen and is an error */
264+
OMPI_ERROR_LOG(OMPI_ERR_BAD_PARAM);
265+
OPAL_LIST_DESTRUCT(&mlist);
266+
opal_argv_free(members);
267+
free(rport);
268+
rc = OMPI_ERR_BAD_PARAM;
269+
goto exit;
270+
}
249271
/* if the rank is wildcard, then we need to add all procs
250272
* in that job to the list */
251273
if (OPAL_VPID_WILDCARD == nm->name.vpid) {
@@ -295,6 +317,16 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
295317
OPAL_LIST_DESTRUCT(&rlist);
296318
goto exit;
297319
}
320+
/* next entry is the nspace - register it */
321+
++i;
322+
if (NULL == members[i]) {
323+
OMPI_ERROR_LOG(OMPI_ERR_NOT_SUPPORTED);
324+
opal_argv_free(members);
325+
OPAL_LIST_DESTRUCT(&ilist);
326+
OPAL_LIST_DESTRUCT(&rlist);
327+
goto exit;
328+
}
329+
opal_pmix.register_jobid(nm->name.jobid, members[i]);
298330
if (OPAL_VPID_WILDCARD == nm->name.vpid) {
299331
jobid = nm->name.jobid;
300332
OBJ_RELEASE(nm);

opal/mca/pmix/cray/pmix_cray.c

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,9 @@ static int cray_unpublish_nb(char **keys, opal_list_t *info,
7878
static const char *cray_get_version(void);
7979
static int cray_store_local(const opal_process_name_t *proc,
8080
opal_value_t *val);
81+
static const char *cray_get_nspace(opal_jobid_t jobid);
82+
static void cray_register_jobid(opal_jobid_t jobid, const char *nspace);
83+
8184
#if 0
8285
static bool cray_get_attr(const char *attr, opal_value_t **kv);
8386
#endif
@@ -109,7 +112,9 @@ const opal_pmix_base_module_t opal_pmix_cray_module = {
109112
.get_version = cray_get_version,
110113
.register_errhandler = opal_pmix_base_register_handler,
111114
.deregister_errhandler = opal_pmix_base_deregister_handler,
112-
.store_local = cray_store_local
115+
.store_local = cray_store_local,
116+
.get_nspace = cray_get_nspace,
117+
.register_jobid = cray_register_jobid
113118
};
114119

115120
// usage accounting
@@ -814,6 +819,16 @@ static int cray_store_local(const opal_process_name_t *proc,
814819
return OPAL_SUCCESS;
815820
}
816821

822+
static const char *cray_get_nspace(opal_jobid_t jobid)
823+
{
824+
return NULL;
825+
}
826+
827+
static void cray_register_jobid(opal_jobid_t jobid, const char *nspace)
828+
{
829+
return;
830+
}
831+
817832
static char* pmix_error(int pmix_err)
818833
{
819834
char * err_msg;

opal/mca/pmix/pmix.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -701,6 +701,12 @@ typedef void (*opal_pmix_base_module_deregister_fn_t)(void);
701701
typedef int (*opal_pmix_base_module_store_fn_t)(const opal_process_name_t *proc,
702702
opal_value_t *val);
703703

704+
/* retrieve the nspace corresponding to a given jobid */
705+
typedef const char* (*opal_pmix_base_module_get_nspace_fn_t)(opal_jobid_t jobid);
706+
707+
/* register a jobid-to-nspace pair */
708+
typedef void (*opal_pmix_base_module_register_jobid_fn_t)(opal_jobid_t jobid, const char *nspace);
709+
704710
/*
705711
* the standard public API data structure
706712
*/
@@ -745,6 +751,8 @@ typedef struct {
745751
opal_pmix_base_module_register_fn_t register_errhandler;
746752
opal_pmix_base_module_deregister_fn_t deregister_errhandler;
747753
opal_pmix_base_module_store_fn_t store_local;
754+
opal_pmix_base_module_get_nspace_fn_t get_nspace;
755+
opal_pmix_base_module_register_jobid_fn_t register_jobid;
748756
} opal_pmix_base_module_t;
749757

750758
typedef struct {

opal/mca/pmix/pmix1xx/pmix1.h

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,24 @@
3030

3131
BEGIN_C_DECLS
3232

33-
OPAL_DECLSPEC extern opal_pmix_base_component_t mca_pmix_pmix1_component;
33+
typedef struct {
34+
opal_pmix_base_component_t super;
35+
opal_list_t jobids;
36+
bool native_launch;
37+
} mca_pmix_pmix1_component_t;
38+
39+
OPAL_DECLSPEC extern mca_pmix_pmix1_component_t mca_pmix_pmix1xx_component;
3440

3541
OPAL_DECLSPEC extern const opal_pmix_base_module_t opal_pmix_pmix1xx_module;
3642

3743
/**** INTERNAL OBJECTS ****/
44+
typedef struct {
45+
opal_list_item_t super;
46+
opal_jobid_t jobid;
47+
char nspace[PMIX_MAX_NSLEN + 1];
48+
} opal_pmix1_jobid_trkr_t;
49+
OBJ_CLASS_DECLARATION(opal_pmix1_jobid_trkr_t);
50+
3851
typedef struct {
3952
opal_object_t super;
4053
pmix_proc_t p;

0 commit comments

Comments
 (0)