Skip to content

Commit e4248a7

Browse files
committed
Merge pull request open-mpi#609 from rhc54/cmr2.0/pmix
Sync to PMIx master
2 parents 1b6740a + fb5907b commit e4248a7

File tree

17 files changed

+498
-123
lines changed

17 files changed

+498
-123
lines changed

ompi/dpm/dpm.c

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,13 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
139139
opal_argv_append_nosize(&members, nstring);
140140
free(nstring);
141141
/* have to add the number of procs in the job so the remote side
142-
* can correctly add the procs by computing their names */
142+
* can correctly add the procs by computing their names, and our nspace
143+
* so they can update their records */
144+
if (NULL == (nstring = (char*)opal_pmix.get_nspace(OMPI_PROC_MY_NAME->jobid))) {
145+
opal_argv_free(members);
146+
return OMPI_ERR_NOT_SUPPORTED;
147+
}
148+
opal_argv_append_nosize(&members, nstring);
143149
(void)asprintf(&nstring, "%d", size);
144150
opal_argv_append_nosize(&members, nstring);
145151
free(nstring);
@@ -171,6 +177,11 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
171177
}
172178
opal_argv_append_nosize(&members, nstring);
173179
free(nstring);
180+
if (NULL == (nstring = (char*)opal_pmix.get_nspace(proc_list[i]->super.proc_name.jobid))) {
181+
opal_argv_free(members);
182+
return OMPI_ERR_NOT_SUPPORTED;
183+
}
184+
opal_argv_append_nosize(&members, nstring);
174185
}
175186
if (!dense) {
176187
free(proc_list);
@@ -246,6 +257,17 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
246257
OPAL_LIST_DESTRUCT(&mlist);
247258
goto exit;
248259
}
260+
/* step over the nspace */
261+
++i;
262+
if (NULL == members[i]) {
263+
/* this shouldn't happen and is an error */
264+
OMPI_ERROR_LOG(OMPI_ERR_BAD_PARAM);
265+
OPAL_LIST_DESTRUCT(&mlist);
266+
opal_argv_free(members);
267+
free(rport);
268+
rc = OMPI_ERR_BAD_PARAM;
269+
goto exit;
270+
}
249271
/* if the rank is wildcard, then we need to add all procs
250272
* in that job to the list */
251273
if (OPAL_VPID_WILDCARD == nm->name.vpid) {
@@ -295,6 +317,16 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
295317
OPAL_LIST_DESTRUCT(&rlist);
296318
goto exit;
297319
}
320+
/* next entry is the nspace - register it */
321+
++i;
322+
if (NULL == members[i]) {
323+
OMPI_ERROR_LOG(OMPI_ERR_NOT_SUPPORTED);
324+
opal_argv_free(members);
325+
OPAL_LIST_DESTRUCT(&ilist);
326+
OPAL_LIST_DESTRUCT(&rlist);
327+
goto exit;
328+
}
329+
opal_pmix.register_jobid(nm->name.jobid, members[i]);
298330
if (OPAL_VPID_WILDCARD == nm->name.vpid) {
299331
jobid = nm->name.jobid;
300332
OBJ_RELEASE(nm);

opal/mca/pmix/cray/pmix_cray.c

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,9 @@ static int cray_unpublish_nb(char **keys, opal_list_t *info,
7878
static const char *cray_get_version(void);
7979
static int cray_store_local(const opal_process_name_t *proc,
8080
opal_value_t *val);
81+
static const char *cray_get_nspace(opal_jobid_t jobid);
82+
static void cray_register_jobid(opal_jobid_t jobid, const char *nspace);
83+
8184
#if 0
8285
static bool cray_get_attr(const char *attr, opal_value_t **kv);
8386
#endif
@@ -109,7 +112,9 @@ const opal_pmix_base_module_t opal_pmix_cray_module = {
109112
.get_version = cray_get_version,
110113
.register_errhandler = opal_pmix_base_register_handler,
111114
.deregister_errhandler = opal_pmix_base_deregister_handler,
112-
.store_local = cray_store_local
115+
.store_local = cray_store_local,
116+
.get_nspace = cray_get_nspace,
117+
.register_jobid = cray_register_jobid
113118
};
114119

115120
// usage accounting
@@ -814,6 +819,16 @@ static int cray_store_local(const opal_process_name_t *proc,
814819
return OPAL_SUCCESS;
815820
}
816821

822+
static const char *cray_get_nspace(opal_jobid_t jobid)
823+
{
824+
return NULL;
825+
}
826+
827+
static void cray_register_jobid(opal_jobid_t jobid, const char *nspace)
828+
{
829+
return;
830+
}
831+
817832
static char* pmix_error(int pmix_err)
818833
{
819834
char * err_msg;

opal/mca/pmix/pmix.h

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,9 @@ extern int opal_pmix_base_exchange(opal_value_t *info,
6262
OPAL_ERROR_LOG((r)); \
6363
} \
6464
} \
65-
/* do not destruct the keyval as we don't own */ \
66-
/* the data - the caller will take care of the */ \
67-
/* key and value storage, and the kv itself has none */ \
65+
/* opal_value_load makes a copy of the data, so release it */ \
66+
_kv.key = NULL; \
67+
OBJ_DESTRUCT(&(_kv)); \
6868
} while(0);
6969

7070
/**
@@ -701,6 +701,12 @@ typedef void (*opal_pmix_base_module_deregister_fn_t)(void);
701701
typedef int (*opal_pmix_base_module_store_fn_t)(const opal_process_name_t *proc,
702702
opal_value_t *val);
703703

704+
/* retrieve the nspace corresponding to a given jobid */
705+
typedef const char* (*opal_pmix_base_module_get_nspace_fn_t)(opal_jobid_t jobid);
706+
707+
/* register a jobid-to-nspace pair */
708+
typedef void (*opal_pmix_base_module_register_jobid_fn_t)(opal_jobid_t jobid, const char *nspace);
709+
704710
/*
705711
* the standard public API data structure
706712
*/
@@ -745,6 +751,8 @@ typedef struct {
745751
opal_pmix_base_module_register_fn_t register_errhandler;
746752
opal_pmix_base_module_deregister_fn_t deregister_errhandler;
747753
opal_pmix_base_module_store_fn_t store_local;
754+
opal_pmix_base_module_get_nspace_fn_t get_nspace;
755+
opal_pmix_base_module_register_jobid_fn_t register_jobid;
748756
} opal_pmix_base_module_t;
749757

750758
typedef struct {

opal/mca/pmix/pmix1xx/pmix/VERSION

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ greek=a1
3030
# command, or with the date (if "git describe" fails) in the form of
3131
# "date<date>".
3232

33-
repo_rev=git89680d6
33+
repo_rev=git0a8e0d9
3434

3535
# If tarball_version is not empty, it is used as the version string in
3636
# the tarball filename, regardless of all other versions listed in
@@ -44,7 +44,7 @@ tarball_version=
4444

4545
# The date when this release was created
4646

47-
date="Sep 10, 2015"
47+
date="Sep 23, 2015"
4848

4949
# The shared library version of each of PMIx's public libraries.
5050
# These versions are maintained in accordance with the "Library

opal/mca/pmix/pmix1xx/pmix/src/client/pmix_client.c

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -779,21 +779,28 @@ static pmix_status_t recv_connect_ack(int sd)
779779
pmix_status_t rc;
780780
struct timeval tv, save;
781781
pmix_socklen_t sz;
782+
bool sockopt = true;
782783

783784
pmix_output_verbose(2, pmix_globals.debug_output,
784785
"pmix: RECV CONNECT ACK FROM SERVER");
785786

786787
/* get the current timeout value so we can reset to it */
787788
sz = sizeof(save);
788789
if (0 != getsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, (void*)&save, &sz)) {
789-
return PMIX_ERR_UNREACH;
790-
}
791-
792-
/* set a timeout on the blocking recv so we don't hang */
793-
tv.tv_sec = 2;
794-
tv.tv_usec = 0;
795-
if (0 != setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv))) {
796-
return PMIX_ERR_UNREACH;
790+
if (ENOPROTOOPT == errno) {
791+
sockopt = false;
792+
} else {
793+
return PMIX_ERR_UNREACH;
794+
}
795+
} else {
796+
/* set a timeout on the blocking recv so we don't hang */
797+
tv.tv_sec = 2;
798+
tv.tv_usec = 0;
799+
if (0 != setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv))) {
800+
pmix_output_verbose(2, pmix_globals.debug_output,
801+
"pmix: recv_connect_ack could not setsockopt SO_RCVTIMEO");
802+
return PMIX_ERR_UNREACH;
803+
}
797804
}
798805

799806
/* receive the status reply */
@@ -825,9 +832,11 @@ static pmix_status_t recv_connect_ack(int sd)
825832
return rc;
826833
}
827834

828-
/* return the socket to normal */
829-
if (0 != setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &save, sz)) {
830-
return PMIX_ERR_UNREACH;
835+
if (sockopt) {
836+
/* return the socket to normal */
837+
if (0 != setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &save, sz)) {
838+
return PMIX_ERR_UNREACH;
839+
}
831840
}
832841

833842
return PMIX_SUCCESS;

opal/mca/pmix/pmix1xx/pmix/src/server/pmix_server.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,12 +102,19 @@ PMIX_CLASS_INSTANCE(pmix_usock_queue_t,
102102
static void scon(pmix_shift_caddy_t *p)
103103
{
104104
p->active = false;
105+
p->kv = NULL;
105106
p->relfn = NULL;
106107
p->relcbd = NULL;
107108
}
109+
static void scdes(pmix_shift_caddy_t *p)
110+
{
111+
if (NULL != p->kv) {
112+
PMIX_RELEASE(p->kv);
113+
}
114+
}
108115
PMIX_CLASS_INSTANCE(pmix_shift_caddy_t,
109116
pmix_object_t,
110-
scon, NULL);
117+
scon, scdes);
111118

112119

113120
#define PMIX_THREADSHIFT(r, c) \

opal/mca/pmix/pmix1xx/pmix1.h

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,24 @@
3030

3131
BEGIN_C_DECLS
3232

33-
OPAL_DECLSPEC extern opal_pmix_base_component_t mca_pmix_pmix1_component;
33+
typedef struct {
34+
opal_pmix_base_component_t super;
35+
opal_list_t jobids;
36+
bool native_launch;
37+
} mca_pmix_pmix1_component_t;
38+
39+
OPAL_DECLSPEC extern mca_pmix_pmix1_component_t mca_pmix_pmix1xx_component;
3440

3541
OPAL_DECLSPEC extern const opal_pmix_base_module_t opal_pmix_pmix1xx_module;
3642

3743
/**** INTERNAL OBJECTS ****/
44+
typedef struct {
45+
opal_list_item_t super;
46+
opal_jobid_t jobid;
47+
char nspace[PMIX_MAX_NSLEN + 1];
48+
} opal_pmix1_jobid_trkr_t;
49+
OBJ_CLASS_DECLARATION(opal_pmix1_jobid_trkr_t);
50+
3851
typedef struct {
3952
opal_object_t super;
4053
pmix_proc_t p;

0 commit comments

Comments
 (0)