Skip to content

Commit f4411c4

Browse files
author
Ralph Castain
committed
Enable use of OFI fabrics for launch and other collective operations. Update the PMIx repo to the latest master to get the required support for the server to "push" modex info, and to retrieve all its own "modex" values for sending back to mpirun. Have mpirun cache them in its local modex hash as OFI goes point-to-point direct and doesn't route - so the remote daemons don't need a copy of this connection info.
Remove the opal_ignore from the RML/OFI component, but disable that component unless the user specifically requests it via the "rml_ofi_desired=1" MCA param. This will let us test compile in various environments without interfering with operations while we continue to debug Fix an error when computing the number of infos during server init Signed-off-by: Ralph Castain <[email protected]>
1 parent 7b0653a commit f4411c4

File tree

18 files changed

+316
-358
lines changed

18 files changed

+316
-358
lines changed

opal/mca/pmix/pmix2x/pmix/include/pmix_common.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,8 @@ typedef uint32_t pmix_rank_t;
124124
#define PMIX_CONNECT_SYSTEM_FIRST "pmix.cnct.sys.first" // (bool) Preferentially look for a system-level PMIx server first
125125
#define PMIX_REGISTER_NODATA "pmix.reg.nodata" // (bool) Registration is for nspace only, do not copy job data
126126
#define PMIX_SERVER_ENABLE_MONITORING "pmix.srv.monitor" // (bool) Enable PMIx internal monitoring by server
127+
#define PMIX_SERVER_NSPACE "pmix.srv.nspace" // (char*) Name of the nspace to use for this server
128+
#define PMIX_SERVER_RANK "pmix.srv.rank" // (pmix_rank_t) Rank of this server
127129

128130

129131
/* identification attributes */

opal/mca/pmix/pmix2x/pmix/src/buffer_ops/copy.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -425,7 +425,7 @@ PMIX_EXPORT pmix_status_t pmix_value_xfer(pmix_value_t *p, pmix_value_t *src)
425425
break;
426426
}
427427
/* allocate space and do the copy */
428-
switch (src->type) {
428+
switch (src->data.darray->type) {
429429
case PMIX_UINT8:
430430
case PMIX_INT8:
431431
case PMIX_BYTE:

opal/mca/pmix/pmix2x/pmix/src/client/pmix_client_get.c

Lines changed: 45 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ PMIX_EXPORT pmix_status_t PMIx_Get(const pmix_proc_t *proc, const char key[],
111111
PMIX_RELEASE(cb);
112112

113113
pmix_output_verbose(2, pmix_globals.debug_output,
114-
"pmix:client get completed");
114+
"pmix:client get completed %d", rc);
115115

116116
return rc;
117117
}
@@ -464,7 +464,7 @@ static pmix_status_t process_val(pmix_value_t *val,
464464
}
465465
nvals = 0;
466466
for (n=0; n < nsize; n++) {
467-
if (PMIX_SUCCESS != (rc = pmix_pointer_array_add(results, &info[n]))) {
467+
if (0 > (rc = pmix_pointer_array_add(results, &info[n]))) {
468468
return rc;
469469
}
470470
++nvals;
@@ -536,25 +536,45 @@ static void _getnbfn(int fd, short flags, void *cbdata)
536536
/* if the rank is WILDCARD, then they want all the job-level info,
537537
* so no need to check the modex */
538538
if (PMIX_RANK_WILDCARD != cb->rank) {
539+
rc = PMIX_ERR_NOT_FOUND;
539540
#if defined(PMIX_ENABLE_DSTORE) && (PMIX_ENABLE_DSTORE == 1)
540-
if (PMIX_SUCCESS == (rc = pmix_dstore_fetch(nptr->nspace, cb->rank, NULL, &val))) {
541-
#else
542-
if (PMIX_SUCCESS == (rc = pmix_hash_fetch(&nptr->modex, cb->rank, NULL, &val))) {
541+
/* my own data is in the hash table, so don't bother looking
542+
* in the dstore if that is what they want */
543+
if (pmix_globals.myid.rank != cb->rank) {
544+
if (PMIX_SUCCESS == (rc = pmix_dstore_fetch(nptr->nspace, cb->rank, NULL, &val))) {
545+
pmix_output_verbose(2, pmix_globals.debug_output,
546+
"pmix_get[%d]: value retrieved from dstore", __LINE__);
547+
if (PMIX_SUCCESS != (rc = process_val(val, &nvals, &results))) {
548+
cb->value_cbfunc(rc, NULL, cb->cbdata);
549+
/* cleanup */
550+
if (NULL != val) {
551+
PMIX_VALUE_RELEASE(val);
552+
}
553+
PMIX_RELEASE(cb);
554+
return;
555+
}
556+
}
557+
}
543558
#endif /* PMIX_ENABLE_DSTORE */
544-
pmix_output_verbose(2, pmix_globals.debug_output,
545-
"pmix_get[%d]: value retrieved from dstore", __LINE__);
546-
if (PMIX_SUCCESS != (rc = process_val(val, &nvals, &results))) {
547-
cb->value_cbfunc(rc, NULL, cb->cbdata);
548-
/* cleanup */
549-
if (NULL != val) {
550-
PMIX_VALUE_RELEASE(val);
559+
if (PMIX_SUCCESS != rc) {
560+
/* if the user was asking about themselves, or we aren't using the dstore,
561+
* then we need to check the hash table */
562+
if (PMIX_SUCCESS == (rc = pmix_hash_fetch(&nptr->modex, cb->rank, NULL, &val))) {
563+
pmix_output_verbose(2, pmix_globals.debug_output,
564+
"pmix_get[%d]: value retrieved from hash", __LINE__);
565+
if (PMIX_SUCCESS != (rc = process_val(val, &nvals, &results))) {
566+
cb->value_cbfunc(rc, NULL, cb->cbdata);
567+
/* cleanup */
568+
if (NULL != val) {
569+
PMIX_VALUE_RELEASE(val);
570+
}
571+
PMIX_RELEASE(cb);
572+
return;
551573
}
552-
PMIX_RELEASE(cb);
553-
return;
574+
PMIX_VALUE_RELEASE(val);
554575
}
555-
/* cleanup */
556-
PMIX_VALUE_RELEASE(val);
557-
} else {
576+
}
577+
if (PMIX_SUCCESS != rc) {
558578
/* if we didn't find a modex for this rank, then we need
559579
* to go get it. Thus, the caller wants -all- information for
560580
* the specified rank, not just the job-level info. */
@@ -572,12 +592,17 @@ static void _getnbfn(int fd, short flags, void *cbdata)
572592
PMIX_RELEASE(cb);
573593
return;
574594
}
575-
/* cleanup */
576595
PMIX_VALUE_RELEASE(val);
577596
}
578597
/* now let's package up the results */
579598
PMIX_VALUE_CREATE(val, 1);
580599
val->type = PMIX_DATA_ARRAY;
600+
val->data.darray = (pmix_data_array_t*)malloc(sizeof(pmix_data_array_t));
601+
if (NULL == val->data.darray) {
602+
PMIX_VALUE_RELEASE(val);
603+
cb->value_cbfunc(PMIX_ERR_NOMEM, NULL, cb->cbdata);
604+
return;
605+
}
581606
val->data.darray->type = PMIX_INFO;
582607
val->data.darray->size = nvals;
583608
PMIX_INFO_CREATE(iptr, nvals);
@@ -597,14 +622,13 @@ static void _getnbfn(int fd, short flags, void *cbdata)
597622
} else {
598623
pmix_value_xfer(&iptr[n].value, &info->value);
599624
}
600-
PMIX_INFO_FREE(info, 1);
625+
PMIX_INFO_DESTRUCT(info);
601626
}
602627
}
603628
/* done with results array */
604629
PMIX_DESTRUCT(&results);
605-
/* return the result to the caller */
630+
/* return the result to the caller - they are responsible for releasing it */
606631
cb->value_cbfunc(PMIX_SUCCESS, val, cb->cbdata);
607-
PMIX_VALUE_FREE(val, 1);
608632
PMIX_RELEASE(cb);
609633
return;
610634
}

opal/mca/pmix/pmix2x/pmix/src/util/hash.c

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,9 @@ pmix_status_t pmix_hash_fetch(pmix_hash_table_t *table, pmix_rank_t rank,
106106
pmix_kval_t *hv;
107107
uint64_t id;
108108
char *node;
109+
pmix_info_t *info;
110+
size_t ninfo, n;
111+
pmix_value_t *val;
109112

110113
pmix_output_verbose(10, pmix_globals.debug_output,
111114
"HASH:FETCH rank %d key %s",
@@ -143,7 +146,36 @@ pmix_status_t pmix_hash_fetch(pmix_hash_table_t *table, pmix_rank_t rank,
143146
if (NULL == key) {
144147
/* we will return the data as an array of pmix_info_t
145148
* in the kvs pmix_value_t */
146-
149+
val = (pmix_value_t*)malloc(sizeof(pmix_value_t));
150+
if (NULL == val) {
151+
return PMIX_ERR_NOMEM;
152+
}
153+
val->type = PMIX_DATA_ARRAY;
154+
val->data.darray = (pmix_data_array_t*)malloc(sizeof(pmix_data_array_t));
155+
if (NULL == val->data.darray) {
156+
PMIX_VALUE_RELEASE(val);
157+
return PMIX_ERR_NOMEM;
158+
}
159+
val->data.darray->type = PMIX_INFO;
160+
val->data.darray->size = 0;
161+
val->data.darray->array = NULL;
162+
ninfo = pmix_list_get_size(&proc_data->data);
163+
PMIX_INFO_CREATE(info, ninfo);
164+
if (NULL == info) {
165+
PMIX_VALUE_RELEASE(val);
166+
return PMIX_ERR_NOMEM;
167+
}
168+
/* copy the list elements */
169+
n=0;
170+
PMIX_LIST_FOREACH(hv, &proc_data->data, pmix_kval_t) {
171+
(void)strncpy(info[n].key, hv->key, PMIX_MAX_KEYLEN);
172+
pmix_value_xfer(&info[n].value, hv->value);
173+
++n;
174+
}
175+
val->data.darray->size = ninfo;
176+
val->data.darray->array = info;
177+
*kvs = val;
178+
return PMIX_SUCCESS;
147179
} else {
148180
/* find the value from within this proc_data object */
149181
hv = lookup_keyval(&proc_data->data, key);

opal/mca/pmix/pmix2x/pmix/test/simple/simpclient.c

Lines changed: 40 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -269,21 +269,51 @@ int main(int argc, char **argv)
269269
PMIX_VALUE_RELEASE(val);
270270
free(tmp);
271271

272-
(void)asprintf(&tmp, "%s-%d-remote-%d", proc.nspace, n, j);
273-
if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, tmp, NULL, 0, &val))) {
274-
/* this data should _not_ be found as we are on the same node
275-
* and the data was "put" with a PMIX_REMOTE scope */
276-
pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned correct", myproc.nspace, myproc.rank, j, tmp);
277-
continue;
272+
if (n != myproc.rank) {
273+
(void)asprintf(&tmp, "%s-%d-remote-%d", proc.nspace, n, j);
274+
if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, tmp, NULL, 0, &val))) {
275+
/* this data should _not_ be found as we are on the same node
276+
* and the data was "put" with a PMIX_REMOTE scope */
277+
pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned correct", myproc.nspace, myproc.rank, j, tmp);
278+
continue;
279+
}
280+
pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned remote data for a local proc",
281+
myproc.nspace, myproc.rank, j, tmp);
282+
PMIX_VALUE_RELEASE(val);
283+
free(tmp);
278284
}
279-
pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned remote data for a local proc",
280-
myproc.nspace, myproc.rank, j, tmp);
281-
PMIX_VALUE_RELEASE(val);
282-
free(tmp);
283285
}
284286
}
285287
}
286288

289+
/* now get the data blob for myself */
290+
pmix_output(0, "Client ns %s rank %d testing internal modex blob",
291+
myproc.nspace, myproc.rank);
292+
if (PMIX_SUCCESS == (rc = PMIx_Get(&myproc, NULL, NULL, 0, &val))) {
293+
if (PMIX_DATA_ARRAY != val->type) {
294+
pmix_output(0, "Client ns %s rank %d did not return an array for its internal modex blob",
295+
myproc.nspace, myproc.rank);
296+
PMIX_VALUE_RELEASE(val);
297+
} else if (PMIX_INFO != val->data.darray->type) {
298+
pmix_output(0, "Client ns %s rank %d returned an internal modex array of type %s instead of PMIX_INFO",
299+
myproc.nspace, myproc.rank, PMIx_Data_type_string(val->data.darray->type));
300+
PMIX_VALUE_RELEASE(val);
301+
} else if (0 == val->data.darray->size) {
302+
pmix_output(0, "Client ns %s rank %d returned an internal modex array of zero length",
303+
myproc.nspace, myproc.rank);
304+
PMIX_VALUE_RELEASE(val);
305+
} else {
306+
pmix_info_t *iptr = (pmix_info_t*)val->data.darray->array;
307+
for (n=0; n < val->data.darray->size; n++) {
308+
pmix_output(0, "\tKey: %s", iptr[n].key);
309+
}
310+
PMIX_VALUE_RELEASE(val);
311+
}
312+
} else {
313+
pmix_output(0, "Client ns %s rank %d internal modex blob FAILED with error %s(%d)",
314+
myproc.nspace, myproc.rank, PMIx_Error_string(rc), rc);
315+
}
316+
287317
/* log something */
288318
PMIX_INFO_CONSTRUCT(&info);
289319
(void)strncpy(info.key, "foobar", PMIX_MAX_KEYLEN);

opal/mca/pmix/pmix2x/pmix/test/test_common.c

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -226,10 +226,7 @@ void parse_cmd(int argc, char **argv, test_params *params)
226226
}
227227

228228
// Fix rank if running under SLURM
229-
#if 0
230-
/* the following "if" statement can never be true as rank is
231-
* an unsigned 32-bit int */
232-
if( 0 > params->rank ){
229+
if( PMIX_RANK_UNDEF == params->rank ){
233230
char *ranklist = getenv("SLURM_GTIDS");
234231
char *rankno = getenv("SLURM_LOCALID");
235232
if( NULL != ranklist && NULL != rankno ){
@@ -246,7 +243,6 @@ void parse_cmd(int argc, char **argv, test_params *params)
246243
pmix_argv_free(argv);
247244
}
248245
}
249-
#endif
250246

251247
// Fix namespace if running under SLURM
252248
if( NULL == params->nspace ){

opal/mca/pmix/pmix2x/pmix2x_server_south.c

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ int pmix2x_server_init(opal_pmix_server_module_t *module,
112112

113113
/* convert the list to an array of pmix_info_t */
114114
if (NULL != info) {
115-
sz = opal_list_get_size(info);
115+
sz = opal_list_get_size(info) + 2;
116116
PMIX_INFO_CREATE(pinfo, sz);
117117
n = 0;
118118
OPAL_LIST_FOREACH(kv, info, opal_value_t) {
@@ -121,8 +121,8 @@ int pmix2x_server_init(opal_pmix_server_module_t *module,
121121
++n;
122122
}
123123
} else {
124-
sz = 0;
125-
pinfo = NULL;
124+
sz = 2;
125+
PMIX_INFO_CREATE(pinfo, 2);
126126
}
127127

128128
/* insert ourselves into our list of jobids - it will be the
@@ -133,6 +133,9 @@ int pmix2x_server_init(opal_pmix_server_module_t *module,
133133
opal_list_append(&mca_pmix_pmix2x_component.jobids, &job->super);
134134
OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);
135135

136+
/* add our nspace and rank to the array going down to the PMIx server */
137+
PMIX_INFO_LOAD(&pinfo[sz-2], PMIX_SERVER_NSPACE, job->nspace, PMIX_STRING);
138+
PMIX_INFO_LOAD(&pinfo[sz-1], PMIX_SERVER_RANK, &OPAL_PROC_MY_NAME.vpid, PMIX_PROC_RANK);
136139
if (PMIX_SUCCESS != (rc = PMIx_server_init(&mymodule, pinfo, sz))) {
137140
PMIX_INFO_FREE(pinfo, sz);
138141
return pmix2x_convert_rc(rc);

opal/mca/pmix/pmix_types.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ BEGIN_C_DECLS
6262
#define OPAL_PMIX_CONNECT_SYSTEM_FIRST "pmix.cnct.sys.first" // (bool) Preferentially look for a system-level PMIx server first
6363
#define OPAL_PMIX_REGISTER_NODATA "pmix.reg.nodata" // (bool) Registration is for nspace only, do not copy job data
6464
#define OPAL_PMIX_SERVER_ENABLE_MONITORING "pmix.srv.monitor" // (bool) Enable PMIx internal monitoring by server
65+
#define OPAL_PMIX_SERVER_NSPACE "pmix.srv.nspace" // (char*) Name of the nspace to use for this server
66+
#define OPAL_PMIX_SERVER_RANK "pmix.srv.rank" // (uint32_t) Rank of this server
6567

6668

6769
/* identification attributes */

orte/mca/ess/base/ess_base_std_orted.c

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,9 @@ int orte_ess_base_orted_setup(void)
357357
}
358358
/* set the event base */
359359
opal_pmix_base_set_evbase(orte_event_base);
360-
/* setup the PMIx server */
360+
/* setup the PMIx server - we need this here in case the
361+
* communications infrastructure wants to register
362+
* information */
361363
if (ORTE_SUCCESS != (ret = pmix_server_init())) {
362364
/* the server code already barked, so let's be quiet */
363365
ret = ORTE_ERR_SILENT;
@@ -398,6 +400,9 @@ int orte_ess_base_orted_setup(void)
398400
goto error;
399401
}
400402

403+
/* it is now safe to start the pmix server */
404+
pmix_server_start();
405+
401406
if (NULL != orte_process_info.my_hnp_uri) {
402407
/* extract the HNP's name so we can update the routing table */
403408
if (ORTE_SUCCESS != (ret = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri,
@@ -444,7 +449,7 @@ int orte_ess_base_orted_setup(void)
444449
/* add our contact info to our proc object */
445450
proc->rml_uri = orte_rml.get_contact_info();
446451

447-
/*
452+
/*
448453
* Group communications
449454
*/
450455
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {

0 commit comments

Comments
 (0)