Skip to content

Commit d85239e

Browse files
author
Ralph Castain
committed
Clean up some issues in connect/accept support across jobs started by different mpirun commands. Still not fully operational, but someone else will have to finish debugging it.
Signed-off-by: Ralph Castain <[email protected]>
1 parent 4e76379 commit d85239e

File tree

4 files changed

+162
-77
lines changed

4 files changed

+162
-77
lines changed

orte/orted/pmix/pmix_server_dyn.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,11 @@ static void _cnlk(int status, opal_list_t *data, void *cbdata)
399399

400400
/* restart the cnct processor */
401401
ORTE_PMIX_OPERATION(cd->procs, cd->info, _cnct, cd->cbfunc, cd->cbdata);
402+
/* protect the re-referenced data */
403+
cd->procs = NULL;
404+
cd->info = NULL;
402405
OBJ_RELEASE(cd);
406+
return;
403407

404408
release:
405409
if (NULL != cd->cbfunc) {
@@ -415,6 +419,7 @@ static void _cnct(int sd, short args, void *cbdata)
415419
char **keys = NULL, *key;
416420
orte_job_t *jdata;
417421
int rc = ORTE_SUCCESS;
422+
opal_value_t *kv;
418423

419424
ORTE_ACQUIRE_OBJECT(cd);
420425

@@ -444,6 +449,12 @@ static void _cnct(int sd, short args, void *cbdata)
444449
orte_util_convert_jobid_to_string(&key, nm->name.jobid);
445450
opal_argv_append_nosize(&keys, key);
446451
free(key);
452+
/* we have to add the user's id to our list of info */
453+
kv = OBJ_NEW(opal_value_t);
454+
kv->key = strdup(OPAL_PMIX_USERID);
455+
kv->type = OPAL_UINT32;
456+
kv->data.uint32 = geteuid();
457+
opal_list_append(cd->info, &kv->super);
447458
if (ORTE_SUCCESS != (rc = pmix_server_lookup_fn(&nm->name, keys, cd->info, _cnlk, cd))) {
448459
opal_argv_free(keys);
449460
goto release;

orte/orted/pmix/pmix_server_pub.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,10 @@ int pmix_server_lookup_fn(opal_process_name_t *proc, char **keys,
394394
req->timeout = iptr->data.integer;
395395
continue;
396396
}
397+
opal_output_verbose(2, orte_pmix_server_globals.output,
398+
"%s lookup directive %s for proc %s",
399+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), iptr->key,
400+
ORTE_NAME_PRINT(proc));
397401
if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &iptr, 1, OPAL_VALUE))) {
398402
ORTE_ERROR_LOG(rc);
399403
OBJ_RELEASE(req);

orte/orted/pmix/pmix_server_register_fns.c

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@
5050
#include "pmix_server_internal.h"
5151
#include "pmix_server.h"
5252

53+
static void mycbfunc(int status, void *cbdata);
54+
5355
/* stuff proc attributes for sending back to a proc */
5456
int orte_pmix_server_register_nspace(orte_job_t *jdata, bool force)
5557
{
@@ -472,5 +474,67 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata, bool force)
472474
info, NULL, NULL);
473475
OPAL_LIST_RELEASE(info);
474476

477+
/* if the user has connected us to an external server, then we must
478+
* assume there is going to be some cross-mpirun exchange, and so
479+
* we protect against that situation by publishing the job info
480+
* for this job - this allows any subsequent "connect" to retrieve
481+
* the job info */
482+
if (NULL != orte_data_server_uri) {
483+
opal_buffer_t buf;
484+
485+
OBJ_CONSTRUCT(&buf, opal_buffer_t);
486+
if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &jdata, 1, ORTE_JOB))) {
487+
ORTE_ERROR_LOG(rc);
488+
OBJ_DESTRUCT(&buf);
489+
return rc;
490+
}
491+
info = OBJ_NEW(opal_list_t);
492+
/* create a key-value with the key being the string jobid
493+
* and the value being the byte object */
494+
kv = OBJ_NEW(opal_value_t);
495+
orte_util_convert_jobid_to_string(&kv->key, jdata->jobid);
496+
kv->type = OPAL_BYTE_OBJECT;
497+
opal_dss.unload(&buf, (void**)&kv->data.bo.bytes, &kv->data.bo.size);
498+
OBJ_DESTRUCT(&buf);
499+
opal_list_append(info, &kv->super);
500+
501+
/* set the range to be session */
502+
kv = OBJ_NEW(opal_value_t);
503+
kv->key = strdup(OPAL_PMIX_RANGE);
504+
kv->type = OPAL_UINT;
505+
kv->data.uint = OPAL_PMIX_RANGE_SESSION;
506+
opal_list_append(info, &kv->super);
507+
508+
/* set the persistence to be app */
509+
kv = OBJ_NEW(opal_value_t);
510+
kv->key = strdup(OPAL_PMIX_PERSISTENCE);
511+
kv->type = OPAL_INT;
512+
kv->data.integer = OPAL_PMIX_PERSIST_APP;
513+
opal_list_append(info, &kv->super);
514+
515+
/* add our effective userid to the directives */
516+
kv = OBJ_NEW(opal_value_t);
517+
kv->key = strdup(OPAL_PMIX_USERID);
518+
kv->type = OPAL_UINT32;
519+
kv->data.uint32 = geteuid();
520+
opal_list_append(info, &kv->super);
521+
522+
/* now publish it */
523+
if (ORTE_SUCCESS != (rc = pmix_server_publish_fn(ORTE_PROC_MY_NAME,
524+
info, mycbfunc, info))) {
525+
ORTE_ERROR_LOG(rc);
526+
}
527+
}
528+
475529
return rc;
476530
}
531+
532+
/* Completion callback handed to pmix_server_publish_fn() when the job info
 * is published in orte_pmix_server_register_nspace().
 *
 * status - result of the publish operation; anything other than
 *          ORTE_SUCCESS is reported via ORTE_ERROR_LOG
 * cbdata - the opal_list_t of publish directives that the caller passed
 *          as the callback data; this function releases it
 *
 * NOTE(review): the list is only released from here, so if the publish
 * call can fail without ever invoking this callback the list would not
 * be freed - confirm against pmix_server_publish_fn(). */
static void mycbfunc(int status, void *cbdata)
533+
{
534+
/* recover the directive list that was supplied as cbdata */
opal_list_t *info = (opal_list_t*)cbdata;
535+
536+
/* log a failed publish; no further recovery is attempted here */
if (ORTE_SUCCESS != status) {
537+
ORTE_ERROR_LOG(status);
538+
}
539+
/* release the list regardless of the reported status */
OPAL_LIST_RELEASE(info);
540+
}

0 commit comments

Comments
 (0)