Skip to content

Commit 2a01cc8

Browse files
author
rhc54
authored
Merge pull request #2512 from rhc54/topic/dyn
Allow a PMIx tool to spawn a job
2 parents 003f7d3 + 79cde18 commit 2a01cc8

File tree

2 files changed: +101 additions, -20 deletions

orte/mca/plm/base/plm_base_launch_support.c

Lines changed: 20 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -327,24 +327,29 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
327327
OBJ_RELEASE(caddy);
328328
return;
329329
}
330+
/* a tool might be the parent calling spawn, so cannot require that
331+
* a job transport key has been assigned to it */
330332
key = NULL;
331-
if (!orte_get_attribute(&parent->attributes, ORTE_JOB_TRANSPORT_KEY, (void**)&key, OPAL_STRING) ||
332-
NULL == key) {
333-
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
334-
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
335-
OBJ_RELEASE(caddy);
336-
return;
337-
}
338-
/* record it */
339-
orte_set_attribute(&caddy->jdata->attributes, ORTE_JOB_TRANSPORT_KEY, ORTE_ATTR_LOCAL, key, OPAL_STRING);
340-
/* add the transport key envar to each app */
341-
for (i=0; i < caddy->jdata->apps->size; i++) {
342-
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(caddy->jdata->apps, i))) {
343-
continue;
333+
if (orte_get_attribute(&parent->attributes, ORTE_JOB_TRANSPORT_KEY, (void**)&key, OPAL_STRING) &&
334+
NULL != key) {
335+
/* record it */
336+
orte_set_attribute(&caddy->jdata->attributes, ORTE_JOB_TRANSPORT_KEY, ORTE_ATTR_LOCAL, key, OPAL_STRING);
337+
/* add the transport key envar to each app */
338+
for (i=0; i < caddy->jdata->apps->size; i++) {
339+
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(caddy->jdata->apps, i))) {
340+
continue;
341+
}
342+
opal_setenv(OPAL_MCA_PREFIX"orte_precondition_transports", key, true, &app->env);
343+
}
344+
free(key);
345+
} else {
346+
if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(caddy->jdata))) {
347+
ORTE_ERROR_LOG(rc);
348+
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
349+
OBJ_RELEASE(caddy);
350+
return;
344351
}
345-
opal_setenv(OPAL_MCA_PREFIX"orte_precondition_transports", key, true, &app->env);
346352
}
347-
free(key);
348353
} else {
349354
/* this will also record the transport key attribute in the job object, and
350355
* adds the key envar to each app */

orte/orted/pmix/pmix_server_gen.c

Lines changed: 81 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,7 @@
3737
#include "opal/dss/dss.h"
3838

3939
#include "orte/mca/errmgr/errmgr.h"
40+
#include "orte/mca/rmaps/rmaps_types.h"
4041
#include "orte/mca/state/state.h"
4142
#include "orte/util/name_fns.h"
4243
#include "orte/runtime/orte_globals.h"
@@ -351,6 +352,11 @@ int pmix_server_notify_event(int code, opal_process_name_t *source,
351352
opal_value_t *val;
352353
orte_grpcomm_signature_t *sig;
353354

355+
opal_output_verbose(2, orte_pmix_server_globals.output,
356+
"%s local process %s generated event code %d",
357+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
358+
ORTE_NAME_PRINT(source), code);
359+
354360
/* a local process has generated an event - we need to xcast it
355361
* to all the daemons so it can be passed down to their local
356362
* procs */
@@ -442,6 +448,10 @@ static void _query(int sd, short args, void *cbdata)
442448
void *nptr;
443449
char **nspaces=NULL, nspace[512];
444450

451+
opal_output_verbose(2, orte_pmix_server_globals.output,
452+
"%s processing query",
453+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
454+
445455
results = OBJ_NEW(opal_list_t);
446456

447457
/* see what they wanted */
@@ -508,15 +518,75 @@ int pmix_server_query_fn(opal_process_name_t *requestor,
508518
static void _toolconn(int sd, short args, void *cbdata)
509519
{
510520
orte_pmix_server_op_caddy_t *cd = (orte_pmix_server_op_caddy_t*)cbdata;
511-
orte_job_t jdata;
521+
orte_job_t *jdata;
522+
orte_app_context_t *app;
523+
orte_proc_t *proc;
524+
orte_node_t *node;
512525
orte_process_name_t tool;
513526
int rc;
514527

528+
opal_output_verbose(2, orte_pmix_server_globals.output,
529+
"%s TOOL CONNECTION PROCESSING",
530+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
531+
515532
/* if we are the HNP, we can directly assign the jobid */
516533
if (ORTE_PROC_IS_HNP) {
517-
OBJ_CONSTRUCT(&jdata, orte_job_t);
518-
rc = orte_plm_base_create_jobid(&jdata);
519-
tool.jobid = jdata.jobid;
534+
jdata = OBJ_NEW(orte_job_t);
535+
rc = orte_plm_base_create_jobid(jdata);
536+
opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
537+
/* setup some required job-level fields in case this
538+
* tool calls spawn, or uses some other functions that
539+
* need them */
540+
/* must create a map for it (even though it has no
541+
* info in it) so that the job info will be picked
542+
* up in subsequent pidmaps or other daemons won't
543+
* know how to route
544+
*/
545+
jdata->map = OBJ_NEW(orte_job_map_t);
546+
547+
/* setup an app_context for the singleton */
548+
app = OBJ_NEW(orte_app_context_t);
549+
app->app = strdup("tool");
550+
app->num_procs = 1;
551+
opal_pointer_array_add(jdata->apps, app);
552+
jdata->num_apps = 1;
553+
554+
/* setup a proc object for the singleton - since we
555+
* -must- be the HNP, and therefore we stored our
556+
* node on the global node pool, and since the singleton
557+
* -must- be on the same node as us, indicate that
558+
*/
559+
proc = OBJ_NEW(orte_proc_t);
560+
proc->name.jobid = jdata->jobid;
561+
proc->name.vpid = 0;
562+
proc->parent = ORTE_PROC_MY_NAME->vpid;
563+
ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_ALIVE);
564+
proc->state = ORTE_PROC_STATE_RUNNING;
565+
proc->app_idx = 0;
566+
/* obviously, it is on my node */
567+
node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
568+
proc->node = node;
569+
OBJ_RETAIN(node); /* keep accounting straight */
570+
opal_pointer_array_add(jdata->procs, proc);
571+
jdata->num_procs = 1;
572+
/* add the node to the job map */
573+
OBJ_RETAIN(node);
574+
opal_pointer_array_add(jdata->map->nodes, node);
575+
jdata->map->num_nodes++;
576+
/* and it obviously is on the node */
577+
OBJ_RETAIN(proc);
578+
opal_pointer_array_add(node->procs, proc);
579+
node->num_procs++;
580+
/* set the trivial */
581+
proc->local_rank = 0;
582+
proc->node_rank = 0;
583+
proc->app_rank = 0;
584+
proc->state = ORTE_PROC_STATE_RUNNING;
585+
proc->app_idx = 0;
586+
ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_LOCAL);
587+
588+
/* pass back the assigned jobid */
589+
tool.jobid = jdata->jobid;
520590
tool.vpid = 0;
521591
if (NULL != cd->toolcbfunc) {
522592
cd->toolcbfunc(rc, tool, cd->cbdata);
@@ -541,7 +611,9 @@ void pmix_tool_connected_fn(opal_list_t *info,
541611
{
542612
orte_pmix_server_op_caddy_t *cd;
543613

544-
opal_output(0, "TOOL CONNECTION REQUEST RECVD");
614+
opal_output_verbose(2, orte_pmix_server_globals.output,
615+
"%s TOOL CONNECTION REQUEST RECVD",
616+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
545617

546618
/* need to threadshift this request */
547619
cd = OBJ_NEW(orte_pmix_server_op_caddy_t);
@@ -566,6 +638,10 @@ void pmix_server_log_fn(opal_process_name_t *requestor,
566638
opal_buffer_t *buf;
567639
int rc;
568640

641+
opal_output_verbose(2, orte_pmix_server_globals.output,
642+
"%s logging info",
643+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
644+
569645
/* for now, we only support logging show_help messages */
570646
OPAL_LIST_FOREACH(val, info, opal_value_t) {
571647
/* we ignore the key as irrelevant - we only want to

0 commit comments

Comments
 (0)