Skip to content

Commit ebbd815

Browse files
Ralph Castain authored and hppritcha committed
Update mapping system
Correctly transfer job-level mapping directives for dynamically spawned jobs to the mapping system. Signed-off-by: Ralph Castain <[email protected]> (cherry picked from commit 45f23ca)
1 parent 0eece12 commit ebbd815

File tree

5 files changed

+127
-14
lines changed

5 files changed

+127
-14
lines changed

orte/mca/rmaps/base/base.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -121,7 +121,8 @@ ORTE_DECLSPEC int orte_rmaps_base_filter_nodes(orte_app_context_t *app,
121121
opal_list_t *nodes,
122122
bool remove);
123123

124-
ORTE_DECLSPEC int orte_rmaps_base_set_mapping_policy(orte_mapping_policy_t *policy,
124+
ORTE_DECLSPEC int orte_rmaps_base_set_mapping_policy(orte_job_t *jdata,
125+
orte_mapping_policy_t *policy,
125126
char **device, char *spec);
126127
ORTE_DECLSPEC int orte_rmaps_base_set_ranking_policy(orte_ranking_policy_t *policy,
127128
orte_mapping_policy_t mapping,

orte/mca/rmaps/base/rmaps_base_frame.c

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -287,7 +287,7 @@ static int orte_rmaps_base_open(mca_base_open_flag_t flags)
287287
"rmaps_base_cpus_per_proc", "rmaps_base_mapping_policy=<obj>:PE=N, default <obj>=NUMA");
288288
}
289289

290-
if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_mapping_policy(&orte_rmaps_base.mapping,
290+
if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_mapping_policy(NULL, &orte_rmaps_base.mapping,
291291
&orte_rmaps_base.device,
292292
rmaps_base_mapping_policy))) {
293293
return rc;
@@ -599,7 +599,8 @@ static int check_modifiers(char *ck, orte_mapping_policy_t *tmp)
599599
return ORTE_ERR_TAKE_NEXT_OPTION;
600600
}
601601

602-
int orte_rmaps_base_set_mapping_policy(orte_mapping_policy_t *policy,
602+
int orte_rmaps_base_set_mapping_policy(orte_job_t *jdata,
603+
orte_mapping_policy_t *policy,
603604
char **device, char *inspec)
604605
{
605606
char *ck;
@@ -687,7 +688,11 @@ int orte_rmaps_base_set_mapping_policy(orte_mapping_policy_t *policy,
687688
}
688689
}
689690
/* now save the pattern */
690-
orte_rmaps_base.ppr = strdup(ck);
691+
if (NULL == jdata || NULL == jdata->map) {
692+
orte_rmaps_base.ppr = strdup(ck);
693+
} else {
694+
jdata->map->ppr = strdup(ck);
695+
}
691696
ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_PPR);
692697
ORTE_SET_MAPPING_DIRECTIVE(tmp, ORTE_MAPPING_GIVEN);
693698
free(spec);
@@ -753,7 +758,11 @@ int orte_rmaps_base_set_mapping_policy(orte_mapping_policy_t *policy,
753758
}
754759

755760
setpolicy:
756-
*policy = tmp;
761+
if (NULL == jdata || NULL == jdata->map) {
762+
*policy = tmp;
763+
} else {
764+
jdata->map->mapping = tmp;
765+
}
757766

758767
return ORTE_SUCCESS;
759768
}

orte/orted/orted_submit.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -864,7 +864,7 @@ int orte_submit_job(char *argv[], int *index,
864864
jdata->map = OBJ_NEW(orte_job_map_t);
865865

866866
if (NULL != orte_cmd_options.mapping_policy) {
867-
if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_mapping_policy(&jdata->map->mapping, NULL, orte_cmd_options.mapping_policy))) {
867+
if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_mapping_policy(jdata, &jdata->map->mapping, NULL, orte_cmd_options.mapping_policy))) {
868868
ORTE_ERROR_LOG(rc);
869869
return rc;
870870
}

orte/orted/pmix/pmix_server_dyn.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,7 @@ int pmix_server_spawn_fn(opal_process_name_t *requestor,
289289
orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
290290
return ORTE_ERR_BAD_PARAM;
291291
}
292-
rc = orte_rmaps_base_set_mapping_policy(&jdata->map->mapping,
292+
rc = orte_rmaps_base_set_mapping_policy(jdata, &jdata->map->mapping,
293293
NULL, info->data.string);
294294
if (ORTE_SUCCESS != rc) {
295295
return rc;

orte/orted/pmix/pmix_server_gen.c

Lines changed: 110 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -485,9 +485,11 @@ static void _query(int sd, short args, void *cbdata)
485485
orte_pmix_server_op_caddy_t *cd = (orte_pmix_server_op_caddy_t*)cbdata;
486486
opal_pmix_query_t *q;
487487
opal_value_t *kv;
488+
orte_jobid_t jobid;
488489
orte_job_t *jdata;
489490
orte_proc_t *proct;
490-
int rc, i, num_replies;
491+
orte_app_context_t *app;
492+
int rc = ORTE_SUCCESS, i, k, num_replies;
491493
opal_list_t *results, targets, *array;
492494
size_t n;
493495
uint32_t key;
@@ -703,16 +705,117 @@ static void _query(int sd, short args, void *cbdata)
703705
kv->type = OPAL_STRING;
704706
kv->data.string = strdup(orte_process_info.my_hnp_uri);
705707
opal_list_append(results, &kv->super);
708+
} else if (0 == strcmp(q->keys[n], OPAL_PMIX_QUERY_PROC_TABLE)) {
709+
/* the job they are asking about is in the qualifiers */
710+
jobid = ORTE_JOBID_INVALID;
711+
OPAL_LIST_FOREACH(kv, &q->qualifiers, opal_value_t) {
712+
if (0 == strcmp(kv->key, OPAL_PMIX_PROCID)) {
713+
/* save the id */
714+
jobid = kv->data.name.jobid;
715+
break;
716+
}
717+
}
718+
if (ORTE_JOBID_INVALID == jobid) {
719+
rc = ORTE_ERR_NOT_FOUND;
720+
goto done;
721+
}
722+
/* construct a list of values with opal_proc_info_t
723+
* entries for each proc in the indicated job */
724+
jdata = orte_get_job_data_object(jobid);
725+
if (NULL == jdata) {
726+
rc = ORTE_ERR_NOT_FOUND;
727+
goto done;
728+
}
729+
/* setup the reply */
730+
kv = OBJ_NEW(opal_value_t);
731+
kv->key = strdup(OPAL_PMIX_QUERY_PROC_TABLE);
732+
kv->type = OPAL_PTR;
733+
array = OBJ_NEW(opal_list_t);
734+
kv->data.ptr = array;
735+
opal_list_append(results, &kv->super);
736+
/* cycle thru the job and create an entry for each proc */
737+
for (k=0; k < jdata->procs->size; k++) {
738+
if (NULL == (proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, k))) {
739+
continue;
740+
}
741+
kv = OBJ_NEW(opal_value_t);
742+
kv->type = OPAL_PROC_INFO;
743+
kv->data.pinfo.name.jobid = jobid;
744+
kv->data.pinfo.name.vpid = proct->name.vpid;
745+
if (NULL != proct->node && NULL != proct->node->name) {
746+
kv->data.pinfo.hostname = strdup(proct->node->name);
747+
}
748+
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proct->app_idx);
749+
if (NULL != app && NULL != app->app) {
750+
kv->data.pinfo.executable_name = strdup(app->app);
751+
}
752+
kv->data.pinfo.pid = proct->pid;
753+
kv->data.pinfo.exit_code = proct->exit_code;
754+
kv->data.pinfo.state = proct->state;
755+
opal_list_append(array, &kv->super);
756+
}
757+
} else if (0 == strcmp(q->keys[n], OPAL_PMIX_QUERY_LOCAL_PROC_TABLE)) {
758+
/* the job they are asking about is in the qualifiers */
759+
jobid = ORTE_JOBID_INVALID;
760+
OPAL_LIST_FOREACH(kv, &q->qualifiers, opal_value_t) {
761+
if (0 == strcmp(kv->key, OPAL_PMIX_PROCID)) {
762+
/* save the id */
763+
jobid = kv->data.name.jobid;
764+
break;
765+
}
766+
}
767+
if (ORTE_JOBID_INVALID == jobid) {
768+
rc = ORTE_ERR_BAD_PARAM;
769+
goto done;
770+
}
771+
/* construct a list of values with opal_proc_info_t
772+
* entries for each LOCAL proc in the indicated job */
773+
jdata = orte_get_job_data_object(jobid);
774+
if (NULL == jdata) {
775+
rc = ORTE_ERR_NOT_FOUND;
776+
goto done;
777+
}
778+
/* setup the reply */
779+
kv = OBJ_NEW(opal_value_t);
780+
kv->key = strdup(OPAL_PMIX_QUERY_LOCAL_PROC_TABLE);
781+
kv->type = OPAL_PTR;
782+
array = OBJ_NEW(opal_list_t);
783+
kv->data.ptr = array;
784+
opal_list_append(results, &kv->super);
785+
/* cycle thru the job and create an entry for each proc */
786+
for (k=0; k < jdata->procs->size; k++) {
787+
if (NULL == (proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, k))) {
788+
continue;
789+
}
790+
if (ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_LOCAL)) {
791+
kv = OBJ_NEW(opal_value_t);
792+
kv->type = OPAL_PROC_INFO;
793+
kv->data.pinfo.name.jobid = jobid;
794+
kv->data.pinfo.name.vpid = proct->name.vpid;
795+
if (NULL != proct->node && NULL != proct->node->name) {
796+
kv->data.pinfo.hostname = strdup(proct->node->name);
797+
}
798+
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proct->app_idx);
799+
if (NULL != app && NULL != app->app) {
800+
kv->data.pinfo.executable_name = strdup(app->app);
801+
}
802+
kv->data.pinfo.pid = proct->pid;
803+
kv->data.pinfo.exit_code = proct->exit_code;
804+
kv->data.pinfo.state = proct->state;
805+
opal_list_append(array, &kv->super);
806+
}
807+
}
706808
}
707809
}
708810
}
709811

710-
if (0 == opal_list_get_size(results)) {
711-
rc = ORTE_ERR_NOT_FOUND;
712-
} else if (opal_list_get_size(results) < opal_list_get_size(cd->info)) {
713-
rc = ORTE_ERR_PARTIAL_SUCCESS;
714-
} else {
715-
rc = ORTE_SUCCESS;
812+
done:
813+
if (ORTE_SUCCESS == rc) {
814+
if (0 == opal_list_get_size(results)) {
815+
rc = ORTE_ERR_NOT_FOUND;
816+
} else if (opal_list_get_size(results) < opal_list_get_size(cd->info)) {
817+
rc = ORTE_ERR_PARTIAL_SUCCESS;
818+
}
716819
}
717820
cd->infocbfunc(rc, results, cd->cbdata, qrel, results);
718821
}

0 commit comments

Comments
 (0)