Commit 0ae873d

Ralph Castain committed
Fix a bug where we failed to compute #procs for nperXXX directives, thus resulting in an incorrect default binding
Signed-off-by: Ralph Castain <[email protected]>
1 parent 2234953 commit 0ae873d
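
Background: the nperXXX directives are mpirun's per-resource process-count options (e.g., -npernode and -npersocket), which populate the job's ppr ("processes per resource") string rather than an explicit -np value, so app->num_procs arrives as zero. ORTE derives the default binding policy from the total number of processes in the job, so failing to compute nprocs for these directives meant the policy was chosen from a bogus count. A minimal sketch of the corrected computation appears after the diff below.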

1 file changed: +56 -34 lines changed

orte/mca/rmaps/base/rmaps_base_map_job.c

Lines changed: 56 additions & 34 deletions
@@ -50,8 +50,8 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
 {
     orte_job_t *jdata;
     orte_node_t *node;
-    int rc, i;
-    bool did_map, given;
+    int rc, i, ppx;
+    bool did_map, given, pernode;
     orte_rmaps_base_selected_module_t *mod;
     orte_job_t *parent;
     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
@@ -71,6 +71,22 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
                         "mca:rmaps: mapping job %s",
                         ORTE_JOBID_PRINT(jdata->jobid));
 
+    if (NULL == jdata->map->ppr && NULL != orte_rmaps_base.ppr) {
+        jdata->map->ppr = strdup(orte_rmaps_base.ppr);
+    }
+    if (NULL != jdata->map->ppr) {
+        /* get the procs/object */
+        ppx = strtoul(jdata->map->ppr, NULL, 10);
+        if (NULL != strstr(jdata->map->ppr, "node")) {
+            pernode = true;
+        } else {
+            pernode = false;
+        }
+    }
+    if (0 == jdata->map->cpus_per_rank) {
+        jdata->map->cpus_per_rank = orte_rmaps_base.cpus_per_rank;
+    }
+
     /* compute the number of procs and check validity */
     nprocs = 0;
     for (i=0; i < jdata->apps->size; i++) {
@@ -80,34 +96,47 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
             orte_std_cntr_t slots;
             OBJ_CONSTRUCT(&nodes, opal_list_t);
             orte_rmaps_base_get_target_nodes(&nodes, &slots, app, ORTE_MAPPING_BYNODE, true, true);
-            /* if we are in a managed allocation, then all is good - otherwise,
-             * we have to do a little more checking */
-            if (!orte_managed_allocation) {
-                /* if all the nodes have their slots given, then we are okay */
-                given = true;
-                OPAL_LIST_FOREACH(node, &nodes, orte_node_t) {
-                    if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
-                        given = false;
-                        break;
+            if (NULL != jdata->map->ppr) {
+                if (pernode) {
+                    nprocs += ppx * opal_list_get_size(&nodes);
+                } else {
+                    /* must be procs/socket, so add in #sockets for each node */
+                    slots = 0;
+                    OPAL_LIST_FOREACH(node, &nodes, orte_node_t) {
+                        slots += ppx * opal_hwloc_base_get_nbobjs_by_type(node->topology->topo,
+                                                                          HWLOC_OBJ_SOCKET, 0,
+                                                                          OPAL_HWLOC_AVAILABLE);
                     }
+                    nprocs += slots;
                 }
-                /* if -host or -hostfile was given, and the slots were not,
-                 * then this is no longer allowed */
-                if (!given &&
-                    (orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, NULL, OPAL_STRING) ||
-                     orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING))) {
-                    /* inform the user of the error */
-                    orte_show_help("help-orte-rmaps-base.txt", "num-procs-not-specified", true);
-                    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
-                    OBJ_RELEASE(caddy);
-                    OPAL_LIST_DESTRUCT(&nodes);
-                    return;
+            } else {
+                /* if we are in a managed allocation, then all is good - otherwise,
+                 * we have to do a little more checking */
+                if (!orte_managed_allocation) {
+                    /* if all the nodes have their slots given, then we are okay */
+                    given = true;
+                    OPAL_LIST_FOREACH(node, &nodes, orte_node_t) {
+                        if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
+                            given = false;
+                            break;
+                        }
+                    }
+                    /* if -host or -hostfile was given, and the slots were not,
+                     * then this is no longer allowed */
+                    if (!given &&
+                        (orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, NULL, OPAL_STRING) ||
+                         orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING))) {
+                        /* inform the user of the error */
+                        orte_show_help("help-orte-rmaps-base.txt", "num-procs-not-specified", true);
+                        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
+                        OBJ_RELEASE(caddy);
+                        OPAL_LIST_DESTRUCT(&nodes);
+                        return;
+                    }
                 }
-            }
-            OPAL_LIST_DESTRUCT(&nodes);
-            if (ORTE_MAPPING_PPR != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
                 nprocs += slots;
             }
+            OPAL_LIST_DESTRUCT(&nodes);
         } else {
             nprocs += app->num_procs;
         }
@@ -116,8 +145,8 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
 
 
     opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
-                        "mca:rmaps: setting mapping policies for job %s",
-                        ORTE_JOBID_PRINT(jdata->jobid));
+                        "mca:rmaps: setting mapping policies for job %s nprocs %d",
+                        ORTE_JOBID_PRINT(jdata->jobid), (int)nprocs);
 
     if (!jdata->map->display_map) {
         jdata->map->display_map = orte_rmaps_base.display_map;
@@ -187,13 +216,6 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
         jdata->map->ranking = orte_rmaps_base.ranking;
     }
 
-    if (NULL == jdata->map->ppr && NULL != orte_rmaps_base.ppr) {
-        jdata->map->ppr = strdup(orte_rmaps_base.ppr);
-    }
-    if (0 == jdata->map->cpus_per_rank) {
-        jdata->map->cpus_per_rank = orte_rmaps_base.cpus_per_rank;
-    }
-
     /* define the binding policy for this job - if the user specified one
      * already (e.g., during the call to comm_spawn), then we don't
      * override it */
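
To see the fix's counting logic in isolation, here is a minimal, self-contained sketch. The "N:object" ppr format (e.g., "2:node") is an assumption inferred from the patch's strtoul/strstr parsing, and the node and socket counts below are hypothetical stand-ins for what ORTE actually gathers from the allocation and from each node's hwloc topology via opal_hwloc_base_get_nbobjs_by_type():

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Hypothetical allocation: 3 nodes with these per-node socket counts
     * (in ORTE these come from each node's hwloc topology). */
    static const int sockets_per_node[] = { 2, 2, 4 };
    static const int num_nodes = 3;

    int main(void)
    {
        const char *ppr = "2:node";   /* e.g., what -npernode 2 would produce */
        int nprocs = 0;

        /* strtoul reads the leading digits and stops at the first non-digit,
         * so "2:node" yields ppx = 2, mirroring the patch */
        int ppx = (int)strtoul(ppr, NULL, 10);

        if (NULL != strstr(ppr, "node")) {
            /* per-node directive: ppx procs on every target node */
            nprocs = ppx * num_nodes;
        } else {
            /* otherwise procs/socket: ppx procs on every socket of every node */
            for (int i = 0; i < num_nodes; i++) {
                nprocs += ppx * sockets_per_node[i];
            }
        }

        printf("nprocs = %d\n", nprocs);   /* prints: nprocs = 6 */
        return 0;
    }

With nprocs computed up front, the default-binding selection further down in orte_rmaps_base_map_job() sees the real job size instead of the zero that the old ORTE_MAPPING_PPR special case left behind.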
