Skip to content

Commit d644f7e

Browse files
author
Ralph Castain
committed
Correctly fix the ranking policy
Shorten the loops as much as possible. Anyone who wants to optimize them further is welcome to do so.

Signed-off-by: Ralph Castain <[email protected]>
1 parent fd704d8 commit d644f7e

File tree

1 file changed

+60
-54
lines changed

1 file changed

+60
-54
lines changed

orte/mca/rmaps/base/rmaps_base_ranking.c

Lines changed: 60 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -304,15 +304,15 @@ static int rank_by(orte_job_t *jdata,
304304
{
305305
orte_app_context_t *app;
306306
hwloc_obj_t obj;
307-
int num_objs, i, j, m, n, rc;
307+
int num_objs, i, j, m, n, rc, nn;
308308
orte_vpid_t num_ranked=0;
309309
orte_node_t *node;
310310
orte_proc_t *proc, *pptr;
311-
orte_vpid_t vpid;
311+
orte_vpid_t vpid, np;
312312
int cnt;
313313
opal_pointer_array_t objs;
314-
bool all_done;
315314
hwloc_obj_t locale;
315+
orte_app_idx_t napp;
316316

317317
if (ORTE_RANKING_SPAN & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) {
318318
return rank_span(jdata, target, cache_level);
@@ -333,20 +333,21 @@ static int rank_by(orte_job_t *jdata,
333333
*/
334334

335335
vpid = 0;
336-
for (n=0; n < jdata->apps->size; n++) {
336+
for (n=0, napp=0; napp < jdata->num_apps && n < jdata->apps->size; n++) {
337337
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) {
338338
continue;
339339
}
340-
340+
napp++;
341341
/* setup the pointer array */
342342
OBJ_CONSTRUCT(&objs, opal_pointer_array_t);
343343
opal_pointer_array_init(&objs, 2, INT_MAX, 2);
344344

345345
cnt = 0;
346-
for (m=0; m < jdata->map->nodes->size; m++) {
346+
for (m=0, nn=0; nn < jdata->map->num_nodes && m < jdata->map->nodes->size; m++) {
347347
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) {
348348
continue;
349349
}
350+
nn++;
350351

351352
/* get the number of objects - only consider those we can actually use */
352353
num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target,
@@ -376,13 +377,20 @@ static int rank_by(orte_job_t *jdata,
376377
* Perhaps someday someone will come up with a more efficient
377378
* algorithm, but this works for now.
378379
*/
379-
all_done = false;
380-
while (!all_done && cnt < app->num_procs) {
381-
all_done = true;
382-
for (j=0; j < node->procs->size && cnt < app->num_procs; j++) {
380+
i = 0;
381+
while (cnt < app->num_procs) {
382+
/* get the next object */
383+
obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i);
384+
if (NULL == obj) {
385+
break;
386+
}
387+
/* scan across the procs and find the one that is on this object */
388+
np = 0;
389+
for (j=0; np < node->num_procs && j < node->procs->size && cnt < app->num_procs; j++) {
383390
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
384391
continue;
385392
}
393+
np++;
386394
/* ignore procs from other jobs */
387395
if (proc->name.jobid != jdata->jobid) {
388396
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
@@ -404,53 +412,48 @@ static int rank_by(orte_job_t *jdata,
404412
ORTE_NAME_PRINT(&proc->name), num_ranked);
405413
continue;
406414
}
407-
/* cycle across the objects */
408-
for (i=0; i < num_objs && cnt < app->num_procs && all_done; i++) {
409-
obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i);
410-
/* protect against bozo case */
411-
locale = NULL;
412-
if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
413-
ORTE_ERROR_LOG(ORTE_ERROR);
414-
return ORTE_ERROR;
415-
}
416-
/* ignore procs not on this object */
417-
if (NULL == locale ||
418-
!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) {
419-
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
420-
"mca:rmaps:rank_by: proc at position %d is not on object %d",
421-
j, i);
422-
continue;
423-
}
424-
/* assign the vpid */
425-
proc->name.vpid = vpid++;
426-
if (0 == cnt) {
427-
app->first_rank = proc->name.vpid;
428-
}
429-
cnt++;
415+
/* protect against bozo case */
416+
locale = NULL;
417+
if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
418+
ORTE_ERROR_LOG(ORTE_ERROR);
419+
return ORTE_ERROR;
420+
}
421+
/* ignore procs not on this object */
422+
if (NULL == locale ||
423+
!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) {
430424
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
431-
"mca:rmaps:rank_by: proc in position %d is on object %d assigned rank %s",
432-
j, i, ORTE_VPID_PRINT(proc->name.vpid));
433-
/* insert the proc into the jdata array */
434-
if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) {
435-
OBJ_RELEASE(pptr);
436-
}
437-
OBJ_RETAIN(proc);
438-
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
439-
ORTE_ERROR_LOG(rc);
440-
OBJ_DESTRUCT(&objs);
441-
return rc;
442-
}
443-
num_ranked++;
444-
/* flag that one was mapped */
445-
all_done = false;
446-
/* track where the highest vpid landed - this is our
447-
* new bookmark
448-
*/
449-
jdata->bookmark = node;
450-
/* move to next object */
451-
break;
425+
"mca:rmaps:rank_by: proc at position %d is not on object %d",
426+
j, i);
427+
continue;
428+
}
429+
/* assign the vpid */
430+
proc->name.vpid = vpid++;
431+
if (0 == cnt) {
432+
app->first_rank = proc->name.vpid;
452433
}
434+
cnt++;
435+
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
436+
"mca:rmaps:rank_by: proc in position %d is on object %d assigned rank %s",
437+
j, i, ORTE_VPID_PRINT(proc->name.vpid));
438+
/* insert the proc into the jdata array */
439+
if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) {
440+
OBJ_RELEASE(pptr);
441+
}
442+
OBJ_RETAIN(proc);
443+
if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
444+
ORTE_ERROR_LOG(rc);
445+
OBJ_DESTRUCT(&objs);
446+
return rc;
447+
}
448+
num_ranked++;
449+
/* track where the highest vpid landed - this is our
450+
* new bookmark
451+
*/
452+
jdata->bookmark = node;
453+
/* move to next object */
454+
break;
453455
}
456+
i++;
454457
}
455458
}
456459
/* cleanup */
@@ -474,6 +477,9 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
474477

475478
map = jdata->map;
476479

480+
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
481+
"RANKING POLICY: %s", orte_rmaps_base_print_ranking(map->ranking));
482+
477483
/* start with the rank-by object options - if the object isn't
478484
* included in the topology, then we obviously cannot rank by it.
479485
* However, if this was the default ranking policy (as opposed to

0 commit comments

Comments
 (0)