Skip to content

Commit ede6352

Browse files
authored
Merge pull request #4787 from karasevb/fix_rmaps_nodelist_v3.1.x
v3.1: rmaps: fixed the ordering of `mpirun` target nodes
2 parents 4324c00 + 9e4602b commit ede6352

File tree

1 file changed

+21
-28
lines changed

1 file changed

+21
-28
lines changed

orte/mca/rmaps/base/rmaps_base_support_fns.c

Lines changed: 21 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -12,8 +12,10 @@
1212
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
1313
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
1414
* All rights reserved.
15-
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
15+
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
1616
* Copyright (c) 2016 IBM Corporation. All rights reserved.
17+
* Copyright (c) 2018 Mellanox Technologies, Inc.
18+
* All rights reserved.
1719
* $COPYRIGHT$
1820
*
1921
* Additional copyrights may follow
@@ -140,8 +142,8 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
140142
orte_app_context_t *app, orte_mapping_policy_t policy,
141143
bool initial_map, bool silent)
142144
{
143-
opal_list_item_t *item, *next;
144-
orte_node_t *node, *nd, *nptr;
145+
opal_list_item_t *item;
146+
orte_node_t *node, *nd, *nptr, *next;
145147
orte_std_cntr_t num_slots;
146148
orte_std_cntr_t i;
147149
int rc;
@@ -253,13 +255,12 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
253255
/* find the nodes in our node array and assemble them
254256
* in daemon order if the vm was launched
255257
*/
256-
while (NULL != (item = opal_list_remove_first(&nodes))) {
257-
nptr = (orte_node_t*)item;
258+
for (i=0; i < orte_node_pool->size; i++) {
258259
nd = NULL;
259-
for (i=0; i < orte_node_pool->size; i++) {
260-
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
261-
continue;
262-
}
260+
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
261+
continue;
262+
}
263+
OPAL_LIST_FOREACH_SAFE(nptr, next, &nodes, orte_node_t) {
263264
if (0 != strcmp(node->name, nptr->name)) {
264265
OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output,
265266
"NODE %s DOESNT MATCH NODE %s",
@@ -332,8 +333,9 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
332333
/* reset us back to the end for the next node */
333334
nd = (orte_node_t*)opal_list_get_last(allocated_nodes);
334335
}
336+
opal_list_remove_item(&nodes, (opal_list_item_t*)nptr);
337+
OBJ_RELEASE(nptr);
335338
}
336-
OBJ_RELEASE(nptr);
337339
}
338340
OBJ_DESTRUCT(&nodes);
339341
/* now prune for usage and compute total slots */
@@ -470,17 +472,13 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
470472
num_slots = opal_list_get_size(allocated_nodes); // tell the mapper there is one slot/node for debuggers
471473
} else {
472474
item = opal_list_get_first(allocated_nodes);
473-
while (item != opal_list_get_end(allocated_nodes)) {
474-
node = (orte_node_t*)item;
475-
/** save the next pointer in case we remove this node */
476-
next = opal_list_get_next(item);
475+
OPAL_LIST_FOREACH_SAFE(node, next, allocated_nodes, orte_node_t) {
477476
/* if the hnp was not allocated, or flagged not to be used,
478477
* then remove it here */
479478
if (!orte_hnp_is_allocated || (ORTE_GET_MAPPING_DIRECTIVE(policy) & ORTE_MAPPING_NO_USE_LOCAL)) {
480479
if (0 == node->index) {
481-
opal_list_remove_item(allocated_nodes, item);
482-
OBJ_RELEASE(item); /* "un-retain" it */
483-
item = next;
480+
opal_list_remove_item(allocated_nodes, &node->super);
481+
OBJ_RELEASE(node); /* "un-retain" it */
484482
continue;
485483
}
486484
}
@@ -490,9 +488,8 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
490488
"%s Removing node %s: max %d inuse %d",
491489
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
492490
node->name, node->slots_max, node->slots_inuse));
493-
opal_list_remove_item(allocated_nodes, item);
494-
OBJ_RELEASE(item); /* "un-retain" it */
495-
item = next;
491+
opal_list_remove_item(allocated_nodes, &node->super);
492+
OBJ_RELEASE(node); /* "un-retain" it */
496493
continue;
497494
}
498495
if (node->slots <= node->slots_inuse &&
@@ -502,9 +499,8 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
502499
"%s Removing node %s slots %d inuse %d",
503500
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
504501
node->name, node->slots, node->slots_inuse));
505-
opal_list_remove_item(allocated_nodes, item);
506-
OBJ_RELEASE(item); /* "un-retain" it */
507-
item = next;
502+
opal_list_remove_item(allocated_nodes, &node->super);
503+
OBJ_RELEASE(node); /* "un-retain" it */
508504
continue;
509505
}
510506
if (node->slots > node->slots_inuse) {
@@ -514,7 +510,6 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
514510
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
515511
node->name, node->slots - node->slots_inuse));
516512
num_slots += node->slots - node->slots_inuse;
517-
item = next;
518513
continue;
519514
}
520515
if (!(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) {
@@ -528,11 +523,9 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
528523
node->name));
529524
} else {
530525
/* if we cannot use it, remove it from list */
531-
opal_list_remove_item(allocated_nodes, item);
532-
OBJ_RELEASE(item); /* "un-retain" it */
526+
opal_list_remove_item(allocated_nodes, &node->super);
527+
OBJ_RELEASE(node); /* "un-retain" it */
533528
}
534-
/** go on to next item */
535-
item = next;
536529
}
537530
}
538531

0 commit comments

Comments
 (0)