Skip to content

Commit b9afbb9

Browse files
committed
shell: allow taskmaps to redistribute tasks amongst nodes
Problem: taskmap_check() now allows an updated taskmap to change the number of tasks per node, provided the total task and node counts remain unchanged, but the affected rcalc data in the shell is not updated, leading to a possible mismatch between shell components. Add an rcalc_update_map() function, which takes a new taskmap and updates ntasks for each shell rank internally. Also refresh the rankinfo object for the current shell after rcalc_update_map() returns. This allows a custom taskmap to redistribute task counts among the nodes of a job.
1 parent fc97c06 commit b9afbb9

File tree

3 files changed

+34
-0
lines changed

3 files changed

+34
-0
lines changed

src/shell/info.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,18 @@ int shell_info_set_taskmap (struct shell_info *info,
263263
if (!(taskids = taskmap_taskids (map, info->shell_rank))
264264
|| !(copy = idset_copy (taskids)))
265265
return -1;
266+
267+
/* Update rcalc using the new taskmap.
268+
* Be sure to update the local rankinfo structure, which may have changed.
269+
*/
270+
if (rcalc_update_map (info->rcalc, map) < 0
271+
|| rcalc_get_nth (info->rcalc,
272+
info->shell_rank,
273+
&info->rankinfo) < 0) {
274+
shell_log_error ("unable to update task counts from new taskmap");
275+
return -1;
276+
}
277+
266278
idset_destroy (info->taskids);
267279
info->taskids = copy;
268280
taskmap_destroy (info->taskmap);

src/shell/rcalc.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <unistd.h>
1717
#include <errno.h>
1818
#include <jansson.h>
19+
#include <flux/taskmap.h>
1920

2021
#include "src/common/libczmqcontainers/czmq_containers.h"
2122
#include "src/common/libidset/idset.h"
@@ -52,6 +53,22 @@ struct rcalc {
5253
struct allocinfo *alloc;
5354
};
5455

56+
/* Refresh the per-rank task counts stored in 'r' from the taskmap 'map'.
 *
 * For each of the r->nranks entries in r->alloc, ntasks is set to
 * taskmap_ntasks (map, i).
 *
 * Returns 0 on success. Returns -1 with errno set to EINVAL if 'r' or
 * 'map' is NULL. Returns -1 if taskmap_ntasks() reports a negative count
 * for any rank, in which case 'r' is left unmodified (errno is whatever
 * taskmap_ntasks() left behind -- presumably set by libtaskmap; confirm).
 *
 * N.B.: ignores alloc[i]->ncores_avail since this variable is known
 * to not be used after the initial distribution of tasks.
 *
 * NOTE(review): only ntasks is refreshed here. If any other per-rank
 * state is derived from task counts, it is not recomputed -- confirm
 * that nothing downstream depends on it.
 */
int rcalc_update_map (rcalc_t *r, struct taskmap *map)
{
    if (r == NULL || map == NULL) {
        errno = EINVAL;
        return -1;
    }
    /* Validate every rank before modifying anything so that a failed
     * lookup cannot leave rcalc with a partially applied taskmap or a
     * negative task count.
     */
    for (int i = 0; i < r->nranks; i++) {
        if (taskmap_ntasks (map, i) < 0)
            return -1;
    }
    for (int i = 0; i < r->nranks; i++)
        r->alloc[i].ntasks = taskmap_ntasks (map, i);

    return 0;
}
71+
5572
void rcalc_destroy (rcalc_t *r)
5673
{
5774
if (r == NULL)

src/shell/rcalc.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include <sched.h>
1515
#include <jansson.h>
1616
#include <flux/core.h>
17+
#include <flux/taskmap.h>
1718

1819
typedef struct rcalc rcalc_t;
1920

@@ -61,4 +62,8 @@ int rcalc_get_rankinfo (rcalc_t *r, int rank, struct rcalc_rankinfo *ri);
6162
/* Fill in rcalc_rankinfo for the nth rank in the rcalc_t list */
6263
int rcalc_get_nth (rcalc_t *r, int id, struct rcalc_rankinfo *ri);
6364

65+
/* Update the per-rank task counts in rcalc from a new taskmap. Returns 0 on success, -1 with errno set on error.
66+
*/
67+
int rcalc_update_map (rcalc_t *r, struct taskmap *map);
68+
6469
#endif /* !SHELL_RCALC_H */

0 commit comments

Comments
 (0)