Skip to content

Commit ff92b52

Browse files
authored
Merge pull request #6815 from grondo/issue#6810
remove restriction that taskmaps cannot assign differing counts of tasks to nodes in a job
2 parents 1c825b8 + 61298c8 commit ff92b52

File tree

8 files changed

+51
-22
lines changed

8 files changed

+51
-22
lines changed

doc/man1/common/job-other-run.rst

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,15 @@
1313
manual:TASKMAP
1414
An explicit RFC 34 taskmap is provided and used to manually map
1515
task ids to nodes. The provided *TASKMAP* must match the total number
16-
of tasks in the job and the number of tasks per node assigned by
17-
the job shell, so this option is not useful unless the total number
18-
of nodes and tasks per node are known at job submission time.
16+
of nodes and tasks in the job, so this option is not useful unless
17+
the total number of nodes is known at job submission time.
1918

2019
hostfile:FILE
2120
Assign tasks in order to hosts as they appear in FILE. FILE should
2221
have one or more lines each of which contains a host name or RFC
2322
29 Hostlist string. Each host assigned to the job must appear in
24-
the hostfile and be assigned the same number of tasks as the default
25-
taskmap from the shell. If there are less hosts in the hostfile than
26-
tasks in the job, then the list of hosts will be reused.
23+
the hostfile. If there are fewer hosts in the hostfile than tasks
24+
in the job, then the list of hosts will be reused.
2725

2826
However, shell plugins may provide other task mapping schemes, so
2927
check the current job shell configuration for a full list of supported

src/common/libtaskmap/taskmap.c

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -973,16 +973,6 @@ int taskmap_check (const struct taskmap *old,
973973
"got %d total tasks, expected %d",
974974
ntasks_new,
975975
ntasks_old);
976-
for (int i = 0; i < nnodes_old; i++) {
977-
ntasks_old = taskmap_ntasks (old, i);
978-
ntasks_new = taskmap_ntasks (new, i);
979-
if (ntasks_old != ntasks_new)
980-
return errprintf (errp,
981-
"node %d has %d tasks, expected %d",
982-
i,
983-
ntasks_new,
984-
ntasks_old);
985-
}
986976
return 0;
987977
}
988978

src/common/libtaskmap/taskmap.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,8 @@ int taskmap_nnodes (const struct taskmap *map);
9090
int taskmap_total_ntasks (const struct taskmap *map);
9191

9292
/* Check if 'a' and 'b' taskmaps are compatible, i.e. they
93-
* have equivalent numbers of total tasks, total nodes, and tasks assigned
94-
* to each node.
93+
* have equivalent numbers of total tasks and total nodes.
94+
*
9595
* Returns 0 on success, -1 with error message in 'errp' on failure.
9696
*/
9797
int taskmap_check (const struct taskmap *a,

src/common/libtaskmap/test/taskmap.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -522,12 +522,11 @@ struct check_test check_tests[] = {
522522
{ "[[0,4,4,1]]", "[[0,4,1,4]]", 0, NULL },
523523
{ "[[0,4,4,1]]", "[[0,4,2,2]]", 0, NULL },
524524
{ "[[0,4,4,1]]", "[[0,4,3,1],[0,4,1,1]]", 0, NULL },
525+
{ "[[0,4,4,1]]", "[[0,2,4,1],[2,1,3,1],[3,1,5,1]]", 0, NULL },
525526
{ "[[0,4,4,1]]", "[[0,4,3,1]]", -1,
526527
"got 12 total tasks, expected 16" },
527528
{ "[[0,4,4,1]]", "[[0,2,8,1]]", -1,
528529
"got 2 nodes, expected 4" },
529-
{ "[[0,4,4,1]]", "[[0,2,4,1],[2,1,3,1],[3,1,5,1]]", -1,
530-
"node 2 has 3 tasks, expected 4" },
531530
{ NULL, NULL, 0, NULL },
532531
};
533532

src/shell/info.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,18 @@ int shell_info_set_taskmap (struct shell_info *info,
263263
if (!(taskids = taskmap_taskids (map, info->shell_rank))
264264
|| !(copy = idset_copy (taskids)))
265265
return -1;
266+
267+
/* Update rcalc using the new taskmap.
268+
* Be sure to update the local rankinfo structure, which may have changed.
269+
*/
270+
if (rcalc_update_map (info->rcalc, map) < 0
271+
|| rcalc_get_nth (info->rcalc,
272+
info->shell_rank,
273+
&info->rankinfo) < 0) {
274+
shell_log_error ("unable to update task counts from new taskmap");
275+
return -1;
276+
}
277+
266278
idset_destroy (info->taskids);
267279
info->taskids = copy;
268280
taskmap_destroy (info->taskmap);

src/shell/rcalc.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <unistd.h>
1717
#include <errno.h>
1818
#include <jansson.h>
19+
#include <flux/taskmap.h>
1920

2021
#include "src/common/libczmqcontainers/czmq_containers.h"
2122
#include "src/common/libidset/idset.h"
@@ -52,6 +53,22 @@ struct rcalc {
5253
struct allocinfo *alloc;
5354
};
5455

56+
/* Overwrite the per-rank task counts stored in 'r' with the counts
 * reported by 'map': for each index i in [0, r->nranks), alloc[i].ntasks
 * is set to taskmap_ntasks (map, i).
 *
 * Returns 0 on success. Returns -1 with errno set to EINVAL if either
 * 'r' or 'map' is NULL.
 *
 * NOTE(review): the value returned by taskmap_ntasks() is stored as-is
 * without being checked; confirm whether it can report failure (e.g. when
 * 'map' has fewer nodes than r->nranks) and whether callers validate the
 * map beforehand.
 */
int rcalc_update_map (rcalc_t *r, struct taskmap *map)
57+
{
58+
if (r == NULL || map == NULL) {
59+
errno = EINVAL;
60+
return -1;
61+
}
62+
/* Update task counts for each rank in rcalc based on new taskmap.
63+
* N.B.: ignores alloc[i].ncores_avail since this variable is known
64+
* to not be used after the initial distribution of tasks.
65+
*/
66+
for (int i = 0; i < r->nranks; i++)
67+
r->alloc[i].ntasks = taskmap_ntasks (map, i);
68+
69+
return 0;
70+
}
71+
5572
void rcalc_destroy (rcalc_t *r)
5673
{
5774
if (r == NULL)

src/shell/rcalc.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include <sched.h>
1515
#include <jansson.h>
1616
#include <flux/core.h>
17+
#include <flux/taskmap.h>
1718

1819
typedef struct rcalc rcalc_t;
1920

@@ -61,4 +62,8 @@ int rcalc_get_rankinfo (rcalc_t *r, int rank, struct rcalc_rankinfo *ri);
6162
/* Fill in rcalc_rankinfo for the nth rank in the rcalc_t list */
6263
int rcalc_get_nth (rcalc_t *r, int id, struct rcalc_rankinfo *ri);
6364

65+
/* Update rcalc with a new taskmap
66+
*/
67+
int rcalc_update_map (rcalc_t *r, struct taskmap *map);
68+
6469
#endif /* !SHELL_RCALC_H */

t/t2616-job-shell-taskmap.t

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,15 @@ test_expect_success 'taskmap=manual works' '
136136
| jq -e ".context.taskmap.map == [[1,3,4,1],[0,1,4,1]]" &&
137137
test_check_taskmap $id
138138
'
139+
test_expect_success 'taskmap=manual can redistribute tasks amongst nodes' '
140+
# put 2 extra tasks on rank 0
141+
id=$(flux submit \
142+
--taskmap=manual:"[[0,1,3,1],[1,3,1,1]]" \
143+
-N4 -n6 ./map.sh) &&
144+
flux job wait-event -p exec -f json $id shell.start \
145+
| jq -e ".context.taskmap.map == [[0,1,3,1],[1,3,1,1]]" &&
146+
test_check_taskmap $id
147+
'
139148
test_expect_success 'taskmap=manual with unknown taskmap fails' '
140149
test_must_fail_or_be_terminated \
141150
flux run --taskmap="manual:[]" true
@@ -149,8 +158,7 @@ test_expect_success 'invalid manual taskmaps fail predictably' '
149158
-N4 --tasks-per-node=4 true ::: \
150159
"{}" \
151160
"[[1,3,4,1]]" \
152-
"[[1,3,4,1],[0,1,3,1]]" \
153-
"[[3,1,1,1],[0,3,5,1]]") &&
161+
"[[1,3,4,1],[0,1,3,1]]") &&
154162
for id in $ids; do
155163
test_must_fail_or_be_terminated flux job attach $id
156164
done

0 commit comments

Comments
 (0)