Skip to content

Commit 56cd401

Browse files
committed
shell: support hostfile taskmap scheme
Problem: There is currently no simple way to map tasks to specific hosts using a list of hostnames. Add a hostfile taskmap plugin to the job shell. This taskmap scheme assigns tasks to hosts in order as they appear in a file provided by the scheme value. The hostfile is read as a list of newline separated hostnames or RFC 29 Hostlist strings. If there are not as many hosts as tasks in the hostfile, then the list is reused, and if more hosts than tasks appear in the file, then the list is truncated. Like other taskmap schemes, it is an error if the same task count is not assigned to each host as in the default task map. Fixes #5794
1 parent 3d4d4ff commit 56cd401

File tree

3 files changed

+207
-0
lines changed

3 files changed

+207
-0
lines changed

src/shell/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ flux_shell_SOURCES = \
9090
exception.c \
9191
rlimit.c \
9292
taskmap/cyclic.c \
93+
taskmap/hostfile.c \
9394
signal.c \
9495
files.c \
9596
oom.c \

src/shell/builtins.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ extern struct shell_builtin builtin_doom;
4949
extern struct shell_builtin builtin_exception;
5050
extern struct shell_builtin builtin_rlimit;
5151
extern struct shell_builtin builtin_cyclic;
52+
extern struct shell_builtin builtin_hostfile;
5253
extern struct shell_builtin builtin_signal;
5354
extern struct shell_builtin builtin_oom;
5455
extern struct shell_builtin builtin_hwloc;
@@ -74,6 +75,7 @@ static struct shell_builtin * builtins [] = {
7475
&builtin_exception,
7576
&builtin_rlimit,
7677
&builtin_cyclic,
78+
&builtin_hostfile,
7779
&builtin_signal,
7880
&builtin_oom,
7981
&builtin_hwloc,

src/shell/taskmap/hostfile.c

Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
/************************************************************\
2+
* Copyright 2024 Lawrence Livermore National Security, LLC
3+
* (c.f. AUTHORS, NOTICE.LLNS, COPYING)
4+
*
5+
* This file is part of the Flux resource manager framework.
6+
* For details, see https://github.com/flux-framework.
7+
*
8+
* SPDX-License-Identifier: LGPL-3.0
9+
\************************************************************/
10+
11+
/* shell taskmap.hostfile plugin
12+
*
13+
* Read a list of hosts from a file, and assign tasks to hosts in the order
14+
* they are listed.
15+
*/
16+
#define FLUX_SHELL_PLUGIN_NAME "taskmap.hostfile"
17+
18+
#if HAVE_CONFIG_H
19+
#include "config.h"
20+
#endif
21+
#include <stdio.h>
22+
#include <jansson.h>
23+
24+
#include <flux/core.h>
25+
#include <flux/taskmap.h>
26+
#include <flux/hostlist.h>
27+
28+
#include "src/common/libutil/errprintf.h"
29+
30+
#include "builtins.h"
31+
32+
33+
/* Create a taskmap that represents 'ntasks' tasks mapped across a set
34+
* of hosts in 'nodelist', ordered by hostlist 'hl'.
35+
*/
36+
char *taskmap_hostlist (int ntasks,
37+
struct hostlist *nodelist,
38+
struct hostlist *hl,
39+
flux_error_t *errp)
40+
{
41+
struct taskmap *map = NULL;
42+
char *result = NULL;
43+
const char *host = NULL;
44+
45+
if (!(map = taskmap_create ()))
46+
goto error;
47+
48+
/* Loop through hostlist hl until all tasks have been assigned to hosts
49+
*/
50+
while (ntasks > 0) {
51+
int rank;
52+
if (host == NULL)
53+
host = hostlist_first (hl);
54+
if ((rank = hostlist_find (nodelist, host)) < 0) {
55+
errprintf (errp, "host %s not found in job nodelist", host);
56+
goto error;
57+
}
58+
if (taskmap_append (map, rank, 1, 1) < 0) {
59+
errprintf (errp,
60+
"failed to append task to taskmap: %s",
61+
strerror (errno));
62+
goto error;
63+
}
64+
host = hostlist_next (hl);
65+
ntasks--;
66+
}
67+
result = taskmap_encode (map, TASKMAP_ENCODE_WRAPPED);
68+
error:
69+
taskmap_destroy (map);
70+
return result;
71+
}
72+
73+
/* Read the file at 'path' into a new hostlist.
 *
 * Each line has one trailing newline removed and, if non-empty, is passed
 * to hostlist_append(). Empty lines are skipped.
 *
 * Returns the new hostlist (caller must release it with hostlist_destroy())
 * or NULL on failure, after logging the reason. A line rejected by
 * hostlist_append() is treated as a failure, because silently dropping an
 * entry would shift which tasks land on which hosts.
 */
static struct hostlist *hostlist_from_file (const char *path)
{
    ssize_t n;
    size_t size = 0;
    struct hostlist *hl = NULL;
    FILE *fp = NULL;
    char *line = NULL;

    if (!path) {
        shell_log_error ("no hostfile path provided");
        return NULL;
    }
    if (!(fp = fopen (path, "r"))) {
        shell_log_errno ("failed to open hostfile: %s", path);
        goto error;
    }
    if (!(hl = hostlist_create ())) {
        shell_log_errno ("failed to create hostlist");
        goto error;
    }
    while ((n = getline (&line, &size, fp)) != -1) {
        /* getline() returns at least one character on success, so
         * line[n-1] is always a valid index (unlike strlen (line) - 1,
         * which underflows if the line starts with a NUL byte).
         */
        if (line[n-1] == '\n')
            line[n-1] = '\0';
        if (line[0] == '\0')
            continue;
        if (hostlist_append (hl, line) < 0) {
            shell_log_errno ("hostlist_append: %s", line);
            goto error;
        }
    }
    /* getline() returns -1 for both EOF and read errors; tell them apart */
    if (ferror (fp)) {
        shell_log_errno ("error reading hostfile: %s", path);
        goto error;
    }
    fclose (fp);
    free (line);
    return hl;
error:
    if (fp)
        fclose (fp);
    free (line);
    hostlist_destroy (hl);
    return NULL;
}
103+
104+
/* Build a hostlist from the "nodelist" array found under "execution" in
 * the "R" object of the shell info.
 *
 * Every array entry must be a JSON string and is appended to the result
 * in array order.
 *
 * Returns the new hostlist, or NULL on failure.
 */
static struct hostlist *hostlist_from_R (flux_shell_t *shell)
{
    json_t *array;
    json_t *entry;
    size_t index;
    struct hostlist *hosts;

    if (flux_shell_info_unpack (shell,
                                "{s:{s:{s:o}}}",
                                "R",
                                  "execution",
                                    "nodelist", &array) < 0) {
        shell_log_errno ("unable to get job nodelist");
        return NULL;
    }

    hosts = hostlist_create ();
    if (hosts == NULL) {
        shell_log_errno ("hostlist_create");
        return NULL;
    }

    json_array_foreach (array, index, entry) {
        const char *name = json_string_value (entry);

        /* A non-string entry aborts the conversion without a message */
        if (name == NULL) {
            hostlist_destroy (hosts);
            return NULL;
        }
        if (hostlist_append (hosts, name) < 0) {
            shell_log_errno ("hostlist_append %s", name);
            hostlist_destroy (hosts);
            return NULL;
        }
    }
    return hosts;
}
137+
138+
static int map_hostfile (flux_plugin_t *p,
139+
const char *topic,
140+
flux_plugin_arg_t *args,
141+
void *data)
142+
{
143+
flux_shell_t *shell;
144+
int rc = -1;
145+
const char *value = NULL;
146+
struct hostlist *hl = NULL;
147+
struct hostlist *nodelist = NULL;
148+
char *map = NULL;
149+
int ntasks;
150+
flux_error_t error;
151+
152+
if (!(shell = flux_plugin_get_shell (p)))
153+
return -1;
154+
155+
if (flux_plugin_arg_unpack (args,
156+
FLUX_PLUGIN_ARG_IN,
157+
"{s?s}",
158+
"value", &value) < 0) {
159+
shell_log_error ("unpack: %s", flux_plugin_arg_strerror (args));
160+
return -1;
161+
}
162+
163+
if (!(hl = hostlist_from_file (value))
164+
|| !(nodelist = hostlist_from_R (shell))) {
165+
shell_log_error ("failed to get hostlists from file and R");
166+
goto out;
167+
}
168+
if ((ntasks = taskmap_total_ntasks (flux_shell_get_taskmap (shell))) < 0)
169+
shell_log_error ("failed to get ntasks from current shell taskmap");
170+
171+
if (!(map = taskmap_hostlist (ntasks, nodelist, hl, &error))) {
172+
shell_log_error ("failed to map tasks with hostfile:%s: %s",
173+
value,
174+
error.text);
175+
goto out;
176+
}
177+
if (flux_plugin_arg_pack (args,
178+
FLUX_PLUGIN_ARG_OUT,
179+
"{s:s}",
180+
"taskmap", map) < 0) {
181+
shell_log_error ("failed to set new taskmap in plugin output args");
182+
goto out;
183+
}
184+
rc = 0;
185+
out:
186+
free (map);
187+
hostlist_destroy (hl);
188+
hostlist_destroy (nodelist);
189+
return rc;
190+
}
191+
192+
/* Plugin initialization: register map_hostfile() as the handler for the
 * "taskmap.hostfile" topic on plugin 'p'.
 *
 * Returns the result of flux_plugin_add_handler().
 */
static int plugin_init (flux_plugin_t *p)
{
    int rc;

    rc = flux_plugin_add_handler (p,
                                  "taskmap.hostfile",
                                  map_hostfile,
                                  NULL);
    return rc;
}
196+
197+
struct shell_builtin builtin_hostfile = {
198+
.name = FLUX_SHELL_PLUGIN_NAME,
199+
.plugin_init = plugin_init,
200+
};
201+
202+
/*
203+
* vi:tabstop=4 shiftwidth=4 expandtab
204+
*/

0 commit comments

Comments
 (0)