Skip to content

Commit 33a3f53

Browse files
authored
Merge pull request #5721 from grondo/issue#5718
shell: write hwloc XML to a file with `HWLOC_XMLFILE` set with `-o hwloc.xmlfile`
2 parents 96ff0f4 + 71fbedb commit 33a3f53

File tree

10 files changed

+150
-3
lines changed

10 files changed

+150
-3
lines changed

doc/man1/common/job-shell-options.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,6 @@
3636
* - :option:`exit-on-error`
3737
- Raise a fatal job exception immediately if first task exits with
3838
nonzero exit code.
39+
40+
* - :option:`hwloc.xmlfile`
41+
- Write the hwloc XML gathered by the job shell to a file and set ``HWLOC_XMLFILE`` to its path

doc/man1/flux-shell.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,10 @@ plugins include:
438438

439439
Send signal :option:`signal.signum` *TIME* seconds before job expiration.
440440

441+
.. option:: hwloc.xmlfile
442+
443+
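Write the job shell's copy of the hwloc XML to a file named ``hwloc.xml`` (normally under :envvar:`FLUX_JOB_TMPDIR`) and set ``HWLOC_XMLFILE`` to the path of that file in the job environment.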
444+
441445
.. warning::
442446
The directory referenced by :envvar:`FLUX_JOB_TMPDIR` is cleaned up when the
443447
job ends, is guaranteed to be unique, and is generally on fast local storage

doc/test/spell.en.pws

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -823,3 +823,4 @@ aspirational
823823
infeasible
824824
login
825825
workflow
826+
xmlfile

src/shell/Makefile.am

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,8 @@ flux_shell_SOURCES = \
9292
cyclic.c \
9393
signal.c \
9494
files.c \
95-
oom.c
95+
oom.c \
96+
hwloc.c
9697

9798
flux_shell_LDADD = \
9899
$(builddir)/libshell.la \

src/shell/batch.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,11 +97,10 @@ batch_info_create (flux_shell_t *shell, json_t *batch)
9797
shell_debug ("Copying batch script size=%zu for job to %s",
9898
len,
9999
b->script);
100-
if (write_all (fd, data, len) < 0) {
100+
if (write_all (fd, data, len) < 0 || close (fd) < 0) {
101101
shell_log_error ("failed to write batch script");
102102
goto error;
103103
}
104-
close (fd);
105104
}
106105
return b;
107106
error:

src/shell/builtins.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ extern struct shell_builtin builtin_rlimit;
5151
extern struct shell_builtin builtin_cyclic;
5252
extern struct shell_builtin builtin_signal;
5353
extern struct shell_builtin builtin_oom;
54+
extern struct shell_builtin builtin_hwloc;
5455

5556
static struct shell_builtin * builtins [] = {
5657
&builtin_tmpdir,
@@ -74,6 +75,7 @@ static struct shell_builtin * builtins [] = {
7475
&builtin_cyclic,
7576
&builtin_signal,
7677
&builtin_oom,
78+
&builtin_hwloc,
7779
&builtin_list_end,
7880
};
7981

@@ -91,6 +93,10 @@ static int shell_load_builtin (flux_shell_t *shell,
9193
|| flux_plugin_add_handler (p, "shell.reconnect",
9294
sb->reconnect, NULL) < 0
9395
|| flux_plugin_add_handler (p, "shell.init", sb->init, NULL) < 0
96+
|| flux_plugin_add_handler (p,
97+
"shell.post-init",
98+
sb->post_init,
99+
NULL) < 0
94100
|| flux_plugin_add_handler (p, "shell.exit", sb->exit, NULL) < 0
95101
|| flux_plugin_add_handler (p, "task.init", sb->task_init, NULL) < 0
96102
|| flux_plugin_add_handler (p, "task.fork", sb->task_fork, NULL) < 0

src/shell/builtins.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ struct shell_builtin {
2121
flux_plugin_f connect;
2222
flux_plugin_f reconnect;
2323
flux_plugin_f init;
24+
flux_plugin_f post_init;
2425
flux_plugin_f task_init;
2526
flux_plugin_f task_exec;
2627
flux_plugin_f task_fork;

src/shell/hwloc.c

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
/************************************************************\
2+
* Copyright 2024 Lawrence Livermore National Security, LLC
3+
* (c.f. AUTHORS, NOTICE.LLNS, COPYING)
4+
*
5+
* This file is part of the Flux resource manager framework.
6+
* For details, see https://github.com/flux-framework.
7+
*
8+
* SPDX-License-Identifier: LGPL-3.0
9+
\************************************************************/
10+
11+
/* hwloc options handler
12+
*/
13+
#define FLUX_SHELL_PLUGIN_NAME "hwloc"
14+
15+
#if HAVE_CONFIG_H
16+
#include "config.h"
17+
#endif
18+
19+
#include <sys/types.h>
#include <sys/stat.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
23+
24+
#include <flux/core.h>
25+
#include <flux/shell.h>
26+
27+
#include "src/common/libutil/read_all.h"
28+
#include "ccan/str/str.h"
29+
30+
#include "builtins.h"
31+
32+
/* Write the shell's hwloc XML to "<tmpdir>/hwloc.xml" and export the path
 * of that file to the job environment as HWLOC_XMLFILE.
 *
 * <tmpdir> is the value of FLUX_JOB_TMPDIR in the job environment, or
 * "/tmp" when that variable is not set.  The file is opened with
 * O_CREAT|O_EXCL, so a pre-existing file of the same name is an error.
 *
 * Returns 0 on success.  On failure an error is logged, any file created
 * by this call is removed, and -1 is returned with errno left as it was
 * at the point of failure so the caller may report it.
 */
static int create_xmlfile (flux_shell_t *shell)
{
    int rc = -1;
    int fd = -1;
    int created = 0;
    int closed;
    int saved_errno;
    size_t len;
    char *xmlfile = NULL;
    const char *hwloc_xml;
    const char *tmpdir;

    if (!(tmpdir = flux_shell_getenv (shell, "FLUX_JOB_TMPDIR")))
        tmpdir = "/tmp";

    if (flux_shell_get_hwloc_xml (shell, &hwloc_xml) < 0) {
        shell_log_error ("failed to get shell hwloc xml");
        goto out;
    }
    if (asprintf (&xmlfile, "%s/hwloc.xml", tmpdir) < 0) {
        /* The pointer is undefined after a failed asprintf(3); reset it
         * so the free() in the cleanup path is safe.
         */
        xmlfile = NULL;
        shell_log_error ("asprintf HWLOC_XMLFILE failed");
        goto out;
    }
    if ((fd = open (xmlfile, O_CREAT|O_EXCL|O_WRONLY, 0640)) < 0) {
        shell_log_errno ("%s", xmlfile);
        goto out;
    }
    /* From here on the file exists because of this call (O_EXCL), so it
     * is ours to remove if anything below fails.
     */
    created = 1;

    len = strlen (hwloc_xml);
    shell_debug ("Writing %zu bytes to HWLOC_XMLFILE=%s\n", len, xmlfile);
    if (write_all (fd, hwloc_xml, len) < 0) {
        shell_log_errno ("failed to write HWLOC_XMLFILE");
        goto out;
    }
    /* The descriptor must not be closed again whether or not close()
     * reports an error, so forget it before checking the result.
     */
    closed = close (fd);
    fd = -1;
    if (closed < 0) {
        shell_log_errno ("failed to write HWLOC_XMLFILE");
        goto out;
    }
    if (flux_shell_setenvf (shell, 0, "HWLOC_XMLFILE", "%s", xmlfile) < 0) {
        shell_log_errno ("failed to set HWLOC_XMLFILE in job environment");
        goto out;
    }
    rc = 0;
out:
    /* Cleanup must not disturb the errno describing the failure. */
    saved_errno = errno;
    if (fd >= 0)
        close (fd);
    if (rc < 0 && created)
        (void) unlink (xmlfile);
    free (xmlfile);
    errno = saved_errno;
    return rc;
}
72+
73+
static int hwloc_post_init (flux_plugin_t *p,
74+
const char *topic,
75+
flux_plugin_arg_t *args,
76+
void *data)
77+
{
78+
flux_shell_t *shell = flux_plugin_get_shell (p);
79+
int xmlfile = 0;
80+
81+
if (flux_shell_getopt_unpack (shell,
82+
"hwloc",
83+
"{s?i}",
84+
"xmlfile", &xmlfile) < 0)
85+
return shell_log_errno ("failed to unpack hwloc options");
86+
87+
if (xmlfile && create_xmlfile (shell) < 0)
88+
return shell_log_errno ("failed to write HWLOC_XMLFILE");
89+
return 0;
90+
}
91+
92+
/* Builtin descriptor for the "hwloc" shell plugin.  Only the post_init
 * callback is set; all other callback members are left zero-initialized.
 */
struct shell_builtin builtin_hwloc = {
93+
.name = FLUX_SHELL_PLUGIN_NAME,
94+
.post_init = hwloc_post_init,
95+
};
96+
97+
/*
98+
* vi:tabstop=4 shiftwidth=4 expandtab
99+
*/

t/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,7 @@ TESTSCRIPTS = \
207207
t2616-job-shell-taskmap.t \
208208
t2617-job-shell-stage-in.t \
209209
t2618-job-shell-signal.t \
210+
t2619-job-shell-hwloc.t \
210211
t2710-python-cli-submit.t \
211212
t2711-python-cli-run.t \
212213
t2713-python-cli-bulksubmit.t \

t/t2619-job-shell-hwloc.t

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#!/bin/sh
2+
#
3+
test_description='Test flux-shell hwloc.xmlfile'
4+
5+
. `dirname $0`/sharness.sh
6+
7+
# Run the tests below under a test Flux instance.
# NOTE(review): presumably "2 job" means size 2 with the "job" personality;
# the -N2 test below relies on two nodes being available -- confirm.
test_under_flux 2 job
8+
# Clear any inherited values so that HWLOC_XMLFILE is only seen by a job
# when the job shell sets it (presumably the reason for both unsets).
unset HWLOC_XMLFILE
9+
unset FLUX_HWLOC_XMLFILE
10+
11+
# Without the option, printenv must fail because the variable is unset.
test_expect_success 'shell: HWLOC_XMLFILE is not set for normal jobs' '
12+
test_must_fail flux run printenv HWLOC_XMLFILE
13+
'
14+
# With the option, printenv must succeed.
test_expect_success 'shell: -o hwloc.xmlfile sets HWLOC_XMLFILE' '
15+
flux run -o hwloc.xmlfile printenv HWLOC_XMLFILE
16+
'
17+
# The validity check needs hwloc-info; gate it on a prerequisite.
command -v hwloc-info >/dev/null && test_set_prereq HWLOC_INFO
18+
test_expect_success HWLOC_INFO 'shell: -o hwloc.xmlfile HWLOC_XMLFILE is valid' '
19+
flux run -o hwloc.xmlfile sh -c "hwloc-info --input \$HWLOC_XMLFILE"
20+
'
21+
# Split the labeled output into the paths reported by task 0 and task 1;
# the two paths are required to differ.
test_expect_success 'shell: -o hwloc.xmlfile sets HWLOC_XMLFILE per node' '
22+
flux run -N2 --label-io -o hwloc.xmlfile printenv HWLOC_XMLFILE \
23+
> xmlfile.out &&
24+
test_debug "cat xmlfile.out" &&
25+
sed -n "s/^0://p" xmlfile.out >path.0 &&
26+
sed -n "s/^1://p" xmlfile.out >path.1 &&
27+
test_must_fail test_cmp path.0 path.1
28+
'
29+
# A non-integer value for the option must make the job fail.
test_expect_success 'shell: bad hwloc.xmlfile value is an error' '
30+
test_must_fail flux run -o hwloc.xmlfile=foo printenv HWLOC_XMLFILE
31+
'
32+
test_done

0 commit comments

Comments
 (0)