Skip to content

Commit 0ec34e0

Browse files
authored
Merge pull request #5294 from chu11/issue5233_libsubprocess_human_readable_errors
libsubprocess: support user friendly error string
2 parents b20460a + d7d15e7 commit 0ec34e0

File tree

9 files changed

+278
-103
lines changed

9 files changed

+278
-103
lines changed

src/cmd/flux-exec.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,12 +157,17 @@ void state_cb (flux_subprocess_t *p, flux_subprocess_state_t state)
157157
if (state == FLUX_SUBPROCESS_FAILED) {
158158
flux_cmd_t *cmd = flux_subprocess_get_cmd (p);
159159
int errnum = flux_subprocess_fail_errno (p);
160+
const char *errmsg = flux_subprocess_fail_error (p);
160161
int ec = 1;
161162

163+
/* N.B. if no error message available from
164+
* flux_subprocess_fail_error(), errmsg is set to strerror of
165+
* subprocess errno.
166+
*/
162167
log_msg ("Error: rank %d: %s: %s",
163168
flux_subprocess_rank (p),
164169
flux_cmd_arg (cmd, 0),
165-
strerror (errnum));
170+
errmsg);
166171

167172
/* bash standard, 126 for permission/access denied, 127 for
168173
* command not found. 68 (EX_NOHOST) for No route to host.

src/common/libsubprocess/fork.c

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,12 @@ static int local_release_child (flux_subprocess_t *p)
248248

249249
static int local_exec (flux_subprocess_t *p)
250250
{
251-
if ((p->exec_failed_errno = local_release_child (p)) != 0) {
251+
int ret;
252+
/* N.B. We don't set p->failed_errno here, if locally launched via
253+
* flux_local_exec(), will return -1 and errno to caller. If
254+
* called via server, p->failed_errno will be set by remote
255+
* handler. */
256+
if ((ret = local_release_child (p)) != 0) {
252257
/*
253258
* Reap child immediately. Expectation from caller is that
254259
* failure to exec will not require subsequent reaping of
@@ -260,7 +265,7 @@ static int local_exec (flux_subprocess_t *p)
260265
return -1;
261266
p->status = status;
262267

263-
errno = p->exec_failed_errno;
268+
errno = ret;
264269
return -1;
265270
}
266271
return 0;

src/common/libsubprocess/remote.c

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,15 @@
1515
#include <sys/types.h>
1616
#include <wait.h>
1717
#include <unistd.h>
18+
#include <stdarg.h>
1819
#include <errno.h>
1920
#include <assert.h>
2021

2122
#include <flux/core.h>
2223

2324
#include "ccan/str/str.h"
2425
#include "src/common/libczmqcontainers/czmq_containers.h"
26+
#include "src/common/libutil/errprintf.h"
2527
#include "src/common/libutil/log.h"
2628
#include "src/common/libutil/fdwalk.h"
2729
#include "src/common/libutil/macros.h"
@@ -37,6 +39,15 @@
3739

3840
static void remote_kill_nowait (flux_subprocess_t *p, int signum);
3941

42+
static void set_failed (flux_subprocess_t *p, const char *fmt, ...)
43+
{
44+
va_list ap;
45+
va_start (ap, fmt);
46+
verrprintf (&p->failed_error, fmt, ap);
47+
p->failed_errno = errno;
48+
va_end (ap);
49+
}
50+
4051
static void start_channel_watchers (flux_subprocess_t *p)
4152
{
4253
struct subprocess_channel *c;
@@ -169,6 +180,7 @@ static int remote_write (struct subprocess_channel *c)
169180

170181
if (!(ptr = flux_buffer_read (c->write_buffer, -1, &lenp))) {
171182
llog_debug (c->p, "flux_buffer_read: %s", strerror (errno));
183+
set_failed (c->p, "internal buffer read error");
172184
goto error;
173185
}
174186

@@ -192,6 +204,7 @@ static int remote_write (struct subprocess_channel *c)
192204
llog_debug (c->p,
193205
"error sending rexec.write request: %s",
194206
strerror (errno));
207+
set_failed (c->p, "internal write error");
195208
goto error;
196209
}
197210

@@ -215,6 +228,7 @@ static int remote_close (struct subprocess_channel *c)
215228
llog_debug (c->p,
216229
"error sending rexec.write request: %s",
217230
strerror (errno));
231+
set_failed (c->p, "internal close error");
218232
return -1;
219233
}
220234
/* No need to do a "channel_flush", normal io reactor will handle
@@ -254,7 +268,9 @@ static void remote_in_check_cb (flux_reactor_t *r,
254268
return;
255269

256270
error:
257-
c->p->failed_errno = errno;
271+
/* c->p->failed_errno and c->p->failed_error expected to be
272+
* set before this point (typically via set_failed())
273+
*/
258274
process_new_state (c->p, FLUX_SUBPROCESS_FAILED);
259275
remote_kill_nowait (c->p, SIGKILL);
260276
flux_future_destroy (c->p->f);
@@ -520,6 +536,7 @@ static int remote_output (flux_subprocess_t *p,
520536
(int)flux_subprocess_pid (p),
521537
stream);
522538
errno = EPROTO;
539+
set_failed (p, "error buffering unknown channel %s", stream);
523540
return -1;
524541
}
525542

@@ -539,6 +556,7 @@ static int remote_output (flux_subprocess_t *p,
539556
(int)flux_subprocess_pid (p),
540557
stream,
541558
strerror (errno));
559+
set_failed (p, "error buffering %d bytes of data", len);
542560
return -1;
543561
}
544562
}
@@ -572,6 +590,7 @@ static void rexec_continuation (flux_future_t *f, void *arg)
572590
remote_completion (p);
573591
return;
574592
}
593+
set_failed (p, "%s", future_strerror (f, errno));
575594
goto error;
576595
}
577596
if (subprocess_rexec_is_started (f, &p->pid)) {
@@ -592,7 +611,9 @@ static void rexec_continuation (flux_future_t *f, void *arg)
592611
return;
593612

594613
error:
595-
p->failed_errno = errno;
614+
/* c->p->failed_errno and c->p->failed_error expected to be
615+
* set before this point (typically via set_failed())
616+
*/
596617
process_new_state (p, FLUX_SUBPROCESS_FAILED);
597618
remote_kill_nowait (p, SIGKILL);
598619
}

src/common/libsubprocess/server.c

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
#include "src/common/libczmqcontainers/czmq_containers.h"
2121
#include "src/common/libutil/errno_safe.h"
22+
#include "src/common/libutil/errprintf.h"
2223
#include "src/common/libutil/llog.h"
2324
#include "src/common/libioencode/ioencode.h"
2425
#include "ccan/str/str.h"
@@ -136,6 +137,7 @@ static void proc_internal_fatal (flux_subprocess_t *p)
136137
*/
137138
p->state = FLUX_SUBPROCESS_FAILED;
138139
p->failed_errno = errno;
140+
errprintf (&p->failed_error, "internal fatal error: %s", strerror (errno));
139141
state_change_start (p);
140142

141143
/* if we fail here, probably not much can be done */
@@ -177,7 +179,13 @@ static void proc_state_change_cb (flux_subprocess_t *p,
177179
"type", "stopped");
178180
}
179181
else if (state == FLUX_SUBPROCESS_FAILED) {
180-
rc = flux_respond_error (s->h, request, p->failed_errno, NULL);
182+
const char *errmsg = NULL;
183+
if (p->failed_error.text[0] != '\0')
184+
errmsg = p->failed_error.text;
185+
rc = flux_respond_error (s->h,
186+
request,
187+
p->failed_errno,
188+
errmsg);
181189
proc_delete (s, p); // N.B. proc_delete preserves errno
182190
} else {
183191
errno = EPROTO;
@@ -333,8 +341,11 @@ static void server_exec_cb (flux_t *h, flux_msg_handler_t *mh,
333341
&ops,
334342
NULL,
335343
s->llog,
336-
s->llog_data)))
344+
s->llog_data))) {
345+
errprintf (&error, "error launching process: %s", strerror (errno));
346+
errmsg = error.text;
337347
goto error;
348+
}
338349

339350
if (flux_subprocess_aux_set (p,
340351
msgkey,

src/common/libsubprocess/subprocess.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1121,6 +1121,17 @@ int flux_subprocess_fail_errno (flux_subprocess_t *p)
11211121
return p->failed_errno;
11221122
}
11231123

1124+
const char *flux_subprocess_fail_error (flux_subprocess_t *p)
1125+
{
1126+
if (!p)
1127+
return "internal error: subprocess is NULL";
1128+
if (p->state != FLUX_SUBPROCESS_FAILED)
1129+
return "internal error: subprocess is not in FAILED state";
1130+
if (p->failed_error.text[0] == '\0')
1131+
return strerror (p->failed_errno);
1132+
return p->failed_error.text;
1133+
}
1134+
11241135
int flux_subprocess_status (flux_subprocess_t *p)
11251136
{
11261137
if (!p) {

src/common/libsubprocess/subprocess.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -453,10 +453,16 @@ const char *flux_subprocess_state_string (flux_subprocess_state_t state);
453453

454454
int flux_subprocess_rank (flux_subprocess_t *p);
455455

456-
/* Returns the errno causing the FLUX_SUBPROCESS_FAILED states to be reached.
456+
/* Returns the errno causing the FLUX_SUBPROCESS_FAILED state to be reached.
457457
*/
458458
int flux_subprocess_fail_errno (flux_subprocess_t *p);
459459

460+
/* Returns error message describing why FLUX_SUBPROCESS_FAILED state was
461+
* reached. If error message was not set, will return strerror() of
462+
* errno returned from flux_subprocess_fail_errno().
463+
*/
464+
const char *flux_subprocess_fail_error (flux_subprocess_t *p);
465+
460466
/* Returns exit status as returned from wait(2). Works only for
461467
* FLUX_SUBPROCESS_EXITED state. */
462468
int flux_subprocess_status (flux_subprocess_t *p);

src/common/libsubprocess/subprocess_private.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,6 @@ struct flux_subprocess {
8484
int channels_eof_sent; /* counter to avoid loop checks */
8585

8686
int status; /* Raw status from waitpid(2), valid if exited */
87-
int exec_failed_errno; /* Holds errno from exec(2) if exec() failed */
8887

8988
flux_subprocess_state_t state;
9089
flux_subprocess_state_t state_reported; /* for on_state_change */
@@ -111,6 +110,7 @@ struct flux_subprocess {
111110
flux_future_t *f; /* primary future reactor */
112111
bool remote_completed; /* if remote has completed */
113112
int failed_errno; /* Holds errno if FAILED state reached */
113+
flux_error_t failed_error; /* Holds detailed message for failed_errno */
114114
int signal_pending; /* signal sent while starting */
115115
};
116116

0 commit comments

Comments
 (0)