Skip to content

Commit 844b5ef

Browse files
committed
libsubprocess: support user friendly error string
Problem: When a subprocess fails the "failed_errno" value is set and can be returned in flux_subprocess_fail_errno(). It would be convenient if there was an error message to describe the error in more detail. Solution: Internally support a new "failed_error" field which can store a text message describing the error in more detail. Generally speaking, always fill this with a text message when "failed_errno" is also set. Set user friendly error string in many cases throughout libsubprocess. Support a new flux_subprocess_fail_error() to retrieve this error message. Add corner case unit tests. Fixes #5233
1 parent ac7774d commit 844b5ef

File tree

6 files changed

+58
-4
lines changed

6 files changed

+58
-4
lines changed

src/common/libsubprocess/remote.c

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,15 @@
1515
#include <sys/types.h>
1616
#include <wait.h>
1717
#include <unistd.h>
18+
#include <stdarg.h>
1819
#include <errno.h>
1920
#include <assert.h>
2021

2122
#include <flux/core.h>
2223

2324
#include "ccan/str/str.h"
2425
#include "src/common/libczmqcontainers/czmq_containers.h"
26+
#include "src/common/libutil/errprintf.h"
2527
#include "src/common/libutil/log.h"
2628
#include "src/common/libutil/fdwalk.h"
2729
#include "src/common/libutil/macros.h"
@@ -37,6 +39,15 @@
3739

3840
static void remote_kill_nowait (flux_subprocess_t *p, int signum);
3941

42+
static void set_failed (flux_subprocess_t *p, const char *fmt, ...)
43+
{
44+
va_list ap;
45+
va_start (ap, fmt);
46+
verrprintf (&p->failed_error, fmt, ap);
47+
p->failed_errno = errno;
48+
va_end (ap);
49+
}
50+
4051
static void start_channel_watchers (flux_subprocess_t *p)
4152
{
4253
struct subprocess_channel *c;
@@ -169,6 +180,7 @@ static int remote_write (struct subprocess_channel *c)
169180

170181
if (!(ptr = flux_buffer_read (c->write_buffer, -1, &lenp))) {
171182
llog_debug (c->p, "flux_buffer_read: %s", strerror (errno));
183+
set_failed (c->p, "internal buffer read error");
172184
goto error;
173185
}
174186

@@ -192,6 +204,7 @@ static int remote_write (struct subprocess_channel *c)
192204
llog_debug (c->p,
193205
"error sending rexec.write request: %s",
194206
strerror (errno));
207+
set_failed (c->p, "internal write error");
195208
goto error;
196209
}
197210

@@ -215,6 +228,7 @@ static int remote_close (struct subprocess_channel *c)
215228
llog_debug (c->p,
216229
"error sending rexec.write request: %s",
217230
strerror (errno));
231+
set_failed (c->p, "internal close error");
218232
return -1;
219233
}
220234
/* No need to do a "channel_flush", normal io reactor will handle
@@ -254,7 +268,9 @@ static void remote_in_check_cb (flux_reactor_t *r,
254268
return;
255269

256270
error:
257-
c->p->failed_errno = errno;
271+
/* c->p->failed_errno and c->p->failed_error expected to be
272+
* set before this point (typically via set_failed())
273+
*/
258274
process_new_state (c->p, FLUX_SUBPROCESS_FAILED);
259275
remote_kill_nowait (c->p, SIGKILL);
260276
flux_future_destroy (c->p->f);
@@ -520,6 +536,7 @@ static int remote_output (flux_subprocess_t *p,
520536
(int)flux_subprocess_pid (p),
521537
stream);
522538
errno = EPROTO;
539+
set_failed (p, "error buffering unknown channel %s", stream);
523540
return -1;
524541
}
525542

@@ -539,6 +556,7 @@ static int remote_output (flux_subprocess_t *p,
539556
(int)flux_subprocess_pid (p),
540557
stream,
541558
strerror (errno));
559+
set_failed (p, "error buffering %d bytes of data", len);
542560
return -1;
543561
}
544562
}
@@ -572,6 +590,7 @@ static void rexec_continuation (flux_future_t *f, void *arg)
572590
remote_completion (p);
573591
return;
574592
}
593+
set_failed (p, "%s", future_strerror (f, errno));
575594
goto error;
576595
}
577596
if (subprocess_rexec_is_started (f, &p->pid)) {
@@ -592,7 +611,9 @@ static void rexec_continuation (flux_future_t *f, void *arg)
592611
return;
593612

594613
error:
595-
p->failed_errno = errno;
614+
/* c->p->failed_errno and c->p->failed_error expected to be
615+
* set before this point (typically via set_failed())
616+
*/
596617
process_new_state (p, FLUX_SUBPROCESS_FAILED);
597618
remote_kill_nowait (p, SIGKILL);
598619
}

src/common/libsubprocess/server.c

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
#include "src/common/libczmqcontainers/czmq_containers.h"
2121
#include "src/common/libutil/errno_safe.h"
22+
#include "src/common/libutil/errprintf.h"
2223
#include "src/common/libutil/llog.h"
2324
#include "src/common/libioencode/ioencode.h"
2425
#include "ccan/str/str.h"
@@ -136,6 +137,7 @@ static void proc_internal_fatal (flux_subprocess_t *p)
136137
*/
137138
p->state = FLUX_SUBPROCESS_FAILED;
138139
p->failed_errno = errno;
140+
errprintf (&p->failed_error, "internal fatal error: %s", strerror (errno));
139141
state_change_start (p);
140142

141143
/* if we fail here, probably not much can be done */
@@ -177,7 +179,13 @@ static void proc_state_change_cb (flux_subprocess_t *p,
177179
"type", "stopped");
178180
}
179181
else if (state == FLUX_SUBPROCESS_FAILED) {
180-
rc = flux_respond_error (s->h, request, p->failed_errno, NULL);
182+
const char *errmsg = NULL;
183+
if (p->failed_error.text[0] != '\0')
184+
errmsg = p->failed_error.text;
185+
rc = flux_respond_error (s->h,
186+
request,
187+
p->failed_errno,
188+
errmsg);
181189
proc_delete (s, p); // N.B. proc_delete preserves errno
182190
} else {
183191
errno = EPROTO;
@@ -333,8 +341,11 @@ static void server_exec_cb (flux_t *h, flux_msg_handler_t *mh,
333341
&ops,
334342
NULL,
335343
s->llog,
336-
s->llog_data)))
344+
s->llog_data))) {
345+
errprintf (&error, "error launching process: %s", strerror (errno));
346+
errmsg = error.text;
337347
goto error;
348+
}
338349

339350
if (flux_subprocess_aux_set (p,
340351
msgkey,

src/common/libsubprocess/subprocess.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1121,6 +1121,17 @@ int flux_subprocess_fail_errno (flux_subprocess_t *p)
11211121
return p->failed_errno;
11221122
}
11231123

1124+
const char *flux_subprocess_fail_error (flux_subprocess_t *p)
1125+
{
1126+
if (!p)
1127+
return "internal error: subprocess is NULL";
1128+
if (p->state != FLUX_SUBPROCESS_FAILED)
1129+
return "internal error: subprocess is not in FAILED state";
1130+
if (p->failed_error.text[0] == '\0')
1131+
return strerror (p->failed_errno);
1132+
return p->failed_error.text;
1133+
}
1134+
11241135
int flux_subprocess_status (flux_subprocess_t *p)
11251136
{
11261137
if (!p) {

src/common/libsubprocess/subprocess.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,12 @@ int flux_subprocess_rank (flux_subprocess_t *p);
457457
*/
458458
int flux_subprocess_fail_errno (flux_subprocess_t *p);
459459

460+
/* Returns error message describing why FLUX_SUBPROCESS_FAILED state was
461+
* reached. If error message was not set, will return strerror() of
462+
* errno returned from flux_subprocess_fail_errno().
463+
*/
464+
const char *flux_subprocess_fail_error (flux_subprocess_t *p);
465+
460466
/* Returns exit status as returned from wait(2). Works only for
461467
* FLUX_SUBPROCESS_EXITED state. */
462468
int flux_subprocess_status (flux_subprocess_t *p);

src/common/libsubprocess/subprocess_private.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ struct flux_subprocess {
110110
flux_future_t *f; /* primary future reactor */
111111
bool remote_completed; /* if remote has completed */
112112
int failed_errno; /* Holds errno if FAILED state reached */
113+
flux_error_t failed_error; /* Holds detailed message for failed_errno */
113114
int signal_pending; /* signal sent while starting */
114115
};
115116

src/common/libsubprocess/test/subprocess.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,8 @@ void test_corner_cases (flux_reactor_t *r)
173173
ok (flux_subprocess_fail_errno (NULL) < 0
174174
&& errno == EINVAL,
175175
"flux_subprocess_fail_errno fails with NULL pointer input");
176+
ok (flux_subprocess_fail_error (NULL) != NULL,
177+
"flux_subprocess_fail_error works with NULL pointer input");
176178
ok (flux_subprocess_status (NULL) < 0
177179
&& errno == EINVAL,
178180
"flux_subprocess_status fails with NULL pointer input");
@@ -258,6 +260,8 @@ void test_post_exec_errors (flux_reactor_t *r)
258260
"flux_subprocess_rank fails b/c subprocess is local");
259261
ok (flux_subprocess_fail_errno (p) < 0,
260262
"subprocess fail errno fails b/c subprocess not failed");
263+
ok (flux_subprocess_fail_error (p) != NULL,
264+
"subprocess fail error works when subprocess not failed");
261265
ok (flux_subprocess_status (p) < 0,
262266
"subprocess status fails b/c subprocess not yet exited");
263267
ok (flux_subprocess_exit_code (p) < 0,

0 commit comments

Comments
 (0)