Skip to content

Commit 67fc412

Browse files
authored
Merge pull request #6013 from garlick/rexec_matchtag
libsubprocess: use matchtag instead of pid for flux_subprocess_write()
2 parents 0ddc3d9 + b84e4eb commit 67fc412

File tree

8 files changed

+126
-138
lines changed

8 files changed

+126
-138
lines changed

src/common/libsubprocess/client.c

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ struct rexec_ctx {
4444
json_t *cmd;
4545
int flags;
4646
struct rexec_response response;
47+
uint32_t matchtag;
48+
uint32_t rank;
49+
char *service_name;
4750
};
4851

4952
static void rexec_response_clear (struct rexec_response *resp)
@@ -62,12 +65,16 @@ static void rexec_ctx_destroy (struct rexec_ctx *ctx)
6265
int saved_errno = errno;
6366
rexec_response_clear (&ctx->response);
6467
json_decref (ctx->cmd);
68+
free (ctx->service_name);
6569
free (ctx);
6670
errno = saved_errno;
6771
}
6872
}
6973

70-
static struct rexec_ctx *rexec_ctx_create (flux_cmd_t *cmd, int flags)
74+
static struct rexec_ctx *rexec_ctx_create (flux_cmd_t *cmd,
75+
const char *service_name,
76+
uint32_t rank,
77+
int flags)
7178
{
7279
struct rexec_ctx *ctx;
7380
int valid_flags = SUBPROCESS_REXEC_STDOUT
@@ -80,10 +87,12 @@ static struct rexec_ctx *rexec_ctx_create (flux_cmd_t *cmd, int flags)
8087
}
8188
if (!(ctx = calloc (1, sizeof (*ctx))))
8289
return NULL;
83-
if (!(ctx->cmd = cmd_tojson (cmd)))
90+
if (!(ctx->cmd = cmd_tojson (cmd))
91+
|| !(ctx->service_name = strdup (service_name)))
8492
goto error;
8593
ctx->flags = flags;
8694
ctx->response.pid = -1;
95+
ctx->rank = rank;
8796
return ctx;
8897
error:
8998
rexec_ctx_destroy (ctx);
@@ -106,7 +115,7 @@ flux_future_t *subprocess_rexec (flux_t *h,
106115
}
107116
if (asprintf (&topic, "%s.exec", service_name) < 0)
108117
return NULL;
109-
if (!(ctx = rexec_ctx_create (cmd, flags)))
118+
if (!(ctx = rexec_ctx_create (cmd, service_name, rank, flags)))
110119
goto error;
111120
if (!(f = flux_rpc_pack (h,
112121
topic,
@@ -122,6 +131,7 @@ flux_future_t *subprocess_rexec (flux_t *h,
122131
rexec_ctx_destroy (ctx);
123132
goto error;
124133
}
134+
ctx->matchtag = flux_rpc_get_matchtag (f);
125135
free (topic);
126136
return f;
127137
error:
@@ -223,33 +233,32 @@ bool subprocess_rexec_is_output (flux_future_t *f,
223233
return false;
224234
}
225235

226-
int subprocess_write (flux_t *h,
227-
const char *service_name,
228-
uint32_t rank,
229-
pid_t pid,
236+
int subprocess_write (flux_future_t *f_exec,
230237
const char *stream,
231238
const char *data,
232239
int len,
233240
bool eof)
234241
{
242+
struct rexec_ctx *ctx = flux_future_aux_get (f_exec, "flux::rexec");
243+
flux_t *h = flux_future_get_flux (f_exec);
235244
flux_future_t *f = NULL;
236245
json_t *io;
237246
char *topic;
238247
int rc = -1;
239248

240-
if (!h || pid < 0 || !stream || !service_name) {
249+
if (!stream || !ctx) {
241250
errno = EINVAL;
242251
return -1;
243252
}
244-
if (asprintf (&topic, "%s.write", service_name) < 0)
253+
if (asprintf (&topic, "%s.write", ctx->service_name) < 0)
245254
return -1;
246255
if (!(io = ioencode (stream, "0", data, len, eof))
247256
|| !(f = flux_rpc_pack (h,
248257
topic,
249-
rank,
258+
ctx->rank,
250259
FLUX_RPC_NORESPONSE,
251260
"{s:i s:O}",
252-
"pid", pid,
261+
"matchtag", ctx->matchtag,
253262
"io", io)))
254263
goto out;
255264
rc = 0;

src/common/libsubprocess/client.h

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,7 @@ bool subprocess_rexec_is_output (flux_future_t *f,
4040
int *len,
4141
bool *eof);
4242

43-
int subprocess_write (flux_t *h,
44-
const char *service_name,
45-
uint32_t rank,
46-
pid_t pid,
43+
int subprocess_write (flux_future_t *f,
4744
const char *stream,
4845
const char *data,
4946
int len,

src/common/libsubprocess/remote.c

Lines changed: 0 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -468,48 +468,6 @@ static int remote_output_buffered (flux_subprocess_t *p,
468468
return 0;
469469
}
470470

471-
/* In the event channel had data / closed before process running */
472-
static int send_channel_data (flux_subprocess_t *p)
473-
{
474-
struct subprocess_channel *c;
475-
c = zhash_first (p->channels);
476-
while (c) {
477-
int bytes = fbuf_bytes (c->write_buffer);
478-
if (c->closed || bytes > 0) {
479-
const char *ptr = NULL;
480-
int len = 0;
481-
if (bytes > 0) {
482-
if (!(ptr = fbuf_read (c->write_buffer, -1, &len))) {
483-
llog_debug (p,
484-
"error reading buffered data: %s",
485-
strerror (errno));
486-
set_failed (p, "internal fbuf_read error");
487-
return -1;
488-
}
489-
}
490-
if (subprocess_write (p->h,
491-
p->service_name,
492-
p->rank,
493-
p->pid,
494-
c->name,
495-
ptr,
496-
len,
497-
c->closed) < 0) {
498-
llog_debug (p,
499-
"error sending rexec.write request: %s",
500-
strerror (errno));
501-
set_failed (p, "internal close error");
502-
return -1;
503-
}
504-
/* Don't need this buffer anymore, reclaim the space */
505-
fbuf_destroy (c->write_buffer);
506-
c->write_buffer = NULL;
507-
}
508-
c = zhash_next (p->channels);
509-
}
510-
return 0;
511-
}
512-
513471
static void rexec_continuation (flux_future_t *f, void *arg)
514472
{
515473
flux_subprocess_t *p = arg;
@@ -543,8 +501,6 @@ static void rexec_continuation (flux_future_t *f, void *arg)
543501
if (subprocess_rexec_is_started (f, &p->pid)) {
544502
p->pid_set = true;
545503
process_new_state (p, FLUX_SUBPROCESS_RUNNING);
546-
if (send_channel_data (p) < 0)
547-
goto error;
548504
}
549505
else if (subprocess_rexec_is_stopped (f)) {
550506
process_new_state (p, FLUX_SUBPROCESS_STOPPED);

src/common/libsubprocess/server.c

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,30 @@ static flux_subprocess_t *proc_find_bypid (subprocess_server_t *s, pid_t pid)
107107
return NULL;
108108
}
109109

110+
/* Find the subprocess whose <service>.exec request has the same sender as
111+
* 'request' and the same matchtag as specified in request's matchtag field.
112+
* N.B. flux_cancel_match() happens to be helpful because RFC 42 subprocess
113+
* write works like RFC 6 cancel.
114+
*/
115+
static flux_subprocess_t *proc_find_byclient (subprocess_server_t *s,
116+
const flux_msg_t *request)
117+
{
118+
flux_subprocess_t *p;
119+
120+
p = zlistx_first (s->subprocesses);
121+
while (p) {
122+
const flux_msg_t *msg;
123+
124+
if ((msg = flux_subprocess_aux_get (p, msgkey))
125+
&& flux_cancel_match (request, msg))
126+
return p;
127+
p = zlistx_next (s->subprocesses);
128+
}
129+
errno = ESRCH;
130+
return NULL;
131+
}
132+
133+
110134
static void proc_completion_cb (flux_subprocess_t *p)
111135
{
112136
subprocess_server_t *s = flux_subprocess_aux_get (p, srvkey);
@@ -390,14 +414,14 @@ static void server_write_cb (flux_t *h,
390414
char *data = NULL;
391415
int len = 0;
392416
bool eof = false;
393-
pid_t pid;
417+
int matchtag;
394418
json_t *io = NULL;
395419
flux_error_t error;
396420

397421
if (flux_request_unpack (msg,
398422
NULL,
399423
"{ s:i s:o }",
400-
"pid", &pid,
424+
"matchtag", &matchtag,
401425
"io", &io) < 0
402426
|| iodecode (io, &stream, NULL, &data, &len, &eof) < 0) {
403427
llog_error (s,
@@ -415,27 +439,24 @@ static void server_write_cb (flux_t *h,
415439
* in flight, and is not necessarily an error, and can be common enough
416440
* that the log messages end up being a nuisance.
417441
*/
418-
if (!(p = proc_find_bypid (s, pid))
419-
|| p->state != FLUX_SUBPROCESS_RUNNING)
442+
if (!(p = proc_find_byclient (s, msg))
443+
|| p->state == FLUX_SUBPROCESS_FAILED
444+
|| p->state == FLUX_SUBPROCESS_EXITED)
420445
goto out;
421446

422447
if (data && len) {
423448
int rc = flux_subprocess_write (p, stream, data, len);
424449
if (rc < 0) {
425450
llog_error (s,
426-
"Error writing %d bytes to subprocess pid %d %s",
451+
"Error writing %d bytes to subprocess %s",
427452
len,
428-
(int)pid,
429453
stream);
430454
goto error;
431455
}
432456
}
433457
if (eof) {
434458
if (flux_subprocess_close (p, stream) < 0) {
435-
llog_error (s,
436-
"Error writing EOF to subprocess pid %d %s",
437-
(int)pid,
438-
stream);
459+
llog_error (s, "Error writing EOF to subprocess %s", stream);
439460
goto error;
440461
}
441462
}

src/common/libsubprocess/subprocess.c

Lines changed: 10 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,6 @@ void channel_destroy (void *arg)
5454
flux_watcher_destroy (c->buffer_read_stopped_w);
5555
c->buffer_read_w_started = false;
5656

57-
fbuf_destroy (c->write_buffer);
5857
fbuf_destroy (c->read_buffer);
5958
flux_watcher_destroy (c->out_prep_w);
6059
flux_watcher_destroy (c->out_idle_w);
@@ -707,42 +706,12 @@ int flux_subprocess_write (flux_subprocess_t *p,
707706
errno = EPIPE;
708707
return -1;
709708
}
710-
if (p->state == FLUX_SUBPROCESS_INIT) {
711-
if (!c->write_buffer) {
712-
int buffer_size;
713-
if ((buffer_size = cmd_option_bufsize (p, stream)) < 0) {
714-
log_err ("cmd_option_bufsize: %s", strerror (errno));
715-
return -1;
716-
}
717-
if (!(c->write_buffer = fbuf_create (buffer_size))) {
718-
log_err ("fbuf_create");
719-
return -1;
720-
}
721-
}
722-
if (fbuf_space (c->write_buffer) < len) {
723-
errno = ENOSPC;
724-
return -1;
725-
}
726-
if ((ret = fbuf_write (c->write_buffer, buf, len)) < 0) {
727-
log_err ("fbuf_write");
728-
return -1;
729-
}
730-
}
731-
else { /* p->state == FLUX_SUBPROCESS_RUNNING */
732-
if (subprocess_write (p->h,
733-
p->service_name,
734-
p->rank,
735-
p->pid,
736-
c->name,
737-
buf,
738-
len,
739-
false) < 0) {
740-
log_err ("error sending rexec.write request: %s",
741-
strerror (errno));
742-
return -1;
743-
}
744-
ret = len;
709+
if (subprocess_write (p->f, c->name, buf, len, false) < 0) {
710+
log_err ("error sending rexec.write request: %s",
711+
strerror (errno));
712+
return -1;
745713
}
714+
ret = len;
746715
}
747716

748717
return ret;
@@ -780,26 +749,12 @@ int flux_subprocess_close (flux_subprocess_t *p, const char *stream)
780749
c->closed = true;
781750
}
782751
else {
783-
/* if process isn't running, eof plus any previously written
784-
* data will be sent after process converts to running. See
785-
* send_channel_data() in remote.c. If subprocess has already
786-
* exited, this does nothing.
787-
*/
788-
c->closed = true;
789-
if (p->state == FLUX_SUBPROCESS_RUNNING) {
790-
if (subprocess_write (p->h,
791-
p->service_name,
792-
p->rank,
793-
p->pid,
794-
c->name,
795-
NULL,
796-
0,
797-
true) < 0) {
798-
log_err ("error sending rexec.write request: %s",
799-
strerror (errno));
800-
return -1;
801-
}
752+
if (subprocess_write (p->f, c->name, NULL, 0, true) < 0) {
753+
log_err ("error sending rexec.write request: %s",
754+
strerror (errno));
755+
return -1;
802756
}
757+
c->closed = true;
803758
}
804759

805760
return 0;

src/common/libsubprocess/subprocess_private.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@ struct subprocess_channel {
4545
bool buffer_read_w_started;
4646

4747
/* remote */
48-
struct fbuf *write_buffer; /* buffer pre-running data */
4948
struct fbuf *read_buffer;
5049
bool read_eof_received;
5150
flux_watcher_t *out_prep_w;

src/common/libsubprocess/test/iostress.c

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ static void iostress_state_cb (flux_subprocess_t *p,
120120
}
121121
}
122122

123-
static int rexec_write (flux_t *h, pid_t pid, const char *buf, int len)
123+
static int rexec_write (flux_t *h, uint32_t matchtag, const char *buf, int len)
124124
{
125125
flux_future_t *f;
126126
json_t *io;
@@ -133,7 +133,7 @@ static int rexec_write (flux_t *h, pid_t pid, const char *buf, int len)
133133
0,
134134
FLUX_RPC_NORESPONSE,
135135
"{s:i s:O}",
136-
"pid", pid,
136+
"matchtag", matchtag,
137137
"io", io))) {
138138
json_decref (io);
139139
return -1;
@@ -150,15 +150,18 @@ static void iostress_source_cb (flux_reactor_t *r,
150150
{
151151
struct iostress_ctx *ctx = arg;
152152
char *buf;
153+
uint32_t matchtag;
153154

154155
if (!(buf = malloc (ctx->linesize)))
155156
BAIL_OUT ("out of memory");
156157
memset (buf, 'F', ctx->linesize - 1);
157158
buf[ctx->linesize - 1] = '\n';
158159

160+
matchtag = flux_rpc_get_matchtag (ctx->p->f);
161+
159162
for (int i = 0; i < ctx->batchlines; i++) {
160163
if (ctx->direct) {
161-
if (rexec_write (ctx->h, ctx->pid, buf, ctx->linesize) < 0)
164+
if (rexec_write (ctx->h, matchtag, buf, ctx->linesize) < 0)
162165
BAIL_OUT ("rexec_write failed");
163166
}
164167
else {

0 commit comments

Comments
 (0)