Skip to content

Commit 80c8a69

Browse files
authored
Merge pull request #5213 from chu11/issue2864_idsync_state
job-list: allow list-id to wait for job state
2 parents 7693cd7 + 75a64be commit 80c8a69

File tree

11 files changed

+454
-307
lines changed

11 files changed

+454
-307
lines changed

src/cmd/flux-job.c

Lines changed: 28 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -145,6 +145,13 @@ static struct optparse_option list_inactive_opts[] = {
145145
OPTPARSE_TABLE_END
146146
};
147147

148+
static struct optparse_option list_ids_opts[] = {
149+
{ .name = "wait-state", .key = 'W', .has_arg = 1, .arginfo = "STATE",
150+
.usage = "Return only after jobid has reached specified state",
151+
},
152+
OPTPARSE_TABLE_END
153+
};
154+
148155
static struct optparse_option urgency_opts[] = {
149156
{ .name = "verbose", .key = 'v', .has_arg = 0,
150157
.usage = "Output old urgency value on success",
@@ -442,7 +449,7 @@ static struct optparse_subcommand subcommands[] = {
442449
"List job(s) by id",
443450
cmd_list_ids,
444451
OPTPARSE_SUBCMD_HIDDEN,
445-
NULL,
452+
list_ids_opts,
446453
},
447454
{ "urgency",
448455
"[OPTIONS] id urgency",
@@ -1380,6 +1387,8 @@ int cmd_list_ids (optparse_t *p, int argc, char **argv)
13801387
int optindex = optparse_option_index (p);
13811388
flux_t *h;
13821389
int i, ids_len;
1390+
flux_job_state_t state;
1391+
const char *state_str;
13831392

13841393
if (isatty (STDOUT_FILENO)) {
13851394
fprintf (stderr,
@@ -1394,12 +1403,28 @@ int cmd_list_ids (optparse_t *p, int argc, char **argv)
13941403
if (!(h = flux_open (NULL, 0)))
13951404
log_err_exit ("flux_open");
13961405

1406+
/* If no job state is specified by the user, default to "depend", the
1407+
* first job state, so that the request returns as soon as the job-list
1408+
* module is aware of the job.
1409+
*/
1410+
state_str = optparse_get_str (p, "wait-state", "depend");
1411+
if (flux_job_strtostate (state_str, &state) < 0)
1412+
log_msg_exit ("invalid job state specified");
1413+
13971414
ids_len = argc - optindex;
13981415
for (i = 0; i < ids_len; i++) {
13991416
flux_jobid_t id = parse_jobid (argv[optindex + i]);
14001417
flux_future_t *f;
1401-
if (!(f = flux_job_list_id (h, id, "[\"all\"]")))
1402-
log_err_exit ("flux_job_list_id");
1418+
if (!(f = flux_rpc_pack (h,
1419+
"job-list.list-id",
1420+
FLUX_NODEID_ANY,
1421+
0,
1422+
"{s:I s:[s] s:i}",
1423+
"id", id,
1424+
"attrs",
1425+
"all",
1426+
"state", state)))
1427+
log_err_exit ("flux_rpc_pack");
14031428
if (flux_future_then (f, -1, list_id_continuation, NULL) < 0)
14041429
log_err_exit ("flux_future_then");
14051430
}

src/modules/job-list/idsync.c

Lines changed: 55 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,12 @@
2222
#include "idsync.h"
2323
#include "job_util.h"
2424

25+
/* Used in waits hash, need to store job id within data structure for lookup */
26+
struct idsync_wait_list {
27+
zlistx_t *l;
28+
flux_jobid_t id;
29+
};
30+
2531
void idsync_data_destroy (void *data)
2632
{
2733
if (data) {
@@ -48,6 +54,7 @@ static struct idsync_data *idsync_data_create (flux_t *h,
4854
flux_jobid_t id,
4955
const flux_msg_t *msg,
5056
json_t *attrs,
57+
flux_job_state_t state,
5158
flux_future_t *f_lookup)
5259
{
5360
struct idsync_data *isd = NULL;
@@ -60,6 +67,7 @@ static struct idsync_data *idsync_data_create (flux_t *h,
6067
if (!(isd->msg = flux_msg_copy (msg, false)))
6168
goto error;
6269
isd->attrs = json_incref (attrs);
70+
isd->state = state;
6371
isd->f_lookup = f_lookup;
6472
return isd;
6573

@@ -70,10 +78,15 @@ static struct idsync_data *idsync_data_create (flux_t *h,
7078
return NULL;
7179
}
7280

73-
static void idsync_waits_list_destroy (void **data)
81+
static void idsync_wait_list_destroy (void **data)
7482
{
75-
if (data)
76-
zlistx_destroy ((zlistx_t **) data);
83+
if (data) {
84+
struct idsync_wait_list *iwl = *data;
85+
if (iwl) {
86+
zlistx_destroy (&iwl->l);
87+
free (iwl);
88+
}
89+
}
7790
}
7891

7992
struct idsync_ctx *idsync_ctx_create (flux_t *h)
@@ -93,7 +106,7 @@ struct idsync_ctx *idsync_ctx_create (flux_t *h)
93106
if (!(isctx->waits = job_hash_create ()))
94107
goto error;
95108

96-
zhashx_set_destructor (isctx->waits, idsync_waits_list_destroy);
109+
zhashx_set_destructor (isctx->waits, idsync_wait_list_destroy);
97110

98111
return isctx;
99112

@@ -126,7 +139,8 @@ void idsync_ctx_destroy (struct idsync_ctx *isctx)
126139
struct idsync_data *idsync_check_id_valid (struct idsync_ctx *isctx,
127140
flux_jobid_t id,
128141
const flux_msg_t *msg,
129-
json_t *attrs)
142+
json_t *attrs,
143+
flux_job_state_t state)
130144
{
131145
flux_future_t *f = NULL;
132146
struct idsync_data *isd = NULL;
@@ -142,7 +156,7 @@ struct idsync_data *idsync_check_id_valid (struct idsync_ctx *isctx,
142156
goto error;
143157
}
144158

145-
if (!(isd = idsync_data_create (isctx->h, id, msg, attrs, f)))
159+
if (!(isd = idsync_data_create (isctx->h, id, msg, attrs, state, f)))
146160
goto error;
147161

148162
/* future now owned by struct idsync_data */
@@ -174,26 +188,31 @@ void idsync_check_id_valid_cleanup (struct idsync_ctx *isctx,
174188
static int idsync_add_waiter (struct idsync_ctx *isctx,
175189
struct idsync_data *isd)
176190
{
177-
zlistx_t *list_isd;
191+
struct idsync_wait_list *iwl = NULL;
178192

179193
/* isctx->waits holds lists of ids waiting on, b/c multiple callers
180194
* could wait on same id */
181-
if (!(list_isd = zhashx_lookup (isctx->waits, &isd->id))) {
182-
if (!(list_isd = zlistx_new ()))
195+
if (!(iwl = zhashx_lookup (isctx->waits, &isd->id))) {
196+
iwl = calloc (1, sizeof (*iwl));
197+
if (!iwl)
183198
goto enomem;
184-
zlistx_set_destructor (list_isd, idsync_data_destroy_wrapper);
185199

186-
if (zhashx_insert (isctx->waits, &isd->id, list_isd) < 0)
200+
if (!(iwl->l = zlistx_new ()))
201+
goto enomem;
202+
zlistx_set_destructor (iwl->l, idsync_data_destroy_wrapper);
203+
iwl->id = isd->id;
204+
205+
if (zhashx_insert (isctx->waits, &iwl->id, iwl) < 0)
187206
goto enomem;
188207
}
189208

190-
if (!zlistx_add_end (list_isd, isd))
209+
if (!zlistx_add_end (iwl->l, isd))
191210
goto enomem;
192211

193212
return 0;
194213

195214
enomem:
196-
idsync_data_destroy (isd);
215+
idsync_wait_list_destroy ((void **)&iwl);
197216
errno = ENOMEM;
198217
return -1;
199218
}
@@ -215,11 +234,12 @@ int idsync_wait_valid (struct idsync_ctx *isctx, struct idsync_data *isd)
215234
int idsync_wait_valid_id (struct idsync_ctx *isctx,
216235
flux_jobid_t id,
217236
const flux_msg_t *msg,
218-
json_t *attrs)
237+
json_t *attrs,
238+
flux_job_state_t state)
219239
{
220240
struct idsync_data *isd = NULL;
221241

222-
if (!(isd = idsync_data_create (isctx->h, id, msg, attrs, NULL)))
242+
if (!(isd = idsync_data_create (isctx->h, id, msg, attrs, state, NULL)))
223243
return -1;
224244

225245
return idsync_add_waiter (isctx, isd);
@@ -248,16 +268,30 @@ static void idsync_data_respond (struct idsync_ctx *isctx,
248268

249269
void idsync_check_waiting_id (struct idsync_ctx *isctx, struct job *job)
250270
{
251-
zlistx_t *list_isd;
271+
struct idsync_wait_list *iwl;
252272

253-
if ((list_isd = zhashx_lookup (isctx->waits, &job->id))) {
273+
if ((iwl = zhashx_lookup (isctx->waits, &job->id))) {
254274
struct idsync_data *isd;
255-
isd = zlistx_first (list_isd);
275+
isd = zlistx_first (iwl->l);
256276
while (isd) {
257-
idsync_data_respond (isctx, isd, job);
258-
isd = zlistx_next (list_isd);
277+
/* Some job states can be missed. For example a job that
278+
* is canceled before it runs will never reach the
279+
* FLUX_JOB_STATE_RUN state. To ensure jobs waiting on
280+
* states that are missed will eventually get a response, always
281+
* respond once the job has reached the inactive state.
282+
*/
283+
if (!isd->state
284+
|| (isd->state & job->states_mask)
285+
|| (isd->state && job->state == FLUX_JOB_STATE_INACTIVE)) {
286+
struct idsync_data *tmp;
287+
idsync_data_respond (isctx, isd, job);
288+
tmp = zlistx_detach_cur (iwl->l);
289+
idsync_data_destroy (tmp);
290+
}
291+
isd = zlistx_next (iwl->l);
259292
}
260-
zhashx_delete (isctx->waits, &job->id);
293+
if (!zlistx_size (iwl->l))
294+
zhashx_delete (isctx->waits, &job->id);
261295
}
262296
}
263297

src/modules/job-list/idsync.h

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@ struct idsync_data {
2828
flux_jobid_t id;
2929
flux_msg_t *msg;
3030
json_t *attrs;
31+
flux_job_state_t state;
3132

3233
flux_future_t *f_lookup;
3334
};
@@ -45,7 +46,8 @@ void idsync_data_destroy (void *data);
4546
struct idsync_data *idsync_check_id_valid (struct idsync_ctx *isctx,
4647
flux_jobid_t id,
4748
const flux_msg_t *msg,
48-
json_t *attrs);
49+
json_t *attrs,
50+
flux_job_state_t state);
4951

5052

5153
/* free / cleanup 'struct idsync_data' after
@@ -65,7 +67,8 @@ int idsync_wait_valid (struct idsync_ctx *isctx, struct idsync_data *isd);
6567
int idsync_wait_valid_id (struct idsync_ctx *isctx,
6668
flux_jobid_t id,
6769
const flux_msg_t *msg,
68-
json_t *attrs);
70+
json_t *attrs,
71+
flux_job_state_t state);
6972

7073
/* check if 'job' is in waits list, if so respond to original
7174
* message */

src/modules/job-list/job_state.c

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -267,6 +267,8 @@ static void update_job_state_and_list (struct job_state_ctx *jsctx,
267267
zlistx_reorder (jsctx->pending,
268268
job->list_handle,
269269
search_direction (job));
270+
271+
idsync_check_waiting_id (jsctx->isctx, job);
270272
}
271273

272274
static void state_depend_lookup_continuation (flux_future_t *f, void *arg)
@@ -291,7 +293,6 @@ static void state_depend_lookup_continuation (flux_future_t *f, void *arg)
291293
st = zlist_head (job->next_states);
292294
assert (st);
293295
update_job_state_and_list (jsctx, job, st->state, st->timestamp);
294-
idsync_check_waiting_id (jsctx->isctx, job);
295296
zlist_remove (job->next_states, st);
296297
process_next_state (jsctx, job);
297298

src/modules/job-list/list.c

Lines changed: 27 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@ json_t *get_job_by_id (struct job_state_ctx *jsctx,
3232
const flux_msg_t *msg,
3333
flux_jobid_t id,
3434
json_t *attrs,
35+
flux_job_state_t state,
3536
bool *stall);
3637

3738
/* Filter test to determine if job desired by caller */
@@ -294,7 +295,7 @@ void check_id_valid_continuation (flux_future_t *f, void *arg)
294295
else {
295296
json_t *o;
296297
if (!(o = get_job_by_id (jsctx, NULL, isd->msg,
297-
isd->id, isd->attrs, NULL))) {
298+
isd->id, isd->attrs, isd->state, NULL))) {
298299
flux_log_error (jsctx->h, "%s: get_job_by_id", __FUNCTION__);
299300
goto cleanup;
300301
}
@@ -314,14 +315,16 @@ void check_id_valid_continuation (flux_future_t *f, void *arg)
314315
int check_id_valid (struct job_state_ctx *jsctx,
315316
const flux_msg_t *msg,
316317
flux_jobid_t id,
317-
json_t *attrs)
318+
json_t *attrs,
319+
flux_job_state_t state)
318320
{
319321
struct idsync_data *isd = NULL;
320322

321323
if (!(isd = idsync_check_id_valid (jsctx->isctx,
322324
id,
323325
msg,
324-
attrs))
326+
attrs,
327+
state))
325328
|| flux_future_aux_set (isd->f_lookup,
326329
"job_state_ctx",
327330
jsctx,
@@ -349,13 +352,14 @@ json_t *get_job_by_id (struct job_state_ctx *jsctx,
349352
const flux_msg_t *msg,
350353
flux_jobid_t id,
351354
json_t *attrs,
355+
flux_job_state_t state,
352356
bool *stall)
353357
{
354358
struct job *job;
355359

356360
if (!(job = zhashx_lookup (jsctx->index, &id))) {
357361
if (stall) {
358-
if (check_id_valid (jsctx, msg, id, attrs) < 0) {
362+
if (check_id_valid (jsctx, msg, id, attrs, state) < 0) {
359363
flux_log_error (jsctx->h, "%s: check_id_valid", __FUNCTION__);
360364
return NULL;
361365
}
@@ -367,7 +371,7 @@ json_t *get_job_by_id (struct job_state_ctx *jsctx,
367371
if (job->state == FLUX_JOB_STATE_NEW) {
368372
if (stall) {
369373
/* Must wait for job-list to see state change */
370-
if (idsync_wait_valid_id (jsctx->isctx, id, msg, attrs) < 0) {
374+
if (idsync_wait_valid_id (jsctx->isctx, id, msg, attrs, state) < 0) {
371375
flux_log_error (jsctx->h, "%s: idsync_wait_valid_id",
372376
__FUNCTION__);
373377
return NULL;
@@ -388,11 +392,14 @@ void list_id_cb (flux_t *h, flux_msg_handler_t *mh,
388392
json_t *job;
389393
flux_jobid_t id;
390394
json_t *attrs;
395+
int state = 0;
396+
int valid_states = FLUX_JOB_STATE_ACTIVE | FLUX_JOB_STATE_INACTIVE;
391397
bool stall = false;
392398

393-
if (flux_request_unpack (msg, NULL, "{s:I s:o}",
399+
if (flux_request_unpack (msg, NULL, "{s:I s:o s?i}",
394400
"id", &id,
395-
"attrs", &attrs) < 0) {
401+
"attrs", &attrs,
402+
"state", &state) < 0) {
396403
seterror (&err, "invalid payload: %s", flux_msg_last_error (msg));
397404
errno = EPROTO;
398405
goto error;
@@ -404,7 +411,19 @@ void list_id_cb (flux_t *h, flux_msg_handler_t *mh,
404411
goto error;
405412
}
406413

407-
if (!(job = get_job_by_id (ctx->jsctx, &err, msg, id, attrs, &stall))) {
414+
if (state && (state & ~valid_states)) {
415+
seterror (&err, "invalid payload: invalid state specified");
416+
errno = EPROTO;
417+
goto error;
418+
}
419+
420+
if (!(job = get_job_by_id (ctx->jsctx,
421+
&err,
422+
msg,
423+
id,
424+
attrs,
425+
state,
426+
&stall))) {
408427
/* response handled after KVS lookup complete */
409428
if (stall)
410429
goto stall;

0 commit comments

Comments
 (0)