Skip to content

Commit 57ac53a

Browse files
authored
Merge pull request #5837 from garlick/inactive_job_journal
eliminate duplicate KVS restart in job-list and job-manager
2 parents 69711aa + abdb9b6 commit 57ac53a

File tree

19 files changed

+432
-1188
lines changed

19 files changed

+432
-1188
lines changed

doc/man5/flux-config-job-manager.rst

Lines changed: 0 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -13,11 +13,6 @@ table, which may contain the following keys:
1313
KEYS
1414
====
1515

16-
journal-size-limit
17-
(optional) Integer value that determines the maximum number of job events to
18-
be retained in the in-memory journal used to answer queries. The default
19-
is 1000.
20-
2116
inactive-age-limit
2217
(optional) String (in RFC 23 Flux Standard Duration format) that specifies
2318
the maximum age of inactive jobs retained in the KVS. The age is computed

src/modules/job-list/job-list.c

Lines changed: 25 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,13 @@ static void stats_cb (flux_t *h, flux_msg_handler_t *mh,
4343
const flux_msg_t *msg, void *arg)
4444
{
4545
struct list_ctx *ctx = arg;
46+
47+
if (!ctx->jsctx->initialized) {
48+
if (flux_msglist_append (ctx->deferred_requests, msg) < 0)
49+
goto error;
50+
return;
51+
}
52+
4653
int pending = zlistx_size (ctx->jsctx->pending);
4754
int running = zlistx_size (ctx->jsctx->running);
4855
int inactive = zlistx_size (ctx->jsctx->inactive);
@@ -57,15 +64,12 @@ static void stats_cb (flux_t *h, flux_msg_handler_t *mh,
5764
"idsync",
5865
"lookups", idsync_lookups,
5966
"waits", idsync_waits,
60-
"stats_watchers", stats_watchers) < 0) {
61-
flux_log_error (h, "%s: flux_respond_pack", __FUNCTION__);
62-
goto error;
63-
}
64-
67+
"stats_watchers", stats_watchers) < 0)
68+
flux_log_error (h, "error responding to stats-get request");
6569
return;
6670
error:
6771
if (flux_respond_error (h, msg, errno, NULL) < 0)
68-
flux_log_error (h, "%s: flux_respond_error", __FUNCTION__);
72+
flux_log_error (h, "error responding to stats-get request");
6973
}
7074

7175
static void purge_cb (flux_t *h,
@@ -98,6 +102,17 @@ static void purge_cb (flux_t *h,
98102
flux_log (h, LOG_DEBUG, "purged %d inactive jobs", count);
99103
}
100104

105+
void requeue_deferred_requests (struct list_ctx *ctx)
106+
{
107+
const flux_msg_t *msg;
108+
109+
while ((msg = flux_msglist_pop (ctx->deferred_requests))) {
110+
if (flux_requeue (ctx->h, msg, FLUX_RQ_TAIL) < 0)
111+
flux_log_error (ctx->h, "error requeuing deferred request");
112+
flux_msg_decref (msg);
113+
}
114+
}
115+
101116
static void disconnect_cb (flux_t *h,
102117
flux_msg_handler_t *mh,
103118
const flux_msg_t *msg,
@@ -186,6 +201,7 @@ static void list_ctx_destroy (struct list_ctx *ctx)
186201
if (ctx) {
187202
int saved_errno = errno;
188203
flux_msg_handler_delvec (ctx->handlers);
204+
flux_msglist_destroy (ctx->deferred_requests);
189205
if (ctx->jsctx)
190206
job_state_destroy (ctx->jsctx);
191207
if (ctx->isctx)
@@ -207,7 +223,9 @@ static struct list_ctx *list_ctx_create (flux_t *h)
207223
goto error;
208224
if (!(ctx->isctx = idsync_ctx_create (ctx->h)))
209225
goto error;
210-
if (!(ctx->jsctx = job_state_create (ctx->isctx)))
226+
if (!(ctx->jsctx = job_state_create (ctx)))
227+
goto error;
228+
if (!(ctx->deferred_requests = flux_msglist_create ()))
211229
goto error;
212230
return ctx;
213231
error:
@@ -224,10 +242,6 @@ int mod_main (flux_t *h, int argc, char **argv)
224242
flux_log_error (h, "initialization error");
225243
goto done;
226244
}
227-
if (job_state_init_from_kvs (ctx->jsctx) < 0) {
228-
flux_log_error (h, "initialization from kvs error");
229-
goto done;
230-
}
231245
if (flux_reactor_run (flux_get_reactor (h), 0) < 0)
232246
goto done;
233247
rc = 0;

src/modules/job-list/job-list.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -23,10 +23,13 @@ struct list_ctx {
2323
flux_msg_handler_t **handlers;
2424
struct job_state_ctx *jsctx;
2525
struct idsync_ctx *isctx;
26+
struct flux_msglist *deferred_requests;
2627
};
2728

2829
const char **job_attrs (void);
2930

31+
void requeue_deferred_requests (struct list_ctx *ctx);
32+
3033
#endif /* _FLUX_JOB_LIST_H */
3134

3235
/*

src/modules/job-list/job_data.c

Lines changed: 22 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,6 @@ void job_destroy (void *data)
4242
json_decref (job->exception_context);
4343
json_decref (job->jobspec_updates);
4444
json_decref (job->R_updates);
45-
zlist_destroy (&job->updates);
4645
free (job);
4746
errno = save_errno;
4847
}
@@ -69,16 +68,8 @@ struct job *job_create (flux_t *h, flux_jobid_t id)
6968
job->expiration = -1.0;
7069
job->wait_status = -1;
7170
job->result = FLUX_JOB_RESULT_FAILED;
72-
73-
if (!(job->updates = zlist_new ())) {
74-
errno = ENOMEM;
75-
job_destroy (job);
76-
return NULL;
77-
}
78-
7971
job->states_mask = FLUX_JOB_STATE_NEW;
8072
job->states_events_mask = FLUX_JOB_STATE_NEW;
81-
job->eventlog_seq = -1;
8273
return job;
8374
}
8475

@@ -368,15 +359,24 @@ static int parse_jobspec (struct job *job, bool allow_nonfatal)
368359
return allow_nonfatal ? 0 : -1;
369360
}
370361

371-
int job_parse_jobspec (struct job *job, const char *s, json_t *updates)
362+
int job_parse_jobspec_cached (struct job *job, json_t *updates)
372363
{
373-
if (load_jobspec (job, s, true) < 0)
364+
if (!job->jobspec) {
365+
errno = EINVAL;
374366
return -1;
367+
}
375368
if (parse_jobspec (job, true) < 0)
376369
return -1;
377370
return job_jobspec_update (job, updates);
378371
}
379372

373+
int job_parse_jobspec (struct job *job, const char *s, json_t *updates)
374+
{
375+
if (load_jobspec (job, s, true) < 0)
376+
return -1;
377+
return job_parse_jobspec_cached (job, updates);
378+
}
379+
380380
int job_parse_jobspec_fatal (struct job *job, const char *s, json_t *updates)
381381
{
382382
if (load_jobspec (job, s, false) < 0)
@@ -466,15 +466,24 @@ static int parse_R (struct job *job, bool allow_nonfatal)
466466
return rc;
467467
}
468468

469-
int job_parse_R (struct job *job, const char *s, json_t *updates)
469+
int job_parse_R_cached (struct job *job, json_t *updates)
470470
{
471-
if (load_R (job, s, true) < 0)
471+
if (!job->R) {
472+
errno = EINVAL;
472473
return -1;
474+
}
473475
if (parse_R (job, true) < 0)
474476
return -1;
475477
return job_R_update (job, updates);
476478
}
477479

480+
int job_parse_R (struct job *job, const char *s, json_t *updates)
481+
{
482+
if (load_R (job, s, true) < 0)
483+
return -1;
484+
return job_parse_R_cached (job, updates);
485+
}
486+
478487
int job_parse_R_fatal (struct job *job, const char *s, json_t *updates)
479488
{
480489
if (load_R (job, s, false) < 0)

src/modules/job-list/job_data.h

Lines changed: 4 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -70,27 +70,17 @@ struct job {
7070
json_t *R;
7171
json_t *exception_context;
7272

73-
/* All internal changes (most notably job state transitions) are
74-
* placed on the updates list. We do not immediately update to
75-
* the new state and place onto a new list until we have retrieved
76-
* any necessary data associated to that state. For example, when
77-
* the 'depend' state has been seen, we don't immediately place it
78-
* on the `pending` list. We wait until we've retrieved data such
79-
* as userid, urgency, etc.
80-
*
81-
* Track which states we have seen and have completed transition
73+
/* Track which states we have seen and have completed transition
8274
* to. States we've processed via the states_mask and states seen
8375
* via events stream in states_events_mask.
8476
*/
85-
zlist_t *updates;
8677
unsigned int states_mask;
8778
unsigned int states_events_mask;
8879
void *list_handle;
89-
/* if updates in eventlog before jobspec / R read from KVS */
80+
/* store updates that were received before jobspec/R objects */
9081
json_t *jobspec_updates;
9182
json_t *R_updates;
9283

93-
int eventlog_seq; /* last event seq read */
9484
int submit_version; /* version number in submit context */
9585
};
9686

@@ -110,6 +100,7 @@ struct job *job_create (flux_t *h, flux_jobid_t id);
110100
* the jobspec.
111101
*/
112102
int job_parse_jobspec (struct job *job, const char *s, json_t *updates);
103+
int job_parse_jobspec_cached (struct job *job, json_t *updates);
113104

114105
/* identical to above, but all nonfatal errors will return error.
115106
* Primarily used for testing.
@@ -129,6 +120,7 @@ int job_jobspec_update (struct job *job, json_t *updates);
129120
* - ntasks (if necessary)
130121
*/
131122
int job_parse_R (struct job *job, const char *s, json_t *updates);
123+
int job_parse_R_cached (struct job *job, json_t *updates);
132124

133125
/* identical to above, but all nonfatal errors will return error.
134126
* Primarily used for testing.

0 commit comments

Comments
 (0)