Skip to content

Commit 3383f7f

Browse files
authored
Merge pull request #6191 from garlick/issue#6179
improve housekeeping logging and list management
2 parents (5993420 + a892a0f), commit 3383f7f

File tree

1 file changed

+22
-21
lines changed

1 file changed

+22
-21
lines changed

src/modules/job-manager/housekeeping.c

Lines changed: 22 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -116,6 +116,7 @@ struct allocation {
116116
int free_count; // number of releases
117117
double t_start;
118118
struct bulk_exec *bulk_exec;
119+
void *list_handle;
119120
};
120121

121122
struct housekeeping {
@@ -258,15 +259,13 @@ static void allocation_release (struct allocation *a)
258259

259260
static void allocation_remove (struct allocation *a)
260261
{
261-
void *cursor;
262-
if (!(cursor = zlistx_find (a->hk->allocations, a))) {
262+
if (!a->list_handle
263+
|| zlistx_delete (a->hk->allocations, a->list_handle) < 0) {
263264
flux_log (a->hk->ctx->h,
264-
LOG_ERR,
265+
LOG_CRIT,
265266
"housekeeping: internal error removing allocation for %s",
266267
idf58 (a->id));
267-
return;
268268
}
269-
zlistx_delete (a->hk->allocations, cursor);
270269
}
271270

272271
static void allocation_timeout (flux_reactor_t *r,
@@ -318,6 +317,14 @@ static void set_failed_reason (const char **s, const char *reason)
318317
*s = "multiple failure modes";
319318
}
320319

320+
static void bulk_start (struct bulk_exec *bulk_exec, void *arg)
321+
{
322+
struct allocation *a = arg;
323+
flux_t *h = a->hk->ctx->h;
324+
325+
flux_log (h, LOG_DEBUG, "housekeeping: %s started", idf58 (a->id));
326+
}
327+
321328
static void bulk_exit (struct bulk_exec *bulk_exec,
322329
void *arg,
323330
const struct idset *ids)
@@ -377,7 +384,9 @@ static void bulk_exit (struct bulk_exec *bulk_exec,
377384
static void bulk_complete (struct bulk_exec *bulk_exec, void *arg)
378385
{
379386
struct allocation *a = arg;
387+
flux_t *h = a->hk->ctx->h;
380388

389+
flux_log (h, LOG_DEBUG, "housekeeping: %s complete", idf58 (a->id));
381390
allocation_remove (a);
382391
}
383392

@@ -430,34 +439,26 @@ int housekeeping_start (struct housekeeping *hk,
430439
{
431440
flux_t *h = hk->ctx->h;
432441
struct allocation *a;
433-
void *list_handle;
434442

435443
/* Housekeeping is not configured
436444
*/
437445
if (!hk->cmd)
438446
goto skip;
439447

440-
/* Create the 'allocation' and put it in our list.
448+
/* Create and start the 'allocation' and put it in our list.
449+
* N.B. bulk_exec_start() starts watchers but does not send RPCs.
441450
*/
442451
if (!(a = allocation_create (hk, R, id, userid))
443-
|| !(list_handle = zlistx_insert (hk->allocations, a, false))) {
452+
|| bulk_exec_start (h, a->bulk_exec) < 0
453+
|| !(a->list_handle = zlistx_insert (hk->allocations, a, false))) {
444454
flux_log (h,
445455
LOG_ERR,
446-
"housekeeping: %s error saving alloc object (skipping)",
456+
"housekeeping: %s error creating alloc object"
457+
" - returning resources to the scheduler",
447458
idf58 (id));
448459
allocation_destroy (a);
449460
goto skip;
450461
}
451-
/* Start bulk execution.
452-
*/
453-
if (bulk_exec_start (h, a->bulk_exec) < 0) {
454-
flux_log (h,
455-
LOG_ERR,
456-
"housekeeping: %s error starting housekeeping tasks",
457-
idf58 (id));
458-
zlistx_delete (hk->allocations, list_handle);
459-
goto skip;
460-
}
461462
return 0;
462463
skip:
463464
return alloc_send_free_request (hk->ctx->alloc, R, id, true);
@@ -548,7 +549,7 @@ int housekeeping_hello_respond (struct housekeeping *hk, const flux_msg_t *msg)
548549
flux_future_destroy (f);
549550

550551
// delete the allocation to avoid sending frees later
551-
zlistx_delete (hk->allocations, zlistx_cursor (hk->allocations));
552+
allocation_remove (a);
552553
}
553554
a = zlistx_next (hk->allocations);
554555
}
@@ -839,7 +840,7 @@ struct housekeeping *housekeeping_ctx_create (struct job_manager *ctx)
839840
}
840841

841842
static struct bulk_exec_ops bulk_ops = {
842-
.on_start = NULL,
843+
.on_start = bulk_start,
843844
.on_exit = bulk_exit,
844845
.on_complete = bulk_complete,
845846
.on_output = bulk_output,

0 commit comments

Comments
 (0)