Skip to content

Commit 8417d2a

Browse files
committed
housekeeping: improve internal list management
Problem: the hk->allocations list is manipulated directly in several places, and deletion is a bit awkward. Store the list cursor in the list item so it doesn't have to be looked up using zlistx_find() for deletion. Streamline error handling code in housekeeping_start() by starting bulk execution before adding the allocation to the list. Log unexpected internal errors at LOG_CRIT.
1 parent 5993420 commit 8417d2a

File tree

1 file changed

+11
-20
lines changed

1 file changed

+11
-20
lines changed

src/modules/job-manager/housekeeping.c

Lines changed: 11 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ struct allocation {
116116
int free_count; // number of releases
117117
double t_start;
118118
struct bulk_exec *bulk_exec;
119+
void *list_handle;
119120
};
120121

121122
struct housekeeping {
@@ -258,15 +259,13 @@ static void allocation_release (struct allocation *a)
258259

259260
static void allocation_remove (struct allocation *a)
260261
{
261-
void *cursor;
262-
if (!(cursor = zlistx_find (a->hk->allocations, a))) {
262+
if (!a->list_handle
263+
|| zlistx_delete (a->hk->allocations, a->list_handle) < 0) {
263264
flux_log (a->hk->ctx->h,
264-
LOG_ERR,
265+
LOG_CRIT,
265266
"housekeeping: internal error removing allocation for %s",
266267
idf58 (a->id));
267-
return;
268268
}
269-
zlistx_delete (a->hk->allocations, cursor);
270269
}
271270

272271
static void allocation_timeout (flux_reactor_t *r,
@@ -430,34 +429,26 @@ int housekeeping_start (struct housekeeping *hk,
430429
{
431430
flux_t *h = hk->ctx->h;
432431
struct allocation *a;
433-
void *list_handle;
434432

435433
/* Housekeeping is not configured
436434
*/
437435
if (!hk->cmd)
438436
goto skip;
439437

440-
/* Create the 'allocation' and put it in our list.
438+
/* Create and start the 'allocation' and put it in our list.
439+
* N.B. bulk_exec_start() starts watchers but does not send RPCs.
441440
*/
442441
if (!(a = allocation_create (hk, R, id, userid))
443-
|| !(list_handle = zlistx_insert (hk->allocations, a, false))) {
442+
|| bulk_exec_start (h, a->bulk_exec) < 0
443+
|| !(a->list_handle = zlistx_insert (hk->allocations, a, false))) {
444444
flux_log (h,
445445
LOG_ERR,
446-
"housekeeping: %s error saving alloc object (skipping)",
446+
"housekeeping: %s error creating alloc object"
447+
" - returning resources to the scheduler",
447448
idf58 (id));
448449
allocation_destroy (a);
449450
goto skip;
450451
}
451-
/* Start bulk execution.
452-
*/
453-
if (bulk_exec_start (h, a->bulk_exec) < 0) {
454-
flux_log (h,
455-
LOG_ERR,
456-
"housekeeping: %s error starting housekeeping tasks",
457-
idf58 (id));
458-
zlistx_delete (hk->allocations, list_handle);
459-
goto skip;
460-
}
461452
return 0;
462453
skip:
463454
return alloc_send_free_request (hk->ctx->alloc, R, id, true);
@@ -548,7 +539,7 @@ int housekeeping_hello_respond (struct housekeeping *hk, const flux_msg_t *msg)
548539
flux_future_destroy (f);
549540

550541
// delete the allocation to avoid sending frees later
551-
zlistx_delete (hk->allocations, zlistx_cursor (hk->allocations));
542+
allocation_remove (a);
552543
}
553544
a = zlistx_next (hk->allocations);
554545
}

0 commit comments

Comments
 (0)