@@ -116,6 +116,7 @@ struct allocation {
116116 int free_count ; // number of releases
117117 double t_start ;
118118 struct bulk_exec * bulk_exec ;
119+ void * list_handle ;
119120};
120121
121122struct housekeeping {
@@ -258,15 +259,13 @@ static void allocation_release (struct allocation *a)
258259
259260static void allocation_remove (struct allocation * a )
260261{
261- void * cursor ;
262- if (!( cursor = zlistx_find (a -> hk -> allocations , a )) ) {
262+ if (! a -> list_handle
263+ || zlistx_delete (a -> hk -> allocations , a -> list_handle ) < 0 ) {
263264 flux_log (a -> hk -> ctx -> h ,
264- LOG_ERR ,
265+ LOG_CRIT ,
265266 "housekeeping: internal error removing allocation for %s" ,
266267 idf58 (a -> id ));
267- return ;
268268 }
269- zlistx_delete (a -> hk -> allocations , cursor );
270269}
271270
272271static void allocation_timeout (flux_reactor_t * r ,
@@ -318,6 +317,14 @@ static void set_failed_reason (const char **s, const char *reason)
318317 * s = "multiple failure modes" ;
319318}
320319
320+ static void bulk_start (struct bulk_exec * bulk_exec , void * arg )
321+ {
322+ struct allocation * a = arg ;
323+ flux_t * h = a -> hk -> ctx -> h ;
324+
325+ flux_log (h , LOG_DEBUG , "housekeeping: %s started" , idf58 (a -> id ));
326+ }
327+
321328static void bulk_exit (struct bulk_exec * bulk_exec ,
322329 void * arg ,
323330 const struct idset * ids )
@@ -377,7 +384,9 @@ static void bulk_exit (struct bulk_exec *bulk_exec,
377384static void bulk_complete (struct bulk_exec * bulk_exec , void * arg )
378385{
379386 struct allocation * a = arg ;
387+ flux_t * h = a -> hk -> ctx -> h ;
380388
389+ flux_log (h , LOG_DEBUG , "housekeeping: %s complete" , idf58 (a -> id ));
381390 allocation_remove (a );
382391}
383392
@@ -430,34 +439,26 @@ int housekeeping_start (struct housekeeping *hk,
430439{
431440 flux_t * h = hk -> ctx -> h ;
432441 struct allocation * a ;
433- void * list_handle ;
434442
435443 /* Housekeeping is not configured
436444 */
437445 if (!hk -> cmd )
438446 goto skip ;
439447
440- /* Create the 'allocation' and put it in our list.
448+ /* Create and start the 'allocation' and put it in our list.
449+ * N.B. bulk_exec_start() starts watchers but does not send RPCs.
441450 */
442451 if (!(a = allocation_create (hk , R , id , userid ))
443- || !(list_handle = zlistx_insert (hk -> allocations , a , false))) {
452+ || bulk_exec_start (h , a -> bulk_exec ) < 0
453+ || !(a -> list_handle = zlistx_insert (hk -> allocations , a , false))) {
444454 flux_log (h ,
445455 LOG_ERR ,
446- "housekeeping: %s error saving alloc object (skipping)" ,
456+ "housekeeping: %s error creating alloc object"
457+ " - returning resources to the scheduler" ,
447458 idf58 (id ));
448459 allocation_destroy (a );
449460 goto skip ;
450461 }
451- /* Start bulk execution.
452- */
453- if (bulk_exec_start (h , a -> bulk_exec ) < 0 ) {
454- flux_log (h ,
455- LOG_ERR ,
456- "housekeeping: %s error starting housekeeping tasks" ,
457- idf58 (id ));
458- zlistx_delete (hk -> allocations , list_handle );
459- goto skip ;
460- }
461462 return 0 ;
462463skip :
463464 return alloc_send_free_request (hk -> ctx -> alloc , R , id , true);
@@ -548,7 +549,7 @@ int housekeeping_hello_respond (struct housekeeping *hk, const flux_msg_t *msg)
548549 flux_future_destroy (f );
549550
550551 // delete the allocation to avoid sending frees later
551- zlistx_delete ( hk -> allocations , zlistx_cursor ( hk -> allocations ) );
552+ allocation_remove ( a );
552553 }
553554 a = zlistx_next (hk -> allocations );
554555 }
@@ -839,7 +840,7 @@ struct housekeeping *housekeeping_ctx_create (struct job_manager *ctx)
839840}
840841
841842static struct bulk_exec_ops bulk_ops = {
842- .on_start = NULL ,
843+ .on_start = bulk_start ,
843844 .on_exit = bulk_exit ,
844845 .on_complete = bulk_complete ,
845846 .on_output = bulk_output ,
0 commit comments