@@ -54,6 +54,11 @@ struct cleanup {
5454 flux_watcher_t * timer ;
5555};
5656
57+ struct shutdown {
58+ double timeout ;
59+ flux_watcher_t * timer ;
60+ };
61+
5762struct monitor {
5863 struct flux_msglist * requests ;
5964
@@ -78,6 +83,7 @@ struct state_machine {
7883 struct monitor monitor ;
7984 struct quorum quorum ;
8085 struct cleanup cleanup ;
86+ struct shutdown shutdown ;
8187
8288 struct flux_msglist * wait_requests ;
8389
@@ -159,6 +165,7 @@ static struct state_next nexttab[] = {
159165};
160166
/* Built-in timeout values (presumably seconds - TODO confirm units). */
static const double default_quorum_timeout = 60.;    // log slow joiners
/* Fallback for the "broker.shutdown-timeout" attribute. */
static const double default_shutdown_timeout = 60.;  // log slow shutdown
/* NOTE(review): a negative value presumably means "no timeout" - confirm. */
static const double default_cleanup_timeout = -1.;
static const double goodbye_timeout = 60.;
164171
@@ -424,17 +431,49 @@ static void action_finalize (struct state_machine *s)
424431 state_machine_post (s , "rc3-none" );
425432}
426433
434+ static void shutdown_timer_cb (flux_reactor_t * r ,
435+ flux_watcher_t * w ,
436+ int revents ,
437+ void * arg )
438+ {
439+ struct state_machine * s = arg ;
440+ struct idset * ranks = overlay_get_child_peer_idset (s -> ctx -> overlay );
441+ char * rankstr = idset_encode (ranks , IDSET_FLAG_RANGE );
442+ char * hoststr = flux_hostmap_lookup (s -> ctx -> h , rankstr , NULL );
443+
444+ flux_log (s -> ctx -> h ,
445+ LOG_ERR ,
446+ "shutdown delayed: waiting for %d peers: %s (rank %s)" ,
447+ overlay_get_child_peer_count (s -> ctx -> overlay ),
448+ hoststr ? hoststr : "?" ,
449+ rankstr ? rankstr : "?" );
450+
451+ free (hoststr );
452+ free (rankstr );
453+ idset_destroy (ranks );
454+
455+ flux_timer_watcher_reset (w , s -> shutdown .timeout , 0. );
456+ flux_watcher_start (w );
457+ }
458+
459+
427460static void action_shutdown (struct state_machine * s )
428461{
429- if (overlay_get_child_peer_count (s -> ctx -> overlay ) == 0 )
462+ if (overlay_get_child_peer_count (s -> ctx -> overlay ) == 0 ) {
430463 state_machine_post (s , "children-none" );
464+ return ;
465+ }
431466#if HAVE_LIBSYSTEMD
432467 if (s -> ctx -> sd_notify ) {
433468 sd_notifyf (0 ,
434469 "STATUS=Waiting for %d peers to shutdown" ,
435470 overlay_get_child_peer_count (s -> ctx -> overlay ));
436471 }
437472#endif
473+ if (s -> shutdown .timeout >= 0 ) {
474+ flux_timer_watcher_reset (s -> shutdown .timer , s -> shutdown .timeout , 0. );
475+ flux_watcher_start (s -> shutdown .timer );
476+ }
438477}
439478
440479static void goodbye_continuation (flux_future_t * f , void * arg )
@@ -1144,6 +1183,7 @@ static void overlay_monitor_cb (struct overlay *overlay,
11441183 void * arg )
11451184{
11461185 struct state_machine * s = arg ;
1186+ int count ;
11471187
11481188 switch (s -> state ) {
11491189 /* IN JOIN state, post parent-fail if something goes wrong with the
@@ -1166,8 +1206,21 @@ static void overlay_monitor_cb (struct overlay *overlay,
11661206 * node) the exit event is posted immediately in action_shutdown().
11671207 */
11681208 case STATE_SHUTDOWN :
1169- if (overlay_get_child_peer_count (overlay ) == 0 )
1209+ count = overlay_get_child_peer_count (overlay );
1210+ if (count == 0 ) {
11701211 state_machine_post (s , "children-complete" );
1212+ flux_watcher_stop (s -> shutdown .timer );
1213+ }
1214+ #if HAVE_LIBSYSTEMD
1215+ else {
1216+ if (s -> ctx -> sd_notify ) {
1217+ sd_notifyf (0 ,
1218+ "STATUS=Waiting for %d peer%s to shutdown" ,
1219+ count ,
1220+ count > 1 ? "s" : "" );
1221+ }
1222+ }
1223+ #endif
11711224 break ;
11721225 default :
11731226 break ;
@@ -1255,6 +1308,7 @@ void state_machine_destroy (struct state_machine *s)
12551308 flux_watcher_destroy (s -> quorum .timer );
12561309 flux_future_destroy (s -> quorum .f );
12571310 flux_watcher_destroy (s -> cleanup .timer );
1311+ flux_watcher_destroy (s -> shutdown .timer );
12581312 free (s );
12591313 errno = saved_errno ;
12601314 }
@@ -1289,7 +1343,12 @@ struct state_machine *state_machine_create (struct broker *ctx)
12891343 0. ,
12901344 0. ,
12911345 cleanup_timer_cb ,
1292- s )))
1346+ s ))
1347+ || !(s -> shutdown .timer = flux_timer_watcher_create (r ,
1348+ 0. ,
1349+ 0. ,
1350+ shutdown_timer_cb ,
1351+ s )))
12931352 goto error ;
12941353 flux_watcher_start (s -> prep );
12951354 flux_watcher_start (s -> check );
@@ -1320,6 +1379,13 @@ struct state_machine *state_machine_create (struct broker *ctx)
13201379 log_err ("error configuring cleanup timeout attribute" );
13211380 goto error ;
13221381 }
1382+ if (timeout_configure (s ,
1383+ "broker.shutdown-timeout" ,
1384+ & s -> shutdown .timeout ,
1385+ default_shutdown_timeout ) < 0 ) {
1386+ log_err ("error configuring shutdown timeout attribute" );
1387+ goto error ;
1388+ }
13231389 norestart_configure (s );
13241390 overlay_set_monitor_cb (ctx -> overlay , overlay_monitor_cb , s );
13251391 if (s -> ctx -> rank == 0 ) {
0 commit comments