@@ -303,8 +303,22 @@ impl DynamicEngine {
303303
304304 // Broadcast to all subscribers
305305 self . state_subscribers . retain ( |subscriber| {
306- // If send fails, the subscriber has disconnected, so we remove it
307- subscriber. try_send ( update. clone ( ) ) . is_ok ( )
306+ // Keep subscribers on transient backpressure (Full); remove only when Closed.
307+ //
308+ // For state updates we also try to deliver eventually: dropping a state transition
309+ // (e.g. Running -> Recovering) can leave clients showing a stale "healthy" status.
310+ match subscriber. try_send ( update. clone ( ) ) {
311+ Ok ( ( ) ) => true ,
312+ Err ( mpsc:: error:: TrySendError :: Full ( _) ) => {
313+ let subscriber = subscriber. clone ( ) ;
314+ let update = update. clone ( ) ;
315+ tokio:: spawn ( async move {
316+ let _ = subscriber. send ( update) . await ;
317+ } ) ;
318+ true
319+ } ,
320+ Err ( mpsc:: error:: TrySendError :: Closed ( _) ) => false ,
321+ }
308322 } ) ;
309323 }
310324
@@ -362,8 +376,13 @@ impl DynamicEngine {
362376
363377 // Broadcast to all subscribers
364378 self . stats_subscribers . retain ( |subscriber| {
365- // If send fails, the subscriber has disconnected, so we remove it
366- subscriber. try_send ( update. clone ( ) ) . is_ok ( )
379+ // Keep subscribers on transient backpressure (Full); remove only when Closed.
380+ //
381+ // Stats are high-frequency, best-effort updates; dropping an update is acceptable.
382+ match subscriber. try_send ( update. clone ( ) ) {
383+ Ok ( ( ) ) | Err ( mpsc:: error:: TrySendError :: Full ( _) ) => true ,
384+ Err ( mpsc:: error:: TrySendError :: Closed ( _) ) => false ,
385+ }
367386 } ) ;
368387 }
369388
@@ -463,10 +482,12 @@ impl DynamicEngine {
463482 } ;
464483
465484 // 5. Spawn Node
466- let task_handle =
467- tokio:: spawn ( node. run ( context) . instrument (
468- tracing:: info_span!( "node_run" , node. name = %node_id, node. kind = %kind) ,
469- ) ) ;
485+ let task_handle = tokio:: spawn ( node. run ( context) . instrument ( tracing:: info_span!(
486+ "node_run" ,
487+ session. id = %self . session_id. as_deref( ) . unwrap_or( "<unknown>" ) ,
488+ node. name = %node_id,
489+ node. kind = %kind
490+ ) ) ) ;
470491 self . live_nodes
471492 . insert ( node_id. to_string ( ) , graph_builder:: LiveNode { control_tx, task_handle } ) ;
472493 self . nodes_active_gauge . record ( self . live_nodes . len ( ) as u64 , & [ ] ) ;
0 commit comments