@@ -475,18 +475,32 @@ impl HostMesh {
475475 /// `BootstrapProcManager`. On drop, the manager walks its PID
476476 /// table and sends SIGKILL to any procs it spawned—tying proc
477477 /// lifetimes to their hosts and preventing leaks.
// NOTE(review): this span is a rendered diff; the comments below describe the added (`+`) side.
// Shuts down every host in `self.current_ref`, awaiting each `host.shutdown(cx)` in turn, and
// reports progress through `tracing` events that all carry `name = "HostMeshStatus"`.
// A host whose shutdown returns `Err` does not stop the loop; it is logged and remembered.
// The function returns `Ok(())` on every path visible here.
478+ #[ hyperactor:: instrument( fields( mesh_name=self . name. to_string( ) ) ) ]
478479 pub async fn shutdown ( & self , cx : & impl hyperactor:: context:: Actor ) -> anyhow:: Result < ( ) > {
479- let mut attempted = 0 ;
480- let mut ok = 0 ;
// Emitted once, before any host is asked to shut down.
480+ tracing :: info! ( name = "HostMeshStatus" , status = "Shutdown::Attempt" ) ;
// Accumulates the hosts whose shutdown returned an error, for the summary event below.
481+ let mut failed_hosts = vec ! [ ] ;
481482 for host in self . current_ref . values ( ) {
482- attempted += 1 ;
483483 if let Err ( e) = host. shutdown ( cx) . await {
484- tracing:: warn!( host = %host, error = %e, "host shutdown failed" ) ;
485- } else {
486- ok += 1 ;
// Per-host failure event; `host` and `error` are recorded with their Display output (`%`).
484+ tracing:: warn!(
485+ name = "HostMeshStatus" ,
486+ status = "Shutdown::Host::Failed" ,
487+ host = %host,
488+ error = %e,
489+ "host shutdown failed"
490+ ) ;
491+ failed_hosts. push ( host) ;
487492 }
488493 }
489- tracing:: info!( attempted, ok, "hostmesh shutdown summary" ) ;
// Summary event: info when no host failed, otherwise an error listing the failed hosts
// via their Debug representation (`{:?}`).
494+ if failed_hosts. is_empty ( ) {
495+ tracing:: info!( name = "HostMeshStatus" , status = "Shutdown::Success" ) ;
496+ } else {
497+ tracing:: error!(
498+ name = "HostMeshStatus" ,
499+ status = "Shutdown::Failed" ,
500+ "host mesh shutdown failed; check the logs of the failed hosts for details: {:?}" ,
501+ failed_hosts
502+ ) ;
503+ }
// NOTE(review): host failures are only logged; the return value is `Ok(())` even when
// `failed_hosts` is non-empty, so callers cannot observe them — confirm this is intended.
490504 Ok ( ( ) )
491505 }
492506}
@@ -711,7 +725,30 @@ impl HostMeshRef {
711725 name : & str ,
712726 per_host : Extent ,
713727 ) -> v1:: Result < ProcMesh > {
714- self . spawn_inner ( cx, Name :: new ( name) , per_host) . await
728+ let proc_mesh_name = Name :: new ( name) ;
729+ tracing:: info!(
730+ name = "HostMeshStatus" ,
731+ status = "ProcMesh::Spawn::Attempt" ,
732+ mesh_name = %self . name,
733+ "spawning proc mesh {}" , proc_mesh_name
734+ ) ;
735+ let result = self . spawn_inner ( cx, proc_mesh_name. clone ( ) , per_host) . await ;
736+ if result. is_ok ( ) {
737+ tracing:: info!(
738+ name = "HostMeshStatus" ,
739+ status = "ProcMesh::Spawn::Success" ,
740+ mesh_name = %self . name,
741+ "spawned proc mesh {}" , proc_mesh_name
742+ ) ;
743+ } else {
744+ tracing:: error!(
745+ name = "HostMeshStatus" ,
746+ status = "ProcMesh::Spawn::Failed" ,
747+ mesh_name = %self . name,
748+ "failed to spawn proc mesh {}" , proc_mesh_name
749+ ) ;
750+ }
751+ result
715752 }
716753
717754 #[ hyperactor:: instrument( fields( mesh_name=mesh_name. to_string( ) ) ) ]
@@ -976,9 +1013,10 @@ impl HostMeshRef {
9761013 mesh_name = %self . name,
9771014 name = "HostMeshStatus" ,
9781015 status = "ProcMesh::Stop::Sent" ,
979- "Sending Stop to host mesh {} for {:?} procs" ,
980- self . name,
981- proc_names
1016+ "sending Stop to proc mesh {} for {} procs: {}" ,
1017+ proc_mesh_name,
1018+ proc_names. len( ) ,
1019+ proc_names. iter( ) . map( |n| n. to_string( ) ) . collect:: <Vec <_>>( ) . join( ", " )
9821020 ) ;
9831021
9841022 let start_time = RealClock . now ( ) ;
0 commit comments