@@ -435,41 +435,52 @@ impl ProcMesh {
435435
436436 let stop = Arc :: new ( Notify :: new ( ) ) ;
437437 let extent = alloc. extent ( ) . clone ( ) ;
438+ let alloc_name = alloc. world_id ( ) . to_string ( ) ;
438439
439440 {
440441 let stop = Arc :: clone ( & stop) ;
441- let name = name. clone ( ) ;
442-
443- tokio:: spawn ( async move {
444- loop {
445- tokio:: select! {
446- _ = stop. notified( ) => {
447- // If we are explicitly stopped, the alloc is torn down.
448- if let Err ( e) = alloc. stop_and_wait( ) . await {
449- tracing:: error!( "alloc {}: failed to stop: {}" , name, e) ;
450- }
451- break ;
452- }
453- // We are mostly just using this to drive allocation events.
454- proc_state = alloc. next( ) => {
455- match proc_state {
456- // The alloc was stopped.
457- None => break ,
458- Some ( proc_state) => {
459- tracing:: info!( "unmonitored allocation event for {}: {}" , name, proc_state) ;
442+
443+ tokio:: spawn (
444+ async move {
445+ loop {
446+ tokio:: select! {
447+ _ = stop. notified( ) => {
448+ // If we are explicitly stopped, the alloc is torn down.
449+ if let Err ( error) = alloc. stop_and_wait( ) . await {
450+ tracing:: error!(
451+ name = "ProcMeshStatus" ,
452+ alloc_name = %alloc. world_id( ) ,
453+ status = "FailedToStopAlloc" ,
454+ %error,
455+ ) ;
460456 }
457+ break ;
461458 }
459+ // We are mostly just using this to drive allocation events.
460+ proc_state = alloc. next( ) => {
461+ match proc_state {
462+ // The alloc was stopped.
463+ None => break ,
464+ Some ( proc_state) => {
465+ tracing:: debug!(
466+ alloc_name = %alloc. world_id( ) ,
467+ "unmonitored allocation event: {}" , proc_state) ;
468+ }
469+ }
462470
471+ }
463472 }
464473 }
465474 }
466- } . instrument ( tracing:: info_span!( "alloc_monitor" ) ) ) ;
475+ . instrument ( tracing:: info_span!( "alloc_monitor" ) ) ,
476+ ) ;
467477 }
468478
469479 let mesh = Self :: create (
470480 cx,
471481 name,
472482 ProcMeshAllocation :: Allocated {
483+ alloc_name,
473484 stop,
474485 extent,
475486 ranks : Arc :: new ( ranks) ,
@@ -497,15 +508,24 @@ impl ProcMesh {
// Stops this proc mesh. What "stop" means depends on which
// `ProcMeshAllocation` variant backs the mesh; see the two arms below.
497508 pub async fn stop ( & mut self , cx : & impl context:: Actor ) -> anyhow:: Result < ( ) > {
498509 let region = self . region . clone ( ) ;
499510 match & mut self . allocation {
// Alloc-backed mesh: the mesh only holds the `stop` notifier and the
// alloc's name, not the alloc itself.
500- ProcMeshAllocation :: Allocated { stop, .. } => {
511+ ProcMeshAllocation :: Allocated {
512+ stop, alloc_name, ..
513+ } => {
// Signal-only: this arm returns `Ok(())` right after notifying and
// never awaits the alloc's shutdown, so a failure to stop is not
// reported to the caller from here (hence the log message below
// pointing at the alloc's own log).
501514 stop. notify_one ( ) ;
515+ tracing:: info!(
516+ name = "ProcMeshStatus" ,
517+ mesh_name = %self . name,
518+ alloc_name,
519+ status = "StoppingAlloc" ,
520+ "sending stop to alloc {alloc_name}; check its log for stop status" ,
521+ ) ;
502522 Ok ( ( ) )
503523 }
// Host-owned mesh: gather the proc ids from the current ref and ask
// the hosts to stop them; the awaited result of that call is this
// function's return value.
504524 ProcMeshAllocation :: Owned { hosts, .. } => {
505- let names = self . current_ref . proc_ids ( ) . collect :: < Vec < ProcId > > ( ) ;
525+ let procs = self . current_ref . proc_ids ( ) . collect :: < Vec < ProcId > > ( ) ;
506526 // We use the proc mesh region rather than the host mesh region
507527 // because the host agent stores one entry per proc, not per host.
508- hosts. stop_proc_mesh ( cx, names , region) . await
528+ hosts. stop_proc_mesh ( cx, & self . name , procs , region) . await
509529 }
510530 }
511531 }
@@ -539,6 +559,9 @@ impl Drop for ProcMesh {
539559enum ProcMeshAllocation {
540560 /// A mesh that has been allocated from an `Alloc`.
541561 Allocated {
562+ // The name of the alloc from which this mesh was allocated.
563+ alloc_name : String ,
564+
542565 // A cancellation token used to stop the task keeping the alloc alive.
543566 stop : Arc < Notify > ,
544567
0 commit comments