Skip to content

Commit 2682fc6

Browse files
pzhan9 and meta-codesync[bot]
authored and committed
Add event logs for HostMesh::shutdown and HostMesh::spawn (#1936)
Summary: Pull Request resolved: #1936. As title. Reviewed By: mariusae. Differential Revision: D87366183. fbshipit-source-id: 51741f4c51c7803ca210f807ffa12c241f47f986
1 parent 59e8f9c commit 2682fc6

File tree

1 file changed

+49
-11
lines changed

1 file changed

+49
-11
lines changed

hyperactor_mesh/src/v1/host_mesh.rs

Lines changed: 49 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -475,18 +475,32 @@ impl HostMesh {
475475
/// `BootstrapProcManager`. On drop, the manager walks its PID
476476
/// table and sends SIGKILL to any procs it spawned—tying proc
477477
/// lifetimes to their hosts and preventing leaks.
478+
#[hyperactor::instrument(fields(mesh_name=self.name.to_string()))]
478479
pub async fn shutdown(&self, cx: &impl hyperactor::context::Actor) -> anyhow::Result<()> {
479-
let mut attempted = 0;
480-
let mut ok = 0;
480+
tracing::info!(name = "HostMeshStatus", status = "Shutdown::Attempt");
481+
let mut failed_hosts = vec![];
481482
for host in self.current_ref.values() {
482-
attempted += 1;
483483
if let Err(e) = host.shutdown(cx).await {
484-
tracing::warn!(host = %host, error = %e, "host shutdown failed");
485-
} else {
486-
ok += 1;
484+
tracing::warn!(
485+
name = "HostMeshStatus",
486+
status = "Shutdown::Host::Failed",
487+
host = %host,
488+
error = %e,
489+
"host shutdown failed"
490+
);
491+
failed_hosts.push(host);
487492
}
488493
}
489-
tracing::info!(attempted, ok, "hostmesh shutdown summary");
494+
if failed_hosts.is_empty() {
495+
tracing::info!(name = "HostMeshStatus", status = "Shutdown::Success");
496+
} else {
497+
tracing::error!(
498+
name = "HostMeshStatus",
499+
status = "Shutdown::Failed",
500+
"host mesh shutdown failed; check the logs of the failed hosts for details: {:?}",
501+
failed_hosts
502+
);
503+
}
490504
Ok(())
491505
}
492506
}
@@ -711,7 +725,30 @@ impl HostMeshRef {
711725
name: &str,
712726
per_host: Extent,
713727
) -> v1::Result<ProcMesh> {
714-
self.spawn_inner(cx, Name::new(name), per_host).await
728+
let proc_mesh_name = Name::new(name);
729+
tracing::info!(
730+
name = "HostMeshStatus",
731+
status = "ProcMesh::Spawn::Attempt",
732+
mesh_name = %self.name,
733+
"spawning proc mesh {}", proc_mesh_name
734+
);
735+
let result = self.spawn_inner(cx, proc_mesh_name.clone(), per_host).await;
736+
if result.is_ok() {
737+
tracing::info!(
738+
name = "HostMeshStatus",
739+
status = "ProcMesh::Spawn::Success",
740+
mesh_name = %self.name,
741+
"spawned proc mesh {}", proc_mesh_name
742+
);
743+
} else {
744+
tracing::error!(
745+
name = "HostMeshStatus",
746+
status = "ProcMesh::Spawn::Failed",
747+
mesh_name = %self.name,
748+
"failed to spawn proc mesh {}", proc_mesh_name
749+
);
750+
}
751+
result
715752
}
716753

717754
#[hyperactor::instrument(fields(mesh_name=mesh_name.to_string()))]
@@ -976,9 +1013,10 @@ impl HostMeshRef {
9761013
mesh_name = %self.name,
9771014
name = "HostMeshStatus",
9781015
status = "ProcMesh::Stop::Sent",
979-
"Sending Stop to host mesh {} for {:?} procs",
980-
self.name,
981-
proc_names
1016+
"sending Stop to proc mesh {} for {} procs: {}",
1017+
proc_mesh_name,
1018+
proc_names.len(),
1019+
proc_names.iter().map(|n| n.to_string()).collect::<Vec<_>>().join(", ")
9821020
);
9831021

9841022
let start_time = RealClock.now();

0 commit comments

Comments
 (0)