Skip to content

Commit cae88da

Browse files
mariusae, meta-codesync[bot]
authored and committed
add some more logging and finer-grained error handling to the proc spawn path (#1373)
Summary:
Pull Request resolved: #1373

A more principled solution is in the works, but this will give us better diagnostics meanwhile.

ghstack-source-id: 313885568
exported-using-ghexport

Reviewed By: shayne-fletcher

Differential Revision: D83529216

fbshipit-source-id: d39a8418b68fcb1424c6fd47406c3276be49668a
1 parent 226dcd2 commit cae88da

File tree

4 files changed

+42
-17
lines changed

4 files changed

+42
-17
lines changed

hyperactor/src/host.rs

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -146,7 +146,14 @@ impl<M: ProcManager> Host<M> {
146146

147147
// Set up a system proc. This is often used to manage the host itself.
148148
let service_proc_id = ProcId::Direct(frontend_addr.clone(), "service".to_string());
149-
let service_proc = Proc::new(service_proc_id, router.boxed());
149+
let service_proc = Proc::new(service_proc_id.clone(), router.boxed());
150+
151+
tracing::info!(
152+
frontend_addr = frontend_addr.to_string(),
153+
backend_addr = backend_addr.to_string(),
154+
service_proc_id = service_proc_id.to_string(),
155+
"serving host"
156+
);
150157

151158
let host = Host {
152159
procs: HashMap::new(),

hyperactor_mesh/src/v1.rs

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@ pub use actor_mesh::ActorMesh;
2323
pub use actor_mesh::ActorMeshRef;
2424
pub use host_mesh::HostMeshRef;
2525
use hyperactor::ActorId;
26+
use hyperactor::ActorRef;
2627
use hyperactor::mailbox::MailboxSenderError;
2728
use ndslice::view;
2829
pub use proc_mesh::ProcMesh;
@@ -32,6 +33,7 @@ use serde::Serialize;
3233
pub use value_mesh::ValueMesh;
3334

3435
use crate::shortuuid::ShortUuid;
36+
use crate::v1::host_mesh::HostMeshAgent;
3537
use crate::v1::host_mesh::HostMeshRefParseError;
3638

3739
/// Errors that occur during mesh operations.
@@ -84,6 +86,13 @@ pub enum Error {
8486
#[error("error configuring host mesh agent {0}: {1}")]
8587
HostMeshAgentConfigurationError(ActorId, String),
8688

89+
#[error("error creating {proc_name} (host rank {host_rank}) on host mesh agent {mesh_agent}")]
90+
ProcCreationError {
91+
proc_name: Name,
92+
mesh_agent: ActorRef<HostMeshAgent>,
93+
host_rank: usize,
94+
},
95+
8796
#[error("error: {0} does not exist")]
8897
NotExist(Name),
8998
}

hyperactor_mesh/src/v1/host_mesh.rs

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -528,7 +528,7 @@ impl HostMeshRef {
528528
for (host_rank, host) in self.ranks.iter().enumerate() {
529529
for per_host_rank in 0..per_host.num_ranks() {
530530
let proc_name = Name::new(format!("{}-{}", name, per_host_rank));
531-
let _ok = host
531+
let ok = host
532532
.mesh_agent()
533533
.create_or_update(cx, proc_name.clone(), ())
534534
.await
@@ -538,6 +538,14 @@ impl HostMeshRef {
538538
format!("failed while creating proc: {}", e),
539539
)
540540
})?;
541+
if !ok {
542+
// TODO: clean up the rest of the procs
543+
return Err(v1::Error::ProcCreationError {
544+
proc_name,
545+
mesh_agent: host.mesh_agent(),
546+
host_rank,
547+
});
548+
}
541549
procs.push(ProcRef::new(
542550
host.named_proc(&proc_name),
543551
per_host.num_ranks() * host_rank + per_host_rank,

hyperactor_mesh/src/v1/host_mesh/mesh_agent.rs

Lines changed: 16 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -97,6 +97,7 @@ impl Actor for HostMeshAgent {
9797

9898
#[async_trait]
9999
impl Handler<resource::CreateOrUpdate<()>> for HostMeshAgent {
100+
#[tracing::instrument("HostMeshAgent::CreateOrUpdate", level = "info", skip_all, fields(name=%create_or_update.name))]
100101
async fn handle(
101102
&mut self,
102103
cx: &Context<Self>,
@@ -108,22 +109,21 @@ impl Handler<resource::CreateOrUpdate<()>> for HostMeshAgent {
108109
}
109110

110111
let host = self.host.as_mut().expect("host present");
111-
let ok = self
112-
.created
113-
.insert(
114-
create_or_update.name.clone(),
115-
match host {
116-
HostAgentMode::Process(host) => {
117-
host.spawn(create_or_update.name.clone().to_string()).await
118-
}
119-
HostAgentMode::Local(host) => {
120-
host.spawn(create_or_update.name.clone().to_string()).await
121-
}
122-
},
123-
)
124-
.is_none();
125-
112+
let created = match host {
113+
HostAgentMode::Process(host) => {
114+
host.spawn(create_or_update.name.clone().to_string()).await
115+
}
116+
HostAgentMode::Local(host) => {
117+
host.spawn(create_or_update.name.clone().to_string()).await
118+
}
119+
};
120+
let ok = created.is_ok();
121+
if let Err(e) = &created {
122+
tracing::error!("failed to spawn proc {}: {}", create_or_update.name, e);
123+
}
124+
self.created.insert(create_or_update.name.clone(), created);
126125
create_or_update.reply.send(cx, ok)?;
126+
127127
Ok(())
128128
}
129129
}
@@ -242,6 +242,7 @@ impl Actor for HostMeshAgentProcMeshTrampoline {
242242
Some(command) => command,
243243
None => BootstrapCommand::current()?,
244244
};
245+
tracing::info!("booting host with proc command {:?}", command);
245246
let manager = BootstrapProcManager::new(command);
246247
let (host, _) = Host::serve(manager, transport.any()).await?;
247248
HostAgentMode::Process(host)

0 commit comments

Comments
 (0)