Skip to content

Commit 53a2a2d

Browse files
mariusae, meta-codesync[bot]
authored and committed
include detailed proc status, command in ProcState when available (#1378)
Summary: Pull Request resolved: #1378 Useful when debugging bootstrapping issues, and to show in error messages. ghstack-source-id: 313885565 exported-using-ghexport Reviewed By: shayne-fletcher Differential Revision: D83535144 fbshipit-source-id: d6677bb8f46474eeb0a37ee55416b7ec37b595b7
1 parent cae88da commit 53a2a2d

File tree

3 files changed

+43
-11
lines changed

3 files changed

+43
-11
lines changed

hyperactor/src/host.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,11 @@ impl<M: ProcManager> Host<M> {
176176
Ok((host, frontend_handle))
177177
}
178178

179+
/// The underlying proc manager.
180+
pub fn manager(&self) -> &M {
181+
&self.manager
182+
}
183+
179184
/// The address which accepts messages destined for this host.
180185
pub fn addr(&self) -> &ChannelAddr {
181186
&self.frontend_addr

hyperactor_mesh/src/bootstrap.rs

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,7 @@ pub fn install_pdeathsig_kill() -> io::Result<()> {
418418
/// In short:
419419
/// - `ProcState`/`ProcStopReason`: historical / event-driven model
420420
/// - `ProcStatus`: immediate status surface for lifecycle control
421-
#[derive(Debug, Clone)]
421+
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
422422
pub enum ProcStatus {
423423
/// The OS process has been spawned but is not yet fully running.
424424
/// (Process-level: child handle exists, no confirmation yet.)
@@ -427,7 +427,7 @@ pub enum ProcStatus {
427427
/// (Process-level: `pid` is known; Proc-level: bootstrap
428428
/// may still be running.)
429429
Running { pid: u32, started_at: SystemTime },
430-
/// Ready means boostrap has completed and the proc is serving.
430+
/// Ready means bootstrap has completed and the proc is serving.
431431
/// (Process-level: `pid` is known; Proc-level: bootstrap
432432
/// completed.)
433433
Ready {
@@ -1301,7 +1301,7 @@ impl hyperactor::host::ProcHandle for BootstrapProcHandle {
13011301
}
13021302

13031303
/// A specification of the command used to bootstrap procs.
1304-
#[derive(Debug, Named, Serialize, Deserialize, Clone, Default)]
1304+
#[derive(Debug, Named, Serialize, Deserialize, Clone, Default, PartialEq, Eq)]
13051305
pub struct BootstrapCommand {
13061306
pub program: std::path::PathBuf,
13071307
pub arg0: Option<String>,
@@ -1430,10 +1430,15 @@ impl BootstrapProcManager {
14301430
}
14311431
}
14321432

1433+
/// The bootstrap command used to launch processes.
1434+
pub fn command(&self) -> &BootstrapCommand {
1435+
&self.command
1436+
}
1437+
14331438
/// Return the current [`ProcStatus`] for the given [`ProcId`], if
14341439
/// the proc is known to this manager.
14351440
///
1436-
/// This queries the live [`BootstrapProcHandle`] stored in the
1441+
/// This queries the live [`BootstrapProcHandle`] stored in the
14371442
/// manager's internal map. It provides an immediate snapshot of
14381443
/// lifecycle state (`Starting`, `Running`, `Stopping`, `Stopped`,
14391444
/// etc.).

hyperactor_mesh/src/v1/host_mesh/mesh_agent.rs

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ use std::fmt;
1313
use std::pin::Pin;
1414

1515
use async_trait::async_trait;
16+
use enum_as_inner::EnumAsInner;
1617
use hyperactor::Actor;
1718
use hyperactor::ActorHandle;
1819
use hyperactor::ActorRef;
@@ -31,6 +32,7 @@ use hyperactor::host::LocalProcManager;
3132
use serde::Deserialize;
3233
use serde::Serialize;
3334

35+
use crate::bootstrap;
3436
use crate::bootstrap::BootstrapCommand;
3537
use crate::bootstrap::BootstrapProcManager;
3638
use crate::proc_mesh::mesh_agent::ProcMeshAgent;
@@ -51,6 +53,7 @@ type ProcManagerSpawnFn = Box<dyn Fn(Proc) -> ProcManagerSpawnFuture + Send + Sy
5153
///
5254
/// This abstraction lets the same `HostAgent` work across both
5355
/// out-of-process and in-process execution modes.
56+
#[derive(EnumAsInner)]
5457
pub enum HostAgentMode {
5558
Process(Host<BootstrapProcManager>),
5659
Local(Host<LocalProcManager<ProcManagerSpawnFn>>),
@@ -173,6 +176,8 @@ impl Handler<ShutdownHost> for HostMeshAgent {
173176
pub struct ProcState {
174177
pub proc_id: ProcId,
175178
pub mesh_agent: ActorRef<ProcMeshAgent>,
179+
pub bootstrap_command: Option<BootstrapCommand>,
180+
pub proc_status: Option<bootstrap::ProcStatus>,
176181
}
177182

178183
#[async_trait]
@@ -182,13 +187,24 @@ impl Handler<resource::GetState<ProcState>> for HostMeshAgent {
182187
cx: &Context<Self>,
183188
get_state: resource::GetState<ProcState>,
184189
) -> anyhow::Result<()> {
190+
let manager = self
191+
.host
192+
.as_mut()
193+
.expect("host")
194+
.as_process()
195+
.map(Host::manager);
185196
let state = match self.created.get(&get_state.name) {
186197
Some(Ok((proc_id, mesh_agent))) => resource::State {
187198
name: get_state.name.clone(),
188199
status: resource::Status::Running,
189200
state: Some(ProcState {
190201
proc_id: proc_id.clone(),
191202
mesh_agent: mesh_agent.clone(),
203+
bootstrap_command: manager.map(|m| m.command().clone()),
204+
proc_status: match manager {
205+
Some(manager) => Some(manager.status(proc_id).await.unwrap()),
206+
None => None,
207+
},
192208
}),
193209
},
194210
Some(Err(e)) => resource::State {
@@ -288,10 +304,13 @@ impl Handler<GetHostMeshAgent> for HostMeshAgentProcMeshTrampoline {
288304

289305
#[cfg(test)]
290306
mod tests {
307+
use std::assert_matches::assert_matches;
308+
291309
use hyperactor::Proc;
292310
use hyperactor::channel::ChannelTransport;
293311

294312
use super::*;
313+
use crate::bootstrap::ProcStatus;
295314
use crate::resource::CreateOrUpdateClient;
296315
use crate::resource::GetStateClient;
297316

@@ -326,21 +345,24 @@ mod tests {
326345
.await
327346
.unwrap()
328347
);
329-
assert_eq!(
348+
assert_matches!(
330349
host_agent.get_state(&client, name.clone()).await.unwrap(),
331350
resource::State {
332-
name: name.clone(),
351+
name: resource_name,
333352
status: resource::Status::Running,
334353
state: Some(ProcState {
335354
// The proc itself should be direct addressed, with its name directly.
336-
proc_id: ProcId::Direct(host_addr.clone(), name.to_string()),
355+
proc_id,
337356
// The mesh agent should run in the same proc, under the name
338357
// "agent".
339-
mesh_agent: ActorRef::attest(
340-
ProcId::Direct(host_addr.clone(), name.to_string()).actor_id("agent", 0)
341-
),
358+
mesh_agent,
359+
bootstrap_command,
360+
proc_status: Some(ProcStatus::Ready { pid: _, started_at: _, addr: _, agent: proc_status_mesh_agent}),
342361
}),
343-
}
362+
} if name == resource_name
363+
&& proc_id == ProcId::Direct(host_addr.clone(), name.to_string())
364+
&& mesh_agent == ActorRef::attest(ProcId::Direct(host_addr.clone(), name.to_string()).actor_id("agent", 0)) && bootstrap_command == Some(BootstrapCommand::test())
365+
&& mesh_agent == proc_status_mesh_agent
344366
);
345367
}
346368
}

0 commit comments

Comments
 (0)