diff --git a/hyperactor_mesh/src/v1/actor_mesh.rs b/hyperactor_mesh/src/v1/actor_mesh.rs index 0710883a3..3fc6f9d4a 100644 --- a/hyperactor_mesh/src/v1/actor_mesh.rs +++ b/hyperactor_mesh/src/v1/actor_mesh.rs @@ -405,6 +405,7 @@ mod tests { use crate::v1::ActorMeshRef; use crate::v1::Name; use crate::v1::ProcMesh; + use crate::v1::proc_mesh::GET_ACTOR_STATE_MAX_IDLE; use crate::v1::testactor; use crate::v1::testing; @@ -577,6 +578,9 @@ mod tests { async fn test_actor_states_with_process_exit() { hyperactor_telemetry::initialize_logging_for_test(); + let config = hyperactor::config::global::lock(); + let _guard = config.override_key(GET_ACTOR_STATE_MAX_IDLE, Duration::from_secs(1)); + let instance = testing::instance().await; // Listen for supervision events sent to the parent instance. let (supervision_port, mut supervision_receiver) = diff --git a/hyperactor_mesh/src/v1/proc_mesh.rs b/hyperactor_mesh/src/v1/proc_mesh.rs index b27f97e7a..6365fb9f1 100644 --- a/hyperactor_mesh/src/v1/proc_mesh.rs +++ b/hyperactor_mesh/src/v1/proc_mesh.rs @@ -72,6 +72,9 @@ declare_attrs! { /// The maximum idle time between updates while spawning actor meshes. @meta(CONFIG_ENV_VAR = "HYPERACTOR_MESH_ACTOR_SPAWN_MAX_IDLE".to_string()) pub attr ACTOR_SPAWN_MAX_IDLE: Duration = Duration::from_secs(30); + + @meta(CONFIG_ENV_VAR = "HYPERACTOR_MESH_GET_ACTOR_STATE_MAX_IDLE".to_string()) + pub attr GET_ACTOR_STATE_MAX_IDLE: Duration = Duration::from_secs(30); } /// A reference to a single [`hyperactor::Proc`]. @@ -545,7 +548,9 @@ impl ProcMeshRef { // the agent will be unresponsive. // We handle this by setting a timeout on the recv, and if we don't get a // message we assume the agent is dead and return a failed state. - let state = RealClock.timeout(Duration::from_secs(1), rx.recv()).await; + let state = RealClock + .timeout(config::global::get(GET_ACTOR_STATE_MAX_IDLE), rx.recv()) + .await; if let Ok(state) = state { // Handle non-timeout receiver error. let state = state?; diff --git a/python/tests/test_actor_error.py b/python/tests/test_actor_error.py index 34ed57611..b1a7e4400 100644 --- a/python/tests/test_actor_error.py +++ b/python/tests/test_actor_error.py @@ -747,7 +747,7 @@ async def test_supervision_with_proc_mesh_stopped(mesh, v1: bool) -> None: # TODO - re-enable after resolving T232206970 @pytest.mark.oss_skip @pytest.mark.parametrize("v1", [True, False]) -@pytest.mark.timeout(30) +@pytest.mark.timeout(60) async def test_supervision_with_sending_error(v1: bool) -> None: # Messages of length > this will cause a send error and a returned # undeliverable.