Skip to content

Commit cc89acb

Browse files
committed
[monarch][supervision] Increase GetState::<ActorState> default timeout and make it configurable
Pull Request resolved: #1474. The 1-second timeout previously hard-coded for `GetState::<ActorState>` during supervision was causing at least one test to fail. This diff makes the timeout configurable and raises its default to 30 seconds; the failing test now passes. Example test failure that shows the supervision timeout: P1984316048. ghstack-source-id: 315174069. Differential Revision: [D84232284](https://our.internmc.facebook.com/intern/diff/D84232284/). **NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments; please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D84232284/)!
1 parent 4c893bc commit cc89acb

File tree

3 files changed

+11
-2
lines changed

3 files changed

+11
-2
lines changed

hyperactor_mesh/src/v1/actor_mesh.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,7 @@ mod tests {
405405
use crate::v1::ActorMeshRef;
406406
use crate::v1::Name;
407407
use crate::v1::ProcMesh;
408+
use crate::v1::proc_mesh::GET_ACTOR_STATE_MAX_IDLE;
408409
use crate::v1::testactor;
409410
use crate::v1::testing;
410411

@@ -577,6 +578,9 @@ mod tests {
577578
async fn test_actor_states_with_process_exit() {
578579
hyperactor_telemetry::initialize_logging_for_test();
579580

581+
let config = hyperactor::config::global::lock();
582+
let _guard = config.override_key(GET_ACTOR_STATE_MAX_IDLE, Duration::from_secs(1));
583+
580584
let instance = testing::instance().await;
581585
// Listen for supervision events sent to the parent instance.
582586
let (supervision_port, mut supervision_receiver) =

hyperactor_mesh/src/v1/proc_mesh.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,9 @@ declare_attrs! {
7272
/// The maximum idle time between updates while spawning actor meshes.
7373
@meta(CONFIG_ENV_VAR = "HYPERACTOR_MESH_ACTOR_SPAWN_MAX_IDLE".to_string())
7474
pub attr ACTOR_SPAWN_MAX_IDLE: Duration = Duration::from_secs(30);
75+
76+
@meta(CONFIG_ENV_VAR = "HYPERACTOR_MESH_GET_ACTOR_STATE_MAX_IDLE".to_string())
77+
pub attr GET_ACTOR_STATE_MAX_IDLE: Duration = Duration::from_secs(30);
7578
}
7679

7780
/// A reference to a single [`hyperactor::Proc`].
@@ -545,7 +548,9 @@ impl ProcMeshRef {
545548
// the agent will be unresponsive.
546549
// We handle this by setting a timeout on the recv, and if we don't get a
547550
// message we assume the agent is dead and return a failed state.
548-
let state = RealClock.timeout(Duration::from_secs(1), rx.recv()).await;
551+
let state = RealClock
552+
.timeout(config::global::get(GET_ACTOR_STATE_MAX_IDLE), rx.recv())
553+
.await;
549554
if let Ok(state) = state {
550555
// Handle non-timeout receiver error.
551556
let state = state?;

python/tests/test_actor_error.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -747,7 +747,7 @@ async def test_supervision_with_proc_mesh_stopped(mesh, v1: bool) -> None:
747747
# TODO - re-enable after resolving T232206970
748748
@pytest.mark.oss_skip
749749
@pytest.mark.parametrize("v1", [True, False])
750-
@pytest.mark.timeout(30)
750+
@pytest.mark.timeout(60)
751751
async def test_supervision_with_sending_error(v1: bool) -> None:
752752
# Messages of length > this will cause a send error and a returned
753753
# undeliverable.

0 commit comments

Comments
 (0)