Skip to content

Commit df1231d

Browse files
samlurye authored and meta-codesync[bot] committed
Increase GetState::<ActorState> default timeout and make it configurable (#1474)
Summary: Pull Request resolved: #1474

The current timeout of 1 second for `GetState::<ActorState>` for supervision was causing at least one test to fail. This diff makes the value configurable, and also increases the default to 30 seconds. The failing test now passes.

Example test failure that shows the supervision timeout: P1984316048

ghstack-source-id: 315174069
Reviewed By: mariusae
Differential Revision: D84232284
fbshipit-source-id: 4a56b45e4d0d828602883a68dd57bc044639b4dd
1 parent 6242eb9 commit df1231d

File tree

3 files changed

+11
-2
lines changed

3 files changed

+11
-2
lines changed

hyperactor_mesh/src/v1/actor_mesh.rs

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -405,6 +405,7 @@ mod tests {
405405
use crate::v1::ActorMeshRef;
406406
use crate::v1::Name;
407407
use crate::v1::ProcMesh;
408+
use crate::v1::proc_mesh::GET_ACTOR_STATE_MAX_IDLE;
408409
use crate::v1::testactor;
409410
use crate::v1::testing;
410411

@@ -577,6 +578,9 @@ mod tests {
577578
async fn test_actor_states_with_process_exit() {
578579
hyperactor_telemetry::initialize_logging_for_test();
579580

581+
let config = hyperactor::config::global::lock();
582+
let _guard = config.override_key(GET_ACTOR_STATE_MAX_IDLE, Duration::from_secs(1));
583+
580584
let instance = testing::instance().await;
581585
// Listen for supervision events sent to the parent instance.
582586
let (supervision_port, mut supervision_receiver) =

hyperactor_mesh/src/v1/proc_mesh.rs

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -72,6 +72,9 @@ declare_attrs! {
7272
/// The maximum idle time between updates while spawning actor meshes.
7373
@meta(CONFIG_ENV_VAR = "HYPERACTOR_MESH_ACTOR_SPAWN_MAX_IDLE".to_string())
7474
pub attr ACTOR_SPAWN_MAX_IDLE: Duration = Duration::from_secs(30);
75+
76+
@meta(CONFIG_ENV_VAR = "HYPERACTOR_MESH_GET_ACTOR_STATE_MAX_IDLE".to_string())
77+
pub attr GET_ACTOR_STATE_MAX_IDLE: Duration = Duration::from_secs(30);
7578
}
7679

7780
/// A reference to a single [`hyperactor::Proc`].
@@ -545,7 +548,9 @@ impl ProcMeshRef {
545548
// the agent will be unresponsive.
546549
// We handle this by setting a timeout on the recv, and if we don't get a
547550
// message we assume the agent is dead and return a failed state.
548-
let state = RealClock.timeout(Duration::from_secs(1), rx.recv()).await;
551+
let state = RealClock
552+
.timeout(config::global::get(GET_ACTOR_STATE_MAX_IDLE), rx.recv())
553+
.await;
549554
if let Ok(state) = state {
550555
// Handle non-timeout receiver error.
551556
let state = state?;

python/tests/test_actor_error.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -778,7 +778,7 @@ async def test_supervision_with_proc_mesh_stopped(mesh, api_ver: ApiVersion) ->
778778
# TODO - re-enable after resolving T232206970
779779
@pytest.mark.oss_skip
780780
@pytest.mark.parametrize("api_ver", [ApiVersion.V0, ApiVersion.V1])
781-
@pytest.mark.timeout(30)
781+
@pytest.mark.timeout(60)
782782
async def test_supervision_with_sending_error(api_ver: ApiVersion) -> None:
783783
# Messages of length > this will cause a send error and a returned
784784
# undeliverable.

0 commit comments

Comments
 (0)