Skip to content

Commit 3768014

Browse files
pablorfb-meta authored and facebook-github-bot committed
Support configurable heartbeat timeout (#698)
Summary: Pull Request resolved: #698. Move the RemoteAlloc heartbeat (HB) timeout into the configs. Monarch runs with 1k+ workers time out on allocation with the default heartbeat setting (1s). This is not a fix; it simply extends configurability to the Python side. Reviewed By: vidhyav. Differential Revision: D79064585. fbshipit-source-id: f9a7857c087598951d80af94c393cb3d80e89103
1 parent e54f601 commit 3768014

File tree

6 files changed

+82
-87
lines changed

6 files changed

+82
-87
lines changed

hyperactor/src/config.rs

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -46,6 +46,9 @@ declare_attrs! {
4646

4747
/// Timeout used by proc mesh for stopping an actor.
4848
pub attr STOP_ACTOR_TIMEOUT: Duration = Duration::from_secs(1);
49+
50+
/// Heartbeat interval for remote allocator
51+
pub attr REMOTE_ALLOCATOR_HEARTBEAT_INTERVAL: Duration = Duration::from_secs(5);
4952
}
5053

5154
/// Load configuration from environment variables
@@ -87,6 +90,13 @@ pub fn from_env() -> Attrs {
8790
}
8891
}
8992

93+
// Load remote allocator heartbeat interval
94+
if let Ok(val) = env::var("HYPERACTOR_REMOTE_ALLOCATOR_HEARTBEAT_INTERVAL_SECS") {
95+
if let Ok(parsed) = val.parse::<u64>() {
96+
config[REMOTE_ALLOCATOR_HEARTBEAT_INTERVAL] = Duration::from_secs(parsed);
97+
}
98+
}
99+
90100
config
91101
}
92102

@@ -122,6 +132,9 @@ pub fn merge(config: &mut Attrs, other: &Attrs) {
122132
if other.contains_key(SPLIT_MAX_BUFFER_SIZE) {
123133
config[SPLIT_MAX_BUFFER_SIZE] = other[SPLIT_MAX_BUFFER_SIZE];
124134
}
135+
if other.contains_key(REMOTE_ALLOCATOR_HEARTBEAT_INTERVAL) {
136+
config[REMOTE_ALLOCATOR_HEARTBEAT_INTERVAL] = other[REMOTE_ALLOCATOR_HEARTBEAT_INTERVAL];
137+
}
125138
}
126139

127140
/// Global configuration functions
@@ -292,6 +305,10 @@ mod tests {
292305
);
293306
assert_eq!(config[MESSAGE_ACK_EVERY_N_MESSAGES], 1000);
294307
assert_eq!(config[SPLIT_MAX_BUFFER_SIZE], 5);
308+
assert_eq!(
309+
config[REMOTE_ALLOCATOR_HEARTBEAT_INTERVAL],
310+
Duration::from_secs(5)
311+
);
295312
}
296313

297314
#[test]

0 commit comments

Comments
 (0)