Skip to content

Commit 83b35d8

Browse files
process: fix cleanup race (#818)
Summary: Pull Request resolved: #818 remove `.kill_on_drop()` introduced by D79898223. fix the underlying race that necessitated it and kill child processes via a `Drop` implementation for `struct Child`. fix provided by mariusae Reviewed By: mariusae Differential Revision: D80022315 fbshipit-source-id: 1757e1eaf830cf50989173142e5c20c451fe7e70
1 parent d68e1d7 commit 83b35d8

File tree

1 file changed

+37
-14
lines changed

1 file changed

+37
-14
lines changed

hyperactor_mesh/src/alloc/process.rs

Lines changed: 37 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -135,9 +135,10 @@ struct Child {
135135
channel: ChannelState,
136136
group: monitor::Group,
137137
exit_flag: Option<flag::Flag>,
138-
stdout: LogTailer,
139-
stderr: LogTailer,
138+
stdout: Option<LogTailer>,
139+
stderr: Option<LogTailer>,
140140
stop_reason: Arc<OnceLock<ProcStopReason>>,
141+
process_pid: Arc<std::sync::Mutex<Option<i32>>>,
141142
}
142143

143144
impl Child {
@@ -179,27 +180,23 @@ impl Child {
179180
stderr_tee,
180181
);
181182

183+
let process_pid = Arc::new(std::sync::Mutex::new(Some(process.id().unwrap() as i32)));
184+
182185
let child = Self {
183186
local_rank,
184187
channel: ChannelState::NotConnected,
185188
group,
186189
exit_flag: Some(exit_flag),
187-
stdout,
188-
stderr,
190+
stdout: Some(stdout),
191+
stderr: Some(stderr),
189192
stop_reason: Arc::clone(&stop_reason),
193+
process_pid: process_pid.clone(),
190194
};
191195

192196
let monitor = async move {
193197
let reason = tokio::select! {
194198
_ = handle => {
195-
let Some(id) = process.id() else {
196-
tracing::error!("could not get child process id");
197-
return ProcStopReason::Unknown;
198-
};
199-
if let Err(e) = signal::kill(Pid::from_raw(id as i32), signal::SIGTERM) {
200-
tracing::error!("failed to kill child process: {}", e);
201-
return ProcStopReason::Unknown;
202-
};
199+
Self::ensure_killed(process_pid);
203200
Self::exit_status_to_reason(process.wait().await)
204201
}
205202
result = process.wait() => {
@@ -214,6 +211,25 @@ impl Child {
214211
(child, monitor)
215212
}
216213

214+
fn ensure_killed(pid: Arc<std::sync::Mutex<Option<i32>>>) {
215+
match pid.lock().unwrap().take() {
216+
Some(pid) => {
217+
if let Err(e) = signal::kill(Pid::from_raw(pid), signal::SIGTERM) {
218+
match e {
219+
nix::errno::Errno::ESRCH => {
220+
// Process already gone.
221+
tracing::debug!("pid {} already exited", pid);
222+
}
223+
_ => {
224+
tracing::error!("failed to kill {}: {}", pid, e);
225+
}
226+
}
227+
}
228+
}
229+
None => (),
230+
}
231+
}
232+
217233
fn exit_status_to_reason(result: io::Result<ExitStatus>) -> ProcStopReason {
218234
match result {
219235
Ok(status) if status.success() => ProcStopReason::Stopped,
@@ -300,6 +316,12 @@ impl Child {
300316
}
301317
}
302318

319+
impl Drop for Child {
320+
fn drop(&mut self) {
321+
Self::ensure_killed(self.process_pid.clone());
322+
}
323+
}
324+
303325
impl ProcessAlloc {
304326
// Also implement exit (for graceful exit)
305327

@@ -371,7 +393,6 @@ impl ProcessAlloc {
371393
cmd.env(bootstrap::BOOTSTRAP_LOG_CHANNEL, log_channel.to_string());
372394
cmd.stdout(Stdio::piped());
373395
cmd.stderr(Stdio::piped());
374-
cmd.kill_on_drop(true);
375396

376397
let proc_id = ProcId::Ranked(WorldId(self.name.to_string()), index);
377398
tracing::debug!("Spawning process {:?}", cmd);
@@ -472,7 +493,9 @@ impl Alloc for ProcessAlloc {
472493
},
473494

474495
Some(Ok((index, mut reason))) = self.children.join_next() => {
475-
let stderr_content = if let Some(Child { stdout, stderr, ..} ) = self.remove(index) {
496+
let stderr_content = if let Some(mut child) = self.remove(index) {
497+
let stdout = child.stdout.take().unwrap();
498+
let stderr = child.stderr.take().unwrap();
476499
stdout.abort();
477500
stderr.abort();
478501
let (_stdout, _) = stdout.join().await;

0 commit comments

Comments
 (0)