Skip to content

Commit 2115824

Browse files
authored
Merge pull request #1090 from rust-lang/robust-kill
Try to more thoroughly kill child Docker processes
2 parents ca5d2ca + d28aa3e commit 2115824

File tree

1 file changed

+72
-29
lines changed

1 file changed

+72
-29
lines changed

compiler/base/orchestrator/src/coordinator.rs

Lines changed: 72 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ use tokio::{
2525
};
2626
use tokio_stream::wrappers::ReceiverStream;
2727
use tokio_util::{io::SyncIoBridge, sync::CancellationToken};
28-
use tracing::{info_span, instrument, trace, trace_span, warn, Instrument};
28+
use tracing::{error, info_span, instrument, trace, trace_span, warn, Instrument};
2929

3030
use crate::{
3131
bincode_input_closed,
@@ -1150,7 +1150,7 @@ where
11501150
struct Container {
11511151
permit: Box<dyn ContainerPermit>,
11521152
task: JoinHandle<Result<()>>,
1153-
kill_child: Option<Command>,
1153+
kill_child: TerminateContainer,
11541154
modify_cargo_toml: ModifyCargoToml,
11551155
commander: Commander,
11561156
}
@@ -1908,24 +1908,14 @@ impl Container {
19081908
let Self {
19091909
permit,
19101910
task,
1911-
kill_child,
1911+
mut kill_child,
19121912
modify_cargo_toml,
19131913
commander,
19141914
} = self;
19151915
drop(commander);
19161916
drop(modify_cargo_toml);
19171917

1918-
if let Some(mut kill_child) = kill_child {
1919-
// We don't care if the command itself succeeds or not; it
1920-
// may already be dead!
1921-
let _ = kill_child
1922-
.stdin(Stdio::null())
1923-
.stdout(Stdio::null())
1924-
.stderr(Stdio::null())
1925-
.status()
1926-
.await
1927-
.context(KillWorkerSnafu)?;
1928-
}
1918+
kill_child.terminate_now().await?;
19291919

19301920
let r = task.await;
19311921
drop(permit);
@@ -2542,12 +2532,64 @@ pub enum CommanderError {
25422532
WorkerOperationFailed { source: SerializedError2 },
25432533
}
25442534

2535+
#[derive(Debug)]
2536+
pub struct TerminateContainer(Option<Command>);
2537+
2538+
impl TerminateContainer {
2539+
pub fn new(command: Command) -> Self {
2540+
Self(Some(command))
2541+
}
2542+
2543+
pub fn none() -> Self {
2544+
Self(None)
2545+
}
2546+
2547+
async fn terminate_now(&mut self) -> Result<(), TerminateContainerError> {
2548+
if let Some(mut kill_child) = self.take_command() {
2549+
// [ALREADY-DEAD] We don't care if the command itself
2550+
// succeeds or not; the container may already be dead!
2551+
let _ = kill_child.status().await?;
2552+
}
2553+
2554+
Ok(())
2555+
}
2556+
2557+
fn take_command(&mut self) -> Option<Command> {
2558+
self.0.take().map(|mut kill_child| {
2559+
kill_child
2560+
.stdin(Stdio::null())
2561+
.stdout(Stdio::null())
2562+
.stderr(Stdio::null());
2563+
kill_child
2564+
})
2565+
}
2566+
}
2567+
2568+
impl Drop for TerminateContainer {
2569+
fn drop(&mut self) {
2570+
if let Some(mut kill_child) = self.take_command() {
2571+
if let Err(e) = kill_child.as_std_mut().status() {
2572+
// See [ALREADY-DEAD]
2573+
error!("Unable to kill the container while dropping: {e}");
2574+
}
2575+
}
2576+
}
2577+
}
2578+
2579+
#[derive(Debug, Snafu)]
2580+
#[snafu(module)]
2581+
pub enum TerminateContainerError {
2582+
#[snafu(display("Unable to kill the child process"))]
2583+
#[snafu(context(false))]
2584+
Execute { source: std::io::Error },
2585+
}
2586+
25452587
pub trait Backend {
25462588
fn run_worker_in_background(
25472589
&self,
25482590
channel: Channel,
25492591
id: impl fmt::Display,
2550-
) -> Result<(Child, Option<Command>, ChildStdin, ChildStdout)> {
2592+
) -> Result<(Child, TerminateContainer, ChildStdin, ChildStdout)> {
25512593
let (mut start, kill) = self.prepare_worker_command(channel, id);
25522594

25532595
let mut child = start
@@ -2565,7 +2607,7 @@ pub trait Backend {
25652607
&self,
25662608
channel: Channel,
25672609
id: impl fmt::Display,
2568-
) -> (Command, Option<Command>);
2610+
) -> (Command, TerminateContainer);
25692611
}
25702612

25712613
impl<B> Backend for &B
@@ -2576,7 +2618,7 @@ where
25762618
&self,
25772619
channel: Channel,
25782620
id: impl fmt::Display,
2579-
) -> (Command, Option<Command>) {
2621+
) -> (Command, TerminateContainer) {
25802622
B::prepare_worker_command(self, channel, id)
25812623
}
25822624
}
@@ -2635,7 +2677,7 @@ impl Backend for DockerBackend {
26352677
&self,
26362678
channel: Channel,
26372679
id: impl fmt::Display,
2638-
) -> (Command, Option<Command>) {
2680+
) -> (Command, TerminateContainer) {
26392681
let name = format!("playground-{id}");
26402682

26412683
let mut command = basic_secure_docker_command();
@@ -2654,8 +2696,9 @@ impl Backend for DockerBackend {
26542696

26552697
let mut kill = Command::new("docker");
26562698
kill.arg("kill").args(["--signal", "KILL"]).arg(name);
2699+
let kill = TerminateContainer::new(kill);
26572700

2658-
(command, Some(kill))
2701+
(command, kill)
26592702
}
26602703
}
26612704

@@ -2688,8 +2731,8 @@ pub enum Error {
26882731
#[snafu(display("The IO queue task panicked"))]
26892732
IoQueuePanicked { source: tokio::task::JoinError },
26902733

2691-
#[snafu(display("Unable to kill the child process"))]
2692-
KillWorker { source: std::io::Error },
2734+
#[snafu(transparent)]
2735+
KillWorker { source: TerminateContainerError },
26932736

26942737
#[snafu(display("The container task panicked"))]
26952738
ContainerTaskPanicked { source: tokio::task::JoinError },
@@ -2863,14 +2906,14 @@ mod tests {
28632906
&self,
28642907
channel: Channel,
28652908
_id: impl fmt::Display,
2866-
) -> (Command, Option<Command>) {
2909+
) -> (Command, TerminateContainer) {
28672910
let channel_dir = self.project_dir.path().join(channel.to_str());
28682911

28692912
let mut command = Command::new("./target/debug/worker");
28702913
command.env("RUSTUP_TOOLCHAIN", channel.to_str());
28712914
command.arg(channel_dir);
28722915

2873-
(command, None)
2916+
(command, TerminateContainer::none())
28742917
}
28752918
}
28762919

@@ -2982,7 +3025,7 @@ mod tests {
29823025

29833026
coordinator.shutdown().await?;
29843027

2985-
Ok(())
3028+
Ok::<_, Error>(())
29863029
});
29873030

29883031
try_join_all(tests).with_timeout().await?;
@@ -3028,7 +3071,7 @@ mod tests {
30283071

30293072
coordinator.shutdown().await?;
30303073

3031-
Ok(())
3074+
Ok::<_, Error>(())
30323075
},
30333076
)
30343077
});
@@ -3067,7 +3110,7 @@ mod tests {
30673110

30683111
coordinator.shutdown().await?;
30693112

3070-
Ok(())
3113+
Ok::<_, Error>(())
30713114
});
30723115

30733116
try_join_all(tests).with_timeout().await?;
@@ -3097,7 +3140,7 @@ mod tests {
30973140

30983141
coordinator.shutdown().await?;
30993142

3100-
Ok(())
3143+
Ok::<_, Error>(())
31013144
});
31023145

31033146
try_join_all(tests).with_timeout().await?;
@@ -3134,7 +3177,7 @@ mod tests {
31343177

31353178
coordinator.shutdown().await?;
31363179

3137-
Ok(())
3180+
Ok::<_, Error>(())
31383181
});
31393182

31403183
try_join_all(tests).with_timeout().await?;
@@ -3857,7 +3900,7 @@ mod tests {
38573900

38583901
coordinator.shutdown().await?;
38593902

3860-
Ok(())
3903+
Ok::<_, Error>(())
38613904
},
38623905
)
38633906
});

0 commit comments

Comments
 (0)