Skip to content

Commit d28aa3e

Browse files
committed
Try to more thoroughly kill child Docker processes
During a recent outage, the Docker service reported 161 running `docker` processes, while the playground was only aware of 27. Each `docker` processes (and their children) use about 30 MiB of RAM. That would total 4.7 GiB of memory, while the instance only has 4GiB. My hypothesis is that we are starting containers, then encountering an error and bubbling it up to the caller. Along the way, we skip the call to shutdown the container. This change tries to shutdown the container when the value is dropped. This should allow it to be handled in error cases as well. It's possible that this might also help with some of the zombie processes we see from time-to-time!
1 parent ca5d2ca commit d28aa3e

File tree

1 file changed

+72
-29
lines changed

1 file changed

+72
-29
lines changed

compiler/base/orchestrator/src/coordinator.rs

Lines changed: 72 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ use tokio::{
2525
};
2626
use tokio_stream::wrappers::ReceiverStream;
2727
use tokio_util::{io::SyncIoBridge, sync::CancellationToken};
28-
use tracing::{info_span, instrument, trace, trace_span, warn, Instrument};
28+
use tracing::{error, info_span, instrument, trace, trace_span, warn, Instrument};
2929

3030
use crate::{
3131
bincode_input_closed,
@@ -1150,7 +1150,7 @@ where
11501150
struct Container {
11511151
permit: Box<dyn ContainerPermit>,
11521152
task: JoinHandle<Result<()>>,
1153-
kill_child: Option<Command>,
1153+
kill_child: TerminateContainer,
11541154
modify_cargo_toml: ModifyCargoToml,
11551155
commander: Commander,
11561156
}
@@ -1908,24 +1908,14 @@ impl Container {
19081908
let Self {
19091909
permit,
19101910
task,
1911-
kill_child,
1911+
mut kill_child,
19121912
modify_cargo_toml,
19131913
commander,
19141914
} = self;
19151915
drop(commander);
19161916
drop(modify_cargo_toml);
19171917

1918-
if let Some(mut kill_child) = kill_child {
1919-
// We don't care if the command itself succeeds or not; it
1920-
// may already be dead!
1921-
let _ = kill_child
1922-
.stdin(Stdio::null())
1923-
.stdout(Stdio::null())
1924-
.stderr(Stdio::null())
1925-
.status()
1926-
.await
1927-
.context(KillWorkerSnafu)?;
1928-
}
1918+
kill_child.terminate_now().await?;
19291919

19301920
let r = task.await;
19311921
drop(permit);
@@ -2542,12 +2532,64 @@ pub enum CommanderError {
25422532
WorkerOperationFailed { source: SerializedError2 },
25432533
}
25442534

2535+
#[derive(Debug)]
2536+
pub struct TerminateContainer(Option<Command>);
2537+
2538+
impl TerminateContainer {
2539+
pub fn new(command: Command) -> Self {
2540+
Self(Some(command))
2541+
}
2542+
2543+
pub fn none() -> Self {
2544+
Self(None)
2545+
}
2546+
2547+
async fn terminate_now(&mut self) -> Result<(), TerminateContainerError> {
2548+
if let Some(mut kill_child) = self.take_command() {
2549+
// [ALREADY-DEAD] We don't care if the command itself
2550+
// succeeds or not; the container may already be dead!
2551+
let _ = kill_child.status().await?;
2552+
}
2553+
2554+
Ok(())
2555+
}
2556+
2557+
fn take_command(&mut self) -> Option<Command> {
2558+
self.0.take().map(|mut kill_child| {
2559+
kill_child
2560+
.stdin(Stdio::null())
2561+
.stdout(Stdio::null())
2562+
.stderr(Stdio::null());
2563+
kill_child
2564+
})
2565+
}
2566+
}
2567+
2568+
impl Drop for TerminateContainer {
2569+
fn drop(&mut self) {
2570+
if let Some(mut kill_child) = self.take_command() {
2571+
if let Err(e) = kill_child.as_std_mut().status() {
2572+
// See [ALREADY-DEAD]
2573+
error!("Unable to kill the container while dropping: {e}");
2574+
}
2575+
}
2576+
}
2577+
}
2578+
2579+
#[derive(Debug, Snafu)]
2580+
#[snafu(module)]
2581+
pub enum TerminateContainerError {
2582+
#[snafu(display("Unable to kill the child process"))]
2583+
#[snafu(context(false))]
2584+
Execute { source: std::io::Error },
2585+
}
2586+
25452587
pub trait Backend {
25462588
fn run_worker_in_background(
25472589
&self,
25482590
channel: Channel,
25492591
id: impl fmt::Display,
2550-
) -> Result<(Child, Option<Command>, ChildStdin, ChildStdout)> {
2592+
) -> Result<(Child, TerminateContainer, ChildStdin, ChildStdout)> {
25512593
let (mut start, kill) = self.prepare_worker_command(channel, id);
25522594

25532595
let mut child = start
@@ -2565,7 +2607,7 @@ pub trait Backend {
25652607
&self,
25662608
channel: Channel,
25672609
id: impl fmt::Display,
2568-
) -> (Command, Option<Command>);
2610+
) -> (Command, TerminateContainer);
25692611
}
25702612

25712613
impl<B> Backend for &B
@@ -2576,7 +2618,7 @@ where
25762618
&self,
25772619
channel: Channel,
25782620
id: impl fmt::Display,
2579-
) -> (Command, Option<Command>) {
2621+
) -> (Command, TerminateContainer) {
25802622
B::prepare_worker_command(self, channel, id)
25812623
}
25822624
}
@@ -2635,7 +2677,7 @@ impl Backend for DockerBackend {
26352677
&self,
26362678
channel: Channel,
26372679
id: impl fmt::Display,
2638-
) -> (Command, Option<Command>) {
2680+
) -> (Command, TerminateContainer) {
26392681
let name = format!("playground-{id}");
26402682

26412683
let mut command = basic_secure_docker_command();
@@ -2654,8 +2696,9 @@ impl Backend for DockerBackend {
26542696

26552697
let mut kill = Command::new("docker");
26562698
kill.arg("kill").args(["--signal", "KILL"]).arg(name);
2699+
let kill = TerminateContainer::new(kill);
26572700

2658-
(command, Some(kill))
2701+
(command, kill)
26592702
}
26602703
}
26612704

@@ -2688,8 +2731,8 @@ pub enum Error {
26882731
#[snafu(display("The IO queue task panicked"))]
26892732
IoQueuePanicked { source: tokio::task::JoinError },
26902733

2691-
#[snafu(display("Unable to kill the child process"))]
2692-
KillWorker { source: std::io::Error },
2734+
#[snafu(transparent)]
2735+
KillWorker { source: TerminateContainerError },
26932736

26942737
#[snafu(display("The container task panicked"))]
26952738
ContainerTaskPanicked { source: tokio::task::JoinError },
@@ -2863,14 +2906,14 @@ mod tests {
28632906
&self,
28642907
channel: Channel,
28652908
_id: impl fmt::Display,
2866-
) -> (Command, Option<Command>) {
2909+
) -> (Command, TerminateContainer) {
28672910
let channel_dir = self.project_dir.path().join(channel.to_str());
28682911

28692912
let mut command = Command::new("./target/debug/worker");
28702913
command.env("RUSTUP_TOOLCHAIN", channel.to_str());
28712914
command.arg(channel_dir);
28722915

2873-
(command, None)
2916+
(command, TerminateContainer::none())
28742917
}
28752918
}
28762919

@@ -2982,7 +3025,7 @@ mod tests {
29823025

29833026
coordinator.shutdown().await?;
29843027

2985-
Ok(())
3028+
Ok::<_, Error>(())
29863029
});
29873030

29883031
try_join_all(tests).with_timeout().await?;
@@ -3028,7 +3071,7 @@ mod tests {
30283071

30293072
coordinator.shutdown().await?;
30303073

3031-
Ok(())
3074+
Ok::<_, Error>(())
30323075
},
30333076
)
30343077
});
@@ -3067,7 +3110,7 @@ mod tests {
30673110

30683111
coordinator.shutdown().await?;
30693112

3070-
Ok(())
3113+
Ok::<_, Error>(())
30713114
});
30723115

30733116
try_join_all(tests).with_timeout().await?;
@@ -3097,7 +3140,7 @@ mod tests {
30973140

30983141
coordinator.shutdown().await?;
30993142

3100-
Ok(())
3143+
Ok::<_, Error>(())
31013144
});
31023145

31033146
try_join_all(tests).with_timeout().await?;
@@ -3134,7 +3177,7 @@ mod tests {
31343177

31353178
coordinator.shutdown().await?;
31363179

3137-
Ok(())
3180+
Ok::<_, Error>(())
31383181
});
31393182

31403183
try_join_all(tests).with_timeout().await?;
@@ -3857,7 +3900,7 @@ mod tests {
38573900

38583901
coordinator.shutdown().await?;
38593902

3860-
Ok(())
3903+
Ok::<_, Error>(())
38613904
},
38623905
)
38633906
});

0 commit comments

Comments
 (0)