-
Notifications
You must be signed in to change notification settings - Fork 129
blockdev: implement signal-safe loopback device cleanup helper #1449
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
@@ -1,12 +1,12 @@ | ||||
use std::collections::HashMap; | ||||
use std::env; | ||||
use std::path::Path; | ||||
use std::process::Command; | ||||
use std::path::PathBuf; | ||||
use std::process::{Command, Stdio}; | ||||
use std::sync::OnceLock; | ||||
|
||||
use anyhow::{anyhow, Context, Result}; | ||||
use camino::Utf8Path; | ||||
use camino::Utf8PathBuf; | ||||
use camino::{Utf8Path, Utf8PathBuf}; | ||||
use fn_error_context::context; | ||||
use regex::Regex; | ||||
use serde::Deserialize; | ||||
|
@@ -181,6 +181,14 @@ pub fn partitions_of(dev: &Utf8Path) -> Result<PartitionTable> { | |||
|
||||
pub struct LoopbackDevice { | ||||
pub dev: Option<Utf8PathBuf>, | ||||
// Handle to the cleanup helper process | ||||
cleanup_handle: Option<LoopbackCleanupHandle>, | ||||
} | ||||
|
||||
/// Handle to manage the cleanup helper process for loopback devices | ||||
struct LoopbackCleanupHandle { | ||||
/// Child process handle | ||||
child: std::process::Child, | ||||
} | ||||
|
||||
impl LoopbackDevice { | ||||
|
@@ -208,7 +216,25 @@ impl LoopbackDevice { | |||
.run_get_string()?; | ||||
let dev = Utf8PathBuf::from(dev.trim()); | ||||
tracing::debug!("Allocated loopback {dev}"); | ||||
Ok(Self { dev: Some(dev) }) | ||||
|
||||
// Try to spawn cleanup helper, but don't fail if it doesn't work | ||||
let cleanup_handle = match Self::spawn_cleanup_helper(dev.as_str()) { | ||||
Ok(handle) => Some(handle), | ||||
Err(e) => { | ||||
tracing::warn!( | ||||
"Failed to spawn loopback cleanup helper for {}: {}. \ | ||||
Loopback device may not be cleaned up if process is interrupted.", | ||||
dev, | ||||
e | ||||
); | ||||
None | ||||
} | ||||
}; | ||||
|
||||
Ok(Self { | ||||
dev: Some(dev), | ||||
cleanup_handle, | ||||
}) | ||||
} | ||||
|
||||
// Access the path to the loopback block device. | ||||
|
@@ -217,13 +243,85 @@ impl LoopbackDevice { | |||
self.dev.as_deref().unwrap() | ||||
} | ||||
|
||||
/// Spawn a cleanup helper process that will clean up the loopback device | ||||
/// if the parent process dies unexpectedly | ||||
fn spawn_cleanup_helper(device_path: &str) -> Result<LoopbackCleanupHandle> { | ||||
// Try multiple strategies to find the bootc binary | ||||
let bootc_path = Self::find_bootc_binary() | ||||
.context("Failed to locate bootc binary for cleanup helper")?; | ||||
|
||||
// Create the helper process | ||||
let mut cmd = Command::new(bootc_path); | ||||
cmd.args(["loopback-cleanup-helper", "--device", device_path]); | ||||
|
||||
// Set environment variable to indicate this is a cleanup helper | ||||
cmd.env("BOOTC_LOOPBACK_CLEANUP_HELPER", "1"); | ||||
|
||||
// Set up stdio to redirect to /dev/null | ||||
cmd.stdin(Stdio::null()); | ||||
cmd.stdout(Stdio::null()); | ||||
// Don't redirect stderr so we can see error messages | ||||
|
||||
// Spawn the process | ||||
let child = cmd | ||||
.spawn() | ||||
.context("Failed to spawn loopback cleanup helper")?; | ||||
|
||||
Ok(LoopbackCleanupHandle { child }) | ||||
} | ||||
|
||||
/// Find the bootc binary using multiple strategies | ||||
fn find_bootc_binary() -> Result<PathBuf> { | ||||
// Strategy 1: Try /proc/self/exe (works in most cases) | ||||
if let Ok(exe_path) = std::fs::read_link("/proc/self/exe") { | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK so this is trying to handle the original failure...but it's really unclear to me why it was failing and specifically how that test case run is different than the existing loopback tests we have. (original issue in #1439 ) As far as I know I think what would be good here is that if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm random idea as to the root cause - what if this is happening just when we need to re-exec the current process twice? And inside the container env? I'm starting to suspect it has to do with that... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Def could be. Adding some changes that handles the case where There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah yes it's this almost for sure Line 91 in f4b01ab
In some cases we're hitting that fallback, but not others. So...this is getting a bit complicated. Your fallback code here will definitely work in general....but I have some ideas to do something more sophisticated. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The more complex fix I think is to make an new internal "out of process drop helper" crate that we initialize before we do any re-exec dances. I.e. we fork/re-exec that helper and talk to it over IPC or so. It would be more involved though. |
||||
if exe_path.exists() { | ||||
return Ok(exe_path); | ||||
} else { | ||||
tracing::warn!("/proc/self/exe points to non-existent path: {:?}", exe_path); | ||||
} | ||||
} else { | ||||
tracing::warn!("Failed to read /proc/self/exe"); | ||||
} | ||||
|
||||
// Strategy 2: Try argv[0] from std::env | ||||
if let Some(argv0) = std::env::args().next() { | ||||
let argv0_path = PathBuf::from(argv0); | ||||
if argv0_path.is_absolute() && argv0_path.exists() { | ||||
return Ok(argv0_path); | ||||
} | ||||
// If it's relative, try to resolve it | ||||
if let Ok(canonical) = argv0_path.canonicalize() { | ||||
return Ok(canonical); | ||||
} | ||||
} | ||||
|
||||
// Strategy 3: Try common installation paths | ||||
let common_paths = ["/usr/bin/bootc", "/usr/local/bin/bootc"]; | ||||
|
||||
for path in &common_paths { | ||||
let path_buf = PathBuf::from(path); | ||||
if path_buf.exists() { | ||||
return Ok(path_buf); | ||||
} | ||||
} | ||||
|
||||
anyhow::bail!("Could not locate bootc binary using any available strategy") | ||||
} | ||||
|
||||
// Shared backend for our `close` and `drop` implementations. | ||||
fn impl_close(&mut self) -> Result<()> { | ||||
// SAFETY: This is the only place we take the option | ||||
let Some(dev) = self.dev.take() else { | ||||
tracing::trace!("loopback device already deallocated"); | ||||
return Ok(()); | ||||
}; | ||||
|
||||
// Kill the cleanup helper since we're cleaning up normally | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm. We could also send it |
||||
if let Some(mut cleanup_handle) = self.cleanup_handle.take() { | ||||
// Send SIGTERM to the child process and let it do the cleanup | ||||
let _ = cleanup_handle.child.kill(); | ||||
} | ||||
|
||||
Command::new("losetup").args(["-d", dev.as_str()]).run() | ||||
} | ||||
|
||||
|
@@ -240,6 +338,56 @@ impl Drop for LoopbackDevice { | |||
} | ||||
} | ||||
|
||||
/// Main function for the loopback cleanup helper process | ||||
/// This function does not return - it either exits normally or via signal | ||||
pub async fn run_loopback_cleanup_helper(device_path: &str) -> Result<()> { | ||||
// Check if we're running as a cleanup helper | ||||
if std::env::var("BOOTC_LOOPBACK_CLEANUP_HELPER").is_err() { | ||||
anyhow::bail!("This function should only be called as a cleanup helper"); | ||||
} | ||||
|
||||
// Set up death signal notification - we want to be notified when parent dies | ||||
rustix::process::set_parent_process_death_signal(Some(rustix::process::Signal::TERM)) | ||||
.context("Failed to set parent death signal")?; | ||||
|
||||
// Wait for SIGTERM (either from parent death or normal cleanup) | ||||
tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()) | ||||
.expect("Failed to create signal stream") | ||||
.recv() | ||||
.await; | ||||
|
||||
// Clean up the loopback device | ||||
let output = std::process::Command::new("losetup") | ||||
.args(["-d", device_path]) | ||||
.output(); | ||||
|
||||
match output { | ||||
Ok(output) if output.status.success() => { | ||||
// Log to systemd journal instead of stderr | ||||
tracing::info!("Cleaned up leaked loopback device {}", device_path); | ||||
std::process::exit(0); | ||||
} | ||||
Ok(output) => { | ||||
let stderr = String::from_utf8_lossy(&output.stderr); | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This child process has its stdout/stderr going to /dev/null by default so all of this is just lost unfortunately. |
||||
tracing::error!( | ||||
"Failed to clean up loopback device {}: {}. Stderr: {}", | ||||
device_path, | ||||
output.status, | ||||
stderr.trim() | ||||
); | ||||
std::process::exit(1); | ||||
} | ||||
Err(e) => { | ||||
tracing::error!( | ||||
"Error executing losetup to clean up loopback device {}: {}", | ||||
device_path, | ||||
e | ||||
); | ||||
std::process::exit(1); | ||||
} | ||||
} | ||||
} | ||||
|
||||
/// Parse key-value pairs from lsblk --pairs. | ||||
/// Newer versions of lsblk support JSON but the one in CentOS 7 doesn't. | ||||
fn split_lsblk_line(line: &str) -> HashMap<String, String> { | ||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the AI tooling doesn't know about our helper
run_get_string()
which handles the.success()
and error printing.IOW this code was probably fine as is, can we just keep it that way please?