diff --git a/crates/integration-tests/src/tests/run_ephemeral.rs b/crates/integration-tests/src/tests/run_ephemeral.rs index f3c5b5b..5ff502a 100644 --- a/crates/integration-tests/src/tests/run_ephemeral.rs +++ b/crates/integration-tests/src/tests/run_ephemeral.rs @@ -299,3 +299,77 @@ fn test_run_ephemeral_instancetype_invalid() -> Result<()> { Ok(()) } integration_test!(test_run_ephemeral_instancetype_invalid); + +/// Test that ephemeral VMs have the expected mount layout: +/// - / is read-only virtiofs +/// - /etc is overlayfs with tmpfs upper (writable) +/// - /var is tmpfs (not overlayfs, so podman can use overlayfs inside) +fn test_run_ephemeral_mount_layout() -> Result<()> { + // Check each mount point individually using findmnt + // Running all three at once with -J can hang on some configurations + + // Check root mount + let output = run_bcvk(&[ + "ephemeral", + "run", + "--rm", + "--label", + INTEGRATION_TEST_LABEL, + "--execute", + "findmnt -n -o FSTYPE,OPTIONS /", + &get_test_image(), + ])?; + output.assert_success("check root mount"); + let root_line = output.stdout.trim(); + assert!( + root_line.starts_with("virtiofs"), + "Root should be virtiofs, got: {}", + root_line + ); + assert!( + root_line.contains("ro"), + "Root should be read-only, got: {}", + root_line + ); + + // Check /etc mount + let output = run_bcvk(&[ + "ephemeral", + "run", + "--rm", + "--label", + INTEGRATION_TEST_LABEL, + "--execute", + "findmnt -n -o FSTYPE /etc", + &get_test_image(), + ])?; + output.assert_success("check /etc mount"); + assert_eq!( + output.stdout.trim(), + "overlay", + "/etc should be overlay, got: {}", + output.stdout + ); + + // Check /var mount - should be tmpfs, NOT overlay + let output = run_bcvk(&[ + "ephemeral", + "run", + "--rm", + "--label", + INTEGRATION_TEST_LABEL, + "--execute", + "findmnt -n -o FSTYPE /var", + &get_test_image(), + ])?; + output.assert_success("check /var mount"); + assert_eq!( + output.stdout.trim(), + "tmpfs", + "/var should be 
/// CPIO "newc" format magic number
const CPIO_MAGIC: &str = "070701";

/// Size of the fixed newc header: the 6-byte magic plus 13 fields of 8 hex digits.
const CPIO_HEADER_LEN: u64 = 110;

/// Header+name and file data are each NUL-padded to this alignment.
const CPIO_ALIGN: u64 = 4;

/// Convert a length into the 32-bit value a newc header field can carry.
///
/// # Errors
/// Returns `InvalidInput` when `len` does not fit in 32 bits.
fn newc_len(len: usize, what: &str) -> io::Result<u32> {
    u32::try_from(len).map_err(|_| {
        io::Error::new(
            io::ErrorKind::InvalidInput,
            format!("{what} is too large for a newc CPIO entry ({len} bytes)"),
        )
    })
}

/// Write the NUL bytes needed to bring `len` up to the next 4-byte boundary.
fn write_alignment(writer: &mut BufWriter<impl Write>, len: u64) -> io::Result<()> {
    const ZEROS: [u8; 3] = [0; 3];
    // The remainder is always < 4, so the narrowing cast cannot truncate.
    let remainder = (len % CPIO_ALIGN) as usize;
    let padding = (CPIO_ALIGN as usize - remainder) % CPIO_ALIGN as usize;
    writer.write_all(&ZEROS[..padding])
}

/// Write a CPIO archive entry header
///
/// Emits the newc header for `name`, the NUL-terminated name itself, and the
/// padding that aligns whatever follows to 4 bytes. `mode` must already
/// include the file-type bits; `file_size` is the length of the data the
/// caller writes afterwards.
///
/// # Errors
/// Returns `InvalidInput` if `name` contains a NUL byte or is too long for the
/// 32-bit `namesize` field, and propagates any error from `writer`.
fn write_header(
    writer: &mut BufWriter<impl Write>,
    name: &str,
    mode: u32,
    file_size: u32,
) -> io::Result<()> {
    // The name is stored NUL-terminated; an interior NUL would truncate it
    // for the reader and desynchronise the rest of the archive.
    if name.contains('\0') {
        return Err(io::Error::new(
            io::ErrorKind::InvalidInput,
            "CPIO entry name must not contain NUL bytes",
        ));
    }
    // namesize counts the terminating NUL.
    let namesize = newc_len(name.len() + 1, "entry name")?;

    // newc header format: all fields are 8-char hex ASCII
    write!(
        writer,
        "{CPIO_MAGIC}{ino:08x}{mode:08x}{uid:08x}{gid:08x}{nlink:08x}{mtime:08x}{filesize:08x}{devmajor:08x}{devminor:08x}{rdevmajor:08x}{rdevminor:08x}{namesize:08x}{check:08x}",
        ino = 0u32,
        uid = 0u32,
        gid = 0u32,
        nlink = 1u32,
        mtime = 0u32,
        filesize = file_size,
        devmajor = 0u32,
        devminor = 0u32,
        rdevmajor = 0u32,
        rdevminor = 0u32,
        check = 0u32,
    )?;

    // Write filename (with NUL terminator)
    writer.write_all(name.as_bytes())?;
    writer.write_all(b"\0")?;

    // Pad to 4-byte boundary after header + filename
    write_alignment(writer, CPIO_HEADER_LEN + u64::from(namesize))
}

/// Pad output to 4-byte boundary after `data_len` bytes of file data
fn write_data_padding(writer: &mut BufWriter<impl Write>, data_len: u32) -> io::Result<()> {
    write_alignment(writer, u64::from(data_len))
}

/// Write a directory entry to a CPIO archive
fn write_directory(writer: &mut BufWriter<impl Write>, path: &str) -> io::Result<()> {
    // Directory mode: 0755 + S_IFDIR (0o40000)
    let mode = 0o40755;
    write_header(writer, path, mode, 0)
}

/// Write a regular file entry to a CPIO archive
///
/// `mode` carries only the permission bits; the regular-file type bit is
/// added here.
///
/// # Errors
/// Returns `InvalidInput` if `content` is too large for the 32-bit `filesize`
/// field, plus any error reported by [`write_header`] or `writer`.
fn write_file(
    writer: &mut BufWriter<impl Write>,
    path: &str,
    content: &[u8],
    mode: u32,
) -> io::Result<()> {
    // Add S_IFREG (0o100000) to mode
    let full_mode = 0o100000 | mode;
    let content_len = newc_len(content.len(), "file content")?;
    write_header(writer, path, full_mode, content_len)?;
    writer.write_all(content)?;
    write_data_padding(writer, content_len)
}

/// Write the CPIO trailer (end of archive marker)
fn write_trailer(writer: &mut BufWriter<impl Write>) -> io::Result<()> {
    write_header(writer, "TRAILER!!!", 0, 0)
}
+pub fn create_initramfs_units_cpio() -> Vec { + let mut buf = Vec::new(); + let mut writer = BufWriter::new(&mut buf); + + // Include the initramfs service units + let etc_overlay_content = include_str!("units/bcvk-etc-overlay.service"); + let var_ephemeral_content = include_str!("units/bcvk-var-ephemeral.service"); + let copy_units_content = include_str!("units/bcvk-copy-units.service"); + + // Include the journal-stream service (copied to /sysroot/etc on systemd <256) + let journal_stream_content = include_str!("units/bcvk-journal-stream.service"); + + // Create directory structure + write_directory(&mut writer, "usr").unwrap(); + write_directory(&mut writer, "usr/lib").unwrap(); + write_directory(&mut writer, "usr/lib/systemd").unwrap(); + write_directory(&mut writer, "usr/lib/systemd/system").unwrap(); + + // Write the initramfs service units (mode 0644) + write_file( + &mut writer, + "usr/lib/systemd/system/bcvk-etc-overlay.service", + etc_overlay_content.as_bytes(), + 0o644, + ) + .unwrap(); + + write_file( + &mut writer, + "usr/lib/systemd/system/bcvk-var-ephemeral.service", + var_ephemeral_content.as_bytes(), + 0o644, + ) + .unwrap(); + + write_file( + &mut writer, + "usr/lib/systemd/system/bcvk-copy-units.service", + copy_units_content.as_bytes(), + 0o644, + ) + .unwrap(); + + // Write the journal-stream service (will be copied to /sysroot/etc on systemd <256) + write_file( + &mut writer, + "usr/lib/systemd/system/bcvk-journal-stream.service", + journal_stream_content.as_bytes(), + 0o644, + ) + .unwrap(); + + // Create drop-in directories and files to pull units into initrd-fs.target + write_directory(&mut writer, "usr/lib/systemd/system/initrd-fs.target.d").unwrap(); + + let etc_dropin = "[Unit]\nWants=bcvk-etc-overlay.service\n"; + write_file( + &mut writer, + "usr/lib/systemd/system/initrd-fs.target.d/bcvk-etc-overlay.conf", + etc_dropin.as_bytes(), + 0o644, + ) + .unwrap(); + + let var_dropin = "[Unit]\nWants=bcvk-var-ephemeral.service\n"; + 
write_file( + &mut writer, + "usr/lib/systemd/system/initrd-fs.target.d/bcvk-var-ephemeral.conf", + var_dropin.as_bytes(), + 0o644, + ) + .unwrap(); + + let copy_dropin = "[Unit]\nWants=bcvk-copy-units.service\n"; + write_file( + &mut writer, + "usr/lib/systemd/system/initrd-fs.target.d/bcvk-copy-units.conf", + copy_dropin.as_bytes(), + 0o644, + ) + .unwrap(); + + // Write trailer + write_trailer(&mut writer).unwrap(); + + // Flush and return the buffer + writer.into_inner().unwrap(); + buf +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_create_initramfs_units_cpio() { + let cpio = create_initramfs_units_cpio(); + + // Should start with CPIO magic + assert!(cpio.starts_with(CPIO_MAGIC.as_bytes())); + + let cpio_str = std::str::from_utf8(&cpio).unwrap(); + + // Should contain the embedded service units + assert!(cpio_str.contains("bcvk-etc-overlay.service")); + assert!(cpio_str.contains("bcvk-var-ephemeral.service")); + assert!(cpio_str.contains("bcvk-copy-units.service")); + assert!(cpio_str.contains("bcvk-journal-stream.service")); + + // Should contain the drop-in configs + assert!(cpio_str.contains("initrd-fs.target.d")); + + // Should end with TRAILER!!! + assert!(cpio_str.contains("TRAILER!!!")); + } +} diff --git a/crates/kit/src/credentials.rs b/crates/kit/src/credentials.rs index c66f86b..7318f89 100644 --- a/crates/kit/src/credentials.rs +++ b/crates/kit/src/credentials.rs @@ -173,6 +173,10 @@ pub fn storage_opts_tmpfiles_d_lines() -> String { "f /etc/systemd/system.conf.d/90-bcvk-storage.conf 0644 root root - [Manager]\\nDefaultEnvironment=STORAGE_OPTS=additionalimagestore=/run/host-container-storage\n" ).to_string() } +// Note: The /etc overlay and /var ephemeral units are now embedded directly in the +// initramfs CPIO archive (see cpio.rs) rather than being injected via SMBIOS credentials. +// This ensures they work on systemd <256 where credential import happens too late for +// generators to process the credentials. 
/// Generate SMBIOS credential string for root SSH access /// diff --git a/crates/kit/src/lib.rs b/crates/kit/src/lib.rs index 310efb3..35860d5 100644 --- a/crates/kit/src/lib.rs +++ b/crates/kit/src/lib.rs @@ -1,4 +1,5 @@ //! bcvk library - exposes internal modules for testing +pub mod cpio; pub mod qemu_img; pub mod xml_utils; diff --git a/crates/kit/src/main.rs b/crates/kit/src/main.rs index 772e5fc..8d5326f 100644 --- a/crates/kit/src/main.rs +++ b/crates/kit/src/main.rs @@ -10,6 +10,7 @@ mod cache_metadata; mod cli_json; mod common_opts; mod container_entrypoint; +mod cpio; mod credentials; mod domain_list; mod ephemeral; diff --git a/crates/kit/src/run_ephemeral.rs b/crates/kit/src/run_ephemeral.rs index 3398e1f..8d4bd2b 100644 --- a/crates/kit/src/run_ephemeral.rs +++ b/crates/kit/src/run_ephemeral.rs @@ -944,28 +944,41 @@ pub(crate) async fn run_impl(opts: RunEphemeralOpts) -> Result<()> { } else { let vmlinuz_path = vmlinuz_path .ok_or_else(|| eyre!("No kernel found in /run/source-image/usr/lib/modules"))?; - let initramfs_path = initramfs_path + let source_initramfs_path = initramfs_path .ok_or_else(|| eyre!("No initramfs found in /run/source-image/usr/lib/modules"))?; fs::File::create(&kernel_mount)?; - fs::File::create(&initramfs_mount)?; - // Bind mount kernel and initramfs + // Bind mount kernel (read-only is fine) Command::new("mount") .args(["--bind", "-o", "ro", vmlinuz_path.as_str(), &kernel_mount]) .run() .map_err(|e| eyre!("Failed to bind mount kernel: {e}"))?; - Command::new("mount") - .args([ - "--bind", - "-o", - "ro", - initramfs_path.as_str(), - &initramfs_mount, - ]) - .run() - .map_err(|e| eyre!("Failed to bind mount initramfs: {e}"))?; + // Copy initramfs so we can append to it + fs::copy(&source_initramfs_path, &initramfs_mount) + .map_err(|e| eyre!("Failed to copy initramfs: {e}"))?; + } + + // Append bcvk units to initramfs + // This includes: + // - /etc overlay and /var ephemeral services (run in initramfs) + // - 
bcvk-copy-units.service (copies journal-stream to /run/systemd/system for systemd <256) + // - bcvk-journal-stream.service (embedded for systemd <256 compatibility) + { + use std::io::Write; + let cpio_data = crate::cpio::create_initramfs_units_cpio(); + let mut initramfs_file = fs::OpenOptions::new() + .append(true) + .open(&initramfs_mount) + .map_err(|e| eyre!("Failed to open initramfs for appending: {e}"))?; + initramfs_file + .write_all(&cpio_data) + .map_err(|e| eyre!("Failed to append bcvk units to initramfs: {e}"))?; + debug!( + "Appended bcvk units to initramfs ({} bytes)", + cpio_data.len() + ); } // Process host mounts and prepare virtiofsd instances for each using async manager @@ -1053,6 +1066,11 @@ pub(crate) async fn run_impl(opts: RunEphemeralOpts) -> Result<()> { ); } + // Note: /etc overlay and /var ephemeral units are now embedded directly in the + // initramfs CPIO (see cpio.rs) rather than injected via SMBIOS credentials. + // This ensures they work on systemd <256 where credential import happens too + // late for generators to process. 
+ // Handle --execute: pipes will be created when adding to qemu_config later // No need to create files anymore as we're using pipes @@ -1099,6 +1117,8 @@ WantedBy=sysinit.target Description=Execute Script Service Requires=dev-virtio\x2dports-execute.device After=dev-virtio\x2dports-execute.device +# Ensure we only run after switch-root in the real root filesystem +ConditionPathExists=!/etc/initrd-release [Service] Type=oneshot @@ -1116,6 +1136,8 @@ Description=Execute Script Service Completion After=bootc-execute.service Requires=dev-virtio\x2dports-executestatus.device After=dev-virtio\x2dports-executestatus.device +# Ensure we only run after switch-root in the real root filesystem +ConditionPathExists=!/etc/initrd-release [Service] Type=oneshot @@ -1137,12 +1159,14 @@ StandardOutput=file:/dev/virtio-ports/executestatus ); mount_unit_smbios_creds.push(finish_cred); - // Create dropin for default.target to enable execute services + // Create dropin for multi-user.target to enable execute services + // Using multi-user.target instead of default.target ensures these only run + // after switch-root in the real root filesystem (not in initramfs) let execute_dropin = "[Unit]\nWants=bootc-execute.service bootc-execute-finish.service\n"; let encoded_dropin = data_encoding::BASE64.encode(execute_dropin.as_bytes()); let dropin_cred = format!( - "io.systemd.credential.binary:systemd.unit-dropin.default.target~bcvk-execute={encoded_dropin}" + "io.systemd.credential.binary:systemd.unit-dropin.multi-user.target~bcvk-execute={encoded_dropin}" ); mount_unit_smbios_creds.push(dropin_cred); debug!("Generated SMBIOS credentials for execute units"); @@ -1188,10 +1212,10 @@ StandardOutput=file:/dev/virtio-ports/executestatus // At the core we boot from the mounted container's root, "rootfstype=virtiofs", "root=rootfs", - // But read-only, with an overlayfs in the VM backed - // by tmpfs + // But read-only. 
We set up /etc overlay and /var copyup via + // initramfs units (see cpio.rs) rather than systemd.volatile=overlay + // to have more control over individual directories. "rootflags=ro", - "systemd.volatile=overlay", // This avoids having journald interact with the rootfs // at all, which lessens the I/O traffic for virtiofs "systemd.journald.storage=volatile", diff --git a/crates/kit/src/units/bcvk-copy-units.service b/crates/kit/src/units/bcvk-copy-units.service new file mode 100644 index 0000000..2f2bfc6 --- /dev/null +++ b/crates/kit/src/units/bcvk-copy-units.service @@ -0,0 +1,16 @@ +[Unit] +Description=Copy bcvk units for post-switch-root on systemd <256 +DefaultDependencies=no +ConditionPathExists=/etc/initrd-release +Before=initrd-fs.target + +[Service] +Type=oneshot +RemainAfterExit=yes +# On systemd <256, SMBIOS credentials with systemd.extra-unit.* are not processed +# natively. We work around this by embedding the units in the initramfs and copying +# them to /run/systemd/system/ before switch-root. The /run tmpfs is preserved +# across switch-root via MS_MOVE. 
+# +# Copy journal-stream unit and enable it via a sysinit.target.wants/ symlink +ExecStart=/bin/sh -c 'mkdir -p /run/systemd/system/sysinit.target.wants && cp /usr/lib/systemd/system/bcvk-journal-stream.service /run/systemd/system/ && ln -s ../bcvk-journal-stream.service /run/systemd/system/sysinit.target.wants/' diff --git a/crates/kit/src/units/bcvk-etc-overlay.service b/crates/kit/src/units/bcvk-etc-overlay.service new file mode 100644 index 0000000..2e973c7 --- /dev/null +++ b/crates/kit/src/units/bcvk-etc-overlay.service @@ -0,0 +1,19 @@ +[Unit] +Description=Setup ephemeral /etc overlay +DefaultDependencies=no +ConditionPathExists=/etc/initrd-release +Before=initrd-fs.target +# Must run after sysroot.mount and initrd-parse-etc.service which scans /sysroot/etc/fstab +After=sysroot.mount initrd-parse-etc.service +Requires=sysroot.mount + +[Service] +Type=oneshot +RemainAfterExit=yes +TimeoutStartSec=30 +# Bind-mount /sysroot/etc to a separate location first, then use that as lowerdir. +# Using /sysroot/etc as both lowerdir and mount destination can hang on older kernels. 
+ExecStart=/usr/bin/mkdir -p /run/etc-lower /run/etc-upper /run/etc-work +ExecStart=/usr/bin/mount --bind /sysroot/etc /run/etc-lower +# Use index=off,metacopy=off to avoid extended attr operations on virtiofs +ExecStart=/usr/bin/mount -t overlay overlay -o lowerdir=/run/etc-lower,upperdir=/run/etc-upper,workdir=/run/etc-work,index=off,metacopy=off /sysroot/etc diff --git a/crates/kit/src/units/bcvk-journal-stream.service b/crates/kit/src/units/bcvk-journal-stream.service new file mode 100644 index 0000000..5acd5ce --- /dev/null +++ b/crates/kit/src/units/bcvk-journal-stream.service @@ -0,0 +1,18 @@ +[Unit] +Description=Stream systemd journal to host via virtio-serial +DefaultDependencies=no +After=systemd-journald.service dev-virtio\x2dports-org.bcvk.journal.device +Requires=systemd-journald.service dev-virtio\x2dports-org.bcvk.journal.device +# Only run after switch-root (not in initramfs) +ConditionPathExists=!/etc/initrd-release + +[Service] +Type=simple +ExecStart=/usr/bin/journalctl -f -o short-precise --no-pager +StandardOutput=file:/dev/virtio-ports/org.bcvk.journal +StandardError=file:/dev/virtio-ports/org.bcvk.journal +Restart=always +RestartSec=1s + +[Install] +WantedBy=sysinit.target diff --git a/crates/kit/src/units/bcvk-var-ephemeral.service b/crates/kit/src/units/bcvk-var-ephemeral.service new file mode 100644 index 0000000..87e22dc --- /dev/null +++ b/crates/kit/src/units/bcvk-var-ephemeral.service @@ -0,0 +1,15 @@ +[Unit] +Description=Setup ephemeral /var from image content +DefaultDependencies=no +ConditionPathExists=/etc/initrd-release +Before=initrd-fs.target +After=sysroot.mount initrd-parse-etc.service +Requires=sysroot.mount + +[Service] +Type=oneshot +RemainAfterExit=yes +TimeoutStartSec=60 +ExecStart=/usr/bin/mkdir -p /run/var-ephemeral +ExecStart=/usr/bin/cp -a /sysroot/var/. /run/var-ephemeral/ +ExecStart=/usr/bin/mount --bind /run/var-ephemeral /sysroot/var