Skip to content

Commit 67695e6

Browse files
committed
ephemeral: Replace systemd.volatile=overlay with fine-grained mounts
Instead of using systemd.volatile=overlay which overlaid all of / with a single tmpfs-backed overlayfs, set up /etc and /var separately:

- /etc: overlayfs with tmpfs upper (transient changes, lost on reboot)
- /var: real tmpfs with content copied from image (not overlayfs)

The key benefit is that /var is now a real tmpfs, allowing podman to use overlayfs for container storage inside /var/lib/containers. With the old approach, the nested overlayfs caused "too many levels of symbolic links" errors.

Implementation: The initramfs units are embedded in a CPIO archive that gets appended to the existing initramfs. This uses the Linux kernel's ability to concatenate multiple CPIO archives.

Services running in initramfs (before switch-root):

- bcvk-etc-overlay.service: Sets up overlay on /sysroot/etc using a bind-mounted lowerdir to avoid self-referential mount issues on older kernels. Uses index=off,metacopy=off for virtiofs compat.
- bcvk-var-ephemeral.service: Copies /sysroot/var to tmpfs and bind mounts it back.
- bcvk-copy-units.service: Copies bcvk-journal-stream.service to /run/systemd/system/ for systemd <256 compatibility. The /run tmpfs is preserved across switch-root via MS_MOVE.

For systemd 256+, the journal-stream unit is created via SMBIOS credentials (systemd.extra-unit.*). For older versions like CentOS Stream 9 (systemd 252), the unit is copied from the initramfs since credential-based unit creation isn't supported.

The execute service target is changed from default.target to multi-user.target with ConditionPathExists=!/etc/initrd-release to ensure it runs after switch-root, not in the initramfs.

This is Phase 1 of issue #22, making ephemeral VMs more bootc-like. SELinux is still disabled (selinux=0); Phase 2 will add composefs support to enable proper SELinux labeling.

Closes: #22 (Phase 1)
Assisted-by: OpenCode (Sonnet 4)
1 parent 087e8d5 commit 67695e6

File tree

10 files changed

+411
-18
lines changed

10 files changed

+411
-18
lines changed

crates/integration-tests/src/tests/run_ephemeral.rs

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,3 +299,77 @@ fn test_run_ephemeral_instancetype_invalid() -> Result<()> {
299299
Ok(())
300300
}
301301
integration_test!(test_run_ephemeral_instancetype_invalid);
302+
303+
/// Test that ephemeral VMs have the expected mount layout:
304+
/// - / is read-only virtiofs
305+
/// - /etc is overlayfs with tmpfs upper (writable)
306+
/// - /var is tmpfs (not overlayfs, so podman can use overlayfs inside)
307+
fn test_run_ephemeral_mount_layout() -> Result<()> {
308+
// Check each mount point individually using findmnt
309+
// Running all three at once with -J can hang on some configurations
310+
311+
// Check root mount
312+
let output = run_bcvk(&[
313+
"ephemeral",
314+
"run",
315+
"--rm",
316+
"--label",
317+
INTEGRATION_TEST_LABEL,
318+
"--execute",
319+
"findmnt -n -o FSTYPE,OPTIONS /",
320+
&get_test_image(),
321+
])?;
322+
output.assert_success("check root mount");
323+
let root_line = output.stdout.trim();
324+
assert!(
325+
root_line.starts_with("virtiofs"),
326+
"Root should be virtiofs, got: {}",
327+
root_line
328+
);
329+
assert!(
330+
root_line.contains("ro"),
331+
"Root should be read-only, got: {}",
332+
root_line
333+
);
334+
335+
// Check /etc mount
336+
let output = run_bcvk(&[
337+
"ephemeral",
338+
"run",
339+
"--rm",
340+
"--label",
341+
INTEGRATION_TEST_LABEL,
342+
"--execute",
343+
"findmnt -n -o FSTYPE /etc",
344+
&get_test_image(),
345+
])?;
346+
output.assert_success("check /etc mount");
347+
assert_eq!(
348+
output.stdout.trim(),
349+
"overlay",
350+
"/etc should be overlay, got: {}",
351+
output.stdout
352+
);
353+
354+
// Check /var mount - should be tmpfs, NOT overlay
355+
let output = run_bcvk(&[
356+
"ephemeral",
357+
"run",
358+
"--rm",
359+
"--label",
360+
INTEGRATION_TEST_LABEL,
361+
"--execute",
362+
"findmnt -n -o FSTYPE /var",
363+
&get_test_image(),
364+
])?;
365+
output.assert_success("check /var mount");
366+
assert_eq!(
367+
output.stdout.trim(),
368+
"tmpfs",
369+
"/var should be tmpfs (not overlay), got: {}",
370+
output.stdout
371+
);
372+
373+
Ok(())
374+
}
375+
integration_test!(test_run_ephemeral_mount_layout);

crates/kit/src/cpio.rs

Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
//! Minimal CPIO archive creation for initramfs appending
2+
//!
3+
//! This module implements the "newc" CPIO format for appending files to
4+
//! an initramfs. The Linux kernel supports concatenating multiple CPIO
5+
//! archives, so we can simply append our files to an existing initramfs.
6+
7+
use std::io::{self, BufWriter, Write};
8+
9+
/// Magic string that opens every header of the CPIO "newc" format.
const CPIO_MAGIC: &str = "070701";

/// Write the header of a single CPIO "newc" archive entry.
///
/// Emits the fixed-size ASCII header (magic plus thirteen 8-digit hex
/// fields), the NUL-terminated entry `name`, and the zero padding that brings
/// the header-plus-name length up to a multiple of four bytes. Every field
/// other than `mode`, `file_size` and the name length is written as a
/// constant (`nlink` is 1, all others are 0).
///
/// The caller is responsible for writing `file_size` bytes of data (and its
/// padding) afterwards.
///
/// # Errors
///
/// Returns [`io::ErrorKind::InvalidInput`] if the name, including its NUL
/// terminator, does not fit the 32-bit `namesize` field, and propagates any
/// error reported by `writer`.
fn write_header<W: Write>(
    writer: &mut BufWriter<W>,
    name: &str,
    mode: u32,
    file_size: u32,
) -> io::Result<()> {
    // 6 bytes of magic + 13 fields * 8 hex digits.
    const HEADER_LEN: u64 = 110;

    // `namesize` counts the terminating NUL as part of the name.
    let namesize = u32::try_from(name.len())
        .ok()
        .and_then(|len| len.checked_add(1))
        .ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::InvalidInput,
                "CPIO entry name does not fit in a 32-bit length field",
            )
        })?;

    // newc header format: all fields are 8-char hex ASCII
    write!(
        writer,
        "{CPIO_MAGIC}{ino:08x}{mode:08x}{uid:08x}{gid:08x}{nlink:08x}{mtime:08x}{filesize:08x}{devmajor:08x}{devminor:08x}{rdevmajor:08x}{rdevminor:08x}{namesize:08x}{check:08x}",
        ino = 0u32,
        uid = 0u32,
        gid = 0u32,
        nlink = 1u32,
        mtime = 0u32,
        filesize = file_size,
        devmajor = 0u32,
        devminor = 0u32,
        rdevmajor = 0u32,
        rdevminor = 0u32,
        check = 0u32,
    )?;

    // Write filename followed by its NUL terminator.
    writer.write_all(name.as_bytes())?;
    writer.write_all(b"\0")?;

    // Pad to a 4-byte boundary after header + filename. The sum is done in
    // 64 bits so it cannot overflow even for the largest accepted name.
    let header_plus_name = HEADER_LEN + u64::from(namesize);
    let padding = ((4 - header_plus_name % 4) % 4) as usize; // always 0..=3
    writer.write_all(&[0u8; 3][..padding])
}
53+
54+
/// Emit the NUL bytes needed to round `data_len` up to a multiple of four.
///
/// Writes between zero and three `\0` bytes to `writer`; nothing is written
/// when `data_len` is already a multiple of four.
fn write_data_padding<W: Write>(writer: &mut BufWriter<W>, data_len: u32) -> io::Result<()> {
    const ZEROS: [u8; 3] = [0; 3];
    let missing = match data_len % 4 {
        0 => 0,
        remainder => (4 - remainder) as usize,
    };
    writer.write_all(&ZEROS[..missing])
}
62+
63+
/// Write a directory entry to a CPIO archive
64+
fn write_directory<W: Write>(writer: &mut BufWriter<W>, path: &str) -> io::Result<()> {
65+
// Directory mode: 0755 + S_IFDIR (0o40000)
66+
let mode = 0o40755;
67+
write_header(writer, path, mode, 0)?;
68+
Ok(())
69+
}
70+
71+
/// Write a regular file entry to a CPIO archive
72+
fn write_file<W: Write>(
73+
writer: &mut BufWriter<W>,
74+
path: &str,
75+
content: &[u8],
76+
mode: u32,
77+
) -> io::Result<()> {
78+
// Add S_IFREG (0o100000) to mode
79+
let full_mode = 0o100000 | mode;
80+
// SAFETY: content length should fit within 32 bits
81+
let content_len: u32 = content.len().try_into().unwrap();
82+
write_header(writer, path, full_mode, content_len)?;
83+
writer.write_all(content)?;
84+
write_data_padding(writer, content_len)?;
85+
Ok(())
86+
}
87+
88+
/// Write the CPIO trailer (end of archive marker)
89+
fn write_trailer<W: Write>(writer: &mut BufWriter<W>) -> io::Result<()> {
90+
write_header(writer, "TRAILER!!!", 0, 0)?;
91+
Ok(())
92+
}
93+
94+
/// Create a CPIO archive with bcvk initramfs units
95+
///
96+
/// This creates a minimal CPIO archive containing:
97+
/// - The /etc overlay service unit (runs in initramfs)
98+
/// - The /var ephemeral service unit (runs in initramfs)
99+
/// - The copy-units service (copies journal-stream to /sysroot/etc for systemd <256)
100+
/// - The journal-stream service (to be copied for systemd <256 compatibility)
101+
/// - Drop-in files to pull units into appropriate targets
102+
///
103+
/// On systemd v256+, the journal-stream unit is created via SMBIOS credentials.
104+
/// On older versions, bcvk-copy-units.service copies the embedded unit to
105+
/// /sysroot/etc/systemd/system/ before switch-root.
106+
pub fn create_initramfs_units_cpio() -> Vec<u8> {
107+
let mut buf = Vec::new();
108+
let mut writer = BufWriter::new(&mut buf);
109+
110+
// Include the initramfs service units
111+
let etc_overlay_content = include_str!("units/bcvk-etc-overlay.service");
112+
let var_ephemeral_content = include_str!("units/bcvk-var-ephemeral.service");
113+
let copy_units_content = include_str!("units/bcvk-copy-units.service");
114+
115+
// Include the journal-stream service (copied to /sysroot/etc on systemd <256)
116+
let journal_stream_content = include_str!("units/bcvk-journal-stream.service");
117+
118+
// Create directory structure
119+
write_directory(&mut writer, "usr").unwrap();
120+
write_directory(&mut writer, "usr/lib").unwrap();
121+
write_directory(&mut writer, "usr/lib/systemd").unwrap();
122+
write_directory(&mut writer, "usr/lib/systemd/system").unwrap();
123+
124+
// Write the initramfs service units (mode 0644)
125+
write_file(
126+
&mut writer,
127+
"usr/lib/systemd/system/bcvk-etc-overlay.service",
128+
etc_overlay_content.as_bytes(),
129+
0o644,
130+
)
131+
.unwrap();
132+
133+
write_file(
134+
&mut writer,
135+
"usr/lib/systemd/system/bcvk-var-ephemeral.service",
136+
var_ephemeral_content.as_bytes(),
137+
0o644,
138+
)
139+
.unwrap();
140+
141+
write_file(
142+
&mut writer,
143+
"usr/lib/systemd/system/bcvk-copy-units.service",
144+
copy_units_content.as_bytes(),
145+
0o644,
146+
)
147+
.unwrap();
148+
149+
// Write the journal-stream service (will be copied to /sysroot/etc on systemd <256)
150+
write_file(
151+
&mut writer,
152+
"usr/lib/systemd/system/bcvk-journal-stream.service",
153+
journal_stream_content.as_bytes(),
154+
0o644,
155+
)
156+
.unwrap();
157+
158+
// Create drop-in directories and files to pull units into initrd-fs.target
159+
write_directory(&mut writer, "usr/lib/systemd/system/initrd-fs.target.d").unwrap();
160+
161+
let etc_dropin = "[Unit]\nWants=bcvk-etc-overlay.service\n";
162+
write_file(
163+
&mut writer,
164+
"usr/lib/systemd/system/initrd-fs.target.d/bcvk-etc-overlay.conf",
165+
etc_dropin.as_bytes(),
166+
0o644,
167+
)
168+
.unwrap();
169+
170+
let var_dropin = "[Unit]\nWants=bcvk-var-ephemeral.service\n";
171+
write_file(
172+
&mut writer,
173+
"usr/lib/systemd/system/initrd-fs.target.d/bcvk-var-ephemeral.conf",
174+
var_dropin.as_bytes(),
175+
0o644,
176+
)
177+
.unwrap();
178+
179+
let copy_dropin = "[Unit]\nWants=bcvk-copy-units.service\n";
180+
write_file(
181+
&mut writer,
182+
"usr/lib/systemd/system/initrd-fs.target.d/bcvk-copy-units.conf",
183+
copy_dropin.as_bytes(),
184+
0o644,
185+
)
186+
.unwrap();
187+
188+
// Write trailer
189+
write_trailer(&mut writer).unwrap();
190+
191+
// Flush and return the buffer
192+
writer.into_inner().unwrap();
193+
buf
194+
}
195+
196+
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks the overall shape of the generated archive: magic at the start,
    /// every expected entry name present, 4-byte alignment, trailer at the end.
    #[test]
    fn test_create_initramfs_units_cpio() {
        let cpio = create_initramfs_units_cpio();

        // Should start with CPIO magic
        assert!(cpio.starts_with(CPIO_MAGIC.as_bytes()));

        // Headers and data are each padded to 4 bytes, so the whole archive is too
        assert_eq!(cpio.len() % 4, 0, "archive length should be 4-byte aligned");

        let cpio_str = std::str::from_utf8(&cpio).unwrap();

        // Should contain the embedded service units and the drop-in directory
        for expected in [
            "bcvk-etc-overlay.service",
            "bcvk-var-ephemeral.service",
            "bcvk-copy-units.service",
            "bcvk-journal-stream.service",
            "initrd-fs.target.d",
        ] {
            assert!(cpio_str.contains(expected), "archive is missing {expected}");
        }

        // Should end with the trailer entry: its name, the NUL terminator and
        // three bytes of padding (110-byte header + 11-byte name = 121 -> 124).
        assert!(
            cpio.ends_with(b"TRAILER!!!\0\0\0\0"),
            "archive should end with the TRAILER!!! entry"
        );
    }
}

crates/kit/src/credentials.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,10 @@ pub fn storage_opts_tmpfiles_d_lines() -> String {
173173
"f /etc/systemd/system.conf.d/90-bcvk-storage.conf 0644 root root - [Manager]\\nDefaultEnvironment=STORAGE_OPTS=additionalimagestore=/run/host-container-storage\n"
174174
).to_string()
175175
}
176+
// Note: The /etc overlay and /var ephemeral units are now embedded directly in the
177+
// initramfs CPIO archive (see cpio.rs) rather than being injected via SMBIOS credentials.
178+
// This ensures they work on systemd <256 where credential import happens too late for
179+
// generators to process the credentials.
176180

177181
/// Generate SMBIOS credential string for root SSH access
178182
///

crates/kit/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
//! bcvk library - exposes internal modules for testing
22
3+
pub mod cpio;
34
pub mod qemu_img;
45
pub mod xml_utils;

crates/kit/src/main.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ mod cache_metadata;
1010
mod cli_json;
1111
mod common_opts;
1212
mod container_entrypoint;
13+
mod cpio;
1314
mod credentials;
1415
mod domain_list;
1516
mod ephemeral;

0 commit comments

Comments
 (0)