Skip to content

Commit 933fbc1

Browse files
committed
ephemeral: Replace systemd.volatile=overlay with fine-grained mounts
Instead of using systemd.volatile=overlay, which overlaid all of / with a single tmpfs-backed overlayfs, set up /etc and /var separately:

- /etc: overlayfs with tmpfs upper (transient changes, lost on reboot)
- /var: real tmpfs with content copied from image (not overlayfs)

The key benefit is that /var is now a real tmpfs, allowing podman to use overlayfs for container storage inside /var/lib/containers. With the old approach, the nested overlayfs caused "too many levels of symbolic links" errors.

Implementation uses systemd credentials to inject units that run in the initramfs before switch-root:

- bcvk-etc-overlay.service: overlay on /sysroot/etc with index=off,metacopy=off to avoid virtiofs contention; ordered after initrd-parse-etc.service
- bcvk-var-ephemeral.service: copies /sysroot/var to tmpfs and bind mounts the copy over /sysroot/var

Both units use ConditionPathExists=/etc/initrd-release to only run in the initramfs context.

The execute service target is changed from default.target to multi-user.target with ConditionPathExists=!/etc/initrd-release to ensure it runs after switch-root, not in the initramfs.

This is Phase 1 of issue #22, making ephemeral VMs more bootc-like. SELinux is still disabled (selinux=0); Phase 2 will add composefs support to enable proper SELinux labeling.

Closes: #22 (Phase 1)
Assisted-by: OpenCode (Sonnet 4)
1 parent d2d72cb commit 933fbc1

File tree

5 files changed

+175
-5
lines changed

5 files changed

+175
-5
lines changed

crates/integration-tests/src/tests/run_ephemeral.rs

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,3 +299,77 @@ fn test_run_ephemeral_instancetype_invalid() -> Result<()> {
299299
Ok(())
300300
}
301301
integration_test!(test_run_ephemeral_instancetype_invalid);
302+
303+
/// Test that ephemeral VMs have the expected mount layout:
304+
/// - / is read-only virtiofs
305+
/// - /etc is overlayfs with tmpfs upper (writable)
306+
/// - /var is tmpfs (not overlayfs, so podman can use overlayfs inside)
307+
fn test_run_ephemeral_mount_layout() -> Result<()> {
308+
// Check each mount point individually using findmnt
309+
// Running all three at once with -J can hang on some configurations
310+
311+
// Check root mount
312+
let output = run_bcvk(&[
313+
"ephemeral",
314+
"run",
315+
"--rm",
316+
"--label",
317+
INTEGRATION_TEST_LABEL,
318+
"--execute",
319+
"findmnt -n -o FSTYPE,OPTIONS /",
320+
&get_test_image(),
321+
])?;
322+
output.assert_success("check root mount");
323+
let root_line = output.stdout.trim();
324+
assert!(
325+
root_line.starts_with("virtiofs"),
326+
"Root should be virtiofs, got: {}",
327+
root_line
328+
);
329+
assert!(
330+
root_line.contains("ro"),
331+
"Root should be read-only, got: {}",
332+
root_line
333+
);
334+
335+
// Check /etc mount
336+
let output = run_bcvk(&[
337+
"ephemeral",
338+
"run",
339+
"--rm",
340+
"--label",
341+
INTEGRATION_TEST_LABEL,
342+
"--execute",
343+
"findmnt -n -o FSTYPE /etc",
344+
&get_test_image(),
345+
])?;
346+
output.assert_success("check /etc mount");
347+
assert_eq!(
348+
output.stdout.trim(),
349+
"overlay",
350+
"/etc should be overlay, got: {}",
351+
output.stdout
352+
);
353+
354+
// Check /var mount - should be tmpfs, NOT overlay
355+
let output = run_bcvk(&[
356+
"ephemeral",
357+
"run",
358+
"--rm",
359+
"--label",
360+
INTEGRATION_TEST_LABEL,
361+
"--execute",
362+
"findmnt -n -o FSTYPE /var",
363+
&get_test_image(),
364+
])?;
365+
output.assert_success("check /var mount");
366+
assert_eq!(
367+
output.stdout.trim(),
368+
"tmpfs",
369+
"/var should be tmpfs (not overlay), got: {}",
370+
output.stdout
371+
);
372+
373+
Ok(())
374+
}
375+
integration_test!(test_run_ephemeral_mount_layout);

crates/kit/src/credentials.rs

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,60 @@ pub fn storage_opts_tmpfiles_d_lines() -> String {
174174
).to_string()
175175
}
176176

177+
/// Generate SMBIOS credentials for transient /etc overlay
178+
///
179+
/// Creates a systemd service that runs in the initramfs to set up /etc as an
180+
/// overlayfs with tmpfs upper. Changes are stored in tmpfs (lost on reboot),
181+
/// while the base /etc content comes from the container image.
182+
///
183+
/// Uses a service instead of a mount unit because systemd mount units can
184+
/// hang when mounting overlayfs on virtiofs subdirectories.
185+
pub fn smbios_creds_for_etc_overlay() -> Vec<String> {
186+
// Systemd unit that sets up /etc overlay in initramfs
187+
let unit_content = include_str!("units/bcvk-etc-overlay.service");
188+
let encoded_unit = data_encoding::BASE64.encode(unit_content.as_bytes());
189+
let unit_cred = format!(
190+
"io.systemd.credential.binary:systemd.extra-unit.bcvk-etc-overlay.service={encoded_unit}"
191+
);
192+
193+
// Create dropin for initrd-fs.target to pull in our unit
194+
let dropin_content = "[Unit]\nWants=bcvk-etc-overlay.service\n";
195+
let encoded_dropin = data_encoding::BASE64.encode(dropin_content.as_bytes());
196+
let dropin_cred = format!(
197+
"io.systemd.credential.binary:systemd.unit-dropin.initrd-fs.target~bcvk-etc-overlay={encoded_dropin}"
198+
);
199+
200+
vec![unit_cred, dropin_cred]
201+
}
202+
203+
/// Generate SMBIOS credentials for ephemeral /var with image content
204+
///
205+
/// Creates a systemd service that runs in the initramfs to:
206+
/// 1. Creates /run/var-ephemeral directory
207+
/// 2. Copies all content from /sysroot/var to /run/var-ephemeral (one-time)
208+
/// 3. Bind mounts /run/var-ephemeral over /sysroot/var
209+
///
210+
/// This runs before switch-root so /var is ready as a real tmpfs (not overlayfs).
211+
/// This allows podman to use overlayfs inside /var/lib/containers.
212+
pub fn smbios_creds_for_var_ephemeral() -> Vec<String> {
213+
// Systemd unit that sets up ephemeral /var in initramfs
214+
// Must run after sysroot is mounted but before switch-root
215+
let unit_content = include_str!("units/bcvk-var-ephemeral.service");
216+
let encoded_unit = data_encoding::BASE64.encode(unit_content.as_bytes());
217+
let unit_cred = format!(
218+
"io.systemd.credential.binary:systemd.extra-unit.bcvk-var-ephemeral.service={encoded_unit}"
219+
);
220+
221+
// Create dropin for initrd-fs.target to pull in our unit
222+
let dropin_content = "[Unit]\nWants=bcvk-var-ephemeral.service\n";
223+
let encoded_dropin = data_encoding::BASE64.encode(dropin_content.as_bytes());
224+
let dropin_cred = format!(
225+
"io.systemd.credential.binary:systemd.unit-dropin.initrd-fs.target~bcvk-var-ephemeral={encoded_dropin}"
226+
);
227+
228+
vec![unit_cred, dropin_cred]
229+
}
230+
177231
/// Generate SMBIOS credential string for root SSH access
178232
///
179233
/// Creates a systemd credential for QEMU's SMBIOS interface. Preferred method

crates/kit/src/run_ephemeral.rs

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1064,6 +1064,13 @@ pub(crate) async fn run_impl(opts: RunEphemeralOpts) -> Result<()> {
10641064
);
10651065
}
10661066

1067+
// Add /etc overlay and /var ephemeral credentials
1068+
// These replace systemd.volatile=overlay with more fine-grained control:
1069+
// - /etc: overlay with tmpfs upper (writable, changes lost on reboot)
1070+
// - /var: real tmpfs (allows podman to use overlayfs inside)
1071+
mount_unit_smbios_creds.extend(crate::credentials::smbios_creds_for_etc_overlay());
1072+
mount_unit_smbios_creds.extend(crate::credentials::smbios_creds_for_var_ephemeral());
1073+
10671074
// Handle --execute: pipes will be created when adding to qemu_config later
10681075
// No need to create files anymore as we're using pipes
10691076

@@ -1110,6 +1117,8 @@ WantedBy=sysinit.target
11101117
Description=Execute Script Service
11111118
Requires=dev-virtio\x2dports-execute.device
11121119
After=dev-virtio\x2dports-execute.device
1120+
# Ensure we only run after switch-root in the real root filesystem
1121+
ConditionPathExists=!/etc/initrd-release
11131122
11141123
[Service]
11151124
Type=oneshot
@@ -1127,6 +1136,8 @@ Description=Execute Script Service Completion
11271136
After=bootc-execute.service
11281137
Requires=dev-virtio\x2dports-executestatus.device
11291138
After=dev-virtio\x2dports-executestatus.device
1139+
# Ensure we only run after switch-root in the real root filesystem
1140+
ConditionPathExists=!/etc/initrd-release
11301141
11311142
[Service]
11321143
Type=oneshot
@@ -1148,12 +1159,14 @@ StandardOutput=file:/dev/virtio-ports/executestatus
11481159
);
11491160
mount_unit_smbios_creds.push(finish_cred);
11501161

1151-
// Create dropin for default.target to enable execute services
1162+
// Create dropin for multi-user.target to enable execute services
1163+
// Using multi-user.target instead of default.target ensures these only run
1164+
// after switch-root in the real root filesystem (not in initramfs)
11521165
let execute_dropin =
11531166
"[Unit]\nWants=bootc-execute.service bootc-execute-finish.service\n";
11541167
let encoded_dropin = data_encoding::BASE64.encode(execute_dropin.as_bytes());
11551168
let dropin_cred = format!(
1156-
"io.systemd.credential.binary:systemd.unit-dropin.default.target~bcvk-execute={encoded_dropin}"
1169+
"io.systemd.credential.binary:systemd.unit-dropin.multi-user.target~bcvk-execute={encoded_dropin}"
11571170
);
11581171
mount_unit_smbios_creds.push(dropin_cred);
11591172
debug!("Generated SMBIOS credentials for execute units");
@@ -1199,10 +1212,10 @@ StandardOutput=file:/dev/virtio-ports/executestatus
11991212
// At the core we boot from the mounted container's root,
12001213
"rootfstype=virtiofs",
12011214
"root=rootfs",
1202-
// But read-only, with an overlayfs in the VM backed
1203-
// by tmpfs
1215+
// But read-only. We set up /etc overlay and /var copyup via
1216+
// systemd credentials rather than systemd.volatile=overlay
1217+
// to have more control over individual directories.
12041218
"rootflags=ro",
1205-
"systemd.volatile=overlay",
12061219
// This avoids having journald interact with the rootfs
12071220
// at all, which lessens the I/O traffic for virtiofs
12081221
"systemd.journald.storage=volatile",
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
[Unit]
2+
Description=Setup ephemeral /etc overlay
3+
DefaultDependencies=no
4+
# Only run inside the initramfs, which is identified by /etc/initrd-release
ConditionPathExists=/etc/initrd-release
5+
# Finish before initrd-fs.target; a Wants= drop-in on that target pulls this unit in
Before=initrd-fs.target
6+
# Must run after initrd-parse-etc.service, which scans /sysroot/etc/fstab,
7+
# to avoid virtiofs contention that causes the mount to hang
8+
After=sysroot.mount initrd-parse-etc.service
9+
10+
[Service]
11+
Type=oneshot
12+
RemainAfterExit=yes
13+
# Presumably a guard so a hung overlay mount fails the unit instead of
# stalling boot indefinitely
TimeoutStartSec=30
14+
# Upper and work directories live under /run, so changes made to /etc are
# transient and lost on reboot
ExecStart=/usr/bin/mkdir -p /run/etc-upper /run/etc-work
15+
# Use index=off,metacopy=off to avoid extended attr operations on virtiofs
16+
ExecStart=/usr/bin/mount -t overlay overlay -o lowerdir=/sysroot/etc,upperdir=/run/etc-upper,workdir=/run/etc-work,index=off,metacopy=off /sysroot/etc
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
[Unit]
2+
Description=Setup ephemeral /var from image content
3+
DefaultDependencies=no
4+
# Only run inside the initramfs, which is identified by /etc/initrd-release
ConditionPathExists=/etc/initrd-release
5+
# Finish before initrd-fs.target; a Wants= drop-in on that target pulls this unit in
Before=initrd-fs.target
6+
# /sysroot must be mounted before its /var content can be copied
After=sysroot.mount
7+
8+
[Service]
9+
Type=oneshot
10+
RemainAfterExit=yes
11+
# Staging directory under /run that becomes the ephemeral /var
ExecStart=/usr/bin/mkdir -p /run/var-ephemeral
12+
# One-time copy of the image's /var content; the trailing "/." copies the
# directory's contents (including dotfiles) rather than the directory itself
ExecStart=/usr/bin/cp -a /sysroot/var/. /run/var-ephemeral/
13+
# Bind mount the copy over /sysroot/var so that after switch-root /var is a
# plain tmpfs-backed directory rather than an overlayfs
ExecStart=/usr/bin/mount --bind /run/var-ephemeral /sysroot/var

0 commit comments

Comments
 (0)