Skip to content

Commit 061bc0a

Browse files
authored
LinuxContainer: Add support for optional writable layer (#533)
This adds support to LinuxContainer for an optional writable layer. Today the rootfs size is determined by the size of the image block itself, which is fixed when the image is unpacked. This leaves a lot to be desired: a user might want a larger or smaller rootfs, but the size cannot be configured on a per-container basis. To support this, we can pass an additional writable block device and combine it with the image contents using overlayfs in the guest. All writes go to this writable layer, and the rootfs size in the container becomes whatever size this writable layer is. The block devices (created via our ext4 package, though you could use whatever you like) are cheap enough to generate on the fly that you can easily choose whatever size you want.
1 parent 6cb52e2 commit 061bc0a

File tree

6 files changed

+472
-20
lines changed

6 files changed

+472
-20
lines changed

Sources/Containerization/ContainerManager.swift

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,12 @@
1717
#if os(macOS)
1818

1919
import ContainerizationError
20+
import ContainerizationEXT4
2021
import ContainerizationOCI
2122
import ContainerizationOS
2223
import Foundation
2324
import ContainerizationExtras
25+
import SystemPackage
2426
import Virtualization
2527
import vmnet
2628

@@ -369,11 +371,14 @@ public struct ContainerManager: Sendable {
369371
/// - id: The container ID.
370372
/// - reference: The image reference.
371373
/// - rootfsSizeInBytes: The size of the root filesystem in bytes. Defaults to 8 GiB.
374+
/// - writableLayerSizeInBytes: Optional size for a separate writable layer. When provided,
375+
/// the rootfs becomes read-only and an overlayfs is used with a separate writable layer of this size.
372376
/// - readOnly: Whether to mount the root filesystem as read-only.
373377
public mutating func create(
374378
_ id: String,
375379
reference: String,
376380
rootfsSizeInBytes: UInt64 = 8.gib(),
381+
writableLayerSizeInBytes: UInt64? = nil,
377382
readOnly: Bool = false,
378383
configuration: (inout LinuxContainer.Configuration) throws -> Void
379384
) async throws -> LinuxContainer {
@@ -382,6 +387,7 @@ public struct ContainerManager: Sendable {
382387
id,
383388
image: image,
384389
rootfsSizeInBytes: rootfsSizeInBytes,
390+
writableLayerSizeInBytes: writableLayerSizeInBytes,
385391
readOnly: readOnly,
386392
configuration: configuration
387393
)
@@ -392,11 +398,14 @@ public struct ContainerManager: Sendable {
392398
/// - id: The container ID.
393399
/// - image: The image.
394400
/// - rootfsSizeInBytes: The size of the root filesystem in bytes. Defaults to 8 GiB.
401+
/// - writableLayerSizeInBytes: Optional size for a separate writable layer. When provided,
402+
/// the rootfs becomes read-only and an overlayfs is used with a separate writable layer of this size.
395403
/// - readOnly: Whether to mount the root filesystem as read-only.
396404
public mutating func create(
397405
_ id: String,
398406
image: Image,
399407
rootfsSizeInBytes: UInt64 = 8.gib(),
408+
writableLayerSizeInBytes: UInt64? = nil,
400409
readOnly: Bool = false,
401410
configuration: (inout LinuxContainer.Configuration) throws -> Void
402411
) async throws -> LinuxContainer {
@@ -410,10 +419,21 @@ public struct ContainerManager: Sendable {
410419
if readOnly {
411420
rootfs.options.append("ro")
412421
}
422+
423+
// Create writable layer if size is specified.
424+
var writableLayer: Mount? = nil
425+
if let writableLayerSize = writableLayerSizeInBytes {
426+
writableLayer = try createEmptyFilesystem(
427+
at: path.appendingPathComponent("writable.ext4"),
428+
size: writableLayerSize
429+
)
430+
}
431+
413432
return try await create(
414433
id,
415434
image: image,
416435
rootfs: rootfs,
436+
writableLayer: writableLayer,
417437
configuration: configuration
418438
)
419439
}
@@ -423,16 +443,22 @@ public struct ContainerManager: Sendable {
423443
/// - id: The container ID.
424444
/// - image: The image.
425445
/// - rootfs: The root filesystem mount pointing to an existing block file.
446+
/// The `destination` field is ignored as mounting is handled internally.
447+
/// - writableLayer: Optional writable layer mount. When provided, an overlayfs is used with
448+
/// rootfs as the lower layer and this as the upper layer.
449+
/// The `destination` field is ignored as mounting is handled internally.
426450
public mutating func create(
427451
_ id: String,
428452
image: Image,
429453
rootfs: Mount,
454+
writableLayer: Mount? = nil,
430455
configuration: (inout LinuxContainer.Configuration) throws -> Void
431456
) async throws -> LinuxContainer {
432457
let imageConfig = try await image.config(for: .current).config
433458
return try LinuxContainer(
434459
id,
435460
rootfs: rootfs,
461+
writableLayer: writableLayer,
436462
vmm: self.vmm
437463
) { config in
438464
if let imageConfig {
@@ -490,6 +516,21 @@ public struct ContainerManager: Sendable {
490516
throw err
491517
}
492518
}
519+
520+
/// Creates a new ext4 image file at `destination` using `EXT4.Formatter` and returns a block
/// mount that points at it.
///
/// - Parameters:
///   - destination: Location of the image file to create. Nothing may already exist at this path.
///   - size: Forwarded to `EXT4.Formatter` as `minDiskSize`.
/// - Returns: A `.block` mount with format `"ext4"`, the new file as its source, `"/"` as its
///   destination, and no options.
/// - Throws: `ContainerizationError` with code `.exists` if something already exists at the
///   path, or any error thrown while creating or closing the formatter.
private func createEmptyFilesystem(at destination: URL, size: UInt64) throws -> Mount {
521+
let path = destination.absolutePath()
522+
// Refuse to proceed if something is already present at the target path.
guard !FileManager.default.fileExists(atPath: path) else {
523+
throw ContainerizationError(.exists, message: "filesystem already exists at \(path)")
524+
}
525+
// The formatter is closed immediately; no files are added to the new filesystem here.
let filesystem = try EXT4.Formatter(FilePath(path), minDiskSize: size)
526+
try filesystem.close()
527+
return .block(
528+
format: "ext4",
529+
source: path,
530+
destination: "/",
531+
options: []
532+
)
533+
}
493534
}
494535

495536
extension CIDRv4 {

Sources/Containerization/LinuxContainer.swift

Lines changed: 118 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,17 @@ public final class LinuxContainer: Container, Sendable {
3131
public let id: String
3232

3333
/// Rootfs for the container.
34+
///
35+
/// Note: The `destination` field of this mount is ignored as mounting is handled internally.
3436
public let rootfs: Mount
3537

38+
/// Optional writable layer for the container. When provided, the rootfs
39+
/// is mounted as the lower layer of an overlayfs, with this as the upper layer.
40+
/// All writes will go to this layer instead of the rootfs.
41+
///
42+
/// Note: The `destination` field of this mount is ignored as mounting is handled internally.
43+
public let writableLayer: Mount?
44+
3645
/// Configuration for the container.
3746
public let config: Configuration
3847

@@ -238,21 +247,27 @@ public final class LinuxContainer: Container, Sendable {
238247
/// - Parameters:
239248
/// - id: The identifier for the container.
240249
/// - rootfs: The root filesystem mount containing the container image contents.
250+
/// The `destination` field is ignored as mounting is handled internally.
251+
/// - writableLayer: Optional writable layer mount. When provided, an overlayfs is used with
252+
/// rootfs as the lower layer and this as the upper layer. Must be a block device.
253+
/// The `destination` field is ignored as mounting is handled internally.
241254
/// - vmm: The virtual machine manager that will handle launching the VM for the container.
242255
/// - logger: Optional logger for container operations.
243256
/// - configuration: A closure that configures the container by modifying the Configuration instance.
244257
public convenience init(
245258
_ id: String,
246259
rootfs: Mount,
260+
writableLayer: Mount? = nil,
247261
vmm: VirtualMachineManager,
248262
logger: Logger? = nil,
249263
configuration: (inout Configuration) throws -> Void
250264
) throws {
251265
var config = Configuration()
252266
try configuration(&config)
253-
self.init(
267+
try self.init(
254268
id,
255269
rootfs: rootfs,
270+
writableLayer: writableLayer,
256271
vmm: vmm,
257272
configuration: config,
258273
logger: logger
@@ -264,16 +279,29 @@ public final class LinuxContainer: Container, Sendable {
264279
/// - Parameters:
265280
/// - id: The identifier for the container.
266281
/// - rootfs: The root filesystem mount containing the container image contents.
282+
/// The `destination` field is ignored as mounting is handled internally.
283+
/// - writableLayer: Optional writable layer mount. When provided, an overlayfs is used with
284+
/// rootfs as the lower layer and this as the upper layer. Must be a block device.
285+
/// The `destination` field is ignored as mounting is handled internally.
267286
/// - vmm: The virtual machine manager that will handle launching the VM for the container.
268287
/// - configuration: The container configuration specifying process, resources, networking, and other settings.
269288
/// - logger: Optional logger for container operations.
270289
public init(
271290
_ id: String,
272291
rootfs: Mount,
292+
writableLayer: Mount? = nil,
273293
vmm: VirtualMachineManager,
274294
configuration: LinuxContainer.Configuration,
275295
logger: Logger? = nil
276-
) {
296+
) throws {
297+
if let writableLayer {
298+
guard writableLayer.isBlock else {
299+
throw ContainerizationError(
300+
.invalidArgument,
301+
message: "writableLayer must be a block device"
302+
)
303+
}
304+
}
277305
self.id = id
278306
self.vmm = vmm
279307
self.hostVsockPorts = Atomic<UInt32>(0x1000_0000)
@@ -282,6 +310,7 @@ public final class LinuxContainer: Container, Sendable {
282310
self.config = configuration
283311
self.state = AsyncMutex(.initialized)
284312
self.rootfs = rootfs
313+
self.writableLayer = writableLayer
285314
}
286315

287316
private static func createDefaultRuntimeSpec(_ id: String) -> Spec {
@@ -313,7 +342,8 @@ public final class LinuxContainer: Container, Sendable {
313342

314343
// If the rootfs was requested as read-only, set it in the OCI spec.
315344
// We let the OCI runtime remount as ro, instead of doing it originally.
316-
spec.root?.readonly = self.rootfs.options.contains("ro")
345+
// However, if we have a writable layer, the overlay allows writes so we don't mark it read-only.
346+
spec.root?.readonly = self.rootfs.options.contains("ro") && self.writableLayer == nil
317347

318348
// Resource limits.
319349
// CPU: quota/period model where period is 100ms (100,000µs) and quota is cpus * period
@@ -393,6 +423,67 @@ extension LinuxContainer {
393423
config.interfaces
394424
}
395425

426+
/// Mounts the container's root filesystem at `rootfsPath` through the guest agent.
///
/// When `writableLayer` is `nil`, the first attachment is mounted directly at `rootfsPath`.
/// Otherwise the first attachment is mounted (with `ro` added if absent) at
/// `/run/container/<id>/lower`, the second attachment is mounted at
/// `/run/container/<id>/upper`, and an overlay using those two is mounted at `rootfsPath`.
///
/// - Parameters:
///   - attachments: Attached filesystems for this container. Index 0 is treated as the rootfs
///     and, when `writableLayer` is set, index 1 is treated as the writable layer.
///   - rootfsPath: Path at which the resulting root filesystem is mounted.
///   - agent: Agent used to perform the `mount` and `mkdir` calls.
/// - Throws: `ContainerizationError` with code `.notFound` if `attachments` is empty, or has
///   fewer than two entries while `writableLayer` is set; also any error from the agent calls.
private func mountRootfs(
427+
attachments: [AttachedFilesystem],
428+
rootfsPath: String,
429+
agent: VirtualMachineAgent
430+
) async throws {
431+
guard let rootfsAttachment = attachments.first else {
432+
throw ContainerizationError(.notFound, message: "rootfs mount not found")
433+
}
434+
435+
if self.writableLayer != nil {
436+
// Set up overlayfs with image as lower layer and writable layer as upper.
437+
guard attachments.count >= 2 else {
438+
throw ContainerizationError(
439+
.notFound,
440+
message: "writable layer mount not found"
441+
)
442+
}
443+
let writableAttachment = attachments[1]
444+
445+
// The overlay's upperdir and workdir are both subdirectories of the writable layer's
// mount point (upperMountPath).
let lowerPath = "/run/container/\(self.id)/lower"
446+
let upperMountPath = "/run/container/\(self.id)/upper"
447+
let upperPath = "/run/container/\(self.id)/upper/diff"
448+
let workPath = "/run/container/\(self.id)/upper/work"
449+
450+
// Mount the image (lower layer) as read-only.
451+
var lowerMount = rootfsAttachment.to
452+
lowerMount.destination = lowerPath
453+
if !lowerMount.options.contains("ro") {
454+
lowerMount.options.append("ro")
455+
}
456+
try await agent.mount(lowerMount)
457+
458+
// Mount the writable layer.
459+
var upperMount = writableAttachment.to
460+
upperMount.destination = upperMountPath
461+
try await agent.mount(upperMount)
462+
463+
// Create the upper and work directories inside the writable layer.
464+
try await agent.mkdir(path: upperPath, all: true, perms: 0o755)
465+
try await agent.mkdir(path: workPath, all: true, perms: 0o755)
466+
467+
// Mount the overlay.
468+
let overlayMount = ContainerizationOCI.Mount(
469+
type: "overlay",
470+
source: "overlay",
471+
destination: rootfsPath,
472+
options: [
473+
"lowerdir=\(lowerPath)",
474+
"upperdir=\(upperPath)",
475+
"workdir=\(workPath)",
476+
]
477+
)
478+
try await agent.mount(overlayMount)
479+
} else {
480+
// No writable layer. Mount rootfs directly.
481+
var rootfs = rootfsAttachment.to
482+
rootfs.destination = rootfsPath
483+
try await agent.mount(rootfs)
484+
}
485+
}
486+
396487
/// Create and start the underlying container's virtual machine
397488
/// and set up the runtime environment. The container's init process
398489
/// is NOT running afterwards.
@@ -428,11 +519,17 @@ extension LinuxContainer {
428519
// This is dumb, but alas.
429520
let fileMountContextHolder = Mutex<FileMountContext>(fileMountContext)
430521

522+
// Build the list of mounts to attach to the VM.
523+
var containerMounts = [modifiedRootfs] + fileMountContext.transformedMounts
524+
if let writableLayer = self.writableLayer {
525+
containerMounts.insert(writableLayer, at: 1)
526+
}
527+
431528
let vmConfig = VMConfiguration(
432529
cpus: self.cpus,
433530
memoryInBytes: vmMemory,
434531
interfaces: self.interfaces,
435-
mountsByID: [self.id: [modifiedRootfs] + fileMountContext.transformedMounts],
532+
mountsByID: [self.id: containerMounts],
436533
bootLog: self.config.bootLog,
437534
nestedVirtualization: self.config.virtualization
438535
)
@@ -445,13 +542,11 @@ extension LinuxContainer {
445542
try await vm.withAgent { agent in
446543
try await agent.standardSetup()
447544

448-
// Mount the rootfs.
449-
guard let attachments = vm.mounts[self.id], let rootfsAttachment = attachments.first else {
545+
guard let attachments = vm.mounts[self.id] else {
450546
throw ContainerizationError(.notFound, message: "rootfs mount not found")
451547
}
452-
var rootfs = rootfsAttachment.to
453-
rootfs.destination = Self.guestRootfsPath(self.id)
454-
try await agent.mount(rootfs)
548+
let rootfsPath = Self.guestRootfsPath(self.id)
549+
try await self.mountRootfs(attachments: attachments, rootfsPath: rootfsPath, agent: agent)
455550

456551
// Mount file mount holding directories under /run.
457552
if fileMountContext.hasFileMounts {
@@ -493,10 +588,10 @@ extension LinuxContainer {
493588

494589
// Setup /etc/resolv.conf and /etc/hosts if asked for.
495590
if let dns = self.config.dns {
496-
try await agent.configureDNS(config: dns, location: rootfs.destination)
591+
try await agent.configureDNS(config: dns, location: rootfsPath)
497592
}
498593
if let hosts = self.config.hosts {
499-
try await agent.configureHosts(config: hosts, location: rootfs.destination)
594+
try await agent.configureHosts(config: hosts, location: rootfsPath)
500595
}
501596

502597
}
@@ -518,12 +613,14 @@ extension LinuxContainer {
518613
let agent = try await createdState.vm.dialAgent()
519614
do {
520615
var spec = self.generateRuntimeSpec()
521-
// We don't need the rootfs, nor do OCI runtimes want it included.
616+
// We don't need the rootfs (or writable layer), nor do OCI runtimes want it included.
522617
// Also filter out file mount holding directories. We'll mount those separately under /run.
523618
let containerMounts = createdState.vm.mounts[self.id] ?? []
524619
let holdingTags = createdState.fileMountContext.holdingDirectoryTags
620+
// Drop rootfs, and writable layer if present.
621+
let mountsToSkip = self.writableLayer != nil ? 2 : 1
525622
spec.mounts =
526-
containerMounts.dropFirst()
623+
containerMounts.dropFirst(mountsToSkip)
527624
.filter { !holdingTags.contains($0.source) }
528625
.map { $0.to }
529626
+ createdState.fileMountContext.ociBindMounts()
@@ -666,6 +763,14 @@ extension LinuxContainer {
666763
flags: 0
667764
)
668765

766+
// If we have a writable layer, we also need to unmount the lower and upper layers.
767+
if self.writableLayer != nil {
768+
let upperPath = "/run/container/\(self.id)/upper"
769+
let lowerPath = "/run/container/\(self.id)/lower"
770+
try await agent.umount(path: upperPath, flags: 0)
771+
try await agent.umount(path: lowerPath, flags: 0)
772+
}
773+
669774
try await agent.sync()
670775
}
671776
} catch {

Sources/Containerization/Mount.swift

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,4 +226,12 @@ extension Mount {
226226
fileprivate var readonly: Bool {
227227
self.options.contains("ro")
228228
}
229+
230+
/// Returns true if this mount is a virtio block device.
///
/// The result is determined solely by whether `runtimeOptions` is the `.virtioblk` case;
/// the mount's `type` string is not consulted.
231+
public var isBlock: Bool {
232+
if case .virtioblk = self.runtimeOptions {
233+
return true
234+
}
235+
return false
236+
}
229237
}

Sources/Containerization/VZVirtualMachineInstance.swift

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -430,12 +430,6 @@ extension VZVirtualMachineInstance.Configuration {
430430
}
431431
}
432432

433-
extension Mount {
434-
var isBlock: Bool {
435-
type == "ext4"
436-
}
437-
}
438-
439433
extension Kernel {
440434
func linuxCommandline(initialFilesystem: Mount) -> String {
441435
var args = self.commandLine.kernelArgs

0 commit comments

Comments
 (0)