Skip to content

Commit 2d27ef6

Browse files
authored
LinuxPod: Wire up pid namespace sharing (#434)
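A prior change added a way for vminitd to double as a simple pause container. This change wires that up by adding a new boolean option, `shareProcessNamespace`, to the pod configuration; when it is enabled, the pod starts a pause container and every other container joins its PID namespace.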
1 parent c45fef7 commit 2d27ef6

File tree

5 files changed

+186
-7
lines changed

5 files changed

+186
-7
lines changed

Sources/Containerization/LinuxPod.swift

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,9 @@ public final class LinuxPod: Sendable {
5050
public var virtualization: Bool = false
5151
/// Optional file path to store serial boot logs.
5252
public var bootLog: BootLog?
53+
/// Whether containers in the pod should share a PID namespace.
54+
/// When enabled, all containers can see each other's processes.
55+
public var shareProcessNamespace: Bool = false
5356

5457
public init() {}
5558
}
@@ -103,6 +106,7 @@ public final class LinuxPod: Sendable {
103106
/// Mutable bookkeeping for the pod; the initializer wraps it in an `AsyncMutex`.
private struct State: Sendable {
104107
var phase: Phase
105108
var containers: [String: PodContainer]
109+
// Pause container process, set once the pause container has been started; nil otherwise.
var pauseProcess: LinuxProcess?
106110
}
107111

108112
private enum Phase: Sendable {
@@ -173,7 +177,7 @@ public final class LinuxPod: Sendable {
173177
try configuration(&config)
174178

175179
self.config = config
176-
self.state = AsyncMutex(State(phase: .initialized, containers: [:]))
180+
self.state = AsyncMutex(State(phase: .initialized, containers: [:], pauseProcess: nil))
177181
}
178182

179183
private static func createDefaultRuntimeSpec(_ containerID: String, podID: String) -> Spec {
@@ -303,9 +307,64 @@ extension LinuxPod {
303307

304308
do {
305309
let containers = state.containers
310+
let shareProcessNamespace = self.config.shareProcessNamespace
311+
let pauseProcessHolder = Mutex<LinuxProcess?>(nil)
312+
306313
try await vm.withAgent { agent in
307314
try await agent.standardSetup()
308315

316+
// Create pause container if PID namespace sharing is enabled
317+
if shareProcessNamespace {
318+
let pauseID = "pause-\(self.id)"
319+
let pauseRootfsPath = "/run/container/\(pauseID)/rootfs"
320+
321+
// Bind mount /sbin into the pause container rootfs.
322+
// This is where the guest agent lives.
323+
try await agent.mount(
324+
ContainerizationOCI.Mount(
325+
type: "",
326+
source: "/sbin",
327+
destination: "\(pauseRootfsPath)/sbin",
328+
options: ["bind"]
329+
))
330+
331+
var pauseSpec = Self.createDefaultRuntimeSpec(pauseID, podID: self.id)
332+
pauseSpec.process?.args = ["/sbin/vminitd", "pause"]
333+
pauseSpec.hostname = ""
334+
pauseSpec.mounts = LinuxContainer.defaultMounts().map {
335+
ContainerizationOCI.Mount(
336+
type: $0.type,
337+
source: $0.source,
338+
destination: $0.destination,
339+
options: $0.options
340+
)
341+
}
342+
pauseSpec.linux?.namespaces = [
343+
LinuxNamespace(type: .cgroup),
344+
LinuxNamespace(type: .ipc),
345+
LinuxNamespace(type: .mount),
346+
LinuxNamespace(type: .pid),
347+
LinuxNamespace(type: .uts),
348+
]
349+
350+
// Create LinuxProcess for pause container
351+
let process = LinuxProcess(
352+
pauseID,
353+
containerID: pauseID,
354+
spec: pauseSpec,
355+
io: LinuxProcess.Stdio(stdin: nil, stdout: nil, stderr: nil),
356+
ociRuntimePath: nil,
357+
agent: agent,
358+
vm: vm,
359+
logger: self.logger
360+
)
361+
362+
try await process.start()
363+
pauseProcessHolder.withLock { $0 = process }
364+
365+
self.logger?.debug("Pause container started", metadata: ["pid": "\(process.pid)"])
366+
}
367+
309368
// Mount all container rootfs
310369
for (_, container) in containers {
311370
guard let attachments = vm.mounts[container.id], let rootfsAttachment = attachments.first else {
@@ -353,6 +412,8 @@ extension LinuxPod {
353412
}
354413
}
355414

415+
state.pauseProcess = pauseProcessHolder.withLock { $0 }
416+
356417
// Transition all containers to created state
357418
for id in state.containers.keys {
358419
state.containers[id]?.state = .created
@@ -394,6 +455,33 @@ extension LinuxPod {
394455
let containerMounts = createdState.vm.mounts[containerID] ?? []
395456
spec.mounts = containerMounts.dropFirst().map { $0.to }
396457

458+
// Configure namespaces for the container
459+
var namespaces: [LinuxNamespace] = [
460+
LinuxNamespace(type: .cgroup),
461+
LinuxNamespace(type: .ipc),
462+
LinuxNamespace(type: .mount),
463+
LinuxNamespace(type: .uts),
464+
]
465+
466+
// Either join pause container's pid ns or create a new one
467+
if self.config.shareProcessNamespace, let pausePID = state.pauseProcess?.pid {
468+
let nsPath = "/proc/\(pausePID)/ns/pid"
469+
470+
self.logger?.debug(
471+
"Container joining pause PID namespace",
472+
metadata: [
473+
"container": "\(containerID)",
474+
"pausePID": "\(pausePID)",
475+
"nsPath": "\(nsPath)",
476+
])
477+
478+
namespaces.append(LinuxNamespace(type: .pid, path: nsPath))
479+
} else {
480+
namespaces.append(LinuxNamespace(type: .pid))
481+
}
482+
483+
spec.linux?.namespaces = namespaces
484+
397485
let stdio = IOUtil.setup(
398486
portAllocator: self.hostVsockPorts,
399487
stdin: container.config.process.stdin,

Sources/Integration/PodTests.swift

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -702,4 +702,48 @@ extension IntegrationSuite {
702702
throw error
703703
}
704704
}
705+
706+
/// Verifies that containers in a pod configured with `shareProcessNamespace`
/// can see each other's processes.
///
/// `container1` runs a long-lived `sleep 300`; `container2` lists processes and
/// greps for that command. The test passes when `container2` exits with status
/// 0 and its captured stdout contains `sleep 300`.
///
/// - Throws: `IntegrationError.assert` when the expectation is not met, or any
///   error raised while setting up or driving the pod. The pod is stopped on a
///   best-effort basis before an error is rethrown so a failed run does not
///   leave it running.
func testPodSharedPIDNamespace() async throws {
    let id = "test-pod-shared-pid-namespace"

    let bs = try await bootstrap(id)
    let pod = try LinuxPod(id, vmm: bs.vmm) { config in
        config.cpus = 4
        config.memoryInBytes = 1024.mib()
        config.bootLog = bs.bootLog
        config.shareProcessNamespace = true
    }

    // First container runs a long-running process
    try await pod.addContainer("container1", rootfs: try cloneRootfs(bs.rootfs, testID: id, containerID: "container1")) { config in
        config.process.arguments = ["/bin/sleep", "300"]
    }

    // Second container checks if it can see container1's sleep process
    let psBuffer = BufferWriter()
    try await pod.addContainer("container2", rootfs: try cloneRootfs(bs.rootfs, testID: id, containerID: "container2")) { config in
        config.process.arguments = ["/bin/sh", "-c", "ps aux | grep 'sleep 300' | grep -v grep"]
        config.process.stdout = psBuffer
    }

    // Tracks whether the normal-path stop already succeeded, so the error path
    // does not try to stop the pod a second time.
    var podStopped = false
    do {
        try await pod.create()
        try await pod.startContainer("container1")
        // Give container1's sleep a moment to start before container2 looks for it.
        try await Task.sleep(for: .milliseconds(100))

        try await pod.startContainer("container2")
        let status = try await pod.waitContainer("container2")

        try await pod.killContainer("container1", signal: SIGKILL)
        _ = try await pod.waitContainer("container1")
        try await pod.stop()
        podStopped = true

        guard status.exitCode == 0 else {
            throw IntegrationError.assert(msg: "container2 should have found the sleep process (status: \(status))")
        }

        let output = String(data: psBuffer.data, encoding: .utf8) ?? ""
        guard output.contains("sleep 300") else {
            throw IntegrationError.assert(msg: "ps output should contain 'sleep 300', got: '\(output)'")
        }
    } catch {
        // Best-effort cleanup: the original error is what the caller needs to see.
        if !podStopped {
            try? await pod.stop()
        }
        throw error
    }
}
705749
}

Sources/Integration/Suite.swift

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,7 @@ struct IntegrationSuite: AsyncParsableCommand {
315315
Test("pod container filesystem isolation", testPodContainerFilesystemIsolation),
316316
Test("pod container PID namespace isolation", testPodContainerPIDNamespaceIsolation),
317317
Test("pod container independent resource limits", testPodContainerIndependentResourceLimits),
318+
Test("pod shared PID namespace", testPodSharedPIDNamespace),
318319
]
319320

320321
let passed: Atomic<Int> = Atomic(0)

vminitd/Sources/vmexec/RunCommand.swift

Lines changed: 52 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,14 @@ struct RunCommand: ParsableCommand {
3636
LoggingSystem.bootstrap(App.standardError)
3737
let log = Logger(label: "vmexec")
3838

39-
let bundle = try ContainerizationOCI.Bundle.load(path: URL(filePath: bundlePath))
40-
let ociSpec = try bundle.loadConfig()
41-
try execInNamespace(spec: ociSpec, log: log)
39+
let spec: ContainerizationOCI.Spec
40+
do {
41+
let bundle = try ContainerizationOCI.Bundle.load(path: URL(filePath: bundlePath))
42+
spec = try bundle.loadConfig()
43+
} catch {
44+
throw App.Failure(message: "failed to load OCI bundle at \(bundlePath): \(error)")
45+
}
46+
try execInNamespace(spec: spec, log: log)
4247
} catch {
4348
App.writeError(error)
4449
throw error
@@ -146,12 +151,54 @@ struct RunCommand: ParsableCommand {
146151
try App.exec(process: process, currentEnv: process.env)
147152
}
148153

154+
private func setupNamespaces(namespaces: [ContainerizationOCI.LinuxNamespace]?) throws -> Int32 {
155+
var unshareFlags: Int32 = 0
156+
157+
// Map namespace types to their corresponding CLONE flags
158+
let nsTypeToFlag: [ContainerizationOCI.LinuxNamespaceType: Int32] = [
159+
.pid: CLONE_NEWPID,
160+
.mount: CLONE_NEWNS,
161+
.uts: CLONE_NEWUTS,
162+
.ipc: CLONE_NEWIPC,
163+
.user: CLONE_NEWUSER,
164+
.cgroup: CLONE_NEWCGROUP,
165+
]
166+
167+
guard let namespaces = namespaces else {
168+
return CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWUTS
169+
}
170+
171+
for ns in namespaces {
172+
guard let flag = nsTypeToFlag[ns.type] else {
173+
continue
174+
}
175+
176+
if ns.path.isEmpty {
177+
unshareFlags |= flag
178+
} else {
179+
let fd = open(ns.path, O_RDONLY | O_CLOEXEC)
180+
guard fd >= 0 else {
181+
throw App.Errno(stage: "open(\(ns.path))")
182+
}
183+
defer { close(fd) }
184+
185+
guard setns(fd, flag) == 0 else {
186+
throw App.Errno(stage: "setns(\(ns.path))")
187+
}
188+
}
189+
}
190+
191+
return unshareFlags
192+
}
193+
149194
private func execInNamespace(spec: ContainerizationOCI.Spec, log: Logger) throws {
150195
let syncPipe = FileHandle(fileDescriptor: 3)
151196
let ackPipe = FileHandle(fileDescriptor: 4)
152197

153-
guard unshare(CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWUTS) == 0 else {
154-
throw App.Errno(stage: "unshare(pid|mnt|uts)")
198+
let unshareFlags = try setupNamespaces(namespaces: spec.linux?.namespaces)
199+
200+
guard unshare(unshareFlags) == 0 else {
201+
throw App.Errno(stage: "unshare(\(unshareFlags))")
155202
}
156203

157204
let processID = fork()

vminitd/Sources/vminitd/Server+GRPC.swift

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -428,7 +428,6 @@ extension Initd: Com_Apple_Containerization_Sandbox_V3_SandboxContextAsyncProvid
428428
"stdin": "Port: \(request.stdin)",
429429
"stdout": "Port: \(request.stdout)",
430430
"stderr": "Port: \(request.stderr)",
431-
"configuration": "\(request.configuration.count)",
432431
])
433432

434433
if !request.hasContainerID {

0 commit comments

Comments
 (0)