Skip to content

Commit 53021be

Browse files
authored
Cgroup2Manager: Various adjustments (#270)
Systemd likes to move the root cgroup's processes into a child cgroup it creates named /init.scope, and then writes to the root cgroup's cgroup.subtree_control file. Because of this, we can no longer simply add exec'd processes to the cgroup we created for the container, as the write fails with EBUSY. Instead, we should look up which cgroup the init process is actually in (via /proc/<pid>/cgroup) and add the new process there.
1 parent 2f32e36 commit 53021be

File tree

5 files changed

+142
-56
lines changed

5 files changed

+142
-56
lines changed

vminitd/Sources/vmexec/ExecCommand.swift

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -46,14 +46,8 @@ struct ExecCommand: ParsableCommand {
4646
try execInNamespaces(process: process, log: log)
4747
}
4848

49-
static func enterNS(path: String, nsType: Int32) throws {
50-
let fd = open(path, O_RDONLY)
51-
if fd <= 0 {
52-
throw App.Errno(stage: "open(ns)")
53-
}
54-
defer { close(fd) }
55-
56-
guard setns(fd, nsType) == 0 else {
49+
static func enterNS(pidFd: Int32, nsType: Int32) throws {
50+
guard setns(pidFd, nsType) == 0 else {
5751
throw App.Errno(stage: "setns(fd)")
5852
}
5953
}
@@ -65,10 +59,14 @@ struct ExecCommand: ParsableCommand {
6559
let syncPipe = FileHandle(fileDescriptor: 3)
6660
let ackPipe = FileHandle(fileDescriptor: 4)
6761

68-
try Self.enterNS(path: "/proc/\(self.parentPid)/ns/cgroup", nsType: CLONE_NEWCGROUP)
69-
try Self.enterNS(path: "/proc/\(self.parentPid)/ns/pid", nsType: CLONE_NEWPID)
70-
try Self.enterNS(path: "/proc/\(self.parentPid)/ns/uts", nsType: CLONE_NEWUTS)
71-
try Self.enterNS(path: "/proc/\(self.parentPid)/ns/mnt", nsType: CLONE_NEWNS)
62+
let pidFd = CZ_pidfd_open(Int32(parentPid), 0)
63+
guard pidFd > 0 else {
64+
throw App.Errno(stage: "pidfd_open(\(parentPid))")
65+
}
66+
try Self.enterNS(
67+
pidFd: pidFd,
68+
nsType: CLONE_NEWCGROUP | CLONE_NEWPID | CLONE_NEWUTS | CLONE_NEWNS
69+
)
7270

7371
let processID = fork()
7472

vminitd/Sources/vminitd/CgroupManager.swift renamed to vminitd/Sources/vminitd/Cgroup2Manager.swift

Lines changed: 79 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,17 @@
1414
// limitations under the License.
1515
//===----------------------------------------------------------------------===//
1616

17+
#if os(Linux)
18+
19+
#if canImport(Musl)
20+
import Musl
21+
#elseif canImport(Glibc)
22+
import Glibc
23+
#endif
24+
1725
import ContainerizationOS
1826
import Foundation
1927
import Logging
20-
import Musl
2128

2229
enum Cgroup2Controller: String {
2330
case pids
@@ -30,27 +37,71 @@ enum Cgroup2Controller: String {
3037

3138
// Extremely simple cgroup manager. Our needs are simple for now, and this is
3239
// reflected in the type.
33-
internal struct Cgroup2Manager {
40+
struct Cgroup2Manager: Sendable {
3441
static let defaultMountPoint = URL(filePath: "/sys/fs/cgroup")
3542

36-
static let killFile = "cgroup.kill"
37-
static let procsFile = "cgroup.procs"
38-
static let subtreeControlFile = "cgroup.subtree_control"
43+
private static let killFile = "cgroup.kill"
44+
private static let procsFile = "cgroup.procs"
45+
private static let subtreeControlFile = "cgroup.subtree_control"
46+
47+
private static let cg2Magic = 0x6367_7270
3948

4049
private let mountPoint: URL
4150
private let path: URL
4251
private let logger: Logger?
4352

4453
init(
45-
mountPoint: URL = defaultMountPoint,
46-
path: URL,
47-
perms: Int16 = 0o755,
54+
mountPoint: URL = Self.defaultMountPoint,
55+
group: URL,
4856
logger: Logger? = nil
49-
) throws {
57+
) {
5058
self.mountPoint = mountPoint
51-
self.path = mountPoint.appending(path: path.path)
59+
self.path = mountPoint.appending(path: group.path)
5260
self.logger = logger
61+
}
62+
63+
/// Returns a manager for a group that already lives on a cgroup v2 filesystem.
///
/// The group path is resolved beneath `mountPoint` and passed to `statfs`;
/// the path is rejected unless the reported filesystem type equals
/// `cg2Magic`.
///
/// - Parameters:
///   - mountPoint: Root of the cgroup hierarchy. Defaults to `defaultMountPoint`.
///   - group: Path of the group, relative to `mountPoint`.
///   - logger: Optional logger handed to the returned manager.
/// - Returns: A manager for `group` under `mountPoint`.
/// - Throws: `Error.errno` if `statfs` fails, or `Error.notCgroup` if the
///   filesystem type does not match `cg2Magic`.
static func load(
    mountPoint: URL = Self.defaultMountPoint,
    group: URL,
    logger: Logger? = nil
) throws -> Cgroup2Manager {
    let path = mountPoint.appending(path: group.path)

    var fsInfo = statfs()
    guard statfs(path.path, &fsInfo) == 0 else {
        throw Error.errno(errno: errno, message: "failed to statfs \(path.path)")
    }

    guard Int64(fsInfo.f_type) == Self.cg2Magic else {
        throw Error.notCgroup
    }

    return Cgroup2Manager(mountPoint: mountPoint, group: group, logger: logger)
}
5383

84+
/// Returns a manager for the cgroup that `pid` currently belongs to, as
/// reported by `/proc/<pid>/cgroup`.
///
/// Each line of that file has the form `hierarchy-ID:controller-list:path`;
/// a cgroup v2 membership is reported as `0::<path>`. Only the first line is
/// examined, so a process whose first entry is not hierarchy `0` is rejected.
///
/// - Parameters:
///   - pid: Process whose cgroup membership should be looked up.
///   - logger: Optional logger handed to the returned manager.
/// - Returns: A manager for the group named in the file, under the default
///   mount point.
/// - Throws: Any error from opening or reading the file, `Error.errno` if the
///   contents are empty, not UTF-8, or not in the expected three-field form,
///   and `Error.cgroup1` if the first entry is not hierarchy `0`.
static func loadFromPid(pid: Int32, logger: Logger? = nil) throws -> Cgroup2Manager {
    let procCgPath = URL(filePath: "/proc/\(pid)/cgroup")
    let fh = try FileHandle(forReadingFrom: procCgPath)
    // Release the descriptor as soon as we are done rather than waiting for
    // the handle to be deallocated.
    defer { try? fh.close() }

    guard let data = try fh.readToEnd() else {
        throw Error.errno(errno: errno, message: "failed to read \(procCgPath)")
    }
    guard let contents = String(data: data, encoding: .utf8) else {
        throw Error.errno(errno: EINVAL, message: "\(procCgPath.path) is not valid UTF-8")
    }

    // Split into at most three fields and keep empty ones, so that the empty
    // controller list in "0::/path" is preserved and a ":" inside the cgroup
    // path does not truncate it.
    let firstLine = contents.split(whereSeparator: \.isNewline).first ?? ""
    let fields = firstLine.split(separator: ":", maxSplits: 2, omittingEmptySubsequences: false)
    guard fields.count == 3 else {
        throw Error.errno(errno: EINVAL, message: "malformed entry in \(procCgPath.path)")
    }
    guard fields[0] == "0" else {
        throw Error.cgroup1
    }

    // We should really read /proc/pid/mountinfo, but for now just assume
    // the hierarchy is always mounted at /sys/fs/cgroup.
    let groupPath = fields[2].trimmingCharacters(in: .whitespacesAndNewlines)
    return Cgroup2Manager(group: URL(filePath: groupPath), logger: logger)
}
103+
104+
func create(perms: Int16 = 0o755) throws {
54105
self.logger?.info(
55106
"creating cgroup manager",
56107
metadata: [
@@ -110,6 +161,13 @@ internal struct Cgroup2Manager {
110161
}
111162

112163
func addProcess(pid: Int32) throws {
164+
self.logger?.debug(
165+
"adding new proc to cgroup",
166+
metadata: [
167+
"mountpoint": "\(self.mountPoint.path)",
168+
"path": "\(self.path.path)",
169+
])
170+
113171
let pidStr = String(pid)
114172
try Self.writeValue(
115173
path: self.path,
@@ -143,13 +201,24 @@ internal struct Cgroup2Manager {
143201

144202
extension Cgroup2Manager {
145203
enum Error: Swift.Error, CustomStringConvertible {
204+
case notCgroup
205+
case cgroup1
146206
case errno(errno: Int32, message: String)
207+
case notExist(path: String)
147208

148209
var description: String {
149210
switch self {
150211
case .errno(let errno, let message):
151212
return "failed with errno \(errno): \(message)"
213+
case .notExist(let path):
214+
return "cgroup at path \(path) does not exist"
215+
case .cgroup1:
216+
return "tried to load a cgroup v1 path"
217+
case .notCgroup:
218+
return "path is not a cgroup mountpoint"
152219
}
153220
}
154221
}
155222
}
223+
224+
#endif

vminitd/Sources/vminitd/ManagedContainer.swift

Lines changed: 33 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -39,43 +39,50 @@ actor ManagedContainer {
3939
spec: ContainerizationOCI.Spec,
4040
log: Logger
4141
) throws {
42-
let bundle = try ContainerizationOCI.Bundle.create(
43-
path: Self.craftBundlePath(id: id),
44-
spec: spec
45-
)
46-
log.info("created bundle with spec \(spec)")
47-
4842
var cgroupsPath: String
4943
if let cgPath = spec.linux?.cgroupsPath {
5044
cgroupsPath = cgPath
5145
} else {
5246
cgroupsPath = "/container/\(id)"
5347
}
5448

55-
let cgManager = try Cgroup2Manager(
56-
path: URL(filePath: cgroupsPath),
57-
logger: log
58-
)
59-
try cgManager.toggleSubtreeControllers(
60-
controllers: [.cpu, .cpuset, .hugetlb, .io, .memory, .pids],
61-
enable: true
49+
let bundle = try ContainerizationOCI.Bundle.create(
50+
path: Self.craftBundlePath(id: id),
51+
spec: spec
6252
)
53+
log.info("created bundle with spec \(spec)")
6354

64-
let initProcess = try ManagedProcess(
65-
id: id,
66-
stdio: stdio,
67-
bundle: bundle,
68-
cgroupManager: cgManager,
69-
owningPid: nil,
70-
log: log
55+
let cgManager = Cgroup2Manager(
56+
group: URL(filePath: cgroupsPath),
57+
logger: log
7158
)
72-
log.info("created managed init process")
59+
try cgManager.create()
7360

74-
self.initProcess = initProcess
75-
self.id = id
76-
self.cgroupManager = cgManager
77-
self.bundle = bundle
78-
self.log = log
61+
do {
62+
try cgManager.toggleSubtreeControllers(
63+
controllers: [.cpu, .cpuset, .hugetlb, .io, .memory, .pids],
64+
enable: true
65+
)
66+
67+
let initProcess = try ManagedProcess(
68+
id: id,
69+
stdio: stdio,
70+
bundle: bundle,
71+
cgroupManager: cgManager,
72+
owningPid: nil,
73+
log: log
74+
)
75+
log.info("created managed init process")
76+
77+
self.cgroupManager = cgManager
78+
self.initProcess = initProcess
79+
self.id = id
80+
self.bundle = bundle
81+
self.log = log
82+
} catch {
83+
try? cgManager.delete()
84+
throw error
85+
}
7986
}
8087
}
8188

@@ -104,7 +111,6 @@ extension ManagedContainer {
104111
id: id,
105112
stdio: stdio,
106113
bundle: self.bundle,
107-
cgroupManager: self.cgroupManager,
108114
owningPid: self.initProcess.pid,
109115
log: self.log
110116
)

vminitd/Sources/vminitd/ManagedProcess.swift

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ final class ManagedProcess: Sendable {
3434
private let syncPipe: FileHandle
3535
private let terminal: Bool
3636
private let bundle: ContainerizationOCI.Bundle
37-
private let cgroupManager: Cgroup2Manager
37+
private let cgroupManager: Cgroup2Manager?
3838

3939
private struct State {
4040
init(io: IO) {
@@ -75,7 +75,7 @@ final class ManagedProcess: Sendable {
7575
id: String,
7676
stdio: HostStdio,
7777
bundle: ContainerizationOCI.Bundle,
78-
cgroupManager: Cgroup2Manager,
78+
cgroupManager: Cgroup2Manager? = nil,
7979
owningPid: Int32? = nil,
8080
log: Logger
8181
) throws {
@@ -84,7 +84,6 @@ final class ManagedProcess: Sendable {
8484
Self.localizeLogger(log: &log, id: id)
8585
self.log = log
8686
self.owningPid = owningPid
87-
self.cgroupManager = cgroupManager
8887

8988
let syncPipe = Pipe()
9089
try syncPipe.setCloexec()
@@ -138,6 +137,7 @@ final class ManagedProcess: Sendable {
138137
// Setup IO early. We expect the host to be listening already.
139138
try io.start(process: &process)
140139

140+
self.cgroupManager = cgroupManager
141141
self.process = process
142142
self.terminal = stdio.terminal
143143
self.bundle = bundle
@@ -184,8 +184,17 @@ extension ManagedProcess {
184184
])
185185
$0.pid = pid
186186

187-
// First add to our cg, then ack the pid.
188-
try self.cgroupManager.addProcess(pid: pid)
187+
// Add to our cgroup. For execs (owningPid is non-nil) we'll
188+
// see where the init process is actually located now (systemd
189+
// loves to move all its processes to a child /init.scope cg).
190+
if let cgroupManager {
191+
try cgroupManager.addProcess(pid: pid)
192+
} else {
193+
if let owningPid {
194+
let cgManager = try Cgroup2Manager.loadFromPid(pid: owningPid)
195+
try cgManager.addProcess(pid: pid)
196+
}
197+
}
189198

190199
log.info(
191200
"sending pid acknowledgement",

vminitd/Sources/vminitd/Server+GRPC.swift

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -439,7 +439,7 @@ extension Initd: Com_Apple_Containerization_Sandbox_V3_SandboxContextAsyncProvid
439439
from: request.configuration
440440
)
441441

442-
try ociAlterations(ociSpec: &ociSpec)
442+
try ociAlterations(id: request.id, ociSpec: &ociSpec)
443443

444444
guard let process = ociSpec.process else {
445445
throw ContainerizationError(
@@ -940,7 +940,7 @@ extension Com_Apple_Containerization_Sandbox_V3_ConfigureHostsRequest {
940940
}
941941

942942
extension Initd {
943-
func ociAlterations(ociSpec: inout ContainerizationOCI.Spec) throws {
943+
func ociAlterations(id: String, ociSpec: inout ContainerizationOCI.Spec) throws {
944944
guard var process = ociSpec.process else {
945945
throw ContainerizationError(
946946
.invalidArgument,
@@ -954,6 +954,10 @@ extension Initd {
954954
)
955955
}
956956

957+
if ociSpec.linux!.cgroupsPath.isEmpty {
958+
ociSpec.linux!.cgroupsPath = "/container/\(id)"
959+
}
960+
957961
if process.cwd.isEmpty {
958962
process.cwd = "/"
959963
}

0 commit comments

Comments
 (0)