Skip to content

Commit bb039da

Browse files
committed
Fix hang in ServiceConsoleTests.serviceShutdown
This hang occurred only in CI environments and only on Linux. Here's the sequence of events:
- The test terminates swbuild using SIGKILL.
- The OS reparents SWBBuildService (a subprocess of swbuild) to launchd (Darwin) / init (others).
- The OS closes the file descriptors for the I/O pipes swbuild has connected to SWBBuildService.
- SWBBuildService's read() loop indicates EOF due to the broken pipe.
- SWBBuildService causes itself to exit.

At this point, the getpgid loop should fail with ESRCH and terminate the test. However, SWBBuildService is sticking around as a zombie for an extended period of time without init reaping the pid, so getpgid never hits the termination state. This causes the test to hang indefinitely.

To fix this, there are two aspects:
- A timeout is added around the termination monitoring loop that forces the exit promise to be fulfilled with an error if a 30-second interval elapses without the process exiting.
- We switch from using a getpgid loop to using a waitid loop, where the terminal state is that the process has _exited_... we don't care if the zombie hasn't been collected by init, only that it's not in a running state.

This fixes the hang for both the Jenkins-based CI and GitHub Actions, and also insulates us against future hangs by ensuring the test will terminate with a timeout error instead of hanging indefinitely, so that we at least know _which_ test is the problem.
1 parent f49864e commit bb039da

File tree

1 file changed

+78
-18
lines changed

1 file changed

+78
-18
lines changed

Tests/SwiftBuildTests/ConsoleCommands/ServiceConsoleTests.swift

Lines changed: 78 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,13 @@ import SWBUtil
1919
import WinSDK
2020
#endif
2121

22-
@Suite(.skipInGitHubActions("failing in the GitHub actions runner environment"), .skipHostOS(.windows))
22+
#if canImport(System)
23+
import System
24+
#else
25+
import SystemPackage
26+
#endif
27+
28+
@Suite(.skipHostOS(.windows))
2329
fileprivate struct ServiceConsoleTests {
2430
@Test
2531
func emptyInput() async throws {
@@ -92,7 +98,13 @@ fileprivate struct ServiceConsoleTests {
9298
await #expect(try cli.exitStatus.wasSignaled)
9399

94100
// Now wait for the service subprocess to exit, without any further communication.
95-
try await serviceExitPromise.value
101+
try await withTimeout(timeout: .seconds(30), description: "Service process exit promise 30-second limit") {
102+
try await withTaskCancellationHandler {
103+
try await serviceExitPromise.value
104+
} onCancel: {
105+
serviceExitPromise.fail(throwing: CancellationError())
106+
}
107+
}
96108
}
97109
}
98110

@@ -124,6 +136,31 @@ private var SYNCHRONIZE: DWORD {
124136
}
125137

126138
extension HANDLE: @retroactive @unchecked Sendable {}
139+
140+
/// Suspends the calling task until the wait registered on `handle` fires.
///
/// The wait is registered through `RegisterWaitForSingleObject` with an `INFINITE` timeout and is
/// unregistered through `UnregisterWait` when this function returns.
///
/// - Parameter handle: The handle to wait on.
/// - Throws: `Win32Error` wrapping `GetLastError()` if the wait cannot be registered.
func WaitForSingleObjectAsync(_ handle: HANDLE) async throws {
    var waitHandle: HANDLE?
    defer {
        // Tear down the registration (if one was created) once the wait has completed or failed.
        if let waitHandle {
            _ = UnregisterWait(waitHandle)
        }
    }

    try await withCheckedThrowingContinuation { (continuation: CheckedContinuation<Void, any Error>) in
        // The callback is a C function pointer and cannot capture, so the continuation travels
        // through the opaque context pointer as a +1 retained box.
        let retainedContinuation = Unmanaged.passRetained(continuation as AnyObject)
        let registered = RegisterWaitForSingleObject(
            &waitHandle,
            handle,
            { context, _ in
                // Balance the retain taken at registration and resume the waiting task.
                let continuation = Unmanaged<AnyObject>.fromOpaque(context!).takeRetainedValue() as! CheckedContinuation<Void, any Error>
                continuation.resume()
            },
            retainedContinuation.toOpaque(),
            INFINITE,
            ULONG(WT_EXECUTEONLYONCE | WT_EXECUTELONGFUNCTION)
        )
        if !registered {
            // Read the error code before doing anything else that might overwrite it.
            let error = Win32Error(GetLastError())
            // The callback will never run, so nothing else will consume the retained box;
            // release it here to avoid leaking it.
            retainedContinuation.release()
            continuation.resume(throwing: error)
        }
    }
}
127164
#endif
128165

129166
extension Processes {
@@ -134,34 +171,57 @@ extension Processes {
134171
throw Win32Error(GetLastError())
135172
}
136173
defer { CloseHandle(proc) }
137-
Thread.detachNewThread {
138-
if WaitForSingleObject(proc, INFINITE) == WAIT_FAILED {
139-
promise.fail(throwing: Win32Error(GetLastError()))
140-
return
141-
}
142-
promise.fulfill(with: ())
174+
Task<Void, Never> {
175+
await promise.fulfill(with: Result.catching { try await WaitForSingleObjectAsync(proc) })
143176
}
144177
#else
145-
Task<Void, Never>.detached {
146-
while true {
147-
// We use getpgid() here to detect when the process has exited (it is not a child). Surprisingly, getpgid() is substantially faster than using kill(pid, 0) here because kill still returns success for zombies, and the service has been reparented to launchd. // ignore-unacceptable-language; POSIX API
148-
do {
149-
if getpgid(pid) < 0 {
150-
// We expect the signal to eventually fail with "No such process".
151-
if errno != ESRCH {
152-
throw StubError.error("unexpected exit code: \(errno)")
178+
Task<Void, Never> {
179+
func wait(pid: pid_t) throws -> Bool {
180+
repeat {
181+
do {
182+
var siginfo = siginfo_t()
183+
if waitid(P_PID, id_t(pid), &siginfo, WEXITED | WNOWAIT | WNOHANG) != 0 {
184+
throw Errno(rawValue: errno)
153185
}
154-
break
186+
return siginfo.si_pid == pid
187+
} catch Errno.noChildProcess {
188+
return true
189+
} catch Errno.interrupted {
190+
// ignore
191+
}
192+
} while true
193+
}
194+
while !Task.isCancelled {
195+
do {
196+
if try wait(pid: pid) {
197+
promise.fulfill(with: ())
198+
return
155199
}
156200
try await Task.sleep(for: .microseconds(1000))
157201
} catch {
158202
promise.fail(throwing: error)
159203
return
160204
}
161205
}
162-
promise.fulfill(with: ())
206+
promise.fail(throwing: CancellationError())
163207
}
164208
#endif
165209
return promise
166210
}
167211
}
212+
213+
#if !os(Windows) && !canImport(Darwin) && !os(FreeBSD)
/// Adds an `si_pid` accessor to `siginfo_t` for the platforms selected by the guard above,
/// reading the process ID from the C-library-specific field layout.
// NOTE(review): Darwin and FreeBSD are excluded, presumably because they already expose
// `si_pid` on `siginfo_t` directly — confirm.
fileprivate extension siginfo_t {
    /// The process ID stored in this `siginfo_t`.
    var si_pid: pid_t {
        #if os(OpenBSD)
        return _data._proc._pid
        #elseif canImport(Glibc)
        return _sifields._sigchld.si_pid
        #elseif canImport(Musl)
        return __si_fields.__si_common.__first.__piduid.si_pid
        #elseif canImport(Bionic)
        return _sifields._kill._pid
        #else
        // Fail the build with an explicit message instead of a "missing return" diagnostic
        // when none of the known field layouts applies.
        #error("siginfo_t.si_pid is not implemented for this platform's C library")
        #endif
    }
}
#endif

0 commit comments

Comments
 (0)