Skip to content

Commit 87a2167

Browse files
authored
fix: prevent darwin-vz exec stream hangs (#90)
Amp-Thread-ID: https://ampcode.com/threads/T-019cd4f1-489c-7240-b1c9-edb74b908b7d
1 parent e0ce677 commit 87a2167

File tree

2 files changed

+160
-3
lines changed

2 files changed

+160
-3
lines changed

cmd/cleanroom-darwin-vz/main.swift

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ private final class GuestChannel {
219219
let readFD: Int32
220220
let writeFD: Int32
221221
private let closeAfterUse: Bool
222+
private let lock = NSLock()
222223
private var closed = false
223224
private let onClose: () -> Void
224225

@@ -236,12 +237,46 @@ private final class GuestChannel {
236237
}
237238

238239
func close() {
240+
lock.lock()
239241
if closed {
242+
lock.unlock()
240243
return
241244
}
242245
closed = true
246+
lock.unlock()
243247
onClose()
244248
}
249+
250+
func duplicate(closeAfterUse: Bool = true) throws -> GuestChannel {
251+
let duplicatedReadFD = Darwin.dup(readFD)
252+
if duplicatedReadFD < 0 {
253+
throw HelperError.posix("dup(guest read fd)", errno)
254+
}
255+
256+
let duplicatedWriteFD: Int32
257+
if writeFD == readFD {
258+
duplicatedWriteFD = duplicatedReadFD
259+
} else {
260+
duplicatedWriteFD = Darwin.dup(writeFD)
261+
if duplicatedWriteFD < 0 {
262+
let code = errno
263+
_ = Darwin.close(duplicatedReadFD)
264+
throw HelperError.posix("dup(guest write fd)", code)
265+
}
266+
}
267+
268+
return GuestChannel(
269+
readFD: duplicatedReadFD,
270+
writeFD: duplicatedWriteFD,
271+
closeAfterUse: closeAfterUse,
272+
onClose: {
273+
if duplicatedWriteFD != duplicatedReadFD {
274+
_ = Darwin.close(duplicatedWriteFD)
275+
}
276+
_ = Darwin.close(duplicatedReadFD)
277+
}
278+
)
279+
}
245280
}
246281

247282
private final class ProxyServer {
@@ -308,7 +343,7 @@ private final class ProxyServer {
308343
activeChannel = guestChannel
309344
lock.unlock()
310345

311-
bridge(hostFD: hostFD, guestReadFD: guestChannel.readFD, guestWriteFD: guestChannel.writeFD)
346+
bridge(hostFD: hostFD, guestChannel: guestChannel)
312347
guestChannel.finishSession()
313348

314349
lock.lock()
@@ -319,7 +354,9 @@ private final class ProxyServer {
319354
return true
320355
}
321356

322-
private func bridge(hostFD: Int32, guestReadFD: Int32, guestWriteFD: Int32) {
357+
private func bridge(hostFD: Int32, guestChannel: GuestChannel) {
358+
let guestReadFD = guestChannel.readFD
359+
let guestWriteFD = guestChannel.writeFD
323360
let group = DispatchGroup()
324361
let errorLock = NSLock()
325362
var firstError: Error?
@@ -335,6 +372,11 @@ private final class ProxyServer {
335372
group.enter()
336373
DispatchQueue.global(qos: .userInitiated).async {
337374
defer { group.leave() }
375+
defer {
376+
// Closing the guest side after host EOF prevents the bridge from
377+
// stalling on long-lived transports (for example serial fallback).
378+
guestChannel.close()
379+
}
338380
do {
339381
try pumpBytes(src: hostFD, dst: guestWriteFD)
340382
} catch {
@@ -661,7 +703,7 @@ private final class VMRuntime {
661703
guard let serialChannel else {
662704
throw HelperError.vm("guest serial channel is not available")
663705
}
664-
return serialChannel
706+
return try serialChannel.duplicate()
665707
}
666708
}
667709

internal/backend/darwinvz/persistent_e2e_test.go

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,3 +178,118 @@ func TestPersistentSandboxE2E(t *testing.T) {
178178
t.Fatalf("expected run after terminate to fail with unknown sandbox, got %v", err)
179179
}
180180
}
181+
182+
func TestPersistentSandboxE2EExecStreamingDoesNotHang(t *testing.T) {
183+
if strings.TrimSpace(os.Getenv(darwinVZE2EEnvEnabled)) == "" {
184+
t.Skipf("set %s=1 to run real darwin-vz persistence e2e", darwinVZE2EEnvEnabled)
185+
}
186+
if testing.Short() {
187+
t.Skip("skipping darwin-vz e2e in short mode")
188+
}
189+
190+
helperPath, err := resolveHelperBinaryPath()
191+
if err != nil {
192+
t.Fatalf("resolve helper binary: %v", err)
193+
}
194+
hasEntitlement, err := helperHasVirtualizationEntitlement(helperPath)
195+
if err != nil {
196+
t.Fatalf("verify helper entitlement: %v", err)
197+
}
198+
if !hasEntitlement {
199+
t.Fatalf("helper %q is missing com.apple.security.virtualization entitlement", helperPath)
200+
}
201+
if _, _, err := New().getGuestAgentBinary(); err != nil {
202+
t.Fatalf("resolve guest agent binary: %v", err)
203+
}
204+
205+
rootFSOverride := strings.TrimSpace(os.Getenv(darwinVZE2EEnvRootFS))
206+
if rootFSOverride == "" {
207+
if _, err := hosttools.ResolveE2FSProgsBinary("mkfs.ext4"); err != nil {
208+
t.Fatalf("resolve mkfs.ext4: %v", err)
209+
}
210+
if _, err := hosttools.ResolveE2FSProgsBinary("debugfs"); err != nil {
211+
t.Fatalf("resolve debugfs: %v", err)
212+
}
213+
}
214+
215+
imageRef := strings.TrimSpace(os.Getenv(darwinVZE2EEnvImageRef))
216+
if imageRef == "" {
217+
imageRef = defaultDarwinVZE2EImageRef()
218+
}
219+
220+
cfg := backend.FirecrackerConfig{
221+
KernelImagePath: strings.TrimSpace(os.Getenv(darwinVZE2EEnvKernelImage)),
222+
RootFSPath: rootFSOverride,
223+
VCPUs: 1,
224+
MemoryMiB: 1024,
225+
LaunchSeconds: 90,
226+
}
227+
compiled := &policy.CompiledPolicy{
228+
Version: 1,
229+
ImageRef: imageRef,
230+
NetworkDefault: "deny",
231+
}
232+
sandboxID := fmt.Sprintf("cr-e2e-streaming-%d", time.Now().UnixNano())
233+
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
234+
defer cancel()
235+
236+
adapter := New()
237+
if err := adapter.ProvisionSandbox(ctx, backend.ProvisionRequest{
238+
SandboxID: sandboxID,
239+
Policy: compiled,
240+
FirecrackerConfig: cfg,
241+
}); err != nil {
242+
t.Fatalf("ProvisionSandbox returned error: %v", err)
243+
}
244+
245+
defer func() {
246+
if err := adapter.TerminateSandbox(context.Background(), sandboxID); err != nil {
247+
t.Fatalf("deferred TerminateSandbox returned error: %v", err)
248+
}
249+
}()
250+
251+
// A quick warm-up run that exercises the readiness/proxy path before we
252+
// assert repeated non-interactive executions complete promptly.
253+
warmupCtx, warmupCancel := context.WithTimeout(ctx, 60*time.Second)
254+
warmup, err := adapter.RunInSandbox(warmupCtx, backend.RunRequest{
255+
SandboxID: sandboxID,
256+
RunID: "run-warmup",
257+
Command: []string{"sh", "-lc", "echo warmup"},
258+
Policy: compiled,
259+
FirecrackerConfig: backend.FirecrackerConfig{
260+
LaunchSeconds: cfg.LaunchSeconds,
261+
},
262+
}, backend.OutputStream{})
263+
warmupCancel()
264+
if err != nil {
265+
t.Fatalf("warm-up RunInSandbox returned error: %v", err)
266+
}
267+
if warmup.ExitCode != 0 {
268+
t.Fatalf("expected warm-up exit code 0, got %d (stderr=%q)", warmup.ExitCode, warmup.Stderr)
269+
}
270+
271+
for i := 0; i < 8; i++ {
272+
runID := fmt.Sprintf("run-stream-%d", i)
273+
want := fmt.Sprintf("stream-%d", i)
274+
runCtx, runCancel := context.WithTimeout(ctx, 30*time.Second)
275+
res, err := adapter.RunInSandbox(runCtx, backend.RunRequest{
276+
SandboxID: sandboxID,
277+
RunID: runID,
278+
Command: []string{"sh", "-lc", "echo " + want},
279+
Policy: compiled,
280+
FirecrackerConfig: backend.FirecrackerConfig{
281+
LaunchSeconds: cfg.LaunchSeconds,
282+
},
283+
}, backend.OutputStream{})
284+
runCancel()
285+
if err != nil {
286+
t.Fatalf("RunInSandbox %q returned error: %v", runID, err)
287+
}
288+
if res.ExitCode != 0 {
289+
t.Fatalf("expected %q exit code 0, got %d (stderr=%q)", runID, res.ExitCode, res.Stderr)
290+
}
291+
if got := strings.TrimSpace(res.Stdout); got != want {
292+
t.Fatalf("unexpected stdout for %q: got %q want %q", runID, got, want)
293+
}
294+
}
295+
}

0 commit comments

Comments
 (0)