Skip to content

Commit 2206bd9

Browse files
committed
test/testenv: fix leaking buildx dial-stdio processes on interrupt
Existing implementation was functional only with modified buildx binary and missed few details. With this commit, we gracefully handle: - buildx binary not starting at all, e.g. caused by bad parameter. - automated connection issues without explicit confirmation. - interruption of test suite gracefully shuts down docker CLI and buildx processes. - Docker daemon restarting or crashing. Closes #974 Signed-off-by: Mateusz Gozdek <mgozdek@microsoft.com>
1 parent 9a57e74 commit 2206bd9

File tree

2 files changed

+100
-52
lines changed

2 files changed

+100
-52
lines changed

test/main_test.go

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"fmt"
77
"os"
88
"os/signal"
9+
"sync"
910
"syscall"
1011
"testing"
1112
"time"
@@ -74,8 +75,18 @@ func TestMain(m *testing.M) {
7475
ctx, done := signal.NotifyContext(baseCtx, os.Interrupt)
7576
baseCtx = ctx
7677

78+
closeTestEnv := sync.OnceFunc(func() {
79+
if err := testEnv.Close(); err != nil {
80+
fmt.Fprintln(os.Stderr, "Error closing test environment:", err)
81+
}
82+
})
83+
7784
go func() {
7885
<-ctx.Done()
86+
87+
// Cleanup test env before restoring default signal handling, so we give a chance for a proper cleanup.
88+
closeTestEnv()
89+
7990
// The context was cancelled due to interrupt
8091
// This _should_ trigger builds to cancel naturally and exit the program,
8192
// but in some cases it may not (due to timing, bugs in buildkit, uninteruptable operations, etc.).
@@ -96,11 +107,7 @@ func TestMain(m *testing.M) {
96107
cancel()
97108
}()
98109

99-
defer func() {
100-
if err := testEnv.Close(); err != nil {
101-
fmt.Fprintln(os.Stderr, "Error closing test environment:", err)
102-
}
103-
}()
110+
defer closeTestEnv()
104111

105112
if err := testEnv.Load(ctx, phonyRef, fixtures.PhonyFrontend); err != nil {
106113
panic(err)

test/testenv/buildx.go

Lines changed: 88 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
package testenv
22

33
import (
4-
"bufio"
54
"bytes"
65
"context"
76
"encoding/json"
@@ -13,6 +12,7 @@ import (
1312
"os/exec"
1413
"strings"
1514
"sync"
15+
"syscall"
1616
"testing"
1717
"time"
1818

@@ -91,12 +91,32 @@ func (b *BuildxEnv) supportsDialStdio(ctx context.Context) (bool, error) {
9191

9292
var errDialStdioNotSupported = errors.New("buildx dial-stdio not supported")
9393

94-
type connCloseWrapper struct {
94+
// cmdConn wraps a net.Conn and replaces generic pipe errors with the
95+
// actual error from the underlying command when it has exited.
96+
type cmdConn struct {
9597
net.Conn
96-
close func()
98+
close func()
99+
cmdWait <-chan error
97100
}
98101

99-
func (c *connCloseWrapper) Close() error {
102+
func (c *cmdConn) Read(b []byte) (int, error) {
103+
n, err := c.Conn.Read(b)
104+
if err != nil {
105+
// If the command has exited with an error, surface that instead
106+
// of the generic pipe closed error.
107+
select {
108+
case cmdErr := <-c.cmdWait:
109+
if cmdErr != nil {
110+
return n, fmt.Errorf("%v: %w", cmdErr, err)
111+
}
112+
default:
113+
}
114+
}
115+
116+
return n, err
117+
}
118+
119+
func (c *cmdConn) Close() error {
100120
if c.close != nil {
101121
c.close()
102122
}
@@ -126,68 +146,89 @@ func (b *BuildxEnv) dialStdio(ctx context.Context) error {
126146
// the buildx dial-stdio process from cleaning up its resources properly.
127147
cmd := exec.Command("docker", args...)
128148
cmd.Env = os.Environ()
149+
// Put the child in its own process group so it does not receive
150+
// SIGINT from the terminal on Ctrl+C. This way shutdown is always
151+
// driven by closing stdin (the Ctrl+D / EOF path), which buildx
152+
// handles cleanly. The process group is only used for signal
153+
// isolation, not for killing.
154+
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
129155

130156
dialStdioConn, clientConn := net.Pipe()
131-
cmd.Stdin = dialStdioConn
132157
cmd.Stdout = dialStdioConn
133158

134-
// Use a pipe to check when the connection is actually complete
135-
// Also write all of stderr to an error buffer so we can have more details
136-
// in the error message when the command fails.
137-
r, w := io.Pipe()
138-
errBuf := bytes.NewBuffer(nil)
139-
ww := io.MultiWriter(w, errBuf)
140-
cmd.Stderr = ww
159+
// Capture stderr so we can include it in error messages
160+
// when the command fails.
161+
var stderr bytes.Buffer
162+
cmd.Stderr = &stderr
163+
164+
// Use StdinPipe instead of setting cmd.Stdin directly.
165+
// When cmd.Stdin is set to a non-*os.File (like net.Conn),
166+
// exec creates an internal goroutine to copy data into the
167+
// process's stdin pipe, and cmd.Wait blocks until that
168+
// goroutine finishes. If the process exits immediately (e.g.
169+
// bad arguments), the goroutine is stuck reading from
170+
// dialStdioConn (nobody is writing yet), so cmd.Wait hangs.
171+
// StdinPipe avoids the internal goroutine entirely.
172+
stdinPipe, err := cmd.StdinPipe()
173+
if err != nil {
174+
_, _ = dialStdioConn.Close(), clientConn.Close()
175+
176+
return nil, fmt.Errorf("creating stdin pipe: %w", err)
177+
}
141178

142179
if err := cmd.Start(); err != nil {
143-
return nil, err
180+
_, _, _ = stdinPipe.Close(), dialStdioConn.Close(), clientConn.Close()
181+
182+
if s := strings.TrimSpace(stderr.String()); s != "" {
183+
return nil, fmt.Errorf("starting buildx dial-stdio: %s: %w", strings.TrimSpace(stderr.String()), err)
184+
}
185+
186+
return nil, fmt.Errorf("starting buildx dial-stdio: %w", err)
144187
}
145188

146-
// chWait is closed when cmd.Wait() returns, signaling the cleanup
189+
// Copy client writes to the process's stdin.
190+
// This goroutine stops when dialStdioConn is closed (read
191+
// returns error) or stdinPipe is closed (write returns error).
192+
go func() {
193+
io.Copy(stdinPipe, dialStdioConn) //nolint:errcheck
194+
stdinPipe.Close()
195+
}()
196+
197+
// cmdWait is closed when cmd.Wait() returns, signaling the cleanup
147198
// function that the process has exited.
148-
chWait := make(chan struct{})
199+
// waitErr is written before cmdWait is closed, so it is safe to
200+
// read after receiving from cmdWait.
201+
cmdWait := make(chan error, 1)
202+
cmdDone := make(chan struct{})
149203
go func() {
150204
err := cmd.Wait()
151-
close(chWait)
205+
if stderr.Len() > 0 && err != nil {
206+
err = fmt.Errorf("%v: %w", strings.TrimSpace(stderr.String()), err)
207+
}
208+
cmdWait <- err
209+
close(cmdWait)
152210
dialStdioConn.Close()
153-
// pkgerrors.Wrap will return nil if err is nil, otherwise it will give
154-
// us a wrapped error with the buffered stderr from the command.
155-
w.CloseWithError(pkgerrors.Wrapf(err, "%s", errBuf))
211+
close(cmdDone)
156212
}()
157213

158-
defer r.Close()
159-
160-
scanner := bufio.NewScanner(r)
161-
for scanner.Scan() {
162-
txt := strings.ToLower(scanner.Text())
163-
164-
if strings.HasPrefix(txt, "#1 dialing builder") && strings.HasSuffix(txt, "done") {
165-
go func() {
166-
// Continue draining stderr so the process does not get blocked
167-
_, _ = io.Copy(io.Discard, r)
168-
}()
169-
break
170-
}
171-
}
172-
if err := scanner.Err(); err != nil {
173-
return nil, err
174-
}
175-
176-
out := &connCloseWrapper{
177-
Conn: clientConn,
214+
out := &cmdConn{
215+
Conn: clientConn,
216+
cmdWait: cmdWait,
178217
close: sync.OnceFunc(func() {
179-
// Close the stdin/stdout pipe to the process.
180-
// This causes stdin EOF in buildx's dial-stdio, which triggers
181-
// closeWrite(conn) on the buildkit connection and should start
182-
// the chain reaction for docker CLI process to exit.
218+
// Close the pipe to the process. This sends EOF on stdin
219+
// (like Ctrl+D), which triggers closeWrite(conn) on the
220+
// buildkit connection and starts the chain reaction for the
221+
// docker CLI process to exit.
183222
dialStdioConn.Close()
184223

185224
select {
186-
case <-chWait:
225+
case <-cmdDone:
187226
case <-time.After(10 * time.Second):
188-
// Safety net: force kill if still running.
189-
cmd.Process.Kill() //nolint:errcheck
190-
<-chWait
227+
// Safety net: force kill the entire process group
228+
// (negative PID) since Setpgid puts the child and
229+
// any subprocesses it spawns into their own group.
230+
syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL) //nolint:errcheck
231+
<-cmdWait
191232
}
192233
}),
193234
}

0 commit comments

Comments
 (0)