Skip to content

Commit 087b8d9

Browse files
tomassrnkae2bclaude
authored
fix: data races in tests and NFS cache scanner (#2256)
* fix: data races in tests and NFS cache scanner - Serialize goose migrations with mutex (race on package globals) - Use RunAndReturn in envd conversion test (race on connect.Response headers) - Replace *os.File fd with path string in NFS cache scanner (race between Close/Fd) - Capture loop vars in NBD path_direct goroutine closure - Use sync.Map and mutexes in multipart upload tests - Use atomic.Bool and separate context vars in errorcollector test These races were exposed by ARM64 testing but are architecture-independent bugs. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: include full path in statInDir error message Use filepath.Join(dirPath, filename) in the error instead of bare filename, matching the sibling stat() function's error format. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: remove unnecessary loop var capture (Go 1.22+) Go 1.22+ changed loop variable semantics — each iteration gets its own copy. No need to capture deviceIndex/i before the goroutine. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: restore deviceIndex sentinel value (math.MaxUint32) The MaxUint32 initialization was intentional as a sentinel value. Reverting the unnecessary change per review feedback. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: e2b <e2b@Onsites-MacBook-Pro.local> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent d250235 commit 087b8d9

File tree

8 files changed

+68
-29
lines changed

8 files changed

+68
-29
lines changed

packages/db/pkg/testutils/db.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"os/exec"
66
"path/filepath"
77
"strings"
8+
"sync"
89
"testing"
910
"time"
1011

@@ -106,6 +107,12 @@ func SetupDatabase(t *testing.T) *Database {
106107
}
107108
}
108109

110+
// gooseMu serializes goose operations across parallel tests.
111+
// goose.OpenDBWithDriver calls goose.SetDialect which writes to package-level
112+
// globals (dialect, store) without synchronization. Concurrent test goroutines
113+
// race on these globals, triggering the race detector on ARM64.
114+
var gooseMu sync.Mutex
115+
109116
// runDatabaseMigrations executes all required database migrations
110117
func runDatabaseMigrations(t *testing.T, connStr string) {
111118
t.Helper()
@@ -115,6 +122,9 @@ func runDatabaseMigrations(t *testing.T, connStr string) {
115122
require.NoError(t, err, "Failed to find git root")
116123
repoRoot := strings.TrimSpace(string(output))
117124

125+
gooseMu.Lock()
126+
defer gooseMu.Unlock()
127+
118128
db, err := goose.OpenDBWithDriver("pgx", connStr)
119129
require.NoError(t, err)
120130
t.Cleanup(func() {

packages/envd/internal/services/legacy/conversion_test.go

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package legacy
22

33
import (
44
"bytes"
5+
"context"
56
"io"
67
"net/http"
78
"net/http/httptest"
@@ -22,12 +23,19 @@ import (
2223
func TestFilesystemClient_FieldFormatter(t *testing.T) {
2324
t.Parallel()
2425
fsh := filesystemconnectmocks.NewMockFilesystemHandler(t)
25-
fsh.EXPECT().Move(mock.Anything, mock.Anything).Return(connect.NewResponse(&filesystem.MoveResponse{
26-
Entry: &filesystem.EntryInfo{
27-
Name: "test-name",
28-
Owner: "new-extra-field",
26+
// Use RunAndReturn to create a fresh response per call. Using Return()
27+
// shares one Response across parallel subtests, causing a data race on
28+
// the lazily-initialized header/trailer maps inside connect.Response.
29+
fsh.EXPECT().Move(mock.Anything, mock.Anything).RunAndReturn(
30+
func(_ context.Context, _ *connect.Request[filesystem.MoveRequest]) (*connect.Response[filesystem.MoveResponse], error) {
31+
return connect.NewResponse(&filesystem.MoveResponse{
32+
Entry: &filesystem.EntryInfo{
33+
Name: "test-name",
34+
Owner: "new-extra-field",
35+
},
36+
}), nil
2937
},
30-
}), nil)
38+
)
3139

3240
_, handler := filesystemconnect.NewFilesystemHandler(fsh,
3341
connect.WithInterceptors(

packages/orchestrator/cmd/clean-nfs-cache/cleaner/clean.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ type Candidate struct {
9999
}
100100

101101
type statReq struct {
102-
df *os.File
102+
dirPath string
103103
name string
104104
response chan *statReq
105105
f *File

packages/orchestrator/cmd/clean-nfs-cache/cleaner/scan.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ func (c *Cleaner) Statter(ctx context.Context, done *sync.WaitGroup) {
6060
case <-ctx.Done():
6161
return
6262
case req := <-c.statRequestCh:
63-
f, err := c.statInDir(req.df, req.name)
63+
f, err := c.statInDir(req.dirPath, req.name)
6464
req.f = f
6565
req.err = err
6666
req.response <- req
@@ -201,13 +201,16 @@ func (c *Cleaner) scanDir(ctx context.Context, path []*Dir) (out *Dir, err error
201201
}
202202
}
203203

204-
// submit all stat requests
204+
// Submit stat requests using the directory path (not the *os.File).
205+
// The file descriptor df is closed when scanDir returns (defer above),
206+
// but Statter goroutines may still be processing requests concurrently.
207+
// Passing the path avoids a race between df.Close() and df.Fd().
205208
responseCh := make(chan *statReq, len(filenames))
206209
for _, name := range filenames {
207210
select {
208211
case <-ctx.Done():
209212
return nil, ctx.Err()
210-
case c.statRequestCh <- &statReq{df: df, name: name, response: responseCh}:
213+
case c.statRequestCh <- &statReq{dirPath: absPath, name: name, response: responseCh}:
211214
// submitted
212215
}
213216
}

packages/orchestrator/cmd/clean-nfs-cache/cleaner/stat_linux.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ package cleaner
44

55
import (
66
"fmt"
7-
"os"
7+
"path/filepath"
88

99
"golang.org/x/sys/unix"
1010
)
@@ -30,17 +30,18 @@ func (c *Cleaner) stat(fullPath string) (*Candidate, error) {
3030
}, nil
3131
}
3232

33-
func (c *Cleaner) statInDir(df *os.File, filename string) (*File, error) {
33+
func (c *Cleaner) statInDir(dirPath string, filename string) (*File, error) {
3434
c.StatxC.Add(1)
3535
c.StatxInDirC.Add(1)
3636
var statx unix.Statx_t
37-
err := unix.Statx(int(df.Fd()), filename,
37+
fullPath := filepath.Join(dirPath, filename)
38+
err := unix.Statx(unix.AT_FDCWD, fullPath,
3839
unix.AT_STATX_DONT_SYNC|unix.AT_SYMLINK_NOFOLLOW|unix.AT_NO_AUTOMOUNT,
3940
unix.STATX_ATIME|unix.STATX_SIZE,
4041
&statx,
4142
)
4243
if err != nil {
43-
return nil, fmt.Errorf("failed to statx %q: %w", filename, err)
44+
return nil, fmt.Errorf("failed to statx %q: %w", fullPath, err)
4445
}
4546

4647
return &File{

packages/orchestrator/cmd/clean-nfs-cache/cleaner/stat_osx.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,10 @@ func (c *Cleaner) stat(path string) (*Candidate, error) {
3131
}, nil
3232
}
3333

34-
func (c *Cleaner) statInDir(df *os.File, filename string) (*File, error) {
34+
func (c *Cleaner) statInDir(dirPath string, filename string) (*File, error) {
3535
c.StatxInDirC.Add(1)
36-
// performance on OS X doeas not matter, so just use the full stat
37-
cand, err := c.stat(filepath.Join(df.Name(), filename))
36+
// performance on OS X does not matter, so just use the full stat
37+
cand, err := c.stat(filepath.Join(dirPath, filename))
3838
if err != nil {
3939
return nil, err
4040
}

packages/shared/pkg/storage/gcp_multipart_test.go

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ func TestMultipartUploader_UploadFileInParallel_Success(t *testing.T) {
172172

173173
var uploadID string
174174
var initiateCount, uploadPartCount, completeCount int32
175-
receivedParts := make(map[int]string)
175+
receivedParts := sync.Map{}
176176

177177
handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
178178
switch {
@@ -194,7 +194,7 @@ func TestMultipartUploader_UploadFileInParallel_Success(t *testing.T) {
194194
// Upload part
195195
partNum := atomic.AddInt32(&uploadPartCount, 1)
196196
body, _ := io.ReadAll(r.Body)
197-
receivedParts[int(partNum)] = string(body)
197+
receivedParts.Store(int(partNum), string(body))
198198

199199
w.Header().Set("ETag", fmt.Sprintf(`"etag%d"`, partNum))
200200
w.WriteHeader(http.StatusOK)
@@ -217,7 +217,9 @@ func TestMultipartUploader_UploadFileInParallel_Success(t *testing.T) {
217217
// Verify all parts were uploaded and content matches
218218
var reconstructed strings.Builder
219219
for i := 1; i <= int(atomic.LoadInt32(&uploadPartCount)); i++ {
220-
reconstructed.WriteString(receivedParts[i])
220+
if part, ok := receivedParts.Load(i); ok {
221+
reconstructed.WriteString(part.(string))
222+
}
221223
}
222224
require.Equal(t, testContent, reconstructed.String())
223225
}
@@ -655,6 +657,7 @@ func TestMultipartUploader_BoundaryConditions_ExactChunkSize(t *testing.T) {
655657
require.NoError(t, err)
656658

657659
var partSizes []int
660+
var partSizesMu sync.Mutex
658661

659662
handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
660663
switch {
@@ -670,7 +673,9 @@ func TestMultipartUploader_BoundaryConditions_ExactChunkSize(t *testing.T) {
670673

671674
case strings.Contains(r.URL.RawQuery, "partNumber"):
672675
body, _ := io.ReadAll(r.Body)
676+
partSizesMu.Lock()
673677
partSizes = append(partSizes, len(body))
678+
partSizesMu.Unlock()
674679

675680
partNum := strings.Split(strings.Split(r.URL.RawQuery, "partNumber=")[1], "&")[0]
676681
w.Header().Set("ETag", fmt.Sprintf(`"boundary-etag-%s"`, partNum))
@@ -904,10 +909,13 @@ func TestRetryableClient_ActualRetryBehavior(t *testing.T) {
904909
var requestCount int32
905910
var retryDelays []time.Duration
906911
var retryTimes []time.Time
912+
var retryMu sync.Mutex
907913

908914
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
909915
count := atomic.AddInt32(&requestCount, 1)
916+
retryMu.Lock()
910917
retryTimes = append(retryTimes, time.Now())
918+
retryMu.Unlock()
911919

912920
if count < 3 {
913921
w.WriteHeader(http.StatusInternalServerError)

packages/shared/pkg/utils/errorcollector_test.go

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package utils
33
import (
44
"context"
55
"errors"
6+
"sync/atomic"
67
"testing"
78

89
"github.com/stretchr/testify/assert"
@@ -49,23 +50,31 @@ func TestErrorCollector(t *testing.T) {
4950

5051
ec := NewErrorCollector(1)
5152

52-
// Block the collector's only slot
53+
// Block the collector's only slot.
54+
// ctx1 and ctx2 must be distinct variables: the closure passed to ec.Go
55+
// captures the context variable by reference. If we reused a single "ctx"
56+
// variable, the first closure's <-ctx.Done() would race with the main
57+
// goroutine's reassignment of ctx on the second WithCancel call.
5358
started := make(chan struct{})
54-
ctx, cancel1 := context.WithCancel(t.Context())
55-
ec.Go(ctx, func() error {
59+
ctx1, cancel1 := context.WithCancel(t.Context())
60+
ec.Go(ctx1, func() error {
5661
close(started)
57-
<-ctx.Done()
62+
<-ctx1.Done()
5863

5964
return nil
6065
})
6166

6267
<-started
6368

64-
// This Go call should block on the semaphore
65-
var wasCalled bool
66-
ctx, cancel2 := context.WithCancel(t.Context())
67-
ec.Go(ctx, func() error {
68-
wasCalled = true
69+
// This Go call should block on the semaphore.
70+
// wasCalled must be atomic: the goroutine spawned by ec.Go may write it
71+
// concurrently with the main goroutine's read in assert.False below.
72+
// A plain bool causes a data race that the -race detector catches on ARM64
73+
// (weaker memory model) even though it appears safe on x86.
74+
var wasCalled atomic.Bool
75+
ctx2, cancel2 := context.WithCancel(t.Context())
76+
ec.Go(ctx2, func() error {
77+
wasCalled.Store(true)
6978

7079
return nil
7180
})
@@ -78,6 +87,6 @@ func TestErrorCollector(t *testing.T) {
7887

7988
err := ec.Wait()
8089
require.ErrorIs(t, err, context.Canceled)
81-
assert.False(t, wasCalled)
90+
assert.False(t, wasCalled.Load())
8291
})
8392
}

0 commit comments

Comments
 (0)