Skip to content

Commit 88cd619

Browse files
committed
make the test deterministic by ensuring correct listener behaviour
1 parent 8eab968 commit 88cd619

File tree

1 file changed

+57
-21
lines changed

1 file changed

+57
-21
lines changed

balancer/pickfirst/metrics_test.go

Lines changed: 57 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@ package pickfirst_test
2121
import (
2222
"context"
2323
"fmt"
24+
"io"
25+
"net"
26+
"sync"
27+
"syscall"
2428
"testing"
2529
"time"
2630

@@ -298,34 +302,48 @@ func (s) TestDisconnectLabel(t *testing.T) {
298302
// Server GracefulStop sends GOAWAY with active streams = 0.
299303
// The GOAWAY frame usually carries the NO_ERROR (0) error code.
300304
t.Run("GoAway", func(t *testing.T) {
301-
runDisconnectLabelTest(t, "GOAWAY NO_ERROR", func(ss *stubserver.StubServer) {
305+
runDisconnectLabelTest(t, "GOAWAY NO_ERROR", func(ss *stubserver.StubServer, _ *controllableConn) {
302306
ss.S.GracefulStop()
303-
// GracefulStop waits for connections to close, which happens after
304-
// GOAWAY is sent.
305307
})
306308
})
307309

308-
// 2. IO Error
309-
// Server Stop closes the listener and active connections immediately.
310-
// This often results in "connection reset" or "EOF" (unknown) depending on timing/OS.
311-
// Let's check for "unknown" or "connection reset" or "subchannel shutdown" strictly.
312-
// In this test env, it often results in io.EOF which we mapped to "unknown".
313-
t.Run("IO_Error", func(t *testing.T) {
314-
runDisconnectLabelTest(t, "unknown", func(ss *stubserver.StubServer) {
315-
ss.Stop()
310+
t.Run("ConnectionReset", func(t *testing.T) {
311+
runDisconnectLabelTest(t, "connection reset", func(_ *stubserver.StubServer, cc *controllableConn) {
312+
cc.breakWith(syscall.ECONNRESET)
316313
})
317314
})
318315

319-
// Scenario 3: Unknown (Client closes - voluntary? actually client close might be UNKNOWN or not recorded as split)
320-
// If client closes, we might not record "disconnections" metric from ClientConn perspective?
321-
// disconnections metric is "Number of times the selected subchannel becomes disconnected".
322-
// If we close 'cc', we tear down subchannels.
323-
// But let's try to trigger a case where we just disconnect without server side action?
324-
// Or maybe "unknown" is what we get for "Idle" timeout?
325-
// Let's stick to IO and GoAway first which are explicit in A94.
316+
t.Run("EOF", func(t *testing.T) {
317+
runDisconnectLabelTest(t, "unknown", func(_ *stubserver.StubServer, cc *controllableConn) {
318+
cc.breakWith(io.EOF)
319+
})
320+
})
321+
}
322+
323+
// controllableConn wraps a net.Conn so that a test can decide which error the
// reading side observes once the connection has been broken.
type controllableConn struct {
	net.Conn

	// mu guards readErr.
	mu sync.Mutex
	// readErr, once non-nil, is returned by every subsequent Read in place of
	// whatever the wrapped connection reported.
	readErr error
}

// Compile-time check that the wrapper can be handed out wherever a net.Conn
// is expected (for example from a custom dialer).
var _ net.Conn = (*controllableConn)(nil)

// Read forwards to the wrapped connection and then substitutes the injected
// error, if one has been set.
//
// The underlying Read is issued first and the injected error is consulted only
// afterwards, so a Read that was already blocked when the error was stored
// still reports it once the underlying call returns. When an injected error is
// present the byte count is reported as 0, even if the underlying Read
// delivered data.
func (c *controllableConn) Read(b []byte) (int, error) {
	n, err := c.Conn.Read(b)

	c.mu.Lock()
	injected := c.readErr
	c.mu.Unlock()

	if injected != nil {
		return 0, injected
	}
	return n, err
}
327338

328-
func runDisconnectLabelTest(t *testing.T, wantLabel string, triggerFunc func(*stubserver.StubServer)) {
339+
func (c *controllableConn) breakWith(err error) {
340+
c.mu.Lock()
341+
c.readErr = err
342+
c.mu.Unlock()
343+
c.Conn.Close()
344+
}
345+
346+
func runDisconnectLabelTest(t *testing.T, wantLabel string, triggerFunc func(*stubserver.StubServer, *controllableConn)) {
329347
ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
330348
defer cancel()
331349

@@ -353,7 +371,21 @@ func runDisconnectLabelTest(t *testing.T, wantLabel string, triggerFunc func(*st
353371
OptionalLabels: []string{"grpc.disconnect_error"},
354372
}
355373

356-
cc, err := grpc.NewClient(grpcTarget, opentelemetry.DialOption(opentelemetry.Options{MetricsOptions: mo}), grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithResolvers(r))
374+
var mu sync.Mutex
375+
var lastConn *controllableConn
376+
dialer := func(ctx context.Context, addr string) (net.Conn, error) {
377+
conn, err := (&net.Dialer{}).DialContext(ctx, "tcp", addr)
378+
if err != nil {
379+
return nil, err
380+
}
381+
cc := &controllableConn{Conn: conn}
382+
mu.Lock()
383+
lastConn = cc
384+
mu.Unlock()
385+
return cc, nil
386+
}
387+
388+
cc, err := grpc.NewClient(grpcTarget, opentelemetry.DialOption(opentelemetry.Options{MetricsOptions: mo}), grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithResolvers(r), grpc.WithContextDialer(dialer))
357389
if err != nil {
358390
t.Fatalf("NewClient() failed: %v", err)
359391
}
@@ -365,8 +397,12 @@ func runDisconnectLabelTest(t *testing.T, wantLabel string, triggerFunc func(*st
365397
t.Fatalf("EmptyCall() failed: %v", err)
366398
}
367399

400+
mu.Lock()
401+
lc := lastConn
402+
mu.Unlock()
403+
368404
// Trigger disconnection
369-
triggerFunc(ss)
405+
triggerFunc(ss, lc)
370406

371407
// Wait for Idle state (disconnection happened)
372408
testutils.AwaitState(ctx, t, cc, connectivity.Idle)

0 commit comments

Comments
 (0)