From 162d18066e5a0c65f89eabd89dc956d4fefe0114 Mon Sep 17 00:00:00 2001 From: "elric.lim" Date: Sat, 4 Oct 2025 15:41:44 +0900 Subject: [PATCH 1/4] check original state of child --- internal/xds/balancer/priority/balancer_priority.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/internal/xds/balancer/priority/balancer_priority.go b/internal/xds/balancer/priority/balancer_priority.go index 0be807c134a1..1ad5cfda14e5 100644 --- a/internal/xds/balancer/priority/balancer_priority.go +++ b/internal/xds/balancer/priority/balancer_priority.go @@ -186,6 +186,7 @@ func (b *priorityBalancer) handleChildStateUpdate(childName string, s balancer.S b.logger.Warningf("Ignoring update from child policy %q which is not in started state: %+v", childName, s) return } + originalState := child.state child.state = s // We start/stop the init timer of this child based on the new connectivity @@ -199,7 +200,7 @@ func (b *priorityBalancer) handleChildStateUpdate(childName string, s balancer.S child.reportedTF = true child.stopInitTimer() case connectivity.Connecting: - if !child.reportedTF { + if !child.reportedTF && originalState.ConnectivityState != connectivity.Connecting { child.startInitTimer() } default: From 291ed937fd92d8a57ed225ce5a9691c9fb1d6eed Mon Sep 17 00:00:00 2001 From: "elric.lim" Date: Sat, 4 Oct 2025 15:41:59 +0900 Subject: [PATCH 2/4] add test code --- .../xds/balancer/priority/balancer_test.go | 165 ++++++++++++++++++ 1 file changed, 165 insertions(+) diff --git a/internal/xds/balancer/priority/balancer_test.go b/internal/xds/balancer/priority/balancer_test.go index 3aac85ce54f8..246493630fe3 100644 --- a/internal/xds/balancer/priority/balancer_test.go +++ b/internal/xds/balancer/priority/balancer_test.go @@ -2107,3 +2107,168 @@ func (s) TestPriority_HighPriorityUpdatesWhenLowInUse(t *testing.T) { t.Fatal(err.Error()) } } + +// TestPriority_InitTimerNoRestartOnConnectingToConnecting verifies that the +// init timer is not restarted when a child transitions from CONNECTING to +// CONNECTING state. This test addresses the issue described in: +// https://github.com/grpc/grpc-go/issues/8516 +func (s) TestPriority_InitTimerNoRestartOnConnectingToConnecting(t *testing.T) { + _, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) + defer cancel() + + const testPriorityInitTimeout = 200 * time.Millisecond + defer func(old time.Duration) { + DefaultPriorityInitTimeout = old + }(DefaultPriorityInitTimeout) + DefaultPriorityInitTimeout = testPriorityInitTimeout + + cc := testutils.NewBalancerClientConn(t) + bb := balancer.Get(Name) + pb := bb.Build(cc, balancer.BuildOptions{}) + defer pb.Close() + + // Two children, with priorities [0, 1], each with one backend. + if err := pb.UpdateClientConnState(balancer.ClientConnState{ + ResolverState: resolver.State{ + Endpoints: []resolver.Endpoint{ + hierarchy.SetInEndpoint(resolver.Endpoint{Addresses: []resolver.Address{{Addr: testBackendAddrStrs[0]}}}, []string{"child-0"}), + hierarchy.SetInEndpoint(resolver.Endpoint{Addresses: []resolver.Address{{Addr: testBackendAddrStrs[1]}}}, []string{"child-1"}), + }, + }, + BalancerConfig: &LBConfig{ + Children: map[string]*Child{ + "child-0": {Config: &internalserviceconfig.BalancerConfig{Name: roundrobin.Name}}, + "child-1": {Config: &internalserviceconfig.BalancerConfig{Name: roundrobin.Name}}, + }, + Priorities: []string{"child-0", "child-1"}, + }, + }); err != nil { + t.Fatalf("failed to update ClientConn state: %v", err) + } + + // child-0 will be started, and will create a SubConn. + addrs0 := <-cc.NewSubConnAddrsCh + if got, want := addrs0[0].Addr, testBackendAddrStrs[0]; got != want { + t.Fatalf("got unexpected new subconn addr: %v, want %v", got, want) + } + sc0 := <-cc.NewSubConnCh + + // Send CONNECTING for child-0 - this should start the init timer. + sc0.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Connecting}) + + // Record the current time to measure the timer duration. + timerStartTime := time.Now() + + // Send another CONNECTING update for child-0. + // This should NOT restart the timer. + time.Sleep(testPriorityInitTimeout / 2) + sc0.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Connecting}) + + // Wait for the original timer to expire and child-1 to be started. + // If the timer was restarted, child-1 would not be started within the + // original timeout period. + select { + case addrs1 := <-cc.NewSubConnAddrsCh: + if got, want := addrs1[0].Addr, testBackendAddrStrs[1]; got != want { + t.Fatalf("got unexpected new subconn addr: %v, want %v", got, want) + } + elapsed := time.Since(timerStartTime) + // The timer should expire close to the original timeout, not extended. + // We allow some buffer for test execution time. + if elapsed > testPriorityInitTimeout+50*time.Millisecond { + t.Fatalf("init timer took %v to expire, expected around %v (timer was likely restarted)", elapsed, testPriorityInitTimeout) + } + case <-time.After(testPriorityInitTimeout * 2): + // This is the expected behavior - child-1 should have been started + // after the original timer expired. + t.Fatal("child-1 was not started after init timer expiration") + } +} + +// TestPriority_InitTimerStartsOnNonConnectingToConnecting verifies that the +// init timer starts when transitioning from non-CONNECTING states to CONNECTING. +func (s) TestPriority_InitTimerStartsOnNonConnectingToConnecting(t *testing.T) { + _, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) + defer cancel() + + const testPriorityInitTimeout = 200 * time.Millisecond + defer func(old time.Duration) { + DefaultPriorityInitTimeout = old + }(DefaultPriorityInitTimeout) + DefaultPriorityInitTimeout = testPriorityInitTimeout + + cc := testutils.NewBalancerClientConn(t) + bb := balancer.Get(Name) + pb := bb.Build(cc, balancer.BuildOptions{}) + defer pb.Close() + + // Two children, with priorities [0, 1], each with one backend. + if err := pb.UpdateClientConnState(balancer.ClientConnState{ + ResolverState: resolver.State{ + Endpoints: []resolver.Endpoint{ + hierarchy.SetInEndpoint(resolver.Endpoint{Addresses: []resolver.Address{{Addr: testBackendAddrStrs[0]}}}, []string{"child-0"}), + hierarchy.SetInEndpoint(resolver.Endpoint{Addresses: []resolver.Address{{Addr: testBackendAddrStrs[1]}}}, []string{"child-1"}), + }, + }, + BalancerConfig: &LBConfig{ + Children: map[string]*Child{ + "child-0": {Config: &internalserviceconfig.BalancerConfig{Name: roundrobin.Name}}, + "child-1": {Config: &internalserviceconfig.BalancerConfig{Name: roundrobin.Name}}, + }, + Priorities: []string{"child-0", "child-1"}, + }, + }); err != nil { + t.Fatalf("failed to update ClientConn state: %v", err) + } + + // child-0 will be started, and will create a SubConn. + addrs0 := <-cc.NewSubConnAddrsCh + if got, want := addrs0[0].Addr, testBackendAddrStrs[0]; got != want { + t.Fatalf("got unexpected new subconn addr: %v, want %v", got, want) + } + sc0 := <-cc.NewSubConnCh + + // Test 1: IDLE -> CONNECTING should start timer + sc0.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Idle}) + sc0.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Connecting}) + + timerStartTime := time.Now() + + // Wait for timer to expire and child-1 to start + select { + case addrs1 := <-cc.NewSubConnAddrsCh: + if got, want := addrs1[0].Addr, testBackendAddrStrs[1]; got != want { + t.Fatalf("got unexpected new subconn addr: %v, want %v", got, want) + } + elapsed := time.Since(timerStartTime) + if elapsed < testPriorityInitTimeout-50*time.Millisecond || elapsed > testPriorityInitTimeout+50*time.Millisecond { + t.Fatalf("init timer took %v to expire, expected around %v", elapsed, testPriorityInitTimeout) + } + case <-time.After(testPriorityInitTimeout * 2): + t.Fatal("child-1 was not started after init timer expiration (IDLE->CONNECTING)") + } + sc1 := <-cc.NewSubConnCh + + // Bring both children to READY to reset state + sc0.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Ready}) + sc1.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.TransientFailure}) + + // Test 2: TRANSIENT_FAILURE -> CONNECTING should start timer + sc0.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.TransientFailure}) + sc0.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Connecting}) + + timerStartTime2 := time.Now() + + // Since child-0 went from TF to Connecting, child-1 should be started after timer + select { + case <-cc.NewSubConnAddrsCh: + // child-1 subconn might be recreated + elapsed := time.Since(timerStartTime2) + if elapsed < testPriorityInitTimeout-50*time.Millisecond || elapsed > testPriorityInitTimeout+50*time.Millisecond { + t.Fatalf("init timer took %v to expire, expected around %v (TF->CONNECTING)", elapsed, testPriorityInitTimeout) + } + case <-time.After(testPriorityInitTimeout * 2): + // Timer should have expired and failover should happen + // but child-1 might already be in use, which is also acceptable + } +} From 141e6d7eedc5419fa827dee5c53644c939b4aaa2 Mon Sep 17 00:00:00 2001 From: "elric.lim" Date: Sat, 4 Oct 2025 16:11:02 +0900 Subject: [PATCH 3/4] update to defaultTestShortTimeout --- .../xds/balancer/priority/balancer_test.go | 34 ++++++++----------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/internal/xds/balancer/priority/balancer_test.go b/internal/xds/balancer/priority/balancer_test.go index 246493630fe3..86c1ca7a112e 100644 --- a/internal/xds/balancer/priority/balancer_test.go +++ b/internal/xds/balancer/priority/balancer_test.go @@ -2108,19 +2108,14 @@ func (s) TestPriority_HighPriorityUpdatesWhenLowInUse(t *testing.T) { } } -// TestPriority_InitTimerNoRestartOnConnectingToConnecting verifies that the -// init timer is not restarted when a child transitions from CONNECTING to -// CONNECTING state. This test addresses the issue described in: -// https://github.com/grpc/grpc-go/issues/8516 func (s) TestPriority_InitTimerNoRestartOnConnectingToConnecting(t *testing.T) { _, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) defer cancel() - const testPriorityInitTimeout = 200 * time.Millisecond defer func(old time.Duration) { DefaultPriorityInitTimeout = old }(DefaultPriorityInitTimeout) - DefaultPriorityInitTimeout = testPriorityInitTimeout + DefaultPriorityInitTimeout = defaultTestShortTimeout cc := testutils.NewBalancerClientConn(t) bb := balancer.Get(Name) @@ -2161,7 +2156,7 @@ func (s) TestPriority_InitTimerNoRestartOnConnectingToConnecting(t *testing.T) { // Send another CONNECTING update for child-0. // This should NOT restart the timer. - time.Sleep(testPriorityInitTimeout / 2) + time.Sleep(defaultTestShortTimeout / 2) sc0.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Connecting}) // Wait for the original timer to expire and child-1 to be started. @@ -2175,10 +2170,10 @@ func (s) TestPriority_InitTimerNoRestartOnConnectingToConnecting(t *testing.T) { elapsed := time.Since(timerStartTime) // The timer should expire close to the original timeout, not extended. // We allow some buffer for test execution time. - if elapsed > testPriorityInitTimeout+50*time.Millisecond { - t.Fatalf("init timer took %v to expire, expected around %v (timer was likely restarted)", elapsed, testPriorityInitTimeout) + if elapsed > defaultTestShortTimeout+50*time.Millisecond { + t.Fatalf("init timer took %v to expire, expected around %v (timer was likely restarted)", elapsed, defaultTestShortTimeout) } - case <-time.After(testPriorityInitTimeout * 2): + case <-time.After(defaultTestShortTimeout * 2): // This is the expected behavior - child-1 should have been started // after the original timer expired. t.Fatal("child-1 was not started after init timer expiration") @@ -2191,11 +2186,10 @@ func (s) TestPriority_InitTimerStartsOnNonConnectingToConnecting(t *testing.T) { _, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) defer cancel() - const testPriorityInitTimeout = 200 * time.Millisecond defer func(old time.Duration) { DefaultPriorityInitTimeout = old }(DefaultPriorityInitTimeout) - DefaultPriorityInitTimeout = testPriorityInitTimeout + DefaultPriorityInitTimeout = defaultTestShortTimeout cc := testutils.NewBalancerClientConn(t) bb := balancer.Get(Name) @@ -2231,9 +2225,9 @@ func (s) TestPriority_InitTimerStartsOnNonConnectingToConnecting(t *testing.T) { // Test 1: IDLE -> CONNECTING should start timer sc0.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Idle}) sc0.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Connecting}) - + timerStartTime := time.Now() - + // Wait for timer to expire and child-1 to start select { case addrs1 := <-cc.NewSubConnAddrsCh: @@ -2241,10 +2235,10 @@ func (s) TestPriority_InitTimerStartsOnNonConnectingToConnecting(t *testing.T) { t.Fatalf("got unexpected new subconn addr: %v, want %v", got, want) } elapsed := time.Since(timerStartTime) - if elapsed < testPriorityInitTimeout-50*time.Millisecond || elapsed > testPriorityInitTimeout+50*time.Millisecond { - t.Fatalf("init timer took %v to expire, expected around %v", elapsed, testPriorityInitTimeout) + if elapsed < defaultTestShortTimeout-50*time.Millisecond || elapsed > defaultTestShortTimeout+50*time.Millisecond { + t.Fatalf("init timer took %v to expire, expected around %v", elapsed, defaultTestShortTimeout) } - case <-time.After(testPriorityInitTimeout * 2): + case <-time.After(defaultTestShortTimeout * 2): t.Fatal("child-1 was not started after init timer expiration (IDLE->CONNECTING)") } sc1 := <-cc.NewSubConnCh @@ -2264,10 +2258,10 @@ func (s) TestPriority_InitTimerStartsOnNonConnectingToConnecting(t *testing.T) { case <-cc.NewSubConnAddrsCh: // child-1 subconn might be recreated elapsed := time.Since(timerStartTime2) - if elapsed < testPriorityInitTimeout-50*time.Millisecond || elapsed > testPriorityInitTimeout+50*time.Millisecond { - t.Fatalf("init timer took %v to expire, expected around %v (TF->CONNECTING)", elapsed, testPriorityInitTimeout) + if elapsed < defaultTestShortTimeout-50*time.Millisecond || elapsed > defaultTestShortTimeout+50*time.Millisecond { + t.Fatalf("init timer took %v to expire, expected around %v (TF->CONNECTING)", elapsed, defaultTestShortTimeout) } - case <-time.After(testPriorityInitTimeout * 2): + case <-time.After(defaultTestShortTimeout * 2): // Timer should have expired and failover should happen // but child-1 might already be in use, which is also acceptable } From d5159831ccde20141adf7f92de9f5286013c1997 Mon Sep 17 00:00:00 2001 From: "elric.lim" Date: Tue, 7 Oct 2025 00:35:35 +0900 Subject: [PATCH 4/4] fix unecessary test --- .../xds/balancer/priority/balancer_test.go | 98 ++----------------- 1 file changed, 9 insertions(+), 89 deletions(-) diff --git a/internal/xds/balancer/priority/balancer_test.go b/internal/xds/balancer/priority/balancer_test.go index 86c1ca7a112e..4a1b05e653da 100644 --- a/internal/xds/balancer/priority/balancer_test.go +++ b/internal/xds/balancer/priority/balancer_test.go @@ -2108,6 +2108,14 @@ func (s) TestPriority_HighPriorityUpdatesWhenLowInUse(t *testing.T) { } } +// TestPriority_InitTimerNoRestartOnConnectingToConnecting verifies that +// receiving a CONNECTING update when already in CONNECTING state +// does not restart the priority init timer. +// +// Scenario: +// 1. Child-0 enters CONNECTING → timer starts (duration: T) +// 2. At T/2: Child-0 receives another CONNECTING update +// 3. Expected: Child-1 starts at time T (not T*1.5) func (s) TestPriority_InitTimerNoRestartOnConnectingToConnecting(t *testing.T) { _, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) defer cancel() @@ -2174,95 +2182,7 @@ func (s) TestPriority_InitTimerNoRestartOnConnectingToConnecting(t *testing.T) { t.Fatalf("init timer took %v to expire, expected around %v (timer was likely restarted)", elapsed, defaultTestShortTimeout) } case <-time.After(defaultTestShortTimeout * 2): - // This is the expected behavior - child-1 should have been started - // after the original timer expired. + // This is the expected behavior - child-1 should have been started after the original timer expired. t.Fatal("child-1 was not started after init timer expiration") } } - -// TestPriority_InitTimerStartsOnNonConnectingToConnecting verifies that the -// init timer starts when transitioning from non-CONNECTING states to CONNECTING. -func (s) TestPriority_InitTimerStartsOnNonConnectingToConnecting(t *testing.T) { - _, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) - defer cancel() - - defer func(old time.Duration) { - DefaultPriorityInitTimeout = old - }(DefaultPriorityInitTimeout) - DefaultPriorityInitTimeout = defaultTestShortTimeout - - cc := testutils.NewBalancerClientConn(t) - bb := balancer.Get(Name) - pb := bb.Build(cc, balancer.BuildOptions{}) - defer pb.Close() - - // Two children, with priorities [0, 1], each with one backend. - if err := pb.UpdateClientConnState(balancer.ClientConnState{ - ResolverState: resolver.State{ - Endpoints: []resolver.Endpoint{ - hierarchy.SetInEndpoint(resolver.Endpoint{Addresses: []resolver.Address{{Addr: testBackendAddrStrs[0]}}}, []string{"child-0"}), - hierarchy.SetInEndpoint(resolver.Endpoint{Addresses: []resolver.Address{{Addr: testBackendAddrStrs[1]}}}, []string{"child-1"}), - }, - }, - BalancerConfig: &LBConfig{ - Children: map[string]*Child{ - "child-0": {Config: &internalserviceconfig.BalancerConfig{Name: roundrobin.Name}}, - "child-1": {Config: &internalserviceconfig.BalancerConfig{Name: roundrobin.Name}}, - }, - Priorities: []string{"child-0", "child-1"}, - }, - }); err != nil { - t.Fatalf("failed to update ClientConn state: %v", err) - } - - // child-0 will be started, and will create a SubConn. - addrs0 := <-cc.NewSubConnAddrsCh - if got, want := addrs0[0].Addr, testBackendAddrStrs[0]; got != want { - t.Fatalf("got unexpected new subconn addr: %v, want %v", got, want) - } - sc0 := <-cc.NewSubConnCh - - // Test 1: IDLE -> CONNECTING should start timer - sc0.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Idle}) - sc0.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Connecting}) - - timerStartTime := time.Now() - - // Wait for timer to expire and child-1 to start - select { - case addrs1 := <-cc.NewSubConnAddrsCh: - if got, want := addrs1[0].Addr, testBackendAddrStrs[1]; got != want { - t.Fatalf("got unexpected new subconn addr: %v, want %v", got, want) - } - elapsed := time.Since(timerStartTime) - if elapsed < defaultTestShortTimeout-50*time.Millisecond || elapsed > defaultTestShortTimeout+50*time.Millisecond { - t.Fatalf("init timer took %v to expire, expected around %v", elapsed, defaultTestShortTimeout) - } - case <-time.After(defaultTestShortTimeout * 2): - t.Fatal("child-1 was not started after init timer expiration (IDLE->CONNECTING)") - } - sc1 := <-cc.NewSubConnCh - - // Bring both children to READY to reset state - sc0.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Ready}) - sc1.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.TransientFailure}) - - // Test 2: TRANSIENT_FAILURE -> CONNECTING should start timer - sc0.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.TransientFailure}) - sc0.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Connecting}) - - timerStartTime2 := time.Now() - - // Since child-0 went from TF to Connecting, child-1 should be started after timer - select { - case <-cc.NewSubConnAddrsCh: - // child-1 subconn might be recreated - elapsed := time.Since(timerStartTime2) - if elapsed < defaultTestShortTimeout-50*time.Millisecond || elapsed > defaultTestShortTimeout+50*time.Millisecond { - t.Fatalf("init timer took %v to expire, expected around %v (TF->CONNECTING)", elapsed, defaultTestShortTimeout) - } - case <-time.After(defaultTestShortTimeout * 2): - // Timer should have expired and failover should happen - // but child-1 might already be in use, which is also acceptable - } -}