Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion internal/xds/balancer/priority/balancer_priority.go
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ func (b *priorityBalancer) handleChildStateUpdate(childName string, s balancer.S
b.logger.Warningf("Ignoring update from child policy %q which is not in started state: %+v", childName, s)
return
}
originalState := child.state
child.state = s

// We start/stop the init timer of this child based on the new connectivity
Expand All @@ -199,7 +200,7 @@ func (b *priorityBalancer) handleChildStateUpdate(childName string, s balancer.S
child.reportedTF = true
child.stopInitTimer()
case connectivity.Connecting:
if !child.reportedTF {
if !child.reportedTF && originalState.ConnectivityState != connectivity.Connecting {
child.startInitTimer()
}
default:
Expand Down
79 changes: 79 additions & 0 deletions internal/xds/balancer/priority/balancer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2107,3 +2107,82 @@ func (s) TestPriority_HighPriorityUpdatesWhenLowInUse(t *testing.T) {
t.Fatal(err.Error())
}
}

// TestPriority_InitTimerNoRestartOnConnectingToConnecting verifies that
// receiving a CONNECTING update when already in CONNECTING state
// does not restart the priority init timer.
//
// Scenario:
// 1. Child-0 enters CONNECTING → timer starts (duration: T)
// 2. At T/2: Child-0 receives another CONNECTING update
// 3. Expected: Child-1 starts at time T (not T*1.5)
func (s) TestPriority_InitTimerNoRestartOnConnectingToConnecting(t *testing.T) {
_, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
defer cancel()

defer func(old time.Duration) {
DefaultPriorityInitTimeout = old
}(DefaultPriorityInitTimeout)
DefaultPriorityInitTimeout = defaultTestShortTimeout
Comment on lines +2123 to +2126
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Apologies for existing art like this.

Would you mind adding a helper function instead:

func overrideInitTimeout(t *testing.T, val time.Duration) {
  orig := DefaultPriorityInitTimeout
  DefaultPriorityInitTimeout = val
  t.Cleanup(func() { DefaultPriorityInitTimeout = orig })
}

And all existing tests and this new can have a one-liner to override the init timeout.


cc := testutils.NewBalancerClientConn(t)
bb := balancer.Get(Name)
pb := bb.Build(cc, balancer.BuildOptions{})
defer pb.Close()

// Two children, with priorities [0, 1], each with one backend.
if err := pb.UpdateClientConnState(balancer.ClientConnState{
ResolverState: resolver.State{
Endpoints: []resolver.Endpoint{
hierarchy.SetInEndpoint(resolver.Endpoint{Addresses: []resolver.Address{{Addr: testBackendAddrStrs[0]}}}, []string{"child-0"}),
hierarchy.SetInEndpoint(resolver.Endpoint{Addresses: []resolver.Address{{Addr: testBackendAddrStrs[1]}}}, []string{"child-1"}),
},
},
BalancerConfig: &LBConfig{
Children: map[string]*Child{
"child-0": {Config: &internalserviceconfig.BalancerConfig{Name: roundrobin.Name}},
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: Could we use pick_first instead of round_robin, since the latter anyways delegates to the former.

"child-1": {Config: &internalserviceconfig.BalancerConfig{Name: roundrobin.Name}},
},
Priorities: []string{"child-0", "child-1"},
},
}); err != nil {
t.Fatalf("failed to update ClientConn state: %v", err)
}

// child-0 will be started, and will create a SubConn.
addrs0 := <-cc.NewSubConnAddrsCh
if got, want := addrs0[0].Addr, testBackendAddrStrs[0]; got != want {
t.Fatalf("got unexpected new subconn addr: %v, want %v", got, want)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: How about a more readable error message like New subchannel created for address: %q, want: %q?

}
sc0 := <-cc.NewSubConnCh

// Send CONNECTING for child-0 - this should start the init timer.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The init timer is initially started when the child policy for that priority is created, which will happen when we call pb.UpdateClientConnState above. And as part of that in newChildBalancer, we also set the state in the childBalancer to Connecting.

Instead of relying on keeping track of the time elapsed between these events, what do you think about using the following state transitions which is possible in the real world:

  • Move the subchannel corresponding to priority 0 to Connecting
  • Move the subchannel corresponding to priority 0 to Ready
  • Move the subchannel corresponding to priority 0 to Idle
  • Move the subchannel corresponding to priority 0 to Connecting

And create a local variable for named timeAfterFunc and set it to what is happening currently:

// As a package global
var timeAfterFunc time.AfterFunc

func (cb *childBalancer) startInitTimer() {
	...
	// Instead of directly using time.AfterFunc, use the variable timeAfterFunc
	timerW.timer = timeAfterFunc(DefaultPriorityInitTimeout, func() {
		...
	})
}

From the test, change timeAfterFunc such that you write to a channel controlled by the test, and delegate to the actual afterFunc that was passed in. So, something like:

initTimerStarted := make(chan struct{}, 1)
origTimeAfterFunc := timeAfterFunc
timeAfterFunc = func(d time.Duration, f func()) *time.Timer {
  initTimerStarted <- struct{}{}
  time.AfterFunc(d, f)
}

Then, you can verify that a new timer is not created by reading from the channel.

sc0.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Connecting})

// Record the current time to measure the timer duration.
timerStartTime := time.Now()

// Send another CONNECTING update for child-0.
// This should NOT restart the timer.
time.Sleep(defaultTestShortTimeout / 2)
sc0.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Connecting})

// Wait for the original timer to expire and child-1 to be started.
// If the timer was restarted, child-1 would not be started within the
// original timeout period.
select {
case addrs1 := <-cc.NewSubConnAddrsCh:
if got, want := addrs1[0].Addr, testBackendAddrStrs[1]; got != want {
t.Fatalf("got unexpected new subconn addr: %v, want %v", got, want)
}
elapsed := time.Since(timerStartTime)
// The timer should expire close to the original timeout, not extended.
// We allow some buffer for test execution time.
if elapsed > defaultTestShortTimeout+50*time.Millisecond {
t.Fatalf("init timer took %v to expire, expected around %v (timer was likely restarted)", elapsed, defaultTestShortTimeout)
}
case <-time.After(defaultTestShortTimeout * 2):
// This is the expected behavior - child-1 should have been started after the original timer expired.
t.Fatal("child-1 was not started after init timer expiration")
}
}
Loading