Skip to content

Commit c4256ad

Browse files
committed
xds: Align PriorityLB child selection with A56
The PriorityLB predates A56. tryNextPriority() now matches ChoosePriority() from the gRFC. The biggest change is waiting on CONNECTING children instead of failing after the failOverTimer fires. The failOverTimer should be used to start lower priorities more eagerly, but shouldn't cause the overall connectivity state to become TRANSIENT_FAILURE on its own. The prior behavior of creating the "Connection timeout for priority" failing picker was particularly strange, because it didn't update child's connectivity state. This previous behavior was creating errors because of the failOverTimer with no way to diagnose what was going wrong. b/428517222
1 parent 6ff8eca commit c4256ad

File tree

2 files changed

+37
-21
lines changed

2 files changed

+37
-21
lines changed

xds/src/main/java/io/grpc/xds/PriorityLoadBalancer.java

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,8 @@ private Status tryNextPriority() {
154154
ChildLbState child =
155155
new ChildLbState(priority, priorityConfigs.get(priority).ignoreReresolution);
156156
children.put(priority, child);
157-
updateOverallState(priority, CONNECTING, new FixedResultPicker(PickResult.withNoResult()));
157+
// Child is created in CONNECTING with pending failOverTimer
158+
updateOverallState(priority, child.connectivityState, child.picker);
158159
// Calling the child's updateResolvedAddresses() can result in tryNextPriority() being
159160
// called recursively. We need to be sure to be done with processing here before it is
160161
// called.
@@ -173,21 +174,23 @@ private Status tryNextPriority() {
173174
}
174175
return Status.OK;
175176
}
176-
if (child.failOverTimer != null && child.failOverTimer.isPending()) {
177+
if (child.failOverTimer.isPending()) {
177178
updateOverallState(priority, child.connectivityState, child.picker);
178179
return Status.OK; // Give priority i time to connect.
179180
}
180-
if (priority.equals(currentPriority) && child.connectivityState != TRANSIENT_FAILURE) {
181-
// If the current priority is not changed into TRANSIENT_FAILURE, keep using it.
181+
}
182+
for (int i = 0; i < priorityNames.size(); i++) {
183+
String priority = priorityNames.get(i);
184+
ChildLbState child = children.get(priority);
185+
if (child.connectivityState.equals(CONNECTING)) {
182186
updateOverallState(priority, child.connectivityState, child.picker);
183187
return Status.OK;
184188
}
185189
}
186-
// TODO(zdapeng): Include error details of each priority.
187190
logger.log(XdsLogLevel.DEBUG, "All priority failed");
188191
String lastPriority = priorityNames.get(priorityNames.size() - 1);
189-
SubchannelPicker errorPicker = children.get(lastPriority).picker;
190-
updateOverallState(lastPriority, TRANSIENT_FAILURE, errorPicker);
192+
ChildLbState child = children.get(lastPriority);
193+
updateOverallState(lastPriority, child.connectivityState, child.picker);
191194
return Status.OK;
192195
}
193196

@@ -231,10 +234,7 @@ public void run() {
231234
// The child is deactivated.
232235
return;
233236
}
234-
picker = new FixedResultPicker(PickResult.withError(
235-
Status.UNAVAILABLE.withDescription("Connection timeout for priority " + priority)));
236237
logger.log(XdsLogLevel.DEBUG, "Priority {0} failed over to next", priority);
237-
currentPriority = null; // reset currentPriority to guarantee failover happen
238238
Status status = tryNextPriority();
239239
if (!status.isOk()) {
240240
// A child had a problem with the addresses/config. Request it to be refreshed

xds/src/test/java/io/grpc/xds/PriorityLoadBalancerTest.java

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,7 @@ public PickResult pickSubchannel(PickSubchannelArgs args) {
376376
assertThat(fooBalancers).hasSize(2);
377377
assertThat(fooHelpers).hasSize(2);
378378
LoadBalancer balancer1 = Iterables.getLast(fooBalancers);
379+
Helper helper1 = Iterables.getLast(fooHelpers);
379380

380381
// p1 timeout, and fails over to p2
381382
fakeClock.forwardTime(10, TimeUnit.SECONDS);
@@ -423,14 +424,20 @@ public PickResult pickSubchannel(PickSubchannelArgs args) {
423424
LoadBalancer balancer3 = Iterables.getLast(fooBalancers);
424425
Helper helper3 = Iterables.getLast(fooHelpers);
425426

426-
// p3 timeout then the channel should go to TRANSIENT_FAILURE
427+
// p3 timeout then the channel should stay in CONNECTING
427428
fakeClock.forwardTime(10, TimeUnit.SECONDS);
428-
assertCurrentPickerReturnsError(Status.Code.UNAVAILABLE, "timeout");
429+
assertCurrentPicker(CONNECTING, PickResult.withNoResult());
429430

430-
// p3 fails then the picker should have error status updated
431+
// p3 fails then the picker should still be waiting on p1
431432
helper3.updateBalancingState(
432433
TRANSIENT_FAILURE,
433434
new FixedResultPicker(PickResult.withError(Status.DATA_LOSS.withDescription("foo"))));
435+
assertCurrentPicker(CONNECTING, PickResult.withNoResult());
436+
437+
// p1 fails then the picker should have error status updated to p3
438+
helper1.updateBalancingState(
439+
TRANSIENT_FAILURE,
440+
new FixedResultPicker(PickResult.withError(Status.DATA_LOSS.withDescription("bar"))));
434441
assertCurrentPickerReturnsError(Status.Code.DATA_LOSS, "foo");
435442

436443
// p2 gets back to READY
@@ -642,6 +649,7 @@ public void typicalPriorityFailOverFlowWithIdleUpdate() {
642649
assertThat(fooBalancers).hasSize(2);
643650
assertThat(fooHelpers).hasSize(2);
644651
LoadBalancer balancer1 = Iterables.getLast(fooBalancers);
652+
Helper helper1 = Iterables.getLast(fooHelpers);
645653

646654
// p1 timeout, and fails over to p2
647655
fakeClock.forwardTime(10, TimeUnit.SECONDS);
@@ -677,14 +685,20 @@ public void typicalPriorityFailOverFlowWithIdleUpdate() {
677685
LoadBalancer balancer3 = Iterables.getLast(fooBalancers);
678686
Helper helper3 = Iterables.getLast(fooHelpers);
679687

680-
// p3 timeout then the channel should go to TRANSIENT_FAILURE
688+
// p3 timeout then the channel should stay in CONNECTING
681689
fakeClock.forwardTime(10, TimeUnit.SECONDS);
682-
assertCurrentPickerReturnsError(Status.Code.UNAVAILABLE, "timeout");
690+
assertCurrentPicker(CONNECTING, PickResult.withNoResult());
683691

684-
// p3 fails then the picker should have error status updated
692+
// p3 fails then the picker should still be waiting on p1
685693
helper3.updateBalancingState(
686694
TRANSIENT_FAILURE,
687695
new FixedResultPicker(PickResult.withError(Status.DATA_LOSS.withDescription("foo"))));
696+
assertCurrentPicker(CONNECTING, PickResult.withNoResult());
697+
698+
// p1 fails then the picker should have error status updated to p3
699+
helper1.updateBalancingState(
700+
TRANSIENT_FAILURE,
701+
new FixedResultPicker(PickResult.withError(Status.DATA_LOSS.withDescription("bar"))));
688702
assertCurrentPickerReturnsError(Status.Code.DATA_LOSS, "foo");
689703

690704
// p2 gets back to IDLE
@@ -863,15 +877,17 @@ private void assertCurrentPickerReturnsError(
863877
}
864878

865879
private void assertCurrentPickerPicksSubchannel(Subchannel expectedSubchannelToPick) {
866-
assertLatestConnectivityState(READY);
867-
PickResult pickResult = pickerCaptor.getValue().pickSubchannel(mock(PickSubchannelArgs.class));
868-
assertThat(pickResult.getSubchannel()).isEqualTo(expectedSubchannelToPick);
880+
assertCurrentPicker(READY, PickResult.withSubchannel(expectedSubchannelToPick));
869881
}
870882

871883
private void assertCurrentPickerIsBufferPicker() {
872-
assertLatestConnectivityState(IDLE);
884+
assertCurrentPicker(IDLE, PickResult.withNoResult());
885+
}
886+
887+
private void assertCurrentPicker(ConnectivityState state, PickResult result) {
888+
assertLatestConnectivityState(state);
873889
PickResult pickResult = pickerCaptor.getValue().pickSubchannel(mock(PickSubchannelArgs.class));
874-
assertThat(pickResult).isEqualTo(PickResult.withNoResult());
890+
assertThat(pickResult).isEqualTo(result);
875891
}
876892

877893
private Object newChildConfig(LoadBalancerProvider provider, Object config) {

0 commit comments

Comments
 (0)