Skip to content

Commit 4537eb8

Browse files
authored
Fail fast in current region when Document requests hit 410 Lease Not Found. (#46433)
* Adding a way to avoid barrier calls when at least 1 response in quorum read flow has 410-1022.
* Adding a way to avoid barrier calls when at least 1 response in quorum read flow has 410-1022.
* Adding a way to avoid barrier calls when at least 1 response in quorum read flow has 410-1022.
* Adding a way to avoid barrier calls when at least 1 response in quorum read flow has 410-1022.
* Adding a way to avoid barrier calls when at least 1 response in quorum read flow has 410-1022.
* Adding a way to avoid barrier calls when at least 1 response in quorum read flow has 410-1022.
* Adding a way to avoid barrier calls when at least 1 response in quorum read flow has 410-1022.
* Adding a way to avoid barrier calls when at least 1 response in quorum read flow has 410-1022.
* Modifying tests.
* Scope out barrier requests from the fail fast flow (for the time being).
* Fixing tests
* Adding replicaId and error code logs.
* Adding replicaId and error code logs.
* Adding replicaId and error code logs.
* Adding replicaId and error code logs.
* Specify work item for barrier handling on 410-1022s
* Addressing review comments.
* Updated CHANGELOG.md
* Addressing review comments.
* Addressing review comments.
1 parent 5bf4b0a commit 4537eb8

File tree

11 files changed

+352
-48
lines changed

11 files changed

+352
-48
lines changed

sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/implementation/directconnectivity/GoneAndRetryWithRetryPolicyTest.java

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import com.azure.cosmos.implementation.GoneException;
1010
import com.azure.cosmos.implementation.HttpConstants;
1111
import com.azure.cosmos.implementation.InvalidPartitionException;
12+
import com.azure.cosmos.implementation.LeaseNotFoundException;
1213
import com.azure.cosmos.implementation.OperationType;
1314
import com.azure.cosmos.implementation.PartitionIsMigratingException;
1415
import com.azure.cosmos.implementation.PartitionKeyRangeIsSplittingException;
@@ -76,6 +77,35 @@ public void shouldRetryReadWithGoneException() {
7677
assertThat(shouldRetryResult.backOffTime.getSeconds()).isEqualTo(4);
7778
}
7879

80+
@Test(groups = { "unit" }, timeOut = TIMEOUT)
81+
public void shouldNotRetryReadWithLeaseNotFoundException() {
82+
RxDocumentServiceRequest request = RxDocumentServiceRequest.create(
83+
mockDiagnosticsClientContext(),
84+
OperationType.Read,
85+
ResourceType.Document);
86+
GoneAndRetryWithRetryPolicy goneAndRetryWithRetryPolicy = new GoneAndRetryWithRetryPolicy(request, 30);
87+
Mono<ShouldRetryResult> singleShouldRetry = goneAndRetryWithRetryPolicy
88+
.shouldRetry(new LeaseNotFoundException("0", null));
89+
ShouldRetryResult shouldRetryResult = singleShouldRetry.block();
90+
assertThat(shouldRetryResult.shouldRetry).isFalse();
91+
assertThat(shouldRetryResult.policyArg.getValue0()).isTrue();
92+
assertThat(shouldRetryResult.policyArg.getValue3()).isEqualTo(1);
93+
assertThat(shouldRetryResult.backOffTime).isNull();
94+
95+
request = RxDocumentServiceRequest.create(
96+
mockDiagnosticsClientContext(),
97+
OperationType.Create,
98+
ResourceType.Document);
99+
goneAndRetryWithRetryPolicy = new GoneAndRetryWithRetryPolicy(request, 30);
100+
singleShouldRetry = goneAndRetryWithRetryPolicy
101+
.shouldRetry(new LeaseNotFoundException("0", null));
102+
shouldRetryResult = singleShouldRetry.block();
103+
assertThat(shouldRetryResult.shouldRetry).isFalse();
104+
assertThat(shouldRetryResult.policyArg.getValue0()).isTrue();
105+
assertThat(shouldRetryResult.policyArg.getValue3()).isEqualTo(1);
106+
assertThat(shouldRetryResult.backOffTime).isNull();
107+
}
108+
79109
/**
80110
* Retry with GoneException for write which is not yet sent to the wire,
81111
* retried 4 times and verified the returned

sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/implementation/directconnectivity/StoreReaderTest.java

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,13 @@
77
import com.azure.cosmos.CosmosException;
88
import com.azure.cosmos.implementation.DiagnosticsClientContext;
99
import com.azure.cosmos.implementation.DocumentServiceRequestContext;
10+
import com.azure.cosmos.implementation.Exceptions;
1011
import com.azure.cosmos.implementation.FailureValidator;
1112
import com.azure.cosmos.implementation.GoneException;
1213
import com.azure.cosmos.implementation.HttpConstants;
1314
import com.azure.cosmos.implementation.ISessionContainer;
1415
import com.azure.cosmos.implementation.ISessionToken;
16+
import com.azure.cosmos.implementation.LeaseNotFoundException;
1517
import com.azure.cosmos.implementation.NotFoundException;
1618
import com.azure.cosmos.implementation.OperationType;
1719
import com.azure.cosmos.implementation.PartitionIsMigratingException;
@@ -149,6 +151,7 @@ public Object[][] storeResponseArgProvider() {
149151
{ new PartitionKeyRangeIsSplittingException() , null, },
150152
{ new PartitionIsMigratingException(), null, },
151153
{ new GoneException(), null, },
154+
{ new LeaseNotFoundException(null, 0L, null, null), null },
152155
{ null, Mockito.mock(StoreResponse.class), }
153156
};
154157
}
@@ -915,9 +918,15 @@ public void storeResponseRecordedOnException(Exception ex, StoreResponse storeRe
915918
try {
916919
StoreReader.verifyCanContinueOnException((CosmosException) ex);
917920

918-
// for continuable exception, SDK will retry on all other replicas, so the failed endpoints should match replica counts.
919-
List<Uri> expectedFailedEndpoints = Arrays.asList(primaryUri, secondaryUri1, secondaryUri2, secondaryUri3);
920-
assertThat(dsr.requestContext.getFailedEndpoints()).hasSize(expectedFailedEndpoints.size()).containsAll(expectedFailedEndpoints);
921+
if (Exceptions.isAvoidQuorumSelectionException((CosmosException) ex)) {
922+
// Although the exception is continuable, it is also an avoid-quorum-selection exception, so its results are still collected.
923+
// Although these results are not valid, they are still recorded in the failed endpoints and also count toward decrementing replicaCountToRead, which avoids quorum reselection.
924+
assertThat(dsr.requestContext.getFailedEndpoints().size()).isEqualTo(3);
925+
} else {
926+
// for continuable exception, SDK will retry on all other replicas, so the failed endpoints should match replica counts.
927+
List<Uri> expectedFailedEndpoints = Arrays.asList(primaryUri, secondaryUri1, secondaryUri2, secondaryUri3);
928+
assertThat(dsr.requestContext.getFailedEndpoints()).hasSize(expectedFailedEndpoints.size()).containsAll(expectedFailedEndpoints);
929+
}
921930

922931
} catch (Exception exception) {
923932
if (exception instanceof CosmosException) {

0 commit comments

Comments
 (0)