Skip to content

Commit 0028d66

Browse files
authored
Perform cross-region retry of Document Read request when enclosing address request hits a 408:10002. (Azure#43937)
* Adding changes to do cross-region of Document request when enclosing address refresh request times out. * Modified tests. * Modified tests. * Modified tests. * Fixing live tests pipeline. * Fixing live tests pipeline. * Addressing review comments.
1 parent 2ce078b commit 0028d66

File tree

6 files changed

+160
-44
lines changed

6 files changed

+160
-44
lines changed

sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/faultinjection/FaultInjectionMetadataRequestRuleTests.java

Lines changed: 67 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import com.azure.cosmos.implementation.OperationType;
2020
import com.azure.cosmos.implementation.Utils;
2121
import com.azure.cosmos.implementation.throughputControl.TestItem;
22+
import com.azure.cosmos.models.CosmosItemResponse;
2223
import com.azure.cosmos.models.CosmosQueryRequestOptions;
2324
import com.azure.cosmos.models.FeedRange;
2425
import com.azure.cosmos.models.PartitionKey;
@@ -208,9 +209,9 @@ public void faultInjectionServerErrorRuleTests_AddressRefresh_ConnectionDelay(bo
208209
// when preferred regions is 2
209210
// Due to issue https://github.com/Azure/azure-sdk-for-java/issues/35779, the request mark the region unavailable will retry
210211
// in the unavailable region again, hence the addressRefresh fault injection will be happened 4 times
211-
validateFaultInjectionRuleAppliedForAddressResolution(cosmosDiagnostics, addressRefreshConnectionDelay, 4);
212+
validateFaultInjectionRuleAppliedForAddressResolution(cosmosDiagnostics, addressRefreshConnectionDelay, 4, false);
212213
} else {
213-
validateFaultInjectionRuleAppliedForAddressResolution(cosmosDiagnostics, addressRefreshConnectionDelay, 3);
214+
validateFaultInjectionRuleAppliedForAddressResolution(cosmosDiagnostics, addressRefreshConnectionDelay, 3, false);
214215
}
215216
} catch (CosmosException e) {
216217
fail("Request should be able to succeed by retrying in another region. " + e.getDiagnostics());
@@ -305,18 +306,34 @@ public void faultInjectionServerErrorRuleTests_AddressRefresh_ResponseDelay(
305306
CosmosDiagnostics cosmosDiagnostics =
306307
this.performDocumentOperation(container, operationType, createdItem);
307308

308-
assertThat(cosmosDiagnostics.getContactedRegionNames().size()).isEqualTo(1);
309-
assertThat(
310-
cosmosDiagnostics
311-
.getContactedRegionNames()
312-
.containsAll(Arrays.asList(this.readPreferredLocations.get(0).toLowerCase())))
313-
.isTrue();
314309

315-
assertThat(cosmosDiagnostics.getDiagnosticsContext().getStatusCode())
316-
.isEqualTo(HttpConstants.StatusCodes.REQUEST_TIMEOUT);
317-
assertThat(cosmosDiagnostics.getDiagnosticsContext().getSubStatusCode())
318-
.isEqualTo(HttpConstants.SubStatusCodes.GATEWAY_ENDPOINT_READ_TIMEOUT);
319-
validateFaultInjectionRuleAppliedForAddressResolution(cosmosDiagnostics, addressRefreshResponseDelay, 3);
310+
if (isNonWriteDocumentOperation(operationType)) {
311+
assertThat(cosmosDiagnostics.getContactedRegionNames().size()).isEqualTo(2);
312+
assertThat(
313+
cosmosDiagnostics
314+
.getContactedRegionNames()
315+
.containsAll(Arrays.asList(this.readPreferredLocations.get(0).toLowerCase(), this.readPreferredLocations.get(0).toLowerCase())))
316+
.isTrue();
317+
318+
assertThat(cosmosDiagnostics.getDiagnosticsContext().getStatusCode())
319+
.isNotEqualTo(HttpConstants.StatusCodes.REQUEST_TIMEOUT);
320+
assertThat(cosmosDiagnostics.getDiagnosticsContext().getSubStatusCode())
321+
.isNotEqualTo(HttpConstants.SubStatusCodes.GATEWAY_ENDPOINT_READ_TIMEOUT);
322+
} else {
323+
assertThat(cosmosDiagnostics.getContactedRegionNames().size()).isEqualTo(1);
324+
assertThat(
325+
cosmosDiagnostics
326+
.getContactedRegionNames()
327+
.containsAll(Arrays.asList(this.readPreferredLocations.get(0).toLowerCase())))
328+
.isTrue();
329+
330+
assertThat(cosmosDiagnostics.getDiagnosticsContext().getStatusCode())
331+
.isEqualTo(HttpConstants.StatusCodes.REQUEST_TIMEOUT);
332+
assertThat(cosmosDiagnostics.getDiagnosticsContext().getSubStatusCode())
333+
.isEqualTo(HttpConstants.SubStatusCodes.GATEWAY_ENDPOINT_READ_TIMEOUT);
334+
}
335+
336+
validateFaultInjectionRuleAppliedForAddressResolution(cosmosDiagnostics, addressRefreshResponseDelay, 3, operationType == OperationType.Query);
320337
} finally {
321338
addressRefreshResponseDelayRule.disable();
322339
dataOperationGoneRule.disable();
@@ -404,17 +421,25 @@ public void faultInjectionServerErrorRuleTests_AddressRefresh_byPartition(boolea
404421
Arrays.asList(addressRefreshResponseDelayRule, dataOperationGoneRule))
405422
.block();
406423

407-
// validate for request on feed range 0, it will fail
424+
// validate for request on feed range 0, address refresh request failures will happen but request will succeed in second region as request is Document read
408425
try {
409-
CosmosDiagnostics cosmosDiagnostics =
426+
CosmosItemResponse<JsonNode> itemResponse =
410427
container
411428
.readItem(itemOnFeedRange1.getId(), new PartitionKey(itemOnFeedRange1.getId()), JsonNode.class)
412-
.block()
413-
.getDiagnostics();
429+
.block();
414430

415-
fail("Item on feed range 1 should have failed. " + cosmosDiagnostics);
431+
assertThat(itemResponse).isNotNull();
432+
433+
CosmosDiagnostics cosmosDiagnostics = itemResponse.getDiagnostics();
434+
435+
assertThat(addressRefreshResponseDelayRule.getHitCount()).isGreaterThan(0);
436+
437+
assertThat(cosmosDiagnostics).isNotNull();
438+
assertThat(cosmosDiagnostics.getDiagnosticsContext()).isNotNull();
439+
assertThat(cosmosDiagnostics.getDiagnosticsContext().getContactedRegionNames()).isNotNull();
440+
assertThat(cosmosDiagnostics.getDiagnosticsContext().getContactedRegionNames().size()).isEqualTo(2);
416441
} catch (CosmosException e) {
417-
// no-op
442+
fail("Item on feed range 1 should have succeeded. " + e.getDiagnostics());
418443
}
419444

420445
try {
@@ -497,7 +522,7 @@ public void faultInjectionServerErrorRuleTests_AddressRefresh_TooManyRequest(boo
497522

498523
assertThat(cosmosDiagnostics.getContactedRegionNames().size()).isEqualTo(1);
499524
assertThat(cosmosDiagnostics.getContactedRegionNames().containsAll(Arrays.asList(this.readPreferredLocations.get(0).toLowerCase()))).isTrue();
500-
validateFaultInjectionRuleAppliedForAddressResolution(cosmosDiagnostics, addressRefreshTooManyRequest, 1);
525+
validateFaultInjectionRuleAppliedForAddressResolution(cosmosDiagnostics, addressRefreshTooManyRequest, 1, false);
501526
} finally {
502527
addressRefreshTooManyRequestRule.disable();
503528
dataOperationGoneRule.disable();
@@ -719,10 +744,24 @@ public void afterClass() {
719744
private void validateFaultInjectionRuleAppliedForAddressResolution(
720745
CosmosDiagnostics cosmosDiagnostics,
721746
String ruleId,
722-
int failureInjectedExpectedCount) throws JsonProcessingException {
747+
int failureInjectedExpectedCount,
748+
boolean isQueryOperation) throws JsonProcessingException {
723749

724750
ObjectNode diagnosticsNode = (ObjectNode) Utils.getSimpleObjectMapper().readTree(cosmosDiagnostics.toString());
725-
JsonNode addressResolutionStatistics = diagnosticsNode.get("addressResolutionStatistics");
751+
JsonNode addressResolutionStatistics;
752+
753+
if (isQueryOperation) {
754+
755+
ArrayNode arrayNode = (ArrayNode) diagnosticsNode.get("clientSideRequestStatistics");
756+
757+
assertThat(arrayNode).isNotNull();
758+
assertThat(arrayNode.get(0)).isNotNull();
759+
760+
addressResolutionStatistics = arrayNode.get(0).get("addressResolutionStatistics");
761+
} else {
762+
addressResolutionStatistics = diagnosticsNode.get("addressResolutionStatistics");
763+
}
764+
726765
Iterator<Map.Entry<String, JsonNode>> addressResolutionIterator = addressResolutionStatistics.fields();
727766
int failureInjectedCount = 0;
728767
while (addressResolutionIterator.hasNext()) {
@@ -736,6 +775,12 @@ private void validateFaultInjectionRuleAppliedForAddressResolution(
736775
assertThat(failureInjectedCount).isEqualTo(failureInjectedExpectedCount);
737776
}
738777

778+
private static boolean isNonWriteDocumentOperation(OperationType operationType) {
779+
return operationType == OperationType.Read ||
780+
operationType == OperationType.Query ||
781+
operationType == OperationType.ReadFeed;
782+
}
783+
739784
private static AccountLevelLocationContext getAccountLevelLocationContext(DatabaseAccount databaseAccount, boolean writeOnly) {
740785
Iterator<DatabaseAccountLocation> locationIterator =
741786
writeOnly ? databaseAccount.getWritableLocations().iterator() : databaseAccount.getReadableLocations().iterator();

sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/implementation/ClientRetryPolicyTest.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,9 @@ public static Object[][] operationProvider() {
3434
return new Object[][]{
3535
// OperationType, ResourceType, isAddressRequest, RequestFullPath, ShouldRetryCrossRegion
3636
{ OperationType.Read, ResourceType.Document, Boolean.FALSE, TEST_DOCUMENT_PATH, Boolean.TRUE },
37-
{ OperationType.Read, ResourceType.Document, Boolean.TRUE, TEST_DOCUMENT_PATH, Boolean.FALSE },
37+
{ OperationType.Read, ResourceType.Document, Boolean.TRUE, TEST_DOCUMENT_PATH, Boolean.TRUE },
3838
{ OperationType.Create, ResourceType.Document, Boolean.FALSE, TEST_DOCUMENT_PATH, Boolean.FALSE },
39+
{ OperationType.Create, ResourceType.Document, Boolean.TRUE, TEST_DOCUMENT_PATH, Boolean.FALSE },
3940
{ OperationType.Read, ResourceType.Database, Boolean.FALSE, TEST_DATABASE_PATH, Boolean.TRUE },
4041
{ OperationType.Create, ResourceType.Database, Boolean.FALSE, TEST_DATABASE_PATH, Boolean.FALSE },
4142
{ OperationType.QueryPlan, ResourceType.Document, Boolean.FALSE, TEST_DOCUMENT_PATH, Boolean.TRUE }

sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/rx/ClientRetryPolicyE2ETests.java

Lines changed: 72 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import com.azure.cosmos.implementation.Utils;
2222
import com.azure.cosmos.implementation.throughputControl.TestItem;
2323
import com.azure.cosmos.models.CosmosChangeFeedRequestOptions;
24+
import com.azure.cosmos.models.CosmosItemResponse;
2425
import com.azure.cosmos.models.CosmosPatchOperations;
2526
import com.azure.cosmos.models.CosmosQueryRequestOptions;
2627
import com.azure.cosmos.models.FeedRange;
@@ -207,7 +208,7 @@ public void queryPlanHttpTimeoutWillNotMarkRegionUnavailable(boolean shouldUsePr
207208
}
208209

209210
@Test(groups = { "multi-master" }, dataProvider = "preferredRegionsConfigProvider", timeOut = TIMEOUT)
210-
public void addressRefreshHttpTimeoutWillNotDoCrossRegionRetry(boolean shouldUsePreferredRegionsOnClient) {
211+
public void addressRefreshHttpTimeoutWillDoCrossRegionRetryForReads(boolean shouldUsePreferredRegionsOnClient) {
211212

212213
CosmosAsyncContainer resultantCosmosAsyncContainer;
213214
CosmosAsyncClient resultantCosmosAsyncClient;
@@ -262,10 +263,78 @@ public void addressRefreshHttpTimeoutWillNotDoCrossRegionRetry(boolean shouldUse
262263
resultantCosmosAsyncContainer,
263264
Arrays.asList(addressRefreshDelayRule, serverGoneRule)).block();
264265
try {
265-
resultantCosmosAsyncContainer
266+
CosmosItemResponse<TestItem> itemResponse = resultantCosmosAsyncContainer
266267
.readItem(newItem.getId(), new PartitionKey(newItem.getId()), TestItem.class)
267268
.block();
268-
fail("addressRefreshHttpTimeoutWillNotDoCrossRegionRetry() should fail due to addressRefresh timeout");
269+
270+
assertThat(itemResponse).isNotNull();
271+
272+
CosmosDiagnostics diagnostics = itemResponse.getDiagnostics();
273+
274+
assertThat(diagnostics.getContactedRegionNames().size()).isEqualTo(2);
275+
assertThat(diagnostics.getContactedRegionNames()).contains(this.preferredRegions.get(0));
276+
assertThat(diagnostics.getContactedRegionNames()).contains(this.preferredRegions.get(1));
277+
} finally {
278+
addressRefreshDelayRule.disable();
279+
serverGoneRule.disable();
280+
}
281+
}
282+
283+
@Test(groups = { "multi-master" }, dataProvider = "preferredRegionsConfigProvider", timeOut = TIMEOUT)
284+
public void addressRefreshHttpTimeoutWillNotDoCrossRegionRetryForWrites(boolean shouldUsePreferredRegionsOnClient) {
285+
286+
CosmosAsyncContainer resultantCosmosAsyncContainer;
287+
CosmosAsyncClient resultantCosmosAsyncClient;
288+
289+
if (shouldUsePreferredRegionsOnClient) {
290+
resultantCosmosAsyncClient = this.clientWithPreferredRegions;
291+
resultantCosmosAsyncContainer = this.cosmosAsyncContainerFromClientWithPreferredRegions;
292+
} else {
293+
resultantCosmosAsyncClient = this.clientWithoutPreferredRegions;
294+
resultantCosmosAsyncContainer = this.cosmosAsyncContainerFromClientWithoutPreferredRegions;
295+
}
296+
297+
if (BridgeInternal
298+
.getContextClient(resultantCosmosAsyncClient)
299+
.getConnectionPolicy()
300+
.getConnectionMode() != ConnectionMode.DIRECT) {
301+
throw new SkipException("queryPlanHttpTimeoutWillNotMarkRegionUnavailable() is only meant for DIRECT mode");
302+
}
303+
304+
// create fault injection rules for address refresh
305+
FaultInjectionRule addressRefreshDelayRule = new FaultInjectionRuleBuilder("addressRefreshDelayRule")
306+
.condition(
307+
new FaultInjectionConditionBuilder()
308+
.operationType(FaultInjectionOperationType.METADATA_REQUEST_ADDRESS_REFRESH)
309+
.build())
310+
.result(
311+
FaultInjectionResultBuilders.getResultBuilder(FaultInjectionServerErrorType.RESPONSE_DELAY)
312+
.delay(Duration.ofSeconds(11))
313+
.times(4) // make sure it will exhaust the local region retry
314+
.build()
315+
)
316+
.build();
317+
318+
// Create gone rules to force address refresh will bound to happen
319+
FaultInjectionRule serverGoneRule = new FaultInjectionRuleBuilder("serverGoneRule")
320+
.condition(
321+
new FaultInjectionConditionBuilder()
322+
.operationType(FaultInjectionOperationType.CREATE_ITEM)
323+
.build())
324+
.result(
325+
FaultInjectionResultBuilders.getResultBuilder(FaultInjectionServerErrorType.GONE)
326+
.times(4) // client is on session consistency, make sure exception will be bubbled up to goneAndRetry policy
327+
.build()
328+
)
329+
.build();
330+
331+
CosmosFaultInjectionHelper
332+
.configureFaultInjectionRules(
333+
resultantCosmosAsyncContainer,
334+
Arrays.asList(addressRefreshDelayRule, serverGoneRule)).block();
335+
try {
336+
TestItem newItem = TestItem.createNewItem();
337+
resultantCosmosAsyncContainer.createItem(newItem).block();
269338
} catch (CosmosException e) {
270339
assertThat(e.getDiagnostics().getContactedRegionNames().size()).isEqualTo(1);
271340
assertThat(e.getDiagnostics().getContactedRegionNames()).contains(this.preferredRegions.get(0));

sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/rx/WebExceptionRetryPolicyE2ETests.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ public void addressRefreshHttpTimeout() {
125125
FaultInjectionRule serverGoneRule = new FaultInjectionRuleBuilder("serverGoneRule")
126126
.condition(
127127
new FaultInjectionConditionBuilder()
128-
.operationType(FaultInjectionOperationType.READ_ITEM)
128+
.operationType(FaultInjectionOperationType.CREATE_ITEM)
129129
.build())
130130
.result(
131131
FaultInjectionResultBuilders.getResultBuilder(FaultInjectionServerErrorType.GONE)
@@ -140,7 +140,7 @@ public void addressRefreshHttpTimeout() {
140140
Arrays.asList(addressRefreshDelayRule, serverGoneRule)).block();
141141
try {
142142
cosmosAsyncContainer
143-
.readItem(newItem.getId(), new PartitionKey(newItem.getId()), TestItem.class)
143+
.createItem(newItem)
144144
.block();
145145
fail("addressRefreshHttpTimeout() should fail due to addressRefresh timeout");
146146
} catch (CosmosException e) {

sdk/cosmos/azure-cosmos/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#### Bugs Fixed
1010
* Fixes an issue in change feed processor where records are skipped and excessive requests are prefetched. - See [PR 43788](https://github.com/Azure/azure-sdk-for-java/pull/43788)
11+
* Perform cross-region retry for `Document` reads when enclosing address requests hit request timeouts (408:10002). - See [PR 43937](https://github.com/Azure/azure-sdk-for-java/pull/43937)
1112

1213
#### Other Changes
1314
* Added temporary internal-only option to enable thin client mode with system property COSMOS.THINCLIENT_ENABLED, setting the thin client endpoint with system property COSMOS.THINCLIENT_ENDPOINT, and default thin client endpoint with system property COSMOS.DEFAULT_THINCLIENT_ENDPOINT while the thin-client transport is still under development. This transport mode is not yet supported or ready to be used by external customers. Please don't use these configs in any production scenario yet. - [PR 43188](https://github.com/Azure/azure-sdk-for-java/pull/43188)

0 commit comments

Comments
 (0)