Skip to content

Commit 9a65e74

Browse files
Change default read timeout for database account (Azure#39596)
* change default read timeout * fix tests * Add read timeout tests for database account calls * fix timeout retry policy * Fixed the timeout logic * Fixed the timeout retry policy * Mock tests for timeout and failover retry policy * Fixed async tests * Updated versions for release and changelog * Add tests for cross region tries --------- Co-authored-by: Kushagra Thapar <[email protected]> Co-authored-by: Kushagra Thapar <[email protected]>
1 parent 6f22379 commit 9a65e74

14 files changed

+415
-75
lines changed

sdk/cosmos/azure-cosmos/CHANGELOG.md

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,11 @@
11
## Release History
22

3-
### 4.9.1b4 (Unreleased)
4-
5-
#### Features Added
6-
7-
#### Breaking Changes
3+
### 4.9.1b4 (2025-02-06)
84

95
#### Bugs Fixed
10-
11-
#### Other Changes
6+
* Improved retry logic for read requests to failover on other regions in case of timeouts and any error codes >= 500. See [PR 39596](https://github.com/Azure/azure-sdk-for-python/pull/39596)
7+
* Fixed a regression where read operations were not retrying on timeouts. See [PR 39596](https://github.com/Azure/azure-sdk-for-python/pull/39596)
8+
* Updated default read timeout for `getDatabaseAccount` calls to 3 seconds. See [PR 39596](https://github.com/Azure/azure-sdk-for-python/pull/39596)
129

1310
### 4.9.1b3 (2025-02-04)
1411

sdk/cosmos/azure-cosmos/azure/cosmos/_retry_utility.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,6 @@ def Execute(client, global_endpoint_manager, function, *args, **kwargs):
131131
sub_status_code=SubStatusCodes.THROUGHPUT_OFFER_NOT_FOUND)
132132
return result
133133
except exceptions.CosmosHttpResponseError as e:
134-
retry_policy = defaultRetry_policy
135134
if request and _has_database_account_header(request.headers):
136135
retry_policy = database_account_retry_policy
137136
# Re-assign retry policy based on error code
@@ -173,8 +172,12 @@ def Execute(client, global_endpoint_manager, function, *args, **kwargs):
173172

174173
retry_policy.container_rid = cached_container["_rid"]
175174
request.headers[retry_policy._intended_headers] = retry_policy.container_rid
176-
elif e.status_code in [StatusCodes.REQUEST_TIMEOUT, StatusCodes.SERVICE_UNAVAILABLE]:
175+
elif e.status_code == StatusCodes.REQUEST_TIMEOUT:
177176
retry_policy = timeout_failover_retry_policy
177+
elif e.status_code >= StatusCodes.INTERNAL_SERVER_ERROR:
178+
retry_policy = timeout_failover_retry_policy
179+
else:
180+
retry_policy = defaultRetry_policy
178181

179182
# If none of the retry policies applies or there is no retry needed, set the
180183
# throttle related response headers and re-throw the exception back arg[0]

sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ def _Request(global_endpoint_manager, request_params, connection_policy, pipelin
9393
if request_params.resource_type != http_constants.ResourceType.DatabaseAccount:
9494
global_endpoint_manager.refresh_endpoint_list(None, **kwargs)
9595
else:
96+
# always override database account call timeouts
97+
read_timeout = connection_policy.DBAReadTimeout
9698
connection_timeout = connection_policy.DBAConnectionTimeout
9799
if client_timeout is not None:
98100
kwargs['timeout'] = client_timeout - (time.time() - start_time)

sdk/cosmos/azure-cosmos/azure/cosmos/_timeout_failover_retry_policy.py

Lines changed: 8 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,18 @@
55
Cosmos database service.
66
"""
77
from azure.cosmos.documents import _OperationType
8-
from . import http_constants
98

109

1110
class _TimeoutFailoverRetryPolicy(object):
1211

1312
def __init__(self, connection_policy, global_endpoint_manager, *args):
14-
self._max_retry_attempt_count = 120
15-
self._max_service_unavailable_retry_count = 1
16-
self.retry_after_in_milliseconds = 0
13+
self.retry_after_in_milliseconds = 500
1714
self.args = args
1815

1916
self.global_endpoint_manager = global_endpoint_manager
17+
# If an account only has 1 region, then we still want to retry once on the same region
18+
self._max_retry_attempt_count = len(self.global_endpoint_manager.location_cache.read_regional_endpoints) + 1
2019
self.retry_count = 0
21-
self.location_index = 0
2220
self.connection_policy = connection_policy
2321
self.request = args[0] if args else None
2422

@@ -29,64 +27,32 @@ def ShouldRetry(self, _exception):
2927
:returns: a boolean stating whether the request should be retried
3028
:rtype: bool
3129
"""
32-
if self.request:
33-
if _OperationType.IsReadOnlyOperation(self.request.operation_type):
34-
return False
30+
# we don't retry on write operations for timeouts or any internal server errors
31+
if self.request and (not _OperationType.IsReadOnlyOperation(self.request.operation_type)):
32+
return False
3533

3634
if not self.connection_policy.EnableEndpointDiscovery:
3735
return False
3836

39-
40-
# Check if the next retry about to be done is safe
41-
if _exception.status_code == http_constants.StatusCodes.SERVICE_UNAVAILABLE and \
42-
self.retry_count >= self._max_service_unavailable_retry_count:
43-
return False
4437
self.retry_count += 1
4538
# Check if the next retry about to be done is safe
4639
if self.retry_count >= self._max_retry_attempt_count:
4740
return False
4841

4942
if self.request:
50-
# Update the last routed location to where this request was routed previously.
51-
# So that we can check in location cache if we need to return the current or previous
52-
# based on where the request was routed previously.
53-
self.request.last_routed_location_endpoint_within_region = self.request.location_endpoint_to_route
54-
55-
if _OperationType.IsReadOnlyOperation(self.request.operation_type):
56-
# We just directly got to the next location in case of read requests
57-
# We don't retry again on the same region for regional endpoint
58-
location_endpoint = self.resolve_next_region_service_endpoint()
59-
else:
60-
self.global_endpoint_manager.swap_regional_endpoint_values(self.request)
61-
location_endpoint = self.resolve_current_region_service_endpoint()
62-
# This is the case where both current and previous point to the same writable endpoint
63-
# In this case we don't want to retry again, rather failover to the next region
64-
if self.request.last_routed_location_endpoint_within_region == location_endpoint:
65-
location_endpoint = self.resolve_next_region_service_endpoint()
66-
43+
location_endpoint = self.resolve_next_region_service_endpoint()
6744
self.request.route_to_location(location_endpoint)
6845
return True
6946

70-
71-
# This function prepares the request to go to the second endpoint in the same region
72-
def resolve_current_region_service_endpoint(self):
73-
# clear previous location-based routing directive
74-
self.request.clear_route_to_location()
75-
# resolve the next service endpoint in the same region
76-
# since we maintain 2 endpoints per region for write operations
77-
self.request.route_to_location_with_preferred_location_flag(self.location_index, True)
78-
return self.global_endpoint_manager.resolve_service_endpoint(self.request)
79-
8047
# This function prepares the request to go to the next region
8148
def resolve_next_region_service_endpoint(self):
82-
self.location_index += 1
8349
# clear previous location-based routing directive
8450
self.request.clear_route_to_location()
8551
# clear the last routed endpoint within same region since we are going to a new region now
8652
self.request.last_routed_location_endpoint_within_region = None
8753
# set location-based routing directive based on retry count
8854
# ensuring usePreferredLocations is set to True for retry
89-
self.request.route_to_location_with_preferred_location_flag(self.location_index, True)
55+
self.request.route_to_location_with_preferred_location_flag(self.retry_count, True)
9056
# Resolve the endpoint for the request and pin the resolution to the resolved endpoint
9157
# This enables marking the endpoint unavailability on endpoint failover/unreachability
9258
return self.global_endpoint_manager.resolve_service_endpoint(self.request)

sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ async def _Request(global_endpoint_manager, request_params, connection_policy, p
6262
if request_params.resource_type != http_constants.ResourceType.DatabaseAccount:
6363
await global_endpoint_manager.refresh_endpoint_list(None, **kwargs)
6464
else:
65+
# always override database account call timeouts
66+
read_timeout = connection_policy.DBAReadTimeout
6567
connection_timeout = connection_policy.DBAConnectionTimeout
6668
if client_timeout is not None:
6769
kwargs['timeout'] = client_timeout - (time.time() - start_time)

sdk/cosmos/azure-cosmos/azure/cosmos/aio/_retry_utility_async.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,6 @@ async def ExecuteAsync(client, global_endpoint_manager, function, *args, **kwarg
130130

131131
return result
132132
except exceptions.CosmosHttpResponseError as e:
133-
retry_policy = None
134133
if request and _has_database_account_header(request.headers):
135134
retry_policy = database_account_retry_policy
136135
elif e.status_code == StatusCodes.FORBIDDEN and e.sub_status in \
@@ -171,7 +170,9 @@ async def ExecuteAsync(client, global_endpoint_manager, function, *args, **kwarg
171170

172171
retry_policy.container_rid = cached_container["_rid"]
173172
request.headers[retry_policy._intended_headers] = retry_policy.container_rid
174-
elif e.status_code in [StatusCodes.REQUEST_TIMEOUT, StatusCodes.SERVICE_UNAVAILABLE]:
173+
elif e.status_code == StatusCodes.REQUEST_TIMEOUT:
174+
retry_policy = timeout_failover_retry_policy
175+
elif e.status_code >= StatusCodes.INTERNAL_SERVER_ERROR:
175176
retry_policy = timeout_failover_retry_policy
176177
else:
177178
retry_policy = defaultRetry_policy

sdk/cosmos/azure-cosmos/azure/cosmos/documents.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,12 +332,14 @@ class ConnectionPolicy: # pylint: disable=too-many-instance-attributes
332332
__defaultRequestTimeout: int = 5 # seconds
333333
__defaultDBAConnectionTimeout: int = 3 # seconds
334334
__defaultReadTimeout: int = 65 # seconds
335+
__defaultDBAReadTimeout: int = 3 # seconds
335336
__defaultMaxBackoff: int = 1 # seconds
336337

337338
def __init__(self) -> None:
338339
self.RequestTimeout: int = self.__defaultRequestTimeout
339340
self.DBAConnectionTimeout: int = self.__defaultDBAConnectionTimeout
340341
self.ReadTimeout: int = self.__defaultReadTimeout
342+
self.DBAReadTimeout: int = self.__defaultDBAReadTimeout
341343
self.MaxBackoff: int = self.__defaultMaxBackoff
342344
self.ConnectionMode: int = ConnectionMode.Gateway
343345
self.SSLConfiguration: Optional[SSLConfiguration] = None

sdk/cosmos/azure-cosmos/azure/cosmos/http_constants.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -400,7 +400,6 @@ class StatusCodes:
400400
RETRY_WITH = 449
401401

402402
INTERNAL_SERVER_ERROR = 500
403-
SERVICE_UNAVAILABLE = 503
404403

405404
# Operation pause and cancel. These are FAKE status codes for QOS logging purpose only.
406405
OPERATION_PAUSED = 1200

sdk/cosmos/azure-cosmos/test/test_crud.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1821,7 +1821,14 @@ def test_client_request_timeout(self):
18211821
container = databaseForTest.get_container_client(self.configs.TEST_SINGLE_PARTITION_CONTAINER_ID)
18221822
container.create_item(body={'id': str(uuid.uuid4()), 'name': 'sample'})
18231823

1824-
1824+
async def test_read_timeout_async(self):
1825+
connection_policy = documents.ConnectionPolicy()
1826+
# making timeout 0 ms to make sure it will throw
1827+
connection_policy.DBAReadTimeout = 0.000000000001
1828+
with self.assertRaises(ServiceResponseError):
1829+
# this will make a get database account call
1830+
with cosmos_client.CosmosClient(self.host, self.masterKey, connection_policy=connection_policy):
1831+
print('initialization')
18251832

18261833
def test_client_request_timeout_when_connection_retry_configuration_specified(self):
18271834
connection_policy = documents.ConnectionPolicy()

sdk/cosmos/azure-cosmos/test/test_crud_async.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1689,6 +1689,15 @@ async def test_client_request_timeout_async(self):
16891689
await container.create_item(body={'id': str(uuid.uuid4()), 'name': 'sample'})
16901690
print('Async initialization')
16911691

1692+
async def test_read_timeout_async(self):
1693+
connection_policy = documents.ConnectionPolicy()
1694+
# making timeout 0 ms to make sure it will throw
1695+
connection_policy.DBAReadTimeout = 0.000000000001
1696+
with self.assertRaises(ServiceResponseError):
1697+
# this will make a get database account call
1698+
async with CosmosClient(self.host, self.masterKey, connection_policy=connection_policy):
1699+
print('Async initialization')
1700+
16921701
async def test_client_request_timeout_when_connection_retry_configuration_specified_async(self):
16931702
connection_policy = documents.ConnectionPolicy()
16941703
# making timeout 0 ms to make sure it will throw

0 commit comments

Comments
 (0)