Azure
diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_endpoint_discovery_retry_policy.py
Lines changed: 2 additions & 0 deletions
diff --git a/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py b/sdk/cosmos/azure-cosmos/azure/cosmos/_partition_health_tracker.py
Lines changed: 10 additions & 11 deletions
diff --git a/sdk/cosmos/azure-cosmos/tests/test_circuit_breaker_emulator.py b/sdk/cosmos/azure-cosmos/tests/test_circuit_breaker_emulator.py
Lines changed: 3 additions & 3 deletions
diff --git a/sdk/cosmos/azure-cosmos/tests/test_circuit_breaker_emulator_async.py b/sdk/cosmos/azure-cosmos/tests/test_circuit_breaker_emulator_async.py
Lines changed: 3 additions & 3 deletions
diff --git a/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py b/sdk/cosmos/azure-cosmos/tests/test_per_partition_circuit_breaker_mm.py
Lines changed: 65 additions & 16 deletions
@@ -91,6 +91,8 @@ def ShouldRetry(self, exception):  # pylint: disable=unused-argument
         # set location-based routing directive based on retry count
         # simulating single master writes by ensuring usePreferredLocations
         # is set to false
+        # rationale: 403.3 is only expected for a write region failover in a single-writer account,
+        # so we must rely on the account locations, as they are the source of truth
         self.request.route_to_location_with_preferred_location_flag(self.failover_retry_count, False)
 
         return True
@@ -32,9 +32,9 @@
 from ._constants import _Constants as Constants
 
 MINIMUM_REQUESTS_FOR_FAILURE_RATE = 100
-MAX_UNAVAILABLE_TIME = 1200 * 1000 # milliseconds
-REFRESH_INTERVAL = 60 * 1000 # milliseconds
-INITIAL_UNAVAILABLE_TIME = 60 * 1000 # milliseconds
+MAX_UNAVAILABLE_TIME_MS = 1200 * 1000 # 20 minutes in milliseconds
+REFRESH_INTERVAL_MS = 60 * 1000 # 1 minute in milliseconds
+INITIAL_UNAVAILABLE_TIME_MS = 60 * 1000 # 1 minute in milliseconds
 # partition is unhealthy if sdk tried to recover and failed
 UNHEALTHY = "unhealthy"
 # partition is unhealthy tentative when it is initially marked unavailable
@@ -58,27 +58,25 @@ def __init__(self) -> None:
         self.write_consecutive_failure_count: int = 0
         self.unavailability_info: Dict[str, Any] = {}
 
-    def reset_health_stats(self) -> None:
+    def reset_failure_rate_health_stats(self) -> None:
         self.write_failure_count = 0
         self.read_failure_count = 0
         self.write_success_count = 0
         self.read_success_count = 0
-        self.read_consecutive_failure_count = 0
-        self.write_consecutive_failure_count = 0
 
     def transition_health_status(self, target_health_status: str, curr_time: int) -> None:
         if target_health_status == UNHEALTHY :
             self.unavailability_info[HEALTH_STATUS] = UNHEALTHY
             # reset the last unavailability check time stamp
             self.unavailability_info[UNAVAILABLE_INTERVAL] = \
                 min(self.unavailability_info[UNAVAILABLE_INTERVAL] * 2,
-                    MAX_UNAVAILABLE_TIME)
+                    MAX_UNAVAILABLE_TIME_MS)
             self.unavailability_info[LAST_UNAVAILABILITY_CHECK_TIME_STAMP] \
                 = curr_time
         elif target_health_status == UNHEALTHY_TENTATIVE :
             self.unavailability_info = {
                 LAST_UNAVAILABILITY_CHECK_TIME_STAMP: curr_time,
-                UNAVAILABLE_INTERVAL: INITIAL_UNAVAILABLE_TIME,
+                UNAVAILABLE_INTERVAL: INITIAL_UNAVAILABLE_TIME_MS,
                 HEALTH_STATUS: UNHEALTHY_TENTATIVE
             }
 
@@ -108,7 +106,7 @@ def _should_mark_healthy_tentative(partition_health_info: _PartitionHealthInfo,
     stale_partition_unavailability_check = partition_health_info.unavailability_info[UNAVAILABLE_INTERVAL]
     # check if the partition key range is still unavailable
     return ((current_health_status == UNHEALTHY and elapsed_time > stale_partition_unavailability_check)
-            or (current_health_status == UNHEALTHY_TENTATIVE and  elapsed_time > INITIAL_UNAVAILABLE_TIME))
+            or (current_health_status == UNHEALTHY_TENTATIVE and elapsed_time > INITIAL_UNAVAILABLE_TIME_MS))
 
 logger = logging.getLogger("azure.cosmos._PartitionHealthTracker")
 
@@ -178,9 +176,10 @@ def check_stale_partition_info(
                                 partition_health_info.transition_health_status(UNHEALTHY, current_time)
                                 request.healthy_tentative_location = location
 
-        if current_time - self.last_refresh > REFRESH_INTERVAL:
+        if current_time - self.last_refresh > REFRESH_INTERVAL_MS:
             # reset the failure-rate stats of all partitions once per refresh interval
             self._reset_partition_health_tracker_stats()
+            self.last_refresh = current_time
 
 
     def get_unhealthy_locations(
@@ -290,4 +289,4 @@ def add_success(self, pk_range_wrapper: PartitionKeyRangeWrapper, operation_type
     def _reset_partition_health_tracker_stats(self) -> None:
         for locations in self.pk_range_wrapper_to_health_info.values():
             for health_info in locations.values():
-                health_info.reset_health_stats()
+                health_info.reset_failure_rate_health_stats()
@@ -160,8 +160,8 @@ def test_write_consecutive_failure_threshold_delete_all_items_by_pk_mm(self, set
 
         validate_unhealthy_partitions_mm(global_endpoint_manager, 1)
         # remove faults and reduce initial recover time and perform a write
-        original_unavailable_time = _partition_health_tracker.INITIAL_UNAVAILABLE_TIME
-        _partition_health_tracker.INITIAL_UNAVAILABLE_TIME = 1
+        original_unavailable_time = _partition_health_tracker.INITIAL_UNAVAILABLE_TIME_MS
+        _partition_health_tracker.INITIAL_UNAVAILABLE_TIME_MS = 1
         custom_transport.faults = []
         try:
             perform_write_operation(DELETE_ALL_ITEMS_BY_PARTITION_KEY,
@@ -171,7 +171,7 @@ def test_write_consecutive_failure_threshold_delete_all_items_by_pk_mm(self, set
                                           PK_VALUE,
                                           uri_down)
         finally:
-            _partition_health_tracker.INITIAL_UNAVAILABLE_TIME = original_unavailable_time
+            _partition_health_tracker.INITIAL_UNAVAILABLE_TIME_MS = original_unavailable_time
         validate_unhealthy_partitions_mm(global_endpoint_manager, 0)
 
 
 
@@ -163,8 +163,8 @@ async def test_write_consecutive_failure_threshold_delete_all_items_by_pk_mm_asy
 
         validate_unhealthy_partitions_mm(global_endpoint_manager, 1)
         # remove faults and reduce initial recover time and perform a write
-        original_unavailable_time = _partition_health_tracker.INITIAL_UNAVAILABLE_TIME
-        _partition_health_tracker.INITIAL_UNAVAILABLE_TIME = 1
+        original_unavailable_time = _partition_health_tracker.INITIAL_UNAVAILABLE_TIME_MS
+        _partition_health_tracker.INITIAL_UNAVAILABLE_TIME_MS = 1
         custom_transport.faults = []
         try:
             await perform_write_operation(DELETE_ALL_ITEMS_BY_PARTITION_KEY,
@@ -174,7 +174,7 @@ async def test_write_consecutive_failure_threshold_delete_all_items_by_pk_mm_asy
                                     PK_VALUE,
                                     uri_down)
         finally:
-            _partition_health_tracker.INITIAL_UNAVAILABLE_TIME = original_unavailable_time
+            _partition_health_tracker.INITIAL_UNAVAILABLE_TIME_MS = original_unavailable_time
         validate_unhealthy_partitions_mm(global_endpoint_manager, 0)
         await cleanup_method([custom_setup, setup])
 
 
@@ -13,10 +13,11 @@
 from azure.cosmos import CosmosClient
 from azure.cosmos.exceptions import CosmosHttpResponseError
 from _fault_injection_transport import FaultInjectionTransport
-from test_per_partition_circuit_breaker_mm_async import DELETE, CREATE, UPSERT, REPLACE, PATCH, BATCH, validate_response_uri, READ, \
+from test_per_partition_circuit_breaker_mm_async import DELETE, CREATE, UPSERT, REPLACE, PATCH, BATCH, \
+    validate_response_uri, READ, \
     QUERY_PK, QUERY, CHANGE_FEED, CHANGE_FEED_PK, CHANGE_FEED_EPK, READ_ALL_ITEMS, REGION_1, REGION_2, \
     write_operations_and_errors, validate_unhealthy_partitions, read_operations_and_errors, PK_VALUE, operations, \
-    create_doc
+    create_doc, validate_stats
 from test_per_partition_circuit_breaker_mm_async import DELETE_ALL_ITEMS_BY_PARTITION_KEY
 
 def perform_write_operation(operation, container, fault_injection_container, doc_id, pk, expected_uri):
@@ -99,15 +100,18 @@ class TestPerPartitionCircuitBreakerMM:
     host = test_config.TestConfig.host
     master_key = test_config.TestConfig.masterKey
     TEST_DATABASE_ID = test_config.TestConfig.TEST_DATABASE_ID
-    TEST_CONTAINER_SINGLE_PARTITION_ID = test_config.TestConfig.TEST_MULTI_PARTITION_CONTAINER_ID
+    TEST_CONTAINER_MULTI_PARTITION_ID = test_config.TestConfig.TEST_MULTI_PARTITION_CONTAINER_ID
 
     def setup_method_with_custom_transport(self, custom_transport, default_endpoint=host, **kwargs):
+        container_id = kwargs.pop("container_id", None)
+        if not container_id:
+            container_id = self.TEST_CONTAINER_MULTI_PARTITION_ID
         client = CosmosClient(default_endpoint, self.master_key,
                               preferred_locations=[REGION_1, REGION_2],
                               multiple_write_locations=True,
                               transport=custom_transport, **kwargs)
         db = client.get_database_client(self.TEST_DATABASE_ID)
-        container = db.get_container_client(self.TEST_CONTAINER_SINGLE_PARTITION_ID)
+        container = db.get_container_client(container_id)
         return {"client": client, "db": db, "col": container}
 
     @pytest.mark.parametrize("write_operation, error", write_operations_and_errors())
@@ -151,8 +155,8 @@ def test_write_consecutive_failure_threshold(self, write_operation, error):
 
         validate_unhealthy_partitions(global_endpoint_manager, 1)
         # remove faults and reduce initial recover time and perform a write
-        original_unavailable_time = _partition_health_tracker.INITIAL_UNAVAILABLE_TIME
-        _partition_health_tracker.INITIAL_UNAVAILABLE_TIME = 1
+        original_unavailable_time = _partition_health_tracker.INITIAL_UNAVAILABLE_TIME_MS
+        _partition_health_tracker.INITIAL_UNAVAILABLE_TIME_MS = 1
         custom_transport.faults = []
         try:
             perform_write_operation(write_operation,
@@ -162,7 +166,7 @@ def test_write_consecutive_failure_threshold(self, write_operation, error):
                                           PK_VALUE,
                                           uri_down)
         finally:
-            _partition_health_tracker.INITIAL_UNAVAILABLE_TIME = original_unavailable_time
+            _partition_health_tracker.INITIAL_UNAVAILABLE_TIME_MS = original_unavailable_time
         validate_unhealthy_partitions(global_endpoint_manager, 0)
 
     @pytest.mark.cosmosCircuitBreakerMultiRegion
@@ -203,8 +207,8 @@ def test_read_consecutive_failure_threshold(self, read_operation, error):
             expected_unhealthy_partitions = 1
         validate_unhealthy_partitions(global_endpoint_manager, expected_unhealthy_partitions)
         # remove faults and reduce initial recover time and perform a read
-        original_unavailable_time = _partition_health_tracker.INITIAL_UNAVAILABLE_TIME
-        _partition_health_tracker.INITIAL_UNAVAILABLE_TIME = 1
+        original_unavailable_time = _partition_health_tracker.INITIAL_UNAVAILABLE_TIME_MS
+        _partition_health_tracker.INITIAL_UNAVAILABLE_TIME_MS = 1
         custom_transport.faults = []
         try:
             perform_read_operation(read_operation,
@@ -213,7 +217,7 @@ def test_read_consecutive_failure_threshold(self, read_operation, error):
                                          doc['pk'],
                                          uri_down)
         finally:
-            _partition_health_tracker.INITIAL_UNAVAILABLE_TIME = original_unavailable_time
+            _partition_health_tracker.INITIAL_UNAVAILABLE_TIME_MS = original_unavailable_time
         validate_unhealthy_partitions(global_endpoint_manager, 0)
 
     @pytest.mark.parametrize("write_operation, error", write_operations_and_errors())
@@ -297,7 +301,7 @@ def test_read_failure_rate_threshold(self, read_operation, error):
             # restore minimum requests
             _partition_health_tracker.MINIMUM_REQUESTS_FOR_FAILURE_RATE = 100
 
-    def setup_info(self, error):
+    def setup_info(self, error, **kwargs):
         expected_uri = _location_cache.LocationCache.GetLocationalEndpoint(self.host, REGION_2)
         uri_down = _location_cache.LocationCache.GetLocationalEndpoint(self.host, REGION_1)
         custom_transport = FaultInjectionTransport()
@@ -307,12 +311,57 @@ def setup_info(self, error):
                                FaultInjectionTransport.predicate_targets_region(r, uri_down))
         custom_transport.add_fault(predicate,
                                    error)
-        custom_setup = self.setup_method_with_custom_transport(custom_transport, default_endpoint=self.host)
+        custom_setup = self.setup_method_with_custom_transport(custom_transport, default_endpoint=self.host, **kwargs)
         fault_injection_container = custom_setup['col']
-        setup = self.setup_method_with_custom_transport(None, default_endpoint=self.host)
+        setup = self.setup_method_with_custom_transport(None, default_endpoint=self.host, **kwargs)
         container = setup['col']
         return container, doc, expected_uri, uri_down, fault_injection_container, custom_transport, predicate
 
+    def test_stat_reset(self):
+        error_lambda = lambda r: FaultInjectionTransport.error_after_delay(
+            0,
+            CosmosHttpResponseError(
+                status_code=503,
+                message="Some injected error.")
+        )
+        container, doc, expected_uri, uri_down, fault_injection_container, custom_transport, predicate = \
+            self.setup_info(error_lambda, container_id=test_config.TestConfig.TEST_SINGLE_PARTITION_CONTAINER_ID)
+        container.upsert_item(body=doc)
+        sleep(1)
+        global_endpoint_manager = fault_injection_container.client_connection._global_endpoint_manager
+        # lower refresh interval for testing
+        _partition_health_tracker.REFRESH_INTERVAL_MS = 10 * 1000
+        try:
+            for i in range(2):
+                validate_unhealthy_partitions(global_endpoint_manager, 0)
+                # read will fail and retry in other region
+                perform_read_operation(READ,
+                                       fault_injection_container,
+                                       doc['id'],
+                                       PK_VALUE,
+                                       expected_uri)
+                try:
+                    perform_write_operation(CREATE,
+                                            container,
+                                            fault_injection_container,
+                                            str(uuid.uuid4()),
+                                            PK_VALUE,
+                                            expected_uri)
+                except CosmosHttpResponseError as e:
+                    assert e.status_code == 503
+            validate_unhealthy_partitions(global_endpoint_manager, 0)
+            validate_stats(global_endpoint_manager, 2,  2, 2, 2, 0, 0)
+            sleep(25)
+            perform_read_operation(READ,
+                                   fault_injection_container,
+                                   doc['id'],
+                                   PK_VALUE,
+                                   expected_uri)
+
+            validate_stats(global_endpoint_manager, 2, 3, 1, 0, 0, 0)
+        finally:
+            _partition_health_tracker.REFRESH_INTERVAL_MS = 60 * 1000
+
     @pytest.mark.parametrize("read_operation, write_operation", operations())
     def test_service_request_error(self, read_operation, write_operation):
         # the region should be tried 4 times before failing over and marking the partition as unavailable
@@ -333,8 +382,8 @@ def test_service_request_error(self, read_operation, write_operation):
 
         # recover partition
         # remove faults and reduce initial recover time and perform a write
-        original_unavailable_time = _partition_health_tracker.INITIAL_UNAVAILABLE_TIME
-        _partition_health_tracker.INITIAL_UNAVAILABLE_TIME = 1
+        original_unavailable_time = _partition_health_tracker.INITIAL_UNAVAILABLE_TIME_MS
+        _partition_health_tracker.INITIAL_UNAVAILABLE_TIME_MS = 1
         custom_transport.faults = []
         try:
             perform_read_operation(read_operation,
@@ -343,7 +392,7 @@ def test_service_request_error(self, read_operation, write_operation):
                                           PK_VALUE,
                                           expected_uri)
         finally:
-            _partition_health_tracker.INITIAL_UNAVAILABLE_TIME = original_unavailable_time
+            _partition_health_tracker.INITIAL_UNAVAILABLE_TIME_MS = original_unavailable_time
         validate_unhealthy_partitions(global_endpoint_manager, 0)
 
         custom_transport.add_fault(predicate,