azure-sdk
diff --git a/‎sdk/cosmos/azure-cosmos/CHANGELOG.md‎
Lines changed: 2 additions & 0 deletions b/‎sdk/cosmos/azure-cosmos/CHANGELOG.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py‎
Lines changed: 2 additions & 1 deletion b/‎sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎sdk/cosmos/azure-cosmos/azure/cosmos/_cosmos_client_connection.py‎
Lines changed: 2 additions & 2 deletions b/‎sdk/cosmos/azure-cosmos/azure/cosmos/_cosmos_client_connection.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎sdk/cosmos/azure-cosmos/azure/cosmos/_global_endpoint_manager.py‎
Lines changed: 79 additions & 34 deletions b/‎sdk/cosmos/azure-cosmos/azure/cosmos/_global_endpoint_manager.py‎
Lines changed: 79 additions & 34 deletions
@@ -7,9 +7,11 @@
 #### Breaking Changes
 
 #### Bugs Fixed
+* Fixed a bug that prevented the endpoint health check from running in some scenarios. See [PR 39647](https://github.com/Azure/azure-sdk-for-python/pull/39647)
 * Fixed `partition_key` filter for `query_items_change_feed` API. See [PR 39895](https://github.com/Azure/azure-sdk-for-python/pull/39895)
 
 #### Other Changes
+* Moved endpoint health check to the background for async APIs. See [PR 39647](https://github.com/Azure/azure-sdk-for-python/pull/39647)
 
 ### 4.10.0b1 (2025-02-13)
 
 
@@ -38,7 +38,8 @@ class _Constants:
     ReadableLocations: Literal["readableLocations"] = "readableLocations"
     Name: Literal["name"] = "name"
     DatabaseAccountEndpoint: Literal["databaseAccountEndpoint"] = "databaseAccountEndpoint"
-    DefaultUnavailableLocationExpirationTime: int = 5 * 60 * 1000
+    DefaultEndpointsRefreshTime: int = 5 * 60 * 1000 # milliseconds
+    UnavailableEndpointDBATimeouts: int = 1 # seconds
 
     # ServiceDocument Resource
     EnableMultipleWritableLocations: Literal["enableMultipleWriteLocations"] = "enableMultipleWriteLocations"
 
@@ -234,8 +234,8 @@ def __init__( # pylint: disable=too-many-statements
         # Routing map provider
         self._routing_map_provider = routing_map_provider.SmartRoutingMapProvider(self)
 
-        database_account = self._global_endpoint_manager._GetDatabaseAccount(**kwargs)
-        self._global_endpoint_manager.force_refresh(database_account)
+        database_account, _ = self._global_endpoint_manager._GetDatabaseAccount(**kwargs)
+        self._global_endpoint_manager.force_refresh_on_startup(database_account)
 
         # Use database_account if no consistency passed in to verify consistency level to be used
         self.session: Optional[_session.Session] = None
 
@@ -24,18 +24,20 @@
 """
 
 import threading
+from typing import Tuple
 
 from azure.core.exceptions import AzureError
 
 from . import _constants as constants
 from . import exceptions
+from .documents import DatabaseAccount
 from ._location_cache import LocationCache
 
 
 # pylint: disable=protected-access
 
 
-class _GlobalEndpointManager(object):
+class _GlobalEndpointManager(object): # pylint: disable=too-many-instance-attributes
     """
     This internal class implements the logic for endpoint management for
     geo-replicated database accounts.
@@ -51,25 +53,24 @@ def __init__(self, client):
             self.PreferredLocations,
             self.DefaultEndpoint,
             self.EnableEndpointDiscovery,
-            client.connection_policy.UseMultipleWriteLocations,
-            self.refresh_time_interval_in_ms,
+            client.connection_policy.UseMultipleWriteLocations
         )
         self.refresh_needed = False
         self.refresh_lock = threading.RLock()
         self.last_refresh_time = 0
         self._database_account_cache = None
 
+    def get_use_multiple_write_locations(self):
+        return self.location_cache.can_use_multiple_write_locations()
+
     def get_refresh_time_interval_in_ms_stub(self):
-        return constants._Constants.DefaultUnavailableLocationExpirationTime
+        return constants._Constants.DefaultEndpointsRefreshTime
 
     def get_write_endpoint(self):
-        return self.location_cache.get_write_regional_endpoint()
+        return self.location_cache.get_write_regional_routing_context()
 
     def get_read_endpoint(self):
-        return self.location_cache.get_read_regional_endpoint()
-
-    def swap_regional_endpoint_values(self, request):
-        return self.location_cache.swap_regional_endpoint_values(request)
+        return self.location_cache.get_read_regional_routing_context()
 
     def resolve_service_endpoint(self, request):
         return self.location_cache.resolve_service_endpoint(request)
@@ -89,7 +90,7 @@ def get_ordered_read_locations(self):
     def can_use_multiple_write_locations(self, request):
         return self.location_cache.can_use_multiple_write_locations_for_request(request)
 
-    def force_refresh(self, database_account):
+    def force_refresh_on_startup(self, database_account):
         self.refresh_needed = True
         self.refresh_endpoint_list(database_account)
 
@@ -118,61 +119,88 @@ def _refresh_endpoint_list_private(self, database_account=None, **kwargs):
             if self.location_cache.should_refresh_endpoints() or self.refresh_needed:
                 self.refresh_needed = False
                 self.last_refresh_time = self.location_cache.current_time_millis()
-                database_account = self._GetDatabaseAccount(**kwargs)
-                self.location_cache.perform_on_database_account_read(database_account)
                 # this will perform getDatabaseAccount calls to check endpoint health
                 self._endpoints_health_check(**kwargs)
 
-    def _GetDatabaseAccount(self, **kwargs):
+    def _GetDatabaseAccount(self, **kwargs) -> Tuple[DatabaseAccount, str]:
         """Gets the database account.
 
         First tries by using the default endpoint, and if that doesn't work,
         use the endpoints for the preferred locations in the order they are
         specified, to get the database account.
-        :returns: A `DatabaseAccount` instance representing the Cosmos DB Database Account.
-        :rtype: ~azure.cosmos.DatabaseAccount
+        :returns: A `DatabaseAccount` instance representing the Cosmos DB Database Account,
+            and the endpoint that was used for the request.
+        :rtype: tuple[~azure.cosmos.DatabaseAccount, str]
         """
         try:
             database_account = self._GetDatabaseAccountStub(self.DefaultEndpoint, **kwargs)
             self._database_account_cache = database_account
-            return database_account
+            self.location_cache.mark_endpoint_available(self.DefaultEndpoint)
+            return database_account, self.DefaultEndpoint
         # If for any reason (non-globaldb related) we are not able to get the database
         # account from the above call to GetDatabaseAccount, we try to get this
         # information from each of the preferred locations that the user might have
         # specified (by creating a locational endpoint), swallowing the exception for
         # each location that fails until we get the database account. If we are not able
         # to get that info from any endpoint, the error is re-raised at the end.
         except (exceptions.CosmosHttpResponseError, AzureError):
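+            # when atm is available, the two mark_endpoint_unavailable_* calls below should be removed,
+            # as the global endpoint shouldn't be used for dataplane operations anymore
+            # NOTE(review): this replaces a stale "L: 145, 146" line reference - confirm these are the calls meant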
+            self.mark_endpoint_unavailable_for_read(self.DefaultEndpoint, False)
+            self.mark_endpoint_unavailable_for_write(self.DefaultEndpoint, False)
             for location_name in self.PreferredLocations:
                 locational_endpoint = LocationCache.GetLocationalEndpoint(self.DefaultEndpoint, location_name)
                 try:
                     database_account = self._GetDatabaseAccountStub(locational_endpoint, **kwargs)
                     self._database_account_cache = database_account
-                    return database_account
+                    self.location_cache.mark_endpoint_available(locational_endpoint)
+                    return database_account, locational_endpoint
                 except (exceptions.CosmosHttpResponseError, AzureError):
-                    pass
+                    self.mark_endpoint_unavailable_for_read(locational_endpoint, False)
+                    self.mark_endpoint_unavailable_for_write(locational_endpoint, False)
             raise
 
     def _endpoints_health_check(self, **kwargs):
         """Gets the database account for each endpoint.
 
         Marks a checked endpoint as available when its check succeeds; otherwise marks it as
         unavailable for both reads and writes.
         """
-        all_endpoints = [self.location_cache.read_regional_endpoints[0]]
-        all_endpoints.extend(self.location_cache.write_regional_endpoints)
-        count = 0
-        for endpoint in all_endpoints:
-            count += 1
-            if count > 3:
-                break
-            try:
-                self.Client._GetDatabaseAccountCheck(endpoint.get_current(), **kwargs)
-            except (exceptions.CosmosHttpResponseError, AzureError):
-                if endpoint in self.location_cache.read_regional_endpoints:
-                    self.mark_endpoint_unavailable_for_read(endpoint.get_current(), False)
-                if endpoint in self.location_cache.write_regional_endpoints:
-                    self.mark_endpoint_unavailable_for_write(endpoint.get_current(), False)
-                    endpoint.swap()
+        endpoints_attempted = set()
+        database_account, attempted_endpoint = self._GetDatabaseAccount(**kwargs)
+        endpoints_attempted.add(attempted_endpoint)
+        self.location_cache.perform_on_database_account_read(database_account)
+        # get all the regional routing contexts to check
+        endpoints = self.location_cache.endpoints_to_health_check()
+        success_count = 0
+        for endpoint in endpoints:
+            if endpoint not in endpoints_attempted:
+                if success_count >= 4:
+                    break
+                endpoints_attempted.add(endpoint)
+                # save current dba timeouts
+                previous_dba_read_timeout = self.Client.connection_policy.DBAReadTimeout
+                previous_dba_connection_timeout = self.Client.connection_policy.DBAConnectionTimeout
+                try:
+                    if (endpoint in
+                            self.location_cache.location_unavailability_info_by_endpoint):
+                        # if the endpoint is unavailable, we need to lower the timeouts to be more aggressive in the
+                        # health check. This helps reduce the time the health check is blocking all requests.
+                        self.Client.connection_policy.override_dba_timeouts(constants._Constants
+                                                                            .UnavailableEndpointDBATimeouts,
+                                                                            constants._Constants
+                                                                            .UnavailableEndpointDBATimeouts)
+                        self.Client._GetDatabaseAccountCheck(endpoint, **kwargs)
+                    else:
+                        self.Client._GetDatabaseAccountCheck(endpoint, **kwargs)
+                    success_count += 1
+                    self.location_cache.mark_endpoint_available(endpoint)
+                except (exceptions.CosmosHttpResponseError, AzureError):
+                    self.mark_endpoint_unavailable_for_read(endpoint, False)
+                    self.mark_endpoint_unavailable_for_write(endpoint, False)
+                finally:
+                    # after the health check for that endpoint, set the timeouts back to their original values
+                    self.Client.connection_policy.override_dba_timeouts(previous_dba_read_timeout,
+                                                                        previous_dba_connection_timeout)
         self.location_cache.update_location_cache()
 
     def _GetDatabaseAccountStub(self, endpoint, **kwargs):
@@ -183,4 +211,21 @@ def _GetDatabaseAccountStub(self, endpoint, **kwargs):
         :returns: A `DatabaseAccount` instance representing the Cosmos DB Database Account.
         :rtype: ~azure.cosmos.DatabaseAccount
         """
-        return self.Client.GetDatabaseAccount(endpoint, **kwargs)
+        if endpoint in self.location_cache.location_unavailability_info_by_endpoint:
+            previous_dba_read_timeout = self.Client.connection_policy.DBAReadTimeout
+            previous_dba_connection_timeout = self.Client.connection_policy.DBAConnectionTimeout
+            try:
+                # if the endpoint is unavailable, we need to lower the timeouts to be more aggressive in the
+                # health check. This helps reduce the time the health check is blocking all requests.
+                self.Client.connection_policy.override_dba_timeouts(constants._Constants
+                                                                    .UnavailableEndpointDBATimeouts,
+                                                                    constants._Constants
+                                                                    .UnavailableEndpointDBATimeouts)
+                database_account = self.Client.GetDatabaseAccount(endpoint, **kwargs)
+            finally:
+                # after the request to that endpoint, set the timeouts back to their original values
+                self.Client.connection_policy.override_dba_timeouts(previous_dba_read_timeout,
+                                                                    previous_dba_connection_timeout)
+        else:
+            database_account = self.Client.GetDatabaseAccount(endpoint, **kwargs)
+        return database_account