Skip to content

Commit b390fac

Browse files
authored
feat: add rate limiter and force refresh function (#146)
* add basic rate limiter using event queue
* add unit tests
* use rate limiter in perform_refresh
* add force refresh method
* use faster rate limiter for perform_refresh tests
* remove initial delay in rate limiter
* call force refresh when connect attempt fails
* use token bucket algorithm
* address review comments
* use asyncio event to indicate when refresh is in progress
* use semaphore instead of queue in rate limiter
* address review comments
* run black
* update type annotations
* address review comments
* use asyncio time instead of time.time()
* update rate limiter implementation
* update docstrings
* add docstring for force_refresh
1 parent 2c35c06 commit b390fac

File tree

5 files changed

+244
-15
lines changed

5 files changed

+244
-15
lines changed

google/cloud/sql/connector/connector.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
"""
1616
import asyncio
1717
import concurrent
18+
import logging
1819
from google.cloud.sql.connector.instance_connection_manager import (
1920
InstanceConnectionManager,
2021
IPTypes,
@@ -31,6 +32,8 @@
3132

3233
_instances: Dict[str, InstanceConnectionManager] = {}
3334

35+
logger = logging.getLogger(name=__name__)
36+
3437

3538
def _get_loop() -> asyncio.AbstractEventLoop:
3639
global _loop
@@ -112,5 +115,9 @@ def connect(
112115
timeout = kwargs["connect_timeout"]
113116
else:
114117
timeout = 30 # 30s
115-
116-
return icm.connect(driver, ip_types, timeout, **kwargs)
118+
try:
119+
return icm.connect(driver, ip_types, timeout, **kwargs)
120+
except Exception as e:
121+
# with any other exception, we attempt a force refresh, then throw the error
122+
icm.force_refresh()
123+
raise (e)

google/cloud/sql/connector/instance_connection_manager.py

Lines changed: 46 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
"""
1616

1717
# Custom utils import
18+
from google.cloud.sql.connector.rate_limiter import AsyncRateLimiter
1819
from google.cloud.sql.connector.refresh_utils import _get_ephemeral, _get_metadata
1920
from google.cloud.sql.connector.utils import write_to_file
2021
from google.cloud.sql.connector.version import __version__ as version
@@ -218,8 +219,9 @@ def _client_session(self) -> aiohttp.ClientSession:
218219
_project: str
219220
_region: str
220221

221-
_current: asyncio.Task
222-
_next: asyncio.Task
222+
_refresh_in_progress: asyncio.locks.Event
223+
_current: asyncio.Task # task wraps coroutine that returns InstanceMetadata
224+
_next: asyncio.Task # task wraps coroutine that returns another task
223225

224226
def __init__(
225227
self,
@@ -250,8 +252,13 @@ def __init__(
250252
self._keys = asyncio.wrap_future(keys, loop=self._loop)
251253
self._auth_init()
252254

255+
self._refresh_rate_limiter = AsyncRateLimiter(
256+
max_capacity=2, rate=1 / 30, loop=self._loop
257+
)
258+
253259
async def _set_instance_data() -> None:
254260
logger.debug("Updating instance data")
261+
self._refresh_in_progress = asyncio.locks.Event(loop=self._loop)
255262
self._current = self._loop.create_task(self._get_instance_data())
256263
self._next = self._loop.create_task(self._schedule_refresh())
257264

@@ -350,6 +357,35 @@ def _auth_init(self) -> None:
350357

351358
self._credentials = credentials
352359

360+
async def _force_refresh(self) -> bool:
    """
    Coroutine that attempts an immediate refresh of the instance metadata.

    :rtype: bool
    :returns: True if a refresh completed (either the already-running one
        or a newly scheduled immediate one); False if the forced refresh
        attempt raised an exception (the error is logged, not re-raised).
    """
    if self._refresh_in_progress.is_set():
        # if a new refresh is already in progress, then block on the result
        self._current = await self._next
        return True
    try:
        # cancel the pending (delayed) refresh task so it doesn't fire later
        self._next.cancel()
        # schedule a refresh immediately with no delay
        self._next = self._loop.create_task(self._schedule_refresh(0))
        self._current = await self._next
        return True
    except Exception as e:
        # if anything else goes wrong, log the error and return false
        logger.exception("Error occurred during force refresh attempt", exc_info=e)
        return False
375+
376+
def force_refresh(self, timeout: Optional[int] = None) -> bool:
    """
    Forces a new refresh attempt and returns a boolean value that indicates
    whether the attempt was successful.

    :type timeout: Optional[int]
    :param timeout: Amount of time to wait for the attempted force refresh
        to complete before throwing a timeout error.
    """
    # Submit the refresh coroutine onto the manager's event loop from the
    # (possibly synchronous) calling thread, then block until it finishes
    # or the timeout elapses.
    refresh_future = asyncio.run_coroutine_threadsafe(
        self._force_refresh(), self._loop
    )
    return refresh_future.result(timeout=timeout)
388+
353389
async def seconds_until_refresh(self) -> int:
354390
expiration = (await self._current).expiration
355391

@@ -378,7 +414,8 @@ async def _perform_refresh(self) -> asyncio.Task:
378414
:rtype: concurrent.future.Futures
379415
:returns: A future representing the creation of an SSLcontext.
380416
"""
381-
417+
self._refresh_in_progress.set()
418+
await self._refresh_rate_limiter.acquire()
382419
logger.debug("Entered _perform_refresh")
383420

384421
refresh_task = self._loop.create_task(self._get_instance_data())
@@ -387,7 +424,8 @@ async def _perform_refresh(self) -> asyncio.Task:
387424
await refresh_task
388425
except Exception as e:
389426
logger.exception(
390-
"An error occurred while performing refresh. Retrying in 60s.",
427+
"An error occurred while performing refresh."
428+
"Scheduling another refresh attempt immediately",
391429
exc_info=e,
392430
)
393431
instance_data = None
@@ -401,14 +439,14 @@ async def _perform_refresh(self) -> asyncio.Task:
401439
or instance_data.expiration < datetime.datetime.now()
402440
):
403441
self._current = refresh_task
404-
# TODO: Implement force refresh method and a rate-limiter for perform_refresh
405-
# Retry by scheduling a refresh 60s from now.
406-
self._next = self._loop.create_task(self._schedule_refresh(60))
442+
self._next = self._loop.create_task(self._perform_refresh())
407443

408444
else:
409445
self._current = refresh_task
410446
# Ephemeral certificate expires in 1 hour, so we schedule a refresh to happen in 55 minutes.
411447
self._next = self._loop.create_task(self._schedule_refresh())
448+
finally:
449+
self._refresh_in_progress.clear()
412450

413451
return refresh_task
414452

@@ -419,17 +457,15 @@ async def _schedule_refresh(self, delay: Optional[int] = None) -> asyncio.Task:
419457
:rtype: asyncio.Task
420458
:returns: A Task representing _get_instance_data.
421459
"""
422-
logger.debug("Entering sleep")
423460

424461
if delay is None:
425462
delay = await self.seconds_until_refresh()
426-
427463
try:
464+
logger.debug("Entering sleep")
428465
await asyncio.sleep(delay)
429466
except asyncio.CancelledError as e:
430467
logger.debug("Schedule refresh task cancelled.")
431468
raise e
432-
433469
return await self._perform_refresh()
434470

435471
def connect(
google/cloud/sql/connector/rate_limiter.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
"""
2+
Copyright 2021 Google LLC
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
https://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
"""
16+
import asyncio
17+
18+
19+
class AsyncRateLimiter(object):
    """
    An asyncio-compatible rate limiter which uses the Token Bucket algorithm
    (https://en.wikipedia.org/wiki/Token_bucket) to limit the number of
    function calls over a time interval.

    :type max_capacity: int
    :param max_capacity:
        The maximum capacity of tokens the bucket will store at any one time.
        Default: 1

    :type rate: float
    :param rate:
        The number of tokens that should be added per second.

    :type loop: asyncio.AbstractEventLoop
    :param loop:
        The event loop used for time measurements. If not provided, the
        default event loop will be used.
    """

    def __init__(
        self,
        max_capacity: int = 1,
        rate: float = 1 / 60,
        loop: asyncio.AbstractEventLoop = None,
    ) -> None:
        self.rate = rate
        self.max_capacity = max_capacity
        self._loop = loop or asyncio.get_event_loop()
        # NOTE: the explicit ``loop=`` argument to asyncio.Lock() and
        # asyncio.sleep() was deprecated in Python 3.8 and removed in 3.10;
        # the lock (and the sleep in _wait_for_next_token) bind to the
        # running loop instead.
        self._lock = asyncio.Lock()
        # Start with a full bucket so the first max_capacity acquisitions
        # go through without any delay.
        self._tokens: float = max_capacity
        self._last_token_update = self._loop.time()

    def _update_token_count(self) -> None:
        """
        Lazily refill the bucket: add ``rate`` tokens for every second
        elapsed since the last update, capped at ``max_capacity``.

        Because refilling is lazy, a long idle gap can allow a burst of up
        to ``max_capacity`` immediate acquisitions afterwards.
        """
        now = self._loop.time()
        time_elapsed = now - self._last_token_update
        new_tokens = time_elapsed * self.rate
        self._tokens = min(new_tokens + self._tokens, self.max_capacity)
        self._last_token_update = now

    async def _wait_for_next_token(self) -> None:
        """
        Sleep just long enough for the bucket to refill to one full token.
        """
        token_deficit = 1 - self._tokens
        if token_deficit > 0:
            wait_time = token_deficit / self.rate
            await asyncio.sleep(wait_time)

    async def acquire(self) -> None:
        """
        Waits for a token to become available, if necessary, then subtracts
        a token and allows the request to go through.

        The lock serializes concurrent callers, so waiters proceed one at a
        time in the order they acquired the lock.
        """
        async with self._lock:
            self._update_token_count()
            if self._tokens < 1:
                await self._wait_for_next_token()
                self._update_token_count()
            self._tokens -= 1

tests/unit/test_instance_connection_manager.py

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
import asyncio
1818
import datetime
19+
from google.cloud.sql.connector.rate_limiter import AsyncRateLimiter
1920
from typing import Any
2021
import pytest # noqa F401 Needed to run the tests
2122
from google.cloud.sql.connector.instance_connection_manager import (
@@ -34,6 +35,11 @@ def icm(
3435
return icm
3536

3637

38+
@pytest.fixture
def test_rate_limiter(async_loop: asyncio.AbstractEventLoop) -> AsyncRateLimiter:
    # Rate limiter allowing one call every 2 seconds — much faster than the
    # production limiter so refresh tests don't have to wait 30s per token.
    return AsyncRateLimiter(max_capacity=1, rate=1 / 2, loop=async_loop)
41+
42+
3743
class MockMetadata:
3844
def __init__(self, expiration: datetime.datetime) -> None:
3945
self.expiration = expiration
@@ -68,10 +74,14 @@ def test_InstanceConnectionManager_init(async_loop: asyncio.AbstractEventLoop) -
6874

6975

7076
@pytest.mark.asyncio
71-
async def test_perform_refresh_replaces_result(icm: InstanceConnectionManager) -> None:
77+
async def test_perform_refresh_replaces_result(
78+
icm: InstanceConnectionManager, test_rate_limiter: AsyncRateLimiter
79+
) -> None:
7280
"""
7381
Test to check whether _perform_refresh replaces a valid result with another valid result
7482
"""
83+
# allow more frequent refreshes for tests
84+
setattr(icm, "_refresh_rate_limiter", test_rate_limiter)
7585

7686
# stub _get_instance_data to return a "valid" MockMetadata object
7787
setattr(icm, "_get_instance_data", _get_metadata_success)
@@ -85,12 +95,14 @@ async def test_perform_refresh_replaces_result(icm: InstanceConnectionManager) -
8595

8696
@pytest.mark.asyncio
8797
async def test_perform_refresh_wont_replace_valid_result_with_invalid(
88-
icm: InstanceConnectionManager,
98+
icm: InstanceConnectionManager, test_rate_limiter: AsyncRateLimiter
8999
) -> None:
90100
"""
91101
Test to check whether _perform_refresh won't replace a valid _current
92102
value with an invalid one
93103
"""
104+
# allow more frequent refreshes for tests
105+
setattr(icm, "_refresh_rate_limiter", test_rate_limiter)
94106

95107
# stub _get_instance_data to return a "valid" MockMetadata object
96108
setattr(icm, "_get_instance_data", _get_metadata_success)
@@ -111,12 +123,14 @@ async def test_perform_refresh_wont_replace_valid_result_with_invalid(
111123

112124
@pytest.mark.asyncio
113125
async def test_perform_refresh_replaces_invalid_result(
114-
icm: InstanceConnectionManager,
126+
icm: InstanceConnectionManager, test_rate_limiter: AsyncRateLimiter
115127
) -> None:
116128
"""
117129
Test to check whether _perform_refresh will replace an invalid refresh result with
118130
a valid one
119131
"""
132+
# allow more frequent refreshes for tests
133+
setattr(icm, "_refresh_rate_limiter", test_rate_limiter)
120134

121135
# stub _get_instance_data to throw an error
122136
setattr(icm, "_get_instance_data", _get_metadata_error)
@@ -132,3 +146,28 @@ async def test_perform_refresh_replaces_invalid_result(
132146

133147
assert icm._current == new_task
134148
assert isinstance(icm._current.result(), MockMetadata)
149+
150+
151+
@pytest.mark.asyncio
async def test_force_refresh_cancels_pending_refresh(
    icm: InstanceConnectionManager,
    test_rate_limiter: AsyncRateLimiter,
) -> None:
    """
    Test that force_refresh cancels pending task if refresh_in_progress event is not set.
    """
    # allow more frequent refreshes for tests
    setattr(icm, "_refresh_rate_limiter", test_rate_limiter)

    # stub _get_instance_data to return a MockMetadata instance
    setattr(icm, "_get_instance_data", _get_metadata_success)

    # since the pending refresh isn't for another 55 min, the refresh_in_progress event
    # shouldn't be set
    pending_refresh = icm._next
    assert icm._refresh_in_progress.is_set() is False

    icm.force_refresh()

    # the delayed refresh task should have been cancelled and replaced by an
    # immediate one whose (successful) result is now in _current
    assert pending_refresh.cancelled() is True
    assert isinstance(icm._current.result(), MockMetadata)

0 commit comments

Comments
 (0)