From e8722ced0cb49a5cfc7c05f92d024b4d808d94c1 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 4 Feb 2025 21:47:35 +0000 Subject: [PATCH 1/2] chore: implement _snapshot_client for Snapshotter --- src/crawlee/_autoscaling/snapshotter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/crawlee/_autoscaling/snapshotter.py b/src/crawlee/_autoscaling/snapshotter.py index 26669096b6..b967f3c362 100644 --- a/src/crawlee/_autoscaling/snapshotter.py +++ b/src/crawlee/_autoscaling/snapshotter.py @@ -305,11 +305,11 @@ def _snapshot_client(self) -> None: Only errors produced by a 2nd retry of the API call are considered for snapshotting since earlier errors may just be caused by a random spike in the number of requests and do not necessarily signify API overloading. """ - # TODO: This is just a dummy placeholder. It can be implemented once `StorageClient` is ready. - # Attribute `self._client_rate_limit_error_retry_count` will be used here. - # https://github.com/apify/crawlee-python/issues/60 + client = service_locator.get_storage_client() - error_count = 0 + rate_limit_errors: dict[int, int] = getattr(client, 'rate_limit_errors', {}) + + error_count = rate_limit_errors.get(self._CLIENT_RATE_LIMIT_ERROR_RETRY_COUNT, 0) snapshot = ClientSnapshot(error_count=error_count, max_error_count=self._max_client_errors) snapshots = cast(list[Snapshot], self._client_snapshots) From 3e250f80b4de99ee06cb261ec1296be20cc346ef Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Fri, 7 Feb 2025 16:26:02 +0000 Subject: [PATCH 2/2] add `get_rate_limit_errors` in `BaseStorageClient` --- src/crawlee/_autoscaling/snapshotter.py | 2 +- src/crawlee/storage_clients/_base/_base_storage_client.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/crawlee/_autoscaling/snapshotter.py b/src/crawlee/_autoscaling/snapshotter.py index b967f3c362..afce742be0 100644 --- a/src/crawlee/_autoscaling/snapshotter.py +++ 
b/src/crawlee/_autoscaling/snapshotter.py @@ -307,7 +307,7 @@ def _snapshot_client(self) -> None: """ client = service_locator.get_storage_client() - rate_limit_errors: dict[int, int] = getattr(client, 'rate_limit_errors', {}) + rate_limit_errors: dict[int, int] = client.get_rate_limit_errors() error_count = rate_limit_errors.get(self._CLIENT_RATE_LIMIT_ERROR_RETRY_COUNT, 0) snapshot = ClientSnapshot(error_count=error_count, max_error_count=self._max_client_errors) diff --git a/src/crawlee/storage_clients/_base/_base_storage_client.py b/src/crawlee/storage_clients/_base/_base_storage_client.py index 8497160cf4..27decefaae 100644 --- a/src/crawlee/storage_clients/_base/_base_storage_client.py +++ b/src/crawlee/storage_clients/_base/_base_storage_client.py @@ -56,3 +56,7 @@ async def purge_on_start(self) -> None: It is primarily used to clean up residual data from previous runs to maintain a clean state. If the storage client does not support purging, leave it empty. """ + + def get_rate_limit_errors(self) -> dict[int, int]: + """Returns statistics about rate limit errors encountered by the HTTP client in the storage client.""" + return {}