Commit 7dd0f66

WIP, fix failing tests
1 parent 3aa0215

4 files changed, +75 -9 lines changed

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 13 additions & 2 deletions

@@ -11,7 +11,7 @@
 from asyncio import CancelledError
 from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Sequence
 from contextlib import AsyncExitStack, suppress
-from datetime import timedelta
+from datetime import datetime, timedelta, timezone
 from functools import partial
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generic, Literal, cast
@@ -56,7 +56,7 @@
     SessionError,
     UserDefinedErrorHandlerError,
 )
-from crawlee.events._types import Event, EventCrawlerStatusData
+from crawlee.events._types import Event, EventCrawlerStatusData, EventPersistStateData
 from crawlee.http_clients import ImpitHttpClient
 from crawlee.router import Router
 from crawlee.sessions import SessionPool
@@ -440,6 +440,7 @@ def __init__(
         self._statistics = statistics or cast(
             'Statistics[TStatisticsState]',
             Statistics.with_default_state(
+                persistence_enabled=True,
                 periodic_message_logger=self._logger,
                 statistics_log_format=self._statistics_log_format,
                 log_message='Current request statistics:',
@@ -744,6 +745,16 @@ async def _run_crawler(self) -> None:
 
         await self._autoscaled_pool.run()
 
+        # Emit a PERSIST_STATE event when the crawler is finishing, so that listeners can persist their state if needed.
+        if not self.statistics.state.crawler_last_started_at:
+            raise RuntimeError('Statistics.state.crawler_last_started_at not set.')
+        run_duration = datetime.now(timezone.utc) - self.statistics.state.crawler_last_started_at
+        self._statistics.state.crawler_runtime = self.statistics.state.crawler_runtime + run_duration
+        self._service_locator.get_event_manager().emit(
+            event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=False)
+        )
+        await asyncio.sleep(10)
+
     async def add_requests(
         self,
         requests: Sequence[str | Request],
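
Note on the change above: rather than recomputing the runtime from the current instance start, the crawler now accumulates each run's duration into the persisted statistics state. A minimal sketch of that accumulation pattern, using hypothetical stand-in types rather than the actual Crawlee classes:

from __future__ import annotations

from dataclasses import dataclass
from datetime import datetime, timedelta, timezone

@dataclass
class SketchState:
    # Mirrors the two statistics fields touched in the diff above.
    crawler_last_started_at: datetime | None = None
    crawler_runtime: timedelta = timedelta()

def finish_run(state: SketchState) -> None:
    # Add this run's wall-clock duration to the cumulative runtime, so the
    # total survives restarts once the state is persisted and reloaded.
    if state.crawler_last_started_at is None:
        raise RuntimeError('crawler_last_started_at not set')
    state.crawler_runtime += datetime.now(timezone.utc) - state.crawler_last_started_at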

src/crawlee/statistics/_statistics.py

Lines changed: 2 additions & 3 deletions

@@ -250,8 +250,7 @@ def calculate(self) -> FinalStatistics:
         if self._instance_start is None:
             raise RuntimeError('The Statistics object is not initialized')
 
-        crawler_runtime = datetime.now(timezone.utc) - self._instance_start
-        total_minutes = crawler_runtime.total_seconds() / 60
+        total_minutes = self.state.crawler_runtime.total_seconds() / 60
         state = self._state.current_value
         serialized_state = state.model_dump(by_alias=False)
 
@@ -262,7 +261,7 @@
             requests_failed_per_minute=math.floor(state.requests_failed / total_minutes) if total_minutes else 0,
             request_total_duration=state.request_total_finished_duration + state.request_total_failed_duration,
             requests_total=state.requests_failed + state.requests_finished,
-            crawler_runtime=crawler_runtime,
+            crawler_runtime=state.crawler_runtime,
             requests_finished=state.requests_finished,
             requests_failed=state.requests_failed,
             retry_histogram=serialized_state['request_retry_histogram'],
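
With this change, calculate() derives the per-minute throughput figures from the persisted cumulative runtime instead of the lifetime of the current Statistics instance. A small worked example of the arithmetic, with values invented for illustration:

import math
from datetime import timedelta

# Suppose 3 requests finished over a cumulative runtime of 90 seconds,
# accumulated across two separate runs.
crawler_runtime = timedelta(seconds=90)
requests_finished = 3

total_minutes = crawler_runtime.total_seconds() / 60  # 1.5
requests_finished_per_minute = math.floor(requests_finished / total_minutes) if total_minutes else 0
print(requests_finished_per_minute)  # 2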

tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 56 additions & 0 deletions

@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import asyncio
+import concurrent.futures
 import json
 import logging
 import os
@@ -1643,3 +1644,58 @@ async def handler(context: BasicCrawlingContext) -> None:
 
     # Crawler should not fall back to the default storage after the purge
     assert await unrelated_rq.fetch_next_request() == unrelated_request
+
+
+async def _run_crawler(requests: list[str], storage_dir: str) -> StatisticsState:
+    """Run a crawler and return its statistics state.
+
+    Must be defined like this to be picklable for ProcessPoolExecutor."""
+    service_locator.set_configuration(
+        Configuration(
+            crawlee_storage_dir=storage_dir,  # type: ignore[call-arg]
+            purge_on_start=False,
+        )
+    )
+
+    async def request_handler(context: BasicCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url} ...')
+
+    crawler = BasicCrawler(
+        request_handler=request_handler,
+        concurrency_settings=ConcurrencySettings(max_concurrency=1, desired_concurrency=1),
+    )
+
+    await crawler.run(requests)
+    return crawler.statistics.state
+
+
+def _process_run_crawler(requests: list[str], storage_dir: str) -> StatisticsState:
+    return asyncio.run(_run_crawler(requests=requests, storage_dir=storage_dir))
+
+
+async def test_crawler_statistics_persistence(tmp_path: Path) -> None:
+    """Test that crawler statistics persist and are loaded correctly.
+
+    This test simulates starting the crawler process twice and checks that the statistics include the first run."""
+    with concurrent.futures.ProcessPoolExecutor() as executor:
+        # Crawl 2 requests in the first run and automatically persist the state.
+        first_run_state = executor.submit(
+            _process_run_crawler,
+            requests=['https://a.placeholder.com', 'https://b.placeholder.com'],
+            storage_dir=str(tmp_path),
+        ).result()
+        assert first_run_state.requests_finished == 2
+
+        # Crawl 1 additional request in the second run, but reuse the previously persisted state.
+        second_run_state = executor.submit(
+            _process_run_crawler, requests=['https://c.placeholder.com'], storage_dir=str(tmp_path)
+        ).result()
+        assert second_run_state.requests_finished == 3
+
+        assert first_run_state.crawler_started_at == second_run_state.crawler_started_at
+        assert first_run_state.crawler_finished_at
+        assert second_run_state.crawler_finished_at
+
+        assert first_run_state.crawler_finished_at < second_run_state.crawler_finished_at
+        assert first_run_state.crawler_runtime < second_run_state.crawler_runtime
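
The test launches each crawl in a separate process so that every run starts from a fresh interpreter and must reload the persisted state; ProcessPoolExecutor is also why the helpers above sit at module level, since submit() has to pickle the callable and its arguments. A minimal standalone sketch of that pattern, with illustrative names not taken from the codebase:

import asyncio
import concurrent.futures

async def job(n: int) -> int:
    await asyncio.sleep(0)  # Stand-in for real async work.
    return n * 2

def run_job(n: int) -> int:
    # Worker-process entry point: each worker runs its own event loop.
    return asyncio.run(job(n))

if __name__ == '__main__':
    with concurrent.futures.ProcessPoolExecutor() as executor:
        # submit() pickles the callable and its arguments, which is why
        # nested functions or lambdas would fail here.
        assert executor.submit(run_job, 21).result() == 42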

tests/unit/test_configuration.py

Lines changed: 4 additions & 4 deletions

@@ -41,10 +41,10 @@ async def test_storage_not_persisted_when_disabled(tmp_path: Path, server_url: U
     )
     storage_client = MemoryStorageClient()
 
-    crawler = HttpCrawler(
-        configuration=configuration,
-        storage_client=storage_client,
-    )
+    service_locator.set_configuration(configuration)
+    service_locator.set_storage_client(storage_client)
+
+    crawler = HttpCrawler()
 
     @crawler.router.default_handler
     async def default_handler(context: HttpCrawlingContext) -> None:
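
The test now registers its configuration and storage client through the global service_locator instead of passing them to the crawler constructor. A stripped-down sketch of the service-locator pattern in play here, illustrative only and not Crawlee's actual implementation:

class SketchServiceLocator:
    def __init__(self) -> None:
        self._configuration = None

    def set_configuration(self, configuration: dict) -> None:
        self._configuration = configuration

    def get_configuration(self) -> dict:
        # Consumers resolve their dependencies at use time instead of
        # receiving them through constructor arguments.
        if self._configuration is None:
            raise RuntimeError('Configuration was not set')
        return self._configuration

locator = SketchServiceLocator()
locator.set_configuration({'purge_on_start': False})
assert locator.get_configuration() == {'purge_on_start': False}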
