@@ -1,6 +1,7 @@
 # Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/core/src/crawlers/statistics.ts
 from __future__ import annotations
 
+import asyncio
 import math
 import time
 from datetime import datetime, timedelta, timezone
@@ -84,8 +85,6 @@ def __init__(
         self._id = Statistics.__next_id
         Statistics.__next_id += 1
 
-        self._instance_start: datetime | None = None
-
         self.error_tracker = ErrorTracker(
             save_error_snapshots=save_error_snapshots,
             snapshot_kvs_name=persist_state_kvs_name,
@@ -111,6 +110,9 @@ def __init__(
         # Flag to indicate the context state.
         self._active = False
 
+        # Pre-existing runtime offset, which can be non-zero when restoring serialized state from KVS.
+        self._runtime_offset = timedelta(seconds=0)
+
     def replace_state_model(self, state_model: type[TNewStatisticsState]) -> Statistics[TNewStatisticsState]:
         """Create near copy of the `Statistics` with replaced `state_model`."""
         new_statistics: Statistics[TNewStatisticsState] = Statistics(
@@ -165,14 +167,17 @@ async def __aenter__(self) -> Self:
         if self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is already active.')
 
-        self._active = True
-        self._instance_start = datetime.now(timezone.utc)
-
         await self._state.initialize()
-        self._after_initialize()
 
+        self._runtime_offset = self.state.crawler_runtime
+
+        # Start periodic logging and let it print initial state before activation.
         self._periodic_logger.start()
+        await asyncio.sleep(0.01)
+        self._active = True
 
+        self.state.crawler_last_started_at = datetime.now(timezone.utc)
+        self.state.crawler_started_at = self.state.crawler_started_at or self.state.crawler_last_started_at
         return self
 
     async def __aexit__(
178183 async def __aexit__ (
@@ -191,14 +196,16 @@ async def __aexit__(
 
         if not self.state.crawler_last_started_at:
             raise RuntimeError('Statistics.state.crawler_last_started_at not set.')
-        self.state.crawler_finished_at = datetime.now(timezone.utc)
-        self.state.crawler_runtime += self.state.crawler_finished_at - self.state.crawler_last_started_at
-
-        await self._state.teardown()
 
+        # Stop logging and deactivate the statistics to prevent further changes to crawler_runtime.
         await self._periodic_logger.stop()
+        self.state.crawler_finished_at = datetime.now(timezone.utc)
+        self.state.crawler_runtime = (
+            self._runtime_offset + self.state.crawler_finished_at - self.state.crawler_last_started_at
+        )
 
         self._active = False
+        await self._state.teardown()
 
     @property
     def state(self) -> TStatisticsState:
@@ -255,10 +262,19 @@ def record_request_processing_failure(self, request_id_or_key: str) -> None:
 
         del self._requests_in_progress[request_id_or_key]
 
+    def _update_crawler_runtime(self) -> None:
+        current_run_duration = (
+            (datetime.now(timezone.utc) - self.state.crawler_last_started_at)
+            if self.state.crawler_last_started_at
+            else timedelta()
+        )
+        self.state.crawler_runtime = current_run_duration + self._runtime_offset
+
     def calculate(self) -> FinalStatistics:
         """Calculate the current statistics."""
-        if self._instance_start is None:
-            raise RuntimeError('The Statistics object is not initialized')
+        if self._active:
+            # Only update state when active. If not, just report the last known runtime.
+            self._update_crawler_runtime()
 
         total_minutes = self.state.crawler_runtime.total_seconds() / 60
         state = self._state.current_value
@@ -291,21 +307,6 @@ def _log(self) -> None:
         else:
             self._periodic_message_logger.info(self._log_message, extra=stats.to_dict())
 
-    def _after_initialize(self) -> None:
-        state = self._state.current_value
-
-        if state.crawler_started_at is None:
-            state.crawler_started_at = datetime.now(timezone.utc)
-
-        if state.stats_persisted_at is not None and state.crawler_last_started_at:
-            self._instance_start = datetime.now(timezone.utc) - (
-                state.stats_persisted_at - state.crawler_last_started_at
-            )
-        elif state.crawler_last_started_at:
-            self._instance_start = state.crawler_last_started_at
-
-        state.crawler_last_started_at = self._instance_start
-
     def _save_retry_count_for_request(self, record: RequestProcessingRecord) -> None:
         retry_count = record.retry_count
         state = self._state.current_value
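
Not part of the commit, just a minimal sketch of the accounting the diff introduces: when state is restored from the KVS, the previously persisted crawler_runtime becomes the offset, and only the current run's wall-clock time is added on top of it. The field names mirror those in the diff; the values are made up for illustration.

from datetime import datetime, timedelta, timezone

# Offset restored from persisted state (corresponds to self._runtime_offset = self.state.crawler_runtime).
runtime_offset = timedelta(minutes=5)

# Recorded on __aenter__ for the current run.
crawler_last_started_at = datetime.now(timezone.utc)

# ... the crawler does its work ...

# On __aexit__ (and, while active, via _update_crawler_runtime):
crawler_finished_at = datetime.now(timezone.utc)
crawler_runtime = runtime_offset + (crawler_finished_at - crawler_last_started_at)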