7
7
from typing import TYPE_CHECKING , Any , Callable , Literal , TypeVar , cast , overload
8
8
9
9
from lazy_object_proxy import Proxy
10
+ from more_itertools import flatten
10
11
from pydantic import AliasChoices
11
12
12
13
from apify_client import ApifyClientAsync
13
14
from apify_shared .consts import ActorEnvVars , ActorExitCodes , ApifyEnvVars
14
15
from apify_shared .utils import ignore_docs , maybe_extract_enum_member_value
15
- from crawlee import service_container
16
+ from crawlee import service_locator
16
17
from crawlee .events import (
17
18
Event ,
18
19
EventAbortingData ,
41
42
from typing_extensions import Self
42
43
43
44
from crawlee .proxy_configuration import _NewUrlFunction
45
+ from crawlee .storage_clients import BaseStorageClient
44
46
45
47
from apify ._models import Webhook
46
48
@@ -56,6 +58,7 @@ class _ActorType:
56
58
_apify_client : ApifyClientAsync
57
59
_configuration : Configuration
58
60
_is_exiting = False
61
+ _is_rebooting = False
59
62
60
63
def __init__ (
61
64
self ,
@@ -77,17 +80,22 @@ def __init__(
77
80
self ._configure_logging = configure_logging
78
81
self ._apify_client = self .new_client ()
79
82
80
- self ._event_manager : EventManager
81
- if self ._configuration .is_at_home :
82
- self ._event_manager = PlatformEventManager (
83
+ # Create an instance of the cloud storage client, the local storage client is obtained
84
+ # from the service locator.
85
+ self ._cloud_storage_client = ApifyStorageClient .from_config (config = self ._configuration )
86
+
87
+ # Set the event manager based on whether the Actor is running on the platform or locally.
88
+ self ._event_manager = (
89
+ PlatformEventManager (
83
90
config = self ._configuration ,
84
91
persist_state_interval = self ._configuration .persist_state_interval ,
85
92
)
86
- else :
87
- self . _event_manager = LocalEventManager (
93
+ if self . is_at_home ()
94
+ else LocalEventManager (
88
95
system_info_interval = self ._configuration .system_info_interval ,
89
96
persist_state_interval = self ._configuration .persist_state_interval ,
90
97
)
98
+ )
91
99
92
100
self ._is_initialized = False
93
101
@@ -100,9 +108,6 @@ async def __aenter__(self) -> Self:
100
108
When you exit the `async with` block, the `Actor.exit()` method is called, and if any exception happens while
101
109
executing the block code, the `Actor.fail` method is called.
102
110
"""
103
- if self ._configure_logging :
104
- _configure_logging (self ._configuration )
105
-
106
111
await self .init ()
107
112
return self
108
113
@@ -162,10 +167,25 @@ def log(self) -> logging.Logger:
162
167
"""The logging.Logger instance the Actor uses."""
163
168
return logger
164
169
170
+ @property
171
+ def _local_storage_client (self ) -> BaseStorageClient :
172
+ """The local storage client the Actor instance uses."""
173
+ return service_locator .get_storage_client ()
174
+
165
175
def _raise_if_not_initialized (self ) -> None :
166
176
if not self ._is_initialized :
167
177
raise RuntimeError ('The Actor was not initialized!' )
168
178
179
+ def _raise_if_cloud_requested_but_not_configured (self , * , force_cloud : bool ) -> None :
180
+ if not force_cloud :
181
+ return
182
+
183
+ if not self .is_at_home () and self .config .token is None :
184
+ raise RuntimeError (
185
+ 'In order to use the Apify cloud storage from your computer, '
186
+ 'you need to provide an Apify token using the APIFY_TOKEN environment variable.'
187
+ )
188
+
169
189
async def init (self ) -> None :
170
190
"""Initialize the Actor instance.
171
191
@@ -180,18 +200,19 @@ async def init(self) -> None:
180
200
if self ._is_initialized :
181
201
raise RuntimeError ('The Actor was already initialized!' )
182
202
183
- if self ._configuration . token :
184
- service_container . set_cloud_storage_client ( ApifyStorageClient ( configuration = self ._configuration ))
203
+ self ._is_exiting = False
204
+ self ._was_final_persist_state_emitted = False
185
205
186
- if self ._configuration .is_at_home :
187
- service_container .set_default_storage_client_type ('cloud' )
188
- else :
189
- service_container .set_default_storage_client_type ('local' )
206
+ # If the Actor is running on the Apify platform, we set the cloud storage client.
207
+ if self .is_at_home ():
208
+ service_locator .set_storage_client (self ._cloud_storage_client )
190
209
191
- service_container .set_event_manager (self ._event_manager )
210
+ service_locator .set_event_manager (self .event_manager )
211
+ service_locator .set_configuration (self .configuration )
192
212
193
- self ._is_exiting = False
194
- self ._was_final_persist_state_emitted = False
213
+ # The logging configuration has to be called after all service_locator set methods.
214
+ if self ._configure_logging :
215
+ _configure_logging ()
195
216
196
217
self .log .info ('Initializing Actor...' )
197
218
self .log .info ('System info' , extra = get_system_info ())
@@ -241,7 +262,6 @@ async def finalize() -> None:
241
262
await self ._event_manager .wait_for_all_listeners_to_complete (timeout = event_listeners_timeout )
242
263
243
264
await self ._event_manager .__aexit__ (None , None , None )
244
- cast (dict , service_container ._services ).clear () # noqa: SLF001
245
265
246
266
await asyncio .wait_for (finalize (), cleanup_timeout .total_seconds ())
247
267
self ._is_initialized = False
@@ -343,12 +363,15 @@ async def open_dataset(
343
363
An instance of the `Dataset` class for the given ID or name.
344
364
"""
345
365
self ._raise_if_not_initialized ()
366
+ self ._raise_if_cloud_requested_but_not_configured (force_cloud = force_cloud )
367
+
368
+ storage_client = self ._cloud_storage_client if force_cloud else self ._local_storage_client
346
369
347
370
return await Dataset .open (
348
371
id = id ,
349
372
name = name ,
350
373
configuration = self ._configuration ,
351
- storage_client = service_container . get_storage_client ( client_type = 'cloud' if force_cloud else None ) ,
374
+ storage_client = storage_client ,
352
375
)
353
376
354
377
async def open_key_value_store (
@@ -375,12 +398,14 @@ async def open_key_value_store(
375
398
An instance of the `KeyValueStore` class for the given ID or name.
376
399
"""
377
400
self ._raise_if_not_initialized ()
401
+ self ._raise_if_cloud_requested_but_not_configured (force_cloud = force_cloud )
402
+ storage_client = self ._cloud_storage_client if force_cloud else self ._local_storage_client
378
403
379
404
return await KeyValueStore .open (
380
405
id = id ,
381
406
name = name ,
382
407
configuration = self ._configuration ,
383
- storage_client = service_container . get_storage_client ( client_type = 'cloud' if force_cloud else None ) ,
408
+ storage_client = storage_client ,
384
409
)
385
410
386
411
async def open_request_queue (
@@ -409,12 +434,15 @@ async def open_request_queue(
409
434
An instance of the `RequestQueue` class for the given ID or name.
410
435
"""
411
436
self ._raise_if_not_initialized ()
437
+ self ._raise_if_cloud_requested_but_not_configured (force_cloud = force_cloud )
438
+
439
+ storage_client = self ._cloud_storage_client if force_cloud else self ._local_storage_client
412
440
413
441
return await RequestQueue .open (
414
442
id = id ,
415
443
name = name ,
416
444
configuration = self ._configuration ,
417
- storage_client = service_container . get_storage_client ( client_type = 'cloud' if force_cloud else None ) ,
445
+ storage_client = storage_client ,
418
446
)
419
447
420
448
async def push_data (self , data : dict | list [dict ]) -> None :
@@ -870,12 +898,32 @@ async def reboot(
870
898
self .log .error ('Actor.reboot() is only supported when running on the Apify platform.' )
871
899
return
872
900
901
+ if self ._is_rebooting :
902
+ self .log .debug ('Actor is already rebooting, skipping the additional reboot call.' )
903
+ return
904
+
905
+ self ._is_rebooting = True
906
+
873
907
if not custom_after_sleep :
874
908
custom_after_sleep = self ._configuration .metamorph_after_sleep
875
909
876
- self ._event_manager .emit (event = Event .PERSIST_STATE , event_data = EventPersistStateData (is_migrating = True ))
910
+ # Call all the listeners for the PERSIST_STATE and MIGRATING events, and wait for them to finish.
911
+ # PERSIST_STATE listeners are called to allow the Actor to persist its state before the reboot.
912
+ # MIGRATING listeners are called to allow the Actor to gracefully stop in-progress tasks before the reboot.
913
+ # Typically, crawlers are listening for the MIGRATING event to stop processing new requests.
914
+ # We can't just emit the events and wait for all listeners to finish,
915
+ # because this method might be called from an event listener itself, and we would deadlock.
916
+ persist_state_listeners = flatten (
917
+ (self ._event_manager ._listeners_to_wrappers [Event .PERSIST_STATE ] or {}).values () # noqa: SLF001
918
+ )
919
+ migrating_listeners = flatten (
920
+ (self ._event_manager ._listeners_to_wrappers [Event .MIGRATING ] or {}).values () # noqa: SLF001
921
+ )
877
922
878
- await self ._event_manager .__aexit__ (None , None , None )
923
+ await asyncio .gather (
924
+ * [listener (EventPersistStateData (is_migrating = True )) for listener in persist_state_listeners ],
925
+ * [listener (EventMigratingData ()) for listener in migrating_listeners ],
926
+ )
879
927
880
928
if not self ._configuration .actor_run_id :
881
929
raise RuntimeError ('actor_run_id cannot be None when running on the Apify platform.' )
@@ -972,7 +1020,7 @@ async def create_proxy_configuration(
972
1020
password : str | None = None ,
973
1021
groups : list [str ] | None = None ,
974
1022
country_code : str | None = None ,
975
- proxy_urls : list [str ] | None = None ,
1023
+ proxy_urls : list [str | None ] | None = None ,
976
1024
new_url_function : _NewUrlFunction | None = None ,
977
1025
) -> ProxyConfiguration | None :
978
1026
"""Create a ProxyConfiguration object with the passed proxy configuration.
0 commit comments