Skip to content

Commit 51be03d

Browse files
committed
feat: Upgrade to Crawlee v0.5
1 parent ccba8d1 commit 51be03d

File tree

12 files changed

+363
-145
lines changed

12 files changed

+363
-145
lines changed

poetry.lock

Lines changed: 150 additions & 38 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@ keywords = [
4545
python = "^3.9"
4646
apify-client = ">=1.8.1"
4747
apify-shared = ">=1.2.1"
48-
crawlee = "~0.4.0"
48+
crawlee = "~0.5.0"
4949
cryptography = ">=42.0.0"
5050
httpx = ">=0.27.0"
5151
lazy-object-proxy = ">=1.10.0"

src/apify/_actor.py

Lines changed: 34 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -13,8 +13,9 @@
1313
from apify_client import ApifyClientAsync
1414
from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars
1515
from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value
16-
from crawlee import service_container
16+
from crawlee import service_locator
1717
from crawlee.events._types import Event, EventMigratingData, EventPersistStateData
18+
from crawlee.memory_storage_client import MemoryStorageClient
1819

1920
from apify._configuration import Configuration
2021
from apify._consts import EVENT_LISTENERS_TIMEOUT
@@ -71,17 +72,22 @@ def __init__(
7172
self._configure_logging = configure_logging
7273
self._apify_client = self.new_client()
7374

74-
self._event_manager: EventManager
75-
if self._configuration.is_at_home:
76-
self._event_manager = PlatformEventManager(
77-
config=self._configuration,
78-
persist_state_interval=self._configuration.persist_state_interval,
75+
# We need to keep both local & cloud storage clients because of the `force_cloud` option.
76+
self._local_storage_client = MemoryStorageClient.from_config(config=self.config)
77+
self._cloud_storage_client = ApifyStorageClient.from_config(config=self.config)
78+
79+
# Set the event manager based on whether the Actor is running on the platform or locally.
80+
self._event_manager = (
81+
PlatformEventManager(
82+
config=self.config,
83+
persist_state_interval=self.config.persist_state_interval,
7984
)
80-
else:
81-
self._event_manager = LocalEventManager(
82-
system_info_interval=self._configuration.system_info_interval,
83-
persist_state_interval=self._configuration.persist_state_interval,
85+
if self.is_at_home()
86+
else LocalEventManager(
87+
system_info_interval=self.config.system_info_interval,
88+
persist_state_interval=self.config.persist_state_interval,
8489
)
90+
)
8591

8692
self._is_initialized = False
8793

@@ -95,7 +101,7 @@ async def __aenter__(self) -> Self:
95101
executing the block code, the `Actor.fail` method is called.
96102
"""
97103
if self._configure_logging:
98-
_configure_logging(self._configuration)
104+
_configure_logging()
99105

100106
await self.init()
101107
return self
@@ -184,18 +190,17 @@ async def init(self) -> None:
184190
if self._is_initialized:
185191
raise RuntimeError('The Actor was already initialized!')
186192

187-
if self._configuration.token:
188-
service_container.set_cloud_storage_client(ApifyStorageClient(configuration=self._configuration))
193+
self._is_exiting = False
194+
self._was_final_persist_state_emitted = False
189195

190-
if self._configuration.is_at_home:
191-
service_container.set_default_storage_client_type('cloud')
196+
# Register services in the service locator.
197+
if self.is_at_home():
198+
service_locator.set_storage_client(self._cloud_storage_client)
192199
else:
193-
service_container.set_default_storage_client_type('local')
200+
service_locator.set_storage_client(self._local_storage_client)
194201

195-
service_container.set_event_manager(self._event_manager)
196-
197-
self._is_exiting = False
198-
self._was_final_persist_state_emitted = False
202+
service_locator.set_event_manager(self.event_manager)
203+
service_locator.set_configuration(self.configuration)
199204

200205
self.log.info('Initializing Actor...')
201206
self.log.info('System info', extra=get_system_info())
@@ -245,7 +250,6 @@ async def finalize() -> None:
245250
await self._event_manager.wait_for_all_listeners_to_complete(timeout=event_listeners_timeout)
246251

247252
await self._event_manager.__aexit__(None, None, None)
248-
cast(dict, service_container._services).clear() # noqa: SLF001
249253

250254
await asyncio.wait_for(finalize(), cleanup_timeout.total_seconds())
251255
self._is_initialized = False
@@ -349,11 +353,13 @@ async def open_dataset(
349353
self._raise_if_not_initialized()
350354
self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)
351355

356+
storage_client = self._cloud_storage_client if force_cloud else service_locator.get_storage_client()
357+
352358
return await Dataset.open(
353359
id=id,
354360
name=name,
355361
configuration=self._configuration,
356-
storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None),
362+
storage_client=storage_client,
357363
)
358364

359365
async def open_key_value_store(
@@ -381,12 +387,13 @@ async def open_key_value_store(
381387
"""
382388
self._raise_if_not_initialized()
383389
self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)
390+
storage_client = self._cloud_storage_client if force_cloud else service_locator.get_storage_client()
384391

385392
return await KeyValueStore.open(
386393
id=id,
387394
name=name,
388395
configuration=self._configuration,
389-
storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None),
396+
storage_client=storage_client,
390397
)
391398

392399
async def open_request_queue(
@@ -417,11 +424,13 @@ async def open_request_queue(
417424
self._raise_if_not_initialized()
418425
self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)
419426

427+
storage_client = self._cloud_storage_client if force_cloud else service_locator.get_storage_client()
428+
420429
return await RequestQueue.open(
421430
id=id,
422431
name=name,
423432
configuration=self._configuration,
424-
storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None),
433+
storage_client=storage_client,
425434
)
426435

427436
async def push_data(self, data: dict | list[dict]) -> None:
@@ -963,7 +972,7 @@ async def create_proxy_configuration(
963972
password: str | None = None,
964973
groups: list[str] | None = None,
965974
country_code: str | None = None,
966-
proxy_urls: list[str] | None = None,
975+
proxy_urls: list[str | None] | None = None,
967976
new_url_function: _NewUrlFunction | None = None,
968977
) -> ProxyConfiguration | None:
969978
"""Create a ProxyConfiguration object with the passed proxy configuration.

src/apify/_configuration.py

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
from datetime import datetime, timedelta
4+
from logging import getLogger
45
from typing import Annotated, Any
56

67
from pydantic import AliasChoices, BeforeValidator, Field
@@ -12,6 +13,8 @@
1213

1314
from apify._utils import docs_group
1415

16+
logger = getLogger(__name__)
17+
1518

1619
def _transform_to_list(value: Any) -> list[str] | None:
1720
if value is None:
@@ -353,6 +356,15 @@ class Configuration(CrawleeConfiguration):
353356
),
354357
] = None
355358

359+
@classmethod
360+
def get_global_configuration(cls) -> Configuration:
361+
"""Retrieve the global instance of the configuration.
362+
363+
Mostly for backwards compatibility. It is recommended to use `service_locator.get_configuration()`
364+
instead.
365+
"""
366+
return cls()
367+
356368

357369
# Monkey-patch the base class so that it works with the extended configuration
358370
CrawleeConfiguration.get_global_configuration = Configuration.get_global_configuration # type: ignore[method-assign]

src/apify/_proxy_configuration.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -111,9 +111,9 @@ def __init__(
111111
password: str | None = None,
112112
groups: list[str] | None = None,
113113
country_code: str | None = None,
114-
proxy_urls: list[str] | None = None,
114+
proxy_urls: list[str | None] | None = None,
115115
new_url_function: _NewUrlFunction | None = None,
116-
tiered_proxy_urls: list[list[str]] | None = None,
116+
tiered_proxy_urls: list[list[str | None]] | None = None,
117117
_actor_config: Configuration | None = None,
118118
_apify_client: ApifyClientAsync | None = None,
119119
) -> None:
@@ -148,7 +148,7 @@ def __init__(
148148
' "groups" or "country_code".'
149149
)
150150

151-
if proxy_urls and any('apify.com' in url for url in proxy_urls):
151+
if proxy_urls and any('apify.com' in (url or '') for url in proxy_urls):
152152
logger.warning(
153153
'Some Apify proxy features may work incorrectly. Please consider setting up Apify properties '
154154
'instead of `proxy_urls`.\n'

src/apify/apify_storage_client/_apify_storage_client.py

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,13 @@
1+
from __future__ import annotations
2+
3+
from typing import TYPE_CHECKING
4+
15
from typing_extensions import override
26

37
from apify_client import ApifyClientAsync
48
from crawlee._utils.crypto import crypto_random_object_id
59
from crawlee.base_storage_client import BaseStorageClient
610

7-
from apify._configuration import Configuration
811
from apify._utils import docs_group
912
from apify.apify_storage_client._dataset_client import DatasetClient
1013
from apify.apify_storage_client._dataset_collection_client import DatasetCollectionClient
@@ -13,6 +16,9 @@
1316
from apify.apify_storage_client._request_queue_client import RequestQueueClient
1417
from apify.apify_storage_client._request_queue_collection_client import RequestQueueCollectionClient
1518

19+
if TYPE_CHECKING:
20+
from apify._configuration import Configuration
21+
1622

1723
@docs_group('Classes')
1824
class ApifyStorageClient(BaseStorageClient):
@@ -29,6 +35,10 @@ def __init__(self, *, configuration: Configuration) -> None:
2935
)
3036
self._configuration = configuration
3137

38+
@classmethod
39+
def from_config(cls, config: Configuration) -> ApifyStorageClient:
40+
return cls(configuration=config)
41+
3242
@override
3343
def dataset(self, id: str) -> DatasetClient:
3444
return DatasetClient(self._apify_client.dataset(id))

src/apify/log.py

Lines changed: 4 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1,14 +1,10 @@
11
from __future__ import annotations
22

33
import logging
4-
from typing import TYPE_CHECKING
54

65
from apify_shared.utils import ignore_docs
76
from crawlee._log_config import CrawleeLogFormatter, configure_logger, get_configured_log_level
87

9-
if TYPE_CHECKING:
10-
from apify import Configuration
11-
128
# Name of the logger used throughout the library (resolves to 'apify')
139
logger_name = __name__.split('.')[0]
1410

@@ -21,11 +17,11 @@ class ActorLogFormatter(CrawleeLogFormatter): # noqa: D101 (Inherited from pare
2117
pass
2218

2319

24-
def _configure_logging(configuration: Configuration) -> None:
20+
def _configure_logging() -> None:
2521
apify_client_logger = logging.getLogger('apify_client')
26-
configure_logger(apify_client_logger, configuration, remove_old_handlers=True)
22+
configure_logger(apify_client_logger, remove_old_handlers=True)
2723

28-
level = get_configured_log_level(configuration)
24+
level = get_configured_log_level()
2925

3026
# Keep apify_client logger quiet unless debug logging is requested
3127
if level > logging.DEBUG:
@@ -42,4 +38,4 @@ def _configure_logging(configuration: Configuration) -> None:
4238

4339
# Use configured log level for apify logger
4440
apify_logger = logging.getLogger('apify')
45-
configure_logger(apify_logger, configuration, remove_old_handlers=True)
41+
configure_logger(apify_logger, remove_old_handlers=True)

tests/integration/conftest.py

Lines changed: 61 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -7,13 +7,15 @@
77
import sys
88
import textwrap
99
from pathlib import Path
10-
from typing import TYPE_CHECKING, Any, Callable, Protocol, cast
10+
from typing import TYPE_CHECKING, Any, Callable, Protocol
1111

1212
import pytest
1313
from filelock import FileLock
1414

1515
from apify_client import ApifyClientAsync
16-
from apify_shared.consts import ActorJobStatus, ActorSourceType
16+
from apify_shared.consts import ActorJobStatus, ActorSourceType, ApifyEnvVars
17+
from crawlee import service_locator
18+
from crawlee.storages import _creation_management
1719

1820
import apify._actor
1921
from ._utils import generate_unique_resource_name
@@ -29,19 +31,67 @@
2931
_SDK_ROOT_PATH = Path(__file__).parent.parent.parent.resolve()
3032

3133

32-
@pytest.fixture(autouse=True)
33-
def _reset_and_patch_default_instances() -> None:
34-
"""Reset the used singletons and patch the default storage client with a temporary directory.
34+
@pytest.fixture
35+
def prepare_test_env(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> Callable[[], None]:
36+
"""Prepare the testing environment by resetting the global state before each test.
37+
38+
This fixture ensures that the global state of the package is reset to a known baseline before each test runs.
39+
It also configures a temporary storage directory for test isolation.
40+
41+
Args:
42+
monkeypatch: Test utility provided by pytest for patching.
43+
tmp_path: A unique temporary directory path provided by pytest for test isolation.
3544
36-
To isolate the tests, we need to reset the used singletons before each test case. We also patch the default
37-
storage client with a tmp_path.
45+
Returns:
46+
A callable that prepares the test environment.
3847
"""
39-
from crawlee import service_container
4048

41-
cast(dict, service_container._services).clear()
42-
delattr(apify._actor.Actor, '__wrapped__')
49+
def _prepare_test_env() -> None:
50+
delattr(apify._actor.Actor, '__wrapped__')
51+
52+
# Set the environment variable for the local storage directory to the temporary path.
53+
monkeypatch.setenv(ApifyEnvVars.LOCAL_STORAGE_DIR, str(tmp_path))
54+
55+
# Reset the flags in the service locator to indicate that no services are explicitly set. This ensures
56+
# a clean state, as services might have been set during a previous test and not reset properly.
57+
service_locator._configuration_was_set = False
58+
service_locator._storage_client_was_set = False
59+
service_locator._event_manager_was_set = False
60+
61+
# Reset the services in the service locator.
62+
service_locator._configuration = None
63+
service_locator._event_manager = None
64+
service_locator._storage_client = None
65+
66+
# Clear creation-related caches to ensure no state is carried over between tests.
67+
monkeypatch.setattr(_creation_management, '_cache_dataset_by_id', {})
68+
monkeypatch.setattr(_creation_management, '_cache_dataset_by_name', {})
69+
monkeypatch.setattr(_creation_management, '_cache_kvs_by_id', {})
70+
monkeypatch.setattr(_creation_management, '_cache_kvs_by_name', {})
71+
monkeypatch.setattr(_creation_management, '_cache_rq_by_id', {})
72+
monkeypatch.setattr(_creation_management, '_cache_rq_by_name', {})
73+
74+
# Verify that the test environment was set up correctly.
75+
assert os.environ.get(ApifyEnvVars.LOCAL_STORAGE_DIR) == str(tmp_path)
76+
assert service_locator._configuration_was_set is False
77+
assert service_locator._storage_client_was_set is False
78+
assert service_locator._event_manager_was_set is False
79+
80+
return _prepare_test_env
81+
82+
83+
@pytest.fixture(autouse=True)
84+
def _isolate_test_environment(prepare_test_env: Callable[[], None]) -> None:
85+
"""Isolate the testing environment by resetting global state before each test.
86+
87+
This fixture ensures that each test starts with a clean slate and that any modifications during the test
88+
do not affect subsequent tests. It runs automatically for all tests.
89+
90+
Args:
91+
prepare_test_env: Fixture to prepare the environment before each test.
92+
"""
4393

44-
# TODO: StorageClientManager local storage client purge # noqa: TD003
94+
prepare_test_env()
4595

4696

4797
@pytest.fixture

0 commit comments

Comments
 (0)