Skip to content

Commit 37a1adf

Browse files
committed
feat: Upgrade to Crawlee v0.5
1 parent fa7bb9d commit 37a1adf

File tree

12 files changed

+233
-129
lines changed

12 files changed

+233
-129
lines changed

poetry.lock

Lines changed: 20 additions & 20 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,16 @@ keywords = [
4545
python = "^3.9"
4646
apify-client = ">=1.8.1"
4747
apify-shared = ">=1.2.1"
48-
crawlee = "~0.4.0"
48+
crawlee = "==0.5.0b17"
4949
cryptography = ">=42.0.0"
5050
# TODO: relax the upper bound once the issue is resolved:
5151
# https://github.com/apify/apify-sdk-python/issues/348
5252
httpx = "~0.27.0"
5353
lazy-object-proxy = ">=1.10.0"
5454
scrapy = { version = ">=2.11.0", optional = true }
5555
typing-extensions = ">=4.1.0"
56+
# TODO: relax the upper bound once the issue is resolved:
57+
# https://github.com/apify/apify-sdk-python/issues/325
5658
websockets = ">=10.0 <14.0.0"
5759

5860
[tool.poetry.group.dev.dependencies]

src/apify/_actor.py

Lines changed: 34 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,9 @@
1212
from apify_client import ApifyClientAsync
1313
from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars
1414
from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value
15-
from crawlee import service_container
15+
from crawlee import service_locator
1616
from crawlee.events._types import Event, EventPersistStateData
17+
from crawlee.memory_storage_client import MemoryStorageClient
1718

1819
from apify._configuration import Configuration
1920
from apify._consts import EVENT_LISTENERS_TIMEOUT
@@ -69,17 +70,22 @@ def __init__(
6970
self._configure_logging = configure_logging
7071
self._apify_client = self.new_client()
7172

72-
self._event_manager: EventManager
73-
if self._configuration.is_at_home:
74-
self._event_manager = PlatformEventManager(
75-
config=self._configuration,
76-
persist_state_interval=self._configuration.persist_state_interval,
73+
# We need to keep both local & cloud storage clients because of the `force_cloud` option.
74+
self._local_storage_client = MemoryStorageClient.from_config(config=self.config)
75+
self._cloud_storage_client = ApifyStorageClient.from_config(config=self.config)
76+
77+
# Set the event manager based on whether the Actor is running on the platform or locally.
78+
self._event_manager = (
79+
PlatformEventManager(
80+
config=self.config,
81+
persist_state_interval=self.config.persist_state_interval,
7782
)
78-
else:
79-
self._event_manager = LocalEventManager(
80-
system_info_interval=self._configuration.system_info_interval,
81-
persist_state_interval=self._configuration.persist_state_interval,
83+
if self.is_at_home()
84+
else LocalEventManager(
85+
system_info_interval=self.config.system_info_interval,
86+
persist_state_interval=self.config.persist_state_interval,
8287
)
88+
)
8389

8490
self._is_initialized = False
8591

@@ -93,7 +99,7 @@ async def __aenter__(self) -> Self:
9399
executing the block code, the `Actor.fail` method is called.
94100
"""
95101
if self._configure_logging:
96-
_configure_logging(self._configuration)
102+
_configure_logging()
97103

98104
await self.init()
99105
return self
@@ -182,18 +188,17 @@ async def init(self) -> None:
182188
if self._is_initialized:
183189
raise RuntimeError('The Actor was already initialized!')
184190

185-
if self._configuration.token:
186-
service_container.set_cloud_storage_client(ApifyStorageClient(configuration=self._configuration))
191+
self._is_exiting = False
192+
self._was_final_persist_state_emitted = False
187193

188-
if self._configuration.is_at_home:
189-
service_container.set_default_storage_client_type('cloud')
194+
# Register services in the service locator.
195+
if self.is_at_home():
196+
service_locator.set_storage_client(self._cloud_storage_client)
190197
else:
191-
service_container.set_default_storage_client_type('local')
198+
service_locator.set_storage_client(self._local_storage_client)
192199

193-
service_container.set_event_manager(self._event_manager)
194-
195-
self._is_exiting = False
196-
self._was_final_persist_state_emitted = False
200+
service_locator.set_event_manager(self.event_manager)
201+
service_locator.set_configuration(self.configuration)
197202

198203
self.log.info('Initializing Actor...')
199204
self.log.info('System info', extra=get_system_info())
@@ -243,7 +248,6 @@ async def finalize() -> None:
243248
await self._event_manager.wait_for_all_listeners_to_complete(timeout=event_listeners_timeout)
244249

245250
await self._event_manager.__aexit__(None, None, None)
246-
cast(dict, service_container._services).clear() # noqa: SLF001
247251

248252
await asyncio.wait_for(finalize(), cleanup_timeout.total_seconds())
249253
self._is_initialized = False
@@ -347,11 +351,13 @@ async def open_dataset(
347351
self._raise_if_not_initialized()
348352
self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)
349353

354+
storage_client = self._cloud_storage_client if force_cloud else service_locator.get_storage_client()
355+
350356
return await Dataset.open(
351357
id=id,
352358
name=name,
353359
configuration=self._configuration,
354-
storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None),
360+
storage_client=storage_client,
355361
)
356362

357363
async def open_key_value_store(
@@ -379,12 +385,13 @@ async def open_key_value_store(
379385
"""
380386
self._raise_if_not_initialized()
381387
self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)
388+
storage_client = self._cloud_storage_client if force_cloud else service_locator.get_storage_client()
382389

383390
return await KeyValueStore.open(
384391
id=id,
385392
name=name,
386393
configuration=self._configuration,
387-
storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None),
394+
storage_client=storage_client,
388395
)
389396

390397
async def open_request_queue(
@@ -415,11 +422,13 @@ async def open_request_queue(
415422
self._raise_if_not_initialized()
416423
self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)
417424

425+
storage_client = self._cloud_storage_client if force_cloud else service_locator.get_storage_client()
426+
418427
return await RequestQueue.open(
419428
id=id,
420429
name=name,
421430
configuration=self._configuration,
422-
storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None),
431+
storage_client=storage_client,
423432
)
424433

425434
async def push_data(self, data: dict | list[dict]) -> None:
@@ -941,7 +950,7 @@ async def create_proxy_configuration(
941950
password: str | None = None,
942951
groups: list[str] | None = None,
943952
country_code: str | None = None,
944-
proxy_urls: list[str] | None = None,
953+
proxy_urls: list[str | None] | None = None,
945954
new_url_function: _NewUrlFunction | None = None,
946955
) -> ProxyConfiguration | None:
947956
"""Create a ProxyConfiguration object with the passed proxy configuration.

src/apify/_configuration.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
from datetime import datetime, timedelta
4+
from logging import getLogger
45
from typing import Annotated, Any
56

67
from pydantic import AliasChoices, BeforeValidator, Field
@@ -12,6 +13,8 @@
1213

1314
from apify._utils import docs_group
1415

16+
logger = getLogger(__name__)
17+
1518

1619
def _transform_to_list(value: Any) -> list[str] | None:
1720
if value is None:
@@ -353,6 +356,11 @@ class Configuration(CrawleeConfiguration):
353356
),
354357
] = None
355358

359+
@classmethod
360+
def get_global_configuration(cls) -> Configuration:
361+
"""Retrieve the global instance of the configuration.
356362
357-
# Monkey-patch the base class so that it works with the extended configuration
358-
CrawleeConfiguration.get_global_configuration = Configuration.get_global_configuration # type: ignore[method-assign]
363+
Mostly for backwards compatibility. It is recommended to use `service_locator.get_configuration()`
364+
instead.
365+
"""
366+
return cls()

src/apify/_proxy_configuration.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -111,9 +111,9 @@ def __init__(
111111
password: str | None = None,
112112
groups: list[str] | None = None,
113113
country_code: str | None = None,
114-
proxy_urls: list[str] | None = None,
114+
proxy_urls: list[str | None] | None = None,
115115
new_url_function: _NewUrlFunction | None = None,
116-
tiered_proxy_urls: list[list[str]] | None = None,
116+
tiered_proxy_urls: list[list[str | None]] | None = None,
117117
_actor_config: Configuration | None = None,
118118
_apify_client: ApifyClientAsync | None = None,
119119
) -> None:
@@ -148,7 +148,7 @@ def __init__(
148148
' "groups" or "country_code".'
149149
)
150150

151-
if proxy_urls and any('apify.com' in url for url in proxy_urls):
151+
if proxy_urls and any('apify.com' in (url or '') for url in proxy_urls):
152152
logger.warning(
153153
'Some Apify proxy features may work incorrectly. Please consider setting up Apify properties '
154154
'instead of `proxy_urls`.\n'

src/apify/apify_storage_client/_apify_storage_client.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1+
from __future__ import annotations
2+
3+
from typing import TYPE_CHECKING
4+
15
from typing_extensions import override
26

37
from apify_client import ApifyClientAsync
48
from crawlee._utils.crypto import crypto_random_object_id
59
from crawlee.base_storage_client import BaseStorageClient
610

7-
from apify._configuration import Configuration
811
from apify._utils import docs_group
912
from apify.apify_storage_client._dataset_client import DatasetClient
1013
from apify.apify_storage_client._dataset_collection_client import DatasetCollectionClient
@@ -13,6 +16,9 @@
1316
from apify.apify_storage_client._request_queue_client import RequestQueueClient
1417
from apify.apify_storage_client._request_queue_collection_client import RequestQueueCollectionClient
1518

19+
if TYPE_CHECKING:
20+
from apify._configuration import Configuration
21+
1622

1723
@docs_group('Classes')
1824
class ApifyStorageClient(BaseStorageClient):
@@ -29,6 +35,10 @@ def __init__(self, *, configuration: Configuration) -> None:
2935
)
3036
self._configuration = configuration
3137

38+
@classmethod
39+
def from_config(cls, config: Configuration) -> ApifyStorageClient:
40+
return cls(configuration=config)
41+
3242
@override
3343
def dataset(self, id: str) -> DatasetClient:
3444
return DatasetClient(self._apify_client.dataset(id))

src/apify/log.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,10 @@
11
from __future__ import annotations
22

33
import logging
4-
from typing import TYPE_CHECKING
54

65
from apify_shared.utils import ignore_docs
76
from crawlee._log_config import CrawleeLogFormatter, configure_logger, get_configured_log_level
87

9-
if TYPE_CHECKING:
10-
from apify import Configuration
11-
128
# Name of the logger used throughout the library (resolves to 'apify')
139
logger_name = __name__.split('.')[0]
1410

@@ -21,11 +17,11 @@ class ActorLogFormatter(CrawleeLogFormatter): # noqa: D101 (Inherited from pare
2117
pass
2218

2319

24-
def _configure_logging(configuration: Configuration) -> None:
20+
def _configure_logging() -> None:
2521
apify_client_logger = logging.getLogger('apify_client')
26-
configure_logger(apify_client_logger, configuration, remove_old_handlers=True)
22+
configure_logger(apify_client_logger, remove_old_handlers=True)
2723

28-
level = get_configured_log_level(configuration)
24+
level = get_configured_log_level()
2925

3026
# Keep apify_client logger quiet unless debug logging is requested
3127
if level > logging.DEBUG:
@@ -42,4 +38,4 @@ def _configure_logging(configuration: Configuration) -> None:
4238

4339
# Use configured log level for apify logger
4440
apify_logger = logging.getLogger('apify')
45-
configure_logger(apify_logger, configuration, remove_old_handlers=True)
41+
configure_logger(apify_logger, remove_old_handlers=True)

0 commit comments

Comments
 (0)