Skip to content

Commit a2825bf

Browse files
committed
Wip, TODO: Solve patching of service_locator from Crawlee
1 parent 9c3e7b1 commit a2825bf

File tree

8 files changed

+73
-42
lines changed

8 files changed

+73
-42
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ keywords = [
3636
dependencies = [
3737
"apify-client>=2.0.0,<3.0.0",
3838
"apify-shared>=2.0.0,<3.0.0",
39-
"crawlee @ git+https://github.com/apify/crawlee-python.git@storage-clients-and-configurations",
39+
"crawlee @ git+https://github.com/apify/crawlee-python.git@storage-clients-and-configurations-2",
4040
"cachetools>=5.5.0",
4141
"cryptography>=42.0.0",
4242
# TODO: ensure compatibility with the latest version of lazy-object-proxy

src/apify/_actor.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
from apify_client import ApifyClientAsync
1515
from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars
16+
from crawlee import service_locator
1617
from crawlee.errors import ServiceConflictError
1718
from crawlee.events import (
1819
Event,
@@ -25,7 +26,7 @@
2526
)
2627

2728
from apify._charging import ChargeResult, ChargingManager, ChargingManagerImplementation
28-
from apify._configuration import Configuration, service_locator
29+
from apify._configuration import Configuration
2930
from apify._consts import EVENT_LISTENERS_TIMEOUT
3031
from apify._crypto import decrypt_input_secrets, load_private_key
3132
from apify._models import ActorRun

src/apify/_configuration.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,26 @@
11
from __future__ import annotations
22

33
import traceback
4+
import types
45
from datetime import datetime, timedelta
56
from decimal import Decimal
67
from logging import getLogger
7-
from typing import Annotated, Any
8+
from typing import TYPE_CHECKING, Annotated, Any, cast
89

910
from pydantic import AliasChoices, BeforeValidator, Field, model_validator
1011
from typing_extensions import Self, deprecated
1112

1213
import crawlee
13-
from crawlee._service_locator import ServiceLocator
14+
from crawlee import service_locator
1415
from crawlee._utils.models import timedelta_ms
1516
from crawlee._utils.urls import validate_http_url
1617
from crawlee.configuration import Configuration as CrawleeConfiguration
1718

1819
from apify._utils import docs_group
1920

21+
if TYPE_CHECKING:
22+
from crawlee._service_locator import ServiceLocator
23+
2024
logger = getLogger(__name__)
2125

2226

@@ -431,20 +435,21 @@ def get_global_configuration(cls) -> Configuration:
431435
Mostly for the backwards compatibility. It is recommended to use the `service_locator.get_configuration()`
432436
instead.
433437
"""
434-
return service_locator.get_configuration()
438+
return cast('Configuration', service_locator.get_configuration())
435439

436440

437-
class ApifyServiceLocator(ServiceLocator):
438-
"""Same as ServiceLocator from Crawlee, but it always returns Apify Configuration."""
441+
def _patch_service_locator() -> None:
442+
"""Patch the Crawlee ServiceLocator to ensure that Apify Configuration is always returned."""
443+
old_get_configuration = crawlee.service_locator.get_configuration
439444

440-
def get_configuration(self) -> Configuration:
445+
def get_configuration(self: ServiceLocator) -> Configuration:
441446
# ApifyServiceLocator can store any children of Crawlee Configuration, but in Apify context it is desired to
442447
# return Apify Configuration.
443448
if isinstance(self._configuration, Configuration):
444449
# If Apify configuration was already stored in service locator, return it.
445450
return self._configuration
446451

447-
stored_configuration = super().get_configuration()
452+
stored_configuration = old_get_configuration()
448453
apify_configuration = Configuration()
449454

450455
# Ensure the returned configuration is of type Apify Configuration.
@@ -457,7 +462,8 @@ def get_configuration(self) -> Configuration:
457462

458463
return apify_configuration
459464

465+
crawlee.service_locator.get_configuration = types.MethodType(get_configuration, crawlee.service_locator)
466+
460467

461468
# Ensure that ApifyServiceLocator is used to make sure Apify Configuration is used.
462-
service_locator = ApifyServiceLocator()
463-
crawlee.service_locator = service_locator
469+
_patch_service_locator()

src/apify/storage_clients/_apify/_storage_client.py

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,12 @@
44

55
from typing_extensions import override
66

7+
from crawlee import service_locator
78
from crawlee.storage_clients._base import StorageClient
89

910
from ._dataset_client import ApifyDatasetClient
1011
from ._key_value_store_client import ApifyKeyValueStoreClient
1112
from ._request_queue_client import ApifyRequestQueueClient
12-
from apify._configuration import service_locator
1313
from apify._utils import docs_group
1414

1515
if TYPE_CHECKING:
@@ -40,12 +40,13 @@ async def create_dataset_client(
4040
# Import here to avoid circular imports.
4141
from apify import Configuration as ApifyConfiguration # noqa: PLC0415
4242

43-
if isinstance(self._configuration, ApifyConfiguration):
44-
return await ApifyDatasetClient.open(id=id, name=name, configuration=self._configuration)
43+
configuration = configuration or ApifyConfiguration.get_global_configuration()
44+
if isinstance(configuration, ApifyConfiguration):
45+
return await ApifyDatasetClient.open(id=id, name=name, configuration=configuration)
4546

4647
raise TypeError(
4748
f'Expected "configuration" to be an instance of "apify.Configuration", '
48-
f'but got {type(self._configuration).__name__} instead.'
49+
f'but got {type(configuration).__name__} instead.'
4950
)
5051

5152
@override
@@ -59,12 +60,13 @@ async def create_kvs_client(
5960
# Import here to avoid circular imports.
6061
from apify import Configuration as ApifyConfiguration # noqa: PLC0415
6162

62-
if isinstance(self._configuration, ApifyConfiguration):
63-
return await ApifyKeyValueStoreClient.open(id=id, name=name, configuration=self._configuration)
63+
configuration = configuration or ApifyConfiguration.get_global_configuration()
64+
if isinstance(configuration, ApifyConfiguration):
65+
return await ApifyKeyValueStoreClient.open(id=id, name=name, configuration=configuration)
6466

6567
raise TypeError(
6668
f'Expected "configuration" to be an instance of "apify.Configuration", '
67-
f'but got {type(self._configuration).__name__} instead.'
69+
f'but got {type(configuration).__name__} instead.'
6870
)
6971

7072
@override
@@ -78,15 +80,11 @@ async def create_rq_client(
7880
# Import here to avoid circular imports.
7981
from apify import Configuration as ApifyConfiguration # noqa: PLC0415
8082

81-
if isinstance(self._configuration, ApifyConfiguration):
82-
return await ApifyRequestQueueClient.open(id=id, name=name, configuration=self._configuration)
83+
configuration = configuration or ApifyConfiguration.get_global_configuration()
84+
if isinstance(configuration, ApifyConfiguration):
85+
return await ApifyRequestQueueClient.open(id=id, name=name, configuration=configuration)
8386

8487
raise TypeError(
8588
f'Expected "configuration" to be an instance of "apify.Configuration", '
86-
f'but got {type(self._configuration).__name__} instead.'
89+
f'but got {type(configuration).__name__} instead.'
8790
)
88-
89-
@override
90-
def create_client(self, configuration: Configuration) -> ApifyStorageClient:
91-
"""Create a storage client from an existing storage client potentially just replacing the configuration."""
92-
return ApifyStorageClient(configuration)

tests/integration/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def _prepare_test_env() -> None:
5858
service_locator._configuration = None
5959
service_locator._event_manager = None
6060
service_locator._storage_client = None
61-
service_locator._storage_instance_manager = None
61+
service_locator.storage_instance_manager.clear_cache()
6262

6363
# Verify that the test environment was set up correctly.
6464
assert os.environ.get(ApifyEnvVars.LOCAL_STORAGE_DIR) == str(tmp_path)

tests/unit/actor/test_configuration.py

Lines changed: 37 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
1+
from pathlib import Path
2+
13
import pytest
24

5+
from crawlee import service_locator
36
from crawlee.configuration import Configuration as CrawleeConfiguration
47
from crawlee.crawlers import BasicCrawler
58
from crawlee.errors import ServiceConflictError
69

710
from apify import Actor
811
from apify import Configuration as ApifyConfiguration
9-
from apify._configuration import service_locator
1012

1113

1214
@pytest.mark.parametrize(
@@ -105,9 +107,9 @@ async def test_crawler_implicit_configuration() -> None:
105107
"""Test that crawler and Actor use implicit service_locator based configuration unless explicit configuration
106108
was passed to them."""
107109
async with Actor():
108-
crawler_1 = BasicCrawler()
110+
crawler = BasicCrawler()
109111

110-
assert service_locator.get_configuration() is crawler_1._service_locator.get_configuration()
112+
assert service_locator.get_configuration() is crawler._service_locator.get_configuration()
111113

112114

113115
async def test_crawlers_own_configuration() -> None:
@@ -132,30 +134,54 @@ async def test_crawler_global_configuration() -> None:
132134
service_locator.set_configuration(config_global)
133135

134136
async with Actor():
135-
crawler_1 = BasicCrawler()
137+
crawler = BasicCrawler()
136138

137139
assert service_locator.get_configuration() is config_global
138-
assert crawler_1._service_locator.get_configuration() is config_global
140+
assert crawler._service_locator.get_configuration() is config_global
139141

140142

141-
async def test_storage_retrieved_is_different_with_different_config() -> None:
143+
async def test_storage_retrieved_is_different_with_different_config(tmp_path: Path) -> None:
142144
"""Test that retrieving storage depends on used configuration."""
145+
dir_1 = tmp_path / 'dir_1'
146+
dir_2 = tmp_path / 'dir_2'
143147
config_actor = ApifyConfiguration()
144-
apify_crawler_1 = ApifyConfiguration()
148+
config_actor.storage_dir = str(dir_1)
149+
apify_crawler = ApifyConfiguration()
150+
apify_crawler.storage_dir = str(dir_2)
145151

146152
async with Actor(configuration=config_actor):
147153
actor_kvs = await Actor.open_key_value_store()
148-
crawler_1 = BasicCrawler(configuration=apify_crawler_1)
149-
crawler_kvs = await crawler_1.get_key_value_store()
154+
crawler = BasicCrawler(configuration=apify_crawler)
155+
crawler_kvs = await crawler.get_key_value_store()
150156

151157
assert actor_kvs is not crawler_kvs
152158

153159

160+
async def test_storage_retrieved_is_same_with_equivalent_config() -> None:
161+
"""Test that retrieving storage depends on used configuration. If two same configuration(even if they are different
162+
instances) are used it returns same storage."""
163+
config_actor = ApifyConfiguration()
164+
apify_crawler = ApifyConfiguration()
165+
166+
async with Actor(configuration=config_actor):
167+
actor_kvs = await Actor.open_key_value_store()
168+
crawler = BasicCrawler(configuration=apify_crawler)
169+
crawler_kvs = await crawler.get_key_value_store()
170+
171+
assert actor_kvs is crawler_kvs
172+
173+
154174
async def test_storage_retrieved_is_same_with_same_config() -> None:
155175
"""Test that retrieving storage is same if same configuration is used."""
156176
async with Actor():
157177
actor_kvs = await Actor.open_key_value_store()
158-
crawler_1 = BasicCrawler()
159-
crawler_kvs = await crawler_1.get_key_value_store()
178+
crawler = BasicCrawler()
179+
crawler_kvs = await crawler.get_key_value_store()
160180

161181
assert actor_kvs is crawler_kvs
182+
183+
184+
async def test_crawler_uses_apify_config() -> None:
185+
"""Test that crawler is using ApifyConfiguration in SDK context."""
186+
crawler = BasicCrawler()
187+
assert isinstance(crawler._service_locator.get_configuration(), ApifyConfiguration)

tests/unit/conftest.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@
1313

1414
from apify_client import ApifyClientAsync
1515
from apify_shared.consts import ApifyEnvVars
16+
from crawlee import service_locator
1617

1718
import apify._actor
18-
from apify._configuration import service_locator
1919

2020
if TYPE_CHECKING:
2121
from collections.abc import Callable, Iterator
@@ -49,7 +49,7 @@ def _prepare_test_env() -> None:
4949
service_locator._configuration = None
5050
service_locator._event_manager = None
5151
service_locator._storage_client = None
52-
service_locator._storage_instance_manager = None
52+
service_locator.storage_instance_manager.clear_cache()
5353

5454
# Verify that the test environment was set up correctly.
5555
assert os.environ.get(ApifyEnvVars.LOCAL_STORAGE_DIR) == str(tmp_path)

uv.lock

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)