Skip to content

Commit 89e28b6

Browse files
committed
chore: Integrate service locator from Crawlee [WIP]
1 parent ef6d579 commit 89e28b6

File tree

8 files changed

+388
-289
lines changed

8 files changed

+388
-289
lines changed

poetry.lock

Lines changed: 221 additions & 210 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ keywords = [
4545
python = "^3.9"
4646
apify-client = ">=1.8.1"
4747
apify-shared = ">=1.2.1"
48-
crawlee = "~0.4.0"
48+
crawlee = { git = "https://github.com/apify/crawlee-python.git", branch = "refactor-service-container" }
4949
cryptography = ">=42.0.0"
5050
# TODO: relax the upper bound once the issue is resolved:
5151
# https://github.com/apify/apify-sdk-python/issues/348

src/apify/_actor.py

Lines changed: 28 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,9 @@
1212
from apify_client import ApifyClientAsync
1313
from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars
1414
from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value
15-
from crawlee import service_container
15+
from crawlee import service_locator
1616
from crawlee.events._types import Event, EventPersistStateData
17+
from crawlee.memory_storage_client import MemoryStorageClient
1718

1819
from apify._configuration import Configuration
1920
from apify._consts import EVENT_LISTENERS_TIMEOUT
@@ -69,17 +70,31 @@ def __init__(
6970
self._configure_logging = configure_logging
7071
self._apify_client = self.new_client()
7172

72-
self._event_manager: EventManager
73-
if self._configuration.is_at_home:
74-
self._event_manager = PlatformEventManager(
73+
# We need to keep both local & cloud storage clients because of the `force_cloud` option.
74+
self._local_storage_client = MemoryStorageClient.from_config()
75+
self._cloud_storage_client = ApifyStorageClient(configuration=self._configuration)
76+
77+
# Set the event manager based on whether the Actor is running on the platform or locally.
78+
self._event_manager = (
79+
PlatformEventManager(
7580
config=self._configuration,
7681
persist_state_interval=self._configuration.persist_state_interval,
7782
)
78-
else:
79-
self._event_manager = LocalEventManager(
83+
if self.is_at_home()
84+
else LocalEventManager(
8085
system_info_interval=self._configuration.system_info_interval,
8186
persist_state_interval=self._configuration.persist_state_interval,
8287
)
88+
)
89+
90+
# Register services in the service locator.
91+
if self.is_at_home():
92+
service_locator.set_storage_client(self._cloud_storage_client)
93+
else:
94+
service_locator.set_storage_client(self._local_storage_client)
95+
96+
service_locator.set_event_manager(self.event_manager)
97+
service_locator.set_configuration(self.configuration)
8398

8499
self._is_initialized = False
85100

@@ -93,7 +108,7 @@ async def __aenter__(self) -> Self:
93108
executing the block code, the `Actor.fail` method is called.
94109
"""
95110
if self._configure_logging:
96-
_configure_logging(self._configuration)
111+
_configure_logging()
97112

98113
await self.init()
99114
return self
@@ -172,16 +187,6 @@ async def init(self) -> None:
172187
if self._is_initialized:
173188
raise RuntimeError('The Actor was already initialized!')
174189

175-
if self._configuration.token:
176-
service_container.set_cloud_storage_client(ApifyStorageClient(configuration=self._configuration))
177-
178-
if self._configuration.is_at_home:
179-
service_container.set_default_storage_client_type('cloud')
180-
else:
181-
service_container.set_default_storage_client_type('local')
182-
183-
service_container.set_event_manager(self._event_manager)
184-
185190
self._is_exiting = False
186191
self._was_final_persist_state_emitted = False
187192

@@ -233,7 +238,6 @@ async def finalize() -> None:
233238
await self._event_manager.wait_for_all_listeners_to_complete(timeout=event_listeners_timeout)
234239

235240
await self._event_manager.__aexit__(None, None, None)
236-
cast(dict, service_container._services).clear() # noqa: SLF001
237241

238242
await asyncio.wait_for(finalize(), cleanup_timeout.total_seconds())
239243
self._is_initialized = False
@@ -335,12 +339,13 @@ async def open_dataset(
335339
An instance of the `Dataset` class for the given ID or name.
336340
"""
337341
self._raise_if_not_initialized()
342+
storage_client = self._cloud_storage_client if force_cloud else service_locator.get_storage_client()
338343

339344
return await Dataset.open(
340345
id=id,
341346
name=name,
342347
configuration=self._configuration,
343-
storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None),
348+
storage_client=storage_client,
344349
)
345350

346351
async def open_key_value_store(
@@ -367,12 +372,13 @@ async def open_key_value_store(
367372
An instance of the `KeyValueStore` class for the given ID or name.
368373
"""
369374
self._raise_if_not_initialized()
375+
storage_client = self._cloud_storage_client if force_cloud else service_locator.get_storage_client()
370376

371377
return await KeyValueStore.open(
372378
id=id,
373379
name=name,
374380
configuration=self._configuration,
375-
storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None),
381+
storage_client=storage_client,
376382
)
377383

378384
async def open_request_queue(
@@ -401,12 +407,13 @@ async def open_request_queue(
401407
An instance of the `RequestQueue` class for the given ID or name.
402408
"""
403409
self._raise_if_not_initialized()
410+
storage_client = self._cloud_storage_client if force_cloud else service_locator.get_storage_client()
404411

405412
return await RequestQueue.open(
406413
id=id,
407414
name=name,
408415
configuration=self._configuration,
409-
storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None),
416+
storage_client=storage_client,
410417
)
411418

412419
async def push_data(self, data: dict | list[dict]) -> None:

src/apify/_configuration.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
from datetime import datetime, timedelta
4+
from logging import getLogger
45
from typing import Annotated, Any
56

67
from pydantic import AliasChoices, BeforeValidator, Field
@@ -12,6 +13,8 @@
1213

1314
from apify._utils import docs_group
1415

16+
logger = getLogger(__name__)
17+
1518

1619
def _transform_to_list(value: Any) -> list[str] | None:
1720
if value is None:
@@ -353,6 +356,11 @@ class Configuration(CrawleeConfiguration):
353356
),
354357
] = None
355358

359+
@classmethod
360+
def get_global_configuration(cls) -> Configuration:
361+
"""Retrieve the global instance of the configuration.
356362
357-
# Monkey-patch the base class so that it works with the extended configuration
358-
CrawleeConfiguration.get_global_configuration = Configuration.get_global_configuration # type: ignore[method-assign]
363+
Mostly for backwards compatibility. It is recommended to use `service_locator.get_configuration()`
364+
instead.
365+
"""
366+
return cls()

src/apify/log.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,10 @@
11
from __future__ import annotations
22

33
import logging
4-
from typing import TYPE_CHECKING
54

65
from apify_shared.utils import ignore_docs
76
from crawlee._log_config import CrawleeLogFormatter, configure_logger, get_configured_log_level
87

9-
if TYPE_CHECKING:
10-
from apify import Configuration
11-
128
# Name of the logger used throughout the library (resolves to 'apify')
139
logger_name = __name__.split('.')[0]
1410

@@ -21,11 +17,11 @@ class ActorLogFormatter(CrawleeLogFormatter): # noqa: D101 (Inherited from pare
2117
pass
2218

2319

24-
def _configure_logging(configuration: Configuration) -> None:
20+
def _configure_logging() -> None:
2521
apify_client_logger = logging.getLogger('apify_client')
26-
configure_logger(apify_client_logger, configuration, remove_old_handlers=True)
22+
configure_logger(apify_client_logger, remove_old_handlers=True)
2723

28-
level = get_configured_log_level(configuration)
24+
level = get_configured_log_level()
2925

3026
# Keep apify_client logger quiet unless debug logging is requested
3127
if level > logging.DEBUG:
@@ -42,4 +38,4 @@ def _configure_logging(configuration: Configuration) -> None:
4238

4339
# Use configured log level for apify logger
4440
apify_logger = logging.getLogger('apify')
45-
configure_logger(apify_logger, configuration, remove_old_handlers=True)
41+
configure_logger(apify_logger, remove_old_handlers=True)

tests/integration/conftest.py

Lines changed: 61 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,15 @@
77
import sys
88
import textwrap
99
from pathlib import Path
10-
from typing import TYPE_CHECKING, Any, Callable, Protocol, cast
10+
from typing import TYPE_CHECKING, Any, Callable, Protocol
1111

1212
import pytest
1313
from filelock import FileLock
1414

1515
from apify_client import ApifyClientAsync
16-
from apify_shared.consts import ActorJobStatus, ActorSourceType
16+
from apify_shared.consts import ActorJobStatus, ActorSourceType, ApifyEnvVars
17+
from crawlee import service_locator
18+
from crawlee.storages import _creation_management
1719

1820
import apify._actor
1921
from ._utils import generate_unique_resource_name
@@ -29,19 +31,67 @@
2931
_SDK_ROOT_PATH = Path(__file__).parent.parent.parent.resolve()
3032

3133

32-
@pytest.fixture(autouse=True)
33-
def _reset_and_patch_default_instances() -> None:
34-
"""Reset the used singletons and patch the default storage client with a temporary directory.
34+
@pytest.fixture
35+
def prepare_test_env(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> Callable[[], None]:
36+
"""Prepare the testing environment by resetting the global state before each test.
37+
38+
This fixture ensures that the global state of the package is reset to a known baseline before each test runs.
39+
It also configures a temporary storage directory for test isolation.
40+
41+
Args:
42+
monkeypatch: Test utility provided by pytest for patching.
43+
tmp_path: A unique temporary directory path provided by pytest for test isolation.
3544
36-
To isolate the tests, we need to reset the used singletons before each test case. We also patch the default
37-
storage client with a tmp_path.
45+
Returns:
46+
A callable that prepares the test environment.
3847
"""
39-
from crawlee import service_container
4048

41-
cast(dict, service_container._services).clear()
42-
delattr(apify._actor.Actor, '__wrapped__')
49+
def _prepare_test_env() -> None:
50+
delattr(apify._actor.Actor, '__wrapped__')
51+
52+
# Set the environment variable for the local storage directory to the temporary path.
53+
monkeypatch.setenv(ApifyEnvVars.LOCAL_STORAGE_DIR, str(tmp_path))
54+
55+
# Reset the flags in the service locator to indicate that no services are explicitly set. This ensures
56+
# a clean state, as services might have been set during a previous test and not reset properly.
57+
service_locator._configuration_was_set = False
58+
service_locator._storage_client_was_set = False
59+
service_locator._event_manager_was_set = False
60+
61+
# Reset the services in the service locator.
62+
service_locator._configuration = None
63+
service_locator._event_manager = None
64+
service_locator._storage_client = None
65+
66+
# Clear creation-related caches to ensure no state is carried over between tests.
67+
monkeypatch.setattr(_creation_management, '_cache_dataset_by_id', {})
68+
monkeypatch.setattr(_creation_management, '_cache_dataset_by_name', {})
69+
monkeypatch.setattr(_creation_management, '_cache_kvs_by_id', {})
70+
monkeypatch.setattr(_creation_management, '_cache_kvs_by_name', {})
71+
monkeypatch.setattr(_creation_management, '_cache_rq_by_id', {})
72+
monkeypatch.setattr(_creation_management, '_cache_rq_by_name', {})
73+
74+
# Verify that the test environment was set up correctly.
75+
assert os.environ.get(ApifyEnvVars.LOCAL_STORAGE_DIR) == str(tmp_path)
76+
assert service_locator._configuration_was_set is False
77+
assert service_locator._storage_client_was_set is False
78+
assert service_locator._event_manager_was_set is False
79+
80+
return _prepare_test_env
81+
82+
83+
@pytest.fixture(autouse=True)
84+
def _isolate_test_environment(prepare_test_env: Callable[[], None]) -> None:
85+
"""Isolate the testing environment by resetting global state before each test.
86+
87+
This fixture ensures that each test starts with a clean slate and that any modifications during the test
88+
do not affect subsequent tests. It runs automatically for all tests.
89+
90+
Args:
91+
prepare_test_env: Fixture to prepare the environment before each test.
92+
"""
4393

44-
# TODO: StorageClientManager local storage client purge # noqa: TD003
94+
prepare_test_env()
4595

4696

4797
@pytest.fixture

tests/unit/actor/test_actor_non_default_instance.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
from datetime import timedelta
22

3+
import pytest
4+
35
from apify import Actor, Configuration
46

57

8+
@pytest.mark.only
69
async def test_actor_with_non_default_config() -> None:
710
config = Configuration(internal_timeout=timedelta(minutes=111))
811

0 commit comments

Comments
 (0)