Skip to content

Commit aa3667f

Browse files
authored
fix: Relax ServiceLocator restrictions (#837)
- this makes it possible to overwrite services in the service locator before they are retrieved for the first time - closes #806
1 parent 3f09e87 commit aa3667f

File tree

6 files changed

+134
-33
lines changed

6 files changed

+134
-33
lines changed

src/crawlee/_service_locator.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,16 @@ def __init__(self) -> None:
2020
self._storage_client: BaseStorageClient | None = None
2121

2222
# Flags to check if the services were already set.
23-
self._configuration_was_set = False
24-
self._event_manager_was_set = False
25-
self._storage_client_was_set = False
23+
self._configuration_was_retrieved = False
24+
self._event_manager_was_retrieved = False
25+
self._storage_client_was_retrieved = False
2626

2727
def get_configuration(self) -> Configuration:
2828
"""Get the configuration."""
2929
if self._configuration is None:
3030
self._configuration = Configuration()
3131

32+
self._configuration_was_retrieved = True
3233
return self._configuration
3334

3435
def set_configuration(self, configuration: Configuration) -> None:
@@ -38,13 +39,12 @@ def set_configuration(self, configuration: Configuration) -> None:
3839
configuration: The configuration to set.
3940
4041
Raises:
41-
ServiceConflictError: If the configuration was already set.
42+
ServiceConflictError: If the configuration has already been retrieved before.
4243
"""
43-
if self._configuration_was_set:
44+
if self._configuration_was_retrieved:
4445
raise ServiceConflictError(Configuration, configuration, self._configuration)
4546

4647
self._configuration = configuration
47-
self._configuration_was_set = True
4848

4949
def get_event_manager(self) -> EventManager:
5050
"""Get the event manager."""
@@ -53,6 +53,7 @@ def get_event_manager(self) -> EventManager:
5353

5454
self._event_manager = LocalEventManager()
5555

56+
self._event_manager_was_retrieved = True
5657
return self._event_manager
5758

5859
def set_event_manager(self, event_manager: EventManager) -> None:
@@ -62,13 +63,12 @@ def set_event_manager(self, event_manager: EventManager) -> None:
6263
event_manager: The event manager to set.
6364
6465
Raises:
65-
ServiceConflictError: If the event manager was already set.
66+
ServiceConflictError: If the event manager has already been retrieved before.
6667
"""
67-
if self._event_manager_was_set:
68+
if self._event_manager_was_retrieved:
6869
raise ServiceConflictError(EventManager, event_manager, self._event_manager)
6970

7071
self._event_manager = event_manager
71-
self._event_manager_was_set = True
7272

7373
def get_storage_client(self) -> BaseStorageClient:
7474
"""Get the storage client."""
@@ -77,6 +77,7 @@ def get_storage_client(self) -> BaseStorageClient:
7777

7878
self._storage_client = MemoryStorageClient.from_config()
7979

80+
self._storage_client_was_retrieved = True
8081
return self._storage_client
8182

8283
def set_storage_client(self, storage_client: BaseStorageClient) -> None:
@@ -86,13 +87,12 @@ def set_storage_client(self, storage_client: BaseStorageClient) -> None:
8687
storage_client: The storage client to set.
8788
8889
Raises:
89-
ServiceConflictError: If the storage client was already set.
90+
ServiceConflictError: If the storage client has already been retrieved before.
9091
"""
91-
if self._storage_client_was_set:
92+
if self._storage_client_was_retrieved:
9293
raise ServiceConflictError(BaseStorageClient, storage_client, self._storage_client)
9394

9495
self._storage_client = storage_client
95-
self._storage_client_was_set = True
9696

9797

9898
service_locator = ServiceLocator()

src/crawlee/errors.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,11 @@ class SessionError(Exception):
3838

3939
@docs_group('Errors')
4040
class ServiceConflictError(Exception):
41-
"""Raised when attempting to reassign a service in service container that was already configured."""
41+
"""Raised when attempting to reassign a service in service container that is already in use."""
4242

4343
def __init__(self, service: type, new_value: object, existing_value: object) -> None:
4444
super().__init__(
45-
f'Service {service.__name__} has already been set. Existing value: {existing_value}, '
45+
f'Service {service.__name__} is already in use. Existing value: {existing_value}, '
4646
f'attempted new value: {new_value}.'
4747
)
4848

tests/unit/_memory_storage_client/test_memory_storage_client.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ async def test_not_implemented_method(tmp_path: Path) -> None:
230230
async def test_default_storage_path_used(monkeypatch: pytest.MonkeyPatch) -> None:
231231
# Reset the configuration in service locator
232232
service_locator._configuration = None
233-
service_locator._configuration_was_set = False
233+
service_locator._configuration_was_retrieved = False
234234

235235
# Remove the env var for setting the storage directory
236236
monkeypatch.delenv('CRAWLEE_STORAGE_DIR', raising=False)

tests/unit/basic_crawler/test_basic_crawler.py

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,21 @@
99
from dataclasses import dataclass
1010
from datetime import timedelta
1111
from pathlib import Path
12-
from typing import TYPE_CHECKING, Any
12+
from typing import TYPE_CHECKING, Any, cast
1313
from unittest.mock import AsyncMock, Mock
1414

1515
import httpx
1616
import pytest
1717

18-
from crawlee import ConcurrencySettings, EnqueueStrategy, Glob
18+
from crawlee import ConcurrencySettings, EnqueueStrategy, Glob, service_locator
1919
from crawlee._request import BaseRequestData, Request
2020
from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpHeaders
2121
from crawlee.basic_crawler import BasicCrawler
2222
from crawlee.configuration import Configuration
2323
from crawlee.errors import SessionError, UserDefinedErrorHandlerError
24+
from crawlee.events._local_event_manager import LocalEventManager
25+
from crawlee.memory_storage_client import MemoryStorageClient
26+
from crawlee.memory_storage_client._dataset_client import DatasetClient
2427
from crawlee.request_loaders import RequestList, RequestManagerTandem
2528
from crawlee.statistics import FinalStatistics
2629
from crawlee.storages import Dataset, KeyValueStore, RequestQueue
@@ -997,3 +1000,49 @@ async def handler(context: BasicCrawlingContext) -> None:
9971000
assert len(processed_urls) == 2
9981001
assert stats.requests_total == 2
9991002
assert stats.requests_finished == 2
1003+
1004+
1005+
async def test_sets_services() -> None:
1006+
custom_configuration = Configuration()
1007+
custom_event_manager = LocalEventManager()
1008+
custom_storage_client = MemoryStorageClient.from_config(custom_configuration)
1009+
1010+
crawler = BasicCrawler(
1011+
configuration=custom_configuration,
1012+
event_manager=custom_event_manager,
1013+
storage_client=custom_storage_client,
1014+
)
1015+
1016+
assert service_locator.get_configuration() is custom_configuration
1017+
assert service_locator.get_event_manager() is custom_event_manager
1018+
assert service_locator.get_storage_client() is custom_storage_client
1019+
1020+
dataset = await crawler.get_dataset(name='test')
1021+
assert cast(DatasetClient, dataset._resource_client)._memory_storage_client is custom_storage_client
1022+
1023+
1024+
async def test_allows_storage_client_overwrite_before_run(monkeypatch: pytest.MonkeyPatch) -> None:
1025+
custom_storage_client = MemoryStorageClient.from_config()
1026+
1027+
crawler = BasicCrawler(
1028+
storage_client=custom_storage_client,
1029+
)
1030+
1031+
@crawler.router.default_handler
1032+
async def handler(context: BasicCrawlingContext) -> None:
1033+
await context.push_data({'foo': 'bar'})
1034+
1035+
other_storage_client = MemoryStorageClient.from_config()
1036+
service_locator.set_storage_client(other_storage_client)
1037+
1038+
with monkeypatch.context() as monkey:
1039+
spy = Mock(wraps=service_locator.get_storage_client)
1040+
monkey.setattr(service_locator, 'get_storage_client', spy)
1041+
await crawler.run(['https://does-not-matter.com'])
1042+
assert spy.call_count >= 1
1043+
1044+
dataset = await crawler.get_dataset()
1045+
assert cast(DatasetClient, dataset._resource_client)._memory_storage_client is other_storage_client
1046+
1047+
data = await dataset.get_data()
1048+
assert data.items == [{'foo': 'bar'}]

tests/unit/conftest.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,9 @@ def _prepare_test_env() -> None:
4242

4343
# Reset the flags in the service locator to indicate that no services are explicitly set. This ensures
4444
# a clean state, as services might have been set during a previous test and not reset properly.
45-
service_locator._configuration_was_set = False
46-
service_locator._storage_client_was_set = False
47-
service_locator._event_manager_was_set = False
45+
service_locator._configuration_was_retrieved = False
46+
service_locator._storage_client_was_retrieved = False
47+
service_locator._event_manager_was_retrieved = False
4848

4949
# Reset the services in the service locator.
5050
service_locator._configuration = None
@@ -61,9 +61,9 @@ def _prepare_test_env() -> None:
6161

6262
# Verify that the test environment was set up correctly.
6363
assert os.environ.get('CRAWLEE_STORAGE_DIR') == str(tmp_path)
64-
assert service_locator._configuration_was_set is False
65-
assert service_locator._storage_client_was_set is False
66-
assert service_locator._event_manager_was_set is False
64+
assert service_locator._configuration_was_retrieved is False
65+
assert service_locator._storage_client_was_retrieved is False
66+
assert service_locator._event_manager_was_retrieved is False
6767

6868
return _prepare_test_env
6969

tests/unit/test_service_locator.py

Lines changed: 62 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,41 +9,93 @@
99
from crawlee.memory_storage_client import MemoryStorageClient
1010

1111

12-
def test_configuration() -> None:
12+
def test_default_configuration() -> None:
1313
default_config = Configuration()
1414
config = service_locator.get_configuration()
15-
assert config == default_config
15+
assert config == default_config # == because these are in fact different instances, which should be fine
1616

17+
18+
def test_custom_configuration() -> None:
1719
custom_config = Configuration(default_browser_path='custom_path')
1820
service_locator.set_configuration(custom_config)
1921
config = service_locator.get_configuration()
20-
assert config == custom_config
22+
assert config is custom_config
23+
24+
25+
def test_configuration_overwrite() -> None:
26+
default_config = Configuration()
27+
service_locator.set_configuration(default_config)
28+
29+
custom_config = Configuration(default_browser_path='custom_path')
30+
service_locator.set_configuration(custom_config)
31+
assert service_locator.get_configuration() is custom_config
32+
2133

22-
with pytest.raises(ServiceConflictError, match='Configuration has already been set.'):
34+
def test_configuration_conflict() -> None:
35+
service_locator.get_configuration()
36+
custom_config = Configuration(default_browser_path='custom_path')
37+
38+
with pytest.raises(ServiceConflictError, match='Configuration is already in use.'):
2339
service_locator.set_configuration(custom_config)
2440

2541

26-
def test_event_manager() -> None:
42+
def test_default_event_manager() -> None:
2743
default_event_manager = service_locator.get_event_manager()
2844
assert isinstance(default_event_manager, LocalEventManager)
2945

46+
47+
def test_custom_event_manager() -> None:
3048
custom_event_manager = LocalEventManager()
3149
service_locator.set_event_manager(custom_event_manager)
3250
event_manager = service_locator.get_event_manager()
33-
assert event_manager == custom_event_manager
51+
assert event_manager is custom_event_manager
52+
53+
54+
def test_event_manager_overwrite() -> None:
55+
custom_event_manager = LocalEventManager()
56+
service_locator.set_event_manager(custom_event_manager)
3457

35-
with pytest.raises(ServiceConflictError, match='EventManager has already been set.'):
58+
another_custom_event_manager = LocalEventManager()
59+
service_locator.set_event_manager(another_custom_event_manager)
60+
61+
assert custom_event_manager != another_custom_event_manager
62+
assert service_locator.get_event_manager() is another_custom_event_manager
63+
64+
65+
def test_event_manager_conflict() -> None:
66+
service_locator.get_event_manager()
67+
custom_event_manager = LocalEventManager()
68+
69+
with pytest.raises(ServiceConflictError, match='EventManager is already in use.'):
3670
service_locator.set_event_manager(custom_event_manager)
3771

3872

39-
def test_storage_client() -> None:
73+
def test_default_storage_client() -> None:
4074
default_storage_client = service_locator.get_storage_client()
4175
assert isinstance(default_storage_client, MemoryStorageClient)
4276

77+
78+
def test_custom_storage_client() -> None:
4379
custom_storage_client = MemoryStorageClient.from_config()
4480
service_locator.set_storage_client(custom_storage_client)
4581
storage_client = service_locator.get_storage_client()
46-
assert storage_client == custom_storage_client
82+
assert storage_client is custom_storage_client
83+
84+
85+
def test_storage_client_overwrite() -> None:
86+
custom_storage_client = MemoryStorageClient.from_config()
87+
service_locator.set_storage_client(custom_storage_client)
88+
89+
another_custom_storage_client = MemoryStorageClient.from_config()
90+
service_locator.set_storage_client(another_custom_storage_client)
91+
92+
assert custom_storage_client != another_custom_storage_client
93+
assert service_locator.get_storage_client() is another_custom_storage_client
94+
95+
96+
def test_storage_client_conflict() -> None:
97+
service_locator.get_storage_client()
98+
custom_storage_client = MemoryStorageClient.from_config()
4799

48-
with pytest.raises(ServiceConflictError, match='StorageClient has already been set.'):
100+
with pytest.raises(ServiceConflictError, match='StorageClient is already in use.'):
49101
service_locator.set_storage_client(custom_storage_client)

0 commit comments

Comments
 (0)