Skip to content

Commit 4c8472c

Browse files
committed
Add ServiceConflictError
1 parent 30ba246 commit 4c8472c

File tree

7 files changed

+167
-34
lines changed

7 files changed

+167
-34
lines changed

src/crawlee/errors.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
'HttpStatusCodeError',
1515
'ProxyError',
1616
'RequestHandlerError',
17+
'ServiceConflictError',
1718
'SessionError',
1819
'UserDefinedErrorHandlerError',
1920
]
@@ -34,6 +35,17 @@ class SessionError(Exception):
3435
"""
3536

3637

38+
@docs_group('Errors')
39+
class ServiceConflictError(Exception):
40+
"""Raised when attempting to reassign a service in service container that was already configured."""
41+
42+
def __init__(self, service: type, new_value: object, existing_value: object) -> None:
43+
super().__init__(
44+
f'Service {service.__name__} has already been set. Existing value: {existing_value}, '
45+
f'attempted new value: {new_value}.'
46+
)
47+
48+
3749
@docs_group('Errors')
3850
class ProxyError(SessionError):
3951
"""Raised when a proxy is being blocked or malfunctions."""

src/crawlee/service_container.py

Lines changed: 79 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,10 @@
11
from __future__ import annotations
22

3-
from dataclasses import dataclass, field
4-
from typing import TYPE_CHECKING
5-
63
from crawlee._utils.docs import docs_group
7-
8-
if TYPE_CHECKING:
9-
from crawlee.base_storage_client._base_storage_client import BaseStorageClient
10-
from crawlee.configuration import Configuration
11-
from crawlee.events._event_manager import EventManager
4+
from crawlee.base_storage_client._base_storage_client import BaseStorageClient
5+
from crawlee.configuration import Configuration
6+
from crawlee.errors import ServiceConflictError
7+
from crawlee.events._event_manager import EventManager
128

139
__all__ = [
1410
'get_configuration',
@@ -20,29 +16,33 @@
2016
]
2117

2218

23-
@dataclass
2419
class _ServiceLocator:
2520
"""Service locator for managing the services used by Crawlee.
2621
2722
All services are lazily initialized to their default values.
2823
"""
2924

30-
_configuration: Configuration | None = field(default=None, init=False)
31-
_event_manager: EventManager | None = field(default=None, init=False)
32-
_storage_client: BaseStorageClient | None = field(default=None, init=False)
25+
def __init__(self) -> None:
26+
self._configuration: Configuration | None = None
27+
self._event_manager: EventManager | None = None
28+
self._storage_client: BaseStorageClient | None = None
29+
30+
# Flags tracking whether each service was explicitly assigned through its setter.
31+
self._configuration_was_set = False
32+
self._event_manager_was_set = False
33+
self._storage_client_was_set = False
3334

3435
@property
3536
def configuration(self) -> Configuration:
3637
if self._configuration is None:
37-
from crawlee.configuration import Configuration
38-
3938
self._configuration = Configuration()
4039

4140
return self._configuration
4241

4342
@configuration.setter
4443
def configuration(self, value: Configuration) -> None:
4544
self._configuration = value
45+
self._configuration_was_set = True
4646

4747
@property
4848
def storage_client(self) -> BaseStorageClient:
@@ -56,6 +56,7 @@ def storage_client(self) -> BaseStorageClient:
5656
@storage_client.setter
5757
def storage_client(self, value: BaseStorageClient) -> None:
5858
self._storage_client = value
59+
self._storage_client_was_set = True
5960

6061
@property
6162
def event_manager(self) -> EventManager:
@@ -69,6 +70,19 @@ def event_manager(self) -> EventManager:
6970
@event_manager.setter
7071
def event_manager(self, value: EventManager) -> None:
7172
self._event_manager = value
73+
self._event_manager_was_set = True
74+
75+
@property
76+
def configuration_was_set(self) -> bool:
77+
return self._configuration_was_set
78+
79+
@property
80+
def event_manager_was_set(self) -> bool:
81+
return self._event_manager_was_set
82+
83+
@property
84+
def storage_client_was_set(self) -> bool:
85+
return self._storage_client_was_set
7286

7387

7488
_service_locator = _ServiceLocator()
@@ -81,8 +95,23 @@ def get_configuration() -> Configuration:
8195

8296

8397
@docs_group('Functions')
84-
def set_configuration(configuration: Configuration) -> None:
85-
"""Set the configuration."""
98+
def set_configuration(
99+
configuration: Configuration,
100+
*,
101+
force: bool = False,
102+
) -> None:
103+
"""Set the configuration.
104+
105+
Args:
106+
configuration: The configuration to set.
107+
force: If True, the configuration will be set even if it was already set.
108+
109+
Raises:
110+
ServiceConflictError: If the configuration was already set and `force` is False.
111+
"""
112+
if _service_locator.configuration_was_set and not force:
113+
raise ServiceConflictError(Configuration, configuration, _service_locator.configuration)
114+
86115
_service_locator.configuration = configuration
87116

88117

@@ -93,8 +122,23 @@ def get_event_manager() -> EventManager:
93122

94123

95124
@docs_group('Functions')
96-
def set_event_manager(event_manager: EventManager) -> None:
97-
"""Set the event manager."""
125+
def set_event_manager(
126+
event_manager: EventManager,
127+
*,
128+
force: bool = False,
129+
) -> None:
130+
"""Set the event manager.
131+
132+
Args:
133+
event_manager: The event manager to set.
134+
force: If True, the event manager will be set even if it was already set.
135+
136+
Raises:
137+
ServiceConflictError: If the event manager was already set and `force` is False.
138+
"""
139+
if _service_locator.event_manager_was_set and not force:
140+
raise ServiceConflictError(EventManager, event_manager, _service_locator.event_manager)
141+
98142
_service_locator.event_manager = event_manager
99143

100144

@@ -105,6 +149,21 @@ def get_storage_client() -> BaseStorageClient:
105149

106150

107151
@docs_group('Functions')
108-
def set_storage_client(storage_client: BaseStorageClient) -> None:
109-
"""Set the storage client."""
152+
def set_storage_client(
153+
storage_client: BaseStorageClient,
154+
*,
155+
force: bool = False,
156+
) -> None:
157+
"""Set the storage client.
158+
159+
Args:
160+
storage_client: The storage client to set.
161+
force: If True, the storage client will be set even if it was already set.
162+
163+
Raises:
164+
ServiceConflictError: If the storage client was already set and `force` is False.
165+
"""
166+
if _service_locator.storage_client_was_set and not force:
167+
raise ServiceConflictError(BaseStorageClient, storage_client, _service_locator.storage_client)
168+
110169
_service_locator.storage_client = storage_client

tests/unit/_memory_storage_client/test_memory_storage_client.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -224,15 +224,15 @@ async def test_not_implemented_method(tmp_path: Path) -> None:
224224
async def test_default_storage_path_used(monkeypatch: pytest.MonkeyPatch) -> None:
225225
# We expect the default value to be used
226226
monkeypatch.delenv('CRAWLEE_STORAGE_DIR', raising=False)
227-
service_container.set_configuration(Configuration())
227+
service_container.set_configuration(Configuration(), force=True)
228228
ms = MemoryStorageClient()
229229
assert ms.storage_dir == './storage'
230230

231231

232232
async def test_storage_path_from_env_var_overrides_default(monkeypatch: pytest.MonkeyPatch) -> None:
233233
# We expect the env var to override the default value
234234
monkeypatch.setenv('CRAWLEE_STORAGE_DIR', './env_var_storage_dir')
235-
service_container.set_configuration(Configuration())
235+
service_container.set_configuration(Configuration(), force=True)
236236
ms = MemoryStorageClient()
237237
assert ms.storage_dir == './env_var_storage_dir'
238238

tests/unit/basic_crawler/test_basic_crawler.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import pytest
1616
from httpx import URL
1717

18-
from crawlee import ConcurrencySettings, EnqueueStrategy, Glob
18+
from crawlee import ConcurrencySettings, EnqueueStrategy, Glob, service_container
1919
from crawlee._request import BaseRequestData, Request
2020
from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpHeaders
2121
from crawlee.basic_crawler import BasicCrawler
@@ -761,14 +761,15 @@ async def handler(context: BasicCrawlingContext) -> None:
761761

762762

763763
async def test_respects_no_persist_storage() -> None:
764-
configuration = Configuration(persist_storage=False)
765-
crawler = BasicCrawler(configuration=configuration)
764+
config = Configuration(persist_storage=False)
765+
service_container.set_configuration(config, force=True)
766+
crawler = BasicCrawler()
766767

767768
@crawler.router.default_handler
768769
async def handler(context: BasicCrawlingContext) -> None:
769770
await context.push_data({'something': 'something'})
770771

771-
datasets_path = Path(configuration.storage_dir) / 'datasets' / 'default'
772+
datasets_path = Path(config.storage_dir) / 'datasets' / 'default'
772773
assert not datasets_path.exists() or list(datasets_path.iterdir()) == []
773774

774775

tests/unit/conftest.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,11 @@
77
from typing import TYPE_CHECKING, Callable
88

99
import pytest
10-
from apify import Configuration
1110
from httpx import URL
1211
from proxy import Proxy
1312

1413
from crawlee import service_container
14+
from crawlee.configuration import Configuration
1515
from crawlee.events._local_event_manager import LocalEventManager
1616
from crawlee.memory_storage_client import MemoryStorageClient
1717
from crawlee.proxy_configuration import ProxyInfo
@@ -29,9 +29,9 @@ def reset() -> None:
2929
monkeypatch.setenv('CRAWLEE_STORAGE_DIR', str(tmp_path))
3030

3131
# Reset services in crawlee.service_container
32-
service_container.set_configuration(Configuration())
33-
service_container.set_storage_client(MemoryStorageClient())
34-
service_container.set_event_manager(LocalEventManager())
32+
service_container.set_configuration(Configuration(), force=True)
33+
service_container.set_storage_client(MemoryStorageClient(), force=True)
34+
service_container.set_event_manager(LocalEventManager(), force=True)
3535

3636
# Clear creation-related caches to ensure no state is carried over between tests
3737
monkeypatch.setattr(_creation_management, '_cache_dataset_by_id', {})

tests/unit/test_configuration.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,13 @@ def test_global_configuration_works() -> None:
2424

2525

2626
async def test_storage_not_persisted_when_disabled(tmp_path: Path) -> None:
27-
configuration = Configuration(
27+
config = Configuration(
2828
persist_storage=False,
2929
write_metadata=False,
3030
crawlee_storage_dir=str(tmp_path), # type: ignore
3131
)
32-
set_storage_client(MemoryStorageClient(configuration=configuration))
32+
storage_client = MemoryStorageClient(config)
33+
set_storage_client(storage_client, force=True)
3334

3435
crawler = HttpCrawler()
3536

@@ -45,12 +46,13 @@ async def default_handler(context: HttpCrawlingContext) -> None:
4546

4647

4748
async def test_storage_persisted_when_enabled(tmp_path: Path) -> None:
48-
configuration = Configuration(
49+
config = Configuration(
4950
persist_storage=True,
5051
write_metadata=True,
5152
crawlee_storage_dir=str(tmp_path), # type: ignore
5253
)
53-
set_storage_client(MemoryStorageClient(configuration=configuration))
54+
storage_client = MemoryStorageClient(config)
55+
set_storage_client(storage_client, force=True)
5456

5557
crawler = HttpCrawler()
5658

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
from __future__ import annotations
2+
3+
import pytest
4+
5+
from crawlee.configuration import Configuration
6+
from crawlee.errors import ServiceConflictError
7+
from crawlee.events import LocalEventManager
8+
from crawlee.memory_storage_client import MemoryStorageClient
9+
from crawlee.service_container import (
10+
get_configuration,
11+
get_event_manager,
12+
get_storage_client,
13+
set_configuration,
14+
set_event_manager,
15+
set_storage_client,
16+
)
17+
18+
19+
def test_configuration() -> None:
20+
default_config = Configuration()
21+
config = get_configuration()
22+
assert config == default_config
23+
24+
custom_config = Configuration(default_browser_path='custom_path')
25+
26+
with pytest.raises(ServiceConflictError, match='Configuration has already been set.'):
27+
set_configuration(custom_config)
28+
29+
set_configuration(custom_config, force=True)
30+
config = get_configuration()
31+
assert config == custom_config
32+
33+
34+
def test_event_manager() -> None:
35+
default_event_manager = get_event_manager()
36+
assert isinstance(default_event_manager, LocalEventManager)
37+
38+
custom_event_manager = LocalEventManager()
39+
40+
with pytest.raises(ServiceConflictError, match='EventManager has already been set.'):
41+
set_event_manager(custom_event_manager)
42+
43+
set_event_manager(custom_event_manager, force=True)
44+
event_manager = get_event_manager()
45+
assert event_manager == custom_event_manager
46+
47+
48+
def test_storage_client() -> None:
49+
default_storage_client = get_storage_client()
50+
assert isinstance(default_storage_client, MemoryStorageClient)
51+
52+
custom_storage_client = MemoryStorageClient()
53+
54+
with pytest.raises(ServiceConflictError, match='StorageClient has already been set.'):
55+
set_storage_client(custom_storage_client)
56+
57+
set_storage_client(custom_storage_client, force=True)
58+
storage_client = get_storage_client()
59+
assert storage_client == custom_storage_client

0 commit comments

Comments
 (0)