
Commit baa8d76

same approach for event manager and storage client

1 parent b72f094 commit baa8d76

File tree

9 files changed: +46 -104 lines changed
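
The commit applies the service-locator approach already used for `Configuration` to the `EventManager` and `BaseStorageClient`: components stop accepting these as constructor arguments and instead resolve them from `crawlee.service_container` at construction time. A minimal sketch of the pattern with illustrative internals; only the `get_configuration()`, `get_event_manager()`, and `get_storage_client()` accessors are confirmed by this diff:

```python
# Sketch of the service-locator pattern this commit converges on.
# The registry internals below are illustrative; the diff only shows
# the get_* accessor side being called.

_services: dict[str, object] = {}


def set_event_manager(event_manager: object) -> None:
    """Register a process-wide event manager (hypothetical setter)."""
    _services['event_manager'] = event_manager


def get_event_manager() -> object:
    """Return the registered event manager for dependents to share."""
    return _services['event_manager']
```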

src/crawlee/_autoscaling/snapshotter.py

Lines changed: 4 additions & 13 deletions
```diff
@@ -10,13 +10,8 @@
 import psutil
 from sortedcontainers import SortedList
 
-from crawlee._autoscaling.types import (
-    ClientSnapshot,
-    CpuSnapshot,
-    EventLoopSnapshot,
-    MemorySnapshot,
-    Snapshot,
-)
+from crawlee import service_container
+from crawlee._autoscaling.types import ClientSnapshot, CpuSnapshot, EventLoopSnapshot, MemorySnapshot, Snapshot
 from crawlee._utils.byte_size import ByteSize
 from crawlee._utils.context import ensure_context
 from crawlee._utils.docs import docs_group
@@ -26,8 +21,6 @@
 if TYPE_CHECKING:
     from types import TracebackType
 
-    from crawlee.events import EventManager
-
 logger = getLogger(__name__)
 
 T = TypeVar('T')
@@ -45,7 +38,6 @@ class Snapshotter:
 
     def __init__(
         self,
-        event_manager: EventManager,
         *,
         event_loop_snapshot_interval: timedelta = timedelta(milliseconds=500),
         client_snapshot_interval: timedelta = timedelta(milliseconds=1000),
@@ -63,8 +55,6 @@ def __init__(
         """A default constructor.
 
         Args:
-            event_manager: The event manager used to emit system info events. From data provided by this event
-                the CPU and memory usage are read.
             event_loop_snapshot_interval: The interval at which the event loop is sampled.
             client_snapshot_interval: The interval at which the client is sampled.
             max_used_cpu_ratio: Sets the ratio, defining the maximum CPU usage. When the CPU usage is higher than
@@ -90,7 +80,8 @@
         if available_memory_ratio is None and max_memory_size is None:
             raise ValueError('At least one of `available_memory_ratio` or `max_memory_size` must be specified')
 
-        self._event_manager = event_manager
+        self._event_manager = service_container.get_event_manager()
+
         self._event_loop_snapshot_interval = event_loop_snapshot_interval
         self._client_snapshot_interval = client_snapshot_interval
         self._max_event_loop_delay = max_event_loop_delay
```
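
Call sites therefore shrink to keyword-only options, and the snapshotter wires itself to the container's event manager. A minimal usage sketch, assuming default container contents and an illustrative memory ratio:

```python
from crawlee._autoscaling.snapshotter import Snapshotter

# Before this commit: Snapshotter(event_manager, available_memory_ratio=...).
# Now the event manager comes from service_container.get_event_manager(),
# so only sampling options are passed. The 0.25 ratio is illustrative.
snapshotter = Snapshotter(available_memory_ratio=0.25)
```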

src/crawlee/basic_crawler/_basic_crawler.py

Lines changed: 1 addition & 8 deletions
```diff
@@ -51,7 +51,6 @@
 
     from crawlee._types import ConcurrencySettings, HttpMethod, JsonSerializable
     from crawlee.base_storage_client._models import DatasetItemsListPage
-    from crawlee.events._event_manager import EventManager
     from crawlee.http_clients import BaseHttpClient, HttpResponse
     from crawlee.proxy_configuration import ProxyConfiguration, ProxyInfo
     from crawlee.sessions import Session
@@ -113,9 +112,6 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
     statistics: NotRequired[Statistics[StatisticsState]]
     """A custom `Statistics` instance, allowing the use of non-default configuration."""
 
-    event_manager: NotRequired[EventManager]
-    """A custom `EventManager` instance, allowing the use of non-default configuration."""
-
     configure_logging: NotRequired[bool]
     """If True, the crawler will set up logging infrastructure automatically."""
 
@@ -178,7 +174,6 @@ def __init__(
         retry_on_blocked: bool = True,
         proxy_configuration: ProxyConfiguration | None = None,
         statistics: Statistics | None = None,
-        event_manager: EventManager | None = None,
         configure_logging: bool = True,
         max_crawl_depth: int | None = None,
         abort_on_error: bool = False,
@@ -206,7 +201,6 @@ def __init__(
             retry_on_blocked: If True, the crawler attempts to bypass bot protections automatically.
             proxy_configuration: HTTP proxy configuration used when making requests.
             statistics: A custom `Statistics` instance, allowing the use of non-default configuration.
-            event_manager: A custom `EventManager` instance, allowing the use of non-default configuration.
             configure_logging: If True, the crawler will set up logging infrastructure automatically.
             max_crawl_depth: Maximum crawl depth. If set, the crawler will stop crawling after reaching this depth.
             abort_on_error: If True, the crawler stops immediately when any request handler error occurs.
@@ -247,9 +241,8 @@
 
         self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name)
 
-        self._event_manager = event_manager or service_container.get_event_manager()
+        self._event_manager = service_container.get_event_manager()
         self._snapshotter = Snapshotter(
-            self._event_manager,
            max_memory_size=ByteSize.from_mb(config.memory_mbytes) if config.memory_mbytes else None,
            available_memory_ratio=config.available_memory_ratio,
        )
```
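
With the `event_manager` parameter removed from `BasicCrawlerOptions` and `__init__`, a non-default event manager now has to be registered with the container before the crawler is built. A hedged sketch, assuming the container exposes a `set_event_manager()` counterpart to the `get_event_manager()` call above:

```python
from crawlee import service_container
from crawlee.basic_crawler import BasicCrawler
from crawlee.events import LocalEventManager

# Hypothetical registration step; only get_event_manager() appears in
# this diff. The crawler then resolves the manager internally.
service_container.set_event_manager(LocalEventManager())

crawler = BasicCrawler()
```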

src/crawlee/configuration.py

Lines changed: 15 additions & 0 deletions
```diff
@@ -5,7 +5,9 @@
 
 from pydantic import AliasChoices, BeforeValidator, Field
 from pydantic_settings import BaseSettings, SettingsConfigDict
+from typing_extensions import Self
 
+from crawlee import service_container
 from crawlee._utils.docs import docs_group
 from crawlee._utils.models import timedelta_ms
 
@@ -230,3 +232,16 @@ class Configuration(BaseSettings):
         ),
     ] = False
     """This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670."""
+
+    @classmethod
+    def get_global_configuration(cls) -> Self:
+        """Retrieve the global instance of the configuration.
+
+        Mostly for the backward compatibility.
+        """
+        config = service_container.get_configuration()
+
+        if not isinstance(config, cls):
+            raise TypeError(f'Requested configuration of type {cls}, but got {config.__class__} instead.')
+
+        return config
```
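
The new classmethod keeps the old global-configuration entry point alive on top of the container, with a type guard for subclasses. Usage stays a one-liner:

```python
from crawlee.configuration import Configuration

# Returns the Configuration instance held by the service container;
# raises TypeError if the registered instance is not of the
# requested (sub)class.
config = Configuration.get_global_configuration()
```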

src/crawlee/sessions/_session_pool.py

Lines changed: 1 addition & 5 deletions
```diff
@@ -16,8 +16,6 @@
 if TYPE_CHECKING:
     from types import TracebackType
 
-    from crawlee.events import EventManager
-
 logger = getLogger(__name__)
 
 CreateSessionFunctionType = Callable[[], Session]
@@ -33,7 +31,6 @@ def __init__(
         max_pool_size: int = 1000,
         create_session_settings: dict | None = None,
         create_session_function: CreateSessionFunctionType | None = None,
-        event_manager: EventManager | None = None,
         persistence_enabled: bool = False,
         persist_state_kvs_name: str = 'default',
         persist_state_key: str = 'CRAWLEE_SESSION_POOL_STATE',
@@ -47,7 +44,6 @@ def __init__(
                 be used. Do not set it if you are providing a `create_session_function`.
             create_session_function: A callable to create new session instances. If None, a default session settings
                 will be used. Do not set it if you are providing `create_session_settings`.
-            event_manager: The event manager to handle events like persist state.
             persistence_enabled: Flag to enable or disable state persistence of the pool. If it is enabled, make sure
                 to provide an event manager to handle the events.
             persist_state_kvs_name: The name of the `KeyValueStore` used for state persistence.
@@ -56,7 +52,7 @@
         self._max_pool_size = max_pool_size
         self._session_settings = create_session_settings or {}
         self._create_session_function = create_session_function
-        self._event_manager = event_manager
+        self._event_manager = service_container.get_event_manager()
         self._persistence_enabled = persistence_enabled
         self._persist_state_kvs_name = persist_state_kvs_name
         self._persist_state_key = persist_state_key
```
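
`SessionPool` gets the same treatment: the pool binds its persistence listeners to the container-resolved event manager instead of an injected one. A usage sketch under default container contents:

```python
import asyncio

from crawlee.sessions import SessionPool


async def main() -> None:
    # Persistence events are handled by the event manager resolved from
    # service_container; there is no event_manager argument anymore.
    async with SessionPool(persistence_enabled=True) as pool:
        session = await pool.get_session()
        print(session.id)


asyncio.run(main())
```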

src/crawlee/statistics/_statistics.py

Lines changed: 2 additions & 7 deletions
```diff
@@ -20,8 +20,6 @@
 if TYPE_CHECKING:
     from types import TracebackType
 
-    from crawlee.events import EventManager
-
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 
 logger = getLogger(__name__)
@@ -67,7 +65,6 @@ class Statistics(Generic[TStatisticsState]):
     def __init__(
         self,
         *,
-        event_manager: EventManager | None = None,
         persistence_enabled: bool = False,
         persist_state_kvs_name: str = 'default',
         persist_state_key: str | None = None,
@@ -87,9 +84,7 @@ def __init__(
 
         self.error_tracker = ErrorTracker()
         self.error_tracker_retry = ErrorTracker()
-
-        self._events = event_manager or crawlee.service_container.get_event_manager()
-
+        self._event_manager = service_container.get_event_manager()
         self._requests_in_progress = dict[str, RequestProcessingRecord]()
 
         if persist_state_key is None:
@@ -151,7 +146,7 @@ async def __aexit__(
             raise RuntimeError(f'The {self.__class__.__name__} is not active.')
 
         self.state.crawler_finished_at = datetime.now(timezone.utc)
-        self._events.off(event=Event.PERSIST_STATE, listener=self._persist_state)
+        self._event_manager.off(event=Event.PERSIST_STATE, listener=self._persist_state)
         await self._periodic_logger.stop()
         await self._persist_state(event_data=EventPersistStateData(is_migrating=False))
         self._active = False
```
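
`Statistics` also renames the private handle from `_events` to `_event_manager` for consistency and unsubscribes its `PERSIST_STATE` listener through it on exit. A sketch of the lifecycle, assuming defaults:

```python
import asyncio

from crawlee.statistics import Statistics


async def main() -> None:
    # __aenter__/__aexit__ register and remove the Event.PERSIST_STATE
    # listener on the container-resolved event manager.
    async with Statistics(persistence_enabled=True) as statistics:
        ...  # record per-request timings here


asyncio.run(main())
```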

src/crawlee/storages/_creation_management.py

Lines changed: 1 addition & 3 deletions
```diff
@@ -122,13 +122,12 @@ def _get_default_storage_id(configuration: Configuration, storage_class: type[TR
 async def open_storage(
     *,
     storage_class: type[TResource],
-    storage_client: BaseStorageClient | None = None,
     id: str | None = None,
     name: str | None = None,
 ) -> TResource:
     """Open either a new storage or restore an existing one and return it."""
     config = service_container.get_configuration()
-    storage_client = storage_client or service_container.get_storage_client()
+    storage_client = service_container.get_storage_client()
 
     # Try to restore the storage from cache by name
     if name:
@@ -175,7 +174,6 @@ async def open_storage(
             id=storage_info.id,
             name=storage_info.name,
             client=storage_client,
-            event_manager=service_container.get_event_manager(),
         )
     else:
         storage = storage_class(
```
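
Inside the helper, the client is now always container-sourced. A sketch of calling this (internal) helper directly, assuming default services are registered:

```python
import asyncio

from crawlee.storages import Dataset
from crawlee.storages._creation_management import open_storage


async def main() -> None:
    # storage_client can no longer be injected per call; the helper
    # always uses service_container.get_storage_client().
    dataset = await open_storage(storage_class=Dataset, name='my-dataset')
    print(dataset.id)


asyncio.run(main())
```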

src/crawlee/storages/_dataset.py

Lines changed: 7 additions & 22 deletions
```diff
@@ -8,6 +8,7 @@
 
 from typing_extensions import NotRequired, Required, Unpack, override
 
+from crawlee import service_container
 from crawlee._utils.byte_size import ByteSize
 from crawlee._utils.docs import docs_group
 from crawlee._utils.file import json_dumps
@@ -19,7 +20,6 @@
     from collections.abc import AsyncIterator, Callable
 
     from crawlee._types import JsonSerializable, PushDataKwargs
-    from crawlee.base_storage_client import BaseStorageClient
     from crawlee.base_storage_client._models import DatasetItemsListPage
 
 
@@ -192,18 +192,14 @@ class Dataset(BaseStorage):
     _EFFECTIVE_LIMIT_SIZE = _MAX_PAYLOAD_SIZE - (_MAX_PAYLOAD_SIZE * _SAFETY_BUFFER_PERCENT)
     """Calculated payload limit considering safety buffer."""
 
-    def __init__(
-        self,
-        id: str,
-        name: str | None,
-        client: BaseStorageClient,
-    ) -> None:
+    def __init__(self, id: str, name: str | None) -> None:
         self._id = id
         self._name = name
 
         # Get resource clients from storage client
-        self._resource_client = client.dataset(self._id)
-        self._resource_collection_client = client.datasets()
+        storage_client = service_container.get_storage_client()
+        self._resource_client = storage_client.dataset(self._id)
+        self._resource_collection_client = storage_client.datasets()
 
     @override
     @property
@@ -217,21 +213,10 @@ def name(self) -> str | None:
 
     @override
     @classmethod
-    async def open(
-        cls,
-        *,
-        id: str | None = None,
-        name: str | None = None,
-        storage_client: BaseStorageClient | None = None,
-    ) -> Dataset:
+    async def open(cls, *, id: str | None = None, name: str | None = None) -> Dataset:
         from crawlee.storages._creation_management import open_storage
 
-        return await open_storage(
-            storage_class=cls,
-            id=id,
-            name=name,
-            storage_client=storage_client,
-        )
+        return await open_storage(storage_class=cls, id=id, name=name)
 
     @override
     async def drop(self) -> None:
```
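
For end users the visible change is the slimmer `open()` signature; swapping in a different storage client becomes a container-level decision instead of a per-call argument:

```python
import asyncio

from crawlee.storages import Dataset


async def main() -> None:
    # No storage_client parameter anymore; __init__ resolves the client
    # via service_container.get_storage_client().
    dataset = await Dataset.open(name='results')
    await dataset.push_data({'url': 'https://example.com', 'status': 200})


asyncio.run(main())
```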

src/crawlee/storages/_key_value_store.py

Lines changed: 6 additions & 20 deletions
```diff
@@ -4,6 +4,7 @@
 
 from typing_extensions import override
 
+from crawlee import service_container
 from crawlee._utils.docs import docs_group
 from crawlee.base_storage_client._models import KeyValueStoreKeyInfo, KeyValueStoreMetadata
 from crawlee.storages._base_storage import BaseStorage
@@ -51,17 +52,13 @@ class KeyValueStore(BaseStorage):
     ```
     """
 
-    def __init__(
-        self,
-        id: str,
-        name: str | None,
-        client: BaseStorageClient,
-    ) -> None:
+    def __init__(self, id: str, name: str | None) -> None:
         self._id = id
         self._name = name
 
         # Get resource clients from storage client
-        self._resource_client = client.key_value_store(self._id)
+        storage_client = service_container.get_storage_client()
+        self._resource_client = storage_client.key_value_store(self._id)
 
     @override
     @property
@@ -79,21 +76,10 @@ async def get_info(self) -> KeyValueStoreMetadata | None:
 
     @override
     @classmethod
-    async def open(
-        cls,
-        *,
-        id: str | None = None,
-        name: str | None = None,
-        storage_client: BaseStorageClient | None = None,
-    ) -> KeyValueStore:
+    async def open(cls, *, id: str | None = None, name: str | None = None) -> KeyValueStore:
         from crawlee.storages._creation_management import open_storage
 
-        return await open_storage(
-            storage_class=cls,
-            id=id,
-            name=name,
-            storage_client=storage_client,
-        )
+        return await open_storage(storage_class=cls, id=id, name=name)
 
     @override
     async def drop(self) -> None:
```
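
`KeyValueStore.open()` follows the same pattern:

```python
import asyncio

from crawlee.storages import KeyValueStore


async def main() -> None:
    # The resource client is created from the container-resolved
    # storage client inside __init__.
    kvs = await KeyValueStore.open()
    await kvs.set_value('state', {'processed': 0})
    state = await kvs.get_value('state')
    print(state)


asyncio.run(main())
```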

src/crawlee/storages/_request_queue.py

Lines changed: 9 additions & 26 deletions
```diff
@@ -24,8 +24,6 @@
     from collections.abc import Sequence
 
     from crawlee._request import Request
-    from crawlee.base_storage_client import BaseStorageClient
-    from crawlee.events import EventManager
 
 logger = getLogger(__name__)
 
@@ -105,21 +103,17 @@ class RequestQueue(BaseStorage, RequestProvider):
     _STORAGE_CONSISTENCY_DELAY = timedelta(seconds=3)
     """Expected delay for storage to achieve consistency, guiding the timing of subsequent read operations."""
 
-    def __init__(
-        self,
-        id: str,
-        name: str | None,
-        client: BaseStorageClient,
-        event_manager: EventManager,
-    ) -> None:
-        config = service_container.get_configuration()
-
+    def __init__(self, id: str, name: str | None) -> None:
         self._id = id
         self._name = name
 
+        config = service_container.get_configuration()
+        event_manager = service_container.get_event_manager()
+        storage_client = service_container.get_storage_client()
+
         # Get resource clients from storage client
-        self._resource_client = client.request_queue(self._id)
-        self._resource_collection_client = client.request_queues()
+        self._resource_client = storage_client.request_queue(self._id)
+        self._resource_collection_client = storage_client.request_queues()
 
         self._request_lock_time = timedelta(minutes=3)
         self._queue_paused_for_migration = False
@@ -153,21 +147,10 @@ def name(self) -> str | None:
 
     @override
     @classmethod
-    async def open(
-        cls,
-        *,
-        id: str | None = None,
-        name: str | None = None,
-        storage_client: BaseStorageClient | None = None,
-    ) -> RequestQueue:
+    async def open(cls, *, id: str | None = None, name: str | None = None) -> RequestQueue:
         from crawlee.storages._creation_management import open_storage
 
-        return await open_storage(
-            storage_class=cls,
-            id=id,
-            name=name,
-            storage_client=storage_client,
-        )
+        return await open_storage(storage_class=cls, id=id, name=name)
 
     @override
     async def drop(self, *, timeout: timedelta | None = None) -> None:
```
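
`RequestQueue` resolves all three services (configuration, event manager, storage client) in `__init__`, so `open()` likewise drops the `storage_client` parameter. A usage sketch:

```python
import asyncio

from crawlee import Request
from crawlee.storages import RequestQueue


async def main() -> None:
    # Configuration, event manager, and storage client are all pulled
    # from the service container during construction.
    rq = await RequestQueue.open()
    await rq.add_request(Request.from_url('https://example.com'))
    request = await rq.fetch_next_request()
    if request is not None:
        await rq.mark_request_as_handled(request)


asyncio.run(main())
```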
