
Commit 0408130

same approach for event manager and storage client

1 parent 1f47475 · commit 0408130

9 files changed: +50 −110 lines changed

src/crawlee/_autoscaling/snapshotter.py

Lines changed: 4 additions & 13 deletions
@@ -10,13 +10,8 @@
 import psutil
 from sortedcontainers import SortedList
 
-from crawlee._autoscaling.types import (
-    ClientSnapshot,
-    CpuSnapshot,
-    EventLoopSnapshot,
-    MemorySnapshot,
-    Snapshot,
-)
+from crawlee import service_container
+from crawlee._autoscaling.types import ClientSnapshot, CpuSnapshot, EventLoopSnapshot, MemorySnapshot, Snapshot
 from crawlee._utils.byte_size import ByteSize
 from crawlee._utils.docs import docs_group
 from crawlee._utils.recurring_task import RecurringTask
@@ -25,8 +20,6 @@
 if TYPE_CHECKING:
     from types import TracebackType
 
-    from crawlee.events import EventManager
-
 logger = getLogger(__name__)
 
 T = TypeVar('T')
@@ -44,7 +37,6 @@ class Snapshotter:
 
     def __init__(
         self,
-        event_manager: EventManager,
         *,
         event_loop_snapshot_interval: timedelta = timedelta(milliseconds=500),
         client_snapshot_interval: timedelta = timedelta(milliseconds=1000),
@@ -62,8 +54,6 @@ def __init__(
         """A default constructor.
 
         Args:
-            event_manager: The event manager used to emit system info events. From data provided by this event
-                the CPU and memory usage are read.
             event_loop_snapshot_interval: The interval at which the event loop is sampled.
             client_snapshot_interval: The interval at which the client is sampled.
             max_used_cpu_ratio: Sets the ratio, defining the maximum CPU usage. When the CPU usage is higher than
@@ -89,7 +79,8 @@ def __init__(
         if available_memory_ratio is None and max_memory_size is None:
             raise ValueError('At least one of `available_memory_ratio` or `max_memory_size` must be specified')
 
-        self._event_manager = event_manager
+        self._event_manager = service_container.get_event_manager()
+
         self._event_loop_snapshot_interval = event_loop_snapshot_interval
         self._client_snapshot_interval = client_snapshot_interval
         self._max_event_loop_delay = max_event_loop_delay
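
With this change the snapshotter no longer receives an event manager; it resolves one itself. A minimal construction sketch under the new signature (the module path follows the file shown above; one of the two memory bounds is still required):

    from crawlee._autoscaling.snapshotter import Snapshotter

    # No event manager argument anymore; Snapshotter calls
    # service_container.get_event_manager() internally.
    snapshotter = Snapshotter(available_memory_ratio=0.25)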

src/crawlee/basic_crawler/_basic_crawler.py

Lines changed: 1 addition & 8 deletions
@@ -50,7 +50,6 @@
 
     from crawlee._types import ConcurrencySettings, HttpMethod, JsonSerializable
     from crawlee.base_storage_client._models import DatasetItemsListPage
-    from crawlee.events._event_manager import EventManager
     from crawlee.http_clients import BaseHttpClient, HttpResponse
     from crawlee.proxy_configuration import ProxyConfiguration, ProxyInfo
     from crawlee.sessions import Session
@@ -112,9 +111,6 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
     statistics: NotRequired[Statistics[StatisticsState]]
     """A custom `Statistics` instance, allowing the use of non-default configuration."""
 
-    event_manager: NotRequired[EventManager]
-    """A custom `EventManager` instance, allowing the use of non-default configuration."""
-
     configure_logging: NotRequired[bool]
     """If True, the crawler will set up logging infrastructure automatically."""
 
@@ -174,7 +170,6 @@ def __init__(
         retry_on_blocked: bool = True,
         proxy_configuration: ProxyConfiguration | None = None,
         statistics: Statistics | None = None,
-        event_manager: EventManager | None = None,
         configure_logging: bool = True,
         max_crawl_depth: int | None = None,
         _context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
@@ -201,7 +196,6 @@ def __init__(
             retry_on_blocked: If True, the crawler attempts to bypass bot protections automatically.
             proxy_configuration: HTTP proxy configuration used when making requests.
             statistics: A custom `Statistics` instance, allowing the use of non-default configuration.
-            event_manager: A custom `EventManager` instance, allowing the use of non-default configuration.
             configure_logging: If True, the crawler will set up logging infrastructure automatically.
             max_crawl_depth: Maximum crawl depth. If set, the crawler will stop crawling after reaching this depth.
             _context_pipeline: Enables extending the request lifecycle and modifying the crawling context.
@@ -241,9 +235,8 @@ def __init__(
 
         self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name)
 
-        self._event_manager = event_manager or service_container.get_event_manager()
+        self._event_manager = service_container.get_event_manager()
         self._snapshotter = Snapshotter(
-            self._event_manager,
             max_memory_size=ByteSize.from_mb(config.memory_mbytes) if config.memory_mbytes else None,
             available_memory_ratio=config.available_memory_ratio,
         )
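
Since `BasicCrawler` no longer accepts `event_manager`, a non-default manager would have to be registered with the service container before the crawler is created. A hedged sketch; `set_event_manager` and the `LocalEventManager` import are assumptions, since only the getter appears in this diff:

    from crawlee import service_container
    from crawlee.basic_crawler import BasicCrawler
    from crawlee.events import LocalEventManager

    # Assumed setter counterpart to service_container.get_event_manager().
    service_container.set_event_manager(LocalEventManager())

    crawler = BasicCrawler()  # picks the manager up from the container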

src/crawlee/configuration.py

Lines changed: 15 additions & 0 deletions
@@ -5,7 +5,9 @@
 
 from pydantic import AliasChoices, BeforeValidator, Field
 from pydantic_settings import BaseSettings, SettingsConfigDict
+from typing_extensions import Self
 
+from crawlee import service_container
 from crawlee._utils.docs import docs_group
 from crawlee._utils.models import timedelta_ms
 
@@ -229,3 +231,16 @@ class Configuration(BaseSettings):
         ),
     ] = False
     """This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670."""
+
+    @classmethod
+    def get_global_configuration(cls) -> Self:
+        """Retrieve the global instance of the configuration.
+
+        Mostly for the backward compatibility.
+        """
+        config = service_container.get_configuration()
+
+        if not isinstance(config, cls):
+            raise TypeError(f'Requested configuration of type {cls}, but got {config.__class__} instead.')
+
+        return config
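
The new classmethod gives callers a container-backed entry point. A usage sketch, grounded in the code added above:

    from crawlee.configuration import Configuration

    # Returns the container-registered instance; raises TypeError if the
    # registered configuration is not an instance of the requested class.
    config = Configuration.get_global_configuration()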

src/crawlee/sessions/_session_pool.py

Lines changed: 2 additions & 5 deletions
@@ -6,6 +6,7 @@
 from logging import getLogger
 from typing import TYPE_CHECKING, Callable, Literal, overload
 
+from crawlee import service_container
 from crawlee._utils.docs import docs_group
 from crawlee.events._types import Event, EventPersistStateData
 from crawlee.sessions import Session
@@ -15,8 +16,6 @@
 if TYPE_CHECKING:
     from types import TracebackType
 
-    from crawlee.events import EventManager
-
 logger = getLogger(__name__)
 
 CreateSessionFunctionType = Callable[[], Session]
@@ -32,7 +31,6 @@ def __init__(
         max_pool_size: int = 1000,
         create_session_settings: dict | None = None,
         create_session_function: CreateSessionFunctionType | None = None,
-        event_manager: EventManager | None = None,
         persistence_enabled: bool = False,
         persist_state_kvs_name: str = 'default',
         persist_state_key: str = 'CRAWLEE_SESSION_POOL_STATE',
@@ -46,7 +44,6 @@ def __init__(
                 be used. Do not set it if you are providing a `create_session_function`.
             create_session_function: A callable to create new session instances. If None, a default session settings
                 will be used. Do not set it if you are providing `create_session_settings`.
-            event_manager: The event manager to handle events like persist state.
             persistence_enabled: Flag to enable or disable state persistence of the pool. If it is enabled, make sure
                 to provide an event manager to handle the events.
             persist_state_kvs_name: The name of the `KeyValueStore` used for state persistence.
@@ -55,7 +52,7 @@
         self._max_pool_size = max_pool_size
         self._session_settings = create_session_settings or {}
         self._create_session_function = create_session_function
-        self._event_manager = event_manager
+        self._event_manager = service_container.get_event_manager()
         self._persistence_enabled = persistence_enabled
         self._persist_state_kvs_name = persist_state_kvs_name
         self._persist_state_key = persist_state_key
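
The pool now always binds persist-state handling to the container's event manager. A minimal sketch of the reduced constructor (no `event_manager` argument), assuming `SessionPool` is re-exported from `crawlee.sessions`:

    import asyncio

    from crawlee.sessions import SessionPool

    async def main() -> None:
        # Persistence events are routed through service_container.get_event_manager().
        async with SessionPool(max_pool_size=100, persistence_enabled=True) as pool:
            session = await pool.get_session()

    asyncio.run(main())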

src/crawlee/statistics/_statistics.py

Lines changed: 4 additions & 9 deletions
@@ -8,7 +8,7 @@
 
 from typing_extensions import Self, TypeVar
 
-import crawlee.service_container
+from crawlee import service_container
 from crawlee._utils.docs import docs_group
 from crawlee._utils.recurring_task import RecurringTask
 from crawlee.events._types import Event, EventPersistStateData
@@ -19,8 +19,6 @@
 if TYPE_CHECKING:
     from types import TracebackType
 
-    from crawlee.events import EventManager
-
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 
 logger = getLogger(__name__)
@@ -66,7 +64,6 @@ class Statistics(Generic[TStatisticsState]):
     def __init__(
         self,
         *,
-        event_manager: EventManager | None = None,
         persistence_enabled: bool = False,
         persist_state_kvs_name: str = 'default',
         persist_state_key: str | None = None,
@@ -86,9 +83,7 @@ def __init__(
 
         self.error_tracker = ErrorTracker()
         self.error_tracker_retry = ErrorTracker()
-
-        self._events = event_manager or crawlee.service_container.get_event_manager()
-
+        self._event_manager = service_container.get_event_manager()
         self._requests_in_progress = dict[str, RequestProcessingRecord]()
 
         if persist_state_key is None:
@@ -114,7 +109,7 @@ async def __aenter__(self) -> Self:
         self._key_value_store = await KeyValueStore.open(name=self._persist_state_kvs_name)
 
         await self._maybe_load_statistics()
-        self._events.on(event=Event.PERSIST_STATE, listener=self._persist_state)
+        self._event_manager.on(event=Event.PERSIST_STATE, listener=self._persist_state)
 
         self._periodic_logger.start()
 
@@ -128,7 +123,7 @@ async def __aexit__(
     ) -> None:
         """Stop collecting statistics."""
         self.state.crawler_finished_at = datetime.now(timezone.utc)
-        self._events.off(event=Event.PERSIST_STATE, listener=self._persist_state)
+        self._event_manager.off(event=Event.PERSIST_STATE, listener=self._persist_state)
         await self._periodic_logger.stop()
         await self._persist_state(event_data=EventPersistStateData(is_migrating=False))

src/crawlee/storages/_creation_management.py

Lines changed: 1 addition & 3 deletions
@@ -122,13 +122,12 @@ def _get_default_storage_id(configuration: Configuration, storage_class: type[TR
 async def open_storage(
     *,
     storage_class: type[TResource],
-    storage_client: BaseStorageClient | None = None,
     id: str | None = None,
     name: str | None = None,
 ) -> TResource:
     """Open either a new storage or restore an existing one and return it."""
     config = service_container.get_configuration()
-    storage_client = storage_client or service_container.get_storage_client()
+    storage_client = service_container.get_storage_client()
 
     # Try to restore the storage from cache by name
     if name:
@@ -175,7 +174,6 @@ async def open_storage(
             id=storage_info.id,
             name=storage_info.name,
             client=storage_client,
-            event_manager=service_container.get_event_manager(),
         )
     else:
         storage = storage_class(
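
`open_storage` now sources both the configuration and the storage client from the container, so callers pass only the storage class and an optional id or name. A sketch of the reduced call:

    import asyncio

    from crawlee.storages import Dataset
    from crawlee.storages._creation_management import open_storage

    async def main() -> None:
        # storage_client is no longer a parameter; the container supplies it.
        dataset = await open_storage(storage_class=Dataset, name='my-dataset')

    asyncio.run(main())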

src/crawlee/storages/_dataset.py

Lines changed: 7 additions & 22 deletions
@@ -8,6 +8,7 @@
 
 from typing_extensions import NotRequired, Required, Unpack, override
 
+from crawlee import service_container
 from crawlee._utils.byte_size import ByteSize
 from crawlee._utils.docs import docs_group
 from crawlee._utils.file import json_dumps
@@ -19,7 +20,6 @@
     from collections.abc import Callable
 
     from crawlee._types import JsonSerializable, PushDataKwargs
-    from crawlee.base_storage_client import BaseStorageClient
     from crawlee.base_storage_client._models import DatasetItemsListPage
 
 
@@ -192,18 +192,14 @@ class Dataset(BaseStorage):
     _EFFECTIVE_LIMIT_SIZE = _MAX_PAYLOAD_SIZE - (_MAX_PAYLOAD_SIZE * _SAFETY_BUFFER_PERCENT)
     """Calculated payload limit considering safety buffer."""
 
-    def __init__(
-        self,
-        id: str,
-        name: str | None,
-        client: BaseStorageClient,
-    ) -> None:
+    def __init__(self, id: str, name: str | None) -> None:
         self._id = id
         self._name = name
 
         # Get resource clients from storage client
-        self._resource_client = client.dataset(self._id)
-        self._resource_collection_client = client.datasets()
+        storage_client = service_container.get_storage_client()
+        self._resource_client = storage_client.dataset(self._id)
+        self._resource_collection_client = storage_client.datasets()
 
     @override
     @property
@@ -217,21 +213,10 @@ def name(self) -> str | None:
 
     @override
     @classmethod
-    async def open(
-        cls,
-        *,
-        id: str | None = None,
-        name: str | None = None,
-        storage_client: BaseStorageClient | None = None,
-    ) -> Dataset:
+    async def open(cls, *, id: str | None = None, name: str | None = None) -> Dataset:
         from crawlee.storages._creation_management import open_storage
 
-        return await open_storage(
-            storage_class=cls,
-            id=id,
-            name=name,
-            storage_client=storage_client,
-        )
+        return await open_storage(storage_class=cls, id=id, name=name)
 
     @override
     async def drop(self) -> None:
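
The public `open` signature shrinks accordingly. A usage sketch of the simplified API (`push_data` is part of the existing `Dataset` interface, per the `PushDataKwargs` import kept above):

    import asyncio

    from crawlee.storages import Dataset

    async def main() -> None:
        # No storage_client parameter; the container's client backs the dataset.
        dataset = await Dataset.open(name='products')
        await dataset.push_data({'url': 'https://example.com', 'title': 'Example'})

    asyncio.run(main())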

src/crawlee/storages/_key_value_store.py

Lines changed: 7 additions & 24 deletions
@@ -1,16 +1,14 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, AsyncIterator, TypeVar, overload
+from typing import Any, AsyncIterator, TypeVar, overload
 
 from typing_extensions import override
 
+from crawlee import service_container
 from crawlee._utils.docs import docs_group
 from crawlee.base_storage_client._models import KeyValueStoreKeyInfo, KeyValueStoreMetadata
 from crawlee.storages._base_storage import BaseStorage
 
-if TYPE_CHECKING:
-    from crawlee.base_storage_client import BaseStorageClient
-
 T = TypeVar('T')
 
 
@@ -49,17 +47,13 @@ class KeyValueStore(BaseStorage):
     ```
     """
 
-    def __init__(
-        self,
-        id: str,
-        name: str | None,
-        client: BaseStorageClient,
-    ) -> None:
+    def __init__(self, id: str, name: str | None) -> None:
         self._id = id
         self._name = name
 
         # Get resource clients from storage client
-        self._resource_client = client.key_value_store(self._id)
+        storage_client = service_container.get_storage_client()
+        self._resource_client = storage_client.key_value_store(self._id)
 
     @override
     @property
@@ -77,21 +71,10 @@ async def get_info(self) -> KeyValueStoreMetadata | None:
 
     @override
     @classmethod
-    async def open(
-        cls,
-        *,
-        id: str | None = None,
-        name: str | None = None,
-        storage_client: BaseStorageClient | None = None,
-    ) -> KeyValueStore:
+    async def open(cls, *, id: str | None = None, name: str | None = None) -> KeyValueStore:
         from crawlee.storages._creation_management import open_storage
 
-        return await open_storage(
-            storage_class=cls,
-            id=id,
-            name=name,
-            storage_client=storage_client,
-        )
+        return await open_storage(storage_class=cls, id=id, name=name)
 
     @override
     async def drop(self) -> None:
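
The same simplification applies here. A short sketch; `get_value` and `set_value` are existing `KeyValueStore` methods not shown in this diff:

    import asyncio

    from crawlee.storages import KeyValueStore

    async def main() -> None:
        kvs = await KeyValueStore.open(name='settings')
        await kvs.set_value('retry-limit', 5)
        retry_limit = await kvs.get_value('retry-limit')

    asyncio.run(main())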
