
Commit 931def3

Fix cycle imports, tests and everything

1 parent b70ba2b · commit 931def3

18 files changed: +255 −303 lines

src/crawlee/_autoscaling/snapshotter.py (6 additions, 2 deletions)

@@ -21,6 +21,8 @@
 if TYPE_CHECKING:
     from types import TracebackType

+    from crawlee.events import EventManager
+
 logger = getLogger(__name__)

 T = TypeVar('T')
@@ -39,6 +41,7 @@ class Snapshotter:
     def __init__(
         self,
         *,
+        event_manager: EventManager | None = None,
         event_loop_snapshot_interval: timedelta = timedelta(milliseconds=500),
         client_snapshot_interval: timedelta = timedelta(milliseconds=1000),
         max_used_cpu_ratio: float = 0.95,
@@ -55,6 +58,8 @@ def __init__(
         """A default constructor.

         Args:
+            event_manager: The event manager used to emit system info events; the CPU and memory usage
+                are read from the data these events carry.
             event_loop_snapshot_interval: The interval at which the event loop is sampled.
             client_snapshot_interval: The interval at which the client is sampled.
             max_used_cpu_ratio: Sets the ratio, defining the maximum CPU usage. When the CPU usage is higher than
@@ -80,8 +85,7 @@ def __init__(
         if available_memory_ratio is None and max_memory_size is None:
             raise ValueError('At least one of `available_memory_ratio` or `max_memory_size` must be specified')

-        self._event_manager = service_container.get_event_manager()
-
+        self._event_manager = event_manager or service_container.get_event_manager()
         self._event_loop_snapshot_interval = event_loop_snapshot_interval
         self._client_snapshot_interval = client_snapshot_interval
         self._max_event_loop_delay = max_event_loop_delay
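
With this change a `Snapshotter` can be wired to an explicitly provided event manager instead of always resolving one from `service_container`. A minimal usage sketch; `LocalEventManager` as the concrete implementation, the `crawlee._autoscaling` import path, and entering the snapshotter as an async context manager are all assumptions, not shown in this diff:

    import asyncio

    from crawlee._autoscaling import Snapshotter  # assumed export path
    from crawlee.events import LocalEventManager  # assumed concrete EventManager

    async def main() -> None:
        async with LocalEventManager() as event_manager:
            snapshotter = Snapshotter(
                event_manager=event_manager,  # falls back to service_container when omitted
                available_memory_ratio=0.25,  # satisfies the available_memory_ratio/max_memory_size check
            )
            async with snapshotter:  # assumed context-manager lifecycle (cf. the TracebackType import)
                await asyncio.sleep(2)  # let a few system-info snapshots accumulate

    asyncio.run(main())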

src/crawlee/basic_crawler/_basic_crawler.py (67 additions, 49 deletions)

@@ -51,6 +51,8 @@

     from crawlee._types import ConcurrencySettings, HttpMethod, JsonSerializable
     from crawlee.base_storage_client._models import DatasetItemsListPage
+    from crawlee.configuration import Configuration
+    from crawlee.events._event_manager import EventManager
     from crawlee.http_clients import BaseHttpClient, HttpResponse
     from crawlee.proxy_configuration import ProxyConfiguration, ProxyInfo
     from crawlee.sessions import Session
@@ -94,6 +96,9 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
     """Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs
     or if the website blocks the request."""

+    configuration: NotRequired[Configuration]
+    """Crawler configuration."""
+
     request_handler_timeout: NotRequired[timedelta]
     """Maximum duration allowed for a single request handler to run."""

@@ -112,6 +117,9 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
     statistics: NotRequired[Statistics[StatisticsState]]
     """A custom `Statistics` instance, allowing the use of non-default configuration."""

+    event_manager: NotRequired[EventManager]
+    """A custom `EventManager` instance, allowing the use of non-default configuration."""
+
     configure_logging: NotRequired[bool]
     """If True, the crawler will set up logging infrastructure automatically."""

@@ -158,126 +166,136 @@ class BasicCrawler(Generic[TCrawlingContext]):
     def __init__(
         self,
         *,
+        configuration: Configuration | None = None,
+        event_manager: EventManager | None = None,
         request_provider: RequestProvider | None = None,
-        request_handler: Callable[[TCrawlingContext], Awaitable[None]] | None = None,
+        session_pool: SessionPool | None = None,
+        proxy_configuration: ProxyConfiguration | None = None,
         http_client: BaseHttpClient | None = None,
-        concurrency_settings: ConcurrencySettings | None = None,
+        request_handler: Callable[[TCrawlingContext], Awaitable[None]] | None = None,
         max_request_retries: int = 3,
         max_requests_per_crawl: int | None = None,
        max_session_rotations: int = 10,
-        request_handler_timeout: timedelta = timedelta(minutes=1),
-        session_pool: SessionPool | None = None,
+        max_crawl_depth: int | None = None,
         use_session_pool: bool = True,
         retry_on_blocked: bool = True,
-        proxy_configuration: ProxyConfiguration | None = None,
+        concurrency_settings: ConcurrencySettings | None = None,
+        request_handler_timeout: timedelta = timedelta(minutes=1),
         statistics: Statistics | None = None,
         configure_logging: bool = True,
-        max_crawl_depth: int | None = None,
         _context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
         _additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None,
         _logger: logging.Logger | None = None,
     ) -> None:
         """A default constructor.

         Args:
+            configuration: The configuration object. Some of its properties are used as defaults for the crawler.
+            event_manager: The event manager for managing events for the crawler and all its components.
             request_provider: Provider for requests to be processed by the crawler.
-            request_handler: A callable responsible for handling requests.
+            session_pool: A custom `SessionPool` instance, allowing the use of non-default configuration.
+            proxy_configuration: HTTP proxy configuration used when making requests.
             http_client: HTTP client used by `BasicCrawlingContext.send_request` and the HTTP-based crawling.
-            concurrency_settings: Settings to fine-tune concurrency levels.
+            request_handler: A callable responsible for handling requests.
             max_request_retries: Maximum number of attempts to process a single request.
             max_requests_per_crawl: Maximum number of pages to open during a crawl. The crawl stops upon reaching
                 this limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means
                 no limit. Due to concurrency settings, the actual number of pages visited may slightly exceed
                 this value.
             max_session_rotations: Maximum number of session rotations per request. The crawler rotates the session
                 if a proxy error occurs or if the website blocks the request.
-            request_handler_timeout: Maximum duration allowed for a single request handler to run.
+            max_crawl_depth: Maximum crawl depth. If set, the crawler will stop crawling after reaching this depth.
             use_session_pool: Enable the use of a session pool for managing sessions during crawling.
-            session_pool: A custom `SessionPool` instance, allowing the use of non-default configuration.
             retry_on_blocked: If True, the crawler attempts to bypass bot protections automatically.
-            proxy_configuration: HTTP proxy configuration used when making requests.
+            concurrency_settings: Settings to fine-tune concurrency levels.
+            request_handler_timeout: Maximum duration allowed for a single request handler to run.
             statistics: A custom `Statistics` instance, allowing the use of non-default configuration.
             configure_logging: If True, the crawler will set up logging infrastructure automatically.
-            max_crawl_depth: Maximum crawl depth. If set, the crawler will stop crawling after reaching this depth.
             _context_pipeline: Enables extending the request lifecycle and modifying the crawling context.
                 Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
             _additional_context_managers: Additional context managers used throughout the crawler lifecycle.
             _logger: A logger instance, typically provided by a subclass, for consistent logging labels.
         """
-        self._router: Router[TCrawlingContext] | None = None
+        if configuration:
+            service_container.set_configuration(configuration)
+        if event_manager:
+            service_container.set_event_manager(event_manager)
+
+        config = service_container.get_configuration()
+
+        # Core components
+        self._request_provider = request_provider
+        self._session_pool = session_pool or SessionPool()
+        self._proxy_configuration = proxy_configuration
+        self._http_client = http_client or HttpxHttpClient()

+        # Request router setup
+        self._router: Router[TCrawlingContext] | None = None
         if isinstance(cast(Router, request_handler), Router):
             self._router = cast(Router[TCrawlingContext], request_handler)
         elif request_handler is not None:
             self._router = None
             self.router.default_handler(request_handler)

-        self._http_client = http_client or HttpxHttpClient()
-
-        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)
-
+        # Error & failed request handlers
         self._error_handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext] | None = None
         self._failed_request_handler: FailedRequestHandler[TCrawlingContext | BasicCrawlingContext] | None = None

+        # Context pipeline
+        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)
+
+        # Crawl settings
         self._max_request_retries = max_request_retries
         self._max_requests_per_crawl = max_requests_per_crawl
         self._max_session_rotations = max_session_rotations
+        self._max_crawl_depth = max_crawl_depth

-        self._request_provider = request_provider
-
-        config = service_container.get_configuration()
-
+        # Timeouts
         self._request_handler_timeout = request_handler_timeout
         self._internal_timeout = (
             config.internal_timeout
             if config.internal_timeout is not None
             else max(2 * request_handler_timeout, timedelta(minutes=5))
         )

-        self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name)
-
-        self._event_manager = service_container.get_event_manager()
-        self._snapshotter = Snapshotter(
-            max_memory_size=ByteSize.from_mb(config.memory_mbytes) if config.memory_mbytes else None,
-            available_memory_ratio=config.available_memory_ratio,
-        )
-        self._autoscaled_pool = AutoscaledPool(
-            system_status=SystemStatus(self._snapshotter),
-            is_finished_function=self.__is_finished_function,
-            is_task_ready_function=self.__is_task_ready_function,
-            run_task_function=self.__run_task_function,
-            concurrency_settings=concurrency_settings,
-        )
-
+        # Retry and session settings
         self._use_session_pool = use_session_pool
-        self._session_pool = session_pool or SessionPool()
-
         self._retry_on_blocked = retry_on_blocked

+        # Logging setup
         if configure_logging:
             root_logger = logging.getLogger()
             configure_logger(root_logger, remove_old_handlers=True)
-
-            # Silence HTTPX logger
-            httpx_logger = logging.getLogger('httpx')
+            httpx_logger = logging.getLogger('httpx')  # Silence HTTPX logger
             httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
+        self._logger = _logger or logging.getLogger(__name__)

-        if not _logger:
-            _logger = logging.getLogger(__name__)
-
-        self._logger = _logger
-
-        self._proxy_configuration = proxy_configuration
+        # Statistics
         self._statistics = statistics or Statistics(
-            event_manager=self._event_manager,
             periodic_message_logger=self._logger,
             log_message='Current request statistics:',
         )
+
+        # Additional context managers to enter and exit
         self._additional_context_managers = _additional_context_managers or []

+        # Internal, not explicitly configurable components
+        self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name)
+        self._snapshotter = Snapshotter(
+            max_memory_size=ByteSize.from_mb(config.memory_mbytes) if config.memory_mbytes else None,
+            available_memory_ratio=config.available_memory_ratio,
+        )
+        self._autoscaled_pool = AutoscaledPool(
+            system_status=SystemStatus(self._snapshotter),
+            is_finished_function=self.__is_finished_function,
+            is_task_ready_function=self.__is_task_ready_function,
+            run_task_function=self.__run_task_function,
+            concurrency_settings=concurrency_settings,
+        )
+
+        # State flags
         self._running = False
         self._has_finished_before = False
-        self._max_crawl_depth = max_crawl_depth

     @property
     def log(self) -> logging.Logger:
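
Because the injected `configuration` and `event_manager` are registered in the service container before `config = service_container.get_configuration()` runs, every component built later in the constructor (timeouts, snapshotter, autoscaled pool, statistics) sees the injected values. A sketch of the new call shape; `LocalEventManager` and the exact import paths are assumptions rather than part of this diff:

    import asyncio
    from datetime import timedelta

    from crawlee.basic_crawler import BasicCrawler
    from crawlee.configuration import Configuration
    from crawlee.events import LocalEventManager  # assumed concrete EventManager

    async def main() -> None:
        crawler = BasicCrawler(
            configuration=Configuration(available_memory_ratio=0.25),  # stored via set_configuration()
            event_manager=LocalEventManager(),  # stored via set_event_manager()
            request_handler_timeout=timedelta(seconds=30),
        )

        @crawler.router.default_handler
        async def handler(context) -> None:  # context: BasicCrawlingContext
            print(context.request.url)

        await crawler.run(['https://crawlee.dev'])

    asyncio.run(main())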

src/crawlee/configuration.py (7 additions, 7 deletions)

@@ -5,9 +5,7 @@

 from pydantic import AliasChoices, BeforeValidator, Field
 from pydantic_settings import BaseSettings, SettingsConfigDict
-from typing_extensions import Self

-from crawlee import service_container
 from crawlee._utils.docs import docs_group
 from crawlee._utils.models import timedelta_ms

@@ -236,11 +234,13 @@ class Configuration(BaseSettings):
     def get_global_configuration(cls) -> Self:
         """Retrieve the global instance of the configuration.

-        Mostly for the backward compatibility.
+        TODO: Can we remove this?
         """
-        config = service_container.get_configuration()
+        from crawlee.service_container import get_configuration

-        if not isinstance(config, cls):
-            raise TypeError(f'Requested configuration of type {cls}, but got {config.__class__} instead.')
+        cfg = get_configuration()

-        return config
+        if not isinstance(cfg, cls):
+            raise TypeError(f'Requested global configuration object of type {cls}, but {cfg.__class__} was found')
+
+        return cfg
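
Deferring the `service_container` import to call time is the cycle-breaking fix in this file: `crawlee.service_container` can keep importing `Configuration` at module level without the two modules needing each other during import. The `isinstance` guard only bites once `Configuration` is subclassed; a short sketch (the subclass is hypothetical):

    from crawlee import service_container
    from crawlee.configuration import Configuration

    class CustomConfiguration(Configuration):  # hypothetical subclass for illustration
        pass

    # The container holds a plain Configuration...
    service_container.set_configuration(Configuration())

    Configuration.get_global_configuration()        # returns the stored instance
    CustomConfiguration.get_global_configuration()  # raises TypeError: a plain Configuration was found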

src/crawlee/events/_event_manager.py (8 additions, 3 deletions)

@@ -149,17 +149,17 @@ async def listener_wrapper(event_data: EventData) -> None:
             self._listener_tasks.add(listener_task)

             try:
-                logger.debug('LocalEventManager.on.listener_wrapper(): Awaiting listener task...')
+                logger.debug('EventManager.on.listener_wrapper(): Awaiting listener task...')
                 await listener_task
-                logger.debug('LocalEventManager.on.listener_wrapper(): Listener task completed.')
+                logger.debug('EventManager.on.listener_wrapper(): Listener task completed.')
             except Exception:
                 # We need to swallow the exception and just log it here, otherwise it could break the event emitter
                 logger.exception(
                     'Exception in the event listener',
                     extra={'event_name': event.value, 'listener_name': listener.__name__},
                 )
             finally:
-                logger.debug('LocalEventManager.on.listener_wrapper(): Removing listener task from the set...')
+                logger.debug('EventManager.on.listener_wrapper(): Removing listener task from the set...')
                 self._listener_tasks.remove(listener_task)

         self._listeners_to_wrappers[event][listener].append(listener_wrapper)
@@ -189,6 +189,9 @@ def emit(self, *, event: Event, event_data: EventData) -> None:
             event: The event which will be emitted.
             event_data: The data which will be passed to the event listeners.
         """
+        if not self._initialized:
+            raise RuntimeError('EventManager is not initialized. Please use it within an async context manager.')
+
         self._event_emitter.emit(event.value, event_data)

     @ensure_context
@@ -199,6 +202,8 @@ async def wait_for_all_listeners_to_complete(self, *, timeout: timedelta | None
             timeout: The maximum time to wait for the event listeners to finish. If they do not complete within
                 the specified timeout, they will be canceled.
         """
+        if not self._initialized:
+            raise RuntimeError('EventManager is not initialized. Please use it within an async context manager.')

         async def wait_for_listeners() -> None:
             """Gathers all listener tasks and awaits their completion, logging any exceptions encountered."""

src/crawlee/memory_storage_client/_creation_management.py (3 additions, 22 deletions)

@@ -10,7 +10,6 @@
 from logging import getLogger
 from typing import TYPE_CHECKING

-from crawlee import service_container
 from crawlee._consts import METADATA_FILENAME
 from crawlee._utils.data_processing import maybe_parse_body
 from crawlee._utils.file import json_dumps
@@ -22,16 +21,14 @@
     Request,
     RequestQueueMetadata,
 )
-from crawlee.storages._dataset import Dataset
-from crawlee.storages._key_value_store import KeyValueStore
-from crawlee.storages._request_queue import RequestQueue

 if TYPE_CHECKING:
     from crawlee.memory_storage_client._dataset_client import DatasetClient
     from crawlee.memory_storage_client._key_value_store_client import KeyValueStoreClient
     from crawlee.memory_storage_client._memory_storage_client import MemoryStorageClient, TResourceClient
     from crawlee.memory_storage_client._request_queue_client import RequestQueueClient

+
 logger = getLogger(__name__)

@@ -401,24 +398,8 @@ def _determine_storage_path(
     id: str | None = None,
     name: str | None = None,
 ) -> str | None:
-    from crawlee.memory_storage_client._dataset_client import DatasetClient
-    from crawlee.memory_storage_client._key_value_store_client import KeyValueStoreClient
-    from crawlee.memory_storage_client._request_queue_client import RequestQueueClient
-    from crawlee.storages._creation_management import _get_default_storage_id
-
-    config = service_container.get_configuration()
-
-    if issubclass(resource_client_class, DatasetClient):
-        storages_dir = memory_storage_client.datasets_directory
-        default_id = _get_default_storage_id(config, Dataset)
-    elif issubclass(resource_client_class, KeyValueStoreClient):
-        storages_dir = memory_storage_client.key_value_stores_directory
-        default_id = _get_default_storage_id(config, KeyValueStore)
-    elif issubclass(resource_client_class, RequestQueueClient):
-        storages_dir = memory_storage_client.request_queues_directory
-        default_id = _get_default_storage_id(config, RequestQueue)
-    else:
-        raise TypeError('Invalid resource client class.')
+    storages_dir = memory_storage_client._get_storage_dir(resource_client_class)  # noqa: SLF001
+    default_id = memory_storage_client._get_default_storage_id(resource_client_class)  # noqa: SLF001

     # Try to find by name directly from directories
     if name: