
Commit c2b99be

Fix cycle imports, tests and everything

1 parent: baa8d76
18 files changed: +253 -300 lines

src/crawlee/_autoscaling/snapshotter.py

Lines changed: 6 additions & 2 deletions
@@ -21,6 +21,8 @@
 if TYPE_CHECKING:
     from types import TracebackType
 
+    from crawlee.events import EventManager
+
 logger = getLogger(__name__)
 
 T = TypeVar('T')
@@ -39,6 +41,7 @@ class Snapshotter:
     def __init__(
         self,
         *,
+        event_manager: EventManager | None = None,
         event_loop_snapshot_interval: timedelta = timedelta(milliseconds=500),
         client_snapshot_interval: timedelta = timedelta(milliseconds=1000),
         max_used_cpu_ratio: float = 0.95,
@@ -55,6 +58,8 @@ def __init__(
         """A default constructor.
 
         Args:
+            event_manager: The event manager used to emit system info events. From data provided by this event
+                the CPU and memory usage are read.
             event_loop_snapshot_interval: The interval at which the event loop is sampled.
             client_snapshot_interval: The interval at which the client is sampled.
             max_used_cpu_ratio: Sets the ratio, defining the maximum CPU usage. When the CPU usage is higher than
@@ -80,8 +85,7 @@ def __init__(
         if available_memory_ratio is None and max_memory_size is None:
             raise ValueError('At least one of `available_memory_ratio` or `max_memory_size` must be specified')
 
-        self._event_manager = service_container.get_event_manager()
-
+        self._event_manager = event_manager or service_container.get_event_manager()
         self._event_loop_snapshot_interval = event_loop_snapshot_interval
         self._client_snapshot_interval = client_snapshot_interval
         self._max_event_loop_delay = max_event_loop_delay
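With this change, a `Snapshotter` can be wired to an explicit `EventManager` instead of reaching into the service container. A minimal usage sketch, assuming `LocalEventManager` is the default implementation exported from `crawlee.events` and that `Snapshotter` is an async context manager (it imports `TracebackType` for its `__aexit__`):

import asyncio

from crawlee._autoscaling.snapshotter import Snapshotter
from crawlee.events import LocalEventManager


async def main() -> None:
    async with LocalEventManager() as event_manager:
        # The new keyword-only argument takes precedence over the
        # service container's event manager.
        snapshotter = Snapshotter(
            event_manager=event_manager,
            available_memory_ratio=0.25,  # one of the two memory limits is required
        )
        async with snapshotter:
            await asyncio.sleep(2)  # let a few system-info snapshots accumulate


if __name__ == '__main__':
    asyncio.run(main())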

src/crawlee/basic_crawler/_basic_crawler.py

Lines changed: 67 additions & 47 deletions
@@ -51,6 +51,8 @@
 
     from crawlee._types import ConcurrencySettings, HttpMethod, JsonSerializable
     from crawlee.base_storage_client._models import DatasetItemsListPage
+    from crawlee.configuration import Configuration
+    from crawlee.events._event_manager import EventManager
     from crawlee.http_clients import BaseHttpClient, HttpResponse
     from crawlee.proxy_configuration import ProxyConfiguration, ProxyInfo
     from crawlee.sessions import Session
@@ -94,6 +96,9 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
     """Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs
     or if the website blocks the request."""
 
+    configuration: NotRequired[Configuration]
+    """Crawler configuration."""
+
     request_handler_timeout: NotRequired[timedelta]
     """Maximum duration allowed for a single request handler to run."""
 
@@ -112,6 +117,9 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
     statistics: NotRequired[Statistics[StatisticsState]]
     """A custom `Statistics` instance, allowing the use of non-default configuration."""
 
+    event_manager: NotRequired[EventManager]
+    """A custom `EventManager` instance, allowing the use of non-default configuration."""
+
     configure_logging: NotRequired[bool]
     """If True, the crawler will set up logging infrastructure automatically."""
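The two new `BasicCrawlerOptions` keys mirror the new constructor arguments. A hedged sketch of the keys in use, assuming `BasicCrawlerOptions` is exported from `crawlee.basic_crawler` (it is a `TypedDict`, so this is just a type-checked kwargs bundle for crawler subclasses):

from crawlee.basic_crawler import BasicCrawlerOptions
from crawlee.configuration import Configuration
from crawlee.events import LocalEventManager

options: BasicCrawlerOptions = {
    'configuration': Configuration(),      # new key
    'event_manager': LocalEventManager(),  # new key
    'configure_logging': False,
}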

@@ -161,18 +169,21 @@ class BasicCrawler(Generic[TCrawlingContext]):
     def __init__(
         self,
         *,
+        configuration: Configuration | None = None,
+        event_manager: EventManager | None = None,
         request_provider: RequestProvider | None = None,
-        request_handler: Callable[[TCrawlingContext], Awaitable[None]] | None = None,
+        session_pool: SessionPool | None = None,
+        proxy_configuration: ProxyConfiguration | None = None,
         http_client: BaseHttpClient | None = None,
-        concurrency_settings: ConcurrencySettings | None = None,
+        request_handler: Callable[[TCrawlingContext], Awaitable[None]] | None = None,
         max_request_retries: int = 3,
         max_requests_per_crawl: int | None = None,
         max_session_rotations: int = 10,
-        request_handler_timeout: timedelta = timedelta(minutes=1),
-        session_pool: SessionPool | None = None,
+        max_crawl_depth: int | None = None,
         use_session_pool: bool = True,
         retry_on_blocked: bool = True,
-        proxy_configuration: ProxyConfiguration | None = None,
+        concurrency_settings: ConcurrencySettings | None = None,
+        request_handler_timeout: timedelta = timedelta(minutes=1),
         statistics: Statistics | None = None,
         configure_logging: bool = True,
         max_crawl_depth: int | None = None,
@@ -184,22 +195,25 @@ def __init__(
         """A default constructor.
 
         Args:
+            configuration: The configuration object. Some of its properties are used as defaults for the crawler.
+            event_manager: The event manager for managing events for the crawler and all its components.
             request_provider: Provider for requests to be processed by the crawler.
-            request_handler: A callable responsible for handling requests.
+            session_pool: A custom `SessionPool` instance, allowing the use of non-default configuration.
+            proxy_configuration: HTTP proxy configuration used when making requests.
             http_client: HTTP client used by `BasicCrawlingContext.send_request` and the HTTP-based crawling.
-            concurrency_settings: Settings to fine-tune concurrency levels.
+            request_handler: A callable responsible for handling requests.
             max_request_retries: Maximum number of attempts to process a single request.
             max_requests_per_crawl: Maximum number of pages to open during a crawl. The crawl stops upon reaching
                 this limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means
                 no limit. Due to concurrency settings, the actual number of pages visited may slightly exceed
                 this value.
             max_session_rotations: Maximum number of session rotations per request. The crawler rotates the session
                 if a proxy error occurs or if the website blocks the request.
-            request_handler_timeout: Maximum duration allowed for a single request handler to run.
+            max_crawl_depth: Maximum crawl depth. If set, the crawler will stop crawling after reaching this depth.
             use_session_pool: Enable the use of a session pool for managing sessions during crawling.
-            session_pool: A custom `SessionPool` instance, allowing the use of non-default configuration.
             retry_on_blocked: If True, the crawler attempts to bypass bot protections automatically.
-            proxy_configuration: HTTP proxy configuration used when making requests.
+            concurrency_settings: Settings to fine-tune concurrency levels.
+            request_handler_timeout: Maximum duration allowed for a single request handler to run.
             statistics: A custom `Statistics` instance, allowing the use of non-default configuration.
             configure_logging: If True, the crawler will set up logging infrastructure automatically.
             max_crawl_depth: Maximum crawl depth. If set, the crawler will stop crawling after reaching this depth.
@@ -209,80 +223,86 @@ def __init__(
             _additional_context_managers: Additional context managers used throughout the crawler lifecycle.
             _logger: A logger instance, typically provided by a subclass, for consistent logging labels.
         """
-        self._router: Router[TCrawlingContext] | None = None
+        if configuration:
+            service_container.set_configuration(configuration)
+        if event_manager:
+            service_container.set_event_manager(event_manager)
+
+        config = service_container.get_configuration()
+
+        # Core components
+        self._request_provider = request_provider
+        self._session_pool = session_pool or SessionPool()
+        self._proxy_configuration = proxy_configuration
+        self._http_client = http_client or HttpxHttpClient()
 
+        # Request router setup
+        self._router: Router[TCrawlingContext] | None = None
         if isinstance(cast(Router, request_handler), Router):
             self._router = cast(Router[TCrawlingContext], request_handler)
         elif request_handler is not None:
             self._router = None
             self.router.default_handler(request_handler)
 
-        self._http_client = http_client or HttpxHttpClient()
-
-        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)
-
+        # Error & failed request handlers
         self._error_handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext] | None = None
         self._failed_request_handler: FailedRequestHandler[TCrawlingContext | BasicCrawlingContext] | None = None
 
+        # Context pipeline
+        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)
+
+        # Crawl settings
         self._max_request_retries = max_request_retries
        self._max_requests_per_crawl = max_requests_per_crawl
         self._max_session_rotations = max_session_rotations
+        self._max_crawl_depth = max_crawl_depth
 
-        self._request_provider = request_provider
-
-        config = service_container.get_configuration()
-
+        # Timeouts
         self._request_handler_timeout = request_handler_timeout
         self._internal_timeout = (
             config.internal_timeout
             if config.internal_timeout is not None
             else max(2 * request_handler_timeout, timedelta(minutes=5))
         )
 
-        self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name)
-
-        self._event_manager = service_container.get_event_manager()
-        self._snapshotter = Snapshotter(
-            max_memory_size=ByteSize.from_mb(config.memory_mbytes) if config.memory_mbytes else None,
-            available_memory_ratio=config.available_memory_ratio,
-        )
-        self._autoscaled_pool = AutoscaledPool(
-            system_status=SystemStatus(self._snapshotter),
-            is_finished_function=self.__is_finished_function,
-            is_task_ready_function=self.__is_task_ready_function,
-            run_task_function=self.__run_task_function,
-            concurrency_settings=concurrency_settings,
-        )
-
+        # Retry and session settings
         self._use_session_pool = use_session_pool
-        self._session_pool = session_pool or SessionPool()
-
         self._retry_on_blocked = retry_on_blocked
 
+        # Logging setup
         if configure_logging:
             root_logger = logging.getLogger()
             configure_logger(root_logger, remove_old_handlers=True)
-
-            # Silence HTTPX logger
-            httpx_logger = logging.getLogger('httpx')
+            httpx_logger = logging.getLogger('httpx')  # Silence HTTPX logger
             httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
+        self._logger = _logger or logging.getLogger(__name__)
 
-        if not _logger:
-            _logger = logging.getLogger(__name__)
-
-        self._logger = _logger
-
-        self._proxy_configuration = proxy_configuration
+        # Statistics
         self._statistics = statistics or Statistics(
-            event_manager=self._event_manager,
             periodic_message_logger=self._logger,
             log_message='Current request statistics:',
         )
+
+        # Additional context managers to enter and exit
         self._additional_context_managers = _additional_context_managers or []
 
+        # Internal, not explicitly configurable components
+        self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name)
+        self._snapshotter = Snapshotter(
+            max_memory_size=ByteSize.from_mb(config.memory_mbytes) if config.memory_mbytes else None,
+            available_memory_ratio=config.available_memory_ratio,
+        )
+        self._autoscaled_pool = AutoscaledPool(
+            system_status=SystemStatus(self._snapshotter),
+            is_finished_function=self.__is_finished_function,
+            is_task_ready_function=self.__is_task_ready_function,
+            run_task_function=self.__run_task_function,
+            concurrency_settings=concurrency_settings,
+        )
+
+        # State flags
         self._running = False
         self._has_finished_before = False
-        self._max_crawl_depth = max_crawl_depth
 
         self._failed = False
         self._abort_on_error = abort_on_error
src/crawlee/configuration.py

Lines changed: 7 additions & 7 deletions
@@ -5,9 +5,7 @@
 
 from pydantic import AliasChoices, BeforeValidator, Field
 from pydantic_settings import BaseSettings, SettingsConfigDict
-from typing_extensions import Self
 
-from crawlee import service_container
 from crawlee._utils.docs import docs_group
 from crawlee._utils.models import timedelta_ms
 
@@ -237,11 +235,13 @@ class Configuration(BaseSettings):
     def get_global_configuration(cls) -> Self:
         """Retrieve the global instance of the configuration.
 
-        Mostly for the backward compatibility.
+        TODO: Can we remove this?
         """
-        config = service_container.get_configuration()
+        from crawlee.service_container import get_configuration
 
-        if not isinstance(config, cls):
-            raise TypeError(f'Requested configuration of type {cls}, but got {config.__class__} instead.')
+        cfg = get_configuration()
 
-        return config
+        if not isinstance(cfg, cls):
+            raise TypeError(f'Requested global configuration object of type {cls}, but {cfg.__class__} was found')
+
+        return cfg
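Dropping the top-level `from crawlee import service_container` is the actual cycle fix here: `service_container` needs `Configuration` at import time, and `configuration` imported `service_container` straight back. The method-local import defers resolution to call time, when both modules are fully initialized. A generic two-module sketch of the pattern (file and function names hypothetical, not from the commit):

# config_mod.py
from __future__ import annotations


class Config:
    @classmethod
    def get_global(cls) -> Config:
        # Deferred import: executed when the method runs, after both modules
        # have finished importing, so no cycle exists at import time.
        from container_mod import get_configuration

        cfg = get_configuration()
        if not isinstance(cfg, cls):
            raise TypeError(f'Expected {cls}, got {type(cfg)}')
        return cfg


# container_mod.py
from __future__ import annotations

from config_mod import Config  # safe: config_mod no longer imports us at top level

_config: Config | None = None


def get_configuration() -> Config:
    global _config
    if _config is None:
        _config = Config()
    return _config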

src/crawlee/events/_event_manager.py

Lines changed: 8 additions & 3 deletions
@@ -149,17 +149,17 @@ async def listener_wrapper(event_data: EventData) -> None:
             self._listener_tasks.add(listener_task)
 
             try:
-                logger.debug('LocalEventManager.on.listener_wrapper(): Awaiting listener task...')
+                logger.debug('EventManager.on.listener_wrapper(): Awaiting listener task...')
                 await listener_task
-                logger.debug('LocalEventManager.on.listener_wrapper(): Listener task completed.')
+                logger.debug('EventManager.on.listener_wrapper(): Listener task completed.')
             except Exception:
                 # We need to swallow the exception and just log it here, otherwise it could break the event emitter
                 logger.exception(
                     'Exception in the event listener',
                     extra={'event_name': event.value, 'listener_name': listener.__name__},
                 )
             finally:
-                logger.debug('LocalEventManager.on.listener_wrapper(): Removing listener task from the set...')
+                logger.debug('EventManager.on.listener_wrapper(): Removing listener task from the set...')
                 self._listener_tasks.remove(listener_task)
 
         self._listeners_to_wrappers[event][listener].append(listener_wrapper)
@@ -189,6 +189,9 @@ def emit(self, *, event: Event, event_data: EventData) -> None:
             event: The event which will be emitted.
             event_data: The data which will be passed to the event listeners.
         """
+        if not self._initialized:
+            raise RuntimeError('EventManager is not initialized. Please use it within async context manager.')
+
         self._event_emitter.emit(event.value, event_data)
 
     @ensure_context
@@ -199,6 +202,8 @@ async def wait_for_all_listeners_to_complete(self, *, timeout: timedelta | None
             timeout: The maximum time to wait for the event listeners to finish. If they do not complete within
                 the specified timeout, they will be canceled.
         """
+        if not self._initialized:
+            raise RuntimeError('EventManager is not initialized. Please use it within async context manager.')
 
         async def wait_for_listeners() -> None:
             """Gathers all listener tasks and awaits their completion, logging any exceptions encountered."""

src/crawlee/memory_storage_client/_creation_management.py

Lines changed: 3 additions & 22 deletions
@@ -10,7 +10,6 @@
 from logging import getLogger
 from typing import TYPE_CHECKING
 
-from crawlee import service_container
 from crawlee._consts import METADATA_FILENAME
 from crawlee._utils.data_processing import maybe_parse_body
 from crawlee._utils.file import json_dumps
@@ -22,16 +21,14 @@
     Request,
     RequestQueueMetadata,
 )
-from crawlee.storages._dataset import Dataset
-from crawlee.storages._key_value_store import KeyValueStore
-from crawlee.storages._request_queue import RequestQueue
 
 if TYPE_CHECKING:
     from crawlee.memory_storage_client._dataset_client import DatasetClient
     from crawlee.memory_storage_client._key_value_store_client import KeyValueStoreClient
     from crawlee.memory_storage_client._memory_storage_client import MemoryStorageClient, TResourceClient
     from crawlee.memory_storage_client._request_queue_client import RequestQueueClient
 
+
 logger = getLogger(__name__)
 
 
@@ -401,24 +398,8 @@ def _determine_storage_path(
     id: str | None = None,
     name: str | None = None,
 ) -> str | None:
-    from crawlee.memory_storage_client._dataset_client import DatasetClient
-    from crawlee.memory_storage_client._key_value_store_client import KeyValueStoreClient
-    from crawlee.memory_storage_client._request_queue_client import RequestQueueClient
-    from crawlee.storages._creation_management import _get_default_storage_id
-
-    config = service_container.get_configuration()
-
-    if issubclass(resource_client_class, DatasetClient):
-        storages_dir = memory_storage_client.datasets_directory
-        default_id = _get_default_storage_id(config, Dataset)
-    elif issubclass(resource_client_class, KeyValueStoreClient):
-        storages_dir = memory_storage_client.key_value_stores_directory
-        default_id = _get_default_storage_id(config, KeyValueStore)
-    elif issubclass(resource_client_class, RequestQueueClient):
-        storages_dir = memory_storage_client.request_queues_directory
-        default_id = _get_default_storage_id(config, RequestQueue)
-    else:
-        raise TypeError('Invalid resource client class.')
+    storages_dir = memory_storage_client._get_storage_dir(resource_client_class)  # noqa: SLF001
+    default_id = memory_storage_client._get_default_storage_id(resource_client_class)  # noqa: SLF001
 
     # Try to find by name directly from directories
     if name:
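The replacement delegates to two private `MemoryStorageClient` helpers whose implementations are not shown on this page. A hypothetical sketch of the dispatch one of them plausibly encapsulates, reconstructed from the deleted branch (the method name comes from the hunk; the directory attributes come from the old code):

from __future__ import annotations


class MemoryStorageClient:
    datasets_directory: str
    key_value_stores_directory: str
    request_queues_directory: str

    def _get_storage_dir(self, resource_client_class: type) -> str:
        # Map each resource client class to its storage directory,
        # replacing the issubclass() chain deleted above.
        from crawlee.memory_storage_client._dataset_client import DatasetClient
        from crawlee.memory_storage_client._key_value_store_client import KeyValueStoreClient
        from crawlee.memory_storage_client._request_queue_client import RequestQueueClient

        if issubclass(resource_client_class, DatasetClient):
            return self.datasets_directory
        if issubclass(resource_client_class, KeyValueStoreClient):
            return self.key_value_stores_directory
        if issubclass(resource_client_class, RequestQueueClient):
            return self.request_queues_directory
        raise TypeError('Invalid resource client class.')

    # _get_default_storage_id(resource_client_class) would mirror this mapping,
    # resolving the default storage id the old code looked up per storage type.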
