Commit 99c5017
basic crawler accepts storage client
1 parent fe97df1

1 file changed: src/crawlee/basic_crawler/_basic_crawler.py (46 additions, 28 deletions)

@@ -50,6 +50,7 @@
 from contextlib import AbstractAsyncContextManager

 from crawlee._types import ConcurrencySettings, HttpMethod, JsonSerializable
+from crawlee.base_storage_client import BaseStorageClient
 from crawlee.base_storage_client._models import DatasetItemsListPage
 from crawlee.configuration import Configuration
 from crawlee.events._event_manager import EventManager
@@ -72,17 +73,29 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
     It is intended for typing forwarded `__init__` arguments in the subclasses.
     """

+    configuration: NotRequired[Configuration]
+    """The configuration object. Some of its properties are used as defaults for the crawler."""
+
+    event_manager: NotRequired[EventManager]
+    """The event manager for managing events for the crawler and all its components."""
+
+    storage_client: NotRequired[BaseStorageClient]
+    """The storage client for managing storages for the crawler and all its components."""
+
     request_provider: NotRequired[RequestProvider]
     """Provider for requests to be processed by the crawler."""

-    request_handler: NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]
-    """A callable responsible for handling requests."""
+    session_pool: NotRequired[SessionPool]
+    """A custom `SessionPool` instance, allowing the use of non-default configuration."""
+
+    proxy_configuration: NotRequired[ProxyConfiguration]
+    """HTTP proxy configuration used when making requests."""

     http_client: NotRequired[BaseHttpClient]
-    """HTTP client used by `BasicCrawlingContext.send_request` and the HTTP-based crawling."""
+    """HTTP client used by `BasicCrawlingContext.send_request` method."""

-    concurrency_settings: NotRequired[ConcurrencySettings]
-    """Settings to fine-tune concurrency levels."""
+    request_handler: NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]
+    """A callable responsible for handling requests."""

     max_request_retries: NotRequired[int]
     """Maximum number of attempts to process a single request."""
@@ -96,49 +109,45 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
     """Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs
     or if the website blocks the request."""

-    configuration: NotRequired[Configuration]
-    """Crawler configuration."""
-
-    request_handler_timeout: NotRequired[timedelta]
-    """Maximum duration allowed for a single request handler to run."""
+    max_crawl_depth: NotRequired[int | None]
+    """Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.
+    The crawl depth starts at 0 for initial requests and increases with each subsequent level of links.
+    Requests at the maximum depth will still be processed, but no new links will be enqueued from those requests.
+    If not set, crawling continues without depth restrictions.
+    """

     use_session_pool: NotRequired[bool]
     """Enable the use of a session pool for managing sessions during crawling."""

-    session_pool: NotRequired[SessionPool]
-    """A custom `SessionPool` instance, allowing the use of non-default configuration."""
-
     retry_on_blocked: NotRequired[bool]
     """If True, the crawler attempts to bypass bot protections automatically."""

-    proxy_configuration: NotRequired[ProxyConfiguration]
-    """HTTP proxy configuration used when making requests."""
+    concurrency_settings: NotRequired[ConcurrencySettings]
+    """Settings to fine-tune concurrency levels."""
+
+    request_handler_timeout: NotRequired[timedelta]
+    """Maximum duration allowed for a single request handler to run."""

     statistics: NotRequired[Statistics[StatisticsState]]
     """A custom `Statistics` instance, allowing the use of non-default configuration."""

-    event_manager: NotRequired[EventManager]
-    """A custom `EventManager` instance, allowing the use of non-default configuration."""
+    abort_on_error: NotRequired[bool]
+    """If True, the crawler stops immediately when any request handler error occurs."""

     configure_logging: NotRequired[bool]
     """If True, the crawler will set up logging infrastructure automatically."""

-    max_crawl_depth: NotRequired[int | None]
-    """Limits crawl depth from 0 (initial requests) up to the specified `max_crawl_depth`.
-    Requests at the maximum depth are processed, but no further links are enqueued."""
-
-    abort_on_error: NotRequired[bool]
-    """If True, the crawler stops immediately when any request handler error occurs."""
-
     _context_pipeline: NotRequired[ContextPipeline[TCrawlingContext]]
     """Enables extending the request lifecycle and modifying the crawling context. Intended for use by
     subclasses rather than direct instantiation of `BasicCrawler`."""

     _additional_context_managers: NotRequired[Sequence[AbstractAsyncContextManager]]
-    """Additional context managers used throughout the crawler lifecycle."""
+    """Additional context managers used throughout the crawler lifecycle. Intended for use by
+    subclasses rather than direct instantiation of `BasicCrawler`."""

     _logger: NotRequired[logging.Logger]
-    """A logger instance, typically provided by a subclass, for consistent logging labels."""
+    """A logger instance, typically provided by a subclass, for consistent logging labels. Intended for use by
+    subclasses rather than direct instantiation of `BasicCrawler`."""


 @docs_group('Classes')
@@ -171,6 +180,7 @@ def __init__(
         *,
         configuration: Configuration | None = None,
         event_manager: EventManager | None = None,
+        storage_client: BaseStorageClient | None = None,
         request_provider: RequestProvider | None = None,
         session_pool: SessionPool | None = None,
         proxy_configuration: ProxyConfiguration | None = None,
@@ -196,10 +206,11 @@ def __init__(
         Args:
             configuration: The configuration object. Some of its properties are used as defaults for the crawler.
             event_manager: The event manager for managing events for the crawler and all its components.
+            storage_client: The storage client for managing storages for the crawler and all its components.
             request_provider: Provider for requests to be processed by the crawler.
             session_pool: A custom `SessionPool` instance, allowing the use of non-default configuration.
             proxy_configuration: HTTP proxy configuration used when making requests.
-            http_client: HTTP client used by `BasicCrawlingContext.send_request` and the HTTP-based crawling.
+            http_client: HTTP client used by `BasicCrawlingContext.send_request` method.
             request_handler: A callable responsible for handling requests.
             max_request_retries: Maximum number of attempts to process a single request.
             max_requests_per_crawl: Maximum number of pages to open during a crawl. The crawl stops upon reaching
@@ -208,7 +219,10 @@ def __init__(
                 this value.
             max_session_rotations: Maximum number of session rotations per request. The crawler rotates the session
                 if a proxy error occurs or if the website blocks the request.
-            max_crawl_depth: Maximum crawl depth. If set, the crawler will stop crawling after reaching this depth.
+            max_crawl_depth: Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond
+                this depth. The crawl depth starts at 0 for initial requests and increases with each subsequent level
+                of links. Requests at the maximum depth will still be processed, but no new links will be enqueued
+                from those requests. If not set, crawling continues without depth restrictions.
             use_session_pool: Enable the use of a session pool for managing sessions during crawling.
             retry_on_blocked: If True, the crawler attempts to bypass bot protections automatically.
             concurrency_settings: Settings to fine-tune concurrency levels.
@@ -219,10 +233,14 @@ def __init__(
             _context_pipeline: Enables extending the request lifecycle and modifying the crawling context.
                 Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
             _additional_context_managers: Additional context managers used throughout the crawler lifecycle.
+                Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
             _logger: A logger instance, typically provided by a subclass, for consistent logging labels.
+                Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
         """
         if configuration:
             service_container.set_configuration(configuration)
+        if storage_client:
+            service_container.set_storage_client(storage_client)
         if event_manager:
             service_container.set_event_manager(event_manager)
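With this commit, `BasicCrawler` accepts an optional `storage_client` and registers it through `service_container.set_storage_client`, so storages opened by the crawler and its components are backed by that client. The sketch below shows how a caller might use the new argument; the `MemoryStorageClient` import path, its no-argument constructor, and the location of `BasicCrawlingContext` are assumptions about the surrounding library, not something this diff confirms.

import asyncio

from crawlee._types import BasicCrawlingContext  # import location assumed; may be re-exported elsewhere
from crawlee.basic_crawler import BasicCrawler
from crawlee.memory_storage_client import MemoryStorageClient  # import path assumed


async def main() -> None:
    # A plain request handler, passed via the `request_handler` option documented above.
    async def handler(context: BasicCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

    crawler = BasicCrawler(
        request_handler=handler,
        # New in this commit: the custom client is placed into the service container,
        # so all storages the crawler opens go through it.
        storage_client=MemoryStorageClient(),
    )

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())

Passing the client at construction time keeps the override scoped to the crawler, mirroring how `configuration` and `event_manager` were already wired into the service container in the same method.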

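Because `storage_client` was also added to `BasicCrawlerOptions`, subclasses that forward keyword arguments keep full type coverage for the new option. A minimal, hypothetical sketch of that forwarding pattern follows (the TypedDict's own docstring says it is intended for typing forwarded `__init__` arguments in subclasses); the `crawlee.basic_crawler` exports and the use of `typing_extensions.Unpack` are assumptions.

from __future__ import annotations

from typing_extensions import Unpack  # `typing.Unpack` on Python 3.11+

from crawlee._types import BasicCrawlingContext  # import location assumed
from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions  # exports assumed


class MyCrawler(BasicCrawler[BasicCrawlingContext]):
    """A hypothetical subclass that forwards every option, including `storage_client`."""

    def __init__(self, **kwargs: Unpack[BasicCrawlerOptions[BasicCrawlingContext]]) -> None:
        # Type checkers validate the forwarded keys (storage_client, event_manager,
        # configuration, ...) against the TypedDict shown in the diff.
        super().__init__(**kwargs)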