 from contextlib import AbstractAsyncContextManager
 
 from crawlee._types import ConcurrencySettings, HttpMethod, JsonSerializable
+from crawlee.base_storage_client import BaseStorageClient
 from crawlee.base_storage_client._models import DatasetItemsListPage
 from crawlee.configuration import Configuration
 from crawlee.events._event_manager import EventManager
@@ -72,17 +73,29 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
     It is intended for typing forwarded `__init__` arguments in the subclasses.
     """
 
+    configuration: NotRequired[Configuration]
+    """The configuration object. Some of its properties are used as defaults for the crawler."""
+
+    event_manager: NotRequired[EventManager]
+    """The event manager for managing events for the crawler and all its components."""
+
+    storage_client: NotRequired[BaseStorageClient]
+    """The storage client for managing storages for the crawler and all its components."""
+
     request_provider: NotRequired[RequestProvider]
     """Provider for requests to be processed by the crawler."""
 
-    request_handler: NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]
-    """A callable responsible for handling requests."""
+    session_pool: NotRequired[SessionPool]
+    """A custom `SessionPool` instance, allowing the use of non-default configuration."""
+
+    proxy_configuration: NotRequired[ProxyConfiguration]
+    """HTTP proxy configuration used when making requests."""
 
     http_client: NotRequired[BaseHttpClient]
-    """HTTP client used by `BasicCrawlingContext.send_request` and the HTTP-based crawling."""
+    """HTTP client used by `BasicCrawlingContext.send_request` method."""
 
-    concurrency_settings: NotRequired[ConcurrencySettings]
-    """Settings to fine-tune concurrency levels."""
+    request_handler: NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]
+    """A callable responsible for handling requests."""
 
     max_request_retries: NotRequired[int]
     """Maximum number of attempts to process a single request."""
@@ -96,49 +109,45 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
     """Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs
     or if the website blocks the request."""
 
-    configuration: NotRequired[Configuration]
-    """Crawler configuration."""
-
-    request_handler_timeout: NotRequired[timedelta]
-    """Maximum duration allowed for a single request handler to run."""
+    max_crawl_depth: NotRequired[int | None]
+    """Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.
+    The crawl depth starts at 0 for initial requests and increases with each subsequent level of links.
+    Requests at the maximum depth will still be processed, but no new links will be enqueued from those requests.
+    If not set, crawling continues without depth restrictions.
+    """
 
     use_session_pool: NotRequired[bool]
     """Enable the use of a session pool for managing sessions during crawling."""
 
-    session_pool: NotRequired[SessionPool]
-    """A custom `SessionPool` instance, allowing the use of non-default configuration."""
-
     retry_on_blocked: NotRequired[bool]
     """If True, the crawler attempts to bypass bot protections automatically."""
 
-    proxy_configuration: NotRequired[ProxyConfiguration]
-    """HTTP proxy configuration used when making requests."""
+    concurrency_settings: NotRequired[ConcurrencySettings]
+    """Settings to fine-tune concurrency levels."""
+
+    request_handler_timeout: NotRequired[timedelta]
+    """Maximum duration allowed for a single request handler to run."""
 
     statistics: NotRequired[Statistics[StatisticsState]]
     """A custom `Statistics` instance, allowing the use of non-default configuration."""
 
-    event_manager: NotRequired[EventManager]
-    """A custom `EventManager` instance, allowing the use of non-default configuration."""
+    abort_on_error: NotRequired[bool]
+    """If True, the crawler stops immediately when any request handler error occurs."""
 
     configure_logging: NotRequired[bool]
     """If True, the crawler will set up logging infrastructure automatically."""
 
-    max_crawl_depth: NotRequired[int | None]
-    """Limits crawl depth from 0 (initial requests) up to the specified `max_crawl_depth`.
-    Requests at the maximum depth are processed, but no further links are enqueued."""
-
-    abort_on_error: NotRequired[bool]
-    """If True, the crawler stops immediately when any request handler error occurs."""
-
     _context_pipeline: NotRequired[ContextPipeline[TCrawlingContext]]
     """Enables extending the request lifecycle and modifying the crawling context. Intended for use by
     subclasses rather than direct instantiation of `BasicCrawler`."""
 
     _additional_context_managers: NotRequired[Sequence[AbstractAsyncContextManager]]
-    """Additional context managers used throughout the crawler lifecycle."""
+    """Additional context managers used throughout the crawler lifecycle. Intended for use by
+    subclasses rather than direct instantiation of `BasicCrawler`."""
 
     _logger: NotRequired[logging.Logger]
-    """A logger instance, typically provided by a subclass, for consistent logging labels."""
+    """A logger instance, typically provided by a subclass, for consistent logging labels. Intended for use by
+    subclasses rather than direct instantiation of `BasicCrawler`."""
 
 
 @docs_group('Classes')
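
The class docstring above notes that `BasicCrawlerOptions` is meant for typing forwarded `__init__` arguments in subclasses. Below is a minimal sketch of that pattern; it is not part of this diff, `MyCrawler` and its `greet` option are hypothetical, and the import paths are assumed to match the crawlee version this change targets.

```python
from typing_extensions import Unpack

from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions


class MyCrawler(BasicCrawler):
    """Hypothetical subclass that adds one option of its own and forwards the rest."""

    def __init__(self, *, greet: str = 'hello', **kwargs: Unpack[BasicCrawlerOptions]) -> None:
        # `greet` is consumed here; every standard option (configuration,
        # storage_client, max_request_retries, ...) passes through unchanged.
        self._greet = greet
        super().__init__(**kwargs)


# Usage (any subset of keys works, since all of them are NotRequired):
#     crawler = MyCrawler(greet='hi', max_request_retries=1)
```
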
@@ -171,6 +180,7 @@ def __init__(
         *,
         configuration: Configuration | None = None,
         event_manager: EventManager | None = None,
+        storage_client: BaseStorageClient | None = None,
         request_provider: RequestProvider | None = None,
         session_pool: SessionPool | None = None,
         proxy_configuration: ProxyConfiguration | None = None,
@@ -196,10 +206,11 @@ def __init__(
         Args:
             configuration: The configuration object. Some of its properties are used as defaults for the crawler.
             event_manager: The event manager for managing events for the crawler and all its components.
+            storage_client: The storage client for managing storages for the crawler and all its components.
             request_provider: Provider for requests to be processed by the crawler.
             session_pool: A custom `SessionPool` instance, allowing the use of non-default configuration.
             proxy_configuration: HTTP proxy configuration used when making requests.
-            http_client: HTTP client used by `BasicCrawlingContext.send_request` and the HTTP-based crawling.
+            http_client: HTTP client used by `BasicCrawlingContext.send_request` method.
             request_handler: A callable responsible for handling requests.
             max_request_retries: Maximum number of attempts to process a single request.
             max_requests_per_crawl: Maximum number of pages to open during a crawl. The crawl stops upon reaching
@@ -208,7 +219,10 @@ def __init__(
                 this value.
             max_session_rotations: Maximum number of session rotations per request. The crawler rotates the session
                 if a proxy error occurs or if the website blocks the request.
-            max_crawl_depth: Maximum crawl depth. If set, the crawler will stop crawling after reaching this depth.
+            max_crawl_depth: Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond
+                this depth. The crawl depth starts at 0 for initial requests and increases with each subsequent level
+                of links. Requests at the maximum depth will still be processed, but no new links will be enqueued
+                from those requests. If not set, crawling continues without depth restrictions.
             use_session_pool: Enable the use of a session pool for managing sessions during crawling.
             retry_on_blocked: If True, the crawler attempts to bypass bot protections automatically.
             concurrency_settings: Settings to fine-tune concurrency levels.
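
The reworded `max_crawl_depth` description above boils down to: start URLs are depth 0, pages at the maximum depth are still handled, and only their outgoing links are dropped. Here is a short sketch of that behaviour, not taken from this diff; `BeautifulSoupCrawler` is used purely as a convenient subclass that forwards these options, and the start URL is illustrative.

```python
import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    # Depth 0: the start URL. Depth 1: pages it links to are processed,
    # but enqueue_links() on them adds nothing because max_crawl_depth is reached.
    crawler = BeautifulSoupCrawler(max_crawl_depth=1)

    @crawler.router.default_handler
    async def handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')
        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev'])


asyncio.run(main())
```
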
@@ -219,10 +233,14 @@ def __init__(
             _context_pipeline: Enables extending the request lifecycle and modifying the crawling context.
                 Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
             _additional_context_managers: Additional context managers used throughout the crawler lifecycle.
+                Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
             _logger: A logger instance, typically provided by a subclass, for consistent logging labels.
+                Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
         """
         if configuration:
             service_container.set_configuration(configuration)
+        if storage_client:
+            service_container.set_storage_client(storage_client)
         if event_manager:
             service_container.set_event_manager(event_manager)
 
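
Putting the pieces together: the new `storage_client` argument flows from `BasicCrawlerOptions` through `__init__` into `service_container.set_storage_client`, so the crawler and every storage it opens share one client. The following is a hedged usage sketch, not from this diff; the `MemoryStorageClient` import path and its no-argument constructor are assumptions based on the crawlee version this change targets.

```python
import asyncio

from crawlee.basic_crawler import BasicCrawler
from crawlee.memory_storage_client import MemoryStorageClient


async def main() -> None:
    # storage_client is registered globally via service_container.set_storage_client
    # (see the hunk above), alongside the optional configuration and event_manager.
    crawler = BasicCrawler(storage_client=MemoryStorageClient())

    @crawler.router.default_handler
    async def handler(context) -> None:  # context is a BasicCrawlingContext; annotation omitted
        context.log.info(f'Processing {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


asyncio.run(main())
```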