51
51
52
52
from crawlee ._types import ConcurrencySettings , HttpMethod , JsonSerializable
53
53
from crawlee .base_storage_client ._models import DatasetItemsListPage
54
+ from crawlee .configuration import Configuration
55
+ from crawlee .events ._event_manager import EventManager
54
56
from crawlee .http_clients import BaseHttpClient , HttpResponse
55
57
from crawlee .proxy_configuration import ProxyConfiguration , ProxyInfo
56
58
from crawlee .sessions import Session
@@ -94,6 +96,9 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
94
96
"""Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs
95
97
or if the website blocks the request."""
96
98
99
+ configuration : NotRequired [Configuration ]
100
+ """Crawler configuration."""
101
+
97
102
request_handler_timeout : NotRequired [timedelta ]
98
103
"""Maximum duration allowed for a single request handler to run."""
99
104
@@ -112,6 +117,9 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
112
117
statistics : NotRequired [Statistics [StatisticsState ]]
113
118
"""A custom `Statistics` instance, allowing the use of non-default configuration."""
114
119
120
+ event_manager : NotRequired [EventManager ]
121
+ """A custom `EventManager` instance, allowing the use of non-default configuration."""
122
+
115
123
configure_logging : NotRequired [bool ]
116
124
"""If True, the crawler will set up logging infrastructure automatically."""
117
125
@@ -161,18 +169,21 @@ class BasicCrawler(Generic[TCrawlingContext]):
161
169
def __init__ (
162
170
self ,
163
171
* ,
172
+ configuration : Configuration | None = None ,
173
+ event_manager : EventManager | None = None ,
164
174
request_provider : RequestProvider | None = None ,
165
- request_handler : Callable [[TCrawlingContext ], Awaitable [None ]] | None = None ,
175
+ session_pool : SessionPool | None = None ,
176
+ proxy_configuration : ProxyConfiguration | None = None ,
166
177
http_client : BaseHttpClient | None = None ,
167
- concurrency_settings : ConcurrencySettings | None = None ,
178
+ request_handler : Callable [[ TCrawlingContext ], Awaitable [ None ]] | None = None ,
168
179
max_request_retries : int = 3 ,
169
180
max_requests_per_crawl : int | None = None ,
170
181
max_session_rotations : int = 10 ,
171
- request_handler_timeout : timedelta = timedelta (minutes = 1 ),
172
- session_pool : SessionPool | None = None ,
182
+ max_crawl_depth : int | None = None ,
173
183
use_session_pool : bool = True ,
174
184
retry_on_blocked : bool = True ,
175
- proxy_configuration : ProxyConfiguration | None = None ,
185
+ concurrency_settings : ConcurrencySettings | None = None ,
186
+ request_handler_timeout : timedelta = timedelta (minutes = 1 ),
176
187
statistics : Statistics | None = None ,
177
188
configure_logging : bool = True ,
178
189
max_crawl_depth : int | None = None ,
@@ -184,22 +195,25 @@ def __init__(
184
195
"""A default constructor.
185
196
186
197
Args:
198
+ configuration: The configuration object. Some of its properties are used as defaults for the crawler.
199
+ event_manager: The event manager for managing events for the crawler and all its components.
187
200
request_provider: Provider for requests to be processed by the crawler.
188
- request_handler: A callable responsible for handling requests.
201
+ session_pool: A custom `SessionPool` instance, allowing the use of non-default configuration.
202
+ proxy_configuration: HTTP proxy configuration used when making requests.
189
203
http_client: HTTP client used by `BasicCrawlingContext.send_request` and the HTTP-based crawling.
190
- concurrency_settings: Settings to fine-tune concurrency levels .
204
+ request_handler: A callable responsible for handling requests.
191
205
max_request_retries: Maximum number of attempts to process a single request.
192
206
max_requests_per_crawl: Maximum number of pages to open during a crawl. The crawl stops upon reaching
193
207
this limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means
194
208
no limit. Due to concurrency settings, the actual number of pages visited may slightly exceed
195
209
this value.
196
210
max_session_rotations: Maximum number of session rotations per request. The crawler rotates the session
197
211
if a proxy error occurs or if the website blocks the request.
198
- request_handler_timeout : Maximum duration allowed for a single request handler to run .
212
+ max_crawl_depth: Maximum crawl depth. If set, the crawler will stop crawling after reaching this depth.
199
213
use_session_pool: Enable the use of a session pool for managing sessions during crawling.
200
- session_pool: A custom `SessionPool` instance, allowing the use of non-default configuration.
201
214
retry_on_blocked: If True, the crawler attempts to bypass bot protections automatically.
202
- proxy_configuration: HTTP proxy configuration used when making requests.
215
+ concurrency_settings: Settings to fine-tune concurrency levels.
216
+ request_handler_timeout: Maximum duration allowed for a single request handler to run.
203
217
statistics: A custom `Statistics` instance, allowing the use of non-default configuration.
204
218
configure_logging: If True, the crawler will set up logging infrastructure automatically.
205
219
max_crawl_depth: Maximum crawl depth. If set, the crawler will stop crawling after reaching this depth.
@@ -209,80 +223,86 @@ def __init__(
209
223
_additional_context_managers: Additional context managers used throughout the crawler lifecycle.
210
224
_logger: A logger instance, typically provided by a subclass, for consistent logging labels.
211
225
"""
212
- self ._router : Router [TCrawlingContext ] | None = None
226
+ if configuration :
227
+ service_container .set_configuration (configuration )
228
+ if event_manager :
229
+ service_container .set_event_manager (event_manager )
230
+
231
+ config = service_container .get_configuration ()
232
+
233
+ # Core components
234
+ self ._request_provider = request_provider
235
+ self ._session_pool = session_pool or SessionPool ()
236
+ self ._proxy_configuration = proxy_configuration
237
+ self ._http_client = http_client or HttpxHttpClient ()
213
238
239
+ # Request router setup
240
+ self ._router : Router [TCrawlingContext ] | None = None
214
241
if isinstance (cast (Router , request_handler ), Router ):
215
242
self ._router = cast (Router [TCrawlingContext ], request_handler )
216
243
elif request_handler is not None :
217
244
self ._router = None
218
245
self .router .default_handler (request_handler )
219
246
220
- self ._http_client = http_client or HttpxHttpClient ()
221
-
222
- self ._context_pipeline = (_context_pipeline or ContextPipeline ()).compose (self ._check_url_after_redirects )
223
-
247
+ # Error & failed request handlers
224
248
self ._error_handler : ErrorHandler [TCrawlingContext | BasicCrawlingContext ] | None = None
225
249
self ._failed_request_handler : FailedRequestHandler [TCrawlingContext | BasicCrawlingContext ] | None = None
226
250
251
+ # Context pipeline
252
+ self ._context_pipeline = (_context_pipeline or ContextPipeline ()).compose (self ._check_url_after_redirects )
253
+
254
+ # Crawl settings
227
255
self ._max_request_retries = max_request_retries
228
256
self ._max_requests_per_crawl = max_requests_per_crawl
229
257
self ._max_session_rotations = max_session_rotations
258
+ self ._max_crawl_depth = max_crawl_depth
230
259
231
- self ._request_provider = request_provider
232
-
233
- config = service_container .get_configuration ()
234
-
260
+ # Timeouts
235
261
self ._request_handler_timeout = request_handler_timeout
236
262
self ._internal_timeout = (
237
263
config .internal_timeout
238
264
if config .internal_timeout is not None
239
265
else max (2 * request_handler_timeout , timedelta (minutes = 5 ))
240
266
)
241
267
242
- self ._tld_extractor = TLDExtract (cache_dir = tempfile .TemporaryDirectory ().name )
243
-
244
- self ._event_manager = service_container .get_event_manager ()
245
- self ._snapshotter = Snapshotter (
246
- max_memory_size = ByteSize .from_mb (config .memory_mbytes ) if config .memory_mbytes else None ,
247
- available_memory_ratio = config .available_memory_ratio ,
248
- )
249
- self ._autoscaled_pool = AutoscaledPool (
250
- system_status = SystemStatus (self ._snapshotter ),
251
- is_finished_function = self .__is_finished_function ,
252
- is_task_ready_function = self .__is_task_ready_function ,
253
- run_task_function = self .__run_task_function ,
254
- concurrency_settings = concurrency_settings ,
255
- )
256
-
268
+ # Retry and session settings
257
269
self ._use_session_pool = use_session_pool
258
- self ._session_pool = session_pool or SessionPool ()
259
-
260
270
self ._retry_on_blocked = retry_on_blocked
261
271
272
+ # Logging setup
262
273
if configure_logging :
263
274
root_logger = logging .getLogger ()
264
275
configure_logger (root_logger , remove_old_handlers = True )
265
-
266
- # Silence HTTPX logger
267
- httpx_logger = logging .getLogger ('httpx' )
276
+ httpx_logger = logging .getLogger ('httpx' ) # Silence HTTPX logger
268
277
httpx_logger .setLevel (logging .DEBUG if get_configured_log_level () <= logging .DEBUG else logging .WARNING )
278
+ self ._logger = _logger or logging .getLogger (__name__ )
269
279
270
- if not _logger :
271
- _logger = logging .getLogger (__name__ )
272
-
273
- self ._logger = _logger
274
-
275
- self ._proxy_configuration = proxy_configuration
280
+ # Statistics
276
281
self ._statistics = statistics or Statistics (
277
- event_manager = self ._event_manager ,
278
282
periodic_message_logger = self ._logger ,
279
283
log_message = 'Current request statistics:' ,
280
284
)
285
+
286
+ # Additional context managers to enter and exit
281
287
self ._additional_context_managers = _additional_context_managers or []
282
288
289
+ # Internal, not explicitly configurable components
290
+ self ._tld_extractor = TLDExtract (cache_dir = tempfile .TemporaryDirectory ().name )
291
+ self ._snapshotter = Snapshotter (
292
+ max_memory_size = ByteSize .from_mb (config .memory_mbytes ) if config .memory_mbytes else None ,
293
+ available_memory_ratio = config .available_memory_ratio ,
294
+ )
295
+ self ._autoscaled_pool = AutoscaledPool (
296
+ system_status = SystemStatus (self ._snapshotter ),
297
+ is_finished_function = self .__is_finished_function ,
298
+ is_task_ready_function = self .__is_task_ready_function ,
299
+ run_task_function = self .__run_task_function ,
300
+ concurrency_settings = concurrency_settings ,
301
+ )
302
+
303
+ # State flags
283
304
self ._running = False
284
305
self ._has_finished_before = False
285
- self ._max_crawl_depth = max_crawl_depth
286
306
287
307
self ._failed = False
288
308
self ._abort_on_error = abort_on_error
0 commit comments