 from crawlee._types import ConcurrencySettings, HttpMethod, JsonSerializable
 from crawlee.base_storage_client._models import DatasetItemsListPage
+from crawlee.configuration import Configuration
+from crawlee.events._event_manager import EventManager
 from crawlee.http_clients import BaseHttpClient, HttpResponse
 from crawlee.proxy_configuration import ProxyConfiguration, ProxyInfo
 from crawlee.sessions import Session
@@ -94,6 +96,9 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
     """Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs
     or if the website blocks the request."""

+    configuration: NotRequired[Configuration]
+    """Crawler configuration."""
+
     request_handler_timeout: NotRequired[timedelta]
     """Maximum duration allowed for a single request handler to run."""

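For callers that build crawler options as a dict, the new key slots in alongside the existing ones. A minimal sketch, assuming `BasicCrawler` and `BasicCrawlerOptions` are importable from `crawlee.basic_crawler` (an assumption; only `crawlee.configuration.Configuration` is confirmed by this diff):

```python
from datetime import timedelta

from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions  # assumed import path
from crawlee.configuration import Configuration

# Each TypedDict key maps onto the matching keyword argument of the
# BasicCrawler constructor, so the options can be unpacked directly.
options: BasicCrawlerOptions = {
    'configuration': Configuration(),  # its properties become crawler-wide defaults
    'request_handler_timeout': timedelta(minutes=2),
}
crawler = BasicCrawler(**options)
```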
@@ -112,6 +117,9 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
     statistics: NotRequired[Statistics[StatisticsState]]
     """A custom `Statistics` instance, allowing the use of non-default configuration."""

+    event_manager: NotRequired[EventManager]
+    """A custom `EventManager` instance, allowing the use of non-default configuration."""
+
     configure_logging: NotRequired[bool]
     """If True, the crawler will set up logging infrastructure automatically."""

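The same pattern applies to the new `event_manager` key. A sketch, assuming crawlee exposes a concrete `LocalEventManager` under `crawlee.events` (an assumption; the diff itself only confirms the `EventManager` base class):

```python
from crawlee.basic_crawler import BasicCrawler  # assumed import path
from crawlee.events import LocalEventManager  # assumed concrete implementation

# A custom event manager is shared by the crawler and all of its components.
crawler = BasicCrawler(event_manager=LocalEventManager())
```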
@@ -158,126 +166,136 @@ class BasicCrawler(Generic[TCrawlingContext]):
     def __init__(
         self,
         *,
+        configuration: Configuration | None = None,
+        event_manager: EventManager | None = None,
         request_provider: RequestProvider | None = None,
-        request_handler: Callable[[TCrawlingContext], Awaitable[None]] | None = None,
+        session_pool: SessionPool | None = None,
+        proxy_configuration: ProxyConfiguration | None = None,
         http_client: BaseHttpClient | None = None,
-        concurrency_settings: ConcurrencySettings | None = None,
+        request_handler: Callable[[TCrawlingContext], Awaitable[None]] | None = None,
         max_request_retries: int = 3,
         max_requests_per_crawl: int | None = None,
         max_session_rotations: int = 10,
-        request_handler_timeout: timedelta = timedelta(minutes=1),
-        session_pool: SessionPool | None = None,
+        max_crawl_depth: int | None = None,
         use_session_pool: bool = True,
         retry_on_blocked: bool = True,
-        proxy_configuration: ProxyConfiguration | None = None,
+        concurrency_settings: ConcurrencySettings | None = None,
+        request_handler_timeout: timedelta = timedelta(minutes=1),
         statistics: Statistics | None = None,
         configure_logging: bool = True,
-        max_crawl_depth: int | None = None,
         _context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
         _additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None,
         _logger: logging.Logger | None = None,
     ) -> None:
         """A default constructor.

         Args:
+            configuration: The configuration object. Some of its properties are used as defaults for the crawler.
+            event_manager: The event manager for managing events for the crawler and all its components.
             request_provider: Provider for requests to be processed by the crawler.
-            request_handler: A callable responsible for handling requests.
+            session_pool: A custom `SessionPool` instance, allowing the use of non-default configuration.
+            proxy_configuration: HTTP proxy configuration used when making requests.
             http_client: HTTP client used by `BasicCrawlingContext.send_request` and the HTTP-based crawling.
-            concurrency_settings: Settings to fine-tune concurrency levels.
+            request_handler: A callable responsible for handling requests.
             max_request_retries: Maximum number of attempts to process a single request.
             max_requests_per_crawl: Maximum number of pages to open during a crawl. The crawl stops upon reaching
                 this limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means
                 no limit. Due to concurrency settings, the actual number of pages visited may slightly exceed
                 this value.
             max_session_rotations: Maximum number of session rotations per request. The crawler rotates the session
                 if a proxy error occurs or if the website blocks the request.
-            request_handler_timeout: Maximum duration allowed for a single request handler to run.
+            max_crawl_depth: Maximum crawl depth. If set, the crawler will stop crawling after reaching this depth.
             use_session_pool: Enable the use of a session pool for managing sessions during crawling.
-            session_pool: A custom `SessionPool` instance, allowing the use of non-default configuration.
             retry_on_blocked: If True, the crawler attempts to bypass bot protections automatically.
-            proxy_configuration: HTTP proxy configuration used when making requests.
+            concurrency_settings: Settings to fine-tune concurrency levels.
+            request_handler_timeout: Maximum duration allowed for a single request handler to run.
             statistics: A custom `Statistics` instance, allowing the use of non-default configuration.
             configure_logging: If True, the crawler will set up logging infrastructure automatically.
-            max_crawl_depth: Maximum crawl depth. If set, the crawler will stop crawling after reaching this depth.
             _context_pipeline: Enables extending the request lifecycle and modifying the crawling context.
                 Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
             _additional_context_managers: Additional context managers used throughout the crawler lifecycle.
             _logger: A logger instance, typically provided by a subclass, for consistent logging labels.
         """
-        self._router: Router[TCrawlingContext] | None = None
+        if configuration:
+            service_container.set_configuration(configuration)
+        if event_manager:
+            service_container.set_event_manager(event_manager)
+
+        config = service_container.get_configuration()
+
+        # Core components
+        self._request_provider = request_provider
+        self._session_pool = session_pool or SessionPool()
+        self._proxy_configuration = proxy_configuration
+        self._http_client = http_client or HttpxHttpClient()

+        # Request router setup
+        self._router: Router[TCrawlingContext] | None = None
         if isinstance(cast(Router, request_handler), Router):
             self._router = cast(Router[TCrawlingContext], request_handler)
         elif request_handler is not None:
             self._router = None
             self.router.default_handler(request_handler)

-        self._http_client = http_client or HttpxHttpClient()
-
-        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)
-
+        # Error & failed request handlers
         self._error_handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext] | None = None
         self._failed_request_handler: FailedRequestHandler[TCrawlingContext | BasicCrawlingContext] | None = None

+        # Context pipeline
+        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)
+
+        # Crawl settings
         self._max_request_retries = max_request_retries
         self._max_requests_per_crawl = max_requests_per_crawl
         self._max_session_rotations = max_session_rotations
+        self._max_crawl_depth = max_crawl_depth

-        self._request_provider = request_provider
-
-        config = service_container.get_configuration()
-
+        # Timeouts
         self._request_handler_timeout = request_handler_timeout
         self._internal_timeout = (
             config.internal_timeout
             if config.internal_timeout is not None
             else max(2 * request_handler_timeout, timedelta(minutes=5))
         )

-        self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name)
-
-        self._event_manager = service_container.get_event_manager()
-        self._snapshotter = Snapshotter(
-            max_memory_size=ByteSize.from_mb(config.memory_mbytes) if config.memory_mbytes else None,
-            available_memory_ratio=config.available_memory_ratio,
-        )
-        self._autoscaled_pool = AutoscaledPool(
-            system_status=SystemStatus(self._snapshotter),
-            is_finished_function=self.__is_finished_function,
-            is_task_ready_function=self.__is_task_ready_function,
-            run_task_function=self.__run_task_function,
-            concurrency_settings=concurrency_settings,
-        )
-
+        # Retry and session settings
         self._use_session_pool = use_session_pool
-        self._session_pool = session_pool or SessionPool()
-
         self._retry_on_blocked = retry_on_blocked

+        # Logging setup
         if configure_logging:
             root_logger = logging.getLogger()
             configure_logger(root_logger, remove_old_handlers=True)
-
-            # Silence HTTPX logger
-            httpx_logger = logging.getLogger('httpx')
+            httpx_logger = logging.getLogger('httpx')  # Silence HTTPX logger
             httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
+        self._logger = _logger or logging.getLogger(__name__)

-        if not _logger:
-            _logger = logging.getLogger(__name__)
-
-        self._logger = _logger
-
-        self._proxy_configuration = proxy_configuration
+        # Statistics
         self._statistics = statistics or Statistics(
-            event_manager=self._event_manager,
             periodic_message_logger=self._logger,
             log_message='Current request statistics:',
         )
+
+        # Additional context managers to enter and exit
         self._additional_context_managers = _additional_context_managers or []

+        # Internal, not explicitly configurable components
+        self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name)
+        self._snapshotter = Snapshotter(
+            max_memory_size=ByteSize.from_mb(config.memory_mbytes) if config.memory_mbytes else None,
+            available_memory_ratio=config.available_memory_ratio,
+        )
+        self._autoscaled_pool = AutoscaledPool(
+            system_status=SystemStatus(self._snapshotter),
+            is_finished_function=self.__is_finished_function,
+            is_task_ready_function=self.__is_task_ready_function,
+            run_task_function=self.__run_task_function,
+            concurrency_settings=concurrency_settings,
+        )
+
+        # State flags
         self._running = False
         self._has_finished_before = False
-        self._max_crawl_depth = max_crawl_depth

     @property
     def log(self) -> logging.Logger:
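Putting the reordered constructor to work: the injected `configuration` and `event_manager` are registered in the `service_container` before `config = service_container.get_configuration()` reads the settings back, so the crawler and all of its components observe the same instances. An end-to-end sketch under the same import-path assumptions as above; the handler body and seed URL are illustrative:

```python
import asyncio
from datetime import timedelta

from crawlee.basic_crawler import BasicCrawler, BasicCrawlingContext  # assumed import paths
from crawlee.configuration import Configuration
from crawlee.events import LocalEventManager  # assumed concrete implementation


async def main() -> None:
    crawler = BasicCrawler(
        configuration=Configuration(),      # registered via service_container.set_configuration()
        event_manager=LocalEventManager(),  # registered via service_container.set_event_manager()
        request_handler_timeout=timedelta(minutes=2),
    )

    # Register a default handler on the crawler's router, mirroring what
    # passing request_handler= to the constructor does internally.
    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```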