@@ -270,23 +270,26 @@ def __init__(
         autoset_encoding: bool = True,
         encoding: Optional[str] = None,
         proxies: Optional[dict] = None,
+        ssl: bool = True,
     ) -> None:
         """Initialize with URL to crawl and any subdirectories to exclude.
 
         Args:
             url: The URL to crawl.
             max_depth: The max depth of the recursive loading.
             use_async: Whether to use asynchronous loading.
-                If True, lazy_load function will not be lazy, but it will still work in the
-                expected way, just not lazy.
+                If ``True``, ``lazy_load()`` will not be lazy, but it will still work in
+                the expected way, just not lazy.
             extractor: A function to extract document contents from raw HTML.
                 When extract function returns an empty string, the document is
                 ignored. Default returns the raw HTML.
             metadata_extractor: A function to extract metadata from args: raw HTML, the
                 source url, and the requests.Response/aiohttp.ClientResponse object
                 (args in that order).
+
                 Default extractor will attempt to use BeautifulSoup4 to extract the
                 title, description and language of the page.
+
                 .. code-block:: python
 
                     import requests
@@ -299,38 +302,54 @@ def simple_metadata_extractor(
                         return {"source": url, "content_type": content_type}
 
             exclude_dirs: A list of subdirectories to exclude.
-            timeout: The timeout for the requests, in the unit of seconds. If None then
-                connection will not timeout.
-            prevent_outside: If True, prevent loading from urls which are not children
+            timeout: The timeout for the requests, in the unit of seconds. If ``None``
+                then connection will not timeout.
+            prevent_outside: If ``True``, prevent loading from urls which are not children
                 of the root url.
             link_regex: Regex for extracting sub-links from the raw html of a web page.
             headers: Default request headers to use for all requests.
-            check_response_status: If True, check HTTP response status and skip
-                URLs with error responses (400-599).
-            continue_on_failure: If True, continue if getting or parsing a link raises
+            check_response_status: If ``True``, check HTTP response status and skip
+                URLs with error responses (``400-599``).
+            continue_on_failure: If ``True``, continue if getting or parsing a link raises
                 an exception. Otherwise, raise the exception.
             base_url: The base url to check for outside links against.
             autoset_encoding: Whether to automatically set the encoding of the response.
-                If True, the encoding of the response will be set to the apparent
-                encoding, unless the `encoding` argument has already been explicitly set.
+                If ``True``, the encoding of the response will be set to the apparent
+                encoding, unless the ``encoding`` argument has already been explicitly set.
             encoding: The encoding of the response. If manually set, the encoding will be
-                set to given value, regardless of the `autoset_encoding` argument.
+                set to given value, regardless of the ``autoset_encoding`` argument.
             proxies: A dictionary mapping protocol names to the proxy URLs to be used for requests.
                 This allows the crawler to route its requests through specified proxy servers.
-                If None, no proxies will be used and requests will go directly to the target URL.
+                If ``None``, no proxies will be used and requests will go directly to the target URL.
+
                 Example usage:
+
                 .. code-block:: python
 
                     proxies = {
                         "http": "http://10.10.1.10:3128",
                         "https": "https://10.10.1.10:1080",
                     }
+
+            ssl: Whether to verify SSL certificates during requests.
+                By default, SSL certificate verification is enabled (``ssl=True``),
+                ensuring secure HTTPS connections. Setting this to ``False`` disables SSL
+                certificate verification, which can be useful when crawling internal
+                services, development environments, or sites with misconfigured or
+                self-signed certificates.
+
+                **Use with caution:** Disabling SSL verification exposes your crawler to
+                man-in-the-middle (MitM) attacks, data tampering, and potential
+                interception of sensitive information. This significantly compromises
+                the security and integrity of the communication. It should never be
+                used in production or when handling sensitive data.
         """  # noqa: E501
 
         self.url = url
         self.max_depth = max_depth if max_depth is not None else 2
         self.use_async = use_async if use_async is not None else False
         self.extractor = extractor if extractor is not None else lambda x: x
+        self.ssl = ssl
         metadata_extractor = (
             metadata_extractor
             if metadata_extractor is not None
@@ -447,7 +466,7 @@ async def _async_get_child_links_recursive(
             session
             if session is not None
             else aiohttp.ClientSession(
-                connector=aiohttp.TCPConnector(ssl=False),
+                connector=aiohttp.TCPConnector(ssl=self.ssl),
                 timeout=aiohttp.ClientTimeout(total=self.timeout),
                 headers=self.headers,
             )
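
For reference, a minimal usage sketch of the new flag (not part of this diff; it assumes the class being modified is ``RecursiveUrlLoader`` from ``langchain_community.document_loaders``, a name and import path not visible in the hunks above):

```python
# Hypothetical usage sketch: assumes the loader in this diff is
# RecursiveUrlLoader and that the new ``ssl`` parameter is wired
# through to aiohttp.TCPConnector as added in the last hunk.
from langchain_community.document_loaders import RecursiveUrlLoader

loader = RecursiveUrlLoader(
    url="https://internal.example.dev/docs/",  # hypothetical internal host
    max_depth=2,
    use_async=True,
    # Default is ssl=True (certificate verification on). Disable only for
    # trusted internal/dev hosts with self-signed certificates.
    ssl=False,
)
docs = loader.load()
```

With ``use_async=True`` the flag reaches the ``aiohttp.TCPConnector(ssl=self.ssl)`` call changed above, which was previously hard-coded to ``ssl=False``.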