diff --git a/docs/guides/code_examples/playwright_crawler_adaptive/handler.py b/docs/guides/code_examples/playwright_crawler_adaptive/handler.py
index ad88e054cd..629b49449e 100644
--- a/docs/guides/code_examples/playwright_crawler_adaptive/handler.py
+++ b/docs/guides/code_examples/playwright_crawler_adaptive/handler.py
@@ -5,7 +5,7 @@
 
 
 async def main() -> None:
-    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser()
+    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser()
 
     @crawler.router.default_handler
     async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
diff --git a/docs/guides/code_examples/request_loaders/rl_basic_example.py b/docs/guides/code_examples/request_loaders/rl_basic_example.py
index abe4d55584..3403673382 100644
--- a/docs/guides/code_examples/request_loaders/rl_basic_example.py
+++ b/docs/guides/code_examples/request_loaders/rl_basic_example.py
@@ -18,6 +18,7 @@ async def main() -> None:
     # Fetch and process requests from the queue.
     while request := await request_list.fetch_next_request():
         # Do something with it...
+        print(f'Processing {request.url}')
 
         # And mark it as handled.
         await request_list.mark_request_as_handled(request)
diff --git a/docs/guides/code_examples/request_loaders/rl_tandem_example.py b/docs/guides/code_examples/request_loaders/rl_tandem_example.py
index eddb63af9a..d71345b420 100644
--- a/docs/guides/code_examples/request_loaders/rl_tandem_example.py
+++ b/docs/guides/code_examples/request_loaders/rl_tandem_example.py
@@ -8,9 +8,11 @@ async def main() -> None:
     # Create a static request list.
     request_list = RequestList(['https://crawlee.dev', 'https://apify.com'])
 
+    # highlight-start
     # Convert the request list to a request manager using the to_tandem method.
     # It is a tandem with the default request queue.
     request_manager = await request_list.to_tandem()
+    # highlight-end
 
     # Create a crawler and pass the request manager to it.
     crawler = ParselCrawler(
@@ -20,9 +22,20 @@ async def main() -> None:
 
     @crawler.router.default_handler
     async def handler(context: ParselCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url}')
+
         # New links will be enqueued directly to the queue.
         await context.enqueue_links()
 
+        # Extract data using Parsel's XPath and CSS selectors.
+        data = {
+            'url': context.request.url,
+            'title': context.selector.xpath('//title/text()').get(),
+        }
+
+        # Push extracted data to the dataset.
+        await context.push_data(data)
+
     await crawler.run()
 
 
diff --git a/docs/guides/code_examples/request_loaders/rl_tandem_example_explicit.py b/docs/guides/code_examples/request_loaders/rl_tandem_example_explicit.py
index 7972804d76..f3397b7043 100644
--- a/docs/guides/code_examples/request_loaders/rl_tandem_example_explicit.py
+++ b/docs/guides/code_examples/request_loaders/rl_tandem_example_explicit.py
@@ -23,9 +23,20 @@ async def main() -> None:
 
     @crawler.router.default_handler
     async def handler(context: ParselCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url}')
+
         # New links will be enqueued directly to the queue.
         await context.enqueue_links()
 
+        # Extract data using Parsel's XPath and CSS selectors.
+        data = {
+            'url': context.request.url,
+            'title': context.selector.xpath('//title/text()').get(),
+        }
+
+        # Push extracted data to the dataset.
+        await context.push_data(data)
+
     await crawler.run()
 
 
diff --git a/docs/guides/code_examples/request_loaders/sitemap_basic_example.py b/docs/guides/code_examples/request_loaders/sitemap_basic_example.py
new file mode 100644
index 0000000000..0b367e4710
--- /dev/null
+++ b/docs/guides/code_examples/request_loaders/sitemap_basic_example.py
@@ -0,0 +1,29 @@
+import asyncio
+import re
+
+from crawlee.http_clients import ImpitHttpClient
+from crawlee.request_loaders import SitemapRequestLoader
+
+
+async def main() -> None:
+    # Create an HTTP client for fetching the sitemap.
+    http_client = ImpitHttpClient()
+
+    # Create a sitemap request loader with filtering rules.
+    sitemap_loader = SitemapRequestLoader(
+        sitemap_urls=['https://crawlee.dev/sitemap.xml'],
+        http_client=http_client,
+        include=[re.compile(r'.*docs.*')],  # Only include URLs containing 'docs'.
+        max_buffer_size=500,  # Keep up to 500 URLs in memory before processing.
+    )
+
+    while request := await sitemap_loader.fetch_next_request():
+        # Do something with it...
+        print(f'Processing {request.url}')
+
+        # And mark it as handled.
+        await sitemap_loader.mark_request_as_handled(request)
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/docs/guides/code_examples/request_loaders/sitemap_example.py b/docs/guides/code_examples/request_loaders/sitemap_example.py
deleted file mode 100644
index 3f8b1c8377..0000000000
--- a/docs/guides/code_examples/request_loaders/sitemap_example.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import asyncio
-import re
-
-from crawlee.http_clients import ImpitHttpClient
-from crawlee.request_loaders import SitemapRequestLoader
-
-
-async def main() -> None:
-    # Create an HTTP client for fetching sitemaps
-    async with ImpitHttpClient() as http_client:
-        # Create a sitemap request loader with URL filtering
-        sitemap_loader = SitemapRequestLoader(
-            sitemap_urls=['https://crawlee.dev/sitemap.xml'],
-            http_client=http_client,
-            # Exclude all URLs that do not contain 'blog'
-            exclude=[re.compile(r'^((?!blog).)*$')],
-            max_buffer_size=500,  # Buffer up to 500 URLs in memory
-        )
-
-        while request := await sitemap_loader.fetch_next_request():
-            # Do something with it...
-
-            # And mark it as handled.
-            await sitemap_loader.mark_request_as_handled(request)
-
-
-if __name__ == '__main__':
-    asyncio.run(main())
diff --git a/docs/guides/code_examples/request_loaders/sitemap_tandem_example.py b/docs/guides/code_examples/request_loaders/sitemap_tandem_example.py
index 61608112e4..bf5fc012b4 100644
--- a/docs/guides/code_examples/request_loaders/sitemap_tandem_example.py
+++ b/docs/guides/code_examples/request_loaders/sitemap_tandem_example.py
@@ -2,38 +2,51 @@
 import re
 
 from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
-from crawlee.http_clients import HttpxHttpClient
+from crawlee.http_clients import ImpitHttpClient
 from crawlee.request_loaders import SitemapRequestLoader
 
 
 async def main() -> None:
-    # Create an HTTP client for fetching sitemaps
-    async with HttpxHttpClient() as http_client:
-        # Create a sitemap request loader with URL filtering
-        sitemap_loader = SitemapRequestLoader(
-            sitemap_urls=['https://crawlee.dev/sitemap.xml'],
-            http_client=http_client,
-            # Include only URLs that contain 'docs'
-            include=[re.compile(r'.*docs.*')],
-            max_buffer_size=500,  # Buffer up to 500 URLs in memory
-        )
-
-        # Convert the sitemap loader to a request manager using the to_tandem method.
-        # It is a tandem with the default request queue.
-        request_manager = await sitemap_loader.to_tandem()
-
-        # Create a crawler and pass the request manager to it.
-        crawler = ParselCrawler(
-            request_manager=request_manager,
-            max_requests_per_crawl=10,  # Limit the max requests per crawl.
-        )
-
-        @crawler.router.default_handler
-        async def handler(context: ParselCrawlingContext) -> None:
-            # New links will be enqueued directly to the queue.
-            await context.enqueue_links()
-
-        await crawler.run()
+    # Create an HTTP client for fetching the sitemap.
+    http_client = ImpitHttpClient()
+
+    # Create a sitemap request loader with filtering rules.
+    sitemap_loader = SitemapRequestLoader(
+        sitemap_urls=['https://crawlee.dev/sitemap.xml'],
+        http_client=http_client,
+        include=[re.compile(r'.*docs.*')],  # Only include URLs containing 'docs'.
+        max_buffer_size=500,  # Keep up to 500 URLs in memory before processing.
+    )
+
+    # highlight-start
+    # Convert the sitemap loader into a request manager linked
+    # to the default request queue.
+    request_manager = await sitemap_loader.to_tandem()
+    # highlight-end
+
+    # Create a crawler and pass the request manager to it.
+    crawler = ParselCrawler(
+        request_manager=request_manager,
+        max_requests_per_crawl=10,  # Limit the max requests per crawl.
+    )
+
+    @crawler.router.default_handler
+    async def handler(context: ParselCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url}')
+
+        # New links will be enqueued directly to the queue.
+        await context.enqueue_links()
+
+        # Extract data using Parsel's XPath and CSS selectors.
+        data = {
+            'url': context.request.url,
+            'title': context.selector.xpath('//title/text()').get(),
+        }
+
+        # Push extracted data to the dataset.
+        await context.push_data(data)
+
+    await crawler.run()
 
 
 if __name__ == '__main__':
diff --git a/docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py b/docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py
index 5089fe8902..4b121c9b40 100644
--- a/docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py
+++ b/docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py
@@ -2,41 +2,52 @@
 import re
 
 from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
-from crawlee.http_clients import HttpxHttpClient
+from crawlee.http_clients import ImpitHttpClient
 from crawlee.request_loaders import RequestManagerTandem, SitemapRequestLoader
 from crawlee.storages import RequestQueue
 
 
 async def main() -> None:
-    # Create an HTTP client for fetching sitemaps
-    async with HttpxHttpClient() as http_client:
-        # Create a sitemap request loader with URL filtering
-        sitemap_loader = SitemapRequestLoader(
-            sitemap_urls=['https://crawlee.dev/sitemap.xml'],
-            http_client=http_client,
-            # Include only URLs that contain 'docs'
-            include=[re.compile(r'.*docs.*')],
-            max_buffer_size=500,  # Buffer up to 500 URLs in memory
-        )
-
-        # Open the default request queue.
-        request_queue = await RequestQueue.open()
-
-        # And combine them together to a single request manager.
-        request_manager = RequestManagerTandem(sitemap_loader, request_queue)
-
-        # Create a crawler and pass the request manager to it.
-        crawler = ParselCrawler(
-            request_manager=request_manager,
-            max_requests_per_crawl=10,  # Limit the max requests per crawl.
-        )
-
-        @crawler.router.default_handler
-        async def handler(context: ParselCrawlingContext) -> None:
-            # New links will be enqueued directly to the queue.
-            await context.enqueue_links()
-
-        await crawler.run()
+    # Create an HTTP client for fetching the sitemap.
+    http_client = ImpitHttpClient()
+
+    # Create a sitemap request loader with filtering rules.
+    sitemap_loader = SitemapRequestLoader(
+        sitemap_urls=['https://crawlee.dev/sitemap.xml'],
+        http_client=http_client,
+        include=[re.compile(r'.*docs.*')],  # Only include URLs containing 'docs'.
+        max_buffer_size=500,  # Keep up to 500 URLs in memory before processing.
+    )
+
+    # Open the default request queue.
+    request_queue = await RequestQueue.open()
+
+    # And combine them together to a single request manager.
+    request_manager = RequestManagerTandem(sitemap_loader, request_queue)
+
+    # Create a crawler and pass the request manager to it.
+    crawler = ParselCrawler(
+        request_manager=request_manager,
+        max_requests_per_crawl=10,  # Limit the max requests per crawl.
+    )
+
+    @crawler.router.default_handler
+    async def handler(context: ParselCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url}')
+
+        # New links will be enqueued directly to the queue.
+        await context.enqueue_links()
+
+        # Extract data using Parsel's XPath and CSS selectors.
+        data = {
+            'url': context.request.url,
+            'title': context.selector.xpath('//title/text()').get(),
+        }
+
+        # Push extracted data to the dataset.
+        await context.push_data(data)
+
+    await crawler.run()
 
 
 if __name__ == '__main__':
diff --git a/docs/guides/request_loaders.mdx b/docs/guides/request_loaders.mdx
index ce5c0d13cf..e3a19be46c 100644
--- a/docs/guides/request_loaders.mdx
+++ b/docs/guides/request_loaders.mdx
@@ -10,7 +10,7 @@ import TabItem from '@theme/TabItem';
 import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
 
 import RlBasicExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_basic_example.py';
-import SitemapExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_example.py';
+import SitemapExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_basic_example.py';
 import RlTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_tandem_example.py';
 import RlExplicitTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_tandem_example_explicit.py';
 import SitemapTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_tandem_example.py';
@@ -102,6 +102,10 @@ RequestManager --|> RequestManagerTandem
 
 The `RequestLoader` interface defines the foundation for fetching requests during a crawl. It provides abstract methods for basic operations like retrieving, marking, and checking the status of requests. Concrete implementations, such as `RequestList`, build on this interface to handle specific scenarios. You can create your own custom loader that reads from an external file, web endpoint, database, or any other specific data source. For more details, refer to the `RequestLoader` API reference.
 
+:::info NOTE
+To learn how to use request loaders in your crawlers, see the [Request manager tandem](#request-manager-tandem) section below.
+:::
+
 ### Request list
 
 The `RequestList` can accept an asynchronous generator as input, allowing requests to be streamed rather than loading them all into memory at once. This can significantly reduce memory usage, especially when working with large sets of URLs.
diff --git a/docs/guides/service_locator.mdx b/docs/guides/service_locator.mdx
index aa046b3bba..fe10ce50c2 100644
--- a/docs/guides/service_locator.mdx
+++ b/docs/guides/service_locator.mdx
@@ -33,7 +33,7 @@ There are three core services that are managed by the service locator:
 
 ### Configuration
 
-`Configuration` is a class that provides access to application-wide settings and parameters. It allows you to configure various aspects of Crawlee, such as timeouts, logging level, persistance intervals, and various other settings. The configuration can be set directly in the code or via environment variables.
+`Configuration` is a class that provides access to application-wide settings and parameters. It allows you to configure various aspects of Crawlee, such as timeouts, logging level, persistence intervals, and various other settings. The configuration can be set directly in the code or via environment variables.
 
 ### StorageClient
 
diff --git a/pyproject.toml b/pyproject.toml
index 11d9f76102..0559b9310d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -174,6 +174,7 @@ indent-style = "space"
     "INP001", # File {filename} is part of an implicit namespace package, add an __init__.py
     "F841", # Local variable {variable} is assigned to but never used
     "N999", # Invalid module name
+    "T201", # `print` found
 ]
 "**/docs/examples/code_examples/*crawler_with_error_snapshotter.py" = [
     "PLR2004", # Magic value used in comparison. Ignored for simplicity and readability of example code.
diff --git a/src/crawlee/fingerprint_suite/_fingerprint_generator.py b/src/crawlee/fingerprint_suite/_fingerprint_generator.py
index 3dd7188a61..05db985fe2 100644
--- a/src/crawlee/fingerprint_suite/_fingerprint_generator.py
+++ b/src/crawlee/fingerprint_suite/_fingerprint_generator.py
@@ -3,10 +3,13 @@
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING
 
+from crawlee._utils.docs import docs_group
+
 if TYPE_CHECKING:
     from browserforge.fingerprints import Fingerprint
 
 
+@docs_group('Other')
 class FingerprintGenerator(ABC):
     """A class for creating browser fingerprints that mimic browser fingerprints of real users."""