@@ -5,7 +5,7 @@


async def main() -> None:
crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser()
crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser()

@crawler.router.default_handler
async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
@@ -18,6 +18,7 @@ async def main() -> None:
# Fetch and process requests from the queue.
while request := await request_list.fetch_next_request():
# Do something with it...
print(f'Processing {request.url}')

# And mark it as handled.
await request_list.mark_request_as_handled(request)
13 changes: 13 additions & 0 deletions docs/guides/code_examples/request_loaders/rl_tandem_example.py
@@ -8,9 +8,11 @@ async def main() -> None:
# Create a static request list.
request_list = RequestList(['https://crawlee.dev', 'https://apify.com'])

# highlight-start
# Convert the request list to a request manager using the to_tandem method.
# It is a tandem with the default request queue.
request_manager = await request_list.to_tandem()
# highlight-end

# Create a crawler and pass the request manager to it.
crawler = ParselCrawler(
@@ -20,9 +22,20 @@ async def main() -> None:

@crawler.router.default_handler
async def handler(context: ParselCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url}')

# New links will be enqueued directly to the queue.
await context.enqueue_links()

# Extract data using Parsel's XPath and CSS selectors.
data = {
'url': context.request.url,
'title': context.selector.xpath('//title/text()').get(),
}

# Push extracted data to the dataset.
await context.push_data(data)

await crawler.run()


@@ -23,9 +23,20 @@ async def main() -> None:

@crawler.router.default_handler
async def handler(context: ParselCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url}')

# New links will be enqueued directly to the queue.
await context.enqueue_links()

# Extract data using Parsel's XPath and CSS selectors.
data = {
'url': context.request.url,
'title': context.selector.xpath('//title/text()').get(),
}

# Push extracted data to the dataset.
await context.push_data(data)

await crawler.run()


29 changes: 29 additions & 0 deletions docs/guides/code_examples/request_loaders/sitemap_basic_example.py
@@ -0,0 +1,29 @@
import asyncio
import re

from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import SitemapRequestLoader


async def main() -> None:
# Create an HTTP client for fetching the sitemap.
http_client = ImpitHttpClient()

# Create a sitemap request loader with filtering rules.
sitemap_loader = SitemapRequestLoader(
sitemap_urls=['https://crawlee.dev/sitemap.xml'],
http_client=http_client,
include=[re.compile(r'.*docs.*')], # Only include URLs containing 'docs'.
max_buffer_size=500, # Keep up to 500 URLs in memory before processing.
)

while request := await sitemap_loader.fetch_next_request():
# Do something with it...
print(f'Processing {request.url}')

# And mark it as handled.
await sitemap_loader.mark_request_as_handled(request)


if __name__ == '__main__':
asyncio.run(main())
28 changes: 0 additions & 28 deletions docs/guides/code_examples/request_loaders/sitemap_example.py

This file was deleted.

69 changes: 41 additions & 28 deletions docs/guides/code_examples/request_loaders/sitemap_tandem_example.py
@@ -2,38 +2,51 @@
import re

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import HttpxHttpClient
from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import SitemapRequestLoader


async def main() -> None:
# Create an HTTP client for fetching sitemaps
async with HttpxHttpClient() as http_client:
# Create a sitemap request loader with URL filtering
sitemap_loader = SitemapRequestLoader(
sitemap_urls=['https://crawlee.dev/sitemap.xml'],
http_client=http_client,
# Include only URLs that contain 'docs'
include=[re.compile(r'.*docs.*')],
max_buffer_size=500, # Buffer up to 500 URLs in memory
)

# Convert the sitemap loader to a request manager using the to_tandem method.
# It is a tandem with the default request queue.
request_manager = await sitemap_loader.to_tandem()

# Create a crawler and pass the request manager to it.
crawler = ParselCrawler(
request_manager=request_manager,
max_requests_per_crawl=10, # Limit the max requests per crawl.
)

@crawler.router.default_handler
async def handler(context: ParselCrawlingContext) -> None:
# New links will be enqueued directly to the queue.
await context.enqueue_links()

await crawler.run()
# Create an HTTP client for fetching the sitemap.
http_client = ImpitHttpClient()

# Create a sitemap request loader with filtering rules.
sitemap_loader = SitemapRequestLoader(
sitemap_urls=['https://crawlee.dev/sitemap.xml'],
http_client=http_client,
include=[re.compile(r'.*docs.*')], # Only include URLs containing 'docs'.
max_buffer_size=500, # Keep up to 500 URLs in memory before processing.
)

# highlight-start
# Convert the sitemap loader into a request manager linked
# to the default request queue.
request_manager = await sitemap_loader.to_tandem()
# highlight-end

# Create a crawler and pass the request manager to it.
crawler = ParselCrawler(
request_manager=request_manager,
max_requests_per_crawl=10, # Limit the max requests per crawl.
)

@crawler.router.default_handler
async def handler(context: ParselCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url}')

# New links will be enqueued directly to the queue.
await context.enqueue_links()

# Extract data using Parsel's XPath and CSS selectors.
data = {
'url': context.request.url,
'title': context.selector.xpath('//title/text()').get(),
}

# Push extracted data to the dataset.
await context.push_data(data)

await crawler.run()


if __name__ == '__main__':
@@ -2,41 +2,52 @@
import re

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import HttpxHttpClient
from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import RequestManagerTandem, SitemapRequestLoader
from crawlee.storages import RequestQueue


async def main() -> None:
# Create an HTTP client for fetching sitemaps
async with HttpxHttpClient() as http_client:
# Create a sitemap request loader with URL filtering
sitemap_loader = SitemapRequestLoader(
sitemap_urls=['https://crawlee.dev/sitemap.xml'],
http_client=http_client,
# Include only URLs that contain 'docs'
include=[re.compile(r'.*docs.*')],
max_buffer_size=500, # Buffer up to 500 URLs in memory
)

# Open the default request queue.
request_queue = await RequestQueue.open()

# And combine them together to a single request manager.
request_manager = RequestManagerTandem(sitemap_loader, request_queue)

# Create a crawler and pass the request manager to it.
crawler = ParselCrawler(
request_manager=request_manager,
max_requests_per_crawl=10, # Limit the max requests per crawl.
)

@crawler.router.default_handler
async def handler(context: ParselCrawlingContext) -> None:
# New links will be enqueued directly to the queue.
await context.enqueue_links()

await crawler.run()
# Create an HTTP client for fetching the sitemap.
http_client = ImpitHttpClient()

# Create a sitemap request loader with filtering rules.
sitemap_loader = SitemapRequestLoader(
sitemap_urls=['https://crawlee.dev/sitemap.xml'],
http_client=http_client,
include=[re.compile(r'.*docs.*')], # Only include URLs containing 'docs'.
max_buffer_size=500, # Keep up to 500 URLs in memory before processing.
)

# Open the default request queue.
request_queue = await RequestQueue.open()

# And combine them together to a single request manager.
request_manager = RequestManagerTandem(sitemap_loader, request_queue)

# Create a crawler and pass the request manager to it.
crawler = ParselCrawler(
request_manager=request_manager,
max_requests_per_crawl=10, # Limit the max requests per crawl.
)

@crawler.router.default_handler
async def handler(context: ParselCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url}')

# New links will be enqueued directly to the queue.
await context.enqueue_links()

# Extract data using Parsel's XPath and CSS selectors.
data = {
'url': context.request.url,
'title': context.selector.xpath('//title/text()').get(),
}

# Push extracted data to the dataset.
await context.push_data(data)

await crawler.run()


if __name__ == '__main__':
6 changes: 5 additions & 1 deletion docs/guides/request_loaders.mdx
@@ -10,7 +10,7 @@ import TabItem from '@theme/TabItem';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import RlBasicExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_basic_example.py';
import SitemapExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_example.py';
import SitemapExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_basic_example.py';
import RlTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_tandem_example.py';
import RlExplicitTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_tandem_example_explicit.py';
import SitemapTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_tandem_example.py';
@@ -102,6 +102,10 @@ RequestManager --|> RequestManagerTandem

The <ApiLink to="class/RequestLoader">`RequestLoader`</ApiLink> interface defines the foundation for fetching requests during a crawl. It provides abstract methods for basic operations like retrieving, marking, and checking the status of requests. Concrete implementations, such as <ApiLink to="class/RequestList">`RequestList`</ApiLink>, build on this interface to handle specific scenarios. You can create your own custom loader that reads from an external file, web endpoint, database, or any other specific data source. For more details, refer to the <ApiLink to="class/RequestLoader">`RequestLoader`</ApiLink> API reference.
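
As an illustration of such a custom loader, here is a minimal sketch that serves URLs from a local text file. The overridden method names and signatures are assumptions inferred from the description above (retrieving, marking, and checking the status of requests); verify them against the `RequestLoader` API reference before relying on this pattern.

```python
from __future__ import annotations

from crawlee import Request
from crawlee.request_loaders import RequestLoader


class FileRequestLoader(RequestLoader):
    """Illustrative loader that reads URLs from a local text file.

    The methods below are assumptions based on the operations described
    above; check the RequestLoader API reference for the exact abstract
    methods and signatures.
    """

    def __init__(self, path: str) -> None:
        super().__init__()
        with open(path) as file:
            self._urls = [line.strip() for line in file if line.strip()]
        self._index = 0
        self._handled = 0

    async def get_total_count(self) -> int:
        return len(self._urls)

    async def is_empty(self) -> bool:
        return self._index >= len(self._urls)

    async def is_finished(self) -> bool:
        return self._handled >= len(self._urls)

    async def fetch_next_request(self) -> Request | None:
        if await self.is_empty():
            return None
        url = self._urls[self._index]
        self._index += 1
        return Request.from_url(url)

    async def mark_request_as_handled(self, request: Request) -> None:
        self._handled += 1

    async def get_handled_count(self) -> int:
        return self._handled
```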

:::info NOTE
To learn how to use request loaders in your crawlers, see the [Request manager tandem](#request-manager-tandem) section below.
:::

### Request list

The <ApiLink to="class/RequestList">`RequestList`</ApiLink> can accept an asynchronous generator as input, allowing requests to be streamed rather than loading them all into memory at once. This can significantly reduce memory usage, especially when working with large sets of URLs.
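
A minimal sketch of the streaming approach, assuming the asynchronous generator can be passed in place of the plain URL list used in the examples above (the URLs here are purely illustrative):

```python
import asyncio
from collections.abc import AsyncGenerator

from crawlee.request_loaders import RequestList


async def stream_urls() -> AsyncGenerator[str, None]:
    # Yield URLs one at a time instead of building the whole list in memory.
    for page in range(1, 1001):
        yield f'https://crawlee.dev/docs/page-{page}'


async def main() -> None:
    # The generator is consumed lazily as requests are fetched.
    request_list = RequestList(stream_urls())

    while request := await request_list.fetch_next_request():
        print(f'Processing {request.url}')
        await request_list.mark_request_as_handled(request)


if __name__ == '__main__':
    asyncio.run(main())
```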
2 changes: 1 addition & 1 deletion docs/guides/service_locator.mdx
@@ -33,7 +33,7 @@ There are three core services that are managed by the service locator:

### Configuration

<ApiLink to="class/Configuration">`Configuration`</ApiLink> is a class that provides access to application-wide settings and parameters. It allows you to configure various aspects of Crawlee, such as timeouts, logging level, persistance intervals, and various other settings. The configuration can be set directly in the code or via environment variables.
<ApiLink to="class/Configuration">`Configuration`</ApiLink> is a class that provides access to application-wide settings and parameters. It allows you to configure various aspects of Crawlee, such as timeouts, logging level, persistence intervals, and various other settings. The configuration can be set directly in the code or via environment variables.
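
For illustration, a minimal sketch of both routes; the `log_level` field and the `CRAWLEE_LOG_LEVEL` environment variable are assumptions used for this example, so check the `Configuration` API reference for the actual names:

```python
import os

from crawlee.configuration import Configuration

# Route 1: set values directly in code (the field name is an assumption
# for this sketch; consult the Configuration reference).
config = Configuration(log_level='DEBUG')

# Route 2: set values through environment variables before Crawlee reads
# its configuration (the variable name is likewise an assumption).
os.environ['CRAWLEE_LOG_LEVEL'] = 'DEBUG'
```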

### StorageClient

1 change: 1 addition & 0 deletions pyproject.toml
@@ -174,6 +174,7 @@ indent-style = "space"
"INP001", # File {filename} is part of an implicit namespace package, add an __init__.py
"F841", # Local variable {variable} is assigned to but never used
"N999", # Invalid module name
"T201", # `print` found
]
"**/docs/examples/code_examples/*crawler_with_error_snapshotter.py" = [
"PLR2004", # Magic value used in comparison. Ignored for simplicity and readability of example code.
3 changes: 3 additions & 0 deletions src/crawlee/fingerprint_suite/_fingerprint_generator.py
@@ -3,10 +3,13 @@
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

from crawlee._utils.docs import docs_group

if TYPE_CHECKING:
from browserforge.fingerprints import Fingerprint


@docs_group('Other')
class FingerprintGenerator(ABC):
"""A class for creating browser fingerprints that mimic browser fingerprints of real users."""
