diff --git a/docs/guides/http_crawlers.mdx b/docs/guides/http_crawlers.mdx
new file mode 100644
index 0000000000..c8479699ba
--- /dev/null
+++ b/docs/guides/http_crawlers.mdx
@@ -0,0 +1,35 @@
+---
+id: http-crawlers
+title: HTTP crawlers
+description: Crawlee supports multiple HTTP crawlers that can be used to extract data from server-rendered webpages.
+---
+
+import ApiLink from '@site/src/components/ApiLink';
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+import CodeBlock from '@theme/CodeBlock';
+
+The generic class `AbstractHttpCrawler` is the parent of `BeautifulSoupCrawler`, `ParselCrawler` and `HttpCrawler`, and it can also serve as the base class for your own crawler with custom content-parsing requirements.
+
+It already provides almost all of the functionality needed to crawl webpages. The only missing pieces are the parser used to parse HTTP responses and a context dataclass that defines which context helpers are available to user handler functions.
+
+## `BeautifulSoupCrawler`
+`BeautifulSoupCrawler` uses `BeautifulSoupParser` to parse the HTTP response and makes the result available in `BeautifulSoupCrawlingContext` through the `.soup` or `.parsed_content` attribute.
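+
+The following is a minimal usage sketch. The target URL and the extracted fields are placeholders, and error handling is omitted:
+
+```python
+import asyncio
+
+from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
+
+
+async def main() -> None:
+    crawler = BeautifulSoupCrawler()
+
+    @crawler.router.default_handler
+    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
+        # `context.soup` is the parsed `BeautifulSoup` object for the current page.
+        title = context.soup.title.string if context.soup.title else None
+        await context.push_data({'url': context.request.url, 'title': title})
+
+    await crawler.run(['https://crawlee.dev'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
+```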
+
+## `ParselCrawler`
+`ParselCrawler` uses `ParselParser` to parse the HTTP response and makes the result available in `ParselCrawlingContext` through the `.selector` or `.parsed_content` attribute.
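+
+The setup mirrors the `BeautifulSoupCrawler` sketch above, only the parsed page is exposed as a Parsel `Selector`. A sketch of the handler part only:
+
+```python
+from crawlee.parsel_crawler import ParselCrawler, ParselCrawlingContext
+
+crawler = ParselCrawler()
+
+
+@crawler.router.default_handler
+async def request_handler(context: ParselCrawlingContext) -> None:
+    # `context.selector` is the Parsel `Selector` for the current page.
+    await context.push_data({
+        'url': context.request.url,
+        'title': context.selector.css('title::text').get(),
+    })
+```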
+
+## `HttpCrawler`
+`HttpCrawler` uses `NoParser`, which does not parse the HTTP response at all; use it when no parsing is required.
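+
+Since nothing is parsed, a handler typically works with the raw `HttpResponse` directly. A sketch of the handler part only:
+
+```python
+from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext
+
+crawler = HttpCrawler()
+
+
+@crawler.router.default_handler
+async def request_handler(context: HttpCrawlingContext) -> None:
+    # The raw response body is available without any parsing.
+    body = context.http_response.read()
+    await context.push_data({'url': context.request.url, 'content_length': len(body)})
+```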
+
+## Creating your own HTTP crawler
+### Why?
+Because you want to use a custom parser for HTTP responses, while the rest of the `AbstractHttpCrawler` functionality already suits your needs.
+
+### How?
+You need to define at least two new classes and decide which type the parser's `parse` method will return.
+The parser will inherit from `AbstractHttpParser` and must implement all of its abstract methods.
+The crawler will inherit from `AbstractHttpCrawler` and must implement all of its abstract methods.
+The newly defined parser is then passed to the `parser` argument of the `AbstractHttpCrawler.__init__` method.
+
+To get a better idea, see one of our own HTTP-based crawlers mentioned above, or the minimal sketch below.
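+
+The sketch below shows the shape of such a subclass. The names `MyParseResult`, `MyParser` and `MyCrawler` are placeholders, and the parsing logic is intentionally trivial; the wiring mirrors how `HttpCrawler` plugs `NoParser` into the pipeline:
+
+```python
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from typing_extensions import override
+
+from crawlee.abstract_http_crawler import (
+    AbstractHttpCrawler,
+    AbstractHttpParser,
+    HttpCrawlerOptions,
+    ParsedHttpCrawlingContext,
+)
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+
+    from typing_extensions import Unpack
+
+    from crawlee.http_clients import HttpResponse
+
+# Placeholder parse result type. A real parser would return something richer.
+MyParseResult = bytes
+
+
+class MyParser(AbstractHttpParser[MyParseResult]):
+    """Custom parser implementing the abstract methods of `AbstractHttpParser`."""
+
+    @override
+    async def parse(self, response: HttpResponse) -> MyParseResult:
+        return response.read()
+
+    @override
+    def is_matching_selector(self, parsed_content: MyParseResult, selector: str) -> bool:
+        return False  # Placeholder: report whether `selector` matches the parsed content.
+
+    @override
+    def find_links(self, parsed_content: MyParseResult, selector: str) -> Iterable[str]:
+        return []  # Placeholder: extract link URLs from the parsed content.
+
+
+class MyCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[MyParseResult], MyParseResult]):
+    """Custom crawler that wires `MyParser` into the static content crawler pipeline."""
+
+    def __init__(self, **kwargs: Unpack[HttpCrawlerOptions[ParsedHttpCrawlingContext[MyParseResult]]]) -> None:
+        kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline()
+        super().__init__(parser=MyParser(), **kwargs)
+```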
diff --git a/src/crawlee/abstract_http_crawler/__init__.py b/src/crawlee/abstract_http_crawler/__init__.py
new file mode 100644
index 0000000000..85e3c3b0b7
--- /dev/null
+++ b/src/crawlee/abstract_http_crawler/__init__.py
@@ -0,0 +1,10 @@
+from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
+from ._abstract_http_parser import AbstractHttpParser
+from ._http_crawling_context import ParsedHttpCrawlingContext
+
+__all__ = [
+ 'AbstractHttpCrawler',
+ 'AbstractHttpParser',
+ 'HttpCrawlerOptions',
+ 'ParsedHttpCrawlingContext',
+]
diff --git a/src/crawlee/abstract_http_crawler/_abstract_http_crawler.py b/src/crawlee/abstract_http_crawler/_abstract_http_crawler.py
new file mode 100644
index 0000000000..d5cf1fb3ad
--- /dev/null
+++ b/src/crawlee/abstract_http_crawler/_abstract_http_crawler.py
@@ -0,0 +1,209 @@
+from __future__ import annotations
+
+import logging
+from abc import ABC
+from typing import TYPE_CHECKING, Any, Generic
+
+from pydantic import ValidationError
+from typing_extensions import NotRequired, TypeVar
+
+from crawlee import EnqueueStrategy
+from crawlee._request import BaseRequestData
+from crawlee._utils.docs import docs_group
+from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
+from crawlee.abstract_http_crawler._http_crawling_context import (
+ HttpCrawlingContext,
+ ParsedHttpCrawlingContext,
+ TParseResult,
+)
+from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, ContextPipeline
+from crawlee.errors import SessionError
+from crawlee.http_clients import HttpxHttpClient
+
+if TYPE_CHECKING:
+ from collections.abc import AsyncGenerator, Iterable
+
+ from typing_extensions import Unpack
+
+ from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, EnqueueLinksKwargs
+ from crawlee.abstract_http_crawler._abstract_http_parser import AbstractHttpParser
+
+TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
+
+
+@docs_group('Data structures')
+class HttpCrawlerOptions(Generic[TCrawlingContext], BasicCrawlerOptions[TCrawlingContext]):
+ """Arguments for the `AbstractHttpCrawler` constructor.
+
+ It is intended for typing forwarded `__init__` arguments in the subclasses.
+ """
+
+ additional_http_error_status_codes: NotRequired[Iterable[int]]
+ """Additional HTTP status codes to treat as errors, triggering automatic retries when encountered."""
+
+ ignore_http_error_status_codes: NotRequired[Iterable[int]]
+ """HTTP status codes typically considered errors but to be treated as successful responses."""
+
+
+@docs_group('Abstract classes')
+class AbstractHttpCrawler(Generic[TCrawlingContext, TParseResult], BasicCrawler[TCrawlingContext], ABC):
+ """A web crawler for performing HTTP requests.
+
+ The `AbstractHttpCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features. On top
+ of that it implements the HTTP communication using the HTTP clients. The class allows integration with
+ any HTTP client that implements the `BaseHttpClient` interface. The HTTP client is provided to the crawler
+ as an input parameter to the constructor.
+    `AbstractHttpCrawler` is a generic class. It is expected to be used together with a specific parser that will be
+    used to parse the HTTP response and with the type of `TCrawlingContext` that is made available to the user
+    function. See its specific versions, such as `BeautifulSoupCrawler`, `ParselCrawler` or `HttpCrawler`.
+
+ The HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. However,
+ if you need to execute client-side JavaScript, consider using a browser-based crawler like the `PlaywrightCrawler`.
+ """
+
+ def __init__(
+ self,
+ *,
+ parser: AbstractHttpParser[TParseResult],
+ additional_http_error_status_codes: Iterable[int] = (),
+ ignore_http_error_status_codes: Iterable[int] = (),
+ **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext]],
+ ) -> None:
+ self._parser = parser
+
+ kwargs.setdefault(
+ 'http_client',
+ HttpxHttpClient(
+ additional_http_error_status_codes=additional_http_error_status_codes,
+ ignore_http_error_status_codes=ignore_http_error_status_codes,
+ ),
+ )
+
+ if '_context_pipeline' not in kwargs:
+ raise ValueError(
+ 'Please pass in a `_context_pipeline`. You should use the '
+ 'AbstractHttpCrawler._create_static_content_crawler_pipeline() method to initialize it.'
+ )
+
+ kwargs.setdefault('_logger', logging.getLogger(__name__))
+ super().__init__(**kwargs)
+
+ def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpCrawlingContext[TParseResult]]:
+ """Create static content crawler context pipeline with expected pipeline steps."""
+ return (
+ ContextPipeline()
+ .compose(self._make_http_request)
+ .compose(self._parse_http_response)
+ .compose(self._handle_blocked_request)
+ )
+
+ async def _parse_http_response(
+ self, context: HttpCrawlingContext
+ ) -> AsyncGenerator[ParsedHttpCrawlingContext[TParseResult], None]:
+        """Parse the HTTP response and create a context enhanced by the parsing result and an enqueue links function.
+
+ Args:
+            context: The current crawling context, which includes the HTTP response.
+
+ Yields:
+ The original crawling context enhanced by the parsing result and enqueue links function.
+ """
+ parsed_content = await self._parser.parse(context.http_response)
+ yield ParsedHttpCrawlingContext.from_http_crawling_context(
+ context=context,
+ parsed_content=parsed_content,
+ enqueue_links=self._create_enqueue_links_function(context, parsed_content),
+ )
+
+ def _create_enqueue_links_function(
+ self, context: HttpCrawlingContext, parsed_content: TParseResult
+ ) -> EnqueueLinksFunction:
+ """Create a callback function for extracting links from parsed content and enqueuing them to the crawl.
+
+ Args:
+ context: The current crawling context.
+            parsed_content: The parsed HTTP response.
+
+ Returns:
+            A callable used for extracting links from the parsed content and enqueuing them to the crawl.
+ """
+
+ async def enqueue_links(
+ *,
+ selector: str = 'a',
+ label: str | None = None,
+ user_data: dict[str, Any] | None = None,
+ **kwargs: Unpack[EnqueueLinksKwargs],
+ ) -> None:
+ kwargs.setdefault('strategy', EnqueueStrategy.SAME_HOSTNAME)
+
+ requests = list[BaseRequestData]()
+ user_data = user_data or {}
+ if label is not None:
+ user_data.setdefault('label', label)
+ for link in self._parser.find_links(parsed_content, selector=selector):
+ url = link
+ if not is_url_absolute(url):
+ url = convert_to_absolute_url(context.request.url, url)
+ try:
+ request = BaseRequestData.from_url(url, user_data=user_data)
+ except ValidationError as exc:
+ context.log.debug(
+ f'Skipping URL "{url}" due to invalid format: {exc}. '
+ 'This may be caused by a malformed URL or unsupported URL scheme. '
+ 'Please ensure the URL is correct and retry.'
+ )
+ continue
+
+ requests.append(request)
+
+ await context.add_requests(requests, **kwargs)
+
+ return enqueue_links
+
+ async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:
+        """Make an HTTP request and create a context enhanced by the HTTP response.
+
+ Args:
+ context: The current crawling context.
+
+ Yields:
+ The original crawling context enhanced by HTTP response.
+ """
+ result = await self._http_client.crawl(
+ request=context.request,
+ session=context.session,
+ proxy_info=context.proxy_info,
+ statistics=self._statistics,
+ )
+
+ yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)
+
+ async def _handle_blocked_request(
+ self, context: ParsedHttpCrawlingContext[TParseResult]
+ ) -> AsyncGenerator[ParsedHttpCrawlingContext[TParseResult], None]:
+ """Try to detect if the request is blocked based on the HTTP status code or the parsed response content.
+
+ Args:
+ context: The current crawling context.
+
+ Raises:
+ SessionError: If the request is considered blocked.
+
+ Yields:
+ The original crawling context if no errors are detected.
+ """
+ if self._retry_on_blocked:
+ status_code = context.http_response.status_code
+
+ # TODO: refactor to avoid private member access
+ # https://github.com/apify/crawlee-python/issues/708
+ if (
+ context.session
+ and status_code not in self._http_client._ignore_http_error_status_codes # noqa: SLF001
+ and context.session.is_blocked_status_code(status_code=status_code)
+ ):
+ raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')
+ if blocked_info := self._parser.is_blocked(context.parsed_content):
+ raise SessionError(blocked_info.reason)
+ yield context
diff --git a/src/crawlee/abstract_http_crawler/_abstract_http_parser.py b/src/crawlee/abstract_http_crawler/_abstract_http_parser.py
new file mode 100644
index 0000000000..31e31a4b57
--- /dev/null
+++ b/src/crawlee/abstract_http_crawler/_abstract_http_parser.py
@@ -0,0 +1,81 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Generic
+
+from crawlee._utils.blocked import RETRY_CSS_SELECTORS
+from crawlee._utils.docs import docs_group
+from crawlee.abstract_http_crawler._http_crawling_context import TParseResult
+from crawlee.basic_crawler import BlockedInfo
+
+if TYPE_CHECKING:
+ from collections.abc import Iterable
+
+ from crawlee.http_clients import HttpResponse
+
+
+@docs_group('Abstract classes')
+class AbstractHttpParser(Generic[TParseResult], ABC):
+    """Parser used for parsing HTTP responses and inspecting the parsed result to find links or detect blocking."""
+
+ @abstractmethod
+ async def parse(self, response: HttpResponse) -> TParseResult:
+        """Parse the HTTP response.
+
+ Args:
+ response: HTTP response to be parsed.
+
+ Returns:
+ Parsed HTTP response.
+ """
+
+ def is_blocked(self, parsed_content: TParseResult) -> BlockedInfo:
+ """Detect if blocked and return BlockedInfo with additional information.
+
+        The default implementation expects the `is_matching_selector` abstract method to be implemented.
+        Override this method if your parser detects blocking in a different way.
+
+ Args:
+ parsed_content: Parsed HTTP response. Result of `parse` method.
+
+ Returns:
+            A `BlockedInfo` object with a non-empty `reason` describing the blocking if it was detected. An empty
+            `reason` means that no blocking was detected.
+ """
+ reason = ''
+ if parsed_content is not None:
+ matched_selectors = [
+ selector for selector in RETRY_CSS_SELECTORS if self.is_matching_selector(parsed_content, selector)
+ ]
+
+ if matched_selectors:
+ reason = (
+ f"Assuming the session is blocked - HTTP response matched the following selectors: "
+ f"{'; '.join(matched_selectors)}"
+ )
+
+ return BlockedInfo(reason=reason)
+
+ @abstractmethod
+ def is_matching_selector(self, parsed_content: TParseResult, selector: str) -> bool:
+        """Determine whether the selector matches the parsed content.
+
+ Args:
+ parsed_content: Parsed HTTP response. Result of `parse` method.
+ selector: String used to define matching pattern.
+
+ Returns:
+            True if the selector matches the parsed content.
+ """
+
+ @abstractmethod
+ def find_links(self, parsed_content: TParseResult, selector: str) -> Iterable[str]:
+        """Find all links in the parsed content using the given selector.
+
+ Args:
+ parsed_content: Parsed HTTP response. Result of `parse` method.
+ selector: String used to define matching pattern for finding links.
+
+ Returns:
+            Iterable of the found links as strings.
+ """
diff --git a/src/crawlee/abstract_http_crawler/_http_crawling_context.py b/src/crawlee/abstract_http_crawler/_http_crawling_context.py
new file mode 100644
index 0000000000..7475b85eb3
--- /dev/null
+++ b/src/crawlee/abstract_http_crawler/_http_crawling_context.py
@@ -0,0 +1,44 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, fields
+from typing import Generic
+
+from typing_extensions import Self, TypeVar
+
+from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction
+from crawlee._utils.docs import docs_group
+from crawlee.http_clients import HttpCrawlingResult, HttpResponse
+
+TParseResult = TypeVar('TParseResult')
+
+
+@dataclass(frozen=True)
+@docs_group('Data structures')
+class HttpCrawlingContext(BasicCrawlingContext, HttpCrawlingResult):
+ """The crawling context used by the `AbstractHttpCrawler`."""
+
+ @classmethod
+ def from_basic_crawling_context(cls, context: BasicCrawlingContext, http_response: HttpResponse) -> Self:
+ """Convenience constructor that creates `HttpCrawlingContext` from existing `BasicCrawlingContext`."""
+ context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)}
+ return cls(http_response=http_response, **context_kwargs)
+
+
+@dataclass(frozen=True)
+@docs_group('Data structures')
+class ParsedHttpCrawlingContext(Generic[TParseResult], HttpCrawlingContext):
+ """The crawling context used by `AbstractHttpCrawler`.
+
+ It provides access to key objects as well as utility functions for handling crawling tasks.
+ """
+
+ parsed_content: TParseResult
+ enqueue_links: EnqueueLinksFunction
+
+ @classmethod
+ def from_http_crawling_context(
+ cls, context: HttpCrawlingContext, parsed_content: TParseResult, enqueue_links: EnqueueLinksFunction
+ ) -> Self:
+ """Convenience constructor that creates new context from existing HttpCrawlingContext."""
+ context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)}
+ return cls(parsed_content=parsed_content, enqueue_links=enqueue_links, **context_kwargs)
diff --git a/src/crawlee/basic_crawler/__init__.py b/src/crawlee/basic_crawler/__init__.py
index fb126330ba..adfc6be5ee 100644
--- a/src/crawlee/basic_crawler/__init__.py
+++ b/src/crawlee/basic_crawler/__init__.py
@@ -1,6 +1,7 @@
from crawlee._types import BasicCrawlingContext
from ._basic_crawler import BasicCrawler, BasicCrawlerOptions
+from ._blocked_info import BlockedInfo
from ._context_pipeline import ContextPipeline
-__all__ = ['BasicCrawler', 'BasicCrawlerOptions', 'BasicCrawlingContext', 'ContextPipeline']
+__all__ = ['BasicCrawler', 'BasicCrawlerOptions', 'BasicCrawlingContext', 'BlockedInfo', 'ContextPipeline']
diff --git a/src/crawlee/basic_crawler/_blocked_info.py b/src/crawlee/basic_crawler/_blocked_info.py
new file mode 100644
index 0000000000..aee46d21ad
--- /dev/null
+++ b/src/crawlee/basic_crawler/_blocked_info.py
@@ -0,0 +1,17 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from crawlee._utils.docs import docs_group
+
+
+@docs_group('Data structures')
+@dataclass(frozen=True)
+class BlockedInfo:
+    """Information about whether the crawl is blocked. If the reason is empty, it is not blocked."""
+
+ reason: str
+
+ def __bool__(self) -> bool:
+ """No reason means no blocking."""
+ return bool(self.reason)
diff --git a/src/crawlee/beautifulsoup_crawler/__init__.py b/src/crawlee/beautifulsoup_crawler/__init__.py
index 58a8e98deb..59b0264cc1 100644
--- a/src/crawlee/beautifulsoup_crawler/__init__.py
+++ b/src/crawlee/beautifulsoup_crawler/__init__.py
@@ -1,10 +1,11 @@
try:
- from ._beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupParser
+ from ._beautifulsoup_crawler import BeautifulSoupCrawler
from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
+ from ._beautifulsoup_parser import BeautifulSoupParserType
except ImportError as exc:
raise ImportError(
"To import anything from this subpackage, you need to install the 'beautifulsoup' extra."
"For example, if you use pip, run `pip install 'crawlee[beautifulsoup]'`.",
) from exc
-__all__ = ['BeautifulSoupCrawler', 'BeautifulSoupCrawlingContext', 'BeautifulSoupParser']
+__all__ = ['BeautifulSoupCrawler', 'BeautifulSoupCrawlingContext', 'BeautifulSoupParserType']
diff --git a/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py
index f3d414ca2d..d4e0787502 100644
--- a/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py
+++ b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py
@@ -1,41 +1,30 @@
from __future__ import annotations
-import asyncio
-import logging
-from typing import TYPE_CHECKING, Any, Literal
+from typing import TYPE_CHECKING
-from bs4 import BeautifulSoup, Tag
-from pydantic import ValidationError
+from bs4 import BeautifulSoup
-from crawlee import EnqueueStrategy
-from crawlee._request import BaseRequestData
-from crawlee._utils.blocked import RETRY_CSS_SELECTORS
from crawlee._utils.docs import docs_group
-from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
-from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, ContextPipeline
-from crawlee.beautifulsoup_crawler._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
-from crawlee.errors import SessionError
-from crawlee.http_clients import HttpxHttpClient
-from crawlee.http_crawler import HttpCrawlingContext
+from crawlee.abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
+from crawlee.beautifulsoup_crawler._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType
+
+from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
if TYPE_CHECKING:
- from collections.abc import AsyncGenerator, Iterable
+ from collections.abc import AsyncGenerator
from typing_extensions import Unpack
- from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs
-
-BeautifulSoupParser = Literal['html.parser', 'lxml', 'xml', 'html5lib']
+ from crawlee.abstract_http_crawler import ParsedHttpCrawlingContext
@docs_group('Classes')
-class BeautifulSoupCrawler(BasicCrawler[BeautifulSoupCrawlingContext]):
+class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, BeautifulSoup]):
"""A web crawler for performing HTTP requests and parsing HTML/XML content.
- The `BeautifulSoupCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features.
- On top of that it implements the HTTP communication using the HTTP clients and HTML/XML parsing using the
- `BeautifulSoup` library. The class allows integration with any HTTP client that implements the `BaseHttpClient`
- interface. The HTTP client is provided to the crawler as an input parameter to the constructor.
+ The `BeautifulSoupCrawler` builds on top of the `AbstractHttpCrawler`, which means it inherits all of its features.
+ It specifies its own parser `BeautifulSoupParser` which is used to parse `HttpResponse`.
+ `BeautifulSoupParser` uses following library for parsing: https://pypi.org/project/beautifulsoup4/
The HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. However,
if you need to execute client-side JavaScript, consider using browser-based crawler like the `PlaywrightCrawler`.
@@ -68,172 +57,25 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
def __init__(
self,
*,
- parser: BeautifulSoupParser = 'lxml',
- additional_http_error_status_codes: Iterable[int] = (),
- ignore_http_error_status_codes: Iterable[int] = (),
- **kwargs: Unpack[BasicCrawlerOptions[BeautifulSoupCrawlingContext]],
+ parser: BeautifulSoupParserType = 'lxml',
+ **kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]],
) -> None:
"""A default constructor.
Args:
parser: The type of parser that should be used by `BeautifulSoup`.
- additional_http_error_status_codes: Additional HTTP status codes to treat as errors, triggering
- automatic retries when encountered.
- ignore_http_error_status_codes: HTTP status codes typically considered errors but to be treated
- as successful responses.
- kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
+ kwargs: Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`.
"""
- self._parser = parser
-
- kwargs['_context_pipeline'] = (
- ContextPipeline()
- .compose(self._make_http_request)
- .compose(self._parse_http_response)
- .compose(self._handle_blocked_request)
- )
-
- kwargs.setdefault(
- 'http_client',
- HttpxHttpClient(
- additional_http_error_status_codes=additional_http_error_status_codes,
- ignore_http_error_status_codes=ignore_http_error_status_codes,
- ),
- )
-
- kwargs.setdefault('_logger', logging.getLogger(__name__))
- super().__init__(**kwargs)
+ async def final_step(
+ context: ParsedHttpCrawlingContext[BeautifulSoup],
+ ) -> AsyncGenerator[BeautifulSoupCrawlingContext, None]:
+ """Enhance `ParsedHttpCrawlingContext[BeautifulSoup]` with `soup` property."""
+ yield BeautifulSoupCrawlingContext.from_parsed_http_crawling_context(context)
- async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:
- """Executes an HTTP request using a configured HTTP client.
+ kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline().compose(final_step)
- Args:
- context: The crawling context from the `BasicCrawler`.
-
- Yields:
- The enhanced crawling context with the HTTP response.
- """
- result = await self._http_client.crawl(
- request=context.request,
- session=context.session,
- proxy_info=context.proxy_info,
- statistics=self._statistics,
- )
-
- yield HttpCrawlingContext(
- request=context.request,
- session=context.session,
- proxy_info=context.proxy_info,
- add_requests=context.add_requests,
- send_request=context.send_request,
- push_data=context.push_data,
- get_key_value_store=context.get_key_value_store,
- log=context.log,
- http_response=result.http_response,
- )
-
- async def _handle_blocked_request(
- self,
- context: BeautifulSoupCrawlingContext,
- ) -> AsyncGenerator[BeautifulSoupCrawlingContext, None]:
- """Try to detect if the request is blocked based on the HTTP status code or the response content.
-
- Args:
- context: The current crawling context.
-
- Raises:
- SessionError: If the request is considered blocked.
-
- Yields:
- The original crawling context if no errors are detected.
- """
- if self._retry_on_blocked:
- status_code = context.http_response.status_code
-
- # TODO: refactor to avoid private member access
- # https://github.com/apify/crawlee-python/issues/708
- if (
- context.session
- and status_code not in self._http_client._ignore_http_error_status_codes # noqa: SLF001
- and context.session.is_blocked_status_code(status_code=status_code)
- ):
- raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')
-
- matched_selectors = [
- selector for selector in RETRY_CSS_SELECTORS if context.soup.select_one(selector) is not None
- ]
-
- if matched_selectors:
- raise SessionError(
- 'Assuming the session is blocked - '
- f"HTTP response matched the following selectors: {'; '.join(matched_selectors)}"
- )
-
- yield context
-
- async def _parse_http_response(
- self,
- context: HttpCrawlingContext,
- ) -> AsyncGenerator[BeautifulSoupCrawlingContext, None]:
- """Parse the HTTP response using the `BeautifulSoup` library and implements the `enqueue_links` function.
-
- Args:
- context: The current crawling context.
-
- Yields:
- The enhanced crawling context with the `BeautifulSoup` selector and the `enqueue_links` function.
- """
- soup = await asyncio.to_thread(lambda: BeautifulSoup(context.http_response.read(), self._parser))
-
- async def enqueue_links(
- *,
- selector: str = 'a',
- label: str | None = None,
- user_data: dict[str, Any] | None = None,
- **kwargs: Unpack[EnqueueLinksKwargs],
- ) -> None:
- kwargs.setdefault('strategy', EnqueueStrategy.SAME_HOSTNAME)
-
- requests = list[BaseRequestData]()
- user_data = user_data or {}
-
- link: Tag
- for link in soup.select(selector):
- link_user_data = user_data
-
- if label is not None:
- link_user_data.setdefault('label', label)
-
- if (url := link.attrs.get('href')) is not None:
- url = url.strip()
-
- if not is_url_absolute(url):
- url = convert_to_absolute_url(context.request.url, url)
-
- try:
- request = BaseRequestData.from_url(url, user_data=link_user_data)
- except ValidationError as exc:
- context.log.debug(
- f'Skipping URL "{url}" due to invalid format: {exc}. '
- 'This may be caused by a malformed URL or unsupported URL scheme. '
- 'Please ensure the URL is correct and retry.'
- )
- continue
-
- requests.append(request)
-
- await context.add_requests(requests, **kwargs)
-
- yield BeautifulSoupCrawlingContext(
- request=context.request,
- session=context.session,
- proxy_info=context.proxy_info,
- enqueue_links=enqueue_links,
- add_requests=context.add_requests,
- send_request=context.send_request,
- push_data=context.push_data,
- get_key_value_store=context.get_key_value_store,
- log=context.log,
- http_response=context.http_response,
- soup=soup,
+ super().__init__(
+ parser=BeautifulSoupParser(parser=parser),
+ **kwargs,
)
diff --git a/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py
index 8658782181..f01d66a1c0 100644
--- a/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py
+++ b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py
@@ -1,26 +1,26 @@
-from __future__ import annotations
+from dataclasses import dataclass, fields
-from dataclasses import dataclass
-from typing import TYPE_CHECKING
+from bs4 import BeautifulSoup
+from typing_extensions import Self
-from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction
from crawlee._utils.docs import docs_group
-from crawlee.http_crawler import HttpCrawlingResult
-
-if TYPE_CHECKING:
- from bs4 import BeautifulSoup
+from crawlee.abstract_http_crawler._http_crawling_context import ParsedHttpCrawlingContext
@dataclass(frozen=True)
@docs_group('Data structures')
-class BeautifulSoupCrawlingContext(HttpCrawlingResult, BasicCrawlingContext):
+class BeautifulSoupCrawlingContext(ParsedHttpCrawlingContext[BeautifulSoup]):
"""The crawling context used by the `BeautifulSoupCrawler`.
It provides access to key objects as well as utility functions for handling crawling tasks.
"""
- soup: BeautifulSoup
- """The `BeautifulSoup` object for the current page."""
+ @property
+ def soup(self) -> BeautifulSoup:
+ """Convenience alias."""
+ return self.parsed_content
- enqueue_links: EnqueueLinksFunction
- """The BeautifulSoup `EnqueueLinksFunction` implementation."""
+ @classmethod
+ def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[BeautifulSoup]) -> Self:
+ """Convenience constructor that creates new context from existing `ParsedHttpCrawlingContext[BeautifulSoup]`."""
+ return cls(**{field.name: getattr(context, field.name) for field in fields(context)})
diff --git a/src/crawlee/beautifulsoup_crawler/_beautifulsoup_parser.py b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_parser.py
new file mode 100644
index 0000000000..f523db9e2c
--- /dev/null
+++ b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_parser.py
@@ -0,0 +1,41 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Literal
+
+from bs4 import BeautifulSoup, Tag
+from typing_extensions import override
+
+from crawlee.abstract_http_crawler._abstract_http_parser import AbstractHttpParser
+
+if TYPE_CHECKING:
+ from collections.abc import Iterable
+
+ from crawlee.http_clients import HttpResponse
+
+
+class BeautifulSoupParser(AbstractHttpParser[BeautifulSoup]):
+ """Parser for parsing HTTP response using `BeautifulSoup`."""
+
+ def __init__(self, parser: BeautifulSoupParserType = 'lxml') -> None:
+ self._parser = parser
+
+ @override
+ async def parse(self, response: HttpResponse) -> BeautifulSoup:
+ return BeautifulSoup(response.read(), features=self._parser)
+
+ @override
+ def is_matching_selector(self, parsed_content: BeautifulSoup, selector: str) -> bool:
+ return parsed_content.select_one(selector) is not None
+
+ @override
+ def find_links(self, parsed_content: BeautifulSoup, selector: str) -> Iterable[str]:
+ link: Tag
+ urls: list[str] = []
+ for link in parsed_content.select(selector):
+ url = link.attrs.get('href')
+ if url:
+ urls.append(url.strip())
+ return urls
+
+
+BeautifulSoupParserType = Literal['html.parser', 'lxml', 'xml', 'html5lib']
diff --git a/src/crawlee/http_clients/_base.py b/src/crawlee/http_clients/_base.py
index dcca721184..9012fdc431 100644
--- a/src/crawlee/http_clients/_base.py
+++ b/src/crawlee/http_clients/_base.py
@@ -41,7 +41,7 @@ def read(self) -> bytes:
@dataclass(frozen=True)
@docs_group('Data structures')
class HttpCrawlingResult:
- """Result of a HTTP-only crawl.
+ """Result of an HTTP-only crawl.
Mainly for the purpose of composing specific crawling contexts (e.g. `BeautifulSoupCrawlingContext`,
`ParselCrawlingContext`, ...).
diff --git a/src/crawlee/http_crawler/__init__.py b/src/crawlee/http_crawler/__init__.py
index d3138a180c..0990525000 100644
--- a/src/crawlee/http_crawler/__init__.py
+++ b/src/crawlee/http_crawler/__init__.py
@@ -1,4 +1,10 @@
+from crawlee.abstract_http_crawler._http_crawling_context import HttpCrawlingContext
+from crawlee.http_clients import HttpCrawlingResult
+
from ._http_crawler import HttpCrawler
-from ._http_crawling_context import HttpCrawlingContext, HttpCrawlingResult
-__all__ = ['HttpCrawler', 'HttpCrawlingContext', 'HttpCrawlingResult']
+__all__ = [
+ 'HttpCrawler',
+ 'HttpCrawlingContext',
+ 'HttpCrawlingResult',
+]
diff --git a/src/crawlee/http_crawler/_http_crawler.py b/src/crawlee/http_crawler/_http_crawler.py
index c5813b11b0..94672fe0fe 100644
--- a/src/crawlee/http_crawler/_http_crawler.py
+++ b/src/crawlee/http_crawler/_http_crawler.py
@@ -1,33 +1,23 @@
from __future__ import annotations
-import logging
from typing import TYPE_CHECKING
from crawlee._utils.docs import docs_group
-from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, ContextPipeline
-from crawlee.errors import SessionError
-from crawlee.http_clients import HttpxHttpClient
-from crawlee.http_crawler._http_crawling_context import HttpCrawlingContext
+from crawlee.abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions, ParsedHttpCrawlingContext
-if TYPE_CHECKING:
- from collections.abc import AsyncGenerator, Iterable
+from ._http_parser import NoParser
+if TYPE_CHECKING:
from typing_extensions import Unpack
- from crawlee._types import BasicCrawlingContext
-
@docs_group('Classes')
-class HttpCrawler(BasicCrawler[HttpCrawlingContext]):
- """A web crawler for performing HTTP requests.
+class HttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[bytes], bytes]):
+    """A specific version of the generic `AbstractHttpCrawler`.
- The `HttpCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features. On top
- of that it implements the HTTP communication using the HTTP clients. The class allows integration with
- any HTTP client that implements the `BaseHttpClient` interface. The HTTP client is provided to the crawler
- as an input parameter to the constructor.
-
- The HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. However,
- if you need to execute client-side JavaScript, consider using a browser-based crawler like the `PlaywrightCrawler`.
+ It uses a dummy parser that simply returns the HTTP response body as-is. Use this only if you know what you are
+ doing. In most cases, using an HTML parser would be more beneficial. For such scenarios, consider using
+ `BeautifulSoupCrawler`, `ParselCrawler`, or writing your own subclass of `AbstractHttpCrawler`.
### Usage
@@ -56,86 +46,15 @@ async def request_handler(context: HttpCrawlingContext) -> None:
def __init__(
self,
- *,
- additional_http_error_status_codes: Iterable[int] = (),
- ignore_http_error_status_codes: Iterable[int] = (),
- **kwargs: Unpack[BasicCrawlerOptions[HttpCrawlingContext]],
+ **kwargs: Unpack[HttpCrawlerOptions[ParsedHttpCrawlingContext[bytes]]],
) -> None:
"""A default constructor.
Args:
- additional_http_error_status_codes: Additional HTTP status codes to treat as errors, triggering
- automatic retries when encountered.
- ignore_http_error_status_codes: HTTP status codes typically considered errors but to be treated
- as successful responses.
- kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
- """
- kwargs['_context_pipeline'] = (
- ContextPipeline().compose(self._make_http_request).compose(self._handle_blocked_request)
- )
-
- kwargs.setdefault(
- 'http_client',
- HttpxHttpClient(
- additional_http_error_status_codes=additional_http_error_status_codes,
- ignore_http_error_status_codes=ignore_http_error_status_codes,
- ),
- )
-
- kwargs.setdefault('_logger', logging.getLogger(__name__))
-
- super().__init__(**kwargs)
-
- async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:
- """Executes an HTTP request using a configured HTTP client.
-
- Args:
- context: The crawling context from the `BasicCrawler`.
-
- Yields:
- The enhanced crawling context with the HTTP response.
+ kwargs: Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`.
"""
- result = await self._http_client.crawl(
- request=context.request,
- session=context.session,
- proxy_info=context.proxy_info,
- statistics=self._statistics,
+ kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline()
+ super().__init__(
+ parser=NoParser(),
+ **kwargs,
)
-
- yield HttpCrawlingContext(
- request=context.request,
- session=context.session,
- proxy_info=context.proxy_info,
- add_requests=context.add_requests,
- send_request=context.send_request,
- push_data=context.push_data,
- get_key_value_store=context.get_key_value_store,
- log=context.log,
- http_response=result.http_response,
- )
-
- async def _handle_blocked_request(self, context: HttpCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:
- """Try to detect if the request is blocked based on the HTTP status code.
-
- Args:
- context: The current crawling context.
-
- Raises:
- SessionError: If the request is considered blocked.
-
- Yields:
- The original crawling context if no errors are detected.
- """
- if self._retry_on_blocked:
- status_code = context.http_response.status_code
-
- # TODO: refactor to avoid private member access
- # https://github.com/apify/crawlee-python/issues/708
- if (
- context.session
- and status_code not in self._http_client._ignore_http_error_status_codes # noqa: SLF001
- and context.session.is_blocked_status_code(status_code=status_code)
- ):
- raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')
-
- yield context
diff --git a/src/crawlee/http_crawler/_http_crawling_context.py b/src/crawlee/http_crawler/_http_crawling_context.py
deleted file mode 100644
index 414e9f43bb..0000000000
--- a/src/crawlee/http_crawler/_http_crawling_context.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from __future__ import annotations
-
-from dataclasses import dataclass
-
-from crawlee._types import BasicCrawlingContext
-from crawlee._utils.docs import docs_group
-from crawlee.http_clients import HttpCrawlingResult
-
-
-@dataclass(frozen=True)
-@docs_group('Data structures')
-class HttpCrawlingContext(BasicCrawlingContext, HttpCrawlingResult):
- """The crawling context used by the `HttpCrawler`."""
diff --git a/src/crawlee/http_crawler/_http_parser.py b/src/crawlee/http_crawler/_http_parser.py
new file mode 100644
index 0000000000..47d47f2d3b
--- /dev/null
+++ b/src/crawlee/http_crawler/_http_parser.py
@@ -0,0 +1,36 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from typing_extensions import override
+
+from crawlee.abstract_http_crawler import AbstractHttpParser
+from crawlee.basic_crawler import BlockedInfo
+
+if TYPE_CHECKING:
+ from collections.abc import Iterable
+
+ from crawlee.http_clients import HttpResponse
+
+
+class NoParser(AbstractHttpParser[bytes]):
+ """Dummy parser for backwards compatibility.
+
+    It allows using `HttpCrawler` without the need for an additional specific parser.
+ """
+
+ @override
+ async def parse(self, response: HttpResponse) -> bytes:
+ return response.read()
+
+ @override
+ def is_blocked(self, parsed_content: bytes) -> BlockedInfo: # Intentional unused argument.
+ return BlockedInfo(reason='')
+
+ @override
+ def is_matching_selector(self, parsed_content: bytes, selector: str) -> bool: # Intentional unused argument.
+ return False
+
+ @override
+ def find_links(self, parsed_content: bytes, selector: str) -> Iterable[str]: # Intentional unused argument.
+ return []
diff --git a/src/crawlee/parsel_crawler/_parsel_crawler.py b/src/crawlee/parsel_crawler/_parsel_crawler.py
index d3ba78af41..0d47b4014a 100644
--- a/src/crawlee/parsel_crawler/_parsel_crawler.py
+++ b/src/crawlee/parsel_crawler/_parsel_crawler.py
@@ -1,39 +1,30 @@
from __future__ import annotations
-import asyncio
-import logging
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING
from parsel import Selector
-from pydantic import ValidationError
-from crawlee import EnqueueStrategy
-from crawlee._request import BaseRequestData
-from crawlee._utils.blocked import RETRY_CSS_SELECTORS
from crawlee._utils.docs import docs_group
-from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
-from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, ContextPipeline
-from crawlee.errors import SessionError
-from crawlee.http_clients import HttpxHttpClient
-from crawlee.http_crawler import HttpCrawlingContext
-from crawlee.parsel_crawler._parsel_crawling_context import ParselCrawlingContext
+from crawlee.abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
+from crawlee.parsel_crawler._parsel_parser import ParselParser
+
+from ._parsel_crawling_context import ParselCrawlingContext
if TYPE_CHECKING:
- from collections.abc import AsyncGenerator, Iterable
+ from collections.abc import AsyncGenerator
from typing_extensions import Unpack
- from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs
+ from crawlee.abstract_http_crawler import ParsedHttpCrawlingContext
@docs_group('Classes')
-class ParselCrawler(BasicCrawler[ParselCrawlingContext]):
+class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector]):
"""A web crawler for performing HTTP requests and parsing HTML/XML content.
- The `ParselCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features.
- On top of that it implements the HTTP communication using the HTTP clients and HTML/XML parsing using the
- `Parsel` library. The class allows integration with any HTTP client that implements the `BaseHttpClient`
- interface. The HTTP client is provided to the crawler as an input parameter to the constructor.
+ The `ParselCrawler` builds on top of the `AbstractHttpCrawler`, which means it inherits all of its features.
+    It specifies its own parser, `ParselParser`, which is used to parse the `HttpResponse`.
+    `ParselParser` uses the following library for parsing: https://pypi.org/project/parsel/
The HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. However,
if you need to execute client-side JavaScript, consider using browser-based crawler like the `PlaywrightCrawler`.
@@ -65,172 +56,22 @@ async def request_handler(context: ParselCrawlingContext) -> None:
def __init__(
self,
- *,
- additional_http_error_status_codes: Iterable[int] = (),
- ignore_http_error_status_codes: Iterable[int] = (),
- **kwargs: Unpack[BasicCrawlerOptions[ParselCrawlingContext]],
+ **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]],
) -> None:
"""A default constructor.
Args:
- additional_http_error_status_codes: Additional HTTP status codes to treat as errors, triggering
- automatic retries when encountered.
- ignore_http_error_status_codes: HTTP status codes typically considered errors but to be treated
- as successful responses.
- kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
+ kwargs: Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`.
"""
- kwargs['_context_pipeline'] = (
- ContextPipeline()
- .compose(self._make_http_request)
- .compose(self._parse_http_response)
- .compose(self._handle_blocked_request)
- )
-
- kwargs.setdefault(
- 'http_client',
- HttpxHttpClient(
- additional_http_error_status_codes=additional_http_error_status_codes,
- ignore_http_error_status_codes=ignore_http_error_status_codes,
- ),
- )
-
- kwargs.setdefault('_logger', logging.getLogger(__name__))
-
- super().__init__(**kwargs)
- async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:
- """Executes an HTTP request using a configured HTTP client.
+ async def final_step(
+ context: ParsedHttpCrawlingContext[Selector],
+ ) -> AsyncGenerator[ParselCrawlingContext, None]:
+ """Enhance `ParsedHttpCrawlingContext[Selector]` with a `selector` property."""
+ yield ParselCrawlingContext.from_parsed_http_crawling_context(context)
- Args:
- context: The crawling context from the `BasicCrawler`.
-
- Yields:
- The enhanced crawling context with the HTTP response.
- """
- result = await self._http_client.crawl(
- request=context.request,
- session=context.session,
- proxy_info=context.proxy_info,
- statistics=self._statistics,
- )
-
- yield HttpCrawlingContext(
- request=context.request,
- session=context.session,
- proxy_info=context.proxy_info,
- add_requests=context.add_requests,
- send_request=context.send_request,
- push_data=context.push_data,
- get_key_value_store=context.get_key_value_store,
- log=context.log,
- http_response=result.http_response,
- )
-
- async def _handle_blocked_request(
- self, context: ParselCrawlingContext
- ) -> AsyncGenerator[ParselCrawlingContext, None]:
- """Try to detect if the request is blocked based on the HTTP status code or the response content.
-
- Args:
- context: The current crawling context.
-
- Raises:
- SessionError: If the request is considered blocked.
-
- Yields:
- The original crawling context if no errors are detected.
- """
- if self._retry_on_blocked:
- status_code = context.http_response.status_code
-
- # TODO: refactor to avoid private member access
- # https://github.com/apify/crawlee-python/issues/708
- if (
- context.session
- and status_code not in self._http_client._ignore_http_error_status_codes # noqa: SLF001
- and context.session.is_blocked_status_code(status_code=status_code)
- ):
- raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')
-
- parsel = context.selector
-
- matched_selectors = [
- selector
- for selector in RETRY_CSS_SELECTORS
- if parsel.type in ('html', 'xml') and parsel.css(selector).get() is not None
- ]
-
- if matched_selectors:
- raise SessionError(
- 'Assuming the session is blocked - '
- f"HTTP response matched the following selectors: {'; '.join(matched_selectors)}"
- )
-
- yield context
-
- async def _parse_http_response(
- self,
- context: HttpCrawlingContext,
- ) -> AsyncGenerator[ParselCrawlingContext, None]:
- """Parse the HTTP response using the `Parsel` library and implements the `enqueue_links` function.
-
- Args:
- context: The current crawling context.
-
- Yields:
- The enhanced crawling context with the `Parsel` selector and the `enqueue_links` function.
- """
- parsel_selector = await asyncio.to_thread(lambda: Selector(body=context.http_response.read()))
-
- async def enqueue_links(
- *,
- selector: str = 'a',
- label: str | None = None,
- user_data: dict[str, Any] | None = None,
- **kwargs: Unpack[EnqueueLinksKwargs],
- ) -> None:
- kwargs.setdefault('strategy', EnqueueStrategy.SAME_HOSTNAME)
-
- requests = list[BaseRequestData]()
- user_data = user_data or {}
-
- link: Selector
- for link in parsel_selector.css(selector):
- link_user_data = user_data
-
- if label is not None:
- link_user_data.setdefault('label', label)
-
- if (url := link.xpath('@href').get()) is not None:
- url = url.strip()
-
- if not is_url_absolute(url):
- url = str(convert_to_absolute_url(context.request.url, url))
-
- try:
- request = BaseRequestData.from_url(url, user_data=link_user_data)
- except ValidationError as exc:
- context.log.debug(
- f'Skipping URL "{url}" due to invalid format: {exc}. '
- 'This may be caused by a malformed URL or unsupported URL scheme. '
- 'Please ensure the URL is correct and retry.'
- )
- continue
-
- requests.append(request)
-
- await context.add_requests(requests, **kwargs)
-
- yield ParselCrawlingContext(
- request=context.request,
- session=context.session,
- proxy_info=context.proxy_info,
- enqueue_links=enqueue_links,
- add_requests=context.add_requests,
- send_request=context.send_request,
- push_data=context.push_data,
- get_key_value_store=context.get_key_value_store,
- log=context.log,
- http_response=context.http_response,
- selector=parsel_selector,
+ kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline().compose(final_step)
+ super().__init__(
+ parser=ParselParser(),
+ **kwargs,
)
diff --git a/src/crawlee/parsel_crawler/_parsel_crawling_context.py b/src/crawlee/parsel_crawler/_parsel_crawling_context.py
index 22a565d678..5dd13e3868 100644
--- a/src/crawlee/parsel_crawler/_parsel_crawling_context.py
+++ b/src/crawlee/parsel_crawler/_parsel_crawling_context.py
@@ -1,26 +1,26 @@
-from __future__ import annotations
+from dataclasses import dataclass, fields
-from dataclasses import dataclass
-from typing import TYPE_CHECKING
+from parsel import Selector
+from typing_extensions import Self
-from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction
from crawlee._utils.docs import docs_group
-from crawlee.http_crawler import HttpCrawlingResult
-
-if TYPE_CHECKING:
- from parsel import Selector
+from crawlee.abstract_http_crawler._http_crawling_context import ParsedHttpCrawlingContext
@dataclass(frozen=True)
@docs_group('Data structures')
-class ParselCrawlingContext(HttpCrawlingResult, BasicCrawlingContext):
+class ParselCrawlingContext(ParsedHttpCrawlingContext[Selector]):
"""The crawling context used by the `ParselCrawler`.
It provides access to key objects as well as utility functions for handling crawling tasks.
"""
- selector: Selector
- """The Parsel `Selector` object for the current page."""
+ @property
+ def selector(self) -> Selector:
+ """Convenience alias."""
+ return self.parsed_content
- enqueue_links: EnqueueLinksFunction
- """The Parsel `EnqueueLinksFunction` implementation."""
+ @classmethod
+ def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[Selector]) -> Self:
+        """Convenience constructor that creates new context from existing `ParsedHttpCrawlingContext[Selector]`."""
+ return cls(**{field.name: getattr(context, field.name) for field in fields(context)})
diff --git a/src/crawlee/parsel_crawler/_parsel_parser.py b/src/crawlee/parsel_crawler/_parsel_parser.py
new file mode 100644
index 0000000000..bde77fbfde
--- /dev/null
+++ b/src/crawlee/parsel_crawler/_parsel_parser.py
@@ -0,0 +1,30 @@
+import asyncio
+from collections.abc import Iterable
+
+from parsel import Selector
+from typing_extensions import override
+
+from crawlee.abstract_http_crawler._abstract_http_parser import AbstractHttpParser
+from crawlee.http_clients import HttpResponse
+
+
+class ParselParser(AbstractHttpParser[Selector]):
+ """Parser for parsing HTTP response using Parsel."""
+
+ @override
+ async def parse(self, response: HttpResponse) -> Selector:
+ return await asyncio.to_thread(lambda: Selector(body=response.read()))
+
+ @override
+ def is_matching_selector(self, parsed_content: Selector, selector: str) -> bool:
+ return parsed_content.type in ('html', 'xml') and parsed_content.css(selector).get() is not None
+
+ @override
+ def find_links(self, parsed_content: Selector, selector: str) -> Iterable[str]:
+ link: Selector
+ urls: list[str] = []
+ for link in parsed_content.css(selector):
+ url = link.xpath('@href').get()
+ if url:
+ urls.append(url.strip())
+ return urls
diff --git a/tests/unit/beautifulsoup_crawler/test_beautifulsoup_crawler.py b/tests/unit/beautifulsoup_crawler/test_beautifulsoup_crawler.py
index 726599dd7e..1dabf7770e 100644
--- a/tests/unit/beautifulsoup_crawler/test_beautifulsoup_crawler.py
+++ b/tests/unit/beautifulsoup_crawler/test_beautifulsoup_crawler.py
@@ -8,14 +8,12 @@
from httpx import Response
from crawlee import ConcurrencySettings
-from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler
+from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.storages import RequestList
if TYPE_CHECKING:
from collections.abc import AsyncGenerator
- from crawlee.beautifulsoup_crawler import BeautifulSoupCrawlingContext
-
@pytest.fixture
async def server() -> AsyncGenerator[respx.MockRouter, None]:
diff --git a/tests/unit/http_crawler/test_http_crawler.py b/tests/unit/http_crawler/test_http_crawler.py
index 53c53164d2..c97d15072b 100644
--- a/tests/unit/http_crawler/test_http_crawler.py
+++ b/tests/unit/http_crawler/test_http_crawler.py
@@ -20,8 +20,7 @@
from collections.abc import AsyncGenerator, Awaitable
from crawlee.http_clients._base import BaseHttpClient
- from crawlee.http_crawler._http_crawling_context import HttpCrawlingContext
-
+ from crawlee.http_crawler import HttpCrawlingContext
# Payload, e.g. data for a form submission.
PAYLOAD = {