Merged
Changes from 44 commits
52 commits
8c8dd24
WIP
Pijukatel Nov 21, 2024
48812b1
Draft proposal for discussion.
Pijukatel Nov 21, 2024
853ee85
Remove redundant type
Pijukatel Nov 21, 2024
17e08a1
BeautifulSoupParser
Pijukatel Nov 22, 2024
188afdb
Being stuck on mypy and generics
Pijukatel Nov 22, 2024
96356d6
Almost there. Figure out the reason for casts in middleware
Pijukatel Nov 22, 2024
def0e72
Solved BScrawler. Next ParselCrawler.
Pijukatel Nov 26, 2024
54ce154
Reworked ParselCrawler
Pijukatel Nov 26, 2024
4692fe9
Ready for review.
Pijukatel Nov 26, 2024
e2e3cd9
Merge remote-tracking branch 'origin/master' into new-class-hier-curr…
Pijukatel Nov 26, 2024
bb8cd12
Edit forgotten comment.
Pijukatel Nov 26, 2024
f869be6
Remove mistaken edits in docs
Pijukatel Nov 26, 2024
81e46cd
Merge branch 'master' into new-class-hier-current-middleware
Pijukatel Nov 26, 2024
f994e32
Reformat after merge.
Pijukatel Nov 26, 2024
bbc27af
Fix CI reported issues on previous Python versions
Pijukatel Nov 26, 2024
7567164
Update docstrings in child crawlers to not repeat text after parent.
Pijukatel Nov 26, 2024
9335967
Revert incorrect docstring update.
Pijukatel Nov 26, 2024
b4877cb
Review comments
Pijukatel Nov 26, 2024
2929be1
Reverted back name change in doc strings.
Pijukatel Nov 26, 2024
19bc041
Fix CI reported issues.
Pijukatel Nov 26, 2024
fe19345
Fix incorrectly named BS argument
Pijukatel Nov 26, 2024
6ab5a09
Changes by Honza
Pijukatel Nov 27, 2024
2af695b
Polish proposed changes,
Pijukatel Nov 27, 2024
0b0f4ce
Review comments
Pijukatel Nov 27, 2024
03832fb
Review comments about internal imports in docs
Pijukatel Nov 27, 2024
005c7cf
Extract is_matching_selector from Parser and put
Pijukatel Nov 27, 2024
fc2de60
Update src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_cont…
Pijukatel Nov 28, 2024
578cdc0
Update src/crawlee/http_crawler/_http_crawler.py
Pijukatel Nov 28, 2024
280cecb
Update src/crawlee/parsel_crawler/_parsel_crawling_context.py
Pijukatel Nov 28, 2024
a88a5e4
Review comments.
Pijukatel Nov 28, 2024
b1c0fad
Use correctly BeautifulSoupParser type
Pijukatel Nov 28, 2024
4e3fbd5
Add doc page describing new classes.
Pijukatel Nov 28, 2024
9fc66d8
Update docs more
Pijukatel Nov 28, 2024
434bd6b
Apply suggestions from code review
Pijukatel Nov 29, 2024
18562de
Review comments.
Pijukatel Nov 29, 2024
b9255be
More review comments
Pijukatel Nov 29, 2024
d70e8a8
Update docs names
Pijukatel Nov 29, 2024
3e87db5
Update docs/guides/static_content_crawlers.mdx
Pijukatel Dec 3, 2024
460e1ac
Review comments.
Pijukatel Dec 3, 2024
8c4ec82
Review comments
Pijukatel Dec 3, 2024
e7c7817
Apply suggestions from code review
Pijukatel Dec 3, 2024
05cec1a
Rename StaticContentCrawler to AbstractContentCrawler and related fil…
Pijukatel Dec 3, 2024
bed215e
Renaming to AbstractHttpCrawler 2
Pijukatel Dec 3, 2024
c43b564
Renaming to AbstractHttpCrawler 2
Pijukatel Dec 3, 2024
a1db9e2
Apply suggestions from code review
Pijukatel Dec 3, 2024
fae917e
Review comments
Pijukatel Dec 3, 2024
b563bf9
Expand docs by short description of how to create your own HTTPbase c…
Pijukatel Dec 3, 2024
89a8e83
Update src/crawlee/abstract_http_crawler/_abstract_http_crawler.py
Pijukatel Dec 3, 2024
139b21b
Update src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py
Pijukatel Dec 4, 2024
bd7846f
Apply suggestions from code review
Pijukatel Dec 4, 2024
454f9ec
Review comments
Pijukatel Dec 4, 2024
6bba552
Move BlockedInfo to its own file.
Pijukatel Dec 4, 2024
24 changes: 24 additions & 0 deletions docs/guides/http_crawlers.mdx
@@ -0,0 +1,24 @@
---
id: http-crawlers
title: HTTP crawlers
description: Crawlee supports multiple HTTP crawlers that can be used to extract data from server-rendered webpages.
---

import ApiLink from '@site/src/components/ApiLink';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import CodeBlock from '@theme/CodeBlock';

The generic class <ApiLink to="class/AbstractHttpCrawler">`AbstractHttpCrawler`</ApiLink> is the parent of <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>, <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> and <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink>, and it can also be used as the parent of your own crawler with custom content-parsing requirements.

It already includes almost all the functionality needed to crawl webpages; the only missing parts are the parser to be used for parsing HTTP responses and a context dataclass that defines which context helpers are available to user handler functions.

## `BeautifulSoupCrawler`
<ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> uses <ApiLink to="class/BeautifulSoupParser">`BeautifulSoupParser`</ApiLink> to parse the HTTP response and makes it available in <ApiLink to="class/BeautifulSoupCrawlingContext">`BeautifulSoupCrawlingContext`</ApiLink> in the `.soup` or `.parsed_content` attribute.

## `ParselCrawler`
<ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> uses <ApiLink to="class/ParselParser">`ParselParser`</ApiLink> to parse the HTTP response and makes it available in <ApiLink to="class/ParselCrawlingContext">`ParselCrawlingContext`</ApiLink> in the `.selector` or `.parsed_content` attribute.

## `HttpCrawler`
<ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink> uses <ApiLink to="class/NoParser">`NoParser`</ApiLink> that does not parse the HTTP response at all and is to be used if no parsing is required.

Collaborator
I mean, this is great and definitely better than nothing. However, it is quite short and might not look good when rendered on the page. For comparison, take a look at guides like HTTP Clients or Result Storages; it should aim for similar depth and verbosity, including usage examples.

This should not be a blocker for merging, as we have been improving the docs all the time. If you decide not to update it now, please open a new issue for it. Thanks.

Collaborator Author

I was scratching my head trying to come up with something for those docs. The problem is that the only example I can think of is implementing your own HTTP-based crawler (the other examples in other files already show how to crawl). But such an example already exists in our code base: BSCrawler and ParselCrawler, so I can just point to those two.
If you think something specific is missing, please let me know and I can add that.
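For reference, a minimal usage example of the kind discussed above might look like the following sketch. It relies only on the `.soup` attribute and the `enqueue_links` helper documented in this PR; the target URL and the stored fields are illustrative.

```python
import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def handler(context: BeautifulSoupCrawlingContext) -> None:
        # The parsed page is exposed as `context.soup` (an alias of `context.parsed_content`).
        title = context.soup.title.string if context.soup.title else None
        await context.push_data({'url': context.request.url, 'title': title})
        # Enqueue links discovered by the parser for further crawling.
        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```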

11 changes: 11 additions & 0 deletions src/crawlee/abstract_http_crawler/__init__.py
@@ -0,0 +1,11 @@
from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
from ._abstract_http_parser import AbstractHttpParser, BlockedInfo
from ._http_crawling_context import ParsedHttpCrawlingContext

__all__ = [
'AbstractHttpCrawler',
'AbstractHttpParser',
'BlockedInfo',
'HttpCrawlerOptions',
'ParsedHttpCrawlingContext',
]
204 changes: 204 additions & 0 deletions src/crawlee/abstract_http_crawler/_abstract_http_crawler.py
@@ -0,0 +1,204 @@
from __future__ import annotations

import logging
from abc import ABC
from typing import TYPE_CHECKING, Any, Generic

from pydantic import ValidationError
from typing_extensions import NotRequired, TypeVar

from crawlee import EnqueueStrategy
from crawlee._request import BaseRequestData
from crawlee._utils.docs import docs_group
from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
from crawlee.abstract_http_crawler._http_crawling_context import (
HttpCrawlingContext,
ParsedHttpCrawlingContext,
TParseResult,
)
from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, ContextPipeline
from crawlee.errors import SessionError
from crawlee.http_clients import HttpxHttpClient

if TYPE_CHECKING:
from collections.abc import AsyncGenerator, Iterable

from typing_extensions import Unpack

from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, EnqueueLinksKwargs
from crawlee.abstract_http_crawler._abstract_http_parser import AbstractHttpParser

TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)


@docs_group('Data structures')
class HttpCrawlerOptions(Generic[TCrawlingContext], BasicCrawlerOptions[TCrawlingContext]):
additional_http_error_status_codes: NotRequired[Iterable[int]]
"""Additional HTTP status codes to treat as errors, triggering automatic retries when encountered."""

ignore_http_error_status_codes: NotRequired[Iterable[int]]
"""HTTP status codes typically considered errors but to be treated as successful responses."""


@docs_group('Abstract classes')
class AbstractHttpCrawler(Generic[TCrawlingContext, TParseResult], BasicCrawler[TCrawlingContext], ABC):
"""A web crawler for performing HTTP requests.

The `AbstractHttpCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features. On top
of that it implements the HTTP communication using the HTTP clients. The class allows integration with
any HTTP client that implements the `BaseHttpClient` interface. The HTTP client is provided to the crawler
as an input parameter to the constructor.
`AbstractHttpCrawler` is a generic class and is expected to be used together with a specific parser that will be used
to parse the HTTP response, and with the `TCrawlingContext` type that is made available to the user handler function.
See its prepared specific versions, for example `BeautifulSoupCrawler`, `ParselCrawler` or `HttpCrawler`.

The HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. However,
if you need to execute client-side JavaScript, consider using a browser-based crawler like the `PlaywrightCrawler`.
"""

def __init__(
self,
*,
parser: AbstractHttpParser[TParseResult],
additional_http_error_status_codes: Iterable[int] = (),
ignore_http_error_status_codes: Iterable[int] = (),
**kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext]],
) -> None:
self._parser = parser

kwargs.setdefault(
'http_client',
HttpxHttpClient(
additional_http_error_status_codes=additional_http_error_status_codes,
ignore_http_error_status_codes=ignore_http_error_status_codes,
),
)

if '_context_pipeline' not in kwargs:
raise ValueError(
'Please pass in a `_context_pipeline`. You should use the '
'AbstractHttpCrawler._create_static_content_crawler_pipeline() method to initialize it.'
)

kwargs.setdefault('_logger', logging.getLogger(__name__))
super().__init__(**kwargs)

def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpCrawlingContext[TParseResult]]:
"""Create static content crawler context pipeline with expected pipeline steps."""
return (
ContextPipeline()
.compose(self._make_http_request)
.compose(self._parse_http_response)
.compose(self._handle_blocked_request)
)

async def _parse_http_response(
self, context: HttpCrawlingContext
) -> AsyncGenerator[ParsedHttpCrawlingContext[TParseResult], None]:
"""Parse http response and create context enhanced by the parsing result and enqueue links function.

Args:
context: The current crawling context, that includes http response.

Yields:
The original crawling context enhanced by the parsing result and enqueue links function.
"""
parsed_content = await self._parser.parse(context.http_response)
yield ParsedHttpCrawlingContext.from_http_crawling_context(
context=context,
parsed_content=parsed_content,
enqueue_links=self._create_enqueue_links_function(context, parsed_content),
)

def _create_enqueue_links_function(
self, context: HttpCrawlingContext, parsed_content: TParseResult
) -> EnqueueLinksFunction:
"""Create a callback function for extracting links from parsed content and enqueuing them to the crawl.

Args:
context: The current crawling context.
parsed_content: The parsed http response.

Returns:
Awaitable that is used for extracting links from parsed content and enqueuing them to the crawl.
"""

async def enqueue_links(
*,
selector: str = 'a',
label: str | None = None,
user_data: dict[str, Any] | None = None,
**kwargs: Unpack[EnqueueLinksKwargs],
) -> None:
kwargs.setdefault('strategy', EnqueueStrategy.SAME_HOSTNAME)

requests = list[BaseRequestData]()
user_data = user_data or {}
if label is not None:
user_data.setdefault('label', label)
for link in self._parser.find_links(parsed_content, selector=selector):
url = link
if not is_url_absolute(url):
url = convert_to_absolute_url(context.request.url, url)
try:
request = BaseRequestData.from_url(url, user_data=user_data)
except ValidationError as exc:
context.log.debug(
f'Skipping URL "{url}" due to invalid format: {exc}. '
'This may be caused by a malformed URL or unsupported URL scheme. '
'Please ensure the URL is correct and retry.'
)
continue

requests.append(request)

await context.add_requests(requests, **kwargs)

return enqueue_links

async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:
"""Make http request and create context enhanced by http response.

Args:
context: The current crawling context.

Yields:
The original crawling context enhanced by http response.
"""
result = await self._http_client.crawl(
request=context.request,
session=context.session,
proxy_info=context.proxy_info,
statistics=self._statistics,
)

yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)

async def _handle_blocked_request(
self, context: ParsedHttpCrawlingContext[TParseResult]
) -> AsyncGenerator[ParsedHttpCrawlingContext[TParseResult], None]:
"""Try to detect if the request is blocked based on the HTTP status code or the parsed response content.

Args:
context: The current crawling context.

Raises:
SessionError: If the request is considered blocked.

Yields:
The original crawling context if no errors are detected.
"""
if self._retry_on_blocked:
status_code = context.http_response.status_code

# TODO: refactor to avoid private member access
# https://github.com/apify/crawlee-python/issues/708
if (
context.session
and status_code not in self._http_client._ignore_http_error_status_codes # noqa: SLF001
and context.session.is_blocked_status_code(status_code=status_code)
):
raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')
if blocked_info := self._parser.is_blocked(context.parsed_content):
raise SessionError(blocked_info.reason)
yield context
93 changes: 93 additions & 0 deletions src/crawlee/abstract_http_crawler/_abstract_http_parser.py
@@ -0,0 +1,93 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import TYPE_CHECKING, Generic

from crawlee._utils.blocked import RETRY_CSS_SELECTORS
from crawlee._utils.docs import docs_group
from crawlee.abstract_http_crawler._http_crawling_context import TParseResult

if TYPE_CHECKING:
from collections.abc import Iterable

from crawlee.http_clients import HttpResponse


@docs_group('Classes')
@dataclass(frozen=True)
class BlockedInfo:
"""Information about whether the crawling is blocked. If reason is empty, then it means it is not blocked."""

reason: str

def __bool__(self) -> bool:
"""No reason means no blocking."""
return bool(self.reason)


@docs_group('Abstract classes')
class AbstractHttpParser(Generic[TParseResult], ABC):
"""Parser used for parsing http response and inspecting parsed result to find links or detect blocking."""

@abstractmethod
async def parse(self, response: HttpResponse) -> TParseResult:
"""Parse http response.

Args:
response: Http response to be parsed.

Returns:
Parsed http response.
"""

def is_blocked(self, parsed_content: TParseResult) -> BlockedInfo:
"""Detect if blocked and return BlockedInfo with additional information.

The default implementation relies on the abstract `is_matching_selector` method being implemented.
Override this method if your parser detects blocking in a different way.

Args:
parsed_content: Parsed http response. Result of parse method.

Returns:
`BlockedInfo` object that contains a non-empty description of the reason if blocking was detected. An empty
reason signifies that no blocking was detected.
"""
reason = ''
if parsed_content is not None:
matched_selectors = [
selector for selector in RETRY_CSS_SELECTORS if self.is_matching_selector(parsed_content, selector)
]

if matched_selectors:
reason = (
f"Assuming the session is blocked - HTTP response matched the following selectors: "
f"{'; '.join(matched_selectors)}"
)

return BlockedInfo(reason=reason)

@abstractmethod
def is_matching_selector(self, parsed_content: TParseResult, selector: str) -> bool:
"""Find if selector has match in parsed content.

Args:
parsed_content: Parsed http response. Result of parse method.
selector: String used to define matching pattern.

Returns:
True if selector has match in parsed content.
"""

@abstractmethod
def find_links(self, parsed_content: TParseResult, selector: str) -> Iterable[str]:
"""Find all links in result using selector.

Args:
parsed_content: Parsed http response. Result of parse method.
selector: String used to define matching pattern for finding links.

Returns:
Iterable of strings that contain found links.
"""
44 changes: 44 additions & 0 deletions src/crawlee/abstract_http_crawler/_http_crawling_context.py
@@ -0,0 +1,44 @@
from __future__ import annotations

from dataclasses import dataclass, fields
from typing import Generic

from typing_extensions import Self, TypeVar

from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction
from crawlee._utils.docs import docs_group
from crawlee.http_clients import HttpCrawlingResult, HttpResponse

TParseResult = TypeVar('TParseResult')


@dataclass(frozen=True)
@docs_group('Data structures')
class HttpCrawlingContext(BasicCrawlingContext, HttpCrawlingResult):
"""The crawling context used by the `AbstractHttpCrawler`."""

@classmethod
def from_basic_crawling_context(cls, context: BasicCrawlingContext, http_response: HttpResponse) -> Self:
"""Convenience constructor that creates HttpCrawlingContext from existing BasicCrawlingContext."""
context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)}
return cls(http_response=http_response, **context_kwargs)


@dataclass(frozen=True)
@docs_group('Data structures')
class ParsedHttpCrawlingContext(Generic[TParseResult], HttpCrawlingContext):
"""The crawling context used by AbstractHttpCrawler.

It provides access to key objects as well as utility functions for handling crawling tasks.
"""

parsed_content: TParseResult
enqueue_links: EnqueueLinksFunction

@classmethod
def from_http_crawling_context(
cls, context: HttpCrawlingContext, parsed_content: TParseResult, enqueue_links: EnqueueLinksFunction
) -> Self:
"""Convenience constructor that creates new context from existing HttpCrawlingContext."""
context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)}
return cls(parsed_content=parsed_content, enqueue_links=enqueue_links, **context_kwargs)
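The `from_*` convenience constructors above follow a simple pattern: copy every field of the existing frozen dataclass via `dataclasses.fields()` and add the new fields on top. The same pattern in isolation, with illustrative names that are not part of crawlee:

```python
from __future__ import annotations

from dataclasses import dataclass, fields

from typing_extensions import Self


@dataclass(frozen=True)
class BaseContext:
    url: str
    label: str


@dataclass(frozen=True)
class ParsedContext(BaseContext):
    parsed_content: str

    @classmethod
    def from_base(cls, context: BaseContext, parsed_content: str) -> Self:
        # Copy every field of the existing context and add the new one on top.
        context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)}
        return cls(parsed_content=parsed_content, **context_kwargs)


base = BaseContext(url='https://example.com', label='detail')
parsed = ParsedContext.from_base(base, parsed_content='<html>…</html>')
assert parsed.url == base.url
```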
5 changes: 3 additions & 2 deletions src/crawlee/beautifulsoup_crawler/__init__.py
@@ -1,10 +1,11 @@
try:
from ._beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupParser
from ._beautifulsoup_crawler import BeautifulSoupCrawler
from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
from ._beautifulsoup_parser import BeautifulSoupParserType
except ImportError as exc:
raise ImportError(
"To import anything from this subpackage, you need to install the 'beautifulsoup' extra."
"For example, if you use pip, run `pip install 'crawlee[beautifulsoup]'`.",
) from exc

__all__ = ['BeautifulSoupCrawler', 'BeautifulSoupCrawlingContext', 'BeautifulSoupParser']
__all__ = ['BeautifulSoupCrawler', 'BeautifulSoupCrawlingContext', 'BeautifulSoupParserType']