Merged

Commits (changes shown from 13 of 52 commits)
8c8dd24
WIP
Pijukatel Nov 21, 2024
48812b1
Draft proposal for discussion.
Pijukatel Nov 21, 2024
853ee85
Remove redundant type
Pijukatel Nov 21, 2024
17e08a1
BeautifulSoupParser
Pijukatel Nov 22, 2024
188afdb
Being stuck on mypy and generics
Pijukatel Nov 22, 2024
96356d6
Almost there. Figure out the reason for casts in middleware
Pijukatel Nov 22, 2024
def0e72
Solved BScrawler. Next ParselCrawler.
Pijukatel Nov 26, 2024
54ce154
Reworked ParselCrawler
Pijukatel Nov 26, 2024
4692fe9
Ready for review.
Pijukatel Nov 26, 2024
e2e3cd9
Merge remote-tracking branch 'origin/master' into new-class-hier-curr…
Pijukatel Nov 26, 2024
bb8cd12
Edit forgotten comment.
Pijukatel Nov 26, 2024
f869be6
Remove mistaken edits in docs
Pijukatel Nov 26, 2024
81e46cd
Merge branch 'master' into new-class-hier-current-middleware
Pijukatel Nov 26, 2024
f994e32
Reformat after merge.
Pijukatel Nov 26, 2024
bbc27af
Fix CI reported issues on previous Python versions
Pijukatel Nov 26, 2024
7567164
Update docstrings in child crawlers to not repeat text after parent.
Pijukatel Nov 26, 2024
9335967
Revert incorrect docstring update.
Pijukatel Nov 26, 2024
b4877cb
Review comments
Pijukatel Nov 26, 2024
2929be1
Reverted back name change in doc strings.
Pijukatel Nov 26, 2024
19bc041
Fix CI reported issues.
Pijukatel Nov 26, 2024
fe19345
Fix incorrectly named BS argument
Pijukatel Nov 26, 2024
6ab5a09
Changes by Honza
Pijukatel Nov 27, 2024
2af695b
Polish proposed changes.
Pijukatel Nov 27, 2024
0b0f4ce
Review comments
Pijukatel Nov 27, 2024
03832fb
Review comments about internal imports in docs
Pijukatel Nov 27, 2024
005c7cf
Extract is_matching_selector from Parser and put
Pijukatel Nov 27, 2024
fc2de60
Update src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_cont…
Pijukatel Nov 28, 2024
578cdc0
Update src/crawlee/http_crawler/_http_crawler.py
Pijukatel Nov 28, 2024
280cecb
Update src/crawlee/parsel_crawler/_parsel_crawling_context.py
Pijukatel Nov 28, 2024
a88a5e4
Review comments.
Pijukatel Nov 28, 2024
b1c0fad
Use correctly BeautifulSoupParser type
Pijukatel Nov 28, 2024
4e3fbd5
Add doc page describing new classes.
Pijukatel Nov 28, 2024
9fc66d8
Update docs more
Pijukatel Nov 28, 2024
434bd6b
Apply suggestions from code review
Pijukatel Nov 29, 2024
18562de
Review comments.
Pijukatel Nov 29, 2024
b9255be
More review comments
Pijukatel Nov 29, 2024
d70e8a8
Update docs names
Pijukatel Nov 29, 2024
3e87db5
Update docs/guides/static_content_crawlers.mdx
Pijukatel Dec 3, 2024
460e1ac
Review comments.
Pijukatel Dec 3, 2024
8c4ec82
Review comments
Pijukatel Dec 3, 2024
e7c7817
Apply suggestions from code review
Pijukatel Dec 3, 2024
05cec1a
Rename StaticContentCrawler to AbstractContentCrawler and related fil…
Pijukatel Dec 3, 2024
bed215e
Renaming to AbstractHttpCrawler 2
Pijukatel Dec 3, 2024
c43b564
Renaming to AbstractHttpCrawler 2
Pijukatel Dec 3, 2024
a1db9e2
Apply suggestions from code review
Pijukatel Dec 3, 2024
fae917e
Review comments
Pijukatel Dec 3, 2024
b563bf9
Expand docs by short description of how to create your own HTTP-based c…
Pijukatel Dec 3, 2024
89a8e83
Update src/crawlee/abstract_http_crawler/_abstract_http_crawler.py
Pijukatel Dec 3, 2024
139b21b
Update src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py
Pijukatel Dec 4, 2024
bd7846f
Apply suggestions from code review
Pijukatel Dec 4, 2024
454f9ec
Review comments
Pijukatel Dec 4, 2024
6bba552
Move BlockedInfo to its own file.
Pijukatel Dec 4, 2024
2 changes: 1 addition & 1 deletion src/crawlee/basic_crawler/_basic_crawler.py
@@ -147,7 +147,7 @@ class BasicCrawler(Generic[TCrawlingContext]):

The `BasicCrawler` provides a low-level functionality for crawling websites, allowing users to define their
own page download and data extraction logic. It is designed mostly to be subclassed by crawlers with specific
purposes. In most cases, you will want to use a more specialized crawler, such as `HttpCrawler`,
purposes. In most cases, you will want to use a more specialized crawler, such as `HttpCrawlerGeneric`,
`BeautifulSoupCrawler`, `ParselCrawler`, or `PlaywrightCrawler`. If you are an advanced user and want full
control over the crawling process, you can subclass the `BasicCrawler` and implement the request-handling logic
yourself.
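The updated docstring steers most users toward a specialized crawler rather than subclassing `BasicCrawler`. For orientation, a minimal sketch of that recommended path is below; it expands the truncated usage example shown later in the `BeautifulSoupCrawler` docstring, assumes the usual `crawler.run(...)` entry point with a placeholder start URL, and assumes `BeautifulSoupCrawlingContext` remains exported from `crawlee.beautifulsoup_crawler` as the type alias introduced later in this diff.

```python
import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    # A specialized crawler; subclassing BasicCrawler is only needed for full control.
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        # Store a small record per page; push_data comes from the basic crawling context.
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```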
193 changes: 18 additions & 175 deletions src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py
@@ -1,38 +1,29 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Iterable, Literal

import asyncio
import logging
from typing import TYPE_CHECKING, Any, Literal
from bs4 import BeautifulSoup

from bs4 import BeautifulSoup, Tag
from pydantic import ValidationError
from crawlee.beautifulsoup_crawler._beautifulsoup_parser import BeautifulSoupContentParser
from crawlee.http_crawler import HttpCrawlerGeneric

from crawlee import EnqueueStrategy
from crawlee._request import BaseRequestData
from crawlee._utils.blocked import RETRY_CSS_SELECTORS
from crawlee._utils.docs import docs_group
from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, ContextPipeline
from crawlee.beautifulsoup_crawler._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
from crawlee.errors import SessionError
from crawlee.http_clients import HttpxHttpClient
from crawlee.http_crawler import HttpCrawlingContext

if TYPE_CHECKING:
from collections.abc import AsyncGenerator, Iterable

from typing_extensions import Unpack

from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs
from crawlee.basic_crawler import BasicCrawlerOptions
from crawlee.http_crawler import ParsedHttpCrawlingContext


BeautifulSoupParser = Literal['html.parser', 'lxml', 'xml', 'html5lib']


@docs_group('Classes')
class BeautifulSoupCrawler(BasicCrawler[BeautifulSoupCrawlingContext]):
class BeautifulSoupCrawler(HttpCrawlerGeneric[BeautifulSoup]):
"""A web crawler for performing HTTP requests and parsing HTML/XML content.

The `BeautifulSoupCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features.
The `BeautifulSoupCrawler` builds on top of the `HttpCrawlerGeneric`, which means it inherits all of its features.
It specifies its own parser, `BeautifulSoupParser`, which is used to parse the `HttpResponse`.
On top of that it implements the HTTP communication using the HTTP clients and HTML/XML parsing using the
`BeautifulSoup` library. The class allows integration with any HTTP client that implements the `BaseHttpClient`
interface. The HTTP client is provided to the crawler as an input parameter to the constructor.
@@ -43,13 +34,13 @@ class BeautifulSoupCrawler(BasicCrawler[BeautifulSoupCrawlingContext]):
### Usage

```python
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from bs4 import BeautifulSoup

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler
from crawlee.http_crawler import ParsedHttpCrawlingContext

crawler = BeautifulSoupCrawler()

# Define the default request handler, which will be called for every request.
@crawler.router.default_handler
async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
async def request_handler(context: ParsedHttpCrawlingContext[BeautifulSoup]) -> None:
context.log.info(f'Processing {context.request.url} ...')

# Extract data from the page.
@@ -71,7 +62,7 @@ def __init__(
parser: BeautifulSoupParser = 'lxml',
additional_http_error_status_codes: Iterable[int] = (),
ignore_http_error_status_codes: Iterable[int] = (),
**kwargs: Unpack[BasicCrawlerOptions[BeautifulSoupCrawlingContext]],
**kwargs: Unpack[BasicCrawlerOptions[ParsedHttpCrawlingContext[BeautifulSoup]]],
) -> None:
"""A default constructor.

@@ -83,157 +74,9 @@ def __init__(
as successful responses.
kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
"""
self._parser = parser

kwargs['_context_pipeline'] = (
ContextPipeline()
.compose(self._make_http_request)
.compose(self._parse_http_response)
.compose(self._handle_blocked_request)
)

kwargs.setdefault(
'http_client',
HttpxHttpClient(
additional_http_error_status_codes=additional_http_error_status_codes,
ignore_http_error_status_codes=ignore_http_error_status_codes,
),
)

kwargs.setdefault('_logger', logging.getLogger(__name__))

super().__init__(**kwargs)

async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:
"""Executes an HTTP request using a configured HTTP client.

Args:
context: The crawling context from the `BasicCrawler`.

Yields:
The enhanced crawling context with the HTTP response.
"""
result = await self._http_client.crawl(
request=context.request,
session=context.session,
proxy_info=context.proxy_info,
statistics=self._statistics,
)

yield HttpCrawlingContext(
request=context.request,
session=context.session,
proxy_info=context.proxy_info,
add_requests=context.add_requests,
send_request=context.send_request,
push_data=context.push_data,
get_key_value_store=context.get_key_value_store,
log=context.log,
http_response=result.http_response,
)

async def _handle_blocked_request(
self,
context: BeautifulSoupCrawlingContext,
) -> AsyncGenerator[BeautifulSoupCrawlingContext, None]:
"""Try to detect if the request is blocked based on the HTTP status code or the response content.

Args:
context: The current crawling context.

Raises:
SessionError: If the request is considered blocked.

Yields:
The original crawling context if no errors are detected.
"""
if self._retry_on_blocked:
status_code = context.http_response.status_code

# TODO: refactor to avoid private member access
# https://github.com/apify/crawlee-python/issues/708
if (
context.session
and status_code not in self._http_client._ignore_http_error_status_codes # noqa: SLF001
and context.session.is_blocked_status_code(status_code=status_code)
):
raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')

matched_selectors = [
selector for selector in RETRY_CSS_SELECTORS if context.soup.select_one(selector) is not None
]

if matched_selectors:
raise SessionError(
'Assuming the session is blocked - '
f"HTTP response matched the following selectors: {'; '.join(matched_selectors)}"
)

yield context

async def _parse_http_response(
self,
context: HttpCrawlingContext,
) -> AsyncGenerator[BeautifulSoupCrawlingContext, None]:
"""Parse the HTTP response using the `BeautifulSoup` library and implements the `enqueue_links` function.

Args:
context: The current crawling context.

Yields:
The enhanced crawling context with the `BeautifulSoup` selector and the `enqueue_links` function.
"""
soup = await asyncio.to_thread(lambda: BeautifulSoup(context.http_response.read(), self._parser))

async def enqueue_links(
*,
selector: str = 'a',
label: str | None = None,
user_data: dict[str, Any] | None = None,
**kwargs: Unpack[EnqueueLinksKwargs],
) -> None:
kwargs.setdefault('strategy', EnqueueStrategy.SAME_HOSTNAME)

requests = list[BaseRequestData]()
user_data = user_data or {}

link: Tag
for link in soup.select(selector):
link_user_data = user_data

if label is not None:
link_user_data.setdefault('label', label)

if (url := link.attrs.get('href')) is not None:
url = url.strip()

if not is_url_absolute(url):
url = convert_to_absolute_url(context.request.url, url)

try:
request = BaseRequestData.from_url(url, user_data=link_user_data)
except ValidationError as exc:
context.log.debug(
f'Skipping URL "{url}" due to invalid format: {exc}. '
'This may be caused by a malformed URL or unsupported URL scheme. '
'Please ensure the URL is correct and retry.'
)
continue

requests.append(request)

await context.add_requests(requests, **kwargs)

yield BeautifulSoupCrawlingContext(
request=context.request,
session=context.session,
proxy_info=context.proxy_info,
enqueue_links=enqueue_links,
add_requests=context.add_requests,
send_request=context.send_request,
push_data=context.push_data,
get_key_value_store=context.get_key_value_store,
log=context.log,
http_response=context.http_response,
soup=soup,
super().__init__(
parser=BeautifulSoupContentParser(parser=parser),
additional_http_error_status_codes=additional_http_error_status_codes,
ignore_http_error_status_codes=ignore_http_error_status_codes,
**kwargs,
)
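With the constructor reduced to wiring a `BeautifulSoupContentParser` into `HttpCrawlerGeneric`, the same pattern should extend to other static formats. The sketch below is illustrative only: `JsonParser` and `JsonCrawler` are hypothetical names, and it assumes `HttpCrawlerGeneric` needs nothing beyond the `parser` argument plus the usual `BasicCrawler` keyword options.

```python
from __future__ import annotations

import json
from typing import TYPE_CHECKING, Any, Iterable

from typing_extensions import override

from crawlee.http_crawler import BlockedInfo, HttpCrawlerGeneric, StaticContentParser

if TYPE_CHECKING:
    from crawlee.http_clients import HttpResponse


class JsonParser(StaticContentParser[dict[str, Any]]):
    """Hypothetical parser that treats every HTTP response body as JSON."""

    @override
    async def parse(self, response: HttpResponse) -> dict[str, Any]:
        return json.loads(response.read())

    @override
    def is_blocked(self, parsed_content: dict[str, Any]) -> BlockedInfo:
        # A JSON payload has no HTML to match against RETRY_CSS_SELECTORS,
        # so never report a content-based block.
        return BlockedInfo(reason='')

    @override
    def find_links(self, parsed_content: dict[str, Any], selector: str) -> Iterable[str]:
        # No anchor tags in JSON; enqueue_links would simply find nothing.
        return []


class JsonCrawler(HttpCrawlerGeneric[dict[str, Any]]):
    """Hypothetical crawler assembled the same way BeautifulSoupCrawler is above."""

    def __init__(self, **kwargs: Any) -> None:
        super().__init__(parser=JsonParser(), **kwargs)
```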
src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py
@@ -1,26 +1,5 @@
from __future__ import annotations
from bs4 import BeautifulSoup

from dataclasses import dataclass
from typing import TYPE_CHECKING
from crawlee.http_crawler import ParsedHttpCrawlingContext

from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction
from crawlee._utils.docs import docs_group
from crawlee.http_crawler import HttpCrawlingResult

if TYPE_CHECKING:
from bs4 import BeautifulSoup


@dataclass(frozen=True)
@docs_group('Data structures')
class BeautifulSoupCrawlingContext(HttpCrawlingResult, BasicCrawlingContext):
"""The crawling context used by the `BeautifulSoupCrawler`.
It provides access to key objects as well as utility functions for handling crawling tasks.
"""

soup: BeautifulSoup
"""The `BeautifulSoup` object for the current page."""

enqueue_links: EnqueueLinksFunction
"""The BeautifulSoup `EnqueueLinksFunction` implementation."""
BeautifulSoupCrawlingContext = ParsedHttpCrawlingContext[BeautifulSoup]
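Since the dataclass is replaced by a plain type alias, the two annotations below refer to the same type; this small check assumes both names stay importable from their respective packages after this change.

```python
from bs4 import BeautifulSoup

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawlingContext
from crawlee.http_crawler import ParsedHttpCrawlingContext


# Handlers annotated either way receive the same context object at runtime.
async def handler_a(context: BeautifulSoupCrawlingContext) -> None: ...


async def handler_b(context: ParsedHttpCrawlingContext[BeautifulSoup]) -> None: ...
```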
44 changes: 44 additions & 0 deletions src/crawlee/beautifulsoup_crawler/_beautifulsoup_parser.py
@@ -0,0 +1,44 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Iterable

from bs4 import BeautifulSoup, Tag
from typing_extensions import override

from crawlee._utils.blocked import RETRY_CSS_SELECTORS
from crawlee.http_crawler import BlockedInfo, StaticContentParser

if TYPE_CHECKING:
from crawlee.http_clients import HttpResponse


class BeautifulSoupContentParser(StaticContentParser[BeautifulSoup]):
"""Parser for parsing http response using BeautifulSoup."""

def __init__(self, parser: str = 'lxml') -> None:
self._parser = parser

@override
async def parse(self, response: HttpResponse) -> BeautifulSoup:
return BeautifulSoup(response.read(), features=self._parser)

@override
def is_blocked(self, parsed_content: BeautifulSoup) -> BlockedInfo:
reason = ''
if parsed_content is not None:
matched_selectors = [
selector for selector in RETRY_CSS_SELECTORS if parsed_content.select_one(selector) is not None
]
if matched_selectors:
reason = f"Assuming the session is blocked - HTTP response matched the following selectors: {'; '.join(
matched_selectors)}"
return BlockedInfo(reason=reason)

@override
def find_links(self, parsed_content: BeautifulSoup, selector: str) -> Iterable[str]:
link: Tag
urls: list[str] = []
for link in parsed_content.select(selector):
if (url := link.attrs.get('href')) is not None:
urls.append(url.strip()) # noqa: PERF401 #Mypy has problems using is not None for type inference in list comprehension.
return urls
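The parser's synchronous hooks can be exercised on an already-parsed document, without issuing a request. A quick illustration, assuming `lxml` is installed and the module path stays as in this diff:

```python
from bs4 import BeautifulSoup

from crawlee.beautifulsoup_crawler._beautifulsoup_parser import BeautifulSoupContentParser

content_parser = BeautifulSoupContentParser(parser='lxml')

# find_links strips whitespace around href values before returning them.
soup = BeautifulSoup('<a href=" /next ">next</a>', 'lxml')
print(list(content_parser.find_links(soup, selector='a')))  # ['/next']

# is_blocked reports an empty reason when no RETRY_CSS_SELECTORS match.
print(content_parser.is_blocked(soup).reason)  # ''
```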
4 changes: 2 additions & 2 deletions src/crawlee/http_clients/_httpx.py
@@ -84,10 +84,10 @@ class HttpxHttpClient(BaseHttpClient):

```python
from crawlee.http_clients import HttpxHttpClient
from crawlee.http_crawler import HttpCrawler # or any other HTTP client-based crawler
from crawlee.http_crawler import HttpCrawlerGeneric # or any other HTTP client-based crawler

http_client = HttpxHttpClient()
crawler = HttpCrawler(http_client=http_client)
crawler = HttpCrawlerGeneric(http_client=http_client)
```
"""

4 changes: 2 additions & 2 deletions src/crawlee/http_clients/curl_impersonate.py
@@ -85,10 +85,10 @@ class CurlImpersonateHttpClient(BaseHttpClient):

```python
from crawlee.http_clients.curl_impersonate import CurlImpersonateHttpClient
from crawlee.http_crawler import HttpCrawler # or any other HTTP client-based crawler
from crawlee.http_crawler import HttpCrawlerGeneric # or any other HTTP client-based crawler

http_client = CurlImpersonateHttpClient()
crawler = HttpCrawler(http_client=http_client)
crawler = HttpCrawlerGeneric(http_client=http_client)
```
"""

15 changes: 12 additions & 3 deletions src/crawlee/http_crawler/__init__.py
@@ -1,4 +1,13 @@
from ._http_crawler import HttpCrawler
from ._http_crawling_context import HttpCrawlingContext, HttpCrawlingResult
from ._http_crawler import HttpCrawler, HttpCrawlerGeneric
from ._http_crawling_context import HttpCrawlingContext, HttpCrawlingResult, ParsedHttpCrawlingContext
from ._http_parser import BlockedInfo, StaticContentParser

__all__ = ['HttpCrawler', 'HttpCrawlingContext', 'HttpCrawlingResult']
__all__ = [
'BlockedInfo',
'HttpCrawler',
'HttpCrawlerGeneric',
'HttpCrawlingContext',
'HttpCrawlingResult',
'ParsedHttpCrawlingContext',
'StaticContentParser',
]