Merged
Changes from 40 commits
Commits
52 commits
8c8dd24
WIP
Pijukatel Nov 21, 2024
48812b1
Draft proposal for discussion.
Pijukatel Nov 21, 2024
853ee85
Remove redundant type
Pijukatel Nov 21, 2024
17e08a1
BeautifulSoupParser
Pijukatel Nov 22, 2024
188afdb
Being stuck on mypy and generics
Pijukatel Nov 22, 2024
96356d6
Almost there. Figure out the reason for casts in middleware
Pijukatel Nov 22, 2024
def0e72
Solved BScrawler. Next ParselCrawler.
Pijukatel Nov 26, 2024
54ce154
Reworked ParselCrawler
Pijukatel Nov 26, 2024
4692fe9
Ready for review.
Pijukatel Nov 26, 2024
e2e3cd9
Merge remote-tracking branch 'origin/master' into new-class-hier-curr…
Pijukatel Nov 26, 2024
bb8cd12
Edit forgotten comment .
Pijukatel Nov 26, 2024
f869be6
Remove mistaken edits in docs
Pijukatel Nov 26, 2024
81e46cd
Merge branch 'master' into new-class-hier-current-middleware
Pijukatel Nov 26, 2024
f994e32
Reformat after merge.
Pijukatel Nov 26, 2024
bbc27af
Fix CI reported issues on previous Python versions
Pijukatel Nov 26, 2024
7567164
Update docstrings in child crawlers to not repeat text after parent.
Pijukatel Nov 26, 2024
9335967
Revert incorrect docstring update.
Pijukatel Nov 26, 2024
b4877cb
Review comments
Pijukatel Nov 26, 2024
2929be1
Reverted back name change in doc strings.
Pijukatel Nov 26, 2024
19bc041
Fix CI reported issues.
Pijukatel Nov 26, 2024
fe19345
Fix incorrectly name BS argument
Pijukatel Nov 26, 2024
6ab5a09
Changes by Honza
Pijukatel Nov 27, 2024
2af695b
Polish proposed changes,
Pijukatel Nov 27, 2024
0b0f4ce
Review comments
Pijukatel Nov 27, 2024
03832fb
Review commnets about interl imports in docs
Pijukatel Nov 27, 2024
005c7cf
Extract is_matching_selector from Parser and put
Pijukatel Nov 27, 2024
fc2de60
Update src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_cont…
Pijukatel Nov 28, 2024
578cdc0
Update src/crawlee/http_crawler/_http_crawler.py
Pijukatel Nov 28, 2024
280cecb
Update src/crawlee/parsel_crawler/_parsel_crawling_context.py
Pijukatel Nov 28, 2024
a88a5e4
Review comments.
Pijukatel Nov 28, 2024
b1c0fad
Use correctly BeautifulSoupParser type
Pijukatel Nov 28, 2024
4e3fbd5
Add doc page describing new classes.
Pijukatel Nov 28, 2024
9fc66d8
Update docs more
Pijukatel Nov 28, 2024
434bd6b
Apply suggestions from code review
Pijukatel Nov 29, 2024
18562de
Review comments.
Pijukatel Nov 29, 2024
b9255be
More review comments
Pijukatel Nov 29, 2024
d70e8a8
Update docs names
Pijukatel Nov 29, 2024
3e87db5
Update docs/guides/static_content_crawlers.mdx
Pijukatel Dec 3, 2024
460e1ac
Review comments.
Pijukatel Dec 3, 2024
8c4ec82
Review comments
Pijukatel Dec 3, 2024
e7c7817
Apply suggestions from code review
Pijukatel Dec 3, 2024
05cec1a
Rename StaticCOntentCrawler to AbstractContentCrawler and related fil…
Pijukatel Dec 3, 2024
bed215e
Renaming to AbstractHttpCrawler 2
Pijukatel Dec 3, 2024
c43b564
Renaming to AbstractHttpCrawler 2
Pijukatel Dec 3, 2024
a1db9e2
Apply suggestions from code review
Pijukatel Dec 3, 2024
fae917e
Review comments
Pijukatel Dec 3, 2024
b563bf9
Expand docs by short description of how to create your own HTTPbase c…
Pijukatel Dec 3, 2024
89a8e83
Update src/crawlee/abstract_http_crawler/_abstract_http_crawler.py
Pijukatel Dec 3, 2024
139b21b
Update src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py
Pijukatel Dec 4, 2024
bd7846f
Apply suggestions from code review
Pijukatel Dec 4, 2024
454f9ec
Review comments
Pijukatel Dec 4, 2024
6bba552
Move BlockedInfo to its own file.
Pijukatel Dec 4, 2024
24 changes: 24 additions & 0 deletions docs/guides/static_content_crawlers.mdx
@@ -0,0 +1,24 @@
---
id: static-content-crawlers
title: Static content crawlers
description: Crawlee supports multiple static content crawlers that can be used to extract data from server-rendered webpages.
---

import ApiLink from '@site/src/components/ApiLink';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import CodeBlock from '@theme/CodeBlock';

The generic class <ApiLink to="class/StaticContentCrawler">`StaticContentCrawler`</ApiLink> is the parent of <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>, <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> and <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink>, and it can also serve as the parent of your own crawler with custom content parsing requirements.

It already includes almost all the functionality needed to crawl webpages. The only missing parts are the parser that should be used to parse HTTP responses and the context object that defines what will be available to user handler functions.
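
As an illustration only, here is a minimal sketch of what a custom crawler built on this hierarchy could look like. It is not part of this pull request: the `JsonParser` and `JsonCrawler` names are hypothetical, and the `StaticContentParser` method signatures and pipeline helper are assumed from the `BeautifulSoupParser` and `BeautifulSoupCrawler` implementations shown in this diff.

```python
from __future__ import annotations

import json
from typing import Any

from crawlee.http_clients import HttpResponse
from crawlee.static_content_crawler import ParsedHttpCrawlingContext, StaticContentCrawler
from crawlee.static_content_crawler._static_content_parser import StaticContentParser


class JsonParser(StaticContentParser[Any]):
    """Hypothetical parser that exposes the HTTP response body as parsed JSON."""

    async def parse(self, response: HttpResponse) -> Any:
        return json.loads(response.read())

    def is_matching_selector(self, parsed_content: Any, selector: str) -> bool:
        # JSON has no CSS selectors, so blocked-content detection is a no-op here.
        return False

    def find_links(self, parsed_content: Any, selector: str) -> list[str]:
        # Nothing to enqueue from a JSON document.
        return []


class JsonCrawler(StaticContentCrawler[ParsedHttpCrawlingContext[Any], Any]):
    """Hypothetical crawler that reuses the generic pipeline with JsonParser."""

    def __init__(self, **kwargs: Any) -> None:
        # Mirrors how BeautifulSoupCrawler wires the shared context pipeline.
        kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline()
        super().__init__(parser=JsonParser(), **kwargs)
```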

## `BeautifulSoupCrawler`
<ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> uses <ApiLink to="class/BeautifulSoupParser">`BeautifulSoupParser`</ApiLink> to parse http response and this is available in <ApiLink to="class/BeautifulSoupCrawlingContext">`BeautifulSoupCrawlingContext`</ApiLink> in .soup or .parsed_content.

## `ParselCrawler`
<ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> uses <ApiLink to="class/ParselParser">`ParselParser`</ApiLink> to parse http response and this is available in <ApiLink to="class/ParselCrawlingContext">`ParselCrawlingContext`</ApiLink> in .selector or .parsed_content.

## `HttpCrawler`
<ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink> uses <ApiLink to="class/NoParser">`NoParser`</ApiLink> that does not parse http response at all and is to be used if no parsing is required.

5 changes: 3 additions & 2 deletions src/crawlee/beautifulsoup_crawler/__init__.py
@@ -1,10 +1,11 @@
try:
from ._beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupParser
from ._beautifulsoup_crawler import BeautifulSoupCrawler
from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
from ._beautifulsoup_parser import BeautifulSoupParserType
except ImportError as exc:
raise ImportError(
"To import anything from this subpackage, you need to install the 'beautifulsoup' extra."
"For example, if you use pip, run `pip install 'crawlee[beautifulsoup]'`.",
) from exc

__all__ = ['BeautifulSoupCrawler', 'BeautifulSoupCrawlingContext', 'BeautifulSoupParser']
__all__ = ['BeautifulSoupCrawler', 'BeautifulSoupCrawlingContext', 'BeautifulSoupParserType']
205 changes: 23 additions & 182 deletions src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py
@@ -1,41 +1,29 @@
from __future__ import annotations

import asyncio
import logging
from typing import TYPE_CHECKING, Any, Literal
from typing import TYPE_CHECKING

from bs4 import BeautifulSoup, Tag
from pydantic import ValidationError
from bs4 import BeautifulSoup

from crawlee import EnqueueStrategy
from crawlee._request import BaseRequestData
from crawlee._utils.blocked import RETRY_CSS_SELECTORS
from crawlee._utils.docs import docs_group
from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, ContextPipeline
from crawlee.beautifulsoup_crawler._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
from crawlee.errors import SessionError
from crawlee.http_clients import HttpxHttpClient
from crawlee.http_crawler import HttpCrawlingContext
from crawlee.beautifulsoup_crawler._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType
from crawlee.static_content_crawler import StaticContentCrawler, StaticContentCrawlerOptions

from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext

if TYPE_CHECKING:
from collections.abc import AsyncGenerator, Iterable
from collections.abc import AsyncGenerator

from typing_extensions import Unpack

from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs

BeautifulSoupParser = Literal['html.parser', 'lxml', 'xml', 'html5lib']
from crawlee.static_content_crawler import ParsedHttpCrawlingContext


@docs_group('Classes')
class BeautifulSoupCrawler(BasicCrawler[BeautifulSoupCrawlingContext]):
class BeautifulSoupCrawler(StaticContentCrawler[BeautifulSoupCrawlingContext, BeautifulSoup]):
"""A web crawler for performing HTTP requests and parsing HTML/XML content.

The `BeautifulSoupCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features.
On top of that it implements the HTTP communication using the HTTP clients and HTML/XML parsing using the
`BeautifulSoup` library. The class allows integration with any HTTP client that implements the `BaseHttpClient`
interface. The HTTP client is provided to the crawler as an input parameter to the constructor.
The `BeautifulSoupCrawler` builds on top of the `StaticContentCrawler`, which means it inherits all of its features.
It specifies its own parser, `BeautifulSoupParser`, which is used to parse `HttpResponse`.

The HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. However,
if you need to execute client-side JavaScript, consider using browser-based crawler like the `PlaywrightCrawler`.
@@ -68,172 +56,25 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
def __init__(
self,
*,
parser: BeautifulSoupParser = 'lxml',
additional_http_error_status_codes: Iterable[int] = (),
ignore_http_error_status_codes: Iterable[int] = (),
**kwargs: Unpack[BasicCrawlerOptions[BeautifulSoupCrawlingContext]],
parser: BeautifulSoupParserType = 'lxml',
**kwargs: Unpack[StaticContentCrawlerOptions[BeautifulSoupCrawlingContext]],
) -> None:
"""A default constructor.

Args:
parser: The type of parser that should be used by `BeautifulSoup`.
additional_http_error_status_codes: Additional HTTP status codes to treat as errors, triggering
automatic retries when encountered.
ignore_http_error_status_codes: HTTP status codes typically considered errors but to be treated
as successful responses.
kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
kwargs: Additional keyword arguments to pass to the underlying `StaticContentCrawler`.
"""
self._parser = parser

kwargs['_context_pipeline'] = (
ContextPipeline()
.compose(self._make_http_request)
.compose(self._parse_http_response)
.compose(self._handle_blocked_request)
)

kwargs.setdefault(
'http_client',
HttpxHttpClient(
additional_http_error_status_codes=additional_http_error_status_codes,
ignore_http_error_status_codes=ignore_http_error_status_codes,
),
)

kwargs.setdefault('_logger', logging.getLogger(__name__))

super().__init__(**kwargs)
async def final_step(
context: ParsedHttpCrawlingContext[BeautifulSoup],
) -> AsyncGenerator[BeautifulSoupCrawlingContext, None]:
"""Enhance ParsedHttpCrawlingContext[BeautifulSoup] with soup property."""
yield BeautifulSoupCrawlingContext.from_parsed_http_crawling_context(context)

async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:
"""Executes an HTTP request using a configured HTTP client.
kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline().compose(final_step)

Args:
context: The crawling context from the `BasicCrawler`.

Yields:
The enhanced crawling context with the HTTP response.
"""
result = await self._http_client.crawl(
request=context.request,
session=context.session,
proxy_info=context.proxy_info,
statistics=self._statistics,
)

yield HttpCrawlingContext(
request=context.request,
session=context.session,
proxy_info=context.proxy_info,
add_requests=context.add_requests,
send_request=context.send_request,
push_data=context.push_data,
get_key_value_store=context.get_key_value_store,
log=context.log,
http_response=result.http_response,
)

async def _handle_blocked_request(
self,
context: BeautifulSoupCrawlingContext,
) -> AsyncGenerator[BeautifulSoupCrawlingContext, None]:
"""Try to detect if the request is blocked based on the HTTP status code or the response content.

Args:
context: The current crawling context.

Raises:
SessionError: If the request is considered blocked.

Yields:
The original crawling context if no errors are detected.
"""
if self._retry_on_blocked:
status_code = context.http_response.status_code

# TODO: refactor to avoid private member access
# https://github.com/apify/crawlee-python/issues/708
if (
context.session
and status_code not in self._http_client._ignore_http_error_status_codes # noqa: SLF001
and context.session.is_blocked_status_code(status_code=status_code)
):
raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')

matched_selectors = [
selector for selector in RETRY_CSS_SELECTORS if context.soup.select_one(selector) is not None
]

if matched_selectors:
raise SessionError(
'Assuming the session is blocked - '
f"HTTP response matched the following selectors: {'; '.join(matched_selectors)}"
)

yield context

async def _parse_http_response(
self,
context: HttpCrawlingContext,
) -> AsyncGenerator[BeautifulSoupCrawlingContext, None]:
"""Parse the HTTP response using the `BeautifulSoup` library and implements the `enqueue_links` function.

Args:
context: The current crawling context.

Yields:
The enhanced crawling context with the `BeautifulSoup` selector and the `enqueue_links` function.
"""
soup = await asyncio.to_thread(lambda: BeautifulSoup(context.http_response.read(), self._parser))

async def enqueue_links(
*,
selector: str = 'a',
label: str | None = None,
user_data: dict[str, Any] | None = None,
**kwargs: Unpack[EnqueueLinksKwargs],
) -> None:
kwargs.setdefault('strategy', EnqueueStrategy.SAME_HOSTNAME)

requests = list[BaseRequestData]()
user_data = user_data or {}

link: Tag
for link in soup.select(selector):
link_user_data = user_data

if label is not None:
link_user_data.setdefault('label', label)

if (url := link.attrs.get('href')) is not None:
url = url.strip()

if not is_url_absolute(url):
url = convert_to_absolute_url(context.request.url, url)

try:
request = BaseRequestData.from_url(url, user_data=link_user_data)
except ValidationError as exc:
context.log.debug(
f'Skipping URL "{url}" due to invalid format: {exc}. '
'This may be caused by a malformed URL or unsupported URL scheme. '
'Please ensure the URL is correct and retry.'
)
continue

requests.append(request)

await context.add_requests(requests, **kwargs)

yield BeautifulSoupCrawlingContext(
request=context.request,
session=context.session,
proxy_info=context.proxy_info,
enqueue_links=enqueue_links,
add_requests=context.add_requests,
send_request=context.send_request,
push_data=context.push_data,
get_key_value_store=context.get_key_value_store,
log=context.log,
http_response=context.http_response,
soup=soup,
super().__init__(
parser=BeautifulSoupParser(parser=parser),
**kwargs,
)
src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py
@@ -1,26 +1,21 @@
from __future__ import annotations
from dataclasses import dataclass, fields

from dataclasses import dataclass
from typing import TYPE_CHECKING
from bs4 import BeautifulSoup
from typing_extensions import Self

from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction
from crawlee._utils.docs import docs_group
from crawlee.http_crawler import HttpCrawlingResult

if TYPE_CHECKING:
from bs4 import BeautifulSoup
from crawlee.static_content_crawler._static_crawling_context import ParsedHttpCrawlingContext


@dataclass(frozen=True)
@docs_group('Data structures')
class BeautifulSoupCrawlingContext(HttpCrawlingResult, BasicCrawlingContext):
"""The crawling context used by the `BeautifulSoupCrawler`.

It provides access to key objects as well as utility functions for handling crawling tasks.
"""

soup: BeautifulSoup
"""The `BeautifulSoup` object for the current page."""

enqueue_links: EnqueueLinksFunction
"""The BeautifulSoup `EnqueueLinksFunction` implementation."""
class BeautifulSoupCrawlingContext(ParsedHttpCrawlingContext[BeautifulSoup]):
@property
def soup(self) -> BeautifulSoup:
"""Convenience alias."""
return self.parsed_content

@classmethod
def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[BeautifulSoup]) -> Self:
"""Convenience constructor that creates new context from existing ParsedHttpCrawlingContext[BeautifulSoup]."""
return cls(**{field.name: getattr(context, field.name) for field in fields(context)})
41 changes: 41 additions & 0 deletions src/crawlee/beautifulsoup_crawler/_beautifulsoup_parser.py
@@ -0,0 +1,41 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Literal

from bs4 import BeautifulSoup, Tag
from typing_extensions import override

from crawlee.static_content_crawler._static_content_parser import StaticContentParser

if TYPE_CHECKING:
from collections.abc import Iterable

from crawlee.http_clients import HttpResponse


class BeautifulSoupParser(StaticContentParser[BeautifulSoup]):
"""Parser for parsing http response using BeautifulSoup."""

def __init__(self, parser: BeautifulSoupParserType = 'lxml') -> None:
self._parser = parser

@override
async def parse(self, response: HttpResponse) -> BeautifulSoup:
return BeautifulSoup(response.read(), features=self._parser)

@override
def is_matching_selector(self, parsed_content: BeautifulSoup, selector: str) -> bool:
return parsed_content.select_one(selector) is not None

@override
def find_links(self, parsed_content: BeautifulSoup, selector: str) -> Iterable[str]:
link: Tag
urls: list[str] = []
for link in parsed_content.select(selector):
url = link.attrs.get('href')
if url:
urls.append(url.strip())
return urls


BeautifulSoupParserType = Literal['html.parser', 'lxml', 'xml', 'html5lib']
2 changes: 1 addition & 1 deletion src/crawlee/http_clients/_base.py
@@ -41,7 +41,7 @@ def read(self) -> bytes:
@dataclass(frozen=True)
@docs_group('Data structures')
class HttpCrawlingResult:
"""Result of a HTTP-only crawl.
"""Result of an HTTP-only crawl.

Mainly for the purpose of composing specific crawling contexts (e.g. `BeautifulSoupCrawlingContext`,
`ParselCrawlingContext`, ...).
10 changes: 8 additions & 2 deletions src/crawlee/http_crawler/__init__.py
@@ -1,4 +1,10 @@
from crawlee.http_clients import HttpCrawlingResult
from crawlee.static_content_crawler._static_crawling_context import HttpCrawlingContext

from ._http_crawler import HttpCrawler
from ._http_crawling_context import HttpCrawlingContext, HttpCrawlingResult

__all__ = ['HttpCrawler', 'HttpCrawlingContext', 'HttpCrawlingResult']
__all__ = [
'HttpCrawler',
'HttpCrawlingContext',
'HttpCrawlingResult',
]