diff --git a/docs/guides/http_crawlers.mdx b/docs/guides/http_crawlers.mdx
new file mode 100644
index 0000000000..c8479699ba
--- /dev/null
+++ b/docs/guides/http_crawlers.mdx
@@ -0,0 +1,35 @@
+---
+id: http-crawlers
+title: HTTP crawlers
+description: Crawlee supports multiple HTTP crawlers that can be used to extract data from server-rendered webpages.
+---
+
+import ApiLink from '@site/src/components/ApiLink';
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+import CodeBlock from '@theme/CodeBlock';
+
+The generic class `AbstractHttpCrawler` is the parent of `BeautifulSoupCrawler`, `ParselCrawler` and `HttpCrawler`, and it can also serve as the parent of your own crawler with custom content-parsing requirements.
+
+It already includes almost all the functionality needed to crawl webpages. The only missing parts are the parser that should be used to parse HTTP responses and a context dataclass that defines which context helpers are available to user handler functions.
+
+## `BeautifulSoupCrawler`
+`BeautifulSoupCrawler` uses `BeautifulSoupParser` to parse the HTTP response and makes the result available in `BeautifulSoupCrawlingContext` through the `.soup` or `.parsed_content` attribute.
+
+## `ParselCrawler`
+`ParselCrawler` uses `ParselParser` to parse the HTTP response and makes the result available in `ParselCrawlingContext` through the `.selector` or `.parsed_content` attribute.
+
+## `HttpCrawler`
+`HttpCrawler` uses `NoParser`, which does not parse the HTTP response at all. Use it when no parsing is required.
+
+## Creating your own HTTP crawler
+### Why?
+When you want to use a custom parser for HTTP responses while the rest of the `AbstractHttpCrawler` functionality suits your needs.
+
+### How?
+You need to define at least two new classes and decide which type the parser's `parse` method will return.
+The parser will inherit from `AbstractHttpParser` and needs to implement all of its abstract methods.
+The crawler will inherit from `AbstractHttpCrawler` and needs to implement all of its abstract methods.
+The newly defined parser is then passed to the `parser` argument of the `AbstractHttpCrawler.__init__` method.
+
+To get a better idea, see one of our own HTTP-based crawlers mentioned above as an example.
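To make the "Creating your own HTTP crawler" section above concrete, here is a minimal, hypothetical sketch of the two classes it describes. It is not part of the patch: the `JsonParser` and `JsonCrawler` names and the JSON use case are illustrative assumptions, while the base classes, the `parser` argument, and the `_context_pipeline` wiring mirror how `HttpCrawler` and `ParselCrawler` are implemented later in this diff.

```python
from __future__ import annotations

import json
from typing import TYPE_CHECKING

from typing_extensions import override

from crawlee.abstract_http_crawler import (
    AbstractHttpCrawler,
    AbstractHttpParser,
    HttpCrawlerOptions,
    ParsedHttpCrawlingContext,
)

if TYPE_CHECKING:
    from collections.abc import Iterable

    from typing_extensions import Unpack

    from crawlee.http_clients import HttpResponse


class JsonParser(AbstractHttpParser[dict]):
    """Hypothetical parser that decodes the HTTP response body as JSON."""

    @override
    async def parse(self, response: HttpResponse) -> dict:
        return json.loads(response.read())

    @override
    def is_matching_selector(self, parsed_content: dict, selector: str) -> bool:
        # JSON carries no CSS selectors, so the default selector-based blockage detection never matches.
        return False

    @override
    def find_links(self, parsed_content: dict, selector: str) -> Iterable[str]:
        # Illustrative choice: treat the selector as a top-level key holding a list of URLs.
        value = parsed_content.get(selector, [])
        return [url for url in value if isinstance(url, str)] if isinstance(value, list) else []


class JsonCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[dict], dict]):
    """Hypothetical crawler that keeps the generic `ParsedHttpCrawlingContext[dict]` as its user-facing context."""

    def __init__(self, **kwargs: Unpack[HttpCrawlerOptions[ParsedHttpCrawlingContext[dict]]]) -> None:
        # Reuse the ready-made pipeline, the same way `HttpCrawler` does in this patch.
        kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline()
        super().__init__(parser=JsonParser(), **kwargs)
```

A request handler registered on such a crawler would receive a `ParsedHttpCrawlingContext[dict]` whose `parsed_content` is the decoded JSON, analogous to how `ParselCrawler` exposes a `Selector`.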
diff --git a/src/crawlee/abstract_http_crawler/__init__.py b/src/crawlee/abstract_http_crawler/__init__.py
new file mode 100644
index 0000000000..85e3c3b0b7
--- /dev/null
+++ b/src/crawlee/abstract_http_crawler/__init__.py
@@ -0,0 +1,10 @@
+from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
+from ._abstract_http_parser import AbstractHttpParser
+from ._http_crawling_context import ParsedHttpCrawlingContext
+
+__all__ = [
+    'AbstractHttpCrawler',
+    'AbstractHttpParser',
+    'HttpCrawlerOptions',
+    'ParsedHttpCrawlingContext',
+]
diff --git a/src/crawlee/abstract_http_crawler/_abstract_http_crawler.py b/src/crawlee/abstract_http_crawler/_abstract_http_crawler.py
new file mode 100644
index 0000000000..d5cf1fb3ad
--- /dev/null
+++ b/src/crawlee/abstract_http_crawler/_abstract_http_crawler.py
@@ -0,0 +1,209 @@
+from __future__ import annotations
+
+import logging
+from abc import ABC
+from typing import TYPE_CHECKING, Any, Generic
+
+from pydantic import ValidationError
+from typing_extensions import NotRequired, TypeVar
+
+from crawlee import EnqueueStrategy
+from crawlee._request import BaseRequestData
+from crawlee._utils.docs import docs_group
+from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
+from crawlee.abstract_http_crawler._http_crawling_context import (
+    HttpCrawlingContext,
+    ParsedHttpCrawlingContext,
+    TParseResult,
+)
+from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, ContextPipeline
+from crawlee.errors import SessionError
+from crawlee.http_clients import HttpxHttpClient
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncGenerator, Iterable
+
+    from typing_extensions import Unpack
+
+    from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, EnqueueLinksKwargs
+    from crawlee.abstract_http_crawler._abstract_http_parser import AbstractHttpParser
+
+TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
+
+
+@docs_group('Data structures')
+class HttpCrawlerOptions(Generic[TCrawlingContext], BasicCrawlerOptions[TCrawlingContext]):
+    """Arguments for the `AbstractHttpCrawler` constructor.
+
+    It is intended for typing forwarded `__init__` arguments in the subclasses.
+    """
+
+    additional_http_error_status_codes: NotRequired[Iterable[int]]
+    """Additional HTTP status codes to treat as errors, triggering automatic retries when encountered."""
+
+    ignore_http_error_status_codes: NotRequired[Iterable[int]]
+    """HTTP status codes typically considered errors but to be treated as successful responses."""
+
+
+@docs_group('Abstract classes')
+class AbstractHttpCrawler(Generic[TCrawlingContext, TParseResult], BasicCrawler[TCrawlingContext], ABC):
+    """A web crawler for performing HTTP requests.
+
+    The `AbstractHttpCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features. On top
+    of that it implements the HTTP communication using the HTTP clients. The class allows integration with
+    any HTTP client that implements the `BaseHttpClient` interface. The HTTP client is provided to the crawler
+    as an input parameter to the constructor.
+    `AbstractHttpCrawler` is a generic class. It is expected to be used together with a specific parser that parses
+    the HTTP response and with the expected type of `TCrawlingContext`, which is made available to the user handler
+    function. See its prepared specific versions, `BeautifulSoupCrawler`, `ParselCrawler` or `HttpCrawler`, for examples.
+
+    The HTTP client-based crawlers are ideal for websites that do not require JavaScript execution.
However, + if you need to execute client-side JavaScript, consider using a browser-based crawler like the `PlaywrightCrawler`. + """ + + def __init__( + self, + *, + parser: AbstractHttpParser[TParseResult], + additional_http_error_status_codes: Iterable[int] = (), + ignore_http_error_status_codes: Iterable[int] = (), + **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext]], + ) -> None: + self._parser = parser + + kwargs.setdefault( + 'http_client', + HttpxHttpClient( + additional_http_error_status_codes=additional_http_error_status_codes, + ignore_http_error_status_codes=ignore_http_error_status_codes, + ), + ) + + if '_context_pipeline' not in kwargs: + raise ValueError( + 'Please pass in a `_context_pipeline`. You should use the ' + 'AbstractHttpCrawler._create_static_content_crawler_pipeline() method to initialize it.' + ) + + kwargs.setdefault('_logger', logging.getLogger(__name__)) + super().__init__(**kwargs) + + def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpCrawlingContext[TParseResult]]: + """Create static content crawler context pipeline with expected pipeline steps.""" + return ( + ContextPipeline() + .compose(self._make_http_request) + .compose(self._parse_http_response) + .compose(self._handle_blocked_request) + ) + + async def _parse_http_response( + self, context: HttpCrawlingContext + ) -> AsyncGenerator[ParsedHttpCrawlingContext[TParseResult], None]: + """Parse HTTP response and create context enhanced by the parsing result and enqueue links function. + + Args: + context: The current crawling context, that includes HTTP response. + + Yields: + The original crawling context enhanced by the parsing result and enqueue links function. + """ + parsed_content = await self._parser.parse(context.http_response) + yield ParsedHttpCrawlingContext.from_http_crawling_context( + context=context, + parsed_content=parsed_content, + enqueue_links=self._create_enqueue_links_function(context, parsed_content), + ) + + def _create_enqueue_links_function( + self, context: HttpCrawlingContext, parsed_content: TParseResult + ) -> EnqueueLinksFunction: + """Create a callback function for extracting links from parsed content and enqueuing them to the crawl. + + Args: + context: The current crawling context. + parsed_content: The parsed http response. + + Returns: + Awaitable that is used for extracting links from parsed content and enqueuing them to the crawl. + """ + + async def enqueue_links( + *, + selector: str = 'a', + label: str | None = None, + user_data: dict[str, Any] | None = None, + **kwargs: Unpack[EnqueueLinksKwargs], + ) -> None: + kwargs.setdefault('strategy', EnqueueStrategy.SAME_HOSTNAME) + + requests = list[BaseRequestData]() + user_data = user_data or {} + if label is not None: + user_data.setdefault('label', label) + for link in self._parser.find_links(parsed_content, selector=selector): + url = link + if not is_url_absolute(url): + url = convert_to_absolute_url(context.request.url, url) + try: + request = BaseRequestData.from_url(url, user_data=user_data) + except ValidationError as exc: + context.log.debug( + f'Skipping URL "{url}" due to invalid format: {exc}. ' + 'This may be caused by a malformed URL or unsupported URL scheme. ' + 'Please ensure the URL is correct and retry.' 
+ ) + continue + + requests.append(request) + + await context.add_requests(requests, **kwargs) + + return enqueue_links + + async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]: + """Make http request and create context enhanced by HTTP response. + + Args: + context: The current crawling context. + + Yields: + The original crawling context enhanced by HTTP response. + """ + result = await self._http_client.crawl( + request=context.request, + session=context.session, + proxy_info=context.proxy_info, + statistics=self._statistics, + ) + + yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response) + + async def _handle_blocked_request( + self, context: ParsedHttpCrawlingContext[TParseResult] + ) -> AsyncGenerator[ParsedHttpCrawlingContext[TParseResult], None]: + """Try to detect if the request is blocked based on the HTTP status code or the parsed response content. + + Args: + context: The current crawling context. + + Raises: + SessionError: If the request is considered blocked. + + Yields: + The original crawling context if no errors are detected. + """ + if self._retry_on_blocked: + status_code = context.http_response.status_code + + # TODO: refactor to avoid private member access + # https://github.com/apify/crawlee-python/issues/708 + if ( + context.session + and status_code not in self._http_client._ignore_http_error_status_codes # noqa: SLF001 + and context.session.is_blocked_status_code(status_code=status_code) + ): + raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}') + if blocked_info := self._parser.is_blocked(context.parsed_content): + raise SessionError(blocked_info.reason) + yield context diff --git a/src/crawlee/abstract_http_crawler/_abstract_http_parser.py b/src/crawlee/abstract_http_crawler/_abstract_http_parser.py new file mode 100644 index 0000000000..31e31a4b57 --- /dev/null +++ b/src/crawlee/abstract_http_crawler/_abstract_http_parser.py @@ -0,0 +1,81 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Generic + +from crawlee._utils.blocked import RETRY_CSS_SELECTORS +from crawlee._utils.docs import docs_group +from crawlee.abstract_http_crawler._http_crawling_context import TParseResult +from crawlee.basic_crawler import BlockedInfo + +if TYPE_CHECKING: + from collections.abc import Iterable + + from crawlee.http_clients import HttpResponse + + +@docs_group('Abstract classes') +class AbstractHttpParser(Generic[TParseResult], ABC): + """Parser used for parsing http response and inspecting parsed result to find links or detect blocking.""" + + @abstractmethod + async def parse(self, response: HttpResponse) -> TParseResult: + """Parse http response. + + Args: + response: HTTP response to be parsed. + + Returns: + Parsed HTTP response. + """ + + def is_blocked(self, parsed_content: TParseResult) -> BlockedInfo: + """Detect if blocked and return BlockedInfo with additional information. + + Default implementation that expects `is_matching_selector` abstract method to be implemented. + Override this method if your parser has different way of blockage detection. + + Args: + parsed_content: Parsed HTTP response. Result of `parse` method. + + Returns: + `BlockedInfo` object that contains non-empty string description of reason if blockage was detected. Empty + string in reason signifies no blockage detected. 
+ """ + reason = '' + if parsed_content is not None: + matched_selectors = [ + selector for selector in RETRY_CSS_SELECTORS if self.is_matching_selector(parsed_content, selector) + ] + + if matched_selectors: + reason = ( + f"Assuming the session is blocked - HTTP response matched the following selectors: " + f"{'; '.join(matched_selectors)}" + ) + + return BlockedInfo(reason=reason) + + @abstractmethod + def is_matching_selector(self, parsed_content: TParseResult, selector: str) -> bool: + """Find if selector has match in parsed content. + + Args: + parsed_content: Parsed HTTP response. Result of `parse` method. + selector: String used to define matching pattern. + + Returns: + True if selector has match in parsed content. + """ + + @abstractmethod + def find_links(self, parsed_content: TParseResult, selector: str) -> Iterable[str]: + """Find all links in result using selector. + + Args: + parsed_content: Parsed HTTP response. Result of `parse` method. + selector: String used to define matching pattern for finding links. + + Returns: + Iterable of strings that contain found links. + """ diff --git a/src/crawlee/abstract_http_crawler/_http_crawling_context.py b/src/crawlee/abstract_http_crawler/_http_crawling_context.py new file mode 100644 index 0000000000..7475b85eb3 --- /dev/null +++ b/src/crawlee/abstract_http_crawler/_http_crawling_context.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from dataclasses import dataclass, fields +from typing import Generic + +from typing_extensions import Self, TypeVar + +from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction +from crawlee._utils.docs import docs_group +from crawlee.http_clients import HttpCrawlingResult, HttpResponse + +TParseResult = TypeVar('TParseResult') + + +@dataclass(frozen=True) +@docs_group('Data structures') +class HttpCrawlingContext(BasicCrawlingContext, HttpCrawlingResult): + """The crawling context used by the `AbstractHttpCrawler`.""" + + @classmethod + def from_basic_crawling_context(cls, context: BasicCrawlingContext, http_response: HttpResponse) -> Self: + """Convenience constructor that creates `HttpCrawlingContext` from existing `BasicCrawlingContext`.""" + context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)} + return cls(http_response=http_response, **context_kwargs) + + +@dataclass(frozen=True) +@docs_group('Data structures') +class ParsedHttpCrawlingContext(Generic[TParseResult], HttpCrawlingContext): + """The crawling context used by `AbstractHttpCrawler`. + + It provides access to key objects as well as utility functions for handling crawling tasks. 
+ """ + + parsed_content: TParseResult + enqueue_links: EnqueueLinksFunction + + @classmethod + def from_http_crawling_context( + cls, context: HttpCrawlingContext, parsed_content: TParseResult, enqueue_links: EnqueueLinksFunction + ) -> Self: + """Convenience constructor that creates new context from existing HttpCrawlingContext.""" + context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)} + return cls(parsed_content=parsed_content, enqueue_links=enqueue_links, **context_kwargs) diff --git a/src/crawlee/basic_crawler/__init__.py b/src/crawlee/basic_crawler/__init__.py index fb126330ba..adfc6be5ee 100644 --- a/src/crawlee/basic_crawler/__init__.py +++ b/src/crawlee/basic_crawler/__init__.py @@ -1,6 +1,7 @@ from crawlee._types import BasicCrawlingContext from ._basic_crawler import BasicCrawler, BasicCrawlerOptions +from ._blocked_info import BlockedInfo from ._context_pipeline import ContextPipeline -__all__ = ['BasicCrawler', 'BasicCrawlerOptions', 'BasicCrawlingContext', 'ContextPipeline'] +__all__ = ['BasicCrawler', 'BasicCrawlerOptions', 'BasicCrawlingContext', 'BlockedInfo', 'ContextPipeline'] diff --git a/src/crawlee/basic_crawler/_blocked_info.py b/src/crawlee/basic_crawler/_blocked_info.py new file mode 100644 index 0000000000..aee46d21ad --- /dev/null +++ b/src/crawlee/basic_crawler/_blocked_info.py @@ -0,0 +1,17 @@ +from __future__ import annotations + +from dataclasses import dataclass + +from crawlee._utils.docs import docs_group + + +@docs_group('Data structures') +@dataclass(frozen=True) +class BlockedInfo: + """Information about whether the crawling is blocked. If reason is empty, then it means it is not blocked.""" + + reason: str + + def __bool__(self) -> bool: + """No reason means no blocking.""" + return bool(self.reason) diff --git a/src/crawlee/beautifulsoup_crawler/__init__.py b/src/crawlee/beautifulsoup_crawler/__init__.py index 58a8e98deb..59b0264cc1 100644 --- a/src/crawlee/beautifulsoup_crawler/__init__.py +++ b/src/crawlee/beautifulsoup_crawler/__init__.py @@ -1,10 +1,11 @@ try: - from ._beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupParser + from ._beautifulsoup_crawler import BeautifulSoupCrawler from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext + from ._beautifulsoup_parser import BeautifulSoupParserType except ImportError as exc: raise ImportError( "To import anything from this subpackage, you need to install the 'beautifulsoup' extra." 
"For example, if you use pip, run `pip install 'crawlee[beautifulsoup]'`.", ) from exc -__all__ = ['BeautifulSoupCrawler', 'BeautifulSoupCrawlingContext', 'BeautifulSoupParser'] +__all__ = ['BeautifulSoupCrawler', 'BeautifulSoupCrawlingContext', 'BeautifulSoupParserType'] diff --git a/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py index f3d414ca2d..d4e0787502 100644 --- a/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py +++ b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py @@ -1,41 +1,30 @@ from __future__ import annotations -import asyncio -import logging -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING -from bs4 import BeautifulSoup, Tag -from pydantic import ValidationError +from bs4 import BeautifulSoup -from crawlee import EnqueueStrategy -from crawlee._request import BaseRequestData -from crawlee._utils.blocked import RETRY_CSS_SELECTORS from crawlee._utils.docs import docs_group -from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute -from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, ContextPipeline -from crawlee.beautifulsoup_crawler._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext -from crawlee.errors import SessionError -from crawlee.http_clients import HttpxHttpClient -from crawlee.http_crawler import HttpCrawlingContext +from crawlee.abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions +from crawlee.beautifulsoup_crawler._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType + +from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext if TYPE_CHECKING: - from collections.abc import AsyncGenerator, Iterable + from collections.abc import AsyncGenerator from typing_extensions import Unpack - from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs - -BeautifulSoupParser = Literal['html.parser', 'lxml', 'xml', 'html5lib'] + from crawlee.abstract_http_crawler import ParsedHttpCrawlingContext @docs_group('Classes') -class BeautifulSoupCrawler(BasicCrawler[BeautifulSoupCrawlingContext]): +class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, BeautifulSoup]): """A web crawler for performing HTTP requests and parsing HTML/XML content. - The `BeautifulSoupCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features. - On top of that it implements the HTTP communication using the HTTP clients and HTML/XML parsing using the - `BeautifulSoup` library. The class allows integration with any HTTP client that implements the `BaseHttpClient` - interface. The HTTP client is provided to the crawler as an input parameter to the constructor. + The `BeautifulSoupCrawler` builds on top of the `AbstractHttpCrawler`, which means it inherits all of its features. + It specifies its own parser `BeautifulSoupParser` which is used to parse `HttpResponse`. + `BeautifulSoupParser` uses following library for parsing: https://pypi.org/project/beautifulsoup4/ The HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. However, if you need to execute client-side JavaScript, consider using browser-based crawler like the `PlaywrightCrawler`. 
@@ -68,172 +57,25 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: def __init__( self, *, - parser: BeautifulSoupParser = 'lxml', - additional_http_error_status_codes: Iterable[int] = (), - ignore_http_error_status_codes: Iterable[int] = (), - **kwargs: Unpack[BasicCrawlerOptions[BeautifulSoupCrawlingContext]], + parser: BeautifulSoupParserType = 'lxml', + **kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]], ) -> None: """A default constructor. Args: parser: The type of parser that should be used by `BeautifulSoup`. - additional_http_error_status_codes: Additional HTTP status codes to treat as errors, triggering - automatic retries when encountered. - ignore_http_error_status_codes: HTTP status codes typically considered errors but to be treated - as successful responses. - kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`. + kwargs: Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`. """ - self._parser = parser - - kwargs['_context_pipeline'] = ( - ContextPipeline() - .compose(self._make_http_request) - .compose(self._parse_http_response) - .compose(self._handle_blocked_request) - ) - - kwargs.setdefault( - 'http_client', - HttpxHttpClient( - additional_http_error_status_codes=additional_http_error_status_codes, - ignore_http_error_status_codes=ignore_http_error_status_codes, - ), - ) - - kwargs.setdefault('_logger', logging.getLogger(__name__)) - super().__init__(**kwargs) + async def final_step( + context: ParsedHttpCrawlingContext[BeautifulSoup], + ) -> AsyncGenerator[BeautifulSoupCrawlingContext, None]: + """Enhance `ParsedHttpCrawlingContext[BeautifulSoup]` with `soup` property.""" + yield BeautifulSoupCrawlingContext.from_parsed_http_crawling_context(context) - async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]: - """Executes an HTTP request using a configured HTTP client. + kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline().compose(final_step) - Args: - context: The crawling context from the `BasicCrawler`. - - Yields: - The enhanced crawling context with the HTTP response. - """ - result = await self._http_client.crawl( - request=context.request, - session=context.session, - proxy_info=context.proxy_info, - statistics=self._statistics, - ) - - yield HttpCrawlingContext( - request=context.request, - session=context.session, - proxy_info=context.proxy_info, - add_requests=context.add_requests, - send_request=context.send_request, - push_data=context.push_data, - get_key_value_store=context.get_key_value_store, - log=context.log, - http_response=result.http_response, - ) - - async def _handle_blocked_request( - self, - context: BeautifulSoupCrawlingContext, - ) -> AsyncGenerator[BeautifulSoupCrawlingContext, None]: - """Try to detect if the request is blocked based on the HTTP status code or the response content. - - Args: - context: The current crawling context. - - Raises: - SessionError: If the request is considered blocked. - - Yields: - The original crawling context if no errors are detected. 
- """ - if self._retry_on_blocked: - status_code = context.http_response.status_code - - # TODO: refactor to avoid private member access - # https://github.com/apify/crawlee-python/issues/708 - if ( - context.session - and status_code not in self._http_client._ignore_http_error_status_codes # noqa: SLF001 - and context.session.is_blocked_status_code(status_code=status_code) - ): - raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}') - - matched_selectors = [ - selector for selector in RETRY_CSS_SELECTORS if context.soup.select_one(selector) is not None - ] - - if matched_selectors: - raise SessionError( - 'Assuming the session is blocked - ' - f"HTTP response matched the following selectors: {'; '.join(matched_selectors)}" - ) - - yield context - - async def _parse_http_response( - self, - context: HttpCrawlingContext, - ) -> AsyncGenerator[BeautifulSoupCrawlingContext, None]: - """Parse the HTTP response using the `BeautifulSoup` library and implements the `enqueue_links` function. - - Args: - context: The current crawling context. - - Yields: - The enhanced crawling context with the `BeautifulSoup` selector and the `enqueue_links` function. - """ - soup = await asyncio.to_thread(lambda: BeautifulSoup(context.http_response.read(), self._parser)) - - async def enqueue_links( - *, - selector: str = 'a', - label: str | None = None, - user_data: dict[str, Any] | None = None, - **kwargs: Unpack[EnqueueLinksKwargs], - ) -> None: - kwargs.setdefault('strategy', EnqueueStrategy.SAME_HOSTNAME) - - requests = list[BaseRequestData]() - user_data = user_data or {} - - link: Tag - for link in soup.select(selector): - link_user_data = user_data - - if label is not None: - link_user_data.setdefault('label', label) - - if (url := link.attrs.get('href')) is not None: - url = url.strip() - - if not is_url_absolute(url): - url = convert_to_absolute_url(context.request.url, url) - - try: - request = BaseRequestData.from_url(url, user_data=link_user_data) - except ValidationError as exc: - context.log.debug( - f'Skipping URL "{url}" due to invalid format: {exc}. ' - 'This may be caused by a malformed URL or unsupported URL scheme. ' - 'Please ensure the URL is correct and retry.' 
- ) - continue - - requests.append(request) - - await context.add_requests(requests, **kwargs) - - yield BeautifulSoupCrawlingContext( - request=context.request, - session=context.session, - proxy_info=context.proxy_info, - enqueue_links=enqueue_links, - add_requests=context.add_requests, - send_request=context.send_request, - push_data=context.push_data, - get_key_value_store=context.get_key_value_store, - log=context.log, - http_response=context.http_response, - soup=soup, + super().__init__( + parser=BeautifulSoupParser(parser=parser), + **kwargs, ) diff --git a/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py index 8658782181..f01d66a1c0 100644 --- a/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py +++ b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py @@ -1,26 +1,26 @@ -from __future__ import annotations +from dataclasses import dataclass, fields -from dataclasses import dataclass -from typing import TYPE_CHECKING +from bs4 import BeautifulSoup +from typing_extensions import Self -from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction from crawlee._utils.docs import docs_group -from crawlee.http_crawler import HttpCrawlingResult - -if TYPE_CHECKING: - from bs4 import BeautifulSoup +from crawlee.abstract_http_crawler._http_crawling_context import ParsedHttpCrawlingContext @dataclass(frozen=True) @docs_group('Data structures') -class BeautifulSoupCrawlingContext(HttpCrawlingResult, BasicCrawlingContext): +class BeautifulSoupCrawlingContext(ParsedHttpCrawlingContext[BeautifulSoup]): """The crawling context used by the `BeautifulSoupCrawler`. It provides access to key objects as well as utility functions for handling crawling tasks. 
""" - soup: BeautifulSoup - """The `BeautifulSoup` object for the current page.""" + @property + def soup(self) -> BeautifulSoup: + """Convenience alias.""" + return self.parsed_content - enqueue_links: EnqueueLinksFunction - """The BeautifulSoup `EnqueueLinksFunction` implementation.""" + @classmethod + def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[BeautifulSoup]) -> Self: + """Convenience constructor that creates new context from existing `ParsedHttpCrawlingContext[BeautifulSoup]`.""" + return cls(**{field.name: getattr(context, field.name) for field in fields(context)}) diff --git a/src/crawlee/beautifulsoup_crawler/_beautifulsoup_parser.py b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_parser.py new file mode 100644 index 0000000000..f523db9e2c --- /dev/null +++ b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_parser.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal + +from bs4 import BeautifulSoup, Tag +from typing_extensions import override + +from crawlee.abstract_http_crawler._abstract_http_parser import AbstractHttpParser + +if TYPE_CHECKING: + from collections.abc import Iterable + + from crawlee.http_clients import HttpResponse + + +class BeautifulSoupParser(AbstractHttpParser[BeautifulSoup]): + """Parser for parsing HTTP response using `BeautifulSoup`.""" + + def __init__(self, parser: BeautifulSoupParserType = 'lxml') -> None: + self._parser = parser + + @override + async def parse(self, response: HttpResponse) -> BeautifulSoup: + return BeautifulSoup(response.read(), features=self._parser) + + @override + def is_matching_selector(self, parsed_content: BeautifulSoup, selector: str) -> bool: + return parsed_content.select_one(selector) is not None + + @override + def find_links(self, parsed_content: BeautifulSoup, selector: str) -> Iterable[str]: + link: Tag + urls: list[str] = [] + for link in parsed_content.select(selector): + url = link.attrs.get('href') + if url: + urls.append(url.strip()) + return urls + + +BeautifulSoupParserType = Literal['html.parser', 'lxml', 'xml', 'html5lib'] diff --git a/src/crawlee/http_clients/_base.py b/src/crawlee/http_clients/_base.py index dcca721184..9012fdc431 100644 --- a/src/crawlee/http_clients/_base.py +++ b/src/crawlee/http_clients/_base.py @@ -41,7 +41,7 @@ def read(self) -> bytes: @dataclass(frozen=True) @docs_group('Data structures') class HttpCrawlingResult: - """Result of a HTTP-only crawl. + """Result of an HTTP-only crawl. Mainly for the purpose of composing specific crawling contexts (e.g. `BeautifulSoupCrawlingContext`, `ParselCrawlingContext`, ...). 
diff --git a/src/crawlee/http_crawler/__init__.py b/src/crawlee/http_crawler/__init__.py index d3138a180c..0990525000 100644 --- a/src/crawlee/http_crawler/__init__.py +++ b/src/crawlee/http_crawler/__init__.py @@ -1,4 +1,10 @@ +from crawlee.abstract_http_crawler._http_crawling_context import HttpCrawlingContext +from crawlee.http_clients import HttpCrawlingResult + from ._http_crawler import HttpCrawler -from ._http_crawling_context import HttpCrawlingContext, HttpCrawlingResult -__all__ = ['HttpCrawler', 'HttpCrawlingContext', 'HttpCrawlingResult'] +__all__ = [ + 'HttpCrawler', + 'HttpCrawlingContext', + 'HttpCrawlingResult', +] diff --git a/src/crawlee/http_crawler/_http_crawler.py b/src/crawlee/http_crawler/_http_crawler.py index c5813b11b0..94672fe0fe 100644 --- a/src/crawlee/http_crawler/_http_crawler.py +++ b/src/crawlee/http_crawler/_http_crawler.py @@ -1,33 +1,23 @@ from __future__ import annotations -import logging from typing import TYPE_CHECKING from crawlee._utils.docs import docs_group -from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, ContextPipeline -from crawlee.errors import SessionError -from crawlee.http_clients import HttpxHttpClient -from crawlee.http_crawler._http_crawling_context import HttpCrawlingContext +from crawlee.abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions, ParsedHttpCrawlingContext -if TYPE_CHECKING: - from collections.abc import AsyncGenerator, Iterable +from ._http_parser import NoParser +if TYPE_CHECKING: from typing_extensions import Unpack - from crawlee._types import BasicCrawlingContext - @docs_group('Classes') -class HttpCrawler(BasicCrawler[HttpCrawlingContext]): - """A web crawler for performing HTTP requests. +class HttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[bytes], bytes]): + """Specific version of generic `AbstractHttpCrawler`. - The `HttpCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features. On top - of that it implements the HTTP communication using the HTTP clients. The class allows integration with - any HTTP client that implements the `BaseHttpClient` interface. The HTTP client is provided to the crawler - as an input parameter to the constructor. - - The HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. However, - if you need to execute client-side JavaScript, consider using a browser-based crawler like the `PlaywrightCrawler`. + It uses a dummy parser that simply returns the HTTP response body as-is. Use this only if you know what you are + doing. In most cases, using an HTML parser would be more beneficial. For such scenarios, consider using + `BeautifulSoupCrawler`, `ParselCrawler`, or writing your own subclass of `AbstractHttpCrawler`. ### Usage @@ -56,86 +46,15 @@ async def request_handler(context: HttpCrawlingContext) -> None: def __init__( self, - *, - additional_http_error_status_codes: Iterable[int] = (), - ignore_http_error_status_codes: Iterable[int] = (), - **kwargs: Unpack[BasicCrawlerOptions[HttpCrawlingContext]], + **kwargs: Unpack[HttpCrawlerOptions[ParsedHttpCrawlingContext[bytes]]], ) -> None: """A default constructor. Args: - additional_http_error_status_codes: Additional HTTP status codes to treat as errors, triggering - automatic retries when encountered. - ignore_http_error_status_codes: HTTP status codes typically considered errors but to be treated - as successful responses. - kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`. 
- """ - kwargs['_context_pipeline'] = ( - ContextPipeline().compose(self._make_http_request).compose(self._handle_blocked_request) - ) - - kwargs.setdefault( - 'http_client', - HttpxHttpClient( - additional_http_error_status_codes=additional_http_error_status_codes, - ignore_http_error_status_codes=ignore_http_error_status_codes, - ), - ) - - kwargs.setdefault('_logger', logging.getLogger(__name__)) - - super().__init__(**kwargs) - - async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]: - """Executes an HTTP request using a configured HTTP client. - - Args: - context: The crawling context from the `BasicCrawler`. - - Yields: - The enhanced crawling context with the HTTP response. + kwargs: Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`. """ - result = await self._http_client.crawl( - request=context.request, - session=context.session, - proxy_info=context.proxy_info, - statistics=self._statistics, + kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline() + super().__init__( + parser=NoParser(), + **kwargs, ) - - yield HttpCrawlingContext( - request=context.request, - session=context.session, - proxy_info=context.proxy_info, - add_requests=context.add_requests, - send_request=context.send_request, - push_data=context.push_data, - get_key_value_store=context.get_key_value_store, - log=context.log, - http_response=result.http_response, - ) - - async def _handle_blocked_request(self, context: HttpCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]: - """Try to detect if the request is blocked based on the HTTP status code. - - Args: - context: The current crawling context. - - Raises: - SessionError: If the request is considered blocked. - - Yields: - The original crawling context if no errors are detected. 
- """ - if self._retry_on_blocked: - status_code = context.http_response.status_code - - # TODO: refactor to avoid private member access - # https://github.com/apify/crawlee-python/issues/708 - if ( - context.session - and status_code not in self._http_client._ignore_http_error_status_codes # noqa: SLF001 - and context.session.is_blocked_status_code(status_code=status_code) - ): - raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}') - - yield context diff --git a/src/crawlee/http_crawler/_http_crawling_context.py b/src/crawlee/http_crawler/_http_crawling_context.py deleted file mode 100644 index 414e9f43bb..0000000000 --- a/src/crawlee/http_crawler/_http_crawling_context.py +++ /dev/null @@ -1,13 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass - -from crawlee._types import BasicCrawlingContext -from crawlee._utils.docs import docs_group -from crawlee.http_clients import HttpCrawlingResult - - -@dataclass(frozen=True) -@docs_group('Data structures') -class HttpCrawlingContext(BasicCrawlingContext, HttpCrawlingResult): - """The crawling context used by the `HttpCrawler`.""" diff --git a/src/crawlee/http_crawler/_http_parser.py b/src/crawlee/http_crawler/_http_parser.py new file mode 100644 index 0000000000..47d47f2d3b --- /dev/null +++ b/src/crawlee/http_crawler/_http_parser.py @@ -0,0 +1,36 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from typing_extensions import override + +from crawlee.abstract_http_crawler import AbstractHttpParser +from crawlee.basic_crawler import BlockedInfo + +if TYPE_CHECKING: + from collections.abc import Iterable + + from crawlee.http_clients import HttpResponse + + +class NoParser(AbstractHttpParser[bytes]): + """Dummy parser for backwards compatibility. + + To enable using `HttpCrawler` without need for additional specific parser. + """ + + @override + async def parse(self, response: HttpResponse) -> bytes: + return response.read() + + @override + def is_blocked(self, parsed_content: bytes) -> BlockedInfo: # Intentional unused argument. + return BlockedInfo(reason='') + + @override + def is_matching_selector(self, parsed_content: bytes, selector: str) -> bool: # Intentional unused argument. + return False + + @override + def find_links(self, parsed_content: bytes, selector: str) -> Iterable[str]: # Intentional unused argument. 
+ return [] diff --git a/src/crawlee/parsel_crawler/_parsel_crawler.py b/src/crawlee/parsel_crawler/_parsel_crawler.py index d3ba78af41..0d47b4014a 100644 --- a/src/crawlee/parsel_crawler/_parsel_crawler.py +++ b/src/crawlee/parsel_crawler/_parsel_crawler.py @@ -1,39 +1,30 @@ from __future__ import annotations -import asyncio -import logging -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING from parsel import Selector -from pydantic import ValidationError -from crawlee import EnqueueStrategy -from crawlee._request import BaseRequestData -from crawlee._utils.blocked import RETRY_CSS_SELECTORS from crawlee._utils.docs import docs_group -from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute -from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, ContextPipeline -from crawlee.errors import SessionError -from crawlee.http_clients import HttpxHttpClient -from crawlee.http_crawler import HttpCrawlingContext -from crawlee.parsel_crawler._parsel_crawling_context import ParselCrawlingContext +from crawlee.abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions +from crawlee.parsel_crawler._parsel_parser import ParselParser + +from ._parsel_crawling_context import ParselCrawlingContext if TYPE_CHECKING: - from collections.abc import AsyncGenerator, Iterable + from collections.abc import AsyncGenerator from typing_extensions import Unpack - from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs + from crawlee.abstract_http_crawler import ParsedHttpCrawlingContext @docs_group('Classes') -class ParselCrawler(BasicCrawler[ParselCrawlingContext]): +class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector]): """A web crawler for performing HTTP requests and parsing HTML/XML content. - The `ParselCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features. - On top of that it implements the HTTP communication using the HTTP clients and HTML/XML parsing using the - `Parsel` library. The class allows integration with any HTTP client that implements the `BaseHttpClient` - interface. The HTTP client is provided to the crawler as an input parameter to the constructor. + The `ParselCrawler` builds on top of the `AbstractHttpCrawler`, which means it inherits all of its features. + It specifies its own parser `ParselParser` which is used to parse `HttpResponse`. + `ParselParser` uses following library for parsing: https://pypi.org/project/parsel/ The HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. However, if you need to execute client-side JavaScript, consider using browser-based crawler like the `PlaywrightCrawler`. @@ -65,172 +56,22 @@ async def request_handler(context: ParselCrawlingContext) -> None: def __init__( self, - *, - additional_http_error_status_codes: Iterable[int] = (), - ignore_http_error_status_codes: Iterable[int] = (), - **kwargs: Unpack[BasicCrawlerOptions[ParselCrawlingContext]], + **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]], ) -> None: """A default constructor. Args: - additional_http_error_status_codes: Additional HTTP status codes to treat as errors, triggering - automatic retries when encountered. - ignore_http_error_status_codes: HTTP status codes typically considered errors but to be treated - as successful responses. - kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`. + kwargs: Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`. 
""" - kwargs['_context_pipeline'] = ( - ContextPipeline() - .compose(self._make_http_request) - .compose(self._parse_http_response) - .compose(self._handle_blocked_request) - ) - - kwargs.setdefault( - 'http_client', - HttpxHttpClient( - additional_http_error_status_codes=additional_http_error_status_codes, - ignore_http_error_status_codes=ignore_http_error_status_codes, - ), - ) - - kwargs.setdefault('_logger', logging.getLogger(__name__)) - - super().__init__(**kwargs) - async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]: - """Executes an HTTP request using a configured HTTP client. + async def final_step( + context: ParsedHttpCrawlingContext[Selector], + ) -> AsyncGenerator[ParselCrawlingContext, None]: + """Enhance `ParsedHttpCrawlingContext[Selector]` with a `selector` property.""" + yield ParselCrawlingContext.from_parsed_http_crawling_context(context) - Args: - context: The crawling context from the `BasicCrawler`. - - Yields: - The enhanced crawling context with the HTTP response. - """ - result = await self._http_client.crawl( - request=context.request, - session=context.session, - proxy_info=context.proxy_info, - statistics=self._statistics, - ) - - yield HttpCrawlingContext( - request=context.request, - session=context.session, - proxy_info=context.proxy_info, - add_requests=context.add_requests, - send_request=context.send_request, - push_data=context.push_data, - get_key_value_store=context.get_key_value_store, - log=context.log, - http_response=result.http_response, - ) - - async def _handle_blocked_request( - self, context: ParselCrawlingContext - ) -> AsyncGenerator[ParselCrawlingContext, None]: - """Try to detect if the request is blocked based on the HTTP status code or the response content. - - Args: - context: The current crawling context. - - Raises: - SessionError: If the request is considered blocked. - - Yields: - The original crawling context if no errors are detected. - """ - if self._retry_on_blocked: - status_code = context.http_response.status_code - - # TODO: refactor to avoid private member access - # https://github.com/apify/crawlee-python/issues/708 - if ( - context.session - and status_code not in self._http_client._ignore_http_error_status_codes # noqa: SLF001 - and context.session.is_blocked_status_code(status_code=status_code) - ): - raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}') - - parsel = context.selector - - matched_selectors = [ - selector - for selector in RETRY_CSS_SELECTORS - if parsel.type in ('html', 'xml') and parsel.css(selector).get() is not None - ] - - if matched_selectors: - raise SessionError( - 'Assuming the session is blocked - ' - f"HTTP response matched the following selectors: {'; '.join(matched_selectors)}" - ) - - yield context - - async def _parse_http_response( - self, - context: HttpCrawlingContext, - ) -> AsyncGenerator[ParselCrawlingContext, None]: - """Parse the HTTP response using the `Parsel` library and implements the `enqueue_links` function. - - Args: - context: The current crawling context. - - Yields: - The enhanced crawling context with the `Parsel` selector and the `enqueue_links` function. 
- """ - parsel_selector = await asyncio.to_thread(lambda: Selector(body=context.http_response.read())) - - async def enqueue_links( - *, - selector: str = 'a', - label: str | None = None, - user_data: dict[str, Any] | None = None, - **kwargs: Unpack[EnqueueLinksKwargs], - ) -> None: - kwargs.setdefault('strategy', EnqueueStrategy.SAME_HOSTNAME) - - requests = list[BaseRequestData]() - user_data = user_data or {} - - link: Selector - for link in parsel_selector.css(selector): - link_user_data = user_data - - if label is not None: - link_user_data.setdefault('label', label) - - if (url := link.xpath('@href').get()) is not None: - url = url.strip() - - if not is_url_absolute(url): - url = str(convert_to_absolute_url(context.request.url, url)) - - try: - request = BaseRequestData.from_url(url, user_data=link_user_data) - except ValidationError as exc: - context.log.debug( - f'Skipping URL "{url}" due to invalid format: {exc}. ' - 'This may be caused by a malformed URL or unsupported URL scheme. ' - 'Please ensure the URL is correct and retry.' - ) - continue - - requests.append(request) - - await context.add_requests(requests, **kwargs) - - yield ParselCrawlingContext( - request=context.request, - session=context.session, - proxy_info=context.proxy_info, - enqueue_links=enqueue_links, - add_requests=context.add_requests, - send_request=context.send_request, - push_data=context.push_data, - get_key_value_store=context.get_key_value_store, - log=context.log, - http_response=context.http_response, - selector=parsel_selector, + kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline().compose(final_step) + super().__init__( + parser=ParselParser(), + **kwargs, ) diff --git a/src/crawlee/parsel_crawler/_parsel_crawling_context.py b/src/crawlee/parsel_crawler/_parsel_crawling_context.py index 22a565d678..5dd13e3868 100644 --- a/src/crawlee/parsel_crawler/_parsel_crawling_context.py +++ b/src/crawlee/parsel_crawler/_parsel_crawling_context.py @@ -1,26 +1,26 @@ -from __future__ import annotations +from dataclasses import dataclass, fields -from dataclasses import dataclass -from typing import TYPE_CHECKING +from parsel import Selector +from typing_extensions import Self -from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction from crawlee._utils.docs import docs_group -from crawlee.http_crawler import HttpCrawlingResult - -if TYPE_CHECKING: - from parsel import Selector +from crawlee.abstract_http_crawler._http_crawling_context import ParsedHttpCrawlingContext @dataclass(frozen=True) @docs_group('Data structures') -class ParselCrawlingContext(HttpCrawlingResult, BasicCrawlingContext): +class ParselCrawlingContext(ParsedHttpCrawlingContext[Selector]): """The crawling context used by the `ParselCrawler`. It provides access to key objects as well as utility functions for handling crawling tasks. 
""" - selector: Selector - """The Parsel `Selector` object for the current page.""" + @property + def selector(self) -> Selector: + """Convenience alias.""" + return self.parsed_content - enqueue_links: EnqueueLinksFunction - """The Parsel `EnqueueLinksFunction` implementation.""" + @classmethod + def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[Selector]) -> Self: + """Convenience constructor that creates new context from existing `ParsedHttpCrawlingContext[BeautifulSoup]`.""" + return cls(**{field.name: getattr(context, field.name) for field in fields(context)}) diff --git a/src/crawlee/parsel_crawler/_parsel_parser.py b/src/crawlee/parsel_crawler/_parsel_parser.py new file mode 100644 index 0000000000..bde77fbfde --- /dev/null +++ b/src/crawlee/parsel_crawler/_parsel_parser.py @@ -0,0 +1,30 @@ +import asyncio +from collections.abc import Iterable + +from parsel import Selector +from typing_extensions import override + +from crawlee.abstract_http_crawler._abstract_http_parser import AbstractHttpParser +from crawlee.http_clients import HttpResponse + + +class ParselParser(AbstractHttpParser[Selector]): + """Parser for parsing HTTP response using Parsel.""" + + @override + async def parse(self, response: HttpResponse) -> Selector: + return await asyncio.to_thread(lambda: Selector(body=response.read())) + + @override + def is_matching_selector(self, parsed_content: Selector, selector: str) -> bool: + return parsed_content.type in ('html', 'xml') and parsed_content.css(selector).get() is not None + + @override + def find_links(self, parsed_content: Selector, selector: str) -> Iterable[str]: + link: Selector + urls: list[str] = [] + for link in parsed_content.css(selector): + url = link.xpath('@href').get() + if url: + urls.append(url.strip()) + return urls diff --git a/tests/unit/beautifulsoup_crawler/test_beautifulsoup_crawler.py b/tests/unit/beautifulsoup_crawler/test_beautifulsoup_crawler.py index 726599dd7e..1dabf7770e 100644 --- a/tests/unit/beautifulsoup_crawler/test_beautifulsoup_crawler.py +++ b/tests/unit/beautifulsoup_crawler/test_beautifulsoup_crawler.py @@ -8,14 +8,12 @@ from httpx import Response from crawlee import ConcurrencySettings -from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler +from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext from crawlee.storages import RequestList if TYPE_CHECKING: from collections.abc import AsyncGenerator - from crawlee.beautifulsoup_crawler import BeautifulSoupCrawlingContext - @pytest.fixture async def server() -> AsyncGenerator[respx.MockRouter, None]: diff --git a/tests/unit/http_crawler/test_http_crawler.py b/tests/unit/http_crawler/test_http_crawler.py index 53c53164d2..c97d15072b 100644 --- a/tests/unit/http_crawler/test_http_crawler.py +++ b/tests/unit/http_crawler/test_http_crawler.py @@ -20,8 +20,7 @@ from collections.abc import AsyncGenerator, Awaitable from crawlee.http_clients._base import BaseHttpClient - from crawlee.http_crawler._http_crawling_context import HttpCrawlingContext - + from crawlee.http_crawler import HttpCrawlingContext # Payload, e.g. data for a form submission. PAYLOAD = {