Merged

Commits (changes shown from 13 of 52 commits)
8c8dd24
WIP
Pijukatel Nov 21, 2024
48812b1
Draft proposal for discussion.
Pijukatel Nov 21, 2024
853ee85
Remove redundant type
Pijukatel Nov 21, 2024
17e08a1
BeautifulSoupParser
Pijukatel Nov 22, 2024
188afdb
Being stuck on mypy and generics
Pijukatel Nov 22, 2024
96356d6
Almost there. Figure out the reason for casts in middleware
Pijukatel Nov 22, 2024
def0e72
Solved BScrawler. Next ParselCrawler.
Pijukatel Nov 26, 2024
54ce154
Reworked ParselCrawler
Pijukatel Nov 26, 2024
4692fe9
Ready for review.
Pijukatel Nov 26, 2024
e2e3cd9
Merge remote-tracking branch 'origin/master' into new-class-hier-curr…
Pijukatel Nov 26, 2024
bb8cd12
Edit forgotten comment.
Pijukatel Nov 26, 2024
f869be6
Remove mistaken edits in docs
Pijukatel Nov 26, 2024
81e46cd
Merge branch 'master' into new-class-hier-current-middleware
Pijukatel Nov 26, 2024
f994e32
Reformat after merge.
Pijukatel Nov 26, 2024
bbc27af
Fix CI reported issues on previous Python versions
Pijukatel Nov 26, 2024
7567164
Update docstrings in child crawlers to not repeat text after parent.
Pijukatel Nov 26, 2024
9335967
Revert incorrect docstring update.
Pijukatel Nov 26, 2024
b4877cb
Review comments
Pijukatel Nov 26, 2024
2929be1
Reverted back name change in doc strings.
Pijukatel Nov 26, 2024
19bc041
Fix CI reported issues.
Pijukatel Nov 26, 2024
fe19345
Fix incorrectly named BS argument
Pijukatel Nov 26, 2024
6ab5a09
Changes by Honza
Pijukatel Nov 27, 2024
2af695b
Polish proposed changes.
Pijukatel Nov 27, 2024
0b0f4ce
Review comments
Pijukatel Nov 27, 2024
03832fb
Review comments about internal imports in docs
Pijukatel Nov 27, 2024
005c7cf
Extract is_matching_selector from Parser and put
Pijukatel Nov 27, 2024
fc2de60
Update src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_cont…
Pijukatel Nov 28, 2024
578cdc0
Update src/crawlee/http_crawler/_http_crawler.py
Pijukatel Nov 28, 2024
280cecb
Update src/crawlee/parsel_crawler/_parsel_crawling_context.py
Pijukatel Nov 28, 2024
a88a5e4
Review comments.
Pijukatel Nov 28, 2024
b1c0fad
Use correctly BeautifulSoupParser type
Pijukatel Nov 28, 2024
4e3fbd5
Add doc page describing new classes.
Pijukatel Nov 28, 2024
9fc66d8
Update docs more
Pijukatel Nov 28, 2024
434bd6b
Apply suggestions from code review
Pijukatel Nov 29, 2024
18562de
Review comments.
Pijukatel Nov 29, 2024
b9255be
More review comments
Pijukatel Nov 29, 2024
d70e8a8
Update docs names
Pijukatel Nov 29, 2024
3e87db5
Update docs/guides/static_content_crawlers.mdx
Pijukatel Dec 3, 2024
460e1ac
Review comments.
Pijukatel Dec 3, 2024
8c4ec82
Review comments
Pijukatel Dec 3, 2024
e7c7817
Apply suggestions from code review
Pijukatel Dec 3, 2024
05cec1a
Rename StaticContentCrawler to AbstractContentCrawler and related fil…
Pijukatel Dec 3, 2024
bed215e
Renaming to AbstractHttpCrawler 2
Pijukatel Dec 3, 2024
c43b564
Renaming to AbstractHttpCrawler 2
Pijukatel Dec 3, 2024
a1db9e2
Apply suggestions from code review
Pijukatel Dec 3, 2024
fae917e
Review comments
Pijukatel Dec 3, 2024
b563bf9
Expand docs by short description of how to create your own HTTP-based c…
Pijukatel Dec 3, 2024
89a8e83
Update src/crawlee/abstract_http_crawler/_abstract_http_crawler.py
Pijukatel Dec 3, 2024
139b21b
Update src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py
Pijukatel Dec 4, 2024
bd7846f
Apply suggestions from code review
Pijukatel Dec 4, 2024
454f9ec
Review comments
Pijukatel Dec 4, 2024
6bba552
Move BlockedInfo to its own file.
Pijukatel Dec 4, 2024
2 changes: 1 addition & 1 deletion src/crawlee/basic_crawler/_basic_crawler.py
@@ -147,7 +147,7 @@ class BasicCrawler(Generic[TCrawlingContext]):

The `BasicCrawler` provides a low-level functionality for crawling websites, allowing users to define their
own page download and data extraction logic. It is designed mostly to be subclassed by crawlers with specific
purposes. In most cases, you will want to use a more specialized crawler, such as `HttpCrawler`,
purposes. In most cases, you will want to use a more specialized crawler, such as `HttpCrawlerGeneric`,
`BeautifulSoupCrawler`, `ParselCrawler`, or `PlaywrightCrawler`. If you are an advanced user and want full
control over the crawling process, you can subclass the `BasicCrawler` and implement the request-handling logic
yourself.
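The updated docstring steers most users toward a specialized crawler rather than subclassing `BasicCrawler`. For orientation, a minimal sketch of that recommended path is below; it expands the truncated usage example shown later in the `BeautifulSoupCrawler` docstring, assumes the usual `crawler.run(...)` entry point with a placeholder start URL, and assumes `BeautifulSoupCrawlingContext` remains exported from `crawlee.beautifulsoup_crawler` as the type alias introduced later in this diff.

```python
import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    # A specialized crawler; subclassing BasicCrawler is only needed for full control.
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        # Store a small record per page; push_data comes from the basic crawling context.
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```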
193 changes: 18 additions & 175 deletions src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py
@@ -1,38 +1,29 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Iterable, Literal

import asyncio
import logging
from typing import TYPE_CHECKING, Any, Literal
from bs4 import BeautifulSoup

from bs4 import BeautifulSoup, Tag
from pydantic import ValidationError
from crawlee.beautifulsoup_crawler._beautifulsoup_parser import BeautifulSoupContentParser
from crawlee.http_crawler import HttpCrawlerGeneric

from crawlee import EnqueueStrategy
from crawlee._request import BaseRequestData
from crawlee._utils.blocked import RETRY_CSS_SELECTORS
from crawlee._utils.docs import docs_group
from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, ContextPipeline
from crawlee.beautifulsoup_crawler._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
from crawlee.errors import SessionError
from crawlee.http_clients import HttpxHttpClient
from crawlee.http_crawler import HttpCrawlingContext

if TYPE_CHECKING:
from collections.abc import AsyncGenerator, Iterable

from typing_extensions import Unpack

from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs
from crawlee.basic_crawler import BasicCrawlerOptions
from crawlee.http_crawler import ParsedHttpCrawlingContext


BeautifulSoupParser = Literal['html.parser', 'lxml', 'xml', 'html5lib']


@docs_group('Classes')
class BeautifulSoupCrawler(BasicCrawler[BeautifulSoupCrawlingContext]):
class BeautifulSoupCrawler(HttpCrawlerGeneric[BeautifulSoup]):
"""A web crawler for performing HTTP requests and parsing HTML/XML content.

The `BeautifulSoupCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features.
The `BeautifulSoupCrawler` builds on top of the `HttpCrawlerGeneric`, which means it inherits all of its features.
It specifies its own parser, `BeautifulSoupParser`, which is used to parse the `HttpResponse`.
On top of that it implements the HTTP communication using the HTTP clients and HTML/XML parsing using the
`BeautifulSoup` library. The class allows integration with any HTTP client that implements the `BaseHttpClient`
interface. The HTTP client is provided to the crawler as an input parameter to the constructor.
@@ -43,13 +34,13 @@ class BeautifulSoupCrawler(BasicCrawler[BeautifulSoupCrawlingContext]):
### Usage

```python
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from bs4 import BeautifulSoup

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler
from crawlee.http_crawler import ParsedHttpCrawlingContext

crawler = BeautifulSoupCrawler()

# Define the default request handler, which will be called for every request.
@crawler.router.default_handler
async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
async def request_handler(context: ParsedHttpCrawlingContext[BeautifulSoup]) -> None:
context.log.info(f'Processing {context.request.url} ...')

# Extract data from the page.
@@ -71,7 +62,7 @@ def __init__(
parser: BeautifulSoupParser = 'lxml',
additional_http_error_status_codes: Iterable[int] = (),
ignore_http_error_status_codes: Iterable[int] = (),
**kwargs: Unpack[BasicCrawlerOptions[BeautifulSoupCrawlingContext]],
**kwargs: Unpack[BasicCrawlerOptions[ParsedHttpCrawlingContext[BeautifulSoup]]],
) -> None:
"""A default constructor.

@@ -83,157 +74,9 @@ def __init__(
as successful responses.
kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
"""
self._parser = parser

kwargs['_context_pipeline'] = (
ContextPipeline()
.compose(self._make_http_request)
.compose(self._parse_http_response)
.compose(self._handle_blocked_request)
)

kwargs.setdefault(
'http_client',
HttpxHttpClient(
additional_http_error_status_codes=additional_http_error_status_codes,
ignore_http_error_status_codes=ignore_http_error_status_codes,
),
)

kwargs.setdefault('_logger', logging.getLogger(__name__))

super().__init__(**kwargs)

async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:
"""Executes an HTTP request using a configured HTTP client.

Args:
context: The crawling context from the `BasicCrawler`.

Yields:
The enhanced crawling context with the HTTP response.
"""
result = await self._http_client.crawl(
request=context.request,
session=context.session,
proxy_info=context.proxy_info,
statistics=self._statistics,
)

yield HttpCrawlingContext(
request=context.request,
session=context.session,
proxy_info=context.proxy_info,
add_requests=context.add_requests,
send_request=context.send_request,
push_data=context.push_data,
get_key_value_store=context.get_key_value_store,
log=context.log,
http_response=result.http_response,
)

async def _handle_blocked_request(
self,
context: BeautifulSoupCrawlingContext,
) -> AsyncGenerator[BeautifulSoupCrawlingContext, None]:
"""Try to detect if the request is blocked based on the HTTP status code or the response content.

Args:
context: The current crawling context.

Raises:
SessionError: If the request is considered blocked.

Yields:
The original crawling context if no errors are detected.
"""
if self._retry_on_blocked:
status_code = context.http_response.status_code

# TODO: refactor to avoid private member access
# https://github.com/apify/crawlee-python/issues/708
if (
context.session
and status_code not in self._http_client._ignore_http_error_status_codes # noqa: SLF001
and context.session.is_blocked_status_code(status_code=status_code)
):
raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')

matched_selectors = [
selector for selector in RETRY_CSS_SELECTORS if context.soup.select_one(selector) is not None
]

if matched_selectors:
raise SessionError(
'Assuming the session is blocked - '
f"HTTP response matched the following selectors: {'; '.join(matched_selectors)}"
)

yield context

async def _parse_http_response(
self,
context: HttpCrawlingContext,
) -> AsyncGenerator[BeautifulSoupCrawlingContext, None]:
"""Parse the HTTP response using the `BeautifulSoup` library and implements the `enqueue_links` function.

Args:
context: The current crawling context.

Yields:
The enhanced crawling context with the `BeautifulSoup` selector and the `enqueue_links` function.
"""
soup = await asyncio.to_thread(lambda: BeautifulSoup(context.http_response.read(), self._parser))

async def enqueue_links(
*,
selector: str = 'a',
label: str | None = None,
user_data: dict[str, Any] | None = None,
**kwargs: Unpack[EnqueueLinksKwargs],
) -> None:
kwargs.setdefault('strategy', EnqueueStrategy.SAME_HOSTNAME)

requests = list[BaseRequestData]()
user_data = user_data or {}

link: Tag
for link in soup.select(selector):
link_user_data = user_data

if label is not None:
link_user_data.setdefault('label', label)

if (url := link.attrs.get('href')) is not None:
url = url.strip()

if not is_url_absolute(url):
url = convert_to_absolute_url(context.request.url, url)

try:
request = BaseRequestData.from_url(url, user_data=link_user_data)
except ValidationError as exc:
context.log.debug(
f'Skipping URL "{url}" due to invalid format: {exc}. '
'This may be caused by a malformed URL or unsupported URL scheme. '
'Please ensure the URL is correct and retry.'
)
continue

requests.append(request)

await context.add_requests(requests, **kwargs)

yield BeautifulSoupCrawlingContext(
request=context.request,
session=context.session,
proxy_info=context.proxy_info,
enqueue_links=enqueue_links,
add_requests=context.add_requests,
send_request=context.send_request,
push_data=context.push_data,
get_key_value_store=context.get_key_value_store,
log=context.log,
http_response=context.http_response,
soup=soup,
super().__init__(
parser=BeautifulSoupContentParser(parser=parser),
additional_http_error_status_codes=additional_http_error_status_codes,
ignore_http_error_status_codes=ignore_http_error_status_codes,
**kwargs,
)
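With the constructor reduced to wiring a `BeautifulSoupContentParser` into `HttpCrawlerGeneric`, the same pattern should extend to other static formats. The sketch below is illustrative only: `JsonParser` and `JsonCrawler` are hypothetical names, and it assumes `HttpCrawlerGeneric` needs nothing beyond the `parser` argument plus the usual `BasicCrawler` keyword options.

```python
from __future__ import annotations

import json
from typing import TYPE_CHECKING, Any, Iterable

from typing_extensions import override

from crawlee.http_crawler import BlockedInfo, HttpCrawlerGeneric, StaticContentParser

if TYPE_CHECKING:
    from crawlee.http_clients import HttpResponse


class JsonParser(StaticContentParser[dict[str, Any]]):
    """Hypothetical parser that treats every HTTP response body as JSON."""

    @override
    async def parse(self, response: HttpResponse) -> dict[str, Any]:
        return json.loads(response.read())

    @override
    def is_blocked(self, parsed_content: dict[str, Any]) -> BlockedInfo:
        # A JSON payload has no HTML to match against RETRY_CSS_SELECTORS,
        # so never report a content-based block.
        return BlockedInfo(reason='')

    @override
    def find_links(self, parsed_content: dict[str, Any], selector: str) -> Iterable[str]:
        # No anchor tags in JSON; enqueue_links would simply find nothing.
        return []


class JsonCrawler(HttpCrawlerGeneric[dict[str, Any]]):
    """Hypothetical crawler assembled the same way BeautifulSoupCrawler is above."""

    def __init__(self, **kwargs: Any) -> None:
        super().__init__(parser=JsonParser(), **kwargs)
```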
src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py
@@ -1,26 +1,5 @@
from __future__ import annotations
from bs4 import BeautifulSoup

from dataclasses import dataclass
from typing import TYPE_CHECKING
from crawlee.http_crawler import ParsedHttpCrawlingContext

from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction
from crawlee._utils.docs import docs_group
from crawlee.http_crawler import HttpCrawlingResult

if TYPE_CHECKING:
from bs4 import BeautifulSoup


@dataclass(frozen=True)
@docs_group('Data structures')
class BeautifulSoupCrawlingContext(HttpCrawlingResult, BasicCrawlingContext):
"""The crawling context used by the `BeautifulSoupCrawler`.
It provides access to key objects as well as utility functions for handling crawling tasks.
"""

soup: BeautifulSoup
"""The `BeautifulSoup` object for the current page."""

enqueue_links: EnqueueLinksFunction
"""The BeautifulSoup `EnqueueLinksFunction` implementation."""
BeautifulSoupCrawlingContext = ParsedHttpCrawlingContext[BeautifulSoup]
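Since the dataclass is replaced by a plain type alias, the two annotations below refer to the same type; this small check assumes both names stay importable from their respective packages after this change.

```python
from bs4 import BeautifulSoup

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawlingContext
from crawlee.http_crawler import ParsedHttpCrawlingContext


# Handlers annotated either way receive the same context object at runtime.
async def handler_a(context: BeautifulSoupCrawlingContext) -> None: ...


async def handler_b(context: ParsedHttpCrawlingContext[BeautifulSoup]) -> None: ...
```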
44 changes: 44 additions & 0 deletions src/crawlee/beautifulsoup_crawler/_beautifulsoup_parser.py
@@ -0,0 +1,44 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Iterable

from bs4 import BeautifulSoup, Tag
from typing_extensions import override

from crawlee._utils.blocked import RETRY_CSS_SELECTORS
from crawlee.http_crawler import BlockedInfo, StaticContentParser

if TYPE_CHECKING:
from crawlee.http_clients import HttpResponse


class BeautifulSoupContentParser(StaticContentParser[BeautifulSoup]):
"""Parser for parsing http response using BeautifulSoup."""

def __init__(self, parser: str = 'lxml') -> None:
self._parser = parser

@override
async def parse(self, response: HttpResponse) -> BeautifulSoup:
return BeautifulSoup(response.read(), features=self._parser)

@override
def is_blocked(self, parsed_content: BeautifulSoup) -> BlockedInfo:
reason = ''
if parsed_content is not None:
matched_selectors = [
selector for selector in RETRY_CSS_SELECTORS if parsed_content.select_one(selector) is not None
]
if matched_selectors:
reason = f"Assuming the session is blocked - HTTP response matched the following selectors: {'; '.join(
matched_selectors)}"
return BlockedInfo(reason=reason)

@override
def find_links(self, parsed_content: BeautifulSoup, selector: str) -> Iterable[str]:
link: Tag
urls: list[str] = []
for link in parsed_content.select(selector):
if (url := link.attrs.get('href')) is not None:
urls.append(url.strip()) # noqa: PERF401 #Mypy has problems using is not None for type inference in list comprehension.
return urls
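The parser's synchronous hooks can be exercised on an already-parsed document, without issuing a request. A quick illustration, assuming `lxml` is installed and the module path stays as in this diff:

```python
from bs4 import BeautifulSoup

from crawlee.beautifulsoup_crawler._beautifulsoup_parser import BeautifulSoupContentParser

content_parser = BeautifulSoupContentParser(parser='lxml')

# find_links strips whitespace around href values before returning them.
soup = BeautifulSoup('<a href=" /next ">next</a>', 'lxml')
print(list(content_parser.find_links(soup, selector='a')))  # ['/next']

# is_blocked reports an empty reason when no RETRY_CSS_SELECTORS match.
print(content_parser.is_blocked(soup).reason)  # ''
```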
4 changes: 2 additions & 2 deletions src/crawlee/http_clients/_httpx.py
@@ -84,10 +84,10 @@ class HttpxHttpClient(BaseHttpClient):

```python
from crawlee.http_clients import HttpxHttpClient
from crawlee.http_crawler import HttpCrawler # or any other HTTP client-based crawler
from crawlee.http_crawler import HttpCrawlerGeneric # or any other HTTP client-based crawler

http_client = HttpxHttpClient()
crawler = HttpCrawler(http_client=http_client)
crawler = HttpCrawlerGeneric(http_client=http_client)
```
"""

4 changes: 2 additions & 2 deletions src/crawlee/http_clients/curl_impersonate.py
@@ -85,10 +85,10 @@ class CurlImpersonateHttpClient(BaseHttpClient):

```python
from crawlee.http_clients.curl_impersonate import CurlImpersonateHttpClient
from crawlee.http_crawler import HttpCrawler # or any other HTTP client-based crawler
from crawlee.http_crawler import HttpCrawlerGeneric # or any other HTTP client-based crawler

http_client = CurlImpersonateHttpClient()
crawler = HttpCrawler(http_client=http_client)
crawler = HttpCrawlerGeneric(http_client=http_client)
```
"""

15 changes: 12 additions & 3 deletions src/crawlee/http_crawler/__init__.py
@@ -1,4 +1,13 @@
from ._http_crawler import HttpCrawler
from ._http_crawling_context import HttpCrawlingContext, HttpCrawlingResult
from ._http_crawler import HttpCrawler, HttpCrawlerGeneric
from ._http_crawling_context import HttpCrawlingContext, HttpCrawlingResult, ParsedHttpCrawlingContext
from ._http_parser import BlockedInfo, StaticContentParser

__all__ = ['HttpCrawler', 'HttpCrawlingContext', 'HttpCrawlingResult']
__all__ = [
'BlockedInfo',
'HttpCrawler',
'HttpCrawlerGeneric',
'HttpCrawlingContext',
'HttpCrawlingResult',
'ParsedHttpCrawlingContext',
'StaticContentParser',
]