
Commit 758424b

add AbstractHttpCrawler section

1 parent 247ef79

13 files changed: +306 −67 lines

docs/guides/code_examples/crawler_custom_parser/__init__.py

Whitespace-only changes.

docs/guides/code_examples/httpcrawler_custom_parser/selectolax_parser.py renamed to docs/guides/code_examples/crawler_custom_parser/lexbor_parser.py

File renamed without changes.

docs/guides/code_examples/httpcrawler_custom_parser/lxml_parser.py renamed to docs/guides/code_examples/crawler_custom_parser/lxml_parser.py

File renamed without changes.

docs/guides/code_examples/httpcrawler_custom_parser/lxml_saxonche_parser.py renamed to docs/guides/code_examples/crawler_custom_parser/lxml_saxonche_parser.py

File renamed without changes.

docs/guides/code_examples/httpcrawler_custom_parser/pyquery_parser.py renamed to docs/guides/code_examples/crawler_custom_parser/pyquery_parser.py

File renamed without changes.

docs/guides/code_examples/httpcrawler_custom_parser/scrapling_parser.py renamed to docs/guides/code_examples/crawler_custom_parser/scrapling_parser.py

File renamed without changes.
Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@

from dataclasses import dataclass, fields

from selectolax.lexbor import LexborHTMLParser
from typing_extensions import Self

from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext


@dataclass(frozen=True)
class SelectolaxLexborContext(ParsedHttpCrawlingContext[LexborHTMLParser]):
    """Crawling context providing access to the parsed page.

    This context is passed to request handlers and includes all standard
    context methods (push_data, enqueue_links, etc.) plus custom helpers.
    """

    @property
    def parser(self) -> LexborHTMLParser:
        """Convenient alias for accessing the parsed document."""
        return self.parsed_content

    @classmethod
    def from_parsed_http_crawling_context(
        cls, context: ParsedHttpCrawlingContext[LexborHTMLParser]
    ) -> Self:
        """Create a custom context from the base context.

        Copies all fields from the base context to preserve framework
        functionality while adding a custom interface.
        """
        return cls(
            **{field.name: getattr(context, field.name) for field in fields(context)}
        )
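
For readers skimming the diff: `from_parsed_http_crawling_context` relies on `dataclasses.fields()` to copy every declared field from the base context into the frozen subclass. A minimal standalone sketch of that pattern, using hypothetical `Base`/`Derived` classes rather than Crawlee types:

from dataclasses import dataclass, fields


@dataclass(frozen=True)
class Base:
    url: str
    status: int


@dataclass(frozen=True)
class Derived(Base):
    @property
    def ok(self) -> bool:
        return self.status == 200


base = Base(url='https://example.com', status=200)
# Copy every declared field of the base instance into the subclass.
derived = Derived(**{f.name: getattr(base, f.name) for f in fields(base)})
assert derived.ok

Because the dataclass is frozen, this copy-construct approach is the idiomatic way to "upgrade" an instance to a subclass without mutation.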
Lines changed: 42 additions & 0 deletions

@@ -0,0 +1,42 @@

from __future__ import annotations

from typing import TYPE_CHECKING

from selectolax.lexbor import LexborHTMLParser, LexborNode

from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions

from .selectolax_context import SelectolaxLexborContext
from .selectolax_parser import SelectolaxLexborParser

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator

    from typing_extensions import Unpack

    from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext


class SelectolaxLexborCrawler(
    AbstractHttpCrawler[SelectolaxLexborContext, LexborHTMLParser, LexborNode]
):
    """Custom crawler using Selectolax Lexbor for HTML parsing."""

    def __init__(
        self,
        **kwargs: Unpack[HttpCrawlerOptions[SelectolaxLexborContext]],
    ) -> None:
        # Final step converts the base context to the custom context type.
        async def final_step(
            context: ParsedHttpCrawlingContext[LexborHTMLParser],
        ) -> AsyncGenerator[SelectolaxLexborContext, None]:
            yield SelectolaxLexborContext.from_parsed_http_crawling_context(context)

        # Build context pipeline: HTTP request -> parsing -> custom context.
        kwargs['_context_pipeline'] = (
            self._create_static_content_crawler_pipeline().compose(final_step)
        )
        super().__init__(
            parser=SelectolaxLexborParser(),
            **kwargs,
        )
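
The `final_step` above is an async-generator middleware step: it yields exactly one converted context, and any code after the `yield` would run as cleanup once downstream handlers finish. A minimal sketch of the same pattern in isolation (hypothetical `BaseCtx`/`CustomCtx` types, not Crawlee classes):

import asyncio
from collections.abc import AsyncGenerator
from dataclasses import dataclass


@dataclass(frozen=True)
class BaseCtx:
    url: str


@dataclass(frozen=True)
class CustomCtx(BaseCtx):
    pass


async def final_step(context: BaseCtx) -> AsyncGenerator[CustomCtx, None]:
    # Yield exactly one converted context; code placed after the yield
    # would run as cleanup after the consumer is done with it.
    yield CustomCtx(url=context.url)


async def demo() -> None:
    async for custom in final_step(BaseCtx(url='https://example.com')):
        print(type(custom).__name__, custom.url)


asyncio.run(demo())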
Lines changed: 27 additions & 0 deletions

@@ -0,0 +1,27 @@

import asyncio

from .selectolax_crawler import SelectolaxLexborContext, SelectolaxLexborCrawler


async def main() -> None:
    crawler = SelectolaxLexborCrawler(
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def handle_request(context: SelectolaxLexborContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        data = {
            'url': context.request.url,
            'title': context.parser.css_first('title').text(),
        }

        await context.push_data(data)
        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
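
One edge case worth flagging in the handler above: Selectolax's `css_first()` returns `None` when nothing matches, so `context.parser.css_first('title').text()` raises `AttributeError` on pages without a `<title>` element. A standalone sketch of a None-safe lookup (an editorial suggestion, not part of the commit):

from selectolax.lexbor import LexborHTMLParser

# css_first() returns None when no element matches, so guard before
# calling .text() on the result.
doc = LexborHTMLParser('<html><body><p>No title here</p></body></html>')
title_node = doc.css_first('title')
title = title_node.text() if title_node is not None else None
assert title is None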
Lines changed: 60 additions & 0 deletions

@@ -0,0 +1,60 @@

from __future__ import annotations

import asyncio
from typing import TYPE_CHECKING

from selectolax.lexbor import LexborHTMLParser, LexborNode
from typing_extensions import override

from crawlee.crawlers._abstract_http import AbstractHttpParser

if TYPE_CHECKING:
    from collections.abc import Iterable, Sequence

    from crawlee.http_clients import HttpResponse


class SelectolaxLexborParser(AbstractHttpParser[LexborHTMLParser, LexborNode]):
    """Parser for HTTP responses using Selectolax Lexbor."""

    @override
    async def parse(self, response: HttpResponse) -> LexborHTMLParser:
        """Parse HTTP response body into a document object."""
        response_body = await response.read()
        # Run parsing in a thread to avoid blocking the event loop.
        return await asyncio.to_thread(lambda: LexborHTMLParser(response_body))

    @override
    async def parse_text(self, text: str) -> LexborHTMLParser:
        """Parse raw HTML string into a document object."""
        return LexborHTMLParser(text)

    @override
    async def select(
        self, parsed_content: LexborHTMLParser, selector: str
    ) -> Sequence[LexborNode]:
        """Select elements matching a CSS selector."""
        return tuple(match for match in parsed_content.css(selector))

    @override
    def is_matching_selector(
        self, parsed_content: LexborHTMLParser, selector: str
    ) -> bool:
        """Check if any element matches the selector."""
        return parsed_content.css_first(selector) is not None

    @override
    def find_links(
        self, parsed_content: LexborHTMLParser, selector: str
    ) -> Iterable[str]:
        """Extract href attributes from elements matching the selector.

        Used by `enqueue_links` helper to discover URLs.
        """
        link: LexborNode
        urls: list[str] = []
        for link in parsed_content.css(selector):
            url = link.attributes.get('href')
            if url:
                urls.append(url.strip())
        return urls
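
To sanity-check the parser without a full crawler run, its methods can be exercised directly on an inline HTML string. A quick standalone sketch (illustrative, not part of the commit; assumes `selectolax` is installed and the `SelectolaxLexborParser` class from the file above is in scope):

import asyncio


async def demo() -> None:
    parser = SelectolaxLexborParser()
    document = await parser.parse_text(
        '<html><head><title>Hi</title></head>'
        '<body><a href=" /docs ">Docs</a></body></html>'
    )
    print(parser.is_matching_selector(document, 'title'))  # True
    print(list(parser.find_links(document, 'a')))  # ['/docs']


asyncio.run(demo())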
