Commit e5aff86

add docs "HttpCrawler with custom parser"
1 parent c634d4e commit e5aff86

File tree

7 files changed: +411 −0 lines changed

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
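This example parses the response body with lxml and extracts links with an XPath 1.0 selector: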
import asyncio

from lxml import html
from pydantic import ValidationError

from crawlee import Request
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    crawler = HttpCrawler(
        max_request_retries=1,
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Parse the HTML content using lxml.
        parsed_html = html.fromstring(await context.http_response.read())

        # Extract data from the page.
        data = {
            'url': context.request.url,
            'title': parsed_html.findtext('.//title'),
            'h1s': [h1.text_content() for h1 in parsed_html.findall('.//h1')],
            'h2s': [h2.text_content() for h2 in parsed_html.findall('.//h2')],
            'h3s': [h3.text_content() for h3 in parsed_html.findall('.//h3')],
        }
        await context.push_data(data)

        # Convert relative URLs to absolute before extracting links.
        parsed_html.make_links_absolute(context.request.url, resolve_base_href=True)

        # XPath 1.0 selector for extracting valid href attributes.
        links_xpath = (
            '//a/@href[not(starts-with(., "#")) '
            'and not(starts-with(., "javascript:")) '
            'and not(starts-with(., "mailto:"))]'
        )

        extracted_requests = []

        # Extract links.
        for url in parsed_html.xpath(links_xpath):
            try:
                request = Request.from_url(url)
            except ValidationError as exc:
                context.log.warning(f'Skipping invalid URL "{url}": {exc}')
                continue
            extracted_requests.append(request)

        # Add extracted requests to the queue with the same-domain strategy.
        await context.add_requests(extracted_requests, strategy='same-domain')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
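This example combines lxml parsing with saxonche, so data and links can be extracted with XPath 2.0 expressions such as distinct-values():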
import asyncio

from lxml import html
from pydantic import ValidationError
from saxonche import PySaxonProcessor

from crawlee import Request
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    crawler = HttpCrawler(
        max_request_retries=1,
        max_requests_per_crawl=10,
    )

    # Create the Saxon processor once and reuse it across requests.
    saxon_proc = PySaxonProcessor(license=False)
    xpath_proc = saxon_proc.new_xpath_processor()

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Parse HTML with lxml.
        parsed_html = html.fromstring(await context.http_response.read())
        # Convert relative URLs to absolute before extracting links.
        parsed_html.make_links_absolute(context.request.url, resolve_base_href=True)
        # Convert parsed HTML to XML for Saxon processing.
        xml = html.tostring(parsed_html, encoding='unicode', method='xml')
        # Parse XML with Saxon.
        parsed_xml = saxon_proc.parse_xml(xml_text=xml)
        # Set the parsed context for XPath evaluation.
        xpath_proc.set_context(xdm_item=parsed_xml)

        # Extract data using the XPath 2.0 string() function.
        data = {
            'url': context.request.url,
            'title': xpath_proc.evaluate_single('.//title/string()'),
            'h1s': [str(h) for h in (xpath_proc.evaluate('//h1/string()') or [])],
            'h2s': [str(h) for h in (xpath_proc.evaluate('//h2/string()') or [])],
            'h3s': [str(h) for h in (xpath_proc.evaluate('//h3/string()') or [])],
        }
        await context.push_data(data)

        # XPath 2.0 with distinct-values() to get unique links and remove fragments.
        links_xpath = """
            distinct-values(
                for $href in //a/@href[
                    not(starts-with(., "#"))
                    and not(starts-with(., "javascript:"))
                    and not(starts-with(., "mailto:"))
                ]
                return replace($href, "#.*$", "")
            )
        """

        extracted_requests = []

        # Extract links.
        for item in xpath_proc.evaluate(links_xpath) or []:
            url = item.string_value
            try:
                request = Request.from_url(url)
            except ValidationError as exc:
                context.log.warning(f'Skipping invalid URL "{url}": {exc}')
                continue
            extracted_requests.append(request)

        # Add extracted requests to the queue with the same-domain strategy.
        await context.add_requests(extracted_requests, strategy='same-domain')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
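This example parses the page with PyQuery, extracts data with jQuery-style CSS selectors, and resolves relative links with yarl: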
import asyncio

from pydantic import ValidationError
from pyquery import PyQuery
from yarl import URL

from crawlee import Request
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    crawler = HttpCrawler(
        max_request_retries=1,
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Parse the HTML content using PyQuery.
        parsed_html = PyQuery(await context.http_response.read())

        # Extract data using jQuery-style selectors.
        data = {
            'url': context.request.url,
            'title': parsed_html('title').text(),
            'h1s': [h1.text() for h1 in parsed_html('h1').items()],
            'h2s': [h2.text() for h2 in parsed_html('h2').items()],
            'h3s': [h3.text() for h3 in parsed_html('h3').items()],
        }
        await context.push_data(data)

        # CSS selector to extract valid href attributes.
        links_selector = (
            'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])'
        )
        base_url = URL(context.request.url)

        extracted_requests = []

        # Extract links.
        for item in parsed_html(links_selector).items():
            href = item.attr('href')
            if not href:
                continue

            # Convert relative URLs to absolute if needed.
            url = str(base_url.join(URL(str(href))))
            try:
                request = Request.from_url(url)
            except ValidationError as exc:
                context.log.warning(f'Skipping invalid URL "{url}": {exc}')
                continue
            extracted_requests.append(request)

        # Add extracted requests to the queue with the same-domain strategy.
        await context.add_requests(extracted_requests, strategy='same-domain')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
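This example uses Scrapling's Selector, mixing XPath for data extraction with a CSS selector for link extraction: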
import asyncio

from pydantic import ValidationError
from scrapling.parser import Selector
from yarl import URL

from crawlee import Request
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    crawler = HttpCrawler(
        max_request_retries=1,
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Parse the HTML content using Scrapling.
        page = Selector(await context.http_response.read(), url=context.request.url)

        # Extract data using XPath selectors, with the .get_all_text() method
        # for full text content.
        title_el = page.xpath_first('//title')
        data = {
            'url': context.request.url,
            'title': title_el.text if isinstance(title_el, Selector) else title_el,
            'h1s': [
                h1.get_all_text() if isinstance(h1, Selector) else h1
                for h1 in page.xpath('//h1')
            ],
            'h2s': [
                h2.get_all_text() if isinstance(h2, Selector) else h2
                for h2 in page.xpath('//h2')
            ],
            'h3s': [
                h3.get_all_text() if isinstance(h3, Selector) else h3
                for h3 in page.xpath('//h3')
            ],
        }
        await context.push_data(data)

        # CSS selector to extract valid href attributes.
        links_selector = (
            'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])'
        )
        base_url = URL(context.request.url)
        extracted_requests = []

        # Extract links.
        for item in page.css(links_selector):
            href = item.attrib.get('href') if isinstance(item, Selector) else None
            if not href:
                continue

            # Convert relative URLs to absolute if needed.
            url = str(base_url.join(URL(href)))
            try:
                request = Request.from_url(url)
            except ValidationError as exc:
                context.log.warning(f'Skipping invalid URL "{url}": {exc}')
                continue
            extracted_requests.append(request)

        # Add extracted requests to the queue with the same-domain strategy.
        await context.add_requests(extracted_requests, strategy='same-domain')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
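This example parses the page with Selectolax's Lexbor backend and uses CSS selectors throughout: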
import asyncio

from pydantic import ValidationError
from selectolax.lexbor import LexborHTMLParser
from yarl import URL

from crawlee import Request
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    crawler = HttpCrawler(
        max_request_retries=1,
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Parse the HTML content using Selectolax with the Lexbor backend.
        parsed_html = LexborHTMLParser(await context.http_response.read())

        # Extract data from the page.
        data = {
            'url': context.request.url,
            'title': parsed_html.css_first('title').text(),
            'h1s': [h1.text() for h1 in parsed_html.css('h1')],
            'h2s': [h2.text() for h2 in parsed_html.css('h2')],
            'h3s': [h3.text() for h3 in parsed_html.css('h3')],
        }
        await context.push_data(data)

        # CSS selector to extract valid href attributes.
        links_selector = (
            'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])'
        )
        base_url = URL(context.request.url)
        extracted_requests = []

        # Extract links.
        for item in parsed_html.css(links_selector):
            href = item.attributes.get('href')
            if not href:
                continue

            # Convert relative URLs to absolute if needed.
            url = str(base_url.join(URL(href)))
            try:
                request = Request.from_url(url)
            except ValidationError as exc:
                context.log.warning(f'Skipping invalid URL "{url}": {exc}')
                continue
            extracted_requests.append(request)

        # Add extracted requests to the queue with the same-domain strategy.
        await context.add_requests(extracted_requests, strategy='same-domain')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
