Commit e5aff86

add docs "HttpCrawler with custom parser"
1 parent c634d4e commit e5aff86

File tree

7 files changed: +411 −0 lines changed

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
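This example parses the response body with lxml and extracts links with an XPath 1.0 selector: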
import asyncio

from lxml import html
from pydantic import ValidationError

from crawlee import Request
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    crawler = HttpCrawler(
        max_request_retries=1,
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Parse the HTML content using lxml.
        parsed_html = html.fromstring(await context.http_response.read())

        # Extract data from the page.
        data = {
            'url': context.request.url,
            'title': parsed_html.findtext('.//title'),
            'h1s': [h1.text_content() for h1 in parsed_html.findall('.//h1')],
            'h2s': [h2.text_content() for h2 in parsed_html.findall('.//h2')],
            'h3s': [h3.text_content() for h3 in parsed_html.findall('.//h3')],
        }
        await context.push_data(data)

        # Convert relative URLs to absolute before extracting links.
        parsed_html.make_links_absolute(context.request.url, resolve_base_href=True)

        # XPath 1.0 selector for extracting valid href attributes.
        links_xpath = (
            '//a/@href[not(starts-with(., "#")) '
            'and not(starts-with(., "javascript:")) '
            'and not(starts-with(., "mailto:"))]'
        )

        extracted_requests = []

        # Extract links.
        for url in parsed_html.xpath(links_xpath):
            try:
                request = Request.from_url(url)
            except ValidationError as exc:
                context.log.warning(f'Skipping invalid URL "{url}": {exc}')
                continue
            extracted_requests.append(request)

        # Add extracted requests to the queue with the same-domain strategy.
        await context.add_requests(extracted_requests, strategy='same-domain')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
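This example combines lxml parsing with saxonche, so data and links can be extracted with XPath 2.0 expressions such as distinct-values():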
import asyncio

from lxml import html
from pydantic import ValidationError
from saxonche import PySaxonProcessor

from crawlee import Request
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    crawler = HttpCrawler(
        max_request_retries=1,
        max_requests_per_crawl=10,
    )

    # Create the Saxon processor once and reuse it across requests.
    saxon_proc = PySaxonProcessor(license=False)
    xpath_proc = saxon_proc.new_xpath_processor()

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Parse HTML with lxml.
        parsed_html = html.fromstring(await context.http_response.read())
        # Convert relative URLs to absolute before extracting links.
        parsed_html.make_links_absolute(context.request.url, resolve_base_href=True)
        # Convert parsed HTML to XML for Saxon processing.
        xml = html.tostring(parsed_html, encoding='unicode', method='xml')
        # Parse XML with Saxon.
        parsed_xml = saxon_proc.parse_xml(xml_text=xml)
        # Set the parsed context for XPath evaluation.
        xpath_proc.set_context(xdm_item=parsed_xml)

        # Extract data using the XPath 2.0 string() function.
        data = {
            'url': context.request.url,
            'title': xpath_proc.evaluate_single('.//title/string()'),
            'h1s': [str(h) for h in (xpath_proc.evaluate('//h1/string()') or [])],
            'h2s': [str(h) for h in (xpath_proc.evaluate('//h2/string()') or [])],
            'h3s': [str(h) for h in (xpath_proc.evaluate('//h3/string()') or [])],
        }
        await context.push_data(data)

        # XPath 2.0 with distinct-values() to get unique links and remove fragments.
        links_xpath = """
            distinct-values(
                for $href in //a/@href[
                    not(starts-with(., "#"))
                    and not(starts-with(., "javascript:"))
                    and not(starts-with(., "mailto:"))
                ]
                return replace($href, "#.*$", "")
            )
        """

        extracted_requests = []

        # Extract links.
        for item in xpath_proc.evaluate(links_xpath) or []:
            url = item.string_value
            try:
                request = Request.from_url(url)
            except ValidationError as exc:
                context.log.warning(f'Skipping invalid URL "{url}": {exc}')
                continue
            extracted_requests.append(request)

        # Add extracted requests to the queue with the same-domain strategy.
        await context.add_requests(extracted_requests, strategy='same-domain')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
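This example parses the page with PyQuery, extracts data with jQuery-style CSS selectors, and resolves relative links with yarl: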
import asyncio

from pydantic import ValidationError
from pyquery import PyQuery
from yarl import URL

from crawlee import Request
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    crawler = HttpCrawler(
        max_request_retries=1,
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Parse the HTML content using PyQuery.
        parsed_html = PyQuery(await context.http_response.read())

        # Extract data using jQuery-style selectors.
        data = {
            'url': context.request.url,
            'title': parsed_html('title').text(),
            'h1s': [h1.text() for h1 in parsed_html('h1').items()],
            'h2s': [h2.text() for h2 in parsed_html('h2').items()],
            'h3s': [h3.text() for h3 in parsed_html('h3').items()],
        }
        await context.push_data(data)

        # CSS selector to extract valid href attributes.
        links_selector = (
            'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])'
        )
        base_url = URL(context.request.url)

        extracted_requests = []

        # Extract links.
        for item in parsed_html(links_selector).items():
            href = item.attr('href')
            if not href:
                continue

            # Convert relative URLs to absolute if needed.
            url = str(base_url.join(URL(str(href))))
            try:
                request = Request.from_url(url)
            except ValidationError as exc:
                context.log.warning(f'Skipping invalid URL "{url}": {exc}')
                continue
            extracted_requests.append(request)

        # Add extracted requests to the queue with the same-domain strategy.
        await context.add_requests(extracted_requests, strategy='same-domain')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
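This example uses Scrapling's Selector, mixing XPath for data extraction with a CSS selector for link extraction: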
import asyncio

from pydantic import ValidationError
from scrapling.parser import Selector
from yarl import URL

from crawlee import Request
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    crawler = HttpCrawler(
        max_request_retries=1,
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Parse the HTML content using Scrapling.
        page = Selector(await context.http_response.read(), url=context.request.url)

        # Extract data using XPath selectors, with the .get_all_text() method
        # for full text content.
        title_el = page.xpath_first('//title')
        data = {
            'url': context.request.url,
            'title': title_el.text if isinstance(title_el, Selector) else title_el,
            'h1s': [
                h1.get_all_text() if isinstance(h1, Selector) else h1
                for h1 in page.xpath('//h1')
            ],
            'h2s': [
                h2.get_all_text() if isinstance(h2, Selector) else h2
                for h2 in page.xpath('//h2')
            ],
            'h3s': [
                h3.get_all_text() if isinstance(h3, Selector) else h3
                for h3 in page.xpath('//h3')
            ],
        }
        await context.push_data(data)

        # CSS selector to extract valid href attributes.
        links_selector = (
            'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])'
        )
        base_url = URL(context.request.url)
        extracted_requests = []

        # Extract links.
        for item in page.css(links_selector):
            href = item.attrib.get('href') if isinstance(item, Selector) else None
            if not href:
                continue

            # Convert relative URLs to absolute if needed.
            url = str(base_url.join(URL(href)))
            try:
                request = Request.from_url(url)
            except ValidationError as exc:
                context.log.warning(f'Skipping invalid URL "{url}": {exc}')
                continue
            extracted_requests.append(request)

        # Add extracted requests to the queue with the same-domain strategy.
        await context.add_requests(extracted_requests, strategy='same-domain')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
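This example parses the page with Selectolax's Lexbor backend and uses CSS selectors throughout: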
import asyncio

from pydantic import ValidationError
from selectolax.lexbor import LexborHTMLParser
from yarl import URL

from crawlee import Request
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    crawler = HttpCrawler(
        max_request_retries=1,
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Parse the HTML content using Selectolax with the Lexbor backend.
        parsed_html = LexborHTMLParser(await context.http_response.read())

        # Extract data from the page.
        data = {
            'url': context.request.url,
            'title': parsed_html.css_first('title').text(),
            'h1s': [h1.text() for h1 in parsed_html.css('h1')],
            'h2s': [h2.text() for h2 in parsed_html.css('h2')],
            'h3s': [h3.text() for h3 in parsed_html.css('h3')],
        }
        await context.push_data(data)

        # CSS selector to extract valid href attributes.
        links_selector = (
            'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])'
        )
        base_url = URL(context.request.url)
        extracted_requests = []

        # Extract links.
        for item in parsed_html.css(links_selector):
            href = item.attributes.get('href')
            if not href:
                continue

            # Convert relative URLs to absolute if needed.
            url = str(base_url.join(URL(href)))
            try:
                request = Request.from_url(url)
            except ValidationError as exc:
                context.log.warning(f'Skipping invalid URL "{url}": {exc}')
                continue
            extracted_requests.append(request)

        # Add extracted requests to the queue with the same-domain strategy.
        await context.add_requests(extracted_requests, strategy='same-domain')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
