Crawlee provides <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> and <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> as built-in solutions for HTML parsing. However, you may want to use a different parsing library that better fits your specific needs.
There are two approaches to integrating a custom parser: using <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink> directly with your preferred library, or building a custom crawler class around your parser.
The <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink> gives you direct access to raw HTTP responses, allowing you to integrate any parsing library of your choice. This approach requires minimal setup, but helpers like <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> and <ApiLink to="class/ExtractLinksFunction">`extract_links`</ApiLink> are not available, so you discover and enqueue links yourself.
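A minimal sketch of this approach, reading the raw body inside a request handler (the target URL is illustrative):

```python
import asyncio

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    crawler = HttpCrawler()

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        # The raw response body is available for any parser you choose.
        body = await context.http_response.read()
        context.log.info(f'Fetched {len(body)} bytes from {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```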
The following sections demonstrate how to use various parsing libraries with <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink> to extract data from a page and enqueue discovered links for further crawling.
### lxml
[lxml](https://lxml.de/) is a high-performance XML and HTML parser that provides Python bindings to the C libraries libxml2 and libxslt. It supports XPath 1.0, XSLT 1.0, and EXSLT extensions for element selection. The `make_links_absolute` method is particularly useful for converting relative URLs to absolute ones before link extraction.
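A sketch of a request handler that parses the body with lxml; the XPath expressions and the link filter are illustrative only:

```python
from lxml import html as lxml_html

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext

crawler = HttpCrawler()


@crawler.router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
    # Parse the raw body into an lxml element tree.
    tree = lxml_html.fromstring(await context.http_response.read())
    # Resolve relative URLs against the request URL before extracting links.
    tree.make_links_absolute(context.request.url)

    # Select data with XPath 1.0 and store it in the dataset.
    await context.push_data({
        'url': context.request.url,
        'title': tree.xpath('string(//title)'),
    })

    # `enqueue_links` is unavailable here, so enqueue extracted URLs manually.
    # The same-domain filter is just an example policy.
    links = [url for url in tree.xpath('//a/@href') if url.startswith('https://crawlee.dev')]
    await context.add_requests(links)
```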
### lxml with SaxonC-HE

Using [SaxonC-HE](https://pypi.org/project/saxonche/) together with lxml enables XPath 3.1 support, which provides advanced features like the `distinct-values()` function and more powerful string manipulation. In this setup, lxml converts HTML to well-formed XML that SaxonC-HE can process.
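A standalone helper sketching this handoff; the XPath 3.1 query is illustrative, and the `saxonche` calls follow its documented `PySaxonProcessor` API (verify against the version you install):

```python
from lxml import etree
from lxml import html as lxml_html
from saxonche import PySaxonProcessor


def extract_unique_links(body: bytes, base_url: str) -> list[str]:
    # lxml tolerantly parses the HTML and serializes it as well-formed XML.
    tree = lxml_html.fromstring(body)
    tree.make_links_absolute(base_url)
    xml = etree.tostring(tree, encoding='unicode', method='xml')

    with PySaxonProcessor(license=False) as processor:
        xpath_processor = processor.new_xpath_processor()
        xpath_processor.set_context(xdm_item=processor.parse_xml(xml_text=xml))
        # XPath 3.1 `distinct-values()` deduplicates the links in one expression.
        result = xpath_processor.evaluate('distinct-values(//a/@href)')
        if result is None:
            return []
        return [result.item_at(i).get_string_value() for i in range(result.size)]
```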
### selectolax

[selectolax](https://github.com/rushter/selectolax) is a fast HTML parser that offers two backends: the default `Modest` engine and `Lexbor`. It provides a simple API with CSS selector support. The example below uses the `Lexbor` backend for optimal performance.
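A sketch using the `Lexbor` backend; the selectors are illustrative:

```python
from urllib.parse import urljoin

from selectolax.lexbor import LexborHTMLParser

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext

crawler = HttpCrawler()


@crawler.router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
    # Parse with the Lexbor engine for best performance.
    parser = LexborHTMLParser(await context.http_response.read())

    title = parser.css_first('title')
    await context.push_data({
        'url': context.request.url,
        'title': title.text() if title else None,
    })

    # selectolax does not resolve relative URLs, so join them manually.
    links = [
        urljoin(context.request.url, anchor.attributes['href'])
        for anchor in parser.css('a[href]')
        if anchor.attributes.get('href')
    ]
    await context.add_requests(links)
```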
### PyQuery

[PyQuery](https://pyquery.readthedocs.io/) brings jQuery-like syntax to Python for HTML manipulation. Built on top of `lxml`, it combines familiar jQuery CSS selectors with Python's ease of use. This is a good choice if you're comfortable with jQuery syntax and want a straightforward API for DOM traversal and manipulation.
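A sketch of the same handler pattern with PyQuery; selectors are illustrative:

```python
from pyquery import PyQuery

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext

crawler = HttpCrawler()


@crawler.router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
    doc = PyQuery(await context.http_response.read())
    # PyQuery inherits lxml's ability to rewrite relative URLs in place.
    doc.make_links_absolute(base_url=context.request.url)

    await context.push_data({
        'url': context.request.url,
        'title': doc('title').text(),
    })

    # jQuery-style traversal: iterate matched anchors as PyQuery objects.
    await context.add_requests([a.attr('href') for a in doc('a[href]').items()])
```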
### Scrapling

[Scrapling](https://github.com/D4Vinci/Scrapling) is a scraping library that provides both CSS selectors and XPath 1.0. It offers automatic text extraction and a Scrapy/BeautifulSoup-like API with pseudo-element support similar to Parsel.
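A sketch assuming Scrapling's `Adaptor` parser class and its Parsel-like pseudo-element syntax; the class name and exact API may differ between Scrapling releases, so treat this as an outline rather than a verified integration:

```python
from scrapling import Adaptor

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext

crawler = HttpCrawler()


@crawler.router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
    body = await context.http_response.read()
    page = Adaptor(body.decode(), url=context.request.url)

    await context.push_data({
        'url': context.request.url,
        # Parsel-style pseudo-element for automatic text extraction.
        'title': page.css_first('title::text'),
    })

    # Keep only absolute links, since relative URLs are not resolved here.
    links = [link for link in page.css('a::attr(href)') if link.startswith('http')]
    await context.add_requests(links)
```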
The crawler class connects the parser and context. Extend <ApiLink to="class/AbstractHttpCrawler">`AbstractHttpCrawler`</ApiLink> to create a crawler class that ties your custom parser and context together.
### Using the crawler
The custom crawler works like any built-in crawler. Request handlers receive your custom context with full access to framework helpers like <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink>. Additionally, the custom parser can be used with <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink> for adaptive crawling:
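A usage sketch; `SelectolaxCrawler` and `SelectolaxCrawlingContext` are hypothetical names standing in for whatever crawler and context classes you built in the previous steps:

```python
import asyncio

# Hypothetical classes created by extending AbstractHttpCrawler and
# its crawling context in the previous steps.
from my_selectolax_crawler import SelectolaxCrawler, SelectolaxCrawlingContext


async def main() -> None:
    crawler = SelectolaxCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: SelectolaxCrawlingContext) -> None:
        # The custom context exposes the parsed document...
        title = context.parsed_content.css_first('title')
        await context.push_data({
            'url': context.request.url,
            'title': title.text() if title else None,
        })
        # ...and framework helpers such as `enqueue_links`.
        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```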