Mantisus
diff --git a/‎docs/examples/code/adaptive_playwright_crawler.py‎
Lines changed: 19 additions & 15 deletions b/‎docs/examples/code/adaptive_playwright_crawler.py‎
Lines changed: 19 additions & 15 deletions
diff --git a/‎docs/examples/playwright_crawler_adaptive.mdx‎
Lines changed: 20 additions & 0 deletions b/‎docs/examples/playwright_crawler_adaptive.mdx‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_handler.py‎
Lines changed: 13 additions & 0 deletions b/‎docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_handler.py‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_beautifulsoup.py‎
Lines changed: 10 additions & 0 deletions b/‎docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_beautifulsoup.py‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_parsel.py‎
Lines changed: 10 additions & 0 deletions b/‎docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_parsel.py‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_prediction.py‎
Lines changed: 61 additions & 0 deletions b/‎docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_prediction.py‎
Lines changed: 61 additions & 0 deletions
diff --git a/‎docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_pre_nav_hooks.py‎
Lines changed: 29 additions & 0 deletions b/‎docs/guides/code/adaptive_playwright_crawler/adaptive_playwright_crawler_pre_nav_hooks.py‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎docs/guides/playwright_crawler_adaptive.mdx‎
Lines changed: 96 additions & 0 deletions b/‎docs/guides/playwright_crawler_adaptive.mdx‎
Lines changed: 96 additions & 0 deletions
@@ -1,4 +1,5 @@
 import asyncio
+from datetime import timedelta
 
 from playwright.async_api import Route
 
@@ -10,40 +11,43 @@
 
 
 async def main() -> None:
+    # Crawler created by following factory method will use `beautifulsoup`
+    # for parsing static content.
     crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
         max_requests_per_crawl=5, playwright_crawler_specific_kwargs={'headless': False}
     )
 
-    @crawler.router.handler(label='label')
+    @crawler.router.default_handler
     async def request_handler_for_label(
         context: AdaptivePlaywrightCrawlingContext,
     ) -> None:
-        # Do some processing using `page`
-        some_locator = context.page.locator('div').first
-        await some_locator.wait_for()
-        # Do stuff with locator...
-        context.log.info(f'Playwright processing of: {context.request.url} ...')
-
-    @crawler.router.default_handler
-    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
-        context.log.info(f'User handler processing: {context.request.url} ...')
         # Do some processing using `parsed_content`
         context.log.info(context.parsed_content.title)
 
+        # Locate element h2 within 5 seconds
+        h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000))
+        # Do stuff with element found by the selector
+        context.log.info(h2)
+
         # Find more links and enqueue them.
         await context.enqueue_links()
-        await context.push_data({'Top crawler Url': context.request.url})
+        # Save some data.
+        await context.push_data({'Visited url': context.request.url})
 
     @crawler.pre_navigation_hook
     async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
-        """Hook executed both in static sub crawler and playwright sub crawler."""
-        # Trying to access context.page in this hook would raise `AdaptiveContextError`
-        # for pages crawled without playwright.
+        """Hook executed both in static sub crawler and playwright sub crawler.
+
+        Trying to access `context.page` in this hook would raise `AdaptiveContextError`
+        for pages crawled without playwright."""
         context.log.info(f'pre navigation hook for: {context.request.url} ...')
 
     @crawler.pre_navigation_hook(playwright_only=True)
     async def hook_playwright(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
-        """Hook executed only in playwright sub crawler."""
+        """Hook executed only in playwright sub crawler.
+
+        It is safe to access `page` object.
+        """
 
         async def some_routing_function(route: Route) -> None:
             await route.continue_()
 
@@ -0,0 +1,20 @@
+---
+id: adaptive-playwright-crawler
+title: AdaptivePlaywrightCrawler
+---
+
+import ApiLink from '@site/src/components/ApiLink';
+import CodeBlock from '@theme/CodeBlock';
+
+import AdaptivePlaywrightCrawlerExample from '!!raw-loader!./code/adaptive_playwright_crawler.py';
+
+This example demonstrates how to use <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink>. An <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink> is a combination of <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and some implementation of HTTP-based crawler such as <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> or <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>.
+It uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects that it may bring a performance benefit.
+
+A [pre-navigation hook](/python/docs/guides/adaptive-playwright-crawler#page-configuration-with-pre-navigation-hooks) can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling the environment and preparing for navigation. Hooks will be executed both for the pages crawled by the HTTP-based sub crawler and by the playwright-based sub crawler. Use `playwright_only=True` to mark hooks that should be executed only for the playwright sub crawler.
+
+For a more detailed description, please see the [AdaptivePlaywrightCrawler guide](/python/docs/guides/adaptive-playwright-crawler 'AdaptivePlaywrightCrawler guide').
+
+<CodeBlock className="language-python">
+    {AdaptivePlaywrightCrawlerExample}
+</CodeBlock>
@@ -0,0 +1,13 @@
+from datetime import timedelta
+
+from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext
+
+crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser()
+
+
+@crawler.router.default_handler
+async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
+    # Query one element matching the `h2` selector, with a 5-second timeout.
+    h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000))
+    # Placeholder for real processing: the returned element is only logged.
+    context.log.info(h2)
@@ -0,0 +1,10 @@
+from crawlee.crawlers import AdaptivePlaywrightCrawler
+
+crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
+    # Arguments that apply only to the playwright sub crawler (PlaywrightCrawler)
+    playwright_crawler_specific_kwargs={'headless': False, 'browser_type': 'chromium'},
+    # Arguments that apply only to the static sub crawler (BeautifulSoupCrawler)
+    static_crawler_specific_kwargs={'additional_http_error_status_codes': [204]},
+    # Common arguments that apply to both sub crawlers
+    max_crawl_depth=5,
+)
@@ -0,0 +1,10 @@
+from crawlee.crawlers import AdaptivePlaywrightCrawler
+
+crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
+    # Arguments that apply only to the playwright sub crawler (PlaywrightCrawler)
+    playwright_crawler_specific_kwargs={'headless': False, 'browser_type': 'chromium'},
+    # Arguments that apply only to the static sub crawler (ParselCrawler)
+    static_crawler_specific_kwargs={'additional_http_error_status_codes': [204]},
+    # Common arguments that apply to both sub crawlers
+    max_crawl_depth=5,
+)
@@ -0,0 +1,61 @@
+from crawlee import Request
+from crawlee._types import RequestHandlerRunResult
+from crawlee.crawlers import (
+    AdaptivePlaywrightCrawler,
+    RenderingType,
+    RenderingTypePrediction,
+    RenderingTypePredictor,
+)
+
+
+class CustomRenderingTypePredictor(RenderingTypePredictor):
+    def __init__(self) -> None:
+        self._learning_data = list[tuple[Request, RenderingType]]()
+
+    def predict(self, request: Request) -> RenderingTypePrediction:
+        # Example-only logic: URLs containing 'abc' are predicted as `static`,
+        # every other URL is predicted as `client only`.
+        rendering_type: RenderingType = (
+            'static' if 'abc' in request.url else 'client only'
+        )
+
+        return RenderingTypePrediction(
+            # Rendering type chosen above; `static` -> HTTP-based sub crawler is used.
+            rendering_type=rendering_type,
+            # Recommends that both sub crawlers should run with 20% chance. When both sub
+            # crawlers are running, the predictor can compare results and learn.
+            # High number means that predictor is not very confident about the
+            # `rendering_type`, low number means that predictor is very confident.
+            detection_probability_recommendation=0.2,
+        )
+
+    def store_result(self, request: Request, rendering_type: RenderingType) -> None:
+        # Stores a new learning sample: `request` is the input for prediction and
+        # `rendering_type` is the correct prediction for it. A real predictor could
+        # retrain itself on the collected data if needed.
+        self._learning_data.append((request, rendering_type))
+        # Placeholder: retraining logic would go here.
+
+
+def result_checker(result: RequestHandlerRunResult) -> bool:
+    # Inspects the produced `result` and returns `True` if it is considered correct.
+    # This placeholder only evaluates the truthiness of `result`.
+    return bool(result)  # Replace with a real check on `result`
+
+
+def result_comparator(
+    result_1: RequestHandlerRunResult, result_2: RequestHandlerRunResult
+) -> bool:
+    # Inspects two results and returns `True` if they are considered equivalent.
+    # It is used when comparing results produced by the HTTP-based sub crawler
+    # and the playwright-based sub crawler.
+    return (
+        result_1.push_data_calls == result_2.push_data_calls
+    )  # Here: equivalent when both results hold equal `push_data` calls.
+
+
+crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
+    rendering_type_predictor=CustomRenderingTypePredictor(),  # custom predictor
+    result_checker=result_checker,  # checks a single result
+    result_comparator=result_comparator,  # compares two results for equivalence
+)
@@ -0,0 +1,29 @@
+from playwright.async_api import Route
+
+from crawlee.crawlers import (
+    AdaptivePlaywrightCrawler,
+    AdaptivePlaywrightPreNavCrawlingContext,
+)
+
+crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser()
+
+
+@crawler.pre_navigation_hook
+async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
+    """Hook executed both in static sub crawler and playwright sub crawler.
+
+    Trying to access `context.page` in this hook would raise `AdaptiveContextError`
+    for pages crawled without playwright.
+    """
+    context.log.info(f'pre navigation hook for: {context.request.url}')
+
+
+@crawler.pre_navigation_hook(playwright_only=True)
+async def hook_playwright(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
+    """Hook executed only in playwright sub crawler, so `context.page` can be used."""
+
+    async def some_routing_function(route: Route) -> None:
+        await route.continue_()
+
+    await context.page.route('*/**', some_routing_function)
+    context.log.info(f'Playwright only pre navigation hook for: {context.request.url}')
@@ -0,0 +1,96 @@
+---
+id: adaptive-playwright-crawler
+title: AdaptivePlaywrightCrawler
+description: How to use the AdaptivePlaywrightCrawler.
+---
+
+import ApiLink from '@site/src/components/ApiLink';
+import CodeBlock from '@theme/CodeBlock';
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+import AdaptivePlaywrightCrawlerInitBeautifulSoup from '!!raw-loader!./code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_beautifulsoup.py';
+import AdaptivePlaywrightCrawlerInitParsel from '!!raw-loader!./code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_parsel.py';
+import AdaptivePlaywrightCrawlerInitPrediction from '!!raw-loader!./code/adaptive_playwright_crawler/adaptive_playwright_crawler_init_prediction.py';
+import AdaptivePlaywrightCrawlerHandler from '!!raw-loader!./code/adaptive_playwright_crawler/adaptive_playwright_crawler_handler.py';
+import AdaptivePlaywrightCrawlerPreNavHooks from '!!raw-loader!./code/adaptive_playwright_crawler/adaptive_playwright_crawler_pre_nav_hooks.py';
+
+
+
+An <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink> is a combination of <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and some implementation of HTTP-based crawler such as <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> or <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>.
+It uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects that it may bring a performance benefit.
+
+Detection is done based on the <ApiLink to="class/RenderingTypePredictor">`RenderingTypePredictor`</ApiLink> with default implementation <ApiLink to="class/DefaultRenderingTypePredictor">`DefaultRenderingTypePredictor`</ApiLink>. It predicts which crawling method should be used and learns from already crawled pages.
+
+## When to use AdaptivePlaywrightCrawler
+
+Use <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink> in scenarios where some target pages have to be crawled with <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>, but for others a faster HTTP-based crawler is sufficient. This way, you can achieve lower costs when crawling multiple different websites.
+
+Another use case is performing selector-based data extraction without prior knowledge of whether the selector exists in the static page or is dynamically added by a code executed in a browsing client.
+
+## Request handler and adaptive context helpers
+
+The request handler for <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink> works on a special context type - <ApiLink to="class/AdaptivePlaywrightCrawlingContext">`AdaptivePlaywrightCrawlingContext`</ApiLink>. This context is sometimes created by the HTTP-based sub crawler and sometimes by the playwright-based sub crawler. Due to its dynamic nature, you can't always access the [page](https://playwright.dev/python/docs/api/class-page) object. To overcome this limitation, there are four helper methods on this context that can be called regardless of how the context was created.
+
+<ApiLink to="class/AdaptivePlaywrightCrawlingContext#wait_for_selector">`wait_for_selector`</ApiLink> accepts a `css` selector as the first argument and a timeout as the second argument. The function will try to locate this selector and return once it is found (within the timeout). In practice this means that if the HTTP-based sub crawler was used, the function will find the selector only if it is part of the static content. If not, the adaptive crawler will fall back to the playwright sub crawler and will try to locate the selector within the timeout using playwright.
+
+<ApiLink to="class/AdaptivePlaywrightCrawlingContext#query_selector_one">`query_selector_one`</ApiLink> accepts `css` selector as first argument and timeout as second argument. This function acts similar to `wait_for_selector`, but it also returns one selector if any selector is found. Return value type is determined by used HTTP-based sub crawler. For example, it will be `Selector` for <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> and `Tag` for <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>.
+
+<ApiLink to="class/AdaptivePlaywrightCrawlingContext#query_selector_all">`query_selector_all`</ApiLink> is the same as <ApiLink to="class/AdaptivePlaywrightCrawlingContext#query_selector_one">`query_selector_one`</ApiLink>, but returns all found selectors.
+
+<ApiLink to="class/AdaptivePlaywrightCrawlingContext#parse_with_static_parser">`parse_with_static_parser`</ApiLink> will re-parse the whole page. Return value type is determined by used HTTP-based sub crawler. It has optional arguments: `selector` and `timeout`. If those optional arguments are used, then the function first calls <ApiLink to="class/AdaptivePlaywrightCrawlingContext#wait_for_selector">`wait_for_selector`</ApiLink> and then does the parsing. This can be used in a scenario where some specific element can signal that the page is already complete.
+
+See the following example about how to create request handler and use context helpers:
+
+<CodeBlock className="language-python">
+    {AdaptivePlaywrightCrawlerHandler}
+</CodeBlock>
+
+
+## Crawler configuration
+To use <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink> it is recommended to use one of the prepared factory methods that will create the crawler with specific HTTP-based sub crawler variant: <ApiLink to="class/AdaptivePlaywrightCrawler#with_beautifulsoup_static_parser">`AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser`</ApiLink> or <ApiLink to="class/AdaptivePlaywrightCrawler#with_parsel_static_parser">`AdaptivePlaywrightCrawler.with_parsel_static_parser`</ApiLink>.
+
+<ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink> is internally composed of two sub crawlers and you can do a detailed configuration of both of them. For detailed configuration options of the sub crawlers, please refer to their pages: <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>, <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink>, <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>.
+
+In the following example you can see how to create and configure <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink> with two different HTTP-based sub crawlers:
+
+
+<Tabs>
+  <TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler" default>
+<CodeBlock className="language-python">
+    {AdaptivePlaywrightCrawlerInitBeautifulSoup}
+</CodeBlock>
+  </TabItem>
+  <TabItem value="ParselCrawler" label="ParselCrawler">
+<CodeBlock className="language-python">
+    {AdaptivePlaywrightCrawlerInitParsel}
+</CodeBlock>
+  </TabItem>
+</Tabs>
+
+### Prediction related arguments
+
+To control which pages are crawled by which method you can use following arguments:
+
+`rendering_type_predictor` - An instance of <ApiLink to="class/RenderingTypePredictor">`RenderingTypePredictor`</ApiLink>, a class that can give recommendations about which sub crawler should be used for a specific URL. The predictor will also recommend using both sub crawlers for some pages from time to time, to check that the given recommendation was correct. The predictor should be able to learn from previous results and gradually give more reliable recommendations.
+
+`result_checker` - Is a function that checks result created from crawling a page. By default, it always returns `True`.
+
+`result_comparator` - Is a function that compares two results (HTTP-based sub crawler result and playwright based sub crawler result) and returns `True` if they are considered the same. By default, this function compares calls of context helper `push_data` by each sub crawler. This function is used by `rendering_type_predictor` to evaluate whether HTTP-based crawler has the same results as playwright based sub crawler.
+
+See the following example about how to pass prediction related arguments:
+
+<CodeBlock className="language-python">
+    {AdaptivePlaywrightCrawlerInitPrediction}
+</CodeBlock>
+
+
+
+## Page configuration with pre-navigation hooks
+In some use cases, you may need to configure the [page](https://playwright.dev/python/docs/api/class-page) before it navigates to the target URL. For instance, you might set navigation timeouts or manipulate other page-level settings. For such cases you can use the <ApiLink to="class/AdaptivePlaywrightCrawler#pre_navigation_hook">`pre_navigation_hook`</ApiLink> method of the <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink>. This method is called before the page navigates to the target URL and allows you to configure the page instance. Due to the dynamic nature of <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink> it is possible that the hook will be executed for the HTTP-based sub crawler or the playwright-based sub crawler. Using the [page](https://playwright.dev/python/docs/api/class-page) object in a hook that is executed on the HTTP-based sub crawler will raise an exception. To overcome this, you can use the optional argument `playwright_only=True` when registering the hook.
+
+See the following example about how to register the pre navigation hooks:
+
+<CodeBlock className="language-python">
+    {AdaptivePlaywrightCrawlerPreNavHooks}
+</CodeBlock>