
Commit 0f23205

Pijukatel and janbuchar authored
feat: Add pre-navigation hooks router to AbstractHttpCrawler (#791)
### Description

This makes it possible for users to register their pre-navigation hooks for HTTP-based crawlers. Add tests. Update docs.

### Issues

- Closes: #635

Co-authored-by: Jan Buchar <[email protected]>
1 parent 2670635 commit 0f23205
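
A minimal usage sketch of the new API, assembled from the imports and calls visible in the diffs below. It assumes `HttpCrawler` is importable from `crawlee.http_crawler` alongside `HttpCrawlingContext`, and that `BasicCrawlingContext` exposes `request` and `log`; the hook body and URL are illustrative only:

import asyncio

from crawlee._types import BasicCrawlingContext
from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    crawler = HttpCrawler()

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

    # The new decorator registers a coroutine that runs before each request is sent.
    @crawler.pre_navigation_hook
    async def log_navigation(context: BasicCrawlingContext) -> None:
        # `context.log` is assumed here; any side effect (headers, sessions, logging) fits.
        context.log.info(f'About to fetch {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())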

6 files changed (+70 −4 lines)


docs/examples/beautifulsoup_crawler.mdx (1 addition, 1 deletion)

@@ -8,7 +8,7 @@ import CodeBlock from '@theme/CodeBlock';
 
 import BeautifulSoupExample from '!!raw-loader!./code/beautifulsoup_crawler.py';
 
-This example demonstrates how to use <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> to crawl a list of URLs, load each URL using a plain HTTP request, parse the HTML using the [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) library and extract some data from it - the page title and all `<h1>`, `<h2>` and `<h3>` tags. This setup is perfect for scraping specific elements from web pages. Thanks to the well-known BeautifulSoup, you can easily navigate the HTML structure and retrieve the data you need with minimal code.
+This example demonstrates how to use <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> to crawl a list of URLs, load each URL using a plain HTTP request, parse the HTML using the [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) library and extract some data from it - the page title and all `<h1>`, `<h2>` and `<h3>` tags. This setup is perfect for scraping specific elements from web pages. Thanks to the well-known BeautifulSoup, you can easily navigate the HTML structure and retrieve the data you need with minimal code. It also shows how you can add an optional pre-navigation hook to the crawler. Pre-navigation hooks are user-defined functions that execute before sending the request.
 
 <CodeBlock className="language-python">
   {BeautifulSoupExample}

docs/examples/code/beautifulsoup_crawler.py (7 additions, 0 deletions)

@@ -1,6 +1,7 @@
 import asyncio
 from datetime import timedelta
 
+from crawlee.basic_crawler import BasicCrawlingContext
 from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
 
 
@@ -39,6 +40,12 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
         # the data will be stored as JSON files in ./storage/datasets/default.
         await context.push_data(data)
 
+    # Register a pre-navigation hook, which will be called before each request.
+    # This hook is optional and does not need to be defined at all.
+    @crawler.pre_navigation_hook
+    async def some_hook(context: BasicCrawlingContext) -> None:
+        pass
+
     # Run the crawler with the initial list of URLs.
     await crawler.run(['https://crawlee.dev'])

docs/examples/code/parsel_crawler.py (7 additions, 0 deletions)

@@ -1,5 +1,6 @@
 import asyncio
 
+from crawlee.basic_crawler import BasicCrawlingContext
 from crawlee.parsel_crawler import ParselCrawler, ParselCrawlingContext
 
 # Regex for identifying email addresses on a webpage.
@@ -30,6 +31,12 @@ async def request_handler(context: ParselCrawlingContext) -> None:
         # Enqueue all links found on the page.
         await context.enqueue_links()
 
+    # Register a pre-navigation hook, which will be called before each request.
+    # This hook is optional and does not need to be defined at all.
+    @crawler.pre_navigation_hook
+    async def some_hook(context: BasicCrawlingContext) -> None:
+        pass
+
     # Run the crawler with the initial list of URLs.
     await crawler.run(['https://github.com'])

docs/examples/parsel_crawler.mdx (1 addition, 1 deletion)

@@ -8,7 +8,7 @@ import CodeBlock from '@theme/CodeBlock';
 
 import ParselCrawlerExample from '!!raw-loader!./code/parsel_crawler.py';
 
-This example shows how to use <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> to crawl a website or a list of URLs. Each URL is loaded using a plain HTTP request and the response is parsed using [Parsel](https://pypi.org/project/parsel/) library which supports CSS and XPath selectors for HTML responses and JMESPath for JSON responses. We can extract data from all kinds of complex HTML structures using XPath. In this example, we will use Parsel to crawl github.com and extract page title, URL and emails found in the webpage. The default handler will scrape data from the current webpage and enqueue all the links found in the webpage for continuous scraping.
+This example shows how to use <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> to crawl a website or a list of URLs. Each URL is loaded using a plain HTTP request and the response is parsed using [Parsel](https://pypi.org/project/parsel/) library which supports CSS and XPath selectors for HTML responses and JMESPath for JSON responses. We can extract data from all kinds of complex HTML structures using XPath. In this example, we will use Parsel to crawl github.com and extract page title, URL and emails found in the webpage. The default handler will scrape data from the current webpage and enqueue all the links found in the webpage for continuous scraping. It also shows how you can add an optional pre-navigation hook to the crawler. Pre-navigation hooks are user-defined functions that execute before sending the request.
 
 <CodeBlock className="language-python">
   {ParselCrawlerExample}

src/crawlee/abstract_http_crawler/_abstract_http_crawler.py (19 additions, 2 deletions)

@@ -2,7 +2,7 @@
 
 import logging
 from abc import ABC
-from typing import TYPE_CHECKING, Any, Generic
+from typing import TYPE_CHECKING, Any, Callable, Generic
 
 from pydantic import ValidationError
 from typing_extensions import NotRequired, TypeVar
@@ -21,7 +21,7 @@
 from crawlee.http_clients import HttpxHttpClient
 
 if TYPE_CHECKING:
-    from collections.abc import AsyncGenerator, Iterable
+    from collections.abc import AsyncGenerator, Awaitable, Iterable
 
     from typing_extensions import Unpack
 
@@ -70,6 +70,7 @@ def __init__(
         **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext]],
     ) -> None:
         self._parser = parser
+        self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
 
         kwargs.setdefault(
             'http_client',
@@ -92,11 +93,19 @@ def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpCrawlingContext[TParseResult]]:
         """Create static content crawler context pipeline with expected pipeline steps."""
         return (
             ContextPipeline()
+            .compose(self._execute_pre_navigation_hooks)
             .compose(self._make_http_request)
             .compose(self._parse_http_response)
             .compose(self._handle_blocked_request)
         )
 
+    async def _execute_pre_navigation_hooks(
+        self, context: BasicCrawlingContext
+    ) -> AsyncGenerator[BasicCrawlingContext, None]:
+        for hook in self._pre_navigation_hooks:
+            await hook(context)
+        yield context
+
     async def _parse_http_response(
         self, context: HttpCrawlingContext
     ) -> AsyncGenerator[ParsedHttpCrawlingContext[TParseResult], None]:
@@ -207,3 +216,11 @@ async def _handle_blocked_request(
         if blocked_info := self._parser.is_blocked(context.parsed_content):
             raise SessionError(blocked_info.reason)
         yield context
+
+    def pre_navigation_hook(self, hook: Callable[[BasicCrawlingContext], Awaitable[None]]) -> None:
+        """Register a hook to be called before each navigation.
+
+        Args:
+            hook: A coroutine function to be called before each navigation.
+        """
+        self._pre_navigation_hooks.append(hook)
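
The implementation above slots the hooks in as just another `ContextPipeline` step: each step is an async generator that does its work and then yields the context to the next step. A stripped-down, self-contained illustration of that pattern (toy code with a plain dict standing in for the crawling context, not the actual `ContextPipeline` implementation):

import asyncio
from collections.abc import AsyncGenerator, Awaitable, Callable

# A hook takes the (toy) context and performs side effects before navigation.
Hook = Callable[[dict], Awaitable[None]]


async def execute_pre_navigation_hooks(
    context: dict, hooks: list[Hook]
) -> AsyncGenerator[dict, None]:
    # Run every registered hook in registration order...
    for hook in hooks:
        await hook(context)
    # ...then hand the context on to the next pipeline step (the HTTP request).
    yield context


async def main() -> None:
    async def set_header(context: dict) -> None:
        context.setdefault('headers', {})['User-Agent'] = 'my-crawler'

    async for context in execute_pre_navigation_hooks({'url': 'https://example.com'}, [set_header]):
        print('next step receives:', context)


asyncio.run(main())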

tests/unit/http_crawler/test_http_crawler.py (35 additions, 0 deletions)

@@ -21,6 +21,7 @@
 
 from yarl import URL
 
+from crawlee._types import BasicCrawlingContext
 from crawlee.http_clients._base import BaseHttpClient
 from crawlee.http_crawler import HttpCrawlingContext
 
@@ -354,3 +355,37 @@ async def request_handler(context: HttpCrawlingContext) -> None:
 
     response_args = responses[0]['args']
     assert response_args == query_params, 'Reconstructed query params must match the original query params.'
+
+
+@respx.mock
+async def test_http_crawler_pre_navigation_hooks_executed_before_request() -> None:
+    """Test that pre-navigation hooks are executed in the correct order."""
+    execution_order = []
+    test_url = 'http://www.something.com'
+
+    crawler = HttpCrawler()
+
+    # Register the final context handler.
+    @crawler.router.default_handler
+    async def default_request_handler(context: HttpCrawlingContext) -> None:  # noqa: ARG001  # Unused arg in test
+        execution_order.append('final handler')
+
+    # Register the first pre-navigation hook.
+    @crawler.pre_navigation_hook
+    async def hook1(context: BasicCrawlingContext) -> None:  # noqa: ARG001  # Unused arg in test
+        execution_order.append('pre-navigation-hook 1')
+
+    # Register the second pre-navigation hook.
+    @crawler.pre_navigation_hook
+    async def hook2(context: BasicCrawlingContext) -> None:  # noqa: ARG001  # Unused arg in test
+        execution_order.append('pre-navigation-hook 2')
+
+    def mark_request_execution(request: Request) -> Response:  # noqa: ARG001  # Unused arg in test
+        # Helper function to track the execution order.
+        execution_order.append('request')
+        return Response(200)
+
+    respx.get(test_url).mock(side_effect=mark_request_execution)
+    await crawler.run([Request.from_url(url=test_url)])
+
+    assert execution_order == ['pre-navigation-hook 1', 'pre-navigation-hook 2', 'request', 'final handler']
