
Commit 03de1f3

Merge branch 'master' into memory-storage-refactor
2 parents 71174a6 + bd4b940

14 files changed (+3378 -3180 lines)


CHANGELOG.md

Lines changed: 3 additions & 3 deletions
@@ -2,20 +2,20 @@
 
 All notable changes to this project will be documented in this file.
 
-<!-- git-cliff-unreleased-start -->
-## 0.6.9 - **not yet released**
+## [0.6.9](https://github.com/apify/crawlee-python/releases/tag/v0.6.9) (2025-05-02)
 
 ### 🚀 Features
 
 - Add an internal `HttpClient` to be used in `send_request` for `PlaywrightCrawler` using `APIRequestContext` bound to the browser context ([#1134](https://github.com/apify/crawlee-python/pull/1134)) ([e794f49](https://github.com/apify/crawlee-python/commit/e794f4985d3a018ee76d634fe2b2c735fb450272)) by [@Mantisus](https://github.com/Mantisus), closes [#928](https://github.com/apify/crawlee-python/issues/928)
 - Make timeout error log cleaner ([#1170](https://github.com/apify/crawlee-python/pull/1170)) ([78ea9d2](https://github.com/apify/crawlee-python/commit/78ea9d23e0b2d73286043b68393e462f636625c9)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1158](https://github.com/apify/crawlee-python/issues/1158)
+- Add `on_skipped_request` decorator, to process links skipped according to `robots.txt` rules ([#1166](https://github.com/apify/crawlee-python/pull/1166)) ([bd16f14](https://github.com/apify/crawlee-python/commit/bd16f14a834eebf485aea6b6a83f2b18bf16b504)) by [@Mantisus](https://github.com/Mantisus), closes [#1160](https://github.com/apify/crawlee-python/issues/1160)
 
 ### 🐛 Bug Fixes
 
 - Fix handle error without `args` in `_get_error_message` for `ErrorTracker` ([#1181](https://github.com/apify/crawlee-python/pull/1181)) ([21944d9](https://github.com/apify/crawlee-python/commit/21944d908b8404d2ad6c182104e7a8c27be12a6e)) by [@Mantisus](https://github.com/Mantisus), closes [#1179](https://github.com/apify/crawlee-python/issues/1179)
+- Temporarily add `certifi<=2025.1.31` dependency ([#1183](https://github.com/apify/crawlee-python/pull/1183)) ([25ff961](https://github.com/apify/crawlee-python/commit/25ff961990f9abc9d0673ba6573dfcf46dd6e53f)) by [@Pijukatel](https://github.com/Pijukatel)
 
 
-<!-- git-cliff-unreleased-end -->
 ## [0.6.8](https://github.com/apify/crawlee-python/releases/tag/v0.6.8) (2025-04-25)
 
 ### 🚀 Features
docs/examples/code_examples/respect_robots_on_skipped_request.py (new file)

Lines changed: 36 additions & 0 deletions

@@ -0,0 +1,36 @@
+import asyncio
+
+from crawlee import SkippedReason
+from crawlee.crawlers import (
+    BeautifulSoupCrawler,
+    BeautifulSoupCrawlingContext,
+)
+
+
+async def main() -> None:
+    # Initialize the crawler with robots.txt compliance enabled
+    crawler = BeautifulSoupCrawler(respect_robots_txt_file=True)
+
+    @crawler.router.default_handler
+    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url} ...')
+
+    # highlight-start
+    # This handler is called when a request is skipped
+    @crawler.on_skipped_request
+    async def skipped_request_handler(url: str, reason: SkippedReason) -> None:
+        # Check if the request was skipped due to robots.txt rules
+        if reason == 'robots_txt':
+            crawler.log.info(f'Skipped {url} due to robots.txt rules.')
+
+    # highlight-end
+
+    # Start the crawler with the specified URLs
+    # The login URL will be skipped and handled by the skipped_request_handler
+    await crawler.run(
+        ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login']
+    )
+
+
+if __name__ == '__main__':
+    asyncio.run(main())

docs/examples/respect_robots_txt_file.mdx

Lines changed: 11 additions & 0 deletions
@@ -7,6 +7,7 @@ import ApiLink from '@site/src/components/ApiLink';
 import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
 
 import RespectRobotsTxt from '!!raw-loader!roa-loader!./code_examples/respect_robots_txt_file.py';
+import OnSkippedRequest from '!!raw-loader!roa-loader!./code_examples/respect_robots_on_skipped_request.py';
 
 This example demonstrates how to configure your crawler to respect the rules established by websites for crawlers as described in the [robots.txt](https://www.robotstxt.org/robotstxt.html) file.
 
@@ -19,3 +20,13 @@ The code below demonstrates this behavior using the <ApiLink to="class/Beautiful
 <RunnableCodeBlock className="language-python" language="python">
     {RespectRobotsTxt}
 </RunnableCodeBlock>
+
+## Handle with `on_skipped_request`
+
+If you want to process URLs skipped according to the `robots.txt` rules, for example for further analysis, you should use the `on_skipped_request` handler from <ApiLink to="class/BasicCrawler#on_skipped_request">`BasicCrawler`</ApiLink>.
+
+Let's update the code by adding the `on_skipped_request` handler:
+
+<RunnableCodeBlock className="language-python" language="python">
+    {OnSkippedRequest}
+</RunnableCodeBlock>
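
The docs above mention processing the skipped URLs "for further analysis". A minimal sketch of that idea (not part of this commit; the output file name and start URL are arbitrary) which collects the disallowed URLs and writes them out once the crawl finishes:

```python
import asyncio
from pathlib import Path

from crawlee import SkippedReason
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(respect_robots_txt_file=True)
    skipped_urls: list[str] = []

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        await context.enqueue_links()

    @crawler.on_skipped_request
    async def skipped_request_handler(url: str, reason: SkippedReason) -> None:
        # 'robots_txt' is currently the only reason the crawler emits.
        if reason == 'robots_txt':
            skipped_urls.append(url)

    await crawler.run(['https://news.ycombinator.com/'])

    # Persist the disallowed URLs for later analysis.
    Path('skipped_urls.txt').write_text('\n'.join(skipped_urls), encoding='utf-8')


if __name__ == '__main__':
    asyncio.run(main())
```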

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -36,6 +36,7 @@ dependencies = [
     "apify_fingerprint_datapoints>=0.0.2",
     "browserforge>=1.2.3",
     "cachetools>=5.5.0",
+    "certifi<=2025.1.31", # Not a direct dependency. Temporarily pinned. Dependency can be removed after: https://github.com/apify/crawlee-python/issues/1182
     "colorama>=0.4.0",
     "eval-type-backport>=0.2.0",
     "httpx[brotli,http2,zstd]>=0.27.0",
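
Because the `certifi` pin is indirect (crawlee does not import it directly), a quick way to check what a given environment actually resolved is to query the installed distribution. A small sketch using only the standard library plus `certifi` itself:

```python
from importlib.metadata import version

import certifi

# With the temporary pin in place, this should report 2025.1.31 or older.
print('certifi version:', version('certifi'))
# Path to the CA bundle used by clients (such as httpx) that rely on certifi.
print('CA bundle:', certifi.where())
```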

src/crawlee/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -2,7 +2,7 @@
 
 from ._request import Request, RequestOptions
 from ._service_locator import service_locator
-from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction
+from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction, SkippedReason
 from ._utils.globs import Glob
 
 __version__ = metadata.version('crawlee')
@@ -15,5 +15,6 @@
     'Request',
     'RequestOptions',
     'RequestTransformAction',
+    'SkippedReason',
     'service_locator',
 ]

src/crawlee/_types.py

Lines changed: 2 additions & 0 deletions
@@ -50,6 +50,8 @@
 EnqueueStrategy: TypeAlias = Literal['all', 'same-domain', 'same-hostname', 'same-origin']
 """Enqueue strategy to be used for determining which links to extract and enqueue."""
 
+SkippedReason: TypeAlias = Literal['robots_txt']
+
 
 def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]:
     """Convert all header keys to lowercase, strips whitespace, and returns them sorted by key."""

src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 9 additions & 2 deletions
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import asyncio
 import logging
 from abc import ABC
 from typing import TYPE_CHECKING, Any, Callable, Generic, Union
@@ -157,6 +158,7 @@ async def extract_links(
             kwargs.setdefault('strategy', 'same-hostname')
 
             requests = list[Request]()
+            skipped = list[str]()
             base_user_data = user_data or {}
 
             robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)
@@ -168,8 +170,7 @@
                 url = convert_to_absolute_url(base_url, url)
 
                 if robots_txt_file and not robots_txt_file.is_allowed(url):
-                    # TODO: https://github.com/apify/crawlee-python/issues/1160
-                    # add processing with on_skipped_request hook
+                    skipped.append(url)
                     continue
 
                 request_options = RequestOptions(url=url, user_data={**base_user_data}, label=label)
@@ -192,6 +193,12 @@
                     continue
 
                 requests.append(request)
+
+            if skipped:
+                skipped_tasks = [
+                    asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped
+                ]
+                await asyncio.gather(*skipped_tasks)
             return requests
 
         return extract_links
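
Taken in isolation, the new block is a collect-then-dispatch pattern: disallowed links are gathered while iterating, and all skip callbacks are awaited together before the extracted requests are returned. A self-contained sketch of that pattern, with `_handle_skipped_request` reduced to a stand-in for the crawler method added in `_basic_crawler.py` below:

```python
import asyncio
from typing import Literal

SkippedReason = Literal['robots_txt']


async def _handle_skipped_request(url: str, reason: SkippedReason) -> None:
    # Stand-in: in the crawler this marks the request and invokes the
    # user-registered `on_skipped_request` callback.
    print(f'skipped {url} ({reason})')


async def dispatch_skipped(skipped: list[str]) -> None:
    # Schedule one task per skipped URL and wait for every callback to finish.
    tasks = [asyncio.create_task(_handle_skipped_request(url, 'robots_txt')) for url in skipped]
    await asyncio.gather(*tasks)


if __name__ == '__main__':
    asyncio.run(dispatch_skipped(['https://example.com/private']))
```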

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 58 additions & 16 deletions
@@ -33,6 +33,7 @@
     HttpHeaders,
     RequestHandlerRunResult,
     SendRequestFunction,
+    SkippedReason,
 )
 from crawlee._utils.docs import docs_group
 from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
@@ -82,6 +83,7 @@
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Union[Request, None]]]
 FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
+SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
 
 
 class _BasicCrawlerOptions(TypedDict):
@@ -109,7 +111,11 @@ class _BasicCrawlerOptions(TypedDict):
     """HTTP client used by `BasicCrawlingContext.send_request` method."""
 
     max_request_retries: NotRequired[int]
-    """Maximum number of attempts to process a single request."""
+    """Specifies the maximum number of retries allowed for a request if its processing fails.
+    This includes retries due to navigation errors or errors thrown from user-supplied functions
+    (`request_handler`, `pre_navigation_hooks` etc.).
+
+    This limit does not apply to retries triggered by session rotation (see `max_session_rotations`)."""
 
     max_requests_per_crawl: NotRequired[int | None]
     """Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.
@@ -118,7 +124,10 @@ class _BasicCrawlerOptions(TypedDict):
 
     max_session_rotations: NotRequired[int]
     """Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs
-    or if the website blocks the request."""
+    or if the website blocks the request.
+
+    The session rotations are not counted towards the `max_request_retries` limit.
+    """
 
     max_crawl_depth: NotRequired[int | None]
     """Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.
@@ -268,14 +277,20 @@ def __init__(
             proxy_configuration: HTTP proxy configuration used when making requests.
             http_client: HTTP client used by `BasicCrawlingContext.send_request` method.
             request_handler: A callable responsible for handling requests.
-            max_request_retries: Maximum number of attempts to process a single request.
+            max_request_retries: Specifies the maximum number of retries allowed for a request if its processing fails.
+                This includes retries due to navigation errors or errors thrown from user-supplied functions
+                (`request_handler`, `pre_navigation_hooks` etc.).
+
+                This limit does not apply to retries triggered by session rotation (see `max_session_rotations`).
             max_requests_per_crawl: Maximum number of pages to open during a crawl. The crawl stops upon reaching
                 this limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means
                 no limit. Due to concurrency settings, the actual number of pages visited may slightly exceed
                 this value. If used together with `keep_alive`, then the crawler will be kept alive only until
                 `max_requests_per_crawl` is achieved.
             max_session_rotations: Maximum number of session rotations per request. The crawler rotates the session
                 if a proxy error occurs or if the website blocks the request.
+
+                The session rotations are not counted towards the `max_request_retries` limit.
             max_crawl_depth: Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond
                 this depth. The crawl depth starts at 0 for initial requests and increases with each subsequent level
                 of links. Requests at the maximum depth will still be processed, but no new links will be enqueued
@@ -336,9 +351,10 @@ def __init__(
         self._router = None
         self.router.default_handler(request_handler)
 
-        # Error & failed request handlers
+        # Error, failed & skipped request handlers
         self._error_handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext] | None = None
         self._failed_request_handler: FailedRequestHandler[TCrawlingContext | BasicCrawlingContext] | None = None
+        self._on_skipped_request: SkippedRequestCallback | None = None
         self._abort_on_error = abort_on_error
 
         # Context of each request with matching result of request handler.
@@ -541,6 +557,14 @@ def failed_request_handler(
         self._failed_request_handler = handler
         return handler
 
+    def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
+        """Register a function to handle skipped requests.
+
+        The skipped request handler is invoked when a request is skipped due to a collision or other reasons.
+        """
+        self._on_skipped_request = callback
+        return callback
+
     async def run(
         self,
         requests: Sequence[str | Request] | None = None,
@@ -679,8 +703,10 @@ async def add_requests(
                 skipped.append(request)
 
         if skipped:
-            # TODO: https://github.com/apify/crawlee-python/issues/1160
-            # add processing with on_skipped_request hook
+            skipped_tasks = [
+                asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped
+            ]
+            await asyncio.gather(*skipped_tasks)
             self._logger.warning('Some requests were skipped because they were disallowed based on the robots.txt file')
 
         request_manager = await self.get_request_manager()
@@ -954,6 +980,30 @@ async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawling
         except Exception as e:
             raise UserDefinedErrorHandlerError('Exception thrown in user-defined failed request handler') from e
 
+    async def _handle_skipped_request(
+        self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
+    ) -> None:
+        if need_mark and isinstance(request, Request):
+            request_manager = await self.get_request_manager()
+
+            await wait_for(
+                lambda: request_manager.mark_request_as_handled(request),
+                timeout=self._internal_timeout,
+                timeout_message='Marking request as handled timed out after '
+                f'{self._internal_timeout.total_seconds()} seconds',
+                logger=self._logger,
+                max_retries=3,
+            )
+            request.state = RequestState.SKIPPED
+
+        url = request.url if isinstance(request, Request) else request
+
+        if self._on_skipped_request:
+            try:
+                await self._on_skipped_request(url, reason)
+            except Exception as e:
+                raise UserDefinedErrorHandlerError('Exception thrown in user-defined skipped request callback') from e
+
     def _get_message_from_error(self, error: Exception) -> str:
         """Get error message summary from exception.
 
@@ -1110,16 +1160,8 @@ async def __run_task_function(self) -> None:
             self._logger.warning(
                 f'Skipping request {request.url} ({request.id}) because it is disallowed based on robots.txt'
             )
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
-            # TODO: https://github.com/apify/crawlee-python/issues/1160
-            # add processing with on_skipped_request hook
+
+            await self._handle_skipped_request(request, 'robots_txt', need_mark=True)
             return
 
         if request.session_id:
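
To see the new plumbing without the rest of `BasicCrawler`, here is a toy model that mirrors only the callback slot and the exception wrapping from the diff; the class and exception below are stand-ins, not crawlee APIs:

```python
from __future__ import annotations

import asyncio
from typing import Awaitable, Callable, Literal

SkippedReason = Literal['robots_txt']
SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]


class UserDefinedErrorHandlerError(Exception):
    """Stand-in for crawlee's exception of the same name."""


class MiniCrawler:
    """Toy model of the callback plumbing added to `BasicCrawler`."""

    def __init__(self) -> None:
        # Single slot: registering a second callback replaces the first one.
        self._on_skipped_request: SkippedRequestCallback | None = None

    def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
        self._on_skipped_request = callback
        return callback

    async def _handle_skipped_request(self, url: str, reason: SkippedReason) -> None:
        if self._on_skipped_request:
            try:
                await self._on_skipped_request(url, reason)
            except Exception as e:
                raise UserDefinedErrorHandlerError('Exception thrown in user-defined skipped request callback') from e


async def main() -> None:
    crawler = MiniCrawler()

    @crawler.on_skipped_request
    async def on_skip(url: str, reason: SkippedReason) -> None:
        print(f'{url} skipped ({reason})')

    # Calling the internal handler directly, purely for demonstration.
    await crawler._handle_skipped_request('https://example.com/private', 'robots_txt')


if __name__ == '__main__':
    asyncio.run(main())
```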

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 9 additions & 2 deletions
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import asyncio
 import logging
 from functools import partial
 from typing import TYPE_CHECKING, Any, Callable, Generic, Literal, Union
@@ -292,6 +293,7 @@ async def extract_links(
             kwargs.setdefault('strategy', 'same-hostname')
 
             requests = list[Request]()
+            skipped = list[str]()
             base_user_data = user_data or {}
 
             elements = await context.page.query_selector_all(selector)
@@ -309,8 +311,7 @@
                 url = convert_to_absolute_url(base_url, url)
 
                 if robots_txt_file and not robots_txt_file.is_allowed(url):
-                    # TODO: https://github.com/apify/crawlee-python/issues/1160
-                    # add processing with on_skipped_request hook
+                    skipped.append(url)
                     continue
 
                 request_option = RequestOptions({'url': url, 'user_data': {**base_user_data}, 'label': label})
@@ -334,6 +335,12 @@
 
                 requests.append(request)
 
+            if skipped:
+                skipped_tasks = [
+                    asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped
+                ]
+                await asyncio.gather(*skipped_tasks)
+
             return requests
 
         return extract_links
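
The same hook is available on `PlaywrightCrawler`, since its `extract_links` now feeds disallowed URLs into the shared `_handle_skipped_request` path. A brief sketch, assuming a Playwright-enabled environment and an arbitrary start URL:

```python
import asyncio

from crawlee import SkippedReason
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler(respect_robots_txt_file=True)

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        # Links disallowed by robots.txt are skipped during enqueue
        # and routed to the `on_skipped_request` callback instead.
        await context.enqueue_links()

    @crawler.on_skipped_request
    async def skipped_request_handler(url: str, reason: SkippedReason) -> None:
        crawler.log.info(f'Skipped {url} ({reason})')

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
```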

tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py

Lines changed: 24 additions & 1 deletion
@@ -3,7 +3,7 @@
 from typing import TYPE_CHECKING
 from unittest import mock
 
-from crawlee import ConcurrencySettings, HttpHeaders, RequestTransformAction
+from crawlee import ConcurrencySettings, HttpHeaders, RequestTransformAction, SkippedReason
 from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
 
 if TYPE_CHECKING:
@@ -160,3 +160,26 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
         str(server_url / 'start_enqueue'),
         str(server_url / 'sub_index'),
     }
+
+
+async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:
+    crawler = BeautifulSoupCrawler(http_client=http_client, respect_robots_txt_file=True)
+    skip = mock.Mock()
+
+    @crawler.router.default_handler
+    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
+        await context.enqueue_links()
+
+    @crawler.on_skipped_request
+    async def skipped_hook(url: str, _reason: SkippedReason) -> None:
+        skip(url)
+
+    await crawler.run([str(server_url / 'start_enqueue')])
+
+    skipped = {call[0][0] for call in skip.call_args_list}
+
+    assert skipped == {
+        str(server_url / 'page_1'),
+        str(server_url / 'page_2'),
+        str(server_url / 'page_3'),
+    }
