
Commit 03de1f3

Merge branch 'master' into memory-storage-refactor
2 parents 71174a6 + bd4b940

14 files changed (+3378 -3180 lines)


CHANGELOG.md

Lines changed: 3 additions & 3 deletions
@@ -2,20 +2,20 @@
 
 All notable changes to this project will be documented in this file.
 
-<!-- git-cliff-unreleased-start -->
-## 0.6.9 - **not yet released**
+## [0.6.9](https://github.com/apify/crawlee-python/releases/tag/v0.6.9) (2025-05-02)
 
 ### 🚀 Features
 
 - Add an internal `HttpClient` to be used in `send_request` for `PlaywrightCrawler` using `APIRequestContext` bound to the browser context ([#1134](https://github.com/apify/crawlee-python/pull/1134)) ([e794f49](https://github.com/apify/crawlee-python/commit/e794f4985d3a018ee76d634fe2b2c735fb450272)) by [@Mantisus](https://github.com/Mantisus), closes [#928](https://github.com/apify/crawlee-python/issues/928)
 - Make timeout error log cleaner ([#1170](https://github.com/apify/crawlee-python/pull/1170)) ([78ea9d2](https://github.com/apify/crawlee-python/commit/78ea9d23e0b2d73286043b68393e462f636625c9)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1158](https://github.com/apify/crawlee-python/issues/1158)
+- Add `on_skipped_request` decorator, to process links skipped according to `robots.txt` rules ([#1166](https://github.com/apify/crawlee-python/pull/1166)) ([bd16f14](https://github.com/apify/crawlee-python/commit/bd16f14a834eebf485aea6b6a83f2b18bf16b504)) by [@Mantisus](https://github.com/Mantisus), closes [#1160](https://github.com/apify/crawlee-python/issues/1160)
 
 ### 🐛 Bug Fixes
 
 - Fix handle error without `args` in `_get_error_message` for `ErrorTracker` ([#1181](https://github.com/apify/crawlee-python/pull/1181)) ([21944d9](https://github.com/apify/crawlee-python/commit/21944d908b8404d2ad6c182104e7a8c27be12a6e)) by [@Mantisus](https://github.com/Mantisus), closes [#1179](https://github.com/apify/crawlee-python/issues/1179)
+- Temporarily add `certifi<=2025.1.31` dependency ([#1183](https://github.com/apify/crawlee-python/pull/1183)) ([25ff961](https://github.com/apify/crawlee-python/commit/25ff961990f9abc9d0673ba6573dfcf46dd6e53f)) by [@Pijukatel](https://github.com/Pijukatel)
 
 
-<!-- git-cliff-unreleased-end -->
 ## [0.6.8](https://github.com/apify/crawlee-python/releases/tag/v0.6.8) (2025-04-25)
 
 ### 🚀 Features
docs/examples/code_examples/respect_robots_on_skipped_request.py (new file)

Lines changed: 36 additions & 0 deletions

@@ -0,0 +1,36 @@
+import asyncio
+
+from crawlee import SkippedReason
+from crawlee.crawlers import (
+    BeautifulSoupCrawler,
+    BeautifulSoupCrawlingContext,
+)
+
+
+async def main() -> None:
+    # Initialize the crawler with robots.txt compliance enabled
+    crawler = BeautifulSoupCrawler(respect_robots_txt_file=True)
+
+    @crawler.router.default_handler
+    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url} ...')
+
+    # highlight-start
+    # This handler is called when a request is skipped
+    @crawler.on_skipped_request
+    async def skipped_request_handler(url: str, reason: SkippedReason) -> None:
+        # Check if the request was skipped due to robots.txt rules
+        if reason == 'robots_txt':
+            crawler.log.info(f'Skipped {url} due to robots.txt rules.')
+
+    # highlight-end
+
+    # Start the crawler with the specified URLs
+    # The login URL will be skipped and handled by the skipped_request_handler
+    await crawler.run(
+        ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login']
+    )
+
+
+if __name__ == '__main__':
+    asyncio.run(main())

docs/examples/respect_robots_txt_file.mdx

Lines changed: 11 additions & 0 deletions
@@ -7,6 +7,7 @@ import ApiLink from '@site/src/components/ApiLink';
 import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
 
 import RespectRobotsTxt from '!!raw-loader!roa-loader!./code_examples/respect_robots_txt_file.py';
+import OnSkippedRequest from '!!raw-loader!roa-loader!./code_examples/respect_robots_on_skipped_request.py';
 
 This example demonstrates how to configure your crawler to respect the rules established by websites for crawlers as described in the [robots.txt](https://www.robotstxt.org/robotstxt.html) file.
 
@@ -19,3 +20,13 @@ The code below demonstrates this behavior using the <ApiLink to="class/Beautiful
 <RunnableCodeBlock className="language-python" language="python">
     {RespectRobotsTxt}
 </RunnableCodeBlock>
+
+## Handle with `on_skipped_request`
+
+If you want to process URLs skipped according to the `robots.txt` rules, for example for further analysis, you should use the `on_skipped_request` handler from <ApiLink to="class/BasicCrawler#on_skipped_request">`BasicCrawler`</ApiLink>.
+
+Let's update the code by adding the `on_skipped_request` handler:
+
+<RunnableCodeBlock className="language-python" language="python">
+    {OnSkippedRequest}
+</RunnableCodeBlock>
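
The docs above mention processing the skipped URLs "for further analysis". A minimal sketch of that idea (not part of this commit; the output file name and start URL are arbitrary) which collects the disallowed URLs and writes them out once the crawl finishes:

```python
import asyncio
from pathlib import Path

from crawlee import SkippedReason
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(respect_robots_txt_file=True)
    skipped_urls: list[str] = []

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        await context.enqueue_links()

    @crawler.on_skipped_request
    async def skipped_request_handler(url: str, reason: SkippedReason) -> None:
        # 'robots_txt' is currently the only reason the crawler emits.
        if reason == 'robots_txt':
            skipped_urls.append(url)

    await crawler.run(['https://news.ycombinator.com/'])

    # Persist the disallowed URLs for later analysis.
    Path('skipped_urls.txt').write_text('\n'.join(skipped_urls), encoding='utf-8')


if __name__ == '__main__':
    asyncio.run(main())
```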

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -36,6 +36,7 @@ dependencies = [
     "apify_fingerprint_datapoints>=0.0.2",
     "browserforge>=1.2.3",
     "cachetools>=5.5.0",
+    "certifi<=2025.1.31", # Not a direct dependency. Temporarily pinned. Dependency can be removed after: https://github.com/apify/crawlee-python/issues/1182
     "colorama>=0.4.0",
     "eval-type-backport>=0.2.0",
     "httpx[brotli,http2,zstd]>=0.27.0",
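
Because the `certifi` pin is indirect (crawlee does not import it directly), a quick way to check what a given environment actually resolved is to query the installed distribution. A small sketch using only the standard library plus `certifi` itself:

```python
from importlib.metadata import version

import certifi

# With the temporary pin in place, this should report 2025.1.31 or older.
print('certifi version:', version('certifi'))
# Path to the CA bundle used by clients (such as httpx) that rely on certifi.
print('CA bundle:', certifi.where())
```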

src/crawlee/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -2,7 +2,7 @@
 
 from ._request import Request, RequestOptions
 from ._service_locator import service_locator
-from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction
+from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction, SkippedReason
 from ._utils.globs import Glob
 
 __version__ = metadata.version('crawlee')
@@ -15,5 +15,6 @@
     'Request',
     'RequestOptions',
     'RequestTransformAction',
+    'SkippedReason',
     'service_locator',
 ]

src/crawlee/_types.py

Lines changed: 2 additions & 0 deletions
@@ -50,6 +50,8 @@
 EnqueueStrategy: TypeAlias = Literal['all', 'same-domain', 'same-hostname', 'same-origin']
 """Enqueue strategy to be used for determining which links to extract and enqueue."""
 
+SkippedReason: TypeAlias = Literal['robots_txt']
+
 
 def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]:
     """Convert all header keys to lowercase, strips whitespace, and returns them sorted by key."""

src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 9 additions & 2 deletions
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import asyncio
 import logging
 from abc import ABC
 from typing import TYPE_CHECKING, Any, Callable, Generic, Union
@@ -157,6 +158,7 @@ async def extract_links(
             kwargs.setdefault('strategy', 'same-hostname')
 
             requests = list[Request]()
+            skipped = list[str]()
             base_user_data = user_data or {}
 
             robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)
@@ -168,8 +170,7 @@
                 url = convert_to_absolute_url(base_url, url)
 
                 if robots_txt_file and not robots_txt_file.is_allowed(url):
-                    # TODO: https://github.com/apify/crawlee-python/issues/1160
-                    # add processing with on_skipped_request hook
+                    skipped.append(url)
                     continue
 
                 request_options = RequestOptions(url=url, user_data={**base_user_data}, label=label)
@@ -192,6 +193,12 @@
                     continue
 
                 requests.append(request)
+
+            if skipped:
+                skipped_tasks = [
+                    asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped
+                ]
+                await asyncio.gather(*skipped_tasks)
             return requests
 
         return extract_links
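
Taken in isolation, the new block is a collect-then-dispatch pattern: disallowed links are gathered while iterating, and all skip callbacks are awaited together before the extracted requests are returned. A self-contained sketch of that pattern, with `_handle_skipped_request` reduced to a stand-in for the crawler method added in `_basic_crawler.py` below:

```python
import asyncio
from typing import Literal

SkippedReason = Literal['robots_txt']


async def _handle_skipped_request(url: str, reason: SkippedReason) -> None:
    # Stand-in: in the crawler this marks the request and invokes the
    # user-registered `on_skipped_request` callback.
    print(f'skipped {url} ({reason})')


async def dispatch_skipped(skipped: list[str]) -> None:
    # Schedule one task per skipped URL and wait for every callback to finish.
    tasks = [asyncio.create_task(_handle_skipped_request(url, 'robots_txt')) for url in skipped]
    await asyncio.gather(*tasks)


if __name__ == '__main__':
    asyncio.run(dispatch_skipped(['https://example.com/private']))
```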

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 58 additions & 16 deletions
@@ -33,6 +33,7 @@
     HttpHeaders,
     RequestHandlerRunResult,
     SendRequestFunction,
+    SkippedReason,
 )
 from crawlee._utils.docs import docs_group
 from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
@@ -82,6 +83,7 @@
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Union[Request, None]]]
 FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
+SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
 
 
 class _BasicCrawlerOptions(TypedDict):
@@ -109,7 +111,11 @@ class _BasicCrawlerOptions(TypedDict):
     """HTTP client used by `BasicCrawlingContext.send_request` method."""
 
     max_request_retries: NotRequired[int]
-    """Maximum number of attempts to process a single request."""
+    """Specifies the maximum number of retries allowed for a request if its processing fails.
+    This includes retries due to navigation errors or errors thrown from user-supplied functions
+    (`request_handler`, `pre_navigation_hooks` etc.).
+
+    This limit does not apply to retries triggered by session rotation (see `max_session_rotations`)."""
 
     max_requests_per_crawl: NotRequired[int | None]
     """Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit.
@@ -118,7 +124,10 @@ class _BasicCrawlerOptions(TypedDict):
 
     max_session_rotations: NotRequired[int]
     """Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs
-    or if the website blocks the request."""
+    or if the website blocks the request.
+
+    The session rotations are not counted towards the `max_request_retries` limit.
+    """
 
     max_crawl_depth: NotRequired[int | None]
     """Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth.
@@ -268,14 +277,20 @@ def __init__(
             proxy_configuration: HTTP proxy configuration used when making requests.
             http_client: HTTP client used by `BasicCrawlingContext.send_request` method.
             request_handler: A callable responsible for handling requests.
-            max_request_retries: Maximum number of attempts to process a single request.
+            max_request_retries: Specifies the maximum number of retries allowed for a request if its processing fails.
+                This includes retries due to navigation errors or errors thrown from user-supplied functions
+                (`request_handler`, `pre_navigation_hooks` etc.).
+
+                This limit does not apply to retries triggered by session rotation (see `max_session_rotations`).
             max_requests_per_crawl: Maximum number of pages to open during a crawl. The crawl stops upon reaching
                 this limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means
                 no limit. Due to concurrency settings, the actual number of pages visited may slightly exceed
                 this value. If used together with `keep_alive`, then the crawler will be kept alive only until
                 `max_requests_per_crawl` is achieved.
             max_session_rotations: Maximum number of session rotations per request. The crawler rotates the session
                 if a proxy error occurs or if the website blocks the request.
+
+                The session rotations are not counted towards the `max_request_retries` limit.
             max_crawl_depth: Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond
                 this depth. The crawl depth starts at 0 for initial requests and increases with each subsequent level
                 of links. Requests at the maximum depth will still be processed, but no new links will be enqueued
@@ -336,9 +351,10 @@ def __init__(
         self._router = None
         self.router.default_handler(request_handler)
 
-        # Error & failed request handlers
+        # Error, failed & skipped request handlers
         self._error_handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext] | None = None
         self._failed_request_handler: FailedRequestHandler[TCrawlingContext | BasicCrawlingContext] | None = None
+        self._on_skipped_request: SkippedRequestCallback | None = None
         self._abort_on_error = abort_on_error
 
         # Context of each request with matching result of request handler.
@@ -541,6 +557,14 @@ def failed_request_handler(
         self._failed_request_handler = handler
         return handler
 
+    def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
+        """Register a function to handle skipped requests.
+
+        The skipped request handler is invoked when a request is skipped due to a collision or other reasons.
+        """
+        self._on_skipped_request = callback
+        return callback
+
     async def run(
         self,
         requests: Sequence[str | Request] | None = None,
@@ -679,8 +703,10 @@ async def add_requests(
                 skipped.append(request)
 
         if skipped:
-            # TODO: https://github.com/apify/crawlee-python/issues/1160
-            # add processing with on_skipped_request hook
+            skipped_tasks = [
+                asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped
+            ]
+            await asyncio.gather(*skipped_tasks)
             self._logger.warning('Some requests were skipped because they were disallowed based on the robots.txt file')
 
         request_manager = await self.get_request_manager()
@@ -954,6 +980,30 @@ async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawling
         except Exception as e:
             raise UserDefinedErrorHandlerError('Exception thrown in user-defined failed request handler') from e
 
+    async def _handle_skipped_request(
+        self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
+    ) -> None:
+        if need_mark and isinstance(request, Request):
+            request_manager = await self.get_request_manager()
+
+            await wait_for(
+                lambda: request_manager.mark_request_as_handled(request),
+                timeout=self._internal_timeout,
+                timeout_message='Marking request as handled timed out after '
+                f'{self._internal_timeout.total_seconds()} seconds',
+                logger=self._logger,
+                max_retries=3,
+            )
+            request.state = RequestState.SKIPPED
+
+        url = request.url if isinstance(request, Request) else request
+
+        if self._on_skipped_request:
+            try:
+                await self._on_skipped_request(url, reason)
+            except Exception as e:
+                raise UserDefinedErrorHandlerError('Exception thrown in user-defined skipped request callback') from e
+
     def _get_message_from_error(self, error: Exception) -> str:
         """Get error message summary from exception.
 
@@ -1110,16 +1160,8 @@ async def __run_task_function(self) -> None:
             self._logger.warning(
                 f'Skipping request {request.url} ({request.id}) because it is disallowed based on robots.txt'
             )
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
-            # TODO: https://github.com/apify/crawlee-python/issues/1160
-            # add processing with on_skipped_request hook
+
+            await self._handle_skipped_request(request, 'robots_txt', need_mark=True)
             return
 
         if request.session_id:
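
To see the new plumbing without the rest of `BasicCrawler`, here is a toy model that mirrors only the callback slot and the exception wrapping from the diff; the class and exception below are stand-ins, not crawlee APIs:

```python
from __future__ import annotations

import asyncio
from typing import Awaitable, Callable, Literal

SkippedReason = Literal['robots_txt']
SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]


class UserDefinedErrorHandlerError(Exception):
    """Stand-in for crawlee's exception of the same name."""


class MiniCrawler:
    """Toy model of the callback plumbing added to `BasicCrawler`."""

    def __init__(self) -> None:
        # Single slot: registering a second callback replaces the first one.
        self._on_skipped_request: SkippedRequestCallback | None = None

    def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
        self._on_skipped_request = callback
        return callback

    async def _handle_skipped_request(self, url: str, reason: SkippedReason) -> None:
        if self._on_skipped_request:
            try:
                await self._on_skipped_request(url, reason)
            except Exception as e:
                raise UserDefinedErrorHandlerError('Exception thrown in user-defined skipped request callback') from e


async def main() -> None:
    crawler = MiniCrawler()

    @crawler.on_skipped_request
    async def on_skip(url: str, reason: SkippedReason) -> None:
        print(f'{url} skipped ({reason})')

    # Calling the internal handler directly, purely for demonstration.
    await crawler._handle_skipped_request('https://example.com/private', 'robots_txt')


if __name__ == '__main__':
    asyncio.run(main())
```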

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 9 additions & 2 deletions
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import asyncio
 import logging
 from functools import partial
 from typing import TYPE_CHECKING, Any, Callable, Generic, Literal, Union
@@ -292,6 +293,7 @@ async def extract_links(
             kwargs.setdefault('strategy', 'same-hostname')
 
             requests = list[Request]()
+            skipped = list[str]()
             base_user_data = user_data or {}
 
             elements = await context.page.query_selector_all(selector)
@@ -309,8 +311,7 @@
                 url = convert_to_absolute_url(base_url, url)
 
                 if robots_txt_file and not robots_txt_file.is_allowed(url):
-                    # TODO: https://github.com/apify/crawlee-python/issues/1160
-                    # add processing with on_skipped_request hook
+                    skipped.append(url)
                     continue
 
                 request_option = RequestOptions({'url': url, 'user_data': {**base_user_data}, 'label': label})
@@ -334,6 +335,12 @@
 
                 requests.append(request)
 
+            if skipped:
+                skipped_tasks = [
+                    asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped
+                ]
+                await asyncio.gather(*skipped_tasks)
+
             return requests
 
         return extract_links
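
The same hook is available on `PlaywrightCrawler`, since its `extract_links` now feeds disallowed URLs into the shared `_handle_skipped_request` path. A brief sketch, assuming a Playwright-enabled environment and an arbitrary start URL:

```python
import asyncio

from crawlee import SkippedReason
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler(respect_robots_txt_file=True)

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        # Links disallowed by robots.txt are skipped during enqueue
        # and routed to the `on_skipped_request` callback instead.
        await context.enqueue_links()

    @crawler.on_skipped_request
    async def skipped_request_handler(url: str, reason: SkippedReason) -> None:
        crawler.log.info(f'Skipped {url} ({reason})')

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
```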

tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py

Lines changed: 24 additions & 1 deletion
@@ -3,7 +3,7 @@
 from typing import TYPE_CHECKING
 from unittest import mock
 
-from crawlee import ConcurrencySettings, HttpHeaders, RequestTransformAction
+from crawlee import ConcurrencySettings, HttpHeaders, RequestTransformAction, SkippedReason
 from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
 
 if TYPE_CHECKING:
@@ -160,3 +160,26 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
         str(server_url / 'start_enqueue'),
         str(server_url / 'sub_index'),
     }
+
+
+async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:
+    crawler = BeautifulSoupCrawler(http_client=http_client, respect_robots_txt_file=True)
+    skip = mock.Mock()
+
+    @crawler.router.default_handler
+    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
+        await context.enqueue_links()
+
+    @crawler.on_skipped_request
+    async def skipped_hook(url: str, _reason: SkippedReason) -> None:
+        skip(url)
+
+    await crawler.run([str(server_url / 'start_enqueue')])
+
+    skipped = {call[0][0] for call in skip.call_args_list}
+
+    assert skipped == {
+        str(server_url / 'page_1'),
+        str(server_url / 'page_2'),
+        str(server_url / 'page_3'),
+    }
