
Commit 6565eea

Wrap pre-navigation hooks with navigation timeout
1 parent ecd3c64 commit 6565eea
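
Before this commit, the navigation timeout applied only to the navigation itself (the HTTP request or page.goto() call), while pre-navigation hooks ran with no time limit; after it, the hooks and the navigation draw from one shared budget. A minimal sketch of the idea in plain asyncio (illustrative only; the helper name and signature below are not part of the crawlee API):

import asyncio
import time
from datetime import timedelta


async def run_with_shared_budget(hooks, navigate, budget: timedelta) -> None:
    """Illustrative helper: hooks and the navigation share one time budget."""
    remaining = budget.total_seconds()
    for hook in hooks:
        started = time.monotonic()
        # Each hook may only use what is left of the budget...
        await asyncio.wait_for(hook(), timeout=remaining)
        remaining -= time.monotonic() - started
    # ...and the navigation gets whatever the hooks did not consume.
    await asyncio.wait_for(navigate(), timeout=remaining)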

File tree: 9 files changed (+171, −18 lines)

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -34,6 +34,7 @@ keywords = [
     "scraping",
 ]
 dependencies = [
+    "async-timeout>=5.0.1",
     "cachetools>=5.5.0",
     "colorama>=0.4.0",
     "impit>=0.8.0",

src/crawlee/_utils/time.py

Lines changed: 38 additions & 1 deletion
@@ -3,11 +3,14 @@
 import time
 from contextlib import contextmanager
 from dataclasses import dataclass
+from datetime import timedelta
 from typing import TYPE_CHECKING
 
+from async_timeout import Timeout, timeout
+
 if TYPE_CHECKING:
     from collections.abc import Iterator
-    from datetime import timedelta
+    from types import TracebackType
 
 _SECONDS_PER_MINUTE = 60
 _SECONDS_PER_HOUR = 3600
@@ -35,6 +38,40 @@ def measure_time() -> Iterator[TimerResult]:
         result.cpu = after_cpu - before_cpu
 
 
+class SharedTimeout:
+    """Keeps track of a time budget shared by multiple independent async operations."""
+
+    def __init__(self, timeout: timedelta) -> None:
+        self._remaining_timeout = timeout
+        self._active_timeout: Timeout | None = None
+        self._activation_timestamp: float | None = None
+
+    async def __aenter__(self) -> timedelta:
+        if self._active_timeout is not None or self._activation_timestamp is not None:
+            raise RuntimeError('A shared timeout context cannot be entered twice at the same time')
+
+        self._activation_timestamp = time.monotonic()
+        self._active_timeout = new_timeout = timeout(self._remaining_timeout.total_seconds())
+        await new_timeout.__aenter__()
+        return self._remaining_timeout
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
+    ) -> None:
+        if self._active_timeout is None or self._activation_timestamp is None:
+            raise RuntimeError('Logic error')
+
+        await self._active_timeout.__aexit__(exc_type, exc_value, exc_traceback)
+        elapsed = time.monotonic() - self._activation_timestamp
+        self._remaining_timeout = self._remaining_timeout - timedelta(seconds=elapsed)
+
+        self._active_timeout = None
+        self._activation_timestamp = None
+
+
 def format_duration(duration: timedelta | None) -> str:
     """Format a timedelta into a human-readable string with appropriate units."""
     if duration is None:
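
A quick usage sketch of the new SharedTimeout class (the import path follows the module added above; the timings are illustrative): two independent async with blocks draw from the same budget, and the second one only gets what the first left unused.

import asyncio
from datetime import timedelta

from crawlee._utils.time import SharedTimeout


async def main() -> None:
    shared = SharedTimeout(timedelta(seconds=2))

    async with shared:
        await asyncio.sleep(0.5)  # consumes roughly 0.5 s of the 2 s budget

    async with shared as remaining:
        # remaining is now roughly 1.5 s; exceeding it raises asyncio.TimeoutError
        print(f'remaining budget: {remaining.total_seconds():.2f} s')
        await asyncio.sleep(0.5)


asyncio.run(main())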

src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 18 additions & 9 deletions
@@ -5,13 +5,16 @@
 from abc import ABC
 from datetime import timedelta
 from typing import TYPE_CHECKING, Any, Generic
+from weakref import WeakKeyDictionary
 
 from more_itertools import partition
 from pydantic import ValidationError
 from typing_extensions import NotRequired, TypeVar
 
 from crawlee._request import Request, RequestOptions
+from crawlee._types import BasicCrawlingContext
 from crawlee._utils.docs import docs_group
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.errors import SessionError
@@ -25,7 +28,7 @@
     from typing_extensions import Unpack
 
     from crawlee import RequestTransformAction
-    from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, ExtractLinksFunction
+    from crawlee._types import EnqueueLinksKwargs, ExtractLinksFunction
 
     from ._abstract_http_parser import AbstractHttpParser
 
@@ -76,6 +79,7 @@ def __init__(
         self._parser = parser
         self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
         self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
+        self._shared_navigation_timeouts = WeakKeyDictionary[BasicCrawlingContext, SharedTimeout]()
 
         if '_context_pipeline' not in kwargs:
             raise ValueError(
@@ -128,8 +132,12 @@ def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpC
     async def _execute_pre_navigation_hooks(
         self, context: BasicCrawlingContext
     ) -> AsyncGenerator[BasicCrawlingContext, None]:
+        self._shared_navigation_timeouts[context] = SharedTimeout(self._navigation_timeout)
+
         for hook in self._pre_navigation_hooks:
-            await hook(context)
+            async with self._shared_navigation_timeouts[context]:
+                await hook(context)
+
         yield context
 
     async def _parse_http_response(
@@ -232,13 +240,14 @@ async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenera
         Yields:
             The original crawling context enhanced by HTTP response.
         """
-        result = await self._http_client.crawl(
-            request=context.request,
-            session=context.session,
-            proxy_info=context.proxy_info,
-            statistics=self._statistics,
-            timeout=self._navigation_timeout,
-        )
+        async with self._shared_navigation_timeouts[context] as remaining_timeout:
+            result = await self._http_client.crawl(
+                request=context.request,
+                session=context.session,
+                proxy_info=context.proxy_info,
+                statistics=self._statistics,
+                timeout=remaining_timeout,
+            )
 
         yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)
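
For HTTP-based crawlers this means a slow pre-navigation hook now eats into the budget that the HTTP request itself would otherwise have. A hedged sketch of the user-facing effect, mirroring the unit test added further below (the URL is a placeholder):

import asyncio
from datetime import timedelta

from crawlee.crawlers import BeautifulSoupCrawler


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        navigation_timeout=timedelta(seconds=1),
        max_request_retries=0,
    )

    async def slow_hook(_context) -> None:
        # Sleeps longer than the navigation timeout, so the request now fails
        # with asyncio.TimeoutError before the HTTP request is even sent.
        await asyncio.sleep(2)

    crawler.pre_navigation_hook(slow_hook)
    await crawler.run(['https://example.com'])  # placeholder URL


asyncio.run(main())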

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 17 additions & 6 deletions
@@ -6,6 +6,7 @@
 from datetime import timedelta
 from functools import partial
 from typing import TYPE_CHECKING, Any, Generic, Literal
+from weakref import WeakKeyDictionary
 
 import playwright.async_api
 from more_itertools import partition
@@ -14,10 +15,14 @@
 
 from crawlee import service_locator
 from crawlee._request import Request, RequestOptions
-from crawlee._types import ConcurrencySettings
+from crawlee._types import (
+    BasicCrawlingContext,
+    ConcurrencySettings,
+)
 from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee._utils.docs import docs_group
 from crawlee._utils.robots import RobotsTxtFile
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.browsers import BrowserPool
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
@@ -46,7 +51,6 @@
 
     from crawlee import RequestTransformAction
     from crawlee._types import (
-        BasicCrawlingContext,
         EnqueueLinksKwargs,
         ExtractLinksFunction,
         HttpHeaders,
@@ -145,6 +149,8 @@ def __init__(
         if configuration is not None:
             service_locator.set_configuration(configuration)
 
+        self._shared_navigation_timeouts = WeakKeyDictionary[BasicCrawlingContext, SharedTimeout]()
+
         if browser_pool:
             # Raise an exception if browser_pool is provided together with other browser-related arguments.
             if any(
@@ -235,9 +241,13 @@ async def _open_page(
             block_requests=partial(block_requests, page=crawlee_page.page),
         )
 
+        self._shared_navigation_timeouts[pre_navigation_context] = SharedTimeout(self._navigation_timeout)
+
         async with browser_page_context(crawlee_page.page):
             for hook in self._pre_navigation_hooks:
-                await hook(pre_navigation_context)
+                async with self._shared_navigation_timeouts[context]:
+                    await hook(pre_navigation_context)
+
         yield pre_navigation_context
 
     def _prepare_request_interceptor(
@@ -306,9 +316,10 @@ async def _navigate(
         await context.page.route(context.request.url, route_handler)
 
         try:
-            response = await context.page.goto(
-                context.request.url, timeout=self._navigation_timeout.total_seconds() * 1000
-            )
+            async with self._shared_navigation_timeouts[context] as remaining_timeout:
+                response = await context.page.goto(
+                    context.request.url, timeout=remaining_timeout.total_seconds() * 1000
+                )
         except playwright.async_api.TimeoutError as exc:
             raise asyncio.TimeoutError from exc
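
The Playwright crawler gets the same treatment: time spent in pre-navigation hooks is deducted from what page.goto() receives. A hedged sketch under the same assumptions as above (placeholder URL, illustrative timings):

import asyncio
from datetime import timedelta

from crawlee.crawlers import PlaywrightCrawler


async def main() -> None:
    crawler = PlaywrightCrawler(
        navigation_timeout=timedelta(seconds=5),
        max_request_retries=0,
    )

    async def slow_hook(_context) -> None:
        # This hook uses roughly 2 s, leaving page.goto() about 3 s of the 5 s budget.
        await asyncio.sleep(2)

    crawler.pre_navigation_hook(slow_hook)
    await crawler.run(['https://example.com'])  # placeholder URL


asyncio.run(main())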

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+import asyncio
+from datetime import timedelta
+
+import pytest
+
+from crawlee._utils.time import SharedTimeout, measure_time
+
+
+async def test_shared_timeout_tracks_elapsed_time() -> None:
+    timeout_duration = timedelta(seconds=1)
+    shared_timeout = SharedTimeout(timeout_duration)
+
+    # First usage
+    async with shared_timeout:
+        await asyncio.sleep(0.2)
+
+    # Second usage - should have less time remaining
+    async with shared_timeout as remaining:
+        assert remaining < timedelta(seconds=0.85)
+        assert remaining > timedelta(seconds=0)
+
+
+async def test_shared_timeout_expires() -> None:
+    timeout_duration = timedelta(seconds=0.1)
+    shared_timeout = SharedTimeout(timeout_duration)
+
+    with measure_time() as elapsed, pytest.raises(asyncio.TimeoutError):
+        async with shared_timeout:
+            await asyncio.sleep(0.5)
+
+    assert elapsed.wall is not None
+    assert elapsed.wall < 0.3
+
+
+async def test_shared_timeout_cannot_be_nested() -> None:
+    timeout_duration = timedelta(seconds=1)
+    shared_timeout = SharedTimeout(timeout_duration)
+
+    async with shared_timeout:
+        with pytest.raises(RuntimeError, match='cannot be entered twice'):
+            async with shared_timeout:
+                pass
+
+
+async def test_shared_timeout_multiple_sequential_uses() -> None:
+    """Test that SharedTimeout can be used multiple times sequentially."""
+    timeout_duration = timedelta(seconds=1)
+    shared_timeout = SharedTimeout(timeout_duration)
+
+    for _ in range(5):
+        async with shared_timeout:
+            await asyncio.sleep(0.05)
+
+    # Should have consumed roughly 0.25 seconds
+    async with shared_timeout as remaining:
+        assert remaining < timedelta(seconds=0.8)
+        assert remaining > timedelta(seconds=0)

tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py

Lines changed: 18 additions & 0 deletions
@@ -366,6 +366,24 @@ async def test_navigation_timeout_on_slow_request(server_url: URL, http_client:
     assert isinstance(failed_request_handler.call_args[0][1], asyncio.TimeoutError)
 
 
+async def test_navigation_timeout_applies_to_hooks(server_url: URL) -> None:
+    crawler = BeautifulSoupCrawler(
+        navigation_timeout=timedelta(seconds=1),
+        max_request_retries=0,
+    )
+
+    request_handler = mock.AsyncMock()
+    crawler.router.default_handler(request_handler)
+    crawler.pre_navigation_hook(lambda _: asyncio.sleep(1))
+
+    # Pre-navigation hook takes 1 second (exceeds navigation timeout), so the URL will not be handled
+    result = await crawler.run([str(server_url)])
+
+    assert result.requests_failed == 1
+    assert result.requests_finished == 0
+    assert request_handler.call_count == 0
+
+
 async def test_slow_navigation_does_not_count_toward_handler_timeout(server_url: URL, http_client: HttpClient) -> None:
     crawler = BeautifulSoupCrawler(
         http_client=http_client,

tests/unit/crawlers/_playwright/test_playwright_crawler.py

Lines changed: 18 additions & 0 deletions
@@ -952,6 +952,24 @@ async def test_navigation_timeout_on_slow_page_load(server_url: URL) -> None:
     assert isinstance(failed_request_handler.call_args[0][1], asyncio.TimeoutError)
 
 
+async def test_navigation_timeout_applies_to_hooks(server_url: URL) -> None:
+    crawler = PlaywrightCrawler(
+        navigation_timeout=timedelta(seconds=0.5),
+        max_request_retries=0,
+    )
+
+    request_handler = AsyncMock()
+    crawler.router.default_handler(request_handler)
+    crawler.pre_navigation_hook(lambda _: asyncio.sleep(1))
+
+    # Pre-navigation hook takes 1 second (exceeds navigation timeout), so the URL will not be handled
+    result = await crawler.run([str(server_url)])
+
+    assert result.requests_failed == 1
+    assert result.requests_finished == 0
+    assert request_handler.call_count == 0
+
+
 async def test_slow_navigation_does_not_count_toward_handler_timeout(server_url: URL) -> None:
     crawler = PlaywrightCrawler(
         request_handler_timeout=timedelta(seconds=0.5),

uv.lock

Lines changed: 4 additions & 2 deletions
Some generated files are not rendered by default.
