
Commit 3715db2

Fix tests and bugs
1 parent eba3eff commit 3715db2

File tree: 7 files changed, +122 −8 lines

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 8 additions & 3 deletions
@@ -7,6 +7,7 @@
 from functools import partial
 from typing import TYPE_CHECKING, Any, Generic, Literal
 
+import playwright.async_api
 from more_itertools import partition
 from pydantic import ValidationError
 from typing_extensions import NotRequired, TypedDict, TypeVar
@@ -272,6 +273,7 @@ async def _navigate(
         Raises:
             ValueError: If the browser pool is not initialized.
             SessionError: If the URL cannot be loaded by the browser.
+            TimeoutError: If navigation does not succeed within the navigation timeout.
 
         Yields:
             The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links,
@@ -303,9 +305,12 @@ async def _navigate(
             # Set route_handler only for current request
             await context.page.route(context.request.url, route_handler)
 
-            response = await asyncio.wait_for(
-                context.page.goto(context.request.url), timeout=self._navigation_timeout.total_seconds()
-            )
+            try:
+                response = await context.page.goto(
+                    context.request.url, timeout=self._navigation_timeout.total_seconds() * 1000
+                )
+            except playwright.async_api.TimeoutError as exc:
+                raise asyncio.TimeoutError from exc
 
             if response is None:
                 raise SessionError(f'Failed to load the URL: {context.request.url}')
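
Worth noting about the hunk above: asyncio.wait_for takes its timeout in seconds, while Playwright's page.goto expects milliseconds, hence the * 1000 when switching to the browser's native navigation timeout. A minimal sketch of the same normalization in isolation (the helper name and signature are illustrative, not from this commit):

# Minimal sketch of the normalization above; the helper name and signature
# are illustrative, not from this commit.
from __future__ import annotations

import asyncio
from datetime import timedelta

import playwright.async_api


async def goto_with_normalized_timeout(
    page: playwright.async_api.Page,
    url: str,
    navigation_timeout: timedelta,
) -> playwright.async_api.Response | None:
    try:
        # Playwright's goto() takes its timeout in milliseconds, not seconds.
        return await page.goto(url, timeout=navigation_timeout.total_seconds() * 1000)
    except playwright.async_api.TimeoutError as exc:
        # Re-raise as asyncio.TimeoutError so callers deal with one timeout type.
        raise asyncio.TimeoutError from exc

Relying on Playwright's own timeout also lets the browser driver abort the navigation itself, rather than having wait_for cancel the surrounding task mid-operation.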

src/crawlee/http_clients/_curl_impersonate.py

Lines changed: 8 additions & 0 deletions
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import asyncio
 from contextlib import asynccontextmanager
 from typing import TYPE_CHECKING, Any
 
@@ -10,6 +11,7 @@
 from curl_cffi.requests.cookies import CurlMorsel
 from curl_cffi.requests.exceptions import ProxyError as CurlProxyError
 from curl_cffi.requests.exceptions import RequestException as CurlRequestError
+from curl_cffi.requests.exceptions import Timeout
 from curl_cffi.requests.impersonate import DEFAULT_CHROME as CURL_DEFAULT_CHROME
 from typing_extensions import override
 
@@ -160,6 +162,8 @@ async def crawl(
                 cookies=session.cookies.jar if session else None,
                 timeout=timeout.total_seconds() if timeout else None,
             )
+        except Timeout as exc:
+            raise asyncio.TimeoutError from exc
         except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -205,6 +209,8 @@ async def send_request(
                 cookies=session.cookies.jar if session else None,
                 timeout=timeout.total_seconds() if timeout else None,
             )
+        except Timeout as exc:
+            raise asyncio.TimeoutError from exc
         except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -245,6 +251,8 @@ async def stream(
                 stream=True,
                 timeout=timeout.total_seconds() if timeout else None,
             )
+        except Timeout as exc:
+            raise asyncio.TimeoutError from exc
        except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
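
The same two-line Timeout mapping now appears in crawl, send_request, and stream. A hypothetical refactor (not part of this commit) could centralize it in a small context manager:

# Hypothetical refactor, not in this commit: one place to map curl_cffi's
# Timeout onto asyncio.TimeoutError for all three call sites.
import asyncio
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager

from curl_cffi.requests.exceptions import Timeout


@asynccontextmanager
async def normalized_timeout() -> AsyncIterator[None]:
    try:
        yield
    except Timeout as exc:
        raise asyncio.TimeoutError from exc

Each call site would then wrap its request in `async with normalized_timeout():` instead of repeating the except clause.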

src/crawlee/http_clients/_httpx.py

Lines changed: 9 additions & 1 deletion
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import asyncio
 from contextlib import asynccontextmanager
 from logging import getLogger
 from typing import TYPE_CHECKING, Any, cast
@@ -163,6 +164,8 @@ async def crawl(
 
         try:
             response = await client.send(http_request)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except httpx.TransportError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -203,6 +206,8 @@ async def send_request(
 
         try:
             response = await client.send(http_request)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except httpx.TransportError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -235,7 +240,10 @@ async def stream(
             timeout=httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None,
         )
 
-        response = await client.send(http_request, stream=True)
+        try:
+            response = await client.send(http_request, stream=True)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
 
         try:
             yield _HttpxResponse(response)

src/crawlee/http_clients/_impit.py

Lines changed: 3 additions & 3 deletions
@@ -138,7 +138,7 @@ async def crawl(
                 timeout=timeout.total_seconds() if timeout else None,
             )
         except TimeoutException as exc:
-            raise TimeoutError from exc
+            raise asyncio.TimeoutError from exc
         except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -177,7 +177,7 @@ async def send_request(
                 timeout=timeout.total_seconds() if timeout else None,
             )
         except TimeoutException as exc:
-            raise TimeoutError from exc
+            raise asyncio.TimeoutError from exc
         except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -210,7 +210,7 @@ async def stream(
                 stream=True,
             )
         except TimeoutException as exc:
-            raise TimeoutError from exc
+            raise asyncio.TimeoutError from exc
 
         try:
             yield _ImpitResponse(response)
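
The _impit diff is subtler than the others: it swaps the builtin TimeoutError for asyncio.TimeoutError. On Python 3.11+ the two are the same class, but on 3.10 and earlier they are distinct, so upstream code catching asyncio.TimeoutError would have missed the old exception. A quick self-check of that version behavior:

# asyncio.TimeoutError is an alias of the builtin TimeoutError from Python
# 3.11 on; before that it is a separate Exception subclass.
import asyncio
import sys

if sys.version_info >= (3, 11):
    assert asyncio.TimeoutError is TimeoutError
else:
    assert asyncio.TimeoutError is not TimeoutError

After this commit, all three HTTP clients and the Playwright crawler surface timeouts as asyncio.TimeoutError, so the crawler's retry and failure handling can catch a single type.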

tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py

Lines changed: 41 additions & 0 deletions
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import asyncio
+from datetime import timedelta
 from typing import TYPE_CHECKING
 from unittest import mock
 
@@ -341,3 +343,42 @@ async def handler(context: BeautifulSoupCrawlingContext) -> None:
         await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias)
 
     await crawler.run([str(server_url / 'start_enqueue')])
+
+
+async def test_navigation_timeout_on_slow_request(server_url: URL, http_client: HttpClient) -> None:
+    """Test that navigation_timeout causes TimeoutError on slow HTTP requests."""
+    crawler = BeautifulSoupCrawler(
+        http_client=http_client,
+        navigation_timeout=timedelta(seconds=1),
+        max_request_retries=0,
+    )
+
+    failed_request_handler = mock.AsyncMock()
+    crawler.failed_request_handler(failed_request_handler)
+
+    request_handler = mock.AsyncMock()
+    crawler.router.default_handler(request_handler)
+
+    # Request endpoint that delays 5 seconds - should timeout at 1 second
+    await crawler.run([str(server_url.with_path('/slow').with_query(delay=5))])
+
+    assert failed_request_handler.call_count == 1
+    assert isinstance(failed_request_handler.call_args[0][1], asyncio.TimeoutError)
+
+
+async def test_slow_navigation_does_not_count_toward_handler_timeout(server_url: URL, http_client: HttpClient) -> None:
+    crawler = BeautifulSoupCrawler(
+        http_client=http_client,
+        request_handler_timeout=timedelta(seconds=0.5),
+        max_request_retries=0,
+    )
+
+    request_handler = mock.AsyncMock()
+    crawler.router.default_handler(request_handler)
+
+    # Navigation takes 1 second (exceeds handler timeout), but should still succeed
+    result = await crawler.run([str(server_url.with_path('/slow').with_query(delay=1))])
+
+    assert result.requests_failed == 0
+    assert result.requests_finished == 1
+    assert request_handler.call_count == 1

tests/unit/crawlers/_playwright/test_playwright_crawler.py

Lines changed: 43 additions & 1 deletion
@@ -4,11 +4,13 @@
 
 from __future__ import annotations
 
+import asyncio
 import json
 import logging
+from datetime import timedelta
 from typing import TYPE_CHECKING, Any, Literal
 from unittest import mock
-from unittest.mock import Mock
+from unittest.mock import AsyncMock, Mock
 
 import pytest
 
@@ -925,3 +927,43 @@ async def handler(context: PlaywrightCrawlingContext) -> None:
         await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias)
 
     await crawler.run([str(server_url / 'start_enqueue')])
+
+
+async def test_navigation_timeout_on_slow_page_load(server_url: URL) -> None:
+    crawler = PlaywrightCrawler(
+        navigation_timeout=timedelta(seconds=1),
+        max_request_retries=0,
+    )
+
+    request_handler = AsyncMock()
+    crawler.router.default_handler(request_handler)
+
+    failed_request_handler = AsyncMock()
+    crawler.failed_request_handler(failed_request_handler)
+
+    result = await crawler.run([str((server_url / 'slow').with_query(delay=2))])
+
+    assert result.requests_failed == 1
+    assert result.requests_finished == 0
+
+    assert request_handler.call_count == 0
+
+    assert failed_request_handler.call_count == 1
+    assert isinstance(failed_request_handler.call_args[0][1], asyncio.TimeoutError)
+
+
+async def test_slow_navigation_does_not_count_toward_handler_timeout(server_url: URL) -> None:
+    crawler = PlaywrightCrawler(
+        request_handler_timeout=timedelta(seconds=0.5),
+        max_request_retries=0,
+    )
+
+    request_handler = AsyncMock()
+    crawler.router.default_handler(request_handler)
+
+    # Navigation takes 1 second (exceeds handler timeout), but should still succeed
+    result = await crawler.run([str((server_url / 'slow').with_query(delay=1))])
+
+    assert result.requests_failed == 0
+    assert result.requests_finished == 1
+    assert request_handler.call_count == 1

tests/unit/server.py

Lines changed: 10 additions & 0 deletions
@@ -121,6 +121,7 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None:
         'xml': hello_world_xml,
         'robots.txt': robots_txt,
         'get_compressed': get_compressed,
+        'slow': slow_response,
     }
     path = URL(scope['path']).parts[1]
     # Route requests to appropriate handlers
@@ -411,6 +412,15 @@ async def get_compressed(_scope: dict[str, Any], _receive: Receive, send: Send)
     await send({'type': 'http.response.body', 'body': gzip.compress(HELLO_WORLD * 1000)})
 
 
+async def slow_response(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
+    """Handle requests with a configurable delay to test timeouts."""
+    query_params = get_query_params(scope.get('query_string', b''))
+    delay = float(query_params.get('delay', '5'))  # Default 5 second delay
+
+    await asyncio.sleep(delay)
+    await send_html_response(send, HELLO_WORLD)
+
+
 class TestServer(Server):
     """A test HTTP server implementation based on Uvicorn Server."""
 
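
The new endpoint can also be exercised outside the test suite; a rough manual check (the localhost address and port are assumptions here, since the tests obtain the real address from the server_url fixture):

# Illustrative manual check of the /slow endpoint; the URL is an assumption.
import asyncio

import httpx


async def main() -> None:
    async with httpx.AsyncClient(timeout=10) as client:
        # The server sleeps for `delay` seconds before sending the HTML body.
        response = await client.get('http://127.0.0.1:8080/slow', params={'delay': 2})
        print(response.status_code, len(response.content))


asyncio.run(main())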
