
Commit fb85108

Implement navigation_timeout for AbstractHttpCrawler and PlaywrightCrawler
1 parent 1b44070 commit fb85108

11 files changed: +82 -24 lines changed

src/crawlee/crawlers/__init__.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -1,7 +1,7 @@
 from crawlee._utils.try_import import install_import_hook as _install_import_hook
 from crawlee._utils.try_import import try_import as _try_import
 
-from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, ParsedHttpCrawlingContext
+from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, HttpCrawlerOptions, ParsedHttpCrawlingContext
 from ._basic import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext, ContextPipeline
 from ._http import HttpCrawler, HttpCrawlingContext, HttpCrawlingResult
 
@@ -51,6 +51,7 @@
     'BeautifulSoupParserType',
     'ContextPipeline',
     'HttpCrawler',
+    'HttpCrawlerOptions',
     'HttpCrawlingContext',
     'HttpCrawlingResult',
     'ParsedHttpCrawlingContext',
```
src/crawlee/crawlers/_abstract_http/__init__.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -1,9 +1,10 @@
-from ._abstract_http_crawler import AbstractHttpCrawler
+from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
 from ._abstract_http_parser import AbstractHttpParser
 from ._http_crawling_context import ParsedHttpCrawlingContext
 
 __all__ = [
     'AbstractHttpCrawler',
     'AbstractHttpParser',
+    'HttpCrawlerOptions',
     'ParsedHttpCrawlingContext',
 ]
```

src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 18 additions & 1 deletion

```diff
@@ -3,11 +3,12 @@
 import asyncio
 import logging
 from abc import ABC
+from datetime import timedelta
 from typing import TYPE_CHECKING, Any, Generic
 
 from more_itertools import partition
 from pydantic import ValidationError
-from typing_extensions import TypeVar
+from typing_extensions import NotRequired, TypeVar
 
 from crawlee._request import Request, RequestOptions
 from crawlee._utils.docs import docs_group
@@ -32,6 +33,19 @@
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 
 
+class HttpCrawlerOptions(
+    BasicCrawlerOptions[TCrawlingContext, TStatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
+):
+    """Arguments for the `AbstractHttpCrawler` constructor.
+
+    It is intended for typing forwarded `__init__` arguments in the subclasses.
+    """
+
+    navigation_timeout: NotRequired[timedelta | None]
+    """Timeout for the HTTP request."""
+
+
 @docs_group('Crawlers')
 class AbstractHttpCrawler(
     BasicCrawler[TCrawlingContext, StatisticsState],
@@ -56,9 +70,11 @@ def __init__(
         self,
         *,
         parser: AbstractHttpParser[TParseResult, TSelectResult],
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
     ) -> None:
         self._parser = parser
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
         self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
 
         if '_context_pipeline' not in kwargs:
@@ -219,6 +235,7 @@ async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenera
             session=context.session,
             proxy_info=context.proxy_info,
             statistics=self._statistics,
+            timeout=self._navigation_timeout,
         )
 
         yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)
```
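`HttpCrawlerOptions` exists so that concrete crawlers can type the keyword arguments they forward to `AbstractHttpCrawler`, which is exactly what the BeautifulSoup and Parsel changes below do. A minimal sketch of that pattern; the subclass and its parser argument are hypothetical and not part of this commit:

```python
from typing_extensions import Unpack

from crawlee.crawlers import AbstractHttpCrawler, AbstractHttpParser, HttpCrawlerOptions, ParsedHttpCrawlingContext


class MyHttpCrawler(AbstractHttpCrawler):  # hypothetical subclass; generic parameters omitted for brevity
    def __init__(
        self,
        *,
        parser: AbstractHttpParser,
        **kwargs: Unpack[HttpCrawlerOptions[ParsedHttpCrawlingContext]],
    ) -> None:
        # Because **kwargs is typed with HttpCrawlerOptions, callers may pass navigation_timeout
        # here; AbstractHttpCrawler.__init__ stores it (defaulting to 1 minute) and later forwards
        # it as `timeout` to the HTTP client's crawl() call.
        super().__init__(parser=parser, **kwargs)
```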

src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -5,7 +5,7 @@
 from bs4 import BeautifulSoup, Tag
 
 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
 
 from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
 from ._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType
@@ -58,7 +58,7 @@ def __init__(
         self,
         *,
         parser: BeautifulSoupParserType = 'lxml',
-        **kwargs: Unpack[BasicCrawlerOptions[BeautifulSoupCrawlingContext]],
+        **kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]],
     ) -> None:
         """Initialize a new instance.
 
```
src/crawlee/crawlers/_parsel/_parsel_crawler.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -5,7 +5,7 @@
 from parsel import Selector
 
 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
 
 from ._parsel_crawling_context import ParselCrawlingContext
 from ._parsel_parser import ParselParser
@@ -56,7 +56,7 @@ async def request_handler(context: ParselCrawlingContext) -> None:
 
     def __init__(
         self,
-        **kwargs: Unpack[BasicCrawlerOptions[ParselCrawlingContext]],
+        **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]],
     ) -> None:
         """Initialize a new instance.
 
```

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 9 additions & 1 deletion

```diff
@@ -3,6 +3,7 @@
 import asyncio
 import logging
 import warnings
+from datetime import timedelta
 from functools import partial
 from typing import TYPE_CHECKING, Any, Generic, Literal
 
@@ -106,6 +107,7 @@ def __init__(
         fingerprint_generator: FingerprintGenerator | None | Literal['default'] = 'default',
         headless: bool | None = None,
         use_incognito_pages: bool | None = None,
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]],
     ) -> None:
         """Initialize a new instance.
@@ -131,6 +133,8 @@ def __init__(
             use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
                 own context that is destroyed once the page is closed or crashes.
                 This option should not be used if `browser_pool` is provided.
+            navigation_timeout: Timeout for navigation (the process between opening a Playwright page and calling
+                the request handler)
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
         configuration = kwargs.pop('configuration', None)
@@ -199,6 +203,8 @@ def __init__(
         if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
             kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
 
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
+
         super().__init__(**kwargs)
 
     async def _open_page(
@@ -294,7 +300,9 @@ async def _navigate(
         # Set route_handler only for current request
         await context.page.route(context.request.url, route_handler)
 
-        response = await context.page.goto(context.request.url)
+        response = await asyncio.wait_for(
+            context.page.goto(context.request.url), timeout=self._navigation_timeout.total_seconds()
+        )
 
         if response is None:
             raise SessionError(f'Failed to load the URL: {context.request.url}')
```
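For `PlaywrightCrawler` the timeout wraps `page.goto()` in `asyncio.wait_for`, so a navigation that exceeds it fails with `asyncio.TimeoutError` instead of hanging. A rough usage sketch; the start URL and handler body are placeholders:

```python
import asyncio
from datetime import timedelta

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    # Navigation (from opening the page up to the handler call) must finish within
    # 15 seconds; without the argument the crawler falls back to the 1-minute default.
    crawler = PlaywrightCrawler(navigation_timeout=timedelta(seconds=15), headless=True)

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Visited {context.request.url}')

    await crawler.run(['https://crawlee.dev'])  # placeholder start URL


if __name__ == '__main__':
    asyncio.run(main())
```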

src/crawlee/crawlers/_playwright/_playwright_http_client.py

Lines changed: 7 additions & 1 deletion

```diff
@@ -59,6 +59,7 @@ async def crawl(
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         raise NotImplementedError('The `crawl` method should not be used for `PlaywrightHttpClient`')
 
@@ -72,6 +73,7 @@ async def send_request(
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         # `proxy_info` are not used because `APIRequestContext` inherits the proxy from `BrowserContext`
         # TODO: Use `session` to restore all the fingerprint headers according to the `BrowserContext`, after resolved
@@ -87,7 +89,11 @@ async def send_request(
 
         # Proxies appropriate to the browser context are used
         response = await browser_context.request.fetch(
-            url_or_request=url, method=method.lower(), headers=dict(headers) if headers else None, data=payload
+            url_or_request=url,
+            method=method.lower(),
+            headers=dict(headers) if headers else None,
+            data=payload,
+            timeout=timeout.total_seconds() if timeout else None,
         )
 
         return await PlaywrightHttpResponse.from_playwright_response(response, protocol='')
```

src/crawlee/http_clients/_base.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -104,6 +104,7 @@ async def crawl(
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         """Perform the crawling for a given request.
 
@@ -114,6 +115,7 @@ async def crawl(
             session: The session associated with the request.
             proxy_info: The information about the proxy to be used.
             statistics: The statistics object to register status codes.
+            timeout: Request timeout
 
         Raises:
             ProxyError: Raised if a proxy-related error occurs.
@@ -132,6 +134,7 @@ async def send_request(
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         """Send an HTTP request via the client.
 
@@ -144,6 +147,7 @@ async def send_request(
             payload: The data to be sent as the request body.
             session: The session associated with the request.
             proxy_info: The information about the proxy to be used.
+            timeout: Request timeout
 
         Raises:
             ProxyError: Raised if a proxy-related error occurs.
```
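Since `timeout` is now part of the abstract `HttpClient` contract, it can also be supplied when a client is used directly, outside a crawler. A minimal sketch with `HttpxHttpClient`; the URL is a placeholder and the import path is assumed from the current crawlee layout:

```python
import asyncio
from datetime import timedelta

from crawlee.http_clients import HttpxHttpClient  # assumed export


async def main() -> None:
    client = HttpxHttpClient()
    # Per-call budget: the httpx-based client converts the timedelta to an httpx.Timeout;
    # passing None keeps the client's own default behaviour.
    response = await client.send_request('https://crawlee.dev', timeout=timedelta(seconds=10))
    print(response.status_code)


if __name__ == '__main__':
    asyncio.run(main())
```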

src/crawlee/http_clients/_curl_impersonate.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -147,6 +147,7 @@ async def crawl(
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None)
 
@@ -157,6 +158,7 @@ async def crawl(
                 headers=request.headers,
                 data=request.payload,
                 cookies=session.cookies.jar if session else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
         except CurlRequestError as exc:
             if self._is_proxy_error(exc):
@@ -186,6 +188,7 @@ async def send_request(
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})
@@ -200,6 +203,7 @@ async def send_request(
                 headers=dict(headers) if headers else None,
                 data=payload,
                 cookies=session.cookies.jar if session else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
         except CurlRequestError as exc:
             if self._is_proxy_error(exc):
```

src/crawlee/http_clients/_httpx.py

Lines changed: 7 additions & 5 deletions

```diff
@@ -146,6 +146,7 @@ async def crawl(
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None)
         headers = self._combine_headers(request.headers)
@@ -157,6 +158,7 @@ async def crawl(
             content=request.payload,
             cookies=session.cookies.jar if session else None,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
+            timeout=timeout.total_seconds() if timeout is not None else httpx.USE_CLIENT_DEFAULT,
         )
 
         try:
@@ -185,6 +187,7 @@ async def send_request(
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         client = self._get_client(proxy_info.url if proxy_info else None)
 
@@ -195,6 +198,7 @@ async def send_request(
             headers=headers,
             payload=payload,
             session=session,
+            timeout=httpx.Timeout(timeout.total_seconds()) if timeout is not None else None,
         )
 
         try:
@@ -228,7 +232,7 @@ async def stream(
             headers=headers,
             payload=payload,
             session=session,
-            timeout=timeout,
+            timeout=httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None,
         )
 
         response = await client.send(http_request, stream=True)
@@ -246,23 +250,21 @@ def _build_request(
         headers: HttpHeaders | dict[str, str] | None,
         payload: HttpPayload | None,
         session: Session | None = None,
-        timeout: timedelta | None = None,
+        timeout: httpx.Timeout | None = None,
     ) -> httpx.Request:
         """Build an `httpx.Request` using the provided parameters."""
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})
 
         headers = self._combine_headers(headers)
 
-        httpx_timeout = httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None
-
         return client.build_request(
             url=url,
             method=method,
             headers=dict(headers) if headers else None,
             content=payload,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
-            timeout=httpx_timeout,
+            timeout=timeout if timeout else httpx.USE_CLIENT_DEFAULT,
         )
 
     def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient:
```
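The httpx client maps the `timedelta` differently per call: `crawl()` and `send_request()` apply one budget to every request phase, while `stream()` bounds only the connection phase so that long-running body reads are not cut off. Roughly, in httpx terms:

```python
from datetime import timedelta

import httpx

navigation_timeout = timedelta(seconds=30)

# crawl() / send_request(): each phase (connect, read, write, pool) gets the same 30 s limit.
total = httpx.Timeout(navigation_timeout.total_seconds())

# stream(): only establishing the connection is bounded; reading the streamed body is not.
connect_only = httpx.Timeout(None, connect=navigation_timeout.total_seconds())

print(total)
print(connect_only)
```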
