fix: Only apply requestHandlerTimeout to request handler #1474
```diff
@@ -5,13 +5,16 @@
 from abc import ABC
 from datetime import timedelta
 from typing import TYPE_CHECKING, Any, Generic
+from weakref import WeakKeyDictionary

 from more_itertools import partition
 from pydantic import ValidationError
 from typing_extensions import NotRequired, TypeVar

 from crawlee._request import Request, RequestOptions
+from crawlee._types import BasicCrawlingContext
 from crawlee._utils.docs import docs_group
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.errors import SessionError
@@ -25,7 +28,7 @@
     from typing_extensions import Unpack

     from crawlee import RequestTransformAction
-    from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, ExtractLinksFunction
+    from crawlee._types import EnqueueLinksKwargs, ExtractLinksFunction

     from ._abstract_http_parser import AbstractHttpParser
@@ -76,6 +79,7 @@ def __init__(
         self._parser = parser
         self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
         self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
+        self._shared_navigation_timeouts = WeakKeyDictionary[BasicCrawlingContext, SharedTimeout]()

         if '_context_pipeline' not in kwargs:
             raise ValueError(
@@ -128,8 +132,12 @@ def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpC
     async def _execute_pre_navigation_hooks(
         self, context: BasicCrawlingContext
     ) -> AsyncGenerator[BasicCrawlingContext, None]:
+        self._shared_navigation_timeouts[context] = SharedTimeout(self._navigation_timeout)
+
         for hook in self._pre_navigation_hooks:
-            await hook(context)
+            async with self._shared_navigation_timeouts[context]:
+                await hook(context)

         yield context

     async def _parse_http_response(
@@ -232,13 +240,14 @@ async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenera
         Yields:
             The original crawling context enhanced by HTTP response.
         """
-        result = await self._http_client.crawl(
-            request=context.request,
-            session=context.session,
-            proxy_info=context.proxy_info,
-            statistics=self._statistics,
-            timeout=self._navigation_timeout,
-        )
+        async with self._shared_navigation_timeouts[context] as remaining_timeout:
+            result = await self._http_client.crawl(
+                request=context.request,
+                session=context.session,
+                proxy_info=context.proxy_info,
+                statistics=self._statistics,
+                timeout=remaining_timeout,
+            )

         yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)
```
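The SharedTimeout helper imported from crawlee._utils.time is not part of this diff. Based on how it is used above (one navigation budget that is entered once per step, handing the remaining time to the HTTP client) and on the behavior the new tests exercise, a minimal sketch of such a class could look roughly like this; it is an illustration under those assumptions, not the actual crawlee implementation:

```python
# Minimal sketch of a SharedTimeout-like helper. The behavior is inferred from how
# the diff above uses the class and from the tests added in this PR; the real
# implementation lives in crawlee._utils.time and may differ.
from __future__ import annotations

import asyncio
import time
from datetime import timedelta
from types import TracebackType


class SharedTimeoutSketch:
    """One timeout budget shared across several sequential `async with` blocks."""

    def __init__(self, timeout: timedelta) -> None:
        self._remaining = timeout.total_seconds()
        self._entered_at: float | None = None
        self._inner: asyncio.Timeout | None = None

    async def __aenter__(self) -> timedelta:
        if self._entered_at is not None:
            raise RuntimeError('SharedTimeout cannot be entered twice')
        self._entered_at = time.monotonic()
        # Everything inside the block is limited to whatever budget is left.
        self._inner = asyncio.timeout(max(self._remaining, 0))
        await self._inner.__aenter__()
        return timedelta(seconds=max(self._remaining, 0))

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        tb: TracebackType | None,
    ) -> bool | None:
        entered_at, self._entered_at = self._entered_at, None
        inner, self._inner = self._inner, None
        if entered_at is not None:
            # Deduct the time spent inside the block from the shared budget.
            self._remaining -= time.monotonic() - entered_at
        return await inner.__aexit__(exc_type, exc, tb) if inner else None
```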
```diff
@@ -0,0 +1,57 @@
+import asyncio
+from datetime import timedelta
+
+import pytest
+
+from crawlee._utils.time import SharedTimeout, measure_time
+
+
+async def test_shared_timeout_tracks_elapsed_time() -> None:
+    timeout_duration = timedelta(seconds=1)
+    shared_timeout = SharedTimeout(timeout_duration)
+
+    # First usage
+    async with shared_timeout:
+        await asyncio.sleep(0.2)
+
+    # Second usage - should have less time remaining
+    async with shared_timeout as remaining:
+        assert remaining < timedelta(seconds=0.85)
+        assert remaining > timedelta(seconds=0)
+
+
+async def test_shared_timeout_expires() -> None:
+    timeout_duration = timedelta(seconds=0.1)
+    shared_timeout = SharedTimeout(timeout_duration)
+
+    with measure_time() as elapsed, pytest.raises(asyncio.TimeoutError):
+        async with shared_timeout:
+            await asyncio.sleep(0.5)
+
+    assert elapsed.wall is not None
+    assert elapsed.wall < 0.3
+
+
+async def test_shared_timeout_cannot_be_nested() -> None:
+    timeout_duration = timedelta(seconds=1)
+    shared_timeout = SharedTimeout(timeout_duration)
+
+    async with shared_timeout:
+        with pytest.raises(RuntimeError, match='cannot be entered twice'):
+            async with shared_timeout:
+                pass
+
+
+async def test_shared_timeout_multiple_sequential_uses() -> None:
+    """Test that SharedTimeout can be used multiple times sequentially."""
+    timeout_duration = timedelta(seconds=1)
+    shared_timeout = SharedTimeout(timeout_duration)
+
+    for _ in range(5):
+        async with shared_timeout:
+            await asyncio.sleep(0.05)
+
+    # Should have consumed roughly 0.25 seconds
+    async with shared_timeout as remaining:
+        assert remaining < timedelta(seconds=0.8)
+        assert remaining > timedelta(seconds=0)
```
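Put together, the intent of the change is that pre-navigation hooks and the HTTP request draw from one shared navigation budget instead of each getting the full navigation timeout. A small standalone example of that behavior, using only the SharedTimeout API exercised by the diff and tests above (constructor taking a timedelta, async with, and the remaining budget yielded on entry):

```python
# Illustration of the budget sharing this PR relies on: a hook consumes part of the
# navigation budget, and the HTTP request only gets whatever is left of it.
import asyncio
from datetime import timedelta

from crawlee._utils.time import SharedTimeout


async def main() -> None:
    navigation_budget = SharedTimeout(timedelta(seconds=2))

    # Stand-in for a pre-navigation hook that takes 0.5 s of the budget.
    async with navigation_budget:
        await asyncio.sleep(0.5)

    # Stand-in for the HTTP request step: it sees only the remaining budget.
    async with navigation_budget as remaining:
        print(f'Remaining for the HTTP request: {remaining.total_seconds():.2f}s')


asyncio.run(main())
```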
Nice idea. I like this a lot.
Just thinking out loud about where this could lead:
If we ever need more granular timeouts for specific steps, I think this class could easily be expanded to support that. I am also wondering whether even the final context consumer (the request handler) could just take its timeout from here, if the timeout is set on the context (see my other comment).
I'm not sure I follow: do you think the request handler should be limited by a shared timeout, or that it should have access to the remaining timeout "budget"?
Please don't focus on the specific interface in my example; it is just about the capability.
Imagine that the context were created something like this:
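A hypothetical sketch of that idea; ContextWithStepTimeouts and step_timeouts are illustrative names only, not an existing crawlee API:

```python
# Hypothetical sketch only: a context that carries one shared timeout per step,
# so each timeout-protected consumer picks its own budget. None of these names
# exist in crawlee; only SharedTimeout comes from the actual codebase.
from dataclasses import dataclass, field
from datetime import timedelta

from crawlee._utils.time import SharedTimeout


@dataclass
class ContextWithStepTimeouts:
    """Stand-in for a crawling context carrying per-step timeout budgets."""

    step_timeouts: dict[str, SharedTimeout] = field(default_factory=dict)


context = ContextWithStepTimeouts(
    step_timeouts={
        'navigation': SharedTimeout(timedelta(minutes=1)),
        'request_handler': SharedTimeout(timedelta(minutes=1)),
    }
)

# Each timeout-protected consumer would pick the relevant budget from the context:
#     async with context.step_timeouts['navigation'] as remaining:
#         ...
# and an earlier step (e.g. a pre-navigation hook) could mutate the budget of a
# later one, e.g. shrink the request handler timeout:
#     context.step_timeouts['request_handler'] = SharedTimeout(timedelta(seconds=10))
```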
And each timeout-protected context consumer would pick the relevant timeout from the context and apply it. Context consumers could even modify the timeouts of the steps that follow them.
For example, users could mutate the "RequestHandler" timeout in pre-navigation hooks.
No need to block this change on this. If needed, we can discuss it here: #1596