Skip to content

Commit 6580a03

Browse files
committed
Finalize
1 parent 8e65733 commit 6580a03

File tree

5 files changed

+16
-10
lines changed

5 files changed

+16
-10
lines changed

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@
5959
RequestHandlerError,
6060
SessionError,
6161
UserDefinedErrorHandlerError,
62+
UserHandlerTimeoutError,
6263
)
6364
from crawlee.events._types import Event, EventCrawlerStatusData
6465
from crawlee.http_clients import ImpitHttpClient
@@ -1222,10 +1223,11 @@ def _get_message_from_error(self, error: Exception) -> str:
12221223

12231224
if (
12241225
isinstance(error, asyncio.exceptions.TimeoutError)
1226+
and traceback_parts
12251227
and self._request_handler_timeout_text in traceback_parts[-1]
1226-
):
1228+
) or isinstance(error, UserHandlerTimeoutError):
12271229
used_traceback_parts = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
1228-
used_traceback_parts.append(traceback_parts[-1])
1230+
used_traceback_parts.extend(traceback_parts[-1:])
12291231

12301232
return ''.join(used_traceback_parts).strip('\n')
12311233

src/crawlee/crawlers/_basic/_logging_utils.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import re
33
import traceback
44

5+
import crawlee.errors
56
import crawlee.router
67

78

@@ -10,7 +11,7 @@ def _get_only_innermost_exception(error: BaseException) -> BaseException:
1011
1112
If the innermost exception is UserHandlerTimeoutError, return whatever caused that if possible.
1213
"""
13-
if type(error) is crawlee.router.UserHandlerTimeoutError:
14+
if type(error) is crawlee.errors.UserHandlerTimeoutError:
1415
if error.__cause__:
1516
return error.__cause__
1617
if error.__context__:
@@ -46,7 +47,7 @@ def _strip_pep657_highlighting(traceback_part: str) -> str:
4647

4748

4849
def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
49-
timeout_error: asyncio.exceptions.TimeoutError,
50+
timeout_error: asyncio.exceptions.TimeoutError | crawlee.errors.UserHandlerTimeoutError,
5051
) -> list[str]:
5152
innermost_error_traceback_parts = _get_traceback_parts_for_innermost_exception(timeout_error)
5253
return _get_filtered_traceback_parts_for_asyncio_timeout_error(innermost_error_traceback_parts)
@@ -63,7 +64,7 @@ def get_one_line_error_summary_if_possible(error: Exception) -> str:
6364
if isinstance(error, asyncio.exceptions.TimeoutError):
6465
relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
6566
most_relevant_part = (',' + relevant_part[-1]) if len(relevant_part) else ''
66-
elif isinstance(error, crawlee.router.UserHandlerTimeoutError):
67+
elif isinstance(error, crawlee.errors.UserHandlerTimeoutError):
6768
# The error was raised inside a user-defined handler. The first two lines should be the location of the `UserHandlerTimeoutError` in crawlee
6869
# code, and the third line the topmost user error.
6970
traceback_parts = _get_traceback_parts_for_innermost_exception(error)

src/crawlee/errors.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,10 @@ class UserDefinedErrorHandlerError(Exception):
2929
"""Wraps an exception thrown from a user-defined error handler."""
3030

3131

32+
class UserHandlerTimeoutError(UserDefinedErrorHandlerError):
33+
"""Raised when a router fails due to a timeout raised inside a user-defined handler. This is different from the user-defined handler itself timing out."""
34+
35+
3236
@docs_group('Errors')
3337
class SessionError(Exception):
3438
"""Errors of `SessionError` type will trigger a session rotation.

src/crawlee/router.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99

1010
__all__ = ['Router']
1111

12+
from crawlee.errors import UserHandlerTimeoutError
13+
1214
TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext)
1315
RequestHandler = Callable[[TCrawlingContext], Awaitable[None]]
1416

@@ -105,7 +107,3 @@ async def __call__(self, context: TCrawlingContext) -> None:
105107
except asyncio.TimeoutError as e:
106108
# Timeout in handler, but not timeout of handler.
107109
raise UserHandlerTimeoutError('Timeout raised by user defined handler') from e
108-
109-
110-
class UserHandlerTimeoutError(Exception):
111-
"""Raised when a router fails due to user timeout."""

tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1542,6 +1542,7 @@ async def test_reduced_logs_from_timed_out_request_handler(caplog: pytest.LogCap
15421542
caplog.set_level(logging.INFO)
15431543
crawler = BasicCrawler(
15441544
configure_logging=False,
1545+
max_request_retries=1,
15451546
request_handler_timeout=timedelta(seconds=1),
15461547
)
15471548

@@ -1568,7 +1569,7 @@ async def handler(context: BasicCrawlingContext) -> None:
15681569

15691570

15701571
async def test_reduced_logs_from_time_out_in_request_handler(caplog: pytest.LogCaptureFixture) -> None:
1571-
crawler = BasicCrawler(configure_logging=False)
1572+
crawler = BasicCrawler(configure_logging=False, max_request_retries=1)
15721573

15731574
@crawler.router.default_handler
15741575
async def default_handler(_: BasicCrawlingContext) -> None:

0 commit comments

Comments
 (0)