Commit b751208

fix: Fix short error summary (#1605)
### Description

- Wrap `Timeout` errors raised by a user-defined handler in `UserHandlerTimeoutError` (otherwise they would be misread as `Request handler timed out after 60 s`).
- Ensure that `get_one_line_error_summary_if_possible` does not access a non-existing index.

### Issues

- Closes: #1602

### Testing

- Unit tests

### Checklist

- [ ] CI passed
1 parent c56663a commit b751208
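
For context on the first point, the ambiguity this commit removes is easy to reproduce with plain `asyncio`. The following is a minimal sketch outside crawlee (the function names and the 60-second outer guard are illustrative, mirroring the crawler's handler timeout):

```python
import asyncio


async def user_handler() -> None:
    # The user's own timeout fires inside the handler body...
    await asyncio.wait_for(asyncio.sleep(10), timeout=0.1)


async def crawler_side() -> None:
    try:
        # ...while the crawler also guards the whole handler with its own,
        # much longer timeout, so both failures surface as asyncio.TimeoutError.
        await asyncio.wait_for(user_handler(), timeout=60)
    except asyncio.TimeoutError:
        # Without a wrapper type there is no way to tell which timeout fired,
        # hence the misleading 'Request handler timed out after 60 s' summary.
        print('Timed out - but whose timeout was it?')


asyncio.run(crawler_side())
```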

File tree

5 files changed: +71 −10 lines changed


src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 5 additions & 3 deletions
```diff
@@ -59,6 +59,7 @@
     RequestHandlerError,
     SessionError,
     UserDefinedErrorHandlerError,
+    UserHandlerTimeoutError,
 )
 from crawlee.events._types import Event, EventCrawlerStatusData
 from crawlee.http_clients import ImpitHttpClient
@@ -1134,7 +1135,7 @@ async def _handle_request_retries(
         request.retry_count += 1
         reduced_error = str(error).split('\n')[0]
         self.log.warning(
-            f'Retrying request to {context.request.url} due to: {reduced_error}'
+            f'Retrying request to {context.request.url} due to: {reduced_error}. '
             f'{get_one_line_error_summary_if_possible(error)}'
         )
         await self._statistics.error_tracker.add(error=error, context=context)
@@ -1221,10 +1222,11 @@ def _get_message_from_error(self, error: Exception) -> str:
 
         if (
             isinstance(error, asyncio.exceptions.TimeoutError)
+            and traceback_parts
             and self._request_handler_timeout_text in traceback_parts[-1]
-        ):
+        ) or isinstance(error, UserHandlerTimeoutError):
             used_traceback_parts = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
-            used_traceback_parts.append(traceback_parts[-1])
+            used_traceback_parts.extend(traceback_parts[-1:])
 
         return ''.join(used_traceback_parts).strip('\n')
 
```
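
A side note on the last change in this hunk: replacing `append(traceback_parts[-1])` with `extend(traceback_parts[-1:])` is what makes the empty-list case safe, because slicing never raises where indexing would. A quick standalone illustration:

```python
parts: list[str] = []

# Indexing an empty list raises, which was the original bug surface:
# parts[-1]  -> IndexError
# Slicing the same position just yields an empty list instead:
assert parts[-1:] == []

# So extend() with a slice is a no-op on empty input, while append()
# with an index would have crashed before reaching this line.
summary: list[str] = ['Traceback (most recent call last):\n']
summary.extend(parts[-1:])
assert summary == ['Traceback (most recent call last):\n']
```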

src/crawlee/crawlers/_basic/_logging_utils.py

Lines changed: 23 additions & 4 deletions
```diff
@@ -2,9 +2,21 @@
 import re
 import traceback
 
+import crawlee.errors
+
 
 def _get_only_innermost_exception(error: BaseException) -> BaseException:
-    """Get innermost exception by following __cause__ and __context__ attributes of exception."""
+    """Get innermost exception by following __cause__ and __context__ attributes of exception.
+
+    If the innermost exception is UserHandlerTimeoutError, return whatever caused that if possible.
+    """
+    if type(error) is crawlee.errors.UserHandlerTimeoutError:
+        if error.__cause__:
+            return error.__cause__
+        if error.__context__:
+            return error.__context__
+        return error
+
 
     if error.__cause__:
         return _get_only_innermost_exception(error.__cause__)
     if error.__context__:
```
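
The unwrapping above relies on how Python chains exceptions: `raise ... from e` sets `__cause__`, while a bare raise inside an `except` block sets `__context__`. A standalone sketch of those mechanics (with a local stand-in wrapper class rather than an import from crawlee):

```python
class WrapperError(Exception):
    """Stand-in for UserHandlerTimeoutError in this sketch."""


def innermost(error: BaseException) -> BaseException:
    # Follow the explicit cause first, then the implicit context,
    # as the crawlee helper does.
    if error.__cause__:
        return innermost(error.__cause__)
    if error.__context__:
        return innermost(error.__context__)
    return error


try:
    try:
        raise ValueError('the real problem')
    except ValueError as e:
        raise WrapperError('wrapped') from e  # `from e` populates __cause__
except WrapperError as wrapper:
    # Unwinding leads back to the exception the user actually hit.
    assert isinstance(innermost(wrapper), ValueError)
```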
```diff
@@ -34,7 +46,7 @@ def _strip_pep657_highlighting(traceback_part: str) -> str:
 
 
 def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
-    timeout_error: asyncio.exceptions.TimeoutError,
+    timeout_error: asyncio.exceptions.TimeoutError | crawlee.errors.UserHandlerTimeoutError,
 ) -> list[str]:
     innermost_error_traceback_parts = _get_traceback_parts_for_innermost_exception(timeout_error)
     return _get_filtered_traceback_parts_for_asyncio_timeout_error(innermost_error_traceback_parts)
```
```diff
@@ -43,13 +55,20 @@ def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
 def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:
     innermost_error = _get_only_innermost_exception(error)
     return traceback.format_exception(
-        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=True
+        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=False
     )
 
 
 def get_one_line_error_summary_if_possible(error: Exception) -> str:
     if isinstance(error, asyncio.exceptions.TimeoutError):
-        most_relevant_part = ',' + reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1]
+        relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
+        most_relevant_part = (',' + relevant_part[-1]) if len(relevant_part) else ''
+    elif isinstance(error, crawlee.errors.UserHandlerTimeoutError):
+        # The error came from a user-defined handler. The first two lines should be the location of
+        # the `UserHandlerTimeoutError` in crawlee code and the third line the topmost user error.
+        traceback_parts = _get_traceback_parts_for_innermost_exception(error)
+        relevant_index_from_start = 3
+        most_relevant_part = traceback_parts[2] if len(traceback_parts) >= relevant_index_from_start else ''
     elif 'playwright._impl._errors.Error' in str(error.__class__):
         # Playwright autogenerated errors are often very long, so we do not try to summarize them at all as they anyway
         # point to deep internals.
```
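
The added bounds checks matter because `traceback.format_exception(..., chain=False)` returns a list whose length depends on traceback depth. A small standalone illustration of the shape being indexed (the three-entry count here is specific to this toy snippet):

```python
import traceback

try:
    raise TimeoutError('boom')
except TimeoutError as e:
    parts = traceback.format_exception(type(e), value=e, tb=e.__traceback__, chain=False)

# parts[0] is 'Traceback (most recent call last):', the middle entries are one
# formatted frame each, and parts[-1] is the final 'TimeoutError: boom' line.
# A shallow traceback can yield fewer than three entries, which is exactly what
# the new `len(traceback_parts) >= relevant_index_from_start` guard protects against.
print(len(parts), repr(parts[-1]))
```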

src/crawlee/errors.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -29,6 +29,10 @@ class UserDefinedErrorHandlerError(Exception):
     """Wraps an exception thrown from an user-defined error handler."""
 
 
+class UserHandlerTimeoutError(UserDefinedErrorHandlerError):
+    """Raised when a router fails due to a user-raised timeout. This is different from the user-defined handler timing out."""
+
+
 @docs_group('Errors')
 class SessionError(Exception):
     """Errors of `SessionError` type will trigger a session rotation.
```
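
Since the new class derives from `UserDefinedErrorHandlerError`, code that already catches the parent type keeps catching the wrapped timeouts. A brief sketch of those catch semantics, with the class bodies stubbed locally rather than imported:

```python
class UserDefinedErrorHandlerError(Exception):
    """Stub of the parent class from crawlee.errors."""


class UserHandlerTimeoutError(UserDefinedErrorHandlerError):
    """Stub of the subclass added by this commit."""


try:
    raise UserHandlerTimeoutError('Timeout raised by user defined handler')
except UserDefinedErrorHandlerError as e:
    # Existing except clauses written against the parent class still match.
    assert isinstance(e, UserHandlerTimeoutError)
```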

src/crawlee/router.py

Lines changed: 11 additions & 3 deletions
```diff
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import asyncio
 from collections.abc import Awaitable, Callable
 from typing import Generic, TypeVar
 
@@ -9,6 +10,8 @@
 
 __all__ = ['Router']
 
+from crawlee.errors import UserHandlerTimeoutError
+
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext)
 RequestHandler = Callable[[TCrawlingContext], Awaitable[None]]
 
@@ -97,7 +100,12 @@ async def __call__(self, context: TCrawlingContext) -> None:
                 f'No handler matches label `{context.request.label}` and no default handler is configured'
             )
 
-            return await self._default_handler(context)
+            user_defined_handler = self._default_handler
+        else:
+            user_defined_handler = self._handlers_by_label[context.request.label]
 
-        handler = self._handlers_by_label[context.request.label]
-        return await handler(context)
+        try:
+            return await user_defined_handler(context)
+        except asyncio.TimeoutError as e:
+            # Timeout in handler, but not timeout of handler.
+            raise UserHandlerTimeoutError('Timeout raised by user defined handler') from e
```
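
The `from e` in the new `except` clause is what later lets the logging utilities walk back to the user's frame, since the original `TimeoutError` stays reachable via `__cause__`. A standalone sketch of the same pattern (all names local to this example):

```python
import asyncio
from collections.abc import Awaitable, Callable


class UserHandlerTimeoutError(Exception):
    """Stand-in for crawlee.errors.UserHandlerTimeoutError."""


async def user_defined_handler() -> None:
    await asyncio.wait_for(asyncio.sleep(10), timeout=0.01)


async def call(handler: Callable[[], Awaitable[None]]) -> None:
    try:
        await handler()
    except asyncio.TimeoutError as e:
        # Timeout in the handler, not a timeout of the handler itself.
        raise UserHandlerTimeoutError('Timeout raised by user defined handler') from e


try:
    asyncio.run(call(user_defined_handler))
except UserHandlerTimeoutError as wrapped:
    # The original TimeoutError and its traceback remain reachable for summaries.
    assert isinstance(wrapped.__cause__, asyncio.TimeoutError)
```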

tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 28 additions & 0 deletions
```diff
@@ -6,8 +6,10 @@
 import json
 import logging
 import os
+import re
 import sys
 import time
+from asyncio import Future
 from collections import Counter
 from dataclasses import dataclass
 from datetime import timedelta
@@ -1540,6 +1542,7 @@ async def test_reduced_logs_from_timed_out_request_handler(caplog: pytest.LogCap
     caplog.set_level(logging.INFO)
     crawler = BasicCrawler(
         configure_logging=False,
+        max_request_retries=1,
         request_handler_timeout=timedelta(seconds=1),
     )
 
@@ -1565,6 +1568,31 @@ async def handler(context: BasicCrawlingContext) -> None:
     assert found_timeout_message, 'Expected log message about request handler error was not found.'
 
 
+async def test_reduced_logs_from_time_out_in_request_handler(caplog: pytest.LogCaptureFixture) -> None:
+    crawler = BasicCrawler(configure_logging=False, max_request_retries=1)
+
+    @crawler.router.default_handler
+    async def default_handler(_: BasicCrawlingContext) -> None:
+        await asyncio.wait_for(Future(), timeout=1)
+
+    # Capture all logs from the 'crawlee' logger at INFO level or higher
+    with caplog.at_level(logging.INFO, logger='crawlee'):
+        await crawler.run([Request.from_url('https://a.placeholder.com')])
+
+    # Check for 1 line summary message
+    found_timeout_message = False
+    for record in caplog.records:
+        if re.match(
+            r'Retrying request to .* due to: Timeout raised by user defined handler\. File .*, line .*,'
+            r' in default_handler, await asyncio.wait_for\(Future\(\), timeout=1\)',
+            record.message,
+        ):
+            found_timeout_message = True
+            break
+
+    assert found_timeout_message, 'Expected log message about request handler error was not found.'
+
+
 async def test_status_message_callback() -> None:
     """Test that status message callback is called with the correct message."""
     status_message_callback = AsyncMock()
```
