Skip to content

Commit 8e65733

Browse files
committed
Fix short error summary
1 parent 60e39a3 commit 8e65733

File tree

4 files changed

+63
-7
lines changed

4 files changed

+63
-7
lines changed

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1134,7 +1134,7 @@ async def _handle_request_retries(
11341134
request.retry_count += 1
11351135
reduced_error = str(error).split('\n')[0]
11361136
self.log.warning(
1137-
f'Retrying request to {context.request.url} due to: {reduced_error}'
1137+
f'Retrying request to {context.request.url} due to: {reduced_error}. '
11381138
f'{get_one_line_error_summary_if_possible(error)}'
11391139
)
11401140
await self._statistics.error_tracker.add(error=error, context=context)

src/crawlee/crawlers/_basic/_logging_utils.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,21 @@
22
import re
33
import traceback
44

5+
import crawlee.router
6+
57

68
def _get_only_innermost_exception(error: BaseException) -> BaseException:
7-
"""Get innermost exception by following __cause__ and __context__ attributes of exception."""
9+
"""Get innermost exception by following __cause__ and __context__ attributes of exception.
10+
11+
If the innermost exception is UserHandlerTimeoutError, return whatever caused that if possible.
12+
"""
13+
if type(error) is crawlee.router.UserHandlerTimeoutError:
14+
if error.__cause__:
15+
return error.__cause__
16+
if error.__context__:
17+
return error.__context__
18+
return error
19+
820
if error.__cause__:
921
return _get_only_innermost_exception(error.__cause__)
1022
if error.__context__:
@@ -43,13 +55,20 @@ def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
4355
def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:
4456
innermost_error = _get_only_innermost_exception(error)
4557
return traceback.format_exception(
46-
type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=True
58+
type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=False
4759
)
4860

4961

5062
def get_one_line_error_summary_if_possible(error: Exception) -> str:
5163
if isinstance(error, asyncio.exceptions.TimeoutError):
52-
most_relevant_part = ',' + reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1]
64+
relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
65+
most_relevant_part = (',' + relevant_part[-1]) if len(relevant_part) else ''
66+
elif isinstance(error, crawlee.router.UserHandlerTimeoutError):
67+
# Error originated in the user-defined handler. The first two lines should be the location of the `UserHandlerTimeoutError` in crawlee
68+
# code, and the third line should be the topmost user error.
69+
traceback_parts = _get_traceback_parts_for_innermost_exception(error)
70+
relevant_index_from_start = 3
71+
most_relevant_part = traceback_parts[2] if len(traceback_parts) >= relevant_index_from_start else ''
5372
elif 'playwright._impl._errors.Error' in str(error.__class__):
5473
# Playwright autogenerated errors are often very long, so we do not try to summarize them at all as they anyway
5574
# point to deep internals.

src/crawlee/router.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import asyncio
34
from collections.abc import Awaitable, Callable
45
from typing import Generic, TypeVar
56

@@ -95,7 +96,16 @@ async def __call__(self, context: TCrawlingContext) -> None:
9596
f'No handler matches label `{context.request.label}` and no default handler is configured'
9697
)
9798

98-
return await self._default_handler(context)
99+
user_defined_handler = self._default_handler
100+
else:
101+
user_defined_handler = self._handlers_by_label[context.request.label]
99102

100-
handler = self._handlers_by_label[context.request.label]
101-
return await handler(context)
103+
try:
104+
return await user_defined_handler(context)
105+
except asyncio.TimeoutError as e:
106+
# A timeout raised inside the handler, not a timeout of the handler itself.
107+
raise UserHandlerTimeoutError('Timeout raised by user defined handler') from e
108+
109+
110+
class UserHandlerTimeoutError(Exception):
111+
"""Raised when a router fails due to user timeout."""

tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
import json
77
import logging
88
import os
9+
import re
910
import sys
1011
import time
12+
from asyncio import Future
1113
from collections import Counter
1214
from dataclasses import dataclass
1315
from datetime import timedelta
@@ -1565,6 +1567,31 @@ async def handler(context: BasicCrawlingContext) -> None:
15651567
assert found_timeout_message, 'Expected log message about request handler error was not found.'
15661568

15671569

1570+
async def test_reduced_logs_from_time_out_in_request_handler(caplog: pytest.LogCaptureFixture) -> None:
1571+
crawler = BasicCrawler(configure_logging=False)
1572+
1573+
@crawler.router.default_handler
1574+
async def default_handler(_: BasicCrawlingContext) -> None:
1575+
await asyncio.wait_for(Future(), timeout=1)
1576+
1577+
# Capture all logs from the 'crawlee' logger at INFO level or higher
1578+
with caplog.at_level(logging.INFO, logger='crawlee'):
1579+
await crawler.run([Request.from_url('https://a.placeholder.com')])
1580+
1581+
# Check for the one-line summary message
1582+
found_timeout_message = False
1583+
for record in caplog.records:
1584+
if re.match(
1585+
r'Retrying request to .* due to: Timeout raised by user defined handler\. File .*, line .*,'
1586+
r' in default_handler, await asyncio.wait_for\(Future\(\), timeout=1\)',
1587+
record.message,
1588+
):
1589+
found_timeout_message = True
1590+
break
1591+
1592+
assert found_timeout_message, 'Expected log message about request handler error was not found.'
1593+
1594+
15681595
async def test_status_message_callback() -> None:
15691596
"""Test that status message callback is called with the correct message."""
15701597
status_message_callback = AsyncMock()

0 commit comments

Comments
 (0)