Skip to content

Commit 80b5fa8

Browse files
authored
chore: Further reduce retry error summary in log (#1370)
### Description - Further reduce retry error summary in log - no more multiline - Ignore details of `Playwright` internal errors in warning summary ### Issues - Closes: #1325
1 parent 4bd19a0 commit 80b5fa8

File tree

4 files changed

+34
-3
lines changed

4 files changed

+34
-3
lines changed

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1031,8 +1031,9 @@ async def _handle_request_retries(
10311031

10321032
if self._should_retry_request(context, error):
10331033
request.retry_count += 1
1034+
reduced_error = str(error).split('\n')[0]
10341035
self.log.warning(
1035-
f'Retrying request to {context.request.url} due to: {error} \n'
1036+
f'Retrying request to {context.request.url} due to: {reduced_error}'
10361037
f'{get_one_line_error_summary_if_possible(error)}'
10371038
)
10381039
await self._statistics.error_tracker.add(error=error, context=context)

src/crawlee/crawlers/_basic/_logging_utils.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,11 @@ def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:
4949

5050
def get_one_line_error_summary_if_possible(error: Exception) -> str:
5151
if isinstance(error, asyncio.exceptions.TimeoutError):
52-
most_relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1]
52+
most_relevant_part = ',' + reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1]
53+
elif 'playwright._impl._errors.Error' in str(error.__class__):
54+
# Playwright autogenerated errors are often very long, so we do not try to summarize them at all, since they
55+
# point to deep internals anyway.
56+
return ''
5357
else:
5458
traceback_parts = _get_traceback_parts_for_innermost_exception(error)
5559
# Commonly last traceback part is type of the error, and the second last part is the relevant file.

tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1368,7 +1368,7 @@ async def handler(context: BasicCrawlingContext) -> None:
13681368
for record in caplog.records:
13691369
if record.message and 'timed out after 1.0 seconds' in record.message:
13701370
full_message = (record.message or '') + (record.exc_text or '')
1371-
assert Counter(full_message)['\n'] < 10
1371+
assert '\n' not in full_message
13721372
assert '# INJECTED DELAY' in full_message
13731373
found_timeout_message = True
13741374
break

tests/unit/crawlers/_playwright/test_playwright_crawler.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from __future__ import annotations
66

77
import json
8+
import logging
89
from typing import TYPE_CHECKING, Any, Literal
910
from unittest import mock
1011
from unittest.mock import Mock
@@ -758,3 +759,28 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
758759

759760
assert len(extracted_links) == 1
760761
assert extracted_links[0] == str(server_url / 'page_1')
762+
763+
764+
async def test_reduced_logs_from_playwright_navigation_timeout(caplog: pytest.LogCaptureFixture) -> None:
765+
caplog.set_level(logging.INFO)
766+
crawler = PlaywrightCrawler(configure_logging=False)
767+
non_existent_page = 'https://totally-non-existing-site.com/blablablba'
768+
769+
# Capture all logs from the 'crawlee' logger at INFO level or higher
770+
with caplog.at_level(logging.INFO, logger='crawlee'):
771+
await crawler.run([Request.from_url(non_existent_page)])
772+
773+
expected_summarized_log = (
774+
f'Retrying request to {non_existent_page} due to: Page.goto: net::ERR_NAME_NOT_RESOLVED at {non_existent_page}'
775+
)
776+
777+
# Find the Playwright specific error message in the logs
778+
found_playwright_message = False
779+
for record in caplog.records:
780+
if record.message and expected_summarized_log in record.message:
781+
full_message = (record.message or '') + (record.exc_text or '')
782+
assert '\n' not in full_message
783+
found_playwright_message = True
784+
break
785+
786+
assert found_playwright_message, 'Expected log message about request handler error was not found.'

0 commit comments

Comments
 (0)