3 changes: 2 additions & 1 deletion src/crawlee/crawlers/_basic/_basic_crawler.py

@@ -1031,8 +1031,9 @@ async def _handle_request_retries(

         if self._should_retry_request(context, error):
             request.retry_count += 1
+            reduced_error = str(error).split('\n')[0]
             self.log.warning(
-                f'Retrying request to {context.request.url} due to: {error} \n'
+                f'Retrying request to {context.request.url} due to: {reduced_error}'
                 f'{get_one_line_error_summary_if_possible(error)}'
             )
             await self._statistics.error_tracker.add(error=error, context=context)
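As a side note, a minimal sketch (not part of the PR) of what the first-line reduction buys: str(error) can span several lines, and split('\n')[0] keeps only the headline for the retry warning. The error text below is invented for illustration.

# Hypothetical multi-line error message, invented for illustration.
error = TimeoutError(
    'Request timed out after 30 seconds\n'
    '  while waiting for response headers\n'
    '  at https://example.com/slow-endpoint'
)

# The change keeps only the first line for the one-line retry warning.
reduced_error = str(error).split('\n')[0]
print(reduced_error)  # -> Request timed out after 30 seconds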
6 changes: 5 additions & 1 deletion src/crawlee/crawlers/_basic/_logging_utils.py

@@ -49,7 +49,11 @@ def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:

 def get_one_line_error_summary_if_possible(error: Exception) -> str:
     if isinstance(error, asyncio.exceptions.TimeoutError):
-        most_relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1]
+        most_relevant_part = ',' + reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1]
+    elif 'playwright._impl._errors.Error' in str(error.__class__):
+        # Playwright autogenerated errors are often very long, so we do not try to summarize them at all;
+        # they point to deep internals anyway.
+        return ''
     else:
         traceback_parts = _get_traceback_parts_for_innermost_exception(error)
         # Commonly the last traceback part is the type of the error, and the second-to-last part is the relevant file.
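For readability, a self-contained sketch of the same dispatch idea follows. It is a hypothetical standalone rewrite, not the module's actual helper: the real function delegates to reduce_asyncio_timeout_error_to_relevant_traceback_parts and _get_traceback_parts_for_innermost_exception, which are not shown in this diff. The leading comma mirrors the PR's change, since the summary is appended directly after the reduced error text in the warning line.

import asyncio
import traceback


def one_line_summary(error: Exception) -> str:
    # Hypothetical sketch of get_one_line_error_summary_if_possible.
    if isinstance(error, asyncio.exceptions.TimeoutError):
        # Keep only the innermost traceback frame: it names the code that stalled.
        frames = traceback.extract_tb(error.__traceback__)
        return f', {frames[-1].line}' if frames else ''
    if 'playwright._impl._errors.Error' in str(type(error)):
        # Playwright autogenerated errors are long and point to deep internals; skip them.
        return ''
    # Fall back to the error type plus the first line of its message.
    return f', {type(error).__name__}: {error}'.split('\n')[0]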
2 changes: 1 addition & 1 deletion tests/unit/crawlers/_basic/test_basic_crawler.py

@@ -1368,7 +1368,7 @@ async def handler(context: BasicCrawlingContext) -> None:
     for record in caplog.records:
         if record.message and 'timed out after 1.0 seconds' in record.message:
             full_message = (record.message or '') + (record.exc_text or '')
-            assert Counter(full_message)['\n'] < 10
+            assert '\n' not in full_message
             assert '# INJECTED DELAY' in full_message
             found_timeout_message = True
             break
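The tightened assertion leans on pytest's caplog fixture, which formats each captured record so that record.message is populated and record.exc_text carries any formatted exception. A minimal sketch of that pattern (the logger name and warning text are invented for illustration):

import logging


def test_warning_is_single_line(caplog):
    # Emit a sample warning on an illustrative logger.
    with caplog.at_level(logging.WARNING, logger='crawlee'):
        logging.getLogger('crawlee').warning('Retrying request due to: boom')

    for record in caplog.records:
        # exc_text is None unless the record was logged with exc_info.
        full_message = (record.message or '') + (record.exc_text or '')
        assert '\n' not in full_message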
26 changes: 26 additions & 0 deletions tests/unit/crawlers/_playwright/test_playwright_crawler.py

@@ -5,6 +5,7 @@
 from __future__ import annotations

 import json
+import logging
 from typing import TYPE_CHECKING, Any, Literal
 from unittest import mock
 from unittest.mock import Mock
@@ -758,3 +759,28 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:

     assert len(extracted_links) == 1
     assert extracted_links[0] == str(server_url / 'page_1')
+
+
+async def test_reduced_logs_from_playwright_navigation_timeout(caplog: pytest.LogCaptureFixture) -> None:
+    caplog.set_level(logging.INFO)
+    crawler = PlaywrightCrawler(configure_logging=False)
+    non_existent_page = 'https://totally-non-existing-site.com/blablablba'
+
+    # Capture all logs from the 'crawlee' logger at INFO level or higher.
+    with caplog.at_level(logging.INFO, logger='crawlee'):
+        await crawler.run([Request.from_url(non_existent_page)])
+
+    expected_summarized_log = (
+        f'Retrying request to {non_existent_page} due to: Page.goto: net::ERR_NAME_NOT_RESOLVED at {non_existent_page}'
+    )
+
+    # Find the Playwright-specific error message in the logs.
+    found_playwright_message = False
+    for record in caplog.records:
+        if record.message and expected_summarized_log in record.message:
+            full_message = (record.message or '') + (record.exc_text or '')
+            assert '\n' not in full_message
+            found_playwright_message = True
+            break
+
+    assert found_playwright_message, 'Expected summarized Playwright retry log message was not found.'
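Net effect on the logs, reconstructed from the test's expected string (the exact pre-change output is not shown in this diff, so the description of the old multi-line shape is an approximation): a failed Playwright navigation previously logged the retry warning together with Playwright's multi-line call log, while after this change it collapses to a single line such as:

Retrying request to https://totally-non-existing-site.com/blablablba due to: Page.goto: net::ERR_NAME_NOT_RESOLVED at https://totally-non-existing-site.com/blablablba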