diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index d772934df0..a087ffebc8 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -1031,8 +1031,9 @@ async def _handle_request_retries( if self._should_retry_request(context, error): request.retry_count += 1 + reduced_error = str(error).split('\n')[0] self.log.warning( - f'Retrying request to {context.request.url} due to: {error} \n' + f'Retrying request to {context.request.url} due to: {reduced_error}' f'{get_one_line_error_summary_if_possible(error)}' ) await self._statistics.error_tracker.add(error=error, context=context) diff --git a/src/crawlee/crawlers/_basic/_logging_utils.py b/src/crawlee/crawlers/_basic/_logging_utils.py index 22198aff0f..f9a8306458 100644 --- a/src/crawlee/crawlers/_basic/_logging_utils.py +++ b/src/crawlee/crawlers/_basic/_logging_utils.py @@ -49,7 +49,11 @@ def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]: def get_one_line_error_summary_if_possible(error: Exception) -> str: if isinstance(error, asyncio.exceptions.TimeoutError): - most_relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1] + most_relevant_part = ',' + reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1] + elif 'playwright._impl._errors.Error' in str(error.__class__): + # Playwright autogenerated errors are often very long, so we do not try to summarize them at all, as they + # point to deep internals anyway. + return '' else: traceback_parts = _get_traceback_parts_for_innermost_exception(error) # Commonly last traceback part is type of the error, and the second last part is the relevant file. 
diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index a77d92d8fc..eff582e603 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -1368,7 +1368,7 @@ async def handler(context: BasicCrawlingContext) -> None: for record in caplog.records: if record.message and 'timed out after 1.0 seconds' in record.message: full_message = (record.message or '') + (record.exc_text or '') - assert Counter(full_message)['\n'] < 10 + assert '\n' not in full_message assert '# INJECTED DELAY' in full_message found_timeout_message = True break diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py index 97386e2cea..2f52cac163 100644 --- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py +++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py @@ -5,6 +5,7 @@ from __future__ import annotations import json +import logging from typing import TYPE_CHECKING, Any, Literal from unittest import mock from unittest.mock import Mock @@ -758,3 +759,28 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: assert len(extracted_links) == 1 assert extracted_links[0] == str(server_url / 'page_1') + + +async def test_reduced_logs_from_playwright_navigation_timeout(caplog: pytest.LogCaptureFixture) -> None: + caplog.set_level(logging.INFO) + crawler = PlaywrightCrawler(configure_logging=False) + non_existent_page = 'https://totally-non-existing-site.com/blablablba' + + # Capture all logs from the 'crawlee' logger at INFO level or higher + with caplog.at_level(logging.INFO, logger='crawlee'): + await crawler.run([Request.from_url(non_existent_page)]) + + expected_summarized_log = ( + f'Retrying request to {non_existent_page} due to: Page.goto: net::ERR_NAME_NOT_RESOLVED at {non_existent_page}' + ) + + # Find the Playwright specific error message in the logs + 
found_playwright_message = False + for record in caplog.records: + if record.message and expected_summarized_log in record.message: + full_message = (record.message or '') + (record.exc_text or '') + assert '\n' not in full_message + found_playwright_message = True + break + + assert found_playwright_message, 'Expected log message about request handler error was not found.'