
Commit 3730920

Add many debug logs for flaky test test_timeout_in_handler
1 parent: 3bec9bb

3 files changed: +21 −3 lines

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 11 additions & 2 deletions
@@ -1130,16 +1130,18 @@ async def _handle_request_retries(
             await self._statistics.error_tracker.add(error=error, context=context)
 
             if self._error_handler:
+                self.log.warning('Error handler')
                 try:
                     new_request = await self._error_handler(context, error)
                 except Exception as e:
                     raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e
                 else:
                     if new_request is not None:
                         request = new_request
-
+            self.log.warning('reclaim_request')
             await request_manager.reclaim_request(request)
         else:
+            self.log.warning('mark_request_as_handled')
             await wait_for(
                 lambda: request_manager.mark_request_as_handled(context.request),
                 timeout=self._internal_timeout,
@@ -1148,20 +1150,23 @@ async def _handle_request_retries(
                 logger=self._logger,
                 max_retries=3,
             )
+            self.log.warning('_handle_failed_request')
             await self._handle_failed_request(context, error)
             self._statistics.record_request_processing_failure(request.unique_key)
+            self.log.warning('_handle_request_retries DONE')
 
     async def _handle_request_error(self, context: TCrawlingContext | BasicCrawlingContext, error: Exception) -> None:
         try:
             context.request.state = RequestState.ERROR_HANDLER
-
+            self.log.warning('Before _handle_request_error')
             await wait_for(
                 partial(self._handle_request_retries, context, error),
                 timeout=self._internal_timeout,
                 timeout_message='Handling request failure timed out after '
                 f'{self._internal_timeout.total_seconds()} seconds',
                 logger=self._logger,
             )
+            self.log.warning('After _handle_request_error')
 
             context.request.state = RequestState.DONE
         except UserDefinedErrorHandlerError:
@@ -1414,9 +1419,12 @@ async def __run_task_function(self) -> None:
             try:
                 await self._run_request_handler(context=context)
             except asyncio.TimeoutError as e:
+                context.log.info('RH error')
                 raise RequestHandlerError(e, context) from e
 
+            context.log.info('Commit resutls')
             await self._commit_request_handler_result(context)
+            context.log.info('Marking as handled request')
             await wait_for(
                 lambda: request_manager.mark_request_as_handled(context.request),
                 timeout=self._internal_timeout,
@@ -1431,6 +1439,7 @@ async def __run_task_function(self) -> None:
             if context.session and context.session.is_usable:
                 context.session.mark_good()
 
+            context.log.info('Finished processing request')
            self._statistics.record_request_processing_finish(request.unique_key)
 
        except RequestCollisionError as request_error:
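
For orientation, the branches instrumented above correspond to BasicCrawler's public error hooks: the 'Error handler' and 'reclaim_request' warnings fire on a retryable failure, while '_handle_failed_request' fires once retries are exhausted. A minimal sketch of exercising both paths (the URL, handler bodies, and retry count are illustrative, not part of this commit):

import asyncio
import logging

from crawlee.crawlers import BasicCrawler, BasicCrawlingContext


async def main() -> None:
    crawler = BasicCrawler(max_request_retries=2)
    crawler.log.setLevel(logging.DEBUG)  # makes the new self.log warnings visible

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        raise RuntimeError('simulated failure')  # every request fails, driving the retry path

    @crawler.error_handler  # runs per retryable failure, before the request is reclaimed
    async def on_error(context: BasicCrawlingContext, error: Exception) -> None:
        context.log.warning(f'retrying {context.request.url}: {error}')

    @crawler.failed_request_handler  # runs once retries are exhausted ('_handle_failed_request')
    async def on_failed(context: BasicCrawlingContext, error: Exception) -> None:
        context.log.error(f'giving up on {context.request.url}: {error}')

    await crawler.run(['https://example.com'])


asyncio.run(main())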

src/crawlee/statistics/_error_tracker.py

Lines changed: 4 additions & 0 deletions
@@ -58,6 +58,7 @@ async def add(
             early: Flag indicating that the error is added earlier than usual to have access to resources that will be
                 closed before normal error collection. This prevents double reporting during normal error collection.
         """
+        logger.warning('Adding error')
         if id(error) in self._early_reported_errors:
             # Error had to be collected earlier before relevant resources are closed.
             self._early_reported_errors.remove(id(error))
@@ -98,12 +99,15 @@ async def add(
             == 1
             and context is not None
         ):
+            logger.warning('awaiting snapshot')
             # Save snapshot only on the first occurrence of the error and only if context and kvs was passed as well.
             await self._capture_error_snapshot(
                 error_message=new_error_group_message or error_group_message,
                 file_and_line=error_group_file_and_line,
                 context=context,
             )
+            logger.warning('Snapshot added')
+        logger.warning('Finished')
 
     async def _capture_error_snapshot(
         self, error_message: str, file_and_line: str, context: BasicCrawlingContext
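
Note that these calls go through the module-level logger of `_error_tracker.py` rather than the crawler's `self.log`, so they are gated by the standard logging configuration. A minimal way to surface them (assuming the logger is named after the module via `getLogger(__name__)`, the usual pattern):

import logging

# Emit WARNING-level records, including those from the error tracker's module logger.
logging.basicConfig(level=logging.WARNING)
logging.getLogger('crawlee.statistics._error_tracker').setLevel(logging.WARNING)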

tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 6 additions & 1 deletion
@@ -1295,7 +1295,11 @@ async def test_timeout_in_handler(sleep_type: str) -> None:
     double_handler_timeout_s = handler_timeout.total_seconds() * 2
     handler_sleep = iter([double_handler_timeout_s, double_handler_timeout_s, 0])
 
-    crawler = BasicCrawler(request_handler_timeout=handler_timeout, max_request_retries=max_request_retries)
+    crawler = BasicCrawler(
+        request_handler_timeout=handler_timeout,
+        max_request_retries=max_request_retries,
+    )
+    crawler.log.setLevel(logging.DEBUG)
 
     mocked_handler_before_sleep = Mock()
     mocked_handler_after_sleep = Mock()
@@ -1311,6 +1315,7 @@ async def handler(context: BasicCrawlingContext) -> None:
 
         # This will not execute if timeout happens.
         mocked_handler_after_sleep()
+        context.log.info('Handling request')
 
     # Timeout in pytest, because previous implementation would run crawler until following:
     # "The request queue seems to be stuck for 300.0s, resetting internal state."
