44from typing import TYPE_CHECKING , Any
55
66try :
7- from scrapy import Spider # noqa: TCH002
87 from scrapy .downloadermiddlewares .retry import RetryMiddleware
9- from scrapy .exceptions import IgnoreRequest
10- from scrapy .http import Request , Response # noqa: TCH002
118 from scrapy .utils .response import response_status_message
129except ImportError as exc :
1310 raise ImportError (
1815from .utils import nested_event_loop , open_queue_with_custom_client , to_apify_request
1916
2017if TYPE_CHECKING :
18+ from scrapy import Spider
19+ from scrapy .http import Request , Response
20+
2121 from ..storages import RequestQueue
2222
2323
@@ -33,11 +33,6 @@ def __init__(self: ApifyRetryMiddleware, *args: Any, **kwargs: Any) -> None:
3333 traceback .print_exc ()
3434 raise
3535
36- def __del__ (self : ApifyRetryMiddleware ) -> None :
37- """Before deleting the instance, close the nested event loop."""
38- nested_event_loop .stop ()
39- nested_event_loop .close ()
40-
4136 def process_response (
4237 self : ApifyRetryMiddleware ,
4338 request : Request ,
@@ -54,9 +49,11 @@ def process_response(
5449 Returns:
5550 The response, or a new request if the request should be retried.
5651 """
52+ if not isinstance (request .url , str ):
53+ raise TypeError (f'Expected request.url to be a string, got { type (request .url )} instead.' )
54+
5755 # Robots requests are bypassed directly, they don't go through a Scrapy Scheduler, and also through our
5856 # Request Queue. Check the scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware for details.
59- assert isinstance (request .url , str ) # noqa: S101
6057 if request .url .endswith ('robots.txt' ):
6158 return response
6259
@@ -72,20 +69,30 @@ def process_exception(
7269 exception : BaseException ,
7370 spider : Spider ,
7471 ) -> Request | Response | None :
75- """Handle the exception and decide whether the request should be retried."""
76- Actor .log .debug (f'ApifyRetryMiddleware.process_exception was called (scrapy_request={ request } )...' )
72+ """Handle the exception and decide whether the request should be retried.
73+
74+ Args:
75+ request: The request that encountered an exception.
76+ exception: The exception that occurred.
77+ spider: The Spider that sent the request.
78+
79+ Returns:
80+ None: The request will not be retried.
81+ """
82+ Actor .log .debug (f'ApifyRetryMiddleware.process_exception was called (request={ request } , exception={ exception } )...' )
7783 apify_request = to_apify_request (request , spider = spider )
7884
79- if isinstance (exception , IgnoreRequest ):
80- try :
81- nested_event_loop .run_until_complete (self ._rq .mark_request_as_handled (apify_request ))
82- except BaseException :
83- traceback .print_exc ()
84- raise
85- else :
86- nested_event_loop .run_until_complete (self ._rq .reclaim_request (apify_request ))
85+ # Unlike the default Scrapy RetryMiddleware, we do not attempt to retry requests on exception.
86+ # It was causing issues with the Apify request queue, because the request was not marked as handled and was
87+ # stucked in the request queue forever - Scrapy crawling never finished. The solution would be to completely
88+ # rewrite the retry logic from default RetryMiddleware.
89+ try :
90+ nested_event_loop .run_until_complete (self ._rq .mark_request_as_handled (apify_request ))
91+ except BaseException :
92+ traceback .print_exc ()
93+ raise
8794
88- return super (). process_exception ( request , exception , spider )
95+ return None
8996
9097 async def _handle_retry_logic (
9198 self : ApifyRetryMiddleware ,
0 commit comments