Skip to content

Commit 57086f5

Browse files
committed
Fix RQ stuck in infinite loop due to ID mismatch
Closes: #392
1 parent 7f2242d commit 57086f5

File tree

1 file changed

+42
-57
lines changed

1 file changed

+42
-57
lines changed

src/apify/scrapy/requests.py

Lines changed: 42 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -5,27 +5,18 @@
55
from logging import getLogger
66
from typing import Any, cast
77

8-
from scrapy import Request, Spider
8+
from scrapy import Request as ScrapyRequest
9+
from scrapy import Spider
910
from scrapy.http.headers import Headers
1011
from scrapy.utils.request import request_from_dict
1112

12-
from crawlee import Request as CrawleeRequest
13+
from crawlee import Request as ApifyRequest
1314
from crawlee._types import HttpHeaders
14-
from crawlee._utils.crypto import crypto_random_object_id
15-
from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id
1615

1716
logger = getLogger(__name__)
1817

1918

20-
def _is_request_produced_by_middleware(scrapy_request: Request) -> bool:
21-
"""Returns True if the Scrapy request was produced by a downloader middleware, otherwise False.
22-
23-
Works for RetryMiddleware and RedirectMiddleware.
24-
"""
25-
return bool(scrapy_request.meta.get('redirect_times')) or bool(scrapy_request.meta.get('retry_times'))
26-
27-
28-
def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest | None:
19+
def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequest | None:
2920
"""Convert a Scrapy request to an Apify request.
3021
3122
Args:
@@ -35,51 +26,45 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest
3526
Returns:
3627
The converted Apify request if the conversion was successful, otherwise None.
3728
"""
38-
if not isinstance(scrapy_request, Request):
39-
logger.warning('Failed to convert to Apify request: Scrapy request must be a Request instance.') # type: ignore[unreachable]
29+
if not isinstance(scrapy_request, ScrapyRequest):
30+
logger.warning('Failed to convert to Apify request: Scrapy request must be a scrapy.Request instance.') # type: ignore[unreachable]
4031
return None
4132

4233
logger.debug(f'to_apify_request was called (scrapy_request={scrapy_request})...')
4334

35+
# Configuration to behave as similarly as possible to Scrapy's default RFPDupeFilter.
36+
request_kwargs: dict[str, Any] = {
37+
'url': scrapy_request.url,
38+
'method': scrapy_request.method,
39+
'payload': scrapy_request.body,
40+
'use_extended_unique_key': True,
41+
'keep_url_fragment': False,
42+
}
43+
4444
try:
45-
if _is_request_produced_by_middleware(scrapy_request):
46-
unique_key = compute_unique_key(
47-
url=scrapy_request.url,
48-
method=scrapy_request.method, # type: ignore[arg-type] # str vs literal
49-
payload=scrapy_request.body,
50-
use_extended_unique_key=True,
51-
)
52-
elif scrapy_request.dont_filter:
53-
unique_key = crypto_random_object_id(8)
54-
elif scrapy_request.meta.get('apify_request_unique_key'):
55-
unique_key = scrapy_request.meta['apify_request_unique_key']
45+
if scrapy_request.dont_filter:
46+
request_kwargs['always_enqueue'] = True
5647
else:
57-
unique_key = crypto_random_object_id(8)
48+
if scrapy_request.meta.get('apify_request_unique_key'):
49+
request_kwargs['unique_key'] = scrapy_request.meta['apify_request_unique_key']
5850

59-
if scrapy_request.meta.get('apify_request_id'):
60-
request_id = scrapy_request.meta['apify_request_id']
61-
else:
62-
request_id = unique_key_to_request_id(unique_key)
63-
64-
apify_request = CrawleeRequest(
65-
url=scrapy_request.url,
66-
method=scrapy_request.method,
67-
payload=scrapy_request.body,
68-
user_data=scrapy_request.meta.get('userData', {}),
69-
unique_key=unique_key,
70-
id=request_id,
71-
)
51+
if scrapy_request.meta.get('apify_request_id'):
52+
request_kwargs['id'] = scrapy_request.meta['apify_request_id']
53+
54+
request_kwargs['user_data'] = scrapy_request.meta.get('userData', {})
7255

7356
# Convert Scrapy's headers to a HttpHeaders and store them in the apify_request
7457
if isinstance(scrapy_request.headers, Headers):
75-
apify_request.headers = HttpHeaders(dict(scrapy_request.headers.to_unicode_dict()))
58+
request_kwargs['headers'] = HttpHeaders(dict(scrapy_request.headers.to_unicode_dict()))
7659
else:
7760
logger.warning( # type: ignore[unreachable]
7861
f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}'
7962
)
8063

81-
# Serialize the Scrapy Request and store it in the apify_request.
82-
# - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64,
64+
apify_request = ApifyRequest.from_url(**request_kwargs)
65+
66+
# Serialize the Scrapy Request and store it in the apify_request.
67+
# - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64,
8368
# and storing it as 'scrapy_request' within the 'userData' dictionary of the apify_request.
8469
# - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/.
8570
scrapy_request_dict = scrapy_request.to_dict(spider=spider)
@@ -94,31 +79,31 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest
9479
return apify_request
9580

9681

97-
def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request:
82+
def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequest:
9883
"""Convert an Apify request to a Scrapy request.
9984
10085
Args:
10186
apify_request: The Apify request to be converted.
10287
spider: The Scrapy spider that the request is associated with.
10388
10489
Raises:
105-
TypeError: If the apify_request is not a crawlee request.
106-
ValueError: If the apify_request does not contain the required keys.
90+
TypeError: If the Apify request is not an instance of the `ApifyRequest` class.
91+
ValueError: If the Apify request does not contain the required keys.
10792
10893
Returns:
10994
The converted Scrapy request.
11095
"""
111-
if not isinstance(cast(Any, apify_request), CrawleeRequest):
112-
raise TypeError('apify_request must be a crawlee.Request instance')
96+
if not isinstance(cast(Any, apify_request), ApifyRequest):
97+
raise TypeError('apify_request must be a crawlee.Request instance')
11398

11499
logger.debug(f'to_scrapy_request was called (apify_request={apify_request})...')
115100

116101
# If the apify_request comes from the Scrapy
117102
if 'scrapy_request' in apify_request.user_data:
118-
# Deserialize the Scrapy Request from the apify_request.
103+
# Deserialize the Scrapy Request from the apify_request.
119104
# - This process involves decoding the base64-encoded request data and reconstructing
120-
# the Scrapy Request object from its dictionary representation.
121-
logger.debug('Restoring the Scrapy Request from the apify_request...')
105+
# the Scrapy Request object from its dictionary representation.
106+
logger.debug('Restoring the Scrapy Request from the apify_request...')
122107

123108
scrapy_request_dict_encoded = apify_request.user_data['scrapy_request']
124109
if not isinstance(scrapy_request_dict_encoded, str):
@@ -129,22 +114,22 @@ def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request:
129114
raise TypeError('scrapy_request_dict must be a dictionary')
130115

131116
scrapy_request = request_from_dict(scrapy_request_dict, spider=spider)
132-
if not isinstance(scrapy_request, Request):
133-
raise TypeError('scrapy_request must be an instance of the Request class')
117+
if not isinstance(scrapy_request, ScrapyRequest):
118+
raise TypeError('scrapy_request must be an instance of the ScrapyRequest class')
134119

135-
logger.debug(f'Scrapy Request successfully reconstructed (scrapy_request={scrapy_request})...')
120+
logger.debug(f'Scrapy Request successfully reconstructed (scrapy_request={scrapy_request})...')
136121

137122
# Update the meta field with the meta field from the apify_request
138123
meta = scrapy_request.meta or {}
139124
meta.update({'apify_request_id': apify_request.id, 'apify_request_unique_key': apify_request.unique_key})
140125
# scrapy_request.meta is a property, so we have to set it like this
141126
scrapy_request._meta = meta # noqa: SLF001
142127

143-
# If the apify_request comes directly from the Request Queue, typically start URLs
128+
# If the apify_request comes directly from the Scrapy, typically start URLs.
144129
else:
145-
logger.debug('Gonna create a new Scrapy Request (cannot be restored)')
130+
logger.debug('Gonna create a new Scrapy Request (cannot be restored)')
146131

147-
scrapy_request = Request(
132+
scrapy_request = ScrapyRequest(
148133
url=apify_request.url,
149134
method=apify_request.method,
150135
meta={

0 commit comments

Comments
 (0)