55from logging import getLogger
66from typing import Any , cast
77
8- from scrapy import Request , Spider
8+ from scrapy import Request as ScrapyRequest
9+ from scrapy import Spider
910from scrapy .http .headers import Headers
1011from scrapy .utils .request import request_from_dict
1112
12- from crawlee import Request as CrawleeRequest
13+ from crawlee import Request as ApifyRequest
1314from crawlee ._types import HttpHeaders
14- from crawlee ._utils .crypto import crypto_random_object_id
15- from crawlee ._utils .requests import compute_unique_key , unique_key_to_request_id
1615
1716logger = getLogger (__name__ )
1817
1918
20- def _is_request_produced_by_middleware (scrapy_request : Request ) -> bool :
21- """Returns True if the Scrapy request was produced by a downloader middleware, otherwise False.
22-
23- Works for RetryMiddleware and RedirectMiddleware.
24- """
25- return bool (scrapy_request .meta .get ('redirect_times' )) or bool (scrapy_request .meta .get ('retry_times' ))
26-
27-
28- def to_apify_request (scrapy_request : Request , spider : Spider ) -> CrawleeRequest | None :
19+ def to_apify_request (scrapy_request : ScrapyRequest , spider : Spider ) -> ApifyRequest | None :
2920 """Convert a Scrapy request to an Apify request.
3021
3122 Args:
@@ -35,51 +26,45 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest
3526 Returns:
3627 The converted Apify request if the conversion was successful, otherwise None.
3728 """
38- if not isinstance (scrapy_request , Request ):
39- logger .warning ('Failed to convert to Apify request: Scrapy request must be a Request instance.' ) # type: ignore[unreachable]
29+ if not isinstance (scrapy_request , ScrapyRequest ):
30+ logger .warning ('Failed to convert to Apify request: Scrapy request must be a ScrapyRequest instance.' ) # type: ignore[unreachable]
4031 return None
4132
4233 logger .debug (f'to_apify_request was called (scrapy_request={ scrapy_request } )...' )
4334
35+ # Configuration to behave as similarly as possible to Scrapy's default RFPDupeFilter.
36+ request_kwargs : dict [str , Any ] = {
37+ 'url' : scrapy_request .url ,
38+ 'method' : scrapy_request .method ,
39+ 'payload' : scrapy_request .body ,
40+ 'use_extended_unique_key' : True ,
41+ 'keep_url_fragment' : False ,
42+ }
43+
4444 try :
45- if _is_request_produced_by_middleware (scrapy_request ):
46- unique_key = compute_unique_key (
47- url = scrapy_request .url ,
48- method = scrapy_request .method , # type: ignore[arg-type] # str vs literal
49- payload = scrapy_request .body ,
50- use_extended_unique_key = True ,
51- )
52- elif scrapy_request .dont_filter :
53- unique_key = crypto_random_object_id (8 )
54- elif scrapy_request .meta .get ('apify_request_unique_key' ):
55- unique_key = scrapy_request .meta ['apify_request_unique_key' ]
45+ if scrapy_request .dont_filter :
46+ request_kwargs ['always_enqueue' ] = True
5647 else :
57- unique_key = crypto_random_object_id (8 )
48+ if scrapy_request .meta .get ('apify_request_unique_key' ):
49+ request_kwargs ['unique_key' ] = scrapy_request .meta ['apify_request_unique_key' ]
5850
59- if scrapy_request .meta .get ('apify_request_id' ):
60- request_id = scrapy_request .meta ['apify_request_id' ]
61- else :
62- request_id = unique_key_to_request_id (unique_key )
63-
64- apify_request = CrawleeRequest (
65- url = scrapy_request .url ,
66- method = scrapy_request .method ,
67- payload = scrapy_request .body ,
68- user_data = scrapy_request .meta .get ('userData' , {}),
69- unique_key = unique_key ,
70- id = request_id ,
71- )
51+ if scrapy_request .meta .get ('apify_request_id' ):
52+ request_kwargs ['id' ] = scrapy_request .meta ['apify_request_id' ]
53+
54+ request_kwargs ['user_data' ] = scrapy_request .meta .get ('userData' , {})
7255
7356 # Convert Scrapy's headers to a HttpHeaders and store them in the apify_request
7457 if isinstance (scrapy_request .headers , Headers ):
75- apify_request . headers = HttpHeaders (dict (scrapy_request .headers .to_unicode_dict ()))
58+ request_kwargs [ ' headers' ] = HttpHeaders (dict (scrapy_request .headers .to_unicode_dict ()))
7659 else :
7760 logger .warning ( # type: ignore[unreachable]
7861 f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: { scrapy_request .headers } '
7962 )
8063
81- # Serialize the Scrapy Request and store it in the apify_request.
82- # - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64,
64+ apify_request = ApifyRequest .from_url (** request_kwargs )
65+
66+ # Serialize the Scrapy request and store it in the apify_request.
67+ # - This process involves converting the ScrapyRequest object into a dictionary, encoding it to base64,
8368 # and storing it as 'scrapy_request' within the 'userData' dictionary of the apify_request.
8469 # - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/.
8570 scrapy_request_dict = scrapy_request .to_dict (spider = spider )
@@ -94,31 +79,31 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest
9479 return apify_request
9580
9681
97- def to_scrapy_request (apify_request : CrawleeRequest , spider : Spider ) -> Request :
82+ def to_scrapy_request (apify_request : ApifyRequest , spider : Spider ) -> ScrapyRequest :
9883 """Convert an Apify request to a Scrapy request.
9984
10085 Args:
10186 apify_request: The Apify request to be converted.
10287 spider: The Scrapy spider that the request is associated with.
10388
10489 Raises:
105- TypeError: If the apify_request is not a crawlee request .
106- ValueError: If the apify_request does not contain the required keys.
90+ TypeError: If the Apify request is not an instance of the `ApifyRequest` class .
91+ ValueError: If the Apify request does not contain the required keys.
10792
10893 Returns:
10994 The converted Scrapy request.
11095 """
111- if not isinstance (cast (Any , apify_request ), CrawleeRequest ):
112- raise TypeError ('apify_request must be a crawlee.Request instance' )
96+ if not isinstance (cast (Any , apify_request ), ApifyRequest ):
97+ raise TypeError ('apify_request must be a crawlee.ScrapyRequest instance' )
11398
11499 logger .debug (f'to_scrapy_request was called (apify_request={ apify_request } )...' )
115100
116101 # If the apify_request comes from the Scrapy
117102 if 'scrapy_request' in apify_request .user_data :
118- # Deserialize the Scrapy Request from the apify_request.
103+ # Deserialize the Scrapy request from the apify_request.
119104 # - This process involves decoding the base64-encoded request data and reconstructing
120- # the Scrapy Request object from its dictionary representation.
121- logger .debug ('Restoring the Scrapy Request from the apify_request...' )
105+ # the ScrapyRequest object from its dictionary representation.
106+ logger .debug ('Restoring the Scrapy ScrapyRequest from the apify_request...' )
122107
123108 scrapy_request_dict_encoded = apify_request .user_data ['scrapy_request' ]
124109 if not isinstance (scrapy_request_dict_encoded , str ):
@@ -129,22 +114,22 @@ def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request:
129114 raise TypeError ('scrapy_request_dict must be a dictionary' )
130115
131116 scrapy_request = request_from_dict (scrapy_request_dict , spider = spider )
132- if not isinstance (scrapy_request , Request ):
133- raise TypeError ('scrapy_request must be an instance of the Request class' )
117+ if not isinstance (scrapy_request , ScrapyRequest ):
118+ raise TypeError ('scrapy_request must be an instance of the ScrapyRequest class' )
134119
135- logger .debug (f'Scrapy Request successfully reconstructed (scrapy_request={ scrapy_request } )...' )
120+ logger .debug (f'Scrapy ScrapyRequest successfully reconstructed (scrapy_request={ scrapy_request } )...' )
136121
137122 # Update the meta field with the meta field from the apify_request
138123 meta = scrapy_request .meta or {}
139124 meta .update ({'apify_request_id' : apify_request .id , 'apify_request_unique_key' : apify_request .unique_key })
140125 # scrapy_request.meta is a property, so we have to set it like this
141126 scrapy_request ._meta = meta # noqa: SLF001
142127
143- # If the apify_request comes directly from the Request Queue , typically start URLs
128+ # If the apify_request comes directly from the Request Queue (not created by Scrapy), typically start URLs.
144129 else :
145- logger .debug ('Gonna create a new Scrapy Request (cannot be restored)' )
130+ logger .debug ('Gonna create a new Scrapy ScrapyRequest (cannot be restored)' )
146131
147- scrapy_request = Request (
132+ scrapy_request = ScrapyRequest (
148133 url = apify_request .url ,
149134 method = apify_request .method ,
150135 meta = {
0 commit comments