from logging import getLogger
from typing import Any, cast

-from scrapy import Request, Spider
+from scrapy import Request as ScrapyRequest
+from scrapy import Spider
from scrapy.http.headers import Headers
from scrapy.utils.request import request_from_dict

-from crawlee import Request as CrawleeRequest
+from crawlee import Request as ApifyRequest
from crawlee._types import HttpHeaders
-from crawlee._utils.crypto import crypto_random_object_id
-from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id

logger = getLogger(__name__)


-def _is_request_produced_by_middleware(scrapy_request: Request) -> bool:
-    """Returns True if the Scrapy request was produced by a downloader middleware, otherwise False.
-
-    Works for RetryMiddleware and RedirectMiddleware.
-    """
-    return bool(scrapy_request.meta.get('redirect_times')) or bool(scrapy_request.meta.get('retry_times'))
-
-
-def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest | None:
+def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequest | None:
    """Convert a Scrapy request to an Apify request.

    Args:
@@ -35,51 +26,45 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest
    Returns:
        The converted Apify request if the conversion was successful, otherwise None.
    """
-    if not isinstance(scrapy_request, Request):
-        logger.warning('Failed to convert to Apify request: Scrapy request must be a Request instance.')  # type: ignore[unreachable]
+    if not isinstance(scrapy_request, ScrapyRequest):
+        logger.warning('Failed to convert to Apify request: Scrapy request must be a ScrapyRequest instance.')  # type: ignore[unreachable]
        return None

    logger.debug(f'to_apify_request was called (scrapy_request={scrapy_request})...')

+    # Configuration to behave as similarly as possible to Scrapy's default RFPDupeFilter.
+    request_kwargs: dict[str, Any] = {
+        'url': scrapy_request.url,
+        'method': scrapy_request.method,
+        'payload': scrapy_request.body,
+        'use_extended_unique_key': True,
+        'keep_url_fragment': False,
+    }
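+    # Scrapy's default fingerprint covers the HTTP method, the canonicalized URL (with the
+    # fragment stripped) and the request body, hence use_extended_unique_key=True and
+    # keep_url_fragment=False above.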
+
    try:
-        if _is_request_produced_by_middleware(scrapy_request):
-            unique_key = compute_unique_key(
-                url=scrapy_request.url,
-                method=scrapy_request.method,  # type: ignore[arg-type] # str vs literal
-                payload=scrapy_request.body,
-                use_extended_unique_key=True,
-            )
-        elif scrapy_request.dont_filter:
-            unique_key = crypto_random_object_id(8)
-        elif scrapy_request.meta.get('apify_request_unique_key'):
-            unique_key = scrapy_request.meta['apify_request_unique_key']
+        if scrapy_request.dont_filter:
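+            # dont_filter=True tells Scrapy to skip deduplication for this request, so it is
+            # enqueued unconditionally on the Apify side as well.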
+            request_kwargs['always_enqueue'] = True
        else:
-            unique_key = crypto_random_object_id(8)
+            if scrapy_request.meta.get('apify_request_unique_key'):
+                request_kwargs['unique_key'] = scrapy_request.meta['apify_request_unique_key']
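+                # 'apify_request_unique_key' is set by to_scrapy_request() below, so a request
+                # that round-trips through Scrapy keeps its identity in the request queue.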

-        if scrapy_request.meta.get('apify_request_id'):
-            request_id = scrapy_request.meta['apify_request_id']
-        else:
-            request_id = unique_key_to_request_id(unique_key)
-
-        apify_request = CrawleeRequest(
-            url=scrapy_request.url,
-            method=scrapy_request.method,
-            payload=scrapy_request.body,
-            user_data=scrapy_request.meta.get('userData', {}),
-            unique_key=unique_key,
-            id=request_id,
-        )
+        if scrapy_request.meta.get('apify_request_id'):
+            request_kwargs['id'] = scrapy_request.meta['apify_request_id']
+
+        request_kwargs['user_data'] = scrapy_request.meta.get('userData', {})

        # Convert Scrapy's headers to a HttpHeaders and store them in the apify_request
        if isinstance(scrapy_request.headers, Headers):
-            apify_request.headers = HttpHeaders(dict(scrapy_request.headers.to_unicode_dict()))
+            request_kwargs['headers'] = HttpHeaders(dict(scrapy_request.headers.to_unicode_dict()))
        else:
            logger.warning(  # type: ignore[unreachable]
                f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}'
            )

-        # Serialize the Scrapy Request and store it in the apify_request.
-        # - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64,
+        apify_request = ApifyRequest.from_url(**request_kwargs)
+
+        # Serialize the Scrapy request and store it in the apify_request.
+        # - This process involves converting the Scrapy request object into a dictionary, encoding it to base64,
        # and storing it as 'scrapy_request' within the 'userData' dictionary of the apify_request.
        # - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/.
        scrapy_request_dict = scrapy_request.to_dict(spider=spider)
@@ -94,31 +79,31 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest
        return apify_request


-def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request:
+def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequest:
    """Convert an Apify request to a Scrapy request.

    Args:
        apify_request: The Apify request to be converted.
        spider: The Scrapy spider that the request is associated with.

    Raises:
-        TypeError: If the apify_request is not a crawlee request.
-        ValueError: If the apify_request does not contain the required keys.
+        TypeError: If the Apify request is not an instance of the `ApifyRequest` class.
+        ValueError: If the Apify request does not contain the required keys.

    Returns:
        The converted Scrapy request.
    """
-    if not isinstance(cast(Any, apify_request), CrawleeRequest):
-        raise TypeError('apify_request must be a crawlee.Request instance')
+    if not isinstance(cast(Any, apify_request), ApifyRequest):
+        raise TypeError('apify_request must be a crawlee.Request instance')

    logger.debug(f'to_scrapy_request was called (apify_request={apify_request})...')

    # If the apify_request comes from Scrapy
    if 'scrapy_request' in apify_request.user_data:
-        # Deserialize the Scrapy Request from the apify_request.
+        # Deserialize the Scrapy request from the apify_request.
        # - This process involves decoding the base64-encoded request data and reconstructing
-        # the Scrapy Request object from its dictionary representation.
-        logger.debug('Restoring the Scrapy Request from the apify_request...')
+        # the Scrapy request object from its dictionary representation.
+        logger.debug('Restoring the Scrapy request from the apify_request...')

        scrapy_request_dict_encoded = apify_request.user_data['scrapy_request']
        if not isinstance(scrapy_request_dict_encoded, str):
@@ -129,22 +114,22 @@ def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request:
            raise TypeError('scrapy_request_dict must be a dictionary')

        scrapy_request = request_from_dict(scrapy_request_dict, spider=spider)
-        if not isinstance(scrapy_request, Request):
-            raise TypeError('scrapy_request must be an instance of the Request class')
+        if not isinstance(scrapy_request, ScrapyRequest):
+            raise TypeError('scrapy_request must be an instance of the ScrapyRequest class')

-        logger.debug(f'Scrapy Request successfully reconstructed (scrapy_request={scrapy_request})...')
+        logger.debug(f'Scrapy request successfully reconstructed (scrapy_request={scrapy_request})...')

        # Update the meta field with the meta field from the apify_request
        meta = scrapy_request.meta or {}
        meta.update({'apify_request_id': apify_request.id, 'apify_request_unique_key': apify_request.unique_key})
        # scrapy_request.meta is a property, so we have to set it like this
        scrapy_request._meta = meta  # noqa: SLF001

-    # If the apify_request comes directly from the Request Queue, typically start URLs
+    # If the apify_request does not come from Scrapy, it is typically a start URL taken directly from the Request Queue.
    else:
-        logger.debug('Gonna create a new Scrapy Request (cannot be restored)')
+        logger.debug('Creating a new Scrapy Request (cannot be restored)')

-        scrapy_request = Request(
+        scrapy_request = ScrapyRequest(
            url=apify_request.url,
            method=apify_request.method,
            meta={
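For reference, a minimal round-trip sketch of the two helpers after this change. It assumes to_apify_request and to_scrapy_request are imported from apify.scrapy.requests; the spider class, URL and userData payload are illustrative only, not part of the commit.

from scrapy import Request as ScrapyRequest
from scrapy import Spider

from apify.scrapy.requests import to_apify_request, to_scrapy_request


class MySpider(Spider):
    # Illustrative spider; any Spider instance works for the conversion helpers.
    name = 'example'


spider = MySpider()
scrapy_request = ScrapyRequest(
    url='https://example.com',
    meta={'userData': {'label': 'DETAIL'}},
)

# Scrapy -> Apify: returns None if the input is not a scrapy.Request.
apify_request = to_apify_request(scrapy_request, spider=spider)

if apify_request is not None:
    # Apify -> Scrapy: restored from the base64-serialized request in userData.
    restored = to_scrapy_request(apify_request, spider=spider)
    assert restored.url == scrapy_request.url
    assert restored.meta['apify_request_unique_key'] == apify_request.unique_key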