3
3
import typing
4
4
from typing import TYPE_CHECKING
5
5
from unittest import mock
6
+ from unittest .mock import call
6
7
7
8
import httpx
8
9
import pytest
11
12
from apify_shared .consts import ApifyEnvVars
12
13
from crawlee ._request import UserData
13
14
from crawlee ._types import HttpHeaders , HttpMethod
14
- from crawlee .http_clients import HttpxHttpClient , HttpResponse
15
+ from crawlee .http_clients import HttpResponse , HttpxHttpClient
15
16
16
17
from apify import Actor
17
18
@@ -160,14 +161,14 @@ async def test_proxy_configuration_with_actor_proxy_input(
160
161
async def test_actor_create_request_list_request_types (
161
162
request_method : HttpMethod , optional_input : dict [str , str ]
162
163
) -> None :
163
- """Tests proper request list generation from both minimal and full inputs for all method types."""
164
+ """Test proper request list generation from both minimal and full inputs for all method types for simple input ."""
164
165
minimal_request_dict_input = {'url' : 'https://www.abc.com' , 'method' : request_method }
165
166
request_dict_input = {** minimal_request_dict_input , ** optional_input }
166
167
example_start_urls_input = [
167
168
request_dict_input ,
168
169
]
169
170
170
- generated_request_list = await Actor .create_request_list (actor_start_urls_input = example_start_urls_input )
171
+ generated_request_list = await Actor .create_request_list (actor_start_urls_input = example_start_urls_input )
171
172
172
173
assert not await generated_request_list .is_empty ()
173
174
generated_request = await generated_request_list .fetch_next_request ()
@@ -185,42 +186,89 @@ async def test_actor_create_request_list_request_types(
185
186
assert generated_request .headers == expected_headers
186
187
187
188
188
- async def test_actor_create_request_list_from_url ():
189
- expected_urls = {"http://www.something.com" , "https://www.something_else.com" , "http://www.bla.net" }
190
- response_body = "blablabla{} more blablabla{} ,\n even more blablbablba.{}" .format (* expected_urls )
191
- mocked_http_client = HttpxHttpClient ()
189
def _create_dummy_response(read_output: typing.Iterable[str]) -> HttpResponse:
    """Create a dummy response whose `read()` returns successive items of `read_output`.

    Args:
        read_output: Strings to be served one per `read()` call. Any iterable is accepted
            (list, tuple, generator, iterator).

    Returns:
        A stub response with status code 200, an empty HTTP version string and empty headers.
        Each call to its `read()` returns the next item of `read_output` encoded as UTF-8.

    Raises:
        StopIteration: Raised by `read()` once all items of `read_output` were consumed.
    """
    # Normalize once: calling `next()` directly on a non-iterator iterable (e.g. a list)
    # would raise TypeError. For an iterator, `iter()` returns the very same object, so callers
    # that already pass generators or iterators see no change.
    read_output_iterator = iter(read_output)

    class DummyResponse(HttpResponse):
        @property
        def http_version(self) -> str:
            return ''

        @property
        def status_code(self) -> int:
            return 200

        @property
        def headers(self) -> HttpHeaders:
            return HttpHeaders()

        def read(self) -> bytes:
            # Shared iterator from the enclosing scope: every call advances it by one item.
            return next(read_output_iterator).encode('utf-8')

    return DummyResponse()
211
209
212
- async def mocked_send_request (* args , ** kwargs ):
213
- return DummyResponse ()
214
- with mock .patch .object (mocked_http_client , "send_request" , mocked_send_request ) as mocked_send_request2 :
215
210
216
- example_start_urls_input = [
217
- {"requestsFromUrl" : "https://crawlee.dev/file.txt" , 'method' : "GET" }
218
- ]
211
async def test_actor_create_request_list_from_url_correctly_send_requests() -> None:
    """Verify that the injected HTTP client's `send_request` is called with arguments taken from each input."""
    start_urls = [
        {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'},
        {'requestsFromUrl': 'https://www.crawlee.dev/file2', 'method': 'PUT'},
        {
            'requestsFromUrl': 'https://www.something.som',
            'method': 'POST',
            'headers': {'key': 'value'},
            'payload': 'some_payload',
            'userData': 'irrelevant',
        },
    ]

    # One expected `send_request` call per input; missing headers/payload fall back to empty values.
    expected_calls = []
    for source in start_urls:
        payload_bytes = source.get('payload', '').encode('utf-8')
        expected_calls.append(
            call(
                method=source['method'],
                url=source['requestsFromUrl'],
                headers=source.get('headers', {}),
                payload=payload_bytes,
            )
        )

    # Every mocked response body is an empty string; one body is prepared per input.
    empty_bodies = iter([''] * len(start_urls))
    dummy_response = _create_dummy_response(empty_bodies)

    client = HttpxHttpClient()
    with mock.patch.object(client, 'send_request', return_value=dummy_response) as send_request_mock:
        await Actor.create_request_list(actor_start_urls_input=start_urls, http_client=client)

    send_request_mock.assert_has_calls(expected_calls)
242
+
243
+
244
async def test_actor_create_request_list_from_url() -> None:
    """Test that create_request_list reads urls from remote url sources and also from simple input.

    Two `requestsFromUrl` entries are served by a mocked `send_request` whose response bodies embed
    the expected urls in surrounding text; one plain `url` entry is passed alongside them. The urls of
    all requests fetched from the resulting request list must equal the union of the expected urls.
    """
    expected_simple_url = 'https://www.someurl.com'
    expected_remote_urls_1 = {'http://www.something.com', 'https://www.somethingelse.com', 'http://www.bla.net'}
    expected_remote_urls_2 = {'http://www.ok.com', 'https://www.true-positive.com'}
    expected_urls = expected_remote_urls_1 | expected_remote_urls_2 | {expected_simple_url}

    # Sets are unordered, so the urls are placed into the templates in arbitrary order. That is fine,
    # because only the resulting set of urls is asserted below. The second body also contains
    # scheme-less strings that are not part of the expected urls.
    response_bodies = iter(
        (
            'blablabla{} more blablabla{} , even more blablabla. {} '.format(*expected_remote_urls_1),
            'some stuff{} more stuff{} www.falsepositive www.false_positive.com'.format(*expected_remote_urls_2),
        )
    )

    example_start_urls_input = [
        {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'},
        {'url': expected_simple_url, 'method': 'GET'},
        {'requestsFromUrl': 'https://www.crawlee.dev/file2', 'method': 'GET'},
    ]

    http_client = HttpxHttpClient()
    with mock.patch.object(http_client, 'send_request', return_value=_create_dummy_response(response_bodies)):
        generated_request_list = await Actor.create_request_list(
            actor_start_urls_input=example_start_urls_input, http_client=http_client
        )
        # Drain the request list while the client is still patched.
        generated_requests = []
        while request := await generated_request_list.fetch_next_request():
            generated_requests.append(request)

    # Check correctly created requests' urls in request list
    assert {generated_request.url for generated_request in generated_requests} == expected_urls
0 commit comments