@@ -9,7 +9,7 @@
 import respx
 from httpx import Response
 
-from crawlee._request import Request
+from crawlee import ConcurrencySettings, Request
 from crawlee.crawlers import HttpCrawler
 from crawlee.http_clients import CurlImpersonateHttpClient, HttpxHttpClient
 from crawlee.sessions import SessionPool
@@ -183,7 +183,15 @@ async def test_handles_server_error(
     assert server['500_endpoint'].called
 
 
-async def test_stores_cookies(httpbin: URL) -> None:
+@pytest.mark.parametrize(
+    'http_client_class',
+    [
+        pytest.param(CurlImpersonateHttpClient, id='curl'),
+        pytest.param(HttpxHttpClient, id='httpx'),
+    ],
+)
+async def test_stores_cookies(http_client_class: type[BaseHttpClient], httpbin: URL) -> None:
+    http_client = http_client_class()
     visit = Mock()
     track_session_usage = Mock()
 
@@ -192,6 +200,7 @@ async def test_stores_cookies(httpbin: URL) -> None:
         # /cookies/set might redirect us to a page that we can't access - no problem, we only care about cookies
         ignore_http_error_status_codes=[401],
         session_pool=session_pool,
+        http_client=http_client,
     )
 
     @crawler.router.default_handler
@@ -410,3 +419,68 @@ def mark_request_execution(request: Request) -> Response:  # noqa: ARG001 # Unus
     await crawler.run([Request.from_url(url=test_url)])
 
     assert execution_order == ['pre-navigation-hook 1', 'pre-navigation-hook 2', 'request', 'final handler']
+
+
+@pytest.mark.parametrize(
+    'http_client_class',
+    [
+        pytest.param(CurlImpersonateHttpClient, id='curl'),
+        pytest.param(HttpxHttpClient, id='httpx'),
+    ],
+)
+async def test_isolation_cookies(http_client_class: type[BaseHttpClient], httpbin: URL) -> None:
+    http_client = http_client_class()
+    sessions_ids: list[str] = []
+    sessions_cookies: dict[str, dict[str, str]] = {}
+    response_cookies: dict[str, dict[str, str]] = {}
+
+    crawler = HttpCrawler(
+        session_pool=SessionPool(max_pool_size=1),
+        http_client=http_client,
+        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+    )
+
+    @crawler.router.default_handler
+    async def handler(context: HttpCrawlingContext) -> None:
+        if not context.session:
+            return
+
+        sessions_ids.append(context.session.id)
+
+        if context.request.unique_key not in {'1', '2'}:
+            return
+
+        sessions_cookies[context.session.id] = context.session.cookies
+        response_data = json.loads(context.http_response.read())
+        response_cookies[context.session.id] = response_data.get('cookies')
+
+        if context.request.user_data.get('retire_session'):
+            context.session.retire()
+
+    await crawler.run(
+        [
+            # The first request sets the cookie in the session
+            str(httpbin.with_path('/cookies/set').extend_query(a=1)),
+            # With the second request, we check the cookies in the session and set retire
+            Request.from_url(str(httpbin.with_path('/cookies')), unique_key='1', user_data={'retire_session': True}),
+            # The third request is made with a new session to make sure it does not use another session's cookies
+            Request.from_url(str(httpbin.with_path('/cookies')), unique_key='2'),
+        ]
+    )
+
+    assert len(sessions_cookies) == 2
+    assert len(response_cookies) == 2
+
+    assert sessions_ids[0] == sessions_ids[1]
+
+    cookie_session_id = sessions_ids[0]
+    clean_session_id = sessions_ids[2]
+
+    assert cookie_session_id != clean_session_id
+
+    # The initiated cookies must match in both the response and the session store
+    assert sessions_cookies[cookie_session_id] == response_cookies[cookie_session_id] == {'a': '1'}
+
+    # For a clean session, the cookie should not be in the session store or in the response
+    # This way we can be sure that no cookies are being leaked through the http client
+    assert sessions_cookies[clean_session_id] == response_cookies[clean_session_id] == {}