
Commit ed415c4

fix: fix CurlImpersonateHttpClient cookies handler (#946)
### Description

Fix cookie handling, aligning the behavior with `HttpxHttpClient`.

### Issues

- #933
1 parent eae3a33 commit ed415c4
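
The core of the change: `curl_cffi`'s `AsyncSession` keeps a single shared cookie jar per client, so cookies set during one crawlee `Session` could leak into requests made for another. The commit disables the shared jar and instead persists the cookies observed on each response onto the crawlee `Session`, matching what `HttpxHttpClient` already does. A minimal sketch of that isolation pattern (simplified, hypothetical names, not the actual crawlee code):

```python
# Sketch of the isolation pattern, under simplified assumptions: one shared
# transport client, many logical sessions, cookies stored only per session.

class NoopCookieJar:
    """A jar that never stores or sends anything, so the shared transport
    client cannot leak cookies across logical sessions."""

    def cookies_for_request(self) -> dict[str, str]:
        return {}  # never inject shared state into an outgoing request

    def store_from_response(self, cookies: dict[str, str]) -> None:
        pass  # never record response cookies globally


class LogicalSession:
    """Stands in for a crawlee `Session`: the only place cookies live."""

    def __init__(self) -> None:
        self.cookies: dict[str, str] = {}


def on_response(session: LogicalSession, observed: dict[str, str]) -> None:
    # Mirrors `session.cookies.update(self._get_cookies(response.curl))`
    # from the real change: cookies go to the session, not the client.
    session.cookies.update(observed)
```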

File tree: 2 files changed, +107 -2 lines

src/crawlee/http_clients/_curl_impersonate.py (31 additions, 0 deletions)

```diff
@@ -5,7 +5,9 @@
 from crawlee._utils.docs import docs_group
 
 try:
+    from curl_cffi import CurlInfo
     from curl_cffi.requests import AsyncSession
+    from curl_cffi.requests.cookies import Cookies, CurlMorsel
     from curl_cffi.requests.exceptions import ProxyError as CurlProxyError
     from curl_cffi.requests.exceptions import RequestException as CurlRequestError
     from curl_cffi.requests.impersonate import DEFAULT_CHROME as CURL_DEFAULT_CHROME
@@ -26,6 +28,8 @@
 if TYPE_CHECKING:
     from collections.abc import Iterable
 
+    from curl_cffi import Curl
+    from curl_cffi.requests import Request as CurlRequest
     from curl_cffi.requests import Response
 
     from crawlee import Request
@@ -35,6 +39,16 @@
     from crawlee.statistics import Statistics
 
 
+class _EmptyCookies(Cookies):
+    @override
+    def get_cookies_for_curl(self, request: CurlRequest) -> list[CurlMorsel]:
+        return []
+
+    @override
+    def update_cookies_from_curl(self, morsels: list[CurlMorsel]) -> None:
+        return None
+
+
 class _CurlImpersonateResponse:
     """Adapter class for `curl_cffi.requests.Response` to conform to the `HttpResponse` protocol."""
 
@@ -151,6 +165,10 @@ async def crawl(
             self._ignore_http_error_status_codes,
         )
 
+        if self._persist_cookies_per_session and session and response.curl:
+            response_cookies = self._get_cookies(response.curl)
+            session.cookies.update(response_cookies)
+
         request.loaded_url = response.url
 
         return HttpCrawlingResult(
@@ -194,6 +212,10 @@ async def send_request(
             self._ignore_http_error_status_codes,
         )
 
+        if self._persist_cookies_per_session and session and response.curl:
+            response_cookies = self._get_cookies(response.curl)
+            session.cookies.update(response_cookies)
+
         return _CurlImpersonateResponse(response)
 
     def _get_client(self, proxy_url: str | None) -> AsyncSession:
@@ -217,6 +239,7 @@ def _get_client(self, proxy_url: str | None) -> AsyncSession:
 
         # Create and store the new session with the specified kwargs.
         self._client_by_proxy_url[proxy_url] = AsyncSession(**kwargs)
+        self._client_by_proxy_url[proxy_url].cookies = _EmptyCookies()
 
         return self._client_by_proxy_url[proxy_url]
 
@@ -230,3 +253,11 @@ def _is_proxy_error(error: CurlRequestError) -> bool:
             return True
 
         return False
+
+    @staticmethod
+    def _get_cookies(curl: Curl) -> dict[str, str]:
+        cookies = {}
+        for curl_cookie in curl.getinfo(CurlInfo.COOKIELIST):  # type: ignore[union-attr]
+            curl_morsel = CurlMorsel.from_curl_format(curl_cookie)  # type: ignore[arg-type]
+            cookies[curl_morsel.name] = curl_morsel.value
+        return cookies
```
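
For context on `_get_cookies`: libcurl's `COOKIELIST` info exposes the handle's cookie engine contents as Netscape-format lines (seven tab-separated fields), which `CurlMorsel.from_curl_format` parses; the method keeps only the name/value pair for the session store. A hedged illustration of that format (the entry below is invented for the example):

```python
# Illustrative only: a typical Netscape-format cookie entry of the kind
# curl's COOKIELIST returns. The concrete values here are made up.
entry = 'httpbin.org\tFALSE\t/\tFALSE\t0\ta\t1'

# Fields: domain, include-subdomains flag, path, secure flag, expiry, name, value.
domain, subdomains, path, secure, expiry, name, value = entry.split('\t')
assert (name, value) == ('a', '1')  # the only fields _get_cookies() retains
```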

tests/unit/crawlers/_http/test_http_crawler.py (76 additions, 2 deletions)

```diff
@@ -9,7 +9,7 @@
 import respx
 from httpx import Response
 
-from crawlee._request import Request
+from crawlee import ConcurrencySettings, Request
 from crawlee.crawlers import HttpCrawler
 from crawlee.http_clients import CurlImpersonateHttpClient, HttpxHttpClient
 from crawlee.sessions import SessionPool
@@ -183,7 +183,15 @@ async def test_handles_server_error(
     assert server['500_endpoint'].called
 
 
-async def test_stores_cookies(httpbin: URL) -> None:
+@pytest.mark.parametrize(
+    'http_client_class',
+    [
+        pytest.param(CurlImpersonateHttpClient, id='curl'),
+        pytest.param(HttpxHttpClient, id='httpx'),
+    ],
+)
+async def test_stores_cookies(http_client_class: type[BaseHttpClient], httpbin: URL) -> None:
+    http_client = http_client_class()
     visit = Mock()
     track_session_usage = Mock()
 
@@ -192,6 +200,7 @@ async def test_stores_cookies(httpbin: URL) -> None:
         # /cookies/set might redirect us to a page that we can't access - no problem, we only care about cookies
         ignore_http_error_status_codes=[401],
         session_pool=session_pool,
+        http_client=http_client,
     )
 
     @crawler.router.default_handler
@@ -410,3 +419,68 @@ def mark_request_execution(request: Request) -> Response:  # noqa: ARG001 # Unus
     await crawler.run([Request.from_url(url=test_url)])
 
     assert execution_order == ['pre-navigation-hook 1', 'pre-navigation-hook 2', 'request', 'final handler']
+
+
+@pytest.mark.parametrize(
+    'http_client_class',
+    [
+        pytest.param(CurlImpersonateHttpClient, id='curl'),
+        pytest.param(HttpxHttpClient, id='httpx'),
+    ],
+)
+async def test_isolation_cookies(http_client_class: type[BaseHttpClient], httpbin: URL) -> None:
+    http_client = http_client_class()
+    sessions_ids: list[str] = []
+    sessions_cookies: dict[str, dict[str, str]] = {}
+    response_cookies: dict[str, dict[str, str]] = {}
+
+    crawler = HttpCrawler(
+        session_pool=SessionPool(max_pool_size=1),
+        http_client=http_client,
+        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+    )
+
+    @crawler.router.default_handler
+    async def handler(context: HttpCrawlingContext) -> None:
+        if not context.session:
+            return
+
+        sessions_ids.append(context.session.id)
+
+        if context.request.unique_key not in {'1', '2'}:
+            return
+
+        sessions_cookies[context.session.id] = context.session.cookies
+        response_data = json.loads(context.http_response.read())
+        response_cookies[context.session.id] = response_data.get('cookies')
+
+        if context.request.user_data.get('retire_session'):
+            context.session.retire()
+
+    await crawler.run(
+        [
+            # The first request sets the cookie in the session
+            str(httpbin.with_path('/cookies/set').extend_query(a=1)),
+            # With the second request, we check the cookies in the session and set retire
+            Request.from_url(str(httpbin.with_path('/cookies')), unique_key='1', user_data={'retire_session': True}),
+            # The third request is made with a new session to make sure it does not use another session's cookies
+            Request.from_url(str(httpbin.with_path('/cookies')), unique_key='2'),
+        ]
+    )
+
+    assert len(sessions_cookies) == 2
+    assert len(response_cookies) == 2
+
+    assert sessions_ids[0] == sessions_ids[1]
+
+    cookie_session_id = sessions_ids[0]
+    clean_session_id = sessions_ids[2]
+
+    assert cookie_session_id != clean_session_id
+
+    # The initiated cookies must match in both the response and the session store
+    assert sessions_cookies[cookie_session_id] == response_cookies[cookie_session_id] == {'a': '1'}
+
+    # For a clean session, the cookie should not be in the session store or in the response
+    # This way we can be sure that no cookies are being leaked through the http client
+    assert sessions_cookies[clean_session_id] == response_cookies[clean_session_id] == {}
```
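
A note on the test design: `SessionPool(max_pool_size=1)` combined with `ConcurrencySettings(max_concurrency=1)` makes session assignment deterministic, so the first two requests share one session and, once that session is retired, the third request is guaranteed a fresh one. The empty cookie set on the fresh session is the actual regression check: with the old shared `AsyncSession` jar, the `a=1` cookie would have leaked into the third response.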
