
Commit ed415c4

fix: fix CurlImpersonateHttpClient cookies handler (#946)
### Description

Fix cookie handling, aligning the behavior with `HttpxHttpClient`.

### Issues

- #933
1 parent eae3a33 commit ed415c4
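
The core of the change: `curl_cffi`'s `AsyncSession` keeps a single shared cookie jar per client, so cookies set during one crawlee `Session` could leak into requests made for another. The commit disables the shared jar and instead persists the cookies observed on each response onto the crawlee `Session`, matching what `HttpxHttpClient` already does. A minimal sketch of that isolation pattern (simplified, hypothetical names, not the actual crawlee code):

```python
# Sketch of the isolation pattern, under simplified assumptions: one shared
# transport client, many logical sessions, cookies stored only per session.

class NoopCookieJar:
    """A jar that never stores or sends anything, so the shared transport
    client cannot leak cookies across logical sessions."""

    def cookies_for_request(self) -> dict[str, str]:
        return {}  # never inject shared state into an outgoing request

    def store_from_response(self, cookies: dict[str, str]) -> None:
        pass  # never record response cookies globally


class LogicalSession:
    """Stands in for a crawlee `Session`: the only place cookies live."""

    def __init__(self) -> None:
        self.cookies: dict[str, str] = {}


def on_response(session: LogicalSession, observed: dict[str, str]) -> None:
    # Mirrors `session.cookies.update(self._get_cookies(response.curl))`
    # from the real change: cookies go to the session, not the client.
    session.cookies.update(observed)
```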

File tree: 2 files changed, +107 -2 lines

src/crawlee/http_clients/_curl_impersonate.py (31 additions, 0 deletions)

```diff
@@ -5,7 +5,9 @@
 from crawlee._utils.docs import docs_group
 
 try:
+    from curl_cffi import CurlInfo
     from curl_cffi.requests import AsyncSession
+    from curl_cffi.requests.cookies import Cookies, CurlMorsel
     from curl_cffi.requests.exceptions import ProxyError as CurlProxyError
     from curl_cffi.requests.exceptions import RequestException as CurlRequestError
     from curl_cffi.requests.impersonate import DEFAULT_CHROME as CURL_DEFAULT_CHROME
@@ -26,6 +28,8 @@
 if TYPE_CHECKING:
     from collections.abc import Iterable
 
+    from curl_cffi import Curl
+    from curl_cffi.requests import Request as CurlRequest
     from curl_cffi.requests import Response
 
     from crawlee import Request
@@ -35,6 +39,16 @@
     from crawlee.statistics import Statistics
 
 
+class _EmptyCookies(Cookies):
+    @override
+    def get_cookies_for_curl(self, request: CurlRequest) -> list[CurlMorsel]:
+        return []
+
+    @override
+    def update_cookies_from_curl(self, morsels: list[CurlMorsel]) -> None:
+        return None
+
+
 class _CurlImpersonateResponse:
     """Adapter class for `curl_cffi.requests.Response` to conform to the `HttpResponse` protocol."""
 
@@ -151,6 +165,10 @@ async def crawl(
             self._ignore_http_error_status_codes,
         )
 
+        if self._persist_cookies_per_session and session and response.curl:
+            response_cookies = self._get_cookies(response.curl)
+            session.cookies.update(response_cookies)
+
         request.loaded_url = response.url
 
         return HttpCrawlingResult(
@@ -194,6 +212,10 @@ async def send_request(
             self._ignore_http_error_status_codes,
         )
 
+        if self._persist_cookies_per_session and session and response.curl:
+            response_cookies = self._get_cookies(response.curl)
+            session.cookies.update(response_cookies)
+
         return _CurlImpersonateResponse(response)
 
     def _get_client(self, proxy_url: str | None) -> AsyncSession:
@@ -217,6 +239,7 @@ def _get_client(self, proxy_url: str | None) -> AsyncSession:
 
         # Create and store the new session with the specified kwargs.
         self._client_by_proxy_url[proxy_url] = AsyncSession(**kwargs)
+        self._client_by_proxy_url[proxy_url].cookies = _EmptyCookies()
 
         return self._client_by_proxy_url[proxy_url]
 
@@ -230,3 +253,11 @@ def _is_proxy_error(error: CurlRequestError) -> bool:
             return True
 
         return False
+
+    @staticmethod
+    def _get_cookies(curl: Curl) -> dict[str, str]:
+        cookies = {}
+        for curl_cookie in curl.getinfo(CurlInfo.COOKIELIST):  # type: ignore[union-attr]
+            curl_morsel = CurlMorsel.from_curl_format(curl_cookie)  # type: ignore[arg-type]
+            cookies[curl_morsel.name] = curl_morsel.value
+        return cookies
```
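
For context on `_get_cookies`: libcurl's `COOKIELIST` info exposes the handle's cookie engine contents as Netscape-format lines (seven tab-separated fields), which `CurlMorsel.from_curl_format` parses; the method keeps only the name/value pair for the session store. A hedged illustration of that format (the entry below is invented for the example):

```python
# Illustrative only: a typical Netscape-format cookie entry of the kind
# curl's COOKIELIST returns. The concrete values here are made up.
entry = 'httpbin.org\tFALSE\t/\tFALSE\t0\ta\t1'

# Fields: domain, include-subdomains flag, path, secure flag, expiry, name, value.
domain, subdomains, path, secure, expiry, name, value = entry.split('\t')
assert (name, value) == ('a', '1')  # the only fields _get_cookies() retains
```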

tests/unit/crawlers/_http/test_http_crawler.py (76 additions, 2 deletions)

```diff
@@ -9,7 +9,7 @@
 import respx
 from httpx import Response
 
-from crawlee._request import Request
+from crawlee import ConcurrencySettings, Request
 from crawlee.crawlers import HttpCrawler
 from crawlee.http_clients import CurlImpersonateHttpClient, HttpxHttpClient
 from crawlee.sessions import SessionPool
@@ -183,7 +183,15 @@ async def test_handles_server_error(
     assert server['500_endpoint'].called
 
 
-async def test_stores_cookies(httpbin: URL) -> None:
+@pytest.mark.parametrize(
+    'http_client_class',
+    [
+        pytest.param(CurlImpersonateHttpClient, id='curl'),
+        pytest.param(HttpxHttpClient, id='httpx'),
+    ],
+)
+async def test_stores_cookies(http_client_class: type[BaseHttpClient], httpbin: URL) -> None:
+    http_client = http_client_class()
     visit = Mock()
     track_session_usage = Mock()
 
@@ -192,6 +200,7 @@ async def test_stores_cookies(httpbin: URL) -> None:
         # /cookies/set might redirect us to a page that we can't access - no problem, we only care about cookies
         ignore_http_error_status_codes=[401],
         session_pool=session_pool,
+        http_client=http_client,
     )
 
     @crawler.router.default_handler
@@ -410,3 +419,68 @@ def mark_request_execution(request: Request) -> Response:  # noqa: ARG001 # Unus
     await crawler.run([Request.from_url(url=test_url)])
 
     assert execution_order == ['pre-navigation-hook 1', 'pre-navigation-hook 2', 'request', 'final handler']
+
+
+@pytest.mark.parametrize(
+    'http_client_class',
+    [
+        pytest.param(CurlImpersonateHttpClient, id='curl'),
+        pytest.param(HttpxHttpClient, id='httpx'),
+    ],
+)
+async def test_isolation_cookies(http_client_class: type[BaseHttpClient], httpbin: URL) -> None:
+    http_client = http_client_class()
+    sessions_ids: list[str] = []
+    sessions_cookies: dict[str, dict[str, str]] = {}
+    response_cookies: dict[str, dict[str, str]] = {}
+
+    crawler = HttpCrawler(
+        session_pool=SessionPool(max_pool_size=1),
+        http_client=http_client,
+        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+    )
+
+    @crawler.router.default_handler
+    async def handler(context: HttpCrawlingContext) -> None:
+        if not context.session:
+            return
+
+        sessions_ids.append(context.session.id)
+
+        if context.request.unique_key not in {'1', '2'}:
+            return
+
+        sessions_cookies[context.session.id] = context.session.cookies
+        response_data = json.loads(context.http_response.read())
+        response_cookies[context.session.id] = response_data.get('cookies')
+
+        if context.request.user_data.get('retire_session'):
+            context.session.retire()
+
+    await crawler.run(
+        [
+            # The first request sets the cookie in the session
+            str(httpbin.with_path('/cookies/set').extend_query(a=1)),
+            # With the second request, we check the cookies in the session and set retire
+            Request.from_url(str(httpbin.with_path('/cookies')), unique_key='1', user_data={'retire_session': True}),
+            # The third request is made with a new session to make sure it does not use another session's cookies
+            Request.from_url(str(httpbin.with_path('/cookies')), unique_key='2'),
+        ]
+    )
+
+    assert len(sessions_cookies) == 2
+    assert len(response_cookies) == 2
+
+    assert sessions_ids[0] == sessions_ids[1]
+
+    cookie_session_id = sessions_ids[0]
+    clean_session_id = sessions_ids[2]
+
+    assert cookie_session_id != clean_session_id
+
+    # The initiated cookies must match in both the response and the session store
+    assert sessions_cookies[cookie_session_id] == response_cookies[cookie_session_id] == {'a': '1'}
+
+    # For a clean session, the cookie should not be in the session store or in the response
+    # This way we can be sure that no cookies are being leaked through the http client
+    assert sessions_cookies[clean_session_id] == response_cookies[clean_session_id] == {}
```
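
A note on the test design: `SessionPool(max_pool_size=1)` combined with `ConcurrencySettings(max_concurrency=1)` makes session assignment deterministic, so the first two requests share one session and, once that session is retired, the third request is guaranteed a fresh one. The empty cookie set on the fresh session is the actual regression check: with the old shared `AsyncSession` jar, the `a=1` cookie would have leaked into the third response.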
