Commit c803a97

refactor!: Replace HttpxHttpClient with ImpitHttpClient as default HTTP client (#1307)
### Description

- Replace `HttpxHttpClient` with `ImpitHttpClient` as the default HTTP client

### Issues

- Closes: #1079
1 parent d02fd2d commit c803a97

File tree

12 files changed: 105 additions & 91 deletions

docs/guides/code_examples/request_loaders/sitemap_example.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -1,13 +1,13 @@
 import asyncio
 import re
 
-from crawlee.http_clients import HttpxHttpClient
+from crawlee.http_clients import ImpitHttpClient
 from crawlee.request_loaders import SitemapRequestLoader
 
 
 async def main() -> None:
     # Create an HTTP client for fetching sitemaps
-    async with HttpxHttpClient() as http_client:
+    async with ImpitHttpClient() as http_client:
         # Create a sitemap request loader with URL filtering
         sitemap_loader = SitemapRequestLoader(
             sitemap_urls=['https://crawlee.dev/sitemap.xml'],
```
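The hunk only shows the top of the docs example. A minimal sketch of how the rest of the updated example might read; the `http_client` and `include` keyword arguments and the `fetch_next_request` loop are assumptions beyond what the diff shows:

```python
import asyncio
import re

from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import SitemapRequestLoader


async def main() -> None:
    # Create an HTTP client for fetching sitemaps.
    async with ImpitHttpClient() as http_client:
        # Create a sitemap request loader with URL filtering.
        sitemap_loader = SitemapRequestLoader(
            sitemap_urls=['https://crawlee.dev/sitemap.xml'],
            http_client=http_client,  # assumed keyword; the hunk ends before this line
            include=[re.compile(r'/docs/')],  # assumed filter argument, for illustration only
        )

        # Drain the loader; `fetch_next_request` comes from the request loader interface.
        while request := await sitemap_loader.fetch_next_request():
            print(f'Loaded from sitemap: {request.url}')


if __name__ == '__main__':
    asyncio.run(main())
```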

docs/guides/http_clients.mdx

Lines changed: 7 additions & 7 deletions
````diff
@@ -36,24 +36,24 @@ class HttpClient {
 %% Specific classes
 %% ========================
 
+class ImpitHttpClient
+
 class HttpxHttpClient
 
 class CurlImpersonateHttpClient
 
-class ImpitHttpClient
-
 %% ========================
 %% Inheritance arrows
 %% ========================
 
+HttpClient --|> ImpitHttpClient
 HttpClient --|> HttpxHttpClient
 HttpClient --|> CurlImpersonateHttpClient
-HttpClient --|> ImpitHttpClient
 ```
 
 ## Switching between HTTP clients
 
-Crawlee currently provides three main HTTP clients: <ApiLink to="class/HttpxHttpClient">`HttpxHttpClient`</ApiLink>, which uses the `httpx` library, <ApiLink to="class/CurlImpersonateHttpClient">`CurlImpersonateHttpClient`</ApiLink>, which uses the `curl-cffi` library, and <ApiLink to="class/ImpitHttpClient">`ImpitHttpClient`</ApiLink>, which uses the `impit` library. You can switch between them by setting the `http_client` parameter when initializing a crawler class. The default HTTP client is <ApiLink to="class/HttpxHttpClient">`HttpxHttpClient`</ApiLink>.
+Crawlee currently provides three main HTTP clients: <ApiLink to="class/ImpitHttpClient">`ImpitHttpClient`</ApiLink>, which uses the `impit` library, <ApiLink to="class/HttpxHttpClient">`HttpxHttpClient`</ApiLink>, which uses the `httpx` library with `browserforge` for custom HTTP headers and fingerprints, and <ApiLink to="class/CurlImpersonateHttpClient">`CurlImpersonateHttpClient`</ApiLink>, which uses the `curl-cffi` library. You can switch between them by setting the `http_client` parameter when initializing a crawler class. The default HTTP client is <ApiLink to="class/ImpitHttpClient">`ImpitHttpClient`</ApiLink>. For more details on anti-blocking features, see our [avoid getting blocked guide](./avoid-blocking).
 
 Below are examples of how to configure the HTTP client for the <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink>:
 
@@ -77,18 +77,18 @@ Below are examples of how to configure the HTTP client for the <ApiLink to="clas
 
 ## Installation requirements
 
-Since <ApiLink to="class/HttpxHttpClient">`HttpxHttpClient`</ApiLink> is the default HTTP client, it's included with the base Crawlee installation and requires no additional packages.
+Since <ApiLink to="class/ImpitHttpClient">`ImpitHttpClient`</ApiLink> is the default HTTP client, it's included with the base Crawlee installation and requires no additional packages.
 
 For <ApiLink to="class/CurlImpersonateHttpClient">`CurlImpersonateHttpClient`</ApiLink>, you need to install Crawlee with the `curl-impersonate` extra:
 
 ```sh
 python -m pip install 'crawlee[curl-impersonate]'
 ```
 
-For <ApiLink to="class/ImpitHttpClient">`ImpitHttpClient`</ApiLink>, you need to install Crawlee with the `impit` extra:
+For <ApiLink to="class/HttpxHttpClient">`HttpxHttpClient`</ApiLink>, you need to install Crawlee with the `httpx` extra:
 
 ```sh
-python -m pip install 'crawlee[impit]'
+python -m pip install 'crawlee[httpx]'
 ```
 
 Alternatively, you can install all available extras to get access to all HTTP clients and features:
````
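The updated guide points to examples of configuring the client on `ParselCrawler`. A minimal sketch of what that looks like with the new default; the handler body and target URL are illustrative only:

```python
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import ImpitHttpClient


async def main() -> None:
    # Passing the client explicitly; omitting `http_client` now yields the same
    # ImpitHttpClient default after this change.
    crawler = ParselCrawler(http_client=ImpitHttpClient())

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```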

pyproject.toml

Lines changed: 7 additions & 7 deletions
```diff
@@ -33,11 +33,9 @@ keywords = [
     "scraping",
 ]
 dependencies = [
-    "apify_fingerprint_datapoints>=0.0.2",
-    "browserforge>=1.2.3",
     "cachetools>=5.5.0",
     "colorama>=0.4.0",
-    "httpx[brotli,http2,zstd]>=0.27.0",
+    "impit>=0.4.2",
     "more-itertools>=10.2.0",
     "protego>=0.5.0",
     "psutil>=6.0.0",
@@ -52,18 +50,20 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,parsel,playwright,otel]"]
+all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel]"]
 adaptive-crawler = [
     "jaro-winkler>=2.0.3",
     "playwright>=1.27.0",
     "scikit-learn>=1.6.0",
+    "apify_fingerprint_datapoints>=0.0.2",
+    "browserforge>=1.2.3"
 ]
 beautifulsoup = ["beautifulsoup4[lxml]>=4.12.0", "html5lib>=1.0"]
-cli = ["cookiecutter>=2.6.0", "inquirer>=3.3.0", "rich>=13.9.0", "typer>=0.12.0","impit>=0.4.0"]
+cli = ["cookiecutter>=2.6.0", "inquirer>=3.3.0", "rich>=13.9.0", "typer>=0.12.0"]
 curl-impersonate = ["curl-cffi>=0.9.0"]
-impit = ["impit>=0.4.0"]
+httpx = ["httpx[brotli,http2,zstd]>=0.27.0", "apify_fingerprint_datapoints>=0.0.2", "browserforge>=1.2.3"]
 parsel = ["parsel>=1.10.0"]
-playwright = ["playwright>=1.27.0"]
+playwright = ["playwright>=1.27.0", "apify_fingerprint_datapoints>=0.0.2", "browserforge>=1.2.3"]
 otel = [
     "opentelemetry-api>=1.34.1",
     "opentelemetry-distro[otlp]>=0.54",
```

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -56,7 +56,7 @@
     UserDefinedErrorHandlerError,
 )
 from crawlee.events._types import Event, EventCrawlerStatusData
-from crawlee.http_clients import HttpxHttpClient
+from crawlee.http_clients import ImpitHttpClient
 from crawlee.router import Router
 from crawlee.sessions import SessionPool
 from crawlee.statistics import Statistics, StatisticsState
@@ -367,7 +367,7 @@ def __init__(
             set(ignore_http_error_status_codes) if ignore_http_error_status_codes else set()
         )
 
-        self._http_client = http_client or HttpxHttpClient()
+        self._http_client = http_client or ImpitHttpClient()
 
         # Request router setup
         self._router: Router[TCrawlingContext] | None = None
```
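A short sketch of what the new fallback means when constructing a crawler without an explicit client; the check against the private `_http_client` attribute is illustrative only:

```python
from crawlee.crawlers import HttpCrawler
from crawlee.http_clients import ImpitHttpClient

# Constructed without an explicit `http_client`, the crawler now falls back to
# ImpitHttpClient rather than HttpxHttpClient.
crawler = HttpCrawler()

# Illustrative check against the private attribute assigned in __init__ above.
assert isinstance(crawler._http_client, ImpitHttpClient)
```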

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -21,7 +21,7 @@
 from crawlee.errors import SessionError
 from crawlee.fingerprint_suite import DefaultFingerprintGenerator, FingerprintGenerator, HeaderGeneratorOptions
 from crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type
-from crawlee.http_clients import HttpxHttpClient
+from crawlee.http_clients import ImpitHttpClient
 from crawlee.sessions._cookies import PlaywrightCookieParam
 from crawlee.statistics import StatisticsState
 
@@ -473,7 +473,7 @@ async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:
         Args:
             url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file.
         """
-        http_client = HttpxHttpClient() if isinstance(self._http_client, PlaywrightHttpClient) else self._http_client
+        http_client = ImpitHttpClient() if isinstance(self._http_client, PlaywrightHttpClient) else self._http_client
 
         return await RobotsTxtFile.find(url, http_client=http_client)
```
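For context, `_find_txt_file_for_url` runs when robots.txt support is enabled on a Playwright crawler. A rough sketch of the code path that now relies on the Impit fallback; `respect_robots_txt_file` is the assumed public switch for this behavior:

```python
from crawlee.crawlers import PlaywrightCrawler

# When the crawler's own client is the Playwright-based one, robots.txt files
# are fetched with a plain HTTP client instead; after this commit that fallback
# is ImpitHttpClient rather than HttpxHttpClient.
crawler = PlaywrightCrawler(respect_robots_txt_file=True)
```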

src/crawlee/http_clients/__init__.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -3,7 +3,7 @@
 
 # These imports have only mandatory dependencies, so they are imported directly.
 from ._base import HttpClient, HttpCrawlingResult, HttpResponse
-from ._httpx import HttpxHttpClient
+from ._impit import ImpitHttpClient
 
 _install_import_hook(__name__)
 
@@ -12,8 +12,8 @@
 with _try_import(__name__, 'CurlImpersonateHttpClient'):
     from ._curl_impersonate import CurlImpersonateHttpClient
 
-with _try_import(__name__, 'ImpitHttpClient'):
-    from ._impit import ImpitHttpClient
+with _try_import(__name__, 'HttpxHttpClient'):
+    from ._httpx import HttpxHttpClient
 
 
 __all__ = [
```
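In practice, the swap means the Impit client is importable from a base install, while the httpx one now sits behind the `httpx` extra. A hedged sketch of what that looks like from user code; the exact exception raised by the lazy import hook when the extra is missing is an assumption:

```python
# Always available with the base install after this change.
from crawlee.http_clients import ImpitHttpClient

client = ImpitHttpClient()

# HttpxHttpClient is now guarded by the optional import hook; without
# `pip install 'crawlee[httpx]'` this import is expected to fail.
try:
    from crawlee.http_clients import HttpxHttpClient
except ImportError:
    HttpxHttpClient = None  # fall back, or tell the user to install the extra
```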

src/crawlee/http_clients/_impit.py

Lines changed: 4 additions & 5 deletions
```diff
@@ -102,7 +102,6 @@ def __init__(
             persist_cookies_per_session: Whether to persist cookies per HTTP session.
             http3: Whether to enable HTTP/3 support.
             verify: SSL certificates used to verify the identity of requested hosts.
-            header_generator: Header generator instance to use for generating common headers.
             browser: Browser to impersonate.
             async_client_kwargs: Additional keyword arguments for `impit.AsyncClient`.
         """
@@ -135,7 +134,7 @@
                 content=request.payload,
                 headers=dict(request.headers) if request.headers else None,
             )
-        except (TransportError, HTTPError) as exc:  # type: ignore[misc] # waiting for merge https://github.com/apify/impit/pull/207
+        except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
             raise
@@ -167,7 +166,7 @@
             response = await client.request(
                 method=method, url=url, content=payload, headers=dict(headers) if headers else None
             )
-        except (TransportError, HTTPError) as exc:  # type: ignore[misc] # waiting for merge https://github.com/apify/impit/pull/207
+        except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
             raise
@@ -194,7 +193,7 @@
             url=url,
             content=payload,
             headers=dict(headers) if headers else None,
-            stream=True,  # type: ignore[call-arg] # waiting for merge https://github.com/apify/impit/pull/207
+            stream=True,
         )
         try:
             yield _ImpitResponse(response)
@@ -233,7 +232,7 @@ def _get_client(self, proxy_url: str | None, cookie_jar: CookieJar | None) -> As
         return client
 
     @staticmethod
-    def _is_proxy_error(error: RuntimeError) -> bool:
+    def _is_proxy_error(error: HTTPError) -> bool:
        """Determine whether the given error is related to a proxy issue.
 
        Check if the error message contains known proxy-related error keywords.
```
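With `header_generator` removed from the docstring, configuration goes through the remaining options. A rough sketch of constructing the client from the parameters the docstring lists; the concrete values (such as `browser='firefox'`) are illustrative assumptions:

```python
from crawlee.crawlers import HttpCrawler
from crawlee.http_clients import ImpitHttpClient

# Options taken from the constructor docstring shown in this diff; values are examples only.
client = ImpitHttpClient(
    persist_cookies_per_session=True,
    http3=True,
    browser='firefox',
)

crawler = HttpCrawler(http_client=client)
```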

src/crawlee/project_template/cookiecutter.json

Lines changed: 1 addition & 1 deletion
```diff
@@ -3,7 +3,7 @@
     "__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}",
     "crawler_type": ["beautifulsoup", "parsel", "playwright", "playwright-camoufox"],
     "__crawler_type": "{{ cookiecutter.crawler_type|lower|replace('-', '_') }}",
-    "http_client": ["httpx", "curl-impersonate", "impit"],
+    "http_client": ["impit", "httpx", "curl-impersonate"],
     "package_manager": ["poetry", "pip", "uv"],
     "enable_apify_integration": false,
     "install_project": true,
```

tests/unit/crawlers/_http/test_http_crawler.py

Lines changed: 9 additions & 0 deletions
```diff
@@ -512,7 +512,16 @@ async def handler(context: HttpCrawlingContext) -> None:
         'http_only': False,
     }
 
+    # Some clients may ignore `.` at the beginning of the domain
+    # https://www.rfc-editor.org/rfc/rfc6265#section-4.1.2.3
     assert session_cookies_dict['domain'] == {
+        'name': 'domain',
+        'value': '6',
+        'domain': {server_url.host},
+        'path': '/',
+        'secure': False,
+        'http_only': False,
+    } or {
         'name': 'domain',
         'value': '6',
         'domain': f'.{server_url.host}',
```
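The new comment points at RFC 6265, section 4.1.2.3, where a leading dot in the Domain attribute carries no meaning. A standalone sketch (not part of the test suite) of why the assertion now accepts both spellings:

```python
def normalize_cookie_domain(domain: str) -> str:
    # Per RFC 6265 section 4.1.2.3, a leading '.' in the Domain attribute is ignored.
    return domain[1:] if domain.startswith('.') else domain


# Different HTTP clients may report either spelling for the same cookie.
assert normalize_cookie_domain('.example.com') == normalize_cookie_domain('example.com')
```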

tests/unit/crawlers/_playwright/test_playwright_crawler.py

Lines changed: 2 additions & 4 deletions
```diff
@@ -31,7 +31,7 @@
 from crawlee.fingerprint_suite._browserforge_adapter import get_available_header_values
 from crawlee.fingerprint_suite._consts import BROWSER_TYPE_HEADER_KEYWORD
 from crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type
-from crawlee.http_clients import HttpxHttpClient
+from crawlee.http_clients import ImpitHttpClient
 from crawlee.proxy_configuration import ProxyConfiguration
 from crawlee.sessions import Session, SessionPool
 from crawlee.statistics import Statistics
@@ -694,9 +694,7 @@ async def test_send_request_with_client(server_url: URL) -> None:
     """Check that the persist context works with fingerprints."""
     check_data: dict[str, Any] = {}
 
-    crawler = PlaywrightCrawler(
-        http_client=HttpxHttpClient(header_generator=None, headers={'user-agent': 'My User-Agent'})
-    )
+    crawler = PlaywrightCrawler(http_client=ImpitHttpClient(headers={'user-agent': 'My User-Agent'}))
 
     @crawler.router.default_handler
     async def request_handler(context: PlaywrightCrawlingContext) -> None:
```
