Commit c803a97

refactor!: Replace HttpxHttpClient with ImpitHttpClient as default HTTP client (#1307)
### Description

- Replace `HttpxHttpClient` with `ImpitHttpClient` as the default HTTP client

### Issues

- Closes: #1079
1 parent d02fd2d commit c803a97

File tree

12 files changed: 105 additions & 91 deletions

docs/guides/code_examples/request_loaders/sitemap_example.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -1,13 +1,13 @@
 import asyncio
 import re
 
-from crawlee.http_clients import HttpxHttpClient
+from crawlee.http_clients import ImpitHttpClient
 from crawlee.request_loaders import SitemapRequestLoader
 
 
 async def main() -> None:
     # Create an HTTP client for fetching sitemaps
-    async with HttpxHttpClient() as http_client:
+    async with ImpitHttpClient() as http_client:
         # Create a sitemap request loader with URL filtering
         sitemap_loader = SitemapRequestLoader(
             sitemap_urls=['https://crawlee.dev/sitemap.xml'],
```
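The hunk only shows the top of the docs example. A minimal sketch of how the rest of the updated example might read; the `http_client` and `include` keyword arguments and the `fetch_next_request` loop are assumptions beyond what the diff shows:

```python
import asyncio
import re

from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import SitemapRequestLoader


async def main() -> None:
    # Create an HTTP client for fetching sitemaps.
    async with ImpitHttpClient() as http_client:
        # Create a sitemap request loader with URL filtering.
        sitemap_loader = SitemapRequestLoader(
            sitemap_urls=['https://crawlee.dev/sitemap.xml'],
            http_client=http_client,  # assumed keyword; the hunk ends before this line
            include=[re.compile(r'/docs/')],  # assumed filter argument, for illustration only
        )

        # Drain the loader; `fetch_next_request` comes from the request loader interface.
        while request := await sitemap_loader.fetch_next_request():
            print(f'Loaded from sitemap: {request.url}')


if __name__ == '__main__':
    asyncio.run(main())
```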

docs/guides/http_clients.mdx

Lines changed: 7 additions & 7 deletions
````diff
@@ -36,24 +36,24 @@ class HttpClient {
 %% Specific classes
 %% ========================
 
+class ImpitHttpClient
+
 class HttpxHttpClient
 
 class CurlImpersonateHttpClient
 
-class ImpitHttpClient
-
 %% ========================
 %% Inheritance arrows
 %% ========================
 
+HttpClient --|> ImpitHttpClient
 HttpClient --|> HttpxHttpClient
 HttpClient --|> CurlImpersonateHttpClient
-HttpClient --|> ImpitHttpClient
 ```
 
 ## Switching between HTTP clients
 
-Crawlee currently provides three main HTTP clients: <ApiLink to="class/HttpxHttpClient">`HttpxHttpClient`</ApiLink>, which uses the `httpx` library, <ApiLink to="class/CurlImpersonateHttpClient">`CurlImpersonateHttpClient`</ApiLink>, which uses the `curl-cffi` library, and <ApiLink to="class/ImpitHttpClient">`ImpitHttpClient`</ApiLink>, which uses the `impit` library. You can switch between them by setting the `http_client` parameter when initializing a crawler class. The default HTTP client is <ApiLink to="class/HttpxHttpClient">`HttpxHttpClient`</ApiLink>.
+Crawlee currently provides three main HTTP clients: <ApiLink to="class/ImpitHttpClient">`ImpitHttpClient`</ApiLink>, which uses the `impit` library, <ApiLink to="class/HttpxHttpClient">`HttpxHttpClient`</ApiLink>, which uses the `httpx` library with `browserforge` for custom HTTP headers and fingerprints, and <ApiLink to="class/CurlImpersonateHttpClient">`CurlImpersonateHttpClient`</ApiLink>, which uses the `curl-cffi` library. You can switch between them by setting the `http_client` parameter when initializing a crawler class. The default HTTP client is <ApiLink to="class/ImpitHttpClient">`ImpitHttpClient`</ApiLink>. For more details on anti-blocking features, see our [avoid getting blocked guide](./avoid-blocking).
 
 Below are examples of how to configure the HTTP client for the <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink>:
 
@@ -77,18 +77,18 @@ Below are examples of how to configure the HTTP client for the <ApiLink to="clas
 
 ## Installation requirements
 
-Since <ApiLink to="class/HttpxHttpClient">`HttpxHttpClient`</ApiLink> is the default HTTP client, it's included with the base Crawlee installation and requires no additional packages.
+Since <ApiLink to="class/ImpitHttpClient">`ImpitHttpClient`</ApiLink> is the default HTTP client, it's included with the base Crawlee installation and requires no additional packages.
 
 For <ApiLink to="class/CurlImpersonateHttpClient">`CurlImpersonateHttpClient`</ApiLink>, you need to install Crawlee with the `curl-impersonate` extra:
 
 ```sh
 python -m pip install 'crawlee[curl-impersonate]'
 ```
 
-For <ApiLink to="class/ImpitHttpClient">`ImpitHttpClient`</ApiLink>, you need to install Crawlee with the `impit` extra:
+For <ApiLink to="class/HttpxHttpClient">`HttpxHttpClient`</ApiLink>, you need to install Crawlee with the `httpx` extra:
 
 ```sh
-python -m pip install 'crawlee[impit]'
+python -m pip install 'crawlee[httpx]'
 ```
 
 Alternatively, you can install all available extras to get access to all HTTP clients and features:
````
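The updated guide points to examples of configuring the client on `ParselCrawler`. A minimal sketch of what that looks like with the new default; the handler body and target URL are illustrative only:

```python
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import ImpitHttpClient


async def main() -> None:
    # Passing the client explicitly; omitting `http_client` now yields the same
    # ImpitHttpClient default after this change.
    crawler = ParselCrawler(http_client=ImpitHttpClient())

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```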

pyproject.toml

Lines changed: 7 additions & 7 deletions
```diff
@@ -33,11 +33,9 @@ keywords = [
     "scraping",
 ]
 dependencies = [
-    "apify_fingerprint_datapoints>=0.0.2",
-    "browserforge>=1.2.3",
     "cachetools>=5.5.0",
     "colorama>=0.4.0",
-    "httpx[brotli,http2,zstd]>=0.27.0",
+    "impit>=0.4.2",
     "more-itertools>=10.2.0",
     "protego>=0.5.0",
     "psutil>=6.0.0",
@@ -52,18 +50,20 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,parsel,playwright,otel]"]
+all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel]"]
 adaptive-crawler = [
     "jaro-winkler>=2.0.3",
     "playwright>=1.27.0",
     "scikit-learn>=1.6.0",
+    "apify_fingerprint_datapoints>=0.0.2",
+    "browserforge>=1.2.3"
 ]
 beautifulsoup = ["beautifulsoup4[lxml]>=4.12.0", "html5lib>=1.0"]
-cli = ["cookiecutter>=2.6.0", "inquirer>=3.3.0", "rich>=13.9.0", "typer>=0.12.0","impit>=0.4.0"]
+cli = ["cookiecutter>=2.6.0", "inquirer>=3.3.0", "rich>=13.9.0", "typer>=0.12.0"]
 curl-impersonate = ["curl-cffi>=0.9.0"]
-impit = ["impit>=0.4.0"]
+httpx = ["httpx[brotli,http2,zstd]>=0.27.0", "apify_fingerprint_datapoints>=0.0.2", "browserforge>=1.2.3"]
 parsel = ["parsel>=1.10.0"]
-playwright = ["playwright>=1.27.0"]
+playwright = ["playwright>=1.27.0", "apify_fingerprint_datapoints>=0.0.2", "browserforge>=1.2.3"]
 otel = [
     "opentelemetry-api>=1.34.1",
     "opentelemetry-distro[otlp]>=0.54",
```

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -56,7 +56,7 @@
     UserDefinedErrorHandlerError,
 )
 from crawlee.events._types import Event, EventCrawlerStatusData
-from crawlee.http_clients import HttpxHttpClient
+from crawlee.http_clients import ImpitHttpClient
 from crawlee.router import Router
 from crawlee.sessions import SessionPool
 from crawlee.statistics import Statistics, StatisticsState
@@ -367,7 +367,7 @@ def __init__(
             set(ignore_http_error_status_codes) if ignore_http_error_status_codes else set()
         )
 
-        self._http_client = http_client or HttpxHttpClient()
+        self._http_client = http_client or ImpitHttpClient()
 
         # Request router setup
         self._router: Router[TCrawlingContext] | None = None
```
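A short sketch of what the new fallback means when constructing a crawler without an explicit client; the check against the private `_http_client` attribute is illustrative only:

```python
from crawlee.crawlers import HttpCrawler
from crawlee.http_clients import ImpitHttpClient

# Constructed without an explicit `http_client`, the crawler now falls back to
# ImpitHttpClient rather than HttpxHttpClient.
crawler = HttpCrawler()

# Illustrative check against the private attribute assigned in __init__ above.
assert isinstance(crawler._http_client, ImpitHttpClient)
```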

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -21,7 +21,7 @@
 from crawlee.errors import SessionError
 from crawlee.fingerprint_suite import DefaultFingerprintGenerator, FingerprintGenerator, HeaderGeneratorOptions
 from crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type
-from crawlee.http_clients import HttpxHttpClient
+from crawlee.http_clients import ImpitHttpClient
 from crawlee.sessions._cookies import PlaywrightCookieParam
 from crawlee.statistics import StatisticsState
 
@@ -473,7 +473,7 @@ async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:
         Args:
             url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file.
         """
-        http_client = HttpxHttpClient() if isinstance(self._http_client, PlaywrightHttpClient) else self._http_client
+        http_client = ImpitHttpClient() if isinstance(self._http_client, PlaywrightHttpClient) else self._http_client
 
         return await RobotsTxtFile.find(url, http_client=http_client)
```
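For context, `_find_txt_file_for_url` runs when robots.txt support is enabled on a Playwright crawler. A rough sketch of the code path that now relies on the Impit fallback; `respect_robots_txt_file` is the assumed public switch for this behavior:

```python
from crawlee.crawlers import PlaywrightCrawler

# When the crawler's own client is the Playwright-based one, robots.txt files
# are fetched with a plain HTTP client instead; after this commit that fallback
# is ImpitHttpClient rather than HttpxHttpClient.
crawler = PlaywrightCrawler(respect_robots_txt_file=True)
```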

src/crawlee/http_clients/__init__.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -3,7 +3,7 @@
 
 # These imports have only mandatory dependencies, so they are imported directly.
 from ._base import HttpClient, HttpCrawlingResult, HttpResponse
-from ._httpx import HttpxHttpClient
+from ._impit import ImpitHttpClient
 
 _install_import_hook(__name__)
 
@@ -12,8 +12,8 @@
 with _try_import(__name__, 'CurlImpersonateHttpClient'):
     from ._curl_impersonate import CurlImpersonateHttpClient
 
-with _try_import(__name__, 'ImpitHttpClient'):
-    from ._impit import ImpitHttpClient
+with _try_import(__name__, 'HttpxHttpClient'):
+    from ._httpx import HttpxHttpClient
 
 
 __all__ = [
```
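In practice, the swap means the Impit client is importable from a base install, while the httpx one now sits behind the `httpx` extra. A hedged sketch of what that looks like from user code; the exact exception raised by the lazy import hook when the extra is missing is an assumption:

```python
# Always available with the base install after this change.
from crawlee.http_clients import ImpitHttpClient

client = ImpitHttpClient()

# HttpxHttpClient is now guarded by the optional import hook; without
# `pip install 'crawlee[httpx]'` this import is expected to fail.
try:
    from crawlee.http_clients import HttpxHttpClient
except ImportError:
    HttpxHttpClient = None  # fall back, or tell the user to install the extra
```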

src/crawlee/http_clients/_impit.py

Lines changed: 4 additions & 5 deletions
```diff
@@ -102,7 +102,6 @@ def __init__(
             persist_cookies_per_session: Whether to persist cookies per HTTP session.
             http3: Whether to enable HTTP/3 support.
             verify: SSL certificates used to verify the identity of requested hosts.
-            header_generator: Header generator instance to use for generating common headers.
             browser: Browser to impersonate.
             async_client_kwargs: Additional keyword arguments for `impit.AsyncClient`.
         """
@@ -135,7 +134,7 @@
                 content=request.payload,
                 headers=dict(request.headers) if request.headers else None,
             )
-        except (TransportError, HTTPError) as exc:  # type: ignore[misc] # waiting for merge https://github.com/apify/impit/pull/207
+        except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
             raise
@@ -167,7 +166,7 @@
             response = await client.request(
                 method=method, url=url, content=payload, headers=dict(headers) if headers else None
             )
-        except (TransportError, HTTPError) as exc:  # type: ignore[misc] # waiting for merge https://github.com/apify/impit/pull/207
+        except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
             raise
@@ -194,7 +193,7 @@
             url=url,
             content=payload,
             headers=dict(headers) if headers else None,
-            stream=True,  # type: ignore[call-arg] # waiting for merge https://github.com/apify/impit/pull/207
+            stream=True,
         )
         try:
             yield _ImpitResponse(response)
@@ -233,7 +232,7 @@ def _get_client(self, proxy_url: str | None, cookie_jar: CookieJar | None) -> As
         return client
 
     @staticmethod
-    def _is_proxy_error(error: RuntimeError) -> bool:
+    def _is_proxy_error(error: HTTPError) -> bool:
        """Determine whether the given error is related to a proxy issue.
 
        Check if the error message contains known proxy-related error keywords.
```
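With `header_generator` removed from the docstring, configuration goes through the remaining options. A rough sketch of constructing the client from the parameters the docstring lists; the concrete values (such as `browser='firefox'`) are illustrative assumptions:

```python
from crawlee.crawlers import HttpCrawler
from crawlee.http_clients import ImpitHttpClient

# Options taken from the constructor docstring shown in this diff; values are examples only.
client = ImpitHttpClient(
    persist_cookies_per_session=True,
    http3=True,
    browser='firefox',
)

crawler = HttpCrawler(http_client=client)
```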

src/crawlee/project_template/cookiecutter.json

Lines changed: 1 addition & 1 deletion
```diff
@@ -3,7 +3,7 @@
     "__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}",
     "crawler_type": ["beautifulsoup", "parsel", "playwright", "playwright-camoufox"],
     "__crawler_type": "{{ cookiecutter.crawler_type|lower|replace('-', '_') }}",
-    "http_client": ["httpx", "curl-impersonate", "impit"],
+    "http_client": ["impit", "httpx", "curl-impersonate"],
     "package_manager": ["poetry", "pip", "uv"],
     "enable_apify_integration": false,
     "install_project": true,
```

tests/unit/crawlers/_http/test_http_crawler.py

Lines changed: 9 additions & 0 deletions
```diff
@@ -512,7 +512,16 @@ async def handler(context: HttpCrawlingContext) -> None:
         'http_only': False,
     }
 
+    # Some clients may ignore `.` at the beginning of the domain
+    # https://www.rfc-editor.org/rfc/rfc6265#section-4.1.2.3
     assert session_cookies_dict['domain'] == {
+        'name': 'domain',
+        'value': '6',
+        'domain': {server_url.host},
+        'path': '/',
+        'secure': False,
+        'http_only': False,
+    } or {
         'name': 'domain',
         'value': '6',
         'domain': f'.{server_url.host}',
```
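The new comment points at RFC 6265, section 4.1.2.3, where a leading dot in the Domain attribute carries no meaning. A standalone sketch (not part of the test suite) of why the assertion now accepts both spellings:

```python
def normalize_cookie_domain(domain: str) -> str:
    # Per RFC 6265 section 4.1.2.3, a leading '.' in the Domain attribute is ignored.
    return domain[1:] if domain.startswith('.') else domain


# Different HTTP clients may report either spelling for the same cookie.
assert normalize_cookie_domain('.example.com') == normalize_cookie_domain('example.com')
```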

tests/unit/crawlers/_playwright/test_playwright_crawler.py

Lines changed: 2 additions & 4 deletions
```diff
@@ -31,7 +31,7 @@
 from crawlee.fingerprint_suite._browserforge_adapter import get_available_header_values
 from crawlee.fingerprint_suite._consts import BROWSER_TYPE_HEADER_KEYWORD
 from crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type
-from crawlee.http_clients import HttpxHttpClient
+from crawlee.http_clients import ImpitHttpClient
 from crawlee.proxy_configuration import ProxyConfiguration
 from crawlee.sessions import Session, SessionPool
 from crawlee.statistics import Statistics
@@ -694,9 +694,7 @@ async def test_send_request_with_client(server_url: URL) -> None:
     """Check that the persist context works with fingerprints."""
     check_data: dict[str, Any] = {}
 
-    crawler = PlaywrightCrawler(
-        http_client=HttpxHttpClient(header_generator=None, headers={'user-agent': 'My User-Agent'})
-    )
+    crawler = PlaywrightCrawler(http_client=ImpitHttpClient(headers={'user-agent': 'My User-Agent'}))
 
     @crawler.router.default_handler
     async def request_handler(context: PlaywrightCrawlingContext) -> None:
```
