Skip to content

Commit a0c786c

Browse files
authored
Support huggingface_hub v0.x and v1.x (#7783)
* Let's test like this * code quality * add back requests * install transformers from source * will it work? * to remove later: don't fail fast * don't fail fast * fix test fixture * fix OfflineModeIsEnabled test * huggingface_hub 1.0.0 even if deps latest * will be broken but better * pip list in CI * revert branch * install latest only in latest tests * offline * get back to normal * better * ofc * why not * as before * this time is good * fix yaml format * system * fix import in o.x * :/ * Bump minimal version to 0.25.0 * x-compatible offline helper * code quality * fix utils tests * fixing last bits * x-version compat * final commit
1 parent c412a6f commit a0c786c

File tree

13 files changed

+139
-121
lines changed

.github/workflows/ci.yml

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,10 +72,16 @@ jobs:
7272
run: uv pip install --system "datasets[tests] @ ."
7373
- name: Install dependencies (latest versions)
7474
if: ${{ matrix.deps_versions == 'deps-latest' }}
75-
run: uv pip install --system --upgrade pyarrow huggingface-hub "dill<0.3.9"
75+
run: |
76+
uv pip install --system --upgrade pyarrow huggingface-hub "dill<0.3.9"
77+
# TODO: remove once transformers v5 / huggingface_hub v1 are released officially
78+
uv pip uninstall --system transformers huggingface_hub
79+
uv pip install --system --prerelease=allow git+https://github.com/huggingface/transformers.git
7680
- name: Install dependencies (minimum versions)
7781
if: ${{ matrix.deps_versions != 'deps-latest' }}
78-
run: uv pip install --system pyarrow==21.0.0 huggingface-hub==0.24.7 transformers dill==0.3.1.1
82+
run: uv pip install --system pyarrow==21.0.0 huggingface-hub==0.25.0 transformers dill==0.3.1.1
83+
- name: Print dependencies
84+
run: uv pip list
7985
- name: Test with pytest
8086
run: |
8187
python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/
@@ -119,6 +125,8 @@ jobs:
119125
run: pip install --upgrade uv
120126
- name: Install dependencies
121127
run: uv pip install --system "datasets[tests] @ ."
128+
- name: Print dependencies
129+
run: uv pip list
122130
- name: Test with pytest
123131
run: |
124132
python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/
@@ -161,7 +169,14 @@ jobs:
161169
- name: Install uv
162170
run: pip install --upgrade uv
163171
- name: Install dependencies
164-
run: uv pip install --system "datasets[tests_numpy2] @ ."
172+
run: |
173+
uv pip install --system "datasets[tests_numpy2] @ ."
174+
# TODO: remove once transformers v5 / huggingface_hub v1 are released officially
175+
uv pip uninstall --system transformers huggingface_hub
176+
uv pip install --system --prerelease=allow git+https://github.com/huggingface/transformers.git
177+
- name: Print dependencies
178+
run: pip list
179+
165180
- name: Test with pytest
166181
run: |
167182
python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/

setup.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@
118118
"pandas",
119119
# for downloading datasets over HTTPS
120120
"requests>=2.32.2",
121+
"httpx<1.0.0",
121122
# progress bars in downloads and data operations
122123
"tqdm>=4.66.3",
123124
# for fast hashing
@@ -128,7 +129,7 @@
128129
# minimum 2023.1.0 to support protocol=kwargs in fsspec's `open`, `get_fs_token_paths`, etc.: see https://github.com/fsspec/filesystem_spec/pull/1143
129130
"fsspec[http]>=2023.1.0,<=2025.9.0",
130131
# To get datasets from the Datasets Hub on huggingface.co
131-
"huggingface-hub>=0.24.0",
132+
"huggingface-hub>=0.25.0,<2.0",
132133
# Utilities from PyPA to e.g., compare versions
133134
"packaging",
134135
# To parse YAML metadata from dataset cards

src/datasets/arrow_dataset.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,9 @@
6868
DatasetCardData,
6969
HfApi,
7070
)
71-
from huggingface_hub.hf_api import HfHubHTTPError, RepoFile, RepositoryNotFoundError
71+
from huggingface_hub.hf_api import RepoFile
72+
from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError
7273
from multiprocess import Pool
73-
from requests import HTTPError
7474
from tqdm.contrib.concurrent import thread_map
7575

7676
from . import config
@@ -5990,7 +5990,7 @@ def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete],
59905990
except HfHubHTTPError as err:
59915991
if (
59925992
err.__context__
5993-
and isinstance(err.__context__, HTTPError)
5993+
and isinstance(err.__context__, HfHubHTTPError)
59945994
and err.__context__.response.status_code == 409
59955995
):
59965996
# 409 is Conflict (another commit is in progress)
@@ -6040,7 +6040,7 @@ def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete],
60406040
except HfHubHTTPError as err:
60416041
if (
60426042
err.__context__
6043-
and isinstance(err.__context__, HTTPError)
6043+
and isinstance(err.__context__, HfHubHTTPError)
60446044
and err.__context__.response.status_code in (412, 409)
60456045
):
60466046
# 412 is Precondition failed (parent_commit isn't satisfied)

src/datasets/data_files.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -352,7 +352,7 @@ def resolve_pattern(
352352
protocol = fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0]
353353
protocol_prefix = protocol + "://" if protocol != "file" else ""
354354
glob_kwargs = {}
355-
if protocol == "hf" and config.HF_HUB_VERSION >= version.parse("0.20.0"):
355+
if protocol == "hf":
356356
# 10 times faster glob with detail=True (ignores costly info like lastCommit)
357357
glob_kwargs["expand_info"] = False
358358
matched_paths = [

src/datasets/dataset_dict.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
)
2727
from huggingface_hub.hf_api import RepoFile
2828
from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError
29-
from requests import HTTPError
3029

3130
from . import config
3231
from .arrow_dataset import (
@@ -1917,7 +1916,7 @@ def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete],
19171916
except HfHubHTTPError as err:
19181917
if (
19191918
err.__context__
1920-
and isinstance(err.__context__, HTTPError)
1919+
and isinstance(err.__context__, HfHubHTTPError)
19211920
and err.__context__.response.status_code == 409
19221921
):
19231922
# 409 is Conflict (another commit is in progress)
@@ -1967,7 +1966,7 @@ def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete],
19671966
except HfHubHTTPError as err:
19681967
if (
19691968
err.__context__
1970-
and isinstance(err.__context__, HTTPError)
1969+
and isinstance(err.__context__, HfHubHTTPError)
19711970
and err.__context__.response.status_code in (412, 409)
19721971
):
19731972
# 412 is Precondition failed (parent_commit isn't satisfied)
@@ -2786,7 +2785,7 @@ def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete],
27862785
except HfHubHTTPError as err:
27872786
if (
27882787
err.__context__
2789-
and isinstance(err.__context__, HTTPError)
2788+
and isinstance(err.__context__, HfHubHTTPError)
27902789
and err.__context__.response.status_code == 409
27912790
):
27922791
# 409 is Conflict (another commit is in progress)
@@ -2836,7 +2835,7 @@ def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete],
28362835
except HfHubHTTPError as err:
28372836
if (
28382837
err.__context__
2839-
and isinstance(err.__context__, HTTPError)
2838+
and isinstance(err.__context__, HfHubHTTPError)
28402839
and err.__context__.response.status_code in (412, 409)
28412840
):
28422841
# 412 is Precondition failed (parent_commit isn't satisfied)

src/datasets/iterable_dataset.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
from huggingface_hub.hf_api import RepoFile
3131
from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError
3232
from multiprocess import Pool
33-
from requests import HTTPError
3433

3534
from . import config
3635
from .arrow_dataset import PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED, Dataset, DatasetInfoMixin
@@ -4332,7 +4331,7 @@ def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete],
43324331
except HfHubHTTPError as err:
43334332
if (
43344333
err.__context__
4335-
and isinstance(err.__context__, HTTPError)
4334+
and isinstance(err.__context__, HfHubHTTPError)
43364335
and err.__context__.response.status_code == 409
43374336
):
43384337
# 409 is Conflict (another commit is in progress)
@@ -4382,7 +4381,7 @@ def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete],
43824381
except HfHubHTTPError as err:
43834382
if (
43844383
err.__context__
4385-
and isinstance(err.__context__, HTTPError)
4384+
and isinstance(err.__context__, HfHubHTTPError)
43864385
and err.__context__.response.status_code in (412, 409)
43874386
):
43884387
# 412 is Precondition failed (parent_commit isn't satisfied)

src/datasets/load.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from typing import Any, Optional, Union
2929

3030
import fsspec
31+
import httpx
3132
import requests
3233
import yaml
3334
from fsspec.core import url_to_fs
@@ -948,6 +949,8 @@ def dataset_module_factory(
948949
OfflineModeIsEnabled,
949950
requests.exceptions.Timeout,
950951
requests.exceptions.ConnectionError,
952+
httpx.ConnectError,
953+
httpx.TimeoutException,
951954
),
952955
):
953956
raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})") from e
@@ -963,6 +966,8 @@ def dataset_module_factory(
963966
OfflineModeIsEnabled,
964967
requests.exceptions.Timeout,
965968
requests.exceptions.ConnectionError,
969+
httpx.ConnectError,
970+
httpx.TimeoutException,
966971
) as e:
967972
raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})") from e
968973
except GatedRepoError as e:

src/datasets/utils/file_utils.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,13 @@
2727
from xml.etree import ElementTree as ET
2828

2929
import fsspec
30+
import httpx
3031
import huggingface_hub
3132
import huggingface_hub.errors
3233
import requests
3334
from fsspec.core import strip_protocol, url_to_fs
3435
from fsspec.utils import can_be_local
35-
from huggingface_hub.utils import EntryNotFoundError, get_session, insecure_hashlib
36+
from huggingface_hub.utils import get_session, insecure_hashlib
3637
from packaging import version
3738

3839
from .. import __version__, config
@@ -140,7 +141,7 @@ def cached_path(
140141
ConnectionError: in case of unreachable url
141142
and no cache on disk
142143
ValueError: if it couldn't parse the url or filename correctly
143-
requests.exceptions.ConnectionError: in case of internet connection issue
144+
httpx.NetworkError or requests.exceptions.ConnectionError: in case of internet connection issue
144145
"""
145146
if download_config is None:
146147
download_config = DownloadConfig(**download_kwargs)
@@ -246,7 +247,7 @@ def cached_path(
246247
def get_datasets_user_agent(user_agent: Optional[Union[str, dict]] = None) -> str:
247248
ua = f"datasets/{__version__}"
248249
ua += f"; python/{config.PY_VERSION}"
249-
ua += f"; huggingface_hub/{huggingface_hub.__version__}"
250+
ua += f"; hf_hub/{huggingface_hub.__version__}"
250251
ua += f"; pyarrow/{config.PYARROW_VERSION}"
251252
if config.TORCH_AVAILABLE:
252253
ua += f"; torch/{config.TORCH_VERSION}"
@@ -753,7 +754,7 @@ def xgetsize(path, download_config: Optional[DownloadConfig] = None) -> int:
753754
fs, *_ = fs, *_ = url_to_fs(path, **storage_options)
754755
try:
755756
size = fs.size(main_hop)
756-
except EntryNotFoundError:
757+
except huggingface_hub.utils.EntryNotFoundError:
757758
raise FileNotFoundError(f"No such file: {path}")
758759
if size is None:
759760
# use xopen instead of fs.open to make data fetching more robust
@@ -817,6 +818,7 @@ def read_with_retries(*args, **kwargs):
817818
asyncio.TimeoutError,
818819
requests.exceptions.ConnectionError,
819820
requests.exceptions.Timeout,
821+
httpx.RequestError,
820822
) as err:
821823
disconnect_err = err
822824
logger.warning(
@@ -897,9 +899,6 @@ def _prepare_single_hop_path_and_storage_options(
897899
"endpoint": config.HF_ENDPOINT,
898900
**storage_options,
899901
}
900-
# streaming with block_size=0 is only implemented in 0.21 (see https://github.com/huggingface/huggingface_hub/pull/1967)
901-
if config.HF_HUB_VERSION < version.parse("0.21.0"):
902-
storage_options["block_size"] = "default"
903902
if storage_options:
904903
storage_options = {protocol: storage_options}
905904
return urlpath, storage_options

src/datasets/utils/hub.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,6 @@
11
from functools import partial
22

33
from huggingface_hub import hf_hub_url
4-
from huggingface_hub.utils import get_session, hf_raise_for_status
54

65

76
hf_dataset_url = partial(hf_hub_url, repo_type="dataset")
8-
9-
10-
def check_auth(hf_api, repo_id, token=None):
11-
headers = hf_api._build_hf_headers(token=token)
12-
path = f"{hf_api.endpoint}/api/datasets/{repo_id}/auth-check"
13-
r = get_session().get(path, headers=headers)
14-
hf_raise_for_status(r)

tests/fixtures/hub.py

Lines changed: 15 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,8 @@
55
from typing import Optional
66

77
import pytest
8-
import requests
9-
from huggingface_hub.hf_api import HfApi, RepositoryNotFoundError
10-
from huggingface_hub.utils import hf_raise_for_status
8+
from huggingface_hub.hf_api import HfApi
9+
from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError
1110
from huggingface_hub.utils._headers import _http_user_agent
1211

1312

@@ -24,9 +23,14 @@
2423
def ci_hub_config(monkeypatch):
2524
monkeypatch.setattr("datasets.config.HF_ENDPOINT", CI_HUB_ENDPOINT)
2625
monkeypatch.setattr("datasets.config.HUB_DATASETS_URL", CI_HUB_DATASETS_URL)
27-
monkeypatch.setattr(
28-
"huggingface_hub.file_download.HUGGINGFACE_CO_URL_TEMPLATE", CI_HFH_HUGGINGFACE_CO_URL_TEMPLATE
29-
)
26+
monkeypatch.setattr("huggingface_hub.constants.HUGGINGFACE_CO_URL_TEMPLATE", CI_HFH_HUGGINGFACE_CO_URL_TEMPLATE)
27+
try:
28+
# for backward compatibility with huggingface_hub 0.x
29+
monkeypatch.setattr(
30+
"huggingface_hub.file_download.HUGGINGFACE_CO_URL_TEMPLATE", CI_HFH_HUGGINGFACE_CO_URL_TEMPLATE
31+
)
32+
except AttributeError:
33+
pass
3034
old_environ = dict(os.environ)
3135
os.environ["HF_ENDPOINT"] = CI_HUB_ENDPOINT
3236
yield
@@ -107,18 +111,11 @@ def _hf_gated_dataset_repo_txt_data(hf_api: HfApi, hf_token, text_file_content):
107111
repo_id=repo_id,
108112
repo_type="dataset",
109113
)
110-
path = f"{hf_api.endpoint}/api/datasets/{repo_id}/settings"
111-
repo_settings = {"gated": "auto"}
112-
r = requests.put(
113-
path,
114-
headers={"authorization": f"Bearer {hf_token}"},
115-
json=repo_settings,
116-
)
117-
hf_raise_for_status(r)
114+
hf_api.update_repo_settings(repo_id, token=hf_token, repo_type="dataset", gated="auto")
118115
yield repo_id
119116
try:
120117
hf_api.delete_repo(repo_id, token=hf_token, repo_type="dataset")
121-
except (requests.exceptions.HTTPError, ValueError): # catch http error and token invalid error
118+
except (HfHubHTTPError, ValueError): # catch http error and token invalid error
122119
pass
123120

124121

@@ -142,7 +139,7 @@ def hf_private_dataset_repo_txt_data_(hf_api: HfApi, hf_token, text_file_content
142139
yield repo_id
143140
try:
144141
hf_api.delete_repo(repo_id, token=hf_token, repo_type="dataset")
145-
except (requests.exceptions.HTTPError, ValueError): # catch http error and token invalid error
142+
except (HfHubHTTPError, ValueError): # catch http error and token invalid error
146143
pass
147144

148145

@@ -166,7 +163,7 @@ def hf_private_dataset_repo_zipped_txt_data_(hf_api: HfApi, hf_token, zip_csv_wi
166163
yield repo_id
167164
try:
168165
hf_api.delete_repo(repo_id, token=hf_token, repo_type="dataset")
169-
except (requests.exceptions.HTTPError, ValueError): # catch http error and token invalid error
166+
except (HfHubHTTPError, ValueError): # catch http error and token invalid error
170167
pass
171168

172169

@@ -190,7 +187,7 @@ def hf_private_dataset_repo_zipped_img_data_(hf_api: HfApi, hf_token, zip_image_
190187
yield repo_id
191188
try:
192189
hf_api.delete_repo(repo_id, token=hf_token, repo_type="dataset")
193-
except (requests.exceptions.HTTPError, ValueError): # catch http error and token invalid error
190+
except (HfHubHTTPError, ValueError): # catch http error and token invalid error
194191
pass
195192

196193

0 commit comments

Comments (0)