Skip to content

Commit 5b4fa6a

Browse files
authored
Handle hf:// urls + raise ValueError if repo type is unknown (#1298)
1 parent 57fe360 commit 5b4fa6a

File tree

2 files changed

+48
-6
lines changed

2 files changed

+48
-6
lines changed

src/huggingface_hub/hf_api.py

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,8 @@ def repo_type_and_id_from_hf_id(
103103
104104
- https://huggingface.co/<repo_type>/<namespace>/<repo_id>
105105
- https://huggingface.co/<namespace>/<repo_id>
106+
- hf://<repo_type>/<namespace>/<repo_id>
107+
- hf://<namespace>/<repo_id>
106108
- <repo_type>/<namespace>/<repo_id>
107109
- <namespace>/<repo_id>
108110
- <repo_id>
@@ -112,9 +114,21 @@ def repo_type_and_id_from_hf_id(
112114
Returns:
113115
A tuple with three items: repo_type (`str` or `None`), namespace (`str` or
114116
`None`) and repo_id (`str`).
117+
118+
Raises:
119+
- [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
120+
If URL cannot be parsed.
121+
- [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
122+
If `repo_type` is unknown.
115123
"""
124+
input_hf_id = hf_id
116125
hub_url = re.sub(r"https?://", "", hub_url if hub_url is not None else ENDPOINT)
117126
is_hf_url = hub_url in hf_id and "@" not in hf_id
127+
128+
HFFS_PREFIX = "hf://"
129+
if hf_id.startswith(HFFS_PREFIX): # Remove "hf://" prefix if exists
130+
hf_id = hf_id[len(HFFS_PREFIX) :]
131+
118132
url_segments = hf_id.split("/")
119133
is_hf_id = len(url_segments) <= 3
120134

@@ -144,9 +158,13 @@ def repo_type_and_id_from_hf_id(
144158
f"Unable to retrieve user and repo ID from the passed HF ID: {hf_id}"
145159
)
146160

161+
# Check if repo type is known (mapping "spaces" => "space" + empty value => `None`)
162+
if repo_type in REPO_TYPES_MAPPING:
163+
repo_type = REPO_TYPES_MAPPING[repo_type]
164+
if repo_type == "":
165+
repo_type = None
147166
if repo_type not in REPO_TYPES:
148-
assert repo_type is not None, "repo_type `None` do not have mapping"
149-
repo_type = REPO_TYPES_MAPPING.get(repo_type)
167+
raise ValueError(f"Unknown `repo_type`: '{repo_type}' ('{input_hf_id}')")
150168

151169
return repo_type, namespace, repo_id
152170

@@ -234,12 +252,21 @@ class RepoUrl(str):
234252
>>> RepoUrl('https://huggingface.co/gpt2')
235253
RepoUrl('https://huggingface.co/gpt2', endpoint='https://huggingface.co', repo_type='model', repo_id='gpt2')
236254
237-
>>> RepoUrl('https://hub-ci.huggingface.co/dataset/dummy_user/dummy_dataset', endpoint='https://hub-ci.huggingface.co')
238-
RepoUrl('https://hub-ci.huggingface.co/dataset/dummy_user/dummy_dataset', endpoint='https://hub-ci.huggingface.co', repo_type='dataset', repo_id='dummy_user/dummy_dataset')
255+
>>> RepoUrl('https://hub-ci.huggingface.co/datasets/dummy_user/dummy_dataset', endpoint='https://hub-ci.huggingface.co')
256+
RepoUrl('https://hub-ci.huggingface.co/datasets/dummy_user/dummy_dataset', endpoint='https://hub-ci.huggingface.co', repo_type='dataset', repo_id='dummy_user/dummy_dataset')
257+
258+
>>> RepoUrl('hf://datasets/my-user/my-dataset')
259+
RepoUrl('hf://datasets/my-user/my-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='my-user/my-dataset')
239260
240261
>>> HfApi.create_repo("dummy_model")
241262
RepoUrl('https://huggingface.co/Wauplin/dummy_model', endpoint='https://huggingface.co', repo_type='model', repo_id='Wauplin/dummy_model')
242263
```
264+
265+
Raises:
266+
- [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
267+
If URL cannot be parsed.
268+
- [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
269+
If `repo_type` is unknown.
243270
"""
244271

245272
def __new__(cls, url: Any, endpoint: Optional[str] = None):

tests/test_hf_api.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2155,8 +2155,8 @@ def test_end_to_end_thresh_16M(self):
21552155
print("took", time.time() - start_time)
21562156

21572157

2158-
class HfApiMiscTest(unittest.TestCase):
2159-
def test_repo_type_and_id_from_hf_id(self):
2158+
class ParseHFUrlTest(unittest.TestCase):
2159+
def test_repo_type_and_id_from_hf_id_on_correct_values(self):
21602160
possible_values = {
21612161
"https://huggingface.co/id": [None, None, "id"],
21622162
"https://huggingface.co/user/id": [None, "user", "id"],
@@ -2166,6 +2166,10 @@ def test_repo_type_and_id_from_hf_id(self):
21662166
"dataset/user/id": ["dataset", "user", "id"],
21672167
"space/user/id": ["space", "user", "id"],
21682168
"id": [None, None, "id"],
2169+
"hf://id": [None, None, "id"],
2170+
"hf://user/id": [None, "user", "id"],
2171+
"hf://model/user/name": ["model", "user", "name"], # 's' is optional
2172+
"hf://models/user/name": ["model", "user", "name"],
21692173
}
21702174

21712175
for key, value in possible_values.items():
@@ -2174,6 +2178,17 @@ def test_repo_type_and_id_from_hf_id(self):
21742178
tuple(value),
21752179
)
21762180

2181+
def test_repo_type_and_id_from_hf_id_on_wrong_values(self):
2182+
for hub_id in [
2183+
"https://unknown-endpoint.co/id",
2184+
"https://huggingface.co/datasets/user/id@revision", # @ forbidden
2185+
"datasets/user/id/subpath",
2186+
"hffs://model/user/name",
2187+
"spaeces/user/id", # with typo in repo type
2188+
]:
2189+
with self.assertRaises(ValueError):
2190+
repo_type_and_id_from_hf_id(hub_id, hub_url="https://huggingface.co")
2191+
21772192

21782193
class HfApiDiscussionsTest(HfApiCommonTestWithLogin):
21792194
def setUp(self):

0 commit comments

Comments
 (0)