Skip to content

Commit fcb5e41

Browse files
authored
Merge pull request #20805 from davelopez/add_huggingface_file_source
Add Hugging Face 🤗 file source and user-defined template
2 parents 49ef7aa + 77c4bba commit fcb5e41

File tree

8 files changed

+255
-13
lines changed

8 files changed

+255
-13
lines changed

client/src/api/fileSources.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { faAws, faDropbox, faGoogleDrive } from "@fortawesome/free-brands-svg-icons";
1+
import { faAws, faDropbox, faGoogleDrive, faHubspot } from "@fortawesome/free-brands-svg-icons";
22
import { faCloud, faFolderTree, faNetworkWired, type IconDefinition } from "font-awesome-6";
33

44
import type { components } from "@/api/schema";
@@ -66,6 +66,10 @@ export const templateTypes: FileSourceTypesDetail = {
6666
icon: faNetworkWired,
6767
message: "This is a repository plugin that connects with a Dataverse.org instance.",
6868
},
69+
huggingface: {
70+
icon: faHubspot,
71+
message: "This is a file repository plugin that connects with the Hugging Face Hub.",
72+
},
6973
};
7074

7175
export const FileSourcesValidFilters = {

client/src/api/schema/schema.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11184,7 +11184,8 @@ export interface components {
1118411184
| "inveniordm"
1118511185
| "zenodo"
1118611186
| "rspace"
11187-
| "dataverse";
11187+
| "dataverse"
11188+
| "huggingface";
1118811189
/** Variables */
1118911190
variables?:
1119011191
| (
@@ -21585,7 +21586,8 @@ export interface components {
2158521586
| "inveniordm"
2158621587
| "zenodo"
2158721588
| "rspace"
21588-
| "dataverse";
21589+
| "dataverse"
21590+
| "huggingface";
2158921591
/** Uri Root */
2159021592
uri_root: string;
2159121593
/**

lib/galaxy/dependencies/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,9 @@ def is_redis_url(url: str) -> bool:
333333

334334
return celery_enabled and is_redis_url(celery_result_backend) or is_redis_url(celery_broker_url)
335335

336+
def check_huggingface_hub(self):
    """Return True when ``"huggingface"`` appears in ``self.file_sources``.

    NOTE(review): presumably used to decide whether the optional
    ``huggingface_hub`` requirement has to be installed -- confirm against
    the caller that consumes the ``check_*`` methods.
    """
    configured_sources = self.file_sources
    return "huggingface" in configured_sources
338+
336339

337340
def optional(config_file=None):
338341
if not config_file:

lib/galaxy/dependencies/conditional-requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ fs.onedatarestfs==21.2.5.2 # type: onedata, depends on onedatafilerestclient
3333
fs-basespace # type: basespace
3434
fs-azureblob # type: azure
3535
rspace-client>=2.6.1,<3 # type: rspace
36+
huggingface_hub # type: huggingface
3637

3738
# Vault backend
3839
hvac

lib/galaxy/files/sources/_fsspec.py

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ def _list(
141141
try:
142142
cache_options = self._get_cache_options(context.config)
143143
fs = self._open_fs(context, cache_options)
144+
path = self._to_filesystem_path(path)
144145

145146
if recursive:
146147
return self._list_recursive(fs, path)
@@ -182,6 +183,7 @@ def _realize_to(
182183
"""Download a file from the fsspec filesystem to a local path."""
183184
cache_options = self._get_cache_options(context.config)
184185
fs = self._open_fs(context, cache_options)
186+
source_path = self._to_filesystem_path(source_path)
185187
fs.get_file(source_path, native_path)
186188

187189
def _write_from(
@@ -193,6 +195,7 @@ def _write_from(
193195
"""Upload a file from a local path to the fsspec filesystem."""
194196
cache_options = self._get_cache_options(context.config)
195197
fs = self._open_fs(context, cache_options)
198+
target_path = self._to_filesystem_path(target_path)
196199
fs.put_file(native_path, target_path)
197200

198201
def _adapt_entry_path(self, filesystem_path: str) -> str:
@@ -203,17 +206,27 @@ def _adapt_entry_path(self, filesystem_path: str) -> str:
203206
"""
204207
return filesystem_path
205208

209+
def _to_filesystem_path(self, path: str) -> str:
210+
"""Convert an entry path to the filesystem path format.
211+
212+
Subclasses can override this to transform paths (e.g., virtual to filesystem paths).
213+
By default, returns the path unchanged.
214+
"""
215+
return path
216+
206217
def _extract_timestamp(self, info: dict) -> Optional[str]:
207-
"""Extract and format timestamp from fsspec file info."""
208-
# Handle timestamp fields more robustly - check for None explicitly
209-
mtime = info.get("mtime")
210-
if mtime is None:
211-
mtime = info.get("modified")
212-
if mtime is None:
213-
mtime = info.get("LastModified")
218+
"""Extract the timestamp from fsspec file info to use it in the RemoteFile entry.
219+
220+
Subclasses can override this to customize timestamp extraction.
221+
By default, it tries to extract 'mtime', 'modified', or 'LastModified'
222+
"""
223+
return info.get("mtime") or info.get("modified") or info.get("LastModified")
214224

215-
ctime_result = self.to_dict_time(mtime)
216-
return ctime_result
225+
def _get_formatted_timestamp(self, info: dict) -> Optional[str]:
226+
"""Get a formatted timestamp for the RemoteFile entry."""
227+
mtime = self._extract_timestamp(info)
228+
formatted_timestamp = self.to_dict_time(mtime)
229+
return formatted_timestamp
217230

218231
def _info_to_entry(self, info: dict) -> AnyRemoteEntry:
219232
"""Convert fsspec file info to Galaxy's remote entry format."""
@@ -226,7 +239,7 @@ def _info_to_entry(self, info: dict) -> AnyRemoteEntry:
226239
return RemoteDirectory(name=name, uri=uri, path=entry_path)
227240
else:
228241
size = int(info.get("size", 0))
229-
ctime = self._extract_timestamp(info)
242+
ctime = self._get_formatted_timestamp(info)
230243
return RemoteFile(name=name, size=size, ctime=ctime, uri=uri, path=entry_path)
231244

232245
def _list_recursive(self, fs: AbstractFileSystem, path: str) -> tuple[list[AnyRemoteEntry], int]:
Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
"""
2+
Hugging Face Hub file source plugin using fsspec.
3+
"""
4+
5+
import logging
6+
from typing import (
7+
Annotated,
8+
Literal,
9+
Optional,
10+
Union,
11+
)
12+
13+
from fsspec import AbstractFileSystem
14+
from pydantic import Field
15+
16+
from galaxy.files.models import (
17+
AnyRemoteEntry,
18+
FilesSourceRuntimeContext,
19+
RemoteDirectory,
20+
)
21+
22+
try:
23+
from huggingface_hub import (
24+
HfApi,
25+
HfFileSystem,
26+
)
27+
except ImportError:
28+
HfApi = None
29+
HfFileSystem = None
30+
31+
from galaxy.exceptions import MessageException
32+
from galaxy.files.sources._fsspec import (
33+
CacheOptionsDictType,
34+
FsspecBaseFileSourceConfiguration,
35+
FsspecBaseFileSourceTemplateConfiguration,
36+
FsspecFilesSource,
37+
)
38+
from galaxy.util.config_templates import TemplateExpansion
39+
40+
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Literal type enumerating the sort keys known to this plugin.
SortByOptions = Literal[
    "last_modified",
    "trending_score",
    "created_at",
    "downloads",
    "likes",
]

# Sort key used when none is chosen explicitly.
DEFAULT_SORT_BY: SortByOptions = "downloads"

# Upper bound on how many repositories are requested in a single listing.
MAX_REPO_LIMIT = 1000
47+
48+
49+
class HuggingFaceFileSourceTemplateConfiguration(FsspecBaseFileSourceTemplateConfiguration):
    # Template-stage settings: each value may be a plain string, a
    # TemplateExpansion, or None when it is left unset.
    # (Documented with comments on purpose: a class docstring would become
    # part of the pydantic model's schema description.)

    # Optional access token.
    token: Annotated[
        Optional[Union[str, TemplateExpansion]],
        Field(
            description=(
                "Hugging Face API token for accessing private model repositories. "
                "If not provided, only public repositories will be accessible."
            ),
        ),
    ] = None

    # Optional alternative Hub endpoint.
    endpoint: Annotated[
        Optional[Union[str, TemplateExpansion]],
        Field(
            description=(
                "Custom endpoint for Hugging Face Hub. "
                "If not provided, the default Hugging Face Hub will be used (https://huggingface.co)."
            ),
        ),
    ] = None
64+
65+
66+
class HuggingFaceFileSourceConfiguration(FsspecBaseFileSourceConfiguration):
    # Resolved settings: both values are plain strings, or None when unset.
    token: Union[str, None] = None
    endpoint: Union[str, None] = None
69+
70+
71+
class HuggingFaceFilesSource(
    FsspecFilesSource[HuggingFaceFileSourceTemplateConfiguration, HuggingFaceFileSourceConfiguration]
):
    """File source plugin that browses the Hugging Face Hub through fsspec.

    The root path lists model repositories via ``HfApi.list_models``; every
    other path is delegated to the generic fsspec implementation, backed by
    an ``HfFileSystem`` instance.
    """

    plugin_type = "huggingface"
    required_module = HfFileSystem
    required_package = "huggingface_hub"

    template_config_class = HuggingFaceFileSourceTemplateConfiguration
    resolved_config_class = HuggingFaceFileSourceConfiguration

    def _open_fs(
        self,
        context: FilesSourceRuntimeContext[HuggingFaceFileSourceConfiguration],
        cache_options: CacheOptionsDictType,
    ) -> AbstractFileSystem:
        """Create the ``HfFileSystem`` for the resolved configuration.

        Raises ``self.required_package_exception`` when ``huggingface_hub``
        could not be imported.
        """
        if HfFileSystem is None:
            raise self.required_package_exception

        config = context.config
        return HfFileSystem(
            token=config.token or False,  # Use False to disable authentication
            endpoint=config.endpoint,
            **cache_options,
        )

    def _to_filesystem_path(self, path: str) -> str:
        """Transform entry path to Hugging Face filesystem path.

        Leading slashes are removed for HF compatibility. The root path "/"
        therefore maps to the empty string; Hugging Face does not implement
        access to the repositories root, which is why ``_list`` handles the
        root itself.
        """
        return path.lstrip("/")

    def _extract_timestamp(self, info: dict) -> Optional[str]:
        """Extract timestamp from Hugging Face file info to use it in the RemoteFile entry.

        The value is read from ``info["last_commit"]["date"]``. None is
        returned when ``last_commit`` is absent *or* explicitly None, instead
        of failing with an AttributeError on the None value.
        NOTE(review): ``last_commit`` is presumably None when the filesystem
        did not expand commit info -- confirm against huggingface_hub.
        """
        last_commit = info.get("last_commit") or {}
        return last_commit.get("date")

    def _list(
        self,
        context: FilesSourceRuntimeContext[HuggingFaceFileSourceConfiguration],
        path="/",
        recursive=False,
        write_intent: bool = False,
        limit: Optional[int] = None,
        offset: Optional[int] = None,
        query: Optional[str] = None,
        sort_by: Optional[str] = None,
    ) -> tuple[list[AnyRemoteEntry], int]:
        """List repositories at the root, or delegate to fsspec elsewhere.

        A path counts as the root when nothing is left of it after
        ``_to_filesystem_path`` (so "/" and "" are both treated as root).
        Returns the page of entries together with the total entry count.
        """
        # If we're at the root, list repositories using HfApi
        if not self._to_filesystem_path(path):
            return self._list_repositories(config=context.config, limit=limit, offset=offset, query=query)

        # For non-root paths, use the parent implementation
        return super()._list(
            context=context,
            path=path,
            recursive=recursive,
            limit=limit,
            offset=offset,
            query=query,
            sort_by=sort_by,
        )

    def _list_repositories(
        self,
        config: HuggingFaceFileSourceConfiguration,
        limit: Optional[int] = None,
        offset: Optional[int] = None,
        query: Optional[str] = None,
    ) -> tuple[list[AnyRemoteEntry], int]:
        """List model repositories as directory entries.

        At most ``MAX_REPO_LIMIT`` repositories matching ``query`` are
        requested, sorted by ``DEFAULT_SORT_BY``. ``offset`` and ``limit``
        are then applied to that list independently of each other, so an
        ``offset`` without a ``limit`` is honoured too.

        Returns a tuple of (page of entries, number of entries before
        pagination). Raises ``self.required_package_exception`` when
        ``huggingface_hub`` is missing and ``MessageException`` when the
        listing itself fails.
        """
        if HfApi is None:
            raise self.required_package_exception

        api = HfApi(
            token=config.token or False,  # Use False to disable authentication
            endpoint=config.endpoint,
        )
        try:
            repos_iter = api.list_models(search=query, sort=DEFAULT_SORT_BY, direction=-1, limit=MAX_REPO_LIMIT)

            # Convert repositories to directory entries. The iterator is
            # consumed inside the try block so that failures raised while
            # iterating are reported as a MessageException as well.
            entries_list: list[AnyRemoteEntry] = []
            for repo in repos_iter:
                repo_id = repo.id if hasattr(repo, "id") else str(repo)
                entries_list.append(
                    RemoteDirectory(
                        name=repo_id,
                        uri=self.uri_from_path(repo_id),
                        path=repo_id,
                    )
                )
        except Exception as e:
            raise MessageException(f"Failed to list repositories from Hugging Face Hub: {e}") from e

        total_count = len(entries_list)

        # Apply pagination
        start = offset or 0
        stop = start + limit if limit is not None else None
        return entries_list[start:stop], total_count
175+
176+
177+
__all__ = ["HuggingFaceFilesSource"]
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
- id: huggingface
2+
version: 0
3+
name: Hugging Face Hub 🤗
4+
description: |
5+
[Hugging Face Hub](https://huggingface.co) is a platform for sharing machine learning models and datasets.
6+
Using this plugin, you can connect to the Hugging Face Hub to access public or private machine learning model repositories.
7+
variables:
8+
endpoint:
9+
label: Hugging Face Hub Endpoint
10+
type: string
11+
help: |
12+
Custom endpoint of the Hugging Face Hub you are connecting to. This should be the full URL including the protocol (http or https) and the domain name.
13+
You can leave this blank to use the default Hugging Face Hub endpoint (https://huggingface.co).
14+
default: https://huggingface.co
15+
secrets:
16+
token:
17+
label: Hugging Face Access Token
18+
help: |
19+
The personal access token to use to connect to the Hugging Face Hub. You can generate a new token in your Hugging Face account settings.
20+
This will allow Galaxy to access private models if you have the necessary permissions.
21+
configuration:
22+
type: huggingface
23+
endpoint: "{{ variables.endpoint }}"
24+
token: "{{ secrets.token }}"

lib/galaxy/files/templates/models.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
"zenodo",
4646
"rspace",
4747
"dataverse",
48+
"huggingface",
4849
]
4950

5051

@@ -296,6 +297,20 @@ class DataverseFileSourceConfiguration(StrictModel):
296297
writable: bool = True
297298

298299

300+
class HuggingFaceFileSourceTemplateConfiguration(StrictModel):
    # Template-stage shape of a "huggingface" file source. ``token`` and
    # ``endpoint`` may be a plain string, a TemplateExpansion, or None.
    type: Literal["huggingface"]
    token: Optional[Union[str, TemplateExpansion]] = None
    endpoint: Optional[Union[str, TemplateExpansion]] = None
    # Optional template delimiters; both default to None.
    template_start: Union[str, None] = None
    template_end: Union[str, None] = None
306+
307+
308+
class HuggingFaceFileSourceConfiguration(StrictModel):
    # Resolved shape of a "huggingface" file source: only plain strings
    # (or None) remain for ``token`` and ``endpoint``.
    type: Literal["huggingface"]
    token: Union[str, None] = None
    endpoint: Union[str, None] = None
312+
313+
299314
FileSourceTemplateConfiguration = Annotated[
300315
Union[
301316
PosixFileSourceTemplateConfiguration,
@@ -311,6 +326,7 @@ class DataverseFileSourceConfiguration(StrictModel):
311326
ZenodoFileSourceTemplateConfiguration,
312327
RSpaceFileSourceTemplateConfiguration,
313328
DataverseFileSourceTemplateConfiguration,
329+
HuggingFaceFileSourceTemplateConfiguration,
314330
],
315331
Field(discriminator="type"),
316332
]
@@ -330,6 +346,7 @@ class DataverseFileSourceConfiguration(StrictModel):
330346
ZenodoFileSourceConfiguration,
331347
RSpaceFileSourceConfiguration,
332348
DataverseFileSourceConfiguration,
349+
HuggingFaceFileSourceConfiguration,
333350
],
334351
Field(discriminator="type"),
335352
]
@@ -407,6 +424,7 @@ def template_to_configuration(
407424
"zenodo": ZenodoFileSourceConfiguration,
408425
"rspace": RSpaceFileSourceConfiguration,
409426
"dataverse": DataverseFileSourceConfiguration,
427+
"huggingface": HuggingFaceFileSourceConfiguration,
410428
}
411429

412430

0 commit comments

Comments
 (0)