Skip to content

Commit 1f7ff25

Browse files
authored
Support permanently deleting LFS files (#2954)
* Support permanently deleting LFS files * update example * example
1 parent c734d0c commit 1f7ff25

File tree

4 files changed

+265
-1
lines changed

4 files changed

+265
-1
lines changed

docs/source/en/package_reference/hf_api.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,10 @@ models = hf_api.list_models()
5757

5858
[[autodoc]] huggingface_hub.hf_api.GitRefs
5959

60+
### LFSFileInfo
61+
62+
[[autodoc]] huggingface_hub.hf_api.LFSFileInfo
63+
6064
### ModelInfo
6165

6266
[[autodoc]] huggingface_hub.hf_api.ModelInfo

src/huggingface_hub/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,7 @@
208208
"list_datasets",
209209
"list_inference_catalog",
210210
"list_inference_endpoints",
211+
"list_lfs_files",
211212
"list_liked_repos",
212213
"list_models",
213214
"list_organization_members",
@@ -230,6 +231,7 @@
230231
"parse_safetensors_file_metadata",
231232
"pause_inference_endpoint",
232233
"pause_space",
234+
"permanently_delete_lfs_files",
233235
"preupload_lfs_files",
234236
"reject_access_request",
235237
"rename_discussion",
@@ -830,6 +832,7 @@
830832
"list_datasets",
831833
"list_inference_catalog",
832834
"list_inference_endpoints",
835+
"list_lfs_files",
833836
"list_liked_repos",
834837
"list_models",
835838
"list_organization_members",
@@ -862,6 +865,7 @@
862865
"parse_safetensors_file_metadata",
863866
"pause_inference_endpoint",
864867
"pause_space",
868+
"permanently_delete_lfs_files",
865869
"preupload_lfs_files",
866870
"push_to_hub_fastai",
867871
"push_to_hub_keras",
@@ -1156,6 +1160,7 @@ def __dir__():
11561160
list_datasets, # noqa: F401
11571161
list_inference_catalog, # noqa: F401
11581162
list_inference_endpoints, # noqa: F401
1163+
list_lfs_files, # noqa: F401
11591164
list_liked_repos, # noqa: F401
11601165
list_models, # noqa: F401
11611166
list_organization_members, # noqa: F401
@@ -1178,6 +1183,7 @@ def __dir__():
11781183
parse_safetensors_file_metadata, # noqa: F401
11791184
pause_inference_endpoint, # noqa: F401
11801185
pause_space, # noqa: F401
1186+
permanently_delete_lfs_files, # noqa: F401
11811187
preupload_lfs_files, # noqa: F401
11821188
reject_access_request, # noqa: F401
11831189
rename_discussion, # noqa: F401

src/huggingface_hub/hf_api.py

Lines changed: 192 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@
112112
SafetensorsRepoMetadata,
113113
TensorInfo,
114114
build_hf_headers,
115+
chunk_iterable,
115116
experimental,
116117
filter_repo_objects,
117118
fix_hf_endpoint_in_url,
@@ -1529,6 +1530,67 @@ def __init__(self, **kwargs) -> None:
15291530
self.__dict__.update(**kwargs)
15301531

15311532

1533+
@dataclass
class LFSFileInfo:
    """
    Describe a single file stored as LFS in a repo on the Hub.

    Instances are yielded by [`list_lfs_files`] and accepted by [`permanently_delete_lfs_files`], which together
    let you inspect and free up the storage used by a repo.

    Git LFS files are tracked using SHA-256 object IDs, rather than file paths, to optimize performance.
    This approach is necessary because a single object can be referenced by multiple paths across different commits,
    making it impractical to search and resolve these connections. Check out [our documentation](https://huggingface.co/docs/hub/storage-limits#advanced-track-lfs-file-references)
    to learn how to find which filename(s) are associated with each SHA.

    Attributes:
        file_oid (`str`):
            SHA-256 object ID of the file. This is the identifier to pass when permanently deleting the file.
        filename (`str`):
            Possible filename for the LFS object. The same object may be reachable under other paths as well.
        oid (`str`):
            OID of the LFS object.
        pushed_at (`datetime`):
            Date the LFS object was pushed to the repo.
        ref (`str`, *optional*):
            Ref where the LFS object has been pushed (if any).
        size (`int`):
            Size of the LFS object.

    Example:
        ```py
        >>> from huggingface_hub import HfApi
        >>> api = HfApi()
        >>> lfs_files = api.list_lfs_files("username/my-cool-repo")

        # Filter files to delete based on a combination of `filename`, `pushed_at`, `ref` or `size`.
        # e.g. select only LFS files in the "checkpoints" folder
        >>> lfs_files_to_delete = (lfs_file for lfs_file in lfs_files if lfs_file.filename.startswith("checkpoints/"))

        # Permanently delete LFS files
        >>> api.permanently_delete_lfs_files("username/my-cool-repo", lfs_files_to_delete)
        ```
    """

    file_oid: str
    filename: str
    oid: str
    pushed_at: datetime
    ref: Optional[str]
    size: int

    def __init__(self, **kwargs) -> None:
        # Input keys are camelCase (`fileOid`, `pushedAt`); they are mapped onto snake_case attributes.
        # Every key except `ref` is mandatory: a missing one raises `KeyError`.
        self.file_oid = kwargs.pop("fileOid")
        self.filename = kwargs.pop("filename")
        self.oid = kwargs.pop("oid")
        raw_pushed_at = kwargs.pop("pushedAt")
        self.pushed_at = parse_datetime(raw_pushed_at)
        self.ref = kwargs.pop("ref", None)
        self.size = kwargs.pop("size")

        # Forward compatibility: any key not consumed above is kept as a plain instance attribute
        # instead of being rejected.
        self.__dict__.update(**kwargs)
1592+
1593+
15321594
def future_compatible(fn: CallableT) -> CallableT:
15331595
"""Wrap a method of `HfApi` to handle `run_as_future=True`.
15341596
@@ -3387,6 +3449,131 @@ def super_squash_history(
33873449
response = get_session().post(url=url, headers=headers, json={"message": commit_message})
33883450
hf_raise_for_status(response)
33893451

3452+
@validate_hf_hub_args
def list_lfs_files(
    self,
    repo_id: str,
    *,
    repo_type: Optional[str] = None,
    token: Union[bool, str, None] = None,
) -> Iterable[LFSFileInfo]:
    """
    List all LFS files in a repo on the Hub.

    This is primarily useful to measure how much storage a repo is using and, eventually, to clean up large files
    with [`permanently_delete_lfs_files`]. Note that such a deletion is a permanent action: it affects all commits
    referencing the deleted files and cannot be undone.

    Args:
        repo_id (`str`):
            The repository for which you are listing LFS files.
        repo_type (`str`, *optional*):
            Type of repository. Set to `"dataset"` or `"space"` if listing from a dataset or space, `None` or
            `"model"` if listing from a model. Default is `None`.
        token (Union[bool, str, None], optional):
            A valid user access token (string). Defaults to the locally saved
            token, which is the recommended method for authentication (see
            https://huggingface.co/docs/huggingface_hub/quick-start#authentication).
            To disable authentication, pass `False`.

    Returns:
        `Iterable[LFSFileInfo]`: An iterator of [`LFSFileInfo`] objects. Items are produced lazily: no request
        is sent until the iterator is consumed.

    Example:
        ```py
        >>> from huggingface_hub import HfApi
        >>> api = HfApi()
        >>> lfs_files = api.list_lfs_files("username/my-cool-repo")

        # Filter files to delete based on a combination of `filename`, `pushed_at`, `ref` or `size`.
        # e.g. select only LFS files in the "checkpoints" folder
        >>> lfs_files_to_delete = (lfs_file for lfs_file in lfs_files if lfs_file.filename.startswith("checkpoints/"))

        # Permanently delete LFS files
        >>> api.permanently_delete_lfs_files("username/my-cool-repo", lfs_files_to_delete)
        ```
    """
    # Build the request target; a missing repo type falls back to the model type.
    resolved_repo_type = constants.REPO_TYPE_MODEL if repo_type is None else repo_type
    lfs_files_url = f"{self.endpoint}/api/{resolved_repo_type}s/{repo_id}/lfs-files"
    request_headers = self._build_hf_headers(token=token)

    # Walk the paginated listing and wrap each raw item on the fly.
    yield from (LFSFileInfo(**entry) for entry in paginate(lfs_files_url, params={}, headers=request_headers))
3505+
3506+
@validate_hf_hub_args
def permanently_delete_lfs_files(
    self,
    repo_id: str,
    lfs_files: Iterable[LFSFileInfo],
    *,
    rewrite_history: bool = True,
    repo_type: Optional[str] = None,
    token: Union[bool, str, None] = None,
) -> None:
    """
    Permanently delete LFS files from a repo on the Hub.

    <Tip warning={true}>

    This is a permanent action that will affect all commits referencing the deleted files and might corrupt your
    repository. This is a non-revertible operation. Use it only if you know what you are doing.

    </Tip>

    Files are deleted in batches of 1000, with one request per batch. The operation is therefore not atomic:
    if a request fails, the batches sent before it have already been deleted and the remaining ones are not sent.

    Args:
        repo_id (`str`):
            The repository from which you are deleting LFS files.
        lfs_files (`Iterable[LFSFileInfo]`):
            An iterable of [`LFSFileInfo`] items to permanently delete from the repo. Use [`list_lfs_files`] to list
            all LFS files from a repo. Only the `file_oid` of each item is sent to the server.
        rewrite_history (`bool`, *optional*, default to `True`):
            Whether to rewrite repository history to remove file pointers referencing the deleted LFS files (recommended).
        repo_type (`str`, *optional*):
            Type of repository. Set to `"dataset"` or `"space"` if deleting from a dataset or space, `None` or
            `"model"` if deleting from a model. Default is `None`.
        token (Union[bool, str, None], optional):
            A valid user access token (string). Defaults to the locally saved
            token, which is the recommended method for authentication (see
            https://huggingface.co/docs/huggingface_hub/quick-start#authentication).
            To disable authentication, pass `False`.

    Example:
        ```py
        >>> from huggingface_hub import HfApi
        >>> api = HfApi()
        >>> lfs_files = api.list_lfs_files("username/my-cool-repo")

        # Filter files to delete based on a combination of `filename`, `pushed_at`, `ref` or `size`.
        # e.g. select only LFS files in the "checkpoints" folder
        >>> lfs_files_to_delete = (lfs_file for lfs_file in lfs_files if lfs_file.filename.startswith("checkpoints/"))

        # Permanently delete LFS files
        >>> api.permanently_delete_lfs_files("username/my-cool-repo", lfs_files_to_delete)
        ```
    """
    # Prepare request
    if repo_type is None:
        repo_type = constants.REPO_TYPE_MODEL
    url = f"{self.endpoint}/api/{repo_type}s/{repo_id}/lfs-files/batch"
    headers = self._build_hf_headers(token=token)

    # Delete LFS items by batches of 1000
    for batch in chunk_iterable(lfs_files, 1000):
        shas = [item.file_oid for item in batch]
        if not shas:
            # Nothing (left) to delete: never send a request with an empty list of SHAs.
            return
        payload = {
            "deletions": {
                "sha": shas,
                "rewriteHistory": rewrite_history,
            }
        }
        response = get_session().post(url, headers=headers, json=payload)
        # Stop at the first failing batch; batches already sent are not rolled back.
        hf_raise_for_status(response)
3576+
33903577
@validate_hf_hub_args
33913578
def create_repo(
33923579
self,
@@ -9641,7 +9828,6 @@ def _parse_revision_from_pr_url(pr_url: str) -> str:
96419828
delete_repo = api.delete_repo
96429829
update_repo_visibility = api.update_repo_visibility
96439830
update_repo_settings = api.update_repo_settings
9644-
super_squash_history = api.super_squash_history
96459831
move_repo = api.move_repo
96469832
upload_file = api.upload_file
96479833
upload_folder = api.upload_folder
@@ -9656,6 +9842,11 @@ def _parse_revision_from_pr_url(pr_url: str) -> str:
96569842
delete_tag = api.delete_tag
96579843
get_full_repo_name = api.get_full_repo_name
96589844

9845+
# Danger-zone API
9846+
super_squash_history = api.super_squash_history
9847+
list_lfs_files = api.list_lfs_files
9848+
permanently_delete_lfs_files = api.permanently_delete_lfs_files
9849+
96599850
# Safetensors helpers
96609851
get_safetensors_metadata = api.get_safetensors_metadata
96619852
parse_safetensors_file_metadata = api.parse_safetensors_file_metadata

tests/test_hf_api.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3103,6 +3103,69 @@ def test_super_squash_history_on_special_ref(self, repo_url: RepoUrl) -> None:
31033103
assert len(squashed_branch_commits) == 1
31043104

31053105

3106+
class TestListAndPermanentlyDeleteLFSFiles(HfApiCommonTest):
    """End-to-end check of `list_lfs_files` and `permanently_delete_lfs_files` on a temporary repo."""

    @use_tmp_repo()
    def test_list_and_delete_lfs_files(self, repo_url: RepoUrl) -> None:
        repo_id = repo_url.repo_id

        # Populate `main` with two `.bin` files (expected to be LFS-tracked) and two regular text files.
        self._api.upload_file(path_or_fileobj=b"LFS content", path_in_repo="lfs_file.bin", repo_id=repo_id)
        self._api.upload_file(path_or_fileobj=b"TXT content", path_in_repo="txt_file.txt", repo_id=repo_id)
        self._api.upload_file(path_or_fileobj=b"LFS content 2", path_in_repo="lfs_file_2.bin", repo_id=repo_id)
        self._api.upload_file(path_or_fileobj=b"TXT content 2", path_in_repo="txt_file_2.txt", repo_id=repo_id)

        # Same pair of file kinds, pushed to a dedicated branch.
        self._api.create_branch(repo_id=repo_id, branch="my-branch")
        self._api.upload_file(
            path_or_fileobj=b"LFS content branch",
            path_in_repo="lfs_file_branch.bin",
            repo_id=repo_id,
            revision="my-branch",
        )
        self._api.upload_file(
            path_or_fileobj=b"TXT content branch",
            path_in_repo="txt_file_branch.txt",
            repo_id=repo_id,
            revision="my-branch",
        )

        # Same pair of file kinds, pushed through pull requests.
        self._api.upload_file(
            path_or_fileobj=b"LFS content PR", path_in_repo="lfs_file_PR.bin", repo_id=repo_id, create_pr=True
        )
        self._api.upload_file(
            path_or_fileobj=b"TXT content PR", path_in_repo="txt_file_PR.txt", repo_id=repo_id, create_pr=True
        )

        # The listing must report every LFS object, whatever ref it was pushed to.
        listed = list(self._api.list_lfs_files(repo_id=repo_id))
        assert len(listed) == 4
        assert {entry.filename for entry in listed} == {
            "lfs_file.bin",
            "lfs_file_2.bin",
            "lfs_file_branch.bin",
            "lfs_file_PR.bin",
        }

        # Keep only the LFS objects attached to `main`.
        lfs_files_on_main = [entry for entry in listed if entry.ref == "main"]
        assert len(lfs_files_on_main) == 2

        # Permanently delete them.
        self._api.permanently_delete_lfs_files(repo_id=repo_id, lfs_files=lfs_files_on_main)

        # Objects pushed to the branch and to the PR must be untouched.
        remaining = list(self._api.list_lfs_files(repo_id=repo_id))
        assert len(remaining) == 2
        assert {entry.filename for entry in remaining} == {"lfs_file_branch.bin", "lfs_file_PR.bin"}

        # The deleted files are gone from `main`, so downloading "lfs_file.bin" fails with EntryNotFoundError.
        repo_files = self._api.list_repo_files(repo_id=repo_id)
        assert set(repo_files) == {".gitattributes", "txt_file.txt", "txt_file_2.txt"}
        with pytest.raises(EntryNotFoundError):
            self._api.hf_hub_download(repo_id=repo_id, filename="lfs_file.bin")
3167+
3168+
31063169
@pytest.mark.vcr
31073170
class TestSpaceAPIProduction(unittest.TestCase):
31083171
"""

0 commit comments

Comments
 (0)