Skip to content

Commit e3fa660

Browse files
WauplinLysandreJik
andauthored
Add delete_patterns option to upload_folder (#1370)
* Add delete_patterns option to upload_folder + tests + adapt mixins * adapt upload guide * fix test python3.7 * Apply suggestions from code review Co-authored-by: Lysandre Debut <[email protected]> --------- Co-authored-by: Lysandre Debut <[email protected]>
1 parent 9a0497e commit e3fa660

File tree

6 files changed

+323
-104
lines changed

6 files changed

+323
-104
lines changed

docs/source/guides/upload.mdx

Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,24 +58,54 @@ Specify the path of the file to upload, where you want to upload the file to in
5858

5959
### Upload a folder
6060

61-
Use the [`upload_folder`] function to upload a local folder to an existing repository. Specify the path of the local folder to upload, where you want to upload the folder to in the repository, and the name of the repository you want to add the folder to. Depending on your repository type, you can optionally set the repository type as a `dataset`, `model`, or `space`.
61+
Use the [`upload_folder`] function to upload a local folder to an existing repository. Specify the path of the local folder
62+
to upload, where you want to upload the folder to in the repository, and the name of the repository you want to add the
63+
folder to. Depending on your repository type, you can optionally set the repository type as a `dataset`, `model`, or `space`.
64+
65+
```py
66+
>>> from huggingface_hub import HfApi
67+
>>> api = HfApi()
68+
69+
# Upload all the content from the local folder to your remote Space.
70+
# By default, files are uploaded at the root of the repo
71+
>>> api.upload_folder(
72+
... folder_path="/path/to/local/space",
73+
... repo_id="username/my-cool-space",
74+
... repo_type="space",
75+
... )
76+
```
6277

6378
Use the `allow_patterns` and `ignore_patterns` arguments to specify which files to upload. These parameters accept either a single pattern or a list of patterns.
6479
Patterns are Standard Wildcards (globbing patterns) as documented [here](https://tldp.org/LDP/GNU-Linux-Tools-Summary/html/x11655.htm).
6580
If both `allow_patterns` and `ignore_patterns` are provided, both constraints apply. By default, all files from the folder are uploaded.
6681

6782
```py
68-
>>> from huggingface_hub import HfApi
69-
>>> api = HfApi()
7083
>>> api.upload_folder(
7184
... folder_path="/path/to/local/folder",
72-
... path_in_repo="my-dataset/train",
85+
... path_in_repo="my-dataset/train", # Upload to a specific folder
7386
... repo_id="username/test-dataset",
7487
... repo_type="dataset",
75-
... ignore_patterns="**/logs/*.txt",
88+
... ignore_patterns="**/logs/*.txt", # Ignore all text logs
89+
... )
90+
```
91+
92+
You can also use the `delete_patterns` argument to specify files you want to delete from the repo in the same commit.
93+
This can prove useful if you want to clean a remote folder before pushing files in it and you don't know which files
94+
already exist.
95+
96+
The example below uploads the local `./logs` folder to the remote `/experiment/logs/` folder. Only txt files are uploaded
97+
but before that, all previous logs on the repo are deleted. All of this in a single commit.
98+
```py
99+
>>> api.upload_folder(
100+
... folder_path="/path/to/local/folder/logs",
101+
... repo_id="username/trained-model",
102+
... path_in_repo="experiment/logs/",
103+
... allow_patterns="*.txt", # Upload all local text files
104+
... delete_patterns="*.txt", # Delete all remote text files before uploading
76105
... )
77106
```
78107

108+
79109
### create_commit
80110

81111
If you want to work at a commit-level, use the [`create_commit`] function directly. There are two types of operations supported by [`create_commit`]:

src/huggingface_hub/fastai_utils.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -353,13 +353,15 @@ def push_to_hub_fastai(
353353
create_pr: Optional[bool] = None,
354354
allow_patterns: Optional[Union[List[str], str]] = None,
355355
ignore_patterns: Optional[Union[List[str], str]] = None,
356+
delete_patterns: Optional[Union[List[str], str]] = None,
356357
api_endpoint: Optional[str] = None,
357358
):
358359
"""
359360
Upload learner checkpoint files to the Hub.
360361
361-
Use `allow_patterns` and `ignore_patterns` to precisely filter which files should be
362-
pushed to the hub. See [`upload_folder`] reference for more details.
362+
Use `allow_patterns` and `ignore_patterns` to precisely filter which files should be pushed to the hub. Use
363+
`delete_patterns` to delete existing remote files in the same commit. See [`upload_folder`] reference for more
364+
details.
363365
364366
Args:
365367
learner (`Learner`):
@@ -387,6 +389,9 @@ def push_to_hub_fastai(
387389
If provided, only files matching at least one pattern are pushed.
388390
ignore_patterns (`List[str]` or `str`, *optional*):
389391
If provided, files matching any of the patterns are not pushed.
392+
delete_patterns (`List[str]` or `str`, *optional*):
393+
If provided, remote files matching any of the patterns will be deleted from the repo.
394+
390395
Returns:
391396
The url of the commit of your model in the given repository.
392397
@@ -401,20 +406,20 @@ def push_to_hub_fastai(
401406
"""
402407
_check_fastai_fastcore_versions()
403408
api = HfApi(endpoint=api_endpoint)
404-
api.create_repo(repo_id=repo_id, repo_type="model", token=token, private=private, exist_ok=True)
409+
repo_id = api.create_repo(repo_id=repo_id, token=token, private=private, exist_ok=True).repo_id
405410

406411
# Push the files to the repo in a single commit
407412
with SoftTemporaryDirectory() as tmp:
408413
saved_path = Path(tmp) / repo_id
409414
_save_pretrained_fastai(learner, saved_path, config=config)
410415
return api.upload_folder(
411416
repo_id=repo_id,
412-
repo_type="model",
413417
token=token,
414418
folder_path=saved_path,
415419
commit_message=commit_message,
416420
revision=branch,
417421
create_pr=create_pr,
418422
allow_patterns=allow_patterns,
419423
ignore_patterns=ignore_patterns,
424+
delete_patterns=delete_patterns,
420425
)

src/huggingface_hub/hf_api.py

Lines changed: 116 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515
import json
16-
import os
1716
import pprint
1817
import re
1918
import textwrap
@@ -1855,6 +1854,7 @@ def list_repo_files(
18551854
Returns:
18561855
`List[str]`: the list of files in a given repository.
18571856
"""
1857+
# TODO: use https://huggingface.co/api/{repo_type}/{repo_id}/tree/{revision}/{subfolder}
18581858
repo_info = self.repo_info(
18591859
repo_id,
18601860
revision=revision,
@@ -2626,21 +2626,25 @@ def upload_folder(
26262626
parent_commit: Optional[str] = None,
26272627
allow_patterns: Optional[Union[List[str], str]] = None,
26282628
ignore_patterns: Optional[Union[List[str], str]] = None,
2629+
delete_patterns: Optional[Union[List[str], str]] = None,
26292630
):
26302631
"""
2631-
Upload a local folder to the given repo. The upload is done
2632-
through a HTTP requests, and doesn't require git or git-lfs to be
2633-
installed.
2632+
Upload a local folder to the given repo. The upload is done through an HTTP request and doesn't require git or
2633+
git-lfs to be installed.
2634+
2635+
The structure of the folder will be preserved. Files with the same name already present in the repository will
2636+
be overwritten. Others will be left untouched.
26342637
2635-
The structure of the folder will be preserved. Files with the same name
2636-
already present in the repository will be overwritten, others will be left untouched.
2638+
Use the `allow_patterns` and `ignore_patterns` arguments to specify which files to upload. These parameters
2639+
accept either a single pattern or a list of patterns. Patterns are Standard Wildcards (globbing patterns) as
2640+
documented [here](https://tldp.org/LDP/GNU-Linux-Tools-Summary/html/x11655.htm). If both `allow_patterns` and
2641+
`ignore_patterns` are provided, both constraints apply. By default, all files from the folder are uploaded.
26372642
2638-
Use the `allow_patterns` and `ignore_patterns` arguments to specify which files
2639-
to upload. These parameters accept either a single pattern or a list of
2640-
patterns. Patterns are Standard Wildcards (globbing patterns) as documented
2641-
[here](https://tldp.org/LDP/GNU-Linux-Tools-Summary/html/x11655.htm). If both
2642-
`allow_patterns` and `ignore_patterns` are provided, both constraints apply. By
2643-
default, all files from the folder are uploaded.
2643+
Use the `delete_patterns` argument to specify remote files you want to delete. Input type is the same as for
2644+
`allow_patterns` (see above). If `path_in_repo` is also provided, the patterns are matched against paths
2645+
relative to this folder. For example, `upload_folder(..., path_in_repo="experiment", delete_patterns="logs/*")`
2646+
will delete any remote file under `experiment/logs/`. Note that the `.gitattributes` file will not be deleted
2647+
even if it matches the patterns.
26442648
26452649
Uses `HfApi.create_commit` under the hood.
26462650
@@ -2683,6 +2687,10 @@ def upload_folder(
26832687
If provided, only files matching at least one pattern are uploaded.
26842688
ignore_patterns (`List[str]` or `str`, *optional*):
26852689
If provided, files matching any of the patterns are not uploaded.
2690+
delete_patterns (`List[str]` or `str`, *optional*):
2691+
If provided, remote files matching any of the patterns will be deleted from the repo while committing
2692+
new files. This is useful if you don't know which files have already been uploaded.
2693+
Note: to avoid discrepancies the `.gitattributes` file is not deleted even if it matches the pattern.
26862694
26872695
Returns:
26882696
`str`: A URL to visualize the uploaded folder on the hub
@@ -2700,16 +2708,16 @@ def upload_folder(
27002708
27012709
<Tip warning={true}>
27022710
2703-
`upload_folder` assumes that the repo already exists on the Hub. If you get a
2704-
Client error 404, please make sure you are authenticated and that `repo_id` and
2705-
`repo_type` are set correctly. If repo does not exist, create it first using
2706-
[`~hf_api.create_repo`].
2711+
`upload_folder` assumes that the repo already exists on the Hub. If you get a Client error 404, please make
2712+
sure you are authenticated and that `repo_id` and `repo_type` are set correctly. If repo does not exist, create
2713+
it first using [`~hf_api.create_repo`].
27072714
27082715
</Tip>
27092716
27102717
Example:
27112718
27122719
```python
2720+
# Upload checkpoints folder except the log files
27132721
>>> upload_folder(
27142722
... folder_path="local/checkpoints",
27152723
... path_in_repo="remote/experiment/checkpoints",
@@ -2720,6 +2728,19 @@ def upload_folder(
27202728
... )
27212729
# "https://huggingface.co/datasets/username/my-dataset/tree/main/remote/experiment/checkpoints"
27222730
2731+
# Upload checkpoints folder including logs while deleting existing logs from the repo
2732+
# Useful if you don't know exactly which log files have already been pushed
2733+
>>> upload_folder(
2734+
... folder_path="local/checkpoints",
2735+
... path_in_repo="remote/experiment/checkpoints",
2736+
... repo_id="username/my-dataset",
2737+
... repo_type="dataset",
2738+
... token="my_token",
2739+
... delete_patterns="**/logs/*.txt",
2740+
... )
2741+
"https://huggingface.co/datasets/username/my-dataset/tree/main/remote/experiment/checkpoints"
2742+
2743+
# Upload checkpoints folder while creating a PR
27232744
>>> upload_folder(
27242745
... folder_path="local/checkpoints",
27252746
... path_in_repo="remote/experiment/checkpoints",
@@ -2743,17 +2764,33 @@ def upload_folder(
27432764
commit_message if commit_message is not None else f"Upload {path_in_repo} with huggingface_hub"
27442765
)
27452766

2746-
files_to_add = _prepare_upload_folder_commit(
2767+
delete_operations = self._prepare_upload_folder_deletions(
2768+
repo_id=repo_id,
2769+
repo_type=repo_type,
2770+
revision=DEFAULT_REVISION if create_pr else revision,
2771+
token=token,
2772+
path_in_repo=path_in_repo,
2773+
delete_patterns=delete_patterns,
2774+
)
2775+
add_operations = _prepare_upload_folder_additions(
27472776
folder_path,
27482777
path_in_repo,
27492778
allow_patterns=allow_patterns,
27502779
ignore_patterns=ignore_patterns,
27512780
)
27522781

2782+
# Optimize operations: if some files will be overwritten, we don't need to delete them first
2783+
if len(add_operations) > 0:
2784+
added_paths = set(op.path_in_repo for op in add_operations)
2785+
delete_operations = [
2786+
delete_op for delete_op in delete_operations if delete_op.path_in_repo not in added_paths
2787+
]
2788+
commit_operations = delete_operations + add_operations
2789+
27532790
commit_info = self.create_commit(
27542791
repo_type=repo_type,
27552792
repo_id=repo_id,
2756-
operations=files_to_add,
2793+
operations=commit_operations,
27572794
commit_message=commit_message,
27582795
commit_description=commit_description,
27592796
token=token,
@@ -4173,8 +4210,48 @@ def _build_hf_headers(
41734210
user_agent=user_agent or self.user_agent,
41744211
)
41754212

4213+
def _prepare_upload_folder_deletions(
4214+
self,
4215+
repo_id: str,
4216+
repo_type: Optional[str],
4217+
revision: Optional[str],
4218+
token: Optional[str],
4219+
path_in_repo: str,
4220+
delete_patterns: Optional[Union[List[str], str]],
4221+
) -> List[CommitOperationDelete]:
4222+
"""Generate the list of Delete operations for a commit to delete files from a repo.
4223+
4224+
List remote files and match them against the `delete_patterns` constraints. Returns a list of [`CommitOperationDelete`]
4225+
with the matching items.
41764226
4177-
def _prepare_upload_folder_commit(
4227+
Note: `.gitattributes` file is essential to make a repo work properly on the Hub. This file will always be
4228+
kept even if it matches the `delete_patterns` constraints.
4229+
"""
4230+
if delete_patterns is None:
4231+
# If no delete patterns, no need to list and filter remote files
4232+
return []
4233+
4234+
# List remote files
4235+
filenames = self.list_repo_files(repo_id=repo_id, revision=revision, repo_type=repo_type, token=token)
4236+
4237+
# Compute relative path in repo
4238+
if path_in_repo:
4239+
path_in_repo = path_in_repo.strip("/") + "/" # harmonize
4240+
relpath_to_abspath = {
4241+
file[len(path_in_repo) :]: file for file in filenames if file.startswith(path_in_repo)
4242+
}
4243+
else:
4244+
relpath_to_abspath = {file: file for file in filenames}
4245+
4246+
# Apply filter on relative paths and return
4247+
return [
4248+
CommitOperationDelete(path_in_repo=relpath_to_abspath[relpath], is_folder=False)
4249+
for relpath in filter_repo_objects(relpath_to_abspath.keys(), allow_patterns=delete_patterns)
4250+
if relpath_to_abspath[relpath] != ".gitattributes"
4251+
]
4252+
4253+
4254+
def _prepare_upload_folder_additions(
41784255
folder_path: Union[str, Path],
41794256
path_in_repo: str,
41804257
allow_patterns: Optional[Union[List[str], str]] = None,
@@ -4185,30 +4262,29 @@ def _prepare_upload_folder_commit(
41854262
Files not matching the `allow_patterns` (allowlist) and `ignore_patterns` (denylist)
41864263
constraints are discarded.
41874264
"""
4188-
folder_path = os.path.normpath(os.path.expanduser(folder_path))
4189-
if not os.path.isdir(folder_path):
4265+
folder_path = Path(folder_path).expanduser().resolve()
4266+
if not folder_path.is_dir():
41904267
raise ValueError(f"Provided path: '{folder_path}' is not a directory")
41914268

4192-
files_to_add: List[CommitOperationAdd] = []
4193-
for dirpath, _, filenames in os.walk(folder_path):
4194-
for filename in filenames:
4195-
abs_path = os.path.join(dirpath, filename)
4196-
rel_path = os.path.relpath(abs_path, folder_path)
4197-
files_to_add.append(
4198-
CommitOperationAdd(
4199-
path_or_fileobj=abs_path,
4200-
path_in_repo=os.path.normpath(os.path.join(path_in_repo, rel_path)).replace(os.sep, "/"),
4201-
)
4202-
)
4203-
4204-
return list(
4205-
filter_repo_objects(
4206-
files_to_add,
4207-
allow_patterns=allow_patterns,
4208-
ignore_patterns=ignore_patterns,
4209-
key=lambda x: x.path_in_repo,
4269+
# List files from folder
4270+
relpath_to_abspath = {
4271+
path.relative_to(folder_path).as_posix(): path
4272+
for path in sorted(folder_path.glob("**/*")) # sorted to be deterministic
4273+
if path.is_file()
4274+
}
4275+
4276+
# Filter files and return
4277+
# Patterns are applied on the path relative to `folder_path`. `path_in_repo` is prefixed after the filtering.
4278+
prefix = f"{path_in_repo.strip('/')}/" if path_in_repo else ""
4279+
return [
4280+
CommitOperationAdd(
4281+
path_or_fileobj=relpath_to_abspath[relpath], # absolute path on disk
4282+
path_in_repo=prefix + relpath, # "absolute" path in repo
42104283
)
4211-
)
4284+
for relpath in filter_repo_objects(
4285+
relpath_to_abspath.keys(), allow_patterns=allow_patterns, ignore_patterns=ignore_patterns
4286+
)
4287+
]
42124288

42134289

42144290
def _parse_revision_from_pr_url(pr_url: str) -> str:

0 commit comments

Comments
 (0)