Skip to content

Commit e9cd695

Browse files
authored
huggingface-cli upload - Validate README.md before file hashing (#2452)
1 parent dfd73c0 commit e9cd695

File tree

1 file changed

+102
-57
lines changed

1 file changed

+102
-57
lines changed

src/huggingface_hub/hf_api.py

Lines changed: 102 additions & 57 deletions
Original file line number | Diff line number | Diff line change
@@ -3797,26 +3797,10 @@ def create_commit(
37973797
for addition in additions:
37983798
if addition.path_in_repo == "README.md":
37993799
with addition.as_file() as file:
3800-
response = get_session().post(
3801-
f"{ENDPOINT}/api/validate-yaml",
3802-
json={"content": file.read().decode(), "repoType": repo_type},
3803-
headers=headers,
3804-
)
3805-
# Handle warnings (example: empty metadata)
3806-
response_content = response.json()
3807-
message = "\n".join(
3808-
[f"- {warning.get('message')}" for warning in response_content.get("warnings", [])]
3809-
)
3810-
if message:
3811-
warnings.warn(f"Warnings while validating metadata in README.md:\n{message}")
3812-
3813-
# Raise on errors
3814-
try:
3815-
hf_raise_for_status(response)
3816-
except BadRequestError as e:
3817-
errors = response_content.get("errors", [])
3818-
message = "\n".join([f"- {error.get('message')}" for error in errors])
3819-
raise ValueError(f"Invalid metadata in README.md.\n{message}") from e
3800+
content = file.read().decode()
3801+
self._validate_yaml(content, repo_type=repo_type, token=token)
3802+
# Skip other additions after `README.md` has been processed
3803+
break
38203804

38213805
# If updating twice the same file or update then delete a file in a single commit
38223806
_warn_on_overwriting_operations(operations)
@@ -4875,11 +4859,13 @@ def upload_folder(
48754859
path_in_repo=path_in_repo,
48764860
delete_patterns=delete_patterns,
48774861
)
4878-
add_operations = _prepare_upload_folder_additions(
4862+
add_operations = self._prepare_upload_folder_additions(
48794863
folder_path,
48804864
path_in_repo,
48814865
allow_patterns=allow_patterns,
48824866
ignore_patterns=ignore_patterns,
4867+
token=token,
4868+
repo_type=repo_type,
48834869
)
48844870

48854871
# Optimize operations: if some files will be overwritten, we don't need to delete them first
@@ -9182,6 +9168,101 @@ def _prepare_folder_deletions(
91829168
if relpath_to_abspath[relpath] != ".gitattributes"
91839169
]
91849170

9171+
def _prepare_upload_folder_additions(
9172+
self,
9173+
folder_path: Union[str, Path],
9174+
path_in_repo: str,
9175+
allow_patterns: Optional[Union[List[str], str]] = None,
9176+
ignore_patterns: Optional[Union[List[str], str]] = None,
9177+
repo_type: Optional[str] = None,
9178+
token: Union[bool, str, None] = None,
9179+
) -> List[CommitOperationAdd]:
9180+
"""Generate the list of Add operations for a commit to upload a folder.
9181+
9182+
Files not matching the `allow_patterns` (allowlist) and `ignore_patterns` (denylist)
9183+
constraints are discarded.
9184+
"""
9185+
9186+
folder_path = Path(folder_path).expanduser().resolve()
9187+
if not folder_path.is_dir():
9188+
raise ValueError(f"Provided path: '{folder_path}' is not a directory")
9189+
9190+
# List files from folder
9191+
relpath_to_abspath = {
9192+
path.relative_to(folder_path).as_posix(): path
9193+
for path in sorted(folder_path.glob("**/*")) # sorted to be deterministic
9194+
if path.is_file()
9195+
}
9196+
9197+
# Filter files
9198+
# Patterns are applied on the path relative to `folder_path`. `path_in_repo` is prefixed after the filtering.
9199+
filtered_repo_objects = list(
9200+
filter_repo_objects(
9201+
relpath_to_abspath.keys(), allow_patterns=allow_patterns, ignore_patterns=ignore_patterns
9202+
)
9203+
)
9204+
9205+
prefix = f"{path_in_repo.strip('/')}/" if path_in_repo else ""
9206+
9207+
# If updating a README.md file, make sure the metadata format is valid
9208+
# It's better to fail early than to fail after all the files have been hashed.
9209+
if "README.md" in filtered_repo_objects:
9210+
self._validate_yaml(
9211+
content=relpath_to_abspath["README.md"].read_text(),
9212+
repo_type=repo_type,
9213+
token=token,
9214+
)
9215+
9216+
return [
9217+
CommitOperationAdd(
9218+
path_or_fileobj=relpath_to_abspath[relpath], # absolute path on disk
9219+
path_in_repo=prefix + relpath, # "absolute" path in repo
9220+
)
9221+
for relpath in filtered_repo_objects
9222+
]
9223+
9224+
def _validate_yaml(self, content: str, *, repo_type: Optional[str] = None, token: Union[bool, str, None] = None):
    """
    Validate YAML from `README.md`, used before file hashing and upload.

    The content is posted to the `/api/validate-yaml` route of `self.endpoint`. Every entry of
    the `"warnings"` list in the response is reported through `warnings.warn`. If the request is
    rejected with a `BadRequestError`, the entries of the `"errors"` list are gathered into a
    `ValueError`.

    Args:
        content (`str`):
            Content of `README.md` to validate.
        repo_type (`str`, *optional*):
            The type of the repo the `README.md` is validated for. Sent to the server as
            `repoType`. Defaults to `REPO_TYPE_MODEL` when `None`.
        token (`bool` or `str`, *optional*):
            A valid user access token (string). Defaults to the locally saved
            token, which is the recommended method for authentication (see
            https://huggingface.co/docs/huggingface_hub/quick-start#authentication).
            To disable authentication, pass `False`.

    Raises:
        - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
          if YAML is invalid

        Any other error raised by `hf_raise_for_status` is propagated unchanged.
    """
    repo_type = repo_type if repo_type is not None else REPO_TYPE_MODEL
    headers = self._build_hf_headers(token=token)

    response = get_session().post(
        f"{self.endpoint}/api/validate-yaml",
        json={"content": content, "repoType": repo_type},
        headers=headers,
    )

    # The body is parsed before the status check because warnings must be reported even
    # when the request is rejected.
    try:
        response_content = response.json()
    except ValueError:
        # The body is not JSON (JSON decoding errors derive from `ValueError`). Surface the
        # HTTP error first if there is one, since it is the actual cause; otherwise re-raise
        # the decoding error as before.
        hf_raise_for_status(response)
        raise

    # Handle warnings (example: empty metadata)
    message = "\n".join([f"- {warning.get('message')}" for warning in response_content.get("warnings", [])])
    if message:
        warnings.warn(f"Warnings while validating metadata in README.md:\n{message}")

    # Raise on errors
    try:
        hf_raise_for_status(response)
    except BadRequestError as e:
        errors = response_content.get("errors", [])
        message = "\n".join([f"- {error.get('message')}" for error in errors])
        raise ValueError(f"Invalid metadata in README.md.\n{message}") from e
9265+
91859266
def get_user_overview(self, username: str) -> User:
91869267
"""
91879268
Get an overview of a user on the Hub.
@@ -9275,42 +9356,6 @@ def list_user_following(self, username: str) -> Iterable[User]:
92759356
yield User(**followed_user)
92769357

92779358

9278-
def _prepare_upload_folder_additions(
    folder_path: Union[str, Path],
    path_in_repo: str,
    allow_patterns: Optional[Union[List[str], str]] = None,
    ignore_patterns: Optional[Union[List[str], str]] = None,
) -> List[CommitOperationAdd]:
    """Build the Add operations needed to upload the content of a local folder.

    Only files accepted by `filter_repo_objects` for the given `allow_patterns` (allowlist)
    and `ignore_patterns` (denylist) are turned into operations. Patterns are matched against
    paths relative to `folder_path`; `path_in_repo` is prepended only afterwards.

    Raises:
        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
            If `folder_path` is not a directory.
    """
    folder_path = Path(folder_path).expanduser().resolve()
    if not folder_path.is_dir():
        raise ValueError(f"Provided path: '{folder_path}' is not a directory")

    # Index every regular file by its POSIX-style path relative to the folder.
    # Entries are visited in sorted order so the result is deterministic.
    local_files = {}
    for entry in sorted(folder_path.glob("**/*")):
        if entry.is_file():
            local_files[entry.relative_to(folder_path).as_posix()] = entry

    # Prefix giving the location in the repo; empty when no `path_in_repo` is provided.
    if path_in_repo:
        repo_prefix = f"{path_in_repo.strip('/')}/"
    else:
        repo_prefix = ""

    kept_relpaths = filter_repo_objects(
        local_files.keys(), allow_patterns=allow_patterns, ignore_patterns=ignore_patterns
    )

    operations = []
    for relpath in kept_relpaths:
        operations.append(
            CommitOperationAdd(
                path_or_fileobj=local_files[relpath],  # absolute path on disk
                path_in_repo=repo_prefix + relpath,  # "absolute" path in repo
            )
        )
    return operations
9312-
9313-
93149359
def _parse_revision_from_pr_url(pr_url: str) -> str:
93159360
"""Safely parse revision number from a PR url.
93169361

0 commit comments

Comments
 (0)