
Commit aa28404

Authored by davanstrien, hanouticelina, Wauplin, and github-actions[bot]
Add validation warnings for repository limits in upload_large_folder (#3280)
* Add validation warnings for repository limits in upload_large_folder
* Add tests for upload validation warnings
* Update docstring to document validation checks
* ruff
* Apply ruff formatting to test file
* Update src/huggingface_hub/_upload_large_folder.py
  Co-authored-by: Lucain <[email protected]>
* Refactor validation logic into separate function and fix GB conversion
  - Extract validation checks into _validate_upload_limits() function
  - Fix file size conversion from binary (1024^3) to decimal (10^9) GB
  - Addresses review comments from @Wauplin and @hanouticelina
* Fix subdirectory counting to track immediate children only
  - Corrects bug where nested subdirs were attributed to grandparents
  - Now properly tracks each subdir to its immediate parent folder
  - Addresses @Wauplin's review comment about directory structure
* Add comprehensive tests for _validate_upload_limits function
  - Test subdirectory counting logic
  - Test decimal GB conversion for file sizes
  - Test mixed files and subdirectories counting
  - Test deeply nested structures
  - All tests passing
  - Addresses testing requirements from PR review
* Apply ruff linting fixes
  - Remove unused Counter import
  - Fix whitespace in docstrings and blank lines
  - All linting checks now pass
* Apply make style formatting
  - Add blank line after import statement
  - Add blank line after docstring in nested class
  - All quality checks pass (except pre-existing mypy issues)
* Fix mypy type annotation for entries_per_folder
  - Add Any and Dict to typing imports
  - Properly annotate entries_per_folder as Dict[str, Any]
  - Fixes CI mypy errors
* Remove old validation tests that are incompatible with refactored code
  - Remove UploadLargeFolderValidationTest class entirely
  - These tests relied on mocking internal implementation details
  - Validation logic is now fully tested in TestValidateUploadLimits
  - Fixes CI test failures
* Update src/huggingface_hub/_upload_large_folder.py
  Co-authored-by: célina <[email protected]>
* Update src/huggingface_hub/_upload_large_folder.py
  Co-authored-by: célina <[email protected]>
* Apply style fixes

---------

Co-authored-by: célina <[email protected]>
Co-authored-by: Lucain <[email protected]>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent c7ffbdb commit aa28404
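
The binary-to-decimal GB fix called out in the commit message is worth spelling out. A minimal sketch (illustrative values, not part of the diff) of why the two conventions disagree:

# Hub limits are expressed in decimal GB (10^9 bytes). Dividing by the
# binary unit (1024^3 bytes, i.e. GiB) under-reports sizes by ~7%, so a
# file right at a decimal-GB limit could slip past the check.
size_bytes = 50_000_000_000  # a file exactly at the 50GB hard limit

size_gb_decimal = size_bytes / 1_000_000_000  # 50.0   -> compared against MAX_FILE_SIZE_GB
size_gb_binary = size_bytes / (1024**3)       # ~46.57 -> would not be flagged

print(f"decimal: {size_gb_decimal:.2f} GB, binary: {size_gb_binary:.2f} GiB")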

3 files changed: +238 -4 lines

src/huggingface_hub/_upload_large_folder.py

Lines changed: 108 additions & 1 deletion
@@ -24,7 +24,7 @@
 from datetime import datetime
 from pathlib import Path
 from threading import Lock
-from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 from urllib.parse import quote
 
 from . import constants
@@ -49,6 +49,108 @@
 UPLOAD_BATCH_SIZE_XET = 256  # Max 256 files per upload batch for XET-enabled repos
 UPLOAD_BATCH_SIZE_LFS = 1  # Otherwise, batches of 1 for regular LFS upload
 
+# Repository limits (from https://huggingface.co/docs/hub/repositories-recommendations)
+MAX_FILES_PER_REPO = 100_000  # Recommended maximum number of files per repository
+MAX_FILES_PER_FOLDER = 10_000  # Recommended maximum number of files per folder
+MAX_FILE_SIZE_GB = 50  # Hard limit for individual file size
+RECOMMENDED_FILE_SIZE_GB = 20  # Recommended maximum for individual file size
+
+
+def _validate_upload_limits(paths_list: List[LocalUploadFilePaths]) -> None:
+    """
+    Validate upload against repository limits and warn about potential issues.
+
+    Args:
+        paths_list: List of file paths to be uploaded
+
+    Warns about:
+        - Too many files in the repository (>100k)
+        - Too many entries (files or subdirectories) in a single folder (>10k)
+        - Files exceeding size limits (>20GB recommended, >50GB hard limit)
+    """
+    logger.info("Running validation checks on files to upload...")
+
+    # Check 1: Total file count
+    if len(paths_list) > MAX_FILES_PER_REPO:
+        logger.warning(
+            f"You are about to upload {len(paths_list):,} files. "
+            f"This exceeds the recommended limit of {MAX_FILES_PER_REPO:,} files per repository.\n"
+            f"Consider:\n"
+            f" - Splitting your data into multiple repositories\n"
+            f" - Using fewer, larger files (e.g., parquet files)\n"
+            f" - See: https://huggingface.co/docs/hub/repositories-recommendations"
+        )
+
+    # Check 2: Files and subdirectories per folder
+    # Track immediate children (files and subdirs) for each folder
+    from collections import defaultdict
+
+    entries_per_folder: Dict[str, Any] = defaultdict(lambda: {"files": 0, "subdirs": set()})
+
+    for paths in paths_list:
+        path = Path(paths.path_in_repo)
+        parts = path.parts
+
+        # Count this file in its immediate parent directory
+        parent = str(path.parent) if str(path.parent) != "." else "."
+        entries_per_folder[parent]["files"] += 1
+
+        # Track immediate subdirectories for each parent folder
+        # Walk through the path components to track parent-child relationships
+        for i, child in enumerate(parts[:-1]):
+            parent = "." if i == 0 else "/".join(parts[:i])
+            entries_per_folder[parent]["subdirs"].add(child)
+
+    # Check limits for each folder
+    for folder, data in entries_per_folder.items():
+        file_count = data["files"]
+        subdir_count = len(data["subdirs"])
+        total_entries = file_count + subdir_count
+
+        if total_entries > MAX_FILES_PER_FOLDER:
+            folder_display = "root" if folder == "." else folder
+            logger.warning(
+                f"Folder '{folder_display}' contains {total_entries:,} entries "
+                f"({file_count:,} files and {subdir_count:,} subdirectories). "
+                f"This exceeds the recommended {MAX_FILES_PER_FOLDER:,} entries per folder.\n"
+                "Consider reorganising into sub-folders."
+            )
+
+    # Check 3: File sizes
+    large_files = []
+    very_large_files = []
+
+    for paths in paths_list:
+        size = paths.file_path.stat().st_size
+        size_gb = size / 1_000_000_000  # Use decimal GB as per Hub limits
+
+        if size_gb > MAX_FILE_SIZE_GB:
+            very_large_files.append((paths.path_in_repo, size_gb))
+        elif size_gb > RECOMMENDED_FILE_SIZE_GB:
+            large_files.append((paths.path_in_repo, size_gb))
+
+    # Warn about very large files (>50GB)
+    if very_large_files:
+        files_str = "\n - ".join(f"{path}: {size:.1f}GB" for path, size in very_large_files[:5])
+        more_str = f"\n ... and {len(very_large_files) - 5} more files" if len(very_large_files) > 5 else ""
+        logger.warning(
+            f"Found {len(very_large_files)} files exceeding the {MAX_FILE_SIZE_GB}GB hard limit:\n"
+            f" - {files_str}{more_str}\n"
+            f"These files may fail to upload. Consider splitting them into smaller chunks."
+        )
+
+    # Warn about large files (>20GB)
+    if large_files:
+        files_str = "\n - ".join(f"{path}: {size:.1f}GB" for path, size in large_files[:5])
+        more_str = f"\n ... and {len(large_files) - 5} more files" if len(large_files) > 5 else ""
+        logger.warning(
+            f"Found {len(large_files)} files larger than {RECOMMENDED_FILE_SIZE_GB}GB (recommended limit):\n"
+            f" - {files_str}{more_str}\n"
+            f"Large files may slow down loading and processing."
+        )
+
+    logger.info("Validation checks complete.")
+
 
 def upload_large_folder_internal(
     api: "HfApi",
@@ -118,6 +220,11 @@ def upload_large_folder_internal(
     paths_list = [get_local_upload_paths(folder_path, relpath) for relpath in filtered_paths_list]
     logger.info(f"Found {len(paths_list)} candidate files to upload")
 
+    # Validate upload against repository limits
+    _validate_upload_limits(paths_list)
+
+    logger.info("Starting upload...")
+
     # Read metadata for each file
     items = [
         (paths, read_upload_metadata(folder_path, paths.path_in_repo))

src/huggingface_hub/hf_api.py

Lines changed: 6 additions & 2 deletions
@@ -5319,14 +5319,18 @@ def upload_large_folder(
         1. (Check parameters and setup.)
         2. Create repo if missing.
         3. List local files to upload.
-        4. Start workers. Workers can perform the following tasks:
+        4. Run validation checks and display warnings if repository limits might be exceeded:
+            - Warns if the total number of files exceeds 100k (recommended limit).
+            - Warns if any folder contains more than 10k files (recommended limit).
+            - Warns about files larger than 20GB (recommended) or 50GB (hard limit).
+        5. Start workers. Workers can perform the following tasks:
             - Hash a file.
             - Get upload mode (regular or LFS) for a list of files.
             - Pre-upload an LFS file.
             - Commit a bunch of files.
           Once a worker finishes a task, it will move on to the next task based on the priority list (see below) until
           all files are uploaded and committed.
-        5. While workers are up, regularly print a report to sys.stdout.
+        6. While workers are up, regularly print a report to sys.stdout.
 
         Order of priority:
             1. Commit if more than 5 minutes since last commit attempt (and at least 1 file).
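
With this change, validation runs before any hashing or uploading, so the warnings appear up front. A minimal usage sketch (repo_id and folder_path are placeholders, not from the diff):

from huggingface_hub import HfApi

api = HfApi()
# Placeholder repo and folder; substitute your own.
api.upload_large_folder(
    repo_id="username/my-dataset",
    repo_type="dataset",
    folder_path="./my_large_folder",
)
# Any limit warnings (total file count, entries per folder, file sizes)
# are logged before the upload workers start.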

tests/test_upload_large_folder.py

Lines changed: 124 additions & 1 deletion
@@ -1,7 +1,16 @@
 # tests/test_upload_large_folder.py
+import unittest
+from unittest.mock import MagicMock, patch
+
 import pytest
 
-from huggingface_hub._upload_large_folder import COMMIT_SIZE_SCALE, LargeUploadStatus
+from huggingface_hub._upload_large_folder import (
+    COMMIT_SIZE_SCALE,
+    MAX_FILES_PER_FOLDER,
+    MAX_FILES_PER_REPO,
+    LargeUploadStatus,
+    _validate_upload_limits,
+)
 
 
 @pytest.fixture
@@ -32,3 +41,117 @@ def test_update_chunk_transitions(status, start_idx, success, delta_items, durat
 
     assert status._chunk_idx == expected_idx
     assert status.target_chunk() == COMMIT_SIZE_SCALE[expected_idx]
+
+
+class TestValidateUploadLimits(unittest.TestCase):
+    """Test the _validate_upload_limits function directly."""
+
+    class MockPath:
+        """Mock object to simulate LocalUploadFilePaths."""
+
+        def __init__(self, path_in_repo, size_bytes=1000):
+            self.path_in_repo = path_in_repo
+            self.file_path = MagicMock()
+            self.file_path.stat.return_value.st_size = size_bytes
+
+    @patch("huggingface_hub._upload_large_folder.logger")
+    def test_no_warnings_under_limits(self, mock_logger):
+        """Test that no warnings are issued when under all limits."""
+        paths = [
+            self.MockPath("file1.txt"),
+            self.MockPath("data/file2.txt"),
+            self.MockPath("data/sub/file3.txt"),
+        ]
+        _validate_upload_limits(paths)
+
+        # Should only have info messages, no warnings
+        mock_logger.warning.assert_not_called()
+
+    @patch("huggingface_hub._upload_large_folder.logger")
+    def test_warns_too_many_total_files(self, mock_logger):
+        """Test warning when total files exceed MAX_FILES_PER_REPO."""
+        # Create a list with more files than the limit
+        paths = [self.MockPath(f"file{i}.txt") for i in range(MAX_FILES_PER_REPO + 10)]
+        _validate_upload_limits(paths)
+
+        # Check that the appropriate warning was logged
+        warning_calls = [str(call) for call in mock_logger.warning.call_args_list]
+        assert any(f"{MAX_FILES_PER_REPO + 10:,} files" in call for call in warning_calls)
+        assert any("exceeds the recommended limit" in call for call in warning_calls)
+
+    @patch("huggingface_hub._upload_large_folder.logger")
+    def test_warns_too_many_subdirectories(self, mock_logger):
+        """Test warning when a folder has too many subdirectories."""
+        # Create files in many subdirectories under "data"
+        paths = []
+        for i in range(MAX_FILES_PER_FOLDER + 10):
+            paths.append(self.MockPath(f"data/subdir{i:05d}/file.txt"))
+
+        _validate_upload_limits(paths)
+
+        # Check that warning mentions subdirectories in "data" folder
+        warning_calls = [str(call) for call in mock_logger.warning.call_args_list]
+        assert any("data" in call and "subdirectories" in call for call in warning_calls)
+        assert any(f"{MAX_FILES_PER_FOLDER + 10:,} subdirectories" in call for call in warning_calls)
+
+    @patch("huggingface_hub._upload_large_folder.logger")
+    def test_counts_files_and_subdirs_separately(self, mock_logger):
+        """Test that files and subdirectories are counted separately and correctly."""
+        # Create a structure with both files and subdirs in "data"
+        paths = []
+        # Add 5000 files directly in data/
+        for i in range(5000):
+            paths.append(self.MockPath(f"data/file{i}.txt"))
+        # Add 5100 subdirectories with files (exceeds limit when combined)
+        for i in range(5100):
+            paths.append(self.MockPath(f"data/subdir{i}/file.txt"))
+
+        _validate_upload_limits(paths)
+
+        # Should warn about "data" having 10,100 entries (5000 files + 5100 subdirs)
+        warning_calls = [str(call) for call in mock_logger.warning.call_args_list]
+        assert any("data" in call and "10,100 entries" in call for call in warning_calls)
+        assert any("5,000 files" in call and "5,100 subdirectories" in call for call in warning_calls)
+
+    @patch("huggingface_hub._upload_large_folder.logger")
+    def test_file_size_decimal_gb(self, mock_logger):
+        """Test that file sizes are calculated using decimal GB (10^9 bytes)."""
+        # Create a file that's 21 GB in decimal (21 * 10^9 bytes)
+        size_bytes = 21 * 1_000_000_000
+        paths = [self.MockPath("large_file.bin", size_bytes=size_bytes)]
+
+        _validate_upload_limits(paths)
+
+        # Should warn about file being larger than 20GB recommended
+        warning_calls = [str(call) for call in mock_logger.warning.call_args_list]
+        assert any("21.0GB" in call or "21GB" in call for call in warning_calls)
+        assert any("20GB (recommended limit)" in call for call in warning_calls)
+
+    @patch("huggingface_hub._upload_large_folder.logger")
+    def test_very_large_file_warning(self, mock_logger):
+        """Test warning for files exceeding hard limit (50GB)."""
+        # Create a file that's 51 GB
+        size_bytes = 51 * 1_000_000_000
+        paths = [self.MockPath("huge_file.bin", size_bytes=size_bytes)]
+
+        _validate_upload_limits(paths)
+
+        # Should warn about file exceeding 50GB hard limit
+        warning_calls = [str(call) for call in mock_logger.warning.call_args_list]
+        assert any("51.0GB" in call or "51GB" in call for call in warning_calls)
+        assert any("50GB hard limit" in call for call in warning_calls)
+
+    @patch("huggingface_hub._upload_large_folder.logger")
+    def test_nested_directory_structure(self, mock_logger):
+        """Test correct handling of deeply nested directory structures."""
+        paths = [
+            self.MockPath("a/b/c/d/e/file1.txt"),
+            self.MockPath("a/b/c/d/e/file2.txt"),
+            self.MockPath("a/b/c/d/f/file3.txt"),
+            self.MockPath("a/b/c/g/file4.txt"),
+        ]
+
+        _validate_upload_limits(paths)
+
+        # Should not warn - each folder has at most 2 entries
+        mock_logger.warning.assert_not_called()
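
Assuming a development checkout of huggingface_hub, the new tests can be run in isolation with a standard pytest invocation:

python -m pytest tests/test_upload_large_folder.py -k TestValidateUploadLimits -v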
