Skip to content

Commit 1b4bfe7

Browse files
committed
Merge branch 'main' into v0.25-release
2 parents 6f49af5 + f1a7ed4 commit 1b4bfe7

File tree

3 files changed

+89
-14
lines changed

3 files changed

+89
-14
lines changed

src/huggingface_hub/_upload_large_folder.py

Lines changed: 59 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@
4141
logger = logging.getLogger(__name__)
4242

4343
WAITING_TIME_IF_NO_TASKS = 10 # seconds
44+
MAX_NB_REGULAR_FILES_PER_COMMIT = 75
45+
MAX_NB_LFS_FILES_PER_COMMIT = 150
4446

4547

4648
def upload_large_folder_internal(
@@ -373,17 +375,18 @@ def _determine_next_job(status: LargeUploadStatus) -> Optional[Tuple[WorkerJob,
373375
if (
374376
status.nb_workers_commit == 0
375377
and status.queue_commit.qsize() > 0
376-
and (status.last_commit_attempt is None or time.time() - status.last_commit_attempt > 5 * 60)
378+
and status.last_commit_attempt is not None
379+
and time.time() - status.last_commit_attempt > 5 * 60
377380
):
378381
status.nb_workers_commit += 1
379382
logger.debug("Job: commit (more than 5 minutes since last commit attempt)")
380-
return (WorkerJob.COMMIT, _get_n(status.queue_commit, 25))
383+
return (WorkerJob.COMMIT, _get_items_to_commit(status.queue_commit))
381384

382-
# 2. Commit if at least 25 files are ready to commit
383-
elif status.nb_workers_commit == 0 and status.queue_commit.qsize() >= 25:
385+
# 2. Commit if at least 150 files are ready to commit
386+
elif status.nb_workers_commit == 0 and status.queue_commit.qsize() >= 150:
384387
status.nb_workers_commit += 1
385-
logger.debug("Job: commit (>25 files ready)")
386-
return (WorkerJob.COMMIT, _get_n(status.queue_commit, 25))
388+
logger.debug("Job: commit (>100 files ready)")
389+
return (WorkerJob.COMMIT, _get_items_to_commit(status.queue_commit))
387390

388391
# 3. Get upload mode if at least 10 files
389392
elif status.queue_get_upload_mode.qsize() >= 10:
@@ -430,18 +433,39 @@ def _determine_next_job(status: LargeUploadStatus) -> Optional[Tuple[WorkerJob,
430433
logger.debug("Job: get upload mode")
431434
return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, 50))
432435

433-
# 10. Commit if at least 1 file
434-
elif status.nb_workers_commit == 0 and status.queue_commit.qsize() > 0:
436+
# 10. Commit if at least 1 file and 1 min since last commit attempt
437+
elif (
438+
status.nb_workers_commit == 0
439+
and status.queue_commit.qsize() > 0
440+
and status.last_commit_attempt is not None
441+
and time.time() - status.last_commit_attempt > 1 * 60
442+
):
443+
status.nb_workers_commit += 1
444+
logger.debug("Job: commit (1 min since last commit attempt)")
445+
return (WorkerJob.COMMIT, _get_items_to_commit(status.queue_commit))
446+
447+
# 11. Commit if at least 1 file, all other queues are empty and no worker is busy with another task
448+
# e.g. when it's the last commit
449+
elif (
450+
status.nb_workers_commit == 0
451+
and status.queue_commit.qsize() > 0
452+
and status.queue_sha256.qsize() == 0
453+
and status.queue_get_upload_mode.qsize() == 0
454+
and status.queue_preupload_lfs.qsize() == 0
455+
and status.nb_workers_sha256 == 0
456+
and status.nb_workers_get_upload_mode == 0
457+
and status.nb_workers_preupload_lfs == 0
458+
):
435459
status.nb_workers_commit += 1
436460
logger.debug("Job: commit")
437-
return (WorkerJob.COMMIT, _get_n(status.queue_commit, 25))
461+
return (WorkerJob.COMMIT, _get_items_to_commit(status.queue_commit))
438462

439-
# 11. If all queues are empty, exit
463+
# 12. If all queues are empty, exit
440464
elif all(metadata.is_committed or metadata.should_ignore for _, metadata in status.items):
441465
logger.info("All files have been processed! Exiting worker.")
442466
return None
443467

444-
# 12. If no task is available, wait
468+
# 13. If no task is available, wait
445469
else:
446470
status.nb_workers_waiting += 1
447471
logger.debug(f"No task available, waiting... ({WAITING_TIME_IF_NO_TASKS}s)")
@@ -547,6 +571,30 @@ def _get_n(queue: "queue.Queue[JOB_ITEM_T]", n: int) -> List[JOB_ITEM_T]:
547571
return [queue.get() for _ in range(min(queue.qsize(), n))]
548572

549573

574+
def _get_items_to_commit(queue: "queue.Queue[JOB_ITEM_T]") -> List[JOB_ITEM_T]:
    """Pop from `queue` the items to include in the next commit.

    Special case for the commit job: the number of items to commit depends on the type of files.
    Items are taken in queue order until either the queue is empty, or the batch contains
    `MAX_NB_LFS_FILES_PER_COMMIT` LFS files or `MAX_NB_REGULAR_FILES_PER_COMMIT` regular files,
    whichever happens first.

    Args:
        queue: Queue of 2-item tuples whose second element exposes an `upload_mode` attribute.
            An item counts as LFS when `upload_mode == "lfs"`, and as regular otherwise.

    Returns:
        The items removed from the queue, in the order they were retrieved. The list is empty
        if the queue was empty.
    """
    items: List[JOB_ITEM_T] = []
    nb_lfs, nb_regular = 0, 0

    # Keep pulling items while the queue is not empty and neither per-type limit is reached.
    # NOTE(review): `queue.get()` blocks if the queue gets drained concurrently between the
    # size check and the call -- presumably callers serialize access; confirm.
    while (
        queue.qsize() > 0
        and nb_lfs < MAX_NB_LFS_FILES_PER_COMMIT
        and nb_regular < MAX_NB_REGULAR_FILES_PER_COMMIT
    ):
        item = queue.get()
        items.append(item)
        _, metadata = item
        if metadata.upload_mode == "lfs":
            nb_lfs += 1
        else:
            nb_regular += 1

    return items
596+
597+
550598
def _print_overwrite(report: str) -> None:
551599
"""Print a report, overwriting the previous lines.
552600

src/huggingface_hub/hf_api.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@
136136
validate_hf_hub_args,
137137
)
138138
from .utils import tqdm as hf_tqdm
139+
from .utils._deprecation import _deprecate_method
139140
from .utils._typing import CallableT
140141
from .utils.endpoint_helpers import _is_emission_within_threshold
141142

@@ -1405,6 +1406,10 @@ class User:
14051406
Number of upvotes received by the user.
14061407
num_likes (`int`, *optional*):
14071408
Number of likes given by the user.
1409+
num_following (`int`, *optional*):
1410+
Number of users this user is following.
1411+
num_followers (`int`, *optional*):
1412+
Number of users following this user.
14081413
orgs (list of [`Organization`]):
14091414
List of organizations the user is part of.
14101415
"""
@@ -1423,6 +1428,8 @@ class User:
14231428
num_papers: Optional[int] = None
14241429
num_upvotes: Optional[int] = None
14251430
num_likes: Optional[int] = None
1431+
num_following: Optional[int] = None
1432+
num_followers: Optional[int] = None
14261433
orgs: List[Organization] = field(default_factory=list)
14271434

14281435
def __init__(self, **kwargs) -> None:
@@ -1439,6 +1446,8 @@ def __init__(self, **kwargs) -> None:
14391446
self.num_papers = kwargs.pop("numPapers", None)
14401447
self.num_upvotes = kwargs.pop("numUpvotes", None)
14411448
self.num_likes = kwargs.pop("numLikes", None)
1449+
self.num_following = kwargs.pop("numFollowing", None)
1450+
self.num_followers = kwargs.pop("numFollowers", None)
14421451
self.user_type = kwargs.pop("type", None)
14431452
self.orgs = [Organization(**org) for org in kwargs.pop("orgs", [])]
14441453

@@ -4010,6 +4019,9 @@ def _payload_as_ndjson() -> Iterable[bytes]:
40104019

40114020
@experimental
40124021
@validate_hf_hub_args
4022+
@_deprecate_method(
4023+
version="0.27", message="This is an experimental feature. Please use `upload_large_folder` instead."
4024+
)
40134025
def create_commits_on_pr(
40144026
self,
40154027
*,
@@ -4848,8 +4860,10 @@ def upload_folder(
48484860
new files. This is useful if you don't know which files have already been uploaded.
48494861
Note: to avoid discrepancies the `.gitattributes` file is not deleted even if it matches the pattern.
48504862
multi_commits (`bool`):
4863+
Deprecated. For large uploads, use `upload_large_folder` instead.
48514864
If True, changes are pushed to a PR using a multi-commit process. Defaults to `False`.
48524865
multi_commits_verbose (`bool`):
4866+
Deprecated. For large uploads, use `upload_large_folder` instead.
48534867
If True and `multi_commits` is used, more information will be displayed to the user.
48544868
run_as_future (`bool`, *optional*):
48554869
Whether or not to run this method in the background. Background jobs are run sequentially without
@@ -5342,15 +5356,16 @@ def upload_large_folder(
53425356
53435357
Order of priority:
53445358
1. Commit if more than 5 minutes since last commit attempt (and at least 1 file).
5345-
2. Commit if at least 25 files are ready to commit.
5359+
2. Commit if at least 150 files are ready to commit.
53465360
3. Get upload mode if at least 10 files have been hashed.
53475361
4. Pre-upload LFS file if at least 1 file and no worker is pre-uploading.
53485362
5. Hash file if at least 1 file and no worker is hashing.
53495363
6. Get upload mode if at least 1 file and no worker is getting upload mode.
53505364
7. Pre-upload LFS file if at least 1 file (exception: if hf_transfer is enabled, only 1 worker can preupload LFS at a time).
53515365
8. Hash file if at least 1 file to hash.
53525366
9. Get upload mode if at least 1 file to get upload mode.
5353-
10. Commit if at least 1 file to commit.
5367+
10. Commit if at least 1 file to commit and at least 1 min since last commit attempt.
5368+
11. Commit if at least 1 file to commit and all other queues are empty.
53545369
53555370
Special rules:
53565371
- If `hf_transfer` is enabled, only 1 LFS uploader at a time. Otherwise the CPU would be bloated by `hf_transfer`.
@@ -9463,14 +9478,24 @@ def _prepare_upload_folder_additions(
94639478
repo_type=repo_type,
94649479
token=token,
94659480
)
9481+
if len(filtered_repo_objects) > 30:
9482+
logger.info(
9483+
"It seems you are trying to upload a large folder at once. This might take some time and then fail if "
9484+
"the folder is too large. For such cases, it is recommended to upload in smaller batches or to use "
9485+
"`HfApi().upload_large_folder(...)`/`huggingface-cli upload-large-folder` instead. For more details, "
9486+
"check out https://huggingface.co/docs/huggingface_hub/main/en/guides/upload#upload-a-large-folder."
9487+
)
94669488

9467-
return [
9489+
logger.info(f"Start hashing {len(filtered_repo_objects)} files.")
9490+
operations = [
94689491
CommitOperationAdd(
94699492
path_or_fileobj=relpath_to_abspath[relpath], # absolute path on disk
94709493
path_in_repo=prefix + relpath, # "absolute" path in repo
94719494
)
94729495
for relpath in filtered_repo_objects
94739496
]
9497+
logger.info(f"Finished hashing {len(filtered_repo_objects)} files.")
9498+
return operations
94749499

94759500
def _validate_yaml(self, content: str, *, repo_type: Optional[str] = None, token: Union[bool, str, None] = None):
94769501
"""

tests/test_hf_api.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4093,6 +4093,8 @@ def test_user_overview(self) -> None:
40934093
assert overview.num_upvotes > 10
40944094
assert len(overview.orgs) > 0
40954095
assert any(org.name == "huggingface" for org in overview.orgs)
4096+
assert overview.num_following > 300
4097+
assert overview.num_followers > 1000
40964098

40974099
def test_organization_members(self) -> None:
40984100
members = self.api.list_organization_members("huggingface")

0 commit comments

Comments
 (0)