diff --git a/services/worker/src/worker/utils.py b/services/worker/src/worker/utils.py
index 0e528ea29..f07f1b39d 100644
--- a/services/worker/src/worker/utils.py
+++ b/services/worker/src/worker/utils.py
@@ -13,7 +13,6 @@
 from urllib.parse import quote
 
 import PIL
-import requests
 from datasets import Dataset, DatasetInfo, DownloadConfig, Features, IterableDataset, load_dataset
 from datasets.utils.file_utils import SINGLE_FILE_COMPRESSION_EXTENSION_TO_PROTOCOL
 from huggingface_hub import HfFileSystem, HfFileSystemFile
@@ -176,11 +175,12 @@ def retry_on_arrow_invalid_open_file(
 
 def create_branch(dataset: str, target_revision: str, hf_api: HfApi, committer_hf_api: HfApi) -> None:
     try:
-        refs = retry(on=[requests.exceptions.ConnectionError], sleeps=LIST_REPO_REFS_RETRY_SLEEPS)(
-            hf_api.list_repo_refs
-        )(repo_id=dataset, repo_type=DATASET_TYPE)
-        if all(ref.ref != target_revision for ref in refs.converts):
+        # Use the caller's client (its endpoint and token) and look the revision up in the dataset repo.
+        if not hf_api.revision_exists(repo_id=dataset, revision=target_revision, repo_type=DATASET_TYPE):
+            # Commits are listed most recent first, so the last entry is the repo's initial commit.
             initial_commit = hf_api.list_repo_commits(repo_id=dataset, repo_type=DATASET_TYPE)[-1].commit_id
+
+            # Create the branch at that initial commit; exist_ok=True tolerates an already existing branch.
             committer_hf_api.create_branch(
                 repo_id=dataset, branch=target_revision, repo_type=DATASET_TYPE, revision=initial_commit, exist_ok=True
             )