Commit 305f613

Expose more hf.co API surface, and ability to download a snapshot of a repo to a local folder (#25)
* [ci] Slim down the matrix quite a bit
* Expose more hf.co API surface:
  - Expose the `filter` param on the list of models, to only list models with a specific tag (method renamed to `list_models`; don't think it's too breaking, feedback welcome)
  - Expose a `model_info` method that gives you slightly more detailed info on a specific model, at a specific revision
* Add `snapshot_download`: download a whole snapshot of a repo's files at the specified revision
* Docs, tests, lockfiles & metadata
* In `force_filename` mode, do not even store the metadata file
* Update README.md

Co-authored-by: Julien Chaumond <[email protected]>
Co-authored-by: Lysandre <[email protected]>
Co-authored-by: Lysandre Debut <[email protected]>
1 parent 1b94216 commit 305f613
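
Taken together, the new API surface can be exercised roughly as follows. This is a minimal sketch, not code from the commit; the repo id and tag below are placeholder values.

```python
from huggingface_hub import HfApi, snapshot_download

api = HfApi()

# New `filter` param: list only the models carrying a given tag.
models = api.list_models(filter="text-classification")

# New `model_info` method: detailed info on one model, optionally at a revision.
info = api.model_info("namespace/repository", revision="main")
print(info.sha, [f.rfilename for f in info.siblings])

# New `snapshot_download`: fetch every file of the repo at that revision
# into one local folder and return its path.
local_folder = snapshot_download("namespace/repository", revision="main")
```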

File tree: 10 files changed, +258 -44 lines

.github/workflows/python-tests.yml

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.6", "3.7", "3.8", "3.9"]
+        python-version: ["3.6", "3.9"]

     steps:
       - uses: actions/checkout@v2

README.md

Lines changed: 10 additions & 0 deletions

@@ -63,6 +63,16 @@ Parameters:

 Check out the source code for all possible params (we'll create a real doc page in the future).

+### Bonus: `snapshot_download`
+
+`snapshot_download()` downloads all the files from the remote repository at the specified revision,
+stores them to disk (in a versioning-aware way) and returns the local folder path.
+
+Parameters:
+- a `repo_id` in the format `namespace/repository`
+- a `revision` on which the repository will be downloaded
+- a `cache_dir` which you can specify if you want to control where on disk the files are cached.
+
 <br>

 ## Publish models to the huggingface.co hub
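
As a quick illustration of the new README section (the repo id below is a placeholder, not a real model):

```python
from huggingface_hub import snapshot_download

# Downloads every file of the repo at the given revision and returns
# the local folder path where the snapshot was stored.
folder = snapshot_download(
    "namespace/repository",
    revision="main",
    cache_dir=None,  # defaults to the standard huggingface_hub cache
)
```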

src/huggingface_hub/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -32,3 +32,4 @@
 from .hf_api import HfApi, HfFolder
 from .hub_mixin import ModelHubMixin
 from .repository import Repository
+from .snapshot_download import snapshot_download

src/huggingface_hub/file_download.py

Lines changed: 15 additions & 7 deletions

@@ -281,6 +281,7 @@ def cached_download(
     cache_dir: Union[str, Path, None] = None,
     user_agent: Union[Dict, str, None] = None,
     force_download=False,
+    force_filename: Optional[str] = None,
     proxies=None,
     etag_timeout=10,
     resume_download=False,
@@ -360,7 +361,9 @@ def cached_download(
             # etag is None
             pass

-    filename = url_to_filename(url, etag)
+    filename = (
+        force_filename if force_filename is not None else url_to_filename(url, etag)
+    )

     # get cache path to put the file
     cache_path = os.path.join(cache_dir, filename)
@@ -378,7 +381,11 @@ def cached_download(
             )
             if not file.endswith(".json") and not file.endswith(".lock")
         ]
-        if len(matching_files) > 0 and not force_download:
+        if (
+            len(matching_files) > 0
+            and not force_download
+            and force_filename is None
+        ):
             return os.path.join(cache_dir, matching_files[-1])
         else:
             # If files cannot be found and local_files_only=True,
@@ -444,10 +451,11 @@ def _resumable_file_manager() -> "io.BufferedWriter":
     logger.info("storing %s in cache at %s", url, cache_path)
     os.replace(temp_file.name, cache_path)

-    logger.info("creating metadata file for %s", cache_path)
-    meta = {"url": url, "etag": etag}
-    meta_path = cache_path + ".json"
-    with open(meta_path, "w") as meta_file:
-        json.dump(meta, meta_file)
+    if force_filename is None:
+        logger.info("creating metadata file for %s", cache_path)
+        meta = {"url": url, "etag": etag}
+        meta_path = cache_path + ".json"
+        with open(meta_path, "w") as meta_file:
+            json.dump(meta, meta_file)

     return cache_path
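
The net effect of `force_filename`, sketched below (assuming a reachable `url`; the repo id is a placeholder): the file is stored under exactly the given name, the cached-copy short-circuit is bypassed, and no `.json` metadata sidecar is written next to it.

```python
from huggingface_hub.file_download import cached_download, hf_hub_url

url = hf_hub_url("namespace/repository", filename="config.json")

# Default behavior: content-addressed filename (derived from url + etag),
# plus a "<filename>.json" metadata file recording the url and etag.
path = cached_download(url)

# With force_filename: saved as "config.json" in the cache dir, and the
# metadata file is skipped entirely (this is what snapshot_download uses).
path = cached_download(url, force_filename="config.json")
```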

src/huggingface_hub/hf_api.py

Lines changed: 42 additions & 4 deletions

@@ -15,6 +15,7 @@


 import os
+import warnings
 from os.path import expanduser
 from typing import Dict, List, Optional, Tuple

@@ -36,7 +37,7 @@ def __init__(self, **kwargs):
             setattr(self, k, v)


-class ModelSibling:
+class ModelFile:
     """
     Data structure that represents a public file inside a model, accessible from huggingface.co
     """
@@ -55,6 +56,7 @@ class ModelInfo:
     def __init__(
         self,
         modelId: Optional[str] = None,  # id of model
+        sha: Optional[str] = None,  # commit sha at the specified revision
         tags: List[str] = [],
         pipeline_tag: Optional[str] = None,
         siblings: Optional[
@@ -63,10 +65,11 @@ def __init__(
         **kwargs
     ):
         self.modelId = modelId
+        self.sha = sha
         self.tags = tags
         self.pipeline_tag = pipeline_tag
         self.siblings = (
-            [ModelSibling(**x) for x in siblings] if siblings is not None else None
+            [ModelFile(**x) for x in siblings] if siblings is not None else None
         )
         for k, v in kwargs.items():
             setattr(self, k, v)
@@ -108,16 +111,51 @@ def logout(self, token: str) -> None:
         r = requests.post(path, headers={"authorization": "Bearer {}".format(token)})
         r.raise_for_status()

-    def model_list(self) -> List[ModelInfo]:
+    def list_models(self, filter: Optional[str] = None) -> List[ModelInfo]:
         """
         Get the public list of all the models on huggingface.co
         """
         path = "{}/api/models".format(self.endpoint)
-        r = requests.get(path)
+        params = {"filter": filter, "full": True} if filter is not None else None
+        r = requests.get(path, params=params)
         r.raise_for_status()
         d = r.json()
         return [ModelInfo(**x) for x in d]

+    def model_list(self) -> List[ModelInfo]:
+        """
+        Deprecated method name, renamed to `list_models`.
+
+        Get the public list of all the models on huggingface.co
+        """
+        warnings.warn(
+            "This method has been renamed to `list_models` for consistency and will be removed in a future version."
+        )
+        return self.list_models()
+
+    def model_info(
+        self, repo_id: str, revision: Optional[str] = None, token: Optional[str] = None
+    ) -> ModelInfo:
+        """
+        Get info on one specific model on huggingface.co
+
+        Model can be private if you pass an acceptable token.
+        """
+        path = (
+            "{}/api/models/{repo_id}".format(self.endpoint, repo_id=repo_id)
+            if revision is None
+            else "{}/api/models/{repo_id}/revision/{revision}".format(
+                self.endpoint, repo_id=repo_id, revision=revision
+            )
+        )
+        headers = (
+            {"authorization": "Bearer {}".format(token)} if token is not None else None
+        )
+        r = requests.get(path, headers=headers)
+        r.raise_for_status()
+        d = r.json()
+        return ModelInfo(**d)
+
     def list_repos_objs(
         self, token: str, organization: Optional[str] = None
     ) -> List[RepoObj]:
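
A short sketch of how the renamed and new methods behave (the repo id and token below are placeholders):

```python
from huggingface_hub import HfApi

api = HfApi()

# The old name still works, but emits a deprecation warning and
# delegates to `list_models`.
models = api.model_list()

# Equivalent, plus the new tag filter (which also requests the `full` payload).
models = api.list_models(filter="translation")

# Private models resolve too, if you pass an acceptable token.
info = api.model_info("namespace/repository", token="my-api-token")
print(info.sha)
```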
src/huggingface_hub/snapshot_download.py

Lines changed: 67 additions & 0 deletions

@@ -0,0 +1,67 @@
+import os
+from pathlib import Path
+from typing import Optional, Union
+
+from .constants import HUGGINGFACE_HUB_CACHE
+from .file_download import cached_download, hf_hub_url
+from .hf_api import HfApi
+
+
+REPO_ID_SEPARATOR = "__"
+# ^ make sure this substring is not allowed in repo_ids on hf.co
+
+
+def snapshot_download(
+    repo_id: str,
+    revision: Optional[str] = None,
+    cache_dir: Union[str, Path, None] = None,
+) -> str:
+    """
+    Downloads a whole snapshot of a repo's files at the specified revision.
+    This is useful when you want all files from a repo, because you don't know
+    which ones you will need a priori.
+    All files are nested inside a folder in order to keep their actual filename
+    relative to that folder.
+
+    An alternative would be to just clone a repo but this would require that
+    the user always has git and git-lfs installed, and properly configured.
+
+    Note: at some point maybe this format of storage should actually replace
+    the flat storage structure we've used so far (initially from allennlp
+    if I remember correctly).
+
+    Return:
+        Local folder path (string) of repo snapshot
+    """
+    if cache_dir is None:
+        cache_dir = HUGGINGFACE_HUB_CACHE
+    if isinstance(cache_dir, Path):
+        cache_dir = str(cache_dir)
+
+    _api = HfApi()
+    model_info = _api.model_info(repo_id=repo_id, revision=revision)
+
+    storage_folder = os.path.join(
+        cache_dir, repo_id.replace("/", REPO_ID_SEPARATOR) + "." + model_info.sha
+    )
+
+    for model_file in model_info.siblings:
+        url = hf_hub_url(
+            repo_id, filename=model_file.rfilename, revision=model_info.sha
+        )
+        relative_filepath = os.path.join(*model_file.rfilename.split("/"))
+
+        # Create potential nested dir
+        nested_dirname = os.path.dirname(
+            os.path.join(storage_folder, relative_filepath)
+        )
+        os.makedirs(nested_dirname, exist_ok=True)
+
+        path = cached_download(
+            url, cache_dir=storage_folder, force_filename=relative_filepath
+        )
+
+        if os.path.exists(path + ".lock"):
+            os.remove(path + ".lock")
+
+    return storage_folder
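
Design note: each snapshot folder is keyed by `repo_id` plus the resolved commit sha, so two revisions of the same repo never collide on disk; and because `cached_download` is called with `force_filename`, the folder contains exactly the repo's files (no `.json` metadata sidecars, and leftover `.lock` files are removed). A hypothetical layout, assuming a repo `namespace/repository` resolved to sha `abc123`:

```python
from huggingface_hub import snapshot_download

folder = snapshot_download("namespace/repository")
# e.g. <cache_dir>/namespace__repository.abc123/
#          config.json
#          pytorch_model.bin
#          nested/dir/somefile.json   <- repo-relative paths are preserved
```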

tests/test_file_download.py

Lines changed: 28 additions & 26 deletions

@@ -23,32 +23,24 @@
 from huggingface_hub.file_download import cached_download, filename_to_url, hf_hub_url

 from .testing_utils import (
-    DUMMY_UNKWOWN_IDENTIFIER,
+    DUMMY_MODEL_ID,
+    DUMMY_MODEL_ID_PINNED_SHA1,
+    DUMMY_MODEL_ID_PINNED_SHA256,
+    DUMMY_MODEL_ID_REVISION_INVALID,
+    DUMMY_MODEL_ID_REVISION_ONE_SPECIFIC_COMMIT,
     SAMPLE_DATASET_IDENTIFIER,
     OfflineSimulationMode,
     offline,
 )


-MODEL_ID = DUMMY_UNKWOWN_IDENTIFIER
-# An actual model hosted on huggingface.co
+REVISION_ID_DEFAULT = "main"
+# Default branch name

 DATASET_ID = SAMPLE_DATASET_IDENTIFIER
 # An actual dataset hosted on huggingface.co


-REVISION_ID_DEFAULT = "main"
-# Default branch name
-REVISION_ID_ONE_SPECIFIC_COMMIT = "f2c752cfc5c0ab6f4bdec59acea69eefbee381c2"
-# One particular commit (not the top of `main`)
-REVISION_ID_INVALID = "aaaaaaa"
-# This commit does not exist, so we should 404.
-
-PINNED_SHA1 = "d9e9f15bc825e4b2c9249e9578f884bbcb5e3684"
-# Sha-1 of config.json on the top of `main`, for checking purposes
-PINNED_SHA256 = "4b243c475af8d0a7754e87d7d096c92e5199ec2fe168a2ee7998e3b8e9bcb1d3"
-# Sha-256 of pytorch_model.bin on the top of `main`, for checking purposes
-
 DATASET_REVISION_ID_ONE_SPECIFIC_COMMIT = "e25d55a1c4933f987c46cc75d8ffadd67f257c61"
 # One particular commit for DATASET_ID
 DATASET_SAMPLE_PY_FILE = "custom_squad.py"
@@ -62,10 +54,12 @@ def test_bogus_url(self):

     def test_no_connection(self):
         invalid_url = hf_hub_url(
-            MODEL_ID, filename=CONFIG_NAME, revision=REVISION_ID_INVALID
+            DUMMY_MODEL_ID,
+            filename=CONFIG_NAME,
+            revision=DUMMY_MODEL_ID_REVISION_INVALID,
         )
         valid_url = hf_hub_url(
-            MODEL_ID, filename=CONFIG_NAME, revision=REVISION_ID_DEFAULT
+            DUMMY_MODEL_ID, filename=CONFIG_NAME, revision=REVISION_ID_DEFAULT
         )
         self.assertIsNotNone(cached_download(valid_url, force_download=True))
         for offline_mode in OfflineSimulationMode:
@@ -78,39 +72,47 @@ def test_no_connection(self):

     def test_file_not_found(self):
         # Valid revision (None) but missing file.
-        url = hf_hub_url(MODEL_ID, filename="missing.bin")
+        url = hf_hub_url(DUMMY_MODEL_ID, filename="missing.bin")
         with self.assertRaisesRegex(requests.exceptions.HTTPError, "404 Client Error"):
             _ = cached_download(url)

     def test_revision_not_found(self):
         # Valid file but missing revision
-        url = hf_hub_url(MODEL_ID, filename=CONFIG_NAME, revision=REVISION_ID_INVALID)
+        url = hf_hub_url(
+            DUMMY_MODEL_ID,
+            filename=CONFIG_NAME,
+            revision=DUMMY_MODEL_ID_REVISION_INVALID,
+        )
         with self.assertRaisesRegex(requests.exceptions.HTTPError, "404 Client Error"):
             _ = cached_download(url)

     def test_standard_object(self):
-        url = hf_hub_url(MODEL_ID, filename=CONFIG_NAME, revision=REVISION_ID_DEFAULT)
+        url = hf_hub_url(
+            DUMMY_MODEL_ID, filename=CONFIG_NAME, revision=REVISION_ID_DEFAULT
+        )
         filepath = cached_download(url, force_download=True)
         metadata = filename_to_url(filepath)
-        self.assertEqual(metadata, (url, f'"{PINNED_SHA1}"'))
+        self.assertEqual(metadata, (url, f'"{DUMMY_MODEL_ID_PINNED_SHA1}"'))

     def test_standard_object_rev(self):
         # Same object, but different revision
         url = hf_hub_url(
-            MODEL_ID, filename=CONFIG_NAME, revision=REVISION_ID_ONE_SPECIFIC_COMMIT
+            DUMMY_MODEL_ID,
+            filename=CONFIG_NAME,
+            revision=DUMMY_MODEL_ID_REVISION_ONE_SPECIFIC_COMMIT,
         )
         filepath = cached_download(url, force_download=True)
         metadata = filename_to_url(filepath)
-        self.assertNotEqual(metadata[1], f'"{PINNED_SHA1}"')
+        self.assertNotEqual(metadata[1], f'"{DUMMY_MODEL_ID_PINNED_SHA1}"')
         # Caution: check that the etag is *not* equal to the one from `test_standard_object`

     def test_lfs_object(self):
         url = hf_hub_url(
-            MODEL_ID, filename=PYTORCH_WEIGHTS_NAME, revision=REVISION_ID_DEFAULT
+            DUMMY_MODEL_ID, filename=PYTORCH_WEIGHTS_NAME, revision=REVISION_ID_DEFAULT
         )
         filepath = cached_download(url, force_download=True)
         metadata = filename_to_url(filepath)
-        self.assertEqual(metadata, (url, f'"{PINNED_SHA256}"'))
+        self.assertEqual(metadata, (url, f'"{DUMMY_MODEL_ID_PINNED_SHA256}"'))

     def test_dataset_standard_object_rev(self):
         url = hf_hub_url(
@@ -129,7 +131,7 @@ def test_dataset_standard_object_rev(self):
         # now let's download
         filepath = cached_download(url, force_download=True)
         metadata = filename_to_url(filepath)
-        self.assertNotEqual(metadata[1], f'"{PINNED_SHA1}"')
+        self.assertNotEqual(metadata[1], f'"{DUMMY_MODEL_ID_PINNED_SHA1}"')

     def test_dataset_lfs_object(self):
         url = hf_hub_url(
