Document .no_exist folder (#1308)

Wauplin · sgugger · web-flow · commit c06efc30afb9 · 2023-01-25T12:03:34.000+01:00
* Document .no_exist folder

* typo

* as subheader

* Update docs/source/how-to-cache.mdx

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
diff --git a/docs/source/how-to-cache.mdx b/docs/source/how-to-cache.mdx
@@ -87,6 +87,46 @@ That `README.md` file is actually a symlink linking to the blob that has the has
 By creating the skeleton this way we open the mechanism to file sharing: if the same file was fetched in
 revision `bbbbbb`, it would have the same hash and the file would not need to be re-downloaded.
 
+### .no_exist (advanced)
+
+In addition to the `blobs`, `refs` and `snapshots` folders, you might also find a `.no_exist` folder
+in your cache. This folder keeps track of files that you've tried to download once but don't exist
+on the Hub. Its structure is the same as the `snapshots` folder with 1 subfolder per known revision:
+
+```
+<CACHE_DIR>/<REPO_NAME>/.no_exist/aaaaaa/config_that_does_not_exist.json
+```
+
+Unlike the `snapshots` folder, files are simple empty files (no symlinks). In this example,
+the file `"config_that_does_not_exist.json"` does not exist on the Hub for the revision `"aaaaaa"`.
+As it only stores empty files, this folder is negligible in terms of disk usage.
+
+So now you might wonder, why is this information even relevant?
+In some cases, a framework tries to load optional files for a model. Saving the non-existence
+of optional files makes it faster to load a model as it saves 1 HTTP call per possible optional file.
+This is for example the case in `transformers` where each tokenizer can support additional files.
+The first time you load the tokenizer on your machine, it will cache which optional files exist (and
+which don't) to make the loading time faster for the next initializations.
+
+To test if a file is cached locally (without making any HTTP request), you can use the [`try_to_load_from_cache`]
+helper. It will either return the filepath (if the file exists and is cached), the object `_CACHED_NO_EXIST`
+(if non-existence is cached) or `None` (if we don't know).
+
+```python
+from huggingface_hub import try_to_load_from_cache, _CACHED_NO_EXIST
+
+filepath = try_to_load_from_cache()
+if isinstance(filepath, str):
+    # file exists and is cached
+    ...
+elif filepath is _CACHED_NO_EXIST:
+    # non-existence of file is cached
+    ...
+else:
+    # file is not cached
+    ...
+```
+
 ### In practice
 
 In practice, your cache should look like the following tree:
diff --git a/docs/source/package_reference/cache.mdx b/docs/source/package_reference/cache.mdx
@@ -6,7 +6,11 @@ for a detailed presentation of caching at HF.
 
 ## Helpers
 
-## cached_assets_path
+### try_to_load_from_cache
+
+[[autodoc]] huggingface_hub.try_to_load_from_cache
+
+### cached_assets_path
 
 [[autodoc]] huggingface_hub.cached_assets_path
 
diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py
@@ -94,6 +94,7 @@
     ],
     "file_download": [
         "HfFileMetadata",
+        "_CACHED_NO_EXIST",
         "cached_download",
         "get_hf_file_metadata",
         "hf_hub_download",
@@ -336,6 +337,7 @@ def __dir__():
     from .fastai_utils import _save_pretrained_fastai  # noqa: F401
     from .fastai_utils import from_pretrained_fastai  # noqa: F401
     from .fastai_utils import push_to_hub_fastai  # noqa: F401
+    from .file_download import _CACHED_NO_EXIST  # noqa: F401
     from .file_download import HfFileMetadata  # noqa: F401
     from .file_download import cached_download  # noqa: F401
     from .file_download import get_hf_file_metadata  # noqa: F401
diff --git a/src/huggingface_hub/file_download.py b/src/huggingface_hub/file_download.py
@@ -1327,6 +1327,23 @@ def try_to_load_from_cache(
             - The exact path to the cached file if it's found in the cache
             - A special value `_CACHED_NO_EXIST` if the file does not exist at the given commit hash and this fact was
               cached.
+
+    Example:
+
+    ```python
+    from huggingface_hub import try_to_load_from_cache, _CACHED_NO_EXIST
+
+    filepath = try_to_load_from_cache()
+    if isinstance(filepath, str):
+        # file exists and is cached
+        ...
+    elif filepath is _CACHED_NO_EXIST:
+        # non-existence of file is cached
+        ...
+    else:
+        # file is not cached
+        ...
+    ```
     """
     if revision is None:
         revision = "main"
@@ -1348,7 +1365,7 @@ def try_to_load_from_cache(
 
     refs_dir = os.path.join(repo_cache, "refs")
     snapshots_dir = os.path.join(repo_cache, "snapshots")
-    no_exists_dir = os.path.join(repo_cache, ".no_exist")
+    no_exist_dir = os.path.join(repo_cache, ".no_exist")
 
     # Resolve refs (for instance to convert main to the associated commit sha)
     if os.path.isdir(refs_dir):
@@ -1357,8 +1374,8 @@ def try_to_load_from_cache(
             with open(os.path.join(refs_dir, revision)) as f:
                 revision = f.read()
 
-    # Check if file is cached as "no_exists"
-    if os.path.isfile(os.path.join(no_exists_dir, revision, filename)):
+    # Check if file is cached as "no_exist"
+    if os.path.isfile(os.path.join(no_exist_dir, revision, filename)):
         return _CACHED_NO_EXIST
 
     # Check if revision folder exists