Skip to content

Commit 507703a

Browse files
committed
Fix relative symlinks in cache (#1390)
* fix relative symlinks in cache * fix test * fix test on windows
1 parent fbc4ccf commit 507703a

File tree

3 files changed

+63
-47
lines changed

3 files changed

+63
-47
lines changed

src/huggingface_hub/file_download.py

Lines changed: 31 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ def are_symlinks_supported(cache_dir: Union[str, Path, None] = None) -> bool:
103103
src_path.touch()
104104
dst_path = Path(tmpdir) / "dummy_file_dst"
105105

106-
# Relative source path as in `_create_relative_symlink``
106+
# Relative source path as in `_create_symlink``
107107
relative_src = os.path.relpath(src_path, start=os.path.dirname(dst_path))
108108
try:
109109
os.symlink(relative_src, dst_path)
@@ -558,7 +558,7 @@ def cached_download(
558558
token: Union[bool, str, None] = None,
559559
local_files_only: bool = False,
560560
legacy_cache_layout: bool = False,
561-
) -> Optional[str]: # pragma: no cover
561+
) -> str:
562562
"""
563563
Download from a given URL and cache it if it's not already present in the
564564
local cache.
@@ -819,61 +819,56 @@ def _normalize_etag(etag: Optional[str]) -> Optional[str]:
819819
return etag.strip('"')
820820

821821

822-
def _create_relative_symlink(src: str, dst: str, new_blob: bool = False) -> None:
823-
"""Create a symbolic link named dst pointing to src as a relative path to dst.
824-
825-
The relative part is mostly because it seems more elegant to the author.
822+
def _create_symlink(src: str, dst: str, new_blob: bool = False) -> None:
823+
"""Create a symbolic link named dst pointing to src as an absolute path.
826824
827825
The result layout looks something like
828826
└── [ 128] snapshots
829827
├── [ 128] 2439f60ef33a0d46d85da5001d52aeda5b00ce9f
830-
│ ├── [ 52] README.md -> ../../blobs/d7edf6bd2a681fb0175f7735299831ee1b22b812
831-
│ └── [ 76] pytorch_model.bin -> ../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
828+
│ ├── [ 52] README.md -> /path/to/cache/blobs/d7edf6bd2a681fb0175f7735299831ee1b22b812
829+
│ └── [ 76] pytorch_model.bin -> /path/to/cache/blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
832830
833-
If symlinks cannot be created on this platform (most likely to be Windows), the
834-
workaround is to avoid symlinks by having the actual file in `dst`. If it is a new
835-
file (`new_blob=True`), we move it to `dst`. If it is not a new file
836-
(`new_blob=False`), we don't know if the blob file is already referenced elsewhere.
837-
To avoid breaking existing cache, the file is duplicated on the disk.
831+
If symlinks cannot be created on this platform (most likely to be Windows), the workaround is to avoid symlinks by
832+
having the actual file in `dst`. If it is a new file (`new_blob=True`), we move it to `dst`. If it is not a new file
833+
(`new_blob=False`), we don't know if the blob file is already referenced elsewhere. To avoid breaking existing
834+
cache, the file is duplicated on the disk.
838835
839-
In case symlinks are not supported, a warning message is displayed to the user once
840-
when loading `huggingface_hub`. The warning message can be disable with the
841-
`DISABLE_SYMLINKS_WARNING` environment variable.
836+
In case symlinks are not supported, a warning message is displayed to the user once when loading `huggingface_hub`.
837+
The warning message can be disable with the `DISABLE_SYMLINKS_WARNING` environment variable.
842838
"""
843839
try:
844840
os.remove(dst)
845841
except OSError:
846842
pass
847843

844+
abs_src = os.path.abspath(os.path.expanduser(src))
845+
abs_dst = os.path.abspath(os.path.expanduser(dst))
846+
848847
try:
849-
_support_symlinks = are_symlinks_supported(
850-
os.path.dirname(os.path.commonpath([os.path.realpath(src), os.path.realpath(dst)]))
851-
)
848+
_support_symlinks = are_symlinks_supported(os.path.dirname(os.path.commonpath([abs_src, abs_dst])))
852849
except PermissionError:
853850
# Permission error means src and dst are not in the same volume (e.g. destination path has been provided
854851
# by the user via `local_dir`. Let's test symlink support there)
855-
_support_symlinks = are_symlinks_supported(os.path.dirname(dst))
852+
_support_symlinks = are_symlinks_supported(os.path.dirname(abs_dst))
856853

857854
if _support_symlinks:
858-
logger.info(f"Creating pointer from {src} to {dst}")
855+
logger.info(f"Creating pointer from {abs_src} to {abs_dst}")
859856
try:
860-
os.symlink(src, dst)
857+
os.symlink(abs_src, abs_dst)
861858
except FileExistsError:
862-
if os.path.islink(dst) and os.path.realpath(dst) == os.path.realpath(src):
863-
# `dst` already exists and is a symlink to the `src` blob. It is most
864-
# likely that the file has been cached twice concurrently (exactly
865-
# between `os.remove` and `os.symlink`). Do nothing.
859+
if os.path.islink(abs_dst) and os.path.realpath(abs_dst) == os.path.realpath(abs_src):
860+
# `abs_dst` already exists and is a symlink to the `abs_src` blob. It is most likely that the file has
861+
# been cached twice concurrently (exactly between `os.remove` and `os.symlink`). Do nothing.
866862
pass
867863
else:
868-
# Very unlikely to happen. Means a file `dst` has been created exactly
869-
# between `os.remove` and `os.symlink` and is not a symlink to the `src`
870-
# blob file. Raise exception.
864+
# Very unlikely to happen. Means a file `dst` has been created exactly between `os.remove` and
865+
# `os.symlink` and is not a symlink to the `abs_src` blob file. Raise exception.
871866
raise
872867
elif new_blob:
873-
logger.info(f"Symlink not supported. Moving file from {src} to {dst}")
868+
logger.info(f"Symlink not supported. Moving file from {abs_src} to {abs_dst}")
874869
os.replace(src, dst)
875870
else:
876-
logger.info(f"Symlink not supported. Copying file from {src} to {dst}")
871+
logger.info(f"Symlink not supported. Copying file from {abs_src} to {abs_dst}")
877872
shutil.copyfile(src, dst)
878873

879874

@@ -926,7 +921,7 @@ def hf_hub_download(
926921
token: Union[bool, str, None] = None,
927922
local_files_only: bool = False,
928923
legacy_cache_layout: bool = False,
929-
):
924+
) -> str:
930925
"""Download a given file if it's not already present in the local cache.
931926
932927
The new cache file layout looks like this:
@@ -1258,7 +1253,7 @@ def hf_hub_download(
12581253
if local_dir is not None: # to local dir
12591254
return _to_local_dir(blob_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks)
12601255
else: # or in snapshot cache
1261-
_create_relative_symlink(blob_path, pointer_path, new_blob=False)
1256+
_create_symlink(blob_path, pointer_path, new_blob=False)
12621257
return pointer_path
12631258

12641259
# Prevent parallel downloads of the same file with a lock.
@@ -1313,7 +1308,7 @@ def _resumable_file_manager() -> Generator[io.BufferedWriter, None, None]:
13131308
if local_dir is None:
13141309
logger.info(f"Storing {url} in cache at {blob_path}")
13151310
_chmod_and_replace(temp_file.name, blob_path)
1316-
_create_relative_symlink(blob_path, pointer_path, new_blob=True)
1311+
_create_symlink(blob_path, pointer_path, new_blob=True)
13171312
else:
13181313
local_dir_filepath = os.path.join(local_dir, relative_filename)
13191314
os.makedirs(os.path.dirname(local_dir_filepath), exist_ok=True)
@@ -1325,7 +1320,7 @@ def _resumable_file_manager() -> Generator[io.BufferedWriter, None, None]:
13251320
logger.info(f"Storing {url} in cache at {blob_path}")
13261321
_chmod_and_replace(temp_file.name, blob_path)
13271322
logger.info("Create symlink to local dir")
1328-
_create_relative_symlink(blob_path, local_dir_filepath, new_blob=False)
1323+
_create_symlink(blob_path, local_dir_filepath, new_blob=False)
13291324
elif local_dir_use_symlinks == "auto" and not is_big_file:
13301325
logger.info(f"Storing {url} in cache at {blob_path}")
13311326
_chmod_and_replace(temp_file.name, blob_path)
@@ -1544,7 +1539,7 @@ def _to_local_dir(
15441539
use_symlinks = os.stat(real_blob_path).st_size > constants.HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD
15451540

15461541
if use_symlinks:
1547-
_create_relative_symlink(real_blob_path, local_dir_filepath, new_blob=False)
1542+
_create_symlink(real_blob_path, local_dir_filepath, new_blob=False)
15481543
else:
15491544
shutil.copyfile(real_blob_path, local_dir_filepath)
15501545
return local_dir_filepath

src/huggingface_hub/repocard.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -170,17 +170,19 @@ def load(
170170
if Path(repo_id_or_path).exists():
171171
card_path = Path(repo_id_or_path)
172172
elif isinstance(repo_id_or_path, str):
173-
card_path = hf_hub_download(
174-
repo_id_or_path,
175-
REPOCARD_NAME,
176-
repo_type=repo_type or cls.repo_type,
177-
token=token,
173+
card_path = Path(
174+
hf_hub_download(
175+
repo_id_or_path,
176+
REPOCARD_NAME,
177+
repo_type=repo_type or cls.repo_type,
178+
token=token,
179+
)
178180
)
179181
else:
180182
raise ValueError(f"Cannot load RepoCard: path not found on disk ({repo_id_or_path}).")
181183

182184
# Preserve newlines in the existing file.
183-
with Path(card_path).open(mode="r", newline="", encoding="utf-8") as f:
185+
with card_path.open(mode="r", newline="", encoding="utf-8") as f:
184186
return cls(f.read(), ignore_metadata_errors=ignore_metadata_errors)
185187

186188
def validate(self, repo_type: Optional[str] = None):

tests/test_file_download.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414
import os
1515
import re
16+
import shutil
1617
import stat
1718
import unittest
1819
from pathlib import Path
@@ -30,7 +31,7 @@
3031
)
3132
from huggingface_hub.file_download import (
3233
_CACHED_NO_EXIST,
33-
_create_relative_symlink,
34+
_create_symlink,
3435
cached_download,
3536
filename_to_url,
3637
get_hf_file_metadata,
@@ -739,15 +740,15 @@ def test_hf_hub_download_on_awful_subfolder_and_filename(self):
739740
class CreateSymlinkTest(unittest.TestCase):
740741
@unittest.skipIf(os.name == "nt", "No symlinks on Windows")
741742
@patch("huggingface_hub.file_download.are_symlinks_supported")
742-
def test_create_relative_symlink_concurrent_access(self, mock_are_symlinks_supported: Mock) -> None:
743+
def test_create_symlink_concurrent_access(self, mock_are_symlinks_supported: Mock) -> None:
743744
with SoftTemporaryDirectory() as tmpdir:
744745
src = os.path.join(tmpdir, "source")
745746
other = os.path.join(tmpdir, "other")
746747
dst = os.path.join(tmpdir, "destination")
747748

748749
# Normal case: symlink does not exist
749750
mock_are_symlinks_supported.return_value = True
750-
_create_relative_symlink(src, dst)
751+
_create_symlink(src, dst)
751752
self.assertEqual(os.path.realpath(dst), os.path.realpath(src))
752753

753754
# Symlink already exists when it tries to create it (most probably from a
@@ -757,7 +758,7 @@ def _are_symlinks_supported(cache_dir: str) -> bool:
757758
return True
758759

759760
mock_are_symlinks_supported.side_effect = _are_symlinks_supported
760-
_create_relative_symlink(src, dst)
761+
_create_symlink(src, dst)
761762

762763
# Symlink already exists but pointing to a different source file. This should
763764
# never happen in the context of HF cache system -> raise exception
@@ -767,7 +768,25 @@ def _are_symlinks_supported(cache_dir: str) -> bool:
767768

768769
mock_are_symlinks_supported.side_effect = _are_symlinks_supported
769770
with self.assertRaises(FileExistsError):
770-
_create_relative_symlink(src, dst)
771+
_create_symlink(src, dst)
772+
773+
def test_create_symlink_relative_src(self) -> None:
774+
"""Regression test for #1388.
775+
776+
See https://github.com/huggingface/huggingface_hub/issues/1388.
777+
"""
778+
# Test dir has to be relative
779+
test_dir = Path(".") / "dir_for_create_symlink_test"
780+
test_dir.mkdir(parents=True, exist_ok=True)
781+
src = Path(test_dir) / "source"
782+
src.touch()
783+
dst = Path(test_dir) / "destination"
784+
785+
_create_symlink(str(src), str(dst))
786+
self.assertTrue(dst.resolve().is_file())
787+
if os.name != "nt":
788+
self.assertEqual(dst.resolve(), src.resolve())
789+
shutil.rmtree(test_dir)
771790

772791

773792
def _recursive_chmod(path: str, mode: int) -> None:

0 commit comments

Comments
 (0)