diff --git a/src/huggingface_hub/hf_file_system.py b/src/huggingface_hub/hf_file_system.py index 4f438edc05..050a1d9bf9 100644 --- a/src/huggingface_hub/hf_file_system.py +++ b/src/huggingface_hub/hf_file_system.py @@ -367,7 +367,6 @@ def ls( """ resolved_path = self.resolve_path(path, revision=revision) path = resolved_path.unresolve() - kwargs = {"expand_info": detail, **kwargs} try: out = self._ls_tree(path, refresh=refresh, revision=revision, **kwargs) except EntryNotFoundError: @@ -386,7 +385,7 @@ def _ls_tree( recursive: bool = False, refresh: bool = False, revision: Optional[str] = None, - expand_info: bool = True, + expand_info: bool = False, ): resolved_path = self.resolve_path(path, revision=revision) path = resolved_path.unresolve() @@ -497,8 +496,6 @@ def walk(self, path: str, *args, **kwargs) -> Iterator[Tuple[str, List[str], Lis Returns: `Iterator[Tuple[str, List[str], List[str]]]`: An iterator of (path, list of directory names, list of file names) tuples. """ - # Set expand_info=False by default to get a x10 speed boost - kwargs = {"expand_info": kwargs.get("detail", False), **kwargs} path = self.resolve_path(path, revision=kwargs.get("revision")).unresolve() yield from super().walk(path, *args, **kwargs) @@ -515,8 +512,6 @@ def glob(self, path: str, **kwargs) -> List[str]: Returns: `List[str]`: List of paths matching the pattern. """ - # Set expand_info=False by default to get a x10 speed boost - kwargs = {"expand_info": kwargs.get("detail", False), **kwargs} path = self.resolve_path(path, revision=kwargs.get("revision")).unresolve() return super().glob(path, **kwargs) @@ -558,7 +553,6 @@ def find( ) resolved_path = self.resolve_path(path, revision=revision) path = resolved_path.unresolve() - kwargs = {"expand_info": detail, **kwargs} try: out = self._ls_tree(path, recursive=True, refresh=refresh, revision=resolved_path.revision, **kwargs) except EntryNotFoundError: @@ -653,7 +647,7 @@ def modified(self, path: str, **kwargs) -> datetime: Returns: `datetime`: Last commit date of the file. """ - info = self.info(path, **kwargs) + info = self.info(path, **{**kwargs, "expand_info": True}) return info["last_commit"]["date"] def info(self, path: str, refresh: bool = False, revision: Optional[str] = None, **kwargs) -> Dict[str, Any]: @@ -683,7 +677,7 @@ def info(self, path: str, refresh: bool = False, revision: Optional[str] = None, resolved_path = self.resolve_path(path, revision=revision) path = resolved_path.unresolve() expand_info = kwargs.get( - "expand_info", True + "expand_info", False ) # don't expose it as a parameter in the public API to follow the spec if not resolved_path.path_in_repo: # Path is the root directory @@ -691,6 +685,7 @@ def info(self, path: str, refresh: bool = False, revision: Optional[str] = None, "name": path, "size": 0, "type": "directory", + "last_commit": None, } if expand_info: last_commit = self._api.list_repo_commits( @@ -708,7 +703,7 @@ def info(self, path: str, refresh: bool = False, revision: Optional[str] = None, parent_path = self._parent(path) if not expand_info and parent_path not in self.dircache: # Fill the cache with cheap call - self.ls(parent_path, expand_info=False) + self.ls(parent_path) if parent_path in self.dircache: # Check if the path is in the cache out1 = [o for o in self.dircache[parent_path] if o["name"] == path] @@ -779,7 +774,7 @@ def exists(self, path, **kwargs): if kwargs.get("refresh", False): self.invalidate_cache(path) - self.info(path, **{**kwargs, "expand_info": False}) + self.info(path, **kwargs) return True except: # noqa: E722 return False @@ -798,7 +793,7 @@ def isdir(self, path): `bool`: True if path is a directory, False otherwise. """ try: - return self.info(path, expand_info=False)["type"] == "directory" + return self.info(path)["type"] == "directory" except OSError: return False @@ -816,7 +811,7 @@ def isfile(self, path): `bool`: True if path is a file, False otherwise. """ try: - return self.info(path, expand_info=False)["type"] == "file" + return self.info(path)["type"] == "file" except: # noqa: E722 return False @@ -942,9 +937,6 @@ def __init__(self, fs: HfFileSystem, path: str, revision: Optional[str] = None, f"{e}.\nMake sure the repository and revision exist before writing data." ) from e raise - # avoid an unnecessary .info() call with expensive expand_info=True to instantiate .details - if kwargs.get("mode", "rb") == "rb": - self.details = fs.info(self.resolved_path.unresolve(), expand_info=False) super().__init__(fs, self.resolved_path.unresolve(), **kwargs) self.fs: HfFileSystem diff --git a/tests/test_hf_file_system.py b/tests/test_hf_file_system.py index 0c829f5ad6..16dd3b5ee1 100644 --- a/tests/test_hf_file_system.py +++ b/tests/test_hf_file_system.py @@ -66,13 +66,13 @@ def test_info(self): self.assertEqual(root_dir["type"], "directory") self.assertEqual(root_dir["size"], 0) self.assertTrue(root_dir["name"].endswith(self.repo_id)) - self.assertIsNotNone(root_dir["last_commit"]) + self.assertIsNone(root_dir["last_commit"]) data_dir = self.hffs.info(self.hf_path + "/data") self.assertEqual(data_dir["type"], "directory") self.assertEqual(data_dir["size"], 0) self.assertTrue(data_dir["name"].endswith("/data")) - self.assertIsNotNone(data_dir["last_commit"]) + self.assertIsNone(data_dir["last_commit"]) self.assertIsNotNone(data_dir["tree_id"]) text_data_file = self.hffs.info(self.text_file) @@ -80,7 +80,7 @@ def test_info(self): self.assertGreater(text_data_file["size"], 0) # not empty self.assertTrue(text_data_file["name"].endswith("/data/text_data.txt")) self.assertIsNone(text_data_file["lfs"]) - self.assertIsNotNone(text_data_file["last_commit"]) + self.assertIsNone(text_data_file["last_commit"]) self.assertIsNotNone(text_data_file["blob_id"]) self.assertIn("security", text_data_file) # the staging endpoint does not run security checks @@ -140,7 +140,7 @@ def test_glob(self): self.assertTrue( files[keys[0]]["name"].endswith("/.gitattributes") and files[keys[1]]["name"].endswith("/data") ) - self.assertIsNotNone(files[keys[0]]["last_commit"]) + self.assertIsNone(files[keys[0]]["last_commit"]) def test_url(self): self.assertEqual( @@ -277,13 +277,13 @@ def test_list_root_directory_no_revision(self): self.assertEqual(files[0]["type"], "directory") self.assertEqual(files[0]["size"], 0) self.assertTrue(files[0]["name"].endswith("/data")) - self.assertIsNotNone(files[0]["last_commit"]) + self.assertIsNone(files[0]["last_commit"]) self.assertIsNotNone(files[0]["tree_id"]) self.assertEqual(files[1]["type"], "file") self.assertGreater(files[1]["size"], 0) # not empty self.assertTrue(files[1]["name"].endswith("/.gitattributes")) - self.assertIsNotNone(files[1]["last_commit"]) + self.assertIsNone(files[1]["last_commit"]) self.assertIsNotNone(files[1]["blob_id"]) self.assertIn("security", files[1]) # the staging endpoint does not run security checks @@ -298,7 +298,7 @@ def test_list_data_directory_no_revision(self): self.assertIn("sha256", files[0]["lfs"]) self.assertIn("size", files[0]["lfs"]) self.assertIn("pointer_size", files[0]["lfs"]) - self.assertIsNotNone(files[0]["last_commit"]) + self.assertIsNone(files[0]["last_commit"]) self.assertIsNotNone(files[0]["blob_id"]) self.assertIn("security", files[0]) # the staging endpoint does not run security checks @@ -306,7 +306,7 @@ def test_list_data_directory_no_revision(self): self.assertGreater(files[1]["size"], 0) # not empty self.assertTrue(files[1]["name"].endswith("/data/text_data.txt")) self.assertIsNone(files[1]["lfs"]) - self.assertIsNotNone(files[1]["last_commit"]) + self.assertIsNone(files[1]["last_commit"]) self.assertIsNotNone(files[1]["blob_id"]) self.assertIn("security", files[1]) # the staging endpoint does not run security checks @@ -318,7 +318,7 @@ def test_list_data_file_no_revision(self): self.assertGreater(files[0]["size"], 0) # not empty self.assertTrue(files[0]["name"].endswith("/data/text_data.txt")) self.assertIsNone(files[0]["lfs"]) - self.assertIsNotNone(files[0]["last_commit"]) + self.assertIsNone(files[0]["last_commit"]) self.assertIsNotNone(files[0]["blob_id"]) self.assertIn("security", files[0]) # the staging endpoint does not run security checks @@ -351,6 +351,13 @@ def test_list_root_directory_no_revision_no_detail_then_with_detail(self): files = self.hffs.ls(self.hf_path, detail=True) self.assertEqual(len(files), 2) self.assertTrue(files[0]["name"].endswith("/data") and files[1]["name"].endswith("/.gitattributes")) + self.assertIsNone( + self.hffs.dircache[self.hf_path][0]["last_commit"] + ) # no expand_info -> no last_commit in cache + + files = self.hffs.ls(self.hf_path, detail=True, expand_info=True) + self.assertEqual(len(files), 2) + self.assertTrue(files[0]["name"].endswith("/data") and files[1]["name"].endswith("/.gitattributes")) self.assertIsNotNone(self.hffs.dircache[self.hf_path][0]["last_commit"]) def test_find_root_directory_no_revision(self):