|
| 1 | +import humanize |
| 2 | +import os |
| 3 | +import time |
| 4 | +import shutil |
| 5 | +import urllib.parse |
| 6 | + |
| 7 | +from fsspec.implementations.cached import SimpleCacheFileSystem, hash_name |
| 8 | +from fsspec import register_implementation |
| 9 | +from pathlib import Path |
| 10 | + |
| 11 | +from .config import get_cache_dir |
| 12 | + |
| 13 | + |
| 14 | +# used if needed to preserve board path structure in the cache |
| 15 | +PLACEHOLDER_VERSION = "v" |
| 16 | +PLACEHOLDER_FILE = "file" |
| 17 | + |
| 18 | + |
| 19 | +def touch_access_time(path, access_time: "float | None" = None): |
| 20 | + """Update access time of file. |
| 21 | +
|
| 22 | + Returns the new access time. |
| 23 | + """ |
| 24 | + |
| 25 | + if access_time is None: |
| 26 | + access_time = time.time() |
| 27 | + |
| 28 | + p = Path(path) |
| 29 | + |
| 30 | + if not p.exists(): |
| 31 | + p.touch() |
| 32 | + |
| 33 | + stat = p.stat() |
| 34 | + os.utime(path, (access_time, stat.st_mtime)) |
| 35 | + |
| 36 | + return access_time |
| 37 | + |
| 38 | + |
| 39 | +def protocol_to_string(protocol): |
| 40 | + if isinstance(protocol, str): |
| 41 | + return protocol |
| 42 | + |
| 43 | + return protocol[0] |
| 44 | + |
| 45 | + |
| 46 | +class PinsCache(SimpleCacheFileSystem): |
| 47 | + protocol = "pinscache" |
| 48 | + |
| 49 | + def __init__(self, *args, hash_prefix=None, **kwargs): |
| 50 | + super().__init__(*args, **kwargs) |
| 51 | + self.hash_prefix = hash_prefix |
| 52 | + |
| 53 | + def _open(self, path, *args, **kwargs): |
| 54 | + # For some reason, the open method of SimpleCacheFileSystem doesn't |
| 55 | + # call _make_local_details, so we need to patch in here. |
| 56 | + # Note that methods like .cat() do call it. Other Caches don't have this issue. |
| 57 | + path = self._strip_protocol(path) |
| 58 | + self._make_local_details(path) |
| 59 | + |
| 60 | + return super()._open(path, *args, **kwargs) |
| 61 | + |
| 62 | + def _make_local_details(self, path): |
| 63 | + # modifies method to create any parent directories needed by the cached file |
| 64 | + # note that this is called in ._open(), at the point it's known the file |
| 65 | + # will be cached |
| 66 | + fn = super()._make_local_details(path) |
| 67 | + Path(fn).parent.mkdir(parents=True, exist_ok=True) |
| 68 | + |
| 69 | + return fn |
| 70 | + |
| 71 | + def hash_name(self, path, same_name): |
| 72 | + # the main change in this function is that, for same_name, it returns |
| 73 | + # the full path |
| 74 | + if same_name: |
| 75 | + if self.hash_prefix: |
| 76 | + # optionally make the name relative to a parent path |
| 77 | + # using the hash of parent path as a prefix, to flatten a bit |
| 78 | + suffix = Path(path).relative_to(Path(self.hash_prefix)) |
| 79 | + # TODO(compat): R pins uses xxh128 hash here, but fsspec uses sha256 |
| 80 | + prefix = hash_name(self.hash_prefix, False) |
| 81 | + |
| 82 | + # TODO: hacky to automatically tack on protocol here |
| 83 | + # but this is what R pins boards do. Could make a bool arg? |
| 84 | + proto_name = protocol_to_string(self.fs.protocol) |
| 85 | + full_prefix = "_".join([proto_name, prefix]) |
| 86 | + return str(full_prefix / suffix) |
| 87 | + |
| 88 | + return path |
| 89 | + else: |
| 90 | + return hash_name(path, same_name) |
| 91 | + |
| 92 | + def touch_access_time(path): |
| 93 | + return touch_access_time(path) |
| 94 | + |
| 95 | + |
| 96 | +class PinsUrlCache(PinsCache): |
| 97 | + protocol = "pinsurlcache" |
| 98 | + |
| 99 | + def hash_name(self, path, same_name): |
| 100 | + # strip final arg from path |
| 101 | + # note that R pins uses fs::path_file, and I'm not sure exactly how it |
| 102 | + # behaves for the many forms url paths can take. |
| 103 | + # e.g. path_file(.../extdata/) -> extdata |
| 104 | + # e.g. path_file(.../extdata?123) -> extdata?123 |
| 105 | + path_parts = urllib.parse.urlparse(path)[2] |
| 106 | + |
| 107 | + # strip off final whitespace and / (if it exists) |
| 108 | + # TODO(compat): python pins currently not keeping query part of url |
| 109 | + final_part = path_parts.rstrip().rstrip("/").rsplit("/", 1)[-1] |
| 110 | + |
| 111 | + # TODO: what happens in R pins if no final part? |
| 112 | + if final_part == "": |
| 113 | + final_part = PLACEHOLDER_FILE |
| 114 | + |
| 115 | + # hash url |
| 116 | + prefix = hash_name(path, False) |
| 117 | + |
| 118 | + # note that we include an extra version folder, so it conforms with |
| 119 | + # pin board path form: <board_path>/<pin_name>/<version_name>/<file> |
| 120 | + proto_name = protocol_to_string(self.fs.protocol) |
| 121 | + full_prefix = "_".join([proto_name, prefix]) |
| 122 | + return str(Path(full_prefix) / PLACEHOLDER_VERSION / final_part) |
| 123 | + |
| 124 | + |
| 125 | +class CachePruner: |
| 126 | + """Prunes the cache directory, across multiple boards. |
| 127 | +
|
| 128 | + Note |
| 129 | + ---- |
| 130 | +
|
| 131 | + `pins` assumes that all boards cache using these rules: |
| 132 | +
|
| 133 | + * path structure: `<cache_root>/<board_hash>/<pin>/<version>`. |
| 134 | + * each version has a data.txt file in it. |
| 135 | + """ |
| 136 | + |
| 137 | + meta_path = "data.txt" |
| 138 | + |
| 139 | + def __init__(self, cache_dir: "str | Path"): |
| 140 | + self.cache_dir = Path(cache_dir) |
| 141 | + |
| 142 | + def versions(self) -> "iter[Path]": |
| 143 | + for p_version in self.cache_dir.glob("*/*"): |
| 144 | + if p_version.is_dir() and (p_version / self.meta_path).exists(): |
| 145 | + yield p_version |
| 146 | + |
| 147 | + def should_prune_version(self, days, path: "str | Path"): |
| 148 | + path = Path(path) |
| 149 | + |
| 150 | + expiry_time_sec = days * 60 * 60 * 24 |
| 151 | + prune_before = time.time() - expiry_time_sec |
| 152 | + |
| 153 | + p_meta = path / self.meta_path |
| 154 | + |
| 155 | + if not p_meta.exists(): |
| 156 | + raise FileNotFoundError(f"No metadata file: {p_meta.absolute()}") |
| 157 | + |
| 158 | + access_time = p_meta.stat().st_atime |
| 159 | + return access_time < prune_before |
| 160 | + |
| 161 | + def old_versions(self, days): |
| 162 | + return [p for p in self.versions() if self.should_prune_version(days, p)] |
| 163 | + |
| 164 | + def prune(self, days=30): |
| 165 | + to_prune = self.old_versions(days) |
| 166 | + size = sum(map(disk_usage, to_prune)) |
| 167 | + |
| 168 | + # TODO: clean this up, general approach to prompting |
| 169 | + confirmed = prompt_cache_prune(to_prune, size) |
| 170 | + if confirmed: |
| 171 | + for path in to_prune: |
| 172 | + delete_version(to_prune) |
| 173 | + |
| 174 | + print("Skipping cache deletion") |
| 175 | + |
| 176 | + |
| 177 | +def delete_version(path: "str | Path"): |
| 178 | + path = Path(path) |
| 179 | + shutil.rmtree(str(path.absolute())) |
| 180 | + |
| 181 | + |
| 182 | +def disk_usage(path): |
| 183 | + return sum(p.stat().st_size for p in path.glob("**/*") if p.is_file() or p.is_dir()) |
| 184 | + |
| 185 | + |
| 186 | +def prompt_cache_prune(to_prune, size) -> bool: |
| 187 | + print(to_prune) |
| 188 | + human_size = humanize.naturalsize(size, binary=True) |
| 189 | + resp = input(f"Delete {len(to_prune)} pin versions, freeing {human_size}?") |
| 190 | + return resp == "yes" |
| 191 | + |
| 192 | + |
| 193 | +def cache_prune(days=30, cache_root=None, prompt=True): |
| 194 | + if cache_root is None: |
| 195 | + cache_root = get_cache_dir() |
| 196 | + |
| 197 | + final_delete = [] |
| 198 | + for p_board in Path(cache_root).glob("*"): |
| 199 | + pruner = CachePruner(p_board) |
| 200 | + final_delete.extend(pruner.old_versions(days)) |
| 201 | + |
| 202 | + size = sum(map(disk_usage, final_delete)) |
| 203 | + |
| 204 | + if prompt: |
| 205 | + confirmed = prompt_cache_prune(final_delete, size) |
| 206 | + else: |
| 207 | + confirmed = True |
| 208 | + if confirmed: |
| 209 | + for p in final_delete: |
| 210 | + delete_version(p) |
| 211 | + |
| 212 | + |
| 213 | +# def prune_files(days = 30, path = None): |
| 214 | +# if path is None: |
| 215 | +# for p_cache in Path(get_cache_dir()).glob("*"): |
| 216 | +# return prune_files(days=days, path=str(p_cache.absolute())) |
| 217 | +# |
| 218 | +# expiry_time_sec = days * 60 * 60 * 24 |
| 219 | +# fs_cache = PinsCache( |
| 220 | +# target_protocol=None, |
| 221 | +# cache_storage=path, |
| 222 | +# check_files=True, |
| 223 | +# expiry_time=expiry_time_sec |
| 224 | +# ) |
| 225 | +# |
| 226 | +# # note that fsspec considers only the last entry in cached_files deletable |
| 227 | +# for hash_path, entry in fs_cache.cached_files[-1].items(): |
| 228 | +# if time.time() - detail["time"] > self.expiry: |
| 229 | +# fs_cache.pop_from_cache(entry["original"]) |
| 230 | + |
| 231 | +# TODO: swap to use entrypoint |
| 232 | +register_implementation("pinscache", PinsCache) |
0 commit comments