Skip to content

Commit 26c6487

Browse files
committed
Merge branch 'main' of github.com:machow/pins-python
2 parents 3f9e590 + e37372d commit 26c6487

File tree

16 files changed

+812
-70
lines changed

16 files changed

+812
-70
lines changed

.env.dev

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
# Pins optional config ----
2+
#PINS_CACHE_DIR=.pins_cache
3+
#PINS_DATA_DIR=.pins_data
4+
15
# AWS S3 backend ----
26
AWS_ACCESS_KEY_ID=
37
AWS_SECRET_ACCESS_KEY=

docs/getting_started.Rmd

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,3 +255,7 @@ import pandas as pd
255255

256256
pd.read_csv(fname[0]).head()
257257
```
258+
259+
```{python}
260+
my_data.pin_download("penguins")
261+
```

pins/boards.py

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from typing import Protocol, Sequence, Optional, Mapping
1313

1414
from .versions import VersionRaw, guess_version
15-
from .meta import Meta, MetaFactory
15+
from .meta import Meta, MetaRaw, MetaFactory
1616
from .errors import PinsError
1717
from .drivers import load_data, save_data, default_title
1818

@@ -45,9 +45,13 @@ def info(self, path):
4545

4646
class BaseBoard:
4747
def __init__(
48-
self, board: str, fs: IFileSystem, versioned=True, meta_factory=MetaFactory(),
48+
self,
49+
board: "str | Path",
50+
fs: IFileSystem,
51+
versioned=True,
52+
meta_factory=MetaFactory(),
4953
):
50-
self.board = board
54+
self.board = str(board)
5155
self.fs = fs
5256
self.meta_factory = meta_factory
5357

@@ -185,6 +189,12 @@ def pin_read(self, name, version: Optional[str] = None, hash: Optional[str] = No
185189
"""
186190
meta = self.pin_fetch(name, version)
187191

192+
if isinstance(meta, MetaRaw):
193+
raise TypeError(
194+
"Could not find metadata for this pin version. If this is an individual "
195+
"file, may need to use pin_download()."
196+
)
197+
188198
if hash is not None:
189199
raise NotImplementedError("TODO: validate hash")
190200

@@ -577,16 +587,41 @@ def pin_meta(self, name, version=None):
577587
pin_name = self.path_to_pin(name)
578588
meta_name = self.meta_factory.get_meta_name()
579589

590+
# special case where we are using http protocol to fetch what may be
591+
# a file. here we need to create a stripped down form of metadata, since
592+
# a metadata file does not exist (and we can't pull files from a version dir).
593+
path_to_pin = self.construct_path([pin_name])
594+
if self.fs.protocol == "http" and not path_to_pin.rstrip().endswith("/"):
595+
# create metadata, rather than read from a file
596+
return self.meta_factory.create_raw(path_to_pin, type="file",)
597+
580598
path_meta = self.construct_path([pin_name, meta_name])
581599
f = self.fs.open(path_meta)
582600
return self.meta_factory.read_pin_yaml(f, pin_name, VersionRaw(""))
583601

602+
def pin_download(self, name, version=None, hash=None) -> Sequence[str]:
603+
meta = self.pin_meta(name, version)
604+
605+
if isinstance(meta, MetaRaw):
606+
return load_data(meta, self.fs, None)
607+
608+
raise NotImplementedError("TODO: allow download beyond MetaRaw.")
609+
584610
def construct_path(self, elements):
585611
# TODO: in practice every call to construct_path has the first element of
586612
# pin name. to make this safer, we should enforce that in its signature.
587613
pin_name, *others = elements
588614
pin_path = self.pin_paths[pin_name]
589615

616+
if self.board.strip() == "":
617+
return pin_path
618+
619+
if len(others):
620+
# this is confusing, but R pins url board has a final "/" indicate that
621+
# something is a pin version, rather than a single file. but since other
622+
# boards forbid a final /, we need to strip it off to join elements
623+
pin_path = pin_path.rstrip().rstrip("/")
624+
590625
return super().construct_path([pin_path, *others])
591626

592627

@@ -626,7 +661,7 @@ def pin_search(self, search=None, as_df=True):
626661
# verify code is for inadequate permission to access
627662
if e.args[0]["code"] != 19:
628663
raise e
629-
# TODO(question): should this be a RawMeta class or something?
664+
# TODO(question): should this be a MetaRaw class or something?
630665
# that fixes our isinstance Meta below.
631666
# TODO(compatibility): R pins errors instead, see #27
632667
res.append({"name": pin_name, "meta": None})

pins/cache.py

Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
import humanize
2+
import os
3+
import time
4+
import shutil
5+
import urllib.parse
6+
7+
from fsspec.implementations.cached import SimpleCacheFileSystem, hash_name
8+
from fsspec import register_implementation
9+
from pathlib import Path
10+
11+
from .config import get_cache_dir
12+
13+
14+
# used if needed to preserve board path structure in the cache
15+
PLACEHOLDER_VERSION = "v"
16+
PLACEHOLDER_FILE = "file"
17+
18+
19+
def touch_access_time(path, access_time: "float | None" = None):
20+
"""Update access time of file.
21+
22+
Returns the new access time.
23+
"""
24+
25+
if access_time is None:
26+
access_time = time.time()
27+
28+
p = Path(path)
29+
30+
if not p.exists():
31+
p.touch()
32+
33+
stat = p.stat()
34+
os.utime(path, (access_time, stat.st_mtime))
35+
36+
return access_time
37+
38+
39+
def protocol_to_string(protocol):
40+
if isinstance(protocol, str):
41+
return protocol
42+
43+
return protocol[0]
44+
45+
46+
class PinsCache(SimpleCacheFileSystem):
47+
protocol = "pinscache"
48+
49+
def __init__(self, *args, hash_prefix=None, **kwargs):
50+
super().__init__(*args, **kwargs)
51+
self.hash_prefix = hash_prefix
52+
53+
def _open(self, path, *args, **kwargs):
54+
# For some reason, the open method of SimpleCacheFileSystem doesn't
55+
# call _make_local_details, so we need to patch in here.
56+
# Note that methods like .cat() do call it. Other Caches don't have this issue.
57+
path = self._strip_protocol(path)
58+
self._make_local_details(path)
59+
60+
return super()._open(path, *args, **kwargs)
61+
62+
def _make_local_details(self, path):
63+
# modifies method to create any parent directories needed by the cached file
64+
# note that this is called in ._open(), at the point it's known the file
65+
# will be cached
66+
fn = super()._make_local_details(path)
67+
Path(fn).parent.mkdir(parents=True, exist_ok=True)
68+
69+
return fn
70+
71+
def hash_name(self, path, same_name):
72+
# the main change in this function is that, for same_name, it returns
73+
# the full path
74+
if same_name:
75+
if self.hash_prefix:
76+
# optionally make the name relative to a parent path
77+
# using the hash of parent path as a prefix, to flatten a bit
78+
suffix = Path(path).relative_to(Path(self.hash_prefix))
79+
# TODO(compat): R pins uses xxh128 hash here, but fsspec uses sha256
80+
prefix = hash_name(self.hash_prefix, False)
81+
82+
# TODO: hacky to automatically tack on protocol here
83+
# but this is what R pins boards do. Could make a bool arg?
84+
proto_name = protocol_to_string(self.fs.protocol)
85+
full_prefix = "_".join([proto_name, prefix])
86+
return str(full_prefix / suffix)
87+
88+
return path
89+
else:
90+
return hash_name(path, same_name)
91+
92+
def touch_access_time(path):
93+
return touch_access_time(path)
94+
95+
96+
class PinsUrlCache(PinsCache):
97+
protocol = "pinsurlcache"
98+
99+
def hash_name(self, path, same_name):
100+
# strip final arg from path
101+
# note that R pins uses fs::path_file, and I'm not sure exactly how it
102+
# behaves for the many forms url paths can take.
103+
# e.g. path_file(.../extdata/) -> extdata
104+
# e.g. path_file(.../extdata?123) -> extdata?123
105+
path_parts = urllib.parse.urlparse(path)[2]
106+
107+
# strip off final whitespace and / (if it exists)
108+
# TODO(compat): python pins currently not keeping query part of url
109+
final_part = path_parts.rstrip().rstrip("/").rsplit("/", 1)[-1]
110+
111+
# TODO: what happens in R pins if no final part?
112+
if final_part == "":
113+
final_part = PLACEHOLDER_FILE
114+
115+
# hash url
116+
prefix = hash_name(path, False)
117+
118+
# note that we include an extra version folder, so it conforms with
119+
# pin board path form: <board_path>/<pin_name>/<version_name>/<file>
120+
proto_name = protocol_to_string(self.fs.protocol)
121+
full_prefix = "_".join([proto_name, prefix])
122+
return str(Path(full_prefix) / PLACEHOLDER_VERSION / final_part)
123+
124+
125+
class CachePruner:
126+
"""Prunes the cache directory, across multiple boards.
127+
128+
Note
129+
----
130+
131+
`pins` assumes that all boards cache using these rules:
132+
133+
* path structure: `<cache_root>/<board_hash>/<pin>/<version>`.
134+
* each version has a data.txt file in it.
135+
"""
136+
137+
meta_path = "data.txt"
138+
139+
def __init__(self, cache_dir: "str | Path"):
140+
self.cache_dir = Path(cache_dir)
141+
142+
def versions(self) -> "iter[Path]":
143+
for p_version in self.cache_dir.glob("*/*"):
144+
if p_version.is_dir() and (p_version / self.meta_path).exists():
145+
yield p_version
146+
147+
def should_prune_version(self, days, path: "str | Path"):
148+
path = Path(path)
149+
150+
expiry_time_sec = days * 60 * 60 * 24
151+
prune_before = time.time() - expiry_time_sec
152+
153+
p_meta = path / self.meta_path
154+
155+
if not p_meta.exists():
156+
raise FileNotFoundError(f"No metadata file: {p_meta.absolute()}")
157+
158+
access_time = p_meta.stat().st_atime
159+
return access_time < prune_before
160+
161+
def old_versions(self, days):
162+
return [p for p in self.versions() if self.should_prune_version(days, p)]
163+
164+
def prune(self, days=30):
165+
to_prune = self.old_versions(days)
166+
size = sum(map(disk_usage, to_prune))
167+
168+
# TODO: clean this up, general approach to prompting
169+
confirmed = prompt_cache_prune(to_prune, size)
170+
if confirmed:
171+
for path in to_prune:
172+
delete_version(to_prune)
173+
174+
print("Skipping cache deletion")
175+
176+
177+
def delete_version(path: "str | Path"):
178+
path = Path(path)
179+
shutil.rmtree(str(path.absolute()))
180+
181+
182+
def disk_usage(path):
183+
return sum(p.stat().st_size for p in path.glob("**/*") if p.is_file() or p.is_dir())
184+
185+
186+
def prompt_cache_prune(to_prune, size) -> bool:
187+
print(to_prune)
188+
human_size = humanize.naturalsize(size, binary=True)
189+
resp = input(f"Delete {len(to_prune)} pin versions, freeing {human_size}?")
190+
return resp == "yes"
191+
192+
193+
def cache_prune(days=30, cache_root=None, prompt=True):
194+
if cache_root is None:
195+
cache_root = get_cache_dir()
196+
197+
final_delete = []
198+
for p_board in Path(cache_root).glob("*"):
199+
pruner = CachePruner(p_board)
200+
final_delete.extend(pruner.old_versions(days))
201+
202+
size = sum(map(disk_usage, final_delete))
203+
204+
if prompt:
205+
confirmed = prompt_cache_prune(final_delete, size)
206+
else:
207+
confirmed = True
208+
if confirmed:
209+
for p in final_delete:
210+
delete_version(p)
211+
212+
213+
# def prune_files(days = 30, path = None):
214+
# if path is None:
215+
# for p_cache in Path(get_cache_dir()).glob("*"):
216+
# return prune_files(days=days, path=str(p_cache.absolute()))
217+
#
218+
# expiry_time_sec = days * 60 * 60 * 24
219+
# fs_cache = PinsCache(
220+
# target_protocol=None,
221+
# cache_storage=path,
222+
# check_files=True,
223+
# expiry_time=expiry_time_sec
224+
# )
225+
#
226+
# # note that fsspec considers only the last entry in cached_files deletable
227+
# for hash_path, entry in fs_cache.cached_files[-1].items():
228+
# if time.time() - detail["time"] > self.expiry:
229+
# fs_cache.pop_from_cache(entry["original"])
230+
231+
# TODO: swap to use entrypoint
232+
register_implementation("pinscache", PinsCache)

pins/config.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import appdirs
2+
import os
3+
4+
PINS_NAME = "pins-py"
5+
6+
7+
def get_data_dir():
8+
return os.environ.get("PINS_DATA_DIR", appdirs.user_data_dir(PINS_NAME))
9+
10+
11+
def get_cache_dir():
12+
return os.environ.get("PINS_CACHE_DIR", appdirs.user_cache_dir(PINS_NAME))

0 commit comments

Comments
 (0)