Skip to content

Commit 97a8fa0

Browse files
committed
fix: working board_urls cache, which is flat
1 parent d154a56 commit 97a8fa0

File tree

4 files changed

+132
-38
lines changed

4 files changed

+132
-38
lines changed

pins/boards.py

Lines changed: 54 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from .errors import PinsError
1717
from .drivers import load_data, save_data, default_title
1818
from .utils import inform
19-
from .config import get_allow_rsc_short_name, get_feature_preview
19+
from .config import get_allow_rsc_short_name
2020

2121

2222
_log = logging.getLogger(__name__)
@@ -155,12 +155,10 @@ def pin_meta(self, name, version: str = None) -> Meta:
155155
meta_name = self.meta_factory.get_meta_name(*components)
156156

157157
path_meta = self.construct_path([*components, meta_name])
158-
f = self.fs.open(path_meta)
158+
f = self._open_pin_meta(path_meta)
159159

160160
meta = self.meta_factory.read_pin_yaml(f, pin_name, selected_version)
161161

162-
self._touch_cache(path_meta)
163-
164162
return meta
165163

166164
def pin_list(self):
@@ -213,11 +211,8 @@ def pin_read(self, name, version: Optional[str] = None, hash: Optional[str] = No
213211

214212
pin_name = self.path_to_pin(name)
215213

216-
return load_data(
217-
meta,
218-
self.fs,
219-
self.construct_path([pin_name, meta.version.version]),
220-
allow_pickle_read=self.allow_pickle_read,
214+
return self._load_data(
215+
meta, self.construct_path([pin_name, meta.version.version])
221216
)
222217

223218
def pin_write(
@@ -493,7 +488,13 @@ def pin_browse(self, name, version=None, local=False):
493488

494489
raise NotImplementedError()
495490

491+
# pin name internal methods -----------------------------------------------
492+
# these methods are responsible for validating pin names, and converting
493+
# names to their ultimate path on the file system.
494+
496495
def validate_pin_name(self, name: str) -> None:
496+
"""Raise an error if a pin name is not valid."""
497+
497498
if "/" in name:
498499
raise ValueError(f"Invalid pin name: {name}")
499500

@@ -512,6 +513,8 @@ def construct_path(self, elements) -> str:
512513
def keep_final_path_component(self, path):
513514
return path.split("/")[-1]
514515

516+
# version ordering, creation ----------------------------------------------
517+
515518
def sort_pin_versions(self, versions):
516519
# assume filesystem returned them with most recent last
517520
return sorted(versions, key=lambda v: v.version)
@@ -574,10 +577,29 @@ def _extract_search_meta(self, meta):
574577
d["meta"] = meta
575578
return d
576579

577-
def _get_cache_path(self, pin_name, version):
578-
p_version = self.construct_path([self.path_to_pin(pin_name), version])
579-
hash = self.fs.hash_name(p_version, True)
580-
return str(Path(self.fs.storage[-1]) / hash)
580+
# data loading ------------------------------------------------------------
581+
582+
def _load_data(self, meta, pin_version_path):
583+
"""Return the data object stored by a pin (e.g. a DataFrame)."""
584+
return load_data(
585+
meta, self.fs, pin_version_path, allow_pickle_read=self.allow_pickle_read
586+
)
587+
588+
# filesystem and cache methods --------------------------------------------
589+
590+
def _open_pin_meta(self, path):
591+
f = self.fs.open(path)
592+
self._touch_cache(path)
593+
594+
return f
595+
596+
def _get_cache_path(self, pin_name, version=None, fname=None):
597+
version_part = [version] if version is not None else []
598+
fname_part = [fname] if fname is not None else []
599+
p_version = self.construct_path(
600+
[self.path_to_pin(pin_name), *version_part, *fname_part]
601+
)
602+
return self.fs._check_file(p_version)
581603

582604
def _touch_cache(self, path):
583605
from pins.cache import touch_access_time
@@ -587,8 +609,7 @@ def _touch_cache(self, path):
587609
if not hasattr(self.fs, "cached_files"):
588610
return
589611

590-
hash = self.fs.hash_name(path, True)
591-
path_to_hashed = Path(self.fs.storage[-1]) / hash
612+
path_to_hashed = self.fs._check_file(path)
592613
return touch_access_time(path_to_hashed)
593614

594615

@@ -611,9 +632,6 @@ class BoardManual(BaseBoard):
611632
0 1 a 3
612633
1 2 b 4
613634
614-
615-
616-
617635
"""
618636

619637
# TODO(question): is this class worth it? Or should the user just use fsspec?
@@ -623,14 +641,14 @@ def __init__(self, *args, pin_paths: dict, **kwargs):
623641

624642
self.pin_paths = pin_paths
625643

626-
def pin_read(self, *args, **kwargs):
627-
if not get_feature_preview():
628-
raise NotImplementedError(
629-
"pin_read with BoardManual is currently unimplemented. "
630-
"See https://github.com/machow/pins-python/issues/59."
631-
)
644+
# def pin_read(self, *args, **kwargs):
645+
# if not get_feature_preview():
646+
# raise NotImplementedError(
647+
# "pin_read with BoardManual is currently unimplemented. "
648+
# "See https://github.com/machow/pins-python/issues/59."
649+
# )
632650

633-
return super().pin_read(*args, **kwargs)
651+
# return super().pin_read(*args, **kwargs)
634652

635653
def pin_list(self):
636654
return list(self.pin_paths)
@@ -654,18 +672,20 @@ def pin_meta(self, name, version=None):
654672
return self.meta_factory.create_raw(path_to_pin, type="file", name=pin_name)
655673

656674
path_meta = self.construct_path([pin_name, meta_name])
657-
f = self.fs.open(path_meta)
675+
f = self._open_pin_meta(path_meta)
658676
meta = self.meta_factory.read_pin_yaml(f, pin_name, VersionRaw(""))
659677

678+
# TODO(#59,#83): handle caching, and then re-enable pin_read.
679+
# self._touch_cache(path_meta)
680+
660681
return meta
661682

662683
def pin_download(self, name, version=None, hash=None) -> Sequence[str]:
663684
meta = self.pin_meta(name, version)
664685

665686
if isinstance(meta, MetaRaw):
666-
return load_data(
667-
meta, self.fs, None, allow_pickle_read=self.allow_pickle_read
668-
)
687+
688+
return self._load_data(meta, None)
669689

670690
raise NotImplementedError(
671691
"TODO: pin_download currently can only read a url to a single file."
@@ -686,6 +706,11 @@ def construct_path(self, elements):
686706
# boards forbid a final /, we need to strip it off to join elements
687707
pin_path = pin_path.rstrip().rstrip("/")
688708

709+
# this is a bit hacky, but this board only aims at specific pins versions,
710+
# so the metadata has the version as "", so we need to remove it.
711+
if others[0] == "":
712+
return super().construct_path([pin_path, *others[1:]])
713+
689714
return super().construct_path([pin_path, *others])
690715

691716

pins/cache.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,16 @@ def protocol_to_string(protocol):
4747
return protocol[0]
4848

4949

50+
def prefix_cache(fs, board_base_path):
51+
if isinstance(fs, str):
52+
proto_name = fs
53+
else:
54+
proto_name = protocol_to_string(fs.protocol)
55+
base_hash = hash_name(board_base_path, False)
56+
57+
return f"{proto_name}_{base_hash}"
58+
59+
5060
class PinsCache(SimpleCacheFileSystem):
5161
protocol = "pinscache"
5262

@@ -154,6 +164,33 @@ def hash_name(self, path, same_name):
154164
return str(Path(full_prefix) / PLACEHOLDER_VERSION / final_part)
155165

156166

167+
class PinsAccessTimeCache(SimpleCacheFileSystem):
168+
name = "pinsaccesstimecache"
169+
170+
def hash_name(self, path, same_name):
171+
if same_name:
172+
raise NotImplementedError("same_name not implemented.")
173+
174+
# hash full path, and put anything after the final / at the end, just
175+
# to make it easier to browse.
176+
base_name = super().hash_name(path, same_name)
177+
suffix = Path(path).name
178+
return f"{base_name}_{suffix}"
179+
180+
def _open(self, path, mode="rb", **kwargs):
181+
f = super()._open(path, mode=mode, **kwargs)
182+
fn = self._check_file(path)
183+
184+
if fn is None:
185+
raise ValueError(
186+
f"Cached file should exist for path, but none found: {path}"
187+
)
188+
189+
touch_access_time(fn)
190+
191+
return f
192+
193+
157194
class CachePruner:
158195
"""Prunes the cache directory, across multiple boards.
159196

pins/constructors.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import tempfile
44

55
from .boards import BaseBoard, BoardRsConnect, BoardManual
6-
from .cache import PinsCache, PinsUrlCache, PinsRscCache
6+
from .cache import PinsCache, PinsRscCache, PinsAccessTimeCache, prefix_cache
77
from .config import get_data_dir, get_cache_dir
88

99

@@ -274,10 +274,10 @@ def board_urls(path: str, pin_paths: dict, cache=DEFAULT, allow_pickle_read=None
274274
Example
275275
-------
276276
277-
>>> github_raw = "https://raw.githubusercontent.com/"
277+
>>> github_raw = "https://raw.githubusercontent.com/machow/pins-python/main/pins/tests/pins-compat"
278278
>>> pin_paths = {
279-
... "df_csv": "df_csv/20220214T163720Z-9bfad",
280-
... "df_arrow": "df_arrow/20220214T163720Z-ad0c1",
279+
... "df_csv": "df_csv/20220214T163720Z-9bfad/",
280+
... "df_arrow": "df_arrow/20220214T163720Z-ad0c1/",
281281
... }
282282
>>> board = board_urls(github_raw, pin_paths)
283283
>>> board.pin_list()
@@ -289,8 +289,10 @@ def board_urls(path: str, pin_paths: dict, cache=DEFAULT, allow_pickle_read=None
289289
# copied from board(). this ensures that paths in cache have the form:
290290
# <full_path_hash>/<version_placeholder>/<file_name>
291291
cache_dir = get_cache_dir()
292-
fs = PinsUrlCache(
293-
target_protocol="http", cache_storage=cache_dir, same_names=True
292+
sub_dir = prefix_cache("http", path)
293+
sub_cache = f"{cache_dir}/{sub_dir}"
294+
fs = PinsAccessTimeCache(
295+
target_protocol="http", cache_storage=sub_cache, same_names=False
294296
)
295297
else:
296298
raise NotImplementedError("Can't currently pass own cache object")

pins/tests/test_constructors.py

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,12 +65,42 @@ def test_constructor_board_url_cache(tmp_cache, http_example_board_path, df_csv)
6565

6666
assert len(http_dirs) == 1
6767

68+
# there are two files in the flat cache (metadata, and the csv)
6869
parent = http_dirs[0]
69-
res = list(parent.rglob("**/*.csv"))
70+
res = list(parent.rglob("*"))
71+
assert len(res) == 2
72+
73+
# validate that it creates an empty metadata file
74+
assert len(x for x in res if x.endswith("df_csv.csv")) == 1
75+
assert len(x for x in res if x.endswith("data.txt")) == 1
76+
77+
assert len(list(parent.glob("**/*"))) == 2
78+
79+
80+
@pytest.mark.skip_on_github
81+
def test_constructor_board_url_file(tmp_cache, http_example_board_path):
82+
# TODO: downloading a pin does not put files in the same directory, since
83+
# in this case we are hashing on the full url.
84+
85+
board = c.board_urls(
86+
http_example_board_path,
87+
# could derive from example version path
88+
pin_paths={"df_csv": "df_csv/20220214T163718Z-eceac/df_csv.csv"},
89+
)
90+
91+
board.pin_download("df_csv")
92+
93+
# check cache ----
94+
http_dirs = list(tmp_cache.glob("http_*"))
95+
96+
assert len(http_dirs) == 1
97+
98+
# there are two files in the flat cache (metadata, and the csv)
99+
parent = http_dirs[0]
100+
res = list(parent.rglob("*"))
70101
assert len(res) == 1
71102

72-
# has form: <pin>/<version>/<file>
73-
check_cache_file_path(res[0], parent)
103+
assert str(res[0]).endswith("df_csv.csv")
74104

75105

76106
@pytest.mark.skip_on_github

0 commit comments

Comments
 (0)