Skip to content

Commit d0d1095

Browse files
refactor(fsspec): remove upper bound on fsspec (#221)
* refactor(fsspec): remove upper bound on fsspec We're using `pins` as a dependency in `ibis-framework` and ideally we can remove the upper-pin on `fsspec` so that doesn't impact our users. I think the usage of the moved `hash_name` function is the only breaking change between the current upper pin and most recent release, so I've added a small workaround to handle the change. All of the local tests pass, I didn't yet set up sufficient credentials to handle the various cloud tests. * vendor in hash_name * file or local protocols * update gcs path * file, local not protocols * move to custom CacheMappers * move PinsRscCache to its own mapper * remove same_name * fix test paths to be windows compatible * remove 3.7 support * specify get_file for RsConnectFs * keep a call to hash_name * move gh org machow-> rstudio --------- Co-authored-by: isabelizimm <[email protected]>
1 parent e7f8077 commit d0d1095

File tree

10 files changed

+129
-69
lines changed

10 files changed

+129
-69
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ README.md:
2626
test: test-most test-rsc
2727

2828
test-most:
29-
pytest pins -m "not fs_rsc" --workers 4 --tests-per-worker 1
29+
pytest pins -m "not fs_rsc and not fs_s3" --workers 4 --tests-per-worker 1 -vv
3030

3131
test-rsc:
3232
pytest pins -m "fs_rsc"

pins/boards.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from .drivers import load_data, save_data, load_file, default_title
1818
from .utils import inform, warn_deprecated, ExtendMethodDoc
1919
from .config import get_allow_rsc_short_name
20+
from .cache import PinsCache
2021

2122

2223
_log = logging.getLogger(__name__)
@@ -733,9 +734,8 @@ def _touch_cache(self, path):
733734

734735
# TODO: assumes same_name set to True. Let's require this be set to
735736
# instantiate a pins cache.
736-
if not hasattr(self.fs, "cached_files"):
737+
if not isinstance(self.fs, PinsCache):
737738
return
738-
739739
path_to_hashed = self.fs._check_file(path)
740740
return touch_access_time(path_to_hashed)
741741

@@ -747,7 +747,7 @@ class BoardManual(BaseBoard):
747747
--------
748748
>>> import fsspec
749749
>>> import os
750-
>>> fs = fsspec.filesystem("github", org = "machow", repo = "pins-python")
750+
>>> fs = fsspec.filesystem("github", org = "rstudio", repo = "pins-python")
751751
>>> pin_paths = {"df_csv": "df_csv/20220214T163720Z-9bfad/"}
752752
>>> board = BoardManual("pins/tests/pins-compat", fs, pin_paths=pin_paths)
753753

pins/cache.py

Lines changed: 78 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,12 @@
55
import shutil
66
import urllib.parse
77

8-
from fsspec.implementations.cached import SimpleCacheFileSystem, hash_name
8+
from fsspec.implementations.cached import SimpleCacheFileSystem
99
from fsspec import register_implementation
1010
from pathlib import Path
1111

1212
from .config import get_cache_dir
13-
from .utils import inform
13+
from .utils import inform, hash_name
1414

1515
_log = logging.getLogger(__name__)
1616

@@ -57,12 +57,62 @@ def prefix_cache(fs, board_base_path):
5757
return f"{proto_name}_{base_hash}"
5858

5959

60+
class HashMapper:
61+
def __init__(self, hash_prefix):
62+
self.hash_prefix = hash_prefix
63+
64+
def __call__(self, path: str) -> str:
65+
if self.hash_prefix is not None:
66+
# optionally make the name relative to a parent path
67+
# using the hash of parent path as a prefix, to flatten a bit
68+
hash = Path(path).relative_to(Path(self.hash_prefix))
69+
return hash
70+
71+
else:
72+
raise NotImplementedError()
73+
74+
75+
class PinsAccessTimeCacheMapper:
76+
def __init__(self, hash_prefix):
77+
self.hash_prefix = hash_prefix
78+
79+
def __call__(self, path):
80+
# hash full path, and put anything after the final / at the end, just
81+
# to make it easier to browse.
82+
# this has
83+
base_name = hash_name(path, False)
84+
suffix = Path(path).name
85+
return f"{base_name}_{suffix}"
86+
87+
88+
class PinsRscCacheMapper:
89+
"""Modifies the PinsCache to allow hash_prefix to be an RSC server url.
90+
91+
Note that this class also modifies the first / in a path to be a -, so that
92+
pin contents will not be put into subdirectories, for e.g. michael/mtcars/data.txt.
93+
"""
94+
95+
def __init__(self, hash_prefix):
96+
self.hash_prefix = hash_prefix
97+
98+
def __call__(self, path):
99+
# the main change in this function is that, for same_name, it returns
100+
# the full path
101+
# change pin path of form <user>/<content> to <user>+<content>
102+
hash = path.replace("/", "+", 1)
103+
return hash
104+
105+
60106
class PinsCache(SimpleCacheFileSystem):
61107
protocol = "pinscache"
62108

63-
def __init__(self, *args, hash_prefix=None, **kwargs):
109+
def __init__(self, *args, hash_prefix=None, mapper=HashMapper, **kwargs):
64110
super().__init__(*args, **kwargs)
65111
self.hash_prefix = hash_prefix
112+
self._mapper = mapper(hash_prefix)
113+
114+
def hash_name(self, path, *args, **kwargs):
115+
return self._mapper(path)
66116

67117
def _open(self, path, *args, **kwargs):
68118
# For some reason, the open method of SimpleCacheFileSystem doesn't
@@ -83,43 +133,14 @@ def _make_local_details(self, path):
83133

84134
return fn
85135

86-
def hash_name(self, path, same_name):
87-
# the main change in this function is that, for same_name, it returns
88-
# the full path
89-
if same_name:
90-
if self.hash_prefix is not None:
91-
# optionally make the name relative to a parent path
92-
# using the hash of parent path as a prefix, to flatten a bit
93-
hash = Path(path).relative_to(Path(self.hash_prefix))
94-
return hash
95-
96-
return path
97-
else:
98-
raise NotImplementedError()
99-
100-
101-
class PinsRscCache(PinsCache):
102-
"""Modifies the PinsCache to allow hash_prefix to be an RSC server url.
103-
104-
Note that this class also modifies the first / in a path to be a -, so that
105-
pin contents will not be put into subdirectories, for e.g. michael/mtcars/data.txt.
106-
"""
107-
108-
protocol = "pinsrsccache"
109-
110-
def hash_name(self, path, same_name):
111-
# the main change in this function is that, for same_name, it returns
112-
# the full path
113-
if same_name:
114-
if self.hash_prefix is None:
115-
raise NotImplementedError()
116-
117-
# change pin path of form <user>/<content> to <user>+<content>
118-
hash = path.replace("/", "+", 1)
119-
return hash
120-
121-
else:
122-
raise NotImplementedError()
136+
# same as upstream, brought in to preserve backwards compatibility
137+
def _check_file(self, path):
138+
self._check_cache()
139+
sha = self._mapper(path)
140+
for storage in self.storage:
141+
fn = os.path.join(storage, sha)
142+
if os.path.exists(fn):
143+
return fn
123144

124145

125146
class PinsUrlCache(PinsCache):
@@ -154,15 +175,15 @@ def hash_name(self, path, same_name):
154175
class PinsAccessTimeCache(SimpleCacheFileSystem):
155176
name = "pinsaccesstimecache"
156177

157-
def hash_name(self, path, same_name):
158-
if same_name:
159-
raise NotImplementedError("same_name not implemented.")
178+
def __init__(
179+
self, *args, hash_prefix=None, mapper=PinsAccessTimeCacheMapper, **kwargs
180+
):
181+
super().__init__(*args, **kwargs)
182+
self.hash_prefix = hash_prefix
183+
self._mapper = mapper(hash_prefix)
160184

161-
# hash full path, and put anything after the final / at the end, just
162-
# to make it easier to browse.
163-
base_name = super().hash_name(path, same_name)
164-
suffix = Path(path).name
165-
return f"{base_name}_{suffix}"
185+
def hash_name(self, path, *args, **kwargs):
186+
return self._mapper(path)
166187

167188
def _open(self, path, mode="rb", **kwargs):
168189
f = super()._open(path, mode=mode, **kwargs)
@@ -177,6 +198,15 @@ def _open(self, path, mode="rb", **kwargs):
177198

178199
return f
179200

201+
# same as upstream, brought in to preserve backwards compatibility
202+
def _check_file(self, path):
203+
self._check_cache()
204+
sha = self._mapper(path)
205+
for storage in self.storage:
206+
fn = os.path.join(storage, sha)
207+
if os.path.exists(fn):
208+
return fn
209+
180210

181211
class CachePruner:
182212
"""Prunes the cache directory, across multiple boards.

pins/constructors.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import tempfile
44

55
from .boards import BaseBoard, BoardRsConnect, BoardManual
6-
from .cache import PinsCache, PinsRscCache, PinsAccessTimeCache, prefix_cache
6+
from .cache import PinsCache, PinsRscCacheMapper, PinsAccessTimeCache, prefix_cache
77
from .config import get_data_dir, get_cache_dir
88

99

@@ -56,9 +56,9 @@ def board_deparse(board: BaseBoard):
5656
if prot == "rsc":
5757
url = board.fs.api.server_url
5858
return f"board_connect(server_url={repr(url)}{allow_pickle})"
59-
elif prot == "file":
59+
elif prot in ["file", ("file", "local")]:
6060
return f"board_folder({repr(board.board)}{allow_pickle})"
61-
elif prot == ["s3", "s3a"]:
61+
elif prot in [["s3", "s3a"], ("s3", "s3a")]:
6262
return f"board_s3({repr(board.board)}{allow_pickle})"
6363
elif prot == "abfs":
6464
return f"board_azure({repr(board.board)}{allow_pickle})"
@@ -93,15 +93,15 @@ def board(
9393
9494
Parameters
9595
----------
96-
protocol:
96+
protocol: str
9797
File system protocol. E.g. file, s3, github, rsc (for Posit Connect).
9898
See `fsspec.filesystem` for more information.
99-
path:
99+
path: str
100100
A base path the board should use. For example, the directory the board lives in,
101101
or the path to its S3 bucket.
102-
versioned:
102+
versioned: bool
103103
Whether or not pins should be versioned.
104-
cache:
104+
cache: optional, type[DEFAULT]
105105
Whether to use a cache. By default, pins attempts to select the right cache
106106
directory, given your filesystem. If `None` is passed, then no cache will be
107107
used. You can set the cache using the `PINS_CACHE_DIR` environment variable.
@@ -154,8 +154,12 @@ def board(
154154
board_cache = prefix_cache(fs, hash_prefix)
155155
cache_dir = os.path.join(base_cache_dir, board_cache)
156156

157-
fs = PinsRscCache(
158-
cache_storage=cache_dir, fs=fs, hash_prefix=hash_prefix, same_names=True
157+
fs = PinsCache(
158+
cache_storage=cache_dir,
159+
fs=fs,
160+
hash_prefix=hash_prefix,
161+
same_names=True,
162+
mapper=PinsRscCacheMapper,
159163
)
160164
else:
161165
# ensures each subdir path is its own cache directory

pins/rsconnect/fs.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
RsConnectApiRequestError,
1818
RSC_CODE_OBJECT_DOES_NOT_EXIST,
1919
)
20+
from ..utils import isfilelike
2021

2122
# Misc ----
2223

@@ -277,6 +278,14 @@ def get(self, rpath, lpath, recursive=False, *args, **kwargs) -> None:
277278
bundle["content_guid"], bundle["id"], parsed.file_name, lpath
278279
)
279280

281+
def get_file(self, rpath, lpath, **kwargs):
282+
data = self.cat_file(rpath, **kwargs)
283+
if isfilelike(lpath):
284+
lpath.write(data)
285+
else:
286+
with open(lpath, "wb") as f:
287+
f.write(data)
288+
280289
def exists(self, path: str, **kwargs) -> bool:
281290
try:
282291
self.info(path)

pins/tests/test_boards.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,6 @@ def test_board_pin_download_filename(board_with_cache, tmp_path):
201201

202202
def test_board_pin_download_no_cache_error(board, tmp_path):
203203
df = pd.DataFrame({"x": [1, 2, 3]})
204-
205204
path = tmp_path / "data.csv"
206205
df.to_csv(path, index=False)
207206

@@ -210,7 +209,7 @@ def test_board_pin_download_no_cache_error(board, tmp_path):
210209
assert meta.type == "file"
211210

212211
# file boards work okay, since the board directory itself is the cache
213-
if board.fs.protocol == "file":
212+
if board.fs.protocol in ["file", ("file", "local")]:
214213
pytest.skip()
215214

216215
# uncached boards should fail, since nowhere to store the download

pins/tests/test_cache.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,8 @@ def test_touch_access_time_auto(some_file):
5959

6060

6161
def test_pins_cache_hash_name_preserves():
62-
cache = PinsCache(fs=filesystem("file"))
63-
assert cache.hash_name("a/b/c.txt", True) == "a/b/c.txt"
62+
cache = PinsCache(fs=filesystem("file"), hash_prefix="")
63+
assert cache.hash_name("a/b/c.txt") == Path("a/b/c.txt")
6464

6565

6666
def test_pins_cache_url_hash_name():

pins/tests/test_constructors.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def construct_from_board(board):
3939
prot = board.fs.protocol
4040
fs_name = prot if isinstance(prot, str) else prot[0]
4141

42-
if fs_name == "file":
42+
if fs_name in ["file", ("file", "local")]:
4343
board = c.board_folder(board.board)
4444
elif fs_name == "rsc":
4545
board = c.board_rsconnect(
@@ -149,7 +149,7 @@ def test_constructor_board_url_file(tmp_cache, http_example_board_path):
149149

150150
@pytest.mark.skip_on_github
151151
def test_constructor_board_github(tmp_cache, http_example_board_path, df_csv):
152-
board = c.board_github("machow", "pins-python", EXAMPLE_REL_PATH) # noqa
152+
board = c.board_github("rstudio", "pins-python", EXAMPLE_REL_PATH) # noqa
153153

154154
df = board.pin_read("df_csv")
155155
assert_frame_equal(df, df_csv)
@@ -190,7 +190,7 @@ def test_constructor_boards(board, df_csv, tmp_cache):
190190
# check the cache structure -----------------------------------------------
191191

192192
# check cache
193-
if board.fs.protocol == "file":
193+
if board.fs.protocol in ["file", ("file", "local")]:
194194
# no caching for local file boards
195195
pass
196196
else:

pins/utils.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import hashlib
2+
import os
13
import sys
24

35
from functools import update_wrapper
@@ -19,6 +21,14 @@ def warn_deprecated(msg):
1921
warn(msg, DeprecationWarning)
2022

2123

24+
def hash_name(path, same_name):
25+
if same_name:
26+
hash = os.path.basename(path)
27+
else:
28+
hash = hashlib.sha256(path.encode()).hexdigest()
29+
return hash
30+
31+
2232
class ExtendMethodDoc:
2333
# Note that the indentation assumes these are top-level method docstrings,
2434
# so are indented 8 spaces (after the initial sentence).
@@ -68,3 +78,11 @@ def __call__(self, *args, **kwargs):
6878
# which allows all the inspect machinery to give sphinx the __call__
6979
# attribute we set in __init__.
7080
raise NotImplementedError()
81+
82+
83+
# based off fsspec.isfilelike
84+
def isfilelike(file) -> bool:
85+
for attr in ["read", "close", "tell"]:
86+
if not hasattr(file, attr):
87+
return False
88+
return True

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ zipsafe = False
2424

2525
python_requires = >=3.8
2626
install_requires =
27-
fsspec>=2022.2.0,<2023.9.0
27+
fsspec>=2022.2.0
2828
pyyaml>=3.13
2929
xxhash>=1.0.0
3030
pandas>=0.23.0

0 commit comments

Comments
 (0)