Skip to content

Commit 92002a3

Browse files
authored
Add full simplecache protocol support (#453)
* tests: adjust fsspec compat test
* tests: adjust intended behavior for tests
* upath._chain: fix unchain behavior
* upath.implementations.cached: add simplecachepath
* upath.core: fix chain behavior
* upath._chain: refactor unchain
* upath.core: fix rename behavior for anchored paths with empty root_marker
* upath.core: fix pickling for relative paths
* upath._chain: typing fixes
* upath._flavour_sources: add simplecache flavour
* tests: add simple cache tests
* upath.core: adjust rename logic
* upath.core: fix rename for non-local targets
1 parent 2fe5882 commit 92002a3

File tree

8 files changed

+227
-96
lines changed

8 files changed

+227
-96
lines changed

dev/fsspec_inspector/generate_flavours.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,6 @@ def __init_subclass__(cls: Any, **kwargs):
105105
"dir",
106106
"blockcache",
107107
"cached",
108-
"simplecache",
109108
"filecache",
110109
]
111110

@@ -116,6 +115,7 @@ def __init_subclass__(cls: Any, **kwargs):
116115

117116
FIX_METHODS = {
118117
"GCSFileSystem": ["_strip_protocol", "_get_kwargs_from_urls", "_split_path"],
118+
"SimpleCacheFileSystem": [],
119119
}
120120

121121

upath/_chain.py

Lines changed: 116 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -4,23 +4,28 @@
44
import warnings
55
from collections import defaultdict
66
from collections import deque
7+
from collections.abc import Iterator
78
from collections.abc import MutableMapping
89
from collections.abc import Sequence
910
from collections.abc import Set
11+
from itertools import zip_longest
1012
from typing import TYPE_CHECKING
1113
from typing import Any
1214
from typing import NamedTuple
1315

16+
from upath._flavour import WrappedFileSystemFlavour
17+
from upath._protocol import get_upath_protocol
18+
from upath.registry import available_implementations
19+
from upath.types import UNSET_DEFAULT
20+
1421
if TYPE_CHECKING:
1522
if sys.version_info >= (3, 11):
23+
from typing import Never
1624
from typing import Self
1725
else:
26+
from typing_extensions import Never
1827
from typing_extensions import Self
1928

20-
from upath._flavour import WrappedFileSystemFlavour
21-
from upath._protocol import get_upath_protocol
22-
from upath.registry import available_implementations
23-
2429
__all__ = [
2530
"ChainSegment",
2631
"Chain",
@@ -153,74 +158,129 @@ def nest(self) -> ChainSegment:
153158
return ChainSegment(urlpath, protocol, inkwargs)
154159

155160

161+
def _iter_fileobject_protocol_options(
    fileobject: str | None,
    protocol: str,
    storage_options: dict[str, Any],
    /,
) -> Iterator[tuple[str | None, str, dict[str, Any]]]:
    """Yield ``(fileobject, protocol, options)`` for every nesting level.

    Walks the fsspec-style nesting expressed through the ``target_protocol``,
    ``fo`` and ``target_options`` keys of ``storage_options``. The first item
    describes the outermost level (the ``fileobject`` and ``protocol`` passed
    in); each following item describes the next wrapped level. The yielded
    options of a level never contain that level's ``target_protocol``,
    ``fo`` or ``target_options`` keys.

    Neither ``storage_options`` nor any nested ``target_options`` mapping is
    modified: every level is worked on as a shallow copy, so callers can keep
    reusing the mapping they passed in.

    Parameters:
        fileobject: path or file object of the outermost level, or ``None``.
        protocol: protocol of the outermost level.
        storage_options: storage options of the outermost level.

    Yields:
        Tuples of ``(fileobject, protocol, remaining_storage_options)``,
        outermost level first.
    """
    so = storage_options.copy()
    while "target_protocol" in so:
        t_protocol = so.pop("target_protocol", "")
        t_fileobject = so.pop("fo", None)  # codespell:ignore fo
        # Copy the nested mapping: the next loop iteration pops keys from it,
        # which would otherwise strip them from the caller's own dict.
        # A missing or ``None`` value is treated as "no further options".
        t_so = dict(so.pop("target_options", None) or {})
        yield fileobject, protocol, so
        fileobject, protocol, so = t_fileobject, t_protocol, t_so
    yield fileobject, protocol, so
176+
177+
156178
class FSSpecChainParser:
157179
"""parse an fsspec chained urlpath"""
158180

159181
def __init__(self) -> None:
160182
self.link: str = "::"
161183
self.known_protocols: Set[str] = set()
162184

163-
def unchain(self, path: str, kwargs: dict[str, Any]) -> list[ChainSegment]:
185+
def unchain(
186+
self,
187+
path: str,
188+
_deprecated_storage_options: Never = UNSET_DEFAULT,
189+
/,
190+
*,
191+
protocol: str | None = None,
192+
storage_options: dict[str, Any] | None = None,
193+
) -> list[ChainSegment]:
164194
"""implements same behavior as fsspec.core._un_chain
165195
166196
two differences:
167197
1. it sets the urlpath to None for upstream filesystems that passthrough
168198
2. it checks against the known protocols for exact matches
169199
170200
"""
171-
# TODO: upstream to fsspec
172-
first_bit_protocol: str | None = kwargs.pop("protocol", None)
173-
it_bits = iter(path.split(self.link))
174-
bits: list[str]
175-
if first_bit_protocol is not None:
176-
bits = [next(it_bits)]
177-
else:
178-
bits = []
179-
for p in it_bits:
180-
if "://" in p: # uri-like, fast-path
181-
bits.append(p)
182-
elif "/" in p: # path-like, fast-path
183-
bits.append(p)
184-
elif p in self.known_protocols: # exact match a fsspec protocol
185-
bits.append(f"{p}://")
186-
elif p in (m := set(available_implementations(fallback=True))):
187-
self.known_protocols = m
188-
bits.append(f"{p}://")
189-
else:
190-
bits.append(p)
191-
192-
# [[url, protocol, kwargs], ...]
193-
out: list[ChainSegment] = []
194-
previous_bit: str | None = None
195-
kwargs = kwargs.copy()
196-
first_bit_idx = len(bits) - 1
197-
for idx, bit in enumerate(reversed(bits)):
198-
if idx == first_bit_idx:
199-
protocol = first_bit_protocol or get_upath_protocol(bit) or ""
200-
else:
201-
protocol = get_upath_protocol(bit) or ""
202-
flavour = WrappedFileSystemFlavour.from_protocol(protocol)
203-
extra_kwargs = flavour.get_kwargs_from_url(bit)
204-
kws = kwargs.pop(protocol, {})
205-
if bit is bits[0]:
206-
kws.update(kwargs)
207-
kw = dict(**extra_kwargs)
208-
kw.update(kws)
209-
if "target_protocol" in kw:
210-
kw.setdefault("target_options", {})
211-
bit = flavour.strip_protocol(bit) or flavour.root_marker
201+
if _deprecated_storage_options is not UNSET_DEFAULT:
202+
warnings.warn(
203+
"passing storage_options as positional argument is deprecated, "
204+
"pass as keyword argument instead",
205+
DeprecationWarning,
206+
stacklevel=2,
207+
)
208+
if storage_options is not None:
209+
raise ValueError(
210+
"cannot pass storage_options both positionally and as keyword"
211+
)
212+
storage_options = _deprecated_storage_options
213+
protocol = protocol or storage_options.get("protocol")
214+
if storage_options is None:
215+
storage_options = {}
216+
217+
segments: list[ChainSegment] = []
218+
path_bit: str | None
219+
next_path_overwrite: str | None = None
220+
for proto0, bit in zip_longest([protocol], path.split(self.link)):
221+
# get protocol and path_bit
212222
if (
213-
protocol in {"blockcache", "filecache", "simplecache"}
214-
and "target_protocol" not in kw
223+
"://" in bit # uri-like, fast-path (redundant)
224+
or "/" in bit # path-like, fast-path
215225
):
216-
out.append(ChainSegment(None, protocol, kw))
217-
if previous_bit is not None:
218-
bit = previous_bit
226+
proto = get_upath_protocol(bit, protocol=proto0)
227+
flavour = WrappedFileSystemFlavour.from_protocol(proto)
228+
path_bit = flavour.strip_protocol(bit)
229+
extra_so = flavour.get_kwargs_from_url(bit)
230+
elif bit in self.known_protocols and (
231+
proto0 is None or bit == proto0
232+
): # exact match a fsspec protocol
233+
proto = bit
234+
path_bit = ""
235+
extra_so = {}
236+
elif bit in (m := set(available_implementations(fallback=True))) and (
237+
proto0 is None or bit == proto0
238+
):
239+
self.known_protocols = m
240+
proto = bit
241+
path_bit = ""
242+
extra_so = {}
243+
else:
244+
proto = get_upath_protocol(bit, protocol=proto0)
245+
flavour = WrappedFileSystemFlavour.from_protocol(proto)
246+
path_bit = flavour.strip_protocol(bit)
247+
extra_so = flavour.get_kwargs_from_url(bit)
248+
if proto in {"blockcache", "filecache", "simplecache"}:
249+
if path_bit:
250+
next_path_overwrite = path_bit
251+
path_bit = None
252+
elif next_path_overwrite is not None:
253+
path_bit = next_path_overwrite
254+
next_path_overwrite = None
255+
segments.append(ChainSegment(path_bit, proto, extra_so))
256+
257+
root_so = segments[0].storage_options
258+
for segment, proto_fo_so in zip_longest(
259+
segments,
260+
_iter_fileobject_protocol_options(
261+
path_bit if segments else None,
262+
protocol or "",
263+
storage_options,
264+
),
265+
):
266+
t_fo, t_proto, t_so = proto_fo_so or (None, "", {})
267+
if segment is None:
268+
if next_path_overwrite is not None:
269+
t_fo = next_path_overwrite
270+
next_path_overwrite = None
271+
segments.append(ChainSegment(t_fo, t_proto, t_so))
219272
else:
220-
out.append(ChainSegment(bit, protocol, kw))
221-
previous_bit = bit
222-
out.reverse()
223-
return out
273+
proto = segment.protocol
274+
# check if protocol is consistent with storage options
275+
if t_proto and t_proto != proto:
276+
raise ValueError(
277+
f"protocol {proto!r} collides with target_protocol {t_proto!r}"
278+
)
279+
# update the storage_options
280+
segment.storage_options.update(root_so.pop(proto, {}))
281+
segment.storage_options.update(t_so)
282+
283+
return segments
224284

225285
def chain(self, segments: Sequence[ChainSegment]) -> tuple[str, dict[str, Any]]:
226286
"""returns a chained urlpath from the segments"""
@@ -268,7 +328,7 @@ def chain(self, segments: Sequence[ChainSegment]) -> tuple[str, dict[str, Any]]:
268328
chained_kw = {"zip": {"allowZip64": False}}
269329
print(chained_path, chained_kw)
270330
out0 = _un_chain(chained_path, chained_kw)
271-
out1 = FSSpecChainParser().unchain(chained_path, chained_kw)
331+
out1 = FSSpecChainParser().unchain(chained_path, storage_options=chained_kw)
272332

273333
pp(out0)
274334
pp(out1)

upath/_flavour_sources.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
# - cached
2424
# - dir
2525
# - filecache
26-
# - simplecache
2726
# protocol import errors:
2827
# - gdrive (Please install gdrive_fs for access to Google Drive)
2928
# - generic (GenericFileSystem: '_strip_protocol' not a classmethod)
@@ -926,6 +925,15 @@ def _get_kwargs_from_urls(path):
926925
return out
927926

928927

928+
# Flavour attributes for the ``simplecache`` protocol.
class SimpleCacheFileSystemFlavour(AbstractFileSystemFlavour):
    # dotted path of the fsspec class these attributes refer to
    __orig_class__ = 'fsspec.implementations.cached.SimpleCacheFileSystem'
    # fsspec version recorded alongside the attributes
    __orig_version__ = '2025.9.0'
    # protocol name(s) associated with this flavour
    protocol = ('simplecache',)
    # root marker is the empty string for this flavour
    root_marker = ''
    # path separator
    sep = '/'
    # NOTE(review): presumably mirrors SimpleCacheFileSystem.local_file — confirm
    local_file = True
935+
936+
929937
class TarFileSystemFlavour(AbstractFileSystemFlavour):
930938
__orig_class__ = 'fsspec.implementations.tar.TarFileSystem'
931939
__orig_version__ = '2025.9.0'

upath/core.py

Lines changed: 35 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from __future__ import annotations
22

3-
import os
3+
import posixpath
44
import sys
55
import warnings
66
from abc import ABCMeta
@@ -423,9 +423,12 @@ def __init__(
423423
str_args0 = "."
424424

425425
segments = chain_parser.unchain(
426-
str_args0, {"protocol": protocol, **storage_options}
426+
str_args0,
427+
protocol=protocol,
428+
storage_options=storage_options,
427429
)
428-
chain = Chain.from_list(segments)
430+
# FIXME: normalization needs to happen in unchain already...
431+
chain = Chain.from_list(Chain.from_list(segments).to_list())
429432
if len(args) > 1:
430433
chain = chain.replace(
431434
path=WrappedFileSystemFlavour.from_protocol(protocol).join(
@@ -1102,14 +1105,14 @@ def group(self) -> str:
11021105

11031106
def absolute(self) -> Self:
11041107
if self._relative_base is not None:
1105-
return self.cwd().joinpath(str(self))
1108+
return self.cwd().joinpath(self.__vfspath__())
11061109
return self
11071110

11081111
def is_absolute(self) -> bool:
11091112
if self._relative_base is not None:
11101113
return False
11111114
else:
1112-
return self.parser.isabs(str(self))
1115+
return self.parser.isabs(self.__vfspath__())
11131116

11141117
def __eq__(self, other: object) -> bool:
11151118
"""UPaths are considered equal if their protocol, path and
@@ -1223,22 +1226,24 @@ def rename(
12231226
maxdepth: int | None = UNSET_DEFAULT,
12241227
**kwargs: Any,
12251228
) -> Self:
1226-
if isinstance(target, str) and self.storage_options:
1227-
target = UPath(target, **self.storage_options)
1229+
target_protocol = get_upath_protocol(target)
1230+
if target_protocol and target_protocol != self.protocol:
1231+
raise ValueError(
1232+
f"expected protocol {self.protocol!r}, got: {target_protocol!r}"
1233+
)
1234+
if not isinstance(target, UPath):
1235+
target = str(target)
1236+
if target_protocol or (self.anchor and target.startswith(self.anchor)):
1237+
target = self.with_segments(target)
1238+
else:
1239+
target = UPath(target)
12281240
if target == self:
12291241
return self
12301242
if self._relative_base is not None:
12311243
self = self.absolute()
12321244
target_protocol = get_upath_protocol(target)
12331245
if target_protocol:
1234-
if target_protocol != self.protocol:
1235-
raise ValueError(
1236-
f"expected protocol {self.protocol!r}, got: {target_protocol!r}"
1237-
)
1238-
if not isinstance(target, UPath):
1239-
target_ = UPath(target, **self.storage_options)
1240-
else:
1241-
target_ = target
1246+
target_ = target
12421247
# avoid calling .resolve for subclasses of UPath
12431248
if ".." in target_.parts or "." in target_.parts:
12441249
target_ = target_.resolve()
@@ -1247,7 +1252,7 @@ def rename(
12471252
# avoid calling .resolve for subclasses of UPath
12481253
if ".." in parent.parts or "." in parent.parts:
12491254
parent = parent.resolve()
1250-
target_ = parent.joinpath(os.path.normpath(str(target)))
1255+
target_ = parent.joinpath(posixpath.normpath(target.path))
12511256
if recursive is not UNSET_DEFAULT:
12521257
kwargs["recursive"] = recursive
12531258
if maxdepth is not UNSET_DEFAULT:
@@ -1275,14 +1280,20 @@ def root(self) -> str:
12751280
return self.parser.splitroot(str(self))[1]
12761281

12771282
def __reduce__(self):
1278-
args = tuple(self._raw_urlpaths)
1279-
kwargs = {
1280-
"protocol": self._protocol,
1281-
**self._storage_options,
1282-
}
1283-
# Include _relative_base in the state if it's set
1284-
if self._relative_base is not None:
1285-
kwargs["_relative_base"] = self._relative_base
1283+
if self._relative_base is None:
1284+
args = (self.__vfspath__(),)
1285+
kwargs = {
1286+
"protocol": self._protocol,
1287+
**self._storage_options,
1288+
}
1289+
else:
1290+
args = (self._relative_base, self.__vfspath__())
1291+
# Include _relative_base in the state if it's set
1292+
kwargs = {
1293+
"protocol": self._protocol,
1294+
**self._storage_options,
1295+
"_relative_base": self._relative_base,
1296+
}
12861297
return _make_instance, (type(self), args, kwargs)
12871298

12881299
@classmethod

0 commit comments

Comments
 (0)