Skip to content

Commit ced0b63

Browse files
authored
Add ObjectStoreRegistry (#4)
1 parent fc8438e commit ced0b63

File tree

4 files changed

+323
-2
lines changed

4 files changed

+323
-2
lines changed

README.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,17 @@
1212

1313
## Installation
1414

15+
```bash
16+
python -m pip install obspec-utils
17+
```
18+
19+
## Setup development environment
20+
1521
```console
1622
git clone https://github.com/virtual-zarr/obspec-utils.git
1723
cd obspec-utils
18-
hatch env create
24+
uv sync --all-groups
25+
uv run --all-groups pytest --cov-config=pyproject.toml --cov=pkg --cov-report xml --cov=src
1926
```
2027

2128
## License

src/obspec_utils/__init__.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,10 @@
11
from ._version import __version__
22
from .file_handlers import ObstoreMemCacheReader, ObstoreReader
3+
from .registry import ObjectStoreRegistry
34

4-
__all__ = ["__version__", "ObstoreMemCacheReader", "ObstoreReader"]
5+
__all__ = [
6+
"__version__",
7+
"ObstoreMemCacheReader",
8+
"ObstoreReader",
9+
"ObjectStoreRegistry",
10+
]

src/obspec_utils/registry.py

Lines changed: 274 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,274 @@
1+
"""
2+
Based on https://docs.rs/object_store/0.12.2/src/object_store/registry.rs.html#176-218
3+
"""
4+
5+
from __future__ import annotations
6+
7+
from collections import namedtuple
8+
from typing import TYPE_CHECKING, Dict, Iterator, Optional, Tuple, TypeAlias
9+
from urllib.parse import urlparse
10+
11+
if TYPE_CHECKING:
12+
from obstore.store import (
13+
ObjectStore,
14+
)
15+
16+
Url: TypeAlias = str
17+
Path: TypeAlias = str
18+
19+
UrlKey = namedtuple("UrlKey", ["scheme", "netloc"])
20+
21+
22+
def get_url_key(url: Url) -> UrlKey:
    """
    Build the UrlKey (scheme plus authority/netloc) for a url.

    The resulting key is what is used as the primary key of
    [ObjectStoreRegistry.map][obspec_utils.registry.ObjectStoreRegistry.map].

    Parameters
    ----------
    url
        Url to generate a UrlKey from.

    Returns
    -------
    NamedTuple containing the url's scheme and authority/netloc.

    Raises
    ------
    ValueError
        If [urllib.parse.urlparse][] finds no scheme in the provided url.
    """
    parsed = urlparse(url)
    if parsed.scheme:
        return UrlKey(scheme=parsed.scheme, netloc=parsed.netloc)
    # No scheme: report both the raw input and how it was parsed.
    raise ValueError(
        f"Urls are expected to contain a scheme (e.g., `file://` or `s3://`), received {url} which parsed to {parsed}"
    )
47+
48+
49+
class PathEntry:
    """
    Node in a tree of path segments, starting from the root path.

    Each node optionally holds a store and maps the next path segment to a
    child node. For example, registering the paths:

    * `/` => store1
    * `/foo/bar` => store2

    produces the tree:

        store: store1
        children:
          foo:
            store: None
            children:
              bar:
                store: store2
    """

    def __init__(self) -> None:
        # Child nodes keyed by the next path segment.
        self.children: Dict[str, "PathEntry"] = {}
        # Store registered exactly at this node, if any.
        self.store: Optional[ObjectStore] = None

    def lookup(self, to_resolve: str) -> Optional[Tuple[ObjectStore, int]]:
        """
        Find the store registered under the longest matching prefix of a URL path.

        Returns the store together with the number of path segments that were
        matched to reach it, or None when no node along the path holds a store.
        """
        # A store on this node matches every path, at depth zero.
        best = None if self.store is None else (self.store, 0)
        node = self

        # Walk down segment by segment, remembering the deepest store seen.
        for depth, segment in enumerate(path_segments(to_resolve), start=1):
            child = node.children.get(segment)
            if child is None:
                break
            node = child
            if node.store is not None:
                best = (node.store, depth)

        return best
92+
93+
94+
class ObjectStoreRegistry:
    """
    Registry that maps Urls to [ObjectStore][obstore.store.ObjectStore] instances.

    Stores are keyed first by the url's scheme and authority/netloc and then by
    a tree of path segments, so that a url can be resolved to the store
    registered under its longest matching path prefix.
    """

    def __init__(self, stores: dict[Url, ObjectStore] | None = None) -> None:
        """
        Create a new store registry that matches the provided Urls and
        [ObjectStore][obstore.store.ObjectStore] instances.

        Parameters
        ----------
        stores
            Mapping of [Url][obspec_utils.registry.Url] to the [ObjectStore][obstore.store.ObjectStore]
            to be registered under the [Url][obspec_utils.registry.Url].

        Raises
        ------
        ValueError
            If any of the provided Urls does not contain a scheme.

        Examples
        --------

        ```python exec="on" source="above" session="registry-examples"
        from obstore.store import S3Store
        from obspec_utils import ObjectStoreRegistry

        s3store = S3Store(bucket="my-bucket-1", prefix="orig-path")
        reg = ObjectStoreRegistry({"s3://my-bucket-1": s3store})

        ret, path = reg.resolve("s3://my-bucket-1/orig-path/group/my-file.nc")
        assert path == "group/my-file.nc"
        assert ret is s3store
        ```
        """
        # Mapping from UrlKey (containing scheme and netloc) to PathEntry
        self.map: Dict[UrlKey, PathEntry] = {}
        for url, store in (stores or {}).items():
            self.register(url, store)

    def register(self, url: Url, store: ObjectStore) -> None:
        """
        Register a new store for the provided store [Url][obspec_utils.registry.Url].

        If a store with the same [Url][obspec_utils.registry.Url] existed before, it is replaced.

        Parameters
        ----------
        url
            [Url][obspec_utils.registry.Url] to register the [ObjectStore][obstore.store.ObjectStore] under.
        store
            [ObjectStore][obstore.store.ObjectStore] instance to register using the
            provided [Url][obspec_utils.registry.Url].

        Raises
        ------
        ValueError
            If the provided Url does not contain a scheme.

        Examples
        --------

        ```python exec="on" source="above" session="registry-examples"
        from obstore.store import S3Store
        from obspec_utils import ObjectStoreRegistry

        reg = ObjectStoreRegistry()
        orig_store = S3Store(bucket="my-bucket-1", prefix="orig-path")
        reg.register("s3://my-bucket-1", orig_store)

        new_store = S3Store(bucket="my-bucket-1", prefix="updated-path")
        reg.register("s3://my-bucket-1", new_store)
        ```
        """
        # get_url_key validates the scheme first, so an invalid url raises
        # before the registry is modified.
        key = get_url_key(url)
        entry = self.map.setdefault(key, PathEntry())

        # Navigate to the correct path in the tree, creating nodes as needed
        for segment in path_segments(urlparse(url).path):
            entry = entry.children.setdefault(segment, PathEntry())

        # Update the store
        entry.store = store

    def resolve(self, url: Url) -> Tuple[ObjectStore, Path]:
        """
        Resolve an URL within the [ObjectStoreRegistry][obspec_utils.registry.ObjectStoreRegistry].

        If [ObjectStoreRegistry.register][obspec_utils.registry.ObjectStoreRegistry.register] has been called
        with a URL with the same scheme and authority/netloc as the object URL, and a path that is a prefix
        of the provided url's, it is returned along with the trailing path. Paths are matched on a
        path segment basis, and in the event of multiple possibilities the longest path match is used.

        Parameters
        ----------
        url
            Url to resolve in the [ObjectStoreRegistry][obspec_utils.registry.ObjectStoreRegistry]

        Returns
        -------
        ObjectStore
            The [ObjectStore][obstore.store.ObjectStore] stored at the resolved url.
        Path
            The url's path with leading slashes removed and with the matching store's own
            prefix stripped: the store's `prefix` attribute when it is set, otherwise the path
            component of the store's `url` attribute when it has one. The prefix is only stripped
            when it matches whole leading path segments. The path of the Url the store was
            registered under is not itself stripped (see the second example below).

        Raises
        ------
        ValueError
            If the URL cannot be resolved, meaning that [ObjectStoreRegistry.register][obspec_utils.registry.ObjectStoreRegistry.register]
            has not been called with a URL with the same scheme and authority/netloc as the object URL, and a path that is a prefix
            of the provided url's.

        Examples
        --------

        ```python exec="on" source="above" session="registry-resolve-examples"
        from obstore.store import MemoryStore, S3Store
        from obspec_utils import ObjectStoreRegistry

        registry = ObjectStoreRegistry()
        memstore1 = MemoryStore()
        registry.register("s3://bucket1", memstore1)
        url = "s3://bucket1/path/to/object"
        ret, path = registry.resolve(url)
        assert path == "path/to/object"
        assert ret is memstore1
        print(f"Resolved url: `{url}` to store: `{ret}` and path: `{path}`")
        ```

        ```python exec="on" source="above" session="registry-resolve-examples"
        memstore2 = MemoryStore()
        base = "https://s3.region.amazonaws.com/bucket"
        registry.register(base, memstore2)

        url = "https://s3.region.amazonaws.com/bucket/path/to/object"
        ret, path = registry.resolve(url)
        assert path == "bucket/path/to/object"
        assert ret is memstore2
        print(f"Resolved url: `{url}` to store: `{ret}` and path: `{path}`")
        ```

        ```python exec="on" source="above" session="registry-resolve-examples"
        s3store = S3Store(bucket = "my-bucket", prefix="my-data/prefix/")
        registry.register("s3://my-bucket", s3store)
        ret, path = registry.resolve("s3://my-bucket/my-data/prefix/my-file.nc")
        assert path == "my-file.nc"
        assert ret is s3store
        ```
        """
        parsed = urlparse(url)
        entry = self.map.get(UrlKey(parsed.scheme, parsed.netloc))
        match = entry.lookup(parsed.path) if entry is not None else None
        if match is None:
            raise ValueError(f"Could not find an ObjectStore matching the url `{url}`")

        # The matched depth is not needed: the returned path is made relative
        # to the store's own prefix rather than to the registered url.
        store, _depth = match
        if getattr(store, "prefix", None):
            store_prefix = str(store.prefix)
        elif hasattr(store, "url"):
            store_prefix = urlparse(store.url).path
        else:
            store_prefix = ""
        return store, self._strip_prefix(parsed.path, store_prefix)

    @staticmethod
    def _strip_prefix(path: str, prefix: str) -> str:
        """
        Return `path` without leading slashes and without a leading `prefix`.

        The prefix is only removed when it matches whole path segments, so a
        prefix of `data` is stripped from `data/file` but not from `database/file`.
        """
        path = path.lstrip("/")
        prefix = prefix.strip("/")
        if not prefix:
            return path
        if path == prefix:
            return ""
        if path.startswith(prefix + "/"):
            return path[len(prefix) + 1 :].lstrip("/")
        # Prefix does not match on a segment boundary: leave the path untouched.
        return path
263+
264+
265+
def path_segments(path: str) -> Iterator[str]:
    """
    Iterate over the non-empty `/`-separated segments of a path.

    Note: Empty segments (from leading, trailing or repeated slashes) are
    dropped, unlike urllib.parse.
    """
    pieces = path.split("/")
    return (piece for piece in pieces if piece)
272+
273+
274+
__all__ = ["ObjectStoreRegistry"]

tests/test_registry.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import pytest
2+
from obstore.store import MemoryStore
3+
4+
from obspec_utils import ObjectStoreRegistry
5+
6+
7+
def test_registry():
    """Resolving a url under a registered bucket yields that store and the object path."""
    store = MemoryStore()
    registry = ObjectStoreRegistry()
    registry.register("s3://bucket1", store)

    resolved_store, resolved_path = registry.resolve("s3://bucket1/path/to/object")

    assert resolved_path == "path/to/object"
    assert resolved_store is store
15+
16+
17+
def test_register_raises():
    """Registering a url without a scheme raises a ValueError describing the parse result."""
    registry = ObjectStoreRegistry()
    store = MemoryStore()
    url = "bucket1/path/to/object"
    with pytest.raises(
        ValueError,
        match=r"Urls are expected to contain a scheme \(e\.g\., `file://` or `s3://`\), received .* which parsed to ParseResult\(scheme='.*', netloc='.*', path='.*', params='.*', query='.*', fragment='.*'\)",
    ):
        # register() returns None, so its result is not unpacked; only the
        # call expected to raise lives inside the context manager.
        registry.register(url, store)
25+
26+
27+
def test_resolve_raises():
    """Resolving a url with no matching registered store raises a ValueError."""
    registry = ObjectStoreRegistry()
    url = "s3://bucket1/path/to/object"
    expected = "Could not find an ObjectStore matching the url `s3://bucket1/path/to/object`"
    with pytest.raises(ValueError, match=expected):
        registry.resolve(url)

0 commit comments

Comments
 (0)