Skip to content

Commit c8941ba

Browse files
committed
add s3 compatible object store support with s3+https://...
1 parent 3b55281 commit c8941ba

File tree

5 files changed

+280
-8
lines changed

5 files changed

+280
-8
lines changed

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -434,6 +434,8 @@ file = "zarr.storage._builtin_adapters:FileSystemAdapter"
434434
memory = "zarr.storage._builtin_adapters:MemoryAdapter"
435435
https = "zarr.storage._builtin_adapters:HttpsAdapter"
436436
s3 = "zarr.storage._builtin_adapters:S3Adapter"
437+
"s3+http" = "zarr.storage._builtin_adapters:S3HttpAdapter"
438+
"s3+https" = "zarr.storage._builtin_adapters:S3HttpsAdapter"
437439
gs = "zarr.storage._builtin_adapters:GSAdapter"
438440
log = "zarr.storage._builtin_adapters:LoggingAdapter"
439441
zip = "zarr.storage._builtin_adapters:ZipAdapter"

src/zarr/abc/store_adapter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,5 +192,5 @@ def __init_subclass__(cls, **kwargs: Any) -> None:
192192

193193
import re
194194

195-
if not re.match(r"^[a-zA-Z][a-zA-Z0-9_-]*$", cls.adapter_name):
195+
if not re.match(r"^[a-zA-Z][a-zA-Z0-9_+-]*$", cls.adapter_name):
196196
raise ValueError(f"Invalid adapter_name format: {cls.adapter_name}")

src/zarr/storage/_builtin_adapters.py

Lines changed: 118 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -223,13 +223,129 @@ def get_supported_schemes(cls) -> list[str]:
223223

224224

225225
class S3Adapter(RemoteAdapter):
226-
"""Store adapter for S3 URLs using fsspec."""
226+
"""Store adapter for S3 URLs using fsspec.
227+
228+
Supports:
229+
- Standard AWS S3: s3://bucket/path
230+
- Custom S3 endpoints: s3+http://endpoint/bucket/path, s3+https://endpoint/bucket/path
231+
"""
227232

228233
adapter_name = "s3"
229234

230235
@classmethod
231236
def get_supported_schemes(cls) -> list[str]:
232-
return ["s3"]
237+
return ["s3", "s3+http", "s3+https"]
238+
239+
@classmethod
240+
def _parse_s3_url(cls, url: str) -> tuple[str, str | None, dict[str, Any]]:
241+
"""Parse S3 URL and return (s3_url, endpoint_url, storage_options).
242+
243+
Returns:
244+
- s3_url: Normalized s3:// URL for fsspec
245+
- endpoint_url: Custom endpoint URL (None for AWS)
246+
- storage_options: Additional fsspec configuration
247+
"""
248+
if url.startswith("s3://"):
249+
# Standard AWS S3
250+
return url, None, {}
251+
252+
elif url.startswith("s3+http://"):
253+
# Custom S3 via HTTP: s3+http://endpoint/bucket/path
254+
# Remove "s3+" prefix and parse the remaining http URL
255+
http_url = url[3:] # "http://endpoint/bucket/path"
256+
257+
# Find the first '/' after "http://"
258+
after_protocol = http_url[7:] # Everything after "http://"
259+
if "/" in after_protocol:
260+
# Split at the first '/' to separate endpoint from path
261+
endpoint_part, path_part = after_protocol.split("/", 1)
262+
endpoint_url = f"http://{endpoint_part}"
263+
s3_url = f"s3://{path_part}"
264+
else:
265+
# No path part, just endpoint
266+
endpoint_url = http_url
267+
s3_url = "s3://"
268+
269+
storage_options = {"endpoint_url": endpoint_url, "use_ssl": False}
270+
return s3_url, endpoint_url, storage_options
271+
272+
elif url.startswith("s3+https://"):
273+
# Custom S3 via HTTPS: s3+https://endpoint/bucket/path
274+
# Remove "s3+" prefix and parse the remaining https URL
275+
https_url = url[3:] # "https://endpoint/bucket/path"
276+
277+
# Find the first '/' after "https://"
278+
after_protocol = https_url[8:] # Everything after "https://"
279+
if "/" in after_protocol:
280+
# Split at the first '/' to separate endpoint from path
281+
endpoint_part, path_part = after_protocol.split("/", 1)
282+
endpoint_url = f"https://{endpoint_part}"
283+
s3_url = f"s3://{path_part}"
284+
else:
285+
# No path part, just endpoint
286+
endpoint_url = https_url
287+
s3_url = "s3://"
288+
289+
storage_options = {"endpoint_url": endpoint_url, "use_ssl": True}
290+
return s3_url, endpoint_url, storage_options
291+
292+
else:
293+
raise ValueError(f"Unsupported S3 URL format: {url}")
294+
295+
@classmethod
296+
async def from_url_segment(
297+
cls,
298+
segment: URLSegment,
299+
preceding_url: str,
300+
**kwargs: Any,
301+
) -> Store:
302+
"""Create an FsspecStore for S3 URLs with custom endpoint support."""
303+
from zarr.storage._fsspec import FsspecStore
304+
305+
# Parse the S3 URL to extract endpoint information
306+
s3_url, endpoint_url, endpoint_storage_options = cls._parse_s3_url(preceding_url)
307+
308+
# Merge storage options (user-provided options take precedence)
309+
storage_options = endpoint_storage_options.copy()
310+
user_storage_options = kwargs.get("storage_options", {})
311+
storage_options.update(user_storage_options)
312+
313+
# Determine read-only mode (S3 can be writable)
314+
read_only = cls._determine_read_only_mode(preceding_url, **kwargs)
315+
316+
return FsspecStore.from_url(s3_url, storage_options=storage_options, read_only=read_only)
317+
318+
@classmethod
319+
def _extract_scheme(cls, url: str) -> str:
320+
"""Extract scheme from URL, handling composite schemes."""
321+
if url.startswith("s3+http://"):
322+
return "s3+http"
323+
elif url.startswith("s3+https://"):
324+
return "s3+https"
325+
elif url.startswith("s3://"):
326+
return "s3"
327+
else:
328+
return url.split("://", 1)[0]
329+
330+
331+
class S3HttpAdapter(S3Adapter):
332+
"""Store adapter for custom S3 HTTP endpoints."""
333+
334+
adapter_name = "s3+http"
335+
336+
@classmethod
337+
def get_supported_schemes(cls) -> list[str]:
338+
return ["s3+http"]
339+
340+
341+
class S3HttpsAdapter(S3Adapter):
342+
"""Store adapter for custom S3 HTTPS endpoints."""
343+
344+
adapter_name = "s3+https"
345+
346+
@classmethod
347+
def get_supported_schemes(cls) -> list[str]:
348+
return ["s3+https"]
233349

234350

235351
class GSAdapter(RemoteAdapter):

src/zarr/storage/_register_adapters.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ def register_builtin_adapters() -> None:
2222
LoggingAdapter,
2323
MemoryAdapter,
2424
S3Adapter,
25+
S3HttpAdapter,
26+
S3HttpsAdapter,
2527
ZipAdapter,
2628
)
2729

@@ -33,6 +35,8 @@ def register_builtin_adapters() -> None:
3335
MemoryAdapter,
3436
HttpsAdapter,
3537
S3Adapter,
38+
S3HttpAdapter,
39+
S3HttpsAdapter,
3640
GSAdapter,
3741
LoggingAdapter,
3842
ZipAdapter,

tests/test_store/test_zep8.py

Lines changed: 155 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,14 @@
1818
from zarr.core.buffer import default_buffer_prototype
1919
from zarr.registry import get_store_adapter, register_store_adapter
2020
from zarr.storage import FsspecStore, LocalStore, LoggingStore, MemoryStore, ZipStore
21-
from zarr.storage._builtin_adapters import GSAdapter, HttpsAdapter, LoggingAdapter, S3Adapter
21+
from zarr.storage._builtin_adapters import (
22+
GSAdapter,
23+
HttpsAdapter,
24+
LoggingAdapter,
25+
S3Adapter,
26+
S3HttpAdapter,
27+
S3HttpsAdapter,
28+
)
2229
from zarr.storage._common import make_store_path
2330
from zarr.storage._zep8 import URLParser, URLStoreResolver, ZEP8URLError, is_zep8_url
2431

@@ -513,6 +520,8 @@ def test_fsspec_zep8_url_detection() -> None:
513520
# These should be detected as ZEP 8 URLs
514521
zep8_urls = [
515522
"s3://bucket/data.zip|zip:",
523+
"s3+http://minio.local:9000/bucket/data.zip|zip:",
524+
"s3+https://storage.example.com/bucket/data.zarr|zarr3:",
516525
"https://example.com/data|zip:|zarr3:",
517526
"gs://bucket/data.zarr|zarr2:",
518527
]
@@ -538,7 +547,7 @@ async def test_fsspec_adapter_error_handling() -> None:
538547
# Test S3 adapter with invalid URL
539548
segment = URLSegment(scheme="s3", path="bucket/data", adapter=None)
540549

541-
with pytest.raises(ValueError, match="Unsupported scheme"):
550+
with pytest.raises(ValueError, match="Unsupported S3 URL format"):
542551
await S3Adapter.from_url_segment(segment, "invalid://url")
543552

544553
# Test HTTPS adapter with invalid URL
@@ -566,7 +575,7 @@ def test_fsspec_schemes_support() -> None:
566575

567576
# Test S3 adapter
568577
assert S3Adapter.can_handle_scheme("s3")
569-
assert S3Adapter.get_supported_schemes() == ["s3"]
578+
assert S3Adapter.get_supported_schemes() == ["s3", "s3+http", "s3+https"]
570579

571580
# Test HTTPS adapter
572581
assert HttpsAdapter.can_handle_scheme("https")
@@ -589,6 +598,8 @@ async def test_fsspec_url_chain_parsing() -> None:
589598
# Test complex chained URLs
590599
complex_urls = [
591600
"s3://bucket/archive.zip|zip:data/|zarr3:group",
601+
"s3+http://minio.local:9000/bucket/archive.zip|zip:data/|zarr3:group",
602+
"s3+https://storage.example.com/bucket/nested.zip|zip:inner/|zarr2:",
592603
"https://example.com/data.tar.gz|tar:|zip:|zarr2:",
593604
"gs://bucket/dataset.zarr|zarr3:array/subarray",
594605
]
@@ -1223,16 +1234,149 @@ async def test_remote_adapter_comprehensive() -> None:
12231234

12241235

12251236
async def test_s3_adapter_functionality() -> None:
1226-
"""Test S3Adapter functionality."""
1237+
"""Test S3Adapter functionality including custom endpoints."""
12271238
from zarr.storage._builtin_adapters import S3Adapter
12281239

1229-
# Test can_handle_scheme
1240+
# Test can_handle_scheme for all supported schemes
12301241
assert S3Adapter.can_handle_scheme("s3")
1242+
assert S3Adapter.can_handle_scheme("s3+http")
1243+
assert S3Adapter.can_handle_scheme("s3+https")
12311244
assert not S3Adapter.can_handle_scheme("gs")
1245+
assert not S3Adapter.can_handle_scheme("http")
12321246

12331247
# Test get_supported_schemes
12341248
schemes = S3Adapter.get_supported_schemes()
12351249
assert "s3" in schemes
1250+
assert "s3+http" in schemes
1251+
assert "s3+https" in schemes
1252+
assert len(schemes) == 3
1253+
1254+
1255+
async def test_s3_custom_endpoint_url_parsing() -> None:
1256+
"""Test S3Adapter URL parsing for custom endpoints."""
1257+
from zarr.storage._builtin_adapters import S3Adapter
1258+
1259+
# Test standard AWS S3 URL parsing
1260+
s3_url, endpoint_url, storage_options = S3Adapter._parse_s3_url("s3://my-bucket/path/to/data")
1261+
assert s3_url == "s3://my-bucket/path/to/data"
1262+
assert endpoint_url is None
1263+
assert storage_options == {}
1264+
1265+
# Test custom HTTP endpoint parsing
1266+
s3_url, endpoint_url, storage_options = S3Adapter._parse_s3_url(
1267+
"s3+http://minio.local:9000/my-bucket/data"
1268+
)
1269+
assert s3_url == "s3://my-bucket/data"
1270+
assert endpoint_url == "http://minio.local:9000"
1271+
assert storage_options == {"endpoint_url": "http://minio.local:9000", "use_ssl": False}
1272+
1273+
# Test custom HTTPS endpoint parsing
1274+
s3_url, endpoint_url, storage_options = S3Adapter._parse_s3_url(
1275+
"s3+https://storage.example.com/bucket/path/file.zarr"
1276+
)
1277+
assert s3_url == "s3://bucket/path/file.zarr"
1278+
assert endpoint_url == "https://storage.example.com"
1279+
assert storage_options == {"endpoint_url": "https://storage.example.com", "use_ssl": True}
1280+
1281+
# Test custom HTTP endpoint with port
1282+
s3_url, endpoint_url, storage_options = S3Adapter._parse_s3_url(
1283+
"s3+http://localhost:9000/test-bucket"
1284+
)
1285+
assert s3_url == "s3://test-bucket"
1286+
assert endpoint_url == "http://localhost:9000"
1287+
assert storage_options["endpoint_url"] == "http://localhost:9000"
1288+
assert storage_options["use_ssl"] is False
1289+
1290+
# Test edge case: endpoint without path
1291+
s3_url, endpoint_url, storage_options = S3Adapter._parse_s3_url("s3+https://minio.example.com")
1292+
assert s3_url == "s3://"
1293+
assert endpoint_url == "https://minio.example.com"
1294+
1295+
1296+
async def test_s3_custom_endpoint_scheme_extraction() -> None:
1297+
"""Test S3Adapter scheme extraction for custom endpoints."""
1298+
from zarr.storage._builtin_adapters import S3Adapter
1299+
1300+
# Test scheme extraction
1301+
assert S3Adapter._extract_scheme("s3://bucket/path") == "s3"
1302+
assert S3Adapter._extract_scheme("s3+http://minio.local:9000/bucket") == "s3+http"
1303+
assert S3Adapter._extract_scheme("s3+https://storage.example.com/bucket") == "s3+https"
1304+
1305+
1306+
async def test_s3_custom_endpoint_error_handling() -> None:
1307+
"""Test S3Adapter error handling for invalid URLs."""
1308+
from zarr.storage._builtin_adapters import S3Adapter
1309+
1310+
# Test unsupported URL format
1311+
with pytest.raises(ValueError, match="Unsupported S3 URL format"):
1312+
S3Adapter._parse_s3_url("invalid://not-s3")
1313+
1314+
with pytest.raises(ValueError, match="Unsupported S3 URL format"):
1315+
S3Adapter._parse_s3_url("gs://bucket/path")
1316+
1317+
1318+
async def test_s3_custom_endpoint_registration() -> None:
1319+
"""Test that custom S3 endpoint schemes are properly registered."""
1320+
from zarr.registry import get_store_adapter
1321+
1322+
# Test that all S3 schemes can be retrieved
1323+
s3_adapter = get_store_adapter("s3")
1324+
assert s3_adapter is not None
1325+
assert s3_adapter == S3Adapter
1326+
1327+
s3_http_adapter = get_store_adapter("s3+http")
1328+
assert s3_http_adapter is not None
1329+
assert s3_http_adapter == S3HttpAdapter
1330+
1331+
s3_https_adapter = get_store_adapter("s3+https")
1332+
assert s3_https_adapter is not None
1333+
assert s3_https_adapter == S3HttpsAdapter
1334+
1335+
1336+
async def test_s3_http_adapter_functionality() -> None:
1337+
"""Test S3HttpAdapter specific functionality."""
1338+
# Test adapter name
1339+
assert S3HttpAdapter.adapter_name == "s3+http"
1340+
1341+
# Test supported schemes
1342+
schemes = S3HttpAdapter.get_supported_schemes()
1343+
assert schemes == ["s3+http"]
1344+
1345+
# Test can_handle_scheme
1346+
assert S3HttpAdapter.can_handle_scheme("s3+http")
1347+
assert not S3HttpAdapter.can_handle_scheme("s3")
1348+
assert not S3HttpAdapter.can_handle_scheme("s3+https")
1349+
1350+
1351+
async def test_s3_https_adapter_functionality() -> None:
1352+
"""Test S3HttpsAdapter specific functionality."""
1353+
# Test adapter name
1354+
assert S3HttpsAdapter.adapter_name == "s3+https"
1355+
1356+
# Test supported schemes
1357+
schemes = S3HttpsAdapter.get_supported_schemes()
1358+
assert schemes == ["s3+https"]
1359+
1360+
# Test can_handle_scheme
1361+
assert S3HttpsAdapter.can_handle_scheme("s3+https")
1362+
assert not S3HttpsAdapter.can_handle_scheme("s3")
1363+
assert not S3HttpsAdapter.can_handle_scheme("s3+http")
1364+
1365+
1366+
async def test_s3_custom_endpoint_zep8_url_detection() -> None:
1367+
"""Test ZEP 8 URL detection with custom S3 endpoints."""
1368+
from zarr.storage._zep8 import is_zep8_url
1369+
1370+
# Standard S3 URLs (not ZEP 8)
1371+
assert not is_zep8_url("s3://bucket/data")
1372+
assert not is_zep8_url("s3+http://minio.local:9000/bucket/data")
1373+
assert not is_zep8_url("s3+https://storage.example.com/bucket/data")
1374+
1375+
# ZEP 8 URLs with custom S3 endpoints
1376+
assert is_zep8_url("s3://bucket/data.zip|zip:")
1377+
assert is_zep8_url("s3+http://minio.local:9000/bucket/data.zip|zip:")
1378+
assert is_zep8_url("s3+https://storage.example.com/bucket/data|zarr3:")
1379+
assert is_zep8_url("s3+http://localhost:9000/bucket/archive.zip|zip:data/|zarr2:")
12361380

12371381

12381382
async def test_gcs_adapter_functionality() -> None:
@@ -1346,6 +1490,8 @@ def test_builtin_adapters_imports_and_module_structure() -> None:
13461490
MemoryAdapter,
13471491
RemoteAdapter,
13481492
S3Adapter,
1493+
S3HttpAdapter,
1494+
S3HttpsAdapter,
13491495
ZipAdapter,
13501496
)
13511497

@@ -1354,13 +1500,17 @@ def test_builtin_adapters_imports_and_module_structure() -> None:
13541500
assert MemoryAdapter.adapter_name == "memory"
13551501
assert RemoteAdapter.adapter_name == "remote"
13561502
assert S3Adapter.adapter_name == "s3"
1503+
assert S3HttpAdapter.adapter_name == "s3+http"
1504+
assert S3HttpsAdapter.adapter_name == "s3+https"
13571505
assert GSAdapter.adapter_name == "gs"
13581506
assert HttpsAdapter.adapter_name == "https"
13591507
assert LoggingAdapter.adapter_name == "log"
13601508
assert ZipAdapter.adapter_name == "zip"
13611509

13621510
# Test inheritance relationships
13631511
assert issubclass(S3Adapter, RemoteAdapter)
1512+
assert issubclass(S3HttpAdapter, S3Adapter)
1513+
assert issubclass(S3HttpsAdapter, S3Adapter)
13641514
assert issubclass(GSAdapter, RemoteAdapter)
13651515
assert issubclass(HttpsAdapter, RemoteAdapter)
13661516

0 commit comments

Comments
 (0)