Commit 11d374b

attempt to autodetect an s3 compatible URL
1 parent e1990c0 commit 11d374b
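
What this enables, sketched minimally: an HTTPS URL whose hostname looks like an S3-compatible endpoint is rewritten to an s3:// URL so the request goes through s3fs instead of the generic HTTP filesystem. The call below is illustrative: the URL comes from the new tests, and it assumes zarr.open reaches make_store internally when given a string store.

import zarr

# Hostname token "uk1s3" ends with "s3", so the URL should be rewritten to
# s3://idr/zarr/v0.5/file.zarr with the HTTPS origin installed as the
# s3fs endpoint_url (see _maybe_convert_http_to_s3 below).
arr = zarr.open(
    "https://uk1s3.embassy.ebi.ac.uk/idr/zarr/v0.5/file.zarr",
    mode="r",
)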

File tree

3 files changed: +294 -0 lines changed

src/zarr/storage/_common.py

Lines changed: 96 additions & 0 deletions
@@ -343,6 +343,9 @@ async def make_store(
     elif isinstance(store_like, str):
         # Either a FSSpec URI or a local filesystem path
         if _is_fsspec_uri(store_like):
+            # Check if this is an S3-compatible store accessed via HTTPS
+            # and convert it to use the s3:// protocol for proper support
+            store_like, storage_options = _maybe_convert_http_to_s3(store_like, storage_options)
             return FsspecStore.from_url(
                 store_like, storage_options=storage_options, read_only=_read_only
             )
@@ -439,6 +442,99 @@ def _is_fsspec_uri(uri: str) -> bool:
     return "://" in uri or ("::" in uri and "local://" not in uri)


+def _maybe_convert_http_to_s3(
+    url: str, storage_options: dict[str, Any] | None
+) -> tuple[str, dict[str, Any]]:
+    """
+    Detect if an HTTP(S) URL is actually an S3-compatible object store and convert it.
+
+    Checks if the URL hostname contains S3-related keywords (s3, minio, ceph, rgw, etc.)
+    and if so, converts it to use the s3:// protocol with proper endpoint configuration.
+
+    Parameters
+    ----------
+    url : str
+        The HTTP(S) URL to check.
+    storage_options : dict | None
+        Existing storage options (will be updated with S3 config if detected).
+
+    Returns
+    -------
+    tuple[str, dict[str, Any]]
+        The (potentially converted) URL and storage options.
+    """
+    from urllib.parse import urlparse
+
+    # Only process HTTP(S) URLs
+    if not url.startswith(("http://", "https://")):
+        return url, storage_options or {}
+
+    # Don't override if user already specified endpoint_url
+    opts = storage_options or {}
+    if "endpoint_url" in opts.get("client_kwargs", {}):
+        return url, opts
+
+    # Check hostname for S3-compatible patterns
+    parsed = urlparse(url)
+    # Extract hostname without port (e.g., "example.com:8080" -> "example.com")
+    hostname = parsed.hostname or parsed.netloc
+    hostname = hostname.lower()
+
+    # Use more precise matching - look for S3 keywords as separate parts
+    # Split by common separators (dots, dashes) to avoid false positives
+    # e.g., "uk1s3.example.com" -> ["uk1s3", "example", "com"]
+    hostname_parts = hostname.replace("-", ".").split(".")
+
+    # Check if any part contains s3 as a distinct token (not buried in random chars)
+    # - Exact match: "s3" in parts (s3.amazonaws.com)
+    # - Starts with "s3-": s3-us-west-2.amazonaws.com
+    # - Ends with "s3": uk1s3.embassy.ebi.ac.uk
+    def has_s3_token(part: str) -> bool:
+        return part == "s3" or part.startswith("s3-") or part.endswith("s3")
+
+    is_likely_s3 = (
+        any(has_s3_token(part) for part in hostname_parts)
+        or "object-store" in hostname  # Less likely to have false positives
+        or "objectstore" in hostname
+        or "minio" in hostname_parts  # minio.example.com
+        or "ceph" in hostname_parts  # ceph.example.com
+        or "rgw" in hostname_parts  # rgw.example.com (Ceph RADOS Gateway)
+    )
+
+    if not is_likely_s3:
+        return url, opts
+
+    # Parse S3 path components from URL path
+    endpoint = f"{parsed.scheme}://{parsed.netloc}"
+    s3_path = parsed.path.lstrip("/")
+
+    # Simple bucket/key parsing - split on first slash
+    # Format: https://endpoint/bucket/key/path -> s3://bucket/key/path
+    if s3_path:
+        parts = s3_path.split("/", 1)
+        bucket = parts[0]
+        key = parts[1] if len(parts) > 1 else ""
+    else:
+        bucket = ""
+        key = ""
+
+    # Convert to S3 URL
+    s3_url = f"s3://{bucket}/{key}"
+
+    # Update storage options - add endpoint_url without overriding user preferences
+    # (we already returned early above if the user had set endpoint_url)
+    if "client_kwargs" not in opts:
+        opts["client_kwargs"] = {}
+    opts["client_kwargs"]["endpoint_url"] = endpoint
+
+    # Note: We intentionally do NOT set 'anon' by default because:
+    # - If we set anon=True, it breaks users with credentials in environment variables
+    # - s3fs ignores env credentials when anon=True is explicitly set
+    # Users accessing public data should explicitly pass storage_options={"anon": True}
+
+    return s3_url, opts
+
+
 async def ensure_no_existing_node(
     store_path: StorePath,
     zarr_format: ZarrFormat,
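
The helper above can be exercised directly; a quick walkthrough of both paths, with URLs and expected results taken from the new tests (_maybe_convert_http_to_s3 is a private helper, so this is illustrative rather than supported API):

from zarr.storage._common import _maybe_convert_http_to_s3

# Detected: hostname token "uk1s3" ends with "s3", so the URL is rewritten and
# the HTTPS origin becomes the s3fs endpoint_url.
url, opts = _maybe_convert_http_to_s3(
    "https://uk1s3.embassy.ebi.ac.uk/idr/zarr/file.zarr", None
)
assert url == "s3://idr/zarr/file.zarr"
assert opts["client_kwargs"]["endpoint_url"] == "https://uk1s3.embassy.ebi.ac.uk"

# Not detected: "s3" sits mid-token in "users3000", so the URL passes through
# unchanged and no endpoint is injected.
url, opts = _maybe_convert_http_to_s3("https://users3000.example.com/file.zarr", None)
assert url == "https://users3000.example.com/file.zarr"
assert "client_kwargs" not in opts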

tests/test_store/test_core.py

Lines changed: 30 additions & 0 deletions
@@ -154,6 +154,36 @@ async def test_make_store_path_fsspec(monkeypatch) -> None:
     assert isinstance(store_path.store, FsspecStore)


+async def test_make_store_s3_url_detection() -> None:
+    """Test that make_store correctly detects and converts S3-compatible HTTPS URLs."""
+    pytest.importorskip("fsspec")
+    pytest.importorskip("s3fs")
+
+    from zarr.storage._common import make_store
+
+    url = "https://uk1s3.example.com/bucket/path"
+
+    # Create store from HTTP URL - should auto-detect S3 and create FsspecStore
+    store = await make_store(url, mode="r")
+
+    # Verify we got an FsspecStore
+    assert isinstance(store, FsspecStore)
+
+    # Verify the underlying filesystem is S3FileSystem
+    # Protocol can be "s3", ["s3"], or ("s3", "s3a") depending on s3fs version
+    protocol = store.fs.protocol
+    if isinstance(protocol, list | tuple):
+        assert "s3" in protocol
+    else:
+        assert protocol == "s3"
+
+    # Verify the endpoint was set correctly
+    assert store.fs.client_kwargs.get("endpoint_url") == "https://uk1s3.example.com"
+
+    # Note: We don't set anon by default to respect environment credentials
+    # Users must explicitly add storage_options={"anon": True} for public data
+
+
 async def test_make_store_path_storage_options_raises(store_like: StoreLike) -> None:
     with pytest.raises(TypeError, match="storage_options"):
         await make_store_path(store_like, storage_options={"foo": "bar"})
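
Because the conversion never touches credential options, anonymous reads of public buckets remain an explicit opt-in, as the note in the test above says. A minimal sketch of that opt-in (URL reused from the tests; assumes storage_options is forwarded to s3fs as usual):

import zarr

# The autodetection only fills in client_kwargs["endpoint_url"]; anon must be
# passed explicitly for public data.
arr = zarr.open(
    "https://uk1s3.embassy.ebi.ac.uk/idr/zarr/v0.5/file.zarr",
    mode="r",
    storage_options={"anon": True},
)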

tests/test_store/test_fsspec.py

Lines changed: 168 additions & 0 deletions
@@ -438,3 +438,171 @@ async def test_with_read_only_auto_mkdir(tmp_path: Path) -> None:

     store_w = FsspecStore.from_url(f"file://{tmp_path}", storage_options={"auto_mkdir": False})
     _ = store_w.with_read_only()
+
+
+class TestS3AutoDetection:
+    """Test automatic detection and conversion of S3-compatible URLs."""
+
+    @pytest.mark.parametrize(
+        ("url", "expected_detect"),
+        [
+            # Should detect as S3
+            ("https://s3.amazonaws.com/bucket/path", True),
+            ("https://s3-us-west-2.amazonaws.com/bucket/path", True),
+            ("https://bucket.s3.amazonaws.com/path", True),
+            ("https://uk1s3.embassy.ebi.ac.uk/idr/zarr/file.zarr", True),
+            ("https://us-west-2-s3.example.com/bucket/path", True),
+            ("https://minio.example.com/bucket/path", True),
+            ("https://ceph.example.com/bucket/path", True),
+            ("https://ceph-rgw.example.com/bucket/path", True),
+            ("https://rgw.example.com/bucket/path", True),
+            ("https://object-store.example.com/bucket/path", True),
+            ("https://my-objectstore.example.com/bucket/path", True),
+            # Should NOT detect as S3 (false positives to avoid)
+            ("https://someurls345.com/data/file.zarr", False),
+            ("https://descriptions.example.com/file.zarr", False),
+            ("https://users3000.example.com/file.zarr", False),
+            ("https://s3tuff.example.com/file.zarr", False),
+            ("https://s3archive.example.com/file.zarr", False),
+            ("https://example.com/data/s3/file.zarr", False),  # s3 in path, not hostname
+            ("https://example.com/file.zarr", False),
+        ],
+    )
+    def test_s3_detection_patterns(self, url: str, expected_detect: bool) -> None:
+        """Test that S3 URL patterns are correctly identified."""
+        from zarr.storage._common import _maybe_convert_http_to_s3
+
+        converted_url, opts = _maybe_convert_http_to_s3(url, None)
+        was_detected = converted_url.startswith("s3://")
+
+        assert was_detected == expected_detect, (
+            f"URL {url} detection mismatch: got {was_detected}, expected {expected_detect}"
+        )
+
+        if was_detected:
+            # Verify S3 URL format is correct
+            assert converted_url.startswith("s3://")
+            # Verify endpoint_url was set
+            assert "client_kwargs" in opts
+            assert "endpoint_url" in opts["client_kwargs"]
+            # We don't set anon by default - users must set it explicitly
+
+    def test_s3_detection_preserves_user_options(self) -> None:
+        """Test that user-provided storage options are preserved."""
+        from zarr.storage._common import _maybe_convert_http_to_s3
+
+        url = "https://uk1s3.example.com/bucket/path"
+        user_opts = {"anon": False, "other_option": "value"}
+
+        converted_url, opts = _maybe_convert_http_to_s3(url, user_opts)
+
+        # Should still convert to S3
+        assert converted_url.startswith("s3://")
+        # Should preserve user's anon setting
+        assert opts["anon"] is False
+        # Should preserve other options
+        assert opts["other_option"] == "value"
+        # Should add endpoint_url
+        assert "endpoint_url" in opts["client_kwargs"]
+
+    def test_s3_detection_preserves_user_client_kwargs(self) -> None:
+        """Test that user's existing client_kwargs are preserved when adding endpoint_url."""
+        from zarr.storage._common import _maybe_convert_http_to_s3
+
+        url = "https://uk1s3.example.com/bucket/path"
+        user_opts = {
+            "anon": False,
+            "client_kwargs": {"region_name": "us-west-2", "use_ssl": True},
+        }
+
+        # Call the function - it may modify user_opts, and that's okay
+        _, result_opts = _maybe_convert_http_to_s3(url, user_opts)
+
+        # Result should have endpoint_url added
+        result_client_kwargs = result_opts["client_kwargs"]
+        assert isinstance(result_client_kwargs, dict)
+        assert "endpoint_url" in result_client_kwargs
+        assert result_client_kwargs["endpoint_url"] == "https://uk1s3.example.com"
+
+        # Result should preserve user's other client_kwargs (not override them)
+        assert result_client_kwargs["region_name"] == "us-west-2"
+        assert result_client_kwargs["use_ssl"] is True
+
+        # User's anon setting should be preserved (not overridden to True)
+        assert result_opts["anon"] is False
+
+    def test_s3_detection_preserves_explicit_credentials(self) -> None:
+        """Test that explicit credentials are preserved."""
+        from zarr.storage._common import _maybe_convert_http_to_s3
+
+        url = "https://uk1s3.example.com/bucket/path"
+
+        # Test with key/secret
+        user_opts = {"key": "my_key", "secret": "my_secret"}
+        converted_url, opts = _maybe_convert_http_to_s3(url, user_opts)
+
+        # Should convert to S3
+        assert converted_url.startswith("s3://")
+        # Credentials should be preserved
+        assert opts["key"] == "my_key"
+        assert opts["secret"] == "my_secret"
+        # Endpoint should be added
+        assert opts["client_kwargs"]["endpoint_url"] == "https://uk1s3.example.com"
+
+    def test_s3_detection_respects_existing_endpoint(self) -> None:
+        """Test that existing endpoint_url is not overridden."""
+        from zarr.storage._common import _maybe_convert_http_to_s3
+
+        url = "https://uk1s3.example.com/bucket/path"
+        user_opts = {"client_kwargs": {"endpoint_url": "https://custom-endpoint.com"}}
+
+        converted_url, opts = _maybe_convert_http_to_s3(url, user_opts)
+
+        # Should NOT convert if endpoint already specified
+        assert converted_url == url
+        assert opts["client_kwargs"]["endpoint_url"] == "https://custom-endpoint.com"
+
+    @pytest.mark.parametrize(
+        ("url", "expected_bucket", "expected_key"),
+        [
+            (
+                "https://uk1s3.embassy.ebi.ac.uk/idr/zarr/v0.5/file.zarr",
+                "idr",
+                "zarr/v0.5/file.zarr",
+            ),
+            ("https://s3.amazonaws.com/my-bucket/path/to/data", "my-bucket", "path/to/data"),
+            ("https://s3.amazonaws.com/bucket", "bucket", ""),  # No path
+            (
+                "https://s3.amazonaws.com/bucket/deep/nested/path/file.zarr",
+                "bucket",
+                "deep/nested/path/file.zarr",
+            ),
+        ],
+    )
+    def test_s3_url_parsing(self, url: str, expected_bucket: str, expected_key: str) -> None:
+        """Test that S3 URLs are correctly parsed into bucket and key."""
+        from zarr.storage._common import _maybe_convert_http_to_s3
+
+        converted_url, _ = _maybe_convert_http_to_s3(url, None)
+
+        if expected_key:
+            expected_s3_url = f"s3://{expected_bucket}/{expected_key}"
+        else:
+            expected_s3_url = f"s3://{expected_bucket}/"
+
+        assert converted_url == expected_s3_url
+
+    def test_s3_detection_non_http_urls(self) -> None:
+        """Test that non-HTTP URLs are not affected."""
+        from zarr.storage._common import _maybe_convert_http_to_s3
+
+        urls = [
+            "s3://bucket/path",  # Already S3
+            "file:///local/path",  # Local file
+            "gs://bucket/path",  # Google Cloud Storage
+            "/local/path",  # Plain path
+        ]

+        for url in urls:
+            converted_url, _ = _maybe_convert_http_to_s3(url, None)
+            assert converted_url == url, f"Non-HTTP URL {url} should not be modified"
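
For checking new hostnames before adding them to the parametrized cases above, here is a standalone restatement of the detection heuristic; looks_like_s3 is a hypothetical helper that mirrors the patch's logic and is not part of zarr:

# Hypothetical mirror of the hostname heuristic in _maybe_convert_http_to_s3.
def looks_like_s3(hostname: str) -> bool:
    hostname = hostname.lower()
    # Tokenize on dots and dashes so "s3" is matched as a distinct token.
    parts = hostname.replace("-", ".").split(".")
    has_s3 = any(p == "s3" or p.startswith("s3-") or p.endswith("s3") for p in parts)
    return (
        has_s3
        or "object-store" in hostname
        or "objectstore" in hostname
        or "minio" in parts
        or "ceph" in parts
        or "rgw" in parts
    )

assert looks_like_s3("uk1s3.embassy.ebi.ac.uk")
assert looks_like_s3("ceph-rgw.example.com")
assert not looks_like_s3("s3tuff.example.com")
assert not looks_like_s3("users3000.example.com")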
