96 changes: 96 additions & 0 deletions src/zarr/storage/_common.py
@@ -343,6 +343,9 @@ async def make_store(
elif isinstance(store_like, str):
# Either a FSSpec URI or a local filesystem path
if _is_fsspec_uri(store_like):
# Check whether this is an S3-compatible store accessed via HTTPS and,
# if so, convert it to the s3:// protocol so the URL is handled by s3fs
# rather than fsspec's generic HTTP filesystem
store_like, storage_options = _maybe_convert_http_to_s3(store_like, storage_options)
return FsspecStore.from_url(
store_like, storage_options=storage_options, read_only=_read_only
)
@@ -439,6 +442,99 @@ def _is_fsspec_uri(uri: str) -> bool:
return "://" in uri or ("::" in uri and "local://" not in uri)


def _maybe_convert_http_to_s3(
url: str, storage_options: dict[str, Any] | None
) -> tuple[str, dict[str, Any]]:
"""
Detect if an HTTP(S) URL is actually an S3-compatible object store and convert it.

Checks whether the URL hostname contains S3-related keywords (s3, minio, ceph, rgw, etc.)
and, if so, converts it to use the s3:// protocol with the matching endpoint_url configuration.

Parameters
----------
url : str
The HTTP(S) URL to check.
storage_options : dict | None
Existing storage options (will be updated with S3 config if detected).

Returns
-------
tuple[str, dict[str, Any]]
The (potentially converted) URL and storage options.
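
Examples
--------
A sketch of the expected behavior (mirrors the parametrized tests below):

>>> _maybe_convert_http_to_s3("https://s3.amazonaws.com/my-bucket/data.zarr", None)
('s3://my-bucket/data.zarr', {'client_kwargs': {'endpoint_url': 'https://s3.amazonaws.com'}})
>>> _maybe_convert_http_to_s3("https://example.com/data.zarr", None)
('https://example.com/data.zarr', {})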
"""
from urllib.parse import urlparse

# Only process HTTP(S) URLs
if not url.startswith(("http://", "https://")):
return url, storage_options or {}

# Don't override if user already specified endpoint_url
opts = storage_options or {}
if "endpoint_url" in opts.get("client_kwargs", {}):
return url, opts

# Check hostname for S3-compatible patterns
parsed = urlparse(url)
# Extract hostname without port (e.g., "example.com:8080" -> "example.com")
hostname = parsed.hostname or parsed.netloc
hostname = hostname.lower()

# Use more precise matching - look for S3 keywords as separate parts
# Split by common separators (dots, dashes) to avoid false positives
# e.g., "uk1s3.example.com" -> ["uk1s3", "example", "com"]
hostname_parts = hostname.replace("-", ".").split(".")

# Check if any part contains s3 as a distinct token (not buried in random chars)
# - Exact match: "s3" in parts (s3.amazonaws.com)
# - Starts with "s3-": s3-us-west-2.amazonaws.com
# - Ends with "s3": uk1s3.embassy.ebi.ac.uk
def has_s3_token(part: str) -> bool:
return part == "s3" or part.startswith("s3-") or part.endswith("s3")

is_likely_s3 = (
any(has_s3_token(part) for part in hostname_parts)
or "object-store" in hostname # Less likely to have false positives
or "objectstore" in hostname
or "minio" in hostname_parts # minio.example.com
or "ceph" in hostname_parts # ceph.example.com
or "rgw" in hostname_parts # rgw.example.com (Ceph RADOS Gateway)
)
Comment on lines +495 to +502
Contributor: how reliable is this?

if not is_likely_s3:
return url, opts

# Parse S3 path components from URL path
endpoint = f"{parsed.scheme}://{parsed.netloc}"
s3_path = parsed.path.lstrip("/")

# Simple bucket/key parsing - split on first slash
# Format: https://endpoint/bucket/key/path -> s3://bucket/key/path
if s3_path:
parts = s3_path.split("/", 1)
bucket = parts[0]
key = parts[1] if len(parts) > 1 else ""
else:
bucket = ""
key = ""

# Convert to S3 URL
s3_url = f"s3://{bucket}/{key}"

# Update storage options - add endpoint_url without overriding user preferences
# Note: we already checked above that the user has not set endpoint_url
if "client_kwargs" not in opts:
opts["client_kwargs"] = {}
opts["client_kwargs"]["endpoint_url"] = endpoint

# Note: We intentionally do NOT set 'anon' by default because:
# - If we set anon=True, it breaks users with credentials in environment variables
# - s3fs ignores env credentials when anon=True is explicitly set
# Users accessing public data should explicitly pass storage_options={"anon": True}
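# Example (illustrative; hypothetical call site, not part of this change):
#     zarr.open("https://uk1s3.example.com/bucket/data.zarr",
#               mode="r", storage_options={"anon": True})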

return s3_url, opts


async def ensure_no_existing_node(
store_path: StorePath,
zarr_format: ZarrFormat,
30 changes: 30 additions & 0 deletions tests/test_store/test_core.py
@@ -154,6 +154,36 @@ async def test_make_store_path_fsspec(monkeypatch) -> None:
assert isinstance(store_path.store, FsspecStore)


async def test_make_store_s3_url_detection() -> None:
"""Test that make_store correctly detects and converts S3-compatible HTTPS URLs."""
pytest.importorskip("fsspec")
pytest.importorskip("s3fs")

from zarr.storage._common import make_store

url = "https://uk1s3.example.com/bucket/path"

# Create store from HTTP URL - should auto-detect S3 and create FsspecStore
store = await make_store(url, mode="r")

# Verify we got an FsspecStore
assert isinstance(store, FsspecStore)

# Verify the underlying filesystem is S3FileSystem
# Protocol can be "s3", ["s3"], or ("s3", "s3a") depending on s3fs version
protocol = store.fs.protocol
if isinstance(protocol, list | tuple):
assert "s3" in protocol
else:
assert protocol == "s3"

# Verify the endpoint was set correctly
assert store.fs.client_kwargs.get("endpoint_url") == "https://uk1s3.example.com"

# Note: We don't set anon by default to respect environment credentials
# Users must explicitly add storage_options={"anon": True} for public data


async def test_make_store_path_storage_options_raises(store_like: StoreLike) -> None:
with pytest.raises(TypeError, match="storage_options"):
await make_store_path(store_like, storage_options={"foo": "bar"})
168 changes: 168 additions & 0 deletions tests/test_store/test_fsspec.py
@@ -438,3 +438,171 @@ async def test_with_read_only_auto_mkdir(tmp_path: Path) -> None:

store_w = FsspecStore.from_url(f"file://{tmp_path}", storage_options={"auto_mkdir": False})
_ = store_w.with_read_only()


class TestS3AutoDetection:
"""Test automatic detection and conversion of S3-compatible URLs."""

@pytest.mark.parametrize(
("url", "expected_detect"),
[
# Should detect as S3
("https://s3.amazonaws.com/bucket/path", True),
("https://s3-us-west-2.amazonaws.com/bucket/path", True),
("https://bucket.s3.amazonaws.com/path", True),
("https://uk1s3.embassy.ebi.ac.uk/idr/zarr/file.zarr", True),
("https://us-west-2-s3.example.com/bucket/path", True),
("https://minio.example.com/bucket/path", True),
("https://ceph.example.com/bucket/path", True),
("https://ceph-rgw.example.com/bucket/path", True),
("https://rgw.example.com/bucket/path", True),
("https://object-store.example.com/bucket/path", True),
("https://my-objectstore.example.com/bucket/path", True),
# Should NOT detect as S3 (false positives to avoid)
("https://someurls345.com/data/file.zarr", False),
("https://descriptions.example.com/file.zarr", False),
("https://users3000.example.com/file.zarr", False),
("https://s3tuff.example.com/file.zarr", False),
("https://s3archive.example.com/file.zarr", False),
("https://example.com/data/s3/file.zarr", False), # s3 in path, not hostname
("https://example.com/file.zarr", False),
],
)
def test_s3_detection_patterns(self, url: str, expected_detect: bool) -> None:
"""Test that S3 URL patterns are correctly identified."""
from zarr.storage._common import _maybe_convert_http_to_s3

converted_url, opts = _maybe_convert_http_to_s3(url, None)
was_detected = converted_url.startswith("s3://")

assert was_detected == expected_detect, (
f"URL {url} detection mismatch: got {was_detected}, expected {expected_detect}"
)

if was_detected:
# Verify S3 URL format is correct
assert converted_url.startswith("s3://")
# Verify endpoint_url was set
assert "client_kwargs" in opts
assert "endpoint_url" in opts["client_kwargs"]
# We don't set anon by default - users must set it explicitly

def test_s3_detection_preserves_user_options(self) -> None:
"""Test that user-provided storage options are preserved."""
from zarr.storage._common import _maybe_convert_http_to_s3

url = "https://uk1s3.example.com/bucket/path"
user_opts = {"anon": False, "other_option": "value"}

converted_url, opts = _maybe_convert_http_to_s3(url, user_opts)

# Should still convert to S3
assert converted_url.startswith("s3://")
# Should preserve user's anon setting
assert opts["anon"] is False
# Should preserve other options
assert opts["other_option"] == "value"
# Should add endpoint_url
assert "endpoint_url" in opts["client_kwargs"]

def test_s3_detection_preserves_user_client_kwargs(self) -> None:
"""Test that user's existing client_kwargs are preserved when adding endpoint_url."""
from zarr.storage._common import _maybe_convert_http_to_s3

url = "https://uk1s3.example.com/bucket/path"
user_opts = {
"anon": False,
"client_kwargs": {"region_name": "us-west-2", "use_ssl": True},
}

# Call the function - it may modify user_opts, and that's okay
_, result_opts = _maybe_convert_http_to_s3(url, user_opts)

# Result should have endpoint_url added
result_client_kwargs = result_opts["client_kwargs"]
assert isinstance(result_client_kwargs, dict)
assert "endpoint_url" in result_client_kwargs
assert result_client_kwargs["endpoint_url"] == "https://uk1s3.example.com"

# Result should preserve user's other client_kwargs (not override them)
assert result_client_kwargs["region_name"] == "us-west-2"
assert result_client_kwargs["use_ssl"] is True

# User's anon setting should be preserved (not overridden to True)
assert result_opts["anon"] is False

def test_s3_detection_preserves_explicit_credentials(self) -> None:
"""Test that explicit credentials are preserved."""
from zarr.storage._common import _maybe_convert_http_to_s3

url = "https://uk1s3.example.com/bucket/path"

# Test with key/secret
user_opts = {"key": "my_key", "secret": "my_secret"}
converted_url, opts = _maybe_convert_http_to_s3(url, user_opts)

# Should convert to S3
assert converted_url.startswith("s3://")
# Credentials should be preserved
assert opts["key"] == "my_key"
assert opts["secret"] == "my_secret"
# Endpoint should be added
assert opts["client_kwargs"]["endpoint_url"] == "https://uk1s3.example.com"

def test_s3_detection_respects_existing_endpoint(self) -> None:
"""Test that existing endpoint_url is not overridden."""
from zarr.storage._common import _maybe_convert_http_to_s3

url = "https://uk1s3.example.com/bucket/path"
user_opts = {"client_kwargs": {"endpoint_url": "https://custom-endpoint.com"}}

converted_url, opts = _maybe_convert_http_to_s3(url, user_opts)

# Should NOT convert if endpoint already specified
assert converted_url == url
assert opts["client_kwargs"]["endpoint_url"] == "https://custom-endpoint.com"

@pytest.mark.parametrize(
("url", "expected_bucket", "expected_key"),
[
(
"https://uk1s3.embassy.ebi.ac.uk/idr/zarr/v0.5/file.zarr",
"idr",
"zarr/v0.5/file.zarr",
),
("https://s3.amazonaws.com/my-bucket/path/to/data", "my-bucket", "path/to/data"),
("https://s3.amazonaws.com/bucket", "bucket", ""), # No path
(
"https://s3.amazonaws.com/bucket/deep/nested/path/file.zarr",
"bucket",
"deep/nested/path/file.zarr",
),
],
)
def test_s3_url_parsing(self, url: str, expected_bucket: str, expected_key: str) -> None:
"""Test that S3 URLs are correctly parsed into bucket and key."""
from zarr.storage._common import _maybe_convert_http_to_s3

converted_url, _ = _maybe_convert_http_to_s3(url, None)

if expected_key:
expected_s3_url = f"s3://{expected_bucket}/{expected_key}"
else:
expected_s3_url = f"s3://{expected_bucket}/"

assert converted_url == expected_s3_url

def test_s3_detection_non_http_urls(self) -> None:
"""Test that non-HTTP URLs are not affected."""
from zarr.storage._common import _maybe_convert_http_to_s3

urls = [
"s3://bucket/path", # Already S3
"file:///local/path", # Local file
"gs://bucket/path", # Google Cloud Storage
"/local/path", # Plain path
]

for url in urls:
converted_url, _ = _maybe_convert_http_to_s3(url, None)
assert converted_url == url, f"Non-HTTP URL {url} should not be modified"