diff --git a/src/zarr/storage/_common.py b/src/zarr/storage/_common.py
index 9ecfe4c201..76002396d7 100644
--- a/src/zarr/storage/_common.py
+++ b/src/zarr/storage/_common.py
@@ -343,6 +343,9 @@ async def make_store(
     elif isinstance(store_like, str):
         # Either a FSSpec URI or a local filesystem path
         if _is_fsspec_uri(store_like):
+            # Check whether this is an S3-compatible store accessed via HTTP(S)
+            # and, if so, convert it to the s3:// protocol for proper support
+            store_like, storage_options = _maybe_convert_http_to_s3(store_like, storage_options)
             return FsspecStore.from_url(
                 store_like, storage_options=storage_options, read_only=_read_only
             )
@@ -439,6 +442,99 @@ def _is_fsspec_uri(uri: str) -> bool:
     return "://" in uri or ("::" in uri and "local://" not in uri)
 
 
+def _maybe_convert_http_to_s3(
+    url: str, storage_options: dict[str, Any] | None
+) -> tuple[str, dict[str, Any]]:
+    """
+    Detect whether an HTTP(S) URL is actually an S3-compatible object store and convert it.
+
+    Checks whether the URL hostname contains S3-related keywords (s3, minio, ceph,
+    rgw, etc.) and, if so, converts the URL to the s3:// protocol with the proper
+    endpoint configuration.
+
+    Parameters
+    ----------
+    url : str
+        The HTTP(S) URL to check.
+    storage_options : dict[str, Any] | None
+        Existing storage options (updated with the S3 config if detection succeeds).
+
+    Returns
+    -------
+    tuple[str, dict[str, Any]]
+        The (potentially converted) URL and storage options.
+    """
+    from urllib.parse import urlparse
+
+    # Only process HTTP(S) URLs
+    if not url.startswith(("http://", "https://")):
+        return url, storage_options or {}
+
+    # Don't override anything if the user already specified endpoint_url
+    opts = storage_options or {}
+    if "endpoint_url" in opts.get("client_kwargs", {}):
+        return url, opts
+
+    # Check the hostname for S3-compatible patterns
+    parsed = urlparse(url)
+    # Extract the hostname without the port (e.g., "example.com:8080" -> "example.com")
+    hostname = parsed.hostname or parsed.netloc
+    hostname = hostname.lower()
+
+    # Use precise matching - look for S3 keywords as separate tokens.
+    # Split on common separators (dots, dashes) to avoid false positives,
+    # e.g., "uk1s3.example.com" -> ["uk1s3", "example", "com"]
+    hostname_parts = hostname.replace("-", ".").split(".")
+
+    # Check whether any part contains s3 as a distinct token (not buried in random
+    # characters). Since the parts are already split on "-", a label such as
+    # "s3-us-west-2" yields an exact "s3" part, so two cases suffice:
+    # - Exact match: "s3" (s3.amazonaws.com, s3-us-west-2.amazonaws.com)
+    # - Ends with "s3": uk1s3.embassy.ebi.ac.uk
+    def has_s3_token(part: str) -> bool:
+        return part == "s3" or part.endswith("s3")
+
+    is_likely_s3 = (
+        any(has_s3_token(part) for part in hostname_parts)
+        or "object-store" in hostname  # Unlikely to produce false positives
+        or "objectstore" in hostname
+        or "minio" in hostname_parts  # minio.example.com
+        or "ceph" in hostname_parts  # ceph.example.com
+        or "rgw" in hostname_parts  # rgw.example.com (Ceph RADOS Gateway)
+    )
+
+    if not is_likely_s3:
+        return url, opts
+
+    # Parse the S3 path components from the URL path
+    endpoint = f"{parsed.scheme}://{parsed.netloc}"
+    s3_path = parsed.path.lstrip("/")
+
+    # Simple bucket/key parsing - split on the first slash.
+    # Format: https://endpoint/bucket/key/path -> s3://bucket/key/path
+    if s3_path:
+        parts = s3_path.split("/", 1)
+        bucket = parts[0]
+        key = parts[1] if len(parts) > 1 else ""
+    else:
+        bucket = ""
+        key = ""
+
+    # Convert to an S3 URL
+    s3_url = f"s3://{bucket}/{key}"
+
+    # Update the storage options - add endpoint_url without overriding user
+    # preferences. The early-return guard above already ensures it is unset.
+    if "client_kwargs" not in opts:
+        opts["client_kwargs"] = {}
+    opts["client_kwargs"]["endpoint_url"] = endpoint
+
+    # Note: We intentionally do NOT set 'anon' by default: doing so would break
+    # users with credentials in environment variables, because s3fs ignores env
+    # credentials when anon=True is set explicitly. Users accessing public data
+    # should pass storage_options={"anon": True} themselves.
+
+    return s3_url, opts
+
+
 async def ensure_no_existing_node(
     store_path: StorePath,
     zarr_format: ZarrFormat,
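For reference, a quick sketch of the intended behavior of the helper above (illustrative only: it calls the private `_maybe_convert_http_to_s3` directly, assumes this patch is applied, and reuses URLs exercised by the tests below):

```python
# Sketch, not part of the patch: _maybe_convert_http_to_s3 is private API.
from zarr.storage._common import _maybe_convert_http_to_s3

# An S3-like hostname is rewritten to s3://bucket/key, with the original
# scheme and host preserved as the custom endpoint.
url, opts = _maybe_convert_http_to_s3(
    "https://uk1s3.embassy.ebi.ac.uk/idr/zarr/v0.5/file.zarr", None
)
assert url == "s3://idr/zarr/v0.5/file.zarr"
assert opts["client_kwargs"]["endpoint_url"] == "https://uk1s3.embassy.ebi.ac.uk"

# A hostname without an S3-ish token passes through unchanged.
url, _ = _maybe_convert_http_to_s3("https://example.com/data/file.zarr", None)
assert url == "https://example.com/data/file.zarr"
```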
diff --git a/tests/test_store/test_core.py b/tests/test_store/test_core.py
index a3850de90f..f896b7f76f 100644
--- a/tests/test_store/test_core.py
+++ b/tests/test_store/test_core.py
@@ -154,6 +154,36 @@ async def test_make_store_path_fsspec(monkeypatch) -> None:
     assert isinstance(store_path.store, FsspecStore)
 
 
+async def test_make_store_s3_url_detection() -> None:
+    """Test that make_store correctly detects and converts S3-compatible HTTPS URLs."""
+    pytest.importorskip("fsspec")
+    pytest.importorskip("s3fs")
+
+    from zarr.storage._common import make_store
+
+    url = "https://uk1s3.example.com/bucket/path"
+
+    # Create a store from the HTTP URL - should auto-detect S3 and create an FsspecStore
+    store = await make_store(url, mode="r")
+
+    # Verify we got an FsspecStore
+    assert isinstance(store, FsspecStore)
+
+    # Verify the underlying filesystem is S3FileSystem.
+    # The protocol can be "s3", ["s3"], or ("s3", "s3a") depending on the s3fs version.
+    protocol = store.fs.protocol
+    if isinstance(protocol, list | tuple):
+        assert "s3" in protocol
+    else:
+        assert protocol == "s3"
+
+    # Verify the endpoint was set correctly
+    assert store.fs.client_kwargs.get("endpoint_url") == "https://uk1s3.example.com"
+
+    # Note: anon is not set by default so that environment credentials are respected.
+    # Users must explicitly pass storage_options={"anon": True} for public data.
+
+
 async def test_make_store_path_storage_options_raises(store_like: StoreLike) -> None:
     with pytest.raises(TypeError, match="storage_options"):
         await make_store_path(store_like, storage_options={"foo": "bar"})
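From the user's perspective, the change means an HTTPS URL on an S3-compatible host can be opened directly. A sketch of that flow (assuming this patch; the URL is illustrative, and per the note above public buckets still need an explicit `anon=True`):

```python
# End-user sketch (assumes this patch is applied; the URL is illustrative).
import zarr

# Previously this URL would go through the generic HTTP filesystem; with the
# auto-detection it is routed through s3fs with endpoint_url set automatically.
group = zarr.open_group(
    "https://uk1s3.embassy.ebi.ac.uk/idr/zarr/v0.5/file.zarr",
    mode="r",
    # anon is never set automatically, so public buckets need it explicitly:
    storage_options={"anon": True},
)
```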
("https://s3tuff.example.com/file.zarr", False), + ("https://s3archive.example.com/file.zarr", False), + ("https://example.com/data/s3/file.zarr", False), # s3 in path, not hostname + ("https://example.com/file.zarr", False), + ], + ) + def test_s3_detection_patterns(self, url: str, expected_detect: bool) -> None: + """Test that S3 URL patterns are correctly identified.""" + from zarr.storage._common import _maybe_convert_http_to_s3 + + converted_url, opts = _maybe_convert_http_to_s3(url, None) + was_detected = converted_url.startswith("s3://") + + assert was_detected == expected_detect, ( + f"URL {url} detection mismatch: got {was_detected}, expected {expected_detect}" + ) + + if was_detected: + # Verify S3 URL format is correct + assert converted_url.startswith("s3://") + # Verify endpoint_url was set + assert "client_kwargs" in opts + assert "endpoint_url" in opts["client_kwargs"] + # We don't set anon by default - users must set it explicitly + + def test_s3_detection_preserves_user_options(self) -> None: + """Test that user-provided storage options are preserved.""" + from zarr.storage._common import _maybe_convert_http_to_s3 + + url = "https://uk1s3.example.com/bucket/path" + user_opts = {"anon": False, "other_option": "value"} + + converted_url, opts = _maybe_convert_http_to_s3(url, user_opts) + + # Should still convert to S3 + assert converted_url.startswith("s3://") + # Should preserve user's anon setting + assert opts["anon"] is False + # Should preserve other options + assert opts["other_option"] == "value" + # Should add endpoint_url + assert "endpoint_url" in opts["client_kwargs"] + + def test_s3_detection_preserves_user_client_kwargs(self) -> None: + """Test that user's existing client_kwargs are preserved when adding endpoint_url.""" + from zarr.storage._common import _maybe_convert_http_to_s3 + + url = "https://uk1s3.example.com/bucket/path" + user_opts = { + "anon": False, + "client_kwargs": {"region_name": "us-west-2", "use_ssl": True}, + } + + # Call the function - it may modify user_opts, and that's okay + _, result_opts = _maybe_convert_http_to_s3(url, user_opts) + + # Result should have endpoint_url added + result_client_kwargs = result_opts["client_kwargs"] + assert isinstance(result_client_kwargs, dict) + assert "endpoint_url" in result_client_kwargs + assert result_client_kwargs["endpoint_url"] == "https://uk1s3.example.com" + + # Result should preserve user's other client_kwargs (not override them) + assert result_client_kwargs["region_name"] == "us-west-2" + assert result_client_kwargs["use_ssl"] is True + + # User's anon setting should be preserved (not overridden to True) + assert result_opts["anon"] is False + + def test_s3_detection_preserves_explicit_credentials(self) -> None: + """Test that explicit credentials are preserved.""" + from zarr.storage._common import _maybe_convert_http_to_s3 + + url = "https://uk1s3.example.com/bucket/path" + + # Test with key/secret + user_opts = {"key": "my_key", "secret": "my_secret"} + converted_url, opts = _maybe_convert_http_to_s3(url, user_opts) + + # Should convert to S3 + assert converted_url.startswith("s3://") + # Credentials should be preserved + assert opts["key"] == "my_key" + assert opts["secret"] == "my_secret" + # Endpoint should be added + assert opts["client_kwargs"]["endpoint_url"] == "https://uk1s3.example.com" + + def test_s3_detection_respects_existing_endpoint(self) -> None: + """Test that existing endpoint_url is not overridden.""" + from zarr.storage._common import _maybe_convert_http_to_s3 + + url = 
"https://uk1s3.example.com/bucket/path" + user_opts = {"client_kwargs": {"endpoint_url": "https://custom-endpoint.com"}} + + converted_url, opts = _maybe_convert_http_to_s3(url, user_opts) + + # Should NOT convert if endpoint already specified + assert converted_url == url + assert opts["client_kwargs"]["endpoint_url"] == "https://custom-endpoint.com" + + @pytest.mark.parametrize( + ("url", "expected_bucket", "expected_key"), + [ + ( + "https://uk1s3.embassy.ebi.ac.uk/idr/zarr/v0.5/file.zarr", + "idr", + "zarr/v0.5/file.zarr", + ), + ("https://s3.amazonaws.com/my-bucket/path/to/data", "my-bucket", "path/to/data"), + ("https://s3.amazonaws.com/bucket", "bucket", ""), # No path + ( + "https://s3.amazonaws.com/bucket/deep/nested/path/file.zarr", + "bucket", + "deep/nested/path/file.zarr", + ), + ], + ) + def test_s3_url_parsing(self, url: str, expected_bucket: str, expected_key: str) -> None: + """Test that S3 URLs are correctly parsed into bucket and key.""" + from zarr.storage._common import _maybe_convert_http_to_s3 + + converted_url, _ = _maybe_convert_http_to_s3(url, None) + + if expected_key: + expected_s3_url = f"s3://{expected_bucket}/{expected_key}" + else: + expected_s3_url = f"s3://{expected_bucket}/" + + assert converted_url == expected_s3_url + + def test_s3_detection_non_http_urls(self) -> None: + """Test that non-HTTP URLs are not affected.""" + from zarr.storage._common import _maybe_convert_http_to_s3 + + urls = [ + "s3://bucket/path", # Already S3 + "file:///local/path", # Local file + "gs://bucket/path", # Google Cloud Storage + "/local/path", # Plain path + ] + + for url in urls: + converted_url, _ = _maybe_convert_http_to_s3(url, None) + assert converted_url == url, f"Non-HTTP URL {url} should not be modified"