@@ -438,3 +438,171 @@ async def test_with_read_only_auto_mkdir(tmp_path: Path) -> None:
438438
439439 store_w = FsspecStore .from_url (f"file://{ tmp_path } " , storage_options = {"auto_mkdir" : False })
440440 _ = store_w .with_read_only ()
441+
442+
class TestS3AutoDetection:
    """Test automatic detection and conversion of S3-compatible URLs.

    Every test exercises the private helper
    ``zarr.storage._common._maybe_convert_http_to_s3``, which takes a URL and
    optional storage options and returns a ``(url, storage_options)`` pair.
    The helper is imported inside each test so that only these tests depend on
    the private name.
    """

    @pytest.mark.parametrize(
        ("url", "expected_detect"),
        [
            # Should detect as S3
            ("https://s3.amazonaws.com/bucket/path", True),
            ("https://s3-us-west-2.amazonaws.com/bucket/path", True),
            ("https://bucket.s3.amazonaws.com/path", True),
            ("https://uk1s3.embassy.ebi.ac.uk/idr/zarr/file.zarr", True),
            ("https://us-west-2-s3.example.com/bucket/path", True),
            ("https://minio.example.com/bucket/path", True),
            ("https://ceph.example.com/bucket/path", True),
            ("https://ceph-rgw.example.com/bucket/path", True),
            ("https://rgw.example.com/bucket/path", True),
            ("https://object-store.example.com/bucket/path", True),
            ("https://my-objectstore.example.com/bucket/path", True),
            # Should NOT detect as S3 (false positives to avoid)
            ("https://someurls345.com/data/file.zarr", False),
            ("https://descriptions.example.com/file.zarr", False),
            ("https://users3000.example.com/file.zarr", False),
            ("https://s3tuff.example.com/file.zarr", False),
            ("https://s3archive.example.com/file.zarr", False),
            ("https://example.com/data/s3/file.zarr", False),  # s3 in path, not hostname
            ("https://example.com/file.zarr", False),
        ],
    )
    def test_s3_detection_patterns(self, url: str, expected_detect: bool) -> None:
        """Test that S3 URL patterns are correctly identified.

        A URL counts as "detected" when the helper rewrites it to the ``s3://``
        scheme. Detected URLs must also carry the original endpoint in
        ``client_kwargs["endpoint_url"]``.
        """
        from zarr.storage._common import _maybe_convert_http_to_s3

        converted_url, opts = _maybe_convert_http_to_s3(url, None)
        was_detected = converted_url.startswith("s3://")

        assert was_detected == expected_detect, (
            f"URL {url} detection mismatch: got {was_detected}, expected {expected_detect}"
        )

        if was_detected:
            # The s3:// prefix is already established by `was_detected`; only
            # the endpoint plumbing is left to verify.
            assert "client_kwargs" in opts
            assert "endpoint_url" in opts["client_kwargs"]
            # `anon` is intentionally not asserted: it is not set by default,
            # users must set it explicitly.

    def test_s3_detection_preserves_user_options(self) -> None:
        """Test that user-provided storage options are preserved."""
        from zarr.storage._common import _maybe_convert_http_to_s3

        url = "https://uk1s3.example.com/bucket/path"
        user_opts = {"anon": False, "other_option": "value"}

        converted_url, opts = _maybe_convert_http_to_s3(url, user_opts)

        # Should still convert to S3
        assert converted_url.startswith("s3://")
        # Should preserve user's anon setting
        assert opts["anon"] is False
        # Should preserve other options
        assert opts["other_option"] == "value"
        # Should add endpoint_url
        assert "endpoint_url" in opts["client_kwargs"]

    def test_s3_detection_preserves_user_client_kwargs(self) -> None:
        """Test that user's existing client_kwargs are preserved when adding endpoint_url."""
        from zarr.storage._common import _maybe_convert_http_to_s3

        url = "https://uk1s3.example.com/bucket/path"
        user_opts = {
            "anon": False,
            "client_kwargs": {"region_name": "us-west-2", "use_ssl": True},
        }

        # The helper may modify user_opts in place, and that's okay: only the
        # returned options are inspected below.
        _, result_opts = _maybe_convert_http_to_s3(url, user_opts)

        # Result should have endpoint_url added
        result_client_kwargs = result_opts["client_kwargs"]
        assert isinstance(result_client_kwargs, dict)
        assert "endpoint_url" in result_client_kwargs
        assert result_client_kwargs["endpoint_url"] == "https://uk1s3.example.com"

        # Result should preserve user's other client_kwargs (not override them)
        assert result_client_kwargs["region_name"] == "us-west-2"
        assert result_client_kwargs["use_ssl"] is True

        # User's anon setting should be preserved (not overridden to True)
        assert result_opts["anon"] is False

    def test_s3_detection_preserves_explicit_credentials(self) -> None:
        """Test that explicit key/secret credentials are preserved."""
        from zarr.storage._common import _maybe_convert_http_to_s3

        url = "https://uk1s3.example.com/bucket/path"
        user_opts = {"key": "my_key", "secret": "my_secret"}

        converted_url, opts = _maybe_convert_http_to_s3(url, user_opts)

        # Should convert to S3
        assert converted_url.startswith("s3://")
        # Credentials should be preserved
        assert opts["key"] == "my_key"
        assert opts["secret"] == "my_secret"
        # Endpoint should be added
        assert opts["client_kwargs"]["endpoint_url"] == "https://uk1s3.example.com"

    def test_s3_detection_respects_existing_endpoint(self) -> None:
        """Test that existing endpoint_url is not overridden."""
        from zarr.storage._common import _maybe_convert_http_to_s3

        url = "https://uk1s3.example.com/bucket/path"
        user_opts = {"client_kwargs": {"endpoint_url": "https://custom-endpoint.com"}}

        converted_url, opts = _maybe_convert_http_to_s3(url, user_opts)

        # Should NOT convert if endpoint already specified
        assert converted_url == url
        assert opts["client_kwargs"]["endpoint_url"] == "https://custom-endpoint.com"

    @pytest.mark.parametrize(
        ("url", "expected_bucket", "expected_key"),
        [
            (
                "https://uk1s3.embassy.ebi.ac.uk/idr/zarr/v0.5/file.zarr",
                "idr",
                "zarr/v0.5/file.zarr",
            ),
            ("https://s3.amazonaws.com/my-bucket/path/to/data", "my-bucket", "path/to/data"),
            ("https://s3.amazonaws.com/bucket", "bucket", ""),  # No path
            (
                "https://s3.amazonaws.com/bucket/deep/nested/path/file.zarr",
                "bucket",
                "deep/nested/path/file.zarr",
            ),
        ],
    )
    def test_s3_url_parsing(self, url: str, expected_bucket: str, expected_key: str) -> None:
        """Test that S3 URLs are correctly parsed into bucket and key."""
        from zarr.storage._common import _maybe_convert_http_to_s3

        converted_url, _ = _maybe_convert_http_to_s3(url, None)

        # An empty key naturally yields "s3://<bucket>/", so a single format
        # string covers both the "with path" and "bucket only" cases.
        assert converted_url == f"s3://{expected_bucket}/{expected_key}"

    @pytest.mark.parametrize(
        "url",
        [
            "s3://bucket/path",  # Already S3
            "file:///local/path",  # Local file
            "gs://bucket/path",  # Google Cloud Storage
            "/local/path",  # Plain path
        ],
    )
    def test_s3_detection_non_http_urls(self, url: str) -> None:
        """Test that non-HTTP URLs are not affected.

        Parametrized (rather than looped) so each URL is reported as its own
        test case and one failure does not hide the others.
        """
        from zarr.storage._common import _maybe_convert_http_to_s3

        converted_url, _ = _maybe_convert_http_to_s3(url, None)
        assert converted_url == url, f"Non-HTTP URL {url} should not be modified"
0 commit comments