@@ -438,3 +438,171 @@ async def test_with_read_only_auto_mkdir(tmp_path: Path) -> None:
 
     store_w = FsspecStore.from_url(f"file://{tmp_path}", storage_options={"auto_mkdir": False})
     _ = store_w.with_read_only()
+
+
+class TestS3AutoDetection:
+    """Test automatic detection and conversion of S3-compatible URLs."""
+
+    @pytest.mark.parametrize(
+        ("url", "expected_detect"),
+        [
+            # Should detect as S3
+            ("https://s3.amazonaws.com/bucket/path", True),
+            ("https://s3-us-west-2.amazonaws.com/bucket/path", True),
+            ("https://bucket.s3.amazonaws.com/path", True),
+            ("https://uk1s3.embassy.ebi.ac.uk/idr/zarr/file.zarr", True),
+            ("https://us-west-2-s3.example.com/bucket/path", True),
+            ("https://minio.example.com/bucket/path", True),
+            ("https://ceph.example.com/bucket/path", True),
+            ("https://ceph-rgw.example.com/bucket/path", True),
+            ("https://rgw.example.com/bucket/path", True),
+            ("https://object-store.example.com/bucket/path", True),
+            ("https://my-objectstore.example.com/bucket/path", True),
+            # Should NOT detect as S3 (false positives to avoid)
+            ("https://someurls345.com/data/file.zarr", False),
+            ("https://descriptions.example.com/file.zarr", False),
+            ("https://users3000.example.com/file.zarr", False),
+            ("https://s3tuff.example.com/file.zarr", False),
+            ("https://s3archive.example.com/file.zarr", False),
+            ("https://example.com/data/s3/file.zarr", False),  # s3 in path, not hostname
+            ("https://example.com/file.zarr", False),
+        ],
+    )
+    def test_s3_detection_patterns(self, url: str, expected_detect: bool) -> None:
+        """Test that S3 URL patterns are correctly identified."""
+        from zarr.storage._common import _maybe_convert_http_to_s3
+
+        converted_url, opts = _maybe_convert_http_to_s3(url, None)
+        was_detected = converted_url.startswith("s3://")
+
+        assert was_detected == expected_detect, (
+            f"URL {url} detection mismatch: got {was_detected}, expected {expected_detect}"
+        )
+
+        if was_detected:
+            # Verify S3 URL format is correct
+            assert converted_url.startswith("s3://")
+            # Verify endpoint_url was set
+            assert "client_kwargs" in opts
+            assert "endpoint_url" in opts["client_kwargs"]
+            # We don't set anon by default - users must set it explicitly
+
+    def test_s3_detection_preserves_user_options(self) -> None:
+        """Test that user-provided storage options are preserved."""
+        from zarr.storage._common import _maybe_convert_http_to_s3
+
+        url = "https://uk1s3.example.com/bucket/path"
+        user_opts = {"anon": False, "other_option": "value"}
+
+        converted_url, opts = _maybe_convert_http_to_s3(url, user_opts)
+
+        # Should still convert to S3
+        assert converted_url.startswith("s3://")
+        # Should preserve user's anon setting
+        assert opts["anon"] is False
+        # Should preserve other options
+        assert opts["other_option"] == "value"
+        # Should add endpoint_url
+        assert "endpoint_url" in opts["client_kwargs"]
+
+    def test_s3_detection_preserves_user_client_kwargs(self) -> None:
+        """Test that user's existing client_kwargs are preserved when adding endpoint_url."""
+        from zarr.storage._common import _maybe_convert_http_to_s3
+
+        url = "https://uk1s3.example.com/bucket/path"
+        user_opts = {
+            "anon": False,
+            "client_kwargs": {"region_name": "us-west-2", "use_ssl": True},
+        }
+
+        # Call the function - it may modify user_opts, and that's okay
+        _, result_opts = _maybe_convert_http_to_s3(url, user_opts)
+
+        # Result should have endpoint_url added
+        result_client_kwargs = result_opts["client_kwargs"]
+        assert isinstance(result_client_kwargs, dict)
+        assert "endpoint_url" in result_client_kwargs
+        assert result_client_kwargs["endpoint_url"] == "https://uk1s3.example.com"
+
+        # Result should preserve user's other client_kwargs (not override them)
+        assert result_client_kwargs["region_name"] == "us-west-2"
+        assert result_client_kwargs["use_ssl"] is True
+
+        # User's anon setting should be preserved (not overridden to True)
+        assert result_opts["anon"] is False
+
+    def test_s3_detection_preserves_explicit_credentials(self) -> None:
+        """Test that explicit credentials are preserved."""
+        from zarr.storage._common import _maybe_convert_http_to_s3
+
+        url = "https://uk1s3.example.com/bucket/path"
+
+        # Test with key/secret
+        user_opts = {"key": "my_key", "secret": "my_secret"}
+        converted_url, opts = _maybe_convert_http_to_s3(url, user_opts)
+
+        # Should convert to S3
+        assert converted_url.startswith("s3://")
+        # Credentials should be preserved
+        assert opts["key"] == "my_key"
+        assert opts["secret"] == "my_secret"
+        # Endpoint should be added
+        assert opts["client_kwargs"]["endpoint_url"] == "https://uk1s3.example.com"
+
+    def test_s3_detection_respects_existing_endpoint(self) -> None:
+        """Test that an existing endpoint_url is not overridden."""
+        from zarr.storage._common import _maybe_convert_http_to_s3
+
+        url = "https://uk1s3.example.com/bucket/path"
+        user_opts = {"client_kwargs": {"endpoint_url": "https://custom-endpoint.com"}}
+
+        converted_url, opts = _maybe_convert_http_to_s3(url, user_opts)
+
+        # Should NOT convert if an endpoint is already specified
+        assert converted_url == url
+        assert opts["client_kwargs"]["endpoint_url"] == "https://custom-endpoint.com"
+
+    @pytest.mark.parametrize(
+        ("url", "expected_bucket", "expected_key"),
+        [
+            (
+                "https://uk1s3.embassy.ebi.ac.uk/idr/zarr/v0.5/file.zarr",
+                "idr",
+                "zarr/v0.5/file.zarr",
+            ),
+            ("https://s3.amazonaws.com/my-bucket/path/to/data", "my-bucket", "path/to/data"),
+            ("https://s3.amazonaws.com/bucket", "bucket", ""),  # No path
+            (
+                "https://s3.amazonaws.com/bucket/deep/nested/path/file.zarr",
+                "bucket",
+                "deep/nested/path/file.zarr",
+            ),
+        ],
+    )
+    def test_s3_url_parsing(self, url: str, expected_bucket: str, expected_key: str) -> None:
+        """Test that S3 URLs are correctly parsed into bucket and key."""
+        from zarr.storage._common import _maybe_convert_http_to_s3
+
+        converted_url, _ = _maybe_convert_http_to_s3(url, None)
+
+        if expected_key:
+            expected_s3_url = f"s3://{expected_bucket}/{expected_key}"
+        else:
+            expected_s3_url = f"s3://{expected_bucket}/"
+
+        assert converted_url == expected_s3_url
+
+    def test_s3_detection_non_http_urls(self) -> None:
+        """Test that non-HTTP URLs are not affected."""
+        from zarr.storage._common import _maybe_convert_http_to_s3
+
+        urls = [
+            "s3://bucket/path",  # Already S3
+            "file:///local/path",  # Local file
+            "gs://bucket/path",  # Google Cloud Storage
+            "/local/path",  # Plain path
+        ]
+
+        for url in urls:
+            converted_url, _ = _maybe_convert_http_to_s3(url, None)
+            assert converted_url == url, f"Non-HTTP URL {url} should not be modified"
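
For reference, below is a minimal sketch of a helper that satisfies the contract these tests pin down. It is inferred purely from the assertions above; the hostname heuristics (an "s3" token at a label or hyphen boundary, plus MinIO/Ceph/RGW/object-store keywords), the path-style bucket/key split, and the non-destructive client_kwargs merge are assumptions, not the implementation added by this commit.

# Sketch only: reconstructed from the test assertions, not the actual
# zarr.storage._common implementation.
from __future__ import annotations

import re
from typing import Any
from urllib.parse import urlparse

# Heuristics assumed from the parametrized cases: an "s3" token not followed by
# another alphanumeric character, or a well-known object-store keyword.
_S3_TOKEN = re.compile(r"s3(?![a-z0-9])", re.IGNORECASE)
_S3_KEYWORDS = ("minio", "ceph", "rgw", "object-store", "objectstore")


def _maybe_convert_http_to_s3(
    url: str, storage_options: dict[str, Any] | None
) -> tuple[str, dict[str, Any]]:
    """Rewrite an HTTP(S) URL that looks like an S3 endpoint into s3:// form."""
    opts: dict[str, Any] = dict(storage_options or {})

    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https") or parsed.hostname is None:
        return url, opts  # non-HTTP URLs pass through untouched

    # An explicit endpoint_url means the user already decided how to reach the
    # store; leave the URL alone.
    if "endpoint_url" in opts.get("client_kwargs", {}):
        return url, opts

    host = parsed.hostname
    looks_like_s3 = bool(_S3_TOKEN.search(host)) or any(k in host for k in _S3_KEYWORDS)
    if not looks_like_s3:
        return url, opts

    # Path-style parsing: first path segment is the bucket, the rest is the key.
    bucket, _, key = parsed.path.lstrip("/").partition("/")
    s3_url = f"s3://{bucket}/{key}"

    # Add the detected endpoint without clobbering any client_kwargs the user
    # supplied; no anon default is injected, credentials stay with the user.
    client_kwargs = dict(opts.get("client_kwargs", {}))
    client_kwargs.setdefault("endpoint_url", f"{parsed.scheme}://{parsed.netloc}")
    opts["client_kwargs"] = client_kwargs

    return s3_url, opts

In this sketch, the early return on an existing endpoint_url and the setdefault-based merge correspond to the behaviours that test_s3_detection_respects_existing_endpoint and test_s3_detection_preserves_user_client_kwargs assert, respectively.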