File tree Expand file tree Collapse file tree 3 files changed +48
-8
lines changed
crates/polars-io/src/path_utils Expand file tree Collapse file tree 3 files changed +48
-8
lines changed Original file line number Diff line number Diff line change @@ -422,7 +422,8 @@ pub fn expand_paths_hive(
422422 // Search after offset to prevent matching `.s3.amazonaws.com` (legacy global endpoint URL without region).
423423 let region_end = offset + after_scheme[ offset..] . find ( ".amazonaws.com/" ) ?;
424424
425- if after_scheme[ ..region_end] . contains ( '/' ) {
425+ // Do not convert if the URL contains '?' (it may carry query parameters, e.g. an AWS presigned URL).
426+ if after_scheme[ ..region_end] . contains ( '/' ) || after_scheme. contains ( '?' ) {
426427 return None ;
427428 }
428429
Original file line number Diff line number Diff line change @@ -54,13 +54,22 @@ def _get_path_scheme(path: str | Path) -> str | None:
5454
5555
5656def _is_aws_cloud (* , scheme : str , first_scan_path : str ) -> bool :
57- return any (scheme == x for x in ["s3" , "s3a" ]) or (
58- (scheme == "http" or scheme == "https" )
59- and 0
60- < first_scan_path .find (".s3." )
61- < first_scan_path .find (".amazonaws.com/" )
62- < first_scan_path [len (scheme ) + 3 :].find ("/" )
63- )
57+ if any (scheme == x for x in ["s3" , "s3a" ]):
58+ return True
59+
60+ if scheme == "http" or scheme == "https" :
61+ bucket_end = first_scan_path .find (".s3." )
62+ region_end = first_scan_path .find (".amazonaws.com/" , bucket_end + 4 )
63+
64+ if (
65+ first_scan_path .find ("/" , len (scheme ) + 3 , region_end ) > 0
66+ or "?" in first_scan_path
67+ ):
68+ return False
69+
70+ return 0 < bucket_end < region_end
71+
72+ return False
6473
6574
6675def _is_azure_cloud (scheme : str ) -> bool :
Original file line number Diff line number Diff line change 66import pytest
77
88import polars as pl
9+ from polars .io .cloud ._utils import _is_aws_cloud
910
1011
1112@pytest .mark .slow
@@ -54,3 +55,32 @@ def f() -> None:
5455 # Note: We get called 2 times per attempt
5556 if call_count != 4 :
5657 raise AssertionError (call_count )
58+
59+
60+ def test_is_aws_cloud () -> None :
61+ assert _is_aws_cloud (
62+ scheme = "https" ,
63+ first_scan_path = "https://bucket.s3.eu-west-1.amazonaws.com/key" ,
64+ )
65+
66+ # Slash inside the host portion, before `.s3.<region>.amazonaws.com`
67+ assert not _is_aws_cloud (
68+ scheme = "https" ,
69+ first_scan_path = "https://bucket/.s3.eu-west-1.amazonaws.com/key" ,
70+ )
71+
72+ assert not _is_aws_cloud (
73+ scheme = "https" ,
74+ first_scan_path = "https://bucket?.s3.eu-west-1.amazonaws.com/key" ,
75+ )
76+
77+ # Legacy global endpoint
78+ assert not _is_aws_cloud (
79+ scheme = "https" , first_scan_path = "https://bucket.s3.amazonaws.com/key"
80+ )
81+
82+ # Has query parameters (e.g. presigned URL).
83+ assert not _is_aws_cloud (
84+ scheme = "https" ,
85+ first_scan_path = "https://bucket.s3.eu-west-1.amazonaws.com/key?" ,
86+ )
You can’t perform that action at this time.
0 commit comments