Skip to content

Commit f0fc158

Browse files
fix: Fix regression error when scanning AWS presigned URL (pola-rs#24530)
1 parent 3ec9a21 commit f0fc158

File tree

3 files changed

+48
-8
lines changed

3 files changed

+48
-8
lines changed

crates/polars-io/src/path_utils/mod.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -422,7 +422,8 @@ pub fn expand_paths_hive(
422422
// Search after offset to prevent matching `.s3.amazonaws.com` (legacy global endpoint URL without region).
423423
let region_end = offset + after_scheme[offset..].find(".amazonaws.com/")?;
424424

425-
if after_scheme[..region_end].contains('/') {
425+
// Do not convert if the URL contains '?' (it may carry query parameters, e.g. for an AWS presigned URL).
426+
if after_scheme[..region_end].contains('/') || after_scheme.contains('?') {
426427
return None;
427428
}
428429

py-polars/polars/io/cloud/_utils.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -54,13 +54,22 @@ def _get_path_scheme(path: str | Path) -> str | None:
5454

5555

5656
def _is_aws_cloud(*, scheme: str, first_scan_path: str) -> bool:
57-
return any(scheme == x for x in ["s3", "s3a"]) or (
58-
(scheme == "http" or scheme == "https")
59-
and 0
60-
< first_scan_path.find(".s3.")
61-
< first_scan_path.find(".amazonaws.com/")
62-
< first_scan_path[len(scheme) + 3 :].find("/")
63-
)
57+
if any(scheme == x for x in ["s3", "s3a"]):
58+
return True
59+
60+
if scheme == "http" or scheme == "https":
61+
bucket_end = first_scan_path.find(".s3.")
62+
region_end = first_scan_path.find(".amazonaws.com/", bucket_end + 4)
63+
64+
if (
65+
first_scan_path.find("/", len(scheme) + 3, region_end) > 0
66+
or "?" in first_scan_path
67+
):
68+
return False
69+
70+
return 0 < bucket_end < region_end
71+
72+
return False
6473

6574

6675
def _is_azure_cloud(scheme: str) -> bool:

py-polars/tests/unit/io/cloud/test_cloud.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import pytest
77

88
import polars as pl
9+
from polars.io.cloud._utils import _is_aws_cloud
910

1011

1112
@pytest.mark.slow
@@ -54,3 +55,32 @@ def f() -> None:
5455
# Note: We get called 2 times per attempt
5556
if call_count != 4:
5657
raise AssertionError(call_count)
58+
59+
60+
def test_is_aws_cloud() -> None:
61+
assert _is_aws_cloud(
62+
scheme="https",
63+
first_scan_path="https://bucket.s3.eu-west-1.amazonaws.com/key",
64+
)
65+
66+
# Slash in front of amazonaws.com
67+
assert not _is_aws_cloud(
68+
scheme="https",
69+
first_scan_path="https://bucket/.s3.eu-west-1.amazonaws.com/key",
70+
)
71+
72+
assert not _is_aws_cloud(
73+
scheme="https",
74+
first_scan_path="https://bucket?.s3.eu-west-1.amazonaws.com/key",
75+
)
76+
77+
# Legacy global endpoint
78+
assert not _is_aws_cloud(
79+
scheme="https", first_scan_path="https://bucket.s3.amazonaws.com/key"
80+
)
81+
82+
# Has query parameters (e.g. presigned URL).
83+
assert not _is_aws_cloud(
84+
scheme="https",
85+
first_scan_path="https://bucket.s3.eu-west-1.amazonaws.com/key?",
86+
)

0 commit comments

Comments
 (0)