diff --git a/fsspec/tests/test_utils.py b/fsspec/tests/test_utils.py index 1eeee912b..c388ee04d 100644 --- a/fsspec/tests/test_utils.py +++ b/fsspec/tests/test_utils.py @@ -209,8 +209,16 @@ def test_infer_options(): # - The bucket is included in path for protocol in ["s3", "s3a", "gcs", "gs"]: options = infer_storage_options(f"{protocol}://Bucket-name.com/test.csv") + assert options["host"] == "Bucket-name.com" assert options["path"] == "Bucket-name.com/test.csv" + for protocol in ["s3", "s3a"]: + options = infer_storage_options( + f"{protocol}://arn:aws:s3:us-west-2:1234:accesspoint/abc/test.csv" + ) + assert options["host"] == "arn:aws:s3:us-west-2:1234:accesspoint" + assert options["path"] == "arn:aws:s3:us-west-2:1234:accesspoint/abc/test.csv" + with pytest.raises(KeyError): infer_storage_options("file:///bucket/file.csv", {"path": "collide"}) with pytest.raises(KeyError): diff --git a/fsspec/utils.py b/fsspec/utils.py index faa63937f..403f39c89 100644 --- a/fsspec/utils.py +++ b/fsspec/utils.py @@ -96,12 +96,17 @@ def infer_storage_options( # https://github.com/dask/dask/issues/1417 options["host"] = parsed_path.netloc.rsplit("@", 1)[-1].rsplit(":", 1)[0] + if protocol in ("s3", "s3a") and parsed_path.netloc.endswith(":accesspoint"): + # An S3 access point URL such as s3://arn:aws:s3:us-west-2:1234:accesspoint/abc has a netloc ending in + # ":accesspoint", which is not a port: reading parsed_path.port would raise a ValueError (non-integer port). + # Skip the port lookup and keep the ":accesspoint" suffix in options["host"] (any "user@" prefix is still dropped). + options["host"] = parsed_path.netloc.rsplit("@", 1)[-1] + else: + if parsed_path.port: + options["port"] = parsed_path.port + if protocol in ("s3", "s3a", "gcs", "gs"): options["path"] = options["host"] + options["path"] - else: - options["host"] = options["host"] - if parsed_path.port: - options["port"] = parsed_path.port if parsed_path.username: options["username"] = parsed_path.username if parsed_path.password: