diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index 0bb1e92c07..efeaa4a2c2 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -392,11 +392,17 @@ def __init__(self, properties: Properties = EMPTY_DICT):
         super().__init__(properties=properties)
 
     @staticmethod
-    def parse_location(location: str) -> Tuple[str, str, str]:
-        """Return the path without the scheme."""
+    def parse_location(location: str, properties: Properties = EMPTY_DICT) -> Tuple[str, str, str]:
+        """Return (scheme, netloc, path) for the given location.
+
+        Uses DEFAULT_SCHEME and DEFAULT_NETLOC if scheme/netloc are missing.
+        """
         uri = urlparse(location)
+
         if not uri.scheme:
-            return "file", uri.netloc, os.path.abspath(location)
+            default_scheme = properties.get("DEFAULT_SCHEME", "file")
+            default_netloc = properties.get("DEFAULT_NETLOC", "")
+            return default_scheme, default_netloc, os.path.abspath(location)
         elif uri.scheme in ("hdfs", "viewfs"):
             return uri.scheme, uri.netloc, uri.path
         else:
@@ -614,7 +620,7 @@ def new_input(self, location: str) -> PyArrowFile:
         Returns:
             PyArrowFile: A PyArrowFile instance for the given location.
         """
-        scheme, netloc, path = self.parse_location(location)
+        scheme, netloc, path = self.parse_location(location, self.properties)
         return PyArrowFile(
             fs=self.fs_by_scheme(scheme, netloc),
             location=location,
@@ -631,7 +637,7 @@ def new_output(self, location: str) -> PyArrowFile:
         Returns:
             PyArrowFile: A PyArrowFile instance for the given location.
         """
-        scheme, netloc, path = self.parse_location(location)
+        scheme, netloc, path = self.parse_location(location, self.properties)
         return PyArrowFile(
             fs=self.fs_by_scheme(scheme, netloc),
             location=location,
@@ -652,7 +658,7 @@ def delete(self, location: Union[str, InputFile, OutputFile]) -> None:
                 an AWS error code 15.
         """
         str_location = location.location if isinstance(location, (InputFile, OutputFile)) else location
-        scheme, netloc, path = self.parse_location(str_location)
+        scheme, netloc, path = self.parse_location(str_location, self.properties)
         fs = self.fs_by_scheme(scheme, netloc)
 
         try:
diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py
index d01123dfd9..6efaf60cb9 100644
--- a/tests/io/test_pyarrow.py
+++ b/tests/io/test_pyarrow.py
@@ -2785,3 +2785,29 @@ def _expected_batch(unit: str) -> pa.RecordBatch:
         )
 
     assert _expected_batch("ns" if format_version > 2 else "us").equals(actual_result)
+
+
+def test_parse_location_defaults() -> None:
+    """Test that parse_location uses defaults."""
+
+    from pyiceberg.io.pyarrow import PyArrowFileIO
+
+    # if no default scheme or netloc is provided, use file scheme and empty netloc
+    scheme, netloc, path = PyArrowFileIO.parse_location("/foo/bar")
+    assert scheme == "file"
+    assert netloc == ""
+    assert path == "/foo/bar"
+
+    scheme, netloc, path = PyArrowFileIO.parse_location(
+        "/foo/bar", properties={"DEFAULT_SCHEME": "scheme", "DEFAULT_NETLOC": "netloc:8000"}
+    )
+    assert scheme == "scheme"
+    assert netloc == "netloc:8000"
+    assert path == "/foo/bar"
+
+    scheme, netloc, path = PyArrowFileIO.parse_location(
+        "/foo/bar", properties={"DEFAULT_SCHEME": "hdfs", "DEFAULT_NETLOC": "netloc:8000"}
+    )
+    assert scheme == "hdfs"
+    assert netloc == "netloc:8000"
+    assert path == "/foo/bar"