Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions pyiceberg/io/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,11 +392,17 @@ def __init__(self, properties: Properties = EMPTY_DICT):
super().__init__(properties=properties)

@staticmethod
def parse_location(location: str) -> Tuple[str, str, str]:
"""Return the path without the scheme."""
def parse_location(location: str, properties: Properties = EMPTY_DICT) -> Tuple[str, str, str]:
"""Return (scheme, netloc, path) for the given location.

Uses DEFAULT_SCHEME and DEFAULT_NETLOC if scheme/netloc are missing.
"""
uri = urlparse(location)

if not uri.scheme:
return "file", uri.netloc, os.path.abspath(location)
default_scheme = properties.get("DEFAULT_SCHEME", "file")
default_netloc = properties.get("DEFAULT_NETLOC", "")
return default_scheme, default_netloc, os.path.abspath(location)
elif uri.scheme in ("hdfs", "viewfs"):
return uri.scheme, uri.netloc, uri.path
else:
Expand Down Expand Up @@ -614,7 +620,7 @@ def new_input(self, location: str) -> PyArrowFile:
Returns:
PyArrowFile: A PyArrowFile instance for the given location.
"""
scheme, netloc, path = self.parse_location(location)
scheme, netloc, path = self.parse_location(location, self.properties)
return PyArrowFile(
fs=self.fs_by_scheme(scheme, netloc),
location=location,
Expand All @@ -631,7 +637,7 @@ def new_output(self, location: str) -> PyArrowFile:
Returns:
PyArrowFile: A PyArrowFile instance for the given location.
"""
scheme, netloc, path = self.parse_location(location)
scheme, netloc, path = self.parse_location(location, self.properties)
return PyArrowFile(
fs=self.fs_by_scheme(scheme, netloc),
location=location,
Expand All @@ -652,7 +658,7 @@ def delete(self, location: Union[str, InputFile, OutputFile]) -> None:
an AWS error code 15.
"""
str_location = location.location if isinstance(location, (InputFile, OutputFile)) else location
scheme, netloc, path = self.parse_location(str_location)
scheme, netloc, path = self.parse_location(str_location, self.properties)
fs = self.fs_by_scheme(scheme, netloc)

try:
Expand Down
26 changes: 26 additions & 0 deletions tests/io/test_pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -2785,3 +2785,29 @@ def _expected_batch(unit: str) -> pa.RecordBatch:
)

assert _expected_batch("ns" if format_version > 2 else "us").equals(actual_result)


def test_parse_location_defaults() -> None:
"""Test that parse_location uses defaults."""

from pyiceberg.io.pyarrow import PyArrowFileIO

# if no default scheme or netloc is provided, use file scheme and empty netloc
scheme, netloc, path = PyArrowFileIO.parse_location("/foo/bar")
assert scheme == "file"
assert netloc == ""
assert path == "/foo/bar"

scheme, netloc, path = PyArrowFileIO.parse_location(
"/foo/bar", properties={"DEFAULT_SCHEME": "scheme", "DEFAULT_NETLOC": "netloc:8000"}
)
assert scheme == "scheme"
assert netloc == "netloc:8000"
assert path == "/foo/bar"

scheme, netloc, path = PyArrowFileIO.parse_location(
"/foo/bar", properties={"DEFAULT_SCHEME": "hdfs", "DEFAULT_NETLOC": "netloc:8000"}
)
assert scheme == "hdfs"
Comment on lines +2788 to +2811

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The test assertions for path are not platform-independent. The parse_location function uses os.path.abspath(location) for schemeless paths, but the test hardcodes the expected path as "/foo/bar". This will work on Unix-like systems but fail on Windows where the absolute path would be different (e.g., C:\\foo\\bar).

To make the test robust across different operating systems, you should compare the result with os.path.abspath().

Additionally, the import from pyiceberg.io.pyarrow import PyArrowFileIO is inside the test function. It's better to move it to the top of the file to follow the common style in this project. I have removed it in the suggestion below.

def test_parse_location_defaults() -> None:
    """Test that parse_location uses defaults."""

    location = "/foo/bar"
    expected_path = os.path.abspath(location)

    # if no default scheme or netloc is provided, use file scheme and empty netloc
    scheme, netloc, path = PyArrowFileIO.parse_location(location)
    assert scheme == "file"
    assert netloc == ""
    assert path == expected_path

    scheme, netloc, path = PyArrowFileIO.parse_location(
        location, properties={"DEFAULT_SCHEME": "scheme", "DEFAULT_NETLOC": "netloc:8000"}
    )
    assert scheme == "scheme"
    assert netloc == "netloc:8000"
    assert path == expected_path

    scheme, netloc, path = PyArrowFileIO.parse_location(
        location, properties={"DEFAULT_SCHEME": "hdfs", "DEFAULT_NETLOC": "netloc:8000"}
    )
    assert scheme == "hdfs"
    assert netloc == "netloc:8000"
    assert path == expected_path

assert netloc == "netloc:8000"
assert path == "/foo/bar"