Skip to content

Commit 19ba343

Browse files
mccormickt12 and Tom McCormick
authored
feat: allow default scheme and netloc for schemeless path (apache#2291)
<!-- Thanks for opening a pull request! -->
<!-- In the case this PR will resolve an issue, please replace ${GITHUB_ISSUE_ID} below with the actual Github issue id. -->
<!-- Closes #${GITHUB_ISSUE_ID} -->

# Rationale for this change

For HDFS it's common to get the scheme and netloc from config and have paths contain only the path portion of the URI. This PR adds properties to configure `DEFAULT_SCHEME` and `DEFAULT_NETLOC`.

For example:

```
from pyiceberg.catalog import load_catalog

catalog = load_catalog("default", {
    ...,
    "DEFAULT_SCHEME": "hdfs",
    "DEFAULT_NETLOC": "ltx1-yugioh-cluster01.linkfs.prod-ltx1.atd.prod.linkedin.com:9000",
})
```

or, if not using a catalog:

```
static_table = StaticTable.from_metadata(
    "/warehouse/wh/nyc.db/taxis/metadata/00002-6ea51ce3-62aa-4197-9cf8-43d07c3440ca.metadata.json",
    properties={
        'DEFAULT_SCHEME': 'hdfs',
        'DEFAULT_NETLOC': 'ltx1-yugioh-cluster01.linkfs.prod-ltx1.atd.prod.linkedin.com:9000',
    }
)
```

Previously, schemeless paths were assumed to be for the local filesystem only. This PR allows schemeless paths to be passed to the HDFS filesystem.

# Are these changes tested?

Tested in a test environment at LinkedIn and with unit tests.

# Are there any user-facing changes?

No user-facing changes by default. If you set these properties, a file path that doesn't have a scheme/netloc will use the specified defaults.

<!-- In the case of user-facing changes, please add the changelog label. -->

---------

Co-authored-by: Tom McCormick <[email protected]>
1 parent 86b0fe8 commit 19ba343

File tree

2 files changed

+38
-6
lines changed

2 files changed

+38
-6
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -392,11 +392,17 @@ def __init__(self, properties: Properties = EMPTY_DICT):
392392
super().__init__(properties=properties)
393393

394394
@staticmethod
395-
def parse_location(location: str) -> Tuple[str, str, str]:
396-
"""Return the path without the scheme."""
395+
def parse_location(location: str, properties: Properties = EMPTY_DICT) -> Tuple[str, str, str]:
396+
"""Return (scheme, netloc, path) for the given location.
397+
398+
Uses DEFAULT_SCHEME and DEFAULT_NETLOC if scheme/netloc are missing.
399+
"""
397400
uri = urlparse(location)
401+
398402
if not uri.scheme:
399-
return "file", uri.netloc, os.path.abspath(location)
403+
default_scheme = properties.get("DEFAULT_SCHEME", "file")
404+
default_netloc = properties.get("DEFAULT_NETLOC", "")
405+
return default_scheme, default_netloc, os.path.abspath(location)
400406
elif uri.scheme in ("hdfs", "viewfs"):
401407
return uri.scheme, uri.netloc, uri.path
402408
else:
@@ -614,7 +620,7 @@ def new_input(self, location: str) -> PyArrowFile:
614620
Returns:
615621
PyArrowFile: A PyArrowFile instance for the given location.
616622
"""
617-
scheme, netloc, path = self.parse_location(location)
623+
scheme, netloc, path = self.parse_location(location, self.properties)
618624
return PyArrowFile(
619625
fs=self.fs_by_scheme(scheme, netloc),
620626
location=location,
@@ -631,7 +637,7 @@ def new_output(self, location: str) -> PyArrowFile:
631637
Returns:
632638
PyArrowFile: A PyArrowFile instance for the given location.
633639
"""
634-
scheme, netloc, path = self.parse_location(location)
640+
scheme, netloc, path = self.parse_location(location, self.properties)
635641
return PyArrowFile(
636642
fs=self.fs_by_scheme(scheme, netloc),
637643
location=location,
@@ -652,7 +658,7 @@ def delete(self, location: Union[str, InputFile, OutputFile]) -> None:
652658
an AWS error code 15.
653659
"""
654660
str_location = location.location if isinstance(location, (InputFile, OutputFile)) else location
655-
scheme, netloc, path = self.parse_location(str_location)
661+
scheme, netloc, path = self.parse_location(str_location, self.properties)
656662
fs = self.fs_by_scheme(scheme, netloc)
657663

658664
try:

tests/io/test_pyarrow.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2785,3 +2785,29 @@ def _expected_batch(unit: str) -> pa.RecordBatch:
27852785
)
27862786

27872787
assert _expected_batch("ns" if format_version > 2 else "us").equals(actual_result)
2788+
2789+
2790+
def test_parse_location_defaults() -> None:
2791+
"""Test that parse_location uses defaults."""
2792+
2793+
from pyiceberg.io.pyarrow import PyArrowFileIO
2794+
2795+
# if no default scheme or netloc is provided, use file scheme and empty netloc
2796+
scheme, netloc, path = PyArrowFileIO.parse_location("/foo/bar")
2797+
assert scheme == "file"
2798+
assert netloc == ""
2799+
assert path == "/foo/bar"
2800+
2801+
scheme, netloc, path = PyArrowFileIO.parse_location(
2802+
"/foo/bar", properties={"DEFAULT_SCHEME": "scheme", "DEFAULT_NETLOC": "netloc:8000"}
2803+
)
2804+
assert scheme == "scheme"
2805+
assert netloc == "netloc:8000"
2806+
assert path == "/foo/bar"
2807+
2808+
scheme, netloc, path = PyArrowFileIO.parse_location(
2809+
"/foo/bar", properties={"DEFAULT_SCHEME": "hdfs", "DEFAULT_NETLOC": "netloc:8000"}
2810+
)
2811+
assert scheme == "hdfs"
2812+
assert netloc == "netloc:8000"
2813+
assert path == "/foo/bar"

0 commit comments

Comments
 (0)