Skip to content

Fix filesystem #2291

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 21 additions & 6 deletions pyiceberg/io/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,14 +388,29 @@ def __init__(self, properties: Properties = EMPTY_DICT):

@staticmethod
def parse_location(location: str) -> Tuple[str, str, str]:
"""Return the path without the scheme."""
"""Return (scheme, netloc, path) for the given location.

Uses environment variables DEFAULT_SCHEME and DEFAULT_NETLOC
if scheme/netloc are missing.
"""
uri = urlparse(location)
if not uri.scheme:
return "file", uri.netloc, os.path.abspath(location)
elif uri.scheme in ("hdfs", "viewfs"):
return uri.scheme, uri.netloc, uri.path

# Load defaults from environment
default_scheme = os.getenv("DEFAULT_SCHEME", "file")
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we use the central config (pyiceberg/utils/config.py) instead of reading environment variables directly?

This would enable configuration via a file OR environment variables, which is how most other configs are documented and exposed to catalog construction.

default_netloc = os.getenv("DEFAULT_NETLOC", "")

# Apply logic
scheme = uri.scheme or default_scheme
netloc = uri.netloc or default_netloc

if scheme in ("hdfs", "viewfs"):
return scheme, netloc, uri.path
else:
return uri.scheme, uri.netloc, f"{uri.netloc}{uri.path}"
# For non-HDFS URIs, include netloc in the path if present
path = uri.path if uri.scheme else os.path.abspath(location)
if netloc and not path.startswith(netloc):
path = f"{netloc}{path}"
return scheme, netloc, path
Comment on lines +404 to +415
Copy link
Contributor

@kevinjqliu kevinjqliu Aug 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually really want to get rid of this `if {scheme}` logic here.

Is there a way to refactor these changes down into `_initialize_hdfs_fs`, so we can keep the HDFS logic in the same place?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see a nice way to do this, since the path used in the pyarrowfile is actually different in the different cases. I tried to see whether we could use the same path with the netloc in it for HDFS, but it doesn't seem to work.
#2291 (comment)


def _initialize_fs(self, scheme: str, netloc: Optional[str] = None) -> FileSystem:
"""Initialize FileSystem for different scheme."""
Expand Down
38 changes: 38 additions & 0 deletions tests/io/test_pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -2638,3 +2638,41 @@ def test_retry_strategy_not_found() -> None:
io = PyArrowFileIO(properties={S3_RETRY_STRATEGY_IMPL: "pyiceberg.DoesNotExist"})
with pytest.warns(UserWarning, match="Could not initialize S3 retry strategy: pyiceberg.DoesNotExist"):
io.new_input("s3://bucket/path/to/file")


def test_parse_location_environment_defaults() -> None:
    """Test that parse_location uses environment variables for defaults.

    Three cases are exercised for the scheme-less location ``/foo/bar``:

    1. Neither variable set: scheme is ``file`` and netloc is empty.
    2. A non-HDFS default scheme: the default netloc is prefixed to the path.
    3. ``hdfs`` as default scheme: the path is returned without the netloc.

    Any pre-existing ``DEFAULT_SCHEME`` / ``DEFAULT_NETLOC`` values are removed
    for the duration of the test and restored afterwards, so the test neither
    depends on nor clobbers the ambient environment.
    """
    import os

    from pyiceberg.io.pyarrow import PyArrowFileIO

    env_keys = ("DEFAULT_SCHEME", "DEFAULT_NETLOC")
    # Snapshot and remove ambient values; the first case asserts on the
    # "nothing configured" behaviour and must not see a developer's settings.
    saved_env = {key: os.environ.pop(key) for key in env_keys if key in os.environ}

    try:
        # Test with default environment (no env vars set)
        scheme, netloc, path = PyArrowFileIO.parse_location("/foo/bar")
        assert scheme == "file"
        assert netloc == ""
        assert path == "/foo/bar"

        # Test with environment variables set to a non-HDFS scheme
        os.environ["DEFAULT_SCHEME"] = "scheme"
        os.environ["DEFAULT_NETLOC"] = "netloc:8000"

        scheme, netloc, path = PyArrowFileIO.parse_location("/foo/bar")
        assert scheme == "scheme"
        assert netloc == "netloc:8000"
        assert path == "netloc:8000/foo/bar"

        # Test with HDFS as the default scheme
        os.environ["DEFAULT_SCHEME"] = "hdfs"
        os.environ["DEFAULT_NETLOC"] = "netloc:8000"

        scheme, netloc, path = PyArrowFileIO.parse_location("/foo/bar")
        assert scheme == "hdfs"
        assert netloc == "netloc:8000"
        assert path == "/foo/bar"
    finally:
        # Drop whatever the test set, then put back the original values
        # instead of leaving the variables deleted.
        for key in env_keys:
            os.environ.pop(key, None)
        os.environ.update(saved_env)