From e8c3a4351288c647f921675b0fddb7bef73c473a Mon Sep 17 00:00:00 2001 From: Tom McCormick Date: Tue, 5 Aug 2025 21:54:50 -0400 Subject: [PATCH 01/11] fix file system with env variables to set scheme and net loc if not specified in file path --- pyiceberg/io/pyarrow.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 0bb1e92c07..7fb868318f 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -393,14 +393,28 @@ def __init__(self, properties: Properties = EMPTY_DICT): @staticmethod def parse_location(location: str) -> Tuple[str, str, str]: - """Return the path without the scheme.""" + """Return (scheme, netloc, path) for the given location. + Uses environment variables DEFAULT_SCHEME and DEFAULT_NETLOC + if scheme/netloc are missing. + """ uri = urlparse(location) - if not uri.scheme: - return "file", uri.netloc, os.path.abspath(location) - elif uri.scheme in ("hdfs", "viewfs"): - return uri.scheme, uri.netloc, uri.path + + # Load defaults from environment + default_scheme = os.getenv("DEFAULT_SCHEME", "file") + default_netloc = os.getenv("DEFAULT_NETLOC", "") + + # Apply logic + scheme = uri.scheme or default_scheme + netloc = uri.netloc or default_netloc + + if scheme in ("hdfs", "viewfs"): + return scheme, netloc, uri.path else: - return uri.scheme, uri.netloc, f"{uri.netloc}{uri.path}" + # For non-HDFS URIs, include netloc in the path if present + path = uri.path if uri.scheme else os.path.abspath(location) + if netloc and not path.startswith(netloc): + path = f"{netloc}{path}" + return scheme, netloc, path def _initialize_fs(self, scheme: str, netloc: Optional[str] = None) -> FileSystem: """Initialize FileSystem for different scheme.""" From ae6cc0d9e9401520e5439b94e7b2c54607540176 Mon Sep 17 00:00:00 2001 From: Tom McCormick Date: Wed, 6 Aug 2025 12:05:25 -0400 Subject: [PATCH 02/11] add test --- tests/io/test_pyarrow.py | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index d01123dfd9..da03b601a2 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -2734,7 +2734,6 @@ def test_retry_strategy_not_found() -> None: with pytest.warns(UserWarning, match="Could not initialize S3 retry strategy: pyiceberg.DoesNotExist"): io.new_input("s3://bucket/path/to/file") - @pytest.mark.parametrize("format_version", [1, 2, 3]) def test_task_to_record_batches_nanos(format_version: TableVersion, tmpdir: str) -> None: arrow_table = pa.table( @@ -2785,3 +2784,39 @@ def _expected_batch(unit: str) -> pa.RecordBatch: ) assert _expected_batch("ns" if format_version > 2 else "us").equals(actual_result) + +def test_parse_location_environment_defaults(): + """Test that parse_location uses environment variables for defaults.""" + from pyiceberg.io.pyarrow import PyArrowFileIO + import os + + # Test with default environment (no env vars set) + scheme, netloc, path = PyArrowFileIO.parse_location("/foo/bar") + assert scheme == "file" + assert netloc == "" + assert path == "/foo/bar" + + try: + # Test with environment variables set + os.environ["DEFAULT_SCHEME"] = "scheme" + os.environ["DEFAULT_NETLOC"] = "netloc:8000" + + scheme, netloc, path = PyArrowFileIO.parse_location("/foo/bar") + assert scheme == "scheme" + assert netloc == "netloc:8000" + assert path == "netloc:8000/foo/bar" + + # Set environment variables + os.environ["DEFAULT_SCHEME"] = "hdfs" + os.environ["DEFAULT_NETLOC"] = "netloc:8000" + + scheme, netloc, path = PyArrowFileIO.parse_location("/foo/bar") + assert scheme == "hdfs" + assert netloc == "netloc:8000" + assert path == "/foo/bar" + finally: + # Clean up environment variables + if "DEFAULT_SCHEME" in os.environ: + del os.environ["DEFAULT_SCHEME"] + if "DEFAULT_NETLOC" in os.environ: + del os.environ["DEFAULT_NETLOC"] From cd939c74663200c83ce20f864a42cf6d041bdc75 Mon Sep 17 00:00:00 2001 From: Tom McCormick Date: Wed, 6 Aug 2025 12:58:08 -0400 Subject: [PATCH 03/11] fix linting --- pyiceberg/io/pyarrow.py | 1 + tests/io/test_pyarrow.py | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 7fb868318f..10969dcd19 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -394,6 +394,7 @@ def __init__(self, properties: Properties = EMPTY_DICT): @staticmethod def parse_location(location: str) -> Tuple[str, str, str]: """Return (scheme, netloc, path) for the given location. + Uses environment variables DEFAULT_SCHEME and DEFAULT_NETLOC if scheme/netloc are missing. """ diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index da03b601a2..4c078d8d94 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -2786,21 +2786,23 @@ def _expected_batch(unit: str) -> pa.RecordBatch: assert _expected_batch("ns" if format_version > 2 else "us").equals(actual_result) def test_parse_location_environment_defaults(): + """Test that parse_location uses environment variables for defaults.""" - from pyiceberg.io.pyarrow import PyArrowFileIO import os - + + from pyiceberg.io.pyarrow import PyArrowFileIO + # Test with default environment (no env vars set) scheme, netloc, path = PyArrowFileIO.parse_location("/foo/bar") assert scheme == "file" assert netloc == "" assert path == "/foo/bar" - + try: # Test with environment variables set os.environ["DEFAULT_SCHEME"] = "scheme" os.environ["DEFAULT_NETLOC"] = "netloc:8000" - + scheme, netloc, path = PyArrowFileIO.parse_location("/foo/bar") assert scheme == "scheme" assert netloc == "netloc:8000" From 731075ae0657f6116857999341c4ddfba374e043 Mon Sep 17 00:00:00 2001 From: Tom McCormick Date: Mon, 25 Aug 2025 18:44:18 -0400 Subject: [PATCH 04/11] use catalog env configs and update to use default scheme and netloc from properties --- pyiceberg/io/pyarrow.py | 12 ++++++------ tests/io/test_pyarrow.py | 42 +++++++++++++++++----------------------- 2 files changed, 24 insertions(+), 30 deletions(-) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 10969dcd19..b2bc5aa117 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -392,7 +392,7 @@ def __init__(self, properties: Properties = EMPTY_DICT): super().__init__(properties=properties) @staticmethod - def parse_location(location: str) -> Tuple[str, str, str]: + def parse_location(location: str, properties: Properties=EMPTY_DICT) -> Tuple[str, str, str]: """Return (scheme, netloc, path) for the given location. Uses environment variables DEFAULT_SCHEME and DEFAULT_NETLOC @@ -401,8 +401,8 @@ def parse_location(location: str) -> Tuple[str, str, str]: uri = urlparse(location) # Load defaults from environment - default_scheme = os.getenv("DEFAULT_SCHEME", "file") - default_netloc = os.getenv("DEFAULT_NETLOC", "") + default_scheme = properties.get("DEFAULT_SCHEME", "file") + default_netloc = properties.get("DEFAULT_NETLOC", "") # Apply logic scheme = uri.scheme or default_scheme @@ -629,7 +629,7 @@ def new_input(self, location: str) -> PyArrowFile: Returns: PyArrowFile: A PyArrowFile instance for the given location. """ - scheme, netloc, path = self.parse_location(location) + scheme, netloc, path = self.parse_location(location, self.properties) return PyArrowFile( fs=self.fs_by_scheme(scheme, netloc), location=location, @@ -646,7 +646,7 @@ def new_output(self, location: str) -> PyArrowFile: Returns: PyArrowFile: A PyArrowFile instance for the given location. """ - scheme, netloc, path = self.parse_location(location) + scheme, netloc, path = self.parse_location(location, self.properties) return PyArrowFile( fs=self.fs_by_scheme(scheme, netloc), location=location, @@ -667,7 +667,7 @@ def delete(self, location: Union[str, InputFile, OutputFile]) -> None: an AWS error code 15. """ str_location = location.location if isinstance(location, (InputFile, OutputFile)) else location - scheme, netloc, path = self.parse_location(str_location) + scheme, netloc, path = self.parse_location(str_location, self.properties) fs = self.fs_by_scheme(scheme, netloc) try: diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index 4c078d8d94..043504eca5 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -2798,27 +2798,21 @@ def test_parse_location_environment_defaults(): assert netloc == "" assert path == "/foo/bar" - try: - # Test with environment variables set - os.environ["DEFAULT_SCHEME"] = "scheme" - os.environ["DEFAULT_NETLOC"] = "netloc:8000" - - scheme, netloc, path = PyArrowFileIO.parse_location("/foo/bar") - assert scheme == "scheme" - assert netloc == "netloc:8000" - assert path == "netloc:8000/foo/bar" - - # Set environment variables - os.environ["DEFAULT_SCHEME"] = "hdfs" - os.environ["DEFAULT_NETLOC"] = "netloc:8000" - - scheme, netloc, path = PyArrowFileIO.parse_location("/foo/bar") - assert scheme == "hdfs" - assert netloc == "netloc:8000" - assert path == "/foo/bar" - finally: - # Clean up environment variables - if "DEFAULT_SCHEME" in os.environ: - del os.environ["DEFAULT_SCHEME"] - if "DEFAULT_NETLOC" in os.environ: - del os.environ["DEFAULT_NETLOC"] + # Test with properties set + properties = dict() + properties["DEFAULT_SCHEME"] = "scheme" + properties["DEFAULT_NETLOC"] = "netloc:8000" + + scheme, netloc, path = PyArrowFileIO.parse_location("/foo/bar", properties=properties) + assert scheme == "scheme" + assert netloc == "netloc:8000" + assert path == "netloc:8000/foo/bar" + + # Set properties + properties["DEFAULT_SCHEME"] = "hdfs" + properties["DEFAULT_NETLOC"] = "netloc:8000" + + scheme, netloc, path = PyArrowFileIO.parse_location("/foo/bar", properties=properties) + assert scheme == "hdfs" + assert netloc == "netloc:8000" + assert path == "/foo/bar" From 9bfb7e89c9008a4ff1668118366debc454ea4c46 Mon Sep 17 00:00:00 2001 From: Tom McCormick Date: Mon, 25 Aug 2025 18:48:37 -0400 Subject: [PATCH 05/11] fix linter --- pyiceberg/io/pyarrow.py | 2 +- tests/io/test_pyarrow.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index b2bc5aa117..03e0e4c54d 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -392,7 +392,7 @@ def __init__(self, properties: Properties = EMPTY_DICT): super().__init__(properties=properties) @staticmethod - def parse_location(location: str, properties: Properties=EMPTY_DICT) -> Tuple[str, str, str]: + def parse_location(location: str, properties: Properties = EMPTY_DICT) -> Tuple[str, str, str]: """Return (scheme, netloc, path) for the given location. Uses environment variables DEFAULT_SCHEME and DEFAULT_NETLOC diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index 043504eca5..5e568c8402 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -2788,7 +2788,6 @@ def _expected_batch(unit: str) -> pa.RecordBatch: def test_parse_location_environment_defaults(): """Test that parse_location uses environment variables for defaults.""" - import os from pyiceberg.io.pyarrow import PyArrowFileIO @@ -2799,7 +2798,7 @@ def test_parse_location_environment_defaults(): assert path == "/foo/bar" # Test with properties set - properties = dict() + properties = {} properties["DEFAULT_SCHEME"] = "scheme" properties["DEFAULT_NETLOC"] = "netloc:8000" From 1a75ab64d796223797f08652d42c44b59955a62a Mon Sep 17 00:00:00 2001 From: Tom McCormick Date: Tue, 26 Aug 2025 17:40:32 -0400 Subject: [PATCH 06/11] address comments --- pyiceberg/io/pyarrow.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 03e0e4c54d..63413fafae 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -395,18 +395,17 @@ def __init__(self, properties: Properties = EMPTY_DICT): def parse_location(location: str, properties: Properties = EMPTY_DICT) -> Tuple[str, str, str]: """Return (scheme, netloc, path) for the given location. - Uses environment variables DEFAULT_SCHEME and DEFAULT_NETLOC - if scheme/netloc are missing. + Uses DEFAULT_SCHEME and DEFAULT_NETLOC if scheme/netloc are missing. """ uri = urlparse(location) - # Load defaults from environment default_scheme = properties.get("DEFAULT_SCHEME", "file") default_netloc = properties.get("DEFAULT_NETLOC", "") - # Apply logic - scheme = uri.scheme or default_scheme - netloc = uri.netloc or default_netloc + if not uri.scheme: + scheme = default_scheme + if not uri.netloc: + netloc = default_netloc if scheme in ("hdfs", "viewfs"): return scheme, netloc, uri.path From 9387d6e71c2c190f6ee4301b21e6862fa53ef956 Mon Sep 17 00:00:00 2001 From: Tom McCormick Date: Tue, 26 Aug 2025 20:00:05 -0400 Subject: [PATCH 07/11] pr comments --- tests/io/test_pyarrow.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index 5e568c8402..b81abdeb9d 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -2785,33 +2785,26 @@ def _expected_batch(unit: str) -> pa.RecordBatch: assert _expected_batch("ns" if format_version > 2 else "us").equals(actual_result) -def test_parse_location_environment_defaults(): - - """Test that parse_location uses environment variables for defaults.""" +def test_parse_location_defaults() -> None: + """Test that parse_location uses defaults.""" from pyiceberg.io.pyarrow import PyArrowFileIO - # Test with default environment (no env vars set) scheme, netloc, path = PyArrowFileIO.parse_location("/foo/bar") assert scheme == "file" assert netloc == "" assert path == "/foo/bar" - # Test with properties set - properties = {} - properties["DEFAULT_SCHEME"] = "scheme" - properties["DEFAULT_NETLOC"] = "netloc:8000" - - scheme, netloc, path = PyArrowFileIO.parse_location("/foo/bar", properties=properties) + scheme, netloc, path = PyArrowFileIO.parse_location( + "/foo/bar", properties={"DEFAULT_SCHEME": "scheme", "DEFAULT_NETLOC": "netloc:8000"} + ) assert scheme == "scheme" assert netloc == "netloc:8000" assert path == "netloc:8000/foo/bar" - # Set properties - properties["DEFAULT_SCHEME"] = "hdfs" - properties["DEFAULT_NETLOC"] = "netloc:8000" - - scheme, netloc, path = PyArrowFileIO.parse_location("/foo/bar", properties=properties) + scheme, netloc, path = PyArrowFileIO.parse_location( + "/foo/bar", properties={"DEFAULT_SCHEME": "hdfs", "DEFAULT_NETLOC": "netloc:8000"} + ) assert scheme == "hdfs" assert netloc == "netloc:8000" assert path == "/foo/bar" From 80176e80c15a7a81824a54e9c7ebbabd6a11e60b Mon Sep 17 00:00:00 2001 From: Tom McCormick Date: Tue, 26 Aug 2025 20:01:22 -0400 Subject: [PATCH 08/11] pr comments update comment --- tests/io/test_pyarrow.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index b81abdeb9d..55440f5bd6 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -2790,6 +2790,7 @@ def test_parse_location_defaults() -> None: from pyiceberg.io.pyarrow import PyArrowFileIO + # if no default scheme or netloc is provided, use file scheme and empty netloc scheme, netloc, path = PyArrowFileIO.parse_location("/foo/bar") assert scheme == "file" assert netloc == "" From 13001c8d6ccb710919aff15a743998d9e184cb95 Mon Sep 17 00:00:00 2001 From: Tom McCormick Date: Thu, 28 Aug 2025 09:07:43 -0400 Subject: [PATCH 09/11] simplify code, return to previous structure --- pyiceberg/io/pyarrow.py | 15 ++++----------- tests/io/test_pyarrow.py | 2 +- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 63413fafae..d3508316d3 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -403,18 +403,11 @@ def parse_location(location: str, properties: Properties = EMPTY_DICT) -> Tuple[ default_netloc = properties.get("DEFAULT_NETLOC", "") if not uri.scheme: - scheme = default_scheme - if not uri.netloc: - netloc = default_netloc - - if scheme in ("hdfs", "viewfs"): - return scheme, netloc, uri.path + return default_scheme, default_netloc, os.path.abspath(location) + elif uri.scheme in ("hdfs", "viewfs"): + return uri.scheme, uri.netloc, uri.path else: - # For non-HDFS URIs, include netloc in the path if present - path = uri.path if uri.scheme else os.path.abspath(location) - if netloc and not path.startswith(netloc): - path = f"{netloc}{path}" - return scheme, netloc, path + return uri.scheme, uri.netloc, f"{uri.netloc}{uri.path}" def _initialize_fs(self, scheme: str, netloc: Optional[str] = None) -> FileSystem: """Initialize FileSystem for different scheme.""" diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index 55440f5bd6..e864f64df5 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -2801,7 +2801,7 @@ def test_parse_location_defaults() -> None: ) assert scheme == "scheme" assert netloc == "netloc:8000" - assert path == "netloc:8000/foo/bar" + assert path == "/foo/bar" scheme, netloc, path = PyArrowFileIO.parse_location( "/foo/bar", properties={"DEFAULT_SCHEME": "hdfs", "DEFAULT_NETLOC": "netloc:8000"} From 35f4cdc39785334f57ee97c04cb0a00e69848089 Mon Sep 17 00:00:00 2001 From: Tom McCormick Date: Thu, 28 Aug 2025 09:12:35 -0400 Subject: [PATCH 10/11] move variable usage closer to usage --- pyiceberg/io/pyarrow.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index d3508316d3..efeaa4a2c2 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -399,10 +399,9 @@ def parse_location(location: str, properties: Properties = EMPTY_DICT) -> Tuple[ """ uri = urlparse(location) - default_scheme = properties.get("DEFAULT_SCHEME", "file") - default_netloc = properties.get("DEFAULT_NETLOC", "") - if not uri.scheme: + default_scheme = properties.get("DEFAULT_SCHEME", "file") + default_netloc = properties.get("DEFAULT_NETLOC", "") return default_scheme, default_netloc, os.path.abspath(location) elif uri.scheme in ("hdfs", "viewfs"): return uri.scheme, uri.netloc, uri.path From 2b83b79d22ac2156904a6cf05e0c103ba77b5399 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 29 Aug 2025 16:56:44 +0000 Subject: [PATCH 11/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/io/test_pyarrow.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index e864f64df5..6efaf60cb9 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -2734,6 +2734,7 @@ def test_retry_strategy_not_found() -> None: with pytest.warns(UserWarning, match="Could not initialize S3 retry strategy: pyiceberg.DoesNotExist"): io.new_input("s3://bucket/path/to/file") + @pytest.mark.parametrize("format_version", [1, 2, 3]) def test_task_to_record_batches_nanos(format_version: TableVersion, tmpdir: str) -> None: arrow_table = pa.table( @@ -2785,6 +2786,7 @@ def _expected_batch(unit: str) -> pa.RecordBatch: assert _expected_batch("ns" if format_version > 2 else "us").equals(actual_result) + def test_parse_location_defaults() -> None: """Test that parse_location uses defaults."""