Commit aef26ce
feat: Allow os.PathLike[str] in {read,scan}_* functions (#3112)
* test: Update fixtures for `str | Path`
  - First step of #3100
  - Some of the error cases don't need to test both
* fix(typing): Resolve most new warnings
  - `pyarrow.parquet.read_table` is the only one left (2x warns)
  - Seems to be a stub issue
  - Runtime *does* check `__fspath__`:
    https://github.com/apache/arrow/blob/982d31f35fd2cfe87494698dae9ef67d3333658c/python/pyarrow/parquet/core.py#L1381-L1391
* chore(typing): Ignore `pyarrow-stubs` issue for now (see last commit description)
* fix: Normalize path for `duckdb`
* ci: Ensure `pyspark` gets triggered
  - None of the existing rules apply to IO, but this is pretty important to keep working
* fix: Use `normalize_path` for spark-like
  - Fixes https://github.com/narwhals-dev/narwhals/actions/runs/17552195156/job/49847143668?pr=3112
* test: Add failing `PathLike` tests
  - Towards #3112 (comment)
* fix: Support `__fspath__` everywhere
  - Resolves #3112 (comment)
* refactor(typing): Use `FileSource` in `normalize_path`
* test(perf): Don't parametrize paths on fail cases
  - All of these are validating something unrelated to `source`
* docs: Document `FileSource` alias
1 parent 4f4f713 commit aef26ce

File tree: 6 files changed (+87, -37)
.github/workflows/pytest-pyspark.yml

Lines changed: 2 additions & 0 deletions

@@ -6,6 +6,8 @@ on:
       - narwhals/_expression_parsing.py
       - narwhals/_spark_like/**
       - narwhals/_sql/**
+      - tests/*scan*.py
+      - tests/frame/*sink*.py
   schedule:
     - cron: 0 12 * * 0 # Sunday at mid-day

narwhals/_utils.py

Lines changed: 9 additions & 0 deletions

@@ -116,6 +116,7 @@
     CompliantLazyFrame,
     CompliantSeries,
     DTypes,
+    FileSource,
     IntoSeriesT,
     MultiIndexSelector,
     SingleIndexSelector,
@@ -2156,3 +2157,11 @@ def to_pyarrow_table(tbl: pa.Table | pa.RecordBatchReader) -> pa.Table:
     if isinstance(tbl, pa.RecordBatchReader):  # pragma: no cover
         return pa.Table.from_batches(tbl)
     return tbl
+
+
+def normalize_path(source: FileSource, /) -> str:
+    if isinstance(source, str):
+        return source
+    from pathlib import Path
+
+    return str(Path(source))
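The new `normalize_path` helper is the heart of the change: strings pass through untouched, and anything implementing `__fspath__` is round-tripped through `pathlib.Path`. A minimal sketch of its behaviour; the `ExoticPath` class is hypothetical, defined only to show that the protocol, not the concrete type, is what matters:

from pathlib import Path

from narwhals._utils import normalize_path


class ExoticPath:
    """Hypothetical os.PathLike[str]: neither a str nor a Path subclass."""

    def __init__(self, path: str) -> None:
        self._path = path

    def __fspath__(self) -> str:
        return self._path


# Strings are returned as-is; PathLike objects are stringified via pathlib.
assert normalize_path("data/file.csv") == "data/file.csv"
assert normalize_path(Path("data", "file.csv")) == str(Path("data", "file.csv"))
assert normalize_path(ExoticPath("data/file.csv")) == str(Path("data/file.csv"))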

narwhals/functions.py

Lines changed: 11 additions & 8 deletions

@@ -21,6 +21,7 @@
     is_compliant_expr,
     is_eager_allowed,
     is_sequence_but_not_str,
+    normalize_path,
     supports_arrow_c_stream,
     validate_laziness,
 )
@@ -46,6 +47,7 @@
     from narwhals.dataframe import DataFrame, LazyFrame
     from narwhals.typing import (
         ConcatMethod,
+        FileSource,
         FrameT,
         IntoDType,
         IntoExpr,
@@ -564,7 +566,7 @@ def show_versions() -> None:


 def read_csv(
-    source: str, *, backend: IntoBackend[EagerAllowed], **kwargs: Any
+    source: FileSource, *, backend: IntoBackend[EagerAllowed], **kwargs: Any
 ) -> DataFrame[Any]:
     """Read a CSV file into a DataFrame.

@@ -604,7 +606,7 @@ def read_csv(
         Implementation.MODIN,
         Implementation.CUDF,
     }:
-        native_frame = native_namespace.read_csv(source, **kwargs)
+        native_frame = native_namespace.read_csv(normalize_path(source), **kwargs)
     elif impl is Implementation.PYARROW:
         from pyarrow import csv  # ignore-banned-import

@@ -634,7 +636,7 @@


 def scan_csv(
-    source: str, *, backend: IntoBackend[Backend], **kwargs: Any
+    source: FileSource, *, backend: IntoBackend[Backend], **kwargs: Any
 ) -> LazyFrame[Any]:
     """Lazily read from a CSV file.

@@ -674,6 +676,7 @@ def scan_csv(
     implementation = Implementation.from_backend(backend)
     native_namespace = implementation.to_native_namespace()
     native_frame: NativeDataFrame | NativeLazyFrame
+    source = normalize_path(source)
     if implementation is Implementation.POLARS:
         native_frame = native_namespace.scan_csv(source, **kwargs)
     elif implementation in {
@@ -693,7 +696,6 @@ def scan_csv(
         if (session := kwargs.pop("session", None)) is None:
             msg = "Spark like backends require a session object to be passed in `kwargs`."
             raise ValueError(msg)
-
         csv_reader = session.read.format("csv")
         native_frame = (
             csv_reader.load(source)
@@ -715,7 +717,7 @@


 def read_parquet(
-    source: str, *, backend: IntoBackend[EagerAllowed], **kwargs: Any
+    source: FileSource, *, backend: IntoBackend[EagerAllowed], **kwargs: Any
 ) -> DataFrame[Any]:
     """Read into a DataFrame from a parquet file.

@@ -760,11 +762,12 @@ def read_parquet(
         Implementation.MODIN,
         Implementation.CUDF,
     }:
+        source = normalize_path(source)
         native_frame = native_namespace.read_parquet(source, **kwargs)
     elif impl is Implementation.PYARROW:
         import pyarrow.parquet as pq  # ignore-banned-import

-        native_frame = pq.read_table(source, **kwargs)
+        native_frame = pq.read_table(source, **kwargs)  # type: ignore[arg-type]
     elif impl in {
         Implementation.PYSPARK,
         Implementation.DASK,
@@ -790,7 +793,7 @@


 def scan_parquet(
-    source: str, *, backend: IntoBackend[Backend], **kwargs: Any
+    source: FileSource, *, backend: IntoBackend[Backend], **kwargs: Any
 ) -> LazyFrame[Any]:
     """Lazily read from a parquet file.

@@ -857,6 +860,7 @@ def scan_parquet(
     implementation = Implementation.from_backend(backend)
     native_namespace = implementation.to_native_namespace()
     native_frame: NativeDataFrame | NativeLazyFrame
+    source = normalize_path(source)
     if implementation is Implementation.POLARS:
         native_frame = native_namespace.scan_parquet(source, **kwargs)
     elif implementation in {
@@ -876,7 +880,6 @@ def scan_parquet(
         if (session := kwargs.pop("session", None)) is None:
             msg = "Spark like backends require a session object to be passed in `kwargs`."
             raise ValueError(msg)
-
         pq_reader = session.read.format("parquet")
         native_frame = (
             pq_reader.load(source)
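With the signatures widened and `normalize_path` applied on entry, callers no longer need to stringify paths themselves. A usage sketch, assuming a local `file.csv` and `file.parquet` exist and the polars backend is installed:

from pathlib import Path

import narwhals as nw

# Every {read,scan}_* entry point now accepts str, pathlib.Path, or any
# os.PathLike[str] interchangeably.
df = nw.read_csv(Path("file.csv"), backend="polars")
lf = nw.scan_parquet(Path("file.parquet"), backend="polars")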

narwhals/stable/v1/__init__.py

Lines changed: 5 additions & 4 deletions

@@ -80,6 +80,7 @@
     from narwhals.dataframe import MultiColSelector, MultiIndexSelector
     from narwhals.dtypes import DType
     from narwhals.typing import (
+        FileSource,
         IntoDType,
         IntoExpr,
         IntoFrame,
@@ -1280,7 +1281,7 @@ def from_numpy(

 @deprecate_native_namespace(required=True)
 def read_csv(
-    source: str,
+    source: FileSource,
     *,
     backend: IntoBackend[EagerAllowed] | None = None,
     native_namespace: ModuleType | None = None,  # noqa: ARG001
@@ -1298,7 +1299,7 @@ def read_csv(

 @deprecate_native_namespace(required=True)
 def scan_csv(
-    source: str,
+    source: FileSource,
     *,
     backend: IntoBackend[Backend] | None = None,
     native_namespace: ModuleType | None = None,  # noqa: ARG001
@@ -1316,7 +1317,7 @@ def scan_csv(

 @deprecate_native_namespace(required=True)
 def read_parquet(
-    source: str,
+    source: FileSource,
     *,
     backend: IntoBackend[EagerAllowed] | None = None,
     native_namespace: ModuleType | None = None,  # noqa: ARG001
@@ -1334,7 +1335,7 @@ def read_parquet(

 @deprecate_native_namespace(required=True)
 def scan_parquet(
-    source: str,
+    source: FileSource,
     *,
     backend: IntoBackend[Backend] | None = None,
     native_namespace: ModuleType | None = None,  # noqa: ARG001
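`narwhals.stable.v1` receives the same widening, so code pinned to the stable API benefits without changes. A sketch under the same assumptions as above (a local `file.parquet`, polars installed):

from pathlib import Path

import narwhals.stable.v1 as nw_v1

# The stable wrappers forward `source` to the main implementations,
# so PathLike sources behave identically here.
df = nw_v1.read_parquet(Path("file.parquet"), backend="polars")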

narwhals/typing.py

Lines changed: 10 additions & 0 deletions

@@ -8,6 +8,7 @@

 if TYPE_CHECKING:
     import datetime as dt
+    import os
     from collections.abc import Iterable, Sequence, Sized
     from decimal import Decimal
     from types import ModuleType
@@ -432,6 +433,15 @@ def Binary(self) -> type[dtypes.Binary]: ...
 IntoPolarsSchema: TypeAlias = "pl.Schema | Mapping[str, pl.DataType]"
 IntoPandasSchema: TypeAlias = Mapping[str, PandasLikeDType]

+FileSource: TypeAlias = "str | os.PathLike[str]"
+"""Path to a file.
+
+Either a string or an object that implements [`__fspath__`], such as [`pathlib.Path`].
+
+[`__fspath__`]: https://docs.python.org/3/library/os.html#os.PathLike
+[`pathlib.Path`]: https://docs.python.org/3/library/pathlib.html#pathlib.Path
+"""
+

 # Annotations for `__getitem__` methods
 _T = TypeVar("_T")
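`FileSource` is the static-typing mirror of the runtime `os.PathLike` protocol; `os.fspath` is the standard-library function that collapses either branch of the union to a plain string. A brief illustration of the contract the alias describes:

import os
from pathlib import Path

# os.fspath returns str arguments unchanged and calls __fspath__ otherwise.
assert os.fspath("data.csv") == "data.csv"
assert os.fspath(Path("data.csv")) == "data.csv"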

tests/read_scan_test.py

Lines changed: 50 additions & 25 deletions

@@ -1,7 +1,7 @@
 from __future__ import annotations

 import re
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Literal

 import pandas as pd
 import pytest
@@ -20,9 +20,15 @@

 if TYPE_CHECKING:
     from collections.abc import Mapping
+    from pathlib import Path
     from types import ModuleType

+    from typing_extensions import TypeAlias
+
     from narwhals._typing import EagerAllowed, _LazyOnly, _SparkLike
+    from narwhals.typing import FileSource
+
+    IOSourceKind: TypeAlias = Literal["str", "Path", "PathLike"]

 data: Mapping[str, Any] = {"a": [1, 2, 3], "b": [4.5, 6.7, 8.9], "z": ["x", "y", "w"]}
 skipif_pandas_lt_1_5 = pytest.mark.skipif(
@@ -32,20 +38,39 @@
 spark_like_backend = pytest.mark.parametrize("backend", ["pyspark", "sqlframe"])


-@pytest.fixture(scope="module")
-def csv_path(tmp_path_factory: pytest.TempPathFactory) -> str:
+class MockPathLike:
+    def __init__(self, path: Path) -> None:
+        self._super_secret: Path = path
+
+    def __fspath__(self) -> str:
+        return self._super_secret.__fspath__()
+
+
+def _into_file_source(source: Path, which: IOSourceKind, /) -> FileSource:
+    mapping: Mapping[IOSourceKind, FileSource] = {
+        "str": str(source),
+        "Path": source,
+        "PathLike": MockPathLike(source),
+    }
+    return mapping[which]
+
+
+@pytest.fixture(scope="module", params=["str", "Path", "PathLike"])
+def csv_path(
+    tmp_path_factory: pytest.TempPathFactory, request: pytest.FixtureRequest
+) -> FileSource:
     fp = tmp_path_factory.mktemp("data") / "file.csv"
-    filepath = str(fp)
-    pl.DataFrame(data).write_csv(filepath)
-    return filepath
+    pl.DataFrame(data).write_csv(fp)
+    return _into_file_source(fp, request.param)


-@pytest.fixture(scope="module")
-def parquet_path(tmp_path_factory: pytest.TempPathFactory) -> str:
+@pytest.fixture(scope="module", params=["str", "Path", "PathLike"])
+def parquet_path(
+    tmp_path_factory: pytest.TempPathFactory, request: pytest.FixtureRequest
+) -> FileSource:
     fp = tmp_path_factory.mktemp("data") / "file.parquet"
-    filepath = str(fp)
-    pl.DataFrame(data).write_parquet(filepath)
-    return filepath
+    pl.DataFrame(data).write_parquet(fp)
+    return _into_file_source(fp, request.param)


 def assert_equal_eager(result: nw.DataFrame[Any]) -> None:
@@ -62,23 +87,23 @@ def native_namespace(cb: Constructor, /) -> ModuleType:
     return nw.get_native_namespace(nw.from_native(cb(data)))  # type: ignore[no-any-return]


-def test_read_csv(csv_path: str, eager_backend: EagerAllowed) -> None:
+def test_read_csv(csv_path: FileSource, eager_backend: EagerAllowed) -> None:
     assert_equal_eager(nw.read_csv(csv_path, backend=eager_backend))


 @skipif_pandas_lt_1_5
-def test_read_csv_kwargs(csv_path: str) -> None:
+def test_read_csv_kwargs(csv_path: FileSource) -> None:
     assert_equal_eager(nw.read_csv(csv_path, backend=pd, engine="pyarrow"))


 @lazy_core_backend
-def test_read_csv_raise_with_lazy(csv_path: str, backend: _LazyOnly) -> None:
+def test_read_csv_raise_with_lazy(backend: _LazyOnly) -> None:
     pytest.importorskip(backend)
     with pytest.raises(ValueError, match="Expected eager backend, found"):
-        nw.read_csv(csv_path, backend=backend)  # type: ignore[arg-type]
+        nw.read_csv("unused.csv", backend=backend)  # type: ignore[arg-type]


-def test_scan_csv(csv_path: str, constructor: Constructor) -> None:
+def test_scan_csv(csv_path: FileSource, constructor: Constructor) -> None:
     kwargs: dict[str, Any]
     if "sqlframe" in str(constructor):
         kwargs = {"session": sqlframe_session(), "inferSchema": True, "header": True}
@@ -91,29 +116,29 @@ def test_scan_csv(csv_path: str, constructor: Constructor) -> None:


 @skipif_pandas_lt_1_5
-def test_scan_csv_kwargs(csv_path: str) -> None:
+def test_scan_csv_kwargs(csv_path: FileSource) -> None:
     assert_equal_data(nw.scan_csv(csv_path, backend=pd, engine="pyarrow"), data)


 @skipif_pandas_lt_1_5
-def test_read_parquet(parquet_path: str, eager_backend: EagerAllowed) -> None:
+def test_read_parquet(parquet_path: FileSource, eager_backend: EagerAllowed) -> None:
     assert_equal_eager(nw.read_parquet(parquet_path, backend=eager_backend))


 @skipif_pandas_lt_1_5
-def test_read_parquet_kwargs(parquet_path: str) -> None:
+def test_read_parquet_kwargs(parquet_path: FileSource) -> None:
     assert_equal_eager(nw.read_parquet(parquet_path, backend=pd, engine="pyarrow"))


 @lazy_core_backend
-def test_read_parquet_raise_with_lazy(parquet_path: str, backend: _LazyOnly) -> None:
+def test_read_parquet_raise_with_lazy(backend: _LazyOnly) -> None:
     pytest.importorskip(backend)
     with pytest.raises(ValueError, match="Expected eager backend, found"):
-        nw.read_parquet(parquet_path, backend=backend)  # type: ignore[arg-type]
+        nw.read_parquet("unused.parquet", backend=backend)  # type: ignore[arg-type]


 @skipif_pandas_lt_1_5
-def test_scan_parquet(parquet_path: str, constructor: Constructor) -> None:
+def test_scan_parquet(parquet_path: FileSource, constructor: Constructor) -> None:
     kwargs: dict[str, Any]
     if "sqlframe" in str(constructor):
         kwargs = {"session": sqlframe_session(), "inferSchema": True}
@@ -126,16 +151,16 @@ def test_scan_parquet(parquet_path: str, constructor: Constructor) -> None:


 @skipif_pandas_lt_1_5
-def test_scan_parquet_kwargs(parquet_path: str) -> None:
+def test_scan_parquet_kwargs(parquet_path: FileSource) -> None:
     assert_equal_lazy(nw.scan_parquet(parquet_path, backend=pd, engine="pyarrow"))


 @spark_like_backend
 @pytest.mark.parametrize("scan_method", ["scan_csv", "scan_parquet"])
 def test_scan_fail_spark_like_without_session(
-    parquet_path: str, backend: _SparkLike, scan_method: str
+    backend: _SparkLike, scan_method: str
 ) -> None:
     pytest.importorskip(backend)
     pattern = re.compile(r"spark.+backend.+require.+session", re.IGNORECASE)
     with pytest.raises(ValueError, match=pattern):
-        getattr(nw, scan_method)(parquet_path, backend=backend)
+        getattr(nw, scan_method)("unused.csv", backend=backend)
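`MockPathLike` deliberately hides its `Path` behind a private attribute: it is neither a `str` nor a `Path` subclass, so any backend that passes these parametrized fixtures must be resolving the source through `__fspath__` alone. A quick check of that property, reusing the class from the diff above:

import os
from pathlib import Path

mock = MockPathLike(Path("file.csv"))  # class defined in the test diff above
assert not isinstance(mock, (str, Path))
assert os.fspath(mock) == os.fspath(Path("file.csv"))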
