Skip to content
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
409dd4b
feat: add `separator` argument to `read_csv` / `scan_csv`
raisadz Aug 14, 2025
8143ae3
Merge remote-tracking branch 'upstream/main' into feat/add-separator-arg
raisadz Aug 14, 2025
9d6e850
add stable api
raisadz Aug 14, 2025
9000f88
Merge remote-tracking branch 'upstream/main' into feat/add-separator-arg
raisadz Aug 14, 2025
b99dfcd
add coverage
raisadz Aug 14, 2025
6b90890
Merge remote-tracking branch 'upstream/main' into feat/add-separator-arg
raisadz Aug 14, 2025
c4ff1c6
add session for sqlframe for coverage
raisadz Aug 14, 2025
00f0bc2
Update narwhals/functions.py
raisadz Aug 17, 2025
af21d2f
add separator validation
raisadz Aug 18, 2025
59a5b6b
Merge remote-tracking branch 'upstream/main' into feat/add-separator-arg
raisadz Aug 18, 2025
d0c7283
fix merge
raisadz Aug 18, 2025
ff68327
modify kwargs for pyarrow
raisadz Aug 18, 2025
b7cb02c
restore header that was there before
raisadz Aug 18, 2025
7cfae8f
Merge remote-tracking branch 'upstream/main' into feat/add-separator-arg
raisadz Aug 19, 2025
126c5c4
Update narwhals/functions.py
raisadz Aug 19, 2025
cf7c67d
Merge remote-tracking branch 'origin/feat/add-separator-arg' into fea…
raisadz Aug 19, 2025
8ace0f9
Merge remote-tracking branch 'upstream/main' into feat/add-separator-arg
raisadz Aug 22, 2025
512c529
make `validate` support functions private
raisadz Aug 23, 2025
ec12904
Merge remote-tracking branch 'upstream/main' into feat/add-separator-arg
raisadz Aug 23, 2025
bf4c269
Merge remote-tracking branch 'upstream/main' into feat/add-separator-arg
raisadz Oct 20, 2025
003d3e7
readd tests
raisadz Oct 20, 2025
4fd93fc
Merge remote-tracking branch 'upstream/main' into feat/add-separator-arg
raisadz Oct 20, 2025
7629ce7
add pyarrow parse_options for coverage
raisadz Oct 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 34 additions & 16 deletions narwhals/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,7 +560,11 @@ def show_versions() -> None:


def read_csv(
source: str, *, backend: ModuleType | Implementation | str, **kwargs: Any
source: str,
*,
backend: ModuleType | Implementation | str,
separator: str = ",",
**kwargs: Any,
) -> DataFrame[Any]:
"""Read a CSV file into a DataFrame.

Expand All @@ -573,6 +577,7 @@ def read_csv(
`POLARS`, `MODIN` or `CUDF`.
- As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`.
- Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`.
separator: Single byte character to use as separator in the file.
kwargs: Extra keyword arguments which are passed to the native CSV reader.
For example, you could use
`nw.read_csv('file.csv', backend='pandas', engine='pyarrow')`.
Expand All @@ -594,17 +599,19 @@ def read_csv(
impl = Implementation.from_backend(backend)
native_namespace = impl.to_native_namespace()
native_frame: NativeFrame
if impl in {
Implementation.POLARS,
Implementation.PANDAS,
Implementation.MODIN,
Implementation.CUDF,
}:
native_frame = native_namespace.read_csv(source, **kwargs)
if impl in {Implementation.PANDAS, Implementation.MODIN, Implementation.CUDF}:
native_frame = native_namespace.read_csv(source, sep=separator, **kwargs)
elif impl is Implementation.POLARS:
native_frame = native_namespace.read_csv(source, separator=separator, **kwargs)
elif impl is Implementation.PYARROW:
if separator is not None and "parse_options" in kwargs:
msg = "Can't pass both `separator` and `parse_options`."
raise TypeError(msg)
from pyarrow import csv # ignore-banned-import

native_frame = csv.read_csv(source, **kwargs)
native_frame = csv.read_csv(
source, parse_options=csv.ParseOptions(delimiter=separator), **kwargs
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is a bit odd:

  1. separator is not typed to be None

  2. Even if that were the case, the following would not raise an error at line 607:

    nw.read_csv(..., separator=None, parse_options=csv.ParseOptions(...), backend=nw.Implementation.PYARROW)

    However, then in line 613, we would call

    csv.read_csv(
            source, parse_options=csv.ParseOptions(delimiter=None), parse_options=parse_options, ...
        )

    which would end up raising an exception at this point

  3. Should we handle this the same way for the other backends? i.e. for the pandas-like backends, check that `sep` is not also passed in `kwargs`, and likewise below for the lazy backends

elif impl in {
Implementation.PYSPARK,
Implementation.DASK,
Expand All @@ -630,7 +637,11 @@ def read_csv(


def scan_csv(
source: str, *, backend: ModuleType | Implementation | str, **kwargs: Any
source: str,
*,
backend: ModuleType | Implementation | str,
separator: str = ",",
**kwargs: Any,
) -> LazyFrame[Any]:
"""Lazily read from a CSV file.

Expand All @@ -646,6 +657,7 @@ def scan_csv(
`POLARS`, `MODIN` or `CUDF`.
- As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`.
- Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`.
separator: Single byte character to use as separator in the file.
kwargs: Extra keyword arguments which are passed to the native CSV reader.
For example, you could use
`nw.scan_csv('file.csv', backend=pd, engine='pyarrow')`.
Expand All @@ -671,33 +683,39 @@ def scan_csv(
native_namespace = implementation.to_native_namespace()
native_frame: NativeFrame | NativeLazyFrame
if implementation is Implementation.POLARS:
native_frame = native_namespace.scan_csv(source, **kwargs)
native_frame = native_namespace.scan_csv(source, separator=separator, **kwargs)
elif implementation in {
Implementation.PANDAS,
Implementation.MODIN,
Implementation.CUDF,
Implementation.DASK,
Implementation.DUCKDB,
Implementation.IBIS,
}:
native_frame = native_namespace.read_csv(source, **kwargs)
native_frame = native_namespace.read_csv(source, sep=separator, **kwargs)
elif implementation in {Implementation.DUCKDB}:
native_frame = native_namespace.read_csv(source, delimiter=separator, **kwargs)
elif implementation is Implementation.PYARROW:
if separator is not None and "parse_options" in kwargs:
msg = "Can't pass both `separator` and `parse_options`."
raise TypeError(msg)
from pyarrow import csv # ignore-banned-import

native_frame = csv.read_csv(source, **kwargs)
native_frame = csv.read_csv(
source, parse_options=csv.ParseOptions(delimiter=separator), **kwargs
)
elif implementation.is_spark_like():
if (session := kwargs.pop("session", None)) is None:
msg = "Spark like backends require a session object to be passed in `kwargs`."
raise ValueError(msg)

csv_reader = session.read.format("csv")
native_frame = (
csv_reader.load(source)
csv_reader.load(source, sep=separator)
if (
implementation is Implementation.SQLFRAME
and implementation._backend_version() < (3, 27, 0)
)
else csv_reader.options(**kwargs).load(source)
else csv_reader.options(sep=separator, **kwargs).load(source)
)
else: # pragma: no cover
try:
Expand Down
22 changes: 18 additions & 4 deletions narwhals/stable/v2/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1055,7 +1055,11 @@ def from_numpy(


def read_csv(
source: str, *, backend: ModuleType | Implementation | str, **kwargs: Any
source: str,
*,
backend: ModuleType | Implementation | str,
separator: str = ",",
**kwargs: Any,
) -> DataFrame[Any]:
"""Read a CSV file into a DataFrame.

Expand All @@ -1068,18 +1072,25 @@ def read_csv(
`POLARS`, `MODIN` or `CUDF`.
- As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`.
- Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`.
separator: Single byte character to use as separator in the file.
kwargs: Extra keyword arguments which are passed to the native CSV reader.
For example, you could use
`nw.read_csv('file.csv', backend='pandas', engine='pyarrow')`.

Returns:
DataFrame.
"""
return _stableify(nw_f.read_csv(source, backend=backend, **kwargs))
return _stableify(
nw_f.read_csv(source, backend=backend, separator=separator, **kwargs)
)


def scan_csv(
source: str, *, backend: ModuleType | Implementation | str, **kwargs: Any
source: str,
*,
backend: ModuleType | Implementation | str,
separator: str = ",",
**kwargs: Any,
) -> LazyFrame[Any]:
"""Lazily read from a CSV file.

Expand All @@ -1095,14 +1106,17 @@ def scan_csv(
`POLARS`, `MODIN` or `CUDF`.
- As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`.
- Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`.
separator: Single byte character to use as separator in the file.
kwargs: Extra keyword arguments which are passed to the native CSV reader.
For example, you could use
`nw.scan_csv('file.csv', backend=pd, engine='pyarrow')`.

Returns:
LazyFrame.
"""
return _stableify(nw_f.scan_csv(source, backend=backend, **kwargs))
return _stableify(
nw_f.scan_csv(source, backend=backend, separator=separator, **kwargs)
)


def read_parquet(
Expand Down
48 changes: 46 additions & 2 deletions tests/read_scan_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ def test_read_csv(tmpdir: pytest.TempdirFactory, backend: Implementation | str)
result = nw.read_csv(filepath, backend=backend)
assert_equal_data(result, data)
assert isinstance(result, nw.DataFrame)
df_pl.write_csv(filepath, separator=";")
result = nw.read_csv(filepath, backend=backend, separator=";")
assert_equal_data(result, data)
assert isinstance(result, nw.DataFrame)


@pytest.mark.skipif(PANDAS_VERSION < (1, 5), reason="too old for pyarrow")
Expand All @@ -57,12 +61,20 @@ def test_read_csv_raise_with_lazy(tmpdir: pytest.TempdirFactory, backend: str) -
nw.read_csv(filepath, backend=backend)


def test_scan_csv(tmpdir: pytest.TempdirFactory, constructor: Constructor) -> None:
def test_scan_csv(
tmpdir: pytest.TempdirFactory,
constructor: Constructor,
request: pytest.FixtureRequest,
) -> None:
kwargs: dict[str, Any]
if "sqlframe" in str(constructor):
from sqlframe.duckdb import DuckDBSession

kwargs = {"session": DuckDBSession(), "inferSchema": True, "header": True}
kwargs = {"session": DuckDBSession(), "inferSchema": True}

request.applymarker(
pytest.mark.xfail(reason="https://github.com/eakmanrq/sqlframe/issues/469")
)
elif "pyspark" in str(constructor):
if is_spark_connect := os.environ.get("SPARK_CONNECT", None):
from pyspark.sql.connect.session import SparkSession
Expand Down Expand Up @@ -96,6 +108,12 @@ def test_scan_csv(tmpdir: pytest.TempdirFactory, constructor: Constructor) -> No
result = nw.scan_csv(filepath, backend=backend, **kwargs)
assert_equal_data(result, data)
assert isinstance(result, nw.LazyFrame)
df_pl.write_csv(filepath, separator="|")
df = nw.from_native(constructor(data))
backend = nw.get_native_namespace(df)
result = nw.scan_csv(filepath, backend=backend, separator="|", **kwargs)
assert_equal_data(result, data)
assert isinstance(result, nw.LazyFrame)


@pytest.mark.skipif(PANDAS_VERSION < (1, 5), reason="too old for pyarrow")
Expand All @@ -107,6 +125,32 @@ def test_scan_csv_kwargs(tmpdir: pytest.TempdirFactory) -> None:
assert_equal_data(result, data)


def test_read_csv_raise_sep_multiple(tmpdir: pytest.TempdirFactory) -> None:
pytest.importorskip("pyarrow")
import pyarrow as pa
from pyarrow import csv

df_pl = pl.DataFrame(data)
filepath = str(tmpdir / "file.csv") # type: ignore[operator]
df_pl.write_csv(filepath)

msg = "Can't pass both `separator` and `parse_options`."
with pytest.raises(TypeError, match=msg):
nw.read_csv(
filepath,
backend=pa,
separator="|",
parse_options=csv.ParseOptions(delimiter=";"),
)
with pytest.raises(TypeError, match=msg):
nw.scan_csv(
filepath,
backend=pa,
separator="|",
parse_options=csv.ParseOptions(delimiter=";"),
)


@pytest.mark.skipif(PANDAS_VERSION < (1, 5), reason="too old for pyarrow")
def test_read_parquet(
tmpdir: pytest.TempdirFactory, constructor_eager: ConstructorEager
Expand Down
Loading