feat: add `separator` argument in `read_csv`/`scan_csv` #2989
base: main
```diff
@@ -564,8 +564,36 @@ def show_versions() -> None:
         print(f"{k:>13}: {stat}")  # noqa: T201


+def validate_separator(separator: str, native_separator: str, **kwargs: Any) -> None:
+    if native_separator in kwargs and kwargs[native_separator] != separator:
+        msg = (
+            f"`separator` and `{native_separator}` do not match: "
+            f"`separator`={separator} and `{native_separator}`={kwargs[native_separator]}."
+        )
+        raise TypeError(msg)
+
+
+def validate_separator_pyarrow(separator: str, **kwargs: Any) -> Any:
+    if "parse_options" in kwargs:
+        parse_options = kwargs.pop("parse_options")
+        if parse_options.delimiter != separator:
+            msg = (
+                "`separator` and `parse_options.delimiter` do not match: "
+                f"`separator`={separator} and `delimiter`={parse_options.delimiter}."
+            )
+            raise TypeError(msg)
+        return kwargs
+
+    from pyarrow import csv  # ignore-banned-import
+
+    return {"parse_options": csv.ParseOptions(delimiter=separator)}
```
---

Nevermind - I completely misread this. Keeping the original "fake panic review" below for context:

The issue I have with this is what happens if any other argument was provided in `parse_options`. Say someone is calling the following:

```python
nw.read_csv(file, separator=",", parse_options=csv.ParseOptions(ignore_empty_lines=False))
```

Then at the end of `validate_separator_pyarrow` those extra options would be dropped.

I agree that hardcoding fields as suggested in #2989 (review) is not ideal, yet pyarrow does not provide much else we can use. We could dynamically look up its fields. Unsuccessful tentatives I tried:

inspect.signature:

```python
from inspect import signature
from pyarrow import csv

print(signature(csv.ParseOptions.__init__))
```

dataclasses.fields (from the stubs I got tricked into thinking that it is a dataclass):

```python
from dataclasses import fields
from pyarrow import csv

print(fields(csv.ParseOptions))
```

I have mixed feelings, as now a pyarrow user should pass both `separator` and `parse_options` ...

---

Thanks @FBruzzesi! A user won't need to pass both `separator` and `parse_options` ...

---

Right, so someone would only need to pass both ... tbh I think this is fine.

---

Since our default is:

```python
separator: str = ","
```

and matches pyarrow's default:

```python
delimiter: str = ","
```

Alternative: merge a non-default `separator` into the user's `parse_options` if they passed one, or build a new `csv.ParseOptions(delimiter=separator)` otherwise. In either case - every other argument is respected.

Show definitions:
```python
from __future__ import annotations

from pyarrow import csv
from typing import Any


def merge_options(separator: str = ",", **kwargs: Any) -> dict[str, Any]:
    DEFAULT = ","  # noqa: N806
    if separator != DEFAULT:
        if opts := kwargs.pop("parse_options", None):
            opts.delimiter = separator
        else:
            opts = csv.ParseOptions(delimiter=separator)
        kwargs["parse_options"] = opts
    return kwargs


def display_merge(result: dict[str, Any]) -> None:
    if result and (options := result.pop("parse_options", None)):
        print(f"{options.delimiter=}\n{options.double_quote=}")
        if result:
            print(f"Remaining: {result!r}")
    elif result:
        print(f"Unrelated: {result!r}")
    else:
        print(f"Empty: {result!r}")
```

Would this behavior not be more ideal?

```python
# NOTE: `double_quote` default is `True`
user_options = csv.ParseOptions(delimiter="\t", double_quote=False)
```

```
>>> display_merge(merge_options(parse_options=user_options))
options.delimiter='\t'
options.double_quote=False

>>> display_merge(merge_options(",", parse_options=user_options))
options.delimiter='\t'
options.double_quote=False

>>> display_merge(merge_options("?", parse_options=user_options))
options.delimiter='?'
options.double_quote=False

>>> display_merge(merge_options())
Empty: {}

>>> display_merge(merge_options("\t"))
options.delimiter='\t'
options.double_quote=True

>>> display_merge(
...     merge_options(
...         "?",
...         parse_options=csv.ParseOptions(double_quote=False),
...         read_options=csv.ReadOptions(),
...     )
... )
options.delimiter='?'
options.double_quote=False
Remaining: {'read_options': <pyarrow._csv.ReadOptions object at 0x000001F29413AD40>}
```

Although it is ...
---

hmmm yes, that does sound better actually, thanks!

---

Thanks Marco - with #2989 (comment) in mind ... Not sure if this is on-topic, but:

```python
from collections.abc import Mapping
from pathlib import Path
from typing import Any

import polars as pl
from sqlframe.duckdb import DuckDBSession

data: Mapping[str, Any] = {"a": [1, 2, 3], "b": [4.5, 6.7, 8.9], "z": ["x", "y", "w"]}
fp = Path.cwd() / "data" / "file.csv"
pl.DataFrame(data).write_csv(fp, separator="\t")
session = DuckDBSession.builder.getOrCreate()
```

```
>>> session.read.format("csv").load(str(fp), sep="\t", delim="?").collect()
[Row(a=1, b=4.5, z='x'), Row(a=2, b=6.7, z='y'), Row(a=3, b=8.9, z='w')]
```

Personally, I think we're best off just defining rule(s) and documenting what we do for each backend if needed. So instead of

```
>>> nw.scan_csv("...", backend="sqlframe", separator=",", sep="?", delim="\t", delimiter="!")
TypeError: `separator` and `sep` do not match: `separator`=, and `sep`=?.
```

we either pass `separator` straight through to the native keyword or document which argument wins. If any backend raises on non-matching arguments - I say let them - as it saves us the hassle.
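For illustration only, here is a rough sketch of the "define a rule per backend and document it" idea floated above. The mapping, helper name, and the precedence rule (`separator` silently wins over a conflicting native keyword) are hypothetical, not something this PR implements.

```python
from typing import Any

# Hypothetical mapping: backend name -> the keyword its native CSV reader uses
# for the field separator. Values are assumptions chosen for illustration.
NATIVE_SEPARATOR_KWARG = {
    "pandas": "sep",
    "polars": "separator",
    "duckdb": "delimiter",
    "sqlframe": "sep",
}


def resolve_kwargs(backend: str, separator: str, **kwargs: Any) -> dict[str, Any]:
    # Documented rule: `separator` always takes precedence over a conflicting
    # native keyword, instead of raising a TypeError.
    kwargs[NATIVE_SEPARATOR_KWARG[backend]] = separator
    return kwargs


print(resolve_kwargs("pandas", "\t", sep="?"))  # {'sep': '\t'}
```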
```diff
 def read_csv(
-    source: str, *, backend: IntoBackend[EagerAllowed], **kwargs: Any
+    source: str,
+    *,
+    backend: IntoBackend[EagerAllowed],
+    separator: str = ",",
+    **kwargs: Any,
 ) -> DataFrame[Any]:
     """Read a CSV file into a DataFrame.
```
```diff
@@ -578,6 +606,7 @@ def read_csv(
             `POLARS`, `MODIN` or `CUDF`.
             - As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`.
             - Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`.
+        separator: Single byte character to use as separator in the file.
         kwargs: Extra keyword arguments which are passed to the native CSV reader.
             For example, you could use
             `nw.read_csv('file.csv', backend='pandas', engine='pyarrow')`.
```
```diff
@@ -599,14 +628,13 @@ def read_csv(
     impl = Implementation.from_backend(backend)
     native_namespace = impl.to_native_namespace()
     native_frame: NativeDataFrame
-    if impl in {
-        Implementation.POLARS,
-        Implementation.PANDAS,
-        Implementation.MODIN,
-        Implementation.CUDF,
-    }:
-        native_frame = native_namespace.read_csv(source, **kwargs)
+    if impl in {Implementation.PANDAS, Implementation.MODIN, Implementation.CUDF}:
+        validate_separator(separator, "sep", **kwargs)
+        native_frame = native_namespace.read_csv(source, sep=separator, **kwargs)
+    elif impl is Implementation.POLARS:
+        native_frame = native_namespace.read_csv(source, separator=separator, **kwargs)
     elif impl is Implementation.PYARROW:
+        kwargs = validate_separator_pyarrow(separator, **kwargs)
         from pyarrow import csv  # ignore-banned-import

         native_frame = csv.read_csv(source, **kwargs)
```
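As a quick usage sketch of the eager entry point, assuming the diff above is in place (the file path is illustrative, not part of the PR):

```python
import narwhals as nw

# pandas-backed eager read; any extra kwargs still go to pandas.read_csv
df = nw.read_csv("data.tsv", backend="pandas", separator="\t")

# pyarrow-backed read: `separator` is turned into csv.ParseOptions(delimiter="\t")
df_pa = nw.read_csv("data.tsv", backend="pyarrow", separator="\t")

print(df.columns)
```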
```diff
@@ -635,7 +663,7 @@ def read_csv(


 def scan_csv(
-    source: str, *, backend: IntoBackend[Backend], **kwargs: Any
+    source: str, *, backend: IntoBackend[Backend], separator: str = ",", **kwargs: Any
 ) -> LazyFrame[Any]:
     """Lazily read from a CSV file.

```
```diff
@@ -651,6 +679,7 @@ def scan_csv(
             `POLARS`, `MODIN` or `CUDF`.
             - As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`.
             - Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`.
+        separator: Single byte character to use as separator in the file.
         kwargs: Extra keyword arguments which are passed to the native CSV reader.
             For example, you could use
             `nw.scan_csv('file.csv', backend=pd, engine='pyarrow')`.
```
```diff
@@ -676,33 +705,40 @@ def scan_csv(
     native_namespace = implementation.to_native_namespace()
     native_frame: NativeDataFrame | NativeLazyFrame
     if implementation is Implementation.POLARS:
-        native_frame = native_namespace.scan_csv(source, **kwargs)
+        native_frame = native_namespace.scan_csv(source, separator=separator, **kwargs)
     elif implementation in {
         Implementation.PANDAS,
         Implementation.MODIN,
         Implementation.CUDF,
         Implementation.DASK,
-        Implementation.DUCKDB,
         Implementation.IBIS,
     }:
-        native_frame = native_namespace.read_csv(source, **kwargs)
+        validate_separator(separator, "sep", **kwargs)
+        native_frame = native_namespace.read_csv(source, sep=separator, **kwargs)
+    elif implementation is Implementation.DUCKDB:
+        validate_separator(separator, "delimiter", **kwargs)
+        validate_separator(separator, "delim", **kwargs)
+        native_frame = native_namespace.read_csv(source, delimiter=separator, **kwargs)
     elif implementation is Implementation.PYARROW:
+        kwargs = validate_separator_pyarrow(separator, **kwargs)
         from pyarrow import csv  # ignore-banned-import

         native_frame = csv.read_csv(source, **kwargs)
     elif implementation.is_spark_like():
+        validate_separator(separator, "sep", **kwargs)
+        validate_separator(separator, "delimiter", **kwargs)
         if (session := kwargs.pop("session", None)) is None:
             msg = "Spark like backends require a session object to be passed in `kwargs`."
             raise ValueError(msg)

         csv_reader = session.read.format("csv")
         native_frame = (
-            csv_reader.load(source)
+            csv_reader.load(source, sep=separator)
             if (
                 implementation is Implementation.SQLFRAME
                 and implementation._backend_version() < (3, 27, 0)
             )
-            else csv_reader.options(**kwargs).load(source)
+            else csv_reader.options(sep=separator, **kwargs).load(source)
         )
     else:  # pragma: no cover
         try:
```
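And for the lazy entry point, again assuming the diff above; the file path is illustrative, and the sqlframe session mirrors the one created in the review comment earlier:

```python
import narwhals as nw
from sqlframe.duckdb import DuckDBSession

# polars-backed lazy scan with a non-default separator
lf = nw.scan_csv("data.tsv", backend="polars", separator="\t")

# spark-like backends still require a session to be passed via kwargs
session = DuckDBSession.builder.getOrCreate()
lf_sql = nw.scan_csv("data.tsv", backend="sqlframe", separator="\t", session=session)

print(lf.collect())
```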