Skip to content

ENH: Include line number and number of fields when the read_csv() on_bad_lines callable with engine="python" raises ParserWarning #61974

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pandas/io/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1201,7 +1201,7 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:

if actual_len > col_len:
if callable(self.on_bad_lines):
new_l = self.on_bad_lines(_content)
new_l = self.on_bad_lines(col_len, actual_len, i + 2, _content)
if new_l is not None:
content.append(new_l) # pyright: ignore[reportArgumentType]
elif self.on_bad_lines in (
Expand Down
5 changes: 4 additions & 1 deletion pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,10 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):
- ``'skip'``, skip bad lines without raising or warning when they are encountered.
- Callable, function that will process a single bad line.
- With ``engine='python'``, function with signature
``(bad_line: list[str]) -> list[str] | None``.
``(expected_columns: int, actual_columns: int, row: int, bad_line: list[str]) -> list[str] | None``.
``expected_columns`` is the expected number of columns.
``actual_columns`` is the actual number of columns.
``row`` is the row number of the bad line.
``bad_line`` is a list of strings split by the ``sep``.
If the function returns ``None``, the bad line will be ignored.
If the function returns a new ``list`` of strings with more elements than
Expand Down
21 changes: 15 additions & 6 deletions pandas/tests/io/parser/test_python_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,9 @@ def readline(self):
parser.read_csv(NoNextBuffer("a\n1"))


@pytest.mark.parametrize("bad_line_func", [lambda x: ["2", "3"], lambda x: x[:2]])
@pytest.mark.parametrize(
"bad_line_func", [lambda x, y, z, a: ["2", "3"], lambda x, y, z, a: a[:2]]
)
def test_on_bad_lines_callable(python_parser_only, bad_line_func):
# GH 5686
parser = python_parser_only
Expand All @@ -367,7 +369,9 @@ def test_on_bad_lines_callable_write_to_external_list(python_parser_only):
bad_sio = StringIO(data)
lst = []

def bad_line_func(bad_line: list[str]) -> list[str]:
def bad_line_func(
expected_columns: int, actual_columns: int, row: int, bad_line: list[str]
) -> list[str]:
lst.append(bad_line)
return ["2", "3"]

Expand All @@ -377,7 +381,9 @@ def bad_line_func(bad_line: list[str]) -> list[str]:
assert lst == [["2", "3", "4", "5", "6"]]


@pytest.mark.parametrize("bad_line_func", [lambda x: ["foo", "bar"], lambda x: x[:2]])
@pytest.mark.parametrize(
"bad_line_func", [lambda x, y, z, a: ["foo", "bar"], lambda x, y, z, a: a[:2]]
)
@pytest.mark.parametrize("sep", [",", "111"])
def test_on_bad_lines_callable_iterator_true(python_parser_only, bad_line_func, sep):
# GH 5686
Expand Down Expand Up @@ -414,7 +420,7 @@ def test_on_bad_lines_callable_dont_swallow_errors(python_parser_only):
bad_sio = StringIO(data)
msg = "This function is buggy."

def bad_line_func(bad_line):
def bad_line_func(expected_columns, actual_columns, row, bad_line):
raise ValueError(msg)

with pytest.raises(ValueError, match=msg):
Expand All @@ -432,7 +438,10 @@ def test_on_bad_lines_callable_not_expected_length(python_parser_only):
bad_sio = StringIO(data)

result = parser.read_csv_check_warnings(
ParserWarning, "Length of header or names", bad_sio, on_bad_lines=lambda x: x
ParserWarning,
"Length of header or names",
bad_sio,
on_bad_lines=lambda x, y, z, a: a,
)
expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
tm.assert_frame_equal(result, expected)
Expand All @@ -448,7 +457,7 @@ def test_on_bad_lines_callable_returns_none(python_parser_only):
"""
bad_sio = StringIO(data)

result = parser.read_csv(bad_sio, on_bad_lines=lambda x: None)
result = parser.read_csv(bad_sio, on_bad_lines=lambda x, y, z, a: None)
expected = DataFrame({"a": [1, 3], "b": [2, 4]})
tm.assert_frame_equal(result, expected)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/io/parser/test_unsupported.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ def test_on_bad_lines_callable_python_or_pyarrow(self, all_parsers):
# GH 5686
# GH 54643
sio = StringIO("a,b\n1,2")
bad_lines_func = lambda x: x
bad_lines_func = lambda x, y, z, a: a
parser = all_parsers
if all_parsers.engine not in ["python", "pyarrow"]:
msg = (
Expand Down
Loading