diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 70f0eefc55fd9..4e9408ffa7bbc 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1201,7 +1201,7 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]: if actual_len > col_len: if callable(self.on_bad_lines): - new_l = self.on_bad_lines(_content) + new_l = self.on_bad_lines(col_len, actual_len, i + 2, _content) if new_l is not None: content.append(new_l) # pyright: ignore[reportArgumentType] elif self.on_bad_lines in ( diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 4fbd71ed03662..5a12f0f95299d 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -414,7 +414,10 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): - ``'skip'``, skip bad lines without raising or warning when they are encountered. - Callable, function that will process a single bad line. - With ``engine='python'``, function with signature - ``(bad_line: list[str]) -> list[str] | None``. + ``(expected_columns: int, actual_columns: int, row: int, bad_line: list[str]) -> list[str] | None``. + ``expected_columns`` is the expected number of columns. + ``actual_columns`` is the actual number of columns. + ``row`` is the row number of the bad line. ``bad_line`` is a list of strings split by the ``sep``. If the function returns ``None``, the bad line will be ignored. If the function returns a new ``list`` of strings with more elements than diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index a5bb151e84f47..7e544780e414d 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -341,7 +341,9 @@ def readline(self): parser.read_csv(NoNextBuffer("a\n1")) -@pytest.mark.parametrize("bad_line_func", [lambda x: ["2", "3"], lambda x: x[:2]]) +@pytest.mark.parametrize( + "bad_line_func", [lambda x, y, z, a: ["2", "3"], lambda x, y, z, a: a[:2]] +) def test_on_bad_lines_callable(python_parser_only, bad_line_func): # GH 5686 parser = python_parser_only @@ -367,7 +369,9 @@ def test_on_bad_lines_callable_write_to_external_list(python_parser_only): bad_sio = StringIO(data) lst = [] - def bad_line_func(bad_line: list[str]) -> list[str]: + def bad_line_func( + expected_columns: int, actual_columns: int, row: int, bad_line: list[str] + ) -> list[str]: lst.append(bad_line) return ["2", "3"] @@ -377,7 +381,9 @@ def bad_line_func(bad_line: list[str]) -> list[str]: assert lst == [["2", "3", "4", "5", "6"]] -@pytest.mark.parametrize("bad_line_func", [lambda x: ["foo", "bar"], lambda x: x[:2]]) +@pytest.mark.parametrize( + "bad_line_func", [lambda x, y, z, a: ["foo", "bar"], lambda x, y, z, a: a[:2]] +) @pytest.mark.parametrize("sep", [",", "111"]) def test_on_bad_lines_callable_iterator_true(python_parser_only, bad_line_func, sep): # GH 5686 @@ -414,7 +420,7 @@ def test_on_bad_lines_callable_dont_swallow_errors(python_parser_only): bad_sio = StringIO(data) msg = "This function is buggy." - def bad_line_func(bad_line): + def bad_line_func(expected_columns, actual_columns, row, bad_line): raise ValueError(msg) with pytest.raises(ValueError, match=msg): @@ -432,7 +438,10 @@ def test_on_bad_lines_callable_not_expected_length(python_parser_only): bad_sio = StringIO(data) result = parser.read_csv_check_warnings( - ParserWarning, "Length of header or names", bad_sio, on_bad_lines=lambda x: x + ParserWarning, + "Length of header or names", + bad_sio, + on_bad_lines=lambda x, y, z, a: a, ) expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}) tm.assert_frame_equal(result, expected) @@ -448,7 +457,7 @@ def test_on_bad_lines_callable_returns_none(python_parser_only): """ bad_sio = StringIO(data) - result = parser.read_csv(bad_sio, on_bad_lines=lambda x: None) + result = parser.read_csv(bad_sio, on_bad_lines=lambda x, y, z, a: None) expected = DataFrame({"a": [1, 3], "b": [2, 4]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 07f84466e3ac2..06c62e84afd95 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -154,7 +154,7 @@ def test_on_bad_lines_callable_python_or_pyarrow(self, all_parsers): # GH 5686 # GH 54643 sio = StringIO("a,b\n1,2") - bad_lines_func = lambda x: x + bad_lines_func = lambda x, y, z, a: a parser = all_parsers if all_parsers.engine not in ["python", "pyarrow"]: msg = (