diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index be4b9c218f9f5..bdd1de3a84a39 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -563,6 +563,7 @@ I/O - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) +- Bug in :meth:`read_html` where it would return an incorrect result when parsing a table with a space character in a ``<td>`` tag. (:issue:`12345`) - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. 
(:issue:`58638`) diff --git a/pandas/io/html.py b/pandas/io/html.py index db4c5f8507946..925853189fa28 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -507,6 +507,9 @@ def _expand_colspan_rowspan( # Append the text from this <td>, colspan times text = _remove_whitespace(self._text_getter(td)) + if len(text) == 0: + text = self._text_getter(td) + if self.extract_links in ("all", section): href = self._href_getter(td) text = (text, href) @@ -1201,9 +1204,7 @@ def read_html( validate_header_arg(header) check_dtype_backend(dtype_backend) - io = stringify_path(io) - return _parse( flavor=flavor, io=io, @@ -1223,4 +1224,5 @@ def read_html( extract_links=extract_links, dtype_backend=dtype_backend, storage_options=storage_options, + skip_blank_lines=False, ) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index dfc9b4156ecab..24d215344e119 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1242,6 +1242,48 @@ def test_preserve_empty_rows(self, flavor_read_html): tm.assert_frame_equal(result, expected) + def test_preserve_rows_with_spaces(self, flavor_read_html): + result = flavor_read_html( + StringIO( + """ + + + + + + + + + + + + + +
AB
ab
+ """ + ) + )[0] + expected = DataFrame(data=[["a", "b"], [" ", " "]], columns=["A", "B"]) + + tm.assert_frame_equal(result, expected) + + def test_preserve_table_with_just_spaces(self, flavor_read_html): + result = flavor_read_html( + StringIO( + """ + + + + +
+ """ + ) + )[0] + + expected = DataFrame(data=[" "]) + assert len(result) != 0 + tm.assert_frame_equal(result, expected) + def test_ignore_empty_rows_when_inferring_header(self, flavor_read_html): result = flavor_read_html( StringIO(