diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index be4b9c218f9f5..bdd1de3a84a39 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -563,6 +563,7 @@ I/O
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
+- Bug in :meth:`read_html` returning an incorrect result when parsing a table with a space character in a ``<td>`` tag. (:issue:`12345`)
- Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
diff --git a/pandas/io/html.py b/pandas/io/html.py
index db4c5f8507946..925853189fa28 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -507,6 +507,9 @@ def _expand_colspan_rowspan(
# Append the text from this <td>, colspan times
text = _remove_whitespace(self._text_getter(td))
+ if len(text) == 0:
+ text = self._text_getter(td)
+
if self.extract_links in ("all", section):
href = self._href_getter(td)
text = (text, href)
@@ -1201,9 +1204,7 @@ def read_html(
validate_header_arg(header)
check_dtype_backend(dtype_backend)
-
io = stringify_path(io)
-
return _parse(
flavor=flavor,
io=io,
@@ -1223,4 +1224,5 @@ def read_html(
extract_links=extract_links,
dtype_backend=dtype_backend,
storage_options=storage_options,
+ skip_blank_lines=False,
)
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index dfc9b4156ecab..24d215344e119 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -1242,6 +1242,48 @@ def test_preserve_empty_rows(self, flavor_read_html):
tm.assert_frame_equal(result, expected)
+ def test_preserve_rows_with_spaces(self, flavor_read_html):
+ result = flavor_read_html(
+ StringIO(
+ """
+            <table>
+                <tr>
+                    <th>A</th>
+                    <th>B</th>
+                </tr>
+                <tr>
+                    <td>a</td>
+                    <td>b</td>
+                </tr>
+                <tr>
+                    <td> </td>
+                    <td> </td>
+                </tr>
+            </table>
+ """
+ )
+ )[0]
+ expected = DataFrame(data=[["a", "b"], [" ", " "]], columns=["A", "B"])
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_preserve_table_with_just_spaces(self, flavor_read_html):
+ result = flavor_read_html(
+ StringIO(
+ """
+
+ """
+ )
+ )[0]
+
+ expected = DataFrame(data=[" "])
+ assert len(result) != 0
+ tm.assert_frame_equal(result, expected)
+
def test_ignore_empty_rows_when_inferring_header(self, flavor_read_html):
result = flavor_read_html(
StringIO(
|