pandas-dev · ritwizsinha · Jul 1, 2024 · Jul 1, 2024 · Jul 1, 2024 · Jul 1, 2024
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -563,6 +563,7 @@ I/O
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
 - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
+- Bug in :meth:`read_html` returning an incorrect result when parsing a table with a ``<td>`` tag containing only whitespace. (:issue:`12345`)
 - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
 - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
 

diff --git a/pandas/io/html.py b/pandas/io/html.py
@@ -507,6 +507,9 @@ def _expand_colspan_rowspan(
 
                 # Append the text from this <td>, colspan times
                 text = _remove_whitespace(self._text_getter(td))
+                if len(text) == 0:
+                    text = self._text_getter(td)
+
                 if self.extract_links in ("all", section):
                     href = self._href_getter(td)
                     text = (text, href)
@@ -1027,6 +1030,7 @@ def read_html(
     extract_links: Literal[None, "header", "footer", "body", "all"] = None,
     dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
     storage_options: StorageOptions = None,
+    skip_blank_lines: bool = True,
 ) -> list[DataFrame]:
     r"""
     Read HTML tables into a ``list`` of ``DataFrame`` objects.
@@ -1146,6 +1150,9 @@ def read_html(
 
         .. versionadded:: 2.1.0
 
+    skip_blank_lines : bool, default True
+        Whether table rows containing only whitespace should be skipped.
+
     Returns
     -------
     dfs
@@ -1201,9 +1208,7 @@ def read_html(
 
     validate_header_arg(header)
     check_dtype_backend(dtype_backend)
-
     io = stringify_path(io)
-
     return _parse(
         flavor=flavor,
         io=io,
@@ -1223,4 +1228,5 @@ def read_html(
         extract_links=extract_links,
         dtype_backend=dtype_backend,
         storage_options=storage_options,
+        skip_blank_lines=skip_blank_lines,
     )
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
@@ -1242,6 +1242,50 @@ def test_preserve_empty_rows(self, flavor_read_html):
 
         tm.assert_frame_equal(result, expected)
 
+    def test_preserve_rows_with_spaces(self, flavor_read_html):
+        result = flavor_read_html(
+            StringIO(
+                """
+            <table>
+                <tr>
+                    <th>A</th>
+                    <th>B</th>
+                </tr>
+                <tr>
+                    <td>a</td>
+                    <td>b</td>
+                </tr>
+                <tr>
+                    <td>  </td>
+                    <td> </td>
+                </tr>
+            </table>
+        """
+            ),
+            skip_blank_lines=False,
+        )[0]
+        expected = DataFrame(data=[["a", "b"], ["  ", " "]], columns=["A", "B"])
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_preserve_table_with_just_spaces(self, flavor_read_html):
+        result = flavor_read_html(
+            StringIO(
+                """
+            <table>
+                <tr>
+                    <td> </td>
+                </tr>
+            </table>
+        """
+            ),
+            skip_blank_lines=False,
+        )[0]
+
+        expected = DataFrame(data=[" "])
+        assert len(result) != 0
+        tm.assert_frame_equal(result, expected)
+
     def test_ignore_empty_rows_when_inferring_header(self, flavor_read_html):
         result = flavor_read_html(
             StringIO(