Skip to content

Commit 5d30866

Browse files
committed
ENH: Add an option to prevent stripping extra whitespaces in pd.read_html
1 parent 0cdc6a4 commit 5d30866

File tree

3 files changed

+48
-1
lines changed

3 files changed

+48
-1
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ Other enhancements
4242
- Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`)
4343
- :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
4444
- :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`)
45+
- :func:`read_html` now accepts a ``strip_whitespace`` argument to control whether extra whitespace in HTML table cells is collapsed to a single space (:issue:`24766`)
4546
- :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header cells (:issue:`35384`)
4647
- :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`)
4748
- :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)

pandas/io/html.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,11 @@ class _HtmlFrameParser:
172172
173173
.. versionadded:: 1.5.0
174174
175+
strip_whitespace : bool, default True
176+
Whether runs of extra whitespace in table cell values should be collapsed to
177+
a single space.
178+
.. versionadded:: 3.0.0
179+
175180
Attributes
176181
----------
177182
io : str or file-like
@@ -196,6 +201,11 @@ class _HtmlFrameParser:
196201
197202
.. versionadded:: 1.5.0
198203
204+
strip_whitespace : bool
205+
Whether runs of extra whitespace in table cell values should be collapsed to
206+
a single space.
207+
.. versionadded:: 3.0.0
208+
199209
Notes
200210
-----
201211
To subclass this class effectively you must override the following methods:
@@ -222,6 +232,7 @@ def __init__(
222232
displayed_only: bool,
223233
extract_links: Literal[None, "header", "footer", "body", "all"],
224234
storage_options: StorageOptions = None,
235+
strip_whitespace: bool = True,
225236
) -> None:
226237
self.io = io
227238
self.match = match
@@ -230,6 +241,7 @@ def __init__(
230241
self.displayed_only = displayed_only
231242
self.extract_links = extract_links
232243
self.storage_options = storage_options
244+
self.strip_whitespace = strip_whitespace
233245

234246
def parse_tables(self):
235247
"""
@@ -506,10 +518,15 @@ def _expand_colspan_rowspan(
506518
index += 1
507519

508520
# Append the text from this <td>, colspan times
509-
text = _remove_whitespace(self._text_getter(td))
521+
if self.strip_whitespace:
522+
text = _remove_whitespace(self._text_getter(td))
523+
else:
524+
text = self._text_getter(td)
525+
510526
if self.extract_links in ("all", section):
511527
href = self._href_getter(td)
512528
text = (text, href)
529+
513530
rowspan = int(self._attr_getter(td, "rowspan") or 1)
514531
colspan = int(self._attr_getter(td, "colspan") or 1)
515532

@@ -944,6 +961,7 @@ def _parse(
944961
displayed_only,
945962
extract_links,
946963
storage_options,
964+
strip_whitespace,
947965
**kwargs,
948966
):
949967
flavor = _validate_flavor(flavor)
@@ -960,6 +978,7 @@ def _parse(
960978
displayed_only,
961979
extract_links,
962980
storage_options,
981+
strip_whitespace,
963982
)
964983

965984
try:
@@ -1027,6 +1046,7 @@ def read_html(
10271046
extract_links: Literal[None, "header", "footer", "body", "all"] = None,
10281047
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
10291048
storage_options: StorageOptions = None,
1049+
strip_whitespace: bool = True,
10301050
) -> list[DataFrame]:
10311051
r"""
10321052
Read HTML tables into a ``list`` of ``DataFrame`` objects.
@@ -1146,6 +1166,11 @@ def read_html(
11461166
11471167
.. versionadded:: 2.1.0
11481168
1169+
strip_whitespace : bool, default True
1170+
Whether runs of extra whitespace in table cell values should be collapsed to
1171+
a single space.
1172+
.. versionadded:: 3.0.0
1173+
11491174
Returns
11501175
-------
11511176
dfs
@@ -1226,4 +1251,5 @@ def read_html(
12261251
extract_links=extract_links,
12271252
dtype_backend=dtype_backend,
12281253
storage_options=storage_options,
1254+
strip_whitespace=strip_whitespace,
12291255
)

pandas/tests/io/test_html.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1646,3 +1646,23 @@ def test_style_tag(self, flavor_read_html):
16461646
result = flavor_read_html(StringIO(data))[0]
16471647
expected = DataFrame(data=[["A1", "B1"], ["A2", "B2"]], columns=["A", "B"])
16481648
tm.assert_frame_equal(result, expected)
1649+
1650+
def test_strip_whitespace(self, flavor_read_html):
1651+
# GH 24766
1652+
data = """
1653+
<table>
1654+
<tr>
1655+
<td>Field 1
1656+
Field 2</td>
1657+
<td>Value 1
1658+
Value 2</td>
1659+
</tr>
1660+
</table>
1661+
"""
1662+
result_strip = flavor_read_html(StringIO(data))[0]
1663+
expected_strip = DataFrame([["Field 1 Field 2", "Value 1 Value 2"]])
1664+
tm.assert_frame_equal(result_strip, expected_strip)
1665+
1666+
result_nostrip = flavor_read_html(StringIO(data), strip_whitespace=False)[0]
1667+
expected_nostrip = DataFrame([["Field 1\nField 2", "Value 1\nValue 2"]])
1668+
tm.assert_frame_equal(result_nostrip, expected_nostrip)

0 commit comments

Comments
 (0)