Skip to content

Commit a941904

Browse files
RomainL972 and Derekt2 committed
ENH: Add an option to prevent stripping extra whitespaces in pd.read_html
Co-authored-by: Derekt2 <[email protected]>
1 parent b96491a commit a941904

File tree

3 files changed

+48
-1
lines changed

3 files changed

+48
-1
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ Other enhancements
4646
- :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`)
4747
- :class:`Series` now supports the Arrow PyCapsule Interface for export (:issue:`59518`)
4848
- :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header cells (:issue:`35384`)
49+
- :func:`read_html` now accepts a ``strip_whitespace`` argument to control whether extra whitespace in HTML table cells is collapsed to a single space (:issue:`24766`)
4950
- :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`)
5051
- :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
5152
- :meth:`DataFrame.ewm` now allows ``adjust=False`` when ``times`` is provided (:issue:`54328`)

pandas/io/html.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,11 @@ class _HtmlFrameParser:
172172
173173
.. versionadded:: 1.5.0
174174
175+
strip_whitespace : bool
176+
Whether table row values should have all extra whitespace collapsed to
177+
a single space.
178+
.. versionadded:: 3.0.0
179+
175180
Attributes
176181
----------
177182
io : str or file-like
@@ -196,6 +201,11 @@ class _HtmlFrameParser:
196201
197202
.. versionadded:: 1.5.0
198203
204+
strip_whitespace : bool
205+
Whether table row values should have all extra whitespace collapsed to
206+
a single space.
207+
.. versionadded:: 3.0.0
208+
199209
Notes
200210
-----
201211
To subclass this class effectively you must override the following methods:
@@ -222,6 +232,7 @@ def __init__(
222232
displayed_only: bool,
223233
extract_links: Literal[None, "header", "footer", "body", "all"],
224234
storage_options: StorageOptions = None,
235+
strip_whitespace: bool = True,
225236
) -> None:
226237
self.io = io
227238
self.match = match
@@ -230,6 +241,7 @@ def __init__(
230241
self.displayed_only = displayed_only
231242
self.extract_links = extract_links
232243
self.storage_options = storage_options
244+
self.strip_whitespace = strip_whitespace
233245

234246
def parse_tables(self):
235247
"""
@@ -506,10 +518,15 @@ def _expand_colspan_rowspan(
506518
index += 1
507519

508520
# Append the text from this <td>, colspan times
509-
text = _remove_whitespace(self._text_getter(td))
521+
if self.strip_whitespace:
522+
text = _remove_whitespace(self._text_getter(td))
523+
else:
524+
text = self._text_getter(td)
525+
510526
if self.extract_links in ("all", section):
511527
href = self._href_getter(td)
512528
text = (text, href)
529+
513530
rowspan = int(self._attr_getter(td, "rowspan") or 1)
514531
colspan = int(self._attr_getter(td, "colspan") or 1)
515532

@@ -944,6 +961,7 @@ def _parse(
944961
displayed_only,
945962
extract_links,
946963
storage_options,
964+
strip_whitespace,
947965
**kwargs,
948966
):
949967
flavor = _validate_flavor(flavor)
@@ -960,6 +978,7 @@ def _parse(
960978
displayed_only,
961979
extract_links,
962980
storage_options,
981+
strip_whitespace,
963982
)
964983

965984
try:
@@ -1027,6 +1046,7 @@ def read_html(
10271046
extract_links: Literal[None, "header", "footer", "body", "all"] = None,
10281047
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
10291048
storage_options: StorageOptions = None,
1049+
strip_whitespace: bool = True,
10301050
) -> list[DataFrame]:
10311051
r"""
10321052
Read HTML tables into a ``list`` of ``DataFrame`` objects.
@@ -1147,6 +1167,11 @@ def read_html(
11471167
11481168
.. versionadded:: 2.1.0
11491169
1170+
strip_whitespace : bool, default True
1171+
Whether table row values should have all extra whitespace collapsed to
1172+
a single space.
1173+
.. versionadded:: 3.0.0
1174+
11501175
Returns
11511176
-------
11521177
dfs
@@ -1227,4 +1252,5 @@ def read_html(
12271252
extract_links=extract_links,
12281253
dtype_backend=dtype_backend,
12291254
storage_options=storage_options,
1255+
strip_whitespace=strip_whitespace,
12301256
)

pandas/tests/io/test_html.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1630,3 +1630,23 @@ def test_style_tag(self, flavor_read_html):
16301630
result = flavor_read_html(StringIO(data))[0]
16311631
expected = DataFrame(data=[["A1", "B1"], ["A2", "B2"]], columns=["A", "B"])
16321632
tm.assert_frame_equal(result, expected)
1633+
1634+
def test_strip_whitespace(self, flavor_read_html):
1635+
# GH 24766
1636+
data = """
1637+
<table>
1638+
<tr>
1639+
<td>Field 1
1640+
Field 2</td>
1641+
<td>Value 1
1642+
Value 2</td>
1643+
</tr>
1644+
</table>
1645+
"""
1646+
result_strip = flavor_read_html(StringIO(data))[0]
1647+
expected_strip = DataFrame([["Field 1 Field 2", "Value 1 Value 2"]])
1648+
tm.assert_frame_equal(result_strip, expected_strip)
1649+
1650+
result_nostrip = flavor_read_html(StringIO(data), strip_whitespace=False)[0]
1651+
expected_nostrip = DataFrame([["Field 1\nField 2", "Value 1\nValue 2"]])
1652+
tm.assert_frame_equal(result_nostrip, expected_nostrip)

0 commit comments

Comments
 (0)