diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 005818b0779e6..f7222ea7bd072 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -46,6 +46,7 @@ Other enhancements - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`) - :class:`Series` now supports the Arrow PyCapsule Interface for export (:issue:`59518`) - :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header header cells (:issue:`35384`) +- :func:`read_html` now accepts a ``strip_whitespace`` argument to control whether extra whitespace in HTML table cells is trimmed (:issue:`24766`) - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`) - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`) - :meth:`DataFrame.ewm` now allows ``adjust=False`` when ``times`` is provided (:issue:`54328`) diff --git a/pandas/io/html.py b/pandas/io/html.py index 183af3a03221b..0bc5f5e396043 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -172,6 +172,11 @@ class _HtmlFrameParser: .. versionadded:: 1.5.0 + strip_whitespace : bool + Whether table row values should have all extra whitespace collapsed to + a single space. + .. versionadded:: 3.0.0 + Attributes ---------- io : str or file-like @@ -196,6 +201,11 @@ class _HtmlFrameParser: .. versionadded:: 1.5.0 + strip_whitespace : bool + Whether table row values should have all extra whitespace collapsed to + a single space. + .. 
versionadded:: 3.0.0 + Notes ----- To subclass this class effectively you must override the following methods: @@ -222,6 +232,7 @@ def __init__( displayed_only: bool, extract_links: Literal[None, "header", "footer", "body", "all"], storage_options: StorageOptions = None, + strip_whitespace: bool = True, ) -> None: self.io = io self.match = match @@ -230,6 +241,7 @@ def __init__( self.displayed_only = displayed_only self.extract_links = extract_links self.storage_options = storage_options + self.strip_whitespace = strip_whitespace def parse_tables(self): """ @@ -523,10 +535,15 @@ def _expand_colspan_rowspan( index += 1 # Append the text from this
Field 1 +Field 2 | +Value 1 +Value 2 | +