Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1146,7 +1146,7 @@ To completely override the default values that are recognized as missing, specif
.. _io.navaluesconst:

The default ``NaN`` recognized values are ``['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A',
'n/a', 'NA', '<NA>', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', '']``.
'n/a', 'NA', '<NA>', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', 'None', '']``.

Let us consider some examples:

Expand Down
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,15 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following

* :func:`read_csv`
* :func:`read_excel`
* :func:`read_html`
* :func:`read_sql`

Additionally, a new global configuration, ``mode.nullable_backend``, can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions
to select the nullable dtypes implementation.

* :func:`read_csv` (with ``engine="pyarrow"``)
* :func:`read_excel`
* :func:`read_html`
* :func:`read_parquet`
* :func:`read_orc`

Expand Down Expand Up @@ -476,6 +478,7 @@ Other API changes
- :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default) will now set the index on the returned :class:`DataFrame` to a :class:`RangeIndex` instead of an :class:`Int64Index` (:issue:`49745`)
- Changed behavior of :class:`Index`, :class:`Series`, and :class:`DataFrame` arithmetic methods when working with object-dtypes; the results no longer do type inference on the result of the array operations. Use ``result.infer_objects()`` to do type inference on the result (:issue:`49999`)
- Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values; this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`)
- Added ``"None"`` to default ``na_values`` in :func:`read_csv` (:issue:`50286`)
- Changed behavior of :class:`Series` and :class:`DataFrame` constructors when given an integer dtype and floating-point data that is not round numbers, this now raises ``ValueError`` instead of silently retaining the float dtype; do ``Series(data)`` or ``DataFrame(data)`` to get the old behavior, and ``Series(data).astype(dtype)`` or ``DataFrame(data).astype(dtype)`` to get the specified dtype (:issue:`49599`)
- Changed behavior of :meth:`DataFrame.shift` with ``axis=1``, an integer ``fill_value``, and homogeneous datetime-like dtype; this now fills new columns with integer dtypes instead of casting to datetime-like (:issue:`49842`)
- Files are now closed when encountering an exception in :func:`read_json` (:issue:`49921`)
Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1384,6 +1384,7 @@ STR_NA_VALUES = {
"nan",
"-nan",
"",
"None",
}
_NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))

Expand Down
9 changes: 9 additions & 0 deletions pandas/io/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -1043,6 +1043,7 @@ def read_html(
keep_default_na: bool = True,
displayed_only: bool = True,
extract_links: Literal[None, "header", "footer", "body", "all"] = None,
use_nullable_dtypes: bool = False,
) -> list[DataFrame]:
r"""
Read HTML tables into a ``list`` of ``DataFrame`` objects.
Expand Down Expand Up @@ -1143,6 +1144,13 @@ def read_html(

.. versionadded:: 1.5.0

use_nullable_dtypes : bool = False
Whether to use nullable dtypes as default when reading data. If
set to True, nullable dtypes are used for all dtypes that have a nullable
implementation, even if no nulls are present.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add the additional paragraph of mode.dtype_backend being available that other docstrings have? (Should start with The nullable dtype implementation)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thx, added


.. versionadded:: 2.0

Returns
-------
dfs
Expand Down Expand Up @@ -1218,4 +1226,5 @@ def read_html(
keep_default_na=keep_default_na,
displayed_only=displayed_only,
extract_links=extract_links,
use_nullable_dtypes=use_nullable_dtypes,
)
1 change: 1 addition & 0 deletions pandas/tests/io/parser/test_na_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def test_default_na_values(all_parsers):
"-nan",
"#N/A N/A",
"",
"None",
}
assert _NA_VALUES == STR_NA_VALUES

Expand Down
64 changes: 64 additions & 0 deletions pandas/tests/io/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
from pandas.compat import is_platform_windows
import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
NA,
DataFrame,
MultiIndex,
Series,
Expand All @@ -27,6 +29,10 @@
to_datetime,
)
import pandas._testing as tm
from pandas.core.arrays import (
ArrowStringArray,
StringArray,
)

from pandas.io.common import file_path_to_url
import pandas.io.html
Expand Down Expand Up @@ -132,6 +138,64 @@ def test_to_html_compat(self):
res = self.read_html(out, attrs={"class": "dataframe"}, index_col=0)[0]
tm.assert_frame_equal(res, df)

@pytest.mark.parametrize("nullable_backend", ["pandas", "pyarrow"])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
@pytest.mark.parametrize("nullable_backend", ["pandas", "pyarrow"])
@pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"])

@pytest.mark.parametrize("storage", ["python", "pyarrow"])
def test_use_nullable_dtypes(self, storage, nullable_backend):
# GH#50286
df = DataFrame(
{
"a": Series([1, np.nan, 3], dtype="Int64"),
"b": Series([1, 2, 3], dtype="Int64"),
"c": Series([1.5, np.nan, 2.5], dtype="Float64"),
"d": Series([1.5, 2.0, 2.5], dtype="Float64"),
"e": [True, False, None],
"f": [True, False, True],
"g": ["a", "b", "c"],
"h": ["a", "b", None],
}
)

if storage == "python":
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))

else:
pa = pytest.importorskip("pyarrow")
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowStringArray(pa.array(["a", "b", None]))

out = df.to_html(index=False)
with pd.option_context("mode.string_storage", storage):
with pd.option_context("mode.nullable_backend", nullable_backend):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
with pd.option_context("mode.nullable_backend", nullable_backend):
with pd.option_context("mode.dtype_backend", nullable_backend):

result = self.read_html(out, use_nullable_dtypes=True)[0]

expected = DataFrame(
{
"a": Series([1, np.nan, 3], dtype="Int64"),
"b": Series([1, 2, 3], dtype="Int64"),
"c": Series([1.5, np.nan, 2.5], dtype="Float64"),
"d": Series([1.5, 2.0, 2.5], dtype="Float64"),
"e": Series([True, False, NA], dtype="boolean"),
"f": Series([True, False, True], dtype="boolean"),
"g": string_array,
"h": string_array_na,
}
)

if nullable_backend == "pyarrow":
import pyarrow as pa

from pandas.arrays import ArrowExtensionArray

expected = DataFrame(
{
col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
for col in expected.columns
}
)

tm.assert_frame_equal(result, expected)

@pytest.mark.network
@tm.network(
url=(
Expand Down