Merge branch 'main' into bugfix-spss-kwargs

astronights · web-flow · commit 94800db00eb6 · 2024-01-27T21:49:51.000+08:00
diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
@@ -408,6 +408,9 @@ def time_read_stringcsv(self, engine):
     def time_read_bytescsv(self, engine):
         read_csv(self.data(self.BytesIO_input), engine=engine)
 
+    def peakmem_read_csv(self, engine):
+        read_csv(self.data(self.BytesIO_input), engine=engine)
+
 
 class ReadCSVCategorical(BaseIO):
     fname = "__test__.csv"
diff --git a/doc/source/conf.py b/doc/source/conf.py
@@ -460,7 +460,6 @@
         "dateutil": ("https://dateutil.readthedocs.io/en/latest/", None),
         "matplotlib": ("https://matplotlib.org/stable/", None),
         "numpy": ("https://numpy.org/doc/stable/", None),
-        "py": ("https://pylib.readthedocs.io/en/latest/", None),
         "python": ("https://docs.python.org/3/", None),
         "scipy": ("https://docs.scipy.org/doc/scipy/", None),
         "pyarrow": ("https://arrow.apache.org/docs/", None),
diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
@@ -277,11 +277,12 @@ Installable with ``pip install "pandas[excel]"``.
 ========================= ================== =============== =============================================================
 Dependency                Minimum Version    pip extra       Notes
 ========================= ================== =============== =============================================================
-xlrd                      2.0.1              excel           Reading Excel
-xlsxwriter                3.0.5              excel           Writing Excel
-openpyxl                  3.1.0              excel           Reading / writing for xlsx files
+xlrd                      2.0.1              excel           Reading for xls files
+xlsxwriter                3.0.5              excel           Writing for xlsx files
+openpyxl                  3.1.0              excel           Reading / writing for Excel 2010 xlsx/xlsm/xltx/xltm files
 pyxlsb                    1.0.10             excel           Reading for xlsb files
-python-calamine           0.1.7              excel           Reading for xls/xlsx/xlsb/ods files
+python-calamine           0.1.7              excel           Reading for xls/xlsx/xlsm/xlsb/xla/xlam/ods files
+odfpy                     1.4.1              excel           Reading / writing for OpenDocument 1.2 files
 ========================= ================== =============== =============================================================
 
 HTML
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -61,8 +61,8 @@ Basic
 +++++
 
 filepath_or_buffer : various
-  Either a path to a file (a :class:`python:str`, :class:`python:pathlib.Path`,
-  or :class:`py:py._path.local.LocalPath`), URL (including http, ftp, and S3
+  Either a path to a file (a :class:`python:str`, :class:`python:pathlib.Path`)
+  URL (including http, ftp, and S3
   locations), or any object with a ``read()`` method (such as an open file or
   :class:`~python:io.StringIO`).
 sep : str, defaults to ``','`` for :func:`read_csv`, ``\t`` for :func:`read_table`
diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst
@@ -13,8 +13,11 @@ including other versions of pandas.
 
 Fixed regressions
 ~~~~~~~~~~~~~~~~~
+- Fixed memory leak in :func:`read_csv` (:issue:`57039`)
 - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`)
 - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`)
+- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`)
+- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`)
 - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`)
 
 .. ---------------------------------------------------------------------------
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -84,7 +84,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor
 
 Other API changes
 ^^^^^^^^^^^^^^^^^
--
+- 3rd party ``py.path`` objects are no longer explicitly supported in IO methods. Use :py:class:`pathlib.Path` objects instead (:issue:`57091`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -1771,6 +1771,7 @@ def group_idxmin_idxmax(
         Py_ssize_t i, j, N, K, lab
         numeric_object_t val
         numeric_object_t[:, ::1] group_min_or_max
+        uint8_t[:, ::1] seen
         bint uses_mask = mask is not None
         bint isna_entry
         bint compute_max = name == "idxmax"
@@ -1784,13 +1785,10 @@ def group_idxmin_idxmax(
 
     if numeric_object_t is object:
         group_min_or_max = np.empty((<object>out).shape, dtype=object)
+        seen = np.zeros((<object>out).shape, dtype=np.uint8)
     else:
         group_min_or_max = np.empty_like(out, dtype=values.dtype)
-    if N > 0 and K > 0:
-        # When N or K is zero, we never use group_min_or_max
-        group_min_or_max[:] = _get_min_or_max(
-            values[0, 0], compute_max, is_datetimelike
-        )
+        seen = np.zeros_like(out, dtype=np.uint8)
 
     # When using transform, we need a valid value for take in the case
     # a category is not observed; these values will be dropped
@@ -1806,6 +1804,7 @@ def group_idxmin_idxmax(
                 if not skipna and out[lab, j] == -1:
                     # Once we've hit NA there is no going back
                     continue
+
                 val = values[i, j]
 
                 if uses_mask:
@@ -1814,10 +1813,14 @@ def group_idxmin_idxmax(
                     isna_entry = _treat_as_na(val, is_datetimelike)
 
                 if isna_entry:
-                    if not skipna:
+                    if not skipna or not seen[lab, j]:
                         out[lab, j] = -1
                 else:
-                    if compute_max:
+                    if not seen[lab, j]:
+                        seen[lab, j] = True
+                        group_min_or_max[lab, j] = val
+                        out[lab, j] = i
+                    elif compute_max:
                         if val > group_min_or_max[lab, j]:
                             group_min_or_max[lab, j] = val
                             out[lab, j] = i
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
@@ -109,6 +109,14 @@ void parser_set_default_options(parser_t *self) {
 
 parser_t *parser_new(void) { return (parser_t *)calloc(1, sizeof(parser_t)); }
 
+static void parser_clear_data_buffers(parser_t *self) {
+  free_if_not_null((void *)&self->stream);
+  free_if_not_null((void *)&self->words);
+  free_if_not_null((void *)&self->word_starts);
+  free_if_not_null((void *)&self->line_start);
+  free_if_not_null((void *)&self->line_fields);
+}
+
 static void parser_cleanup(parser_t *self) {
   // XXX where to put this
   free_if_not_null((void *)&self->error_msg);
@@ -119,6 +127,7 @@ static void parser_cleanup(parser_t *self) {
     self->skipset = NULL;
   }
 
+  parser_clear_data_buffers(self);
   if (self->cb_cleanup != NULL) {
     self->cb_cleanup(self->source);
     self->cb_cleanup = NULL;
diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
@@ -34,7 +34,6 @@
     Series,
 )
 from pandas._testing._io import (
-    round_trip_localpath,
     round_trip_pathlib,
     round_trip_pickle,
     write_to_compressed,
@@ -609,7 +608,6 @@ def shares_memory(left, right) -> bool:
     "OBJECT_DTYPES",
     "raise_assert_detail",
     "raises_chained_assignment_error",
-    "round_trip_localpath",
     "round_trip_pathlib",
     "round_trip_pickle",
     "setitem",
diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py
@@ -89,35 +89,6 @@ def round_trip_pathlib(writer, reader, path: str | None = None):
     return obj
 
 
-def round_trip_localpath(writer, reader, path: str | None = None):
-    """
-    Write an object to file specified by a py.path LocalPath and read it back.
-
-    Parameters
-    ----------
-    writer : callable bound to pandas object
-        IO writing function (e.g. DataFrame.to_csv )
-    reader : callable
-        IO reading function (e.g. pd.read_csv )
-    path : str, default None
-        The path where the object is written and then read.
-
-    Returns
-    -------
-    pandas object
-        The original object that was serialized and then re-read.
-    """
-    import pytest
-
-    LocalPath = pytest.importorskip("py.path").local
-    if path is None:
-        path = "___localpath___"
-    with ensure_clean(path) as path:
-        writer(LocalPath(path))
-        obj = reader(LocalPath(path))
-    return obj
-
-
 def write_to_compressed(compression, path, data, dest: str = "test") -> None:
     """
     Write data to a compressed file.
diff --git a/pandas/_testing/_warnings.py b/pandas/_testing/_warnings.py
@@ -220,7 +220,12 @@ def _assert_raised_with_correct_stacklevel(
     frame = inspect.currentframe()
     for _ in range(4):
         frame = frame.f_back  # type: ignore[union-attr]
-    caller_filename = inspect.getfile(frame)  # type: ignore[arg-type]
+    try:
+        caller_filename = inspect.getfile(frame)  # type: ignore[arg-type]
+    finally:
+        # See note in
+        # https://docs.python.org/3/library/inspect.html#inspect.Traceback
+        del frame
     msg = (
         "Warning not set with correct stacklevel. "
         f"File where warning is raised: {actual_warning.filename} != "
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -424,6 +424,7 @@ def _call_cython_op(
                     mask=mask,
                     result_mask=result_mask,
                     is_datetimelike=is_datetimelike,
+                    **kwargs,
                 )
             elif self.how in ["sem", "std", "var", "ohlc", "prod", "median"]:
                 if self.how in ["std", "sem"]:
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -318,7 +318,7 @@ def _get_filepath_or_buffer(
 
     Parameters
     ----------
-    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
+    filepath_or_buffer : a url, filepath (str or pathlib.Path),
                          or buffer
     {compression_options}
 
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
@@ -1441,7 +1441,7 @@ class ExcelFile:
 
     Parameters
     ----------
-    path_or_buffer : str, bytes, path object (pathlib.Path or py._path.local.LocalPath),
+    path_or_buffer : str, bytes, pathlib.Path,
         A file-like object, xlrd workbook or openpyxl workbook.
         If a string or path object, expected to be a path to a
         .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file.
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -216,7 +216,7 @@
 Parameters
 ----------
 path_or_buf : path (string), buffer or path object
-    string, path object (pathlib.Path or py._path.local.LocalPath) or object
+    string, pathlib.Path or object
     implementing a binary read() functions.
 {_statafile_processing_params1}
 {_statafile_processing_params2}
@@ -2258,7 +2258,7 @@ class StataWriter(StataParser):
     Parameters
     ----------
     fname : path (string), buffer or path object
-        string, path object (pathlib.Path or py._path.local.LocalPath) or
+        string, pathlib.Path or
         object implementing a binary write() functions. If using a buffer
         then the buffer will not be automatically closed after the file
         is written.
@@ -3208,7 +3208,7 @@ class StataWriter117(StataWriter):
     Parameters
     ----------
     fname : path (string), buffer or path object
-        string, path object (pathlib.Path or py._path.local.LocalPath) or
+        string, pathlib.Path or
         object implementing a binary write() functions. If using a buffer
         then the buffer will not be automatically closed after the file
         is written.
@@ -3594,7 +3594,7 @@ class StataWriterUTF8(StataWriter117):
     Parameters
     ----------
     fname : path (string), buffer or path object
-        string, path object (pathlib.Path or py._path.local.LocalPath) or
+        string, pathlib.Path or
         object implementing a binary write() functions. If using a buffer
         then the buffer will not be automatically closed after the file
         is written.
diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py
@@ -257,6 +257,68 @@ def test_empty(frame_or_series, all_boolean_reductions):
     tm.assert_equal(result, expected)
 
 
+@pytest.mark.parametrize("how", ["idxmin", "idxmax"])
+def test_idxmin_idxmax_extremes(how, any_real_numpy_dtype):
+    # GH#57040
+    if any_real_numpy_dtype is int or any_real_numpy_dtype is float:
+        # No need to test
+        return
+    info = np.iinfo if "int" in any_real_numpy_dtype else np.finfo
+    min_value = info(any_real_numpy_dtype).min
+    max_value = info(any_real_numpy_dtype).max
+    df = DataFrame(
+        {"a": [2, 1, 1, 2], "b": [min_value, max_value, max_value, min_value]},
+        dtype=any_real_numpy_dtype,
+    )
+    gb = df.groupby("a")
+    result = getattr(gb, how)()
+    expected = DataFrame(
+        {"b": [1, 0]}, index=pd.Index([1, 2], name="a", dtype=any_real_numpy_dtype)
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("how", ["idxmin", "idxmax"])
+def test_idxmin_idxmax_extremes_skipna(skipna, how, float_numpy_dtype):
+    # GH#57040
+    min_value = np.finfo(float_numpy_dtype).min
+    max_value = np.finfo(float_numpy_dtype).max
+    df = DataFrame(
+        {
+            "a": Series(np.repeat(range(1, 6), repeats=2), dtype="intp"),
+            "b": Series(
+                [
+                    np.nan,
+                    min_value,
+                    np.nan,
+                    max_value,
+                    min_value,
+                    np.nan,
+                    max_value,
+                    np.nan,
+                    np.nan,
+                    np.nan,
+                ],
+                dtype=float_numpy_dtype,
+            ),
+        },
+    )
+    gb = df.groupby("a")
+
+    warn = None if skipna else FutureWarning
+    msg = f"The behavior of DataFrameGroupBy.{how} with all-NA values"
+    with tm.assert_produces_warning(warn, match=msg):
+        result = getattr(gb, how)(skipna=skipna)
+    if skipna:
+        values = [1, 3, 4, 6, np.nan]
+    else:
+        values = np.nan
+    expected = DataFrame(
+        {"b": values}, index=pd.Index(range(1, 6), name="a", dtype="intp")
+    )
+    tm.assert_frame_equal(result, expected)
+
+
 @pytest.mark.parametrize(
     "func, values",
     [
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
@@ -966,19 +966,6 @@ def test_read_from_pathlib_path(self, read_ext):
 
         tm.assert_frame_equal(expected, actual)
 
-    @td.skip_if_no("py.path")
-    def test_read_from_py_localpath(self, read_ext):
-        # GH12655
-        from py.path import local as LocalPath
-
-        str_path = os.path.join("test1" + read_ext)
-        expected = pd.read_excel(str_path, sheet_name="Sheet1", index_col=0)
-
-        path_obj = LocalPath().join("test1" + read_ext)
-        actual = pd.read_excel(path_obj, sheet_name="Sheet1", index_col=0)
-
-        tm.assert_frame_equal(expected, actual)
-
     def test_close_from_py_localpath(self, read_ext):
         # GH31467
         str_path = os.path.join("test1" + read_ext)
diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py
@@ -1319,18 +1319,6 @@ def test_path_path_lib(self, engine, ext):
         result = tm.round_trip_pathlib(writer, reader, path=f"foo{ext}")
         tm.assert_frame_equal(result, df)
 
-    def test_path_local_path(self, engine, ext):
-        df = DataFrame(
-            1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD")),
-            index=Index([f"i-{i}" for i in range(30)]),
-        )
-        writer = partial(df.to_excel, engine=engine)
-
-        reader = partial(pd.read_excel, index_col=0)
-        result = tm.round_trip_localpath(writer, reader, path=f"foo{ext}")
-        tm.assert_frame_equal(result, df)
-
     def test_merged_cell_custom_objects(self, path):
         # see GH-27006
         mi = MultiIndex.from_tuples(
diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py
@@ -79,20 +79,6 @@ def test_path_path_lib(all_parsers):
     tm.assert_frame_equal(df, result)
 
 
-@xfail_pyarrow  # AssertionError: DataFrame.index are different
-def test_path_local_path(all_parsers):
-    parser = all_parsers
-    df = DataFrame(
-        1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
-    )
-    result = tm.round_trip_localpath(
-        df.to_csv, lambda p: parser.read_csv(p, index_col=0)
-    )
-    tm.assert_frame_equal(df, result)
-
-
 def test_nonexistent_path(all_parsers):
     # gh-2428: pls no segfault
     # gh-14086: raise more helpful FileNotFoundError
diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py
diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py
diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
diff --git a/pandas/util/_exceptions.py b/pandas/util/_exceptions.py

Original file line number	Diff line number	Diff line change
@@ -84,7 +84,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor
`84`	`84`
`85`	`85`	`Other API changes`
`86`	`86`	`^^^^^^^^^^^^^^^^^`
`87`		`--`
	`87`	+- 3rd party ``py.path`` objects are no longer explicitly supported in IO methods. Use :py:class:`pathlib.Path` objects instead (:issue:`57091`)
`88`	`88`	`-`
`89`	`89`
`90`	`90`	`.. ---------------------------------------------------------------------------`