Merge branch 'main' into pandas.core.groupby.DataFrameGroupBy.groups

tuhinsharma121 · web-flow · commit 5135faeea420 · 2024-11-10T22:10:18.000+05:30
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -90,9 +90,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \
         -i "pandas.arrays.IntegerArray SA01" \
         -i "pandas.arrays.IntervalArray.length SA01" \
-        -i "pandas.arrays.IntervalArray.right SA01" \
         -i "pandas.arrays.NumpyExtensionArray SA01" \
-        -i "pandas.arrays.SparseArray PR07,SA01" \
         -i "pandas.arrays.TimedeltaArray PR07,SA01" \
         -i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \
         -i "pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \
diff --git a/environment.yml b/environment.yml
@@ -77,7 +77,7 @@ dependencies:
 
   # code checks
   - flake8=7.1.0  # run in subprocess over docstring examples
-  - mypy=1.11.2  # pre-commit uses locally installed mypy
+  - mypy=1.13.0  # pre-commit uses locally installed mypy
   - tokenize-rt  # scripts/check_for_inconsistent_pandas_namespace.py
   - pre-commit>=4.0.1
 
diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
@@ -7,7 +7,6 @@
 from typing import (
     TYPE_CHECKING,
     ContextManager,
-    cast,
 )
 
 import numpy as np
@@ -21,8 +20,6 @@
 
 from pandas.compat import pa_version_under10p1
 
-from pandas.core.dtypes.common import is_string_dtype
-
 import pandas as pd
 from pandas import (
     ArrowDtype,
@@ -77,8 +74,8 @@
     with_csv_dialect,
 )
 from pandas.core.arrays import (
+    ArrowExtensionArray,
     BaseMaskedArray,
-    ExtensionArray,
     NumpyExtensionArray,
 )
 from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
@@ -92,7 +89,6 @@
         NpDtype,
     )
 
-    from pandas.core.arrays import ArrowExtensionArray
 
 UNSIGNED_INT_NUMPY_DTYPES: list[NpDtype] = ["uint8", "uint16", "uint32", "uint64"]
 UNSIGNED_INT_EA_DTYPES: list[Dtype] = ["UInt8", "UInt16", "UInt32", "UInt64"]
@@ -512,24 +508,18 @@ def shares_memory(left, right) -> bool:
     if isinstance(left, pd.core.arrays.IntervalArray):
         return shares_memory(left._left, right) or shares_memory(left._right, right)
 
-    if (
-        isinstance(left, ExtensionArray)
-        and is_string_dtype(left.dtype)
-        and left.dtype.storage == "pyarrow"  # type: ignore[attr-defined]
-    ):
-        # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
-        left = cast("ArrowExtensionArray", left)
-        if (
-            isinstance(right, ExtensionArray)
-            and is_string_dtype(right.dtype)
-            and right.dtype.storage == "pyarrow"  # type: ignore[attr-defined]
-        ):
-            right = cast("ArrowExtensionArray", right)
+    if isinstance(left, ArrowExtensionArray):
+        if isinstance(right, ArrowExtensionArray):
+            # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
             left_pa_data = left._pa_array
             right_pa_data = right._pa_array
             left_buf1 = left_pa_data.chunk(0).buffers()[1]
             right_buf1 = right_pa_data.chunk(0).buffers()[1]
-            return left_buf1 == right_buf1
+            return left_buf1.address == right_buf1.address
+        else:
+            # if we have one one ArrowExtensionArray and one other array, assume
+            # they can only share memory if they share the same numpy buffer
+            return np.shares_memory(left, right)
 
     if isinstance(left, BaseMaskedArray) and isinstance(right, BaseMaskedArray):
         # By convention, we'll say these share memory if they share *either*
diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
@@ -1269,6 +1269,21 @@ def right(self) -> Index:
         """
         Return the right endpoints of each Interval in the IntervalArray as an Index.
 
+        This property extracts the right endpoints from each interval contained within
+        the IntervalArray. This can be helpful in use cases where you need to work
+        with or compare only the upper bounds of intervals, such as when performing
+        range-based filtering, determining interval overlaps, or visualizing the end
+        boundaries of data segments.
+
+        See Also
+        --------
+        arrays.IntervalArray.left : Return the left endpoints of each Interval in
+            the IntervalArray as an Index.
+        arrays.IntervalArray.mid : Return the midpoint of each Interval in the
+            IntervalArray as an Index.
+        arrays.IntervalArray.contains : Check elementwise if the Intervals contain
+            the value.
+
         Examples
         --------
 
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -289,12 +289,18 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray):
     """
     An ExtensionArray for storing sparse data.
 
+    SparseArray efficiently stores data with a high frequency of a
+    specific fill value (e.g., zeros), saving memory by only retaining
+    non-fill elements and their indices. This class is particularly
+    useful for large datasets where most values are redundant.
+
     Parameters
     ----------
     data : array-like or scalar
         A dense array of values to store in the SparseArray. This may contain
         `fill_value`.
     sparse_index : SparseIndex, optional
+        Index indicating the locations of sparse elements.
     fill_value : scalar, optional
         Elements in data that are ``fill_value`` are not stored in the
         SparseArray. For memory savings, this should be the most common value
@@ -345,6 +351,10 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray):
     -------
     None
 
+    See Also
+    --------
+    SparseDtype : Dtype for sparse data.
+
     Examples
     --------
     >>> from pandas.arrays import SparseArray
diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py
@@ -76,8 +76,7 @@
 class Term:
     def __new__(cls, name, env, side=None, encoding=None):
         klass = Constant if not isinstance(name, str) else cls
-        # error: Argument 2 for "super" not an instance of argument 1
-        supr_new = super(Term, klass).__new__  # type: ignore[misc]
+        supr_new = super(Term, klass).__new__
         return supr_new(klass)
 
     is_local: bool
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
@@ -2130,9 +2130,11 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
                 PerformanceWarning,
                 stacklevel=find_stack_level(),
             )
-
         np_dtypes = (x.subtype if isinstance(x, SparseDtype) else x for x in dtypes)
-        return SparseDtype(np_find_common_type(*np_dtypes), fill_value=fill_value)
+        # error: Argument 1 to "np_find_common_type" has incompatible type
+        # "*Generator[Any | dtype[Any] | ExtensionDtype, None, None]";
+        # expected "dtype[Any]"  [arg-type]
+        return SparseDtype(np_find_common_type(*np_dtypes), fill_value=fill_value)  # type: ignore [arg-type]
 
 
 @register_extension_dtype
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -8024,7 +8024,9 @@ def asof(self, where, subset=None):
                     np.nan, index=self.columns, name=where[0]
                 )
 
-        locs = self.index.asof_locs(where, ~(nulls._values))
+        # error: Unsupported operand type for
+        # ~ ("ExtensionArray | ndarray[Any, Any] | Any")
+        locs = self.index.asof_locs(where, ~nulls._values)  # type: ignore[operator]
 
         # mask the missing
         mask = locs == -1
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
@@ -558,8 +558,7 @@ def _maybe_convert_i8(self, key):
             left = self._maybe_convert_i8(key.left)
             right = self._maybe_convert_i8(key.right)
             constructor = Interval if scalar else IntervalIndex.from_arrays
-            # error: "object" not callable
-            return constructor(left, right, closed=self.closed)  # type: ignore[operator]
+            return constructor(left, right, closed=self.closed)
 
         if scalar:
             # Timestamp/Timedelta
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -914,7 +914,9 @@ def __setitem__(self, key, value) -> None:
         indexer = self._get_setitem_indexer(key)
         self._has_valid_setitem_indexer(key)
 
-        iloc = self if self.name == "iloc" else self.obj.iloc
+        iloc: _iLocIndexer = (
+            cast("_iLocIndexer", self) if self.name == "iloc" else self.obj.iloc
+        )
         iloc._setitem_with_indexer(indexer, value, self.name)
 
     def _validate_key(self, key, axis: AxisInt) -> None:
diff --git a/pandas/core/missing.py b/pandas/core/missing.py
@@ -413,13 +413,10 @@ def func(yvalues: np.ndarray) -> None:
             **kwargs,
         )
 
-    # error: Argument 1 to "apply_along_axis" has incompatible type
-    # "Callable[[ndarray[Any, Any]], None]"; expected "Callable[...,
-    # Union[_SupportsArray[dtype[<nothing>]], Sequence[_SupportsArray
-    # [dtype[<nothing>]]], Sequence[Sequence[_SupportsArray[dtype[<nothing>]]]],
-    # Sequence[Sequence[Sequence[_SupportsArray[dtype[<nothing>]]]]],
-    # Sequence[Sequence[Sequence[Sequence[_SupportsArray[dtype[<nothing>]]]]]]]]"
-    np.apply_along_axis(func, axis, data)  # type: ignore[arg-type]
+    # error: No overload variant of "apply_along_axis" matches
+    # argument types "Callable[[ndarray[Any, Any]], None]",
+    # "int", "ndarray[Any, Any]"
+    np.apply_along_axis(func, axis, data)  # type: ignore[call-overload]
 
 
 def _index_to_interp_indices(index: Index, method: str) -> np.ndarray:
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -726,7 +726,9 @@ def nanmean(
 
 
 @bottleneck_switch()
-def nanmedian(values, *, axis: AxisInt | None = None, skipna: bool = True, mask=None):
+def nanmedian(
+    values: np.ndarray, *, axis: AxisInt | None = None, skipna: bool = True, mask=None
+) -> float | np.ndarray:
     """
     Parameters
     ----------
@@ -738,7 +740,7 @@ def nanmedian(values, *, axis: AxisInt | None = None, skipna: bool = True, mask=
 
     Returns
     -------
-    result : float
+    result : float | ndarray
         Unless input is a float array, in which case use the same
         precision as the input array.
 
@@ -758,7 +760,7 @@ def nanmedian(values, *, axis: AxisInt | None = None, skipna: bool = True, mask=
     # cases we never need to set NaN to the masked values
     using_nan_sentinel = values.dtype.kind == "f" and mask is None
 
-    def get_median(x, _mask=None):
+    def get_median(x: np.ndarray, _mask=None):
         if _mask is None:
             _mask = notna(x)
         else:
@@ -794,6 +796,8 @@ def get_median(x, _mask=None):
 
     notempty = values.size
 
+    res: float | np.ndarray
+
     # an array from a frame
     if values.ndim > 1 and axis is not None:
         # there's a non-empty array to apply over otherwise numpy raises
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -910,10 +910,10 @@ def get_handle(
             or not hasattr(handle, "seekable")
         ):
             handle = _IOWrapper(handle)
-        # error: Argument 1 to "TextIOWrapper" has incompatible type
-        # "_IOWrapper"; expected "IO[bytes]"
+        # error: Value of type variable "_BufferT_co" of "TextIOWrapper" cannot
+        # be "_IOWrapper | BaseBuffer" [type-var]
         handle = TextIOWrapper(
-            handle,  # type: ignore[arg-type]
+            handle,  # type: ignore[type-var]
             encoding=ioargs.encoding,
             errors=errors,
             newline="",
diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py
@@ -1038,7 +1038,9 @@ def __call__(self, *args, **kwargs):
                     label_name = label_kw or y
                     data.name = label_name
                 else:
-                    match = is_list_like(label_kw) and len(label_kw) == len(y)
+                    # error: Argument 1 to "len" has incompatible type "Any | bool";
+                    # expected "Sized"  [arg-type]
+                    match = is_list_like(label_kw) and len(label_kw) == len(y)  # type: ignore[arg-type]
                     if label_kw and not match:
                         raise ValueError(
                             "label should be list-like and same length as y"
diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py
@@ -198,10 +198,7 @@ def _make_plot(self, fig: Figure) -> None:
                 else self.data
             )
 
-            # error: Argument "data" to "_iter_data" of "MPLPlot" has
-            # incompatible type "object"; expected "DataFrame |
-            # dict[Hashable, Series | DataFrame]"
-            for i, (label, y) in enumerate(self._iter_data(data=data)):  # type: ignore[arg-type]
+            for i, (label, y) in enumerate(self._iter_data(data=data)):
                 ax = self._get_ax(i)
                 kwds = self.kwds.copy()
 
diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py
@@ -137,10 +137,7 @@ def _make_plot(self, fig: Figure) -> None:
             if self.by is not None
             else self.data
         )
-
-        # error: Argument "data" to "_iter_data" of "MPLPlot" has incompatible
-        # type "object"; expected "DataFrame | dict[Hashable, Series | DataFrame]"
-        for i, (label, y) in enumerate(self._iter_data(data=data)):  # type: ignore[arg-type]
+        for i, (label, y) in enumerate(self._iter_data(data=data)):
             ax = self._get_ax(i)
 
             kwds = self.kwds.copy()
diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat import HAS_PYARROW
 from pandas.compat.pyarrow import pa_version_under12p0
 
@@ -206,7 +204,6 @@ def test_astype_arrow_timestamp():
         assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._pa_array)
 
 
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 def test_convert_dtypes_infer_objects():
     ser = Series(["a", "b", "c"])
     ser_orig = ser.copy()
@@ -217,20 +214,25 @@ def test_convert_dtypes_infer_objects():
         convert_string=False,
     )
 
-    assert np.shares_memory(get_array(ser), get_array(result))
+    assert tm.shares_memory(get_array(ser), get_array(result))
     result.iloc[0] = "x"
     tm.assert_series_equal(ser, ser_orig)
 
 
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
-def test_convert_dtypes():
+def test_convert_dtypes(using_infer_string):
     df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]})
     df_orig = df.copy()
     df2 = df.convert_dtypes()
 
-    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
-    assert np.shares_memory(get_array(df2, "d"), get_array(df, "d"))
-    assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
-    assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
+    if using_infer_string and HAS_PYARROW:
+        # TODO the default nullable string dtype still uses python storage
+        # this should be changed to pyarrow if installed
+        assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    else:
+        assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert tm.shares_memory(get_array(df2, "d"), get_array(df, "d"))
+    assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+    assert tm.shares_memory(get_array(df2, "c"), get_array(df, "c"))
     df2.iloc[0, 0] = "x"
+    df2.iloc[0, 1] = 10
     tm.assert_frame_equal(df, df_orig)
diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py
@@ -153,7 +153,6 @@ def test_concat_copy_keyword():
     assert np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
 
 
-# @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 @pytest.mark.parametrize(
     "func",
     [
diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py
diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
diff --git a/requirements-dev.txt b/requirements-dev.txt