Merge branch 'main' into to_timestamp

saldanhad · web-flow · commit 2e0f9a26cba2 · 2024-10-07T01:00:29.000+05:30
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -76,7 +76,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Series.dt.freq GL08" \
         -i "pandas.Series.dt.unit GL08" \
         -i "pandas.Series.pad PR01,SA01" \
-        -i "pandas.Series.sparse.from_coo PR07,SA01" \
         -i "pandas.Timedelta.max PR02" \
         -i "pandas.Timedelta.min PR02" \
         -i "pandas.Timedelta.resolution PR02" \
@@ -86,13 +85,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Timestamp.resolution PR02" \
         -i "pandas.Timestamp.tzinfo GL08" \
         -i "pandas.Timestamp.year GL08" \
-        -i "pandas.api.types.is_float PR01,SA01" \
         -i "pandas.api.types.is_integer PR01,SA01" \
         -i "pandas.api.types.is_iterator PR07,SA01" \
         -i "pandas.api.types.is_re_compilable PR07,SA01" \
         -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \
         -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \
-        -i "pandas.arrays.DatetimeArray SA01" \
         -i "pandas.arrays.IntegerArray SA01" \
         -i "pandas.arrays.IntervalArray.left SA01" \
         -i "pandas.arrays.IntervalArray.length SA01" \
@@ -143,7 +140,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.errors.DuplicateLabelError SA01" \
         -i "pandas.errors.IntCastingNaNError SA01" \
         -i "pandas.errors.InvalidIndexError SA01" \
-        -i "pandas.errors.InvalidVersion SA01" \
         -i "pandas.errors.NullFrequencyError SA01" \
         -i "pandas.errors.NumExprClobberingError SA01" \
         -i "pandas.errors.NumbaUtilError SA01" \
@@ -152,7 +148,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.errors.PerformanceWarning SA01" \
         -i "pandas.errors.PossibleDataLossError SA01" \
         -i "pandas.errors.PossiblePrecisionLoss SA01" \
-        -i "pandas.errors.SpecificationError SA01" \
         -i "pandas.errors.UndefinedVariableError PR01,SA01" \
         -i "pandas.errors.UnsortedIndexError SA01" \
         -i "pandas.errors.UnsupportedFunctionCall SA01" \
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -203,6 +203,67 @@ In cases with mixed-resolution inputs, the highest resolution is used:
     In [2]: pd.to_datetime([pd.Timestamp("2024-03-22 11:43:01"), "2024-03-22 11:43:01.002"]).dtype
     Out[2]: dtype('<M8[ns]')
 
+.. _whatsnew_300.api_breaking.value_counts_sorting:
+
+Changed behavior in :meth:`DataFrame.value_counts` and :meth:`DataFrameGroupBy.value_counts` when ``sort=False``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In previous versions of pandas, :meth:`DataFrame.value_counts` with ``sort=False`` would sort the result by row labels (as was documented). This was nonintuitive and inconsistent with :meth:`Series.value_counts` which would maintain the order of the input. Now :meth:`DataFrame.value_counts` will maintain the order of the input.
+
+.. ipython:: python
+
+    df = pd.DataFrame(
+        {
+            "a": [2, 2, 2, 2, 1, 1, 1, 1],
+            "b": [2, 1, 3, 1, 2, 3, 1, 1],
+        }
+    )
+    df
+
+*Old behavior*
+
+.. code-block:: ipython
+
+    In [3]: df.value_counts(sort=False)
+    Out[3]:
+    a  b
+    1  1    2
+       2    1
+       3    1
+    2  1    2
+       2    1
+       3    1
+    Name: count, dtype: int64
+
+*New behavior*
+
+.. ipython:: python
+
+    df.value_counts(sort=False)
+
+This change also applies to :meth:`.DataFrameGroupBy.value_counts`. Here, there are two options for sorting: one ``sort`` passed to :meth:`DataFrame.groupby` and one passed directly to :meth:`.DataFrameGroupBy.value_counts`. The former will determine whether to sort the groups, the latter whether to sort the counts. All non-grouping columns will maintain the order of the input *within groups*.
+
+*Old behavior*
+
+.. code-block:: ipython
+
+    In [5]: df.groupby("a", sort=True).value_counts(sort=False)
+    Out[5]:
+    a  b
+    1  1    2
+       2    1
+       3    1
+    2  1    2
+       2    1
+       3    1
+    dtype: int64
+
+*New behavior*
+
+.. ipython:: python
+
+    df.groupby("a", sort=True).value_counts(sort=False)
+
 .. _whatsnew_300.api_breaking.deps:
 
 Increased minimum version for Python
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -1089,9 +1089,23 @@ def is_float(obj: object) -> bool:
     """
     Return True if given object is float.
 
+    This method checks whether the passed object is a float type. It
+    returns `True` if the object is a float, and `False` otherwise.
+
+    Parameters
+    ----------
+    obj : object
+        The object to check for float type.
+
     Returns
     -------
     bool
+        `True` if the object is of float type, otherwise `False`.
+
+    See Also
+    --------
+    api.types.is_integer : Check if an object is of integer type.
+    api.types.is_numeric_dtype : Check if an object is of numeric type.
 
     Examples
     --------
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
@@ -205,6 +205,14 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps):  # type: ignore[misc]
     -------
     None
 
+    See Also
+    --------
+    DatetimeIndex : Immutable Index for datetime-like data.
+    Series : One-dimensional labeled array capable of holding datetime-like data.
+    Timestamp : Pandas replacement for python datetime.datetime object.
+    to_datetime : Convert argument to datetime.
+    period_range : Return a fixed frequency PeriodIndex.
+
     Examples
     --------
     >>> pd.arrays.DatetimeArray._from_sequence(
diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py
@@ -88,9 +88,17 @@ def from_coo(cls, A, dense_index: bool = False) -> Series:
         """
         Create a Series with sparse values from a scipy.sparse.coo_matrix.
 
+        This method takes a ``scipy.sparse.coo_matrix`` (coordinate format) as input and
+        returns a pandas ``Series`` where the non-zero elements are represented as
+        sparse values. The index of the Series can either include only the coordinates
+        of non-zero elements (default behavior) or the full sorted set of coordinates
+        from the matrix if ``dense_index`` is set to `True`.
+
         Parameters
         ----------
         A : scipy.sparse.coo_matrix
+            The sparse matrix in coordinate format from which the sparse Series
+            will be created.
         dense_index : bool, default False
             If False (default), the index consists of only the
             coords of the non-null entries of the original coo_matrix.
@@ -102,6 +110,12 @@ def from_coo(cls, A, dense_index: bool = False) -> Series:
         s : Series
             A Series with sparse values.
 
+        See Also
+        --------
+        DataFrame.sparse.from_spmatrix : Create a new DataFrame from a scipy sparse
+            matrix.
+        scipy.sparse.coo_matrix : A sparse matrix in COOrdinate format.
+
         Examples
         --------
         >>> from scipy import sparse
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -7266,7 +7266,11 @@ def value_counts(
         normalize : bool, default False
             Return proportions rather than frequencies.
         sort : bool, default True
-            Sort by frequencies when True. Sort by DataFrame column values when False.
+            Sort by frequencies when True. Preserve the order of the data when False.
+
+            .. versionchanged:: 3.0.0
+
+                Prior to 3.0.0, ``sort=False`` would sort by the columns values.
         ascending : bool, default False
             Sort in ascending order.
         dropna : bool, default True
@@ -7372,7 +7376,9 @@ def value_counts(
             subset = self.columns.tolist()
 
         name = "proportion" if normalize else "count"
-        counts = self.groupby(subset, dropna=dropna, observed=False)._grouper.size()
+        counts = self.groupby(
+            subset, sort=False, dropna=dropna, observed=False
+        )._grouper.size()
         counts.name = name
 
         if sort:
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -2621,7 +2621,13 @@ def value_counts(
         normalize : bool, default False
             Return proportions rather than frequencies.
         sort : bool, default True
-            Sort by frequencies.
+            Sort by frequencies when True. When False, non-grouping columns will appear
+            in the order they occur in within groups.
+
+            .. versionchanged:: 3.0.0
+
+                In prior versions, ``sort=False`` would sort the non-grouping columns
+                by label.
         ascending : bool, default False
             Sort in ascending order.
         dropna : bool, default True
@@ -2673,43 +2679,43 @@ def value_counts(
 
         >>> df.groupby("gender").value_counts()
         gender  education  country
-        female  high       FR         1
-                           US         1
+        female  high       US         1
+                           FR         1
         male    low        FR         2
                            US         1
                 medium     FR         1
         Name: count, dtype: int64
 
         >>> df.groupby("gender").value_counts(ascending=True)
         gender  education  country
-        female  high       FR         1
-                           US         1
+        female  high       US         1
+                           FR         1
         male    low        US         1
                 medium     FR         1
                 low        FR         2
         Name: count, dtype: int64
 
         >>> df.groupby("gender").value_counts(normalize=True)
         gender  education  country
-        female  high       FR         0.50
-                           US         0.50
+        female  high       US         0.50
+                           FR         0.50
         male    low        FR         0.50
                            US         0.25
                 medium     FR         0.25
         Name: proportion, dtype: float64
 
         >>> df.groupby("gender", as_index=False).value_counts()
            gender education country  count
-        0  female      high      FR      1
-        1  female      high      US      1
+        0  female      high      US      1
+        1  female      high      FR      1
         2    male       low      FR      2
         3    male       low      US      1
         4    male    medium      FR      1
 
         >>> df.groupby("gender", as_index=False).value_counts(normalize=True)
            gender education country  proportion
-        0  female      high      FR        0.50
-        1  female      high      US        0.50
+        0  female      high      US        0.50
+        1  female      high      FR        0.50
         2    male       low      FR        0.50
         3    male       low      US        0.25
         4    male    medium      FR        0.25
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -2519,7 +2519,7 @@ def _value_counts(
             grouper, _, _ = get_grouper(
                 df,
                 key=key,
-                sort=self.sort,
+                sort=False,
                 observed=False,
                 dropna=dropna,
             )
@@ -2528,7 +2528,7 @@ def _value_counts(
         # Take the size of the overall columns
         gb = df.groupby(
             groupings,
-            sort=self.sort,
+            sort=False,
             observed=self.observed,
             dropna=self.dropna,
         )
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -755,6 +755,7 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]:
         obs = [
             ping._observed or not ping._passed_categorical for ping in self.groupings
         ]
+        sorts = [ping._sort for ping in self.groupings]
         # When passed a categorical grouping, keep all categories
         for k, (ping, level) in enumerate(zip(self.groupings, levels)):
             if ping._passed_categorical:
@@ -765,7 +766,9 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]:
             result_index.name = self.names[0]
             ids = ensure_platform_int(self.codes[0])
         elif all(obs):
-            result_index, ids = self._ob_index_and_ids(levels, self.codes, self.names)
+            result_index, ids = self._ob_index_and_ids(
+                levels, self.codes, self.names, sorts
+            )
         elif not any(obs):
             result_index, ids = self._unob_index_and_ids(levels, self.codes, self.names)
         else:
@@ -778,6 +781,7 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]:
                 levels=[levels[idx] for idx in ob_indices],
                 codes=[codes[idx] for idx in ob_indices],
                 names=[names[idx] for idx in ob_indices],
+                sorts=[sorts[idx] for idx in ob_indices],
             )
             unob_index, unob_ids = self._unob_index_and_ids(
                 levels=[levels[idx] for idx in unob_indices],
@@ -800,9 +804,18 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]:
             ).reorder_levels(index)
             ids = len(unob_index) * ob_ids + unob_ids
 
-            if self._sort:
+            if any(sorts):
                 # Sort result_index and recode ids using the new order
-                sorter = result_index.argsort()
+                n_levels = len(sorts)
+                drop_levels = [
+                    n_levels - idx
+                    for idx, sort in enumerate(reversed(sorts), 1)
+                    if not sort
+                ]
+                if len(drop_levels) > 0:
+                    sorter = result_index._drop_level_numbers(drop_levels).argsort()
+                else:
+                    sorter = result_index.argsort()
                 result_index = result_index.take(sorter)
                 _, index = np.unique(sorter, return_index=True)
                 ids = ensure_platform_int(ids)
@@ -837,10 +850,13 @@ def _ob_index_and_ids(
         levels: list[Index],
         codes: list[npt.NDArray[np.intp]],
         names: list[Hashable],
+        sorts: list[bool],
     ) -> tuple[MultiIndex, npt.NDArray[np.intp]]:
+        consistent_sorting = all(sorts[0] == sort for sort in sorts[1:])
+        sort_in_compress = sorts[0] if consistent_sorting else False
         shape = tuple(len(level) for level in levels)
         group_index = get_group_index(codes, shape, sort=True, xnull=True)
-        ob_ids, obs_group_ids = compress_group_index(group_index, sort=self._sort)
+        ob_ids, obs_group_ids = compress_group_index(group_index, sort=sort_in_compress)
         ob_ids = ensure_platform_int(ob_ids)
         ob_index_codes = decons_obs_group_ids(
             ob_ids, obs_group_ids, shape, codes, xnull=True
@@ -851,6 +867,21 @@ def _ob_index_and_ids(
             names=names,
             verify_integrity=False,
         )
+        if not consistent_sorting:
+            # Sort by the levels where the corresponding sort argument is True
+            n_levels = len(sorts)
+            drop_levels = [
+                n_levels - idx
+                for idx, sort in enumerate(reversed(sorts), 1)
+                if not sort
+            ]
+            if len(drop_levels) > 0:
+                sorter = ob_index._drop_level_numbers(drop_levels).argsort()
+            else:
+                sorter = ob_index.argsort()
+            ob_index = ob_index.take(sorter)
+            _, index = np.unique(sorter, return_index=True)
+            ob_ids = np.where(ob_ids == -1, -1, index.take(ob_ids))
         ob_ids = ensure_platform_int(ob_ids)
         return ob_index, ob_ids
 
diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py
@@ -444,6 +444,11 @@ class SpecificationError(Exception):
     The second way is calling ``agg`` on a Dataframe with duplicated functions
     names without assigning column name.
 
+    See Also
+    --------
+    DataFrame.agg : Aggregate using one or more operations over the specified axis.
+    Series.agg : Aggregate using one or more operations over the specified axis.
+
     Examples
     --------
     >>> df = pd.DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)})
diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py
diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py
diff --git a/pandas/util/version/__init__.py b/pandas/util/version/__init__.py