ENH: sparse: add nonzero functionality to min, max, argmin, argmax (scipy#16467)

quantresearch1 · dloney · tupui · web-flow · commit bcf317e64d43 · 2024-08-05T14:10:36.000-04:00
* Added sparse nonzero functionality

Added nonzero parsing functionality into sparse matrix methods

Added sparse nonzero functionality

Removed redundant else block

Changed working and removed enforced summation

Corrected formatting issues

* fix linting issues

* fold explicit into test_minmax_axis

* test explicit into existing tests

* change argmax_min explicit behaviour

* add explicit to test_argmax_overflow

* Update scipy/sparse/_data.py

Co-authored-by: Pamphile Roy &lt;roy.pamphile@gmail.com&gt;

* Update scipy/sparse/_data.py

Co-authored-by: Pamphile Roy &lt;roy.pamphile@gmail.com&gt;

* Update scipy/sparse/_data.py

Co-authored-by: Pamphile Roy &lt;roy.pamphile@gmail.com&gt;

* cleanup

* incorporate tupui comments

* benchmark for argmax

* fix lint errors

* non canonical test case

* add density to argmax benchmark

* add back sum_duplicates

* cleanup code

* merge main

* update tests after merge with main

* update doc_strings

* update docs to argmin/argmax

---------

Co-authored-by: Drew Allan Loney &lt;mundietomnis@gmail.com&gt;
Co-authored-by: Pamphile Roy &lt;roy.pamphile@gmail.com&gt;
Co-authored-by: CJ Carey &lt;perimosocordiae@gmail.com&gt;
Co-authored-by: Dan Schult &lt;dschult@colgate.edu&gt;
diff --git a/benchmarks/benchmarks/sparse.py b/benchmarks/benchmarks/sparse.py
@@ -504,3 +504,18 @@ def setup(self, density):
     def time_rand(self, density):
         sparse.rand(self.nrows, self.ncols,
                     format=self.format, density=density)
+
+
+class Argmax(Benchmark):
+    params = [[0.01, 0.1, 0.5], ['csr', 'csc', 'coo'], [True, False]]
+    param_names = ['density', 'format', 'explicit']
+
+    def setup(self, density, format, explicit):
+        n = 1000
+
+        warnings.simplefilter('ignore', SparseEfficiencyWarning)
+
+        self.X = sparse.rand(n, n, format=format, density=density)
+
+    def time_argmax(self, density, format, explicit):
+        self.X.argmax(explicit=explicit)
diff --git a/scipy/sparse/_data.py b/scipy/sparse/_data.py
@@ -171,7 +171,7 @@ class _minmax_mixin:
     These are not implemented for dia_matrix, hence the separate class.
     """
 
-    def _min_or_max_axis(self, axis, min_or_max):
+    def _min_or_max_axis(self, axis, min_or_max, explicit):
         N = self.shape[axis]
         if N == 0:
             raise ValueError("zero-size array to reduction operation")
@@ -182,8 +182,9 @@ def _min_or_max_axis(self, axis, min_or_max):
         mat.sum_duplicates()
 
         major_index, value = mat._minor_reduce(min_or_max)
-        not_full = np.diff(mat.indptr)[major_index] < N
-        value[not_full] = min_or_max(value[not_full], 0)
+        if not explicit:
+            not_full = np.diff(mat.indptr)[major_index] < N
+            value[not_full] = min_or_max(value[not_full], 0)
 
         mask = value != 0
         major_index = np.compress(mask, major_index)
@@ -205,7 +206,7 @@ def _min_or_max_axis(self, axis, min_or_max):
                 dtype=self.dtype, shape=(M, 1)
             )
 
-    def _min_or_max(self, axis, out, min_or_max):
+    def _min_or_max(self, axis, out, min_or_max, explicit):
         if out is not None:
             raise ValueError("Sparse arrays do not support an 'out' parameter.")
 
@@ -223,19 +224,19 @@ def _min_or_max(self, axis, out, min_or_max):
             if self.nnz == 0:
                 return zero
             m = min_or_max.reduce(self._deduped_data().ravel())
-            if self.nnz != math.prod(self.shape):
+            if self.nnz != math.prod(self.shape) and not explicit:
                 m = min_or_max(zero, m)
             return m
 
         if axis < 0:
             axis += 2
 
         if (axis == 0) or (axis == 1):
-            return self._min_or_max_axis(axis, min_or_max)
+            return self._min_or_max_axis(axis, min_or_max, explicit)
         else:
             raise ValueError("axis out of range")
 
-    def _arg_min_or_max_axis(self, axis, argmin_or_argmax, compare):
+    def _arg_min_or_max_axis(self, axis, argmin_or_argmax, compare, explicit):
         if self.shape[axis] == 0:
             raise ValueError("Cannot apply the operation along a zero-sized dimension.")
 
@@ -257,14 +258,18 @@ def _arg_min_or_max_axis(self, axis, argmin_or_argmax, compare):
             indices = mat.indices[p:q]
             extreme_index = argmin_or_argmax(data)
             extreme_value = data[extreme_index]
-            if compare(extreme_value, zero) or q - p == line_size:
-                ret[i] = indices[extreme_index]
+            if explicit:
+                if q - p > 0:
+                    ret[i] = indices[extreme_index]
             else:
-                zero_ind = _find_missing_index(indices, line_size)
-                if extreme_value == zero:
-                    ret[i] = min(extreme_index, zero_ind)
+                if compare(extreme_value, zero) or q - p == line_size:
+                    ret[i] = indices[extreme_index]
                 else:
-                    ret[i] = zero_ind
+                    zero_ind = _find_missing_index(indices, line_size)
+                    if extreme_value == zero:
+                        ret[i] = min(extreme_index, zero_ind)
+                    else:
+                        ret[i] = zero_ind
 
         if isinstance(self, sparray):
             return ret
@@ -274,7 +279,7 @@ def _arg_min_or_max_axis(self, axis, argmin_or_argmax, compare):
 
         return self._ascontainer(ret)
 
-    def _arg_min_or_max(self, axis, out, argmin_or_argmax, compare):
+    def _arg_min_or_max(self, axis, out, argmin_or_argmax, compare, explicit):
         if out is not None:
             raise ValueError("Sparse types do not support an 'out' parameter.")
 
@@ -286,19 +291,24 @@ def _arg_min_or_max(self, axis, out, argmin_or_argmax, compare):
             axis = None  # avoid calling special axis case. no impact on 1d
 
         if axis is not None:
-            return self._arg_min_or_max_axis(axis, argmin_or_argmax, compare)
+            return self._arg_min_or_max_axis(axis, argmin_or_argmax, compare, explicit)
 
         if 0 in self.shape:
             raise ValueError("Cannot apply the operation to an empty matrix.")
 
         if self.nnz == 0:
+            if explicit:
+                raise ValueError("Cannot apply the operation to zero matrix "
+                                 "when explicit=True.")
             return 0
 
         zero = self.dtype.type(0)
         mat = self.tocoo()
         # Convert to canonical form: no duplicates, sorted indices.
         mat.sum_duplicates()
         extreme_index = argmin_or_argmax(mat.data)
+        if explicit:
+            return extreme_index
         extreme_value = mat.data[extreme_index]
         num_col = mat.shape[-1]
 
@@ -322,10 +332,11 @@ def _arg_min_or_max(self, axis, out, argmin_or_argmax, compare):
             return min(first_implicit_zero_index, extreme_index)
         return first_implicit_zero_index
 
-    def max(self, axis=None, out=None):
-        """
-        Return the maximum of the array/matrix or maximum along an axis.
-        This takes all elements into account, not just the non-zero ones.
+    def max(self, axis=None, out=None, *, explicit=False):
+        """Return the maximum of the array/matrix or maximum along an axis.
+
+        By default, all elements are taken into account, not just the non-zero ones.
+        But with `explicit` set, only the stored elements are considered.
 
         Parameters
         ----------
@@ -339,25 +350,33 @@ def max(self, axis=None, out=None):
             compatibility reasons. Do not pass in anything except
             for the default value, as this argument is not used.
 
+        explicit : {False, True}, optional (default: False)
+            When set to True, only the stored elements will be considered.
+            If a row/column is empty, the sparse.coo_array returned
+            has no stored element (i.e. an implicit zero) for that row/column.
+
+            .. versionadded:: 1.15.0
+
         Returns
         -------
-        amax : coo_matrix or scalar
+        amax : coo_array or scalar
             Maximum of `a`. If `axis` is None, the result is a scalar value.
-            If `axis` is given, the result is a sparse.coo_matrix of dimension
+            If `axis` is given, the result is a sparse.coo_array of dimension
             ``a.ndim - 1``.
 
         See Also
         --------
         min : The minimum value of a sparse array/matrix along a given axis.
-        numpy.matrix.max : NumPy's implementation of 'max' for matrices
+        numpy.max : NumPy's implementation of 'max'
 
         """
-        return self._min_or_max(axis, out, np.maximum)
+        return self._min_or_max(axis, out, np.maximum, explicit)
 
-    def min(self, axis=None, out=None):
-        """
-        Return the minimum of the array/matrix or maximum along an axis.
-        This takes all elements into account, not just the non-zero ones.
+    def min(self, axis=None, out=None, *, explicit=False):
+        """Return the minimum of the array/matrix or minimum along an axis.
+
+        By default, all elements are taken into account, not just the non-zero ones.
+        But with `explicit` set, only the stored elements are considered.
 
         Parameters
         ----------
@@ -371,26 +390,34 @@ def min(self, axis=None, out=None):
             compatibility reasons. Do not pass in anything except for
             the default value, as this argument is not used.
 
+        explicit : {False, True}, optional (default: False)
+            When set to True, only the stored elements will be considered.
+            If a row/column is empty, the sparse.coo_array returned
+            has no stored element (i.e. an implicit zero) for that row/column.
+
+            .. versionadded:: 1.15.0
+
         Returns
         -------
         amin : coo_matrix or scalar
             Minimum of `a`. If `axis` is None, the result is a scalar value.
-            If `axis` is given, the result is a sparse.coo_matrix of dimension
+            If `axis` is given, the result is a sparse.coo_array of dimension
             ``a.ndim - 1``.
 
         See Also
         --------
         max : The maximum value of a sparse array/matrix along a given axis.
-        numpy.matrix.min : NumPy's implementation of 'min' for matrices
+        numpy.min : NumPy's implementation of 'min'
 
         """
-        return self._min_or_max(axis, out, np.minimum)
+        return self._min_or_max(axis, out, np.minimum, explicit)
 
-    def nanmax(self, axis=None, out=None):
-        """
-        Return the maximum of the array/matrix or maximum along an axis, ignoring any
-        NaNs. This takes all elements into account, not just the non-zero
-        ones.
+    def nanmax(self, axis=None, out=None, *, explicit=False):
+        """Return the maximum, ignoring any NaNs, along an axis.
+
+        Return the maximum, ignoring any NaNs, of the array/matrix along an axis.
+        By default this takes all elements into account, but with `explicit` set,
+        only stored elements are considered.
 
         .. versionadded:: 1.11.0
 
@@ -406,11 +433,18 @@ def nanmax(self, axis=None, out=None):
             compatibility reasons. Do not pass in anything except
             for the default value, as this argument is not used.
 
+        explicit : {False, True}, optional (default: False)
+            When set to True, only the stored elements will be considered.
+            If a row/column is empty, the sparse.coo_array returned
+            has no stored element (i.e. an implicit zero) for that row/column.
+
+            .. versionadded:: 1.15.0
+
         Returns
         -------
-        amax : coo_matrix or scalar
+        amax : coo_array or scalar
             Maximum of `a`. If `axis` is None, the result is a scalar value.
-            If `axis` is given, the result is a sparse.coo_matrix of dimension
+            If `axis` is given, the result is a sparse.coo_array of dimension
             ``a.ndim - 1``.
 
         See Also
@@ -422,13 +456,14 @@ def nanmax(self, axis=None, out=None):
         numpy.nanmax : NumPy's implementation of 'nanmax'.
 
         """
-        return self._min_or_max(axis, out, np.fmax)
+        return self._min_or_max(axis, out, np.fmax, explicit)
 
-    def nanmin(self, axis=None, out=None):
-        """
-        Return the minimum of the array/matrix or minimum along an axis, ignoring any
-        NaNs. This takes all elements into account, not just the non-zero
-        ones.
+    def nanmin(self, axis=None, out=None, *, explicit=False):
+        """Return the minimum, ignoring any NaNs, along an axis.
+
+        Return the minimum, ignoring any NaNs, of the array/matrix along an axis.
+        By default this takes all elements into account, but with `explicit` set,
+        only stored elements are considered.
 
         .. versionadded:: 1.11.0
 
@@ -444,11 +479,18 @@ def nanmin(self, axis=None, out=None):
             compatibility reasons. Do not pass in anything except for
             the default value, as this argument is not used.
 
+        explicit : {False, True}, optional (default: False)
+            When set to True, only the stored elements will be considered.
+            If a row/column is empty, the sparse.coo_array returned
+            has no stored element (i.e. an implicit zero) for that row/column.
+
+            .. versionadded:: 1.15.0
+
         Returns
         -------
-        amin : coo_matrix or scalar
+        amin : coo_array or scalar
             Minimum of `a`. If `axis` is None, the result is a scalar value.
-            If `axis` is given, the result is a sparse.coo_matrix of dimension
+            If `axis` is given, the result is a sparse.coo_array of dimension
             ``a.ndim - 1``.
 
         See Also
@@ -460,50 +502,68 @@ def nanmin(self, axis=None, out=None):
         numpy.nanmin : NumPy's implementation of 'nanmin'.
 
         """
-        return self._min_or_max(axis, out, np.fmin)
+        return self._min_or_max(axis, out, np.fmin, explicit)
 
-    def argmax(self, axis=None, out=None):
+    def argmax(self, axis=None, out=None, *, explicit=False):
         """Return indices of maximum elements along an axis.
 
-        Implicit zero elements are also taken into account. If there are
-        several maximum values, the index of the first occurrence is returned.
+        By default, implicit zero elements are taken into account. If there are
+        several maximum values, the index of the first occurrence is returned.
+        If `explicit` is set, only explicitly stored elements will be considered.
 
         Parameters
         ----------
         axis : {-2, -1, 0, 1, None}, optional
             Axis along which the argmax is computed. If None (default), index
             of the maximum element in the flatten data is returned.
+
         out : None, optional
             This argument is in the signature *solely* for NumPy
             compatibility reasons. Do not pass in anything except for
             the default value, as this argument is not used.
 
+        explicit : {False, True}, optional (default: False)
+            When set to True, only explicitly stored elements will be considered.
+            If axis is not None and a row/column has no stored elements, argmax
+            is undefined, so the index ``0`` is returned for that row/column.
+
+            .. versionadded:: 1.15.0
+
         Returns
         -------
         ind : numpy.matrix or int
             Indices of maximum elements. If matrix, its size along `axis` is 1.
         """
-        return self._arg_min_or_max(axis, out, np.argmax, np.greater)
+        return self._arg_min_or_max(axis, out, np.argmax, np.greater, explicit)
 
-    def argmin(self, axis=None, out=None):
+    def argmin(self, axis=None, out=None, *, explicit=False):
         """Return indices of minimum elements along an axis.
 
-        Implicit zero elements are also taken into account. If there are
+        By default, implicit zero elements are taken into account. If there are
         several minimum values, the index of the first occurrence is returned.
+        If `explicit` is set, only explicitly stored elements will be considered.
 
         Parameters
         ----------
         axis : {-2, -1, 0, 1, None}, optional
             Axis along which the argmin is computed. If None (default), index
             of the minimum element in the flatten data is returned.
+
         out : None, optional
             This argument is in the signature *solely* for NumPy
             compatibility reasons. Do not pass in anything except for
             the default value, as this argument is not used.
 
+        explicit : {False, True}, optional (default: False)
+            When set to True, only explicitly stored elements will be considered.
+            If axis is not None and a row/column has no stored elements, argmin
+            is undefined, so the index ``0`` is returned for that row/column.
+
+            .. versionadded:: 1.15.0
+
         Returns
         -------
          ind : numpy.matrix or int
             Indices of minimum elements. If matrix, its size along `axis` is 1.
         """
-        return self._arg_min_or_max(axis, out, np.argmin, np.less)
+        return self._arg_min_or_max(axis, out, np.argmin, np.less, explicit)
diff --git a/scipy/sparse/tests/test_base.py b/scipy/sparse/tests/test_base.py