ENH Reduce redundancy in floating type checks for Array API support in _regression.py (scikit-learn#30128)

virchan · adrinjalali · ogrisel · web-flow · commit 33f08f1c5202 · 2024-12-02T15:11:07.000+05:00
Co-authored-by: Adrin Jalali <adrin.jalali@gmail.com>
Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py
@@ -58,11 +58,16 @@
 def _check_reg_targets(y_true, y_pred, multioutput, dtype="numeric", xp=None):
     """Check that y_true and y_pred belong to the same regression task.
 
+    To reduce redundancy when calling `_find_matching_floating_dtype`,
+    please use `_check_reg_targets_with_floating_dtype` instead.
+
     Parameters
     ----------
-    y_true : array-like
+    y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
+        Ground truth (correct) target values.
 
-    y_pred : array-like
+    y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
+        Estimated target values.
 
     multioutput : array-like or string in ['raw_values', uniform_average',
         'variance_weighted'] or None
@@ -137,6 +142,71 @@ def _check_reg_targets(y_true, y_pred, multioutput, dtype="numeric", xp=None):
     return y_type, y_true, y_pred, multioutput
 
 
+def _check_reg_targets_with_floating_dtype(
+    y_true, y_pred, sample_weight, multioutput, xp=None
+):
+    """Validate regression targets and sample weights using a shared floating dtype.
+
+    Extends `_check_reg_targets` by automatically selecting a suitable floating-point
+    data type for inputs using `_find_matching_floating_dtype`.
+
+    Use this private helper only when converting inputs to array API-compatible arrays.
+
+    Parameters
+    ----------
+    y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
+        Ground truth (correct) target values.
+
+    y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
+        Estimated target values.
+
+    sample_weight : array-like of shape (n_samples,) or None
+        Sample weights. When None, it is returned unchanged.
+
+    multioutput : array-like or string in ['raw_values', 'uniform_average', \
+        'variance_weighted'] or None
+        None is accepted due to backward compatibility of r2_score().
+
+    xp : module, default=None
+        Precomputed array namespace module; passing it skips namespace inspection.
+        NOTE(review): `xp.asarray` is called on it directly when `sample_weight`
+        is not None, so `xp=None` would fail on that path.
+
+    Returns
+    -------
+    y_type : one of {'continuous', 'continuous-multioutput'}
+        The type of the true target data, as output by
+        'utils.multiclass.type_of_target'.
+
+    y_true : array-like of shape (n_samples, n_outputs)
+        Ground truth (correct) target values.
+
+    y_pred : array-like of shape (n_samples, n_outputs)
+        Estimated target values.
+
+    sample_weight : array-like of shape (n_samples,) or None
+        Sample weights converted to the selected floating dtype, or None if not given.
+
+    multioutput : array-like of shape (n_outputs,) or string in ['raw_values', \
+        'uniform_average', 'variance_weighted'] or None
+        Custom output weights if ``multioutput`` is array-like or
+        just the corresponding argument if ``multioutput`` is a
+        correct keyword.
+    """
+    dtype_name = _find_matching_floating_dtype(y_true, y_pred, sample_weight, xp=xp)
+
+    y_type, y_true, y_pred, multioutput = _check_reg_targets(
+        y_true, y_pred, multioutput, dtype=dtype_name, xp=xp
+    )
+
+    # _check_reg_targets does not accept sample_weight as input.
+    # Convert sample_weight's data type separately to match dtype_name.
+    if sample_weight is not None:
+        sample_weight = xp.asarray(sample_weight, dtype=dtype_name)
+
+    return y_type, y_true, y_pred, sample_weight, multioutput
+
+
 @validate_params(
     {
         "y_true": ["array-like"],
@@ -201,14 +271,14 @@ def mean_absolute_error(
     >>> mean_absolute_error(y_true, y_pred, multioutput=[0.3, 0.7])
     0.85...
     """
-    input_arrays = [y_true, y_pred, sample_weight, multioutput]
-    xp, _ = get_namespace(*input_arrays)
-
-    dtype = _find_matching_floating_dtype(y_true, y_pred, sample_weight, xp=xp)
+    xp, _ = get_namespace(y_true, y_pred, sample_weight, multioutput)
 
-    _, y_true, y_pred, multioutput = _check_reg_targets(
-        y_true, y_pred, multioutput, dtype=dtype, xp=xp
+    _, y_true, y_pred, sample_weight, multioutput = (
+        _check_reg_targets_with_floating_dtype(
+            y_true, y_pred, sample_weight, multioutput, xp=xp
+        )
     )
+
     check_consistent_length(y_true, y_pred, sample_weight)
 
     output_errors = _average(
@@ -398,19 +468,16 @@ def mean_absolute_percentage_error(
     >>> mean_absolute_percentage_error(y_true, y_pred)
     112589990684262.48
     """
-    input_arrays = [y_true, y_pred, sample_weight, multioutput]
-    xp, _ = get_namespace(*input_arrays)
-    dtype = _find_matching_floating_dtype(y_true, y_pred, sample_weight, xp=xp)
-
-    y_type, y_true, y_pred, multioutput = _check_reg_targets(
-        y_true, y_pred, multioutput, dtype=dtype, xp=xp
+    xp, _ = get_namespace(y_true, y_pred, sample_weight, multioutput)
+    _, y_true, y_pred, sample_weight, multioutput = (
+        _check_reg_targets_with_floating_dtype(
+            y_true, y_pred, sample_weight, multioutput, xp=xp
+        )
     )
     check_consistent_length(y_true, y_pred, sample_weight)
-    epsilon = xp.asarray(xp.finfo(xp.float64).eps, dtype=dtype)
-    y_true_abs = xp.asarray(xp.abs(y_true), dtype=dtype)
-    mape = xp.asarray(xp.abs(y_pred - y_true), dtype=dtype) / xp.maximum(
-        y_true_abs, epsilon
-    )
+    epsilon = xp.asarray(xp.finfo(xp.float64).eps, dtype=y_true.dtype)
+    y_true_abs = xp.abs(y_true)
+    mape = xp.abs(y_pred - y_true) / xp.maximum(y_true_abs, epsilon)
     output_errors = _average(mape, weights=sample_weight, axis=0)
     if isinstance(multioutput, str):
         if multioutput == "raw_values":
@@ -494,10 +561,10 @@ def mean_squared_error(
     0.825...
     """
     xp, _ = get_namespace(y_true, y_pred, sample_weight, multioutput)
-    dtype = _find_matching_floating_dtype(y_true, y_pred, xp=xp)
-
-    _, y_true, y_pred, multioutput = _check_reg_targets(
-        y_true, y_pred, multioutput, dtype=dtype, xp=xp
+    _, y_true, y_pred, sample_weight, multioutput = (
+        _check_reg_targets_with_floating_dtype(
+            y_true, y_pred, sample_weight, multioutput, xp=xp
+        )
     )
     check_consistent_length(y_true, y_pred, sample_weight)
     output_errors = _average((y_true - y_pred) ** 2, axis=0, weights=sample_weight)
@@ -670,10 +737,9 @@ def mean_squared_log_error(
     0.060...
     """
     xp, _ = get_namespace(y_true, y_pred)
-    dtype = _find_matching_floating_dtype(y_true, y_pred, xp=xp)
 
-    _, y_true, y_pred, _ = _check_reg_targets(
-        y_true, y_pred, multioutput, dtype=dtype, xp=xp
+    _, y_true, y_pred, _, _ = _check_reg_targets_with_floating_dtype(
+        y_true, y_pred, sample_weight, multioutput, xp=xp
     )
 
     if xp.any(y_true <= -1) or xp.any(y_pred <= -1):
@@ -747,10 +813,9 @@ def root_mean_squared_log_error(
     0.199...
     """
     xp, _ = get_namespace(y_true, y_pred)
-    dtype = _find_matching_floating_dtype(y_true, y_pred, xp=xp)
 
-    _, y_true, y_pred, multioutput = _check_reg_targets(
-        y_true, y_pred, multioutput, dtype=dtype, xp=xp
+    _, y_true, y_pred, _, _ = _check_reg_targets_with_floating_dtype(
+        y_true, y_pred, sample_weight, multioutput, xp=xp
     )
 
     if xp.any(y_true <= -1) or xp.any(y_pred <= -1):
@@ -1188,11 +1253,12 @@ def r2_score(
         y_true, y_pred, sample_weight, multioutput
     )
 
-    dtype = _find_matching_floating_dtype(y_true, y_pred, sample_weight, xp=xp)
-
-    _, y_true, y_pred, multioutput = _check_reg_targets(
-        y_true, y_pred, multioutput, dtype=dtype, xp=xp
+    _, y_true, y_pred, sample_weight, multioutput = (
+        _check_reg_targets_with_floating_dtype(
+            y_true, y_pred, sample_weight, multioutput, xp=xp
+        )
     )
+
     check_consistent_length(y_true, y_pred, sample_weight)
 
     if _num_samples(y_pred) < 2:
@@ -1201,7 +1267,7 @@ def r2_score(
         return float("nan")
 
     if sample_weight is not None:
-        sample_weight = column_or_1d(sample_weight, dtype=dtype)
+        sample_weight = column_or_1d(sample_weight)
         weight = sample_weight[:, None]
     else:
         weight = 1.0
@@ -1356,8 +1422,8 @@ def mean_tweedie_deviance(y_true, y_pred, *, sample_weight=None, power=0):
     1.4260...
     """
     xp, _ = get_namespace(y_true, y_pred)
-    y_type, y_true, y_pred, _ = _check_reg_targets(
-        y_true, y_pred, None, dtype=[xp.float64, xp.float32], xp=xp
+    y_type, y_true, y_pred, sample_weight, _ = _check_reg_targets_with_floating_dtype(
+        y_true, y_pred, sample_weight, multioutput=None, xp=xp
     )
     if y_type == "continuous-multioutput":
         raise ValueError("Multioutput not supported in mean_tweedie_deviance")
@@ -1570,8 +1636,8 @@ def d2_tweedie_score(y_true, y_pred, *, sample_weight=None, power=0):
     """
     xp, _ = get_namespace(y_true, y_pred)
 
-    y_type, y_true, y_pred, _ = _check_reg_targets(
-        y_true, y_pred, None, dtype=[xp.float64, xp.float32], xp=xp
+    y_type, y_true, y_pred, sample_weight, _ = _check_reg_targets_with_floating_dtype(
+        y_true, y_pred, sample_weight, multioutput=None, xp=xp
     )
     if y_type == "continuous-multioutput":
         raise ValueError("Multioutput not supported in d2_tweedie_score")
diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py
@@ -583,8 +583,8 @@ def _require_positive_targets(y1, y2):
 def _require_log1p_targets(y1, y2):
     """Make targets strictly larger than -1"""
     offset = abs(min(y1.min(), y2.min())) - 0.99
-    y1 = y1.astype(float)
-    y2 = y2.astype(float)
+    y1 = y1.astype(np.float64)
+    y2 = y2.astype(np.float64)
     y1 += offset
     y2 += offset
     return y1, y2