Merge pull request #464 from scikit-learn-contrib/238-giving-a-fraction-of-samples-instead-of-a-number-of-samples-in-the-subsample-class

BaptisteCalot · web-flow · commit 4c2500142f2b · 2024-06-20T14:30:38.000+02:00
238 giving a fraction of samples instead of a number of samples in the subsample class
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -5,6 +5,8 @@ History
 0.8.x (2024-xx-xx)
 ------------------
 
+* Allow building a training set from a fraction between 0 and 1 via the `n_samples` attribute when using the `split` method of the `Subsample` class.
+
 0.8.6 (2024-06-14)
 ------------------
 
diff --git a/mapie/subsample.py b/mapie/subsample.py
@@ -10,6 +10,7 @@
 from sklearn.utils.validation import _num_samples
 
 from ._typing import NDArray
+from .utils import check_n_samples
 
 
 class Subsample(BaseCrossValidator):
@@ -22,9 +23,10 @@ class Subsample(BaseCrossValidator):
     ----------
     n_resamplings : int
         Number of resamplings. By default ``30``.
-    n_samples: int
+    n_samples: Union[int, float]
         Number of samples in each resampling. By default ``None``,
-        the size of the training set.
+        the size of the training set. If it is between 0 and 1,
+        it becomes the fraction of samples
     replace: bool
         Whether to replace samples in resamplings or not. By default ``True``.
     random_state: Optional[Union[int, RandomState]]
@@ -46,7 +48,7 @@ class Subsample(BaseCrossValidator):
     def __init__(
         self,
         n_resamplings: int = 30,
-        n_samples: Optional[int] = None,
+        n_samples: Optional[Union[int, float]] = None,
         replace: bool = True,
         random_state: Optional[Union[int, RandomState]] = None,
     ) -> None:
@@ -74,9 +76,7 @@ def split(
             The testing set indices for that split.
         """
         indices = np.arange(_num_samples(X))
-        n_samples = (
-            self.n_samples if self.n_samples is not None else len(indices)
-        )
+        n_samples = check_n_samples(X, self.n_samples, indices)
         random_state = check_random_state(self.random_state)
         for k in range(self.n_resamplings):
             train_index = resample(
diff --git a/mapie/tests/test_subsample.py b/mapie/tests/test_subsample.py
@@ -32,6 +32,50 @@ def test_split_SubSample() -> None:
     np.testing.assert_equal(tests, tests_expected)
 
 
+@pytest.mark.parametrize("n_samples", [4, 6, 8, 10])
+@pytest.mark.parametrize("n_resamplings", [1, 2, 3])
+def test_n_samples_int(n_samples: int,
+                       n_resamplings: int) -> None:
+    """Test the split sizes of Subsample when n_samples is an int."""
+    X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
+    cv = Subsample(n_resamplings=n_resamplings, random_state=0,
+                   n_samples=n_samples, replace=False)
+    train_set = np.concatenate([x[0] for x in cv.split(X)])
+    val_set = np.concatenate([x[1] for x in cv.split(X)])
+    assert len(train_set) == n_samples*n_resamplings
+    assert len(val_set) == (X.shape[0] - n_samples)*n_resamplings
+
+
+@pytest.mark.parametrize("n_samples", [0.4, 0.6, 0.8, 0.9])
+@pytest.mark.parametrize("n_resamplings", [1, 2, 3])
+def test_n_samples_float(n_samples: float,
+                         n_resamplings: int) -> None:
+    """Test Subsample split sizes when n_samples is a float in (0, 1):
+    each train set should hold floor(n_samples * len(X)) indices."""
+    X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
+    cv = Subsample(n_resamplings=n_resamplings, random_state=0,
+                   n_samples=n_samples, replace=False)
+    train_set = np.concatenate([x[0] for x in cv.split(X)])
+    val_set = np.concatenate([x[1] for x in cv.split(X)])
+    assert len(train_set) == int(np.floor(n_samples*X.shape[0]))*n_resamplings
+    assert len(val_set) == (
+        (X.shape[0] - int(np.floor(n_samples * X.shape[0]))) *
+        n_resamplings
+    )
+
+
+@pytest.mark.parametrize("n_resamplings", [1, 2, 3])
+def test_n_samples_none(n_resamplings: int) -> None:
+    """Test that n_samples=None uses all of X for train (empty val set)."""
+    X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
+    cv = Subsample(n_resamplings=n_resamplings, random_state=0,
+                   replace=False)
+    train_set = np.concatenate([x[0] for x in cv.split(X)])
+    val_set = np.concatenate([x[1] for x in cv.split(X)])
+    assert len(train_set) == X.shape[0]*n_resamplings
+    assert len(val_set) == 0
+
+
 def test_default_parameters_BlockBootstrap() -> None:
     """Test default values of Subsample."""
     cv = BlockBootstrap()
diff --git a/mapie/tests/test_utils.py b/mapie/tests/test_utils.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import re
 from typing import Any, Optional, Tuple
 
 import numpy as np
@@ -17,10 +18,10 @@
                          check_array_inf, check_array_nan, check_arrays_length,
                          check_binary_zero_one, check_cv, check_gamma,
                          check_lower_upper_bounds, check_n_features_in,
-                         check_n_jobs, check_no_agg_cv, check_null_weight,
-                         check_number_bins, check_split_strategy,
-                         check_verbose, compute_quantiles, fit_estimator,
-                         get_binning_groups)
+                         check_n_jobs, check_n_samples, check_no_agg_cv,
+                         check_null_weight, check_number_bins,
+                         check_split_strategy, check_verbose,
+                         compute_quantiles, fit_estimator, get_binning_groups)
 
 X_toy = np.array([0, 1, 2, 3, 4, 5]).reshape(-1, 1)
 y_toy = np.array([5, 7, 9, 11, 13, 15])
@@ -508,3 +509,51 @@ def test_check_no_agg_cv_value_error(cv: Any) -> None:
         match=r"Allowed values must have the `get_n_splits` method"
     ):
         check_no_agg_cv(X_toy, cv, array)
+
+
+@pytest.mark.parametrize("n_samples", [-4, -2, -1])
+def test_invalid_n_samples_int_negative(n_samples: int) -> None:
+    """Test that a negative int n_samples raises a ValueError."""
+    X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
+    indices = X.copy()
+    with pytest.raises(
+        ValueError,
+        match=re.escape(
+            r"Invalid n_samples. Allowed values "
+            r"are float in the range (0.0, 1.0) or"
+            r" int in the range [1, inf)"
+        )
+    ):
+        check_n_samples(X=X, n_samples=n_samples, indices=indices)
+
+
+@pytest.mark.parametrize("n_samples", [0.002, 0.003, 0.04])
+def test_invalid_n_samples_int_zero(n_samples: int) -> None:
+    """Test that a float n_samples too small to yield one sample raises."""
+    X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
+    indices = X.copy()
+    with pytest.raises(
+        ValueError,
+        match=re.escape(
+            r"The value of n_samples is too small. "
+            r"You need to increase it so that n_samples*X.shape[0] > 1"
+            r"otherwise n_samples should be an int"
+        )
+    ):
+        check_n_samples(X=X, n_samples=n_samples, indices=indices)
+
+
+@pytest.mark.parametrize("n_samples", [-5.5, -4.3, -0.2, 1.2, 2.5, 3.4])
+def test_invalid_n_samples_float(n_samples: float) -> None:
+    """Test that a float n_samples outside (0, 1) raises a ValueError."""
+    X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
+    indices = X.copy()
+    with pytest.raises(
+        ValueError,
+        match=re.escape(
+            r"Invalid n_samples. Allowed values "
+            r"are float in the range (0.0, 1.0) or"
+            r" int in the range [1, inf)"
+        )
+    ):
+        check_n_samples(X=X, n_samples=n_samples, indices=indices)
diff --git a/mapie/utils.py b/mapie/utils.py
@@ -1355,3 +1355,56 @@ def check_arrays_length(*arrays: NDArray) -> None:
         raise ValueError(
                 "There are arrays with different length"
             )
+
+
+def check_n_samples(
+    X: NDArray,
+    n_samples: Optional[Union[float, int]],
+    indices: NDArray
+) -> int:
+    """
+    Check ``n_samples`` and convert it to an int number of samples.
+
+    Parameters
+    ----------
+    X, indices: NDArray
+        Data and sample indices: ``X.shape[0]`` scales a float ``n_samples``
+        and ``len(indices)`` is returned when ``n_samples`` is None.
+    n_samples: Optional[Union[float, int]]
+        Fraction (float in (0.0, 1.0)) or number (int) of train samples.
+
+    Returns
+    -------
+    int
+        Number of samples in the train sample.
+
+    Raises
+    ------
+    ValueError
+        If ``n_samples`` is an int <= 0, a float outside (0.0, 1.0), or a
+        float whose floored product with ``X.shape[0]`` is 0.
+    """
+    if n_samples is None:
+        n_samples = len(indices)
+    elif isinstance(n_samples, float):
+        if 0 < n_samples < 1:
+            n_samples = int(np.floor(n_samples * X.shape[0]))  # round down
+            if n_samples == 0:  # fraction too small for one sample
+                raise ValueError(
+                    "The value of n_samples is too small. "
+                    "You need to increase it so that n_samples*X.shape[0] > 1"
+                    "otherwise n_samples should be an int"
+                    )
+        else:
+            raise ValueError(
+                "Invalid n_samples. Allowed values "
+                "are float in the range (0.0, 1.0) or"
+                " int in the range [1, inf)"
+                )
+    elif isinstance(n_samples, int) and n_samples <= 0:  # non-positive count
+        raise ValueError(
+             "Invalid n_samples. Allowed values "
+             "are float in the range (0.0, 1.0) or"
+             " int in the range [1, inf)"
+             )
+    return int(n_samples)