
Commit 5084217

Merge pull request #136 from scikit-learn-contrib/fix_issue_128
Fix issue 128
2 parents 6b1f863 + 057b22e commit 5084217

9 files changed: +150 −74 lines changed

.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ Please describe the tests that you ran to verify your changes. Provide instructi
 - [ ] Test A
 - [ ] Test B

-# Checklist:
+# Checklist

 - [ ] I have read the [contributing guidelines](https://github.com/simai-ml/MAPIE/blob/master/CONTRIBUTING.rst)
 - [ ] I have updated the [HISTORY.rst](https://github.com/simai-ml/MAPIE/blob/master/HISTORY.rst) and [AUTHORS.rst](https://github.com/simai-ml/MAPIE/blob/master/AUTHORS.rst) files

HISTORY.rst

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ History
 "predict" in regression.py
 * Add replication of the Chen Xu's tutorial testing Jackknife+aB vs Jackknife+
 * Add Jackknife+-after-Bootstrap documentation
+* Improve scikit-learn pipelines compatibility

 0.3.1 (2021-11-19)
 ------------------

environment.dev.yml

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ channels:
 dependencies:
 - bump2version=1.0.1
 - flake8=4.0.1
-- mypy=0.920
+- mypy=0.910
 - numpydoc=1.1.0
 - pandas=1.3.5
 - pytest=6.2.5

examples/classification/1-quickstart/plot_comp_methods_on_2d_dataset.py

Lines changed: 1 addition & 1 deletion
@@ -168,7 +168,7 @@ def plot_scores(
 fig, axs = plt.subplots(1, 2, figsize=(10, 5))
 for i, method in enumerate(methods):
     conformity_scores = mapie[method].conformity_scores_
-    n = mapie[method].n_samples_val_
+    n = mapie[method].n_samples_
     quantiles = mapie[method].quantiles_
     plot_scores(alpha, conformity_scores, quantiles, method, axs[i])
 plt.show()
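
The snippet above reads mapie[method].n_samples_, so `mapie` has to be a mapping from method name to an already fitted MapieClassifier, and the attribute only exists after fit. A minimal sketch of how such a mapping could be built; the toy data, the prefit LogisticRegression and the "score"/"cumulated_score" method names are assumptions for illustration, not copied from the example script:

# Hypothetical setup mirroring the example's mapie[method] pattern:
# one MapieClassifier per conformity-score method, wrapped around a prefit model.
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from mapie.classification import MapieClassifier

X, y = make_blobs(n_samples=500, centers=3, random_state=42)
clf = LogisticRegression().fit(X, y)

methods = ["score", "cumulated_score"]
mapie = {}
for method in methods:
    mapie[method] = MapieClassifier(estimator=clf, method=method, cv="prefit")
    mapie[method].fit(X, y)

print(mapie["score"].n_samples_)                 # number of samples seen by fit
print(mapie["score"].conformity_scores_.shape)   # one conformity score per sample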

mapie/classification.py

Lines changed: 23 additions & 26 deletions
@@ -7,10 +7,15 @@
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import BaseCrossValidator
 from sklearn.pipeline import Pipeline
-from sklearn.utils import check_X_y, check_array, check_random_state
-from sklearn.utils.multiclass import type_of_target
-from sklearn.utils.validation import check_is_fitted
 from sklearn.preprocessing import label_binarize
+from sklearn.utils import check_random_state, _safe_indexing
+from sklearn.utils.multiclass import type_of_target
+from sklearn.utils.validation import (
+    indexable,
+    check_is_fitted,
+    _num_samples,
+    _check_y,
+)

 from ._typing import ArrayLike
 from ._machine_precision import EPSILON

@@ -122,7 +127,7 @@ class MapieClassifier(BaseEstimator, ClassifierMixin): # type: ignore
     n_features_in_: int
         Number of features passed to the fit method.

-    n_samples_val_: Union[int, List[int]]
+    n_samples_: Union[int, List[int]]
         Number of samples passed to the fit method.

     conformity_scores_ : ArrayLike of shape (n_samples_train)

@@ -173,7 +178,7 @@ class MapieClassifier(BaseEstimator, ClassifierMixin): # type: ignore
         "single_estimator_",
         "estimators_",
         "n_features_in_",
-        "n_samples_val_",
+        "n_samples_",
         "conformity_scores_"
     ]

@@ -601,16 +606,18 @@ def _fit_and_predict_oof_model(
             of shape (n_samples_val,).

         """
-        X_train, y_train, X_val, y_val = (
-            X[train_index], y[train_index], X[val_index], y[val_index]
-        )
+        X_train = _safe_indexing(X, train_index)
+        y_train = _safe_indexing(y, train_index)
+        X_val = _safe_indexing(X, val_index)
+        y_val = _safe_indexing(y, val_index)
+
         if sample_weight is None:
             estimator = fit_estimator(estimator, X_train, y_train)
         else:
             estimator = fit_estimator(
                 estimator, X_train, y_train, sample_weight[train_index]
             )
-        if X_val.shape[0] > 0:
+        if _num_samples(X_val) > 0:
             y_pred_proba = self._predict_oof_model(
                 estimator, X_val,
             )

@@ -663,13 +670,10 @@ def fit(
         self._check_parameters()
         cv = check_cv(self.cv)
         estimator = self._check_estimator(X, y, self.estimator)
-
         if self.image_input:
             check_input_is_image(X)
-        X, y = check_X_y(
-            X, y, force_all_finite=False, ensure_2d=self.image_input,
-            allow_nd=self.image_input, dtype=["float64", "int", "object"]
-        )
+        X, y = indexable(X, y)
+        y = _check_y(y)
         assert type_of_target(y) == "multiclass"
         self.n_classes_ = len(set(y))
         self.n_features_in_ = check_n_features_in(X, cv, estimator)

@@ -678,7 +682,7 @@ def fit(
         # Initialization
         self.estimators_: List[ClassifierMixin] = []
         self.k_ = np.empty_like(y, dtype=int)
-        self.n_samples_val_ = X.shape[0]
+        self.n_samples_ = _num_samples(X)

         # Work
         if cv == "prefit":

@@ -716,7 +720,7 @@ def fit(
             self.conformity_scores_ = np.empty(y_pred_proba.shape)
         elif self.method == "score":
             self.conformity_scores_ = np.take_along_axis(
-                1 - y_pred_proba, y.reshape(-1, 1), axis=1
+                1 - y_pred_proba, np.ravel(y).reshape(-1, 1), axis=1
             )
         elif self.method == "cumulated_score":
             y_true = label_binarize(y=y, classes=estimator.classes_)

@@ -731,7 +735,7 @@ def fit(
                 y_pred_proba_sorted_cumsum, cutoff.reshape(-1, 1), axis=1
             )
             y_proba_true = np.take_along_axis(
-                y_pred_proba, y.reshape(-1, 1), axis=1
+                y_pred_proba, np.ravel(y).reshape(-1, 1), axis=1
             )
             random_state = check_random_state(self.random_state)
             u = random_state.uniform(size=len(y_pred_proba)).reshape(-1, 1)

@@ -744,7 +748,7 @@ def fit(
             )
             self.conformity_scores_ = np.take_along_axis(
                 index,
-                y.reshape(-1, 1),
+                np.ravel(y).reshape(-1, 1),
                 axis=1
             )

@@ -829,13 +833,6 @@ def predict(
         check_is_fitted(self, self.fit_attributes)
         if self.image_input:
             check_input_is_image(X)
-        X = check_array(
-            X,
-            force_all_finite=False,
-            ensure_2d=self.image_input,
-            allow_nd=self.image_input,
-            dtype=["float64", "object"]
-        )

         # Estimate probabilities from estimator(s)
         y_pred = self.single_estimator_.predict(X)

@@ -865,7 +862,7 @@ def predict(
             raise ValueError("Invalid 'agg_scores' argument.")

         # Estimate prediction sets
-        n = self.n_samples_val_
+        n = self.n_samples_
         if alpha_ is None:
             return np.array(y_pred)
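
Replacing check_X_y/check_array and positional NumPy indexing with indexable, _check_y, _safe_indexing and _num_samples keeps X in its original container (NumPy array, pandas DataFrame, sparse matrix) while the cross-validation folds are built. A minimal sketch of the behaviour these scikit-learn helpers provide; the toy data below is an assumption for illustration, not part of the repository:

# Why _safe_indexing rather than X[train_index]: integer-array indexing on a
# DataFrame selects *columns* (and raises KeyError here), _safe_indexing selects rows.
import numpy as np
import pandas as pd
from sklearn.utils import _safe_indexing
from sklearn.utils.validation import _num_samples

X = pd.DataFrame({"age": [21, 35, 48, 52], "city": ["a", "b", "a", "c"]})
train_index = np.array([0, 2])

X_train = _safe_indexing(X, train_index)   # rows 0 and 2, still a DataFrame
print(_num_samples(X_train))               # 2, no conversion to a NumPy array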

mapie/regression.py

Lines changed: 29 additions & 25 deletions
@@ -9,8 +9,13 @@
 from sklearn.linear_model import LinearRegression
 from sklearn.model_selection import BaseCrossValidator
 from sklearn.pipeline import Pipeline
-from sklearn.utils import check_array, check_X_y
-from sklearn.utils.validation import check_is_fitted
+from sklearn.utils import _safe_indexing
+from sklearn.utils.validation import (
+    indexable,
+    check_is_fitted,
+    _num_samples,
+    _check_y,
+)

 from ._typing import ArrayLike
 from .aggregation_functions import aggregate_all, phi2D

@@ -149,7 +154,7 @@ class MapieRegressor(BaseEstimator, RegressorMixin): # type: ignore
     n_features_in_: int
         Number of features passed to the fit method.

-    n_samples_val_: List[int]
+    n_samples_: List[int]
         Number of samples passed to the fit method.

     References

@@ -190,7 +195,7 @@ class MapieRegressor(BaseEstimator, RegressorMixin): # type: ignore
         "k_",
         "residuals_",
         "n_features_in_",
-        "n_samples_val_",
+        "n_samples_",
     ]

     def __init__(

@@ -341,7 +346,7 @@ def _fit_and_predict_oof_model(
         val_index: ArrayLike,
         k: int,
         sample_weight: Optional[ArrayLike] = None,
-    ) -> Tuple[RegressorMixin, ArrayLike, ArrayLike, ArrayLike]:
+    ) -> Tuple[RegressorMixin, ArrayLike, ArrayLike]:
         """
         Fit a single out-of-fold model on a given training set and
         perform predictions on a test set.

@@ -372,30 +377,30 @@ def _fit_and_predict_oof_model(

         Returns
         -------
-        Tuple[RegressorMixin, ArrayLike, ArrayLike, ArrayLike]
+        Tuple[RegressorMixin, ArrayLike, ArrayLike]

         - [0]: Fitted estimator
         - [1]: Estimator predictions on the validation fold,
           of shape (n_samples_val,)
-        - [2]: Identification number of the validation fold,
-          of shape (n_samples_val,)
         - [3]: Validation data indices,
           of shape (n_samples_val,).

         """
-        X_train, y_train, X_val = X[train_index], y[train_index], X[val_index]
+        X_train = _safe_indexing(X, train_index)
+        y_train = _safe_indexing(y, train_index)
+        X_val = _safe_indexing(X, val_index)
         if sample_weight is None:
             estimator = fit_estimator(estimator, X_train, y_train)
         else:
+            sample_weight_train = _safe_indexing(sample_weight, train_index)
             estimator = fit_estimator(
-                estimator, X_train, y_train, sample_weight[train_index]
+                estimator, X_train, y_train, sample_weight_train
             )
-        if X_val.shape[0] > 0:
+        if _num_samples(X_val) > 0:
             y_pred = estimator.predict(X_val)
         else:
             y_pred = np.array([])
-        val_id = np.full_like(y_pred, k, dtype=int)
-        return estimator, y_pred, val_id, val_index
+        return estimator, y_pred, val_index

     def aggregate_with_mask(self, x: ArrayLike, k: ArrayLike) -> ArrayLike:
         """

@@ -479,9 +484,8 @@ def fit(
         cv = check_cv(self.cv)
         estimator = self._check_estimator(self.estimator)
         agg_function = self._check_agg_function(self.agg_function)
-        X, y = check_X_y(
-            X, y, force_all_finite=False, dtype=["float64", "int", "object"]
-        )
+        X, y = indexable(X, y)
+        y = _check_y(y)
         self.n_features_in_ = check_n_features_in(X, cv, estimator)
         sample_weight, X, y = check_null_weight(sample_weight, X, y)

@@ -492,7 +496,7 @@ def fit(
         if cv == "prefit":
             self.single_estimator_ = estimator
             y_pred = self.single_estimator_.predict(X)
-            self.n_samples_val_ = [X.shape[0]]
+            self.n_samples_ = [_num_samples(X)]
             self.k_ = np.full(
                 shape=(len(y), 1), fill_value=np.nan, dtype=float
             )

@@ -514,7 +518,7 @@ def fit(
             )
             if self.method == "naive":
                 y_pred = self.single_estimator_.predict(X)
-                self.n_samples_val_ = [X.shape[0]]
+                self.n_samples_ = [_num_samples(X)]
             else:
                 outputs = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
                     delayed(self._fit_and_predict_oof_model)(

@@ -528,22 +532,22 @@ def fit(
                     )
                     for k, (train_index, val_index) in enumerate(cv.split(X))
                 )
-                self.estimators_, predictions, val_ids, val_indices = map(
+                self.estimators_, predictions, val_indices = map(
                     list, zip(*outputs)
                 )

-                self.n_samples_val_ = [
+                self.n_samples_ = [
                     np.array(pred).shape[0] for pred in predictions
                 ]

                 for i, val_ind in enumerate(val_indices):
-                    pred_matrix[val_ind, i] = predictions[i]
+                    pred_matrix[val_ind, i] = np.array(predictions[i]).ravel()
                     self.k_[val_ind, i] = 1
                 check_nan_in_aposteriori_prediction(pred_matrix)

                 y_pred = aggregate_all(agg_function, pred_matrix)

-        self.residuals_ = np.abs(y - y_pred)
+        self.residuals_ = np.abs(np.ravel(y) - y_pred)
         return self

     def predict(

@@ -605,7 +609,6 @@ def predict(
         check_is_fitted(self, self.fit_attributes)
         self._check_ensemble(ensemble)
         alpha_ = check_alpha(alpha)
-        X = check_array(X, force_all_finite=False, dtype=["float64", "object"])
         y_pred = self.single_estimator_.predict(X)

         if alpha is None:

@@ -658,7 +661,7 @@ def predict(
                     )
                     for _alpha in alpha_
                 ]
-            )
+            ).data
             y_pred_up = np.column_stack(
                 [
                     np.quantile(

@@ -669,7 +672,8 @@ def predict(
                     )
                     for _alpha in alpha_
                 ]
-            )
+            ).data
             if ensemble:
                 y_pred = aggregate_all(self.agg_function, y_pred_multi)
+            np.stack([y_pred_low, y_pred_up], axis=1)
             return y_pred, np.stack([y_pred_low, y_pred_up], axis=1)
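
Since fit and predict now validate inputs through indexable and _check_y instead of check_X_y and check_array, MapieRegressor can wrap a complete scikit-learn Pipeline fed with a raw DataFrame and let the pipeline handle non-numeric columns itself. A minimal sketch of that use case; the dataset and hyperparameters are illustrative assumptions, not taken from the repository:

# Hedged sketch: MapieRegressor wrapping a Pipeline that receives a raw DataFrame.
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from mapie.regression import MapieRegressor

X = pd.DataFrame({
    "size": [30, 45, 60, 80, 100, 120],
    "city": ["lyon", "paris", "lyon", "nice", "paris", "nice"],
})
y = pd.Series([110, 320, 150, 240, 390, 290])

preprocess = ColumnTransformer(
    [("onehot", OneHotEncoder(handle_unknown="ignore"), ["city"])],
    remainder="passthrough",
)
pipe = make_pipeline(preprocess, LinearRegression())

# The "city" column reaches OneHotEncoder untouched because the folds are built
# with indexable/_safe_indexing rather than by coercing X to a float array.
mapie = MapieRegressor(pipe, method="plus", cv=3)
mapie.fit(X, y)
y_pred, y_pis = mapie.predict(X, alpha=0.1)   # point predictions and intervals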
