Commit 1eeba9a (parent: 06e778e)

take into account VTA comments

6 files changed: +186, -81 lines

environment.doc.yml

Lines changed: 1 addition & 1 deletion

@@ -3,6 +3,7 @@ channels:
   - defaults
   - conda-forge
 dependencies:
+  - lightgbm=3.1.1
   - numpydoc=1.1.0
   - pandas=1.3.5
   - python=3.8
@@ -11,4 +12,3 @@ dependencies:
   - sphinx-gallery=0.10.1
   - sphinx_rtd_theme=1.0.0
   - typing_extensions=4.0.1
-  - lightgbm=3.1.1

Lines changed: 125 additions & 68 deletions

@@ -1,17 +1,19 @@
 """
-========================================================
-Example use of the prefit parameter with neural networks
-========================================================
+===========================================================================
+Example use of the prefit parameter with neural networks and LGBM Regressor
+===========================================================================

 :class:`mapie.regression.MapieRegressor` and
-:class:`mapie.quantile_regression.MapieQuantileRegressor``
+:class:`mapie.quantile_regression.MapieQuantileRegressor`
 are used to calibrate uncertainties for large models for
 which the cost of cross-validation is too high. Typically,
 neural networks rely on a single validation set.

 In this example, we first fit a neural network on the training set. We
 then compute residuals on a validation set with the `cv="prefit"` parameter.
 Finally, we evaluate the model with prediction intervals on a testing set.
+We will also show how to use the prefit method in the conformalized quantile
+regressor.
 """


@@ -20,11 +22,24 @@
 from matplotlib import pyplot as plt
 import scipy
 from sklearn.model_selection import train_test_split
+from sklearn.neural_network import MLPRegressor

 from mapie.regression import MapieRegressor
 from mapie.quantile_regression import MapieQuantileRegressor
 from mapie.metrics import regression_coverage_score
 from mapie._typing import NDArray
+import warnings
+warnings.filterwarnings("ignore")
+
+alpha = 0.1
+
+##############################################################################
+# 1. Generate dataset
+# -----------------------------------------------------------------------------
+#
+# We start by defining a function that we will use to generate data. We then
+# add random noise to the y values. Then we split the dataset into a training,
+# calibration and test set.


 def f(x: NDArray) -> NDArray:
@@ -39,67 +54,123 @@ def f(x: NDArray) -> NDArray:
 y = f(X) + np.random.normal(0, sigma, n_samples)

 # Train/validation/test split
-X_train_val, X_test, y_train_val, y_test = train_test_split(
+X_train_cal, X_test, y_train_cal, y_test = train_test_split(
     X, y, test_size=1 / 10
 )
-X_train, X_val, y_train, y_val = train_test_split(
-    X_train_val, y_train_val, test_size=1 / 9
+X_train, X_cal, y_train, y_cal = train_test_split(
+    X_train_cal, y_train_cal, test_size=1 / 9
 )

-# Train model on training set for MapieRegressor
-model = estimator = LGBMRegressor(
-    objective='quantile',
-    alpha=0.5,
-)
-model.fit(X_train.reshape(-1, 1), y_train)

-# Calibrate uncertainties on validation set
-mapie = MapieRegressor(model, cv="prefit")
-mapie.fit(X_val.reshape(-1, 1), y_val)
+##############################################################################
+# 2. Pre-train models
+# -----------------------------------------------------------------------------
+#
+# For this example, we will train an MLPRegressor for
+# :class:`mapie.regression.MapieRegressor` and multiple LGBMRegressor models
+# with a quantile objective, as this is a requirement to perform conformalized
+# quantile regression using
+# :class:`mapie.quantile_regression.MapieQuantileRegressor`. Note that the
+# three estimators need to be trained at quantile values of
+# $(\alpha/2, 1-(\alpha/2), 0.5)$.
+
+
+# Train an MLPRegressor for MapieRegressor
+est_mlp = MLPRegressor(activation="relu", random_state=1)
+est_mlp.fit(X_train.reshape(-1, 1), y_train)
+
+# Train LGBMRegressor models for MapieQuantileRegressor
+list_estimators_cqr = []
+for alpha_ in [alpha/2, (1-(alpha/2)), 0.5]:
+    estimator_ = LGBMRegressor(
+        objective='quantile',
+        alpha=alpha_,
+    )
+    estimator_.fit(X_train.reshape(-1, 1), y_train)
+    list_estimators_cqr.append(estimator_)
+
+
+##############################################################################
+# 3. Using MAPIE to calibrate the models
+# -----------------------------------------------------------------------------
+#
+# We will now proceed to calibrate the models using MAPIE. This means using
+# the `cv="prefit"` parameter so that we use the models we already trained.
+# We then predict on the test set and evaluate the coverage.
+
+
+# Calibrate uncertainties on calibration set
+mapie = MapieRegressor(est_mlp, cv="prefit")
+mapie.fit(X_cal.reshape(-1, 1), y_cal)

 # Evaluate prediction and coverage level on testing set
-alpha = 0.1
 y_pred, y_pis = mapie.predict(X_test.reshape(-1, 1), alpha=alpha)
-y_pred_low, y_pred_up = y_pis[:, 0, 0], y_pis[:, 1, 0]
-coverage = regression_coverage_score(y_test, y_pred_low, y_pred_up)
-
-# Train models for MapieQuantileRegressor
-list_estimators = []
-estimator_low = LGBMRegressor(
-    objective='quantile',
-    alpha=(alpha/2),
-)
-estimator_low.fit(X_train.reshape(-1, 1), y_train)
-list_estimators.append(estimator_low)
+coverage = regression_coverage_score(y_test, y_pis[:, 0, 0], y_pis[:, 1, 0])

-estimator_high = LGBMRegressor(
-    objective='quantile',
-    alpha=(1-(alpha/2)),
-)
-estimator_high.fit(X_train.reshape(-1, 1), y_train)
-list_estimators.append(estimator_high)
+# Calibrate uncertainties on calibration set
+mapie_cqr = MapieQuantileRegressor(list_estimators_cqr, cv="prefit")
+mapie_cqr.fit(X_cal.reshape(-1, 1), y_cal)

+# Evaluate prediction and coverage level on testing set
+y_pred_cqr, y_pis_cqr = mapie_cqr.predict(X_test.reshape(-1, 1))
+coverage_cqr = regression_coverage_score(
+    y_test,
+    y_pis_cqr[:, 0, 0],
+    y_pis_cqr[:, 1, 0]
+)

-estimator = LGBMRegressor(
-    objective='quantile',
-    alpha=0.5,
-) # Note that this is the same model as used for QR
-estimator.fit(X_train.reshape(-1, 1), y_train)
-list_estimators.append(estimator)

-# Calibrate uncertainties on validation set
-mapie_cqr = MapieQuantileRegressor(list_estimators, cv="prefit")
-mapie_cqr.fit(X_val.reshape(-1, 1), y_val)
-y_pred_cqr, y_pis_cqr = mapie_cqr.predict(X_test.reshape(-1, 1))
-y_pred_low_cqr, y_pred_up_cqr = y_pis_cqr[:, 0, 0], y_pis_cqr[:, 1, 0]
-coverage_cqr = regression_coverage_score(y_test, y_pred_low_cqr, y_pred_up_cqr)
+##############################################################################
+# 4. Plots
+# -----------------------------------------------------------------------------
+#
+# In order to view the results shown above, we plot each of the predictions
+# with their prediction intervals: the multi-layer perceptron (MLP) with
+# :class:`mapie.regression.MapieRegressor` and LGBMRegressor with
+# :class:`mapie.quantile_regression.MapieQuantileRegressor`.

 # Plot obtained prediction intervals on testing set
 theoretical_semi_width = scipy.stats.norm.ppf(1 - alpha) * sigma
 y_test_theoretical = f(X_test)
 order = np.argsort(X_test)

-plt.scatter(X_test, y_test, color="red", alpha=0.3, label="testing", s=2)
+plt.figure(figsize=(8, 8))
+plt.plot(
+    X_test[order],
+    y_pred[order],
+    label="Predictions MLP",
+    color="green"
+)
+plt.fill_between(
+    X_test[order],
+    y_pis[:, 0, 0][order],
+    y_pis[:, 1, 0][order],
+    alpha=0.4,
+    label="prediction intervals MP",
+    color="green"
+)
+plt.plot(
+    X_test[order],
+    y_pred_cqr[order],
+    label="Predictions LGBM",
+    color="blue"
+)
+plt.fill_between(
+    X_test[order],
+    y_pis_cqr[:, 0, 0][order],
+    y_pis_cqr[:, 1, 0][order],
+    alpha=0.4,
+    label="prediction intervals MQP",
+    color="blue"
+)
+plt.title(
+    f"Target and effective coverages for:\n "
+    f"MLP with MapieRegressor alpha={alpha}: "
+    + f"({1 - alpha:.3f}, {coverage:.3f})\n"
+    f"LGBM with MapieQuantileRegressor alpha={alpha}: "
+    + f"({1 - alpha:.3f}, {coverage_cqr:.3f})"
+)
+plt.scatter(X_test, y_test, color="red", alpha=0.7, label="testing", s=2)
 plt.plot(
     X_test[order],
     y_test_theoretical[order],
@@ -118,27 +189,13 @@ def f(x: NDArray) -> NDArray:
     color="gray",
     ls="--",
 )
-plt.plot(X_test[order], y_pred[order], label="Predictions")
-plt.fill_between(
-    X_test[order],
-    y_pred_low[order],
-    y_pred_up[order],
-    alpha=0.4,
-    label="prediction intervals QR"
-)
-plt.fill_between(
-    X_test[order],
-    y_pred_low_cqr[order],
-    y_pred_up_cqr[order],
-    alpha=0.4,
-    label="prediction intervals CQR"
-)
-plt.title(
-    f"Target and effective coverages for:\n "
-    f"QR alpha={alpha}: ({1 - alpha:.3f}, {coverage:.3f})\n"
-    f"CQR alpha={alpha}: ({1 - alpha:.3f}, {coverage_cqr:.3f})"
-)
 plt.xlabel("x")
 plt.ylabel("y")
-plt.legend()
+plt.legend(
+    loc='upper center',
+    bbox_to_anchor=(0.5, -0.05),
+    fancybox=True,
+    shadow=True,
+    ncol=3
+)
 plt.show()
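
For readers who want the new workflow without diff markers, here is a condensed, standalone sketch of the prefit pattern this example introduces: pre-train a point model and three quantile models, hand them to MAPIE with cv="prefit", calibrate on a held-out set, and evaluate coverage. The MAPIE and scikit-learn calls mirror the added lines above; the toy data (a sine signal with Gaussian noise and a fixed seed) is an assumption made here only so the snippet runs on its own.

import numpy as np
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor

from mapie.metrics import regression_coverage_score
from mapie.quantile_regression import MapieQuantileRegressor
from mapie.regression import MapieRegressor

alpha = 0.1

# Assumed toy data: sine signal with Gaussian noise (not the example's f).
rng = np.random.default_rng(0)
X = rng.uniform(-5, 5, 1000)
y = np.sin(X) + rng.normal(0, 0.3, X.shape[0])

# Train / calibration / test split, as in the example above.
X_train_cal, X_test, y_train_cal, y_test = train_test_split(
    X, y, test_size=1 / 10
)
X_train, X_cal, y_train, y_cal = train_test_split(
    X_train_cal, y_train_cal, test_size=1 / 9
)

# Pre-trained point model, calibrated with cv="prefit".
est_mlp = MLPRegressor(activation="relu", random_state=1)
est_mlp.fit(X_train.reshape(-1, 1), y_train)
mapie = MapieRegressor(est_mlp, cv="prefit")
mapie.fit(X_cal.reshape(-1, 1), y_cal)
y_pred, y_pis = mapie.predict(X_test.reshape(-1, 1), alpha=alpha)

# Three pre-trained quantile models (alpha/2, 1 - alpha/2, 0.5) for CQR.
list_estimators_cqr = []
for alpha_ in [alpha / 2, 1 - alpha / 2, 0.5]:
    est = LGBMRegressor(objective="quantile", alpha=alpha_)
    est.fit(X_train.reshape(-1, 1), y_train)
    list_estimators_cqr.append(est)
mapie_cqr = MapieQuantileRegressor(list_estimators_cqr, cv="prefit")
mapie_cqr.fit(X_cal.reshape(-1, 1), y_cal)
y_pred_cqr, y_pis_cqr = mapie_cqr.predict(X_test.reshape(-1, 1))

print(regression_coverage_score(y_test, y_pis[:, 0, 0], y_pis[:, 1, 0]))
print(regression_coverage_score(y_test, y_pis_cqr[:, 0, 0], y_pis_cqr[:, 1, 0]))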

mapie/quantile_regression.py

Lines changed: 1 addition & 1 deletion

@@ -444,7 +444,7 @@ def _check_prefit_params(
         If the alpha is defined, warns the user that it must be set
         accordingly with the prefit estimators.
         """
-        if hasattr(estimator, '__iter__') is False:
+        if isinstance(estimator, Iterable) is False:
             raise ValueError(
                 "Estimator for prefit must be an iterable object."
             )
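
The one-line change swaps a duck-typed hasattr check for the collections.abc.Iterable ABC, which states the intent more explicitly and also accepts classes registered as Iterable. A minimal standalone illustration of the check (not MAPIE code; require_iterable is a stand-in written for this note):

from collections.abc import Iterable

# Lists or tuples of prefit estimators pass; scalars and CV splitters do not.
print(isinstance([1, 2, 3], Iterable))  # True
print(isinstance((1, 2, 3), Iterable))  # True
print(isinstance(10, Iterable))         # False


def require_iterable(estimator):
    # Same guard as _check_prefit_params uses after this commit.
    if not isinstance(estimator, Iterable):
        raise ValueError("Estimator for prefit must be an iterable object.")


require_iterable([0.1, 0.5, 0.9])  # passes silently

Note that isinstance(x, Iterable) does not detect objects that are iterable only through __getitem__, but that corner case does not apply to lists of prefit estimators.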

mapie/regression.py

Lines changed: 2 additions & 5 deletions

@@ -31,6 +31,7 @@
     check_null_weight,
     check_verbose,
     fit_estimator,
+    check_estimator_fit_predict,
 )


@@ -320,11 +321,7 @@ def _check_estimator(
         """
         if estimator is None:
             return LinearRegression()
-        if not (hasattr(estimator, "fit") and hasattr(estimator, "predict")):
-            raise ValueError(
-                "Invalid estimator."
-                "Please provide a regressor with fit and predict methods."
-            )
+        check_estimator_fit_predict(estimator)
         if self.cv == "prefit":
             if isinstance(self.estimator, Pipeline):
                 check_is_fitted(self.estimator[-1])
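
check_estimator_fit_predict is imported from the shared utilities, but its body is not part of this commit; it presumably centralizes the inline validation deleted above. The sketch below is a hypothetical reconstruction based on the removed lines, not the actual mapie.utils implementation:

from typing import Any


def check_estimator_fit_predict(estimator: Any) -> None:
    """Hypothetical sketch mirroring the inline check removed above.

    The real mapie.utils implementation is not shown in this diff.
    """
    # Reject anything that cannot be fitted and used for prediction.
    if not (hasattr(estimator, "fit") and hasattr(estimator, "predict")):
        raise ValueError(
            "Invalid estimator. "
            "Please provide a regressor with fit and predict methods."
        )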

mapie/tests/test_quantile_regression.py

Lines changed: 37 additions & 6 deletions

@@ -519,7 +519,8 @@ def test_linear_regression_results(strategy: str) -> None:
     np.testing.assert_allclose(coverage, COVERAGES[strategy], rtol=1e-2)


-def test_quantile_prefit_non_list() -> None:
+@pytest.mark.parametrize("estimator", [-1, 3, KFold(), LeaveOneOut()])
+def test_quantile_prefit_non_list(estimator: Any) -> None:
     """
     Test that there is a list of estimators provided when cv='prefit'
     is called for MapieQuantileRegressor.
@@ -528,9 +529,8 @@ def test_quantile_prefit_non_list() -> None:
         ValueError,
         match=r".*Estimator for prefit must be an iterable object.*",
     ):
-        not_an_iterable = 10
         mapie_reg = MapieQuantileRegressor(
-            estimator=not_an_iterable,
+            estimator=estimator,
             cv="prefit"
         )
         mapie_reg.fit(
@@ -541,7 +541,7 @@ def test_quantile_prefit_non_list() -> None:

 def test_quantile_prefit_three_estimators() -> None:
     """
-    Test that there is a list of estimators three estimators provided for
+    Test that there is a list with three estimators provided for
     cv="prefit".
     """
     with pytest.raises(
@@ -564,7 +564,7 @@ def test_quantile_prefit_three_estimators() -> None:

 def test_prefit_no_fit_predict() -> None:
     """
-    Check that the user is warned that the alphas need to be correctly set.
+    Check that the estimators given have fit and predict methods.
     """
     with pytest.raises(
         ValueError,
@@ -588,7 +588,7 @@ def test_prefit_no_fit_predict() -> None:

 def test_non_trained_estimator() -> None:
     """
-    Check that the user is warned that the alphas need to be correctly set.
+    Check that the estimators are all already trained when used in prefit.
     """
     with pytest.raises(
         ValueError,
@@ -633,6 +633,37 @@ def test_warning_alpha_prefit() -> None:
     )


+def test_prefit_and_non_prefit_equal() -> None:
+    """
+    Check that prefit and non-prefit calibration give identical results.
+    """
+    list_estimators = []
+    alphas_ = [0.15, 0.85, 0.5]
+    for alpha_ in alphas_:
+        est = clone(qt)
+        params = {"quantile": alpha_}
+        est.set_params(**params)
+        est.fit(X_train, y_train)
+        list_estimators.append(est)
+    mapie_reg_prefit = MapieQuantileRegressor(
+        estimator=list_estimators,
+        cv="prefit",
+        alpha=0.3
+    )
+    mapie_reg_prefit.fit(X_calib, y_calib)
+    y_pred_prefit, y_pis_prefit = mapie_reg_prefit.predict(X)
+
+    mapie_reg = MapieQuantileRegressor(
+        estimator=qt,
+        alpha=0.3
+    )
+    mapie_reg.fit(X_train, y_train, X_calib=X_calib, y_calib=y_calib)
+    y_pred, y_pis = mapie_reg.predict(X)
+
+    np.testing.assert_allclose(y_pred_prefit, y_pred)
+    np.testing.assert_allclose(y_pis_prefit, y_pis)
+
+
 @pytest.mark.parametrize("estimator", ESTIMATOR)
 def test_pipeline_compatibility(estimator: RegressorMixin) -> None:
     """Check that MAPIE works on pipeline based on pandas dataframes"""
