
Commit 6512312

DOC: fix display of mathematical equations in generated notebooks (#562)
1 parent 0179598 commit 6512312

11 files changed: +85 −85 lines changed

examples/classification/1-quickstart/plot_comp_methods_on_2d_dataset.py
Lines changed: 11 additions & 11 deletions

@@ -13,7 +13,7 @@
 # We will use MAPIE to estimate a prediction set of several classes such that
 # the probability that the true label of a new test point is included in the
 # prediction set is always higher than the target confidence level:
-# :math:`1 - \alpha`.
+# ``1 - α``.
 # Throughout this tutorial, we compare two conformity scores:
 # softmax score or cumulated softmax score.
 # We start by using the softmax score or cumulated score output by the base
@@ -23,18 +23,18 @@
 # * First we generate a dataset with train, calibration and test, the model
 # is fitted in the training set.
 #
-# * We set the conformal score :math:`S_i = \hat{f}(X_{i})_{y_i}`
+# * We set the conformal score ``Sᵢ = 𝑓̂(Xᵢ)ᵧᵢ``
 # from the softmax output of the true class or the cumulated score
 # (by decreasing order) for each sample in the calibration set.
 #
-# * Then we define :math:`\hat{q}` as being the
-# :math:`(n + 1) (1 - \alpha) / n`
-# previous quantile of :math:`S_{1}, ..., S_{n}` (this is essentially the
-# quantile :math:`\alpha`, but with a small sample correction).
+# * Then we define ``q̂`` as being the
+# ``(n + 1)(1 - α) / n``
+# previous quantile of ``S₁, ..., Sₙ`` (this is essentially the
+# quantile ``α``, but with a small sample correction).
 #
-# * Finally, for a new test data point (where :math:`X_{n + 1}` is known but
-# :math:`Y_{n + 1}` is not), create a prediction set
-# :math:`C(X_{n+1}) = \{y: \hat{f}(X_{n+1})_{y} > \hat{q}\}` which includes
+# * Finally, for a new test data point (where ``Xₙ₊₁`` is known but
+# ``Yₙ₊₁`` is not), create a prediction set
+# ``C(Xₙ₊₁) = {y: 𝑓̂(Xₙ₊₁)ᵧ > q̂}`` which includes
 # all the classes with a sufficiently high conformity score.
 #
 # We use a two-dimensional dataset with three labels.
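Read together, the three bullets are the standard split-conformal recipe. A minimal NumPy sketch of it, written in the equivalent nonconformity-score form on synthetic softmax outputs (every name below is illustrative, none comes from the example file):

import numpy as np

rng = np.random.default_rng(0)
n, n_classes, alpha = 1000, 3, 0.1

# Step 1: softmax output of the true class for each calibration sample,
# turned into a nonconformity score (low softmax = high nonconformity).
cal_probas = rng.dirichlet(np.ones(n_classes), size=n)
y_cal = rng.integers(n_classes, size=n)
cal_scores = 1.0 - cal_probas[np.arange(n), y_cal]

# Step 2: quantile of the calibration scores with the (n + 1)/n
# small-sample correction.
level = min(np.ceil((n + 1) * (1 - alpha)) / n, 1.0)
q_hat = np.quantile(cal_scores, level, method="higher")

# Step 3: keep every class whose softmax output is high enough, i.e.
# whose nonconformity score does not exceed q_hat.
test_probas = rng.dirichlet(np.ones(n_classes), size=5)
prediction_sets = (1.0 - test_probas) <= q_hat
print(prediction_sets)

With n = 1000 and α = 0.1, the corrected level is 901/1000, i.e. the 901st smallest calibration score, which is exactly the small-sample correction the bullet describes.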
@@ -241,7 +241,7 @@ def plot_results(
 # in ambiguous regions.
 #
 # Let's now compare the effective coverage and the average of prediction set
-# widths as function of the :math:`1-\alpha` target coverage.
+# widths as a function of the ``1 - α`` target coverage.

 alpha_ = np.arange(0.02, 0.98, 0.02)
 coverage, mean_width = {}, {}
@@ -288,6 +288,6 @@ def plot_results(

 ##############################################################################
 # It is seen that both methods give coverages close to the target coverages,
-# regardless of the :math:`\alpha` value. However, the "aps"
+# regardless of the ``α`` value. However, the "aps" method
 # produces slightly bigger prediction sets, but without empty regions
 # (if the selection of the last label is not randomized).

examples/classification/4-tutorials/plot_crossconformal.py
Lines changed: 2 additions & 2 deletions

@@ -18,8 +18,8 @@
 of this documentation.

 We start the tutorial by splitting our training dataset
-in :math:`K` folds and sequentially use each fold as a
-calibration set, the :math:`K-1` folds remaining folds are
+in ``K`` folds and sequentially use each fold as a
+calibration set; the remaining ``K-1`` folds are
 used for training the base model using
 the ``cv="prefit"`` option of
 :class:`~mapie.classification.MapieClassifier`.
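That splitting loop is short to write out. A hedged sketch of the cross-conformal pattern with ``cv="prefit"`` (dataset, model, and the "lac" method name are illustrative, following the MAPIE API at the time of this commit):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from mapie.classification import MapieClassifier

X, y = make_classification(
    n_samples=500, n_informative=4, n_classes=3, random_state=0
)

mapies = []
# Each fold serves once as the calibration set; the K-1 remaining folds
# train the base model, which is then handed to MAPIE already fitted.
for train_idx, calib_idx in KFold(n_splits=5, shuffle=True, random_state=0).split(X):
    clf = LogisticRegression().fit(X[train_idx], y[train_idx])
    mapie = MapieClassifier(estimator=clf, cv="prefit", method="lac")
    mapie.fit(X[calib_idx], y[calib_idx])
    mapies.append(mapie)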

examples/classification/4-tutorials/plot_main-tutorial-binary-classification.py
Lines changed: 11 additions & 11 deletions

@@ -45,26 +45,26 @@
 # We will use MAPIE to estimate a prediction set such that
 # the probability that the true label of a new test point is included in the
 # prediction set is always higher than the target confidence level:
-# :math:`1 - \alpha`.
+# ``1 - α``.
 # We start by using the softmax score output by the base
 # classifier as the conformity score on a toy two-dimensional dataset.
 # We estimate the prediction sets as follows:
 #
 # * First we generate a dataset with train, calibration and test, the model
 # is fitted in the training set.
 #
-# * We set the conformal score :math:`S_i = \hat{f}(X_{i})_{y_i}`
+# * We set the conformal score ``Sᵢ = 𝑓̂(Xᵢ)ᵧᵢ``
 # from the softmax output of the true class for each sample
 # in the calibration set.
 #
-# * Then we define :math:`\hat{q}` as being the
-# :math:`(n + 1) (1 - \alpha) / n`
-# previous quantile of :math:`S_{1}, ..., S_{n}` (this is essentially the
-# quantile :math:`\alpha`, but with a small sample correction).
+# * Then we define ``q̂`` as being the
+# ``(n + 1) (1 - α) / n``
+# previous quantile of ``S₁, ..., Sₙ`` (this is essentially the
+# quantile ``α``, but with a small sample correction).
 #
-# * Finally, for a new test data point (where :math:`X_{n + 1}` is known but
-# :math:`Y_{n + 1}` is not), create a prediction set
-# :math:`C(X_{n+1}) = \{y: \hat{f}(X_{n+1})_{y} > \hat{q}\}` which includes
+# * Finally, for a new test data point (where ``Xₙ₊₁`` is known but
+# ``Yₙ₊₁`` is not), create a prediction set
+# ``C(Xₙ₊₁) = {y: 𝑓̂(Xₙ₊₁)ᵧ > q̂}`` which includes
 # all the classes with a sufficiently high conformity score.
 #
 # We use a two-dimensional dataset with two classes (i.e. YES or NO).
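In MAPIE code, the whole recipe collapses to a fit/predict pair. A sketch with a prefit classifier on an illustrative two-class dataset (all names are hypothetical, not the tutorial's):

from sklearn.datasets import make_moons
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from mapie.classification import MapieClassifier

X, y = make_moons(n_samples=600, noise=0.3, random_state=0)
X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=0.5, random_state=0)
X_calib, X_test, y_calib, y_test = train_test_split(X_rest, y_rest, test_size=0.5, random_state=0)

# Fit the base model on the training split only.
clf = LogisticRegression().fit(X_train, y_train)

# Calibrate on held-out data, then request sets at confidence 1 - α.
mapie = MapieClassifier(estimator=clf, cv="prefit", method="lac")
mapie.fit(X_calib, y_calib)
y_pred, y_ps = mapie.predict(X_test, alpha=0.1)  # y_ps: (n_test, n_classes, n_alpha)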
@@ -281,7 +281,7 @@ def plot_results(

 ##############################################################################
 # Let's now compare the effective coverage and the average of prediction set
-# widths as function of the :math:`1-\alpha` target coverage.
+# widths as a function of the ``1 - α`` target coverage.

 alpha_ = np.arange(0.02, 0.98, 0.02)

@@ -332,7 +332,7 @@ def plot_coverages_widths(alpha, coverage, width, method):

 ##############################################################################
 # It is seen that the method gives coverages close to the target coverages,
-# regardless of the :math:`\alpha` value.
+# regardless of the ``α`` value.

 alpha_ = np.arange(0.02, 0.16, 0.01)
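MAPIE ships metrics for exactly this check. A self-contained sketch on hypothetical prediction sets (the two function names come from mapie.metrics; the data is made up):

import numpy as np
from mapie.metrics import classification_coverage_score, classification_mean_width_score

# Hypothetical prediction sets for four test points over three classes.
y_test = np.array([0, 1, 2, 1])
y_ps = np.array([
    [True, False, False],
    [True, True, False],
    [False, False, True],
    [False, True, True],
])

coverage = classification_coverage_score(y_test, y_ps)  # share of y_test inside its set
mean_width = classification_mean_width_score(y_ps)      # average set size
print(coverage, mean_width)  # 1.0 1.5 on this toy data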

examples/classification/4-tutorials/plot_main-tutorial-classification.py
Lines changed: 11 additions & 11 deletions

@@ -33,7 +33,7 @@
 # We will use MAPIE to estimate a prediction set of several classes such
 # that the probability that the true label of a new test point is included
 # in the prediction set is always higher than the target confidence level:
-# :math:`P(Y_{n+1} \in \hat{C}_{n, \alpha}(X_{n+1}) \geq 1 - \alpha`.
+# ``P(Yₙ₊₁ ∈ Ĉₙ,α(Xₙ₊₁)) ≥ 1 - α``.
 # We start by using the softmax score output by the base classifier as the
 # conformity score on a toy two-dimensional dataset.
 #
@@ -42,17 +42,17 @@
 # * Generate a dataset with train, calibration and test, the model is
 # fitted on the training set.
 #
-# * Set the conformal score :math:`S_i = \hat{f}(X_{i})_{y_i}` the softmax
+# * Set the conformal score ``Sᵢ = 𝑓̂(Xᵢ)ᵧᵢ``, the softmax
 # output of the true class for each sample in the calibration set.
 #
-# * Define :math:`\hat{q}` as being the :math:`(n + 1) (\alpha) / n`
-# previous quantile of :math:`S_{1}, ..., S_{n}`
-# (this is essentially the quantile :math:`\alpha`, but with a small sample
+# * Define ``q̂`` as being the ``(n + 1) (α) / n``
+# previous quantile of ``S₁, ..., Sₙ``
+# (this is essentially the quantile ``α``, but with a small sample
 # correction).
 #
-# * Finally, for a new test data point (where :math:`X_{n + 1}` is known but
-# :math:`Y_{n + 1}` is not), create a prediction set
-# :math:`C(X_{n+1}) = \{y: \hat{f}(X_{n+1})_{y} > \hat{q}\}` which includes
+# * Finally, for a new test data point (where ``Xₙ₊₁`` is known but
+# ``Yₙ₊₁`` is not), create a prediction set
+# ``C(Xₙ₊₁) = {y: 𝑓̂(Xₙ₊₁)ᵧ > q̂}`` which includes
 # all the classes with a sufficiently high softmax output.

 # We use a two-dimensional toy dataset with three labels. The distribution of
@@ -205,9 +205,9 @@ def plot_results(alphas, X, y_pred, y_ps):
 # classifier.
 #
 # Let’s now study the effective coverage and the mean prediction set widths
-# as function of the :math:`1-\alpha` target coverage. To this aim, we use once
+# as a function of the ``1 - α`` target coverage. To this aim, we use once
 # again the ``predict`` method of MAPIE to estimate prediction sets on a
-# large number of :math:`\alpha` values.
+# large number of ``α`` values.

 alpha2 = np.arange(0.02, 0.98, 0.02)
 _, y_ps_score2 = mapie_score.predict(X_test, alpha=alpha2)
@@ -243,7 +243,7 @@ def plot_coverages_widths(alpha, coverage, width, method):
 #
 # We saw in the previous section that the "lac" method is well calibrated by
 # providing accurate coverage levels. However, it tends to give null
-# prediction sets for uncertain regions, especially when the :math:`\alpha`
+# prediction sets for uncertain regions, especially when the ``α``
 # value is high.
 # MAPIE includes another method, called Adaptive Prediction Set (APS),
 # whose conformity score is the cumulated score of the softmax output until
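Switching between the two conformity scores is a one-argument change. A hedged sketch comparing set sizes, with illustrative data and "lac"/"aps" as the method names used by the library at the time of this commit:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from mapie.classification import MapieClassifier

X, y = make_classification(n_samples=900, n_informative=4, n_classes=3, random_state=0)
X_fit, X_calib, y_fit, y_calib = train_test_split(X, y, test_size=0.3, random_state=0)
clf = GaussianNB().fit(X_fit, y_fit)

for method in ("lac", "aps"):
    mapie = MapieClassifier(estimator=clf, cv="prefit", method=method)
    mapie.fit(X_calib, y_calib)
    _, y_ps = mapie.predict(X_calib[:5], alpha=0.2)
    # "aps" trades slightly larger sets for the absence of empty ones.
    print(method, y_ps[:, :, 0].sum(axis=1))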

examples/multilabel_classification/1-quickstart/plot_tutorial_multilabel_classification.py
Lines changed: 23 additions & 23 deletions

@@ -102,16 +102,16 @@
 # Bernstein and Waudby-Smith–Ramdas).
 # The two methods give two different guarantees on the risk:
 #
-# * RCPS: :math:`P(R(\mathcal{T}_{\hat{\lambda}})\leq\alpha)\geq 1-\delta`
-# where :math:`R(\mathcal{T}_{\hat{\lambda}})`
-# is the risk we want to control and :math:`\alpha` is the desired risk
+# * RCPS: ``𝒫(R(𝒯λ̂) ≤ α) ≥ 1 − δ``
+# where ``R(𝒯λ̂)``
+# is the risk we want to control and ``α`` is the desired risk
 #
-# * CRC: :math:`\mathbb{E}\left[L_{n+1}(\hat{\lambda})\right] \leq \alpha`
-# where :math:`L_{n+1}(\hat{\lambda})` is the risk of a new observation and
-# :math:`\alpha` is the desired risk
+# * CRC: ``𝐸[Lₙ₊₁(λ̂)] ≤ α``
+# where ``Lₙ₊₁(λ̂)`` is the risk of a new observation and
+# ``α`` is the desired risk
 #
 # In both cases, the objective of the method is to find the optimal value of
-# :math:`\lambda` (threshold above which we consider a label as being present)
+# ``λ`` (threshold above which we consider a label as being present)
 # such that the recall on the test points is at least equal to the required
 # recall.
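A rough pure-NumPy picture of that λ̂ search for recall control, in the CRC spirit (synthetic scores and a simplified finite-sample correction; this is an illustration of the idea, not the library's estimator):

import numpy as np

rng = np.random.default_rng(0)
n, n_labels, alpha = 500, 5, 0.1

# Synthetic multilabel ground truth and predicted probabilities.
y_true = rng.integers(0, 2, size=(n, n_labels))
y_prob = np.clip(y_true * 0.6 + 0.5 * rng.uniform(size=(n, n_labels)), 0, 1)

lambdas = np.linspace(0, 1, 101)
risks = []
for lam in lambdas:
    y_pred = y_prob >= lam
    # Per-sample recall loss: fraction of true labels missed at threshold lam.
    missed = ((~y_pred) & (y_true == 1)).sum(axis=1)
    n_pos = np.maximum(y_true.sum(axis=1), 1)
    risks.append(np.mean(missed / n_pos))
risks = np.array(risks)

# CRC-style choice: the largest threshold whose corrected empirical risk
# stays below alpha (the recall loss is bounded by 1, hence the 1/(n+1) term).
ok = (n / (n + 1)) * risks + 1 / (n + 1) <= alpha
lambda_hat = lambdas[ok].max() if ok.any() else 0.0
print(lambda_hat)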

@@ -156,7 +156,7 @@
 # * The actual recall (which should always be near to the required one):
 # we can see that they are close to each other.
 # * The value of the threshold: we see that the threshold is decreasing as
-# :math:`1 - \alpha` increases, which is what is expected because a
+# ``1 - α`` increases, which is what is expected because a
 # smaller threshold will give larger prediction sets, hence a larger
 # recall.
 #
@@ -179,11 +179,11 @@
 ##############################################################################
 # 2 - Plots where we choose a specific risk value (0.1 in our case) and look at
 # the average risk, the UCB of the risk (for RCPS methods) and the choice of
-# the threshold :math:`\lambda`
+# the threshold ``λ``.
 # * We can see that among the RCPS methods, the Bernstein method
-# gives the best results as for a given value of :math:`\alpha`
+# gives the best results: for a given value of ``α``
 # we are above the required recall but with a larger value of
-# :math:`\lambda` than the two others bounds.
+# ``λ`` than the two other bounds.
 # * The CRC method gives the best results since it guarantees the coverage
 # with a larger threshold.

@@ -223,20 +223,20 @@
 # In this part, we will use LTT to control precision.
 # Unlike the two previous methods, LTT can handle a non-monotonous loss.
 # The procedure consists of multiple hypothesis testing. This is why the output
-# of this procedure isn't reduce to one value of :math:`\lambda`.
+# of this procedure isn't reduced to one value of ``λ``.
 #
-# More precisely, we look after all the :math:`\lambda` that sastisfy the
+# More precisely, we look for all the ``λ`` that satisfy the
 # following:
-# :math:`\mathbb{P}(R(\mathcal{T}_{\lambda}) \leq \alpha ) \geq 1 - \delta`,
-# where :math:`R(\mathcal{T}_{\lambda})` is the risk we want to control and
-# each :math:`\lambda` should satisfy FWER control.
-# :math:`\alpha` is the desired risk.
+# ``𝒫(R(𝒯λ) ≤ α) ≥ 1 − δ``,
+# where ``R(𝒯λ)`` is the risk we want to control and
+# each ``λ`` should satisfy FWER control.
+# ``α`` is the desired risk.
 #
-# Notice that the procedure will diligently examine each :math:`\lambda`
-# such that the risk remains below level :math:`\alpha`, meaning not
-# every :math:`\lambda` will be considered.
-# This means that a for a :math:`\lambda` such that risk is below
-# :math:`\alpha`
+# Notice that the procedure will diligently examine each ``λ``
+# such that the risk remains below level ``α``, meaning not
+# every ``λ`` will be considered.
+# This means that a ``λ`` for which the risk is below
+# ``α``
 # doesn't necessarily pass the FWER control! This is what we are going to
 # explore.
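A toy sketch of that testing step, pairing a Hoeffding-style p-value per candidate λ with a Bonferroni correction for FWER (an illustration of the idea only, not MAPIE's exact LTT implementation):

import numpy as np

rng = np.random.default_rng(1)
n, alpha, delta = 800, 0.2, 0.1
lambdas = np.linspace(0, 1, 51)

# Synthetic per-sample binary losses for each lambda; in the real procedure
# these come from the calibration set and need not be monotone in lambda.
losses = rng.uniform(size=(n, lambdas.size)) < np.linspace(0.05, 0.5, lambdas.size)
r_hat = losses.mean(axis=0)  # empirical risk at each lambda

# Hoeffding p-value for H0: R(lambda) > alpha; Bonferroni keeps FWER <= delta.
p_values = np.where(r_hat < alpha, np.exp(-2 * n * (alpha - r_hat) ** 2), 1.0)
valid_lambdas = lambdas[p_values <= delta / lambdas.size]
print(valid_lambdas)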

@@ -267,7 +267,7 @@
 ##############################################################################
 # 3.2 Valid parameters for precision control
 # ----------------------------------------------------------------------------
-# We can see that not all :math:`\lambda` such that risk is below the orange
+# We can see that not all ``λ`` whose risk is below the orange
 # line are chosen by the procedure. However, all the lambdas that are
 # in the red rectangle verify family-wise error rate control and allow us to
 # control precision at the desired level with a high probability.

examples/regression/1-quickstart/plot_cqr_symmetry_difference.py
Lines changed: 1 addition & 1 deletion

@@ -111,4 +111,4 @@
 # each bound, allowing for more flexible and accurate intervals that reflect
 # the heteroscedastic nature of the data. The resulting effective coverages
 # demonstrate the theoretical guarantee of the target coverage level
-# :math:`1 - \alpha`.
+# ``1 - α``.
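The asymmetric behaviour described here is driven by a single flag. A hedged sketch, assuming MapieQuantileRegressor's predict accepts the symmetry argument as in the MAPIE version this commit targets (data and estimator are illustrative):

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from mapie.quantile_regression import MapieQuantileRegressor

rng = np.random.default_rng(0)
X = rng.uniform(0, 5, size=(600, 1))
y = X.ravel() * np.sin(X.ravel()) + rng.normal(scale=0.3 + 0.3 * X.ravel())

mapie = MapieQuantileRegressor(
    estimator=GradientBoostingRegressor(loss="quantile"), alpha=0.2
)
mapie.fit(X, y)  # default behaviour splits train/calibration internally

# symmetry=False conformalizes the two bounds separately, letting the
# interval widen only on the side where the residuals demand it.
_, y_pis_sym = mapie.predict(X, symmetry=True)
_, y_pis_asym = mapie.predict(X, symmetry=False)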

examples/regression/1-quickstart/plot_prefit.py
Lines changed: 1 addition & 1 deletion

@@ -74,7 +74,7 @@ def f(x: NDArray) -> NDArray:
 # quantile regression using
 # :class:`~mapie.quantile_regression.MapieQuantileRegressor`. Note that the
 # three estimators need to be trained at quantile values of
-# :math:`(\alpha/2, 1-(\alpha/2), 0.5)`.
+# ``(α/2, 1-(α/2), 0.5)``.


 # Train an MLPRegressor for MapieRegressor
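A sketch of that prefit setup, training three gradient-boosting quantile models at the values just listed and handing them to MAPIE (data and variable names are illustrative):

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from mapie.quantile_regression import MapieQuantileRegressor

rng = np.random.default_rng(0)
X = rng.uniform(0, 5, size=(600, 1))
y = X.ravel() * np.sin(X.ravel()) + rng.normal(scale=0.3, size=600)
X_train, X_calib, y_train, y_calib = X[:400], X[400:], y[:400], y[400:]

alpha = 0.2
# One estimator per quantile: lower, upper, median.
estimators = [
    GradientBoostingRegressor(loss="quantile", alpha=q).fit(X_train, y_train)
    for q in (alpha / 2, 1 - alpha / 2, 0.5)
]

mapie = MapieQuantileRegressor(estimators, cv="prefit", alpha=alpha)
mapie.fit(X_calib, y_calib)  # calibration only; the models stay untouched
y_pred, y_pis = mapie.predict(X_calib)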

examples/regression/2-advanced-analysis/plot-coverage-width-based-criterion.py
Lines changed: 3 additions & 3 deletions

@@ -33,7 +33,7 @@
 # Estimating the aleatoric uncertainty of heteroscedastic noisy data
 # ---------------------------------------------------------------------
 #
-# Let's define again the :math:`x \times \sin(x)` function and another simple
+# Let's define again the ``x * sin(x)`` function and another simple
 # function that generates one-dimensional data with normal noise uniformly
 # in a given interval.
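A sketch of those two helpers (hypothetical names mirroring the example's intent; the noise standard deviation grows linearly with x):

import numpy as np

def x_sinx(x):
    """The x * sin(x) target function."""
    return x * np.sin(x)

def get_1d_data_with_heteroscedastic_noise(funct, min_x, max_x, n_samples, noise):
    """Sample X uniformly on [min_x, max_x]; noise scales linearly with x."""
    rng = np.random.default_rng(59)
    X = rng.uniform(min_x, max_x, size=n_samples)
    y = funct(X) + rng.normal(0, noise, n_samples) * X
    return X.reshape(-1, 1), y

X, y = get_1d_data_with_heteroscedastic_noise(x_sinx, 0, 5, 300, 0.5)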

@@ -70,7 +70,7 @@ def get_1d_data_with_heteroscedastic_noise(
 ##############################################################################
 # We first generate noisy one-dimensional data uniformly on an interval.
 # Here, the noise is considered as *heteroscedastic*, since it will increase
-# linearly with :math:`x`.
+# linearly with ``x``.

 min_x, max_x, n_samples, noise = 0, 5, 300, 0.5
 (

@@ -92,7 +92,7 @@
 ##############################################################################
 # As mentioned previously, we fit our training data with a simple
 # polynomial function. Here, we choose a degree equal to 10 so the function
-# is able to perfectly fit :math:`x \times \sin(x)`.
+# is able to perfectly fit ``x * sin(x)``.

 degree_polyn = 10
 polyn_model = Pipeline(

examples/regression/2-advanced-analysis/plot_nested-cv.py
Lines changed: 2 additions & 2 deletions

@@ -22,9 +22,9 @@
 cross-validation occurs on the training fold, optimizing hyperparameters.
 This ensures that residuals seen by MAPIE are never seen by the algorithm
 beforehand. However, this method is much heavier computationally since
-it results in :math:`N * P` calculations, where *N* is the number of
+it results in ``N * P`` calculations, where *N* is the number of
 *out-of-fold* models and *P* the number of parameter search cross-validations,
-versus :math:`N + P` for the non-nested approach.
+versus ``N + P`` for the non-nested approach.

 Here, we compare the two strategies on a toy dataset. We use the Random
 Forest Regressor as a base regressor for the CV+ strategy. For the sake of
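The N + P versus N * P difference comes out directly in how the search is wired. A hedged sketch of both wirings, with an illustrative grid and CV+ via method="plus" (sizes and parameters are made up):

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from mapie.regression import MapieRegressor

X, y = make_regression(n_samples=500, n_features=10, noise=10.0, random_state=0)
param_grid = {"max_depth": [3, 5, 10]}

# Non-nested (N + P fits): tune once, then hand the best model to MAPIE.
search = GridSearchCV(RandomForestRegressor(random_state=0), param_grid, cv=5)
search.fit(X, y)
mapie_non_nested = MapieRegressor(search.best_estimator_, method="plus", cv=5)
mapie_non_nested.fit(X, y)

# Nested (N * P fits): the whole search is refit inside each MAPIE fold.
mapie_nested = MapieRegressor(
    GridSearchCV(RandomForestRegressor(random_state=0), param_grid, cv=5),
    method="plus",
    cv=5,
)
mapie_nested.fit(X, y)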

examples/regression/4-tutorials/plot_cqr_tutorial.py
Lines changed: 3 additions & 3 deletions

@@ -230,7 +230,7 @@ def plot_prediction_intervals(

 ##############################################################################
 # We proceed to using MAPIE to return the predictions and prediction intervals.
-# We will use an :math:`\alpha=0.2`, this means a target coverage of 0.8
+# We will use ``α=0.2``, which means a target coverage of 0.8
 # (recall that this parameter needs to be initialized directly when setting
 # :class:`~mapie.quantile_regression.MapieQuantileRegressor` and when using
 # :class:`~mapie.regression.MapieRegressor`, it needs to be set in the
@@ -241,7 +241,7 @@ def plot_prediction_intervals(
 # model on a training set and then calibrates on the calibration set.
 # * ``cv="prefit"`` meaning that you can train your models with the correct
 # quantile values (must be given in the following order:
-# :math:`(\alpha, 1-(\alpha/2), 0.5)` and given to MAPIE as an iterable
+# ``(α/2, 1-(α/2), 0.5)``) and given to MAPIE as an iterable
 # object. (Check the examples for how to use prefit in MAPIE)
 #
 # Additionally, note that there is a list of accepted models by
@@ -413,7 +413,7 @@ def get_coverages_widths_by_bins(

 ##############################################################################
 # What we observe from these results is that none of the methods seems to
-# have conditional coverage at the target :math:`1 - \alpha`. However, we can
+# have conditional coverage at the target ``1 - α``. However, we can
 # clearly notice that the CQR seems to better adapt to large prices. Its
 # conditional coverage is closer to the target coverage not only for higher
 # prices, but also for lower prices where the other methods have a higher
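Effective coverage, whether global or per price bin, is one call with MAPIE's metrics. A self-contained sketch on hypothetical intervals (the function comes from mapie.metrics; the numbers are made up):

import numpy as np
from mapie.metrics import regression_coverage_score

# Hypothetical intervals for five test points: [lower, upper] bounds.
y_test = np.array([2.1, 0.4, 3.3, 1.8, 2.7])
y_pis = np.array([[1.5, 2.6], [0.0, 1.0], [2.0, 3.0], [1.0, 2.5], [2.5, 3.5]])

# Share of y_test falling inside its interval, to compare with 1 - α.
coverage = regression_coverage_score(y_test, y_pis[:, 0], y_pis[:, 1])
print(coverage)  # 0.8 here: the third point falls outside its interval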
