DOC fix xlabel in Tweedie regression on insurance claims (scikit-learn#30362)

antoinebaker · jeremiedbb · web-flow · commit d388d88b936f · 2024-11-29T15:05:19.000+01:00
Co-authored-by: Jérémie du Boisberranger &lt;jeremie@probabl.ai&gt;
diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py
@@ -613,11 +613,11 @@ def score_estimator(
 
 # %%
 #
-# Finally, we can compare the two models using a plot of cumulated claims: for
+# Finally, we can compare the two models using a plot of cumulative claims: for
 # each model, the policyholders are ranked from safest to riskiest based on the
-# model predictions and the fraction of observed total cumulated claims is
-# plotted on the y axis. This plot is often called the ordered Lorenz curve of
-# the model.
+# model predictions and the cumulative proportion of claim amounts is plotted
+# against the cumulative proportion of exposure. This plot is often called
+# the ordered Lorenz curve of the model.
 #
 # The Gini coefficient (based on the area between the curve and the diagonal)
 # can be used as a model selection metric to quantify the ability of the model
@@ -627,7 +627,7 @@ def score_estimator(
 # Gini coefficient is upper bounded by 1.0 but even an oracle model that ranks
 # the policyholders by the observed claim amounts cannot reach a score of 1.0.
 #
-# We observe that both models are able to rank policyholders by risky-ness
+# We observe that both models are able to rank policyholders by riskiness
 # significantly better than chance although they are also both far from the
 # oracle model due to the natural difficulty of the prediction problem from a
 # few features: most accidents are not predictable and can be caused by
@@ -653,11 +653,11 @@ def lorenz_curve(y_true, y_pred, exposure):
     ranking = np.argsort(y_pred)
     ranked_exposure = exposure[ranking]
     ranked_pure_premium = y_true[ranking]
-    cumulated_claim_amount = np.cumsum(ranked_pure_premium * ranked_exposure)
-    cumulated_claim_amount /= cumulated_claim_amount[-1]
-    cumulated_exposure = np.cumsum(ranked_exposure)
-    cumulated_exposure /= cumulated_exposure[-1]
-    return cumulated_exposure, cumulated_claim_amount
+    cumulative_claim_amount = np.cumsum(ranked_pure_premium * ranked_exposure)
+    cumulative_claim_amount /= cumulative_claim_amount[-1]
+    cumulative_exposure = np.cumsum(ranked_exposure)
+    cumulative_exposure /= cumulative_exposure[-1]
+    return cumulative_exposure, cumulative_claim_amount
 
 
 fig, ax = plt.subplots(figsize=(8, 8))
@@ -669,27 +669,30 @@ def lorenz_curve(y_true, y_pred, exposure):
     ("Frequency * Severity model", y_pred_product),
     ("Compound Poisson Gamma", y_pred_total),
 ]:
-    ordered_samples, cum_claims = lorenz_curve(
+    cum_exposure, cum_claims = lorenz_curve(
         df_test["PurePremium"], y_pred, df_test["Exposure"]
     )
-    gini = 1 - 2 * auc(ordered_samples, cum_claims)
+    gini = 1 - 2 * auc(cum_exposure, cum_claims)
     label += " (Gini index: {:.3f})".format(gini)
-    ax.plot(ordered_samples, cum_claims, linestyle="-", label=label)
+    ax.plot(cum_exposure, cum_claims, linestyle="-", label=label)
 
 # Oracle model: y_pred == y_test
-ordered_samples, cum_claims = lorenz_curve(
+cum_exposure, cum_claims = lorenz_curve(
     df_test["PurePremium"], df_test["PurePremium"], df_test["Exposure"]
 )
-gini = 1 - 2 * auc(ordered_samples, cum_claims)
+gini = 1 - 2 * auc(cum_exposure, cum_claims)
 label = "Oracle (Gini index: {:.3f})".format(gini)
-ax.plot(ordered_samples, cum_claims, linestyle="-.", color="gray", label=label)
+ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label=label)
 
 # Random baseline
 ax.plot([0, 1], [0, 1], linestyle="--", color="black", label="Random baseline")
 ax.set(
     title="Lorenz Curves",
-    xlabel="Fraction of policyholders\n(ordered by model from safest to riskiest)",
-    ylabel="Fraction of total claim amount",
+    xlabel=(
+        "Cumulative proportion of exposure\n"
+        "(ordered by model from safest to riskiest)"
+    ),
+    ylabel="Cumulative proportion of claim amounts",
 )
 ax.legend(loc="upper left")
 plt.plot()