Move r2 and adjr2 implementations from StatsBase (#7)

nalimilan · web-flow · commit 00ce15f034e7 · 2021-12-23T16:44:12.000+01:00
These are the only methods for `StatisticalModel` still defined in StatsBase.
Given that definitions are quite trivial and based only on functions defined
in StatsAPI, it makes more sense for them to live here too.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "StatsAPI"
 uuid = "82ae8749-77ed-4fe6-ae5f-f523153014b0"
 authors = ["Milan Bouchet-Valat <nalimilan@club.fr"]
-version = "1.1.0"
+version = "1.2.0"
 
 [compat]
 julia = "1"
diff --git a/src/statisticalmodel.jl b/src/statisticalmodel.jl
@@ -216,6 +216,49 @@ and ``TSS`` the total sum of squares.
 """
 function r2 end
 
+"""
+    r2(model::StatisticalModel, variant::Symbol)
+    r²(model::StatisticalModel, variant::Symbol)
+
+Pseudo-coefficient of determination (pseudo R-squared).
+
+For nonlinear models, one of several pseudo R² definitions must be chosen via `variant`.
+Supported variants are:
+- `:McFadden` (a.k.a. likelihood ratio index), defined as ``1 - \\log (L)/\\log (L_0)``;
+- `:CoxSnell`, defined as ``1 - (L_0/L)^{2/n}``;
+- `:Nagelkerke`, defined as ``(1 - (L_0/L)^{2/n})/(1 - L_0^{2/n})``;
+- `:devianceratio`, defined as ``1 - D/D_0``.
+
+In the above formulas, ``L`` is the likelihood of the model,
+``L_0`` is the likelihood of the null model (the model with only an intercept),
+``D`` is the deviance of the model (from the saturated model),
+``D_0`` is the deviance of the null model,
+``n`` is the number of observations (given by [`nobs`](@ref)).
+
+The Cox-Snell and the deviance ratio variants both match the classical definition of R²
+for linear models. An error is thrown if `variant` is not one of the supported variants.
+"""
+function r2(model::StatisticalModel, variant::Symbol)
+    loglikbased = (:McFadden, :CoxSnell, :Nagelkerke)
+    if variant in loglikbased
+        ll = loglikelihood(model)
+        ll0 = nullloglikelihood(model)
+        variant == :McFadden && return 1 - ll/ll0
+        # Cox-Snell on the log scale: (L_0/L)^(2/n) == exp(2 * (ll0 - ll) / n)
+        n = nobs(model)
+        coxsnell = 1 - exp(2 * (ll0 - ll) / n)
+        variant == :CoxSnell && return coxsnell
+        # Only :Nagelkerke remains: Cox-Snell divided by 1 - L_0^(2/n)
+        return coxsnell / (1 - exp(2 * ll0 / n))
+    elseif variant == :devianceratio
+        dev  = deviance(model)
+        dev0 = nulldeviance(model)
+        return 1 - dev/dev0
+    else
+        error("variant must be one of $(join(repr.(loglikbased), ", ")) or :devianceratio")
+    end
+end
+
 const r² = r2
 
 """
@@ -230,4 +273,33 @@ coefficients (including the intercept). This definition is generally known as th
 """
 function adjr2 end
 
+"""
+    adjr2(model::StatisticalModel, variant::Symbol)
+    adjr²(model::StatisticalModel, variant::Symbol)
+
+Adjusted pseudo-coefficient of determination (adjusted pseudo R-squared).
+For nonlinear models, one of the several pseudo R² definitions must be chosen via `variant`.
+The only currently supported variants are `:McFadden`, defined as ``1 - (\\log (L) - k)/\\log (L_0)`` and
+`:devianceratio`, defined as ``1 - (D/(n-k))/(D_0/(n-1))``; any other `variant` throws an error.
+In these formulas, ``L`` is the likelihood of the model, ``L_0`` that of the null model
+(the model including only the intercept), ``D`` is the deviance of the model,
+``D_0`` is the deviance of the null model, ``n`` is the number of observations (given by [`nobs`](@ref)) and
+``k`` is the number of consumed degrees of freedom of the model (as returned by [`dof`](@ref)).
+"""
+function adjr2(model::StatisticalModel, variant::Symbol)
+    k = dof(model)
+    if variant == :McFadden
+        ll = loglikelihood(model)
+        ll0 = nullloglikelihood(model)
+        return 1 - (ll - k)/ll0
+    elseif variant == :devianceratio
+        n = nobs(model)
+        dev  = deviance(model)
+        dev0 = nulldeviance(model)
+        return 1 - (dev*(n-1))/(dev0*(n-k))
+    else
+        error("variant must be one of :McFadden or :devianceratio")
+    end
+end
+
 const adjr² = adjr2