closes #104 (error with hessian of logistic)

tlienart · tlienart · commit 47c0415db9dd · 2021-08-10T13:10:03.000+02:00
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "MLJLinearModels"
 uuid = "6ee0df7b-362f-4a72-a706-9e79364fb692"
 authors = ["Thibaut Lienart <tlienart@me.com>"]
-version = "0.5.5"
+version = "0.5.6"
 
 [deps]
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
@@ -24,6 +24,7 @@ julia = "^1"
 [extras]
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
 PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
 RCall = "6f49c342-dc21-5d91-9882-a32aef131414"
@@ -33,4 +34,4 @@ StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["DelimitedFiles", "PyCall", "Test", "Random", "RDatasets", "RCall", "MLJBase", "StableRNGs", "DataFrames"]
+test = ["DelimitedFiles", "PyCall", "ForwardDiff", "Test", "Random", "RDatasets", "RCall", "MLJBase", "StableRNGs", "DataFrames"]
diff --git a/src/glr/constructors.jl b/src/glr/constructors.jl
@@ -32,7 +32,7 @@ end
 
 const GLR = GeneralizedLinearRegression
 
-getc(g::GLR) = getc(g.loss)
+getc(g::GLR)    = getc(g.loss)
 getc(g::GLR, y) = getc(g.loss, y)
 
 ## Specific constructors
diff --git a/src/glr/d_l2loss.jl b/src/glr/d_l2loss.jl
@@ -5,7 +5,7 @@
 # ----------------------- #
 #  -- Ridge Regression -- #
 # ----------------------- #
-# ->  f(θ)  = |Xθ - y|₂²/2 + λ|θ|₂²
+# ->  f(θ)  = |Xθ - y|₂²/2 + λ|θ|₂²/2
 # -> ∇f(θ)  = X'(Xθ - y) + λθ
 # -> ∇²f(θ) = X'X + λI
 # NOTE:
diff --git a/src/glr/d_logistic.jl b/src/glr/d_logistic.jl
@@ -1,12 +1,14 @@
 # ------------------------------- #
 #  -- Logistic Regression (L2) -- #
 # ------------------------------- #
-# ->  f(θ)  = -∑logσ(yXθ) + λ|θ|₂²
-# -> ∇f(θ)  = -X'(yσ(-yXθ)) + λθ
-# -> ∇²f(θ) = X'(σ(yXθ))X + λI
+# ->  f(θ)  = -∑logσ(yXθ) + λ|θ|₂²/2
+# -> ∇f(θ)  =  X'(y(w-1)) + λθ
+# -> ∇²f(θ) =  X' Diag(w(1-w)) X + λI
 # NOTE:
+# * w = σ(yXθ)
 # * yᵢ ∈ {±1} so that y² = 1
-# * -σ(-x) ==(σ(x)-1)
+# * -σ(-x) == (σ(x)-1)
+# NOTE: Hessian fixed in https://github.com/JuliaAI/MLJLinearModels.jl/issues/104
 # ---------------------------------------------------------
 
 function fgh!(glr::GLR{LogisticLoss,<:L2R}, X, y, scratch)
@@ -17,12 +19,12 @@ function fgh!(glr::GLR{LogisticLoss,<:L2R}, X, y, scratch)
         (f, g, H, θ) -> begin
             Xθ = scratch.n
             apply_X!(Xθ, X, θ)                       # -- Xθ = apply_X(X, θ)
-            # precompute σ(yXθ) use -σ(-x) = (σ(x)-1)
-            w  = scratch.n2
-            w .= σ.(Xθ .* y)                         # -- w  = σ.(Xθ .* y)
+            # precompute σ(yXθ)
+            w    = scratch.n2
+            w   .= σ.(Xθ .* y)                       # -- w  = σ.(Xθ .* y)
             g === nothing || begin
                 t  = scratch.n3
-                t .= y .* (w .- 1.0)                 # -- t = y .* (w .- 1.0)
+                t .= y .* w .- y                     # -- t = y .* (w .- 1.0)
                 apply_Xt!(g, X, t)                   # -- g = X't
                 g .+= λ .* θ
                 glr.penalize_intercept || (g[end] -= λ * θ[end])
@@ -31,19 +33,21 @@ function fgh!(glr::GLR{LogisticLoss,<:L2R}, X, y, scratch)
                 # NOTE: we could try to be clever to reduce the allocations for
                 # ΛX but computing the full hessian allocates a lot anyway so
                 # probably not really worth it
-                a  = 1:p
-                ΛX = w .* X                           # !! big allocs
-                mul!(view(H, a, a), X', ΛX)       # -- H[1:p,1:p] = X'ΛX
+                t    = scratch.n3
+                t   .= w .- w.^2                      # σ(yXθ)(1-σ(yXθ))
+                a    = 1:p
+                ΛX   = t .* X                         # !! big allocs
+                mul!(view(H, a, a), X', ΛX)           # -- H[1:p,1:p] = X'ΛX
                 ΛXt1   = view(scratch.p, a)
                 ΛXt1 .*= 0
                 @inbounds for i in a, j in 1:n
-                    ΛXt1[i] += ΛX[j, i]             # -- (ΛX)'1
+                    ΛXt1[i] += ΛX[j, i]               # -- (ΛX)'1
                 end
                 @inbounds for i in a
                     H[i, end] = H[end, i] = ΛXt1[i]   # -- H[:,p+1] = (ΛX)'1
                 end
-                H[end, end] = sum(w)                  # -- 1'Λ1'
-                add_λI!(H, λ, glr.penalize_intercept) # -- H = X'ΛX + λI
+                H[end, end] = sum(t)                  # -- 1'Λ1
+                add_λI!(H, λ, glr.penalize_intercept) # -- H = X'ΛX + λI
             end
             f === nothing || return J(y, Xθ, view_θ(glr, θ))
         end
@@ -53,16 +57,18 @@ function fgh!(glr::GLR{LogisticLoss,<:L2R}, X, y, scratch)
         (f, g, H, θ) -> begin
             Xθ = scratch.n
             apply_X!(Xθ, X, θ)
-            w  = scratch.n2
-            w .= σ.(y .* Xθ)
+            w    = scratch.n2
+            w   .= σ.(y .* Xθ)
             g === nothing || begin
                 t  = scratch.n3
-                t .= y .* (w .- 1.0)
+                t .= y .* w .- y
                 apply_Xt!(g, X, t)
                 g .+= λ .* θ
             end
             H === nothing || begin
-                mul!(H, X', w .* X)
+                t  = scratch.n3
+                t .= w .- w.^2
+                mul!(H, X', t .* X)
                 add_λI!(H, λ)
             end
             f === nothing || return J(y, Xθ, θ)
@@ -80,8 +86,9 @@ function Hv!(glr::GLR{LogisticLoss,<:L2R}, X, y, scratch)
         (Hv, θ, v) -> begin
             Xθ = scratch.n
             apply_X!(Xθ, X, θ)                       # -- Xθ = apply_X(X, θ)
-            w  = scratch.n2
-            w .= σ.(Xθ .* y)                         # -- w  = σ.(Xθ .* y)
+            w   = scratch.n2
+            w  .= σ.(Xθ .* y)                        # -- w  = σ.(Xθ .* y)
+            w .-= w.^2                               # -- w  = w(1-w)
             # view on the first p rows
             a    = 1:p
             Hvₐ  = view(Hv, a)
@@ -103,9 +110,10 @@ function Hv!(glr::GLR{LogisticLoss,<:L2R}, X, y, scratch)
         (Hv, θ, v) -> begin
             Xθ = scratch.n
             apply_X!(Xθ, X, θ)
-            w  = scratch.n2
-            w .= σ.(Xθ .* y)                # -- σ(yXθ)
-            Xv = scratch.n
+            w   = scratch.n2
+            w  .= σ.(Xθ .* y)                # -- σ(yXθ)
+            w .-= w.^2
+            Xv  = scratch.n
             mul!(Xv, X, v)
             Xv .*= scratch.n2               # -- ΛXv
             mul!(Hv, X', Xv)                # -- X'ΛXv
diff --git a/test/fit/quantile.jl b/test/fit/quantile.jl
@@ -91,7 +91,7 @@ y1a  = outlify(y1, 0.1)
         θ_ista     = fit(rr, X, y1a, solver=ISTA())
         θ_qr_lasso = rcopy(QUANTREG.rq_fit_lasso(X1, y1a))[:coefficients]
         @test isapprox(J(θ_ls),       888.3748, rtol=1e-5)
-        @test isapprox(J(θ_qr_lasso), 425.5977, rtol=1e-5)
+        @test isapprox(J(θ_qr_lasso), 425.5264, rtol=1e-5)
         # Our algorithms are close enough
         @test isapprox(J(θ_fista),    425.0526, rtol=1e-5)
         @test isapprox(J(θ_ista),     425.4113, rtol=1e-5)
diff --git a/test/glr/grad-hess-prox.jl b/test/glr/grad-hess-prox.jl
@@ -72,6 +72,10 @@ end
     @test g ≈               X1' * (X1*θ1 .- y1) .+ λ .* θ1 .* vcat(ones(p), 0)
 end
 
+n, p = 50, 5
+((X, y, θ), (X1, y1, θ1)) = generate_binary(n, p; seed=1212224)
+maskint = vcat(ones(p), 0.0)
+
 @testset "GH> LogitL2" begin
     rng = StableRNG(551551)
     # fgh! without fit_intercept
@@ -85,17 +89,29 @@ end
     g = similar(θ)
     H = zeros(p, p)
     f = fgh!(f, g, H, θ)
+
+    wminus = R.σ.(-y .* (X * θ))
+    wplus  = R.σ.(y .* (X * θ))
+
     @test f == J(θ)
-    @test g ≈               -X' * (y .* R.σ.(-y .* (X * θ))) .+ λ .* θ
-    @test H ≈                X' * (Diagonal(R.σ.(y .* (X * θ))) * X) + λ * I
+    @test g ≈ -X' * (y .* wminus) .+ λ .* θ
+    @test H ≈  X' * (Diagonal(wplus .* (1 .- wplus)) * X) + λ * I
+
+    # Use ForwardDiff
+    logsigmoid(x)  = -log1p(exp(-x))
+    objective2(θ_) = -sum(logsigmoid.(y .* (X*θ_))) + λ * sum(abs2, θ_) / 2
+    fd_grad = ForwardDiff.gradient(θ_ -> objective2(θ_), θ)
+    fd_hess = ForwardDiff.hessian(θ_ -> objective2(θ_), θ)
+    @test g ≈ fd_grad
+    @test H ≈ fd_hess
 
     # Hv! without  fit_intercept
     s = R.scratch(X; i=false)
     Hv! = R.Hv!(lr, X, y, s)
     v   = randn(rng, p)
     Hv  = similar(v)
     Hv!(Hv, θ, v)
-    @test Hv ≈               H * v
+    @test Hv ≈ H * v
 
     # fgh! with fit_intercept
     s = R.scratch(X; i=true)
@@ -108,16 +124,28 @@ end
     g1 = similar(θ1)
     H1 = zeros(p+1, p+1)
     f1 = fgh!(f1, g1, H1, θ1)
+
+    wminus = R.σ.(-y .* (X1 * θ1))
+    wplus  = R.σ.(y .* (X1 * θ1))
+
     @test f1 ≈ J(θ1)
-    @test g1 ≈              -X1' * (y .* R.σ.(-y .* (X1 * θ1))) .+ λ .* θ1
-    @test H1 ≈               X1' * (Diagonal(R.σ.(y .* (X1 * θ1))) * X1) + λ * I
+    @test g1 ≈ -X1' * (y .* wminus) .+ λ .* θ1
+    @test H1 ≈ X1' * (Diagonal(wplus .* (1 .- wplus)) * X1) + λ * I
+
+    # Use ForwardDiff
+    logsigmoid(x)  = -log1p(exp(-x))
+    objective2(θ_) = -sum(logsigmoid.(y .* (X1*θ_))) + λ * sum(abs2, θ_) / 2
+    fd_grad = ForwardDiff.gradient(θ_ -> objective2(θ_), θ1)
+    fd_hess = ForwardDiff.hessian(θ_ -> objective2(θ_), θ1)
+    @test g1 ≈ fd_grad
+    @test H1 ≈ fd_hess
 
     # Hv! with fit_intercept
     Hv! = R.Hv!(lr1, X, y, s)
     v   = randn(rng, p+1)
     Hv  = similar(v)
     Hv!(Hv, θ1, v)
-    @test Hv ≈               H1 * v
+    @test Hv ≈ H1 * v
 
     # fgh! with fit intercept and no penalty on intercept
     lr1 = LogisticRegression(λ)
@@ -128,9 +156,13 @@ end
     g1 = similar(θ1)
     H1 = zeros(p+1, p+1)
     f1 = fgh!(f1, g1, H1, θ1)
+
+    wminus = R.σ.(-y .* (X1 * θ1))
+    wplus  = R.σ.(y .* (X1 * θ1))
+
     @test f1 ≈ J(θ1)
-    @test g1 ≈              -X1' * (y .* R.σ.(-y .* (X1 * θ1))) .+ λ .* θ1 .* maskint
-    @test H1 ≈               X1' * (Diagonal(R.σ.(y .* (X1 * θ1))) * X1) + λ * Diagonal(maskint)
+    @test g1 ≈ -X1' * (y .* wminus) .+ λ .* θ1 .* maskint
+    @test H1 ≈  X1' * (Diagonal(wplus .* (1.0 .- wplus)) * X1) + λ * Diagonal(maskint)
     Hv! = R.Hv!(lr1, X, y, s)
     v   = randn(rng, p+1)
     Hv  = similar(v)
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1,4 +1,5 @@
-using MLJLinearModels, Test, LinearAlgebra, Random, StableRNGs, DataFrames
+using MLJLinearModels, Test, LinearAlgebra
+using Random, StableRNGs, DataFrames, ForwardDiff
 import MLJBase # not MLJModelInterface, to mimic the full interface
 
 DO_COMPARISONS = false; include("testutils.jl")