
Commit 8d8a259

proper level checks (#73)
1 parent 4ce79b5 commit 8d8a259

File tree: 18 files changed, +174 −85 lines changed


Project.toml

Lines changed: 2 additions & 1 deletion

@@ -1,9 +1,10 @@
 name = "MLJLinearModels"
 uuid = "6ee0df7b-362f-4a72-a706-9e79364fb692"
 authors = ["Thibaut Lienart <[email protected]>"]
-version = "0.3.6"
+version = "0.4.0"

 [deps]
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
 IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

src/fit/analytical.jl

Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@ function _fit(glr::GLR{L2Loss,<:L2R}, solver::Analytical, X, y, scratch)
     # it is done implicitly in the application of the Hessian to
     # avoid copying X
     # The number of CG steps to convergence is at most `p`
-    p = size(X, 2) + Int(glr.fit_intercept)
+    _,p,_ = npc(scratch)
     max_cg_steps = min(solver.max_inner, p)
     # Form the Hessian map, cost of application H*v is O(np)
     Hm = LinearMap(Hv!(glr, X, y, scratch), p;

src/fit/default.jl

Lines changed: 5 additions & 3 deletions

@@ -11,7 +11,7 @@ _solver(::GLR{L2Loss,<:L2R}, np::NTuple{2,Int}) = Analytical()

 # Logistic, Multinomial
 _solver(::GLR{LogisticLoss,<:L2R}, np::NTuple{2,Int}) = LBFGS()
-_solver(::GLR{MultinomialLoss,<:L2R}, np::NTuple{2,Int}) = LBFGS()
+_solver(::GLR{<:MultinomialLoss,<:L2R}, np::NTuple{2,Int}) = LBFGS()

 # Lasso, ElasticNet, Logistic, Multinomial
 function _solver(glr::GLR{<:SmoothLoss,<:ENR}, np::NTuple{2,Int})
@@ -37,17 +37,19 @@ function fit(glr::GLR, X::AbstractMatrix{<:Real}, y::AVR;
              solver::Solver=_solver(glr, size(X)))
     check_nrows(X, y)
     n, p = size(X)
-    c = glr.loss isa MultinomialLoss ? maximum(y) : 0
+    c = getc(glr, y)
     return _fit(glr, solver, X, y, scratch(n, p, c, i=glr.fit_intercept))
 end

 function scratch(n, p, c=0; i=false)
     p_ = p + Int(i)
-    s = (n=zeros(n), n2=zeros(n), n3=zeros(n), p=zeros(p_))
+    s = (n=zeros(n), n2=zeros(n), n3=zeros(n), p=zeros(p_), dims=(n,p_,c))
     if !iszero(c)
         s = (s..., nc=zeros(n,c), nc2=zeros(n,c), nc3=zeros(n,c),
              nc4=zeros(n,c), pc=zeros(p_,c))
     end
     return s
 end
 scratch(X; kw...) = scratch(size(X)...; kw...)
+
+npc(s) = s.dims
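For orientation, a minimal sketch of how the `dims` field and the `npc` accessor added above are meant to interact. The numeric values are hypothetical; only `scratch` and `npc` come from this diff:

    # hypothetical standalone illustration of the scratch/npc pattern
    n, p, c = 100, 5, 3             # observations, features, classes
    s = scratch(n, p, c, i=true)    # i=true reserves an intercept entry, so the stored p is 6
    n_, p_, c_ = npc(s)             # returns s.dims, here (100, 6, 3)
    θ₀ = zeros(p_ * c_)             # how the multinomial solvers now size the parameter vector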

src/fit/iwls.jl

Lines changed: 1 addition & 2 deletions

@@ -1,8 +1,7 @@
 function _fit(glr::GLR{RobustLoss{ρ},<:L2R}, solver::IWLSCG, X, y, scratch
              ) where {ρ}
     λ = getscale(glr.penalty)
-    n = size(X, 1)
-    p = size(X, 2) + Int(glr.fit_intercept)
+    n,p,_ = npc(scratch)
     _Mv! = Mv!(glr, X, y, scratch; threshold=solver.threshold)
     κ = solver.damping # between 0 and 1, 1 = fully take the new iteration
     # cache

src/fit/newton.jl

Lines changed: 29 additions & 29 deletions

@@ -14,7 +14,7 @@ Hessian at each step with κ the number of Newton steps.
 """
 function _fit(glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R},
               solver::Newton, X, y, scratch)
-    p = size(X, 2) + Int(glr.fit_intercept)
+    _,p,_ = npc(scratch)
     θ₀ = zeros(p)
     _fgh! = fgh!(glr, X, y, scratch)
     opt = Optim.only_fgh!(_fgh!)
@@ -36,13 +36,13 @@ average number of CG steps per Newton step (which is at most p).
 """
 function _fit(glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R},
               solver::NewtonCG, X, y, scratch)
-    p = size(X, 2) + Int(glr.fit_intercept)
-    θ₀ = zeros(p)
-    _f = objective(glr, X, y)
-    _fg! = (g, θ) -> fgh!(glr, X, y, scratch)(0.0, g, nothing, θ) # Optim.jl/738
-    _Hv! = Hv!(glr, X, y, scratch)
-    opt = Optim.TwiceDifferentiableHV(_f, _fg!, _Hv!, θ₀)
-    res = Optim.optimize(opt, θ₀, Optim.KrylovTrustRegion())
+    _,p,_ = npc(scratch)
+    θ₀ = zeros(p)
+    _f = objective(glr, X, y)
+    _fg! = (g, θ) -> fgh!(glr, X, y, scratch)(0.0, g, nothing, θ) # Optim#738
+    _Hv! = Hv!(glr, X, y, scratch)
+    opt = Optim.TwiceDifferentiableHV(_f, _fg!, _Hv!, θ₀)
+    res = Optim.optimize(opt, θ₀, Optim.KrylovTrustRegion())
     return Optim.minimizer(res)
 end

@@ -58,11 +58,11 @@ gradient at each step with κ the number of LBFGS steps.
 """
 function _fit(glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R},
               solver::LBFGS, X, y, scratch)
-    p = size(X, 2) + Int(glr.fit_intercept)
-    θ₀ = zeros(p)
-    _fg! = (f, g, θ) -> fgh!(glr, X, y, scratch)(f, g, nothing, θ)
-    opt = Optim.only_fg!(_fg!)
-    res = Optim.optimize(opt, θ₀, Optim.LBFGS())
+    _,p,_ = npc(scratch)
+    θ₀ = zeros(p)
+    _fg! = (f, g, θ) -> fgh!(glr, X, y, scratch)(f, g, nothing, θ)
+    opt = Optim.only_fg!(_fg!)
+    res = Optim.optimize(opt, θ₀, Optim.LBFGS())
     return Optim.minimizer(res)
 end

@@ -82,15 +82,15 @@ computations are dominated by the application of the Hessian at each step with
 κ₁ the number of Newton steps and κ₂ the average number of CG steps per Newton
 step.
 """
-function _fit(glr::GLR{MultinomialLoss,<:L2R}, solver::NewtonCG, X, y, scratch)
-    p = size(X, 2) + Int(glr.fit_intercept)
-    c = maximum(y)
-    θ₀ = zeros(p * c)
-    _f = objective(glr, X, y; c=c)
-    _fg! = (g, θ) -> fg!(glr, X, y, scratch)(0.0, g, θ) # XXX: Optim.jl/738
-    _Hv! = Hv!(glr, X, y, scratch)
-    opt = Optim.TwiceDifferentiableHV(_f, _fg!, _Hv!, θ₀)
-    res = Optim.optimize(opt, θ₀, Optim.KrylovTrustRegion())
+function _fit(glr::GLR{<:MultinomialLoss,<:L2R}, solver::NewtonCG,
+              X, y, scratch)
+    _,p,c = npc(scratch)
+    θ₀ = zeros(p * c)
+    _f = objective(glr, X, y; c=c)
+    _fg! = (g, θ) -> fg!(glr, X, y, scratch)(0.0, g, θ) # XXX: Optim.jl/738
+    _Hv! = Hv!(glr, X, y, scratch)
+    opt = Optim.TwiceDifferentiableHV(_f, _fg!, _Hv!, θ₀)
+    res = Optim.optimize(opt, θ₀, Optim.KrylovTrustRegion())
     return Optim.minimizer(res)
 end

@@ -105,12 +105,12 @@ Assuming `n` dominates `p`, O(κnpc), with `c` the number of classes, dominated
 by the computation of the gradient at each step with κ the number of LBFGS
 steps.
 """
-function _fit(glr::GLR{MultinomialLoss,<:L2R}, solver::LBFGS, X, y, scratch)
-    p = size(X, 2) + Int(glr.fit_intercept)
-    c = maximum(y)
-    θ₀ = zeros(p * c)
-    _fg! = fg!(glr, X, y, scratch)
-    opt = Optim.only_fg!(_fg!)
-    res = Optim.optimize(opt, θ₀, Optim.LBFGS())
+function _fit(glr::GLR{<:MultinomialLoss,<:L2R}, solver::LBFGS,
+              X, y, scratch)
+    _,p,c = npc(scratch)
+    θ₀ = zeros(p * c)
+    _fg! = fg!(glr, X, y, scratch)
+    opt = Optim.only_fg!(_fg!)
+    res = Optim.optimize(opt, θ₀, Optim.LBFGS())
     return Optim.minimizer(res)
 end

src/fit/proxgrad.jl

Lines changed: 2 additions & 2 deletions

@@ -3,8 +3,8 @@
 # Assumption: loss has gradient; penalty has prox e.g.: Lasso
 # J(θ) = f(θ) + r(θ) where f is smooth
 function _fit(glr::GLR, solver::ProxGrad, X, y, scratch)
-    c = ifelse(isa(glr.loss, MultinomialLoss), length(unique(y)), 1)
-    p = (size(X, 2) + Int(glr.fit_intercept)) * c
+    _,p,c = npc(scratch)
+    c > 0 && (p *= c)
     # vector caches + eval cache
     θ = zeros(p)   # θ_k
     Δθ = zeros(p)  # (θ_k - θ_{k-1})

src/glr/constructors.jl

Lines changed: 16 additions & 4 deletions

@@ -32,6 +32,10 @@ end

 const GLR = GeneralizedLinearRegression

+getc(g::GLR) = getc(g.loss)
+getc(g::GLR, y) = getc(g.loss, y)
+
+## Specific constructors

 """
 $SIGNATURES
@@ -116,10 +120,17 @@ logistic loss in the binary case or the multinomial loss otherwise.
 function LogisticRegression(λ::Real=1.0, γ::Real=0.0;
                             lambda::Real=λ, gamma::Real=γ,
                             penalty::Symbol=iszero(gamma) ? :l2 : :en,
-                            multi_class::Bool=false, fit_intercept::Bool=true,
-                            penalize_intercept::Bool=false)
+                            fit_intercept::Bool=true,
+                            penalize_intercept::Bool=false,
+                            multi_class::Bool=false,
+                            nclasses::Integer=0)
     penalty = _l1l2en(lambda, gamma, penalty, "Logistic regression")
-    loss = multi_class ? MultinomialLoss() : LogisticLoss()
+    loss = LogisticLoss()
+    if nclasses > 2  # number of classes is explicitly specified
+        loss = MultinomialLoss(nclasses)
+    elseif multi_class  # number of classes will be inferred from data
+        loss = MultinomialLoss()
+    end
     GLR(loss=loss,
         penalty=penalty,
         fit_intercept=fit_intercept,
@@ -132,7 +143,8 @@ $SIGNATURES
 Objective function: ``L(y, Xθ) + λ|θ|₂²/2 + γ|θ|₁`` where `L` is the
 multinomial loss.
 """
-MultinomialRegression(a...; kwa...) = LogisticRegression(a...; multi_class=true, kwa...)
+MultinomialRegression(a...; kwa...) =
+    LogisticRegression(a...; multi_class=true, kwa...)


 # ========
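A short usage sketch of the reworked constructor, relying only on the keywords visible in the hunk above: `nclasses` fixes the number of levels up front, while `multi_class=true` keeps the old behaviour of inferring it from the data at fit time.

    LogisticRegression()                  # binary: LogisticLoss
    LogisticRegression(nclasses=3)        # multiclass with a known level count: MultinomialLoss(3)
    LogisticRegression(multi_class=true)  # multiclass, level count inferred from y at fit time
    MultinomialRegression(1.0)            # forwards to LogisticRegression with multi_class=true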

src/glr/d_logistic.jl

Lines changed: 5 additions & 5 deletions

@@ -144,9 +144,9 @@ end
 # * yᵢ ∈ {1, 2, ..., c}
 # ---------------------------------------------------------

-function fg!(glr::GLR{MultinomialLoss,<:L2R}, X, y, scratch)
+function fg!(glr::GLR{<:MultinomialLoss,<:L2R}, X, y, scratch)
     n, p = size(X)
-    c = length(unique(y))
+    c = getc(glr, y)
     λ = getscale(glr.penalty)
     (f, g, θ) -> begin
         P = scratch.nc
@@ -199,10 +199,10 @@ function fg!(glr::GLR{MultinomialLoss,<:L2R}, X, y, scratch)
     end
 end

-function Hv!(glr::GLR{MultinomialLoss,<:L2R}, X, y, scratch)
+function Hv!(glr::GLR{<:MultinomialLoss,<:L2R}, X, y, scratch)
     p = size(X, 2)
     λ = getscale(glr.penalty)
-    c = length(unique(y))
+    c = getc(glr, y)
     # NOTE:
     # * ideally P and Q should be recuperated from gradient computations (fghv!)
     # * assumption that c is small so that storing matrices of size n * c is
@@ -242,7 +242,7 @@ end
 # -> prox_r = soft-thresh
 # ---------------------------------------------------------

-function smooth_fg!(glr::GLR{MultinomialLoss,<:ENR}, X, y, scratch)
+function smooth_fg!(glr::GLR{<:MultinomialLoss,<:ENR}, X, y, scratch)
     smooth = get_smooth(glr)
     (g, θ) -> fg!(smooth, X, y, scratch)(0.0, g, θ)
 end

src/glr/utils.jl

Lines changed: 2 additions & 2 deletions

@@ -17,7 +17,7 @@ $SIGNATURES
 Return a function computing the objective at a given point `θ`.
 Note that the [`apply_X`](@ref) takes care of a potential intercept.
 """
-objective(glr::GLR, X, y; c::Int=1) =
+objective(glr::GLR, X, y; c::Int=0) =
     θ -> objective(glr)(y, apply_X(X, θ, c), view_θ(glr, θ))


@@ -27,7 +27,7 @@ $SIGNATURES
 Return a function computing the smooth part of the objective at a given
 evaluation point `θ`.
 """
-smooth_objective(glr::GLR, X, y; c::Int=1) =
+smooth_objective(glr::GLR, X, y; c::Int=0) =
     θ -> smooth_objective(glr)(y, apply_X(X, θ, c), view_θ(glr, θ))

 """

src/loss-penalty/generic.jl

Lines changed: 9 additions & 0 deletions

@@ -32,6 +32,15 @@ getscale(n::NoLoss) = 0.0
 getscale(l::AtomicLoss) = 1.0
 getscale(l::ScaledLoss) = l.scale

+# Convenient extension for classification
+abstract type MultiClassLoss{c} <: AtomicLoss where c end
+
+getc(m) = 0
+getc(m, y) = 0
+getc(m::MultiClassLoss{c}) where c = c
+getc(m::MultiClassLoss{0}, y) = maximum(y)
+getc(m::MultiClassLoss{c}, y) where c = c
+
 # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
 # Penalty: θ -> P(θ)
 # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
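To make the dispatch ladder concrete, a sketch of how `getc` is expected to resolve. It assumes `MultinomialLoss` is redefined elsewhere in this commit as a parametric subtype `MultinomialLoss{c} <: MultiClassLoss{c}`; that file is among the 18 changed but is not shown here.

    y = [1, 3, 2, 3, 1]           # integer-encoded labels
    getc(LogisticLoss(), y)       # 0: generic fallback getc(m, y) = 0
    getc(MultinomialLoss(), y)    # 3: parameter 0, so maximum(y) gives the level count
    getc(MultinomialLoss(3), y)   # 3: level count fixed at construction
    getc(MultinomialLoss(5), y)   # 5: kept even if y shows fewer levels, which is the point of the "proper level checks"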
