
Commit 8d8a259

proper level checks (#73)
1 parent 4ce79b5 commit 8d8a259

File tree: 18 files changed, +174 −85 lines changed


Project.toml

Lines changed: 2 additions & 1 deletion

@@ -1,9 +1,10 @@
 name = "MLJLinearModels"
 uuid = "6ee0df7b-362f-4a72-a706-9e79364fb692"
 authors = ["Thibaut Lienart <[email protected]>"]
-version = "0.3.6"
+version = "0.4.0"

 [deps]
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
 IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

src/fit/analytical.jl

Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@ function _fit(glr::GLR{L2Loss,<:L2R}, solver::Analytical, X, y, scratch)
     # it is done implicitly in the application of the Hessian to
     # avoid copying X
     # The number of CG steps to convergence is at most `p`
-    p = size(X, 2) + Int(glr.fit_intercept)
+    _,p,_ = npc(scratch)
     max_cg_steps = min(solver.max_inner, p)
     # Form the Hessian map, cost of application H*v is O(np)
     Hm = LinearMap(Hv!(glr, X, y, scratch), p;

src/fit/default.jl

Lines changed: 5 additions & 3 deletions

@@ -11,7 +11,7 @@ _solver(::GLR{L2Loss,<:L2R}, np::NTuple{2,Int}) = Analytical()

 # Logistic, Multinomial
 _solver(::GLR{LogisticLoss,<:L2R}, np::NTuple{2,Int}) = LBFGS()
-_solver(::GLR{MultinomialLoss,<:L2R}, np::NTuple{2,Int}) = LBFGS()
+_solver(::GLR{<:MultinomialLoss,<:L2R}, np::NTuple{2,Int}) = LBFGS()

 # Lasso, ElasticNet, Logistic, Multinomial
 function _solver(glr::GLR{<:SmoothLoss,<:ENR}, np::NTuple{2,Int})
@@ -37,17 +37,19 @@ function fit(glr::GLR, X::AbstractMatrix{<:Real}, y::AVR;
              solver::Solver=_solver(glr, size(X)))
     check_nrows(X, y)
     n, p = size(X)
-    c = glr.loss isa MultinomialLoss ? maximum(y) : 0
+    c = getc(glr, y)
     return _fit(glr, solver, X, y, scratch(n, p, c, i=glr.fit_intercept))
 end

 function scratch(n, p, c=0; i=false)
     p_ = p + Int(i)
-    s = (n=zeros(n), n2=zeros(n), n3=zeros(n), p=zeros(p_))
+    s = (n=zeros(n), n2=zeros(n), n3=zeros(n), p=zeros(p_), dims=(n,p_,c))
     if !iszero(c)
         s = (s..., nc=zeros(n,c), nc2=zeros(n,c), nc3=zeros(n,c),
              nc4=zeros(n,c), pc=zeros(p_,c))
     end
     return s
 end
 scratch(X; kw...) = scratch(size(X)...; kw...)
+
+npc(s) = s.dims
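For orientation, a minimal sketch of how the `dims` field and the `npc` accessor added above are meant to interact. The numeric values are hypothetical; only `scratch` and `npc` come from this diff:

    # hypothetical standalone illustration of the scratch/npc pattern
    n, p, c = 100, 5, 3             # observations, features, classes
    s = scratch(n, p, c, i=true)    # i=true reserves an intercept entry, so the stored p is 6
    n_, p_, c_ = npc(s)             # returns s.dims, here (100, 6, 3)
    θ₀ = zeros(p_ * c_)             # how the multinomial solvers now size the parameter vector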

src/fit/iwls.jl

Lines changed: 1 addition & 2 deletions

@@ -1,8 +1,7 @@
 function _fit(glr::GLR{RobustLoss{ρ},<:L2R}, solver::IWLSCG, X, y, scratch
              ) where {ρ}
     λ = getscale(glr.penalty)
-    n = size(X, 1)
-    p = size(X, 2) + Int(glr.fit_intercept)
+    n,p,_ = npc(scratch)
     _Mv! = Mv!(glr, X, y, scratch; threshold=solver.threshold)
     κ = solver.damping # between 0 and 1, 1 = fully take the new iteration
     # cache

src/fit/newton.jl

Lines changed: 29 additions & 29 deletions

@@ -14,7 +14,7 @@ Hessian at each step with κ the number of Newton steps.
 """
 function _fit(glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R},
               solver::Newton, X, y, scratch)
-    p = size(X, 2) + Int(glr.fit_intercept)
+    _,p,_ = npc(scratch)
     θ₀ = zeros(p)
     _fgh! = fgh!(glr, X, y, scratch)
     opt = Optim.only_fgh!(_fgh!)
@@ -36,13 +36,13 @@ average number of CG steps per Newton step (which is at most p).
 """
 function _fit(glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R},
               solver::NewtonCG, X, y, scratch)
-    p = size(X, 2) + Int(glr.fit_intercept)
-    θ₀ = zeros(p)
-    _f = objective(glr, X, y)
-    _fg! = (g, θ) -> fgh!(glr, X, y, scratch)(0.0, g, nothing, θ) # Optim.jl/738
-    _Hv! = Hv!(glr, X, y, scratch)
-    opt = Optim.TwiceDifferentiableHV(_f, _fg!, _Hv!, θ₀)
-    res = Optim.optimize(opt, θ₀, Optim.KrylovTrustRegion())
+    _,p,_ = npc(scratch)
+    θ₀ = zeros(p)
+    _f = objective(glr, X, y)
+    _fg! = (g, θ) -> fgh!(glr, X, y, scratch)(0.0, g, nothing, θ) # Optim#738
+    _Hv! = Hv!(glr, X, y, scratch)
+    opt = Optim.TwiceDifferentiableHV(_f, _fg!, _Hv!, θ₀)
+    res = Optim.optimize(opt, θ₀, Optim.KrylovTrustRegion())
     return Optim.minimizer(res)
 end

@@ -58,11 +58,11 @@ gradient at each step with κ the number of LBFGS steps.
 """
 function _fit(glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R},
               solver::LBFGS, X, y, scratch)
-    p = size(X, 2) + Int(glr.fit_intercept)
-    θ₀ = zeros(p)
-    _fg! = (f, g, θ) -> fgh!(glr, X, y, scratch)(f, g, nothing, θ)
-    opt = Optim.only_fg!(_fg!)
-    res = Optim.optimize(opt, θ₀, Optim.LBFGS())
+    _,p,_ = npc(scratch)
+    θ₀ = zeros(p)
+    _fg! = (f, g, θ) -> fgh!(glr, X, y, scratch)(f, g, nothing, θ)
+    opt = Optim.only_fg!(_fg!)
+    res = Optim.optimize(opt, θ₀, Optim.LBFGS())
     return Optim.minimizer(res)
 end

@@ -82,15 +82,15 @@ computations are dominated by the application of the Hessian at each step with
 κ₁ the number of Newton steps and κ₂ the average number of CG steps per Newton
 step.
 """
-function _fit(glr::GLR{MultinomialLoss,<:L2R}, solver::NewtonCG, X, y, scratch)
-    p = size(X, 2) + Int(glr.fit_intercept)
-    c = maximum(y)
-    θ₀ = zeros(p * c)
-    _f = objective(glr, X, y; c=c)
-    _fg! = (g, θ) -> fg!(glr, X, y, scratch)(0.0, g, θ) # XXX: Optim.jl/738
-    _Hv! = Hv!(glr, X, y, scratch)
-    opt = Optim.TwiceDifferentiableHV(_f, _fg!, _Hv!, θ₀)
-    res = Optim.optimize(opt, θ₀, Optim.KrylovTrustRegion())
+function _fit(glr::GLR{<:MultinomialLoss,<:L2R}, solver::NewtonCG,
+              X, y, scratch)
+    _,p,c = npc(scratch)
+    θ₀ = zeros(p * c)
+    _f = objective(glr, X, y; c=c)
+    _fg! = (g, θ) -> fg!(glr, X, y, scratch)(0.0, g, θ) # XXX: Optim.jl/738
+    _Hv! = Hv!(glr, X, y, scratch)
+    opt = Optim.TwiceDifferentiableHV(_f, _fg!, _Hv!, θ₀)
+    res = Optim.optimize(opt, θ₀, Optim.KrylovTrustRegion())
     return Optim.minimizer(res)
 end

@@ -105,12 +105,12 @@ Assuming `n` dominates `p`, O(κnpc), with `c` the number of classes, dominated
 by the computation of the gradient at each step with κ the number of LBFGS
 steps.
 """
-function _fit(glr::GLR{MultinomialLoss,<:L2R}, solver::LBFGS, X, y, scratch)
-    p = size(X, 2) + Int(glr.fit_intercept)
-    c = maximum(y)
-    θ₀ = zeros(p * c)
-    _fg! = fg!(glr, X, y, scratch)
-    opt = Optim.only_fg!(_fg!)
-    res = Optim.optimize(opt, θ₀, Optim.LBFGS())
+function _fit(glr::GLR{<:MultinomialLoss,<:L2R}, solver::LBFGS,
+              X, y, scratch)
+    _,p,c = npc(scratch)
+    θ₀ = zeros(p * c)
+    _fg! = fg!(glr, X, y, scratch)
+    opt = Optim.only_fg!(_fg!)
+    res = Optim.optimize(opt, θ₀, Optim.LBFGS())
     return Optim.minimizer(res)
 end

src/fit/proxgrad.jl

Lines changed: 2 additions & 2 deletions

@@ -3,8 +3,8 @@
 # Assumption: loss has gradient; penalty has prox e.g.: Lasso
 # J(θ) = f(θ) + r(θ) where f is smooth
 function _fit(glr::GLR, solver::ProxGrad, X, y, scratch)
-    c = ifelse(isa(glr.loss, MultinomialLoss), length(unique(y)), 1)
-    p = (size(X, 2) + Int(glr.fit_intercept)) * c
+    _,p,c = npc(scratch)
+    c > 0 && (p *= c)
     # vector caches + eval cache
     θ = zeros(p)   # θ_k
     Δθ = zeros(p)  # (θ_k - θ_{k-1})

src/glr/constructors.jl

Lines changed: 16 additions & 4 deletions

@@ -32,6 +32,10 @@ end

 const GLR = GeneralizedLinearRegression

+getc(g::GLR) = getc(g.loss)
+getc(g::GLR, y) = getc(g.loss, y)
+
+## Specific constructors

 """
 $SIGNATURES
@@ -116,10 +120,17 @@ logistic loss in the binary case or the multinomial loss otherwise.
 function LogisticRegression(λ::Real=1.0, γ::Real=0.0;
                             lambda::Real=λ, gamma::Real=γ,
                             penalty::Symbol=iszero(gamma) ? :l2 : :en,
-                            multi_class::Bool=false, fit_intercept::Bool=true,
-                            penalize_intercept::Bool=false)
+                            fit_intercept::Bool=true,
+                            penalize_intercept::Bool=false,
+                            multi_class::Bool=false,
+                            nclasses::Integer=0)
     penalty = _l1l2en(lambda, gamma, penalty, "Logistic regression")
-    loss = multi_class ? MultinomialLoss() : LogisticLoss()
+    loss = LogisticLoss()
+    if nclasses > 2  # number of classes is explicitly specified
+        loss = MultinomialLoss(nclasses)
+    elseif multi_class  # number of classes will be inferred from data
+        loss = MultinomialLoss()
+    end
     GLR(loss=loss,
         penalty=penalty,
         fit_intercept=fit_intercept,
@@ -132,7 +143,8 @@ $SIGNATURES
 Objective function: ``L(y, Xθ) + λ|θ|₂²/2 + γ|θ|₁`` where `L` is the
 multinomial loss.
 """
-MultinomialRegression(a...; kwa...) = LogisticRegression(a...; multi_class=true, kwa...)
+MultinomialRegression(a...; kwa...) =
+    LogisticRegression(a...; multi_class=true, kwa...)


 # ========
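A short usage sketch of the reworked constructor, relying only on the keywords visible in the hunk above: `nclasses` fixes the number of levels up front, while `multi_class=true` keeps the old behaviour of inferring it from the data at fit time.

    LogisticRegression()                  # binary: LogisticLoss
    LogisticRegression(nclasses=3)        # multiclass with a known level count: MultinomialLoss(3)
    LogisticRegression(multi_class=true)  # multiclass, level count inferred from y at fit time
    MultinomialRegression(1.0)            # forwards to LogisticRegression with multi_class=true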

src/glr/d_logistic.jl

Lines changed: 5 additions & 5 deletions

@@ -144,9 +144,9 @@ end
 # * yᵢ ∈ {1, 2, ..., c}
 # ---------------------------------------------------------

-function fg!(glr::GLR{MultinomialLoss,<:L2R}, X, y, scratch)
+function fg!(glr::GLR{<:MultinomialLoss,<:L2R}, X, y, scratch)
     n, p = size(X)
-    c = length(unique(y))
+    c = getc(glr, y)
     λ = getscale(glr.penalty)
     (f, g, θ) -> begin
         P = scratch.nc
@@ -199,10 +199,10 @@ function fg!(glr::GLR{MultinomialLoss,<:L2R}, X, y, scratch)
     end
 end

-function Hv!(glr::GLR{MultinomialLoss,<:L2R}, X, y, scratch)
+function Hv!(glr::GLR{<:MultinomialLoss,<:L2R}, X, y, scratch)
     p = size(X, 2)
     λ = getscale(glr.penalty)
-    c = length(unique(y))
+    c = getc(glr, y)
     # NOTE:
     # * ideally P and Q should be recuperated from gradient computations (fghv!)
     # * assumption that c is small so that storing matrices of size n * c is
@@ -242,7 +242,7 @@ end
 # -> prox_r = soft-thresh
 # ---------------------------------------------------------

-function smooth_fg!(glr::GLR{MultinomialLoss,<:ENR}, X, y, scratch)
+function smooth_fg!(glr::GLR{<:MultinomialLoss,<:ENR}, X, y, scratch)
     smooth = get_smooth(glr)
     (g, θ) -> fg!(smooth, X, y, scratch)(0.0, g, θ)
 end

src/glr/utils.jl

Lines changed: 2 additions & 2 deletions

@@ -17,7 +17,7 @@ $SIGNATURES
 Return a function computing the objective at a given point `θ`.
 Note that the [`apply_X`](@ref) takes care of a potential intercept.
 """
-objective(glr::GLR, X, y; c::Int=1) =
+objective(glr::GLR, X, y; c::Int=0) =
     θ -> objective(glr)(y, apply_X(X, θ, c), view_θ(glr, θ))


@@ -27,7 +27,7 @@ $SIGNATURES
 Return a function computing the smooth part of the objective at a given
 evaluation point `θ`.
 """
-smooth_objective(glr::GLR, X, y; c::Int=1) =
+smooth_objective(glr::GLR, X, y; c::Int=0) =
     θ -> smooth_objective(glr)(y, apply_X(X, θ, c), view_θ(glr, θ))

 """

src/loss-penalty/generic.jl

Lines changed: 9 additions & 0 deletions

@@ -32,6 +32,15 @@ getscale(n::NoLoss) = 0.0
 getscale(l::AtomicLoss) = 1.0
 getscale(l::ScaledLoss) = l.scale

+# Convenient extension for classification
+abstract type MultiClassLoss{c} <: AtomicLoss where c end
+
+getc(m) = 0
+getc(m, y) = 0
+getc(m::MultiClassLoss{c}) where c = c
+getc(m::MultiClassLoss{0}, y) = maximum(y)
+getc(m::MultiClassLoss{c}, y) where c = c
+
 # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
 # Penalty: θ -> P(θ)
 # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
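To make the dispatch ladder concrete, a sketch of how `getc` is expected to resolve. It assumes `MultinomialLoss` is redefined elsewhere in this commit as a parametric subtype `MultinomialLoss{c} <: MultiClassLoss{c}`; that file is among the 18 changed but is not shown here.

    y = [1, 3, 2, 3, 1]           # integer-encoded labels
    getc(LogisticLoss(), y)       # 0: generic fallback getc(m, y) = 0
    getc(MultinomialLoss(), y)    # 3: parameter 0, so maximum(y) gives the level count
    getc(MultinomialLoss(3), y)   # 3: level count fixed at construction
    getc(MultinomialLoss(5), y)   # 5: kept even if y shows fewer levels, which is the point of the "proper level checks"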
