Skip to content

Commit 273650a

Browse files
authored
Penalizeintercept (#31)
* decreasing allocs via temp * using morel legible refs * reviews 1 * adding tests for scratchspace * add scratch for robust * tests ok * adding a bunch of tests and completing the PR * additional fixes * update bench * fixing all conflicts... * stashing work * ongoing * adds toggle for penalty on intercept, closes #27 * diagm not available pre julia12
1 parent 6316160 commit 273650a

16 files changed

+341
-109
lines changed

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ The core aims of this package are:
2020
- focus on performance including in "big data" settings exploiting packages such as [`Optim.jl`](https://github.com/JuliaNLSolvers/Optim.jl), [`IterativeSolvers.jl`](https://github.com/JuliaMath/IterativeSolvers.jl),
2121
- use a "machine learning" perspective, i.e.: focus essentially on prediction, hyper-parameters should be obtained via a data-driven procedure such as cross-validation.
2222

23+
All models allow to fit an intercept and allow the penalty to be applied or not on the intercept (not applied by default).
24+
All models attempt to be efficient in terms of memory allocation to avoid unnecessary copies of the data.
25+
2326
## Implemented
2427

2528
| Regressors | Formulation¹ | Available solvers | Comments |
@@ -63,6 +66,7 @@ Systematic timing benchmarks have not been run yet but it's planned (see [this i
6366

6467
* The models are built and tested assuming `n > p`; if this doesn't hold, tricks should be employed to speed up computations; these have not been implemented yet.
6568
* CV-aware code not implemented yet (code that re-uses computations when fitting over a number of hyper-parameters); "Meta" functionalities such as One-vs-All or Cross-Validation are left to other packages such as MLJ.
69+
* No support yet for sparse matrices.
6670
* Stochastic solvers have not yet been implemented.
6771
* All computations are assumed to be done in Float64.
6872

src/glr/constructors.jl

Lines changed: 42 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,10 @@ Special cases include:
2424
"""
2525
@with_kw mutable struct GeneralizedLinearRegression{L<:Loss, P<:Penalty}
2626
# Parameters that can be tuned
27-
loss::L = L2Loss() # L(y, ŷ=Xθ)
28-
penalty::P = NoPenalty() # P(θ)
29-
fit_intercept::Bool = true # add intercept ? def=true
27+
loss::L = L2Loss() # L(y, ŷ=Xθ)
28+
penalty::P = NoPenalty() # P(θ)
29+
fit_intercept::Bool = true # add intercept ? def=true
30+
penalize_intercept::Bool = false
3031
end
3132

3233
const GLR = GeneralizedLinearRegression
@@ -45,9 +46,12 @@ $SIGNATURES
4546
4647
Objective function: ``|Xθ - y|₂²/2 + λ|θ|₂²/2``.
4748
"""
48-
function RidgeRegression::Real=1.0; lambda::Real=λ, fit_intercept::Bool=true)
49+
function RidgeRegression::Real=1.0; lambda::Real=λ, fit_intercept::Bool=true,
50+
penalize_intercept::Bool=false)
4951
check_pos(lambda)
50-
GLR(fit_intercept=fit_intercept, penalty=lambda*L2Penalty())
52+
GLR(penalty=lambda*L2Penalty(),
53+
fit_intercept=fit_intercept,
54+
penalize_intercept=penalize_intercept)
5155
end
5256

5357

@@ -56,9 +60,12 @@ $SIGNATURES
5660
5761
Objective function: ``|Xθ - y|₂²/2 + λ|θ|₁``
5862
"""
59-
function LassoRegression::Real=1.0; lambda::Real=λ, fit_intercept::Bool=true)
63+
function LassoRegression::Real=1.0; lambda::Real=λ, fit_intercept::Bool=true,
64+
penalize_intercept::Bool=false)
6065
check_pos(lambda)
61-
GLR(fit_intercept=fit_intercept, penalty=lambda*L1Penalty())
66+
GLR(penalty=lambda*L1Penalty(),
67+
fit_intercept=fit_intercept,
68+
penalize_intercept=penalize_intercept)
6269
end
6370

6471

@@ -68,9 +75,11 @@ $SIGNATURES
6875
Objective function: ``|Xθ - y|₂²/2 + λ|θ|₂²/2 + γ|θ|₁``
6976
"""
7077
function ElasticNetRegression::Real=1.0, γ::Real=1.0; lambda::Real=λ, gamma::Real=γ,
71-
fit_intercept::Bool=true)
78+
fit_intercept::Bool=true, penalize_intercept::Bool=false)
7279
check_pos.((lambda, gamma))
73-
GLR(fit_intercept=fit_intercept, penalty=lambda*L2Penalty()+gamma*L1Penalty())
80+
GLR(penalty=lambda*L2Penalty()+gamma*L1Penalty(),
81+
fit_intercept=fit_intercept,
82+
penalize_intercept=penalize_intercept)
7483
end
7584

7685

@@ -104,11 +113,14 @@ binary case or the multinomial loss otherwise.
104113
"""
105114
function LogisticRegression::Real=1.0, γ::Real=0.0; lambda::Real=λ, gamma::Real=γ,
106115
penalty::Symbol=iszero(gamma) ? :l2 : :en,
107-
multi_class::Bool=false,
108-
fit_intercept::Bool=true)
116+
multi_class::Bool=false, fit_intercept::Bool=true,
117+
penalize_intercept::Bool=false)
109118
penalty = _l1l2en(lambda, gamma, penalty, "Logistic regression")
110119
loss = multi_class ? MultinomialLoss() : LogisticLoss()
111-
GeneralizedLinearRegression(loss=loss, penalty=penalty, fit_intercept=fit_intercept)
120+
GLR(loss=loss,
121+
penalty=penalty,
122+
fit_intercept=fit_intercept,
123+
penalize_intercept=penalize_intercept)
112124
end
113125

114126
MultinomialRegression(a...; kwa...) = LogisticRegression(a...; multi_class=true, kwa...)
@@ -125,58 +137,65 @@ radius of the ball in which residuals are weighed quadratically).
125137
"""
126138
function RobustRegression::RobustRho=HuberRho(0.1), λ::Real=1.0, γ::Real=0.0;
127139
rho::RobustRho=ρ, lambda::Real=λ, gamma::Real=γ,
128-
penalty::Symbol=iszero(gamma) ? :l2 : :en, fit_intercept::Bool=true)
140+
penalty::Symbol=iszero(gamma) ? :l2 : :en, fit_intercept::Bool=true,
141+
penalize_intercept::Bool=false)
129142
penalty = _l1l2en(lambda, gamma, penalty, "Robust regression")
130-
GLR(fit_intercept=fit_intercept, loss=RobustLoss(rho), penalty=penalty)
143+
GLR(loss=RobustLoss(rho),
144+
penalty=penalty,
145+
fit_intercept=fit_intercept,
146+
penalize_intercept=penalize_intercept)
131147
end
132148

133149
"""
134150
$SIGNATURES
135151
136152
Huber Regression with objective:
137153
138-
``∑ρ(Xθ - y) + λ|θ|₂²/2``
154+
``∑ρ(Xθ - y) + λ|θ|₂²/2 + γ|θ|``
139155
140156
Where `ρ` is the Huber function `ρ(r) = r²/2`` if `|r|≤δ` and `ρ(r)=δ(|r|-δ/2)` otherwise.
141157
"""
142158
function HuberRegression::Real=0.5, λ::Real=1.0, γ::Real=0.0;
143159
delta::Real=δ, lambda::Real=λ, gamma::Real=γ,
144160
penalty::Symbol=iszero(gamma) ? :l2 : :en,
145-
fit_intercept::Bool=true)
161+
fit_intercept::Bool=true, penalize_intercept::Bool=false)
146162
return RobustRegression(HuberRho(delta), lambda, gamma;
147-
penalty=penalty, fit_intercept=fit_intercept)
163+
penalty=penalty, fit_intercept=fit_intercept,
164+
penalize_intercept=penalize_intercept)
148165
end
149166

150167
"""
151168
$SIGNATURES
152169
153170
Quantile Regression with objective:
154171
155-
``∑ρ(Xθ - y) + λ|θ|₂²/2``
172+
``∑ρ(Xθ - y) + λ|θ|₂²/2 + γ|θ|``
156173
157174
Where `ρ` is the check function `ρ(r) = r(δ - 1(r < 0))`.
158175
"""
159176
function QuantileRegression::Real=0.5, λ::Real=1.0, γ::Real=0.0;
160177
delta::Real=δ, lambda::Real=λ, gamma::Real=γ,
161178
penalty::Symbol=iszero(gamma) ? :l2 : :en,
162-
fit_intercept::Bool=true)
179+
fit_intercept::Bool=true, penalize_intercept::Bool=false)
163180
return RobustRegression(QuantileRho(delta), lambda, gamma;
164-
penalty=penalty, fit_intercept=fit_intercept)
181+
penalty=penalty, fit_intercept=fit_intercept,
182+
penalize_intercept=penalize_intercept)
165183
end
166184

167185
"""
168186
$SIGNATURES
169187
170188
Least Absolute Deviation regression with objective:
171189
172-
``|Xθ - y|₁ + λ|θ|₂²/2``
190+
``|Xθ - y|₁ + λ|θ|₂²/2 + γ|θ|``
173191
174192
This is a specific type of Quantile Regression with `δ=0.5` (median).
175193
"""
176194
function LADRegression::Real=1.0, γ::Real=0.0;
177195
lambda::Real=λ, gamma::Real=γ,
178196
penalty::Symbol=iszero(gamma) ? :l2 : :en,
179-
fit_intercept::Bool=true)
197+
fit_intercept::Bool=true, penalize_intercept::Bool=false)
180198
return QuantileRegression(0.5, lambda, gamma;
181-
penalty=penalty, fit_intercept=fit_intercept)
199+
penalty=penalty, fit_intercept=fit_intercept,
200+
penalize_intercept=penalize_intercept)
182201
end

src/glr/d_l2loss.jl

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@ function Hv!(glr::GLR{L2Loss,<:L2R}, X, y)
1818
if glr.fit_intercept
1919
# H = [X 1]'[X 1] + λ I
2020
# rows a 1:p = [X'X + λI | X'1]
21-
# row e end = [1'X | n+λ]
21+
# row e end = [1'X | n+λι] where ι is 1 if glr.penalize_intercept
22+
ι = float(glr.penalize_intercept)
2223
(Hv, v) -> begin
2324
# view on the first p rows
2425
a = 1:p
@@ -33,7 +34,7 @@ function Hv!(glr::GLR{L2Loss,<:L2R}, X, y)
3334
mul!(Hvₐ, X', Xvₐ)
3435
Hvₐ .+= λ .* vₐ .+ Xt1 .* vₑ
3536
# update for the last row -- (X'1)'v + n v[end]
36-
Hv[end] = dot(Xt1, vₐ) + (n+λ) * vₑ
37+
Hv[end] = dot(Xt1, vₐ) + (n + λ_if_penalize_intercept(glr, λ)) * vₑ
3738
end
3839
else
3940
(Hv, v) -> begin
@@ -64,6 +65,7 @@ function smooth_fg!(glr::GLR{L2Loss,<:ENR}, X, y)
6465
get_residuals!(r, X, θ, y) # -- r = Xθ-y
6566
apply_Xt!(g, X, r)
6667
g .+= λ .* θ
67-
return glr.loss(r) + get_l2(glr.penalty)(θ)
68+
glr.fit_intercept && (glr.penalize_intercept || (g[end] -= λ * θ[end]))
69+
return glr.loss(r) + get_l2(glr.penalty)(view_θ(glr, θ))
6870
end
6971
end

src/glr/d_logistic.jl

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,22 +25,23 @@ function fgh!(glr::GLR{LogisticLoss,<:L2R}, X, y)
2525
t .= y .* (w .- 1.0) # -- t = y .* (w .- 1.0)
2626
apply_Xt!(g, X, t) # -- g = X't
2727
g .+= λ .* θ
28+
glr.penalize_intercept || (g[end] -= λ * θ[end])
2829
end
2930
H === nothing || begin
3031
# NOTE: we could try to be clever to reduce the allocations for
3132
# ΛX but computing the full hessian allocates a lot anyway so
3233
# probably not really worth it
33-
ΛX = w .* X # !! big allocs
34-
mul!(view(H, 1:p, 1:p), X', ΛX) # -- H[1:p,1:p] = X'ΛX
34+
ΛX = w .* X # !! big allocs
35+
mul!(view(H, 1:p, 1:p), X', ΛX) # -- H[1:p,1:p] = X'ΛX
3536
ΛXt1 = view(SCRATCH_P[], 1:p)
36-
copyto!(ΛXt1, sum(ΛX, dims=1)) # -- (ΛX)'1
37+
copyto!(ΛXt1, sum(ΛX, dims=1)) # -- (ΛX)'1
3738
@inbounds for i = 1:p
38-
H[i, end] = H[end, i] = ΛXt1[i] # -- H[:,p+1] = H[p+1,:] = (ΛX)'1
39+
H[i, end] = H[end, i] = ΛXt1[i] # -- H[:,p+1] = H[p+1,:] = (ΛX)'1
3940
end
40-
H[end, end] = sum(w) # -- 1'Λ1'
41-
add_λI!(H, λ) # -- H = X'ΛX + λI
41+
H[end, end] = sum(w) # -- 1'Λ1'
42+
add_λI!(H, λ, glr.penalize_intercept) # -- H = X'ΛX + λI
4243
end
43-
f === nothing || return J(y, Xθ, θ)
44+
f === nothing || return J(y, Xθ, view_θ(glr, θ))
4445
end
4546
else
4647
# see comments above, same computations just no additional things for
@@ -91,7 +92,7 @@ function Hv!(glr::GLR{LogisticLoss,<:L2R}, X, y)
9192
mul!(Hvₐ, X', Xvₐ) # -- (X'ΛX)vₐ
9293
Hvₐ .+= λ .* vₐ .+ XtΛ1 .* vₑ
9394
# update for the last row -- (X'1)'v + n v[end]
94-
Hv[end] = dot(XtΛ1, vₐ) + (sum(w)+λ) * vₑ
95+
Hv[end] = dot(XtΛ1, vₐ) + (sum(w) + λ_if_penalize_intercept(glr, λ)) * vₑ
9596
end
9697
else
9798
(Hv, θ, v) -> begin
@@ -167,8 +168,9 @@ function fg!(glr::GLR{MultinomialLoss,<:L2R}, X, y)
167168
else
168169
mul!(G, X', R)
169170
end
170-
g .= reshape(G, (p+Int(glr.fit_intercept))*c)
171+
g .= reshape(G, (p + Int(glr.fit_intercept)) * c)
171172
g .+= λ .* θ
173+
glr.fit_intercept && (glr.penalize_intercept || (g[end] -= λ * θ[end]))
172174
end
173175
f === nothing || begin
174176
# we re-use pre-computations here, see also MultinomialLoss
@@ -186,7 +188,7 @@ function fg!(glr::GLR{MultinomialLoss,<:L2R}, X, y)
186188
@inbounds for i in eachindex(y)
187189
t += log(ss[i]) + ms[i] - P[i, y[i]]
188190
end
189-
return sum(t) + glr.penalty(θ)
191+
return sum(t) + glr.penalty(view_θ(glr, θ))
190192
end
191193
end
192194
end
@@ -218,6 +220,7 @@ function Hv!(glr::GLR{MultinomialLoss,<:L2R}, X, y)
218220
Hv .= reshape(Hv_mat, p * c)
219221
end
220222
Hv .+= λ .* v
223+
glr.fit_intercept && (glr.penalize_intercept || (Hv[end] -= λ * v[end]))
221224
end
222225
end
223226

src/glr/d_robust.jl

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ function fgh!(glr::GLR{RobustLoss{ρ},<:L2R}, X, y) where ρ <: RobustRho1P{δ}
4848
ψr .= ψ_.(r, w)
4949
apply_Xt!(g, X, ψr)
5050
g .+= λ .* θ
51+
glr.penalize_intercept || (g[end] -= λ * θ[end])
5152
end
5253
# Hessian via ϕ functiono
5354
H === nothing || begin
@@ -60,10 +61,10 @@ function fgh!(glr::GLR{RobustLoss{ρ},<:L2R}, X, y) where ρ <: RobustRho1P{δ}
6061
H[i, end] = H[end, i] = ΛXt1[i]
6162
end
6263
H[end, end] = sum(ϕr)
63-
add_λI!(H, λ)
64+
add_λI!(H, λ, glr.penalize_intercept)
6465
end
6566
# function value
66-
f === nothing || return glr.loss(r) + glr.penalty(θ)
67+
f === nothing || return glr.loss(r) + glr.penalty(view_θ(glr, θ))
6768
end
6869
else
6970
(f, g, H, θ) -> begin
@@ -112,7 +113,7 @@ function Hv!(glr::GLR{RobustLoss{ρ},<:L2R}, X, y) where ρ <: RobustRho1P{δ} w
112113
apply_Xt!(Hvₐ, X, t)
113114
Hvₐ .+= λ .* vₐ .+ XtΛ1 .* vₑ
114115
# update for the last row (intercept)
115-
Hv[end] = dot(XtΛ1, vₐ) + (sum(w)+λ) * vₑ
116+
Hv[end] = dot(XtΛ1, vₐ) + (sum(w) + λ_if_penalize_intercept(glr, λ)) * vₑ
116117
end
117118
else
118119
(Hv, θ, v) -> begin
@@ -163,7 +164,7 @@ function Mv!(glr::GLR{RobustLoss{ρ},<:L2R}, X, y;
163164
t .*= ωr
164165
mul!(Mvₐ, X', t)
165166
Mvₐ .+= λ .* vₐ .+ XtW1 .* vₑ
166-
Mv[end] = dot(XtW1, vₐ) + (sum(ωr)+λ) * vₑ
167+
Mv[end] = dot(XtW1, vₐ) + (sum(ωr) + λ_if_penalize_intercept(glr, λ)) * vₑ
167168
end
168169
else
169170
(Mv, v) -> begin
@@ -192,6 +193,7 @@ function smooth_fg!(glr::GLR{RobustLoss{ρ},<:ENR}, X, y) where ρ <: RobustRho1
192193
ψr .= ψ_.(r, w)
193194
apply_Xt!(g, X, ψr)
194195
g .+= λ .* θ
195-
return glr.loss(r) + get_l2(glr.penalty)(θ)
196+
glr.fit_intercept && (glr.penalize_intercept || (g[end] -= λ * θ[end]))
197+
return glr.loss(r) + get_l2(glr.penalty)(view_θ(glr, θ))
196198
end
197199
end

src/glr/prox.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,6 @@ function prox!(glr::GLR{<:Loss,<:Union{L1R,CompositePenalty}})
1919
γ = getscale_l1(glr.penalty)
2020
(p, α, z) -> begin
2121
p .= soft_thresh.(z, α * γ)
22+
glr.fit_intercept && (glr.penalize_intercept || (p[end] = z[end]))
2223
end
2324
end

src/glr/utils.jl

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,21 +10,24 @@ Return the objective function (sum of loss + penalty) of a Generalized Linear Mo
1010
"""
1111
objective(glr::GLR) = glr.loss + glr.penalty
1212

13+
1314
"""
1415
$SIGNATURES
1516
1617
Return a function computing the objective at a given point `θ`.
1718
Note that the [`apply_X`](@ref) takes care of a potential intercept.
1819
"""
19-
objective(glr::GLR, X, y; c::Int=1) = θ -> objective(glr)(y, apply_X(X, θ, c), θ)
20+
objective(glr::GLR, X, y; c::Int=1) =
21+
θ -> objective(glr)(y, apply_X(X, θ, c), view_θ(glr, θ))
2022

2123

2224
"""
2325
$SIGNATURES
2426
2527
Return a function computing the smooth part of the objective at a given point `θ`.
2628
"""
27-
smooth_objective(glr::GLR, X, y; c::Int=1) = θ -> smooth_objective(glr)(y, apply_X(X, θ, c), θ)
29+
smooth_objective(glr::GLR, X, y; c::Int=1) =
30+
θ -> smooth_objective(glr)(y, apply_X(X, θ, c), view_θ(glr, θ))
2831

2932
"""
3033
$SIGNATURES
@@ -39,7 +42,9 @@ $SIGNATURES
3942
4043
Return a model corresponding to the smooth part of the objective.
4144
"""
42-
get_smooth(glr::GLR) = (o = smooth_objective(glr); GLR(o.loss, o.penalty, glr.fit_intercept))
45+
get_smooth(glr::GLR) = (
46+
o = smooth_objective(glr);
47+
GLR(o.loss, o.penalty, glr.fit_intercept, glr.penalize_intercept))
4348

4449

4550
"""

src/utils.jl

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -175,11 +175,12 @@ $SIGNATURES
175175
176176
In place computation of `H = H + λI` where `H` is a square matrix.
177177
"""
178-
function add_λI!(H::Matrix, λ::Real)
178+
function add_λI!(H::Matrix, λ::Real, penalize_intercept::Bool=true)
179179
λ = convert(eltype(H), λ)
180-
@inbounds for i in 1:size(H, 1)
180+
@inbounds for i in 1:size(H, 1)-1
181181
H[i,i] += λ
182182
end
183+
H[end, end] += ifelse(penalize_intercept, λ, zero(eltype(H)))
183184
end
184185

185186

@@ -197,3 +198,22 @@ $SIGNATURES
197198
Threshold the number if its absolute value is too close to zero.
198199
"""
199200
clip(z, τ) = ifelse(abs(z) < τ, τ, z)
201+
202+
203+
"""
204+
$SIGNATURES
205+
206+
Return λ if penalize intercept otherwise 0, useful in computations of Hessian.
207+
"""
208+
λ_if_penalize_intercept(glr, λ) = ifelse(glr.penalize_intercept, λ, zero(λ))
209+
210+
"""
211+
$SIGNATURES
212+
213+
Return a view of θ if the last element should not be penalized.
214+
"""
215+
@inline function view_θ(glr, θ)
216+
f = glr.fit_intercept && !glr.penalize_intercept
217+
f && return view(θ, 1:length(θ)-1)
218+
θ
219+
end

0 commit comments

Comments
 (0)