JuliaAI
diff --git a/‎README.md‎
Lines changed: 4 additions & 0 deletions b/‎README.md‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎src/glr/constructors.jl‎
Lines changed: 42 additions & 23 deletions b/‎src/glr/constructors.jl‎
Lines changed: 42 additions & 23 deletions
diff --git a/‎src/glr/d_l2loss.jl‎
Lines changed: 5 additions & 3 deletions b/‎src/glr/d_l2loss.jl‎
Lines changed: 5 additions & 3 deletions
diff --git a/‎src/glr/d_logistic.jl‎
Lines changed: 13 additions & 10 deletions b/‎src/glr/d_logistic.jl‎
Lines changed: 13 additions & 10 deletions
diff --git a/‎src/glr/d_robust.jl‎
Lines changed: 7 additions & 5 deletions b/‎src/glr/d_robust.jl‎
Lines changed: 7 additions & 5 deletions
diff --git a/‎src/glr/prox.jl‎
Lines changed: 1 addition & 0 deletions b/‎src/glr/prox.jl‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/glr/utils.jl‎
Lines changed: 8 additions & 3 deletions b/‎src/glr/utils.jl‎
Lines changed: 8 additions & 3 deletions
diff --git a/‎src/utils.jl‎
Lines changed: 22 additions & 2 deletions b/‎src/utils.jl‎
Lines changed: 22 additions & 2 deletions
@@ -20,6 +20,9 @@ The core aims of this package are:
 - focus on performance including in "big data" settings exploiting packages such as [`Optim.jl`](https://github.com/JuliaNLSolvers/Optim.jl), [`IterativeSolvers.jl`](https://github.com/JuliaMath/IterativeSolvers.jl),
 - use a "machine learning" perspective, i.e.: focus essentially on prediction, hyper-parameters should be obtained via a data-driven procedure such as cross-validation.
 
+All models support fitting an intercept and let you choose whether the penalty is applied to the intercept (it is not applied by default).
+All models attempt to be efficient in terms of memory allocation to avoid unnecessary copies of the data.
+
 ## Implemented
 
 | Regressors          | Formulation¹           | Available solvers                 | Comments  |
@@ -63,6 +66,7 @@ Systematic timing benchmarks have not been run yet but it's planned (see [this i
 
 * The models are built and tested assuming `n > p`; if this doesn't hold, tricks should be employed to speed up computations; these have not been implemented yet.
 * CV-aware code not implemented yet (code that re-uses computations when fitting over a number of hyper-parameters);  "Meta" functionalities such as One-vs-All or Cross-Validation are left to other packages such as MLJ.
+* No support yet for sparse matrices.
 * Stochastic solvers have not yet been implemented.
 * All computations are assumed to be done in Float64.
 
 
@@ -24,9 +24,10 @@ Special cases include:
 """
 @with_kw mutable struct GeneralizedLinearRegression{L<:Loss, P<:Penalty}
     # Parameters that can be tuned
-    loss::L             = L2Loss()    # L(y, ŷ=Xθ)
-    penalty::P          = NoPenalty() # P(θ)
-    fit_intercept::Bool = true        # add intercept ? def=true
+    loss::L                  = L2Loss()    # L(y, ŷ=Xθ)
+    penalty::P               = NoPenalty() # P(θ)
+    fit_intercept::Bool      = true        # add intercept ? def=true
+    penalize_intercept::Bool = false
 end
 
 const GLR = GeneralizedLinearRegression
@@ -45,9 +46,12 @@ $SIGNATURES
 
 Objective function: ``|Xθ - y|₂²/2 + λ|θ|₂²/2``.
 """
-function RidgeRegression(λ::Real=1.0; lambda::Real=λ, fit_intercept::Bool=true)
+function RidgeRegression(λ::Real=1.0; lambda::Real=λ, fit_intercept::Bool=true,
+                         penalize_intercept::Bool=false)
     check_pos(lambda)
-    GLR(fit_intercept=fit_intercept, penalty=lambda*L2Penalty())
+    GLR(penalty=lambda*L2Penalty(),
+        fit_intercept=fit_intercept,
+        penalize_intercept=penalize_intercept)
 end
 
 
@@ -56,9 +60,12 @@ $SIGNATURES
 
 Objective function: ``|Xθ - y|₂²/2 + λ|θ|₁``
 """
-function LassoRegression(λ::Real=1.0; lambda::Real=λ, fit_intercept::Bool=true)
+function LassoRegression(λ::Real=1.0; lambda::Real=λ, fit_intercept::Bool=true,
+                         penalize_intercept::Bool=false)
     check_pos(lambda)
-    GLR(fit_intercept=fit_intercept, penalty=lambda*L1Penalty())
+    GLR(penalty=lambda*L1Penalty(),
+        fit_intercept=fit_intercept,
+        penalize_intercept=penalize_intercept)
 end
 
 
@@ -68,9 +75,11 @@ $SIGNATURES
 Objective function: ``|Xθ - y|₂²/2 + λ|θ|₂²/2 + γ|θ|₁``
 """
 function ElasticNetRegression(λ::Real=1.0, γ::Real=1.0; lambda::Real=λ, gamma::Real=γ,
-                              fit_intercept::Bool=true)
+                              fit_intercept::Bool=true, penalize_intercept::Bool=false)
     check_pos.((lambda, gamma))
-    GLR(fit_intercept=fit_intercept, penalty=lambda*L2Penalty()+gamma*L1Penalty())
+    GLR(penalty=lambda*L2Penalty()+gamma*L1Penalty(),
+        fit_intercept=fit_intercept,
+        penalize_intercept=penalize_intercept)
 end
 
 
@@ -104,11 +113,14 @@ binary case or the multinomial loss otherwise.
 """
 function LogisticRegression(λ::Real=1.0, γ::Real=0.0; lambda::Real=λ, gamma::Real=γ,
                             penalty::Symbol=iszero(gamma) ? :l2 : :en,
-                            multi_class::Bool=false,
-                            fit_intercept::Bool=true)
+                            multi_class::Bool=false, fit_intercept::Bool=true,
+                            penalize_intercept::Bool=false)
     penalty = _l1l2en(lambda, gamma, penalty, "Logistic regression")
     loss = multi_class ? MultinomialLoss() : LogisticLoss()
-    GeneralizedLinearRegression(loss=loss, penalty=penalty, fit_intercept=fit_intercept)
+    GLR(loss=loss,
+        penalty=penalty,
+        fit_intercept=fit_intercept,
+        penalize_intercept=penalize_intercept)
 end
 
 MultinomialRegression(a...; kwa...) = LogisticRegression(a...; multi_class=true, kwa...)
@@ -125,58 +137,65 @@ radius of the ball in which residuals are weighed quadratically).
 """
 function RobustRegression(ρ::RobustRho=HuberRho(0.1), λ::Real=1.0, γ::Real=0.0;
                           rho::RobustRho=ρ, lambda::Real=λ, gamma::Real=γ,
-                          penalty::Symbol=iszero(gamma) ? :l2 : :en, fit_intercept::Bool=true)
+                          penalty::Symbol=iszero(gamma) ? :l2 : :en, fit_intercept::Bool=true,
+                          penalize_intercept::Bool=false)
     penalty = _l1l2en(lambda, gamma, penalty, "Robust regression")
-    GLR(fit_intercept=fit_intercept, loss=RobustLoss(rho), penalty=penalty)
+    GLR(loss=RobustLoss(rho),
+        penalty=penalty,
+        fit_intercept=fit_intercept,
+        penalize_intercept=penalize_intercept)
 end
 
 """
 $SIGNATURES
 
 Huber Regression with objective:
 
-``∑ρ(Xθ - y) + λ|θ|₂²/2``
+``∑ρ(Xθ - y) + λ|θ|₂²/2 + γ|θ|₁``
 
 Where `ρ` is the Huber function `ρ(r) = r²/2` if `|r|≤δ` and `ρ(r)=δ(|r|-δ/2)` otherwise.
 """
 function HuberRegression(δ::Real=0.5, λ::Real=1.0, γ::Real=0.0;
                          delta::Real=δ, lambda::Real=λ, gamma::Real=γ,
                          penalty::Symbol=iszero(gamma) ? :l2 : :en,
-                         fit_intercept::Bool=true)
+                         fit_intercept::Bool=true, penalize_intercept::Bool=false)
     return RobustRegression(HuberRho(delta), lambda, gamma;
-                            penalty=penalty, fit_intercept=fit_intercept)
+                            penalty=penalty, fit_intercept=fit_intercept,
+                            penalize_intercept=penalize_intercept)
 end
 
 """
 $SIGNATURES
 
 Quantile Regression with objective:
 
-``∑ρ(Xθ - y) + λ|θ|₂²/2``
+``∑ρ(Xθ - y) + λ|θ|₂²/2 + γ|θ|₁``
 
 Where `ρ` is the check function `ρ(r) = r(δ - 1(r < 0))`.
 """
 function QuantileRegression(δ::Real=0.5, λ::Real=1.0, γ::Real=0.0;
                             delta::Real=δ, lambda::Real=λ, gamma::Real=γ,
                             penalty::Symbol=iszero(gamma) ? :l2 : :en,
-                            fit_intercept::Bool=true)
+                            fit_intercept::Bool=true, penalize_intercept::Bool=false)
     return RobustRegression(QuantileRho(delta), lambda, gamma;
-                            penalty=penalty, fit_intercept=fit_intercept)
+                            penalty=penalty, fit_intercept=fit_intercept,
+                            penalize_intercept=penalize_intercept)
 end
 
 """
 $SIGNATURES
 
 Least Absolute Deviation regression with objective:
 
-``|Xθ - y|₁ + λ|θ|₂²/2``
+``|Xθ - y|₁ + λ|θ|₂²/2 + γ|θ|₁``
 
 This is a specific type of Quantile Regression with `δ=0.5` (median).
 """
 function LADRegression(λ::Real=1.0, γ::Real=0.0;
                        lambda::Real=λ, gamma::Real=γ,
                        penalty::Symbol=iszero(gamma) ? :l2 : :en,
-                       fit_intercept::Bool=true)
+                       fit_intercept::Bool=true, penalize_intercept::Bool=false)
     return QuantileRegression(0.5, lambda, gamma;
-                              penalty=penalty, fit_intercept=fit_intercept)
+                              penalty=penalty, fit_intercept=fit_intercept,
+                              penalize_intercept=penalize_intercept)
 end
@@ -18,7 +18,8 @@ function Hv!(glr::GLR{L2Loss,<:L2R}, X, y)
     if glr.fit_intercept
         # H = [X 1]'[X 1] + λ I
         # rows a 1:p = [X'X + λI | X'1]
-        # row  e end = [1'X      | n+λ]
+        # row  e end = [1'X      | n+λι] where ι is 1 if glr.penalize_intercept
+        ι = float(glr.penalize_intercept)
         (Hv, v) -> begin
             # view on the first p rows
             a   = 1:p
@@ -33,7 +34,7 @@ function Hv!(glr::GLR{L2Loss,<:L2R}, X, y)
             mul!(Hvₐ, X', Xvₐ)
             Hvₐ .+= λ .* vₐ .+ Xt1 .* vₑ
             # update for the last row       -- (X'1)'v + n v[end]
-            Hv[end] = dot(Xt1, vₐ) + (n+λ) * vₑ
+            Hv[end] = dot(Xt1, vₐ) + (n + λ_if_penalize_intercept(glr, λ)) * vₑ
         end
     else
         (Hv, v) -> begin
@@ -64,6 +65,7 @@ function smooth_fg!(glr::GLR{L2Loss,<:ENR}, X, y)
         get_residuals!(r, X, θ, y) # -- r = Xθ-y
         apply_Xt!(g, X, r)
         g .+= λ .* θ
-        return glr.loss(r) + get_l2(glr.penalty)(θ)
+        glr.fit_intercept && (glr.penalize_intercept || (g[end] -= λ * θ[end]))
+        return glr.loss(r) + get_l2(glr.penalty)(view_θ(glr, θ))
     end
 end
@@ -25,22 +25,23 @@ function fgh!(glr::GLR{LogisticLoss,<:L2R}, X, y)
                 t .= y .* (w .- 1.0)                 # -- t = y .* (w .- 1.0)
                 apply_Xt!(g, X, t)                   # -- g = X't
                 g .+= λ .* θ
+                glr.penalize_intercept || (g[end] -= λ * θ[end])
             end
             H === nothing || begin
                 # NOTE: we could try to be clever to reduce the allocations for
                 # ΛX but computing the full hessian allocates a lot anyway so
                 # probably not really worth it
-                ΛX = w .* X                         # !! big allocs
-                mul!(view(H, 1:p, 1:p), X', ΛX)     # -- H[1:p,1:p] = X'ΛX
+                ΛX = w .* X                           # !! big allocs
+                mul!(view(H, 1:p, 1:p), X', ΛX)       # -- H[1:p,1:p] = X'ΛX
                 ΛXt1 = view(SCRATCH_P[], 1:p)
-                copyto!(ΛXt1, sum(ΛX, dims=1))      # -- (ΛX)'1
+                copyto!(ΛXt1, sum(ΛX, dims=1))        # -- (ΛX)'1
                 @inbounds for i = 1:p
-                    H[i, end] = H[end, i] = ΛXt1[i] # -- H[:,p+1] = H[p+1,:] = (ΛX)'1
+                    H[i, end] = H[end, i] = ΛXt1[i]   # -- H[:,p+1] = H[p+1,:] = (ΛX)'1
                 end
-                H[end, end] = sum(w)                # -- 1'Λ1'
-                add_λI!(H, λ)                       # -- H = X'ΛX + λI
+                H[end, end] = sum(w)                  # -- 1'Λ1'
+                add_λI!(H, λ, glr.penalize_intercept) # -- H = X'ΛX + λI
             end
-            f === nothing || return J(y, Xθ, θ)
+            f === nothing || return J(y, Xθ, view_θ(glr, θ))
         end
     else
         # see comments above, same computations just no additional things for
@@ -91,7 +92,7 @@ function Hv!(glr::GLR{LogisticLoss,<:L2R}, X, y)
             mul!(Hvₐ, X', Xvₐ)                       # -- (X'ΛX)vₐ
             Hvₐ .+= λ .* vₐ .+ XtΛ1 .* vₑ
             # update for the last row -- (X'1)'v + n v[end]
-            Hv[end] = dot(XtΛ1, vₐ) + (sum(w)+λ) * vₑ
+            Hv[end] = dot(XtΛ1, vₐ) + (sum(w) + λ_if_penalize_intercept(glr, λ)) * vₑ
         end
     else
         (Hv, θ, v) -> begin
@@ -167,8 +168,9 @@ function fg!(glr::GLR{MultinomialLoss,<:L2R}, X, y)
             else
                 mul!(G, X', R)
             end
-            g  .= reshape(G, (p+Int(glr.fit_intercept))*c)
+            g  .= reshape(G, (p + Int(glr.fit_intercept)) * c)
             g .+= λ .* θ
+            glr.fit_intercept && (glr.penalize_intercept || (g[end] -= λ * θ[end]))
         end
         f === nothing || begin
             # we re-use pre-computations here, see also MultinomialLoss
@@ -186,7 +188,7 @@ function fg!(glr::GLR{MultinomialLoss,<:L2R}, X, y)
             @inbounds for i in eachindex(y)
                 t += log(ss[i]) + ms[i] - P[i, y[i]]
             end
-            return sum(t) + glr.penalty(θ)
+            return sum(t) + glr.penalty(view_θ(glr, θ))
         end
     end
 end
@@ -218,6 +220,7 @@ function Hv!(glr::GLR{MultinomialLoss,<:L2R}, X, y)
             Hv .= reshape(Hv_mat, p * c)
         end
         Hv .+= λ .* v
+        glr.fit_intercept && (glr.penalize_intercept || (Hv[end] -= λ * v[end]))
     end
 end
 
 
@@ -48,6 +48,7 @@ function fgh!(glr::GLR{RobustLoss{ρ},<:L2R}, X, y) where ρ <: RobustRho1P{δ}
                 ψr .= ψ_.(r, w)
                 apply_Xt!(g, X, ψr)
                 g .+= λ .* θ
+                glr.penalize_intercept || (g[end] -= λ * θ[end])
             end
             # Hessian via the ϕ function
             H === nothing || begin
@@ -60,10 +61,10 @@ function fgh!(glr::GLR{RobustLoss{ρ},<:L2R}, X, y) where ρ <: RobustRho1P{δ}
                     H[i, end] = H[end, i] = ΛXt1[i]
                 end
                 H[end, end] = sum(ϕr)
-                add_λI!(H, λ)
+                add_λI!(H, λ, glr.penalize_intercept)
             end
             # function value
-            f === nothing || return glr.loss(r) + glr.penalty(θ)
+            f === nothing || return glr.loss(r) + glr.penalty(view_θ(glr, θ))
         end
     else
         (f, g, H, θ) -> begin
@@ -112,7 +113,7 @@ function Hv!(glr::GLR{RobustLoss{ρ},<:L2R}, X, y) where ρ <: RobustRho1P{δ} w
             apply_Xt!(Hvₐ, X, t)
             Hvₐ .+= λ .* vₐ .+ XtΛ1 .* vₑ
             # update for the last row (intercept)
-            Hv[end] = dot(XtΛ1, vₐ) + (sum(w)+λ) * vₑ
+            Hv[end] = dot(XtΛ1, vₐ) + (sum(w) + λ_if_penalize_intercept(glr, λ)) * vₑ
         end
     else
         (Hv, θ, v) -> begin
@@ -163,7 +164,7 @@ function Mv!(glr::GLR{RobustLoss{ρ},<:L2R}, X, y;
                 t .*= ωr
                 mul!(Mvₐ, X', t)
                 Mvₐ .+= λ .* vₐ .+ XtW1 .* vₑ
-                Mv[end] = dot(XtW1, vₐ) + (sum(ωr)+λ) * vₑ
+                Mv[end] = dot(XtW1, vₐ) + (sum(ωr) + λ_if_penalize_intercept(glr, λ)) * vₑ
             end
         else
             (Mv, v) -> begin
@@ -192,6 +193,7 @@ function smooth_fg!(glr::GLR{RobustLoss{ρ},<:ENR}, X, y) where ρ <: RobustRho1
         ψr .= ψ_.(r, w)
         apply_Xt!(g, X, ψr)
         g .+= λ .* θ
-        return glr.loss(r) + get_l2(glr.penalty)(θ)
+        glr.fit_intercept && (glr.penalize_intercept || (g[end] -= λ * θ[end]))
+        return glr.loss(r) + get_l2(glr.penalty)(view_θ(glr, θ))
     end
 end
@@ -19,5 +19,6 @@ function prox!(glr::GLR{<:Loss,<:Union{L1R,CompositePenalty}})
     γ = getscale_l1(glr.penalty)
     (p, α, z) -> begin
         p .= soft_thresh.(z, α * γ)
+        glr.fit_intercept && (glr.penalize_intercept || (p[end] = z[end]))
     end
 end
@@ -10,21 +10,24 @@ Return the objective function (sum of loss + penalty) of a Generalized Linear Mo
 """
 objective(glr::GLR) = glr.loss + glr.penalty
 
+
 """
 $SIGNATURES
 
 Return a function computing the objective at a given point `θ`.
 Note that the [`apply_X`](@ref) takes care of a potential intercept.
 """
-objective(glr::GLR, X, y; c::Int=1) = θ -> objective(glr)(y, apply_X(X, θ, c), θ)
+objective(glr::GLR, X, y; c::Int=1) =
+    θ -> objective(glr)(y, apply_X(X, θ, c), view_θ(glr, θ))
 
 
 """
 $SIGNATURES
 
 Return a function computing the smooth part of the objective at a given point `θ`.
 """
-smooth_objective(glr::GLR, X, y; c::Int=1) = θ -> smooth_objective(glr)(y, apply_X(X, θ, c), θ)
+smooth_objective(glr::GLR, X, y; c::Int=1) =
+    θ -> smooth_objective(glr)(y, apply_X(X, θ, c), view_θ(glr, θ))
 
 """
 $SIGNATURES
@@ -39,7 +42,9 @@ $SIGNATURES
 
 Return a model corresponding to the smooth part of the objective.
 """
-get_smooth(glr::GLR) = (o = smooth_objective(glr); GLR(o.loss, o.penalty, glr.fit_intercept))
+get_smooth(glr::GLR) = (
+    o = smooth_objective(glr);
+    GLR(o.loss, o.penalty, glr.fit_intercept, glr.penalize_intercept))
 
 
 """
 
@@ -175,11 +175,12 @@ $SIGNATURES
 
 In place computation of `H = H + λI` where `H` is a square matrix; when `penalize_intercept` is `false`, the last diagonal entry `H[end, end]` is left unchanged.
 """
-function add_λI!(H::Matrix, λ::Real)
+function add_λI!(H::Matrix, λ::Real, penalize_intercept::Bool=true)
 	λ = convert(eltype(H), λ)
-	@inbounds for i in 1:size(H, 1)
+	@inbounds for i in 1:size(H, 1)-1
 		H[i,i] += λ
 	end
+	H[end, end] += ifelse(penalize_intercept, λ, zero(eltype(H)))
 end
 
 
@@ -197,3 +198,22 @@ $SIGNATURES
 Threshold the number if its absolute value is too close to zero.
 """
 clip(z, τ) = ifelse(abs(z) < τ, τ, z)
+
+
+"""
+$SIGNATURES
+
+Return `λ` if the intercept is penalized (`glr.penalize_intercept` is true) and `zero(λ)` otherwise; useful in Hessian computations.
+"""
+λ_if_penalize_intercept(glr, λ) = ifelse(glr.penalize_intercept, λ, zero(λ))
+
+"""
+$SIGNATURES
+
+Return a view of `θ` without its last element when an intercept is fitted but not penalized; otherwise return `θ` itself.
+"""
+@inline function view_θ(glr, θ)
+	f = glr.fit_intercept && !glr.penalize_intercept
+	f && return view(θ, 1:length(θ)-1)
+	θ
+end