
Commit 7ad7b24

whitespace fixes

1 parent 9b885c3 commit 7ad7b24

14 files changed, +160 -107 lines changed

src/fit/analytical.jl

Lines changed: 8 additions & 4 deletions
@@ -3,14 +3,17 @@
 """
 $SIGNATURES

-Fit a least square regression either with no penalty (OLS) or with a L2 penalty (Ridge).
+Fit a least square regression either with no penalty (OLS) or with a L2 penalty
+(Ridge).

 ## Complexity

 Assuming `n` dominates `p`,

-* non-iterative (full solve): O(np²) - dominated by the construction of the Hessian X'X.
-* iterative (conjugate gradient): O(κnp) - with κ the number of CG steps (κ ≤ p).
+* non-iterative (full solve): O(np²) - dominated by the construction of the
+  Hessian X'X.
+* iterative (conjugate gradient): O(κnp) - with κ the number of CG steps
+  (κ ≤ p).
 """
 function _fit(glr::GLR{L2Loss,<:L2R}, solver::Analytical, X, y)
     # full solve
@@ -34,7 +37,8 @@ function _fit(glr::GLR{L2Loss,<:L2R}, solver::Analytical, X, y)
     p = size(X, 2) + Int(glr.fit_intercept)
     max_cg_steps = min(solver.max_inner, p)
     # Form the Hessian map, cost of application H*v is O(np)
-    Hm = LinearMap(Hv!(glr, X, y), p; ismutating=true, isposdef=true, issymmetric=true)
+    Hm = LinearMap(Hv!(glr, X, y), p;
+                   ismutating=true, isposdef=true, issymmetric=true)
     b = X'y
     glr.fit_intercept && (b = vcat(b, sum(y)))
     return cg(Hm, b; maxiter=max_cg_steps)

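For context on the two code paths described in the docstring above, here is a minimal, self-contained sketch of the non-iterative ridge solve, with illustrative names and no intercept column (it is not the package's `_fit`; the iterative path instead applies the Hessian through a LinearMap and calls `cg`, as in the hunk above):

    using LinearAlgebra

    # Toy "full solve": form the Hessian X'X + λI explicitly (O(np²)) and solve
    # the normal equations; cheap whenever p ≪ n.
    function ridge_full_solve(X, y, λ)
        H = X'X + λ * I
        return H \ (X'y)
    end

    X, y = randn(200, 5), randn(200)
    θ = ridge_full_solve(X, y, 0.1)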
src/fit/default.jl

Lines changed: 5 additions & 4 deletions
@@ -2,8 +2,9 @@ export fit

 # Default solvers

-# TODO: in the future, have cases where if the things are too big, take another default.
-# also should check if p > n in which case should do dual stuff (or other appropriate alternative)
+# TODO: in the future, have cases where if the things are too big, take another
+# default. Also should check if p > n in which case should do dual stuff (or
+# other appropriate alternative)

 # Linear, Ridge
 _solver(::GLR{L2Loss,<:L2R}, np::NTuple{2,Int}) = Analytical()
@@ -21,8 +22,8 @@ end
 # Robust, Quantile
 _solver(::GLR{<:RobustLoss,<:L2R}, np::NTuple{2,Int}) = LBFGS()

-# Fallback NOTE: should revisit bc with non-smooth, wouldn't work probably PGD/PSGD
-# depending on how much data there is
+# Fallback NOTE: should revisit bc with non-smooth, wouldn't work probably
+# PGD/PSGD depending on how much data there is
 _solver(::GLR, np::NTuple{2,Int}) = @error "Not yet implemented."


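The `_solver` methods above map a model type and the data size `np = (n, p)` to a default solver. A self-contained sketch of that dispatch pattern, with stand-in type names rather than the package's exports:

    abstract type Solver end
    struct Analytical <: Solver end
    struct LBFGS <: Solver end

    struct Ridge end       # stand-in for GLR{L2Loss,<:L2R}
    struct Logistic end    # stand-in for GLR{LogisticLoss,<:L2R}

    # one method per model family; `np` is available for future size-dependent
    # choices, as the TODO above suggests
    default_solver(::Ridge, np::NTuple{2,Int}) = Analytical()
    default_solver(::Logistic, np::NTuple{2,Int}) = LBFGS()

    default_solver(Ridge(), (1000, 10))     # Analytical()
    default_solver(Logistic(), (1000, 10))  # LBFGS()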
src/fit/iwls.jl

Lines changed: 4 additions & 2 deletions
@@ -16,7 +16,8 @@ function _fit(glr::GLR{RobustLoss{ρ},<:L2R}, solver::IWLSCG, X, y) where {ρ}
         # update the weights and retrieve the application function
         # Mθv! corresponds to the current application of (X'WX + λI) on v
         Mθv! = _Mv!(ω, θ)
-        Mm = LinearMap(Mθv!, p; ismutating=true, isposdef=true, issymmetric=true)
+        Mm = LinearMap(Mθv!, p;
+                       ismutating=true, isposdef=true, issymmetric=true)
         Wy = ω .* y
         b = X'Wy
         if glr.fit_intercept
@@ -30,6 +31,7 @@ function _fit(glr::GLR{RobustLoss{ρ},<:L2R}, solver::IWLSCG, X, y) where {ρ}
         copyto!(θ_, θ)
         k += 1
     end
-    tol ≤ solver.tol || @warn "IWLS did not converge in $(solver.max_iter) iterations."
+    tol ≤ solver.tol ||
+        @warn "IWLS did not converge in $(solver.max_iter) iterations."
     return θ
 end

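The `IWLSCG` solver in this file reweights the residuals at each iteration and solves the resulting weighted ridge system with CG. A toy dense IRLS sketch for the Huber case, with a direct solve in place of CG and no intercept or penalty (illustrative only, names are not the package's):

    using LinearAlgebra

    function irls_huber(X, y; δ=0.5, max_iter=100, tol=1e-8)
        θ = X \ y                                               # OLS start
        for _ in 1:max_iter
            r = X * θ .- y
            ω = [abs(rᵢ) <= δ ? 1.0 : δ / abs(rᵢ) for rᵢ in r]  # Huber weights
            θ_new = (X' * (ω .* X)) \ (X' * (ω .* y))           # weighted LS step
            norm(θ_new .- θ) / norm(θ_new) < tol && return θ_new
            θ = θ_new
        end
        return θ   # may not have converged, mirroring the warning above
    end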
src/fit/newton.jl

Lines changed: 25 additions & 19 deletions
@@ -9,10 +9,11 @@ Fit a GLR using Newton's method.

 ## Complexity

-Assuming `n` dominates `p`, O(κnp²), dominated by the construction of the Hessian at each step with
-κ the number of Newton steps.
+Assuming `n` dominates `p`, O(κnp²), dominated by the construction of the
+Hessian at each step with κ the number of Newton steps.
 """
-function _fit(glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R}, solver::Newton, X, y)
+function _fit(glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R},
+              solver::Newton, X, y)
     p = size(X, 2) + Int(glr.fit_intercept)
     θ₀ = zeros(p)
     _fgh! = fgh!(glr, X, y)
@@ -24,20 +25,21 @@ end
 """
 $SIGNATURES

-Fit a GLR using Newton's method combined with an iterative solver (conjugate gradient) to solve
-the Newton steps (∇²f)⁻¹∇f.
+Fit a GLR using Newton's method combined with an iterative solver (conjugate
+gradient) to solve the Newton steps (∇²f)⁻¹∇f.

 ## Complexity

-Assuming `n` dominates `p`, O(κ₁κ₂np), dominated by the application of the Hessian at each step
-where κ₁ is the number of Newton steps and κ₂ is the average number of CG steps per Newton step
-(which is at most p).
+Assuming `n` dominates `p`, O(κ₁κ₂np), dominated by the application of the
+Hessian at each step where κ₁ is the number of Newton steps and κ₂ is the
+average number of CG steps per Newton step (which is at most p).
 """
-function _fit(glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R}, solver::NewtonCG, X, y)
+function _fit(glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R},
+              solver::NewtonCG, X, y)
     p = size(X, 2) + Int(glr.fit_intercept)
     θ₀ = zeros(p)
     _f = objective(glr, X, y)
-    _fg! = (g, θ) -> fgh!(glr, X, y)(0.0, g, nothing, θ) # XXX: Optim.jl/issues/738
+    _fg! = (g, θ) -> fgh!(glr, X, y)(0.0, g, nothing, θ) # Optim.jl/issues/738
     _Hv! = Hv!(glr, X, y)
     opt = Optim.TwiceDifferentiableHV(_f, _fg!, _Hv!, θ₀)
     res = Optim.optimize(opt, θ₀, Optim.KrylovTrustRegion())
@@ -51,10 +53,11 @@ Fit a GLR using LBFGS.

 ## Complexity

-Assuming `n` dominates `p`, O(κnp), dominated by the computation of the gradient at each step with
-κ the number of LBFGS steps.
+Assuming `n` dominates `p`, O(κnp), dominated by the computation of the
+gradient at each step with κ the number of LBFGS steps.
 """
-function _fit(glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R}, solver::LBFGS, X, y)
+function _fit(glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R},
+              solver::LBFGS, X, y)
     p = size(X, 2) + Int(glr.fit_intercept)
     θ₀ = zeros(p)
     _fg! = (f, g, θ) -> fgh!(glr, X, y)(f, g, nothing, θ)
@@ -69,13 +72,15 @@ end
 """
 $SIGNATURES

-Fit a multiclass GLR using Newton's method with an iterative solver (conjugate gradient).
+Fit a multiclass GLR using Newton's method with an iterative solver (conjugate
+gradient).

 ## Complexity

-Assuming `n` dominates `p`, O(κ₁κ₂npc), where `c` is the number of classes. The computations are
-dominated by the application of the Hessian at each step with κ₁ the number of Newton steps and κ₂
-the average number of CG steps per Newton step.
+Assuming `n` dominates `p`, O(κ₁κ₂npc), where `c` is the number of classes. The
+computations are dominated by the application of the Hessian at each step with
+κ₁ the number of Newton steps and κ₂ the average number of CG steps per Newton
+step.
 """
 function _fit(glr::GLR{MultinomialLoss,<:L2R}, solver::NewtonCG, X, y)
     p = size(X, 2) + Int(glr.fit_intercept)
@@ -96,8 +101,9 @@ Fit a multiclass GLR using LBFGS.

 ## Complexity

-Assuming `n` dominates `p`, O(κnpc), with `c` the number of classes, dominated by the computation
-of the gradient at each step with κ the number of LBFGS steps.
+Assuming `n` dominates `p`, O(κnpc), with `c` the number of classes, dominated
+by the computation of the gradient at each step with κ the number of LBFGS
+steps.
 """
 function _fit(glr::GLR{MultinomialLoss,<:L2R}, solver::LBFGS, X, y)
     p = size(X, 2) + Int(glr.fit_intercept)

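To make the complexity notes in these docstrings concrete, here is a bare-bones Newton iteration for ℓ2-regularised logistic regression with labels in {-1, 1} (a sketch under those assumptions, not the package's `_fit`, which delegates step control to Optim.jl via `fgh!`). `NewtonCG` replaces the O(p³) direct solve below with CG, paying only O(np) per Hessian application.

    using LinearAlgebra

    σ(t) = 1 / (1 + exp(-t))

    function newton_logreg(X, y, λ; steps=20)
        θ = zeros(size(X, 2))
        for _ in 1:steps
            w = σ.(X * θ)                              # P(y = 1 | x)
            g = X' * (w .- (y .+ 1) ./ 2) .+ λ .* θ    # gradient
            H = X' * ((w .* (1 .- w)) .* X) + λ * I    # O(np²): builds the Hessian
            θ -= H \ g                                 # one full Newton step
        end
        return θ
    end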
src/fit/proxgrad.jl

Lines changed: 6 additions & 3 deletions
@@ -22,7 +22,8 @@ function _fit(glr::GLR, solver::ProxGrad, X, y)
     _f = smooth_objective(glr, X, y; c=c)
     _fg! = smooth_fg!(glr, X, y)
     _prox! = prox!(glr)
-    bt_cond = θ̂ -> _f(θ̂) > fθ̄ + dot(θ̂ .- θ̄, ∇fθ̄) + sum(abs2.(θ̂ .- θ̄))/(2η)
+    bt_cond = θ̂ ->
+        _f(θ̂) > fθ̄ + dot(θ̂ .- θ̄, ∇fθ̄) + sum(abs2.(θ̂ .- θ̄)) / (2η)
     # loop-related
     k, tol = 1, Inf
     while k ≤ solver.max_iter && tol > solver.tol
@@ -46,7 +47,8 @@ function _fit(glr::GLR, solver::ProxGrad, X, y)
             inner += 1
         end
         if inner == solver.max_inner
-            @warn "No appropriate stepsize found via backtracking; interrupting."
+            @warn "No appropriate stepsize found via backtracking; " *
+                  "interrupting."
             break
         end
         # update caches
@@ -59,6 +61,7 @@ function _fit(glr::GLR, solver::ProxGrad, X, y)
         # update niter
         k += 1
     end
-    tol ≤ solver.tol || @warn "Proximal GD did not converge in $(solver.max_iter) iterations."
+    tol ≤ solver.tol || @warn "Proximal GD did not converge in " *
+                              "$(solver.max_iter) iterations."
     return θ
 end

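The `bt_cond` closure above is the standard quadratic upper-bound test used by backtracking proximal gradient methods. A self-contained ISTA sketch for the lasso using the same test (illustrative; the package's `ProxGrad` is generic over the prox and also supports FISTA-style acceleration via `accel`):

    using LinearAlgebra

    soft(θ, t) = sign.(θ) .* max.(abs.(θ) .- t, 0.0)   # prox of t‖·‖₁

    function ista(X, y, γ; η=1.0, β=0.8, max_iter=500, tol=1e-6)
        θ = zeros(size(X, 2))
        f(θ) = 0.5 * sum(abs2, X * θ .- y)              # smooth part
        for _ in 1:max_iter
            g = X' * (X * θ .- y)
            θ̂ = soft(θ .- η .* g, η * γ)
            # shrink η until f(θ̂) ≤ f(θ) + ⟨θ̂ - θ, ∇f(θ)⟩ + ‖θ̂ - θ‖² / (2η)
            while f(θ̂) > f(θ) + dot(θ̂ .- θ, g) + sum(abs2, θ̂ .- θ) / (2η)
                η *= β
                θ̂ = soft(θ .- η .* g, η * γ)
            end
            norm(θ̂ .- θ) / (norm(θ̂) + eps()) < tol && return θ̂
            θ = θ̂
        end
        return θ
    end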
src/fit/solvers.jl

Lines changed: 2 additions & 2 deletions
@@ -38,7 +38,7 @@ struct LBFGS <: Solver end
 @with_kw struct ProxGrad <: Solver
     accel::Bool = false   # use Nesterov style acceleration (see also FISTA)
     max_iter::Int = 1000  # max number of overall iterations
-    tol::Float64 = 1e-4   # tolerance over relative change of θ i.e. norm(θ-θ_)/norm(θ)
+    tol::Float64 = 1e-4   # tol relative change of θ i.e. norm(θ-θ_)/norm(θ)
     max_inner::Int = 100  # β^max_inner should be > 1e-10
     beta::Float64 = 0.8   # in (0, 1); shrinkage in the backtracking step
 end
@@ -53,7 +53,7 @@ ISTA(; kwa...) = ProxGrad(;accel = false, kwa...)
     max_inner::Int = 200
     tol::Float64 = 1e-4
     damping::Float64 = 1.0    # should be between 0 and 1, 1 = trust iterates
-    threshold::Float64 = 1e-6 # threshold for the residuals used for instance in quantile reg
+    threshold::Float64 = 1e-6 # thresh for residuals; used eg in quantile reg
 end

 # ===================== admm.jl

src/glr/constructors.jl

Lines changed: 21 additions & 13 deletions
@@ -6,14 +6,14 @@ export GeneralizedLinearRegression, GLR,
     RobustRegression, HuberRegression, QuantileRegression

 """
-GeneralizedLinearRegression{L<:Loss, P<:Penalty}
+    GeneralizedLinearRegression{L<:Loss, P<:Penalty}

 Generalized Linear Regression (GLR) model with objective function:

 ``L(y, Xθ) + P(θ)``

-where `L` is a loss function, `P` a penalty, `y` is the vector of observed response, `X` is
-the feature matrix and `θ` the vector of parameters.
+where `L` is a loss function, `P` a penalty, `y` is the vector of observed
+response, `X` is the feature matrix and `θ` the vector of parameters.

 Special cases include:

@@ -74,8 +74,10 @@ $SIGNATURES

 Objective function: ``|Xθ - y|₂²/2 + λ|θ|₂²/2 + γ|θ|₁``.
 """
-function ElasticNetRegression(λ::Real=1.0, γ::Real=1.0; lambda::Real=λ, gamma::Real=γ,
-                              fit_intercept::Bool=true, penalize_intercept::Bool=false)
+function ElasticNetRegression(λ::Real=1.0, γ::Real=1.0;
+                              lambda::Real=λ, gamma::Real=γ,
+                              fit_intercept::Bool=true,
+                              penalize_intercept::Bool=false)
     check_pos.((lambda, gamma))
     GLR(penalty=lambda*L2Penalty()+gamma*L1Penalty(),
         fit_intercept=fit_intercept,
@@ -108,10 +110,11 @@ end
 """
 $SIGNATURES

-Objective function: ``L(y, Xθ) + λ|θ|₂²/2 + γ|θ|₁`` where `L` is either the logistic loss in the
-binary case or the multinomial loss otherwise.
+Objective function: ``L(y, Xθ) + λ|θ|₂²/2 + γ|θ|₁`` where `L` is either the
+logistic loss in the binary case or the multinomial loss otherwise.
 """
-function LogisticRegression(λ::Real=1.0, γ::Real=0.0; lambda::Real=λ, gamma::Real=γ,
+function LogisticRegression(λ::Real=1.0, γ::Real=0.0;
+                            lambda::Real=λ, gamma::Real=γ,
                             penalty::Symbol=iszero(gamma) ? :l2 : :en,
                             multi_class::Bool=false, fit_intercept::Bool=true,
                             penalize_intercept::Bool=false)
@@ -131,11 +134,13 @@ MultinomialRegression(a...; kwa...) = LogisticRegression(a...; multi_class=true,
 """
 $SIGNATURES

-Objective function: ``∑ρ(Xθ - y) + λ|θ|₂² + γ|θ|₁`` where ρ is a given function on the residuals.
+Objective function: ``∑ρ(Xθ - y) + λ|θ|₂² + γ|θ|₁`` where ρ is a given function
+on the residuals.
 """
 function RobustRegression(ρ::RobustRho=HuberRho(0.1), λ::Real=1.0, γ::Real=0.0;
                           rho::RobustRho=ρ, lambda::Real=λ, gamma::Real=γ,
-                          penalty::Symbol=iszero(gamma) ? :l2 : :en, fit_intercept::Bool=true,
+                          penalty::Symbol=iszero(gamma) ? :l2 : :en,
+                          fit_intercept::Bool=true,
                           penalize_intercept::Bool=false)
     penalty = _l1l2en(lambda, gamma, penalty, "Robust regression")
     GLR(loss=RobustLoss(rho),
@@ -151,12 +156,14 @@ Huber Regression with objective:

 ``∑ρ(Xθ - y) + λ|θ|₂²/2 + γ|θ|₁``

-Where `ρ` is the Huber function `ρ(r) = r²/2` if `|r|≤δ` and `ρ(r)=δ(|r|-δ/2)` otherwise.
+Where `ρ` is the Huber function `ρ(r) = r²/2` if `|r|≤δ` and
+`ρ(r)=δ(|r|-δ/2)` otherwise.
 """
 function HuberRegression(δ::Real=0.5, λ::Real=1.0, γ::Real=0.0;
                          delta::Real=δ, lambda::Real=λ, gamma::Real=γ,
                          penalty::Symbol=iszero(gamma) ? :l2 : :en,
-                         fit_intercept::Bool=true, penalize_intercept::Bool=false)
+                         fit_intercept::Bool=true,
+                         penalize_intercept::Bool=false)
     return RobustRegression(HuberRho(delta), lambda, gamma;
                             penalty=penalty, fit_intercept=fit_intercept,
                             penalize_intercept=penalize_intercept)
@@ -174,7 +181,8 @@ Where `ρ` is the check function `ρ(r) = r(δ - 1(r < 0))`.
 function QuantileRegression(δ::Real=0.5, λ::Real=1.0, γ::Real=0.0;
                             delta::Real=δ, lambda::Real=λ, gamma::Real=γ,
                             penalty::Symbol=iszero(gamma) ? :l2 : :en,
-                            fit_intercept::Bool=true, penalize_intercept::Bool=false)
+                            fit_intercept::Bool=true,
+                            penalize_intercept::Bool=false)
     return RobustRegression(QuantileRho(delta), lambda, gamma;
                             penalty=penalty, fit_intercept=fit_intercept,
                             penalize_intercept=penalize_intercept)

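For reference, the two robust ρ functions named in the Huber and Quantile docstrings above, written out for a single residual (a quick sketch, not the package's `HuberRho`/`QuantileRho` types):

    huber(r, δ) = abs(r) <= δ ? r^2 / 2 : δ * (abs(r) - δ / 2)  # quadratic core, linear tails
    check(r, δ) = r * (δ - (r < 0))                             # quantile ("check"/pinball) loss

    huber(0.3, 0.5), huber(2.0, 0.5)    # (0.045, 0.875)
    check(1.0, 0.9), check(-1.0, 0.9)   # (0.9, 0.1): asymmetric around zero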
src/glr/d_logistic.jl

Lines changed: 19 additions & 16 deletions
@@ -36,7 +36,7 @@ function fgh!(glr::GLR{LogisticLoss,<:L2R}, X, y)
             ΛXt1 = view(SCRATCH_P[], 1:p)
             copyto!(ΛXt1, sum(ΛX, dims=1))  # -- (ΛX)'1
             @inbounds for i = 1:p
-                H[i, end] = H[end, i] = ΛXt1[i]  # -- H[:,p+1] = H[p+1,:] = (ΛX)'1
+                H[i, end] = H[end, i] = ΛXt1[i]  # -- H[:,p+1] = (ΛX)'1
             end
             H[end, end] = sum(w)  # -- 1'Λ1'
             add_λI!(H, λ, glr.penalize_intercept)  # -- H = X'ΛX + λI
@@ -92,7 +92,8 @@ function Hv!(glr::GLR{LogisticLoss,<:L2R}, X, y)
             mul!(Hvₐ, X', Xvₐ)  # -- (X'ΛX)vₐ
             Hvₐ .+= λ .* vₐ .+ XtΛ1 .* vₑ
             # update for the last row -- (X'1)'v + n v[end]
-            Hv[end] = dot(XtΛ1, vₐ) + (sum(w) + λ_if_penalize_intercept(glr, λ)) * vₑ
+            Hv[end] = dot(XtΛ1, vₐ) +
+                      (sum(w) + λ_if_penalize_intercept(glr, λ)) * vₑ
         end
     else
         (Hv, θ, v) -> begin
@@ -145,12 +146,12 @@ function fg!(glr::GLR{MultinomialLoss,<:L2R}, X, y)
     λ = getscale(glr.penalty)
     (f, g, θ) -> begin
         P = SCRATCH_NC[]
-        apply_X!(P, X, θ, c) # O(npc) store n * c
+        apply_X!(P, X, θ, c)           # O(npc) store n * c
         M = SCRATCH_NC2[]
-        M .= exp.(P) # O(npc) store n * c
+        M .= exp.(P)                   # O(npc) store n * c
         g === nothing || begin
             ΛM = SCRATCH_NC3[]
-            ΛM .= M ./ sum(M, dims=2) # O(nc) store n * c
+            ΛM .= M ./ sum(M, dims=2)  # O(nc) store n * c
             Q = SCRATCH_NC4[]
             @inbounds for i = 1:n, j=1:c
                 Q[i, j] = ifelse(y[i] == j, 1.0, 0.0)
@@ -170,7 +171,8 @@ function fg!(glr::GLR{MultinomialLoss,<:L2R}, X, y)
             end
             g .= reshape(G, (p + Int(glr.fit_intercept)) * c)
             g .+= λ .* θ
-            glr.fit_intercept && (glr.penalize_intercept || (g[end] -= λ * θ[end]))
+            glr.fit_intercept &&
+                (glr.penalize_intercept || (g[end] -= λ * θ[end]))
         end
         f === nothing || begin
             # we re-use pre-computations here, see also MultinomialLoss
@@ -199,21 +201,22 @@ function Hv!(glr::GLR{MultinomialLoss,<:L2R}, X, y)
     c = length(unique(y))
     # NOTE:
     # * ideally P and Q should be recuperated from gradient computations (fghv!)
-    # * assumption that c is small so that storing matrices of size n * c is not too bad; if c
-    #   is large and allocations should be minimized, all these computations can be done per class
-    #   with views over (c-1)p+1:cp; it will allocate less but is likely slower; maybe in the future
-    #   we could have a keyword indicating which one the user wants to use.
+    # * assumption that c is small so that storing matrices of size n * c is
+    #   not too bad; if c is large and allocations should be minimized, all these
+    #   computations can be done per class with views over (c-1)p+1:cp; it will
+    #   allocate less but is likely slower; maybe in the future we could have a
+    #   keyword indicating which one the user wants to use.
     (Hv, θ, v) -> begin
-        P = apply_X(X, θ, c) # P_ik = <x_i, θ_k> // dims n * c; O(npc)
-        Q = apply_X(X, v, c) # Q_ik = <x_i, v_k> // dims n * c; O(npc)
-        M = exp.(P) # M_ik = exp<x_i, w_k> // dims n * c;
-        MQ = M .* Q # // dims n * c; O(nc)
+        P = apply_X(X, θ, c)      # P_ik = <x_i, θ_k>     // dims n * c; O(npc)
+        Q = apply_X(X, v, c)      # Q_ik = <x_i, v_k>     // dims n * c; O(npc)
+        M = exp.(P)               # M_ik = exp<x_i, w_k>  // dims n * c;
+        MQ = M .* Q               #                       // dims n * c; O(nc)
         ρ = 1 ./ sum(M, dims=2)   # ρ_i = 1/Z_i = 1/∑_k exp<x_i, w_k>
         κ = sum(MQ, dims=2)       # κ_i = ∑_k exp<x_i, w_k><x_i, v_k>
         γ = κ .* ρ.^2             # γ_i = κ_i / Z_i^2
         # computation of Hv
-        U = (ρ .* MQ) .- (γ .* M) # // dims n * c; O(nc)
-        Hv_mat = X' * U # // dims n * c; O(npc)
+        U = (ρ .* MQ) .- (γ .* M)  # // dims n * c; O(nc)
+        Hv_mat = X' * U            # // dims n * c; O(npc)
         if glr.fit_intercept
             Hv .= reshape(vcat(Hv_mat, sum(U, dims=1)), (p+1)*c)
         else

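A dense, self-contained sketch of the multinomial Hessian-vector product following the per-line comments in the hunk above (no intercept, no ℓ2 term, and Θ, V held as p×c matrices rather than the package's flattened vectors and scratch buffers):

    function multinomial_Hv(X, Θ, V)
        P  = X * Θ                   # P_ik = <x_i, θ_k>                O(npc)
        Q  = X * V                   # Q_ik = <x_i, v_k>                O(npc)
        M  = exp.(P)                 # M_ik = exp<x_i, θ_k>
        MQ = M .* Q
        ρ  = 1 ./ sum(M, dims=2)     # ρ_i = 1/Z_i
        κ  = sum(MQ, dims=2)         # κ_i = ∑_k exp<x_i, θ_k><x_i, v_k>
        γ  = κ .* ρ .^ 2             # γ_i = κ_i / Z_i²
        U  = (ρ .* MQ) .- (γ .* M)   # equals S .* (Q .- ∑_k S .* Q), S the softmax
        return X' * U                # p×c result                       O(npc)
    end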
0 commit comments
