Commit e2e3076

Noscratch2 (#70)
* updating bm
* hoisting allocations outside of factories, closes #68
* closes #69 and patch release
1 parent b6cc040 commit e2e3076

15 files changed: +197 / -192 lines
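
The central change is the second bullet of the commit message: factory functions such as `Hv!` and `smooth_fg!` used to allocate their own scratch vectors every time they were called, and those allocations are now hoisted into a single named tuple built once in `fit` and threaded through every `_fit` method. A minimal sketch of that pattern follows, with illustrative names (`make_hv_before`, `make_buffers`, `make_hv_after`) that are not part of the package's API:

using LinearAlgebra

# Before: each call to the factory allocates a fresh buffer.
function make_hv_before(X)
    buf = zeros(size(X, 1))                     # allocated on every factory call
    (Hv, v) -> (mul!(buf, X, v); mul!(Hv, X', buf))
end

# After: the caller allocates the buffers once and passes them in.
make_buffers(nobs) = (n = zeros(nobs),)

function make_hv_after(X, scratch)
    (Hv, v) -> (mul!(scratch.n, X, v); mul!(Hv, X', scratch.n))
end

X   = randn(100, 5)
s   = make_buffers(size(X, 1))
hv! = make_hv_after(X, s)
hv!(zeros(5), randn(5))                         # reuses s.n on every call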

Project.toml

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 name = "MLJLinearModels"
 uuid = "6ee0df7b-362f-4a72-a706-9e79364fb692"
 authors = ["Thibaut Lienart <[email protected]>"]
-version = "0.3.5"
+version = "0.3.6"
 
 [deps]
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"

src/fit/analytical.jl

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@ Assuming `n` dominates `p`,
 * iterative (conjugate gradient): O(κnp) - with κ the number of CG steps
   (κ ≤ p).
 """
-function _fit(glr::GLR{L2Loss,<:L2R}, solver::Analytical, X, y)
+function _fit(glr::GLR{L2Loss,<:L2R}, solver::Analytical, X, y, scratch)
     # full solve
     if !solver.iterative
         λ = getscale(glr.penalty)
@@ -37,7 +37,7 @@ function _fit(glr::GLR{L2Loss,<:L2R}, solver::Analytical, X, y)
     p = size(X, 2) + Int(glr.fit_intercept)
     max_cg_steps = min(solver.max_inner, p)
     # Form the Hessian map, cost of application H*v is O(np)
-    Hm = LinearMap(Hv!(glr, X, y), p;
+    Hm = LinearMap(Hv!(glr, X, y, scratch), p;
                    ismutating=true, isposdef=true, issymmetric=true)
     b = X'y
     glr.fit_intercept && (b = vcat(b, sum(y)))
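
For context, `Hm` above is a function-backed linear operator whose application costs O(np), and the iterative branch solves the regularized normal equations with conjugate gradient. The sketch below is not the package's code: it assumes LinearMaps.jl (which the diff does use) and, as a stand-in CG solver, IterativeSolvers.jl, purely to show how a mutating Hessian-vector product can be consumed this way.

using LinearAlgebra, LinearMaps, IterativeSolvers

n, p, λ = 200, 10, 1.0
X, y = randn(n, p), randn(n)

# mutating Hessian-vector product: Hv ← (X'X + λI) v, O(np) per application
hv!(Hv, v) = (Hv .= X' * (X * v) .+ λ .* v)

Hm = LinearMap(hv!, p; ismutating=true, issymmetric=true, isposdef=true)
θ  = cg(Hm, X'y; maxiter=min(20, p))    # solves (X'X + λI)θ = X'y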

src/fit/default.jl

Lines changed: 14 additions & 1 deletion
@@ -36,5 +36,18 @@ the loss and penalty of the model. A method can, in some cases, be specified.
 function fit(glr::GLR, X::AbstractMatrix{<:Real}, y::AVR;
              solver::Solver=_solver(glr, size(X)))
     check_nrows(X, y)
-    return _fit(glr, solver, X, y)
+    n, p = size(X)
+    c = glr.loss isa MultinomialLoss ? maximum(y) : 0
+    return _fit(glr, solver, X, y, scratch(n, p, c, i=glr.fit_intercept))
 end
+
+function scratch(n, p, c=0; i=false)
+    p_ = p + Int(i)
+    s = (n=zeros(n), n2=zeros(n), n3=zeros(n), p=zeros(p_))
+    if !iszero(c)
+        s = (s..., nc=zeros(n,c), nc2=zeros(n,c), nc3=zeros(n,c),
+             nc4=zeros(n,c), pc=zeros(p_,c))
+    end
+    return s
+end
+
+scratch(X; kw...) = scratch(size(X)...; kw...)
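
To make the shape of the new cache concrete, here is an illustrative call to the `scratch` helper defined above (sizes are arbitrary). The single-class buffers are what the closures in the `d_*.jl` files index as `scratch.n`, `scratch.p`, and so on; the class-indexed buffers are only materialised for multinomial losses.

s = scratch(100, 5, 3, i=true)   # n=100 observations, p=5 features, c=3 classes, intercept
length(s.n)                      # 100      -- n-length buffers: s.n, s.n2, s.n3
length(s.p)                      # 6        -- p + 1, since i=true accounts for the intercept
size(s.nc)                       # (100, 3) -- nc..nc4 and pc only exist when c ≠ 0

X  = randn(100, 5)
s2 = scratch(X)                  # convenience form: sizes taken from a data matrix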

src/fit/iwls.jl

Lines changed: 3 additions & 2 deletions
@@ -1,8 +1,9 @@
-function _fit(glr::GLR{RobustLoss{ρ},<:L2R}, solver::IWLSCG, X, y) where {ρ}
+function _fit(glr::GLR{RobustLoss{ρ},<:L2R}, solver::IWLSCG, X, y, scratch
+              ) where {ρ}
     λ = getscale(glr.penalty)
     n = size(X, 1)
     p = size(X, 2) + Int(glr.fit_intercept)
-    _Mv! = Mv!(glr, X, y; threshold=solver.threshold)
+    _Mv! = Mv!(glr, X, y, scratch; threshold=solver.threshold)
     κ = solver.damping # between 0 and 1, 1 = fully take the new iteration
     # cache
     θ = zeros(p)

src/fit/newton.jl

Lines changed: 12 additions & 12 deletions
@@ -13,10 +13,10 @@ Assuming `n` dominates `p`, O(κnp²), dominated by the construction of the
 Hessian at each step with κ the number of Newton steps.
 """
 function _fit(glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R},
-              solver::Newton, X, y)
+              solver::Newton, X, y, scratch)
     p = size(X, 2) + Int(glr.fit_intercept)
     θ₀ = zeros(p)
-    _fgh! = fgh!(glr, X, y)
+    _fgh! = fgh!(glr, X, y, scratch)
     opt = Optim.only_fgh!(_fgh!)
     res = Optim.optimize(opt, θ₀, Optim.Newton())
     return Optim.minimizer(res)
@@ -35,12 +35,12 @@ Hessian at each step where κ₁ is the number of Newton steps and κ₂ is the
 average number of CG steps per Newton step (which is at most p).
 """
 function _fit(glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R},
-              solver::NewtonCG, X, y)
+              solver::NewtonCG, X, y, scratch)
     p = size(X, 2) + Int(glr.fit_intercept)
     θ₀ = zeros(p)
     _f = objective(glr, X, y)
-    _fg! = (g, θ) -> fgh!(glr, X, y)(0.0, g, nothing, θ) # Optim.jl/issues/738
-    _Hv! = Hv!(glr, X, y)
+    _fg! = (g, θ) -> fgh!(glr, X, y, scratch)(0.0, g, nothing, θ) # Optim.jl/738
+    _Hv! = Hv!(glr, X, y, scratch)
     opt = Optim.TwiceDifferentiableHV(_f, _fg!, _Hv!, θ₀)
     res = Optim.optimize(opt, θ₀, Optim.KrylovTrustRegion())
     return Optim.minimizer(res)
@@ -57,10 +57,10 @@ Assuming `n` dominates `p`, O(κnp), dominated by the computation of the
 gradient at each step with κ the number of LBFGS steps.
 """
 function _fit(glr::GLR{<:Union{LogisticLoss,RobustLoss},<:L2R},
-              solver::LBFGS, X, y)
+              solver::LBFGS, X, y, scratch)
     p = size(X, 2) + Int(glr.fit_intercept)
     θ₀ = zeros(p)
-    _fg! = (f, g, θ) -> fgh!(glr, X, y)(f, g, nothing, θ)
+    _fg! = (f, g, θ) -> fgh!(glr, X, y, scratch)(f, g, nothing, θ)
     opt = Optim.only_fg!(_fg!)
     res = Optim.optimize(opt, θ₀, Optim.LBFGS())
     return Optim.minimizer(res)
@@ -82,13 +82,13 @@ computations are dominated by the application of the Hessian at each step with
 κ₁ the number of Newton steps and κ₂ the average number of CG steps per Newton
 step.
 """
-function _fit(glr::GLR{MultinomialLoss,<:L2R}, solver::NewtonCG, X, y)
+function _fit(glr::GLR{MultinomialLoss,<:L2R}, solver::NewtonCG, X, y, scratch)
     p = size(X, 2) + Int(glr.fit_intercept)
     c = maximum(y)
     θ₀ = zeros(p * c)
     _f = objective(glr, X, y; c=c)
-    _fg! = (g, θ) -> fg!(glr, X, y)(0.0, g, θ) # XXX: Optim.jl/issues/738
-    _Hv! = Hv!(glr, X, y)
+    _fg! = (g, θ) -> fg!(glr, X, y, scratch)(0.0, g, θ) # XXX: Optim.jl/738
+    _Hv! = Hv!(glr, X, y, scratch)
     opt = Optim.TwiceDifferentiableHV(_f, _fg!, _Hv!, θ₀)
     res = Optim.optimize(opt, θ₀, Optim.KrylovTrustRegion())
     return Optim.minimizer(res)
@@ -105,11 +105,11 @@ Assuming `n` dominates `p`, O(κnpc), with `c` the number of classes, dominated
 by the computation of the gradient at each step with κ the number of LBFGS
 steps.
 """
-function _fit(glr::GLR{MultinomialLoss,<:L2R}, solver::LBFGS, X, y)
+function _fit(glr::GLR{MultinomialLoss,<:L2R}, solver::LBFGS, X, y, scratch)
    p = size(X, 2) + Int(glr.fit_intercept)
     c = maximum(y)
     θ₀ = zeros(p * c)
-    _fg! = fg!(glr, X, y)
+    _fg! = fg!(glr, X, y, scratch)
     opt = Optim.only_fg!(_fg!)
     res = Optim.optimize(opt, θ₀, Optim.LBFGS())
     return Optim.minimizer(res)

src/fit/proxgrad.jl

Lines changed: 2 additions & 2 deletions
@@ -2,7 +2,7 @@
 
 # Assumption: loss has gradient; penalty has prox e.g.: Lasso
 # J(θ) = f(θ) + r(θ) where f is smooth
-function _fit(glr::GLR, solver::ProxGrad, X, y)
+function _fit(glr::GLR, solver::ProxGrad, X, y, scratch)
     c = ifelse(isa(glr.loss, MultinomialLoss), length(unique(y)), 1)
     p = (size(X, 2) + Int(glr.fit_intercept)) * c
     # vector caches + eval cache
@@ -20,7 +20,7 @@ function _fit(glr::GLR, solver::ProxGrad, X, y)
     acc = ifelse(solver.accel, 1.0, 0.0) # if 0, no extrapolation (ISTA)
     # functions
     _f = smooth_objective(glr, X, y; c=c)
-    _fg! = smooth_fg!(glr, X, y)
+    _fg! = smooth_fg!(glr, X, y, scratch)
     _prox! = prox!(glr)
     bt_cond = θ̂ ->
         _f(θ̂) > fθ̄ + dot(θ̂ .- θ̄, ∇fθ̄) + sum(abs2.(θ̂ .- θ̄)) / (2η)

src/glr/d_l2loss.jl

Lines changed: 13 additions & 15 deletions
@@ -12,27 +12,27 @@
 # * Hv! used in iterative solution
 # ---------------------------------------------------------
 
-function Hv!(glr::GLR{L2Loss,<:L2R}, X, y)
+function Hv!(glr::GLR{L2Loss,<:L2R}, X, y, scratch)
     n, p = size(X)
     λ = getscale(glr.penalty)
-    # scratch allocation
-    SCRATCH_N = zeros(n)
     if glr.fit_intercept
-        SCRATCH_P = zeros(p)
         # H = [X 1]'[X 1] + λ I
         # rows a 1:p = [X'X + λI | X'1]
        # row e end = [1'X | n+λι] where ι is 1 if glr.penalize_intercept
         ι = float(glr.penalize_intercept)
         (Hv, v) -> begin
             # view on the first p rows
-            a = 1:p
-            Hvₐ = view(Hv, a)
-            vₐ = view(v, a)
-            Xt1 = SCRATCH_P
-            copyto!(Xt1, sum(X, dims=1)) # -- X'1 (note: sum will allocate)
+            a = 1:p
+            Hvₐ = view(Hv, a)
+            vₐ = view(v, a)
+            Xt1 = view(scratch.p, a)
+            Xt1 .*= 0
+            @inbounds for i in a, j in 1:n
+                Xt1[i] += X[j, i] # -- X'1
+            end
             vₑ = v[end]
             # update for the first p rows -- (X'X + λI)v[1:p] + (X'1)v[end]
-            Xvₐ = SCRATCH_N
+            Xvₐ = scratch.n
             mul!(Xvₐ, X, vₐ)
             mul!(Hvₐ, X', Xvₐ)
             Hvₐ .+= λ .* vₐ .+ Xt1 .* vₑ
@@ -41,7 +41,7 @@ function Hv!(glr::GLR{L2Loss,<:L2R}, X, y)
         end
     else
         (Hv, v) -> begin
-            Xv = SCRATCH_N
+            Xv = scratch.n
             mul!(Xv, X, v) # -- Xv
             mul!(Hv, X', Xv) # -- X'Xv
             Hv .+= λ .* v # -- X'Xv + λv
@@ -60,13 +60,11 @@ end
 # -> prox_r = soft-thresh
 # ---------------------------------------------------------
 
-function smooth_fg!(glr::GLR{L2Loss,<:ENR}, X, y)
+function smooth_fg!(glr::GLR{L2Loss,<:ENR}, X, y, scratch)
     λ = getscale_l2(glr.penalty)
-    # scratch allocation
-    SCRATCH_N = zeros(size(X, 1))
     (g, θ) -> begin
         # cache contains the residuals (Xθ-y)
-        r = SCRATCH_N
+        r = scratch.n
         get_residuals!(r, X, θ, y) # -- r = Xθ-y
         apply_Xt!(g, X, r)
         g .+= λ .* θ
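
The intercept branch of `Hv!` above applies H = [X 1]'[X 1] + λI blockwise rather than forming it. As a small sanity sketch (not part of this commit), the block formula described by the comments can be checked against the explicitly assembled matrix on a random problem, with the intercept penalized (ι = 1):

using LinearAlgebra

n, p, λ = 50, 4, 0.1
X  = randn(n, p)
X1 = hcat(X, ones(n))              # [X 1]
H  = X1' * X1 + λ * I              # (p+1)×(p+1), intercept penalized (ι = 1)
v  = randn(p + 1)

Xt1 = vec(sum(X, dims=1))          # X'1
Hv  = similar(v)
Hv[1:p] = X' * (X * v[1:p]) .+ λ .* v[1:p] .+ Xt1 .* v[end]   # rows 1:p -- [X'X + λI | X'1]
Hv[end] = dot(Xt1, v[1:p]) + (n + λ) * v[end]                 # last row -- [1'X | n + λι]
@assert Hv ≈ H * v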
