
Commit 3cc208b

Malloc (#25)

* decreasing allocs via temp
* using more legible refs
* reviews 1
* adding tests for scratchspace
* add scratch for robust
* tests ok
* adding a bunch of tests and completing the PR

1 parent adfb768 commit 3cc208b

16 files changed (+395, -116 lines)

README.md

Lines changed: 1 addition & 0 deletions

@@ -64,6 +64,7 @@ Systematic timing benchmarks have not been run yet but it's planned (see [this i
 * The models are built and tested assuming `n > p`; if this doesn't hold, tricks should be employed to speed up computations; these have not been implemented yet.
 * CV-aware code not implemented yet (code that re-uses computations when fitting over a number of hyper-parameters); "Meta" functionalities such as One-vs-All or Cross-Validation are left to other packages such as MLJ.
 * Stochastic solvers have not yet been implemented.
+* All computations are assumed to be done in Float64.

 ### Possible future models
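
Since everything is assumed to be Float64, data held in another precision would be converted up front; a minimal, hypothetical illustration:

```julia
# Hypothetical example: Float32 data converted before fitting, since the
# solvers assume Float64 throughout.
X32, y32 = rand(Float32, 100, 5), rand(Float32, 100)
X, y = Float64.(X32), Float64.(y32)
```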

src/MLJLinearModels.jl

Lines changed: 2 additions & 0 deletions

@@ -10,6 +10,8 @@ import Base.+, Base.-, Base.*, Base./, Base.convert

 const AVR = AbstractVector{<:Real}

+include("scratchspace.jl")
+
 include("utils.jl")

 # > Loss / penalty definitions <

src/fit/default.jl

Lines changed: 13 additions & 1 deletion

@@ -32,5 +32,17 @@ the loss and penalty of the model. A method can, in some cases, be specified.
 function fit(glr::GLR, X::AbstractMatrix{<:Real}, y::AVR;
              solver::Solver=_solver(glr, size(X)))
     check_nrows(X, y)
-    _fit(glr, solver, X, y)
+    n, p = size(X)
+    p += Int(glr.fit_intercept)
+    # allocate cache for temporary computations of size n/p
+    # which are frequent but otherwise un-important so that
+    # we can reduce the overall number of allocations
+    # these are const Refs defined when the module is loaded
+    c = glr.loss isa MultinomialLoss ? maximum(y) : 0
+    allocate(n, p, c)
+    # effective call to fit routine
+    θ = _fit(glr, solver, X, y)
+    # de-allocate cache
+    deallocate()
+    return θ
 end
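
The `allocate`/`deallocate` calls refer to buffers defined in the new `scratchspace.jl`, whose contents are not shown on this page. A minimal sketch of the pattern, assuming the buffers are module-level `const` `Ref`s as the comment says (the `c > 0` multinomial buffers are only indicated):

```julia
# Minimal sketch (assumed; scratchspace.jl itself is not shown in this diff):
# const Refs created at module load, re-sized once per fit, emptied after.
const SCRATCH_N = Ref(Vector{Float64}(undef, 0))   # length-n buffer
const SCRATCH_P = Ref(Vector{Float64}(undef, 0))   # length-p buffer

function allocate(n::Int, p::Int, c::Int)
    SCRATCH_N[] = Vector{Float64}(undef, n)
    SCRATCH_P[] = Vector{Float64}(undef, p)
    # c > 0 (multinomial case) would additionally size the n×c / p×c buffers
    return nothing
end

function deallocate()
    # drop the references so the buffers can be garbage collected
    SCRATCH_N[] = Float64[]
    SCRATCH_P[] = Float64[]
    return nothing
end
```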

src/glr/constructors.jl

Lines changed: 1 addition & 1 deletion

@@ -68,7 +68,7 @@ $SIGNATURES
 Objective function: ``|Xθ - y|₂²/2 + λ|θ|₂²/2 + γ|θ|₁``
 """
 function ElasticNetRegression(λ::Real=1.0, γ::Real=1.0; lambda::Real=λ, gamma::Real=γ,
-                             fit_intercept::Bool=true)
+                              fit_intercept::Bool=true)
     check_pos.((lambda, gamma))
     GLR(fit_intercept=fit_intercept, penalty=lambda*L2Penalty()+gamma*L1Penalty())
 end
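
For context, the positional arguments and the keyword aliases in the signature above set the same penalties; a hypothetical usage sketch (assuming the package is loaded):

```julia
# Hypothetical usage of the constructor shown above: positional λ, γ and the
# keyword aliases lambda, gamma are interchangeable.
enr  = ElasticNetRegression(0.5, 0.1)                # λ = 0.5, γ = 0.1
enr2 = ElasticNetRegression(lambda=0.5, gamma=0.1)   # same model
```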

src/glr/d_l2loss.jl

Lines changed: 18 additions & 7 deletions

@@ -24,16 +24,24 @@ function Hv!(glr::GLR{L2Loss,<:L2R}, X, y)
             a = 1:p
             Hvₐ = view(Hv, a)
             vₐ = view(v, a)
-            Xt1 = vec(sum(X, dims=1))
+            Xt1 = view(SCRATCH_P[], a)
+            copyto!(Xt1, sum(X, dims=1))    # -- X'1
             vₑ = v[end]
-            # update for the first p rows -- (X'X + λI)v[1:p] + (X'1)v[end]
-            mul!(Hvₐ, X', X * vₐ)
+            # update for the first p rows -- (X'X + λI)v[1:p] + (X'1)v[end]
+            Xvₐ = SCRATCH_N[]
+            mul!(Xvₐ, X, vₐ)
+            mul!(Hvₐ, X', Xvₐ)
             Hvₐ .+= λ .* vₐ .+ Xt1 .* vₑ
-            # update for the last row -- (X'1)'v + n v[end]
+            # update for the last row -- (X'1)'v + n v[end]
             Hv[end] = dot(Xt1, vₐ) + (n+λ) * vₑ
         end
     else
-        (Hv, v) -> (mul!(Hv, X', X * v); Hv .+= λ .* v)
+        (Hv, v) -> begin
+            Xv = SCRATCH_N[]
+            mul!(Xv, X, v)      # -- Xv
+            mul!(Hv, X', Xv)    # -- X'Xv
+            Hv .+= λ .* v       # -- X'Xv + λv
+        end
     end
 end
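
The "first p rows / last row" comments describe the block structure of the ridge Hessian once an intercept column is appended to X. A self-contained check of that structure (not package code; plain arrays stand in for the scratch buffers):

```julia
# Sketch: with Xb = [X 1], the ridge Hessian is Xb'Xb + λI =
#   [X'X + λI   X'1  ]
#   [ 1'X       n + λ],
# which is exactly what the two updates above compute row-block by row-block.
using LinearAlgebra

n, p, λ = 6, 3, 0.5
X = randn(n, p); v = randn(p + 1)
Xb = hcat(X, ones(n))                     # Xb = [X 1]

Xt1 = vec(sum(X, dims=1))                 # X'1
vₐ, vₑ = view(v, 1:p), v[end]
Hv = similar(v)
Hv[1:p] .= X' * (X * vₐ) .+ λ .* vₐ .+ Xt1 .* vₑ   # first p rows
Hv[end]  = dot(Xt1, vₐ) + (n + λ) * vₑ             # last row

@assert Hv ≈ (Xb' * Xb + λ * I) * v
```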

@@ -50,9 +58,12 @@ end

 function smooth_fg!(glr::GLR{L2Loss,<:ENR}, X, y)
     λ = getscale_l2(glr.penalty)
-    p = size(X, 2)
     (g, θ) -> begin
-        r = apply_X(X, θ) .- y
+        # cache contains the residuals (Xθ-y)
+        Xθ = SCRATCH_N[]
+        apply_X!(Xθ, X, θ)
+        r = SCRATCH_N[]
+        r .-= y              # -- r = Xθ-y
         apply_Xt!(g, X, r)
         g .+= λ .* θ
         return glr.loss(r) + get_l2(glr.penalty)(θ)
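
Note that `Xθ` and `r` are both read from `SCRATCH_N[]`, so they alias the same buffer: the in-place `r .-= y` turns the freshly computed `Xθ` into the residual with no extra allocation. A standalone illustration of the aliasing (a plain `Ref` stands in for `SCRATCH_N`):

```julia
# Two names, one scratch buffer: subtracting y in place turns the Xθ buffer
# into the residual without a second allocation.
buf = Ref(zeros(3))
Xθ = buf[]; Xθ .= [1.0, 2.0, 3.0]    # pretend this is X*θ
r = buf[]                            # same array, second name
r .-= [0.5, 0.5, 0.5]                # r = Xθ - y, in place
@assert Xθ === r && r == [0.5, 1.5, 2.5]
```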

src/glr/d_logistic.jl

Lines changed: 88 additions & 32 deletions

@@ -15,33 +15,51 @@ function fgh!(glr::GLR{LogisticLoss,<:L2R}, X, y)
     λ = getscale(glr.penalty)
     if glr.fit_intercept
         (f, g, H, θ) -> begin
-            Xθ = apply_X(X, θ)
+            Xθ = SCRATCH_N[]
+            apply_X!(Xθ, X, θ)          # -- Xθ = apply_X(X, θ)
             # precompute σ(yXθ) use -σ(-x) = (σ(x)-1)
-            w = σ.(Xθ .* y)
+            w = SCRATCH_N2[]
+            w .= σ.(Xθ .* y)            # -- w = σ.(Xθ .* y)
             g === nothing || begin
-                tmp = y .* (w .- 1.0)
-                apply_Xt!(g, X, tmp)
+                t = SCRATCH_N3[]
+                t .= y .* (w .- 1.0)    # -- t = y .* (w .- 1.0)
+                apply_Xt!(g, X, t)      # -- g = X't
                 g .+= λ .* θ
             end
             H === nothing || begin
-                ΛX = w .* X
-                mul!(view(H, 1:p, 1:p), X', ΛX)
-                ΛXt1 = sum(ΛX, dims=1)
+                # NOTE: we could try to be clever to reduce the allocations for
+                # ΛX but computing the full hessian allocates a lot anyway so
+                # probably not really worth it
+                ΛX = w .* X                        # !! big allocs
+                mul!(view(H, 1:p, 1:p), X', ΛX)    # -- H[1:p,1:p] = X'ΛX
+                ΛXt1 = view(SCRATCH_P[], 1:p)
+                copyto!(ΛXt1, sum(ΛX, dims=1))     # -- (ΛX)'1
                 @inbounds for i = 1:p
-                    H[i, end] = H[end, i] = ΛXt1[i]
+                    H[i, end] = H[end, i] = ΛXt1[i]  # -- H[:,p+1] = H[p+1,:] = (ΛX)'1
                 end
-                H[end, end] = sum(w)
-                add_λI!(H, λ)
+                H[end, end] = sum(w)               # -- 1'Λ1
+                add_λI!(H, λ)                      # -- H = X'ΛX + λI
             end
             f === nothing || return J(y, Xθ, θ)
         end
     else
+        # see comments above, same computations just no additional things for
+        # fit_intercept
        (f, g, H, θ) -> begin
-            Xθ = apply_X(X, θ)
-            # precompute σ(yXθ) use -σ(-x) = σ(x)(σ(x)-1)
-            w = σ.(y .* Xθ)
-            g === nothing || (mul!(g, X', y .* (w .- 1.0)); g .+= λ .* θ)
-            H === nothing || (mul!(H, X', w .* X); add_λI!(H, λ))
+            Xθ = SCRATCH_N[]
+            apply_X!(Xθ, X, θ)
+            w = SCRATCH_N2[]
+            w .= σ.(y .* Xθ)
+            g === nothing || begin
+                t = SCRATCH_N3[]
+                t .= y .* (w .- 1.0)
+                apply_Xt!(g, X, t)
+                g .+= λ .* θ
+            end
+            H === nothing || begin
+                mul!(H, X', w .* X)
+                add_λI!(H, λ)
+            end
            f === nothing || return J(y, Xθ, θ)
        end
    end
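
The "precompute" comment relies on the identity σ(-x) = 1 - σ(x), hence -σ(-x) = σ(x) - 1, which is exactly the derivative of the per-sample logistic loss with respect to its margin; a quick standalone check:

```julia
# ℓ is the per-sample logistic loss as a function of the margin m = y * (xᵀθ);
# its derivative is -σ(-m) = σ(m) - 1, verified against a finite difference.
σ(x) = 1 / (1 + exp(-x))
ℓ(m) = log1p(exp(-m))

m, h = 0.3, 1e-6
@assert -σ(-m) ≈ σ(m) - 1
@assert isapprox((ℓ(m + h) - ℓ(m - h)) / (2h), σ(m) - 1; atol=1e-6)
```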
@@ -55,24 +73,36 @@ function Hv!(glr::GLR{LogisticLoss,<:L2R}, X, y)
         # rows a 1:p = [X'ΛX + λI | X'Λ1]
         # row e end = [1'ΛX | sum(a)+λ]
         (Hv, θ, v) -> begin
-            # precompute σ(yXθ) use -σ(-x) = (σ(x)-1)
-            w = σ.(apply_X(X, θ) .* y)
+            Xθ = SCRATCH_N[]
+            apply_X!(Xθ, X, θ)           # -- Xθ = apply_X(X, θ)
+            w = SCRATCH_N2[]
+            w .= σ.(Xθ .* y)             # -- w = σ.(Xθ .* y)
             # view on the first p rows
             a = 1:p
             Hvₐ = view(Hv, a)
             vₐ = view(v, a)
-            XtΛ1 = X' * w # X'Λ1; O(np)
+            XtΛ1 = view(SCRATCH_P[], 1:p)
+            mul!(XtΛ1, X', w)            # -- X'Λ1; O(np)
             vₑ = v[end]
             # update for the first p rows -- (X'X + λI)v[1:p] + (X'1)v[end]
-            mul!(Hvₐ, X', w .* (X * vₐ)) # (X'ΛX)vₐ
+            Xvₐ = SCRATCH_N[]
+            mul!(Xvₐ, X, vₐ)
+            Xvₐ .*= w                    # -- ΛXvₐ
+            mul!(Hvₐ, X', Xvₐ)           # -- (X'ΛX)vₐ
             Hvₐ .+= λ .* vₐ .+ XtΛ1 .* vₑ
             # update for the last row -- (X'1)'v + n v[end]
             Hv[end] = dot(XtΛ1, vₐ) + (sum(w)+λ) * vₑ
         end
     else
         (Hv, θ, v) -> begin
-            w = σ.(apply_X(X, θ) .* y)
-            mul!(Hv, X', w .* (X * v))
+            Xθ = SCRATCH_N[]
+            apply_X!(Xθ, X, θ)
+            w = SCRATCH_N2[]
+            w .= σ.(Xθ .* y)             # -- σ(yXθ)
+            Xv = SCRATCH_N3[]
+            mul!(Xv, X, v)
+            Xv .*= SCRATCH_N2[]          # -- ΛXv
+            mul!(Hv, X', Xv)             # -- X'ΛXv
             Hv .+= λ .* v
         end
     end
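
The three in-place steps compute (X'ΛX)v as a sequence of matrix-vector products, never materialising X'ΛX itself. A self-contained check of that ordering (plain arrays in place of the `SCRATCH_*` buffers):

```julia
using LinearAlgebra

n, p, λ = 5, 3, 0.1
X = randn(n, p); v = randn(p)
w = rand(n)               # the diagonal of Λ, i.e. σ.(Xθ .* y) above

Xv = similar(v, n)        # stand-in for SCRATCH_N3[]
Hv = similar(v)
mul!(Xv, X, v)            # Xv
Xv .*= w                  # ΛXv
mul!(Hv, X', Xv)          # X'ΛXv
Hv .+= λ .* v             # + λv

@assert Hv ≈ (X' * Diagonal(w) * X + λ * I) * v
```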
@@ -113,24 +143,50 @@ function fg!(glr::GLR{MultinomialLoss,<:L2R}, X, y)
     c = length(unique(y))
     λ = getscale(glr.penalty)
     (f, g, θ) -> begin
-        P = apply_X(X, θ, c)              # O(npc) store n * c
-        M = exp.(P)                       # O(npc) store n * c
+        P = SCRATCH_NC[]
+        apply_X!(P, X, θ, c)              # O(npc) store n * c
+        M = SCRATCH_NC2[]
+        M .= exp.(P)                      # O(npc) store n * c
         g === nothing || begin
-            ΛM = M ./ sum(M, dims=2)      # O(nc) store n * c
-            Q = BitArray(y[i] == j for i = 1:n, j = 1:c)
-            G = X'ΛM .- X'Q               # O(npc) store n * c
+            ΛM = SCRATCH_NC3[]
+            ΛM .= M ./ sum(M, dims=2)     # O(nc) store n * c
+            Q = SCRATCH_NC4[]
+            @inbounds for i = 1:n, j = 1:c
+                Q[i, j] = ifelse(y[i] == j, 1.0, 0.0)
+            end
+            ∑ΛM = sum(ΛM, dims=1)
+            ∑Q = sum(Q, dims=1)
+            R = ΛM
+            R .-= Q
+            G = SCRATCH_PC[]
             if glr.fit_intercept
-                G = vcat(G, sum(ΛM, dims=1) .- sum(Q, dims=1))
+                mul!(view(G, 1:p, :), X', R)
+                @inbounds for k in 1:c
+                    G[end, k] = ∑ΛM[k] - ∑Q[k]
+                end
+            else
+                mul!(G, X', R)
             end
-            g .= reshape(G, (p + Int(glr.fit_intercept)) * c)
+            g .= reshape(G, (p+Int(glr.fit_intercept))*c)
             g .+= λ .* θ
         end
         f === nothing || begin
             # we re-use pre-computations here, see also MultinomialLoss
-            ms = maximum(P, dims=2)
-            ss = sum(M ./ exp.(ms), dims=2)
-            @inbounds ps = [P[i, y[i]] for i in eachindex(y)]
-            return sum(log.(ss) .+ ms .- ps) + glr.penalty(θ)
+            ms = maximum(P, dims=2)
+            ems = SCRATCH_N[]
+            @inbounds for i in 1:n
+                ems[i] = exp(ms[i])
+            end
+            ΛM = SCRATCH_NC2[]            # note that _NC is already linked to P
+            ΛM .= M ./ ems
+            ss = sum(ΛM, dims=2)
+            t = 0.0
+            @inbounds for i in eachindex(y)
+                t += log(ss[i]) + ms[i] - P[i, y[i]]
+            end
+            return t + glr.penalty(θ)
         end
     end
 end
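
The `f` branch computes Σᵢ (log Σⱼ exp(Pᵢⱼ - mᵢ) + mᵢ - Pᵢ,yᵢ), i.e. it applies the max-shifted log-sum-exp identity, where subtracting the row maximum mᵢ keeps `exp` from overflowing. A standalone sketch of the trick:

```julia
# Max-shifted log-sum-exp over the rows of a score matrix P; the naive
# log.(sum(exp.(P), dims=2)) overflows to Inf on the first row.
P  = [1000.0 1001.0; -5.0 3.0]
ms = maximum(P, dims=2)                        # row maxima m
lse = log.(sum(exp.(P .- ms), dims=2)) .+ ms   # finite: [1001.31…; 3.000…]
@assert all(isfinite, lse)
```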
