@@ -15,33 +15,51 @@ function fgh!(glr::GLR{LogisticLoss,<:L2R}, X, y)
15
15
λ = getscale (glr. penalty)
16
16
if glr. fit_intercept
17
17
(f, g, H, θ) -> begin
18
- Xθ = apply_X (X, θ)
18
+ Xθ = SCRATCH_N[]
19
+ apply_X! (Xθ, X, θ) # -- Xθ = apply_X(X, θ)
19
20
# precompute w = σ(yXθ); the gradient below uses the identity -σ(-x) = σ(x) - 1
20
- w = σ .(Xθ .* y)
21
+ w = SCRATCH_N2[]
22
+ w .= σ .(Xθ .* y) # -- w = σ.(Xθ .* y)
21
23
g === nothing || begin
22
- tmp = y .* (w .- 1.0 )
23
- apply_Xt! (g, X, tmp)
24
+ t = SCRATCH_N3[]
25
+ t .= y .* (w .- 1.0 ) # -- t = y .* (w .- 1.0)
26
+ apply_Xt! (g, X, t) # -- g = X't
24
27
g .+ = λ .* θ
25
28
end
26
29
H === nothing || begin
27
- ΛX = w .* X
28
- mul! (view (H, 1 : p, 1 : p), X' , ΛX)
29
- ΛXt1 = sum (ΛX, dims= 1 )
30
+ # NOTE: we could try to be clever to reduce the allocations for
31
+ # ΛX but computing the full hessian allocates a lot anyway so
32
+ # probably not really worth it
33
+ ΛX = w .* X # !! big allocs
34
+ mul! (view (H, 1 : p, 1 : p), X' , ΛX) # -- H[1:p,1:p] = X'ΛX
35
+ ΛXt1 = view (SCRATCH_P[], 1 : p)
36
+ copyto! (ΛXt1, sum (ΛX, dims= 1 )) # -- (ΛX)'1
30
37
@inbounds for i = 1 : p
31
- H[i, end ] = H[end , i] = ΛXt1[i]
38
+ H[i, end ] = H[end , i] = ΛXt1[i] # -- H[:,p+1] = H[p+1,:] = (ΛX)'1
32
39
end
33
- H[end , end ] = sum (w)
34
- add_λI! (H, λ)
40
+ H[end , end ] = sum (w) # -- 1'Λ1
41
+ add_λI! (H, λ) # -- H = X'ΛX + λI
35
42
end
36
43
f === nothing || return J (y, Xθ, θ)
37
44
end
38
45
else
46
+ # see comments above, same computations just no additional things for
47
+ # fit_intercept
39
48
(f, g, H, θ) -> begin
40
- Xθ = apply_X (X, θ)
41
- # precompute σ(yXθ) use -σ(-x) = σ(x)(σ(x)-1)
42
- w = σ .(y .* Xθ)
43
- g === nothing || (mul! (g, X' , y .* (w .- 1.0 )); g .+ = λ .* θ)
44
- H === nothing || (mul! (H, X' , w .* X); add_λI! (H, λ))
49
+ Xθ = SCRATCH_N[]
50
+ apply_X! (Xθ, X, θ)
51
+ w = SCRATCH_N2[]
52
+ w .= σ .(y .* Xθ)
53
+ g === nothing || begin
54
+ t = SCRATCH_N3[]
55
+ t .= y .* (w .- 1.0 )
56
+ apply_Xt! (g, X, t)
57
+ g .+ = λ .* θ
58
+ end
59
+ H === nothing || begin
60
+ mul! (H, X' , w .* X)
61
+ add_λI! (H, λ)
62
+ end
45
63
f === nothing || return J (y, Xθ, θ)
46
64
end
47
65
end
@@ -55,24 +73,36 @@ function Hv!(glr::GLR{LogisticLoss,<:L2R}, X, y)
55
73
# rows a = 1:p -> [X'ΛX + λI | X'Λ1]
56
74
# row  e = end -> [1'ΛX | sum(w)+λ]
57
75
(Hv, θ, v) -> begin
58
- # precompute σ(yXθ) use -σ(-x) = (σ(x)-1)
59
- w = σ .(apply_X (X, θ) .* y)
76
+ Xθ = SCRATCH_N[]
77
+ apply_X! (Xθ, X, θ) # -- Xθ = apply_X(X, θ)
78
+ w = SCRATCH_N2[]
79
+ w .= σ .(Xθ .* y) # -- w = σ.(Xθ .* y)
60
80
# view on the first p rows
61
81
a = 1 : p
62
82
Hvₐ = view (Hv, a)
63
83
vₐ = view (v, a)
64
- XtΛ1 = X' * w # X'Λ1; O(np)
84
+ XtΛ1 = view (SCRATCH_P[], 1 : p)
85
+ mul! (XtΛ1, X' , w) # -- X'Λ1; O(np)
65
86
vₑ = v[end ]
66
87
# update for the first p rows -- (X'ΛX + λI)v[1:p] + (X'Λ1)v[end]
67
- mul! (Hvₐ, X' , w .* (X * vₐ)) # (X'ΛX)vₐ
88
+ Xvₐ = SCRATCH_N[]
89
+ mul! (Xvₐ, X, vₐ)
90
+ Xvₐ .*= w # -- ΛXvₐ
91
+ mul! (Hvₐ, X' , Xvₐ) # -- (X'ΛX)vₐ
68
92
Hvₐ .+ = λ .* vₐ .+ XtΛ1 .* vₑ
69
93
# update for the last row -- (X'Λ1)'v[1:p] + (sum(w) + λ)v[end]
70
94
Hv[end ] = dot (XtΛ1, vₐ) + (sum (w)+ λ) * vₑ
71
95
end
72
96
else
73
97
(Hv, θ, v) -> begin
74
- w = σ .(apply_X (X, θ) .* y)
75
- mul! (Hv, X' , w .* (X * v))
98
+ Xθ = SCRATCH_N[]
99
+ apply_X! (Xθ, X, θ)
100
+ w = SCRATCH_N2[]
101
+ w .= σ .(Xθ .* y) # -- σ(yXθ)
102
+ Xv = SCRATCH_N3[]
103
+ mul! (Xv, X, v)
104
+ Xv .*= SCRATCH_N2[] # -- ΛXv
105
+ mul! (Hv, X' , Xv) # -- X'ΛXv
76
106
Hv .+ = λ .* v
77
107
end
78
108
end
@@ -113,24 +143,50 @@ function fg!(glr::GLR{MultinomialLoss,<:L2R}, X, y)
113
143
c = length (unique (y))
114
144
λ = getscale (glr. penalty)
115
145
(f, g, θ) -> begin
116
- P = apply_X (X, θ, c) # O(npc) store n * c
117
- M = exp .(P) # O(npc) store n * c
146
+ P = SCRATCH_NC[]
147
+ apply_X! (P, X, θ, c) # O(npc) store n * c
148
+ M = SCRATCH_NC2[]
149
+ M .= exp .(P) # O(nc) store n * c
118
150
g === nothing || begin
119
- ΛM = M ./ sum (M, dims= 2 ) # O(nc) store n * c
120
- Q = BitArray (y[i] == j for i = 1 : n, j= 1 : c)
121
- G = X' ΛM .- X' Q # O(npc) store n * c
151
+ ΛM = SCRATCH_NC3[]
152
+ ΛM .= M ./ sum (M, dims= 2 ) # O(nc) store n * c
153
+ Q = SCRATCH_NC4[]
154
+ @inbounds for i = 1 : n, j= 1 : c
155
+ Q[i, j] = ifelse (y[i] == j, 1.0 , 0.0 )
156
+ end
157
+ ∑ΛM = sum (ΛM, dims= 1 )
158
+ ∑Q = sum (Q, dims= 1 )
159
+ R = ΛM
160
+ R .- = Q
161
+ G = SCRATCH_PC[]
122
162
if glr. fit_intercept
123
- G = vcat (G, sum (ΛM, dims= 1 ) .- sum (Q, dims= 1 ))
163
+ mul! (view (G, 1 : p, :), X' , R)
164
+ @inbounds for k in 1 : c
165
+ G[end , k] = ∑ΛM[k] - ∑Q[k]
166
+ end
167
+ else
168
+ mul! (G, X' , R)
124
169
end
125
- g .= reshape (G, (p + Int (glr. fit_intercept)) * c)
170
+ g .= reshape (G, (p+ Int (glr. fit_intercept))* c)
126
171
g .+ = λ .* θ
127
172
end
128
173
f === nothing || begin
129
174
# we re-use pre-computations here, see also MultinomialLoss
130
- ms = maximum (P, dims= 2 )
131
- ss = sum (M ./ exp .(ms), dims= 2 )
132
- @inbounds ps = [P[i, y[i]] for i in eachindex (y)]
133
- return sum (log .(ss) .+ ms .- ps) + glr. penalty (θ)
175
+ # ms = maximum(P, dims=2)
176
+ # ss = sum(M ./ exp.(ms), dims=2)
177
+ ms = maximum (P, dims= 2 )
178
+ ems = SCRATCH_N[]
179
+ @inbounds for i in 1 : n
180
+ ems[i] = exp (ms[i])
181
+ end
182
+ ΛM = SCRATCH_NC2[] # note that _NC is already linked to P; _NC2 was bound to M above, so ΛM presumably aliases M and the next line overwrites it in place
183
+ ΛM .= M ./ ems
184
+ ss = sum (ΛM, dims= 2 )
185
+ t = 0.0
186
+ @inbounds for i in eachindex (y)
187
+ t += log (ss[i]) + ms[i] - P[i, y[i]]
188
+ end
189
+ return sum (t) + glr. penalty (θ)
134
190
end
135
191
end
136
192
end
0 commit comments