10
10
# ---------------------------------------------------------
11
11
12
12
function fgh! (glr:: GLR{LogisticLoss,<:L2R} , X, y)
13
- J = objective (glr) # GLR objective (loss+penalty)
14
- p = size (X, 2 )
15
- λ = getscale (glr. penalty)
13
+ J = objective (glr) # GLR objective (loss+penalty)
14
+ n, p = size (X)
15
+ λ = getscale (glr. penalty)
16
+ # scratch allocation
17
+ SCRATCH_N = zeros (n)
18
+ SCRATCH_N2 = zeros (n)
19
+ SCRATCH_N3 = zeros (n)
16
20
if glr. fit_intercept
21
+ SCRATCH_P = zeros (p)
17
22
(f, g, H, θ) -> begin
18
- Xθ = SCRATCH_N[]
23
+ Xθ = SCRATCH_N
19
24
apply_X! (Xθ, X, θ) # -- Xθ = apply_X(X, θ)
20
25
# precompute σ(yXθ) use -σ(-x) = (σ(x)-1)
21
- w = SCRATCH_N2[]
26
+ w = SCRATCH_N2
22
27
w .= σ .(Xθ .* y) # -- w = σ.(Xθ .* y)
23
28
g === nothing || begin
24
- t = SCRATCH_N3[]
29
+ t = SCRATCH_N3
25
30
t .= y .* (w .- 1.0 ) # -- t = y .* (w .- 1.0)
26
31
apply_Xt! (g, X, t) # -- g = X't
27
32
g .+ = λ .* θ
@@ -33,7 +38,7 @@ function fgh!(glr::GLR{LogisticLoss,<:L2R}, X, y)
33
38
# probably not really worth it
34
39
ΛX = w .* X # !! big allocs
35
40
mul! (view (H, 1 : p, 1 : p), X' , ΛX) # -- H[1:p,1:p] = X'ΛX
36
- ΛXt1 = view ( SCRATCH_P[], 1 : p)
41
+ ΛXt1 = SCRATCH_P
37
42
copyto! (ΛXt1, sum (ΛX, dims= 1 )) # -- (ΛX)'1
38
43
@inbounds for i = 1 : p
39
44
H[i, end ] = H[end , i] = ΛXt1[i] # -- H[:,p+1] = (ΛX)'1
@@ -47,12 +52,12 @@ function fgh!(glr::GLR{LogisticLoss,<:L2R}, X, y)
47
52
# see comments above, same computations just no additional things for
48
53
# fit_intercept
49
54
(f, g, H, θ) -> begin
50
- Xθ = SCRATCH_N[]
55
+ Xθ = SCRATCH_N
51
56
apply_X! (Xθ, X, θ)
52
- w = SCRATCH_N2[]
57
+ w = SCRATCH_N2
53
58
w .= σ .(y .* Xθ)
54
59
g === nothing || begin
55
- t = SCRATCH_N3[]
60
+ t = SCRATCH_N3
56
61
t .= y .* (w .- 1.0 )
57
62
apply_Xt! (g, X, t)
58
63
g .+ = λ .* θ
@@ -67,26 +72,30 @@ function fgh!(glr::GLR{LogisticLoss,<:L2R}, X, y)
67
72
end
68
73
69
74
function Hv! (glr:: GLR{LogisticLoss,<:L2R} , X, y)
70
- p = size (X, 2 )
71
- λ = getscale (glr. penalty)
75
+ n, p = size (X)
76
+ λ = getscale (glr. penalty)
77
+ # scratch allocation
78
+ SCRATCH_N = zeros (n)
79
+ SCRATCH_N2 = zeros (n)
72
80
if glr. fit_intercept
81
+ SCRATCH_P = zeros (p)
73
82
# H = [X 1]'Λ[X 1] + λ I
74
83
# rows a 1:p = [X'ΛX + λI | X'Λ1]
75
84
# row e end = [1'ΛX | sum(a)+λ]
76
85
(Hv, θ, v) -> begin
77
- Xθ = SCRATCH_N[]
86
+ Xθ = SCRATCH_N
78
87
apply_X! (Xθ, X, θ) # -- Xθ = apply_X(X, θ)
79
- w = SCRATCH_N2[]
88
+ w = SCRATCH_N2
80
89
w .= σ .(Xθ .* y) # -- w = σ.(Xθ .* y)
81
90
# view on the first p rows
82
91
a = 1 : p
83
92
Hvₐ = view (Hv, a)
84
93
vₐ = view (v, a)
85
- XtΛ1 = view (SCRATCH_P[] , 1 : p)
94
+ XtΛ1 = view (SCRATCH_P, 1 : p)
86
95
mul! (XtΛ1, X' , w) # -- X'Λ1; O(np)
87
96
vₑ = v[end ]
88
97
# update for the first p rows -- (X'X + λI)v[1:p] + (X'1)v[end]
89
- Xvₐ = SCRATCH_N[]
98
+ Xvₐ = SCRATCH_N
90
99
mul! (Xvₐ, X, vₐ)
91
100
Xvₐ .*= w # -- ΛXvₐ
92
101
mul! (Hvₐ, X' , Xvₐ) # -- (X'ΛX)vₐ
@@ -97,13 +106,13 @@ function Hv!(glr::GLR{LogisticLoss,<:L2R}, X, y)
97
106
end
98
107
else
99
108
(Hv, θ, v) -> begin
100
- Xθ = SCRATCH_N[]
109
+ Xθ = SCRATCH_N
101
110
apply_X! (Xθ, X, θ)
102
- w = SCRATCH_N2[]
111
+ w = SCRATCH_N2
103
112
w .= σ .(Xθ .* y) # -- σ(yXθ)
104
- Xv = SCRATCH_N3[]
113
+ Xv = SCRATCH_N
105
114
mul! (Xv, X, v)
106
- Xv .*= SCRATCH_N2[] # -- ΛXv
115
+ Xv .*= SCRATCH_N2 # -- ΛXv
107
116
mul! (Hv, X' , Xv) # -- X'ΛXv
108
117
Hv .+ = λ .* v
109
118
end
@@ -144,23 +153,29 @@ function fg!(glr::GLR{MultinomialLoss,<:L2R}, X, y)
144
153
n, p = size (X)
145
154
c = length (unique (y))
146
155
λ = getscale (glr. penalty)
156
+ SCRATCH_N = zeros (n)
157
+ SCRATCH_NC = zeros (n, c)
158
+ SCRATCH_NC2 = zeros (n, c)
159
+ SCRATCH_NC3 = zeros (n, c)
160
+ SCRATCH_NC4 = zeros (n, c)
161
+ SCRATCH_PC = zeros (p+ Int (glr. fit_intercept), c)
147
162
(f, g, θ) -> begin
148
- P = SCRATCH_NC[]
149
- apply_X! (P, X, θ, c) # O(npc) store n * c
150
- M = SCRATCH_NC2[]
163
+ P = SCRATCH_NC
164
+ apply_X! (P, X, θ, c, SCRATCH_PC) # O(npc) store n * c
165
+ M = SCRATCH_NC2
151
166
M .= exp .(P) # O(npc) store n * c
152
167
g === nothing || begin
153
- ΛM = SCRATCH_NC3[]
168
+ ΛM = SCRATCH_NC3
154
169
ΛM .= M ./ sum (M, dims= 2 ) # O(nc) store n * c
155
- Q = SCRATCH_NC4[]
170
+ Q = SCRATCH_NC4
156
171
@inbounds for i = 1 : n, j= 1 : c
157
172
Q[i, j] = ifelse (y[i] == j, 1.0 , 0.0 )
158
173
end
159
174
∑ΛM = sum (ΛM, dims= 1 )
160
175
∑Q = sum (Q, dims= 1 )
161
176
R = ΛM
162
177
R .- = Q
163
- G = SCRATCH_PC[]
178
+ G = SCRATCH_PC
164
179
if glr. fit_intercept
165
180
mul! (view (G, 1 : p, :), X' , R)
166
181
@inbounds for k in 1 : c
@@ -179,11 +194,11 @@ function fg!(glr::GLR{MultinomialLoss,<:L2R}, X, y)
179
194
# ms = maximum(P, dims=2)
180
195
# ss = sum(M ./ exp.(ms), dims=2)
181
196
ms = maximum (P, dims= 2 )
182
- ems = SCRATCH_N[]
197
+ ems = SCRATCH_N
183
198
@inbounds for i in 1 : n
184
199
ems[i] = exp (ms[i])
185
200
end
186
- ΛM = SCRATCH_NC2[] # note that _NC is already linked to P
201
+ ΛM = SCRATCH_NC2 # note that _NC is already linked to P
187
202
ΛM .= M ./ ems
188
203
ss = sum (ΛM, dims= 2 )
189
204
t = 0.0
@@ -207,13 +222,13 @@ function Hv!(glr::GLR{MultinomialLoss,<:L2R}, X, y)
207
222
# allocate less but is likely slower; maybe in the future we could have a
208
223
# keyword indicating which one the user wants to use.
209
224
(Hv, θ, v) -> begin
210
- P = apply_X (X, θ, c) # P_ik = <x_i, θ_k> // dims n * c; O(npc)
211
- Q = apply_X (X, v, c) # Q_ik = <x_i, v_k> // dims n * c; O(npc)
212
- M = exp .(P) # M_ik = exp<x_i, w_k> // dims n * c;
213
- MQ = M .* Q # // dims n * c; O(nc)
214
- ρ = 1 ./ sum (M, dims= 2 ) # ρ_i = 1/Z_i = 1/∑_k exp<x_i, w_k>
215
- κ = sum (MQ, dims= 2 ) # κ_i = ∑_k exp<x_i, w_k><x_i, v_k>
216
- γ = κ .* ρ.^ 2 # γ_i = κ_i / Z_i^2
225
+ P = apply_X (X, θ, c) # P_ik = <x_i, θ_k> // dims n * c; O(npc)
226
+ Q = apply_X (X, v, c) # Q_ik = <x_i, v_k> // dims n * c; O(npc)
227
+ M = exp .(P) # M_ik = exp<x_i, w_k> // dims n * c;
228
+ MQ = M .* Q # // dims n * c; O(nc)
229
+ ρ = 1 ./ sum (M, dims= 2 ) # ρ_i = 1/Z_i = 1/∑_k exp<x_i, w_k>
230
+ κ = sum (MQ, dims= 2 ) # κ_i = ∑_k exp<x_i, w_k><x_i, v_k>
231
+ γ = κ .* ρ.^ 2 # γ_i = κ_i / Z_i^2
217
232
# computation of Hv
218
233
U = (ρ .* MQ) .- (γ .* M) # // dims n * c; O(nc)
219
234
Hv_mat = X' * U # // dims n * c; O(npc)
0 commit comments