# In this example we show the two main methods to perform regression with a kernel from KernelFunctions.jl.
- # ## We load KernelFunctions and some other packages
+ # We load KernelFunctions and some other packages

using KernelFunctions
using LinearAlgebra
@@ -14,8 +14,7 @@ using Flux
using Flux: Optimise
using Zygote
using Random: seed!
- seed!(42)
- using ParameterHandling
+ seed!(42);
# ## Data Generation
# We generate data in one dimension.
@@ -26,30 +25,29 @@ N = 50 # Number of samples
x_train = rand(Uniform(xmin, xmax), N) # We sample N training inputs uniformly at random
σ = 0.1
y_train = sinc.(x_train) + randn(N) * σ # We evaluate the sinc function and add Gaussian noise
- x_test = range(xmin - 0.1, xmax + 0.1; length=300)
-
+ x_test = range(xmin - 0.1, xmax + 0.1; length=300);
# Plot the data

- scatter(x_train, y_train; lab="data")
- plot!(x_test, sinc; lab="true function")
+ # # scatter(x_train, y_train; lab="data")
+ # # plot!(x_test, sinc; lab="true function")
- # ## Method 1
- # The first method is to rebuild the parametrized kernel from a vector of parameters
- # in each evaluation of the cost function. This is similar to the approach taken in
- # [Stheno.jl](https://github.com/JuliaGaussianProcesses/Stheno.jl).

- # ### Base Approach
- # A simple way to ensure that the kernel parameters are positive
- # is to optimize over the logarithm of the parameters.
+
+ # ## Base Approach
+ # The first option is to rebuild the parametrized kernel from a vector of parameters
+ # in each evaluation of the cost function. This is similar to the approach taken in
+ # [Stheno.jl](https://github.com/JuliaGaussianProcesses/Stheno.jl).
# To train the kernel parameters via Zygote.jl
# we need to create a function that builds a kernel from an array of parameters.
+ # A simple way to ensure that the kernel parameters are positive
+ # is to optimize over the logarithm of the parameters.
function kernelcall(θ)
    return (exp(θ[1]) * SqExponentialKernel() + exp(θ[2]) * Matern32Kernel()) ∘
           ScaleTransform(exp(θ[3]))
- end
+ end;
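
# As a quick, purely illustrative check, we can build a kernel from unit parameters
# (log-parameters of zero) and evaluate it on an arbitrary pair of inputs:
kernelcall(log.(ones(3)))(0.0, 1.0)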
# From theory we know the prediction for a test set x, given
# the kernel parameters and the regularization (noise) constant.
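
# Concretely, writing K for `kernelmatrix` and σ²ₙ = exp(θ[4]) for the noise variance, the
# function below implements the usual kernel ridge regression predictor
#
#     ŷ(x) = K(x, x_train) * (K(x_train, x_train) + σ²ₙ I)⁻¹ * y_train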
@@ -58,63 +56,66 @@ function f(x, x_train, y_train, θ)
    k = kernelcall(θ[1:3])
    return kernelmatrix(k, x, x_train) *
           ((kernelmatrix(k, x_train) + exp(θ[4]) * I) \ y_train)
- end
+ end;
# Let us see what the prediction looks like
# with the starting parameters [1.0, 1.0, 1.0, 1.0]:

- ŷ = f(x_test, x_train, y_train, log.(ones(4)))
- scatter(x_train, y_train; lab="data")
- plot!(x_test, sinc; lab="true function")
- plot!(x_test, ŷ; lab="prediction")
-
+ ŷ = f(x_test, x_train, y_train, log.(ones(4)));
+ # # scatter(x_train, y_train; lab="data")
+ # # plot!(x_test, sinc; lab="true function")
+ # # plot!(x_test, ŷ; lab="prediction")
# We define the loss using the L2 norm, both
# for the data-fit term and for the regularization term

function loss(θ)
    ŷ = f(x_train, x_train, y_train, θ)
    return sum(abs2, y_train - ŷ) + exp(θ[4]) * norm(ŷ)
- end
-
- # ## Training the model
+ end;
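
# Written out, the objective is L(θ) = ‖y_train − ŷ‖² + exp(θ[4]) ‖ŷ‖,
# where ŷ = f(x_train, x_train, y_train, θ) is the in-sample prediction.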
+ # ### Training
+ # Setting an initial value and initializing the optimizer:
θ = log.([1.1, 0.1, 0.01, 0.001]) # Initial vector
- opt = Optimise.ADAGrad(0.5)
+ opt = Optimise.ADAGrad(0.5);
- # The loss with our starting point :
+ # The loss with our starting point:

loss(θ)

- # Cost for one step
+ # Computational cost for one step

@benchmark let θt = θ[:], optt = Optimise.ADAGrad(0.5)
-     grads = only((Zygote.gradient(loss, θt))) # We compute the gradients given the kernel parameters and regularization
+     grads = only((Zygote.gradient(loss, θt)))
    Optimise.update!(optt, θt, grads)
end
- # The optimization
+ # Optimizing

- anim = Animation()
+ # # anim = Animation()
for i in 1:25
-     grads = only((Zygote.gradient(loss, θ))) # We compute the gradients given the kernel parameters and regularization
+     grads = only((Zygote.gradient(loss, θ)))
    Optimise.update!(opt, θ, grads)
-     scatter(
-         x_train, y_train; lab="data", title="i = $(i), Loss = $(round(loss(θ), digits = 4))"
-     )
-     plot!(x_test, sinc; lab="true function")
-     plot!(x_test, f(x_test, x_train, y_train, θ); lab="Prediction", lw=3.0)
-     frame(anim)
- end
- gif(anim)
+ end;
+ # # scatter(
+ # # x_train, y_train; lab="data", title="i = $(i), Loss = $(round(loss(θ), digits = 4))"
+ # # )
+ # # plot!(x_test, sinc; lab="true function")
+ # # plot!(x_test, f(x_test, x_train, y_train, θ); lab="Prediction", lw=3.0)
+ # # frame(anim)
+ # # end
+ # # gif(anim)
# Final loss
loss(θ)
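
# The optimized log-parameters θ can be mapped back to the natural (positive) scale to inspect
# the learned kernel variances, transform scale, and noise variance:
exp.(θ)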
- # ### ParameterHandling.jl
+
+ # ## Using ParameterHandling.jl
# Alternatively, we can use the [ParameterHandling.jl](https://github.com/invenia/ParameterHandling.jl) package
# to handle the requirement that all kernel parameters should be positive.

+ using ParameterHandling
+
raw_initial_θ = (
    k1 = positive(1.1),
    k2 = positive(0.1),
@@ -127,20 +128,20 @@ flat_θ, unflatten = ParameterHandling.value_flatten(raw_initial_θ);
function kernelcall(θ)
    return (θ.k1 * SqExponentialKernel() + θ.k2 * Matern32Kernel()) ∘
           ScaleTransform(θ.k3)
- end
+ end;
function f(x, x_train, y_train, θ)
    k = kernelcall(θ)
    return kernelmatrix(k, x, x_train) *
           ((kernelmatrix(k, x_train) + θ.noise_var * I) \ y_train)
- end
+ end;
function loss(θ)
    ŷ = f(x_train, x_train, y_train, θ)
    return sum(abs2, y_train - ŷ) + θ.noise_var * norm(ŷ)
- end
+ end;

- initial_θ = ParameterHandling.value(raw_initial_θ)
+ initial_θ = ParameterHandling.value(raw_initial_θ);
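
# `value_flatten` also returned `unflatten`, which maps the unconstrained flat vector back to a
# NamedTuple of positive parameter values; applying it to `flat_θ` should reproduce `initial_θ`:
unflatten(flat_θ)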
# The loss with our starting point:
@@ -151,86 +152,76 @@ initial_θ = ParameterHandling.value(raw_initial_θ)
# ### Cost per step

@benchmark let θt = flat_θ[:], optt = Optimise.ADAGrad(0.5)
-     grads = (Zygote.gradient(loss ∘ unflatten, θt))[1] # We compute the gradients given the kernel parameters and regularization
+     grads = (Zygote.gradient(loss ∘ unflatten, θt))[1]
    Optimise.update!(optt, θt, grads)
end
+ # ### Complete optimization
+
opt = Optimise.ADAGrad(0.5)
for i in 1:25
-     grads = (Zygote.gradient(loss ∘ unflatten, flat_θ))[1] # We compute the gradients given the kernel parameters and regularization
+     grads = (Zygote.gradient(loss ∘ unflatten, flat_θ))[1]
    Optimise.update!(opt, flat_θ, grads)
- end
+ end;

# Final loss

(loss ∘ unflatten)(flat_θ)
- # ## Method 2: Functor
- # An alternative method is to use tools from Flux.jl.
+ # ## Flux.destructure
+ # If we don't want to write an explicit function to construct the kernel, we can alternatively use the `Flux.destructure` function.
+ # Again, we need to ensure that the parameters are positive. Note that the `exp` function now has to be in a different position.

- # raw_initial_θ = (
- # k1 = positive(1.1),
- # k2 = positive(0.1),
- # k3 = positive(0.01),
- # noise_var=positive(0.001),
- # )
- k1 = [1.1]
- k2 = [0.1]
- k3 = [0.01]
- noise_var = log.([0.001])

- kernel = (ScaledKernel(SqExponentialKernel(), relu.(k1)) + ScaledKernel(Matern32Kernel(), k2)) ∘
-          ScaleTransform(map(exp, k3))
+ θ = [1.1, 0.1, 0.01, 0.001]

- θ = Flux.params(k1, k2, k3, noise_var)
+ kernel = (θ[1] * SqExponentialKernel() + θ[2] * Matern32Kernel()) ∘
+          ScaleTransform(θ[3])

- # kernel = (ScaledKernel(SqExponentialKernel(), softplus(θ[1])) + ScaledKernel(Matern32Kernel(), θ[2])) ∘
- # ScaleTransform(θ[3])
+ p, kernelc = Flux.destructure(kernel);
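
# `Flux.destructure` collects the trainable parameters of `kernel` into the flat vector `p`
# (here the two kernel variances and the transform scale) and returns `kernelc`, a function
# that rebuilds a kernel of the same structure from such a vector, for example:
kernelc(p)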
- # This next
+ # From theory we know the prediction for a test set x, given
+ # the kernel parameters and the regularization (noise) constant, as before.

- # function loss2()
- # ŷ = kernelmatrix(kernel, x_train, x_train) * ((kernelmatrix(kernel, x_train) + θ[4][1] * I) \ y_train)
- # return sum(abs2, y_train - ŷ) + θ[4][1] * norm(ŷ)
- # end
+ function f(x, x_train, y_train, θ)
+     k = kernelc(θ[1:3])
+     return kernelmatrix(k, x, x_train) *
+            ((kernelmatrix(k, x_train) + θ[4] * I) \ y_train)
+ end;
- function loss()
-     ŷ = kernelmatrix(kernel, x_train, x_train) * ((kernelmatrix(kernel, x_train)) \ y_train)
-     return sum(abs2, y_train - ŷ) + only(exp.(noise_var) .* norm(ŷ))
- end

- function f(x, x_train, y_train)
-     return kernelmatrix(kernel, x, x_train) *
-            ((kernelmatrix(kernel, x_train) + only(exp.(noise_var)) * I) \ y_train)
- end
+ # We define the loss using the L2 norm, both
+ # for the data-fit term and for the regularization term

+ function loss(θ)
+     ŷ = f(x_train, x_train, y_train, exp.(θ))
+     return sum(abs2, y_train - ŷ) + exp(θ[4]) * norm(ŷ)
+ end;
- grads = Flux.gradient(loss, θ)
- for p in θ
-     println(grads[p])
- end
+ # ## Training the model
+
+ # The loss with our starting point:
+ θ = log.([1.1, 0.1, 0.01, 0.001]) # Initial vector
+ loss(θ)

+ # Initialize optimizer

- grads = Flux.gradient(loss, θ)
+ opt = Optimise.ADAGrad(0.5)
- η = 0.1 # Learning Rate
- opt = Optimise.ADAGrad(η)
- # for p in θ
- #     update!(p, η * grads[p])
- # end
+ # Cost for one step
+
+ @benchmark let θt = θ[:], optt = Optimise.ADAGrad(0.5)
+     grads = only((Zygote.gradient(loss, θt))) # We compute the gradients given the kernel parameters and regularization
+     Optimise.update!(optt, θt, grads)
+ end
+
+ # The optimization

- anim = Animation()
for i in 1:25
+     grads = only((Zygote.gradient(loss, θ))) # We compute the gradients given the kernel parameters and regularization
    Optimise.update!(opt, θ, grads)
-     println(θ)
-
-     scatter(
-         x_train, y_train; lab="data", title="i = $(i), Loss = $(round(loss(), digits = 4))"
-     )
-     plot!(x_test, sinc; lab="true function")
-     plot!(x_test, f(x_test, x_train, y_train); lab="Prediction", lw=3.0)
-     frame(anim)
- end
+ end;

- gif(anim)
+ # Final loss
+ loss(θ)