# # Kernel Ridge Regression

# In this example we show the two main methods for performing kernel ridge regression
# with a parametrized kernel from KernelFunctions.jl.

# ## We load KernelFunctions and some other packages

using KernelFunctions
using LinearAlgebra
using Plots
using Zygote
using Flux: Optimise
using ForwardDiff
using Random: seed!
seed!(42)
using ParameterHandling

# ## Data Generation
# We generate data in one dimension.
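
# Below is a minimal sketch of the data generation, assuming noisy observations of
# the `sinc` function on a fixed interval; the bounds, sample size, and noise level
# are illustrative choices.

xmin, xmax = -3.0, 3.0                      # bounds of the data (assumed)
N = 50                                      # number of training samples (assumed)
x_train = (xmax - xmin) .* rand(N) .+ xmin  # inputs sampled uniformly in [xmin, xmax]
σ = 0.1                                     # observation noise level (assumed)
y_train = sinc.(x_train) .+ σ .* randn(N)   # noisy sinc targets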
x_test = range(xmin - 0.1, xmax + 0.1; length=300)

scatter(x_train, y_train; lab="data")
plot!(x_test, sinc; lab="true function")

# ## Kernel training, Method 1
# The first method is to rebuild the parametrized kernel from a vector of parameters
# in each evaluation of the cost function. This is similar to the approach taken in
# [Stheno.jl](https://github.com/JuliaGaussianProcesses/Stheno.jl).

# ### Simplest approach
# A simple way to ensure that the kernel parameters stay positive
# is to optimize over the logarithm of the parameters.

# To train the kernel parameters via Zygote.jl
# we need a function that builds the kernel from an array of parameters.

function kernelcall(θ)
    return (exp(θ[1]) * SqExponentialKernel() + exp(θ[2]) * Matern32Kernel()) ∘
           ScaleTransform(exp(θ[3]))
end
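
# From the kernel we can compute the kernel ridge regression prediction
# k(x, X)(K + σ²I)⁻¹y. The function below is a sketch that mirrors the
# ParameterHandling version further down, assuming exp(θ[4]) plays the role
# of the noise variance σ².

function f(x, x_train, y_train, θ)
    k = kernelcall(θ)
    return kernelmatrix(k, x, x_train) *
           ((kernelmatrix(k, x_train) + exp(θ[4]) * I) \ y_train)
end

# The prediction with the starting parameters log.(ones(4)):

ŷ = f(x_test, x_train, y_train, log.(ones(4)))
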
scatter(x_train, y_train; lab="data")
plot!(x_test, sinc; lab="true function")
plot!(x_test, ŷ; lab="prediction")

# We define the loss based on the L2 norm, used both
# for the data-fit term and for the regularization.

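# The loss below is a sketch matching the ParameterHandling version further down:
# the squared residuals plus a noise-scaled penalty on the prediction, again
# assuming exp(θ[4]) is the noise variance.

function loss(θ)
    ŷ = f(x_train, x_train, y_train, θ)
    return sum(abs2, y_train - ŷ) + exp(θ[4]) * norm(ŷ)
end

# The loss with our starting point:
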
loss(log.(ones(4)))

# ## Training the model

θ = log.([1.1, 0.1, 0.01, 0.001]) # Initial vector of log-parameters
anim = Animation()
opt = Optimise.ADAGrad(0.5)
for i in 1:30
    println(i)
    grads = only(Zygote.gradient(loss, θ)) # We compute the gradients given the kernel parameters and regularization
    Optimise.update!(opt, θ, grads)
    scatter(
        x_train, y_train; lab="data", title="i = $(i), Loss = $(round(loss(θ); digits=4))"
    )
    plot!(x_test, sinc; lab="true function")
    plot!(x_test, f(x_test, x_train, y_train, θ); lab="prediction", lw=3.0)
    frame(anim)
end
gif(anim)

# ### ParameterHandling.jl
# Alternatively, we can use the [ParameterHandling.jl](https://github.com/invenia/ParameterHandling.jl)
# package to handle the constraint that all parameters must be positive.

raw_initial_θ = (
    k1=positive(1.1),
    k2=positive(0.1),
    k3=positive(0.01),
    noise_var=positive(0.001),
)

flat_θ, unflatten = ParameterHandling.value_flatten(raw_initial_θ);

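# Here `positive` marks each parameter as constrained to be positive, and
# `value_flatten` returns a flat vector of unconstrained parameters together with an
# `unflatten` function mapping such a vector back to a named tuple of positive values,
# so the optimizer can work on `flat_θ` without worrying about the constraints.
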
function kernelcall(θ)
    return (θ.k1 * SqExponentialKernel() + θ.k2 * Matern32Kernel()) ∘
           ScaleTransform(θ.k3)
end

function f(x, x_train, y_train, θ)
    k = kernelcall(θ)
    return kernelmatrix(k, x, x_train) *
           ((kernelmatrix(k, x_train) + θ.noise_var * I) \ y_train)
end
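
# `f` implements the kernel ridge regression predictor
# k(x, X)(K + σ²I)⁻¹y, with `θ.noise_var` playing the role of σ².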

function loss(θ)
    ŷ = f(x_train, x_train, y_train, θ)
    return sum(abs2, y_train - ŷ) + θ.noise_var * norm(ŷ)
end

initial_θ = ParameterHandling.value(raw_initial_θ)

# The loss with our starting point:

loss(initial_θ)

# ## Training the model

anim = Animation()
opt = Optimise.ADAGrad(0.5)
for i in 1:30
    println(i)
    grads = only(Zygote.gradient(loss ∘ unflatten, flat_θ)) # We compute the gradients given the kernel parameters and regularization
    Optimise.update!(opt, flat_θ, grads)
    scatter(
        x_train, y_train; lab="data", title="i = $(i), Loss = $(round((loss ∘ unflatten)(flat_θ); digits=4))"
    )
    plot!(x_test, sinc; lab="true function")
    plot!(x_test, f(x_test, x_train, y_train, unflatten(flat_θ)); lab="prediction", lw=3.0)
    frame(anim)
end
gif(anim)