@@ -28,7 +28,6 @@ S = SpectralDensity(ker, 1);
2828# ## ShiftedRFF
2929# The [`ShiftedRFF`](@ref) feature function is somewhat more common, and has
3030# been used in papers such as [Efficiently sampling functions from Gaussian process posteriors](https://proceedings.mlr.press/v119/wilson20a.html).
31- #
3231# It is defined as
3332# ```math
3433# \varphi_i(x) = \sqrt{2 / l} \cos(2 π ((w_i^T x) + b_i))
@@ -56,14 +55,12 @@ DisplayAs.PNG(f) #hide #md
5655# ## DoubleRFF
5756# The `DoubleRFF` feature function is less common, but is theoretically
5857# equivalent to the `ShiftedRFF` feature function.
59- #
6058# It is defined as
6159# ```math
6260# \varphi(x) = \sqrt{1 / l} \begin{pmatrix} \cos(2 π w' x) & \sin(2 π w' x) \end{pmatrix}
6361# ```
6462# where $w'$ is sampled from the spectral density $S$,
6563# with a total of $l/2$ sampled frequencies.
66- #
6764# Here, each function is effectively two feature functions in one,
6865# so specifying $l$ will result in $l/2$ samples but an $l$ dimensional
6966# feature vector.
10299DisplayAs. PNG (f) # hide #md
103100
104101# Clearly this is not quite correct, and we can quantify this
105- # by checking the error
102+ # by checking the error.
106103
107104norm (ker .(0 , x_plot) .- kt .(0 , x_plot))
108105
109- # Fortunately, we can improve the
110- # approximation by using more features
106+ # Fortunately, we can improve the approximation by using more features,
111107
112108rff1000 = ShiftedRFF (S, 5000 )
113109kt1000 (x, y) = dot (rff1000 (x), rff1000 (y))
114110
115111lines! (ax, x_plot, kt1000 .(0 , x_plot); label= " KT, l=1000" )
112+ axislegend (ax)
116113f
117114DisplayAs. PNG (f) # hide #md
118115
119- # We also see that the error reduces
116+ # which also reduces the error.
120117norm (ker .(0 , x_plot) .- kt1000 .(0 , x_plot))
121118
122119# ## Comparing the RFFs
123- # We can use to compare the two feature functions
120+ # In the section above we used the `ShiftedRFF` feature function, but what about the `DoubleRFF`?
121+ # Let's compare the two! First we define some helper functions.
124122
125123function kt_error (ker, rff, S, l, x)
126124 rff = rff (S, l)
@@ -132,6 +130,8 @@ function mean_kt_error(ker, rff, S, l, x, n)
132130 return mean ([kt_error (ker, rff, S, l, x) for _ in 1 : n])
133131end
134132
133+ # Now we compute the mean error for both feature functions, using 100 features when recovering
134+ # the original kernel. To reduce the effect of randomness, we average over 5000 runs.
135135srff_err = mean_kt_error (ker, ShiftedRFF, S, 100 , x_plot, 5000 )
136136# -
137137drff_err = mean_kt_error (ker, DoubleRFF, S, 100 , x_plot, 5000 )
@@ -145,10 +145,9 @@ drff_err = mean_kt_error(ker, DoubleRFF, S, 1000, x_plot, 5000)
145145# ## Comparison, continued
146146# Lastly, we show a loglog plot of the mean error as a function
147147# of the number of features.
148- #
149- # We see that the error reduction with the number of features is
150- # the same for both options, but the DualRFF error is consistently
151- # lower.
148+ # We see that both feature functions have the same order of error scaling,
149+ # but the DoubleRFF error has a small offset, resulting in a lower error.
150+ # This is especially impactful for a small number of features.
152151l_plot = [10 , 50 , 100 , 500 , 1000 ]
153152srff_comp = [mean_kt_error (ker, ShiftedRFF, S, l, x_plot, 5000 ) for l in l_plot]
154153drff_comp = [mean_kt_error (ker, DoubleRFF, S, l, x_plot, 5000 ) for l in l_plot]
0 commit comments