|
1 | 1 | # Store both grid and density for KDE over R2
|
2 |
| -immutable BivariateKDE |
3 |
| - x::Vector{Float64} |
4 |
| - y::Vector{Float64} |
# Bivariate kernel density estimate on a rectangular grid.
#   x, y    -- ranges holding the grid midpoints along each axis
#   density -- matrix of density values; the constructors visible in this file
#              build it with size (length(x), length(y))
immutable BivariateKDE{Rx<:Range,Ry<:Range}
    x::Rx
    y::Ry
    density::Matrix{Float64}
end
|
7 | 7 |
|
# Build one kernel distribution per axis from a single kernel family `D`.
# `w` holds the (x, y) bandwidths; each is forwarded to the univariate
# `kernel_dist` method. Returns a 2-tuple of distributions.
function kernel_dist{D<:UnivariateDistribution}(::Type{D},w::(Real,Real))
    wx, wy = w
    return (kernel_dist(D,wx), kernel_dist(D,wy))
end
# Build one kernel distribution per axis when a separate kernel family is
# given for each axis as the tuple type `(Dx,Dy)`. `w` holds the (x, y)
# bandwidths. Returns a 2-tuple of distributions.
function kernel_dist{Dx<:UnivariateDistribution,Dy<:UnivariateDistribution}(::Type{(Dx,Dy)},w::(Real,Real))
    wx, wy = w
    return (kernel_dist(Dx,wx), kernel_dist(Dy,wy))
end
8 | 14 |
|
9 |
| -# Algorithm from MASS Chapter 5 for calculating 2D KDE |
10 |
| -function kde(x::RealVector, y::RealVector; width::Float64=NaN, resolution::Int=25) |
11 |
| - n = length(x) |
| 15 | +# TODO: there are probably better choices. |
# Default bandwidths for paired data: apply the univariate
# `default_bandwidth` to each coordinate vector independently and return the
# two results as a tuple (x bandwidth, y bandwidth).
function default_bandwidth(data::(RealVector,RealVector))
    xdata, ydata = data
    return (default_bandwidth(xdata), default_bandwidth(ydata))
end
12 | 19 |
|
13 |
| - if length(y) != n |
14 |
| - error("x and y must have the same length") |
15 |
| - end |
| 20 | +# tabulate data for kde |
# Tabulate paired observations onto a regular 2-D grid, producing an
# un-convolved (un-smoothed) BivariateKDE.
#
# Arguments:
#   data      -- tuple (xdata, ydata) of equal-length coordinate vectors
#   midpoints -- tuple (xmid, ymid) of ranges giving the grid midpoints
#
# Each observation strictly inside the grid is split between the four
# surrounding grid nodes by weighted discretization (cf. Jones and Lotwick).
# Observations that do not lie between two consecutive midpoints on both axes
# are skipped.
#
# Raises an error if the two data vectors differ in length.
function tabulate(data::(RealVector, RealVector), midpoints::(Range, Range))
    xdata, ydata = data
    ndata = length(xdata)
    length(ydata) == ndata || error("data vectors must be of same length")

    xmid, ymid = midpoints
    nx, ny = length(xmid), length(ymid)
    sx, sy = step(xmid), step(ymid)

    # Set up a grid for discretized data
    grid = zeros(Float64, nx, ny)

    # The four weights below are products of distances and sum to sx*sy, so one
    # factor of 1/(sx*sy) normalises them to sum to one; the second factor
    # turns mass per cell into density, and 1/ndata averages over observations.
    ainc = 1.0 / (ndata*(sx*sy)^2)

    # weighted discretization (cf. Jones and Lotwick)
    for (x, y) in zip(xdata,ydata)
        # kx/ky index the first midpoint >= the observation; jx/jy the one below.
        kx, ky = searchsortedfirst(xmid,x), searchsortedfirst(ymid,y)
        jx, jy = kx-1, ky-1

        # searchsortedfirst returns nx+1 (ny+1) for values above the last
        # midpoint, so the lower index must stop at nx-1 (ny-1) to keep the
        # upper index kx (ky) inside the grid.
        if 1 <= jx <= nx-1 && 1 <= jy <= ny-1
            grid[jx,jy] += (xmid[kx]-x)*(ymid[ky]-y)*ainc
            grid[kx,jy] += (x-xmid[jx])*(ymid[ky]-y)*ainc
            grid[jx,ky] += (xmid[kx]-x)*(y-ymid[jy])*ainc
            grid[kx,ky] += (x-xmid[jx])*(y-ymid[jy])*ainc
        end
    end

    # returns an un-convolved KDE
    BivariateKDE(xmid, ymid, grid)
end
| 49 | + |
| 50 | +# convolution with the product distribution of two univariate distributions |
# Smooth a tabulated BivariateKDE by convolving it with a product kernel.
#
# Arguments:
#   k    -- tabulated density on the grid (k.x, k.y)
#   dist -- tuple (distx, disty) of univariate kernel distributions
#
# The density is transformed with a real FFT, each coefficient is multiplied
# by the kernels' characteristic functions at the matching frequencies, and
# the result is transformed back. Returns a new BivariateKDE on the same grid.
function conv(k::BivariateKDE, dist::(UnivariateDistribution,UnivariateDistribution) )
    # Transform to Fourier basis
    Kx, Ky = size(k.density)
    ft = rfft(k.density)

    distx, disty = dist

    # Frequency spacing along each axis.
    cx = -twoπ/(step(k.x)*Kx)
    cy = -twoπ/(step(k.y)*Ky)

    # The x factor depends only on the row index, so evaluate it once per row
    # instead of once per (row, column) cell.
    cfx = [cf(distx,(i-1)*cx) for i = 1:size(ft,1)]

    # Convolve fft with characteristic function of kernel
    for j = 1:size(ft,2)
        # The y factor depends only on the column index; hoisted out of the
        # inner loop.
        # NOTE(review): min(j-1,Ky-j+1) uses the unsigned frequency for the
        # upper half of the columns; this looks equivalent only for kernels
        # whose characteristic function is even -- confirm before passing
        # asymmetric kernels.
        cfy = cf(disty,min(j-1,Ky-j+1)*cy)
        for i = 1:size(ft,1)
            ft[i,j] *= cfx[i]*cfy
        end
    end

    # Invert the Fourier transform to get the KDE
    BivariateKDE(k.x, k.y, irfft(ft, Kx))
end
| 70 | + |
# Kernel specification accepted by the bivariate `kde` methods: either a
# multivariate distribution or a tuple of two univariate distributions.
# NOTE(review): the only `conv` method visible here takes the tuple form --
# confirm that the MultivariateDistribution case is handled elsewhere.
typealias BivariateDistribution Union(MultivariateDistribution,(UnivariateDistribution,UnivariateDistribution))
| 72 | + |
# Core bivariate KDE: tabulate `data` on the grid given by `midpoints`, then
# smooth the tabulated density with the kernel `dist`.
function kde(data::(RealVector, RealVector), midpoints::(Range, Range), dist::BivariateDistribution)
    return conv(tabulate(data,midpoints), dist)
end
| 77 | + |
# Bivariate KDE with an explicit kernel `dist` and an automatically built grid.
#
# Keywords:
#   boundary -- ((xlo,xhi),(ylo,yhi)) extent of the grid; by default derived
#               from each data vector and the standard deviation of the
#               matching component of `dist` via `kde_boundary`
#   npoints  -- number of grid points along each axis (default 128 x 128)
function kde(data::(RealVector, RealVector), dist::BivariateDistribution;
             boundary::((Real,Real),(Real,Real)) = (kde_boundary(data[1],std(dist[1])),
                                                    kde_boundary(data[2],std(dist[2]))),
             npoints::(Int,Int)=(128,128))

    xbounds, ybounds = boundary
    nxpts, nypts = npoints

    grid = (kde_range(xbounds,nxpts), kde_range(ybounds,nypts))

    return kde(data,grid,dist)
end
| 88 | + |
# Bivariate KDE on a caller-supplied grid, with the kernel described by a
# kernel family and bandwidths rather than by distribution objects.
#
# Keywords:
#   bandwidth -- (x, y) bandwidths; defaults to `default_bandwidth(data)`
#   kernel    -- kernel family passed to `kernel_dist`; defaults to Normal
function kde(data::(RealVector, RealVector), midpoints::(Range, Range);
             bandwidth=default_bandwidth(data), kernel=Normal)

    return kde(data,midpoints,kernel_dist(kernel,bandwidth))
end
| 95 | + |
# Bivariate KDE with everything defaulted from the data.
#
# Keywords:
#   bandwidth -- (x, y) bandwidths; defaults to `default_bandwidth(data)`
#   kernel    -- kernel family passed to `kernel_dist`; defaults to Normal
#   boundary  -- ((xlo,xhi),(ylo,yhi)) extent of the grid; by default derived
#                from each data vector and its bandwidth via `kde_boundary`
#   npoints   -- number of grid points along each axis (default 128 x 128)
function kde(data::(RealVector, RealVector);
             bandwidth=default_bandwidth(data),
             kernel=Normal,
             boundary::((Real,Real),(Real,Real)) = (kde_boundary(data[1],bandwidth[1]),
                                                    kde_boundary(data[2],bandwidth[2])),
             npoints::(Int,Int)=(128,128))

    # Build the kernel first, then the grid, then delegate to the core method.
    dist = kernel_dist(kernel,bandwidth)

    xbounds, ybounds = boundary
    nxpts, nypts = npoints
    grid = (kde_range(xbounds,nxpts), kde_range(ybounds,nypts))

    return kde(data,grid,dist)
end
44 | 109 |
|
45 |
| - return BivariateKDE(grid_x, grid_y, z) |
| 110 | +# matrix data |
# Matrix convenience method: treat the two columns of `data` as the x and y
# coordinate vectors and forward all remaining positional and keyword
# arguments to the tuple-based `kde` methods.
# Raises an error unless `data` has exactly two columns.
function kde(data::RealMatrix,args...;kwargs...)
    if size(data,2) != 2
        error("Can only construct KDE from matrices with 2 columns.")
    end
    xcol, ycol = data[:,1], data[:,2]
    return kde((xcol,ycol),args...;kwargs...)
end
|
0 commit comments