@@ -37,6 +37,10 @@ function default_bandwidth(data::RealVector, alpha::Float64 = 0.9)
37
37
return alpha * width * ndata^ (- 0.2 )
38
38
end
39
39
40
# Weights used when the caller supplies none: a `UniformWeights` object
# built from the number of observations in `data`.
default_weights(data::RealVector) = UniformWeights(length(data))
43
+
40
44
41
45
# Roughly based on:
42
46
# B. W. Silverman (1982) "Algorithm AS 176: Kernel Density Estimation Using
@@ -66,23 +70,32 @@ function kde_range(boundary::Tuple{Real,Real}, npoints::Int)
66
70
lo: step: hi
67
71
end
68
72
73
# Field-less stand-in for a vector of equal weights. The count `N` is carried
# as a type parameter, so no storage is allocated for the weights themselves.
immutable UniformWeights{N} end

# Convenience constructor: lift the count `n` into the type parameter.
function UniformWeights(n)
    return UniformWeights{n}()
end

# The total of the weights is reported as one, independent of `N`.
function Base.sum(x::UniformWeights)
    return 1.0
end

# Every lookup yields the same value, 1/N; the index `i` is not inspected.
function Base.getindex{N}(x::UniformWeights{N}, i)
    return 1 / N
end
79
+
80
# Union of everything accepted as observation weights in this file: the
# implicit `UniformWeights`, a plain `RealVector`, or a `WeightVec`.
# NOTE(review): `WeightVec` is presumably the StatsBase weight vector — confirm
# against the module's imports.
+ typealias Weights Union{UniformWeights, RealVector, WeightVec}
81
+
82
+
69
83
# tabulate data for kde
70
- function tabulate (data:: RealVector , midpoints:: Range )
71
- ndata = length (data)
84
+ function tabulate (data:: RealVector , midpoints:: Range , weights:: Weights = default_weights (data))
72
85
npoints = length (midpoints)
73
86
s = step (midpoints)
74
87
75
88
# Set up a grid for discretized data
76
89
grid = zeros (Float64, npoints)
77
- ainc = 1.0 / (ndata * s* s)
90
+ ainc = 1.0 / (sum (weights) * s* s)
78
91
79
92
# weighted discretization (cf. Jones and Lotwick)
80
- for x in data
93
+ for (i,x) in enumerate ( data)
81
94
k = searchsortedfirst (midpoints,x)
82
95
j = k- 1
83
96
if 1 <= j <= npoints- 1
84
- grid[j] += (midpoints[k]- x)* ainc
85
- grid[k] += (x- midpoints[j])* ainc
97
+ grid[j] += (midpoints[k]- x)* ainc* weights[i]
98
+ grid[k] += (x- midpoints[j])* ainc* weights[i]
86
99
end
87
100
end
88
101
@@ -119,30 +132,30 @@ function conv(k::UnivariateKDE, dist::UnivariateDistribution)
119
132
end
120
133
121
134
# main kde interface methods
122
# Core univariate estimator.
#
# Arguments:
#   data      - observations
#   weights   - per-observation weights (see `Weights`)
#   midpoints - grid on which `data` is tabulated
#   dist      - kernel distribution
#
# Tabulates `data` on `midpoints` using `weights`, then hands the tabulated
# result to `conv` together with `dist` and returns what `conv` returns.
function kde(data::RealVector, weights::Weights, midpoints::Range, dist::UnivariateDistribution)
    k = tabulate(data, midpoints, weights)
    return conv(k, dist)
end

# Unweighted positional form that existed before `weights` was introduced.
# Kept so that existing `kde(data, midpoints, dist)` callers keep working; it
# forwards to the weighted method using `default_weights(data)`.
function kde(data::RealVector, midpoints::Range, dist::UnivariateDistribution)
    return kde(data, default_weights(data), midpoints, dist)
end
126
139
127
140
# Density estimate of `data` for an explicit kernel distribution `dist`.
#
# Keywords:
#   boundary - (lo, hi) pair passed to `kde_range`; defaults to
#              `kde_boundary(data, std(dist))`
#   npoints  - grid size passed to `kde_range` (default 2048)
#   weights  - observation weights; defaults to `default_weights(data)`
#
# Builds the grid with `kde_range` and delegates to the positional
# `kde(data, weights, midpoints, dist)` method.
function kde(data::RealVector, dist::UnivariateDistribution;
             boundary::Tuple{Real,Real}=kde_boundary(data, std(dist)),
             npoints::Int=2048,
             weights=default_weights(data))
    grid = kde_range(boundary, npoints)
    return kde(data, weights, grid, dist)
end
133
146
134
147
# Density estimate of `data` on a caller-supplied grid `midpoints`.
#
# Keywords:
#   bandwidth - must be positive; defaults to `default_bandwidth(data)`
#   kernel    - kernel family passed to `kernel_dist` (default `Normal`)
#   weights   - observation weights; defaults to `default_weights(data)`
#
# Raises an error when `bandwidth > 0.0` does not hold.
function kde(data::RealVector, midpoints::Range;
             bandwidth=default_bandwidth(data),
             kernel=Normal,
             weights=default_weights(data))
    if !(bandwidth > 0.0)
        error(" Bandwidth must be positive")
    end
    return kde(data, weights, midpoints, kernel_dist(kernel, bandwidth))
end
140
153
141
154
# Density estimate of `data` with every setting taken from keywords.
#
# Keywords:
#   bandwidth - must be positive; defaults to `default_bandwidth(data)`
#   kernel    - kernel family passed to `kernel_dist` (default `Normal`)
#   npoints   - grid size (default 2048)
#   boundary  - (lo, hi) pair; defaults to `kde_boundary(data, bandwidth)`
#   weights   - observation weights; defaults to `default_weights(data)`
#
# Raises an error when `bandwidth > 0.0` does not hold, otherwise builds the
# kernel distribution and forwards to `kde(data, dist; ...)`.
function kde(data::RealVector;
             bandwidth=default_bandwidth(data),
             kernel=Normal,
             npoints::Int=2048,
             boundary::Tuple{Real,Real}=kde_boundary(data, bandwidth),
             weights=default_weights(data))
    if !(bandwidth > 0.0)
        error(" Bandwidth must be positive")
    end
    kernel_distribution = kernel_dist(kernel, bandwidth)
    return kde(data, kernel_distribution;
               boundary=boundary, npoints=npoints, weights=weights)
end
147
160
148
161
# Select bandwidth using least-squares cross validation, from:
@@ -152,10 +165,11 @@ end
152
165
153
166
function kde_lscv (data:: RealVector , midpoints:: Range ;
154
167
kernel= Normal,
155
- bandwidth_range:: Tuple{Real,Real} = (h= default_bandwidth (data); (0.25 * h,1.5 * h)))
168
+ bandwidth_range:: Tuple{Real,Real} = (h= default_bandwidth (data); (0.25 * h,1.5 * h)),
169
+ weights= default_weights (data))
156
170
157
171
ndata = length (data)
158
- k = tabulate (data, midpoints)
172
+ k = tabulate (data, midpoints, weights )
159
173
160
174
# the ft here is K/ba*sqrt(2pi) * u(s), it is K times the Yl in Silverman's book
161
175
K = length (k. density)
@@ -194,8 +208,9 @@ function kde_lscv(data::RealVector;
194
208
boundary:: Tuple{Real,Real} = kde_boundary (data,default_bandwidth (data)),
195
209
npoints:: Int = 2048 ,
196
210
kernel= Normal,
197
- bandwidth_range:: Tuple{Real,Real} = (h= default_bandwidth (data); (0.25 * h,1.5 * h)))
211
+ bandwidth_range:: Tuple{Real,Real} = (h= default_bandwidth (data); (0.25 * h,1.5 * h)),
212
+ weights:: Weights = default_weights (data))
198
213
199
214
midpoints = kde_range (boundary,npoints)
200
- kde_lscv (data,midpoints; kernel= kernel, bandwidth_range= bandwidth_range)
215
+ kde_lscv (data,midpoints; kernel= kernel, bandwidth_range= bandwidth_range, weights = weights )
201
216
end
0 commit comments