Skip to content

Commit 26947bc

Browse files
authored
Fix handling of -0.0 in histograms (#768)
`searchsortedfirst` and `searchsortedlast` use `isless` for comparisons, and since `isless(-0.0, 0.0)` is `true` they treat `-0.0` as strictly smaller than `0.0`. As a result, these two values do not end up in the same bin when an edge is 0. This makes little sense statistically; worse, when an extreme (first or last) edge is 0, `-0.0` is not counted at all. Fix this by replacing `-0.0` with `0.0` before the search.
1 parent a1b02d8 commit 26947bc

File tree

2 files changed

+65
-3
lines changed

2 files changed

+65
-3
lines changed

src/hist.jl

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,14 @@ mutable struct Histogram{T<:Real,N,E} <: AbstractHistogram{T,N,E}
191191
closed == :right || closed == :left || error("closed must :left or :right")
192192
isdensity && !(T <: AbstractFloat) && error("Density histogram must have float-type weights")
193193
_edges_nbins(edges) == size(weights) || error("Histogram edge vectors must be 1 longer than corresponding weight dimensions")
194+
# For performance reasons, `binindex` does not handle -0.0 in range edges correctly.
195+
# Constructing ranges starting or ending with -0.0 is very hard,
196+
# and constructing ranges containing -0.0 elsewhere is virtually impossible,
197+
# but check for it anyway, just in case, as the check is cheap.
198+
foreach(edges) do e
199+
e isa AbstractRange && any(isequal(-0.0), e) &&
200+
throw(ArgumentError("ranges containing -0.0 not allowed in edges"))
201+
end
194202
new{T,N,E}(edges,weights,closed,isdensity)
195203
end
196204
end
@@ -226,11 +234,25 @@ binindex(h::AbstractHistogram{T,1}, x::Real) where {T} = binindex(h, (x,))[1]
226234
binindex(h::Histogram{T,N}, xs::NTuple{N,Real}) where {T,N} =
227235
map((edge, x) -> _edge_binindex(edge, h.closed, x), h.edges, xs)
228236

237+
# Map negative zero to the positive zero of the same float type; any other value is returned unchanged.
_normalize_zero(x::AbstractFloat) = ifelse(isequal(x, -0.0), zero(x), x)
238+
# Fallback for every argument without a more specific method: return it as is.
_normalize_zero(x) = x
239+
240+
# Always treat -0.0 like 0.0
229241
@inline function _edge_binindex(edge::AbstractVector, closed::Symbol, x::Real)
230-
if closed == :right
231-
searchsortedfirst(edge, x) - 1
242+
if closed === :right
243+
return searchsortedfirst(edge, _normalize_zero(x), by=_normalize_zero) - 1
244+
else
245+
return searchsortedlast(edge, _normalize_zero(x), by=_normalize_zero)
246+
end
247+
end
248+
# Passing `by=_normalize_zero` for ranges would incur a large performance hit,
249+
# as it would force the use of the AbstractVector fallback.
250+
# This is not worth it given that it is very difficult to construct a range containing -0.0.
251+
@inline function _edge_binindex(edge::AbstractRange, closed::Symbol, x::Real)
252+
if closed === :right
253+
return searchsortedfirst(edge, _normalize_zero(x)) - 1
232254
else
233-
searchsortedlast(edge, x)
255+
return searchsortedlast(edge, _normalize_zero(x))
234256
end
235257
end
236258

test/hist.jl

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,4 +224,44 @@ end
224224
@test StatsBase.midpoints(range(0, stop = 1, length = 5)) == 0.125:0.25:0.875
225225
end
226226

227+
@testset "histogram with -0.0" begin
228+
@test fit(Histogram, [-0.0, 1.0]) == fit(Histogram, [0.0, 1.0])
229+
@test fit(Histogram, [-0.0, 1.0], closed=:right) ==
230+
fit(Histogram, [0.0, 1.0], closed=:right)
231+
@test fit(Histogram, [-0.0, -1.0]) == fit(Histogram, [0.0, -1.0])
232+
@test fit(Histogram, [-0.0, -1.0], closed=:right) ==
233+
fit(Histogram, [0.0, -1.0], closed=:right)
234+
235+
@test fit(Histogram, [-0.0, 1.0], [-0.0, 0.5]) ==
236+
fit(Histogram, [0.0, 1.0], [0.0, 0.5]) ==
237+
fit(Histogram, [-0.0, 1.0], [0.0, 0.5]) ==
238+
fit(Histogram, [0.0, 1.0], [-0.0, 0.5]) ==
239+
fit(Histogram, [0.0, 1.0], 0.0:0.5:0.5) ==
240+
fit(Histogram, [-0.0, 1.0], 0.0:0.5:0.5)
241+
@test fit(Histogram, [-0.0, 1.0], [-0.5, -0.0]) ==
242+
fit(Histogram, [0.0, 1.0], [-0.5, -0.0]) ==
243+
fit(Histogram, [-0.0, 1.0], [-0.5, 0.0]) ==
244+
fit(Histogram, [0.0, 1.0], [-0.5, 0.0]) ==
245+
fit(Histogram, [-0.0, 1.0], -0.5:0.5:0.0) ==
246+
fit(Histogram, [0.0, 1.0], -0.5:0.5:0.0)
247+
@test fit(Histogram, [-0.0, 1.0], [-0.5, -0.0], closed=:right) ==
248+
fit(Histogram, [0.0, 1.0], [-0.5, 0.0], closed=:right) ==
249+
fit(Histogram, [0.0, 1.0], -0.5:0.5:0.0, closed=:right)
250+
@test fit(Histogram, [-0.0, 1.0], [-0.0, 0.5], closed=:right) ==
251+
fit(Histogram, [0.0, 1.0], [0.0, 0.5], closed=:right) ==
252+
fit(Histogram, [0.0, 1.0], [-0.0, 0.5], closed=:right) ==
253+
fit(Histogram, [-0.0, 1.0], [0.0, 0.5], closed=:right) ==
254+
fit(Histogram, [0.0, 1.0], 0.0:0.5:0.5, closed=:right) ==
255+
fit(Histogram, [-0.0, 1.0], 0.0:0.5:0.5, closed=:right)
256+
@test fit(Histogram, [-0.0, 1.0], [-0.5, -0.0], closed=:right) ==
257+
fit(Histogram, [0.0, 1.0], [-0.5, 0.0], closed=:right) ==
258+
fit(Histogram, [0.0, 1.0], [-0.5, -0.0], closed=:right) ==
259+
fit(Histogram, [-0.0, 1.0], [-0.5, 0.0], closed=:right) ==
260+
fit(Histogram, [0.0, 1.0], -0.5:0.5:0.0, closed=:right) ==
261+
fit(Histogram, [-0.0, 1.0], -0.5:0.5:0.0, closed=:right)
262+
263+
@test_throws ArgumentError fit(Histogram, [-0.5], LinRange(-1.0, -0.0, 3))
264+
@test_throws ArgumentError fit(Histogram, [-0.5], UnitRange(-0.0, 1.0))
265+
end
266+
227267
end # @testset "StatsBase.Histogram"

0 commit comments

Comments
 (0)