Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/src/scalarstats.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ percentilerank
```@docs
mode
modes
hsm_mode
```

## Summary Statistics
Expand Down
47 changes: 47 additions & 0 deletions perf/hsm_mode.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
using BenchmarkTools
using StatsBase
using Random

println("=" ^ 60)
println("Half-Sample Mode (hsm_mode) Benchmarks")
println("=" ^ 60)

# Small dataset
println("\n1. Small dataset (n=100)")
Random.seed!(123)
small = randn(100)
@btime hsm_mode($small)

# Medium dataset
println("\n2. Medium dataset (n=1,000)")
Random.seed!(123)
medium = randn(1_000)
@btime hsm_mode($medium)

# Large dataset
println("\n3. Large dataset (n=10,000)")
Random.seed!(123)
large = randn(10_000)
@btime hsm_mode($large)

# Very large dataset
println("\n4. Very large dataset (n=100,000)")
Random.seed!(123)
very_large = randn(100_000)
@btime hsm_mode($very_large)

# With outliers
println("\n5. Dataset with outliers (n=1,000)")
Random.seed!(123)
with_outliers = vcat(randn(990), fill(1000.0, 10))
@btime hsm_mode($with_outliers)

# Skewed distribution
println("\n6. Skewed distribution (n=1,000)")
Random.seed!(123)
skewed = abs.(randn(1_000)) .^ 2
@btime hsm_mode($skewed)

println("\n" * "=" ^ 60)
println("Benchmark complete!")
println("=" ^ 60)
1 change: 1 addition & 0 deletions src/StatsBase.jl
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ export
middle, # the mean of two real numbers
mode, # find a mode from data (the first one)
modes, # find all modes from data
hsm_mode, # half-sample mode (robust estimator)

zscore, # compute Z-scores
zscore!, # compute Z-scores inplace or to a pre-allocated array
Expand Down
93 changes: 93 additions & 0 deletions src/scalarstats.jl
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,99 @@ function modes(a::AbstractVector, wv::AbstractWeights{T}) where T <: Real
return [x for (x, w) in weights if w == mw]
end


#############################
#
# Mode: Half-Sample Mode
#
#############################

"""
hsm_mode(a::AbstractVector{<:Real})

Compute the half-sample mode (HSM) of a collection of real numbers.

The half-sample mode is a robust estimator of central tendency that iteratively
identifies the densest half of the data until only two values remain, then returns
their midpoint. This estimator is particularly useful for skewed or multimodal
distributions where the traditional mode may be unreliable. Non-finite values
(NaN, Inf, -Inf) are automatically filtered.

Returns a floating-point value (promoted from input type). Throws `ArgumentError`
for empty vectors or vectors containing only non-finite values.

# Algorithm
The method works by iteratively finding the half of the data with minimum width:
1. Sort and filter non-finite values
2. While more than 2 elements remain, find the half-window with minimum width
3. Return the midpoint of the final two elements

Time complexity: O(n log n) due to sorting. Space complexity: O(n).

# Examples
```julia-repl
julia> hsm_mode([1, 2, 2, 3, 4, 4, 4, 5])
4.0

julia> hsm_mode([NaN, 2, 2, Inf, 3])
2.0

julia> hsm_mode([10.0])
10.0

julia> hsm_mode([1.0097, 1.0054, 1.003212, 1.0231, 1.344, 1.00003])
1.00003

julia> data = [1, 1, 1, 2, 2, 100]; # outlier present

julia> mode(data) # frequency-based mode
1

julia> hsm_mode(data) # density-based mode (finds densest region)
1.0
```

!!! note
This implementation does not support weighted observations or missing values.
Use `filter(!ismissing, data)` to handle missing values before calling.

# References
Robertson, T., & Cryer, J. D. (1974). "An iterative procedure for estimating
the mode." Journal of the American Statistical Association, 69(348), 1012-1016.

Bickel, D. R., & Fruehwirth, R. (2006). "On a fast, robust estimator of the mode:
Comparisons to other robust estimators with applications."
Computational Statistics & Data Analysis, 50(12), 3500-3530.

See also: [`mode`](@ref), [`modes`](@ref)
"""
function hsm_mode(a::AbstractVector{T}) where T <: Real
a = sort(filter(isfinite, a))

len = length(a)

len == 0 && throw(ArgumentError("hsm_mode is not defined for empty vectors"))
len == 1 && return a[1]

while (len > 2)
win_size = ceil(Int, len / 2)
best_width = a[end] - a[1]
best_start = 1

for i in 1:(len-win_size+1)
width = a[i+win_size-1] - a[i]
if (width < best_width)
best_width = width
best_start = i
end
end
a = collect(view(a, best_start:(best_start+win_size-1)))
len = length(a)
end

return (a[1] + a[end]) / 2
end

#############################
#
# quantile and friends
Expand Down
27 changes: 27 additions & 0 deletions test/scalarstats.jl
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,33 @@ wv = weights([0.1:0.1:0.7; 0.1])
@test_throws ArgumentError mode([1, 2, 3], weights([0.1, 0.3]))
@test_throws ArgumentError modes([1, 2, 3], weights([0.1, 0.3]))

## hsm_mode (half-sample mode)

@test hsm_mode([1, 2, 2, 3, 4, 4, 4, 5]) ≈ 4.0
@test hsm_mode([10.0]) ≈ 10.0
@test hsm_mode([1.0, 2.0, 3.0]) ≈ 1.5
@test hsm_mode([NaN, 2, 2, Inf, 3]) ≈ 2.0
@test hsm_mode([1.0, 2.0, 3.0, 4.0, 5.0]) ≈ 1.5
@test isapprox(hsm_mode([1.0097, 1.0054, 1.003212, 1.0231, 1.344, 1.00003]), 1.00431, atol=1e-4)
@test hsm_mode([1.0, 1.0, 1.0, 10.0]) ≈ 1.0
@test hsm_mode([-5.0, -3.0, -1.0, 0.0, 1.0, 3.0, 5.0]) ≈ -0.5
@test_throws ArgumentError hsm_mode(Float64[])
@test hsm_mode([1.0, 5.0]) ≈ 3.0 # two elements: returns midpoint
@test_throws ArgumentError hsm_mode([NaN, Inf, -Inf])
@test hsm_mode([1, 2, 3]) isa Float64
@test hsm_mode([1.0f0, 2.0f0]) isa Float32

# Additional edge cases
@test hsm_mode([1.0, 1.0, 1.0]) ≈ 1.0 # all identical
@test hsm_mode([1, 1000000]) ≈ 500000.5 # extreme spread
@test hsm_mode([1, 1, 1, 2, 2, 100]) ≈ 1.0 # outlier: finds densest region
@test hsm_mode(Int32[1, 2, 3]) isa Float64 # type promotion
@test hsm_mode(Float32[1, 2, 3]) isa Float32 # type preservation
@test hsm_mode([1.0, 1.0, 1.0]) ≈ 1.0 # all identical
@test hsm_mode([1, 1000000]) ≈ 500000.5 # extreme spread



## zscores

@test zscore([-3:3;], 1.5, 0.5) == [-9.0:2.0:3.0;]
Expand Down
Loading