diff --git a/docs/src/scalarstats.md b/docs/src/scalarstats.md index 406e4765a..bfd3acc1e 100644 --- a/docs/src/scalarstats.md +++ b/docs/src/scalarstats.md @@ -79,6 +79,7 @@ percentilerank ```@docs mode modes +hsm_mode ``` ## Summary Statistics diff --git a/perf/hsm_mode.jl b/perf/hsm_mode.jl new file mode 100644 index 000000000..a5161b525 --- /dev/null +++ b/perf/hsm_mode.jl @@ -0,0 +1,47 @@ +using BenchmarkTools +using StatsBase +using Random + +println("=" ^ 60) +println("Half-Sample Mode (hsm_mode) Benchmarks") +println("=" ^ 60) + +# Small dataset +println("\n1. Small dataset (n=100)") +Random.seed!(123) +small = randn(100) +@btime hsm_mode($small) + +# Medium dataset +println("\n2. Medium dataset (n=1,000)") +Random.seed!(123) +medium = randn(1_000) +@btime hsm_mode($medium) + +# Large dataset +println("\n3. Large dataset (n=10,000)") +Random.seed!(123) +large = randn(10_000) +@btime hsm_mode($large) + +# Very large dataset +println("\n4. Very large dataset (n=100,000)") +Random.seed!(123) +very_large = randn(100_000) +@btime hsm_mode($very_large) + +# With outliers +println("\n5. Dataset with outliers (n=1,000)") +Random.seed!(123) +with_outliers = vcat(randn(990), fill(1000.0, 10)) +@btime hsm_mode($with_outliers) + +# Skewed distribution +println("\n6. Skewed distribution (n=1,000)") +Random.seed!(123) +skewed = abs.(randn(1_000)) .^ 2 +@btime hsm_mode($skewed) + +println("\n" * "=" ^ 60) +println("Benchmark complete!") +println("=" ^ 60) diff --git a/src/StatsBase.jl b/src/StatsBase.jl index 4d0d182ae..8d93368c3 100644 --- a/src/StatsBase.jl +++ b/src/StatsBase.jl @@ -78,6 +78,7 @@ export middle, # the mean of two real numbers mode, # find a mode from data (the first one) modes, # find all modes from data + hsm_mode, # half-sample mode (robust estimator) zscore, # compute Z-scores zscore!, # compute Z-scores inplace or to a pre-allocated array diff --git a/src/scalarstats.jl b/src/scalarstats.jl index e1efd38f7..f1079362a 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -204,6 +204,99 @@ function modes(a::AbstractVector, wv::AbstractWeights{T}) where T <: Real return [x for (x, w) in weights if w == mw] end + +############################# +# +# Mode: Half-Sample Mode +# +############################# + +""" + hsm_mode(a::AbstractVector{<:Real}) + +Compute the half-sample mode (HSM) of a collection of real numbers. + +The half-sample mode is a robust estimator of central tendency that iteratively +identifies the densest half of the data until only two values remain, then returns +their midpoint. This estimator is particularly useful for skewed or multimodal +distributions where the traditional mode may be unreliable. Non-finite values +(NaN, Inf, -Inf) are automatically filtered. + +Returns a floating-point value (promoted from input type). Throws `ArgumentError` +for empty vectors or vectors containing only non-finite values. + +# Algorithm +The method works by iteratively finding the half of the data with minimum width: +1. Sort and filter non-finite values +2. While more than 2 elements remain, find the half-window with minimum width +3. Return the midpoint of the final two elements + +Time complexity: O(n log n) due to sorting. Space complexity: O(n). + +# Examples +```julia-repl +julia> hsm_mode([1, 2, 2, 3, 4, 4, 4, 5]) +4.0 + +julia> hsm_mode([NaN, 2, 2, Inf, 3]) +2.0 + +julia> hsm_mode([10.0]) +10.0 + +julia> hsm_mode([1.0097, 1.0054, 1.003212, 1.0231, 1.344, 1.00003]) +1.00003 + +julia> data = [1, 1, 1, 2, 2, 100]; # outlier present + +julia> mode(data) # frequency-based mode +1 + +julia> hsm_mode(data) # density-based mode (finds densest region) +1.0 +``` + +!!! note + This implementation does not support weighted observations or missing values. + Use `filter(!ismissing, data)` to handle missing values before calling. + +# References +Robertson, T., & Cryer, J. D. (1974). "An iterative procedure for estimating +the mode." Journal of the American Statistical Association, 69(348), 1012-1016. + +Bickel, D. R., & Fruehwirth, R. (2006). "On a fast, robust estimator of the mode: +Comparisons to other robust estimators with applications." +Computational Statistics & Data Analysis, 50(12), 3500-3530. + +See also: [`mode`](@ref), [`modes`](@ref) +""" +function hsm_mode(a::AbstractVector{T}) where T <: Real + a = sort(filter(isfinite, a)) + + len = length(a) + + len == 0 && throw(ArgumentError("hsm_mode is not defined for empty vectors")) + len == 1 && return a[1] + + while (len > 2) + win_size = div(len, 2) + best_width = a[end] - a[1] + best_start = 1 + + for i in 1:(len-win_size+1) + width = a[i+win_size-1] - a[i] + if (width < best_width) + best_width = width + best_start = i + end + end + a = collect(view(a, best_start:(best_start+win_size-1))) + len = length(a) + end + + return (a[1] + a[end]) / 2 +end + ############################# # # quantile and friends diff --git a/test/scalarstats.jl b/test/scalarstats.jl index eec64ad74..e0e9e1743 100644 --- a/test/scalarstats.jl +++ b/test/scalarstats.jl @@ -63,6 +63,33 @@ wv = weights([0.1:0.1:0.7; 0.1]) @test_throws ArgumentError mode([1, 2, 3], weights([0.1, 0.3])) @test_throws ArgumentError modes([1, 2, 3], weights([0.1, 0.3])) +## hsm_mode (half-sample mode) + +@test hsm_mode([1, 2, 2, 3, 4, 4, 4, 5]) ≈ 4.0 +@test hsm_mode([10.0]) ≈ 10.0 +@test hsm_mode([1.0, 2.0, 3.0]) ≈ 1.5 +@test hsm_mode([NaN, 2, 2, Inf, 3]) ≈ 2.0 +@test hsm_mode([1.0, 2.0, 3.0, 4.0, 5.0]) ≈ 1.5 +@test isapprox(hsm_mode([1.0097, 1.0054, 1.003212, 1.0231, 1.344, 1.00003]), 1.00431, atol=1e-4) +@test hsm_mode([1.0, 1.0, 1.0, 10.0]) ≈ 1.0 +@test hsm_mode([-5.0, -3.0, -1.0, 0.0, 1.0, 3.0, 5.0]) ≈ -0.5 +@test_throws ArgumentError hsm_mode(Float64[]) +@test hsm_mode([1.0, 5.0]) ≈ 3.0 # two elements: returns midpoint +@test_throws ArgumentError hsm_mode([NaN, Inf, -Inf]) +@test hsm_mode([1, 2, 3]) isa Float64 +@test hsm_mode([1.0f0, 2.0f0]) isa Float32 + +# Additional edge cases +@test hsm_mode([1.0, 1.0, 1.0]) ≈ 1.0 # all identical +@test hsm_mode([1, 1000000]) ≈ 500000.5 # extreme spread +@test hsm_mode([1, 1, 1, 2, 2, 100]) ≈ 1.0 # outlier: finds densest region +@test hsm_mode(Int32[1, 2, 3]) isa Float64 # type promotion +@test hsm_mode(Float32[1, 2, 3]) isa Float32 # type preservation +@test hsm_mode([1.0, 1.0, 1.0]) ≈ 1.0 # all identical +@test hsm_mode([1, 1000000]) ≈ 500000.5 # extreme spread + + + ## zscores @test zscore([-3:3;], 1.5, 0.5) == [-9.0:2.0:3.0;]