JuliaStats · anurag-mds · Dec 27, 2025 · Dec 27, 2025 · Dec 27, 2025 · Dec 27, 2025
diff --git a/docs/src/scalarstats.md b/docs/src/scalarstats.md
@@ -79,6 +79,7 @@ percentilerank
 ```@docs
 mode
 modes
+hsm_mode
 ```
 
 ## Summary Statistics

diff --git a/perf/hsm_mode.jl b/perf/hsm_mode.jl
@@ -0,0 +1,47 @@
+using BenchmarkTools
+using StatsBase
+using Random
+
+println("=" ^ 60)
+println("Half-Sample Mode (hsm_mode) Benchmarks")
+println("=" ^ 60)
+
+# Small dataset
+println("\n1. Small dataset (n=100)")
+Random.seed!(123)
+small = randn(100)
+@btime hsm_mode($small)
+
+# Medium dataset
+println("\n2. Medium dataset (n=1,000)")
+Random.seed!(123)
+medium = randn(1_000)
+@btime hsm_mode($medium)
+
+# Large dataset
+println("\n3. Large dataset (n=10,000)")
+Random.seed!(123)
+large = randn(10_000)
+@btime hsm_mode($large)
+
+# Very large dataset
+println("\n4. Very large dataset (n=100,000)")
+Random.seed!(123)
+very_large = randn(100_000)
+@btime hsm_mode($very_large)
+
+# With outliers
+println("\n5. Dataset with outliers (n=1,000)")
+Random.seed!(123)
+with_outliers = vcat(randn(990), fill(1000.0, 10))
+@btime hsm_mode($with_outliers)
+
+# Skewed distribution
+println("\n6. Skewed distribution (n=1,000)")
+Random.seed!(123)
+skewed = abs.(randn(1_000)) .^ 2
+@btime hsm_mode($skewed)
+
+println("\n" * "=" ^ 60)
+println("Benchmark complete!")
+println("=" ^ 60)
diff --git a/src/StatsBase.jl b/src/StatsBase.jl
@@ -78,6 +78,7 @@ export
     middle,      # the mean of two real numbers
     mode,        # find a mode from data (the first one)
     modes,       # find all modes from data
+    hsm_mode,    # half-sample mode (robust estimator)
 
     zscore,      # compute Z-scores
     zscore!,     # compute Z-scores inplace or to a pre-allocated array

diff --git a/src/scalarstats.jl b/src/scalarstats.jl
@@ -204,6 +204,99 @@ function modes(a::AbstractVector, wv::AbstractWeights{T}) where T <: Real
     return [x for (x, w) in weights if w == mw]
 end
 
+
+#############################
+#
+#   Mode: Half-Sample Mode
+#
+#############################
+
+"""
+    hsm_mode(a::AbstractVector{<:Real})
+
+Compute the half-sample mode (HSM) of a collection of real numbers.
+
+The half-sample mode is a robust estimator of central tendency that iteratively
+identifies the densest half of the data until only two values remain, then returns
+their midpoint. This estimator is particularly useful for skewed or multimodal
+distributions where the traditional mode may be unreliable. Non-finite values
+(NaN, Inf, -Inf) are automatically filtered.
+
+Returns a floating-point value (promoted from input type). Throws `ArgumentError`
+for empty vectors or vectors containing only non-finite values.
+
+# Algorithm
+The method works by iteratively finding the half of the data with minimum width:
+1. Sort and filter non-finite values
+2. While more than 2 elements remain, find the half-window with minimum width
+3. Return the midpoint of the final two elements
+
+Time complexity: O(n log n) due to sorting. Space complexity: O(n).
+
+# Examples
+```julia-repl
+julia> hsm_mode([1, 2, 2, 3, 4, 4, 4, 5])
+4.0
+
+julia> hsm_mode([NaN, 2, 2, Inf, 3])
+2.0
+
+julia> hsm_mode([10.0])
+10.0
+
+julia> hsm_mode([1.0097, 1.0054, 1.003212, 1.0231, 1.344, 1.00003])
+1.00003
+
+julia> data = [1, 1, 1, 2, 2, 100];  # outlier present
+
+julia> mode(data)  # frequency-based mode
+1
+
+julia> hsm_mode(data)  # density-based mode (finds densest region)
+1.0
+```
+
+!!! note
+    This implementation does not support weighted observations or missing values.
+    Use `filter(!ismissing, data)` to handle missing values before calling.
+
+# References
+Robertson, T., & Cryer, J. D. (1974). "An iterative procedure for estimating
+the mode." Journal of the American Statistical Association, 69(348), 1012-1016.
+
+Bickel, D. R., & Fruehwirth, R. (2006). "On a fast, robust estimator of the mode:
+Comparisons to other robust estimators with applications."
+Computational Statistics & Data Analysis, 50(12), 3500-3530.
+
+See also: [`mode`](@ref), [`modes`](@ref)
+"""
+function hsm_mode(a::AbstractVector{T}) where T <: Real
+    a = sort(filter(isfinite, a))
+
+    len = length(a)
+
+    len == 0 && throw(ArgumentError("hsm_mode is not defined for empty vectors"))
+    len == 1 && return a[1]
+
+    while (len > 2)
+        win_size = div(len, 2)
+        best_width = a[end] - a[1]
+        best_start = 1
+
+        for i in 1:(len-win_size+1)
+            width = a[i+win_size-1] - a[i]
+            if (width < best_width)
+                best_width = width
+                best_start = i
+            end
+        end
+        a = collect(view(a, best_start:(best_start+win_size-1)))
+        len = length(a)
+    end
+
+    return (a[1] + a[end]) / 2
+end
+
 #############################
 #
 #   quantile and friends

diff --git a/test/scalarstats.jl b/test/scalarstats.jl
@@ -63,6 +63,33 @@ wv = weights([0.1:0.1:0.7; 0.1])
 @test_throws ArgumentError mode([1, 2, 3], weights([0.1, 0.3]))
 @test_throws ArgumentError modes([1, 2, 3], weights([0.1, 0.3]))
 
+## hsm_mode (half-sample mode)
+
+@test hsm_mode([1, 2, 2, 3, 4, 4, 4, 5]) ≈ 4.0
+@test hsm_mode([10.0]) ≈ 10.0
+@test hsm_mode([1.0, 2.0, 3.0]) ≈ 1.5
+@test hsm_mode([NaN, 2, 2, Inf, 3]) ≈ 2.0
+@test hsm_mode([1.0, 2.0, 3.0, 4.0, 5.0]) ≈ 1.5
+@test isapprox(hsm_mode([1.0097, 1.0054, 1.003212, 1.0231, 1.344, 1.00003]), 1.00431, atol=1e-4)
+@test hsm_mode([1.0, 1.0, 1.0, 10.0]) ≈ 1.0
+@test hsm_mode([-5.0, -3.0, -1.0, 0.0, 1.0, 3.0, 5.0]) ≈ -0.5
+@test_throws ArgumentError hsm_mode(Float64[])
+@test hsm_mode([1.0, 5.0]) ≈ 3.0  # two elements: returns midpoint
+@test_throws ArgumentError hsm_mode([NaN, Inf, -Inf])
+@test hsm_mode([1, 2, 3]) isa Float64
+@test hsm_mode([1.0f0, 2.0f0]) isa Float32
+
+# Additional edge cases
+@test hsm_mode([1.0, 1.0, 1.0]) ≈ 1.0  # all identical
+@test hsm_mode([1, 1000000]) ≈ 500000.5  # extreme spread
+@test hsm_mode([1, 1, 1, 2, 2, 100]) ≈ 1.0  # outlier: finds densest region
+@test hsm_mode(Int32[1, 2, 3]) isa Float64  # type promotion
+@test hsm_mode(Float32[1, 2, 3]) isa Float32  # type preservation
+@test hsm_mode([1.0, 1.0, 1.0]) ≈ 1.0  # all identical
+@test hsm_mode([1, 1000000]) ≈ 500000.5  # extreme spread
+
+
+
 ## zscores
 
 @test zscore([-3:3;], 1.5, 0.5) == [-9.0:2.0:3.0;]
-Original file line number
+Diff line change
@@ Expand Up / @@ -79,6 +79,7 @@ percentilerank @@
     ```@docs
     mode
     modes
+    hsm_mode
     ```
     ## Summary Statistics
@@ Expand Down @@