Merge pull request #350 from JuliaStats/an/issue280

andreasnoack · web-flow · commit 6fe67cfd4244 · 2025-10-22T12:43:22.000-06:00
Handle discrete distributions in ExactOneSampleKSTest
diff --git a/src/kolmogorov_smirnov.jl b/src/kolmogorov_smirnov.jl
@@ -38,11 +38,26 @@ default_tail(test::KSTest) = :both
 # compute supremum of differences between target and empirical cdf before and after the jump of the empirical cdf.
 function ksstats(x::AbstractVector{T}, d::UnivariateDistribution) where T<:Real
     n = length(x)
-    cdfs = cdf.(Ref(d), sort(x))
-    δp = maximum((1:n) / n - cdfs)
-    δn = -minimum((0:n-1) / n - cdfs)
-    δ = max(δn, δp)
-    (n, δ, δp, δn)
+    sx = sort(x)  # sorted copy of the sample; every vector below is aligned with it
+    g = cdf.(Ref(d), sx)  # target cdf evaluated at each sorted sample point
+    g₋ = if d isa DiscreteDistribution  # target cdf just before each sample point
+        # Evaluate the discrete cdf at the largest float below each point, i.e. before any jump at the point itself; see http://www.stat.yale.edu/~jay/EmersonMaterials/DiscreteGOF.pdf page 2
+        cdf.(Ref(d), prevfloat.(float.(sx)))
+    else
+        g  # non-discrete d: reuse the at-the-point values unchanged
+    end
+    _ecdf = ecdf(sx)  # empirical cdf of the sample
+    f = _ecdf(sx)  # empirical cdf evaluated at each sorted sample point
+    δ₊ = zero(zero(eltype(f)) - zero(eltype(g)))  # zero of the type of `fi - gi`, so the accumulator keeps one type
+    δ₋ = zero(zero(eltype(g₋)) - zero(eltype(f)))  # zero of the type of `g₋i - f₋i`
+    f₋i = zero(eltype(f))  # empirical cdf value carried over from the previous point; zero before the first
+    for (fi, gi, g₋i) in zip(f, g, g₋)
+        δ₊ = max(δ₊, fi - gi)  # largest excess of the empirical cdf over the target cdf at a sample point
+        δ₋ = max(δ₋, g₋i - f₋i)  # largest excess of the just-before target cdf over the previous empirical value
+        f₋i = fi  # becomes the "previous" empirical value for the next iteration
+    end
+    δ = max(δ₊, δ₋)  # two-sided statistic; both parts start at zero, so none is negative
+    (n, δ, δ₊, δ₋)  # sample size, two-sided statistic, then the two one-sided statistics
 end
 
 ### EXACT KOLMOGOROV SMIRNOV TEST
@@ -64,10 +79,6 @@ sample is not drawn from `d`.
 Implements: [`pvalue`](@ref)
 """
 function ExactOneSampleKSTest(x::AbstractVector{<:Real}, d::UnivariateDistribution)
-    if !allunique(x)
-        @warn("This test is inaccurate with ties")
-    end
-
     ExactOneSampleKSTest(ksstats(x, d)...)
 end
 
@@ -108,10 +119,6 @@ that the sample is not drawn from `d`.
 Implements: [`pvalue`](@ref)
 """
 function ApproximateOneSampleKSTest(x::AbstractVector{<:Real}, d::UnivariateDistribution)
-    if !allunique(x)
-        @warn("This test is inaccurate with ties")
-    end
-
     ApproximateOneSampleKSTest(ksstats(x, d)...)
 end
 
@@ -215,4 +222,4 @@ function ksstats(x::AbstractVector{T}, y::AbstractVector{S}) where {T<:Real, S<:
     end
 
     (n_x, n_y, max(δp, -δn), δp, -δn)
-end   
+end
diff --git a/test/kolmogorov_smirnov.jl b/test/kolmogorov_smirnov.jl
@@ -8,84 +8,105 @@ x = [0.3500, 0.1966, 0.2511, 0.6160, 0.4733,
      0.0759, 0.0540, 0.5308, 0.7792, 0.9340,
      0.1299, 0.5688, 0.4694, 0.0119, 0.3371
 ]
-t = ApproximateOneSampleKSTest(x, Uniform())
-@test t.δ ≈ 0.1440
-@test t.δn ≈ 0.0571
-@test t.δp ≈ 0.1440
-@test pvalue(t) ≈ 0.6777349664784745
-@test pvalue(t; tail=:left) ≈ 0.849573771973747
-@test pvalue(t; tail=:right) ≈ 0.3545875485608989
-@test default_tail(t) == :both
-show(IOBuffer(), t)
 
-t = ApproximateTwoSampleKSTest(x, [(0:24)/25...])
-@test t.δ ≈ 0.12
-@test t.δn ≈ 0.08
-@test t.δp ≈ 0.12
-@test pvalue(t) ≈ 0.993764859699076
-@test pvalue(t; tail=:left) ≈ 0.8521437889662113
-@test pvalue(t; tail=:right) ≈ 0.697676326071031
-@test default_tail(t) == :both
-show(IOBuffer(), t)
+@testset "Uniform" begin
+     t = ApproximateOneSampleKSTest(x, Uniform())
+     @test t.δ ≈ 0.1440
+     @test t.δn ≈ 0.0571
+     @test t.δp ≈ 0.1440
+     @test pvalue(t) ≈ 0.6777349664784745
+     @test pvalue(t; tail=:left) ≈ 0.849573771973747
+     @test pvalue(t; tail=:right) ≈ 0.3545875485608989
+     @test default_tail(t) == :both
+     show(IOBuffer(), t)
 
-t = ExactOneSampleKSTest(x, Uniform())
-@test t.δ ≈ 0.1440
-@test t.δn ≈ 0.0571
-@test t.δp ≈ 0.1440
-@test pvalue(t) ≈ 0.6263437768244742
-@test pvalue(t; tail=:left) ≈ 0.8195705417998183
-@test pvalue(t; tail=:right) ≈ 0.32350648882777194
-@test default_tail(t) == :both
-show(IOBuffer(), t)
+     t = ApproximateTwoSampleKSTest(x, [(0:24)/25...])
+     @test t.δ ≈ 0.12
+     @test t.δn ≈ 0.08
+     @test t.δp ≈ 0.12
+     @test pvalue(t) ≈ 0.993764859699076
+     @test pvalue(t; tail=:left) ≈ 0.8521437889662113
+     @test pvalue(t; tail=:right) ≈ 0.697676326071031
+     @test default_tail(t) == :both
+     show(IOBuffer(), t)
 
-## check fit to normal distribution
-t = ApproximateOneSampleKSTest(x, Normal())
-@test t.δ ≈ 0.5047473010922947
-@test t.δn ≈ 0.5047473010922947
-@test t.δp ≈ 0.17515194649718513
-@test pvalue(t) ≈ 5.871827067532435e-6
-@test pvalue(t; tail=:left) ≈ 2.9359135337662175e-6
-@test pvalue(t; tail=:right) ≈ 0.21569061887162347
+     t = ExactOneSampleKSTest(x, Uniform())
+     @test t.δ ≈ 0.1440
+     @test t.δn ≈ 0.0571
+     @test t.δp ≈ 0.1440
+     @test pvalue(t) ≈ 0.6263437768244742
+     @test pvalue(t; tail=:left) ≈ 0.8195705417998183
+     @test pvalue(t; tail=:right) ≈ 0.32350648882777194
+     @test default_tail(t) == :both
+     show(IOBuffer(), t)
+end
 
-## check unequal sample size
-t = ApproximateTwoSampleKSTest(x, [(0:5)/6...])
-@test t.δ ≈ 0.22
-@test t.δn ≈ 0.22
-@test t.δp ≈ 0.09333333333333346
-@test pvalue(t) ≈ 0.973300892518972
-@test pvalue(t; tail=:left) ≈ 0.6260111498528065
-@test pvalue(t; tail=:right) ≈ 0.9191544797498837
+@testset "check fit to normal distribution" begin
+     t = ApproximateOneSampleKSTest(x, Normal())
+     @test t.δ ≈ 0.5047473010922947
+     @test t.δn ≈ 0.5047473010922947
+     @test t.δp ≈ 0.17515194649718513
+     @test pvalue(t) ≈ 5.871827067532435e-6
+     @test pvalue(t; tail=:left) ≈ 2.9359135337662175e-6
+     @test pvalue(t; tail=:right) ≈ 0.21569061887162347
+end
 
-# http://ocw.mit.edu/courses/mathematics/18-443-statistics-for-applications-fall-2006/lecture-notes/lecture14.pdf
-x = [0.58, 0.42, 0.52, 0.33, 0.43, 0.23, 0.58, 0.76, 0.53, 0.64]
-t = ApproximateOneSampleKSTest(x, Uniform())
-@test t.δ ≈ 0.26
-@test t.δn ≈ 0.23
-@test t.δp ≈ 0.26
-@test pvalue(t) ≈ 0.5084937988981307
-@test pvalue(t; tail=:left) ≈ 0.3471494153245104
-@test pvalue(t; tail=:right) ≈ 0.25872229825964005
+@testset "check unequal sample size" begin
+     t = ApproximateTwoSampleKSTest(x, [(0:5)/6...])
+     @test t.δ ≈ 0.22
+     @test t.δn ≈ 0.22
+     @test t.δp ≈ 0.09333333333333346
+     @test pvalue(t) ≈ 0.973300892518972
+     @test pvalue(t; tail=:left) ≈ 0.6260111498528065
+     @test pvalue(t; tail=:right) ≈ 0.9191544797498837
+end
 
-t = ApproximateTwoSampleKSTest(x, [(0:9)/10...])
-@test t.δ ≈ 0.3
-@test t.δn ≈ 0.3
-@test t.δp ≈ 0.2
-@test pvalue(t) ≈ 0.7590978384203948
-@test pvalue(t; tail=:left) ≈ 0.406569659740599
-@test pvalue(t; tail=:right) ≈ 0.6703200460356393
+@testset "MIT course examples" begin
+     # http://ocw.mit.edu/courses/mathematics/18-443-statistics-for-applications-fall-2006/lecture-notes/lecture14.pdf
+     x = [0.58, 0.42, 0.52, 0.33, 0.43, 0.23, 0.58, 0.76, 0.53, 0.64]
+     t = ApproximateOneSampleKSTest(x, Uniform())
+     @test t.δ ≈ 0.26
+     @test t.δn ≈ 0.23
+     @test t.δp ≈ 0.26
+     @test pvalue(t) ≈ 0.5084937988981307
+     @test pvalue(t; tail=:left) ≈ 0.3471494153245104
+     @test pvalue(t; tail=:right) ≈ 0.25872229825964005
 
-t = ExactOneSampleKSTest(x, Uniform())
-@test t.δ ≈ 0.26
-@test t.δn ≈ 0.23
-@test t.δp ≈ 0.26
-@test pvalue(t) ≈ 0.4351284228580825
-@test pvalue(t; tail=:left) ≈ 0.3013310572470338
-@test pvalue(t; tail=:right) ≈ 0.2193143479950862
+     t = ApproximateTwoSampleKSTest(x, [(0:9)/10...])
+     @test t.δ ≈ 0.3
+     @test t.δn ≈ 0.3
+     @test t.δp ≈ 0.2
+     @test pvalue(t) ≈ 0.7590978384203948
+     @test pvalue(t; tail=:left) ≈ 0.406569659740599
+     @test pvalue(t; tail=:right) ≈ 0.6703200460356393
 
-# Check two samples with ties
+     t = ExactOneSampleKSTest(x, Uniform())
+     @test t.δ ≈ 0.26
+     @test t.δn ≈ 0.23
+     @test t.δp ≈ 0.26
+     @test pvalue(t) ≈ 0.4351284228580825
+     @test pvalue(t; tail=:left) ≈ 0.3013310572470338
+     @test pvalue(t; tail=:right) ≈ 0.2193143479950862
+end
 
-t = ApproximateTwoSampleKSTest(ones(10), ones(10))
-@test isapprox(t.δ, 0., atol=1e-16)
-@test isapprox(t.δp, 0., atol=1e-16)
-@test isapprox(t.δn, 0., atol=1e-16)
+@testset "Check two samples with ties" begin
+     t = ApproximateTwoSampleKSTest(ones(10), ones(10))
+     @test isapprox(t.δ, 0., atol=1e-16)
+     @test isapprox(t.δp, 0., atol=1e-16)
+     @test isapprox(t.δn, 0., atol=1e-16)
+end
+
+@testset "Issue 280" begin  # exact one-sample test against a discrete distribution, sample contains ties
+     n = 50
+     x = 0.5 .^ (1:n)  # weights 1/2, 1/4, ..., 2^-n
+     d = DiscreteNonParametric(1:n, x)  # discrete distribution built from the values 1:n and the weights x
+     y = vcat(fill(1, 16), fill(2, 8), fill(3, 4), 4, 4, 5)  # 31 observations: 16 ones, 8 twos, 4 threes, two fours, one five
+     t = ExactOneSampleKSTest(y, d)
+     @test t.δ ≈ 0.03125  # 0.03125 == 1/32
+     @test t.δp ≈ 0.03125
+     @test t.δn ≈ 0.0
+     @test pvalue(t) ≈ 0.9999999999999348
+     @test pvalue(t; tail=:left) ≈ 1.0
+     @test pvalue(t; tail=:right) ≈ 0.9213372964737361
+end
 end