Skip to content

Commit 56bafb0

Browse files
committed
randindex(): fix int overflow
For very large clusterings the agreement/disagreement counts are very large, so we have to switch to float when multiplying them. Fixes #225, enhances #227.
1 parent 5d1ad38 commit 56bafb0

File tree

2 files changed

+13
-4
lines changed

2 files changed

+13
-4
lines changed

src/randindex.jl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,10 @@ function randindex(a, b)
3030
D = c12 + c21
3131

3232
# expected index
33-
ERI = (c11+c12)*(c11+c21)+(c21+c22)*(c12+c22)
33+
T = typeof(one(A)/one(t))
34+
ERI = convert(T, c11+c12)*(c11+c21)+convert(T, c21+c22)*(c12+c22)
3435
# adjusted Rand - Hubert & Arabie 1985
35-
ARI = D == 0 ? 1.0 : (t*A-ERI)/(t*t-ERI) # (9) from Steinley 2004
36+
ARI = D == 0 ? 1.0 : (convert(T, t)*A - ERI)/(abs2(convert(T, t)) - ERI) # (9) from Steinley 2004
3637

3738
RI = A/t # Rand 1971 # Probability of agreement
3839
MI = D/t # Mirkin 1970 # p(disagreement)

test/randindex.jl

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,15 @@ a3 = [3, 3, 3, 2, 2, 2, 1, 1, 1, 1]
3636

3737
@test randindex(ones(Int, 3), ones(Int, 3)) == (1, 1, 0, 1)
3838

39-
a, b = rand(1:5, 10_000), rand(1:5, 10_000)
40-
@test randindex(a, b)[1] < 1.0e-2
39+
@testset "large independent clusterings (#225)" begin
40+
rng = MersenneTwister(123)
41+
42+
n = 10_000_000
43+
k = 5 # number of clusters
44+
a = rand(rng, 1:k, n)
45+
b = rand(rng, 1:k, n)
46+
47+
@test collect(randindex(a, b)) ≈ [0.0, ((k-1)^2 + 1)/k^2, 2*(k-1)/k^2, ((k-2)/k)^2] atol=1e-5
48+
end
4149

4250
end

0 commit comments

Comments
 (0)