Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions doc/source/validate.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,3 +99,13 @@ the similarity of two different clusterings of a dataset.
```@docs
mutualinfo
```

## Confusion matrix

The pair [confusion matrix](https://en.wikipedia.org/wiki/Confusion_matrix)
arising from two clusterings is a 2×2 contingency table that summarizes
the partition co-occurrence table (see [`counts`](@ref)).

```@docs
confusion
```
6 changes: 5 additions & 1 deletion src/Clustering.jl
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,10 @@ module Clustering
Hclust, hclust, cutree,

# MCL
mcl, MCLResult
mcl, MCLResult,

# pair confusion matrix
confusion

## source files

Expand All @@ -85,6 +88,7 @@ module Clustering
include("varinfo.jl")
include("vmeasure.jl")
include("mutualinfo.jl")
include("confusion.jl")

include("hclust.jl")

Expand Down
40 changes: 40 additions & 0 deletions src/confusion.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""
    confusion(a::ClusteringResult, b::ClusteringResult) -> Matrix{Int}
    confusion(a::ClusteringResult, b::AbstractVector{<:Integer}) -> Matrix{Int}
    confusion(a::AbstractVector{<:Integer}, b::ClusteringResult) -> Matrix{Int}
    confusion(a::AbstractVector{<:Integer}, b::AbstractVector{<:Integer}) -> Matrix{Int}

Return the 2×2 pair confusion matrix `C` that summarizes the partition
co-occurrence (similarity) between two clusterings. It is built by considering
all pairs of samples and counting the pairs that are assigned to the same or
to different clusters under the true clustering `a` and the predicted
clustering `b`.

Considering a pair of samples that is in the same group as a **positive pair**,
and a pair that is in different groups as a **negative pair**, the count of
true positives is `C₁₁`, false negatives is `C₁₂`, false positives is `C₂₁`, and
true negatives is `C₂₂`:

|                        | Positive (in `b`) | Negative (in `b`) |
|:----------------------:|:-----------------:|:-----------------:|
| **Positive** (in `a`)  | C₁₁               | C₁₂               |
| **Negative** (in `a`)  | C₂₁               | C₂₂               |
"""
function confusion(a::AbstractVector{<:Integer}, b::AbstractVector{<:Integer})
    c = counts(a, b)

    n = sum(c)
    nis = sum(abs2, sum(c, dims=2)) # sum of squares of sums of rows
    njs = sum(abs2, sum(c, dims=1)) # sum of squares of sums of columns
    t2 = sum(abs2, c)               # sum over rows & columns of nij^2

    # Each numerator below is an even integer (x^2 and x have the same parity
    # and all the marginals add up to n), so `÷ 2` is exact. Staying in integer
    # arithmetic avoids the Float64 round-trip of `/`, which silently loses
    # precision once the pair counts exceed 2^53.
    same_both = (t2 - n) ÷ 2                   # together in `a` and in `b`
    same_a_only = (nis - t2) ÷ 2               # together in `a`, apart in `b`
    same_b_only = (njs - t2) ÷ 2               # apart in `a`, together in `b`
    same_none = (t2 + n^2 - nis - njs) ÷ 2     # apart in `a` and in `b`

    C = Int[same_both same_a_only; same_b_only same_none]
    return C
end
# Convenience methods: replace every `ClusteringResult` argument by its
# `assignments` and call `confusion` again on the resulting arguments.
function confusion(a::ClusteringResult, b::ClusteringResult)
    return confusion(assignments(a), assignments(b))
end

function confusion(a::AbstractVector{<:Integer}, b::ClusteringResult)
    return confusion(a, assignments(b))
end

function confusion(a::ClusteringResult, b::AbstractVector{<:Integer})
    return confusion(assignments(a), b)
end

38 changes: 14 additions & 24 deletions src/randindex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,35 +18,25 @@ Returns a tuple of indices:

> Meila, Marina (2003). *Comparing Clusterings by the Variation of
> Information.* Learning Theory and Kernel Machines: 173–187.

> Steinley, Douglas (2004). *Properties of the Hubert–Arabie Adjusted
> Rand Index.* Psychological Methods, Vol. 9, No. 3: 386-396
"""
function randindex(a, b)
    # Pair confusion counts (Table 2 from Steinley 2004). Destructuring the 2x2
    # matrix walks it column by column, hence the order c11, c21, c12, c22.
    # The counts are converted to floating point up front: products such as
    # t*t grow like n^4 and would wrap around in Int arithmetic for large inputs.
    c11, c21, c12, c22 = float.(confusion(a, b))

    t = c11 + c12 + c21 + c22   # total number of pairs of entities
    A = c11 + c22               # agreements count
    D = c12 + c21               # disagreements count

    # expected index
    ERI = (c11 + c12)*(c11 + c21) + (c21 + c22)*(c12 + c22)
    # adjusted Rand - Hubert & Arabie 1985
    # (identical clusterings have no disagreeing pair and are defined as ARI = 1)
    ARI = D == 0 ? 1.0 : (t*A - ERI)/(t*t - ERI) # (9) from Steinley 2004

    # NOTE: when there are no pairs at all (t == 0) the ratios below are NaN.
    RI = A/t            # Rand 1971      # Probability of agreement
    MI = D/t            # Mirkin 1970    # p(disagreement)
    HI = (A - D)/t      # Hubert 1977    # p(agree)-p(disagree)

    return (ARI, RI, MI, HI)
end
44 changes: 44 additions & 0 deletions test/confusion.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Test confusion matrix

using Test
using Clustering

@testset "confusion() (Confusion matrix)" begin

    @testset "small size tests" begin
        # Each entry is (a, b, expected value of confusion(a, b)).
        cases = [
            ([0,0,0], [0,0,0], [3 0; 0 0]),
            ([0,0,1], [0,0,0], [1 0; 2 0]),
            ([0,1,1], [0,0,0], [1 0; 2 0]),
            ([1,1,1], [0,0,0], [3 0; 0 0]),

            ([0,0,0], [0,0,1], [1 2; 0 0]),
            ([0,0,1], [0,0,1], [1 0; 0 2]),
            ([0,1,1], [0,0,1], [0 1; 1 1]),
            ([1,1,1], [0,0,1], [1 2; 0 0]),

            ([0,0,0], [0,1,1], [1 2; 0 0]),
            ([0,0,1], [0,1,1], [0 1; 1 1]),
            ([0,1,1], [0,1,1], [1 0; 0 2]),
            ([1,1,1], [0,1,1], [1 2; 0 0]),

            ([0,0,0], [1,1,1], [3 0; 0 0]),
            ([0,0,1], [1,1,1], [1 0; 2 0]),
            ([0,1,1], [1,1,1], [1 0; 2 0]),
            ([1,1,1], [1,1,1], [3 0; 0 0]),
        ]

        for (a, b, expected) in cases
            @test confusion(a, b) == expected
        end
    end

    @testset "comparing 2 k-means clusterings" begin
        nfeatures = 3
        nsamples = 100
        nclusters = 1
        data = rand(nfeatures, nsamples)

        # non-weighted
        first_run = kmeans(data, nclusters; maxiter=5)
        second_run = kmeans(data, nclusters; maxiter=5)

        # expected: all nsamples*(nsamples-1)/2 pairs land in the top-left cell
        all_pairs = nsamples*(nsamples-1)/2
        @test confusion(first_run, second_run) == [all_pairs 0; 0 0]
    end

end

5 changes: 5 additions & 0 deletions test/randindex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,9 @@ a3 = [3, 3, 3, 2, 2, 2, 1, 1, 1, 1]

@test randindex(a1, a2) == randindex(a2, a1)

@test randindex(ones(Int, 3), ones(Int, 3)) == (1, 1, 0, 1)

a,b = rand(1:5,10_000), rand(1:5,10_000)
@test randindex(a,b)[1] < 1.0e-2

end
3 changes: 2 additions & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ tests = ["seeding",
"hclust",
"mcl",
"vmeasure",
"mutualinfo"]
"mutualinfo",
"confusion"]

println("Runing tests:")
for t in tests
Expand Down