Changes from 8 commits

Commits (47):
23bbed1  [add function or file]add hdbscan (MommaWatasu, Mar 22, 2024)
fa44398  [test]add test for hdbscan (MommaWatasu, Mar 23, 2024)
4e64fdf  [fix]change Int64 to Int (MommaWatasu, Mar 23, 2024)
e294565  [fix]change all Int64 into Int (MommaWatasu, Mar 23, 2024)
b851997  [change]change usage and remove extra code (MommaWatasu, Mar 23, 2024)
7822a7c  [test]update test (MommaWatasu, Mar 23, 2024)
8901cfe  [docs]add docs for HDBSCAN (MommaWatasu, Mar 23, 2024)
6de5d02  [fix]export HdbscanCluster (MommaWatasu, Mar 23, 2024)
85c5644  [docs]merge docs of HDBSCAN with DBSCAN.md (MommaWatasu, Apr 14, 2024)
edcc70a  [clean]refactoring HDBSCANGraph (MommaWatasu, Apr 15, 2024)
939ce65  [docs]fix docs (MommaWatasu, Apr 26, 2024)
2f67d07  [clean]refactoring (MommaWatasu, Apr 26, 2024)
61463f2  [clean]refactoring (MommaWatasu, Apr 27, 2024)
09ed174  [test]update test (MommaWatasu, Apr 27, 2024)
6039c0c  [fix]change isnothing into === (MommaWatasu, Apr 27, 2024)
3cc7689  [fix]remove println (MommaWatasu, Apr 27, 2024)
1798148  [docs]update docs (MommaWatasu, Apr 27, 2024)
a0a819e  [docs]fix docs (MommaWatasu, Apr 27, 2024)
bf38eb6  [fix]fix docstring (MommaWatasu, Apr 27, 2024)
380acf1  [fix]add isnoise to the list of exprted function (MommaWatasu, Apr 27, 2024)
6fdebe6  core_distances() tweaks (alyst, Apr 27, 2024)
ff98806  Hdbscan: simplify results generation (alyst, Apr 27, 2024)
64a202a  remove heappush!: unused (alyst, Apr 27, 2024)
661edbc  hdbscan test: small tweaks (alyst, Apr 27, 2024)
082c5e6  fixup hdbscan assignments (alyst, Apr 27, 2024)
c9db368  hdbscan: further opt core_dists (alyst, Apr 27, 2024)
c205575  hdbscan: optimize edges generation (alyst, Apr 27, 2024)
bdf1aec  HDBSCANGraph -> HdbscanGraph (alyst, Apr 27, 2024)
9aa9841  HdbscanEdge (alyst, Apr 27, 2024)
c9b9374  HdbscanGraph: rename edges to adj_edges (alyst, Apr 27, 2024)
092ac40  MSTEdge remove no-op expand() method (alyst, Apr 27, 2024)
c972caa  refactor HdbscanMSTEdge (alyst, Apr 27, 2024)
57b05e6  hdbscan: fix graph vertices, remove add_edges! (alyst, Apr 27, 2024)
f616795  hdbscan_minspantree(): refactor (alyst, Apr 27, 2024)
0f6e992  prune_clusters!(): cleanup (alyst, Apr 27, 2024)
e01609b  hdbscan: fix 1.0 compat (alyst, Apr 27, 2024)
de6b83a  [docs]fix docstring (MommaWatasu, Apr 27, 2024)
38212ca  [clean]rename and refactoring (MommaWatasu, Apr 27, 2024)
159858d  [test]add test for unionfind (MommaWatasu, Apr 27, 2024)
a03c224  hdbscan_minspantree: fix edges sorting (alyst, Apr 28, 2024)
3a577ea  hdbscan_clusters(): fix cost type (alyst, Apr 28, 2024)
744af22  hdbscan_clusters: improve MST iteration (alyst, Apr 28, 2024)
8803573  Merge branch 'master' of https://github.com/MommaWatasu/Clustering.jl (MommaWatasu, Apr 29, 2024)
b94de25  [clean]rename the result structure (MommaWatasu, May 1, 2024)
7078223  [hotfix]apply hot fix (MommaWatasu, May 2, 2024)
b0791d9  [test]add test about min_size (MommaWatasu, May 2, 2024)
dc5dd40  [add function or file]support ClusteringResult (MommaWatasu, May 2, 2024)
1 change: 1 addition & 0 deletions docs/make.jl
@@ -16,6 +16,7 @@ makedocs(
"mcl.md",
"affprop.md",
"dbscan.md",
"hdbscan.md",
"fuzzycmeans.md",
],
"validate.md",
11 changes: 11 additions & 0 deletions docs/source/hdbscan.md
@@ -0,0 +1,11 @@
# HDBSCAN

Hierarchical Density-Based Spatial Clustering of Applications with Noise (HDBSCAN) is similar to DBSCAN, but uses a hierarchical cluster tree to find the clusters.

## Interface
The implementation of the *HDBSCAN* algorithm is provided by the [`hdbscan`](@ref) function:
```@docs
hdbscan
HdbscanResult
HdbscanCluster
```
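
## Example

A minimal usage sketch (the random data and the parameter values below are illustrative only; each row of the matrix is treated as one point):

```julia
using Clustering

X = rand(200, 2)            # 200 two-dimensional points, one point per row
result = hdbscan(X, 5, 10)  # k = 5 neighbors, clusters must contain at least 10 points

for (i, c) in enumerate(result.clusters)
    println("cluster $i: ", length(c), " points")
end
```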
4 changes: 4 additions & 0 deletions src/Clustering.jl
@@ -40,6 +40,9 @@ module Clustering
# dbscan
DbscanResult, DbscanCluster, dbscan,

# hdbscan
HdbscanResult, HdbscanCluster, hdbscan,

# fuzzy_cmeans
fuzzy_cmeans, FuzzyCMeansResult,

@@ -83,6 +86,7 @@ module Clustering
include("kmedoids.jl")
include("affprop.jl")
include("dbscan.jl")
include("hdbscan.jl")
include("mcl.jl")
include("fuzzycmeans.jl")

296 changes: 296 additions & 0 deletions src/hdbscan.jl
@@ -0,0 +1,296 @@
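# undirected weighted graph over the data points, stored as adjacency lists:
# edges[i] holds (j, weight) pairs for every edge incident to point i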
struct HDBSCANGraph
edges::Vector{Vector{Tuple{Int, Float64}}}
HDBSCANGraph(n) = new([Tuple{Int, Float64}[] for _ in 1 : n])
end

function add_edge(G::HDBSCANGraph, v::Tuple{Int, Int, Float64})
i, j, c = v
push!(G.edges[i], (j, c))
push!(G.edges[j], (i, c))
end

Base.getindex(G::HDBSCANGraph, i::Int) = G.edges[i]

"""
HdbscanCluster(..., points, stability, ...)
- `points`: vector of points that belong to the cluster
- `stability`: stability of the cluster (-1 for noise clusters)

Use `length` to get the number of points in the cluster, and the `isnoise` function to check whether the cluster is a noise cluster.
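
A minimal sketch of inspecting the clusters (assuming `result` is an `HdbscanResult` returned by `hdbscan`):
```julia
for c in result.clusters
    isnoise(c) && continue          # skip the noise cluster
    println(length(c), " points, stability = ", c.stability)
end
```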
"""
mutable struct HdbscanCluster
parent::Int
children::Vector{Int}
points::Vector{Int}
λp::Vector{Float64}
stability::Float64
children_stability::Float64
function HdbscanCluster(noise::Bool, points::Vector{Int})
if noise
return new(0, [], [], [], -1, -1)
else
return new(0, [], points, [], 0, 0)
end
end
end

Base.length(c::HdbscanCluster) = size(c.points, 1)
isnoise(c::HdbscanCluster) = c.stability == -1
hasstability(c::HdbscanCluster) = c.stability != 0
function compute_stability(c::HdbscanCluster, λbirth)
c.stability += sum(c.λp.-λbirth)
end

"""
HdbscanResult(k, min_cluster_size, clusters)
- `k`: the core distance of a point is defined as the distance between the point and its `k`-th nearest neighbor
- `min_cluster_size`: minimum number of points required to form a cluster
- `clusters`: vector of the resulting clusters
"""
mutable struct HdbscanResult
k::Int
min_cluster_size::Int
clusters::Vector{HdbscanCluster}
end

"""
hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; gen_mst::Bool=true, mst=nothing)
Density-Based Clustering Based on Hierarchical Density Estimates.

The algorithm performs clustering as follows:
1. generate a minimum spanning tree
2. build an HDBSCAN hierarchy
3. extract the target clusters
4. generate the list of cluster assignments for each point

The full procedure is rather involved; see [how HDBSCAN works](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html) for a detailed explanation of the algorithm.

# Parameters
- `points`: the n×d matrix, where each row is the d-dimensional coordinate of a point
- `k`: the core distance of a point is defined as the distance between the point and its `k`-th nearest neighbor
- `min_cluster_size`: minimum number of points required to form a cluster
- `gen_mst`: whether to generate the minimum spanning tree
- `mst`: a precomputed minimum spanning tree, used only when `gen_mst` is `false`
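
# Example
A minimal usage sketch (illustrative; the clusters found depend on the data, each row of the matrix is one point):
```julia
points = rand(100, 2)            # 100 random 2-D points, one point per row
result = hdbscan(points, 5, 10)  # k = 5, clusters must contain at least 10 points
for c in result.clusters
    println(length(c), " points, stability = ", c.stability)
end
```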
"""
function hdbscan(points::AbstractMatrix, k::Int, min_cluster_size::Int; gen_mst::Bool=true, mst=nothing)
if min_cluster_size < 1
throw(DomainError(min_cluster_size, "The `min_cluster_size` must be greater than or equal to 1"))
end
n = size(points, 1)
if gen_mst
#calculate core distances for each point
core_dists = core_dist(points, k)
#calculate mutual reachability distance between any two points
mrd = mutual_reachability_distance(core_dists, points)
#compute a minimum spanning tree by prim method
mst = prim(mrd, n)
elseif mst === nothing
throw(ArgumentError("if you set `gen_mst` to false, you must pass a minimum spanning tree as `mst`"))
end
#build a HDBSCAN hierarchy
hierarchy = build_hierarchy(mst, min_cluster_size)
#extract the target cluster
extract_cluster!(hierarchy)
#generate the list of cluster assignment for each point
result = HdbscanCluster[]
noise_points = fill(-1, n)
for (i, j) in enumerate(hierarchy[2n-1].children)
c = hierarchy[j]
push!(result, c)
for p in c.points
noise_points[p] = 0
end
end
push!(result, HdbscanCluster(true, Int[]))
result[end].points = findall(x->x==-1, noise_points)
return HdbscanResult(k, min_cluster_size, result)
end

function core_dist(points, k)
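# the core distance of point i is the k-th smallest squared Euclidean distance
# from point i to all points (the point itself is included at distance 0)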
core_dists = Array{Float64}(undef, size(points, 1))
for i in 1 : size(points, 1)
p = points[i:i, :]
dists = vec(sum((@. (points - p)^2), dims=2))
sort!(dists)
core_dists[i] = dists[k]
end
return core_dists
end

function mutual_reachability_distance(core_dists, points)
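# the mutual reachability distance between points i and j is
# max(core distance of i, core distance of j, squared Euclidean distance between i and j)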
n = size(points, 1)
graph = HDBSCANGraph(n) # one adjacency list per point
for i in 1 : n-1
for j in i+1 : n
c = max(core_dists[i], core_dists[j], sum((points[i, :]-points[j, :]).^2))
add_edge(graph, (i, j, c))
end
end
return graph
end

function prim(graph, n)
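# Prim's algorithm: grow the minimum spanning tree from point 1;
# `h` is kept sorted by cost, so popfirst! always returns the cheapest crossing edge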
minimum_spanning_tree = Array{Tuple{Float64, Int, Int}}(undef, n-1)

marked = falses(n)
marked_cnt = 1
marked[1] = true

h = Tuple{Float64, Int, Int}[]

for (i, c) in graph[1]
heappush!(h, (c, 1, i))
end

while marked_cnt < n
c, i, j = popfirst!(h)

marked[j] == true && continue
minimum_spanning_tree[marked_cnt] = (c, i, j)
marked[j] = true
marked_cnt += 1

for (k, c) in graph[j]
marked[k] == true && continue
heappush!(h, (c, j, k))
end
end
return minimum_spanning_tree
end

function build_hierarchy(mst, min_size)
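# merge clusters along the MST edges in increasing order of cost;
# entries 1..n are the per-point leaf clusters, entry n+i is the cluster created by the i-th merge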
n = length(mst) + 1
cost = 0.0
uf = UnionFind(n)
Hierarchy = Array{HdbscanCluster}(undef, 2n-1)
if min_size == 1
for i in 1 : n
Hierarchy[i] = HdbscanCluster(false, [i])
end
else
for i in 1 : n
Hierarchy[i] = HdbscanCluster(true, Int[])
end
end
sort!(mst)

for i in 1 : n-1
c, j, k = mst[i]
cost += c
λ = 1 / cost
#child clusters
c1 = group(uf, j)
c2 = group(uf, k)
#reference to the parent cluster
Hierarchy[c1].parent = Hierarchy[c2].parent = n+i
nc1, nc2 = isnoise(Hierarchy[c1]), isnoise(Hierarchy[c2])
if !(nc1 || nc2)
#compute stability
compute_stability(Hierarchy[c1], λ)
compute_stability(Hierarchy[c2], λ)
#unite cluster
unite!(uf, j, k)
#create parent cluster
points = members(uf, group(uf, j))
Hierarchy[n+i] = HdbscanCluster(false, points)
elseif !(nc1 && nc2)
if nc2 == true
(c1, c2) = (c2, c1)
end
#record the lambda value
append!(Hierarchy[c2].λp, fill(λ, length(Hierarchy[c1])))
#unite cluster
unite!(uf, j, k)
#create parent cluster
points = members(uf, group(uf, j))
Hierarchy[n+i] = HdbscanCluster(false, points)
else
#unite the noise cluster
unite!(uf, j, k)
#create parent cluster
points = members(uf, group(uf, j))
if length(points) < min_size
Hierarchy[n+i] = HdbscanCluster(true, Int[])
else
Hierarchy[n+i] = HdbscanCluster(false, points)
end
end
end
return Hierarchy
end

function extract_cluster!(hierarchy::Vector{HdbscanCluster})
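# bottom-up cluster selection: a non-noise cluster keeps itself when its own stability exceeds
# the combined stability of its children, otherwise its children are passed up to its parent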
for i in 1 : length(hierarchy)-1
if isnoise(hierarchy[i])
c = hierarchy[i]
push!(hierarchy[c.parent].children, i)
hierarchy[c.parent].children_stability += c.stability
else
c = hierarchy[i]
if c.stability > c.children_stability
push!(hierarchy[c.parent].children, i)
hierarchy[c.parent].children_stability += c.stability
else
append!(hierarchy[c.parent].children, c.children)
hierarchy[c.parent].children_stability += c.children_stability
end
end
end
end

# Below are utility functions for building the cluster hierarchy
heappush!(h, v) = insert!(h, searchsortedfirst(h, v), v)

mutable struct UnionFind{T <: Integer}
parent::Vector{T} # parent[root] is the negative of the size
label::Dict{Int, Int}
cnt::Int

function UnionFind{T}(nodes::T) where T<:Integer
if nodes <= 0
throw(ArgumentError("invalid argument for nodes: $nodes"))
end

parent = -ones(T, nodes)
label = Dict([(i, i) for i in 1 : nodes])
new{T}(parent, label, nodes)
end
end

UnionFind(nodes::Integer) = UnionFind{typeof(nodes)}(nodes)
group(uf::UnionFind, x)::Int = uf.label[root(uf, x)]
members(uf::UnionFind, x::Int) = collect(keys(filter(n->n.second == x, uf.label)))

function root(uf::UnionFind{T}, x::T)::T where T<:Integer
if uf.parent[x] < 0
return x
else
return uf.parent[x] = root(uf, uf.parent[x])
end
end

function issame(uf::UnionFind{T}, x::T, y::T)::Bool where T<:Integer
return root(uf, x) == root(uf, y)
end

function Base.size(uf::UnionFind{T}, x::T)::T where T<:Integer
return -uf.parent[root(uf, x)]
end

function unite!(uf::UnionFind{T}, x::T, y::T)::Bool where T<:Integer
x = root(uf, x)
y = root(uf, y)
if x == y
return false
end
if uf.parent[x] > uf.parent[y]
x, y = y, x
end
# unite smaller tree(y) to bigger one(x)
uf.parent[x] += uf.parent[y]
uf.parent[y] = x
uf.cnt += 1
uf.label[y] = uf.cnt
for i in members(uf, group(uf, x))
uf.label[i] = uf.cnt
end
return true
end
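
# Illustrative usage sketch of the UnionFind helper (internal, not part of the public API):
#   uf = UnionFind(4)
#   unite!(uf, 1, 2)               # merge the sets containing 1 and 2
#   group(uf, 1) == group(uf, 2)   # true: both now share the same group id
#   members(uf, group(uf, 1))      # => [1, 2] (order not guaranteed)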
19 changes: 19 additions & 0 deletions test/hdbscan.jl
@@ -0,0 +1,19 @@
using Test
using Clustering

@testset "HDBSCAN" begin
# make moons for test
upper_x = [i for i in 0:π/50:π]
lower_x = [i for i in π/2:π/50:3/2*π]
upper_y = sin.(upper_x) .+ rand(50)./10
lower_y = cos.(lower_x) .+ rand(51)./10
data = hcat([lower_x; upper_x], [lower_y; upper_y])
#test for main function
@test_throws DomainError hdbscan(data, 5, 0)
@test_throws ArgumentError hdbscan(data, 5, 3; gen_mst=false)
@test_nowarn result = hdbscan(data, 5, 3)

# tests for result
result = hdbscan(data, 5, 3)
@test sum([length(c) for c in result.clusters]) == 101
end
1 change: 1 addition & 0 deletions test/runtests.jl
@@ -11,6 +11,7 @@ tests = ["seeding",
"kmedoids",
"affprop",
"dbscan",
"hdbscan",
"fuzzycmeans",
"counts",
"silhouette",