Skip to content

Commit 39b1fdb

Browse files
authored
Merge pull request #17 from JuliaAI/dbscan
Add interface for DBSCAN
2 parents 867f7f0 + 85cfa5c commit 39b1fdb

File tree

4 files changed

+252
-42
lines changed

4 files changed

+252
-42
lines changed

Project.toml

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,3 @@ Clustering = "0.14"
1313
Distances = "0.9, 0.10"
1414
MLJModelInterface = "1.4"
1515
julia = "1.6"
16-
17-
[extras]
18-
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
19-
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
20-
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
21-
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
22-
23-
[targets]
24-
test = ["LinearAlgebra", "MLJBase", "Random", "Test"]

src/MLJClusteringInterface.jl

Lines changed: 170 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ using Distances
1616

1717
# ===================================================================
1818
## EXPORTS
19-
export KMeans, KMedoids
19+
export KMeans, KMedoids, DBSCAN
2020

2121
# ===================================================================
2222
## CONSTANTS
@@ -25,19 +25,14 @@ const MMI = MLJModelInterface
2525
const Cl = Clustering
2626
const PKG = "MLJClusteringInterface"
2727

28-
####
29-
#### KMeans
30-
####
28+
29+
# # K_MEANS
3130

3231
@mlj_model mutable struct KMeans <: MMI.Unsupervised
3332
k::Int = 3::(_ ≥ 2)
3433
metric::SemiMetric = SqEuclidean()
3534
end
3635

37-
####
38-
#### KMeans
39-
####
40-
4136
function MMI.fit(model::KMeans, verbosity::Int, X)
4237
# NOTE: using transpose here to get a LinearAlgebra.Transpose object
4338
# which Kmeans can handle.
@@ -66,6 +61,8 @@ function MMI.transform(model::KMeans, fitresult, X)
6661
return MMI.table(X̃, prototype=X)
6762
end
6863

64+
# # K_MEDOIDS
65+
6966
@mlj_model mutable struct KMedoids <: MMI.Unsupervised
7067
k::Int = 3::(_ ≥ 2)
7168
metric::SemiMetric = SqEuclidean()
@@ -100,9 +97,8 @@ function MMI.transform(model::KMedoids, fitresult, X)
10097
return MMI.table(X̃, prototype=X)
10198
end
10299

103-
####
104-
#### Predict methods
105-
####
100+
101+
# # PREDICT FOR K_MEANS AND K_MEDOIDS
106102

107103
function MMI.predict(model::Union{KMeans,KMedoids}, fitresult, Xnew)
108104
locations, cluster_labels = fitresult
@@ -124,12 +120,59 @@ function MMI.predict(model::Union{KMeans,KMedoids}, fitresult, Xnew)
124120
return cluster_labels[pred]
125121
end
126122

127-
####
128-
#### METADATA
129-
####
123+
# # DBSCAN
124+
125+
@mlj_model mutable struct DBSCAN <: MMI.Static
126+
radius::Real = 1.0::(_ > 0)
127+
leafsize::Int = 20::(_ > 0)
128+
min_neighbors::Int = 1::(_ > 0)
129+
min_cluster_size::Int = 1::(_ > 0)
130+
end
131+
132+
# As DBSCAN is `Static`, there is no `fit` to implement.
133+
134+
function MMI.predict(model::DBSCAN, ::Nothing, X)
135+
136+
Xarray = MMI.matrix(X)'
137+
138+
# output of core algorithm:
139+
clusters = Cl.dbscan(
140+
Xarray, model.radius;
141+
leafsize=model.leafsize,
142+
min_neighbors=model.min_neighbors,
143+
min_cluster_size=model.min_cluster_size,
144+
)
145+
nclusters = length(clusters)
146+
147+
# assignments and point types
148+
npoints = size(Xarray, 2)
149+
assignments = zeros(Int, npoints)
150+
raw_point_types = fill('N', npoints)
151+
for (k, cluster) in enumerate(clusters)
152+
for i in cluster.core_indices
153+
assignments[i] = k
154+
raw_point_types[i] = 'C'
155+
end
156+
for i in cluster.boundary_indices
157+
assignments[i] = k
158+
raw_point_types[i] = 'B'
159+
end
160+
end
161+
point_types = MMI.categorical(raw_point_types)
162+
cluster_labels = unique(assignments)
163+
164+
yhat = MMI.categorical(assignments)
165+
report = (; point_types, nclusters, cluster_labels, clusters)
166+
return yhat, report
167+
end
168+
169+
MMI.reporting_operations(::Type{<:DBSCAN}) = (:predict,)
170+
171+
172+
# # METADATA
130173

131174
metadata_pkg.(
132-
(KMeans, KMedoids),
175+
(KMeans, KMedoids, DBSCAN),
133176
name="Clustering",
134177
uuid="aaaa29a8-35af-508c-8bc3-b662a17a0fe5",
135178
url="https://github.com/JuliaStats/Clustering.jl",
@@ -143,7 +186,6 @@ metadata_model(
143186
human_name = "K-means clusterer",
144187
input = MMI.Table(Continuous),
145188
output = MMI.Table(Continuous),
146-
weights = false,
147189
path = "$(PKG).KMeans"
148190
)
149191

@@ -152,9 +194,18 @@ metadata_model(
152194
human_name = "K-medoids clusterer",
153195
input = MMI.Table(Continuous),
154196
output = MMI.Table(Continuous),
155-
weights = false,
156197
path = "$(PKG).KMedoids"
157198
)
199+
200+
metadata_model(
201+
DBSCAN,
202+
human_name = "DBSCAN clusterer (density-based spatial clustering of "*
203+
"applications with noise)",
204+
input = MMI.Table(Continuous),
205+
path = "$(PKG).DBSCAN"
206+
)
207+
208+
158209
"""
159210
$(MMI.doc_header(KMeans))
160211
@@ -323,6 +374,107 @@ See also
323374
"""
324375
KMedoids
325376

377+
"""
378+
$(MMI.doc_header(DBSCAN))
326379
327-
end # module
380+
[DBSCAN](https://en.wikipedia.org/wiki/DBSCAN) is a clustering algorithm that groups
381+
together points that are closely packed together (points with many nearby neighbors),
382+
marking as outliers points that lie alone in low-density regions (whose nearest neighbors
383+
are too far away). More information is available at the [Clustering.jl
384+
documentation](https://juliastats.org/Clustering.jl/stable/index.html). Use `predict` to
385+
get cluster assignments. Point types - core, boundary or noise - are accessed from the
386+
machine report (see below).
387+
388+
This is a static implementation, i.e., it does not generalize to new data instances, and
389+
there is no training data. For clusterers that do generalize, see [`KMeans`](@ref) or
390+
[`KMedoids`](@ref).
391+
392+
In MLJ or MLJBase, create a machine with
393+
394+
mach = machine(model)
395+
396+
# Hyper-parameters
397+
398+
- `radius=1.0`: query radius.
399+
400+
- `leafsize=20`: number of points binned in each leaf node of the nearest neighbor k-d
401+
tree.
402+
403+
- `min_neighbors=1`: minimum number of neighbors required for a point to be a core point.
328404
405+
- `min_cluster_size=1`: minimum number of points in a valid cluster.
406+
407+
408+
# Operations
409+
410+
- `predict(mach, X)`: return cluster label assignments, as an unordered
411+
`CategoricalVector`. Here `X` is any table of input features (eg, a `DataFrame`) whose
412+
columns are of scitype `Continuous`; check column scitypes with `schema(X)`. Note that
413+
points of type `noise` will always get a label of `0`.
414+
415+
416+
# Report
417+
418+
After calling `predict(mach)`, the fields of `report(mach)` are:
419+
420+
- `point_types`: A `CategoricalVector` with the DBSCAN point type classification, one
421+
element per row of `X`. Elements are either `'C'` (core), `'B'` (boundary), or `'N'`
422+
(noise).
423+
424+
- `nclusters`: The number of clusters (excluding the noise "cluster")
425+
426+
- `cluster_labels`: The unique list of cluster labels
427+
428+
- `clusters`: A vector of `Clustering.DbscanCluster` objects from Clustering.jl, which
429+
have these fields:
430+
431+
- `size`: number of points in a cluster (core + boundary)
432+
433+
- `core_indices`: indices of points in the cluster core
434+
435+
- `boundary_indices`: indices of points on the cluster boundary
436+
437+
438+
# Examples
439+
440+
```
441+
using MLJ
442+
443+
X, labels = make_moons(400, noise=0.09, rng=1) # synthetic data with 2 clusters; X
444+
y = map(labels) do label
445+
label == 0 ? "cookie" : "monster"
446+
end;
447+
y = coerce(y, Multiclass);
448+
449+
DBSCAN = @load DBSCAN pkg=Clustering
450+
model = DBSCAN(radius=0.13, min_cluster_size=5)
451+
mach = machine(model)
452+
453+
# compute and output cluster assignments for observations in `X`:
454+
yhat = predict(mach, X)
455+
456+
# get DBSCAN point types:
457+
report(mach).point_types
458+
report(mach).nclusters
459+
460+
# compare cluster labels with actual labels:
461+
compare = zip(yhat, y) |> collect;
462+
compare[1:10] # clusters align with classes
463+
464+
# visualize clusters, noise in red:
465+
points = zip(X.x1, X.x2) |> collect
466+
colors = map(yhat) do i
467+
i == 0 ? :red :
468+
i == 1 ? :blue :
469+
i == 2 ? :green :
470+
i == 3 ? :yellow :
471+
:black
472+
end
473+
using Plots
474+
scatter(points, color=colors)
475+
```
476+
477+
"""
478+
DBSCAN
479+
480+
end # module

test/Project.toml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
[deps]
2+
Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
3+
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
4+
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
5+
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
6+
MLJTestIntegration = "697918b4-fdc1-4f9e-8ff9-929724cee270"
7+
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
8+
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
9+
10+
[compat]
11+
MLJTestIntegration = "0.2.2"

test/runtests.jl

Lines changed: 71 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,18 @@ import Distances
33
import LinearAlgebra: norm
44

55
using MLJBase
6+
using MLJTestIntegration
67
using MLJClusteringInterface
7-
using Random:seed!
8+
using Random: seed!
89
using Test
910

10-
const Dist = Distances
11-
1211
seed!(132442)
1312
X, y = @load_crabs
1413

15-
####
16-
#### KMEANS
17-
####
1814

19-
@testset "Kmeans" begin
15+
# # K_MEANS
16+
17+
@testset "KMeans" begin
2018
barekm = KMeans()
2119
fitresult, cache, report = fit(barekm, 1, X)
2220
R = matrix(transform(barekm, fitresult, X))
@@ -28,25 +26,83 @@ X, y = @load_crabs
2826
p = predict(barekm, fitresult, X)
2927
@test argmin(R[1, :]) == p[1]
3028
@test argmin(R[10, :]) == p[10]
31-
32-
3329
end
3430

35-
####
36-
#### KMEDOIDS
37-
####
3831

39-
@testset "Kmedoids" begin
32+
# # K_MEDOIDS
33+
34+
@testset "KMedoids" begin
4035
barekm = KMedoids()
4136
fitresult, cache, report = fit(barekm, 1, X)
4237
X_array = matrix(X)
4338
R = matrix(transform(barekm, fitresult, X))
44-
@test R[1, 2] Dist.evaluate(
39+
@test R[1, 2] ≈ Distances.evaluate(
4540
barekm.metric, view(X_array, 1, :), view(fitresult[1], :, 2)
4641
)
47-
@test R[10, 3] Dist.evaluate(
42+
@test R[10, 3] ≈ Distances.evaluate(
4843
barekm.metric, view(X_array, 10, :), view(fitresult[1], :, 3)
4944
)
5045
p = predict(barekm, fitresult, X)
5146
@test all(report.assignments .== p)
5247
end
48+
49+
50+
# # DBSCAN
51+
52+
@testset "DBSCAN" begin
53+
54+
# five spot pattern
55+
X = [
56+
0.0 0.0
57+
1.0 0.0
58+
1.0 1.0
59+
0.0 1.0
60+
0.5 0.5
61+
] |> MLJBase.table
62+
63+
# radius < √2 ==> 5 clusters
64+
dbscan = DBSCAN(radius=0.1)
65+
yhat1, report1 = predict(dbscan, nothing, X)
66+
@test report1.nclusters == 5
67+
@test report1.point_types == fill('B', 5)
68+
@test Set(yhat1) == Set(unique(yhat1))
69+
@test Set(report1.cluster_labels) == Set(unique(yhat1))
70+
71+
# DbscanCluster fields:
72+
@test propertynames(report1.clusters[1]) == (:size, :core_indices, :boundary_indices)
73+
74+
# radius > √2 ==> 1 cluster
75+
dbscan = DBSCAN(radius=2+eps())
76+
yhat, report = predict(dbscan, nothing, X)
77+
@test report.nclusters == 1
78+
@test report.point_types == fill('C', 5)
79+
@test length(unique(yhat)) == 1
80+
81+
# radius < √2 && min_cluster_size = 2 ==> all points are noise
82+
dbscan = DBSCAN(radius=0.1, min_cluster_size=2)
83+
yhat, report = predict(dbscan, nothing, X)
84+
@test report.nclusters == 0
85+
@test report.point_types == fill('N', 5)
86+
@test length(unique(yhat)) == 1
87+
88+
# MLJ integration:
89+
model = DBSCAN(radius=0.1)
90+
mach = machine(model) # no training data
91+
yhat = predict(mach, X)
92+
@test yhat == yhat1
93+
@test MLJBase.report(mach).point_types == report1.point_types
94+
@test MLJBase.report(mach).nclusters == report1.nclusters
95+
96+
end
97+
98+
@testset "MLJ interface" begin
99+
models = [KMeans, KMedoids, DBSCAN]
100+
failures, summary = MLJTestIntegration.test(
101+
models,
102+
X;
103+
mod=@__MODULE__,
104+
verbosity=0,
105+
throw=false, # set to true to debug
106+
)
107+
@test isempty(failures)
108+
end

0 commit comments

Comments
 (0)