@@ -16,7 +16,7 @@ using Distances
# ===================================================================
# # EXPORTS
-export KMeans, KMedoids
+export KMeans, KMedoids, DBSCAN

# ===================================================================
# # CONSTANTS
@@ -25,19 +25,14 @@ const MMI = MLJModelInterface
const Cl = Clustering
const PKG = "MLJClusteringInterface"

-# ###
-# ### KMeans
-# ###
+
+# # K_MEANS

@mlj_model mutable struct KMeans <: MMI.Unsupervised
    k::Int = 3::(_ ≥ 2)
    metric::SemiMetric = SqEuclidean()
end

-# ###
-# ### KMeans
-# ###
-
function MMI.fit(model::KMeans, verbosity::Int, X)
    # NOTE: using transpose here to get a LinearAlgebra.Transpose object
    # which Kmeans can handle.
@@ -66,6 +61,8 @@ function MMI.transform(model::KMeans, fitresult, X)
    return MMI.table(X̃, prototype=X)
end

+# # K_MEDOIDS
+
@mlj_model mutable struct KMedoids <: MMI.Unsupervised
    k::Int = 3::(_ ≥ 2)
    metric::SemiMetric = SqEuclidean()
@@ -100,9 +97,8 @@ function MMI.transform(model::KMedoids, fitresult, X)
    return MMI.table(X̃, prototype=X)
end

-# ###
-# ### Predict methods
-# ###
+
+# # PREDICT FOR K_MEANS AND K_MEDOIDS

function MMI.predict(model::Union{KMeans,KMedoids}, fitresult, Xnew)
    locations, cluster_labels = fitresult
@@ -124,12 +120,59 @@ function MMI.predict(model::Union{KMeans,KMedoids}, fitresult, Xnew)
    return cluster_labels[pred]
end

-# ###
-# ### METADATA
-# ###
+# # DBSCAN
+
+@mlj_model mutable struct DBSCAN <: MMI.Static
+    radius::Real = 1.0::(_ > 0)
+    leafsize::Int = 20::(_ > 0)
+    min_neighbors::Int = 1::(_ > 0)
+    min_cluster_size::Int = 1::(_ > 0)
+end
+
+# As DBSCAN is `Static`, there is no `fit` to implement.
+
+function MMI.predict(model::DBSCAN, ::Nothing, X)
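+    # the fitresult slot is always `nothing` for a `Static` model, hence `::Nothing`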
+
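+    # Clustering.jl expects observations as columns, hence the transpose below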
+    Xarray = MMI.matrix(X)'
+
+    # output of core algorithm:
+    clusters = Cl.dbscan(
+        Xarray, model.radius;
+        leafsize=model.leafsize,
+        min_neighbors=model.min_neighbors,
+        min_cluster_size=model.min_cluster_size,
+    )
+    nclusters = length(clusters)
+
+    # assignments and point types
+    npoints = size(Xarray, 2)
+    assignments = zeros(Int, npoints)
+    raw_point_types = fill('N', npoints)
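+    # points never assigned below keep label 0 and type 'N' (noise)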
+    for (k, cluster) in enumerate(clusters)
+        for i in cluster.core_indices
+            assignments[i] = k
+            raw_point_types[i] = 'C'
+        end
+        for i in cluster.boundary_indices
+            assignments[i] = k
+            raw_point_types[i] = 'B'
+        end
+    end
+    point_types = MMI.categorical(raw_point_types)
+    cluster_labels = unique(assignments)
+
+    yhat = MMI.categorical(assignments)
+    report = (; point_types, nclusters, cluster_labels, clusters)
+    return yhat, report
+end
+
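+# trait declaring that `predict` also returns a report, exposed via `report(mach)`: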
+MMI.reporting_operations(::Type{<:DBSCAN}) = (:predict,)
+
+
+# # METADATA

metadata_pkg.(
-    (KMeans, KMedoids),
+    (KMeans, KMedoids, DBSCAN),
    name="Clustering",
    uuid="aaaa29a8-35af-508c-8bc3-b662a17a0fe5",
    url="https://github.com/JuliaStats/Clustering.jl",
@@ -143,7 +186,6 @@ metadata_model(
    human_name = "K-means clusterer",
    input = MMI.Table(Continuous),
    output = MMI.Table(Continuous),
-    weights = false,
    path = "$(PKG).KMeans"
)

@@ -152,9 +194,18 @@ metadata_model(
    human_name = "K-medoids clusterer",
    input = MMI.Table(Continuous),
    output = MMI.Table(Continuous),
-    weights = false,
    path = "$(PKG).KMedoids"
)
+
+metadata_model(
+    DBSCAN,
+    human_name = "DBSCAN clusterer (density-based spatial clustering of " *
+        "applications with noise)",
+    input = MMI.Table(Continuous),
+    path = "$(PKG).DBSCAN"
+)
+
+
"""
$(MMI.doc_header(KMeans))

@@ -323,6 +374,107 @@ See also
"""
KMedoids

+"""
+$(MMI.doc_header(DBSCAN))

-end # module
+[DBSCAN](https://en.wikipedia.org/wiki/DBSCAN) is a clustering algorithm that groups
+together points that are closely packed (points with many nearby neighbors), marking as
+outliers points that lie alone in low-density regions (whose nearest neighbors are too
+far away). More information is available at the [Clustering.jl
+documentation](https://juliastats.org/Clustering.jl/stable/index.html). Use `predict` to
+get cluster assignments. Point types (core, boundary, or noise) are accessed from the
+machine report (see below).
+
+This is a static implementation, i.e., it does not generalize to new data instances, and
+there is no training data. For clusterers that do generalize, see [`KMeans`](@ref) or
+[`KMedoids`](@ref).
+
+In MLJ or MLJBase, create a machine with
+
+    mach = machine(model)
+
+# Hyper-parameters
+
+- `radius=1.0`: query radius.
+
+- `leafsize=20`: number of points binned in each leaf node of the nearest-neighbor k-d
+  tree.
+
+- `min_neighbors=1`: minimum number of neighbors required for a point to be a core point.

+- `min_cluster_size=1`: minimum number of points in a valid cluster.
+
+
+# Operations
+
+- `predict(mach, X)`: return cluster label assignments, as an unordered
+  `CategoricalVector`. Here `X` is any table of input features (e.g., a `DataFrame`) whose
+  columns are of scitype `Continuous`; check column scitypes with `schema(X)`. Note that
+  points of type noise will always get a label of `0`.
+
+
+# Report
+
+After calling `predict(mach, X)`, the fields of `report(mach)` are:
+
+- `point_types`: A `CategoricalVector` with the DBSCAN point type classification, one
+  element per row of `X`. Elements are either `'C'` (core), `'B'` (boundary), or `'N'`
+  (noise).
+
+- `nclusters`: The number of clusters (excluding the noise "cluster").
+
+- `cluster_labels`: The unique list of cluster labels.
+
+- `clusters`: A vector of `Clustering.DbscanCluster` objects from Clustering.jl, which
+  have these fields:
+
+    - `size`: number of points in a cluster (core + boundary)
+
+    - `core_indices`: indices of points in the cluster core
+
+    - `boundary_indices`: indices of points on the cluster boundary
+
+
+# Examples
+
+```
+using MLJ
+
+X, labels = make_moons(400, noise=0.09, rng=1) # synthetic data with 2 clusters
+y = map(labels) do label
+    label == 0 ? "cookie" : "monster"
+end;
+y = coerce(y, Multiclass);
+
+DBSCAN = @load DBSCAN pkg=Clustering
+model = DBSCAN(radius=0.13, min_cluster_size=5)
+mach = machine(model)
+
+# compute and output cluster assignments for observations in `X`:
+yhat = predict(mach, X)
+
+# get DBSCAN point types:
+report(mach).point_types
+report(mach).nclusters
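+
+# other report fields documented above:
+report(mach).cluster_labels
+report(mach).clusters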
+
+# compare cluster labels with actual labels:
+compare = zip(yhat, y) |> collect;
+compare[1:10] # clusters align with classes
+
+# visualize clusters, noise in red:
+points = zip(X.x1, X.x2) |> collect
+colors = map(yhat) do i
+    i == 0 ? :red :
+    i == 1 ? :blue :
+    i == 2 ? :green :
+    i == 3 ? :yellow :
+        :black
+end
+using Plots
+scatter(points, color=colors)
+```
+
+"""
+DBSCAN
+
+end # module