Skip to content

Commit 778acbd

Browse files
committed
Complete robust single linkage implementation.
1 parent 1fc3b8e commit 778acbd

File tree

1 file changed

+174
-5
lines changed

1 file changed

+174
-5
lines changed

hdbscan/robust_single_linkage_.py

Lines changed: 174 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
from ._hdbscan_reachability import kdtree_pdist_mutual_reachability, kdtree_mutual_reachability, mutual_reachability
2525
from .plots import SingleLinkageTree
2626

27+
from warnings import warn
28+
2729
try:
2830
from fastcluster import single
2931
HAVE_FASTCLUSTER = True
@@ -118,6 +120,69 @@ def _rsl_large_kdtree_fastcluster(X, cut, k=5, alpha=1.4142135623730951, gamma=5
118120

119121

120122
def robust_single_linkage(X, cut, k=5, alpha=1.4142135623730951, gamma=5, metric='minkowski', p=2, algorithm=None):
123+
"""Perform robust single linkage clustering from a vector array
124+
or distance matrix.
125+
126+
Parameters
127+
----------
128+
X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
129+
array of shape (n_samples, n_samples)
130+
A feature array, or array of distances between samples if
131+
``metric='precomputed'``.
132+
133+
cut : float
134+
The reachability distance value to cut the cluster hierarchy at
135+
to derive a flat cluster labelling.
136+
137+
k : int, optional
138+
Reachability distances will be computed with regard to the `k`
139+
nearest neighbors. (default 5)
140+
141+
alpha : float, optional
142+
Distance scaling for reachability distance computation. Reachability
143+
distance is computed as $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.
144+
(default sqrt(2))
145+
146+
gamma : int, optional
147+
Ignore any clusters in the flat clustering with size less than gamma,
148+
and declare points in such clusters as noise points. (default 5)
149+
150+
metric : string, or callable, optional
151+
The metric to use when calculating distance between instances in a
152+
feature array. If metric is a string or callable, it must be one of
153+
the options allowed by metrics.pairwise.pairwise_distances for its
154+
metric parameter.
155+
If metric is "precomputed", X is assumed to be a distance matrix and
156+
must be square.
157+
158+
algorithm : string, optional
159+
Exactly which algorithm to use; hdbscan has variants specialised
160+
for different characteristics of the data. By default this is set
161+
to ``best`` which chooses the "best" algorithm given the nature of
162+
the data. You can force other options if you believe you know
163+
better. Options are:
164+
* ``small``
165+
* ``small_kdtree``
166+
* ``large_kdtree``
167+
* ``large_kdtree_fastcluster``
168+
169+
Returns
170+
-------
171+
labels : array [n_samples]
172+
Cluster labels for each point. Noisy samples are given the label -1.
173+
174+
single_linkage_tree : array [n_samples - 1, 4]
175+
The single linkage tree produced during clustering in scipy
176+
hierarchical clustering format
177+
(see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html).
178+
179+
References
180+
----------
181+
K. Chaudhuri and S. Dasgupta.
182+
"Rates of convergence for the cluster tree."
183+
In Advances in Neural Information Processing Systems, 2010.
184+
185+
"""
121186

122187
if type(k) is not int or k < 1:
123188
raise ValueError('k must be an integer greater than zero!')
@@ -153,22 +218,126 @@ def robust_single_linkage(X, cut, k=5, alpha=1.4142135623730951, gamma=5, metric
153218

154219

155220
class RobustSingleLinkage(BaseEstimator, ClusterMixin):
    """Perform robust single linkage clustering from a vector array
    or distance matrix.

    Robust single linkage is a modified version of single linkage that
    attempts to be more robust to noise. Specifically the goal is to
    more accurately approximate the level set tree of the unknown
    probability density function from which the sample data has
    been drawn.

    Parameters
    ----------
    k : int, optional
        Reachability distances will be computed with regard to the `k`
        nearest neighbors. (default 5)

    alpha : float, optional
        Distance scaling for reachability distance computation. Reachability
        distance is computed as $max \\{ core_k(a), core_k(b), 1/\\alpha d(a,b) \\}$.
        (default sqrt(2))

    gamma : int, optional
        Ignore any clusters in the flat clustering with size less than gamma,
        and declare points in such clusters as noise points. (default 5)

    metric : string, or callable, optional
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by metrics.pairwise.pairwise_distances for its
        metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square. (default 'euclidean')

    p : int, optional
        The p value to use for the Minkowski metric, if applicable.
        (default None)

    cut : float, optional
        The reachability distance value to cut the cluster hierarchy at
        to derive a flat cluster labelling. (default 0.4)

    Attributes
    ----------
    labels_ : array [n_samples]
        Cluster labels for each point. Noisy samples are given the label -1.

    cluster_hierarchy_ : SingleLinkageTree object
        The single linkage tree produced during clustering. This object
        provides several methods for:
        * Plotting
        * Generating a flat clustering
        * Exporting to NetworkX
        * Exporting to Pandas

    References
    ----------
    K. Chaudhuri and S. Dasgupta.
    "Rates of convergence for the cluster tree."
    In Advances in Neural Information Processing Systems, 2010.

    """

    def __init__(self, k=5, alpha=1.4142135623730951, gamma=5,
                 metric='euclidean', p=None, cut=0.4):
        # Parameters are stored unmodified under their own names so that
        # sklearn's get_params()/set_params() machinery works correctly.
        self.k = k
        self.alpha = alpha
        self.gamma = gamma
        self.metric = metric
        self.p = p
        # `cut` must be an __init__ parameter: fit() forwards get_params()
        # to robust_single_linkage(), which requires a `cut` argument.
        self.cut = cut

        # Raw linkage array; exposed lazily via the cluster_hierarchy_
        # property, wrapped as a SingleLinkageTree.
        self._cluster_hierarchy_ = None

    def fit(self, X, y=None):
        """Perform robust single linkage clustering from features or
        distance matrix.

        Parameters
        ----------
        X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
                array of shape (n_samples, n_samples)
            A feature array, or array of distances between samples if
            ``metric='precomputed'``.

        Returns
        -------
        self : object
            The fitted estimator.
        """
        X = check_array(X, accept_sparse='csr')
        # NOTE: the trailing underscore on _cluster_hierarchy_ matters; the
        # cluster_hierarchy_ property reads exactly this attribute.
        self.labels_, self._cluster_hierarchy_ = \
            robust_single_linkage(X, **self.get_params())
        return self

    def fit_predict(self, X, y=None):
        """Perform clustering on X and return cluster labels.

        Parameters
        ----------
        X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
                array of shape (n_samples, n_samples)
            A feature array, or array of distances between samples if
            ``metric='precomputed'``.

        Returns
        -------
        y : ndarray, shape (n_samples,)
            Cluster labels.
        """
        self.fit(X)
        return self.labels_

    @property
    def cluster_hierarchy_(self):
        """The single linkage tree produced during clustering, wrapped as a
        SingleLinkageTree for plotting/export, or None if fit has not run."""
        if self._cluster_hierarchy_ is None:
            warn('No single linkage tree was generated; try running fit first.')
            return None
        return SingleLinkageTree(self._cluster_hierarchy_)

0 commit comments

Comments
 (0)