2424from ._hdbscan_reachability import kdtree_pdist_mutual_reachability , kdtree_mutual_reachability , mutual_reachability
2525from .plots import SingleLinkageTree
2626
27+ from warnings import warn
28+
2729try :
2830 from fastcluster import single
2931 HAVE_FASTCLUSTER = True
@@ -118,6 +120,69 @@ def _rsl_large_kdtree_fastcluster(X, cut, k=5, alpha=1.4142135623730951, gamma=5
118120
119121
120122def robust_single_linkage (X , cut , k = 5 , alpha = 1.4142135623730951 , gamma = 5 , metric = 'minkowski' , p = 2 , algorithm = None ):
123+ """Perform robust single linkage clustering from a vector array
124+ or distance matrix.
125+
126+ Parameters
127+ ----------
128+ X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
129+ array of shape (n_samples, n_samples)
130+ A feature array, or array of distances between samples if
131+ ``metric='precomputed'``.
132+
133+ cut : float
134+        The reachability distance value to cut the cluster hierarchy at
135+        to derive a flat cluster labelling.
136+
137+ k : int, optional
138+ Reachability distances will be computed with regard to the `k`
139+ nearest neighbors. (default 5)
140+
141+ alpha : float, optional
142+ Distance scaling for reachability distance computation. Reachability
143+ distance is computed as $max \{ core_k(a), core_k(b), 1/\a lpha d(a,b) \}$.
144+ (default sqrt(2))
145+
146+ gamma : int, optional
147+ Ignore any clusters in the flat clustering with size less than gamma,
148+ and declare points in such clusters as noise points. (default 5)
149+
150+ metric : string, or callable, optional
151+ The metric to use when calculating distance between instances in a
152+ feature array. If metric is a string or callable, it must be one of
153+ the options allowed by metrics.pairwise.pairwise_distances for its
154+ metric parameter.
155+ If metric is "precomputed", X is assumed to be a distance matrix and
156+ must be square.
157+
158+ algorithm : string, optional
159+ Exactly which algorithm to use; hdbscan has variants specialised
160+ for different characteristics of the data. By default this is set
161+ to ``best`` which chooses the "best" algorithm given the nature of
162+ the data. You can force other options if you believe you know
163+ better. Options are:
164+ * ``small``
165+ * ``small_kdtree``
166+ * ``large_kdtree``
167+ * ``large_kdtree_fastcluster``
168+
169+ Returns
170+ -------
171+ labels : array [n_samples]
172+ Cluster labels for each point. Noisy samples are given the label -1.
173+
174+ single_linkage_tree : array [n_samples - 1, 4]
175+ The single linkage tree produced during clustering in scipy
176+ hierarchical clustering format
177+ (see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html).
178+
179+ References
180+ ----------
181+ K. Chaudhuri and S. Dasgupta.
182+ "Rates of convergence for the cluster tree."
183+ In Advances in Neural Information Processing Systems, 2010.
184+
185+ """
121186
122187 if type (k ) is not int or k < 1 :
123188 raise ValueError ('k must be an integer greater than zero!' )
@@ -153,22 +218,126 @@ def robust_single_linkage(X, cut, k=5, alpha=1.4142135623730951, gamma=5, metric
153218
154219
155220class RobustSingleLinkage (BaseEstimator , ClusterMixin ):
156-
157- def __init__ (self , k = 5 , alpha = 1.4142135623730951 , gamma = 5 , metric = euclidean , p = None ):
221+ """Perform robust single linkage clustering from a vector array
222+ or distance matrix.
223+
224+ Roust single linkage is a modified version of single linkage that
225+ attempts to be more robust to noise. Specifically the goal is to
226+ more accurately approximate the level set tree of the unknown
227+ probability density function from which the sample data has
228+ been drawn.
229+
230+ Parameters
231+ ----------
232+ X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
233+ array of shape (n_samples, n_samples)
234+ A feature array, or array of distances between samples if
235+ ``metric='precomputed'``.
236+
237+ cut : float
238+ The reachability distance value to cut the cluster heirarchy at
239+ to derive a flat cluster labelling.
240+
241+ k : int, optional
242+ Reachability distances will be computed with regard to the `k`
243+ nearest neighbors. (default 5)
244+
245+ alpha : float, optional
246+ Distance scaling for reachability distance computation. Reachability
247+ distance is computed as $max \{ core_k(a), core_k(b), 1/\a lpha d(a,b) \}$.
248+ (default sqrt(2))
249+
250+ gamma : int, optional
251+ Ignore any clusters in the flat clustering with size less than gamma,
252+ and declare points in such clusters as noise points. (default 5)
253+
254+ metric : string, or callable, optional
255+ The metric to use when calculating distance between instances in a
256+ feature array. If metric is a string or callable, it must be one of
257+ the options allowed by metrics.pairwise.pairwise_distances for its
258+ metric parameter.
259+ If metric is "precomputed", X is assumed to be a distance matrix and
260+ must be square.
261+
262+ algorithm : string, optional
263+ Exactly which algorithm to use; hdbscan has variants specialised
264+ for different characteristics of the data. By default this is set
265+ to ``best`` which chooses the "best" algorithm given the nature of
266+ the data. You can force other options if you believe you know
267+ better. Options are:
268+ * ``small``
269+ * ``small_kdtree``
270+ * ``large_kdtree``
271+ * ``large_kdtree_fastcluster``
272+
273+ Attributes
274+ -------
275+ labels_ : array [n_samples]
276+ Cluster labels for each point. Noisy samples are given the label -1.
277+
278+ cluster_hierarchy_ : SingleLinkageTree object
279+ The single linkage tree produced during clustering. This object provides
280+ several methods for:
281+ * Plotting
282+ * Generating a flat clustering
283+ * Exporting to NetworkX
284+ * Exporting to Pandas
285+
286+ References
287+ ----------
288+ K. Chaudhuri and S. Dasgupta.
289+ "Rates of convergence for the cluster tree."
290+ In Advances in Neural Information Processing Systems, 2010.
291+
292+ """
293+
294+ def __init__ (self , k = 5 , alpha = 1.4142135623730951 , gamma = 5 , metric = 'euclidean' , p = None ):
158295
159296 self .k = k
160297 self .alpha = alpha
161298 self .gamma = gamma
162299 self .metric = metric
163300 self .p = p
164301
165- self .cluster_hierarchy_ = None
302+ self ._cluster_hierarchy_ = None
166303
167304 def fit (self , X , y = None ):
305+ """Perform robust single linkage clustering from features or distance matrix.
306+
307+ Parameters
308+ ----------
309+ X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
310+ array of shape (n_samples, n_samples)
311+ A feature array, or array of distances between samples if
312+ ``metric='precomputed'``.
313+ """
168314 X = check_array (X , accept_sparse = 'csr' )
169- self .labels_ , self .cluster_hierarchy = robust_single_linkage (X , ** self .get_params ())
315+ self .labels_ , self ._cluster_hierarchy = robust_single_linkage (X , ** self .get_params ())
170316 return self
171317
172318 def fit_predict (self , X , y = None ):
319+ """Performs clustering on X and returns cluster labels.
320+
321+ Parameters
322+ ----------
323+ X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
324+ array of shape (n_samples, n_samples)
325+ A feature array, or array of distances between samples if
326+ ``metric='precomputed'``.
327+
328+ Returns
329+ -------
330+ y : ndarray, shape (n_samples,)
331+ cluster labels
332+ """
333+
173334 self .fit (X )
174- return self .labels_
335+ return self .labels_
336+
337+ @property
338+ def cluster_hierarchy_ (self ):
339+ if self ._cluster_hierarchy_ is not None :
340+ return SingleLinkageTree (self ._cluster_hierarchy_ )
341+ else :
342+ warn ('No single linkage tree was generated; try running fit first.' )
343+ return None
0 commit comments