Skip to content

Commit 9840b7e

Browse files
authored
Merge pull request #243 from chaturv3di/feature/validity_using_MR-MST
Added relative_validity_ score as an attribute in HDBSCAN class
2 parents d580ccb + 2231713 commit 9840b7e

File tree

1 file changed

+96
-3
lines changed

1 file changed

+96
-3
lines changed

hdbscan/hdbscan_.py

Lines changed: 96 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -758,6 +758,17 @@ class HDBSCAN(BaseEstimator, ClusterMixin):
758758
of the list being a numpy array of exemplar points for a cluster --
759759
these points are the "most representative" points of the cluster.
760760
761+
relative_validity_ : float
762+
A fast approximation of the Density Based Cluster Validity (DBCV)
763+
score [4]. The only difference, and the speed, comes from the fact
764+
that this relative_validity_ is computed using the mutual-
765+
reachability minimum spanning tree, i.e. minimum_spanning_tree_,
766+
instead of the all-points minimum spanning tree used in the
767+
reference. This score might not be an objective measure of the
768+
goodness of clustering. It may only be used to compare results
769+
across different choices of hyper-parameters, therefore is only a
770+
relative score.
771+
761772
References
762773
----------
763774
@@ -775,6 +786,10 @@ class HDBSCAN(BaseEstimator, ClusterMixin):
775786
cluster tree. In Advances in Neural Information Processing Systems
776787
(pp. 343-351).
777788
789+
.. [4] Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and
790+
Sander, J., 2014. Density-Based Clustering Validation. In SDM
791+
(pp. 839-847).
792+
778793
"""
779794

780795
def __init__(self, min_cluster_size=5, min_samples=None,
@@ -813,6 +828,7 @@ def __init__(self, min_cluster_size=5, min_samples=None,
813828
self._raw_data = None
814829
self._outlier_scores = None
815830
self._prediction_data = None
831+
self._relative_validity = None
816832

817833
def fit(self, X, y=None):
818834
"""Perform HDBSCAN clustering from features or distance matrix.
@@ -892,7 +908,7 @@ def generate_prediction_data(self):
892908
else:
893909
warn('Metric {} not supported for prediction data!'.format(self.metric))
894910
return
895-
911+
896912
self._prediction_data = PredictionData(
897913
self._raw_data, self.condensed_tree_, min_samples,
898914
tree_type=tree_type, metric=self.metric,
@@ -963,5 +979,82 @@ def exemplars_(self):
963979
return self._prediction_data.exemplars
964980
else:
965981
raise AttributeError('Currently exemplars require the use of vector input data'
966-
'with a suitable metric. This will likely change in the '
967-
'future, but for now no exemplars can be provided')
982+
'with a suitable metric. This will likely change in the '
983+
'future, but for now no exemplars can be provided')
984+
985+
@property
def relative_validity_(self):
    """Fast, relative approximation of the DBCV cluster validity score.

    The score is derived from the edges of ``minimum_spanning_tree_``
    (the mutual-reachability MST) rather than from an all-points minimum
    spanning tree. For every cluster the largest intra-cluster edge is
    taken as its density sparseness and the smallest edge leaving the
    cluster towards another cluster as its density separation; the
    per-cluster indices are then averaged, weighted by cluster size over
    the total number of points (noise included).

    The result is cached in ``_relative_validity`` after the first
    computation and returned from the cache on later accesses.

    Returns
    -------
    score : float
        Size-weighted validity index. Clusters whose sparseness and
        separation are both zero contribute 0 instead of NaN.

    Raises
    ------
    AttributeError
        If ``gen_min_span_tree`` is falsy.
    """
    if self._relative_validity is not None:
        return self._relative_validity

    if not self.gen_min_span_tree:
        raise AttributeError("Minimum spanning tree not present. "
                             "Either HDBSCAN object was created with "
                             "gen_min_span_tree=False or the tree was "
                             "not generated in spite of it owing to "
                             "internal optimization criteria.")

    labels = np.asarray(self.labels_)
    # Shifting by one puts the noise label (-1) into bin 0.
    sizes = np.bincount(labels + 1)
    noise_size = sizes[0]
    cluster_size = sizes[1:]
    total = noise_size + np.sum(cluster_size)
    num_clusters = len(cluster_size)
    DSC = np.zeros(num_clusters)
    min_outlier_sep = np.inf  # only required if num_clusters = 1
    correction_const = 2  # only required if num_clusters = 1

    # Ultimately, for each Ci, we only require the
    # minimum of DSPC(Ci, Cj) over all Cj != Ci.
    # So let's call this value DSPC_wrt(Ci), i.e.
    # density separation 'with respect to' Ci.
    DSPC_wrt = np.full(num_clusters, np.inf)
    max_distance = 0

    mst_df = self.minimum_spanning_tree_.to_pandas()

    # Resolve the labels of both endpoints of every edge up front so the
    # loop walks plain arrays instead of building a Series per edge.
    from_labels = labels[mst_df['from'].values.astype(np.intp)]
    to_labels = labels[mst_df['to'].values.astype(np.intp)]
    distances = mst_df['distance'].values

    for label1, label2, length in zip(from_labels, to_labels, distances):
        max_distance = max(max_distance, length)

        if label1 == -1 and label2 == -1:
            continue
        if label1 == -1 or label2 == -1:
            # Exactly one of the points is noise.
            min_outlier_sep = min(min_outlier_sep, length)
            continue

        if label1 == label2:
            # Set the density sparseness of the cluster
            # to the sparsest value seen so far.
            DSC[label1] = max(length, DSC[label1])
        else:
            # Check whether density separations with
            # respect to each of these clusters can
            # be reduced.
            DSPC_wrt[label1] = min(length, DSPC_wrt[label1])
            DSPC_wrt[label2] = min(length, DSPC_wrt[label2])

    # In case min_outlier_sep is still np.inf, we assign a new value to it.
    # This only makes sense if num_clusters = 1 since it has turned out
    # that the MR-MST has no edges between a noise point and a core point.
    if min_outlier_sep == np.inf:
        min_outlier_sep = max_distance

    # DSPC_wrt[Ci] might be infinite if the connected component for Ci is
    # an "island" in the MR-MST. Whereas for other clusters Cj and Ck, the
    # MR-MST might contain an edge with one point in Cj and the other one
    # in Ck. Here, we replace the infinite density separation of Ci by
    # another large enough value.
    #
    # TODO: Think of a better yet efficient way to handle this.
    correction = correction_const * (
        max_distance if num_clusters > 1 else min_outlier_sep)
    DSPC_wrt[np.isinf(DSPC_wrt)] = correction

    # Guard the normalisation: when both sparseness and separation are
    # zero (e.g. clusters of duplicate points) the ratio is 0/0, which
    # would otherwise turn the whole score into NaN.
    denominator = np.maximum(DSPC_wrt, DSC)
    V_index = np.zeros(num_clusters)
    defined = denominator > 0
    V_index[defined] = (DSPC_wrt[defined] - DSC[defined]) / denominator[defined]

    score = np.sum(cluster_size * V_index / total)
    self._relative_validity = score
    return self._relative_validity

0 commit comments

Comments
 (0)