@@ -758,6 +758,17 @@ class HDBSCAN(BaseEstimator, ClusterMixin):
758758 of the list being a numpy array of exemplar points for a cluster --
759759 these points are the "most representative" points of the cluster.
760760
761+ relative_validity_ : float
762+ A fast approximation of the Density Based Cluster Validity (DBCV)
763+ score [4]. The only difference, and the speed, comes from the fact
764+ that this relative_validity_ is computed using the mutual-
765+ reachability minimum spanning tree, i.e. minimum_spanning_tree_,
766+ instead of the all-points minimum spanning tree used in the
767+ reference. This score might not be an objective measure of the
768+ goodness of clustering. It may only be used to compare results
769+ across different choices of hyper-parameters, therefore is only a
770+ relative score.
771+
761772 References
762773 ----------
763774
@@ -775,6 +786,10 @@ class HDBSCAN(BaseEstimator, ClusterMixin):
775786 cluster tree. In Advances in Neural Information Processing Systems
776787 (pp. 343-351).
777788
789+ .. [4] Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and
790+ Sander, J., 2014. Density-Based Clustering Validation. In SDM
791+ (pp. 839-847).
792+
778793 """
779794
780795 def __init__ (self , min_cluster_size = 5 , min_samples = None ,
@@ -813,6 +828,7 @@ def __init__(self, min_cluster_size=5, min_samples=None,
813828 self ._raw_data = None
814829 self ._outlier_scores = None
815830 self ._prediction_data = None
831+ self ._relative_validity = None
816832
817833 def fit (self , X , y = None ):
818834 """Perform HDBSCAN clustering from features or distance matrix.
@@ -892,7 +908,7 @@ def generate_prediction_data(self):
892908 else :
893909 warn ('Metric {} not supported for prediction data!' .format (self .metric ))
894910 return
895-
911+
896912 self ._prediction_data = PredictionData (
897913 self ._raw_data , self .condensed_tree_ , min_samples ,
898914 tree_type = tree_type , metric = self .metric ,
@@ -963,5 +979,82 @@ def exemplars_(self):
963979 return self ._prediction_data .exemplars
964980 else :
965981 raise AttributeError ('Currently exemplars require the use of vector input data'
966- 'with a suitable metric. This will likely change in the '
967- 'future, but for now no exemplars can be provided' )
982+ 'with a suitable metric. This will likely change in the '
983+ 'future, but for now no exemplars can be provided' )
984+
@property
def relative_validity_(self):
    """Relative validity score computed from the minimum spanning tree.

    The score is a size-weighted sum of per-cluster validity indices
    ``(separation - sparseness) / max(separation, sparseness)``, where
    both quantities are read off the edges of
    ``minimum_spanning_tree_``:

    * sparseness of a cluster is the longest tree edge joining two of
      its own points;
    * separation of a cluster is the shortest tree edge joining one of
      its points to a point of a different cluster.

    The result is cached in ``_relative_validity`` after the first
    successful computation.

    Returns
    -------
    score : float
        Sum over clusters of ``cluster_size * validity / n_points``,
        where ``n_points`` also counts noise points (label ``-1``).

    Raises
    ------
    AttributeError
        If ``gen_min_span_tree`` is falsy.
    """
    if self._relative_validity is not None:
        return self._relative_validity

    if not self.gen_min_span_tree:
        raise AttributeError("Minimum spanning tree not present. " +
                             "Either HDBSCAN object was created with " +
                             "gen_min_span_tree=False or the tree was " +
                             "not generated in spite of it owing to " +
                             "internal optimization criteria.")

    labels = np.asarray(self.labels_)
    # Shift by one so that noise (-1) lands in bin 0 and cluster k in
    # bin k + 1.
    sizes = np.bincount(labels + 1)
    cluster_size = sizes[1:]
    total = sizes.sum()
    num_clusters = len(cluster_size)
    correction_const = 2

    # Density sparseness of each cluster: longest intra-cluster edge.
    DSC = np.zeros(num_clusters)
    # Ultimately, for each Ci, we only require the minimum of
    # DSPC(Ci, Cj) over all Cj != Ci. Call this value DSPC_wrt(Ci),
    # i.e. the density separation 'with respect to' Ci.
    DSPC_wrt = np.full(num_clusters, np.inf)

    mst_df = self.minimum_spanning_tree_.to_pandas()
    src = labels[mst_df['from'].values.astype(np.intp)]
    dst = labels[mst_df['to'].values.astype(np.intp)]
    lengths = mst_df['distance'].values.astype(float)

    max_distance = lengths.max() if lengths.size else 0

    src_noise = src == -1
    dst_noise = dst == -1
    one_noise = src_noise != dst_noise      # exactly one endpoint is noise
    clustered = ~(src_noise | dst_noise)    # both endpoints are clustered
    intra = clustered & (src == dst)
    inter = clustered & (src != dst)

    # Shortest edge between a noise point and a clustered point; only
    # used for the correction below when there is a single cluster.
    min_outlier_sep = lengths[one_noise].min() if one_noise.any() else np.inf

    # Unbuffered reductions: several edges may hit the same cluster.
    np.maximum.at(DSC, src[intra], lengths[intra])
    np.minimum.at(DSPC_wrt, src[inter], lengths[inter])
    np.minimum.at(DSPC_wrt, dst[inter], lengths[inter])

    # If the tree has no noise-to-cluster edge at all, fall back to the
    # longest edge seen.
    if min_outlier_sep == np.inf:
        min_outlier_sep = max_distance

    # DSPC_wrt[Ci] is still infinite when Ci is an "island" in the tree,
    # i.e. no tree edge links it directly to another cluster (even though
    # other clusters Cj and Ck may be linked by an edge). Replace the
    # infinite separation by another large enough value.
    #
    # TODO: Think of a better yet efficient way to handle this.
    correction = correction_const * (
        max_distance if num_clusters > 1 else min_outlier_sep)
    DSPC_wrt[DSPC_wrt == np.inf] = correction

    # Validity index per cluster. When sparseness and separation are both
    # zero (e.g. every point coincides) the ratio is 0/0; treat such a
    # cluster as contributing 0 instead of poisoning the score with NaN.
    denom = np.maximum(DSPC_wrt, DSC)
    V_index = np.zeros(num_clusters)
    np.divide(DSPC_wrt - DSC, denom, out=V_index, where=denom > 0)

    score = np.sum((cluster_size * V_index) / total)
    self._relative_validity = score
    return self._relative_validity
0 commit comments