@@ -758,6 +758,17 @@ class HDBSCAN(BaseEstimator, ClusterMixin):
758
758
of the list being a numpy array of exemplar points for a cluster --
759
759
these points are the "most representative" points of the cluster.
760
760
761
+ relative_validity_ : float
762
+ A fast approximation of the Density Based Cluster Validity (DBCV)
763
+ score [4]. The only difference, and the speed, comes from the fact
764
+ that this relative_validity_ is computed using the mutual-
765
+ reachability minimum spanning tree, i.e. minimum_spanning_tree_,
766
+ instead of the all-points minimum spanning tree used in the
767
+ reference. This score might not be an objective measure of the
768
+ goodness of clustering. It may only be used to compare results
769
+ across different choices of hyper-parameters, therefore is only a
770
+ relative score.
771
+
761
772
References
762
773
----------
763
774
@@ -775,6 +786,10 @@ class HDBSCAN(BaseEstimator, ClusterMixin):
775
786
cluster tree. In Advances in Neural Information Processing Systems
776
787
(pp. 343-351).
777
788
789
+ .. [4] Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and
790
+ Sander, J., 2014. Density-Based Clustering Validation. In SDM
791
+ (pp. 839-847).
792
+
778
793
"""
779
794
780
795
def __init__ (self , min_cluster_size = 5 , min_samples = None ,
@@ -813,6 +828,7 @@ def __init__(self, min_cluster_size=5, min_samples=None,
813
828
self ._raw_data = None
814
829
self ._outlier_scores = None
815
830
self ._prediction_data = None
831
+ self ._relative_validity = None
816
832
817
833
def fit (self , X , y = None ):
818
834
"""Perform HDBSCAN clustering from features or distance matrix.
@@ -892,7 +908,7 @@ def generate_prediction_data(self):
892
908
else :
893
909
warn ('Metric {} not supported for prediction data!' .format (self .metric ))
894
910
return
895
-
911
+
896
912
self ._prediction_data = PredictionData (
897
913
self ._raw_data , self .condensed_tree_ , min_samples ,
898
914
tree_type = tree_type , metric = self .metric ,
@@ -963,5 +979,82 @@ def exemplars_(self):
963
979
return self ._prediction_data .exemplars
964
980
else :
965
981
raise AttributeError ('Currently exemplars require the use of vector input data'
966
- 'with a suitable metric. This will likely change in the '
967
- 'future, but for now no exemplars can be provided' )
982
+ 'with a suitable metric. This will likely change in the '
983
+ 'future, but for now no exemplars can be provided' )
984
+
985
@property
def relative_validity_(self):
    """Fast approximation of the Density Based Cluster Validity (DBCV) score.

    The score is computed from the edges of the mutual-reachability
    minimum spanning tree (``minimum_spanning_tree_``) rather than from an
    all-points minimum spanning tree, so it is only meaningful for
    comparing runs with different hyper-parameters, not as an absolute
    measure of clustering quality.

    The value is computed on first access and cached in
    ``self._relative_validity``.

    Returns
    -------
    score : float
        Size-weighted sum of the per-cluster validity indices, where the
        weights are ``cluster_size / total_number_of_points`` (noise
        points count towards the total but contribute no validity).

    Raises
    ------
    AttributeError
        If the estimator was created with ``gen_min_span_tree=False``.
    """
    if self._relative_validity is not None:
        return self._relative_validity

    if not self.gen_min_span_tree:
        raise AttributeError("Minimum spanning tree not present. "
                             "Either HDBSCAN object was created with "
                             "gen_min_span_tree=False or the tree was "
                             "not generated in spite of it owing to "
                             "internal optimization criteria.")

    labels = np.asarray(self.labels_)
    # Shift labels by one so that noise (-1) lands in bin 0.
    sizes = np.bincount(labels + 1)
    noise_size = sizes[0]
    cluster_size = sizes[1:]
    total = noise_size + np.sum(cluster_size)
    num_clusters = len(cluster_size)
    correction_const = 2  # only required if num_clusters = 1

    # DSC[Ci]: density sparseness of Ci, i.e. the longest MST edge whose
    # two end points both belong to Ci.
    DSC = np.zeros(num_clusters)

    # Ultimately, for each Ci, we only require the
    # minimum of DSPC(Ci, Cj) over all Cj != Ci.
    # So let's call this value DSPC_wrt(Ci), i.e.
    # density separation 'with respect to' Ci.
    DSPC_wrt = np.full(num_clusters, np.inf)

    mst_df = self.minimum_spanning_tree_.to_pandas()
    # Pull the columns out once as arrays; iterating a DataFrame row by
    # row builds a Series per edge and dominates the running time.
    label_from = labels[mst_df['from'].values.astype(np.intp)]
    label_to = labels[mst_df['to'].values.astype(np.intp)]
    lengths = mst_df['distance'].values

    max_distance = max(0, lengths.max()) if lengths.size else 0

    noise_from = label_from == -1
    noise_to = label_to == -1

    # Edges with exactly one noise end point give the separation between
    # the clustered points and the outliers (used if num_clusters = 1).
    one_noise = noise_from != noise_to
    if one_noise.any():
        min_outlier_sep = lengths[one_noise].min()
    else:
        # The MR-MST has no edge between a noise point and a core point;
        # fall back to the longest edge in the tree.
        min_outlier_sep = max_distance

    clustered = ~(noise_from | noise_to)
    same_cluster = label_from == label_to
    intra = clustered & same_cluster
    inter = clustered & ~same_cluster

    # Unbuffered ufunc reductions: every edge updates its cluster's entry
    # even when several edges hit the same cluster.
    np.maximum.at(DSC, label_from[intra], lengths[intra])
    np.minimum.at(DSPC_wrt, label_from[inter], lengths[inter])
    np.minimum.at(DSPC_wrt, label_to[inter], lengths[inter])

    # DSPC_wrt[Ci] might be infinite if the connected component for Ci is
    # an "island" in the MR-MST. Whereas for other clusters Cj and Ck, the
    # MR-MST might contain an edge with one point in Cj and the other one
    # in Ck. Here, we replace the infinite density separation of Ci by
    # another large enough value.
    #
    # TODO: Think of a better yet efficient way to handle this.
    correction = correction_const * (
        max_distance if num_clusters > 1 else min_outlier_sep)
    DSPC_wrt[DSPC_wrt == np.inf] = correction

    # A cluster whose sparseness and separation are both zero (e.g. made
    # of duplicate points) would give 0/0; let it contribute 0 instead of
    # turning the whole score into NaN.
    denominator = np.maximum(DSPC_wrt, DSC)
    V_index = np.zeros(num_clusters)
    np.divide(DSPC_wrt - DSC, denominator, out=V_index,
              where=denominator > 0)

    score = np.sum((cluster_size * V_index) / total)
    self._relative_validity = score
    return self._relative_validity
0 commit comments