@@ -36,11 +36,24 @@ def all_points_core_distance(distance_matrix, d=2.0):
3636 return result
3737
3838
39- def all_points_mutual_reachability (X , labels , cluster_id ,
40- metric = 'euclidean' , d = None , ** kwd_args ):
def max_ratio(stacked_distances):
    """
    Return the largest core-distance to raw-distance ratio over all pairs.

    Parameters
    ----------
    stacked_distances : array (n_rows, n_cols, n_layers)
        Stacked distance matrices. Layer 0 is read as the raw pairwise
        distance and layer 1 as the core distance for the same pair; any
        further layers are ignored.

    Returns
    -------
    ratio : float or int
        The maximum of ``layer1 / layer0`` taken over every entry whose raw
        distance is non-zero. Returns 0 when there is no such entry (empty
        input, or all raw distances are zero) or when no ratio is positive.
    """
    raw = stacked_distances[:, :, 0]
    core = stacked_distances[:, :, 1]

    # Entries with a zero raw distance (e.g. the diagonal of a pairwise
    # distance matrix) are excluded so that we never divide by zero.
    usable = raw != 0
    if not usable.any():
        return 0

    largest = (core[usable] / raw[usable]).max()

    # The running maximum conceptually starts at 0, so ratios that are not
    # strictly positive (or are NaN) never replace it.
    return largest if largest > 0 else 0
50+
51+
52+ def distances_between_points (X , labels , cluster_id ,
53+ metric = 'euclidean' , d = None , no_coredist = False ,
54+ print_max_raw_to_coredist_ratio = False , ** kwd_args ):
4155 """
42- Compute the all-points-mutual-reachability distances for all the points of
43- a cluster.
56+ Compute pairwise distances for all the points of a cluster.
4457
4558 If metric is 'precomputed' then assume X is a distance matrix for the full
4659 dataset. Note that in this case you must pass in 'd' the dimension of the
@@ -58,9 +71,7 @@ def all_points_mutual_reachability(X, labels, cluster_id,
5871 cluster label to each data point, with -1 for noise points.
5972
6073 cluster_id : integer
61- The cluster label for which to compute the all-points
62- mutual-reachability (which should be done on a cluster
63- by cluster basis).
74+ The cluster label for which to compute the distances
6475
6576 metric : string
6677 The metric used to compute distances for the clustering (and
@@ -80,9 +91,8 @@ def all_points_mutual_reachability(X, labels, cluster_id,
8091 Returns
8192 -------
8293
83- mutual_reachaibility : array (n_samples, n_samples)
84- The pairwise mutual reachability distances between all points in `X`
85- with `label` equal to `cluster_id`.
94+ distances : array (n_samples, n_samples)
95+ The distances between all points in `X` with `label` equal to `cluster_id`.
8696
8797 core_distances : array (n_samples,)
8898 The all-points-core_distance of all points in `X` with `label` equal
@@ -104,13 +114,19 @@ def all_points_mutual_reachability(X, labels, cluster_id,
104114 ** kwd_args )
105115 d = X .shape [1 ]
106116
107- core_distances = all_points_core_distance (distance_matrix .copy (), d = d )
108- core_dist_matrix = np .tile (core_distances , (core_distances .shape [0 ], 1 ))
117+ if no_coredist :
118+ return distance_matrix , None
119+
120+ else :
121+ core_distances = all_points_core_distance (distance_matrix .copy (), d = d )
122+ core_dist_matrix = np .tile (core_distances , (core_distances .shape [0 ], 1 ))
123+ stacked_distances = np .dstack (
124+ [distance_matrix , core_dist_matrix , core_dist_matrix .T ])
109125
110- result = np . dstack (
111- [ distance_matrix , core_dist_matrix , core_dist_matrix . T ]). max ( axis = - 1 )
126+ if print_max_raw_to_coredist_ratio :
127+ print ( "Max raw distance to coredistance ratio: " + str ( max_ratio ( stacked_distances )) )
112128
113- return result , core_distances
129+ return stacked_distances . max ( axis = - 1 ) , core_distances
114130
115131
116132def internal_minimum_spanning_tree (mr_distances ):
@@ -181,11 +197,10 @@ def internal_minimum_spanning_tree(mr_distances):
181197def density_separation (X , labels , cluster_id1 , cluster_id2 ,
182198 internal_nodes1 , internal_nodes2 ,
183199 core_distances1 , core_distances2 ,
184- metric = 'euclidean' , ** kwd_args ):
200+ metric = 'euclidean' , no_coredist = False , ** kwd_args ):
185201 """
186202 Compute the density separation between two clusters. This is the minimum
187- all-points mutual reachability distance between pairs of points, one from
188- internal nodes of MSTs of each cluster.
203+ distance between pairs of points, one from internal nodes of MSTs of each cluster.
189204
190205 Parameters
191206 ----------
@@ -246,20 +261,24 @@ def density_separation(X, labels, cluster_id1, cluster_id2,
246261 cluster2 = X [labels == cluster_id2 ][internal_nodes2 ]
247262 distance_matrix = cdist (cluster1 , cluster2 , metric , ** kwd_args )
248263
249- core_dist_matrix1 = np .tile (core_distances1 [internal_nodes1 ],
250- (distance_matrix .shape [1 ], 1 )).T
251- core_dist_matrix2 = np .tile (core_distances2 [internal_nodes2 ],
252- (distance_matrix .shape [0 ], 1 ))
264+ if no_coredist :
265+ return distance_matrix .min ()
253266
254- mr_dist_matrix = np .dstack ([distance_matrix ,
255- core_dist_matrix1 ,
256- core_dist_matrix2 ]).max (axis = - 1 )
267+ else :
268+ core_dist_matrix1 = np .tile (core_distances1 [internal_nodes1 ],
269+ (distance_matrix .shape [1 ], 1 )).T
270+ core_dist_matrix2 = np .tile (core_distances2 [internal_nodes2 ],
271+ (distance_matrix .shape [0 ], 1 ))
272+
273+ mr_dist_matrix = np .dstack ([distance_matrix ,
274+ core_dist_matrix1 ,
275+ core_dist_matrix2 ]).max (axis = - 1 )
257276
258- return mr_dist_matrix .min ()
277+ return mr_dist_matrix .min ()
259278
260279
261280def validity_index (X , labels , metric = 'euclidean' ,
262- d = None , per_cluster_scores = False , ** kwd_args ):
281+ d = None , per_cluster_scores = False , mst_raw_dist = False , verbose = False , ** kwd_args ):
263282 """
264283 Compute the density based cluster validity index for the
265284 clustering specified by `labels` and for each cluster in `labels`.
@@ -291,6 +310,11 @@ def validity_index(X, labels, metric='euclidean',
291310 Defaults to False with the function returning a single float
292311 value for the whole clustering.
293312
313+ mst_raw_dist : optional, boolean (default False)
314+ If True, the MSTs are constructed solely from 'raw' distances (as defined by the given metric, e.g. Euclidean distances)
315+ instead of using mutual reachability distances. Thus setting this parameter to True avoids using 'all-points-core-distances' at all.
316+ This is advantageous specifically in the case of elongated clusters that lie in close proximity to each other <citation needed>.
317+
294318 **kwd_args :
295319 Extra arguments to pass to the distance computation for other
295319 metrics, such as minkowski, Mahalanobis etc.
@@ -327,18 +351,20 @@ def validity_index(X, labels, metric='euclidean',
327351 if np .sum (labels == cluster_id ) == 0 :
328352 continue
329353
330- mr_distances , core_distances [
331- cluster_id ] = all_points_mutual_reachability (
354+ distances_for_mst , core_distances [
355+ cluster_id ] = distances_between_points (
332356 X ,
333357 labels ,
334358 cluster_id ,
335359 metric ,
336360 d ,
361+ no_coredist = mst_raw_dist ,
362+ print_max_raw_to_coredist_ratio = verbose ,
337363 ** kwd_args
338364 )
339365
340366 mst_nodes [cluster_id ], mst_edges [cluster_id ] = \
341- internal_minimum_spanning_tree (mr_distances )
367+ internal_minimum_spanning_tree (distances_for_mst )
342368 density_sparseness [cluster_id ] = mst_edges [cluster_id ].T [2 ].max ()
343369
344370 for i in range (max_cluster_id ):
@@ -357,7 +383,8 @@ def validity_index(X, labels, metric='euclidean',
357383 X , labels , i , j ,
358384 internal_nodes_i , internal_nodes_j ,
359385 core_distances [i ], core_distances [j ],
360- metric = metric , ** kwd_args
386+ metric = metric , no_coredist = mst_raw_dist ,
387+ ** kwd_args
361388 )
362389 density_sep [j , i ] = density_sep [i , j ]
363390
@@ -374,6 +401,11 @@ def validity_index(X, labels, metric='euclidean',
374401 (min_density_sep - density_sparseness [i ]) /
375402 max (min_density_sep , density_sparseness [i ])
376403 )
404+
405+ if verbose :
406+ print ("Minimum density separation: " + str (min_density_sep ))
407+ print ("Density sparseness: " + str (density_sparseness [i ]))
408+
377409 cluster_size = np .sum (labels == i )
378410 result += (cluster_size / n_samples ) * cluster_validity_indices [i ]
379411
0 commit comments