1+ from ._cluster_node import ClusterNode
2+ from collections import deque
13import heapq
24from numbers import Integral
35import numpy as np
@@ -58,22 +60,25 @@ def fit(self, X, y):
5860 order = "C" ,
5961 )
6062 n_samples , _ = X .shape
61- # We start with all samples in a single cluster
63+ # We start with all samples being in a single cluster
6264 self .n_clusters_ = 1
6365 # We assign all samples a label of zero
6466 labels = np .zeros (n_samples , dtype = np .uint32 )
65- clusters = []
67+ leaves = []
6668 scores = []
6769 label = 0
70+ root = ClusterNode (label )
71+ self .cluster_tree_ = root
6872 # The entire dataset has a discrimination score of zero
6973 score = 0
70- heap = [(None , label , score )]
74+ heap = [(None , root , score )]
7175 for _ in range (self .bahc_max_iter ):
7276 if not heap :
7377 # If the heap is empty we stop iterating
7478 break
7579 # Take the cluster with the highest standard deviation of metric y
76- _ , label , score = heapq .heappop (heap )
80+ _ , node , score = heapq .heappop (heap )
81+ label = node .label
7782 cluster_indices = np .nonzero (labels == label )[0 ]
7883 cluster = X [cluster_indices ]
7984
@@ -90,39 +95,82 @@ def fit(self, X, y):
9095 and len (indices1 ) >= self .bahc_min_cluster_size
9196 ):
9297 # We calculate the discrimination scores using formula (1) in [1]
98+ # TODO: Move y[indices0] and y[indices1] into separate variables
99+ # to avoid recomputing them
100+ # Maybe create a function to compute the score
93101 mask0 = np .ones (n_samples , dtype = bool )
94102 mask0 [indices0 ] = False
95103 score0 = np .mean (y [mask0 ]) - np .mean (y [indices0 ])
96104 mask1 = np .ones (n_samples , dtype = bool )
97105 mask1 [indices1 ] = False
98106 score1 = np .mean (y [mask1 ]) - np .mean (y [indices1 ])
99107 if max (score0 , score1 ) >= score :
108+ std0 = np .std (y [indices0 ])
109+ node0 = ClusterNode (label )
100110 # heapq implements min-heap
101111 # so we have to negate std before pushing
102- std0 = np .std (y [indices0 ])
103- heapq .heappush (heap , (- std0 , label , score0 ))
112+ heapq .heappush (heap , (- std0 , node0 , score0 ))
104113 std1 = np .std (y [indices1 ])
105- heapq .heappush (heap , (- std1 , self .n_clusters_ , score1 ))
114+ node1 = ClusterNode (self .n_clusters_ )
115+ heapq .heappush (heap , (- std1 , node1 , score1 ))
106116 labels [indices1 ] = self .n_clusters_
117+ # TODO: Increase n_clusters_ by clustering_model.n_clusters_ - 1
107118 self .n_clusters_ += 1
119+ children = [node0 , node1 ]
120+ node .split (clustering_model , children )
108121 else :
109- clusters .append (label )
122+ leaves .append (node )
110123 scores .append (score )
111124 else :
112- clusters .append (label )
125+ leaves .append (node )
113126 scores .append (score )
114127 if heap :
115- clusters = np .concatenate ([clusters , [label for _ , label , _ in heap ]])
128+ # TODO: Check if this can be made more efficient
129+ leaves .extend ((node for _ , node , _ in heap ))
116130 scores = np .concatenate ([scores , [score for _ , _ , score in heap ]])
117131 else :
118- clusters = np .array (clusters )
119132 scores = np .array (scores )
120133
121134 # We sort clusters by decreasing scores
122- indices = np .argsort (- scores )
123- clusters = clusters [indices ]
124- self .scores_ = scores [indices ]
125- mapping = np .zeros (self .n_clusters_ , dtype = np .uint32 )
126- mapping [clusters ] = np .arange (self .n_clusters_ , dtype = np .uint32 )
127- self .labels_ = mapping [labels ]
135+ sorted_indices = np .argsort (- scores )
136+ self .scores_ = scores [sorted_indices ]
137+ leaf_labels = np .array ([leaf .label for leaf in leaves ])
138+ leaf_labels = leaf_labels [sorted_indices ]
139+ label_mapping = np .zeros (self .n_clusters_ , dtype = np .uint32 )
140+ label_mapping [leaf_labels ] = np .arange (self .n_clusters_ , dtype = np .uint32 )
141+ self .labels_ = label_mapping [labels ]
142+ for leaf in leaves :
143+ leaf .label = label_mapping [leaf .label ]
128144 return self
145+
def predict(self, X):
    """Predict the cluster labels for the given data.

    The fitted cluster tree is traversed breadth-first: at each internal
    node the stored clustering model routes the samples that reached that
    node to one of its children; samples arriving at a leaf receive the
    leaf's label.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Samples to assign to the clusters discovered during ``fit``.

    Returns
    -------
    labels : ndarray of shape (n_samples,), dtype uint32
        Cluster label of each sample.

    Raises
    ------
    ValueError
        If the estimator has not been fitted yet.
    """
    if not hasattr(self, "cluster_tree_"):
        raise ValueError(
            "This instance is not fitted yet; call 'fit' before 'predict'."
        )
    # Accept any array-like input, matching the documented contract.
    X = np.asarray(X)
    # TODO: Assert that X has the same number of features as the data used to fit
    # TODO: Assert that clustering_model has a predict method
    n_samples, _ = X.shape
    labels = np.zeros(n_samples, dtype=np.uint32)
    queue = deque([(self.cluster_tree_, np.arange(n_samples))])
    while queue:
        node, indices = queue.popleft()
        if node.is_leaf:
            labels[indices] = node.label
            continue
        cluster_labels = node.clustering_model.predict(X[indices])
        # Iterate over the child labels actually present in this subset.
        # Deriving a count from ``len(np.unique(...))`` and looping over
        # ``range(count)`` is wrong when the present labels are
        # non-contiguous (e.g. {0, 2}): the count is 2, so label 2 is
        # never visited and its samples silently keep label 0.
        for child_label in np.unique(cluster_labels):
            child_indices = indices[cluster_labels == child_label]
            # np.unique only yields labels that occur, so child_indices
            # is never empty here.
            queue.append((node.children[child_label], child_indices))
    return labels
0 commit comments