@@ -53,7 +53,10 @@ def __init__(self, condensed_tree_array, cluster_selection_method='eom'):
5353 self ._raw_tree = condensed_tree_array
5454 self .cluster_selection_method = cluster_selection_method
5555
56- def get_plot_data (self , leaf_separation = 1 , log_size = False ):
56+ def get_plot_data (self ,
57+ leaf_separation = 1 ,
58+ log_size = False ,
59+ max_rectangle_per_icicle = 20 ):
5760 """Generates data for use in plotting the 'icicle plot' or dendrogram
5861 plot of the condensed tree generated by HDBSCAN.
5962
@@ -68,6 +71,12 @@ def get_plot_data(self, leaf_separation=1, log_size=False):
6871 points in the cluster at a given lambda value).
6972 (default False)
7073
74+ max_rectangle_per_icicle : int, optional
75+ To simplify the plot this method will only emit
76+ ``max_rectangle_per_icicle`` bars per branch of the dendrogram.
77+ This ensures that we don't suffer from massive overplotting in
78+ cases with a lot of data points.
79+
7180 Returns
7281 -------
7382 plot_data : dict
@@ -127,22 +136,39 @@ def get_plot_data(self, leaf_separation=1, log_size=False):
127136 c_children = self ._raw_tree [self ._raw_tree ['parent' ] == c ]
128137 current_size = np .sum (c_children ['child_size' ])
129138 current_lambda = cluster_y_coords [c ]
139+ cluster_max_size = current_size
140+ cluster_max_lambda = c_children ['lambda_val' ].max ()
141+ cluster_min_size = np .sum (
142+ c_children [c_children ['lambda_val' ] ==
143+ cluster_max_lambda ]['child_size' ])
130144
131145 if log_size :
132146 current_size = np .log (current_size )
147+ cluster_max_size = np .log (cluster_max_size )
148+ cluster_min_size = np .log (cluster_min_size )
149+
150+ total_size_change = float (cluster_max_size - cluster_min_size )
151+ step_size_change = total_size_change / max_rectangle_per_icicle
133152
134153 cluster_bounds [c ][CB_LEFT ] = cluster_x_coords [c ] * scaling - (current_size / 2.0 )
135154 cluster_bounds [c ][CB_RIGHT ] = cluster_x_coords [c ] * scaling + (current_size / 2.0 )
136155 cluster_bounds [c ][CB_BOTTOM ] = cluster_y_coords [c ]
137156 cluster_bounds [c ][CB_TOP ] = np .max (c_children ['lambda_val' ])
138157
158+ last_step_size = current_size
159+ last_step_lambda = current_lambda
160+
139161 for i in np .argsort (c_children ['lambda_val' ]):
140162 row = c_children [i ]
141- if row ['lambda_val' ] != current_lambda :
163+ if row ['lambda_val' ] != current_lambda and \
164+ (last_step_size - current_size > step_size_change
165+ or row ['lambda_val' ] == cluster_max_lambda ):
142166 bar_centers .append (cluster_x_coords [c ] * scaling )
143- bar_tops .append (row ['lambda_val' ] - current_lambda )
144- bar_bottoms .append (current_lambda )
145- bar_widths .append (current_size )
167+ bar_tops .append (row ['lambda_val' ] - last_step_lambda )
168+ bar_bottoms .append (last_step_lambda )
169+ bar_widths .append (last_step_size )
170+ last_step_size = current_size
171+ last_step_lambda = current_lambda
146172 if log_size :
147173 exp_size = np .exp (current_size ) - row ['child_size' ]
148174 # Ensure we don't try to take log of zero
@@ -184,6 +210,72 @@ def get_plot_data(self, leaf_separation=1, log_size=False):
184210 'cluster_bounds' : cluster_bounds
185211 }
186212
213+ def get_simple_plot_data (self , leaf_separation = 1 , log_size = False ,
214+ max_rectangle_per_icicle = 20 ):
215+ """Generates simplified data for use in plotting the 'icicle plot' or
216+ dendrogram plot of the condensed tree generated by HDBSCAN.
217+
218+ Parameters
219+ ----------
220+ leaf_separation : float, optional
221+ How far apart to space the final leaves of the
222+ dendrogram. (default 1)
223+
224+ log_size : boolean, optional
225+ Use log scale for the 'size' of clusters (i.e. number of
226+ points in the cluster at a given lambda value).
227+ (default False)
228+
229+ max_rectangle_per_icicle : int, optional
230+ To simplify the plot this method will only emit
231+ ``max_rectangle_per_icicle`` bars per branch of the dendrogram.
232+ This ensures that we don't suffer from massive overplotting in
233+ cases with a lot of data points.
234+
235+ Returns
236+ -------
237+ plot_data : dict
238+ Data associated to bars in a bar plot:
239+ `bar_centers` x coordinate centers for bars
240+ `bar_tops` heights of bars in lambda scale
241+ `bar_bottoms` y coordinate of bottoms of bars
242+ `bar_widths` widths of the bars (in x coord scale)
243+ `bar_bounds` a 4-tuple of [left, right, bottom, top]
244+ giving the bounds on a full set of
245+ cluster bars
246+ Data associated with cluster splits:
247+ `line_xs` x coordinates for horizontal dendrogram lines
248+ `line_ys` y coordinates for horizontal dendrogram lines
249+ """
250+ leaves = _get_leaves (self ._raw_tree )
251+ last_leaf = self ._raw_tree ['parent' ].max ()
252+ root = self ._raw_tree ['parent' ].min ()
253+
254+ # We want to get the x and y coordinates for the start of each cluster.
255+ # Initialize the leaves, since we know where they go, then iterate
256+ # through everything from the leaves back, setting coords as we go.
257+ cluster_x_coords = dict (zip (leaves , [leaf_separation * x
258+ for x in range (len (leaves ))]))
259+ cluster_y_coords = {root : 0.0 }
260+
261+ # NOTE(review): the two assignments below repeat the initialisation above
262+ # verbatim (likely a copy/paste duplicate). Also, no ``return`` follows the
263+ # loop below in this hunk, although the docstring documents a return value.
264+ cluster_x_coords = dict (zip (leaves , [leaf_separation * x
265+ for x in range (len (leaves ))]))
266+ cluster_y_coords = {root : 0.0 }
267+
268+ for cluster in range (last_leaf , root - 1 , - 1 ):
269+ split = self ._raw_tree [['child' , 'lambda_val' ]]
270+ split = split [(self ._raw_tree ['parent' ] == cluster ) &
271+ (self ._raw_tree ['child_size' ] > 1 )]
272+ if len (split ['child' ]) > 1 :
273+ left_child , right_child = split ['child' ]
274+ cluster_x_coords [cluster ] = np .mean ([cluster_x_coords [left_child ],
275+ cluster_x_coords [right_child ]])
276+ cluster_y_coords [left_child ] = split ['lambda_val' ][0 ]
277+ cluster_y_coords [right_child ] = split ['lambda_val' ][1 ]
278+
187279 def _select_clusters (self ):
188280 if self .cluster_selection_method == 'eom' :
189281 stability = compute_stability (self ._raw_tree )
@@ -213,7 +305,8 @@ def _select_clusters(self):
213305
214306 def plot (self , leaf_separation = 1 , cmap = 'viridis' , select_clusters = False ,
215307 label_clusters = False , selection_palette = None ,
216- axis = None , colorbar = True , log_size = False ):
308+ axis = None , colorbar = True , log_size = False ,
309+ max_rectangles_per_icicle = 20 ):
217310 """Use matplotlib to plot an 'icicle plot' dendrogram of the condensed tree.
218311
219312 Effectively this is a dendrogram where the width of each cluster bar is
@@ -224,45 +317,51 @@ def plot(self, leaf_separation=1, cmap='viridis', select_clusters=False,
224317
225318 Parameters
226319 ----------
227- leaf_separation : float, optional
320+ leaf_separation : float, optional (default 1)
228321 How far apart to space the final leaves of the
229- dendrogram. (default 1)
322+ dendrogram.
230323
231- cmap : string or matplotlib colormap, optional
324+ cmap : string or matplotlib colormap, optional (default viridis)
232325 The matplotlib colormap to use to color the cluster bars.
233- (default viridis)
234326
235- select_clusters : boolean, optional
327+
328+ select_clusters : boolean, optional (default False)
236329 Whether to draw ovals highlighting which cluster
237330 bar represent the clusters that were selected by
238- HDBSCAN as the final clusters. (default False)
331+ HDBSCAN as the final clusters.
239332
240- label_clusters : boolean, optional
333+ label_clusters : boolean, optional (default False)
241334 If select_clusters is True then this determines
242335 whether to draw text labels on the clusters.
243336
244- selection_palette : list of colors, optional
337+ selection_palette : list of colors, optional (default None)
245338 If not None, and at least as long as
246339 the number of clusters, draw ovals
247340 in colors iterating through this palette.
248341 This can aid in cluster identification
249342 when plotting.
250343
251- axis : matplotlib axis or None, optional
344+ axis : matplotlib axis or None, optional (default None)
252345 The matplotlib axis to render to. If None then a new axis
253346 will be generated. The rendered axis will be returned.
254- (default None)
255347
256- colorbar : boolean, optional
348+
349+ colorbar : boolean, optional (default True)
257350 Whether to draw a matplotlib colorbar displaying the range
258- of cluster sizes as per the colormap. (default True)
351+ of cluster sizes as per the colormap.
259352
260- log_size : boolean, optional
353+ log_size : boolean, optional (default False)
261354 Use log scale for the 'size' of clusters (i.e. number of
262355 points in the cluster at a given lambda value).
263- (default False)
356+
264357
265- Returns
358+ max_rectangles_per_icicle : int, optional (default 20)
359+ To simplify the plot this method will only emit
360+ ``max_rectangles_per_icicle`` bars per branch of the dendrogram.
361+ This ensures that we don't suffer from massive overplotting in
362+ cases with a lot of data points.
363+
364+ Returns
266365 -------
267366 axis : matplotlib axis
268367 The axis on which the 'icicle plot' has been rendered.
@@ -274,7 +373,9 @@ def plot(self, leaf_separation=1, cmap='viridis', select_clusters=False,
274373 'You must install the matplotlib library to plot the condensed tree.'
275374 'Use get_plot_data to calculate the relevant data without plotting.' )
276375
277- plot_data = self .get_plot_data (leaf_separation = leaf_separation , log_size = log_size )
376+ plot_data = self .get_plot_data (leaf_separation = leaf_separation ,
377+ log_size = log_size ,
378+ max_rectangle_per_icicle = max_rectangles_per_icicle )
278379
279380 if cmap != 'none' :
280381 sm = plt .cm .ScalarMappable (cmap = cmap ,
0 commit comments