Skip to content

Commit cc67a61

Browse files
committed
Allow capping out of overplotting to speed up plots of condensed trees.
1 parent 2ab257a commit cc67a61

File tree

1 file changed

+123
-22
lines changed

1 file changed

+123
-22
lines changed

hdbscan/plots.py

Lines changed: 123 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,10 @@ def __init__(self, condensed_tree_array, cluster_selection_method='eom'):
5353
self._raw_tree = condensed_tree_array
5454
self.cluster_selection_method = cluster_selection_method
5555

56-
def get_plot_data(self, leaf_separation=1, log_size=False):
56+
def get_plot_data(self,
57+
leaf_separation=1,
58+
log_size=False,
59+
max_rectangle_per_icicle=20):
5760
"""Generates data for use in plotting the 'icicle plot' or dendrogram
5861
plot of the condensed tree generated by HDBSCAN.
5962
@@ -68,6 +71,12 @@ def get_plot_data(self, leaf_separation=1, log_size=False):
6871
points in the cluster at a given lambda value).
6972
(default False)
7073
74+
max_rectangle_per_icicle : int, optional
75+
To simplify the plot this method will only emit
76+
``max_rectangle_per_icicle`` bars per branch of the dendrogram.
77+
This ensures that we don't suffer from massive overplotting in
78+
cases with a lot of data points.
79+
7180
Returns
7281
-------
7382
plot_data : dict
@@ -127,22 +136,39 @@ def get_plot_data(self, leaf_separation=1, log_size=False):
127136
c_children = self._raw_tree[self._raw_tree['parent'] == c]
128137
current_size = np.sum(c_children['child_size'])
129138
current_lambda = cluster_y_coords[c]
139+
cluster_max_size = current_size
140+
cluster_max_lambda = c_children['lambda_val'].max()
141+
cluster_min_size = np.sum(
142+
c_children[c_children['lambda_val'] ==
143+
cluster_max_lambda]['child_size'])
130144

131145
if log_size:
132146
current_size = np.log(current_size)
147+
cluster_max_size = np.log(cluster_max_size)
148+
cluster_min_size = np.log(cluster_min_size)
149+
150+
total_size_change = float(cluster_max_size - cluster_min_size)
151+
step_size_change = total_size_change / max_rectangle_per_icicle
133152

134153
cluster_bounds[c][CB_LEFT] = cluster_x_coords[c] * scaling - (current_size / 2.0)
135154
cluster_bounds[c][CB_RIGHT] = cluster_x_coords[c] * scaling + (current_size / 2.0)
136155
cluster_bounds[c][CB_BOTTOM] = cluster_y_coords[c]
137156
cluster_bounds[c][CB_TOP] = np.max(c_children['lambda_val'])
138157

158+
last_step_size = current_size
159+
last_step_lambda = current_lambda
160+
139161
for i in np.argsort(c_children['lambda_val']):
140162
row = c_children[i]
141-
if row['lambda_val'] != current_lambda:
163+
if row['lambda_val'] != current_lambda and \
164+
(last_step_size - current_size > step_size_change
165+
or row['lambda_val'] == cluster_max_lambda):
142166
bar_centers.append(cluster_x_coords[c] * scaling)
143-
bar_tops.append(row['lambda_val'] - current_lambda)
144-
bar_bottoms.append(current_lambda)
145-
bar_widths.append(current_size)
167+
bar_tops.append(row['lambda_val'] - last_step_lambda)
168+
bar_bottoms.append(last_step_lambda)
169+
bar_widths.append(last_step_size)
170+
last_step_size = current_size
171+
last_step_lambda = current_lambda
146172
if log_size:
147173
exp_size = np.exp(current_size) - row['child_size']
148174
# Ensure we don't try to take log of zero
@@ -184,6 +210,72 @@ def get_plot_data(self, leaf_separation=1, log_size=False):
184210
'cluster_bounds': cluster_bounds
185211
}
186212

213+
def get_simple_plot_data(self, leaf_separation=1, log_size=False,
214+
max_rectangle_per_icicle=20):
215+
"""Generates simplified data for use in plotting the 'icicle plot' or
216+
dendrogram plot of the condensed tree generated by HDBSCAN.
217+
218+
Parameters
219+
----------
220+
leaf_separation : float, optional
221+
How far apart to space the final leaves of the
222+
dendrogram. (default 1)
223+
224+
log_size : boolean, optional
225+
Use log scale for the 'size' of clusters (i.e. number of
226+
points in the cluster at a given lambda value).
227+
(default False)
228+
229+
max_rectangle_per_icicle : int, optional
230+
To simplify the plot this method will only emit
231+
``max_rectangle_per_icicle`` bars per branch of the dendrogram.
232+
This ensures that we don't suffer from massive overplotting in
233+
cases with a lot of data points.
234+
235+
Returns
236+
-------
237+
plot_data : dict
238+
Data associated to bars in a bar plot:
239+
`bar_centers` x coordinate centers for bars
240+
`bar_tops` heights of bars in lambda scale
241+
`bar_bottoms` y coordinate of bottoms of bars
242+
`bar_widths` widths of the bars (in x coord scale)
243+
`bar_bounds` a 4-tuple of [left, right, bottom, top]
244+
giving the bounds on a full set of
245+
cluster bars
246+
Data associated with cluster splits:
247+
`line_xs` x coordinates for horizontal dendrogram lines
248+
`line_ys` y coordinates for horizontal dendrogram lines
249+
"""
250+
leaves = _get_leaves(self._raw_tree)
251+
last_leaf = self._raw_tree['parent'].max()
252+
root = self._raw_tree['parent'].min()
253+
254+
# We want to get the x and y coordinates for the start of each cluster
255+
# Initialize the leaves, since we know where they go, then iterate
256+
# through everything from the leaves back, setting coords as we go
257+
cluster_x_coords = dict(zip(leaves, [leaf_separation * x
258+
for x in range(len(leaves))]))
259+
cluster_y_coords = {root: 0.0}
260+
261+
# We want to get the x and y coordinates for the start of each cluster
262+
# Initialize the leaves, since we know where they go, then iterate
263+
# through everything from the leaves back, setting coords as we go
264+
cluster_x_coords = dict(zip(leaves, [leaf_separation * x
265+
for x in range(len(leaves))]))
266+
cluster_y_coords = {root: 0.0}
267+
268+
for cluster in range(last_leaf, root - 1, -1):
269+
split = self._raw_tree[['child', 'lambda_val']]
270+
split = split[(self._raw_tree['parent'] == cluster) &
271+
(self._raw_tree['child_size'] > 1)]
272+
if len(split['child']) > 1:
273+
left_child, right_child = split['child']
274+
cluster_x_coords[cluster] = np.mean([cluster_x_coords[left_child],
275+
cluster_x_coords[right_child]])
276+
cluster_y_coords[left_child] = split['lambda_val'][0]
277+
cluster_y_coords[right_child] = split['lambda_val'][1]
278+
187279
def _select_clusters(self):
188280
if self.cluster_selection_method == 'eom':
189281
stability = compute_stability(self._raw_tree)
@@ -213,7 +305,8 @@ def _select_clusters(self):
213305

214306
def plot(self, leaf_separation=1, cmap='viridis', select_clusters=False,
215307
label_clusters=False, selection_palette=None,
216-
axis=None, colorbar=True, log_size=False):
308+
axis=None, colorbar=True, log_size=False,
309+
max_rectangles_per_icicle=20):
217310
"""Use matplotlib to plot an 'icicle plot' dendrogram of the condensed tree.
218311
219312
Effectively this is a dendrogram where the width of each cluster bar is
@@ -224,45 +317,51 @@ def plot(self, leaf_separation=1, cmap='viridis', select_clusters=False,
224317
225318
Parameters
226319
----------
227-
leaf_separation : float, optional
320+
leaf_separation : float, optional (default 1)
228321
How far apart to space the final leaves of the
229-
dendrogram. (default 1)
322+
dendrogram.
230323
231-
cmap : string or matplotlib colormap, optional
324+
cmap : string or matplotlib colormap, optional (default viridis)
232325
The matplotlib colormap to use to color the cluster bars.
233-
(default viridis)
234326
235-
select_clusters : boolean, optional
327+
328+
select_clusters : boolean, optional (default False)
236329
Whether to draw ovals highlighting which cluster
237330
bars represent the clusters that were selected by
238-
HDBSCAN as the final clusters. (default False)
331+
HDBSCAN as the final clusters.
239332
240-
label_clusters : boolean, optional
333+
label_clusters : boolean, optional (default False)
241334
If select_clusters is True then this determines
242335
whether to draw text labels on the clusters.
243336
244-
selection_palette : list of colors, optional
337+
selection_palette : list of colors, optional (default None)
245338
If not None, and at least as long as
246339
the number of clusters, draw ovals
247340
in colors iterating through this palette.
248341
This can aid in cluster identification
249342
when plotting.
250343
251-
axis : matplotlib axis or None, optional
344+
axis : matplotlib axis or None, optional (default None)
252345
The matplotlib axis to render to. If None then a new axis
253346
will be generated. The rendered axis will be returned.
254-
(default None)
255347
256-
colorbar : boolean, optional
348+
349+
colorbar : boolean, optional (default True)
257350
Whether to draw a matplotlib colorbar displaying the range
258-
of cluster sizes as per the colormap. (default True)
351+
of cluster sizes as per the colormap.
259352
260-
log_size : boolean, optional
353+
log_size : boolean, optional (default False)
261354
Use log scale for the 'size' of clusters (i.e. number of
262355
points in the cluster at a given lambda value).
263-
(default False)
356+
264357
265-
Returns
358+
max_rectangles_per_icicle : int, optional (default 20)
359+
To simplify the plot this method will only emit
360+
``max_rectangles_per_icicle`` bars per branch of the dendrogram.
361+
This ensures that we don't suffer from massive overplotting in
362+
cases with a lot of data points.
363+
364+
Returns
266365
-------
267366
axis : matplotlib axis
268367
The axis on which the 'icicle plot' has been rendered.
@@ -274,7 +373,9 @@ def plot(self, leaf_separation=1, cmap='viridis', select_clusters=False,
274373
'You must install the matplotlib library to plot the condensed tree.'
275374
'Use get_plot_data to calculate the relevant data without plotting.')
276375

277-
plot_data = self.get_plot_data(leaf_separation=leaf_separation, log_size=log_size)
376+
plot_data = self.get_plot_data(leaf_separation=leaf_separation,
377+
log_size=log_size,
378+
max_rectangle_per_icicle=max_rectangles_per_icicle)
278379

279380
if cmap != 'none':
280381
sm = plt.cm.ScalarMappable(cmap=cmap,

0 commit comments

Comments
 (0)