Skip to content

Commit eaf96f0

Browse files
committed
cluster/hierarchical: Use scipy implementation of optimal_leaf_ordering
1 parent ccee167 commit eaf96f0

File tree

2 files changed

+93
-204
lines changed

2 files changed

+93
-204
lines changed

Orange/clustering/hierarchical.py

Lines changed: 42 additions & 204 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1-
from collections import namedtuple, deque
1+
import warnings
2+
3+
from collections import namedtuple, deque, defaultdict
24
from operator import attrgetter
3-
from itertools import chain, count
5+
from itertools import count
46
from distutils.version import LooseVersion as _LooseVersion
57

68
import heapq
@@ -13,6 +15,8 @@
1315

1416
__all__ = ['HierarchicalClustering']
1517

18+
_undef = object() # 'no value' sentinel
19+
1620
SINGLE = "single"
1721
AVERAGE = "average"
1822
COMPLETE = "complete"
@@ -274,6 +278,28 @@ def tree_from_linkage(linkage):
274278
return T[root]
275279

276280

281+
def linkage_from_tree(tree: Tree) -> numpy.ndarray:
282+
leafs = [n for n in preorder(tree) if n.is_leaf]
283+
284+
Z = numpy.zeros((len(leafs) - 1, 4), float)
285+
i = 0
286+
node_to_i = defaultdict(count(len(leafs)).__next__)
287+
for node in postorder(tree):
288+
if node.is_leaf:
289+
node_to_i[node] = node.value.index
290+
else:
291+
assert len(node.branches) == 2
292+
assert node.left in node_to_i
293+
assert node.right in node_to_i
294+
Z[i] = [node_to_i[node.left], node_to_i[node.right],
295+
node.value.height, 0]
296+
_ni = node_to_i[node]
297+
assert _ni == Z.shape[0] + i + 1
298+
i += 1
299+
assert i == Z.shape[0]
300+
return Z
301+
302+
277303
def postorder(tree, branches=attrgetter("branches")):
278304
stack = deque([tree])
279305
visited = set()
@@ -406,218 +432,30 @@ def item(node):
406432
return [n for _, n in heap]
407433

408434

409-
def optimal_leaf_ordering(tree, distances, progress_callback=None):
435+
def optimal_leaf_ordering(
436+
tree: Tree, distances: numpy.ndarray, progress_callback=_undef
437+
) -> Tree:
410438
"""
411439
Order the leaves in the clustering tree.
412440
413-
(based on Ziv Bar-Joseph et al. Fast optimal leaf ordering for
414-
hierarchical clustering)
415-
416441
:param Tree tree:
417442
Binary hierarchical clustering tree.
418443
:param numpy.ndarray distances:
419444
A (N, N) numpy.ndarray of distances that were used to compute
420445
the clustering.
421-
:param function progress_callback:
422-
Function used to report on progress.
423446
447+
.. seealso:: scipy.cluster.hierarchy.optimal_leaf_ordering
424448
"""
425-
distances = numpy.asarray(distances)
426-
M = numpy.zeros_like(distances)
427-
428-
# rearrange distances by order defined by tree's leaves
429-
indices = numpy.array([leaf.value.index for leaf in leaves(tree)])
430-
distances = distances[indices[numpy.newaxis, :],
431-
indices[:, numpy.newaxis]]
432-
distances = numpy.ascontiguousarray(distances)
433-
434-
# This is the 'fast' early termination search described in the paper
435-
# (it is slower in the pure python implementation)
436-
def argmin_ordered_xpypZ(x, y, Z, sorter_x=None, sorter_y=None):
437-
C = numpy.min(Z)
438-
if sorter_x is None:
439-
sorter_x = range(len(x))
440-
ordered_x = x
441-
else:
442-
ordered_x = x[sorter_x]
443-
if sorter_y is None:
444-
sorter_y = range(len(y))
445-
ordered_y = y
446-
else:
447-
ordered_y = y[sorter_y]
448-
449-
y0 = ordered_y[0]
450-
451-
best_val = numpy.inf
452-
best_i, best_j = 0, 0
453-
454-
y0pC = y0 + C
455-
for i, x in zip(sorter_x, ordered_x):
456-
if x + y0pC >= best_val:
457-
break
458-
xpC = x + C
459-
for j, y in zip(sorter_y, ordered_y):
460-
if xpC + y >= best_val:
461-
break
462-
val = x + y + Z[i, j]
463-
if val < best_val:
464-
best_val, best_i, best_j = val, i, j
465-
return best_i, best_j
466-
467-
def argmin_xpypZ(x, y, Z):
468-
_, h = Z.shape
469-
A = Z + numpy.reshape(x, (-1, 1))
470-
A += numpy.reshape(y, (1, -1))
471-
i = numpy.argmin(A)
472-
return i // h, i % h
473-
474-
def optimal_ordering(tree, M, ordering):
475-
if tree.is_leaf:
476-
M[tree.value.first, tree.value.first] = 0.0
477-
else:
478-
left, right = tree.left, tree.right
479-
if not left.is_leaf:
480-
V_ll = range(*left.left.value.range)
481-
V_lr = range(*left.right.value.range)
482-
u_iter = chain(((u, V_lr) for u in V_ll),
483-
((u, V_ll) for u in V_lr))
484-
else:
485-
V_lr = range(*left.value.range)
486-
u_iter = ((u, V_lr) for u in V_lr)
487-
488-
u_iter = list(u_iter)
489-
assert [u for u, _ in u_iter] == list(range(*left.value.range))
490-
491-
if not right.is_leaf:
492-
V_rl = range(*right.left.value.range)
493-
V_rr = range(*right.right.value.range)
494-
w_iter = chain(((w, V_rr) for w in V_rl),
495-
((w, V_rl) for w in V_rr))
496-
else:
497-
V_rr = range(*right.value.range)
498-
w_iter = ((w, V_rr) for w in V_rr)
499-
500-
w_iter = list(w_iter)
501-
assert [w for w, _ in w_iter] == list(range(*right.value.range))
502-
503-
for u, left_inner in u_iter:
504-
left_inner_slice = slice(left_inner.start, left_inner.stop)
505-
M_left_inner = M[u, left_inner_slice]
506-
# left_inner_sort = numpy.argsort(M_left_inner)
507-
for w, right_inner in w_iter:
508-
right_inner_slice = slice(right_inner.start,
509-
right_inner.stop)
510-
M_right_inner = M[w, right_inner_slice]
511-
# right_inner_sort = numpy.argsort(M_right_inner)
512-
# i, j = argmin_ordered_xpypZ(
513-
# M_left_inner, M_right_inner,
514-
# distances[left_inner_slice, right_inner_slice],
515-
# sorter_x=left_inner_sort, sorter_y=right_inner_sort,
516-
# )
517-
i, j = argmin_xpypZ(
518-
M_left_inner, M_right_inner,
519-
distances[left_inner_slice, right_inner_slice]
520-
)
521-
m, k = left_inner.start + i, right_inner.start + j
522-
score = M[u, m] + M[k, w] + distances[m, k]
523-
M[u, w] = M[w, u] = score
524-
ordering[u, w] = (m, k)
525-
526-
return M, ordering
527-
528-
subtrees = list(postorder(tree))
529-
ordering_dtype = numpy.dtype(
530-
[("m", numpy.uint32),
531-
("k", numpy.uint32)])
532-
533-
ordering = numpy.empty(distances.shape, dtype=ordering_dtype)
534-
535-
for i, subtree in enumerate(subtrees):
536-
M, ordering = optimal_ordering(subtree, M, ordering)
537-
538-
if progress_callback:
539-
progress_callback(100.0 * i / len(subtrees))
540-
541-
def min_uw(tree, u=None, w=None):
542-
if tree.is_leaf:
543-
return 0.0
544-
else:
545-
if u is None:
546-
U = slice(*tree.left.value.range)
547-
else:
548-
U = slice(u, u + 1)
549-
if w is None:
550-
W = slice(*tree.right.value.range)
551-
else:
552-
W = slice(w, w + 1)
553-
554-
M_ = M[U, W]
555-
_, w = M_.shape
556-
i = numpy.argmin(M_.ravel())
557-
i, j = i // w, i % w
558-
return U.start + i, W.start + j
559-
560-
def optimal_swap(root, M):
561-
opt_uw = {root: min_uw(root)}
562-
# run down the tree applying u, w constraints from parents.
563-
for tree in preorder(root):
564-
if tree.is_leaf:
565-
pass
566-
else:
567-
u, w = opt_uw[tree]
568-
assert u in range(*tree.value.range)
569-
assert w in range(*tree.value.range)
570-
if u < w:
571-
m, k = ordering[u, w]
572-
573-
opt_uw[tree.left] = (u, m)
574-
opt_uw[tree.right] = (k, w)
575-
else:
576-
k, m = ordering[w, u]
577-
opt_uw[tree.right] = (u, m)
578-
579-
opt_uw[tree.left] = (k, w)
580-
581-
def is_swapped(tree):
582-
"Is `tree` swapped based on computed optimal ordering"
583-
if tree.is_leaf:
584-
return False
585-
else:
586-
u, w = opt_uw[tree]
587-
return u > w
588-
589-
def swaped_branches(tree):
590-
"Return branches from `tree` in optimal order"
591-
if tree.is_leaf:
592-
return ()
593-
elif is_swapped(tree):
594-
return tree.branches
595-
else:
596-
return tuple(reversed(tree.branches))
597-
598-
# Create a new tree structure with optimally swapped branches.
599-
T = {}
600-
counter = count(0)
601-
for tree in postorder(root, branches=swaped_branches):
602-
if tree.is_leaf:
603-
# we need to 're-enumerate' the leaves
604-
i = next(counter)
605-
T[tree] = Tree(tree.value._replace(range=(i, i + 1)), ())
606-
else:
607-
left, right = T[tree.left], T[tree.right]
608-
609-
if left.value.first > right.value.first:
610-
right, left = left, right
611-
612-
assert left.value.first < right.value.last
613-
assert left.value.last == right.value.first
614-
615-
T[tree] = Tree(tree.value._replace(range=(left.value.first,
616-
right.value.last)),
617-
(left, right))
618-
return T[root]
619-
620-
return optimal_swap(tree, M)
449+
if progress_callback is not _undef:
450+
warnings.warn(
451+
"'progress_callback' parameter is deprecated and ignored. "
452+
"Passing it will raise an error in the future.",
453+
FutureWarning, stacklevel=2
454+
)
455+
Z = linkage_from_tree(tree)
456+
y = condensedform(numpy.asarray(distances))
457+
Zopt = scipy.cluster.hierarchy.optimal_leaf_ordering(Z, y)
458+
return tree_from_linkage(Zopt)
621459

622460

623461
class HierarchicalClustering:

Orange/tests/test_clustering_hierarchical.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Test methods with long descriptive names can omit docstrings
22
# pylint: disable=missing-docstring
3+
from numpy.testing import assert_array_equal
34

45
import unittest
56
from itertools import chain, tee
@@ -143,6 +144,56 @@ def test_invalid_linkage(self):
143144
with self.assertRaises(ValueError):
144145
hierarchical.tree_from_linkage(link)
145146

147+
def test_linkage_from_tree(self):
148+
T = hierarchical.Tree
149+
C = hierarchical.ClusterData
150+
S = hierarchical.SingletonData
151+
152+
def t(h: float, left: T, right: T):
153+
return T(C((left.value.first, right.value.last), h), (left, right))
154+
155+
def leaf(r, index):
156+
return T(S((r, r + 1), 0.0, index))
157+
158+
assert_array_equal(
159+
hierarchical.linkage_from_tree(leaf(0, 0)),
160+
numpy.empty((0, 4))
161+
)
162+
assert_array_equal(
163+
hierarchical.linkage_from_tree(t(1.0, leaf(0, 0), leaf(1, 1))),
164+
numpy.array([
165+
[0, 1, 1.0, 0.0]
166+
])
167+
)
168+
169+
tree = t(1.0, t(0.2, leaf(0, 0), leaf(1, 1)), leaf(2, 2))
170+
Z = hierarchical.linkage_from_tree(tree)
171+
assert_array_equal(
172+
Z,
173+
numpy.array([
174+
[0, 1, 0.2, 0.0],
175+
[3, 2, 1.0, 0.0]
176+
])
177+
)
178+
self.assertEqual(tree, hierarchical.tree_from_linkage(Z))
179+
180+
tree = t(
181+
2.0,
182+
left=t(0.5, leaf(0, 1), leaf(1, 2)),
183+
right=t(1.0, leaf(2, 0), leaf(3, 3),)
184+
)
185+
186+
Z = hierarchical.linkage_from_tree(tree)
187+
assert_array_equal(
188+
Z,
189+
numpy.array([
190+
[1, 2, 0.5, 0.0],
191+
[0, 3, 1.0, 0.0],
192+
[4, 5, 2.0, 0.0]
193+
])
194+
)
195+
self.assertEqual(tree, hierarchical.tree_from_linkage(Z))
196+
146197

147198
class TestTree(unittest.TestCase):
148199
def test_tree(self):

0 commit comments

Comments
 (0)