|
1 | | -from collections import namedtuple, deque |
| 1 | +import warnings |
| 2 | + |
| 3 | +from collections import namedtuple, deque, defaultdict |
2 | 4 | from operator import attrgetter |
3 | | -from itertools import chain, count |
| 5 | +from itertools import count |
4 | 6 | from distutils.version import LooseVersion as _LooseVersion |
5 | 7 |
|
6 | 8 | import heapq |
|
13 | 15 |
|
14 | 16 | __all__ = ['HierarchicalClustering'] |
15 | 17 |
|
| 18 | +_undef = object() # 'no value' sentinel |
| 19 | + |
16 | 20 | SINGLE = "single" |
17 | 21 | AVERAGE = "average" |
18 | 22 | COMPLETE = "complete" |
@@ -274,6 +278,28 @@ def tree_from_linkage(linkage): |
274 | 278 | return T[root] |
275 | 279 |
|
276 | 280 |
|
| 281 | +def linkage_from_tree(tree: Tree) -> numpy.ndarray: |
| 282 | + leafs = [n for n in preorder(tree) if n.is_leaf] |
| 283 | + |
| 284 | + Z = numpy.zeros((len(leafs) - 1, 4), float) |
| 285 | + i = 0 |
| 286 | + node_to_i = defaultdict(count(len(leafs)).__next__) |
| 287 | + for node in postorder(tree): |
| 288 | + if node.is_leaf: |
| 289 | + node_to_i[node] = node.value.index |
| 290 | + else: |
| 291 | + assert len(node.branches) == 2 |
| 292 | + assert node.left in node_to_i |
| 293 | + assert node.right in node_to_i |
| 294 | + Z[i] = [node_to_i[node.left], node_to_i[node.right], |
| 295 | + node.value.height, 0] |
| 296 | + _ni = node_to_i[node] |
| 297 | + assert _ni == Z.shape[0] + i + 1 |
| 298 | + i += 1 |
| 299 | + assert i == Z.shape[0] |
| 300 | + return Z |
| 301 | + |
| 302 | + |
277 | 303 | def postorder(tree, branches=attrgetter("branches")): |
278 | 304 | stack = deque([tree]) |
279 | 305 | visited = set() |
@@ -406,218 +432,30 @@ def item(node): |
406 | 432 | return [n for _, n in heap] |
407 | 433 |
|
408 | 434 |
|
409 | | -def optimal_leaf_ordering(tree, distances, progress_callback=None): |
| 435 | +def optimal_leaf_ordering( |
| 436 | + tree: Tree, distances: numpy.ndarray, progress_callback=_undef |
| 437 | +) -> Tree: |
410 | 438 | """ |
411 | 439 | Order the leaves in the clustering tree. |
412 | 440 |
|
413 | | - (based on Ziv Bar-Joseph et al. Fast optimal leaf ordering for |
414 | | - hierarchical clustering) |
415 | | -
|
416 | 441 | :param Tree tree: |
417 | 442 | Binary hierarchical clustering tree. |
418 | 443 | :param numpy.ndarray distances: |
419 | 444 | A (N, N) numpy.ndarray of distances that were used to compute |
420 | 445 | the clustering. |
421 | | - :param function progress_callback: |
422 | | - Function used to report on progress. |
423 | 446 |
|
| 447 | + .. seealso:: scipy.cluster.hierarchy.optimal_leaf_ordering |
424 | 448 | """ |
425 | | - distances = numpy.asarray(distances) |
426 | | - M = numpy.zeros_like(distances) |
427 | | - |
428 | | - # rearrange distances by order defined by tree's leaves |
429 | | - indices = numpy.array([leaf.value.index for leaf in leaves(tree)]) |
430 | | - distances = distances[indices[numpy.newaxis, :], |
431 | | - indices[:, numpy.newaxis]] |
432 | | - distances = numpy.ascontiguousarray(distances) |
433 | | - |
434 | | - # This is the 'fast' early termination search described in the paper |
435 | | - # (it is slower in the pure python implementation) |
436 | | - def argmin_ordered_xpypZ(x, y, Z, sorter_x=None, sorter_y=None): |
437 | | - C = numpy.min(Z) |
438 | | - if sorter_x is None: |
439 | | - sorter_x = range(len(x)) |
440 | | - ordered_x = x |
441 | | - else: |
442 | | - ordered_x = x[sorter_x] |
443 | | - if sorter_y is None: |
444 | | - sorter_y = range(len(y)) |
445 | | - ordered_y = y |
446 | | - else: |
447 | | - ordered_y = y[sorter_y] |
448 | | - |
449 | | - y0 = ordered_y[0] |
450 | | - |
451 | | - best_val = numpy.inf |
452 | | - best_i, best_j = 0, 0 |
453 | | - |
454 | | - y0pC = y0 + C |
455 | | - for i, x in zip(sorter_x, ordered_x): |
456 | | - if x + y0pC >= best_val: |
457 | | - break |
458 | | - xpC = x + C |
459 | | - for j, y in zip(sorter_y, ordered_y): |
460 | | - if xpC + y >= best_val: |
461 | | - break |
462 | | - val = x + y + Z[i, j] |
463 | | - if val < best_val: |
464 | | - best_val, best_i, best_j = val, i, j |
465 | | - return best_i, best_j |
466 | | - |
467 | | - def argmin_xpypZ(x, y, Z): |
468 | | - _, h = Z.shape |
469 | | - A = Z + numpy.reshape(x, (-1, 1)) |
470 | | - A += numpy.reshape(y, (1, -1)) |
471 | | - i = numpy.argmin(A) |
472 | | - return i // h, i % h |
473 | | - |
474 | | - def optimal_ordering(tree, M, ordering): |
475 | | - if tree.is_leaf: |
476 | | - M[tree.value.first, tree.value.first] = 0.0 |
477 | | - else: |
478 | | - left, right = tree.left, tree.right |
479 | | - if not left.is_leaf: |
480 | | - V_ll = range(*left.left.value.range) |
481 | | - V_lr = range(*left.right.value.range) |
482 | | - u_iter = chain(((u, V_lr) for u in V_ll), |
483 | | - ((u, V_ll) for u in V_lr)) |
484 | | - else: |
485 | | - V_lr = range(*left.value.range) |
486 | | - u_iter = ((u, V_lr) for u in V_lr) |
487 | | - |
488 | | - u_iter = list(u_iter) |
489 | | - assert [u for u, _ in u_iter] == list(range(*left.value.range)) |
490 | | - |
491 | | - if not right.is_leaf: |
492 | | - V_rl = range(*right.left.value.range) |
493 | | - V_rr = range(*right.right.value.range) |
494 | | - w_iter = chain(((w, V_rr) for w in V_rl), |
495 | | - ((w, V_rl) for w in V_rr)) |
496 | | - else: |
497 | | - V_rr = range(*right.value.range) |
498 | | - w_iter = ((w, V_rr) for w in V_rr) |
499 | | - |
500 | | - w_iter = list(w_iter) |
501 | | - assert [w for w, _ in w_iter] == list(range(*right.value.range)) |
502 | | - |
503 | | - for u, left_inner in u_iter: |
504 | | - left_inner_slice = slice(left_inner.start, left_inner.stop) |
505 | | - M_left_inner = M[u, left_inner_slice] |
506 | | -# left_inner_sort = numpy.argsort(M_left_inner) |
507 | | - for w, right_inner in w_iter: |
508 | | - right_inner_slice = slice(right_inner.start, |
509 | | - right_inner.stop) |
510 | | - M_right_inner = M[w, right_inner_slice] |
511 | | -# right_inner_sort = numpy.argsort(M_right_inner) |
512 | | -# i, j = argmin_ordered_xpypZ( |
513 | | -# M_left_inner, M_right_inner, |
514 | | -# distances[left_inner_slice, right_inner_slice], |
515 | | -# sorter_x=left_inner_sort, sorter_y=right_inner_sort, |
516 | | -# ) |
517 | | - i, j = argmin_xpypZ( |
518 | | - M_left_inner, M_right_inner, |
519 | | - distances[left_inner_slice, right_inner_slice] |
520 | | - ) |
521 | | - m, k = left_inner.start + i, right_inner.start + j |
522 | | - score = M[u, m] + M[k, w] + distances[m, k] |
523 | | - M[u, w] = M[w, u] = score |
524 | | - ordering[u, w] = (m, k) |
525 | | - |
526 | | - return M, ordering |
527 | | - |
528 | | - subtrees = list(postorder(tree)) |
529 | | - ordering_dtype = numpy.dtype( |
530 | | - [("m", numpy.uint32), |
531 | | - ("k", numpy.uint32)]) |
532 | | - |
533 | | - ordering = numpy.empty(distances.shape, dtype=ordering_dtype) |
534 | | - |
535 | | - for i, subtree in enumerate(subtrees): |
536 | | - M, ordering = optimal_ordering(subtree, M, ordering) |
537 | | - |
538 | | - if progress_callback: |
539 | | - progress_callback(100.0 * i / len(subtrees)) |
540 | | - |
541 | | - def min_uw(tree, u=None, w=None): |
542 | | - if tree.is_leaf: |
543 | | - return 0.0 |
544 | | - else: |
545 | | - if u is None: |
546 | | - U = slice(*tree.left.value.range) |
547 | | - else: |
548 | | - U = slice(u, u + 1) |
549 | | - if w is None: |
550 | | - W = slice(*tree.right.value.range) |
551 | | - else: |
552 | | - W = slice(w, w + 1) |
553 | | - |
554 | | - M_ = M[U, W] |
555 | | - _, w = M_.shape |
556 | | - i = numpy.argmin(M_.ravel()) |
557 | | - i, j = i // w, i % w |
558 | | - return U.start + i, W.start + j |
559 | | - |
560 | | - def optimal_swap(root, M): |
561 | | - opt_uw = {root: min_uw(root)} |
562 | | - # run down the tree applying u, w constraints from parents. |
563 | | - for tree in preorder(root): |
564 | | - if tree.is_leaf: |
565 | | - pass |
566 | | - else: |
567 | | - u, w = opt_uw[tree] |
568 | | - assert u in range(*tree.value.range) |
569 | | - assert w in range(*tree.value.range) |
570 | | - if u < w: |
571 | | - m, k = ordering[u, w] |
572 | | - |
573 | | - opt_uw[tree.left] = (u, m) |
574 | | - opt_uw[tree.right] = (k, w) |
575 | | - else: |
576 | | - k, m = ordering[w, u] |
577 | | - opt_uw[tree.right] = (u, m) |
578 | | - |
579 | | - opt_uw[tree.left] = (k, w) |
580 | | - |
581 | | - def is_swapped(tree): |
582 | | - "Is `tree` swapped based on computed optimal ordering" |
583 | | - if tree.is_leaf: |
584 | | - return False |
585 | | - else: |
586 | | - u, w = opt_uw[tree] |
587 | | - return u > w |
588 | | - |
589 | | - def swaped_branches(tree): |
590 | | - "Return branches from `tree` in optimal order" |
591 | | - if tree.is_leaf: |
592 | | - return () |
593 | | - elif is_swapped(tree): |
594 | | - return tree.branches |
595 | | - else: |
596 | | - return tuple(reversed(tree.branches)) |
597 | | - |
598 | | - # Create a new tree structure with optimally swapped branches. |
599 | | - T = {} |
600 | | - counter = count(0) |
601 | | - for tree in postorder(root, branches=swaped_branches): |
602 | | - if tree.is_leaf: |
603 | | - # we need to 're-enumerate' the leaves |
604 | | - i = next(counter) |
605 | | - T[tree] = Tree(tree.value._replace(range=(i, i + 1)), ()) |
606 | | - else: |
607 | | - left, right = T[tree.left], T[tree.right] |
608 | | - |
609 | | - if left.value.first > right.value.first: |
610 | | - right, left = left, right |
611 | | - |
612 | | - assert left.value.first < right.value.last |
613 | | - assert left.value.last == right.value.first |
614 | | - |
615 | | - T[tree] = Tree(tree.value._replace(range=(left.value.first, |
616 | | - right.value.last)), |
617 | | - (left, right)) |
618 | | - return T[root] |
619 | | - |
620 | | - return optimal_swap(tree, M) |
| 449 | + if progress_callback is not _undef: |
| 450 | + warnings.warn( |
| 451 | + "'progress_callback' parameter is deprecated and ignored. " |
| 452 | + "Passing it will raise an error in the future.", |
| 453 | + FutureWarning, stacklevel=2 |
| 454 | + ) |
| 455 | + Z = linkage_from_tree(tree) |
| 456 | + y = condensedform(numpy.asarray(distances)) |
| 457 | + Zopt = scipy.cluster.hierarchy.optimal_leaf_ordering(Z, y) |
| 458 | + return tree_from_linkage(Zopt) |
621 | 459 |
|
622 | 460 |
|
623 | 461 | class HierarchicalClustering: |
|
0 commit comments