Merge pull request #653 from daniel-goldstein/combinatorics-doc

mergify[bot] · web-flow · commit e5b38e4accb9 · 2020-05-29T16:37:02.000Z
Document enumerating tree topologies in combinatorics module
diff --git a/docs/_static/four_leaf_tree_shapes.png b/docs/_static/four_leaf_tree_shapes.png
diff --git a/docs/_static/topology_0_0.svg b/docs/_static/topology_0_0.svg
@@ -0,0 +1 @@
+<svg baseProfile="full" height="175" version="1.1" width="200" xmlns="http://www.w3.org/2000/svg" xmlns:ev="http://www.w3.org/2001/xml-events" xmlns:xlink="http://www.w3.org/1999/xlink"><defs><style type="text/css"><![CDATA[.axis {font-weight: bold}.tree, .axis {font-size: 14px; text-anchor:middle;}.edge {stroke: black; fill: none}.node > circle {r: 3px; fill: black; stroke: none}.tree text {dominant-baseline: middle}.mut > text.lft {transform: translateX(0.5em); text-anchor: start}.mut > text.rgt {transform: translateX(-0.5em); text-anchor: end}.root > text {transform: translateY(-0.8em)}.leaf > text {transform: translateY(1em)}.node > text.lft {transform: translate(0.5em, -0.5em); text-anchor: start}.node > text.rgt {transform: translate(-0.5em, -0.5em); text-anchor: end}.mut {fill: red; font-style: italic}]]></style></defs><g class="tree t0"><g class="node n3 root" transform="translate(100.0 30.0)"><g class="leaf node n0 p3 sample" transform="translate(-45.0 115.0)"><path class="edge" d="M 0 0 V -115.0 H 45.0" /><circle cx="0" cy="0" r="1" /><text>0</text></g><g class="leaf node n1 p3 sample" transform="translate(0.0 115.0)"><path class="edge" d="M 0 0 V -115.0 H 0.0" /><circle cx="0" cy="0" r="1" /><text>1</text></g><g class="leaf node n2 p3 sample" transform="translate(45.0 115.0)"><path class="edge" d="M 0 0 V -115.0 H -45.0" /><circle cx="0" cy="0" r="1" /><text>2</text></g><circle cx="0" cy="0" r="1" /><text /></g></g></svg>
diff --git a/docs/_static/topology_1_0.svg b/docs/_static/topology_1_0.svg
@@ -0,0 +1 @@
+<svg baseProfile="full" height="175" version="1.1" width="200" xmlns="http://www.w3.org/2000/svg" xmlns:ev="http://www.w3.org/2001/xml-events" xmlns:xlink="http://www.w3.org/1999/xlink"><defs><style type="text/css"><![CDATA[.axis {font-weight: bold}.tree, .axis {font-size: 14px; text-anchor:middle;}.edge {stroke: black; fill: none}.node > circle {r: 3px; fill: black; stroke: none}.tree text {dominant-baseline: middle}.mut > text.lft {transform: translateX(0.5em); text-anchor: start}.mut > text.rgt {transform: translateX(-0.5em); text-anchor: end}.root > text {transform: translateY(-0.8em)}.leaf > text {transform: translateY(1em)}.node > text.lft {transform: translate(0.5em, -0.5em); text-anchor: start}.node > text.rgt {transform: translate(-0.5em, -0.5em); text-anchor: end}.mut {fill: red; font-style: italic}]]></style></defs><g class="tree t0"><g class="node n4 root" transform="translate(88.75 30.0)"><g class="leaf node n0 p4 sample" transform="translate(-33.75 115.0)"><path class="edge" d="M 0 0 V -115.0 H 33.75" /><circle cx="0" cy="0" r="1" /><text>0</text></g><g class="node n3 p4" transform="translate(33.75 57.5)"><g class="leaf node n1 p3 sample" transform="translate(-22.5 57.5)"><path class="edge" d="M 0 0 V -57.5 H 22.5" /><circle cx="0" cy="0" r="1" /><text>1</text></g><g class="leaf node n2 p3 sample" transform="translate(22.5 57.5)"><path class="edge" d="M 0 0 V -57.5 H -22.5" /><circle cx="0" cy="0" r="1" /><text>2</text></g><path class="edge" d="M 0 0 V -57.5 H -33.75" /><circle cx="0" cy="0" r="1" /><text class="lft" /></g><circle cx="0" cy="0" r="1" /><text /></g></g></svg>
diff --git a/docs/_static/topology_1_1.svg b/docs/_static/topology_1_1.svg
@@ -0,0 +1 @@
+<svg baseProfile="full" height="175" version="1.1" width="200" xmlns="http://www.w3.org/2000/svg" xmlns:ev="http://www.w3.org/2001/xml-events" xmlns:xlink="http://www.w3.org/1999/xlink"><defs><style type="text/css"><![CDATA[.axis {font-weight: bold}.tree, .axis {font-size: 14px; text-anchor:middle;}.edge {stroke: black; fill: none}.node > circle {r: 3px; fill: black; stroke: none}.tree text {dominant-baseline: middle}.mut > text.lft {transform: translateX(0.5em); text-anchor: start}.mut > text.rgt {transform: translateX(-0.5em); text-anchor: end}.root > text {transform: translateY(-0.8em)}.leaf > text {transform: translateY(1em)}.node > text.lft {transform: translate(0.5em, -0.5em); text-anchor: start}.node > text.rgt {transform: translate(-0.5em, -0.5em); text-anchor: end}.mut {fill: red; font-style: italic}]]></style></defs><g class="tree t0"><g class="node n4 root" transform="translate(88.75 30.0)"><g class="leaf node n1 p4 sample" transform="translate(-33.75 115.0)"><path class="edge" d="M 0 0 V -115.0 H 33.75" /><circle cx="0" cy="0" r="1" /><text>1</text></g><g class="node n3 p4" transform="translate(33.75 57.5)"><g class="leaf node n0 p3 sample" transform="translate(-22.5 57.5)"><path class="edge" d="M 0 0 V -57.5 H 22.5" /><circle cx="0" cy="0" r="1" /><text>0</text></g><g class="leaf node n2 p3 sample" transform="translate(22.5 57.5)"><path class="edge" d="M 0 0 V -57.5 H -22.5" /><circle cx="0" cy="0" r="1" /><text>2</text></g><path class="edge" d="M 0 0 V -57.5 H -33.75" /><circle cx="0" cy="0" r="1" /><text class="lft" /></g><circle cx="0" cy="0" r="1" /><text /></g></g></svg>
diff --git a/docs/_static/topology_1_2.svg b/docs/_static/topology_1_2.svg
@@ -0,0 +1 @@
+<svg baseProfile="full" height="175" version="1.1" width="200" xmlns="http://www.w3.org/2000/svg" xmlns:ev="http://www.w3.org/2001/xml-events" xmlns:xlink="http://www.w3.org/1999/xlink"><defs><style type="text/css"><![CDATA[.axis {font-weight: bold}.tree, .axis {font-size: 14px; text-anchor:middle;}.edge {stroke: black; fill: none}.node > circle {r: 3px; fill: black; stroke: none}.tree text {dominant-baseline: middle}.mut > text.lft {transform: translateX(0.5em); text-anchor: start}.mut > text.rgt {transform: translateX(-0.5em); text-anchor: end}.root > text {transform: translateY(-0.8em)}.leaf > text {transform: translateY(1em)}.node > text.lft {transform: translate(0.5em, -0.5em); text-anchor: start}.node > text.rgt {transform: translate(-0.5em, -0.5em); text-anchor: end}.mut {fill: red; font-style: italic}]]></style></defs><g class="tree t0"><g class="node n4 root" transform="translate(88.75 30.0)"><g class="leaf node n2 p4 sample" transform="translate(-33.75 115.0)"><path class="edge" d="M 0 0 V -115.0 H 33.75" /><circle cx="0" cy="0" r="1" /><text>2</text></g><g class="node n3 p4" transform="translate(33.75 57.5)"><g class="leaf node n0 p3 sample" transform="translate(-22.5 57.5)"><path class="edge" d="M 0 0 V -57.5 H 22.5" /><circle cx="0" cy="0" r="1" /><text>0</text></g><g class="leaf node n1 p3 sample" transform="translate(22.5 57.5)"><path class="edge" d="M 0 0 V -57.5 H -22.5" /><circle cx="0" cy="0" r="1" /><text>1</text></g><path class="edge" d="M 0 0 V -57.5 H -33.75" /><circle cx="0" cy="0" r="1" /><text class="lft" /></g><circle cx="0" cy="0" r="1" /><text /></g></g></svg>
diff --git a/docs/combinatorics.rst b/docs/combinatorics.rst
@@ -1,6 +1,210 @@
+.. currentmodule:: tskit
 .. _sec_combinatorics:
 
-=====================================
-Ranking and Unranking Tree Topologies
-=====================================
-TODO
+=============
+Combinatorics
+=============
+tskit uses a combinatorial approach to identify unique topologies of
+rooted, leaf-labelled trees. It provides methods
+for enumerating all possible tree topologies, as well as converting
+back and forth between a tree and its position, or rank, in the
+enumeration of all possible topologies.
+These methods are not restricted to binary trees;
+they cover general rooted trees without unary nodes.
+
+=================================     =====================================
+:meth:`Tree.rank`                     Return the rank of this tree.
+:meth:`Tree.unrank`                   Return a Tree given its rank and
+                                      a number of leaves.
+:func:`tskit.all_trees`               Return a generator over all
+                                      leaf-labelled trees of n leaves.
+:func:`tskit.all_tree_shapes`         Return a generator over all
+                                      tree shapes of n leaves.
+:func:`tskit.all_tree_labellings`     Return a generator over all
+                                      labellings of the given tree's shape.
+=================================     =====================================
+
+.. _sec_tree_ranks:
+
++++++++++++++++++++++++
+Interpreting Tree Ranks
++++++++++++++++++++++++
+To understand tree ranks we must look at how leaf-labelled tree topologies
+are enumerated. For example, we can use :func:`tskit.all_trees`
+to generate all possible topologies of three leaves:
+
+.. code-block:: python
+
+    for t in tskit.all_trees(num_leaves=3):
+        display(SVG(t.draw(node_labels={0: 0, 1: 1, 2: 2}, order="tree")))
+
+.. image:: _static/topology_0_0.svg
+    :width: 24%
+.. image:: _static/topology_1_0.svg
+    :width: 24%
+.. image:: _static/topology_1_1.svg
+    :width: 24%
+.. image:: _static/topology_1_2.svg
+    :width: 24%
+
+In this sequence, there exist two distinct tree shapes and each shape
+can be labelled in at least one unique way. Given that topologies are
+ordered first by their shape and then by their labelling, a tree
+topology can be uniquely identified by
+
+1.
+    The shape of the tree
+2.
+    The labelling of the tree's shape
+
+We can refer to the first tree in the above enumeration as the
+first labelling of the first shape of trees with three leaves, or tree
+:math:`(0, 0)`. The second tree can be identified as the first labelling
+of the second shape, or :math:`(1, 0)`, and so on.
+This pair of indexes for the shape and labelling of a tree is referred
+to as the rank of the tree, and can be computed using the
+:meth:`Tree.rank` method.
+
+.. code-block:: python
+
+    ranks = [t.rank() for t in tskit.all_trees(num_leaves=3)]
+    print("Ranks of 3-leaf trees:", ranks)
+
+::
+
+   Ranks of 3-leaf trees: [(0, 0), (1, 0), (1, 1), (1, 2)]
+
+.. note::
+    Ranks in combinatorics are typically natural numbers. However,
+    we refer to this tuple of shape and label rank as a rank because
+    it serves the same purpose of indexing trees in an enumeration.
+
+For details on how shapes and labellings are ordered, see
+:ref:`sec_enumerating_topologies`.
+
+We can also reconstruct a leaf-labelled tree given its rank. This process
+is known as unranking, and can be performed using the :meth:`Tree.unrank`
+method.
+
+.. code-block:: python
+
+    for rank in [(0, 0), (1, 0), (1, 1), (1, 2)]:
+        t = tskit.Tree.unrank(rank, num_leaves=3)
+        display(SVG(t.draw(node_labels={0: 0, 1: 1, 2: 2}, order="tree")))
+
+.. image:: _static/topology_0_0.svg
+    :width: 24%
+.. image:: _static/topology_1_0.svg
+    :width: 24%
+.. image:: _static/topology_1_1.svg
+    :width: 24%
+.. image:: _static/topology_1_2.svg
+    :width: 24%
+
+++++++++
+Examples
+++++++++
+
+One application of tree ranks is to count the different
+leaf-labelled topologies in a tree sequence. Since the ranks
+are just tuples, we can use a Python ``Counter`` to track them.
+Here, we count and unrank the most frequently seen
+topology in a tree sequence. For brevity, this example assumes
+samples are synonymous with leaves.
+
+.. code-block:: python
+
+    rank_counts = collections.Counter(t.rank() for t in ts.trees())
+    most_freq_rank, count = rank_counts.most_common(1)[0]
+    tskit.Tree.unrank(most_freq_rank, num_leaves=ts.num_samples)
+
+.. _sec_enumerating_topologies:
+
+++++++++++++++++++++++
+Enumerating Topologies
+++++++++++++++++++++++
+
+This section expands briefly on the approach used to enumerate
+tree topologies that serves as the basis for :meth:`Tree.rank`
+and :meth:`Tree.unrank`.
+To enumerate all rooted, leaf-labelled tree topologies, we first
+formulate a system of ordering and enumerating tree shapes. Then
+we define an enumeration of labellings given an arbitrary tree shape.
+
+***********************
+Enumerating Tree Shapes
+***********************
+
+Starting with :math:`n = 1`, we see that the only shape for a tree
+with a single leaf is a single root leaf. A tree with :math:`n > 1`
+leaves can be obtained by joining at least two trees whose numbers of
+leaves sum to :math:`n`.
+This maps very closely to the concept of integer partitions.
+Each tree shape of :math:`n` leaves can be represented by taking a
+nondecreasing integer partition of :math:`n` (elements of the partition
+are sorted in nondecreasing order) and recursively partitioning its
+elements. The order in which we select partitions of :math:`n` is
+determined by the efficient
+`rule_asc <http://jeromekelleher.net/generating-integer-partitions.html>`_
+algorithm for generating them.
+
+All tree shapes with four leaves, and the partitions that generate
+them, are:
+
+.. image:: _static/four_leaf_tree_shapes.png
+   :alt: All four-leaf tree shapes and their generating partitions
+
+Note that the middle column reflects all tree shapes of three leaves
+in the right subtree!
+
+\* This excludes the partition [:math:`n`], since this would create a unary node,
+and trees with unary nodes cannot be enumerated (there are potentially infinitely many).
+
+.. note::
+    Using nondecreasing integer partitions enforces a
+    *canonical orientation* on the tree shapes, where children under a node are
+    ordered by the number of leaves below them.
+    This is important because it prevents us from repeating trees that are
+    topologically the same but whose children are ordered differently.
+
+*********************
+Labelling Tree Shapes
+*********************
+
+Tree shapes are useful in and of themselves, but we can use the enumeration
+formulated above to go further and assign labels to the leaves of each shape.
+
+Say we are given a tree :math:`T` with :math:`n` leaves, whose left-most
+subtree, :math:`T_l`, has :math:`k` leaves. For each of the :math:`n \choose k`
+ways to select labels to assign to :math:`T_l`, we produce a unique labelling
+of :math:`T`. This process of choosing labels is repeated for the other
+children of :math:`T` and then recursively for the subtrees.
+
+Looking back to the example from :ref:`sec_tree_ranks`, we can see
+the different unique ways to label a particular tree of three leaves.
+
+.. image:: _static/topology_1_0.svg
+    :width: 32%
+.. image:: _static/topology_1_1.svg
+    :width: 32%
+.. image:: _static/topology_1_2.svg
+    :width: 32%
+
+The order of the tree labellings is a direct result of the way in which
+combinations of labels are chosen. The implementation in tskit uses a
+standard lexicographic ordering to choose labels. See how the trees
+are sorted by the order in which the left leaf's label was chosen.
+
+.. note::
+    There is a caveat here regarding symmetry, similar to that of repeating
+    tree shapes. Symmetrical trees run the risk of creating redundant labellings
+    if all combinations of labels were exhausted. To prevent redundant labellings
+    we impose a *canonical labelling*. In the case of two symmetrical subtrees,
+    the left subtree must receive the minimum label from the label set. Notice
+    how this is the case in the right subtrees above.
+
+These two enumerations create a complete ordering of topologies where trees are
+ordered first by size (number of leaves), then by shape, then by their minimum
+label. It is this canonical order that enables efficient ranking and unranking
+of topologies.
+
diff --git a/docs/python-api.rst b/docs/python-api.rst
@@ -487,6 +487,21 @@ using a schema. See :ref:`sec_metadata`, :ref:`sec_metadata_api_overview` and
 
 .. _sec_stats_api:
 
+*************
+Combinatorics
+*************
+The following are generators for fully enumerating unique tree topologies.
+The position of a tree in the enumeration ``all_trees`` is given by
+:meth:`Tree.rank`. Inversely, a :class:`Tree` can be constructed from a
+position in the enumeration with :meth:`Tree.unrank`.
+See :ref:`sec_combinatorics` for details.
+
+.. autofunction:: tskit.all_trees
+
+.. autofunction:: tskit.all_tree_shapes
+
+.. autofunction:: tskit.all_tree_labellings
+
 **********************
 Linkage disequilibrium
 **********************
diff --git a/python/tests/test_combinatorics.py b/python/tests/test_combinatorics.py
@@ -173,16 +173,23 @@ def test_all_labelled_trees_4(self):
     def test_generate_trees_roundtrip(self):
         n = 5
         all_rank_trees = RankTree.all_labelled_trees(n)
-        all_tsk_trees = comb.all_trees(n)
+        all_tsk_trees = tskit.all_trees(n)
         for rank_tree, tsk_tree in zip(all_rank_trees, all_tsk_trees):
             self.assertEqual(rank_tree, RankTree.from_tsk_tree(tsk_tree))
 
+    def test_all_shapes_roundtrip(self):
+        n = 5
+        all_rank_tree_shapes = RankTree.all_unlabelled_trees(n)
+        all_tsk_tree_shapes = tskit.all_tree_shapes(n)
+        for rank_tree, tsk_tree in zip(all_rank_tree_shapes, all_tsk_tree_shapes):
+            self.assertTrue(rank_tree.shape_equal(RankTree.from_tsk_tree(tsk_tree)))
+
     def test_all_labellings_roundtrip(self):
         n = 5
         rank_tree = RankTree.unrank((comb.num_shapes(n) - 1, 0), n)
         tsk_tree = rank_tree.to_tsk_tree()
         rank_tree_labellings = RankTree.all_labellings(rank_tree)
-        tsk_tree_labellings = comb.all_tree_labellings(tsk_tree)
+        tsk_tree_labellings = tskit.all_tree_labellings(tsk_tree)
         for rank_t, tsk_t in zip(rank_tree_labellings, tsk_tree_labellings):
             self.assertEqual(rank_t, RankTree.from_tsk_tree(tsk_t))
 
diff --git a/python/tskit/__init__.py b/python/tskit/__init__.py
@@ -52,7 +52,11 @@
 from tskit.trees import *  # NOQA
 from tskit.tables import *  # NOQA
 from tskit.stats import *  # NOQA
-from tskit.combinatorics import *  # NOQA
+from tskit.combinatorics import (  # NOQA
+    all_trees,
+    all_tree_shapes,
+    all_tree_labellings,
+)
 from tskit.exceptions import *  # NOQA
 from tskit.util import *  # NOQA
 from tskit.metadata import *  # NOQA
diff --git a/python/tskit/combinatorics.py b/python/tskit/combinatorics.py
diff --git a/python/tskit/trees.py b/python/tskit/trees.py