|
| 1 | +import collections |
| 2 | +import itertools |
| 3 | +import sys |
| 4 | + |
| 5 | +import numpy as np |
| 6 | +from IPython.display import HTML |
| 7 | + |
| 8 | +def remove_edges(ts, edge_id_remove_list): |
| 9 | + edges_to_remove_by_child = collections.defaultdict(list) |
| 10 | + edge_id_remove_list = set(edge_id_remove_list) |
| 11 | + for m in ts.mutations(): |
| 12 | + if m.edge in edge_id_remove_list: |
| 13 | + # If we remove this edge, we will remove the associated mutation |
| 14 | + # as the child node won't have ancestral material in this region. |
| 15 | + # So we force the user to explicitly (re)move the mutations beforehand |
| 16 | + raise ValueError("Cannot remove edges that have associated mutations") |
| 17 | + for remove_edge in edge_id_remove_list: |
| 18 | + e = ts.edge(remove_edge) |
| 19 | + edges_to_remove_by_child[e.child].append(e) |
| 20 | + |
| 21 | + # sort left-to-right for each child |
| 22 | + for k, v in edges_to_remove_by_child.items(): |
| 23 | + edges_to_remove_by_child[k] = sorted(v, key=lambda e: e.left) |
| 24 | + # check no overlaps |
| 25 | + for e1, e2 in zip(edges_to_remove_by_child[k], edges_to_remove_by_child[k][1:]): |
| 26 | + assert e1.right <= e2.left |
| 27 | + |
| 28 | + # Sanity check: this means the topmost node will deal with modified edges left at the end |
| 29 | + assert ts.edge(-1).parent not in edges_to_remove_by_child |
| 30 | + |
| 31 | + new_edges = collections.defaultdict(list) |
| 32 | + tables = ts.dump_tables() |
| 33 | + tables.edges.clear() |
| 34 | + samples = set(ts.samples()) |
| 35 | + # Edges are sorted by parent time, youngest first, so we can iterate over |
| 36 | + # nodes-as-parents visiting children before parents by using itertools.groupby |
| 37 | + for parent_id, ts_edges in itertools.groupby(ts.edges(), lambda e: e.parent): |
| 38 | + # Iterate through the ts edges *plus* the polytomy edges we created in previous steps. |
| 39 | + # This allows us to re-edit polytomy edges when the edges_to_remove are stacked |
| 40 | + edges = list(ts_edges) |
| 41 | + if parent_id in new_edges: |
| 42 | + edges += new_edges.pop(parent_id) |
| 43 | + if parent_id in edges_to_remove_by_child: |
| 44 | + for e in edges: |
| 45 | + assert parent_id == e.parent |
| 46 | + l = -1 |
| 47 | + if e.id in edge_id_remove_list: |
| 48 | + continue |
| 49 | + # NB: we go left to right along the target edges, reducing edge e as required |
| 50 | + for target_edge in edges_to_remove_by_child[parent_id]: |
| 51 | + # As we go along the target_edges, gradually split e into chunks. |
| 52 | + # If edge e is in the target_edge region, change the edge parent |
| 53 | + assert target_edge.left > l |
| 54 | + l = target_edge.left |
| 55 | + if e.left >= target_edge.right: |
| 56 | + # This target edge is entirely to the LHS of edge e, with no overlap |
| 57 | + continue |
| 58 | + elif e.right <= target_edge.left: |
| 59 | + # This target edge is entirely to the RHS of edge e with no overlap. |
| 60 | + # Since target edges are sorted by left coord, all other target edges |
| 61 | + # are to RHS too, and we are finished dealing with edge e |
| 62 | + tables.edges.append(e) |
| 63 | + e = None |
| 64 | + break |
| 65 | + else: |
| 66 | + # Edge e must overlap with current target edge somehow |
| 67 | + if e.left < target_edge.left: |
| 68 | + # Edge had region to LHS of target |
| 69 | + # Add the left hand section (change the edge right coord) |
| 70 | + tables.edges.add_row(left=e.left, right=target_edge.left, parent=e.parent, child=e.child) |
| 71 | + e = e.replace(left=target_edge.left) |
| 72 | + if e.right > target_edge.right: |
| 73 | + # Edge continues after RHS of target |
| 74 | + assert e.left < target_edge.right |
| 75 | + new_edges[target_edge.parent].append( |
| 76 | + e.replace(right=target_edge.right, parent=target_edge.parent) |
| 77 | + ) |
| 78 | + e = e.replace(left=target_edge.right) |
| 79 | + else: |
| 80 | + # No more of edge e to RHS |
| 81 | + assert e.left < e.right |
| 82 | + new_edges[target_edge.parent].append(e.replace(parent=target_edge.parent)) |
| 83 | + e = None |
| 84 | + break |
| 85 | + if e is not None: |
| 86 | + # Need to add any remaining regions of edge back in |
| 87 | + tables.edges.append(e) |
| 88 | + else: |
| 89 | + # NB: sanity check at top means that the oldest node will have no edges above, |
| 90 | + # so the last iteration should hit this branch |
| 91 | + for e in edges: |
| 92 | + if e.id not in edge_id_remove_list: |
| 93 | + tables.edges.append(e) |
| 94 | + assert len(new_edges) == 0 |
| 95 | + tables.sort() |
| 96 | + return tables.tree_sequence() |
| 97 | + |
| 98 | +def unsupported_edges(ts, per_interval=False): |
| 99 | + """ |
| 100 | + Return the internal edges that are unsupported by a mutation. |
| 101 | + If ``per_interval`` is True, each interval needs to be supported, |
| 102 | + otherwise, a mutation on an edge (even if there are multiple intervals |
| 103 | + per edge) will result in all intervals on that edge being treated |
| 104 | + as supported. |
| 105 | + """ |
| 106 | + edges_to_remove = np.ones(ts.num_edges, dtype="bool") |
| 107 | + edges_to_remove[[m.edge for m in ts.mutations()]] = False |
| 108 | + # We don't remove edges above samples |
| 109 | + edges_to_remove[np.isin(ts.edges_child, ts.samples())] = False |
| 110 | + |
| 111 | + if per_interval: |
| 112 | + return np.where(edges_to_remove)[0] |
| 113 | + else: |
| 114 | + keep = (edges_to_remove == False) |
| 115 | + for p, c in zip(ts.edges_parent[keep], ts.edges_child[keep]): |
| 116 | + edges_to_remove[np.logical_and(ts.edges_parent == p, ts.edges_child == c)] = False |
| 117 | + return np.where(edges_to_remove)[0] |
| 118 | + |
| 119 | + |
| 120 | +class Workbook: |
| 121 | + @staticmethod |
| 122 | + def setup(): |
| 123 | + display(HTML( |
| 124 | + "<style type='text/css'>" + |
| 125 | + ".exercise {background-color: yellow; color: black; font-family: 'serif'; font-size: 1.2em}" + |
| 126 | + ".exercise code {font-size: 0.7em}" + |
| 127 | + "</style>" + |
| 128 | + "<h4>✅ Your notebook is ready to go!</h4>" + |
| 129 | + ("This notebook is not running in JupyterLite: you may need to install tskit, tszip, etc." |
| 130 | + if sys.platform != 'emscripten' else ''' |
| 131 | +To clear the notebook and reset, |
| 132 | +select "Clear Browser Data" from the JupyterLite help menu. |
| 133 | +''') |
| 134 | + )) |
| 135 | + |
| 136 | + def node_coalescence_status(arg): |
| 137 | + """ |
| 138 | + Uses the num_children_array attribute to find nodes that represent local coalescence. |
| 139 | + See https://tskit.dev/tskit/docs/latest/python-api.html#tskit.Tree.num_children_array |
| 140 | + Returns an array of length num_nodes containing 0 if a node never has any coalescent |
| 141 | + segments, 1 if some segments of the node are coalescent and some unary, and 2 if |
| 142 | + all node segments represent a local coalescence point. |
| 143 | + """ |
| 144 | + has_unary = np.zeros(arg.num_nodes, dtype=int) |
| 145 | + has_coal = np.zeros(arg.num_nodes, dtype=int) |
| 146 | + tree = arg.first() |
| 147 | + nca = tree.num_children_array |
| 148 | + for edge_diffs in arg.edge_diffs(): |
| 149 | + for e in edge_diffs.edges_out: |
| 150 | + if nca[e.parent] == 0: |
| 151 | + has_unary[e.parent] = 1 |
| 152 | + elif nca[e.parent] > 1: |
| 153 | + has_coal[e.parent] = 1 |
| 154 | + for e in edge_diffs.edges_in: |
| 155 | + if nca[e.parent] == 0: |
| 156 | + has_unary[e.parent] = 1 |
| 157 | + elif nca[e.parent] > 1: |
| 158 | + has_coal[e.parent] = 1 |
| 159 | + tree.next() |
| 160 | + return np.where(has_coal, np.where(has_unary, 1, 2), 0) |
| 161 | + |
| 162 | + def remove_unsupported_edges(ts, per_interval=True): |
| 163 | + """ |
| 164 | + Remove edges from the tree sequence that are unsupported by mutations. |
| 165 | + If ``per_interval`` is True, each interval needs to be supported, |
| 166 | + otherwise, a mutation on an edge (even if there are multiple intervals |
| 167 | + per edge) will result in all intervals on that edge being treated |
| 168 | + as supported. |
| 169 | + """ |
| 170 | + edges_to_remove = unsupported_edges(ts, per_interval=per_interval) |
| 171 | + tables = remove_edges(ts, edges_to_remove).dump_tables() |
| 172 | + tables.edges.squash() |
| 173 | + return tables.tree_sequence() |
| 174 | + |
| 175 | +class Workbook1(Workbook): |
| 176 | + url = "json/WhatIsAnARG/Workbook1/" # could also put a full URL here, but a local one will work offline too |
| 177 | + |
| 178 | +class Workbook2(Workbook): |
| 179 | + url = "json/WhatIsAnARG/Workbook2/" # could also put a full URL here, but a local one will work offline too |
0 commit comments