Skip to content

Commit c5de2e1

Browse files
hyanwongjeromekelleher
authored andcommitted
Add trim() to tables and tree sequences
1 parent 165c056 commit c5de2e1

File tree

3 files changed

+366
-0
lines changed

3 files changed

+366
-0
lines changed

python/tests/test_topology.py

Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import itertools
2929
import random
3030
import json
31+
import sys
3132

3233
import numpy as np
3334
import msprime
@@ -4984,3 +4985,242 @@ def test_ts_single_tree_delete_middle(self):
49844985
ts_keep = ts.delete_intervals([[0.25, 0.5]], record_provenance=False)
49854986
ts_delete = ts.keep_intervals([[0, 0.25], [0.5, 1.0]], record_provenance=False)
49864987
self.assertTrue(ts_equal(ts_keep, ts_delete))
4988+
4989+
4990+
class TestTrim(unittest.TestCase):
4991+
"""
4992+
Test the trimming functionality
4993+
"""
4994+
def add_mutations(self, ts, position, ancestral_state, derived_states, nodes):
4995+
"""
4996+
Create a site at the specified position and assign mutations to the specified
4997+
nodes (could be sequential mutations)
4998+
"""
4999+
tables = ts.dump_tables()
5000+
site = tables.sites.add_row(position, ancestral_state)
5001+
for state, node in zip(derived_states, nodes):
5002+
tables.mutations.add_row(site, node, state)
5003+
tables.sort()
5004+
tables.build_index()
5005+
tables.compute_mutation_parents()
5006+
return tables.tree_sequence()
5007+
5008+
def verify_sites(self, source_tree, trimmed_tree, position_offset):
5009+
source_sites = list(source_tree.sites())
5010+
trimmed_sites = list(trimmed_tree.sites())
5011+
self.assertEqual(len(source_sites), len(trimmed_sites))
5012+
for source_site, trimmed_site in zip(source_sites, trimmed_sites):
5013+
self.assertAlmostEqual(
5014+
source_site.position, position_offset + trimmed_site.position)
5015+
self.assertEqual(
5016+
source_site.ancestral_state, trimmed_site.ancestral_state)
5017+
self.assertEqual(source_site.metadata, trimmed_site.metadata)
5018+
self.assertEqual(
5019+
len(source_site.mutations), len(trimmed_site.mutations))
5020+
for source_mut, trimmed_mut in zip(
5021+
source_site.mutations, trimmed_site.mutations):
5022+
self.assertEqual(source_mut.node, trimmed_mut.node)
5023+
self.assertEqual(
5024+
source_mut.derived_state, trimmed_mut.derived_state)
5025+
self.assertEqual(
5026+
source_mut.metadata, trimmed_mut.metadata)
5027+
# mutation.parent id may have changed after deleting redundant mutations
5028+
if source_mut.parent == trimmed_mut.parent == tskit.NULL:
5029+
pass
5030+
else:
5031+
self.assertEqual(
5032+
source_tree.tree_sequence.mutation(source_mut.parent).node,
5033+
trimmed_tree.tree_sequence.mutation(trimmed_mut.parent).node)
5034+
5035+
def verify_ltrim(self, source_ts, trimmed_ts):
5036+
deleted_span = source_ts.first().span
5037+
self.assertAlmostEqual(
5038+
source_ts.sequence_length, trimmed_ts.sequence_length + deleted_span)
5039+
self.assertEqual(source_ts.num_trees, trimmed_ts.num_trees + 1)
5040+
for j in range(trimmed_ts.num_trees):
5041+
source_tree = source_ts.at_index(j + 1)
5042+
trimmed_tree = trimmed_ts.at_index(j)
5043+
self.assertEqual(source_tree.parent_dict, trimmed_tree.parent_dict)
5044+
self.assertAlmostEqual(source_tree.span, trimmed_tree.span)
5045+
self.assertAlmostEqual(
5046+
source_tree.interval[0], trimmed_tree.interval[0] + deleted_span)
5047+
self.verify_sites(source_tree, trimmed_tree, deleted_span)
5048+
5049+
def verify_rtrim(self, source_ts, trimmed_ts):
5050+
deleted_span = source_ts.last().span
5051+
self.assertAlmostEqual(
5052+
source_ts.sequence_length, trimmed_ts.sequence_length + deleted_span)
5053+
self.assertEqual(source_ts.num_trees, trimmed_ts.num_trees + 1)
5054+
for j in range(trimmed_ts.num_trees):
5055+
source_tree = source_ts.at_index(j)
5056+
trimmed_tree = trimmed_ts.at_index(j)
5057+
self.assertEqual(source_tree.parent_dict, trimmed_tree.parent_dict)
5058+
self.assertEqual(source_tree.interval, trimmed_tree.interval)
5059+
self.verify_sites(source_tree, trimmed_tree, 0)
5060+
5061+
def clear_left_mutate(self, ts, left, num_sites):
5062+
"""
5063+
Clear the edges from a tree sequence left of the specified coordinate
5064+
and add in num_sites regularly spaced sites into the cleared region.
5065+
"""
5066+
new_ts = ts.delete_intervals([[0.0, left]])
5067+
for j, x in enumerate(np.linspace(0, left, num_sites, endpoint=False)):
5068+
new_ts = self.add_mutations(new_ts, x, 'A' * j, ['T'] * j, range(j+1))
5069+
return new_ts
5070+
5071+
def clear_right_mutate(self, ts, right, num_sites):
5072+
"""
5073+
Clear the edges from a tree sequence right of the specified coordinate
5074+
and add in num_sites regularly spaced sites into the cleared region.
5075+
"""
5076+
new_ts = ts.delete_intervals([[right, ts.sequence_length]])
5077+
for j, x in enumerate(
5078+
np.linspace(right, ts.sequence_length, num_sites, endpoint=False)):
5079+
new_ts = self.add_mutations(new_ts, x, 'A' * j, ['T'] * j, range(j+1))
5080+
return new_ts
5081+
5082+
def clear_left_right_234(self, left, right):
5083+
"""
5084+
Clear edges to left and right and add 2 mutations at the same site into the left
5085+
cleared region, 3 at the same site into the untouched region, and 4 into the
5086+
right cleared region.
5087+
"""
5088+
assert 0.0 < left < right < 1.0
5089+
ts = msprime.simulate(10, recombination_rate=10, random_seed=2)
5090+
left_pos = np.mean([0.0, left])
5091+
left_root = ts.at(left_pos).root
5092+
mid_pos = np.mean([left, right])
5093+
mid_root = ts.at(mid_pos).root
5094+
right_pos = np.mean([right, ts.sequence_length])
5095+
right_root = ts.at(right_pos).root
5096+
# Clear
5097+
ts = ts.keep_intervals([[left, right]], simplify=False)
5098+
ts = self.add_mutations(ts, left_pos, 'A', ['T', 'C'], [left_root, 0])
5099+
ts = self.add_mutations(ts, mid_pos, 'T', ['A', 'C', 'G'], [mid_root, 0, 1])
5100+
ts = self.add_mutations(
5101+
ts, right_pos, 'X', ['T', 'C', 'G', 'A'], [right_root, 0, 1, 2])
5102+
self.assertNotEqual(np.min(ts.tables.edges.left), 0)
5103+
self.assertEqual(ts.num_mutations, 9)
5104+
self.assertEqual(ts.num_sites, 3)
5105+
return ts
5106+
5107+
def test_ltrim_single_tree(self):
5108+
ts = msprime.simulate(10, mutation_rate=12, random_seed=2)
5109+
ts = self.clear_left_mutate(ts, 0.5, 10)
5110+
self.verify_ltrim(ts, ts.ltrim())
5111+
5112+
def test_ltrim_single_tree_no_mutations(self):
5113+
ts = msprime.simulate(10, random_seed=2)
5114+
ts = self.clear_left_mutate(ts, 0.5, 0)
5115+
self.verify_ltrim(ts, ts.ltrim())
5116+
5117+
def test_ltrim_single_tree_tiny_left(self):
5118+
ts = msprime.simulate(10, mutation_rate=12, random_seed=2)
5119+
ts = self.clear_left_mutate(ts, 1e-200, 10)
5120+
self.verify_ltrim(ts, ts.ltrim())
5121+
5122+
def test_ltrim_many_trees(self):
5123+
ts = msprime.simulate(10, recombination_rate=10, mutation_rate=12, random_seed=2)
5124+
ts = self.clear_left_mutate(ts, 0.5, 10)
5125+
self.verify_ltrim(ts, ts.ltrim())
5126+
5127+
def test_ltrim_many_trees_left_min(self):
5128+
ts = msprime.simulate(10, recombination_rate=10, mutation_rate=12, random_seed=2)
5129+
ts = self.clear_left_mutate(ts, sys.float_info.min, 10)
5130+
self.verify_ltrim(ts, ts.ltrim())
5131+
5132+
def test_ltrim_many_trees_left_epsilon(self):
5133+
ts = msprime.simulate(10, recombination_rate=10, mutation_rate=12, random_seed=2)
5134+
ts = self.clear_left_mutate(ts, sys.float_info.epsilon, 0)
5135+
self.verify_ltrim(ts, ts.ltrim())
5136+
5137+
def test_ltrim_empty(self):
5138+
ts = msprime.simulate(2, random_seed=2)
5139+
ts = ts.delete_intervals([[0, 1]])
5140+
self.assertRaises(ValueError, ts.ltrim)
5141+
5142+
def test_ltrim_multiple_mutations(self):
5143+
ts = self.clear_left_right_234(0.1, 0.5)
5144+
trimmed_ts = ts.ltrim()
5145+
self.assertAlmostEqual(trimmed_ts.sequence_length, 0.9)
5146+
self.assertEqual(trimmed_ts.num_sites, 2)
5147+
self.assertEqual(trimmed_ts.num_mutations, 7) # We should have deleted 2
5148+
self.assertEqual(np.min(trimmed_ts.tables.edges.left), 0)
5149+
self.verify_ltrim(ts, trimmed_ts)
5150+
5151+
def test_rtrim_single_tree(self):
5152+
ts = msprime.simulate(10, mutation_rate=12, random_seed=2)
5153+
ts = self.clear_right_mutate(ts, 0.5, 10)
5154+
self.verify_rtrim(ts, ts.rtrim())
5155+
5156+
def test_rtrim_single_tree_no_mutations(self):
5157+
ts = msprime.simulate(10, random_seed=2)
5158+
ts = self.clear_right_mutate(ts, 0.5, 0)
5159+
self.verify_rtrim(ts, ts.rtrim())
5160+
5161+
def test_rtrim_single_tree_tiny_left(self):
5162+
ts = msprime.simulate(10, mutation_rate=12, random_seed=2)
5163+
ts = self.clear_right_mutate(ts, 1e-200, 10)
5164+
self.verify_rtrim(ts, ts.rtrim())
5165+
5166+
def test_rtrim_many_trees(self):
5167+
ts = msprime.simulate(10, recombination_rate=10, mutation_rate=12, random_seed=2)
5168+
ts = self.clear_right_mutate(ts, 0.5, 10)
5169+
self.verify_rtrim(ts, ts.rtrim())
5170+
5171+
def test_rtrim_many_trees_left_min(self):
5172+
ts = msprime.simulate(10, recombination_rate=10, mutation_rate=12, random_seed=2)
5173+
ts = self.clear_right_mutate(ts, sys.float_info.min, 10)
5174+
self.verify_rtrim(ts, ts.rtrim())
5175+
5176+
def test_rtrim_many_trees_left_epsilon(self):
5177+
ts = msprime.simulate(10, recombination_rate=10, mutation_rate=12, random_seed=2)
5178+
ts = self.clear_right_mutate(ts, sys.float_info.epsilon, 0)
5179+
self.verify_rtrim(ts, ts.rtrim())
5180+
5181+
def test_rtrim_empty(self):
5182+
ts = msprime.simulate(2, random_seed=2)
5183+
ts = ts.delete_intervals([[0, 1]])
5184+
self.assertRaises(ValueError, ts.rtrim)
5185+
5186+
def test_rtrim_multiple_mutations(self):
5187+
ts = self.clear_left_right_234(0.1, 0.5)
5188+
trimmed_ts = ts.rtrim()
5189+
self.assertAlmostEqual(trimmed_ts.sequence_length, 0.5)
5190+
self.assertEqual(trimmed_ts.num_sites, 2)
5191+
self.assertEqual(trimmed_ts.num_mutations, 5) # We should have deleted 4
5192+
self.assertEqual(
5193+
np.max(trimmed_ts.tables.edges.right), trimmed_ts.tables.sequence_length)
5194+
self.verify_rtrim(ts, trimmed_ts)
5195+
5196+
def test_trim_multiple_mutations(self):
5197+
ts = self.clear_left_right_234(0.1, 0.5)
5198+
trimmed_ts = ts.trim()
5199+
self.assertAlmostEqual(trimmed_ts.sequence_length, 0.4)
5200+
self.assertEqual(trimmed_ts.num_mutations, 3)
5201+
self.assertEqual(trimmed_ts.num_sites, 1)
5202+
self.assertEqual(np.min(trimmed_ts.tables.edges.left), 0)
5203+
self.assertEqual(
5204+
np.max(trimmed_ts.tables.edges.right), trimmed_ts.tables.sequence_length)
5205+
5206+
def test_trims_no_effect(self):
5207+
# Deleting from middle should have no effect on any trim function
5208+
ts = msprime.simulate(10, recombination_rate=2, mutation_rate=50, random_seed=2)
5209+
ts = ts.delete_intervals([[0.1, 0.5]])
5210+
trimmed_ts = ts.ltrim(record_provenance=False)
5211+
self.assertTrue(ts_equal(ts, trimmed_ts))
5212+
trimmed_ts = ts.rtrim(record_provenance=False)
5213+
self.assertTrue(ts_equal(ts, trimmed_ts))
5214+
trimmed_ts = ts.trim(record_provenance=False)
5215+
self.assertTrue(ts_equal(ts, trimmed_ts))
5216+
5217+
def test_failure_with_migrations(self):
5218+
# All trim functions fail if migrations present
5219+
ts = msprime.simulate(10, recombination_rate=2, random_seed=2)
5220+
ts = ts.keep_intervals([[0.1, 0.5]])
5221+
tables = ts.dump_tables()
5222+
tables.migrations.add_row(0, 1, 0, 0, 0, 0)
5223+
ts = tables.tree_sequence()
5224+
self.assertRaises(ValueError, ts.ltrim)
5225+
self.assertRaises(ValueError, ts.rtrim)
5226+
self.assertRaises(ValueError, ts.trim)

python/tskit/tables.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1791,6 +1791,88 @@ def keep_intervals(self, intervals, simplify=True, record_provenance=True):
17911791
self.provenances.add_row(record=json.dumps(
17921792
provenance.get_provenance_dict(parameters)))
17931793

1794+
def _check_trim_conditions(self):
1795+
if self.migrations.num_rows > 0:
1796+
raise ValueError(
1797+
"You cannot trim a tree sequence containing migrations")
1798+
if self.edges.num_rows == 0:
1799+
raise ValueError(
1800+
"Trimming a tree sequence with no edges would reduce the sequence length"
1801+
" to zero, which is not allowed")
1802+
1803+
def ltrim(self, record_provenance=True):
1804+
"""
1805+
Reset the coordinate system used in these tables, changing the left and right
1806+
genomic positions in the edge table such that the leftmost edge now starts at
1807+
position 0. This is identical to :meth:`TreeSequence.ltrim` but acts *in place*
1808+
to alter the data in this :class:`TableCollection`.
1809+
1810+
:param bool record_provenance: If ``True``, add details of this operation
1811+
to the provenance table in this TableCollection. (Default: ``True``).
1812+
"""
1813+
self._check_trim_conditions()
1814+
leftmost = np.min(self.edges.left)
1815+
self.delete_sites(
1816+
np.where(self.sites.position < leftmost), record_provenance=False)
1817+
self.edges.set_columns(
1818+
left=self.edges.left - leftmost, right=self.edges.right - leftmost,
1819+
parent=self.edges.parent, child=self.edges.child)
1820+
self.sites.set_columns(
1821+
position=self.sites.position - leftmost,
1822+
ancestral_state=self.sites.ancestral_state,
1823+
ancestral_state_offset=self.sites.ancestral_state_offset,
1824+
metadata=self.sites.metadata,
1825+
metadata_offset=self.sites.metadata_offset)
1826+
self.sequence_length = self.sequence_length - leftmost
1827+
if record_provenance:
1828+
# TODO replace with a version of https://github.com/tskit-dev/tskit/pull/243
1829+
parameters = {
1830+
"command": "ltrim",
1831+
}
1832+
self.provenances.add_row(record=json.dumps(
1833+
provenance.get_provenance_dict(parameters)))
1834+
1835+
def rtrim(self, record_provenance=True):
1836+
"""
1837+
Reset the ``sequence_length`` property so that the sequence ends at the end of
1838+
the last edge. This is identical to :meth:`TreeSequence.rtrim` but acts
1839+
*in place* to alter the data in this :class:`TableCollection`.
1840+
1841+
:param bool record_provenance: If ``True``, add details of this operation
1842+
to the provenance table in this TableCollection. (Default: ``True``).
1843+
"""
1844+
self._check_trim_conditions()
1845+
rightmost = np.max(self.edges.right)
1846+
self.delete_sites(
1847+
np.where(self.sites.position >= rightmost), record_provenance=False)
1848+
self.sequence_length = rightmost
1849+
if record_provenance:
1850+
# TODO replace with a version of https://github.com/tskit-dev/tskit/pull/243
1851+
parameters = {
1852+
"command": "rtrim",
1853+
}
1854+
self.provenances.add_row(record=json.dumps(
1855+
provenance.get_provenance_dict(parameters)))
1856+
1857+
def trim(self, record_provenance=True):
1858+
"""
1859+
Trim away any empty regions on the right and left of the tree sequence encoded by
1860+
these tables. This is identical to :meth:`TreeSequence.trim` but acts *in place*
1861+
to alter the data in this :class:`TableCollection`.
1862+
1863+
:param bool record_provenance: If ``True``, add details of this operation
1864+
to the provenance table in this TableCollection. (Default: ``True``).
1865+
"""
1866+
self.rtrim(record_provenance=False)
1867+
self.ltrim(record_provenance=False)
1868+
if record_provenance:
1869+
# TODO replace with a version of https://github.com/tskit-dev/tskit/pull/243
1870+
parameters = {
1871+
"command": "trim",
1872+
}
1873+
self.provenances.add_row(record=json.dumps(
1874+
provenance.get_provenance_dict(parameters)))
1875+
17941876
def has_index(self):
17951877
"""
17961878
Returns True if this TableCollection is indexed.

python/tskit/trees.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3401,6 +3401,50 @@ def keep_intervals(self, intervals, simplify=True, record_provenance=True):
34013401
tables.keep_intervals(intervals, simplify, record_provenance)
34023402
return tables.tree_sequence()
34033403

3404+
def ltrim(self, record_provenance=True):
3405+
"""
3406+
Returns a copy of this tree sequence with a potentially changed coordinate
3407+
system, such that empty regions (i.e. those not covered by any edge) at the start
3408+
of the tree sequence are trimmed away, and the leftmost edge starts at position
3409+
0. This affects the reported position of sites and edges. Additionally, sites and
3410+
their associated mutations to the left of the new zero point are thrown away.
3411+
3412+
:param bool record_provenance: If True, add details of this operation to the
3413+
provenance information of the returned tree sequence. (Default: True).
3414+
"""
3415+
tables = self.dump_tables()
3416+
tables.ltrim(record_provenance)
3417+
return tables.tree_sequence()
3418+
3419+
def rtrim(self, record_provenance=True):
3420+
"""
3421+
Returns a copy of this tree sequence with the ``sequence_length`` property reset
3422+
so that the sequence ends at the end of the rightmost edge. Additionally, sites
3423+
and their associated mutations at positions greater than the new
3424+
``sequence_length`` are thrown away.
3425+
3426+
:param bool record_provenance: If True, add details of this operation to the
3427+
provenance information of the returned tree sequence. (Default: True).
3428+
"""
3429+
tables = self.dump_tables()
3430+
tables.rtrim(record_provenance)
3431+
return tables.tree_sequence()
3432+
3433+
def trim(self, record_provenance=True):
3434+
"""
3435+
Returns a copy of this tree sequence with any empty regions (i.e. those not
3436+
covered by any edge) on the right and left trimmed away. This may reset both the
3437+
coordinate system and the ``sequence_length`` property. It is functionally
3438+
equivalent to :meth:`.rtrim` followed by :meth:`.ltrim`. Sites and their
3439+
associated mutations in the empty regions are thrown away.
3440+
3441+
:param bool record_provenance: If True, add details of this operation to the
3442+
provenance information of the returned tree sequence. (Default: True).
3443+
"""
3444+
tables = self.dump_tables()
3445+
tables.trim(record_provenance)
3446+
return tables.tree_sequence()
3447+
34043448
def draw_svg(self, path=None, **kwargs):
34053449
# TODO document this method, including semantic details of the
34063450
# returned SVG object.

0 commit comments

Comments
 (0)