Skip to content

Commit d68837e

Browse files
committed
Add map_to_vcf_model to TreeSequence and use in VCF code
1 parent 0663aaa commit d68837e

File tree

5 files changed

+544
-129
lines changed

5 files changed

+544
-129
lines changed

python/CHANGELOG.rst

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,27 @@
1717
so a set of tree sequence can be added to the right of an existing one.
1818
(:user:`hyanwong`, :pr:`3165`, :issue:`3164`)
1919

20+
- Add ``TreeSequence.map_to_vcf_model`` method to return a mapping of
21+
the tree sequence to the VCF model.
22+
(:user:`benjeffery`, :pr:`3163`)
2023

2124
**Fixes**
2225

2326
- Correct assertion message when tables are compared with metadata ignored.
2427
(:user:`benjeffery`, :pr:`3162`, :issue:`3161`)
2528

29+
**Breaking changes**
30+
31+
- ``TreeSequence.write_vcf`` now filters non-sample nodes from individuals
32+
by default, instead of raising an error. These nodes can be included using the
33+
new ``include_non_sample_nodes`` argument.
34+
By default individual names (sample IDs) in VCF output are now of the form
35+
``tsk_{individual.id}`` Previously these were always
36+
``"tsk_{j}" for j in range(num_individuals)``. This may break some downstream
37+
code if individuals are specified. To fix, manually specify ``individual_names``
38+
to the required pattern.
39+
(:user:`benjeffery`, :pr:`3163`)
40+
2641
--------------------
2742
[0.6.3] - 2025-04-28
2843
--------------------

python/tests/test_highlevel.py

Lines changed: 242 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5668,3 +5668,245 @@ def test_different_node_flags(self):
56685668
result = ts.sample_nodes_by_ploidy(2)
56695669
assert result.shape == (1, 2)
56705670
assert_array_equal(result, np.array([[0, 2]]))
5671+
5672+
5673+
class TestMapToVcfModel:
5674+
def test_no_individuals_default_ploidy(self):
5675+
ts = tskit.Tree.generate_balanced(4).tree_sequence
5676+
assert ts.num_individuals == 0
5677+
5678+
# Default ploidy should be 1
5679+
result = ts.map_to_vcf_model()
5680+
assert isinstance(result, tskit.VcfModelMapping)
5681+
assert result.individuals_nodes.shape == (4, 1)
5682+
for i in range(4):
5683+
assert result.individuals_nodes[i, 0] == i
5684+
assert result.individuals_name.shape == (4,)
5685+
for i in range(4):
5686+
assert result.individuals_name[i] == f"tsk_{i}"
5687+
5688+
with pytest.raises(
5689+
ValueError,
5690+
match="Cannot include non-sample nodes when individuals are not present",
5691+
):
5692+
ts.map_to_vcf_model(include_non_sample_nodes=True)
5693+
5694+
def test_no_individuals_custom_ploidy(self):
5695+
ts = tskit.Tree.generate_balanced(6).tree_sequence
5696+
assert ts.num_individuals == 0
5697+
5698+
# Use ploidy = 2
5699+
result = ts.map_to_vcf_model(ploidy=2)
5700+
assert isinstance(result, tskit.VcfModelMapping)
5701+
assert result.individuals_nodes.shape == (3, 2)
5702+
for i in range(3):
5703+
assert result.individuals_nodes[i, 0] == i * 2
5704+
assert result.individuals_nodes[i, 1] == i * 2 + 1
5705+
assert result.individuals_name.shape == (3,)
5706+
for i in range(3):
5707+
assert result.individuals_name[i] == f"tsk_{i}"
5708+
5709+
def test_no_individuals_uneven_ploidy(self):
5710+
ts = tskit.Tree.generate_balanced(5).tree_sequence
5711+
# This tree sequence has no individuals
5712+
assert ts.num_individuals == 0
5713+
5714+
# 5 samples cannot be evenly divided into ploidy=2
5715+
with pytest.raises(ValueError, match="not a multiple"):
5716+
ts.map_to_vcf_model(ploidy=2)
5717+
5718+
def test_with_individuals(self):
5719+
ts = msprime.sim_ancestry(
5720+
5,
5721+
random_seed=42,
5722+
)
5723+
result = ts.map_to_vcf_model()
5724+
assert isinstance(result, tskit.VcfModelMapping)
5725+
assert result.individuals_nodes.shape == (5, 2)
5726+
assert np.array_equal(
5727+
result.individuals_nodes,
5728+
np.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]),
5729+
)
5730+
assert result.individuals_name.shape == (5,)
5731+
for i in range(5):
5732+
assert result.individuals_name[i] == f"tsk_{i}"
5733+
5734+
def test_with_individuals_and_ploidy_error(self):
5735+
tables = tskit.TableCollection(1.0)
5736+
tables.individuals.add_row()
5737+
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=0)
5738+
ts = tables.tree_sequence()
5739+
5740+
with pytest.raises(ValueError, match="Cannot specify ploidy when individuals"):
5741+
ts.map_to_vcf_model(ploidy=2)
5742+
5743+
def test_specific_individuals(self):
5744+
tables = tskit.TableCollection(1.0)
5745+
# Create 5 individuals with varying ploidy
5746+
for i in range(5):
5747+
tables.individuals.add_row()
5748+
# Individuals have ploidy i+1
5749+
for _ in range(i + 1):
5750+
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=i)
5751+
ts = tables.tree_sequence()
5752+
5753+
result = ts.map_to_vcf_model(individuals=[1, 3])
5754+
assert isinstance(result, tskit.VcfModelMapping)
5755+
# Individual 1 has ploidy 2, individual 3 has ploidy 4
5756+
assert result.individuals_nodes.shape == (2, 5)
5757+
assert np.array_equal(result.individuals_nodes[0], [1, 2, -1, -1, -1])
5758+
assert np.array_equal(result.individuals_nodes[1], [6, 7, 8, 9, -1])
5759+
5760+
assert result.individuals_name.shape == (2,)
5761+
assert result.individuals_name[0] == "tsk_1"
5762+
assert result.individuals_name[1] == "tsk_3"
5763+
5764+
def test_individual_with_no_nodes(self):
5765+
tables = tskit.TableCollection(1.0)
5766+
# Individual with no nodes
5767+
tables.individuals.add_row()
5768+
# Individual with nodes
5769+
tables.individuals.add_row()
5770+
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=1)
5771+
ts = tables.tree_sequence()
5772+
5773+
result = ts.map_to_vcf_model()
5774+
assert result.individuals_nodes.shape == (2, 1)
5775+
assert np.array_equal(result.individuals_nodes, [[-1], [0]])
5776+
5777+
def test_individual_with_no_nodes_only(self):
5778+
tables = tskit.TableCollection(1.0)
5779+
# Individual with no nodes
5780+
tables.individuals.add_row()
5781+
# Individual with nodes
5782+
tables.individuals.add_row()
5783+
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=1)
5784+
ts = tables.tree_sequence()
5785+
5786+
result = ts.map_to_vcf_model(individuals=[0])
5787+
assert result.individuals_nodes.shape == (1, 1)
5788+
assert np.array_equal(result.individuals_nodes, [[-1]])
5789+
5790+
def test_invalid_individual_id(self):
5791+
tables = tskit.TableCollection(1.0)
5792+
tables.individuals.add_row()
5793+
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=0)
5794+
ts = tables.tree_sequence()
5795+
5796+
with pytest.raises(ValueError, match="Invalid individual ID"):
5797+
ts.map_to_vcf_model(individuals=[-1])
5798+
5799+
with pytest.raises(ValueError, match="Invalid individual ID"):
5800+
ts.map_to_vcf_model(individuals=[1])
5801+
5802+
def test_mixed_sample_non_sample_ordering(self):
5803+
tables = tskit.TableCollection(1.0)
5804+
tables.individuals.add_row()
5805+
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=0)
5806+
tables.nodes.add_row(flags=0, time=0, individual=0) # Non-sample node
5807+
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=0)
5808+
tables.nodes.add_row(flags=0, time=0, individual=0) # Non-sample node
5809+
tables.individuals.add_row()
5810+
tables.nodes.add_row(flags=0, time=0, individual=1) # Non-sample node
5811+
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=1)
5812+
ts = tables.tree_sequence()
5813+
5814+
result = ts.map_to_vcf_model()
5815+
assert result.individuals_nodes.shape == (2, 4)
5816+
assert np.array_equal(
5817+
result.individuals_nodes,
5818+
np.array([[0, 2, -1, -1], [5, -1, -1, -1]]),
5819+
)
5820+
5821+
result = ts.map_to_vcf_model(include_non_sample_nodes=True)
5822+
assert result.individuals_nodes.shape == (2, 4)
5823+
assert np.array_equal(
5824+
result.individuals_nodes,
5825+
np.array([[0, 1, 2, 3], [4, 5, -1, -1]]),
5826+
)
5827+
5828+
def test_samples_without_individuals_warning(self):
5829+
tables = tskit.TableCollection(1.0)
5830+
tables.individuals.add_row()
5831+
# Node with individual
5832+
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=0)
5833+
# Node without individual
5834+
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=tskit.NULL)
5835+
ts = tables.tree_sequence()
5836+
5837+
with warnings.catch_warnings(record=True) as w:
5838+
ts.map_to_vcf_model()
5839+
assert len(w) == 1
5840+
assert "At least one sample node does not have an individual ID" in str(
5841+
w[0].message
5842+
)
5843+
5844+
def test_metadata_key_for_names(self):
5845+
tables = tskit.TableCollection(1.0)
5846+
5847+
# Add individuals with metadata
5848+
tables.individuals.metadata_schema = tskit.MetadataSchema(
5849+
{
5850+
"codec": "json",
5851+
"type": "object",
5852+
"properties": {"name": {"type": "string"}},
5853+
}
5854+
)
5855+
tables.individuals.add_row(metadata={"name": "ind1"})
5856+
tables.individuals.add_row(metadata={"name": "ind2"})
5857+
5858+
# Add nodes
5859+
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=0)
5860+
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=1)
5861+
ts = tables.tree_sequence()
5862+
5863+
result = ts.map_to_vcf_model(name_metadata_key="name")
5864+
assert result.individuals_name.shape == (2,)
5865+
assert result.individuals_name[0] == "ind1"
5866+
assert result.individuals_name[1] == "ind2"
5867+
5868+
def test_custom_individual_names(self):
5869+
tables = tskit.TableCollection(1.0)
5870+
tables.individuals.add_row()
5871+
tables.individuals.add_row()
5872+
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=0)
5873+
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=1)
5874+
ts = tables.tree_sequence()
5875+
5876+
custom_names = ["individual_A", "individual_B"]
5877+
result = ts.map_to_vcf_model(individual_names=custom_names)
5878+
assert result.individuals_name.shape == (2,)
5879+
assert result.individuals_name[0] == "individual_A"
5880+
assert result.individuals_name[1] == "individual_B"
5881+
5882+
def test_name_conflict_error(self):
5883+
tables = tskit.TableCollection(1.0)
5884+
ts = tables.tree_sequence()
5885+
with pytest.raises(
5886+
ValueError,
5887+
match="Cannot specify both name_metadata_key and individual_names",
5888+
):
5889+
ts.map_to_vcf_model(
5890+
name_metadata_key="name", individual_names=["custom_name"]
5891+
)
5892+
5893+
def test_name_count_mismatch_error(self):
5894+
tables = tskit.TableCollection(1.0)
5895+
tables.individuals.add_row()
5896+
tables.individuals.add_row()
5897+
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=0)
5898+
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=1)
5899+
ts = tables.tree_sequence()
5900+
5901+
with pytest.raises(
5902+
ValueError, match="number of individuals does not match the number of names"
5903+
):
5904+
ts.map_to_vcf_model(individual_names=["only_one_name"])
5905+
5906+
def test_all_individuals_no_nodes(self):
5907+
tables = tskit.TableCollection(1.0)
5908+
tables.individuals.add_row()
5909+
tables.individuals.add_row()
5910+
ts = tables.tree_sequence()
5911+
result = ts.map_to_vcf_model()
5912+
assert result.individuals_nodes.shape == (2, 0)

0 commit comments

Comments
 (0)