Skip to content

Commit 4195624

Browse files
committed
Include positions in VCF mapping
1 parent eee766c commit 4195624

File tree

5 files changed

+129
-38
lines changed

5 files changed

+129
-38
lines changed

python/CHANGELOG.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
--------------------
2+
[0.6.5] - 2025-0X-XX
3+
--------------------
4+
5+
**Features**
6+
7+
- ``TreeSequence.map_to_vcf_model`` now also returns the transformed positions and
8+
contig length. (:user:`benjeffery`, :pr:`XXXX`, :issue:`3173`)
9+
110
--------------------
211
[0.6.4] - 2025-05-21
312
--------------------

python/tests/test_highlevel.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5910,3 +5910,62 @@ def test_all_individuals_no_nodes(self):
59105910
ts = tables.tree_sequence()
59115911
result = ts.map_to_vcf_model()
59125912
assert result.individuals_nodes.shape == (2, 0)
5913+
5914+
def test_position_transform_default_and_custom(self):
5915+
tables = tskit.TableCollection(10.6)
5916+
tables.sites.add_row(position=1.3, ancestral_state="A")
5917+
tables.sites.add_row(position=5.7, ancestral_state="T")
5918+
tables.sites.add_row(position=9.9, ancestral_state="C")
5919+
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
5920+
ts = tables.tree_sequence()
5921+
5922+
result = ts.map_to_vcf_model()
5923+
assert np.array_equal(result.transformed_positions, [1, 6, 10])
5924+
assert result.contig_length == 11
5925+
5926+
def floor_transform(positions):
5927+
return np.floor(positions).astype(int)
5928+
5929+
result = ts.map_to_vcf_model(position_transform=floor_transform)
5930+
assert np.array_equal(result.transformed_positions, [1, 5, 9])
5931+
assert result.contig_length == 10
5932+
5933+
def test_legacy_position_transform(self):
5934+
# Test legacy transform with duplicate positions
5935+
tables = tskit.TableCollection(10.0)
5936+
tables.sites.add_row(position=1.4, ancestral_state="A")
5937+
tables.sites.add_row(position=1.6, ancestral_state="T")
5938+
tables.sites.add_row(position=1.7, ancestral_state="T")
5939+
tables.sites.add_row(position=3.2, ancestral_state="C")
5940+
tables.sites.add_row(position=3.8, ancestral_state="G")
5941+
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
5942+
ts = tables.tree_sequence()
5943+
5944+
result = ts.map_to_vcf_model(position_transform="legacy")
5945+
assert np.array_equal(result.transformed_positions, [1, 2, 3, 4, 5])
5946+
assert result.contig_length == 10
5947+
5948+
def test_position_transform_no_sites(self):
5949+
tables = tskit.TableCollection(5.5)
5950+
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
5951+
ts = tables.tree_sequence()
5952+
5953+
result = ts.map_to_vcf_model()
5954+
assert result.transformed_positions.shape == (0,)
5955+
assert result.contig_length == 6
5956+
5957+
def test_invalid_position_transform_return_shape(self):
5958+
tables = tskit.TableCollection(10.0)
5959+
tables.sites.add_row(position=1.0, ancestral_state="A")
5960+
tables.sites.add_row(position=5.0, ancestral_state="T")
5961+
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
5962+
ts = tables.tree_sequence()
5963+
5964+
def bad_transform(positions):
5965+
return np.array([1]) # Wrong length
5966+
5967+
with pytest.raises(
5968+
ValueError,
5969+
match="Position transform must return an array of the same length",
5970+
):
5971+
ts.map_to_vcf_model(position_transform=bad_transform)

python/tskit/_version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# Definitive location for the version number.
22
# During development, should be x.y.z.devN
33
# For beta should be x.y.zbN
4-
tskit_version = "0.6.4"
4+
tskit_version = "0.6.5.dev0"

python/tskit/trees.py

Lines changed: 57 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@
6363
class VcfModelMapping:
6464
individuals_nodes: np.ndarray
6565
individuals_name: np.ndarray
66+
transformed_positions: np.ndarray
67+
contig_length: int
6668

6769

6870
class CoalescenceRecord(NamedTuple):
@@ -10636,13 +10638,15 @@ def map_to_vcf_model(
1063610638
name_metadata_key=None,
1063710639
individual_names=None,
1063810640
include_non_sample_nodes=None,
10641+
position_transform=None,
1063910642
):
1064010643
"""
1064110644
Maps the sample nodes in this tree sequence to a representation suitable for
1064210645
VCF output, using the individuals if present.
1064310646
10644-
Creates a VcfModelMapping object that contains both the nodes-to-individual
10645-
mapping as a 2D array of (individuals, nodes) and the individual names. The
10647+
Creates a VcfModelMapping object that contains a nodes-to-individual
10648+
mapping as a 2D array of (individuals, nodes), the individual names and VCF
10649+
compatible site positions and contig length. The
1064610650
mapping is created by first checking if the tree sequence contains individuals.
1064710651
If it does, the mapping is created using the individuals in the tree sequence.
1064810652
By default only the sample nodes of the individuals are included in the mapping,
@@ -10652,6 +10656,12 @@ def map_to_vcf_model(
1065210656
If no individuals are present, the mapping is created using only the sample nodes
1065310657
and the specified ploidy.
1065410658
10659+
As the tskit data model allows non-integer positions, site positions and contig
10660+
length are transformed to integer values suitable for VCF output. The
10661+
transformation is done using the `position_transform` function, which must
10662+
return an integer numpy array the same dimension as the input. By default,
10663+
this is set to ``numpy.round()`` which will round values to the nearest integer.
10664+
1065510665
If neither `name_metadata_key` nor `individual_names` is not specified, the
1065610666
individual names are set to "tsk_{individual_id}" for each individual. If
1065710667
no individuals are present, the individual names are set to "tsk_{i}" with
@@ -10672,9 +10682,20 @@ def map_to_vcf_model(
1067210682
be specified simultaneously with name_metadata_key.
1067310683
:param bool include_non_sample_nodes: If True, include all nodes belonging to
1067410684
the individuals in the mapping. If False, only include sample nodes.
10675-
Deafults to False.
10676-
:return: A VcfModelMapping containing the node-to-individual mapping and
10677-
individual names.
10685+
Defaults to False.
10686+
:param position_transform: A callable that transforms the
10687+
site position values into integer valued coordinates suitable for
10688+
VCF. The function takes a single positional parameter x and must
10689+
return an integer numpy array the same dimension as x. By default,
10690+
this is set to ``numpy.round()`` which will round values to the
10691+
nearest integer. If the string "legacy" is provided here, the
10692+
pre 0.2.0 legacy behaviour of rounding values to the nearest integer
10693+
(starting from 1) and avoiding the output of identical positions
10694+
by incrementing is used.
10695+
See the :ref:`sec_export_vcf_modifying_coordinates` for examples
10696+
and more information.
10697+
:return: A VcfModelMapping containing the node-to-individual mapping,
10698+
individual names, transformed positions, and transformed contig length.
1067810699
:raises ValueError: If both name_metadata_key and individual_names are specified,
1067910700
if ploidy is specified when individuals are present, if an invalid individual
1068010701
ID is specified, if a specified individual has no nodes, or if the number of
@@ -10752,7 +10773,37 @@ def map_to_vcf_model(
1075210773
"The number of individuals does not match the number of names"
1075310774
)
1075410775

10755-
return VcfModelMapping(individuals_nodes, individual_names)
10776+
def legacy_position_transform(positions):
10777+
"""
10778+
Transforms positions in the tree sequence into VCF coordinates under
10779+
the pre 0.2.0 legacy rule.
10780+
"""
10781+
last_pos = 0
10782+
transformed = []
10783+
for pos in positions:
10784+
pos = int(round(pos))
10785+
if pos <= last_pos:
10786+
pos = last_pos + 1
10787+
transformed.append(pos)
10788+
last_pos = pos
10789+
return transformed
10790+
10791+
if position_transform is None:
10792+
position_transform = np.round
10793+
elif position_transform == "legacy":
10794+
position_transform = legacy_position_transform
10795+
transformed_positions = np.array(
10796+
position_transform(self.sites_position), dtype=int
10797+
)
10798+
if transformed_positions.shape != (self.num_sites,):
10799+
raise ValueError(
10800+
"Position transform must return an array of the same length"
10801+
)
10802+
contig_length = max(1, int(position_transform([self.sequence_length])[0]))
10803+
10804+
return VcfModelMapping(
10805+
individuals_nodes, individual_names, transformed_positions, contig_length
10806+
)
1075610807

1075710808
############################################
1075810809
#

python/tskit/vcf.py

Lines changed: 3 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -28,22 +28,6 @@
2828
from . import provenance
2929

3030

31-
def legacy_position_transform(positions):
32-
"""
33-
Transforms positions in the tree sequence into VCF coordinates under
34-
the pre 0.2.0 legacy rule.
35-
"""
36-
last_pos = 0
37-
transformed = []
38-
for pos in positions:
39-
pos = int(round(pos))
40-
if pos <= last_pos:
41-
pos = last_pos + 1
42-
transformed.append(pos)
43-
last_pos = pos
44-
return transformed
45-
46-
4731
class VcfWriter:
4832
"""
4933
Writes a VCF representation of the genotypes tree sequence to a
@@ -74,6 +58,7 @@ def __init__(
7458
ploidy=ploidy,
7559
individual_names=individual_names,
7660
include_non_sample_nodes=include_non_sample_nodes,
61+
position_transform=position_transform,
7762
)
7863
# Remove individuals with zero ploidy as these cannot be
7964
# represented in VCF.
@@ -96,21 +81,8 @@ def __init__(
9681
if node_id != -1:
9782
self.samples.append(node_id)
9883

99-
# Transform coordinates for VCF
100-
if position_transform is None:
101-
position_transform = np.round
102-
elif position_transform == "legacy":
103-
position_transform = legacy_position_transform
104-
self.transformed_positions = np.array(
105-
position_transform(tree_sequence.tables.sites.position), dtype=int
106-
)
107-
if self.transformed_positions.shape != (tree_sequence.num_sites,):
108-
raise ValueError(
109-
"Position transform must return an array of the same length"
110-
)
111-
self.contig_length = max(
112-
1, int(position_transform([tree_sequence.sequence_length])[0])
113-
)
84+
self.transformed_positions = vcf_model.transformed_positions
85+
self.contig_length = vcf_model.contig_length
11486
if len(self.transformed_positions) > 0:
11587
# Arguably this should be last_pos + 1, but if we hit this
11688
# condition the coordinate systems are all muddled up anyway

0 commit comments

Comments
 (0)