63
63
class VcfModelMapping :
64
64
individuals_nodes : np .ndarray
65
65
individuals_name : np .ndarray
66
+ transformed_positions : np .ndarray
67
+ contig_length : int
66
68
67
69
68
70
class CoalescenceRecord (NamedTuple ):
@@ -10636,13 +10638,15 @@ def map_to_vcf_model(
10636
10638
name_metadata_key = None ,
10637
10639
individual_names = None ,
10638
10640
include_non_sample_nodes = None ,
10641
+ position_transform = None ,
10639
10642
):
10640
10643
"""
10641
10644
Maps the sample nodes in this tree sequence to a representation suitable for
10642
10645
VCF output, using the individuals if present.
10643
10646
10644
- Creates a VcfModelMapping object that contains both the nodes-to-individual
10645
- mapping as a 2D array of (individuals, nodes) and the individual names. The
10647
+ Creates a VcfModelMapping object that contains a nodes-to-individual
10648
+ mapping as a 2D array of (individuals, nodes), the individual names and VCF
10649
+ compatible site positions and contig length. The
10646
10650
mapping is created by first checking if the tree sequence contains individuals.
10647
10651
If it does, the mapping is created using the individuals in the tree sequence.
10648
10652
By default only the sample nodes of the individuals are included in the mapping,
@@ -10652,6 +10656,12 @@ def map_to_vcf_model(
10652
10656
If no individuals are present, the mapping is created using only the sample nodes
10653
10657
and the specified ploidy.
10654
10658
10659
+ As the tskit data model allows non-integer positions, site positions and contig
10660
+ length are transformed to integer values suitable for VCF output. The
10661
+ transformation is done using the `position_transform` function, which must
10662
+ return an integer numpy array the same dimension as the input. By default,
10663
+ this is set to ``numpy.round()`` which will round values to the nearest integer.
10664
+
10655
10665
If neither `name_metadata_key` nor `individual_names` is not specified, the
10656
10666
individual names are set to "tsk_{individual_id}" for each individual. If
10657
10667
no individuals are present, the individual names are set to "tsk_{i}" with
@@ -10672,9 +10682,20 @@ def map_to_vcf_model(
10672
10682
be specified simultaneously with name_metadata_key.
10673
10683
:param bool include_non_sample_nodes: If True, include all nodes belonging to
10674
10684
the individuals in the mapping. If False, only include sample nodes.
10675
- Deafults to False.
10676
- :return: A VcfModelMapping containing the node-to-individual mapping and
10677
- individual names.
10685
+ Defaults to False.
10686
+ :param position_transform: A callable that transforms the
10687
+ site position values into integer valued coordinates suitable for
10688
+ VCF. The function takes a single positional parameter x and must
10689
+ return an integer numpy array the same dimension as x. By default,
10690
+ this is set to ``numpy.round()`` which will round values to the
10691
+ nearest integer. If the string "legacy" is provided here, the
10692
+ pre 0.2.0 legacy behaviour of rounding values to the nearest integer
10693
+ (starting from 1) and avoiding the output of identical positions
10694
+ by incrementing is used.
10695
+ See the :ref:`sec_export_vcf_modifying_coordinates` for examples
10696
+ and more information.
10697
+ :return: A VcfModelMapping containing the node-to-individual mapping,
10698
+ individual names, transformed positions, and transformed contig length.
10678
10699
:raises ValueError: If both name_metadata_key and individual_names are specified,
10679
10700
if ploidy is specified when individuals are present, if an invalid individual
10680
10701
ID is specified, if a specified individual has no nodes, or if the number of
@@ -10752,7 +10773,37 @@ def map_to_vcf_model(
10752
10773
"The number of individuals does not match the number of names"
10753
10774
)
10754
10775
10755
- return VcfModelMapping (individuals_nodes , individual_names )
10776
+ def legacy_position_transform (positions ):
10777
+ """
10778
+ Transforms positions in the tree sequence into VCF coordinates under
10779
+ the pre 0.2.0 legacy rule.
10780
+ """
10781
+ last_pos = 0
10782
+ transformed = []
10783
+ for pos in positions :
10784
+ pos = int (round (pos ))
10785
+ if pos <= last_pos :
10786
+ pos = last_pos + 1
10787
+ transformed .append (pos )
10788
+ last_pos = pos
10789
+ return transformed
10790
+
10791
+ if position_transform is None :
10792
+ position_transform = np .round
10793
+ elif position_transform == "legacy" :
10794
+ position_transform = legacy_position_transform
10795
+ transformed_positions = np .array (
10796
+ position_transform (self .sites_position ), dtype = int
10797
+ )
10798
+ if transformed_positions .shape != (self .num_sites ,):
10799
+ raise ValueError (
10800
+ "Position transform must return an array of the same length"
10801
+ )
10802
+ contig_length = max (1 , int (position_transform ([self .sequence_length ])[0 ]))
10803
+
10804
+ return VcfModelMapping (
10805
+ individuals_nodes , individual_names , transformed_positions , contig_length
10806
+ )
10756
10807
10757
10808
############################################
10758
10809
#
0 commit comments