@@ -64,11 +64,12 @@ def reduce_to_contemporaneous(ts):
64
64
def preprocess_ts (
65
65
tree_sequence ,
66
66
* ,
67
- minimum_gap = 1000000 ,
68
- remove_telomeres = True ,
67
+ minimum_gap = None ,
68
+ remove_telomeres = None ,
69
69
filter_populations = False ,
70
70
filter_individuals = False ,
71
71
filter_sites = False ,
72
+ delete_intervals = None ,
72
73
** kwargs ,
73
74
):
74
75
"""
@@ -81,9 +82,10 @@ def preprocess_ts(
81
82
:param TreeSequence tree_sequence: The input :class:`tskit.TreeSequence`
82
83
to be preprocessed.
83
84
:param float minimum_gap: The minimum gap between sites to remove from the tree
84
- sequence. Default: " 1000000"
85
+ sequence. Default: ``None`` treated as ``1000000``
85
86
:param bool remove_telomeres: Should all material before the first site and after the
86
- last site be removed, regardless of the length. Default: "True"
87
+ last site be removed, regardless of the length. Default: ``None`` treated as
88
+ ``True``
87
89
:param bool filter_populations: parameter passed to the ``tskit.simplify``
88
90
command. Unlike calling that command directly, this defaults to ``False``, such
89
91
that all populations in the tree sequence are kept.
@@ -93,71 +95,96 @@ def preprocess_ts(
93
95
:param bool filter_sites: parameter passed to the ``tskit.simplify``
94
96
command. Unlike calling that command directly, this defaults to ``False``, such
95
97
that all sites in the tree sequence are kept
98
+ :param array_like delete_intervals: A list of (start, end) pairs describing the
99
+ genomic intervals (gaps) to delete. This is usually left as ``None``
100
+ (the default) in which case ``minimum_gap`` and ``remove_telomeres`` are used
101
+ to determine the gaps to remove, and the calculated intervals are recorded in
102
+ the provenance of the resulting tree sequence.
96
103
:param \\ **kwargs: All further keyword arguments are passed to the ``tskit.simplify``
97
104
command.
98
105
99
106
:return: A tree sequence with gaps removed.
100
107
:rtype: tskit.TreeSequence
101
108
"""
109
+
102
110
logger .info ("Beginning preprocessing" )
103
111
logger .info (f"Minimum_gap: { minimum_gap } and remove_telomeres: { remove_telomeres } " )
104
- if tree_sequence .num_sites < 1 :
105
- raise ValueError ("Invalid tree sequence: no sites present" )
112
+ if delete_intervals is not None and (
113
+ minimum_gap is not None or remove_telomeres is not None
114
+ ):
115
+ raise ValueError (
116
+ "Cannot specify both delete_intervals and minimum_gap/remove_telomeres"
117
+ )
106
118
107
119
tables = tree_sequence .dump_tables ()
108
120
sites = tables .sites .position [:]
109
- delete_intervals = []
110
- if remove_telomeres :
111
- first_site = sites [0 ] - 1
112
- if first_site > 0 :
113
- delete_intervals .append ([0 , first_site ])
114
- logger .info (
115
- "REMOVING TELOMERE: Snip topology "
116
- "from 0 to first site at {}." .format (first_site )
117
- )
118
- last_site = sites [- 1 ] + 1
119
- sequence_length = tables .sequence_length
120
- if last_site < sequence_length :
121
- delete_intervals .append ([last_site , sequence_length ])
122
- logger .info (
123
- "REMOVING TELOMERE: Snip topology "
124
- "from {} to end of sequence at {}." .format (last_site , sequence_length )
125
- )
126
- gaps = sites [1 :] - sites [:- 1 ]
127
- threshold_gaps = np .where (gaps >= minimum_gap )[0 ]
128
- for gap in threshold_gaps :
129
- gap_start = sites [gap ] + 1
130
- gap_end = sites [gap + 1 ] - 1
131
- if gap_end > gap_start :
132
- logger .info (
133
- "Gap Size is {}. Snip topology "
134
- "from {} to {}." .format (gap_end - gap_start , gap_start , gap_end )
135
- )
136
- delete_intervals .append ([gap_start , gap_end ])
137
- delete_intervals = sorted (delete_intervals , key = lambda x : x [0 ])
121
+ if delete_intervals is None :
122
+ if minimum_gap is None :
123
+ minimum_gap = 1000000
124
+ if remove_telomeres is None :
125
+ remove_telomeres = True
126
+
127
+ if tree_sequence .num_sites < 1 :
128
+ raise ValueError ("Invalid tree sequence: no sites present" )
129
+ delete_intervals = []
130
+ if remove_telomeres :
131
+ first_site = sites [0 ] - 1
132
+ if first_site > 0 :
133
+ delete_intervals .append ([0 , first_site ])
134
+ logger .info (
135
+ "REMOVING TELOMERE: Snip topology "
136
+ "from 0 to first site at {}." .format (first_site )
137
+ )
138
+ last_site = sites [- 1 ] + 1
139
+ sequence_length = tables .sequence_length
140
+ if last_site < sequence_length :
141
+ delete_intervals .append ([last_site , sequence_length ])
142
+ logger .info (
143
+ "REMOVING TELOMERE: Snip topology "
144
+ "from {} to end of sequence at {}." .format (
145
+ last_site , sequence_length
146
+ )
147
+ )
148
+ gaps = sites [1 :] - sites [:- 1 ]
149
+ threshold_gaps = np .where (gaps >= minimum_gap )[0 ]
150
+ for gap in threshold_gaps :
151
+ gap_start = sites [gap ] + 1
152
+ gap_end = sites [gap + 1 ] - 1
153
+ if gap_end > gap_start :
154
+ logger .info (
155
+ "Gap Size is {}. Snip topology "
156
+ "from {} to {}." .format (gap_end - gap_start , gap_start , gap_end )
157
+ )
158
+ delete_intervals .append ([gap_start , gap_end ])
159
+ delete_intervals = sorted (delete_intervals , key = lambda x : x [0 ])
138
160
if len (delete_intervals ) > 0 :
139
161
tables .delete_intervals (delete_intervals , simplify = False )
140
162
tables .simplify (
141
163
filter_populations = filter_populations ,
142
164
filter_individuals = filter_individuals ,
143
165
filter_sites = filter_sites ,
166
+ record_provenance = False ,
144
167
** kwargs ,
145
168
)
146
- provenance .record_provenance (
147
- tables ,
148
- "preprocess_ts" ,
149
- minimum_gap = minimum_gap ,
150
- remove_telomeres = remove_telomeres ,
151
- delete_intervals = delete_intervals ,
152
- )
153
169
else :
154
170
logger .info ("No gaps to remove" )
155
171
tables .simplify (
156
172
filter_populations = filter_populations ,
157
173
filter_individuals = filter_individuals ,
158
174
filter_sites = filter_sites ,
175
+ record_provenance = False ,
159
176
** kwargs ,
160
177
)
178
+ provenance .record_provenance (
179
+ tables ,
180
+ "preprocess_ts" ,
181
+ minimum_gap = minimum_gap ,
182
+ remove_telomeres = remove_telomeres ,
183
+ filter_populations = filter_populations ,
184
+ filter_individuals = filter_individuals ,
185
+ filter_sites = filter_sites ,
186
+ delete_intervals = delete_intervals ,
187
+ )
161
188
return tables .tree_sequence ()
162
189
163
190
0 commit comments