@@ -182,42 +182,92 @@ def add_provenance(ts, output_file):
@click.command()
+@click.argument("ts", type=click.Path(dir_okay=False))
+@click.argument("match_db", type=click.Path(dir_okay=False))
+@click.option(
+    "--additional-problematic-sites",
+    default=None,
+    type=click.Path(exists=True, dir_okay=False),
+    help="File containing the list of additional problematic sites to exclude.",
+)
+@click.option("-v", "--verbose", count=True)
+@click.option("-l", "--log-file", default=None, type=click.Path(dir_okay=False))
+def initialise(ts, match_db, additional_problematic_sites, verbose, log_file):
+    """
+    Initialise a new base tree sequence to begin inference.
+    """
+    setup_logging(verbose, log_file)
+
+    additional_problematic = []
+    if additional_problematic_sites is not None:
+        additional_problematic = (
+            np.loadtxt(additional_problematic_sites, ndmin=1).astype(int).tolist()
+        )
+        logger.info(
+            f"Excluding additional {len(additional_problematic)} problematic sites"
+        )
+
+    base_ts = inference.initial_ts(additional_problematic)
+    base_ts.dump(ts)
+    logger.info(f"New base ts at {ts}")
+    inference.MatchDb.initialise(match_db)
+
+
+@click.command()
+@click.argument("metadata", type=click.Path(exists=True, dir_okay=False))
+@click.option("--counts/--no-counts", default=False)
+@click.option("-v", "--verbose", count=True)
+@click.option("-l", "--log-file", default=None, type=click.Path(dir_okay=False))
+def list_dates(metadata, counts, verbose, log_file):
+    """
+    List the dates included in the specified metadata DB.
+    """
+    setup_logging(verbose, log_file)
+    with sc2ts.MetadataDb(metadata) as metadata_db:
+        counter = metadata_db.date_sample_counts()
+        if counts:
+            for k, v in counter.items():
+                print(k, v, sep="\t")
+
+        else:
+            for k in counter:
+                print(k)
+
+
+@click.command()
+@click.argument("base_ts", type=click.Path(exists=True, dir_okay=False))
+@click.argument("date")
@click.argument("alignments", type=click.Path(exists=True, dir_okay=False))
@click.argument("metadata", type=click.Path(exists=True, dir_okay=False))
-@click.argument("output-prefix")
+@click.argument("matches", type=click.Path(exists=True, dir_okay=False))
+@click.argument("output_ts", type=click.Path(dir_okay=False))
@click.option(
-    "-b",
-    "--base",
-    type=click.Path(dir_okay=False, exists=True),
-    default=None,
-    help=(
-        "The base tree sequence to match against. If not specified, create "
-        "a new initial base containing the reference. "
-    ),
+    "--num-mismatches",
+    default=3,
+    show_default=True,
+    type=float,
+    help="Number of mismatches to accept in favour of recombination",
+)
+@click.option(
+    "--hmm-cost-threshold",
+    default=5,
+    type=float,
+    show_default=True,
+    help="The maximum HMM cost for samples to be included unconditionally",
)
-@click.option("--num-mismatches", default=None, type=float, help="num-mismatches")
-@click.option("--max-hmm-cost", default=5, type=float, help="max-hmm-cost")
@click.option(
    "--min-group-size",
    default=10,
+    show_default=True,
    type=int,
-    help="Minimum size of groups of reconsidered samples",
-    show_default=True
+    help="Minimum size of groups of reconsidered samples for inclusion",
)
@click.option(
-    "--num-past-days",
-    default=None,
+    "--retrospective-window",
+    default=30,
+    show_default=True,
    type=int,
-    help="Number of past days to retrieve filtered samples",
-)
-@click.option(
-    "--max-submission-delay",
-    default=None,
-    type=int,
-    help=(
-        "The maximum number of days between the sample and its submission date "
-        "for it to be included in the inference"
-    ),
+    help="Number of days in the past to reconsider potential matches",
)
@click.option(
    "--max-daily-samples",
@@ -228,91 +278,86 @@ def add_provenance(ts, output_file):
        "is greater than this, randomly subsample."
    ),
)
-@click.option("--num-threads", default=0, type=int, help="Number of match threads")
-@click.option("--random-seed", default=42, type=int, help="Random seed for subsampling")
-@click.option("--stop-date", default="2030-01-01", type=str, help="Stopping date")
@click.option(
-    "--additional-problematic-sites",
-    default=None,
-    type=str,
-    help="File containing the list of additional problematic sites to exclude.",
+    "--random-seed",
+    default=42,
+    type=int,
+    help="Random seed for subsampling",
+    show_default=True,
)
-@click.option("-p", "--precision", default=None, type=int, help="Match precision")
-@click.option("--no-progress", default=False, type=bool, help="Don't show progress")
+@click.option(
+    "--num-threads",
+    default=0,
+    type=int,
+    help="Number of match threads (defaults to one)",
+)
+@click.option("--progress/--no-progress", default=True)
@click.option("-v", "--verbose", count=True)
@click.option("-l", "--log-file", default=None, type=click.Path(dir_okay=False))
-def daily_extend(
+@click.option(
+    "-f",
+    "--force",
+    is_flag=True,
+    flag_value=True,
+    help="Force clearing newer matches from DB",
+)
+def extend(
+    base_ts,
+    date,
    alignments,
    metadata,
-    output_prefix,
-    base,
+    matches,
+    output_ts,
    num_mismatches,
-    max_hmm_cost,
+    hmm_cost_threshold,
    min_group_size,
-    num_past_days,
-    max_submission_delay,
+    retrospective_window,
    max_daily_samples,
    num_threads,
    random_seed,
-    stop_date,
-    additional_problematic_sites,
-    precision,
-    no_progress,
+    progress,
    verbose,
    log_file,
+    force,
):
    """
-    Sequentially extend the trees by adding samples in daily batches.
+    Extend base_ts with sequences for the specified date, using the specified
+    alignments and metadata databases, updating the specified matches
+    database, and outputting the result to the specified file.
    """
    setup_logging(verbose, log_file)
-    rng = random.Random(random_seed)
-
-    additional_problematic = []
-    if additional_problematic_sites is not None:
-        additional_problematic = (
-            np.loadtxt(additional_problematic_sites).astype(int).tolist()
-        )
-        logger.info(
-            f"Excluding additional {len(additional_problematic)} problematic sites"
-        )
-
-    match_db_path = f"{output_prefix}match.db"
-    if base is None:
-        base_ts = inference.initial_ts(additional_problematic)
-        match_db = inference.MatchDb.initialise(match_db_path)
-    else:
-        base_ts = tskit.load(base)
-
-    assert (
-        base_ts.metadata["sc2ts"]["additional_problematic_sites"]
-        == additional_problematic
-    )
-
+    base = tskit.load(base_ts)
+    logger.debug(f"Loaded base ts from {base_ts}")
    with contextlib.ExitStack() as exit_stack:
        alignment_store = exit_stack.enter_context(sc2ts.AlignmentStore(alignments))
        metadata_db = exit_stack.enter_context(sc2ts.MetadataDb(metadata))
-        match_db = exit_stack.enter_context(inference.MatchDb(match_db_path))
-        ts_iter = inference.daily_extend(
+        match_db = exit_stack.enter_context(sc2ts.MatchDb(matches))
+
+        newer_matches = match_db.count_newer(date)
+        if newer_matches > 0:
+            if not force:
+                click.confirm(
+                    f"Do you want to remove {newer_matches} newer matches "
+                    f"from MatchDB > {date}?",
+                    abort=True,
+                )
+            match_db.delete_newer(date)
+        ts_out = inference.extend(
            alignment_store=alignment_store,
            metadata_db=metadata_db,
-            base_ts=base_ts,
+            base_ts=base,
+            date=date,
            match_db=match_db,
            num_mismatches=num_mismatches,
-            max_hmm_cost=max_hmm_cost,
+            hmm_cost_threshold=hmm_cost_threshold,
            min_group_size=min_group_size,
-            num_past_days=num_past_days,
-            max_submission_delay=max_submission_delay,
+            retrospective_window=retrospective_window,
            max_daily_samples=max_daily_samples,
-            rng=rng,
-            precision=precision,
+            random_seed=random_seed,
            num_threads=num_threads,
-            show_progress=not no_progress,
+            show_progress=progress,
        )
-        for ts, date in ts_iter:
-            output_ts = output_prefix + date + ".ts"
-            add_provenance(ts, output_ts)
-            if date >= stop_date:
-                break
+        add_provenance(ts_out, output_ts)


@click.command()
@@ -477,6 +522,8 @@ def cli():
cli.add_command(export_alignments)
cli.add_command(export_metadata)

-cli.add_command(daily_extend)
+cli.add_command(initialise)
+cli.add_command(list_dates)
+cli.add_command(extend)
cli.add_command(validate)
cli.add_command(annotate_recombinants)
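For orientation, here is a minimal sketch of how the three new commands chain together: initialise is run once to create the base tree sequence and match database, list_dates shows which days the metadata DB covers, and extend is then run once per day against the same match database. The snippet drives the command objects through click's CliRunner; the sc2ts.cli import path and all file names are assumptions for illustration only, while the arguments follow the signatures shown in the patch.

# Hypothetical usage sketch; paths and import location are placeholders.
from click.testing import CliRunner

from sc2ts import cli  # assumed module holding the commands defined above

runner = CliRunner()

# 1. Create an empty base tree sequence and an empty match database.
runner.invoke(cli.initialise, ["base.ts", "match.db"])

# 2. List the sample dates (with per-day counts) present in the metadata DB.
runner.invoke(cli.list_dates, ["--counts", "metadata.db"])

# 3. Add one day's samples to the base tree sequence; the match DB is
#    updated in place and the result is written to a new .ts file.
runner.invoke(
    cli.extend,
    [
        "base.ts",        # base_ts: output of initialise (or a previous extend)
        "2021-01-01",     # date of the samples to add
        "alignments.db",  # alignment store
        "metadata.db",    # metadata DB
        "match.db",       # matches DB, updated in place
        "2021-01-01.ts",  # output_ts
        "--num-mismatches", "3",
    ],
)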