@@ -433,73 +433,6 @@ def asdict(self):
433
433
# last_ts = ts
434
434
435
435
436
- def preprocess (
437
- date ,
438
- * ,
439
- base_ts ,
440
- metadata_db ,
441
- alignment_store ,
442
- max_daily_samples = None ,
443
- show_progress = False ,
444
- ):
445
- samples = []
446
- metadata_matches = list (metadata_db .get (date ))
447
-
448
- if len (metadata_matches ) == 0 :
449
- logger .warn (f"Zero metadata matches for { date } " )
450
- return []
451
-
452
- if date .endswith ("12-31" ):
453
- logger .warning (f"Skipping { len (metadata_matches )} samples for { date } " )
454
- return []
455
-
456
- # TODO implement this.
457
- assert max_daily_samples is None
458
-
459
- keep_sites = base_ts .sites_position .astype (int )
460
- problematic_sites = core .get_problematic_sites ()
461
- samples = []
462
-
463
- with tqdm .tqdm (
464
- metadata_matches ,
465
- desc = f"Preprocess:{ date } " ,
466
- disable = not show_progress ,
467
- ) as bar :
468
- for md in bar :
469
- strain = md ["strain" ]
470
- try :
471
- alignment = alignment_store [strain ]
472
- except KeyError :
473
- logger .debug (f"No alignment stored for { strain } " )
474
- continue
475
-
476
- sample = Sample (strain , date , metadata = md )
477
- ma = alignments .encode_and_mask (alignment )
478
- # Always mask the problematic_sites as well. We need to do this
479
- # for follow-up matching to inspect recombinants, as tsinfer
480
- # needs us to keep all sites in the table when doing mirrored
481
- # coordinates.
482
- ma .alignment [problematic_sites ] = - 1
483
- sample .alignment_qc = ma .qc_summary ()
484
- sample .masked_sites = ma .masked_sites
485
- sample .alignment = ma .alignment [keep_sites ]
486
- samples .append (sample )
487
- num_Ns = ma .original_base_composition .get ("N" , 0 )
488
- non_nuc_counts = dict (ma .original_base_composition )
489
- for nuc in "ACGT" :
490
- del non_nuc_counts [nuc ]
491
- counts = "," .join (
492
- f"{ key } ={ count } " for key , count in sorted (non_nuc_counts .items ())
493
- )
494
- num_masked = len (ma .masked_sites )
495
- logger .debug (f"Mask { strain } : masked={ num_masked } { counts } " )
496
-
497
- logger .info (
498
- f"Got alignments for { len (samples )} of { len (metadata_matches )} in metadata"
499
- )
500
- return samples
501
-
502
-
503
436
def match_samples (
504
437
date ,
505
438
samples ,
@@ -563,6 +496,47 @@ def check_base_ts(ts):
563
496
assert len (sc2ts_md ["samples_strain" ]) == ts .num_samples
564
497
565
498
499
+ def preprocess (samples_md , base_ts , date , alignment_store , show_progress = False ):
500
+ keep_sites = base_ts .sites_position .astype (int )
501
+ problematic_sites = core .get_problematic_sites ()
502
+
503
+ samples = []
504
+ with tqdm .tqdm (
505
+ samples_md ,
506
+ desc = f"Preprocess" ,
507
+ disable = not show_progress ,
508
+ ) as bar :
509
+ for md in bar :
510
+ strain = md ["strain" ]
511
+ try :
512
+ alignment = alignment_store [strain ]
513
+ except KeyError :
514
+ logger .debug (f"No alignment stored for { strain } " )
515
+ continue
516
+ sample = Sample (strain , date , metadata = md )
517
+ ma = alignments .encode_and_mask (alignment )
518
+ # Always mask the problematic_sites as well. We need to do this
519
+ # for follow-up matching to inspect recombinants, as tsinfer
520
+ # needs us to keep all sites in the table when doing mirrored
521
+ # coordinates.
522
+ ma .alignment [problematic_sites ] = - 1
523
+ sample .alignment_qc = ma .qc_summary ()
524
+ sample .masked_sites = ma .masked_sites
525
+ sample .alignment = ma .alignment [keep_sites ]
526
+ samples .append (sample )
527
+ num_Ns = ma .original_base_composition .get ("N" , 0 )
528
+ non_nuc_counts = dict (ma .original_base_composition )
529
+ for nuc in "ACGT" :
530
+ del non_nuc_counts [nuc ]
531
+ counts = "," .join (
532
+ f"{ key } ={ count } " for key , count in sorted (non_nuc_counts .items ())
533
+ )
534
+ num_masked = len (ma .masked_sites )
535
+ logger .debug (f"Mask { strain } : masked={ num_masked } { counts } " )
536
+
537
+ return samples
538
+
539
+
566
540
def extend (
567
541
* ,
568
542
alignment_store ,
@@ -594,19 +568,22 @@ def extend(
594
568
f"mutations={ base_ts .num_mutations } ;date={ base_ts .metadata ['sc2ts' ]['date' ]} "
595
569
)
596
570
571
+ metadata_matches = list (metadata_db .get (date ))
572
+ # TODO implement this.
573
+ assert max_daily_samples is None
574
+
597
575
samples = preprocess (
598
- date ,
599
- metadata_db = metadata_db ,
600
- alignment_store = alignment_store ,
601
- base_ts = base_ts ,
602
- max_daily_samples = max_daily_samples ,
603
- show_progress = show_progress ,
576
+ metadata_matches , base_ts , date , alignment_store , show_progress = show_progress
604
577
)
605
578
606
579
if len (samples ) == 0 :
607
580
logger .warning (f"Nothing to do for { date } " )
608
581
return base_ts
609
582
583
+ logger .info (
584
+ f"Got alignments for { len (samples )} of { len (metadata_matches )} in metadata"
585
+ )
586
+
610
587
match_samples (
611
588
date ,
612
589
samples ,
0 commit comments