@@ -443,7 +443,7 @@ def preprocess(
443
443
logger .warn (f"Zero metadata matches for { date } " )
444
444
return []
445
445
446
- if date .endswith ("01-01 " ):
446
+ if date .endswith ("12-31 " ):
447
447
logger .warning (f"Skipping { len (metadata_matches )} samples for { date } " )
448
448
return []
449
449
@@ -506,11 +506,37 @@ def match_samples(
506
506
samples = samples ,
507
507
ts = base_ts ,
508
508
num_mismatches = num_mismatches ,
509
- precision = precision ,
509
+ precision = 2 ,
510
510
num_threads = num_threads ,
511
511
show_progress = show_progress ,
512
512
mirror_coordinates = mirror_coordinates ,
513
513
)
514
+ samples_to_rerun = []
515
+ for sample in samples :
516
+ hmm_cost = sample .get_hmm_cost (num_mismatches )
517
+ logger .debug (
518
+ f"First sketch: { sample .strain } hmm_cost={ hmm_cost } path={ sample .path } "
519
+ )
520
+ if hmm_cost >= 2 :
521
+ sample .path .clear ()
522
+ sample .mutations .clear ()
523
+ samples_to_rerun .append (sample )
524
+
525
+ if len (samples_to_rerun ) > 0 :
526
+ match_tsinfer (
527
+ samples = samples_to_rerun ,
528
+ ts = base_ts ,
529
+ num_mismatches = num_mismatches ,
530
+ precision = precision ,
531
+ num_threads = num_threads ,
532
+ show_progress = show_progress ,
533
+ mirror_coordinates = mirror_coordinates ,
534
+ )
535
+ for sample in samples_to_rerun :
536
+ hmm_cost = sample .get_hmm_cost (num_mismatches )
537
+ logger .debug (
538
+ f"Final HMM pass:{ sample .strain } hmm_cost={ hmm_cost } path={ sample .path } "
539
+ )
514
540
515
541
match_db .add (samples , date , num_mismatches )
516
542
@@ -801,14 +827,20 @@ def solve_num_mismatches(ts, k):
801
827
r = 1e-3
802
828
mu = 1e-20
803
829
else :
804
- mu = 1e-6
830
+ # NOTE: the magnitude of mu matters because it puts a limit
831
+ # on how low we can push the HMM precision. We should be able to solve
832
+ # for the optimal value of this parameter such that the magnitude of the
833
+ # values within the HMM are as large as possible (so that we can truncate
834
+ # usefully).
835
+ mu = 1e-3
805
836
denom = (1 - mu ) ** k + (n - 1 ) * mu ** k
806
837
r = n * mu ** k / denom
807
838
assert mu < 0.5
808
839
assert r < 0.5
809
840
810
- # Add a tiny bit of extra mass for recombination so that we deterministically
841
+ # Add a little bit of extra mass for recombination so that we deterministically
811
842
# choose to recombine over k mutations
843
+ # NOTE: the magnitude of this value will depend also on mu, see above.
812
844
r += r * 0.01
813
845
ls_recomb = np .full (m - 1 , r )
814
846
ls_mismatch = np .full (m , mu )
0 commit comments