Add systematic matching tests against fixtures

jeromekelleher · jeromekelleher · commit e8435e587617 · 2024-08-30T16:02:32.000+01:00
diff --git a/sc2ts/inference.py b/sc2ts/inference.py
@@ -433,73 +433,6 @@ def asdict(self):
 #         last_ts = ts
 
 
-def preprocess(
-    date,
-    *,
-    base_ts,
-    metadata_db,
-    alignment_store,
-    max_daily_samples=None,
-    show_progress=False,
-):
-    samples = []
-    metadata_matches = list(metadata_db.get(date))
-
-    if len(metadata_matches) == 0:
-        logger.warn(f"Zero metadata matches for {date}")
-        return []
-
-    if date.endswith("12-31"):
-        logger.warning(f"Skipping {len(metadata_matches)} samples for {date}")
-        return []
-
-    # TODO implement this.
-    assert max_daily_samples is None
-
-    keep_sites = base_ts.sites_position.astype(int)
-    problematic_sites = core.get_problematic_sites()
-    samples = []
-
-    with tqdm.tqdm(
-        metadata_matches,
-        desc=f"Preprocess:{date}",
-        disable=not show_progress,
-    ) as bar:
-        for md in bar:
-            strain = md["strain"]
-            try:
-                alignment = alignment_store[strain]
-            except KeyError:
-                logger.debug(f"No alignment stored for {strain}")
-                continue
-
-            sample = Sample(strain, date, metadata=md)
-            ma = alignments.encode_and_mask(alignment)
-            # Always mask the problematic_sites as well. We need to do this
-            # for follow-up matching to inspect recombinants, as tsinfer
-            # needs us to keep all sites in the table when doing mirrored
-            # coordinates.
-            ma.alignment[problematic_sites] = -1
-            sample.alignment_qc = ma.qc_summary()
-            sample.masked_sites = ma.masked_sites
-            sample.alignment = ma.alignment[keep_sites]
-            samples.append(sample)
-            num_Ns = ma.original_base_composition.get("N", 0)
-            non_nuc_counts = dict(ma.original_base_composition)
-            for nuc in "ACGT":
-                del non_nuc_counts[nuc]
-            counts = ",".join(
-                f"{key}={count}" for key, count in sorted(non_nuc_counts.items())
-            )
-            num_masked = len(ma.masked_sites)
-            logger.debug(f"Mask {strain}: masked={num_masked} {counts}")
-
-    logger.info(
-        f"Got alignments for {len(samples)} of {len(metadata_matches)} in metadata"
-    )
-    return samples
-
-
 def match_samples(
     date,
     samples,
@@ -563,6 +496,47 @@ def check_base_ts(ts):
     assert len(sc2ts_md["samples_strain"]) == ts.num_samples
 
 
+def preprocess(samples_md, base_ts, date, alignment_store, show_progress=False):
+    """Return a Sample for each entry of samples_md with a stored alignment."""
+    keep_sites = base_ts.sites_position.astype(int)
+    problematic_sites = core.get_problematic_sites()
+
+    samples = []
+    with tqdm.tqdm(
+        samples_md,
+        desc="Preprocess",
+        disable=not show_progress,
+    ) as bar:
+        for md in bar:
+            strain = md["strain"]
+            try:
+                alignment = alignment_store[strain]
+            except KeyError:
+                logger.debug(f"No alignment stored for {strain}")
+                continue
+            sample = Sample(strain, date, metadata=md)
+            ma = alignments.encode_and_mask(alignment)
+            # Always mask the problematic_sites as well. We need to do this
+            # for follow-up matching to inspect recombinants, as tsinfer
+            # needs us to keep all sites in the table when doing mirrored
+            # coordinates.
+            ma.alignment[problematic_sites] = -1
+            sample.alignment_qc = ma.qc_summary()
+            sample.masked_sites = ma.masked_sites
+            sample.alignment = ma.alignment[keep_sites]
+            samples.append(sample)
+            non_nuc_counts = dict(ma.original_base_composition)
+            for nuc in "ACGT":
+                non_nuc_counts.pop(nuc, None)  # tolerate a base that never occurs
+            counts = ",".join(
+                f"{key}={count}" for key, count in sorted(non_nuc_counts.items())
+            )
+            num_masked = len(ma.masked_sites)
+            logger.debug(f"Mask {strain}: masked={num_masked} {counts}")
+
+    return samples
+
+
 def extend(
     *,
     alignment_store,
@@ -594,19 +568,22 @@ def extend(
         f"mutations={base_ts.num_mutations};date={base_ts.metadata['sc2ts']['date']}"
     )
 
+    metadata_matches = list(metadata_db.get(date))
+    # TODO implement this.
+    assert max_daily_samples is None
+
     samples = preprocess(
-        date,
-        metadata_db=metadata_db,
-        alignment_store=alignment_store,
-        base_ts=base_ts,
-        max_daily_samples=max_daily_samples,
-        show_progress=show_progress,
+        metadata_matches, base_ts, date, alignment_store, show_progress=show_progress
     )
 
     if len(samples) == 0:
         logger.warning(f"Nothing to do for {date}")
         return base_ts
 
+    logger.info(
+        f"Got alignments for {len(samples)} of {len(metadata_matches)} in metadata"
+    )
+
     match_samples(
         date,
         samples,
diff --git a/tests/test_inference.py b/tests/test_inference.py
@@ -546,21 +546,74 @@ def test_2020_02_10_metadata(self, fx_ts_2020_02_10):
 
 
 class TestMatchingDetails:
-
-    def test_exact_matches(self, fx_ts_2020_02_10, fx_alignment_store, fx_metadata_db):
-        print("HERE")
-
-    def test_other_exact_matches(self, tmp_path, fx_ts_2020_02_10, fx_alignment_store, fx_metadata_db):
-        print("HERE")
-        match_db = sc2ts.MatchDb.initialise(tmp_path / "match.db")
-        ts = sc2ts.extend(
-            alignment_store=fx_alignment_store,
-            metadata_db=fx_metadata_db,
-            base_ts=fx_ts_2020_02_10,
-            date="2020-02-11",
-            match_db=match_db,
-            min_group_size=2,
+    @pytest.mark.parametrize(
+        ("strain", "parent"), [("SRR11597207", 42), ("ERR4205570", 62)]
+    )
+    @pytest.mark.parametrize("num_mismatches", [1, 2, 3, 4])
+    @pytest.mark.parametrize("precision", [0, 1, 2, 12])
+    def test_exact_matches(
+        self,
+        fx_ts_2020_02_10,
+        fx_alignment_store,
+        fx_metadata_db,
+        strain,
+        parent,
+        num_mismatches,
+        precision,
+    ):
+        samples = sc2ts.preprocess(  # one metadata record in, so at most one sample out
+            [fx_metadata_db[strain]], fx_ts_2020_02_10, "2020-02-20", fx_alignment_store
+        )
+        sc2ts.match_tsinfer(  # return value unused; presumably fills path/mutations in place
+            samples=samples,
+            ts=fx_ts_2020_02_10,
+            num_mismatches=num_mismatches,
+            precision=precision,
+            num_threads=0,
+        )
+        s = samples[0]  # raises IndexError if the strain had no stored alignment
+        assert len(s.mutations) == 0  # exact match: no mutations expected
+        assert len(s.path) == 1  # the path has exactly one entry
+        assert s.path[0].parent == parent  # and that entry has the expected parent
+
+    # def test_stuff(
+    #     self, tmp_path, fx_ts_2020_02_10, fx_alignment_store, fx_metadata_db
+
+    # def test_stuff(
+    #     self, tmp_path, fx_ts_2020_02_10, fx_alignment_store, fx_metadata_db
+    # ):
+    #     # SRR11597207 0 42 0
+    #     # SRR11597218 1 10 1
+
+    #     # date = "2020-02-11" # 2 samples
+    #     date = "2020-02-13"  # 4 samples
+    #     samples = sc2ts.preprocess(
+    #         date,
+    #         metadata_db=fx_metadata_db,
+    #         alignment_store=fx_alignment_store,
+    #         base_ts=fx_ts_2020_02_10,
+    #     )
+    #     # print(samples)
+
+    #     num_mismatches = 3
+    #     sc2ts.match_tsinfer(
+    #         samples=samples,
+    #         ts=fx_ts_2020_02_10,
+    #         num_mismatches=3,
+    #         precision=12,
+    #         num_threads=0,
+    #     )
+    #     for sample in samples:
+    #         print(
+    #             sample.strain,
+    #             sample.get_hmm_cost(num_mismatches),
+    #             sample.path[0].parent,
+    #             len(sample.mutations),
+    #         )
+
+    #     # match_db = sc2ts.MatchDb.initialise(tmp_path / "match.db")
+    #     # ts = sc2ts.extend(
+    #     #     alignment_store=fx_alignment_store,
+    #     #     metadata_db=fx_metadata_db,
+    #     #     base_ts=fx_ts_2020_02_10,
+    #     #     date="2020-02-11",
+    #     #     match_db=match_db,
+    #     #     min_group_size=2,
+    #     # )