Fixup tests

jeromekelleher · jeromekelleher · commit 8b597402a221 · 2024-07-30T12:40:47.000+01:00
More fixups

Patch-up remaining tests
diff --git a/sc2ts/inference.py b/sc2ts/inference.py
@@ -28,7 +28,6 @@
 class MatchDb:
     def __init__(self, path):
         uri = f"file:{path}"
-        # uri += "?mode=rw"
         self.uri = uri
         self.conn = sqlite3.connect(uri, uri=True)
         self.conn.row_factory = metadata.dict_factory
@@ -151,6 +150,18 @@ def initialise(db_path):
         return MatchDb(db_path)
 
 
+    def print_all(self):
+        """
+        Debug method to print out full state of the DB.
+        """
+        import pandas as pd
+        data = []
+        with self.conn:
+            for row in self.conn.execute("SELECT * from samples"):
+                data.append(row)
+        df = pd.DataFrame(row, index=["strain"])
+        print(df)
+
 def mirror(x, L):
     return L - x
 
@@ -363,8 +374,8 @@ def asdict(self):
             "strain": self.strain,
             "path": self.path,
             "mutations": self.mutations,
-            "masked_sites": self.masked_sites.tolist(),
-            "alignment_qc": self.alignment_qc,
+            # "masked_sites": self.masked_sites.tolist(),
+            # "alignment_qc": self.alignment_qc,
         }
 
 
diff --git a/tests/test_inference.py b/tests/test_inference.py
@@ -10,23 +10,38 @@
 
 class TestAddMatchingResults:
     def add_matching_results(
-        self, samples, ts, date="2020-01-01", num_mismatches=None, max_hmm_cost=None
+        self,
+        samples,
+        ts,
+        db_path,
+        date="2020-01-01",
+        num_mismatches=1000,
+        max_hmm_cost=1e7,
     ):
+        # This is pretty ugly, need to figure out how to neatly factor this
+        # model of Sample object vs metadata vs alignment QC
+        for sample in samples:
+            sample.date = date
+            sample.metadata["date"] = date
+            sample.metadata["strain"] = sample.strain
+
+        match_db = util.get_match_db(ts, db_path, samples, date, num_mismatches)
+        # print("Match DB", len(match_db))
+        # match_db.print_all()
         ts2 = sc2ts.add_matching_results(
-            samples=samples,
+            f"hmm_cost <= {max_hmm_cost}",
+            match_db=match_db,
             ts=ts,
             date=date,
-            num_mismatches=num_mismatches,
-            max_hmm_cost=max_hmm_cost,
         )
-        assert ts2.num_samples == len(samples) + ts.num_samples
-        for u, sample in zip(ts2.samples()[-len(samples) :], samples):
-            node = ts2.node(u)
-            assert node.time == 0
+        # assert ts2.num_samples == len(samples) + ts.num_samples
+        # for u, sample in zip(ts2.samples()[-len(samples) :], samples):
+        #     node = ts2.node(u)
+        #     assert node.time == 0
         assert ts2.num_sites == ts.num_sites
         return ts2
 
-    def test_one_sample(self):
+    def test_one_sample(self, tmp_path):
         # 4.00┊  0  ┊
         #     ┊  ┃  ┊
         # 3.00┊  1  ┊
@@ -37,12 +52,12 @@ def test_one_sample(self):
         #     0   29904
         ts = util.example_binary(2)
         samples = util.get_samples(ts, [[(0, ts.sequence_length, 1)]])
-        ts2 = self.add_matching_results(samples, ts)
+        ts2 = self.add_matching_results(samples, ts, tmp_path / "match.db")
         assert ts2.num_trees == 1
         tree = ts2.first()
         assert tree.parent_dict == {1: 0, 4: 1, 2: 4, 3: 4, 5: 1}
 
-    def test_one_sample_recombinant(self):
+    def test_one_sample_recombinant(self, tmp_path):
         # 4.00┊  0  ┊
         #     ┊  ┃  ┊
         # 3.00┊  1  ┊
@@ -55,14 +70,16 @@ def test_one_sample_recombinant(self):
         L = ts.sequence_length
         x = L / 2
         samples = util.get_samples(ts, [[(0, x, 2), (x, L, 3)]])
-        ts2 = self.add_matching_results(samples, ts, "2021")
+        date = "2021-01-05"
+        ts2 = self.add_matching_results(samples, ts, tmp_path / "match.db", date=date)
+
         assert ts2.num_trees == 2
         assert ts2.first().parent_dict == {1: 0, 4: 1, 2: 4, 3: 4, 6: 2, 5: 6}
         assert ts2.last().parent_dict == {1: 0, 4: 1, 2: 4, 3: 4, 6: 3, 5: 6}
         assert ts2.node(6).flags == sc2ts.NODE_IS_RECOMBINANT
-        assert ts2.node(6).metadata == {"date_added": "2021"}
+        assert ts2.node(6).metadata == {"date_added": date}
 
-    def test_one_sample_recombinant_filtered(self):
+    def test_one_sample_recombinant_filtered(self, tmp_path):
         # 4.00┊  0  ┊
         #     ┊  ┃  ┊
         # 3.00┊  1  ┊
@@ -75,15 +92,14 @@ def test_one_sample_recombinant_filtered(self):
         L = ts.sequence_length
         x = L / 2
         samples = util.get_samples(ts, [[(0, x, 2), (x, L, 3)]])
-        # Note that it is calling the function in the main module.
-        ts2 = sc2ts.add_matching_results(
-            samples, ts, "2021", num_mismatches=1e3, max_hmm_cost=1e3 - 1
+        ts2 = self.add_matching_results(
+            samples, ts, tmp_path / "match.db", num_mismatches=1e3, max_hmm_cost=1e3 - 1
         )
         assert ts2.num_trees == 1
         assert ts2.num_nodes == ts.num_nodes
         assert ts2.num_samples == ts.num_samples
 
-    def test_two_samples_recombinant_one_filtered(self):
+    def test_two_samples_recombinant_one_filtered(self, tmp_path):
         ts = util.example_binary(2)
         L = ts.sequence_length
         x = L / 2
@@ -97,19 +113,19 @@ def test_two_samples_recombinant_one_filtered(self):
             ],  # Filtered
         ]
         samples = util.get_samples(ts, new_paths)
-        ts2 = sc2ts.add_matching_results(
-            samples, ts, "2021", num_mismatches=3, max_hmm_cost=4
+        ts2 = self.add_matching_results(
+            samples, ts, tmp_path / "match.db", num_mismatches=3, max_hmm_cost=4
         )
         assert ts2.num_trees == 2
         assert ts2.num_samples == ts.num_samples + 1
 
-    def test_one_sample_one_mutation(self):
+    def test_one_sample_one_mutation(self, tmp_path):
         ts = sc2ts.initial_ts()
         ts = sc2ts.increment_time("2020-01-01", ts)
         samples = util.get_samples(
             ts, [[(0, ts.sequence_length, 1)]], mutations=[[(0, "X")]]
         )
-        ts2 = self.add_matching_results(samples, ts)
+        ts2 = self.add_matching_results(samples, ts, tmp_path / "match.db")
         assert ts2.num_trees == 1
         tree = ts2.first()
         assert tree.parent_dict == {1: 0, 2: 1}
@@ -118,20 +134,20 @@ def test_one_sample_one_mutation(self):
         var = next(ts2.variants())
         assert var.alleles[var.genotypes[0]] == "X"
 
-    def test_one_sample_one_mutation_filtered(self):
+    def test_one_sample_one_mutation_filtered(self, tmp_path):
         ts = sc2ts.initial_ts()
         ts = sc2ts.increment_time("2020-01-01", ts)
         samples = util.get_samples(
             ts, [[(0, ts.sequence_length, 1)]], mutations=[[(0, "X")]]
         )
-        ts2 = sc2ts.add_matching_results(
-            samples, ts, "2021", num_mismatches=0.0, max_hmm_cost=0.0
+        ts2 = self.add_matching_results(
+            samples, ts, tmp_path / "match.db", num_mismatches=0.0, max_hmm_cost=0.0
         )
         assert ts2.num_trees == ts.num_trees
         assert ts2.site(0).ancestral_state == ts.site(0).ancestral_state
         assert ts2.num_mutations == 0
 
-    def test_two_samples_one_mutation_one_filtered(self):
+    def test_two_samples_one_mutation_one_filtered(self, tmp_path):
         ts = sc2ts.initial_ts()
         ts = sc2ts.increment_time("2020-01-01", ts)
         x = int(ts.sequence_length / 2)
@@ -148,8 +164,8 @@ def test_two_samples_one_mutation_one_filtered(self):
             paths=new_paths,
             mutations=new_mutations,
         )
-        ts2= sc2ts.add_matching_results(
-            samples, ts, "2021", num_mismatches=3, max_hmm_cost=1
+        ts2 = self.add_matching_results(
+            samples, ts, tmp_path / "match.db", num_mismatches=3, max_hmm_cost=1
         )
         assert ts2.num_trees == ts.num_trees
         assert ts2.site(0).ancestral_state == ts.site(0).ancestral_state
@@ -162,7 +178,9 @@ class TestMatchTsinfer:
     def match_tsinfer(self, samples, ts, haplotypes, **kwargs):
         assert len(samples) == len(haplotypes)
         G = np.array(haplotypes).T
-        sc2ts.inference.match_tsinfer(samples=samples, ts=ts, genotypes=G, **kwargs)
+        sc2ts.inference.match_tsinfer(
+            samples=samples, ts=ts, genotypes=G, num_mismatches=1000, **kwargs
+        )
 
     @pytest.mark.parametrize("mirror", [False, True])
     def test_match_reference(self, mirror):
@@ -351,8 +369,12 @@ def test_n_samples_metadata(self):
         ts = sc2ts.initial_ts()
         samples = []
         for j in range(10):
+            strain = f"x{j}"
+            date = "2021-01-01"
             samples.append(
                 sc2ts.Sample(
+                    strain=strain,
+                    date=date,
                     metadata={f"x{j}": j, f"y{j}": list(range(j))},
                     path=[(0, ts.sequence_length, 1)],
                     mutations=[],
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -19,7 +19,7 @@ def test_initial(self):
 
 
 class TestDetachSingletonRecombinants:
-    def make_recombinant_tree(self, num_samples=1):
+    def make_recombinant_tree(self, db_path, num_samples=1):
         # Make a tree sequence by adding num_samples samples under a
         # single recombination node. Start with the following tree:
         # 4.00┊  0  ┊
@@ -34,8 +34,23 @@ def make_recombinant_tree(self, num_samples=1):
         L = ts.sequence_length
         x = L / 2
         samples = util.get_samples(ts, [[(0, x, 2), (x, L, 3)]] * num_samples)
+        date = "2021-01-01"
+
+        # This is pretty ugly, need to figure out how to neatly factor this
+        # model of Sample object vs metadata vs alignment QC
+        # NOTE: code copied from test_inference.py
+        for sample in samples:
+            sample.date = date
+            sample.metadata["date"] = date
+            sample.metadata["strain"] = sample.strain
+        match_db = util.get_match_db(ts, db_path, samples, date, num_mismatches=1000)
+        # print("Match DB", len(match_db))
+        # match_db.print_all()
         ts_rec = sc2ts.add_matching_results(
-            samples, ts, "2021", num_mismatches=None, max_hmm_cost=None
+            "True",
+            match_db=match_db,
+            ts=ts,
+            date=date,
         )
         assert ts_rec.num_trees == 2
         return ts_rec
@@ -46,12 +61,12 @@ def make_recombinant_tree(self, num_samples=1):
         # https://github.com/jeromekelleher/sc2ts/issues/152
         [util.example_binary(1), util.example_binary(2), util.example_binary(3)],
     )
-    def test_no_recombinants(self, ts):
+    def test_no_recombinants(self, ts, tmp_path):
         ts2 = utils.detach_singleton_recombinants(ts)
         ts.tables.assert_equals(ts2.tables, ignore_provenance=True)
 
-    def test_one_sample_recombinant(self):
-        ts = self.make_recombinant_tree()
+    def test_one_sample_recombinant(self, tmp_path):
+        ts = self.make_recombinant_tree(tmp_path / "match.db")
         assert ts.num_samples == 3
         re_nodes = [
             node.id for node in ts.nodes() if node.flags & sc2ts.NODE_IS_RECOMBINANT
@@ -75,9 +90,9 @@ def test_one_sample_recombinant(self):
         assert ts3.num_samples == ts.num_samples - 1
         assert ts3.num_nodes == ts.num_nodes - 2  # both sample and re node gone
 
-    def test_two_sample_recombinant(self):
+    def test_two_sample_recombinant(self, tmp_path):
         """Test that we don't detach anything if the recombinant node is not a singleton"""
-        ts = self.make_recombinant_tree(num_samples=2)
+        ts = self.make_recombinant_tree(num_samples=2, db_path=tmp_path / "match.db")
         assert ts.num_samples == 4
         ts2 = utils.detach_singleton_recombinants(ts)
         ts.tables.assert_equals(ts2.tables, ignore_provenance=True)
diff --git a/tests/util.py b/tests/util.py
@@ -7,7 +7,7 @@
 # NOTE: the current API in which we update the Sample objects is
 # really horrible and we need to refactor to make it more testable.
 # This function is a symptom of that.
-def get_samples(ts, paths, mutations=None):
+def get_samples(ts, paths, mutations=None, date=None):
     if mutations is None:
         mutations = [[] for _ in paths]
 
@@ -18,12 +18,21 @@ def get_samples(ts, paths, mutations=None):
             (ts.sites_position[site], state) for (site, state) in sample_mutations
         ]
         updated_mutations.append(updated)
-    samples = [sc2ts.Sample() for _ in paths]
+    data = "2020-12-29" if date is None else date
+    samples = [sc2ts.Sample(f"strain_{j}", date) for j, _ in enumerate(paths)]
     sc2ts.update_path_info(samples, ts, paths, updated_mutations)
     return samples
 
 
-def example_binary(n):
+def get_match_db(ts, db_path, samples, date, num_mismatches):
+    sc2ts.MatchDb.initialise(db_path)
+    match_db = sc2ts.MatchDb(db_path)
+    match_db.add(samples, date, num_mismatches)
+    match_db.create_mask_table(ts)
+    return match_db
+
+
+def example_binary(n, date="2020-01-01"):
     base = sc2ts.initial_ts()
     tables = base.dump_tables()
     tree = tskit.Tree.generate_balanced(n, span=base.sequence_length)
@@ -32,12 +41,15 @@ def example_binary(n):
     tables.nodes.time += np.max(binary_tables.nodes.time) + 1
     binary_tables.edges.child += len(tables.nodes)
     binary_tables.edges.parent += len(tables.nodes)
-    for node in binary_tables.nodes:
-        tables.nodes.append(node.replace(metadata={}))
+    for j, node in enumerate(binary_tables.nodes):
+        md = {}
+        if node.flags == tskit.NODE_IS_SAMPLE:
+            md["strain"] = f"x{j}"
+            md["date"] = date
+        tables.nodes.append(node.replace(metadata=md))
     for edge in binary_tables.edges:
         tables.edges.append(edge)
     # FIXME brittle
     tables.edges.add_row(0, base.sequence_length, parent=1, child=tree.root + 2)
     tables.sort()
     return tables.tree_sequence()
-