
Commit 23ad2d3

Improve match testing and infrastructure

Add metadata to exact match samples. Closes #238
1 parent 7d5a5be commit 23ad2d3


4 files changed: +221 −102 lines


run.sh

Lines changed: 3 additions & 3 deletions
@@ -9,17 +9,17 @@ num_threads=8
 
 # Paths
 datadir=testrun
-run_id=tmp-dev
+run_id=tmp-dev-hp
 # run_id=upgma-mds-$max_daily_samples-md-$max_submission_delay-mm-$mismatches
 resultsdir=results/$run_id
 results_prefix=$resultsdir/$run_id-
 logfile=logs/$run_id.log
 
 alignments=$datadir/alignments.db
 metadata=$datadir/metadata.db
-matches=$resultsdir/matces.db
+matches=$resultsdir/matches.db
 
-dates=`python3 -m sc2ts list-dates $metadata | grep -v 2021-12-31`
+dates=`python3 -m sc2ts list-dates $metadata | grep -v 2021-12-31 | head -n 14`
 echo $dates
 
 options="--num-threads $num_threads -vv -l $logfile "

sc2ts/inference.py

Lines changed: 41 additions & 38 deletions
@@ -404,35 +404,36 @@ def match_samples(
     # Default to no recombination
     num_mismatches = 1000
 
-    remaining_samples = samples
     # FIXME Something wrong here, we don't seem to get precisely the same
     # ARG for some reason. Need to track it down
     # Also: should only run the things at low precision that have that HMM cost.
     # Start out by setting everything to have 0 mutations and work up from there.
 
-    for cost, precision in [(0, 0), (1, 2)]: #, (2, 3)]:
-        match_tsinfer(
-            samples=remaining_samples,
-            ts=base_ts,
-            num_mismatches=num_mismatches,
-            precision=precision,
-            num_threads=num_threads,
-            show_progress=show_progress,
-            mirror_coordinates=mirror_coordinates,
-        )
-        samples_to_rerun = []
-        for sample in remaining_samples:
-            hmm_cost = sample.get_hmm_cost(num_mismatches)
-            # print(f"HMM@p={precision}: {sample.strain} hmm_cost={hmm_cost} path={sample.path}")
-            logger.debug(
-                f"HMM@p={precision}: {sample.strain} hmm_cost={hmm_cost} path={sample.path}"
-            )
-            if hmm_cost > cost:
-                sample.path.clear()
-                sample.mutations.clear()
-                samples_to_rerun.append(sample)
-        remaining_samples = samples_to_rerun
-
+    # remaining_samples = samples
+    # for cost, precision in [(0, 0), (1, 2)]: #, (2, 3)]:
+    #     match_tsinfer(
+    #         samples=remaining_samples,
+    #         ts=base_ts,
+    #         num_mismatches=num_mismatches,
+    #         precision=precision,
+    #         num_threads=num_threads,
+    #         show_progress=show_progress,
+    #         mirror_coordinates=mirror_coordinates,
+    #     )
+    #     samples_to_rerun = []
+    #     for sample in remaining_samples:
+    #         hmm_cost = sample.get_hmm_cost(num_mismatches)
+    #         # print(f"HMM@p={precision}: {sample.strain} hmm_cost={hmm_cost} path={sample.path}")
+    #         logger.debug(
+    #             f"HMM@p={precision}: {sample.strain} hmm_cost={hmm_cost} path={sample.path}"
+    #         )
+    #         if hmm_cost > cost:
+    #             sample.path.clear()
+    #             sample.mutations.clear()
+    #             samples_to_rerun.append(sample)
+    #     remaining_samples = samples_to_rerun
+
+    samples_to_rerun = samples
     match_tsinfer(
         samples=samples_to_rerun,
         ts=base_ts,
@@ -605,6 +606,18 @@ def update_top_level_metadata(ts, date):
     return tables.tree_sequence()
 
 
+def add_sample_to_tables(sample, tables, flags=tskit.NODE_IS_SAMPLE, time=0):
+    metadata = {
+        **sample.metadata,
+        "sc2ts": {
+            "qc": sample.alignment_qc,
+            "path": [x.asdict() for x in sample.path],
+            "mutations": [x.asdict() for x in sample.mutations],
+        },
+    }
+    return tables.nodes.add_row(flags=flags, time=time, metadata=metadata)
+
+
 def match_path_ts(samples, ts, path, reversions):
     """
     Given the specified list of samples with equal copying paths,
@@ -623,17 +636,7 @@ def match_path_ts(samples, ts, path, reversions):
     )
     for sample in samples:
         assert sample.path == path
-        metadata = {
-            **sample.metadata,
-            "sc2ts": {
-                "qc": sample.alignment_qc,
-                "path": [x.asdict() for x in sample.path],
-                "mutations": [x.asdict() for x in sample.mutations],
-            },
-        }
-        node_id = tables.nodes.add_row(
-            flags=tskit.NODE_IS_SAMPLE, time=0, metadata=metadata
-        )
+        node_id = add_sample_to_tables(sample, tables)
         tables.edges.add_row(0, ts.sequence_length, parent=0, child=node_id)
         for mut in sample.mutations:
             if mut.site_id not in site_id_map:
@@ -671,10 +674,10 @@ def add_exact_matches(match_db, ts, date):
     for sample in samples:
        assert len(sample.path) == 1
        assert len(sample.mutations) == 0
-        node_id = tables.nodes.add_row(
+        node_id = add_sample_to_tables(
+            sample,
+            tables,
             flags=tskit.NODE_IS_SAMPLE | core.NODE_IS_EXACT_MATCH,
-            time=0,
-            metadata=sample.metadata,
         )
         parent = sample.path[0].parent
         logger.debug(f"ARG add exact match {sample.strain}:{node_id}->{parent}")
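Editor's note on the refactor above: the two inline tables.nodes.add_row calls are replaced by the shared add_sample_to_tables helper, so exact-match samples now record the same structured "sc2ts" metadata (QC, copying path, mutations) as other samples instead of bare sample.metadata. The sketch below is illustrative only; the Sample/PathSegment stand-ins and the NODE_IS_EXACT_MATCH bit value are assumptions for the example, not sc2ts internals.

import dataclasses
import tskit

NODE_IS_EXACT_MATCH = 1 << 21  # stand-in value; the real flag lives in sc2ts.core


@dataclasses.dataclass
class PathSegment:
    # Minimal stand-in for one segment of a sample's copying path.
    left: int
    right: int
    parent: int

    def asdict(self):
        return dataclasses.asdict(self)


@dataclasses.dataclass
class Sample:
    # Minimal stand-in for sc2ts's internal sample object.
    strain: str
    metadata: dict
    alignment_qc: dict
    path: list
    mutations: list


def add_sample_to_tables(sample, tables, flags=tskit.NODE_IS_SAMPLE, time=0):
    # Same shape as the helper added in this commit: the original sample
    # metadata plus an "sc2ts" section with QC, copying path and mutations.
    metadata = {
        **sample.metadata,
        "sc2ts": {
            "qc": sample.alignment_qc,
            "path": [x.asdict() for x in sample.path],
            "mutations": [x.asdict() for x in sample.mutations],
        },
    }
    return tables.nodes.add_row(flags=flags, time=time, metadata=metadata)


# Usage sketch: an exact match (single path segment, no mutations) now keeps
# its full metadata block on the node.
tables = tskit.TableCollection(sequence_length=29904)
tables.nodes.metadata_schema = tskit.MetadataSchema.permissive_json()
sample = Sample(
    strain="SAMPLE_1",  # hypothetical strain name
    metadata={"strain": "SAMPLE_1", "date": "2020-02-10"},
    alignment_qc={"num_masked_sites": 133},
    path=[PathSegment(0, 29904, parent=1)],
    mutations=[],
)
node_id = add_sample_to_tables(
    sample, tables, flags=tskit.NODE_IS_SAMPLE | NODE_IS_EXACT_MATCH
)
print(tables.nodes[node_id].metadata["sc2ts"])  # {'qc': ..., 'path': [...], 'mutations': []}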

tests/conftest.py

Lines changed: 33 additions & 12 deletions
@@ -35,6 +35,7 @@ def fx_alignment_store(fx_data_cache, fx_alignments_fasta):
         a.append(fasta, show_progress=False)
     return sc2ts.AlignmentStore(cache_path)
 
+
 @pytest.fixture
 def fx_metadata_db(fx_data_cache):
     cache_path = fx_data_cache / "metadata.db"
@@ -44,26 +45,46 @@ def fx_metadata_db(fx_data_cache):
     return sc2ts.MetadataDb(cache_path)
 
 
+# TODO make this a session fixture cacheing the tree sequences.
 @pytest.fixture
-def fx_ts_2020_02_10(tmp_path, fx_data_cache, fx_metadata_db, fx_alignment_store):
-    target_date = "2020-02-10"
-    cache_path = fx_data_cache / f"{target_date}.ts"
+def fx_ts_map(tmp_path, fx_data_cache, fx_metadata_db, fx_alignment_store):
+    dates = [
+        "2020-01-01",
+        "2020-01-19",
+        "2020-01-24",
+        "2020-01-25",
+        "2020-01-28",
+        "2020-01-29",
+        "2020-01-30",
+        "2020-01-31",
+        "2020-02-01",
+        "2020-02-02",
+        "2020-02-03",
+        "2020-02-04",
+        "2020-02-05",
+        "2020-02-06",
+        "2020-02-07",
+        "2020-02-08",
+        "2020-02-09",
+        "2020-02-10",
+        "2020-02-11",
+        "2020-02-13",
+    ]
+    cache_path = fx_data_cache / f"{dates[-1]}.ts"
     if not cache_path.exists():
         last_ts = sc2ts.initial_ts()
         match_db = sc2ts.MatchDb.initialise(tmp_path / "match.db")
-        for date in fx_metadata_db.date_sample_counts():
-            print("INFERRING", date)
+        for date in dates:
             last_ts = sc2ts.extend(
                 alignment_store=fx_alignment_store,
                 metadata_db=fx_metadata_db,
                 base_ts=last_ts,
                 date=date,
                 match_db=match_db,
-                min_group_size=2,
             )
-            if date == target_date:
-                break
-        last_ts.dump(cache_path)
-    return tskit.load(cache_path)
-
-
+            print(
+                f"INFERRED {date} nodes={last_ts.num_nodes} mutations={last_ts.num_mutations}"
+            )
+            cache_path = fx_data_cache / f"{date}.ts"
+            last_ts.dump(cache_path)
+    return {date: tskit.load(fx_data_cache / f"{date}.ts") for date in dates}
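A hypothetical usage sketch (not part of this commit) of the new fx_ts_map fixture, which replaces the single-date fx_ts_2020_02_10 fixture with a dictionary mapping each inference date to the tree sequence cached for that day; the test names and assertions are illustrative assumptions about the test dataset.

# Hypothetical tests consuming the fx_ts_map fixture defined above.
def test_fx_ts_map_contains_all_dates(fx_ts_map):
    # The fixture is keyed by date string, one entry per inferred day.
    assert "2020-01-01" in fx_ts_map
    assert "2020-02-13" in fx_ts_map


def test_fx_ts_map_final_day(fx_ts_map):
    # Assumes the test dataset contains samples on these dates and that the
    # daily extend step only grows the ARG.
    ts = fx_ts_map["2020-02-13"]
    assert ts.num_samples > 0
    assert ts.num_nodes >= fx_ts_map["2020-01-01"].num_nodes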
