Add --additional-problematic-sites CLI argument

jeromekelleher · jeromekelleher · commit 80c4a6cfc8e5 · 2024-08-28T15:07:54.000+01:00
Closes #239
diff --git a/sc2ts/cli.py b/sc2ts/cli.py
@@ -11,6 +11,7 @@
 import datetime
 import pickle
 
+import numpy as np
 import tqdm
 import tskit
 import tszip
@@ -22,6 +23,8 @@
 from . import core
 from . import inference
 
+logger = logging.getLogger(__name__)
+
 
 def get_environment():
     """
@@ -230,6 +233,12 @@ def dump_samples(samples, output_file):
 @click.option("--num-threads", default=0, type=int, help="Number of match threads")
 @click.option("--random-seed", default=42, type=int, help="Random seed for subsampling")
 @click.option("--stop-date", default="2030-01-01", type=str, help="Stopping date")
+@click.option(
+    "--additional-problematic-sites",
+    default=None,
+    type=str,
+    help="File containing the list of additional problematic sites to exclude.",
+)
 @click.option("-p", "--precision", default=None, type=int, help="Match precision")
 @click.option("--no-progress", default=False, type=bool, help="Don't show progress")
 @click.option("-v", "--verbose", count=True)
@@ -248,6 +257,7 @@ def daily_extend(
     num_threads,
     random_seed,
     stop_date,
+    additional_problematic_sites,
     precision,
     no_progress,
     verbose,
@@ -259,13 +269,27 @@ def daily_extend(
     setup_logging(verbose, log_file)
     rng = random.Random(random_seed)
 
+    additional_problematic = []
+    if additional_problematic_sites is not None:
+        additional_problematic = (
+            np.loadtxt(additional_problematic_sites).astype(int).tolist()
+        )
+        logger.info(
+            f"Excluding additional {len(additional_problematic)} problematic sites"
+        )
+
     match_db_path = f"{output_prefix}match.db"
     if base is None:
-        base_ts = inference.initial_ts()
+        base_ts = inference.initial_ts(additional_problematic)
         match_db = inference.MatchDb.initialise(match_db_path)
     else:
         base_ts = tskit.load(base)
 
+    assert (
+        base_ts.metadata["sc2ts"]["additional_problematic_sites"]
+        == additional_problematic
+    )
+
     with contextlib.ExitStack() as exit_stack:
         alignment_store = exit_stack.enter_context(sc2ts.AlignmentStore(alignments))
         metadata_db = exit_stack.enter_context(sc2ts.MetadataDb(metadata))
diff --git a/sc2ts/core.py b/sc2ts/core.py
@@ -50,30 +50,7 @@ def __len__(self):
 
 
 def get_problematic_sites():
-    base = np.loadtxt(data_path / "problematic_sites.txt", dtype=np.int64)
-    # Temporary to try out removing these outliers. See
-    # https://github.com/jeromekelleher/sc2ts/issues/231#issuecomment-2306665447
-    # In reality we'd probably want to provide an additional file of extra sites
-    # to remove.
-    additional = [
-        7851,
-        10323,
-        11750,
-        17040,
-        21137,
-        21846,
-        22917,
-        22995,
-        26681,
-        27384,
-        27638,
-        27752,
-        28254,
-        28271,
-        29614,
-    ]
-    full = np.append(base, additional)
-    return np.sort(full)
+    return np.loadtxt(data_path / "problematic_sites.txt", dtype=np.int64)
 
 
 __cached_reference = None
diff --git a/sc2ts/inference.py b/sc2ts/inference.py
@@ -4,7 +4,6 @@
 import datetime
 import dataclasses
 import collections
-import json
 import pickle
 import os
 import sqlite3
@@ -77,7 +76,6 @@ def add(self, samples, date, num_mismatches):
         data = []
         hmm_cost = np.zeros(len(samples))
         for j, sample in enumerate(samples):
-            d = sample.asdict()
             assert sample.date == date
             # FIXME we want to be more selective about what we're storing
             # here, as we're including the alignment too.
@@ -207,14 +205,17 @@ def mirror_ts_coordinates(ts):
     return tables.tree_sequence()
 
 
-def initial_ts():
+def initial_ts(additional_problematic_sites=list()):
     reference = core.get_reference_sequence()
     L = core.REFERENCE_SEQUENCE_LENGTH
     assert L == len(reference)
-    problematic_sites = set(core.get_problematic_sites())
+    problematic_sites = set(core.get_problematic_sites()) | set(additional_problematic_sites)
 
     tables = tskit.TableCollection(L)
     tables.time_units = core.TIME_UNITS
+
+    # TODO add known fields to the schemas and document them.
+
     base_schema = tskit.MetadataSchema.permissive_json().schema
     tables.reference_sequence.metadata_schema = tskit.MetadataSchema(base_schema)
     tables.reference_sequence.metadata = {
@@ -224,15 +225,15 @@ def initial_ts():
     tables.reference_sequence.data = reference
 
     tables.metadata_schema = tskit.MetadataSchema(base_schema)
+    # TODO gene annotations to top level
     tables.metadata = {
         "sc2ts": {
             "date": core.REFERENCE_DATE,
             "samples_strain": [core.REFERENCE_STRAIN],
+            "additional_problematic_sites": additional_problematic_sites,
         }
     }
 
-    # TODO gene annotations to top level
-    # TODO add known fields to the schemas and document them.
     tables.nodes.metadata_schema = tskit.MetadataSchema(base_schema)
     tables.sites.metadata_schema = tskit.MetadataSchema(base_schema)
     tables.mutations.metadata_schema = tskit.MetadataSchema(base_schema)