Skip to content

Commit 80c4a6c

Browse files
Add additional-problematic-sites CLI ARG
Closes #239
1 parent cca80b9 commit 80c4a6c

File tree

3 files changed

+33
-31
lines changed

3 files changed

+33
-31
lines changed

sc2ts/cli.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import datetime
1212
import pickle
1313

14+
import numpy as np
1415
import tqdm
1516
import tskit
1617
import tszip
@@ -22,6 +23,8 @@
2223
from . import core
2324
from . import inference
2425

26+
logger = logging.getLogger(__name__)
27+
2528

2629
def get_environment():
2730
"""
@@ -230,6 +233,12 @@ def dump_samples(samples, output_file):
230233
@click.option("--num-threads", default=0, type=int, help="Number of match threads")
231234
@click.option("--random-seed", default=42, type=int, help="Random seed for subsampling")
232235
@click.option("--stop-date", default="2030-01-01", type=str, help="Stopping date")
236+
@click.option(
237+
"--additional-problematic-sites",
238+
default=None,
239+
type=str,
240+
help="File containing the list of additional problematic sites to exclude.",
241+
)
233242
@click.option("-p", "--precision", default=None, type=int, help="Match precision")
234243
@click.option("--no-progress", default=False, type=bool, help="Don't show progress")
235244
@click.option("-v", "--verbose", count=True)
@@ -248,6 +257,7 @@ def daily_extend(
248257
num_threads,
249258
random_seed,
250259
stop_date,
260+
additional_problematic_sites,
251261
precision,
252262
no_progress,
253263
verbose,
@@ -259,13 +269,27 @@ def daily_extend(
259269
setup_logging(verbose, log_file)
260270
rng = random.Random(random_seed)
261271

272+
additional_problematic = []
273+
if additional_problematic_sites is not None:
274+
additional_problematic = (
275+
np.loadtxt(additional_problematic_sites).astype(int).tolist()
276+
)
277+
logger.info(
278+
f"Excluding additional {len(additional_problematic)} problematic sites"
279+
)
280+
262281
match_db_path = f"{output_prefix}match.db"
263282
if base is None:
264-
base_ts = inference.initial_ts()
283+
base_ts = inference.initial_ts(additional_problematic)
265284
match_db = inference.MatchDb.initialise(match_db_path)
266285
else:
267286
base_ts = tskit.load(base)
268287

288+
assert (
289+
base_ts.metadata["sc2ts"]["additional_problematic_sites"]
290+
== additional_problematic
291+
)
292+
269293
with contextlib.ExitStack() as exit_stack:
270294
alignment_store = exit_stack.enter_context(sc2ts.AlignmentStore(alignments))
271295
metadata_db = exit_stack.enter_context(sc2ts.MetadataDb(metadata))

sc2ts/core.py

Lines changed: 1 addition & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -50,30 +50,7 @@ def __len__(self):
5050

5151

5252
def get_problematic_sites():
53-
base = np.loadtxt(data_path / "problematic_sites.txt", dtype=np.int64)
54-
# Temporary to try out removing these outliers. See
55-
# https://github.com/jeromekelleher/sc2ts/issues/231#issuecomment-2306665447
56-
# In reality we'd probably want to provide an additional file of extra sites
57-
# to remove.
58-
additional = [
59-
7851,
60-
10323,
61-
11750,
62-
17040,
63-
21137,
64-
21846,
65-
22917,
66-
22995,
67-
26681,
68-
27384,
69-
27638,
70-
27752,
71-
28254,
72-
28271,
73-
29614,
74-
]
75-
full = np.append(base, additional)
76-
return np.sort(full)
53+
return np.loadtxt(data_path / "problematic_sites.txt", dtype=np.int64)
7754

7855

7956
__cached_reference = None

sc2ts/inference.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import datetime
55
import dataclasses
66
import collections
7-
import json
87
import pickle
98
import os
109
import sqlite3
@@ -77,7 +76,6 @@ def add(self, samples, date, num_mismatches):
7776
data = []
7877
hmm_cost = np.zeros(len(samples))
7978
for j, sample in enumerate(samples):
80-
d = sample.asdict()
8179
assert sample.date == date
8280
# FIXME we want to be more selective about what we're storing
8381
# here, as we're including the alignment too.
@@ -207,14 +205,17 @@ def mirror_ts_coordinates(ts):
207205
return tables.tree_sequence()
208206

209207

210-
def initial_ts():
208+
def initial_ts(additional_problematic_sites=list()):
211209
reference = core.get_reference_sequence()
212210
L = core.REFERENCE_SEQUENCE_LENGTH
213211
assert L == len(reference)
214-
problematic_sites = set(core.get_problematic_sites())
212+
problematic_sites = set(core.get_problematic_sites()) | set(additional_problematic_sites)
215213

216214
tables = tskit.TableCollection(L)
217215
tables.time_units = core.TIME_UNITS
216+
217+
# TODO add known fields to the schemas and document them.
218+
218219
base_schema = tskit.MetadataSchema.permissive_json().schema
219220
tables.reference_sequence.metadata_schema = tskit.MetadataSchema(base_schema)
220221
tables.reference_sequence.metadata = {
@@ -224,15 +225,15 @@ def initial_ts():
224225
tables.reference_sequence.data = reference
225226

226227
tables.metadata_schema = tskit.MetadataSchema(base_schema)
228+
# TODO gene annotations to top level
227229
tables.metadata = {
228230
"sc2ts": {
229231
"date": core.REFERENCE_DATE,
230232
"samples_strain": [core.REFERENCE_STRAIN],
233+
"additional_problematic_sites": additional_problematic_sites,
231234
}
232235
}
233236

234-
# TODO gene annotations to top level
235-
# TODO add known fields to the schemas and document them.
236237
tables.nodes.metadata_schema = tskit.MetadataSchema(base_schema)
237238
tables.sites.metadata_schema = tskit.MetadataSchema(base_schema)
238239
tables.mutations.metadata_schema = tskit.MetadataSchema(base_schema)

0 commit comments

Comments
 (0)