Skip to content

Commit ec251f0

Browse files
Remove daily-extend and add basic bash run script
1 parent 53f040c commit ec251f0

File tree

5 files changed

+234
-54
lines changed

5 files changed

+234
-54
lines changed

run.sh

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#!/bin/bash
2+
set -e
3+
set -u
4+
# set -x
5+
6+
mismatches=3
7+
max_daily_samples=1000
8+
num_threads=8
9+
10+
# Paths
11+
datadir=testrun
12+
run_id=tmp-dev
13+
# run_id=upgma-mds-$max_daily_samples-md-$max_submission_delay-mm-$mismatches
14+
resultsdir=results/$run_id
15+
results_prefix=$resultsdir/$run_id-
16+
logfile=logs/$run_id.log
17+
18+
alignments=$datadir/alignments.db
19+
metadata=$datadir/metadata.db
20+
matches=$resultsdir/matces.db
21+
22+
dates=`python3 -m sc2ts list-dates $metadata | grep -v 2021-12-31`
23+
echo $dates
24+
25+
options="--num-threads $num_threads -vv -l $logfile "
26+
# options+="--max-submission-delay $max_submission_delay "
27+
# options+="--max-daily-samples $max_daily_samples "
28+
options+="--num-mismatches $mismatches"
29+
30+
mkdir -p $resultsdir
31+
32+
last_ts=$resultsdir/initial.ts
33+
34+
python3 -m sc2ts initialise $last_ts $matches
35+
36+
for date in $dates; do
37+
out_ts="$results_prefix$date".ts
38+
python3 -m sc2ts extend $last_ts $date $alignments $metadata \
39+
$matches $out_ts $options
40+
last_ts=$out_ts
41+
done

sc2ts/alignments.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ def __init__(self, path, mode="r"):
8282
self.env = lmdb.Environment(
8383
str(path), subdir=False, readonly=mode == "r", map_size=map_size
8484
)
85+
logger.debug(f"Opened AlignmentStore at {path} mode={mode}")
8586

8687
def __enter__(self):
8788
return self

sc2ts/cli.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,132 @@ def list_dates(metadata, counts, verbose, log_file):
234234
print(k)
235235

236236

237+
@click.command()
238+
@click.argument("base_ts", type=click.Path(exists=True, dir_okay=False))
239+
@click.argument("date")
240+
@click.argument("alignments", type=click.Path(exists=True, dir_okay=False))
241+
@click.argument("metadata", type=click.Path(exists=True, dir_okay=False))
242+
@click.argument("matches", type=click.Path(exists=True, dir_okay=False))
243+
@click.argument("output_ts", type=click.Path(dir_okay=False))
244+
@click.option(
245+
"--num-mismatches",
246+
default=3,
247+
show_default=True,
248+
type=float,
249+
help="Number of mismatches to accept in favour of recombination",
250+
)
251+
@click.option(
252+
"--hmm-cost-threshold",
253+
default=5,
254+
type=float,
255+
show_default=True,
256+
help="The maximum HMM cost for samples to be included unconditionally",
257+
)
258+
@click.option(
259+
"--min-group-size",
260+
default=10,
261+
show_default=True,
262+
type=int,
263+
help="Minimum size of groups of reconsidered samples for inclusion",
264+
)
265+
@click.option(
266+
"--retrospective-window",
267+
default=30,
268+
show_default=True,
269+
type=int,
270+
help="Number of days in the past to reconsider potential matches",
271+
)
272+
@click.option(
273+
"--max-daily-samples",
274+
default=None,
275+
type=int,
276+
help=(
277+
"The maximum number of samples to match in a single day. If the total "
278+
"is greater than this, randomly subsample."
279+
),
280+
)
281+
@click.option(
282+
"--random-seed",
283+
default=42,
284+
type=int,
285+
help="Random seed for subsampling",
286+
show_default=True,
287+
)
288+
@click.option(
289+
"--num-threads",
290+
default=0,
291+
type=int,
292+
help="Number of match threads (default to one)",
293+
)
294+
@click.option("--progress/--no-progress", default=True)
295+
@click.option("-v", "--verbose", count=True)
296+
@click.option("-l", "--log-file", default=None, type=click.Path(dir_okay=False))
297+
@click.option(
298+
"-f",
299+
"--force",
300+
is_flag=True,
301+
flag_value=True,
302+
help="Force clearing newer matches from DB",
303+
)
304+
def extend(
305+
base_ts,
306+
date,
307+
alignments,
308+
metadata,
309+
matches,
310+
output_ts,
311+
num_mismatches,
312+
hmm_cost_threshold,
313+
min_group_size,
314+
retrospective_window,
315+
max_daily_samples,
316+
num_threads,
317+
random_seed,
318+
progress,
319+
verbose,
320+
log_file,
321+
force,
322+
):
323+
"""
324+
Extend base_ts with sequences for the specified date, using specified
325+
alignments and metadata databases, updating the specified matches
326+
database, and outputting the result to the specified file.
327+
"""
328+
setup_logging(verbose, log_file)
329+
base = tskit.load(base_ts)
330+
logger.debug(f"Loaded base ts from {base_ts}")
331+
with contextlib.ExitStack() as exit_stack:
332+
alignment_store = exit_stack.enter_context(sc2ts.AlignmentStore(alignments))
333+
metadata_db = exit_stack.enter_context(sc2ts.MetadataDb(metadata))
334+
match_db = exit_stack.enter_context(sc2ts.MatchDb(matches))
335+
336+
newer_matches = match_db.count_newer(date)
337+
if newer_matches > 0:
338+
if not force:
339+
click.confirm(
340+
f"Do you want to remove {newer_matches} newer matches "
341+
f"from MatchDB > {date}?",
342+
abort=True,
343+
)
344+
match_db.delete_newer(date)
345+
ts_out = inference.extend(
346+
alignment_store=alignment_store,
347+
metadata_db=metadata_db,
348+
base_ts=base,
349+
date=date,
350+
match_db=match_db,
351+
num_mismatches=num_mismatches,
352+
hmm_cost_threshold=hmm_cost_threshold,
353+
min_group_size=min_group_size,
354+
retrospective_window=retrospective_window,
355+
max_daily_samples=max_daily_samples,
356+
random_seed=random_seed,
357+
num_threads=num_threads,
358+
show_progress=progress,
359+
)
360+
add_provenance(ts_out, output_ts)
361+
362+
237363
@click.command()
238364
@click.argument("alignments", type=click.Path(exists=True, dir_okay=False))
239365
@click.argument("metadata", type=click.Path(exists=True, dir_okay=False))
@@ -531,6 +657,7 @@ def cli():
531657

532658
cli.add_command(initialise)
533659
cli.add_command(list_dates)
660+
cli.add_command(extend)
534661
cli.add_command(daily_extend)
535662
cli.add_command(validate)
536663
cli.add_command(annotate_recombinants)

sc2ts/inference.py

Lines changed: 64 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ def __init__(self, path):
3232
self.uri = uri
3333
self.conn = sqlite3.connect(uri, uri=True)
3434
self.conn.row_factory = metadata.dict_factory
35+
logger.debug(f"Opened MatchDb at {path} mode=rw")
3536

3637
def __len__(self):
3738
sql = "SELECT COUNT(*) FROM samples"
@@ -52,6 +53,17 @@ def last_date(self):
5253
row = self.conn.execute(sql).fetchone()
5354
return row["MAX(match_date)"]
5455

56+
def count_newer(self, date):
57+
with self.conn:
58+
sql = "SELECT COUNT(*) FROM samples WHERE match_date >= ?"
59+
row = self.conn.execute(sql, (date,)).fetchone()
60+
return row["COUNT(*)"]
61+
62+
def delete_newer(self, date):
63+
sql = "DELETE FROM samples WHERE match_date >= ?"
64+
with self.conn:
65+
self.conn.execute(sql, (date,))
66+
5567
def __str__(self):
5668
return f"MatchDb at {self.uri} has {len(self)} samples"
5769

@@ -376,49 +388,49 @@ def asdict(self):
376388
}
377389

378390

379-
def daily_extend(
380-
*,
381-
alignment_store,
382-
metadata_db,
383-
base_ts,
384-
match_db,
385-
num_mismatches=None,
386-
max_hmm_cost=None,
387-
min_group_size=None,
388-
num_past_days=None,
389-
show_progress=False,
390-
max_submission_delay=None,
391-
max_daily_samples=None,
392-
num_threads=None,
393-
precision=None,
394-
rng=None,
395-
excluded_sample_dir=None,
396-
):
397-
assert num_past_days is None
398-
assert max_submission_delay is None
399-
400-
start_day = last_date(base_ts)
401-
402-
last_ts = base_ts
403-
for date in metadata_db.get_days(start_day):
404-
ts = extend(
405-
alignment_store=alignment_store,
406-
metadata_db=metadata_db,
407-
date=date,
408-
base_ts=last_ts,
409-
match_db=match_db,
410-
num_mismatches=num_mismatches,
411-
max_hmm_cost=max_hmm_cost,
412-
min_group_size=min_group_size,
413-
show_progress=show_progress,
414-
max_submission_delay=max_submission_delay,
415-
max_daily_samples=max_daily_samples,
416-
num_threads=num_threads,
417-
precision=precision,
418-
)
419-
yield ts, date
420-
421-
last_ts = ts
391+
# def daily_extend(
392+
# *,
393+
# alignment_store,
394+
# metadata_db,
395+
# base_ts,
396+
# match_db,
397+
# num_mismatches=None,
398+
# max_hmm_cost=None,
399+
# min_group_size=None,
400+
# num_past_days=None,
401+
# show_progress=False,
402+
# max_submission_delay=None,
403+
# max_daily_samples=None,
404+
# num_threads=None,
405+
# precision=None,
406+
# rng=None,
407+
# excluded_sample_dir=None,
408+
# ):
409+
# assert num_past_days is None
410+
# assert max_submission_delay is None
411+
412+
# start_day = last_date(base_ts)
413+
414+
# last_ts = base_ts
415+
# for date in metadata_db.get_days(start_day):
416+
# ts = extend(
417+
# alignment_store=alignment_store,
418+
# metadata_db=metadata_db,
419+
# date=date,
420+
# base_ts=last_ts,
421+
# match_db=match_db,
422+
# num_mismatches=num_mismatches,
423+
# max_hmm_cost=max_hmm_cost,
424+
# min_group_size=min_group_size,
425+
# show_progress=show_progress,
426+
# max_submission_delay=max_submission_delay,
427+
# max_daily_samples=max_daily_samples,
428+
# num_threads=num_threads,
429+
# precision=precision,
430+
# )
431+
# yield ts, date
432+
433+
# last_ts = ts
422434

423435

424436
def preprocess(
@@ -559,30 +571,28 @@ def extend(
559571
base_ts,
560572
match_db,
561573
num_mismatches=None,
562-
max_hmm_cost=None,
574+
hmm_cost_threshold=None,
563575
min_group_size=None,
564-
show_progress=False,
565-
max_submission_delay=None,
566576
max_daily_samples=None,
577+
show_progress=False,
578+
retrospective_window=None,
579+
random_seed=42,
567580
num_threads=0,
568-
precision=None,
569-
rng=None,
570581
):
571582
if num_mismatches is None:
572583
num_mismatches = 3
573-
if max_hmm_cost is None:
574-
max_hmm_cost = 5
584+
if hmm_cost_threshold is None:
585+
hmm_cost_threshold = 5
575586
if min_group_size is None:
576587
min_group_size = 10
577588

589+
# TMP
590+
precision = 6
578591
check_base_ts(base_ts)
579592
logger.info(
580593
f"Extend {date}; ts:nodes={base_ts.num_nodes};samples={base_ts.num_samples};"
581594
f"mutations={base_ts.num_mutations};date={base_ts.metadata['sc2ts']['date']}"
582595
)
583-
# TODO not sure whether we'll keep these params. Making sure they're not
584-
# used for now
585-
assert max_submission_delay is None
586596

587597
samples = preprocess(
588598
date,
@@ -615,7 +625,7 @@ def extend(
615625

616626
logger.info(f"Update ARG with low-cost samples for {date}")
617627
ts = add_matching_results(
618-
f"match_date=='{date}' and hmm_cost>0 and hmm_cost<={max_hmm_cost}",
628+
f"match_date=='{date}' and hmm_cost>0 and hmm_cost<={hmm_cost_threshold}",
619629
ts=ts,
620630
match_db=match_db,
621631
date=date,

sc2ts/metadata.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ def __init__(self, path):
2222
self.path = path
2323
self.conn = sqlite3.connect(uri, uri=True)
2424
self.conn.row_factory = dict_factory
25+
logger.debug(f"Opened MetadataDb at {path} mode=ro")
2526

2627
@staticmethod
2728
def import_csv(csv_path, db_path):

0 commit comments

Comments
 (0)