Skip to content

Commit f2a7713

Browse files
Merge pull request #253 from jeromekelleher/rejig-interface
Initial first tests on real data
2 parents 14f4be4 + 1a8b17e commit f2a7713

File tree

9 files changed

+425
-695
lines changed

9 files changed

+425
-695
lines changed

.github/workflows/ci.yml

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -74,12 +74,20 @@ jobs:
7474
7575
- name: Run inference
7676
run: |
77-
sc2ts daily-extend -v testrun/alignments.db testrun/metadata.db \
78-
testrun/ --min-group-size=2
77+
# doing 10 days here as this is taking a while
78+
last_ts=testrun/initial.ts
79+
sc2ts initialise -v $last_ts testrun/match.db
80+
for date in `sc2ts list-dates testrun/metadata.db | head -n 10`; do
81+
out_ts=testrun/$date.ts
82+
sc2ts extend $last_ts $date \
83+
testrun/alignments.db testrun/metadata.db \
84+
testrun/match.db $out_ts -v --min-group-size=2
85+
last_ts=$out_ts
86+
done
7987
8088
- name: Validate
8189
run: |
82-
sc2ts validate -v testrun/alignments.db testrun/2020-02-13.ts
90+
sc2ts validate -v testrun/alignments.db testrun/2020-02-02.ts
8391
8492
- name: MatchDB
8593
run: |

run.sh

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#!/bin/bash
2+
set -e
3+
set -u
4+
# set -x
5+
6+
mismatches=3
7+
max_daily_samples=1000
8+
num_threads=8
9+
10+
# Paths
11+
datadir=testrun
12+
run_id=tmp-dev
13+
# run_id=upgma-mds-$max_daily_samples-md-$max_submission_delay-mm-$mismatches
14+
resultsdir=results/$run_id
15+
results_prefix=$resultsdir/$run_id-
16+
logfile=logs/$run_id.log
17+
18+
alignments=$datadir/alignments.db
19+
metadata=$datadir/metadata.db
20+
matches=$resultsdir/matces.db
21+
22+
dates=`python3 -m sc2ts list-dates $metadata | grep -v 2021-12-31`
23+
echo $dates
24+
25+
options="--num-threads $num_threads -vv -l $logfile "
26+
# options+="--max-submission-delay $max_submission_delay "
27+
# options+="--max-daily-samples $max_daily_samples "
28+
options+="--num-mismatches $mismatches"
29+
30+
mkdir -p $resultsdir
31+
32+
last_ts=$resultsdir/initial.ts
33+
34+
python3 -m sc2ts initialise $last_ts $matches
35+
36+
for date in $dates; do
37+
out_ts="$results_prefix$date".ts
38+
python3 -m sc2ts extend $last_ts $date $alignments $metadata \
39+
$matches $out_ts $options
40+
last_ts=$out_ts
41+
done

sc2ts/alignments.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ def __init__(self, path, mode="r"):
8282
self.env = lmdb.Environment(
8383
str(path), subdir=False, readonly=mode == "r", map_size=map_size
8484
)
85+
logger.debug(f"Opened AlignmentStore at {path} mode={mode}")
8586

8687
def __enter__(self):
8788
return self

sc2ts/cli.py

Lines changed: 130 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -182,42 +182,92 @@ def add_provenance(ts, output_file):
182182

183183

184184
@click.command()
185+
@click.argument("ts", type=click.Path(dir_okay=False))
186+
@click.argument("match_db", type=click.Path(dir_okay=False))
187+
@click.option(
188+
"--additional-problematic-sites",
189+
default=None,
190+
type=click.Path(exists=True, dir_okay=False),
191+
help="File containing the list of additional problematic sites to exclude.",
192+
)
193+
@click.option("-v", "--verbose", count=True)
194+
@click.option("-l", "--log-file", default=None, type=click.Path(dir_okay=False))
195+
def initialise(ts, match_db, additional_problematic_sites, verbose, log_file):
196+
"""
197+
Initialise a new base tree sequence to begin inference.
198+
"""
199+
setup_logging(verbose, log_file)
200+
201+
additional_problematic = []
202+
if additional_problematic_sites is not None:
203+
additional_problematic = (
204+
np.loadtxt(additional_problematic_sites, ndmin=1).astype(int).tolist()
205+
)
206+
logger.info(
207+
f"Excluding additional {len(additional_problematic)} problematic sites"
208+
)
209+
210+
base_ts = inference.initial_ts(additional_problematic)
211+
base_ts.dump(ts)
212+
logger.info(f"New base ts at {ts}")
213+
inference.MatchDb.initialise(match_db)
214+
215+
216+
@click.command()
217+
@click.argument("metadata", type=click.Path(exists=True, dir_okay=False))
218+
@click.option("--counts/--no-counts", default=False)
219+
@click.option("-v", "--verbose", count=True)
220+
@click.option("-l", "--log-file", default=None, type=click.Path(dir_okay=False))
221+
def list_dates(metadata, counts, verbose, log_file):
222+
"""
223+
List the dates included in specified metadataDB
224+
"""
225+
setup_logging(verbose, log_file)
226+
with sc2ts.MetadataDb(metadata) as metadata_db:
227+
counter = metadata_db.date_sample_counts()
228+
if counts:
229+
for k, v in counter.items():
230+
print(k, v, sep="\t")
231+
232+
else:
233+
for k in counter:
234+
print(k)
235+
236+
237+
@click.command()
238+
@click.argument("base_ts", type=click.Path(exists=True, dir_okay=False))
239+
@click.argument("date")
185240
@click.argument("alignments", type=click.Path(exists=True, dir_okay=False))
186241
@click.argument("metadata", type=click.Path(exists=True, dir_okay=False))
187-
@click.argument("output-prefix")
242+
@click.argument("matches", type=click.Path(exists=True, dir_okay=False))
243+
@click.argument("output_ts", type=click.Path(dir_okay=False))
188244
@click.option(
189-
"-b",
190-
"--base",
191-
type=click.Path(dir_okay=False, exists=True),
192-
default=None,
193-
help=(
194-
"The base tree sequence to match against. If not specified, create "
195-
"a new initial base containing the reference. "
196-
),
245+
"--num-mismatches",
246+
default=3,
247+
show_default=True,
248+
type=float,
249+
help="Number of mismatches to accept in favour of recombination",
250+
)
251+
@click.option(
252+
"--hmm-cost-threshold",
253+
default=5,
254+
type=float,
255+
show_default=True,
256+
help="The maximum HMM cost for samples to be included unconditionally",
197257
)
198-
@click.option("--num-mismatches", default=None, type=float, help="num-mismatches")
199-
@click.option("--max-hmm-cost", default=5, type=float, help="max-hmm-cost")
200258
@click.option(
201259
"--min-group-size",
202260
default=10,
261+
show_default=True,
203262
type=int,
204-
help="Minimum size of groups of reconsidered samples",
205-
show_default=True
263+
help="Minimum size of groups of reconsidered samples for inclusion",
206264
)
207265
@click.option(
208-
"--num-past-days",
209-
default=None,
266+
"--retrospective-window",
267+
default=30,
268+
show_default=True,
210269
type=int,
211-
help="Number of past days to retrieve filtered samples",
212-
)
213-
@click.option(
214-
"--max-submission-delay",
215-
default=None,
216-
type=int,
217-
help=(
218-
"The maximum number of days between the sample and its submission date "
219-
"for it to be included in the inference"
220-
),
270+
help="Number of days in the past to reconsider potential matches",
221271
)
222272
@click.option(
223273
"--max-daily-samples",
@@ -228,91 +278,86 @@ def add_provenance(ts, output_file):
228278
"is greater than this, randomly subsample."
229279
),
230280
)
231-
@click.option("--num-threads", default=0, type=int, help="Number of match threads")
232-
@click.option("--random-seed", default=42, type=int, help="Random seed for subsampling")
233-
@click.option("--stop-date", default="2030-01-01", type=str, help="Stopping date")
234281
@click.option(
235-
"--additional-problematic-sites",
236-
default=None,
237-
type=str,
238-
help="File containing the list of additional problematic sites to exclude.",
282+
"--random-seed",
283+
default=42,
284+
type=int,
285+
help="Random seed for subsampling",
286+
show_default=True,
239287
)
240-
@click.option("-p", "--precision", default=None, type=int, help="Match precision")
241-
@click.option("--no-progress", default=False, type=bool, help="Don't show progress")
288+
@click.option(
289+
"--num-threads",
290+
default=0,
291+
type=int,
292+
help="Number of match threads (default to one)",
293+
)
294+
@click.option("--progress/--no-progress", default=True)
242295
@click.option("-v", "--verbose", count=True)
243296
@click.option("-l", "--log-file", default=None, type=click.Path(dir_okay=False))
244-
def daily_extend(
297+
@click.option(
298+
"-f",
299+
"--force",
300+
is_flag=True,
301+
flag_value=True,
302+
help="Force clearing newer matches from DB",
303+
)
304+
def extend(
305+
base_ts,
306+
date,
245307
alignments,
246308
metadata,
247-
output_prefix,
248-
base,
309+
matches,
310+
output_ts,
249311
num_mismatches,
250-
max_hmm_cost,
312+
hmm_cost_threshold,
251313
min_group_size,
252-
num_past_days,
253-
max_submission_delay,
314+
retrospective_window,
254315
max_daily_samples,
255316
num_threads,
256317
random_seed,
257-
stop_date,
258-
additional_problematic_sites,
259-
precision,
260-
no_progress,
318+
progress,
261319
verbose,
262320
log_file,
321+
force,
263322
):
264323
"""
265-
Sequentially extend the trees by adding samples in daily batches.
324+
Extend base_ts with sequences for the specified date, using specified
325+
alignments and metadata databases, updating the specified matches
326+
database, and outputting the result to the specified file.
266327
"""
267328
setup_logging(verbose, log_file)
268-
rng = random.Random(random_seed)
269-
270-
additional_problematic = []
271-
if additional_problematic_sites is not None:
272-
additional_problematic = (
273-
np.loadtxt(additional_problematic_sites).astype(int).tolist()
274-
)
275-
logger.info(
276-
f"Excluding additional {len(additional_problematic)} problematic sites"
277-
)
278-
279-
match_db_path = f"{output_prefix}match.db"
280-
if base is None:
281-
base_ts = inference.initial_ts(additional_problematic)
282-
match_db = inference.MatchDb.initialise(match_db_path)
283-
else:
284-
base_ts = tskit.load(base)
285-
286-
assert (
287-
base_ts.metadata["sc2ts"]["additional_problematic_sites"]
288-
== additional_problematic
289-
)
290-
329+
base = tskit.load(base_ts)
330+
logger.debug(f"Loaded base ts from {base_ts}")
291331
with contextlib.ExitStack() as exit_stack:
292332
alignment_store = exit_stack.enter_context(sc2ts.AlignmentStore(alignments))
293333
metadata_db = exit_stack.enter_context(sc2ts.MetadataDb(metadata))
294-
match_db = exit_stack.enter_context(inference.MatchDb(match_db_path))
295-
ts_iter = inference.daily_extend(
334+
match_db = exit_stack.enter_context(sc2ts.MatchDb(matches))
335+
336+
newer_matches = match_db.count_newer(date)
337+
if newer_matches > 0:
338+
if not force:
339+
click.confirm(
340+
f"Do you want to remove {newer_matches} newer matches "
341+
f"from MatchDB > {date}?",
342+
abort=True,
343+
)
344+
match_db.delete_newer(date)
345+
ts_out = inference.extend(
296346
alignment_store=alignment_store,
297347
metadata_db=metadata_db,
298-
base_ts=base_ts,
348+
base_ts=base,
349+
date=date,
299350
match_db=match_db,
300351
num_mismatches=num_mismatches,
301-
max_hmm_cost=max_hmm_cost,
352+
hmm_cost_threshold=hmm_cost_threshold,
302353
min_group_size=min_group_size,
303-
num_past_days=num_past_days,
304-
max_submission_delay=max_submission_delay,
354+
retrospective_window=retrospective_window,
305355
max_daily_samples=max_daily_samples,
306-
rng=rng,
307-
precision=precision,
356+
random_seed=random_seed,
308357
num_threads=num_threads,
309-
show_progress=not no_progress,
358+
show_progress=progress,
310359
)
311-
for ts, date in ts_iter:
312-
output_ts = output_prefix + date + ".ts"
313-
add_provenance(ts, output_ts)
314-
if date >= stop_date:
315-
break
360+
add_provenance(ts_out, output_ts)
316361

317362

318363
@click.command()
@@ -477,6 +522,8 @@ def cli():
477522
cli.add_command(export_alignments)
478523
cli.add_command(export_metadata)
479524

480-
cli.add_command(daily_extend)
525+
cli.add_command(initialise)
526+
cli.add_command(list_dates)
527+
cli.add_command(extend)
481528
cli.add_command(validate)
482529
cli.add_command(annotate_recombinants)

0 commit comments

Comments
 (0)