Skip to content

Commit 7758245

Browse files
Merge pull request #505 from jeromekelleher/insert-missing-sites
Insert missing sites
2 parents 786ee14 + a8a20d2 commit 7758245

File tree

4 files changed

+75
-35
lines changed

4 files changed

+75
-35
lines changed

sc2ts/cli.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -610,7 +610,7 @@ def minimise_metadata(
610610
@click.option("--progress/--no-progress", default=True)
611611
@click.option("-v", "--verbose", count=True)
612612
@click.option("-l", "--log-file", default=None, type=click.Path(dir_okay=False))
613-
def map_deletions(
613+
def map_parsimony(
614614
dataset,
615615
ts_in,
616616
ts_out,
@@ -621,14 +621,14 @@ def map_deletions(
621621
log_file,
622622
):
623623
"""
624-
Map deletions onto the specified ARG.
624+
Map variation at the specified set of sites to the ARG using parsimony.
625625
"""
626626
setup_logging(verbose, log_file)
627627
ds = sc2ts.Dataset(dataset)
628628
ts = tszip.load(ts_in)
629629
if sites is not None:
630630
sites = np.loadtxt(sites, dtype=int)
631-
result = sc2ts.map_deletions(ts, ds, sites, show_progress=progress)
631+
result = sc2ts.map_parsimony(ts, ds, sites, show_progress=progress)
632632
if report is not None:
633633
result.report.to_csv(report)
634634
result.tree_sequence.dump(ts_out)
@@ -670,5 +670,5 @@ def cli():
670670
cli.add_command(validate)
671671
cli.add_command(postprocess)
672672
cli.add_command(minimise_metadata)
673-
cli.add_command(map_deletions)
673+
cli.add_command(map_parsimony)
674674
cli.add_command(run_hmm)

sc2ts/inference.py

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1956,14 +1956,16 @@ def get_recombinant_strains(ts):
19561956

19571957

19581958
@dataclasses.dataclass
1959-
class MapDeletionsResult:
1959+
class MapParsimonyResult:
19601960
tree_sequence: tskit.TreeSequence
19611961
report: pd.DataFrame
19621962

19631963

1964-
def map_deletions(ts, ds, sites=None, *, show_progress=False):
1964+
def map_parsimony(ts, ds, sites=None, *, show_progress=False):
19651965
"""
1966-
Map deletions at the specified set of site positions ARG using parsimony.
1966+
Map variation at the specified set of site positions to the ARG using parsimony
1967+
and return the resulting tree sequence. States ACGT- are treated as alleles
1968+
and all other states are treated as missing data.
19671969
"""
19681970
start_time = time.time() # wall time
19691971
if sites is not None:
@@ -1977,9 +1979,11 @@ def map_deletions(ts, ds, sites=None, *, show_progress=False):
19771979
# will have been dropped.
19781980
if "sc2ts" in md:
19791981
mut_metadata = {"sc2ts": {"type": "post_parsimony"}}
1982+
site_metadata = {"sc2ts": {"type": "post_parsimony"}}
19801983
sample_id = md["sc2ts"]["samples_strain"]
19811984
else:
19821985
mut_metadata = None
1986+
site_metadata = None
19831987
dfn = stats.node_data(ts, inheritance_stats=False)
19841988
sample_id = dfn[dfn.is_sample].sample_id.values
19851989

@@ -2000,13 +2004,25 @@ def map_deletions(ts, ds, sites=None, *, show_progress=False):
20002004

20012005
keep_mutations = np.ones(ts.num_mutations, dtype=bool)
20022006
report_data = []
2007+
current_sites = set(ts.sites_position)
20032008
for var in variants:
20042009
tree.seek(var.position)
2005-
try:
2010+
if var.position in current_sites:
20062011
site = ts.site(position=var.position)
2007-
except ValueError:
2008-
logger.warning(f"No site at position {var.position}; skipping")
2009-
continue
2012+
site_id = site.id
2013+
old_mutations = set((mut.node, mut.derived_state) for mut in site.mutations)
2014+
for mut in site.mutations:
2015+
keep_mutations[mut.id] = False
2016+
else:
2017+
assert ts.reference_sequence.data[0] == "X"
2018+
site_id = tables.sites.add_row(
2019+
position=var.position,
2020+
ancestral_state=ts.reference_sequence.data[var.position],
2021+
metadata=site_metadata,
2022+
)
2023+
site = tables.sites[site_id]
2024+
old_mutations = set()
2025+
20102026
g = mask_ambiguous(var.genotypes)
20112027

20122028
if np.all(g == -1):
@@ -2016,7 +2032,6 @@ def map_deletions(ts, ds, sites=None, *, show_progress=False):
20162032
g, list(var.alleles), ancestral_state=site.ancestral_state
20172033
)
20182034

2019-
old_mutations = set((mut.node, mut.derived_state) for mut in site.mutations)
20202035
new_mutations = set((mut.node, mut.derived_state) for mut in mutations)
20212036
inter = old_mutations & new_mutations
20222037
report_data.append(
@@ -2032,11 +2047,9 @@ def map_deletions(ts, ds, sites=None, *, show_progress=False):
20322047
f"Site {int(site.position)} "
20332048
f"old={len(old_mutations)} new={len(new_mutations)} inter={len(inter)}"
20342049
)
2035-
for mut in site.mutations:
2036-
keep_mutations[mut.id] = False
20372050
for m in mutations:
20382051
tables.mutations.add_row(
2039-
site=site.id,
2052+
site=site_id,
20402053
node=m.node,
20412054
derived_state=m.derived_state,
20422055
time=ts.nodes_time[m.node],
@@ -2055,10 +2068,10 @@ def map_deletions(ts, ds, sites=None, *, show_progress=False):
20552068
tables.build_index()
20562069
tables.compute_mutation_parents()
20572070
params = {"dataset": str(ds.path), "sites": sites.tolist()}
2058-
prov = get_provenance_dict("map_deletions", params, start_time)
2071+
prov = get_provenance_dict("map_parsimony", params, start_time)
20592072
tables.provenances.add_row(json.dumps(prov))
20602073

2061-
return MapDeletionsResult(tables.tree_sequence(), pd.DataFrame(report_data))
2074+
return MapParsimonyResult(tables.tree_sequence(), pd.DataFrame(report_data))
20622075

20632076

20642077
def append_exact_matches(ts, match_db, show_progress=False):

tests/test_cli.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -433,7 +433,7 @@ def test_example_args(self, tmp_path, fx_ts_map, fx_match_db):
433433
assert "pango" in dfn
434434

435435

436-
class TestMapDeletions:
436+
class TestMapParsimony:
437437

438438
def test_example(self, tmp_path, fx_ts_map, fx_dataset):
439439
ts = fx_ts_map["2020-02-13"]
@@ -446,7 +446,7 @@ def test_example(self, tmp_path, fx_ts_map, fx_dataset):
446446
runner = ct.CliRunner()
447447
result = runner.invoke(
448448
cli.cli,
449-
f"map-deletions {fx_dataset.path} {ts.path} --sites={del_sites_path} "
449+
f"map-parsimony {fx_dataset.path} {ts.path} --sites={del_sites_path} "
450450
f"{out_ts_path} --report={report_path}",
451451
catch_exceptions=False,
452452
)
@@ -466,7 +466,7 @@ def test_example_no_report(self, tmp_path, fx_ts_map, fx_dataset):
466466
runner = ct.CliRunner()
467467
result = runner.invoke(
468468
cli.cli,
469-
f"map-deletions {fx_dataset.path} {ts.path} --sites={del_sites_path} "
469+
f"map-parsimony {fx_dataset.path} {ts.path} --sites={del_sites_path} "
470470
f"{out_ts_path}",
471471
catch_exceptions=False,
472472
)
@@ -483,7 +483,7 @@ def test_example_no_sites(self, tmp_path, fx_ts_map, fx_dataset):
483483
runner = ct.CliRunner()
484484
result = runner.invoke(
485485
cli.cli,
486-
f"map-deletions {fx_dataset.path} {ts.path} {out_ts_path} "
486+
f"map-parsimony {fx_dataset.path} {ts.path} {out_ts_path} "
487487
f"--report={report_path}",
488488
catch_exceptions=False,
489489
)

tests/test_inference.py

Lines changed: 41 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1402,11 +1402,11 @@ def fx_ts_exact_matches(fx_ts_map, fx_match_db):
14021402
return tsp
14031403

14041404

1405-
class TestMapDeletions:
1405+
class TestMapParsimony:
14061406
def test_example(self, fx_ts_map, fx_dataset):
14071407
ts = fx_ts_map["2020-02-13"]
14081408
sites = [1547, 3951, 3952, 3953]
1409-
result = sc2ts.map_deletions(ts, fx_dataset, sites)
1409+
result = sc2ts.map_parsimony(ts, fx_dataset, sites)
14101410
report = result.report
14111411
assert list(report.site.values) == sites
14121412
assert np.all(report.old == 1)
@@ -1422,7 +1422,7 @@ def test_example(self, fx_ts_map, fx_dataset):
14221422
def test_example_minimised_md(self, fx_ts_min_2020_02_15, fx_dataset):
14231423
ts = fx_ts_min_2020_02_15
14241424
sites = [1547, 3951, 3952, 3953]
1425-
result = sc2ts.map_deletions(ts, fx_dataset, sites)
1425+
result = sc2ts.map_parsimony(ts, fx_dataset, sites)
14261426
report = result.report
14271427
assert list(report.site.values) == sites
14281428
assert np.all(report.old == 1)
@@ -1438,7 +1438,7 @@ def test_example_minimised_md(self, fx_ts_min_2020_02_15, fx_dataset):
14381438
def test_example_minimised_zero_sites(self, fx_ts_min_2020_02_15, fx_dataset):
14391439
ts = fx_ts_min_2020_02_15
14401440
sites = []
1441-
result = sc2ts.map_deletions(ts, fx_dataset, sites)
1441+
result = sc2ts.map_parsimony(ts, fx_dataset, sites)
14421442
report = result.report
14431443
assert report.shape[0] == 0
14441444
ts.tables.assert_equals(
@@ -1447,33 +1447,60 @@ def test_example_minimised_zero_sites(self, fx_ts_min_2020_02_15, fx_dataset):
14471447

14481448
def test_empty(self, fx_ts_map, fx_dataset):
14491449
ts = fx_ts_map["2020-02-13"]
1450-
result = sc2ts.map_deletions(ts, fx_dataset, [])
1450+
result = sc2ts.map_parsimony(ts, fx_dataset, [])
14511451
ts.tables.assert_equals(
14521452
result.tree_sequence.tables, ignore_metadata=True, ignore_provenance=True
14531453
)
14541454
assert result.report.shape[0] == 0
14551455

14561456
def test_empty_all_sites(self, fx_ts_map, fx_dataset):
14571457
ts = fx_ts_map["2020-02-13"]
1458-
result = sc2ts.map_deletions(ts, fx_dataset)
1458+
result = sc2ts.map_parsimony(ts, fx_dataset)
14591459
assert result.report.shape[0] == ts.num_sites
14601460

1461-
def test_missing_site(self, fx_ts_map, fx_dataset):
1461+
def test_missing_sites(self, fx_ts_map, fx_dataset):
14621462
ts = fx_ts_map["2020-02-13"]
14631463
missing_positions = [56, 57, 58, 59, 60]
14641464
assert len(set(missing_positions) & set(ts.sites_position.astype(int))) == 0
14651465

1466-
result = sc2ts.map_deletions(ts, fx_dataset, missing_positions)
1466+
result = sc2ts.map_parsimony(ts, fx_dataset, missing_positions)
1467+
assert (
1468+
len(
1469+
set(missing_positions)
1470+
& set(result.tree_sequence.sites_position.astype(int))
1471+
)
1472+
== 5
1473+
)
1474+
assert list(result.report.site) == missing_positions
1475+
assert np.all(result.report.old == 0)
1476+
assert np.all(result.report.new == 0)
1477+
assert np.all(result.report.intersection == 0)
1478+
1479+
def test_readd_existing_site(self, fx_ts_map, fx_dataset):
1480+
ts = fx_ts_map["2020-02-13"]
1481+
site = ts.site(position=203)
1482+
assert len(site.mutations) == 1
1483+
ts_del = ts.delete_sites([site.id])
1484+
1485+
result = sc2ts.map_parsimony(ts_del, fx_dataset, [site.position])
1486+
new_site = result.tree_sequence.site(position=site.position)
1487+
assert new_site.ancestral_state == site.ancestral_state
1488+
assert new_site.position == site.position
1489+
assert new_site.id == site.id
1490+
14671491
ts.tables.assert_equals(
14681492
result.tree_sequence.tables, ignore_metadata=True, ignore_provenance=True
14691493
)
1470-
assert result.report.shape[0] == 0
1494+
assert result.report.shape[0] == 1
1495+
assert np.all(result.report.old == 0)
1496+
assert np.all(result.report.new == 1)
1497+
assert np.all(result.report.intersection == 0)
14711498

14721499
def test_example_exact_matches(self, fx_ts_exact_matches, fx_dataset):
14731500
ts = fx_ts_exact_matches
14741501
sites = [1547, 3951, 3952, 3953]
1475-
result = sc2ts.map_deletions(ts, fx_dataset, sites)
1476-
result = sc2ts.map_deletions(ts, fx_dataset, sites)
1502+
result = sc2ts.map_parsimony(ts, fx_dataset, sites)
1503+
result = sc2ts.map_parsimony(ts, fx_dataset, sites)
14771504
report = result.report
14781505
assert list(report.site.values) == sites
14791506
assert np.all(report.old == 1)
@@ -1483,18 +1510,18 @@ def test_example_exact_matches(self, fx_ts_exact_matches, fx_dataset):
14831510
def test_validate(self, fx_ts_map, fx_dataset):
14841511
ts = fx_ts_map["2020-02-13"]
14851512
sites = [1547, 3951, 3952, 3953]
1486-
result = sc2ts.map_deletions(ts, fx_dataset, sites)
1513+
result = sc2ts.map_parsimony(ts, fx_dataset, sites)
14871514
sc2ts.validate(result.tree_sequence, fx_dataset, deletions_as_missing=False)
14881515

14891516
def test_provenance(self, fx_ts_map, fx_dataset):
14901517
ts = fx_ts_map["2020-02-13"]
14911518
sites = [1547, 3951, 3952, 3953]
1492-
result = sc2ts.map_deletions(ts, fx_dataset, sites)
1519+
result = sc2ts.map_parsimony(ts, fx_dataset, sites)
14931520
tsp = result.tree_sequence
14941521
assert tsp.num_provenances == ts.num_provenances + 1
14951522
prov = tsp.provenance(-1)
14961523
assert json.loads(prov.record)["parameters"] == {
1497-
"command": "map_deletions",
1524+
"command": "map_parsimony",
14981525
"dataset": str(fx_dataset.path),
14991526
"sites": sites,
15001527
}

0 commit comments

Comments
 (0)