Skip to content

Commit a1fa4b2

Browse files
Merge pull request #508 from jeromekelleher/new-recombinants-summary
Updates to recombinants_summary dataframe
2 parents 070a729 + 33417ff commit a1fa4b2

File tree

2 files changed

+72
-46
lines changed

2 files changed

+72
-46
lines changed

sc2ts/info.py

Lines changed: 65 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,6 @@ def node_mutations(self):
240240
muts[site.position] = f"{state0}>{state1}"
241241
return muts
242242

243-
244243
def __init__(
245244
self,
246245
ts,
@@ -258,10 +257,13 @@ def __init__(
258257
self.node = node
259258
if edges is None: # the required edge table wasn't given, so recalculate
260259
edges = tskit.EdgeTable()
261-
for e in sorted([ts.edge(i) for i in np.where(ts.edges_child==node)[0]], key=lambda e: e.left):
260+
for e in sorted(
261+
[ts.edge(i) for i in np.where(ts.edges_child == node)[0]],
262+
key=lambda e: e.left,
263+
):
262264
edges.append(e)
263265
self.edges = edges
264-
266+
265267
def html(
266268
self,
267269
show_bases=True,
@@ -278,7 +280,7 @@ def html(
278280
using the ``IPython.display.HTML`` function.
279281
280282
:param ts TreeSequence:
281-
The tree sequence to which the nodes refer
283+
The tree sequence to which the nodes refer
282284
:param node int:
283285
The node ID of the child node, usually a recombination node.
284286
This will be placed on the second row of the copying pattern, so that
@@ -311,6 +313,7 @@ def html(
311313
document (e.g. a Jupyter notebook) that already has one copying table shown with
312314
the standard stylesheet. If False or None (default), include the default stylesheet.
313315
"""
316+
314317
def row_lab(txt):
315318
return "" if hide_labels else f"<th>{txt}</th>"
316319

@@ -448,11 +451,12 @@ def __init__(
448451
quick=False,
449452
show_progress=True,
450453
pango_source="Viridian_pangolin",
454+
scorpio_source="Viridian_scorpio",
451455
sample_group_id_prefix_len=10,
452456
):
453457
self.ts = ts
454458
self.pango_source = pango_source
455-
self.scorpio_source = "Viridian_scorpio"
459+
self.scorpio_source = scorpio_source
456460
self.strain_map = {}
457461
self.recombinants = np.where(ts.nodes_flags == core.NODE_IS_RECOMBINANT)[0]
458462

@@ -967,11 +971,25 @@ def recombinants_summary(
967971
):
968972
if parent_pango_source is None:
969973
parent_pango_source = self.pango_source
974+
975+
def node_info(node, label):
976+
datum = {label: node}
977+
datum[f"{label}_pango"] = self.nodes_metadata[node].get(
978+
self.pango_source, "Unknown"
979+
)
980+
datum[f"{label}_scorpio"] = self.nodes_metadata[node].get(
981+
self.scorpio_source, "Unknown"
982+
)
983+
datum[f"{label}_time"] = self.ts.nodes_time[node]
984+
datum[f"{label}_date"] = self.nodes_date[node]
985+
return datum
986+
970987
data = []
971988
for u in self.recombinants:
972989
md = dict(self.nodes_metadata[u]["sc2ts"])
973990
group_id = md["group_id"][: self.sample_group_id_prefix_len]
974991
md["group_id"] = group_id
992+
975993
group_nodes = self.sample_group_nodes[group_id]
976994
md["group_size"] = len(group_nodes)
977995

@@ -983,13 +1001,17 @@ def recombinants_summary(
9831001
causal_lineages = {}
9841002
hmm_matches = []
9851003
breakpoint_intervals = []
1004+
copying_path_mutations = collections.defaultdict(list)
9861005
for v in samples:
987-
causal_lineages[v] = self.nodes_metadata[v].get(
988-
self.pango_source, "Unknown"
989-
)
990-
991-
# Arbitrarily pick the first sample node as the representative
992-
v = samples[0]
1006+
sample_md = self.nodes_metadata[v]
1007+
causal_lineages[v] = sample_md.get(self.pango_source, "Unknown")
1008+
hmm_mutations = len(sample_md["sc2ts"]["hmm_match"]["mutations"])
1009+
copying_path_mutations[hmm_mutations].append(v)
1010+
1011+
min_mutations = min(copying_path_mutations.keys())
1012+
# Choose our representative sample as one of the ones that have the
1013+
# fewest mutations in it's copying path.
1014+
v = copying_path_mutations[min_mutations][0]
9931015
node_md = self.nodes_metadata[v]["sc2ts"]
9941016
hmm_matches.append(node_md["hmm_match"])
9951017
breakpoint_intervals.append(node_md["breakpoint_intervals"])
@@ -1003,30 +1025,33 @@ def recombinants_summary(
10031025
interval = breakpoint_intervals[0]
10041026
parent_left = hmm_match["path"][0]["parent"]
10051027
parent_right = hmm_match["path"][1]["parent"]
1006-
data.append(
1007-
{
1008-
"recombinant": u,
1009-
"descendants": self.nodes_max_descendant_samples[u],
1010-
"sample": v,
1011-
"sample_pango": causal_lineages[v],
1012-
"num_samples": len(samples),
1013-
"distinct_sample_pango": len(set(causal_lineages.values())),
1014-
"interval_left": interval[0][0],
1015-
"interval_right": interval[0][1],
1016-
"parent_left": parent_left,
1017-
"parent_right": parent_right,
1018-
"parent_left_pango": self.nodes_metadata[parent_left].get(
1019-
parent_pango_source,
1020-
"Unknown",
1021-
),
1022-
"parent_right_pango": self.nodes_metadata[parent_right].get(
1023-
parent_pango_source,
1024-
"Unknown",
1025-
),
1026-
"num_mutations": len(hmm_match["mutations"]),
1027-
**md,
1028-
}
1029-
)
1028+
1029+
datum = {
1030+
"num_descendant_samples": self.nodes_max_descendant_samples[u],
1031+
"num_samples": len(samples),
1032+
"distinct_sample_pango": len(set(causal_lineages.values())),
1033+
"interval_left": interval[0][0],
1034+
"interval_right": interval[0][1],
1035+
"num_mutations": len(hmm_match["mutations"]),
1036+
"Viridian_amplicon_scheme": self.nodes_metadata[v].get(
1037+
"Viridian_amplicon_scheme", "Unknown"
1038+
),
1039+
"Artic_primer_version": self.nodes_metadata[v].get(
1040+
"Artic_primer_version", "Unknown"
1041+
),
1042+
**md,
1043+
}
1044+
1045+
for node, label in [
1046+
(u, "recombinant"),
1047+
(v, "sample"),
1048+
(parent_left, "parent_left"),
1049+
(parent_right, "parent_right"),
1050+
]:
1051+
datum = {**datum, **node_info(node, label)}
1052+
1053+
data.append(datum)
1054+
10301055
# Compute the MRCAs by iterating along trees in order of
10311056
# breakpoint. We use the right interval
10321057
df = pd.DataFrame(data).sort_values("interval_right")
@@ -1043,10 +1068,11 @@ def recombinants_summary(
10431068
left_path = jit.get_root_path(tree, row.parent_left)
10441069
assert tree.parent(row.recombinant) == row.parent_left
10451070
mrca = jit.get_path_mrca(left_path, right_path, self.ts.nodes_time)
1046-
mrca_data.append(mrca)
1047-
mrca_data = np.array(mrca_data)
1048-
df["mrca"] = mrca_data
1049-
df["t_mrca"] = self.ts.nodes_time[mrca_data]
1071+
mrca_data.append(node_info(mrca, "parent_mrca"))
1072+
1073+
mrca_df = pd.DataFrame(mrca_data)
1074+
for col in mrca_df:
1075+
df[col] = mrca_df[col]
10501076

10511077
if characterise_copying:
10521078
# Slow - don't do this unless we really want to.

tests/test_info.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ def test_recombinants_summary_example_1(self, fx_ti_recombinant_example_1):
176176
df = fx_ti_recombinant_example_1.recombinants_summary()
177177
assert df.shape[0] == 1
178178
row = df.iloc[0]
179-
assert row.descendants == 2
179+
assert row.num_descendant_samples == 2
180180
assert row["sample"] == 53
181181
assert row.num_samples == 2
182182
assert row.group_size == 3
@@ -189,8 +189,8 @@ def test_recombinants_summary_example_1(self, fx_ti_recombinant_example_1):
189189
assert row.parent_right == 46
190190
assert row.parent_right_pango == "Unknown"
191191
assert row.num_mutations == 0
192-
assert row.mrca == 1
193-
assert row.t_mrca == 51
192+
assert row.parent_mrca == 1
193+
assert row.parent_mrca_time == 51
194194
assert "diffs" not in df
195195

196196
df2 = fx_ti_recombinant_example_1.recombinants_summary(
@@ -206,17 +206,19 @@ def test_recombinants_summary_example_2(self, fx_recombinant_example_2):
206206
df = ti.recombinants_summary(characterise_copying=True, show_progress=False)
207207
assert df.shape[0] == 1
208208
row = df.iloc[0]
209-
assert row.descendants == 1
209+
assert row.num_descendant_samples == 1
210210
assert row["sample"] == 55
211211
assert row["distinct_sample_pango"] == 1
212212
assert row["recombinant"] == 56
213+
assert row["recombinant_pango"] == "Unknown"
214+
assert row["recombinant_time"] == 0.000001
213215
assert row["sample_pango"] == "Unknown"
214216
assert row["num_mutations"] == 0
215217
assert row["parent_left"] == 53
216218
assert row["parent_left_pango"] == "Unknown"
217219
assert row["parent_right"] == 54
218220
assert row["parent_right_pango"] == "Unknown"
219-
assert row["mrca"] == 48
221+
assert row["parent_mrca"] == 48
220222
assert row["group_size"] == 2
221223
assert row["diffs"] == 6
222224
assert row["max_run_length"] == 2
@@ -243,8 +245,6 @@ def test_example_node(self, fx_ts_min_2020_02_15, fx_ti_2020_02_15):
243245
nt.assert_array_equal(
244246
ti.nodes_max_descendant_samples, df["max_descendant_samples"]
245247
)
246-
print(ti.nodes_date.dtype)
247-
print(df["date"].dtype)
248248
nt.assert_array_equal(ti.nodes_date, df["date"])
249249
assert list(np.where(df["is_recombinant"])[0]) == list(ti.recombinants)
250250
assert list(np.where(df["is_sample"])[0]) == list(ts.samples())

0 commit comments

Comments
 (0)