Skip to content

Commit 33417ff

Browse files
Add more systematic base info for nodes in recombinant report
1 parent cb20605 commit 33417ff

File tree

2 files changed

+54
-36
lines changed

2 files changed

+54
-36
lines changed

sc2ts/info.py

Lines changed: 47 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -451,11 +451,12 @@ def __init__(
451451
quick=False,
452452
show_progress=True,
453453
pango_source="Viridian_pangolin",
454+
scorpio_source="Viridian_scorpio",
454455
sample_group_id_prefix_len=10,
455456
):
456457
self.ts = ts
457458
self.pango_source = pango_source
458-
self.scorpio_source = "Viridian_scorpio"
459+
self.scorpio_source = scorpio_source
459460
self.strain_map = {}
460461
self.recombinants = np.where(ts.nodes_flags == core.NODE_IS_RECOMBINANT)[0]
461462

@@ -970,6 +971,19 @@ def recombinants_summary(
970971
):
971972
if parent_pango_source is None:
972973
parent_pango_source = self.pango_source
974+
975+
def node_info(node, label):
976+
datum = {label: node}
977+
datum[f"{label}_pango"] = self.nodes_metadata[node].get(
978+
self.pango_source, "Unknown"
979+
)
980+
datum[f"{label}_scorpio"] = self.nodes_metadata[node].get(
981+
self.scorpio_source, "Unknown"
982+
)
983+
datum[f"{label}_time"] = self.ts.nodes_time[node]
984+
datum[f"{label}_date"] = self.nodes_date[node]
985+
return datum
986+
973987
data = []
974988
for u in self.recombinants:
975989
md = dict(self.nodes_metadata[u]["sc2ts"])
@@ -1011,30 +1025,33 @@ def recombinants_summary(
10111025
interval = breakpoint_intervals[0]
10121026
parent_left = hmm_match["path"][0]["parent"]
10131027
parent_right = hmm_match["path"][1]["parent"]
1014-
data.append(
1015-
{
1016-
"recombinant": u,
1017-
"descendants": self.nodes_max_descendant_samples[u],
1018-
"sample": v,
1019-
"sample_pango": causal_lineages[v],
1020-
"num_samples": len(samples),
1021-
"distinct_sample_pango": len(set(causal_lineages.values())),
1022-
"interval_left": interval[0][0],
1023-
"interval_right": interval[0][1],
1024-
"parent_left": parent_left,
1025-
"parent_right": parent_right,
1026-
"parent_left_pango": self.nodes_metadata[parent_left].get(
1027-
parent_pango_source,
1028-
"Unknown",
1029-
),
1030-
"parent_right_pango": self.nodes_metadata[parent_right].get(
1031-
parent_pango_source,
1032-
"Unknown",
1033-
),
1034-
"num_mutations": len(hmm_match["mutations"]),
1035-
**md,
1036-
}
1037-
)
1028+
1029+
datum = {
1030+
"num_descendant_samples": self.nodes_max_descendant_samples[u],
1031+
"num_samples": len(samples),
1032+
"distinct_sample_pango": len(set(causal_lineages.values())),
1033+
"interval_left": interval[0][0],
1034+
"interval_right": interval[0][1],
1035+
"num_mutations": len(hmm_match["mutations"]),
1036+
"Viridian_amplicon_scheme": self.nodes_metadata[v].get(
1037+
"Viridian_amplicon_scheme", "Unknown"
1038+
),
1039+
"Artic_primer_version": self.nodes_metadata[v].get(
1040+
"Artic_primer_version", "Unknown"
1041+
),
1042+
**md,
1043+
}
1044+
1045+
for node, label in [
1046+
(u, "recombinant"),
1047+
(v, "sample"),
1048+
(parent_left, "parent_left"),
1049+
(parent_right, "parent_right"),
1050+
]:
1051+
datum = {**datum, **node_info(node, label)}
1052+
1053+
data.append(datum)
1054+
10381055
# Compute the MRCAs by iterating along trees in order of
10391056
# breakpoint. We use the right interval
10401057
df = pd.DataFrame(data).sort_values("interval_right")
@@ -1051,10 +1068,11 @@ def recombinants_summary(
10511068
left_path = jit.get_root_path(tree, row.parent_left)
10521069
assert tree.parent(row.recombinant) == row.parent_left
10531070
mrca = jit.get_path_mrca(left_path, right_path, self.ts.nodes_time)
1054-
mrca_data.append(mrca)
1055-
mrca_data = np.array(mrca_data)
1056-
df["mrca"] = mrca_data
1057-
df["t_mrca"] = self.ts.nodes_time[mrca_data]
1071+
mrca_data.append(node_info(mrca, "parent_mrca"))
1072+
1073+
mrca_df = pd.DataFrame(mrca_data)
1074+
for col in mrca_df:
1075+
df[col] = mrca_df[col]
10581076

10591077
if characterise_copying:
10601078
# Slow - don't do this unless we really want to.

tests/test_info.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ def test_recombinants_summary_example_1(self, fx_ti_recombinant_example_1):
176176
df = fx_ti_recombinant_example_1.recombinants_summary()
177177
assert df.shape[0] == 1
178178
row = df.iloc[0]
179-
assert row.descendants == 2
179+
assert row.num_descendant_samples == 2
180180
assert row["sample"] == 53
181181
assert row.num_samples == 2
182182
assert row.group_size == 3
@@ -189,8 +189,8 @@ def test_recombinants_summary_example_1(self, fx_ti_recombinant_example_1):
189189
assert row.parent_right == 46
190190
assert row.parent_right_pango == "Unknown"
191191
assert row.num_mutations == 0
192-
assert row.mrca == 1
193-
assert row.t_mrca == 51
192+
assert row.parent_mrca == 1
193+
assert row.parent_mrca_time == 51
194194
assert "diffs" not in df
195195

196196
df2 = fx_ti_recombinant_example_1.recombinants_summary(
@@ -206,17 +206,19 @@ def test_recombinants_summary_example_2(self, fx_recombinant_example_2):
206206
df = ti.recombinants_summary(characterise_copying=True, show_progress=False)
207207
assert df.shape[0] == 1
208208
row = df.iloc[0]
209-
assert row.descendants == 1
209+
assert row.num_descendant_samples == 1
210210
assert row["sample"] == 55
211211
assert row["distinct_sample_pango"] == 1
212212
assert row["recombinant"] == 56
213+
assert row["recombinant_pango"] == "Unknown"
214+
assert row["recombinant_time"] == 0.000001
213215
assert row["sample_pango"] == "Unknown"
214216
assert row["num_mutations"] == 0
215217
assert row["parent_left"] == 53
216218
assert row["parent_left_pango"] == "Unknown"
217219
assert row["parent_right"] == 54
218220
assert row["parent_right_pango"] == "Unknown"
219-
assert row["mrca"] == 48
221+
assert row["parent_mrca"] == 48
220222
assert row["group_size"] == 2
221223
assert row["diffs"] == 6
222224
assert row["max_run_length"] == 2
@@ -243,8 +245,6 @@ def test_example_node(self, fx_ts_min_2020_02_15, fx_ti_2020_02_15):
243245
nt.assert_array_equal(
244246
ti.nodes_max_descendant_samples, df["max_descendant_samples"]
245247
)
246-
print(ti.nodes_date.dtype)
247-
print(df["date"].dtype)
248248
nt.assert_array_equal(ti.nodes_date, df["date"])
249249
assert list(np.where(df["is_recombinant"])[0]) == list(ti.recombinants)
250250
assert list(np.where(df["is_sample"])[0]) == list(ts.samples())

0 commit comments

Comments
 (0)