Skip to content

Commit 3bc6bf2

Browse files
committed
fix for union vignette - closes #199
1 parent 5478b47 commit 3bc6bf2

File tree

6 files changed

+56
-44
lines changed

6 files changed

+56
-44
lines changed

docs/A.trees

-2.02 MB
Binary file not shown.

docs/B.trees

-2.11 MB
Binary file not shown.

docs/C.trees

-1.73 MB
Binary file not shown.

docs/I.trees

-1.89 MB
Binary file not shown.

docs/_static/phylo.tsv

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
child parent popsize edgelen
2-
root 5000 20000
3-
C root 500 2500
4-
I root 1000 2000
5-
B I 700 500
6-
A I 400 500
2+
root 500 2000
3+
C root 50 250
4+
I root 100 200
5+
B I 70 50
6+
A I 40 50

docs/vignette_parallel_phylo.md

Lines changed: 51 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ Here is python code that will write out a makefile from the information in ``df`
100100
f = open("sims.make", "w")
101101
print(f"all: {' '.join(df.outfile.to_list())}\n", file=f)
102102
for i, row in df.iterrows():
103-
print(f"{row.outfile}: {row.infile}", file=f)
103+
print(f"{row.outfile}: {row.infile} phylo_bgs.slim\n", file=f)
104104
print(f"\tslim -d \"infile='{row.infile}'\" -d popsize={row.popsize} -d num_gens={row.edgelen} -d \"outfile='{row.child}.trees'\" phylo_bgs.slim\n", file=f)
105105
f.close()
106106
```
@@ -117,13 +117,15 @@ cat sims.make
117117
With the makefile in hand,
118118
we can now run make, specifying the maximum number of simulations
119119
to be run simultaneously the ``-j``.
120-
121-
```python
120+
(Click on the "+" icon to see SLiM's output.)
121+
```{code-cell}
122+
:tags: ["hide-output"]
122123
%%bash
123124
make -f sims.make -j 3
124125
```
125126

126-
```{dropdown} Alternative to using make
127+
128+
```{dropdown} Click here for how to use python instead of make
127129
128130
You would have to write a recursion over the branches in your tree (starting
129131
from the root) and then parallelize the runs of sister branches somehow.
@@ -199,42 +201,46 @@ recursive function that goes through our data frame with the phylogeny and
199201
returns a dictionary with the merged tree sequences from the tip to the root.
200202

201203
```{code-cell}
202-
def union_recursion(df, focal="root", merged=None):
203-
if merged is None:
204-
merged = {
205-
row.child : (
206-
tskit.load(row.outfile),
207-
row.edgelen,
208-
[row.child]
209-
) for i, row in df[df.is_leaf].iterrows()
210-
}
211-
print(f"Going in: {focal}")
212-
childs = df[df.parent == focal]
213-
assert (len(childs) == 2) or (len(childs) == 0)
214-
if len(childs) == 2:
215-
for i, row in childs.iterrows():
216-
merged = union_recursion(df, row.child, merged)
217-
cname1 = childs.iloc[0,]["child"]
218-
cname2 = childs.iloc[1,]["child"]
219-
split_time = merged[cname1][1]
220-
assert split_time == merged[cname2][1] # ultrametric
221-
print(f'Unioning: {childs["child"].to_list()}, Split time: {split_time}')
222-
ts1 = merged[cname1][0]
223-
ts2 = merged[cname2][0]
224-
node_map = match_nodes(ts2, ts1, split_time)
225-
tsu = ts1.union(ts2, node_map, check_shared_equality=True)
204+
merged = {
205+
row.child : {
206+
"ts": tskit.load(row.outfile),
207+
"depth": row.edgelen,
208+
"children": [row.child]
209+
}
210+
for i, row in df[df.is_leaf].iterrows()
211+
}
212+
213+
def union_children(parent, df, merged):
214+
print(f"Going in: {parent}")
215+
child_rows = df[df.parent == parent]
216+
assert (len(child_rows) == 2) or (len(childs) == 0)
217+
if len(child_rows) == 2:
218+
children = [row.child for _, row in child_rows.iterrows()]
219+
for child in children:
220+
if child not in merged:
221+
union_children(child, df, merged)
222+
split_time = merged[children[0]]["depth"]
223+
assert split_time == merged[children[1]]["depth"] # ultrametric
224+
print(f'Unioning: {children}, Split time: {split_time}')
225+
ts0 = merged[children[0]]["ts"]
226+
ts1 = merged[children[1]]["ts"]
227+
node_map = match_nodes(ts1, ts0, split_time)
228+
tsu = pyslim.SlimTreeSequence(
229+
ts0.union(ts1, node_map, check_shared_equality=True)
230+
)
226231
# the time from tip to start of simulation is split_time plus the
227232
# length of the edge
228-
merged[focal] = (
229-
tsu,
230-
split_time + df[df.child==focal].edgelen.item(),
231-
merged[cname1][2] + merged[cname2][2]
232-
)
233-
return merged
233+
parent_edgelength = df[df.child==parent].edgelen.item()
234+
merged[parent] = {
235+
"ts": tsu,
236+
"depth": split_time + parent_edgelength,
237+
"children": merged[children[0]]["children"] + merged[children[1]]["children"]
238+
}
234239
235-
merged = union_recursion(df)
240+
union_children("root", df, merged)
236241
# union of all three species tree sequences is in the root.
237-
tsu, _, pops = merged["root"]
242+
tsu = merged["root"]["ts"]
243+
pops = merged["root"]["children"]
238244
```
239245

240246
A slightly tricky thing we had to do there was to make sure we kept track of
@@ -245,19 +251,25 @@ contains the names of the populations
245251
in the order in which they are indexed in the resulting tree sequence ``tsu``.
246252
(Since the population in SLiM is ``p1``, the zero-th population will be unused.)
247253

248-
Let's make sure we have the right number of samples in each of the populations
249-
we specified.
254+
Let's make sure we have the right number of present-day samples
255+
in each of the populations. To do this we need to make sure to get
256+
"alive" samples, because recall that we have saved the state of the
257+
population at each species split time.
250258

251259
```{code-cell}
260+
alive = np.where(np.isclose(tsu.tables.nodes.time, 0))[0]
252261
for i, name in enumerate(pops):
253-
n_samples = len(tsu.samples(i + 1)) // 2
262+
pop_samples = tsu.samples(i + 1)
263+
n_samples = sum(np.isin(pop_samples, alive)) // 2
254264
print(f"Union-ed tree sequence has {n_samples} samples in population {name},\n"
255265
f"\tand we specified {df[df.child==name].popsize.item()} individuals in our simulations.")
266+
assert n_samples == df[df.child==name].popsize.item()
256267
```
257268

258269
Finally, we should recapitate the result,
259270
in case some trees on the root branch haven't coalesced,
260271
and write out the result:
272+
261273
```{code-cell}
262274
tsu = pyslim.SlimTreeSequence(tsu)
263275
tsu = tsu.recapitate(recombination_rate=1e-8)

0 commit comments

Comments
 (0)