Skip to content

Commit fadb87c

Browse files
timothymillarmergify[bot]
authored andcommitted
Improve H&K genedrop performance
1 parent 65698d8 commit fadb87c

File tree

3 files changed

+459
-174
lines changed

3 files changed

+459
-174
lines changed

sgkit/stats/genedrop.py

Lines changed: 6 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -7,138 +7,19 @@
77
from xarray import Dataset
88

99
from sgkit import variables
10-
from sgkit.accelerate import numba_guvectorize, numba_jit
1110
from sgkit.typing import ArrayLike
1211
from sgkit.utils import (
1312
conditional_merge_datasets,
1413
create_dataset,
1514
define_variable_if_absent,
1615
)
1716

18-
from .pedigree import (
19-
_compress_hamilton_kerr_parameters,
20-
parent_indices,
21-
topological_argsort,
22-
)
17+
from .pedigree import parent_indices
2318

2419
EST_DIPLOID = "diploid"
2520
EST_HAMILTON_KERR = "Hamilton-Kerr"
2621

2722

28-
@numba_guvectorize( # type: ignore
29-
[
30-
"void(int8[:,:], int64[:,:], uint32, int8[:,:])",
31-
"void(int16[:,:], int64[:,:], uint32, int16[:,:])",
32-
"void(int32[:,:], int64[:,:], uint32, int32[:,:])",
33-
"void(int64[:,:], int64[:,:], uint32, int64[:,:])",
34-
],
35-
"(n,k),(n,p),()->(n,k)",
36-
)
37-
def genedrop_diploid(
38-
genotypes: ArrayLike,
39-
parent: ArrayLike,
40-
seed: ArrayLike,
41-
out: ArrayLike,
42-
) -> None: # pragma: no cover
43-
n_sample, n_parent = parent.shape
44-
_, ploidy = genotypes.shape
45-
if n_parent != 2:
46-
raise ValueError("The parents dimension must be length 2")
47-
if ploidy != 2:
48-
raise ValueError("Genotypes are not diploid")
49-
order = topological_argsort(parent)
50-
np.random.seed(seed)
51-
for i in range(n_sample):
52-
t = order[i]
53-
unknown_parent = 0
54-
for j in range(n_parent):
55-
p = parent[t, j]
56-
if p < 0:
57-
# founder
58-
unknown_parent += 1
59-
else:
60-
idx = np.random.randint(2)
61-
out[t, j] = out[p, idx]
62-
if unknown_parent == 1:
63-
raise ValueError("Pedigree contains half-founders")
64-
elif unknown_parent == 2:
65-
# copy founder
66-
out[t, 0] = genotypes[t, 0]
67-
out[t, 1] = genotypes[t, 1]
68-
69-
70-
@numba_jit(nogil=True)
71-
def _random_gamete_Hamilton_Kerr(
72-
genotype: ArrayLike, ploidy: int, tau: int, lambda_: float
73-
) -> ArrayLike:
74-
if ploidy < len(genotype):
75-
# remove fill values
76-
genotype = genotype[genotype > -2]
77-
if ploidy != len(genotype):
78-
raise ValueError("Genotype ploidy does not match number of alleles")
79-
if tau > ploidy:
80-
# TODO: this can be an option for encoding somatic genome duplication (with suitable lambda)
81-
raise NotImplementedError("Gamete tau cannot exceed parental ploidy")
82-
gamete = np.random.choice(genotype, tau, replace=False)
83-
if lambda_ > 0.0:
84-
if tau == 2:
85-
if np.random.rand() <= lambda_:
86-
gamete[1] = gamete[0]
87-
elif tau != 2:
88-
raise NotImplementedError("Non-zero lambda is only implemented for tau = 2")
89-
return gamete
90-
91-
92-
@numba_guvectorize( # type: ignore
93-
[
94-
"void(int8[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int8[:,:])",
95-
"void(int16[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int16[:,:])",
96-
"void(int32[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int32[:,:])",
97-
"void(int64[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int64[:,:])",
98-
],
99-
"(n,k),(n,p),(n,p),(n,p),()->(n,k)",
100-
)
101-
def genedrop_Hamilton_Kerr(
102-
genotypes: ArrayLike,
103-
parent: ArrayLike,
104-
tau: ArrayLike,
105-
lambda_: ArrayLike,
106-
seed: int,
107-
out: ArrayLike,
108-
) -> None: # pragma: no cover
109-
if parent.shape[1] != 2:
110-
parent, tau, lambda_ = _compress_hamilton_kerr_parameters(parent, tau, lambda_)
111-
n_sample, n_parent = parent.shape
112-
_, max_ploidy = genotypes.shape
113-
order = topological_argsort(parent)
114-
np.random.seed(seed)
115-
for i in range(n_sample):
116-
t = order[i]
117-
alleles = 0
118-
unknown_alleles = 0
119-
for j in range(n_parent):
120-
p = parent[t, j]
121-
tau_p = tau[t, j]
122-
lambda_p = lambda_[t, j]
123-
if p < 0:
124-
unknown_alleles += tau_p
125-
elif tau_p > 0:
126-
ploidy_p = tau[p, 0] + tau[p, 1]
127-
gamete = _random_gamete_Hamilton_Kerr(out[p], ploidy_p, tau_p, lambda_p)
128-
out[t, alleles : alleles + tau_p] = gamete
129-
alleles += tau_p
130-
if unknown_alleles == 0:
131-
if alleles < max_ploidy:
132-
# pad with fill value
133-
out[t, alleles:] = -2
134-
elif unknown_alleles == alleles:
135-
# founder
136-
out[t] = genotypes[t]
137-
else:
138-
# partial founder
139-
raise ValueError("Pedigree contains half-founders")
140-
141-
14223
def simulate_genedrop(
14324
ds: Dataset,
14425
*,
@@ -165,7 +46,7 @@ def simulate_genedrop(
16546
which is only suitable for pedigrees in which all samples are
16647
diploids resulting from sexual reproduction.
16748
The "Hamilton-Kerr" method is suitable for autopolyploid and
168-
mixed-ploidy datasets following Kerr et al. 2012 and [2]
49+
mixed-ploidy datasets following Kerr et al. 2012 [2] and
16950
Hamilton and Kerr 2017 [3].
17051
call_genotype
17152
Input variable name holding call_genotype as defined by
@@ -217,8 +98,8 @@ def simulate_genedrop(
21798
If the Hamilton-Kerr method is used and a sample has more than
21899
two contributing parents.
219100
ValueError
220-
If the Hamilton-Kerr method is used and the number of alleles
221-
in a genotype does not match the sum of tau values (i.e., ploidy).
101+
If the Hamilton-Kerr method is used and the number of alleles in a
102+
founder genotype does not match the sum of its tau values (i.e., ploidy).
222103
NotImplementedError
223104
If the Hamilton-Kerr method is used and a tau value exceeds the
224105
parental ploidy.
@@ -295,6 +176,8 @@ def simulate_genedrop(
295176
"Computation of the inverse additive relationship matrix for autopolyploid
296177
and multiple-ploidy populations." Theoretical and Applied Genetics 131: 851-860.
297178
"""
179+
from .genedrop_numba_fns import genedrop_diploid, genedrop_Hamilton_Kerr
180+
298181
ds = define_variable_if_absent(ds, variables.parent, parent, parent_indices)
299182
variables.validate(
300183
ds, {parent: variables.parent_spec, call_genotype: variables.call_genotype_spec}

sgkit/stats/genedrop_numba_fns.py

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
import numpy as np
2+
3+
from sgkit.accelerate import numba_guvectorize, numba_jit
4+
from sgkit.typing import ArrayLike
5+
6+
from .pedigree import _compress_hamilton_kerr_parameters, topological_argsort
7+
8+
9+
@numba_guvectorize( # type: ignore
10+
[
11+
"void(int8[:,:], int64[:,:], uint32, int8[:,:])",
12+
"void(int16[:,:], int64[:,:], uint32, int16[:,:])",
13+
"void(int32[:,:], int64[:,:], uint32, int32[:,:])",
14+
"void(int64[:,:], int64[:,:], uint32, int64[:,:])",
15+
],
16+
"(n,k),(n,p),()->(n,k)",
17+
)
18+
def genedrop_diploid(
19+
genotypes: ArrayLike,
20+
parent: ArrayLike,
21+
seed: ArrayLike,
22+
out: ArrayLike,
23+
) -> None: # pragma: no cover
24+
n_sample, n_parent = parent.shape
25+
_, ploidy = genotypes.shape
26+
if n_parent != 2:
27+
raise ValueError("The parents dimension must be length 2")
28+
if ploidy != 2:
29+
raise ValueError("Genotypes are not diploid")
30+
order = topological_argsort(parent)
31+
np.random.seed(seed)
32+
for i in range(n_sample):
33+
t = order[i]
34+
unknown_parent = 0
35+
for j in range(n_parent):
36+
p = parent[t, j]
37+
if p < 0:
38+
# founder
39+
unknown_parent += 1
40+
else:
41+
idx = np.random.randint(2)
42+
out[t, j] = out[p, idx]
43+
if unknown_parent == 1:
44+
raise ValueError("Pedigree contains half-founders")
45+
elif unknown_parent == 2:
46+
# copy founder
47+
out[t, 0] = genotypes[t, 0]
48+
out[t, 1] = genotypes[t, 1]
49+
50+
51+
@numba_jit(nogil=True)
52+
def _random_inheritance_Hamilton_Kerr(
53+
genotypes: ArrayLike,
54+
parent: ArrayLike,
55+
tau: ArrayLike,
56+
lambda_: ArrayLike,
57+
marked: ArrayLike,
58+
i: int,
59+
):
60+
_, n_parent = parent.shape
61+
_, max_ploidy = genotypes.shape
62+
next_allele = 0
63+
ploidy_i = 0
64+
for j in range(n_parent):
65+
p = parent[i, j]
66+
tau_p = tau[i, j]
67+
ploidy_i += tau_p
68+
if p < 0:
69+
# unknown parent
70+
continue
71+
lambda_p = lambda_[i, j]
72+
ploidy_p = tau[p, 0] + tau[p, 1]
73+
if tau_p > ploidy_p:
74+
raise NotImplementedError("Gamete tau cannot exceed parental ploidy.")
75+
if lambda_p > 0.0:
76+
if tau_p != 2:
77+
raise NotImplementedError(
78+
"Non-zero lambda is only implemented for tau = 2."
79+
)
80+
homozygous_gamete = np.random.rand() < lambda_p
81+
else:
82+
homozygous_gamete = False
83+
if homozygous_gamete:
84+
# diploid gamete with duplicated allele
85+
uniform = np.random.rand()
86+
choice = int(uniform * ploidy_p)
87+
for k in range(max_ploidy):
88+
parent_allele = genotypes[p, k]
89+
if parent_allele < -1:
90+
# non-allele
91+
pass
92+
elif choice > 0:
93+
# not the chosen allele
94+
choice -= 1
95+
else:
96+
# chosen allele is duplicated
97+
genotypes[i, next_allele] = parent_allele
98+
genotypes[i, next_allele + 1] = parent_allele
99+
next_allele += 2
100+
break
101+
else:
102+
# random alleles without replacement
103+
marked[:] = False
104+
for h in range(tau_p):
105+
uniform = np.random.rand()
106+
scale = ploidy_p - h
107+
choice = int(uniform * scale)
108+
k = 0
109+
while choice >= 0:
110+
parent_allele = genotypes[p, k]
111+
if marked[k] > 0:
112+
# already inherited
113+
pass
114+
elif parent_allele < -1:
115+
# non-allele
116+
pass
117+
elif choice > 0:
118+
# not the chosen allele
119+
choice -= 1
120+
else:
121+
# chosen allele
122+
genotypes[i, next_allele] = parent_allele
123+
marked[k] = True
124+
next_allele += 1
125+
choice -= 1
126+
k += 1
127+
if next_allele == 0:
128+
# full founder requires ploidy validation
129+
alleles_i = 0
130+
for k in range(max_ploidy):
131+
if genotypes[i, k] >= -1:
132+
alleles_i += 1
133+
if alleles_i != ploidy_i:
134+
raise ValueError("Genotype ploidy does not match number of alleles.")
135+
elif next_allele != ploidy_i:
136+
raise ValueError("Pedigree contains half-founders.")
137+
elif next_allele < max_ploidy:
138+
# pad with non-alleles
139+
genotypes[i, next_allele:] = -2
140+
141+
142+
@numba_guvectorize( # type: ignore
143+
[
144+
"void(int8[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int8[:,:])",
145+
"void(int16[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int16[:,:])",
146+
"void(int32[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int32[:,:])",
147+
"void(int64[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int64[:,:])",
148+
],
149+
"(n,k),(n,p),(n,p),(n,p),()->(n,k)",
150+
)
151+
def genedrop_Hamilton_Kerr(
152+
genotypes: ArrayLike,
153+
parent: ArrayLike,
154+
tau: ArrayLike,
155+
lambda_: ArrayLike,
156+
seed: int,
157+
out: ArrayLike,
158+
) -> None: # pragma: no cover
159+
if parent.shape[1] != 2:
160+
parent, tau, lambda_ = _compress_hamilton_kerr_parameters(parent, tau, lambda_)
161+
out[:] = genotypes
162+
n_sample, _ = parent.shape
163+
_, max_ploidy = genotypes.shape
164+
order = topological_argsort(parent)
165+
marked = np.zeros(max_ploidy, dtype=np.bool8)
166+
np.random.seed(seed)
167+
for idx in range(n_sample):
168+
i = order[idx]
169+
_random_inheritance_Hamilton_Kerr(out, parent, tau, lambda_, marked, i)

0 commit comments

Comments
 (0)