Skip to content

Commit 65698d8

Browse files
timothymillarmergify[bot]
authored andcommitted
Add genedrop simulation #1107
1 parent 298fbbc commit 65698d8

File tree

6 files changed

+714
-1
lines changed

6 files changed

+714
-1
lines changed

docs/api.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ Miscellaneous
162162
maximal_independent_set
163163
pairwise_distance
164164

165+
165166
Utilities
166167
=========
167168

@@ -179,6 +180,7 @@ Utilities
179180
invert_relationship_matrix
180181
parent_indices
181182
pedigree_sel
183+
simulate_genedrop
182184
simulate_genotype_call_dataset
183185
window_by_genome
184186
window_by_interval

docs/changelog.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ New Features
2121
(:user:`timothymillar`, :pr:`1104`, :issue:`1097`)
2222
- Add option to count variant alleles directly from call genotypes in function :func:`count_variant_alleles`.
2323
(:user:`timothymillar`, :pr:`1119`, :issue:`1116`)
24+
- Add :func:`simulate_genedrop` function.
25+
(:user:`timothymillar`, :pr:`1139`, :issue:`1107`)
2426

2527
.. Breaking changes
2628
.. ~~~~~~~~~~~~~~~~

sgkit/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
)
2929
from .stats.association import gwas_linear_regression, regenie_loco_regression
3030
from .stats.conversion import convert_call_to_index, convert_probability_to_call
31+
from .stats.genedrop import simulate_genedrop
3132
from .stats.genee import genee
3233
from .stats.grm import (
3334
genomic_relationship,
@@ -123,6 +124,7 @@
123124
"Tajimas_D",
124125
"pbs",
125126
"pc_relate",
127+
"simulate_genedrop",
126128
"simulate_genotype_call_dataset",
127129
"variables",
128130
"observed_heterozygosity",

sgkit/stats/genedrop.py

Lines changed: 345 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,345 @@
1+
from typing import Hashable, Optional
2+
3+
import dask.array as da
4+
import numpy as np
5+
import xarray as xr
6+
from typing_extensions import Literal
7+
from xarray import Dataset
8+
9+
from sgkit import variables
10+
from sgkit.accelerate import numba_guvectorize, numba_jit
11+
from sgkit.typing import ArrayLike
12+
from sgkit.utils import (
13+
conditional_merge_datasets,
14+
create_dataset,
15+
define_variable_if_absent,
16+
)
17+
18+
from .pedigree import (
19+
_compress_hamilton_kerr_parameters,
20+
parent_indices,
21+
topological_argsort,
22+
)
23+
24+
EST_DIPLOID = "diploid"
25+
EST_HAMILTON_KERR = "Hamilton-Kerr"
26+
27+
28+
@numba_guvectorize( # type: ignore
29+
[
30+
"void(int8[:,:], int64[:,:], uint32, int8[:,:])",
31+
"void(int16[:,:], int64[:,:], uint32, int16[:,:])",
32+
"void(int32[:,:], int64[:,:], uint32, int32[:,:])",
33+
"void(int64[:,:], int64[:,:], uint32, int64[:,:])",
34+
],
35+
"(n,k),(n,p),()->(n,k)",
36+
)
37+
def genedrop_diploid(
38+
genotypes: ArrayLike,
39+
parent: ArrayLike,
40+
seed: ArrayLike,
41+
out: ArrayLike,
42+
) -> None: # pragma: no cover
43+
n_sample, n_parent = parent.shape
44+
_, ploidy = genotypes.shape
45+
if n_parent != 2:
46+
raise ValueError("The parents dimension must be length 2")
47+
if ploidy != 2:
48+
raise ValueError("Genotypes are not diploid")
49+
order = topological_argsort(parent)
50+
np.random.seed(seed)
51+
for i in range(n_sample):
52+
t = order[i]
53+
unknown_parent = 0
54+
for j in range(n_parent):
55+
p = parent[t, j]
56+
if p < 0:
57+
# founder
58+
unknown_parent += 1
59+
else:
60+
idx = np.random.randint(2)
61+
out[t, j] = out[p, idx]
62+
if unknown_parent == 1:
63+
raise ValueError("Pedigree contains half-founders")
64+
elif unknown_parent == 2:
65+
# copy founder
66+
out[t, 0] = genotypes[t, 0]
67+
out[t, 1] = genotypes[t, 1]
68+
69+
70+
@numba_jit(nogil=True)
71+
def _random_gamete_Hamilton_Kerr(
72+
genotype: ArrayLike, ploidy: int, tau: int, lambda_: float
73+
) -> ArrayLike:
74+
if ploidy < len(genotype):
75+
# remove fill values
76+
genotype = genotype[genotype > -2]
77+
if ploidy != len(genotype):
78+
raise ValueError("Genotype ploidy does not match number of alleles")
79+
if tau > ploidy:
80+
# TODO: this can be an option for encoding somatic genome duplication (with suitable lambda)
81+
raise NotImplementedError("Gamete tau cannot exceed parental ploidy")
82+
gamete = np.random.choice(genotype, tau, replace=False)
83+
if lambda_ > 0.0:
84+
if tau == 2:
85+
if np.random.rand() <= lambda_:
86+
gamete[1] = gamete[0]
87+
elif tau != 2:
88+
raise NotImplementedError("Non-zero lambda is only implemented for tau = 2")
89+
return gamete
90+
91+
92+
@numba_guvectorize( # type: ignore
93+
[
94+
"void(int8[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int8[:,:])",
95+
"void(int16[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int16[:,:])",
96+
"void(int32[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int32[:,:])",
97+
"void(int64[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int64[:,:])",
98+
],
99+
"(n,k),(n,p),(n,p),(n,p),()->(n,k)",
100+
)
101+
def genedrop_Hamilton_Kerr(
102+
genotypes: ArrayLike,
103+
parent: ArrayLike,
104+
tau: ArrayLike,
105+
lambda_: ArrayLike,
106+
seed: int,
107+
out: ArrayLike,
108+
) -> None: # pragma: no cover
109+
if parent.shape[1] != 2:
110+
parent, tau, lambda_ = _compress_hamilton_kerr_parameters(parent, tau, lambda_)
111+
n_sample, n_parent = parent.shape
112+
_, max_ploidy = genotypes.shape
113+
order = topological_argsort(parent)
114+
np.random.seed(seed)
115+
for i in range(n_sample):
116+
t = order[i]
117+
alleles = 0
118+
unknown_alleles = 0
119+
for j in range(n_parent):
120+
p = parent[t, j]
121+
tau_p = tau[t, j]
122+
lambda_p = lambda_[t, j]
123+
if p < 0:
124+
unknown_alleles += tau_p
125+
elif tau_p > 0:
126+
ploidy_p = tau[p, 0] + tau[p, 1]
127+
gamete = _random_gamete_Hamilton_Kerr(out[p], ploidy_p, tau_p, lambda_p)
128+
out[t, alleles : alleles + tau_p] = gamete
129+
alleles += tau_p
130+
if unknown_alleles == 0:
131+
if alleles < max_ploidy:
132+
# pad with fill value
133+
out[t, alleles:] = -2
134+
elif unknown_alleles == alleles:
135+
# founder
136+
out[t] = genotypes[t]
137+
else:
138+
# partial founder
139+
raise ValueError("Pedigree contains half-founders")
140+
141+
142+
def simulate_genedrop(
143+
ds: Dataset,
144+
*,
145+
method: Optional[Literal[EST_DIPLOID, EST_HAMILTON_KERR]] = EST_DIPLOID, # type: ignore
146+
call_genotype: Hashable = variables.call_genotype,
147+
parent: Hashable = variables.parent,
148+
stat_Hamilton_Kerr_tau: Hashable = variables.stat_Hamilton_Kerr_tau,
149+
stat_Hamilton_Kerr_lambda: Hashable = variables.stat_Hamilton_Kerr_lambda,
150+
seed: Optional[ArrayLike] = None,
151+
merge: bool = True,
152+
) -> Dataset:
153+
"""Generate progeny genotypes via a gene-drop simulation
154+
(MacCluer et al. 1986 [1]).
155+
156+
Simulate Mendelian inheritance of founder alleles throughout a pedigree.
157+
Founders are identified as those individuals with unrecorded parents.
158+
159+
Parameters
160+
----------
161+
ds
162+
Dataset containing genotype calls and pedigree structure.
163+
method
164+
The method used for gene-drop simulation. Defaults to "diploid"
165+
which is only suitable for pedigrees in which all samples are
166+
diploids resulting from sexual reproduction.
167+
The "Hamilton-Kerr" method is suitable for autopolyploid and
168+
mixed-ploidy datasets following Kerr et al. 2012 and [2]
169+
Hamilton and Kerr 2017 [3].
170+
call_genotype
171+
Input variable name holding call_genotype as defined by
172+
:data:`sgkit.variables.call_genotype_spec`.
173+
Must be present in ``ds``.
174+
parent
175+
Input variable name holding parents of each sample as defined by
176+
:data:`sgkit.variables.parent_spec`.
177+
If the variable is not present in ``ds``, it will be computed
178+
using :func:`parent_indices`.
179+
stat_Hamilton_Kerr_tau
180+
Input variable name holding stat_Hamilton_Kerr_tau as defined
181+
by :data:`sgkit.variables.stat_Hamilton_Kerr_tau_spec`.
182+
This variable is only required for the "Hamilton-Kerr" method.
183+
stat_Hamilton_Kerr_lambda
184+
Input variable name holding stat_Hamilton_Kerr_lambda as defined
185+
by :data:`sgkit.variables.stat_Hamilton_Kerr_lambda_spec`.
186+
This variable is only required for the "Hamilton-Kerr" method.
187+
seed
188+
Optionally specify a random seed to initialise gene-drop simulations.
189+
This may be a single integer value or an array of unsigned 32 bit
190+
integers used to specify the random seed for each variant.
191+
merge
192+
If True (the default), merge the input dataset and the computed
193+
output variables into a single dataset, otherwise return only
194+
the computed output variables.
195+
See :ref:`dataset_merge` for more details.
196+
197+
Returns
198+
-------
199+
A dataset containing the following variables:
200+
201+
- :data:`sgkit.variables.call_genotype_spec`.
202+
- :data:`sgkit.variables.call_genotype_mask_spec`.
203+
- :data:`sgkit.variables.call_genotype_fill_spec`.
204+
205+
Raises
206+
------
207+
ValueError
208+
If an unknown method is specified.
209+
ValueError
210+
If the pedigree contains half-founders.
211+
ValueError
212+
If the diploid method is used with a non-diploid dataset.
213+
ValueError
214+
If the diploid method is used and the parents dimension does not
215+
have a length of two.
216+
ValueError
217+
If the Hamilton-Kerr method is used and a sample has more than
218+
two contributing parents.
219+
ValueError
220+
If the Hamilton-Kerr method is used and the number of alleles
221+
in a genotype does not match the sum of tau values (i.e., ploidy).
222+
NotImplementedError
223+
If the Hamilton-Kerr method is used and a tau value exceeds the
224+
parental ploidy.
225+
NotImplementedError
226+
If the Hamilton-Kerr method is used and a non-zero lambda value
227+
is specified when tau is not 2.
228+
229+
Note
230+
----
231+
Linkage between variant loci is not simulated. However, variants
232+
will have identical inheritance patterns if initialized with identical
233+
random seeds when using an array of seeds (see the example).
234+
235+
Examples
236+
--------
237+
Dataset with founder genotypes
238+
239+
>>> import sgkit as sg
240+
>>> import numpy as np
241+
>>> ds = sg.simulate_genotype_call_dataset(n_variant=3, n_sample=5, n_allele=4)
242+
>>> ds["parent_id"] = ["samples", "parents"], [
243+
... [".", "."],
244+
... [".", "."],
245+
... ["S0", "S1"],
246+
... ["S0", "S1"],
247+
... ["S2", "S3"],
248+
... ]
249+
>>> ds.call_genotype.data[:] = -1
250+
>>> ds.call_genotype.data[:,0:2] = np.arange(4).reshape(2,2)
251+
>>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
252+
samples S0 S1 S2 S3 S4
253+
variants
254+
0 0/1 2/3 ./. ./. ./.
255+
1 0/1 2/3 ./. ./. ./.
256+
2 0/1 2/3 ./. ./. ./.
257+
258+
Simulation with random seed
259+
260+
>>> sim = sg.simulate_genedrop(ds, merge=False, seed=1)
261+
>>> sim["sample_id"] = ds["sample_id"]
262+
>>> sim["variant_position"] = ds["variant_position"]
263+
>>> sim["variant_allele"] = ds["variant_allele"]
264+
>>> sg.display_genotypes(sim) # doctest: +NORMALIZE_WHITESPACE
265+
samples S0 S1 S2 S3 S4
266+
variants
267+
0 0/1 2/3 0/3 1/3 0/3
268+
1 0/1 2/3 0/3 0/3 0/3
269+
2 0/1 2/3 0/2 0/3 2/0
270+
271+
Simulation with seed per variant (including duplicates)
272+
273+
>>> seeds = np.array([0,0,1], 'uint32')
274+
>>> sim = sg.simulate_genedrop(ds, merge=False, seed=seeds)
275+
>>> sim["sample_id"] = ds["sample_id"]
276+
>>> sim["variant_position"] = ds["variant_position"]
277+
>>> sim["variant_allele"] = ds["variant_allele"]
278+
>>> sg.display_genotypes(sim) # doctest: +NORMALIZE_WHITESPACE
279+
samples S0 S1 S2 S3 S4
280+
variants
281+
0 0/1 2/3 1/2 0/3 2/3
282+
1 0/1 2/3 1/2 0/3 2/3
283+
2 0/1 2/3 0/2 1/3 2/3
284+
285+
References
286+
----------
287+
[1] Jean W. MacCluer, John L. VanderBerg. Bruce Read and Oliver A. Ryder 1986.
288+
"Pedigree analysis by computer simulation." Zoo Biology 5: 147-160.
289+
290+
[2] - Richard J. Kerr, Li Li, Bruce Tier, Gregory W. Dutkowski and Thomas A. McRae 2012.
291+
"Use of the numerator relationship matrix in genetic analysis of autopolyploid species."
292+
Theoretical and Applied Genetics 124: 1271-1282.
293+
294+
[3] - Matthew G. Hamilton and Richard J. Kerr 2017.
295+
"Computation of the inverse additive relationship matrix for autopolyploid
296+
and multiple-ploidy populations." Theoretical and Applied Genetics 131: 851-860.
297+
"""
298+
ds = define_variable_if_absent(ds, variables.parent, parent, parent_indices)
299+
variables.validate(
300+
ds, {parent: variables.parent_spec, call_genotype: variables.call_genotype_spec}
301+
)
302+
gt = da.asarray(ds[call_genotype].data)
303+
parent = da.asarray(ds[parent].data, chunks=ds[parent].shape)
304+
if hasattr(seed, "__len__"):
305+
seeds = np.array(seed, copy=False)
306+
elif seed is not None:
307+
rng = np.random.default_rng(seed)
308+
seeds = rng.integers(2**32, size=len(gt), dtype=np.uint32)
309+
else:
310+
seeds = np.random.randint(2**32, size=len(gt), dtype=np.uint32)
311+
seeds = da.asarray(seeds, chunks=gt.chunks[0])
312+
if method == EST_DIPLOID:
313+
func = da.gufunc(
314+
genedrop_diploid,
315+
signature=genedrop_diploid.ufunc.signature,
316+
output_dtypes=gt.dtype,
317+
)
318+
gt_sim = func(gt, parent, seeds)
319+
elif method == EST_HAMILTON_KERR:
320+
tau = da.asarray(
321+
ds[stat_Hamilton_Kerr_tau].data, chunks=ds[stat_Hamilton_Kerr_tau].shape
322+
)
323+
lambda_ = da.asarray(
324+
ds[stat_Hamilton_Kerr_lambda].data,
325+
chunks=ds[stat_Hamilton_Kerr_lambda].shape,
326+
)
327+
func = da.gufunc(
328+
genedrop_Hamilton_Kerr,
329+
signature=genedrop_Hamilton_Kerr.ufunc.signature,
330+
output_dtypes=gt.dtype,
331+
)
332+
gt_sim = func(gt, parent, tau, lambda_, seeds)
333+
else:
334+
raise ValueError("Unknown method '{}'".format(method))
335+
gt_sim = xr.DataArray(
336+
gt_sim, coords=ds[call_genotype].coords, dims=ds[call_genotype].dims
337+
)
338+
new_ds = create_dataset(
339+
{
340+
variables.call_genotype: gt_sim,
341+
variables.call_genotype_mask: gt_sim < 0,
342+
variables.call_genotype_fill: gt_sim < -1,
343+
}
344+
)
345+
return conditional_merge_datasets(ds, new_ds, merge)

sgkit/stats/pedigree.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ def parent_indices(
110110
return conditional_merge_datasets(ds, new_ds, merge)
111111

112112

113-
@numba_jit
113+
@numba_jit(nogil=True)
114114
def topological_argsort(parent: ArrayLike) -> ArrayLike:
115115
"""Find a topological ordering of samples within a pedigree such
116116
that no individual occurs before its parents.

0 commit comments

Comments
 (0)