|
| 1 | +from typing import Hashable, Optional |
| 2 | + |
| 3 | +import dask.array as da |
| 4 | +import numpy as np |
| 5 | +import xarray as xr |
| 6 | +from typing_extensions import Literal |
| 7 | +from xarray import Dataset |
| 8 | + |
| 9 | +from sgkit import variables |
| 10 | +from sgkit.accelerate import numba_guvectorize, numba_jit |
| 11 | +from sgkit.typing import ArrayLike |
| 12 | +from sgkit.utils import ( |
| 13 | + conditional_merge_datasets, |
| 14 | + create_dataset, |
| 15 | + define_variable_if_absent, |
| 16 | +) |
| 17 | + |
| 18 | +from .pedigree import ( |
| 19 | + _compress_hamilton_kerr_parameters, |
| 20 | + parent_indices, |
| 21 | + topological_argsort, |
| 22 | +) |
| 23 | + |
| 24 | +EST_DIPLOID = "diploid" |
| 25 | +EST_HAMILTON_KERR = "Hamilton-Kerr" |
| 26 | + |
| 27 | + |
| 28 | +@numba_guvectorize( # type: ignore |
| 29 | + [ |
| 30 | + "void(int8[:,:], int64[:,:], uint32, int8[:,:])", |
| 31 | + "void(int16[:,:], int64[:,:], uint32, int16[:,:])", |
| 32 | + "void(int32[:,:], int64[:,:], uint32, int32[:,:])", |
| 33 | + "void(int64[:,:], int64[:,:], uint32, int64[:,:])", |
| 34 | + ], |
| 35 | + "(n,k),(n,p),()->(n,k)", |
| 36 | +) |
| 37 | +def genedrop_diploid( |
| 38 | + genotypes: ArrayLike, |
| 39 | + parent: ArrayLike, |
| 40 | + seed: ArrayLike, |
| 41 | + out: ArrayLike, |
| 42 | +) -> None: # pragma: no cover |
| 43 | + n_sample, n_parent = parent.shape |
| 44 | + _, ploidy = genotypes.shape |
| 45 | + if n_parent != 2: |
| 46 | + raise ValueError("The parents dimension must be length 2") |
| 47 | + if ploidy != 2: |
| 48 | + raise ValueError("Genotypes are not diploid") |
| 49 | + order = topological_argsort(parent) |
| 50 | + np.random.seed(seed) |
| 51 | + for i in range(n_sample): |
| 52 | + t = order[i] |
| 53 | + unknown_parent = 0 |
| 54 | + for j in range(n_parent): |
| 55 | + p = parent[t, j] |
| 56 | + if p < 0: |
| 57 | + # founder |
| 58 | + unknown_parent += 1 |
| 59 | + else: |
| 60 | + idx = np.random.randint(2) |
| 61 | + out[t, j] = out[p, idx] |
| 62 | + if unknown_parent == 1: |
| 63 | + raise ValueError("Pedigree contains half-founders") |
| 64 | + elif unknown_parent == 2: |
| 65 | + # copy founder |
| 66 | + out[t, 0] = genotypes[t, 0] |
| 67 | + out[t, 1] = genotypes[t, 1] |
| 68 | + |
| 69 | + |
| 70 | +@numba_jit(nogil=True) |
| 71 | +def _random_gamete_Hamilton_Kerr( |
| 72 | + genotype: ArrayLike, ploidy: int, tau: int, lambda_: float |
| 73 | +) -> ArrayLike: |
| 74 | + if ploidy < len(genotype): |
| 75 | + # remove fill values |
| 76 | + genotype = genotype[genotype > -2] |
| 77 | + if ploidy != len(genotype): |
| 78 | + raise ValueError("Genotype ploidy does not match number of alleles") |
| 79 | + if tau > ploidy: |
| 80 | + # TODO: this can be an option for encoding somatic genome duplication (with suitable lambda) |
| 81 | + raise NotImplementedError("Gamete tau cannot exceed parental ploidy") |
| 82 | + gamete = np.random.choice(genotype, tau, replace=False) |
| 83 | + if lambda_ > 0.0: |
| 84 | + if tau == 2: |
| 85 | + if np.random.rand() <= lambda_: |
| 86 | + gamete[1] = gamete[0] |
| 87 | + elif tau != 2: |
| 88 | + raise NotImplementedError("Non-zero lambda is only implemented for tau = 2") |
| 89 | + return gamete |
| 90 | + |
| 91 | + |
| 92 | +@numba_guvectorize( # type: ignore |
| 93 | + [ |
| 94 | + "void(int8[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int8[:,:])", |
| 95 | + "void(int16[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int16[:,:])", |
| 96 | + "void(int32[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int32[:,:])", |
| 97 | + "void(int64[:,:], int64[:,:], uint64[:,:], float64[:,:], uint32, int64[:,:])", |
| 98 | + ], |
| 99 | + "(n,k),(n,p),(n,p),(n,p),()->(n,k)", |
| 100 | +) |
| 101 | +def genedrop_Hamilton_Kerr( |
| 102 | + genotypes: ArrayLike, |
| 103 | + parent: ArrayLike, |
| 104 | + tau: ArrayLike, |
| 105 | + lambda_: ArrayLike, |
| 106 | + seed: int, |
| 107 | + out: ArrayLike, |
| 108 | +) -> None: # pragma: no cover |
| 109 | + if parent.shape[1] != 2: |
| 110 | + parent, tau, lambda_ = _compress_hamilton_kerr_parameters(parent, tau, lambda_) |
| 111 | + n_sample, n_parent = parent.shape |
| 112 | + _, max_ploidy = genotypes.shape |
| 113 | + order = topological_argsort(parent) |
| 114 | + np.random.seed(seed) |
| 115 | + for i in range(n_sample): |
| 116 | + t = order[i] |
| 117 | + alleles = 0 |
| 118 | + unknown_alleles = 0 |
| 119 | + for j in range(n_parent): |
| 120 | + p = parent[t, j] |
| 121 | + tau_p = tau[t, j] |
| 122 | + lambda_p = lambda_[t, j] |
| 123 | + if p < 0: |
| 124 | + unknown_alleles += tau_p |
| 125 | + elif tau_p > 0: |
| 126 | + ploidy_p = tau[p, 0] + tau[p, 1] |
| 127 | + gamete = _random_gamete_Hamilton_Kerr(out[p], ploidy_p, tau_p, lambda_p) |
| 128 | + out[t, alleles : alleles + tau_p] = gamete |
| 129 | + alleles += tau_p |
| 130 | + if unknown_alleles == 0: |
| 131 | + if alleles < max_ploidy: |
| 132 | + # pad with fill value |
| 133 | + out[t, alleles:] = -2 |
| 134 | + elif unknown_alleles == alleles: |
| 135 | + # founder |
| 136 | + out[t] = genotypes[t] |
| 137 | + else: |
| 138 | + # partial founder |
| 139 | + raise ValueError("Pedigree contains half-founders") |
| 140 | + |
| 141 | + |
| 142 | +def simulate_genedrop( |
| 143 | + ds: Dataset, |
| 144 | + *, |
| 145 | + method: Optional[Literal[EST_DIPLOID, EST_HAMILTON_KERR]] = EST_DIPLOID, # type: ignore |
| 146 | + call_genotype: Hashable = variables.call_genotype, |
| 147 | + parent: Hashable = variables.parent, |
| 148 | + stat_Hamilton_Kerr_tau: Hashable = variables.stat_Hamilton_Kerr_tau, |
| 149 | + stat_Hamilton_Kerr_lambda: Hashable = variables.stat_Hamilton_Kerr_lambda, |
| 150 | + seed: Optional[ArrayLike] = None, |
| 151 | + merge: bool = True, |
| 152 | +) -> Dataset: |
| 153 | + """Generate progeny genotypes via a gene-drop simulation |
| 154 | + (MacCluer et al. 1986 [1]). |
| 155 | +
|
| 156 | + Simulate Mendelian inheritance of founder alleles throughout a pedigree. |
| 157 | + Founders are identified as those individuals with unrecorded parents. |
| 158 | +
|
| 159 | + Parameters |
| 160 | + ---------- |
| 161 | + ds |
| 162 | + Dataset containing genotype calls and pedigree structure. |
| 163 | + method |
| 164 | + The method used for gene-drop simulation. Defaults to "diploid" |
| 165 | + which is only suitable for pedigrees in which all samples are |
| 166 | + diploids resulting from sexual reproduction. |
| 167 | + The "Hamilton-Kerr" method is suitable for autopolyploid and |
| 168 | + mixed-ploidy datasets following Kerr et al. 2012 and [2] |
| 169 | + Hamilton and Kerr 2017 [3]. |
| 170 | + call_genotype |
| 171 | + Input variable name holding call_genotype as defined by |
| 172 | + :data:`sgkit.variables.call_genotype_spec`. |
| 173 | + Must be present in ``ds``. |
| 174 | + parent |
| 175 | + Input variable name holding parents of each sample as defined by |
| 176 | + :data:`sgkit.variables.parent_spec`. |
| 177 | + If the variable is not present in ``ds``, it will be computed |
| 178 | + using :func:`parent_indices`. |
| 179 | + stat_Hamilton_Kerr_tau |
| 180 | + Input variable name holding stat_Hamilton_Kerr_tau as defined |
| 181 | + by :data:`sgkit.variables.stat_Hamilton_Kerr_tau_spec`. |
| 182 | + This variable is only required for the "Hamilton-Kerr" method. |
| 183 | + stat_Hamilton_Kerr_lambda |
| 184 | + Input variable name holding stat_Hamilton_Kerr_lambda as defined |
| 185 | + by :data:`sgkit.variables.stat_Hamilton_Kerr_lambda_spec`. |
| 186 | + This variable is only required for the "Hamilton-Kerr" method. |
| 187 | + seed |
| 188 | + Optionally specify a random seed to initialise gene-drop simulations. |
| 189 | + This may be a single integer value or an array of unsigned 32 bit |
| 190 | + integers used to specify the random seed for each variant. |
| 191 | + merge |
| 192 | + If True (the default), merge the input dataset and the computed |
| 193 | + output variables into a single dataset, otherwise return only |
| 194 | + the computed output variables. |
| 195 | + See :ref:`dataset_merge` for more details. |
| 196 | +
|
| 197 | + Returns |
| 198 | + ------- |
| 199 | + A dataset containing the following variables: |
| 200 | +
|
| 201 | + - :data:`sgkit.variables.call_genotype_spec`. |
| 202 | + - :data:`sgkit.variables.call_genotype_mask_spec`. |
| 203 | + - :data:`sgkit.variables.call_genotype_fill_spec`. |
| 204 | +
|
| 205 | + Raises |
| 206 | + ------ |
| 207 | + ValueError |
| 208 | + If an unknown method is specified. |
| 209 | + ValueError |
| 210 | + If the pedigree contains half-founders. |
| 211 | + ValueError |
| 212 | + If the diploid method is used with a non-diploid dataset. |
| 213 | + ValueError |
| 214 | + If the diploid method is used and the parents dimension does not |
| 215 | + have a length of two. |
| 216 | + ValueError |
| 217 | + If the Hamilton-Kerr method is used and a sample has more than |
| 218 | + two contributing parents. |
| 219 | + ValueError |
| 220 | + If the Hamilton-Kerr method is used and the number of alleles |
| 221 | + in a genotype does not match the sum of tau values (i.e., ploidy). |
| 222 | + NotImplementedError |
| 223 | + If the Hamilton-Kerr method is used and a tau value exceeds the |
| 224 | + parental ploidy. |
| 225 | + NotImplementedError |
| 226 | + If the Hamilton-Kerr method is used and a non-zero lambda value |
| 227 | + is specified when tau is not 2. |
| 228 | +
|
| 229 | + Note |
| 230 | + ---- |
| 231 | + Linkage between variant loci is not simulated. However, variants |
| 232 | + will have identical inheritance patterns if initialized with identical |
| 233 | + random seeds when using an array of seeds (see the example). |
| 234 | +
|
| 235 | + Examples |
| 236 | + -------- |
| 237 | + Dataset with founder genotypes |
| 238 | +
|
| 239 | + >>> import sgkit as sg |
| 240 | + >>> import numpy as np |
| 241 | + >>> ds = sg.simulate_genotype_call_dataset(n_variant=3, n_sample=5, n_allele=4) |
| 242 | + >>> ds["parent_id"] = ["samples", "parents"], [ |
| 243 | + ... [".", "."], |
| 244 | + ... [".", "."], |
| 245 | + ... ["S0", "S1"], |
| 246 | + ... ["S0", "S1"], |
| 247 | + ... ["S2", "S3"], |
| 248 | + ... ] |
| 249 | + >>> ds.call_genotype.data[:] = -1 |
| 250 | + >>> ds.call_genotype.data[:,0:2] = np.arange(4).reshape(2,2) |
| 251 | + >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE |
| 252 | + samples S0 S1 S2 S3 S4 |
| 253 | + variants |
| 254 | + 0 0/1 2/3 ./. ./. ./. |
| 255 | + 1 0/1 2/3 ./. ./. ./. |
| 256 | + 2 0/1 2/3 ./. ./. ./. |
| 257 | +
|
| 258 | + Simulation with random seed |
| 259 | +
|
| 260 | + >>> sim = sg.simulate_genedrop(ds, merge=False, seed=1) |
| 261 | + >>> sim["sample_id"] = ds["sample_id"] |
| 262 | + >>> sim["variant_position"] = ds["variant_position"] |
| 263 | + >>> sim["variant_allele"] = ds["variant_allele"] |
| 264 | + >>> sg.display_genotypes(sim) # doctest: +NORMALIZE_WHITESPACE |
| 265 | + samples S0 S1 S2 S3 S4 |
| 266 | + variants |
| 267 | + 0 0/1 2/3 0/3 1/3 0/3 |
| 268 | + 1 0/1 2/3 0/3 0/3 0/3 |
| 269 | + 2 0/1 2/3 0/2 0/3 2/0 |
| 270 | +
|
| 271 | + Simulation with seed per variant (including duplicates) |
| 272 | +
|
| 273 | + >>> seeds = np.array([0,0,1], 'uint32') |
| 274 | + >>> sim = sg.simulate_genedrop(ds, merge=False, seed=seeds) |
| 275 | + >>> sim["sample_id"] = ds["sample_id"] |
| 276 | + >>> sim["variant_position"] = ds["variant_position"] |
| 277 | + >>> sim["variant_allele"] = ds["variant_allele"] |
| 278 | + >>> sg.display_genotypes(sim) # doctest: +NORMALIZE_WHITESPACE |
| 279 | + samples S0 S1 S2 S3 S4 |
| 280 | + variants |
| 281 | + 0 0/1 2/3 1/2 0/3 2/3 |
| 282 | + 1 0/1 2/3 1/2 0/3 2/3 |
| 283 | + 2 0/1 2/3 0/2 1/3 2/3 |
| 284 | +
|
| 285 | + References |
| 286 | + ---------- |
| 287 | + [1] Jean W. MacCluer, John L. VanderBerg. Bruce Read and Oliver A. Ryder 1986. |
| 288 | + "Pedigree analysis by computer simulation." Zoo Biology 5: 147-160. |
| 289 | +
|
| 290 | + [2] - Richard J. Kerr, Li Li, Bruce Tier, Gregory W. Dutkowski and Thomas A. McRae 2012. |
| 291 | + "Use of the numerator relationship matrix in genetic analysis of autopolyploid species." |
| 292 | + Theoretical and Applied Genetics 124: 1271-1282. |
| 293 | +
|
| 294 | + [3] - Matthew G. Hamilton and Richard J. Kerr 2017. |
| 295 | + "Computation of the inverse additive relationship matrix for autopolyploid |
| 296 | + and multiple-ploidy populations." Theoretical and Applied Genetics 131: 851-860. |
| 297 | + """ |
| 298 | + ds = define_variable_if_absent(ds, variables.parent, parent, parent_indices) |
| 299 | + variables.validate( |
| 300 | + ds, {parent: variables.parent_spec, call_genotype: variables.call_genotype_spec} |
| 301 | + ) |
| 302 | + gt = da.asarray(ds[call_genotype].data) |
| 303 | + parent = da.asarray(ds[parent].data, chunks=ds[parent].shape) |
| 304 | + if hasattr(seed, "__len__"): |
| 305 | + seeds = np.array(seed, copy=False) |
| 306 | + elif seed is not None: |
| 307 | + rng = np.random.default_rng(seed) |
| 308 | + seeds = rng.integers(2**32, size=len(gt), dtype=np.uint32) |
| 309 | + else: |
| 310 | + seeds = np.random.randint(2**32, size=len(gt), dtype=np.uint32) |
| 311 | + seeds = da.asarray(seeds, chunks=gt.chunks[0]) |
| 312 | + if method == EST_DIPLOID: |
| 313 | + func = da.gufunc( |
| 314 | + genedrop_diploid, |
| 315 | + signature=genedrop_diploid.ufunc.signature, |
| 316 | + output_dtypes=gt.dtype, |
| 317 | + ) |
| 318 | + gt_sim = func(gt, parent, seeds) |
| 319 | + elif method == EST_HAMILTON_KERR: |
| 320 | + tau = da.asarray( |
| 321 | + ds[stat_Hamilton_Kerr_tau].data, chunks=ds[stat_Hamilton_Kerr_tau].shape |
| 322 | + ) |
| 323 | + lambda_ = da.asarray( |
| 324 | + ds[stat_Hamilton_Kerr_lambda].data, |
| 325 | + chunks=ds[stat_Hamilton_Kerr_lambda].shape, |
| 326 | + ) |
| 327 | + func = da.gufunc( |
| 328 | + genedrop_Hamilton_Kerr, |
| 329 | + signature=genedrop_Hamilton_Kerr.ufunc.signature, |
| 330 | + output_dtypes=gt.dtype, |
| 331 | + ) |
| 332 | + gt_sim = func(gt, parent, tau, lambda_, seeds) |
| 333 | + else: |
| 334 | + raise ValueError("Unknown method '{}'".format(method)) |
| 335 | + gt_sim = xr.DataArray( |
| 336 | + gt_sim, coords=ds[call_genotype].coords, dims=ds[call_genotype].dims |
| 337 | + ) |
| 338 | + new_ds = create_dataset( |
| 339 | + { |
| 340 | + variables.call_genotype: gt_sim, |
| 341 | + variables.call_genotype_mask: gt_sim < 0, |
| 342 | + variables.call_genotype_fill: gt_sim < -1, |
| 343 | + } |
| 344 | + ) |
| 345 | + return conditional_merge_datasets(ds, new_ds, merge) |
0 commit comments