Skip to content

Commit db10477

Browse files
timothymillar and mergify[bot]
authored and committed
Pedigree contribution #963
1 parent c1f730c commit db10477

File tree

6 files changed

+414
-0
lines changed

6 files changed

+414
-0
lines changed

docs/api.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ Methods
112112
maximal_independent_set
113113
observed_heterozygosity
114114
pbs
115+
pedigree_contribution
115116
pedigree_inbreeding
116117
pedigree_inverse_kinship
117118
pedigree_kinship
@@ -214,6 +215,7 @@ By convention, variable names are singular in sgkit. For example, ``genotype_cou
214215
variables.stat_inverse_relationship_spec
215216
variables.stat_observed_heterozygosity_spec
216217
variables.stat_pbs_spec
218+
variables.stat_pedigree_contribution_spec
217219
variables.stat_pedigree_inbreeding_spec
218220
variables.stat_pedigree_inverse_kinship_spec
219221
variables.stat_pedigree_inverse_relationship_spec

sgkit/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
from .stats.pca import pca
4343
from .stats.pedigree import (
4444
parent_indices,
45+
pedigree_contribution,
4546
pedigree_inbreeding,
4647
pedigree_inverse_kinship,
4748
pedigree_kinship,
@@ -107,6 +108,7 @@
107108
"ld_prune",
108109
"maximal_independent_set",
109110
"parent_indices",
111+
"pedigree_contribution",
110112
"pedigree_inbreeding",
111113
"pedigree_inverse_kinship",
112114
"pedigree_kinship",

sgkit/stats/pedigree.py

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2142,3 +2142,154 @@ def pedigree_sel(
21422142
if drop_parent:
21432143
new_ds = new_ds.drop_vars(parent_var)
21442144
return new_ds
2145+
2146+
2147+
def pedigree_contribution(
    ds: Dataset,
    *,
    method: Literal["even", "variable"] = "even",
    chunks: Hashable = -1,
    parent: Hashable = variables.parent,
    stat_Hamilton_Kerr_tau: Hashable = variables.stat_Hamilton_Kerr_tau,
    merge: bool = True,
) -> Dataset:
    """Calculate the expected genomic contribution of each sample to each
    other sample based on pedigree structure.

    Parameters
    ----------
    ds
        Dataset containing pedigree structure.
    method
        The method used for estimating genomic contributions.
        The 'even' method assumes that all samples are of a single,
        even ploidy (e.g., diploid) and have even contributions
        from each parent. The 'variable' method allows for un-even
        contributions due to ploidy manipulation and/or clonal
        reproduction.
    chunks
        Optionally specify chunks for the returned array. A single chunk
        is used by default. Currently, chunking is only supported for a
        single axis.
    parent
        Input variable name holding parents of each sample as defined by
        :data:`sgkit.variables.parent_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`parent_indices`.
    stat_Hamilton_Kerr_tau
        Input variable name holding stat_Hamilton_Kerr_tau as defined
        by :data:`sgkit.variables.stat_Hamilton_Kerr_tau_spec`.
        This variable is only required for the 'variable' method.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.stat_pedigree_contribution_spec`.

    Raises
    ------
    ValueError
        If an unknown method is specified.
    ValueError
        If the 'even' method is specified for an odd-ploidy dataset.
    ValueError
        If the 'even' method is specified and the length of the 'parents'
        dimension is not 2.
    NotImplementedError
        If chunking is specified for both axes.

    Note
    ----
    Dimensions of :data:`sgkit.variables.stat_pedigree_contribution_spec`
    are named ``samples_0`` and ``samples_1``.

    Examples
    --------
    >>> ds = xr.Dataset()
    >>> ds["sample_id"] = "samples", ["S0", "S1", "S2", "S3", "S4", "S5"]
    >>> ds["parent_id"] = ["samples", "parents"], [
    ...     [ ".", "."],
    ...     [ ".", "."],
    ...     ["S1", "S0"],
    ...     ["S2", "S0"],
    ...     ["S0", "S2"],
    ...     ["S1", "S3"]
    ... ]
    >>> ds = pedigree_contribution(ds)
    >>> ds.stat_pedigree_contribution.values # doctest: +NORMALIZE_WHITESPACE
    array([[1.   , 0.   , 0.5  , 0.75 , 0.75 , 0.375],
           [0.   , 1.   , 0.5  , 0.25 , 0.25 , 0.625],
           [0.   , 0.   , 1.   , 0.5  , 0.5  , 0.25 ],
           [0.   , 0.   , 0.   , 1.   , 0.   , 0.5  ],
           [0.   , 0.   , 0.   , 0.   , 1.   , 0.   ],
           [0.   , 0.   , 0.   , 0.   , 0.   , 1.   ]])
    """
    # imported lazily to avoid numba compilation overhead at module import
    from .pedigree_numba_fns import contribution_from, contribution_to

    if method not in {"even", "variable"}:
        raise ValueError("Unknown method '{}'".format(method))
    ds = define_variable_if_absent(
        ds,
        variables.parent,
        parent,
        parent_indices,
    )
    variables.validate(ds, {parent: variables.parent_spec})
    # load the pedigree into a single-chunk dask array of parent indices
    parent = da.asarray(ds[parent].data, chunks=ds[parent].shape)
    n_sample, n_parent = parent.shape
    if method == "even":
        # 'even' contributions are only well defined for even ploidy
        # with exactly two parents per sample
        if bool(ds.dims.get("ploidy", 2) % 2):
            raise ValueError("The 'even' method requires an even-ploidy dataset")
        if n_parent != 2:
            raise ValueError(
                "The 'even' method requires that the 'parents' dimension has a length of 2"
            )
        # each parent contributes exactly one half of its offspring's genome
        parent_contribution = da.ones_like(parent, float) / 2
    else:
        variables.validate(
            ds, {stat_Hamilton_Kerr_tau: variables.stat_Hamilton_Kerr_tau_spec}
        )
        tau = da.asarray(
            ds[stat_Hamilton_Kerr_tau].data, chunks=ds[stat_Hamilton_Kerr_tau].shape
        )
        # each parent contributes in proportion to its gametic ploidy (tau)
        parent_contribution = tau / tau.sum(axis=-1, keepdims=True)
    # samples must be processed parents-before-children (or the reverse),
    # so compute a topological ordering of the pedigree
    order = da.gufunc(
        topological_argsort,
        signature="(n,p)->(n)",
        output_dtypes=np.uint64,
    )(parent)
    # create a dummy array for dask to work out chunks
    chunks_0, chunks_1 = da.empty((n_sample, n_sample), chunks=chunks).chunks
    if (len(chunks_0) > 1) and (len(chunks_1) > 1):
        raise NotImplementedError("Chunking is only supported along a single axis")
    elif len(chunks_0) == 1:
        targets = da.arange(n_sample, chunks=chunks_1)
        # transpose so that [x, y] is contribution of x to y
        # TODO: dask already adds a transpose when calling the gufunc so
        # avoiding that initial transpose somehow should be more efficient
        contribution = contribution_to(
            parent,
            parent_contribution,
            order,
            targets,
        ).T
    else:
        targets = da.arange(n_sample, chunks=chunks_0)
        contribution = contribution_from(
            parent,
            parent_contribution,
            order,
            targets,
        )
    new_ds = create_dataset(
        {
            variables.stat_pedigree_contribution: (
                ("samples_0", "samples_1"),
                contribution,
            )
        }
    )
    return conditional_merge_datasets(ds, new_ds, merge)

sgkit/stats/pedigree_numba_fns.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# Numba guvectorize functions (and their dependencies) are defined
2+
# in a separate file here, and imported dynamically to avoid
3+
# initial compilation overhead.
4+
5+
from sgkit.accelerate import numba_guvectorize
6+
from sgkit.typing import ArrayLike
7+
8+
9+
@numba_guvectorize(  # type: ignore
    [
        "void(int64[:,:], float64[:,:], int64[:], int64, float64[:])",
        "void(int64[:,:], float64[:,:], uint64[:], int64, float64[:])",
    ],
    "(n,p),(n,p),(n),()->(n)",
)
def contribution_to(
    parent: ArrayLike,
    parent_contribution: ArrayLike,
    order: ArrayLike,
    i: int,
    out: ArrayLike,
) -> None:  # pragma: no cover
    """Accumulate the expected contribution of sample ``i`` to every sample.

    Walks the pedigree in reverse topological order (children before
    parents as seen from ``i``'s descendants) and pushes each sample's
    accumulated contribution up to its parents, weighted by
    ``parent_contribution``.
    """
    n_sample, n_parent = parent.shape
    out[:] = 0.0
    # iterate the topological ordering from the end, i.e. order[::-1]
    for idx in range(n_sample - 1, -1, -1):
        j = order[idx]
        if j == i:
            # a sample contributes fully to itself
            out[j] = 1.0
        weight = out[j]
        if weight <= 0.0:
            # nothing to propagate to this sample's parents
            continue
        for pdx in range(n_parent):
            p = parent[j, pdx]
            if p >= 0:
                out[p] += weight * parent_contribution[j, pdx]
37+
38+
39+
@numba_guvectorize(  # type: ignore
    [
        "void(int64[:,:], float64[:,:], int64[:], int64, float64[:])",
        "void(int64[:,:], float64[:,:], uint64[:], int64, float64[:])",
    ],
    "(n,p),(n,p),(n),()->(n)",
)
def contribution_from(
    parent: ArrayLike,
    parent_contribution: ArrayLike,
    order: ArrayLike,
    i: int,
    out: ArrayLike,
) -> None:  # pragma: no cover
    """Accumulate the expected contribution of every sample from sample ``i``.

    Walks the pedigree in topological order (parents before children) and
    pulls each sample's contribution down from its parents, weighted by
    ``parent_contribution``.
    """
    n_samples, n_parents = parent.shape
    out[:] = 0.0
    for jdx in range(n_samples):
        j = order[jdx]
        if i == j:
            # self contribution is 1
            out[j] = 1.0
        else:
            for pdx in range(n_parents):
                p = parent[j, pdx]
                if p >= 0:
                    p_con = out[p]
                    # skip zero contributions; the previous `>= 0.0` guard was
                    # always true (contributions are non-negative) and so never
                    # skipped anything — `> 0.0` matches contribution_to
                    if p_con > 0.0:
                        out[j] += p_con * parent_contribution[j, pdx]

0 commit comments

Comments
 (0)