1+ import warnings
2+
13import dask .array as da
24import numpy as np
35import pandas as pd
@@ -16,7 +18,9 @@ def genee(ds: Dataset, ld: ArrayLike, *, reg_covar: float = 0.000001) -> DataFra
1618 Parameters
1719 ----------
1820 ds
19- Dataset containing beta values (OLS betas or regularized betas).
21+ Dataset containing marginal variant effect values in the variable "beta"
22+ (OLS betas or regularized betas), and window definitions corresponding to
23+ genomic regions of interest, e.g. genes.
2024 ld
2125 Dense 2D array corresponding to the LD matrix.
2226 reg_covar
@@ -30,6 +34,41 @@ def genee(ds: Dataset, ld: ArrayLike, *, reg_covar: float = 0.000001) -> DataFra
3034 the first mixture component with the largest variance if it is composed
3135 of more than 50% of the SNPs.
3236
37+ Examples
38+ --------
39+
40+ >>> import numpy as np
41+ >>> import xarray as xr
42+ >>> from sgkit.stats.genee import genee
43+ >>> from sgkit.stats.association import gwas_linear_regression
44+ >>> from sgkit.stats.ld import ld_matrix
45+ >>> from sgkit.testing import simulate_genotype_call_dataset
46+ >>> n_variant, n_sample, n_contig, n_covariate, n_trait, seed = 100, 50, 2, 3, 5, 0
47+ >>> rs = np.random.RandomState(seed)
48+ >>> ds = simulate_genotype_call_dataset(n_variant=n_variant, n_sample=n_sample, n_contig=n_contig, seed=seed)
49+ >>> ds_pheno = xr.DataArray(np.random.rand(n_sample), dims=['samples'], name='phenotype')
50+ >>> ds = ds.assign(phenotype=ds_pheno)
51+ >>> ds["call_dosage"] = ds.call_genotype.sum(dim="ploidy")
52+ >>> ds = gwas_linear_regression(ds, dosage="call_dosage", add_intercept=True, traits=["phenotype"], covariates=[])
53+ >>> ds = ds.rename({"variant_linreg_beta": "beta"})
54+ # genee cannot handle the return value of gwas_linear_regression, we need to
55+ # convert it to a dataset with a single variant dimension
56+ >>> ds_beta = xr.DataArray(np.random.rand(n_variant), dims=['variants'], name='beta')
57+ >>> ds = ds.assign(beta=ds_beta)
58+ >>> gene_start = np.array([0, 30])
59+ >>> gene_stop = np.array([20, 40])
60+ >>> gene_contig = np.array([0, 1])
61+ # genes are windows in this simple example
62+ >>> ds["window_contig"] = (["windows"], gene_contig)
63+ >>> ds["window_start"] = (["windows"], gene_start)
64+ >>> ds["window_stop"] = (["windows"], gene_stop)
65+ # this only works as long as the windows on different chromosomes are non-overlapping
66+ # ld_matrix looses all information about contigs
67+ # is there a way to do this properly in chunks and have genee work with the chunks?
68+ >>> ld_temp = ld_matrix(ds).compute().pivot(index="i", columns="j", values="value").fillna(-1).to_numpy()
69+ >>> ld = (ld_temp + ld_temp.T) / 2
70+ >>> df = genee(ds, ld).compute()
71+
3372 Returns
3473 -------
3574 A dataframe containing the following fields:
@@ -38,6 +77,8 @@ def genee(ds: Dataset, ld: ArrayLike, *, reg_covar: float = 0.000001) -> DataFra
3877 - ``q_var``: test variance
3978 - ``pval``: p-value
4079
80+ Each value corresponds to a window in the input dataset.
81+
4182 References
4283 ----------
4384 [1] - W. Cheng, S. Ramachandran, and L. Crawford (2020).
@@ -83,7 +124,7 @@ def genee_EM(betas, reg_covar=0.000001):
83124
84125 covars = best_gmm .covariances_ .squeeze ()
85126 if best_gmm .n_components == 1 : # pragma: no cover
86- epsilon_effect = covars [0 ]
127+ epsilon_effect = covars [0 ] if ( covars . ndim > 0 ) else covars
87128 else :
88129 # TODO: handle case where first component composed more than 50% SNPs
89130 # https://github.com/ramachandran-lab/genee/blob/a357a956241df93f16e07664e24f3aeac65f4177/genee/R/genee_EM.R#L28-L29
@@ -132,8 +173,13 @@ def genee_test(gene, ld, betas, epsilon_effect):
132173 test_statistics = betas_g .T @ betas_g
133174 t_var = np .diag ((ld_g * epsilon_effect ) @ (ld_g * epsilon_effect )).sum ()
134175
135- p_value_g = compute_p_values (e_values , test_statistics )
136- p_value_g = ensure_positive_real (p_value_g )
176+ if all (e_values == 0.0 ):
177+ warnings .warn ("All eigenvalues of the given gene LD matrix are zero. Cannot compute p-value." ,
178+ UserWarning )
179+ p_value_g = np .real_if_close (np .nan )
180+ else :
181+ p_value_g = compute_p_values (e_values , test_statistics )
182+ p_value_g = ensure_positive_real (p_value_g )
137183
138184 return test_statistics .squeeze ().item (), t_var , p_value_g .squeeze ().item ()
139185
0 commit comments