Skip to content

Commit fe6dd11

Browse files
authored
Version 2 (#109)
* Add support for pandas-genomics GenotypeArray as a type * WIP: Create a general �ssociation_study function. Tests need to be updated. * WIP: More fixes * Update tests to use association_study, and clean up further * Update interaction test to be similar to association test * WIP: Add a GWAS test * WIP: Tests for GWAS interaction * Finish updating and fixing tests, prep for release of v2 * Make some fixes for FutureWarnings in dependencies
1 parent 048d211 commit fe6dd11

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+1606
-3044
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
/docs/notebooks/.ipynb_checkpoints
44
/docs/source/modules
55

6+
poetry.lock
7+
68
# Byte-compiled / optimized / DLL files
79
__pycache__/
810
*.py[cod]

clarite/internal/utilities.py

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import click
66
import pandas as pd
7+
from pandas_genomics import GenotypeDtype
78

89

910
def print_wrap(func):
@@ -87,13 +88,20 @@ def _validate_skip_only(
8788

8889
def _get_dtypes(data: pd.DataFrame):
8990
"""Return a Series of CLARITE dtypes indexed by variable name"""
90-
# Ensure that 'data' is a DataFrame and not a Series
91-
if type(data) != pd.DataFrame:
92-
raise ValueError("The passed 'data' is not a Pandas DataFrame")
91+
# Ensure that 'data' is a DataFrame or Series (which is converted to a DataFrame)
92+
if isinstance(data, pd.Series):
93+
data = pd.DataFrame(data)
94+
if not isinstance(data, pd.DataFrame):
95+
raise ValueError("The passed 'data' is not a Pandas DataFrame or Series")
9396

9497
# Start with all as unknown
9598
dtypes = pd.Series("unknown", index=data.columns)
9699

100+
# Set genotype arrays
101+
gt_cols = data.apply(lambda col: GenotypeDtype.is_dtype(col))
102+
gt_cols = gt_cols[gt_cols].index
103+
dtypes.loc[gt_cols] = "genotypes"
104+
97105
# Set binary and categorical
98106
data_catbin = data.loc[:, data.dtypes == "category"]
99107
if len(data_catbin.columns) > 0:
@@ -132,7 +140,9 @@ def _get_dtypes(data: pd.DataFrame):
132140
def _get_dtype(data: pd.Series):
133141
"""Return the CLARITE dtype of a pandas series"""
134142
# Set binary and categorical
135-
if data.dtype.name == "category":
143+
if GenotypeDtype.is_dtype(data):
144+
return "genotypes"
145+
elif data.dtype.name == "category":
136146
num_categories = len(data.cat.categories)
137147
if num_categories == 1:
138148
return "constant"
@@ -195,15 +205,13 @@ def _remove_empty_categories(
195205
dtypes = data.loc[:, columns].dtypes
196206
catvars = [v for v in dtypes[dtypes == "category"].index]
197207
for var in catvars:
198-
counts = data[var].value_counts()
199-
keep_cats = list(counts[counts > 0].index)
200-
if len(keep_cats) < len(counts):
201-
removed_cats[var] = set(counts.index) - set(keep_cats)
202-
data[var].cat.set_categories(
203-
new_categories=keep_cats,
204-
ordered=data[var].cat.ordered,
205-
inplace=True,
206-
)
208+
existing_cats = data[var].cat.categories
209+
if data[var].cat.ordered:
210+
print()
211+
data[var] = data[var].cat.remove_unused_categories()
212+
removed_categories = set(existing_cats) - set(data[var].cat.categories)
213+
if len(removed_categories) > 0:
214+
removed_cats[var] = removed_categories
207215
return removed_cats
208216
elif type(data) == pd.Series:
209217
assert skip is None

clarite/modules/analyze/__init__.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,25 @@
77
.. autosummary::
88
:toctree: modules/analyze
99
10-
ewas
11-
interaction_test
10+
association_study
11+
interaction_study
1212
add_corrected_pvalues
1313
1414
"""
1515

16+
from .association_study import association_study
1617
from .ewas import ewas
17-
from .interactions import interaction_test
18+
from .interaction_study import interaction_study
1819
from .utils import add_corrected_pvalues
1920
from . import regression
2021

21-
__all__ = [ewas, interaction_test, add_corrected_pvalues, regression]
22+
__all__ = [
23+
association_study,
24+
ewas,
25+
interaction_study,
26+
add_corrected_pvalues,
27+
regression,
28+
]
2229

2330
# Constants
2431
required_result_columns = {"N", "pvalue", "error", "warnings"}
Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
from typing import Optional, Union, Type, List
2+
3+
import click
4+
import pandas as pd
5+
from pandas_genomics import GenotypeDtype
6+
7+
from clarite.modules.analyze import regression
8+
from clarite.modules.analyze.regression import (
9+
builtin_regression_kinds,
10+
WeightedGLMRegression,
11+
GLMRegression,
12+
)
13+
14+
15+
def association_study(
16+
data: pd.DataFrame,
17+
outcomes: Union[str, List[str]],
18+
regression_variables: Optional[Union[str, List[str]]] = None,
19+
covariates: Optional[Union[str, List[str]]] = None,
20+
regression_kind: Optional[Union[str, Type[regression.Regression]]] = None,
21+
encoding: str = "additive",
22+
weighted_encoding_info: Optional[pd.DataFrame] = None,
23+
**kwargs,
24+
):
25+
"""
26+
Run an association study (EWAS, PhEWAS, GWAS, GxEWAS, etc)
27+
28+
Individual regression classes selected with `regression_kind` may work slightly differently.
29+
Results are sorted in order of increasing `pvalue`
30+
31+
Parameters
32+
----------
33+
data: pd.DataFrame
34+
Contains all outcomes, regression_variables, and covariates
35+
outcomes: str or List[str]
36+
The exogenous variable (str) or variables (List) to be used as the output of each regression.
37+
regression_variables: str, List[str], or None
38+
The endogenous variable (str) or variables (List) to be used invididually as inputs into regression.
39+
If None, use all variables in `data` that aren't an outcome or a covariate
40+
covariates: str, List[str], or None (default)
41+
The variable (str) or variables (List) to be used as covariates in each regression.
42+
regression_kind: None, str or subclass of Regression
43+
This can be 'glm', 'weighted_glm', or 'r_survey' for built-in Regression types,
44+
or a custom subclass of Regression. If None, it is set to 'glm' if a survey design is not specified
45+
and 'weighted_glm' if it is.
46+
encoding: str, default "additive"
47+
Encoding method to use for any genotype data. One of {'additive', 'dominant', 'recessive', 'codominant', or 'weighted'}
48+
weighted_encoding_info: Optional pd.DataFrame, default None
49+
If weighted encoding is used, this must be provided. See Pandas-Genomics documentation on weighted encodings.
50+
kwargs: Keyword arguments specific to the Regression being used
51+
52+
Returns
53+
-------
54+
df: pd.DataFrame
55+
Association Study results DataFrame with at least these columns: ['N', 'pvalue', 'error', 'warnings'].
56+
Indexed by the outcome variable and the variable being assessed in each regression
57+
"""
58+
# Copy data to avoid modifying the original, in case it is changed
59+
data = data.copy(deep=True)
60+
61+
# Encode any genotype data
62+
has_genotypes = False
63+
for dt in data.dtypes:
64+
if GenotypeDtype.is_dtype(dt):
65+
has_genotypes = True
66+
break
67+
if has_genotypes:
68+
if encoding == "additive":
69+
data = data.genomics.encode_additive()
70+
elif encoding == "dominant":
71+
data = data.genomics.encode_dominant()
72+
elif encoding == "recessive":
73+
data = data.genomics.encode_recessive()
74+
elif encoding == "codominant":
75+
data = data.genomics.encode_codominant()
76+
elif encoding == "weighted":
77+
if weighted_encoding_info is None:
78+
raise ValueError(
79+
"'weighted_encoding_info' must be provided when using weighted encoding"
80+
)
81+
else:
82+
data = data.genomics.encode_weighted(weighted_encoding_info)
83+
else:
84+
raise ValueError(f"Genotypes provided with unknown 'encoding': {encoding}")
85+
86+
# Ensure outcome, covariates, and regression variables are lists
87+
if isinstance(outcomes, str):
88+
outcomes = [
89+
outcomes,
90+
]
91+
if isinstance(covariates, str):
92+
covariates = [
93+
covariates,
94+
]
95+
elif covariates is None:
96+
covariates = []
97+
if isinstance(regression_variables, str):
98+
regression_variables = [
99+
regression_variables,
100+
]
101+
elif regression_variables is None:
102+
regression_variables = list(set(data.columns) - set(outcomes) - set(covariates))
103+
104+
# Delete the survey_design_spec kwarg if it is None
105+
# This would be fine, but kwarg parsing for different clases means possibly passing it to an init that isn't expecting it
106+
if "survey_design_spec" in kwargs:
107+
if kwargs["survey_design_spec"] is None:
108+
del kwargs["survey_design_spec"]
109+
110+
# Parse regression kind
111+
if regression_kind is None:
112+
# Match the original api, which is glm or weighted_glm based on whether a design is passes
113+
if "survey_design_spec" in kwargs:
114+
regression_cls = WeightedGLMRegression
115+
else:
116+
regression_cls = GLMRegression
117+
elif isinstance(regression_kind, str):
118+
regression_cls = builtin_regression_kinds.get(regression_kind, None)
119+
if regression_cls is None:
120+
raise ValueError(
121+
f"Unknown regression kind '{regression_kind}, known values are {','.join(builtin_regression_kinds.keys())}"
122+
)
123+
elif regression_kind in regression_kind.mro():
124+
regression_cls = regression_kind
125+
else:
126+
raise ValueError(
127+
f"Incorrect regression kind type ({type(regression_kind)}). "
128+
f"A valid string or a subclass of Regression is required."
129+
)
130+
131+
# Run each regression
132+
results = []
133+
for outcome in outcomes:
134+
regression = regression_cls(
135+
data=data,
136+
outcome_variable=outcome,
137+
regression_variables=regression_variables,
138+
covariates=covariates,
139+
**kwargs,
140+
)
141+
print(regression)
142+
143+
# Run and get results
144+
regression.run()
145+
result = regression.get_results()
146+
147+
# Process Results
148+
click.echo(f"Completed Association Study for {outcome}\n", color="green")
149+
results.append(result)
150+
151+
if len(outcomes) == 1:
152+
result = results[0]
153+
else:
154+
result = pd.concat(results)
155+
156+
# Sort across multiple outcomes
157+
if result.index.names == ["Variable", "Outcome", "Category"]:
158+
result = result.sort_values(["pvalue", "Beta_pvalue"])
159+
elif result.index.names == ["Variable", "Outcome"]:
160+
result = result.sort_values(["pvalue"])
161+
162+
click.echo("Completed association study", color="green")
163+
return result

clarite/modules/analyze/ewas.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,7 @@
33
import click
44

55
from clarite.modules.analyze import regression
6-
7-
builtin_regression_kinds = {
8-
"glm": regression.GLMRegression,
9-
"weighted_glm": regression.WeightedGLMRegression,
10-
"r_survey": regression.RSurveyRegression,
11-
}
6+
from clarite.modules.analyze.regression import builtin_regression_kinds
127

138

149
def ewas(
@@ -48,8 +43,11 @@ def ewas(
4843
Examples
4944
--------
5045
>>> ewas_discovery = clarite.analyze.ewas("logBMI", covariates, nhanes_discovery)
51-
Running EWAS on a continuous variable
46+
Running on a continuous variable
5247
"""
48+
raise DeprecationWarning(
49+
"This function will be depreciated in favor of clarite.analyze.association_study"
50+
)
5351
# Copy data to avoid modifying the original, in case it is changed
5452
data = data.copy(deep=True)
5553

@@ -80,10 +78,22 @@ def ewas(
8078
f"A valid string or a subclass of Regression is required."
8179
)
8280

81+
# regression variables are anything that isn't an outcome or covariate
82+
regression_variables = set(data.columns) - set(
83+
[
84+
outcome,
85+
]
86+
+ covariates
87+
)
88+
8389
# Initialize the regression and print details
8490
print(kwargs)
8591
regression = regression_cls(
86-
data=data, outcome_variable=outcome, covariates=covariates, **kwargs
92+
data=data,
93+
outcome_variable=outcome,
94+
covariates=covariates,
95+
regression_variables=regression_variables,
96+
**kwargs,
8797
)
8898
print(regression)
8999

0 commit comments

Comments
 (0)