Skip to content

Commit 2b5bcc7

Browse files
authored
Change "weighted" to "edge" and fix docs (#111)
* Change 'weighted' to 'edge' encoding and update pandas-genomics * Fix __all__ to use strings * whoops * Fix doc errors * Fix docs * Fix docs: missed some files
1 parent 3b4e61d commit 2b5bcc7

31 files changed

+2278
-2284
lines changed

clarite/modules/analyze/__init__.py

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,9 @@
44
55
Functions used for analyses such as EWAS
66
7-
.. autosummary::
8-
:toctree: modules/analyze
9-
10-
association_study
11-
interaction_study
12-
add_corrected_pvalues
7+
.. autofunction:: association_study
8+
.. autofunction:: interaction_study
9+
.. autofunction:: add_corrected_pvalues
1310
1411
"""
1512

@@ -20,11 +17,11 @@
2017
from . import regression
2118

2219
__all__ = [
23-
association_study,
24-
ewas,
25-
interaction_study,
26-
add_corrected_pvalues,
27-
regression,
20+
"association_study",
21+
"ewas",
22+
"interaction_study",
23+
"add_corrected_pvalues",
24+
"regression",
2825
]
2926

3027
# Constants
@@ -42,6 +39,6 @@
4239
]
4340
corrected_pvalue_columns = ["pvalue_bonferroni", "pvalue_fdr"]
4441

45-
__all__.append(required_result_columns)
46-
__all__.append(result_columns)
47-
__all__.append(corrected_pvalue_columns)
42+
__all__.append("required_result_columns")
43+
__all__.append("result_columns")
44+
__all__.append("corrected_pvalue_columns")

clarite/modules/analyze/association_study.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def association_study(
1919
covariates: Optional[Union[str, List[str]]] = None,
2020
regression_kind: Optional[Union[str, Type[regression.Regression]]] = None,
2121
encoding: str = "additive",
22-
weighted_encoding_info: Optional[pd.DataFrame] = None,
22+
edge_encoding_info: Optional[pd.DataFrame] = None,
2323
**kwargs,
2424
):
2525
"""
@@ -45,8 +45,8 @@ def association_study(
4545
and 'weighted_glm' if it is.
4646
encoding: str, default "additive"
4747
Encoding method to use for any genotype data. One of {'additive', 'dominant', 'recessive', 'codominant', or 'edge'}
48-
weighted_encoding_info: Optional pd.DataFrame, default None
49-
If weighted encoding is used, this must be provided. See Pandas-Genomics documentation on weighted encodings.
48+
edge_encoding_info: Optional pd.DataFrame, default None
49+
If edge encoding is used, this must be provided. See Pandas-Genomics documentation on edge encodings.
5050
kwargs: Keyword arguments specific to the Regression being used
5151
5252
Returns
@@ -73,13 +73,13 @@ def association_study(
7373
data = data.genomics.encode_recessive()
7474
elif encoding == "codominant":
7575
data = data.genomics.encode_codominant()
76-
elif encoding == "weighted":
77-
if weighted_encoding_info is None:
76+
elif encoding == "edge":
77+
if edge_encoding_info is None:
7878
raise ValueError(
79-
"'weighted_encoding_info' must be provided when using weighted encoding"
79+
"'edge_encoding_info' must be provided when using edge encoding"
8080
)
8181
else:
82-
data = data.genomics.encode_weighted(weighted_encoding_info)
82+
data = data.genomics.encode_edge(edge_encoding_info)
8383
else:
8484
raise ValueError(f"Genotypes provided with unknown 'encoding': {encoding}")
8585

clarite/modules/analyze/interaction_study.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ def interaction_study(
1313
interactions: Optional[Union[List[Tuple[str, str]], str]] = None,
1414
covariates: Optional[Union[str, List[str]]] = None,
1515
encoding: str = "additive",
16-
weighted_encoding_info: Optional[pd.DataFrame] = None,
16+
edge_encoding_info: Optional[pd.DataFrame] = None,
1717
report_betas: bool = False,
1818
min_n: int = 200,
1919
):
@@ -39,9 +39,9 @@ def interaction_study(
3939
covariates: str, List[str], or None (default)
4040
The variable (str) or variables (List) to be used as covariates in each regression.
4141
encoding: str, default "additive"
42-
Encoding method to use for any genotype data. One of {'additive', 'dominant', 'recessive', 'codominant', or 'weighted'}
43-
weighted_encoding_info: Optional pd.DataFrame, default None
44-
If weighted encoding is used, this must be provided. See Pandas-Genomics documentation on weighted encodings.
42+
Encoding method to use for any genotype data. One of {'additive', 'dominant', 'recessive', 'codominant', or 'edge'}
43+
edge_encoding_info: Optional pd.DataFrame, default None
44+
If edge encoding is used, this must be provided. See Pandas-Genomics documentation on edge encoding.
4545
report_betas: boolean
4646
False by default.
4747
If True, the results will contain one row for each interaction term and will include the beta value,
@@ -75,13 +75,13 @@ def interaction_study(
7575
data = data.genomics.encode_recessive()
7676
elif encoding == "codominant":
7777
data = data.genomics.encode_codominant()
78-
elif encoding == "weighted":
79-
if weighted_encoding_info is None:
78+
elif encoding == "edge":
79+
if edge_encoding_info is None:
8080
raise ValueError(
81-
"'weighted_encoding_info' must be provided when using weighted encoding"
81+
"'edge_encoding_info' must be provided when using edge encoding"
8282
)
8383
else:
84-
data = data.genomics.encode_weighted(weighted_encoding_info)
84+
data = data.genomics.encode_edge(edge_encoding_info)
8585
else:
8686
raise ValueError(f"Genotypes provided with unknown 'encoding': {encoding}")
8787

clarite/modules/analyze/regression/__init__.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88
.. autoclass:: Regression
99
1010
11-
clarite.analyze.ewas
12-
--------------------
11+
clarite.analyze.association_study
12+
---------------------------------
1313
1414
The `regression_kind` parameter can be set to use one of three regression classes, or a custom subclass of `Regression`
1515
can be created.
@@ -21,8 +21,8 @@
2121
.. autoclass:: RSurveyRegression
2222
2323
24-
clarite.analyze.interactions
25-
----------------------------
24+
clarite.analyze.interaction_study
25+
---------------------------------
2626
2727
.. autoclass:: InteractionRegression
2828
@@ -43,10 +43,10 @@
4343

4444

4545
__all__ = [
46-
GLMRegression,
47-
RSurveyRegression,
48-
WeightedGLMRegression,
49-
Regression,
50-
InteractionRegression,
51-
builtin_regression_kinds,
46+
"GLMRegression",
47+
"RSurveyRegression",
48+
"WeightedGLMRegression",
49+
"Regression",
50+
"InteractionRegression",
51+
"builtin_regression_kinds",
5252
]

clarite/modules/analyze/regression/base.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ class Regression(metaclass=ABCMeta):
1212
"""
1313
Abstract Base Class for Regression objects used in EWAS.
1414
15-
Minimum Parameters
16-
------------------
15+
Parameters
16+
----------
1717
data: pd.DataFrame
1818
Data used in the analysis
1919
outcome_variable: str
@@ -25,10 +25,11 @@ class Regression(metaclass=ABCMeta):
2525
Any variables in the DataFrames not listed as covariates are regressed.
2626
Use `None` or an empty list when no covariates are being used.
2727
28-
Abstract Methods
29-
----------------
30-
run() -> None
31-
get_results() -> pd.DataFrame
28+
Notes
29+
-----
30+
These are the abstract methods:
31+
* run() -> None
32+
* get_results() -> pd.DataFrame
3233
"""
3334

3435
def __init__(

clarite/modules/analyze/regression/glm_regression.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,21 +20,21 @@ class GLMRegression(Regression):
2020
Statsmodels GLM Regression.
2121
This class handles running a regression for each variable of interest and collecting results.
2222
23-
Regression Methods
24-
------------------
23+
Notes
24+
-----
25+
* The family used is either Gaussian (continuous outcomes) or binomial(logit) for binary outcomes.
26+
* Covariate variables that are constant produce warnings and are ignored
27+
* The dataset is subset to drop missing values, and the same dataset is used for both models in the LRT
28+
29+
*Regression Methods*
30+
2531
Binary variables
2632
Treated as continuous features, with values of 0 and 1 (the larger value in the original data is encoded as 1).
2733
Categorical variables
2834
The results of a likelihood ratio test are used to calculate a pvalue. No Beta or SE values are reported.
2935
Continuous variables
3036
A GLM is used to obtain Beta, SE, and pvalue results.
3137
32-
Notes
33-
-----
34-
* The family used is either Gaussian (continuous outcomes) or binomial(logit) for binary outcomes.
35-
* Covariates variables that are constant produce warnings and are ignored
36-
* The dataset is subset to drop missing values, and the same dataset is used for both models in the LRT
37-
3838
Parameters
3939
----------
4040
data:

clarite/modules/analyze/regression/r_survey_regression.py

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from pathlib import Path
22
from typing import List, Optional
33

4-
import numpy as np
54
import pandas as pd
65

76
from clarite.internal.utilities import requires, _get_dtypes
@@ -12,33 +11,33 @@
1211

1312
class RSurveyRegression(Regression):
1413
"""
15-
Run regressions by calling R from Python
16-
When a SurveyDesignSpec is provided, the R *survey* library is used.
17-
Results should match those run with either GLMRegression or WeightedGLMRegression.
14+
Run regressions by calling R from Python
15+
When a SurveyDesignSpec is provided, the R *survey* library is used.
16+
Results should match those run with either GLMRegression or WeightedGLMRegression.
1817
19-
Parameters
20-
----------
21-
data:
18+
Parameters
19+
----------
20+
data:
2221
The data to be analyzed, including the outcome, covariates, and any variables to be regressed.
23-
outcome_variable:
22+
outcome_variable:
2423
The variable to be used as the output (y) of the regression
25-
covariates:
24+
covariates:
2625
The variables to be used as covariates. Any variables in the DataFrames not listed as covariates are regressed.
27-
survey_design_spec:
28-
A SurveyDesignSpec object is used to create SurveyDesign objects for each regression.
29-
Use None if unweighted regression is desired.
30-
min-n:
26+
survey_design_spec:
27+
A SurveyDesignSpec object is used to create SurveyDesign objects for each regression.
28+
Use None if unweighted regression is desired.
29+
min_n:
3130
Minimum number of complete-case observations (no NA values for outcome, covariates, variable, or weight)
3231
Defaults to 200
3332
report_betas: boolean
34-
False by default.
33+
False by default.
3534
If True, the results will contain one row for each categorical value (other than the reference category) and
3635
will include the beta value, standard error (SE), and beta pvalue for that specific category. The number of
3736
terms increases with the number of categories.
3837
standardize_data: boolean
3938
False by default.
40-
If True, numeric data will be standardized using z-scores before regression.
41-
This will affect the beta values and standard error, but not the pvalues.
39+
If True, numeric data will be standardized using z-scores before regression.
40+
This will affect the beta values and standard error, but not the pvalues.
4241
"""
4342

4443
def __init__(

clarite/modules/analyze/regression/weighted_glm_regression.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,6 @@ class WeightedGLMRegression(GLMRegression):
2121
The statistical adjustments (primarily the covariance calculation) are designed to match results when running with
2222
the R `survey` library.
2323
24-
Regression Methods
25-
------------------
26-
Binary variables
27-
Treated as continuous features, with values of 0 and 1 (the larger value in the original data is encoded as 1).
28-
Categorical variables
29-
The results of a likelihood ratio test are used to calculate a pvalue. No Beta or SE values are reported.
30-
Continuous variables
31-
A GLM is used to obtain Beta, SE, and pvalue results.
32-
3324
Notes
3425
-----
3526
* The family used is Gaussian for continuous outcomes or binomial(logit) for binary outcomes.
@@ -40,6 +31,15 @@ class WeightedGLMRegression(GLMRegression):
4031
* Categorical variables run with a survey design will not report Diff_AIC as it may not be possible to calculate
4132
it accurately
4233
34+
*Regression Methods*
35+
36+
Binary variables
37+
Treated as continuous features, with values of 0 and 1 (the larger value in the original data is encoded as 1).
38+
Categorical variables
39+
The results of a likelihood ratio test are used to calculate a pvalue. No Beta or SE values are reported.
40+
Continuous variables
41+
A GLM is used to obtain Beta, SE, and pvalue results.
42+
4343
Parameters
4444
----------
4545
data:

clarite/modules/describe.py

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,12 @@
44
55
Functions that are used to gather information about some data
66
7-
.. autosummary::
8-
:toctree: modules/describe
9-
10-
correlations
11-
freq_table
12-
get_types
13-
percent_na
14-
skewness
15-
summarize
7+
.. autofunction:: correlations
8+
.. autofunction:: freq_table
9+
.. autofunction:: get_types
10+
.. autofunction:: percent_na
11+
.. autofunction:: skewness
12+
.. autofunction:: summarize
1613
1714
"""
1815

@@ -215,7 +212,7 @@ def skewness(data: pd.DataFrame, dropna: bool = False):
215212
result: pd.DataFrame
216213
DataFrame listing three values for each continuous variable and NA for others: skew, zscore, and pvalue
217214
The null hypothesis of the test is that the skewness of the population the samples were drawn from is the same as that of the corresponding
218-
normal distribution. The pvalue is the two-sided pvalue for the hypothesis test
215+
normal distribution. The pvalue is the two-sided pvalue for the hypothesis test
219216
220217
Examples
221218
--------

clarite/modules/load.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,8 @@
44
55
Load data from different formats or sources
66
7-
.. autosummary::
8-
:toctree: modules/load
9-
10-
from_tsv
11-
from_csv
7+
.. autofunction:: from_tsv
8+
.. autofunction:: from_csv
129
"""
1310

1411
from typing import Optional, Union

0 commit comments

Comments
 (0)