Skip to content

Commit 4ff7289

Browse files
authored
100 standardize data (#101)
* Add standardize_data parameter which normalized continuous variables by z-score before the regression. * Refactored most analysis tests to use parametrize in order to remove some duplicate code.
1 parent 42a4c50 commit 4ff7289

File tree

7 files changed

+367
-548
lines changed

7 files changed

+367
-548
lines changed

clarite/modules/analyze/ewas.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,14 @@ def ewas(
5555

5656
# Set up regression object
5757
# Emulate existing API by figuring out which method automatically
58+
# glm if not specified, unless survey_design_spec is passed and isn't None
5859
if regression_kind is None:
5960
if "survey_design_spec" in kwargs:
60-
regression_kind = "weighted_glm"
61+
if kwargs["survey_design_spec"] is None:
62+
regression_kind = "glm"
63+
del kwargs["survey_design_spec"]
64+
else:
65+
regression_kind = "weighted_glm"
6166
else:
6267
regression_kind = "glm"
6368

clarite/modules/analyze/regression/base.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -116,12 +116,12 @@ def _validate_regression_params(self):
116116
raise ValueError("No variables are available to run regression on")
117117

118118
# Ensure covariates are all present and not unknown type
119-
covariate_types = [types.get(c, None) for c in self.covariates]
120-
missing_covariates = [
121-
c for c, dt in zip(self.covariates, covariate_types) if dt is None
122-
]
119+
self.covariate_types = {
120+
covariate: types.get(covariate, None) for covariate in self.covariates
121+
}
122+
missing_covariates = [c for c, dt in self.covariate_types.items() if dt is None]
123123
unknown_covariates = [
124-
c for c, dt in zip(self.covariates, covariate_types) if dt == "unknown"
124+
c for c, dt in self.covariate_types.items() if dt == "unknown"
125125
]
126126
if len(missing_covariates) > 0:
127127
raise ValueError(

clarite/modules/analyze/regression/glm_regression.py

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import patsy
88
import scipy
99
import statsmodels.api as sm
10+
from scipy.stats import stats
1011

1112
from clarite.internal.utilities import _remove_empty_categories
1213

@@ -50,6 +51,10 @@ class GLMRegression(Regression):
5051
If True, the results will contain one row for each categorical value (other than the reference category) and
5152
will include the beta value, standard error (SE), and beta pvalue for that specific category. The number of
5253
terms increases with the number of categories.
54+
standardize_data: boolean
55+
False by default.
56+
If True, numeric data will be standardized using z-scores before regression.
57+
This will affect the beta values and standard error, but not the pvalues.
5358
"""
5459

5560
def __init__(
@@ -59,6 +64,7 @@ def __init__(
5964
covariates: Optional[List[str]] = None,
6065
min_n: int = 200,
6166
report_categorical_betas: bool = False,
67+
standardize_data: bool = False,
6268
):
6369
"""
6470
Parameters
@@ -82,6 +88,7 @@ def __init__(
8288
# Custom init involving kwargs passed to this regression
8389
self.min_n = min_n
8490
self.report_categorical_betas = report_categorical_betas
91+
self.standardize_data = standardize_data
8592

8693
# Ensure the data output type is compatible
8794
# Set 'self.family' and 'self.use_t' which are dependent on the outcome dtype
@@ -121,6 +128,26 @@ def __init__(
121128
f"\n\t{na_outcome_count:,} are missing a value for the outcome variable"
122129
)
123130

131+
# Standardize continuous variables in the data if needed
132+
# Use ddof=1 in the zscore calculation (used for StdErr) to match R
133+
if self.standardize_data:
134+
if self.outcome_dtype == "continuous":
135+
self.data[self.outcome_variable] = stats.zscore(
136+
self.data[self.outcome_variable], nan_policy="omit", ddof=1
137+
)
138+
continuous_rvs = self.regression_variables["continuous"]
139+
self.data[continuous_rvs] = stats.zscore(
140+
self.data[continuous_rvs], nan_policy="omit", ddof=1
141+
)
142+
continuous_covars = [
143+
rv
144+
for rv, rv_type in self.covariate_types.items()
145+
if rv_type == "continuous"
146+
]
147+
self.data[continuous_covars] = stats.zscore(
148+
self.data[continuous_covars], nan_policy="omit", ddof=1
149+
)
150+
124151
# Finish updating description
125152
self.description += f"\nRegressing {sum([len(v) for v in self.regression_variables.values()]):,} variables"
126153
for k, v in self.regression_variables.items():
@@ -153,9 +180,10 @@ def _get_formulas(self, regression_variable, varying_covars) -> Tuple[str, str]:
153180

154181
return formula_restricted, formula
155182

156-
@staticmethod
157-
def _process_formula(formula, data) -> Tuple[pd.DataFrame, pd.DataFrame]:
158-
"""Use patsy to process the formula with quoted variable names, but return with the original names"""
183+
def _process_formula(self, formula, data) -> Tuple[pd.DataFrame, pd.DataFrame]:
184+
"""
185+
Use patsy to process the formula with quoted variable names, but return with the original names.
186+
"""
159187
y, X = patsy.dmatrices(formula, data, return_type="dataframe", NA_action="drop")
160188
y = fix_names(y)
161189
X = fix_names(X)
@@ -200,6 +228,9 @@ def get_results(self) -> pd.DataFrame:
200228
]
201229
result = result[column_order]
202230

231+
# Update datatypes
232+
result["Weight"] = result["Weight"].fillna("None").astype("category")
233+
203234
return result
204235

205236
def _run_continuous(self, data, regression_variable, formula) -> Dict:
@@ -248,7 +279,6 @@ def _run_binary(self, data, regression_variable, formula) -> Dict:
248279
return result
249280

250281
def _run_categorical(self, data, formula, formula_restricted) -> Dict:
251-
result = dict()
252282
# Regress both models
253283
y, X = self._process_formula(formula, data)
254284
est = sm.GLM(y, X, family=self.family).fit(use_t=self.use_t)

clarite/modules/analyze/regression/r_code/ewas_r.R

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,6 @@ regress_cat <- function(data, varying_covariates, outcome, var_name, regression_
193193

194194
regress_cat_survey <- function(data, varying_covariates, outcome, var_name, regression_family,
195195
weight_values, strata_values, fpc_values, id_values, subset_array, ...) {
196-
print(report_categorical_betas)
197196
# Create survey design
198197
if(is.null(id_values)){
199198
survey_design <- survey::svydesign(ids = ~1,
@@ -349,6 +348,15 @@ regress <- function(data, y, var_name, covariates, min_n, allowed_nonvarying, re
349348
return(data.frame(result, stringsAsFactors = FALSE))
350349
}
351350

351+
# Standardize data if needed
352+
if(standardize_data){
353+
allowed_to_scale_cols <- colnames(data) %in% c(y, var_name, varying_covariates)
354+
numeric_cols <- sapply(data, is.numeric) # Exclude factors
355+
binary_cols <- sapply(data, function(s){all(s==0 | s==1 | is.na(s))}) # Exclude binary encoded as 0/1/missing
356+
scale_cols <- allowed_to_scale_cols & numeric_cols & !binary_cols
357+
data[scale_cols] <- lapply(data[scale_cols], scale)
358+
}
359+
352360
# Run Regression for the single variable
353361
if(!use_survey){
354362
if(var_type == 'bin'){
@@ -408,6 +416,8 @@ regress <- function(data, y, var_name, covariates, min_n, allowed_nonvarying, re
408416
#' @param strata NULL by default (for no strata). May be set to a string name of a column in the data which provides strata IDs.
409417
#' @param fpc NULL by default (for no fpc). May be set to a string name of a column in the data which provides fpc values.
410418
#' @param subset_array NULL by default (for no subset). May be set to a boolean array used to subset the data after creating the design
419+
#' @param report_categorical_betas FALSE by default
420+
#' @param standardize_data FALSE by default
411421
#' @param ... other arguments passed to svydesign which are ignored if 'weights' is NULL
412422
#' @return data frame containing following fields Variable, Sample Size, Converged, SE, Beta, Variable p-value, LRT, AIC, pval, outcome, weight
413423
#' @export
@@ -420,12 +430,13 @@ ewas <- function(d, bin_vars=NULL, cat_vars=NULL, cont_vars=NULL, y,
420430
bin_covars=NULL, cat_covars=NULL, cont_covars=NULL,
421431
regression_family="gaussian", allowed_nonvarying=NULL, min_n=200, weights=NULL,
422432
ids=NULL, strata=NULL, fpc=NULL, subset_array=NULL,
423-
report_categorical_betas=FALSE, ...){
433+
report_categorical_betas=FALSE, standardize_data=FALSE, ...){
424434
# Record start time
425435
t1 <- Sys.time()
426436

427437
# Record global options
428438
report_categorical_betas <<- report_categorical_betas
439+
standardize_data <<- standardize_data
429440

430441
# Validate inputs
431442
#################

clarite/modules/analyze/regression/r_survey_regression.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ class RSurveyRegression(Regression):
3434
If True, the results will contain one row for each categorical value (other than the reference category) and
3535
will include the beta value, standard error (SE), and beta pvalue for that specific category. The number of
3636
terms increases with the number of categories.
37+
standardize_data: boolean
38+
False by default.
39+
If True, numeric data will be standardized using z-scores before regression.
40+
This will affect the beta values and standard error, but not the pvalues.
3741
"""
3842

3943
def __init__(
@@ -44,6 +48,7 @@ def __init__(
4448
survey_design_spec: Optional[SurveyDesignSpec] = None,
4549
min_n: int = 200,
4650
report_categorical_betas: bool = False,
51+
standardize_data: bool = False,
4752
):
4853
# base class init
4954
# This takes in minimal regression params (data, outcome_variable, covariates) and
@@ -56,6 +61,7 @@ def __init__(
5661
self.min_n = min_n
5762
self.survey_design_spec = survey_design_spec
5863
self.report_categorical_betas = report_categorical_betas
64+
self.standardize_data = standardize_data
5965

6066
# Note this runs the entire regression in R, returning a DataFrame instead of a dict.
6167
# Therefore, store the dataframe in self.result instead of a dict in self.results
@@ -144,6 +150,10 @@ def get_results(self) -> pd.DataFrame:
144150
]
145151
result = result[column_order]
146152

153+
# Convert datatype to match python results
154+
result["N"] = result["N"].astype("Int64")
155+
result["Weight"] = result["Weight"].fillna("None").astype("category")
156+
147157
return result
148158

149159
@requires("rpy2")
@@ -223,6 +233,7 @@ def run(self):
223233
allowed_nonvarying=allowed_nonvarying,
224234
min_n=self.min_n,
225235
report_categorical_betas=self.report_categorical_betas,
236+
standardize_data=self.standardize_data,
226237
)
227238
else:
228239
# Merge weights into data and get weight name(s) (Note 'data' becomes a local variable)
@@ -308,6 +319,7 @@ def run(self):
308319
weights=weights,
309320
subset=self.survey_design_spec.subset_array,
310321
drop_unweighted=self.survey_design_spec.drop_unweighted,
322+
standardize_data=self.standardize_data,
311323
**kwargs,
312324
)
313325

clarite/modules/analyze/regression/weighted_glm_regression.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,14 +53,18 @@ class WeightedGLMRegression(GLMRegression):
5353
min_n:
5454
Minimum number of complete-case observations (no NA values for outcome, covariates, variable, or weight)
5555
Defaults to 200
56-
report_betas: boolean
56+
report_categorical_betas: boolean
5757
False by default.
5858
If True, the results will contain one row for each categorical value (other than the reference category) and
5959
will include the beta value, standard error (SE), and beta pvalue for that specific category. The number of
6060
terms increases with the number of categories.
6161
cov_method:
6262
Covariance calculation method (if survey_design_spec is passed in). 'stata' by default.
6363
Warning: `jackknife` is untested and may not be accurate
64+
standardize_data: boolean
65+
False by default.
66+
If True, numeric data will be standardized using z-scores before regression.
67+
This will affect the beta values and standard error, but not the pvalues.
6468
"""
6569

6670
def __init__(
@@ -72,6 +76,7 @@ def __init__(
7276
min_n: int = 200,
7377
report_categorical_betas: bool = False,
7478
cov_method: Optional[str] = "stata",
79+
standardize_data: bool = False,
7580
):
7681
# survey_design_spec should actually not be None, but is a keyword for convenience
7782
if survey_design_spec is None:
@@ -84,6 +89,7 @@ def __init__(
8489
covariates=covariates,
8590
min_n=min_n,
8691
report_categorical_betas=report_categorical_betas,
92+
standardize_data=standardize_data,
8793
)
8894

8995
# Custom init involving kwargs passed to this regression
@@ -259,6 +265,8 @@ def run(self):
259265
for rv in rv_list:
260266
# Run in a try/except block to catch any errors specific to a regression variable
261267
try:
268+
# Must define result to catch errors outside running individual variables
269+
result = None
262270
# Take a copy of the data (ignoring other RVs) and create a keep_rows mask
263271
keep_columns = [rv, self.outcome_variable] + self.covariates
264272
data = self.data[keep_columns]

0 commit comments

Comments
 (0)