100 standardize data (#101)

jrm5100 · web-flow · commit 4ff72890ea73 · 2021-06-22T13:31:47.000-04:00
* Add standardize_data parameter which normalizes continuous variables by z-score before the regression.
* Refactored most analysis tests to use parametrize in order to remove some duplicate code.
diff --git a/clarite/modules/analyze/ewas.py b/clarite/modules/analyze/ewas.py
@@ -55,9 +55,14 @@ def ewas(
 
     # Set up regression object
     # Emulate existing API by figuring out which method automatically
+    # glm if not specified, unless survey_design_spec is passed and isn't None
     if regression_kind is None:
         if "survey_design_spec" in kwargs:
-            regression_kind = "weighted_glm"
+            if kwargs["survey_design_spec"] is None:
+                regression_kind = "glm"
+                del kwargs["survey_design_spec"]
+            else:
+                regression_kind = "weighted_glm"
         else:
             regression_kind = "glm"
 
diff --git a/clarite/modules/analyze/regression/base.py b/clarite/modules/analyze/regression/base.py
@@ -116,12 +116,12 @@ def _validate_regression_params(self):
             raise ValueError("No variables are available to run regression on")
 
         # Ensure covariates are all present and not unknown type
-        covariate_types = [types.get(c, None) for c in self.covariates]
-        missing_covariates = [
-            c for c, dt in zip(self.covariates, covariate_types) if dt is None
-        ]
+        self.covariate_types = {
+            covariate: types.get(covariate, None) for covariate in self.covariates
+        }
+        missing_covariates = [c for c, dt in self.covariate_types.items() if dt is None]
         unknown_covariates = [
-            c for c, dt in zip(self.covariates, covariate_types) if dt == "unknown"
+            c for c, dt in self.covariate_types.items() if dt == "unknown"
         ]
         if len(missing_covariates) > 0:
             raise ValueError(
diff --git a/clarite/modules/analyze/regression/glm_regression.py b/clarite/modules/analyze/regression/glm_regression.py
@@ -7,6 +7,7 @@
 import patsy
 import scipy
 import statsmodels.api as sm
+from scipy.stats import stats
 
 from clarite.internal.utilities import _remove_empty_categories
 
@@ -50,6 +51,10 @@ class GLMRegression(Regression):
           If True, the results will contain one row for each categorical value (other than the reference category) and
           will include the beta value, standard error (SE), and beta pvalue for that specific category. The number of
           terms increases with the number of categories.
+    standardize_data: boolean
+        False by default.
+          If True, numeric data will be standardized using z-scores before regression.
+          This will affect the beta values and standard error, but not the pvalues.
     """
 
     def __init__(
@@ -59,6 +64,7 @@ def __init__(
         covariates: Optional[List[str]] = None,
         min_n: int = 200,
         report_categorical_betas: bool = False,
+        standardize_data: bool = False,
     ):
         """
         Parameters
@@ -82,6 +88,7 @@ def __init__(
         # Custom init involving kwargs passed to this regression
         self.min_n = min_n
         self.report_categorical_betas = report_categorical_betas
+        self.standardize_data = standardize_data
 
         # Ensure the data output type is compatible
         # Set 'self.family' and 'self.use_t' which are dependent on the outcome dtype
@@ -121,6 +128,26 @@ def __init__(
                 f"\n\t{na_outcome_count:,} are missing a value for the outcome variable"
             )
 
+        # Standardize continuous variables in the data if needed
+        # Use ddof=1 in the zscore calculation (used for StdErr) to match R
+        if self.standardize_data:
+            if self.outcome_dtype == "continuous":
+                self.data[self.outcome_variable] = stats.zscore(
+                    self.data[self.outcome_variable], nan_policy="omit", ddof=1
+                )
+            continuous_rvs = self.regression_variables["continuous"]
+            self.data[continuous_rvs] = stats.zscore(
+                self.data[continuous_rvs], nan_policy="omit", ddof=1
+            )
+            continuous_covars = [
+                rv
+                for rv, rv_type in self.covariate_types.items()
+                if rv_type == "continuous"
+            ]
+            self.data[continuous_covars] = stats.zscore(
+                self.data[continuous_covars], nan_policy="omit", ddof=1
+            )
+
         # Finish updating description
         self.description += f"\nRegressing {sum([len(v) for v in self.regression_variables.values()]):,} variables"
         for k, v in self.regression_variables.items():
@@ -153,9 +180,10 @@ def _get_formulas(self, regression_variable, varying_covars) -> Tuple[str, str]:
 
         return formula_restricted, formula
 
-    @staticmethod
-    def _process_formula(formula, data) -> Tuple[pd.DataFrame, pd.DataFrame]:
-        """Use patsy to process the formula with quoted variable names, but return with the original names"""
+    def _process_formula(self, formula, data) -> Tuple[pd.DataFrame, pd.DataFrame]:
+        """
+        Use patsy to process the formula with quoted variable names, but return with the original names.
+        """
         y, X = patsy.dmatrices(formula, data, return_type="dataframe", NA_action="drop")
         y = fix_names(y)
         X = fix_names(X)
@@ -200,6 +228,9 @@ def get_results(self) -> pd.DataFrame:
         ]
         result = result[column_order]
 
+        # Update datatypes
+        result["Weight"] = result["Weight"].fillna("None").astype("category")
+
         return result
 
     def _run_continuous(self, data, regression_variable, formula) -> Dict:
@@ -248,7 +279,6 @@ def _run_binary(self, data, regression_variable, formula) -> Dict:
         return result
 
     def _run_categorical(self, data, formula, formula_restricted) -> Dict:
-        result = dict()
         # Regress both models
         y, X = self._process_formula(formula, data)
         est = sm.GLM(y, X, family=self.family).fit(use_t=self.use_t)
diff --git a/clarite/modules/analyze/regression/r_code/ewas_r.R b/clarite/modules/analyze/regression/r_code/ewas_r.R
@@ -193,7 +193,6 @@ regress_cat <- function(data, varying_covariates, outcome, var_name, regression_
 
 regress_cat_survey <- function(data, varying_covariates, outcome, var_name, regression_family,
                                weight_values, strata_values, fpc_values, id_values, subset_array, ...) {
-  print(report_categorical_betas)
   # Create survey design
   if(is.null(id_values)){
     survey_design <- survey::svydesign(ids = ~1,
@@ -349,6 +348,15 @@ regress <- function(data, y, var_name, covariates, min_n, allowed_nonvarying, re
     return(data.frame(result, stringsAsFactors = FALSE))
   }
 
+  # Standardize data if needed
+  if(standardize_data){
+    allowed_to_scale_cols <- colnames(data) %in% c(y, var_name, varying_covariates)
+    numeric_cols <- sapply(data, is.numeric)  # Exclude factors
+    binary_cols <- sapply(data, function(s){all(s==0 | s==1 | is.na(s))})  # Exclude binary encoded as 0/1/missing
+    scale_cols <- allowed_to_scale_cols & numeric_cols & !binary_cols
+    data[scale_cols] <- lapply(data[scale_cols], scale)
+  }
+
   # Run Regression for the single variable
   if(!use_survey){
     if(var_type == 'bin'){
@@ -408,6 +416,8 @@ regress <- function(data, y, var_name, covariates, min_n, allowed_nonvarying, re
 #' @param strata NULL by default (for no strata).  May be set to a string name of a column in the data which provides strata IDs.
 #' @param fpc NULL by default (for no fpc).  May be set to a string name of a column in the data which provides fpc values.
 #' @param subset_array NULL by default (for no subset).  May be set to a boolean array used to subset the data after creating the design
+#' @param report_categorical_betas FALSE by default
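+#' @param standardize_data FALSE by default. If TRUE, numeric (non-factor, non-binary) columns among the outcome, the tested variable, and the varying covariates are standardized with scale() before regression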
 #' @param ... other arguments passed to svydesign which are ignored if 'weights' is NULL
 #' @return data frame containing following fields Variable, Sample Size, Converged, SE, Beta, Variable p-value, LRT, AIC, pval, outcome, weight
 #' @export
@@ -420,12 +430,13 @@ ewas <- function(d, bin_vars=NULL, cat_vars=NULL, cont_vars=NULL, y,
                  bin_covars=NULL, cat_covars=NULL, cont_covars=NULL,
                  regression_family="gaussian", allowed_nonvarying=NULL, min_n=200, weights=NULL,
                  ids=NULL, strata=NULL, fpc=NULL, subset_array=NULL,
-                 report_categorical_betas=FALSE, ...){
+                 report_categorical_betas=FALSE, standardize_data=FALSE, ...){
   # Record start time
   t1 <- Sys.time()
 
   # Record global options
   report_categorical_betas <<- report_categorical_betas
+  standardize_data <<- standardize_data
 
   # Validate inputs
   #################
diff --git a/clarite/modules/analyze/regression/r_survey_regression.py b/clarite/modules/analyze/regression/r_survey_regression.py
@@ -34,6 +34,10 @@ class RSurveyRegression(Regression):
         If True, the results will contain one row for each categorical value (other than the reference category) and
         will include the beta value, standard error (SE), and beta pvalue for that specific category. The number of
         terms increases with the number of categories.
+    standardize_data: boolean
+        False by default.
+          If True, numeric data will be standardized using z-scores before regression.
+          This will affect the beta values and standard error, but not the pvalues.
     """
 
     def __init__(
@@ -44,6 +48,7 @@ def __init__(
         survey_design_spec: Optional[SurveyDesignSpec] = None,
         min_n: int = 200,
         report_categorical_betas: bool = False,
+        standardize_data: bool = False,
     ):
         # base class init
         # This takes in minimal regression params (data, outcome_variable, covariates) and
@@ -56,6 +61,7 @@ def __init__(
         self.min_n = min_n
         self.survey_design_spec = survey_design_spec
         self.report_categorical_betas = report_categorical_betas
+        self.standardize_data = standardize_data
 
         # Note this runs the entire regression in R, returning a DataFrame instead of a dict.
         # Therefore, store the dataframe in self.result instead of a dict in self.results
@@ -144,6 +150,10 @@ def get_results(self) -> pd.DataFrame:
         ]
         result = result[column_order]
 
+        # Convert datatype to match python results
+        result["N"] = result["N"].astype("Int64")
+        result["Weight"] = result["Weight"].fillna("None").astype("category")
+
         return result
 
     @requires("rpy2")
@@ -223,6 +233,7 @@ def run(self):
                     allowed_nonvarying=allowed_nonvarying,
                     min_n=self.min_n,
                     report_categorical_betas=self.report_categorical_betas,
+                    standardize_data=self.standardize_data,
                 )
         else:
             # Merge weights into data and get weight name(s) (Note 'data' becomes a local variable)
@@ -308,6 +319,7 @@ def run(self):
                     weights=weights,
                     subset=self.survey_design_spec.subset_array,
                     drop_unweighted=self.survey_design_spec.drop_unweighted,
+                    standardize_data=self.standardize_data,
                     **kwargs,
                 )
 
diff --git a/clarite/modules/analyze/regression/weighted_glm_regression.py b/clarite/modules/analyze/regression/weighted_glm_regression.py
@@ -53,14 +53,18 @@ class WeightedGLMRegression(GLMRegression):
     min_n:
         Minimum number of complete-case observations (no NA values for outcome, covariates, variable, or weight)
         Defaults to 200
-    report_betas: boolean
+    report_categorical_betas: boolean
         False by default.
           If True, the results will contain one row for each categorical value (other than the reference category) and
           will include the beta value, standard error (SE), and beta pvalue for that specific category. The number of
           terms increases with the number of categories.
     cov_method:
         Covariance calculation method (if survey_design_spec is passed in).  'stata' by default.
         Warning: `jackknife` is untested and may not be accurate
+    standardize_data: boolean
+        False by default.
+          If True, numeric data will be standardized using z-scores before regression.
+          This will affect the beta values and standard error, but not the pvalues.
     """
 
     def __init__(
@@ -72,6 +76,7 @@ def __init__(
         min_n: int = 200,
         report_categorical_betas: bool = False,
         cov_method: Optional[str] = "stata",
+        standardize_data: bool = False,
     ):
         # survey_design_spec should actually not be None, but is a keyword for convenience
         if survey_design_spec is None:
@@ -84,6 +89,7 @@ def __init__(
             covariates=covariates,
             min_n=min_n,
             report_categorical_betas=report_categorical_betas,
+            standardize_data=standardize_data,
         )
 
         # Custom init involving kwargs passed to this regression
@@ -259,6 +265,8 @@ def run(self):
             for rv in rv_list:
                 # Run in a try/except block to catch any errors specific to a regression variable
                 try:
+                    # Define result up front so it exists even when an error is raised outside of the per-variable regression call
+                    result = None
                     # Take a copy of the data (ignoring other RVs) and create a keep_rows mask
                     keep_columns = [rv, self.outcome_variable] + self.covariates
                     data = self.data[keep_columns]
diff --git a/clarite/tests/analyze/test_ewas.py b/clarite/tests/analyze/test_ewas.py