HallLab
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 0 deletions b/‎.gitignore‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎clarite/internal/utilities.py‎
Lines changed: 21 additions & 13 deletions b/‎clarite/internal/utilities.py‎
Lines changed: 21 additions & 13 deletions
diff --git a/‎clarite/modules/analyze/__init__.py‎
Lines changed: 11 additions & 4 deletions b/‎clarite/modules/analyze/__init__.py‎
Lines changed: 11 additions & 4 deletions
diff --git a/‎clarite/modules/analyze/association_study.py‎
Lines changed: 163 additions & 0 deletions b/‎clarite/modules/analyze/association_study.py‎
Lines changed: 163 additions & 0 deletions
diff --git a/‎clarite/modules/analyze/ewas.py‎
Lines changed: 18 additions & 8 deletions b/‎clarite/modules/analyze/ewas.py‎
Lines changed: 18 additions & 8 deletions
@@ -3,6 +3,8 @@
 /docs/notebooks/.ipynb_checkpoints
 /docs/source/modules
 
+poetry.lock
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 
@@ -4,6 +4,7 @@
 
 import click
 import pandas as pd
+from pandas_genomics import GenotypeDtype
 
 
 def print_wrap(func):
@@ -87,13 +88,20 @@ def _validate_skip_only(
 
 def _get_dtypes(data: pd.DataFrame):
     """Return a Series of CLARITE dtypes indexed by variable name"""
-    # Ensure that 'data' is a DataFrame and not a Series
-    if type(data) != pd.DataFrame:
-        raise ValueError("The passed 'data' is not a Pandas DataFrame")
+    # Ensure that 'data' is a DataFrame or Series (which is converted to a DataFrame)
+    if isinstance(data, pd.Series):
+        data = pd.DataFrame(data)
+    if not isinstance(data, pd.DataFrame):
+        raise ValueError("The passed 'data' is not a Pandas DataFrame or Series")
 
     # Start with all as unknown
     dtypes = pd.Series("unknown", index=data.columns)
 
+    # Set genotype arrays
+    gt_cols = data.apply(lambda col: GenotypeDtype.is_dtype(col))
+    gt_cols = gt_cols[gt_cols].index
+    dtypes.loc[gt_cols] = "genotypes"
+
     # Set binary and categorical
     data_catbin = data.loc[:, data.dtypes == "category"]
     if len(data_catbin.columns) > 0:
@@ -132,7 +140,9 @@ def _get_dtypes(data: pd.DataFrame):
 def _get_dtype(data: pd.Series):
     """Return the CLARITE dtype of a pandas series"""
     # Set binary and categorical
-    if data.dtype.name == "category":
+    if GenotypeDtype.is_dtype(data):
+        return "genotypes"
+    elif data.dtype.name == "category":
         num_categories = len(data.cat.categories)
         if num_categories == 1:
             return "constant"
@@ -195,15 +205,13 @@ def _remove_empty_categories(
         dtypes = data.loc[:, columns].dtypes
         catvars = [v for v in dtypes[dtypes == "category"].index]
         for var in catvars:
-            counts = data[var].value_counts()
-            keep_cats = list(counts[counts > 0].index)
-            if len(keep_cats) < len(counts):
-                removed_cats[var] = set(counts.index) - set(keep_cats)
-                data[var].cat.set_categories(
-                    new_categories=keep_cats,
-                    ordered=data[var].cat.ordered,
-                    inplace=True,
-                )
+            existing_cats = data[var].cat.categories
+            if data[var].cat.ordered:
+                print()
+            data[var] = data[var].cat.remove_unused_categories()
+            removed_categories = set(existing_cats) - set(data[var].cat.categories)
+            if len(removed_categories) > 0:
+                removed_cats[var] = removed_categories
         return removed_cats
     elif type(data) == pd.Series:
         assert skip is None
 
@@ -7,18 +7,25 @@
   .. autosummary::
      :toctree: modules/analyze
 
-     ewas
-     interaction_test
+     association_study
+     interaction_study
      add_corrected_pvalues
 
 """
 
+from .association_study import association_study
 from .ewas import ewas
-from .interactions import interaction_test
+from .interaction_study import interaction_study
 from .utils import add_corrected_pvalues
 from . import regression
 
-__all__ = [ewas, interaction_test, add_corrected_pvalues, regression]
+__all__ = [
+    association_study,
+    ewas,
+    interaction_study,
+    add_corrected_pvalues,
+    regression,
+]
 
 # Constants
 required_result_columns = {"N", "pvalue", "error", "warnings"}
 
@@ -0,0 +1,163 @@
+from typing import Optional, Union, Type, List
+
+import click
+import pandas as pd
+from pandas_genomics import GenotypeDtype
+
+from clarite.modules.analyze import regression
+from clarite.modules.analyze.regression import (
+    builtin_regression_kinds,
+    WeightedGLMRegression,
+    GLMRegression,
+)
+
+
+def association_study(
+    data: pd.DataFrame,
+    outcomes: Union[str, List[str]],
+    regression_variables: Optional[Union[str, List[str]]] = None,
+    covariates: Optional[Union[str, List[str]]] = None,
+    regression_kind: Optional[Union[str, Type[regression.Regression]]] = None,
+    encoding: str = "additive",
+    weighted_encoding_info: Optional[pd.DataFrame] = None,
+    **kwargs,
+):
+    """
+    Run an association study (EWAS, PhEWAS, GWAS, GxEWAS, etc)
+
+    Individual regression classes selected with `regression_kind` may work slightly differently.
+    Results are sorted in order of increasing `pvalue`
+
+    Parameters
+    ----------
+    data: pd.DataFrame
+        Contains all outcomes, regression_variables, and covariates
+    outcomes: str or List[str]
+        The exogenous variable (str) or variables (List) to be used as the output of each regression.
+    regression_variables: str, List[str], or None
+        The endogenous variable (str) or variables (List) to be used invididually as inputs into regression.
+        If None, use all variables in `data` that aren't an outcome or a covariate
+    covariates: str, List[str], or None (default)
+        The variable (str) or variables (List) to be used as covariates in each regression.
+    regression_kind: None, str or subclass of Regression
+        This can be 'glm', 'weighted_glm', or 'r_survey' for built-in Regression types,
+        or a custom subclass of Regression.  If None, it is set to 'glm' if a survey design is not specified
+        and 'weighted_glm' if it is.
+    encoding: str, default "additive"
+        Encoding method to use for any genotype data.  One of {'additive', 'dominant', 'recessive', 'codominant', or 'weighted'}
+    weighted_encoding_info: Optional pd.DataFrame, default None
+        If weighted encoding is used, this must be provided.  See Pandas-Genomics documentation on weighted encodings.
+    kwargs: Keyword arguments specific to the Regression being used
+
+    Returns
+    -------
+    df: pd.DataFrame
+        Association Study results DataFrame with at least these columns: ['N', 'pvalue', 'error', 'warnings'].
+        Indexed by the outcome variable and the variable being assessed in each regression
+    """
+    # Copy data to avoid modifying the original, in case it is changed
+    data = data.copy(deep=True)
+
+    # Encode any genotype data
+    has_genotypes = False
+    for dt in data.dtypes:
+        if GenotypeDtype.is_dtype(dt):
+            has_genotypes = True
+            break
+    if has_genotypes:
+        if encoding == "additive":
+            data = data.genomics.encode_additive()
+        elif encoding == "dominant":
+            data = data.genomics.encode_dominant()
+        elif encoding == "recessive":
+            data = data.genomics.encode_recessive()
+        elif encoding == "codominant":
+            data = data.genomics.encode_codominant()
+        elif encoding == "weighted":
+            if weighted_encoding_info is None:
+                raise ValueError(
+                    "'weighted_encoding_info' must be provided when using weighted encoding"
+                )
+            else:
+                data = data.genomics.encode_weighted(weighted_encoding_info)
+        else:
+            raise ValueError(f"Genotypes provided with unknown 'encoding': {encoding}")
+
+    # Ensure outcome, covariates, and regression variables are lists
+    if isinstance(outcomes, str):
+        outcomes = [
+            outcomes,
+        ]
+    if isinstance(covariates, str):
+        covariates = [
+            covariates,
+        ]
+    elif covariates is None:
+        covariates = []
+    if isinstance(regression_variables, str):
+        regression_variables = [
+            regression_variables,
+        ]
+    elif regression_variables is None:
+        regression_variables = list(set(data.columns) - set(outcomes) - set(covariates))
+
+    # Delete the survey_design_spec kwarg if it is None
+    # This would be fine, but kwarg parsing for different clases means possibly passing it to an init that isn't expecting it
+    if "survey_design_spec" in kwargs:
+        if kwargs["survey_design_spec"] is None:
+            del kwargs["survey_design_spec"]
+
+    # Parse regression kind
+    if regression_kind is None:
+        # Match the original api, which is glm or weighted_glm based on whether a design is passes
+        if "survey_design_spec" in kwargs:
+            regression_cls = WeightedGLMRegression
+        else:
+            regression_cls = GLMRegression
+    elif isinstance(regression_kind, str):
+        regression_cls = builtin_regression_kinds.get(regression_kind, None)
+        if regression_cls is None:
+            raise ValueError(
+                f"Unknown regression kind '{regression_kind}, known values are {','.join(builtin_regression_kinds.keys())}"
+            )
+    elif regression_kind in regression_kind.mro():
+        regression_cls = regression_kind
+    else:
+        raise ValueError(
+            f"Incorrect regression kind type ({type(regression_kind)}).  "
+            f"A valid string or a subclass of Regression is required."
+        )
+
+    # Run each regression
+    results = []
+    for outcome in outcomes:
+        regression = regression_cls(
+            data=data,
+            outcome_variable=outcome,
+            regression_variables=regression_variables,
+            covariates=covariates,
+            **kwargs,
+        )
+        print(regression)
+
+        # Run and get results
+        regression.run()
+        result = regression.get_results()
+
+        # Process Results
+        click.echo(f"Completed Association Study for {outcome}\n", color="green")
+        results.append(result)
+
+    if len(outcomes) == 1:
+        result = results[0]
+    else:
+        result = pd.concat(results)
+
+    # Sort across multiple outcomes
+    if result.index.names == ["Variable", "Outcome", "Category"]:
+        result = result.sort_values(["pvalue", "Beta_pvalue"])
+    elif result.index.names == ["Variable", "Outcome"]:
+        result = result.sort_values(["pvalue"])
+
+    click.echo("Completed association study", color="green")
+    return result
@@ -3,12 +3,7 @@
 import click
 
 from clarite.modules.analyze import regression
-
-builtin_regression_kinds = {
-    "glm": regression.GLMRegression,
-    "weighted_glm": regression.WeightedGLMRegression,
-    "r_survey": regression.RSurveyRegression,
-}
+from clarite.modules.analyze.regression import builtin_regression_kinds
 
 
 def ewas(
@@ -48,8 +43,11 @@ def ewas(
     Examples
     --------
     >>> ewas_discovery = clarite.analyze.ewas("logBMI", covariates, nhanes_discovery)
-    Running EWAS on a continuous variable
+    Running on a continuous variable
     """
+    raise DeprecationWarning(
+        "This function will be depreciated in favor of clarite.analyze.association_study"
+    )
     # Copy data to avoid modifying the original, in case it is changed
     data = data.copy(deep=True)
 
@@ -80,10 +78,22 @@ def ewas(
             f"A valid string or a subclass of Regression is required."
         )
 
+    # regression variables are anything that isn't an outcome or covariate
+    regression_variables = set(data.columns) - set(
+        [
+            outcome,
+        ]
+        + covariates
+    )
+
     # Initialize the regression and print details
     print(kwargs)
     regression = regression_cls(
-        data=data, outcome_variable=outcome, covariates=covariates, **kwargs
+        data=data,
+        outcome_variable=outcome,
+        covariates=covariates,
+        regression_variables=regression_variables,
+        **kwargs,
     )
     print(regression)