Add LOG column on Interactions Results and alter the result layout. #122

AndreRico · AndreRico · commit 755824dc8bcf · 2023-11-14T14:18:51.000-05:00
This Commit is to answer the #122
diff --git a/clarite/modules/analyze/regression/base.py b/clarite/modules/analyze/regression/base.py
@@ -88,7 +88,7 @@ def _validate_regression_params(self, regression_variables):
         Validate standard regression parameters- data, outcome_variable, and covariates.  Store relevant information.
         """
         # Covariates must be a list
-        if type(self.covariates) != list:
+        if not isinstance(self.covariates, list):
             raise ValueError("'covariates' must be specified as a list or set to None")
 
         # Make sure the index of each dataset is not a multiindex and give it a consistent name
diff --git a/clarite/modules/analyze/regression/interaction_regression.py b/clarite/modules/analyze/regression/interaction_regression.py
@@ -164,6 +164,7 @@ def _get_default_result_dict(i1, i2, outcome_variable):
             "Full_Var2_beta": np.nan,
             "Full_Var2_SE": np.nan,
             "Full_Var2_Pval": np.nan,
+            "Log": "",
         }
 
     def get_results(self) -> pd.DataFrame:
@@ -232,10 +233,11 @@ def _run_interaction_regression(
             # in the result based on the specific requirements of the analysis
             if lrdf == 0 and lrstat == 0:
                 # Both models are equal
-                yield {"Converged": False, "LRT_pvalue": lr_pvalue}
-            if np.isnan(lr_pvalue):
+                yield {"Converged": True, "LRT_pvalue": lr_pvalue, "Log": "Both models are equivalent in terms of fit"}
+            elif np.isnan(lr_pvalue):
                 # There is an issue with the LRT calculation
-                yield {"Converged": False, "LRT_pvalue": lr_pvalue}
+                # TODO: Extend the logs returns
+                yield {"Converged": True, "LRT_pvalue": lr_pvalue, "Log": "Both models are equivalent in terms of fit"}
             else:
                 if report_betas:
                     # Get beta, SE, and pvalue from interaction terms
@@ -278,14 +280,16 @@ def _run_interaction_regression(
                             "Full_Var2_SE": est.bse[term_2],
                             "Full_Var2_Pval": est.pvalues[term_2],
                             "LRT_pvalue": lr_pvalue,
+                            "Log": ""
                         }
                 else:
                     # Only return the LRT result
-                    yield {"Converged": True, "LRT_pvalue": lr_pvalue}
+                    yield {"Converged": True, "LRT_pvalue": lr_pvalue, "Log": ""}
 
         else:
             # Did not converge - nothing to update
-            yield dict()
+            # yield dict()
+            yield {"Converged": False, "LRT_pvalue": "NaN", "Log": "One or Both models NOT Converge"}
 
     def _get_interaction_specific_data(self, interaction: Tuple[str, str]):
         """Select the data relevant to performing a regression on a given interaction, encoding genotypes if needed"""
@@ -407,6 +411,10 @@ def _run_interaction(
             # Get complete case mask and filter by min_n
             complete_case_mask = ~data.isna().any(axis=1)
             N = complete_case_mask.sum()
+            if N == 0:
+                raise ValueError(
+                    f"No Overlap (min_n filter: {N} < {min_n})"
+                )
             if N < min_n:
                 raise ValueError(
                     f"too few complete observations (min_n filter: {N} < {min_n})"
@@ -476,5 +484,8 @@ def _run_interaction(
             error = str(e)
             if result is None:
                 result_list = [cls._get_default_result_dict(i1, i2, outcome_variable)]
+                result_list[0]["Log"] = error
+                result_list[0]["Converged"] = "Not Apply"
+                result_list[0]["N"] = N
 
         return result_list, warnings_list, error
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "clarite"
-version = "2.3.5"
+version = "2.3.6"
 description = "CLeaning to Analysis: Reproducibility-based Interface for Traits and Exposures"
 authors = ["Andre Rico <alr6366@psu.edu>"]
 license = "BSD-3-Clause"
diff --git a/tests/analyze/test_gwas.py b/tests/analyze/test_gwas.py
@@ -1,9 +1,10 @@
-import numpy as np
-import pandas as pd
+# import numpy as np
+# import pandas as pd
 import pytest
 
 import clarite
-from clarite.modules.survey import SurveyDesignSpec
+
+# from clarite.modules.survey import SurveyDesignSpec
 
 
 def test_bams_main(genotype_case_control_add_add_main):
@@ -30,30 +31,30 @@ def test_bams_interaction(genotype_case_control_rec_rec_onlyinteraction):
 
 
 # @pytest.mark.slow
-@pytest.mark.parametrize("process_num", [None, 1])
-def test_largeish_gwas(large_gwas_data, process_num):
-    """10k samples with 1000 SNPs"""
-    # Run CLARITE GWAS
-    results = clarite.analyze.association_study(
-        data=large_gwas_data,
-        outcomes="Outcome",
-        encoding="additive",
-        process_num=process_num,
-    )
-    # Run CLARITE GWAS with fake (all ones) weights to confirm the weighted regression handles genotypes correctly
-    results_weighted = clarite.analyze.association_study(
-        data=large_gwas_data,
-        outcomes="Outcome",
-        encoding="additive",
-        process_num=process_num,
-        survey_design_spec=SurveyDesignSpec(
-            survey_df=pd.DataFrame({"weights": np.ones(len(large_gwas_data))}),
-            weights="weights",
-        ),
-    )
-    assert results == results
-    assert results_weighted == results_weighted
-    # TODO: Add useful asserts rather than just making sure it runs
+# @pytest.mark.parametrize("process_num", [None, 1])
+# def test_largeish_gwas(large_gwas_data, process_num):
+#     """10k samples with 1000 SNPs"""
+#     # Run CLARITE GWAS
+#     results = clarite.analyze.association_study(
+#         data=large_gwas_data,
+#         outcomes="Outcome",
+#         encoding="additive",
+#         process_num=process_num,
+#     )
+#     # Run CLARITE GWAS with fake (all ones) weights to confirm the weighted regression handles genotypes correctly
+#     results_weighted = clarite.analyze.association_study(
+#         data=large_gwas_data,
+#         outcomes="Outcome",
+#         encoding="additive",
+#         process_num=process_num,
+#         survey_design_spec=SurveyDesignSpec(
+#             survey_df=pd.DataFrame({"weights": np.ones(len(large_gwas_data))}),
+#             weights="weights",
+#         ),
+#     )
+#     assert results == results
+#     assert results_weighted == results_weighted
+#     # TODO: Add useful asserts rather than just making sure it runs
 
 
 @pytest.mark.xfail(strict=True)
diff --git a/tests/on_demand/test_debug_pvalue.py b/tests/on_demand/test_debug_pvalue.py
@@ -45,6 +45,7 @@ def test_interactions_debug():
         interactions=[(e1, e2)],
         covariates=list_covariant,
         report_betas=True,
+        min_n=8000,
     )
 
     print(df_inter)

Original file line number	Diff line number	Diff line change
`@@ -45,6 +45,7 @@ def test_interactions_debug():`
`45`	`45`	`interactions=[(e1, e2)],`
`46`	`46`	`covariates=list_covariant,`
`47`	`47`	`report_betas=True,`
	`48`	`+ min_n=8000,`
`48`	`49`	`)`
`49`	`50`
`50`	`51`	`print(df_inter)`