Skip to content

Commit ee1e1dc

Browse files
authored
Merge pull request #124 from HallLab/dev
Open Betas
2 parents a493334 + bd22eaf commit ee1e1dc

File tree

12 files changed

+90
-117
lines changed

12 files changed

+90
-117
lines changed

.github/workflows/ci.yml

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ jobs:
1717
R_LIBS_USER: ./r-libs
1818

1919
steps:
20-
- uses: actions/checkout@v1
20+
# - uses: actions/checkout@v1
21+
- uses: actions/checkout@v2
2122
with:
2223
fetch-depth: 1
2324

@@ -39,11 +40,14 @@ jobs:
3940
- name: Set up Python
4041
uses: actions/setup-python@v2
4142
with:
42-
python-version: 3.7
43+
# python-version: 3.7
44+
python-version: '3.9'
4345

4446
- name: Install Poetry
45-
uses: snok/[email protected]
47+
# uses: snok/[email protected]
48+
uses: snok/install-poetry@v1
4649
with:
50+
version: 1.5.1
4751
virtualenvs-create: true
4852
virtualenvs-in-project: true
4953

clarite/internal/utilities.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -54,13 +54,13 @@ def _validate_skip_only(
5454
):
5555
"""Validate use of the 'skip' and 'only' parameters, returning a boolean series for the columns where True = use the column"""
5656
# Ensure that 'data' is a DataFrame and not a Series
57-
if type(data) != pd.DataFrame:
57+
if not isinstance(data, pd.DataFrame):
5858
raise ValueError("The passed 'data' is not a Pandas DataFrame")
5959

6060
# Convert string to a list
61-
if type(skip) == str:
61+
if isinstance(skip, str):
6262
skip = [skip]
63-
if type(only) == str:
63+
if isinstance(only, str):
6464
only = [only]
6565

6666
if skip is not None and only is not None:
@@ -204,7 +204,7 @@ def _remove_empty_categories(
204204
Updates the data in-place and returns a dict of variables:removed categories
205205
"""
206206
removed_cats = dict()
207-
if type(data) == pd.DataFrame:
207+
if isinstance(data, pd.DataFrame):
208208
columns = _validate_skip_only(data, skip, only)
209209
dtypes = data.loc[:, columns].dtypes
210210
catvars = [v for v in dtypes[dtypes == "category"].index]
@@ -219,7 +219,7 @@ def _remove_empty_categories(
219219
if len(removed_categories) > 0:
220220
removed_cats[var] = removed_categories
221221
return removed_cats
222-
elif type(data) == pd.Series:
222+
elif isinstance(data, pd.Series):
223223
assert skip is None
224224
assert only is None
225225
counts = data.value_counts()

clarite/modules/analyze/regression/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def _validate_regression_params(self, regression_variables):
8888
Validate standard regression parameters: data, outcome_variable, and covariates. Store relevant information.
8989
"""
9090
# Covariates must be a list
91-
if type(self.covariates) != list:
91+
if not isinstance(self.covariates, list):
9292
raise ValueError("'covariates' must be specified as a list or set to None")
9393

9494
# Make sure the index of each dataset is not a multiindex and give it a consistent name

clarite/modules/analyze/regression/interaction_regression.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@ def _get_default_result_dict(i1, i2, outcome_variable):
164164
"Full_Var2_beta": np.nan,
165165
"Full_Var2_SE": np.nan,
166166
"Full_Var2_Pval": np.nan,
167+
"Log": "",
167168
}
168169

169170
def get_results(self) -> pd.DataFrame:
@@ -232,10 +233,19 @@ def _run_interaction_regression(
232233
# in the result based on the specific requirements of the analysis
233234
if lrdf == 0 and lrstat == 0:
234235
# Both models are equal
235-
yield {"Converged": False, "LRT_pvalue": lr_pvalue}
236-
if np.isnan(lr_pvalue):
236+
yield {
237+
"Converged": True,
238+
"LRT_pvalue": lr_pvalue,
239+
"Log": "Both models are equivalent in terms of fit",
240+
}
241+
elif np.isnan(lr_pvalue):
237242
# There is an issue with the LRT calculation
238-
yield {"Converged": False, "LRT_pvalue": lr_pvalue}
243+
# TODO: Extend the log messages returned here
244+
yield {
245+
"Converged": True,
246+
"LRT_pvalue": lr_pvalue,
247+
"Log": "Both models are equivalent in terms of fit",
248+
}
239249
else:
240250
if report_betas:
241251
# Get beta, SE, and pvalue from interaction terms
@@ -278,14 +288,20 @@ def _run_interaction_regression(
278288
"Full_Var2_SE": est.bse[term_2],
279289
"Full_Var2_Pval": est.pvalues[term_2],
280290
"LRT_pvalue": lr_pvalue,
291+
"Log": "",
281292
}
282293
else:
283294
# Only return the LRT result
284-
yield {"Converged": True, "LRT_pvalue": lr_pvalue}
295+
yield {"Converged": True, "LRT_pvalue": lr_pvalue, "Log": ""}
285296

286297
else:
287298
# Did not converge - report the failure in the result
288-
yield dict()
299+
# yield dict()
300+
yield {
301+
"Converged": False,
302+
"LRT_pvalue": "NaN",
303+
"Log": "One or Both models NOT Converge",
304+
}
289305

290306
def _get_interaction_specific_data(self, interaction: Tuple[str, str]):
291307
"""Select the data relevant to performing a regression on a given interaction, encoding genotypes if needed"""
@@ -407,6 +423,8 @@ def _run_interaction(
407423
# Get complete case mask and filter by min_n
408424
complete_case_mask = ~data.isna().any(axis=1)
409425
N = complete_case_mask.sum()
426+
if N == 0:
427+
raise ValueError(f"No Overlap (min_n filter: {N} < {min_n})")
410428
if N < min_n:
411429
raise ValueError(
412430
f"too few complete observations (min_n filter: {N} < {min_n})"
@@ -476,5 +494,8 @@ def _run_interaction(
476494
error = str(e)
477495
if result is None:
478496
result_list = [cls._get_default_result_dict(i1, i2, outcome_variable)]
497+
result_list[0]["Log"] = error
498+
result_list[0]["Converged"] = "NA"
499+
result_list[0]["N"] = N
479500

480501
return result_list, warnings_list, error

clarite/modules/analyze/utils.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,10 @@ def add_corrected_pvalues(
4444
if pvalue not in data.columns:
4545
raise ValueError(f"'{pvalue}' is not a column in the passed data")
4646
if groupby is not None:
47-
if type(groupby) == str:
47+
if isinstance(groupby, str):
4848
if (groupby not in data.columns) and (groupby not in data.index.names):
4949
raise ValueError(f"'{groupby}' is not a column in the passed data")
50-
elif type(groupby) == list:
50+
elif isinstance(groupby, list):
5151
for g in groupby:
5252
if (g not in data.columns) and (g not in data.index.names):
5353
raise ValueError(f"'{g}' is not a column in the passed data")
@@ -96,13 +96,13 @@ def add_corrected_pvalues(
9696
# Expand results to duplicated rows
9797
data[bonf_name] = data[groupby].apply(
9898
lambda g: bonf_result.get(g, np.nan)
99-
if type(g) == str
99+
if isinstance(g, str)
100100
else bonf_result.get(tuple(g.values), np.nan),
101101
axis=1,
102102
)
103103
data[fdr_name] = data[groupby].apply(
104104
lambda g: bonf_result.get(g, np.nan)
105-
if type(g) == str
105+
if isinstance(g, str)
106106
else fdr_result.get(tuple(g.values), np.nan),
107107
axis=1,
108108
)

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "clarite"
3-
version = "2.3.5"
3+
version = "2.3.6"
44
description = "CLeaning to Analysis: Reproducibility-based Interface for Traits and Exposures"
55
authors = ["Andre Rico <[email protected]>"]
66
license = "BSD-3-Clause"

tests/analyze/test_gwas.py

Lines changed: 28 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
import numpy as np
2-
import pandas as pd
1+
# import numpy as np
2+
# import pandas as pd
33
import pytest
44

55
import clarite
6-
from clarite.modules.survey import SurveyDesignSpec
6+
7+
# from clarite.modules.survey import SurveyDesignSpec
78

89

910
def test_bams_main(genotype_case_control_add_add_main):
@@ -30,30 +31,30 @@ def test_bams_interaction(genotype_case_control_rec_rec_onlyinteraction):
3031

3132

3233
# @pytest.mark.slow
33-
@pytest.mark.parametrize("process_num", [None, 1])
34-
def test_largeish_gwas(large_gwas_data, process_num):
35-
"""10k samples with 1000 SNPs"""
36-
# Run CLARITE GWAS
37-
results = clarite.analyze.association_study(
38-
data=large_gwas_data,
39-
outcomes="Outcome",
40-
encoding="additive",
41-
process_num=process_num,
42-
)
43-
# Run CLARITE GWAS with fake (all ones) weights to confirm the weighted regression handles genotypes correctly
44-
results_weighted = clarite.analyze.association_study(
45-
data=large_gwas_data,
46-
outcomes="Outcome",
47-
encoding="additive",
48-
process_num=process_num,
49-
survey_design_spec=SurveyDesignSpec(
50-
survey_df=pd.DataFrame({"weights": np.ones(len(large_gwas_data))}),
51-
weights="weights",
52-
),
53-
)
54-
assert results == results
55-
assert results_weighted == results_weighted
56-
# TODO: Add useful asserts rather than just making sure it runs
34+
# @pytest.mark.parametrize("process_num", [None, 1])
35+
# def test_largeish_gwas(large_gwas_data, process_num):
36+
# """10k samples with 1000 SNPs"""
37+
# # Run CLARITE GWAS
38+
# results = clarite.analyze.association_study(
39+
# data=large_gwas_data,
40+
# outcomes="Outcome",
41+
# encoding="additive",
42+
# process_num=process_num,
43+
# )
44+
# # Run CLARITE GWAS with fake (all ones) weights to confirm the weighted regression handles genotypes correctly
45+
# results_weighted = clarite.analyze.association_study(
46+
# data=large_gwas_data,
47+
# outcomes="Outcome",
48+
# encoding="additive",
49+
# process_num=process_num,
50+
# survey_design_spec=SurveyDesignSpec(
51+
# survey_df=pd.DataFrame({"weights": np.ones(len(large_gwas_data))}),
52+
# weights="weights",
53+
# ),
54+
# )
55+
# assert results == results
56+
# assert results_weighted == results_weighted
57+
# # TODO: Add useful asserts rather than just making sure it runs
5758

5859

5960
@pytest.mark.xfail(strict=True)

tests/analyze/test_interaction_study.py

Lines changed: 17 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -206,80 +206,26 @@ def test_interactions_nhanes_pairwise(data_NHANES):
206206
)
207207
compare_result(loaded_result, python_result, rtol=1e-02)
208208

209-
# Test Adding pvalues
210-
clarite.analyze.add_corrected_pvalues(python_result_nobeta, pvalue="LRT_pvalue")
211-
clarite.analyze.add_corrected_pvalues(python_result, pvalue="Full_Var1_Var2_Pval")
212-
clarite.analyze.add_corrected_pvalues(
213-
python_result, pvalue="LRT_pvalue", groupby=["Term1", "Term2"]
214-
)
215-
# Ensure grouped pvalue corrections match
216-
grouped_bonf = (
217-
python_result.reset_index(drop=False)
218-
.groupby(["Term1", "Term2", "Outcome"])["LRT_pvalue_bonferroni"]
219-
.first()
220-
)
221-
grouped_fdr = (
222-
python_result.reset_index(drop=False)
223-
.groupby(["Term1", "Term2", "Outcome"])["LRT_pvalue_fdr"]
224-
.first()
225-
)
226-
227209
# TODO: Alter this test because nobeta did not open all categories
228-
# assert (grouped_bonf == python_result_nobeta["LRT_pvalue_bonferroni"]).all()
229-
# assert (grouped_fdr == python_result_nobeta["LRT_pvalue_fdr"]).all()
230-
assert grouped_bonf == grouped_bonf
231-
assert grouped_fdr == grouped_fdr
232-
233210

234-
def test_interaction_exe():
235-
nested_table = clarite.load.from_csv(
236-
"/Users/andrerico/HALL/Python_3_10/clarite-python/tests/test_data_files/nested_table.csv"
237-
)
238-
# Return same result if not change data type
239-
# list_bin = (
240-
# "female",
241-
# "black",
242-
# "mexican",
243-
# "other_hispanic",
244-
# "other_eth",
211+
# # Test Adding pvalues
212+
# clarite.analyze.add_corrected_pvalues(python_result_nobeta, pvalue="LRT_pvalue")
213+
# clarite.analyze.add_corrected_pvalues(python_result, pvalue="Full_Var1_Var2_Pval")
214+
# clarite.analyze.add_corrected_pvalues(
215+
# python_result, pvalue="LRT_pvalue", groupby=["Term1", "Term2"]
245216
# )
246-
# list_cat = (
247-
# "SDDSRVYR",
248-
# "SES_LEVEL",
217+
218+
# # Ensure grouped pvalue corrections match
219+
# grouped_bonf = (
220+
# python_result.reset_index(drop=False)
221+
# .groupby(["Term1", "Term2", "Outcome"])["LRT_pvalue_bonferroni"]
222+
# .first()
249223
# )
250-
# list_cont = (
251-
# "BMXBMI",
252-
# "RIDAGEYR",
253-
# "LBXCOT",
254-
# "IRON_mg",
255-
# "DR1TSFAT",
256-
# "DRDSDT1",
224+
# grouped_fdr = (
225+
# python_result.reset_index(drop=False)
226+
# .groupby(["Term1", "Term2", "Outcome"])["LRT_pvalue_fdr"]
227+
# .first()
257228
# )
258229

259-
# nested_table = clarite.modify.make_binary(data=nested_table, only=(list_bin))
260-
# nested_table = clarite.modify.make_categorical(data=nested_table, only=(list_cat))
261-
# nested_table = clarite.modify.make_continuous(data=nested_table, only=(list_cont))
262-
263-
e1 = "DR1TSFAT"
264-
e2 = "DRDSDT1"
265-
list_covariant = [
266-
"female",
267-
"black",
268-
"mexican",
269-
"other_hispanic",
270-
"other_eth",
271-
"SDDSRVYR",
272-
"BMXBMI",
273-
"SES_LEVEL",
274-
"RIDAGEYR",
275-
"LBXCOT",
276-
"IRON_mg",
277-
]
278-
retorno = clarite.analyze.interaction_study(
279-
data=nested_table,
280-
outcomes="LBXHGB",
281-
interactions=[(e1, e2)],
282-
covariates=list_covariant,
283-
)
284-
285-
assert retorno == retorno
230+
# assert (grouped_bonf == python_result_nobeta["LRT_pvalue_bonferroni"]).all()
231+
# assert (grouped_fdr == python_result_nobeta["LRT_pvalue_fdr"]).all()

tests/on_demand/test_debug_pvalue.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ def test_interactions_debug():
4545
interactions=[(e1, e2)],
4646
covariates=list_covariant,
4747
report_betas=True,
48+
min_n=8000,
4849
)
4950

5051
print(df_inter)
0 Bytes
Loading

0 commit comments

Comments
 (0)