Skip to content

Commit a493334

Browse files
committed
Version 2.3.5
1 parent 817ccad commit a493334

28 files changed

+18617
-219
lines changed

clarite/cli/commands/analyze.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -319,8 +319,6 @@ def get_significant(ewas_result, output, use_fdr, pvalue):
319319
else:
320320
col = "pvalue_bonferroni"
321321
_, data = ewas_result
322-
data = data.loc[
323-
data[col] <= pvalue,
324-
]
322+
data = data.loc[data[col] <= pvalue,]
325323
# Save result
326324
save_clarite_ewas(data, output)

clarite/cli/commands/describe.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,7 @@ def freq_table(data, output):
3838
# Save results
3939
results.to_csv(output, sep="\t", index=False)
4040
# Log
41-
processed = results.loc[
42-
results["value"] != "<Non-Categorical Values>",
43-
]
41+
processed = results.loc[results["value"] != "<Non-Categorical Values>",]
4442
if len(processed) > 0:
4543
num_values = processed[["Variable", "value"]].nunique()
4644
num_variables = processed["Variable"].nunique()

clarite/cli/commands/modify.py

Lines changed: 5 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -242,25 +242,15 @@ def rowfilter(data, output, column, vs, vi, vf, comparison):
242242
value = values[0]
243243
# Filter
244244
if comparison == "lt":
245-
data.df = data.df.loc[
246-
data.df[column] < value,
247-
]
245+
data.df = data.df.loc[data.df[column] < value,]
248246
elif comparison == "lte":
249-
data.df = data.df.loc[
250-
data.df[column] <= value,
251-
]
247+
data.df = data.df.loc[data.df[column] <= value,]
252248
elif comparison == "eq":
253-
data.df = data.df.loc[
254-
data.df[column] == value,
255-
]
249+
data.df = data.df.loc[data.df[column] == value,]
256250
elif comparison == "gt":
257-
data.df = data.df.loc[
258-
data.df[column] >= value,
259-
]
251+
data.df = data.df.loc[data.df[column] >= value,]
260252
elif comparison == "gte":
261-
data.df = data.df.loc[
262-
data.df[column] > value,
263-
]
253+
data.df = data.df.loc[data.df[column] > value,]
264254
# Save
265255
save_clarite_data(data, output)
266256

clarite/cli/commands/plot.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ def manhattan(ewas_result, output, categories, bonferroni, fdr, other, nlabeled,
116116
# Load data
117117
name, data = ewas_result
118118
data_dict = {name: data}
119-
for (name, data) in other:
119+
for name, data in other:
120120
data_dict[name] = data
121121
# Load categories, if any
122122
if categories is not None:
@@ -187,7 +187,7 @@ def manhattan_bonferroni(
187187
# Load data
188188
name, data = ewas_result
189189
data_dict = {name: data}
190-
for (name, data) in other:
190+
for name, data in other:
191191
data_dict[name] = data
192192
# Load categories, if any
193193
if categories is not None:
@@ -255,7 +255,7 @@ def manhattan_fdr(ewas_result, output, categories, cutoff, other, nlabeled, labe
255255
# Load data
256256
name, data = ewas_result
257257
data_dict = {name: data}
258-
for (name, data) in other:
258+
for name, data in other:
259259
data_dict[name] = data
260260
# Load categories, if any
261261
if categories is not None:

clarite/internal/utilities.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ def wrapped(*args, **kwargs):
2525

2626
def requires(package_name):
2727
"""Decorator factory to ensure optional packages are imported before running"""
28+
2829
# Define and return an appropriate decorator
2930
def decorator(func):
3031
# Check if package is importable

clarite/modules/analyze/interaction_study.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from typing import List, Optional, Tuple, Union
22

3-
import pandas as pd
43
import click
4+
import pandas as pd
55
from pandas_genomics import GenotypeDtype
66

77
from .regression import InteractionRegression
@@ -45,9 +45,9 @@ def interaction_study(
4545
If edge encoding is used, this must be provided. See Pandas-Genomics documentation on edge encoding.
4646
report_betas: boolean
4747
False by default.
48-
If True, the results will contain one row for each interaction term and will include the beta value,
49-
standard error (SE), and beta pvalue for that specific interaction. The number of terms increases with
50-
the number of categories in each interacting variable.
48+
If True, the results will contain one row for each interaction term and will include the beta value,
49+
standard error (SE), and beta pvalue for that specific interaction. The number of terms increases with
50+
the number of categories in each interacting variable.
5151
min_n: int or None
5252
Minimum number of complete-case observations (no NA values for outcome, covariates, or variable)
5353
Defaults to 200
@@ -119,7 +119,7 @@ def interaction_study(
119119
result = regression.get_results()
120120

121121
# Process Results
122-
click.echo(f"Completed Interaction Study for {outcome}\n", color="green")
122+
click.echo(f"Completed Interaction Study for {outcome}\n", color=True)
123123
results.append(result)
124124

125125
if len(outcomes) == 1:
@@ -128,7 +128,10 @@ def interaction_study(
128128
result = pd.concat(results)
129129

130130
# Sort across multiple outcomes
131-
result = result.sort_values(["LRT_pvalue", "Beta_pvalue"])
131+
if report_betas:
132+
result = result.sort_values(["LRT_pvalue", "Full_Var1_Var2_Pval"])
133+
else:
134+
result = result.sort_values(["LRT_pvalue"])
132135

133-
click.echo("Completed association study", color="green")
136+
click.echo("Completed association study", color=True)
134137
return result

clarite/modules/analyze/regression/glm_regression.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import multiprocessing
22
import re
33
from itertools import repeat
4-
from typing import Dict, List, Optional, Tuple
4+
from typing import Dict, Generator, List, Optional, Tuple
55

66
import click
77
import numpy as np
@@ -308,8 +308,14 @@ def _run_binary(data, regression_variable, formula, family, use_t) -> Dict:
308308

309309
@staticmethod
310310
def _run_categorical(
311-
data, formula, formula_restricted, family, use_t, report_categorical_betas
312-
) -> Dict:
311+
data,
312+
formula,
313+
formula_restricted,
314+
family,
315+
use_t,
316+
report_categorical_betas
317+
# ) -> Dict:
318+
) -> Generator[dict, None, None]:
313319
# Regress both models
314320
y, X = patsy.dmatrices(formula, data, return_type="dataframe", NA_action="drop")
315321
y = fix_names(y)
@@ -555,4 +561,4 @@ def _run_rv(
555561
if result is None:
556562
result_list = [cls.get_default_result_dict(rv)]
557563

558-
return result_list, warnings_list, error
564+
return result_list, warnings_list, error # type: ignore

clarite/modules/analyze/regression/interaction_regression.py

Lines changed: 105 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import multiprocessing
22
from itertools import combinations, repeat
3-
from typing import Dict, List, Optional, Tuple
3+
from typing import Dict, Generator, List, Optional, Tuple
44

55
import click
66
import numpy as np
@@ -16,7 +16,7 @@
1616
from . import GLMRegression
1717

1818
# GITHUB ISSUE #119: Regressions with Error after Multiprocessing release python > 3.8
19-
multiprocessing.get_start_method("fork")
19+
# multiprocessing.get_start_method("fork")
2020

2121

2222
class InteractionRegression(GLMRegression):
@@ -48,8 +48,8 @@ class InteractionRegression(GLMRegression):
4848
List of tuples: Test specific interactions of valid variables
4949
report_betas: boolean
5050
False by default.
51-
If True, the results will contain one row for each interaction term and will include the beta value
52-
for that term. The number of terms increases with the number of categories in each interacting term.
51+
If True, the results will contain one row for each interaction term and will include the beta value
52+
for that term. The number of terms increases with the number of categories in each interacting term.
5353
encoding: str, default "additive"
5454
Encoding method to use for any genotype data. One of {'additive', 'dominant', 'recessive', 'codominant', or 'weighted'}
5555
edge_encoding_info: Optional pd.DataFrame, default None
@@ -109,7 +109,7 @@ def _process_interactions(self, interactions):
109109
)
110110
if interactions is None:
111111
self.interactions = [c for c in combinations(regression_var_list, r=2)]
112-
elif type(interactions) == str:
112+
elif type(interactions) is str:
113113
if interactions not in regression_var_list:
114114
raise ValueError(
115115
f"'{interactions}' was passed as the value for 'interactions' "
@@ -140,16 +140,30 @@ def _process_interactions(self, interactions):
140140
self.description += f"\nProcessing {len(self.interactions):,} interactions"
141141

142142
@staticmethod
143-
def _get_default_result_dict(i1, i2):
143+
def _get_default_result_dict(i1, i2, outcome_variable):
144144
return {
145+
"Outcome": outcome_variable,
145146
"Term1": i1,
146147
"Term2": i2,
148+
"Parameter": str(i1 + ":" + i2),
147149
"Converged": False,
148150
"N": np.nan,
149-
"Beta": np.nan,
150-
"SE": np.nan,
151-
"Beta_pvalue": np.nan,
152151
"LRT_pvalue": np.nan,
152+
"Red_Var1_beta": np.nan,
153+
"Red_Var1_SE": np.nan,
154+
"Red_Var1_Pval": np.nan,
155+
"Red_Var2_beta": np.nan,
156+
"Red_Var2_SE": np.nan,
157+
"Red_Var2_Pval": np.nan,
158+
"Full_Var1_Var2_beta": np.nan,
159+
"Full_Var1_Var2_SE": np.nan,
160+
"Full_Var1_Var2_Pval": np.nan,
161+
"Full_Var1_beta": np.nan,
162+
"Full_Var1_SE": np.nan,
163+
"Full_Var1_Pval": np.nan,
164+
"Full_Var2_beta": np.nan,
165+
"Full_Var2_SE": np.nan,
166+
"Full_Var2_Pval": np.nan,
153167
}
154168

155169
def get_results(self) -> pd.DataFrame:
@@ -169,17 +183,18 @@ def get_results(self) -> pd.DataFrame:
169183
result["Outcome"] = self.outcome_variable
170184
if self.report_betas:
171185
return result.set_index(
172-
["Term1", "Term2", "Outcome", "Parameter"]
173-
).sort_values(["LRT_pvalue", "Beta_pvalue"])
186+
# ["Term1", "Term2", "Outcome", "Parameter"]
187+
["Term1", "Term2", "Outcome"]
188+
).sort_values(["LRT_pvalue", "Full_Var1_Var2_Pval"])
174189
else:
175190
return result.set_index(["Term1", "Term2", "Outcome"]).sort_values(
176191
["LRT_pvalue"]
177192
)
178193

179194
@staticmethod
180195
def _run_interaction_regression(
181-
data, formula, formula_restricted, family, use_t, report_betas
182-
) -> Dict:
196+
data, formula, formula_restricted, family, use_t, report_betas, i1, i2
197+
) -> Generator[Dict, None, None]:
183198
# Regress Full Model
184199
y, X = patsy.dmatrices(formula, data, return_type="dataframe", NA_action="drop")
185200
y = fix_names(y)
@@ -201,25 +216,73 @@ def _run_interaction_regression(
201216
lrdf = est_restricted.df_resid - est.df_resid
202217
lrstat = -2 * (est_restricted.llf - est.llf)
203218
lr_pvalue = scipy.stats.chi2.sf(lrstat, lrdf)
204-
if report_betas:
205-
# Get beta, SE, and pvalue from interaction terms
206-
# Where interaction terms are those appearing in the full model and not in the reduced model
207-
# Return all terms
208-
param_names = set(est.bse.index) - set(est_restricted.bse.index)
209-
# The restricted model shouldn't have extra terms, unless there is some case we have overlooked
210-
assert len(set(est_restricted.bse.index) - set(est.bse.index)) == 0
211-
for param_name in param_names:
212-
yield {
213-
"Converged": True,
214-
"Parameter": param_name,
215-
"Beta": est.params[param_name],
216-
"SE": est.bse[param_name],
217-
"Beta_pvalue": est.pvalues[param_name],
218-
"LRT_pvalue": lr_pvalue,
219-
}
219+
# GITHUB/ISSUES 121: Handling LRT_Pvalue when lrstat and lrdf are
220+
# both 0. When lrstat (the test statistic) and lrdf (degrees of
221+
# freedom for the Likelihood Ratio Test) are both 0, it typically
222+
# suggests that both models are equivalent in terms of fit. In
223+
# other words, there is no significant difference between the two
224+
# models.
225+
#
226+
# However, when both lrstat and lrdf are 0, calculating the survival
227+
# function (sf) of a chi-squared distribution with 0 degrees of
228+
# freedom results in NaN. This is because mathematically, it's
229+
# undefined to perform this calculation under these circumstances.
230+
#
231+
# In such cases, it's important to handle this scenario separately
232+
# in the result based on the specific requirements of the analysis
233+
if lrdf == 0 and lrstat == 0:
234+
# Both models are equal
235+
yield {"Converged": False, "LRT_pvalue": lr_pvalue}
236+
if np.isnan(lr_pvalue):
237+
# There is an issue with the LRT calculation
238+
yield {"Converged": False, "LRT_pvalue": lr_pvalue}
220239
else:
221-
# Only return the LRT result
222-
yield {"Converged": True, "LRT_pvalue": lr_pvalue}
240+
if report_betas:
241+
# Get beta, SE, and pvalue from interaction terms
242+
# Where interaction terms are those appearing in the full
243+
# model and not in the reduced model. Return all terms.
244+
param_names = set(est.bse.index) - set(est_restricted.bse.index)
245+
# The restricted model shouldn't have extra terms, unless
246+
# there is some case we have overlooked.
247+
assert len(set(est_restricted.bse.index) - set(est.bse.index)) == 0
248+
# GITHUB/ISSUES 122: Open to show Terms Betas Values
249+
for param_name in param_names:
250+
# Names defined to align with PLATO
251+
# Split the parameter name on ":" into its two terms
252+
term_1, term_2 = param_name.split(":")
253+
yield {
254+
"Term1": term_1,
255+
"Term2": term_2,
256+
"Converged": True,
257+
"Parameter": param_name,
258+
# Betas in Reduced Model
259+
# Var1 --> Term 1
260+
"Red_Var1_beta": est_restricted.params[term_1],
261+
"Red_Var1_SE": est_restricted.bse[term_1],
262+
"Red_Var1_Pval": est_restricted.pvalues[term_1],
263+
# Var2 --> Term 2
264+
"Red_Var2_beta": est_restricted.params[term_2],
265+
"Red_Var2_SE": est_restricted.bse[term_2],
266+
"Red_Var2_Pval": est_restricted.pvalues[term_2],
267+
# Betas in Full Model
268+
# Var1:Var2 --> Interaction term
269+
"Full_Var1_Var2_beta": est.params[param_name],
270+
"Full_Var1_Var2_SE": est.bse[param_name],
271+
"Full_Var1_Var2_Pval": est.pvalues[param_name],
272+
# Var1 --> Term 1
273+
"Full_Var1_beta": est.params[term_1],
274+
"Full_Var1_SE": est.bse[term_1],
275+
"Full_Var1_Pval": est.pvalues[term_1],
276+
# Var2 --> Term 2
277+
"Full_Var2_beta": est.params[term_2],
278+
"Full_Var2_SE": est.bse[term_2],
279+
"Full_Var2_Pval": est.pvalues[term_2],
280+
"LRT_pvalue": lr_pvalue,
281+
}
282+
else:
283+
# Only return the LRT result
284+
yield {"Converged": True, "LRT_pvalue": lr_pvalue}
285+
223286
else:
224287
# Did not converge - nothing to update
225288
yield dict()
@@ -394,16 +457,24 @@ def _run_interaction(
394457

395458
# Run Regression LRT Test
396459
for regression_result in cls._run_interaction_regression(
397-
data, formula, formula_restricted, family, use_t, report_betas
460+
data,
461+
formula,
462+
formula_restricted,
463+
family,
464+
use_t,
465+
report_betas,
466+
i1,
467+
i2,
398468
):
399-
result = cls._get_default_result_dict(i1, i2)
469+
result = cls._get_default_result_dict(i1, i2, outcome_variable)
400470
result["N"] = N
471+
# TODO:
401472
result.update(regression_result)
402473
result_list.append(result)
403474

404475
except Exception as e:
405476
error = str(e)
406477
if result is None:
407-
result_list = [cls._get_default_result_dict(i1, i2)]
478+
result_list = [cls._get_default_result_dict(i1, i2, outcome_variable)]
408479

409480
return result_list, warnings_list, error

clarite/modules/analyze/regression/weighted_glm_regression.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,6 @@ def _run_weighted_rv(
368368
use_t: bool,
369369
report_categorical_betas: bool,
370370
) -> Tuple[List[dict], List[str], str]: # results, warnings, errors
371-
372371
# Initialize return values
373372
result_list = []
374373
warnings_list = []

clarite/modules/describe.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,7 @@ def correlations(data: pd.DataFrame, threshold: float = 0.75):
6666
.reset_index()
6767
)
6868
# Remove those with correlation below threshold
69-
correlation = correlation.loc[
70-
correlation["correlation"].abs() >= threshold,
71-
]
69+
correlation = correlation.loc[correlation["correlation"].abs() >= threshold,]
7270
# Sort by absolute value
7371
correlation = correlation.reindex(
7472
correlation["correlation"].abs().sort_values(ascending=False).index

0 commit comments

Comments
 (0)