Skip to content

Commit 42a4c50

Browse files
committed
2 parents 782c031 + 758abab commit 42a4c50

File tree

6 files changed

+345
-91
lines changed

6 files changed

+345
-91
lines changed

clarite/modules/analyze/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
"N",
2929
"Beta",
3030
"SE",
31-
"Variable_pvalue",
31+
"Beta_pvalue",
3232
"LRT_pvalue",
3333
"Diff_AIC",
3434
"pvalue",

clarite/modules/analyze/regression/glm_regression.py

Lines changed: 62 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,11 @@ class GLMRegression(Regression):
4545
min_n:
4646
Minimum number of complete-case observations (no NA values for outcome, covariates, or variable)
4747
Defaults to 200
48+
report_categorical_betas: boolean
49+
False by default.
50+
If True, the results will contain one row for each categorical value (other than the reference category) and
51+
will include the beta value, standard error (SE), and beta pvalue for that specific category. The number of
52+
terms increases with the number of categories.
4853
"""
4954

5055
def __init__(
@@ -53,6 +58,7 @@ def __init__(
5358
outcome_variable: str,
5459
covariates: Optional[List[str]] = None,
5560
min_n: int = 200,
61+
report_categorical_betas: bool = False,
5662
):
5763
"""
5864
Parameters
@@ -64,6 +70,7 @@ def __init__(
6470
Kwargs
6571
______
6672
min_n - minimum number of observations (after discarding any with NA)
73+
report_categorical_betas - whether or not to report betas for individual categories
6774
"""
6875
# base class init
6976
# This takes in minimal regression params (data, outcome_variable, covariates) and
@@ -74,6 +81,7 @@ def __init__(
7481

7582
# Custom init involving kwargs passed to this regression
7683
self.min_n = min_n
84+
self.report_categorical_betas = report_categorical_betas
7785

7886
# Ensure the data output type is compatible
7987
# Set 'self.family' and 'self.use_t' which are dependent on the outcome dtype
@@ -126,7 +134,7 @@ def get_default_result_dict(rv):
126134
"N": np.nan,
127135
"Beta": np.nan,
128136
"SE": np.nan,
129-
"Variable_pvalue": np.nan,
137+
"Beta_pvalue": np.nan,
130138
"LRT_pvalue": np.nan,
131139
"Diff_AIC": np.nan,
132140
"pvalue": np.nan,
@@ -170,7 +178,12 @@ def get_results(self) -> pd.DataFrame:
170178

171179
# Add "Outcome" and set the index
172180
result["Outcome"] = self.outcome_variable
173-
result = result.set_index(["Variable", "Outcome"])
181+
if self.report_categorical_betas:
182+
result = result.set_index(["Variable", "Outcome", "Category"]).sort_values(
183+
["pvalue", "Beta_pvalue"]
184+
)
185+
else:
186+
result = result.set_index(["Variable", "Outcome"]).sort_values(["pvalue"])
174187

175188
# Order columns
176189
column_order = [
@@ -180,16 +193,13 @@ def get_results(self) -> pd.DataFrame:
180193
"N",
181194
"Beta",
182195
"SE",
183-
"Variable_pvalue",
196+
"Beta_pvalue",
184197
"LRT_pvalue",
185198
"Diff_AIC",
186199
"pvalue",
187200
]
188201
result = result[column_order]
189202

190-
# Sort rows
191-
result = result.sort_values("pvalue")
192-
193203
return result
194204

195205
def _run_continuous(self, data, regression_variable, formula) -> Dict:
@@ -202,8 +212,8 @@ def _run_continuous(self, data, regression_variable, formula) -> Dict:
202212
result["Converged"] = True
203213
result["Beta"] = est.params[regression_variable]
204214
result["SE"] = est.bse[regression_variable]
205-
result["Variable_pvalue"] = est.pvalues[regression_variable]
206-
result["pvalue"] = result["Variable_pvalue"]
215+
result["Beta_pvalue"] = est.pvalues[regression_variable]
216+
result["pvalue"] = result["Beta_pvalue"]
207217

208218
return result
209219

@@ -232,8 +242,8 @@ def _run_binary(self, data, regression_variable, formula) -> Dict:
232242
)
233243
result["Beta"] = est.params[rv_key]
234244
result["SE"] = est.bse[rv_key]
235-
result["Variable_pvalue"] = est.pvalues[rv_key]
236-
result["pvalue"] = result["Variable_pvalue"]
245+
result["Beta_pvalue"] = est.pvalues[rv_key]
246+
result["pvalue"] = result["Beta_pvalue"]
237247

238248
return result
239249

@@ -248,16 +258,33 @@ def _run_categorical(self, data, formula, formula_restricted) -> Dict:
248258
)
249259
# Check convergence
250260
if est.converged & est_restricted.converged:
251-
result["Converged"] = True
252261
# Calculate Results
253262
lrdf = est_restricted.df_resid - est.df_resid
254263
lrstat = -2 * (est_restricted.llf - est.llf)
255264
lr_pvalue = scipy.stats.chi2.sf(lrstat, lrdf)
256-
result["LRT_pvalue"] = lr_pvalue
257-
result["pvalue"] = result["LRT_pvalue"]
258-
result["Diff_AIC"] = est.aic - est_restricted.aic
259-
260-
return result
265+
if self.report_categorical_betas:
266+
param_names = set(est.bse.index) - set(est_restricted.bse.index)
267+
# The restricted model shouldn't have extra terms, unless there is some case we have overlooked
268+
assert len(set(est_restricted.bse.index) - set(est.bse.index)) == 0
269+
for param_name in param_names:
270+
yield {
271+
"Converged": True,
272+
"Category": param_name,
273+
"Beta": est.params[param_name],
274+
"SE": est.bse[param_name],
275+
"Beta_pvalue": est.pvalues[param_name],
276+
"LRT_pvalue": lr_pvalue,
277+
"pvalue": lr_pvalue,
278+
"Diff_AIC": est.aic - est_restricted.aic,
279+
}
280+
else:
281+
# Only return the LRT result
282+
yield {
283+
"Converged": True,
284+
"LRT_pvalue": lr_pvalue,
285+
"pvalue": lr_pvalue,
286+
"Diff_AIC": est.aic - est_restricted.aic,
287+
}
261288

262289
def run(self):
263290
"""Run a regression object, returning the results and logging any warnings/errors"""
@@ -269,9 +296,6 @@ def run(self):
269296
)
270297
# TODO: Parallelize this loop
271298
for rv in rv_list:
272-
# Initialize result with placeholders
273-
result = self.get_default_result_dict(rv)
274-
result["Variable_type"] = rv_type
275299
# Run in a try/except block to catch any errors specific to a regression variable
276300
try:
277301
# Take a copy of the data (ignoring other RVs)
@@ -285,7 +309,6 @@ def run(self):
285309
.any(axis=1)
286310
)
287311
N = complete_case_mask.sum()
288-
result["N"] = N
289312
if N < self.min_n:
290313
raise ValueError(
291314
f"too few complete observations (min_n filter: {N} < {self.min_n})"
@@ -315,20 +338,34 @@ def run(self):
315338

316339
# Run Regression
317340
if rv_type == "continuous":
341+
result = self.get_default_result_dict(rv)
342+
result["Variable_type"] = rv_type
343+
result["N"] = N
318344
result.update(self._run_continuous(data, rv, formula))
345+
self.results.append(result)
319346
elif (
320347
rv_type == "binary"
321348
): # Essentially same as continuous, except string used to key the results
349+
# Initialize result with placeholders
350+
result = self.get_default_result_dict(rv)
351+
result["Variable_type"] = rv_type
352+
result["N"] = N
322353
result.update(self._run_binary(data, rv, formula))
354+
self.results.append(result)
323355
elif rv_type == "categorical":
324-
result.update(
325-
self._run_categorical(data, formula, formula_restricted)
326-
)
356+
for r in self._run_categorical(
357+
data, formula, formula_restricted
358+
):
359+
# Initialize result with placeholders
360+
result = self.get_default_result_dict(rv)
361+
result["Variable_type"] = rv_type
362+
result["N"] = N
363+
result.update(r)
364+
self.results.append(result)
327365

328366
except Exception as e:
329367
self.errors[rv] = str(e)
330-
331-
self.results.append(result)
368+
self.results.append(result)
332369

333370
click.echo(
334371
click.style(

0 commit comments

Comments
 (0)