@@ -45,6 +45,11 @@ class GLMRegression(Regression):
4545 min_n:
4646 Minimum number of complete-case observations (no NA values for outcome, covariates, or variable)
4747 Defaults to 200
48+ report_categorical_betas: boolean
49+ False by default.
50+ If True, the results will contain one row for each categorical value (other than the reference category) and
51+ will include the beta value, standard error (SE), and beta pvalue for that specific category. The number of
52+ terms increases with the number of categories.
4853 """
4954
5055 def __init__ (
@@ -53,6 +58,7 @@ def __init__(
5358 outcome_variable : str ,
5459 covariates : Optional [List [str ]] = None ,
5560 min_n : int = 200 ,
61+ report_categorical_betas : bool = False ,
5662 ):
5763 """
5864 Parameters
@@ -64,6 +70,7 @@ def __init__(
6470 Kwargs
6571 ______
6672 min_n - minimum number of observations (after discarding any with NA)
73+ report_categorical_betas - whether or not to report betas for individual categories
6774 """
6875 # base class init
6976 # This takes in minimal regression params (data, outcome_variable, covariates) and
@@ -74,6 +81,7 @@ def __init__(
7481
7582 # Custom init involving kwargs passed to this regression
7683 self .min_n = min_n
84+ self .report_categorical_betas = report_categorical_betas
7785
7886 # Ensure the data output type is compatible
7987 # Set 'self.family' and 'self.use_t' which are dependent on the outcome dtype
@@ -126,7 +134,7 @@ def get_default_result_dict(rv):
126134 "N" : np .nan ,
127135 "Beta" : np .nan ,
128136 "SE" : np .nan ,
129- "Variable_pvalue " : np .nan ,
137+ "Beta_pvalue " : np .nan ,
130138 "LRT_pvalue" : np .nan ,
131139 "Diff_AIC" : np .nan ,
132140 "pvalue" : np .nan ,
@@ -170,7 +178,12 @@ def get_results(self) -> pd.DataFrame:
170178
171179 # Add "Outcome" and set the index
172180 result ["Outcome" ] = self .outcome_variable
173- result = result .set_index (["Variable" , "Outcome" ])
181+ if self .report_categorical_betas :
182+ result = result .set_index (["Variable" , "Outcome" , "Category" ]).sort_values (
183+ ["pvalue" , "Beta_pvalue" ]
184+ )
185+ else :
186+ result = result .set_index (["Variable" , "Outcome" ]).sort_values (["pvalue" ])
174187
175188 # Order columns
176189 column_order = [
@@ -180,16 +193,13 @@ def get_results(self) -> pd.DataFrame:
180193 "N" ,
181194 "Beta" ,
182195 "SE" ,
183- "Variable_pvalue " ,
196+ "Beta_pvalue " ,
184197 "LRT_pvalue" ,
185198 "Diff_AIC" ,
186199 "pvalue" ,
187200 ]
188201 result = result [column_order ]
189202
190- # Sort rows
191- result = result .sort_values ("pvalue" )
192-
193203 return result
194204
195205 def _run_continuous (self , data , regression_variable , formula ) -> Dict :
@@ -202,8 +212,8 @@ def _run_continuous(self, data, regression_variable, formula) -> Dict:
202212 result ["Converged" ] = True
203213 result ["Beta" ] = est .params [regression_variable ]
204214 result ["SE" ] = est .bse [regression_variable ]
205- result ["Variable_pvalue " ] = est .pvalues [regression_variable ]
206- result ["pvalue" ] = result ["Variable_pvalue " ]
215+ result ["Beta_pvalue " ] = est .pvalues [regression_variable ]
216+ result ["pvalue" ] = result ["Beta_pvalue " ]
207217
208218 return result
209219
@@ -232,8 +242,8 @@ def _run_binary(self, data, regression_variable, formula) -> Dict:
232242 )
233243 result ["Beta" ] = est .params [rv_key ]
234244 result ["SE" ] = est .bse [rv_key ]
235- result ["Variable_pvalue " ] = est .pvalues [rv_key ]
236- result ["pvalue" ] = result ["Variable_pvalue " ]
245+ result ["Beta_pvalue " ] = est .pvalues [rv_key ]
246+ result ["pvalue" ] = result ["Beta_pvalue " ]
237247
238248 return result
239249
@@ -248,16 +258,33 @@ def _run_categorical(self, data, formula, formula_restricted) -> Dict:
248258 )
249259 # Check convergence
250260 if est .converged & est_restricted .converged :
251- result ["Converged" ] = True
252261 # Calculate Results
253262 lrdf = est_restricted .df_resid - est .df_resid
254263 lrstat = - 2 * (est_restricted .llf - est .llf )
255264 lr_pvalue = scipy .stats .chi2 .sf (lrstat , lrdf )
256- result ["LRT_pvalue" ] = lr_pvalue
257- result ["pvalue" ] = result ["LRT_pvalue" ]
258- result ["Diff_AIC" ] = est .aic - est_restricted .aic
259-
260- return result
265+ if self .report_categorical_betas :
266+ param_names = set (est .bse .index ) - set (est_restricted .bse .index )
267+ # The restricted model shouldn't have extra terms, unless there is some case we have overlooked
268+ assert len (set (est_restricted .bse .index ) - set (est .bse .index )) == 0
269+ for param_name in param_names :
270+ yield {
271+ "Converged" : True ,
272+ "Category" : param_name ,
273+ "Beta" : est .params [param_name ],
274+ "SE" : est .bse [param_name ],
275+ "Beta_pvalue" : est .pvalues [param_name ],
276+ "LRT_pvalue" : lr_pvalue ,
277+ "pvalue" : lr_pvalue ,
278+ "Diff_AIC" : est .aic - est_restricted .aic ,
279+ }
280+ else :
281+ # Only return the LRT result
282+ yield {
283+ "Converged" : True ,
284+ "LRT_pvalue" : lr_pvalue ,
285+ "pvalue" : lr_pvalue ,
286+ "Diff_AIC" : est .aic - est_restricted .aic ,
287+ }
261288
262289 def run (self ):
263290 """Run a regression object, returning the results and logging any warnings/errors"""
@@ -269,9 +296,6 @@ def run(self):
269296 )
270297 # TODO: Parallelize this loop
271298 for rv in rv_list :
272- # Initialize result with placeholders
273- result = self .get_default_result_dict (rv )
274- result ["Variable_type" ] = rv_type
275299 # Run in a try/except block to catch any errors specific to a regression variable
276300 try :
277301 # Take a copy of the data (ignoring other RVs)
@@ -285,7 +309,6 @@ def run(self):
285309 .any (axis = 1 )
286310 )
287311 N = complete_case_mask .sum ()
288- result ["N" ] = N
289312 if N < self .min_n :
290313 raise ValueError (
291314 f"too few complete observations (min_n filter: { N } < { self .min_n } )"
@@ -315,20 +338,34 @@ def run(self):
315338
316339 # Run Regression
317340 if rv_type == "continuous" :
341+ result = self .get_default_result_dict (rv )
342+ result ["Variable_type" ] = rv_type
343+ result ["N" ] = N
318344 result .update (self ._run_continuous (data , rv , formula ))
345+ self .results .append (result )
319346 elif (
320347 rv_type == "binary"
321348 ): # Essentially same as continuous, except string used to key the results
349+ # Initialize result with placeholders
350+ result = self .get_default_result_dict (rv )
351+ result ["Variable_type" ] = rv_type
352+ result ["N" ] = N
322353 result .update (self ._run_binary (data , rv , formula ))
354+ self .results .append (result )
323355 elif rv_type == "categorical" :
324- result .update (
325- self ._run_categorical (data , formula , formula_restricted )
326- )
356+ for r in self ._run_categorical (
357+ data , formula , formula_restricted
358+ ):
359+ # Initialize result with placeholders
360+ result = self .get_default_result_dict (rv )
361+ result ["Variable_type" ] = rv_type
362+ result ["N" ] = N
363+ result .update (r )
364+ self .results .append (result )
327365
328366 except Exception as e :
329367 self .errors [rv ] = str (e )
330-
331- self .results .append (result )
368+ self .results .append (result )
332369
333370 click .echo (
334371 click .style (
0 commit comments