@@ -21,7 +21,7 @@ def design_matrix(
2121 as_categorical : Union [bool , list ] = True ,
2222 dmat : Union [pd .DataFrame , None ] = None ,
2323 return_type : str = "patsy" ,
24- ) -> Union [patsy .design_info .DesignMatrix , pd .DataFrame ]:
24+ ) -> Tuple [ Union [patsy .design_info .DesignMatrix , pd .DataFrame ], List [ str ] ]:
2525 """
2626 Create a design matrix from some sample description.
2727
@@ -62,6 +62,7 @@ def design_matrix(
6262 sample_description [col ] = sample_description [col ].astype ("category" )
6363
6464 dmat = patsy .dmatrix (formula , sample_description )
65+ coef_names = dmat .design_info .column_names
6566
6667 if return_type == "dataframe" :
6768 df = pd .DataFrame (dmat , columns = dmat .design_info .column_names )
@@ -70,12 +71,12 @@ def design_matrix(
7071
7172 return df
7273 elif return_type == "patsy" :
73- return dmat
74+ return dmat , coef_names
7475 else :
7576 raise ValueError ("return type %s not recognized" % return_type )
7677 else :
7778 if return_type == "dataframe" :
78- return dmat
79+ return dmat , dmat . columns
7980 elif return_type == "patsy" :
8081 raise ValueError ("return type 'patsy' not supported for input (dmat is not None)" )
8182 else :
@@ -105,7 +106,7 @@ def preview_coef_names(
105106 sample_description : pd .DataFrame ,
106107 formula : str ,
107108 as_categorical : Union [bool , list ] = True
108- ) -> np . ndarray :
109+ ) -> List [ str ] :
109110 """
110111 Return coefficient names of model.
111112
@@ -125,21 +126,22 @@ def preview_coef_names(
125126 Set to false, if columns should not be changed.
126127 :return: A list of coefficient names.
127128 """
128- return view_coef_names ( dmat = design_matrix (
129+ _ , coef_names = design_matrix (
129130 sample_description = sample_description ,
130131 formula = formula ,
131132 as_categorical = as_categorical ,
132133 return_type = "patsy"
133- ))
134+ )
135+ return coef_names
134136
135137
136138def constraint_system_from_star (
137- dmat : Union [None , np . ndarray ] = None ,
139+ dmat : Union [None , patsy . design_info . DesignMatrix , pd . DataFrame ] = None ,
138140 sample_description : Union [None , pd .DataFrame ] = None ,
139141 formula : Union [None , str ] = None ,
140142 as_categorical : Union [bool , list ] = True ,
141143 constraints : Union [None , List [str ], Tuple [str ], dict , np .ndarray ] = None ,
142- return_type : str = "patsy" ,
144+ return_type : str = "patsy"
143145) -> Tuple :
144146 """
145147 Wrap different constraint matrix building formats with building of design matrix.
@@ -202,7 +204,7 @@ def constraint_system_from_star(
202204 raise ValueError ("supply either sample_description or dmat" )
203205
204206 if dmat is None and not isinstance (constraints , dict ):
205- dmat = design_matrix (
207+ dmat , coef_names = design_matrix (
206208 sample_description = sample_description ,
207209 formula = formula ,
208210 as_categorical = as_categorical ,
@@ -213,39 +215,69 @@ def constraint_system_from_star(
213215 raise ValueError ("dmat was supplied even though constraints were given as dict" )
214216
215217 if isinstance (constraints , dict ):
216- dmat , cmat = constraint_matrix_from_dict (
218+ dmat , coef_names , cmat , term_names = constraint_matrix_from_dict (
217219 sample_description = sample_description ,
218220 formula = formula ,
219221 as_categorical = as_categorical ,
220222 constraints = constraints ,
221- return_type = "dataframe "
223+ return_type = "patsy "
222224 )
223225 elif isinstance (constraints , tuple ) or isinstance (constraints , list ):
224- cmat = constraint_matrix_from_string (
226+ cmat , coef_names = constraint_matrix_from_string (
225227 dmat = dmat ,
228+ coef_names = dmat .design_info .column_names ,
226229 constraints = constraints
227230 )
231+ term_names = None # not supported yet.
228232 elif isinstance (constraints , np .ndarray ):
229233 cmat = constraints
234+ term_names = None
235+ if isinstance (dmat , pd .DataFrame ):
236+ coef_names = dmat .columns
237+ dmat = dmat .values
230238 elif constraints is None :
231239 cmat = None
240+ term_names = None
241+ if isinstance (dmat , pd .DataFrame ):
242+ coef_names = dmat .columns
243+ dmat = dmat .values
232244 else :
233245 raise ValueError ("constraint format %s not recognized" % type (constraints ))
234246
235- return dmat , cmat
247+ # Test full design matrix for being full rank before returning:
248+ if cmat is None :
249+ if np .linalg .matrix_rank (dmat ) != dmat .shape [1 ]:
250+ raise ValueError (
251+ "constrained design matrix is not full rank: %i %i" %
252+ (np .linalg .matrix_rank (dmat ), dmat .shape [1 ])
253+ )
254+ else :
255+ if np .linalg .matrix_rank (np .matmul (dmat , cmat )) != cmat .shape [1 ]:
256+ raise ValueError (
257+ "constrained design matrix is not full rank: %i %i" %
258+ (np .linalg .matrix_rank (np .matmul (dmat , cmat )), cmat .shape [1 ])
259+ )
260+
261+ return dmat , coef_names , cmat , term_names
236262
237263
238264def constraint_matrix_from_dict (
239265 sample_description : pd .DataFrame ,
240266 formula : str ,
241267 as_categorical : Union [bool , list ] = True ,
242268 constraints : dict = {},
243- return_type : str = "dataframe "
269+ return_type : str = "patsy "
244270) -> Tuple :
245271 """
246272 Create a design matrix from some sample description and a constraint matrix
247273 based on factor encoding of constrained parameter sets.
248274
275+ Note that we build a dataframe instead of a patsy.DesignMatrix here if constraints are used.
276+ This is done because we were not able to build a patsy.DesignMatrix of the constrained form
277+ required in this context. In those cases in which the return type cannot be patsy, we encourage the
278+ use of the returned term_names to perform term-wise slicing which is not supported by other
279+ design matrix return types.
280+
249281 :param sample_description: pandas.DataFrame of length "num_observations" containing explanatory variables as columns
250282 :param formula: model formula as string, describing the relations of the explanatory variables.
251283
@@ -270,7 +302,9 @@ def constraint_matrix_from_dict(
270302
271303 Can only group by non-constrained effects right now, use constraint_matrix_from_string
272304 for other cases.
273- :return: a model design matrix
305+ :return:
306+ - model design matrix
307+ - term_names to allow slicing by factor if return type cannot be patsy.DesignMatrix
274308 """
275309 assert len (constraints ) > 0 , "supply constraints"
276310 sample_description : pd .DataFrame = sample_description .copy ()
@@ -287,10 +321,11 @@ def constraint_matrix_from_dict(
287321 # absorption of the first level of each factor for each constrained factor onto the
288322 # core matrix.
289323 formula_unconstrained = formula .split ("+" )
290- formula_unconstrained = [x for x in formula_unconstrained if x not in constraints .keys ()]
324+ formula_unconstrained = [x for x in formula_unconstrained if x . strip ( " " ) not in constraints .keys ()]
291325 formula_unconstrained = "+" .join (formula_unconstrained )
292326 dmat = patsy .dmatrix (formula_unconstrained , sample_description )
293327 coef_names = dmat .design_info .column_names
328+ term_names = dmat .design_info .term_names
294329
295330 constraints_ls = string_constraints_from_dict (
296331 sample_description = sample_description ,
@@ -301,6 +336,7 @@ def constraint_matrix_from_dict(
301336 dmat_constrained_temp = patsy .highlevel .dmatrix ("0+" + x , sample_description )
302337 dmat = np .hstack ([dmat , dmat_constrained_temp ])
303338 coef_names .extend (dmat_constrained_temp .design_info .column_names )
339+ term_names .extend (dmat_constrained_temp .design_info .term_names )
304340
305341 # Build constraint matrix.
306342 constraints_ar = constraint_matrix_from_string (
@@ -312,8 +348,7 @@ def constraint_matrix_from_dict(
312348 # Format return type
313349 if return_type == "dataframe" :
314350 dmat = pd .DataFrame (dmat , columns = coef_names )
315-
316- return dmat , constraints_ar
351+ return dmat , coef_names , constraints_ar , term_names
317352
318353
319354def string_constraints_from_dict (
@@ -388,6 +423,10 @@ def constraint_matrix_from_string(
388423
389424 di = patsy .DesignInfo (coef_names )
390425 constraint_ls = [di .linear_constraint (x ).coefs [0 ] for x in constraints ]
426+ # Check that constraints are sensible:
427+ for constraint_i in constraint_ls :
428+ if np .sum (constraint_i != 0 ) == 1 :
429+ raise ValueError ("a zero-equality constraint only involved one parameter: remove this parameter" )
391430 idx_constr = np .asarray ([np .where (x == 1 )[0 ][0 ] for x in constraint_ls ])
392431 idx_depending = [np .where (x == 1 )[0 ][1 :] for x in constraint_ls ]
393432 idx_unconstr = np .asarray (list (
@@ -407,8 +446,10 @@ def constraint_matrix_from_string(
407446 constraint_mat [i , idx_unconstr_i ] = 1
408447
409448 # Test unconstrained subset design matrix for being full rank before returning constraints:
410- dmat_var = dmat [:, idx_unconstr ]
411- if np .linalg .matrix_rank (dmat_var ) != np .linalg .matrix_rank (dmat_var .T ):
412- logging .getLogger ("batchglm" ).error ("constrained design matrix is not full rank" )
449+ if np .linalg .matrix_rank (dmat [:, idx_unconstr ]) != np .linalg .matrix_rank (dmat [:, idx_unconstr ].T ):
450+ raise ValueError (
451+ "unconstrained sub-design matrix is not full rank" %
452+ np .linalg .matrix_rank (dmat [:, idx_unconstr ]), np .linalg .matrix_rank (dmat [:, idx_unconstr ].T )
453+ )
413454
414455 return constraint_mat
0 commit comments