Skip to content

Commit 238bb5d

Browse files
Merge pull request #86 from theislab/dev
Dev
2 parents c63bb23 + 0e8f77d commit 238bb5d

File tree

7 files changed

+90
-29
lines changed

7 files changed

+90
-29
lines changed

batchglm/data.py

Lines changed: 62 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def design_matrix(
2121
as_categorical: Union[bool, list] = True,
2222
dmat: Union[pd.DataFrame, None] = None,
2323
return_type: str = "patsy",
24-
) -> Union[patsy.design_info.DesignMatrix, pd.DataFrame]:
24+
) -> Tuple[Union[patsy.design_info.DesignMatrix, pd.DataFrame], List[str]]:
2525
"""
2626
Create a design matrix from some sample description.
2727
@@ -62,6 +62,7 @@ def design_matrix(
6262
sample_description[col] = sample_description[col].astype("category")
6363

6464
dmat = patsy.dmatrix(formula, sample_description)
65+
coef_names = dmat.design_info.column_names
6566

6667
if return_type == "dataframe":
6768
df = pd.DataFrame(dmat, columns=dmat.design_info.column_names)
@@ -70,12 +71,12 @@ def design_matrix(
7071

7172
return df
7273
elif return_type == "patsy":
73-
return dmat
74+
return dmat, coef_names
7475
else:
7576
raise ValueError("return type %s not recognized" % return_type)
7677
else:
7778
if return_type == "dataframe":
78-
return dmat
79+
return dmat, dmat.columns
7980
elif return_type == "patsy":
8081
raise ValueError("return type 'patsy' not supported for input (dmat is not None)")
8182
else:
@@ -105,7 +106,7 @@ def preview_coef_names(
105106
sample_description: pd.DataFrame,
106107
formula: str,
107108
as_categorical: Union[bool, list] = True
108-
) -> np.ndarray:
109+
) -> List[str]:
109110
"""
110111
Return coefficient names of model.
111112
@@ -125,21 +126,22 @@ def preview_coef_names(
125126
Set to false, if columns should not be changed.
126127
:return: A list of coefficient names.
127128
"""
128-
return view_coef_names(dmat=design_matrix(
129+
_, coef_names = design_matrix(
129130
sample_description=sample_description,
130131
formula=formula,
131132
as_categorical=as_categorical,
132133
return_type="patsy"
133-
))
134+
)
135+
return coef_names
134136

135137

136138
def constraint_system_from_star(
137-
dmat: Union[None, np.ndarray] = None,
139+
dmat: Union[None, patsy.design_info.DesignMatrix, pd.DataFrame] = None,
138140
sample_description: Union[None, pd.DataFrame] = None,
139141
formula: Union[None, str] = None,
140142
as_categorical: Union[bool, list] = True,
141143
constraints: Union[None, List[str], Tuple[str], dict, np.ndarray] = None,
142-
return_type: str = "patsy",
144+
return_type: str = "patsy"
143145
) -> Tuple:
144146
"""
145147
Wrap different constraint matrix building formats with building of design matrix.
@@ -202,7 +204,7 @@ def constraint_system_from_star(
202204
raise ValueError("supply either sample_description or dmat")
203205

204206
if dmat is None and not isinstance(constraints, dict):
205-
dmat = design_matrix(
207+
dmat, coef_names = design_matrix(
206208
sample_description=sample_description,
207209
formula=formula,
208210
as_categorical=as_categorical,
@@ -213,39 +215,69 @@ def constraint_system_from_star(
213215
raise ValueError("dmat was supplied even though constraints were given as dict")
214216

215217
if isinstance(constraints, dict):
216-
dmat, cmat = constraint_matrix_from_dict(
218+
dmat, coef_names, cmat, term_names = constraint_matrix_from_dict(
217219
sample_description=sample_description,
218220
formula=formula,
219221
as_categorical=as_categorical,
220222
constraints=constraints,
221-
return_type="dataframe"
223+
return_type="patsy"
222224
)
223225
elif isinstance(constraints, tuple) or isinstance(constraints, list):
224-
cmat = constraint_matrix_from_string(
226+
cmat, coef_names = constraint_matrix_from_string(
225227
dmat=dmat,
228+
coef_names=dmat.design_info.column_names,
226229
constraints=constraints
227230
)
231+
term_names = None # not supported yet.
228232
elif isinstance(constraints, np.ndarray):
229233
cmat = constraints
234+
term_names = None
235+
if isinstance(dmat, pd.DataFrame):
236+
coef_names = dmat.columns
237+
dmat = dmat.values
230238
elif constraints is None:
231239
cmat = None
240+
term_names = None
241+
if isinstance(dmat, pd.DataFrame):
242+
coef_names = dmat.columns
243+
dmat = dmat.values
232244
else:
233245
raise ValueError("constraint format %s not recognized" % type(constraints))
234246

235-
return dmat, cmat
247+
# Test full design matrix for being full rank before returning:
248+
if cmat is None:
249+
if np.linalg.matrix_rank(dmat) != dmat.shape[1]:
250+
raise ValueError(
251+
"constrained design matrix is not full rank: %i %i" %
252+
(np.linalg.matrix_rank(dmat), dmat.shape[1])
253+
)
254+
else:
255+
if np.linalg.matrix_rank(np.matmul(dmat, cmat)) != cmat.shape[1]:
256+
raise ValueError(
257+
"constrained design matrix is not full rank: %i %i" %
258+
(np.linalg.matrix_rank(np.matmul(dmat, cmat)), cmat.shape[1])
259+
)
260+
261+
return dmat, coef_names, cmat, term_names
236262

237263

238264
def constraint_matrix_from_dict(
239265
sample_description: pd.DataFrame,
240266
formula: str,
241267
as_categorical: Union[bool, list] = True,
242268
constraints: dict = {},
243-
return_type: str = "dataframe"
269+
return_type: str = "patsy"
244270
) -> Tuple:
245271
"""
246272
Create a design matrix from some sample description and a constraint matrix
247273
based on factor encoding of constrained parameter sets.
248274
275+
Note that we build a dataframe instead of a pasty.DesignMatrix here if constraints are used.
276+
This is done because we were not able to build a patsy.DesignMatrix of the constrained form
277+
required in this context. In those cases in which the return type cannot be patsy, we encourage the
278+
use of the returned term_names to perform term-wise slicing which is not supported by other
279+
design matrix return types.
280+
249281
:param sample_description: pandas.DataFrame of length "num_observations" containing explanatory variables as columns
250282
:param formula: model formula as string, describing the relations of the explanatory variables.
251283
@@ -270,7 +302,9 @@ def constraint_matrix_from_dict(
270302
271303
Can only group by non-constrained effects right now, use constraint_matrix_from_string
272304
for other cases.
273-
:return: a model design matrix
305+
:return:
306+
- model design matrix
307+
- term_names to allow slicing by factor if return type cannot be patsy.DesignMatrix
274308
"""
275309
assert len(constraints) > 0, "supply constraints"
276310
sample_description: pd.DataFrame = sample_description.copy()
@@ -287,10 +321,11 @@ def constraint_matrix_from_dict(
287321
# absorption of the first level of each factor for each constrained factor onto the
288322
# core matrix.
289323
formula_unconstrained = formula.split("+")
290-
formula_unconstrained = [x for x in formula_unconstrained if x not in constraints.keys()]
324+
formula_unconstrained = [x for x in formula_unconstrained if x.strip(" ") not in constraints.keys()]
291325
formula_unconstrained = "+".join(formula_unconstrained)
292326
dmat = patsy.dmatrix(formula_unconstrained, sample_description)
293327
coef_names = dmat.design_info.column_names
328+
term_names = dmat.design_info.term_names
294329

295330
constraints_ls = string_constraints_from_dict(
296331
sample_description=sample_description,
@@ -301,6 +336,7 @@ def constraint_matrix_from_dict(
301336
dmat_constrained_temp = patsy.highlevel.dmatrix("0+" + x, sample_description)
302337
dmat = np.hstack([dmat, dmat_constrained_temp])
303338
coef_names.extend(dmat_constrained_temp.design_info.column_names)
339+
term_names.extend(dmat_constrained_temp.design_info.term_names)
304340

305341
# Build constraint matrix.
306342
constraints_ar = constraint_matrix_from_string(
@@ -312,8 +348,7 @@ def constraint_matrix_from_dict(
312348
# Format return type
313349
if return_type == "dataframe":
314350
dmat = pd.DataFrame(dmat, columns=coef_names)
315-
316-
return dmat, constraints_ar
351+
return dmat, coef_names, constraints_ar, term_names
317352

318353

319354
def string_constraints_from_dict(
@@ -388,6 +423,10 @@ def constraint_matrix_from_string(
388423

389424
di = patsy.DesignInfo(coef_names)
390425
constraint_ls = [di.linear_constraint(x).coefs[0] for x in constraints]
426+
# Check that constraints are sensible:
427+
for constraint_i in constraint_ls:
428+
if np.sum(constraint_i != 0) == 1:
429+
raise ValueError("a zero-equality constraint only involved one parameter: remove this parameter")
391430
idx_constr = np.asarray([np.where(x == 1)[0][0] for x in constraint_ls])
392431
idx_depending = [np.where(x == 1)[0][1:] for x in constraint_ls]
393432
idx_unconstr = np.asarray(list(
@@ -407,8 +446,10 @@ def constraint_matrix_from_string(
407446
constraint_mat[i, idx_unconstr_i] = 1
408447

409448
# Test unconstrained subset design matrix for being full rank before returning constraints:
410-
dmat_var =dmat[:, idx_unconstr]
411-
if np.linalg.matrix_rank(dmat_var) != np.linalg.matrix_rank(dmat_var.T):
412-
logging.getLogger("batchglm").error("constrained design matrix is not full rank")
449+
if np.linalg.matrix_rank(dmat[:, idx_unconstr]) != np.linalg.matrix_rank(dmat[:, idx_unconstr].T):
450+
raise ValueError(
451+
"unconstrained sub-design matrix is not full rank" %
452+
np.linalg.matrix_rank(dmat[:, idx_unconstr]), np.linalg.matrix_rank(dmat[:, idx_unconstr].T)
453+
)
413454

414455
return constraint_mat

batchglm/models/base_glm/input.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def __init__(
8888
design_matrix=design_loc,
8989
param_names=design_loc_names
9090
)
91-
design_scale, design_scale_names = parse_design(
91+
design_scale, design_scale_names = parse_design(
9292
design_matrix=design_scale,
9393
param_names=design_scale_names
9494
)

batchglm/models/base_glm/simulator.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import numpy as np
44
import pandas
55
import patsy
6-
from typing import Union
6+
from typing import Union, Tuple
77

88
from .model import _ModelGLM
99
from .external import _SimulatorBase
@@ -14,7 +14,7 @@ def generate_sample_description(
1414
num_conditions: int = 2,
1515
num_batches: int = 4,
1616
shuffle_assignments=False
17-
) -> np.ndarray:
17+
) -> Tuple[patsy.DesignMatrix, pandas.DataFrame]:
1818
""" Build a sample description.
1919
2020
:param num_observations: Number of observations to simulate.
@@ -45,7 +45,7 @@ def generate_sample_description(
4545
observations=np.random.permutation(sample_description.observations.values)
4646
)
4747

48-
return np.asarray(patsy.dmatrix("~1+condition+batch", sample_description)), sample_description
48+
return patsy.dmatrix("~1+condition+batch", sample_description), sample_description
4949

5050

5151
class _SimulatorGLM(_SimulatorBase, metaclass=abc.ABCMeta):
@@ -73,6 +73,7 @@ def __init__(
7373
self.sample_description = None
7474
self.sim_a_var = None
7575
self.sim_b_var = None
76+
self._size_factors = None
7677

7778
def generate_sample_description(
7879
self,
@@ -139,6 +140,10 @@ def _generate_params(
139140
rand_fn_scale((self.sim_design_scale.shape[1], self.nfeatures))
140141
], axis=0)
141142

143+
@property
144+
def size_factors(self):
145+
return self._size_factors
146+
142147
@property
143148
def a_var(self):
144149
return self.sim_a_var

batchglm/models/base_glm/utils.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,8 @@ def parse_design(
3737
assert False
3838

3939
if param_names is not None:
40-
if params is not None:
41-
assert len(param_names) == len(params)
40+
if params is None:
41+
assert len(param_names) == dmat.shape[1]
4242
params = param_names
4343

4444
return dmat, params
@@ -62,8 +62,11 @@ def parse_constraints(
6262
constraints = np.identity(n=dmat.shape[1])
6363
constraint_params = dmat_par_names
6464
else:
65-
# Cannot use given parameter names if constraint matrix is not identity: Make up new ones.
66-
constraint_params = ["var_"+str(x) for x in range(constraints.shape[1])]
65+
# Cannot use all parameter names if constraint matrix is not identity: Make up new ones.
66+
# Use variable names that can be mapped (unconstrained).
67+
constraint_params = ["var_"+str(i) if np.sum(constraints[:, i] != 0) > 1
68+
else dmat_par_names[np.where(constraints[:, i] != 0)[0][0]]
69+
for i in range(constraints.shape[1])]
6770
assert constraints.shape[0] == dmat.shape[1], "constraint dimension mismatch"
6871

6972
if constraint_par_names is not None:

batchglm/models/glm_nb/simulator.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ def __init__(
1515
num_observations=1000,
1616
num_features=100
1717
):
18+
Model.__init__(
19+
self=self,
20+
input_data=None
21+
)
1822
_SimulatorGLM.__init__(
1923
self=self,
2024
model=None,
@@ -53,3 +57,4 @@ def generate_data(self):
5357
design_loc_names=None,
5458
design_scale_names=None
5559
)
60+

batchglm/unit_test/test_acc_constrained_vglm_all.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,11 @@ def simulate(self):
2424
constraints[1, 1] = 1
2525
constraints[2, 2] = 1
2626
constraints[3, 2] = -1
27+
new_coef_names = ['Intercept', 'condition[T.1]', 'batch[1]', 'batch[2]']
2728
self.sim1.input_data.design_loc = dmat
2829
self.sim1.input_data.design_scale = dmat
30+
self.sim1.input_data._design_loc_names = new_coef_names
31+
self.sim1.input_data._design_scale_names = new_coef_names
2932
self.sim1.input_data.constraints_loc = constraints
3033
self.sim1.input_data.constraints_scale = constraints
3134

batchglm/unit_test/test_acc_glm_all.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ def __init__(
5050
data=scipy.sparse.csr_matrix(simulator.input_data.x),
5151
design_loc=simulator.input_data.design_loc,
5252
design_scale=simulator.input_data.design_scale,
53+
design_loc_names=simulator.input_data.design_loc_names,
54+
design_scale_names=simulator.input_data.design_scale_names,
5355
constraints_loc=simulator.input_data.constraints_loc,
5456
constraints_scale=simulator.input_data.constraints_scale,
5557
size_factors=simulator.input_data.size_factors
@@ -59,6 +61,8 @@ def __init__(
5961
data=simulator.input_data.x,
6062
design_loc=simulator.input_data.design_loc,
6163
design_scale=simulator.input_data.design_scale,
64+
design_loc_names=simulator.input_data.design_loc_names,
65+
design_scale_names=simulator.input_data.design_scale_names,
6266
constraints_loc=simulator.input_data.constraints_loc,
6367
constraints_scale=simulator.input_data.constraints_scale,
6468
size_factors=simulator.input_data.size_factors

0 commit comments

Comments
 (0)