Skip to content

Commit 7b186ae

Browse files
added global design and constraint generatio wrapper used in diffxpy now
1 parent 7b01f35 commit 7b186ae

File tree

2 files changed

+208
-75
lines changed

2 files changed

+208
-75
lines changed

batchglm/api/data.py

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,5 @@
1-
from batchglm.data import design_matrix
2-
from batchglm.data import design_matrix_from_xarray
3-
from batchglm.data import design_matrix_from_anndata
4-
from batchglm.data import sample_description_from_xarray
5-
from batchglm.data import sample_description_from_anndata
6-
from batchglm.data import load_mtx_to_adata
7-
from batchglm.data import load_mtx_to_xarray
8-
from batchglm.data import load_recursive_mtx
9-
from batchglm.data import xarray_from_data
10-
from batchglm.data import setup_constrained, constraint_matrix_from_string
1+
from batchglm.data import design_matrix, design_matrix_from_xarray, design_matrix_from_anndata
2+
from batchglm.data import sample_description_from_xarray, sample_description_from_anndata
3+
from batchglm.data import load_mtx_to_adata, load_mtx_to_xarray, load_recursive_mtx, xarray_from_data
4+
from batchglm.data import constraint_matrix_from_dict, constraint_matrix_from_string, string_constraints_from_dict
115
from batchglm.data import view_coef_names, preview_coef_names

batchglm/data.py

Lines changed: 204 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -415,7 +415,7 @@ def load_mtx_to_adata(path, cache=True):
415415
delim = "\t"
416416

417417
fpath = os.path.join(path, file)
418-
logger.info("Reading %s as gene annotation...", fpath)
418+
logging.getLogger("batchglm").info("Reading %s as gene annotation...", fpath)
419419
tbl = pd.read_csv(fpath, header=None, sep=delim)
420420
tbl.columns = np.vectorize(lambda x: "col_%d" % x)(tbl.columns)
421421

@@ -427,7 +427,7 @@ def load_mtx_to_adata(path, cache=True):
427427
delim = "\t"
428428

429429
fpath = os.path.join(path, file)
430-
logger.info("Reading %s as barcode file...", fpath)
430+
logging.getLogger("batchglm").info("Reading %s as barcode file...", fpath)
431431
tbl = pd.read_csv(fpath, header=None, sep=delim)
432432
tbl.columns = np.vectorize(lambda x: "col_%d" % x)(tbl.columns)
433433

@@ -507,10 +507,10 @@ def load_recursive_mtx(dir_or_zipfile, target_format="xarray", cache=True) -> Di
507507
for file in files:
508508
if file == "matrix.mtx":
509509
if target_format.lower() == "xarray":
510-
logger.info("Reading %s as xarray...", root)
510+
logging.getLogger("batchglm").info("Reading %s as xarray...", root)
511511
ad = load_mtx_to_xarray(root)
512512
elif target_format.lower() == "adata" or target_format.lower() == "anndata":
513-
logger.info("Reading %s as AnnData...", root)
513+
logging.getLogger("batchglm").info("Reading %s as AnnData...", root)
514514
ad = load_mtx_to_adata(root, cache=cache)
515515
else:
516516
raise RuntimeError("Unknown target format %s" % target_format)
@@ -520,12 +520,124 @@ def load_recursive_mtx(dir_or_zipfile, target_format="xarray", cache=True) -> Di
520520
return adatas
521521

522522

523-
def setup_constrained(
523+
def constraint_system_from_star(
524+
dmat: Union[None, np.ndarray, xr.DataArray, xr.Dataset] = None,
525+
sample_description: Union[None, pd.DataFrame] = None,
526+
formula: Union[None, str] = None,
527+
as_categorical: Union[bool, list] = True,
528+
constraints: Union[None, List[str], Tuple[str], dict, np.ndarray] = None,
529+
dims: Union[Tuple[str, str], List[str]] = (),
530+
return_type: str = "xarray",
531+
) -> Tuple:
532+
"""
533+
Wrap different constraint matrix building formats with building of design matrix.
534+
535+
:param dmat: Pre-built model design matrix.
536+
:param sample_description: pandas.DataFrame of length "num_observations" containing explanatory variables as columns
537+
:param formula: model formula as string, describing the relations of the explanatory variables.
538+
539+
E.g. '~ 1 + batch + confounder'
540+
:param as_categorical: boolean or list of booleans corresponding to the columns in 'sample_description'
541+
542+
If True, all values in 'sample_description' will be treated as categorical values.
543+
544+
If list of booleans, each column will be changed to categorical if the corresponding value in 'as_categorical'
545+
is True.
546+
547+
Set to false, if columns should not be changed.
548+
:param constraints: Constraints for model. Can be one of the following:
549+
550+
- np.ndarray:
551+
Array with constraints in rows and model parameters in columns.
552+
Each constraint contains non-zero entries for the a of parameters that
553+
has to sum to zero. This constraint is enforced by binding one parameter
554+
to the negative sum of the other parameters, effectively representing that
555+
parameter as a function of the other parameters. This dependent
556+
parameter is indicated by a -1 in this array, the independent parameters
557+
of that constraint (which may be dependent at an earlier constraint)
558+
are indicated by a 1. You should only use this option
559+
together with prebuilt design matrix for the scale model, dmat_scale,
560+
for example via de.utils.setup_constrained().
561+
- dict:
562+
Every element of the dictionary corresponds to one set of equality constraints.
563+
Each set has to be be an entry of the form {..., x: y, ...}
564+
where x is the factor to be constrained and y is a factor by which levels of x are grouped
565+
and then constrained. Set y="1" to constrain all levels of x to sum to one,
566+
a single equality constraint.
567+
568+
E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to
569+
zero. This is applicable if repeats of a an experiment within each condition
570+
are independent so that the set-up ~1+condition+batch is perfectly confounded.
571+
572+
Can only group by non-constrained effects right now, use constraint_matrix_from_string
573+
for other cases.
574+
- list of strings or tuple of strings:
575+
String encoded equality constraints.
576+
577+
E.g. ["batch1 + batch2 + batch3 = 0"]
578+
- None:
579+
No constraints are used, this is equivalent to using an identity matrix as a
580+
constraint matrix.
581+
:param dims: Dimension names of xarray.
582+
583+
E.g.: ["design_loc_params", "loc_params"] or ["design_scale_params", "scale_params"]
584+
:param return_type: type of the returned value.
585+
586+
- "patsy": return plain patsy.design_info.DesignMatrix object
587+
- "dataframe": return pd.DataFrame with observations as rows and params as columns
588+
- "xarray": return xr.Dataset with design matrix as ds["design"] and the sample description embedded as
589+
one variable per column
590+
This option is overridden if constraints are supplied as dict.
591+
:return: a model design matrix and a constraint matrix formatted as xr.DataArray
592+
"""
593+
if sample_description is None and dmat is None:
594+
raise ValueError("supply either sample_description or dmat")
595+
596+
if dmat is None and not isinstance(constraints, dict):
597+
dmat = design_matrix(
598+
sample_description=sample_description,
599+
formula=formula,
600+
as_categorical=as_categorical,
601+
dmat=None,
602+
return_type=return_type
603+
)
604+
elif dmat is not None and isinstance(constraints, dict):
605+
raise ValueError("dmat was supplied even though constraints were given as dict")
606+
607+
if isinstance(constraints, dict):
608+
dmat, cmat = constraint_matrix_from_dict(
609+
sample_description=sample_description,
610+
formula=formula,
611+
as_categorical=as_categorical,
612+
constraints=constraints,
613+
dims=dims
614+
)
615+
elif isinstance(constraints, tuple) or isinstance(constraints, list):
616+
cmat = constraint_matrix_from_string(
617+
dmat=dmat,
618+
constraints=constraints,
619+
dims=dims
620+
)
621+
elif isinstance(constraints, np.ndarray) or isinstance(constraints, xr.DataArray):
622+
cmat = parse_constraints(
623+
dmat=dmat,
624+
constraints=constraints,
625+
dims=dims
626+
)
627+
elif constraints is None:
628+
cmat = None
629+
else:
630+
raise ValueError("constraint format %s not recognized" % type(constraints))
631+
632+
return dmat, cmat
633+
634+
635+
def constraint_matrix_from_dict(
524636
sample_description: pd.DataFrame,
525637
formula: str,
526-
as_numeric: Union[List[str], Tuple[str], str] = (),
638+
as_categorical: Union[bool, list] = True,
527639
constraints: dict = {},
528-
dims: Union[Tuple[str], List[str]] = ()
640+
dims: Union[Tuple[str, str], List[str]] = ()
529641
) -> Tuple:
530642
"""
531643
Create a design matrix from some sample description and a constraint matrix
@@ -535,10 +647,14 @@ def setup_constrained(
535647
:param formula: model formula as string, describing the relations of the explanatory variables.
536648
537649
E.g. '~ 1 + batch + confounder'
538-
:param as_numeric:
539-
Which columns of sample_description to treat as numeric and
540-
not as categorical. This yields columns in the design matrix
541-
which do not correspond to one-hot encoded discrete factors.
650+
:param as_categorical: boolean or list of booleans corresponding to the columns in 'sample_description'
651+
652+
If True, all values in 'sample_description' will be treated as categorical values.
653+
654+
If list of booleans, each column will be changed to categorical if the corresponding value in 'as_categorical'
655+
is True.
656+
657+
Set to false, if columns should not be changed.
542658
:param constraints: Grouped factors to enfore equality constraints on. Every element of
543659
the dictionary corresponds to one set of equality constraints. Each set has to be
544660
be an entry of the form {..., x: y, ...} where x is the factor to be constrained and y is
@@ -551,17 +667,15 @@ def setup_constrained(
551667
552668
Can only group by non-constrained effects right now, use constraint_matrix_from_string
553669
for other cases.
554-
:param dims: ["design_loc_params", "loc_params"] or ["design_scale_params", "scale_params"]
555-
Dimension names of xarray.
670+
:param dims: Dimension names of xarray.
671+
672+
E.g.: ["design_loc_params", "loc_params"] or ["design_scale_params", "scale_params"]
556673
:return: a model design matrix
557674
"""
558675
assert len(constraints) > 0, "supply constraints"
559676
assert len(dims) == 2, "supply 2 dimension names in dim"
560677
sample_description: pd.DataFrame = sample_description.copy()
561678

562-
if isinstance(as_numeric, str):
563-
as_numeric = [as_numeric]
564-
as_categorical = [False if x in as_numeric else True for x in sample_description.columns.values]
565679
if type(as_categorical) is not bool or as_categorical:
566680
if type(as_categorical) is bool and as_categorical:
567681
as_categorical = np.repeat(True, sample_description.columns.size)
@@ -579,30 +693,16 @@ def setup_constrained(
579693
dmat = patsy.dmatrix(formula_unconstrained, sample_description)
580694
coef_names = dmat.design_info.column_names
581695

582-
constraints_ls = []
696+
constraints_ls = string_constraints_from_dict(
697+
sample_description=sample_description,
698+
constraints=constraints
699+
)
583700
for i, x in enumerate(constraints.keys()):
584701
assert isinstance(x, str), "constrained should contain strings"
585702
dmat_constrained_temp = patsy.highlevel.dmatrix("0+" + x, sample_description)
586703
dmat = np.hstack([dmat, dmat_constrained_temp])
587704
coef_names.extend(dmat_constrained_temp.design_info.column_names)
588705

589-
# Build slices by group.
590-
dmat_grouping_temp = patsy.highlevel.dmatrix("0+" + list(constraints.values())[i], sample_description)
591-
for j in range(dmat_grouping_temp.shape[1]):
592-
grouping = dmat_grouping_temp[:, j]
593-
idx_constrained_group = np.where(np.sum(dmat_constrained_temp[grouping == 1, :], axis=0) > 0)[0]
594-
# Assert that required grouping is nested.
595-
assert np.all(np.logical_xor(
596-
np.sum(dmat_constrained_temp[grouping == 1, :], axis=0) > 0,
597-
np.sum(dmat_constrained_temp[grouping == 0, :], axis=0) > 0
598-
)), "proposed grouping of constraints is not nested, read docstrings"
599-
# Add new string-encoded equality constraint.
600-
constraints_ls.append(
601-
"+".join(list(np.asarray(dmat_constrained_temp.design_info.column_names)[idx_constrained_group]))+"=0"
602-
)
603-
604-
logging.getLogger("batchglm").warning("Built constraints: "+", ".join(constraints_ls))
605-
606706
# Parse design matrix to xarray.
607707
ar = xr.DataArray(dmat, dims=("observations", "design_params"))
608708
ar.coords["design_params"] = coef_names
@@ -617,9 +717,60 @@ def setup_constrained(
617717
return ar, constraints_ar
618718

619719

720+
def string_constraints_from_dict(
721+
sample_description: pd.DataFrame,
722+
constraints: Union[None, dict] = {}
723+
):
724+
r"""
725+
Create string-encoded constraints from dictionary encoded constraints and sample description.
726+
727+
:param sample_description: pandas.DataFrame of length "num_observations" containing explanatory variables as columns
728+
:param constraints: Grouped factors to enfore equality constraints on. Every element of
729+
the dictionary corresponds to one set of equality constraints. Each set has to be
730+
be an entry of the form {..., x: y, ...} where x is the factor to be constrained and y is
731+
a factor by which levels of x are grouped and then constrained. Set y="1" to constrain
732+
all levels of x to sum to one, a single equality constraint.
733+
734+
E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to
735+
zero. This is applicable if repeats of a an experiment within each condition
736+
are independent so that the set-up ~1+condition+batch is perfectly confounded.
737+
738+
Can only group by non-constrained effects right now, use constraint_matrix_from_string
739+
for other cases.
740+
:return: List of constraints as strings.
741+
742+
E.g. ["batch1 + batch2 + batch3 = 0"]
743+
"""
744+
if constraints is not None:
745+
constraints_ls = []
746+
for i, x in enumerate(constraints.keys()):
747+
assert isinstance(x, str), "constrained should contain strings"
748+
dmat_constrained_temp = patsy.highlevel.dmatrix("0+" + x, sample_description)
749+
750+
dmat_grouping_temp = patsy.highlevel.dmatrix("0+" + list(constraints.values())[i], sample_description)
751+
for j in range(dmat_grouping_temp.shape[1]):
752+
grouping = dmat_grouping_temp[:, j]
753+
idx_constrained_group = np.where(np.sum(dmat_constrained_temp[grouping == 1, :], axis=0) > 0)[0]
754+
# Assert that required grouping is nested.
755+
assert np.all(np.logical_xor(
756+
np.sum(dmat_constrained_temp[grouping == 1, :], axis=0) > 0,
757+
np.sum(dmat_constrained_temp[grouping == 0, :], axis=0) > 0
758+
)), "proposed grouping of constraints is not nested, read docstrings"
759+
# Add new string-encoded equality constraint.
760+
constraints_ls.append(
761+
"+".join(list(np.asarray(dmat_constrained_temp.design_info.column_names)[idx_constrained_group])) + "=0"
762+
)
763+
764+
logging.getLogger("batchglm").warning("Built constraints: " + ", ".join(constraints_ls))
765+
else:
766+
constraints_ls = None
767+
768+
return constraints_ls
769+
770+
620771
def constraint_matrix_from_string(
621772
dmat: Union[xr.DataArray, xr.Dataset],
622-
constraints: Union[Tuple[str], List[str]],
773+
constraints: Union[Tuple[str, str], List[str]],
623774
dims: list
624775
):
625776
r"""
@@ -629,8 +780,9 @@ def constraint_matrix_from_string(
629780
:param constraints: List of constraints as strings.
630781
631782
E.g. ["batch1 + batch2 + batch3 = 0"]
632-
:param dims: ["design_loc_params", "loc_params"] or ["design_scale_params", "scale_params"]
633-
Dimension names of xarray.
783+
:param dims: Dimension names of xarray.
784+
785+
E.g.: ["design_loc_params", "loc_params"] or ["design_scale_params", "scale_params"]
634786
:return: a constraint matrix
635787
"""
636788
assert len(constraints) > 0, "supply constraints"
@@ -666,14 +818,13 @@ def constraint_matrix_from_string(
666818
dims=dims
667819
)
668820

669-
# Test reduced design matrix for full rank before returning constraints:
821+
# Test unconstrained subset design matrix for being full rank before returning constraints:
670822
dmat_var = xr.DataArray(
671823
dims=[dmat.dims[0], "params"],
672824
data=dmat[:, idx_unconstr],
673825
coords={dmat.dims[0]: dmat.coords["observations"].values,
674826
"params": dmat.coords["design_params"].values[idx_unconstr]}
675827
)
676-
677828
if np.linalg.matrix_rank(dmat_var) != np.linalg.matrix_rank(dmat_var.T):
678829
logging.getLogger("batchglm").error("constrained design matrix is not full rank")
679830

@@ -682,41 +833,29 @@ def constraint_matrix_from_string(
682833

683834
def parse_constraints(
684835
dmat: Union[xr.DataArray, xr.Dataset],
685-
constraints: np.ndarray,
836+
constraints: Union[np.ndarray, xr.DataArray],
686837
dims: list
687838
):
688839
r"""
689840
Parse constraint matrix into xarray.
690841
691842
:param dmat: Design matrix.
692843
:param constraints: a constraint matrix.
693-
:param dims: ["design_loc_params", "loc_params"] or ["design_scale_params", "scale_params"]
694-
Dimension names of xarray
844+
:param dims: Dimension names of xarray.
845+
846+
E.g.: ["design_loc_params", "loc_params"] or ["design_scale_params", "scale_params"]
695847
:return: constraint matrix in xarray format
696848
"""
697849
if isinstance(dmat, xr.Dataset):
698850
dmat = dmat.data_vars['design']
699851

700-
constraints_ar = xr.DataArray(
701-
dims=dims,
702-
data=constraints,
703-
coords={dims[0]: dmat.coords["design_params"].values,
704-
dims[1]: ["var_"+str(x) for x in range(constraints.shape[1])]}
705-
)
706-
return constraints_ar
707-
708-
709-
class ChDir:
710-
"""
711-
Context manager to temporarily change the working directory
712-
"""
713-
714-
def __init__(self, path):
715-
self.cwd = os.getcwd()
716-
self.other_wd = path
717-
718-
def __enter__(self):
719-
os.chdir(self.other_wd)
720-
721-
def __exit__(self, *args):
722-
os.chdir(self.cwd)
852+
if isinstance(constraints, xr.DataArray):
853+
return constraints
854+
else:
855+
constraints_ar = xr.DataArray(
856+
dims=dims,
857+
data=constraints,
858+
coords={dims[0]: dmat.coords["design_params"].values,
859+
dims[1]: ["var_"+str(x) for x in range(constraints.shape[1])]}
860+
)
861+
return constraints_ar

0 commit comments

Comments
 (0)