@@ -415,7 +415,7 @@ def load_mtx_to_adata(path, cache=True):
415415 delim = "\t "
416416
417417 fpath = os .path .join (path , file )
418- logger .info ("Reading %s as gene annotation..." , fpath )
418+ logging . getLogger ( "batchglm" ) .info ("Reading %s as gene annotation..." , fpath )
419419 tbl = pd .read_csv (fpath , header = None , sep = delim )
420420 tbl .columns = np .vectorize (lambda x : "col_%d" % x )(tbl .columns )
421421
@@ -427,7 +427,7 @@ def load_mtx_to_adata(path, cache=True):
427427 delim = "\t "
428428
429429 fpath = os .path .join (path , file )
430- logger .info ("Reading %s as barcode file..." , fpath )
430+ logging . getLogger ( "batchglm" ) .info ("Reading %s as barcode file..." , fpath )
431431 tbl = pd .read_csv (fpath , header = None , sep = delim )
432432 tbl .columns = np .vectorize (lambda x : "col_%d" % x )(tbl .columns )
433433
@@ -507,10 +507,10 @@ def load_recursive_mtx(dir_or_zipfile, target_format="xarray", cache=True) -> Di
507507 for file in files :
508508 if file == "matrix.mtx" :
509509 if target_format .lower () == "xarray" :
510- logger .info ("Reading %s as xarray..." , root )
510+ logging . getLogger ( "batchglm" ) .info ("Reading %s as xarray..." , root )
511511 ad = load_mtx_to_xarray (root )
512512 elif target_format .lower () == "adata" or target_format .lower () == "anndata" :
513- logger .info ("Reading %s as AnnData..." , root )
513+ logging . getLogger ( "batchglm" ) .info ("Reading %s as AnnData..." , root )
514514 ad = load_mtx_to_adata (root , cache = cache )
515515 else :
516516 raise RuntimeError ("Unknown target format %s" % target_format )
@@ -520,12 +520,124 @@ def load_recursive_mtx(dir_or_zipfile, target_format="xarray", cache=True) -> Di
520520 return adatas
521521
522522
523- def setup_constrained (
523+ def constraint_system_from_star (
524+ dmat : Union [None , np .ndarray , xr .DataArray , xr .Dataset ] = None ,
525+ sample_description : Union [None , pd .DataFrame ] = None ,
526+ formula : Union [None , str ] = None ,
527+ as_categorical : Union [bool , list ] = True ,
528+ constraints : Union [None , List [str ], Tuple [str ], dict , np .ndarray ] = None ,
529+ dims : Union [Tuple [str , str ], List [str ]] = (),
530+ return_type : str = "xarray" ,
531+ ) -> Tuple :
532+ """
533+ Wrap different constraint matrix building formats with building of design matrix.
534+
535+ :param dmat: Pre-built model design matrix.
536+ :param sample_description: pandas.DataFrame of length "num_observations" containing explanatory variables as columns
537+ :param formula: model formula as string, describing the relations of the explanatory variables.
538+
539+ E.g. '~ 1 + batch + confounder'
540+ :param as_categorical: boolean or list of booleans corresponding to the columns in 'sample_description'
541+
542+ If True, all values in 'sample_description' will be treated as categorical values.
543+
544+ If list of booleans, each column will be changed to categorical if the corresponding value in 'as_categorical'
545+ is True.
546+
547+ Set to false, if columns should not be changed.
548+ :param constraints: Constraints for model. Can be one of the following:
549+
550+ - np.ndarray:
551+ Array with constraints in rows and model parameters in columns.
552+ Each constraint contains non-zero entries for the a of parameters that
553+ has to sum to zero. This constraint is enforced by binding one parameter
554+ to the negative sum of the other parameters, effectively representing that
555+ parameter as a function of the other parameters. This dependent
556+ parameter is indicated by a -1 in this array, the independent parameters
557+ of that constraint (which may be dependent at an earlier constraint)
558+ are indicated by a 1. You should only use this option
559+ together with prebuilt design matrix for the scale model, dmat_scale,
560+ for example via de.utils.setup_constrained().
561+ - dict:
562+ Every element of the dictionary corresponds to one set of equality constraints.
563+ Each set has to be be an entry of the form {..., x: y, ...}
564+ where x is the factor to be constrained and y is a factor by which levels of x are grouped
565+ and then constrained. Set y="1" to constrain all levels of x to sum to one,
566+ a single equality constraint.
567+
568+ E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to
569+ zero. This is applicable if repeats of a an experiment within each condition
570+ are independent so that the set-up ~1+condition+batch is perfectly confounded.
571+
572+ Can only group by non-constrained effects right now, use constraint_matrix_from_string
573+ for other cases.
574+ - list of strings or tuple of strings:
575+ String encoded equality constraints.
576+
577+ E.g. ["batch1 + batch2 + batch3 = 0"]
578+ - None:
579+ No constraints are used, this is equivalent to using an identity matrix as a
580+ constraint matrix.
581+ :param dims: Dimension names of xarray.
582+
583+ E.g.: ["design_loc_params", "loc_params"] or ["design_scale_params", "scale_params"]
584+ :param return_type: type of the returned value.
585+
586+ - "patsy": return plain patsy.design_info.DesignMatrix object
587+ - "dataframe": return pd.DataFrame with observations as rows and params as columns
588+ - "xarray": return xr.Dataset with design matrix as ds["design"] and the sample description embedded as
589+ one variable per column
590+ This option is overridden if constraints are supplied as dict.
591+ :return: a model design matrix and a constraint matrix formatted as xr.DataArray
592+ """
593+ if sample_description is None and dmat is None :
594+ raise ValueError ("supply either sample_description or dmat" )
595+
596+ if dmat is None and not isinstance (constraints , dict ):
597+ dmat = design_matrix (
598+ sample_description = sample_description ,
599+ formula = formula ,
600+ as_categorical = as_categorical ,
601+ dmat = None ,
602+ return_type = return_type
603+ )
604+ elif dmat is not None and isinstance (constraints , dict ):
605+ raise ValueError ("dmat was supplied even though constraints were given as dict" )
606+
607+ if isinstance (constraints , dict ):
608+ dmat , cmat = constraint_matrix_from_dict (
609+ sample_description = sample_description ,
610+ formula = formula ,
611+ as_categorical = as_categorical ,
612+ constraints = constraints ,
613+ dims = dims
614+ )
615+ elif isinstance (constraints , tuple ) or isinstance (constraints , list ):
616+ cmat = constraint_matrix_from_string (
617+ dmat = dmat ,
618+ constraints = constraints ,
619+ dims = dims
620+ )
621+ elif isinstance (constraints , np .ndarray ) or isinstance (constraints , xr .DataArray ):
622+ cmat = parse_constraints (
623+ dmat = dmat ,
624+ constraints = constraints ,
625+ dims = dims
626+ )
627+ elif constraints is None :
628+ cmat = None
629+ else :
630+ raise ValueError ("constraint format %s not recognized" % type (constraints ))
631+
632+ return dmat , cmat
633+
634+
635+ def constraint_matrix_from_dict (
524636 sample_description : pd .DataFrame ,
525637 formula : str ,
526- as_numeric : Union [List [ str ], Tuple [ str ], str ] = () ,
638+ as_categorical : Union [bool , list ] = True ,
527639 constraints : dict = {},
528- dims : Union [Tuple [str ], List [str ]] = ()
640+ dims : Union [Tuple [str , str ], List [str ]] = ()
529641) -> Tuple :
530642 """
531643 Create a design matrix from some sample description and a constraint matrix
@@ -535,10 +647,14 @@ def setup_constrained(
535647 :param formula: model formula as string, describing the relations of the explanatory variables.
536648
537649 E.g. '~ 1 + batch + confounder'
538- :param as_numeric:
539- Which columns of sample_description to treat as numeric and
540- not as categorical. This yields columns in the design matrix
541- which do not correspond to one-hot encoded discrete factors.
650+ :param as_categorical: boolean or list of booleans corresponding to the columns in 'sample_description'
651+
652+ If True, all values in 'sample_description' will be treated as categorical values.
653+
654+ If list of booleans, each column will be changed to categorical if the corresponding value in 'as_categorical'
655+ is True.
656+
657+ Set to false, if columns should not be changed.
542658 :param constraints: Grouped factors to enfore equality constraints on. Every element of
543659 the dictionary corresponds to one set of equality constraints. Each set has to be
544660 be an entry of the form {..., x: y, ...} where x is the factor to be constrained and y is
@@ -551,17 +667,15 @@ def setup_constrained(
551667
552668 Can only group by non-constrained effects right now, use constraint_matrix_from_string
553669 for other cases.
554- :param dims: ["design_loc_params", "loc_params"] or ["design_scale_params", "scale_params"]
555- Dimension names of xarray.
670+ :param dims: Dimension names of xarray.
671+
672+ E.g.: ["design_loc_params", "loc_params"] or ["design_scale_params", "scale_params"]
556673 :return: a model design matrix
557674 """
558675 assert len (constraints ) > 0 , "supply constraints"
559676 assert len (dims ) == 2 , "supply 2 dimension names in dim"
560677 sample_description : pd .DataFrame = sample_description .copy ()
561678
562- if isinstance (as_numeric , str ):
563- as_numeric = [as_numeric ]
564- as_categorical = [False if x in as_numeric else True for x in sample_description .columns .values ]
565679 if type (as_categorical ) is not bool or as_categorical :
566680 if type (as_categorical ) is bool and as_categorical :
567681 as_categorical = np .repeat (True , sample_description .columns .size )
@@ -579,30 +693,16 @@ def setup_constrained(
579693 dmat = patsy .dmatrix (formula_unconstrained , sample_description )
580694 coef_names = dmat .design_info .column_names
581695
582- constraints_ls = []
696+ constraints_ls = string_constraints_from_dict (
697+ sample_description = sample_description ,
698+ constraints = constraints
699+ )
583700 for i , x in enumerate (constraints .keys ()):
584701 assert isinstance (x , str ), "constrained should contain strings"
585702 dmat_constrained_temp = patsy .highlevel .dmatrix ("0+" + x , sample_description )
586703 dmat = np .hstack ([dmat , dmat_constrained_temp ])
587704 coef_names .extend (dmat_constrained_temp .design_info .column_names )
588705
589- # Build slices by group.
590- dmat_grouping_temp = patsy .highlevel .dmatrix ("0+" + list (constraints .values ())[i ], sample_description )
591- for j in range (dmat_grouping_temp .shape [1 ]):
592- grouping = dmat_grouping_temp [:, j ]
593- idx_constrained_group = np .where (np .sum (dmat_constrained_temp [grouping == 1 , :], axis = 0 ) > 0 )[0 ]
594- # Assert that required grouping is nested.
595- assert np .all (np .logical_xor (
596- np .sum (dmat_constrained_temp [grouping == 1 , :], axis = 0 ) > 0 ,
597- np .sum (dmat_constrained_temp [grouping == 0 , :], axis = 0 ) > 0
598- )), "proposed grouping of constraints is not nested, read docstrings"
599- # Add new string-encoded equality constraint.
600- constraints_ls .append (
601- "+" .join (list (np .asarray (dmat_constrained_temp .design_info .column_names )[idx_constrained_group ]))+ "=0"
602- )
603-
604- logging .getLogger ("batchglm" ).warning ("Built constraints: " + ", " .join (constraints_ls ))
605-
606706 # Parse design matrix to xarray.
607707 ar = xr .DataArray (dmat , dims = ("observations" , "design_params" ))
608708 ar .coords ["design_params" ] = coef_names
@@ -617,9 +717,60 @@ def setup_constrained(
617717 return ar , constraints_ar
618718
619719
720+ def string_constraints_from_dict (
721+ sample_description : pd .DataFrame ,
722+ constraints : Union [None , dict ] = {}
723+ ):
724+ r"""
725+ Create string-encoded constraints from dictionary encoded constraints and sample description.
726+
727+ :param sample_description: pandas.DataFrame of length "num_observations" containing explanatory variables as columns
728+ :param constraints: Grouped factors to enfore equality constraints on. Every element of
729+ the dictionary corresponds to one set of equality constraints. Each set has to be
730+ be an entry of the form {..., x: y, ...} where x is the factor to be constrained and y is
731+ a factor by which levels of x are grouped and then constrained. Set y="1" to constrain
732+ all levels of x to sum to one, a single equality constraint.
733+
734+ E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to
735+ zero. This is applicable if repeats of a an experiment within each condition
736+ are independent so that the set-up ~1+condition+batch is perfectly confounded.
737+
738+ Can only group by non-constrained effects right now, use constraint_matrix_from_string
739+ for other cases.
740+ :return: List of constraints as strings.
741+
742+ E.g. ["batch1 + batch2 + batch3 = 0"]
743+ """
744+ if constraints is not None :
745+ constraints_ls = []
746+ for i , x in enumerate (constraints .keys ()):
747+ assert isinstance (x , str ), "constrained should contain strings"
748+ dmat_constrained_temp = patsy .highlevel .dmatrix ("0+" + x , sample_description )
749+
750+ dmat_grouping_temp = patsy .highlevel .dmatrix ("0+" + list (constraints .values ())[i ], sample_description )
751+ for j in range (dmat_grouping_temp .shape [1 ]):
752+ grouping = dmat_grouping_temp [:, j ]
753+ idx_constrained_group = np .where (np .sum (dmat_constrained_temp [grouping == 1 , :], axis = 0 ) > 0 )[0 ]
754+ # Assert that required grouping is nested.
755+ assert np .all (np .logical_xor (
756+ np .sum (dmat_constrained_temp [grouping == 1 , :], axis = 0 ) > 0 ,
757+ np .sum (dmat_constrained_temp [grouping == 0 , :], axis = 0 ) > 0
758+ )), "proposed grouping of constraints is not nested, read docstrings"
759+ # Add new string-encoded equality constraint.
760+ constraints_ls .append (
761+ "+" .join (list (np .asarray (dmat_constrained_temp .design_info .column_names )[idx_constrained_group ])) + "=0"
762+ )
763+
764+ logging .getLogger ("batchglm" ).warning ("Built constraints: " + ", " .join (constraints_ls ))
765+ else :
766+ constraints_ls = None
767+
768+ return constraints_ls
769+
770+
620771def constraint_matrix_from_string (
621772 dmat : Union [xr .DataArray , xr .Dataset ],
622- constraints : Union [Tuple [str ], List [str ]],
773+ constraints : Union [Tuple [str , str ], List [str ]],
623774 dims : list
624775):
625776 r"""
@@ -629,8 +780,9 @@ def constraint_matrix_from_string(
629780 :param constraints: List of constraints as strings.
630781
631782 E.g. ["batch1 + batch2 + batch3 = 0"]
632- :param dims: ["design_loc_params", "loc_params"] or ["design_scale_params", "scale_params"]
633- Dimension names of xarray.
783+ :param dims: Dimension names of xarray.
784+
785+ E.g.: ["design_loc_params", "loc_params"] or ["design_scale_params", "scale_params"]
634786 :return: a constraint matrix
635787 """
636788 assert len (constraints ) > 0 , "supply constraints"
@@ -666,14 +818,13 @@ def constraint_matrix_from_string(
666818 dims = dims
667819 )
668820
669- # Test reduced design matrix for full rank before returning constraints:
821+ # Test unconstrained subset design matrix for being full rank before returning constraints:
670822 dmat_var = xr .DataArray (
671823 dims = [dmat .dims [0 ], "params" ],
672824 data = dmat [:, idx_unconstr ],
673825 coords = {dmat .dims [0 ]: dmat .coords ["observations" ].values ,
674826 "params" : dmat .coords ["design_params" ].values [idx_unconstr ]}
675827 )
676-
677828 if np .linalg .matrix_rank (dmat_var ) != np .linalg .matrix_rank (dmat_var .T ):
678829 logging .getLogger ("batchglm" ).error ("constrained design matrix is not full rank" )
679830
@@ -682,41 +833,29 @@ def constraint_matrix_from_string(
682833
683834def parse_constraints (
684835 dmat : Union [xr .DataArray , xr .Dataset ],
685- constraints : np .ndarray ,
836+ constraints : Union [ np .ndarray , xr . DataArray ] ,
686837 dims : list
687838):
688839 r"""
689840 Parse constraint matrix into xarray.
690841
691842 :param dmat: Design matrix.
692843 :param constraints: a constraint matrix.
693- :param dims: ["design_loc_params", "loc_params"] or ["design_scale_params", "scale_params"]
694- Dimension names of xarray
844+ :param dims: Dimension names of xarray.
845+
846+ E.g.: ["design_loc_params", "loc_params"] or ["design_scale_params", "scale_params"]
695847 :return: constraint matrix in xarray format
696848 """
697849 if isinstance (dmat , xr .Dataset ):
698850 dmat = dmat .data_vars ['design' ]
699851
700- constraints_ar = xr .DataArray (
701- dims = dims ,
702- data = constraints ,
703- coords = {dims [0 ]: dmat .coords ["design_params" ].values ,
704- dims [1 ]: ["var_" + str (x ) for x in range (constraints .shape [1 ])]}
705- )
706- return constraints_ar
707-
708-
709- class ChDir :
710- """
711- Context manager to temporarily change the working directory
712- """
713-
714- def __init__ (self , path ):
715- self .cwd = os .getcwd ()
716- self .other_wd = path
717-
718- def __enter__ (self ):
719- os .chdir (self .other_wd )
720-
721- def __exit__ (self , * args ):
722- os .chdir (self .cwd )
852+ if isinstance (constraints , xr .DataArray ):
853+ return constraints
854+ else :
855+ constraints_ar = xr .DataArray (
856+ dims = dims ,
857+ data = constraints ,
858+ coords = {dims [0 ]: dmat .coords ["design_params" ].values ,
859+ dims [1 ]: ["var_" + str (x ) for x in range (constraints .shape [1 ])]}
860+ )
861+ return constraints_ar
0 commit comments