 from .base import param_bounds, tf_clip_param, np_clip_param, apply_constraints

 from .external import AbstractEstimator, XArrayEstimatorStore, InputData, Model, MonitoredTFEstimator, TFEstimatorGraph
-from .external import nb_utils, train_utils, op_utils, rand_utils
+from .external import nb_utils, train_utils, op_utils, rand_utils, data_utils
 from .external import pkg_constants
 from .hessians import Hessians
+from .jacobians import Jacobians

 logger = logging.getLogger(__name__)

@@ -91,6 +92,20 @@ def map_model(idx, data) -> BasicModelGraph:
                 constraints_scale=constraints_scale,
                 model_vars=model_vars,
                 mode=pkg_constants.HESSIAN_MODE,
+                iterator=True,
+                dtype=dtype
+            )
+
+        with tf.name_scope("jacobians"):
+            jacobians = Jacobians(
+                batched_data=batched_data,
+                sample_indices=sample_indices,
+                batch_model=None,
+                constraints_loc=constraints_loc,
+                constraints_scale=constraints_scale,
+                model_vars=model_vars,
+                mode=pkg_constants.JACOBIAN_MODE,
+                iterator=True,
                 dtype=dtype
             )

@@ -121,6 +136,8 @@ def map_model(idx, data) -> BasicModelGraph:
         self.norm_neg_log_likelihood = norm_neg_log_likelihood
         self.loss = loss

+        self.jac = jacobians.jac
+        self.neg_jac = jacobians.neg_jac
         self.hessian = hessians.hessian
         self.neg_hessian = hessians.neg_hessian

@@ -235,7 +252,20 @@ def __init__(
             # use the mean loss to keep a constant learning rate independently of the batch size
             batch_loss = batch_model.loss

-            # Define the hessian on the batched model:
+            # Define the jacobian on the batched model for newton-raphson:
+            batch_jac = Jacobians(
+                batched_data=batch_data,
+                sample_indices=batch_sample_index,
+                batch_model=batch_model,
+                constraints_loc=constraints_loc,
+                constraints_scale=constraints_scale,
+                model_vars=model_vars,
+                mode="analytic",
+                iterator=False,
+                dtype=dtype
+            )
+
+            # Define the hessian on the batched model for newton-raphson:
             batch_hessians = Hessians(
                 batched_data=batch_data,
                 singleobs_data=None,
@@ -366,21 +396,23 @@ def __init__(
                 name="full_data_trainers_b_only"
             )
             with tf.name_scope("full_gradient"):
-                full_gradient = full_data_trainers.gradient[0][0]
-                full_gradient = tf.reduce_sum(tf.abs(full_gradient), axis=0)
+                # full_gradient = full_data_trainers.gradient[0][0]
+                # full_gradient = tf.reduce_sum(tf.abs(full_gradient), axis=0)
+                full_gradient = full_data_model.neg_jac
                 # full_gradient = tf.add_n(
                 #     [tf.reduce_sum(tf.abs(grad), axis=0) for (grad, var) in full_data_trainers.gradient])

             with tf.name_scope("newton-raphson"):
                 # tf.gradients(- full_data_model.log_likelihood, [model_vars.a, model_vars.b])
                 # Full data model:
-                param_grad_vec = tf.gradients(- full_data_model.log_likelihood, model_vars.params)[0]
-                param_grad_vec_t = tf.transpose(param_grad_vec)
+                param_grad_vec = full_data_model.neg_jac
+                # param_grad_vec = tf.gradients(- full_data_model.log_likelihood, model_vars.params)[0]
+                # param_grad_vec_t = tf.transpose(param_grad_vec)

                 delta_t = tf.squeeze(tf.matrix_solve_ls(
                     full_data_model.neg_hessian,
                     # (full_data_model.hessians + tf.transpose(full_data_model.hessians, perm=[0, 2, 1])) / 2,  # don't need this with closed forms
-                    tf.expand_dims(param_grad_vec_t, axis=-1),
+                    tf.expand_dims(param_grad_vec, axis=-1),
                     fast=False
                 ), axis=-1)
                 delta = tf.transpose(delta_t)
@@ -392,13 +424,14 @@ def __init__(
                 )

                 # Batched data model:
-                param_grad_vec_batched = tf.gradients(- batch_model.log_likelihood,
-                                                      model_vars.params)[0]
-                param_grad_vec_batched_t = tf.transpose(param_grad_vec_batched)
+                param_grad_vec_batched = batch_jac.neg_jac
+                # param_grad_vec_batched = tf.gradients(- batch_model.log_likelihood,
+                #                                       model_vars.params)[0]
+                # param_grad_vec_batched_t = tf.transpose(param_grad_vec_batched)

                 delta_batched_t = tf.squeeze(tf.matrix_solve_ls(
                     batch_hessians.neg_hessian,
-                    tf.expand_dims(param_grad_vec_batched_t, axis=-1),
+                    tf.expand_dims(param_grad_vec_batched, axis=-1),
                     fast=False
                 ), axis=-1)
                 delta_batched = tf.transpose(delta_batched_t)
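The two newton-raphson hunks above swap the autodiff gradient (tf.gradients) for the analytic negative Jacobian and feed it into a per-feature linear solve against the negative Hessian. As a reference point only (not part of this commit), a minimal NumPy sketch of that update; the shapes are assumptions inferred from how neg_jac, neg_hessian and model_vars.params are used here:

    import numpy as np

    def newton_step(params, neg_jac, neg_hessian):
        # params:      [params, features]   (orientation of model_vars.params)
        # neg_jac:     [features, params]   gradient of the negative log-likelihood
        # neg_hessian: [features, params, params]
        # Solve neg_hessian[f] @ delta[f] = neg_jac[f] for every feature f,
        # mirroring tf.matrix_solve_ls(..., fast=False) above.
        delta_t = np.linalg.solve(neg_hessian, neg_jac[..., None])[..., 0]  # [features, params]
        return params - delta_t.T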
@@ -741,6 +774,9 @@ def __init__(
             shape=[input_data.num_observations, input_data.num_features]
         )

+        groupwise_means = None  # [groups, features]
+        overall_means = None  # [1, features]
+        logger.debug(" * Initialize mean model")
         if isinstance(init_a, str):
             # Chose option if auto was chosen
             if init_a.lower() == "auto":
@@ -766,12 +802,12 @@ def __init__(
                    if size_factors_init is not None:
                        X = np.divide(X, size_factors_init)

-                   mean = X.groupby("group").mean(dim="observations").values
+                   groupwise_means = X.groupby("group").mean(dim="observations").values
                    # clipping
-                   mean = np_clip_param(mean, "mu")
+                   groupwise_means = np_clip_param(groupwise_means, "mu")
                    # mean = np.nextafter(0, 1, out=mean.values, where=mean == 0, dtype=mean.dtype)

-                   a = np.log(mean)
+                   a = np.log(groupwise_means)
                    if input_data.constraints_loc is not None:
                        a_constraints = np.zeros([input_data.constraints_loc.shape[0], a.shape[1]])
                        # Add constraints (sum to zero) to value vector to remove structural unidentifiability.
@@ -781,7 +817,9 @@ def __init__(
                        # inv_design = np.linalg.inv(unique_design_loc)  # NOTE: this is exact if full rank!
                        # init_a = np.matmul(inv_design, a)
                        #
-                       # Better option: use least-squares solver to calculate a'
+                       # Use least-squares solver to calculate a':
+                       # This is faster and more accurate than using matrix inversion.
+                       logger.debug(" ** Solve lstsq problem")
                        a_prime = np.linalg.lstsq(unique_design_loc, a, rcond=None)
                        init_a = a_prime[0]
                        # stat_utils.rmsd(np.exp(unique_design_loc @ init_a), mean)
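The closed-form location initialization above works on the log scale: group-wise means are clipped, log-transformed, and then mapped onto the design-matrix parameterization with a least-squares solve rather than an explicit matrix inverse. A standalone sketch with hypothetical shapes (not part of this commit):

    import numpy as np

    # Hypothetical inputs, one row per distinct design group:
    # unique_design_loc: [groups, design_loc_params], groupwise_means: [groups, features]
    log_means = np.log(groupwise_means)  # log link; np_clip_param would bound the means first
    init_a, *_ = np.linalg.lstsq(unique_design_loc, log_means, rcond=None)
    # init_a: [design_loc_params, features]; coincides with the exact solution
    # when unique_design_loc is square and full rank.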
@@ -799,18 +837,19 @@ def __init__(
                except np.linalg.LinAlgError:
                    logger.warning("Closed form initialization failed!")
            elif init_a.lower() == "standard":
-               mean = input_data.X.mean(dim="observations").values  # directly calculate the mean
+               overall_means = input_data.X.mean(dim="observations").values  # directly calculate the mean
                # clipping
-               mean = np_clip_param(mean, "mu")
+               overall_means = np_clip_param(overall_means, "mu")
                # mean = np.nextafter(0, 1, out=mean, where=mean == 0, dtype=mean.dtype)

                init_a = np.zeros([input_data.num_design_loc_params, input_data.num_features])
-               init_a[0, :] = np.log(mean)
+               init_a[0, :] = np.log(overall_means)
                self._train_mu = True

                logger.info("Using standard initialization for mean")
                logger.info("Should train mu: %s", self._train_mu)

+       logger.debug(" * Initialize dispersion model")
        if isinstance(init_b, str):
            if init_b.lower() == "auto":
                init_b = "closed_form"
@@ -837,14 +876,20 @@ def __init__(
                    if input_data.size_factors is not None:
                        X = np.divide(X, size_factors_init)

-                   Xdiff = X - np.exp(input_data.design_loc @ init_a)
+                   # Xdiff = X - np.exp(input_data.design_loc @ init_a)
+                   # Define an xarray version of init_a so that Xdiff can be evaluated lazily by dask.
+                   init_a_xr = data_utils.xarray_from_data(init_a, dims=("design_loc_params", "features"))
+                   init_a_xr.coords["design_loc_params"] = input_data.design_loc.coords["design_loc_params"]
+                   logger.debug(" ** Define Xdiff")
+                   Xdiff = X - np.exp(input_data.design_loc.dot(init_a_xr))
                    variance = np.square(Xdiff).groupby("group").mean(dim="observations")

-                   group_mean = X.groupby("group").mean(dim="observations")
-                   denominator = np.fmax(variance - group_mean, 0)
+                   if groupwise_means is None:
+                       groupwise_means = X.groupby("group").mean(dim="observations")
+                   denominator = np.fmax(variance - groupwise_means, 0)
                    denominator = np.nextafter(0, 1, out=denominator.values, where=denominator == 0,
                                               dtype=denominator.dtype)
-                   r = np.asarray(np.square(group_mean) / denominator)
+                   r = np.asarray(np.square(groupwise_means) / denominator)
                    # clipping
                    r = np_clip_param(r, "r")
                    # r = np.nextafter(0, 1, out=r.values, where=r == 0, dtype=r.dtype)
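The dispersion initialization above is a method-of-moments estimate for the negative binomial model: with Var(x) = mu + mu^2 / r, solving for r gives r = mu^2 / (Var(x) - mu), which is what the group-wise computation in this hunk evaluates. A small NumPy sketch of the same estimate (not part of this commit):

    import numpy as np

    def nb_mom_dispersion(group_means, group_variances):
        # r = mu^2 / (Var - mu); guard the denominator so that Var <= mu does not
        # yield a non-positive dispersion (mirrors the np.fmax / np.nextafter lines above).
        denom = np.fmax(group_variances - group_means, 0.0)
        denom = np.where(denom == 0.0, np.nextafter(0, 1), denom)
        return np.square(group_means) / denom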
@@ -860,7 +905,9 @@ def __init__(
                        # inv_design = np.linalg.inv(unique_design_scale)  # NOTE: this is exact if full rank!
                        # init_b = np.matmul(inv_design, b)
                        #
-                       # Better option: use least-squares solver to calculate b''
+                       # Use least-squares solver to calculate b':
+                       # This is faster and more accurate than using matrix inversion.
+                       logger.debug(" ** Solve lstsq problem")
                        b_prime = np.linalg.lstsq(unique_design_scale, b, rcond=None)
                        init_b = b_prime[0]

@@ -981,6 +1028,7 @@ def fetch_fn(idx):
        else:
            init_b = init_b.astype(dtype)

+       logger.debug(" * Start creating model")
        with graph.as_default():
            # create model
            model = EstimatorGraph(
@@ -999,6 +1047,7 @@ def fetch_fn(idx):
                extended_summary=extended_summary,
                dtype=dtype
            )
+       logger.debug(" * Finished creating model")

        MonitoredTFEstimator.__init__(self, model)
