Commit dc408d1

closed form jacobians are working
1 parent: f4c6d48

3 files changed: +62 −51 lines


batchglm/train/tf/nb_glm/estimator.py

Lines changed: 10 additions & 7 deletions
@@ -92,6 +92,7 @@ def map_model(idx, data) -> BasicModelGraph:
             constraints_scale=constraints_scale,
             model_vars=model_vars,
             mode=pkg_constants.HESSIAN_MODE,
+            iterator=True,
             dtype=dtype
         )

@@ -104,6 +105,7 @@ def map_model(idx, data) -> BasicModelGraph:
             constraints_scale=constraints_scale,
             model_vars=model_vars,
             mode=pkg_constants.JACOBIAN_MODE,
+            iterator=True,
             dtype=dtype
         )
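Editorial note (not part of the commit): judging from the new "iterator == True and batch_model is None" branch in jacobians.py below, iterator=True appears to tell the Hessians and Jacobians graphs to treat batched_data as a dataset iterator to be map-reduced over, rather than as a single materialized batch.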

@@ -252,7 +254,7 @@ def __init__(
 
         # Define the jacobian on the batched model for Newton-Raphson:
         batch_jac = Jacobians(
-            batched_data=batch_X,
+            batched_data=batch_data,
             sample_indices=batch_sample_index,
             batch_model=batch_model,
             constraints_loc=constraints_loc,
@@ -394,8 +396,9 @@ def __init__(
             name="full_data_trainers_b_only"
         )
         with tf.name_scope("full_gradient"):
-            full_gradient = full_data_trainers.gradient[0][0]
-            full_gradient = tf.reduce_sum(tf.abs(full_gradient), axis=0)
+            # full_gradient = full_data_trainers.gradient[0][0]
+            # full_gradient = tf.reduce_sum(tf.abs(full_gradient), axis=0)
+            full_gradient = full_data_model.neg_jac
         # full_gradient = tf.add_n(
         #     [tf.reduce_sum(tf.abs(grad), axis=0) for (grad, var) in full_data_trainers.gradient])

@@ -404,12 +407,12 @@ def __init__(
             # Full data model:
             param_grad_vec = full_data_model.neg_jac
             # param_grad_vec = tf.gradients(- full_data_model.log_likelihood, model_vars.params)[0]
-            param_grad_vec_t = tf.transpose(param_grad_vec)
+            # param_grad_vec_t = tf.transpose(param_grad_vec)
 
             delta_t = tf.squeeze(tf.matrix_solve_ls(
                 full_data_model.neg_hessian,
                 # (full_data_model.hessians + tf.transpose(full_data_model.hessians, perm=[0, 2, 1])) / 2,  # don't need this with closed forms
-                tf.expand_dims(param_grad_vec_t, axis=-1),
+                tf.expand_dims(param_grad_vec, axis=-1),
                 fast=False
             ), axis=-1)
             delta = tf.transpose(delta_t)
@@ -424,11 +427,11 @@ def __init__(
             param_grad_vec_batched = batch_jac.neg_jac
             # param_grad_vec_batched = tf.gradients(- batch_model.log_likelihood,
             #                                       model_vars.params)[0]
-            param_grad_vec_batched_t = tf.transpose(param_grad_vec_batched)
+            # param_grad_vec_batched_t = tf.transpose(param_grad_vec_batched)
 
             delta_batched_t = tf.squeeze(tf.matrix_solve_ls(
                 batch_hessians.neg_hessian,
-                tf.expand_dims(param_grad_vec_batched_t, axis=-1),
+                tf.expand_dims(param_grad_vec_batched, axis=-1),
                 fast=False
             ), axis=-1)
             delta_batched = tf.transpose(delta_batched_t)
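Editorial note (not part of the commit): the two hunks above drop the explicit transposes because the closed-form negative Jacobian is now already laid out as [features, parameters] (see the tf.transpose added to _jac in jacobians.py below). A minimal NumPy sketch, with made-up shapes, of the per-feature Newton-Raphson step that the batched tf.matrix_solve_ls performs here:

    import numpy as np

    rng = np.random.default_rng(0)
    n_features, n_par = 5, 3  # hypothetical sizes

    # One small symmetric positive-definite system per feature, standing in
    # for full_data_model.neg_hessian: [features, parameters, parameters].
    A = rng.normal(size=(n_features, n_par, n_par))
    neg_hessian = A @ A.transpose(0, 2, 1) + n_par * np.eye(n_par)

    # Closed-form negative Jacobian, already [features, parameters],
    # so no transpose is needed before the solve.
    neg_jac = rng.normal(size=(n_features, n_par))

    # Solve H_f @ delta_f = g_f for every feature f, as the batched
    # tf.matrix_solve_ls call does; drop the trailing unit dimension.
    delta_t = np.linalg.solve(neg_hessian, neg_jac[..., None])[..., 0]

    # Parameters are stored [parameters, features], hence the final transpose.
    delta = delta_t.T  # Newton-Raphson update direction for model_vars.params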

batchglm/train/tf/nb_glm/jacobians.py

Lines changed: 32 additions & 32 deletions
@@ -28,9 +28,9 @@ def _coef_invariant_a(
     of i and j.
 
     .. math::
 
-        &J^{m}_{i} = X^m_i*Y-X^m_i*(Y+r)*\frac{mu}{mu+r} \\
-        &const = (Y+r)*\frac{mu}{mu+r} \\
-        &J^{m}_{i} = X^m_i*Y-X^m_i*const \\
+        &J^{m}_{i} = X^m_i*\bigg(Y-(Y+r)*\frac{mu}{mu+r}\bigg) \\
+        &const = Y-(Y+r)*\frac{mu}{mu+r} \\
+        &J^{m}_{i} = X^m_i*const \\
 
     :param X: tf.tensor observations x features
         Observation by observation and feature.
@@ -43,13 +43,14 @@ def _coef_invariant_a(
         Coefficient invariant terms of hessian of
         given observations and features.
     """
-    const = tf.multiply(
-        tf.add(X, r),  # [observations, features]
+    const = tf.multiply(  # [observations, features]
+        tf.add(X, r),
         tf.divide(
-            mu,  # [observations, features]
+            mu,
             tf.add(mu, r)
         )
     )
+    const = tf.subtract(X, const)
     return const
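Editorial check of the corrected term (not part of the diff): for one observation with count Y, dispersion r and mean \mu = \exp(x^\top a) under the log link, the negative binomial log-likelihood has the score

    \frac{\partial \ell}{\partial a_i}
        = x_i \, \mu \left( \frac{Y}{\mu} - \frac{Y + r}{\mu + r} \right)
        = x_i \left( Y - (Y + r) \, \frac{\mu}{\mu + r} \right),

so the coefficient-invariant factor is const = Y - (Y + r) \mu / (\mu + r). The added tf.subtract(X, const) folds the leading Y term into the invariant, which is what lets _a_byobs below collapse to a single matmul.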

@@ -68,11 +69,11 @@ def _coef_invariant_b(
     of i and j.
 
     .. math::
 
-        GJ{r}_{i} &= X^r_i \\
-            &*r*\bigg(psi_0(r+Y)+psi_0(r) \\
+        J{r}_{i} &= X^r_i \\
+            &*r*\bigg(psi_0(r+Y)-psi_0(r) \\
             &-\frac{r+Y}{r+mu} \\
             &+log(r)+1-log(r+mu) \bigg) \\
-        const = r*\bigg(psi_0(r+Y)+psi_0(r) \\ const1
+        const = r*\bigg(psi_0(r+Y)-psi_0(r) \\ const1
             &-\frac{r+Y}{r+mu} \\ const2
             &+log(r)+1-log(r+mu) \bigg) \\ const3
         J^{r}_{i} &= X^r_i * const \\
@@ -88,22 +89,19 @@ def _coef_invariant_b(
         Coefficient invariant terms of hessian of
         given observations and features.
     """
-    scalar_one = tf.constant(1, shape=(), dtype=X.dtype)
+    scalar_one = tf.constant(1, shape=[1, 1], dtype=X.dtype)
     # Pre-define sub-graphs that are used multiple times:
-    r_plus_mu = r + mu
-    r_plus_x = r + X
+    r_plus_mu = tf.add(r, mu)
+    r_plus_x = tf.add(r, X)
     # Define graphs for individual terms of constant term of hessian:
-    const1 = tf.add(  # [observations, features]
+    const1 = tf.subtract(
         tf.math.digamma(x=r_plus_x),
         tf.math.digamma(x=r)
     )
-    const2 = tf.negative(tf.divide(
-        r_plus_x,
-        r_plus_mu
-    ))
-    const3 = tf.add(  # [observations, features]
+    const2 = tf.negative(tf.divide(r_plus_x, r_plus_mu))
+    const3 = tf.add(
         tf.log(r),
-        scalar_two - tf.log(r_plus_mu)
+        tf.subtract(scalar_one, tf.log(r_plus_mu))
     )
     const = tf.add_n([const1, const2, const3])  # [observations, features]
     const = tf.multiply(r, const)
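Editorial check (not part of the diff): with r = \exp(x^\top b) for the dispersion model, differentiating the negative binomial log-likelihood

    \ell = \log\Gamma(Y + r) - \log\Gamma(r) - \log\Gamma(Y + 1)
         + r \log\frac{r}{r + \mu} + Y \log\frac{\mu}{r + \mu}

with respect to b_i via the chain rule \partial r / \partial b_i = x_i r gives

    \frac{\partial \ell}{\partial b_i}
        = x_i \, r \left( \psi_0(r + Y) - \psi_0(r) - \frac{r + Y}{r + \mu} + \log r + 1 - \log(r + \mu) \right),

which matches const above: the digamma terms enter with opposite signs (the const1 sign fix), and the additive constant is one, which also explains replacing the previously undefined scalar_two with scalar_one.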
@@ -178,12 +176,10 @@ def __init__(
             )
             self.neg_jac = tf.negative(self.jac)
         elif mode == "tf":
-            if batch_model is None:
-                raise ValueError("mode tf only possible if batch_model is given to Jacobians.")
             # tensorflow computes the jacobian based on the objective,
             # which is the negative log-likelihood. Accordingly, the jacobian
             # is the negative jacobian computed here.
-            self.neg_jac = self.tf(
+            self.jac = self.tf(
                 batched_data=batched_data,
                 sample_indices=sample_indices,
                 batch_model=batch_model,
@@ -193,7 +189,7 @@ def __init__(
                 iterator=iterator,
                 dtype=dtype
             )
-            self.jac = tf.negative(self.neg_jac)
+            self.neg_jac = tf.negative(self.jac)
         else:
             raise ValueError("mode not recognized in Jacobian: " + mode)

@@ -225,19 +221,16 @@ def _a_byobs(X, design_loc, design_scale, mu, r):
         :return Jblock: tf.tensor features x coefficients
             Block of jacobian.
         """
-        const = _coef_invariant_a(X=X, mu=mu, r=r)  # [observations x features]
-        Jblock = tf.subtract(  # [features x coefficients]
-            tf.matmul(tf.transpose(X), design_loc, axes=1),
-            tf.matmul(tf.transpose(const), design_loc, axes=1)
-        )
+        const = _coef_invariant_a(X=X, mu=mu, r=r)  # [observations, features]
+        Jblock = tf.matmul(tf.transpose(const), design_loc)  # [features, coefficients]
         return Jblock
 
     def _b_byobs(X, design_loc, design_scale, mu, r):
         """
         Compute the dispersion model block of the jacobian.
         """
-        const = _coef_invariant_b(X=X, mu=mu, r=r)  # [observations x features]
-        Jblock = tf.matmul(tf.transpose(const), design_loc, axes=1)  # [features x coefficients]
+        const = _coef_invariant_b(X=X, mu=mu, r=r)  # [observations, features]
+        Jblock = tf.matmul(tf.transpose(const), design_scale)  # [features, coefficients]
         return Jblock
 
     def _assemble_bybatch(idx, data):
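Editorial sketch (not part of the commit) of what the simplified blocks compute, in NumPy with made-up shapes; note the fix that _b_byobs now multiplies with design_scale rather than design_loc:

    import numpy as np

    rng = np.random.default_rng(1)
    n_obs, n_feat, n_loc, n_scale = 8, 4, 3, 2  # hypothetical sizes

    const_a = rng.normal(size=(n_obs, n_feat))   # stand-in for _coef_invariant_a output
    const_b = rng.normal(size=(n_obs, n_feat))   # stand-in for _coef_invariant_b output
    design_loc = rng.normal(size=(n_obs, n_loc))
    design_scale = rng.normal(size=(n_obs, n_scale))

    J_a = const_a.T @ design_loc    # mean-model block, [features, loc coefficients]
    J_b = const_b.T @ design_scale  # dispersion block, [features, scale coefficients]
    J = np.concatenate([J_a, J_b], axis=1)  # [features, coefficients]

How batchglm actually joins the two blocks is not shown in this hunk, so the final concatenation is illustrative only.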
@@ -310,6 +303,7 @@ def _red(prev, cur):
             idx=sample_indices,
             data=batched_data
         )
+
         return J
 
     def tf(
@@ -328,7 +322,9 @@ def tf(
         """
 
         def _jac(batch_model, model_vars):
-            return tf.gradients(batch_model.log_likelihood, model_vars.params)[0]
+            J = tf.gradients(batch_model.log_likelihood, model_vars.params)[0]
+            J = tf.transpose(J)
+            return J
 
         def _assemble_bybatch(idx, data):
             """
@@ -364,7 +360,7 @@ def _assemble_bybatch(idx, data):
                 size_factors=size_factors
             )
 
-            J = _jac(batch_model=batch_model, model_vars=model_vars)
+            J = _jac(batch_model=model, model_vars=model_vars)
             return J
 
         def _red(prev, cur):
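Editorial note (not part of the commit): tf.gradients returns a tensor shaped like model_vars.params, i.e. [coefficients, features], so the tf.transpose added in _jac brings the tf-mode Jacobian to the same [features, coefficients] layout as the analytic mode; correspondingly, __init__ now assigns the positive Jacobian first and negates it (the swapped jac/neg_jac lines above). The _jac(batch_model=model, ...) change presumably passes the BasicModelGraph built inside _assemble_bybatch, which is named model there.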
@@ -378,6 +374,10 @@ def _red(prev, cur):
             """
             return tf.add(prev, cur)
 
+        params = model_vars.params
+        p_shape_a = model_vars.a.shape[0]
+        p_shape_b = model_vars.b.shape[0]
+
         if iterator==True and batch_model is None:
             J = op_utils.map_reduce(
                 last_elem=tf.gather(sample_indices, tf.size(sample_indices) - 1),
batchglm/unit_test/test_nb_glm_jacobians.py

Lines changed: 20 additions & 12 deletions
@@ -20,7 +20,7 @@
 def estimate(input_data: InputData):
     estimator = Estimator(input_data)
     estimator.initialize()
-    estimator.train_sequence(training_strategy="QUICK")
+    # Do not train; evaluate at initialization!
     return estimator
@@ -52,28 +52,36 @@ def test_compute_hessians(self):
 
         input_data = InputData.new(sim.X, design_loc=design_loc, design_scale=design_scale)
 
-        pkg_constants.JACOBIAN_MODE = "tf"
-        self.estimator_ow = estimate(input_data)
-        t0_tf = time.time()
-        self.J_tf = self.estimator_ow.hessians
-        t1_tf = time.time()
-        self.estimator_ow.close_session()
-        self.t_tf = t1_tf - t0_tf
-
         pkg_constants.JACOBIAN_MODE = "analytic"
         self.estimator_analytic = estimate(input_data)
         t0_analytic = time.time()
-        self.J_analytic = self.estimator_analytic.jac
+        self.J_analytic = self.estimator_analytic['full_gradient']
+        self.a_analytic = self.estimator_analytic.a.values
+        self.b_analytic = self.estimator_analytic.b.values
         t1_analytic = time.time()
         self.estimator_analytic.close_session()
         self.t_analytic = t1_analytic - t0_analytic
 
+        pkg_constants.JACOBIAN_MODE = "tf"
+        self.estimator_tf = estimate(input_data)
+        t0_tf = time.time()
+        self.J_tf = self.estimator_tf['full_gradient']
+        self.a_tf = self.estimator_tf.a.values
+        self.b_tf = self.estimator_tf.b.values
+        t1_tf = time.time()
+        self.estimator_tf.close_session()
+        self.t_tf = t1_tf - t0_tf
+
         i = 1
         print("\n")
         print("run time tensorflow solution: ", str(self.t_tf))
         print("run time observation batch-wise analytic solution: ", str(self.t_analytic))
-        print("ratio of analytic jacobian to analytic observation-wise jacobian:")
-        print(self.J_tf.values[i, :] / self.J_analytic.values[i, :])
+        print("relative difference of mean estimates for analytic jacobian to observation-wise jacobian:")
+        print((self.a_analytic - self.a_tf) / self.a_tf)
+        print("relative difference of dispersion estimates for analytic jacobian to observation-wise jacobian:")
+        print((self.b_analytic - self.b_tf) / self.b_tf)
+        print("relative difference of analytic jacobian to analytic observation-wise jacobian:")
+        print((self.J_tf - self.J_analytic)/self.J_tf)
 
 
 if __name__ == '__main__':
