
Commit 8dcd460

improved numpy backend convergence decision, reduced unnecessary likelihood evaluations
1 parent b57229f commit 8dcd460
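
For orientation, the reworked training loop decides per-feature convergence by comparing the relative decrease of the negative log-likelihood against pkg_constants.LLTOL_BY_FEATURE (see the converged_f computation in the diff below). The snippet below is a minimal sketch that exercises that criterion in isolation; the loss values and the tolerance constant are made-up stand-ins, not part of the commit.

import numpy as np

# Hypothetical stand-in for pkg_constants.LLTOL_BY_FEATURE.
LLTOL_BY_FEATURE = 1e-6

# Made-up negative log-likelihoods per feature before and after an update.
ll_previous = np.array([1000.0, 500.0, 250.0])
ll_current = np.array([999.0, 500.0000001, 251.0])

# A feature counts as converged if its loss got worse (such a step is reverted anyway)
# or if the relative improvement fell below the tolerance.
converged_f = np.logical_or(
    ll_previous < ll_current, # loss gets worse
    np.abs(ll_previous - ll_current) / np.maximum(
        np.nextafter(0, np.inf, dtype=ll_previous.dtype), # catch division by zero
        np.abs(ll_previous)
    ) < LLTOL_BY_FEATURE,
)
print(converged_f)  # [False  True  True]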

File tree

1 file changed

+144 -66 lines changed


batchglm/train/numpy/base_glm/estimator.py

Lines changed: 144 additions & 66 deletions
@@ -56,68 +56,137 @@ def train(
             nproc: int = 3,
             **kwargs
     ):
+        """
+        Train GLM.
+
+        Convergence decision:
+        Location and scale model updates are done in separate iterations and use different algorithms.
+        Scale model updates are much less frequent (only every update_b_freq-th iteration) as they are much slower.
+        During a stretch of update_b_freq location model updates between two scale model updates, convergence
+        of the location model is tracked with self.model.converged. This is reset after a scale model update, as this
+        convergence only holds conditioned on a particular scale model value.
+        Full convergence of a feature-wise model is evaluated after each scale model update: if the loss-function-based
+        convergence criterion holds across the cumulative updates of the sequence of location updates and the last scale
+        model update, the feature is considered converged. For this, the loss value at the last scale model update is
+        saved in ll_last_b_update. Full convergence is saved in fully_converged.
+
+        :param max_steps:
+        :param method_b:
+        :param update_b_freq: One over the minimum frequency of scale model updates per location model update.
+            A scale model update will be run at least every update_b_freq location model update iterations.
+        :param ftol_b:
+        :param lr_b:
+        :param max_iter_b:
+        :param nproc:
+        :param kwargs:
+        :return:
+        """
         # Iterate until conditions are fulfilled.
         train_step = 0
-        delayed_converged = np.tile(False, self.model.model_vars.n_features)
+        epochs_until_b_update = update_b_freq
+        fully_converged = np.tile(False, self.model.model_vars.n_features)
 
         ll_current = - self.model.ll_byfeature.compute()
+        ll_last_b_update = ll_current.copy()
         #logging.getLogger("batchglm").info(
         sys.stdout.write("iter %i: ll=%f\n" % (0, np.sum(ll_current)))
-        while np.any(np.logical_not(delayed_converged)) and \
+        while np.any(np.logical_not(fully_converged)) and \
                 train_step < max_steps:
             t0 = time.time()
-            # Update parameters:
             # Line search step for scale model:
-            if train_step % update_b_freq == 0 and train_step > 0:
-                if isinstance(self.model.b_var, dask.array.core.Array):
-                    b_var_cache = self.model.b_var.compute()
-                else:
-                    b_var_cache = self.model.b_var.copy()
-                self.model.b_var = self.b_step(
-                    idx=np.where(np.logical_not(delayed_converged))[0],
+            # Run this update every update_b_freq iterations.
+            if epochs_until_b_update == 0:
+                # Compute update.
+                idx_update = np.where(np.logical_not(fully_converged))[0]
+                b_step = self.b_step(
+                    idx_update=idx_update,
                     method=method_b,
                     ftol=ftol_b,
                     lr=lr_b,
                     max_iter=max_iter_b,
                     nproc=nproc
                 )
+                # Perform trial update.
+                self.model.b_var = self.model.b_var + b_step
                 # Reverse update by feature if update leads to worse loss:
-                ll_proposal = - self.model.ll_byfeature.compute()
+                ll_proposal = - self.model.ll_byfeature_j(j=idx_update).compute()
+                idx_bad_step = idx_update[np.where(ll_proposal > ll_current[idx_update])[0]]
                 if isinstance(self.model.b_var, dask.array.core.Array):
                     b_var_new = self.model.b_var.compute()
                 else:
                     b_var_new = self.model.b_var.copy()
-                b_var_new[:, ll_proposal > ll_current] = b_var_cache[:, ll_proposal > ll_current]
+                b_var_new[:, idx_bad_step] = b_var_new[:, idx_bad_step] - b_step[:, idx_bad_step]
                 self.model.b_var = b_var_new
-                delayed_converged = self.model.converged.copy()
-            # IWLS step for location model:
-            if np.any(np.logical_not(self.model.converged)) or train_step % update_b_freq == 0 and train_step > 0:
-                self.model.a_var = self.model.a_var + self.iwls_step()
-            # Evaluate convergence
-            ll_previous = ll_current
-            ll_current = - self.model.ll_byfeature.compute()
-            converged_f = np.logical_or(
-                ll_previous < ll_current, # loss gets worse
-                np.abs(ll_previous - ll_current) / np.maximum( # relative decrease in loss is too small
-                    np.nextafter(0, np.inf, dtype=ll_previous.dtype), # catch division by zero
-                    np.abs(ll_previous)
-                ) < pkg_constants.LLTOL_BY_FEATURE,
-            )
-            # Location model convergence status has to be updated if b model was updated
-            if train_step % update_b_freq == 0 and train_step > 0:
-                self.model.converged = converged_f
-                delayed_converged = converged_f
+                # Update likelihood vector with updated genes based on already evaluated proposal likelihood.
+                ll_new = ll_current.copy()
+                ll_new[idx_update] = ll_proposal
+                ll_new[idx_bad_step] = ll_current[idx_bad_step]
+                # Reset b model update counter.
+                epochs_until_b_update = update_b_freq
+            else:
+                # IWLS step for location model:
+                # Compute update.
+                idx_update = self.model.idx_not_converged
+                a_step = self.iwls_step(idx_update=idx_update)
+                # Perform trial update.
+                self.model.a_var = self.model.a_var + a_step
+                # Reverse update by feature if update leads to worse loss:
+                ll_proposal = - self.model.ll_byfeature_j(j=idx_update).compute()
+                idx_bad_step = idx_update[np.where(ll_proposal > ll_current[idx_update])[0]]
+                if isinstance(self.model.b_var, dask.array.core.Array):
+                    a_var_new = self.model.a_var.compute()
+                else:
+                    a_var_new = self.model.a_var.copy()
+                a_var_new[:, idx_bad_step] = a_var_new[:, idx_bad_step] - a_step[:, idx_bad_step]
+                self.model.a_var = a_var_new
+                # Update likelihood vector with updated genes based on already evaluated proposal likelihood.
+                ll_new = ll_current.copy()
+                ll_new[idx_update] = ll_proposal
+                ll_new[idx_bad_step] = ll_current[idx_bad_step]
+                # Update epoch counter of a updates until next b update:
+                epochs_until_b_update -= 1
+
+            # Evaluate and update convergence:
+            ll_previous = ll_current
+            ll_current = ll_new
+            if epochs_until_b_update == update_b_freq: # b step update was executed.
+                # Update terminal convergence in fully_converged and intermediate convergence in self.model.converged.
+                converged_f = np.logical_or(
+                    ll_last_b_update < ll_current, # loss gets worse
+                    np.abs(ll_last_b_update - ll_current) / np.maximum( # relative decrease in loss is too small
+                        np.nextafter(0, np.inf, dtype=ll_previous.dtype), # catch division by zero
+                        np.abs(ll_last_b_update)
+                    ) < pkg_constants.LLTOL_BY_FEATURE,
+                )
+                self.model.converged = np.logical_or(fully_converged, converged_f)
+                ll_last_b_update = ll_current.copy()
+                fully_converged = self.model.converged.copy()
             else:
+                # Update intermediate convergence in self.model.converged.
+                converged_f = np.logical_or(
+                    ll_previous < ll_current, # loss gets worse
+                    np.abs(ll_previous - ll_current) / np.maximum( # relative decrease in loss is too small
+                        np.nextafter(0, np.inf, dtype=ll_previous.dtype), # catch division by zero
+                        np.abs(ll_previous)
+                    ) < pkg_constants.LLTOL_BY_FEATURE,
+                )
                 self.model.converged = np.logical_or(self.model.converged, converged_f)
+                if np.all(self.model.converged):
+                    # All location models are converged. This means that the next update will be a b model
+                    # update and all remaining intermediate a model updates can be skipped:
+                    epochs_until_b_update = 0
+
+            # Conclude and report iteration.
             train_step += 1
             #logging.getLogger("batchglm").info(
             sys.stdout.write(
-                "iter %s: ll=%f, converged: %.2f%% (location model: %.2f%%), in %.2fsec\n" %
+                "iter %s: ll=%f, converged: %.2f%% (loc: %.2f%%, scale update: %s), in %.2fsec\n" %
                 (
                     (" " if train_step < 10 else "") + (" " if train_step < 100 else "") + str(train_step),
                     np.sum(ll_current),
-                    np.mean(delayed_converged)*100,
+                    np.mean(fully_converged)*100,
                     np.mean(self.model.converged) * 100,
+                    str(epochs_until_b_update == update_b_freq),
                     time.time()-t0
                 )
             )
@@ -143,6 +212,7 @@ def a_step_gd(
         :return:
         """
         iter = 0
+        a_var_old = self.model.a_var.compute()
         converged = np.tile(True, self.model.model_vars.n_features)
         converged[idx] = False
         ll_current = - self.model.ll_byfeature.compute()
@@ -170,15 +240,18 @@ def a_step_gd(
                     np.mean(converged) * 100
                 )
             )
-        return self.model.a_var.compute()
+        return self.model.a_var.compute() - a_var_old
 
-    def iwls_step(self) -> np.ndarray:
+    def iwls_step(
+            self,
+            idx_update: np.ndarray
+    ) -> np.ndarray:
         """
 
        :return: (inferred param x features)
         """
-        w = self.model.fim_weight_j(j=self.model.idx_not_converged) # (observations x features)
-        ybar = self.model.ybar_j(j=self.model.idx_not_converged) # (observations x features)
+        w = self.model.fim_weight_j(j=idx_update) # (observations x features)
+        ybar = self.model.ybar_j(j=idx_update) # (observations x features)
         # Translate to problem of form ax = b for each feature:
         # (in the following, X=design and Y=counts)
         # a=X^T*W*X: ([features] x inferred param)
@@ -188,7 +261,7 @@ def iwls_step(self) -> np.ndarray:
         xhw = np.einsum('ob,of->fob', xh, w)
         a = np.einsum('fob,oc->fbc', xhw, xh)
         b = np.einsum('fob,of->fb', xhw, ybar)
-        # Via np.linalg.solve:
+
         delta_theta = np.zeros_like(self.model.a_var)
         if isinstance(delta_theta, dask.array.core.Array):
             delta_theta = delta_theta.compute()
@@ -197,31 +270,31 @@ def iwls_step(self) -> np.ndarray:
             # Have to use a workaround to solve problems in parallel in dask here. This workaround does
             # not work if there is only a single problem, ie. if the first dimension of a and b has length 1.
             if a.shape[0] != 1:
-                delta_theta[:, self.model.idx_not_converged] = dask.array.map_blocks(
+                delta_theta[:, idx_update] = dask.array.map_blocks(
                     np.linalg.solve, a, b[:, :, None], chunks=b[:, :, None].shape
                 ).squeeze().T.compute()
             else:
-                delta_theta[:, self.model.idx_not_converged] = np.expand_dims(
+                delta_theta[:, idx_update] = np.expand_dims(
                     np.linalg.solve(a[0], b[0]).compute(),
                     axis=-1
                 )
         else:
-            delta_theta[:, self.model.idx_not_converged] = np.linalg.solve(a, b).T
+            delta_theta[:, idx_update] = np.linalg.solve(a, b).T
         # Via np.linalg.lsts:
-        #delta_theta[:, self.idx_not_converged] = np.concatenate([
+        #delta_theta[:, idx_update] = np.concatenate([
         #    np.expand_dims(np.linalg.lstsq(a[i, :, :], b[i, :])[0], axis=-1)
-        #    for i in self.idx_not_converged)
+        #    for i in idx_update)
         #], axis=-1)
         # Via np.linalg.inv:
-        # #delta_theta[:, self.idx_not_converged] = np.concatenate([
+        # #delta_theta[:, idx_update] = np.concatenate([
         #    np.expand_dims(np.matmul(np.linalg.inv(a[i, :, :]), b[i, :]), axis=-1)
-        #    for i in self.idx_not_converged)
+        #    for i in idx_update)
         #], axis=-1)
         return delta_theta
 
     def b_step(
             self,
-            idx: np.ndarray,
+            idx_update: np.ndarray,
             method: str,
             ftol: float,
             lr: float,
@@ -234,14 +307,14 @@ def b_step(
         """
         if method.lower() in ["gd"]:
             return self._b_step_gd(
-                idx=idx,
+                idx_update=idx_update,
                 ftol=ftol,
                 lr=lr,
                 max_iter=max_iter
             )
         else:
             return self._b_step_loop(
-                idx=idx,
+                idx_update=idx_update,
                 method=method,
                 ftol=ftol,
                 max_iter=max_iter,
@@ -250,7 +323,7 @@ def b_step(
 
     def _b_step_gd(
             self,
-            idx: np.ndarray,
+            idx_update: np.ndarray,
             ftol: float,
             max_iter: int,
             lr: float
@@ -260,8 +333,9 @@ def _b_step_gd(
         :return:
         """
         iter = 0
+        b_var_old = self.model.b_var.compute()
         converged = np.tile(True, self.model.model_vars.n_features)
-        converged[idx] = False
+        converged[idx_update] = False
         ll_current = - self.model.ll_byfeature.compute()
         while np.any(np.logical_not(converged)) and iter < max_iter:
             idx_to_update = np.where(np.logical_not(converged))[0]
@@ -290,7 +364,7 @@ def _b_step_gd(
                     np.mean(converged) * 100
                 )
             )
-        return self.model.b_var.compute()
+        return self.model.b_var.compute() - b_var_old
 
     def optim_handle(
             self,
@@ -300,8 +374,8 @@ def optim_handle(
             max_iter,
             ftol
     ):
-
-        if isinstance(data_j, sparse._coo.core.COO) or isinstance(data_j, scipy.sparse.csr_matrix):
+        # Need to supply dense numpy array to scipy optimize:
+        if isinstance(data_j, sparse.COO) or isinstance(data_j, scipy.sparse.csr_matrix):
             data_j = data_j.todense()
         if len(data_j.shape) == 1:
             data_j = np.expand_dims(data_j, axis=-1)
@@ -323,7 +397,7 @@ def cost_b_var(x, data_jj, eta_loc_jj, xh_scale_jj):
 
     def _b_step_loop(
             self,
-            idx: np.ndarray,
+            idx_update: np.ndarray,
             method: str,
             max_iter: int,
             ftol: float,
@@ -334,15 +408,13 @@ def _b_step_loop(
         :return:
         """
         x0 = -10
-
-        if isinstance(self.model.b_var, dask.array.core.Array):
-            b_var_new = self.model.b_var.compute()
-        else:
-            b_var_new = self.model.b_var.copy()
+        delta_theta = np.zeros_like(self.model.b_var)
+        if isinstance(delta_theta, dask.array.core.Array):
+            delta_theta = delta_theta.compute()
 
         xh_scale = np.matmul(self.model.design_scale, self.model.constraints_scale).compute()
         if nproc > 1:
-            sys.stdout.write('\rFitting %i dispersion models: (progress not available with multiprocessing)' % len(idx))
+            sys.stdout.write('\rFitting %i dispersion models: (progress not available with multiprocessing)' % len(idx_update))
             sys.stdout.flush()
             with multiprocessing.Pool(processes=nproc) as pool:
                 x = self.x.compute()
@@ -355,27 +427,28 @@ def _b_step_loop(
                         xh_scale,
                         max_iter,
                         ftol
-                    ) for j in idx]
+                    ) for j in idx_update]
                 )
                 pool.close()
-            b_var_new[0, idx] = np.array([x[0] for x in results])
+            delta_theta[0, idx_update] = np.array([x[0] for x in results])
             sys.stdout.write('\r')
             sys.stdout.flush()
         else:
             t0 = time.time()
-            for i, j in enumerate(idx):
+            for i, j in enumerate(idx_update):
                 sys.stdout.write(
                     '\rFitting dispersion models: %.2f%% in %.2fsec' %
                     (
-                        np.round(i/len(idx)*100., 2),
+                        np.round(i / len(idx_update) * 100., 2),
                         time.time() - t0
                     )
                 )
                 sys.stdout.flush()
                 if method.lower() == "brent":
                     eta_loc = self.model.eta_loc_j(j=j).compute()
                     data = self.x[:, [j]].compute()
-                    if isinstance(data, sparse._coo.core.COO) or isinstance(data, scipy.sparse.csr_matrix):
+                    # Need to supply dense numpy array to scipy optimize:
+                    if isinstance(data, sparse.COO) or isinstance(data, scipy.sparse.csr_matrix):
                         data = data.todense()
 
                     ll = self.model.ll_handle()
@@ -388,7 +461,7 @@ def cost_b_var(x, data_j, eta_loc_j, xh_scale_j):
                             xh_scale_j
                         ))
 
-                    b_var_new[0, j] = scipy.optimize.brent(
+                    delta_theta[0, j] = scipy.optimize.brent(
                        func=cost_b_var,
                        args=(data, eta_loc, xh_scale),
                        maxiter=max_iter,
@@ -400,7 +473,12 @@ def cost_b_var(x, data_j, eta_loc_j, xh_scale_j):
                    raise ValueError("method %s not recognized" % method)
            sys.stdout.write('\r')
            sys.stdout.flush()
-        return b_var_new
+
+        if isinstance(self.model.b_var, dask.array.core.Array):
+            delta_theta[:, idx_update] = delta_theta[:, idx_update] - self.model.b_var.compute()[:, idx_update]
+        else:
+            delta_theta[:, idx_update] = delta_theta[:, idx_update] - self.model.b_var.copy()[:, idx_update]
+        return delta_theta
 
     def finalize(self):
         """
