Merge pull request #105 from theislab/singular_fix

davidsebfischer · web-flow · commit ed2f5ba528d7 · 2020-02-25T10:29:33.000+01:00
Singular fix
diff --git a/batchglm/models/base_glm/simulator.py b/batchglm/models/base_glm/simulator.py
@@ -171,3 +171,16 @@ def constraints_loc(self):
     @property
     def constraints_scale(self):
         return np.identity(n=self.b_var.shape[0])
+
+    def np_clip_param(self, param, name):
+        """
+        Clip a parameter array to the numerical bounds registered under `name`.
+
+        :param param: Array of values to clip. Its dtype is handed to `self.param_bounds`
+            to obtain the bounds.
+        :param name: Key under which the lower and upper bound are looked up in the
+            two dictionaries returned by `self.param_bounds`.
+        :return: `param` clipped element-wise to [lower bound, upper bound] via `np.clip`.
+        """
+        # TODO: inherit this from somewhere?
+        lower, upper = self.param_bounds(param.dtype)
+        return np.clip(param, lower[name], upper[name])
diff --git a/batchglm/models/glm_nb/external.py b/batchglm/models/glm_nb/external.py
@@ -5,4 +5,6 @@
 from batchglm.models.base_glm import closedform_glm_mean, closedform_glm_scale
 
 import batchglm.data as data_utils
-from batchglm.utils.linalg import groupwise_solve_lm
+from batchglm.utils.linalg import groupwise_solve_lm
+
+from batchglm import pkg_constants
diff --git a/batchglm/models/glm_nb/simulator.py b/batchglm/models/glm_nb/simulator.py
@@ -2,6 +2,7 @@
 
 from .model import Model
 from .external import _SimulatorGLM, InputDataGLM
+from .external import pkg_constants
 
 
 class Simulator(_SimulatorGLM, Model):
@@ -58,3 +59,35 @@ def generate_data(self):
             design_scale_names=None
         )
 
+    def param_bounds(
+            self,
+            dtype
+    ):
+        """
+        Build lower and upper numerical bounds for the quantities handled by this model.
+
+        All bounds are derived from the floating point limits of `dtype`. Bounds of the
+        log-scale entries ("a_var", "b_var", "eta_loc", "eta_scale") and the upper bounds
+        of "loc" and "scale" are divided by
+        `pkg_constants.ACCURACY_MARGIN_RELATIVE_TO_LIMIT`.
+
+        :param dtype: Floating point data type (anything accepted by `np.dtype` and
+            `np.finfo`) for which the bounds are computed.
+        :return: Tuple `(bounds_min, bounds_max)` of two dictionaries which share the keys
+            "a_var", "b_var", "eta_loc", "eta_scale", "loc", "scale", "likelihood" and "ll".
+            Values are scalars of type `dtype`.
+        """
+        # TODO: inherit this from somewhere?
+        dtype = np.dtype(dtype)
+        dmax = np.finfo(dtype).max
+        # From here on, dtype is the scalar constructor (e.g. np.float64) used for casting.
+        dtype = dtype.type
+
+        sf = dtype(pkg_constants.ACCURACY_MARGIN_RELATIVE_TO_LIMIT)
+
+        # Smallest value above zero that dtype can represent, and its logarithm.
+        smallest_positive = np.nextafter(0, np.inf, dtype=dtype)
+        log_smallest_positive = np.log(smallest_positive)
+
+        # Bounds shared by all entries that live on the log scale.
+        log_scale_min = log_smallest_positive / sf
+        log_scale_max = np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf
+        # Upper bound shared by the entries that live on the linear scale.
+        linear_scale_max = np.nextafter(dmax, -np.inf, dtype=dtype) / sf
+
+        bounds_min = {
+            "a_var": log_scale_min,
+            "b_var": log_scale_min,
+            "eta_loc": log_scale_min,
+            "eta_scale": log_scale_min,
+            "loc": smallest_positive,
+            "scale": smallest_positive,
+            "likelihood": dtype(0),
+            "ll": log_smallest_positive,
+        }
+        bounds_max = {
+            "a_var": log_scale_max,
+            "b_var": log_scale_max,
+            "eta_loc": log_scale_max,
+            "eta_scale": log_scale_max,
+            "loc": linear_scale_max,
+            "scale": linear_scale_max,
+            "likelihood": dtype(1),
+            "ll": dtype(0),
+        }
+        return bounds_min, bounds_max
diff --git a/batchglm/train/numpy/base_glm/estimator.py b/batchglm/train/numpy/base_glm/estimator.py
@@ -3,7 +3,6 @@
 import logging
 import multiprocessing
 import numpy as np
-import pprint
 import scipy
 import scipy.sparse
 import scipy.optimize
@@ -270,16 +269,28 @@ def iwls_step(
             # Have to use a workaround to solve problems in parallel in dask here. This workaround does
             # not work if there is only a single problem, ie. if the first dimension of a and b has length 1.
             if a.shape[0] != 1:
-                delta_theta[:, idx_update] = dask.array.map_blocks(
-                    np.linalg.solve, a, b[:, :, None], chunks=b[:, :, None].shape
+                get_cond_number = lambda x: np.expand_dims(np.expand_dims(np.linalg.cond(x, p=None), axis=-1), axis=-1)
+                invertible = np.where(dask.array.map_blocks(
+                    get_cond_number, a, chunks=a.shape
+                ).squeeze().compute() < 1 / sys.float_info.epsilon)[0]
+                delta_theta[:, idx_update[invertible]] = dask.array.map_blocks(
+                    np.linalg.solve, a[invertible], b[invertible, :, None],
+                    chunks=b[invertible, :, None].shape
                 ).squeeze().T.compute()
             else:
-                delta_theta[:, idx_update] = np.expand_dims(
-                    np.linalg.solve(a[0], b[0]).compute(),
-                    axis=-1
-                )
+                if np.linalg.cond(a.compute(), p=None) < 1 / sys.float_info.epsilon:
+                    delta_theta[:, idx_update] = np.expand_dims(
+                        np.linalg.solve(a[0], b[0]).compute(),
+                        axis=-1
+                    )
+                    invertible = np.array([0])
+                else:
+                    invertible = np.array([])
         else:
-            delta_theta[:, idx_update] = np.linalg.solve(a, b).T
+            invertible = np.where(np.linalg.cond(a, p=None) < 1 / sys.float_info.epsilon)[0]
+            delta_theta[:, idx_update[invertible]] = np.linalg.solve(a[invertible], b[invertible]).T
+        if invertible.shape[0] < len(idx_update):
+            print("caught %i linalg singular matrix errors" % (len(idx_update) - invertible.shape[0]))
         # Via np.linalg.lsts:
         #delta_theta[:, idx_update] = np.concatenate([
         #    np.expand_dims(np.linalg.lstsq(a[i, :, :], b[i, :])[0], axis=-1)
@@ -512,7 +523,10 @@ def finalize(self):
         """
         # Read from numpy-IRLS estimator specific model:
         self._hessian = - self.model.fim.compute()
-        self._fisher_inv = np.linalg.inv(- self._hessian)
+        fisher_inv = np.zeros_like(self._hessian)
+        invertible = np.where(np.linalg.cond(self._hessian, p=None) < 1 / sys.float_info.epsilon)[0]
+        fisher_inv[invertible] = np.linalg.inv(- self._hessian[invertible])
+        self._fisher_inv = fisher_inv
         self._jacobian = np.sum(np.abs(self.model.jac.compute() / self.model.x.shape[0]), axis=1)
         self._log_likelihood = self.model.ll_byfeature.compute()
         self._loss = np.sum(self._log_likelihood)
diff --git a/batchglm/unit_test/test_acc_glm_all_numpy.py b/batchglm/unit_test/test_acc_glm_all_numpy.py
@@ -36,7 +36,9 @@ def __init__(
                 design_scale_names=simulator.input_data.design_scale_names,
                 constraints_loc=simulator.input_data.constraints_loc,
                 constraints_scale=simulator.input_data.constraints_scale,
-                size_factors=simulator.input_data.size_factors
+                size_factors=simulator.input_data.size_factors,
+                chunk_size_cells=int(1e9),
+                chunk_size_genes=2
             )
         else:
             input_data = InputDataGLM(
@@ -47,7 +49,9 @@ def __init__(
                 design_scale_names=simulator.input_data.design_scale_names,
                 constraints_loc=simulator.input_data.constraints_loc,
                 constraints_scale=simulator.input_data.constraints_scale,
-                size_factors=simulator.input_data.size_factors
+                size_factors=simulator.input_data.size_factors,
+                chunk_size_cells=int(1e9),
+                chunk_size_genes=2
             )
 
         self.estimator = Estimator(