
Commit 8b134b7

Reduced size of compute graph with pathfinder_body_fn
Summary of changes:
- Remove multiprocessing code in favour of reusing the compiled function for each path
- Takes only random_seed as an argument for each path
- Compute graph significantly smaller by using a pure PyTensor Op and symbolic variables
- Added LBFGSOp to compile with pytensor.function
- Cleaned up code using PyTensor variables
1 parent ef2956f · commit 8b134b7
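
To illustrate the compile-once pattern described in the summary, here is a minimal sketch (not part of this commit; the real graph is the full single-path Pathfinder body rather than a toy normal draw):

```python
import pytensor
from pytensor.tensor.random.utils import RandomStream

# Stand-in for the single-path Pathfinder graph (LBFGS trajectory, ELBO,
# draws from the approximation, ...).
srng = RandomStream(seed=0)
draws = srng.normal(size=(5,))

# Compiled once with pytensor.function; the same compiled function is then
# reused for every path, replacing the removed multiprocessing code.
pathfinder_body_fn = pytensor.function([], draws)

paths = []
for random_seed in range(4):
    srng.seed(random_seed)  # only the seed changes between paths
    paths.append(pathfinder_body_fn())
```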

File tree

3 files changed: +382 −319 lines


pymc_experimental/inference/pathfinder/importance_sampling.py

Lines changed: 30 additions & 14 deletions
```diff
@@ -2,15 +2,35 @@
 
 import arviz as az
 import numpy as np
+import pytensor.tensor as pt
+
+from pytensor.graph import Apply, Op
+from pytensor.tensor.variable import TensorVariable
 
-logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
 logger = logging.getLogger(__name__)
 
 
+class PSIS(Op):
+    __props__ = ()
+
+    def make_node(self, inputs):
+        logweights = pt.as_tensor(inputs)
+        psislw = pt.dvector()
+        pareto_k = pt.dscalar()
+        return Apply(self, [logweights], [psislw, pareto_k])
+
+    def perform(self, node: Apply, inputs, outputs) -> None:
+        logweights = inputs[0]
+        psislw, pareto_k = az.psislw(logweights)
+        outputs[0][0] = psislw
+        outputs[1][0] = pareto_k
+
+
 def psir(
-    samples: np.ndarray,
-    logP: np.ndarray,
-    logQ: np.ndarray,
+    samples: TensorVariable,
+    # logP: TensorVariable,
+    # logQ: TensorVariable,
+    logiw: TensorVariable,
     num_draws: int = 1000,
     random_seed: int | None = None,
 ) -> np.ndarray:
@@ -48,14 +68,10 @@ def psir(
 
     Zhang, L., Carpenter, B., Gelman, A., & Vehtari, A. (2022). Pathfinder: Parallel quasi-Newton variational inference. Journal of Machine Learning Research, 23(306), 1-49.
     """
-
-    def logsumexp(x):
-        c = x.max()
-        return c + np.log(np.sum(np.exp(x - c)))
-
-    logiw = np.reshape(logP - logQ, -1, order="F")
-    psislw, pareto_k = az.psislw(logiw)
-
+    # logiw = np.reshape(logP - logQ, (-1,), order="F")
+    # logiw = (logP - logQ).ravel()
+    psislw, pareto_k = PSIS()(logiw)
+    pareto_k = pareto_k.eval()
     # FIXME: pareto_k is mostly bad, find out why!
     if pareto_k <= 0.70:
         pass
@@ -68,6 +84,6 @@ def logsumexp(x):
             "consider reparametrising the model, increasing ftol, gtol or maxcor parameters"
         )
 
-    p = np.exp(psislw - logsumexp(psislw))
+    p = pt.exp(psislw - pt.logsumexp(psislw)).eval()
     rng = np.random.default_rng(random_seed)
-    return rng.choice(samples, size=num_draws, p=p, shuffle=False, axis=0)
+    return rng.choice(samples, size=num_draws, replace=True, p=p, shuffle=False, axis=0)
```
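
As a minimal usage sketch of the new symbolic path (an assumption, not from the commit: logP and logQ here are (paths, draws) log-density matrices, flattened as in the commented-out lines above):

```python
import numpy as np
import pytensor
import pytensor.tensor as pt

from pymc_experimental.inference.pathfinder.importance_sampling import PSIS

logP = pt.dmatrix("logP")
logQ = pt.dmatrix("logQ")
logiw = (logP - logQ).ravel()  # one log importance weight per draw

# PSIS wraps az.psislw as a PyTensor Op, so smoothing stays in the graph
psislw, pareto_k = PSIS()(logiw)
smooth_fn = pytensor.function([logP, logQ], [psislw, pareto_k])

lw, k = smooth_fn(np.random.randn(4, 100), np.random.randn(4, 100))
```

Inside `psir`, the smoothed log weights are then normalised with `pt.logsumexp` into a probability vector for the `rng.choice` resampling step.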

pymc_experimental/inference/pathfinder/lbfgs.py

Lines changed: 66 additions & 16 deletions
```diff
@@ -2,49 +2,46 @@
 from typing import NamedTuple
 
 import numpy as np
+import pytensor.tensor as pt
 
+from pytensor.graph import Apply, Op
 from scipy.optimize import minimize
 
 
 class LBFGSHistory(NamedTuple):
     x: np.ndarray
-    f: np.ndarray
     g: np.ndarray
 
 
 class LBFGSHistoryManager:
-    def __init__(self, fn: Callable, grad_fn: Callable, x0: np.ndarray, maxiter: int):
+    def __init__(self, grad_fn: Callable, x0: np.ndarray, maxiter: int):
         dim = x0.shape[0]
         maxiter_add_one = maxiter + 1
         # Pre-allocate arrays to save memory and improve speed
         self.x_history = np.empty((maxiter_add_one, dim), dtype=np.float64)
-        self.f_history = np.empty(maxiter_add_one, dtype=np.float64)
         self.g_history = np.empty((maxiter_add_one, dim), dtype=np.float64)
         self.count = 0
-        self.fn = fn
         self.grad_fn = grad_fn
-        self.add_entry(x0, fn(x0), grad_fn(x0))
+        self.add_entry(x0, grad_fn(x0))
 
-    def add_entry(self, x, f, g=None):
+    def add_entry(self, x, g):
         self.x_history[self.count] = x
-        self.f_history[self.count] = f
-        if self.g_history is not None and g is not None:
-            self.g_history[self.count] = g
+        self.g_history[self.count] = g
         self.count += 1
 
     def get_history(self):
-        # Return trimmed arrays up to the number of entries actually used
+        # Return trimmed arrays up to L << L^max
         x = self.x_history[: self.count]
-        f = self.f_history[: self.count]
-        g = self.g_history[: self.count] if self.g_history is not None else None
+        g = self.g_history[: self.count]
         return LBFGSHistory(
             x=x,
-            f=f,
             g=g,
         )
 
     def __call__(self, x):
-        self.add_entry(x, self.fn(x), self.grad_fn(x))
+        grad = self.grad_fn(x)
+        if np.all(np.isfinite(grad)):
+            self.add_entry(x, grad)
@@ -62,7 +59,6 @@ def callback(xk):
         lbfgs_history_manager(xk)
 
     lbfgs_history_manager = LBFGSHistoryManager(
-        fn=fn,
         grad_fn=grad_fn,
         x0=x0,
         maxiter=maxiter,
@@ -89,4 +85,58 @@ def callback(xk):
         callback=callback,
         **lbfgs_kwargs,
     )
-    return lbfgs_history_manager.get_history()
+    lbfgs_history = lbfgs_history_manager.get_history()
+    return lbfgs_history.x, lbfgs_history.g
+
+
+class LBFGSOp(Op):
+    def __init__(self, fn, grad_fn, maxcor, maxiter=1000, ftol=1e-5, gtol=1e-8, maxls=1000):
+        self.fn = fn
+        self.grad_fn = grad_fn
+        self.maxcor = maxcor
+        self.maxiter = maxiter
+        self.ftol = ftol
+        self.gtol = gtol
+        self.maxls = maxls
+
+    def make_node(self, x0):
+        x0 = pt.as_tensor_variable(x0)
+        x_history = pt.dmatrix()
+        g_history = pt.dmatrix()
+        return Apply(self, [x0], [x_history, g_history])
+
+    def perform(self, node, inputs, outputs):
+        x0 = inputs[0]
+        x0 = np.array(x0, dtype=np.float64)
+
+        history_manager = LBFGSHistoryManager(grad_fn=self.grad_fn, x0=x0, maxiter=self.maxiter)
+
+        minimize(
+            self.fn,
+            x0,
+            method="L-BFGS-B",
+            jac=self.grad_fn,
+            callback=history_manager,
+            options={
+                "maxcor": self.maxcor,
+                "maxiter": self.maxiter,
+                "ftol": self.ftol,
+                "gtol": self.gtol,
+                "maxls": self.maxls,
+            },
+        )
+
+        # fmin_l_bfgs_b(
+        #     func=self.fn,
+        #     fprime=self.grad_fn,
+        #     x0=x0,
+        #     pgtol=self.gtol,
+        #     factr=self.ftol / np.finfo(float).eps,
+        #     maxls=self.maxls,
+        #     maxiter=self.maxiter,
+        #     m=self.maxcor,
+        #     callback=history_manager,
+        # )
+
+        outputs[0][0] = history_manager.get_history().x
+        outputs[1][0] = history_manager.get_history().g
```
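
A minimal sketch of compiling the new LBFGSOp with pytensor.function, as the commit summary describes (the quadratic objective and its gradient are toy stand-ins for the model's compiled logp/dlogp functions):

```python
import numpy as np
import pytensor
import pytensor.tensor as pt

from pymc_experimental.inference.pathfinder.lbfgs import LBFGSOp


def fn(x):  # toy objective: 0.5 * ||x||^2
    return 0.5 * np.dot(x, x)


def grad_fn(x):  # its gradient
    return x


x0 = pt.dvector("x0")
x_history, g_history = LBFGSOp(fn=fn, grad_fn=grad_fn, maxcor=5)(x0)

# The whole L-BFGS run is a single node in the compute graph
lbfgs_fn = pytensor.function([x0], [x_history, g_history])
xs, gs = lbfgs_fn(np.array([3.0, -2.0]))  # rows are the recorded iterates
```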
