Commit a1292ea

WIP: Should find root of conditional_gaussian_approx not minimize nll
1 parent 40f27e0 commit a1292ea

2 files changed (+133, -72 lines)

pymc_extras/inference/laplace.py

Lines changed: 124 additions & 65 deletions
@@ -29,7 +29,7 @@
 import xarray as xr
 
 from arviz import dict_to_dataset
-from better_optimize.constants import minimize_method
+from better_optimize.constants import minimize_method, root_method
 from pymc import DictToArrayBijection
 from pymc.backends.arviz import (
     coords_and_dims_for_inferencedata,
@@ -41,7 +41,7 @@
 from pymc.model.transform.optimization import freeze_dims_and_data
 from pymc.util import get_default_varnames
 from pytensor.tensor import TensorVariable
-from pytensor.tensor.optimize import minimize
+from pytensor.tensor.optimize import root
 from scipy import stats
 
 from pymc_extras.inference.find_map import (
@@ -55,6 +55,128 @@
 _log = logging.getLogger(__name__)
 
 
+def find_mode_jac_hess(
+    x: TensorVariable,  # Should be a vector specifically
+    Q: TensorVariable,  # Matrix  # TODO: tensorinv doesn't have grad implemented yet
+    mu: TensorVariable,  # Vector
+    model: pm.Model | None = None,
+    method: root_method = "hybr",
+    use_jac: bool = True,
+    # use_hess: bool = False,
+    optimizer_kwargs: dict | None = None,
+) -> Callable:
+    """
+    Returns a function that estimates the mode of a model, together with the first and second
+    derivatives of its log-probability at that point, by root-finding on the conditional
+    Gaussian approximation. Wrapper for the (pytensor-native) scipy.optimize.root.
+
+    Parameters
+    ----------
+    x: TensorVariable
+        The parameter with respect to which to find the mode (that is, the mode in x).
+    Q: TensorVariable
+        Precision matrix of the Gaussian prior on x.
+    mu: TensorVariable
+        Mean of the Gaussian prior on x.
+    model: Model
+        PyMC model to use.
+    method: root_method
+        Which root-finding algorithm to use.
+    use_jac: bool
+        If true, the root finder will compute and store the Jacobian.
+    use_hess: bool
+        If true, the optimizer will compute and store the Hessian (note that the Hessian will
+        be computed explicitly even if this is False).
+    optimizer_kwargs: dict
+        Kwargs to pass to scipy.optimize.root.
+
+    Returns
+    -------
+    f: Callable
+        A function which accepts the values of the model RVs as args and returns
+        [mu, jac(mu), hess(mu)], where mu is the mode. The TensorVariable x is specified as an
+        initial guess for mu in args.
+    """
+    model = pm.modelcontext(model)
+
+    # f = log(p(y | x, params))
+    f = model.logp()
+    jac = pytensor.gradient.grad(f, x)
+    hess = pytensor.gradient.jacobian(jac.flatten(), x)
+
+    # Component of log(p(x | y, params)) which depends on x (for root-finding)
+    conditional_gaussian_approx = -0.5 * x.T @ (-hess + Q) @ x + x.T @ (Q @ mu + jac - hess @ x)
+
+    x0, _ = root(
+        equations=pt.stack([conditional_gaussian_approx]),
+        variables=x,
+        method=method,
+        jac=use_jac,
+        optimizer_kwargs=optimizer_kwargs,
+    )
+
+    # Require f'(x0) and f''(x0) for the Laplace approximation
+    jac = pytensor.graph.replace.graph_replace(jac, {x: x0})
+    hess = pytensor.graph.replace.graph_replace(
+        hess, {x: x0}
+    )  # Possibly unnecessary because jac already does this replace
+
+    # Full log(p(x | y, params))
+    _, logdetQ = pt.nlinalg.slogdet(Q)
+    conditional_gaussian_approx = (
+        -0.5 * x.T @ (-hess + Q) @ x + x.T @ (Q @ mu + jac - hess @ x0) + 0.5 * logdetQ
+    )  # TODO: does doing this change the graph inside root if it is changed before it's compiled?
+
+    args = model.continuous_value_vars + model.discrete_value_vars
+    return pytensor.function(
+        args, [x0, conditional_gaussian_approx]
+    )  # Currently x is passed in as an initial guess for x0 AND is also taken as the true value of x
+
+    # Minimise negative log likelihood
+    # nll = -model.logp()
+    # soln, _ = minimize(
+    #     objective=nll,
+    #     x=x,
+    #     method=method,
+    #     jac=use_jac,
+    #     hess=use_hess,
+    #     optimizer_kwargs=optimizer_kwargs,
+    # )
+
+    # TODO: Jesse suggested I use this graph_replace function, but it seems that "mode" here is a different type to soln:
+    #
+    # TypeError: Cannot convert Type Vector(float64, shape=(10,)) (of Variable MinimizeOp(method=BFGS, jac=True, hess=True, hessp=False).0) into Type Scalar(float64, shape=()). You can try to manually convert MinimizeOp(method=BFGS, jac=True, hess=True, hessp=False).0 into a Scalar(float64, shape=()).
+    #
+    # My understanding here is that for some function which evaluates the hessian at x, we're replacing "x" in the hess graph with the subgraph that computes "x" (i.e. soln)?
+
+    # Obtain the Hessian (re-use graph if already computed in minimize)
+    # if use_hess:
+    #     mode, _, hess = (
+    #         soln.owner.op.inner_outputs
+    #     )  # Note that this mode, _, hess will need to be slightly more elaborate for when use_jac is False (2 items to unpack instead of 3). Just a few if-blocks, but not implemented for now while we're debugging
+    #     hess = pytensor.graph.replace.graph_replace(hess, {mode: soln})
+    # else:
+    #     hess = pytensor.gradient.hessian(nll, x)
+
+    # Obtain the gradient and Hessian (re-use graphs if already computed in minimize)
+    # res = soln.owner.op.inner_outputs
+    # mode = res[0]
+
+    # print(res)
+
+    # if use_jac:
+    #     # jac = pytensor.gradient.grad(nll, x)
+    #     jac = res.pop(1)
+    # else:
+    #     jac = pytensor.gradient.grad(nll, x)
+    #     jac = pytensor.graph.replace.graph_replace(jac, {x: soln})
+
+    # print(x)
+    # # jac = pytensor.graph.replace.graph_replace(jac, {x: soln})
+
+    # jac = -jac  # We subsequently want the gradients wrt log(p(y | x)) rather than the negative of this (nll)
+
+    # if use_hess:
+    #     hess = res.pop(1)
+    # else:
+    #     hess = pytensor.gradient.jacobian(jac.flatten(), soln)
+    # # hess = pytensor.graph.replace.graph_replace(hess, {x: soln})
+
+    # args = model.continuous_value_vars + model.discrete_value_vars
+    # return pytensor.function(args, [soln, jac, hess])
+
+
 def laplace_draws_to_inferencedata(
     posterior_draws: list[np.ndarray[float | int]], model: pm.Model | None = None
 ) -> az.InferenceData:
@@ -418,69 +540,6 @@ def sample_laplace_posterior(
     return idata
 
 
-def find_mode_and_hess(
-    x: TensorVariable,
-    model: pm.Model | None = None,
-    method: minimize_method = "BFGS",
-    use_jac: bool = True,
-    use_hess: bool = False,  # TODO Tbh we can probably just remove this arg and pass True to the minimizer all the time, but if this is the case, it will throw a warning when the hessian doesn't need to be computed for a particular optimisation routine.
-    optimizer_kwargs: dict | None = None,
-) -> Callable:
-    """
-    Returns a function to estimate the mode and hessian of a model by minimizing negative log likelihood. Wrapper for (pytensor-native) scipy.optimize.minimize.
-
-    Parameters
-    ----------
-    x: TensorVariable
-        The parameter with which to minimize wrt (that is, find the mode in x).
-    model: Model
-        PyMC model to use.
-    method: minimize_method
-        Which minimization algorithm to use.
-    use_jac: bool
-        If true, the minimizer will compute and store the Jacobian.
-    use_hess: bool
-        If true, the minimizer will compute and store the Hessian (note that the Hessian will be computed explicitely even if this is False).
-    optimizer_kwargs: dict
-        Kwargs to pass to scipy.optimize.minimize.
-
-    Returns
-    -------
-    f: Callable
-        A function which accepts the values of the model RVs as args and returns [mu, hess(mu)], where mu is the mode. The TensorVariable x is specified as an initial guess for mu in args.
-    """
-    model = pm.modelcontext(model)
-
-    # Minimise negative log likelihood
-    nll = -model.logp()
-    soln, _ = minimize(
-        objective=nll,
-        x=x,
-        method=method,
-        jac=use_jac,
-        hess=use_hess,
-        optimizer_kwargs=optimizer_kwargs,
-    )
-
-    # TODO: Jesse suggested I use this graph_replace function, but it seems that "mode" here is a different type to soln:
-    #
-    # TypeError: Cannot convert Type Vector(float64, shape=(10,)) (of Variable MinimizeOp(method=BFGS, jac=True, hess=True, hessp=False).0) into Type Scalar(float64, shape=()). You can try to manually convert MinimizeOp(method=BFGS, jac=True, hess=True, hessp=False).0 into a Scalar(float64, shape=()).
-    #
-    # My understanding here is that for some function which evaluates the hessian at x, we're replacing "x" in the hess graph with the subgraph that computes "x" (i.e. soln)?
-
-    # Obtain the Hessian (re-use graph if already computed in minimize)
-    if use_hess:
-        mode, _, hess = (
-            soln.owner.op.inner_outputs
-        )  # Note that this mode, _, hess will need to be slightly more elaborate for when use_jac is False (2 items to unpack instead of 3). Just a few if-blocks, but not implemented for now while we're debugging
-        hess = pytensor.graph.replace.graph_replace(hess, {mode: soln})
-    else:
-        hess = pytensor.gradient.hessian(nll, x)
-
-    args = model.continuous_value_vars + model.discrete_value_vars
-    return pytensor.function(args, [soln, hess])
-
-
 def fit_laplace(
     optimize_method: minimize_method | Literal["basinhopping"] = "BFGS",
     *,

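For context on the quadratic form that find_mode_jac_hess builds: assuming x carries a Gaussian prior with mean mu and precision Q (an assumption consistent with the Q @ mu and slogdet(Q) terms, though not stated in the commit), the conditional_gaussian_approx expression matches the standard second-order (Laplace / INLA-style) expansion of the conditional log-posterior around a point x_0:

\[
\log p(x \mid y) \;\approx\; -\tfrac{1}{2}\, x^\top (Q - H)\, x \;+\; x^\top \left( Q\mu + g - H x_0 \right) + \text{const},
\qquad g = \nabla f(x_0),\quad H = \nabla^2 f(x_0),\quad f(x) = \log p(y \mid x).
\]

Setting the gradient of this quadratic to zero gives the stationarity condition

\[
-(Q - H)\, x + Q\mu + g - H x_0 = 0
\quad\Longleftrightarrow\quad
\nabla f(x) - Q\,(x - \mu) = 0 \ \text{ at the fixed point } x_0 = x,
\]

which is presumably the root the commit title says should be found, rather than the scalar conditional_gaussian_approx expression currently passed to root.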
tests/test_laplace.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121

2222
from pymc_extras.inference.find_map import GradientBackend, find_MAP
2323
from pymc_extras.inference.laplace import (
24-
find_mode_and_hess,
24+
find_mode_jac_hess,
2525
fit_laplace,
2626
fit_mvn_at_MAP,
2727
sample_laplace_posterior,
@@ -282,17 +282,19 @@ def test_laplace_scalar():
282282
np.testing.assert_allclose(idata_laplace.fit.mean_vector.values.item(), data.mean(), atol=0.1)
283283

284284

285-
def test_find_mode_and_hess():
285+
def test_find_mode_jac_hess():
286286
rng = np.random.default_rng(42)
287287
n = 100
288288
sigma_obs = rng.random()
289289
sigma_mu = rng.random()
290+
true_mu = rng.random()
291+
mu_val = rng.random()
290292

291293
coords = {"city": ["A", "B", "C"], "obs_idx": np.arange(n)}
292294
with pm.Model(coords=coords) as model:
293-
obs_val = rng.normal(loc=3, scale=1.5, size=(n, 3))
295+
obs_val = rng.normal(loc=true_mu, scale=1.5, size=(n, 3))
294296

295-
mu = pm.Normal("mu", mu=1, sigma=sigma_mu, dims=["city"])
297+
mu = pm.Normal("mu", mu=mu_val, sigma=sigma_mu, dims=["city"])
296298
obs = pm.Normal(
297299
"obs",
298300
mu=mu,
@@ -301,14 +303,14 @@ def test_find_mode_and_hess():
301303
dims=["obs_idx", "city"],
302304
)
303305

304-
get_mode_and_hessian = find_mode_and_hess(
306+
get_mode_and_hessian = find_mode_jac_hess(
305307
use_hess=False, x=model.rvs_to_values[mu], method="BFGS", optimizer_kwargs={"tol": 1e-8}
306308
)
307309

308-
mode, hess = get_mode_and_hessian(**{"mu": [1, 1, 1]})
310+
mode, jac, hess = get_mode_and_hessian(mu=[1, 1, 1])
309311

310312
true_mode = obs_val.mean(axis=0)
311-
true_hess = np.diag((1 / sigma_mu**2 + n / sigma_obs**2) * np.ones(3))
313+
true_hess = -np.diag((1 / sigma_mu**2 + n / sigma_obs**2) * np.ones(3))
312314

313315
np.testing.assert_allclose(mode, true_mode, atol=0.1, rtol=0.1)
314316
np.testing.assert_allclose(hess, true_hess, atol=0.1, rtol=0.1)
