     apply_function_over_dataset,
     coords_and_dims_for_inferencedata,
 )
+from pymc.blocking import RaveledVars
 from pymc.util import RandomSeed, get_default_varnames
 from pytensor.tensor.variable import TensorVariable
 
+from pymc_extras.inference.laplace_approx.idata import (
+    add_data_to_inference_data,
+    add_optimizer_result_to_inference_data,
+)
 from pymc_extras.inference.laplace_approx.laplace import unstack_laplace_draws
 from pymc_extras.inference.laplace_approx.scipy_interface import (
-    _compile_functions_for_scipy_optimize,
+    scipy_optimize_funcs_from_loss,
+    set_optimizer_function_defaults,
 )
 
 
@@ -29,64 +35,63 @@ def fit_dadvi(
     n_draws: int = 1000,
     keep_untransformed: bool = False,
     optimizer_method: minimize_method = "trust-ncg",
-    use_grad: bool = True,
-    use_hessp: bool = True,
-    use_hess: bool = False,
+    use_grad: bool | None = None,
+    use_hessp: bool | None = None,
+    use_hess: bool | None = None,
+    gradient_backend: str = "pytensor",
+    compile_kwargs: dict | None = None,
     **minimize_kwargs,
 ) -> az.InferenceData:
     """
-    Does inference using deterministic ADVI (automatic differentiation
-    variational inference), DADVI for short.
+    Does inference using Deterministic ADVI (Automatic Differentiation Variational Inference), DADVI for short.
 
-    For full details see the paper cited in the references:
-    https://www.jmlr.org/papers/v25/23-1015.html
+    For full details see the paper cited in the references: https://www.jmlr.org/papers/v25/23-1015.html
 
     Parameters
     ----------
     model : pm.Model
         The PyMC model to be fit. If None, the current model context is used.
 
     n_fixed_draws : int
-        The number of fixed draws to use for the optimisation. More
-        draws will result in more accurate estimates, but also
-        increase inference time. Usually, the default of 30 is a good
-        tradeoff.between speed and accuracy.
+        The number of fixed draws to use for the optimisation. More draws will result in more accurate estimates, but
+        also increase inference time. Usually, the default of 30 is a good tradeoff between speed and accuracy.
 
     random_seed: int
-        The random seed to use for the fixed draws. Running the optimisation
-        twice with the same seed should arrive at the same result.
+        The random seed to use for the fixed draws. Running the optimisation twice with the same seed should arrive at
+        the same result.
 
     n_draws: int
         The number of draws to return from the variational approximation.
 
     keep_untransformed: bool
-        Whether or not to keep the unconstrained variables (such as
-        logs of positive-constrained parameters) in the output.
+        Whether or not to keep the unconstrained variables (such as logs of positive-constrained parameters) in the
+        output.
 
     optimizer_method: str
-        Which optimization method to use. The function calls
-        ``scipy.optimize.minimize``, so any of the methods there can
-        be used. The default is trust-ncg, which uses second-order
-        information and is generally very reliable. Other methods such
-        as L-BFGS-B might be faster but potentially more brittle and
-        may not converge exactly to the optimum.
+        Which optimization method to use. The function calls ``scipy.optimize.minimize``, so any of the methods there
+        can be used. The default is trust-ncg, which uses second-order information and is generally very reliable.
+        Other methods such as L-BFGS-B might be faster but potentially more brittle and may not converge exactly to
+        the optimum.
+
+    gradient_backend: str
+        Which backend to use to compute gradients. Must be one of "jax" or "pytensor". Default is "pytensor".
+
+    compile_kwargs: dict, optional
+        Additional keyword arguments to pass to `pytensor.function`
 
     minimize_kwargs:
-        Additional keyword arguments to pass to the
-        ``scipy.optimize.minimize`` function. See the documentation of
+        Additional keyword arguments to pass to the ``scipy.optimize.minimize`` function. See the documentation of
         that function for details.
 
-    use_grad:
-        If True, pass the gradient function to
-        `scipy.optimize.minimize` (where it is referred to as `jac`).
+    use_grad: bool, optional
+        If True, pass the gradient function to `scipy.optimize.minimize` (where it is referred to as `jac`).
 
-    use_hessp:
+    use_hessp: bool, optional
         If True, pass the hessian vector product to `scipy.optimize.minimize`.
 
-    use_hess:
-        If True, pass the hessian to `scipy.optimize.minimize`. Note that
-        this is generally not recommended since its computation can be slow
-        and memory-intensive if there are many parameters.
+    use_hess: bool, optional
+        If True, pass the hessian to `scipy.optimize.minimize`. Note that this is generally not recommended since its
+        computation can be slow and memory-intensive if there are many parameters.
 
     Returns
     -------
@@ -95,16 +100,15 @@ def fit_dadvi(
 
     References
    ----------
-    Giordano, R., Ingram, M., & Broderick, T. (2024). Black Box
-    Variational Inference with a Deterministic Objective: Faster, More
-    Accurate, and Even More Black Box. Journal of Machine Learning
-    Research, 25(18), 1–39.
+    Giordano, R., Ingram, M., & Broderick, T. (2024). Black Box Variational Inference with a Deterministic Objective:
+    Faster, More Accurate, and Even More Black Box. Journal of Machine Learning Research, 25(18), 1–39.
     """
 
     model = pymc.modelcontext(model) if model is None else model
 
     initial_point_dict = model.initial_point()
-    n_params = DictToArrayBijection.map(initial_point_dict).data.shape[0]
+    initial_point = DictToArrayBijection.map(initial_point_dict)
+    n_params = initial_point.data.shape[0]
 
     var_params, objective = create_dadvi_graph(
         model,
@@ -113,31 +117,45 @@ def fit_dadvi(
         n_params=n_params,
     )
 
-    f_fused, f_hessp = _compile_functions_for_scipy_optimize(
-        objective,
-        [var_params],
-        compute_grad=use_grad,
-        compute_hessp=use_hessp,
-        compute_hess=use_hess,
+    use_grad, use_hess, use_hessp = set_optimizer_function_defaults(
+        optimizer_method, use_grad, use_hess, use_hessp
+    )
+
+    f_fused, f_hessp = scipy_optimize_funcs_from_loss(
+        loss=objective,
+        inputs=[var_params],
+        initial_point_dict=None,
+        use_grad=use_grad,
+        use_hessp=use_hessp,
+        use_hess=use_hess,
+        gradient_backend=gradient_backend,
+        compile_kwargs=compile_kwargs,
+        inputs_are_flat=True,
     )
 
-    derivative_kwargs = {}
+    dadvi_initial_point = {
+        f"{var_name}_mu": np.zeros_like(value).ravel()
+        for var_name, value in initial_point_dict.items()
+    }
+    dadvi_initial_point.update(
+        {
+            f"{var_name}_sigma__log": np.zeros_like(value).ravel()
+            for var_name, value in initial_point_dict.items()
+        }
+    )
 
-    if use_grad:
-        derivative_kwargs["jac"] = True
-    if use_hessp:
-        derivative_kwargs["hessp"] = f_hessp
-    if use_hess:
-        derivative_kwargs["hess"] = True
+    dadvi_initial_point = DictToArrayBijection.map(dadvi_initial_point)
 
     result = minimize(
-        f_fused,
-        np.zeros(2 * n_params),
+        f=f_fused,
+        x0=dadvi_initial_point.data,
         method=optimizer_method,
-        **derivative_kwargs,
+        hessp=f_hessp,
         **minimize_kwargs,
     )
 
+    raveled_optimized = RaveledVars(result.x, dadvi_initial_point.point_map_info)
+
     opt_var_params = result.x
     opt_means, opt_log_sds = np.split(opt_var_params, 2)
 
@@ -148,9 +166,29 @@ def fit_dadvi(
     draws = opt_means + draws_raw * np.exp(opt_log_sds)
     draws_arviz = unstack_laplace_draws(draws, model, chains=1, draws=n_draws)
 
-    transformed_draws = transform_draws(draws_arviz, model, keep_untransformed=keep_untransformed)
+    idata = az.InferenceData(
+        posterior=transform_draws(draws_arviz, model, keep_untransformed=keep_untransformed)
+    )
+
+    var_name_to_model_var = {f"{var_name}_mu": var_name for var_name in initial_point_dict.keys()}
+    var_name_to_model_var.update(
+        {f"{var_name}_sigma__log": var_name for var_name in initial_point_dict.keys()}
+    )
+
+    idata = add_optimizer_result_to_inference_data(
+        idata=idata,
+        result=result,
+        method=optimizer_method,
+        mu=raveled_optimized,
+        model=model,
+        var_name_to_model_var=var_name_to_model_var,
+    )
+
+    idata = add_data_to_inference_data(
+        idata=idata, progressbar=False, model=model, compile_kwargs=compile_kwargs
+    )
 
-    return transformed_draws
+    return idata
 
 
 def create_dadvi_graph(
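
For orientation, a minimal usage sketch of the updated signature follows. The import path, the toy model, and the argument values are illustrative assumptions rather than part of this commit; with the new defaults, use_grad, use_hessp, and use_hess are left as None so that suitable settings are chosen for the selected optimizer method.

# Illustrative sketch only: the import path and the toy model are assumptions,
# not part of this diff.
import numpy as np
import pymc as pm

from pymc_extras.inference.dadvi import fit_dadvi  # assumed import path

rng = np.random.default_rng(0)
y_obs = rng.normal(loc=1.0, scale=2.0, size=100)

with pm.Model() as model:
    mu = pm.Normal("mu", 0.0, 10.0)
    sigma = pm.HalfNormal("sigma", 5.0)
    pm.Normal("y", mu=mu, sigma=sigma, observed=y_obs)

# use_grad / use_hessp / use_hess default to None and are resolved per
# optimizer method; gradient_backend and compile_kwargs are the new knobs.
idata = fit_dadvi(
    model=model,
    n_draws=1000,
    optimizer_method="trust-ncg",
    gradient_backend="pytensor",
    compile_kwargs={"mode": "FAST_RUN"},
)
print(idata.posterior)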
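The sampling step kept as context in the last hunk (opt_means + draws_raw * np.exp(opt_log_sds)) relies on the mean-field parameterization, in which the flat optimized vector stacks the variational means followed by the log standard deviations. A small self-contained sketch of that step, with made-up numbers:

import numpy as np

n_params, n_draws = 3, 5
# Optimized variational parameters: 3 means followed by 3 log standard deviations.
opt_var_params = np.array([0.1, -0.2, 0.3, np.log(0.5), np.log(1.0), np.log(2.0)])

opt_means, opt_log_sds = np.split(opt_var_params, 2)
rng = np.random.default_rng(42)
draws_raw = rng.standard_normal((n_draws, n_params))

# Reparameterization: z = mu + eps * sigma, broadcast over the draws axis.
draws = opt_means + draws_raw * np.exp(opt_log_sds)
print(draws.shape)  # (5, 3)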