diff --git a/pytensor/sparse/math.py b/pytensor/sparse/math.py
index 6a87e46760..bdb2b52ef4 100644
--- a/pytensor/sparse/math.py
+++ b/pytensor/sparse/math.py
@@ -12,8 +12,8 @@
 from pytensor.gradient import grad_not_implemented
 from pytensor.graph import Apply, Op
 from pytensor.link.c.op import COp
-from pytensor.tensor import TensorType, Variable, specify_broadcastable, tensor
-from pytensor.tensor.type import complex_dtypes
+from pytensor.tensor.shape import specify_broadcastable
+from pytensor.tensor.type import TensorType, Variable, complex_dtypes, tensor
 
 
 def structured_elemwise(tensor_op):
diff --git a/pytensor/sparse/variable.py b/pytensor/sparse/variable.py
index 04f5860de0..d9a97d7de1 100644
--- a/pytensor/sparse/variable.py
+++ b/pytensor/sparse/variable.py
@@ -30,8 +30,8 @@
 )
 from pytensor.sparse.type import SparseTensorType
 from pytensor.sparse.utils import hash_from_sparse
-from pytensor.tensor import iscalar
 from pytensor.tensor.shape import shape
+from pytensor.tensor.type import iscalar
 from pytensor.tensor.variable import (
     TensorConstant,
     TensorVariable,
diff --git a/pytensor/tensor/optimize.py b/pytensor/tensor/optimize.py
index 3bfc8be09f..26e22eb486 100644
--- a/pytensor/tensor/optimize.py
+++ b/pytensor/tensor/optimize.py
@@ -6,24 +6,35 @@
 
 import pytensor.scalar as ps
 from pytensor.compile.function import function
-from pytensor.gradient import grad, grad_not_implemented, jacobian
+from pytensor.gradient import DisconnectedType, grad, jacobian
 from pytensor.graph.basic import Apply, Constant
 from pytensor.graph.fg import FunctionGraph
-from pytensor.graph.op import ComputeMapType, HasInnerGraph, Op, StorageMapType
+from pytensor.graph.null_type import NullType
+from pytensor.graph.op import (
+    ComputeMapType,
+    HasInnerGraph,
+    Op,
+    StorageMapType,
+    io_connection_pattern,
+)
 from pytensor.graph.replace import graph_replace
-from pytensor.graph.traversal import ancestors, truncated_graph_inputs
+from pytensor.graph.traversal import (
+    ancestors,
+    truncated_graph_inputs,
+)
 from pytensor.scalar import ScalarType, ScalarVariable
+from pytensor.sparse.variable import SparseVariable
+from pytensor.tensor import as_tensor_variable
 from pytensor.tensor.basic import (
     atleast_2d,
-    concatenate,
     scalar_from_tensor,
     tensor,
     tensor_from_scalar,
     zeros_like,
 )
-from pytensor.tensor.math import dot
+from pytensor.tensor.math import tensordot
+from pytensor.tensor.reshape import pack, unpack
 from pytensor.tensor.slinalg import solve
-from pytensor.tensor.type import DenseTensorType
 from pytensor.tensor.variable import TensorVariable, Variable
 
 
@@ -143,36 +154,6 @@ def _find_optimization_parameters(
     ]
 
 
-def _get_parameter_grads_from_vector(
-    grad_wrt_args_vector: TensorVariable,
-    x_star: TensorVariable,
-    args: Sequence[TensorVariable | ScalarVariable],
-    output_grad: TensorVariable,
-) -> list[TensorVariable | ScalarVariable]:
-    """
-    Given a single concatenated vector of objective function gradients with respect to raveled optimization parameters,
-    returns the contribution of each parameter to the total loss function, with the unraveled shape of the parameter.
-    """
-    cursor = 0
-    grad_wrt_args = []
-
-    for arg in args:
-        arg_shape = arg.shape
-        arg_size = arg_shape.prod()
-        arg_grad = grad_wrt_args_vector[:, cursor : cursor + arg_size].reshape(
-            (*x_star.shape, *arg_shape)
-        )
-
-        grad_wrt_arg = dot(output_grad, arg_grad)
-        if isinstance(arg.type, ScalarType):
-            grad_wrt_arg = scalar_from_tensor(grad_wrt_arg)
-        grad_wrt_args.append(grad_wrt_arg)
-
-        cursor += arg_size
-
-    return grad_wrt_args
-
-
 class ScipyWrapperOp(Op, HasInnerGraph):
     """Shared logic for scipy optimization ops"""
 
@@ -233,6 +214,39 @@ def make_node(self, *inputs):
 
         return Apply(self, inputs, [self.inner_inputs[0].type(), ps.bool("success")])
 
+    def connection_pattern(self, node=None):
+        """
+        All Ops that inherit from ScipyWrapperOp share the same connection pattern logic, because they all share the
+        same output structure. There are two outputs: the optimized variable, and a success flag. The success flag is
+        not differentiable, so it is never connected. The optimized variable is connected only to inputs that are
+        both connected to the objective function and of float dtype.
+        """
+        fx = self.fgraph.outputs[0]
+        inputs = self.fgraph.inputs
+        grads = grad(
+            fx.sum(),
+            inputs,
+            disconnected_inputs="ignore",
+            null_gradients="return",
+            return_disconnected="disconnected",
+        )
+
+        return [
+            [
+                (
+                    connection
+                    and not isinstance(g.type, DisconnectedType)
+                    and isinstance(
+                        input, TensorVariable | ScalarVariable | SparseVariable
+                    )
+                ),
+                False,
+            ]
+            for input, g, connection in zip(
+                inputs, grads, io_connection_pattern(inputs, [fx])
+            )
+        ]
+
 
 class ScipyScalarWrapperOp(ScipyWrapperOp):
     def build_fn(self):
@@ -251,6 +265,74 @@ def build_fn(self):
 
         self._fn_wrapped = LRUCache1(fn)
 
+    def compute_implicit_gradients(
+        self,
+        x_star: TensorVariable,
+        args: Sequence[TensorVariable | ScalarVariable],
+        output_grad: TensorVariable,
+        is_minimization: bool,
+    ):
+        """
+        Compute gradients of a scalar optimization problem with respect to its parameters.
+
+        For details, see the docstring of ScipyVectorWrapperOp.compute_implicit_gradients.
+
+        Parameters
+        ----------
+        x_star : TensorVariable
+            The symbolic solution of the optimization problem.
+        args : Sequence of TensorVariable or ScalarVariable
+            The parameters of the optimization problem.
+        output_grad : TensorVariable
+            The gradient of the output of the optimization Op with respect to some scalar loss.
+        is_minimization : bool
+            Whether the optimization problem is a minimization problem. If False, it is assumed to be a root-finding
+            problem.
+        """
+        fgraph = self.fgraph
+        inner_x, *inner_args = self.inner_inputs
+        inner_fx = self.inner_outputs[0]
+
+        if is_minimization:
+            inner_fx = grad(inner_fx, inner_x)
+
+        df_dx, *arg_grads = grad(
+            inner_fx,
+            [inner_x, *inner_args],
+            disconnected_inputs="ignore",
+            null_gradients="return",
+            return_disconnected="disconnected",
+        )
+
+        outer_arg_grad_map = dict(zip(args, arg_grads))
+        valid_args_and_grads = [
+            (arg, g)
+            for arg, g in outer_arg_grad_map.items()
+            if not isinstance(g.type, DisconnectedType | NullType)
+        ]
+
+        if len(valid_args_and_grads) == 0:
+            # No differentiable arguments, return disconnected gradients
+            return arg_grads
+
+        outer_args_to_diff, df_dthetas = zip(*valid_args_and_grads)
+
+        replace = dict(zip(fgraph.inputs, (x_star, *args), strict=True))
+        df_dx_star, *df_dthetas_stars = graph_replace(
+            [df_dx, *df_dthetas], replace=replace
+        )
+
+        arg_to_grad = dict(zip(outer_args_to_diff, df_dthetas_stars))
+
+        grad_wrt_args = [
+            (-arg_to_grad[arg] / df_dx_star) * output_grad
+            if arg in arg_to_grad
+            else outer_arg_grad_map[arg]
+            for arg in args
+        ]
+
+        return grad_wrt_args
+
 
 class ScipyVectorWrapperOp(ScipyWrapperOp):
     def build_fn(self):
@@ -269,97 +351,145 @@ def build_fn(self):
         # self.fgraph = fn.maker.fgraph
         self._fn_wrapped = LRUCache1(fn)
 
+    def compute_implicit_gradients(
+        self,
+        x_star: TensorVariable,
+        args: Sequence[TensorVariable | ScalarVariable],
+        output_grad: TensorVariable,
+        is_minimization: bool,
+    ):
+        r"""
+        Compute gradients of an optimization problem with respect to its parameters.
+
+        Parameters
+        ----------
+        x_star : TensorVariable
+            The symbolic solution of the optimization problem.
+        args : Sequence of TensorVariable or ScalarVariable
+            The parameters of the optimization problem.
+        output_grad : TensorVariable
+            The gradient of the output of the optimization Op with respect to some scalar loss.
+        is_minimization : bool
+            Whether the optimization problem is a minimization problem. If False, it is assumed to be a root-finding
+            problem.
+
+        Notes
+        -----
+        The gradents are computed using the implicit function theorem. Given a fuction `f(x, theta) = 0`, and a function
+        `x_star(theta)` that, given input parameters theta returns `x` such that `f(x_star(theta), theta) = 0`, we can
+        compute the gradients of `x_star` with respect to `theta` as follows:
+
+        .. math::
+
+            \underbrace{\frac{\partial f}{\partial x}\left(x^*(\theta), \theta\right)}_{\text{Jacobian wrt } x}
+            \frac{d x^*(\theta)}{d \theta}
+            +
+            \underbrace{\frac{\partial f}{\partial \theta}\left(x^*(\theta), \theta\right)}_{\text{Jacobian wrt } \theta}
+            = 0
+
+        Which, after rearranging, gives us:
+
+        .. math::
+
+            \frac{d x^*(\theta)}{d \theta} = - \left(\frac{\partial f}{\partial x}\left(x^*(\theta),
+            \theta\right)\right)^{-1} \frac{\partial f}{\partial \theta}\left(x^*(\theta), \theta\right)
+
+        Note that this method assumes `f(x_star(theta), theta) = 0`; so it is not immediately applicable to a minimization
+        problem, where `f` is the objective function. In this case, we instead take `f` to be the gradient of the objective
+        function, which *is* indeed zero at the minimum.
+        """
+        fgraph = self.fgraph
+        inner_x, *inner_args = self.inner_inputs
+        implicit_f = self.inner_outputs[0]
 
-def scalar_implict_optimization_grads(
-    inner_fx: TensorVariable,
-    inner_x: TensorVariable,
-    inner_args: Sequence[TensorVariable | ScalarVariable],
-    args: Sequence[TensorVariable | ScalarVariable],
-    x_star: TensorVariable,
-    output_grad: TensorVariable,
-    fgraph: FunctionGraph,
-) -> list[TensorVariable | ScalarVariable]:
-    df_dx, *df_dthetas = grad(
-        inner_fx, [inner_x, *inner_args], disconnected_inputs="ignore"
-    )
-
-    replace = dict(zip(fgraph.inputs, (x_star, *args), strict=True))
-    df_dx_star, *df_dthetas_stars = graph_replace([df_dx, *df_dthetas], replace=replace)
-
-    grad_wrt_args = [
-        (-df_dtheta_star / df_dx_star) * output_grad
-        for df_dtheta_star in df_dthetas_stars
-    ]
-
-    return grad_wrt_args
-
-
-def implict_optimization_grads(
-    df_dx: TensorVariable,
-    df_dtheta_columns: Sequence[TensorVariable],
-    args: Sequence[TensorVariable | ScalarVariable],
-    x_star: TensorVariable,
-    output_grad: TensorVariable,
-    fgraph: FunctionGraph,
-) -> list[TensorVariable | ScalarVariable]:
-    r"""
-    Compute gradients of an optimization problem with respect to its parameters.
-
-    The gradents are computed using the implicit function theorem. Given a fuction `f(x, theta) =`, and a function
-    `x_star(theta)` that, given input parameters theta returns `x` such that `f(x_star(theta), theta) = 0`, we can
-    compute the gradients of `x_star` with respect to `theta` as follows:
-
-    .. math::
+        df_dx, *arg_grads = grad(
+            implicit_f.sum(),
+            [inner_x, *inner_args],
+            disconnected_inputs="ignore",
+            null_gradients="return",
+            return_disconnected="disconnected",
+        )
 
-        \underbrace{\frac{\partial f}{\partial x}\left(x^*(\theta), \theta\right)}_{\text{Jacobian wrt } x}
-        \frac{d x^*(\theta)}{d \theta}
-        +
-        \underbrace{\frac{\partial f}{\partial \theta}\left(x^*(\theta), \theta\right)}_{\text{Jacobian wrt } \theta}
-        = 0
+        inner_args_to_diff = [
+            arg
+            for arg, g in zip(inner_args, arg_grads)
+            if not isinstance(g.type, DisconnectedType | NullType)
+        ]
+
+        if len(inner_args_to_diff) == 0:
+            # No differentiable arguments, return disconnected/null gradients
+            return arg_grads
+
+        outer_args_to_diff = [
+            arg
+            for inner_arg, arg in zip(inner_args, args)
+            if inner_arg in inner_args_to_diff
+        ]
+        invalid_grad_map = {
+            arg: g for arg, g in zip(args, arg_grads) if arg not in outer_args_to_diff
+        }
+
+        if is_minimization:
+            implicit_f = grad(implicit_f, inner_x)
+
+        # Gradients are computed using the inner graph of the optimization op, not the actual inputs/outputs of the op.
+        packed_inner_args, packed_arg_shapes, implicit_f = pack_inputs_of_objective(
+            implicit_f,
+            inner_args_to_diff,
+        )
 
-    Which, after rearranging, gives us:
+        df_dx, df_dtheta = jacobian(
+            implicit_f,
+            [inner_x, packed_inner_args],
+            disconnected_inputs="ignore",
+            vectorize=self.use_vectorized_jac,
+        )
 
-    .. math::
+        # Replace inner inputs (abstract dummies) with outer inputs (the actual user-provided symbols)
+        # at the solution point. Innner arguments aren't needed anymore, delete them to avoid accidental references.
+        del inner_x
+        del inner_args
+        inner_to_outer_map = dict(zip(fgraph.inputs, (x_star, *args)))
+        df_dx_star, df_dtheta_star = graph_replace(
+            [df_dx, df_dtheta], inner_to_outer_map
+        )
 
-        \frac{d x^*(\theta)}{d \theta} = - \left(\frac{\partial f}{\partial x}\left(x^*(\theta), \theta\right)\right)^{-1} \frac{\partial f}{\partial \theta}\left(x^*(\theta), \theta\right)
+        if df_dtheta_star.ndim == 0 or df_dx_star.ndim == 0:
+            grad_wrt_args_packed = -(df_dtheta_star / df_dx_star)
+        else:
+            grad_wrt_args_packed = solve(-atleast_2d(df_dx_star), df_dtheta_star)
 
-    Note that this method assumes `f(x_star(theta), theta) = 0`; so it is not immediately applicable to a minimization
-    problem, where `f` is the objective function. In this case, we instead take `f` to be the gradient of the objective
-    function, which *is* indeed zero at the minimum.
+        if packed_arg_shapes is not None:
+            packed_shapes_from_outer = graph_replace(
+                packed_arg_shapes, inner_to_outer_map, strict=False
+            )
+            grad_wrt_args = unpack(
+                grad_wrt_args_packed,
+                packed_shapes=packed_shapes_from_outer,
+                axes=0 if not all(inp.ndim == 0 for inp in (x_star, *args)) else None,
+            )
+        else:
+            grad_wrt_args = [grad_wrt_args_packed]
 
-    Parameters
-    ----------
-    df_dx : Variable
-        The Jacobian of the objective function with respect to the variable `x`.
-    df_dtheta_columns : Sequence[Variable]
-        The Jacobians of the objective function with respect to the optimization parameters `theta`.
-        Each column (or columns) corresponds to a different parameter. Should be returned by pytensor.gradient.jacobian.
-    args : Sequence[Variable]
-        The optimization parameters `theta`.
-    x_star : Variable
-        Symbolic graph representing the value of the variable `x` such that `f(x_star, theta) = 0 `.
-    output_grad : Variable
-        The gradient of the output with respect to the objective function.
-    fgraph : FunctionGraph
-        The function graph that contains the inputs and outputs of the optimization problem.
-    """
-    df_dtheta = concatenate(
-        [atleast_2d(jac_col, left=False) for jac_col in df_dtheta_columns],
-        axis=-1,
-    )
+        arg_to_grad = dict(zip(outer_args_to_diff, grad_wrt_args))
 
-    replace = dict(zip(fgraph.inputs, (x_star, *args), strict=True))
+        final_grads = []
+        for arg in args:
+            arg_grad = arg_to_grad.get(arg, None)
 
-    df_dx_star, df_dtheta_star = graph_replace(
-        [atleast_2d(df_dx), df_dtheta], replace=replace
-    )
+            if arg_grad is None:
+                final_grads.append(invalid_grad_map[arg])
+                continue
 
-    grad_wrt_args_vector = solve(-df_dx_star, df_dtheta_star)
-    grad_wrt_args = _get_parameter_grads_from_vector(
-        grad_wrt_args_vector, x_star, args, output_grad
-    )
+            if arg_grad.ndim > 0 and output_grad.ndim > 0:
+                g = tensordot(output_grad, arg_grad, [[0], [0]])
+            else:
+                g = arg_grad * output_grad
+            if isinstance(arg.type, ScalarType) and isinstance(g, TensorVariable):
+                g = scalar_from_tensor(g)
+            final_grads.append(g)
 
-    return grad_wrt_args
+        return final_grads
 
 
 class MinimizeScalarOp(ScipyScalarWrapperOp):
@@ -418,36 +548,17 @@ def perform(self, node, inputs, outputs):
     def L_op(self, inputs, outputs, output_grads):
         # TODO: Handle disconnected inputs
         x, *args = inputs
-        if non_supported_types := tuple(
-            inp.type
-            for inp in inputs
-            if not isinstance(inp.type, DenseTensorType | ScalarType)
-        ):
-            # TODO: Support SparseTensorTypes
-            # TODO: Remaining types are likely just disconnected anyway
-            msg = f"Minimize gradient not implemented due to inputs of type {non_supported_types}"
-            return [
-                grad_not_implemented(self, i, inp, msg) for i, inp in enumerate(inputs)
-            ]
         x_star, _ = outputs
         output_grad, _ = output_grads
 
-        inner_x, *inner_args = self.fgraph.inputs
-        inner_fx = self.fgraph.outputs[0]
-
-        implicit_f = grad(inner_fx, inner_x)
-
-        grad_wrt_args = scalar_implict_optimization_grads(
-            inner_fx=implicit_f,
-            inner_x=inner_x,
-            inner_args=inner_args,
-            args=args,
+        d_xstar_d_theta = self.compute_implicit_gradients(
             x_star=x_star,
+            args=args,
             output_grad=output_grad,
-            fgraph=self.fgraph,
+            is_minimization=True,
         )
 
-        return [zeros_like(x), *grad_wrt_args]
+        return [zeros_like(x), *d_xstar_d_theta]
 
 
 def minimize_scalar(
@@ -578,54 +689,62 @@ def perform(self, node, inputs, outputs):
         outputs[1][0] = np.bool_(res.success)
 
     def L_op(self, inputs, outputs, output_grads):
-        # TODO: Handle disconnected inputs
         x, *args = inputs
-        if non_supported_types := tuple(
-            inp.type
-            for inp in inputs
-            if not isinstance(inp.type, DenseTensorType | ScalarType)
-        ):
-            # TODO: Support SparseTensorTypes
-            # TODO: Remaining types are likely just disconnected anyway
-            msg = f"MinimizeOp gradient not implemented due to inputs of type {non_supported_types}"
-            return [
-                grad_not_implemented(self, i, inp, msg) for i, inp in enumerate(inputs)
-            ]
         x_star, _success = outputs
         output_grad, _ = output_grads
 
-        inner_x, *inner_args = self.fgraph.inputs
-        inner_fx = self.fgraph.outputs[0]
-
-        implicit_f = grad(inner_fx, inner_x)
-
-        df_dx, *df_dtheta_columns = jacobian(
-            implicit_f,
-            [inner_x, *inner_args],
-            disconnected_inputs="ignore",
-            vectorize=self.use_vectorized_jac,
-        )
-        grad_wrt_args = implict_optimization_grads(
-            df_dx=df_dx,
-            df_dtheta_columns=df_dtheta_columns,
-            args=args,
+        d_xstar_d_theta = self.compute_implicit_gradients(
             x_star=x_star,
+            args=args,
             output_grad=output_grad,
-            fgraph=self.fgraph,
+            is_minimization=True,
+        )
+
+        return [zeros_like(x), *d_xstar_d_theta]
+
+
+def pack_inputs_of_objective(
+    objective: TensorVariable,
+    x: TensorVariable | Sequence[TensorVariable],
+) -> tuple[TensorVariable, list[TensorVariable] | None, TensorVariable]:
+    """
+    Packs inputs `x` into a single tensor if `x` is a sequence of tensors, and rewrites the `objective` graph
+    to use the packed input. Also returns the packed shapes for unpacking the output later.
+
+    If `x` is a single tensor, it is returned as is, and no rewriting is done.
+    """
+    packed_shapes = None
+
+    if not isinstance(x, Sequence):
+        packed_input = x
+    elif len(x) == 1:
+        packed_input = x[0]
+    else:
+        packed_input, packed_shapes = pack(*x, axes=None)
+        unpacked_output = unpack(packed_input, axes=None, packed_shapes=packed_shapes)
+
+        objective = graph_replace(
+            objective,
+            {
+                xi: ui.astype(xi.type.dtype)
+                if not (isinstance(xi.type, ScalarType))
+                else scalar_from_tensor(ui.astype(xi.type.dtype))
+                for xi, ui in zip(x, unpacked_output)
+            },
         )
 
-        return [zeros_like(x), *grad_wrt_args]
+    return packed_input, packed_shapes, objective
 
 
 def minimize(
     objective: TensorVariable,
-    x: TensorVariable,
+    x: TensorVariable | Sequence[TensorVariable],
     method: str = "BFGS",
     jac: bool = True,
     hess: bool = False,
     use_vectorized_jac: bool = False,
     optimizer_kwargs: dict | None = None,
-) -> tuple[TensorVariable, TensorVariable]:
+) -> tuple[TensorVariable | tuple[TensorVariable, ...], TensorVariable]:
     """
     Minimize a scalar objective function using scipy.optimize.minimize.
 
@@ -633,8 +752,8 @@ def minimize(
     ----------
     objective : TensorVariable
         The objective function to minimize. This should be a pytensor variable representing a scalar value.
-    x: TensorVariable
-        The variable with respect to which the objective function is minimized. It must be an input to the
+    x: TensorVariable or list of TensorVariable
+        The variable or variables with respect to which the objective function is minimized. It must be an input to the
         computational graph of `objective`.
     method: str, optional
         The optimization method to use. Default is "BFGS". See scipy.optimize.minimize for other options.
@@ -653,18 +772,21 @@ def minimize(
 
     Returns
     -------
-    solution: TensorVariable
-        The optimized value of the vector of inputs `x` that minimizes `objective(x, *args)`. If the success flag
+    solution: TensorVariable or tuple of TensorVariable
+        The optimized value of each of inputs in `x` that minimizes `objective(x, *args)`. If the success flag
         is False, this will be the final state of the minimization routine, but not necessarily a minimum.
 
     success: TensorVariable
         Symbolic boolean flag indicating whether the minimization routine reported convergence to a minimum
         value, based on the requested convergence criteria.
     """
-    args = _find_optimization_parameters(objective, x)
+    objective = as_tensor_variable(objective)
+
+    packed_input, packed_shapes, objective = pack_inputs_of_objective(objective, x)
+    args = _find_optimization_parameters(objective, packed_input)
 
     minimize_op = MinimizeOp(
-        x,
+        packed_input,
         *args,
         objective=objective,
         method=method,
@@ -674,7 +796,10 @@ def minimize(
         optimizer_kwargs=optimizer_kwargs,
     )
 
-    solution, success = minimize_op(x, *args)
+    solution, success = minimize_op(packed_input, *args)
+
+    if packed_shapes is not None:
+        solution = unpack(solution, axes=None, packed_shapes=packed_shapes)
 
     return solution, success
 
@@ -757,36 +882,18 @@ def perform(self, node, inputs, outputs):
         outputs[1][0] = np.bool_(res.converged)
 
     def L_op(self, inputs, outputs, output_grads):
-        # TODO: Handle disconnected inputs
         x, *args = inputs
-        if non_supported_types := tuple(
-            inp.type
-            for inp in inputs
-            if not isinstance(inp.type, DenseTensorType | ScalarType)
-        ):
-            # TODO: Support SparseTensorTypes
-            # TODO: Remaining types are likely just disconnected anyway
-            msg = f"RootScalarOp gradient not implemented due to inputs of type {non_supported_types}"
-            return [
-                grad_not_implemented(self, i, inp, msg) for i, inp in enumerate(inputs)
-            ]
         x_star, _ = outputs
         output_grad, _ = output_grads
 
-        inner_x, *inner_args = self.fgraph.inputs
-        inner_fx = self.fgraph.outputs[0]
-
-        grad_wrt_args = scalar_implict_optimization_grads(
-            inner_fx=inner_fx,
-            inner_x=inner_x,
-            inner_args=inner_args,
-            args=args,
+        d_xstar_d_theta = self.compute_implicit_gradients(
             x_star=x_star,
+            args=args,
             output_grad=output_grad,
-            fgraph=self.fgraph,
+            is_minimization=False,
         )
 
-        return [zeros_like(x), *grad_wrt_args]
+        return [zeros_like(x), *d_xstar_d_theta]
 
 
 def root_scalar(
@@ -948,57 +1055,28 @@ def perform(self, node, inputs, outputs):
         outputs[1][0] = np.bool_(res.success)
 
     def L_op(self, inputs, outputs, output_grads):
-        # TODO: Handle disconnected inputs
         x, *args = inputs
-        if non_supported_types := tuple(
-            inp.type
-            for inp in inputs
-            if not isinstance(inp.type, DenseTensorType | ScalarType)
-        ):
-            # TODO: Support SparseTensorTypes
-            # TODO: Remaining types are likely just disconnected anyway
-            msg = f"RootOp gradient not implemented due to inputs of type {non_supported_types}"
-            return [
-                grad_not_implemented(self, i, inp, msg) for i, inp in enumerate(inputs)
-            ]
         x_star, _ = outputs
         output_grad, _ = output_grads
 
-        inner_x, *inner_args = self.fgraph.inputs
-        inner_fx = self.fgraph.outputs[0]
-
-        df_dx = (
-            jacobian(inner_fx, inner_x, vectorize=self.use_vectorized_jac)
-            if not self.jac
-            else self.fgraph.outputs[1]
-        )
-        df_dtheta_columns = jacobian(
-            inner_fx,
-            inner_args,
-            disconnected_inputs="ignore",
-            vectorize=self.use_vectorized_jac,
-        )
-
-        grad_wrt_args = implict_optimization_grads(
-            df_dx=df_dx,
-            df_dtheta_columns=df_dtheta_columns,
-            args=args,
+        d_xstar_d_theta = self.compute_implicit_gradients(
             x_star=x_star,
+            args=args,
             output_grad=output_grad,
-            fgraph=self.fgraph,
+            is_minimization=False,
         )
 
-        return [zeros_like(x), *grad_wrt_args]
+        return [zeros_like(x), *d_xstar_d_theta]
 
 
 def root(
     equations: TensorVariable,
-    variables: TensorVariable,
+    variables: TensorVariable | Sequence[TensorVariable],
     method: str = "hybr",
     jac: bool = True,
     use_vectorized_jac: bool = False,
     optimizer_kwargs: dict | None = None,
-) -> tuple[TensorVariable, TensorVariable]:
+) -> tuple[TensorVariable | Sequence[TensorVariable], TensorVariable]:
     """
     Find roots of a system of equations using scipy.optimize.root.
 
@@ -1032,11 +1110,13 @@ def root(
     success: TensorVariable
         Boolean indicating whether the root-finding was successful. If True, the solution is a root of the equation
     """
-
-    args = _find_optimization_parameters(equations, variables)
+    packed_variables, packed_shapes, equations = pack_inputs_of_objective(
+        equations, variables
+    )
+    args = _find_optimization_parameters(equations, packed_variables)
 
     root_op = RootOp(
-        variables,
+        packed_variables,
         *args,
         equations=equations,
         method=method,
@@ -1045,7 +1125,9 @@ def root(
         use_vectorized_jac=use_vectorized_jac,
     )
 
-    solution, success = root_op(variables, *args)
+    solution, success = root_op(packed_variables, *args)
+    if packed_shapes is not None:
+        solution = unpack(solution, axes=None, packed_shapes=packed_shapes)
 
     return solution, success
 
diff --git a/pytensor/tensor/reshape.py b/pytensor/tensor/reshape.py
index f556af2a75..385202949d 100644
--- a/pytensor/tensor/reshape.py
+++ b/pytensor/tensor/reshape.py
@@ -11,7 +11,6 @@
 from pytensor.graph.replace import _vectorize_node
 from pytensor.tensor import TensorLike, as_tensor_variable
 from pytensor.tensor.basic import expand_dims, infer_static_shape, join, split
-from pytensor.tensor.extra_ops import squeeze
 from pytensor.tensor.math import prod
 from pytensor.tensor.shape import ShapeValueType, shape
 from pytensor.tensor.type import tensor
@@ -66,7 +65,7 @@ def infer_shape(self, fgraph, node, shapes):
         [input_shape] = shapes
         axis_range = self.axis_range
 
-        joined_shape = prod([input_shape[i] for i in axis_range])
+        joined_shape = prod([input_shape[i] for i in axis_range], dtype=int)
         return [self.output_shapes(input_shape, joined_shape)]
 
     def perform(self, node, inputs, outputs):
@@ -92,7 +91,7 @@ def L_op(
 
         x_shape = shape(x)
         packed_shape = [x_shape[i] for i in self.axis_range]
-        return [split_dims(g_out, shape=packed_shape, axis=self.start_axis)]
+        return [SplitDims(axis=self.start_axis)(g_out, shape=packed_shape)]  # type: ignore[list-item]
 
 
 @_vectorize_node.register(JoinDims)
@@ -215,9 +214,10 @@ def L_op(
         (g_out,) = output_grads
 
         n_axes = g_out.ndim - x.ndim + 1  # type: ignore[attr-defined]
-        axis_range = list(range(self.axis, self.axis + n_axes))
-
-        return [join_dims(g_out, axis=axis_range), DisconnectedType()()]
+        return [
+            JoinDims(start_axis=self.axis, n_axes=n_axes)(g_out),  # type: ignore[list-item]
+            DisconnectedType()(),
+        ]
 
 
 @_vectorize_node.register(SplitDims)
@@ -277,12 +277,6 @@ def split_dims(
     else:
         shape = list(shape)  # type: ignore[arg-type]
 
-    if not shape:
-        # If we get an empty shape, there is potentially a dummy dimension at the requested axis. This happens for
-        # example when splitting a packed tensor that had its dims expanded before packing (e.g. when packing shapes
-        # (3, ) and (3, 3) to (3, 4)
-        return squeeze(x, axis=axis)  # type: ignore[no-any-return]
-
     [axis] = normalize_axis_tuple(axis, x.ndim)  # type: ignore[misc]
     shape = as_tensor_variable(shape, dtype="int64", ndim=1)  # type: ignore[arg-type]
 
@@ -535,12 +529,17 @@ def unpack(
                 "Unpack must have exactly one more dimension that implied by axes"
             ) from err
 
-    split_inputs = split(
-        packed_input,
-        splits_size=[prod(shape, dtype=int) for shape in packed_shapes],
-        n_splits=len(packed_shapes),
-        axis=split_axis,
-    )
+    n_splits = len(packed_shapes)
+    if n_splits == 1:
+        # If there is only one tensor to unpack, no need to split
+        split_inputs = [packed_input]
+    else:
+        split_inputs = split(
+            packed_input,
+            splits_size=[prod(shape, dtype=int) for shape in packed_shapes],
+            n_splits=n_splits,
+            axis=split_axis,
+        )
 
     return [
         split_dims(inp, shape, split_axis)
diff --git a/tests/tensor/test_optimize.py b/tests/tensor/test_optimize.py
index b211381e30..e42550ded3 100644
--- a/tests/tensor/test_optimize.py
+++ b/tests/tensor/test_optimize.py
@@ -4,7 +4,10 @@
 import pytensor
 import pytensor.tensor as pt
 from pytensor import Variable, config, function
-from pytensor.gradient import NullTypeGradError, disconnected_type
+from pytensor.gradient import (
+    DisconnectedInputError,
+    disconnected_type,
+)
 from pytensor.graph import Apply, Op, Type
 from pytensor.tensor import alloc, scalar, scalar_from_tensor, tensor_from_scalar
 from pytensor.tensor.optimize import minimize, minimize_scalar, root, root_scalar
@@ -69,8 +72,8 @@ def test_simple_minimize():
         rtol=1e-8 if config.floatX == "float64" else 1e-6,
     )
 
-    def f(x, a, b):
-        objective = (x - a * b) ** 2
+    def f(x, a, c):
+        objective = (x - a * c) ** 2
         out = minimize(objective, x)[0]
         return out
 
@@ -96,7 +99,13 @@ def rosenbrock_shifted_scaled(x, a, b):
 
     objective = rosenbrock_shifted_scaled(x, a, b)
     minimized_x, success = minimize(
-        objective, x, method=method, jac=jac, hess=hess, optimizer_kwargs={"tol": 1e-16}
+        objective,
+        x,
+        method=method,
+        jac=jac,
+        hess=hess,
+        optimizer_kwargs={"tol": 1e-16},
+        use_vectorized_jac=True,
     )
 
     fn = pytensor.function([x, a, b], [minimized_x, success])
@@ -119,12 +128,144 @@ def rosenbrock_shifted_scaled(x, a, b):
 
     def f(x, a, b):
         objective = rosenbrock_shifted_scaled(x, a, b)
-        out = minimize(objective, x)[0]
+        out = minimize(objective, x, use_vectorized_jac=False)[0]
         return out
 
     utt.verify_grad(f, [x0, a_val, b_val], eps=1e-3 if floatX == "float32" else 1e-6)
 
 
+def test_optimize_multiple_minimands():
+    """
+    Test optimization with many input variables of different shapes, as occurs in a PyMC model.
+    """
+    x0, x1, x2 = pt.dvectors("x1", "x2", "d3")
+    x3 = pt.dmatrix("x3")
+    b0, b1, b2 = pt.dscalars("b0", "b1", "b2")
+    b3 = pt.dvector("b3")
+
+    y = pt.dvector("y")
+
+    y_hat = x0 * b0 + x1 * b1 + x2 * b2 + x3 @ b3
+    objective = ((y - y_hat) ** 2).sum()
+
+    minimized_x, success = minimize(
+        objective,
+        [b0, b1, b2, b3],
+        jac=True,
+        hess=True,
+        method="Newton-CG",
+        use_vectorized_jac=True,
+    )
+
+    assert len(minimized_x) == 4
+
+    fn = pytensor.function([b0, b1, b2, b3, x0, x1, x2, x3, y], [*minimized_x, success])
+
+    rng = np.random.default_rng()
+    X = rng.normal(size=(100, 3)).astype(floatX)
+    X3 = rng.normal(size=(100, 5)).astype(floatX)
+    b_vec = rng.normal(size=(8,)).astype(floatX)
+    true_b = [b_vec[0], b_vec[1], b_vec[2], b_vec[3:]]
+    true_y = X @ b_vec[0:3] + X3 @ b_vec[3:]
+    init_b = np.zeros((8,)).astype(floatX)
+
+    inputs = (
+        init_b[0],
+        init_b[1],
+        init_b[2],
+        init_b[3:],
+        X[:, 0],
+        X[:, 1],
+        X[:, 2],
+        X3,
+        true_y,
+    )
+
+    *estimated_b, success = fn(*inputs)
+    assert success
+    for est, true in zip(estimated_b, true_b):
+        np.testing.assert_allclose(
+            est,
+            true,
+            atol=1e-5,
+            rtol=1e-5,
+        )
+
+    def f(b0, b1, b2, b3, x0, x1, x2, x3, y):
+        y_hat = x0 * b0 + x1 * b1 + x2 * b2 + x3 @ b3
+        objective = ((y - y_hat) ** 2).sum()
+        result = minimize(
+            objective,
+            [b0, b1, b2, b3],
+            jac=True,
+            hess=True,
+            method="trust-ncg",
+            use_vectorized_jac=True,
+        )[0]
+        return pt.sum([x.sum() for x in result])
+
+    utt.verify_grad(f, inputs, eps=1e-6)
+
+
+def test_minimize_mvn_logp_mu_and_cov():
+    """Regression test for https://github.com/pymc-devs/pytensor/issues/1550"""
+    d = 3
+
+    def objective(mu, cov, data):
+        L = pt.linalg.cholesky(cov)
+        _, logdet = pt.linalg.slogdet(cov)
+
+        v = mu - data
+        y = pt.linalg.solve_triangular(L, v, lower=True)
+        quad_term = (y**2).sum()
+
+        return 0.5 * (d * pt.log(2 * np.pi) + logdet + quad_term)
+
+    data = pt.vector("data", shape=(d,))
+    mu = pt.vector("mu", shape=(d,))
+    cov = pt.dmatrix("cov", shape=(d, d))
+
+    neg_logp = objective(mu, cov, data)
+    mu_star, success = minimize(
+        objective=neg_logp,
+        x=mu,
+        method="BFGS",
+        jac=True,
+        hess=False,
+        use_vectorized_jac=True,
+    )
+
+    # This replace + gradient was the original source of the error in #1550, check that no longer raises
+    y_star = pytensor.graph_replace(neg_logp, {mu: mu_star})
+    _ = pt.grad(y_star, [mu, cov, data])
+
+    rng = np.random.default_rng()
+    data_val = rng.normal(size=(d,)).astype(floatX)
+
+    L = rng.normal(size=(d, d)).astype(floatX)
+    cov_val = L @ L.T
+    mu0_val = rng.normal(size=(d,)).astype(floatX)
+
+    fn = pytensor.function([mu, cov, data], [mu_star, success])
+    _, success_flag = fn(mu0_val, cov_val, data_val)
+    assert success_flag
+
+    def min_fn(mu, cov, data):
+        mu_star, _ = minimize(
+            objective=objective(mu, cov, data),
+            x=mu,
+            method="BFGS",
+            jac=True,
+            hess=False,
+            use_vectorized_jac=True,
+        )
+        return mu_star.sum()
+
+    utt.verify_grad(
+        min_fn, [mu0_val, cov_val, data_val], eps=1e-3 if floatX == "float32" else 1e-6
+    )
+
+
 @pytest.mark.parametrize(
     "method, jac, hess",
     [("secant", False, False), ("newton", True, False), ("halley", True, True)],
@@ -224,6 +365,39 @@ def root_fn(x, a, b):
     )
 
 
+def test_root_system_multiple_inputs():
+    # Example problem from Notes and Problems from Applied General Equilibrium Economics, Chapter 3
+
+    variables = v1, v2 = [pt.dscalar(name) for name in ["v1", "v2"]]
+    v3 = pt.dscalar("v3")
+    equations = pt.stack([v1**2 * v3 - 1, v1 + v2 - 2])
+
+    def f_analytic(v3):
+        v1 = 1 / np.sqrt(v3)
+        v2 = 2 - v1
+        return np.array([v1, v2])
+
+    solution, success = root(equations=equations, variables=variables)
+    fn = pytensor.function([v1, v2, v3], [*solution, success])
+
+    v1_val = 1.0
+    v2_val = 1.0
+    v3_val = 1.0
+
+    *solution_vals, success_flag = fn(v1_val, v2_val, v3_val)
+    assert success_flag
+    np.testing.assert_allclose(np.array(solution_vals), f_analytic(v3_val))
+
+    def root_fn(v1, v2, v3):
+        equations = pt.stack([v1**2 * v3 - 1, v1 + v2 - 2])
+        [v1_solution, v2_solution], _ = root(equations=equations, variables=[v1, v2])
+        return v1_solution + v2_solution
+
+    utt.verify_grad(
+        root_fn, [v1_val, v2_val, v3_val], eps=1e-6 if floatX == "float64" else 1e-3
+    )
+
+
 @pytest.mark.parametrize("optimize_op", (minimize, root))
 def test_optimize_0d(optimize_op):
     # Scipy vector minimizers upcast 0d x to 1d. We need to work-around this
@@ -267,7 +441,11 @@ def test_optimize_grad_scalar_arg(optimize_op):
     np.testing.assert_allclose(grad_wrt_theta.eval({x: np.pi, theta: np.e}), -1)
 
 
-@pytest.mark.parametrize("optimize_op", (minimize, minimize_scalar, root, root_scalar))
+@pytest.mark.parametrize(
+    "optimize_op",
+    (minimize, minimize_scalar, root, root_scalar),
+    ids=["minimize", "minimize_scalar", "root", "root_scalar"],
+)
 def test_optimize_grad_disconnected_numerical_inp(optimize_op):
     x = scalar("x", dtype="float64")
     theta = scalar("theta", dtype="int64")
@@ -277,13 +455,14 @@ def test_optimize_grad_disconnected_numerical_inp(optimize_op):
     # Confirm theta is a direct input to the node
     assert x0.owner.inputs[1] is theta
 
-    # This should technically raise, but does not right now
-    grad_wrt_theta = pt.grad(x0, theta, disconnected_inputs="raise")
-    np.testing.assert_allclose(grad_wrt_theta.eval({x: np.pi, theta: 5}), 0)
+    with pytest.raises(DisconnectedInputError):
+        pt.grad(x0, theta, disconnected_inputs="raise")
 
     # This should work even if the previous one raised
     grad_wrt_theta = pt.grad(x0, theta, disconnected_inputs="ignore")
-    np.testing.assert_allclose(grad_wrt_theta.eval({x: np.pi, theta: 5}), 0)
+    np.testing.assert_allclose(
+        grad_wrt_theta.eval({x: np.pi, theta: 5}, on_unused_input="ignore"), 0
+    )
 
 
 @pytest.mark.parametrize("optimize_op", (minimize, minimize_scalar, root, root_scalar))
@@ -344,11 +523,73 @@ def L_op(self, inputs, outputs, output_gradients):
         np.e,
     )
 
-    with pytest.raises(NullTypeGradError):
+    with pytest.raises(DisconnectedInputError):
         pt.grad(x_star, str_theta, disconnected_inputs="raise")
 
-    # This could be supported, but it is not right now.
-    with pytest.raises(NullTypeGradError):
-        _grad_wrt_num_theta = pt.grad(x_star, num_theta, disconnected_inputs="raise")
-    # np.testing.assert_allclose(grad_wrt_num_theta.eval({x: np.pi, num_theta: np.e, str_theta: ":)"}), -1)
-    # np.testing.assert_allclose(grad_wrt_num_theta.eval({x: np.pi, num_theta: np.e, str_theta: ":("}), 1)
+    grad_wrt_num_theta = pt.grad(x_star, num_theta, disconnected_inputs="raise")
+    np.testing.assert_allclose(
+        grad_wrt_num_theta.eval({x: np.pi, num_theta: np.e, str_theta: ":)"}), -1
+    )
+    np.testing.assert_allclose(
+        grad_wrt_num_theta.eval({x: np.pi, num_theta: np.e, str_theta: ":("}), 1
+    )
+
+
+def test_vectorize_root_gradients():
+    """Regression test for https://github.com/pymc-devs/pytensor/issues/1586"""
+    a, x, y = pt.dscalars("a", "x", "y")
+
+    eq_1 = a * x**2 - y - 1
+    eq_2 = x - a * y**2 + 1
+
+    [x_star, y_star], _ = pt.optimize.root(
+        equations=pt.stack([eq_1, eq_2]),
+        variables=[x, y],
+        method="hybr",
+        optimizer_kwargs={"tol": 1e-8},
+    )
+    solution = pt.stack([x_star, y_star])
+    a_grad = pt.grad(solution.sum(), a)
+    a_grid = pt.dmatrix("a_grid")
+
+    solution_grid, a_grad_grid = pytensor.graph.vectorize_graph(
+        [solution, a_grad], {a: a_grid}
+    )
+
+    def analytical_roots_and_grad(
+        a_vals: np.ndarray,
+    ) -> tuple[np.ndarray, np.ndarray]:
+        a = a_vals
+        # There are 4 roots to this equation, but we're always starting the optimizer at (1, 1) to control which one
+        # we get. For a > 0, this is the root with both x and y positive.
+        solution_grid = np.array(
+            (
+                -1 + (np.sqrt(4 * a + 1) + 1) ** 2 / (4 * a),
+                (np.sqrt(4 * a + 1) + 1) / (2 * a),
+            )
+        )
+
+        # Derivative of the sum of the two solutions w.r.t. a
+        dx_da = (np.sqrt(4 * a + 1) + 1) / (a * np.sqrt(4 * a + 1)) + 1 / (
+            a * np.sqrt(4 * a + 1)
+        )
+        dy_da = -((np.sqrt(4 * a + 1) + 1) ** 2) / (4 * a**2) - (
+            np.sqrt(4 * a + 1) + 1
+        ) / (2 * a**2)
+        solution_a_grad = dx_da + dy_da
+
+        return solution_grid.transpose((1, 2, 0)), solution_a_grad
+
+    fn = pytensor.function(
+        [a_grid, x, y],
+        [solution_grid, a_grad_grid],
+        on_unused_input="ignore",
+    )
+
+    a_grid = np.linspace(1, 10, 9).reshape((3, 3))
+    solution_grid_val, a_grad_grid_val = fn(a_grid=a_grid, x=1.0, y=1.0)
+
+    analytical_solution_grid, analytical_a_grad_grid = analytical_roots_and_grad(a_grid)
+
+    np.testing.assert_allclose(solution_grid_val, analytical_solution_grid)
+    np.testing.assert_allclose(a_grad_grid_val, analytical_a_grad_grid)