
Commit 5b072ce

kayween authored and facebook-github-bot committed
An improved elliptical slice sampling implementation (#2426)
Summary:

## Motivation

Elliptical slice sampling for truncated normal distributions (e.g., [Gessner et al., 2020](https://arxiv.org/abs/1910.09328)) requires constructing the active intervals corresponding to the intersection of the ellipse and the domain. One method of constructing the active intervals is based on likelihood testing, which tests whether each intersection angle is active or not. The current BoTorch implementation follows this idea and takes $\mathcal{O}(m^2 d)$ time per iteration, where $d$ is the dimensionality and $m$ is the number of linear inequality constraints. However, there exists a faster algorithm that computes the active intervals in $\mathcal{O}(m \log m)$ time ([Wu and Gardner, 2024](https://arxiv.org/abs/2407.10449)). This PR implements Algorithm 3 of that paper and dramatically accelerates truncated normal sampling in high dimensions.

In addition, this PR implements batch MCMC to launch multiple Markov chains, which further speeds up sampling by better exploiting GPU parallelism. Users can pass an additional argument `num_chains` to specify how many chains to run in parallel:

```
batch_sampler = LinearEllipticalSliceSampler(
    inequality_constraints=(A, b),
    check_feasibility=True,
    num_chains=100,  # launch 100 Markov chains in parallel
)
samples = batch_sampler.draw(n)  # returns 100 * n samples
```

### Have you read the [Contributing Guidelines on pull requests](https://github.com/pytorch/botorch/blob/main/CONTRIBUTING.md#pull-requests)?

Yes.

Pull Request resolved: #2426

Test Plan:
1. This implementation passes all existing test cases in `test/utils/probability/test_lin_ess.py`.
2. Two test cases [here](https://github.com/pytorch/botorch/blob/0455dc3945c5e89eefa2dfeb7abab8c4f4079b36/test/utils/probability/test_lin_ess.py#L363-L396) had to be modified, because the function `self._find_active_intersections` was replaced with `self._find_active_intersection_angles`, which has a different output format.
3. I have added additional test cases for batch MCMC.

### Some Plots

Attached is a plot demonstrating that this new algorithm accelerates truncated normal sampling in high dimensions. This experiment was run with `torch.float32`; I have observed a similar speed-up with double precision.

<p>
<img src="https://github.com/pytorch/botorch/assets/37524685/9c86d293-69aa-4911-ad7e-9c4c0ead655f" width=49% />
<img src="https://github.com/pytorch/botorch/assets/37524685/004f5fc8-6b88-462d-9a34-7d20b7e59a32" width=49% />
</p>

The following is a plot of samples obtained by running 500 Markov chains in parallel for a univariate truncated normal distribution.

<img src="https://github.com/pytorch/botorch/assets/37524685/a935b106-5376-4339-bf9a-fd03ee02e46a" width=40% />

```
torch.manual_seed(0)

lb, ub = -1, 3  # domain is lb <= x <= ub

A = torch.tensor([[-1.], [1.]], device=device)
b = torch.tensor([-lb, ub], device=device)
x = torch.zeros(1, device=device) + 0.5 * (lb + ub)

sampler = LinearEllipticalSliceSampler(
    inequality_constraints=(A, b.unsqueeze(-1)),
    interior_point=x,
    check_feasibility=False,
    burnin=500,
    thinning=0,
    num_chains=500,
)
samples = sampler.draw(n=500)
```

## Related PRs

N/A. But I am happy to create a new PR (or extend this one) updating the documentation to describe the argument `num_chains`.

Reviewed By: sdaulton

Differential Revision: D59639608

Pulled By: esantorella

fbshipit-source-id: 1da29fd27d89e26a46d1b7429742817c4f1d234e
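As a quick sanity check of the batch API described in the summary above, here is a minimal sketch (the constraint matrices are example values, and it assumes the post-PR behavior where `draw` returns a single tensor):

```python
import torch
from botorch.utils.probability.lin_ess import LinearEllipticalSliceSampler

torch.manual_seed(0)
d = 5
# Example domain: the box -1 <= x_i <= 1, written as A @ x <= b.
A = torch.cat([torch.eye(d), -torch.eye(d)], dim=0)
b = torch.ones(2 * d, 1)

sampler = LinearEllipticalSliceSampler(
    inequality_constraints=(A, b),
    check_feasibility=True,
    num_chains=10,
)
samples = sampler.draw(n=100)  # (100 * 10) x d tensor

# Every row should satisfy all linear inequality constraints.
assert (samples @ A.T <= b.squeeze(-1)).all()
```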
1 parent e7915b1 · commit 5b072ce

File tree

2 files changed: +165 / -150 lines changed

botorch/utils/probability/lin_ess.py

Lines changed: 110 additions & 137 deletions
```diff
@@ -12,22 +12,24 @@
     A. Gessner, O. Kanjilal, and P. Hennig. Integrals over gaussians under
     linear domain constraints. AISTATS 2020.
 
+.. [Wu2024]
+    K. Wu, and J. Gardner. A Fast, Robust Elliptical Slice Sampling Implementation for
+    Linearly Truncated Multivariate Normal Distributions. arXiv:2407.10449. 2024.
 
 This implementation is based (with multiple changes / optimiations) on
 the following implementations based on the algorithm in [Gessner2020]_:
 - https://github.com/alpiges/LinConGauss
 - https://github.com/wjmaddox/pytorch_ess
 
+In addition, the active intervals (from which the angle is sampled) are computed using
+the improved algorithm described in [Wu2024]_:
+https://github.com/kayween/linear-ess
+
 The implementation here differentiates itself from the original implementations with:
 1) Support for fixed feature equality constraints.
 2) Support for non-standard Normal distributions.
 3) Numerical stability improvements, especially relevant for high-dimensional cases.
-
-Notably, this implementation does not rely on an adaptive `delta_theta` parameter in
-order to determine if two neighboring constraint intersection angles `theta` lead to a
-change in the feasibility of the sample. This both simplifies the implementation and
-makes it more robust to numerical imprecisions when two constraint intersection angles
-are close to each other.
+4) Support multiple Markov chains running in parallel.
 """
 
 from __future__ import annotations
```
```diff
@@ -47,7 +49,6 @@ class LinearEllipticalSliceSampler(PolytopeSampler):
     r"""Linear Elliptical Slice Sampler.
 
     Ideas:
-    - Add batch support, broadcasting over parallel chains.
     - Optimize computations if possible, potentially with torch.compile.
     - Extend fixed features constraint to general linear equality constraints.
     """
```
```diff
@@ -64,6 +65,7 @@ def __init__(
         check_feasibility: bool = False,
         burnin: int = 0,
         thinning: int = 0,
+        num_chains: int = 1,
     ) -> None:
         r"""Initialize LinearEllipticalSliceSampler.
```
```diff
@@ -99,6 +101,7 @@
             burnin: Number of samples to generate upon initialization to warm up the
                 sampler.
             thinning: Number of samples to skip before returning a sample in `draw`.
+            num_chains: Number of Markov chains to run in parallel.
 
         This sampler samples from a multivariante Normal `N(mean, covariance_matrix)`
         subject to linear domain constraints `A x <= b` (intersected with box bounds,
```
```diff
@@ -158,10 +161,17 @@
         self._x = self.x0.clone()
         self._z = self._transform(self._x)
 
+        # Expand the shape to (d, num_chains) for running parallel Markov chains.
+        if num_chains > 1:
+            self._z = self._z.expand(-1, num_chains).clone()
+
         # We will need the following repeatedly, let's allocate them once
-        self._zero = torch.zeros(1, **tkwargs)
-        self._nan = torch.tensor(float("nan"), **tkwargs)
-        self._full_angular_range = torch.tensor([0.0, _twopi], **tkwargs)
+        self.zeros = torch.zeros((num_chains, 1), **tkwargs)
+        self.ones = torch.ones((num_chains, 1), **tkwargs)
+        self.indices_batch = torch.arange(
+            num_chains, dtype=torch.int64, device=tkwargs["device"]
+        )
 
         self.check_feasibility = check_feasibility
         self._lifetime_samples = 0
         if burnin > 0:
```
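A note on the `expand(-1, num_chains).clone()` idiom added above: `expand` returns a broadcast view whose columns all alias one storage, so the `clone` is what gives each chain its own independent, writable state. A minimal torch illustration (not BoTorch code):

```python
import torch

z = torch.randn(3, 1)              # d x 1 state of a single chain
view = z.expand(-1, 4)             # d x 4 view: all columns alias z's storage,
                                   # so in-place writes to it are disallowed
chains = z.expand(-1, 4).clone()   # d x 4 tensor: each column owns its memory

chains[:, 0] = 0.0                         # update one chain's state...
assert torch.equal(chains[:, 1], z[:, 0])  # ...the other chains are untouched
```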
```diff
@@ -245,14 +255,14 @@ def lifetime_samples(self) -> int:
         """The total number of samples generated by the sampler during its lifetime."""
         return self._lifetime_samples
 
-    def draw(self, n: int = 1) -> Tuple[Tensor, Tensor]:
+    def draw(self, n: int = 1) -> Tensor:
         r"""Draw samples.
 
         Args:
             n: The number of samples.
 
         Returns:
-            A `n x d`-dim tensor of `n` samples.
+            A `(n * num_chains) x d`-dim tensor of `n * num_chains` samples.
         """
         samples = []
         for _ in range(n):
```
```diff
@@ -265,16 +275,17 @@ def step(self) -> Tensor:
         r"""Take a step, return the new sample, update the internal state.
 
         Returns:
-            A `d x 1`-dim sample from the domain.
+            A `d x num_chains`-dim tensor, where each column is a sample from a Markov
+            chain.
         """
         nu = torch.randn_like(self._z)
         theta = self._draw_angle(nu=nu)
-        z = self._get_cart_coords(nu=nu, theta=theta)
-        self._z[:] = z
-        x = self._untransform(z)
-        self._x[:] = x
+
+        self._z = z = self._get_cart_coords(nu=nu, theta=theta)
+        self._x = x = self._untransform(z)
+
         self._lifetime_samples += 1
-        if self.check_feasibility and (not self._is_feasible(self._x)):
+        if self.check_feasibility and (not self._is_feasible(self._x).all()):
             Axmb = self.A @ self._x - self.b
             violated_indices = Axmb > 0
             raise RuntimeError(
```
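For reference, the move in `step` is the standard elliptical slice sampling update: given the current whitened state $z$ and an auxiliary draw $\nu \sim \mathcal{N}(0, I)$, every candidate lies on the ellipse

$$x(\theta) = z \cos\theta + \nu \sin\theta, \qquad \theta \in [0, 2\pi),$$

and for any fixed $\theta$, $x(\theta)$ is again $\mathcal{N}(0, I)$-distributed, because $z$ and $\nu$ are independent standard normals and $\cos^2\theta + \sin^2\theta = 1$. Sampling $\theta$ uniformly from the feasible arcs therefore leaves the linearly truncated Gaussian invariant.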
```diff
@@ -289,157 +300,119 @@ def _draw_angle(self, nu: Tensor) -> Tensor:
         r"""Draw the rotation angle.
 
         Args:
-            nu: A `d x 1`-dim tensor (the "new" direction, drawn from N(0, I)).
+            nu: A `d x num_chains`-dim tensor (the "new" direction, drawn from N(0, I)).
 
         Returns:
-            A `1`-dim Tensor containing the rotation angle (radians).
+            A `num_chains`-dim Tensor containing the rotation angle (radians).
         """
-        rot_angle, rot_slices = self._find_rotated_intersections(nu)
-        rot_lengths = rot_slices[:, 1] - rot_slices[:, 0]
-        cum_lengths = torch.cumsum(rot_lengths, dim=0)
-        cum_lengths = torch.cat((self._zero, cum_lengths), dim=0)
-        rnd_angle = cum_lengths[-1] * torch.rand(
-            1, device=cum_lengths.device, dtype=cum_lengths.dtype
-        )
-        idx = torch.searchsorted(cum_lengths, rnd_angle) - 1
-        return (rot_slices[idx, 0] + rnd_angle + rot_angle) - cum_lengths[idx]
+        left, right = self._find_active_intersection_angles(nu)
+        left, right = self._trim_intervals(left, right)
+
+        # If left[i, j] <= right[i, j], then [left[i, j], right[i, j]] is an active
+        # interval. On the other hand, if left[i, j] > right[i, j], then they are both
+        # dummy variables and should be discarded. Thus, we clamp their difference so
+        # that they do not contribute to the cumulative length.
+        csum = right.sub(left).clamp(min=0.0).cumsum(dim=-1)
+
+        u = csum[:, -1] * torch.rand(
+            right.size(-2), dtype=right.dtype, device=right.device
+        )
+
+        # The returned index i satisfies csum[i - 1] < u <= csum[i]
+        idx = torch.searchsorted(csum, u.unsqueeze(-1)).squeeze(-1)
+
+        # Do a zero padding so that padded_csum[i] = csum[i - 1]
+        padded_csum = torch.cat([self.zeros, csum], dim=-1)
+
+        return u - padded_csum[self.indices_batch, idx] + left[self.indices_batch, idx]
```
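The new `_draw_angle` samples the angle uniformly from a union of arcs by inverse-CDF sampling on the cumulative arc lengths. A single-chain sketch of the same idea (the helper name and scalar handling are illustrative, not BoTorch API):

```python
import torch

def sample_from_union(left: torch.Tensor, right: torch.Tensor) -> torch.Tensor:
    """Draw theta ~ Uniform over the union of intervals [left[j], right[j]].

    Pairs with left[j] > right[j] are dummies; the clamp gives them zero
    length, mirroring the clamp in `_draw_angle` above.
    """
    lengths = (right - left).clamp(min=0.0)
    csum = lengths.cumsum(dim=-1)
    u = csum[-1] * torch.rand(())              # uniform on [0, total length]
    idx = torch.searchsorted(csum, u)          # which interval u falls into
    padded_csum = torch.cat([torch.zeros(1), csum])
    return left[idx] + (u - padded_csum[idx])  # shift u back into that interval

theta = sample_from_union(torch.tensor([0.0, 3.0]), torch.tensor([1.0, 4.0]))
assert (0.0 <= theta <= 1.0) or (3.0 <= theta <= 4.0)
```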

```diff
     def _get_cart_coords(self, nu: Tensor, theta: Tensor) -> Tensor:
-        r"""Determine location on ellipsoid in cartesian coordinates.
+        r"""Determine location on the ellipse in Cartesian coordinates.
 
         Args:
-            nu: A `d x 1`-dim tensor (the "new" direction, drawn from N(0, I)).
-            theta: A `k`-dim tensor of angles.
+            nu: A `d x num_chains`-dim tensor (the "new" direction, drawn from N(0, I)).
+            theta: A `num_chains`-dim tensor of angles.
 
         Returns:
-            A `d x k`-dim tensor of samples from the domain in cartesian coordinates.
+            A `d x num_chains`-dim tensor of samples from the domain in Cartesian
+            coordinates.
         """
         return self._z * torch.cos(theta) + nu * torch.sin(theta)
 
-    def _find_rotated_intersections(self, nu: Tensor) -> Tuple[Tensor, Tensor]:
-        r"""Finds rotated intersections.
-
-        Rotates the intersections by the rotation angle and makes sure that all
-        angles lie in [0, 2*pi].
-
-        Args:
-            nu: A `d x 1`-dim tensor (the "new" direction, drawn from N(0, I)).
-
-        Returns:
-            A two-tuple containing rotation angle (scalar) and a
-            `num_active / 2 x 2`-dim tensor of shifted angles.
-        """
-        slices = self._find_active_intersections(nu)
-        rot_angle = slices[0]
-        slices = (slices - rot_angle).reshape(-1, 2)
-        # Ensuring that we don't sample within numerical precision of the boundaries
-        # due to resulting instabilities in the constraint satisfaction.
-        eps = 1e-6 if slices.dtype == torch.float32 else 1e-12
-        eps = torch.tensor(eps, dtype=slices.dtype, device=slices.device)
-        eps = eps.minimum(slices.diff(dim=-1).abs() / 4)
-        slices = slices + torch.cat((eps, -eps), dim=-1)
-        # NOTE: The remainder call relies on the epsilon contraction, since the
-        # remainder of _twopi divided by _twopi is zero, not _twopi.
-        return rot_angle, slices.remainder(_twopi)
-
-    def _find_active_intersections(self, nu: Tensor) -> Tensor:
-        """
-        Find angles of those intersections that are at the boundary of the integration
-        domain by adding and subtracting a small angle and evaluating on the ellipse
-        to see if we are on the boundary of the integration domain.
-
-        Args:
-            nu: A `d x 1`-dim tensor (the "new" direction, drawn from N(0, I)).
-
-        Returns:
-            A `num_active`-dim tensor containing the angles of active intersection in
-            increasing order so that activation happens in positive direction. If a
-            slice crosses `theta=0`, the first angle is appended at the end of the
-            tensor. Every element of the returned tensor defines a slice for elliptical
-            slice sampling.
-        """
-        theta = self._find_intersection_angles(nu)
-        theta_active, delta_active = self._active_theta_and_delta(
-            nu=nu,
-            theta=theta,
-        )
-        if theta_active.numel() == 0:
-            theta_active = self._full_angular_range
-            # TODO: What about `self.ellipse_in_domain = False` in the original code?
-        elif delta_active[0] == -1:  # ensuring that the first interval is feasible
-            theta_active = torch.cat((theta_active[1:], theta_active[:1]))
-
-        return theta_active.view(-1)
+    def _trim_intervals(self, left: Tensor, right: Tensor) -> Tuple[Tensor, Tensor]:
+        """Trim the intervals by a small positive constant. This encourages the Markov
+        chain to stay in the interior of the domain.
+        """
+        gap = torch.clamp(right - left, min=0.0)
+        eps = gap.mul(0.25).clamp(max=1e-6 if gap.dtype == torch.float32 else 1e-12)
+
+        return left + eps, right - eps
+
+    def _find_active_intersection_angles(self, nu: Tensor) -> Tuple[Tensor, Tensor]:
+        """Construct the active intersection angles.
+
+        Args:
+            nu: A `d x num_chains`-dim tensor (the "new" direction, drawn from N(0, I)).
+
+        Returns:
+            A tuple (left, right) of two tensors of size `num_chains x m` representing
+            the active intersection angles. For the i-th Markov chain and the j-th
+            constraint, a pair of angles left[i, j] and right[i, j] is active if and
+            only if left[i, j] <= right[i, j]. If left[i, j] > right[i, j], they are
+            inactive and should be ignored.
+        """
+        alpha, beta = self._find_intersection_angles(nu)
+
+        # It's easier to put `num_chains` as the first dimension,
+        # because `torch.searchsorted` only supports searching in the last dimension
+        alpha, beta = alpha.T, beta.T
+
+        srted, indices = torch.sort(alpha, descending=False)
+        cummax = beta[self.indices_batch.unsqueeze(-1), indices].cummax(dim=-1).values
+
+        srted = torch.cat([srted, self.ones * 2 * math.pi], dim=-1)
+        cummax = torch.cat([self.zeros, cummax], dim=-1)
+
+        return cummax, srted
```
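The sort-plus-running-max construction above is the sweep from [Wu2024]_: sort the per-constraint arcs by left endpoint, take a running maximum of the right endpoints, and the gaps between that running max and the next left endpoint form the complement of the union of the arcs. A small worked example with plain numbers (a standalone sketch, not the class method):

```python
import math
import torch

# Per-constraint arcs [alpha_j, beta_j] cut out of [0, 2*pi] by three constraints.
alpha = torch.tensor([1.0, 2.0, 5.0])
beta = torch.tensor([3.0, 4.0, 6.0])

srted, indices = torch.sort(alpha)
cummax = beta[indices].cummax(dim=-1).values              # running max of right endpoints

left = torch.cat([torch.zeros(1), cummax])                # [0., 3., 4., 6.]
right = torch.cat([srted, torch.tensor([2 * math.pi])])   # [1., 2., 5., 6.2832]

# Pairs with left <= right are [0, 1], [4, 5], [6, 2*pi] -- exactly the
# complement of [1, 3] U [2, 4] U [5, 6]; the pair [3, 2] is a dummy.
active = left <= right
```

Pairs where `left > right` (here `[3, 2]`) are precisely the dummy intervals that `_draw_angle` later clamps to zero length.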
```diff
-    def _find_intersection_angles(self, nu: Tensor) -> Tensor:
-        """Compute all of the up to 2*n_ineq_con intersections of the ellipse
-        and the linear constraints.
-
-        For background, see equation (2) in
-        http://proceedings.mlr.press/v108/gessner20a/gessner20a.pdf
-
-        Args:
-            nu: A `d x 1`-dim tensor (the "new" direction, drawn from N(0, I)).
-
-        Returns:
-            An `M`-dim tensor, where `M <= 2 * n_ineq_con` (with `M = n_ineq_con`
-            if all intermediate computations yield finite numbers).
-        """
-        # Compared to the implementation in https://github.com/alpiges/LinConGauss
-        # we need to flip the sign of A b/c the original algorithm considers
-        # A @ x + b >= 0 feasible, whereas we consider A @ x - b <= 0 feasible.
-        g1 = -self._Az @ self._z
-        g2 = -self._Az @ nu
-        r = torch.sqrt(g1**2 + g2**2)
-        phi = 2 * torch.atan(g2 / (r + g1)).squeeze()
-
-        arg = -(self._bz / r).squeeze()
-        # Write NaNs if there is no intersection
-        arg = torch.where(torch.absolute(arg) <= 1, arg, self._nan)
-        # Two solutions per linear constraint, shape of theta: (n_ineq_con, 2)
-        acos_arg = torch.arccos(arg)
-        theta = torch.stack((phi + acos_arg, phi - acos_arg), dim=-1)
-        theta = theta[torch.isfinite(theta)]  # shape: `n_ineq_con - num_not_finite`
-        theta = torch.where(theta < 0, theta + _twopi, theta)  # in [0, 2*pi]
-        return torch.sort(theta).values
-
-    def _active_theta_and_delta(self, nu: Tensor, theta: Tensor) -> Tensor:
-        r"""Determine active indices.
-
-        Args:
-            nu: A `d x 1`-dim tensor (the "new" direction, drawn from N(0, I)).
-            theta: A sorted `M`-dim tensor of intersection angles in [0, 2pi].
-
-        Returns:
-            A tuple of Tensors of active constraint intersection angles `theta_active`,
-            and the change in the feasibility of the points on the ellipse on the left
-            and right of the active intersection angles `delta_active`. `delta_active`
-            is negative if decreasing the angle renders the sample feasible, and
-            positive if increasing the angle renders the sample feasible.
-        """
-        # In order to determine if an angle that gives rise to an intersection with a
-        # constraint boundary leads to a change in the feasibility of the solution,
-        # we evaluate the constraints on the midpoint of the intersection angles.
-        # This gets rid of the `delta_theta` parameter in the original implementation,
-        # which cannot be set universally since it can be both 1) too large, when
-        # the distance in adjacent intersection angles is small, and 2) too small,
-        # when it approaches the numerical precision limit.
-        # The implementation below solves both problems and gets rid of the parameter.
-        if len(theta) < 2:  # if we have no or only a tangential intersection
-            theta_active = torch.tensor([], dtype=theta.dtype, device=theta.device)
-            delta_active = torch.tensor([], dtype=int, device=theta.device)
-            return theta_active, delta_active
-        theta_mid = (theta[:-1] + theta[1:]) / 2  # midpoints of intersection angles
-        last_mid = (theta[:1] + theta[-1:] + _twopi) / 2
-        last_mid = last_mid.where(last_mid < _twopi, last_mid - _twopi)
-        theta_mid = torch.cat((last_mid, theta_mid, last_mid), dim=0)
-        samples_mid = self._get_cart_coords(nu=nu, theta=theta_mid)
-        delta_feasibility = (
-            self._is_feasible(samples_mid, transformed=True).to(dtype=int).diff()
-        )
-        active_indices = delta_feasibility.nonzero()
-        return theta[active_indices], delta_feasibility[active_indices]
+    def _find_intersection_angles(self, nu: Tensor) -> Tuple[Tensor, Tensor]:
+        """Compute all 2 * m intersections of the ellipse and the domain, where
+        `m = n_ineq_con` is the number of inequality constraints defining the domain.
+        If the i-th linear inequality constraint has no intersection with the ellipse,
+        we will create two dummy intersection angles alpha_i = beta_i = 0.
+
+        Args:
+            nu: A `d x num_chains`-dim tensor (the "new" direction, drawn from N(0, I)).
+
+        Returns:
+            A tuple of two tensors with the same size `m x num_chains`. The first tensor
+            represents the smaller intersection angles. The second tensor represents the
+            larger intersection angles.
+        """
+        p = self._Az @ self._z
+        q = self._Az @ nu
+
+        radius = torch.sqrt(p**2 + q**2)
+
+        ratio = self._bz / radius
+
+        has_solution = ratio < 1.0
+
+        arccos = torch.arccos(ratio)
+        arccos[~has_solution] = 0.0
+        arctan = torch.arctan2(q, p)
+
+        theta1 = arctan + arccos
+        theta2 = arctan - arccos
+
+        # translate every angle to [0, 2 * pi]
+        theta1 = theta1 + theta1.lt(0.0) * _twopi
+        theta2 = theta2 + theta2.lt(0.0) * _twopi
+
+        alpha = torch.minimum(theta1, theta2)
+        beta = torch.maximum(theta1, theta2)
+
+        return alpha, beta
 
     def _is_feasible(self, points: Tensor, transformed: bool = False) -> Tensor:
         r"""Returns a Boolean tensor indicating whether the `points` are feasible,
```
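The closed form in the new `_find_intersection_angles` drops out of the ellipse parameterization: substituting $x(\theta) = z\cos\theta + \nu\sin\theta$ into the $i$-th constraint boundary $a_i^\top x(\theta) = b_i$ and writing $p_i = a_i^\top z$, $q_i = a_i^\top \nu$ gives

$$p_i \cos\theta + q_i \sin\theta = r_i \cos(\theta - \varphi_i) = b_i, \qquad r_i = \sqrt{p_i^2 + q_i^2}, \quad \varphi_i = \operatorname{arctan2}(q_i, p_i),$$

so the two intersection angles are $\theta = \varphi_i \pm \arccos(b_i / r_i)$, which exist precisely when $|b_i| \le r_i$. The code only needs to test `ratio < 1.0` (`has_solution`), since a feasible current iterate guarantees $b_i / r_i \ge p_i / r_i \ge -1$. This is exactly the `radius`, `ratio`, `arctan2`, and `theta1`/`theta2` computation in the hunk above.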
