
Commit 7258017

Carl Hvarfner authored and facebook-github-bot committed
Non-uniform model weights in EnsembleModel & EnsemblePosterior (#2993)
Summary: Pull Request resolved: #2993

Adds support for non-uniform model weights in EnsembleModel & EnsemblePosterior.

Why bother?
- Setting non-uniform weights in a `MatheronPathModel` (e.g., [0, 0, 0, 1] over the different models in an ensemble such as a `FullyBayesianSingleTaskGP`) allows drawing function samples from fully Bayesian models.
- Also implements modified sampling, since the rsample() method of the ensemble posterior would otherwise not be valid with batch shapes. Sampling occurs along the ensemble dimension only. The intended use is in benchmarking.

What this diff does not do: create a batchable rsample for EnsemblePosteriors. The current implementation does not work as intended: it does not sample exclusively over the ensemble dimension and cannot handle different ensemble weights for different batches. Instead, it samples one model in the ensemble and applies it across batches. Attempting to implement this proved very cumbersome and broke a number of tests across the stack.

Reviewed By: Balandat

Differential Revision: D80728012

fbshipit-source-id: 6a2316862d98f0275a199b9b1442040b675337ef
1 parent 290f43b commit 7258017
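
For context, a minimal sketch of the one-hot-weights use case described in the summary, written against the EnsemblePosterior API added in this diff (shapes and values here are illustrative, not from the commit):

    import torch
    from botorch.posteriors.ensemble import EnsemblePosterior

    # An ensemble of s=4 members evaluated at q=5 points with m=1 output each,
    # e.g. four hyperparameter draws of a fully Bayesian model.
    values = torch.randn(4, 5, 1)

    # One-hot weights collapse the posterior onto a single ensemble member,
    # so every drawn sample is a function sample from that member.
    posterior = EnsemblePosterior(
        values=values, weights=torch.tensor([0.0, 0.0, 0.0, 1.0])
    )
    samples = posterior.rsample(torch.Size([16]))  # shape: 16 x 5 x 1
    assert torch.allclose(posterior.mean, values[3])  # mean is member 3 exactly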

File tree: 6 files changed, +325 −47 lines


botorch/models/ensemble.py

Lines changed: 14 additions & 1 deletion
@@ -24,6 +24,19 @@
 class EnsembleModel(Model, ABC):
     """Abstract base class for ensemble models."""

+    def __init__(self, weights: Tensor | None = None):
+        """Initialize the ensemble model.
+
+        Args:
+            weights: Optional weights for the ensemble members.
+                If None, the model weights will default to uniform in the
+                corresponding mixture posterior.
+        """
+        super().__init__()
+        # buffer `weights` is generally a name occupied by another module,
+        # so we have to be more specific here
+        self.ensemble_weights = weights
+
     @abstractmethod
     def forward(self, X: Tensor) -> Tensor:
         r"""Compute the (ensemble) model output at X.

@@ -82,7 +95,7 @@ def posterior(
             values, _ = self.outcome_transform.untransform(values, X=X)
         if output_indices is not None:
             values = values[..., output_indices]
-        posterior = EnsemblePosterior(values=values)
+        posterior = EnsemblePosterior(values=values, weights=self.ensemble_weights)
         if posterior_transform is not None:
             return posterior_transform(posterior)
         else:
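
To make the new constructor concrete, here is a hedged sketch of how a subclass might opt in to non-uniform weights (the class and its internals are hypothetical, not part of this commit):

    import torch
    from torch import Tensor, nn
    from botorch.models.ensemble import EnsembleModel

    class WeightedDeepEnsemble(EnsembleModel):
        """Hypothetical deep ensemble whose members carry unequal weights."""

        def __init__(self, networks: list[nn.Module], weights: Tensor | None = None):
            super().__init__(weights=weights)  # stored as self.ensemble_weights
            self.networks = nn.ModuleList(networks)
            self._num_outputs = 1  # assuming single-output member networks

        def forward(self, X: Tensor) -> Tensor:
            # Stack member predictions along the ensemble dim: `s x q x m`
            return torch.stack([net(X) for net in self.networks], dim=-3)

posterior(X) then forwards self.ensemble_weights into the EnsemblePosterior, as in the second hunk above.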

botorch/posteriors/ensemble.py

Lines changed: 155 additions & 20 deletions
@@ -13,34 +13,91 @@
 import torch
 from botorch.posteriors.posterior import Posterior
 from torch import Tensor
+from torch.distributions.multinomial import Multinomial


 class EnsemblePosterior(Posterior):
     r"""Ensemble posterior, that should be used for ensemble models that compute
     eagerly a finite number of samples per X value as for example a deep ensemble
     or a random forest."""

-    def __init__(self, values: Tensor) -> None:
+    def __init__(self, values: Tensor, weights: Tensor | None = None) -> None:
         r"""
         Args:
             values: Values of the samples produced by this posterior as
                 a `(b) x s x q x m` tensor where `m` is the output size of the
                 model and `s` is the ensemble size.
+            weights: Optional weights for the ensemble members as a tensor of shape
+                `(s,)`. If None, uses uniform weights.
         """
         if values.ndim < 3:
             raise ValueError("Values has to be at least three-dimensional.")
         self.values = values
+        self._weights = weights.to(values) if weights is not None else None
+        # Pre-compute normalized weights and mixture properties for efficiency
+        self._mixture_dims = list(range(self.values.ndim - 2))
+        self._normalized_weights = self._compute_normalized_weights()
+        self._normalized_mixture_weights = self._compute_normalized_mixture_weights()

     @property
     def ensemble_size(self) -> int:
         r"""The size of the ensemble"""
         return self.values.shape[-3]

+    @property
+    def mixture_size(self) -> int:
+        r"""The total number of elements in the mixture dimensions"""
+        return self.values.shape[:-2].numel()
+
+    def _compute_normalized_weights(self) -> Tensor:
+        r"""Compute and cache normalized weights."""
+        if self._weights is not None:
+            return self._weights / self._weights.sum(dim=-1, keepdim=True)
+        else:
+            return (
+                torch.ones(
+                    self.ensemble_size,
+                    dtype=self.dtype,
+                    device=self.device,
+                )
+                / self.ensemble_size
+            )
+
+    def _compute_normalized_mixture_weights(self) -> Tensor:
+        r"""Compute and cache normalized mixture weights."""
+        if self._weights is not None:
+            unnorm_weights = self._weights.expand(self.values.shape[:-2])
+            return unnorm_weights / unnorm_weights.sum(
+                dim=self._mixture_dims, keepdim=True
+            )
+        else:
+            return (
+                torch.ones(
+                    self.values.shape[:-2],
+                    dtype=self.dtype,
+                    device=self.device,
+                )
+                / self.mixture_size
+            )
+
     @property
     def weights(self) -> Tensor:
         r"""The weights of the individual models in the ensemble.
-        Equally weighted by default."""
-        return torch.ones(self.ensemble_size) / self.ensemble_size
+        Uniformly weighted by default."""
+        return self._normalized_weights
+
+    @property
+    def mixture_weights(self) -> Tensor:
+        r"""The weights of the individual models in the ensemble.
+        Uniformly weighted by default, and normalized over the ensemble and
+        batch dimensions of the model."""
+        return self._normalized_mixture_weights
+
+    @property
+    def mixture_dims(self) -> list[int]:
+        r"""The mixture dimensions of the posterior. For ensemble posteriors,
+        this includes all dimensions except the last two (query points and outputs)."""
+        return self._mixture_dims

     @property
     def device(self) -> torch.device:
@@ -55,17 +112,60 @@ def dtype(self) -> torch.dtype:
     @property
     def mean(self) -> Tensor:
         r"""The mean of the posterior as a `(b) x n x m`-dim Tensor."""
-        return self.values.mean(dim=-3)
+        # Weighted average across the ensemble dimension
+        return (self.values * self.weights[..., None, None]).sum(dim=-3)

     @property
     def variance(self) -> Tensor:
         r"""The variance of the posterior as a `(b) x n x m`-dim Tensor.

-        Computed as the sample variance across the ensemble outputs.
+        Computed as the weighted sample variance across the ensemble outputs.
+
+        This treats weights as probability weights (normalized to sum to 1) and
+        computes the unbiased weighted sample variance using the formula
+        Var = Σ(w_i * (x_i - μ)²) / (1 - Σw_i²),
+        where the sum over w_i² is taken over the ensemble dimension only.
+        Source: https://en.wikipedia.org/wiki/Weighted_arithmetic_mean under
+        "Reliability weights".
         """
         if self.ensemble_size == 1:
             return torch.zeros_like(self.values.squeeze(-3))
-        return self.values.var(dim=-3)
+
+        # Add dimensions for query points and outputs to enable broadcasting
+        weights = self.weights[..., None, None]
+        squared_deviations = (self.values - self.mean.unsqueeze(-3)) ** 2
+        return (weights * squared_deviations).sum(dim=-3) / (1 - (weights**2).sum())
+
+    @property
+    def mixture_mean(self) -> Tensor:
+        r"""The mixture mean of the posterior as a `(b) x n x m`-dim Tensor.
+
+        Computed as the weighted average across the ensemble outputs.
+        """
+        return (self.values * self.mixture_weights[..., None, None]).sum(
+            dim=self.mixture_dims
+        )
+
+    @property
+    def mixture_variance(self) -> Tensor:
+        r"""The mixture variance of the posterior as a `(b) x n x m`-dim Tensor.
+
+        Computed as the weighted sample variance across the ensemble outputs.
+
+        This treats weights as probability weights (normalized to sum to 1) and
+        computes the unbiased weighted sample variance using the formula
+        Var = Σ(w_i * (x_i - μ)²) / (1 - Σw_i²), where w_i is normalized over the
+        entire mixture and the sum over w_i² is taken over all mixture dimensions.
+        Source: https://en.wikipedia.org/wiki/Weighted_arithmetic_mean under
+        "Reliability weights".
+        """
+        # Add dimensions for query points and outputs to enable broadcasting
+        weights = self.mixture_weights[..., None, None]
+        squared_deviations = (self.values - self.mixture_mean.unsqueeze(-3)) ** 2
+        return (weights * squared_deviations).sum(dim=self.mixture_dims) / (
+            1 - (weights**2).sum()
+        )

     def _extended_shape(
         self,
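
As a quick sanity check of the reliability-weights formula in the docstrings above (a standalone illustration, not part of the diff):

    import torch

    x = torch.tensor([1.0, 2.0, 4.0])   # three ensemble "outputs"
    w = torch.tensor([0.5, 0.3, 0.2])   # normalized probability weights
    mu = (w * x).sum()                  # weighted mean: 1.9
    # Var = Σ(w_i * (x_i - μ)²) / (1 - Σw_i²) = 1.29 / 0.62 ≈ 2.081
    var = (w * (x - mu) ** 2).sum() / (1 - (w**2).sum())
    # With uniform weights this reduces to the usual unbiased sample variance:
    assert torch.isclose(
        (torch.full((3,), 1 / 3) * (x - x.mean()) ** 2).sum()
        / (1 - 3 * (1 / 3) ** 2),
        x.var(),
    )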
@@ -76,6 +176,10 @@ def _extended_shape(
         """
         return sample_shape + self.values.shape[:-3] + self.values.shape[-2:]

+    @property
+    def batch_shape(self) -> torch.Size:
+        return self.values.shape[:-3]
+
     def rsample(
         self,
         sample_shape: torch.Size | None = None,
@@ -94,17 +198,26 @@
             Samples from the posterior, a tensor of shape
             `self._extended_shape(sample_shape=sample_shape)`.
         """
-        if sample_shape is None:
+        if sample_shape is None or len(sample_shape) == 0:
             sample_shape = torch.Size([1])
-        # get indices as base_samples
+
+        # NOTE: This occasionally happens in hypervolume evals when there
+        # are no points which improve over the reference point. In this case, we
+        # create a posterior for all the points which improve over the reference,
+        # which is an empty set.
+        if self.values.numel() == 0:
+            return torch.empty(
+                *self._extended_shape(sample_shape=sample_shape),
+                device=self.device,
+                dtype=self.dtype,
+            )
+
         base_samples = (
-            torch.multinomial(
-                self.weights,
-                num_samples=sample_shape.numel(),
-                replacement=True,
+            Multinomial(
+                probs=self.mixture_weights,
             )
-            .reshape(sample_shape)
-            .to(device=self.device)
+            .sample(sample_shape=sample_shape)
+            .argmax(dim=-1)
         )
         return self.rsample_from_base_samples(
             sample_shape=sample_shape, base_samples=base_samples
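
The switch from torch.multinomial to Multinomial(...).sample(...).argmax(-1) is what lets the index draw broadcast over batch dimensions of the weights; torch.multinomial only accepts 1-d or 2-d probability inputs. A standalone illustration (the shapes are made up):

    import torch
    from torch.distributions.multinomial import Multinomial

    weights = torch.rand(2, 4)  # batch_shape=(2,), ensemble size s=4
    # Multinomial normalizes `probs` along the last dim; total_count defaults
    # to 1, so each draw is a one-hot vector over the s ensemble members.
    one_hot = Multinomial(probs=weights).sample(torch.Size([8]))  # 8 x 2 x 4
    indices = one_hot.argmax(dim=-1)  # 8 x 2: one member index per batch element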
@@ -132,9 +245,31 @@ def rsample_from_base_samples(
             Samples from the posterior, a tensor of shape
             `self._extended_shape(sample_shape=sample_shape)`.
         """
-        if base_samples.shape != sample_shape:
-            raise ValueError("Base samples do not match sample shape.")
-        # move sample axis to front
-        values = self.values.movedim(-3, 0)
-        # sample from the first dimension of values
-        return values[base_samples, ...]
+        # Check that the leading dimensions of base_samples match sample_shape
+        if base_samples.shape != sample_shape + self.batch_shape:
+            raise ValueError(
+                f"sample_shape={sample_shape + self.batch_shape} does not match "
+                f"the leading dimensions of base_samples.shape={base_samples.shape}."
+            )
+
+        if self.batch_shape:
+            # `values` is always four-dimensional after this reshape,
+            # even if there is more than one batch dimension
+            values = self.values.reshape(
+                (self.batch_shape.numel(),) + self.values.shape[-3:]
+            )
+
+            # Collapse the base samples to enable index-selecting along the
+            # ensemble dim (dim -3)
+            batch_numel = self.batch_shape.numel()
+            collapsed_base_samples = base_samples.reshape(sample_shape + (batch_numel,))
+
+            # The first index is 0, 1, ..., batch_numel - 1, flattening the
+            # batch dimensions; the second index extracts the sampled ensemble
+            # member for each element of the batch
+            return values[torch.arange(batch_numel), collapsed_base_samples].reshape(
+                self._extended_shape(sample_shape=sample_shape)
+            )
+        return self.values[base_samples]
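
The batched gather at the heart of the new branch can be illustrated in isolation (a toy with assumed shapes, not BoTorch code): a row index 0..B-1 paired with per-batch ensemble choices selects one member per batch element.

    import torch

    B, S, Q, M = 3, 4, 5, 2          # flattened batch, ensemble, query, outputs
    values = torch.randn(B, S, Q, M)
    idx = torch.randint(S, (8, B))   # sample_shape=(8,): one choice per batch
    # Advanced indexing broadcasts arange(B) (shape B) with idx (shape 8 x B),
    # yielding out[k, b] = values[b, idx[k, b]] of shape 8 x B x Q x M.
    out = values[torch.arange(B), idx]
    assert out.shape == (8, B, Q, M)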

botorch/sampling/index_sampler.py

Lines changed: 12 additions & 6 deletions
@@ -15,6 +15,7 @@
 from botorch.posteriors.ensemble import EnsemblePosterior
 from botorch.sampling.base import MCSampler
 from torch import Tensor
+from torch.distributions.multinomial import Multinomial


 class IndexSampler(MCSampler):

@@ -44,14 +45,19 @@ def _construct_base_samples(self, posterior: EnsemblePosterior) -> None:
             posterior: The ensemble posterior to construct the base samples
                 for.
         """
-        if self.base_samples is None or self.base_samples.shape != self.sample_shape:
+        if (
+            self.base_samples is None
+            or self.base_samples.shape != self.sample_shape + posterior.batch_shape
+        ):
             with torch.random.fork_rng():
                 torch.manual_seed(self.seed)
-                base_samples = torch.multinomial(
-                    posterior.weights,
-                    num_samples=self.sample_shape.numel(),
-                    replacement=True,
-                ).reshape(self.sample_shape)
+                base_samples = (
+                    Multinomial(
+                        probs=posterior.mixture_weights,
+                    )
+                    .sample(sample_shape=self.sample_shape)
+                    .argmax(dim=-1)
+                )
             self.register_buffer("base_samples", base_samples)
         if self.base_samples.device != posterior.device:
             self.to(device=posterior.device)  # pragma: nocover
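
End to end, the updated IndexSampler draws one ensemble index per batch element and reuses the draw across calls. A hedged usage sketch against the APIs in this diff (shapes and weights are made up):

    import torch
    from botorch.posteriors.ensemble import EnsemblePosterior
    from botorch.sampling.index_sampler import IndexSampler

    values = torch.randn(2, 4, 5, 1)  # batch b=2, ensemble s=4, q=5, m=1
    posterior = EnsemblePosterior(
        values=values, weights=torch.tensor([0.1, 0.2, 0.3, 0.4])
    )
    sampler = IndexSampler(sample_shape=torch.Size([32]), seed=0)
    samples = sampler(posterior)  # 32 x 2 x 5 x 1; indices fixed by the seed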

test/models/test_ensemble.py

Lines changed: 18 additions & 2 deletions
@@ -12,9 +12,9 @@
 class DummyEnsembleModel(EnsembleModel):
     r"""A dummy ensemble model."""

-    def __init__(self):
+    def __init__(self, weights=None):
         r"""Init model."""
-        super().__init__()
+        super().__init__(weights=weights)
         self._num_outputs = 2
         self.a = torch.rand(4, 3, 2)

@@ -35,3 +35,19 @@ def test_DummyEnsembleModel(self):
         X = torch.randn(*shape)
         p = e.posterior(X)
         self.assertEqual(p.ensemble_size, 4)
+
+    def test_EnsembleModel_weights(self):
+        """Test that weights are properly passed from EnsembleModel to
+        EnsemblePosterior."""
+        custom_weights = torch.tensor([0.4, 0.3, 0.2, 0.1])
+        e = DummyEnsembleModel(weights=custom_weights)
+
+        # Test that weights are correctly passed through
+        X = torch.randn(5, 3)
+        p = e.posterior(X)
+        self.assertAllClose(p.weights, custom_weights)
+
+        # Test with batch dimensions - weights should remain 1-dimensional
+        X_batch = torch.randn(2, 5, 3)  # batch_shape = (2,)
+        p_batch = e.posterior(X_batch)
+        self.assertAllClose(p_batch.weights, custom_weights)
