Commit 82536af

brunzema authored and meta-codesync[bot] committed
Adding Kaiming/He initialization for the VBLL mean (#3053)
Summary:
Hi everyone :) I have a minor update to the VBLLs. This update does not change the default behavior of the current implementation.

## Motivation

In the VBLL repo, we observed improved performance on regression tasks when applying Kaiming/He initialization to the VBLL mean (VectorInstitute/vbll@fdc6ad0). This has not yet been tested extensively in BO tasks, but we still wanted to migrate this option here to the community branch. The default initialization remains unchanged, but one can choose to switch to the new init, which is now also in the VBLL repo.

### Have you read the [Contributing Guidelines on pull requests](https://github.com/meta-pytorch/botorch/blob/main/CONTRIBUTING.md#pull-requests)?

Yes

Pull Request resolved: #3053

Test Plan: Added tests to maintain coverage.

## Related PRs

- Original VBLL PR: #2754
- Commit with Kaiming/He initialization in VBLL repo: VectorInstitute/vbll@fdc6ad0

Reviewed By: mpolson64

Differential Revision: D85254301

Pulled By: Balandat

fbshipit-source-id: 0ba4c3f6d3374bc2b0d8a8ee103432c20421460c
1 parent 583e8cb commit 82536af
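The change boils down to scaling the standard-normal draw for the last-layer mean by sqrt(2 / fan_in). A minimal standalone sketch of the two options (plain torch/numpy with illustrative dimensions, not the BoTorch API):

```python
import numpy as np
import torch

out_features, in_features = 1, 64  # illustrative shapes for the last layer

# Default behavior (unchanged): W_mean ~ N(0, 1)
W_default = torch.randn(out_features, in_features)

# Kaiming/He initialization: W_mean ~ N(0, 2 / fan_in) with fan_in = in_features
W_kaiming = torch.randn(out_features, in_features) * np.sqrt(2.0 / in_features)
```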

File tree: 4 files changed, +106 −6 lines changed

botorch_community/models/vbll_helper.py

Lines changed: 29 additions & 4 deletions
```diff
@@ -16,7 +16,6 @@
 from typing import Callable
 
 import numpy as np
-
 import torch
 import torch.nn as nn
 
@@ -361,6 +360,7 @@ def __init__(
         out_features,
         regularization_weight,
         parameterization="dense",
+        mean_initialization=None,
         prior_scale=1.0,
         wishart_scale=1e-2,
         cov_rank=None,
@@ -381,10 +381,18 @@ def __init__(
         parameterization : str
             Parameterization of covariance matrix.
             Currently supports {'dense', 'diagonal', 'lowrank', 'dense_precision'}
+        mean_initialization : str or None
+            Initialization method for the mean of the weights.
+            Supports {'kaiming', None}. If None, weights are initialized from
+            a standard normal distribution. Defaults to None.
         prior_scale : float
             Scale of prior covariance matrix
         wishart_scale : float
             Scale of Wishart prior on noise covariance
+        cov_rank : int or None
+            For 'lowrank' parameterization, the rank of the covariance matrix.
+        clamp_noise_init : bool
+            Whether to clamp the noise initialization to be positive.
         dof : float
             Degrees of freedom of Wishart prior on noise covariance
         """
@@ -412,9 +420,26 @@ def __init__(
 
         # last layer distribution
         self.W_dist = get_parameterization(parameterization)
-        self.W_mean = nn.Parameter(
-            torch.randn(out_features, in_features, dtype=self.dtype)
-        )
+
+        if mean_initialization is None:
+            self.W_mean = nn.Parameter(
+                torch.randn(out_features, in_features, dtype=self.dtype)
+            )
+        elif mean_initialization == "kaiming":
+            self.W_mean = nn.Parameter(
+                torch.randn(out_features, in_features, dtype=self.dtype)
+                * np.sqrt(2.0 / in_features)
+            )
+        elif isinstance(mean_initialization, str):
+            raise ValueError(
+                f"Unknown initialization method: {mean_initialization!r}. "
+                f"Supported methods: 'kaiming'"
+            )
+        else:
+            raise TypeError(
+                f"mean_initialization must be a string or None, "
+                f"got {type(mean_initialization).__name__}"
+            )
 
         if parameterization == "diagonal":
             self.W_logdiag = nn.Parameter(
```
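One observation (mine, not part of the diff): the manual `np.sqrt(2.0 / in_features)` factor reproduces the standard deviation that PyTorch's built-in `torch.nn.init.kaiming_normal_` uses in `fan_in` mode with a ReLU gain, so the two are equivalent in distribution:

```python
import torch
import torch.nn as nn

out_features, in_features = 1, 64  # illustrative shapes

# Built-in Kaiming/He init: std = gain / sqrt(fan_in), with gain = sqrt(2) for ReLU
W = torch.empty(out_features, in_features)
nn.init.kaiming_normal_(W, mode="fan_in", nonlinearity="relu")

# Same distribution as the explicit scaling in the diff above
W_manual = torch.randn(out_features, in_features) * (2.0 / in_features) ** 0.5
```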

botorch_community/models/vblls.py

Lines changed: 10 additions & 2 deletions
```diff
@@ -25,10 +25,8 @@
 from botorch.logging import logger
 from botorch.posteriors import Posterior
 from botorch_community.models.blls import AbstractBLLModel
-
 from botorch_community.models.vbll_helper import DenseNormal, Normal, Regression
 from botorch_community.posteriors.bll_posterior import BLLPosterior
-
 from gpytorch.distributions import MultivariateNormal
 from torch import Tensor
 from torch.optim import Optimizer
@@ -85,6 +83,7 @@ def __init__(
         num_layers: int = 3,
         parameterization: str = "dense",
         cov_rank: int | None = None,
+        mean_initialization: str | None = None,
         prior_scale: float = 1.0,
         wishart_scale: float = 0.01,
         clamp_noise_init: bool = True,
@@ -103,6 +102,10 @@ def __init__(
             num_layers: Number of hidden layers in the MLP. Defaults to 3.
             parameterization: Parameterization of the posterior covariance of the last
                 layer. Supports {'dense', 'diagonal', 'lowrank', 'dense_precision'}.
+            cov_rank: For 'lowrank' parameterization, the rank of the covariance matrix.
+            mean_initialization: Initialization method for the mean of the weights in
+                the last layer. Supports {'kaiming', None}. If None, weights are
+                initialized from a standard normal distribution. Defaults to None.
             prior_scale: Scaling factor for the prior distribution in the Bayesian last
                 layer. Defaults to 1.0.
             wishart_scale: Scaling factor for the Wishart prior in the Bayesian last
@@ -177,6 +180,7 @@ def __init__(
             parameterization=parameterization,
             cov_rank=cov_rank,
             clamp_noise_init=clamp_noise_init,
+            mean_initialization=mean_initialization,
         ).to(dtype=torch.float64, device=self.device)
 
     def forward(self, x: Tensor) -> Tensor:
@@ -253,6 +257,10 @@ def __init__(self, *args, **kwargs):
     def backbone(self):
         return self.model.backbone
 
+    @property
+    def head(self):
+        return self.model.head
+
     def sample(self, sample_shape: torch.Size | None = None) -> nn.Module:
         """Create posterior sample networks of the VBLL model. Note that posterior
         samples, we first sample from the posterior distribution of the last layer and
```
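For downstream users, a usage sketch pieced together from the signature above and the tests below (the dimensions are illustrative):

```python
from botorch_community.models.vblls import VBLLModel

# Opt in to Kaiming/He initialization of the last-layer mean;
# mean_initialization=None (the default) keeps the previous standard-normal init.
model = VBLLModel(
    in_features=4,
    hidden_features=32,
    num_layers=3,
    out_features=1,
    mean_initialization="kaiming",
)
```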

test_community/models/test_vbll_helper.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import torch
+
 from botorch.utils.testing import BotorchTestCase
 from botorch_community.models.vbll_helper import (
     DenseNormal,
```

test_community/models/test_vblls.py

Lines changed: 66 additions & 0 deletions
```diff
@@ -5,8 +5,11 @@
 # LICENSE file in the root directory of this source tree.
 
 import copy
+from unittest.mock import patch
 
+import numpy as np
 import torch
+
 from botorch.utils.testing import BotorchTestCase
 from botorch_community.models.blls import AbstractBLLModel
 from botorch_community.models.vblls import VBLLModel
@@ -82,6 +85,69 @@ def test_initialization(self) -> None:
                 parameterization="lowrank",  # lowrank requires cov_rank
             )
 
+    def test_mean_initialization(self):
+        """Test different mean_initialization options."""
+        d, num_hidden, num_outputs, num_layers = 2, 3, 1, 4
+
+        torch.manual_seed(0)
+        model = VBLLModel(
+            in_features=d,
+            hidden_features=num_hidden,
+            num_layers=num_layers,
+            out_features=num_outputs,
+            mean_initialization=None,
+        )
+
+        # fix seeds to see if mean init is the same
+        torch.manual_seed(0)
+        model2 = VBLLModel(
+            in_features=d,
+            hidden_features=num_hidden,
+            num_layers=num_layers,
+            out_features=num_outputs,
+        )
+
+        self.assertTrue(
+            torch.allclose(model.head.W_mean, model2.head.W_mean, atol=1e-6),
+            "mean_initialization=None should be equivalent to default initialization.",
+        )
+
+        # Test kaiming initialization; check that np.sqrt is called
+        with patch("numpy.sqrt", wraps=np.sqrt) as mock_sqrt:
+            model = VBLLModel(
+                in_features=d,
+                hidden_features=num_hidden,
+                num_layers=num_layers,
+                out_features=num_outputs,
+                mean_initialization="kaiming",
+            )
+
+        # Verify that np.sqrt was called with the correct argument
+        mock_sqrt.assert_called_once_with(2.0 / num_hidden)
+
+        # Test invalid string initialization
+        with self.assertRaises(ValueError) as cm:
+            model = VBLLModel(
+                in_features=d,
+                hidden_features=num_hidden,
+                num_layers=num_layers,
+                out_features=num_outputs,
+                mean_initialization="invalid",
+            )
+        self.assertIn("Unknown initialization method", str(cm.exception))
+        self.assertIn("kaiming", str(cm.exception))
+
+        # Test invalid type (not string or None)
+        with self.assertRaises(TypeError) as cm:
+            model = VBLLModel(
+                in_features=d,
+                hidden_features=num_hidden,
+                num_layers=num_layers,
+                out_features=num_outputs,
+                mean_initialization=["kaiming"],
+            )
+        self.assertIn("must be a string or None", str(cm.exception))
+
     def test_backbone_initialization(self) -> None:
         d, num_hidden = 4, 3
         test_backbone = torch.nn.Sequential(
```
