from typing import Optional

import torch
from torch import nn, Tensor

from gpytorch.constraints.constraints import Interval, Positive
from gpytorch.kernels.kernel import Kernel
from gpytorch.priors.prior import Prior


EMPTY_SIZE = torch.Size([])


class HammingIMQKernel(Kernel):
    r"""
    Computes a covariance matrix based on the inverse multiquadratic (IMQ) Hamming kernel
    between inputs :math:`\mathbf{x_1}` and :math:`\mathbf{x_2}`:

    .. math::
        \begin{equation*}
            k_{\text{H-IMQ}}(\mathbf{x_1}, \mathbf{x_2}) =
            \left( \frac{1 + \alpha}{\alpha + d_{\text{Hamming}}(\mathbf{x_1}, \mathbf{x_2})} \right)^\beta
        \end{equation*}

    where :math:`\alpha` and :math:`\beta` are strictly positive scale parameters.
    This kernel was proposed in `Biological Sequence Kernels with Guaranteed Flexibility`.
    See http://arxiv.org/abs/2304.03775 for more details.

    This kernel is meant to be used with fixed-length, one-hot encoded discrete sequences.
    Because GPyTorch is particular about dimensions, the one-hot sequence encoding should be
    flattened to a vector of length :math:`T \times V`, where :math:`T` is the sequence length
    and :math:`V` is the vocabulary size.

    :param vocab_size: The size of the vocabulary.
    :param batch_shape: Set this if you want separate kernel hyperparameters for each batch of input
        data. It should be :math:`B_1 \times \ldots \times B_k` if :math:`\mathbf{x_1}` is
        a :math:`B_1 \times \ldots \times B_k \times N \times D` tensor.
    :param alpha_prior: Set this if you want to apply a prior to the
        alpha parameter.
    :param alpha_constraint: Set this if you want to apply a constraint
        to the alpha parameter. If None is passed, the default is `Positive()`.
    :param beta_prior: Set this if you want to apply a prior to the
        beta parameter.
    :param beta_constraint: Set this if you want to apply a constraint
        to the beta parameter. If None is passed, the default is `Positive()`.

    Example:
        >>> import torch.nn.functional as F
        >>> vocab_size = 8
        >>> x_cat = torch.tensor([[7, 7, 7, 7], [5, 7, 3, 4]])  # batch_size x seq_length
        >>> x_one_hot = F.one_hot(x_cat, num_classes=vocab_size)  # batch_size x seq_length x vocab_size
        >>> x_flat = x_one_hot.view(*x_cat.shape[:-1], -1)  # batch_size x (seq_length * vocab_size)
        >>> covar_module = gpytorch.kernels.HammingIMQKernel(vocab_size=vocab_size)
        >>> covar = covar_module(x_flat)  # Output: LinearOperator of size (2 x 2)
    """

    def __init__(
        self,
        vocab_size: int,
        batch_shape: torch.Size = EMPTY_SIZE,
        alpha_prior: Optional[Prior] = None,
        alpha_constraint: Optional[Interval] = None,
        beta_prior: Optional[Prior] = None,
        beta_constraint: Optional[Interval] = None,
    ):
        super().__init__(batch_shape=batch_shape)
        self.vocab_size = vocab_size
        # add alpha (scale) parameter
        alpha_constraint = Positive() if alpha_constraint is None else alpha_constraint
        self.register_parameter(
            name="raw_alpha",
            parameter=nn.Parameter(torch.zeros(*self.batch_shape, 1)),
        )
        if alpha_prior is not None:
            self.register_prior("alpha_prior", alpha_prior, self._alpha_param, self._alpha_closure)
        self.register_constraint("raw_alpha", alpha_constraint)

        # add beta parameter
        beta_constraint = Positive() if beta_constraint is None else beta_constraint
        self.register_parameter(
            name="raw_beta",
            parameter=nn.Parameter(torch.zeros(*self.batch_shape, 1)),
        )
        if beta_prior is not None:
            self.register_prior("beta_prior", beta_prior, self._beta_param, self._beta_closure)
        self.register_constraint("raw_beta", beta_constraint)

    @property
    def alpha(self) -> Tensor:
        return self.raw_alpha_constraint.transform(self.raw_alpha)

    @alpha.setter
    def alpha(self, value: Tensor):
        self._set_alpha(value)

    def _alpha_param(self, m: Kernel) -> Tensor:
        # Used by the alpha_prior
        return m.alpha

    def _alpha_closure(self, m: Kernel, v: Tensor) -> Tensor:
        # Used by the alpha_prior
        return m._set_alpha(v)

    def _set_alpha(self, value: Tensor):
        # Used by the alpha_prior
        if not torch.is_tensor(value):
            value = torch.as_tensor(value).to(self.raw_alpha)
        self.initialize(raw_alpha=self.raw_alpha_constraint.inverse_transform(value))

    @property
    def beta(self) -> Tensor:
        return self.raw_beta_constraint.transform(self.raw_beta)

    @beta.setter
    def beta(self, value: Tensor):
        self._set_beta(value)

    def _beta_param(self, m: Kernel) -> Tensor:
        # Used by the beta_prior
        return m.beta

    def _beta_closure(self, m: Kernel, v: Tensor) -> Tensor:
        # Used by the beta_prior
        return m._set_beta(v)

    def _set_beta(self, value: Tensor):
        # Used by the beta_prior
        if not torch.is_tensor(value):
            value = torch.as_tensor(value).to(self.raw_beta)
        self.initialize(raw_beta=self.raw_beta_constraint.inverse_transform(value))

    def _imq(self, dist: Tensor) -> Tensor:
        # Inverse multiquadratic transform of the Hamming distance: ((1 + alpha) / (alpha + dist)) ** beta.
        return ((1 + self.alpha) / (self.alpha + dist)).pow(self.beta)

    def forward(self, x1: Tensor, x2: Tensor, diag: bool = False, **params):
        # GPyTorch is particular about dimensions, so unflatten the one-hot encoding
        # back to `... x seq_length x vocab_size` before computing Hamming distances.
        x1 = x1.view(*x1.shape[:-1], -1, self.vocab_size)
        x2 = x2.view(*x2.shape[:-1], -1, self.vocab_size)

        x1_eq_x2 = torch.equal(x1, x2)

        if diag:
            if x1_eq_x2:
                # The Hamming distance of a sequence to itself is zero, so every
                # diagonal entry equals ((1 + alpha) / alpha) ** beta.
                res = ((1 + self.alpha) / self.alpha).pow(self.beta)
                skip_dims = [-1] * len(self.batch_shape)
                return res.expand(*skip_dims, x1.size(-3))
            else:
                # Elementwise Hamming distance: sequence length minus the number of matching positions.
                dist = x1.size(-2) - (x1 * x2).sum(dim=(-1, -2))
                return self._imq(dist)
        else:
            dist = hamming_dist(x1, x2, x1_eq_x2)
            return self._imq(dist)


def hamming_dist(x1: Tensor, x2: Tensor, x1_eq_x2: bool) -> Tensor:
    # The dot product of two one-hot rows is 1 where the tokens match, so the Hamming
    # distance is the sequence length minus the number of matching positions.
    # The unsqueezes broadcast to an `... x N1 x N2` matrix of pairwise distances.
    res = x1.size(-2) - (x1.unsqueeze(-3) * x2.unsqueeze(-4)).sum(dim=(-1, -2))
    if x1_eq_x2 and not x1.requires_grad and not x2.requires_grad:
        # Exact zeros on the diagonal when the two inputs are identical.
        res.diagonal(dim1=-2, dim2=-1).fill_(0)
    # Zero out negative values
    return res.clamp_min_(0)
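

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only, not part of the kernel API), assuming the
    # flattened one-hot input layout described in the class docstring. It builds a few
    # random sequences, evaluates the kernel by calling `forward` directly to obtain a
    # dense Tensor, and checks that the diagonal equals ((1 + alpha) / alpha) ** beta,
    # i.e. the value at zero Hamming distance.
    import torch.nn.functional as F

    vocab_size, seq_length, num_seqs = 8, 4, 5
    x_cat = torch.randint(vocab_size, (num_seqs, seq_length))  # num_seqs x seq_length token ids
    x_one_hot = F.one_hot(x_cat, num_classes=vocab_size).float()  # num_seqs x seq_length x vocab_size
    x_flat = x_one_hot.view(num_seqs, -1)  # num_seqs x (seq_length * vocab_size)

    kernel = HammingIMQKernel(vocab_size=vocab_size)
    covar = kernel.forward(x_flat, x_flat)  # dense num_seqs x num_seqs covariance matrix

    expected_diag = ((1 + kernel.alpha) / kernel.alpha).pow(kernel.beta)
    print(torch.allclose(covar.diagonal(), expected_diag.expand(num_seqs)))  # expected: True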