@@ -12,39 +12,43 @@ class VQ(nn.Module):
     Quantization layer from *Neural Discrete Representation Learning*
 
     Args:
-        latent_dim (int): number of features along which to quantize
-        num_tokens (int): number of tokens in the codebook
+        embedding_dim (int): number of features along which to quantize
+        num_embeddings (int): number of tokens in the codebook
         dim (int): dimension along which to quantize
         return_indices (bool): whether to return the indices of the quantized
            code points
    """
+
    embedding: nn.Embedding
    dim: int
    commitment: float
    initialized: torch.Tensor
    return_indices: bool
    init_mode: str
 
-    def __init__(self,
-                 latent_dim: int,
-                 num_tokens: int,
-                 dim: int = 1,
-                 commitment: float = 0.25,
-                 init_mode: str = 'normal',
-                 space="l2",
-                 return_indices: bool = True,
-                 max_age: int = 1000):
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        *,
+        dim: int = 1,
+        commitment: float = 0.25,
+        init_mode: str = "normal",
+        space="l2",
+        return_indices: bool = True,
+        max_age: int = 1000,
+    ):
        super(VQ, self).__init__()
-        self.latent_dim = latent_dim
-        self.embedding = nn.Embedding(num_tokens, latent_dim)
+        self.embedding_dim = embedding_dim
+        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        nn.init.normal_(self.embedding.weight, 0, 1.1)
        self.dim = dim
        self.commitment = commitment
-        self.register_buffer('initialized', torch.Tensor([0]))
+        self.register_buffer("initialized", torch.Tensor([0]))
        self.return_indices = return_indices
-        assert init_mode in ['normal', 'first']
+        assert init_mode in ["normal", "first"]
        self.init_mode = init_mode
-        self.register_buffer('age', torch.empty(num_tokens).fill_(max_age))
+        self.age = nn.Buffer(torch.empty(num_embeddings).fill_(max_age))
        self.max_age = max_age
        self.space = space
        assert space in ["l2", "angular"]
@@ -66,12 +70,13 @@ def resample_dead(self, x):
        if len(dead) == 0:
            return
 
-        print(f'{len(dead)} dead codes resampled')
+        print(f"{len(dead)} dead codes resampled")
        x_flat = x.view(-1, x.shape[-1])
        emb_weight = self.embedding.weight.data
-        emb_weight[dead[:len(x_flat)]] = x_flat[torch.randperm(
-            len(x_flat))[:len(dead)]].to(emb_weight.dtype)
-        self.age[dead[:len(x_flat)]] = 0
+        emb_weight[dead[: len(x_flat)]] = x_flat[
+            torch.randperm(len(x_flat))[: len(dead)]
+        ].to(emb_weight.dtype)
+        self.age[dead[: len(x_flat)]] = 0
 
        if torch.distributed.is_initialized():
            torch.distributed.broadcast(emb_weight, 0)
@@ -94,11 +99,10 @@ def forward(
        else:
            return self.lookup(x)
 
-    def lookup(
-            self, x: torch.Tensor
-    ) -> torch.Tensor:
+    def lookup(self, x: torch.Tensor) -> torch.Tensor:
+        # x: (..., K)
        dim = self.dim
-        needs_transpose = dim != -1 or dim != x.dim() - 1
+        needs_transpose = dim not in (-1, x.dim() - 1)
 
        x = self.embedding(x)
        if self.space == "angular":
@@ -109,6 +113,7 @@ def lookup(
            dims.insert(dim, dims[-1])
            dims.pop()
            x = x.permute(*dims)
+        # x: (..., D)
        return x
 
    def quantize(
@@ -118,17 +123,16 @@ def quantize(
        nb_codes = self.embedding.weight.shape[0]
 
        codebook = self.embedding.weight
-        if (self.init_mode == 'first' and self.initialized.item() == 0
-                and self.training):
+        if self.init_mode == "first" and self.initialized.item() == 0 and self.training:
            n_proto = self.embedding.weight.shape[0]
 
            ch_first = x.transpose(dim, -1).contiguous().view(-1, x.shape[dim])
            n_samples = ch_first.shape[0]
-            idx = torch.randint(0, n_samples, (n_proto, ))[:nb_codes]
+            idx = torch.randint(0, n_samples, (n_proto,))[:nb_codes]
            self.embedding.weight.data.copy_(ch_first[idx])
            self.initialized[:] = 1
 
-        needs_transpose = dim != -1 or dim != x.dim() - 1
+        needs_transpose = dim not in (-1, x.dim() - 1)
        if needs_transpose:
            x = x.transpose(-1, dim).contiguous()
 
@@ -139,7 +143,8 @@ def quantize(
            codebook = F.normalize(codebook, dim=1)
            x = F.normalize(x, dim=-1)
 
-        codes, indices = quantize(x, codebook, self.commitment, -1)
+        # x: (..., D)
+        codes, indices = quantize(x, codebook, self.commitment)
 
        if self.training:
            self.update_usage(indices)
@@ -160,39 +165,47 @@ class MultiVQ(nn.Module):
    Learning*
 
    Args:
-        latent_dim (int): number of features along which to quantize
-        num_tokens (int): number of tokens in the codebook
+        embedding_dim (int): number of features along which to quantize
+        num_embeddings (int): number of tokens in the codebook
        num_codebooks (int): number of parallel codebooks
        dim (int): dimension along which to quantize
            an angular distance
        return_indices (bool): whether to return the indices of the quantized
            code points
    """
 
-    def __init__(self,
-                 latent_dim: int,
-                 num_tokens: int,
-                 num_codebooks: int,
-                 dim: int = 1,
-                 commitment: float = 0.25,
-                 init_mode: str = 'normal',
-                 return_indices: bool = True,
-                 max_age: int = 1000):
-        assert latent_dim % num_codebooks == 0, (
-            "num_codebooks must divide evenly latent_dim")
+    def __init__(
+        self,
+        embedding_dim: int,
+        num_embeddings: int,
+        num_codebooks: int,
+        dim: int = 1,
+        commitment: float = 0.25,
+        init_mode: str = "normal",
+        return_indices: bool = True,
+        max_age: int = 1000,
+    ):
+        assert (
+            embedding_dim % num_codebooks == 0
+        ), "num_codebooks must divide evenly embedding_dim"
        super(MultiVQ, self).__init__()
        self.dim = dim
        self.num_codebooks = num_codebooks
        self.return_indices = return_indices
-        self.vqs = nn.ModuleList([
-            VQ(latent_dim // num_codebooks,
-               num_tokens,
-               dim=dim,
-               commitment=commitment,
-               init_mode=init_mode,
-               return_indices=return_indices,
-               max_age=max_age) for _ in range(num_codebooks)
-        ])
+        self.vqs = nn.ModuleList(
+            [
+                VQ(
+                    num_embeddings,
+                    embedding_dim // num_codebooks,
+                    dim=dim,
+                    commitment=commitment,
+                    init_mode=init_mode,
+                    return_indices=return_indices,
+                    max_age=max_age,
+                )
+                for _ in range(num_codebooks)
+            ]
+        )
 
    def forward(
        self, x: torch.Tensor
@@ -206,13 +219,79 @@ def forward(
        return torch.cat(quantized, dim=self.dim)
 
 
-class MultiVQ2(VQ):
+class RVQ(nn.Module):
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        num_codebooks: int,
+        *,
+        dim: int = 1,
+        commitment: float = 0.25,
+        init_mode: str = "normal",
+        return_indices: bool = True,
+        max_age: int = 1000,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.return_indices = return_indices
+        self.codebooks = nn.ModuleList(
+            [
+                VQ(
+                    num_embeddings,
+                    embedding_dim,
+                    dim=-1,
+                    commitment=commitment,
+                    init_mode=init_mode,
+                    return_indices=True,
+                    max_age=max_age,
+                )
+                for _ in range(num_codebooks)
+            ]
+        )
+
+    def forward(
+        self, x: torch.Tensor
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        dim = self.dim
+        needs_transpose = dim not in (-1, x.dim() - 1)
+        if needs_transpose:
+            x = x.transpose(-1, dim).contiguous()
+
+        out = torch.zeros_like(x)
+        indices = []
+        for i, cb in enumerate(self.codebooks):
+            this_codes, this_indices = cb(x - out)
+            out += this_codes
+            print("residual", torch.norm(x - out).item())
+            indices.append(this_indices)
+
+        indices = torch.cat(indices, dim=-1)
 
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        d = self.latent_dim
-        dims = x.shape
-        batched_dims = list(dims)
-        batched_dims[self.dim] = d
-        batched_dims[self.dim - 1] = -1
-        out = super(MultiVQ2, self).forward(x.view(*batched_dims))
-        return out.view(*dims).contiguous()
+        if needs_transpose:
+            out = out.transpose(-1, dim).contiguous()
+            indices = indices.transpose(-1, dim).contiguous()
+
+        if self.return_indices:
+            return out, indices
+        else:
+            return out
+
+    def lookup(self, x: torch.Tensor) -> torch.Tensor:
+        # x: (..., K)
+        dim = self.dim
+        needs_transpose = dim not in (-1, x.dim() - 1)
+
+        x = torch.stack(
+            [cb.lookup(xx) for cb, xx in zip(self.codebooks, x.split(1, dim=-1))],
+            dim=-1,
+        )
+        x = x.sum(-1)
+
+        if needs_transpose:
+            dims = list(range(x.ndim))
+            dims.insert(dim, dims[-1])
+            dims.pop()
+            x = x.permute(*dims)
+        # x: (..., D)
+        return x
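
A minimal usage sketch of the renamed constructors and the new RVQ layer (not part of the diff above; the import path is hypothetical and assumes VQ and RVQ are exported by this module):

    import torch
    from vq import VQ, RVQ  # hypothetical import path

    x = torch.randn(8, 64, 32)  # (batch, channels, time); dim=1 picks the channel axis

    # new argument order: num_embeddings first, embedding_dim second,
    # remaining options are keyword-only
    vq = VQ(512, 64, dim=1)
    codes, indices = vq(x)  # return_indices=True by default

    # residual VQ: each codebook quantizes the residual left by the previous ones
    rvq = RVQ(512, 64, num_codebooks=4, dim=1)
    out, idx = rvq(x)

Note that the `self.age = nn.Buffer(...)` form replaces `register_buffer('age', ...)` and therefore requires a PyTorch version that provides `torch.nn.Buffer`.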