 from torch.nn import Module
 import torch.nn.functional as F

-from einops import rearrange
+from einops import rearrange, reduce

 from opt_einsum import contract as opt_einsum

@@ -45,7 +45,8 @@ def calculate_input_covariance(

     all_embeds = torch.cat((all_embeds), dim = 0)
     all_embeds = rearrange(all_embeds, 'n d -> d n')
-    return torch.cov(all_embeds, **cov_kwargs)
+
+    return torch.cov(all_embeds, correction = 0, **cov_kwargs)

 # a module that wraps the keys and values projection of the cross attentions to text encodings

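For reference, a minimal sketch of what the covariance call above computes (sizes are illustrative, not taken from the diff): torch.cov treats each row as one variable, hence the 'n d -> d n' transpose, and correction = 0 selects the population (divide-by-n) covariance rather than the default Bessel-corrected estimate.

    import torch
    from einops import rearrange

    embeds = torch.randn(100_000, 768)          # (num texts, dim) stacked text encodings, shapes illustrative
    embeds = rearrange(embeds, 'n d -> d n')    # torch.cov expects each row to be one variable (one dimension)
    C = torch.cov(embeds, correction = 0)       # (dim, dim) population covariance of the text encodings
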
@@ -56,7 +57,6 @@ def __init__(
         self,
         key_or_values_proj: nn.Linear,
         *,
-        num_finetune_prompts: int,
         C: Tensor,  # covariance of input, precomputed from 100K laion text
         text_seq_len: int = 77,
         is_key_proj: bool = False,
@@ -90,14 +90,14 @@ def __init__(
         # for exponentially smoothing the inputs
         # will smooth both concept and superclass token inputs

-        self.register_buffer('initted', torch.zeros(num_finetune_prompts).bool())
-        self.register_buffer('ema_concept_text_encs', torch.zeros(num_finetune_prompts, dim_input))
+        self.register_buffer('initted', Tensor([False]))
+        self.register_buffer('ema_concept_text_encs', torch.zeros(dim_input))

         # superclass outputs - only optimized for values, but not keys

         self.is_key_proj = is_key_proj  # will lock the output to the super-class, and turn off gradients

-        self.superclass_outputs = nn.Parameter(torch.zeros(num_finetune_prompts, dim_output), requires_grad = not is_key_proj)
+        self.superclass_outputs = nn.Parameter(torch.zeros(dim_output), requires_grad = not is_key_proj)

         # C in the paper, inverse precomputed

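These buffers hold the state for a single concept: a scalar initted flag, one exponentially smoothed concept text encoding of shape (dim_input,), and one superclass output of shape (dim_output,). The smoothing they back is the plain EMA update used later in forward, where decay is assumed to be a coefficient close to 1 (for example 0.99):

    concept_text_enc = ema_concept_text_enc * decay + concept_text_enc * (1. - decay)
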
@@ -150,57 +150,52 @@ def forward(
         concept_indices = rearrange(concept_indices, 'b -> b 1')

         concept_text_enc = text_enc[batch_indices, concept_indices]
-        concept_text_enc = rearrange(concept_text_enc, 'b 1 d -> b d')
+        concept_text_enc = reduce(concept_text_enc, 'b 1 d -> d', 'mean')

-        # only if training, and if prompt ids are given
+        # only if training
         # do exponential smoothing of the inputs, both concept and superclass

         if exists(text_enc_with_superclass):
             superclass_text_enc = text_enc_with_superclass[batch_indices, concept_indices]
-            superclass_text_enc = rearrange(superclass_text_enc, 'b 1 d -> b d')
+            superclass_text_enc = reduce(superclass_text_enc, 'b 1 d -> d', 'mean')

-            superclass_output = einsum('b i, o i -> b o', superclass_text_enc, weights)
+            superclass_output = einsum('i, o i -> o', superclass_text_enc, weights)

         if self.training and exists(prompt_ids):
             # get the initialization state
             # as well as the exponentially smoothed text encodings

-            initted = self.initted[prompt_ids]
-            all_initted = initted.all()
+            initted = self.initted.item()

             ema_concept_text_enc = self.ema_concept_text_encs[prompt_ids]

             # store the superclass i* if not all initialized
             # else fetch it from the buffer

-            if not all_initted:
+            if not initted:
                 assert exists(superclass_output), 'text_enc_with_superclass must be passed in for the first epoch for all prompts to initialize the module correctly'

                 non_initted_prompt_ids = prompt_ids[~initted]

                 # for the prompt ids not initialized yet, hard copy over the initial superclass outputs
-                self.superclass_outputs[non_initted_prompt_ids].data.copy_(superclass_output)
+                self.superclass_outputs.data.copy_(superclass_output)

-            superclass_output = self.superclass_outputs[prompt_ids]
+            superclass_output = self.superclass_outputs

             # if any in the batch is not initialized, initialize

-            if not all_initted:
-                ema_concept_text_enc = torch.where(
-                    rearrange(initted, 'b -> b 1'),
-                    ema_concept_text_enc,
-                    concept_text_enc
-                )
+            if not initted:
+                ema_concept_text_enc = concept_text_enc

             # exponential moving average for concept input encoding

             concept_text_enc = ema_concept_text_enc * decay + concept_text_enc * (1. - decay)

             # store

-            if not all_initted:
-                self.initted[prompt_ids] = True
-                self.ema_concept_text_encs[prompt_ids] = ema_concept_text_enc
+            if not initted:
+                self.initted.data.copy_(Tensor([True]))
+                self.ema_concept_text_encs.data.copy_(ema_concept_text_enc)

         # take care of the output
         # for the keys, make sure to turn off gradients as it is 'locked'
@@ -214,19 +209,18 @@ def forward(

         # main contribution eq (3)

-        i_energy = opt_einsum('b o, o i, b i -> b', i, Ci, i)
-        i_energy = rearrange(i_energy, '... -> ... 1 1')
+        i_energy = opt_einsum('o, o i, i ->', i, Ci, i)

-        sim = opt_einsum('b n o, o i, b i -> b n', text_enc, Ci, i)
+        sim = opt_einsum('b n o, o i, i -> b n', text_enc, Ci, i)
         sim = rearrange(sim, '... -> ... 1')

         sigmoid_term = (((sim / i_energy) - beta) / temperature).sigmoid()

         text_enc_output = einsum('b n i, o i -> b n o', text_enc, W)

-        concept_output = einsum('b i, o i -> b o', i, W)
-        concept_output = rearrange(concept_output, 'b d -> b 1 d')
+        concept_output = einsum('i, o i -> o', i, W)
+        concept_output = rearrange(concept_output, 'd -> 1 1 d')

         W_em_orthogonal_term = text_enc_output - (sim * concept_output / i_energy)

-        return W_em_orthogonal_term + sigmoid_term * rearrange(o, 'b d -> b 1 d')
+        return W_em_orthogonal_term + sigmoid_term * rearrange(o, 'd -> 1 1 d')
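As a reading aid, my own rendering of what the returned expression computes, using symbols matched to the code (e is one token of text_enc, i* is the smoothed concept input i, o* is the concept output o, W the locked projection weights, C the precomputed input covariance, beta and tau the bias and temperature): with i_energy = i*^T C^{-1} i* and sim = e^T C^{-1} i*, the per-token output is the gated rank-one edit the "eq (3)" comment refers to,

    h = W e - \frac{e^\top C^{-1} i^*}{i^{*\top} C^{-1} i^*} \, W i^* + \sigma\!\left( \frac{e^\top C^{-1} i^* / (i^{*\top} C^{-1} i^*) - \beta}{\tau} \right) o^*

i.e. the component of W e along the concept direction (measured in the C^{-1} metric) is removed and replaced by a sigmoid-gated copy of the target concept output o*.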