- from beartype.typing import Union
+ from beartype.typing import Union, Optional

import torch
from torch import nn, einsum, Tensor, IntTensor, LongTensor, FloatTensor
from torch.nn import Module
import torch.nn.functional as F

def exists(val):
    return val is not None

+ IndicesTensor = Union[LongTensor, IntTensor]
+
# a module that wraps the keys and values projection of the cross attentions to text encodings

class Rank1EditModule(Module):
@@ -51,6 +53,13 @@ def __init__(

        self.text_seq_len = text_seq_len

+         # for exponentially smoothing the inputs
+         # will smooth both concept and superclass token inputs
+
+         self.register_buffer('initted', torch.zeros(num_finetune_prompts).bool())
+         self.register_buffer('ema_concept_text_enc', torch.zeros(num_finetune_prompts, dim_input))
+         self.register_buffer('ema_superclass_text_enc', torch.zeros(num_finetune_prompts, dim_input))
+
        # C in the paper, inverse precomputed

        self.register_buffer('C_inv', torch.inverse(C))
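
The three buffers just added implement per-prompt exponential smoothing with lazy initialization. As a standalone illustration of the pattern (the shapes, names, and decay value here are made up for the example, not taken from the repo):

```python
import torch

num_prompts, dim, decay = 8, 768, 0.99

ema = torch.zeros(num_prompts, dim)        # one smoothed encoding per prompt
initted = torch.zeros(num_prompts).bool()  # lazy-init flag per prompt

def smooth(prompt_ids, enc):
    # first time a prompt is seen, seed its EMA with the raw encoding;
    # afterwards blend: new = old * decay + current * (1 - decay)
    old = torch.where(initted[prompt_ids, None], ema[prompt_ids], enc)
    new = old * decay + enc * (1. - decay)
    ema[prompt_ids] = new.detach()
    initted[prompt_ids] = True
    return new
```

The `forward` hunk below applies this same pattern to both the concept and superclass encodings.
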
@@ -60,7 +69,9 @@ def forward(
        self,
        text_enc: FloatTensor,
        text_enc_with_superclass: FloatTensor,
-         concept_indices: Union[IntTensor, LongTensor]
+         concept_indices: IndicesTensor,
+         *,
+         prompt_ids: Optional[IndicesTensor] = None
    ):
        assert text_enc.shape[-2] == self.text_seq_len, f'CLIP text sequence length is set to be {self.text_seq_len}, but received text encoding with length {text_enc.shape[-2]}'

@@ -98,21 +109,58 @@ def forward(
        superclass_text_enc = text_enc_with_superclass[batch_indices, concept_indices]
        superclass_text_enc = rearrange(superclass_text_enc, 'b 1 d -> b d')

+         # only if training, and if prompt ids are given,
+         # exponentially smooth the inputs, both concept and superclass
+
+         if self.training and exists(prompt_ids):
+             # get the initialization state
+             # as well as the exponentially smoothed text encodings
+
+             initted = self.initted[prompt_ids]
+             initted = rearrange(initted, 'b -> b 1')
+             all_initted = initted.all()
+
+             ema_concept_text_enc = self.ema_concept_text_enc[prompt_ids]
+             ema_superclass_text_enc = self.ema_superclass_text_enc[prompt_ids]
+
+             # if any in the batch is not initialized, initialize
+
+             if not all_initted:
+                 ema_concept_text_enc = torch.where(
+                     initted,
+                     ema_concept_text_enc,
+                     concept_text_enc
+                 )
+
+                 ema_superclass_text_enc = torch.where(
+                     initted,
+                     ema_superclass_text_enc,
+                     superclass_text_enc
+                 )
+
+             # exponential moving average of both concept and superclass
+
+             concept_text_enc = ema_concept_text_enc * decay + concept_text_enc * (1. - decay)
+             superclass_text_enc = ema_superclass_text_enc * decay + superclass_text_enc * (1. - decay)
+
+             # store the smoothed encodings back into the buffers on every step
+             # (detached, so the buffers stay out of the autograd graph)
+
+             self.initted[prompt_ids] = True
+             self.ema_concept_text_enc[prompt_ids] = concept_text_enc.detach()
+             self.ema_superclass_text_enc[prompt_ids] = superclass_text_enc.detach()
+
+         # take care of the output
+         # for the keys, make sure to turn off gradients as it is 'locked'
+
        superclass_text_enc_output = einsum('b i, o i -> b o', superclass_text_enc, weights)

        if self.is_key_proj:
            superclass_text_enc_output = superclass_text_enc_output.detach()

-         # only during training do they exponentially smooth
-
-         if self.training:
-             online_estimated_concept_enc = decay * superclass_text_enc + (1. - decay) * concept_text_enc
-         else:
-             online_estimated_concept_enc = concept_text_enc
-
        # make it easier to match with paper

-         i, o, W = online_estimated_concept_enc, superclass_text_enc_output, weights
+         i, o, W = concept_text_enc, superclass_text_enc_output, weights

        # main contribution eq (3)

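The hunk ends just before eq (3) itself. For orientation: with `i` the (smoothed) concept input, `o` the key-locked superclass output, `W` the frozen projection weight, and the precomputed `C_inv` from `__init__`, a ROME-style rank-1 edit has the rough shape sketched below. This is a hedged sketch of the general update rule those symbols suggest, not the paper's exact formula — Perfusion additionally gates the edit with a sigmoid over input similarity and applies it functionally at runtime rather than materializing an edited weight:

```python
# per-sample sketch: i (dim_in,), o (dim_out,), W (dim_out, dim_in), C_inv (dim_in, dim_in)
u = C_inv @ i                 # edit direction in input space
residual = o - W @ i          # what the frozen weight fails to produce for the concept
W_edited = W + torch.outer(residual, u) / (u @ i)
```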