make a rough pass over the initialization logic + exponential moving average of the concept text encoding token

lucidrains · lucidrains · commit 42d84bdba0de · 2023-08-12T12:20:46.000-07:00
diff --git a/README.md b/README.md
@@ -6,6 +6,44 @@ Implementation of <a href="https://arxiv.org/abs/2305.01644">Key-Locked Rank One
 
 It seems they successfully applied the Rank-1 editing technique from a <a href="https://arxiv.org/abs/2202.05262">memory editing paper for LLM</a>, with a few improvements. They also identified that the keys determine the "where" of the new concept, while the values determine the "what", and propose local / global-key locking to a superclass concept (while learning the values).
 
+## Install
+
+```bash
+$ pip install perfusion-pytorch
+```
+
+## Usage
+
+```python
+import torch
+from perfusion_pytorch import Rank1EditModule
+from torch import nn
+
+to_keys = nn.Linear(512, 1024, bias = False)
+to_values = nn.Linear(512, 1024, bias = False)
+
+C = torch.randn(512, 512)
+
+wrapped_to_keys = Rank1EditModule(
+    to_keys,
+    C = C,
+    num_finetune_prompts = 32
+)
+
+wrapped_to_values = Rank1EditModule(
+    to_values,
+    C = C,
+    num_finetune_prompts = 32
+)
+
+prompt_ids = torch.arange(4).long()
+text_enc = torch.randn(4, 1024, 512)
+concept_ids = torch.randint(0, 1024, (4,))
+
+keys = wrapped_to_keys(prompt_ids, text_enc, concept_ids)
+values = wrapped_to_values(prompt_ids, text_enc, concept_ids)
+```
+
 ## Citations
 
 ```bibtex
diff --git a/perfusion_pytorch/perfusion.py b/perfusion_pytorch/perfusion.py
@@ -21,6 +21,7 @@ def __init__(
         self,
         key_or_values_proj: nn.Linear,
         *,
+        num_finetune_prompts: int,
         C: Tensor,
         input_decay = 0.99,
         train_beta = 0.75,
@@ -32,6 +33,7 @@ def __init__(
         assert not exists(key_or_values_proj.bias), 'key value projection in attention should not have bias'
 
         self.weight = key_or_values_proj.weight
+        dim_input = self.weight.shape[-1]
 
         self.train_beta = train_beta
         self.train_temperature = train_temperature
@@ -40,18 +42,61 @@ def __init__(
 
         self.input_decay = input_decay
 
+        # the paper exponentially smooths the concept text encoding inputs during training,
+        # in addition to using a lowered learning rate on the text encodings
+
+        self.register_buffer('initted', torch.zeros(num_finetune_prompts).bool())
+        self.register_buffer('ema_concept_text_enc', torch.zeros(num_finetune_prompts, dim_input))
+
         # buffers
 
         self.register_buffer('C_inv', torch.inverse(C))
 
     @beartype
     def forward(
         self,
+        prompt_ids: Tensor,
         text_enc: Tensor,
         concept_indices: Tensor
     ):
         """
         following the pseudocode of Algorithm 1 in appendix
         """
 
-        return text_enc
+        batch, device = text_enc.shape[0], self.initted.device
+
+        weights, decay = self.weight, self.input_decay
+
+        # beta and temperature depend on whether the module is training or running inference
+
+        beta, temperature = (self.train_beta, self.train_temperature) if self.training else (self.eval_beta, self.eval_temperature)
+
+        # extract the concept text encoding input
+
+        batch_indices = torch.arange(batch, device = device)
+        batch_indices = rearrange(batch_indices, 'b -> b 1')
+        concept_indices = rearrange(concept_indices, 'b -> b 1')
+
+        concept_text_enc = text_enc[batch_indices, concept_indices]
+        concept_text_enc = rearrange(concept_text_enc, 'b 1 d -> b d')
+
+        # during training, keep track of exponentially smoothed input
+
+        if self.training:
+            batch_initted = self.initted[prompt_ids]
+            ema_concept_text_enc = self.ema_concept_text_enc[prompt_ids]
+
+            ema_concept_text_enc = torch.where(
+                rearrange(batch_initted, 'b -> b 1'),
+                ema_concept_text_enc,
+                concept_text_enc
+            )
+
+            # update using exponential moving average
+
+            ema_concept_text_enc = ema_concept_text_enc * decay + concept_text_enc * (1. - decay)
+
+            self.initted[prompt_ids] = True
+            self.ema_concept_text_enc[prompt_ids] = ema_concept_text_enc
+
+        return einsum('b n i, o i -> b n o', text_enc, weights)