Add a function that accepts an OpenClip adapter and a list of prompts (List[str]) and returns the input covariance matrix C that is needed

lucidrains · lucidrains · commit a88ab9353a81 · 2023-08-14T09:07:14.000-07:00
diff --git a/README.md b/README.md
@@ -10,6 +10,8 @@ It seems they successfully applied the Rank-1 editing technique from a <a href="
 
 - <a href="https://stability.ai/">StabilityAI</a> for the generous sponsorship, as well as my other sponsors out there
 
+- All the maintainers at <a href="https://github.com/mlfoundations/open_clip">OpenClip</a>, for their SOTA open-sourced contrastive learning text-image models
+
 ## Install
 
 ```bash
diff --git a/perfusion_pytorch/__init__.py b/perfusion_pytorch/__init__.py
@@ -1,3 +1,4 @@
 from perfusion_pytorch.perfusion import (
-    Rank1EditModule
+    Rank1EditModule,
+    calculate_input_covariance
 )
diff --git a/perfusion_pytorch/open_clip.py b/perfusion_pytorch/open_clip.py
@@ -0,0 +1,90 @@
+from beartype import beartype
+from beartype.typing import List, Optional
+
+import torch
+from torch import nn, einsum
+import torch.nn.functional as F
+
+from einops import rearrange
+
+import open_clip
+
+def exists(val):
+    return not (val is None)  # True for anything other than None
+
+def l2norm(t):
+    return F.normalize(t, p = 2., dim = -1)  # unit L2 norm along the last dim
+
+class OpenClipAdapter(nn.Module):
+    """Wrap an open_clip model and tokenizer to expose per-token text encodings.
+
+    A forward hook on `ln_final` stores that layer's output on each text
+    forward pass; `embed_texts` reads it back and `clear` detaches the hook.
+    """
+
+    @beartype
+    def __init__(
+        self,
+        name = 'ViT-B/32',
+        pretrained = 'laion400m_e32',
+        tokenizer_name = 'ViT-B-32-quickgelu',
+        eos_id = 49407
+    ):
+        super().__init__()
+
+        clip, _, preprocess = open_clip.create_model_and_transforms(name, pretrained = pretrained)
+        tokenizer = open_clip.get_tokenizer(tokenizer_name)
+
+        self.clip = clip
+        self.tokenizer = tokenizer
+        self.eos_id = eos_id
+
+        # hook for getting final text representation (size taken from the ln_final weight)
+        text_attention_final = self.find_layer('ln_final')
+        self._dim_latent = text_attention_final.weight.shape[0]
+        self.text_handle = text_attention_final.register_forward_hook(self._text_hook)
+
+        # normalize fn - last transform of the preprocessing pipeline returned by open_clip
+        self.clip_normalize = preprocess.transforms[-1]
+        self.cleared = False
+
+    @property
+    def device(self):
+        return next(self.parameters()).device
+
+    def find_layer(self, layer):
+        """Return the submodule of the wrapped clip model named `layer`, or None if absent."""
+        return dict(self.clip.named_modules()).get(layer)
+
+    def clear(self):
+        """Detach the text forward hook and mark the adapter cleared; repeat calls are no-ops."""
+        if self.cleared:
+            return
+        self.text_handle.remove()  # the hook handle is not callable, it has to be removed
+        self.cleared = True
+
+    def _text_hook(self, _, inputs, outputs):
+        self.text_encodings = outputs
+
+    @property
+    def dim_latent(self):
+        return self._dim_latent
+
+    @property
+    def max_text_len(self):
+        return self.clip.positional_embedding.shape[0]
+
+    @beartype
+    def embed_texts(self, texts: List[str]):
+        """Return `(encodings, mask)`: float hook output zeroed where the bool mask is False."""
+        assert not self.cleared, 'text hook was removed by clear(), encodings are unavailable'
+        ids = self.tokenizer(texts).to(self.device)[..., :self.max_text_len]
+
+        # True up to and including the first eos token, False wherever the token id is 0
+        before_eos = (ids == self.eos_id).cumsum(dim = -1) == 0
+        text_mask = F.pad(before_eos, (1, -1), value = True) & (ids != 0)
+
+        # called for its side effect only: the ln_final hook fills self.text_encodings
+        self.clip.encode_text(ids)
+        text_encodings = self.text_encodings.masked_fill(~text_mask[..., None], 0.)
+        return text_encodings.float(), text_mask
diff --git a/perfusion_pytorch/perfusion.py b/perfusion_pytorch/perfusion.py
@@ -1,22 +1,52 @@
+from math import ceil
 from beartype import beartype
-from beartype.typing import Union
+from beartype.typing import Union, List, Optional
 
 import torch
-from torch import nn, einsum, Tensor, IntTensor, LongTensor, FloatTensor, Optional
+from torch import nn, einsum, Tensor, IntTensor, LongTensor, FloatTensor
 from torch.nn import Module
 import torch.nn.functional as F
 
 from einops import rearrange
 
 from opt_einsum import contract as opt_einsum
 
+from perfusion_pytorch.open_clip import OpenClipAdapter
+
 # helpers
 
 def exists(val):
     return val is not None
 
 IndicesTensor = Union[LongTensor, IntTensor]
 
+# function for calculating C - input covariance
+
+@beartype
+@torch.no_grad()
+def calculate_input_covariance(
+    open_clip: OpenClipAdapter,
+    texts: List[str],
+    batch_size = 32,
+    **cov_kwargs
+):
+    """Return the input covariance C of the text encodings of `texts`.
+
+    Texts are embedded `batch_size` at a time, only mask-selected positions are
+    kept, and `cov_kwargs` are forwarded to `torch.cov` (one row per dimension).
+    """
+    num_batches = ceil(len(texts) / batch_size)
+    all_embeds = []
+
+    for batch_ind in range(num_batches):
+        start_index = batch_ind * batch_size
+        embeds, mask = open_clip.embed_texts(texts[start_index:(start_index + batch_size)])
+        all_embeds.append(embeds[mask])
+
+    all_embeds = torch.cat(all_embeds, dim = 0)
+    all_embeds = rearrange(all_embeds, 'n d -> d n')
+    return torch.cov(all_embeds, **cov_kwargs)
+
 # a module that wraps the keys and values projection of the cross attentions to text encodings
 
 class Rank1EditModule(Module):
diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 setup(
   name = 'perfusion-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.0.9',
+  version = '0.0.10',
   license='MIT',
   description = 'Perfusion - Pytorch',
   author = 'Phil Wang',
@@ -19,6 +19,7 @@
   install_requires=[
     'beartype',
     'einops>=0.6.1',
+    'open-clip-torch>=2.0.0,<3.0.0',
     'opt-einsum',
     'torch>=2.0'
   ],

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
`1`	`1`	`from perfusion_pytorch.perfusion import (`
`2`		`- Rank1EditModule`
	`2`	`+ Rank1EditModule,`
	`3`	`+ calculate_input_covariance`
`3`	`4`	`)`