Commit 03059d9

allow for simply fine-tuning off of prompts with a superclass string, without manipulating the BPE dictionary
1 parent 27e31c5 · commit 03059d9

File tree: 3 files changed (+114, −9 lines)

README.md

Lines changed: 29 additions & 0 deletions

````diff
@@ -73,6 +73,35 @@ values = wrapped_to_values(text_enc)
 
 ```
 
+The repository also contains an `EmbeddingWrapper` that makes it easy to train on a new concept (and for eventual inference with multiple concepts)
+
+```python
+import torch
+from torch import nn
+
+from perfusion_pytorch import EmbeddingWrapper
+
+embed = nn.Embedding(49407, 512) # open clip embedding, somewhere in the module tree of stable diffusion
+
+# wrap it, and will automatically create a new concept for learning, based on the superclass embed string
+
+wrapped_embed = EmbeddingWrapper(
+    embed,
+    superclass_string = 'dog'
+)
+
+# now just pass in your prompts with the superclass id
+
+embeds_with_new_concept, embeds_with_superclass, embed_mask = wrapped_embed([
+    'a portrait of dog',
+    'dog running through a green field',
+    'a man walking his dog'
+]) # (3, 77, 512), (3, 77, 512), (3, 77)
+
+# now pass both embeds through clip text transformer
+# the embed_mask needs to be passed to the cross attention as key padding mask
+```
+
 ## Todo
 
 - [ ] wire up with SD 1.5, starting with xiao's dreambooth-sd
````
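As the last two comments in the snippet above suggest, the returned `embed_mask` is meant to be used as a key padding mask downstream. Below is a minimal sketch of that wiring, using a plain `nn.MultiheadAttention` as a stand-in for the CLIP text transformer's attention; the layer and the random tensors are illustrative and not part of this repository.

```python
import torch
from torch import nn

# stand-in attention layer; in practice this would be the attention inside
# the CLIP text transformer that consumes the wrapped embeddings
attn = nn.MultiheadAttention(embed_dim = 512, num_heads = 8, batch_first = True)

# shapes reported by the README example above
embeds_with_new_concept = torch.randn(3, 77, 512)
embed_mask = torch.ones(3, 77).bool()  # True at real-token positions, False at padding

# nn.MultiheadAttention expects True at positions to be *ignored*, so the mask is inverted
attended, _ = attn(
    embeds_with_new_concept,
    embeds_with_new_concept,
    embeds_with_new_concept,
    key_padding_mask = ~embed_mask
)
```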

perfusion_pytorch/embedding.py

Lines changed: 84 additions & 8 deletions

```diff
@@ -1,41 +1,66 @@
 import torch
-from torch import nn
+from torch import nn, Tensor
 from torch.nn import Module
 
 from collections import namedtuple
 
 from beartype import beartype
-from beartype.typing import Optional, Tuple, Union
+from beartype.door import is_bearable
+from beartype.typing import Optional, Tuple, Union, Callable, List
 
 from einops import rearrange
 
+from open_clip import tokenizer
+
 # constants
 
 EmbeddingReturn = namedtuple('EmbeddingReturn', [
     'embed_with_concept',
-    'embed_with_superclass'
+    'embed_with_superclass',
+    'embed_mask'
 ])
 
 # helper functions
 
 def exists(val):
     return val is not None
 
+def default(val, d):
+    return val if exists(val) else d
+
 def is_all_unique(arr):
     return len(set(arr)) == len(arr)
 
 def filter_tuple_indices(tup, indices):
     return tuple(tup[i] for i in indices)
 
+@beartype
+def get_mask(
+    x: Tensor,
+    ids: Tuple[int, ...]
+):
+    masks = tuple(x == i for i in ids)
+    mask, *rest_masks = masks
+
+    for rest_mask in rest_masks:
+        mask = mask | rest_mask
+
+    return mask
+
 # embedding wrapper class
 
 class EmbeddingWrapper(Module):
+
     @beartype
     def __init__(
         self,
         embed: nn.Embedding,
         num_concepts = 1,
-        superclass_embed_id: Optional[Union[int, Tuple[int, ...]]] = None
+        superclass_embed_id: Optional[Union[int, Tuple[int, ...]]] = None,
+        superclass_string: Optional[str] = None,
+        tokenize: Callable[str, Tensor] = tokenizer.tokenize,
+        tokenizer_pad_id: int = 0,
+        tokenizer_sos_eos_id: Tuple[int, int] = (49406, 49407)
     ):
         super().__init__()
         self.embed = embed
@@ -45,7 +70,27 @@ def __init__(
         self.num_concepts = num_concepts
         self.concepts = nn.Parameter(torch.zeros(num_concepts, dim))
 
+        assert exists(superclass_embed_id) ^ exists(superclass_string), 'either superclass embed id is given, or the superclass string'
+
+        self.pad_id = tokenizer_pad_id
+        self.tokenize = None
+
+        if exists(superclass_string):
+            self.tokenize = tokenize
+
+            ids = tokenize([superclass_string])[0]
+
+            mask_for_ids = get_mask(ids, (tokenizer_pad_id, *tokenizer_sos_eos_id))
+            ids = ids[~mask_for_ids]
+
+            assert ids.shape[-1] == 1, f'your superclass concept string must map exactly one token id'
+            superclass_embed_id = ids[0].item()
+
+            print(f'super class embed for "{superclass_string}"" set as {superclass_embed_id}')
+            print(f'you can now pass in a list of strings containing superclass concept, and this wrapper will return the embedding w/ concept and superclass required for finetuning')
+
         self.superclass_embed_id = superclass_embed_id
+
         assert not (exists(superclass_embed_id) and num_concepts > 1), 'cannot do multi concept with superclass embed id given'
 
         if exists(superclass_embed_id):
@@ -67,18 +112,42 @@ def __init__(
     def parameters(self):
         return [self.concepts]
 
+    @beartype
     def forward(
         self,
-        x,
+        x: Union[Tensor, List[str]],
         concept_id: Optional[Union[int, Tuple[int, ...]]] = None,
         return_embed_with_superclass = True
     ) -> EmbeddingReturn:
-        concept_masks = tuple(concept_id == x for concept_id in self.concept_embed_ids)
 
         if exists(concept_id):
            if not isinstance(concept_id, tuple):
                concept_id = (concept_id,)
 
+            assert len(concept_id) == 1, 'can only train or inference on single concepts if passing in list of superclass strings'
+            assert self.num_concepts == 1
+
+        if is_bearable(x, List[str]):
+            inferred_concept_id = self.concept_embed_ids[0]
+
+            x = self.tokenize(x)
+
+            superclass_mask = x == self.superclass_embed_id
+            assert superclass_mask.any(dim = -1).all(), 'superclass embed id must be present for all prompts'
+
+            # automatically replace the superclass id with the concept id
+            x = torch.where(superclass_mask, inferred_concept_id, x)
+
+        # get the embedding mask, defined as not padding id
+        # default to open clip tokenizer padding id of 0
+
+        embed_mask = x != self.pad_id
+
+        # get masks for all concepts (support for multi-concepts)
+
+        concept_masks = tuple(concept_id == x for concept_id in self.concept_embed_ids)
+
+        if exists(concept_id):
             assert is_all_unique(concept_id), 'concept ids must be all unique'
             assert all([cid < self.num_concepts for cid in concept_id])
 
@@ -87,13 +156,20 @@ def forward(
             assert all(filter_tuple_indices(has_concept, concept_id)), f'concept ids {filter_tuple_indices(self.concept_embed_ids, concept_id)} not found in ids passed in'
             concept_masks = filter_tuple_indices(concept_masks, concept_id)
 
+        # just fetch the first embedding, as the concept embeddings are kept external to nn.Embedding
+
         for concept_mask in concept_masks:
             x = x.masked_fill(concept_mask, 0)
 
+        # get all the embeddings that are not the concept or superclass concept
+        # rest of embeddings are also not learnable, only concept embedding
+
         with torch.no_grad():
             embeds = self.embed(x)
             embeds.detach_()
 
+        # substitute the concept back into the embeddings
+
         for concept, concept_mask in zip(self.concepts, concept_masks):
             embeds = torch.where(
                 rearrange(concept_mask, '... -> ... 1'),
@@ -110,9 +186,9 @@ def forward(
             with torch.no_grad():
                 superclass_embeds = self.embed(x)
 
-            return EmbeddingReturn(embeds, superclass_embeds)
+            return EmbeddingReturn(embeds, superclass_embeds, embed_mask)
 
-        return EmbeddingReturn(embeds, None)
+        return EmbeddingReturn(embeds, None, embed_mask)
 
 @beartype
 def merge_embedding_wrappers(
```
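The heart of this change is how `__init__` turns `superclass_string` into a single token id. The following standalone sketch mirrors the hunks above; it assumes `open_clip` is installed and uses its default special ids (pad 0, SOS 49406, EOS 49407), and the variable names are only for illustration.

```python
import torch
from torch import Tensor
from typing import Tuple

from open_clip import tokenizer

def get_mask(x: Tensor, ids: Tuple[int, ...]):
    # True wherever x equals any of the given ids
    masks = tuple(x == i for i in ids)
    mask, *rest_masks = masks
    for rest_mask in rest_masks:
        mask = mask | rest_mask
    return mask

# tokenize the superclass string -> (77,) tensor with SOS, EOS and zero padding
ids = tokenizer.tokenize(['dog'])[0]

# drop pad (0), SOS (49406) and EOS (49407), leaving only the content tokens
ids = ids[~get_mask(ids, (0, 49406, 49407))]

assert ids.shape[-1] == 1, 'the superclass string must map to exactly one token id'
superclass_embed_id = ids[0].item()
print(superclass_embed_id)  # the token id the wrapper will swap for the learned concept
```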

setup.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -3,7 +3,7 @@
 setup(
   name = 'perfusion-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.1.9',
+  version = '0.1.10',
   license='MIT',
   description = 'Perfusion - Pytorch',
   author = 'Phil Wang',
```
