
Commit c4db96a

Move custom blocks from AuxiliaryLatentModule to AnyTextControlNetConditioningEmbedding
1 parent 21c0c35 commit c4db96a

2 files changed: +53 -80 lines changed

examples/research_projects/anytext/pipeline_anytext.py

Lines changed: 1 addition & 57 deletions
@@ -33,7 +33,6 @@
 from frozen_clip_embedder_t3 import FrozenCLIPEmbedderT3
 from PIL import Image, ImageDraw, ImageFont
 from recognizer import TextRecognizer, create_predictor
-from safetensors.torch import load_file
 from torch import nn
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

@@ -410,9 +409,6 @@ class AuxiliaryLatentModule(nn.Module):
     def __init__(
         self,
         font_path,
-        glyph_channels=1,
-        position_channels=1,
-        model_channels=320,
         vae=None,
         device="cpu",
         use_fp16=False,
@@ -422,57 +418,8 @@ def __init__(
         self.use_fp16 = use_fp16
         self.device = device
 
-        self.glyph_block = nn.Sequential(
-            nn.Conv2d(glyph_channels, 8, 3, padding=1),
-            nn.SiLU(),
-            nn.Conv2d(8, 8, 3, padding=1),
-            nn.SiLU(),
-            nn.Conv2d(8, 16, 3, padding=1, stride=2),
-            nn.SiLU(),
-            nn.Conv2d(16, 16, 3, padding=1),
-            nn.SiLU(),
-            nn.Conv2d(16, 32, 3, padding=1, stride=2),
-            nn.SiLU(),
-            nn.Conv2d(32, 32, 3, padding=1),
-            nn.SiLU(),
-            nn.Conv2d(32, 96, 3, padding=1, stride=2),
-            nn.SiLU(),
-            nn.Conv2d(96, 96, 3, padding=1),
-            nn.SiLU(),
-            nn.Conv2d(96, 256, 3, padding=1, stride=2),
-            nn.SiLU(),
-        )
-
-        self.position_block = nn.Sequential(
-            nn.Conv2d(position_channels, 8, 3, padding=1),
-            nn.SiLU(),
-            nn.Conv2d(8, 8, 3, padding=1),
-            nn.SiLU(),
-            nn.Conv2d(8, 16, 3, padding=1, stride=2),
-            nn.SiLU(),
-            nn.Conv2d(16, 16, 3, padding=1),
-            nn.SiLU(),
-            nn.Conv2d(16, 32, 3, padding=1, stride=2),
-            nn.SiLU(),
-            nn.Conv2d(32, 32, 3, padding=1),
-            nn.SiLU(),
-            nn.Conv2d(32, 64, 3, padding=1, stride=2),
-            nn.SiLU(),
-        )
-
         self.vae = vae.eval() if vae is not None else None
 
-        self.fuse_block = nn.Conv2d(256 + 64 + 4, model_channels, 3, padding=1)
-
-        self.glyph_block.load_state_dict(load_file("glyph_block.safetensors", device=str(self.device)))
-        self.position_block.load_state_dict(load_file("position_block.safetensors", device=str(self.device)))
-        self.fuse_block.load_state_dict(load_file("fuse_block.safetensors", device=str(self.device)))
-
-        if use_fp16:
-            self.glyph_block = self.glyph_block.to(dtype=torch.float16)
-            self.position_block = self.position_block.to(dtype=torch.float16)
-            self.fuse_block = self.fuse_block.to(dtype=torch.float16)
-
     @torch.no_grad()
     def forward(
         self,
@@ -518,11 +465,8 @@ def forward(
 
         glyphs = torch.cat(text_info["glyphs"], dim=1).sum(dim=1, keepdim=True)
         positions = torch.cat(text_info["positions"], dim=1).sum(dim=1, keepdim=True)
-        enc_glyph = self.glyph_block(glyphs)
-        enc_pos = self.position_block(positions)
-        guided_hint = self.fuse_block(torch.cat([enc_glyph, enc_pos, text_info["masked_x"]], dim=1))
 
-        return guided_hint
+        return glyphs, positions, text_info
 
     def check_channels(self, image):
         channels = image.shape[2] if len(image.shape) == 3 else 1
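
Note: `AuxiliaryLatentModule.forward` now stops at aggregating the glyph and position hint maps and returns them, together with `text_info`, instead of a fused embedding. The `glyph_block`/`position_block`/`fuse_block` encoding moves into `AnyTextControlNetConditioningEmbedding` in the second file, which is also why the `safetensors.torch.load_file` import and the state-dict loading are dropped from the pipeline.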

examples/research_projects/anytext/text_controlnet.py

Lines changed: 52 additions & 23 deletions
@@ -14,7 +14,6 @@
 from typing import Any, Dict, Optional, Tuple, Union
 
 import torch
-import torch.nn.functional as F
 from torch import nn
 
 from diffusers.configuration_utils import register_to_config
@@ -40,37 +39,67 @@ class AnyTextControlNetConditioningEmbedding(nn.Module):
 
     def __init__(
         self,
-        conditioning_embedding_channels: int,
-        conditioning_channels: int = 3,
-        block_out_channels: Tuple[int, ...] = (16, 32, 96, 256),
+        glyph_channels=1,
+        position_channels=1,
+        model_channels=320,
     ):
         super().__init__()
 
-        self.conv_in = nn.Conv2d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1)
-
-        self.blocks = nn.ModuleList([])
-
-        for i in range(len(block_out_channels) - 1):
-            channel_in = block_out_channels[i]
-            channel_out = block_out_channels[i + 1]
-            self.blocks.append(nn.Conv2d(channel_in, channel_in, kernel_size=3, padding=1))
-            self.blocks.append(nn.Conv2d(channel_in, channel_out, kernel_size=3, padding=1, stride=2))
+        self.glyph_block = nn.Sequential(
+            nn.Conv2d(glyph_channels, 8, 3, padding=1),
+            nn.SiLU(),
+            nn.Conv2d(8, 8, 3, padding=1),
+            nn.SiLU(),
+            nn.Conv2d(8, 16, 3, padding=1, stride=2),
+            nn.SiLU(),
+            nn.Conv2d(16, 16, 3, padding=1),
+            nn.SiLU(),
+            nn.Conv2d(16, 32, 3, padding=1, stride=2),
+            nn.SiLU(),
+            nn.Conv2d(32, 32, 3, padding=1),
+            nn.SiLU(),
+            nn.Conv2d(32, 96, 3, padding=1, stride=2),
+            nn.SiLU(),
+            nn.Conv2d(96, 96, 3, padding=1),
+            nn.SiLU(),
+            nn.Conv2d(96, 256, 3, padding=1, stride=2),
+            nn.SiLU(),
+        )
 
-        self.conv_out = zero_module(
-            nn.Conv2d(block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, padding=1)
+        self.position_block = nn.Sequential(
+            nn.Conv2d(position_channels, 8, 3, padding=1),
+            nn.SiLU(),
+            nn.Conv2d(8, 8, 3, padding=1),
+            nn.SiLU(),
+            nn.Conv2d(8, 16, 3, padding=1, stride=2),
+            nn.SiLU(),
+            nn.Conv2d(16, 16, 3, padding=1),
+            nn.SiLU(),
+            nn.Conv2d(16, 32, 3, padding=1, stride=2),
+            nn.SiLU(),
+            nn.Conv2d(32, 32, 3, padding=1),
+            nn.SiLU(),
+            nn.Conv2d(32, 64, 3, padding=1, stride=2),
+            nn.SiLU(),
         )
 
-    def forward(self, conditioning):
-        embedding = self.conv_in(conditioning)
-        embedding = F.silu(embedding)
+        self.fuse_block = nn.Conv2d(256 + 64 + 4, model_channels, 3, padding=1)
+
+        # self.glyph_block.load_state_dict(load_file("glyph_block.safetensors", device=str(self.device)))
+        # self.position_block.load_state_dict(load_file("position_block.safetensors", device=str(self.device)))
+        # self.fuse_block.load_state_dict(load_file("fuse_block.safetensors", device=str(self.device)))
 
-        for block in self.blocks:
-            embedding = block(embedding)
-            embedding = F.silu(embedding)
+        # if use_fp16:
+        #     self.glyph_block = self.glyph_block.to(dtype=torch.float16)
+        #     self.position_block = self.position_block.to(dtype=torch.float16)
+        #     self.fuse_block = self.fuse_block.to(dtype=torch.float16)
 
-        embedding = self.conv_out(embedding)
+    def forward(self, glyphs, positions, text_info):
+        glyph_embedding = self.glyph_block(glyphs)
+        position_embedding = self.position_block(positions)
+        guided_hint = self.fuse_block(torch.cat([glyph_embedding, position_embedding, text_info["masked_x"]], dim=1))
 
-        return embedding
+        return guided_hint
 
 
 class AnyTextControlNetModel(ControlNetModel):
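
As a sanity check of the relocated blocks, here is a minimal, hypothetical usage sketch (not part of this commit). It assumes `text_controlnet.py` is importable from the working directory and that the usual AnyText resolutions apply; the input sizes below are inferred from the stride-2 convolutions, since the glyph map must start at twice the position map's resolution for all three tensors to meet on the same 64x64 latent grid as `masked_x`:

import torch

from text_controlnet import AnyTextControlNetConditioningEmbedding

# Randomly initialized; the commented-out load_state_dict calls in the diff
# suggest pretrained weights are loaded elsewhere after this refactor.
cond_embedding = AnyTextControlNetConditioningEmbedding(
    glyph_channels=1, position_channels=1, model_channels=320
)

# Assumed shapes for a 512x512 image with 64x64, 4-channel VAE latents:
glyphs = torch.randn(1, 1, 1024, 1024)   # glyph_block downsamples 16x -> (1, 256, 64, 64)
positions = torch.randn(1, 1, 512, 512)  # position_block downsamples 8x -> (1, 64, 64, 64)
text_info = {"masked_x": torch.randn(1, 4, 64, 64)}

# fuse_block concatenates 256 + 64 + 4 channels and projects to model_channels.
guided_hint = cond_embedding(glyphs, positions, text_info)
print(guided_hint.shape)  # torch.Size([1, 320, 64, 64])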
