Commit 979a881

support SD3 LoRA

1 parent 8113f95

File tree: 13 files changed (+1030, -32 lines)

README.md

Lines changed: 6 additions & 22 deletions

````diff
@@ -80,15 +80,15 @@ https://github.com/modelscope/DiffSynth-Studio/assets/35051019/d97f6aa9-8064-4b5
 
 ### Image Synthesis
 
-Generate high-resolution images, by breaking the limitation of diffusion models! [`examples/image_synthesis`](./examples/image_synthesis/)
+Generate high-resolution images, by breaking the limitation of diffusion models! [`examples/image_synthesis`](./examples/image_synthesis/).
 
-|512*512|1024*1024|2048*2048|4096*4096|
-|-|-|-|-|
-|![512](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/55f679e9-7445-4605-9315-302e93d11370)|![1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/6fc84611-8da6-4a1f-8fee-9a34eba3b4a5)|![2048](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/9087a73c-9164-4c58-b2a0-effc694143fb)|![4096](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/edee9e71-fc39-4d1c-9ca9-fa52002c67ac)|
+LoRA fine-tuning is supported in [`examples/train`](./examples/train/).
 
-|1024*1024|2048*2048|
+|Stable Diffusion|Stable Diffusion XL|
 |-|-|
-|![1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/67687748-e738-438c-aee5-96096f09ac90)|![2048](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/584186bc-9855-4140-878e-99541f9a757f)|
+|![1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/6fc84611-8da6-4a1f-8fee-9a34eba3b4a5)|![1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/67687748-e738-438c-aee5-96096f09ac90)|
+|Stable Diffusion 3|Hunyuan-DiT|
+|![image_1024](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/4df346db-6f91-420a-b4c1-26e205376098)|![image_1024](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/60b022c8-df3f-4541-95ab-bf39f2fa8bb5)|
 
 ### Toon Shading
 
@@ -104,22 +104,6 @@ Video stylization without video models. [`examples/diffsynth`](./examples/diffsy
 
 https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/59fb2f7b-8de0-4481-b79f-0c3a7361a1ea
 
-### Chinese Models
-
-Use Hunyuan-DiT to generate images with Chinese prompts. We also support LoRA fine-tuning of this model. [`examples/hunyuan_dit`](./examples/hunyuan_dit/)
-
-Prompt: 少女手捧鲜花,坐在公园的长椅上,夕阳的余晖洒在少女的脸庞,整个画面充满诗意的美感
-
-|1024x1024|2048x2048 (highres-fix)|
-|-|-|
-|![image_1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/2b6528cf-a229-46e9-b7dd-4a9475b07308)|![image_2048](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/11d264ec-966b-45c9-9804-74b60428b866)|
-
-Prompt: 一只小狗蹦蹦跳跳,周围是姹紫嫣红的鲜花,远处是山脉
-
-|Without LoRA|With LoRA|
-|-|-|
-|![image_without_lora](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/1aa21de5-a992-4b66-b14f-caa44e08876e)|![image_with_lora](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/83a0a41a-691f-4610-8e7b-d8e17c50a282)|
-
 ## Usage (in WebUI)
 
 ```
````

(The two removed Chinese prompts translate roughly as "A girl holding flowers, sitting on a park bench, the glow of the setting sun on her face, the whole scene full of poetic beauty" and "A little puppy bounding about, surrounded by colorful flowers, with mountains in the distance".)

diffsynth/models/__init__.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -567,7 +567,7 @@ def load_stable_diffusion_3(self, state_dict, components=None, file_path=""):
         if component == "sd3_text_encoder_3":
             if "text_encoders.t5xxl.transformer.encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight" not in state_dict:
                 continue
-        elif component == "sd3_text_encoder_1":
+        if component == "sd3_text_encoder_1":
             # Add additional token embeddings to text encoder
             token_embeddings = [state_dict["text_encoders.clip_l.transformer.text_model.embeddings.token_embedding.weight"]]
             for keyword in self.textual_inversion_dict:
```

diffsynth/models/sd3_dit.py

Lines changed: 16 additions & 2 deletions

```diff
@@ -199,16 +199,30 @@ def tiled_forward(self, hidden_states, timestep, prompt_emb, pooled_prompt_emb,
         )
         return hidden_states
 
-    def forward(self, hidden_states, timestep, prompt_emb, pooled_prompt_emb, tiled=False, tile_size=128, tile_stride=64):
+    def forward(self, hidden_states, timestep, prompt_emb, pooled_prompt_emb, tiled=False, tile_size=128, tile_stride=64, use_gradient_checkpointing=False):
         if tiled:
             return self.tiled_forward(hidden_states, timestep, prompt_emb, pooled_prompt_emb, tile_size, tile_stride)
         conditioning = self.time_embedder(timestep, hidden_states.dtype) + self.pooled_text_embedder(pooled_prompt_emb)
         prompt_emb = self.context_embedder(prompt_emb)
 
         height, width = hidden_states.shape[-2:]
         hidden_states = self.pos_embedder(hidden_states)
+
+        def create_custom_forward(module):
+            def custom_forward(*inputs):
+                return module(*inputs)
+            return custom_forward
+
         for block in self.blocks:
-            hidden_states, prompt_emb = block(hidden_states, prompt_emb, conditioning)
+            if self.training and use_gradient_checkpointing:
+                hidden_states, prompt_emb = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states, prompt_emb, conditioning,
+                    use_reentrant=False,
+                )
+            else:
+                hidden_states, prompt_emb = block(hidden_states, prompt_emb, conditioning)
+
         hidden_states = self.norm_out(hidden_states, conditioning)
         hidden_states = self.proj_out(hidden_states)
         hidden_states = rearrange(hidden_states, "B (H W) (P Q C) -> B C (H P) (W Q)", P=2, Q=2, H=height//2, W=width//2)
```
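
The new `use_gradient_checkpointing` path trades compute for memory during training: activations inside each DiT block are discarded on the forward pass and recomputed during backward. The `create_custom_forward` wrapper adapts a module call to `torch.utils.checkpoint.checkpoint`, which invokes the wrapped callable with positional tensors only. A minimal standalone sketch of the same pattern, using a toy `torch.nn.Linear` in place of an SD3 block (module and shapes are illustrative assumptions):

```python
import torch

# Toy stand-in for one transformer block; shapes are illustrative.
block = torch.nn.Linear(64, 64)
x = torch.randn(2, 64, requires_grad=True)

def create_custom_forward(module):
    # checkpoint() calls the wrapped function with positional tensors only,
    # so we adapt the module call to that convention.
    def custom_forward(*inputs):
        return module(*inputs)
    return custom_forward

# Forward pass: activations inside `block` are not stored.
y = torch.utils.checkpoint.checkpoint(
    create_custom_forward(block), x, use_reentrant=False,
)
# Backward pass: `block` runs forward again to rebuild its activations,
# then gradients flow as usual.
y.sum().backward()
```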

diffsynth/prompts/sd3_prompter.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -69,7 +69,7 @@ def encode_prompt(
 
         # T5
         if text_encoder_3 is None:
-            prompt_emb_3 = torch.zeros((1, 256, 4096), dtype=prompt_emb_1.dtype, device=device)
+            prompt_emb_3 = torch.zeros((prompt_emb_1.shape[0], 256, 4096), dtype=prompt_emb_1.dtype, device=device)
         else:
             prompt_emb_3 = self.encode_prompt_using_t5(pure_prompt, text_encoder_3, self.tokenizer_3, 256, device)
         prompt_emb_3 = prompt_emb_3.to(prompt_emb_1.dtype) # float32 -> float16
```
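
The fix replaces a hard-coded batch size of 1 with `prompt_emb_1.shape[0]`, so that when no T5 encoder is loaded, the zero placeholder embedding matches the CLIP embedding's batch dimension, which batched prompts require. A small sketch of the shape logic (the 77 sequence length is an illustrative assumption; 256 and 4096 come from the file above):

```python
import torch

# Two prompts in a batch: the CLIP-derived embedding has batch size 2.
prompt_emb_1 = torch.randn(2, 77, 4096, dtype=torch.float16)

# Old code built torch.zeros((1, 256, 4096)): a batch mismatch with prompt_emb_1.
# The fixed code mirrors the batch dimension instead:
prompt_emb_3 = torch.zeros((prompt_emb_1.shape[0], 256, 4096), dtype=prompt_emb_1.dtype)

# The two embeddings concatenate along the sequence axis for the DiT input.
joint_emb = torch.cat([prompt_emb_1, prompt_emb_3], dim=1)
assert joint_emb.shape == (2, 77 + 256, 4096)
```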

diffsynth/prompts/utils.py

Lines changed: 7 additions & 0 deletions

```diff
@@ -124,6 +124,13 @@ def del_textual_inversion_tokens(self, prompt):
         return prompt
 
     def process_prompt(self, prompt, positive=True, require_pure_prompt=False):
+        if isinstance(prompt, list):
+            prompt = [self.process_prompt(prompt_, positive=positive, require_pure_prompt=require_pure_prompt) for prompt_ in prompt]
+            if require_pure_prompt:
+                prompt, pure_prompt = [i[0] for i in prompt], [i[1] for i in prompt]
+                return prompt, pure_prompt
+            else:
+                return prompt
         prompt, pure_prompt = self.add_textual_inversion_tokens(prompt), self.del_textual_inversion_tokens(prompt)
         if positive and self.translator is not None:
             prompt = self.translator(prompt)
```
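
With this change, `process_prompt` also accepts a list of prompts: it recurses on each element and, when `require_pure_prompt` is set, unzips the per-element `(prompt, pure_prompt)` pairs back into two parallel lists. The unzip idiom in isolation, with hypothetical values:

```python
# Each recursive call returns a (prompt, pure_prompt) pair; the list branch
# turns [(p1, q1), (p2, q2)] into ([p1, p2], [q1, q2]).
pairs = [
    ("a painting of <my-style> cat", "a painting of cat"),
    ("a painting of <my-style> dog", "a painting of dog"),
]
prompts, pure_prompts = [i[0] for i in pairs], [i[1] for i in pairs]
assert prompts == ["a painting of <my-style> cat", "a painting of <my-style> dog"]
assert pure_prompts == ["a painting of cat", "a painting of dog"]
```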

diffsynth/schedulers/flow_match.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -40,3 +40,8 @@ def add_noise(self, original_samples, noise, timestep):
         sigma = self.sigmas[timestep_id]
         sample = (1 - sigma) * original_samples + sigma * noise
         return sample
+
+
+    def training_target(self, sample, noise, timestep):
+        target = noise - sample
+        return target
```
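
`training_target` supplies the regression target for rectified-flow training. Since `add_noise` interpolates `x_t = (1 - sigma) * x_0 + sigma * noise`, the velocity `dx_t/dsigma = noise - x_0` is constant along the path, and the model is trained to predict it. A standalone sketch of the resulting objective (all tensors are dummies; the model prediction is a placeholder):

```python
import torch
import torch.nn.functional as F

x0 = torch.randn(1, 16, 64, 64)          # clean latents; shape is illustrative
noise = torch.randn_like(x0)
sigma = torch.rand(())                   # stands in for self.sigmas[timestep_id]

x_t = (1 - sigma) * x0 + sigma * noise   # what add_noise computes
target = noise - x0                      # what training_target returns

pred = torch.randn_like(x0)              # placeholder for the DiT's velocity prediction
loss = F.mse_loss(pred, target)          # typical flow-matching training loss
```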

examples/image_synthesis/README.md

Lines changed: 19 additions & 5 deletions

```diff
@@ -1,34 +1,48 @@
 # Image Synthesis
 
-Image synthesis is the base feature of DiffSynth Studio.
+Image synthesis is the base feature of DiffSynth Studio. We can generate images with very high resolution.
 
 ### Example: Stable Diffusion
 
-We can generate images with very high resolution. Please see [`sd_text_to_image.py`](./sd_text_to_image.py) for more details.
+Example script: [`sd_text_to_image.py`](./sd_text_to_image.py)
 
 |512*512|1024*1024|2048*2048|4096*4096|
 |-|-|-|-|
 |![512](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/55f679e9-7445-4605-9315-302e93d11370)|![1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/6fc84611-8da6-4a1f-8fee-9a34eba3b4a5)|![2048](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/9087a73c-9164-4c58-b2a0-effc694143fb)|![4096](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/edee9e71-fc39-4d1c-9ca9-fa52002c67ac)|
 
 ### Example: Stable Diffusion XL
 
-Generate images with Stable Diffusion XL. Please see [`sdxl_text_to_image.py`](./sdxl_text_to_image.py) for more details.
+Example script: [`sdxl_text_to_image.py`](./sdxl_text_to_image.py)
 
 |1024*1024|2048*2048|
 |-|-|
 |![1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/67687748-e738-438c-aee5-96096f09ac90)|![2048](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/584186bc-9855-4140-878e-99541f9a757f)|
 
 ### Example: Stable Diffusion 3
 
-Generate images with Stable Diffusion 3. High resolution is also supported in this model. See [`sd3_text_to_image.py`](./sd3_text_to_image.py).
+Example script: [`sd3_text_to_image.py`](./sd3_text_to_image.py)
+
+LoRA Training: [`../train/stable_diffusion_3/`](../train/stable_diffusion_3/)
 
 |1024*1024|2048*2048|
 |-|-|
 |![image_1024](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/4df346db-6f91-420a-b4c1-26e205376098)|![image_2048](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/1386c802-e580-4101-939d-f1596802df9d)|
 
+### Example: Hunyuan-DiT
+
+Example script: [`hunyuan_dit_text_to_image.py`](./hunyuan_dit_text_to_image.py)
+
+LoRA Training: [`../train/hunyuan_dit/`](../train/hunyuan_dit/)
+
+|1024*1024|2048*2048|
+|-|-|
+|![image_1024](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/60b022c8-df3f-4541-95ab-bf39f2fa8bb5)|![image_2048](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/87919ea8-d428-4963-8257-da05f3901bbb)|
+
 ### Example: Stable Diffusion XL Turbo
 
-Generate images with Stable Diffusion XL Turbo. You can see [`sdxl_turbo.py`](./sdxl_turbo.py) for more details, but we highly recommend you to use it in the WebUI.
+Example script: [`sdxl_turbo.py`](./sdxl_turbo.py)
+
+We highly recommend you to use this model in the WebUI.
 
 |"black car"|"red car"|
 |-|-|
```
examples/image_synthesis/hunyuan_dit_text_to_image.py

Lines changed: 42 additions & 0 deletions

```diff
@@ -0,0 +1,42 @@
+from diffsynth import ModelManager, HunyuanDiTImagePipeline, download_models
+import torch
+
+
+# Download models (automatically)
+# `models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin`: [link](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/resolve/main/t2i/clip_text_encoder/pytorch_model.bin)
+# `models/HunyuanDiT/t2i/mt5/pytorch_model.bin`: [link](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/resolve/main/t2i/mt5/pytorch_model.bin)
+# `models/HunyuanDiT/t2i/model/pytorch_model_ema.pt`: [link](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/resolve/main/t2i/model/pytorch_model_ema.pt)
+# `models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin`: [link](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/resolve/main/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin)
+download_models(["HunyuanDiT"])
+
+# Load models
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
+model_manager.load_models([
+    "models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin",
+    "models/HunyuanDiT/t2i/mt5/pytorch_model.bin",
+    "models/HunyuanDiT/t2i/model/pytorch_model_ema.pt",
+    "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin"
+])
+pipe = HunyuanDiTImagePipeline.from_model_manager(model_manager)
+
+prompt = "一幅充满诗意美感的全身肖像画,画中一位银发、蓝色眼睛、身穿蓝色连衣裙的少女漂浮在水下,周围是光彩的气泡,和煦的阳光透过水面折射进水下"
+negative_prompt = "错误的眼睛,糟糕的人脸,毁容,糟糕的艺术,变形,多余的肢体,模糊的颜色,模糊,重复,病态,残缺,"
+
+# Enjoy!
+torch.manual_seed(0)
+image = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    num_inference_steps=50, height=1024, width=1024,
+)
+image.save("image_1024.png")
+
+# Highres fix
+image = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    input_image=image.resize((2048, 2048)),
+    num_inference_steps=50, height=2048, width=2048,
+    denoising_strength=0.4, tiled=True,
+)
+image.save("image_2048.png")
```

(The file name was missing from the extracted page and is inferred from the README diff above. The prompt translates roughly as "a poetic, beautiful full-body portrait: a silver-haired, blue-eyed girl in a blue dress floating underwater, surrounded by shimmering bubbles, warm sunlight refracting through the water surface"; the negative prompt lists artifacts to avoid: "wrong eyes, bad face, disfigured, bad art, deformed, extra limbs, blurred colors, blurry, duplicated, morbid, mutilated,".)

examples/image_synthesis/sd3_text_to_image.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -6,7 +6,7 @@
 # `models/stable_diffusion_3/sd3_medium_incl_clips.safetensors`: [link](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips.safetensors)
 download_models(["StableDiffusion3"])
 model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
-                             file_path_list=["models/stable_diffusion_3/sd3_medium_incl_clips_t5xxlfp16.safetensors"])
+                             file_path_list=["models/stable_diffusion_3/sd3_medium_incl_clips.safetensors"])
 pipe = SD3ImagePipeline.from_model_manager(model_manager)
 
 
```