
Commit c3d899d

Merge pull request #101 from modelscope/Artiprocher-sd3-lora
Support SD3 LoRA
2 parents 8be4fad + 6e03ee2 commit c3d899d

File tree

13 files changed: +554 −127 lines

README.md

Lines changed: 6 additions & 22 deletions

````diff
@@ -80,15 +80,15 @@ https://github.com/modelscope/DiffSynth-Studio/assets/35051019/d97f6aa9-8064-4b5
 
 ### Image Synthesis
 
-Generate high-resolution images, by breaking the limitation of diffusion models! [`examples/image_synthesis`](./examples/image_synthesis/)
+Generate high-resolution images, by breaking the limitation of diffusion models! [`examples/image_synthesis`](./examples/image_synthesis/).
 
-|512*512|1024*1024|2048*2048|4096*4096|
-|-|-|-|-|
-|![512](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/55f679e9-7445-4605-9315-302e93d11370)|![1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/6fc84611-8da6-4a1f-8fee-9a34eba3b4a5)|![2048](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/9087a73c-9164-4c58-b2a0-effc694143fb)|![4096](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/edee9e71-fc39-4d1c-9ca9-fa52002c67ac)|
+LoRA fine-tuning is supported in [`examples/train`](./examples/train/).
 
-|1024*1024|2048*2048|
+|Stable Diffusion|Stable Diffusion XL|
 |-|-|
-|![1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/67687748-e738-438c-aee5-96096f09ac90)|![2048](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/584186bc-9855-4140-878e-99541f9a757f)|
+|![1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/6fc84611-8da6-4a1f-8fee-9a34eba3b4a5)|![1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/67687748-e738-438c-aee5-96096f09ac90)|![2048](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/584186bc-9855-4140-878e-99541f9a757f)|
+|Stable Diffusion 3|Hunyuan-DiT|
+|![image_1024](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/4df346db-6f91-420a-b4c1-26e205376098)|![image_1024](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/60b022c8-df3f-4541-95ab-bf39f2fa8bb5)|
 
 ### Toon Shading
 
@@ -104,22 +104,6 @@ Video stylization without video models. [`examples/diffsynth`](./examples/diffsy
 
 https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/59fb2f7b-8de0-4481-b79f-0c3a7361a1ea
 
-### Chinese Models
-
-Use Hunyuan-DiT to generate images with Chinese prompts. We also support LoRA fine-tuning of this model. [`examples/hunyuan_dit`](./examples/hunyuan_dit/)
-
-Prompt: 少女手捧鲜花,坐在公园的长椅上,夕阳的余晖洒在少女的脸庞,整个画面充满诗意的美感
-
-|1024x1024|2048x2048 (highres-fix)|
-|-|-|
-|![image_1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/2b6528cf-a229-46e9-b7dd-4a9475b07308)|![image_2048](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/11d264ec-966b-45c9-9804-74b60428b866)|
-
-Prompt: 一只小狗蹦蹦跳跳,周围是姹紫嫣红的鲜花,远处是山脉
-
-|Without LoRA|With LoRA|
-|-|-|
-|![image_without_lora](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/1aa21de5-a992-4b66-b14f-caa44e08876e)|![image_with_lora](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/83a0a41a-691f-4610-8e7b-d8e17c50a282)|
-
 ## Usage (in WebUI)
 
 ```
````

diffsynth/models/__init__.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -567,7 +567,7 @@ def load_stable_diffusion_3(self, state_dict, components=None, file_path=""):
         if component == "sd3_text_encoder_3":
             if "text_encoders.t5xxl.transformer.encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight" not in state_dict:
                 continue
-        elif component == "sd3_text_encoder_1":
+        if component == "sd3_text_encoder_1":
             # Add additional token embeddings to text encoder
             token_embeddings = [state_dict["text_encoders.clip_l.transformer.text_model.embeddings.token_embedding.weight"]]
             for keyword in self.textual_inversion_dict:
```
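For context on the branch this hunk touches: the `sd3_text_encoder_1` path appends textual-inversion embeddings to CLIP's token-embedding matrix. A minimal sketch of that idea, with hypothetical shapes (not the repository's implementation):

```python
import torch

# Illustrative only: extending a token-embedding matrix with extra
# textual-inversion vectors. `base_embeddings` stands in for the
# `text_encoders.clip_l...token_embedding.weight` tensor from the state dict.
base_embeddings = torch.randn(49408, 768)   # (vocab_size, dim), CLIP-L-like
textual_inversion = torch.randn(4, 768)     # 4 learned pseudo-token vectors
extended = torch.cat([base_embeddings, textual_inversion], dim=0)
print(extended.shape)                       # torch.Size([49412, 768])
```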

diffsynth/models/sd3_dit.py

Lines changed: 16 additions & 2 deletions

```diff
@@ -199,16 +199,30 @@ def tiled_forward(self, hidden_states, timestep, prompt_emb, pooled_prompt_emb,
         )
         return hidden_states
 
-    def forward(self, hidden_states, timestep, prompt_emb, pooled_prompt_emb, tiled=False, tile_size=128, tile_stride=64):
+    def forward(self, hidden_states, timestep, prompt_emb, pooled_prompt_emb, tiled=False, tile_size=128, tile_stride=64, use_gradient_checkpointing=False):
         if tiled:
             return self.tiled_forward(hidden_states, timestep, prompt_emb, pooled_prompt_emb, tile_size, tile_stride)
         conditioning = self.time_embedder(timestep, hidden_states.dtype) + self.pooled_text_embedder(pooled_prompt_emb)
         prompt_emb = self.context_embedder(prompt_emb)
 
         height, width = hidden_states.shape[-2:]
         hidden_states = self.pos_embedder(hidden_states)
+
+        def create_custom_forward(module):
+            def custom_forward(*inputs):
+                return module(*inputs)
+            return custom_forward
+
         for block in self.blocks:
-            hidden_states, prompt_emb = block(hidden_states, prompt_emb, conditioning)
+            if self.training and use_gradient_checkpointing:
+                hidden_states, prompt_emb = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states, prompt_emb, conditioning,
+                    use_reentrant=False,
+                )
+            else:
+                hidden_states, prompt_emb = block(hidden_states, prompt_emb, conditioning)
+
         hidden_states = self.norm_out(hidden_states, conditioning)
         hidden_states = self.proj_out(hidden_states)
         hidden_states = rearrange(hidden_states, "B (H W) (P Q C) -> B C (H P) (W Q)", P=2, Q=2, H=height//2, W=width//2)
```
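The new `use_gradient_checkpointing` flag wraps each DiT block in `torch.utils.checkpoint.checkpoint`, which discards the block's activations after the forward pass and recomputes them during backward, trading compute for memory; `use_reentrant=False` selects PyTorch's newer non-reentrant implementation. A minimal, self-contained sketch of the same mechanism (illustrative, not the repository's code):

```python
import torch
import torch.utils.checkpoint

# A stand-in "block": during the checkpointed forward, its activations are
# not stored; they are recomputed when backward reaches this block.
block = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.GELU())
x = torch.randn(2, 64, requires_grad=True)

y = torch.utils.checkpoint.checkpoint(block, x, use_reentrant=False)
y.sum().backward()
print(x.grad.shape)  # torch.Size([2, 64])
```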

diffsynth/prompts/sd3_prompter.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -69,7 +69,7 @@ def encode_prompt(
 
         # T5
         if text_encoder_3 is None:
-            prompt_emb_3 = torch.zeros((1, 256, 4096), dtype=prompt_emb_1.dtype, device=device)
+            prompt_emb_3 = torch.zeros((prompt_emb_1.shape[0], 256, 4096), dtype=prompt_emb_1.dtype, device=device)
         else:
             prompt_emb_3 = self.encode_prompt_using_t5(pure_prompt, text_encoder_3, self.tokenizer_3, 256, device)
             prompt_emb_3 = prompt_emb_3.to(prompt_emb_1.dtype) # float32 -> float16
```
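This one-line fix matters for batched prompts: the placeholder T5 embedding was always allocated with batch size 1, which no longer lines up once `prompt_emb_1` carries several prompts. A small illustration with hypothetical embedding shapes (only the batch dimension matters here):

```python
import torch

prompt_emb_1 = torch.randn(4, 77, 2048, dtype=torch.float16)  # e.g. 4 prompts from CLIP

# Before the fix: batch size hard-coded to 1.
before = torch.zeros((1, 256, 4096), dtype=prompt_emb_1.dtype)

# After the fix: the placeholder follows the CLIP batch size, so the
# per-prompt embeddings stay aligned downstream.
after = torch.zeros((prompt_emb_1.shape[0], 256, 4096), dtype=prompt_emb_1.dtype)
assert after.shape[0] == prompt_emb_1.shape[0]
```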

diffsynth/prompts/utils.py

Lines changed: 7 additions & 0 deletions

```diff
@@ -124,6 +124,13 @@ def del_textual_inversion_tokens(self, prompt):
         return prompt
 
     def process_prompt(self, prompt, positive=True, require_pure_prompt=False):
+        if isinstance(prompt, list):
+            prompt = [self.process_prompt(prompt_, positive=positive, require_pure_prompt=require_pure_prompt) for prompt_ in prompt]
+            if require_pure_prompt:
+                prompt, pure_prompt = [i[0] for i in prompt], [i[1] for i in prompt]
+                return prompt, pure_prompt
+            else:
+                return prompt
         prompt, pure_prompt = self.add_textual_inversion_tokens(prompt), self.del_textual_inversion_tokens(prompt)
         if positive and self.translator is not None:
             prompt = self.translator(prompt)
```
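`process_prompt` now also accepts a list of prompts: it recurses over the elements and, when `require_pure_prompt` is set, unzips the resulting `(prompt, pure_prompt)` tuples into two parallel lists. A standalone sketch of that control flow (a hypothetical simplification, not the class itself):

```python
def process_prompt(prompt, require_pure_prompt=False):
    if isinstance(prompt, list):
        results = [process_prompt(p, require_pure_prompt) for p in prompt]
        if require_pure_prompt:
            # Unzip [(prompt, pure_prompt), ...] into two parallel lists.
            return [r[0] for r in results], [r[1] for r in results]
        return results
    pure_prompt = prompt  # stand-in for textual-inversion token stripping
    return (prompt, pure_prompt) if require_pure_prompt else prompt

prompts, pure_prompts = process_prompt(["a cat", "a dog"], require_pure_prompt=True)
print(prompts, pure_prompts)  # ['a cat', 'a dog'] ['a cat', 'a dog']
```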

diffsynth/schedulers/flow_match.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -40,3 +40,8 @@ def add_noise(self, original_samples, noise, timestep):
         sigma = self.sigmas[timestep_id]
         sample = (1 - sigma) * original_samples + sigma * noise
         return sample
+
+
+    def training_target(self, sample, noise, timestep):
+        target = noise - sample
+        return target
```
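Given the scheduler's forward process `sample = (1 - sigma) * x0 + sigma * noise`, the derivative with respect to `sigma` is `noise - x0`, so a flow-matching model regresses that constant velocity; this is exactly what `training_target` returns. A sketch of where it slots into a training step (`model` and `scheduler` are hypothetical stand-ins, not DiffSynth's actual training loop):

```python
import torch
import torch.nn.functional as F

def training_step(model, scheduler, x0, timestep):
    noise = torch.randn_like(x0)
    noisy = scheduler.add_noise(x0, noise, timestep)          # (1 - sigma) * x0 + sigma * noise
    target = scheduler.training_target(x0, noise, timestep)   # noise - x0
    pred = model(noisy, timestep)
    return F.mse_loss(pred, target)
```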

examples/image_synthesis/README.md

Lines changed: 19 additions & 5 deletions

```diff
@@ -1,34 +1,48 @@
 # Image Synthesis
 
-Image synthesis is the base feature of DiffSynth Studio.
+Image synthesis is the base feature of DiffSynth Studio. We can generate images with very high resolution.
 
 ### Example: Stable Diffusion
 
-We can generate images with very high resolution. Please see [`sd_text_to_image.py`](./sd_text_to_image.py) for more details.
+Example script: [`sd_text_to_image.py`](./sd_text_to_image.py)
 
 |512*512|1024*1024|2048*2048|4096*4096|
 |-|-|-|-|
 |![512](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/55f679e9-7445-4605-9315-302e93d11370)|![1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/6fc84611-8da6-4a1f-8fee-9a34eba3b4a5)|![2048](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/9087a73c-9164-4c58-b2a0-effc694143fb)|![4096](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/edee9e71-fc39-4d1c-9ca9-fa52002c67ac)|
 
 ### Example: Stable Diffusion XL
 
-Generate images with Stable Diffusion XL. Please see [`sdxl_text_to_image.py`](./sdxl_text_to_image.py) for more details.
+Example script: [`sdxl_text_to_image.py`](./sdxl_text_to_image.py)
 
 |1024*1024|2048*2048|
 |-|-|
 |![1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/67687748-e738-438c-aee5-96096f09ac90)|![2048](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/584186bc-9855-4140-878e-99541f9a757f)|
 
 ### Example: Stable Diffusion 3
 
-Generate images with Stable Diffusion 3. High resolution is also supported in this model. See [`sd3_text_to_image.py`](./sd3_text_to_image.py).
+Example script: [`sd3_text_to_image.py`](./sd3_text_to_image.py)
+
+LoRA Training: [`../train/stable_diffusion_3/`](../train/stable_diffusion_3/)
 
 |1024*1024|2048*2048|
 |-|-|
 |![image_1024](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/4df346db-6f91-420a-b4c1-26e205376098)|![image_2048](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/1386c802-e580-4101-939d-f1596802df9d)|
 
+### Example: Hunyuan-DiT
+
+Example script: [`hunyuan_dit_text_to_image.py`](./hunyuan_dit_text_to_image.py)
+
+LoRA Training: [`../train/hunyuan_dit/`](../train/hunyuan_dit/)
+
+|1024*1024|2048*2048|
+|-|-|
+|![image_1024](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/60b022c8-df3f-4541-95ab-bf39f2fa8bb5)|![image_2048](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/87919ea8-d428-4963-8257-da05f3901bbb)|
+
 ### Example: Stable Diffusion XL Turbo
 
-Generate images with Stable Diffusion XL Turbo. You can see [`sdxl_turbo.py`](./sdxl_turbo.py) for more details, but we highly recommend you to use it in the WebUI.
+Example script: [`sdxl_turbo.py`](./sdxl_turbo.py)
+
+We highly recommend you to use this model in the WebUI.
 
 |"black car"|"red car"|
 |-|-|
```
examples/image_synthesis/hunyuan_dit_text_to_image.py

Lines changed: 42 additions & 0 deletions

```diff
@@ -0,0 +1,42 @@
+from diffsynth import ModelManager, HunyuanDiTImagePipeline, download_models
+import torch
+
+
+# Download models (automatically)
+# `models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin`: [link](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/resolve/main/t2i/clip_text_encoder/pytorch_model.bin)
+# `models/HunyuanDiT/t2i/mt5/pytorch_model.bin`: [link](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/resolve/main/t2i/mt5/pytorch_model.bin)
+# `models/HunyuanDiT/t2i/model/pytorch_model_ema.pt`: [link](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/resolve/main/t2i/model/pytorch_model_ema.pt)
+# `models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin`: [link](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/resolve/main/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin)
+download_models(["HunyuanDiT"])
+
+# Load models
+model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
+model_manager.load_models([
+    "models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin",
+    "models/HunyuanDiT/t2i/mt5/pytorch_model.bin",
+    "models/HunyuanDiT/t2i/model/pytorch_model_ema.pt",
+    "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin"
+])
+pipe = HunyuanDiTImagePipeline.from_model_manager(model_manager)
+
+prompt = "一幅充满诗意美感的全身肖像画,画中一位银发、蓝色眼睛、身穿蓝色连衣裙的少女漂浮在水下,周围是光彩的气泡,和煦的阳光透过水面折射进水下"
+negative_prompt = "错误的眼睛,糟糕的人脸,毁容,糟糕的艺术,变形,多余的肢体,模糊的颜色,模糊,重复,病态,残缺,"
+
+# Enjoy!
+torch.manual_seed(0)
+image = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    num_inference_steps=50, height=1024, width=1024,
+)
+image.save("image_1024.png")
+
+# Highres fix
+image = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    input_image=image.resize((2048, 2048)),
+    num_inference_steps=50, height=2048, width=2048,
+    denoising_strength=0.4, tiled=True,
+)
+image.save("image_2048.png")
```

examples/image_synthesis/sd3_text_to_image.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -4,9 +4,9 @@
 
 # Download models (automatically)
 # `models/stable_diffusion_3/sd3_medium_incl_clips.safetensors`: [link](https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips.safetensors)
-download_models(["StableDiffusion3"])
+download_models(["StableDiffusion3_without_T5"])
 model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
-                             file_path_list=["models/stable_diffusion_3/sd3_medium_incl_clips_t5xxlfp16.safetensors"])
+                             file_path_list=["models/stable_diffusion_3/sd3_medium_incl_clips.safetensors"])
 pipe = SD3ImagePipeline.from_model_manager(model_manager)
 
 
```
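This change pairs with the `sd3_prompter.py` fix above: when no T5 encoder is loaded, `encode_prompt` substitutes a zero tensor for `prompt_emb_3`, so the smaller CLIP-only checkpoint is sufficient. If you do want the T5-XXL text encoder, the pre-PR configuration shown in the removed lines still applies; a sketch (the import line is an assumption about the script's header, which is not shown in this hunk):

```python
from diffsynth import ModelManager, SD3ImagePipeline, download_models
import torch

# Load the checkpoint that bundles the T5-XXL text encoder
# (larger download and higher VRAM use).
download_models(["StableDiffusion3"])
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda",
                             file_path_list=["models/stable_diffusion_3/sd3_medium_incl_clips_t5xxlfp16.safetensors"])
pipe = SD3ImagePipeline.from_model_manager(model_manager)
```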

examples/hunyuan_dit/README.md renamed to examples/train/hunyuan_dit/README.md

Lines changed: 2 additions & 94 deletions

````diff
@@ -28,99 +28,6 @@ from diffsynth import download_models
 download_models(["HunyuanDiT"])
 ```
 
-## Inference
-
-### Text-to-image with highres-fix
-
-The original resolution of Hunyuan DiT is 1024x1024. If you want to use larger resolutions, please use highres-fix.
-
-Hunyuan DiT is also supported in our UI.
-
-```python
-from diffsynth import ModelManager, HunyuanDiTImagePipeline
-import torch
-
-
-# Load models
-model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
-model_manager.load_models([
-    "models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin",
-    "models/HunyuanDiT/t2i/mt5/pytorch_model.bin",
-    "models/HunyuanDiT/t2i/model/pytorch_model_ema.pt",
-    "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin"
-])
-pipe = HunyuanDiTImagePipeline.from_model_manager(model_manager)
-
-# Enjoy!
-torch.manual_seed(0)
-image = pipe(
-    prompt="少女手捧鲜花,坐在公园的长椅上,夕阳的余晖洒在少女的脸庞,整个画面充满诗意的美感",
-    negative_prompt="错误的眼睛,糟糕的人脸,毁容,糟糕的艺术,变形,多余的肢体,模糊的颜色,模糊,重复,病态,残缺,",
-    num_inference_steps=50, height=1024, width=1024,
-)
-image.save("image_1024.png")
-
-# Highres fix
-image = pipe(
-    prompt="少女手捧鲜花,坐在公园的长椅上,夕阳的余晖洒在少女的脸庞,整个画面充满诗意的美感",
-    negative_prompt="错误的眼睛,糟糕的人脸,毁容,糟糕的艺术,变形,多余的肢体,模糊的颜色,模糊,重复,病态,残缺,",
-    input_image=image.resize((2048, 2048)),
-    num_inference_steps=50, height=2048, width=2048,
-    cfg_scale=3.0, denoising_strength=0.5, tiled=True,
-)
-image.save("image_2048.png")
-```
-
-Prompt: 少女手捧鲜花,坐在公园的长椅上,夕阳的余晖洒在少女的脸庞,整个画面充满诗意的美感
-
-|1024x1024|2048x2048 (highres-fix)|
-|-|-|
-|![image_1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/2b6528cf-a229-46e9-b7dd-4a9475b07308)|![image_2048](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/11d264ec-966b-45c9-9804-74b60428b866)|
-
-### In-context reference (experimental)
-
-This feature is similar to the "reference-only" mode in ControlNets. By extending the self-attention layer, the content in the reference image can be retained in the new image. Any number of reference images are supported, and the influence from each reference image can be controled by independent `reference_strengths` parameters.
-
-```python
-from diffsynth import ModelManager, HunyuanDiTImagePipeline
-import torch
-
-
-# Load models
-model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
-model_manager.load_models([
-    "models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin",
-    "models/HunyuanDiT/t2i/mt5/pytorch_model.bin",
-    "models/HunyuanDiT/t2i/model/pytorch_model_ema.pt",
-    "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin"
-])
-pipe = HunyuanDiTImagePipeline.from_model_manager(model_manager)
-
-# Generate an image as reference
-torch.manual_seed(0)
-reference_image = pipe(
-    prompt="梵高,星空,油画,明亮",
-    negative_prompt="",
-    num_inference_steps=50, height=1024, width=1024,
-)
-reference_image.save("image_reference.png")
-
-# Generate a new image with reference
-image = pipe(
-    prompt="层峦叠嶂的山脉,郁郁葱葱的森林,皎洁明亮的月光,夜色下的自然美景",
-    negative_prompt="",
-    reference_images=[reference_image], reference_strengths=[0.4],
-    num_inference_steps=50, height=1024, width=1024,
-)
-image.save("image_with_reference.png")
-```
-
-Prompt: 层峦叠嶂的山脉,郁郁葱葱的森林,皎洁明亮的月光,夜色下的自然美景
-
-|Reference image|Generated new image|
-|-|-|
-|![image_reference](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/99b0189d-6175-4842-b480-3c0d2f9f7e17)|![image_with_reference](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/8e41dddb-f302-4a2d-9e52-5487d1f47ae6)|
-
 ## Train
 
 ### Install training dependency
@@ -254,7 +161,8 @@ pipe = HunyuanDiTImagePipeline.from_model_manager(model_manager)
 
 # Generate an image with lora
 pipe.dit = load_lora(
-    pipe.dit, lora_rank=4, lora_alpha=4.0,
+    pipe.dit,
+    lora_rank=4, lora_alpha=4.0, # The two parameters should be consistent with those in your training script.
     lora_path="path/to/your/lora/model/lightning_logs/version_x/checkpoints/epoch=x-step=xxx.ckpt"
 )
 torch.manual_seed(0)
````
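The added comment is worth heeding because `lora_alpha / lora_rank` scales the low-rank update when it is applied to the base weights; mismatched values silently change the effective LoRA strength. An illustrative merge under that convention (a hypothetical helper, not DiffSynth's `load_lora` internals):

```python
import torch

def merge_lora(weight, lora_up, lora_down, lora_rank=4, lora_alpha=4.0):
    # weight: (out, in); lora_up: (out, rank); lora_down: (rank, in).
    # The update is scaled by alpha/rank, so rank and alpha must match
    # the values used at training time.
    return weight + (lora_alpha / lora_rank) * (lora_up @ lora_down)

w = torch.randn(16, 16)
up, down = torch.randn(16, 4), torch.randn(4, 16)
merged = merge_lora(w, up, down)
print(merged.shape)  # torch.Size([16, 16])
```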
