Commit 9c6607f

support kolors! (#106)

1 parent 2a4709e commit 9c6607f

20 files changed: +2510 −281 lines changed

README.md

Lines changed: 7 additions & 4 deletions

@@ -8,6 +8,7 @@ DiffSynth Studio is a Diffusion engine. We have restructured architectures inclu
 Until now, DiffSynth Studio has supported the following models:
 
 * [ExVideo](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1)
+* [Kolors](https://huggingface.co/Kwai-Kolors/Kolors)
 * [Stable Diffusion 3](https://huggingface.co/stabilityai/stable-diffusion-3-medium)
 * [Stable Video Diffusion](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt)
 * [Hunyuan-DiT](https://github.com/Tencent/HunyuanDiT)
@@ -85,11 +86,13 @@ Generate high-resolution images, by breaking the limitation of diffusion models!
 
 LoRA fine-tuning is supported in [`examples/train`](./examples/train/).
 
-|Stable Diffusion|Stable Diffusion XL|
+|Model|Example|
 |-|-|
-|![1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/6fc84611-8da6-4a1f-8fee-9a34eba3b4a5)|![1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/67687748-e738-438c-aee5-96096f09ac90)|![2048](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/584186bc-9855-4140-878e-99541f9a757f)|
-|Stable Diffusion 3|Hunyuan-DiT|
-|![image_1024](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/4df346db-6f91-420a-b4c1-26e205376098)|![image_1024](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/60b022c8-df3f-4541-95ab-bf39f2fa8bb5)|
+|Stable Diffusion|![1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/6fc84611-8da6-4a1f-8fee-9a34eba3b4a5)|
+|Stable Diffusion XL|![1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/67687748-e738-438c-aee5-96096f09ac90)|
+|Stable Diffusion 3|![image_1024](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/4df346db-6f91-420a-b4c1-26e205376098)|
+|Kolors|![image_1024](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/53ef6f41-da11-4701-8665-9f64392607bf)|
+|Hunyuan-DiT|![image_1024](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/60b022c8-df3f-4541-95ab-bf39f2fa8bb5)|
 
 ### Toon Shading
 
diffsynth/models/__init__.py

Lines changed: 62 additions & 7 deletions

@@ -1,4 +1,4 @@
-import torch, os
+import torch, os, json
 from safetensors import safe_open
 from typing_extensions import Literal, TypeAlias
 from typing import List
@@ -36,6 +36,7 @@
 from .hunyuan_dit_text_encoder import HunyuanDiTCLIPTextEncoder, HunyuanDiTT5TextEncoder
 from .hunyuan_dit import HunyuanDiT
+from .kolors_text_encoder import ChatGLMModel
 
 
 preset_models_on_huggingface = {
@@ -159,6 +160,20 @@
         ("AI-ModelScope/IP-Adapter", "sdxl_models/image_encoder/model.safetensors", "models/IpAdapter/stable_diffusion_xl/image_encoder"),
         ("AI-ModelScope/IP-Adapter", "sdxl_models/ip-adapter_sdxl.bin", "models/IpAdapter/stable_diffusion_xl"),
     ],
+    # Kolors
+    "Kolors": [
+        ("Kwai-Kolors/Kolors", "text_encoder/config.json", "models/kolors/Kolors/text_encoder"),
+        ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model.bin.index.json", "models/kolors/Kolors/text_encoder"),
+        ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00001-of-00007.bin", "models/kolors/Kolors/text_encoder"),
+        ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00002-of-00007.bin", "models/kolors/Kolors/text_encoder"),
+        ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00003-of-00007.bin", "models/kolors/Kolors/text_encoder"),
+        ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00004-of-00007.bin", "models/kolors/Kolors/text_encoder"),
+        ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00005-of-00007.bin", "models/kolors/Kolors/text_encoder"),
+        ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00006-of-00007.bin", "models/kolors/Kolors/text_encoder"),
+        ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00007-of-00007.bin", "models/kolors/Kolors/text_encoder"),
+        ("Kwai-Kolors/Kolors", "unet/diffusion_pytorch_model.safetensors", "models/kolors/Kolors/unet"),
+        ("Kwai-Kolors/Kolors", "vae/diffusion_pytorch_model.safetensors", "models/kolors/Kolors/vae"),
+    ],
 }
 Preset_model_id: TypeAlias = Literal[
     "HunyuanDiT",
@@ -184,7 +199,8 @@
     "IP-Adapter-SD",
     "IP-Adapter-SDXL",
     "StableDiffusion3",
-    "StableDiffusion3_without_T5"
+    "StableDiffusion3_without_T5",
+    "Kolors",
 ]
 Preset_model_website: TypeAlias = Literal[
     "HuggingFace",
@@ -272,8 +288,7 @@ def is_stable_diffusion(self, state_dict):
 
     def is_controlnet(self, state_dict):
         param_name = "control_model.time_embed.0.weight"
-        param_name_2 = "mid_block.resnets.1.time_emb_proj.weight" # For controlnets in diffusers format
-        return param_name in state_dict or param_name_2 in state_dict
+        return param_name in state_dict
 
     def is_animatediff(self, state_dict):
         param_name = "mid_block.motion_modules.0.temporal_transformer.proj_out.weight"
@@ -343,6 +358,21 @@ def is_stable_diffusion_3_t5(self, state_dict):
         param_name = "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"
         return param_name in state_dict
 
+    def is_kolors_text_encoder(self, file_path):
+        file_list = os.listdir(file_path)
+        if "config.json" in file_list:
+            try:
+                with open(os.path.join(file_path, "config.json"), "r") as f:
+                    config = json.load(f)
+                    if config.get("model_type") == "chatglm":
+                        return True
+            except:
+                pass
+        return False
+
+    def is_kolors_unet(self, state_dict):
+        return "up_blocks.2.resnets.2.time_emb_proj.weight" in state_dict and "encoder_hid_proj.weight" in state_dict
+
     def load_stable_video_diffusion(self, state_dict, components=None, file_path="", add_positional_conv=None):
         component_dict = {
             "image_encoder": SVDImageEncoder,
@@ -532,13 +562,13 @@ def load_diffusers_vae(self, state_dict, file_path=""):
         component = "vae_encoder"
         model = SDXLVAEEncoder()
         model.load_state_dict(model.state_dict_converter().from_diffusers(state_dict))
-        model.to(self.torch_dtype).to(self.device)
+        model.to(torch.float32).to(self.device)
         self.model[component] = model
         self.model_path[component] = file_path
         component = "vae_decoder"
         model = SDXLVAEDecoder()
         model.load_state_dict(model.state_dict_converter().from_diffusers(state_dict))
-        model.to(self.torch_dtype).to(self.device)
+        model.to(torch.float32).to(self.device)
         self.model[component] = model
         self.model_path[component] = file_path
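Pinning the diffusers-format SDXL VAE to float32 rather than self.torch_dtype matches the common workaround for the SDXL VAE overflowing in float16.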

@@ -592,6 +622,21 @@ def load_stable_diffusion_3_t5(self, state_dict, file_path=""):
         self.model[component] = model
         self.model_path[component] = file_path
 
+    def load_kolors_text_encoder(self, state_dict=None, file_path=""):
+        component = "kolors_text_encoder"
+        model = ChatGLMModel.from_pretrained(file_path, torch_dtype=self.torch_dtype)
+        model = model.to(dtype=self.torch_dtype, device=self.device)
+        self.model[component] = model
+        self.model_path[component] = file_path
+
+    def load_kolors_unet(self, state_dict, file_path=""):
+        component = "kolors_unet"
+        model = SDXLUNet(is_kolors=True)
+        model.load_state_dict(model.state_dict_converter().from_diffusers(state_dict))
+        model.to(self.torch_dtype).to(self.device)
+        self.model[component] = model
+        self.model_path[component] = file_path
+
     def search_for_embeddings(self, state_dict):
         embeddings = []
         for k in state_dict:
@@ -607,7 +652,11 @@ def load_textual_inversions(self, folder):
 
         # Load every textual inversion file
         for file_name in os.listdir(folder):
-            if file_name.endswith(".txt"):
+            if os.path.isdir(os.path.join(folder, file_name)) or \
+                not (file_name.endswith(".bin") or \
+                     file_name.endswith(".safetensors") or \
+                     file_name.endswith(".pth") or \
+                     file_name.endswith(".pt")):
                 continue
             keyword = os.path.splitext(file_name)[0]
             state_dict = load_state_dict(os.path.join(folder, file_name))
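The loader previously skipped only .txt files and fed everything else to load_state_dict; it now skips subdirectories and accepts only weight-file extensions (.bin, .safetensors, .pth, .pt), so stray README or preview files inside an embeddings folder no longer break loading.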
@@ -620,6 +669,10 @@ def load_textual_inversions(self, folder):
                 break
 
     def load_model(self, file_path, components=None, lora_alphas=[]):
+        if os.path.isdir(file_path):
+            if self.is_kolors_text_encoder(file_path):
+                self.load_kolors_text_encoder(file_path=file_path)
+            return
         state_dict = load_state_dict(file_path, torch_dtype=self.torch_dtype)
         if self.is_stable_video_diffusion(state_dict):
             self.load_stable_video_diffusion(state_dict, file_path=file_path)
@@ -663,6 +716,8 @@ def load_model(self, file_path, components=None, lora_alphas=[]):
             self.load_stable_diffusion_3(state_dict, components=components, file_path=file_path)
         elif self.is_stable_diffusion_3_t5(state_dict):
             self.load_stable_diffusion_3_t5(state_dict, file_path=file_path)
+        elif self.is_kolors_unet(state_dict):
+            self.load_kolors_unet(state_dict, file_path=file_path)
 
     def load_models(self, file_path_list, lora_alphas=[]):
         for file_path in file_path_list:
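Taken together, the new directory branch and the two loaders make Kolors loadable like any other model here. A hedged end-to-end sketch, assuming SDXLImagePipeline (whose UNet class the commit reuses via SDXLUNet(is_kolors=True)) offers the usual from_model_manager entry point and call signature:

import torch
from diffsynth import ModelManager, SDXLImagePipeline, download_models

download_models(["Kolors"])
model_manager = ModelManager(torch_dtype=torch.float16, device="cuda")
model_manager.load_models([
    "models/kolors/Kolors/text_encoder",                              # directory -> load_kolors_text_encoder
    "models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors",  # matched by is_kolors_unet
    "models/kolors/Kolors/vae/diffusion_pytorch_model.safetensors",   # diffusers VAE, kept in float32
])
pipe = SDXLImagePipeline.from_model_manager(model_manager)

torch.manual_seed(0)
image = pipe(prompt="a painting full of poetic beauty, ultra-detailed", num_inference_steps=30)
image.save("kolors_example.jpg")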
