 
 import torch
 import torch.nn as nn
-from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoConfig
+from transformers import Qwen2_5_VLConfig, Qwen2_5_VLForConditionalGeneration
 
 from transformers.utils import ModelOutput
 
@@ -33,7 +33,7 @@ class TextEncoderModelOutput(ModelOutput):
 class HunyuanImage_TextEncoder_Qwen(nn.Module):
     def __init__(
         self,
-        model_path: str,
+        model_path: str = "Qwen/Qwen2.5-VL-7B-Instruct",
         apply_final_norm: bool = False,
         hidden_state_skip_layer: Optional[int] = 2,
         crop_start: int = 34,
@@ -43,9 +43,144 @@ def __init__(
         self.apply_final_norm = apply_final_norm
         self.hidden_state_skip_layer = hidden_state_skip_layer
         self.crop_start = crop_start
-
-        config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
-        self.model = AutoModelForVision2Seq.from_config(config, trust_remote_code=True)
+
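+        # Hard-coded config for Qwen/Qwen2.5-VL-7B-Instruct (the default
+        # model_path above); building it inline avoids the network fetch and
+        # trust_remote_code that AutoConfig.from_pretrained required.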
+        config = Qwen2_5_VLConfig(**{
+            "architectures": ["Qwen2_5_VLForConditionalGeneration"],
+            "attention_dropout": 0.0,
+            "bos_token_id": 151643,
+            "eos_token_id": 151645,
+            "hidden_act": "silu",
+            "hidden_size": 3584,
+            "image_token_id": 151655,
+            "initializer_range": 0.02,
+            "intermediate_size": 18944,
+            "max_position_embeddings": 128000,
+            "max_window_layers": 28,
+            "model_type": "qwen2_5_vl",
+            "num_attention_heads": 28,
+            "num_hidden_layers": 28,
+            "num_key_value_heads": 4,
+            "rms_norm_eps": 1e-06,
+            "rope_scaling": {
+                "mrope_section": [16, 24, 24],
+                "rope_type": "default",
+                "type": "default"
+            },
+            "rope_theta": 1000000.0,
+            "sliding_window": 32768,
+            "text_config": {
+                "architectures": ["Qwen2_5_VLForConditionalGeneration"],
+                "attention_dropout": 0.0,
+                "bos_token_id": 151643,
+                "eos_token_id": 151645,
+                "hidden_act": "silu",
+                "hidden_size": 3584,
+                "image_token_id": None,
+                "initializer_range": 0.02,
+                "intermediate_size": 18944,
+                "layer_types": ["full_attention"] * 28,
+                "max_position_embeddings": 128000,
+                "max_window_layers": 28,
+                "model_type": "qwen2_5_vl_text",
+                "num_attention_heads": 28,
+                "num_hidden_layers": 28,
+                "num_key_value_heads": 4,
+                "rms_norm_eps": 1e-06,
+                "rope_scaling": {
+                    "mrope_section": [16, 24, 24],
+                    "rope_type": "default",
+                    "type": "default"
+                },
+                "rope_theta": 1000000.0,
+                "sliding_window": None,
+                "torch_dtype": "float32",
+                "use_cache": True,
+                "use_sliding_window": False,
+                "video_token_id": None,
+                "vision_end_token_id": 151653,
+                "vision_start_token_id": 151652,
+                "vision_token_id": 151654,
+                "vocab_size": 152064
+            },
+            "tie_word_embeddings": False,
+            "torch_dtype": "float32",
+            "transformers_version": "4.54.0",
+            "use_cache": True,
+            "use_sliding_window": False,
+            "video_token_id": 151656,
+            "vision_config": {
+                "depth": 32,
+                "fullatt_block_indexes": [7, 15, 23, 31],
+                "hidden_act": "silu",
+                "hidden_size": 1280,
+                "in_channels": 3,
+                "in_chans": 3,
+                "initializer_range": 0.02,
+                "intermediate_size": 3420,
+                "model_type": "qwen2_5_vl",
+                "num_heads": 16,
+                "out_hidden_size": 3584,
+                "patch_size": 14,
+                "spatial_merge_size": 2,
+                "spatial_patch_size": 14,
+                "temporal_patch_size": 2,
+                "tokens_per_second": 2,
+                "torch_dtype": "float32",
+                "window_size": 112
+            },
+            "vision_end_token_id": 151653,
+            "vision_start_token_id": 151652,
+            "vision_token_id": 151654,
+            "vocab_size": 152064
+        })
+
+        self.model = Qwen2_5_VLForConditionalGeneration(config)
 
         self.output_key = "last_hidden_state"
 
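Since Qwen2_5_VLForConditionalGeneration(config) builds a randomly initialized skeleton rather than downloading weights, the caller still has to load a checkpoint into encoder.model. A minimal sketch of that step, assuming a single local safetensors file (the checkpoint path and the load_state_dict-based loading are assumptions, not part of this commit; only the constructor signature comes from the diff above):

import torch
from safetensors.torch import load_file

# Build the offline skeleton defined above (weights are random at this point).
encoder = HunyuanImage_TextEncoder_Qwen()

# Load locally stored Qwen2.5-VL weights; the checkpoint path is hypothetical,
# and a real 7B checkpoint is typically sharded across several files.
state_dict = load_file("checkpoints/qwen2_5_vl_7b_instruct.safetensors")
missing, unexpected = encoder.model.load_state_dict(state_dict, strict=False)
print(f"missing keys: {len(missing)}, unexpected keys: {len(unexpected)}")

encoder.model.to(dtype=torch.bfloat16).eval()

This keeps model construction fully offline and deterministic; only the weight file itself needs to be present locally.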