@@ -106,7 +106,7 @@ def tiled_forward(self, forward_fn, model_input, tile_size, tile_stride, tile_ba
         return model_output


-class Attention(torch.nn.Module):
+class ConvAttention(torch.nn.Module):

     def __init__(self, q_dim, num_heads, head_dim, kv_dim=None, bias_q=False, bias_kv=False, bias_out=False):
         super().__init__()
@@ -115,20 +115,25 @@ def __init__(self, q_dim, num_heads, head_dim, kv_dim=None, bias_q=False, bias_k
         self.num_heads = num_heads
         self.head_dim = head_dim

-        self.to_q = torch.nn.Linear(q_dim, dim_inner, bias=bias_q)
-        self.to_k = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv)
-        self.to_v = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv)
-        self.to_out = torch.nn.Linear(dim_inner, q_dim, bias=bias_out)
+        self.to_q = torch.nn.Conv2d(q_dim, dim_inner, kernel_size=(1, 1), bias=bias_q)
+        self.to_k = torch.nn.Conv2d(kv_dim, dim_inner, kernel_size=(1, 1), bias=bias_kv)
+        self.to_v = torch.nn.Conv2d(kv_dim, dim_inner, kernel_size=(1, 1), bias=bias_kv)
+        self.to_out = torch.nn.Conv2d(dim_inner, q_dim, kernel_size=(1, 1), bias=bias_out)

     def forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None):
         if encoder_hidden_states is None:
             encoder_hidden_states = hidden_states

         batch_size = encoder_hidden_states.shape[0]

-        q = self.to_q(hidden_states)
-        k = self.to_k(encoder_hidden_states)
-        v = self.to_v(encoder_hidden_states)
+        conv_input = rearrange(hidden_states, "B L C -> B C L 1")
+        q = self.to_q(conv_input)
+        q = rearrange(q[:, :, :, 0], "B C L -> B L C")
+        conv_input = rearrange(encoder_hidden_states, "B L C -> B C L 1")
+        k = self.to_k(conv_input)
+        v = self.to_v(conv_input)
+        k = rearrange(k[:, :, :, 0], "B C L -> B L C")
+        v = rearrange(v[:, :, :, 0], "B C L -> B L C")

         q = q.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
         k = k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
@@ -138,7 +143,9 @@ def forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None):
         hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim)
         hidden_states = hidden_states.to(q.dtype)

-        hidden_states = self.to_out(hidden_states)
+        conv_input = rearrange(hidden_states, "B L C -> B C L 1")
+        hidden_states = self.to_out(conv_input)
+        hidden_states = rearrange(hidden_states[:, :, :, 0], "B C L -> B L C")

         return hidden_states

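Side note on the hunks above (not part of the commit): the Linear projections become Conv2d layers with a 1x1 kernel, and each token tensor is rearranged from "B L C" to "B C L 1" before the projection and back afterwards. A 1x1 convolution over that layout applies the same per-token affine map as the original Linear, so the attention math should be unchanged; only the parameter layout differs. A minimal equivalence sketch, with B, L, C, D as made-up illustrative sizes:

# Sketch only: checks that a kernel_size=(1, 1) Conv2d over a "B C L 1" view
# matches a Linear over "B L C" once the weights are shared.
import torch
from einops import rearrange

B, L, C, D = 2, 16, 32, 64                                # hypothetical sizes
linear = torch.nn.Linear(C, D, bias=False)
conv = torch.nn.Conv2d(C, D, kernel_size=(1, 1), bias=False)
conv.weight.data = linear.weight.data[:, :, None, None]  # (D, C) -> (D, C, 1, 1)

x = torch.randn(B, L, C)
y_linear = linear(x)
y_conv = conv(rearrange(x, "B L C -> B C L 1"))
y_conv = rearrange(y_conv[:, :, :, 0], "B C L -> B L C")
print(torch.allclose(y_linear, y_conv, atol=1e-5))        # True
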
@@ -152,7 +159,7 @@ def __init__(self, num_attention_heads, attention_head_dim, in_channels, num_lay
         self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=eps, affine=True)

         self.transformer_blocks = torch.nn.ModuleList([
-            Attention(
+            ConvAttention(
                 inner_dim,
                 num_attention_heads,
                 attention_head_dim,
@@ -236,7 +243,7 @@ def forward(self, hidden_states, time_emb, text_emb, res_stack, **kwargs):
         return hidden_states, time_emb, text_emb, res_stack


-class SD3VAEDecoder(torch.nn.Module):
+class FluxVAEDecoder(torch.nn.Module):
     def __init__(self):
         super().__init__()
         self.scaling_factor = 0.3611
@@ -308,7 +315,7 @@ def forward(self, sample, tiled=False, tile_size=64, tile_stride=32, **kwargs):
         return hidden_states


-class SD3VAEEncoder(torch.nn.Module):
+class FluxVAEEncoder(torch.nn.Module):
     def __init__(self):
         super().__init__()
         self.scaling_factor = 0.3611
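For orientation, a rough usage sketch of the renamed classes (not part of the commit), assuming the encoder mirrors the decoder's forward(sample, tiled=..., tile_size=..., tile_stride=...) signature shown above and that weights are loaded separately; the image tensor and shapes are hypothetical:

# Usage sketch only; the encode/decode round trip and shapes are illustrative.
import torch

encoder = FluxVAEEncoder()
decoder = FluxVAEDecoder()

image = torch.randn(1, 3, 512, 512)                       # hypothetical input image
latents = encoder(image, tiled=True, tile_size=64, tile_stride=32)
reconstruction = decoder(latents, tiled=True, tile_size=64, tile_stride=32)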