
Commit 6b573ae

1 parent 015a059 commit 6b573ae

12 files changed: +506 additions, -68 deletions

comfy/latent_formats.py

Lines changed: 9 additions & 0 deletions
@@ -178,6 +178,15 @@ def process_in(self, latent):
     def process_out(self, latent):
         return (latent / self.scale_factor) + self.shift_factor
 
+class Flux2(LatentFormat):
+    latent_channels = 128
+
+    def process_in(self, latent):
+        return latent
+
+    def process_out(self, latent):
+        return latent
+
 class Mochi(LatentFormat):
     latent_channels = 12
     latent_dimensions = 3
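
Reader's note on comfy/latent_formats.py: a LatentFormat subclass declares how many channels a model's latents carry and how they are scaled on the way into and out of sampling. The new Flux2 format declares 128 latent channels and leaves latents untouched in both directions. A minimal standalone sketch of that pattern (illustrative class names only, not the actual comfy.latent_formats module):

import torch

class LatentFormatSketch:
    # Stand-in for the LatentFormat base class touched in the diff above.
    latent_channels = 4
    scale_factor = 1.0
    shift_factor = 0.0

    def process_in(self, latent):
        return (latent - self.shift_factor) * self.scale_factor

    def process_out(self, latent):
        return (latent / self.scale_factor) + self.shift_factor

class Flux2Sketch(LatentFormatSketch):
    # Mirrors the new Flux2 class: 128 latent channels, identity in/out.
    latent_channels = 128

    def process_in(self, latent):
        return latent

    def process_out(self, latent):
        return latent

latent = torch.randn(1, Flux2Sketch.latent_channels, 32, 32)
assert torch.equal(Flux2Sketch().process_in(latent), latent)  # no scaling applied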

comfy/ldm/flux/layers.py

Lines changed: 63 additions & 27 deletions
@@ -48,11 +48,11 @@ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 10
     return embedding
 
 class MLPEmbedder(nn.Module):
-    def __init__(self, in_dim: int, hidden_dim: int, dtype=None, device=None, operations=None):
+    def __init__(self, in_dim: int, hidden_dim: int, bias=True, dtype=None, device=None, operations=None):
         super().__init__()
-        self.in_layer = operations.Linear(in_dim, hidden_dim, bias=True, dtype=dtype, device=device)
+        self.in_layer = operations.Linear(in_dim, hidden_dim, bias=bias, dtype=dtype, device=device)
         self.silu = nn.SiLU()
-        self.out_layer = operations.Linear(hidden_dim, hidden_dim, bias=True, dtype=dtype, device=device)
+        self.out_layer = operations.Linear(hidden_dim, hidden_dim, bias=bias, dtype=dtype, device=device)
 
     def forward(self, x: Tensor) -> Tensor:
         return self.out_layer(self.silu(self.in_layer(x)))
@@ -80,14 +80,14 @@ def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple:
 
 
 class SelfAttention(nn.Module):
-    def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False, dtype=None, device=None, operations=None):
+    def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False, proj_bias: bool = True, dtype=None, device=None, operations=None):
         super().__init__()
         self.num_heads = num_heads
         head_dim = dim // num_heads
 
         self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device)
         self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
-        self.proj = operations.Linear(dim, dim, dtype=dtype, device=device)
+        self.proj = operations.Linear(dim, dim, bias=proj_bias, dtype=dtype, device=device)
 
 
 @dataclass
@@ -98,11 +98,11 @@ class ModulationOut:
 
 
 class Modulation(nn.Module):
-    def __init__(self, dim: int, double: bool, dtype=None, device=None, operations=None):
+    def __init__(self, dim: int, double: bool, bias=True, dtype=None, device=None, operations=None):
         super().__init__()
         self.is_double = double
         self.multiplier = 6 if double else 3
-        self.lin = operations.Linear(dim, self.multiplier * dim, bias=True, dtype=dtype, device=device)
+        self.lin = operations.Linear(dim, self.multiplier * dim, bias=bias, dtype=dtype, device=device)
 
     def forward(self, vec: Tensor) -> tuple:
         if vec.ndim == 2:
@@ -129,8 +129,18 @@ def apply_mod(tensor, m_mult, m_add=None, modulation_dims=None):
     return tensor
 
 
+class SiLUActivation(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.gate_fn = nn.SiLU()
+
+    def forward(self, x: Tensor) -> Tensor:
+        x1, x2 = x.chunk(2, dim=-1)
+        return self.gate_fn(x1) * x2
+
+
 class DoubleStreamBlock(nn.Module):
-    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, modulation=True, dtype=None, device=None, operations=None):
+    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, modulation=True, mlp_silu_act=False, proj_bias=True, dtype=None, device=None, operations=None):
         super().__init__()
 
         mlp_hidden_dim = int(hidden_size * mlp_ratio)
@@ -142,27 +152,44 @@ def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias:
             self.img_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
 
         self.img_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
+        self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, proj_bias=proj_bias, dtype=dtype, device=device, operations=operations)
 
         self.img_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.img_mlp = nn.Sequential(
-            operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
-            nn.GELU(approximate="tanh"),
-            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
-        )
+
+        if mlp_silu_act:
+            self.img_mlp = nn.Sequential(
+                operations.Linear(hidden_size, mlp_hidden_dim * 2, bias=False, dtype=dtype, device=device),
+                SiLUActivation(),
+                operations.Linear(mlp_hidden_dim, hidden_size, bias=False, dtype=dtype, device=device),
+            )
+        else:
+            self.img_mlp = nn.Sequential(
+                operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
+                nn.GELU(approximate="tanh"),
+                operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
+            )
 
         if self.modulation:
             self.txt_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
 
         self.txt_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
+        self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, proj_bias=proj_bias, dtype=dtype, device=device, operations=operations)
 
         self.txt_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.txt_mlp = nn.Sequential(
-            operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
-            nn.GELU(approximate="tanh"),
-            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
-        )
+
+        if mlp_silu_act:
+            self.txt_mlp = nn.Sequential(
+                operations.Linear(hidden_size, mlp_hidden_dim * 2, bias=False, dtype=dtype, device=device),
+                SiLUActivation(),
+                operations.Linear(mlp_hidden_dim, hidden_size, bias=False, dtype=dtype, device=device),
+            )
+        else:
+            self.txt_mlp = nn.Sequential(
+                operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
+                nn.GELU(approximate="tanh"),
+                operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
+            )
+
         self.flipped_img_txt = flipped_img_txt
 
     def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims_img=None, modulation_dims_txt=None, transformer_options={}):
@@ -246,6 +273,8 @@ def __init__(
         mlp_ratio: float = 4.0,
         qk_scale: float = None,
         modulation=True,
+        mlp_silu_act=False,
+        bias=True,
         dtype=None,
         device=None,
         operations=None
@@ -257,17 +286,24 @@ def __init__(
         self.scale = qk_scale or head_dim**-0.5
 
         self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
+
+        self.mlp_hidden_dim_first = self.mlp_hidden_dim
+        if mlp_silu_act:
+            self.mlp_hidden_dim_first = int(hidden_size * mlp_ratio * 2)
+            self.mlp_act = SiLUActivation()
+        else:
+            self.mlp_act = nn.GELU(approximate="tanh")
+
         # qkv and mlp_in
-        self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim, dtype=dtype, device=device)
+        self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim_first, bias=bias, dtype=dtype, device=device)
         # proj and mlp_out
-        self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, dtype=dtype, device=device)
+        self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, bias=bias, dtype=dtype, device=device)
 
         self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
 
         self.hidden_size = hidden_size
         self.pre_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
 
-        self.mlp_act = nn.GELU(approximate="tanh")
         if modulation:
             self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations)
         else:
@@ -279,7 +315,7 @@ def forward(self, x: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation
         else:
             mod = vec
 
-        qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
+        qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim_first], dim=-1)
 
         q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
         del qkv
@@ -298,11 +334,11 @@ def forward(self, x: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation
 
 
 class LastLayer(nn.Module):
-    def __init__(self, hidden_size: int, patch_size: int, out_channels: int, dtype=None, device=None, operations=None):
+    def __init__(self, hidden_size: int, patch_size: int, out_channels: int, bias=True, dtype=None, device=None, operations=None):
         super().__init__()
         self.norm_final = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, dtype=dtype, device=device)
-        self.adaLN_modulation = nn.Sequential(nn.SiLU(), operations.Linear(hidden_size, 2 * hidden_size, bias=True, dtype=dtype, device=device))
+        self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=bias, dtype=dtype, device=device)
+        self.adaLN_modulation = nn.Sequential(nn.SiLU(), operations.Linear(hidden_size, 2 * hidden_size, bias=bias, dtype=dtype, device=device))
 
     def forward(self, x: Tensor, vec: Tensor, modulation_dims=None) -> Tensor:
         if vec.ndim == 2:
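
Reader's note on comfy/ldm/flux/layers.py: the main addition is an optional gated (SwiGLU-style) MLP. When mlp_silu_act is set, the first Linear outputs 2 * mlp_hidden_dim features, SiLUActivation splits them in half and multiplies one half by SiLU of the other, and the second Linear maps mlp_hidden_dim back to hidden_size, with both Linears bias-free; SingleStreamBlock does the same bookkeeping via mlp_hidden_dim_first. A short self-contained sketch of that shape arithmetic, using plain torch.nn layers in place of the patched operations module:

import torch
from torch import nn

class SiLUActivation(nn.Module):
    # Same gating as the class added above: split the last dim in two, gate one half.
    def forward(self, x):
        x1, x2 = x.chunk(2, dim=-1)
        return nn.functional.silu(x1) * x2

hidden_size, mlp_ratio = 64, 4.0
mlp_hidden_dim = int(hidden_size * mlp_ratio)

gated_mlp = nn.Sequential(
    nn.Linear(hidden_size, mlp_hidden_dim * 2, bias=False),  # doubled width feeds the gate
    SiLUActivation(),                                        # width drops back to mlp_hidden_dim
    nn.Linear(mlp_hidden_dim, hidden_size, bias=False),
)

x = torch.randn(1, 16, hidden_size)
assert gated_mlp(x).shape == x.shape  # (1, 16, 64) in and out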

comfy/ldm/flux/model.py

Lines changed: 61 additions & 19 deletions
@@ -15,6 +15,7 @@
     MLPEmbedder,
     SingleStreamBlock,
     timestep_embedding,
+    Modulation
 )
 
 @dataclass
@@ -33,6 +34,11 @@ class FluxParams:
     patch_size: int
     qkv_bias: bool
     guidance_embed: bool
+    global_modulation: bool = False
+    mlp_silu_act: bool = False
+    ops_bias: bool = True
+    default_ref_method: str = "offset"
+    ref_index_scale: float = 1.0
 
 
 class Flux(nn.Module):
@@ -58,13 +64,17 @@ def __init__(self, image_model=None, final_layer=True, dtype=None, device=None,
         self.hidden_size = params.hidden_size
         self.num_heads = params.num_heads
         self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
-        self.img_in = operations.Linear(self.in_channels, self.hidden_size, bias=True, dtype=dtype, device=device)
-        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations)
-        self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size, dtype=dtype, device=device, operations=operations)
+        self.img_in = operations.Linear(self.in_channels, self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device)
+        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device, operations=operations)
+        if params.vec_in_dim is not None:
+            self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size, dtype=dtype, device=device, operations=operations)
+        else:
+            self.vector_in = None
+
         self.guidance_in = (
-            MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations) if params.guidance_embed else nn.Identity()
+            MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device, operations=operations) if params.guidance_embed else nn.Identity()
         )
-        self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, dtype=dtype, device=device)
+        self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device)
 
         self.double_blocks = nn.ModuleList(
             [
@@ -73,6 +83,9 @@ def __init__(self, image_model=None, final_layer=True, dtype=None, device=None,
                     self.num_heads,
                     mlp_ratio=params.mlp_ratio,
                     qkv_bias=params.qkv_bias,
+                    modulation=params.global_modulation is False,
+                    mlp_silu_act=params.mlp_silu_act,
+                    proj_bias=params.ops_bias,
                     dtype=dtype, device=device, operations=operations
                 )
                 for _ in range(params.depth)
@@ -81,13 +94,30 @@ def __init__(self, image_model=None, final_layer=True, dtype=None, device=None,
 
         self.single_blocks = nn.ModuleList(
             [
-                SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, dtype=dtype, device=device, operations=operations)
+                SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, modulation=params.global_modulation is False, mlp_silu_act=params.mlp_silu_act, bias=params.ops_bias, dtype=dtype, device=device, operations=operations)
                 for _ in range(params.depth_single_blocks)
             ]
        )
 
         if final_layer:
-            self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels, dtype=dtype, device=device, operations=operations)
+            self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels, bias=params.ops_bias, dtype=dtype, device=device, operations=operations)
+
+        if params.global_modulation:
+            self.double_stream_modulation_img = Modulation(
+                self.hidden_size,
+                double=True,
+                bias=False,
+                dtype=dtype, device=device, operations=operations
+            )
+            self.double_stream_modulation_txt = Modulation(
+                self.hidden_size,
+                double=True,
+                bias=False,
+                dtype=dtype, device=device, operations=operations
+            )
+            self.single_stream_modulation = Modulation(
+                self.hidden_size, double=False, bias=False, dtype=dtype, device=device, operations=operations
+            )
 
     def forward_orig(
         self,
@@ -103,9 +133,6 @@ def forward_orig(
         attn_mask: Tensor = None,
     ) -> Tensor:
 
-        if y is None:
-            y = torch.zeros((img.shape[0], self.params.vec_in_dim), device=img.device, dtype=img.dtype)
-
         patches = transformer_options.get("patches", {})
         patches_replace = transformer_options.get("patches_replace", {})
         if img.ndim != 3 or txt.ndim != 3:
@@ -118,9 +145,17 @@ def forward_orig(
         if guidance is not None:
            vec = vec + self.guidance_in(timestep_embedding(guidance, 256).to(img.dtype))
 
-        vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])
+        if self.vector_in is not None:
+            if y is None:
+                y = torch.zeros((img.shape[0], self.params.vec_in_dim), device=img.device, dtype=img.dtype)
+            vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])
+
         txt = self.txt_in(txt)
 
+        vec_orig = vec
+        if self.params.global_modulation:
+            vec = (self.double_stream_modulation_img(vec_orig), self.double_stream_modulation_txt(vec_orig))
+
         if "post_input" in patches:
             for p in patches["post_input"]:
                 out = p({"img": img, "txt": txt, "img_ids": img_ids, "txt_ids": txt_ids})
@@ -177,6 +212,9 @@ def block_wrap(args):
 
         img = torch.cat((txt, img), 1)
 
+        if self.params.global_modulation:
+            vec, _ = self.single_stream_modulation(vec_orig)
+
         for i, block in enumerate(self.single_blocks):
             if ("single_block", i) in blocks_replace:
                 def block_wrap(args):
@@ -207,7 +245,7 @@ def block_wrap(args):
 
         img = img[:, txt.shape[1] :, ...]
 
-        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
+        img = self.final_layer(img, vec_orig)  # (N, T, patch_size ** 2 * out_channels)
         return img
 
     def process_img(self, x, index=0, h_offset=0, w_offset=0, transformer_options={}):
@@ -234,10 +272,10 @@ def process_img(self, x, index=0, h_offset=0, w_offset=0, transformer_options={}
             h_offset += rope_options.get("shift_y", 0.0)
             w_offset += rope_options.get("shift_x", 0.0)
 
-        img_ids = torch.zeros((steps_h, steps_w, 3), device=x.device, dtype=x.dtype)
+        img_ids = torch.zeros((steps_h, steps_w, len(self.params.axes_dim)), device=x.device, dtype=torch.float32)
         img_ids[:, :, 0] = img_ids[:, :, 1] + index
-        img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=steps_h, device=x.device, dtype=x.dtype).unsqueeze(1)
-        img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=steps_w, device=x.device, dtype=x.dtype).unsqueeze(0)
+        img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=steps_h, device=x.device, dtype=torch.float32).unsqueeze(1)
+        img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=steps_w, device=x.device, dtype=torch.float32).unsqueeze(0)
         return img, repeat(img_ids, "h w c -> b (h w) c", b=bs)
 
     def forward(self, x, timestep, context, y=None, guidance=None, ref_latents=None, control=None, transformer_options={}, **kwargs):
@@ -259,10 +297,10 @@ def _forward(self, x, timestep, context, y=None, guidance=None, ref_latents=None
            h = 0
            w = 0
            index = 0
-           ref_latents_method = kwargs.get("ref_latents_method", "offset")
+           ref_latents_method = kwargs.get("ref_latents_method", self.params.default_ref_method)
            for ref in ref_latents:
                if ref_latents_method == "index":
-                    index += 1
+                    index += self.params.ref_index_scale
                     h_offset = 0
                     w_offset = 0
                elif ref_latents_method == "uxo":
@@ -286,7 +324,11 @@ def _forward(self, x, timestep, context, y=None, guidance=None, ref_latents=None
                img = torch.cat([img, kontext], dim=1)
                img_ids = torch.cat([img_ids, kontext_ids], dim=1)
 
-        txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
+        txt_ids = torch.zeros((bs, context.shape[1], len(self.params.axes_dim)), device=x.device, dtype=torch.float32)
+
+        if len(self.params.axes_dim) == 4:  # Flux 2
+            txt_ids[:, :, 3] = torch.linspace(0, context.shape[1] - 1, steps=context.shape[1], device=x.device, dtype=torch.float32)
+
         out = self.forward_orig(img, img_ids, context, txt_ids, timestep, y, guidance, control, transformer_options, attn_mask=kwargs.get("attention_mask", None))
         out = out[:, :img_tokens]
-        return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)[:,:,:h_orig,:w_orig]
+        return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=self.patch_size, pw=self.patch_size)[:,:,:h_orig,:w_orig]
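
Reader's note on the global_modulation path in comfy/ldm/flux/model.py: the blocks are constructed with per-block modulation disabled (modulation=params.global_modulation is False), the raw conditioning vector is kept as vec_orig, three shared bias-free Modulation modules are evaluated once per forward pass (image stream, text stream, single stream), and the blocks receive the precomputed modulation instead of deriving their own; the final layer still sees vec_orig. A rough sketch of that control flow with a deliberately simplified Modulation (hypothetical signature, not the comfy implementation):

import torch
from torch import nn

class Modulation(nn.Module):
    # Simplified non-double modulation: vec -> (shift, scale, gate), no bias.
    def __init__(self, dim, bias=False):
        super().__init__()
        self.lin = nn.Linear(dim, 3 * dim, bias=bias)

    def forward(self, vec):
        return self.lin(nn.functional.silu(vec)).chunk(3, dim=-1)

dim = 64
vec_orig = torch.randn(2, dim)                  # conditioning vector, computed once
single_stream_modulation = Modulation(dim)      # shared by every single-stream block

shift, scale, gate = single_stream_modulation(vec_orig)
x = torch.randn(2, 16, dim)                     # token sequence entering a block
x_mod = x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)   # same idea as apply_mod
attn_out = torch.randn_like(x)                  # stand-in for the block's attention output
x = x + gate.unsqueeze(1) * attn_out            # gate scales the residual update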

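Reader's note on the position-id changes in the same file: both img_ids and txt_ids are now built in float32 with one column per entry of params.axes_dim, and when there are four axes (the branch commented "# Flux 2") the fourth text column is filled with the token index 0..T-1, giving text tokens their own rotary axis; image ids keep reference index, height, and width in the first three columns. A short sketch of just that id construction, with an assumed 4-entry axes_dim for illustration:

import torch

axes_dim = [32, 32, 32, 32]      # assumed 4-axis layout, for illustration only
bs, txt_len = 2, 77
steps_h, steps_w = 16, 16

# Image ids: column 0 is the reference index, columns 1/2 are height/width positions.
img_ids = torch.zeros((steps_h, steps_w, len(axes_dim)), dtype=torch.float32)
img_ids[:, :, 1] = torch.linspace(0, steps_h - 1, steps=steps_h).unsqueeze(1)
img_ids[:, :, 2] = torch.linspace(0, steps_w - 1, steps=steps_w).unsqueeze(0)

# Text ids: only the 4th column is populated, with the per-token position.
txt_ids = torch.zeros((bs, txt_len, len(axes_dim)), dtype=torch.float32)
if len(axes_dim) == 4:
    txt_ids[:, :, 3] = torch.linspace(0, txt_len - 1, steps=txt_len)

print(img_ids.shape, txt_ids.shape)  # torch.Size([16, 16, 4]) torch.Size([2, 77, 4])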