From a3fc50750a2325b72c9c91f0fbe5c1d5be463ef4 Mon Sep 17 00:00:00 2001
From: 武嘉涵
Date: Thu, 14 Aug 2025 19:01:47 +0800
Subject: [PATCH 1/7] CogView4: remove SiLU in final AdaLN (match Megatron); add switch to AdaLayerNormContinuous; split temb_raw/temb_blocks

---
 src/diffusers/models/normalization.py           |  5 +++--
 .../models/transformers/transformer_cogview4.py | 12 ++++++------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/diffusers/models/normalization.py b/src/diffusers/models/normalization.py
index ae2a6298f5f7..bbf0a002ee27 100644
--- a/src/diffusers/models/normalization.py
+++ b/src/diffusers/models/normalization.py
@@ -333,9 +333,10 @@ def __init__(
         eps=1e-5,
         bias=True,
         norm_type="layer_norm",
+        use_silu: bool = True,
     ):
         super().__init__()
-        self.silu = nn.SiLU()
+        self.act = nn.SiLU() if use_silu else nn.Identity()
         self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias)
         if norm_type == "layer_norm":
             self.norm = LayerNorm(embedding_dim, eps, elementwise_affine, bias)
@@ -346,7 +347,7 @@ def __init__(

     def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor:
         # convert back to the original dtype in case `conditioning_embedding`` is upcasted to float32 (needed for hunyuanDiT)
-        emb = self.linear(self.silu(conditioning_embedding).to(x.dtype))
+        emb = self.linear(self.act(conditioning_embedding).to(x.dtype))
         scale, shift = torch.chunk(emb, 2, dim=1)
         x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
         return x
diff --git a/src/diffusers/models/transformers/transformer_cogview4.py b/src/diffusers/models/transformers/transformer_cogview4.py
index dc45befb98fa..ebf6158199f4 100644
--- a/src/diffusers/models/transformers/transformer_cogview4.py
+++ b/src/diffusers/models/transformers/transformer_cogview4.py
@@ -666,7 +666,7 @@ def __init__(
         )

         # 4. Output projection
-        self.norm_out = AdaLayerNormContinuous(inner_dim, time_embed_dim, elementwise_affine=False)
+        self.norm_out = AdaLayerNormContinuous(inner_dim, time_embed_dim, elementwise_affine=False, use_silu=False)
         self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * out_channels, bias=True)

         self.gradient_checkpointing = False
@@ -714,8 +714,8 @@ def forward(

         hidden_states, encoder_hidden_states = self.patch_embed(hidden_states, encoder_hidden_states)

-        temb = self.time_condition_embed(timestep, original_size, target_size, crop_coords, hidden_states.dtype)
-        temb = F.silu(temb)
+        temb_raw = self.time_condition_embed(timestep, original_size, target_size, crop_coords, hidden_states.dtype)
+        temb_blocks = F.silu(temb_raw)

         # 3. Transformer blocks
         for block in self.transformer_blocks:
@@ -724,7 +724,7 @@ def forward(
                     block,
                     hidden_states,
                     encoder_hidden_states,
-                    temb,
+                    temb_blocks,
                     image_rotary_emb,
                     attention_mask,
                     attention_kwargs,
@@ -733,14 +733,14 @@ def forward(
                 hidden_states, encoder_hidden_states = block(
                     hidden_states,
                     encoder_hidden_states,
-                    temb,
+                    temb_blocks,
                     image_rotary_emb,
                     attention_mask,
                     attention_kwargs,
                 )

         # 4. Output norm & projection
-        hidden_states = self.norm_out(hidden_states, temb)
+        hidden_states = self.norm_out(hidden_states, temb_raw)
         hidden_states = self.proj_out(hidden_states)

         # 5. Unpatchify
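Note on the patch above: the generic AdaLayerNormContinuous activates the conditioning embedding with SiLU before the modulation Linear, and CogView4's forward already applies F.silu to the time embedding, so the final norm previously saw a twice-activated conditioning signal. The new use_silu switch and the temb_raw/temb_blocks split route an un-activated embedding into the final norm. A condensed sketch of the patched forward pass, written as a free function with toy shapes (the free-function form and the dimensions are illustrative, and torch.nn.LayerNorm stands in for the library's own LayerNorm wrapper):

import torch
import torch.nn as nn
import torch.nn.functional as F

def ada_layer_norm_continuous(x, cond, linear, norm, use_silu):
    # Mirrors the patched forward: optionally skip the SiLU on the conditioning
    # embedding, then project it to per-channel (scale, shift) modulation.
    emb = linear(F.silu(cond) if use_silu else cond)
    scale, shift = torch.chunk(emb, 2, dim=1)
    return norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]

# Toy shapes: batch 2, 16 tokens, hidden dim 8, conditioning dim 4.
x, cond = torch.randn(2, 16, 8), torch.randn(2, 4)
out = ada_layer_norm_continuous(
    x, cond, nn.Linear(4, 2 * 8), nn.LayerNorm(8, elementwise_affine=False), use_silu=False
)
print(out.shape)  # torch.Size([2, 16, 8])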
From 710a1c4859eaa6af82c00dbaae0b62a2c0c59d1f Mon Sep 17 00:00:00 2001
From: 武嘉涵
Date: Fri, 15 Aug 2025 10:37:01 +0800
Subject: [PATCH 2/7] CogView4: remove SiLU in final AdaLN (match Megatron); add switch to AdaLayerNormContinuous; split temb_raw/temb_blocks

---
 src/diffusers/models/transformers/transformer_cogview4.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/models/transformers/transformer_cogview4.py b/src/diffusers/models/transformers/transformer_cogview4.py
index ebf6158199f4..b64f3a308295 100644
--- a/src/diffusers/models/transformers/transformer_cogview4.py
+++ b/src/diffusers/models/transformers/transformer_cogview4.py
@@ -740,7 +740,7 @@ def forward(
                 )

         # 4. Output norm & projection
-        hidden_states = self.norm_out(hidden_states, temb_raw)
+        hidden_states = self.norm_out(hidden_states, temb)
         hidden_states = self.proj_out(hidden_states)

         # 5. Unpatchify

From db619eb94080aa61664132c20a051af74cd26b34 Mon Sep 17 00:00:00 2001
From: 武嘉涵
Date: Fri, 15 Aug 2025 10:45:54 +0800
Subject: [PATCH 3/7] CogView4: remove SiLU in final AdaLN (match Megatron); add switch to AdaLayerNormContinuous; split temb_raw/temb_blocks

---
 src/diffusers/models/transformers/transformer_cogview4.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/diffusers/models/transformers/transformer_cogview4.py b/src/diffusers/models/transformers/transformer_cogview4.py
index b64f3a308295..3455f8e6ab6d 100644
--- a/src/diffusers/models/transformers/transformer_cogview4.py
+++ b/src/diffusers/models/transformers/transformer_cogview4.py
@@ -714,8 +714,8 @@ def forward(

         hidden_states, encoder_hidden_states = self.patch_embed(hidden_states, encoder_hidden_states)

-        temb_raw = self.time_condition_embed(timestep, original_size, target_size, crop_coords, hidden_states.dtype)
-        temb_blocks = F.silu(temb_raw)
+        temb = self.time_condition_embed(timestep, original_size, target_size, crop_coords, hidden_states.dtype)
+        temb = F.silu(temb)

         # 3. Transformer blocks
         for block in self.transformer_blocks:
@@ -724,7 +724,7 @@ def forward(
                     block,
                     hidden_states,
                     encoder_hidden_states,
-                    temb_blocks,
+                    temb,
                     image_rotary_emb,
                     attention_mask,
                     attention_kwargs,
@@ -733,7 +733,7 @@ def forward(
                 hidden_states, encoder_hidden_states = block(
                     hidden_states,
                     encoder_hidden_states,
-                    temb_blocks,
+                    temb,
                     image_rotary_emb,
                     attention_mask,
                     attention_kwargs,
From bbafd5326dc201773cb99073b3d4852a8d6ae9e8 Mon Sep 17 00:00:00 2001
From: 武嘉涵
Date: Fri, 15 Aug 2025 16:43:59 +0800
Subject: [PATCH 4/7] CogView4: use local final AdaLN (no SiLU) per review; keep generic AdaLN unchanged

---
 .../transformers/transformer_cogview4.py | 36 +++++++++++++++++--
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/models/transformers/transformer_cogview4.py b/src/diffusers/models/transformers/transformer_cogview4.py
index 3455f8e6ab6d..c0c86593b4c1 100644
--- a/src/diffusers/models/transformers/transformer_cogview4.py
+++ b/src/diffusers/models/transformers/transformer_cogview4.py
@@ -28,8 +28,7 @@
 from ..embeddings import CogView3CombinedTimestepSizeEmbeddings
 from ..modeling_outputs import Transformer2DModelOutput
 from ..modeling_utils import ModelMixin
-from ..normalization import AdaLayerNormContinuous
-
+from ..normalization import LayerNorm, RMSNorm

 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

@@ -584,6 +583,37 @@ def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tens
         return (freqs.cos(), freqs.sin())


+class _CogViewFinalAdaLayerNormContinuous(nn.Module):
+    """
+    CogView4-only final AdaLN: LN(x) -> Linear(cond) -> chunk -> affine.
+    Matches Megatron: **no activation** before the Linear on conditioning embedding.
+    """
+    def __init__(
+        self,
+        embedding_dim: int,
+        conditioning_embedding_dim: int,
+        elementwise_affine: bool = True,
+        eps: float = 1e-5,
+        bias: bool = True,
+        norm_type: str = "layer_norm",
+    ):
+        super().__init__()
+        self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias)
+        if norm_type == "layer_norm":
+            self.norm = LayerNorm(embedding_dim, eps, elementwise_affine, bias)
+        elif norm_type == "rms_norm":
+            self.norm = RMSNorm(embedding_dim, eps, elementwise_affine)
+        else:
+            raise ValueError(f"unknown norm_type {norm_type}")
+
+    def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor:
+        # *** NO SiLU here ***
+        emb = self.linear(conditioning_embedding.to(x.dtype))
+        scale, shift = torch.chunk(emb, 2, dim=1)
+        x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
+        return x
+
+
 class CogView4Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, CacheMixin):
     r"""
     Args:
@@ -666,7 +696,7 @@ def __init__(
         )

         # 4. Output projection
-        self.norm_out = AdaLayerNormContinuous(inner_dim, time_embed_dim, elementwise_affine=False, use_silu=False)
+        self.norm_out = _CogViewFinalAdaLayerNormContinuous(inner_dim, time_embed_dim, elementwise_affine=False)
         self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * out_channels, bias=True)

         self.gradient_checkpointing = False
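For reference, a minimal standalone exercise of the class added in this patch, assuming its definition (and the LayerNorm/RMSNorm imports it relies on) is in scope; the dimensions are illustrative, not CogView4's actual configuration:

import torch

hidden_states = torch.randn(2, 16, 64)  # (batch, tokens, inner_dim)
temb = torch.randn(2, 32)               # pooled time/size conditioning embedding
norm_out = _CogViewFinalAdaLayerNormContinuous(
    embedding_dim=64, conditioning_embedding_dim=32, elementwise_affine=False
)
out = norm_out(hidden_states, temb)     # LayerNorm(x), then Linear(temb) -> (1 + scale) * x + shift
print(out.shape)                        # torch.Size([2, 16, 64])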
From 6dd7ff604e6d467e8615012814d626cbacefe56f Mon Sep 17 00:00:00 2001
From: 武嘉涵
Date: Fri, 15 Aug 2025 16:48:37 +0800
Subject: [PATCH 5/7] re-add configs as normal files (no LFS)

---
 src/diffusers/models/normalization.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/models/normalization.py b/src/diffusers/models/normalization.py
index bbf0a002ee27..ae2a6298f5f7 100644
--- a/src/diffusers/models/normalization.py
+++ b/src/diffusers/models/normalization.py
@@ -333,10 +333,9 @@ def __init__(
         eps=1e-5,
         bias=True,
         norm_type="layer_norm",
-        use_silu: bool = True,
     ):
         super().__init__()
-        self.act = nn.SiLU() if use_silu else nn.Identity()
+        self.silu = nn.SiLU()
         self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias)
         if norm_type == "layer_norm":
             self.norm = LayerNorm(embedding_dim, eps, elementwise_affine, bias)
@@ -347,7 +346,7 @@ def __init__(

     def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor:
         # convert back to the original dtype in case `conditioning_embedding`` is upcasted to float32 (needed for hunyuanDiT)
-        emb = self.linear(self.act(conditioning_embedding).to(x.dtype))
+        emb = self.linear(self.silu(conditioning_embedding).to(x.dtype))
         scale, shift = torch.chunk(emb, 2, dim=1)
         x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
         return x

From e09f1b76702659ebae946eef6c7c5f55e12c4ccc Mon Sep 17 00:00:00 2001
From: Aryan
Date: Sun, 17 Aug 2025 03:43:39 +0530
Subject: [PATCH 6/7] Apply suggestions from code review

---
 src/diffusers/models/transformers/transformer_cogview4.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/models/transformers/transformer_cogview4.py b/src/diffusers/models/transformers/transformer_cogview4.py
index c0c86593b4c1..593efdb5e4b0 100644
--- a/src/diffusers/models/transformers/transformer_cogview4.py
+++ b/src/diffusers/models/transformers/transformer_cogview4.py
@@ -583,7 +583,7 @@ def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tens
         return (freqs.cos(), freqs.sin())


-class _CogViewFinalAdaLayerNormContinuous(nn.Module):
+class CogView4AdaLayerNormContinuous(nn.Module):
     """
     CogView4-only final AdaLN: LN(x) -> Linear(cond) -> chunk -> affine.
     Matches Megatron: **no activation** before the Linear on conditioning embedding.
     """
@@ -696,7 +696,7 @@ def __init__(
         )

         # 4. Output projection
-        self.norm_out = _CogViewFinalAdaLayerNormContinuous(inner_dim, time_embed_dim, elementwise_affine=False)
+        self.norm_out = CogView4AdaLayerNormContinuous(inner_dim, time_embed_dim, elementwise_affine=False)
         self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * out_channels, bias=True)

         self.gradient_checkpointing = False
From b519191f0d7bc2dfd5b8a2895fc3de5c757e52fc Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Sat, 16 Aug 2025 22:23:16 +0000
Subject: [PATCH 7/7] Apply style fixes

---
 src/diffusers/models/transformers/transformer_cogview4.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/models/transformers/transformer_cogview4.py b/src/diffusers/models/transformers/transformer_cogview4.py
index 593efdb5e4b0..25dcfa14cc0b 100644
--- a/src/diffusers/models/transformers/transformer_cogview4.py
+++ b/src/diffusers/models/transformers/transformer_cogview4.py
@@ -30,6 +30,7 @@
 from ..modeling_utils import ModelMixin
 from ..normalization import LayerNorm, RMSNorm

+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


@@ -585,9 +586,10 @@ def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tens
         return (freqs.cos(), freqs.sin())


 class CogView4AdaLayerNormContinuous(nn.Module):
     """
-    CogView4-only final AdaLN: LN(x) -> Linear(cond) -> chunk -> affine.
-    Matches Megatron: **no activation** before the Linear on conditioning embedding.
+    CogView4-only final AdaLN: LN(x) -> Linear(cond) -> chunk -> affine. Matches Megatron: **no activation** before the
+    Linear on conditioning embedding.
     """
+
     def __init__(
         self,
         embedding_dim: int,
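Net effect of the series: the transformer blocks keep receiving the SiLU-activated time embedding, while the final AdaLN no longer applies a second SiLU inside the norm, so the conditioning embedding is activated exactly once (in forward) before the final modulation Linear, matching the Megatron reference. A toy check that the removed second activation is not a no-op, using a randomly initialised Linear and illustrative dimensions only:

import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(0)
linear = nn.Linear(4, 2 * 8)             # cond_dim -> 2 * inner_dim, as in both norm variants
temb = F.silu(torch.randn(2, 4))         # forward() already applies SiLU once

old_emb = linear(F.silu(temb))           # old path: generic AdaLayerNormContinuous re-activates temb
new_emb = linear(temb)                   # new path: CogView4AdaLayerNormContinuous passes temb through
print(torch.allclose(old_emb, new_emb))  # False -> the resulting scale/shift modulation differs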