src/diffusers/models/normalization.py (4 changes: 2 additions & 2 deletions)

@@ -43,8 +43,8 @@ def __init__(self, embedding_dim: int, num_embeddings: int):

     def forward(self, x: torch.Tensor, timestep: torch.Tensor) -> torch.Tensor:
         emb = self.linear(self.silu(self.emb(timestep)))
-        scale, shift = torch.chunk(emb, 2)
-        x = self.norm(x) * (1 + scale) + shift
+        scale, shift = torch.chunk(emb, 2, dim=1)
+        x = self.norm(x) * (1 + scale[:, None]) + shift[:, None]
         return x

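For context on why the chunk now uses dim=1 and the scale/shift gain a [:, None], here is a minimal, self-contained sketch with toy shapes (not the diffusers AdaLayerNorm class itself): with a batched timestep the embedding has shape (batch, 2 * dim), so it has to be split per sample and then broadcast over the sequence dimension.

import torch
from torch import nn

# Minimal sketch of the shape handling introduced above (toy sizes, not the
# actual AdaLayerNorm module). With a batched timestep, `emb` is
# (batch, 2 * dim); chunking along dim=1 gives per-sample scale/shift, and
# [:, None] broadcasts them over the sequence dimension of x.
batch, seq_len, dim = 2, 4, 8
emb = torch.randn(batch, 2 * dim)                  # output of the timestep MLP
x = torch.randn(batch, seq_len, dim)
norm = nn.LayerNorm(dim, elementwise_affine=False)

scale, shift = torch.chunk(emb, 2, dim=1)          # each (batch, dim)
out = norm(x) * (1 + scale[:, None]) + shift[:, None]
assert out.shape == (batch, seq_len, dim)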
src/diffusers/models/transformers/transformer_2d.py (13 changes: 9 additions & 4 deletions)

@@ -492,10 +492,8 @@ def custom_forward(*inputs):
             output = F.log_softmax(logits.double(), dim=1).float()

         if self.is_input_patches:
-            if self.config.norm_type != "ada_norm_single":
-                conditioning = self.transformer_blocks[0].norm1.emb(
-                    timestep, class_labels, hidden_dtype=hidden_states.dtype
-                )
+            if self.config.norm_type == "ada_norm":
+                conditioning = self.transformer_blocks[0].norm1.emb(timestep)
                 shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
                 hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
                 hidden_states = self.proj_out_2(hidden_states)

@@ -506,6 +504,13 @@ def custom_forward(*inputs):
                 hidden_states = hidden_states * (1 + scale) + shift
                 hidden_states = self.proj_out(hidden_states)
                 hidden_states = hidden_states.squeeze(1)
+            else:
+                conditioning = self.transformer_blocks[0].norm1.emb(
+                    timestep, class_labels, hidden_dtype=hidden_states.dtype
+                )
+                shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
+                hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
+                hidden_states = self.proj_out_2(hidden_states)
Comment on lines +507 to +513
Member:
Isn't this practically the same as what we're doing when norm_type == "ada_norm"?

Contributor Author:
It is similar, but ada_norm doesn't take class labels as an argument. I moved this into the else branch because the original condition was norm_type != ada_norm_single. From what I can tell, that block still only supports the norm used in the original DiT implementation. It might be worth refactoring it to allow other norm types.

Member:
We could then still condition on if class_labels is not None, or something like that, no?

Contributor Author:
If class_labels is None then you can't use AdaLayerNormZero. I'm not sure what the default norm should be when you want to condition on text or audio, but I picked AdaLayerNorm because it was similar to the zero variant without needing class labels.
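As a rough illustration of that fallback (a hypothetical helper, not code from this PR), the choice could look like the sketch below, assuming both classes keep the (embedding_dim, num_embeddings) constructor shown in normalization.py above.

from diffusers.models.normalization import AdaLayerNorm, AdaLayerNormZero

# Hypothetical helper sketching the fallback described above (not part of the
# PR): use AdaLayerNormZero when class labels are available, otherwise the
# plain timestep-conditioned AdaLayerNorm.
def pick_norm(embedding_dim: int, num_embeddings: int, has_class_labels: bool):
    if has_class_labels:
        return AdaLayerNormZero(embedding_dim, num_embeddings)
    return AdaLayerNorm(embedding_dim, num_embeddings)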

Member:
How about ada_norm_single, i.e., the one used in PixArt-Alpha?

Contributor Author:
If you use that one without additional arguments, you get this error:

TypeError: PixArtAlphaCombinedTimestepSizeEmbeddings(
  (time_proj): Timesteps()
  (timestep_embedder): TimestepEmbedding(
    (linear_1): Linear(in_features=256, out_features=1408, bias=True)
    (act): SiLU()
    (linear_2): Linear(in_features=1408, out_features=1408, bias=True)
  )
) argument after ** must be a mapping, not NoneType

It requires the arguments resolution and aspect_ratio, but those could have a default of None because they aren't required when use_additional_conditions=False.
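A self-contained sketch of that suggestion follows. The module and its names (ToyCombinedEmbeddings, size_embedder) are illustrative only, not the real PixArtAlphaCombinedTimestepSizeEmbeddings: the point is that the extra inputs default to None so the embedder still works when use_additional_conditions=False.

import torch
from torch import nn

# Hypothetical toy embedder illustrating the suggestion above; not the real
# PixArtAlphaCombinedTimestepSizeEmbeddings. resolution/aspect_ratio default to
# None because they are only consumed when use_additional_conditions=True.
class ToyCombinedEmbeddings(nn.Module):
    def __init__(self, dim: int, use_additional_conditions: bool = False):
        super().__init__()
        self.use_additional_conditions = use_additional_conditions
        self.timestep_embedder = nn.Sequential(nn.Linear(1, dim), nn.SiLU(), nn.Linear(dim, dim))
        self.size_embedder = nn.Sequential(nn.Linear(2, dim), nn.SiLU(), nn.Linear(dim, dim))

    def forward(self, timestep, resolution=None, aspect_ratio=None):
        emb = self.timestep_embedder(timestep.float()[:, None])
        if self.use_additional_conditions:
            size = torch.stack([resolution, aspect_ratio], dim=-1).float()
            emb = emb + self.size_embedder(size)
        return emb

emb = ToyCombinedEmbeddings(dim=32)(torch.tensor([3]))  # works with only a timestep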

Member:
Do you think changing that block would be more appropriate?

@yiyixuxu WDYT?


             # unpatchify
             if self.adaln_single is None:
tests/models/test_layers_utils.py (4 changes: 2 additions & 2 deletions)

@@ -378,8 +378,8 @@ def test_spatial_transformer_timestep(self):
             num_embeds_ada_norm=num_embeds_ada_norm,
         ).to(torch_device)
         with torch.no_grad():
-            timestep_1 = torch.tensor(1, dtype=torch.long).to(torch_device)
-            timestep_2 = torch.tensor(2, dtype=torch.long).to(torch_device)
+            timestep_1 = torch.tensor(1, dtype=torch.long).to(torch_device)[None]
+            timestep_2 = torch.tensor(2, dtype=torch.long).to(torch_device)[None]
             attention_scores_1 = spatial_transformer_block(sample, timestep=timestep_1).sample
             attention_scores_2 = spatial_transformer_block(sample, timestep=timestep_2).sample

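For reference, the trailing [None] added in the test gives each scalar timestep a leading batch dimension, matching the batched handling AdaLayerNorm now expects. A quick shape check (standalone sketch, not part of the test file):

import torch

# The [None] in the updated test adds a leading batch dimension.
t = torch.tensor(1, dtype=torch.long)
assert t.shape == torch.Size([])         # 0-d scalar tensor
assert t[None].shape == torch.Size([1])  # batch of one timestep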