
Commit c1b0f3d

hlky authored and yiyixuxu committed
no expand attn mask, two forward pass
1 parent 565fa0c commit c1b0f3d

3 files changed: +24 -32 lines changed

3 files changed

+24
-32
lines changed

src/diffusers/models/normalization.py

Lines changed: 5 additions & 9 deletions
@@ -515,19 +515,15 @@ def forward(self, hidden_states):
             if self.bias is not None:
                 hidden_states = hidden_states + self.bias
         else:
-            input_dtype = hidden_states.dtype
-            variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
-            hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
-
             if self.weight is not None:
                 # convert into half-precision if necessary
                 if self.weight.dtype in [torch.float16, torch.bfloat16]:
                     hidden_states = hidden_states.to(self.weight.dtype)
-                hidden_states = hidden_states * self.weight
-                if self.bias is not None:
-                    hidden_states = hidden_states + self.bias
-            else:
-                hidden_states = hidden_states.to(input_dtype)
+            hidden_states = nn.functional.rms_norm(
+                hidden_states, normalized_shape=(hidden_states.shape[-1],), weight=self.weight, eps=self.eps
+            )
+            if self.bias is not None:
+                hidden_states = hidden_states + self.bias
 
         return hidden_states
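The manual float32 upcast, variance computation, and rescale are replaced by PyTorch's fused nn.functional.rms_norm. As a rough sanity check (not part of the commit; assumes torch >= 2.4, where torch.nn.functional.rms_norm is available), the two paths should agree within bfloat16 rounding:

import torch
import torch.nn.functional as F

x = torch.randn(2, 8, 64, dtype=torch.bfloat16)
weight = torch.randn(64, dtype=torch.bfloat16)
eps = 1e-5

# previous path: upcast to float32, normalize by the root mean square, rescale by weight
variance = x.to(torch.float32).pow(2).mean(-1, keepdim=True)
manual = (x * torch.rsqrt(variance + eps)).to(weight.dtype) * weight

# new path: fused kernel
fused = F.rms_norm(x, normalized_shape=(x.shape[-1],), weight=weight, eps=eps)

# loose tolerances to allow for bfloat16 rounding differences between the two paths
print(torch.allclose(manual.float(), fused.float(), atol=1e-2, rtol=1e-2))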

src/diffusers/models/transformers/transformer_lumina2.py

Lines changed: 7 additions & 5 deletions
@@ -130,7 +130,6 @@ def __call__(
         # scaled_dot_product_attention expects attention_mask shape to be
         # (batch, heads, source_length, target_length)
         attention_mask = attention_mask.bool().view(batch_size, 1, 1, -1)
-        attention_mask = attention_mask.expand(-1, attn.heads, sequence_length, -1)
 
         query = query.transpose(1, 2)
         key = key.transpose(1, 2)

@@ -493,10 +492,12 @@ def forward(
 
         # 2. Context & noise refinement
         for layer in self.context_refiner:
-            encoder_hidden_states = layer(encoder_hidden_states, attention_mask, encoder_rotary_emb)
+            # NOTE: mask not used for performance
+            encoder_hidden_states = layer(encoder_hidden_states, None, encoder_rotary_emb)
 
         for layer in self.noise_refiner:
-            hidden_states = layer(hidden_states, hidden_mask, hidden_rotary_emb, temb)
+            # NOTE: mask not used for performance
+            hidden_states = layer(hidden_states, None, hidden_rotary_emb, temb)
 
         # 3. Attention mask preparation
         mask = hidden_states.new_zeros(batch_size, max_seq_len, dtype=torch.bool)

@@ -511,10 +512,11 @@ def forward(
 
         # 4. Transformer blocks
         for layer in self.layers:
+            # NOTE: mask not used for performance
             if torch.is_grad_enabled() and self.gradient_checkpointing:
-                hidden_states = self._gradient_checkpointing_func(layer, hidden_states, mask, joint_rotary_emb, temb)
+                hidden_states = self._gradient_checkpointing_func(layer, hidden_states, None, joint_rotary_emb, temb)
             else:
-                hidden_states = layer(hidden_states, mask, joint_rotary_emb, temb)
+                hidden_states = layer(hidden_states, None, joint_rotary_emb, temb)
 
         # 5. Output norm & projection & unpatchify
         hidden_states = self.norm_out(hidden_states, temb)
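The dropped expand relied on the fact that scaled_dot_product_attention broadcasts a (batch, 1, 1, key_len) mask across the head and query dimensions, so a full (batch, heads, query_len, key_len) mask never needs to be materialized; the refiner and transformer blocks additionally pass None to skip masking entirely. A minimal sketch (not part of the commit) illustrating the broadcast equivalence:

import torch
import torch.nn.functional as F

batch_size, heads, seq_len, head_dim = 2, 4, 16, 32
query = torch.randn(batch_size, heads, seq_len, head_dim)
key = torch.randn(batch_size, heads, seq_len, head_dim)
value = torch.randn(batch_size, heads, seq_len, head_dim)

# boolean key-padding mask, True = attend; mask out the last 3 positions
attention_mask = torch.ones(batch_size, seq_len, dtype=torch.bool)
attention_mask[:, -3:] = False

# (batch, 1, 1, key_len) broadcasts against (batch, heads, query_len, key_len)
# inside scaled_dot_product_attention, so the explicit expand is redundant
mask_4d = attention_mask.view(batch_size, 1, 1, -1)
out_broadcast = F.scaled_dot_product_attention(query, key, value, attn_mask=mask_4d)
out_expanded = F.scaled_dot_product_attention(
    query, key, value, attn_mask=mask_4d.expand(-1, heads, seq_len, -1)
)
print(torch.allclose(out_broadcast, out_expanded))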

src/diffusers/pipelines/lumina2/pipeline_lumina2.py

Lines changed: 12 additions & 18 deletions
@@ -658,9 +658,6 @@ def __call__(
             max_sequence_length=max_sequence_length,
             system_prompt=system_prompt,
         )
-        if do_classifier_free_guidance:
-            prompt_embeds = torch.cat([prompt_embeds, negative_prompt_embeds], dim=0)
-            prompt_attention_mask = torch.cat([prompt_attention_mask, negative_prompt_attention_mask], dim=0)
 
         # 4. Prepare latents.
         latent_channels = self.transformer.config.in_channels

@@ -700,22 +697,13 @@ def __call__(
             for i, t in enumerate(timesteps):
                 # compute whether apply classifier-free truncation on this timestep
                 do_classifier_free_truncation = (i + 1) / num_inference_steps > cfg_trunc_ratio
-
-                # expand the latents if we are doing classifier free guidance
-                latent_model_input = (
-                    torch.cat([latents] * 2)
-                    if do_classifier_free_guidance and not do_classifier_free_truncation
-                    else latents
-                )
-
-                current_timestep = t
-                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
-                current_timestep = current_timestep.expand(latent_model_input.shape[0])
                 # reverse the timestep since Lumina uses t=0 as the noise and t=1 as the image
-                current_timestep = 1 - current_timestep / self.scheduler.config.num_train_timesteps
+                current_timestep = 1 - t / self.scheduler.config.num_train_timesteps
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                current_timestep = current_timestep.expand(latents.shape[0])
 
-                noise_pred = self.transformer(
-                    hidden_states=latent_model_input,
+                noise_pred_cond = self.transformer(
+                    hidden_states=latents,
                     timestep=current_timestep,
                     encoder_hidden_states=prompt_embeds,
                     attention_mask=prompt_attention_mask,

@@ -724,7 +712,13 @@ def __call__(
 
                 # perform normalization-based guidance scale on a truncated timestep interval
                 if self.do_classifier_free_guidance and not do_classifier_free_truncation:
-                    noise_pred_cond, noise_pred_uncond = torch.split(noise_pred, len(noise_pred) // 2, dim=0)
+                    noise_pred_uncond = self.transformer(
+                        hidden_states=latents,
+                        timestep=current_timestep,
+                        encoder_hidden_states=negative_prompt_embeds,
+                        attention_mask=negative_prompt_attention_mask,
+                        return_dict=False,
+                    )[0]
                     noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
                     # apply normalization after classifier-free guidance
                     if cfg_normalization:
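Instead of concatenating the conditional and unconditional batches and splitting the result, the pipeline now runs the transformer twice per step. A minimal sketch of the pattern (not part of the commit; the helper name cfg_two_pass and its signature are illustrative, not diffusers API):

import torch

def cfg_two_pass(
    transformer,
    latents,
    timestep,
    prompt_embeds,
    prompt_attention_mask,
    negative_prompt_embeds,
    negative_prompt_attention_mask,
    guidance_scale,
):
    # first pass: conditional prediction
    noise_pred_cond = transformer(
        hidden_states=latents,
        timestep=timestep,
        encoder_hidden_states=prompt_embeds,
        attention_mask=prompt_attention_mask,
        return_dict=False,
    )[0]
    # second pass: unconditional prediction (previously both were batched into one call)
    noise_pred_uncond = transformer(
        hidden_states=latents,
        timestep=timestep,
        encoder_hidden_states=negative_prompt_embeds,
        attention_mask=negative_prompt_attention_mask,
        return_dict=False,
    )[0]
    # classifier-free guidance combination, as in the diff above
    return noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)

Per the commit title, the apparent trade-off is two sequential forward passes instead of one doubled batch, which avoids padding the positive and negative prompts into a shared batch and expanding the attention mask.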
