
Commit 7c5a46f

use_mask_in_transformer, is_torch_version

hlkyyiyixuxu authored and committed
1 parent c1b0f3d

File tree

3 files changed (+31, -5 lines)


src/diffusers/models/normalization.py

Lines changed: 15 additions & 1 deletion
@@ -514,7 +514,7 @@ def forward(self, hidden_states):
             hidden_states = torch_npu.npu_rms_norm(hidden_states, self.weight, epsilon=self.eps)[0]
             if self.bias is not None:
                 hidden_states = hidden_states + self.bias
-        else:
+        elif is_torch_version(">=", "2.4"):
             if self.weight is not None:
                 # convert into half-precision if necessary
                 if self.weight.dtype in [torch.float16, torch.bfloat16]:
@@ -524,6 +524,20 @@ def forward(self, hidden_states):
             )
             if self.bias is not None:
                 hidden_states = hidden_states + self.bias
+        else:
+            input_dtype = hidden_states.dtype
+            variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
+            hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
+
+            if self.weight is not None:
+                # convert into half-precision if necessary
+                if self.weight.dtype in [torch.float16, torch.bfloat16]:
+                    hidden_states = hidden_states.to(self.weight.dtype)
+                hidden_states = hidden_states * self.weight
+                if self.bias is not None:
+                    hidden_states = hidden_states + self.bias
+            else:
+                hidden_states = hidden_states.to(input_dtype)

         return hidden_states
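The new `else:` branch is a manual RMS-norm fallback for torch versions older than 2.4, where `torch.nn.functional.rms_norm` is not available. A minimal standalone sketch of the two paths (not the diffusers module itself; the function name, shapes, and eps value are made up for illustration):

import torch
from diffusers.utils import is_torch_version


def rms_norm_fallback(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    # Manual RMS norm: compute the variance in float32, then rescale.
    variance = x.to(torch.float32).pow(2).mean(-1, keepdim=True)
    x = x * torch.rsqrt(variance + eps)
    # Cast back to half precision before applying the learned scale, as in the diff above.
    if weight.dtype in (torch.float16, torch.bfloat16):
        x = x.to(weight.dtype)
    return x * weight


x = torch.randn(2, 8, 16)
weight = torch.ones(16)

if is_torch_version(">=", "2.4"):
    # Fused path used on recent torch versions.
    fused = torch.nn.functional.rms_norm(x, normalized_shape=(16,), weight=weight, eps=1e-5)
    print(torch.allclose(rms_norm_fallback(x, weight), fused, atol=1e-5))
else:
    print(rms_norm_fallback(x, weight).shape)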

src/diffusers/models/transformers/transformer_lumina2.py

Lines changed: 11 additions & 4 deletions
@@ -469,6 +469,7 @@ def forward(
         timestep: torch.Tensor,
         encoder_hidden_states: torch.Tensor,
         attention_mask: torch.Tensor,
+        use_mask_in_transformer: bool = True,
         return_dict: bool = True,
     ) -> Union[torch.Tensor, Transformer2DModelOutput]:
         batch_size = hidden_states.size(0)
@@ -493,11 +494,15 @@ def forward(
         # 2. Context & noise refinement
         for layer in self.context_refiner:
             # NOTE: mask not used for performance
-            encoder_hidden_states = layer(encoder_hidden_states, None, encoder_rotary_emb)
+            encoder_hidden_states = layer(
+                encoder_hidden_states, attention_mask if use_mask_in_transformer else None, encoder_rotary_emb
+            )

         for layer in self.noise_refiner:
             # NOTE: mask not used for performance
-            hidden_states = layer(hidden_states, None, hidden_rotary_emb, temb)
+            hidden_states = layer(
+                hidden_states, hidden_mask if use_mask_in_transformer else None, hidden_rotary_emb, temb
+            )

         # 3. Attention mask preparation
         mask = hidden_states.new_zeros(batch_size, max_seq_len, dtype=torch.bool)
@@ -514,9 +519,11 @@ def forward(
         for layer in self.layers:
             # NOTE: mask not used for performance
             if torch.is_grad_enabled() and self.gradient_checkpointing:
-                hidden_states = self._gradient_checkpointing_func(layer, hidden_states, None, joint_rotary_emb, temb)
+                hidden_states = self._gradient_checkpointing_func(
+                    layer, hidden_states, mask if use_mask_in_transformer else None, joint_rotary_emb, temb
+                )
             else:
-                hidden_states = layer(hidden_states, None, joint_rotary_emb, temb)
+                hidden_states = layer(hidden_states, mask if use_mask_in_transformer else None, joint_rotary_emb, temb)

         # 5. Output norm & projection & unpatchify
         hidden_states = self.norm_out(hidden_states, temb)
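For context on the "# NOTE: mask not used for performance" comments: when no token is padded, passing attn_mask=None lets scaled_dot_product_attention pick a fused kernel, while an all-True boolean mask yields the same output but may force a slower fallback. A small illustration of that equivalence, not taken from this commit; shapes are arbitrary:

import torch
import torch.nn.functional as F

# Query/key/value with no padding anywhere: (batch, heads, seq, head_dim).
q = k = v = torch.randn(1, 8, 128, 64)
# An all-True boolean mask means "attend to every position".
mask = torch.ones(1, 1, 128, 128, dtype=torch.bool)

out_masked = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
out_unmasked = F.scaled_dot_product_attention(q, k, v, attn_mask=None)

# Same result either way; passing None is roughly what use_mask_in_transformer=False
# selects inside the transformer blocks.
print(torch.allclose(out_masked, out_unmasked, atol=1e-5))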

src/diffusers/pipelines/lumina2/pipeline_lumina2.py

Lines changed: 5 additions & 0 deletions
@@ -525,6 +525,7 @@ def __call__(
         system_prompt: Optional[str] = None,
         cfg_trunc_ratio: float = 1.0,
         cfg_normalization: bool = True,
+        use_mask_in_transformer: bool = True,
         max_sequence_length: int = 256,
     ) -> Union[ImagePipelineOutput, Tuple]:
         """
@@ -596,6 +597,8 @@ def __call__(
                 The ratio of the timestep interval to apply normalization-based guidance scale.
             cfg_normalization (`bool`, *optional*, defaults to `True`):
                 Whether to apply normalization-based guidance scale.
+            use_mask_in_transformer (`bool`, *optional*, defaults to `True`):
+                Whether to use attention mask in `Lumina2Transformer2DModel`. Set `False` for performance gain.
             max_sequence_length (`int`, defaults to `256`):
                 Maximum sequence length to use with the `prompt`.
@@ -707,6 +710,7 @@ def __call__(
                     timestep=current_timestep,
                     encoder_hidden_states=prompt_embeds,
                     attention_mask=prompt_attention_mask,
+                    use_mask_in_transformer=use_mask_in_transformer,
                     return_dict=False,
                 )[0]

@@ -717,6 +721,7 @@ def __call__(
                     timestep=current_timestep,
                     encoder_hidden_states=negative_prompt_embeds,
                     attention_mask=negative_prompt_attention_mask,
+                    use_mask_in_transformer=use_mask_in_transformer,
                     return_dict=False,
                 )[0]
             noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
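A minimal usage sketch of the new pipeline argument. The class name Lumina2Pipeline and the Alpha-VLLM/Lumina-Image-2.0 checkpoint id are assumptions for illustration and are not taken from this diff:

import torch
from diffusers import Lumina2Pipeline  # pipeline class name assumed

pipe = Lumina2Pipeline.from_pretrained(
    "Alpha-VLLM/Lumina-Image-2.0",  # checkpoint id assumed for illustration
    torch_dtype=torch.bfloat16,
)
pipe.to("cuda")

image = pipe(
    prompt="a tiny astronaut hatching from an egg on the moon",
    use_mask_in_transformer=False,  # skip the attention mask in the transformer for speed
).images[0]
image.save("lumina2.png")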
