diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index c720b379551f..c99133f257a5 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -674,7 +674,7 @@ def forward( encoder_hidden_states: torch.FloatTensor, temb: torch.FloatTensor, joint_attention_kwargs: Optional[Dict[str, Any]] = None, - ): + ) -> Tuple[torch.Tensor, torch.Tensor]: joint_attention_kwargs = joint_attention_kwargs or {} if self.use_dual_attention: norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp, norm_hidden_states2, gate_msa2 = self.norm1( diff --git a/src/diffusers/models/transformers/auraflow_transformer_2d.py b/src/diffusers/models/transformers/auraflow_transformer_2d.py index a8d275d14214..26d80add05dc 100644 --- a/src/diffusers/models/transformers/auraflow_transformer_2d.py +++ b/src/diffusers/models/transformers/auraflow_transformer_2d.py @@ -173,7 +173,7 @@ def forward( hidden_states: torch.FloatTensor, temb: torch.FloatTensor, attention_kwargs: Optional[Dict[str, Any]] = None, - ): + ) -> torch.Tensor: residual = hidden_states attention_kwargs = attention_kwargs or {} diff --git a/src/diffusers/models/transformers/cogvideox_transformer_3d.py b/src/diffusers/models/transformers/cogvideox_transformer_3d.py index a8c98bccb86c..ebe230380d42 100644 --- a/src/diffusers/models/transformers/cogvideox_transformer_3d.py +++ b/src/diffusers/models/transformers/cogvideox_transformer_3d.py @@ -122,7 +122,7 @@ def forward( temb: torch.Tensor, image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, attention_kwargs: Optional[Dict[str, Any]] = None, - ) -> torch.Tensor: + ) -> Tuple[torch.Tensor, torch.Tensor]: text_seq_length = encoder_hidden_states.size(1) attention_kwargs = attention_kwargs or {} diff --git a/src/diffusers/models/transformers/consisid_transformer_3d.py b/src/diffusers/models/transformers/consisid_transformer_3d.py index 41632dbd4751..ec5a7844885d 100644 --- a/src/diffusers/models/transformers/consisid_transformer_3d.py +++ b/src/diffusers/models/transformers/consisid_transformer_3d.py @@ -315,7 +315,7 @@ def forward( encoder_hidden_states: torch.Tensor, temb: torch.Tensor, image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, - ) -> torch.Tensor: + ) -> Tuple[torch.Tensor, torch.Tensor]: text_seq_length = encoder_hidden_states.size(1) # norm & modulate diff --git a/src/diffusers/models/transformers/lumina_nextdit2d.py b/src/diffusers/models/transformers/lumina_nextdit2d.py index 84b1175386b0..0c79a57a0395 100644 --- a/src/diffusers/models/transformers/lumina_nextdit2d.py +++ b/src/diffusers/models/transformers/lumina_nextdit2d.py @@ -124,7 +124,7 @@ def forward( encoder_mask: torch.Tensor, temb: torch.Tensor, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - ): + ) -> torch.Tensor: """ Perform a forward pass through the LuminaNextDiTBlock. diff --git a/src/diffusers/models/transformers/transformer_bria.py b/src/diffusers/models/transformers/transformer_bria.py index 27a9941501a1..a2fcf81f7ac8 100644 --- a/src/diffusers/models/transformers/transformer_bria.py +++ b/src/diffusers/models/transformers/transformer_bria.py @@ -472,7 +472,7 @@ def forward( temb: torch.Tensor, image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, attention_kwargs: Optional[Dict[str, Any]] = None, - ) -> torch.Tensor: + ) -> Tuple[torch.Tensor, torch.Tensor]: text_seq_len = encoder_hidden_states.shape[1] hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1) diff --git a/src/diffusers/models/transformers/transformer_cogview3plus.py b/src/diffusers/models/transformers/transformer_cogview3plus.py index 77f15f6ca6f1..81d16f5b59b6 100644 --- a/src/diffusers/models/transformers/transformer_cogview3plus.py +++ b/src/diffusers/models/transformers/transformer_cogview3plus.py @@ -13,7 +13,7 @@ # limitations under the License. -from typing import Dict, Union +from typing import Dict, Union, Tuple import torch import torch.nn as nn @@ -79,7 +79,7 @@ def forward( hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor, emb: torch.Tensor, - ) -> torch.Tensor: + ) -> Tuple[torch.Tensor, torch.Tensor]: text_seq_length = encoder_hidden_states.size(1) # norm & modulate diff --git a/src/diffusers/models/transformers/transformer_cogview4.py b/src/diffusers/models/transformers/transformer_cogview4.py index 25dcfa14cc0b..58f80a72f6ee 100644 --- a/src/diffusers/models/transformers/transformer_cogview4.py +++ b/src/diffusers/models/transformers/transformer_cogview4.py @@ -494,7 +494,7 @@ def forward( ] = None, attention_mask: Optional[Dict[str, torch.Tensor]] = None, attention_kwargs: Optional[Dict[str, Any]] = None, - ) -> torch.Tensor: + ) -> Tuple[torch.Tensor, torch.Tensor]: # 1. Timestep conditioning ( norm_hidden_states, diff --git a/src/diffusers/models/transformers/transformer_hidream_image.py b/src/diffusers/models/transformers/transformer_hidream_image.py index 77902dcf5852..5578ef5ab20a 100644 --- a/src/diffusers/models/transformers/transformer_hidream_image.py +++ b/src/diffusers/models/transformers/transformer_hidream_image.py @@ -534,7 +534,7 @@ def forward( encoder_hidden_states: Optional[torch.Tensor] = None, temb: Optional[torch.Tensor] = None, image_rotary_emb: torch.Tensor = None, - ) -> torch.Tensor: + ) -> Tuple[torch.Tensor, torch.Tensor]: wtype = hidden_states.dtype ( shift_msa_i, @@ -592,7 +592,7 @@ def forward( encoder_hidden_states: Optional[torch.Tensor] = None, temb: Optional[torch.Tensor] = None, image_rotary_emb: torch.Tensor = None, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: return self.block( hidden_states=hidden_states, hidden_states_masks=hidden_states_masks, diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video.py b/src/diffusers/models/transformers/transformer_hunyuan_video.py index 6944a6c536b5..3723156805d8 100644 --- a/src/diffusers/models/transformers/transformer_hunyuan_video.py +++ b/src/diffusers/models/transformers/transformer_hunyuan_video.py @@ -684,7 +684,7 @@ def forward( image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, token_replace_emb: torch.Tensor = None, num_tokens: int = None, - ) -> torch.Tensor: + ) -> Tuple[torch.Tensor, torch.Tensor]: text_seq_length = encoder_hidden_states.shape[1] hidden_states = torch.cat([hidden_states, encoder_hidden_states], dim=1)