Merge branch 'main' into low-cpu-mem-usage-lora

sayakpaul · web-flow · commit 8f5b0e1dbe5c · 2024-10-09T10:39:45.000+05:30
diff --git a/src/diffusers/models/transformers/transformer_flux.py b/src/diffusers/models/transformers/transformer_flux.py
@@ -83,14 +83,16 @@ def forward(
         hidden_states: torch.FloatTensor,
         temb: torch.FloatTensor,
         image_rotary_emb=None,
+        joint_attention_kwargs=None,
     ):
         residual = hidden_states
         norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
         mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
-
+        joint_attention_kwargs = joint_attention_kwargs or {}
         attn_output = self.attn(
             hidden_states=norm_hidden_states,
             image_rotary_emb=image_rotary_emb,
+            **joint_attention_kwargs,
         )
 
         hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
@@ -161,18 +163,20 @@ def forward(
         encoder_hidden_states: torch.FloatTensor,
         temb: torch.FloatTensor,
         image_rotary_emb=None,
+        joint_attention_kwargs=None,
     ):
         norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
 
         norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
             encoder_hidden_states, emb=temb
         )
-
+        joint_attention_kwargs = joint_attention_kwargs or {}
         # Attention.
         attn_output, context_attn_output = self.attn(
             hidden_states=norm_hidden_states,
             encoder_hidden_states=norm_encoder_hidden_states,
             image_rotary_emb=image_rotary_emb,
+            **joint_attention_kwargs,
         )
 
         # Process attention outputs for the `hidden_states`.
@@ -497,6 +501,7 @@ def custom_forward(*inputs):
                     encoder_hidden_states=encoder_hidden_states,
                     temb=temb,
                     image_rotary_emb=image_rotary_emb,
+                    joint_attention_kwargs=joint_attention_kwargs,
                 )
 
             # controlnet residual
@@ -533,6 +538,7 @@ def custom_forward(*inputs):
                     hidden_states=hidden_states,
                     temb=temb,
                     image_rotary_emb=image_rotary_emb,
+                    joint_attention_kwargs=joint_attention_kwargs,
                 )
 
             # controlnet residual
diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_output.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_output.py
@@ -9,16 +9,17 @@
 
 @dataclass
 class IFPipelineOutput(BaseOutput):
-    """
-    Args:
+    r"""
     Output class for Stable Diffusion pipelines.
-        images (`List[PIL.Image.Image]` or `np.ndarray`)
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`):
             List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
             num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
-        nsfw_detected (`List[bool]`)
+        nsfw_detected (`List[bool]`):
             List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content or a watermark. `None` if safety checking could not be performed.
-        watermark_detected (`List[bool]`)
+        watermark_detected (`List[bool]`):
             List of flags denoting whether the corresponding generated image likely has a watermark. `None` if safety
             checking could not be performed.
     """