Commit 8323240: Quality and style checks
1 parent 461ab73

File tree: 6 files changed, +59 -61 lines


src/diffusers/loaders/ip_adapter.py

Lines changed: 13 additions & 19 deletions
@@ -33,23 +33,19 @@


if is_transformers_available():
-    from transformers import (
-        CLIPImageProcessor,
-        CLIPVisionModelWithProjection,
-        SiglipImageProcessor,
-        SiglipVisionModel
-    )
+    from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection, SiglipImageProcessor, SiglipVisionModel

    from ..models.attention_processor import (
        AttnProcessor,
        AttnProcessor2_0,
-        JointAttnProcessor2_0,
        IPAdapterAttnProcessor,
        IPAdapterAttnProcessor2_0,
-        IPAdapterXFormersAttnProcessor,
        IPAdapterJointAttnProcessor2_0,
+        IPAdapterXFormersAttnProcessor,
+        JointAttnProcessor2_0,
    )

+
logger = logging.get_logger(__name__)


@@ -495,8 +491,10 @@ def load_ip_adapter(
            )

            self.register_modules(
-                feature_extractor = SiglipImageProcessor.from_pretrained(**args).to(self.device, dtype=self.dtype),
-                image_encoder = SiglipVisionModel.from_pretrained(**args).to(self.device, dtype=self.dtype),
+                feature_extractor=SiglipImageProcessor.from_pretrained(**args).to(
+                    self.device, dtype=self.dtype
+                ),
+                image_encoder=SiglipVisionModel.from_pretrained(**args).to(self.device, dtype=self.dtype),
            )
        else:
            raise ValueError(
@@ -513,9 +511,9 @@ def load_ip_adapter(

    def set_ip_adapter_scale(self, scale: float):
        """
-        Controls image/text prompt conditioning. A value of 1.0 means the model is only conditioned on the image prompt, and 0.0
-        only conditioned by the text prompt. Lowering this value encourages the model to produce more diverse images, but they
-        may not be as aligned with the image prompt.
+        Controls image/text prompt conditioning. A value of 1.0 means the model is only conditioned on the image
+        prompt, and 0.0 only conditioned by the text prompt. Lowering this value encourages the model to produce more
+        diverse images, but they may not be as aligned with the image prompt.

        Example:

@@ -556,11 +554,7 @@ def unload_ip_adapter(self):

        # Restore original attention processors layers
        attn_procs = {
-            name: (
-                JointAttnProcessor2_0()
-                if isinstance(value, IPAdapterJointAttnProcessor2_0)
-                else value.__class__()
-            )
+            name: (JointAttnProcessor2_0() if isinstance(value, IPAdapterJointAttnProcessor2_0) else value.__class__())
            for name, value in self.transformer.attn_processors.items()
        }
-        self.transformer.set_attn_processor(attn_procs)
+        self.transformer.set_attn_processor(attn_procs)
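
For context, set_ip_adapter_scale is the knob the reworded docstring describes. A minimal usage sketch, assuming an SD3 pipeline with IP-Adapter weights loaded through load_ip_adapter; the checkpoint id and image path below are placeholders, not taken from this commit:

import torch
from diffusers import StableDiffusion3Pipeline
from diffusers.utils import load_image

# Hypothetical checkpoint ids/paths, shown only to illustrate the API surface
# touched by this file; substitute real ones.
pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-large", torch_dtype=torch.float16
).to("cuda")
pipe.load_ip_adapter("some-org/sd3-ip-adapter")  # placeholder repo id

# 1.0 = condition only on the image prompt, 0.0 = only on the text prompt.
pipe.set_ip_adapter_scale(0.6)

image = pipe(
    prompt="a cat wearing a spacesuit",
    ip_adapter_image=load_image("reference.png"),  # placeholder path
).images[0]

# Drop the adapter and restore the original attention processors.
pipe.unload_ip_adapter()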

src/diffusers/models/attention.py

Lines changed: 8 additions & 4 deletions
@@ -188,8 +188,11 @@ def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
        self._chunk_dim = dim

    def forward(
-        self, hidden_states: torch.FloatTensor, encoder_hidden_states: torch.FloatTensor, temb: torch.FloatTensor,
-        joint_attention_kwargs: Dict[str, Any] = {}
+        self,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor,
+        temb: torch.FloatTensor,
+        joint_attention_kwargs: Dict[str, Any] = {},
    ):
        if self.use_dual_attention:
            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp, norm_hidden_states2, gate_msa2 = self.norm1(
@@ -207,8 +210,9 @@ def forward(

        # Attention.
        attn_output, context_attn_output = self.attn(
-            hidden_states=norm_hidden_states, encoder_hidden_states=norm_encoder_hidden_states,
-            **joint_attention_kwargs
+            hidden_states=norm_hidden_states,
+            encoder_hidden_states=norm_encoder_hidden_states,
+            **joint_attention_kwargs,
        )

        # Process attention outputs for the `hidden_states`.

src/diffusers/models/attention_processor.py

Lines changed: 8 additions & 7 deletions
@@ -5047,7 +5047,7 @@ def __call__(
        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states
-
+

class IPAdapterJointAttnProcessor2_0(torch.nn.Module):
    """Attention processor for IP-Adapter used typically in processing the SD3-like self-attention projections."""
@@ -5058,15 +5058,14 @@ def __init__(
        ip_hidden_states_dim: int,
        head_dim: int,
        timesteps_emb_dim: int = 1280,
-        scale: float = 0.5
+        scale: float = 0.5,
    ):
        super().__init__()

        # To prevent circular import
-        from .normalization import RMSNorm, AdaLayerNorm
+        from .normalization import AdaLayerNorm, RMSNorm

-        self.norm_ip = AdaLayerNorm(timesteps_emb_dim, output_dim=ip_hidden_states_dim * 2,
-                                    norm_eps=1e-6, chunk_dim=1)
+        self.norm_ip = AdaLayerNorm(timesteps_emb_dim, output_dim=ip_hidden_states_dim * 2, norm_eps=1e-6, chunk_dim=1)
        self.to_k_ip = nn.Linear(ip_hidden_states_dim, hidden_size, bias=False)
        self.to_v_ip = nn.Linear(ip_hidden_states_dim, hidden_size, bias=False)
        self.norm_q = RMSNorm(head_dim, 1e-6)
@@ -5081,7 +5080,7 @@ def __call__(
        encoder_hidden_states: torch.FloatTensor = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        ip_hidden_states: torch.FloatTensor = None,
-        temb: torch.FloatTensor = None
+        temb: torch.FloatTensor = None,
    ) -> torch.FloatTensor:
        residual = hidden_states

@@ -5170,7 +5169,9 @@ def __call__(
        img_key = torch.cat([img_key, ip_key], dim=2)
        img_value = torch.cat([img_value, ip_value], dim=2)

-        ip_hidden_states = F.scaled_dot_product_attention(img_query, img_key, img_value, dropout_p=0.0, is_causal=False)
+        ip_hidden_states = F.scaled_dot_product_attention(
+            img_query, img_key, img_value, dropout_p=0.0, is_causal=False
+        )
        ip_hidden_states = ip_hidden_states.transpose(1, 2).view(batch_size, -1, attn.heads * head_dim)
        ip_hidden_states = ip_hidden_states.to(img_query.dtype)
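
The last hunk above is the core of the IP-Adapter joint attention: extra image-prompt key/value tokens are concatenated onto the sequence before scaled dot-product attention. A self-contained sketch of that pattern with toy shapes (tensor names and sizes are illustrative, not taken from the commit):

import torch
import torch.nn.functional as F

batch, heads, seq_len, ip_len, head_dim = 2, 8, 16, 4, 64

# Queries come from the image tokens; the IP-Adapter contributes extra
# key/value tokens projected from the image-prompt embeddings.
img_query = torch.randn(batch, heads, seq_len, head_dim)
img_key = torch.randn(batch, heads, seq_len, head_dim)
img_value = torch.randn(batch, heads, seq_len, head_dim)
ip_key = torch.randn(batch, heads, ip_len, head_dim)
ip_value = torch.randn(batch, heads, ip_len, head_dim)

# Concatenate along the sequence dimension (dim=2), as in the hunk above.
img_key = torch.cat([img_key, ip_key], dim=2)
img_value = torch.cat([img_value, ip_value], dim=2)

ip_hidden_states = F.scaled_dot_product_attention(
    img_query, img_key, img_value, dropout_p=0.0, is_causal=False
)
# Back to (batch, seq_len, heads * head_dim) before the output projection.
ip_hidden_states = ip_hidden_states.transpose(1, 2).reshape(batch, -1, heads * head_dim)
print(ip_hidden_states.shape)  # torch.Size([2, 16, 512])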

src/diffusers/models/embeddings.py

Lines changed: 9 additions & 11 deletions
@@ -2115,7 +2115,7 @@ def __init__(
    ) -> None:
        super().__init__()

-        self.scale = dim_head ** -0.5
+        self.scale = dim_head**-0.5
        self.dim_head = dim_head
        self.heads = heads
        inner_dim = dim_head * heads
@@ -2135,6 +2135,7 @@ def forward(self, x, latents, shift=None, scale=None):
            latent (torch.Tensor): latent features
                shape (b, n2, D)
        """
+
        def reshape_tensor(x, heads):
            bs, length, _ = x.shape
            # (bs, length, width) --> (bs, length, n_heads, dim_per_head)
@@ -2169,7 +2170,7 @@ def reshape_tensor(x, heads):
        out = out.permute(0, 2, 1, 3).reshape(b, l, -1)

        return self.to_out(out)
-
+

# Modified from https://github.com/mlfoundations/open_flamingo/blob/main/open_flamingo/src/helpers.py
class TimePerceiverResampler(nn.Module):
@@ -2188,12 +2189,12 @@ def __init__(
        timestep_freq_shift: int = 0,
    ) -> None:
        super().__init__()
-
-        self.latents = nn.Parameter(torch.randn(1, num_queries, hidden_dim) / hidden_dim ** 0.5)
+
+        self.latents = nn.Parameter(torch.randn(1, num_queries, hidden_dim) / hidden_dim**0.5)
        self.proj_in = nn.Linear(embed_dim, hidden_dim)
        self.proj_out = nn.Linear(hidden_dim, output_dim)
        self.norm_out = nn.LayerNorm(output_dim)
-
+
        ff_inner_dim = int(hidden_dim * ffn_ratio)
        self.layers = nn.ModuleList([])
        for _ in range(depth):
@@ -2210,10 +2211,7 @@ def __init__(
                            nn.Linear(ff_inner_dim, hidden_dim, bias=False),
                        ),
                        # adaLN
-                        nn.Sequential(
-                            nn.SiLU(),
-                            nn.Linear(hidden_dim, ff_inner_dim, bias=True)
-                        )
+                        nn.Sequential(nn.SiLU(), nn.Linear(hidden_dim, ff_inner_dim, bias=True)),
                    ]
                )
            )
@@ -2227,7 +2225,7 @@ def forward(self, x, timestep, need_temb=False):
        timestep_emb = self.time_embedding(timestep_emb, None)

        latents = self.latents.repeat(x.size(0), 1, 1)
-
+
        x = self.proj_in(x)
        x = x + timestep_emb[:, None]

@@ -2242,7 +2240,7 @@ def forward(self, x, timestep, need_temb=False):
            if idx_ff == 0 and isinstance(layer_ff, nn.LayerNorm):  # adaLN
                latents = latents * (1 + scale_mlp.unsqueeze(1)) + shift_mlp.unsqueeze(1)
            latents = latents + res
-
+
        latents = self.proj_out(latents)
        latents = self.norm_out(latents)
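
The TimePerceiverResampler feed-forward path above applies adaLN-style modulation: a timestep embedding is projected to per-channel shift/scale values that rescale the normalized latents. A toy sketch of just that modulation step, with made-up dimensions; the projection width and variable names here are illustrative, not the module's real attributes:

import torch
import torch.nn as nn

hidden_dim, num_latents, batch = 64, 8, 2

# adaLN-style projection: timestep embedding -> (shift, scale). The module in
# the diff maps hidden_dim -> ff_inner_dim; 2 * hidden_dim is used here only
# so the result splits cleanly into shift and scale.
adaln = nn.Sequential(nn.SiLU(), nn.Linear(hidden_dim, 2 * hidden_dim, bias=True))

timestep_emb = torch.randn(batch, hidden_dim)
shift_mlp, scale_mlp = adaln(timestep_emb).chunk(2, dim=-1)

latents = torch.randn(batch, num_latents, hidden_dim)
norm = nn.LayerNorm(hidden_dim)

# Mirrors: latents * (1 + scale_mlp.unsqueeze(1)) + shift_mlp.unsqueeze(1)
latents = norm(latents) * (1 + scale_mlp.unsqueeze(1)) + shift_mlp.unsqueeze(1)
print(latents.shape)  # torch.Size([2, 8, 64])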

src/diffusers/models/transformers/transformer_sd3.py

Lines changed: 5 additions & 3 deletions
@@ -24,8 +24,8 @@
    Attention,
    AttentionProcessor,
    FusedJointAttnProcessor2_0,
-    JointAttnProcessor2_0,
    IPAdapterJointAttnProcessor2_0,
+    JointAttnProcessor2_0,
)
from ...models.modeling_utils import ModelMixin, load_model_dict_into_meta
from ...models.normalization import AdaLayerNormContinuous, AdaLayerNormZero
@@ -376,7 +376,7 @@ def _load_ip_adapter_weights(self, state_dict: Dict, low_cpu_mem_usage: bool):
            hidden_dim=hidden_dim,
            heads=heads,
            num_queries=num_queries,
-            timestep_in_dim=timestep_in_dim
+            timestep_in_dim=timestep_in_dim,
        ).to(device=self.device, dtype=self.dtype)

        if not low_cpu_mem_usage:
@@ -470,7 +470,9 @@ def custom_forward(*inputs):
                )
            elif not is_skip:
                encoder_hidden_states, hidden_states = block(
-                    hidden_states=hidden_states, encoder_hidden_states=encoder_hidden_states, temb=temb,
+                    hidden_states=hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    temb=temb,
                    joint_attention_kwargs=joint_attention_kwargs,
                )

src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py

Lines changed: 16 additions & 17 deletions
@@ -17,16 +17,16 @@

import torch
from transformers import (
+    BaseImageProcessor,
    CLIPTextModelWithProjection,
    CLIPTokenizer,
+    PreTrainedModel,
    T5EncoderModel,
    T5TokenizerFast,
-    PreTrainedModel,
-    BaseImageProcessor,
)

-from ...image_processor import VaeImageProcessor, PipelineImageInput
-from ...loaders import FromSingleFileMixin, SD3LoraLoaderMixin, SD3IPAdapterMixin
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FromSingleFileMixin, SD3IPAdapterMixin, SD3LoraLoaderMixin
from ...models.autoencoders import AutoencoderKL
from ...models.transformers import SD3Transformer2DModel
from ...schedulers import FlowMatchEulerDiscreteScheduler
@@ -184,7 +184,7 @@ def __init__(
        text_encoder_3: T5EncoderModel,
        tokenizer_3: T5TokenizerFast,
        image_encoder: PreTrainedModel = None,
-        feature_extractor: BaseImageProcessor = None
+        feature_extractor: BaseImageProcessor = None,
    ):
        super().__init__()

@@ -199,7 +199,7 @@ def __init__(
            transformer=transformer,
            scheduler=scheduler,
            image_encoder=image_encoder,
-            feature_extractor=feature_extractor
+            feature_extractor=feature_extractor,
        )
        self.vae_scale_factor = (
            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
@@ -678,7 +678,7 @@ def num_timesteps(self):
    @property
    def interrupt(self):
        return self._interrupt
-
+
    # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_image
    def encode_image(self, image):
        if not isinstance(image, torch.Tensor):
@@ -687,16 +687,18 @@ def encode_image(self, image):
        image = image.to(device=self.device, dtype=self.dtype)

        image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
-        uncond_image_enc_hidden_states = self.image_encoder(torch.zeros_like(image), output_hidden_states=True).hidden_states[-2]
-
+        uncond_image_enc_hidden_states = self.image_encoder(
+            torch.zeros_like(image), output_hidden_states=True
+        ).hidden_states[-2]
+
        return image_enc_hidden_states, uncond_image_enc_hidden_states

    # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.prepare_ip_adapter_image_embeds
    def prepare_ip_adapter_image_embeds(
        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
    ):
        if ip_adapter_image_embeds is None:
-            single_image_embeds, single_negative_image_embeds = self.encode_image(ip_adapter_image)
+            single_image_embeds, single_negative_image_embeds = self.encode_image(ip_adapter_image)
        else:
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
@@ -705,13 +707,13 @@ def prepare_ip_adapter_image_embeds(
                single_image_embeds = ip_adapter_image_embeds

        single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
-
+
        if do_classifier_free_guidance:
            single_negative_image_embeds = torch.cat([single_negative_image_embeds] * num_images_per_prompt, dim=0)
            single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)

        return single_image_embeds.to(device=device)
-
+
    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
@@ -979,15 +981,12 @@ def __call__(
                    need_temb=True,
                )

-                image_prompt_embeds = dict(
-                    ip_hidden_states=ip_hidden_states,
-                    temb=temb
-                )
+                image_prompt_embeds = {"ip_hidden_states": ip_hidden_states, "temb": temb}

                if self.joint_attention_kwargs is None:
                    self._joint_attention_kwargs = image_prompt_embeds
                else:
-                    self._joint_attention_kwargs.update(**image_prompt_embeds)
+                    self._joint_attention_kwargs.update(**image_prompt_embeds)

                noise_pred = self.transformer(
                    hidden_states=latent_model_input,
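
prepare_ip_adapter_image_embeds above doubles the image embeddings for classifier-free guidance by stacking the unconditional (zero-image) and conditional embeddings along the batch dimension. A small sketch of that pattern in isolation, with illustrative tensor shapes:

import torch

num_images_per_prompt, seq_len, dim = 2, 4, 8

# Conditional embeddings from the reference image and unconditional ones from
# a zeroed image, as encode_image() returns in the diff above.
single_image_embeds = torch.randn(1, seq_len, dim)
single_negative_image_embeds = torch.randn(1, seq_len, dim)

# Repeat per generated image, then stack [negative, positive] so one forward
# pass of the transformer covers both guidance branches.
single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
single_negative_image_embeds = torch.cat([single_negative_image_embeds] * num_images_per_prompt, dim=0)
cfg_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)

print(cfg_image_embeds.shape)  # torch.Size([4, 4, 8])

# Downstream, the noise prediction is split back into the two branches, e.g.:
# noise_uncond, noise_cond = noise_pred.chunk(2)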
