
Commit 68169f8

Updated docstrings and doc entries
1 parent 27fe083 commit 68169f8


7 files changed: +196 -38 lines changed


docs/source/en/api/attnprocessor.md

Lines changed: 3 additions & 0 deletions
@@ -52,3 +52,6 @@ An attention processor is a class for applying different types of attention mechanisms.
 
 ## AttnProcessorNPU
 [[autodoc]] models.attention_processor.AttnProcessorNPU
+
+## IPAdapterJointAttnProcessor2_0
+[[autodoc]] models.attention_processor.IPAdapterJointAttnProcessor2_0

docs/source/en/api/loaders/ip_adapter.md

Lines changed: 6 additions & 0 deletions
@@ -24,6 +24,12 @@ Learn how to load an IP-Adapter checkpoint and image in the IP-Adapter [loading]
 
 [[autodoc]] loaders.ip_adapter.IPAdapterMixin
 
+## SD3IPAdapterMixin
+
+[[autodoc]] loaders.ip_adapter.SD3IPAdapterMixin
+  - all
+  - is_ip_adapter_active
+
 ## IPAdapterMaskProcessor
 
 [[autodoc]] image_processor.IPAdapterMaskProcessor
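For orientation, here is a minimal usage sketch of the `SD3IPAdapterMixin` API documented above. The repository id and weight file name are placeholders, not values taken from this commit.

```python
import torch
from diffusers import StableDiffusion3Pipeline

# Placeholder repo id and weight name; substitute a real SD3 IP-Adapter checkpoint.
pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-large", torch_dtype=torch.float16
)
pipe.load_ip_adapter(
    "<ip-adapter-repo>",                    # placeholder repository id
    weight_name="ip-adapter.safetensors",   # placeholder file name
    image_encoder_folder="image_encoder",
)
pipe.set_ip_adapter_scale(0.6)
print(pipe.is_ip_adapter_active)  # True once any IP-Adapter layer has scale > 0
```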

src/diffusers/loaders/ip_adapter.py

Lines changed: 23 additions & 18 deletions
@@ -358,13 +358,13 @@ class SD3IPAdapterMixin:
 
     @property
     def is_ip_adapter_active(self) -> bool:
-        r"""Checks if any ip_adapter attention processor have scale > 0.
+        """Checks if IP-Adapter is loaded and scale > 0.
 
         IP-Adapter scale controls the influence of the image prompt versus text prompt. When this value is set to 0,
-        image is irrelevant.
+        the image context is irrelevant.
 
         Returns:
-            `bool`: True when ip_adapter is loaded and any ip_adapter layer scale > 0.
+            `bool`: True when IP-Adapter is loaded and any layer has scale > 0.
         """
         scales = [
             attn_proc.scale
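A minimal sketch of the semantics described in the updated docstring (not the exact implementation): the property is `True` only when IP-Adapter attention processors are present and at least one of them has a positive scale.

```python
def is_ip_adapter_active(transformer) -> bool:
    # Assumption for this sketch: only IP-Adapter processors expose a `scale` attribute.
    scales = [
        proc.scale
        for proc in transformer.attn_processors.values()
        if hasattr(proc, "scale")
    ]
    # False when nothing is loaded, or when every layer has scale == 0 (image prompt ignored).
    return len(scales) > 0 and any(s > 0 for s in scales)
```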
@@ -382,7 +382,7 @@ def load_ip_adapter(
         weight_name: str,
         image_encoder_folder: Optional[str] = "image_encoder",
         **kwargs,
-    ):
+    ) -> None:
         """
         Parameters:
             pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
@@ -500,19 +500,19 @@ def load_ip_adapter(
                     image_encoder_subfolder = Path(image_encoder_folder).as_posix()
 
                 # Commons args for loading image encoder and image processor
-                args = dict(
-                    pretrained_model_name_or_path_or_dict,
-                    subfolder=image_encoder_subfolder,
-                    low_cpu_mem_usage=low_cpu_mem_usage,
-                    cache_dir=cache_dir,
-                    local_files_only=local_files_only,
-                )
+                kwargs = {
+                    "low_cpu_mem_usage": low_cpu_mem_usage,
+                    "cache_dir": cache_dir,
+                    "local_files_only": local_files_only,
+                }
 
                 self.register_modules(
-                    feature_extractor=SiglipImageProcessor.from_pretrained(**args).to(
+                    feature_extractor=SiglipImageProcessor.from_pretrained(image_encoder_subfolder, **kwargs).to(
+                        self.device, dtype=self.dtype
+                    ),
+                    image_encoder=SiglipVisionModel.from_pretrained(image_encoder_subfolder, **kwargs).to(
                         self.device, dtype=self.dtype
                     ),
-                    image_encoder=SiglipVisionModel.from_pretrained(**args).to(self.device, dtype=self.dtype),
                 )
             else:
                 raise ValueError(
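The refactor above shares one dict of loading options between the Siglip image processor and vision model. A hedged stand-alone sketch of the same idea, with a placeholder repository id and subfolder:

```python
from transformers import SiglipImageProcessor, SiglipVisionModel

repo_id = "<sd3-ip-adapter-repo>"  # placeholder
common = {"cache_dir": None, "local_files_only": False}  # shared loading options

feature_extractor = SiglipImageProcessor.from_pretrained(repo_id, subfolder="image_encoder", **common)
image_encoder = SiglipVisionModel.from_pretrained(repo_id, subfolder="image_encoder", **common)
```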
@@ -527,11 +527,11 @@ def load_ip_adapter(
         # Load IP-Adapter into transformer
         self.transformer._load_ip_adapter_weights(state_dict, low_cpu_mem_usage=low_cpu_mem_usage)
 
-    def set_ip_adapter_scale(self, scale: float):
+    def set_ip_adapter_scale(self, scale: float) -> None:
         """
-        Controls image/text prompt conditioning. A value of 1.0 means the model is only conditioned on the image
-        prompt, and 0.0 only conditioned by the text prompt. Lowering this value encourages the model to produce more
-        diverse images, but they may not be as aligned with the image prompt.
+        Set IP-Adapter scale, which controls image prompt conditioning. A value of 1.0 means the model is only
+        conditioned on the image prompt, and 0.0 only conditioned by the text prompt. Lowering this value encourages
+        the model to produce more diverse images, but they may not be as aligned with the image prompt.
 
         Example:
@@ -540,12 +540,17 @@ def set_ip_adapter_scale(self, scale: float):
         >>> pipeline.set_ip_adapter_scale(0.6)
         >>> ...
         ```
+
+        Args:
+            scale (float):
+                IP-Adapter scale to be set.
+
         """
         for attn_processor in self.transformer.attn_processors.values():
             if isinstance(attn_processor, IPAdapterJointAttnProcessor2_0):
                 attn_processor.scale = scale
 
-    def unload_ip_adapter(self):
+    def unload_ip_adapter(self) -> None:
         """
         Unloads the IP Adapter weights.

src/diffusers/models/attention_processor.py

Lines changed: 38 additions & 1 deletion
@@ -5149,7 +5149,22 @@ def __call__(
 
 
 class IPAdapterJointAttnProcessor2_0(torch.nn.Module):
-    """Attention processor for IP-Adapter used typically in processing the SD3-like self-attention projections."""
+    """
+    Attention processor for IP-Adapter used typically in processing the SD3-like self-attention projections, with
+    additional image-based information and timestep embeddings.
+
+    Args:
+        hidden_size (`int`):
+            The number of hidden channels.
+        ip_hidden_states_dim (`int`):
+            The image feature dimension.
+        head_dim (`int`):
+            The number of head channels.
+        timesteps_emb_dim (`int`, defaults to 1280):
+            The number of input channels for timestep embedding.
+        scale (`float`, defaults to 0.5):
+            IP-Adapter scale.
+    """
 
     def __init__(
         self,
@@ -5181,6 +5196,28 @@ def __call__(
         ip_hidden_states: torch.FloatTensor = None,
         temb: torch.FloatTensor = None,
     ) -> torch.FloatTensor:
+        """
+        Perform the attention computation, integrating image features (if provided) and timestep embeddings.
+
+        If `ip_hidden_states` is `None`, this is equivalent to using JointAttnProcessor2_0.
+
+        Args:
+            attn (`Attention`):
+                Attention instance.
+            hidden_states (`torch.FloatTensor`):
+                Input `hidden_states`.
+            encoder_hidden_states (`torch.FloatTensor`, *optional*):
+                The encoder hidden states.
+            attention_mask (`torch.FloatTensor`, *optional*):
+                Attention mask.
+            ip_hidden_states (`torch.FloatTensor`, *optional*):
+                Image embeddings.
+            temb (`torch.FloatTensor`, *optional*):
+                Timestep embeddings.
+
+        Returns:
+            `torch.FloatTensor`: Output hidden states.
+        """
         residual = hidden_states
 
         batch_size = hidden_states.shape[0]
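A hedged sketch of constructing the processor with the arguments documented in the new class docstring; the head count and dimensions are illustrative, not values prescribed by this commit.

```python
from diffusers.models.attention_processor import IPAdapterJointAttnProcessor2_0

num_heads, head_dim = 24, 64  # illustrative SD3-like sizes
processor = IPAdapterJointAttnProcessor2_0(
    hidden_size=num_heads * head_dim,           # number of hidden channels
    ip_hidden_states_dim=num_heads * head_dim,  # image feature dimension
    head_dim=head_dim,
    timesteps_emb_dim=1280,                     # documented default
    scale=0.5,                                  # documented default
)
# The processor would then be assigned to the transformer's attention layers,
# e.g. via `transformer.set_attn_processor(...)` (wiring omitted in this sketch).
```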

src/diffusers/models/embeddings.py

Lines changed: 66 additions & 2 deletions
@@ -2119,6 +2119,19 @@ def forward(self, id_embeds: torch.Tensor) -> torch.Tensor:
 
 
 class IPAdapterTimeImageProjectionBlock(nn.Module):
+    """Block for IPAdapterTimeImageProjection.
+
+    Args:
+        hidden_dim (`int`, defaults to 1280):
+            The number of hidden channels.
+        dim_head (`int`, defaults to 64):
+            The number of head channels.
+        heads (`int`, defaults to 20):
+            Parallel attention heads.
+        ffn_ratio (`int`, defaults to 4):
+            The expansion ratio of feedforward network hidden layer channels.
+    """
+
     def __init__(
         self,
         hidden_dim: int = 1280,
@@ -2152,7 +2165,21 @@ def __init__(
         self.attn.to_k = None
         self.attn.to_v = None
 
-    def forward(self, x, latents, timestep_emb):
+    def forward(self, x: torch.Tensor, latents: torch.Tensor, timestep_emb: torch.Tensor) -> torch.Tensor:
+        """Forward pass.
+
+        Args:
+            x (`torch.Tensor`):
+                Image features.
+            latents (`torch.Tensor`):
+                Latent features.
+            timestep_emb (`torch.Tensor`):
+                Timestep embedding.
+
+        Returns:
+            `torch.Tensor`: Output latent features.
+        """
+
         # Shift and scale for AdaLayerNorm
         emb = self.adaln_proj(self.adaln_silu(timestep_emb))
         shift_msa, scale_msa, shift_mlp, scale_mlp = emb.chunk(4, dim=1)
@@ -2192,6 +2219,33 @@ def forward(self, x, latents, timestep_emb):
 
 # Modified from https://github.com/mlfoundations/open_flamingo/blob/main/open_flamingo/src/helpers.py
 class IPAdapterTimeImageProjection(nn.Module):
+    """Resampler of SD3 IP-Adapter with timestep embedding.
+
+    Args:
+        embed_dim (`int`, defaults to 1152):
+            The feature dimension.
+        output_dim (`int`, defaults to 2432):
+            The number of output channels.
+        hidden_dim (`int`, defaults to 1280):
+            The number of hidden channels.
+        depth (`int`, defaults to 4):
+            The number of blocks.
+        dim_head (`int`, defaults to 64):
+            The number of head channels.
+        heads (`int`, defaults to 20):
+            Parallel attention heads.
+        num_queries (`int`, defaults to 64):
+            The number of queries.
+        ffn_ratio (`int`, defaults to 4):
+            The expansion ratio of feedforward network hidden layer channels.
+        timestep_in_dim (`int`, defaults to 320):
+            The number of input channels for timestep embedding.
+        timestep_flip_sin_to_cos (`bool`, defaults to True):
+            Flip the timestep embedding order to `cos, sin` (if True) or `sin, cos` (if False).
+        timestep_freq_shift (`int`, defaults to 0):
+            Controls the timestep delta between frequencies between dimensions.
+    """
+
     def __init__(
         self,
         embed_dim: int = 1152,
@@ -2217,7 +2271,17 @@ def __init__(
         self.time_proj = Timesteps(timestep_in_dim, timestep_flip_sin_to_cos, timestep_freq_shift)
         self.time_embedding = TimestepEmbedding(timestep_in_dim, hidden_dim, act_fn="silu")
 
-    def forward(self, x, timestep):
+    def forward(self, x: torch.Tensor, timestep: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Forward pass.
+
+        Args:
+            x (`torch.Tensor`):
+                Image features.
+            timestep (`torch.Tensor`):
+                Timestep in denoising process.
+        Returns:
+            `Tuple`[`torch.Tensor`, `torch.Tensor`]: The pair (latents, timestep_emb).
+        """
         timestep_emb = self.time_proj(timestep).to(dtype=x.dtype)
         timestep_emb = self.time_embedding(timestep_emb)
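A hedged shape sketch of `IPAdapterTimeImageProjection.forward` using the documented defaults; the token count of the image features is an assumption and depends on the image encoder.

```python
import torch
from diffusers.models.embeddings import IPAdapterTimeImageProjection

proj = IPAdapterTimeImageProjection()        # documented defaults
image_features = torch.randn(2, 729, 1152)   # (batch, tokens, embed_dim); token count assumed
timestep = torch.tensor([500, 500])          # one timestep per batch element

latents, timestep_emb = proj(image_features, timestep)
print(latents.shape)       # expected (2, 64, 2432): (batch, num_queries, output_dim)
print(timestep_emb.shape)  # expected (2, 1280): (batch, hidden_dim)
```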

src/diffusers/models/transformers/transformer_sd3.py

Lines changed: 13 additions & 1 deletion
@@ -331,7 +331,19 @@ def _set_gradient_checkpointing(self, module, value=False):
         if hasattr(module, "gradient_checkpointing"):
             module.gradient_checkpointing = value
 
-    def _load_ip_adapter_weights(self, state_dict: Dict, low_cpu_mem_usage: bool):
+    def _load_ip_adapter_weights(self, state_dict: Dict, low_cpu_mem_usage: bool) -> None:
+        """Sets IP-Adapter attention processors, image projection, and loads state_dict.
+
+        Args:
+            state_dict (`Dict`):
+                PyTorch state dict with keys "ip_adapter", which contains parameters for attention processors, and
+                "image_proj", which contains parameters for image projection net.
+            low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
+                Speed up model loading only loading the pretrained weights and not initializing the weights. This also
+                tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
+                Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this
+                argument to `True` will raise an error.
+        """
         # IP-Adapter cross attention parameters
         hidden_size = self.config.attention_head_dim * self.config.num_attention_heads
         ip_hidden_states_dim = self.config.attention_head_dim * self.config.num_attention_heads
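A hedged sketch of the state dict layout the new docstring describes: two top-level keys, "image_proj" for the projection network and "ip_adapter" for the attention processors. The file name is a placeholder, `transformer` stands for an `SD3Transformer2DModel` instance, and the nested parameter names are not shown in this commit.

```python
import torch

raw = torch.load("ip_adapter_sd3.bin", map_location="cpu")  # placeholder file name
state_dict = {
    "image_proj": raw["image_proj"],  # parameters for the image projection net
    "ip_adapter": raw["ip_adapter"],  # parameters for the IP-Adapter attention processors
}
transformer._load_ip_adapter_weights(state_dict, low_cpu_mem_usage=True)
```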

src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py

Lines changed: 47 additions & 16 deletions
@@ -680,7 +680,16 @@ def interrupt(self):
         return self._interrupt
 
     # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_image
-    def encode_image(self, image):
+    def encode_image(self, image: PipelineImageInput) -> torch.Tensor:
+        """Encodes the given image into a feature representation using a pre-trained image encoder.
+
+        Args:
+            image (`PipelineImageInput`):
+                Input image to be encoded.
+
+        Returns:
+            `torch.Tensor`: The encoded image feature representation.
+        """
         if not isinstance(image, torch.Tensor):
             image = self.feature_extractor(image, return_tensors="pt").pixel_values
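A brief hedged usage sketch of the newly documented `encode_image`; `pipe` is assumed to be an SD3 pipeline whose `feature_extractor` and `image_encoder` were registered by `load_ip_adapter`, and the file name is a placeholder.

```python
from PIL import Image

reference = Image.open("reference.png").convert("RGB")  # placeholder path
image_embeds = pipe.encode_image(reference)             # a `torch.Tensor` of image features
```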

@@ -690,17 +699,42 @@
 
     # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.prepare_ip_adapter_image_embeds
     def prepare_ip_adapter_image_embeds(
-        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
-    ):
-        if ip_adapter_image_embeds is None:
+        self,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+    ) -> torch.Tensor:
+        """Prepares image embeddings for use in the IP-Adapter.
+
+        Either `ip_adapter_image` or `ip_adapter_image_embeds` must be passed.
+
+        Args:
+            ip_adapter_image (`PipelineImageInput`, *optional*):
+                The input image to extract features from for IP-Adapter.
+            ip_adapter_image_embeds (`torch.Tensor`, *optional*):
+                Precomputed image embeddings.
+            device: (`torch.device`, *optional*):
+                Torch device.
+            num_images_per_prompt (`int`, defaults to 1):
+                Number of images that should be generated per prompt.
+            do_classifier_free_guidance (`bool`, defaults to True):
+                Whether to use classifier free guidance or not.
+        """
+        device = device or self._execution_device
+
+        if ip_adapter_image_embeds is not None:
+            if do_classifier_free_guidance:
+                single_negative_image_embeds, single_image_embeds = ip_adapter_image_embeds.chunk(2)
+            else:
+                single_image_embeds = ip_adapter_image_embeds
+        elif ip_adapter_image is not None:
             single_image_embeds = self.encode_image(ip_adapter_image)
             if do_classifier_free_guidance:
                 single_negative_image_embeds = torch.zeros_like(single_image_embeds)
         else:
-            if do_classifier_free_guidance:
-                single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-            else:
-                single_image_embeds = ip_adapter_image_embeds
+            raise ValueError("Neither `ip_adapter_image_embeds` or `ip_adapter_image_embeds` were provided.")
 
         image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
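A hedged sketch of the classifier-free-guidance layout the reworked method expects when precomputed embeddings are passed: the unconditional (negative) half comes first, so `chunk(2)` recovers the (negative, positive) pair. Shapes are illustrative.

```python
import torch

single_image_embeds = torch.randn(1, 64, 2432)  # "positive" image embedding (illustrative shape)
single_negative_image_embeds = torch.zeros_like(single_image_embeds)

# Layout expected for `ip_adapter_image_embeds` under classifier-free guidance:
ip_adapter_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])

negative, positive = ip_adapter_image_embeds.chunk(2)  # what the method does internally
assert torch.equal(positive, single_image_embeds)
```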

@@ -733,7 +767,7 @@ def __call__(
         pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         ip_adapter_image: Optional[PipelineImageInput] = None,
-        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         joint_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -810,11 +844,10 @@ def __call__(
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
             ip_adapter_image (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
-            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
-                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
-                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
-                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
-                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            ip_adapter_image_embeds (`torch.Tensor`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. Should be a tensor of shape `(batch_size, num_images,
+                emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to
+                `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -950,8 +983,6 @@ def __call__(
         )
 
         # 6. Prepare image embeddings
-        # Either image is passed and ip_adapter is active
-        # Or image_embeds are passed directly
         if (ip_adapter_image is not None and self.is_ip_adapter_active) or ip_adapter_image_embeds is not None:
             ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds(
                 ip_adapter_image,
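Putting the changes together, a hedged end-to-end sketch of calling the pipeline with `ip_adapter_image`; repository ids, weight names, and the image URL are placeholders.

```python
import torch
from diffusers import StableDiffusion3Pipeline
from diffusers.utils import load_image

pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-large", torch_dtype=torch.float16
).to("cuda")
pipe.load_ip_adapter("<ip-adapter-repo>", weight_name="ip-adapter.safetensors")  # placeholders
pipe.set_ip_adapter_scale(0.6)

reference = load_image("https://example.com/reference.png")  # placeholder URL
image = pipe(
    prompt="a photo in the style of the reference image",
    ip_adapter_image=reference,  # embeddings are computed via prepare_ip_adapter_image_embeds
    num_inference_steps=28,
    guidance_scale=7.0,
).images[0]
```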
