
Commit 60d2673

Fixes
1 parent 6a6636c commit 60d2673

4 files changed: +205 -69 lines changed

src/diffusers/loaders/ip_adapter.py

Lines changed: 16 additions & 3 deletions
@@ -357,10 +357,10 @@ class FluxIPAdapterMixin:
     def load_ip_adapter(
         self,
         pretrained_model_name_or_path_or_dict: Union[str, List[str], Dict[str, torch.Tensor]],
-        subfolder: Union[str, List[str]],
         weight_name: Union[str, List[str]],
+        subfolder: Optional[Union[str, List[str]]] = "",
         image_encoder_pretrained_model_name_or_path: Optional[str] = "image_encoder",
-        image_encoder_subfolder: Optional[str] = None,
+        image_encoder_subfolder: Optional[str] = "",
         **kwargs,
     ):
         """
@@ -492,6 +492,7 @@ def load_ip_adapter(
                             ".".join(key.split(".")[1:])
                             .replace("ip_adapter_double_stream_k_proj", "to_k_ip")
                             .replace("ip_adapter_double_stream_v_proj", "to_v_ip")
+                            .replace("processor.", "")
                         )
                         state_dict["ip_adapter"][diffusers_name] = f.get_tensor(key)
             else:
@@ -555,10 +556,22 @@ def set_ip_adapter_scale(self, scale):
         ```
         """
         transformer = self.transformer
+        if not isinstance(scale, list):
+            scale = [scale]
+
+        scale_configs = scale

         for attn_name, attn_processor in transformer.attn_processors.items():
             if isinstance(attn_processor, (FluxIPAdapterAttnProcessor2_0)):
-                attn_processor.scale = scale
+                if len(scale_configs) != len(attn_processor.scale):
+                    raise ValueError(
+                        f"Cannot assign {len(scale_configs)} scale_configs to "
+                        f"{len(attn_processor.scale)} IP-Adapter."
+                    )
+                elif len(scale_configs) == 1:
+                    scale_configs = scale_configs * len(attn_processor.scale)
+                for i, scale_config in enumerate(scale_configs):
+                    attn_processor.scale[i] = scale_config

     def unload_ip_adapter(self):
         """

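Note: with this change, `weight_name` moves ahead of `subfolder` in the `load_ip_adapter` signature and `subfolder` becomes optional, while `set_ip_adapter_scale` now accepts either a single float or a per-adapter list. A minimal usage sketch follows; the repository id, weight file name, image encoder path, and scale values are illustrative placeholders and assume a diffusers build that includes this commit.

import torch
from diffusers import FluxPipeline

pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)

# `subfolder` now defaults to "" and may be omitted; `weight_name` names the file to load.
pipe.load_ip_adapter(
    "XLabs-AI/flux-ip-adapter",            # placeholder repo id
    weight_name="ip_adapter.safetensors",  # placeholder weight file
    image_encoder_pretrained_model_name_or_path="openai/clip-vit-large-patch14",  # placeholder
)

# A scalar applies the same weight to every loaded IP-Adapter; with several adapters
# loaded, a list sets one weight per adapter, e.g. pipe.set_ip_adapter_scale([0.5, 0.8]).
pipe.set_ip_adapter_scale(0.6)
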
src/diffusers/loaders/transformer_flux.py

Lines changed: 38 additions & 28 deletions
@@ -84,7 +84,7 @@ def _convert_ip_adapter_image_proj_to_diffusers(self, state_dict, low_cpu_mem_us

         return image_projection

-    def _convert_ip_adapter_attn_to_diffusers(self, state_dict, low_cpu_mem_usage=False):
+    def _convert_ip_adapter_attn_to_diffusers(self, state_dicts, low_cpu_mem_usage=False):
         from ..models.attention_processor import (
             FluxIPAdapterAttnProcessor2_0,
         )
@@ -110,35 +110,47 @@ def _convert_ip_adapter_attn_to_diffusers(self, state_dict, low_cpu_mem_usage=Fa

         # set ip-adapter cross-attention processors & load state_dict
         attn_procs = {}
-        key_id = 1
+        key_id = 0
         init_context = init_empty_weights if low_cpu_mem_usage else nullcontext
         for name in self.attn_processors.keys():
             if name.startswith("single_transformer_blocks"):
-                continue
-
-            cross_attention_dim = self.config.joint_attention_dim
-            hidden_size = self.config.inner_dim
-            attn_processor_class = FluxIPAdapterAttnProcessor2_0
-
-            with init_context():
-                attn_procs[name] = attn_processor_class(
-                    hidden_size=hidden_size,
-                    cross_attention_dim=cross_attention_dim,
-                    scale=1.0,
-                )
-
-            value_dict = {}
-            value_dict.update({"to_k_ip.weight": state_dict["ip_adapter"][f"{key_id}.to_k_ip.weight"]})
-            value_dict.update({"to_v_ip.weight": state_dict["ip_adapter"][f"{key_id}.to_v_ip.weight"]})
-
-            if not low_cpu_mem_usage:
-                attn_procs[name].load_state_dict(value_dict)
+                attn_processor_class = self.attn_processors[name].__class__
+                attn_procs[name] = attn_processor_class()
             else:
-                device = next(iter(value_dict.values())).device
-                dtype = next(iter(value_dict.values())).dtype
-                load_model_dict_into_meta(attn_procs[name], value_dict, device=device, dtype=dtype)
-
-            key_id += 1
+                cross_attention_dim = self.config.joint_attention_dim
+                hidden_size = self.inner_dim
+                attn_processor_class = FluxIPAdapterAttnProcessor2_0
+                num_image_text_embeds = []
+                for state_dict in state_dicts:
+                    if "proj.weight" in state_dict["image_proj"]:
+                        # IP-Adapter
+                        num_image_text_embeds += [4]
+
+                with init_context():
+                    attn_procs[name] = attn_processor_class(
+                        hidden_size=hidden_size,
+                        cross_attention_dim=cross_attention_dim,
+                        scale=1.0,
+                        num_tokens=num_image_text_embeds,
+                        dtype=self.dtype,
+                        device=self.device,
+                    )
+
+                value_dict = {}
+                for i, state_dict in enumerate(state_dicts):
+                    value_dict.update({f"to_k_ip.{i}.weight": state_dict["ip_adapter"][f"{key_id}.to_k_ip.weight"]})
+                    value_dict.update({f"to_v_ip.{i}.weight": state_dict["ip_adapter"][f"{key_id}.to_v_ip.weight"]})
+                    value_dict.update({f"to_k_ip.{i}.bias": state_dict["ip_adapter"][f"{key_id}.to_k_ip.bias"]})
+                    value_dict.update({f"to_v_ip.{i}.bias": state_dict["ip_adapter"][f"{key_id}.to_v_ip.bias"]})
+
+                if not low_cpu_mem_usage:
+                    attn_procs[name].load_state_dict(value_dict)
+                else:
+                    device = self.device
+                    dtype = self.dtype
+                    load_model_dict_into_meta(attn_procs[name], value_dict, device=device, dtype=dtype)
+
+                key_id += 1

         return attn_procs

@@ -160,5 +172,3 @@ def _load_ip_adapter_weights(self, state_dicts, low_cpu_mem_usage=False):

         self.encoder_hid_proj = MultiIPAdapterImageProjection(image_projection_layers)
         self.config.encoder_hid_dim_type = "ip_image_proj"
-
-        self.to(dtype=self.dtype, device=self.device)

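Note: the converter now takes a list of state dicts (one per IP-Adapter) and builds a single `FluxIPAdapterAttnProcessor2_0` per double-stream block, whose `to_k_ip`/`to_v_ip` ModuleLists hold one Linear per adapter. The runnable sketch below only illustrates the resulting key layout; the dummy state dicts, shapes, and two-adapter setup are hypothetical, not taken from a real checkpoint.

import torch

def fake_adapter_state_dict(num_blocks=1, hidden=16, cross=8):
    # Stand-in for a loaded IP-Adapter checkpoint: {"image_proj": ..., "ip_adapter": ...}
    sd = {"image_proj": {"proj.weight": torch.zeros(1)}, "ip_adapter": {}}
    for block in range(num_blocks):
        for proj in ("to_k_ip", "to_v_ip"):
            sd["ip_adapter"][f"{block}.{proj}.weight"] = torch.zeros(hidden, cross)
            sd["ip_adapter"][f"{block}.{proj}.bias"] = torch.zeros(hidden)
    return sd

state_dicts = [fake_adapter_state_dict(), fake_adapter_state_dict()]  # two adapters (hypothetical)
key_id = 0  # index of the double-stream block being populated

value_dict = {}
for i, state_dict in enumerate(state_dicts):
    # Keys follow the processor's ModuleList parameter names: to_k_ip.{i}.weight, to_v_ip.{i}.bias, ...
    for proj in ("to_k_ip", "to_v_ip"):
        value_dict[f"{proj}.{i}.weight"] = state_dict["ip_adapter"][f"{key_id}.{proj}.weight"]
        value_dict[f"{proj}.{i}.bias"] = state_dict["ip_adapter"][f"{key_id}.{proj}.bias"]

print(sorted(value_dict))  # to_k_ip.0.bias ... to_v_ip.1.weight, ready for load_state_dict()
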
src/diffusers/models/attention_processor.py

Lines changed: 133 additions & 33 deletions
@@ -482,7 +482,7 @@ def forward(
         # For standard processors that are defined here, `**cross_attention_kwargs` is empty

         attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys())
-        quiet_attn_parameters = {"ip_adapter_masks"}
+        quiet_attn_parameters = {"ip_adapter_masks", "image_projection"}
         unused_kwargs = [
             k for k, _ in cross_attention_kwargs.items() if k not in attn_parameters and k not in quiet_attn_parameters
         ]
@@ -1893,31 +1893,43 @@ def __call__(
         return hidden_states


-class FluxIPAdapterAttnProcessor2_0:
+class FluxIPAdapterAttnProcessor2_0(torch.nn.Module):
     """Flux Attention processor for IP-Adapter."""

-    def __init__(self, hidden_size: int, cross_attention_dim: int, scale: float = 1.0):
+    def __init__(
+        self, hidden_size: int, cross_attention_dim: int, num_tokens=(4,), scale=1.0, device=None, dtype=None
+    ):
         super().__init__()

-        r"""
-        Args:
-            hidden_size (`int`):
-                The hidden size of the attention layer.
-            cross_attention_dim (`int`):
-                The number of channels in the `encoder_hidden_states`.
-            scale (`float`, defaults to 1.0):
-                the weight scale of image prompt.
-        """
-
         if not hasattr(F, "scaled_dot_product_attention"):
             raise ImportError(
-                "FluxIPAdapterAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
+                f"{self.__class__.__name__} requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
             )

+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+
+        if not isinstance(num_tokens, (tuple, list)):
+            num_tokens = [num_tokens]
+
+        if not isinstance(scale, list):
+            scale = [scale] * len(num_tokens)
+        if len(scale) != len(num_tokens):
+            raise ValueError("`scale` should be a list of integers with the same length as `num_tokens`.")
         self.scale = scale

-        self.to_k_ip = nn.Linear(cross_attention_dim, hidden_size)
-        self.to_v_ip = nn.Linear(cross_attention_dim, hidden_size)
+        self.to_k_ip = nn.ModuleList(
+            [
+                nn.Linear(cross_attention_dim, hidden_size, bias=True, device=device, dtype=dtype)
+                for _ in range(len(num_tokens))
+            ]
+        )
+        self.to_v_ip = nn.ModuleList(
+            [
+                nn.Linear(cross_attention_dim, hidden_size, bias=True, device=device, dtype=dtype)
+                for _ in range(len(num_tokens))
+            ]
+        )

     def __call__(
         self,
@@ -1926,24 +1938,27 @@ def __call__(
         encoder_hidden_states: torch.FloatTensor = None,
         attention_mask: Optional[torch.FloatTensor] = None,
         image_rotary_emb: Optional[torch.Tensor] = None,
-        image_projection: Optional[torch.Tensor] = None,
+        image_projection: Optional[List[torch.Tensor]] = None,
+        ip_adapter_masks: Optional[torch.Tensor] = None,
     ) -> torch.FloatTensor:
+        if image_projection is None:
+            raise ValueError("image_projection is None")
         batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape

         # `sample` projections.
-        query = attn.to_q(hidden_states)
+        hidden_states_query_proj = attn.to_q(hidden_states)
         key = attn.to_k(hidden_states)
         value = attn.to_v(hidden_states)

         inner_dim = key.shape[-1]
         head_dim = inner_dim // attn.heads

-        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        hidden_states_query_proj = hidden_states_query_proj.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
         key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
         value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

         if attn.norm_q is not None:
-            query = attn.norm_q(query)
+            hidden_states_query_proj = attn.norm_q(hidden_states_query_proj)
         if attn.norm_k is not None:
             key = attn.norm_k(key)

@@ -1970,7 +1985,7 @@ def __call__(
                 encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)

             # attention
-            query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
+            query = torch.cat([encoder_hidden_states_query_proj, hidden_states_query_proj], dim=2)
             key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
             value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)

@@ -1997,19 +2012,104 @@ def __call__(
             encoder_hidden_states = attn.to_add_out(encoder_hidden_states)

             # IP-adapter
-            ip_key = self.to_k_ip(image_projection)
-            ip_value = self.to_v_ip(image_projection)
+            ip_hidden_states = image_projection
+
+            if ip_adapter_masks is not None:
+                if not isinstance(ip_adapter_masks, List):
+                    # for backward compatibility, we accept `ip_adapter_mask` as a tensor of shape [num_ip_adapter, 1, height, width]
+                    ip_adapter_masks = list(ip_adapter_masks.unsqueeze(1))
+                if not (len(ip_adapter_masks) == len(self.scale) == len(ip_hidden_states)):
+                    raise ValueError(
+                        f"Length of ip_adapter_masks array ({len(ip_adapter_masks)}) must match "
+                        f"length of self.scale array ({len(self.scale)}) and number of ip_hidden_states "
+                        f"({len(ip_hidden_states)})"
+                    )
+                else:
+                    for index, (mask, scale, ip_state) in enumerate(
+                        zip(ip_adapter_masks, self.scale, ip_hidden_states)
+                    ):
+                        if not isinstance(mask, torch.Tensor) or mask.ndim != 4:
+                            raise ValueError(
+                                "Each element of the ip_adapter_masks array should be a tensor with shape "
+                                "[1, num_images_for_ip_adapter, height, width]."
+                                " Please use `IPAdapterMaskProcessor` to preprocess your mask"
+                            )
+                        if mask.shape[1] != ip_state.shape[1]:
+                            raise ValueError(
+                                f"Number of masks ({mask.shape[1]}) does not match "
+                                f"number of ip images ({ip_state.shape[1]}) at index {index}"
+                            )
+                        if isinstance(scale, list) and not len(scale) == mask.shape[1]:
+                            raise ValueError(
+                                f"Number of masks ({mask.shape[1]}) does not match "
+                                f"number of scales ({len(scale)}) at index {index}"
+                            )
+            else:
+                ip_adapter_masks = [None] * len(self.scale)
+
+            ip_query = hidden_states_query_proj
+            # for ip-adapter
+            for current_ip_hidden_states, scale, to_k_ip, to_v_ip, mask in zip(
+                ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip, ip_adapter_masks
+            ):
+                skip = False
+                if isinstance(scale, list):
+                    if all(s == 0 for s in scale):
+                        skip = True
+                elif scale == 0:
+                    skip = True
+                if not skip:
+                    if mask is not None:
+                        if not isinstance(scale, list):
+                            scale = [scale] * mask.shape[1]
+
+                        current_num_images = mask.shape[1]
+                        for i in range(current_num_images):
+                            ip_key = to_k_ip(current_ip_hidden_states[:, i, :, :])
+                            ip_value = to_v_ip(current_ip_hidden_states[:, i, :, :])
+
+                            ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+                            ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+                            # the output of sdp = (batch, num_heads, seq_len, head_dim)
+                            # TODO: add support for attn.scale when we move to Torch 2.1
+                            _current_ip_hidden_states = F.scaled_dot_product_attention(
+                                ip_query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
+                            )
+
+                            _current_ip_hidden_states = _current_ip_hidden_states.transpose(1, 2).reshape(
+                                batch_size, -1, attn.heads * head_dim
+                            )
+                            _current_ip_hidden_states = _current_ip_hidden_states.to(ip_query.dtype)
+
+                            mask_downsample = IPAdapterMaskProcessor.downsample(
+                                mask[:, i, :, :],
+                                batch_size,
+                                _current_ip_hidden_states.shape[1],
+                                _current_ip_hidden_states.shape[2],
+                            )
+
+                            mask_downsample = mask_downsample.to(dtype=ip_query.dtype, device=ip_query.device)
+                            hidden_states = hidden_states + scale[i] * (_current_ip_hidden_states * mask_downsample)
+                    else:
+                        ip_key = to_k_ip(current_ip_hidden_states)
+                        ip_value = to_v_ip(current_ip_hidden_states)

-            ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-            ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-            # the output of sdp = (batch, num_heads, seq_len, head_dim)
-            # TODO: add support for attn.scale when we move to Torch 2.1
-            ip_hidden_states = F.scaled_dot_product_attention(
-                query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
-            )
-            ip_hidden_states = ip_hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
-            ip_hidden_states = ip_hidden_states.to(query.dtype)
-            hidden_states = hidden_states + self.scale * ip_hidden_states
+                        ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+                        ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+                        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+                        # TODO: add support for attn.scale when we move to Torch 2.1
+                        current_ip_hidden_states = F.scaled_dot_product_attention(
+                            ip_query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
+                        )
+
+                        current_ip_hidden_states = current_ip_hidden_states.transpose(1, 2).reshape(
+                            batch_size, -1, attn.heads * head_dim
+                        )
+                        current_ip_hidden_states = current_ip_hidden_states.to(ip_query.dtype)
+
+                        hidden_states = hidden_states + scale * current_ip_hidden_states

             return hidden_states, encoder_hidden_states
         else:

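Note: for the unmasked path, the rewritten processor loops over the loaded adapters, projects each adapter's image embeddings through its own `to_k_ip`/`to_v_ip` Linear, attends against the sample-stream query, and adds the result to `hidden_states` weighted by that adapter's scale. The self-contained sketch below mirrors that accumulation with dummy shapes; every name and size is a placeholder, not taken from a real Flux checkpoint.

import torch
import torch.nn.functional as F

batch_size, heads, head_dim = 2, 4, 8
seq_len, num_ip_tokens, cross_dim = 16, 4, 32
hidden_size = heads * head_dim

hidden_states = torch.zeros(batch_size, seq_len, hidden_size)                     # sample stream
ip_query = torch.randn(batch_size, heads, seq_len, head_dim)                      # sample-stream query
scales = [1.0, 0.6]                                                               # one scale per adapter
to_k_ip = [torch.nn.Linear(cross_dim, hidden_size) for _ in scales]               # per-adapter K projections
to_v_ip = [torch.nn.Linear(cross_dim, hidden_size) for _ in scales]               # per-adapter V projections
ip_states = [torch.randn(batch_size, num_ip_tokens, cross_dim) for _ in scales]   # projected image embeds

for ip_state, scale, k_proj, v_proj in zip(ip_states, scales, to_k_ip, to_v_ip):
    ip_key = k_proj(ip_state).view(batch_size, -1, heads, head_dim).transpose(1, 2)
    ip_value = v_proj(ip_state).view(batch_size, -1, heads, head_dim).transpose(1, 2)
    out = F.scaled_dot_product_attention(ip_query, ip_key, ip_value)
    out = out.transpose(1, 2).reshape(batch_size, -1, heads * head_dim)
    hidden_states = hidden_states + scale * out  # each adapter contributes additively

print(hidden_states.shape)  # torch.Size([2, 16, 32])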