@@ -2532,6 +2532,7 @@ def __call__(
     ) -> torch.FloatTensor:
         if image_projection is None:
             raise ValueError("image_projection is None")
+        print(image_projection, image_projection.shape)
         batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape

         # `sample` projections.
@@ -2603,103 +2604,30 @@ def __call__(
             # IP-adapter
             ip_hidden_states = image_projection

-            if ip_adapter_masks is not None:
-                if not isinstance(ip_adapter_masks, List):
-                    # for backward compatibility, we accept `ip_adapter_mask` as a tensor of shape [num_ip_adapter, 1, height, width]
-                    ip_adapter_masks = list(ip_adapter_masks.unsqueeze(1))
-                if not (len(ip_adapter_masks) == len(self.scale) == len(ip_hidden_states)):
-                    raise ValueError(
-                        f"Length of ip_adapter_masks array ({len(ip_adapter_masks)}) must match "
-                        f"length of self.scale array ({len(self.scale)}) and number of ip_hidden_states "
-                        f"({len(ip_hidden_states)})"
-                    )
-                else:
-                    for index, (mask, scale, ip_state) in enumerate(
-                        zip(ip_adapter_masks, self.scale, ip_hidden_states)
-                    ):
-                        if not isinstance(mask, torch.Tensor) or mask.ndim != 4:
-                            raise ValueError(
-                                "Each element of the ip_adapter_masks array should be a tensor with shape "
-                                "[1, num_images_for_ip_adapter, height, width]."
-                                " Please use `IPAdapterMaskProcessor` to preprocess your mask"
-                            )
-                        if mask.shape[1] != ip_state.shape[1]:
-                            raise ValueError(
-                                f"Number of masks ({mask.shape[1]}) does not match "
-                                f"number of ip images ({ip_state.shape[1]}) at index {index}"
-                            )
-                        if isinstance(scale, list) and not len(scale) == mask.shape[1]:
-                            raise ValueError(
-                                f"Number of masks ({mask.shape[1]}) does not match "
-                                f"number of scales ({len(scale)}) at index {index}"
-                            )
-            else:
-                ip_adapter_masks = [None] * len(self.scale)
-
             ip_query = hidden_states_query_proj
             ip_attn_output = None
             # for ip-adapter
             for current_ip_hidden_states, scale, to_k_ip, to_v_ip, mask in zip(
                 ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip, ip_adapter_masks
             ):
-                skip = False
-                if isinstance(scale, list):
-                    if all(s == 0 for s in scale):
-                        skip = True
-                elif scale == 0:
-                    skip = True
-                if not skip:
-                    if mask is not None:
-                        if not isinstance(scale, list):
-                            scale = [scale] * mask.shape[1]
-
-                        current_num_images = mask.shape[1]
-                        for i in range(current_num_images):
-                            ip_key = to_k_ip(current_ip_hidden_states[:, i, :, :])
-                            ip_value = to_v_ip(current_ip_hidden_states[:, i, :, :])
+                ip_key = to_k_ip(current_ip_hidden_states)
+                ip_value = to_v_ip(current_ip_hidden_states)

-                            ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-                            ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+                ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+                ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

-                            # the output of sdp = (batch, num_heads, seq_len, head_dim)
-                            # TODO: add support for attn.scale when we move to Torch 2.1
-                            _current_ip_hidden_states = F.scaled_dot_product_attention(
-                                ip_query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
-                            )
-
-                            _current_ip_hidden_states = _current_ip_hidden_states.transpose(1, 2).reshape(
-                                batch_size, -1, attn.heads * head_dim
-                            )
-                            _current_ip_hidden_states = _current_ip_hidden_states.to(ip_query.dtype)
-
-                            mask_downsample = IPAdapterMaskProcessor.downsample(
-                                mask[:, i, :, :],
-                                batch_size,
-                                _current_ip_hidden_states.shape[1],
-                                _current_ip_hidden_states.shape[2],
-                            )
-
-                            mask_downsample = mask_downsample.to(dtype=ip_query.dtype, device=ip_query.device)
-                            hidden_states = hidden_states + scale[i] * (_current_ip_hidden_states * mask_downsample)
-                    else:
-                        ip_key = to_k_ip(current_ip_hidden_states)
-                        ip_value = to_v_ip(current_ip_hidden_states)
-
-                        ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-                        ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-
-                        # the output of sdp = (batch, num_heads, seq_len, head_dim)
-                        # TODO: add support for attn.scale when we move to Torch 2.1
-                        current_ip_hidden_states = F.scaled_dot_product_attention(
-                            ip_query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
-                        )
-
-                        current_ip_hidden_states = current_ip_hidden_states.transpose(1, 2).reshape(
-                            batch_size, -1, attn.heads * head_dim
-                        )
-                        current_ip_hidden_states = current_ip_hidden_states.to(ip_query.dtype)
+                # the output of sdp = (batch, num_heads, seq_len, head_dim)
+                # TODO: add support for attn.scale when we move to Torch 2.1
+                ip_attn_output = F.scaled_dot_product_attention(
+                    ip_query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
+                )

-                        ip_attn_output = scale * current_ip_hidden_states
+                ip_attn_output = ip_attn_output.transpose(1, 2).reshape(
+                    batch_size, -1, attn.heads * head_dim
+                )
+                ip_attn_output = scale * ip_attn_output
+                print(ip_attn_output)
+                ip_attn_output = ip_attn_output.to(ip_query.dtype)

             return hidden_states, encoder_hidden_states, ip_attn_output
         else:
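For reference, below is a minimal standalone sketch of the computation the new loop body performs: project the IP-adapter image embeddings through the per-adapter `to_k_ip`/`to_v_ip` layers, split heads, run `F.scaled_dot_product_attention` against the sample query, merge heads, and apply the adapter scale. The tensor sizes and the freestanding `nn.Linear` layers are illustrative assumptions for the sketch, not values taken from this commit.

import torch
import torch.nn as nn
import torch.nn.functional as F

# Illustrative sizes (assumed for this sketch only).
batch_size, seq_len, num_image_tokens = 2, 77, 4
heads, head_dim = 8, 64
cross_attention_dim = 1280
inner_dim = heads * head_dim

# Stand-ins for one IP-adapter's key/value projections (`to_k_ip`, `to_v_ip`).
to_k_ip = nn.Linear(cross_attention_dim, inner_dim, bias=False)
to_v_ip = nn.Linear(cross_attention_dim, inner_dim, bias=False)

# `ip_query` reuses the sample's query projection; `image_projection` stands in
# for the projected IP-adapter image embeddings passed into the processor.
ip_query = torch.randn(batch_size, heads, seq_len, head_dim)
image_projection = torch.randn(batch_size, num_image_tokens, cross_attention_dim)
scale = 1.0

# Same steps as the added hunk: project to K/V, split heads, SDPA, merge heads, scale.
ip_key = to_k_ip(image_projection).view(batch_size, -1, heads, head_dim).transpose(1, 2)
ip_value = to_v_ip(image_projection).view(batch_size, -1, heads, head_dim).transpose(1, 2)

ip_attn_output = F.scaled_dot_product_attention(
    ip_query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
)
ip_attn_output = ip_attn_output.transpose(1, 2).reshape(batch_size, -1, heads * head_dim)
ip_attn_output = scale * ip_attn_output
print(ip_attn_output.shape)  # torch.Size([2, 77, 512])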