Commit 3ab6db7

pag tests and refactor
1 parent 1ab6ab2 commit 3ab6db7

File tree: 3 files changed, +35 / -142 lines

src/diffusers/models/attention_processor.py
src/diffusers/pipelines/pag/pipeline_pag_sana.py
src/diffusers/pipelines/sana/pipeline_sana.py

src/diffusers/models/attention_processor.py

Lines changed: 23 additions & 119 deletions
@@ -5410,95 +5410,45 @@ def __call__(
         hidden_states: torch.Tensor,
         encoder_hidden_states: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
-        temb: Optional[torch.Tensor] = None,
-        *args,
-        **kwargs,
     ) -> torch.Tensor:
-        if len(args) > 0 or kwargs.get("scale", None) is not None:
-            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
-            deprecate("scale", "1.0.0", deprecation_message)
-
-        residual = hidden_states
-        if attn.spatial_norm is not None:
-            hidden_states = attn.spatial_norm(hidden_states, temb)
-
-        input_ndim = hidden_states.ndim
-
-        if input_ndim == 4:
-            batch_size, channel, height, width = hidden_states.shape
-            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        original_dtype = hidden_states.dtype

-        # chunk
         hidden_states_uncond, hidden_states_org, hidden_states_ptb = hidden_states.chunk(3)
         hidden_states_org = torch.cat([hidden_states_uncond, hidden_states_org])

-        # original path
-        batch_size, sequence_length, _ = (
-            hidden_states_org.shape if encoder_hidden_states is None else encoder_hidden_states.shape
-        )
-
         query = attn.to_q(hidden_states_org)
         key = attn.to_k(hidden_states_org)
         value = attn.to_v(hidden_states_org)

-        inner_dim = key.shape[-1]
-        head_dim = inner_dim // attn.heads
-
-        dtype = query.dtype
-
-        query = query.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1)
-        key = key.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1).transpose(-1, -2)
-        value = value.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1)
+        query = query.transpose(1, 2).unflatten(1, (attn.heads, -1))
+        key = key.transpose(1, 2).unflatten(1, (attn.heads, -1)).transpose(2, 3)
+        value = value.transpose(1, 2).unflatten(1, (attn.heads, -1))

-        query = self.kernel_func(query)  # B, h, h_d, N
+        query = self.kernel_func(query)
         key = self.kernel_func(key)

-        # need torch.float
         query, key, value = query.float(), key.float(), value.float()

         value = F.pad(value, (0, 0, 0, 1), mode="constant", value=self.pad_val)
-        vk = torch.matmul(value, key)
-        hidden_states_org = torch.matmul(vk, query)
+        scores = torch.matmul(value, key)
+        hidden_states_org = torch.matmul(scores, query)

-        if hidden_states_org.dtype in [torch.float16, torch.bfloat16]:
-            hidden_states_org = hidden_states_org.float()
         hidden_states_org = hidden_states_org[:, :, :-1] / (hidden_states_org[:, :, -1:] + self.eps)
+        hidden_states_org = hidden_states_org.flatten(1, 2).transpose(1, 2)
+        hidden_states_org = hidden_states_org.to(original_dtype)

-        hidden_states_org = hidden_states_org.view(batch_size, attn.heads * head_dim, -1).permute(0, 2, 1)
-        hidden_states_org = hidden_states_org.to(dtype)
-
-        # linear proj
         hidden_states_org = attn.to_out[0](hidden_states_org)
-        # dropout
         hidden_states_org = attn.to_out[1](hidden_states_org)

-        if input_ndim == 4:
-            hidden_states_org = hidden_states_org.transpose(-1, -2).reshape(batch_size, channel, height, width)
-
         # perturbed path (identity attention)
-        batch_size, sequence_length, _ = hidden_states_ptb.shape
+        hidden_states_ptb = attn.to_v(hidden_states_ptb).to(original_dtype)

-        value = attn.to_v(hidden_states_ptb)
-        hidden_states_ptb = value
-        hidden_states_ptb = hidden_states_ptb.to(dtype)
-
-        # linear proj
         hidden_states_ptb = attn.to_out[0](hidden_states_ptb)
-        # dropout
         hidden_states_ptb = attn.to_out[1](hidden_states_ptb)

-        if input_ndim == 4:
-            hidden_states_ptb = hidden_states_ptb.transpose(-1, -2).reshape(batch_size, channel, height, width)
-
-        # cat
         hidden_states = torch.cat([hidden_states_org, hidden_states_ptb])

-        if attn.residual_connection:
-            hidden_states = hidden_states + residual
-
-        hidden_states = hidden_states / attn.rescale_output_factor
-
-        if hidden_states.dtype == torch.float16:
+        if original_dtype == torch.float16:
             hidden_states = hidden_states.clip(-65504, 65504)

         return hidden_states
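Note: the reshape refactor above is purely cosmetic. Below is a minimal standalone sketch (not part of this commit; the shapes are assumed to match the (batch, seq_len, heads * head_dim) projections used by the processor) showing that the old reshape-based head split and the new unflatten-based one produce identical tensors:

import torch

batch, seq_len, heads, head_dim = 2, 16, 4, 8
x = torch.randn(batch, seq_len, heads * head_dim)

# old path: move channels first, then reshape into (batch, heads, head_dim, seq_len)
old = x.transpose(-1, -2).reshape(batch, heads, head_dim, -1)
# new path: same layout via unflatten, which only ever splits one dimension
new = x.transpose(1, 2).unflatten(1, (heads, -1))

assert torch.equal(old, new)  # both are (batch, heads, head_dim, seq_len)

Because unflatten does not need the flattened size spelled out, the batch_size/inner_dim/head_dim bookkeeping variables in the old code could be dropped.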
@@ -5520,93 +5470,47 @@ def __call__(
         hidden_states: torch.Tensor,
         encoder_hidden_states: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
-        temb: Optional[torch.Tensor] = None,
-        *args,
-        **kwargs,
     ) -> torch.Tensor:
-        if len(args) > 0 or kwargs.get("scale", None) is not None:
-            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
-            deprecate("scale", "1.0.0", deprecation_message)
-
-        residual = hidden_states
-        if attn.spatial_norm is not None:
-            hidden_states = attn.spatial_norm(hidden_states, temb)
-
-        input_ndim = hidden_states.ndim
-
-        if input_ndim == 4:
-            batch_size, channel, height, width = hidden_states.shape
-            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        original_dtype = hidden_states.dtype

-        # chunk
         hidden_states_org, hidden_states_ptb = hidden_states.chunk(2)

-        # original path
-        batch_size, sequence_length, _ = (
-            hidden_states_org.shape if encoder_hidden_states is None else encoder_hidden_states.shape
-        )
-
         query = attn.to_q(hidden_states_org)
         key = attn.to_k(hidden_states_org)
         value = attn.to_v(hidden_states_org)

-        inner_dim = key.shape[-1]
-        head_dim = inner_dim // attn.heads
-
-        dtype = query.dtype
-
-        query = query.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1)
-        key = key.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1).transpose(-1, -2)
-        value = value.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1)
+        query = query.transpose(1, 2).unflatten(1, (attn.heads, -1))
+        key = key.transpose(1, 2).unflatten(1, (attn.heads, -1)).transpose(2, 3)
+        value = value.transpose(1, 2).unflatten(1, (attn.heads, -1))

-        query = self.kernel_func(query)  # B, h, h_d, N
+        query = self.kernel_func(query)
         key = self.kernel_func(key)

-        # need torch.float
         query, key, value = query.float(), key.float(), value.float()

         value = F.pad(value, (0, 0, 0, 1), mode="constant", value=self.pad_val)
-        vk = torch.matmul(value, key)
-        hidden_states_org = torch.matmul(vk, query)
+        scores = torch.matmul(value, key)
+        hidden_states_org = torch.matmul(scores, query)

         if hidden_states_org.dtype in [torch.float16, torch.bfloat16]:
             hidden_states_org = hidden_states_org.float()
-        hidden_states_org = hidden_states_org[:, :, :-1] / (hidden_states_org[:, :, -1:] + self.eps)

-        hidden_states_org = hidden_states_org.view(batch_size, attn.heads * head_dim, -1).permute(0, 2, 1)
-        hidden_states_org = hidden_states_org.to(dtype)
+        hidden_states_org = hidden_states_org[:, :, :-1] / (hidden_states_org[:, :, -1:] + self.eps)
+        hidden_states_org = hidden_states_org.flatten(1, 2).transpose(1, 2)
+        hidden_states_org = hidden_states_org.to(original_dtype)

-        # linear proj
         hidden_states_org = attn.to_out[0](hidden_states_org)
-        # dropout
         hidden_states_org = attn.to_out[1](hidden_states_org)

-        if input_ndim == 4:
-            hidden_states_org = hidden_states_org.transpose(-1, -2).reshape(batch_size, channel, height, width)
-
         # perturbed path (identity attention)
-        batch_size, sequence_length, _ = hidden_states_ptb.shape
+        hidden_states_ptb = attn.to_v(hidden_states_ptb).to(original_dtype)

-        hidden_states_ptb = attn.to_v(hidden_states_ptb)
-        hidden_states_ptb = hidden_states_ptb.to(dtype)
-
-        # linear proj
         hidden_states_ptb = attn.to_out[0](hidden_states_ptb)
-        # dropout
         hidden_states_ptb = attn.to_out[1](hidden_states_ptb)

-        if input_ndim == 4:
-            hidden_states_ptb = hidden_states_ptb.transpose(-1, -2).reshape(batch_size, channel, height, width)
-
-        # cat
         hidden_states = torch.cat([hidden_states_org, hidden_states_ptb])

-        if attn.residual_connection:
-            hidden_states = hidden_states + residual
-
-        hidden_states = hidden_states / attn.rescale_output_factor
-
-        if hidden_states.dtype == torch.float16:
+        if original_dtype == torch.float16:
             hidden_states = hidden_states.clip(-65504, 65504)

         return hidden_states
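Note: both processors above implement the same linear-attention trick, so a toy sketch may help when reading the hunks. This is an illustration under assumptions, not library code: it assumes a ReLU feature map for kernel_func, pad_val = 1.0, and a small eps:

import torch
import torch.nn.functional as F

B, heads, head_dim, N = 1, 2, 8, 16
eps = 1e-15

# stand-ins for kernel_func(query)/kernel_func(key); shapes mirror the processor:
# query (B, heads, head_dim, N), key (B, heads, N, head_dim), value (B, heads, head_dim, N)
query = F.relu(torch.randn(B, heads, head_dim, N))
key = F.relu(torch.randn(B, heads, N, head_dim))
value = torch.randn(B, heads, head_dim, N)

# pad value with a row of ones so one pair of matmuls yields numerator and normalizer together
value = F.pad(value, (0, 0, 0, 1), mode="constant", value=1.0)  # (B, heads, head_dim + 1, N)
scores = torch.matmul(value, key)                               # (B, heads, head_dim + 1, head_dim)
out = torch.matmul(scores, query)                               # (B, heads, head_dim + 1, N)
out = out[:, :, :-1] / (out[:, :, -1:] + eps)                   # divide by the ones-row accumulator
print(out.shape)  # torch.Size([1, 2, 8, 16])

The main difference between the two hunks is the batch layout: the first chunks the batch into (uncond, org, ptb) for classifier-free guidance plus PAG, the second into (org, ptb) for PAG alone; in both, the perturbed branch skips attention entirely and only applies to_v and to_out (identity attention).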

src/diffusers/pipelines/pag/pipeline_pag_sana.py

Lines changed: 8 additions & 15 deletions
@@ -142,7 +142,7 @@ def __init__(
         vae: AutoencoderDC,
         transformer: SanaTransformer2DModel,
         scheduler: FlowDPMSolverMultistepScheduler,
-        pag_applied_layers: Union[str, List[str]] = "transformer_blocks.8",
+        pag_applied_layers: Union[str, List[str]] = "transformer_blocks.0",
     ):
         super().__init__()

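Note on the changed default: "transformer_blocks.0" points PAG at the first transformer block instead of the ninth. As a toy illustration only (this is not diffusers' actual PAG layer-selection code, just a sketch of how a dotted identifier like the new default can pick out modules by name):

import re
import torch.nn as nn

class TinyTransformer(nn.Module):
    def __init__(self, num_blocks: int = 3):
        super().__init__()
        self.transformer_blocks = nn.ModuleList([nn.Identity() for _ in range(num_blocks)])

model = TinyTransformer()
pattern = "transformer_blocks.0"  # same style of identifier as the new default
selected = [name for name, _ in model.named_modules() if re.search(pattern, name)]
print(selected)  # ['transformer_blocks.0']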
@@ -511,8 +511,11 @@ def _clean_caption(self, caption):

         return caption.strip()

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    # Copied from diffusers.pipelines.sana.pipeline_sana.SanaPipeline.prepare_latents
     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        if latents is not None:
+            return latents.to(device=device, dtype=dtype)
+
         shape = (
             batch_size,
             num_channels_latents,
@@ -525,13 +528,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
                 f" size of {batch_size}. Make sure the batch size matches the length of the generators."
             )

-        if latents is None:
-            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-        else:
-            latents = latents.to(device)
-
-        # scale the initial noise by the standard deviation required by the scheduler
-        latents = latents * self.scheduler.init_noise_sigma
+        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
         return latents

     @property
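Note: the new prepare_latents early-returns user-supplied latents and drops the init_noise_sigma scaling (presumably a no-op for the flow-matching scheduler used here). A minimal standalone sketch of the resulting control flow (hypothetical helper; torch.randn stands in for diffusers' randn_tensor utility):

from typing import Optional
import torch

def prepare_latents_sketch(
    shape: tuple,
    dtype: torch.dtype,
    device: torch.device,
    generator: Optional[torch.Generator] = None,
    latents: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    # user-supplied latents are only moved/cast, never rescaled
    if latents is not None:
        return latents.to(device=device, dtype=dtype)
    # otherwise draw fresh noise
    return torch.randn(shape, generator=generator, dtype=dtype).to(device)

noise = prepare_latents_sketch((1, 32, 32, 32), torch.float32, torch.device("cpu"))
print(noise.shape)  # torch.Size([1, 32, 32, 32])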
@@ -561,8 +558,8 @@ def __call__(
         sigmas: List[float] = None,
         guidance_scale: float = 4.5,
         num_images_per_prompt: Optional[int] = 1,
-        height: Optional[int] = None,
-        width: Optional[int] = None,
+        height: int = 1024,
+        width: int = 1024,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         latents: Optional[torch.Tensor] = None,
@@ -771,9 +768,6 @@ def __call__(
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

-        # 6.1 Prepare micro-conditions.
-        added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
-
         # 7. Denoising loop
         num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
         self._num_timesteps = len(timesteps)
@@ -796,7 +790,6 @@ def __call__(
                     encoder_hidden_states=prompt_embeds,
                     encoder_attention_mask=prompt_attention_mask,
                     timestep=timestep,
-                    added_cond_kwargs=added_cond_kwargs,
                     return_dict=False,
                 )[0]
                 noise_pred = noise_pred.float()

src/diffusers/pipelines/sana/pipeline_sana.py

Lines changed: 4 additions & 8 deletions
@@ -504,8 +504,10 @@ def _clean_caption(self, caption):

         return caption.strip()

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        if latents is not None:
+            return latents.to(device=device, dtype=dtype)
+
         shape = (
             batch_size,
             num_channels_latents,
@@ -518,13 +520,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
                 f" size of {batch_size}. Make sure the batch size matches the length of the generators."
             )

-        if latents is None:
-            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-        else:
-            latents = latents.to(device)
-
-        # scale the initial noise by the standard deviation required by the scheduler
-        latents = latents * self.scheduler.init_noise_sigma
+        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
         return latents

     @property
