
Commit 30a3bb7

pack/unpack latents
1 parent c201880 commit 30a3bb7

2 files changed: +55 −42 lines changed

src/diffusers/models/transformers/transformer_ltx.py

Lines changed: 11 additions & 27 deletions
@@ -116,17 +116,14 @@ def __init__(
         self.theta = theta
 
     def forward(
-        self, hidden_states: torch.Tensor, rope_interpolation_scale: Optional[Tuple[torch.Tensor, float, float]] = None
+        self, hidden_states: torch.Tensor, num_frames: int, height: int, width: int, rope_interpolation_scale: Optional[Tuple[torch.Tensor, float, float]] = None
     ) -> Tuple[torch.Tensor, torch.Tensor]:
-        batch_size, num_channels, num_frames, height, width = hidden_states.shape
-        post_patch_num_frames = num_frames // self.patch_size_t
-        post_patch_height = height // self.patch_size
-        post_patch_width = width // self.patch_size
+        batch_size = hidden_states.size(0)
 
         # Always compute rope in fp32
-        grid_h = torch.arange(post_patch_height, dtype=torch.float32, device=hidden_states.device)
-        grid_w = torch.arange(post_patch_width, dtype=torch.float32, device=hidden_states.device)
-        grid_f = torch.arange(post_patch_num_frames, dtype=torch.float32, device=hidden_states.device)
+        grid_h = torch.arange(height, dtype=torch.float32, device=hidden_states.device)
+        grid_w = torch.arange(width, dtype=torch.float32, device=hidden_states.device)
+        grid_f = torch.arange(num_frames, dtype=torch.float32, device=hidden_states.device)
         grid = torch.meshgrid(grid_f, grid_h, grid_w, indexing="ij")
         grid = torch.stack(grid, dim=0)
         grid = grid.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)
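Because the latents now reach the rotary embedding already packed into a flat sequence, the frame/height/width grid sizes can no longer be read off a 5D tensor shape and are passed in explicitly by the caller. A minimal standalone sketch of the fp32 grid construction, using toy dimensions that are not taken from any model config:

import torch

# Toy post-patch latent dimensions (placeholders, not LTX defaults).
num_frames, height, width = 3, 4, 6
grid_h = torch.arange(height, dtype=torch.float32)
grid_w = torch.arange(width, dtype=torch.float32)
grid_f = torch.arange(num_frames, dtype=torch.float32)

# Same meshgrid/stack as in the diff: one (f, h, w) coordinate triple per latent position.
grid = torch.stack(torch.meshgrid(grid_f, grid_h, grid_w, indexing="ij"), dim=0)
print(grid.shape)  # torch.Size([3, 3, 4, 6]) -> (coordinate axis, frames, height, width)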
@@ -374,28 +371,20 @@ def forward(
         encoder_hidden_states: torch.Tensor,
         timestep: torch.LongTensor,
         encoder_attention_mask: torch.Tensor,
+        num_frames: int,
+        height: int,
+        width: int,
         rope_interpolation_scale: Optional[Tuple[float, float, float]] = None,
         return_dict: bool = True,
     ) -> torch.Tensor:
-        image_rotary_emb = self.rope(hidden_states, rope_interpolation_scale)
+        image_rotary_emb = self.rope(hidden_states, num_frames, height, width, rope_interpolation_scale)
 
         # convert encoder_attention_mask to a bias the same way we do for attention_mask
         if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
             encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
             encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
 
-        batch_size, num_channels, num_frames, height, width = hidden_states.shape
-        p = self.config.patch_size
-        p_t = self.config.patch_size_t
-
-        post_patch_height = height // p
-        post_patch_width = width // p
-        post_patch_num_frames = num_frames // p_t
-
-        hidden_states = hidden_states.reshape(
-            batch_size, -1, post_patch_num_frames, p_t, post_patch_height, p, post_patch_width, p
-        )
-        hidden_states = hidden_states.permute(0, 2, 4, 6, 1, 3, 5, 7).flatten(4, 7).flatten(1, 3)
+        batch_size = hidden_states.size(0)
         hidden_states = self.proj_in(hidden_states)
 
         temb, embedded_timestep = self.time_embed(
@@ -446,12 +435,7 @@ def custom_forward(*inputs):
 
         hidden_states = self.norm_out(hidden_states)
         hidden_states = hidden_states * (1 + scale) + shift
-        hidden_states = self.proj_out(hidden_states)
-
-        hidden_states = hidden_states.reshape(
-            batch_size, post_patch_num_frames, post_patch_height, post_patch_width, -1, p_t, p, p
-        )
-        output = hidden_states.permute(0, 4, 1, 5, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(2, 3)
+        output = self.proj_out(hidden_states)
 
         if not return_dict:
             return (output,)
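With the patchify/unpatchify reshapes removed, the transformer consumes and returns the packed (batch, seq_len, channels) layout end to end; the pipeline is now responsible for packing before denoising and unpacking before decoding. A rough shape sketch, where every dimension below is an illustrative placeholder rather than a config value:

import torch

# Illustrative packed input for the transformer; none of these values come from the config.
batch_size, packed_dim = 2, 128
latent_num_frames, latent_height, latent_width = 5, 16, 22
seq_len = latent_num_frames * latent_height * latent_width  # patch sizes of 1 assumed

hidden_states = torch.randn(batch_size, seq_len, packed_dim)
print(hidden_states.shape)  # torch.Size([2, 1760, 128]); proj_in/proj_out keep this layout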

src/diffusers/pipelines/ltx/pipeline_ltx.py

Lines changed: 44 additions & 15 deletions
@@ -387,6 +387,34 @@ def check_inputs(
                     f" {negative_prompt_attention_mask.shape}."
                 )
 
+    @staticmethod
+    def _pack_latents(latents: torch.Tensor, patch_size: int = 1, patch_size_t: int = 1) -> torch.Tensor:
+        batch_size, num_channels, num_frames, height, width = latents.shape
+        post_patch_num_frames = num_frames // patch_size_t
+        post_patch_height = height // patch_size
+        post_patch_width = width // patch_size
+        latents = latents.reshape(
+            batch_size,
+            -1,
+            post_patch_num_frames,
+            patch_size_t,
+            post_patch_height,
+            patch_size,
+            post_patch_width,
+            patch_size,
+        )
+        latents = latents.permute(0, 2, 4, 6, 1, 3, 5, 7).flatten(4, 7).flatten(1, 3)
+        return latents
+
+    @staticmethod
+    def _unpack_latents(
+        latents: torch.Tensor, num_frames: int, height: int, width: int, patch_size: int = 1, patch_size_t: int = 1
+    ) -> torch.Tensor:
+        batch_size, num_channels, video_sequence_length = latents.shape
+        latents = latents.reshape(batch_size, num_frames, height, width, -1, patch_size_t, patch_size, patch_size)
+        latents = latents.permute(0, 4, 1, 5, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(2, 3)
+        return latents
+
     def prepare_latents(
         self,
         batch_size: int = 1,
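The two helpers are exact inverses of each other. A standalone round-trip check that mirrors the reshape/permute sequences above (tensor sizes are arbitrary; patch sizes of 1 match the call sites in this pipeline):

import torch

b, c, f, h, w = 1, 128, 3, 4, 6   # arbitrary toy latent shape
p, p_t = 1, 1                     # patch sizes used at the call sites below

latents = torch.randn(b, c, f, h, w)

# pack: (b, c, f, h, w) -> (b, f*h*w, c) when p = p_t = 1
packed = latents.reshape(b, -1, f // p_t, p_t, h // p, p, w // p, p)
packed = packed.permute(0, 2, 4, 6, 1, 3, 5, 7).flatten(4, 7).flatten(1, 3)

# unpack: (b, seq_len, dim) -> (b, c, f, h, w)
unpacked = packed.reshape(b, f // p_t, h // p, w // p, -1, p_t, p, p)
unpacked = unpacked.permute(0, 4, 1, 5, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(2, 3)

assert packed.shape == (b, f * h * w, c)
assert torch.equal(unpacked, latents)  # the reshapes round-trip exactly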
@@ -415,20 +443,9 @@ def prepare_latents(
         )
 
         latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        latents = self._pack_latents(latents, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size)
         return latents
 
-    def decode_latents(self, latents: torch.Tensor):
-        # unscale/denormalize the latents
-        latents_mean = self.vae.latents_mean.view(1, self.vae.config.latent_channels, 1, 1, 1).to(
-            latents.device, latents.dtype
-        )
-        latents_std = self.vae.latents_std.view(1, self.vae.config.latent_channels, 1, 1, 1).to(
-            latents.device, latents.dtype
-        )
-        latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
-        video = self.vae.decode(latents, return_dict=False)[0]
-        return video
-
     @property
     def guidance_scale(self):
         return self._guidance_scale
@@ -610,10 +627,10 @@ def __call__(
         )
 
         # 5. Prepare timesteps
-        latent_frames = latents.size(2)
+        latent_num_frames = (num_frames - 1) // self.vae_temporal_compression_ratio + 1
         latent_height = height // self.vae_spatial_compression_ratio
         latent_width = width // self.vae_spatial_compression_ratio
-        video_sequence_length = latent_height * latent_width * latent_frames
+        video_sequence_length = latent_num_frames * latent_height * latent_width
         sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
         mu = calculate_shift(
             video_sequence_length,
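The latent frame count is now derived from the requested num_frames and the VAE's temporal compression rather than read off the latent tensor, which still works once the latents are already packed at this point. A quick worked example with assumed values (161 frames and 8x temporal compression are illustrative, not taken from this diff):

num_frames = 161                      # assumed request
vae_temporal_compression_ratio = 8    # assumed VAE temporal compression
latent_num_frames = (num_frames - 1) // vae_temporal_compression_ratio + 1
print(latent_num_frames)              # 21, i.e. (161 - 1) // 8 + 1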
@@ -656,6 +673,9 @@ def __call__(
                     encoder_hidden_states=prompt_embeds,
                     timestep=timestep,
                     encoder_attention_mask=prompt_attention_mask,
+                    num_frames=latent_num_frames,
+                    height=latent_height,
+                    width=latent_width,
                     rope_interpolation_scale=rope_interpolation_scale,
                     return_dict=False,
                 )[0]
@@ -689,7 +709,16 @@ def __call__(
         if output_type == "latent":
             video = latents
         else:
-            video = self.decode_latents(latents)
+            latents = self._unpack_latents(latents, latent_num_frames, latent_height, latent_width, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size)
+            # unscale/denormalize the latents
+            latents_mean = self.vae.latents_mean.view(1, self.vae.config.latent_channels, 1, 1, 1).to(
+                latents.device, latents.dtype
+            )
+            latents_std = self.vae.latents_std.view(1, self.vae.config.latent_channels, 1, 1, 1).to(
+                latents.device, latents.dtype
+            )
+            latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
+            video = self.vae.decode(latents, return_dict=False)[0]
             video = self.video_processor.postprocess_video(video, output_type=output_type)
 
         # Offload all models
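The denormalization folded into __call__ here is the inverse of the per-channel normalization applied when the latents were produced; a minimal sketch with placeholder statistics (the real mean, std, and scaling factor live on the VAE, and the encode-side formula below is assumed rather than taken from this diff):

import torch

# Placeholder per-channel statistics and scaling factor, for illustration only.
latent_channels, scaling_factor = 4, 1.0
latents_mean = torch.randn(1, latent_channels, 1, 1, 1)
latents_std = torch.rand(1, latent_channels, 1, 1, 1) + 0.5

x = torch.randn(2, latent_channels, 3, 4, 4)

# encode-side normalization (assumed form): z = (x - mean) * scaling_factor / std
z = (x - latents_mean) * scaling_factor / latents_std
# decode-side denormalization, as in the diff: x = z * std / scaling_factor + mean
recovered = z * latents_std / scaling_factor + latents_mean
assert torch.allclose(recovered, x)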
