@@ -7,7 +7,7 @@
 import math
 from typing import Dict, Optional, Tuple
 
-from .symmetric_patchifier import SymmetricPatchifier
+from .symmetric_patchifier import SymmetricPatchifier, latent_to_pixel_coords
 
 
 def get_timestep_embedding(
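Note for readers: `latent_to_pixel_coords` converts the patchifier's per-token latent `(frame, row, col)` indices into pixel-space coordinates. A minimal sketch of the helper, assuming the `[batch, 3, num_tokens]` coordinate layout used later in this diff:

```python
import torch

def latent_to_pixel_coords(latent_coords, scale_factors, causal_fix=False):
    # Scale each axis (frame, height, width) by its VAE compression factor.
    pixel_coords = latent_coords * torch.tensor(
        scale_factors, device=latent_coords.device
    )[None, :, None]
    if causal_fix:
        # A causal VAE decodes the first latent frame to a single pixel frame,
        # so later frames shift back by (scale - 1); clamp keeps t >= 0.
        pixel_coords[:, 0] = (pixel_coords[:, 0] + 1 - scale_factors[0]).clamp(min=0)
    return pixel_coords
```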
@@ -377,12 +377,16 @@ def __init__(self,
 
                  positional_embedding_theta=10000.0,
                  positional_embedding_max_pos=[20, 2048, 2048],
+                 causal_temporal_positioning=False,
+                 vae_scale_factors=(8, 32, 32),
                  dtype=None, device=None, operations=None, **kwargs):
         super().__init__()
         self.generator = None
+        self.vae_scale_factors = vae_scale_factors
         self.dtype = dtype
         self.out_channels = in_channels
         self.inner_dim = num_attention_heads * attention_head_dim
+        self.causal_temporal_positioning = causal_temporal_positioning
 
         self.patchify_proj = operations.Linear(in_channels, self.inner_dim, bias=True, dtype=dtype, device=device)
 
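The two new constructor arguments feed that coordinate mapping: `vae_scale_factors=(8, 32, 32)` is the VAE's (temporal, height, width) compression, and `causal_temporal_positioning` toggles the causal offset. A rough numeric illustration, under the sketch above:

```python
import torch

latent_frames = torch.tensor([0, 1, 2])
plain = latent_frames * 8                           # tensor([ 0,  8, 16])
causal = (latent_frames * 8 + 1 - 8).clamp(min=0)   # tensor([0, 1, 9])
# With the causal fix, latent frame k > 0 starts at pixel frame 8k - 7,
# matching a first latent frame that covers only one pixel frame.
```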
@@ -416,50 +420,31 @@ def __init__(self,
 
         self.patchifier = SymmetricPatchifier(1)
 
-    def forward(self, x, timestep, context, attention_mask, frame_rate=25, guiding_latent=None, guiding_latent_noise_scale=0, transformer_options={}, **kwargs):
+    def forward(self, x, timestep, context, attention_mask, frame_rate=25, transformer_options={}, keyframe_idxs=None, **kwargs):
         patches_replace = transformer_options.get("patches_replace", {})
 
-        indices_grid = self.patchifier.get_grid(
-            orig_num_frames=x.shape[2],
-            orig_height=x.shape[3],
-            orig_width=x.shape[4],
-            batch_size=x.shape[0],
-            scale_grid=((1 / frame_rate) * 8, 32, 32),
-            device=x.device,
-        )
-
-        if guiding_latent is not None:
-            ts = torch.ones([x.shape[0], 1, x.shape[2], x.shape[3], x.shape[4]], device=x.device, dtype=x.dtype)
-            input_ts = timestep.view([timestep.shape[0]] + [1] * (x.ndim - 1))
-            ts *= input_ts
-            ts[:, :, 0] = guiding_latent_noise_scale * (input_ts[:, :, 0] ** 2)
-            timestep = self.patchifier.patchify(ts)
-            input_x = x.clone()
-            x[:, :, 0] = guiding_latent[:, :, 0]
-            if guiding_latent_noise_scale > 0:
-                if self.generator is None:
-                    self.generator = torch.Generator(device=x.device).manual_seed(42)
-                elif self.generator.device != x.device:
-                    self.generator = torch.Generator(device=x.device).set_state(self.generator.get_state())
-
-                noise_shape = [guiding_latent.shape[0], guiding_latent.shape[1], 1, guiding_latent.shape[3], guiding_latent.shape[4]]
-                scale = guiding_latent_noise_scale * (input_ts ** 2)
-                guiding_noise = scale * torch.randn(size=noise_shape, device=x.device, generator=self.generator)
-
-                x[:, :, 0] = guiding_noise[:, :, 0] + x[:, :, 0] * (1.0 - scale[:, :, 0])
+        orig_shape = list(x.shape)
 
+        x, latent_coords = self.patchifier.patchify(x)
+        pixel_coords = latent_to_pixel_coords(
+            latent_coords=latent_coords,
+            scale_factors=self.vae_scale_factors,
+            causal_fix=self.causal_temporal_positioning,
+        )
 
-        orig_shape = list(x.shape)
+        if keyframe_idxs is not None:
+            pixel_coords[:, :, -keyframe_idxs.shape[2]:] = keyframe_idxs
 
-        x = self.patchifier.patchify(x)
+        fractional_coords = pixel_coords.to(torch.float32)
+        fractional_coords[:, 0] = fractional_coords[:, 0] * (1.0 / frame_rate)
 
         x = self.patchify_proj(x)
         timestep = timestep * 1000.0
 
         if attention_mask is not None and not torch.is_floating_point(attention_mask):
             attention_mask = (attention_mask - 1).to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])) * torch.finfo(x.dtype).max
 
-        pe = precompute_freqs_cis(indices_grid, dim=self.inner_dim, out_dtype=x.dtype)
+        pe = precompute_freqs_cis(fractional_coords, dim=self.inner_dim, out_dtype=x.dtype)
 
         batch_size = x.shape[0]
         timestep, embedded_timestep = self.adaln_single(
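In the rewritten `forward`, `patchify` now also returns per-token latent coordinates; these become pixel coordinates (optionally overridden for keyframe tokens via `keyframe_idxs`), and the temporal axis is divided by `frame_rate` before going to `precompute_freqs_cis`, so the rotary embedding sees time rather than raw frame indices. A small sketch of that coordinate path with made-up values:

```python
import torch

frame_rate = 25
# Hypothetical coords for three tokens, laid out [batch, (t, y, x), num_tokens].
pixel_coords = torch.tensor([[[0, 1, 9],     # temporal pixel-frame index
                              [0, 0, 32],    # vertical pixel position
                              [0, 32, 0]]])  # horizontal pixel position
fractional_coords = pixel_coords.to(torch.float32)
fractional_coords[:, 0] = fractional_coords[:, 0] * (1.0 / frame_rate)
# temporal axis is now [0.0, 0.04, 0.36] (seconds); spatial axes stay in pixels
```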
@@ -519,8 +504,4 @@ def block_wrap(args):
             out_channels=orig_shape[1] // math.prod(self.patchifier.patch_size),
         )
 
-        if guiding_latent is not None:
-            x[:, :, 0] = (input_x[:, :, 0] - guiding_latent[:, :, 0]) / input_ts[:, :, 0]
-
-        # print("res", x)
         return x
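Unchanged here but load-bearing: a boolean `attention_mask` is converted into an additive bias just before `precompute_freqs_cis`. A self-contained illustration of that conversion with toy values:

```python
import torch

x_dtype = torch.float16
attention_mask = torch.tensor([[1, 1, 0, 0]])  # 1 = attend, 0 = masked
bias = (attention_mask - 1).to(x_dtype).reshape(
    (attention_mask.shape[0], 1, -1, attention_mask.shape[-1])
) * torch.finfo(x_dtype).max
# bias == [[[[0., 0., -65504., -65504.]]]]; kept keys add 0, masked keys add
# the lowest representable value, driving their softmax weight to zero.
```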