comments

sayakpaul · sayakpaul · commit 790735939334 · 2024-12-18T14:30:14.000+05:30
diff --git a/src/diffusers/pipelines/ltx/pipeline_ltx_video2video.py b/src/diffusers/pipelines/ltx/pipeline_ltx_video2video.py
@@ -463,6 +463,7 @@ def prepare_latents(
         width = width // self.vae_spatial_compression_ratio
 
         # TODO: should video_processor take care of it? Because for Cog, we get a 5D tensor here.
+        # `video` dimension order is (num_frames, num_channels, height, width)
         if video.ndim == 4:
             video = video.unsqueeze(0)
 
@@ -490,7 +491,7 @@ def prepare_latents(
                 retrieve_latents(self.vae.encode(video[i].unsqueeze(0).permute(0, 2, 1, 3, 4)), generator[i])
                 for i in range(batch_size)
             ]
-        else:
+        else: # `permute()` because we want `batch_size, num_channels, num_frames, height, width`
             init_latents = [
                 retrieve_latents(self.vae.encode(vid.unsqueeze(0).permute(0, 2, 1, 3, 4)), generator) for vid in video
             ]

Original file line number	Diff line number	Diff line change
`@@ -463,6 +463,7 @@ def prepare_latents(`
`463`	`463`	`width = width // self.vae_spatial_compression_ratio`
`464`	`464`
`465`	`465`	`# TODO: should video_processor take care of it? Because for Cog, we get a 5D tensor here.`
	`466`	+ # `video` dimension order is (num_frames, num_channels, height, width)
`466`	`467`	`if video.ndim == 4:`
`467`	`468`	`video = video.unsqueeze(0)`
`468`	`469`
`@@ -490,7 +491,7 @@ def prepare_latents(`
`490`	`491`	`retrieve_latents(self.vae.encode(video[i].unsqueeze(0).permute(0, 2, 1, 3, 4)), generator[i])`
`491`	`492`	`for i in range(batch_size)`
`492`	`493`	`]`
`493`		`- else:`
	`494`	+ else: # `permute()` because we want `batch_size, num_channels, num_frames, height, width`
`494`	`495`	`init_latents = [`
`495`	`496`	`retrieve_latents(self.vae.encode(vid.unsqueeze(0).permute(0, 2, 1, 3, 4)), generator) for vid in video`
`496`	`497`	`]`