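Disable framewise encoding/decoding in AutoencoderKLLTX (the framewise branches now raise NotImplementedError); also remove leftover debug prints from the LTX pipeline and an `extra_repr` method in normalization.py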

a-r-r-o-w · a-r-r-o-w · commit c2018808ac16 · 2024-11-29T04:58:46.000+01:00
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py
@@ -899,12 +899,12 @@ def _encode(self, x: torch.Tensor) -> torch.Tensor:
             return self.tiled_encode(x)
 
         if self.use_framewise_encoding:
-            enc = []
-            for i in range(0, num_frames, self.num_sample_frames_batch_size):
-                x_intermediate = x[:, :, i : i + self.num_sample_frames_batch_size]
-                x_intermediate = self.encoder(x_intermediate)
-                enc.append(x_intermediate)
-            enc = torch.cat(enc, dim=2)
+            # TODO(aryan): requires investigation
+            raise NotImplementedError(
+                "Frame-wise encoding has not been implemented for AutoencoderKLLTX, at the moment, due to "
+                "quality issues caused by splitting inference across frame dimension. If you believe this "
+                "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls."
+            )
         else:
             enc = self.encoder(x)
 
@@ -946,12 +946,12 @@ def _decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOut
             return self.tiled_decode(z, return_dict=return_dict)
 
         if self.use_framewise_decoding:
-            dec = []
-            for i in range(0, num_frames, self.num_latent_frames_batch_size):
-                z_intermediate = z[:, :, i : i + self.num_latent_frames_batch_size]
-                z_intermediate = self.decoder(z_intermediate)
-                dec.append(z_intermediate)
-            dec = torch.cat(dec, dim=2)
+            # TODO(aryan): requires investigation
+            raise NotImplementedError(
+                "Frame-wise decoding has not been implemented for AutoencoderKLLTX, at the moment, due to "
+                "quality issues caused by splitting inference across frame dimension. If you believe this "
+                "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls."
+            )
         else:
             dec = self.decoder(z)
 
@@ -1031,17 +1031,12 @@ def tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
             row = []
             for j in range(0, width, self.tile_sample_stride_width):
                 if self.use_framewise_encoding:
-                    time = []
-                    for k in range(0, num_frames, self.num_sample_frames_batch_size):
-                        tile = x[
-                            :,
-                            :,
-                            k : k + self.num_sample_frames_batch_size,
-                            i : i + self.tile_sample_min_height,
-                            j : j + self.tile_sample_min_width,
-                        ]
-                        tile = self.encoder(tile)
-                        time.append(tile)
+                    # TODO(aryan): requires investigation
+                    raise NotImplementedError(
+                        "Frame-wise encoding has not been implemented for AutoencoderKLLTX, at the moment, due to "
+                        "quality issues caused by splitting inference across frame dimension. If you believe this "
+                        "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls."
+                    )
                 else:
                     time = self.encoder(
                         x[:, :, :, i : i + self.tile_sample_min_height, j : j + self.tile_sample_min_width]
@@ -1100,18 +1095,12 @@ def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[Decod
             row = []
             for j in range(0, width, tile_latent_stride_width):
                 if self.use_framewise_decoding:
-                    time = []
-                    for k in range(0, num_frames, self.num_latent_frames_batch_size):
-                        tile = z[
-                            :,
-                            :,
-                            k : k + self.num_latent_frames_batch_size,
-                            i : i + tile_latent_min_height,
-                            j : j + tile_latent_min_width,
-                        ]
-                        tile = self.decoder(tile)
-                        time.append(tile)
-                    time = torch.cat(time, dim=2)
+                    # TODO(aryan): requires investigation
+                    raise NotImplementedError(
+                        "Frame-wise decoding has not been implemented for AutoencoderKLLTX, at the moment, due to "
+                        "quality issues caused by splitting inference across frame dimension. If you believe this "
+                        "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls."
+                    )
                 else:
                     time = self.decoder(z[:, :, :, i : i + tile_latent_min_height, j : j + tile_latent_min_width])
 
diff --git a/src/diffusers/models/normalization.py b/src/diffusers/models/normalization.py
@@ -543,9 +543,6 @@ def forward(self, hidden_states):
 
         return hidden_states
 
-    def extra_repr(self) -> str:
-        return f"features={self.dim}, eps={self.eps}, elementwise_affine={self.elementwise_affine}"
-
 
 class GlobalResponseNorm(nn.Module):
     # Taken from https://github.com/facebookresearch/ConvNeXt-V2/blob/3608f67cc1dae164790c5d0aead7bf2d73d9719b/models/utils.py#L105
diff --git a/src/diffusers/pipelines/ltx/pipeline_ltx.py b/src/diffusers/pipelines/ltx/pipeline_ltx.py
@@ -632,8 +632,6 @@ def __call__(
         )
         num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
         self._num_timesteps = len(timesteps)
-        print(self.scheduler.sigmas)
-        print(len(self.scheduler.sigmas))
 
         # 6. Prepare micro-conditions
         rope_interpolation_scale = (

Original file line number	Diff line number	Diff line change
`@@ -632,8 +632,6 @@ def __call__(`
`632`	`632`	`)`
`633`	`633`	`num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)`
`634`	`634`	`self._num_timesteps = len(timesteps)`
`635`		`- print(self.scheduler.sigmas)`
`636`		`- print(len(self.scheduler.sigmas))`
`637`	`635`
`638`	`636`	`# 6. Prepare micro-conditions`
`639`	`637`	`rope_interpolation_scale = (`