make style

a-r-r-o-w · a-r-r-o-w · commit 58a51aa5e306 · 2024-12-21T02:01:04.000+01:00
diff --git a/scripts/convert_ltx_to_diffusers.py b/scripts/convert_ltx_to_diffusers.py
@@ -1,6 +1,6 @@
 import argparse
-from typing import Any, Dict
 from pathlib import Path
+from typing import Any, Dict
 
 import torch
 from accelerate import init_empty_weights
@@ -133,7 +133,7 @@ def convert_transformer(
 
 def convert_vae(ckpt_path: str, config, dtype: torch.dtype):
     PREFIX_KEY = "vae."
-    
+
     original_state_dict = get_state_dict(load_file(ckpt_path))
     with init_empty_weights():
         vae = AutoencoderKLLTXVideo(**config)
@@ -155,54 +155,6 @@ def convert_vae(ckpt_path: str, config, dtype: torch.dtype):
     vae.load_state_dict(original_state_dict, strict=True, assign=True)
     return vae
 
-# OURS_VAE_CONFIG = {
-#     "_class_name": "CausalVideoAutoencoder",
-#     "dims": 3,
-#     "in_channels": 3,
-#     "out_channels": 3,
-#     "latent_channels": 128,
-#     "blocks": [
-#         ["res_x", 4],
-#         ["compress_all", 1],
-#         ["res_x_y", 1],
-#         ["res_x", 3],
-#         ["compress_all", 1],
-#         ["res_x_y", 1],
-#         ["res_x", 3],
-#         ["compress_all", 1],
-#         ["res_x", 3],
-#         ["res_x", 4],
-#     ],
-#     "scaling_factor": 1.0,
-#     "norm_layer": "pixel_norm",
-#     "patch_size": 4,
-#     "latent_log_var": "uniform",
-#     "use_quant_conv": False,
-#     "causal_decoder": False,
-# }
-
-# {
-#   "_class_name": "CausalVideoAutoencoder",
-#   "dims": 3, "in_channels": 3, "out_channels": 3, "latent_channels": 128,
-# "encoder_blocks": [["res_x", {"num_layers": 4}], ["compress_all", {}], ["res_x_y", 1], ["res_x", {"num_layers": 3}], ["compress_all", {}], ["res_x_y", 1], ["res_x", {"num_layers": 3}], ["compress_all", {}], ["res_x", {"num_layers": 3}], ["res_x", {"num_layers": 4}]],
-
-# previous decoder
-# mid: resx
-# resx
-# compress_all, resx
-# resxy, compress_all, resx
-# resxy, compress_all, resx
-
-# "decoder_blocks": [["res_x", {"num_layers": 5, "inject_noise": true}], ["compress_all", {"residual": true, "multiplier": 2}], ["res_x", {"num_layers": 6, "inject_noise": true}], ["compress_all", {"residual": true, "multiplier": 2}], ["res_x", {"num_layers": 7, "inject_noise": true}], ["compress_all", {"residual": true, "multiplier": 2}], ["res_x", {"num_layers": 8, "inject_noise": false}]],
-
-# current decoder
-# mid: resx
-# compress_all, resx
-# compress_all, resx
-# compress_all, resx
-
-# "scaling_factor": 1.0, "norm_layer": "pixel_norm", "patch_size": 4, "latent_log_var": "uniform", "use_quant_conv": false, "causal_decoder": false, "timestep_conditioning": true
-# }
 
 def get_vae_config(version: str) -> Dict[str, Any]:
     if version == "0.9.0":
@@ -272,7 +224,9 @@ def get_args():
     parser.add_argument("--save_pipeline", action="store_true")
     parser.add_argument("--output_path", type=str, required=True, help="Path where converted model should be saved")
     parser.add_argument("--dtype", default="fp32", help="Torch dtype to save the model in.")
-    parser.add_argument("--version", type=str, default="0.9.0", choices=["0.9.0", "0.9.1"], help="Version of the LTX model")
+    parser.add_argument(
+        "--version", type=str, default="0.9.0", choices=["0.9.0", "0.9.1"], help="Version of the LTX model"
+    )
     return parser.parse_args()
 
 
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py
@@ -137,13 +137,13 @@ def __init__(
             self.conv_shortcut = LTXCausalConv3d(
                 in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=1, is_causal=is_causal
             )
-        
+
         self.scale1 = None
         self.scale2 = None
         if inject_noise:
             self.scale1 = nn.Parameter(torch.zeros(in_channels, 1, 1))
             self.scale2 = nn.Parameter(torch.zeros(in_channels, 1, 1))
-        
+
         self.scale_shift_table = None
         if timestep_conditioning:
             self.scale_shift_table = nn.Parameter(torch.randn(4, in_channels) / in_channels**0.5)
@@ -166,7 +166,7 @@ def forward(self, inputs: torch.Tensor, temb: Optional[torch.Tensor] = None) ->
 
         if self.scale_shift_table is not None:
             hidden_states = hidden_states * (1 + scale_1) + shift_1
-        
+
         hidden_states = self.nonlinearity(hidden_states)
         hidden_states = self.dropout(hidden_states)
         hidden_states = self.conv2(hidden_states)
@@ -211,7 +211,6 @@ def __init__(
             is_causal=is_causal,
         )
 
-
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         batch_size, num_channels, num_frames, height, width = hidden_states.shape
 
@@ -495,7 +494,17 @@ def __init__(
 
         self.upsamplers = None
         if spatio_temporal_scale:
-            self.upsamplers = nn.ModuleList([LTXUpsampler3d(out_channels * upscale_factor, stride=(2, 2, 2), is_causal=is_causal, residual=upsample_residual, upscale_factor=upscale_factor)])
+            self.upsamplers = nn.ModuleList(
+                [
+                    LTXUpsampler3d(
+                        out_channels * upscale_factor,
+                        stride=(2, 2, 2),
+                        is_causal=is_causal,
+                        residual=upsample_residual,
+                        upscale_factor=upscale_factor,
+                    )
+                ]
+            )
 
         resnets = []
         for _ in range(num_layers):
@@ -508,7 +517,7 @@ def __init__(
                     non_linearity=resnet_act_fn,
                     is_causal=is_causal,
                     inject_noise=inject_noise,
-                    timestep_conditioning=timestep_conditioning
+                    timestep_conditioning=timestep_conditioning,
                 )
             )
         self.resnets = nn.ModuleList(resnets)
@@ -518,7 +527,7 @@ def __init__(
     def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
         if self.conv_in is not None:
             hidden_states = self.conv_in(hidden_states)
-        
+
         if self.time_embedder is not None:
             temb = self.time_embedder(
                 timestep=temb.flatten(),
@@ -744,7 +753,12 @@ def __init__(
         )
 
         self.mid_block = LTXMidBlock3d(
-            in_channels=output_channel, num_layers=layers_per_block[0], resnet_eps=resnet_norm_eps, is_causal=is_causal, inject_noise=inject_noise[0], timestep_conditioning=timestep_conditioning
+            in_channels=output_channel,
+            num_layers=layers_per_block[0],
+            resnet_eps=resnet_norm_eps,
+            is_causal=is_causal,
+            inject_noise=inject_noise[0],
+            timestep_conditioning=timestep_conditioning,
         )
 
         # up blocks