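Fix bugs: use output_frames consistently in the SD/SDXL video pipelines (decode, smooth, return); in the ExVideo SVD training script, fix the SVDCLIPImageProcessor import path, pass add_positional_conv to the UNet state-dict converter, and instantiate SVDVAEEncoder before calling .to()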

Artiprocher · Artiprocher · commit 9ed54c188e2c · 2024-07-26T17:51:03.000+08:00
diff --git a/diffsynth/pipelines/sd_video.py b/diffsynth/pipelines/sd_video.py
@@ -257,10 +257,10 @@ def __call__(
                 progress_bar_st.progress(progress_id / len(self.scheduler.timesteps))
         
         # Decode image
-        image = self.decode_video(latents, **tiler_kwargs)
+        output_frames = self.decode_video(latents, **tiler_kwargs)
 
         # Post-process
         if smoother is not None and (num_inference_steps in smoother_progress_ids or -1 in smoother_progress_ids):
             output_frames = smoother(output_frames, original_frames=input_frames)
 
-        return image
+        return output_frames
diff --git a/diffsynth/pipelines/sdxl_video.py b/diffsynth/pipelines/sdxl_video.py
@@ -214,10 +214,10 @@ def __call__(
                 progress_bar_st.progress(progress_id / len(self.scheduler.timesteps))
         
         # Decode image
-        image = self.decode_video(latents, **tiler_kwargs)
+        output_frames = self.decode_video(latents, **tiler_kwargs)
 
         # Post-process
         if smoother is not None and (num_inference_steps in smoother_progress_ids or -1 in smoother_progress_ids):
             output_frames = smoother(output_frames, original_frames=input_frames)
 
-        return image
+        return output_frames
diff --git a/examples/ExVideo/ExVideo_svd_train.py b/examples/ExVideo/ExVideo_svd_train.py
@@ -4,7 +4,7 @@
 from einops import rearrange, repeat
 import lightning as pl
 from diffsynth import ModelManager, SVDImageEncoder, SVDUNet, SVDVAEEncoder, ContinuousODEScheduler, load_state_dict
-from diffsynth.pipelines.stable_video_diffusion import SVDCLIPImageProcessor
+from diffsynth.pipelines.svd_video import SVDCLIPImageProcessor
 from diffsynth.models.svd_unet import TemporalAttentionBlock
 
 
@@ -131,14 +131,14 @@ def __init__(self, learning_rate=1e-5, svd_ckpt_path=None, add_positional_conv=1
         self.image_encoder.requires_grad_(False)
 
         self.unet = SVDUNet(add_positional_conv=add_positional_conv).to(dtype=torch.float16, device=self.device)
-        self.unet.load_state_dict(SVDUNet.state_dict_converter().from_civitai(state_dict), strict=False)
+        self.unet.load_state_dict(SVDUNet.state_dict_converter().from_civitai(state_dict, add_positional_conv=add_positional_conv), strict=False)
         self.unet.train()
         self.unet.requires_grad_(False)
         for block in self.unet.blocks:
             if isinstance(block, TemporalAttentionBlock):
                 block.requires_grad_(True)
 
-        self.vae_encoder = SVDVAEEncoder.to(dtype=torch.float16, device=self.device)
+        self.vae_encoder = SVDVAEEncoder().to(dtype=torch.float16, device=self.device)
         self.vae_encoder.load_state_dict(SVDVAEEncoder.state_dict_converter().from_civitai(state_dict))
         self.vae_encoder.eval()
         self.vae_encoder.requires_grad_(False)
diff --git a/examples/image_synthesis/sd3_text_to_image_textual_inversion.py b/examples/image_synthesis/sd3_text_to_image_textual_inversion.py