Commit 26c2f0f

Remove diffusers vae dependency
1 parent e45dab4

File tree

3 files changed: +55, -49 lines

3 files changed

+55
-49
lines changed

fastvideo/pipelines/basic/cosmos/cosmos_pipeline.py

15 additions, 36 deletions

@@ -12,8 +12,8 @@

 # TEMPORARY: Import diffusers VAE for comparison
 import sys
-sys.path.insert(0, '/workspace/diffusers/src')
-from diffusers.models.autoencoders.autoencoder_kl_wan import AutoencoderKLWan as DiffusersAutoencoderKLWan
+# sys.path.insert(0, '/mnt/fast-disks/nfs/hao_lab/kevin/diffusers/src')
+# from diffusers.models.autoencoders.autoencoder_kl_wan import AutoencoderKLWan as DiffusersAutoencoderKLWan

 from fastvideo.fastvideo_args import FastVideoArgs
 from fastvideo.logger import init_logger
@@ -27,6 +27,8 @@
 from fastvideo.models.schedulers.scheduling_flow_match_euler_discrete import (
     FlowMatchEulerDiscreteScheduler)

+from fastvideo.models.vaes.wanvae import AutoencoderKLWan
+
 logger = init_logger(__name__)


@@ -38,93 +40,70 @@ class Cosmos2VideoToWorldPipeline(ComposedPipelineBase):

     def initialize_pipeline(self, fastvideo_args: FastVideoArgs):

-        # TEMPORARY: Replace FastVideo VAE with diffusers VAE for testing
-        print("[TEMPORARY] Replacing FastVideo VAE with diffusers VAE...")
-        original_vae = self.modules["vae"]
-        print(f"[TEMPORARY] Original VAE type: {type(original_vae)}")
+        # original_vae = self.modules["vae"]
+
+        # diffusers_vae = DiffusersAutoencoderKLWan.from_pretrained(
+        #     self.model_path,
+        #     subfolder="vae",
+        #     torch_dtype=torch.bfloat16,
+        # )

-        # Load diffusers VAE with same config
-        diffusers_vae = DiffusersAutoencoderKLWan.from_pretrained(
-            self.model_path,
-            subfolder="vae",
-            torch_dtype=torch.bfloat16,
-        )
-        print(f"[TEMPORARY] Diffusers VAE type: {type(diffusers_vae)}")
+        # with open("/mnt/fast-disks/nfs/hao_lab/kevin/FastVideo/fastvideo_hidden_states.log", "a") as f:
+        #     f.write(f"[TEMPORARY] Diffusers VAE type: {type(diffusers_vae)}\n")

         # Replace the VAE module
-        self.modules["vae"] = diffusers_vae
-        print("[TEMPORARY] VAE replacement complete!")
+        # self.modules["vae"] = diffusers_vae
+

         self.modules["scheduler"] = FlowMatchEulerDiscreteScheduler(
             shift=fastvideo_args.pipeline_config.flow_shift,
             use_karras_sigmas=True)

-        # Configure Cosmos-specific scheduler parameters (matching diffusers)
-        # Source: /workspace/diffusers/src/diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py:209-219
         sigma_max = 80.0
         sigma_min = 0.002
         sigma_data = 1.0
         final_sigmas_type = "sigma_min"

         if self.modules["scheduler"] is not None:
-            # Update scheduler config and attributes directly
             scheduler = self.modules["scheduler"]
             scheduler.config.sigma_max = sigma_max
             scheduler.config.sigma_min = sigma_min
             scheduler.config.sigma_data = sigma_data
             scheduler.config.final_sigmas_type = final_sigmas_type
-            # Also set the direct attributes used by the scheduler
             scheduler.sigma_max = sigma_max
             scheduler.sigma_min = sigma_min
             scheduler.sigma_data = sigma_data

     def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
         """Set up pipeline stages with proper dependency injection."""

-        # Input validation - corresponds to diffusers check_inputs method
-        # Source: /workspace/diffusers/src/diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py:427-456
         self.add_stage(stage_name="input_validation_stage",
                        stage=InputValidationStage())

-        # Text encoding - corresponds to diffusers encode_prompt method
-        # Source: /workspace/diffusers/src/diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py:265-346
-        # Also uses _get_t5_prompt_embeds method: lines 222-262
         self.add_stage(stage_name="prompt_encoding_stage",
                        stage=TextEncodingStage(
                            text_encoders=[self.get_module("text_encoder")],
                            tokenizers=[self.get_module("tokenizer")],
                        ))

-        # Conditioning preparation - part of main __call__ method setup
-        # Source: /workspace/diffusers/src/diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py:607-628
         self.add_stage(stage_name="conditioning_stage",
                        stage=ConditioningStage())

-        # Timestep preparation - corresponds to timestep setup in __call__
-        # Source: /workspace/diffusers/src/diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py:630-637
-        # Uses retrieve_timesteps function: lines 81-137
         self.add_stage(stage_name="timestep_preparation_stage",
                        stage=TimestepPreparationStage(
                            scheduler=self.get_module("scheduler")))

-        # Latent preparation - corresponds to prepare_latents method
-        # Source: /workspace/diffusers/src/diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py:348-424
-        # Also includes video preprocessing: lines 642-661
         self.add_stage(stage_name="latent_preparation_stage",
                        stage=CosmosLatentPreparationStage(
                            scheduler=self.get_module("scheduler"),
                            transformer=self.get_module("transformer"),
                            vae=self.get_module("vae")))

-        # Denoising loop - corresponds to main denoising loop in __call__
-        # Source: /workspace/diffusers/src/diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py:673-752
         self.add_stage(stage_name="denoising_stage",
                        stage=CosmosDenoisingStage(
                            transformer=self.get_module("transformer"),
                            scheduler=self.get_module("scheduler")))

-        # VAE decoding - corresponds to final decoding section in __call__
-        # Source: /workspace/diffusers/src/diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py:755-784
         self.add_stage(stage_name="decoding_stage",
                        stage=DecodingStage(vae=self.get_module("vae")))
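The scheduler block above pins Cosmos-style EDM parameters (sigma_max = 80.0, sigma_min = 0.002, sigma_data = 1.0) on a FlowMatchEulerDiscreteScheduler with use_karras_sigmas=True. For reference, a standalone sketch of a Karras-spaced sigma schedule over those bounds is shown below; the rho exponent and step count are illustrative assumptions, not values taken from this commit, and FastVideo's scheduler may space sigmas differently.

import torch

def karras_sigmas(num_steps: int = 35,
                  sigma_min: float = 0.002,
                  sigma_max: float = 80.0,
                  rho: float = 7.0) -> torch.Tensor:
    # Karras et al. (2022) spacing from sigma_max down to sigma_min.
    # Sketch only; not FastVideo's implementation.
    ramp = torch.linspace(0, 1, num_steps)
    min_inv_rho = sigma_min ** (1 / rho)
    max_inv_rho = sigma_max ** (1 / rho)
    return (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho

sigmas = karras_sigmas()
print(sigmas[0].item(), sigmas[-1].item())  # ~80.0 down to ~0.002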

fastvideo/pipelines/stages/decoding.py

37 additions, 8 deletions

@@ -83,7 +83,7 @@ def forward(
             raise ValueError("Latents must be provided")

         print(f"[FASTVIDEO VAE DEBUG] Before scaling/shifting - latents sum: {latents.float().sum().item():.6f}, shape: {latents.shape}, dtype: {latents.dtype}")
-        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+        with open("/mnt/fast-disks/nfs/hao_lab/kevin/FastVideo/fastvideo_hidden_states.log", "a") as f:
             f.write(f"[FASTVIDEO VAE DEBUG] Before scaling/shifting - latents sum: {latents.float().sum().item():.6f}, shape: {latents.shape}, dtype: {latents.dtype}\n")

         # Skip decoding if output type is latent
@@ -107,6 +107,14 @@ def forward(
             if hasattr(scheduler, 'config') and hasattr(scheduler.config, 'sigma_data'):
                 sigma_data = scheduler.config.sigma_data

+            print(f"[FASTVIDEO VAE DEBUG] sigma_data = {sigma_data}")
+            print(f"[FASTVIDEO VAE DEBUG] latents_mean config = {self.vae.config.latents_mean}")
+            print(f"[FASTVIDEO VAE DEBUG] latents_std config = {self.vae.config.latents_std}")
+            with open("/mnt/fast-disks/nfs/hao_lab/kevin/FastVideo/fastvideo_hidden_states.log", "a") as f:
+                f.write(f"[FASTVIDEO VAE DEBUG] sigma_data = {sigma_data}\n")
+                f.write(f"[FASTVIDEO VAE DEBUG] latents_mean config = {self.vae.config.latents_mean}\n")
+                f.write(f"[FASTVIDEO VAE DEBUG] latents_std config = {self.vae.config.latents_std}\n")
+
             latents_mean = (
                 torch.tensor(self.vae.config.latents_mean)
                 .view(1, self.vae.config.z_dim, 1, 1, 1)
@@ -117,7 +125,29 @@ def forward(
                 .view(1, self.vae.config.z_dim, 1, 1, 1)
                 .to(latents.device, latents.dtype)
             )
-            latents = latents * latents_std / sigma_data + latents_mean
+            print(f"[FASTVIDEO VAE DEBUG] latents dtype = {latents.dtype}, latents_mean dtype = {latents_mean.dtype}, latents_std dtype = {latents_std.dtype}")
+            print(f"[FASTVIDEO VAE DEBUG] latents_mean tensor sum = {latents_mean.sum().item():.6f}")
+            print(f"[FASTVIDEO VAE DEBUG] latents_std tensor sum = {latents_std.sum().item():.6f}")
+            with open("/mnt/fast-disks/nfs/hao_lab/kevin/FastVideo/fastvideo_hidden_states.log", "a") as f:
+                f.write(f"[FASTVIDEO VAE DEBUG] latents dtype = {latents.dtype}, latents_mean dtype = {latents_mean.dtype}, latents_std dtype = {latents_std.dtype}\n")
+                f.write(f"[FASTVIDEO VAE DEBUG] latents_mean tensor sum = {latents_mean.sum().item():.6f}\n")
+                f.write(f"[FASTVIDEO VAE DEBUG] latents_std tensor sum = {latents_std.sum().item():.6f}\n")
+
+            print(f"[FASTVIDEO VAE DEBUG] latents shape = {latents.shape}, latents_mean shape = {latents_mean.shape}, latents_std shape = {latents_std.shape}")
+            with open("/mnt/fast-disks/nfs/hao_lab/kevin/FastVideo/fastvideo_hidden_states.log", "a") as f:
+                f.write(f"[FASTVIDEO VAE DEBUG] latents shape = {latents.shape}, latents_mean shape = {latents_mean.shape}, latents_std shape = {latents_std.shape}\n")
+
+            latents_after_mul = latents * latents_std / sigma_data
+            print(f"[FASTVIDEO VAE DEBUG] After multiply (latents * latents_std / sigma_data) sum = {latents_after_mul.float().sum().item():.6f}")
+            print(f"[FASTVIDEO VAE DEBUG] latents_after_mul shape = {latents_after_mul.shape}")
+            with open("/mnt/fast-disks/nfs/hao_lab/kevin/FastVideo/fastvideo_hidden_states.log", "a") as f:
+                f.write(f"[FASTVIDEO VAE DEBUG] After multiply sum = {latents_after_mul.float().sum().item():.6f}\n")
+                f.write(f"[FASTVIDEO VAE DEBUG] latents_after_mul shape = {latents_after_mul.shape}\n")
+
+            latents = latents_after_mul + latents_mean
+            print(f"[FASTVIDEO VAE DEBUG] After adding latents_mean, latents shape = {latents.shape}")
+            with open("/mnt/fast-disks/nfs/hao_lab/kevin/FastVideo/fastvideo_hidden_states.log", "a") as f:
+                f.write(f"[FASTVIDEO VAE DEBUG] After adding latents_mean shape = {latents.shape}\n")
         # Fallback to scaling_factor for other VAE types
         elif hasattr(self.vae, 'scaling_factor'):
             if isinstance(self.vae.scaling_factor, torch.Tensor):
@@ -126,11 +156,10 @@ def forward(
             else:
                 latents = latents / self.vae.scaling_factor
         elif hasattr(self.vae, 'config') and hasattr(self.vae.config, 'scaling_factor'):
-            # Fallback to config scaling factor for other diffusers VAEs
             latents = latents / self.vae.config.scaling_factor

-        # Apply shifting if needed (for other VAE types)
-        if (hasattr(self.vae, "shift_factor")
+        # NOTE: Skip this if we already applied latents_mean (for Cosmos VAE)
+        elif (hasattr(self.vae, "shift_factor")
                 and self.vae.shift_factor is not None):
             if isinstance(self.vae.shift_factor, torch.Tensor):
                 latents += self.vae.shift_factor.to(latents.device,
@@ -139,7 +168,7 @@ def forward(
                 latents += self.vae.shift_factor

         print(f"[FASTVIDEO VAE DEBUG] After scaling/shifting - latents sum: {latents.float().sum().item():.6f}")
-        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+        with open("/mnt/fast-disks/nfs/hao_lab/kevin/FastVideo/fastvideo_hidden_states.log", "a") as f:
             f.write(f"[FASTVIDEO VAE DEBUG] After scaling/shifting - latents sum: {latents.float().sum().item():.6f}\n")

         # Decode latents
@@ -163,14 +192,14 @@ def forward(
             image = decode_output

         print(f"[FASTVIDEO VAE DEBUG] After decode - image sum: {image.float().sum().item():.6f}, shape: {image.shape}")
-        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+        with open("/mnt/fast-disks/nfs/hao_lab/kevin/FastVideo/fastvideo_hidden_states.log", "a") as f:
             f.write(f"[FASTVIDEO VAE DEBUG] After decode - image sum: {image.float().sum().item():.6f}, shape: {image.shape}\n")

         # Normalize image to [0, 1] range
         image = (image / 2 + 0.5).clamp(0, 1)

         print(f"[FASTVIDEO VAE DEBUG] After normalization - image sum: {image.float().sum().item():.6f}")
-        with open("/workspace/FastVideo/fastvideo_hidden_states.log", "a") as f:
+        with open("/mnt/fast-disks/nfs/hao_lab/kevin/FastVideo/fastvideo_hidden_states.log", "a") as f:
             f.write(f"[FASTVIDEO VAE DEBUG] After normalization - image sum: {image.float().sum().item():.6f}\n")

         # Convert to CPU float32 for compatibility
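The added logging brackets the per-channel un-normalization that the Wan/Cosmos branch applies before decoding: latents * latents_std / sigma_data + latents_mean, with mean and std reshaped to (1, z_dim, 1, 1, 1). A minimal, self-contained sketch of just that step follows; the z_dim, tensor shapes, and channel statistics are made up for illustration and would normally come from vae.config and the scheduler config.

import torch

# Hypothetical shapes and statistics for illustration only.
z_dim, frames, height, width = 16, 4, 8, 8
latents = torch.randn(1, z_dim, frames, height, width, dtype=torch.bfloat16)
latents_mean = torch.randn(z_dim).view(1, z_dim, 1, 1, 1).to(latents.dtype)
latents_std = (torch.rand(z_dim) + 0.5).view(1, z_dim, 1, 1, 1).to(latents.dtype)
sigma_data = 1.0  # set on the scheduler config for Cosmos in this commit

# Per-channel un-normalization before VAE decode, mirroring the diff.
latents = latents * latents_std / sigma_data + latents_mean
print(latents.shape)  # torch.Size([1, 16, 4, 8, 8])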

test_fastvideo_pipeline.py

3 additions, 5 deletions

@@ -6,8 +6,6 @@
 import os
 import sys

-# Add FastVideo to path
-sys.path.insert(0, "/workspace/FastVideo")

 from fastvideo.entrypoints.video_generator import VideoGenerator

@@ -18,10 +16,10 @@ def generate_video():
     # Configuration
     #input_image_path = "/workspace/FastVideo/tennis.jpg"
     #prompt = "A tennis ball bouncing on a racquet, the ball moves in a smooth arc as it hits the strings and rebounds with natural physics. The racquet strings vibrate slightly from the impact, and the ball continues its trajectory with realistic motion."
-    input_image_path = "/workspace/FastVideo/yellow-scrubber.png"
+    input_image_path = "/mnt/fast-disks/nfs/hao_lab/kevin/FastVideo/yellow-scrubber.png"
     prompt = "A close-up shot captures a vibrant yellow scrubber vigorously working on a grimy plate, its bristles moving in circular motions to lift stubborn grease and food residue. The dish, once covered in remnants of a hearty meal, gradually reveals its original glossy surface. Suds form and bubble around the scrubber, creating a satisfying visual of cleanliness in progress. The sound of scrubbing fills the air, accompanied by the gentle clinking of the dish against the sink. As the scrubber continues its task, the dish transforms, gleaming under the bright kitchen lights, symbolizing the triumph of cleanliness over mess."
     negative_prompt = "The video captures a series of frames showing ugly scenes, static with no motion, motion blur, over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. Overall, the video is of poor quality."
-    output_path = "/workspace/FastVideo/cosmos2_fastvideo_output.mp4"
+    output_path = "/mnt/fast-disks/nfs/hao_lab/kevin/FastVideo/cosmos2_fastvideo_output.mp4"

     # Check if input image exists
     if not os.path.exists(input_image_path):
@@ -51,7 +49,7 @@ def generate_video():
         guidance_scale=7.0,
         seed=1,
         save_video=True,
-        output_path=output_path
+        output_path=output_path,
     )

     if result:
