@@ -600,6 +600,166 @@ def verify_output(self, batch: ForwardBatch,
600600 return result
601601
602602
class CosmosDenoisingStage(PipelineStage):
    """
    Denoising stage for Cosmos models using FlowMatchEulerDiscreteScheduler.

    Implements the diffusers-compatible Cosmos denoising loop: EDM-style
    preconditioning (c_in / c_skip / c_out derived from the current sigma),
    velocity prediction, classifier-free guidance, and lazy transformer
    loading. Compatible with Hugging Face Cosmos models.
    """

    def __init__(self,
                 transformer,
                 scheduler,
                 pipeline=None) -> None:
        super().__init__()
        self.transformer = transformer
        self.scheduler = scheduler  # FlowMatchEulerDiscreteScheduler
        # Weak reference so this stage does not keep the owning pipeline alive.
        self.pipeline = weakref.ref(pipeline) if pipeline else None

    def forward(
        self,
        batch: ForwardBatch,
        fastvideo_args: FastVideoArgs,
    ) -> ForwardBatch:
        """
        Run the diffusers-style Cosmos denoising loop.

        Args:
            batch: The current batch information.
            fastvideo_args: The inference arguments.

        Returns:
            The batch with denoised latents.
        """
        pipeline = self.pipeline() if self.pipeline else None
        # Lazily load the transformer on first use and register it with the
        # owning pipeline so later stages can reuse the loaded module.
        if not fastvideo_args.model_loaded["transformer"]:
            loader = TransformerLoader()
            self.transformer = loader.load(
                fastvideo_args.model_paths["transformer"], fastvideo_args)
            if pipeline:
                pipeline.add_module("transformer", self.transformer)
            fastvideo_args.model_loaded["transformer"] = True

        # Setup precision and autocast settings.
        target_dtype = torch.bfloat16
        autocast_enabled = (target_dtype != torch.float32
                            ) and not fastvideo_args.disable_autocast

        # Get latents and sampling parameters.
        latents = batch.latents
        num_inference_steps = batch.num_inference_steps
        guidance_scale = batch.guidance_scale

        # Build the sigma schedule. float64 matches the diffusers reference
        # implementation; MPS has no float64 support, so fall back to float32.
        sigmas_dtype = torch.float32 if torch.backends.mps.is_available(
        ) else torch.float64
        sigmas = torch.linspace(0, 1, num_inference_steps, dtype=sigmas_dtype)
        self.scheduler.set_timesteps(device=latents.device, sigmas=sigmas)
        # Iterate the scheduler's own timesteps. The previous code iterated a
        # bare arange, which handed step *indices* (0..N-1) to
        # scheduler.step(); FlowMatchEulerDiscreteScheduler expects the actual
        # timestep values it produced in set_timesteps().
        timesteps = self.scheduler.timesteps

        # Initialize with maximum noise.
        # NOTE(review): this discards any latents prepared by an earlier stage
        # (including their seeding) — confirm this is intended rather than
        # scaling the incoming latents by sigma_max.
        latents = torch.randn_like(
            latents, dtype=torch.float32) * self.scheduler.config.sigma_max

        # Conditional frame handling (if needed) would be prepared here based
        # on batch.conditioning_latents or similar.

        # Sampling loop.
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # Allow external interruption (diffusers convention).
                if hasattr(self, 'interrupt') and self.interrupt:
                    continue

                # EDM-style preconditioning coefficients derived from the
                # current sigma (mirrors the diffusers Cosmos pipelines).
                current_sigma = self.scheduler.sigmas[i]
                current_t = current_sigma / (current_sigma + 1)
                c_in = 1 - current_t
                c_skip = 1 - current_t
                c_out = -current_t

                # Per-frame timestep tensor of shape (B, 1, T, 1, 1).
                timestep = current_t.view(1, 1, 1, 1, 1).expand(
                    latents.size(0), -1, latents.size(2), -1, -1)

                # Use the latents' own device rather than hard-coding "cuda"
                # so the stage also runs on CPU/MPS.
                with torch.autocast(device_type=latents.device.type,
                                    dtype=target_dtype,
                                    enabled=autocast_enabled):

                    # Conditional forward pass.
                    cond_latent = latents * c_in
                    # Conditional-frame handling would go here if needed:
                    # cond_latent = cond_indicator * conditioning_latents \
                    #     + (1 - cond_indicator) * cond_latent

                    cond_velocity = self.transformer(
                        hidden_states=cond_latent.to(target_dtype),
                        timestep=timestep.to(target_dtype),
                        encoder_hidden_states=batch.prompt_embeds[0].to(
                            target_dtype),
                        return_dict=False,
                    )[0]

                    # Apply preconditioning to obtain the denoised prediction.
                    cond_pred = (c_skip * latents +
                                 c_out * cond_velocity.float()).to(target_dtype)

                    # Classifier-free guidance.
                    if (batch.do_classifier_free_guidance
                            and batch.negative_prompt_embeds is not None):
                        uncond_latent = latents * c_in

                        uncond_velocity = self.transformer(
                            hidden_states=uncond_latent.to(target_dtype),
                            timestep=timestep.to(target_dtype),
                            encoder_hidden_states=batch.
                            negative_prompt_embeds[0].to(target_dtype),
                            return_dict=False,
                        )[0]

                        uncond_pred = (c_skip * latents +
                                       c_out * uncond_velocity.float()).to(
                                           target_dtype)

                        # Cosmos guidance form: cond + s * (cond - uncond).
                        velocity_pred = cond_pred + guidance_scale * (
                            cond_pred - uncond_pred)
                    else:
                        velocity_pred = cond_pred

                # Convert the denoised prediction back to noise space for the
                # flow-match Euler update.
                noise_pred = (latents - velocity_pred) / current_sigma

                # Standard scheduler step.
                latents = self.scheduler.step(noise_pred, t, latents,
                                              return_dict=False)[0]

                progress_bar.update()

        # Hand the final latents back to the batch.
        batch.latents = latents

        return batch

    def verify_input(self, batch: ForwardBatch,
                     fastvideo_args: FastVideoArgs) -> VerificationResult:
        """Verify Cosmos denoising stage inputs."""
        result = VerificationResult()
        result.add_check("latents", batch.latents,
                         [V.is_tensor, V.with_dims(5)])
        result.add_check("prompt_embeds", batch.prompt_embeds, V.list_not_empty)
        result.add_check("num_inference_steps", batch.num_inference_steps,
                         V.positive_int)
        result.add_check("guidance_scale", batch.guidance_scale,
                         V.positive_float)
        result.add_check("do_classifier_free_guidance",
                         batch.do_classifier_free_guidance, V.bool_value)
        # Negative embeds are only required when CFG is enabled.
        result.add_check(
            "negative_prompt_embeds", batch.negative_prompt_embeds, lambda x:
            not batch.do_classifier_free_guidance or V.list_not_empty(x))
        return result

    def verify_output(self, batch: ForwardBatch,
                      fastvideo_args: FastVideoArgs) -> VerificationResult:
        """Verify Cosmos denoising stage outputs."""
        result = VerificationResult()
        result.add_check("latents", batch.latents,
                         [V.is_tensor, V.with_dims(5)])
        return result
761+
762+
603763class DmdDenoisingStage (DenoisingStage ):
604764 """
605765 Denoising stage for DMD.