hao-ai-lab
diff --git a/fastvideo/configs/models/vaes/cosmosvae.py b/fastvideo/configs/models/vaes/cosmosvae.py
Lines changed: 3 additions & 4 deletions
diff --git a/fastvideo/configs/pipelines/cosmos.py b/fastvideo/configs/pipelines/cosmos.py
Lines changed: 10 additions & 64 deletions
diff --git a/fastvideo/configs/sample/cosmos.py b/fastvideo/configs/sample/cosmos.py
Lines changed: 12 additions & 27 deletions
diff --git a/fastvideo/layers/layernorm.py b/fastvideo/layers/layernorm.py
Lines changed: 1 addition & 2 deletions
diff --git a/fastvideo/layers/rotary_embedding.py b/fastvideo/layers/rotary_embedding.py
Lines changed: 1 addition & 1 deletion
@@ -8,8 +8,6 @@
 
 @dataclass
 class CosmosVAEArchConfig(VAEArchConfig):
-    _class_name: str = "AutoencoderKLWan"
-    _diffusers_version: str = "0.34.0.dev0"
     _name_or_path: str = ""
     base_dim: int = 96
     z_dim: int = 16
@@ -76,7 +74,8 @@ def __post_init__(self):
 
 @dataclass
 class CosmosVAEConfig(VAEConfig):
-    arch_config: CosmosVAEArchConfig = field(default_factory=CosmosVAEArchConfig)
+    arch_config: CosmosVAEArchConfig = field(
+        default_factory=CosmosVAEArchConfig)
     use_feature_cache: bool = True
 
     use_tiling: bool = False
@@ -85,4 +84,4 @@ class CosmosVAEConfig(VAEConfig):
 
     def __post_init__(self):
         self.blend_num_frames = (self.tile_sample_min_num_frames -
-                                 self.tile_sample_stride_num_frames) * 2
+                                 self.tile_sample_stride_num_frames) * 2
@@ -23,90 +23,37 @@ def t5_large_postprocess_text(outputs: BaseEncoderOutput) -> torch.Tensor:
     if hidden_state is None:
         raise ValueError("T5 Large outputs missing last_hidden_state")
 
-    # Check for NaN values and provide debugging info
     nan_count = torch.isnan(hidden_state).sum()
     if nan_count > 0:
-        print(f"WARNING: Found {nan_count} NaN values in T5 Large hidden states")
-        print(f"Hidden state shape: {hidden_state.shape}")
-        print(f"Hidden state dtype: {hidden_state.dtype}")
-        print(f"Hidden state device: {hidden_state.device}")
-        # Replace NaN values with zeros to avoid pipeline failure
         hidden_state = hidden_state.masked_fill(torch.isnan(hidden_state), 0.0)
 
-    # Return raw last_hidden_state (no truncation/padding)
     return hidden_state
 
 
-@dataclass
-class CosmosVideoConfigFixed(CosmosVideoConfig):
-    """Fixed Cosmos Video Config that matches original Cosmos2 Video2World configuration."""
-    
-    def update_model_arch(self, config: dict) -> None:
-        """Update model architecture config with HF config, but fix parameters to match original Cosmos2."""
-        # First, apply the standard update
-        super().update_model_arch(config)
-        
-        # CRITICAL FIXES to match original Cosmos2 Video2World configuration:
-        
-        # 1. Fix input channels: should be 16 (VAE) + 1 (condition mask) = 17
-        setattr(self.arch_config, 'in_channels', 17)
-        
-        # 2. Fix output channels: should be 16 (VAE latent dimension)
-        setattr(self.arch_config, 'out_channels', 16)
-        
-        # 3. Fix model architecture to match Cosmos2 2B model
-        setattr(self.arch_config, 'num_attention_heads', 16)
-        setattr(self.arch_config, 'attention_head_dim', 128)  # Fixed: should be 128, not 64
-        setattr(self.arch_config, 'num_layers', 28)
-        setattr(self.arch_config, 'hidden_size', 2048)  # 16 * 128 = 2048
-        
-        # 4. Fix patch size to match original
-        setattr(self.arch_config, 'patch_size', (1, 2, 2))
-        
-        # 5. Fix max size to match original
-        setattr(self.arch_config, 'max_size', (128, 240, 240))
-        
-        # 6. Fix text embedding dimension
-        setattr(self.arch_config, 'text_embed_dim', 1024)
-        
-        # 7. Fix adaln lora dimension
-        setattr(self.arch_config, 'adaln_lora_dim', 256)
-        
-        # 8. Fix rope scale to match original
-        setattr(self.arch_config, 'rope_scale', (1.0, 3.0, 3.0))
-        
-        # 9. Enable concat padding mask
-        setattr(self.arch_config, 'concat_padding_mask', True)
-        
-        # 10. Set num_channels_latents to 16 (VAE output dim)
-        setattr(self.arch_config, 'num_channels_latents', 16)
-
-
 @dataclass
 class CosmosConfig(PipelineConfig):
-    """Configuration for Cosmos2 Video2World pipeline matching original implementation."""
+    """Configuration for Cosmos2 Video2World pipeline matching diffusers."""
 
-    # DiT configuration matching Cosmos2 2B model
-    dit_config: DiTConfig = field(default_factory=CosmosVideoConfigFixed)
+
+    dit_config: DiTConfig = field(default_factory=CosmosVideoConfig)
 
-    # VAE configuration matching Cosmos2
+
     vae_config: VAEConfig = field(default_factory=CosmosVAEConfig)
 
-    # Text encoding configuration
+
     text_encoder_configs: tuple[EncoderConfig, ...] = field(
         default_factory=lambda: (T5LargeConfig(), ))
     postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], torch.Tensor],
                                   ...] = field(default_factory=lambda:
                                                (t5_large_postprocess_text, ))
 
-    # Precision for each component
+
     dit_precision: str = "bf16"
     vae_precision: str = "fp16"
     text_encoder_precisions: tuple[str, ...] = field(
         default_factory=lambda: ("bf16",))
 
-    # Cosmos2 Video2World specific parameters
-    conditioning_strategy: str = "frame_replace"  # Match original ConditioningStrategy.FRAME_REPLACE
+    conditioning_strategy: str = "frame_replace"
     min_num_conditional_frames: int = 1
     max_num_conditional_frames: int = 2
     sigma_conditional: float = 0.0001
@@ -115,13 +62,12 @@ class CosmosConfig(PipelineConfig):
     state_t: int = 24
     text_encoder_class: str = "T5"
 
-    # Denoising parameters
+
     embedded_cfg_scale: int = 6
-    flow_shift: float = 1.0  # Changed to 1.0 to match diffusers (no shift transformation)
+    flow_shift: float = 1.0  # no shift transformation; matches diffusers
 
     def __post_init__(self):
         self.vae_config.load_encoder = True
         self.vae_config.load_decoder = True
 
-        # Store the VAE's latent dimension to use later
-        self._vae_latent_dim = 16  # From CosmosVAEArchConfig.z_dim
+        self._vae_latent_dim = 16  # from CosmosVAEArchConfig.z_dim
@@ -1,33 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 
-from fastvideo.configs.sample.base import CacheParams
+from fastvideo.configs.sample.base import SamplingParam
 
 
 @dataclass
-class CosmosTeaCacheParams(CacheParams):
-    cache_type: str = "teacache"
-    teacache_thresh: float = 0.0
-    use_ret_steps: bool = True
-    ret_steps_coeffs: list[float] = field(default_factory=list)
-    non_ret_steps_coeffs: list[float] = field(default_factory=list)
+class Cosmos_Predict2_2B_Video2World_SamplingParam(SamplingParam):
+    # Video parameters
+    height: int = 704
+    width: int = 1280
+    num_frames: int = 93
+    fps: int = 16
 
-    @property
-    def coefficients(self) -> list[float]:
-        if self.use_ret_steps:
-            return self.ret_steps_coeffs
-        else:
-            return self.non_ret_steps_coeffs
-
-    @property
-    def ret_steps(self) -> int:
-        if self.use_ret_steps:
-            return 5 * 2
-        else:
-            return 1 * 2
-
-    def get_cutoff_steps(self, num_inference_steps: int) -> int:
-        if self.use_ret_steps:
-            return num_inference_steps * 2
-        else:
-            return num_inference_steps * 2 - 2
+    # Denoising stage
+    guidance_scale: float = 7.0
+    negative_prompt: str = "The video captures a series of frames showing ugly scenes, static with no motion, motion blur, over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. Overall, the video is of poor quality."
+    num_inference_steps: int = 35
@@ -39,14 +39,13 @@ def __init__(
         if self.has_weight:
             self.weight = nn.Parameter(self.weight)
 
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+    def forward_diffusers(self, hidden_states: torch.Tensor) -> torch.Tensor:
         """Forward method that matches Diffusers RMSNorm implementation exactly."""
         input_dtype = hidden_states.dtype
         variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
         hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
 
         if self.has_weight and self.weight is not None:
-            # convert into half-precision if necessary (match Diffusers exactly)
             if self.weight.dtype in [torch.float16, torch.bfloat16]:
                 hidden_states = hidden_states.to(self.weight.dtype)
             hidden_states = hidden_states * self.weight
 
@@ -64,7 +64,7 @@ def apply_rotary_emb(
     """
     if use_real:
         cos, sin = freqs_cis  # [S, D]
-        # Match Diffusers exact broadcasting (sequence_dim=2 case)
+        # Match Diffusers broadcasting (sequence_dim=2 case)
         cos = cos[None, None, :, :]
         sin = sin[None, None, :, :]
         cos, sin = cos.to(x.device), sin.to(x.device)