
Commit 283c473

Remove redundant decoding fallbacks
1 parent a4d2086 commit 283c473

9 files changed: +219, -156 lines


fastvideo/image_processor.py

Lines changed: 193 additions & 0 deletions
@@ -0,0 +1,193 @@
# SPDX-License-Identifier: Apache-2.0
"""
Minimal image processing utilities for FastVideo.
This module provides lightweight image preprocessing without external dependencies beyond PyTorch/NumPy/PIL.
"""

from typing import Optional, Union

import numpy as np
import PIL.Image
import torch


class ImageProcessor:
    """
    Minimal image processor for video frame preprocessing.

    This is a lightweight alternative to diffusers.VideoProcessor that handles:
    - PIL image to tensor conversion
    - Resizing to specified dimensions
    - Normalization to [-1, 1] range

    Args:
        vae_scale_factor: The VAE scale factor used to ensure dimensions are multiples of this value.
    """

    def __init__(self, vae_scale_factor: int = 8) -> None:
        self.vae_scale_factor = vae_scale_factor

    def preprocess(
        self,
        image: Union[PIL.Image.Image, np.ndarray, torch.Tensor],
        height: Optional[int] = None,
        width: Optional[int] = None,
    ) -> torch.Tensor:
        """
        Preprocess an image to a normalized torch tensor.

        Args:
            image: Input image (PIL Image, NumPy array, or torch tensor)
            height: Target height. If None, uses image's original height.
            width: Target width. If None, uses image's original width.

        Returns:
            torch.Tensor: Normalized tensor of shape (1, 3, height, width) or (1, 1, height, width) for grayscale,
                with values in range [-1, 1].
        """
        # Handle different input types
        if isinstance(image, PIL.Image.Image):
            return self._preprocess_pil(image, height, width)
        elif isinstance(image, np.ndarray):
            return self._preprocess_numpy(image, height, width)
        elif isinstance(image, torch.Tensor):
            return self._preprocess_tensor(image, height, width)
        else:
            raise ValueError(
                f"Unsupported image type: {type(image)}. "
                "Supported types: PIL.Image.Image, np.ndarray, torch.Tensor"
            )

    def _preprocess_pil(
        self,
        image: PIL.Image.Image,
        height: Optional[int] = None,
        width: Optional[int] = None,
    ) -> torch.Tensor:
        """Preprocess a PIL image."""
        if height is None:
            height = image.height
        if width is None:
            width = image.width

        height = height - (height % self.vae_scale_factor)
        width = width - (width % self.vae_scale_factor)

        image = image.resize((width, height), resample=PIL.Image.Resampling.LANCZOS)

        image_np = np.array(image, dtype=np.float32) / 255.0

        if image_np.ndim == 2:  # Grayscale
            image_np = np.expand_dims(image_np, axis=-1)

        return self._normalize_to_tensor(image_np)

    def _preprocess_numpy(
        self,
        image: np.ndarray,
        height: Optional[int] = None,
        width: Optional[int] = None,
    ) -> torch.Tensor:
        """Preprocess a numpy array."""
        # Determine target dimensions if not provided
        if image.ndim == 3:
            img_height, img_width = image.shape[:2]
        elif image.ndim == 2:
            img_height, img_width = image.shape
        else:
            raise ValueError(f"Expected 2D or 3D array, got {image.ndim}D")

        if height is None:
            height = img_height
        if width is None:
            width = img_width

        height = height - (height % self.vae_scale_factor)
        width = width - (width % self.vae_scale_factor)

        if image.dtype == np.uint8:
            pil_image = PIL.Image.fromarray(image)
        else:
            # Assume normalized [0, 1] or similar
            if image.max() <= 1.0:
                image_uint8 = (image * 255).astype(np.uint8)
            else:
                image_uint8 = image.astype(np.uint8)
            pil_image = PIL.Image.fromarray(image_uint8)

        pil_image = pil_image.resize((width, height), resample=PIL.Image.Resampling.LANCZOS)
        image_np = np.array(pil_image, dtype=np.float32) / 255.0

        # Ensure 3D shape
        if image_np.ndim == 2:
            image_np = np.expand_dims(image_np, axis=-1)

        return self._normalize_to_tensor(image_np)

    def _preprocess_tensor(
        self,
        image: torch.Tensor,
        height: Optional[int] = None,
        width: Optional[int] = None,
    ) -> torch.Tensor:
        """Preprocess a torch tensor."""
        # Determine target dimensions
        if image.ndim == 3:  # (H, W, C) or (C, H, W)
            if image.shape[0] in (1, 3, 4):  # Likely (C, H, W)
                img_height, img_width = image.shape[1], image.shape[2]
            else:  # Likely (H, W, C)
                img_height, img_width = image.shape[0], image.shape[1]
        elif image.ndim == 2:  # (H, W)
            img_height, img_width = image.shape
        else:
            raise ValueError(f"Expected 2D or 3D tensor, got {image.ndim}D")

        if height is None:
            height = img_height
        if width is None:
            width = img_width

        height = height - (height % self.vae_scale_factor)
        width = width - (width % self.vae_scale_factor)

        if image.ndim == 2:
            image = image.unsqueeze(0).unsqueeze(0)  # (1, 1, H, W)
        elif image.ndim == 3:
            if image.shape[0] in (1, 3, 4):  # (C, H, W)
                image = image.unsqueeze(0)  # (1, C, H, W)
            else:  # (H, W, C) - need to rearrange
                image = image.permute(2, 0, 1).unsqueeze(0)  # (1, C, H, W)

        image = torch.nn.functional.interpolate(
            image, size=(height, width), mode="bilinear", align_corners=False
        )

        if image.max() > 1.0:  # Assume [0, 255] range
            image = image / 255.0

        image = 2.0 * image - 1.0

        return image

    def _normalize_to_tensor(self, image_np: np.ndarray) -> torch.Tensor:
        """
        Convert normalized numpy array [0, 1] to torch tensor [-1, 1].

        Args:
            image_np: NumPy array with shape (H, W) or (H, W, C) with values in [0, 1]

        Returns:
            torch.Tensor: Shape (1, C, H, W) or (1, 1, H, W) with values in [-1, 1]
        """
        # Convert to tensor
        if image_np.ndim == 2:  # (H, W) - grayscale
            tensor = torch.from_numpy(image_np).unsqueeze(0).unsqueeze(0)  # (1, 1, H, W)
        elif image_np.ndim == 3:  # (H, W, C)
            tensor = torch.from_numpy(image_np).permute(2, 0, 1).unsqueeze(0)  # (1, C, H, W)
        else:
            raise ValueError(f"Expected 2D or 3D array, got {image_np.ndim}D")

        # Normalize to [-1, 1]
        tensor = 2.0 * tensor - 1.0

        return tensor
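
A minimal usage sketch, not part of the commit: preprocessing a single RGB frame for a VAE with a spatial downsampling factor of 8. The frame path and target resolution are hypothetical.

import PIL.Image

from fastvideo.image_processor import ImageProcessor

processor = ImageProcessor(vae_scale_factor=8)
frame = PIL.Image.open("frame_0001.png")  # hypothetical input frame
tensor = processor.preprocess(frame, height=480, width=832)

# (1, 3, 480, 832), values in [-1, 1], both dimensions already multiples of 8
print(tensor.shape, tensor.min().item(), tensor.max().item())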

fastvideo/models/dits/cosmos.py

Lines changed: 0 additions & 4 deletions
@@ -471,14 +471,10 @@ def forward(self,
                 w_spatial_freqs)[None, None, :, :].repeat(
                     pe_size[0], pe_size[1], 1, 1)
 
-        # Apply sequence scaling in temporal dimension
         if fps is None:
             emb_t = torch.outer(seq[:pe_size[0]], temporal_freqs)
         else:
-            # Videos
-            print(f"[FASTVIDEO ROPE FORWARD] Using video mode (fps={fps})")
             temporal_scale = seq[:pe_size[0]] / fps * self.base_fps
-            print(f"[FASTVIDEO ROPE FORWARD] temporal_scale range: {temporal_scale.min().item():.6f} to {temporal_scale.max().item():.6f}")
             emb_t = torch.outer(temporal_scale,
                                 temporal_freqs)
 
fastvideo/models/registry.py

Lines changed: 0 additions & 1 deletion
@@ -236,7 +236,6 @@ def register_model(
 
     def _raise_for_unsupported(self, architectures: list[str]) -> NoReturn:
        all_supported_archs = self.get_supported_archs()
-        print('all_supported1', all_supported_archs)
        if any(arch in all_supported_archs for arch in architectures):
            raise ValueError(
                f"Model architectures {architectures} failed "

fastvideo/pipelines/composed_pipeline_base.py

Lines changed: 0 additions & 1 deletion
@@ -282,7 +282,6 @@ def load_modules(
         for module_name, (transformers_or_diffusers,
                           architecture) in model_index.items():
             if transformers_or_diffusers is None:
-                print("REQURED", self.required_config_modules, module_name)
                 self.required_config_modules.remove(module_name)
                 continue
             if module_name not in required_modules:

fastvideo/pipelines/stages/decoding.py

Lines changed: 11 additions & 46 deletions
@@ -92,49 +92,21 @@ def forward(
         vae_autocast_enabled = (vae_dtype != torch.float32
                                 ) and not fastvideo_args.disable_autocast
 
-        # Apply latents normalization for Cosmos VAE
-        # Source: /diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py:1000-1010
-        if hasattr(self.vae, 'config') and hasattr(self.vae.config, 'latents_mean') and hasattr(self.vae.config, 'latents_std'):
-            # Get scheduler for sigma_data
-            pipeline = self.pipeline() if self.pipeline else None
-            sigma_data = 1.0  # default
-            if pipeline and hasattr(pipeline, 'modules') and 'scheduler' in pipeline.modules:
-                scheduler = pipeline.modules['scheduler']
-                if hasattr(scheduler, 'config') and hasattr(scheduler.config, 'sigma_data'):
-                    sigma_data = scheduler.config.sigma_data
-
-            latents_mean = (
-                torch.tensor(self.vae.config.latents_mean)
-                .view(1, self.vae.config.z_dim, 1, 1, 1)
-                .to(latents.device, latents.dtype)
-            )
-            latents_std = (
-                torch.tensor(self.vae.config.latents_std)
-                .view(1, self.vae.config.z_dim, 1, 1, 1)
-                .to(latents.device, latents.dtype)
-            )
-
-            latents_after_mul = latents * latents_std / sigma_data
-            latents = latents_after_mul + latents_mean
-
-        # Fallback to scaling_factor for other VAE types
-        elif hasattr(self.vae, 'scaling_factor'):
+        if hasattr(self.vae, 'scaling_factor'):
             if isinstance(self.vae.scaling_factor, torch.Tensor):
                 latents = latents / self.vae.scaling_factor.to(
                     latents.device, latents.dtype)
             else:
                 latents = latents / self.vae.scaling_factor
-        elif hasattr(self.vae, 'config') and hasattr(self.vae.config, 'scaling_factor'):
-            latents = latents / self.vae.config.scaling_factor
-
-        # NOTE: Skip this if we already applied latents_mean (for Cosmos VAE)
-        elif (hasattr(self.vae, "shift_factor")
-              and self.vae.shift_factor is not None):
-            if isinstance(self.vae.shift_factor, torch.Tensor):
-                latents += self.vae.shift_factor.to(latents.device,
-                                                    latents.dtype)
-            else:
-                latents += self.vae.shift_factor
+
+        # Apply shifting if needed
+        if (hasattr(self.vae, "shift_factor")
+                and self.vae.shift_factor is not None):
+            if isinstance(self.vae.shift_factor, torch.Tensor):
+                latents += self.vae.shift_factor.to(latents.device,
+                                                    latents.dtype)
+            else:
+                latents += self.vae.shift_factor
 
         # Decode latents
         with torch.autocast(device_type="cuda",
@@ -146,15 +118,8 @@ def forward(
             # self.vae.enable_parallel()
             if not vae_autocast_enabled:
                 latents = latents.to(vae_dtype)
-            decode_output = self.vae.decode(latents)
 
-        # TEMPORARY: Handle diffusers VAE decode output compatibility
-        if hasattr(decode_output, 'sample'):
-            # Diffusers VAE returns DecoderOutput with .sample attribute
-            image = decode_output.sample
-        else:
-            # FastVideo VAE returns tensor directly
-            image = decode_output
+            image = self.vae.decode(latents)
 
         # Normalize image to [0, 1] range
         image = (image / 2 + 0.5).clamp(0, 1)
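
For orientation, a condensed sketch of the decode path after the fallbacks are removed. This is an assumption about the surrounding stage, not code from this commit; it assumes self.vae exposes scaling_factor and shift_factor attributes and a decode() that returns a tensor directly.

import torch

def decode_latents(vae, latents: torch.Tensor) -> torch.Tensor:
    # Undo the scaling applied at encode time.
    if hasattr(vae, "scaling_factor"):
        latents = latents / vae.scaling_factor
    # Undo the shift, when the VAE defines one.
    if getattr(vae, "shift_factor", None) is not None:
        latents = latents + vae.shift_factor
    image = vae.decode(latents)  # FastVideo VAEs return a tensor, not a DecoderOutput
    return (image / 2 + 0.5).clamp(0, 1)  # map [-1, 1] to [0, 1]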
