Skip to content

Commit ea23498

Browse files
Wan I2V support (quic#788)
Support for the Wan image-to-video model. Model card: "Wan-AI/Wan2.2-I2V-A14B-Diffusers" --------- Signed-off-by: vtirumal <vtirumal@qti.qualcomm.com>
1 parent f668b40 commit ea23498

21 files changed

+2771
-21
lines changed

QEfficient/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from QEfficient.compile.compile_helper import compile
3232
from QEfficient.diffusers.pipelines.flux.pipeline_flux import QEffFluxPipeline
3333
from QEfficient.diffusers.pipelines.wan.pipeline_wan import QEffWanPipeline
34+
from QEfficient.diffusers.pipelines.wan.pipeline_wan_i2v import QEffWanImageToVideoPipeline
3435
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
3536
from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
3637
from QEfficient.peft import QEffAutoPeftModelForCausalLM
@@ -59,6 +60,7 @@
5960
"QEFFCommonLoader",
6061
"QEffFluxPipeline",
6162
"QEffWanPipeline",
63+
"QEffWanImageToVideoPipeline",
6264
]
6365

6466

QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py

Lines changed: 48 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,11 @@
55
#
66
# -----------------------------------------------------------------------------
77

8+
from typing import Optional
9+
810
import torch
911
from diffusers.models.autoencoders.autoencoder_kl_wan import (
12+
AutoencoderKLWan,
1013
WanDecoder3d,
1114
WanEncoder3d,
1215
WanResample,
@@ -16,8 +19,6 @@
1619

1720
CACHE_T = 2
1821

19-
modes = []
20-
2122
# Used max(0, x.shape[2] - CACHE_T) instead of CACHE_T because x.shape[2] is either 1 or 4,
2223
# and CACHE_T = 2. This ensures the value never goes negative
2324

@@ -58,7 +59,6 @@ def forward(self, x, feat_cache=None, feat_idx=[0]):
5859
x = x.reshape(b, c, t * 2, h, w)
5960
t = x.shape[2]
6061
x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
61-
modes.append(self.mode)
6262
x = self.resample(x)
6363
x = x.view(b, t, x.size(1), x.size(2), x.size(3)).permute(0, 2, 1, 3, 4)
6464

@@ -198,3 +198,48 @@ def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
198198
else:
199199
x = self.conv_out(x)
200200
return x
201+
202+
203+
class QEffAutoencoderKLWan(AutoencoderKLWan):
204+
def encode(self, x: torch.Tensor) -> torch.Tensor:
205+
r"""
206+
Encode a batch of images into latents.
207+
208+
Args:
209+
x (`torch.Tensor`): Input batch of images.
210+
"""
211+
if self.use_slicing and x.shape[0] > 1:
212+
encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)]
213+
h = torch.cat(encoded_slices)
214+
else:
215+
h = self._encode(x)
216+
return h
217+
218+
def forward(
219+
self,
220+
image: Optional[torch.Tensor] = None,
221+
latent_sample: Optional[torch.Tensor] = None,
222+
return_dict: bool = True,
223+
) -> torch.Tensor:
224+
r"""
225+
Forward pass through the VAE autoencoder with dual-mode functionality.
226+
This method automatically determines whether to perform encoding or decoding based on the provided inputs:
227+
- If `image` is provided, performs encoding (image → latent space)
228+
- If `latent_sample` is provided, performs decoding (latent space → image)
229+
230+
Args:
231+
image (`torch.Tensor`, *optional*): Input image tensor to encode into latent space.
232+
latent_sample (`torch.Tensor`, *optional*): input latent tensor to decode back to image space.
233+
If provided, `image` should be None.
234+
return_dict (`bool`, *optional*, defaults to `True`):
235+
Whether to return a dictionary with structured output or a raw tensor.
236+
Only applies to decoding operations.
237+
Returns:
238+
`torch.Tensor`:
239+
- If encoding: Latent representation of the input image
240+
- If decoding: Reconstructed image/video from latent representation
241+
"""
242+
if image is not None:
243+
return self.encode(image)
244+
else:
245+
return self.decode(latent_sample, return_dict)

QEfficient/diffusers/models/pytorch_transforms.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
# -----------------------------------------------------------------------------
77

88
from diffusers.models.autoencoders.autoencoder_kl_wan import (
9+
AutoencoderKLWan,
910
WanDecoder3d,
1011
WanEncoder3d,
1112
WanResample,
@@ -25,6 +26,7 @@
2526
from QEfficient.base.pytorch_transforms import ModuleMappingTransform
2627
from QEfficient.customop.rms_norm import CustomRMSNormAIC
2728
from QEfficient.diffusers.models.autoencoders.autoencoder_kl_wan import (
29+
QEffAutoencoderKLWan,
2830
QEffWanDecoder3d,
2931
QEffWanEncoder3d,
3032
QEffWanResample,
@@ -66,6 +68,7 @@ class AttentionTransform(ModuleMappingTransform):
6668
WanAttnProcessor: QEffWanAttnProcessor,
6769
WanAttention: QEffWanAttention,
6870
WanTransformer3DModel: QEffWanTransformer3DModel,
71+
AutoencoderKLWan: QEffAutoencoderKLWan,
6972
WanDecoder3d: QEffWanDecoder3d,
7073
WanEncoder3d: QEffWanEncoder3d,
7174
WanResidualBlock: QEffWanResidualBlock,
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# -----------------------------------------------------------------------------
2+
#
3+
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4+
# SPDX-License-Identifier: BSD-3-Clause
5+
#
6+
# ----------------------------------------------------------------------------

QEfficient/diffusers/pipelines/configs/npi_wan_i2v_vae_encoder.yaml

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
{
2+
"description": "Default configuration for Wan image-to-video pipeline with unified transformer (model_type: 1 for high noise; model_type:2 for low noise)",
3+
"modules": {
4+
"vae_encoder":
5+
{
6+
"specializations":
7+
{
8+
"batch_size": 1,
9+
"num_channels": 16
10+
},
11+
"compilation":
12+
{
13+
"onnx_path": null,
14+
"compile_dir": null,
15+
"mdp_ts_num_devices": 8,
16+
"mxfp6_matmul": false,
17+
"convert_to_fp16": true,
18+
"aic_num_cores": 16,
19+
"aic-enable-depth-first": true,
20+
"compile_only":true,
21+
"mos": 1,
22+
"mdts_mos": 1,
23+
"node_precision_info" : "QEfficient/diffusers/pipelines/configs/npi_wan_i2v_vae_encoder.yaml"
24+
},
25+
"execute":
26+
{
27+
"device_ids": null,
28+
"qpc_path" : null
29+
}
30+
},
31+
"transformer": {
32+
"specializations": [
33+
{
34+
"batch_size": "1",
35+
"num_channels": "36",
36+
"steps": "1",
37+
"sequence_length": "512",
38+
"model_type": 1
39+
},
40+
{
41+
"batch_size": "1",
42+
"num_channels": "36",
43+
"steps": "1",
44+
"sequence_length": "512",
45+
"model_type": 2
46+
}
47+
],
48+
"compilation": {
49+
"onnx_path": null,
50+
"compile_dir": null,
51+
"mdp_ts_num_devices": 16,
52+
"mxfp6_matmul": true,
53+
"convert_to_fp16": true,
54+
"compile_only":true,
55+
"aic_num_cores": 16,
56+
"mos": 1,
57+
"mdts_mos": 1
58+
},
59+
"execute": {
60+
"device_ids": null,
61+
"qpc_path" : null
62+
}
63+
},
64+
"vae_decoder":
65+
{
66+
"specializations":
67+
{
68+
"batch_size": 1,
69+
"num_channels": 16
70+
},
71+
"compilation":
72+
{
73+
"onnx_path": null,
74+
"compile_dir": null,
75+
"mdp_ts_num_devices": 8,
76+
"mxfp6_matmul": false,
77+
"convert_to_fp16": true,
78+
"aic_num_cores": 16,
79+
"aic-enable-depth-first": true,
80+
"compile_only":true,
81+
"mos": 1,
82+
"mdts_mos": 1
83+
},
84+
"execute":
85+
{
86+
"device_ids": null,
87+
"qpc_path" : null
88+
}
89+
}
90+
91+
}
92+
}

QEfficient/diffusers/pipelines/pipeline_module.py

Lines changed: 44 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -247,9 +247,8 @@ def __init__(self, model: nn.Module, type: str) -> None:
247247
"""
248248
super().__init__(model)
249249
self.model = model
250-
251-
# To have different hashing for encoder/decoder
252-
self.model.config["type"] = type
250+
self.type = type
251+
# TODO: add vae type in hash file
253252

254253
def get_onnx_params(self, latent_height: int = 32, latent_width: int = 32) -> Tuple[Dict, Dict, List[str]]:
255254
"""
@@ -282,6 +281,43 @@ def get_onnx_params(self, latent_height: int = 32, latent_width: int = 32) -> Tu
282281

283282
return example_inputs, dynamic_axes, output_names
284283

284+
def get_img_encoder_onnx_params(self) -> Tuple[Dict, Dict, List[str]]:
285+
"""
286+
Generate ONNX export configuration for the VAE Encoder.
287+
288+
Returns:
289+
Tuple containing:
290+
- example_inputs (Dict): Sample inputs for ONNX export
291+
- dynamic_axes (Dict): Specification of dynamic dimensions
292+
- output_names (List[str]): Names of model outputs
293+
"""
294+
bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE
295+
num_frames = constants.WAN_ONNX_EXPORT_FRAMES
296+
height = constants.WAN_ONNX_EXPORT_HEIGHT_45P
297+
width = constants.WAN_ONNX_EXPORT_WIDTH_45P
298+
example_inputs = {
299+
"image": torch.randn(
300+
bs,
301+
3, # channels
302+
num_frames,
303+
height,
304+
width,
305+
),
306+
}
307+
output_names = ["latents"]
308+
# All dimensions except channels can be dynamic
309+
dynamic_axes = {
310+
"image": {
311+
0: "batch_size",
312+
# 1: "num_channels",
313+
2: "num_frames",
314+
3: "height",
315+
4: "width",
316+
},
317+
}
318+
319+
return example_inputs, dynamic_axes, output_names
320+
285321
def get_video_onnx_params(self) -> Tuple[Dict, Dict, List[str]]:
286322
"""
287323
Generate ONNX export configuration for the VAE decoder.
@@ -298,8 +334,8 @@ def get_video_onnx_params(self) -> Tuple[Dict, Dict, List[str]]:
298334
"""
299335
bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE
300336
latent_frames = constants.WAN_ONNX_EXPORT_LATENT_FRAMES
301-
latent_height = constants.WAN_ONNX_EXPORT_LATENT_HEIGHT_180P
302-
latent_width = constants.WAN_ONNX_EXPORT_LATENT_WIDTH_180P
337+
latent_height = constants.WAN_ONNX_EXPORT_LATENT_HEIGHT_45P
338+
latent_width = constants.WAN_ONNX_EXPORT_LATENT_WIDTH_45P
303339

304340
# VAE decoder takes latent representation as input
305341
example_inputs = {
@@ -568,8 +604,8 @@ def get_onnx_params(self):
568604
batch_size,
569605
self.model.config.in_channels,
570606
constants.WAN_ONNX_EXPORT_LATENT_FRAMES,
571-
constants.WAN_ONNX_EXPORT_LATENT_HEIGHT_180P,
572-
constants.WAN_ONNX_EXPORT_LATENT_WIDTH_180P,
607+
constants.WAN_ONNX_EXPORT_LATENT_HEIGHT_45P,
608+
constants.WAN_ONNX_EXPORT_LATENT_WIDTH_45P,
573609
dtype=torch.float32,
574610
),
575611
# encoder_hidden_states = [BS, seq len , text dim]
@@ -578,7 +614,7 @@ def get_onnx_params(self):
578614
),
579615
# Rotary position embeddings: [2, context_length, 1, rotary_dim]; 2 is from tuple of cos, sin freqs
580616
"rotary_emb": torch.randn(
581-
2, constants.WAN_ONNX_EXPORT_CL_180P, 1, constants.WAN_ONNX_EXPORT_ROTARY_DIM, dtype=torch.float32
617+
2, constants.WAN_ONNX_EXPORT_CL_45P, 1, constants.WAN_ONNX_EXPORT_ROTARY_DIM, dtype=torch.float32
582618
),
583619
# Timestep embeddings: [batch_size=1, embedding_dim]
584620
"temb": torch.randn(batch_size, constants.WAN_TEXT_EMBED_DIM, dtype=torch.float32),

QEfficient/diffusers/pipelines/pipeline_utils.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,16 @@ def set_execute_params(cls):
131131
)
132132

133133

134+
def update_npi_path(cls, npi_full_path, module_name):
135+
"""To Set NPI for path in compilation config"""
136+
if module_name in cls.custom_config["modules"]:
137+
# Check if the NPI file exists
138+
if not os.path.exists(npi_full_path):
139+
raise FileNotFoundError(f"Node precision info file not found: {npi_full_path}")
140+
141+
cls.custom_config["modules"][module_name]["compilation"]["node_precision_info"] = npi_full_path
142+
143+
134144
def compile_modules_parallel(
135145
modules: Dict[str, Any],
136146
config: Dict[str, Any],

QEfficient/diffusers/pipelines/wan/pipeline_wan.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -260,8 +260,8 @@ def compile(
260260
self,
261261
compile_config: Optional[str] = None,
262262
parallel: bool = False,
263-
height: int = constants.WAN_ONNX_EXPORT_HEIGHT_180P,
264-
width: int = constants.WAN_ONNX_EXPORT_WIDTH_180P,
263+
height: int = constants.WAN_ONNX_EXPORT_HEIGHT_45P,
264+
width: int = constants.WAN_ONNX_EXPORT_WIDTH_45P,
265265
num_frames: int = constants.WAN_ONNX_EXPORT_FRAMES,
266266
use_onnx_subfunctions: bool = False,
267267
) -> str:

0 commit comments

Comments (0)