diff --git a/QEfficient/transformers/cache_utils.py b/QEfficient/transformers/cache_utils.py index 0e1118407..6ebccdfbf 100644 --- a/QEfficient/transformers/cache_utils.py +++ b/QEfficient/transformers/cache_utils.py @@ -10,7 +10,7 @@ from typing import Any, Dict, List, Optional, Tuple import torch -from transformers.cache_utils import DynamicCache, DynamicLayer, EncoderDecoderCache, HybridCache, HybridChunkedCache +from transformers.cache_utils import Cache, CacheLayerMixin, EncoderDecoderCache, HybridCache, HybridChunkedCache from QEfficient.customop import ( CtxGatherFunc, @@ -54,7 +54,47 @@ def _get_invalid_idx_value(cls): return 0 -class QEffDynamicLayer(DynamicLayer): +class QEffDynamicLayer(CacheLayerMixin): + is_sliding = False + + def __init__(self): + super().__init__() + + def lazy_initialization(self, key_states: torch.Tensor): + self.dtype = key_states.dtype + self.device = key_states.device + self.keys = torch.tensor([], dtype=self.dtype, device=self.device) + self.values = torch.tensor([], dtype=self.dtype, device=self.device) + self.is_initialized = True + + def get_mask_sizes(self, cache_position: torch.Tensor) -> tuple[int, int]: + kv_offset = 0 + query_length = cache_position.shape[0] + kv_length = self.get_seq_length() + query_length + return kv_length, kv_offset + + def get_seq_length(self) -> int: + if self.keys is None or self.keys.numel() == 0: + return 0 + return self.keys.shape[-2] + + def get_max_cache_shape(self) -> int: + return -1 + + @classmethod + def from_tensors(cls, key_states: torch.Tensor, value_states: torch.Tensor) -> "QEffDynamicLayer": + layer = cls() + layer.keys = key_states + layer.values = value_states + layer._mark_initialized(key_states) + return layer + + def _mark_initialized(self, reference_states: torch.Tensor) -> None: + if not self.is_initialized: + self.dtype = reference_states.dtype + self.device = reference_states.device + self.is_initialized = True + def read_only(self, cache_kwargs): """ Reads the 
`key_states` and `value_states` for the layer. @@ -68,6 +108,8 @@ def read_only(self, cache_kwargs): """ # Gather k_out, v_out = self.keys, self.values + if k_out is not None: + self._mark_initialized(k_out) position_ids = cache_kwargs.get("position_ids") batch_index = cache_kwargs.get("batch_index", None) ctx_len = cache_kwargs.get("CCL", k_out.shape[2]) @@ -109,6 +151,8 @@ def read_only_blockedKV(self, start_index, end_index, cache_kwargs): """ # Gather k_out, v_out = self.keys, self.values + if k_out is not None: + self._mark_initialized(k_out) position_ids = cache_kwargs.get("position_ids") batch_index = cache_kwargs.get("batch_index", None) batch, num_kv_heads, _, _ = k_out.shape @@ -150,7 +194,9 @@ def write_only(self, key_states, value_states, cache_kwargs): if self.keys is None: self.keys = key_states self.values = value_states + self._mark_initialized(self.keys) else: + self._mark_initialized(self.keys) position_ids = cache_kwargs.get("position_ids") batch_index = cache_kwargs.get("batch_index", None) # Check and fetch batch index value form the kwargs @@ -189,8 +235,10 @@ def update( if self.keys is None: self.keys = key_states self.values = value_states + self._mark_initialized(self.keys) k_out, v_out = self.keys, self.values else: + self._mark_initialized(self.keys) position_ids = cache_kwargs.get("position_ids") batch_index = cache_kwargs.get("batch_index", None) # Check and fetch batch index value form the kwargs @@ -252,8 +300,10 @@ def update3D( if self.keys is None: self.keys = key_states self.values = value_states + self._mark_initialized(self.keys) k_out, v_out = self.keys, self.values else: + self._mark_initialized(self.keys) position_ids = cache_kwargs.get("position_ids") batch_index = cache_kwargs.get("batch_index", None) @@ -293,7 +343,7 @@ def update3D( return k_out, v_out -class QEffDynamicCache(DynamicCache): +class QEffDynamicCache(Cache): """ A cache that grows dynamically as more tokens are generated. 
This is the default for generative models. @@ -307,15 +357,46 @@ class QEffDynamicCache(DynamicCache): """ def __init__(self, ddp_cache_data: Optional[Iterable[tuple[torch.Tensor, torch.Tensor]]] = None, *args, **kwargs): - # Remove layer_classes if present to avoid duplicate argument + # Remove cache-layer construction args if present to avoid duplicate arguments. kwargs.pop("layer_classes", None) - from transformers.cache_utils import Cache # Import here to avoid circular import - - Cache.__init__(self, layer_classes=QEffDynamicLayer, *args, **kwargs) + kwargs.pop("layers", None) + kwargs.pop("layer_class_to_replicate", None) + + try: + # transformers>=4.57 + Cache.__init__(self, *args, layer_class_to_replicate=QEffDynamicLayer, **kwargs) + except TypeError: + # transformers<=4.56 + Cache.__init__(self, *args, layer_classes=QEffDynamicLayer, **kwargs) if ddp_cache_data is not None: for key_states, value_states in ddp_cache_data: self.layers.append(QEffDynamicLayer.from_tensors(key_states, value_states)) + def append_new_layers(self, layer_idx: int) -> None: + while len(self.layers) <= layer_idx: + self.layers.append(QEffDynamicLayer()) + + @classmethod + def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None) -> "QEffDynamicCache": + cache = cls() + if past_key_values is not None: + for layer_idx in range(len(past_key_values)): + key_states, value_states = past_key_values[layer_idx] + cache.update(key_states, value_states, layer_idx) + return cache + + def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]: + legacy_cache = () + for layer in self.layers: + legacy_cache += ((layer.keys, layer.values),) + return legacy_cache + + def get_seq_length(self, layer_idx: Optional[int] = 0, cache_position: Optional[torch.LongTensor] = None) -> int: + """ + Keep backward-compatible call shape while deferring to upstream implementation. 
+ """ + return super().get_seq_length(layer_idx) + def read_only(self, layer_idx, cache_kwargs): """ Reads the `key_states` and `value_states` for the layer `layer_idx`. @@ -405,10 +486,7 @@ def from_legacy_cache( cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None ) -> "EncoderDecoderCache": """Converts a cache in the legacy cache format into an equivalent `EncoderDecoderCache`.""" - cache = cls( - self_attention_cache=QEffDynamicCache(), - cross_attention_cache=QEffDynamicCache(), - ) + cache = cls(QEffDynamicCache(), QEffDynamicCache()) if past_key_values is not None: for layer_idx in range(len(past_key_values)): key_states, value_states = past_key_values[layer_idx][:2] @@ -419,6 +497,18 @@ def from_legacy_cache( cache.is_updated[layer_idx] = True return cache + def to_legacy_cache(self): + self_attn_legacy = self.self_attention_cache.to_legacy_cache() + cross_attn_legacy = self.cross_attention_cache.to_legacy_cache() + + legacy_cache = () + for layer_idx, self_attn_layer in enumerate(self_attn_legacy): + if layer_idx < len(cross_attn_legacy): + legacy_cache += (self_attn_layer + cross_attn_legacy[layer_idx],) + else: + legacy_cache += (self_attn_layer,) + return legacy_cache + # TODO:This function will be depercated in future. class QEffHybridCache(HybridCache): @@ -447,7 +537,7 @@ def __len__(self): """ return len(self.key_cache) - def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + def get_seq_length(self, layer_idx: Optional[int] = 0, cache_position: Optional[torch.LongTensor] = None) -> int: """Returns the sequence length of the cached states. 
A layer index can be optionally passed.""" # TODO: deprecate this function in favor of `cache_position` is_empty_layer = ( @@ -531,7 +621,7 @@ def __len__(self): """ return len(self.key_cache) - def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + def get_seq_length(self, layer_idx: Optional[int] = 0, cache_position: Optional[torch.LongTensor] = None) -> int: """Returns the sequence length of the cached states. A layer index can be optionally passed.""" # TODO: deprecate this function in favor of `cache_position` is_empty_layer = ( @@ -663,7 +753,7 @@ def __len__(self): """ return len(self.key_cache) - def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + def get_seq_length(self, layer_idx: Optional[int] = 0, cache_position: Optional[torch.LongTensor] = None) -> int: """Returns the sequence length of the cached states. A layer index can be optionally passed.""" # TODO: deprecate this function in favor of `cache_position` is_empty_layer = ( @@ -783,7 +873,7 @@ def __len__(self): """ return len(self.key_cache) - def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + def get_seq_length(self, layer_idx: Optional[int] = 0, cache_position: Optional[torch.LongTensor] = None) -> int: """Returns the sequence length of the cached states. A layer index can be optionally passed.""" # TODO: deprecate this function in favor of `cache_position` is_empty_layer = ( diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py index 47ae57557..77a440018 100644 --- a/QEfficient/transformers/modeling_utils.py +++ b/QEfficient/transformers/modeling_utils.py @@ -191,6 +191,7 @@ ] ) + # This is for supporting different seq_len for different layers for Sliding window attn, chunked attn etc. 
DYNAMIC_SEQ_LEN_SUPPORTED_MODEL_ARCH = {"gemma3", "llama4", "gemma3_text", "llama4_text"} diff --git a/QEfficient/transformers/models/falcon/modeling_falcon.py b/QEfficient/transformers/models/falcon/modeling_falcon.py index 4ebb2fb96..90032be4e 100644 --- a/QEfficient/transformers/models/falcon/modeling_falcon.py +++ b/QEfficient/transformers/models/falcon/modeling_falcon.py @@ -32,6 +32,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -137,7 +138,7 @@ def forward( key_layer = key_layer.transpose(1, 2).reshape(batch_size, num_kv_heads, query_length, self.head_dim) value_layer = value_layer.transpose(1, 2).reshape(batch_size, num_kv_heads, query_length, self.head_dim) - kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_layer.shape[-2], cache_position) cos, sin = self.rotary_emb(value_layer, seq_len=kv_seq_len) query_layer, key_layer = qeff_apply_rotary_pos_emb(query_layer, key_layer, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/gemma/modeling_gemma.py b/QEfficient/transformers/models/gemma/modeling_gemma.py index 260d1857a..bc3b00e6a 100644 --- a/QEfficient/transformers/models/gemma/modeling_gemma.py +++ b/QEfficient/transformers/models/gemma/modeling_gemma.py @@ -27,6 +27,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -149,7 +150,7 @@ def forward( key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - 
kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/gemma2/modeling_gemma2.py b/QEfficient/transformers/models/gemma2/modeling_gemma2.py index 6dee8c85d..8d15e3485 100644 --- a/QEfficient/transformers/models/gemma2/modeling_gemma2.py +++ b/QEfficient/transformers/models/gemma2/modeling_gemma2.py @@ -30,6 +30,7 @@ # from transformers.utils import is_torchdynamo_compiling from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -156,7 +157,7 @@ def forward( key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/gptj/modeling_gptj.py b/QEfficient/transformers/models/gptj/modeling_gptj.py index a4c81dbec..bbf621f10 100644 --- a/QEfficient/transformers/models/gptj/modeling_gptj.py +++ b/QEfficient/transformers/models/gptj/modeling_gptj.py @@ -223,7 +223,7 @@ def forward( else: past_length = past_key_values[0][0].size(-2) - if not self._use_flash_attention_2: + if not getattr(self, "_use_flash_attention_2", False): attention_mask = _create_causal_mask(position_ids, past_length, None) # # Prepare head mask if needed diff 
--git a/QEfficient/transformers/models/granite/modeling_granite.py b/QEfficient/transformers/models/granite/modeling_granite.py index 8a32c52ef..d30b9fc39 100644 --- a/QEfficient/transformers/models/granite/modeling_granite.py +++ b/QEfficient/transformers/models/granite/modeling_granite.py @@ -27,6 +27,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -142,7 +143,7 @@ def forward( key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py b/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py index 935df7c2d..2f61ac164 100644 --- a/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py +++ b/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py @@ -28,6 +28,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -137,7 +138,7 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - kv_seq_len = 
past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: diff --git a/QEfficient/transformers/models/grok_1/modeling_grok1.py b/QEfficient/transformers/models/grok_1/modeling_grok1.py index 1a1c919bb..5c2f145b4 100644 --- a/QEfficient/transformers/models/grok_1/modeling_grok1.py +++ b/QEfficient/transformers/models/grok_1/modeling_grok1.py @@ -20,6 +20,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask from QEfficient.transformers.models.llama.modeling_llama import qeff_apply_rotary_pos_emb +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -87,8 +88,7 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - if past_key_value is not None: - kv_seq_len = past_key_value.get_seq_length(layer_idx) + kv_seq_len = resolve_kv_seq_len(past_key_value, layer_idx, key_states.shape[-2]) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/llama/modeling_llama.py b/QEfficient/transformers/models/llama/modeling_llama.py index 57bccdb1b..a0a3b0237 100644 --- a/QEfficient/transformers/models/llama/modeling_llama.py +++ b/QEfficient/transformers/models/llama/modeling_llama.py @@ -27,6 +27,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from 
QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -226,7 +227,7 @@ def forward( key_states = self.k_proj(hidden_states, **kwargs).view(hidden_shape).transpose(1, 2) value_states = self.v_proj(hidden_states, **kwargs).view(hidden_shape).transpose(1, 2) - kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) past_seen_tokens = past_key_value.get_seq_length() if past_key_value is not None else 0 cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py index e219d5e03..8c96955dd 100644 --- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py +++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py @@ -29,6 +29,7 @@ QEffLlamaRotaryEmbedding, qeff_apply_rotary_pos_emb, ) +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -110,7 +111,7 @@ def forward( if comp_ctx_lengths is not None: attention_mask = attention_mask[:, :, :, : comp_ctx_lengths.shape[-1]] cache_kwargs["CCL"] = attention_mask.shape[-1] - kv_seq_len = past_key_value.get_seq_length(self.layer_idx) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, query_states.shape[-2]) key_states, value_states = past_key_value.read_only(self.layer_idx, cache_kwargs=cache_kwargs) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) @@ -370,7 +371,7 @@ def forward( "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class 
" "with a layer index." ) - kv_seq_len = past_key_values.get_seq_length(self_attn.layer_idx) + kv_seq_len = resolve_kv_seq_len(past_key_values, self_attn.layer_idx, key_states.shape[-2]) cos, sin = self_attn.rotary_emb(value_states, seq_len=kv_seq_len) _, key_states = qeff_apply_rotary_pos_emb(torch.empty_like(key_states), key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/mistral/modeling_mistral.py b/QEfficient/transformers/models/mistral/modeling_mistral.py index 47107384e..878920234 100644 --- a/QEfficient/transformers/models/mistral/modeling_mistral.py +++ b/QEfficient/transformers/models/mistral/modeling_mistral.py @@ -30,6 +30,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -159,7 +160,7 @@ def forward( key_states = key_states.view(hidden_shape).transpose(1, 2) value_states = value_states.view(hidden_shape).transpose(1, 2) - kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py b/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py index 680c839ae..9e8a2a020 100644 --- a/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py +++ b/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py @@ -32,6 +32,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from 
QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -148,14 +149,14 @@ def forward( key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len = past_key_value.get_seq_length(self.layer_idx) + if past_key_value is not None and self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + cache_position = kwargs.get("cache_position") + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index 3cba022b4..a350a92dc 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -42,7 +42,7 @@ _prepare_cross_attention_mask, ) from QEfficient.utils import constants -from QEfficient.utils._utils import IOInfo +from QEfficient.utils._utils import IOInfo, resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE MAX_NUM_IMG = 1 @@ -267,14 +267,14 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, 
self.num_key_value_heads, self.head_dim).transpose(1, 2) - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + if past_key_value is not None and self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/molmo/modeling_molmo.py b/QEfficient/transformers/models/molmo/modeling_molmo.py index 57f2729b9..fbc7b34b8 100644 --- a/QEfficient/transformers/models/molmo/modeling_molmo.py +++ b/QEfficient/transformers/models/molmo/modeling_molmo.py @@ -17,7 +17,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask from QEfficient.utils import constants -from QEfficient.utils._utils import IOInfo, get_padding_shape_from_config +from QEfficient.utils._utils import IOInfo, get_padding_shape_from_config, resolve_kv_seq_len def _non_meta_init_device(config) -> torch.device: @@ -265,15 +265,13 @@ def attention( v = v.view(B, T, self.config.effective_n_kv_heads, C // self.config.n_heads).transpose(1, 2) if self.config.use_position_ids and self.config.rope: - kv_seq_len = k.shape[-2] - kv_seq_len = 
layer_past.get_seq_length(self.layer_id) + kv_seq_len = resolve_kv_seq_len(layer_past, self.layer_id, k.shape[-2]) # Apply rotary embeddings cos, sin = self.rotary_emb(v, seq_len=kv_seq_len) q, k = qeff_apply_rotary_pos_emb(q, k, cos, sin, position_ids, self.config) if not self.config.use_position_ids and self.config.rope: - kv_seq_len = k.shape[-2] - kv_seq_len = layer_past.get_seq_length(kv_seq_len, self.layer_id) + kv_seq_len = resolve_kv_seq_len(layer_past, self.layer_id, k.shape[-2]) # Apply rotary embeddings cos, sin = self.rotary_emb(v, seq_len=kv_seq_len) q, k = qeff_apply_rotary_pos_emb(q, k, cos, sin, position_ids, self.config) diff --git a/QEfficient/transformers/models/olmo2/modeling_olmo2.py b/QEfficient/transformers/models/olmo2/modeling_olmo2.py index c79ad7fae..0e9394040 100644 --- a/QEfficient/transformers/models/olmo2/modeling_olmo2.py +++ b/QEfficient/transformers/models/olmo2/modeling_olmo2.py @@ -27,6 +27,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -148,9 +149,7 @@ def forward( key_states = key_states.view(hidden_shape).transpose(1, 2) value_states = value_states.view(hidden_shape).transpose(1, 2) - kv_seq_len = key_states.shape[-2] - - kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/phi3/modeling_phi3.py b/QEfficient/transformers/models/phi3/modeling_phi3.py index b48ab2897..aaaaa8081 100644 --- a/QEfficient/transformers/models/phi3/modeling_phi3.py +++ 
b/QEfficient/transformers/models/phi3/modeling_phi3.py @@ -27,6 +27,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -157,7 +158,7 @@ def forward( key_states = key_states.view(hidden_shape).transpose(1, 2) value_states = value_states.view(hidden_shape).transpose(1, 2) - kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/qwen2/modeling_qwen2.py b/QEfficient/transformers/models/qwen2/modeling_qwen2.py index 841df6526..c41dc13bb 100644 --- a/QEfficient/transformers/models/qwen2/modeling_qwen2.py +++ b/QEfficient/transformers/models/qwen2/modeling_qwen2.py @@ -30,6 +30,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -162,7 +163,7 @@ def forward( key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git 
a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index d6bfbda81..39dd285a0 100644 --- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -36,7 +36,7 @@ # from transformers import Qw from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask from QEfficient.utils import constants -from QEfficient.utils._utils import IOInfo, get_padding_shape_from_config +from QEfficient.utils._utils import IOInfo, get_padding_shape_from_config, resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE from QEfficient.utils.logging_utils import logger @@ -591,8 +591,7 @@ def forward( key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) - kv_seq_len = key_states.shape[-2] - kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) past_seen_tokens = past_key_value.get_seq_length() if past_key_value is not None else 0 cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) @@ -743,13 +742,13 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - - if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = False + if past_key_values is not None and not isinstance(past_key_values, Cache): return_legacy_cache = True past_key_values = QEffDynamicCache.from_legacy_cache(past_key_values) + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 if cache_position is None: - past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 cache_position = torch.arange( 
past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device ) diff --git a/QEfficient/transformers/models/qwen3/modeling_qwen3.py b/QEfficient/transformers/models/qwen3/modeling_qwen3.py index ccc4bbac2..c3c1df82d 100644 --- a/QEfficient/transformers/models/qwen3/modeling_qwen3.py +++ b/QEfficient/transformers/models/qwen3/modeling_qwen3.py @@ -30,6 +30,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -163,7 +164,7 @@ def forward( key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py b/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py index 6bdd5e243..bfe0c90db 100644 --- a/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py +++ b/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py @@ -28,6 +28,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -209,7 +210,7 @@ def forward( key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - 
kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/quantizers/quantizer_awq.py b/QEfficient/transformers/quantizers/quantizer_awq.py index ef8a03521..b7199a71e 100644 --- a/QEfficient/transformers/quantizers/quantizer_awq.py +++ b/QEfficient/transformers/quantizers/quantizer_awq.py @@ -29,15 +29,18 @@ def post_init(self): f"Only quantization backend {AwqBackendPackingMethod.AUTOAWQ} is supported - not recognized backend {self.backend}" ) - self.version = AWQLinearVersion.from_str(self.version) + if isinstance(self.version, str): + self.version = AWQLinearVersion.from_str(self.version) if self.version not in [AWQLinearVersion.GEMM]: raise ValueError( f"Only {AWQLinearVersion.GEMM} version in supported - not recognized version {self.version}" ) - if self.do_fuse or self.fuse_max_seq_len is not None: + do_fuse = getattr(self, "do_fuse", None) + fuse_max_seq_len = getattr(self, "fuse_max_seq_len", None) + if do_fuse or fuse_max_seq_len is not None: raise ValueError( - f"fused modules are not supported, got do_fuse={self.do_fuse}, fuse_max_seq_len={self.fuse_max_seq_len}" + f"fused modules are not supported, got do_fuse={do_fuse}, fuse_max_seq_len={fuse_max_seq_len}" ) if self.bits != 4: @@ -63,6 +66,9 @@ def update_torch_dtype(self, torch_dtype): logger.warning(f"Requested dtype {torch_dtype} is not supported, overriding to None") return None + def update_dtype(self, dtype): + return self.update_torch_dtype(dtype) + def _process_model_before_weight_loading(self, model, **kwargs): self.modules_to_not_convert = get_keys_to_not_convert(model) diff --git a/QEfficient/transformers/quantizers/quantizer_compressed_tensors.py 
b/QEfficient/transformers/quantizers/quantizer_compressed_tensors.py index e7e14166d..f2746528c 100644 --- a/QEfficient/transformers/quantizers/quantizer_compressed_tensors.py +++ b/QEfficient/transformers/quantizers/quantizer_compressed_tensors.py @@ -188,6 +188,9 @@ def update_torch_dtype(self, torch_dtype): logger.warning(f"Requested dtype {torch_dtype} is not supported, overriding to None") return None + def update_dtype(self, dtype): + return self.update_torch_dtype(dtype) + def _process_model_before_weight_loading(self, model, **kwargs): if not self.modules_to_not_convert or "lm_head" not in self.modules_to_not_convert: self.modules_to_not_convert.extend(get_keys_to_not_convert(model)) @@ -366,6 +369,9 @@ def update_torch_dtype(self, torch_dtype): logger.warning(f"Requested dtype {torch_dtype} is not supported, overriding to None") return None + def update_dtype(self, dtype): + return self.update_torch_dtype(dtype) + def _process_model_before_weight_loading(self, model, **kwargs): if self.quantization_config.targets != ["Linear"]: raise NotImplementedError( diff --git a/QEfficient/transformers/quantizers/quantizer_mxfp4.py b/QEfficient/transformers/quantizers/quantizer_mxfp4.py index 2ffba1bea..44c255feb 100644 --- a/QEfficient/transformers/quantizers/quantizer_mxfp4.py +++ b/QEfficient/transformers/quantizers/quantizer_mxfp4.py @@ -105,6 +105,9 @@ def update_torch_dtype(self, torch_dtype): logger.warning(f"Requested dtype {torch_dtype} is not supported, overriding to None") return None + def update_dtype(self, dtype): + return self.update_torch_dtype(dtype) + def _process_model_before_weight_loading( self, model: torch.nn.Module, diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 26bae7a34..9a62f57fd 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -67,6 +67,40 @@ class DownloadRetryLimitExceeded(Exception): """ +def resolve_kv_seq_len( + past_key_value: Optional[Any], + layer_idx: int, + current_seq_len: 
int, + cache_position: Optional[torch.LongTensor] = None, +) -> int: + """ + Resolve KV sequence length for rotary embeddings with cache compatibility. + + Use the current key sequence length as baseline, then grow it with: + - cache_position max (when provided) + - cache object reported length for the current layer + """ + resolved_seq_len = current_seq_len + if cache_position is not None and isinstance(cache_position, torch.Tensor) and cache_position.numel() > 0: + resolved_seq_len = max(resolved_seq_len, int(cache_position.max().item()) + 1) + + if past_key_value is None: + return resolved_seq_len + + get_seq_length = getattr(past_key_value, "get_seq_length", None) + if get_seq_length is None: + return resolved_seq_len + + try: + cache_seq_len = get_seq_length(layer_idx) + except TypeError: + cache_seq_len = get_seq_length() + + if cache_seq_len is None: + return resolved_seq_len + return max(resolved_seq_len, int(cache_seq_len)) + + def login_and_download_hf_lm(model_name, *args, **kwargs): logger.info(f"loading HuggingFace model for {model_name}") hf_token = kwargs.pop("hf_token", None) diff --git a/QEfficient/utils/test_utils.py b/QEfficient/utils/test_utils.py index 3cf560266..c125a317f 100644 --- a/QEfficient/utils/test_utils.py +++ b/QEfficient/utils/test_utils.py @@ -229,3 +229,7 @@ class ModelConfig: SWIFTKV_MODELS = { "Snowflake/Llama-3.1-SwiftKV-8B-Instruct", } + + FULL_MODEL_TESTS_TO_SKIP = { + "hpcai-tech/grok-1", + } diff --git a/pyproject.toml b/pyproject.toml index 6de8048b4..207868adb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ classifiers = [ ] requires-python = ">=3.8,<3.13" dependencies = [ - "transformers==4.55.0", + "transformers==4.57.3", "diffusers== 0.35.1", "huggingface-hub==0.34.0", "hf_transfer==0.1.9", @@ -55,7 +55,7 @@ dependencies = [ ] [project.optional-dependencies] -test = ["pytest","pytest-mock"] +test = ["pytest","pytest-mock","pytest-xdist"] docs = 
["Sphinx==7.1.2","sphinx-rtd-theme==2.0.0","myst-parser==3.0.1","sphinx-multiversion"] quality = ["black", "ruff", "hf_doc_builder@git+https://github.com/huggingface/doc-builder.git"] diff --git a/scripts/Nightly/Jenkinsfile b/scripts/Nightly/Jenkinsfile new file mode 100644 index 000000000..b9c9687f3 --- /dev/null +++ b/scripts/Nightly/Jenkinsfile @@ -0,0 +1,308 @@ +pipeline { + agent { + node { + label 'qeff_node' + } + } + + options { + disableConcurrentBuilds() + timeout(time: 1, unit: 'DAYS') + timestamps() + buildDiscarder(logRotator(numToKeepStr: '5', daysToKeepStr: '30')) + } + + triggers { + cron('''TZ=Asia/Kolkata + 0 21 * * 6''') + } + + environment { + DOCKER_IMAGE = "${DOCKER_LATEST}:master_latest" + VENV_PATH = 'preflight_qeff' + TOKENIZERS_PARALLELISM = 'false' + HF_HUB_CACHE = '/huggingface_hub' + PYTEST_ARGS = '--durations=10' + DOCKER_USER = 'ubuntu' + } + + stages { + stage('Prepare Environment') { + steps { + script { + echo "Starting QEfficient Nightly Test Suite" + echo "Build Tag: ${BUILD_TAG}" + } + sh ''' + . ~/.bashrc + # Launch privileged Docker container with necessary mounts + sudo docker run --privileged -dit \ + --name ${BUILD_TAG} \ + -e HF_TOKEN=${HF_TOKEN} \ + -v ./:/efficient-transformers \ + -v ${HF_PATH}:${DOCKER_HF_PATH} \ + ${DOCKER_LATEST}:master_latest + + # Install QEfficient and dependencies + sudo docker exec ${BUILD_TAG} bash -c " + set -e + cd /efficient-transformers + apt update && apt install -y python3.10-venv + python3.10 -m venv ${VENV_PATH} + . 
${VENV_PATH}/bin/activate + + # Upgrade pip and core packages + pip install --upgrade pip setuptools wheel + pip install .[test] + pip install junitparser pytest-xdist + + # Audio processing libraries for speech-to-text models + pip install librosa==0.10.2 soundfile==0.13.1 + + # Vision and multimodal model dependencies + pip install --extra-index-url https://download.pytorch.org/whl/cpu \ + timm==1.0.14 torchvision==0.22.0+cpu einops==0.8.1 + + rm -rf QEfficient + " + ''' + } + } + stage('Unit & Integration Tests') { + parallel { + stage('Model Export & ONNX Tests') { + steps { + timeout(time: 40, unit: 'MINUTES') { + sh ''' + sudo docker exec ${BUILD_TAG} bash -c " + set -e + cd /efficient-transformers + . ${VENV_PATH}/bin/activate + + mkdir -p $PWD/Non_cli_qaic + export QEFF_HOME=$PWD/Non_cli_qaic + + pytest tests \ + -m '(not cli) and (not on_qaic) and (not finetune)' \ + --ignore tests/vllm \ + --ignore tests/transformers/models/image_text_to_text \ + ${PYTEST_ARGS} -n 4\ + --junitxml=tests/tests_log1.xml + + junitparser merge tests/tests_log1.xml tests/tests_log.xml + deactivate + " + ''' + } + } + } + + stage('QAIC LLM Tests') { + steps { + // timeout(time: 120, unit: 'MINUTES') { + sh ''' + sudo docker exec ${BUILD_TAG} bash -c " + set -e + cd /efficient-transformers + . ${VENV_PATH}/bin/activate + + mkdir -p $PWD/Non_qaic_llm + export QEFF_HOME=$PWD/Non_qaic_llm + + pytest tests \ + -m '(not cli) and (on_qaic) and (llm_model) and (not custom_layers) and (not dummy_model) and (not multimodal) and (not qnn) and (not finetune) and (not diffusion_models)' \ + --ignore tests/vllm \ + ${PYTEST_ARGS} \ + --junitxml=tests/tests_log2.xml + + junitparser merge tests/tests_log2.xml tests/tests_log.xml + deactivate + " + ''' + // } + } + } + + stage('QAIC Feature Tests') { + steps { + // timeout(time: 80, unit: 'MINUTES') { + sh ''' + sudo docker exec ${BUILD_TAG} bash -c " + set -e + cd /efficient-transformers + . 
${VENV_PATH}/bin/activate + + mkdir -p $PWD/Non_qaic_feature + export QEFF_HOME=$PWD/Non_qaic_feature + + pytest tests \ + -m '(not cli) and (on_qaic) and (feature) and (not multimodal) and (not qnn) and (not finetune) and (not diffusion_models)' \ + --ignore tests/vllm \ + ${PYTEST_ARGS} \ + --junitxml=tests/tests_log2_feature.xml + + junitparser merge tests/tests_log2_feature.xml tests/tests_log.xml + deactivate + " + ''' + // } + } + } + } + } + stage('QAIC MultiModal Tests') { + steps { + // timeout(time: 120, unit: 'MINUTES') { + sh ''' + sudo docker exec ${BUILD_TAG} bash -c " + set -e + cd /efficient-transformers + . ${VENV_PATH}/bin/activate + + mkdir -p $PWD/Non_cli_qaic_multimodal + export QEFF_HOME=$PWD/Non_cli_qaic_multimodal + + pytest tests \ + -m '(not cli) and (on_qaic) and (multimodal) and (not qnn) and (not finetune) and (not diffusion_models)' \ + --ignore tests/vllm \ + ${PYTEST_ARGS} \ + --junitxml=tests/tests_log6.xml + + junitparser merge tests/tests_log6.xml tests/tests_log.xml + deactivate + " + ''' + // } + } + } + + stage('QAIC Diffusion Models Tests') { + steps { + timeout(time: 120, unit: 'MINUTES') { + sh ''' + sudo docker exec ${BUILD_TAG} bash -c " + set -e + cd /efficient-transformers + . ${VENV_PATH}/bin/activate + + mkdir -p $PWD/Non_cli_qaic_diffusion + export QEFF_HOME=$PWD/Non_cli_qaic_diffusion + export HF_HUB_CACHE=${HF_HUB_CACHE} + + pytest tests \ + -m '(not cli) and (on_qaic) and (diffusion_models) and (not wan) and (not qnn) and (not finetune)' \ + --ignore tests/vllm \ + ${PYTEST_ARGS} \ + --junitxml=tests/tests_log_diffusion.xml + + junitparser merge tests/tests_log_diffusion.xml tests/tests_log.xml + deactivate + " + ''' + } + } + } + + stage('CLI Inference Tests') { + steps { + timeout(time: 120, unit: 'MINUTES') { + sh ''' + sudo docker exec ${BUILD_TAG} bash -c " + set -e + cd /efficient-transformers + . 
${VENV_PATH}/bin/activate + + mkdir -p $PWD/cli + export QEFF_HOME=$PWD/cli + + pytest tests \ + -m '(cli and not qnn) and (not finetune)' \ + --ignore tests/vllm \ + ${PYTEST_ARGS} \ + --junitxml=tests/tests_log3.xml + + junitparser merge tests/tests_log3.xml tests/tests_log.xml + deactivate + " + ''' + } + } + } + stage('Finetune CLI Tests') { + steps { + timeout(time: 20, unit: 'MINUTES') { + sh ''' + sudo docker exec ${BUILD_TAG} bash -c " + set -e + cd /efficient-transformers + . ${VENV_PATH}/bin/activate + + # Install QAIC PyTorch integration + pip install /opt/qti-aic/integrations/torch_qaic/py310/torch_qaic-0.1.0-cp310-cp310-linux_x86_64.whl + pip install torch==2.9.0 torchvision==0.24.0 torchaudio==2.9.0 \ + --index-url https://download.pytorch.org/whl/cpu + + mkdir -p $PWD/cli_qaic_finetuning + export QEFF_HOME=$PWD/cli_qaic_finetuning + + pytest tests \ + -m '(cli) and (on_qaic) and (not qnn) and (not multimodal) and (finetune)' \ + --ignore tests/vllm \ + ${PYTEST_ARGS} \ + --junitxml=tests/tests_log_finetune.xml + + junitparser merge tests/tests_log_finetune.xml tests/tests_log.xml + deactivate + " + ''' + } + } + } + } + + post { + always { + script { + echo "========== Test Execution Summary ==========" + sh ''' + # Restore file ownership + sudo chown -R ${DOCKER_USER} . 
2>/dev/null || true + ''' + } + + junit testResults: 'tests/tests_log.xml', + allowEmptyResults: true, + keepLongStdio: true + + script { + sh ''' + # Cleanup Docker container + echo "Cleaning up Docker container: ${BUILD_TAG}" + sudo docker rm -f ${BUILD_TAG} 2>/dev/null || true + ''' + } + + cleanWs( + deleteDirs: true, + ) + + echo "Pipeline cleanup completed" + } + + success { + echo "✓ QEfficient Nightly Test Suite completed successfully" + // Optionally trigger downstream jobs here + // build job: 'qefficient_downstream_job', wait: false + } + + failure { + echo "✗ QEfficient Nightly Test Suite failed" + echo "Check logs above for detailed error information" + } + + unstable { + echo "⚠ QEfficient Nightly Test Suite produced unstable results" + echo "Some tests may have been skipped or failed" + } + } +} \ No newline at end of file diff --git a/tests/configs/causal_model_configs.json b/tests/configs/causal_model_configs.json index bf0fd642d..b17e57336 100644 --- a/tests/configs/causal_model_configs.json +++ b/tests/configs/causal_model_configs.json @@ -25,338 +25,6 @@ "vocab_size": 50257, "num_key_value_heads": 1 } - }, - { - "model_name": "allenai/OLMo-2-0425-1B", - "model_type": "olmo2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 100352, - "num_key_value_heads": 1 - } - }, - { - "model_name": "Salesforce/codegen-350M-mono", - "model_type": "codegen", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 4, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 51200, - "num_key_value_heads": 1, - "rotary_dim": 16 - } - }, - { - "model_name": "ibm-granite/granite-3.1-1b-a400m-base", - "model_type": "granitemoe", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - 
"intermediate_size": 256, - "vocab_size": 49155, - "num_key_value_heads": 1 - } - }, - { - "model_name": "microsoft/Phi-3-mini-4k-instruct", - "model_type": "phi3", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32064, - "num_key_value_heads": 1 - } - }, - { - "model_name": "tiiuae/falcon-7b", - "model_type": "falcon", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 65024, - "num_key_value_heads": 1 - } - }, - { - "model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507", - "model_type": "qwen3_moe", - "additional_params": { - "hidden_size": 256, - "intermediate_size": 256, - "max_position_embeddings": 128, - "max_window_layers": 48, - "moe_intermediate_size": 768, - "num_attention_heads": 2, - "num_experts": 4, - "num_experts_per_tok": 2, - "num_hidden_layers": 1, - "num_key_value_heads": 1, - "vocab_size": 151936 - } - }, - { - "model_name": "Qwen/Qwen2-0.5B", - "model_type": "qwen2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 151936, - "num_key_value_heads": 1 - } - }, - { - "model_name": "bigcode/starcoder2-3b", - "model_type": "starcoder2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 49152, - "num_key_value_heads": 1 - } - }, - { - "model_name": "Felladrin/Minueza-32M-Base", - "model_type": "mistral", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32002, - "num_key_value_heads": 1 - } - }, - { - "model_name": 
"wtang06/mpt-125m-c4", - "model_type": "mpt", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 50368 - } - }, - { - "model_name": "hakurei/gpt-j-random-tinier", - "model_type": "gptj", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 50400, - "num_key_value_heads": 1, - "rotary_dim": 16 - } - }, - { - "model_name": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "model_type": "mixtral", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32000, - "num_key_value_heads": 1 - } - }, - { - "model_name": "meta-llama/Llama-3.2-1B", - "model_type": "llama", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 128256, - "num_key_value_heads": 1, - "rope_scaling": { - "factor": 32.0, - "high_freq_factor": 4.0, - "low_freq_factor": 1.0, - "original_max_position_embeddings": 8192, - "rope_type": "llama3" - } - } - }, - { - "model_name": "unsloth/gemma-2b", - "model_type": "gemma", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 256000, - "num_key_value_heads": 1 - } - }, - { - "model_name": "unsloth/gemma-2-2b", - "model_type": "gemma2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 256000, - "num_key_value_heads": 1 - } - }, - { - "model_name": "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", - "model_type": "llama", - 
"additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32003 - } - }, - { - "model_name": "TheBloke/Llama-2-7B-GPTQ", - "model_type": "llama", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32000 - } - }, - { - "model_name": "ibm-granite/granite-20b-code-base", - "model_type": "gpt_bigcode", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 49152, - "num_key_value_heads": 1, - "activation_function": "gelu", - "architectures": [ - "GPTBigCodeForCausalLM" - ] - } - }, - { - "model_name": "neuralmagic/Llama-3.2-3B-Instruct-FP8", - "model_type": "llama", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 128256 - } - }, - { - "model_name": "neuralmagic/Qwen2-0.5B-Instruct-FP8", - "model_type": "qwen2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 2, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 151936 - } - }, - { - "model_name": "ibm-granite/granite-3.1-2b-instruct", - "model_type": "granite", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 49155, - "num_key_value_heads": 1 - } - }, - { - "model_name": "ibm-granite/granite-guardian-3.1-2b", - "model_type": "granite", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 49155, - 
"num_key_value_heads": 1 - } - }, - { - "model_name": "hpcai-tech/grok-1", - "model_type": null, - "additional_params":{ - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 131072, - "num_key_value_heads": 1 - } - }, - { - "model_name": "Snowflake/Llama-3.1-SwiftKV-8B-Instruct", - "model_type": null, - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 2, - "num_attention_heads": 2, - "hidden_size": 256, - "intermediate_size": 256, - "vocab_size": 128256, - "num_key_value_layers": 1, - "num_key_value_heads": 1, - "rope_scaling": { - "factor": 8.0, - "high_freq_factor": 4.0, - "low_freq_factor": 1.0, - "original_max_position_embeddings": 8192, - "rope_type": "llama3" - } - } } ], diff --git a/tests/conftest.py b/tests/conftest.py index d1f553cda..8e024360f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,6 +13,55 @@ from QEfficient.utils.constants import QEFF_MODELS_DIR from QEfficient.utils.logging_utils import logger +_QUICKCHECK_FILE = "tests/test_model_quickcheck.py" +_QUICKCHECK_SUMMARY = {} +_QUICKCHECK_META = { + "test_causal_lm_cpu_runtime_parity_with_api_runner": ( + "Causal LM", + "Full parity: HF PyTorch vs QEff PyTorch vs ORT tokens", + ), + "test_vlm_text_side_runtime_parity_and_full_export": ( + "VLM", + "Text-side full parity + full VLM export smoke", + ), + "test_vlm_export_smoke_additional_models": ( + "VLM", + "Export smoke with text-side fallback when needed", + ), + "test_text_embedding_cpu_parity_and_export": ( + "Text Embedding", + "Tensor parity: HF vs QEff PyTorch vs ORT", + ), + "test_audio_embedding_ctc_cpu_parity_and_export": ( + "Audio CTC", + "Logits parity: HF vs ORT + export", + ), + "test_seq_classification_cpu_parity_and_export": ( + "Sequence Classification", + "Logits parity: HF vs QEff PyTorch vs ORT", + ), + "test_whisper_export_smoke": ( + "Whisper", + "Export smoke + retained-state 
outputs check", + ), + "test_causal_subfunction_export_smoke": ( + "Causal LM", + "Subfunction export check (with/without QEffGPT2Block)", + ), + "test_causal_subfunction_export_smoke_all_models": ( + "Causal LM", + "Full parity: HF PyTorch vs QEff PyTorch vs ORT tokens (subfunctions)", + ), + "test_prefix_caching_continuous_batching_export_and_ort_smoke": ( + "Prefix Caching", + "Continuous-batching export structural checks", + ), + "test_awq_export_smoke": ( + "AWQ", + "Export smoke + MatMulNBits presence check", + ), +} + def qeff_models_clean_up(): if os.path.exists(QEFF_MODELS_DIR): @@ -42,3 +91,32 @@ def pytest_sessionfinish(session, exitstatus): if inside_worker is None: qeff_models_clean_up() logger.info("...PYTEST Session Ended.") + + +def pytest_runtest_logreport(report): + if _QUICKCHECK_FILE not in report.nodeid: + return + + if report.when == "call": + _QUICKCHECK_SUMMARY[report.nodeid] = report.outcome + return + + if report.when == "setup" and report.outcome == "skipped": + _QUICKCHECK_SUMMARY.setdefault(report.nodeid, report.outcome) + + +def pytest_terminal_summary(terminalreporter): + if not _QUICKCHECK_SUMMARY: + return + + terminalreporter.section("Quickcheck Coverage Summary", sep="=") + header = f"{'Status':7} {'Test Case':58} {'Category':24} Validation" + terminalreporter.write_line(header) + terminalreporter.write_line("-" * len(header)) + + for nodeid in sorted(_QUICKCHECK_SUMMARY): + test_case = nodeid.split("::", 1)[1] + base_name = test_case.split("[", 1)[0] + category, validation = _QUICKCHECK_META.get(base_name, ("Other", "N/A")) + status = _QUICKCHECK_SUMMARY[nodeid].upper() + terminalreporter.write_line(f"{status:7} {test_case:58} {category:24} {validation}") diff --git a/tests/transformers/__init__.py b/tests/transformers/__init__.py new file mode 100644 index 000000000..d647b73a6 --- /dev/null +++ b/tests/transformers/__init__.py @@ -0,0 +1,6 @@ +# ----------------------------------------------------------------------------- +# +# 
Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- diff --git a/tests/transformers/models/__init__.py b/tests/transformers/models/__init__.py new file mode 100644 index 000000000..d647b73a6 --- /dev/null +++ b/tests/transformers/models/__init__.py @@ -0,0 +1,6 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- diff --git a/tests/transformers/models/check_model_results.py b/tests/transformers/models/check_model_results.py new file mode 100644 index 000000000..73f198001 --- /dev/null +++ b/tests/transformers/models/check_model_results.py @@ -0,0 +1,179 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import json +import os +from datetime import datetime + +import numpy as np + + +def parse_exec_info_metrics(exec_info_str): + """ + Parse performance metrics from exec_info string. 
+ + :exec_info_str: str - The exec_info string containing performance stats + :return: dict - Dictionary containing parsed metrics + """ + import re + + metrics = { + "prefill_time_sec": None, + "decode_throughput_tokens_per_sec": None, + "total_throughput_tokens_per_sec": None, + "e2e_inference_time_sec": None, + } + + exec_info_text = str(exec_info_str) + + # Parse Average Prefill time (TTFT) + if "Average Prefill time" in exec_info_text or "TTFT" in exec_info_text: + match = re.search(r"Average Prefill time.*?is=\s*([\d.]+)\s*sec", exec_info_text) + if match: + metrics["prefill_time_sec"] = float(match.group(1)) + + # Parse Decode throughput + if "Decode" in exec_info_text: + match = re.search(r"Decode\s+is=\s*([\d.]+)\s*tokens/sec", exec_info_text) + if match: + metrics["decode_throughput_tokens_per_sec"] = float(match.group(1)) + + # Parse Total throughput + if "Total is=" in exec_info_text: + match = re.search(r"Total\s+is=\s*([\d.]+)\s*tokens/sec", exec_info_text) + if match: + metrics["total_throughput_tokens_per_sec"] = float(match.group(1)) + + # Parse Total E2E inference time + if "Total (E2E) inference time" in exec_info_text: + match = re.search(r"Total \(E2E\) inference time\s+is=\s*([\d.]+)\s*sec", exec_info_text) + if match: + metrics["e2e_inference_time_sec"] = float(match.group(1)) + + return metrics + + +def dump_and_compare_results( + model_name, + compile_params, + json_file_path, + cloud_ai_100_tokens, + exec_info=None, + pytorch_hf_tokens=None, + pytorch_kv_tokens=None, + ort_tokens=None, +): + """ + Function to dump the test results to JSON file and compare the performance and output results with previous runs if available + + :model_name: str + :pytorch_hf_tokens: list + :pytorch_kv_tokens: list + :ort_tokens: list + :cloud_ai_100_tokens: list + :exec_info: object + :compile_params: dict + :return None + """ + + current_logs_dir = os.environ.get("NIGHTLY_LOG_DIR") + if current_logs_dir is None: + current_logs_dir = 
os.path.expanduser("~/.cache/Nightly_Logs/build_tag") + os.makedirs(current_logs_dir, exist_ok=True) + # original_logs_dir = Path(current_logs_dir).parent + original_logs_dir = current_logs_dir + current_results_json_file_path = os.path.join(current_logs_dir, json_file_path) + original_results_json_file_path = os.path.join(original_logs_dir, json_file_path) + + def convert_to_serializable(obj): + if isinstance(obj, np.ndarray): + return obj.tolist() + elif isinstance(obj, np.integer): + return int(obj) + elif isinstance(obj, np.floating): + return float(obj) + elif isinstance(obj, list): + return [convert_to_serializable(item) for item in obj] + elif isinstance(obj, dict): + return {k: convert_to_serializable(v) for k, v in obj.items()} + return obj + + exec_info_metrics = parse_exec_info_metrics(exec_info) + + test_data = { + "model_name": model_name, + "timestamp": datetime.now().isoformat(), + "compile_params": compile_params, + "pytorch_hf_tokens": convert_to_serializable(pytorch_hf_tokens) if pytorch_hf_tokens is not None else None, + "pytorch_kv_tokens": convert_to_serializable(pytorch_kv_tokens), + "ort_tokens": convert_to_serializable(ort_tokens), + "cloud_ai_100_tokens": convert_to_serializable(cloud_ai_100_tokens), + "exec_info_metrics": exec_info_metrics, + "exec_info_raw_string": str(exec_info), + } + + # Load existing results if file exists + all_results = {} + if os.path.exists(current_results_json_file_path): + with open(current_results_json_file_path, "r") as f: + all_results = json.load(f) + print(f"Loaded existing model results from {current_results_json_file_path}") + else: + with open(current_results_json_file_path, "w", encoding="utf-8") as f: + json.dump({}, f) + print(f"Created new results file at {current_results_json_file_path}") + + model_name_safe = model_name.replace("/", "_").replace("-", "_") + all_results[model_name_safe] = test_data + + with open(current_results_json_file_path, "w") as f: + json.dump(all_results, f, indent=4, 
default=str) + print(f"Successfully saved test results to {current_results_json_file_path}") + + with open(original_results_json_file_path, "r") as f: + previous_results = json.load(f) + print(f"Loaded Previous model results from {original_results_json_file_path}") + + previous_data = previous_results[model_name_safe] + + # Compare performance metrics with 5% tolerance + previous_metrics = previous_data.get("exec_info_metrics", {}) + current_metrics = exec_info_metrics + + for metric_name in [ + "prefill_time_sec", + "decode_throughput_tokens_per_sec", + "total_throughput_tokens_per_sec", + "e2e_inference_time_sec", + ]: + prev_val = previous_metrics[metric_name] + curr_val = current_metrics[metric_name] + + if prev_val is not None and curr_val is not None and prev_val != 0: + percent_diff = abs((curr_val - prev_val) / prev_val) * 100 + assert percent_diff <= 5.0, ( + f"Performance metric {metric_name} exceeds 5% tolerance: " + f"previous={prev_val}, current={curr_val}, diff={percent_diff:.2f}%" + ) + print(f"✓ {metric_name}: {percent_diff:.2f}% difference (within 5% tolerance)") + + # Compare output tokens using Mean Absolute Deviation (MAD) with 10^-2 tolerance + previous_tokens = previous_data.get("cloud_ai_100_tokens", None) + + if previous_tokens is not None and isinstance(previous_tokens, list): + if previous_tokens and isinstance(previous_tokens[0], str): + print("⊘ Output tokens: Skipping Tokens check (previous data contains strings)") + else: + prev_tokens_arr = np.array(previous_tokens, dtype=np.float32) + curr_tokens_arr = np.array(cloud_ai_100_tokens, dtype=np.float32) + + mad = np.mean(np.abs(curr_tokens_arr - prev_tokens_arr)) + tolerance = 1e-2 + + assert mad <= tolerance, f"Output tokens MAD exceeds 10^-2 tolerance: MAD={mad:.6f}, tolerance={tolerance}" + print(f"✓ Output tokens MAD: {mad:.6f} (within 10^-2 tolerance)") + return True diff --git a/tests/transformers/models/image_text_to_text/test_continuous_batching.py 
b/tests/transformers/models/image_text_to_text/test_continuous_batching.py index c1a31eaa3..d472b1ce4 100644 --- a/tests/transformers/models/image_text_to_text/test_continuous_batching.py +++ b/tests/transformers/models/image_text_to_text/test_continuous_batching.py @@ -90,7 +90,7 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100_CB( ctx_len: int, max_gen_len: int = 20, batch_size: int = 1, - n_layer: int = 1, + n_layer: int = -1, kv_offload: bool = False, num_devices: int = 1, enable_qnn: Optional[bool] = False, @@ -277,12 +277,12 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100_CB( compile_kwargs = { "num_cores": 16, - "num_devices": num_devices, "prefill_seq_len": prompt_len, "ctx_len": ctx_len, "batch_size": batch_size, "full_batch_size": full_batch_size, "mxfp6_matmul": False, + "num_devices": 4, } if is_intern_model: @@ -375,7 +375,7 @@ def test_image_text_to_text_pytorch_vs_ai100_continuous_batching(model_name, kv_ img_size=img_size, image_urls=model_config_dict[model_name]["img_url_list"], queries=model_config_dict[model_name]["text_prompt_list"], - n_layer=model_config_dict[model_name]["num_layers"], + # n_layer=model_config_dict[model_name]["num_layers"], batch_size=model_config_dict[model_name]["batch_size"], full_batch_size=model_config_dict[model_name]["full_batch_size"], kv_offload=kv_offload, diff --git a/tests/transformers/models/image_text_to_text/test_image_text_to_text_models.py b/tests/transformers/models/image_text_to_text/test_image_text_to_text_models.py index a2c72ba7a..b647a1a7f 100644 --- a/tests/transformers/models/image_text_to_text/test_image_text_to_text_models.py +++ b/tests/transformers/models/image_text_to_text/test_image_text_to_text_models.py @@ -115,7 +115,7 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( ctx_len: int, max_gen_len: int = 20, batch_size: int = 1, - n_layer: int = 1, + n_layer: int = -1, kv_offload: bool = False, num_devices: int = 1, enable_qnn: Optional[bool] = False, @@ 
-302,12 +302,12 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( # assert (pytorch_hf_tokens == ort_tokens).all(), "Tokens don't match for pytorch HF output and ORT output" compile_kwargs = { - "num_devices": num_devices, "prefill_seq_len": prompt_len, "ctx_len": ctx_len, "mxfp6": False, "enable_qnn": enable_qnn, "qnn_config": qnn_config, + "num_devices": 4, } if is_intern_model: @@ -367,7 +367,7 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(model_name, kv_offload img_size=img_size, img_url=model_config_dict[model_name]["img_url"], query=model_config_dict[model_name]["text_prompt"], - n_layer=model_config_dict[model_name]["num_layers"], + # n_layer=model_config_dict[model_name]["num_layers"], batch_size=model_config_dict[model_name]["batch_size"], kv_offload=kv_offload, ) diff --git a/tests/transformers/models/test_audio_embedding_models.py b/tests/transformers/models/test_audio_embedding_models.py index 998546853..669dd272c 100644 --- a/tests/transformers/models/test_audio_embedding_models.py +++ b/tests/transformers/models/test_audio_embedding_models.py @@ -22,7 +22,8 @@ from QEfficient.utils import hf_download from QEfficient.utils._utils import create_json, load_hf_processor from QEfficient.utils.constants import WAV2VEC2_MAX_SEQ_LEN, QnnConstants -from QEfficient.utils.device_utils import get_available_device_id + +from .check_model_results import dump_and_compare_results CONFIG_PATH = "tests/configs/embedding_model_configs.json" @@ -129,9 +130,10 @@ def run_ctc_ort(onnx_path, config, processor: AutoProcessor, inputs: np.ndarray, def check_ctc_pytorch_vs_kv_vs_ort_vs_ai100( model_name: str, - n_layer: int = 1, + n_layer: int = -1, enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, + compare_results: Optional[bool] = False, ): """ Validate the PyTorch model, the PyTorch model after ONNX model and the Cloud AI 100 model @@ -162,10 +164,8 @@ def check_ctc_pytorch_vs_kv_vs_ort_vs_ai100( predicted_ids = 
torch.argmax(ort_tokens, dim=-1) ort_output = processor.batch_decode(predicted_ids) assert pytorch_output == ort_output, "Tokens don't match for pytorch output and ORT output." - if not get_available_device_id(): - pytest.skip("No available devices to run model on Cloud AI 100") + qeff_model.compile( - num_cores=16, batch_size=batch_size, enable_qnn=enable_qnn, qnn_config=qnn_config, @@ -173,8 +173,25 @@ def check_ctc_pytorch_vs_kv_vs_ort_vs_ai100( cloud_ai_100_output = qeff_model.generate(processor, data) assert pytorch_output == cloud_ai_100_output, "Tokens don't match for pytorch output and Cloud AI 100 output." assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) + if compare_results is False: + return + + compile_params = { + "batch_size": batch_size, + "enable_qnn": enable_qnn, + "qnn_config": qnn_config, + } + assert dump_and_compare_results( + model_name, + compile_params, + "ctc_model_results.json", + cloud_ai_100_output, + pytorch_hf_tokens=pytorch_output, + ort_tokens=ort_output, + ) +@pytest.mark.custom_layers @pytest.mark.on_qaic @pytest.mark.llm_model @pytest.mark.parametrize("model_name", test_models) @@ -187,6 +204,22 @@ def test_ctc_pytorch_vs_kv_vs_ort_vs_ai100(model_name): check_ctc_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=4) +@pytest.mark.full_model +@pytest.mark.on_qaic +@pytest.mark.llm_model +@pytest.mark.parametrize("model_name", test_models) +def test_full_ctc_pytorch_vs_kv_vs_ort_vs_ai100(model_name): + """ + Test function to validate the PyTorch model, the ONNX model, and the Cloud AI 100 model.
+ ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + """ + check_ctc_pytorch_vs_kv_vs_ort_vs_ai100( + model_name=model_name, + compare_results=True, + ) + + @pytest.mark.on_qaic @pytest.mark.llm_model @pytest.mark.qnn diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index a87ac8efc..72501c387 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -25,6 +25,8 @@ from QEfficient.utils.run_utils import ApiRunner from QEfficient.utils.test_utils import ModelConfig +from .check_model_results import dump_and_compare_results + CONFIG_PATH = "tests/configs/causal_model_configs.json" with open(CONFIG_PATH, "r") as f: @@ -72,14 +74,15 @@ def get_custom_n_layers(model_name): :return n_layer """ - if model_name in {"microsoft/Phi-3-mini-4k-instruct", "neuralmagic/Qwen2-0.5B-Instruct-FP8", "openai/gpt-oss-20b"}: - return 2 - elif model_name in ModelConfig.SWIFTKV_MODELS: - return None - return 1 + # if model_name in {"microsoft/Phi-3-mini-4k-instruct", "neuralmagic/Qwen2-0.5B-Instruct-FP8", "openai/gpt-oss-20b"}: + # return 2 + # elif model_name in ModelConfig.SWIFTKV_MODELS: + # return None + # return 1 + return None -def load_causal_lm_model(model_name, n_layer=1, config=None): +def load_causal_lm_model(model_name, n_layer=None, config=None): """ Function to load model from huggingface and transform to KV model -------- @@ -130,6 +133,103 @@ def load_causal_lm_model(model_name, n_layer=1, config=None): return model_hf, params +def check_full_causal_lm_and_compare_results(model_name): + """ + Function to check the full model and compare results between PyTorch, ONNX Runtime and Cloud AI 100. Compare the performance and tokens with the previous results.
+ + :model_name: str + + :return None + """ + prompt_len: int = Constants.PROMPT_LEN + ctx_len: int = Constants.CTX_LEN + prefill_only = None + retain_full_kv = None + pytorch_hf_tokens = None + pytorch_kv_tokens = None + + model_hf, _ = load_causal_lm_model(model_name) + tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name) + config = model_hf.config + batch_size = len(Constants.INPUT_STR) + api_runner = ApiRunner( + batch_size, + tokenizer, + config, + Constants.INPUT_STR, + Constants.PROMPT_LEN, + Constants.CTX_LEN, + ) + + if model_name not in ModelConfig.SWIFTKV_MODELS and model_name not in ModelConfig.EXTERNAL_MODELS: + pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model_hf) + print(f"HF PyTorch tokens: {pytorch_hf_tokens}") + + qeff_model = QEFFAutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path=model_name, + ) + pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model) + print(f"KV PyTorch tokens: {pytorch_kv_tokens}") + + if model_name not in ModelConfig.SWIFTKV_MODELS and model_name not in ModelConfig.EXTERNAL_MODELS: + assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), ( + "Tokens don't match for HF PyTorch model output and KV PyTorch model output" + ) + onnx_model_path = qeff_model.export() + ort_tokens = api_runner.run_kv_model_on_ort( + onnx_model_path, + ) + print(f"ONNX tokens: {ort_tokens}") + gen_len = ort_tokens.shape[-1] + + assert (pytorch_kv_tokens == ort_tokens).all(), "Tokens don't match for ONNXRT output and PyTorch output." 
+ + qpc_path = qeff_model.compile( + prefill_seq_len=prompt_len, + ctx_len=ctx_len, + num_devices=1, + mxfp6=False, + aic_enable_depth_first=False, + prefill_only=prefill_only, + retain_full_kv=retain_full_kv, + ) + exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR) + print(f"exec_info: {exec_info}") + print(f"Cloud AI 100 tokens: {exec_info.generated_ids}") + cloud_ai_100_tokens = exec_info.generated_ids[0][ + :, :gen_len + ] # Because we always run for single input and single batch size + if prefill_only: + assert (ort_tokens[0][0] == cloud_ai_100_tokens[0][0]).all(), ( + "prefill run output tokens don't match for ONNXRT output and Cloud AI 100 output." + ) + else: + assert (ort_tokens == cloud_ai_100_tokens).all(), ( + "Tokens don't match for ONNXRT output and Cloud AI 100 output." + ) + assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json")) + + compile_params = { + "prefill_seq_len": prompt_len, + "ctx_len": ctx_len, + "num_devices": 1, + "mxfp6": False, + "aic_enable_depth_first": False, + "prefill_only": prefill_only, + "retain_full_kv": retain_full_kv, + } + assert dump_and_compare_results( + model_name, + compile_params, + "causal_lm_model_results.json", + cloud_ai_100_tokens, + exec_info, + pytorch_hf_tokens, + pytorch_kv_tokens, + ort_tokens, + ) + + def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( model_name: str, prompt_len: int = Constants.PROMPT_LEN, @@ -193,7 +293,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( qpc_path = qeff_model.compile( prefill_seq_len=prompt_len, ctx_len=ctx_len, - num_cores=14, + num_devices=4, mxfp6=False, aic_enable_depth_first=False, num_speculative_tokens=num_speculative_tokens, @@ -268,7 +368,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( qpc_path = qeff_model.compile( prefill_seq_len=prompt_len, ctx_len=ctx_len, - num_cores=14, + num_devices=4, mxfp6=False, aic_enable_depth_first=False, batch_size=batch_size, @@ -330,8 +430,8 @@ def 
test_causal_lm_export_with_deprecated_api(model_name): ) +@pytest.mark.dummy_model @pytest.mark.on_qaic -@pytest.mark.regular @pytest.mark.llm_model @pytest.mark.parametrize("model_name", test_models_causal) def test_custom_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): @@ -349,7 +449,7 @@ def test_custom_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, config=hf_config) -@pytest.mark.nightly +@pytest.mark.custom_layers @pytest.mark.on_qaic @pytest.mark.llm_model @pytest.mark.parametrize("model_name", test_models_causal) @@ -364,6 +464,16 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=n_layer) +@pytest.mark.full_model +@pytest.mark.on_qaic +@pytest.mark.llm_model +@pytest.mark.parametrize("model_name", test_models_causal) +def test_full_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): + if model_name in ModelConfig.FULL_MODEL_TESTS_TO_SKIP: + pytest.skip(f"Skipping full model test for {model_name} due to resource constraints.") + check_full_causal_lm_and_compare_results(model_name) + + @pytest.mark.nightly @pytest.mark.on_qaic @pytest.mark.parametrize("retain_full_kv", [True, False]) diff --git a/tests/transformers/models/test_disagg_mode.py b/tests/transformers/models/test_disagg_mode.py index 537ecd0cc..74e768d5e 100644 --- a/tests/transformers/models/test_disagg_mode.py +++ b/tests/transformers/models/test_disagg_mode.py @@ -50,7 +50,7 @@ def test_disagg_mode_prefill(model_id, prompt): padded_len = num_chunks * PREFILL_SEQ_LEN # Convert to a multiple of prompt_len replace_transformers_quantizers() - model = AutoModelForCausalLM.from_pretrained(model_id, num_hidden_layers=2) + model = AutoModelForCausalLM.from_pretrained(model_id) config = model.config inputs = tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) inputs["position_ids"] = 
np.where(inputs.pop("attention_mask"), np.arange(padded_len), -1) @@ -62,7 +62,7 @@ def test_disagg_mode_prefill(model_id, prompt): undo_transformers_quantizers() - qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id, num_hidden_layers=2) + qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id) qeff_model.prefill(True) config = qeff_model.model.config inputs = tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) @@ -87,10 +87,9 @@ def test_disagg_mode_prefill(model_id, prompt): prefill_qpc_path = qeff_model.compile( prefill_seq_len=PREFILL_SEQ_LEN, ctx_len=CTX_LEN, - num_cores=16, + num_devices=4, mxfp6_matmul=False, mxint8_kv_cache=False, - num_devices=1, mos=1, aic_enable_depth_first=True, num_speculative_tokens=None, @@ -212,7 +211,9 @@ def test_disagg_mode_prefill_only_and_decode_only(model_id, prompt): padded_len = num_chunks * PREFILL_SEQ_LEN # Convert to a multiple of prompt_len replace_transformers_quantizers() - model = AutoModelForCausalLM.from_pretrained(model_id, num_hidden_layers=2) + model = AutoModelForCausalLM.from_pretrained( + model_id, + ) config = model.config inputs = tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) inputs["position_ids"] = np.where(inputs.pop("attention_mask"), np.arange(padded_len), -1) @@ -246,7 +247,7 @@ def test_disagg_mode_prefill_only_and_decode_only(model_id, prompt): undo_transformers_quantizers() - prefill_qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id, num_hidden_layers=2) + prefill_qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id) prefill_qeff_model.prefill(enable=True) config = prefill_qeff_model.model.config past_key_values = [] @@ -264,7 +265,9 @@ def test_disagg_mode_prefill_only_and_decode_only(model_id, prompt): # Check our pytorch implementation assert (prefill_qeff_out.logits - orig_out.logits[:, -1, :]).abs().max() < 1e-4 - decode_qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id, 
num_hidden_layers=2) + decode_qeff_model = QEFFAutoModelForCausalLM.from_pretrained( + model_id, + ) decode_qeff_model.prefill(enable=False) qeff_out = prefill_qeff_out @@ -293,10 +296,9 @@ def test_disagg_mode_prefill_only_and_decode_only(model_id, prompt): prefill_qpc_path = prefill_qeff_model.compile( prefill_seq_len=PREFILL_SEQ_LEN, ctx_len=CTX_LEN, - num_cores=16, + num_devices=4, mxfp6_matmul=False, mxint8_kv_cache=False, - num_devices=1, mos=1, aic_enable_depth_first=True, num_speculative_tokens=None, @@ -316,10 +318,9 @@ def test_disagg_mode_prefill_only_and_decode_only(model_id, prompt): decode_qpc_path = decode_qeff_model.compile( prefill_seq_len=1, ctx_len=CTX_LEN, - num_cores=16, + num_devices=4, mxfp6_matmul=False, mxint8_kv_cache=False, - num_devices=1, mos=1, aic_enable_depth_first=True, num_speculative_tokens=None, @@ -375,18 +376,17 @@ def test_disagg_mode_prefill_only_and_decode_only(model_id, prompt): def test_disagg_mode_prefix_caching(model_id, prompt): PREFILL_SEQ_LEN = 128 CTX_LEN = 128 * 3 - config = AutoConfig.from_pretrained(model_id, num_hidden_layers=2) - prefill_qeff_model = QEFFAutoModelForCausalLM.from_pretrained( - model_id, num_hidden_layers=2, continuous_batching=True + config = AutoConfig.from_pretrained( + model_id, ) + prefill_qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id, continuous_batching=True) prefill_qeff_model.prefill(enable=True, enable_chunking=True) prefill_qpc_path = prefill_qeff_model.compile( prefill_seq_len=PREFILL_SEQ_LEN, ctx_len=CTX_LEN, - num_cores=16, + num_devices=4, mxfp6_matmul=False, mxint8_kv_cache=False, - num_devices=1, mos=1, aic_enable_depth_first=True, num_speculative_tokens=None, @@ -396,17 +396,14 @@ def test_disagg_mode_prefix_caching(model_id, prompt): kv_cache_batch_size=2, ) - decode_qeff_model = QEFFAutoModelForCausalLM.from_pretrained( - model_id, num_hidden_layers=2, continuous_batching=True - ) + decode_qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id, 
continuous_batching=True) decode_qeff_model.prefill(enable=False) decode_qpc_path = decode_qeff_model.compile( prefill_seq_len=1, ctx_len=CTX_LEN, - num_cores=16, + num_devices=4, mxfp6_matmul=False, mxint8_kv_cache=False, - num_devices=1, mos=1, aic_enable_depth_first=True, num_speculative_tokens=None, @@ -437,7 +434,9 @@ def test_disagg_mode_prefix_caching(model_id, prompt): def prefix_caching_inference(model_id, prefill_qpc_path, decode_qpc_path, prompt, decode_batch_id): PREFILL_SEQ_LEN = 128 tokenizer = AutoTokenizer.from_pretrained(model_id) - config = AutoConfig.from_pretrained(model_id, num_hidden_layers=2) + config = AutoConfig.from_pretrained( + model_id, + ) inputs = tokenizer(prompt, return_tensors="np", padding=True) padded_len = inputs["input_ids"].shape[1] num_chunks = -(padded_len // -PREFILL_SEQ_LEN) # ceil divide without float diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py index 7eb09d911..b4aacc757 100644 --- a/tests/transformers/models/test_embedding_models.py +++ b/tests/transformers/models/test_embedding_models.py @@ -19,6 +19,8 @@ from QEfficient.utils._utils import create_json from QEfficient.utils.constants import Constants, QnnConstants +from .check_model_results import dump_and_compare_results + CONFIG_PATH = "tests/configs/embedding_model_configs.json" with open(CONFIG_PATH, "r") as f: @@ -29,22 +31,31 @@ def check_embed_pytorch_vs_ort_vs_ai100( model_name: str, seq_len: int = Constants.CTX_LEN, - n_layer: int = 1, + n_layer: int = -1, enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, pooling: Optional[str] = None, + compare_results: Optional[bool] = False, ): # Prepare input tokenizer = AutoTokenizer.from_pretrained(model_name) inputs = tokenizer("My name is", return_tensors="pt") # Original PyTorch model - pt_model = AutoModel.from_pretrained( - model_name, - num_hidden_layers=n_layer, - attn_implementation="eager", - trust_remote_code=True, - ) + 
pt_model = None + if n_layer == -1: + pt_model = AutoModel.from_pretrained( + model_name, + attn_implementation="eager", + trust_remote_code=True, + ) + else: + pt_model = AutoModel.from_pretrained( + model_name, + num_hidden_layers=n_layer, + attn_implementation="eager", + trust_remote_code=True, + ) # Original PyTorch model output pt_outputs = pt_model(**inputs) @@ -85,7 +96,6 @@ def check_embed_pytorch_vs_ort_vs_ai100( assert mad <= 10**-5, f"MAD is too high for onnx and Pytorch: {mad}" qeff_model.compile( - num_cores=14, enable_qnn=enable_qnn, qnn_config=qnn_config, ) @@ -100,6 +110,20 @@ def check_embed_pytorch_vs_ort_vs_ai100( assert mad <= 10**-2, f"MAD is too high for onnx and Pytorch: {mad}" assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) + if compare_results is False: + return + + compile_params = {"enable_qnn": enable_qnn, "qnn_config": qnn_config, "pooling": pooling, "seq_len": seq_len} + assert dump_and_compare_results( + model_name, + compile_params, + "embedding_model_results.json", + qeff_ai100_embeddings, + pytorch_hf_tokens=pt_embeddings, + pytorch_kv_tokens=qeff_pt_embeddings, + ort_tokens=onnx_outputs[0], + ) + @pytest.mark.on_qaic @pytest.mark.llm_model @@ -131,6 +155,19 @@ def test_embed_model_pytorch_vs_onnx_vs_ai100_multiple_seq_len(model): check_embed_pytorch_vs_ort_vs_ai100(model_name=model["model_name"], seq_len=[32, 20], n_layer=1) +@pytest.mark.full_model +@pytest.mark.on_qaic +@pytest.mark.llm_model +@pytest.mark.parametrize("model", embed_test_models) +def test_full_embed_model_pytorch_vs_onnx_vs_ai100_pooling(model): + """ + Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output with multiple seq_len. 
+ """ + check_embed_pytorch_vs_ort_vs_ai100( + model_name=model["model_name"], seq_len=32, pooling=model["pooling"], compare_results=True + ) + + ########## QNN TESTS ############## diff --git a/tests/transformers/models/test_seq_classification.py b/tests/transformers/models/test_seq_classification.py index d1c9cd84e..d2f3ec5ea 100644 --- a/tests/transformers/models/test_seq_classification.py +++ b/tests/transformers/models/test_seq_classification.py @@ -6,7 +6,7 @@ # ----------------------------------------------------------------------------- import os -from typing import List, Union +from typing import List, Optional, Union import numpy as np import pytest @@ -15,12 +15,16 @@ from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForSequenceClassification +from .check_model_results import dump_and_compare_results + seq_classification_test_models = [ "meta-llama/Llama-Prompt-Guard-2-22M", ] -def check_seq_classification_pytorch_vs_ai100(model_name: str, seq_len: Union[int, List[int]] = 32, n_layer: int = 1): +def check_seq_classification_pytorch_vs_ai100( + model_name: str, seq_len: Union[int, List[int]] = 32, n_layer: int = -1, compare_results: Optional[bool] = False +): """ Validate the PyTorch model and the Cloud AI 100 model for sequence classification. 
@@ -40,12 +44,20 @@ def check_seq_classification_pytorch_vs_ai100(model_name: str, seq_len: Union[in inputs = tokenizer(test_text, return_tensors="pt") # Run PyTorch model - pt_model = AutoModelForSequenceClassification.from_pretrained( - model_name, - num_hidden_layers=n_layer, - attn_implementation="eager", - trust_remote_code=True, - ) + pt_model = None + if n_layer == -1: + pt_model = AutoModelForSequenceClassification.from_pretrained( + model_name, + attn_implementation="eager", + trust_remote_code=True, + ) + else: + pt_model = AutoModelForSequenceClassification.from_pretrained( + model_name, + num_hidden_layers=n_layer, + attn_implementation="eager", + trust_remote_code=True, + ) pt_model.eval() with torch.no_grad(): @@ -56,7 +68,6 @@ def check_seq_classification_pytorch_vs_ai100(model_name: str, seq_len: Union[in # Create QEff model and compile qeff_model = QEFFAutoModelForSequenceClassification(pt_model) qpc_path = qeff_model.compile( - num_cores=16, seq_len=seq_len, batch_size=1, num_devices=1, @@ -83,8 +94,25 @@ def check_seq_classification_pytorch_vs_ai100(model_name: str, seq_len: Union[in # Print final result print(f"MAD (PyTorch vs AI100): {mad_pt_ai100:.2e}") + if compare_results is False: + return + + compile_params = { + "seq_len": seq_len, + "batch_size": 1, + "num_devices": 1, + "mxfp6_matmul": False, + } + assert dump_and_compare_results( + model_name, + compile_params, + "seq_classification_model_results.json", + ai100_logits.numpy(), + pytorch_hf_tokens=pt_logits.numpy(), + ) +@pytest.mark.custom_layers @pytest.mark.on_qaic @pytest.mark.parametrize("model_name", seq_classification_test_models) def test_seq_classification_pytorch_vs_ai100(model_name): @@ -120,3 +148,23 @@ def test_seq_classification_multiple_seq_len(model_name): seq_len=[32, 64, 128], n_layer=1, ) + + +@pytest.mark.full_model +@pytest.mark.on_qaic +@pytest.mark.parametrize("model_name", seq_classification_test_models) +def 
test_full_seq_classification_pytorch_vs_ai100(model_name): + """ + Test function to validate the full sequence classification model on Cloud AI 100. + + This test ensures that: + 1. The full (all-layer) model compiles successfully + 2. Cloud AI 100 logits match PyTorch logits within tolerance + 3. Performance metrics stay within 5% of previously saved results + 4. Output logits stay within MAD tolerance of previously saved results + """ + check_seq_classification_pytorch_vs_ai100( + model_name=model_name, + seq_len=32, + compare_results=True, + ) diff --git a/tests/transformers/models/test_speech_seq2seq_models.py b/tests/transformers/models/test_speech_seq2seq_models.py index 774802c83..130a401a9 100644 --- a/tests/transformers/models/test_speech_seq2seq_models.py +++ b/tests/transformers/models/test_speech_seq2seq_models.py @@ -24,7 +24,8 @@ from QEfficient.utils import get_padding_shape_from_config, hf_download from QEfficient.utils._utils import create_json, load_hf_processor from QEfficient.utils.constants import Constants, QnnConstants -from QEfficient.utils.device_utils import get_available_device_id + +from .check_model_results import dump_and_compare_results CONFIG_PATH = "tests/configs/speech_seq2seq_model_configs.json" @@ -46,13 +47,22 @@ def load_seq2seq_model(model_config): repo_id=model_config["model_name"], ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], ) - model_hf = AutoModelForSpeechSeq2Seq.from_pretrained( - model_path, - use_cache=True, - num_hidden_layers=model_config["n_layer"], - attn_implementation="eager", - low_cpu_mem_usage=False, - ) # Run models for single layers only + model_hf = None + if model_config["n_layer"] != -1: + model_hf = AutoModelForSpeechSeq2Seq.from_pretrained( + model_path, + use_cache=True, + num_hidden_layers=model_config["n_layer"], + attn_implementation="eager", + low_cpu_mem_usage=False, + ) + else: + model_hf = AutoModelForSpeechSeq2Seq.from_pretrained( + model_path, + use_cache=True, + attn_implementation="eager",
+ low_cpu_mem_usage=False, + ) params = sum(p.numel() for p in model_hf.parameters()) model_hf.eval() return model_hf, params @@ -290,9 +300,10 @@ def run_seq2seq_ort( def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100( model_name: str, ctx_len: int = Constants.CTX_LEN, - n_layer: int = 1, + n_layer: int = -1, enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, + compare_results: Optional[bool] = False, ): """ Validate the PyTorch model, the PyTorch model after KV changes, ONNX model and the Cloud AI 100 model @@ -307,6 +318,7 @@ def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100( model_hf, _ = load_seq2seq_model(model_config) + print(model_hf) processor = load_hf_processor(pretrained_model_name_or_path=model_name) batch_size = 1 @@ -314,26 +326,19 @@ def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100( data = ds[0]["audio"]["array"] data = data.reshape(-1) sample_rate = ds[0]["audio"]["sampling_rate"] - pytorch_hf_tokens = run_seq2seq_pytorch_hf(model_hf, processor, data, sample_rate, ctx_len) qeff_model = QEFFAutoModelForSpeechSeq2Seq(model_hf, pretrained_model_name_or_path=model_name) pytorch_kv_tokens = run_seq2seq_pytorch_with_kv(qeff_model, processor, data, sample_rate, ctx_len) - assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), ( "Tokens don't match for HF PyTorch model output and KV PyTorch model output" ) qeff_model.export() - ort_tokens = run_seq2seq_ort(qeff_model.onnx_path, qeff_model.model.config, processor, data, sample_rate, ctx_len) - assert (pytorch_kv_tokens == ort_tokens).all(), "Tokens don't match for pytorch output and ort output" - if not get_available_device_id(): - pytest.skip("No available devices to run model on Cloud AI 100") - qeff_model.compile( ctx_len=ctx_len, num_cores=16, @@ -341,7 +346,6 @@ def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100( enable_qnn=enable_qnn, qnn_config=qnn_config, ) - exec_info = qeff_model.generate( inputs=processor(data, sampling_rate=sample_rate, return_tensors="pt"), generation_len=ctx_len ) @@ 
-351,7 +355,23 @@ def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100( ) assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) + if compare_results is False: + return + + compile_params = {"enable_qnn": enable_qnn, "qnn_config": qnn_config, "seq_len": ctx_len, "n_layer": n_layer} + assert dump_and_compare_results( + model_name, + compile_params, + "speech_seq2seq_model_results.json", + cloud_ai_100_tokens, + exec_info=exec_info, + pytorch_hf_tokens=pytorch_hf_tokens, + pytorch_kv_tokens=pytorch_kv_tokens, + ort_tokens=ort_tokens, + ) + +@pytest.mark.custom_layers @pytest.mark.on_qaic @pytest.mark.llm_model @pytest.mark.parametrize("model_name", test_models) @@ -364,6 +384,17 @@ def test_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100(model_name): check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=4) +@pytest.mark.full_model +@pytest.mark.on_qaic +@pytest.mark.llm_model +@pytest.mark.parametrize("model_name", test_models) +def test_full_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100(model_name): + check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100( + model_name=model_name, + compare_results=True, + ) + + @pytest.mark.on_qaic @pytest.mark.llm_model @pytest.mark.qnn