diff --git a/QEfficient/transformers/cache_utils.py b/QEfficient/transformers/cache_utils.py index 0e1118407..6ebccdfbf 100644 --- a/QEfficient/transformers/cache_utils.py +++ b/QEfficient/transformers/cache_utils.py @@ -10,7 +10,7 @@ from typing import Any, Dict, List, Optional, Tuple import torch -from transformers.cache_utils import DynamicCache, DynamicLayer, EncoderDecoderCache, HybridCache, HybridChunkedCache +from transformers.cache_utils import Cache, CacheLayerMixin, EncoderDecoderCache, HybridCache, HybridChunkedCache from QEfficient.customop import ( CtxGatherFunc, @@ -54,7 +54,47 @@ def _get_invalid_idx_value(cls): return 0 -class QEffDynamicLayer(DynamicLayer): +class QEffDynamicLayer(CacheLayerMixin): + is_sliding = False + + def __init__(self): + super().__init__() + + def lazy_initialization(self, key_states: torch.Tensor): + self.dtype = key_states.dtype + self.device = key_states.device + self.keys = torch.tensor([], dtype=self.dtype, device=self.device) + self.values = torch.tensor([], dtype=self.dtype, device=self.device) + self.is_initialized = True + + def get_mask_sizes(self, cache_position: torch.Tensor) -> tuple[int, int]: + kv_offset = 0 + query_length = cache_position.shape[0] + kv_length = self.get_seq_length() + query_length + return kv_length, kv_offset + + def get_seq_length(self) -> int: + if self.keys is None or self.keys.numel() == 0: + return 0 + return self.keys.shape[-2] + + def get_max_cache_shape(self) -> int: + return -1 + + @classmethod + def from_tensors(cls, key_states: torch.Tensor, value_states: torch.Tensor) -> "QEffDynamicLayer": + layer = cls() + layer.keys = key_states + layer.values = value_states + layer._mark_initialized(key_states) + return layer + + def _mark_initialized(self, reference_states: torch.Tensor) -> None: + if not self.is_initialized: + self.dtype = reference_states.dtype + self.device = reference_states.device + self.is_initialized = True + def read_only(self, cache_kwargs): """ Reads the 
`key_states` and `value_states` for the layer. @@ -68,6 +108,8 @@ def read_only(self, cache_kwargs): """ # Gather k_out, v_out = self.keys, self.values + if k_out is not None: + self._mark_initialized(k_out) position_ids = cache_kwargs.get("position_ids") batch_index = cache_kwargs.get("batch_index", None) ctx_len = cache_kwargs.get("CCL", k_out.shape[2]) @@ -109,6 +151,8 @@ def read_only_blockedKV(self, start_index, end_index, cache_kwargs): """ # Gather k_out, v_out = self.keys, self.values + if k_out is not None: + self._mark_initialized(k_out) position_ids = cache_kwargs.get("position_ids") batch_index = cache_kwargs.get("batch_index", None) batch, num_kv_heads, _, _ = k_out.shape @@ -150,7 +194,9 @@ def write_only(self, key_states, value_states, cache_kwargs): if self.keys is None: self.keys = key_states self.values = value_states + self._mark_initialized(self.keys) else: + self._mark_initialized(self.keys) position_ids = cache_kwargs.get("position_ids") batch_index = cache_kwargs.get("batch_index", None) # Check and fetch batch index value form the kwargs @@ -189,8 +235,10 @@ def update( if self.keys is None: self.keys = key_states self.values = value_states + self._mark_initialized(self.keys) k_out, v_out = self.keys, self.values else: + self._mark_initialized(self.keys) position_ids = cache_kwargs.get("position_ids") batch_index = cache_kwargs.get("batch_index", None) # Check and fetch batch index value form the kwargs @@ -252,8 +300,10 @@ def update3D( if self.keys is None: self.keys = key_states self.values = value_states + self._mark_initialized(self.keys) k_out, v_out = self.keys, self.values else: + self._mark_initialized(self.keys) position_ids = cache_kwargs.get("position_ids") batch_index = cache_kwargs.get("batch_index", None) @@ -293,7 +343,7 @@ def update3D( return k_out, v_out -class QEffDynamicCache(DynamicCache): +class QEffDynamicCache(Cache): """ A cache that grows dynamically as more tokens are generated. 
This is the default for generative models. @@ -307,15 +357,46 @@ class QEffDynamicCache(DynamicCache): """ def __init__(self, ddp_cache_data: Optional[Iterable[tuple[torch.Tensor, torch.Tensor]]] = None, *args, **kwargs): - # Remove layer_classes if present to avoid duplicate argument + # Remove cache-layer construction args if present to avoid duplicate arguments. kwargs.pop("layer_classes", None) - from transformers.cache_utils import Cache # Import here to avoid circular import - - Cache.__init__(self, layer_classes=QEffDynamicLayer, *args, **kwargs) + kwargs.pop("layers", None) + kwargs.pop("layer_class_to_replicate", None) + + try: + # transformers>=4.57 + Cache.__init__(self, *args, layer_class_to_replicate=QEffDynamicLayer, **kwargs) + except TypeError: + # transformers<=4.56 + Cache.__init__(self, *args, layer_classes=QEffDynamicLayer, **kwargs) if ddp_cache_data is not None: for key_states, value_states in ddp_cache_data: self.layers.append(QEffDynamicLayer.from_tensors(key_states, value_states)) + def append_new_layers(self, layer_idx: int) -> None: + while len(self.layers) <= layer_idx: + self.layers.append(QEffDynamicLayer()) + + @classmethod + def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None) -> "QEffDynamicCache": + cache = cls() + if past_key_values is not None: + for layer_idx in range(len(past_key_values)): + key_states, value_states = past_key_values[layer_idx] + cache.update(key_states, value_states, layer_idx) + return cache + + def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]: + legacy_cache = () + for layer in self.layers: + legacy_cache += ((layer.keys, layer.values),) + return legacy_cache + + def get_seq_length(self, layer_idx: Optional[int] = 0, cache_position: Optional[torch.LongTensor] = None) -> int: + """ + Keep backward-compatible call shape while deferring to upstream implementation. 
+ """ + return super().get_seq_length(layer_idx) + def read_only(self, layer_idx, cache_kwargs): """ Reads the `key_states` and `value_states` for the layer `layer_idx`. @@ -405,10 +486,7 @@ def from_legacy_cache( cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None ) -> "EncoderDecoderCache": """Converts a cache in the legacy cache format into an equivalent `EncoderDecoderCache`.""" - cache = cls( - self_attention_cache=QEffDynamicCache(), - cross_attention_cache=QEffDynamicCache(), - ) + cache = cls(QEffDynamicCache(), QEffDynamicCache()) if past_key_values is not None: for layer_idx in range(len(past_key_values)): key_states, value_states = past_key_values[layer_idx][:2] @@ -419,6 +497,18 @@ def from_legacy_cache( cache.is_updated[layer_idx] = True return cache + def to_legacy_cache(self): + self_attn_legacy = self.self_attention_cache.to_legacy_cache() + cross_attn_legacy = self.cross_attention_cache.to_legacy_cache() + + legacy_cache = () + for layer_idx, self_attn_layer in enumerate(self_attn_legacy): + if layer_idx < len(cross_attn_legacy): + legacy_cache += (self_attn_layer + cross_attn_legacy[layer_idx],) + else: + legacy_cache += (self_attn_layer,) + return legacy_cache + # TODO:This function will be depercated in future. class QEffHybridCache(HybridCache): @@ -447,7 +537,7 @@ def __len__(self): """ return len(self.key_cache) - def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + def get_seq_length(self, layer_idx: Optional[int] = 0, cache_position: Optional[torch.LongTensor] = None) -> int: """Returns the sequence length of the cached states. 
A layer index can be optionally passed.""" # TODO: deprecate this function in favor of `cache_position` is_empty_layer = ( @@ -531,7 +621,7 @@ def __len__(self): """ return len(self.key_cache) - def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + def get_seq_length(self, layer_idx: Optional[int] = 0, cache_position: Optional[torch.LongTensor] = None) -> int: """Returns the sequence length of the cached states. A layer index can be optionally passed.""" # TODO: deprecate this function in favor of `cache_position` is_empty_layer = ( @@ -663,7 +753,7 @@ def __len__(self): """ return len(self.key_cache) - def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + def get_seq_length(self, layer_idx: Optional[int] = 0, cache_position: Optional[torch.LongTensor] = None) -> int: """Returns the sequence length of the cached states. A layer index can be optionally passed.""" # TODO: deprecate this function in favor of `cache_position` is_empty_layer = ( @@ -783,7 +873,7 @@ def __len__(self): """ return len(self.key_cache) - def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + def get_seq_length(self, layer_idx: Optional[int] = 0, cache_position: Optional[torch.LongTensor] = None) -> int: """Returns the sequence length of the cached states. A layer index can be optionally passed.""" # TODO: deprecate this function in favor of `cache_position` is_empty_layer = ( diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py index 47ae57557..77a440018 100644 --- a/QEfficient/transformers/modeling_utils.py +++ b/QEfficient/transformers/modeling_utils.py @@ -191,6 +191,7 @@ ] ) + # This is for supporting different seq_len for different layers for Sliding window attn, chunked attn etc. 
DYNAMIC_SEQ_LEN_SUPPORTED_MODEL_ARCH = {"gemma3", "llama4", "gemma3_text", "llama4_text"} diff --git a/QEfficient/transformers/models/falcon/modeling_falcon.py b/QEfficient/transformers/models/falcon/modeling_falcon.py index 4ebb2fb96..90032be4e 100644 --- a/QEfficient/transformers/models/falcon/modeling_falcon.py +++ b/QEfficient/transformers/models/falcon/modeling_falcon.py @@ -32,6 +32,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -137,7 +138,7 @@ def forward( key_layer = key_layer.transpose(1, 2).reshape(batch_size, num_kv_heads, query_length, self.head_dim) value_layer = value_layer.transpose(1, 2).reshape(batch_size, num_kv_heads, query_length, self.head_dim) - kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_layer.shape[-2], cache_position) cos, sin = self.rotary_emb(value_layer, seq_len=kv_seq_len) query_layer, key_layer = qeff_apply_rotary_pos_emb(query_layer, key_layer, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/gemma/modeling_gemma.py b/QEfficient/transformers/models/gemma/modeling_gemma.py index 260d1857a..bc3b00e6a 100644 --- a/QEfficient/transformers/models/gemma/modeling_gemma.py +++ b/QEfficient/transformers/models/gemma/modeling_gemma.py @@ -27,6 +27,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -149,7 +150,7 @@ def forward( key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - 
kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/gemma2/modeling_gemma2.py b/QEfficient/transformers/models/gemma2/modeling_gemma2.py index 6dee8c85d..8d15e3485 100644 --- a/QEfficient/transformers/models/gemma2/modeling_gemma2.py +++ b/QEfficient/transformers/models/gemma2/modeling_gemma2.py @@ -30,6 +30,7 @@ # from transformers.utils import is_torchdynamo_compiling from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -156,7 +157,7 @@ def forward( key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/gptj/modeling_gptj.py b/QEfficient/transformers/models/gptj/modeling_gptj.py index a4c81dbec..bbf621f10 100644 --- a/QEfficient/transformers/models/gptj/modeling_gptj.py +++ b/QEfficient/transformers/models/gptj/modeling_gptj.py @@ -223,7 +223,7 @@ def forward( else: past_length = past_key_values[0][0].size(-2) - if not self._use_flash_attention_2: + if not getattr(self, "_use_flash_attention_2", False): attention_mask = _create_causal_mask(position_ids, past_length, None) # # Prepare head mask if needed diff 
--git a/QEfficient/transformers/models/granite/modeling_granite.py b/QEfficient/transformers/models/granite/modeling_granite.py index 8a32c52ef..d30b9fc39 100644 --- a/QEfficient/transformers/models/granite/modeling_granite.py +++ b/QEfficient/transformers/models/granite/modeling_granite.py @@ -27,6 +27,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -142,7 +143,7 @@ def forward( key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py b/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py index 935df7c2d..2f61ac164 100644 --- a/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py +++ b/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py @@ -28,6 +28,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -137,7 +138,7 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - kv_seq_len = 
past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: diff --git a/QEfficient/transformers/models/grok_1/modeling_grok1.py b/QEfficient/transformers/models/grok_1/modeling_grok1.py index 1a1c919bb..5c2f145b4 100644 --- a/QEfficient/transformers/models/grok_1/modeling_grok1.py +++ b/QEfficient/transformers/models/grok_1/modeling_grok1.py @@ -20,6 +20,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask from QEfficient.transformers.models.llama.modeling_llama import qeff_apply_rotary_pos_emb +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -87,8 +88,7 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - if past_key_value is not None: - kv_seq_len = past_key_value.get_seq_length(layer_idx) + kv_seq_len = resolve_kv_seq_len(past_key_value, layer_idx, key_states.shape[-2]) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/llama/modeling_llama.py b/QEfficient/transformers/models/llama/modeling_llama.py index 57bccdb1b..a0a3b0237 100644 --- a/QEfficient/transformers/models/llama/modeling_llama.py +++ b/QEfficient/transformers/models/llama/modeling_llama.py @@ -27,6 +27,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from 
QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -226,7 +227,7 @@ def forward( key_states = self.k_proj(hidden_states, **kwargs).view(hidden_shape).transpose(1, 2) value_states = self.v_proj(hidden_states, **kwargs).view(hidden_shape).transpose(1, 2) - kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) past_seen_tokens = past_key_value.get_seq_length() if past_key_value is not None else 0 cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py index e219d5e03..8c96955dd 100644 --- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py +++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py @@ -29,6 +29,7 @@ QEffLlamaRotaryEmbedding, qeff_apply_rotary_pos_emb, ) +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -110,7 +111,7 @@ def forward( if comp_ctx_lengths is not None: attention_mask = attention_mask[:, :, :, : comp_ctx_lengths.shape[-1]] cache_kwargs["CCL"] = attention_mask.shape[-1] - kv_seq_len = past_key_value.get_seq_length(self.layer_idx) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, query_states.shape[-2]) key_states, value_states = past_key_value.read_only(self.layer_idx, cache_kwargs=cache_kwargs) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) @@ -370,7 +371,7 @@ def forward( "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class 
" "with a layer index." ) - kv_seq_len = past_key_values.get_seq_length(self_attn.layer_idx) + kv_seq_len = resolve_kv_seq_len(past_key_values, self_attn.layer_idx, key_states.shape[-2]) cos, sin = self_attn.rotary_emb(value_states, seq_len=kv_seq_len) _, key_states = qeff_apply_rotary_pos_emb(torch.empty_like(key_states), key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/mistral/modeling_mistral.py b/QEfficient/transformers/models/mistral/modeling_mistral.py index 47107384e..878920234 100644 --- a/QEfficient/transformers/models/mistral/modeling_mistral.py +++ b/QEfficient/transformers/models/mistral/modeling_mistral.py @@ -30,6 +30,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -159,7 +160,7 @@ def forward( key_states = key_states.view(hidden_shape).transpose(1, 2) value_states = value_states.view(hidden_shape).transpose(1, 2) - kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py b/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py index 680c839ae..9e8a2a020 100644 --- a/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py +++ b/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py @@ -32,6 +32,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from 
QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -148,14 +149,14 @@ def forward( key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len = past_key_value.get_seq_length(self.layer_idx) + if past_key_value is not None and self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + cache_position = kwargs.get("cache_position") + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index 3cba022b4..a350a92dc 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -42,7 +42,7 @@ _prepare_cross_attention_mask, ) from QEfficient.utils import constants -from QEfficient.utils._utils import IOInfo +from QEfficient.utils._utils import IOInfo, resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE MAX_NUM_IMG = 1 @@ -267,14 +267,14 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, 
self.num_key_value_heads, self.head_dim).transpose(1, 2) - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + if past_key_value is not None and self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/molmo/modeling_molmo.py b/QEfficient/transformers/models/molmo/modeling_molmo.py index 57f2729b9..fbc7b34b8 100644 --- a/QEfficient/transformers/models/molmo/modeling_molmo.py +++ b/QEfficient/transformers/models/molmo/modeling_molmo.py @@ -17,7 +17,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask from QEfficient.utils import constants -from QEfficient.utils._utils import IOInfo, get_padding_shape_from_config +from QEfficient.utils._utils import IOInfo, get_padding_shape_from_config, resolve_kv_seq_len def _non_meta_init_device(config) -> torch.device: @@ -265,15 +265,13 @@ def attention( v = v.view(B, T, self.config.effective_n_kv_heads, C // self.config.n_heads).transpose(1, 2) if self.config.use_position_ids and self.config.rope: - kv_seq_len = k.shape[-2] - kv_seq_len = 
layer_past.get_seq_length(self.layer_id) + kv_seq_len = resolve_kv_seq_len(layer_past, self.layer_id, k.shape[-2]) # Apply rotary embeddings cos, sin = self.rotary_emb(v, seq_len=kv_seq_len) q, k = qeff_apply_rotary_pos_emb(q, k, cos, sin, position_ids, self.config) if not self.config.use_position_ids and self.config.rope: - kv_seq_len = k.shape[-2] - kv_seq_len = layer_past.get_seq_length(kv_seq_len, self.layer_id) + kv_seq_len = resolve_kv_seq_len(layer_past, self.layer_id, k.shape[-2]) # Apply rotary embeddings cos, sin = self.rotary_emb(v, seq_len=kv_seq_len) q, k = qeff_apply_rotary_pos_emb(q, k, cos, sin, position_ids, self.config) diff --git a/QEfficient/transformers/models/olmo2/modeling_olmo2.py b/QEfficient/transformers/models/olmo2/modeling_olmo2.py index c79ad7fae..0e9394040 100644 --- a/QEfficient/transformers/models/olmo2/modeling_olmo2.py +++ b/QEfficient/transformers/models/olmo2/modeling_olmo2.py @@ -27,6 +27,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -148,9 +149,7 @@ def forward( key_states = key_states.view(hidden_shape).transpose(1, 2) value_states = value_states.view(hidden_shape).transpose(1, 2) - kv_seq_len = key_states.shape[-2] - - kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/phi3/modeling_phi3.py b/QEfficient/transformers/models/phi3/modeling_phi3.py index b48ab2897..aaaaa8081 100644 --- a/QEfficient/transformers/models/phi3/modeling_phi3.py +++ 
b/QEfficient/transformers/models/phi3/modeling_phi3.py @@ -27,6 +27,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -157,7 +158,7 @@ def forward( key_states = key_states.view(hidden_shape).transpose(1, 2) value_states = value_states.view(hidden_shape).transpose(1, 2) - kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/qwen2/modeling_qwen2.py b/QEfficient/transformers/models/qwen2/modeling_qwen2.py index 841df6526..c41dc13bb 100644 --- a/QEfficient/transformers/models/qwen2/modeling_qwen2.py +++ b/QEfficient/transformers/models/qwen2/modeling_qwen2.py @@ -30,6 +30,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -162,7 +163,7 @@ def forward( key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git 
a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index d6bfbda81..39dd285a0 100644 --- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -36,7 +36,7 @@ # from transformers import Qw from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask from QEfficient.utils import constants -from QEfficient.utils._utils import IOInfo, get_padding_shape_from_config +from QEfficient.utils._utils import IOInfo, get_padding_shape_from_config, resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE from QEfficient.utils.logging_utils import logger @@ -591,8 +591,7 @@ def forward( key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) - kv_seq_len = key_states.shape[-2] - kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) past_seen_tokens = past_key_value.get_seq_length() if past_key_value is not None else 0 cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) @@ -743,13 +742,13 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - - if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = False + if past_key_values is not None and not isinstance(past_key_values, Cache): return_legacy_cache = True past_key_values = QEffDynamicCache.from_legacy_cache(past_key_values) + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 if cache_position is None: - past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 cache_position = torch.arange( 
past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device ) diff --git a/QEfficient/transformers/models/qwen3/modeling_qwen3.py b/QEfficient/transformers/models/qwen3/modeling_qwen3.py index ccc4bbac2..c3c1df82d 100644 --- a/QEfficient/transformers/models/qwen3/modeling_qwen3.py +++ b/QEfficient/transformers/models/qwen3/modeling_qwen3.py @@ -30,6 +30,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -163,7 +164,7 @@ def forward( key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py b/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py index 6bdd5e243..bfe0c90db 100644 --- a/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py +++ b/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py @@ -28,6 +28,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils._utils import resolve_kv_seq_len from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE @@ -209,7 +210,7 @@ def forward( key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - 
kv_seq_len = past_key_value.get_seq_length(self.layer_idx, cache_position) + kv_seq_len = resolve_kv_seq_len(past_key_value, self.layer_idx, key_states.shape[-2], cache_position) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) diff --git a/QEfficient/transformers/quantizers/quantizer_awq.py b/QEfficient/transformers/quantizers/quantizer_awq.py index ef8a03521..b7199a71e 100644 --- a/QEfficient/transformers/quantizers/quantizer_awq.py +++ b/QEfficient/transformers/quantizers/quantizer_awq.py @@ -29,15 +29,18 @@ def post_init(self): f"Only quantization backend {AwqBackendPackingMethod.AUTOAWQ} is supported - not recognized backend {self.backend}" ) - self.version = AWQLinearVersion.from_str(self.version) + if isinstance(self.version, str): + self.version = AWQLinearVersion.from_str(self.version) if self.version not in [AWQLinearVersion.GEMM]: raise ValueError( f"Only {AWQLinearVersion.GEMM} version in supported - not recognized version {self.version}" ) - if self.do_fuse or self.fuse_max_seq_len is not None: + do_fuse = getattr(self, "do_fuse", None) + fuse_max_seq_len = getattr(self, "fuse_max_seq_len", None) + if do_fuse or fuse_max_seq_len is not None: raise ValueError( - f"fused modules are not supported, got do_fuse={self.do_fuse}, fuse_max_seq_len={self.fuse_max_seq_len}" + f"fused modules are not supported, got do_fuse={do_fuse}, fuse_max_seq_len={fuse_max_seq_len}" ) if self.bits != 4: @@ -63,6 +66,9 @@ def update_torch_dtype(self, torch_dtype): logger.warning(f"Requested dtype {torch_dtype} is not supported, overriding to None") return None + def update_dtype(self, dtype): + return self.update_torch_dtype(dtype) + def _process_model_before_weight_loading(self, model, **kwargs): self.modules_to_not_convert = get_keys_to_not_convert(model) diff --git a/QEfficient/transformers/quantizers/quantizer_compressed_tensors.py 
b/QEfficient/transformers/quantizers/quantizer_compressed_tensors.py index e7e14166d..f2746528c 100644 --- a/QEfficient/transformers/quantizers/quantizer_compressed_tensors.py +++ b/QEfficient/transformers/quantizers/quantizer_compressed_tensors.py @@ -188,6 +188,9 @@ def update_torch_dtype(self, torch_dtype): logger.warning(f"Requested dtype {torch_dtype} is not supported, overriding to None") return None + def update_dtype(self, dtype): + return self.update_torch_dtype(dtype) + def _process_model_before_weight_loading(self, model, **kwargs): if not self.modules_to_not_convert or "lm_head" not in self.modules_to_not_convert: self.modules_to_not_convert.extend(get_keys_to_not_convert(model)) @@ -366,6 +369,9 @@ def update_torch_dtype(self, torch_dtype): logger.warning(f"Requested dtype {torch_dtype} is not supported, overriding to None") return None + def update_dtype(self, dtype): + return self.update_torch_dtype(dtype) + def _process_model_before_weight_loading(self, model, **kwargs): if self.quantization_config.targets != ["Linear"]: raise NotImplementedError( diff --git a/QEfficient/transformers/quantizers/quantizer_mxfp4.py b/QEfficient/transformers/quantizers/quantizer_mxfp4.py index 2ffba1bea..44c255feb 100644 --- a/QEfficient/transformers/quantizers/quantizer_mxfp4.py +++ b/QEfficient/transformers/quantizers/quantizer_mxfp4.py @@ -105,6 +105,9 @@ def update_torch_dtype(self, torch_dtype): logger.warning(f"Requested dtype {torch_dtype} is not supported, overriding to None") return None + def update_dtype(self, dtype): + return self.update_torch_dtype(dtype) + def _process_model_before_weight_loading( self, model: torch.nn.Module, diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 26bae7a34..9a62f57fd 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -67,6 +67,40 @@ class DownloadRetryLimitExceeded(Exception): """ +def resolve_kv_seq_len( + past_key_value: Optional[Any], + layer_idx: int, + current_seq_len: 
int, + cache_position: Optional[torch.LongTensor] = None, +) -> int: + """ + Resolve KV sequence length for rotary embeddings with cache compatibility. + + Use the current key sequence length as baseline, then grow it with: + - cache_position max (when provided) + - cache object reported length for the current layer + """ + resolved_seq_len = current_seq_len + if cache_position is not None and isinstance(cache_position, torch.Tensor) and cache_position.numel() > 0: + resolved_seq_len = max(resolved_seq_len, int(cache_position.max().item()) + 1) + + if past_key_value is None: + return resolved_seq_len + + get_seq_length = getattr(past_key_value, "get_seq_length", None) + if get_seq_length is None: + return resolved_seq_len + + try: + cache_seq_len = get_seq_length(layer_idx) + except TypeError: + cache_seq_len = get_seq_length() + + if cache_seq_len is None: + return resolved_seq_len + return max(resolved_seq_len, int(cache_seq_len)) + + def login_and_download_hf_lm(model_name, *args, **kwargs): logger.info(f"loading HuggingFace model for {model_name}") hf_token = kwargs.pop("hf_token", None) diff --git a/QEfficient/utils/test_utils.py b/QEfficient/utils/test_utils.py index 3cf560266..c125a317f 100644 --- a/QEfficient/utils/test_utils.py +++ b/QEfficient/utils/test_utils.py @@ -229,3 +229,7 @@ class ModelConfig: SWIFTKV_MODELS = { "Snowflake/Llama-3.1-SwiftKV-8B-Instruct", } + + FULL_MODEL_TESTS_TO_SKIP = { + "hpcai-tech/grok-1", + } diff --git a/pyproject.toml b/pyproject.toml index 6de8048b4..207868adb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ classifiers = [ ] requires-python = ">=3.8,<3.13" dependencies = [ - "transformers==4.55.0", + "transformers==4.57.3", "diffusers== 0.35.1", "huggingface-hub==0.34.0", "hf_transfer==0.1.9", @@ -55,7 +55,7 @@ dependencies = [ ] [project.optional-dependencies] -test = ["pytest","pytest-mock"] +test = ["pytest","pytest-mock","pytest-xdist"] docs = 
["Sphinx==7.1.2","sphinx-rtd-theme==2.0.0","myst-parser==3.0.1","sphinx-multiversion"] quality = ["black", "ruff", "hf_doc_builder@git+https://github.com/huggingface/doc-builder.git"] diff --git a/scripts/Nightly/Jenkinsfile b/scripts/Nightly/Jenkinsfile new file mode 100644 index 000000000..b9c9687f3 --- /dev/null +++ b/scripts/Nightly/Jenkinsfile @@ -0,0 +1,308 @@ +pipeline { + agent { + node { + label 'qeff_node' + } + } + + options { + disableConcurrentBuilds() + timeout(time: 1, unit: 'DAYS') + timestamps() + buildDiscarder(logRotator(numToKeepStr: '5', daysToKeepStr: '30')) + } + + triggers { + cron('''TZ=Asia/Kolkata + 0 21 * * 6''') + } + + environment { + DOCKER_IMAGE = "${DOCKER_LATEST}:master_latest" + VENV_PATH = 'preflight_qeff' + TOKENIZERS_PARALLELISM = 'false' + HF_HUB_CACHE = '/huggingface_hub' + PYTEST_ARGS = '--durations=10' + DOCKER_USER = 'ubuntu' + } + + stages { + stage('Prepare Environment') { + steps { + script { + echo "Starting QEfficient Nightly Test Suite" + echo "Build Tag: ${BUILD_TAG}" + } + sh ''' + . ~/.bashrc + # Launch privileged Docker container with necessary mounts + sudo docker run --privileged -dit \ + --name ${BUILD_TAG} \ + -e HF_TOKEN=${HF_TOKEN} \ + -v ./:/efficient-transformers \ + -v ${HF_PATH}:${DOCKER_HF_PATH} \ + ${DOCKER_LATEST}:master_latest + + # Install QEfficient and dependencies + sudo docker exec ${BUILD_TAG} bash -c " + set -e + cd /efficient-transformers + apt update && apt install -y python3.10-venv + python3.10 -m venv ${VENV_PATH} + . 
${VENV_PATH}/bin/activate + + # Upgrade pip and core packages + pip install --upgrade pip setuptools wheel + pip install .[test] + pip install junitparser pytest-xdist + + # Audio processing libraries for speech-to-text models + pip install librosa==0.10.2 soundfile==0.13.1 + + # Vision and multimodal model dependencies + pip install --extra-index-url https://download.pytorch.org/whl/cpu \ + timm==1.0.14 torchvision==0.22.0+cpu einops==0.8.1 + + rm -rf QEfficient + " + ''' + } + } + stage('Unit & Integration Tests') { + parallel { + stage('Model Export & ONNX Tests') { + steps { + timeout(time: 40, unit: 'MINUTES') { + sh ''' + sudo docker exec ${BUILD_TAG} bash -c " + set -e + cd /efficient-transformers + . ${VENV_PATH}/bin/activate + + mkdir -p $PWD/Non_cli_qaic + export QEFF_HOME=$PWD/Non_cli_qaic + + pytest tests \ + -m '(not cli) and (not on_qaic) and (not finetune)' \ + --ignore tests/vllm \ + --ignore tests/transformers/models/image_text_to_text \ + ${PYTEST_ARGS} -n 4\ + --junitxml=tests/tests_log1.xml + + junitparser merge tests/tests_log1.xml tests/tests_log.xml + deactivate + " + ''' + } + } + } + + stage('QAIC LLM Tests') { + steps { + // timeout(time: 120, unit: 'MINUTES') { + sh ''' + sudo docker exec ${BUILD_TAG} bash -c " + set -e + cd /efficient-transformers + . ${VENV_PATH}/bin/activate + + mkdir -p $PWD/Non_qaic_llm + export QEFF_HOME=$PWD/Non_qaic_llm + + pytest tests \ + -m '(not cli) and (on_qaic) and (llm_model) and (not custom_layers) and (not dummy_model) and (not multimodal) and (not qnn) and (not finetune) and (not diffusion_models)' \ + --ignore tests/vllm \ + ${PYTEST_ARGS} \ + --junitxml=tests/tests_log2.xml + + junitparser merge tests/tests_log2.xml tests/tests_log.xml + deactivate + " + ''' + // } + } + } + + stage('QAIC Feature Tests') { + steps { + // timeout(time: 80, unit: 'MINUTES') { + sh ''' + sudo docker exec ${BUILD_TAG} bash -c " + set -e + cd /efficient-transformers + . 
${VENV_PATH}/bin/activate + + mkdir -p $PWD/Non_qaic_feature + export QEFF_HOME=$PWD/Non_qaic_feature + + pytest tests \ + -m '(not cli) and (on_qaic) and (feature) and (not multimodal) and (not qnn) and (not finetune) and (not diffusion_models)' \ + --ignore tests/vllm \ + ${PYTEST_ARGS} \ + --junitxml=tests/tests_log2_feature.xml + + junitparser merge tests/tests_log2_feature.xml tests/tests_log.xml + deactivate + " + ''' + // } + } + } + } + } + stage('QAIC MultiModal Tests') { + steps { + // timeout(time: 120, unit: 'MINUTES') { + sh ''' + sudo docker exec ${BUILD_TAG} bash -c " + set -e + cd /efficient-transformers + . ${VENV_PATH}/bin/activate + + mkdir -p $PWD/Non_cli_qaic_multimodal + export QEFF_HOME=$PWD/Non_cli_qaic_multimodal + + pytest tests \ + -m '(not cli) and (on_qaic) and (multimodal) and (not qnn) and (not finetune) and (not diffusion_models)' \ + --ignore tests/vllm \ + ${PYTEST_ARGS} \ + --junitxml=tests/tests_log6.xml + + junitparser merge tests/tests_log6.xml tests/tests_log.xml + deactivate + " + ''' + // } + } + } + + stage('QAIC Diffusion Models Tests') { + steps { + timeout(time: 120, unit: 'MINUTES') { + sh ''' + sudo docker exec ${BUILD_TAG} bash -c " + set -e + cd /efficient-transformers + . ${VENV_PATH}/bin/activate + + mkdir -p $PWD/Non_cli_qaic_diffusion + export QEFF_HOME=$PWD/Non_cli_qaic_diffusion + export HF_HUB_CACHE=${HF_HUB_CACHE} + + pytest tests \ + -m '(not cli) and (on_qaic) and (diffusion_models) and (not wan) and (not qnn) and (not finetune)' \ + --ignore tests/vllm \ + ${PYTEST_ARGS} \ + --junitxml=tests/tests_log_diffusion.xml + + junitparser merge tests/tests_log_diffusion.xml tests/tests_log.xml + deactivate + " + ''' + } + } + } + + stage('CLI Inference Tests') { + steps { + timeout(time: 120, unit: 'MINUTES') { + sh ''' + sudo docker exec ${BUILD_TAG} bash -c " + set -e + cd /efficient-transformers + . 
${VENV_PATH}/bin/activate + + mkdir -p $PWD/cli + export QEFF_HOME=$PWD/cli + + pytest tests \ + -m '(cli and not qnn) and (not finetune)' \ + --ignore tests/vllm \ + ${PYTEST_ARGS} \ + --junitxml=tests/tests_log3.xml + + junitparser merge tests/tests_log3.xml tests/tests_log.xml + deactivate + " + ''' + } + } + } + stage('Finetune CLI Tests') { + steps { + timeout(time: 20, unit: 'MINUTES') { + sh ''' + sudo docker exec ${BUILD_TAG} bash -c " + set -e + cd /efficient-transformers + . ${VENV_PATH}/bin/activate + + # Install QAIC PyTorch integration + pip install /opt/qti-aic/integrations/torch_qaic/py310/torch_qaic-0.1.0-cp310-cp310-linux_x86_64.whl + pip install torch==2.9.0 torchvision==0.24.0 torchaudio==2.9.0 \ + --index-url https://download.pytorch.org/whl/cpu + + mkdir -p $PWD/cli_qaic_finetuning + export QEFF_HOME=$PWD/cli_qaic_finetuning + + pytest tests \ + -m '(cli) and (on_qaic) and (not qnn) and (not multimodal) and (finetune)' \ + --ignore tests/vllm \ + ${PYTEST_ARGS} \ + --junitxml=tests/tests_log_finetune.xml + + junitparser merge tests/tests_log_finetune.xml tests/tests_log.xml + deactivate + " + ''' + } + } + } + } + + post { + always { + script { + echo "========== Test Execution Summary ==========" + sh ''' + # Restore file ownership + sudo chown -R ${DOCKER_USER} . 
2>/dev/null || true + ''' + } + + junit testResults: 'tests/tests_log.xml', + allowEmptyResults: true, + keepLongStdio: true + + script { + sh ''' + # Cleanup Docker container + echo "Cleaning up Docker container: ${BUILD_TAG}" + sudo docker rm -f ${BUILD_TAG} 2>/dev/null || true + ''' + } + + cleanWs( + deleteDirs: true, + ) + + echo "Pipeline cleanup completed" + } + + success { + echo "✓ QEfficient Nightly Test Suite completed successfully" + // Optionally trigger downstream jobs here + // build job: 'qefficient_downstream_job', wait: false + } + + failure { + echo "✗ QEfficient Nightly Test Suite failed" + echo "Check logs above for detailed error information" + } + + unstable { + echo "⚠ QEfficient Nightly Test Suite produced unstable results" + echo "Some tests may have been skipped or failed" + } + } +} \ No newline at end of file diff --git a/tests/configs/causal_model_configs.json b/tests/configs/causal_model_configs.json index bf0fd642d..b17e57336 100644 --- a/tests/configs/causal_model_configs.json +++ b/tests/configs/causal_model_configs.json @@ -25,338 +25,6 @@ "vocab_size": 50257, "num_key_value_heads": 1 } - }, - { - "model_name": "allenai/OLMo-2-0425-1B", - "model_type": "olmo2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 100352, - "num_key_value_heads": 1 - } - }, - { - "model_name": "Salesforce/codegen-350M-mono", - "model_type": "codegen", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 4, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 51200, - "num_key_value_heads": 1, - "rotary_dim": 16 - } - }, - { - "model_name": "ibm-granite/granite-3.1-1b-a400m-base", - "model_type": "granitemoe", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - 
"intermediate_size": 256, - "vocab_size": 49155, - "num_key_value_heads": 1 - } - }, - { - "model_name": "microsoft/Phi-3-mini-4k-instruct", - "model_type": "phi3", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32064, - "num_key_value_heads": 1 - } - }, - { - "model_name": "tiiuae/falcon-7b", - "model_type": "falcon", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 65024, - "num_key_value_heads": 1 - } - }, - { - "model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507", - "model_type": "qwen3_moe", - "additional_params": { - "hidden_size": 256, - "intermediate_size": 256, - "max_position_embeddings": 128, - "max_window_layers": 48, - "moe_intermediate_size": 768, - "num_attention_heads": 2, - "num_experts": 4, - "num_experts_per_tok": 2, - "num_hidden_layers": 1, - "num_key_value_heads": 1, - "vocab_size": 151936 - } - }, - { - "model_name": "Qwen/Qwen2-0.5B", - "model_type": "qwen2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 151936, - "num_key_value_heads": 1 - } - }, - { - "model_name": "bigcode/starcoder2-3b", - "model_type": "starcoder2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 49152, - "num_key_value_heads": 1 - } - }, - { - "model_name": "Felladrin/Minueza-32M-Base", - "model_type": "mistral", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32002, - "num_key_value_heads": 1 - } - }, - { - "model_name": 
"wtang06/mpt-125m-c4", - "model_type": "mpt", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 50368 - } - }, - { - "model_name": "hakurei/gpt-j-random-tinier", - "model_type": "gptj", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 50400, - "num_key_value_heads": 1, - "rotary_dim": 16 - } - }, - { - "model_name": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "model_type": "mixtral", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32000, - "num_key_value_heads": 1 - } - }, - { - "model_name": "meta-llama/Llama-3.2-1B", - "model_type": "llama", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 128256, - "num_key_value_heads": 1, - "rope_scaling": { - "factor": 32.0, - "high_freq_factor": 4.0, - "low_freq_factor": 1.0, - "original_max_position_embeddings": 8192, - "rope_type": "llama3" - } - } - }, - { - "model_name": "unsloth/gemma-2b", - "model_type": "gemma", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 256000, - "num_key_value_heads": 1 - } - }, - { - "model_name": "unsloth/gemma-2-2b", - "model_type": "gemma2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 256000, - "num_key_value_heads": 1 - } - }, - { - "model_name": "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", - "model_type": "llama", - 
"additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32003 - } - }, - { - "model_name": "TheBloke/Llama-2-7B-GPTQ", - "model_type": "llama", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32000 - } - }, - { - "model_name": "ibm-granite/granite-20b-code-base", - "model_type": "gpt_bigcode", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 49152, - "num_key_value_heads": 1, - "activation_function": "gelu", - "architectures": [ - "GPTBigCodeForCausalLM" - ] - } - }, - { - "model_name": "neuralmagic/Llama-3.2-3B-Instruct-FP8", - "model_type": "llama", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 128256 - } - }, - { - "model_name": "neuralmagic/Qwen2-0.5B-Instruct-FP8", - "model_type": "qwen2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 2, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 151936 - } - }, - { - "model_name": "ibm-granite/granite-3.1-2b-instruct", - "model_type": "granite", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 49155, - "num_key_value_heads": 1 - } - }, - { - "model_name": "ibm-granite/granite-guardian-3.1-2b", - "model_type": "granite", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 49155, - 
"num_key_value_heads": 1 - } - }, - { - "model_name": "hpcai-tech/grok-1", - "model_type": null, - "additional_params":{ - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 131072, - "num_key_value_heads": 1 - } - }, - { - "model_name": "Snowflake/Llama-3.1-SwiftKV-8B-Instruct", - "model_type": null, - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 2, - "num_attention_heads": 2, - "hidden_size": 256, - "intermediate_size": 256, - "vocab_size": 128256, - "num_key_value_layers": 1, - "num_key_value_heads": 1, - "rope_scaling": { - "factor": 8.0, - "high_freq_factor": 4.0, - "low_freq_factor": 1.0, - "original_max_position_embeddings": 8192, - "rope_type": "llama3" - } - } } ], diff --git a/tests/conftest.py b/tests/conftest.py index d1f553cda..8e024360f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,6 +13,55 @@ from QEfficient.utils.constants import QEFF_MODELS_DIR from QEfficient.utils.logging_utils import logger +_QUICKCHECK_FILE = "tests/test_model_quickcheck.py" +_QUICKCHECK_SUMMARY = {} +_QUICKCHECK_META = { + "test_causal_lm_cpu_runtime_parity_with_api_runner": ( + "Causal LM", + "Full parity: HF PyTorch vs QEff PyTorch vs ORT tokens", + ), + "test_vlm_text_side_runtime_parity_and_full_export": ( + "VLM", + "Text-side full parity + full VLM export smoke", + ), + "test_vlm_export_smoke_additional_models": ( + "VLM", + "Export smoke with text-side fallback when needed", + ), + "test_text_embedding_cpu_parity_and_export": ( + "Text Embedding", + "Tensor parity: HF vs QEff PyTorch vs ORT", + ), + "test_audio_embedding_ctc_cpu_parity_and_export": ( + "Audio CTC", + "Logits parity: HF vs ORT + export", + ), + "test_seq_classification_cpu_parity_and_export": ( + "Sequence Classification", + "Logits parity: HF vs QEff PyTorch vs ORT", + ), + "test_whisper_export_smoke": ( + "Whisper", + "Export smoke + retained-state 
outputs check", + ), + "test_causal_subfunction_export_smoke": ( + "Causal LM", + "Subfunction export check (with/without QEffGPT2Block)", + ), + "test_causal_subfunction_export_smoke_all_models": ( + "Causal LM", + "Full parity: HF PyTorch vs QEff PyTorch vs ORT tokens (subfunctions)", + ), + "test_prefix_caching_continuous_batching_export_and_ort_smoke": ( + "Prefix Caching", + "Continuous-batching export structural checks", + ), + "test_awq_export_smoke": ( + "AWQ", + "Export smoke + MatMulNBits presence check", + ), +} + def qeff_models_clean_up(): if os.path.exists(QEFF_MODELS_DIR): @@ -42,3 +91,32 @@ def pytest_sessionfinish(session, exitstatus): if inside_worker is None: qeff_models_clean_up() logger.info("...PYTEST Session Ended.") + + +def pytest_runtest_logreport(report): + if _QUICKCHECK_FILE not in report.nodeid: + return + + if report.when == "call": + _QUICKCHECK_SUMMARY[report.nodeid] = report.outcome + return + + if report.when == "setup" and report.outcome == "skipped": + _QUICKCHECK_SUMMARY.setdefault(report.nodeid, report.outcome) + + +def pytest_terminal_summary(terminalreporter): + if not _QUICKCHECK_SUMMARY: + return + + terminalreporter.section("Quickcheck Coverage Summary", sep="=") + header = f"{'Status':7} {'Test Case':58} {'Category':24} Validation" + terminalreporter.write_line(header) + terminalreporter.write_line("-" * len(header)) + + for nodeid in sorted(_QUICKCHECK_SUMMARY): + test_case = nodeid.split("::", 1)[1] + base_name = test_case.split("[", 1)[0] + category, validation = _QUICKCHECK_META.get(base_name, ("Other", "N/A")) + status = _QUICKCHECK_SUMMARY[nodeid].upper() + terminalreporter.write_line(f"{status:7} {test_case:58} {category:24} {validation}") diff --git a/tests/transformers/__init__.py b/tests/transformers/__init__.py new file mode 100644 index 000000000..d647b73a6 --- /dev/null +++ b/tests/transformers/__init__.py @@ -0,0 +1,6 @@ +# ----------------------------------------------------------------------------- +# +# 
Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- diff --git a/tests/transformers/models/__init__.py b/tests/transformers/models/__init__.py new file mode 100644 index 000000000..d647b73a6 --- /dev/null +++ b/tests/transformers/models/__init__.py @@ -0,0 +1,6 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- diff --git a/tests/transformers/models/check_model_results.py b/tests/transformers/models/check_model_results.py new file mode 100644 index 000000000..73f198001 --- /dev/null +++ b/tests/transformers/models/check_model_results.py @@ -0,0 +1,179 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import json +import os +from datetime import datetime + +import numpy as np + + +def parse_exec_info_metrics(exec_info_str): + """ + Parse performance metrics from exec_info string. 
+ + :exec_info_str: str - The exec_info string containing performance stats + :return: dict - Dictionary containing parsed metrics + """ + import re + + metrics = { + "prefill_time_sec": None, + "decode_throughput_tokens_per_sec": None, + "total_throughput_tokens_per_sec": None, + "e2e_inference_time_sec": None, + } + + exec_info_text = str(exec_info_str) + + # Parse Average Prefill time (TTFT) + if "Average Prefill time" in exec_info_text or "TTFT" in exec_info_text: + match = re.search(r"Average Prefill time.*?is=\s*([\d.]+)\s*sec", exec_info_text) + if match: + metrics["prefill_time_sec"] = float(match.group(1)) + + # Parse Decode throughput + if "Decode" in exec_info_text: + match = re.search(r"Decode\s+is=\s*([\d.]+)\s*tokens/sec", exec_info_text) + if match: + metrics["decode_throughput_tokens_per_sec"] = float(match.group(1)) + + # Parse Total throughput + if "Total is=" in exec_info_text: + match = re.search(r"Total\s+is=\s*([\d.]+)\s*tokens/sec", exec_info_text) + if match: + metrics["total_throughput_tokens_per_sec"] = float(match.group(1)) + + # Parse Total E2E inference time + if "Total (E2E) inference time" in exec_info_text: + match = re.search(r"Total \(E2E\) inference time\s+is=\s*([\d.]+)\s*sec", exec_info_text) + if match: + metrics["e2e_inference_time_sec"] = float(match.group(1)) + + return metrics + + +def dump_and_compare_results( + model_name, + compile_params, + json_file_path, + cloud_ai_100_tokens, + exec_info=None, + pytorch_hf_tokens=None, + pytorch_kv_tokens=None, + ort_tokens=None, +): + """ + Function to dump the test results to JSON file and compare the performance and output results with previous runs if available + + :model_name: str + :pytorch_hf_tokens: list + :pytorch_kv_tokens: list + :ort_tokens: list + :cloud_ai_100_tokens: list + :exec_info: object + :compile_params: dict + :return None + """ + + current_logs_dir = os.environ.get("NIGHTLY_LOG_DIR") + if current_logs_dir is None: + current_logs_dir = 
os.path.expanduser("~/.cache/Nightly_Logs/build_tag") + os.makedirs(current_logs_dir, exist_ok=True) + # original_logs_dir = Path(current_logs_dir).parent + original_logs_dir = current_logs_dir + current_results_json_file_path = os.path.join(current_logs_dir, json_file_path) + original_results_json_file_path = os.path.join(original_logs_dir, json_file_path) + + def convert_to_serializable(obj): + if isinstance(obj, np.ndarray): + return obj.tolist() + elif isinstance(obj, np.integer): + return int(obj) + elif isinstance(obj, np.floating): + return float(obj) + elif isinstance(obj, list): + return [convert_to_serializable(item) for item in obj] + elif isinstance(obj, dict): + return {k: convert_to_serializable(v) for k, v in obj.items()} + return obj + + exec_info_metrics = parse_exec_info_metrics(exec_info) + + test_data = { + "model_name": model_name, + "timestamp": datetime.now().isoformat(), + "compile_params": compile_params, + "pytorch_hf_tokens": convert_to_serializable(pytorch_hf_tokens) if pytorch_hf_tokens is not None else None, + "pytorch_kv_tokens": convert_to_serializable(pytorch_kv_tokens), + "ort_tokens": convert_to_serializable(ort_tokens), + "cloud_ai_100_tokens": convert_to_serializable(cloud_ai_100_tokens), + "exec_info_metrics": exec_info_metrics, + "exec_info_raw_string": str(exec_info), + } + + # Load existing results if file exists + all_results = {} + if os.path.exists(current_results_json_file_path): + with open(current_results_json_file_path, "r") as f: + all_results = json.load(f) + print(f"Loaded existing model results from {current_results_json_file_path}") + else: + with open(current_results_json_file_path, "w", encoding="utf-8") as f: + json.dump({}, f) + print(f"Created new results file at {current_results_json_file_path}") + + model_name_safe = model_name.replace("/", "_").replace("-", "_") + all_results[model_name_safe] = test_data + + with open(current_results_json_file_path, "w") as f: + json.dump(all_results, f, indent=4, 
default=str) + print(f"Successfully saved test results to {current_results_json_file_path}") + + with open(original_results_json_file_path, "r") as f: + previous_results = json.load(f) + print(f"Loaded Previous model results from {original_results_json_file_path}") + + previous_data = previous_results[model_name_safe] + + # Compare performance metrics with 5% tolerance + previous_metrics = previous_data.get("exec_info_metrics", {}) + current_metrics = exec_info_metrics + + for metric_name in [ + "prefill_time_sec", + "decode_throughput_tokens_per_sec", + "total_throughput_tokens_per_sec", + "e2e_inference_time_sec", + ]: + prev_val = previous_metrics[metric_name] + curr_val = current_metrics[metric_name] + + if prev_val is not None and curr_val is not None and prev_val != 0: + percent_diff = abs((curr_val - prev_val) / prev_val) * 100 + assert percent_diff <= 5.0, ( + f"Performance metric {metric_name} exceeds 5% tolerance: " + f"previous={prev_val}, current={curr_val}, diff={percent_diff:.2f}%" + ) + print(f"✓ {metric_name}: {percent_diff:.2f}% difference (within 5% tolerance)") + + # Compare output tokens using Mean Absolute Deviation (MAD) with 10^-2 tolerance + previous_tokens = previous_data.get("cloud_ai_100_tokens", None) + + if previous_tokens is not None and isinstance(previous_tokens, list): + if previous_tokens and isinstance(previous_tokens[0], str): + print("⊘ Output tokens: Skipping Tokens check (previous data contains strings)") + else: + prev_tokens_arr = np.array(previous_tokens, dtype=np.float32) + curr_tokens_arr = np.array(cloud_ai_100_tokens, dtype=np.float32) + + mad = np.mean(np.abs(curr_tokens_arr - prev_tokens_arr)) + tolerance = 1e-2 + + assert mad <= tolerance, f"Output tokens MAD exceeds 10^-2 tolerance: MAD={mad:.6f}, tolerance={tolerance}" + print(f"✓ Output tokens MAD: {mad:.6f} (within 10^-2 tolerance)") + return True diff --git a/tests/transformers/models/image_text_to_text/test_continuous_batching.py 
b/tests/transformers/models/image_text_to_text/test_continuous_batching.py index c1a31eaa3..d472b1ce4 100644 --- a/tests/transformers/models/image_text_to_text/test_continuous_batching.py +++ b/tests/transformers/models/image_text_to_text/test_continuous_batching.py @@ -90,7 +90,7 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100_CB( ctx_len: int, max_gen_len: int = 20, batch_size: int = 1, - n_layer: int = 1, + n_layer: int = -1, kv_offload: bool = False, num_devices: int = 1, enable_qnn: Optional[bool] = False, @@ -277,12 +277,12 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100_CB( compile_kwargs = { "num_cores": 16, - "num_devices": num_devices, "prefill_seq_len": prompt_len, "ctx_len": ctx_len, "batch_size": batch_size, "full_batch_size": full_batch_size, "mxfp6_matmul": False, + "num_devices": 4, } if is_intern_model: @@ -375,7 +375,7 @@ def test_image_text_to_text_pytorch_vs_ai100_continuous_batching(model_name, kv_ img_size=img_size, image_urls=model_config_dict[model_name]["img_url_list"], queries=model_config_dict[model_name]["text_prompt_list"], - n_layer=model_config_dict[model_name]["num_layers"], + # n_layer=model_config_dict[model_name]["num_layers"], batch_size=model_config_dict[model_name]["batch_size"], full_batch_size=model_config_dict[model_name]["full_batch_size"], kv_offload=kv_offload, diff --git a/tests/transformers/models/image_text_to_text/test_image_text_to_text_models.py b/tests/transformers/models/image_text_to_text/test_image_text_to_text_models.py index a2c72ba7a..b647a1a7f 100644 --- a/tests/transformers/models/image_text_to_text/test_image_text_to_text_models.py +++ b/tests/transformers/models/image_text_to_text/test_image_text_to_text_models.py @@ -115,7 +115,7 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( ctx_len: int, max_gen_len: int = 20, batch_size: int = 1, - n_layer: int = 1, + n_layer: int = -1, kv_offload: bool = False, num_devices: int = 1, enable_qnn: Optional[bool] = False, @@ 
-302,12 +302,12 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( # assert (pytorch_hf_tokens == ort_tokens).all(), "Tokens don't match for pytorch HF output and ORT output" compile_kwargs = { - "num_devices": num_devices, "prefill_seq_len": prompt_len, "ctx_len": ctx_len, "mxfp6": False, "enable_qnn": enable_qnn, "qnn_config": qnn_config, + "num_devices": 4, } if is_intern_model: @@ -367,7 +367,7 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(model_name, kv_offload img_size=img_size, img_url=model_config_dict[model_name]["img_url"], query=model_config_dict[model_name]["text_prompt"], - n_layer=model_config_dict[model_name]["num_layers"], + # n_layer=model_config_dict[model_name]["num_layers"], batch_size=model_config_dict[model_name]["batch_size"], kv_offload=kv_offload, ) diff --git a/tests/transformers/models/test_audio_embedding_models.py b/tests/transformers/models/test_audio_embedding_models.py index 998546853..669dd272c 100644 --- a/tests/transformers/models/test_audio_embedding_models.py +++ b/tests/transformers/models/test_audio_embedding_models.py @@ -22,7 +22,8 @@ from QEfficient.utils import hf_download from QEfficient.utils._utils import create_json, load_hf_processor from QEfficient.utils.constants import WAV2VEC2_MAX_SEQ_LEN, QnnConstants -from QEfficient.utils.device_utils import get_available_device_id + +from .check_model_results import dump_and_compare_results CONFIG_PATH = "tests/configs/embedding_model_configs.json" @@ -129,9 +130,10 @@ def run_ctc_ort(onnx_path, config, processor: AutoProcessor, inputs: np.ndarray, def check_ctc_pytorch_vs_kv_vs_ort_vs_ai100( model_name: str, - n_layer: int = 1, + n_layer: int = -1, enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, + compare_results: Optional[bool] = False, ): """ Validate the PyTorch model, the PyTorch model after ONNX model and the Cloud AI 100 model @@ -162,10 +164,8 @@ def check_ctc_pytorch_vs_kv_vs_ort_vs_ai100( predicted_ids = 
torch.argmax(ort_tokens, dim=-1) ort_output = processor.batch_decode(predicted_ids) assert pytorch_output == ort_output, "Tokens don't match for pytorch output and ORT output." - if not get_available_device_id(): - pytest.skip("No available devices to run model on Cloud AI 100") + qeff_model.compile( - num_cores=16, batch_size=batch_size, enable_qnn=enable_qnn, qnn_config=qnn_config, @@ -173,8 +173,25 @@ def check_ctc_pytorch_vs_kv_vs_ort_vs_ai100( cloud_ai_100_output = qeff_model.generate(processor, data) assert pytorch_output == cloud_ai_100_output, "Tokens don't match for pytorch output and Cloud AI 100 output." assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) + if compare_results is False: + return + + compile_params = { + "batch_size": batch_size, + "enable_qnn": enable_qnn, + "qnn_config": qnn_config, + } + assert dump_and_compare_results( + model_name, + compile_params, + "ctc_model_results.json", + cloud_ai_100_output, + pytorch_hf_tokens=pytorch_output, + ort_tokens=ort_output, + ) +@pytest.mark.custom_layers @pytest.mark.on_qaic @pytest.mark.llm_model @pytest.mark.parametrize("model_name", test_models) @@ -187,6 +204,22 @@ def test_ctc_pytorch_vs_kv_vs_ort_vs_ai100(model_name): check_ctc_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=4) +@pytest.mark.full_model +@pytest.mark.on_qaic +@pytest.mark.llm_model +@pytest.mark.parametrize("model_name", test_models) +def test_full_ctc_pytorch_vs_kv_vs_ort_vs_ai100(model_name): + """ + Test function to validate the PyTorch model, the ONNX model, and the Cloud AI 100 model.
+ ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + """ + check_ctc_pytorch_vs_kv_vs_ort_vs_ai100( + model_name=model_name, + compare_results=True, + ) + + @pytest.mark.on_qaic @pytest.mark.llm_model @pytest.mark.qnn diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index a87ac8efc..72501c387 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -25,6 +25,8 @@ from QEfficient.utils.run_utils import ApiRunner from QEfficient.utils.test_utils import ModelConfig +from .check_model_results import dump_and_compare_results + CONFIG_PATH = "tests/configs/causal_model_configs.json" with open(CONFIG_PATH, "r") as f: @@ -72,14 +74,15 @@ def get_custom_n_layers(model_name): :return n_layer """ - if model_name in {"microsoft/Phi-3-mini-4k-instruct", "neuralmagic/Qwen2-0.5B-Instruct-FP8", "openai/gpt-oss-20b"}: - return 2 - elif model_name in ModelConfig.SWIFTKV_MODELS: - return None - return 1 + # if model_name in {"microsoft/Phi-3-mini-4k-instruct", "neuralmagic/Qwen2-0.5B-Instruct-FP8", "openai/gpt-oss-20b"}: + # return 2 + # elif model_name in ModelConfig.SWIFTKV_MODELS: + # return None + # return 1 + return None -def load_causal_lm_model(model_name, n_layer=1, config=None): +def load_causal_lm_model(model_name, n_layer=None, config=None): """ Function to load model from huggingface and transform to KV model -------- @@ -130,6 +133,103 @@ def load_causal_lm_model(model_name, n_layer=1, config=None): return model_hf, params +def check_full_causal_lm_and_compare_results(model_name): + """ + Function to check the full model and compare results between PyTorch, ONNX Runtime and Cloud AI 100. Compare the performance and tokens with the previous results.
+ + :model_name: str + + :return None + """ + prompt_len: int = Constants.PROMPT_LEN + ctx_len: int = Constants.CTX_LEN + prefill_only = None + retain_full_kv = None + pytorch_hf_tokens = None + pytorch_kv_tokens = None + + model_hf, _ = load_causal_lm_model(model_name) + tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name) + config = model_hf.config + batch_size = len(Constants.INPUT_STR) + api_runner = ApiRunner( + batch_size, + tokenizer, + config, + Constants.INPUT_STR, + Constants.PROMPT_LEN, + Constants.CTX_LEN, + ) + + if model_name not in ModelConfig.SWIFTKV_MODELS and model_name not in ModelConfig.EXTERNAL_MODELS: + pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model_hf) + print(f"HF PyTorch tokens: {pytorch_hf_tokens}") + + qeff_model = QEFFAutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path=model_name, + ) + pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model) + print(f"KV PyTorch tokens: {pytorch_kv_tokens}") + + if model_name not in ModelConfig.SWIFTKV_MODELS and model_name not in ModelConfig.EXTERNAL_MODELS: + assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), ( + "Tokens don't match for HF PyTorch model output and KV PyTorch model output" + ) + onnx_model_path = qeff_model.export() + ort_tokens = api_runner.run_kv_model_on_ort( + onnx_model_path, + ) + print(f"ONNX tokens: {ort_tokens}") + gen_len = ort_tokens.shape[-1] + + assert (pytorch_kv_tokens == ort_tokens).all(), "Tokens don't match for ONNXRT output and PyTorch output." 
+ + qpc_path = qeff_model.compile( + prefill_seq_len=prompt_len, + ctx_len=ctx_len, + num_devices=1, + mxfp6=False, + aic_enable_depth_first=False, + prefill_only=prefill_only, + retain_full_kv=retain_full_kv, + ) + exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR) + print(f"exec_info: {exec_info}") + print(f"Cloud AI 100 tokens: {exec_info.generated_ids}") + cloud_ai_100_tokens = exec_info.generated_ids[0][ + :, :gen_len + ] # Because we always run for single input and single batch size + if prefill_only: + assert (ort_tokens[0][0] == cloud_ai_100_tokens[0][0]).all(), ( + "prefill run output tokens don't match for ONNXRT output and Cloud AI 100 output." + ) + else: + assert (ort_tokens == cloud_ai_100_tokens).all(), ( + "Tokens don't match for ONNXRT output and Cloud AI 100 output." + ) + assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json")) + + compile_params = { + "prefill_seq_len": prompt_len, + "ctx_len": ctx_len, + "num_devices": 1, + "mxfp6": False, + "aic_enable_depth_first": False, + "prefill_only": prefill_only, + "retain_full_kv": retain_full_kv, + } + assert dump_and_compare_results( + model_name, + compile_params, + "causal_lm_model_results.json", + cloud_ai_100_tokens, + exec_info, + pytorch_hf_tokens, + pytorch_kv_tokens, + ort_tokens, + ) + + def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( model_name: str, prompt_len: int = Constants.PROMPT_LEN, @@ -193,7 +293,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( qpc_path = qeff_model.compile( prefill_seq_len=prompt_len, ctx_len=ctx_len, - num_cores=14, + num_devices=4, mxfp6=False, aic_enable_depth_first=False, num_speculative_tokens=num_speculative_tokens, @@ -268,7 +368,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( qpc_path = qeff_model.compile( prefill_seq_len=prompt_len, ctx_len=ctx_len, - num_cores=14, + num_devices=4, mxfp6=False, aic_enable_depth_first=False, batch_size=batch_size, @@ -330,8 +430,8 @@ def 
test_causal_lm_export_with_deprecated_api(model_name): ) +@pytest.mark.dummy_model @pytest.mark.on_qaic -@pytest.mark.regular @pytest.mark.llm_model @pytest.mark.parametrize("model_name", test_models_causal) def test_custom_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): @@ -349,7 +449,7 @@ def test_custom_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, config=hf_config) -@pytest.mark.nightly +@pytest.mark.custom_layers @pytest.mark.on_qaic @pytest.mark.llm_model @pytest.mark.parametrize("model_name", test_models_causal) @@ -364,6 +464,16 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=n_layer) +@pytest.mark.full_model +@pytest.mark.on_qaic +@pytest.mark.llm_model +@pytest.mark.parametrize("model_name", test_models_causal) +def test_full_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): + if model_name in ModelConfig.FULL_MODEL_TESTS_TO_SKIP: + pytest.skip(f"Skipping full model test for {model_name} due to resource constraints.") + check_full_causal_lm_and_compare_results(model_name) + + @pytest.mark.nightly @pytest.mark.on_qaic @pytest.mark.parametrize("retain_full_kv", [True, False]) diff --git a/tests/transformers/models/test_disagg_mode.py b/tests/transformers/models/test_disagg_mode.py index 537ecd0cc..74e768d5e 100644 --- a/tests/transformers/models/test_disagg_mode.py +++ b/tests/transformers/models/test_disagg_mode.py @@ -50,7 +50,7 @@ def test_disagg_mode_prefill(model_id, prompt): padded_len = num_chunks * PREFILL_SEQ_LEN # Convert to a multiple of prompt_len replace_transformers_quantizers() - model = AutoModelForCausalLM.from_pretrained(model_id, num_hidden_layers=2) + model = AutoModelForCausalLM.from_pretrained(model_id) config = model.config inputs = tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) inputs["position_ids"] = 
np.where(inputs.pop("attention_mask"), np.arange(padded_len), -1) @@ -62,7 +62,7 @@ def test_disagg_mode_prefill(model_id, prompt): undo_transformers_quantizers() - qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id, num_hidden_layers=2) + qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id) qeff_model.prefill(True) config = qeff_model.model.config inputs = tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) @@ -87,10 +87,9 @@ def test_disagg_mode_prefill(model_id, prompt): prefill_qpc_path = qeff_model.compile( prefill_seq_len=PREFILL_SEQ_LEN, ctx_len=CTX_LEN, - num_cores=16, + num_devices=4, mxfp6_matmul=False, mxint8_kv_cache=False, - num_devices=1, mos=1, aic_enable_depth_first=True, num_speculative_tokens=None, @@ -212,7 +211,9 @@ def test_disagg_mode_prefill_only_and_decode_only(model_id, prompt): padded_len = num_chunks * PREFILL_SEQ_LEN # Convert to a multiple of prompt_len replace_transformers_quantizers() - model = AutoModelForCausalLM.from_pretrained(model_id, num_hidden_layers=2) + model = AutoModelForCausalLM.from_pretrained( + model_id, + ) config = model.config inputs = tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) inputs["position_ids"] = np.where(inputs.pop("attention_mask"), np.arange(padded_len), -1) @@ -246,7 +247,7 @@ def test_disagg_mode_prefill_only_and_decode_only(model_id, prompt): undo_transformers_quantizers() - prefill_qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id, num_hidden_layers=2) + prefill_qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id) prefill_qeff_model.prefill(enable=True) config = prefill_qeff_model.model.config past_key_values = [] @@ -264,7 +265,9 @@ def test_disagg_mode_prefill_only_and_decode_only(model_id, prompt): # Check our pytorch implementation assert (prefill_qeff_out.logits - orig_out.logits[:, -1, :]).abs().max() < 1e-4 - decode_qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id, 
num_hidden_layers=2) + decode_qeff_model = QEFFAutoModelForCausalLM.from_pretrained( + model_id, + ) decode_qeff_model.prefill(enable=False) qeff_out = prefill_qeff_out @@ -293,10 +296,9 @@ def test_disagg_mode_prefill_only_and_decode_only(model_id, prompt): prefill_qpc_path = prefill_qeff_model.compile( prefill_seq_len=PREFILL_SEQ_LEN, ctx_len=CTX_LEN, - num_cores=16, + num_devices=4, mxfp6_matmul=False, mxint8_kv_cache=False, - num_devices=1, mos=1, aic_enable_depth_first=True, num_speculative_tokens=None, @@ -316,10 +318,9 @@ def test_disagg_mode_prefill_only_and_decode_only(model_id, prompt): decode_qpc_path = decode_qeff_model.compile( prefill_seq_len=1, ctx_len=CTX_LEN, - num_cores=16, + num_devices=4, mxfp6_matmul=False, mxint8_kv_cache=False, - num_devices=1, mos=1, aic_enable_depth_first=True, num_speculative_tokens=None, @@ -375,18 +376,17 @@ def test_disagg_mode_prefill_only_and_decode_only(model_id, prompt): def test_disagg_mode_prefix_caching(model_id, prompt): PREFILL_SEQ_LEN = 128 CTX_LEN = 128 * 3 - config = AutoConfig.from_pretrained(model_id, num_hidden_layers=2) - prefill_qeff_model = QEFFAutoModelForCausalLM.from_pretrained( - model_id, num_hidden_layers=2, continuous_batching=True + config = AutoConfig.from_pretrained( + model_id, ) + prefill_qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id, continuous_batching=True) prefill_qeff_model.prefill(enable=True, enable_chunking=True) prefill_qpc_path = prefill_qeff_model.compile( prefill_seq_len=PREFILL_SEQ_LEN, ctx_len=CTX_LEN, - num_cores=16, + num_devices=4, mxfp6_matmul=False, mxint8_kv_cache=False, - num_devices=1, mos=1, aic_enable_depth_first=True, num_speculative_tokens=None, @@ -396,17 +396,14 @@ def test_disagg_mode_prefix_caching(model_id, prompt): kv_cache_batch_size=2, ) - decode_qeff_model = QEFFAutoModelForCausalLM.from_pretrained( - model_id, num_hidden_layers=2, continuous_batching=True - ) + decode_qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id, 
continuous_batching=True) decode_qeff_model.prefill(enable=False) decode_qpc_path = decode_qeff_model.compile( prefill_seq_len=1, ctx_len=CTX_LEN, - num_cores=16, + num_devices=4, mxfp6_matmul=False, mxint8_kv_cache=False, - num_devices=1, mos=1, aic_enable_depth_first=True, num_speculative_tokens=None, @@ -437,7 +434,9 @@ def test_disagg_mode_prefix_caching(model_id, prompt): def prefix_caching_inference(model_id, prefill_qpc_path, decode_qpc_path, prompt, decode_batch_id): PREFILL_SEQ_LEN = 128 tokenizer = AutoTokenizer.from_pretrained(model_id) - config = AutoConfig.from_pretrained(model_id, num_hidden_layers=2) + config = AutoConfig.from_pretrained( + model_id, + ) inputs = tokenizer(prompt, return_tensors="np", padding=True) padded_len = inputs["input_ids"].shape[1] num_chunks = -(padded_len // -PREFILL_SEQ_LEN) # ceil divide without float diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py index 7eb09d911..b4aacc757 100644 --- a/tests/transformers/models/test_embedding_models.py +++ b/tests/transformers/models/test_embedding_models.py @@ -19,6 +19,8 @@ from QEfficient.utils._utils import create_json from QEfficient.utils.constants import Constants, QnnConstants +from .check_model_results import dump_and_compare_results + CONFIG_PATH = "tests/configs/embedding_model_configs.json" with open(CONFIG_PATH, "r") as f: @@ -29,22 +31,31 @@ def check_embed_pytorch_vs_ort_vs_ai100( model_name: str, seq_len: int = Constants.CTX_LEN, - n_layer: int = 1, + n_layer: int = -1, enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, pooling: Optional[str] = None, + compare_results: Optional[bool] = False, ): # Prepare input tokenizer = AutoTokenizer.from_pretrained(model_name) inputs = tokenizer("My name is", return_tensors="pt") # Original PyTorch model - pt_model = AutoModel.from_pretrained( - model_name, - num_hidden_layers=n_layer, - attn_implementation="eager", - trust_remote_code=True, - ) + 
pt_model = None + if n_layer == -1: + pt_model = AutoModel.from_pretrained( + model_name, + attn_implementation="eager", + trust_remote_code=True, + ) + else: + pt_model = AutoModel.from_pretrained( + model_name, + num_hidden_layers=n_layer, + attn_implementation="eager", + trust_remote_code=True, + ) # Original PyTorch model output pt_outputs = pt_model(**inputs) @@ -85,7 +96,6 @@ def check_embed_pytorch_vs_ort_vs_ai100( assert mad <= 10**-5, f"MAD is too high for onnx and Pytorch: {mad}" qeff_model.compile( - num_cores=14, enable_qnn=enable_qnn, qnn_config=qnn_config, ) @@ -100,6 +110,20 @@ def check_embed_pytorch_vs_ort_vs_ai100( assert mad <= 10**-2, f"MAD is too high for onnx and Pytorch: {mad}" assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) + if compare_results is False: + return + + compile_params = {"enable_qnn": enable_qnn, "qnn_config": qnn_config, "pooling": pooling, "seq_len": seq_len} + assert dump_and_compare_results( + model_name, + compile_params, + "embedding_model_results.json", + qeff_ai100_embeddings, + pytorch_hf_tokens=pt_embeddings, + pytorch_kv_tokens=qeff_pt_embeddings, + ort_tokens=onnx_outputs[0], + ) + @pytest.mark.on_qaic @pytest.mark.llm_model @@ -131,6 +155,19 @@ def test_embed_model_pytorch_vs_onnx_vs_ai100_multiple_seq_len(model): check_embed_pytorch_vs_ort_vs_ai100(model_name=model["model_name"], seq_len=[32, 20], n_layer=1) +@pytest.mark.full_model +@pytest.mark.on_qaic +@pytest.mark.llm_model +@pytest.mark.parametrize("model", embed_test_models) +def test_full_embed_model_pytorch_vs_onnx_vs_ai100_pooling(model): + """ + Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output with multiple seq_len. 
+ """ + check_embed_pytorch_vs_ort_vs_ai100( + model_name=model["model_name"], seq_len=32, pooling=model["pooling"], compare_results=True + ) + + ########## QNN TESTS ############## diff --git a/tests/transformers/models/test_seq_classification.py b/tests/transformers/models/test_seq_classification.py index d1c9cd84e..d2f3ec5ea 100644 --- a/tests/transformers/models/test_seq_classification.py +++ b/tests/transformers/models/test_seq_classification.py @@ -6,7 +6,7 @@ # ----------------------------------------------------------------------------- import os -from typing import List, Union +from typing import List, Optional, Union import numpy as np import pytest @@ -15,12 +15,16 @@ from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForSequenceClassification +from .check_model_results import dump_and_compare_results + seq_classification_test_models = [ "meta-llama/Llama-Prompt-Guard-2-22M", ] -def check_seq_classification_pytorch_vs_ai100(model_name: str, seq_len: Union[int, List[int]] = 32, n_layer: int = 1): +def check_seq_classification_pytorch_vs_ai100( + model_name: str, seq_len: Union[int, List[int]] = 32, n_layer: int = -1, compare_results: Optional[bool] = False +): """ Validate the PyTorch model and the Cloud AI 100 model for sequence classification. 
@@ -40,12 +44,20 @@ def check_seq_classification_pytorch_vs_ai100(model_name: str, seq_len: Union[in inputs = tokenizer(test_text, return_tensors="pt") # Run PyTorch model - pt_model = AutoModelForSequenceClassification.from_pretrained( - model_name, - num_hidden_layers=n_layer, - attn_implementation="eager", - trust_remote_code=True, - ) + pt_model = None + if n_layer == -1: + pt_model = AutoModelForSequenceClassification.from_pretrained( + model_name, + attn_implementation="eager", + trust_remote_code=True, + ) + else: + pt_model = AutoModelForSequenceClassification.from_pretrained( + model_name, + num_hidden_layers=n_layer, + attn_implementation="eager", + trust_remote_code=True, + ) pt_model.eval() with torch.no_grad(): @@ -56,7 +68,6 @@ def check_seq_classification_pytorch_vs_ai100(model_name: str, seq_len: Union[in # Create QEff model and compile qeff_model = QEFFAutoModelForSequenceClassification(pt_model) qpc_path = qeff_model.compile( - num_cores=16, seq_len=seq_len, batch_size=1, num_devices=1, @@ -83,8 +94,25 @@ def check_seq_classification_pytorch_vs_ai100(model_name: str, seq_len: Union[in # Print final result print(f"MAD (PyTorch vs AI100): {mad_pt_ai100:.2e}") + if compare_results is False: + return + + compile_params = { + "seq_len": seq_len, + "batch_size": 1, + "num_devices": 1, + "mxfp6_matmul": False, + } + assert dump_and_compare_results( + model_name, + compile_params, + "seq_classification_model_results.json", + ai100_logits.numpy(), + pytorch_hf_tokens=pt_logits.numpy(), + ) +@pytest.mark.custom_layers @pytest.mark.on_qaic @pytest.mark.parametrize("model_name", seq_classification_test_models) def test_seq_classification_pytorch_vs_ai100(model_name): @@ -120,3 +148,23 @@ def test_seq_classification_multiple_seq_len(model_name): seq_len=[32, 64, 128], n_layer=1, ) + + +@pytest.mark.full_model +@pytest.mark.on_qaic +@pytest.mark.parametrize("model_name", seq_classification_test_models) +def 
test_full_seq_classification_pytorch_vs_ai100(model_name): + """ + Test function to validate the full sequence classification model on Cloud AI 100. + + This test ensures that: + 1. The full (all-layer) model compiles successfully + 2. Cloud AI 100 logits match PyTorch logits within tolerance + 3. Performance metrics stay within 5% of previously saved results + 4. Output logits stay within MAD tolerance of previously saved results + """ + check_seq_classification_pytorch_vs_ai100( + model_name=model_name, + seq_len=32, + compare_results=True, + ) diff --git a/tests/transformers/models/test_speech_seq2seq_models.py b/tests/transformers/models/test_speech_seq2seq_models.py index 774802c83..130a401a9 100644 --- a/tests/transformers/models/test_speech_seq2seq_models.py +++ b/tests/transformers/models/test_speech_seq2seq_models.py @@ -24,7 +24,8 @@ from QEfficient.utils import get_padding_shape_from_config, hf_download from QEfficient.utils._utils import create_json, load_hf_processor from QEfficient.utils.constants import Constants, QnnConstants -from QEfficient.utils.device_utils import get_available_device_id + +from .check_model_results import dump_and_compare_results CONFIG_PATH = "tests/configs/speech_seq2seq_model_configs.json" @@ -46,13 +47,22 @@ def load_seq2seq_model(model_config): repo_id=model_config["model_name"], ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], ) - model_hf = AutoModelForSpeechSeq2Seq.from_pretrained( - model_path, - use_cache=True, - num_hidden_layers=model_config["n_layer"], - attn_implementation="eager", - low_cpu_mem_usage=False, - ) # Run models for single layers only + model_hf = None + if model_config["n_layer"] != -1: + model_hf = AutoModelForSpeechSeq2Seq.from_pretrained( + model_path, + use_cache=True, + num_hidden_layers=model_config["n_layer"], + attn_implementation="eager", + low_cpu_mem_usage=False, + ) + else: + model_hf = AutoModelForSpeechSeq2Seq.from_pretrained( + model_path, + use_cache=True, + attn_implementation="eager",
+ low_cpu_mem_usage=False, + ) params = sum(p.numel() for p in model_hf.parameters()) model_hf.eval() return model_hf, params @@ -290,9 +300,10 @@ def run_seq2seq_ort( def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100( model_name: str, ctx_len: int = Constants.CTX_LEN, - n_layer: int = 1, + n_layer: int = -1, enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, + compare_results: Optional[bool] = False, ): """ Validate the PyTorch model, the PyTorch model after KV changes, ONNX model and the Cloud AI 100 model @@ -307,6 +318,7 @@ def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100( model_hf, _ = load_seq2seq_model(model_config) + print(model_hf) processor = load_hf_processor(pretrained_model_name_or_path=model_name) batch_size = 1 @@ -314,26 +326,19 @@ def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100( data = ds[0]["audio"]["array"] data = data.reshape(-1) sample_rate = ds[0]["audio"]["sampling_rate"] - pytorch_hf_tokens = run_seq2seq_pytorch_hf(model_hf, processor, data, sample_rate, ctx_len) qeff_model = QEFFAutoModelForSpeechSeq2Seq(model_hf, pretrained_model_name_or_path=model_name) pytorch_kv_tokens = run_seq2seq_pytorch_with_kv(qeff_model, processor, data, sample_rate, ctx_len) - assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), ( "Tokens don't match for HF PyTorch model output and KV PyTorch model output" ) qeff_model.export() - ort_tokens = run_seq2seq_ort(qeff_model.onnx_path, qeff_model.model.config, processor, data, sample_rate, ctx_len) - assert (pytorch_kv_tokens == ort_tokens).all(), "Tokens don't match for pytorch output and ort output" - if not get_available_device_id(): - pytest.skip("No available devices to run model on Cloud AI 100") - qeff_model.compile( ctx_len=ctx_len, num_cores=16, @@ -341,7 +346,6 @@ def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100( enable_qnn=enable_qnn, qnn_config=qnn_config, ) - exec_info = qeff_model.generate( inputs=processor(data, sampling_rate=sample_rate, return_tensors="pt"), generation_len=ctx_len ) @@ 
-351,7 +355,23 @@ def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100( ) assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) + if compare_results is False: + return + + compile_params = {"enable_qnn": enable_qnn, "qnn_config": qnn_config, "seq_len": ctx_len, "n_layer": n_layer} + assert dump_and_compare_results( + model_name, + compile_params, + "speech_seq2seq_model_results.json", + cloud_ai_100_tokens, + exec_info=exec_info, + pytorch_hf_tokens=pytorch_hf_tokens, + pytorch_kv_tokens=pytorch_kv_tokens, + ort_tokens=ort_tokens, + ) + +@pytest.mark.custom_layers @pytest.mark.on_qaic @pytest.mark.llm_model @pytest.mark.parametrize("model_name", test_models) @@ -364,6 +384,17 @@ def test_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100(model_name): check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=4) +@pytest.mark.full_model +@pytest.mark.on_qaic +@pytest.mark.llm_model +@pytest.mark.parametrize("model_name", test_models) +def test_full_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100(model_name): + check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100( + model_name=model_name, + compare_results=True, + ) + + @pytest.mark.on_qaic @pytest.mark.llm_model @pytest.mark.qnn