Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 2 additions & 5 deletions QEfficient/transformers/modeling_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,8 +204,8 @@ def _configure_proxy_for_model(instance: "QEFFBaseModel", enable_proxy: bool) ->
"""
Configure per-instance transform lists based on proxy mode.

By default, clip/split ONNX transforms are disabled for production exports.
They are only enabled when proxy flow is explicitly requested.
Keep class-defined ONNX transforms by default.
Proxy flow appends additional proxy-only transforms.
"""
instance._pytorch_transforms = list(instance._pytorch_transforms)
instance._onnx_transforms = list(instance._onnx_transforms)
Expand All @@ -218,9 +218,6 @@ def _configure_proxy_for_model(instance: "QEFFBaseModel", enable_proxy: bool) ->
if transform not in instance._onnx_transforms:
instance._onnx_transforms.append(transform)
logger.info("Proxy Model Enabled for QEfficient Model")
return

instance._onnx_transforms = [t for t in instance._onnx_transforms if t not in _PROXY_ONLY_ONNX_TRANSFORMS]


# Define a transformers layers to QEff layers dictionary
Expand Down
18 changes: 9 additions & 9 deletions QEfficient/transformers/models/modeling_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

import QEfficient
from QEfficient.base.modeling_qeff import QEFFBaseModel
from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform
from QEfficient.base.onnx_transforms import FP16ClipTransform
from QEfficient.base.pytorch_transforms import SplitGateUpWeightsTransform
from QEfficient.generation.cloud_infer import QAICInferenceSession
from QEfficient.generation.text_generation_inference import (
Expand Down Expand Up @@ -229,7 +229,7 @@ class QEFFAutoModel(QEFFTransformersBase):

_hf_auto_class = AutoModel
_pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform]
_onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
_onnx_transforms = [FP16ClipTransform]

def __init__(self, model: nn.Module, pooling=None, **kwargs):
"""
Expand Down Expand Up @@ -617,7 +617,7 @@ class QEFFAutoModelForSequenceClassification(QEFFTransformersBase):

_hf_auto_class = AutoModelForSequenceClassification
_pytorch_transforms = [CustomOpsTransform, TextClassificationTransform]
_onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
_onnx_transforms = []

def __init__(self, model: nn.Module, **kwargs):
"""
Expand Down Expand Up @@ -859,7 +859,7 @@ class QEffVisionEncoderForTextImageToTextModel(QEFFBaseModel):
KVCacheTransform,
KVCacheExternalModuleMapperTransform,
]
_onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
_onnx_transforms = []

def __init__(self, model: nn.modules, **kwargs):
"""
Expand Down Expand Up @@ -998,7 +998,7 @@ class QEffCausalLMForTextImageToTextModel(QEFFBaseModel):
VlmKVOffloadTransform,
SplitGateUpWeightsTransform,
]
_onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
_onnx_transforms = []

def __init__(self, model, qaic_config: Optional[dict] = None, **kwargs):
"""
Expand Down Expand Up @@ -1874,7 +1874,7 @@ class _QEFFAutoModelForImageTextToTextSingleQPC(QEFFTransformersBase, Multimodal
VlmNoKVOffloadTransform,
SplitGateUpWeightsTransform,
]
_onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
_onnx_transforms = []

def __init__(
self,
Expand Down Expand Up @@ -2626,7 +2626,7 @@ class QEFFAutoModelForCausalLM(QEFFBaseModel):
KVCacheExternalModuleMapperTransform,
]

_onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
_onnx_transforms = []

def prefill(
self,
Expand Down Expand Up @@ -3575,7 +3575,7 @@ class QEFFAutoModelForSpeechSeq2Seq(QEFFTransformersBase, MultimodalUtilityMixin

_hf_auto_class = AutoModelForSpeechSeq2Seq
_pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, KVCacheTransform]
_onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
_onnx_transforms = []

def __init__(self, model: nn.Module, **kwargs):
"""
Expand Down Expand Up @@ -3934,7 +3934,7 @@ class QEFFAutoModelForCTC(QEFFTransformersBase):

_hf_auto_class = AutoModelForCTC
_pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform]
_onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
_onnx_transforms = []

def __init__(self, model: nn.Module, **kwargs):
super().__init__(model, **kwargs)
Expand Down
92 changes: 92 additions & 0 deletions QEfficient/utils/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,68 @@
import torch.nn as nn
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
from transformers import (
AutoModelForCausalLM,
AutoModelForImageTextToText,
)


def load_vlm_model(config):
    """Load a pretrained HF vision-language model described by *config*.

    Tries ``AutoModelForImageTextToText`` first; if that auto class raises
    ``ValueError`` (presumably an architecture it does not map — TODO confirm),
    falls back to ``AutoModelForCausalLM`` with ``trust_remote_code=True``.
    The model is put in eval mode before being returned.

    Args:
        config: a HF config carrying ``_name_or_path`` of the checkpoint.

    Returns:
        The loaded model in eval mode.
    """
    checkpoint = config._name_or_path
    shared_kwargs = {"low_cpu_mem_usage": False, "config": config}
    try:
        model = AutoModelForImageTextToText.from_pretrained(checkpoint, **shared_kwargs)
    except ValueError:
        # Fallback path for architectures not served by the ImageTextToText class.
        model = AutoModelForCausalLM.from_pretrained(
            checkpoint,
            trust_remote_code=True,
            **shared_kwargs,
        )
    model.eval()
    return model


def load_vlm_model_from_config(config):
    """Instantiate a randomly-initialized HF VLM from *config* (no weights).

    Tries ``AutoModelForImageTextToText.from_config`` first and falls back to
    ``AutoModelForCausalLM.from_config`` on ``ValueError``. Models whose config
    declares a half-precision ``torch_dtype`` are upcast to float32. The model
    is returned in eval mode.

    Args:
        config: a HF model config.

    Returns:
        The instantiated model, float32, in eval mode.
    """
    build_kwargs = {"attn_implementation": "eager", "trust_remote_code": True}
    try:
        model = AutoModelForImageTextToText.from_config(config, **build_kwargs)
    except ValueError:
        model = AutoModelForCausalLM.from_config(config, **build_kwargs)
    # Upcast half-precision checkpointed configs so CPU reference runs work.
    if getattr(model.config, "torch_dtype", None) in (torch.float16, torch.bfloat16):
        model = model.to(torch.float32)
    model.eval()
    return model


def set_num_layers_vlm(config, n_layer=1):
    """Shrink a (possibly multimodal) model config to *n_layer* hidden layers.

    The config is mutated in place and also returned. ``n_layer == -1`` is a
    sentinel meaning "keep every layer" — the config is returned untouched.

    Args:
        config: a HF model config; may wrap ``text_config``/``vision_config``
            or ``llm_config``/``vision_config`` sub-configs.
        n_layer: desired number of hidden layers, or -1 for no truncation.

    Returns:
        The same config object, truncated.
    """
    if n_layer == -1:
        # Sentinel: use all layers of the model as-is.
        return config

    if "mllama" in getattr(config, "model_type", ""):
        text_cfg = config.text_config
        text_cfg.num_hidden_layers = n_layer
        # Cross-attention layer indices beyond the new depth no longer exist.
        text_cfg.cross_attention_layers = [
            idx for idx in text_cfg.cross_attention_layers if idx < n_layer
        ]
    elif hasattr(config, "text_config"):
        config.text_config.num_hidden_layers = n_layer
        config.vision_config.num_hidden_layers = n_layer
    elif hasattr(config, "llm_config"):
        config.llm_config.num_hidden_layers = n_layer
        config.vision_config.num_hidden_layers = n_layer
    else:
        # Plain (non-nested) config: truncate it directly.
        config.num_hidden_layers = n_layer
    return config


# Processor class for InternVL models
Expand Down Expand Up @@ -169,6 +231,36 @@ class ModelConfig:
"TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
}

STANDARD_VLM_MODELS = {
"llava-hf/llava-1.5-7b-hf",
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
"google/gemma-3-4b-it",
"mistralai/Mistral-Small-3.1-24B-Instruct-2503",
"Qwen/Qwen2.5-VL-3B-Instruct",
"meta-llama/Llama-3.2-11B-Vision-Instruct",
}

INTERNVL_MODELS = {
"OpenGVLab/InternVL2_5-1B",
"OpenGVLab/InternVL3_5-1B",
}

MOLMO_MODELS = {
"allenai/Molmo-7B-D-0924",
}

SKIPPED_MODELS = {
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
"allenai/Molmo-7B-D-0924",
"meta-llama/Llama-3.2-11B-Vision-Instruct",
}

DUAL_QPC_MODELS = {
"OpenGVLab/InternVL2_5-1B",
"OpenGVLab/InternVL3_5-1B",
"Qwen/Qwen2.5-VL-3B-Instruct",
}

EXTERNAL_MODELS = {
"hpcai-tech/grok-1": {
"pytorch_hf_tokens_custom_case": [
Expand Down
7 changes: 4 additions & 3 deletions scripts/Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ pipeline {
mkdir -p $PWD/Non_cli_qaic_multimodal &&
export TOKENIZERS_PARALLELISM=false &&
export QEFF_HOME=$PWD/Non_cli_qaic_multimodal &&
pytest tests -m '(not cli) and (on_qaic) and (multimodal) and (not qnn) and (not finetune) and (not diffusion_models)' --ignore tests/vllm --ignore tests/unit_test --junitxml=tests/tests_log6.xml --durations=10 &&
pytest tests -m '(not cli) and (on_qaic) and (multimodal) and (not qnn) and (not finetune) and (not diffusion_models) and (not nightly)' --ignore tests/vllm --ignore tests/unit_test --junitxml=tests/tests_log6.xml --durations=10 &&

junitparser merge tests/tests_log6.xml tests/tests_log.xml &&
deactivate"
'''
Expand Down Expand Up @@ -203,9 +204,9 @@ pipeline {
cd /efficient-transformers &&
. preflight_qeff/bin/activate &&
# TODO: Update torch_qaic path to py312 when migrating to Python 3.12
pip install /opt/qti-aic/integrations/torch_qaic/py312/torch_qaic-0.1.0-cp312-cp312-linux_x86_64.whl &&
pip install /opt/qti-aic/integrations/torch_qaic/py312/torch_qaic-0.1.0-cp312-cp312-manylinux_2_34_x86_64.whl &&
# pip install /opt/qti-aic/integrations/torch_qaic/py310/torch_qaic-0.1.0-cp310-cp310-linux_x86_64.whl &&
pip install torch==2.9.0 torchvision==0.24.0 torchaudio==2.9.0 --index-url https://download.pytorch.org/whl/cpu &&
pip install torch==2.9.1 torchvision==0.24.1 torchaudio==2.9.1 --index-url https://download.pytorch.org/whl/cpu &&
mkdir -p $PWD/cli_qaic_finetuning &&
export TOKENIZERS_PARALLELISM=false &&
export QEFF_HOME=$PWD/cli_qaic_finetuning &&
Expand Down
52 changes: 51 additions & 1 deletion tests/configs/causal_model_configs.json
Original file line number Diff line number Diff line change
Expand Up @@ -487,5 +487,55 @@
}
}
}
],
"disaggregated_causal_lm_models": [
{
"model_name": "openai/gpt-oss-120b",
"model_type": "gpt_oss",
"additional_params": {
"num_hidden_layers": 2,
"hidden_size": 64,
"intermediate_size": 256,
"num_attention_heads": 2,
"num_key_value_heads": 1,
"num_local_experts": 4
}
}
],
"disaggregated_dummy_models": [
{
"model_name": "openai/gpt-oss-20b",
"model_type": "gpt_oss",
"tokenizer_id": "gpt2",
"additional_params": {
"num_hidden_layers": 2,
"hidden_size": 64,
"intermediate_size": 256,
"num_attention_heads": 2,
"num_key_value_heads": 1,
"num_local_experts": 4,
"head_dim": 32,
"max_position_embeddings": 512,
"vocab_size": 201088,
"sliding_window": 128
}
},
{
"model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507",
"model_type": "qwen3_moe",
"additional_params": {
"hidden_size": 256,
"intermediate_size": 256,
"max_position_embeddings": 512,
"max_window_layers": 48,
"moe_intermediate_size": 768,
"num_attention_heads": 2,
"num_experts": 4,
"num_experts_per_tok": 2,
"num_hidden_layers": 2,
"num_key_value_heads": 1,
"vocab_size": 151936
}
}
]
}
}
Loading
Loading