diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index 4657c0f3..81f4b639 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -742,7 +742,7 @@ def output_decode(generated_ids, input_shape):
     )
     parser.add_argument(
         "--verbose",
-        help="Print verbose output (e.g. quantization summary). Disable by --no_verbose.",
+        help="Print verbose output (e.g. quantization summary). Disable by --no-verbose.",
         default=True,
         action=argparse.BooleanOptionalAction,
     )
diff --git a/examples/vlm_ptq/README.md b/examples/vlm_ptq/README.md
index 23d0de30..1f51b950 100644
--- a/examples/vlm_ptq/README.md
+++ b/examples/vlm_ptq/README.md
@@ -41,6 +41,7 @@ Please refer to the [llm_ptq/README.md](../llm_ptq/README.md#current-out-of-the-
 | Llava | llava | ✅ | ✅ | ✅ | ✅ | ❌ |
 | VILA | vila | ✅ | ✅ | ✅ | ✅ | ❌ |
 | Phi-3-vision | phi | ✅ | ✅ | ✅ | ✅ | ❌ |
+| Qwen2.5-VL | qwen | ✅ | ✅ | ✅ | ✅ | ❌ |
 
 > *1.The w4a8_awq is an experimental quantization scheme that may result in a higher accuracy penalty.* \
 > *2.A selective set of the popular models are internally tested. The actual model support list may be longer. NVFP4 inference requires Blackwell GPUs and TensorRT-LLM v0.17 or later.*
@@ -51,7 +52,7 @@ Please refer to the [llm_ptq/README.md](../llm_ptq/README.md#current-out-of-the-
 
 Please refer to the [llm_ptq/README.md](../llm_ptq/README.md) about the details of model quantization.
 
-The following scripts provide an all-in-one and step-by-step model quantization example for Llava, VILA and Phi-3-vision models. The quantization format and the number of GPUs will be supplied as inputs to these scripts. By default, we build the engine for the fp8 format and 1 GPU.
+The following scripts provide an all-in-one and step-by-step model quantization example for Llava, VILA, Phi-3-vision and Qwen2.5-VL models. The quantization format and the number of GPUs will be supplied as inputs to these scripts. By default, we build the engine for the fp8 format and 1 GPU.
 
 ### Hugging Face Example [Script](./scripts/huggingface_example.sh)
 
@@ -76,6 +77,13 @@ git clone https://huggingface.co/microsoft/Phi-3-vision-128k-instruct
 scripts/huggingface_example.sh --type phi --model Phi-3-vision-128k-instruct --quant [fp8|int8_sq|int4_awq|w4a8_awq]
 ```
 
+For [Qwen2.5-VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct):
+
+```bash
+git clone https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct
+scripts/huggingface_example.sh --type qwen --model Qwen2.5-VL-7B-Instruct --export_fmt hf --quant [fp8|nvfp4|int8_sq|int4_awq|w4a8_awq]
+```
+
 The example scripts above also have an additional flag `--tasks gqa`, which will trigger evaluation of the built TensorRT engine using GQA benchmark. Details of the evaluation is explained in this [tutorial](../vlm_eval/README.md).
 
 If you encounter Out of Memory (OOM) issues during inference or evaluation, you can try lowering the `--kv_cache_free_gpu_memory_fraction` argument (default is 0.8) to reduce GPU memory usage for kv_cache:
diff --git a/examples/vlm_ptq/scripts/huggingface_example.sh b/examples/vlm_ptq/scripts/huggingface_example.sh
index 9ac5aaa0..9bab141d 100755
--- a/examples/vlm_ptq/scripts/huggingface_example.sh
+++ b/examples/vlm_ptq/scripts/huggingface_example.sh
@@ -30,10 +30,10 @@
 for i in $(env | grep ^PMI_ | cut -d"=" -f 1); do unset -v $i; done
 for i in $(env | grep ^PMIX_ | cut -d"=" -f 1); do unset -v $i; done
 
 case $MODEL_TYPE in
-    llava|phi|vila|mllama)
+    llava|phi|vila|mllama|qwen)
         ;;
     *)
-        echo "Unsupported type argument: Expected one of: [llava, phi, vila, mllama]" >&2
+        echo "Unsupported type argument: Expected one of: [llava, phi, vila, mllama, qwen]" >&2
         exit 1
 esac
@@ -58,10 +58,10 @@ case $SPARSITY_FMT in
 esac
 
 case $QFORMAT in
-    fp8|int8_sq|int4_awq|w4a8_awq|fp16|bf16)
+    fp8|nvfp4|int8_sq|int4_awq|w4a8_awq|fp16|bf16)
         ;;
     *)
-        echo "Unknown quant argument: Expected one of: [fp8, int8_sq, int4_awq, w4a8_awq, fp16, bf16]" >&2
+        echo "Unknown quant argument: Expected one of: [fp8, nvfp4, int8_sq, int4_awq, w4a8_awq, fp16, bf16]" >&2
         exit 1
 esac
 
@@ -91,7 +91,7 @@ fi
 
 BUILD_MAX_OUTPUT_LEN=512
 
-if [ "$MODEL_TYPE" = "llava" ] || [ "$MODEL_TYPE" = "vila" ]; then
+if [ "$MODEL_TYPE" = "llava" ] || [ "$MODEL_TYPE" = "vila" ] || [ "$MODEL_TYPE" = "qwen" ]; then
     BUILD_MAX_BATCH_SIZE=20
 else
     BUILD_MAX_BATCH_SIZE=4
@@ -149,6 +149,9 @@ case "${MODEL_TYPE}" in
         PTQ_ARGS+=" --kv_cache_qformat none "
         VLM_ARGS=" --max_encoder_input_len=6404 --skip_run"
         ;;
+    "qwen")
+        PTQ_ARGS+=" --kv_cache_qformat none "
+        ;;
 esac
 
 if [ "${MODEL_TYPE}" = "vila" ]; then
@@ -177,6 +180,7 @@ if [[ $TASKS =~ "build" ]] || [[ ! -d "$ENGINE_DIR" ]] || [[ ! $(ls -A $ENGINE_D
             --inference_tensor_parallel=$TP \
             --inference_pipeline_parallel=$PP \
             --export_fmt=$EXPORT_FORMAT \
+            --no-verbose \
             $PTQ_ARGS
     else
         echo "Quantized model config $MODEL_CONFIG exists, skipping the quantization stage"
@@ -213,6 +217,10 @@ case "${MODEL_TYPE}" in
     "phi")
         VISUAL_MODEL_TYPE="phi-3-vision"
         ;;
+    "qwen")
+        # Map generic type to TRT-LLM multimodal model type
+        VISUAL_MODEL_TYPE="qwen2_vl"
+        ;;
 esac
 
diff --git a/modelopt/torch/export/model_config_export.py b/modelopt/torch/export/model_config_export.py
index 6dbec4a2..9671cd3f 100644
--- a/modelopt/torch/export/model_config_export.py
+++ b/modelopt/torch/export/model_config_export.py
@@ -362,6 +362,7 @@ def torch_to_tensorrt_llm_checkpoint(
             "glm",
             "llama",
             "mllama",
+            "qwen",
         ], f"lm_head not available for decoder {decoder_type}"
         config.share_embedding_table = True
 
diff --git a/modelopt/torch/export/plugins/hf_spec_export.py b/modelopt/torch/export/plugins/hf_spec_export.py
index 0a5045f0..fe044828 100644
--- a/modelopt/torch/export/plugins/hf_spec_export.py
+++ b/modelopt/torch/export/plugins/hf_spec_export.py
@@ -82,7 +82,12 @@ def rename_and_prune_if_spec_decoding(model: nn.Module, post_state_dict: dict):
 
 def set_config_if_spec_decoding(model: nn.Module, config_data: dict):
     """Return the config of draft model in official format."""
-    if len(model._modelopt_state) != 1 or model._modelopt_state[0][0] != "eagle":
+    opt_modes = getattr(model, "_modelopt_state", None)
+    if (
+        not isinstance(opt_modes, (list, tuple))
+        or len(opt_modes) != 1
+        or opt_modes[0][0] != "eagle"
+    ):
         # return as is
         return config_data
 
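A note on the `hf_spec_export.py` change above: the guarded lookup keeps `set_config_if_spec_decoding` from raising an `AttributeError` (or a `TypeError` on `len()`) when a model was never converted by modelopt's speculative-decoding path and so has no `_modelopt_state`. Below is a minimal standalone sketch of that check, not the library's actual module; the `[("eagle", {})]` state layout is assumed purely for illustration, inferred from how the diff indexes the attribute.

```python
import torch.nn as nn


def _is_single_eagle_state(model: nn.Module) -> bool:
    # getattr with a default tolerates models that never had _modelopt_state set;
    # the isinstance check tolerates unexpected types before calling len() or indexing.
    opt_modes = getattr(model, "_modelopt_state", None)
    return (
        isinstance(opt_modes, (list, tuple))
        and len(opt_modes) == 1
        and opt_modes[0][0] == "eagle"
    )


plain = nn.Linear(4, 4)  # no _modelopt_state attribute at all
print(_is_single_eagle_state(plain))  # False; direct attribute access would raise AttributeError

eagle_like = nn.Linear(4, 4)
eagle_like._modelopt_state = [("eagle", {})]  # hypothetical state layout, for illustration only
print(_is_single_eagle_state(eagle_like))  # True
```

With this guard, non-speculative models simply fall through and `config_data` is returned unchanged, which matches the early-return branch in the patched function.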