Add Qwen quantization and HF export support in examples

i-riyad · i-riyad · commit bc62a2454c52 · 2025-09-09T20:13:52.000Z
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
@@ -742,7 +742,7 @@ def output_decode(generated_ids, input_shape):
     )
     parser.add_argument(
         "--verbose",
-        help="Print verbose output (e.g. quantization summary). Disable by --no_verbose.",
+        help="Print verbose output (e.g. quantization summary). Disable by --no-verbose.",
         default=True,
         action=argparse.BooleanOptionalAction,
     )
diff --git a/examples/vlm_ptq/scripts/huggingface_example.sh b/examples/vlm_ptq/scripts/huggingface_example.sh
@@ -30,10 +30,10 @@ for i in $(env | grep ^PMI_ | cut -d"=" -f 1); do unset -v $i; done
 for i in $(env | grep ^PMIX_ | cut -d"=" -f 1); do unset -v $i; done
 
 case $MODEL_TYPE in
-    llava|phi|vila|mllama)
+    llava|phi|vila|mllama|qwen)
         ;;
     *)
-        echo "Unsupported type argument: Expected one of: [llava, phi, vila, mllama]" >&2
+        echo "Unsupported type argument: Expected one of: [llava, phi, vila, mllama, qwen]" >&2
         exit 1
 esac
 
@@ -91,7 +91,7 @@ fi
 
 BUILD_MAX_OUTPUT_LEN=512
 
-if [ "$MODEL_TYPE" = "llava" ] || [ "$MODEL_TYPE" = "vila" ]; then
+if [ "$MODEL_TYPE" = "llava" ] || [ "$MODEL_TYPE" = "vila" ] || [ "$MODEL_TYPE" = "qwen" ]; then
     BUILD_MAX_BATCH_SIZE=20
 else
     BUILD_MAX_BATCH_SIZE=4
@@ -149,6 +149,10 @@ case "${MODEL_TYPE}" in
         PTQ_ARGS+=" --kv_cache_qformat none "
         VLM_ARGS=" --max_encoder_input_len=6404 --skip_run"
         ;;
+    "qwen")
+        PTQ_ARGS+=" --kv_cache_qformat none "
+        EXPORT_FORMAT="hf"
+        ;;
 esac
 
 if [ "${MODEL_TYPE}" = "vila" ]; then
@@ -177,6 +181,7 @@ if [[ $TASKS =~ "build" ]] || [[ ! -d "$ENGINE_DIR" ]] || [[ ! $(ls -A $ENGINE_D
             --inference_tensor_parallel=$TP \
             --inference_pipeline_parallel=$PP \
             --export_fmt=$EXPORT_FORMAT \
+            --no-verbose \
             $PTQ_ARGS
     else
         echo "Quantized model config $MODEL_CONFIG exists, skipping the quantization stage"
@@ -213,6 +218,10 @@ case "${MODEL_TYPE}" in
     "phi")
         VISUAL_MODEL_TYPE="phi-3-vision"
         ;;
+    "qwen")
+        # Map generic type to TRT-LLM multimodal model type
+        VISUAL_MODEL_TYPE="qwen2_vl"
+        ;;
 esac
 
 
diff --git a/modelopt/torch/export/model_config_export.py b/modelopt/torch/export/model_config_export.py
@@ -362,6 +362,7 @@ def torch_to_tensorrt_llm_checkpoint(
                     "glm",
                     "llama",
                     "mllama",
+                    "qwen",
                 ], f"lm_head not available for decoder {decoder_type}"
                 config.share_embedding_table = True
 
diff --git a/modelopt/torch/export/plugins/hf_spec_export.py b/modelopt/torch/export/plugins/hf_spec_export.py
@@ -82,7 +82,12 @@ def rename_and_prune_if_spec_decoding(model: nn.Module, post_state_dict: dict):
 
 def set_config_if_spec_decoding(model: nn.Module, config_data: dict):
     """Return the config of draft model in official format."""
-    if len(model._modelopt_state) != 1 or model._modelopt_state[0][0] != "eagle":
+    opt_modes = getattr(model, "_modelopt_state", None)
+    if (
+        not isinstance(opt_modes, (list, tuple))
+        or len(opt_modes) != 1
+        or opt_modes[0][0] != "eagle"
+    ):
         # return as is
         return config_data
 

Original file line number	Diff line number	Diff line change
`@@ -742,7 +742,7 @@ def output_decode(generated_ids, input_shape):`
`742`	`742`	`)`
`743`	`743`	`parser.add_argument(`
`744`	`744`	`"--verbose",`
`745`		`- help="Print verbose output (e.g. quantization summary). Disable by --no_verbose.",`
	`745`	`+ help="Print verbose output (e.g. quantization summary). Disable by --no-verbose.",`
`746`	`746`	`default=True,`
`747`	`747`	`action=argparse.BooleanOptionalAction,`
`748`	`748`	`)`