Merged
1 change: 1 addition & 0 deletions examples/llm_ptq/hf_ptq.py
@@ -316,6 +316,7 @@ def main(args):
mtq.quantize(child, disabled_quant_cfg, forward_loop=None)

model = model.language_model
model_type = get_model_type(model)
Collaborator
do we still need this model_type?

@yueshen2016 (Contributor Author), Sep 23, 2025:

Only for the int8_sq format, since the model is later exported to a TensorRT-LLM checkpoint. Without this line, the model_type would be unknown, as this nvbug shows.
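A minimal standalone sketch of the behavior, assuming get_model_type keys off the decoder class name (the class names and lookup table below are hypothetical stand-ins, not ModelOpt internals):

# Hypothetical stand-ins illustrating why the refresh matters; the real
# get_model_type lives in ModelOpt and matches on actual HF model classes.
KNOWN_DECODERS = {"LlamaForCausalLM": "llama", "Qwen2ForCausalLM": "qwen"}

def get_model_type_sketch(model) -> str:
    """Return a decoder type name, or 'unknown' for unrecognized containers."""
    return KNOWN_DECODERS.get(type(model).__name__, "unknown")

class LlamaForCausalLM:                 # stands in for the language tower
    pass

class LlavaForConditionalGeneration:    # stands in for the VLM container
    def __init__(self):
        self.language_model = LlamaForCausalLM()

vlm = LlavaForConditionalGeneration()
print(get_model_type_sketch(vlm))                 # "unknown" -> breaks the int8_sq export
print(get_model_type_sketch(vlm.language_model))  # "llama"   -> what the added line provides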


if args.sparsity_fmt != "dense":
Comment on lines 318 to 321

⚠️ Potential issue

Recompute the quantized state (and device) after swapping to the submodel.

Great call to refresh model_type on the language submodel. However, gating later on model_is_already_quantized (computed before the swap) can now be wrong for VLMs whose container is quantized but whose language_model is not. Also refresh device in case the submodule lives on a different device.
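To make the concern concrete, a standalone sketch (is_quantized_sketch and the toy classes are hypothetical stand-ins for modelopt's is_quantized and the real VLM classes):

# Hypothetical stand-ins; modelopt's real is_quantized inspects quantizer modules.
def is_quantized_sketch(module) -> bool:
    return getattr(module, "_quantized", False)

class LanguageModel:
    _quantized = False                  # language tower not yet quantized

class VLMContainer:
    _quantized = True                   # e.g. vision tower already carries quantizers
    def __init__(self):
        self.language_model = LanguageModel()

model = VLMContainer()
model_is_already_quantized = is_quantized_sketch(model)   # True, computed on the container

model = model.language_model                               # swap to the submodel
print(model_is_already_quantized)          # still True -> stale gate for later logic
print(is_quantized_sketch(model))          # False      -> value after the suggested refresh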

Apply this diff:

             model = model.language_model
-            model_type = get_model_type(model)
+            model_type = get_model_type(model)
+            # Keep subsequent logic consistent with the sub‑model we actually operate on.
+            model_is_already_quantized = is_quantized(model)
+            device = getattr(model, "device", device)
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
Original:
model = model.language_model
model_type = get_model_type(model)
if args.sparsity_fmt != "dense":
Suggested:
model = model.language_model
model_type = get_model_type(model)
# Keep subsequent logic consistent with the sub‑model we actually operate on.
model_is_already_quantized = is_quantized(model)
device = getattr(model, "device", device)
if args.sparsity_fmt != "dense":
🤖 Prompt for AI Agents
In examples/llm_ptq/hf_ptq.py around lines 318 to 321, after swapping to the
language submodel (model = model.language_model) and refreshing model_type, also
recompute model_is_already_quantized and device there so they reflect the
submodel state (the container may be quantized while language_model is not, or
the submodule may be on a different device). Update the code to move or
duplicate the logic that sets model_is_already_quantized and device to
immediately follow the swap and model_type refresh, ensuring subsequent gates
(e.g., the args.sparsity_fmt != "dense" branch) use the corrected values.

if args.batch_size == 0:
2 changes: 1 addition & 1 deletion examples/vlm_ptq/scripts/huggingface_example.sh
@@ -73,7 +73,7 @@ if [ -n "$KV_CACHE_QUANT" ]; then
PTQ_ARGS+=" --kv_cache_qformat=$KV_CACHE_QUANT "
fi

if [ "${MODEL_TYPE}" = "vila" ]; then
if [[ "${MODEL_NAME,,}" == *"vila"* ]]; then
# Install required dependency for VILA
pip install -r ../vlm_ptq/requirements-vila.txt
# Clone original VILA repo