diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 6fde2bcb..be9d5e7e 100755
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -9,7 +9,6 @@ Model Optimizer Changelog (Linux)
 - Deprecated ``quantize_mode`` argument in ``examples/onnx_ptq/evaluate.py`` to support strongly typing. Use ``engine_precision`` instead.
 - Deprecated TRT-LLM's TRT backend in ``examples/llm_ptq`` and ``examples/vlm_ptq``. Tasks ``build`` and ``benchmark`` support are removed and replaced with ``quant``. For performance evaluation, please use ``trtllm-bench`` directly.
 - ``--export_fmt`` flag in ``examples/llm_ptq`` is removed. By default we export to the unified Hugging Face checkpoint format.
-- ``int8_sq`` quantization format is deprecated from the ``examples/vlm_ptq`` with respect to the TensorRT-LLM's torch backend switch. Please refer to the previous releases if this quantization format is needed.
 - Deprecated ``examples/vlm_eval`` as it depends on the deprecated TRT-LLM's TRT backend.
 
 **New Features**
diff --git a/examples/vlm_ptq/scripts/huggingface_example.sh b/examples/vlm_ptq/scripts/huggingface_example.sh
index 69e2dce9..ea273306 100755
--- a/examples/vlm_ptq/scripts/huggingface_example.sh
+++ b/examples/vlm_ptq/scripts/huggingface_example.sh
@@ -35,10 +35,10 @@ if [ -z "$MODEL_PATH" ]; then
 fi
 
 case $QFORMAT in
-    fp8|int4_awq|w4a8_awq|nvfp4)
+    fp8|int8_sq|int4_awq|w4a8_awq|nvfp4)
         ;;
     *)
-        echo "Unknown quant argument: Expected one of: [fp8, int4_awq, w4a8_awq, nvfp4]" >&2
+        echo "Unknown quant argument: Expected one of: [fp8, int8_sq, int4_awq, w4a8_awq, nvfp4]" >&2
         exit 1
 esac
 
@@ -95,6 +95,8 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH
         --qformat=$QFORMAT \
         --calib_size=$CALIB_SIZE \
         --batch_size=$CALIB_BATCH_SIZE \
+        --inference_tensor_parallel=$TP \
+        --inference_pipeline_parallel=$PP \
         $PTQ_ARGS
 else
     echo "Quantized model config $MODEL_CONFIG exists, skipping the quantization stage"