Commit be355d1

Update doc

Signed-off-by: Chenjie Luo <[email protected]>

1 parent: 888a89e

File tree

3 files changed (+14 −5 lines)


examples/llm_eval/README.md

Lines changed: 3 additions & 3 deletions

````diff
@@ -93,7 +93,7 @@ If `trust_remote_code` needs to be true, please append the command with the `--t
 ### TensorRT-LLM
 
 ```sh
-python lm_eval_tensorrt_llm.py --model trt-llm --model_args tokenizer=<HF model folder>,engine_dir=<TRT LLM engine dir> --tasks <comma separated tasks> --batch_size <engine batch size>
+python lm_eval_tensorrt_llm.py --model trt-llm --model_args tokenizer=<HF model folder>,engine_dir=<Quantized checkpoint dir> --tasks <comma separated tasks> --batch_size <engine batch size>
 ```
 
 ## MMLU
@@ -140,7 +140,7 @@ python mmlu.py --model_name causal --model_path <HF model folder or model card>
 ### Evaluate the TensorRT-LLM engine
 
 ```bash
-python mmlu.py --model_name causal --model_path <HF model folder or model card> --engine_dir <built TensorRT-LLM folder>
+python mmlu.py --model_name causal --model_path <HF model folder or model card> --engine_dir <Quantized checkpoint dir>
 ```
 
 ## MT-Bench
@@ -163,7 +163,7 @@ bash run_fastchat.sh -h <HF model folder or model card> --quant_cfg MODELOPT_QUA
 ### Evaluate the TensorRT-LLM engine
 
 ```bash
-bash run_fastchat.sh -h <HF model folder or model card> <built TensorRT-LLM folder>
+bash run_fastchat.sh -h <HF model folder or model card> <Quantized checkpoint dir>
 ```
 
 ### Judging the responses
````

examples/llm_ptq/hf_ptq.py

Lines changed: 10 additions & 0 deletions

````diff
@@ -696,6 +696,13 @@ def output_decode(generated_ids, input_shape):
         choices=KV_QUANT_CFG_CHOICES.keys(),
         help="Specify KV cache quantization format, default to fp8 if not provided",
     )
+    parser.add_argument(
+        "--export_fmt",
+        required=False,
+        default="hf",
+        choices=["tensorrt_llm", "hf"],
+        help="Deprecated. Please avoid using this argument.",
+    )
     parser.add_argument(
         "--trust_remote_code",
         help="Set trust_remote_code for Huggingface models and tokenizers",
@@ -749,6 +756,9 @@ def output_decode(generated_ids, input_shape):
 
     args = parser.parse_args()
 
+    if args.export_fmt != "hf":
+        warnings.warn("Deprecated. --export_fmt will be ignored.")
+
     args.dataset = args.dataset.split(",") if args.dataset else None
     args.calib_size = [int(num_sample) for num_sample in args.calib_size.split(",")]
     main(args)
````
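The hf_ptq.py change above soft-deprecates `--export_fmt`: the flag is still accepted so existing command lines do not break, but any non-default value emits a warning and is otherwise ignored. A minimal, self-contained sketch of this argparse deprecation pattern (the `build_parser` and `parse_args` helper names are illustrative, not from the commit):

```python
import argparse
import warnings


def build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="PTQ export (sketch)")
    # Keep the deprecated flag registered so old invocations still parse,
    # with the same default/choices/help text as in the commit.
    parser.add_argument(
        "--export_fmt",
        required=False,
        default="hf",
        choices=["tensorrt_llm", "hf"],
        help="Deprecated. Please avoid using this argument.",
    )
    return parser


def parse_args(argv=None) -> argparse.Namespace:
    args = build_parser().parse_args(argv)
    # Warn only when the caller actually passed a non-default value;
    # the value is then ignored by the rest of the pipeline.
    if args.export_fmt != "hf":
        warnings.warn("Deprecated. --export_fmt will be ignored.")
    return args


if __name__ == "__main__":
    parse_args()
```

Keeping `choices` and the default intact means the deprecation is purely advisory: scripts that pass `--export_fmt tensorrt_llm` keep working, and the warning gives users a release cycle to drop the flag before it is removed outright.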

examples/llm_sparsity/README.md

Lines changed: 1 addition & 2 deletions

````diff
@@ -148,5 +148,4 @@ python export_trtllm_ckpt.py --model_name_or_path meta-llama/Llama-2-7b-hf \
 
 ## Build TensorRT-LLM Engine
 
-For guidance on how to build TensorRT-LLM engines, please refer to [link](../llm_ptq/README.md#TensorRT-LLM-Engine-Build).
-To validate the built TensorRT-LLM engines, please follow the instructions at [link](../llm_ptq/README.md#TensorRT-LLM-Engine-Validation).
+For guidance on how to build TensorRT-LLM engines, please refer to [link](https://nvidia.github.io/TensorRT-LLM/commands/trtllm-build.html#trtllm-build) and use the `--weight_sparsity` flag.
````

0 commit comments
