
Commit 29fa81c

Update
1 parent 92e6900 commit 29fa81c

8 files changed, +44 -42 lines changed

examples/llm_ptq/scripts/huggingface_example.sh

Lines changed: 11 additions & 12 deletions
@@ -89,7 +89,7 @@ QFORMAT_MODIFIED="${QFORMAT//,/_}"
 
 MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')
 
-MODEL_FULL_NAME=${MODEL_NAME}_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}}_${EXPORT_FORMAT}
+MODEL_FULL_NAME=${MODEL_NAME}_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}}
 
 SAVE_PATH=${ROOT_SAVE_PATH}/saved_models_${MODEL_FULL_NAME}
 
@@ -188,13 +188,13 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH
     cuda_major=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader -i 0 | cut -d. -f1)
 
     if [ "$cuda_major" -lt 10 ]; then
-        echo "Please build the tensorrt_llm engine on Blackwell GPU for deployment. Checkpoint export_path: $SAVE_PATH"
+        echo "Please deploy the NVFP4 checkpoint on a Blackwell GPU. Checkpoint export_path: $SAVE_PATH"
         exit 0
     fi
 fi
 
 if [[ ! " fp8 nvfp4 bf16 fp16 int4_awq w4a8_awq " =~ " ${QFORMAT} " ]]; then
-    echo "Quant $QFORMAT not supported with the TensorRT-LLM torch llmapi. Allowed values are: fp8, nvfp4, bf16, fp16, int4_awq, w4a8_awq"
+    echo "Quant $QFORMAT specified. Please read TensorRT-LLM quantization support matrix https://nvidia.github.io/TensorRT-LLM/features/quantization.html#quantization-in-tensorrt-llm and use TensorRT-LLM for deployment. Checkpoint export_path: $SAVE_PATH"
     exit 0
 fi
 
@@ -333,12 +333,6 @@ if [[ $TASKS =~ "livecodebench" || $TASKS =~ "simple_eval" ]]; then
 fi
 
 if [[ $TASKS =~ "benchmark" ]]; then
-
-    if [ "$PP" -ne 1 ]; then
-        echo "Benchmark does not work with multi PP. Please run the c++ benchmark in the TensorRT-LLM repo..."
-        exit 1
-    fi
-
     BENCHMARK_RESULT=${SAVE_PATH}/benchmark.txt
     echo "Evaluating performance, result saved to $BENCHMARK_RESULT..."
 
@@ -371,12 +365,17 @@ if [[ $TASKS =~ "benchmark" ]]; then
     fi
 
     MODEL_ARGS="--model_path $SAVE_PATH "
-    EXTRA_ARGS="--backend pytorch "
+    if [ -n "$CUDA_VISIBLE_DEVICES" ]; then
+        VISIBLE_GPU_COUNT=$(echo $CUDA_VISIBLE_DEVICES | tr ',' '\n' | grep -v '^$' | wc -l)
+    else
+        VISIBLE_GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+    fi
+    EXTRA_ARGS="--backend pytorch --tp $VISIBLE_GPU_COUNT "
 
     if [ "$BUILD_MAX_BATCH_SIZE" -gt 1 ]; then
-        trtllm-bench --model $MODEL_PATH $MODEL_ARGS throughput $EXTRA_ARGS --dataset $DATASET_TXT | tee -a $BENCHMARK_RESULT
+        trtllm-bench throughput --model $MODEL_PATH $MODEL_ARGS $EXTRA_ARGS --dataset $DATASET_TXT | tee -a $BENCHMARK_RESULT
     else
-        trtllm-bench --model $MODEL_PATH $MODEL_ARGS latency $EXTRA_ARGS --dataset $DATASET_TXT | tee -a $BENCHMARK_RESULT
+        trtllm-bench latency --model $MODEL_PATH $MODEL_ARGS $EXTRA_ARGS --dataset $DATASET_TXT | tee -a $BENCHMARK_RESULT
     fi
 
 fi
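For reference, the device count that now feeds --tp can be mirrored in Python as a quick sanity check. This is an illustrative sketch only (the shipped logic is the bash above); the function name is hypothetical.

    import os
    import subprocess

    def visible_gpu_count() -> int:
        # Hypothetical mirror of the bash fallback above: prefer CUDA_VISIBLE_DEVICES,
        # otherwise count the GPUs reported by nvidia-smi.
        devices = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip()
        if devices:
            return len([d for d in devices.split(",") if d])
        out = subprocess.run(
            ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
            capture_output=True, text=True, check=True,
        )
        return len([line for line in out.stdout.splitlines() if line.strip()])

The resulting count is what EXTRA_ARGS passes to trtllm-bench as --tp, so masking GPUs with CUDA_VISIBLE_DEVICES also shrinks the benchmark's tensor-parallel size.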

examples/llm_ptq/scripts/parser.sh

Lines changed: 0 additions & 1 deletion
@@ -54,7 +54,6 @@ parse_options() {
         --lm_eval_tasks ) LM_EVAL_TASKS="$2"; shift 2;;
         --lm_eval_limit ) LM_EVAL_LIMIT="$2"; shift 2;;
         --simple_eval_tasks ) SIMPLE_EVAL_TASKS="$2"; shift 2;;
-        --num_samples ) NUM_SAMPLES="$2"; shift 2;;
         --trust_remote_code ) TRUST_REMOTE_CODE=true; shift;;
         --use_seq_device_map ) USE_SEQ_DEVICE_MAP=true; shift;;
         --gpu_max_mem_percentage ) GPU_MAX_MEM_PERCENTAGE="$2"; shift 2;;

modelopt/deploy/llm/generate.py

Lines changed: 3 additions & 4 deletions
@@ -30,7 +30,6 @@
     from tensorrt_llm.llmapi import CudaGraphConfig
     from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig
     from tensorrt_llm.llmapi.llm import LLM as TRTLLM
-    from tensorrt_llm.llmapi.tokenizer import TokenizerBase
 except ImportError:
     print("Please upgrade tensorrt-llm to 1.1.0rc2 or later")
     raise
@@ -57,7 +56,7 @@ class LLM(TRTLLM):
     def __init__(
         self,
         checkpoint_dir: str | Path,
-        tokenizer: "str | Path | TokenizerBase | None" = None,
+        tokenizer: "str | Path | None" = None,
         kv_cache_config: dict[str, int | float] = {},
         medusa_choices: Any = None,
         tp: int = 0,
@@ -67,7 +66,7 @@ def __init__(
         """Initializes the LLM runner class.
 
         Args:
-            engine_dir: the directory path of the TensorRT-LLM engine.
+            checkpoint_dir: the directory path of the model checkpoint.
            tokenizer: the tokenizer. For example, a tokenizer from the Huggingface model.
            kv_cache_config: the kv cache config as a dict. Please refer to
                https://nvidia.github.io/TensorRT-LLM/performance/performance-tuning-guide/
@@ -112,7 +111,7 @@ def _find_max_position_embeddings(cfg: dict) -> int | None:
        # Check if any key in config contains both "num" and "experts"
        ep = 1
        enable_attention_dp = False
-        for k in config.keys():
+        for k in config:
            if "num" in k and "experts" in k:
                ep = torch.cuda.device_count()
                enable_attention_dp = True
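A minimal construction sketch against the updated signature, assuming the class is importable from modelopt.deploy.llm.generate (inferred from the file path); the checkpoint directory and tp value are placeholders.

    from modelopt.deploy.llm.generate import LLM  # module path inferred from the file location

    llm = LLM(
        checkpoint_dir="/path/to/saved_models_llama_nvfp4",  # placeholder checkpoint export path
        tokenizer=None,       # now "str | Path | None"; a TokenizerBase instance is no longer part of the annotation
        kv_cache_config={},   # dict[str, int | float]; see the TensorRT-LLM tuning guide linked in the docstring
        tp=2,                 # placeholder tensor-parallel size
    )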

tests/_test_utils/model.py

Lines changed: 5 additions & 0 deletions
@@ -63,6 +63,11 @@ def _select_path(remote_id: str, local_id: str) -> str:
     local_id="llava-1.5-7b-hf",
 )
 
+LLAMA3_2_PATH = _select_path(
+    remote_id="meta-llama/Llama-3.2-1B-Instruct",
+    local_id="Llama-3.2-1B-Instruct",
+)
+
 # Diffusers
 FLUX_SCHNELL_PATH = _select_path(
     remote_id="hf-internal-testing/tiny-flux-pipe",

tests/_test_utils/ptq_utils.py

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@ class PTQCommand:
     pp: int | None = None
     min_sm: int | None = None
     min_gpu: int | None = None
+    batch: int | None = None
 
     def run(self, model_path: str):
         if self.min_sm and torch.cuda.get_device_capability() < (
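To show how the new field is meant to be used (mirroring the benchmark entries added in test_llm_ptq.py below), here is a short sketch; the import path is inferred from tests/_test_utils/ptq_utils.py and the model path is a placeholder.

    from _test_utils.ptq_utils import PTQCommand  # import path inferred from the file location

    # Benchmark task on 2+ GPUs, pinned to batch size 1 via the new `batch` field.
    cmd = PTQCommand(quant="fp8", tasks="benchmark", min_gpu=2, batch=1)
    cmd.run("/path/to/tiny-llama")  # placeholder model path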

tests/examples/llm_eval/test_llm_eval.py

Lines changed: 1 addition & 0 deletions
@@ -30,6 +30,7 @@ def test_llama_eval_fp8(tiny_llama_path):
             lm_eval_tasks="hellaswag,gsm8k",
             simple_eval_tasks="humaneval",
             lm_eval_limit=0.1,
+            batch=8,
         )
     finally:
         # Force kill llm-serve if it's still running

tests/examples/llm_ptq/test_llm_ptq.py

Lines changed: 21 additions & 23 deletions
@@ -92,51 +92,49 @@ def llama_path(tiny_llama_path):
     [
         PTQCommand(quant="fp16"),
         PTQCommand(quant="bf16"),
-        PTQCommand(quant="int8_sq"),
-        # ("int8_sq", "tensorrt_llm", "sparsegpt"),
-        PTQCommand(quant="int4_awq"),
+        PTQCommand(quant="int8_sq", kv_cache_quant="none"),
+        PTQCommand(quant="int4_awq", kv_cache_quant="none"),
         PTQCommand(quant="nvfp4"),
         PTQCommand(quant="nvfp4_awq"),
-        #
         # autoquant
         PTQCommand(
             quant="int4_awq,nvfp4,fp8,w4a8_awq",
             calib_batch_size=4,
             auto_quantize_bits=6.4,
+            kv_cache_quant="none",
         ),
-        #
         # kv_cache
         PTQCommand(quant="nvfp4_awq", kv_cache_quant="nvfp4"),
-        # ("nvfp4_awq", "tensorrt_llm", "nvfp4_affine"),
-        # ("nvfp4_awq", "hf", "nvfp4_affine"),
         #
         # autoquant_kv_cache
         PTQCommand(
-            quant="int4_awq,nvfp4,fp8,w4a8_awq",
+            quant="nvfp4,fp8",
+            kv_cache_quant="fp8",
+            calib_batch_size=4,
+            auto_quantize_bits=6.4,
+        ),
+        PTQCommand(
+            quant="nvfp4,fp8",
             kv_cache_quant="nvfp4",
             calib_batch_size=4,
             auto_quantize_bits=6.4,
         ),
-        # ("int4_awq,nvfp4,fp8,w4a8_awq", "tensorrt_llm", "nvfp4_affine"),
-        # ("int4_awq,nvfp4,fp8,w4a8_awq", "hf", "nvfp4_affine"),
-        #
         # sm89
         PTQCommand(quant="fp8", min_sm=89),
         PTQCommand(quant="fp8", kv_cache_quant="none", min_sm=89),
-        # ("fp8", "tensorrt_llm", "sparsegpt", None),
-        PTQCommand(quant="w4a8_awq", min_sm=89),
+        PTQCommand(quant="w4a8_awq", kv_cache_quant="none", min_sm=89),
+        # sm100
+        PTQCommand(quant="nvfp4", min_sm=100),
         #
         # multi_gpu
-        # TP
-        PTQCommand(quant="fp16", tp=2, pp=1, min_gpu=2),
-        # ("fp16", "build", "sparsegpt", 1),
-        PTQCommand(quant="nvfp4", tp=2, pp=1, min_gpu=2),
-        PTQCommand(quant="fp16", tasks="benchmark", tp=2, pp=1, min_gpu=2),
-        # ("fp16", "benchmark", "sparsegpt", 2, 1),
-        # PP
-        # ("nvfp4", "build", None, 1, 2),
-        # ("fp16", "build", None, 1, 2),
-        # ("fp16", "build", "sparsegpt", 1, 2),
+        PTQCommand(quant="fp16", min_gpu=2),
+        PTQCommand(quant="nvfp4", min_gpu=2),
+        PTQCommand(quant="fp16", tasks="benchmark", min_gpu=2),
+        PTQCommand(quant="fp8", tasks="benchmark", min_gpu=2),
+        PTQCommand(quant="nvfp4", tasks="benchmark", min_gpu=2),
+        PTQCommand(quant="fp16", tasks="benchmark", min_gpu=2, batch=1),
+        PTQCommand(quant="fp8", tasks="benchmark", min_gpu=2, batch=1),
+        PTQCommand(quant="nvfp4", tasks="benchmark", min_gpu=2, batch=1),
     ],
     ids=PTQCommand.param_str,
 )

tests/examples/vlm_ptq/test_llava.py

Lines changed: 2 additions & 2 deletions
@@ -16,11 +16,11 @@
 
 import pytest
 from _test_utils.examples.run_command import run_vlm_ptq_command
-from _test_utils.model import LLAVA_PATH
+from _test_utils.model import LLAMA3_2_PATH
 from _test_utils.torch_misc import minimum_gpu
 
 
 @pytest.mark.parametrize("quant", ["fp16"])
 @minimum_gpu(2)
 def test_llava_multi_gpu(quant):
-    run_vlm_ptq_command(model=LLAVA_PATH, type="llava", quant=quant, tp=2)
+    run_vlm_ptq_command(model=LLAMA3_2_PATH, quant=quant, tp=2)
