
Commit f77cae9

Update examples for 0.21.1 release with TRT-LLM 0.15
Parent: 45cb1ab

5 files changed: 30 additions, 9 deletions


chained_optimizations/bert_prune_distill_quantize.py

Lines changed: 0 additions & 4 deletions
@@ -841,10 +841,6 @@ def postprocess_qa_predictions(
         all_start_logits.append(accelerator.gather_for_metrics(start_logits).cpu().numpy())
         all_end_logits.append(accelerator.gather_for_metrics(end_logits).cpu().numpy())

-        # Model Optimizer: clear the intermediate states of the distillation model from the forward passes
-        if args.do_modelopt_distill:
-            model.module.compute_kd_loss()
-
     max_len = max([x.shape[1] for x in all_start_logits])  # Get the max_length of the tensor

     # concatenate the numpy array

diffusers/quantization/onnx_utils/export.py

Lines changed: 1 addition & 1 deletion
@@ -253,7 +253,7 @@ def modelopt_export_sd(backbone, onnx_dir, model_name, precision, model_dtype="B

     dynamic_axes = AXES_NAME[model_name]
     do_constant_folding = True
-    opset_version = 17
+    opset_version = 20

     # Copied from Huggingface's Optimum
     onnx_export(
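For context, the opset bump above is the value that is ultimately forwarded to the ONNX exporter. The following is a minimal, hypothetical sketch of that call pattern, not the repo's onnx_export helper (which is adapted from Hugging Face Optimum); TinyBackbone, the tensor shape, and the axis names are invented for illustration, and exporting with opset 20 assumes a recent PyTorch release that supports it.

# Hypothetical sketch: passing the bumped opset_version through torch.onnx.export.
import torch
import torch.nn as nn

class TinyBackbone(nn.Module):  # stand-in for the real SD backbone
    def forward(self, x):
        return torch.nn.functional.gelu(x)

model = TinyBackbone().eval()
dummy_input = torch.randn(1, 4, 8, 8)

torch.onnx.export(
    model,
    (dummy_input,),
    "tiny_backbone.onnx",
    opset_version=20,            # bumped from 17 in this commit
    do_constant_folding=True,    # mirrors do_constant_folding above
    input_names=["latent"],
    output_names=["out"],
    dynamic_axes={"latent": {0: "batch"}},  # analogous to AXES_NAME[model_name]
)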

docker/Dockerfile

Lines changed: 3 additions & 3 deletions
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.6.1-devel-ubuntu22.04
+FROM nvidia/cuda:12.6.2-devel-ubuntu22.04

 WORKDIR /workspace

@@ -13,7 +13,7 @@ RUN rm -rf /usr/lib/python3/dist-packages/setuptools*
 RUN pip install setuptools -U

 # Install TensorRT-LLM
-ARG TRT_LLM_VERSION=0.14.0
+ARG TRT_LLM_VERSION=0.15.0
 RUN pip install "tensorrt-llm~=$TRT_LLM_VERSION" -U
 RUN git clone --depth 1 --branch "v$TRT_LLM_VERSION" https://github.com/NVIDIA/TensorRT-LLM.git && \
     mkdir tensorrt-llm && \

@@ -26,7 +26,7 @@ ENV LD_LIBRARY_PATH=/usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs:$L
 ENV LD_LIBRARY_PATH=/usr/local/lib/python3.10/dist-packages/nvidia/cudnn/lib:$LD_LIBRARY_PATH

 # Install TensorRT dev environment
-ARG TENSORRT_URL=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.4.0/tars/TensorRT-10.4.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz
+ARG TENSORRT_URL=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/tars/TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz
 RUN wget -q -O tensorrt.tar.gz $TENSORRT_URL && \
     tar -xf tensorrt.tar.gz && \
     cp TensorRT-*/bin/trtexec /usr/local/bin && \
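After building the image, a quick sanity check can confirm that the bumped TensorRT and TensorRT-LLM versions were actually picked up. This is a hypothetical check run inside the container, not part of the Dockerfile, and it assumes both Python packages import cleanly there.

# Hypothetical post-build check (run inside the container); not part of this commit.
import tensorrt
import tensorrt_llm

print("TensorRT:", tensorrt.__version__)          # expected 10.6.x with the new TENSORRT_URL
print("TensorRT-LLM:", tensorrt_llm.__version__)  # expected 0.15.x per TRT_LLM_VERSION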

vlm_ptq/scripts/huggingface_example.sh

Lines changed: 8 additions & 1 deletion
@@ -74,7 +74,7 @@ else
     BUILD_MAX_OUTPUT_LEN=2560
 fi

-BUILD_MAX_BATCH_SIZE=1
+BUILD_MAX_BATCH_SIZE=4

 echo "Using the following config: max input $BUILD_MAX_INPUT_LEN max output $BUILD_MAX_OUTPUT_LEN max batch $BUILD_MAX_BATCH_SIZE"

@@ -207,10 +207,17 @@ python vlm_visual_engine.py \

 echo "Run inference example"

+VLM_RUN_ARGS=""
+
+if [ "$MODEL_TYPE" == "vila" ]; then
+    VLM_RUN_ARGS+=" --image_path https://github.com/NVlabs/VILA/blob/6b941da19e31ddfdfaa60160908ccf0978d96615/demo_images/av.png?raw=true"
+fi
+
 python vlm_run.py \
     --hf_model_dir $MODEL_PATH \
     --visual_engine_dir $VISION_ENCODER_DIR \
     --llm_engine_dir $ENGINE_DIR \
+    $VLM_RUN_ARGS

 if [ "${MODEL_TYPE}" = "llava" ]; then
     echo "For Llava model, current transformers version is 4.42.4, higher version transformers may be needed for other model."

vlm_ptq/vlm_run.py

Lines changed: 18 additions & 0 deletions
@@ -85,6 +85,24 @@ def parse_arguments():
         action="store_true",
         help="Whether or not to use Python runtime session",
     )
+    parser.add_argument(
+        "--enable_chunked_context",
+        action="store_true",
+        help="Enables chunked context (only available with cpp session).",
+    )
+    parser.add_argument(
+        "--kv_cache_free_gpu_memory_fraction",
+        default=0.2,
+        type=float,
+        help="Specify the free gpu memory fraction.",
+    )
+    parser.add_argument(
+        "--multi_block_mode",
+        type=lambda s: s.lower()
+        in ("yes", "true", "t", "1"),  # custom boolean function to convert input string to boolean
+        default=True,
+        help="Distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel.",
+    )

     return parser.parse_args()
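The new --multi_block_mode option uses a small string-to-bool lambda instead of store_true so the flag can be given an explicit true/false value while still defaulting to True when omitted. Below is a standalone demonstration of that pattern; it is not part of this commit, just an argparse snippet showing how the values are interpreted.

# Standalone demo of the string-to-bool argparse pattern used for --multi_block_mode.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--multi_block_mode",
    type=lambda s: s.lower() in ("yes", "true", "t", "1"),
    default=True,
)

print(parser.parse_args([]).multi_block_mode)                               # True (default)
print(parser.parse_args(["--multi_block_mode", "false"]).multi_block_mode)  # False
print(parser.parse_args(["--multi_block_mode", "1"]).multi_block_mode)      # True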