
Commit f77cae9

Update examples for 0.21.1 release with TRT-LLM 0.15
Parent: 45cb1ab

5 files changed: 30 additions, 9 deletions


chained_optimizations/bert_prune_distill_quantize.py

Lines changed: 0 additions & 4 deletions
@@ -841,10 +841,6 @@ def postprocess_qa_predictions(
         all_start_logits.append(accelerator.gather_for_metrics(start_logits).cpu().numpy())
         all_end_logits.append(accelerator.gather_for_metrics(end_logits).cpu().numpy())

-        # Model Optimizer: clear the intermediate states of the distillation model from the forward passes
-        if args.do_modelopt_distill:
-            model.module.compute_kd_loss()
-
     max_len = max([x.shape[1] for x in all_start_logits])  # Get the max_length of the tensor

     # concatenate the numpy array

diffusers/quantization/onnx_utils/export.py

Lines changed: 1 addition & 1 deletion
@@ -253,7 +253,7 @@ def modelopt_export_sd(backbone, onnx_dir, model_name, precision, model_dtype="B

     dynamic_axes = AXES_NAME[model_name]
     do_constant_folding = True
-    opset_version = 17
+    opset_version = 20

     # Copied from Huggingface's Optimum
     onnx_export(
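For context, the opset bump above is the value that is ultimately forwarded to the ONNX exporter. The following is a minimal, hypothetical sketch of that call pattern, not the repo's onnx_export helper (which is adapted from Hugging Face Optimum); TinyBackbone, the tensor shape, and the axis names are invented for illustration, and exporting with opset 20 assumes a recent PyTorch release that supports it.

# Hypothetical sketch: passing the bumped opset_version through torch.onnx.export.
import torch
import torch.nn as nn

class TinyBackbone(nn.Module):  # stand-in for the real SD backbone
    def forward(self, x):
        return torch.nn.functional.gelu(x)

model = TinyBackbone().eval()
dummy_input = torch.randn(1, 4, 8, 8)

torch.onnx.export(
    model,
    (dummy_input,),
    "tiny_backbone.onnx",
    opset_version=20,            # bumped from 17 in this commit
    do_constant_folding=True,    # mirrors do_constant_folding above
    input_names=["latent"],
    output_names=["out"],
    dynamic_axes={"latent": {0: "batch"}},  # analogous to AXES_NAME[model_name]
)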

docker/Dockerfile

Lines changed: 3 additions & 3 deletions
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.6.1-devel-ubuntu22.04
+FROM nvidia/cuda:12.6.2-devel-ubuntu22.04

 WORKDIR /workspace

@@ -13,7 +13,7 @@ RUN rm -rf /usr/lib/python3/dist-packages/setuptools*
 RUN pip install setuptools -U

 # Install TensorRT-LLM
-ARG TRT_LLM_VERSION=0.14.0
+ARG TRT_LLM_VERSION=0.15.0
 RUN pip install "tensorrt-llm~=$TRT_LLM_VERSION" -U
 RUN git clone --depth 1 --branch "v$TRT_LLM_VERSION" https://github.com/NVIDIA/TensorRT-LLM.git && \
     mkdir tensorrt-llm && \

@@ -26,7 +26,7 @@ ENV LD_LIBRARY_PATH=/usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs:$L
 ENV LD_LIBRARY_PATH=/usr/local/lib/python3.10/dist-packages/nvidia/cudnn/lib:$LD_LIBRARY_PATH

 # Install TensorRT dev environment
-ARG TENSORRT_URL=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.4.0/tars/TensorRT-10.4.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz
+ARG TENSORRT_URL=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/tars/TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz
 RUN wget -q -O tensorrt.tar.gz $TENSORRT_URL && \
     tar -xf tensorrt.tar.gz && \
     cp TensorRT-*/bin/trtexec /usr/local/bin && \
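After building the image, a quick sanity check can confirm that the bumped TensorRT and TensorRT-LLM versions were actually picked up. This is a hypothetical check run inside the container, not part of the Dockerfile, and it assumes both Python packages import cleanly there.

# Hypothetical post-build check (run inside the container); not part of this commit.
import tensorrt
import tensorrt_llm

print("TensorRT:", tensorrt.__version__)          # expected 10.6.x with the new TENSORRT_URL
print("TensorRT-LLM:", tensorrt_llm.__version__)  # expected 0.15.x per TRT_LLM_VERSION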

vlm_ptq/scripts/huggingface_example.sh

Lines changed: 8 additions & 1 deletion
@@ -74,7 +74,7 @@ else
     BUILD_MAX_OUTPUT_LEN=2560
 fi

-BUILD_MAX_BATCH_SIZE=1
+BUILD_MAX_BATCH_SIZE=4

 echo "Using the following config: max input $BUILD_MAX_INPUT_LEN max output $BUILD_MAX_OUTPUT_LEN max batch $BUILD_MAX_BATCH_SIZE"

@@ -207,10 +207,17 @@ python vlm_visual_engine.py \

 echo "Run inference example"

+VLM_RUN_ARGS=""
+
+if [ "$MODEL_TYPE" == "vila" ]; then
+    VLM_RUN_ARGS+=" --image_path https://github.com/NVlabs/VILA/blob/6b941da19e31ddfdfaa60160908ccf0978d96615/demo_images/av.png?raw=true"
+fi
+
 python vlm_run.py \
     --hf_model_dir $MODEL_PATH \
     --visual_engine_dir $VISION_ENCODER_DIR \
     --llm_engine_dir $ENGINE_DIR \
+    $VLM_RUN_ARGS

 if [ "${MODEL_TYPE}" = "llava" ]; then
     echo "For Llava model, current transformers version is 4.42.4, higher version transformers may be needed for other model."

vlm_ptq/vlm_run.py

Lines changed: 18 additions & 0 deletions
@@ -85,6 +85,24 @@ def parse_arguments():
         action="store_true",
         help="Whether or not to use Python runtime session",
     )
+    parser.add_argument(
+        "--enable_chunked_context",
+        action="store_true",
+        help="Enables chunked context (only available with cpp session).",
+    )
+    parser.add_argument(
+        "--kv_cache_free_gpu_memory_fraction",
+        default=0.2,
+        type=float,
+        help="Specify the free gpu memory fraction.",
+    )
+    parser.add_argument(
+        "--multi_block_mode",
+        type=lambda s: s.lower()
+        in ("yes", "true", "t", "1"),  # custom boolean function to convert input string to boolean
+        default=True,
+        help="Distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel.",
+    )

     return parser.parse_args()
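The new --multi_block_mode option uses a small string-to-bool lambda instead of store_true so the flag can be given an explicit true/false value while still defaulting to True when omitted. Below is a standalone demonstration of that pattern; it is not part of this commit, just an argparse snippet showing how the values are interpreted.

# Standalone demo of the string-to-bool argparse pattern used for --multi_block_mode.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--multi_block_mode",
    type=lambda s: s.lower() in ("yes", "true", "t", "1"),
    default=True,
)

print(parser.parse_args([]).multi_block_mode)                               # True (default)
print(parser.parse_args(["--multi_block_mode", "false"]).multi_block_mode)  # False
print(parser.parse_args(["--multi_block_mode", "1"]).multi_block_mode)      # True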