diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 2d9469f45..3da4e38fb 100755
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -8,7 +8,7 @@ Model Optimizer Changelog (Linux)
 - Deprecated ModelOpt's custom docker images. Please use the PyTorch, TensorRT-LLM or TensorRT docker image directly or refer to the `installation guide `_ for more details.
 - Deprecated ``quantize_mode`` argument in ``examples/onnx_ptq/evaluate.py`` to support strongly typing. Use ``engine_precision`` instead.
-- Deprecated TRT-LLM's TRT backend in ``examples/llm_ptq`` and ``examples/vlm_ptq``. Tasks ``build`` and ``benchmark`` support are removed and replaced with ``quant``. For performance evaluation, please use ``trtllm-bench`` directly.
+- Deprecated TRT-LLM's TRT backend in ``examples/llm_ptq`` and ``examples/vlm_ptq``. Tasks ``build`` and ``benchmark`` support are removed and replaced with ``quant``. ``engine_dir`` is replaced with ``checkpoint_dir`` in ``examples/llm_ptq`` and ``examples/vlm_ptq``. For performance evaluation, please use ``trtllm-bench`` directly.
 - ``--export_fmt`` flag in ``examples/llm_ptq`` is removed. By default we export to the unified Hugging Face checkpoint format.
 - Deprecated ``examples/vlm_eval`` as it depends on the deprecated TRT-LLM's TRT backend.
diff --git a/examples/llm_eval/README.md b/examples/llm_eval/README.md
index 0e1855d99..bad3ca477 100644
--- a/examples/llm_eval/README.md
+++ b/examples/llm_eval/README.md
@@ -93,7 +93,7 @@ If `trust_remote_code` needs to be true, please append the command with the `--t
 ### TensorRT-LLM
 
 ```sh
-python lm_eval_tensorrt_llm.py --model trt-llm --model_args tokenizer=<HF model folder or model card>,engine_dir=<engine dir> --tasks <comma-separated tasks> --batch_size <batch size>
+python lm_eval_tensorrt_llm.py --model trt-llm --model_args tokenizer=<HF model folder or model card>,checkpoint_dir=<checkpoint dir> --tasks <comma-separated tasks> --batch_size <batch size>
 ```
 
 ## MMLU
@@ -137,10 +137,10 @@ python mmlu.py --model_name causal --model_path <HF model folder or model card>
 python mmlu.py --model_name causal --model_path <HF model folder or model card> --quant_cfg $MODELOPT_QUANT_CFG_TO_SEARCH --auto_quantize_bits $EFFECTIVE_BITS --batch_size 4
 ```
 
-### Evaluate the TensorRT-LLM engine
+### Evaluate with TensorRT-LLM
 
 ```bash
-python mmlu.py --model_name causal --model_path <HF model folder or model card> --engine_dir <engine dir>
+python mmlu.py --model_name causal --model_path <HF model folder or model card> --checkpoint_dir <checkpoint dir>
 ```
 
 ## MT-Bench
@@ -160,7 +160,7 @@ bash run_fastchat.sh -h <HF model folder or model card>
 bash run_fastchat.sh -h <HF model folder or model card> --quant_cfg MODELOPT_QUANT_CFG
 ```
 
-### Evaluate the TensorRT-LLM engine
+### Evaluate with TensorRT-LLM
 
 ```bash
 bash run_fastchat.sh -h <HF model folder or model card>
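For context on the `--model_args` rename above: lm-evaluation-harness splits that comma-separated string into keyword arguments for the registered model class, so `checkpoint_dir=...` on the command line has to match the renamed `TRTLLM.__init__` parameter in `lm_eval_tensorrt_llm.py` below. A minimal sketch of that key=value convention (the parser here is an illustration, not the harness's own code, and the paths are hypothetical):

```python
def parse_model_args(model_args: str) -> dict[str, str]:
    """Split 'tokenizer=...,checkpoint_dir=...' into constructor kwargs."""
    kwargs = {}
    for pair in filter(None, model_args.split(",")):
        key, _, value = pair.partition("=")
        kwargs[key.strip()] = value.strip()
    return kwargs


# Hypothetical paths; these kwargs end up as TRTLLM(tokenizer=..., checkpoint_dir=..., ...)
print(parse_model_args("tokenizer=/models/llama-3-8b,checkpoint_dir=/export/llama-3-8b_fp8"))
```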
diff --git a/examples/llm_eval/gen_model_answer.py b/examples/llm_eval/gen_model_answer.py
index e0b752dcf..afa76e25b 100644
--- a/examples/llm_eval/gen_model_answer.py
+++ b/examples/llm_eval/gen_model_answer.py
@@ -118,7 +118,7 @@ def run_eval(
     max_gpu_memory,
     dtype,
     revision,
-    engine_dir,
+    checkpoint_dir,
     nim_model,
     args,
 ):
@@ -150,7 +150,7 @@ def run_eval(
             revision=revision,
             top_p=top_p,
             temperature=temperature,
-            engine_dir=engine_dir,
+            checkpoint_dir=checkpoint_dir,
             nim_model=nim_model,
         )
         for i in range(0, len(questions), chunk_size)
@@ -174,25 +174,22 @@ def get_model_answers(
     revision,
     top_p=None,
     temperature=None,
-    engine_dir=None,
+    checkpoint_dir=None,
     nim_model=None,
 ):
     # Model Optimizer modification
-    if engine_dir:
-        tokenizer = get_tokenizer(model_path, trust_remote_code=args.trust_remote_code)
-        if engine_dir:
-            # get model type
-            last_part = os.path.basename(engine_dir)
-            model_type = last_part.split("_")[0]
-            # Some models require to set pad_token and eos_token based on external config (e.g., qwen)
-            if model_type == "qwen":
-                tokenizer.pad_token = tokenizer.convert_ids_to_tokens(151643)
-                tokenizer.eos_token = tokenizer.convert_ids_to_tokens(151643)
-
-            assert LLM is not None, "tensorrt_llm APIs could not be imported."
-            model = LLM(engine_dir, tokenizer=tokenizer)
-        else:
-            raise ValueError("engine_dir is required for TensorRT LLM inference.")
+    tokenizer = get_tokenizer(model_path, trust_remote_code=args.trust_remote_code)
+    if checkpoint_dir:
+        # get model type
+        last_part = os.path.basename(checkpoint_dir)
+        model_type = last_part.split("_")[0]
+        # Some models require setting pad_token and eos_token based on an external config (e.g., qwen)
+        if model_type == "qwen":
+            tokenizer.pad_token = tokenizer.convert_ids_to_tokens(151643)
+            tokenizer.eos_token = tokenizer.convert_ids_to_tokens(151643)
+
+        assert LLM is not None, "tensorrt_llm APIs could not be imported."
+        model = LLM(checkpoint_dir, tokenizer=tokenizer)
     elif not nim_model:
         model, _ = load_model(
             model_path,
@@ -205,7 +202,6 @@ def get_model_answers(
             cpu_offloading=False,
             debug=False,
         )
-        tokenizer = get_tokenizer(model_path, trust_remote_code=args.trust_remote_code)
         if args.quant_cfg:
             quantize_model(
                 model,
@@ -259,7 +255,7 @@ def get_model_answers(
 
                 # some models may error out when generating long outputs
                 try:
-                    if not engine_dir:
+                    if not checkpoint_dir:
                         output_ids = model.generate(
                             torch.as_tensor(input_ids).cuda(),
                             do_sample=do_sample,
@@ -427,9 +423,9 @@ def reorg_answer_file(answer_file):
         help="The model revision to load.",
     )
     parser.add_argument(
-        "--engine-dir",
+        "--checkpoint-dir",
         type=str,
-        help="The path to the TensorRT LLM engine directory.",
+        help="The path to the model checkpoint directory.",
     )
     parser.add_argument(
         "--nim-model",
@@ -502,7 +498,7 @@ def reorg_answer_file(answer_file):
         max_gpu_memory=args.max_gpu_memory,
         dtype=str_to_torch_dtype(args.dtype),
         revision=args.revision,
-        engine_dir=args.engine_dir,
+        checkpoint_dir=args.checkpoint_dir,
         nim_model=args.nim_model,
         args=args,
     )
diff --git a/examples/llm_eval/lm_eval_tensorrt_llm.py b/examples/llm_eval/lm_eval_tensorrt_llm.py
index ffd716413..4b23be46f 100644
--- a/examples/llm_eval/lm_eval_tensorrt_llm.py
+++ b/examples/llm_eval/lm_eval_tensorrt_llm.py
@@ -42,7 +42,7 @@ class TRTLLM(TemplateAPI):
     def __init__(
         self,
         tokenizer: str,
-        engine_dir: str,
+        checkpoint_dir: str,
         batch_size: int = 1,
         **kwargs,
     ):
@@ -56,11 +56,11 @@ def __init__(
         if self.tokenizer.pad_token_id is None:
             self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
 
-        assert isinstance(engine_dir, str)
-
-        self.llm = LLM(checkpoint_dir=engine_dir, tokenizer=self.tokenizer)
+        assert isinstance(checkpoint_dir, str)
+
+        self.llm = LLM(checkpoint_dir=checkpoint_dir, tokenizer=self.tokenizer)
         self.max_length = self.llm.max_seq_len - 1
-        logger.info("Loaded TRT-LLM engine")
+        logger.info("Loaded TRT-LLM")
 
     def model_call(
         self,
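Both scripts above now share the same initialization shape: build a Hugging Face tokenizer from the model path, fall back to the EOS token when no pad token is defined, and hand the quantized checkpoint directory to the `LLM` wrapper. A minimal sketch of that path, assuming the caller passes in the same `LLM` class these scripts import (its import path is not part of this diff) and real tokenizer/checkpoint paths:

```python
from transformers import AutoTokenizer


def build_trtllm_from_checkpoint(checkpoint_dir: str, tokenizer_path: str, llm_cls):
    """Construct (tokenizer, llm) the way gen_model_answer.py and
    lm_eval_tensorrt_llm.py do after the engine_dir -> checkpoint_dir rename."""
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    # Same fallback as lm_eval_tensorrt_llm.py: reuse EOS as PAD when none is set.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    # Same call shape as the scripts above; llm_cls stands in for their LLM wrapper.
    llm = llm_cls(checkpoint_dir=checkpoint_dir, tokenizer=tokenizer)
    return tokenizer, llm
```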
diff --git a/examples/llm_eval/mmlu.py b/examples/llm_eval/mmlu.py
index 3e12abfe7..4b0f3b341 100755
--- a/examples/llm_eval/mmlu.py
+++ b/examples/llm_eval/mmlu.py
@@ -252,9 +252,9 @@ def main(
     mto.enable_huggingface_checkpointing()
     model_path = kwargs["model_path"]
     tokenizer = get_tokenizer(model_path, trust_remote_code=kwargs.get("trust_remote_code", False))
-    if kwargs.get("engine_dir"):
+    if kwargs.get("checkpoint_dir"):
         # get model type
-        last_part = os.path.basename(kwargs["engine_dir"])
+        last_part = os.path.basename(kwargs["checkpoint_dir"])
         model_type = last_part.split("_")[0]
         # Some models require to set pad_token and eos_token based on external config (e.g., qwen)
         if model_type == "qwen":
@@ -264,7 +264,9 @@ def main(
         assert LLM is not None, "tensorrt_llm APIs could not be imported."
         medusa_choices = kwargs.get("medusa_choices")
         model = LLM(
-            checkpoint_dir=kwargs["engine_dir"], tokenizer=tokenizer, medusa_choices=medusa_choices
+            checkpoint_dir=kwargs["checkpoint_dir"],
+            tokenizer=tokenizer,
+            medusa_choices=medusa_choices,
         )
     else:
         model = select_model(
diff --git a/examples/llm_eval/run_fastchat.sh b/examples/llm_eval/run_fastchat.sh
index 16aa54bc2..e7d53b80d 100644
--- a/examples/llm_eval/run_fastchat.sh
+++ b/examples/llm_eval/run_fastchat.sh
@@ -20,9 +20,9 @@
 # If you are using NIM, ensure that you export the NIM API key using:
 # export OPENAI_API_KEY=<NIM API key>
 #
-# Usage: bash run_fastchat.sh -h <model_name> -e <engine_dir> -n <nim_model_name>
+# Usage: bash run_fastchat.sh -h <model_name> -e <checkpoint_dir> -n <nim_model_name>
 # model_name: The HuggingFace handle or folder of the model to evaluate.
-# engine_dir: The directory where the TRT-LLM engine is stored.
+# checkpoint_dir: The directory where the quantized checkpoint is stored.
 # nim_model_name: The handle of the NIM model to be used for evaluation.
 #
 # Example commands:
@@ -30,8 +30,8 @@
 # Evaluate "meta-llama/Meta-Llama-3-8B-Instruct" HF model:
 # bash run_fastchat.sh -h meta-llama/Meta-Llama-3-8B-Instruct
 #
-# Evaluate "meta-llama/Meta-Llama-3-8B-Instruct" HF model with TRT-LLM engine:
-# bash run_fastchat.sh -h meta-llama/Meta-Llama-3-8B-Instruct -e /path/to/engine_dir
+# Evaluate "meta-llama/Meta-Llama-3-8B-Instruct" HF model with TRT-LLM:
+# bash run_fastchat.sh -h meta-llama/Meta-Llama-3-8B-Instruct -e /path/to/checkpoint_dir
 #
 # Evaluate "meta-llama/Meta-Llama-3-8B-Instruct" HF model with NIM:
 # bash run_fastchat.sh -h meta-llama/Meta-Llama-3-8B-Instruct -n meta-llama/Meta-Llama-3-8B-Instruct
@@ -41,7 +41,7 @@ set -e
 set -x
 
 hf_model_name=""
-engine_dir=""
+checkpoint_dir=""
 nim_model_name=""
 answer_file=""
 quant_cfg=""
@@ -56,9 +56,9 @@ while [[ "$1" != "" ]]; do
         shift
         hf_model_name=$1
         ;;
-    -e | --engine_dir )
+    -e | --checkpoint_dir )
         shift
-        engine_dir=$1
+        checkpoint_dir=$1
         ;;
     -n | --nim_model_name )
         shift
@@ -96,8 +96,8 @@ if [ "$hf_model_name" == "" ]; then
     exit 1
 fi
 
-if [ "$engine_dir" != "" ]; then
-    engine_dir=" --engine-dir $engine_dir "
+if [ "$checkpoint_dir" != "" ]; then
+    checkpoint_dir=" --checkpoint-dir $checkpoint_dir "
 fi
 
 if [ "$nim_model_name" != "" ]; then
@@ -143,7 +143,7 @@ PYTHONPATH=FastChat:$PYTHONPATH python gen_model_answer.py \
     --model-id $hf_model_name \
     --temperature 0.0001 \
     --top-p 0.0001 \
-    $engine_dir \
+    $checkpoint_dir \
    $nim_model_name \
    $answer_file \
    $quant_args
diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py
index 3ac167db2..7203e78c7 100755
--- a/examples/llm_ptq/example_utils.py
+++ b/examples/llm_ptq/example_utils.py
@@ -36,16 +36,6 @@ def is_speculative(hf_config):
     )
 
 
-def get_mode_type_from_engine_dir(engine_dir_str):
-    # Split the path by '/' and get the last part
-    last_part = os.path.basename(engine_dir_str)
-
-    # Split the last part by '_' and get the first segment
-    model_type = last_part.split("_")[0]
-
-    return model_type
-
-
 def get_tokenizer(ckpt_path, trust_remote_code=False, **kwargs):
     print(f"Initializing tokenizer from {ckpt_path}")
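The helper removed from `example_utils.py` above lives on as inlined logic in `mmlu.py` and `gen_model_answer.py`: the model type is still inferred from the first underscore-separated token of the checkpoint directory name, and Qwen checkpoints still get an explicit pad/eos token before the TRT-LLM wrapper is built. A small self-contained sketch of that heuristic (the directory name is hypothetical):

```python
import os


def model_type_from_checkpoint_dir(checkpoint_dir: str) -> str:
    """Mirror of the inlined logic: 'qwen_fp8' -> 'qwen'."""
    last_part = os.path.basename(checkpoint_dir)
    return last_part.split("_")[0]


if __name__ == "__main__":
    model_type = model_type_from_checkpoint_dir("/workspace/ckpts/qwen_fp8")
    print(model_type)  # -> "qwen"; the eval scripts special-case this value to set
    # pad_token / eos_token to token id 151643 before constructing the LLM wrapper.
```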
-"""An example script to run the tensorrt_llm engine.""" +"""An example script to run the tensorrt_llm inference.""" import argparse @@ -28,7 +28,7 @@ def parse_arguments(): parser = argparse.ArgumentParser() parser.add_argument("--tokenizer", type=str, default="") parser.add_argument("--max_output_len", type=int, default=100) - parser.add_argument("--engine_dir", type=str, default="/tmp/modelopt") + parser.add_argument("--checkpoint_dir", type=str) parser.add_argument( "--input_texts", type=str, @@ -49,8 +49,8 @@ def parse_arguments(): def run(args): if not args.tokenizer: - # Assume the tokenizer files are saved in the engine_dr. - args.tokenizer = args.engine_dir + # Assume the tokenizer files are saved in the checkpoint_dir. + args.tokenizer = args.checkpoint_dir if isinstance(args.tokenizer, PreTrainedTokenizerBase): tokenizer = args.tokenizer @@ -66,7 +66,7 @@ def run(args): print("TensorRT-LLM example outputs:") - llm = LLM(args.engine_dir, tokenizer=tokenizer, max_batch_size=len(input_texts)) + llm = LLM(args.checkpoint_dir, tokenizer=tokenizer, max_batch_size=len(input_texts)) torch.cuda.cudart().cudaProfilerStart() outputs = llm.generate_text(input_texts, args.max_output_len) torch.cuda.cudart().cudaProfilerStop() diff --git a/examples/llm_ptq/scripts/huggingface_example.sh b/examples/llm_ptq/scripts/huggingface_example.sh index 8878b824c..97d14ea03 100755 --- a/examples/llm_ptq/scripts/huggingface_example.sh +++ b/examples/llm_ptq/scripts/huggingface_example.sh @@ -158,7 +158,7 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH echo "Quantized model config $MODEL_CONFIG exists, skipping the quantization stage" fi - # for enc-dec model, users need to refer TRT-LLM example to build engines and deployment + # for enc-dec model, users need to refer TRT-LLM example for deployment if [[ -f "$SAVE_PATH/encoder/config.json" && -f "$SAVE_PATH/decoder/config.json" && ! -f $MODEL_CONFIG ]]; then echo "Please continue to deployment with the TRT-LLM enc_dec example, https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/enc_dec. Checkpoint export_path: $SAVE_PATH" exit 0 @@ -187,7 +187,7 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH RUN_ARGS+=" --trust_remote_code " fi - python run_tensorrt_llm.py --engine_dir=$SAVE_PATH $RUN_ARGS + python run_tensorrt_llm.py --checkpoint_dir=$SAVE_PATH $RUN_ARGS fi if [[ -d "${MODEL_PATH}" ]]; then @@ -229,7 +229,7 @@ if [[ $TASKS =~ "lm_eval" ]]; then python lm_eval_tensorrt_llm.py \ --model trt-llm \ - --model_args tokenizer=$MODEL_PATH,engine_dir=$SAVE_PATH,max_gen_toks=$BUILD_MAX_OUTPUT_LEN \ + --model_args tokenizer=$MODEL_PATH,checkpoint_dir=$SAVE_PATH,max_gen_toks=$BUILD_MAX_OUTPUT_LEN \ --tasks $LM_EVAL_TASKS \ --batch_size $BUILD_MAX_BATCH_SIZE $lm_eval_flags | tee $LM_EVAL_RESULT @@ -259,7 +259,7 @@ if [[ $TASKS =~ "mmlu" ]]; then python mmlu.py \ --model_name causal \ --model_path $MODEL_ABS_PATH \ - --engine_dir $SAVE_PATH \ + --checkpoint_dir $SAVE_PATH \ --data_dir $MMLU_DATA_PATH | tee $MMLU_RESULT popd diff --git a/examples/vlm_ptq/README.md b/examples/vlm_ptq/README.md index cdadb3374..1357ffc8f 100644 --- a/examples/vlm_ptq/README.md +++ b/examples/vlm_ptq/README.md @@ -56,7 +56,7 @@ Please refer to the [llm_ptq/README.md](../llm_ptq/README.md#current-out-of-the- Please refer to the [llm_ptq/README.md](../llm_ptq/README.md) about the details of model quantization. 
diff --git a/examples/vlm_ptq/README.md b/examples/vlm_ptq/README.md
index cdadb3374..1357ffc8f 100644
--- a/examples/vlm_ptq/README.md
+++ b/examples/vlm_ptq/README.md
@@ -56,7 +56,7 @@ Please refer to the [llm_ptq/README.md](../llm_ptq/README.md#current-out-of-the-
 
 Please refer to the [llm_ptq/README.md](../llm_ptq/README.md) about the details of model quantization.
 
-The following scripts provide an all-in-one and step-by-step model quantization example for Llava, VILA, Phi-3-vision and Qwen2.5-VL models. The quantization format and the number of GPUs will be supplied as inputs to these scripts. By default, we build the engine for the fp8 format and 1 GPU.
+The following scripts provide an all-in-one and step-by-step model quantization example for the supported Hugging Face multi-modal models. The quantization format and the number of GPUs are supplied as inputs to these scripts.
 
 ### Hugging Face Example [Script](./scripts/huggingface_example.sh)
diff --git a/examples/vlm_ptq/utils.py b/examples/vlm_ptq/utils.py
deleted file mode 100644
index 496ce236b..000000000
--- a/examples/vlm_ptq/utils.py
+++ /dev/null
@@ -1,138 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-def add_common_args(parser):
-    parser.add_argument("--max_new_tokens", type=int, default=128)
-    parser.add_argument("--batch_size", type=int, default=1)
-    parser.add_argument("--log_level", type=str, default="info")
-    parser.add_argument(
-        "--visual_engine_name", type=str, default="model.engine", help="Name of visual TRT engine"
-    )
-    parser.add_argument(
-        "--engine_dir",
-        type=str,
-        default=None,
-        help="Directory containing visual and LLM TRT engines",
-    )
-    parser.add_argument(
-        "--hf_model_dir", type=str, default=None, help="Directory containing tokenizer"
-    )
-    parser.add_argument(
-        "--input_text", type=str, nargs="+", default=None, help="Text prompt to LLM"
-    )
-    parser.add_argument("--num_beams", type=int, help="Use beam search if num_beams >1", default=1)
-    parser.add_argument("--top_k", type=int, default=1)
-    parser.add_argument("--top_p", type=float, default=0.0)
-    parser.add_argument("--temperature", type=float, default=1.0)
-    parser.add_argument("--repetition_penalty", type=float, default=1.0)
-    parser.add_argument(
-        "--run_profiling", action="store_true", help="Profile runtime over several iterations"
-    )
-    parser.add_argument(
-        "--profiling_iterations", type=int, help="Number of iterations to run profiling", default=20
-    )
-    parser.add_argument(
-        "--check_accuracy", action="store_true", help="Check correctness of text output"
-    )
-    parser.add_argument(
-        "--video_path",
-        type=str,
-        default=None,
-        help=(
-            "Path to your local video file, using 'llava-onevision-accuracy' to check the"
-            "Llava-OneVision model accuracy"
-        ),
-    )
-    parser.add_argument(
-        "--video_num_frames",
-        type=int,
-        help="The number of frames sampled from the video in the Llava-OneVision model.",
-        default=None,
-    )
-    parser.add_argument(
-        "--image_path",
-        type=str,
-        nargs="+",
-        default=None,
-        help="List of input image paths, separated by symbol",
-    )
-    parser.add_argument("--path_sep", type=str, default=",", help="Path separator symbol")
-    parser.add_argument("--prompt_sep", type=str, default=",", help="Prompt separator symbol")
-    parser.add_argument(
-        "--enable_context_fmha_fp32_acc",
-        action="store_true",
-        default=None,
-        help="Enable FMHA runner FP32 accumulation.",
-    )
-    parser.add_argument(
-        "--enable_chunked_context",
-        action="store_true",
-        help="Enables chunked context (only available with cpp session).",
-    )
-    parser.add_argument(
-        "--mm_embedding_offloading",
-        action="store_true",
-        help=(
-            "Enable position table offloading. When not specified, defaults to True if using with "
-            "--enable_chunked_context."
-        ),
-    )
-    parser.add_argument(
-        "--use_py_session",
-        default=False,
-        action="store_true",
-        help="Whether or not to use Python runtime session. By default C++ runtime session is used for the LLM.",
-    )
-    parser.add_argument(
-        "--kv_cache_free_gpu_memory_fraction",
-        default=0.8,
-        type=float,
-        help="Specify the free gpu memory fraction.",
-    )
-    parser.add_argument(
-        "--cross_kv_cache_fraction",
-        default=0.5,
-        type=float,
-        help=(
-            "Specify the kv cache fraction reserved for cross attention. Only applicable for"
-            "encoder-decoder models. By default 0.5 for self and 0.5 for cross."
-        ),
-    )
-    parser.add_argument(
-        "--multi_block_mode",
-        type=lambda s: s.lower()
-        in ("yes", "true", "t", "1"),  # custom boolean function to convert input string to boolean
-        default=True,
-        help="Distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel.",
-    )
-    parser.add_argument(
-        "--session",
-        default="cpp_llm_only",
-        type=str,
-        choices=["python", "cpp_llm_only", "cpp"],
-        help="Runtime used to run the models.\n"
-        "`cpp_llm_only`: vision engine run in python runtime, but LLM in pybind cpp runtime\n"
-        "`python`: everything runs in python runtime\n"
-        "`cpp`: everything runs in C++ runtime",
-    )
-    parser.add_argument(
-        "--lora_task_uids",
-        type=str,
-        default=None,
-        nargs="+",
-        help="The list of LoRA task uids; use -1 to disable the LoRA module",
-    )
-    return parser