diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 1aa17ae9..924861c1 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -50,6 +50,5 @@ modelopt/torch/utils @NVIDIA/modelopt-torch-utils-codeowners /examples/onnx_ptq @NVIDIA/modelopt-onnx-codeowners /examples/pruning @NVIDIA/modelopt-torch-nas-prune-codeowners /examples/speculative_decoding @NVIDIA/modelopt-torch-speculative-codeowners -/examples/vlm_eval @NVIDIA/modelopt-examples-vlm-codeowners /examples/vlm_ptq @NVIDIA/modelopt-examples-vlm-codeowners /examples/windows @NVIDIA/modelopt-windows-codeowners diff --git a/CHANGELOG.rst b/CHANGELOG.rst index fffda483..6fde2bcb 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -5,12 +5,17 @@ Model Optimizer Changelog (Linux) ^^^^^^^^^^^^^^^^^ **Deprecations** -- Deprecated ``quantize_mode`` argument in ``examples/onnx_ptq/evaluate.py`` to support strongly typing. Use ``engine_precision`` instead. -**Bug Fixes** +- Deprecated ``quantize_mode`` argument in ``examples/onnx_ptq/evaluate.py`` to support strong typing. Use ``engine_precision`` instead. +- Deprecated TRT-LLM's TRT backend in ``examples/llm_ptq`` and ``examples/vlm_ptq``. Support for the ``build`` and ``benchmark`` tasks is removed and replaced with ``quant``. For performance evaluation, please use ``trtllm-bench`` directly. +- The ``--export_fmt`` flag in ``examples/llm_ptq`` is removed. By default, models are exported to the unified Hugging Face checkpoint format. +- The ``int8_sq`` quantization format is deprecated in ``examples/vlm_ptq`` following TensorRT-LLM's switch to the torch backend. Please refer to previous releases if this quantization format is needed. +- Deprecated ``examples/vlm_eval`` as it depends on TRT-LLM's deprecated TRT backend. **New Features** + - ``high_precision_dtype`` default to fp16 in ONNX quantization, i.e. quantized output model weights are now FP16 by default. +- Upgrade TensorRT-LLM dependency to 1.1.0rc2. 0.35 (2025-09-04) ^^^^^^^^^^^^^^^^^ @@ -23,7 +28,6 @@ Model Optimizer Changelog (Linux) **Bug Fixes** - Fix attention head ranking logic for pruning Megatron Core GPT models. -- Upgrade TensorRT-LLM dependency to 1.1.0rc2. 
**New Features** diff --git a/docs/source/getting_started/_installation_for_Linux.rst b/docs/source/getting_started/_installation_for_Linux.rst index 24ae8ffe..42892966 100644 --- a/docs/source/getting_started/_installation_for_Linux.rst +++ b/docs/source/getting_started/_installation_for_Linux.rst @@ -18,7 +18,7 @@ Latest Model Optimizer (``nvidia-modelopt``) currently has the following system +-------------------------+-----------------------------+ | PyTorch | >=2.6 | +-------------------------+-----------------------------+ -| TensorRT-LLM (Optional) | 1.0.0rc6 | +| TensorRT-LLM (Optional) | 1.1.0rc2.post2 | +-------------------------+-----------------------------+ | ONNX Runtime (Optional) | 1.22 | +-------------------------+-----------------------------+ diff --git a/examples/gpt-oss/requirements.txt b/examples/gpt-oss/requirements.txt index dead5cb7..4d75b59c 100644 --- a/examples/gpt-oss/requirements.txt +++ b/examples/gpt-oss/requirements.txt @@ -3,7 +3,7 @@ datasets deepspeed kernels>=0.9.0 peft>=0.17.0 -torch >= 2.8.0 +torch>2.7.1 trackio transformers>=4.55.0 trl>=0.21.0 diff --git a/examples/llm_eval/README.md b/examples/llm_eval/README.md index a7349596..0e1855d9 100644 --- a/examples/llm_eval/README.md +++ b/examples/llm_eval/README.md @@ -93,7 +93,7 @@ If `trust_remote_code` needs to be true, please append the command with the `--t ### TensorRT-LLM ```sh -python lm_eval_tensorrt_llm.py --model trt-llm --model_args tokenizer=,engine_dir= --tasks --batch_size +python lm_eval_tensorrt_llm.py --model trt-llm --model_args tokenizer=,engine_dir= --tasks --batch_size ``` ## MMLU @@ -140,7 +140,7 @@ python mmlu.py --model_name causal --model_path ### Evaluate the TensorRT-LLM engine ```bash -python mmlu.py --model_name causal --model_path --engine_dir +python mmlu.py --model_name causal --model_path --engine_dir ``` ## MT-Bench @@ -163,7 +163,7 @@ bash run_fastchat.sh -h --quant_cfg MODELOPT_QUA ### Evaluate the TensorRT-LLM engine ```bash -bash run_fastchat.sh -h +bash run_fastchat.sh -h ``` ### Judging the responses diff --git a/examples/llm_ptq/README.md b/examples/llm_ptq/README.md index 83341261..2c95b831 100755 --- a/examples/llm_ptq/README.md +++ b/examples/llm_ptq/README.md @@ -203,7 +203,7 @@ scripts/huggingface_example.sh --type llama --model $HF_PATH --quant w4a8_awq,fp The above example perform `AutoQuantize` where the less quantization accuracy sensitive layers are quantized with `w4a8_awq` (specified by `--quant w4a8_awq`) and the more sensitive layers are kept un-quantized such that the effective bits is 4.8 (specified by `--auto_quantize_bits 4.8`). -The example scripts above also have an additional flag `--tasks`, where the actual tasks run in the script can be customized. The allowed tasks are `build,mmlu,benchmark,lm_eval,livecodebench` specified in the script [parser](./scripts/parser.sh). The tasks combo can be specified with a comma-separated task list. Some tasks like mmlu can take a long time to run. To run lm_eval tasks, please also specify the `--lm_eval_tasks` flag with comma separated lm_eval tasks [here](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks). +The example scripts above also have an additional flag `--tasks`, where the actual tasks run in the script can be customized. The allowed tasks are `quant,mmlu,lm_eval,livecodebench` specified in the script [parser](./scripts/parser.sh). The tasks combo can be specified with a comma-separated task list. Some tasks like mmlu can take a long time to run. 
To run lm_eval tasks, please also specify the `--lm_eval_tasks` flag with comma separated lm_eval tasks [here](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks). > *If GPU out-of-memory error is reported running the scripts, please try editing the scripts and reducing the max batch size to save GPU memory.* @@ -251,7 +251,7 @@ scripts/huggingface_example.sh --model $HF_PATH --quant [fp8|nvfp4|int8_sq|int4_ > *If a GPU OOM error occurs during model quantization despite sufficient memory, setting the --use_seq_device_map flag can help. This enforces sequential device mapping, distributing the model across GPUs and utilizing up to 80% of each GPU's memory.* -> *You can now add `--low_memory_mode` to the command when setting `--export_fmt=hf` to lower the memory requirements of the PTQ process. With this mode, the script will compress model weights to low precision before calibration. This mode is only supported for FP8 and NVFP4 with max calibration.* +> *You can add `--low_memory_mode` to the command to lower the memory requirements of the PTQ process. With this mode, the script will compress model weights to low precision before calibration. This mode is only supported for FP8 and NVFP4 with max calibration.* #### Deepseek R1 @@ -301,7 +301,7 @@ with torch.inference_mode(): ### Quantize and Export ```bash -python hf_ptq.py --pyt_ckpt_path --qformat fp8 --export_fmt hf --export_path --trust_remote_code +python hf_ptq.py --pyt_ckpt_path --qformat fp8 --export_path --trust_remote_code ``` ### Hugging Face framework [Script](./scripts/huggingface_example.sh) @@ -309,7 +309,7 @@ python hf_ptq.py --pyt_ckpt_path --qformat fp8 --export Alternatively, the framework script `huggingface_example.sh` also supports quantize and export: ```bash -scripts/huggingface_example.sh --model --quant fp8 --export_fmt hf +scripts/huggingface_example.sh --model --quant fp8 ``` ### Deployment diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 81f4b639..119a34f2 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -89,28 +89,20 @@ def auto_quantize( qformat_list = qformat.split(",") assert qformat_list, "No quantization formats provided" # Check if all provided quantization formats are supported - if args.export_fmt == "hf": - assert all( - qformat - in [ - "fp8", - "int4_awq", - "nvfp4", - "nvfp4_awq", - "w4a8_awq", - "fp8_pb_wo", - "w4a8_mxfp4_fp8", - "nvfp4_mlp_only", - ] - for qformat in qformat_list - ), ( - "One or more quantization formats provided are not supported for unified checkpoint export" - ) - else: - assert all( - qformat in ["fp8", "int8_sq", "int4_awq", "w4a8_awq", "nvfp4", "nvfp4_awq"] - for qformat in qformat_list - ), "One or more quantization formats provided are not supported for tensorrt llm export" + assert all( + qformat + in [ + "fp8", + "int4_awq", + "nvfp4", + "nvfp4_awq", + "w4a8_awq", + "fp8_pb_wo", + "w4a8_mxfp4_fp8", + "nvfp4_mlp_only", + ] + for qformat in qformat_list + ), "One or more quantization formats provided are not supported for unified checkpoint export" def loss_func(output, data): # For transformers AutoModelForCausalLM models, the outputs are wrapped in `CausalLMOutputWithPast` @@ -219,27 +211,21 @@ def main(args): "Quantization supports only one quantization format." 
) - # Check arguments for unified_hf export format and set to default if unsupported arguments are provided - if args.export_fmt == "hf": - assert args.sparsity_fmt == "dense", ( - f"Sparsity format {args.sparsity_fmt} not supported by unified export api." - ) - - if not args.auto_quantize_bits: - assert ( - args.qformat - in [ - "int4_awq", - "fp8", - "nvfp4", - "nvfp4_awq", - "w4a8_awq", - "fp8_pb_wo", - "w4a8_mxfp4_fp8", - "nvfp4_mlp_only", - ] - or args.kv_cache_qformat in KV_QUANT_CFG_CHOICES - ), f"Quantization format {args.qformat} not supported for HF export path" + if not args.auto_quantize_bits: + assert ( + args.qformat + in [ + "int4_awq", + "fp8", + "nvfp4", + "nvfp4_awq", + "w4a8_awq", + "fp8_pb_wo", + "w4a8_mxfp4_fp8", + "nvfp4_mlp_only", + ] + or args.kv_cache_qformat in KV_QUANT_CFG_CHOICES + ), f"Quantization format {args.qformat} not supported for HF export path" # If low memory mode is enabled, we compress the model while loading the HF checkpoint. calibration_only = False @@ -253,9 +239,6 @@ def main(args): attn_implementation=args.attn_implementation, ) else: - assert args.export_fmt == "hf", ( - "Low memory mode is only supported for exporting HF checkpoint." - ) assert args.qformat in QUANT_CFG_CHOICES, ( f"Quantization format is not supported for low memory mode. Supported formats: {QUANT_CFG_CHOICES.keys()}" ) @@ -600,34 +583,41 @@ def output_decode(generated_ids, input_shape): setattr(model.config, "architectures", full_model_config.architectures) start_time = time.time() - if args.export_fmt == "tensorrt_llm": + if ( + model_type in ["t5", "bart", "whisper"] + or args.sparsity_fmt != "dense" + or "int8_sq" in args.qformat + ): + warnings.warn( + "Still exporting TensorRT-LLM checkpoints for models not supported by the TensorRT-LLM torch runtime." + ) + # Move meta tensor back to device before exporting. remove_hook_from_module(model, recurse=True) - dtype = None - if "w4a8_awq" in args.qformat: - # TensorRT-LLM w4a8 only support fp16 as the dtype. - dtype = torch.float16 - - # For Gemma2-27B, TRT-LLM only works with bfloat16 as the dtype. - if model_type == "gemma2": - dtype = torch.bfloat16 - export_tensorrt_llm_checkpoint( model, model_type, - dtype=dtype, export_dir=export_path, inference_tensor_parallel=args.inference_tensor_parallel, inference_pipeline_parallel=args.inference_pipeline_parallel, ) - elif args.export_fmt == "hf": + else: + # Check arguments for unified_hf export format and set to default if unsupported arguments are provided + assert args.sparsity_fmt == "dense", ( + f"Sparsity format {args.sparsity_fmt} not supported by unified export api." + ) + + if args.inference_tensor_parallel != 1 or args.inference_pipeline_parallel != 1: + warnings.warn( + "Unified HF export format does not specify inference tensor parallel or pipeline parallel. " + "They will be set at deployment time." + ) + export_hf_checkpoint( full_model, export_dir=export_path, ) - else: - raise NotImplementedError(f"{args.export_fmt} not supported") # Restore default padding and export the tokenizer as well. if tokenizer is not None: @@ -710,9 +700,9 @@ def output_decode(generated_ids, input_shape): parser.add_argument( "--export_fmt", required=False, - default="tensorrt_llm", + default="hf", choices=["tensorrt_llm", "hf"], - help=("Checkpoint export format"), + help="Deprecated. 
Please avoid using this argument.", ) parser.add_argument( "--trust_remote_code", @@ -767,6 +757,9 @@ def output_decode(generated_ids, input_shape): args = parser.parse_args() + if args.export_fmt != "hf": + warnings.warn("Deprecated. --export_fmt forced to hf.") + args.dataset = args.dataset.split(",") if args.dataset else None args.calib_size = [int(num_sample) for num_sample in args.calib_size.split(",")] main(args) diff --git a/examples/llm_ptq/modelopt_to_tensorrt_llm.py b/examples/llm_ptq/modelopt_to_tensorrt_llm.py deleted file mode 100644 index e3e1ea87..00000000 --- a/examples/llm_ptq/modelopt_to_tensorrt_llm.py +++ /dev/null @@ -1,310 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""An example to convert an Model Optimizer exported model to tensorrt_llm.""" - -import argparse -import subprocess -import warnings -from pathlib import Path - -import tensorrt_llm -import torch -from packaging import version -from packaging.version import parse -from tensorrt_llm.llmapi import BuildConfig -from tensorrt_llm.models import PretrainedConfig -from transformers import AutoTokenizer - -try: - # run depends on features from the min-supported TensorRT-LLM - from run_tensorrt_llm import run -except Exception as e: - warnings.warn(f"Cannot run TensorRT-LLM inference: {e}") - run = None - - -MIN_TENSORRT_LLM_VERSION = "0.13.0" - - -def str2bool(v): - if isinstance(v, bool): - return v - if v.lower() in ("yes", "true", "t", "y", "1"): - return True - elif v.lower() in ("no", "false", "f", "n", "0"): - return False - else: - raise argparse.ArgumentTypeError("Boolean value expected.") - - -def build_tensorrt_llm( - pretrained_config: str | Path, - engine_dir: str | Path, - max_input_len: int = 200, - max_output_len: int = 200, - max_batch_size: int = 1, - max_beam_width: int = 1, - max_num_tokens: int | None = None, - num_build_workers: int = 1, - enable_sparsity: bool = False, - max_prompt_embedding_table_size: int = BuildConfig.max_prompt_embedding_table_size, - max_encoder_input_len: int = BuildConfig.max_encoder_input_len, - perf_mode: bool = False, -): - """The API to convert the TensorRT-LLM checkpoint to engines. - - Args: - pretrained_config: The pretrained_config (file path) exported by - ``modelopt.torch.export.export_tensorrt_llm_checkpoint``. - engine_dir: The target output directory to save the built tensorrt_llm engines. - max_input_len: The max input sequence length. - max_output_len: The max output sequence length. - max_batch_size: The max batch size. - max_beam_width: The max beam search width. - max_num_tokens: The max number of tokens that can be processed at the same time. - For the context phase, the max_num_tokens counts the full sequence length. - For the generation phase, the max_num_tokens counts only the ones under generation - as the input sequence has been processed as cached. 
- max_num_tokens should fall between [max_batch_size * max_beam_width, max_batch_size * max_input_len]. - when inflight batching is enabled. - Higher max_num_tokens means more GPU memory will be used for resource allocation. - If not specified the max_num_tokens will be set to the max bound. - Details: https://nvidia.github.io/TensorRT-LLM/performance/performance-tuning-guide/tuning-max-batch-size-and-max-num-tokens.html - num_build_workers: The number of workers to use for the building process. - If build time is a concern, you can increase this worker count to num of GPUs. - At a lost of higher CPU memory usage footprint. - If CPU memory is limited, num_build_workers should be set to 1 to conserve memory. - enable_sparsity: The switch to enable sparsity for TRT compiler. - With this flag, the TRT compiler will search tactics of sparse kernels for each node of which - weight tensors are sparsified. This increases engine building time significantly. - max_prompt_embedding_table_size: Length of the prepended/concatenated embeddings (either multimodal - feature embeddings or prompt tuning embeddings) to the LLM input embeddings. - max_encoder_input_len: Maximum encoder input length for enc-dec models. - perf_mode: Whether build the engine with max perf at a cost of longer build time and less flexibility. - checkpoint_format: The model checkpoint format. Choose between [tensorrt_llm, hf]. - tp: tensor_parallel_size. Effective for hf checkpoint_format only. - """ - engine_dir = Path(engine_dir) - engine_dir.mkdir(parents=True, exist_ok=True) - - pretrained_config_path = Path(pretrained_config) - assert pretrained_config_path.exists() - ckpt_dir = pretrained_config_path.parent - - timing_cache_file = ( - torch.cuda.get_device_name().replace(" ", "_") - + "_trtllm_" - + tensorrt_llm.__version__ - + ".cache" - ) - timing_cache_path = engine_dir / timing_cache_file - - if not max_num_tokens: - # tensorrt-llm recommends max max_num_tokens to be 16384 - max_num_tokens = min(max_batch_size * max_input_len, 16384) - - config = PretrainedConfig.from_json_file(pretrained_config_path) - - log_level = "warning" - - use_paged_context_fmha = config.quantization.quant_algo in [ - "FP8", - "W4A8_AWQ", - "NVFP4", - None, - ] - - # FP8 FMHA for gemma is not supported in tensorrt_llm < 0.19.0 - if "GemmaForCausalLM" in config.architecture and version.parse( - tensorrt_llm.__version__ - ) < version.parse("0.19.0"): - use_paged_context_fmha = False - - use_fused_mlp = "RecurrentGemma" not in config.architecture - if config.quantization.exclude_modules: - for module_name in config.quantization.exclude_modules: - # fp8_context_fhma requires all attention.dense to be quantized - if "attention.dense" in module_name: - use_paged_context_fmha = False - # For AutoQuant, fc and gate might not be quantized at the same time - # TODO: relax this limitation on the TRT-LLM side - if "gate" in module_name or "fc" in module_name: - use_fused_mlp = False - - quant_algo = config.quantization.quant_algo - use_qdq = quant_algo in ["FP8", "W8A8_SQ_PER_CHANNEL"] - - speculative_decoding_mode = "medusa" if "Medusa" in config.architecture else None - - if num_build_workers > torch.cuda.device_count(): - num_build_workers = torch.cuda.device_count() - print(f"Cap num_build_workers to num gpus: ${num_build_workers}") - - build_cmd = "trtllm-build " - build_cmd += f"--checkpoint_dir {ckpt_dir} " - build_cmd += f"--input_timing_cache {timing_cache_path} " - build_cmd += f"--output_timing_cache {timing_cache_path} " - build_cmd += 
f"--log_level {log_level} " - build_cmd += f"--output_dir {engine_dir} " - build_cmd += f"--workers {num_build_workers} " - build_cmd += f"--max_batch_size {max_batch_size} " - build_cmd += f"--max_input_len {max_input_len} " - build_cmd += f"--max_seq_len {max_output_len + max_input_len} " - build_cmd += f"--max_beam_width {max_beam_width} " - build_cmd += f"--max_prompt_embedding_table_size {max_prompt_embedding_table_size} " - build_cmd += f"--max_encoder_input_len {max_encoder_input_len} " - build_cmd += ( - "--reduce_fusion enable " - if config.mapping.pp_size == 1 - and config.architecture - not in [ - "DbrxForCausalLM", - "BaichuanForCausalLM", - "QWenForCausalLM", - "GPTForCausalLM", - ] - else "" - ) - - if use_fused_mlp: - build_cmd += "--use_fused_mlp enable " - else: - build_cmd += "--use_fused_mlp disable " - - if enable_sparsity: - build_cmd += "--weight_sparsity " - - # Low batch size scenario - if max_batch_size <= 4 and quant_algo == "FP8": - build_cmd += "--gemm_plugin fp8 " - if quant_algo == "NVFP4": - build_cmd += "--gemm_plugin nvfp4 " - elif not use_qdq: - build_cmd += "--gemm_plugin auto " - - build_cmd += f"--max_num_tokens {max_num_tokens} " - - if speculative_decoding_mode: - build_cmd += f"--speculative_decoding_mode {speculative_decoding_mode} " - - if use_paged_context_fmha: - build_cmd += "--use_paged_context_fmha enable " - - if perf_mode: - build_cmd += "--multiple_profiles enable" - elif not speculative_decoding_mode: - build_cmd += "--gather_context_logits " # for evaluation benchmarking purpose - - print(f"trtllm-build command:\n{build_cmd}") - - assert parse(tensorrt_llm.__version__) >= parse(MIN_TENSORRT_LLM_VERSION), ( - f"Detected lower version of tensorrt_llm installed instead of {MIN_TENSORRT_LLM_VERSION}. " - f"Please build the tensorrt_llm engines with tensorrt_llm version {MIN_TENSORRT_LLM_VERSION} " - " or higher instead.\n\n Build command: {build_cmd}" - ) - subprocess.run(build_cmd, shell=True, check=True) - - try: - tokenizer = AutoTokenizer.from_pretrained(ckpt_dir) - tokenizer.save_pretrained(engine_dir) - except Exception as e: - warnings.warn(f"Cannot copy tokenizer to the engine dir. {e}") - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument("--model_config", type=str, default="") - parser.add_argument("--max_output_len", type=int, default=512) - parser.add_argument("--max_input_len", type=int, default=2048) - parser.add_argument("--max_batch_size", type=int, default=8) - parser.add_argument("--max_num_beams", type=int, default=1) - parser.add_argument( - "--perf", - action="store_true", - help="Build engines for max perf benchmark", - ) - parser.add_argument("--engine_dir", type=str, default="/tmp/modelopt") - parser.add_argument("--tokenizer", type=str, default="") - parser.add_argument( - "--input_texts", - type=str, - default=( - "Born in north-east France, Soyer trained as a|Born in California, Soyer trained as a" - ), - help="Input texts. 
Please use | to separate different batches.", - ) - parser.add_argument("--num_build_workers", type=int, default="1") - parser.add_argument("--enable_sparsity", type=str2bool, default=False) - parser.add_argument( - "--max_prompt_embedding_table_size", - "--max_multimodal_len", - type=int, - default=BuildConfig.max_prompt_embedding_table_size, - help="Maximum prompt embedding table size for prompt tuning, " - "or maximum multimodal input size for multimodal models.", - ) - parser.add_argument( - "--max_encoder_input_len", - type=int, - default=BuildConfig.max_encoder_input_len, - help="Maximum encoder input length for enc-dec models.", - ) - parser.add_argument( - "--trust_remote_code", - help="Set trust_remote_code for Huggingface models and tokenizers", - default=False, - action="store_true", - ) - parser.add_argument( - "--skip_run", - help="Skip the inference run", - default=False, - action="store_true", - ) - - return parser.parse_args() - - -def main(args): - build_tensorrt_llm( - pretrained_config=args.model_config, - engine_dir=args.engine_dir, - max_input_len=args.max_input_len, - max_output_len=args.max_output_len, - max_batch_size=args.max_batch_size, - max_beam_width=args.max_num_beams, - num_build_workers=args.num_build_workers, - enable_sparsity=args.enable_sparsity, - max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, - max_encoder_input_len=args.max_encoder_input_len, - perf_mode=args.perf, - ) - - if ( - args.model_config is not None - and all(model_name not in args.model_config for model_name in ("vila", "llava")) - and run is not None - ): - # Reduce output_len for the inference run example. - args.max_output_len = 100 - - if not args.skip_run: - run(args) - - -if __name__ == "__main__": - args = parse_arguments() - main(args) diff --git a/examples/llm_ptq/run_tensorrt_llm.py b/examples/llm_ptq/run_tensorrt_llm.py index a414496a..c3152959 100644 --- a/examples/llm_ptq/run_tensorrt_llm.py +++ b/examples/llm_ptq/run_tensorrt_llm.py @@ -80,9 +80,8 @@ def run(args): outputs = llm.generate_tokens(input_texts, args.max_output_len) print(f"Generated tokens: {outputs}") - if llm.gather_context_logits: - logits = llm.generate_context_logits(input_texts) - print(f"Generated logits: {logits}") + logits = llm.generate_context_logits(input_texts) + print(f"Generated logits: {logits}") if __name__ == "__main__": diff --git a/examples/llm_ptq/scripts/huggingface_example.sh b/examples/llm_ptq/scripts/huggingface_example.sh index 8f18c5ac..8878b824 100755 --- a/examples/llm_ptq/scripts/huggingface_example.sh +++ b/examples/llm_ptq/scripts/huggingface_example.sh @@ -34,27 +34,6 @@ if [ -z "$MODEL_PATH" ]; then exit 1 fi -#Check if arguments are supported by HF export path -if [ "$EXPORT_FORMAT" = "hf" ]; then - if [ "$SPARSITY_FMT" != "dense" ]; then - echo "Unsupported sparsity argument: Expected dense" >&2 - exit 1 - fi - - #Iterate over list of qformats provided and check if they are supported in HF export path - IFS="," - for qformat in $QFORMAT; do - case $qformat in - fp16 | bf16 | fp8 | fp8_pc_pt | fp8_pb_wo | int4_awq | nvfp4 | nvfp4_awq | w4a8_awq | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8) ;; - *) - echo "Unsupported quant argument: Expected one of: [fp16, bf16, fp8, fp8_pc_pt, fp8_pb_wo, int4_awq, nvfp4, nvfp4_awq, w4a8_awq, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8]" >&2 - exit 1 - ;; - esac - done - IFS=" " -fi - # Check if ENABLE_SPARSITY environment variable is set to "true" if [ "$SPARSITY_FMT" = "dense" ]; then ENABLE_SPARSITY=false @@ -83,26 +62,6 @@ for qformat in 
$QFORMAT; do done IFS=" " -case $TP in -1 | 2 | 4 | 8) ;; -*) - echo "Unknown tp argument: Expected one of: [1, 2, 4, 8]" >&2 - exit 1 - ;; -esac - -case $PP in -1 | 2 | 4 | 8) ;; -*) - echo "Unknown pp argument: Expected one of: [1, 2, 4, 8]" >&2 - exit 1 - ;; -esac - -GPU_NAME=$(nvidia-smi --id 0 --query-gpu=name --format=csv,noheader,nounits | sed 's/ /_/g') - -echo "Using the following config: max input $BUILD_MAX_INPUT_LEN max output $BUILD_MAX_OUTPUT_LEN max batch $BUILD_MAX_BATCH_SIZE" - script_dir="$(dirname "$(readlink -f "$0")")" pushd $script_dir/.. @@ -113,24 +72,13 @@ fi QFORMAT_MODIFIED="${QFORMAT//,/_}" -MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g') - -MODEL_FULL_NAME=${MODEL_NAME}_${SPARSITY_FMT}_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}}_tp${TP}_pp${PP} +MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}} -if [ $EXPORT_FORMAT != "tensorrt_llm" ]; then - MODEL_FULL_NAME=${MODEL_NAME}_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}}_${EXPORT_FORMAT} -fi - -SAVE_PATH=${ROOT_SAVE_PATH}/saved_models_${MODEL_FULL_NAME} +SAVE_PATH=${ROOT_SAVE_PATH}/saved_models_${MODEL_NAME} MODEL_CONFIG=${SAVE_PATH}/config.json -ENGINE_DIR=${SAVE_PATH}/${TP}x${PP}x${GPU_NAME}_input${BUILD_MAX_INPUT_LEN}_output${BUILD_MAX_OUTPUT_LEN}_batch${BUILD_MAX_BATCH_SIZE}_engine -if [ $EXPORT_FORMAT = "hf" ]; then - ENGINE_DIR=${SAVE_PATH} -fi - -mkdir -p $ENGINE_DIR +mkdir -p $SAVE_PATH if [ "${REMOVE_EXISTING_MODEL_CONFIG,,}" = "true" ]; then rm -f $MODEL_CONFIG @@ -180,14 +128,13 @@ else MODEL_CONFIG_EXIST=false fi -if [[ $TASKS =~ "build" ]] || [[ ! -d "$ENGINE_DIR" ]] || [[ ! $(ls -A $ENGINE_DIR) ]]; then +if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH) ]]; then - if [ "$EXPORT_FORMAT" == "hf" ] && ([ "$qformat" == "bf16" ] || [ "$qformat" == "fp16" ]); then + if [ "$qformat" == "bf16" ] || [ "$qformat" == "fp16" ]; then if [ -d "$MODEL_PATH" ]; then MODEL_CONFIG_EXIST=true MODEL_CONFIG=$MODEL_PATH/config.json - mkdir -p $ENGINE_DIR - for file in $MODEL_PATH/*; do ln -sf "$file" $ENGINE_DIR/; done + for file in $MODEL_PATH/*; do ln -sf "$file" $SAVE_PATH/; done else echo "Please use the model directory where the config.json file is present." exit 1 @@ -205,7 +152,6 @@ if [[ $TASKS =~ "build" ]] || [[ ! -d "$ENGINE_DIR" ]] || [[ ! $(ls -A $ENGINE_D --batch_size=$CALIB_BATCH_SIZE \ --inference_tensor_parallel=$TP \ --inference_pipeline_parallel=$PP \ - --export_fmt=$EXPORT_FORMAT \ $PTQ_ARGS \ $AWQ_ARGS else @@ -218,46 +164,30 @@ if [[ $TASKS =~ "build" ]] || [[ ! -d "$ENGINE_DIR" ]] || [[ ! $(ls -A $ENGINE_D exit 0 fi + if [[ "$SPARSITY_FMT" != "dense" ]]; then + echo "Sparse quantization detected (SPARSITY_FMT=$SPARSITY_FMT). Please deploy with the TRT-LLM using trtllm-build. Checkpoint export_path: $SAVE_PATH" + exit 0 + fi + if [[ "$QFORMAT" == *"nvfp4"* ]] || [[ "$KV_CACHE_QUANT" == *"nvfp4"* ]]; then cuda_major=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader -i 0 | cut -d. -f1) if [ "$cuda_major" -lt 10 ]; then - echo "Please build the tensorrt_llm engine on Blackwell GPU for deployment. Checkpoint export_path: $SAVE_PATH" + echo "Please deploy the NVFP4 checkpoint on a Blackwell GPU. Checkpoint export_path: $SAVE_PATH" exit 0 fi fi - if [ $EXPORT_FORMAT = "tensorrt_llm" ]; then - echo "Building tensorrt_llm engine from Model Optimizer-quantized model..." - - BUILD_ARGS="" - if [[ $TASKS =~ "benchmark" && ! 
$TASKS =~ "lm_eval" ]]; then - BUILD_ARGS+=" --perf " - fi - - python modelopt_to_tensorrt_llm.py \ - --model_config=$MODEL_CONFIG \ - --engine_dir=$ENGINE_DIR \ - --tokenizer=$MODEL_PATH \ - --max_input_len=$BUILD_MAX_INPUT_LEN \ - --max_output_len=$BUILD_MAX_OUTPUT_LEN \ - --max_batch_size=$BUILD_MAX_BATCH_SIZE \ - --num_build_workers=$GPUS \ - --enable_sparsity=$ENABLE_SPARSITY \ - $BUILD_ARGS - else - - if [[ ! " fp8 nvfp4 bf16 fp16 " =~ " ${QFORMAT} " ]]; then - echo "Quant $QFORMAT not supported with the TensorRT-LLM torch llmapi. Allowed values are: fp8, nvfp4, bf16, fp16" - exit 0 - fi - - if $TRUST_REMOTE_CODE; then - RUN_ARGS+=" --trust_remote_code " - fi + if [[ ! " fp8 nvfp4 bf16 fp16 " =~ " ${QFORMAT} " ]]; then + echo "Quant $QFORMAT specified. Please read TensorRT-LLM quantization support matrix https://nvidia.github.io/TensorRT-LLM/features/quantization.html#quantization-in-tensorrt-llm and use TensorRT-LLM for deployment. Checkpoint export_path: $SAVE_PATH" + exit 0 + fi - python run_tensorrt_llm.py --engine_dir=$ENGINE_DIR $RUN_ARGS + if $TRUST_REMOTE_CODE; then + RUN_ARGS+=" --trust_remote_code " fi + + python run_tensorrt_llm.py --engine_dir=$SAVE_PATH $RUN_ARGS fi if [[ -d "${MODEL_PATH}" ]]; then @@ -288,16 +218,18 @@ if [[ $TASKS =~ "lm_eval" ]]; then lm_eval_flags+=" --trust_remote_code " fi - LM_EVAL_RESULT=${ENGINE_DIR}/lm_eval.txt + LM_EVAL_RESULT=${SAVE_PATH}/lm_eval.txt echo "Evaluating lm_eval, result saved to $LM_EVAL_RESULT..." pushd ../llm_eval/ pip install -r requirements.txt + echo "Using the following config: max output $BUILD_MAX_OUTPUT_LEN max batch $BUILD_MAX_BATCH_SIZE" + python lm_eval_tensorrt_llm.py \ --model trt-llm \ - --model_args tokenizer=$MODEL_PATH,engine_dir=$ENGINE_DIR,max_gen_toks=$BUILD_MAX_OUTPUT_LEN \ + --model_args tokenizer=$MODEL_PATH,engine_dir=$SAVE_PATH,max_gen_toks=$BUILD_MAX_OUTPUT_LEN \ --tasks $LM_EVAL_TASKS \ --batch_size $BUILD_MAX_BATCH_SIZE $lm_eval_flags | tee $LM_EVAL_RESULT @@ -307,7 +239,7 @@ fi if [[ $TASKS =~ "mmlu" ]]; then - MMLU_RESULT=${ENGINE_DIR}/mmlu.txt + MMLU_RESULT=${SAVE_PATH}/mmlu.txt echo "Evaluating MMLU, result saved to $MMLU_RESULT..." pushd ../llm_eval/ @@ -327,7 +259,7 @@ if [[ $TASKS =~ "mmlu" ]]; then python mmlu.py \ --model_name causal \ --model_path $MODEL_ABS_PATH \ - --engine_dir $ENGINE_DIR \ + --engine_dir $SAVE_PATH \ --data_dir $MMLU_DATA_PATH | tee $MMLU_RESULT popd @@ -337,10 +269,10 @@ if [[ $TASKS =~ "mtbench" ]]; then pushd ../llm_eval/ - bash run_fastchat.sh -h $MODEL_ABS_PATH -e $ENGINE_DIR - find data/mt_bench/model_answer/ -type f -name '*.jsonl' -exec mv {} $ENGINE_DIR \; + bash run_fastchat.sh -h $MODEL_ABS_PATH -e $SAVE_PATH + find data/mt_bench/model_answer/ -type f -name '*.jsonl' -exec mv {} $SAVE_PATH \; - JSONL_PATH=$(readlink -f $(find $ENGINE_DIR -type f -name '*.jsonl')) + JSONL_PATH=$(readlink -f $(find $SAVE_PATH -type f -name '*.jsonl')) echo "FastChat generation complete. The results are saved under $JSONL_PATH . Please run the judge(https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge) to evaluate the quality of the responses." 
popd @@ -350,13 +282,13 @@ fi if [[ $TASKS =~ "livecodebench" || $TASKS =~ "simple_eval" ]]; then # Clean a previous session if exists pkill -f "trtllm-serve" && while pgrep -f "trtllm-serve" >/dev/null; do sleep 1; done - HASH=$(echo -n "$ENGINE_DIR" | md5sum | awk '{print $1}') + HASH=$(echo -n "$SAVE_PATH" | md5sum | awk '{print $1}') PORT=$((10000 + (0x${HASH:0:4} % 50001))) echo "Starting trtllm-serve on $PORT" - trtllm-serve $ENGINE_DIR --host 0.0.0.0 --port $PORT >$ENGINE_DIR/serve.txt 2>&1 & + trtllm-serve $SAVE_PATH --host 0.0.0.0 --port $PORT >$SAVE_PATH/serve.txt 2>&1 & SERVE_PID=$! - tail -f $ENGINE_DIR/serve.txt | while read line; do + tail -f $SAVE_PATH/serve.txt | while read line; do if echo "$line" | grep -q "Application startup complete"; then echo "Application startup complete." break @@ -370,16 +302,18 @@ if [[ $TASKS =~ "livecodebench" || $TASKS =~ "simple_eval" ]]; then pushd ../llm_eval/ if [[ $TASKS =~ "livecodebench" ]]; then - bash run_livecodebench.sh $MODEL_FULL_NAME $BUILD_MAX_BATCH_SIZE $BUILD_MAX_OUTPUT_LEN $PORT | tee $ENGINE_DIR/livecodebench.txt - mkdir -p $ENGINE_DIR/livecodebench - mv LiveCodeBench/output/$MODEL_FULL_NAME/* $ENGINE_DIR/livecodebench - echo "LiveCodeBench results are saved under $ENGINE_DIR/livecodebench." + echo "Using the following config: max output $BUILD_MAX_OUTPUT_LEN max batch $BUILD_MAX_BATCH_SIZE" + bash run_livecodebench.sh $MODEL_NAME $BUILD_MAX_BATCH_SIZE $BUILD_MAX_OUTPUT_LEN $PORT | tee $SAVE_PATH/livecodebench.txt + mkdir -p $SAVE_PATH/livecodebench + mv LiveCodeBench/output/$MODEL_NAME/* $SAVE_PATH/livecodebench + echo "LiveCodeBench results are saved under $SAVE_PATH/livecodebench." fi if [[ $TASKS =~ "simple_eval" ]]; then - bash run_simple_eval.sh $MODEL_FULL_NAME $SIMPLE_EVAL_TASKS $BUILD_MAX_OUTPUT_LEN $PORT | tee $ENGINE_DIR/simple_eval.txt - echo "Simple eval results are saved under $ENGINE_DIR/simple_eval.txt." + echo "Using the following config: max output $BUILD_MAX_OUTPUT_LEN max batch $BUILD_MAX_BATCH_SIZE" + bash run_simple_eval.sh $MODEL_NAME $SIMPLE_EVAL_TASKS $BUILD_MAX_OUTPUT_LEN $PORT | tee $SAVE_PATH/simple_eval.txt + echo "Simple eval results are saved under $SAVE_PATH/simple_eval.txt." fi popd @@ -387,78 +321,5 @@ if [[ $TASKS =~ "livecodebench" || $TASKS =~ "simple_eval" ]]; then kill $SERVE_PID fi -if [[ $TASKS =~ "benchmark" ]]; then - - if [ -z "$perf" ]; then - echo "!!!Warning: Not building tensorrt llm with optimized perf (e.g. context logits enabled). The benchmark result might be lower than optimal perf." - echo "Please rebuild the engine and not run accuracy evals where the context logits are needed (e.g. lm_eval)." - fi - - if [ "$PP" -ne 1 ]; then - echo "Benchmark does not work with multi PP. Please run the c++ benchmark in the TensorRT-LLM repo..." - exit 1 - fi - - BENCHMARK_RESULT=${ENGINE_DIR}/benchmark.txt - echo "Evaluating performance, result saved to $BENCHMARK_RESULT..." - - # Prepare datasets for TRT-LLM benchmark - if [ -z "$TRT_LLM_CODE_PATH" ]; then - TRT_LLM_CODE_PATH=/workspace/tensorrt_llm - echo "Setting default TRT_LLM_CODE_PATH to $TRT_LLM_CODE_PATH." 
- fi - - # Synthesize the tokenized benchmarking dataset - TRT_LLM_PREPARE_DATASET=$TRT_LLM_CODE_PATH/benchmarks/cpp/prepare_dataset.py - - # Align with the official benchmark - BENCHMARK_INPUT_LEN=$BUILD_MAX_INPUT_LEN - BENCHMARK_OUTPUT_LEN=$BUILD_MAX_OUTPUT_LEN - BENCHMARK_NUM_REQUESTS=256 - - DATASET_TXT=${SAVE_PATH}/synthetic_${BENCHMARK_INPUT_LEN}_${BENCHMARK_OUTPUT_LEN}_${BENCHMARK_NUM_REQUESTS}.txt - - if [ -z "$TRT_LLM_PREPARE_DATASET" ]; then - echo "Unable to prepare dataset for benchmarking. Please set TRT_LLM_CODE_PATH to the TRT-LLM code path." - else - if ! [ -f $DATASET_TXT ]; then - python $TRT_LLM_PREPARE_DATASET --stdout --tokenizer $MODEL_PATH token-norm-dist \ - --input-mean $BENCHMARK_INPUT_LEN --output-mean $BENCHMARK_OUTPUT_LEN --input-stdev 0 --output-stdev 0 \ - --num-requests $BENCHMARK_NUM_REQUESTS >$DATASET_TXT - else - echo "Use existing benchmark dataset in $DATASET_TXT." - fi - fi - - MODEL_ARGS="" - EXTRA_ARGS="" - if [ "$EXPORT_FORMAT" = "hf" ]; then - MODEL_ARGS="--model_path $ENGINE_DIR " - EXTRA_ARGS="--backend pytorch " - if [ "$TP" -ne 1 ]; then - EXTRA_ARGS+="--tp $TP " - fi - if [ "$PP" -ne 1 ]; then - EXTRA_ARGS+="--pp $PP " - fi - else - EXTRA_ARGS="--engine_dir $ENGINE_DIR " - fi - - if [ "$BUILD_MAX_BATCH_SIZE" -gt 1 ]; then - trtllm-bench --model $MODEL_PATH $MODEL_ARGS throughput $EXTRA_ARGS --dataset $DATASET_TXT | tee -a $BENCHMARK_RESULT - else - trtllm-bench --model $MODEL_PATH $MODEL_ARGS latency $EXTRA_ARGS --dataset $DATASET_TXT | tee -a $BENCHMARK_RESULT - fi - -fi - -if [ -n "$FREE_SPACE" ]; then - rm -f $SAVE_PATH/*.json - rm -f $SAVE_PATH/*.safetensors - rm -f $SAVE_PATH/*/*.json - rm -f $SAVE_PATH/*/*.engine - rm -f $SAVE_PATH/*/*.cache -fi popd diff --git a/examples/llm_ptq/scripts/parser.sh b/examples/llm_ptq/scripts/parser.sh index ea6f29a7..cd5b9546 100644 --- a/examples/llm_ptq/scripts/parser.sh +++ b/examples/llm_ptq/scripts/parser.sh @@ -18,21 +18,17 @@ # Define a function to parse command-line options parse_options() { # Default values - MODEL_TYPE="" MODEL_PATH="" QFORMAT="" KV_CACHE_QUANT="" TP=1 - CALIB_TP= PP=1 - GPUS=1 SPARSITY_FMT="dense" - EXPORT_FORMAT="tensorrt_llm" LM_EVAL_TASKS="mmlu,gsm8k" LM_EVAL_LIMIT= SIMPLE_EVAL_TASKS="mmlu" - TASKS="build" + TASKS="quant" TRUST_REMOTE_CODE=false KV_CACHE_FREE_GPU_MEMORY_FRACTION=0.8 @@ -40,32 +36,27 @@ parse_options() { USE_SEQ_DEVICE_MAP=false # Parse command-line options - ARGS=$(getopt -o "" -l "type:,model:,quant:,kv_cache_quant:,tp:,calib_tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,input:,output:,batch:,tasks:,export_fmt:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:" -n "$0" -- "$@") + ARGS=$(getopt -o "" -l "model:,quant:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:" -n "$0" -- "$@") eval set -- "$ARGS" while true; do case "$1" in - --type ) MODEL_TYPE="$2"; shift 2;; --model ) MODEL_PATH="$2"; shift 2;; --quant ) QFORMAT="$2"; shift 2;; --kv_cache_quant ) KV_CACHE_QUANT="$2"; shift 2;; --tp ) TP="$2"; shift 2;; - --calib_tp ) CALIB_TP="$2"; shift 2;; --pp ) PP="$2"; shift 2;; --sparsity ) SPARSITY_FMT="$2"; shift 2;; --awq_block_size ) 
AWQ_BLOCK_SIZE="$2"; shift 2;; --calib ) CALIB_SIZE="$2"; shift 2;; --calib_batch_size ) CALIB_BATCH_SIZE="$2"; shift 2;; --auto_quantize_bits ) AUTO_QUANTIZE_BITS="$2"; shift 2;; - --input ) BUILD_MAX_INPUT_LEN="$2"; shift 2;; --output ) BUILD_MAX_OUTPUT_LEN="$2"; shift 2;; --batch ) BUILD_MAX_BATCH_SIZE="$2"; shift 2;; --tasks ) TASKS="$2"; shift 2;; - --export_fmt ) EXPORT_FORMAT="$2"; shift 2;; --lm_eval_tasks ) LM_EVAL_TASKS="$2"; shift 2;; --lm_eval_limit ) LM_EVAL_LIMIT="$2"; shift 2;; --simple_eval_tasks ) SIMPLE_EVAL_TASKS="$2"; shift 2;; - --num_samples ) NUM_SAMPLES="$2"; shift 2;; --trust_remote_code ) TRUST_REMOTE_CODE=true; shift;; --use_seq_device_map ) USE_SEQ_DEVICE_MAP=true; shift;; --gpu_max_mem_percentage ) GPU_MAX_MEM_PERCENTAGE="$2"; shift 2;; @@ -80,7 +71,6 @@ parse_options() { DEFAULT_CALIB_SIZE=512 DEFAULT_CALIB_BATCH_SIZE=0 - DEFAULT_BUILD_MAX_INPUT_LEN=4096 DEFAULT_BUILD_MAX_OUTPUT_LEN=1024 DEFAULT_BUILD_MAX_BATCH_SIZE=2 @@ -90,9 +80,6 @@ parse_options() { if [ -z "$CALIB_BATCH_SIZE" ]; then CALIB_BATCH_SIZE=$DEFAULT_CALIB_BATCH_SIZE fi - if [ -z "$BUILD_MAX_INPUT_LEN" ]; then - BUILD_MAX_INPUT_LEN=$DEFAULT_BUILD_MAX_INPUT_LEN - fi if [ -z "$BUILD_MAX_OUTPUT_LEN" ]; then BUILD_MAX_OUTPUT_LEN=$DEFAULT_BUILD_MAX_OUTPUT_LEN fi @@ -103,12 +90,11 @@ parse_options() { # Verify required options are provided if [ -z "$MODEL_PATH" ] || [ -z "$QFORMAT" ] || [ -z "$TASKS" ]; then echo "Usage: $0 --model= --quant= --tasks=" - echo "Optional args: --tp= --pp= --sparsity= --awq_block_size= --calib=" - echo "Optional args for NeMo: --type= --calib_tp=" + echo "Optional args: --sparsity= --awq_block_size= --calib=" exit 1 fi - VALID_TASKS=("build" "mmlu" "mtbench" "benchmark" "lm_eval" "gqa" "livecodebench" "simple_eval") + VALID_TASKS=("quant" "mmlu" "mtbench" "lm_eval" "livecodebench" "simple_eval") for task in $(echo "$TASKS" | tr ',' ' '); do is_valid_task=false @@ -126,8 +112,6 @@ parse_options() { fi done - GPUS=$(($TP*$PP)) - # Make sparsity and int4 quantization mutually exclusive as it does not brings speedup if [[ "$SPARSITY_FMT" = "sparsegpt" || "$SPARSITY_FMT" = "sparse_magnitude" ]]; then if [[ "$QFORMAT" == *"awq"* ]]; then @@ -138,13 +122,10 @@ parse_options() { # Now you can use the variables $GPU, $MODEL, and $TASKS in your script echo "=================" - echo "type: $MODEL_TYPE" echo "model: $MODEL_PATH" echo "quant: $QFORMAT" - echo "tp: $TP" - echo "calib_tp: $CALIB_TP" - echo "pp: $PP" - echo "gpus: $GPUS" + echo "tp (TensorRT-LLM Checkpoint only): $TP" + echo "pp (TensorRT-LLM Checkpoint only): $PP" echo "sparsity: $SPARSITY_FMT" echo "awq_block_size: $AWQ_BLOCK_SIZE" echo "calib: $CALIB_SIZE" @@ -154,7 +135,6 @@ parse_options() { echo "output: $BUILD_MAX_OUTPUT_LEN" echo "batch: $BUILD_MAX_BATCH_SIZE" echo "tasks: $TASKS" - echo "export_fmt: $EXPORT_FORMAT" echo "lm_eval_tasks: $LM_EVAL_TASKS" echo "lm_eval_limit: $LM_EVAL_LIMIT" echo "simple_eval_tasks: $SIMPLE_EVAL_TASKS" diff --git a/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb b/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb index bc271325..0c292563 100644 --- a/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb +++ b/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb @@ -691,7 +691,7 @@ "\n", "# run conversion script\n", "cd ..\n", - "bash TensorRT-Model-Optimizer/examples/llm_ptq/scripts/huggingface_example.sh --model $(pwd)/qat/checkpoint-450/ --quant nvfp4 --export_fmt hf" + "bash TensorRT-Model-Optimizer/examples/llm_ptq/scripts/huggingface_example.sh --model 
$(pwd)/qat/checkpoint-450/ --quant nvfp4" ] }, { diff --git a/examples/llm_sparsity/README.md b/examples/llm_sparsity/README.md index 5d1f8f29..e7b8b30e 100644 --- a/examples/llm_sparsity/README.md +++ b/examples/llm_sparsity/README.md @@ -148,5 +148,4 @@ python export_trtllm_ckpt.py --model_name_or_path meta-llama/Llama-2-7b-hf \ ## Build TensorRT-LLM Engine -For guidance on how to build TensorRT-LLM engines, please refer to [link](../llm_ptq/README.md#TensorRT-LLM-Engine-Build). -To validate the built TensorRT-LLM engines, please follow the instructions at [link](../llm_ptq/README.md#TensorRT-LLM-Engine-Validation). +For guidance on how to build TensorRT-LLM engines, please refer to [link](https://nvidia.github.io/TensorRT-LLM/commands/trtllm-build.html#trtllm-build) and use the `--weight_sparsity` flag. diff --git a/examples/vlm_eval/README.md b/examples/vlm_eval/README.md deleted file mode 100644 index e093d79f..00000000 --- a/examples/vlm_eval/README.md +++ /dev/null @@ -1,42 +0,0 @@ -# Evaluation scripts for VLM tasks - -This folder includes popular 3rd-party VLM benchmarks for VLM accuracy evaluation. - -The following instructions show how to evaluate the VLM (including Model Optimizer quantized LLM) with the benchmarks, including the TensorRT-LLM deployment. - -## GQA - -[GQA: a dataset for real-world visual reasoning and compositional question answering](https://arxiv.org/abs/1902.09506). Upon completing the benchmark, the model's accuracy (in percentage format) will be displayed, providing a clear metric for performance evaluation. - -First log in to Hugging Face account with your token. - -```bash -huggingface-cli login -``` - -### Baseline - -```bash -bash gqa.sh --hf_model -``` - -### Quantized (simulated) - -```bash -# MODELOPT_QUANT_CFG: Choose from [INT8_SMOOTHQUANT_CFG|FP8_DEFAULT_CFG|INT4_AWQ_CFG|W4A8_AWQ_BETA_CFG] -bash gqa.sh --hf_model --quant_cfg MODELOPT_QUANT_CFG -``` - -### Evaluate the TensorRT-LLM engine - -TensorRT engine could be built following this [guide](../vlm_ptq/README.md) - -```bash -bash gqa.sh --hf_model --visual_engine --llm_engine -``` - -If you encounter Out of Memory (OOM) issues during evaluation, you can try lowering the `--kv_cache_free_gpu_memory_fraction` argument (default is 0.8) to reduce GPU memory usage for kv_cache: - -```bash -bash gqa.sh --hf_model --visual_engine --llm_engine --kv_cache_free_gpu_memory_fraction 0.5 -``` diff --git a/examples/vlm_eval/convert_gqa_for_eval.py b/examples/vlm_eval/convert_gqa_for_eval.py deleted file mode 100644 index 357f8a62..00000000 --- a/examples/vlm_eval/convert_gqa_for_eval.py +++ /dev/null @@ -1,35 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# Adapted from https://github.com/NVlabs/VILA/blob/ec7fb2c264920bf004fd9fa37f1ec36ea0942db5/scripts/convert_gqa_for_eval.py - - -import argparse -import json - -parser = argparse.ArgumentParser() -parser.add_argument("--src", type=str) -parser.add_argument("--dst", type=str) -args = parser.parse_args() - -all_answers = [] -for line_idx, line in enumerate(open(args.src)): - res = json.loads(line) - question_id = res["question_id"] - text = res["text"].rstrip(".").lower() - all_answers.append({"questionId": question_id, "prediction": text}) - -with open(args.dst, "w") as f: - json.dump(all_answers, f) diff --git a/examples/vlm_eval/gqa.sh b/examples/vlm_eval/gqa.sh deleted file mode 100644 index 5cd977a4..00000000 --- a/examples/vlm_eval/gqa.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -set -e - -# Download dataset -script_dir="$(dirname "$(readlink -f "$0")")" - -EVAL_FILE="$script_dir/eval.py" -if [ ! -f $EVAL_FILE ]; then - echo "$EVAL_FILE does not exist. Downloading this file from https://nlp.stanford.edu/data/gqa/eval.zip." - wget https://nlp.stanford.edu/data/gqa/eval.zip - unzip eval.zip "eval.py" -d . - rm eval.zip - - # Changes to eval.py due to the missing assets in GQA v1.2 release - sed -i '77s/{tier}_all_questions.json/{tier}_questions.json/' "$EVAL_FILE" - sed -i '119,120s/^/# /' "$EVAL_FILE" - sed -i '126,128s/^/# /' "$EVAL_FILE" - sed -i '367,373s/^/# /' "$EVAL_FILE" - sed -i '376,379s/^/# /' "$EVAL_FILE" - sed -i '388s/^/# /' "$EVAL_FILE" -fi - -gqa_data=$script_dir/gqa/data -QUESTION=$gqa_data/testdev_balanced_questions.json -if [ ! -f $QUESTION ]; then - echo "$QUESTION does not exist. Downloading this file from https://downloads.cs.stanford.edu/nlp/data/gqa/questions1.2.zip." 
- wget -P $gqa_data https://downloads.cs.stanford.edu/nlp/data/gqa/questions1.2.zip - unzip $gqa_data/questions1.2.zip "testdev_balanced_questions.json" -d $gqa_data - rm $gqa_data/questions1.2.zip -fi - -# Parse command-line arguments -while [[ $# -gt 0 ]]; do - case "$1" in - --hf_model) - HF_MODEL_DIR="$2" - shift 2 - ;; - --engine_dir) - ENGINE_DIR="$2" - shift 2 - ;; - --batch_size) - BATCH_SIZE="$2" - shift 2 - ;; - --quant_cfg) - QUANT_CFG="$2" - shift 2 - ;; - --kv_cache_free_gpu_memory_fraction) - KV_CACHE_FREE_GPU_MEMORY_FRACTION="$2" - shift 2 - ;; - *) - echo "Unknown option $1" - exit 1 - ;; - esac -done - -# Set default value for kv_cache_free_gpu_memory_fraction if not provided -if [ -z "$KV_CACHE_FREE_GPU_MEMORY_FRACTION" ]; then - KV_CACHE_FREE_GPU_MEMORY_FRACTION=0.8 -fi - -# Verify required arguments are set -if [ -z "$HF_MODEL_DIR" ]; then - echo "Error: Missing required argument --hf_model" - exit 1 -fi - -MODEL_NAME=$(basename $HF_MODEL_DIR | sed 's/[^0-9a-zA-Z\-]/_/g' | tr 'A-Z' 'a-z') - -if [[ "$MODEL_NAME" == *"vila"* ]] && [[ -z "$ENGINE_DIR" ]]; then - # Install required dependency for VILA - pip install -r requirements-vila.txt - # Clone original VILA repo - if [ ! -d "$(dirname "$HF_MODEL_DIR")/VILA" ]; then - echo "VILA repository is needed until it is added to HF model zoo. Cloning the repository parallel to $HF_MODEL_DIR..." - git clone https://github.com/Efficient-Large-Model/VILA.git "$(dirname "$HF_MODEL_DIR")/VILA" && \ - cd "$(dirname "$HF_MODEL_DIR")/VILA" && \ - git checkout ec7fb2c264920bf004fd9fa37f1ec36ea0942db5 && \ - cd - - fi -fi - -# Set batch size defaulted to 20 for VILA and Llava -if [[ -z "$BATCH_SIZE" && ("$MODEL_NAME" == *"vila"* || "$MODEL_NAME" == *"llava"*) ]]; then - BATCH_SIZE=20 -fi - -# Check if TRT engine is provided -if [ -z "$ENGINE_DIR" ]; then - echo "ENGINE_DIR not provided, evaluation will be based on Pytorch." - if [ -z "$QUANT_CFG" ]; then - ANSWER_DIR="$script_dir/gqa/$MODEL_NAME/llava_gqa_testdev_balanced/answers" - ANSWERS_FILE="$ANSWER_DIR/merge.jsonl" - else - ANSWER_DIR="$script_dir/gqa/${MODEL_NAME}_${QUANT_CFG}/llava_gqa_testdev_balanced/answers" - ANSWERS_FILE="$ANSWER_DIR/merge.jsonl" - fi -else - echo "Both --visual_engine or --llm_engine are provided, evaluation will be based on TRT engine." - ANSWER_DIR="$script_dir/gqa/$(basename $ENGINE_DIR)/llava_gqa_testdev_balanced/answers" - ANSWERS_FILE="$ANSWER_DIR/merge.jsonl" -fi - -# Run the Python script with the parsed arguments -if [ ! -f $ANSWERS_FILE ]; then - python model_gqa_loader.py \ - --answers_file "$ANSWERS_FILE" \ - --hf_model_dir "$HF_MODEL_DIR" \ - ${ENGINE_DIR:+--engine_dir "$ENGINE_DIR"} \ - ${BATCH_SIZE:+--batch_size "$BATCH_SIZE"} \ - ${QUANT_CFG:+--quant_cfg "$QUANT_CFG"} \ - --kv_cache_free_gpu_memory_fraction "$KV_CACHE_FREE_GPU_MEMORY_FRACTION" -fi - -# Convert answer to prediction for evaluation -PREDICTION_FILE="$ANSWER_DIR/testdev_balanced_predictions.json" -if [ ! -f $PREDICTION_FILE ]; then - python convert_gqa_for_eval.py \ - --src $ANSWERS_FILE \ - --dst $PREDICTION_FILE -fi - -# Get evaluation result -python eval.py \ - --tier "$gqa_data/testdev_balanced" \ - --predictions $PREDICTION_FILE diff --git a/examples/vlm_eval/model_gqa_loader.py b/examples/vlm_eval/model_gqa_loader.py deleted file mode 100644 index 706d0a02..00000000 --- a/examples/vlm_eval/model_gqa_loader.py +++ /dev/null @@ -1,325 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import copy -import os -import sys -import time -from pathlib import Path - -from datasets import load_dataset -from tensorrt_llm import logger -from tensorrt_llm.runtime import MultimodalModelRunner -from tqdm import tqdm -from transformers import ( - AutoModelForCausalLM, - AutoProcessor, - GenerationConfig, - LlavaForConditionalGeneration, - MllamaForConditionalGeneration, -) - -import modelopt.torch.quantization as mtq -from modelopt.torch.utils.dataset_utils import ( - create_forward_loop, - get_dataset_dataloader, - get_max_batch_size, -) -from modelopt.torch.utils.image_processor import MllamaImageProcessor -from modelopt.torch.utils.vlm_dataset_utils import get_vlm_dataset_dataloader - -sys.path.append(str(Path(__file__).resolve().parent / "../llm_ptq")) -sys.path.append(str(Path(__file__).resolve().parent / "../vlm_ptq")) -from example_utils import get_processor, get_tokenizer -from utils import add_common_args -from vlm_eval_utils import save_jsonl - - -def quantize_model(model, args, tokenizer, processor=None): - sample_memory_usage_ratio = ( - 2 if "AWQ" in args.quant_cfg or "SMOOTHQUANT" in args.quant_cfg else 1.1 - ) - batch_size = get_max_batch_size(model, sample_memory_usage_ratio=sample_memory_usage_ratio) - calib_size = args.calib_size - batch_size = min(batch_size, calib_size) - - # Handle Mllama models with VLM dataset - if processor is not None and isinstance(processor, MllamaImageProcessor): - calib_dataloader = get_vlm_dataset_dataloader( - dataset_name="scienceqa", # Default dataset for Mllama - processor=processor, - batch_size=batch_size, - num_samples=calib_size, - ) - else: - calib_dataloader = get_dataset_dataloader( - dataset_name="cnn_dailymail", - tokenizer=tokenizer, - batch_size=batch_size, - num_samples=calib_size, - device=model.device, - ) - calibrate_loop = create_forward_loop(dataloader=calib_dataloader) - - quant_cfg = getattr(mtq, args.quant_cfg) - if "AWQ" in args.quant_cfg: - quant_cfg = copy.deepcopy(getattr(mtq, args.quant_cfg)) - weight_quantizer = quant_cfg["quant_cfg"]["*weight_quantizer"] - if isinstance(weight_quantizer, list): - weight_quantizer = weight_quantizer[0] - enable_quant_kv_cache = args.quant_cfg not in ["INT8_SMOOTHQUANT_CFG"] - print(f"{'Enable' if enable_quant_kv_cache else 'Disable'} KV cache quantization") - quant_cfg["quant_cfg"]["*output_quantizer"] = { - "num_bits": 8 if args.quant_cfg == "INT8_SMOOTHQUANT_CFG" else (4, 3), - "axis": None, - "enable": enable_quant_kv_cache, - } - - print("Starting quantization...") - start_time = time.time() - model = mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop) - end_time = time.time() - print(f"Quantization done. 
Total time used: {end_time - start_time}s") - return model - - -def main(): - parser = argparse.ArgumentParser() - parser = add_common_args(parser) - parser.add_argument("--answers_file", type=str, required=True) - parser.add_argument( - "--quant_cfg", - type=str, - default=None, - help="Specify the modelopt quantization configuration for simulated evaluation", - choices=[ - "INT8_SMOOTHQUANT_CFG", - "FP8_DEFAULT_CFG", - "INT4_AWQ_CFG", - "W4A8_AWQ_BETA_CFG", - ], - ) - parser.add_argument( - "--calib_size", type=int, default=512, help="Number of samples for calibration." - ) - parser.add_argument( - "--trust_remote_code", - help="Set trust_remote_code for Huggingface models and tokenizers", - default=False, - action="store_true", - ) - args = parser.parse_args() - - # Load data - instances = load_dataset( - "lmms-lab/GQA", "testdev_balanced_instructions", split="testdev", token=True - ) - images = load_dataset("lmms-lab/GQA", "testdev_balanced_images", split="testdev", token=True) - id2image = {} - for row in images: - id2image[row["id"]] = row["image"].convert("RGB") - - # Load model - if args.engine_dir is not None: - os.environ["TOKENIZERS_PARALLELISM"] = "false" - logger.set_level(args.log_level) - # Load TensorRT engine - model = MultimodalModelRunner(args) - # Run batch inference - outputs = [] - batch_size = args.batch_size - if model.model_type in ["phi-3-vision"]: - # Phi-3-vision doesn't support batch inference for now - batch_size = 1 - for index in tqdm(range(0, len(instances), batch_size)): - batch = instances[index : index + batch_size] - raw_images = [id2image[imageId] for imageId in batch["imageId"]] - questions = batch["question"] - questions = [ - q + "\nAnswer the question using a single word or phrase." for q in questions - ] - if model.model_type in ["llava"]: - input_text = ["\n" + question for question in questions] - elif model.model_type in ["vila"]: - input_text = ["\n" + question for question in questions] - elif model.model_type in ["phi-3-vision"]: - input_text = questions[0] - elif model.model_type in ["mllama"]: - input_text = ["<|image|><|begin_of_text|>" + question for question in questions] - _, output_text = model.run(input_text, raw_images, None, args.max_new_tokens) - outputs.extend( - [ - { - "question_id": id, - "prompt": batch["question"][index], - "text": output_text[index][0], - } - for index, id in enumerate(batch["id"]) - ] - ) - - else: - # Load HF model - if "vila" in args.hf_model_dir.lower(): - sys.path.append(os.path.join(args.hf_model_dir, "..", "VILA")) - import llava - - model = llava.load(args.hf_model_dir) - from llava import conversation as conversation_lib - - if "8b" in args.hf_model_dir.lower(): - conv_mode = "llama_3" - elif "40b" in args.hf_model_dir.lower(): - conv_mode = "hermes-2" - else: - conv_mode = "vicuna_v1" - - conversation_lib.default_conversation = conversation_lib.conv_templates[ - conv_mode - ].copy() - - generation_config = GenerationConfig.from_pretrained(args.hf_model_dir + "/llm") - generation_config.update(max_new_tokens=args.max_new_tokens) - elif "llama" in args.hf_model_dir.lower(): - model = MllamaForConditionalGeneration.from_pretrained( - args.hf_model_dir, - device_map="auto", - trust_remote_code=args.trust_remote_code, - torch_dtype="auto", - ) - # processor = AutoProcessor.from_pretrained(args.hf_model_dir) - processor = get_processor( - args.hf_model_dir, "mllama", model.device, trust_remote_code=args.trust_remote_code - ) - - else: - processor = AutoProcessor.from_pretrained( - args.hf_model_dir, 
trust_remote_code=args.trust_remote_code - ) - if "llava" in args.hf_model_dir.lower(): - model = LlavaForConditionalGeneration.from_pretrained( - args.hf_model_dir, device_map="auto", torch_dtype="auto" - ) - # To be deprecated for new version transformers - processor.patch_size = model.config.vision_config.patch_size - processor.vision_feature_select_strategy = ( - model.config.vision_feature_select_strategy - ) - elif "phi" in args.hf_model_dir.lower(): - model = AutoModelForCausalLM.from_pretrained( - args.hf_model_dir, - device_map="auto", - trust_remote_code=args.trust_remote_code, - torch_dtype="auto", - _attn_implementation="flash_attention_2", - ) - else: - raise ValueError(f"Unsupported model: {args.hf_model_dir}") - # Evaluation for simulated quantization - if args.quant_cfg: - tokenizer = get_tokenizer(args.hf_model_dir, trust_remote_code=args.trust_remote_code) - if "vila" in args.hf_model_dir.lower(): - model.llm = quantize_model(model.llm, args, tokenizer) - elif "llava" in args.hf_model_dir.lower(): - model.language_model = quantize_model(model.language_model, args, tokenizer) - elif "phi" in args.hf_model_dir.lower(): - model = quantize_model(model, args, tokenizer) - elif "llama" in args.hf_model_dir.lower(): - model = quantize_model(model, args, tokenizer, processor) - else: - raise ValueError(f"Unsupported model: {args.hf_model_dir}") - if "llama" in args.hf_model_dir.lower(): - processor = processor.tokenizer - - outputs = [] - for instance in tqdm(instances): - image = id2image[instance["imageId"]] - question = instance["question"] - if "vila" in args.hf_model_dir.lower(): - response = model.generate_content( - [image, question], generation_config=generation_config - ) - else: - if "llava" in args.hf_model_dir.lower(): - conversation = [ - { - "role": "user", - "content": [ - {"type": "text", "text": question}, - {"type": "image"}, - ], - }, - ] - prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) - inputs = processor(images=image, text=prompt, return_tensors="pt").to( - "cuda:0", model.dtype - ) - response = model.generate( - **inputs, max_new_tokens=args.max_new_tokens, do_sample=False - ) - elif "phi" in args.hf_model_dir.lower(): - conversation = [ - {"role": "user", "content": f"<|image_1|>\n{question}"}, - ] - prompt = processor.tokenizer.apply_chat_template( - conversation, tokenize=False, add_generation_prompt=True - ) - inputs = processor(images=image, text=prompt, return_tensors="pt").to( - "cuda:0", model.dtype - ) - response = model.generate( - **inputs, - eos_token_id=processor.tokenizer.eos_token_id, - max_new_tokens=args.max_new_tokens, - do_sample=False, - ) - elif "llama" in args.hf_model_dir.lower(): - conversation = [ - { - "role": "user", - "content": [ - {"type": "image"}, - { - "type": "text", - "text": question - + "\nAnswer the question using a single word or phrase.", - }, - ], - } - ] - prompt = processor.apply_chat_template( - conversation, tokenize=False, add_generation_prompt=True - ) - - inputs = processor(image, prompt, return_tensors="pt").to("cuda:0", model.dtype) - response = model.generate( - **inputs, - eos_token_id=processor.tokenizer.eos_token_id, - max_new_tokens=args.max_new_tokens, - do_sample=False, - ) - else: - raise ValueError(f"Unsupported model: {args.hf_model_dir}") - response = processor.decode( - response[0][inputs["input_ids"].shape[-1] :], skip_special_tokens=True - ) - - outputs.append({"question_id": instance["id"], "prompt": question, "text": response}) - save_jsonl(args.answers_file, 
outputs) - - -if __name__ == "__main__": - main() diff --git a/examples/vlm_eval/requirements-vila.txt b/examples/vlm_eval/requirements-vila.txt deleted file mode 100644 index d88b1186..00000000 --- a/examples/vlm_eval/requirements-vila.txt +++ /dev/null @@ -1,2 +0,0 @@ -deepspeed>=0.16.0 -git+https://github.com/bfshi/scaling_on_scales.git diff --git a/examples/vlm_eval/vlm_eval_utils.py b/examples/vlm_eval/vlm_eval_utils.py deleted file mode 100644 index fdc848ae..00000000 --- a/examples/vlm_eval/vlm_eval_utils.py +++ /dev/null @@ -1,44 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import json -import os -from collections.abc import Iterator -from contextlib import contextmanager -from io import TextIOWrapper -from typing import IO, Any, TextIO - - -@contextmanager -def file_descriptor(f: str | IO, mode: str = "r") -> Iterator[IO]: - opened = False - try: - if isinstance(f, str): - f = open(f, mode) - opened = True - yield f - finally: - if opened: - assert isinstance(f, TextIOWrapper), type(f) - f.close() - - -def save_jsonl(f: str | TextIO, obj: Any, **kwargs) -> None: - assert isinstance(f, str), type(f) - os.makedirs(os.path.dirname(f), exist_ok=True) - - with file_descriptor(f, mode="w") as fd: - fd.write("\n".join(json.dumps(datum, **kwargs) for datum in obj)) diff --git a/examples/vlm_ptq/README.md b/examples/vlm_ptq/README.md index 1f51b950..e2d79cec 100644 --- a/examples/vlm_ptq/README.md +++ b/examples/vlm_ptq/README.md @@ -81,7 +81,7 @@ For [Qwen2.5-VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct): ```bash git clone https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct -scripts/huggingface_example.sh --type qwen --model Qwen2.5-VL-7B-Instruct --export_fmt hf --quant [fp8|nvfp4|int8_sq|int4_awq|w4a8_awq] +scripts/huggingface_example.sh --type qwen --model Qwen2.5-VL-7B-Instruct --quant [fp8|nvfp4|int8_sq|int4_awq|w4a8_awq] ``` The example scripts above also have an additional flag `--tasks gqa`, which will trigger evaluation of the built TensorRT engine using GQA benchmark. Details of the evaluation is explained in this [tutorial](../vlm_eval/README.md). 
diff --git a/examples/vlm_ptq/scripts/huggingface_example.sh b/examples/vlm_ptq/scripts/huggingface_example.sh index 9bab141d..69e2dce9 100755 --- a/examples/vlm_ptq/scripts/huggingface_example.sh +++ b/examples/vlm_ptq/scripts/huggingface_example.sh @@ -29,77 +29,19 @@ for i in $(env | grep ^SLURM_ | cut -d"=" -f 1); do unset -v $i; done for i in $(env | grep ^PMI_ | cut -d"=" -f 1); do unset -v $i; done for i in $(env | grep ^PMIX_ | cut -d"=" -f 1); do unset -v $i; done -case $MODEL_TYPE in - llava|phi|vila|mllama|qwen) - ;; - *) - echo "Unsupported type argument: Expected one of: [llava, phi, vila, mllama, qwen]" >&2 - exit 1 -esac - if [ -z "$MODEL_PATH" ]; then echo "Unsupported model argument: Expected a huggingface model path or model name or a nemo path" >&2 exit 1 fi -# Check if ENABLE_SPARSITY environment variable is set to "true" -if [ "$SPARSITY_FMT" = "dense" ]; then - ENABLE_SPARSITY=false -else - ENABLE_SPARSITY=true -fi - -case $SPARSITY_FMT in - dense|sparsegpt) - ;; - *) - echo "Unknown sparsity argument: Expected one of: [dense, sparsegpt]" >&2 - exit 1 -esac - case $QFORMAT in - fp8|nvfp4|int8_sq|int4_awq|w4a8_awq|fp16|bf16) - ;; - *) - echo "Unknown quant argument: Expected one of: [fp8, nvfp4, int8_sq, int4_awq, w4a8_awq, fp16, bf16]" >&2 - exit 1 -esac - -case $TP in - 1|2|4|8) - ;; - *) - echo "Unknown tp argument: Expected one of: [1, 2, 4, 8]" >&2 - exit 1 -esac - -case $PP in - 1|2|4|8) + fp8|int4_awq|w4a8_awq|nvfp4) ;; *) - echo "Unknown pp argument: Expected one of: [1, 2, 4, 8]" >&2 + echo "Unknown quant argument: Expected one of: [fp8, int4_awq, w4a8_awq, nvfp4]" >&2 exit 1 esac -GPU_NAME=$(nvidia-smi --id 0 --query-gpu=name --format=csv,noheader,nounits | sed 's/ /_/g') - -if [ "${MODEL_TYPE}" = "phi" ]; then - BUILD_MAX_INPUT_LEN=4096 -else - BUILD_MAX_INPUT_LEN=1024 -fi - -BUILD_MAX_OUTPUT_LEN=512 - -if [ "$MODEL_TYPE" = "llava" ] || [ "$MODEL_TYPE" = "vila" ] || [ "$MODEL_TYPE" = "qwen" ]; then - BUILD_MAX_BATCH_SIZE=20 -else - BUILD_MAX_BATCH_SIZE=4 -fi - - -echo "Using the following config: max input $BUILD_MAX_INPUT_LEN max output $BUILD_MAX_OUTPUT_LEN max batch $BUILD_MAX_BATCH_SIZE" - script_dir="$(dirname "$(readlink -f "$0")")" pushd $script_dir/.. 
@@ -108,15 +50,10 @@ if [ -z "$ROOT_SAVE_PATH" ]; then ROOT_SAVE_PATH=$(pwd) fi -MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g') -SAVE_PATH=${ROOT_SAVE_PATH}/saved_models_${MODEL_NAME}_${SPARSITY_FMT}_${QFORMAT}_tp${TP}_pp${PP} - -if [ $EXPORT_FORMAT != "tensorrt_llm" ]; then - SAVE_PATH=${SAVE_PATH}_${EXPORT_FORMAT} -fi +MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')_${QFORMAT}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}} +SAVE_PATH=${ROOT_SAVE_PATH}/saved_models_${MODEL_NAME} MODEL_CONFIG=${SAVE_PATH}/config.json -ENGINE_DIR=${SAVE_PATH}/${MODEL_TYPE}_${TP}x${PP}x${GPU_NAME}_input${BUILD_MAX_INPUT_LEN}_output${BUILD_MAX_OUTPUT_LEN}_batch${BUILD_MAX_BATCH_SIZE}_engine if [ "${REMOVE_EXISTING_MODEL_CONFIG,,}" = "true" ]; then rm -f $MODEL_CONFIG @@ -132,27 +69,9 @@ if $TRUST_REMOTE_CODE; then PTQ_ARGS+=" --trust_remote_code " fi -case "${MODEL_TYPE}" in - "vila") - VISUAL_FEATURE=196 - VLM_ARGS=" --max_multimodal_len=$((BUILD_MAX_BATCH_SIZE * VISUAL_FEATURE)) " - ;; - "phi") - VISUAL_FEATURE=4096 - VLM_ARGS=" --max_multimodal_len=$((BUILD_MAX_BATCH_SIZE * VISUAL_FEATURE)) " - ;; - "llava") - VISUAL_FEATURE=576 - VLM_ARGS=" --max_multimodal_len=$((BUILD_MAX_BATCH_SIZE * VISUAL_FEATURE)) " - ;; - "mllama") - PTQ_ARGS+=" --kv_cache_qformat none " - VLM_ARGS=" --max_encoder_input_len=6404 --skip_run" - ;; - "qwen") - PTQ_ARGS+=" --kv_cache_qformat none " - ;; -esac +if [ -n "$KV_CACHE_QUANT" ]; then + PTQ_ARGS+=" --kv_cache_qformat=$KV_CACHE_QUANT " +fi if [ "${MODEL_TYPE}" = "vila" ]; then # Install required dependency for VILA @@ -167,102 +86,47 @@ if [ "${MODEL_TYPE}" = "vila" ]; then fi fi -if [[ $TASKS =~ "build" ]] || [[ ! -d "$ENGINE_DIR" ]] || [[ ! $(ls -A $ENGINE_DIR) ]]; then +if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH) ]]; then if ! [ -f $MODEL_CONFIG ]; then echo "Quantizing original model..." python ../llm_ptq/hf_ptq.py \ --pyt_ckpt_path=$MODEL_PATH \ --export_path=$SAVE_PATH \ - --sparsity_fmt=$SPARSITY_FMT \ --qformat=$QFORMAT \ --calib_size=$CALIB_SIZE \ --batch_size=$CALIB_BATCH_SIZE \ - --inference_tensor_parallel=$TP \ - --inference_pipeline_parallel=$PP \ - --export_fmt=$EXPORT_FORMAT \ - --no-verbose \ $PTQ_ARGS else echo "Quantized model config $MODEL_CONFIG exists, skipping the quantization stage" fi - - if [ $EXPORT_FORMAT != "tensorrt_llm" ]; then - echo "Please continue deployment with $EXPORT_FORMAT. Checkpoint export_path: $SAVE_PATH" - exit 0 - fi - - - echo "Building tensorrt_llm engine from Model Optimizer-quantized model..." - - python ../llm_ptq/modelopt_to_tensorrt_llm.py \ - --model_config=$MODEL_CONFIG \ - --engine_dir=${ENGINE_DIR}/llm \ - --tokenizer=$MODEL_PATH \ - --max_input_len=$BUILD_MAX_INPUT_LEN \ - --max_output_len=$BUILD_MAX_OUTPUT_LEN \ - --max_batch_size=$BUILD_MAX_BATCH_SIZE \ - --num_build_workers=$GPUS \ - --enable_sparsity=$ENABLE_SPARSITY \ - $VLM_ARGS fi +if [[ "$QFORMAT" != "fp8" ]]; then + echo "For quant format $QFORMAT, please refer to the TensorRT-LLM documentation for deployment. Checkpoint saved to $SAVE_PATH." 
+ exit 0 +fi -VISUAL_ARGS="" -VISION_ENCODER_DIR=${ENGINE_DIR}/vision -VISUAL_MODEL_TYPE=$MODEL_TYPE -case "${MODEL_TYPE}" in - "vila") - VISUAL_ARGS+=" --vila_path ${MODEL_PATH}/../VILA " - ;; - "phi") - VISUAL_MODEL_TYPE="phi-3-vision" - ;; - "qwen") - # Map generic type to TRT-LLM multimodal model type - VISUAL_MODEL_TYPE="qwen2_vl" - ;; -esac - - -VISUAL_MAX_BATCH_SIZE=$BUILD_MAX_BATCH_SIZE +if [[ "$QFORMAT" == *"nvfp4"* ]] || [[ "$KV_CACHE_QUANT" == *"nvfp4"* ]]; then + cuda_major=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader -i 0 | cut -d. -f1) -if [[ $TASKS =~ "build" ]] || [[ ! -d "$VISION_ENCODER_DIR" ]] || [[ ! $(ls -A $VISION_ENCODER_DIR) ]]; then - echo "Build visual engine" - python vlm_visual_engine.py \ - --model_path $MODEL_PATH \ - --model_type $VISUAL_MODEL_TYPE \ - --output_dir $VISION_ENCODER_DIR \ - --max_batch_size $VISUAL_MAX_BATCH_SIZE \ - $VISUAL_ARGS + if [ "$cuda_major" -lt 10 ]; then + echo "Please deploy the NVFP4 checkpoint on a Blackwell GPU. Checkpoint export_path: $SAVE_PATH" + exit 0 + fi fi -VLM_RUN_ARGS="" -case "${MODEL_TYPE}" in - "mllama") - VLM_RUN_ARGS+=" --image_path https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg --input_text \"<|image|><|begin_of_text|>If I had to write a haiku for this one\" --max_new_tokens 50 --batch_size 2 " - ;; -esac -echo "Run inference example" - -mpirun -n $GPUS --allow-run-as-root python vlm_run.py \ - --hf_model_dir $MODEL_PATH \ - --engine_dir $ENGINE_DIR \ - --kv_cache_free_gpu_memory_fraction $KV_CACHE_FREE_GPU_MEMORY_FRACTION \ - $VLM_RUN_ARGS +# Prepare datasets for TRT-LLM benchmark +if [ -z "$TRT_LLM_CODE_PATH" ]; then + TRT_LLM_CODE_PATH=/app/tensorrt_llm # default path for the TRT-LLM release docker image + echo "Setting default TRT_LLM_CODE_PATH to $TRT_LLM_CODE_PATH." +fi -if [[ $TASKS =~ "gqa" ]]; then - echo "Evaluating the TensorRT engine of the quantized model using GQA benchmark." - pushd ../vlm_eval/ - if [[ "$MODEL_PATH" =~ ^/ ]]; then - # If MODEL_PATH is absolute path - source gqa.sh --hf_model $MODEL_PATH --engine_dir $ENGINE_DIR --kv_cache_free_gpu_memory_fraction $KV_CACHE_FREE_GPU_MEMORY_FRACTION - else - # If MODEL_PATH is absolute path - script_parent_dir=$(dirname "$script_dir") - source gqa.sh --hf_model $script_parent_dir/$MODEL_PATH --engine_dir $ENGINE_DIR --kv_cache_free_gpu_memory_fraction $KV_CACHE_FREE_GPU_MEMORY_FRACTION - fi +QUICK_START_MULTIMODAL=$TRT_LLM_CODE_PATH/examples/llm-api/quickstart_multimodal.py - popd +if [ -f "$QUICK_START_MULTIMODAL" ]; then + python3 $QUICK_START_MULTIMODAL --model_dir $SAVE_PATH --modality image +else + echo "Warning: $QUICK_START_MULTIMODAL cannot be found. Please set TRT_LLM_CODE_PATH to the TRT-LLM code path or test the quantized checkpoint $SAVE_PATH with the TRT-LLM repo directly." fi popd diff --git a/examples/vlm_ptq/vlm_run.py b/examples/vlm_ptq/vlm_run.py deleted file mode 100644 index 7c84ae61..00000000 --- a/examples/vlm_ptq/vlm_run.py +++ /dev/null @@ -1,128 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os - -import tensorrt_llm -import tensorrt_llm.profiler as profiler -from tensorrt_llm import logger -from tensorrt_llm.runtime import MultimodalModelRunner -from utils import add_common_args - - -def print_result(model, input_text, output_text, args): - logger.info("---------------------------------------------------------") - if model.model_type != "nougat": - logger.info(f"\n[Q] {input_text}") - for i in range(len(output_text)): - logger.info(f"\n[A]: {output_text[i]}") - - if args.num_beams == 1: - output_ids = model.tokenizer(output_text[0][0], add_special_tokens=False)["input_ids"] - logger.info(f"Generated {len(output_ids)} tokens") - - if args.check_accuracy and model.model_type != "nougat": - if model.model_type == "vila": - for i in range(len(args.image_path.split(args.path_sep))): - if i % 2 == 0: - assert output_text[i][0].lower() == ( - "the image captures a bustling city intersection teeming with life. " - "from the perspective of a car's dashboard camera, we see" - ) - else: - assert output_text[i][0].lower() == ( - "the image captures the iconic merlion statue in singapore, " - "a renowned worldwide landmark. the merlion, a mythical" - ) - elif model.model_type == "llava": - for i in range(len(args.image_path.split(args.path_sep))): - assert output_text[i][0].lower() == "singapore" - elif model.model_type == "fuyu": - assert output_text[0][0].lower() == "4" - elif model.model_type == "pix2struct": - assert ( - "characteristic | cat food, day | cat food, wet | cat treats" - in output_text[0][0].lower() - ) - elif model.model_type in ["blip2", "neva", "phi-3-vision", "llava_next"]: - assert "singapore" in output_text[0][0].lower() - elif model.model_type == "video-neva": - assert "robot" in output_text[0][0].lower() - elif model.model_type == "kosmos-2": - assert "snowman" in output_text[0][0].lower() - elif model.model_type == "mllama": - if "If I had to write a haiku for this one" in input_text: - assert ( - "it would be:.\\nPeter Rabbit is a rabbit.\\nHe lives in a" in output_text[0][0] - or "Here is a haiku for the image:\n\n" in output_text[0][0] - ), ( - f"expected results: 'it would be:.\\nPeter Rabbit is a rabbit.\\nHe lives in a', \ - generated results: '{output_text[0][0]}'" - ) - elif "The key to life is" in input_text: - assert ( - "to find your passion and pursue it with all your heart." 
in output_text[0][0] - or "not to be found in the external world," in output_text[0][0] - ), ( - f"expected results: 'to find your passion and pursue it with all your heart.', \ - generated results: '{output_text[0][0]}'" - ) - elif model.model_type == "llava_onevision": - if args.video_path is None: - assert "singapore" in output_text[0][0].lower() - else: - assert ( - "the video is funny because the child's actions are" - in output_text[0][0].lower() - ) - elif model.model_type == "qwen2_vl": - assert "dog" in output_text[0][0].lower() - else: - assert output_text[0][0].lower() == "singapore" - - if args.run_profiling: - - def msec_per_batch(name): - return 1000 * profiler.elapsed_time_in_sec(name) / args.profiling_iterations - - logger.info("Latencies per batch (msec)") - logger.info("TRT vision encoder: {:.1f}".format(msec_per_batch("Vision"))) - logger.info("TRTLLM LLM generate: {:.1f}".format(msec_per_batch("LLM"))) - logger.info("Multimodal generate: {:.1f}".format(msec_per_batch("Generate"))) - - logger.info("---------------------------------------------------------") - - -if __name__ == "__main__": - os.environ["TOKENIZERS_PARALLELISM"] = "false" - parser = argparse.ArgumentParser() - parser = add_common_args(parser) - args = parser.parse_args() - logger.set_level(args.log_level) - - model = MultimodalModelRunner(args) - input_multimodal_data = model.load_test_data(args.image_path, args.video_path) - - num_iters = args.profiling_iterations if args.run_profiling else 1 - - for _ in range(num_iters): - input_text, output_text = model.run( - args.input_text, input_multimodal_data, None, args.max_new_tokens - ) - - runtime_rank = tensorrt_llm.mpi_rank() - if runtime_rank == 0: - print_result(model, input_text, output_text, args) diff --git a/examples/vlm_ptq/vlm_visual_engine.py b/examples/vlm_ptq/vlm_visual_engine.py deleted file mode 100644 index c21c7b53..00000000 --- a/examples/vlm_ptq/vlm_visual_engine.py +++ /dev/null @@ -1,26 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse - -from tensorrt_llm.tools.multimodal_builder import MultimodalEngineBuilder, add_multimodal_arguments - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser = add_multimodal_arguments(parser) - args = parser.parse_args() - - builder = MultimodalEngineBuilder(args) - builder.build() diff --git a/modelopt/deploy/llm/generate.py b/modelopt/deploy/llm/generate.py index d6f71bda..39f62798 100644 --- a/modelopt/deploy/llm/generate.py +++ b/modelopt/deploy/llm/generate.py @@ -21,17 +21,17 @@ from pathlib import Path from typing import Any +import tensorrt_llm import torch +from packaging.version import parse as parse_version from tensorrt_llm import SamplingParams -from tensorrt_llm.bindings.executor import DecodingConfig try: from tensorrt_llm.llmapi import CudaGraphConfig from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig - from tensorrt_llm.llmapi.llm import _TorchLLM, _TrtLLM - from tensorrt_llm.llmapi.tokenizer import TokenizerBase, TransformersTokenizer + from tensorrt_llm.llmapi.llm import LLM as TRTLLM except ImportError: - print("Please upgrade tensorrt-llm to 1.0.0rc or later") + print("Please upgrade tensorrt-llm to 1.1.0rc2 or later") raise @@ -50,53 +50,57 @@ def _sanitize_temperature_and_top_p(temperature, top_p): return kwargs -class LLM: +class LLM(TRTLLM): """A wrapper over the ``tensorrt_llm.llmapi.llm.LLM`` for LLM profiling and validation.""" - def _build_trt_llm_from_config( - self, config, engine_dir, tokenizer, kv_cache_config, medusa_choices, max_batch_size + def __init__( + self, + checkpoint_dir: str | Path, + tokenizer: "str | Path | None" = None, + kv_cache_config: dict[str, int | float] = {}, + medusa_choices: Any = None, + tp: int = 0, + trust_remote_code: bool = False, + max_batch_size: int = 0, ): - build_config = config["build_config"] - world_size = config.get("pretrained_config", {}).get("mapping", {}).get("world_size", 1) - max_batch_size = max(max_batch_size, build_config["max_batch_size"]) - max_tokens_kv_cache = build_config["max_seq_len"] * max_batch_size - - trt_kv_cache_config = TRT_KvCacheConfig(enable_block_reuse=False) - - # If not specified, free_gpu_memory_fraction is set to the default TRT LLM value 0.9 - trt_kv_cache_config.free_gpu_memory_fraction = kv_cache_config.get( - "free_gpu_memory_fraction", 0.9 - ) + """Initializes the LLM runner class. - # If not specified, max_tokens is set to the max value calculated above. - trt_kv_cache_config.max_tokens = kv_cache_config.get("max_tokens", max_tokens_kv_cache) + Args: + checkpoint_dir: the directory path of the model checkpoint. + tokenizer: the tokenizer. For example, a tokenizer from the Huggingface model. + kv_cache_config: the kv cache config as a dict. Please refer to + https://nvidia.github.io/TensorRT-LLM/performance/performance-tuning-guide/ + medusa_choices: The medusa choices for the decoding config. + tp: the tensor parallel size (for the torch backend). If 0, it will be set to the number of GPUs. + trust_remote_code: whether to trust the remote code (for the torch backend). + max_batch_size: Max batch size for the LLM backend. If 0, it is not specified. + """ + with open(Path(checkpoint_dir) / "config.json") as config_file: + config = json.load(config_file) - kwargs = {} - if medusa_choices is not None: - decoding_config = DecodingConfig() - decoding_config.medusa_choices = medusa_choices - kwargs["decoding_config"] = decoding_config - assert world_size == 1, "decoding_config does not support multi TP in HLAPI." 
- - if tokenizer is None: - # Assume the tokenizer is stored in the engine_dir if not specified. - tokenizer = engine_dir - - # CustomSentencePieceTokenizer will not be recognized by llmapi, wrapping it around TransformersTokenizer - if type(tokenizer).__name__ in ["CustomSentencePieceTokenizer"]: - tokenizer = TransformersTokenizer(tokenizer) - - self.llm = _TrtLLM( - backend=None, - model=engine_dir, - tokenizer=tokenizer, - kv_cache_config=trt_kv_cache_config, - **kwargs, - ) + assert medusa_choices is None, "medusa_choices is not supported with the torch llmapi" + + def _find_max_position_embeddings(cfg: dict) -> int | None: + if "max_position_embeddings" in cfg: + return cfg["max_position_embeddings"] + for v in cfg.values(): + if isinstance(v, dict): + res = _find_max_position_embeddings(v) + if res is not None: + return res + return None + + # Some VLMs may have a sub-config for max_position_embeddings, so we need to find it. + self._max_seq_len = _find_max_position_embeddings(config) + if self._max_seq_len is None: + warnings.warn( + "max_position_embeddings not found in config.json, using default value 8192" + ) + self._max_seq_len = 8192 + else: + print(f"max_position_embeddings: {self._max_seq_len}") + self._max_beam_width = 1 - def _build_torch_llm_from_config( - self, checkpoint_dir, tokenizer, tp, trust_remote_code, max_batch_size - ): kwargs = {} if tokenizer is not None: kwargs["tokenizer"] = tokenizer @@ -104,9 +108,19 @@ def _build_torch_llm_from_config( if tp < 1: tp = torch.cuda.device_count() + # Check if any key in config contains both "num" and "experts" + ep = 1 + enable_attention_dp = False + for k in config: + if "num" in k and "experts" in k: + ep = torch.cuda.device_count() + enable_attention_dp = True + break + # Sometimes 90% of the GPU memory is not enough for the TRT LLM torch engine. - trt_kv_cache_config = TRT_KvCacheConfig( - enable_block_reuse=False, free_gpu_memory_fraction=0.85 + trt_kv_cache_config = TRT_KvCacheConfig(free_gpu_memory_fraction=0.7) + trt_kv_cache_config.max_tokens = self._max_seq_len * ( + max_batch_size if max_batch_size > 0 else 8 ) cuda_graph_config = None @@ -118,90 +132,24 @@ def _build_torch_llm_from_config( enable_padding=True, ) - self.llm = _TorchLLM( + self._support_context_logits_and_stop_words = parse_version( + tensorrt_llm.__version__ + ) >= parse_version("1.1.0rc2") + + super().__init__( backend="pytorch", model=checkpoint_dir, tensor_parallel_size=tp, + moe_expert_parallel_size=ep, trust_remote_code=trust_remote_code, enable_chunked_prefill=True, kv_cache_config=trt_kv_cache_config, # pytorch backend configs cuda_graph_config=cuda_graph_config, + enable_attention_dp=enable_attention_dp, **kwargs, ) - def __init__( - self, - checkpoint_dir: str | Path, - tokenizer: "str | Path | TokenizerBase | None" = None, - kv_cache_config: dict[str, int | float] = {}, - medusa_choices: Any = None, - tp: int = 0, - trust_remote_code: bool = False, - max_batch_size: int = 0, - ): - """Initializes the LLM runner class. - - Args: - engine_dir: the directory path of the TensorRT-LLM engine. - tokenizer: the tokenizer. For example, a tokenizer from the Huggingface model. - kv_cache_config: the kv cache config as a dict. Please refer to - https://nvidia.github.io/TensorRT-LLM/performance/performance-tuning-guide/ - medusa_choices: The medusa choices for the decoding config. - tp: the tensor parallel size (for the torch backend). If 0, it will be set to the number of GPUs. 
- trust_remote_code: whether to trust the remote code (for the torch backend). - max_batch_size: Max batch size for the LLM backend. If 0, it will be set to the max batch size - in the engine config. - """ - with open(Path(checkpoint_dir) / "config.json") as config_file: - config = json.load(config_file) - - if "build_config" in config: - self._is_torch = False - self._build_trt_llm_from_config( - config, - checkpoint_dir, - tokenizer, - kv_cache_config, - medusa_choices, - max_batch_size, - ) - - self._max_seq_len = self.llm.args.build_config.max_seq_len - self._max_beam_width = self.llm.args.build_config.max_beam_width - self._gather_context_logits = self.llm.args.build_config.gather_context_logits - else: - self._is_torch = True - assert medusa_choices is None, ( - "medusa_choices is not supported with the torch llmapi" - ) - - self._build_torch_llm_from_config( - checkpoint_dir, tokenizer, tp, trust_remote_code, max_batch_size - ) - - def _find_max_position_embeddings(cfg: dict) -> int | None: - if "max_position_embeddings" in cfg: - return cfg["max_position_embeddings"] - for v in cfg.values(): - if isinstance(v, dict): - res = _find_max_position_embeddings(v) - if res is not None: - return res - return None - - # Some VLMs may have a sub-config for max_position_embeddings, so we need to find it. - self._max_seq_len = _find_max_position_embeddings(config) - if self._max_seq_len is None: - warnings.warn( - "max_position_embeddings not found in config.json, using default value 8192" - ) - self._max_seq_len = 8192 - else: - print(f"max_position_embeddings: {self._max_seq_len}") - self._max_beam_width = 1 - self._gather_context_logits = False - @property def max_seq_len(self): """Get the max sequence length from the LLM instance.""" @@ -215,7 +163,7 @@ def max_beam_width(self): @property def gather_context_logits(self): """Returns whether the context_logits can be returned from the LLM instance.""" - return self._gather_context_logits + return self._support_context_logits_and_stop_words def _generate( self, @@ -227,10 +175,8 @@ def _generate( ): assert temperature >= 0.0, "Temperature must be greater than 0.0." - # TODO: Remove this once torch backend supports stop words - if self._is_torch: + if not self._support_context_logits_and_stop_words: stop_words = None - beam_width = self.max_beam_width kwargs = _sanitize_temperature_and_top_p(temperature, top_p) sampling_config = SamplingParams( @@ -241,7 +187,7 @@ def _generate( **kwargs, ) - return self.llm.generate(prompts, sampling_params=sampling_config, use_tqdm=False) + return self.generate(prompts, sampling_params=sampling_config, use_tqdm=False) def generate_tokens( self, @@ -330,8 +276,8 @@ def generate_context_logits( Returns: a tensor list of the context_logits. """ - assert self.gather_context_logits, ( - "Please enable gather_context_logits flag when building the engine." + assert self._support_context_logits_and_stop_words, ( + "Context logits are not supported with the current tensorrt_llm version." ) assert temperature >= 0.0, "Temperature must be greater than 0.0." 
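For orientation, a minimal usage sketch of the refactored torch-backend `LLM` wrapper introduced in the hunks above. It only exercises the constructor arguments and properties visible in this diff; the import path, the checkpoint path, and the surrounding environment are assumptions, not part of the patch.

```python
# Sketch only (not part of the patch): driving the torch-backend LLM wrapper from
# modelopt/deploy/llm/generate.py. Assumes a unified Hugging Face checkpoint exported by
# examples/llm_ptq/hf_ptq.py and that the class is re-exported as modelopt.deploy.llm.LLM;
# adjust the import if the package layout differs.
from modelopt.deploy.llm import LLM

llm = LLM(
    checkpoint_dir="/path/to/quantized_hf_checkpoint",  # directory containing config.json
    tp=0,                 # 0 -> tensor parallel size falls back to torch.cuda.device_count()
    max_batch_size=0,     # 0 -> the wrapper sizes the KV cache for 8 sequences of max_seq_len
    trust_remote_code=False,
)

print(llm.max_seq_len)            # from max_position_embeddings in config.json (8192 fallback)
print(llm.max_beam_width)         # fixed to 1 for the torch backend
print(llm.gather_context_logits)  # True only when tensorrt_llm >= 1.1.0rc2
```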
@@ -340,6 +286,6 @@ def generate_context_logits( sampling_config = SamplingParams(max_tokens=1, use_beam_search=True, best_of=1, **kwargs) - outputs = self.llm.generate(prompts, sampling_params=sampling_config, use_tqdm=False) + outputs = self.generate(prompts, sampling_params=sampling_config, use_tqdm=False) return [output.context_logits for output in outputs] diff --git a/tests/_test_utils/examples/run_command.py b/tests/_test_utils/examples/run_command.py index cf31ce38..ebae8c80 100644 --- a/tests/_test_utils/examples/run_command.py +++ b/tests/_test_utils/examples/run_command.py @@ -123,16 +123,16 @@ def run_llm_export_command( def run_llm_ptq_command(*, model: str, quant: str, **kwargs): kwargs.update({"model": model, "quant": quant}) - kwargs.setdefault("tasks", "build") + kwargs.setdefault("tasks", "quant") kwargs.setdefault("calib", 16) cmd_parts = _extend_cmd_parts(["scripts/huggingface_example.sh", "--no-verbose"], **kwargs) run_example_command(cmd_parts, "llm_ptq") -def run_vlm_ptq_command(*, model: str, type: str, quant: str, **kwargs): - kwargs.update({"model": model, "type": type, "quant": quant}) - kwargs.setdefault("tasks", "build") +def run_vlm_ptq_command(*, model: str, quant: str, **kwargs): + kwargs.update({"model": model, "quant": quant}) + kwargs.setdefault("tasks", "quant") kwargs.setdefault("calib", 16) cmd_parts = _extend_cmd_parts(["scripts/huggingface_example.sh"], **kwargs) diff --git a/tests/_test_utils/model.py b/tests/_test_utils/model.py index 6e2fe17f..abedd7b2 100644 --- a/tests/_test_utils/model.py +++ b/tests/_test_utils/model.py @@ -63,6 +63,11 @@ def _select_path(remote_id: str, local_id: str) -> str: local_id="llava-1.5-7b-hf", ) +QWEN_VL_PATH = _select_path( + remote_id="Qwen/Qwen2-VL-2B-Instruct", + local_id="Qwen2-VL-2B-Instruct", +) + # Diffusers FLUX_SCHNELL_PATH = _select_path( remote_id="hf-internal-testing/tiny-flux-pipe", diff --git a/tests/_test_utils/ptq_utils.py b/tests/_test_utils/ptq_utils.py index f943faad..89254070 100644 --- a/tests/_test_utils/ptq_utils.py +++ b/tests/_test_utils/ptq_utils.py @@ -27,8 +27,7 @@ @dataclass class PTQCommand: quant: str - export_fmt: str = "tensorrt_llm" - tasks: str = "build" + tasks: str = "quant" calib: int = 16 sparsity: str | None = None kv_cache_quant: str | None = None @@ -38,7 +37,9 @@ class PTQCommand: tp: int | None = None pp: int | None = None min_sm: int | None = None + max_sm: int | None = None min_gpu: int | None = None + batch: int | None = None def run(self, model_path: str): if self.min_sm and torch.cuda.get_device_capability() < ( @@ -48,6 +49,13 @@ def run(self, model_path: str): pytest.skip(reason=f"Requires sm{self.min_sm} or higher") return + if self.max_sm and torch.cuda.get_device_capability() > ( + self.max_sm // 10, + self.max_sm % 10, + ): + pytest.skip(reason=f"Requires sm{self.max_sm} or lower") + return + if self.min_gpu and torch.cuda.device_count() < self.min_gpu: pytest.skip(reason=f"Requires at least {self.min_gpu} GPUs") return diff --git a/tests/examples/llm_eval/test_llm_eval.py b/tests/examples/llm_eval/test_llm_eval.py index 5da65cae..d745df85 100644 --- a/tests/examples/llm_eval/test_llm_eval.py +++ b/tests/examples/llm_eval/test_llm_eval.py @@ -16,20 +16,22 @@ import subprocess from _test_utils.examples.run_command import run_llm_ptq_command +from _test_utils.model import TINY_LLAMA_PATH from _test_utils.torch_misc import minimum_sm @minimum_sm(89) -def test_llama_eval_fp8(tiny_llama_path): +def test_llama_eval_fp8(): try: run_llm_ptq_command( - 
model=tiny_llama_path, + model=TINY_LLAMA_PATH, quant="fp8", - tasks="mmlu,lm_eval,simple_eval,benchmark", + tasks="mmlu,lm_eval,simple_eval", calib=64, lm_eval_tasks="hellaswag,gsm8k", simple_eval_tasks="humaneval", lm_eval_limit=0.1, + batch=8, ) finally: # Force kill llm-serve if it's still running diff --git a/tests/examples/llm_ptq/test_llm_ptq.py b/tests/examples/llm_ptq/test_llm_ptq.py index b3eccf2b..1da11a8c 100644 --- a/tests/examples/llm_ptq/test_llm_ptq.py +++ b/tests/examples/llm_ptq/test_llm_ptq.py @@ -14,8 +14,6 @@ # limitations under the License. -import os - import pytest from _test_utils.model import BART_PATH, MIXTRAL_PATH, T5_PATH, TINY_LLAMA_PATH, WHISPER_PATH from _test_utils.ptq_utils import PTQCommand, WithRequirements @@ -24,7 +22,6 @@ @pytest.mark.parametrize( "command", [ - PTQCommand(quant="fp16"), PTQCommand(quant="fp8", min_sm=89), ], ids=PTQCommand.param_str, @@ -39,7 +36,6 @@ class TestT5(WithRequirements): @pytest.mark.parametrize( "command", [ - PTQCommand(quant="fp16"), PTQCommand(quant="fp8", min_sm=89), ], ids=PTQCommand.param_str, @@ -51,9 +47,7 @@ def test_ptq_t5(self, command): @pytest.mark.parametrize( "command", [ - PTQCommand(quant="fp16"), - PTQCommand(quant="fp8", min_sm=89), - PTQCommand(quant="fp8", export_fmt="hf", min_sm=89), + PTQCommand(quant="fp8", min_sm=90), ], ids=PTQCommand.param_str, ) @@ -71,7 +65,6 @@ class TestWhisper(WithRequirements): "command", [ # Auto-batch-size computation seems to take >10mins for Whisper hence using a fixed batch size - PTQCommand(quant="fp16", calib_batch_size=16), PTQCommand(quant="fp8", calib_batch_size=16, min_sm=89), ], ids=PTQCommand.param_str, @@ -80,84 +73,47 @@ def test_ptq_whisper(self, command): command.run(WHISPER_PATH) -@pytest.fixture(scope="module") -def llama_path(tiny_llama_path): - fast_tests = os.getenv("MODELOPT_FAST_TESTS", "true").lower() == "true" - if fast_tests: - return tiny_llama_path - return TINY_LLAMA_PATH - - @pytest.mark.parametrize( "command", [ - PTQCommand(quant="fp16"), - PTQCommand(quant="bf16"), - PTQCommand(quant="int8_sq"), - # ("int8_sq", "tensorrt_llm", "sparsegpt"), - PTQCommand(quant="int4_awq"), - PTQCommand(quant="int4_awq", export_fmt="hf"), + PTQCommand(quant="int8_sq", kv_cache_quant="none"), + PTQCommand(quant="int8_sq", kv_cache_quant="none", tp=2, pp=2), + PTQCommand(quant="int4_awq", kv_cache_quant="none"), + PTQCommand(quant="w4a8_awq", kv_cache_quant="none"), PTQCommand(quant="nvfp4"), - PTQCommand(quant="nvfp4", export_fmt="hf"), PTQCommand(quant="nvfp4_awq"), - PTQCommand(quant="nvfp4_awq", export_fmt="hf"), - # # autoquant PTQCommand( quant="int4_awq,nvfp4,fp8,w4a8_awq", calib_batch_size=4, auto_quantize_bits=6.4, + kv_cache_quant="none", ), - PTQCommand( - quant="int4_awq,nvfp4,fp8", - export_fmt="hf", - calib_batch_size=4, - auto_quantize_bits=6.4, - ), - # # kv_cache PTQCommand(quant="nvfp4_awq", kv_cache_quant="nvfp4"), - PTQCommand(quant="nvfp4_awq", export_fmt="hf", kv_cache_quant="nvfp4"), - # ("nvfp4_awq", "tensorrt_llm", "nvfp4_affine"), - # ("nvfp4_awq", "hf", "nvfp4_affine"), - # # autoquant_kv_cache PTQCommand( - quant="int4_awq,nvfp4,fp8,w4a8_awq", - kv_cache_quant="nvfp4", + quant="nvfp4,fp8", + kv_cache_quant="fp8", calib_batch_size=4, auto_quantize_bits=6.4, ), PTQCommand( - quant="int4_awq,nvfp4,fp8,w4a8_awq", - export_fmt="hf", + quant="nvfp4,fp8", kv_cache_quant="nvfp4", calib_batch_size=4, auto_quantize_bits=6.4, ), - # ("int4_awq,nvfp4,fp8,w4a8_awq", "tensorrt_llm", "nvfp4_affine"), - # ("int4_awq,nvfp4,fp8,w4a8_awq", "hf", 
"nvfp4_affine"), - # # sm89 PTQCommand(quant="fp8", min_sm=89), - PTQCommand(quant="fp8", kv_cache_quant="none", min_sm=89), - # ("fp8", "tensorrt_llm", "sparsegpt", None), - PTQCommand(quant="fp8", export_fmt="hf", min_sm=89), - PTQCommand(quant="w4a8_awq", min_sm=89), + PTQCommand(quant="fp8", kv_cache_quant="none", min_sm=89), # sm100 + PTQCommand(quant="nvfp4", min_sm=100), # # multi_gpu - # TP - PTQCommand(quant="fp16", tp=2, pp=1, min_gpu=2), - # ("fp16", "build", "sparsegpt", 1), - PTQCommand(quant="nvfp4", tp=2, pp=1, min_gpu=2), - PTQCommand(quant="fp16", tasks="benchmark", tp=2, pp=1, min_gpu=2), - # ("fp16", "benchmark", "sparsegpt", 2, 1), - # PP - # ("nvfp4", "build", None, 1, 2), - # ("fp16", "build", None, 1, 2), - # ("fp16", "build", "sparsegpt", 1, 2), + PTQCommand(quant="fp8", min_gpu=2, min_sm=89), + PTQCommand(quant="nvfp4", min_gpu=2, min_sm=100), ], ids=PTQCommand.param_str, ) -def test_ptq_llama(command, llama_path): - command.run(llama_path) +def test_ptq_llama(command): + command.run(TINY_LLAMA_PATH) diff --git a/tests/examples/speculative_decoding/test_medusa.py b/tests/examples/speculative_decoding/test_medusa.py index 58395c14..27f74eda 100644 --- a/tests/examples/speculative_decoding/test_medusa.py +++ b/tests/examples/speculative_decoding/test_medusa.py @@ -29,7 +29,7 @@ def install_transformers_lt_4_50(): # fmt: off -def _run_hf_ptq(model_path, output_dir, qformat, export_fmt): +def _run_hf_ptq(model_path, output_dir, qformat): run_example_command( [ "python", "hf_ptq.py", @@ -38,7 +38,6 @@ def _run_hf_ptq(model_path, output_dir, qformat, export_fmt): "--calib_size", "64", "--export_path", output_dir, "--qformat", qformat, - "--export_fmt", export_fmt, ], "llm_ptq", ) @@ -66,8 +65,7 @@ def test_llama_medusa_fp8_qat(tiny_llama_path, num_gpus, tiny_daring_anteater_pa ) # Test PTQ on Medusa - _run_hf_ptq(medusa_path, tmp_path / "medusa-tinyllama-trtllm", "fp8", "tensorrt_llm") - _run_hf_ptq(medusa_path, tmp_path / "medusa-tinyllama-hf", "fp8", "hf") + _run_hf_ptq(medusa_path, tmp_path / "medusa-tinyllama-hf", "fp8") # Test QAT on Medusa run_example_command( diff --git a/tests/examples/vlm_ptq/test_llava.py b/tests/examples/vlm_ptq/test_qwen_vl.py similarity index 80% rename from tests/examples/vlm_ptq/test_llava.py rename to tests/examples/vlm_ptq/test_qwen_vl.py index c811bd08..1f06f3a5 100644 --- a/tests/examples/vlm_ptq/test_llava.py +++ b/tests/examples/vlm_ptq/test_qwen_vl.py @@ -16,11 +16,11 @@ import pytest from _test_utils.examples.run_command import run_vlm_ptq_command -from _test_utils.model import LLAVA_PATH +from _test_utils.model import QWEN_VL_PATH from _test_utils.torch_misc import minimum_gpu -@pytest.mark.parametrize("quant", ["fp16"]) +@pytest.mark.parametrize("quant", ["fp8", "int8_sq", "nvfp4"]) @minimum_gpu(2) -def test_llava_multi_gpu(quant): - run_vlm_ptq_command(model=LLAVA_PATH, type="llava", quant=quant, tp=2) +def test_qwen_vl_multi_gpu(quant): + run_vlm_ptq_command(model=QWEN_VL_PATH, quant=quant) diff --git a/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py b/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py index 86c9b0e4..c3d5653c 100644 --- a/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py +++ b/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py @@ -38,7 +38,6 @@ ("nvfp4_awq", "tiny_llama-nvfp4-awq", True, False, True, True), ("int4_awq", "tiny_llama-int4-awq", True, False, True, True), ("w4a8_awq", "tiny_llama-w4a8-awq", True, False, True, 
True), - ("fp8", "t5_tiny-fp8", True, False, True, True), ], ) def test_unified_hf_export_and_check_safetensors( @@ -83,8 +82,6 @@ def test_unified_hf_export_and_check_safetensors( str(tiny_model_dir), "--qformat", qformat, - "--export_fmt", - "hf", "--export_path", str(output_dir), ]
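To make the test-utility changes above concrete, here is a small hedged sketch of how the updated `PTQCommand` gates a case on GPU capability. The model path is illustrative (the tests take it from `_test_utils.model`), and the snippet assumes it runs inside the examples test environment where `_test_utils` is importable.

```python
# Sketch only: mirrors the PTQCommand usage in tests/examples/llm_ptq/test_llm_ptq.py.
from _test_utils.ptq_utils import PTQCommand

cmd = PTQCommand(
    quant="nvfp4",
    kv_cache_quant="nvfp4",
    min_sm=100,   # skip on pre-Blackwell GPUs: get_device_capability() < (10, 0)
    # max_sm=89,  # newly added field: would instead skip on anything newer than sm89
)
cmd.run("TinyLlama/TinyLlama-1.1B-Chat-v1.0")  # tasks defaults to "quant", calib to 16
```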