Commit 303775e

chore: Detect pre-quantized hf model
1 parent 1f1cf7f commit 303775e

File tree

3 files changed: +71 additions, -69 deletions

tools/llm/README.md

Lines changed: 3 additions & 4 deletions

````diff
@@ -40,8 +40,7 @@ python run_llm.py --model meta-llama/Llama-3.2-1B-Instruct --prompt "What is par
 - `--tokenizer`: (Optional) Tokenizer name; defaults to model.
 - `--prompt`: Input prompt for generation.
 - `--precision`: Precision mode (`FP16`, `FP32`).
-- `--qformat`: Quantization format (`fp8`, `nvfp4`) to apply.
-- `--pre_quantized`: Flag to use pre-quantized models from HuggingFace.
+- `--quant_format`: Quantization format (`fp8`, `nvfp4`) to apply.
 - `--num_tokens`: Number of output tokens to generate.
 - `--cache`: KV cache type (`static_v1`, `static_v2`, or empty for no KV caching).
 - `--benchmark`: Enable benchmarking mode.
@@ -56,15 +55,15 @@ Torch-TensorRT supports quantization to reduce model memory footprint and improv
 To use pre-quantized models from HuggingFace:
 
 ```bash
-python run_llm.py --model nvidia/Llama-3.1-8B-Instruct-FP8 --pre_quantized --prompt "What is parallel programming?" --precision FP16 --num_tokens 128
+python run_llm.py --model nvidia/Llama-3.1-8B-Instruct-FP8 --prompt "What is parallel programming?" --precision FP16 --num_tokens 128
 ```
 
 #### Applying quantization by ModelOpt
 
 Apply fp8 quantization from HuggingFace:
 
 ```bash
-python run_llm.py --model meta-llama/Llama-3.1-8B --qformat fp8 --prompt "What is parallel programming?" --precision FP16 --num_tokens 128
+python run_llm.py --model meta-llama/Llama-3.1-8B --quant_format fp8 --prompt "What is parallel programming?" --precision FP16 --num_tokens 128
 ```
 
 #### Quantization Requirements
````
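Note (not part of the diff): with `--pre_quantized` removed, the pre-quantized path is now taken automatically whenever the checkpoint ships an `hf_quant_config.json`. A minimal sketch of how one might check a Hub repo for that marker file up front; the helper name and the use of `huggingface_hub.list_repo_files` are illustrative assumptions, not code from this commit:

```python
# Hypothetical helper: report whether a Hub checkpoint would be picked up as
# pre-quantized, i.e. whether it ships the hf_quant_config.json marker file.
from huggingface_hub import list_repo_files


def is_pre_quantized(repo_id: str) -> bool:
    # load_quantization_config() in quantize_utils.py looks for this file
    return "hf_quant_config.json" in list_repo_files(repo_id)


if __name__ == "__main__":
    # Expected to print True for ModelOpt-exported checkpoints such as this one
    print(is_pre_quantized("nvidia/Llama-3.1-8B-Instruct-FP8"))
```
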

tools/llm/quantize_utils.py

Lines changed: 54 additions & 44 deletions
```diff
@@ -27,22 +27,22 @@
 
 def quantize_model(model, args, tokenizer):
     """
-    Quantize a PyTorch model using ModelOpt quantization.
+    Quantize a PyTorch model using ModelOpt post-training quantization (PTQ).
 
-    This function performs post-training quantization (PTQ) on the model using
-    calibration data from the provided tokenizer. It supports both FP8 and NVFP4
-    quantization formats.
+    This function applies quantization to reduce model precision for faster inference
+    while maintaining acceptable accuracy. It uses calibration data generated from
+    the provided tokenizer to determine optimal quantization parameters.
 
+    Supported quantization formats:
+    - fp8: 8-bit floating point quantization
+    - nvfp4: 4-bit NVIDIA floating point quantization
     Args:
-        model: PyTorch model to quantize
-        args: Arguments containing quantization format and debug settings
-        tokenizer: Tokenizer for creating calibration dataloader
+        model: PyTorch model to quantize. Must be in evaluation mode.
+        args: Command line arguments containing quant_format and debug
+        tokenizer: Hugging Face tokenizer for creating calibration data
 
     Returns:
-        Quantized model with reduced precision weights and activations
-
-    Raises:
-        RuntimeError: If unsupported quantization format is specified
+        Quantized model
     """
     # Create calibration dataloader for quantization
     calib_dataloader = get_dataset_dataloader(
@@ -51,9 +51,9 @@ def quantize_model(model, args, tokenizer):
         num_samples=512,
         device="cuda:0",
     )
-    if args.qformat == "fp8":
+    if args.quant_format == "fp8":
         quant_cfg = mtq.FP8_DEFAULT_CFG
-    elif args.qformat == "nvfp4":
+    elif args.quant_format == "nvfp4":
         quant_cfg = mtq.NVFP4_DEFAULT_CFG
     else:
         raise RuntimeError("Unsupported quantization format")
```

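The rest of `quantize_model` (unchanged, so not shown in these hunks) hands `quant_cfg` and the calibration dataloader to ModelOpt's PTQ entry point. A rough sketch of that pattern, with a toy model and random batches standing in for the real LLM and dataloader; only `mtq.quantize` and the `*_DEFAULT_CFG` configs are taken from ModelOpt, everything else here is an assumption for illustration:

```python
# PTQ sketch, assuming nvidia-modelopt is installed and a CUDA device is available.
import torch
import modelopt.torch.quantization as mtq

toy_model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU()).cuda().eval()
calib_batches = [torch.randn(8, 64, device="cuda") for _ in range(4)]


def forward_loop(model):
    # ModelOpt drives calibration by running representative data through the model
    for batch in calib_batches:
        model(batch)


# mtq.NVFP4_DEFAULT_CFG would be used for --quant_format nvfp4
quantized = mtq.quantize(toy_model, mtq.FP8_DEFAULT_CFG, forward_loop=forward_loop)
```
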
```diff
@@ -108,7 +108,38 @@ def forward(self, input):
         return torch.nn.functional.linear(input, weight, self.bias)
 
 
-def convert_linear_to_tensorrt_quantized(model, model_name):
+def load_quantization_config(model_name):
+    """
+    Load quantization configuration from a Hugging Face model.
+    Args:
+        model_name (str): Local directory path or model identifier
+    Returns:
+        dict or None: Quantization configuration. None if no config found.
+    """
+    # Determine if model_name is a local directory or needs to be downloaded
+    if os.path.isdir(model_name):
+        model_path = model_name
+    else:
+        # Download model from Hugging Face Hub
+        model_path = snapshot_download(
+            model_name,
+            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+            ignore_patterns=["original/**/*"],
+            revision=None,
+        )
+    hf_quant_config = None
+    # Load and parse quantization configuration
+    hf_quant_config_path = f"{model_path}/hf_quant_config.json"
+    if os.path.exists(hf_quant_config_path):
+        with open(hf_quant_config_path, "r") as f:
+            hf_quant_config = json.load(f)
+            hf_quant_config = hf_quant_config["quantization"]
+            hf_quant_config["model_path"] = model_path
+
+    return hf_quant_config
+
+
+def convert_linear_to_tensorrt_quantized(model, hf_quant_config):
     """
     Convert linear layers in a model to TensorRT quantized versions from pre-quantized weights.
 
```

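For orientation, a hypothetical example of what the new helper returns for an FP8 checkpoint; only the `quant_algo` value and the injected `model_path` key are consumed by this commit, and real configs may carry additional fields:

```python
# Illustrative return value of load_quantization_config(); the path is made up.
hf_quant_config = {
    "quant_algo": "FP8",  # "NVFP4" for 4-bit checkpoints
    "model_path": "/path/to/local/snapshot",
}

# A truthy dict means the model is treated as pre-quantized in get_model()
if hf_quant_config:
    print(f"Pre-quantized checkpoint detected: {hf_quant_config['quant_algo']}")
```
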
```diff
@@ -119,58 +150,37 @@ def convert_linear_to_tensorrt_quantized(model, model_name):
 
     The function:
     1. Loads quantization scales from Hugging Face model files (SafeTensors)
-    2. Parses quantization configuration from hf_quant_config.json
-    3. Replaces standard linear layers with TensorRTQuantizedLinear layers
-    4. Applies appropriate quantization based on the model's quantization format
+    2. Replaces standard linear layers with TensorRTQuantizedLinear layers
+    3. Applies appropriate quantization based on the model's quantization format
 
     Note: This function only quantizes linear operations and is intended for use
     with pre-quantized Hugging Face models that have been quantized using ModelOpt.
 
     Args:
         model: PyTorch model to quantize
-        model_name: Path to Hugging Face model directory or model identifier
+        hf_quant_config: Quantization configuration
 
     Returns:
         Model with quantized linear layers
 
     Raises:
         RuntimeError: If quantization config is not found or unsupported format
     """
-    # Determine if model_name is a local directory or needs to be downloaded
-    if os.path.isdir(model_name):
-        hf_folder = model_name
-    else:
-        # Download model from Hugging Face Hub
-        hf_folder = snapshot_download(
-            model_name,
-            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
-            ignore_patterns=["original/**/*"],
-            revision=None,
-        )
-
+    model_path = hf_quant_config["model_path"]
     # Load all tensors from SafeTensors files
     tensors = {}
-    for file in os.listdir(hf_folder):
+    for file in os.listdir(model_path):
         if file.endswith(".safetensors"):
             with safe_open(
-                os.path.join(hf_folder, file), framework="pt", device="cpu"
+                os.path.join(model_path, file), framework="pt", device="cpu"
             ) as f:
                 tensor_names = f.keys()
                 for name in tensor_names:
                     tensors[name] = f.get_tensor(name)
 
-    # Load and parse quantization configuration
-    hf_quant_config_path = f"{hf_folder}/hf_quant_config.json"
-    if os.path.exists(hf_quant_config_path):
-        with open(hf_quant_config_path, "r") as f:
-            hf_quant_config = json.load(f)
-            hf_quant_config = hf_quant_config["quantization"]
-
-        hf_quant_algo = hf_quant_config.pop("quant_algo", None)
-        if hf_quant_algo != "FP8" and hf_quant_algo != "NVFP4":
-            raise RuntimeError("Only FP8 or NVFP4 quantization is supported")
-    else:
-        raise RuntimeError("No quantization config found")
+    hf_quant_algo = hf_quant_config.get("quant_algo", None)
+    if hf_quant_algo != "FP8" and hf_quant_algo != "NVFP4":
+        raise RuntimeError("Only FP8 or NVFP4 quantization is supported")
 
     # Iterate through all modules in the model
     for name, module in model.named_modules():
```

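The hunk ends where the module walk begins; the actual layer swap lives in the unchanged remainder of the function. As a generic illustration of that replace-the-linear-layers pattern only (a sketch, not the real `TensorRTQuantizedLinear` logic, which also wires in the scales loaded above):

```python
# Generic module-replacement sketch; QuantLinear is a stand-in wrapper,
# not the TensorRTQuantizedLinear class from quantize_utils.py.
import torch.nn as nn


class QuantLinear(nn.Module):
    def __init__(self, linear: nn.Linear):
        super().__init__()
        # The real code would also attach weight/input scales from the safetensors files
        self.inner = linear

    def forward(self, x):
        return self.inner(x)


def replace_linears(module: nn.Module) -> nn.Module:
    # Recursively swap every nn.Linear child for the wrapper
    for name, child in list(module.named_children()):
        if isinstance(child, nn.Linear):
            setattr(module, name, QuantLinear(child))
        else:
            replace_linears(child)
    return module
```
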

tools/llm/run_llm.py

Lines changed: 14 additions & 21 deletions
```diff
@@ -19,6 +19,12 @@
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 import torch
 import torch_tensorrt
+from modelopt.torch.quantization.utils import export_torch_mode
+from quantize_utils import (
+    convert_linear_to_tensorrt_quantized,
+    load_quantization_config,
+    quantize_model,
+)
 from torchtrt_ext import register_sdpa
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from utils import (
```

```diff
@@ -60,8 +66,11 @@ def get_model(args):
         .eval()
         .cuda()
     )
-    if args.pre_quantized:
-        model = convert_linear_to_tensorrt_quantized(model, args.model).cuda()
+
+    hf_quant_config = load_quantization_config(args.model)
+    if hf_quant_config:
+        model = convert_linear_to_tensorrt_quantized(model, hf_quant_config).cuda()
+        print(f"Model converted to TensorRT quantized")
 
     if args.precision == "FP16":
         model = model.to(torch.float16)
```

```diff
@@ -95,7 +104,7 @@ def compile_torchtrt(model, input_ids, args):
         for optimized inference
     """
     max_seq_len = input_ids.shape[1] + args.num_tokens
-    with export_torch_mode() if args.qformat or args.pre_quantized else nullcontext():
+    with export_torch_mode():
         ep = export_llm(model, input_ids, max_seq_len=max_seq_len)
         position_ids = torch.arange(input_ids.shape[1]).unsqueeze(0).to(DEVICE)
         # Set precision specific flags
```

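With the imports now unconditional, `export_torch_mode()` wraps the export path for quantized and non-quantized models alike; the removal of the `nullcontext()` branch implies it is harmless when no quantizers are present. A tiny sketch of the pattern in isolation (the toy module and example input are assumptions):

```python
# Sketch: torch.export under ModelOpt's export-mode context manager.
import torch
from modelopt.torch.quantization.utils import export_torch_mode

toy = torch.nn.Linear(16, 16).eval()
example = torch.randn(2, 16)

with export_torch_mode():
    exported = torch.export.export(toy, (example,))
```
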
```diff
@@ -240,28 +249,12 @@ def measure_perf(trt_model, input_signature, backend_name):
         "--benchmark", action="store_true", help="Enable benchmark (default: False)"
     )
     arg_parser.add_argument(
-        "--qformat",
+        "--quant_format",
         help=("Apply quantization format. Options: fp8, nvfp4 (default: None)"),
         default=None,
     )
-    arg_parser.add_argument(
-        "--pre_quantized",
-        action="store_true",
-        help="Use pre-quantized hf model weights (default: False)",
-    )
     args = arg_parser.parse_args()
 
-    if args.qformat and args.pre_quantized:
-        print("Error: --qformat and --pre_quantized cannot be used together")
-        exit()
-
-    if args.qformat or args.pre_quantized:
-        from modelopt.torch.quantization.utils import export_torch_mode
-        from quantize_utils import (
-            convert_linear_to_tensorrt_quantized,
-            quantize_model,
-        )
-
     with torch.inference_mode():
         model = get_model(args)
 
```

```diff
@@ -286,7 +279,7 @@ def measure_perf(trt_model, input_signature, backend_name):
         pyt_timings = None
         pyt_stats = None
 
-        if args.qformat != None:
+        if args.quant_format != None:
             model = quantize_model(model, args, tokenizer)
             if args.enable_pytorch_run:
                 pyt_gen_tokens = generate(
```
