diff --git a/examples/windows/accuracy_benchmark/kl_divergence_metrics/README.md b/examples/windows/accuracy_benchmark/kl_divergence_metrics/README.md new file mode 100644 index 000000000..0c144e095 --- /dev/null +++ b/examples/windows/accuracy_benchmark/kl_divergence_metrics/README.md @@ -0,0 +1,154 @@ +# KL Divergence Model Validation Toolkit + +This toolkit provides comprehensive model validation capabilities using KL divergence metrics to compare two models. It's designed to evaluate the similarity between model outputs across different optimization techniques, frameworks, and hardware backends. + +## Overview + +The toolkit measures output similarity between models using KL (Kullback-Leibler) divergence, which quantifies how one probability distribution differs from another. Lower KL divergence values indicate more similar model outputs. + +**Primary Use Cases:** + +1. **Model Optimization Validation** - Verify that optimized models (quantization, pruning) maintain output quality +2. **Framework Comparison** - Compare Hugging Face models vs ONNX Runtime GenAI models +3. **Precision Analysis** - Evaluate FP16 vs INT4 vs INT8 model outputs +4. **Execution Provider Testing** - Test different EP implementations (CUDA, DirectML, CPU, TensorRT) + +## Key Components + +### Main Script + +| Script | Purpose | Comparison Modes | +|--------|---------|------------------| +| `compute_kl_divergence.py` | **Two-model sequential comparison** | • HF vs GenAI
• GenAI vs GenAI (same EP)<br>• GenAI vs HF<br>
• HF vs HF | + +### Datasets Used + +- **Wikitext-2** test split for consistent evaluation across all models +- Automatic dataset loading and preprocessing via HuggingFace datasets + +## Installation + +### 1. Install Base Requirements + + ```bash + pip install -r requirements.txt + ``` + + Note: Install torch with CUDA for faster inference: + "pip install torch torchvision torchaudio --index-url " + +### 2. Install ONNX Runtime GenAI Package + +Install **one** of the following based on your hardware: + + ```bash +# For CUDA + pip install onnxruntime-genai-cuda + + # For DirectML support + pip install onnxruntime-genai-directml + +# For CPU + pip install onnxruntime-genai + ``` + +## Usage Examples + +### Quick Start + +#### Compare HF vs GenAI Model + +```bash +python compute_kl_divergence.py \ + --model1 "meta-llama/Llama-3.1-8B-Instruct" --model1_type hf \ + --model2 "G:\models\genai_model" --model2_type genai \ + --device cuda \ + --output results.json +``` + +#### Compare Two GenAI Models (Same EP) + +```bash +python compute_kl_divergence.py \ + --model1 "G:\models\genai_fp16" --model1_type genai \ + --model2 "G:\models\genai_int4" --model2_type genai \ + --output fp16_vs_int4.json +``` + +### Advanced Options + +#### Enable Debug Output + +```bash +python compute_kl_divergence.py \ + --model1 "meta-llama/Llama-3.1-8B-Instruct" --model1_type hf \ + --model2 "G:\models\genai_model" --model2_type genai \ + --device cuda \ + --output results.json \ + --debug # Enables verbose logging +``` + +## Configuration Parameters + +### compute_kl_divergence.py + +**Required Parameters:** + +| Parameter | Description | Values | +|-----------|-------------|--------| +| `--model1` | Path to first model | Local path or HF Hub identifier | +| `--model1_type` | Type of first model | `hf`, `genai` | +| `--model2` | Path to second model | Local path or HF Hub identifier | +| `--model2_type` | Type of second model | `hf`, `genai` | + +**Optional Parameters:** + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `--device` | Device for HF model inference | `cuda` | +| `--output` | Output JSON file path | None (prints to console) | +| `--debug` | Enable verbose debug output | False | + +**Model Path Formats:** + +- **HF models**: + - Hub identifier: `meta-llama/Llama-3.1-8B-Instruct` + - Local path: `F:\shared\Llama-3.1-8B-Instruct` +- **GenAI models**: + - Local path only: `G:\models\genai_model` + +### Key Insights + +- **Lower is better**: Smaller KL divergence = more similar outputs +- **Relative comparison**: Compare against baseline (e.g., HF FP32) + +## Troubleshooting + +### Common Issues and Solutions + +#### 1. CUDA Out of Memory + +**Error:** + +```text +RuntimeError: CUDA out of memory +``` + +**Solutions:** + +- Use CPU for HF model: `--device cpu` +- Close other applications using GPU +- Try smaller batch size (modify code if needed) +- Ensure only one model loads at a time (script should handle this) + +#### 2. Execution Provider Mismatch + +**Error:** + +```text +[INFO] Comparing two GenAI models (same execution provider) +``` + +**Note:** This is informational. GenAI vs GenAI comparisons require same EP. + +**Solution:** Ensure both models were created for the same execution provider. 
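+
+## Appendix: How the KL Metric Is Computed
+
+For intuition, the sketch below mirrors the per-chunk computation in `compute_kl_divergence.py`: both models' logits are turned into log-probabilities with a log-softmax over the vocabulary, and the reference model's probabilities weight the absolute log-probability gap at each position. This is an illustrative, self-contained sketch; the array shapes and variable names are examples, not part of the script's interface.
+
+```python
+import numpy as np
+
+def kl_metric(log_probs_ref: np.ndarray, log_probs_tar: np.ndarray) -> float:
+    """Average reference-weighted log-probability gap; inputs are (seq_len, vocab_size)."""
+    probs_ref = np.exp(log_probs_ref)                                    # reference probabilities
+    per_position = np.sum(probs_ref * np.abs(log_probs_ref - log_probs_tar), axis=1)
+    return float(per_position.mean())
+
+# Toy check: two nearly identical 3-token distributions over a 5-word vocabulary -> value near 0.
+rng = np.random.default_rng(0)
+logits_a = rng.normal(size=(3, 5))
+logits_b = logits_a + 0.01 * rng.normal(size=(3, 5))                     # small perturbation
+log_p = logits_a - np.log(np.exp(logits_a).sum(axis=1, keepdims=True))   # log-softmax
+log_q = logits_b - np.log(np.exp(logits_b).sum(axis=1, keepdims=True))
+print(f"KL metric: {kl_metric(log_p, log_q):.6f}")
+```
+
+Identical models give a value of exactly 0; the further apart the output distributions, the larger the number, which is why optimized (quantized, pruned) models are judged by how close their score stays to the FP16/FP32 baseline.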
diff --git a/examples/windows/accuracy_benchmark/kl_divergence_metrics/compute_kl_divergence.py b/examples/windows/accuracy_benchmark/kl_divergence_metrics/compute_kl_divergence.py new file mode 100644 index 000000000..3a3fbbdd2 --- /dev/null +++ b/examples/windows/accuracy_benchmark/kl_divergence_metrics/compute_kl_divergence.py @@ -0,0 +1,895 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +KL divergence computation between two models. + +This script supports two comparison modes: +1. HF model vs GenAI model +2. GenAI model vs GenAI model (same execution provider) + + +Usage: + # HF vs GenAI comparison + python compute_kl_divergence_hf_vs_genai_sequential.py \ + --model1 "path/to/hf/model" --model1_type hf \ + --model2 "path/to/genai/model" --model2_type genai \ + --device cuda + + # GenAI vs GenAI comparison + python compute_kl_divergence_hf_vs_genai_sequential.py \ + --model1 "path/to/genai/model1" --model1_type genai \ + --model2 "path/to/genai/model2" --model2_type genai + +Note: + The GenAI models automatically use the appropriate execution provider + based on the installed onnxruntime-genai package. + +Requirements: + - Only requires VRAM for one model at a time (~8GB for 8B models) + - Sufficient system RAM to store logits (~2-4GB) +""" + +import argparse +import gc +import json +import os +import time +from datetime import datetime +from typing import Any + +import numpy as np +import torch +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer + +# Lazy import for onnxruntime_genai +og: Any = None + +DEBUG = False + + +def debug_print(message): + """Print debug message only if DEBUG flag is enabled.""" + if DEBUG: + print(f"[DEBUG] {message}") + + +def cleanup_vram(): + """Aggressively clean up VRAM.""" + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.synchronize() + debug_print("VRAM cleaned") + try: + debug_print( + f"[INFO] GPU Memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB" + ) + debug_print(f"[INFO] GPU Memory reserved: {torch.cuda.memory_reserved() / 1e9:.2f} GB") + except Exception: + pass + + +def get_wikitext2(): + """ + Load and concatenate the WikiText-2 test dataset. + + Returns: + str: Concatenated text from all samples, separated by double newlines. + """ + print("\n[INFO] Loading Wikitext-2 'test' split...") + test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + debug_print(f"Number of raw samples: {len(test)}") + + result = "\n\n".join(text for text in test["text"]) + debug_print(f"[INFO] Dataset loaded ({len(result):,} characters)") + return result + + +def compute_kl_divergence(log_probs_ref, log_probs_tar): + """ + Compute Kullback-Leibler divergence between two log probability distributions. 
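+
+    Note: the quantity accumulated here is sum_i p_ref(i) * |log p_ref(i) - log p_tar(i)|,
+    averaged over sequence positions, i.e. the reference-weighted absolute log-probability
+    gap, which upper-bounds the standard KL divergence.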
+ + Args: + log_probs_ref (np.ndarray): Reference log probabilities with shape (seq_len, vocab_size). + log_probs_tar (np.ndarray): Target log probabilities with shape (seq_len, vocab_size). + + Returns: + float: Average KL divergence across all positions. + """ + kl_divergence = 0.0 + for i in range(log_probs_ref.shape[0]): + log_probs_ref_i = np.array(log_probs_ref[i]) + log_probs_tar_i = np.array(log_probs_tar[i]) + prob_ref_i = np.exp(log_probs_ref_i) + kl_divergence += np.sum(prob_ref_i * np.abs(log_probs_ref_i - log_probs_tar_i)) + kl_divergence = kl_divergence / log_probs_ref.shape[0] + return kl_divergence + + +def extract_genai_logits(model_path, dataset, max_context_length=4096): + """ + Extract logits from GenAI model and store in CPU memory. + + Args: + model_path (str): Path to ONNX Runtime GenAI model directory. + dataset (str): Text dataset to process. + max_context_length (int): Maximum context length for chunks. + + Returns: + dict: Dictionary containing logits list, chunk info, and metadata. + """ + print("\n" + "=" * 80) + print("STEP 1: EXTRACTING MODEL 1 LOGITS (GenAI)") + print("=" * 80) + print(f"[INFO] Loading ONNX Runtime GenAI model from: {model_path}") + + # Import onnxruntime_genai + global og + try: + import onnxruntime_genai as og_module + + og = og_module + debug_print("[INFO] Successfully imported onnxruntime_genai") + except ImportError as e: + raise ImportError( + f"Failed to import onnxruntime_genai: {e}. " + f"Make sure the correct onnxruntime-genai package is installed" + ) + + assert og is not None, "onnxruntime_genai module not loaded" + + # Load GenAI model + model = og.Model(model_path) + tokenizer = og.Tokenizer(model) + print("[INFO] GenAI model loaded successfully") + + if torch.cuda.is_available(): + try: + debug_print( + f"[INFO] GPU Memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB" + ) + debug_print(f"[INFO] GPU Memory reserved: {torch.cuda.memory_reserved() / 1e9:.2f} GB") + except Exception: + pass + + # Tokenize + print("[INFO] Tokenizing dataset...") + input_ids = tokenizer.encode_batch([dataset]) + + if isinstance(input_ids, dict) and "input_ids" in input_ids: + input_ids = input_ids["input_ids"] + if hasattr(input_ids, "as_numpy"): + input_ids = input_ids.as_numpy() + + input_ids = np.array(input_ids) + + if input_ids.ndim == 1: + input_ids = np.expand_dims(input_ids, 0) + + input_ids = torch.tensor(input_ids, dtype=torch.long) + seq_len = int(input_ids.shape[1]) + debug_print(f"[INFO] Input sequence length: {seq_len}") + + # Store all logits in CPU memory + all_logits = [] + chunk_info = [] + + print("[INFO] Extracting logits ...") + for chunk_count, begin_loc in enumerate(range(0, seq_len, max_context_length), start=1): + end_loc = min(begin_loc + max_context_length, seq_len) + + # Extract chunk + input_ids_chunk = input_ids[:, begin_loc:end_loc].clone() + debug_print(f"Chunk shape: {input_ids_chunk.shape}") + + # Run GenAI model + params = og.GeneratorParams(model) + params.set_search_options( + max_length=int(input_ids_chunk.shape[1]), do_sample=False, early_stopping=False + ) + + generator = og.Generator(model, params) + generator.append_tokens(input_ids_chunk.numpy()) + + with torch.no_grad(): + try: + generator.generate_next_token() + logits = generator.get_output("logits") + + if hasattr(logits, "as_numpy"): + logits = logits.as_numpy() + + logits = torch.tensor(logits, dtype=torch.float32) + debug_print(f"Logits shape: {logits.shape}") + + # Store logits in CPU memory (important!) 
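+                    # Keeping the per-chunk logits in system RAM (not VRAM) is what lets the
+                    # script hold only one model on the GPU at a time; expect roughly 2-4 GB
+                    # of host memory for the full WikiText-2 pass.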
+ logits_cpu = logits.cpu().numpy() + all_logits.append(logits_cpu) + + chunk_info.append( + { + "chunk_id": chunk_count, + "begin_loc": begin_loc, + "end_loc": end_loc, + "shape": logits_cpu.shape, + } + ) + + # Clean up chunk tensors immediately + del generator, params, logits, input_ids_chunk + + except Exception as e: + print(f"[ERROR] GenAI model forward pass failed: {e}") + break + + debug_print(f"[INFO] Extracted {len(all_logits)} chunks from GenAI model") + + genai_data = { + "logits": all_logits, + "chunk_info": chunk_info, + "model_path": model_path, + "model_type": "genai", + "seq_len": seq_len, + "max_context_length": max_context_length, + "total_chunks": len(all_logits), + "input_ids": input_ids.cpu(), # Move to CPU to free GPU memory + } + + # Cleanup: Delete model and free VRAM + debug_print("\n[INFO] Cleaning up GenAI model from VRAM...") + del model + del tokenizer + del input_ids + + cleanup_vram() + + debug_print("[INFO] GenAI model cleaned from VRAM") + debug_print("=" * 80) + + return genai_data + + +def extract_hf_logits(hf_model_path, dataset, device="cuda", max_context_length=4096): + """ + Extract logits from Hugging Face model and store in CPU memory. + + Args: + hf_model_path (str): Path to Hugging Face model directory or model name. + dataset (str): Text dataset to process. + device (str): Device for inference ('cuda' or 'cpu'). + max_context_length (int): Maximum context length for chunks. + + Returns: + dict: Dictionary containing logits list, chunk info, and metadata. + """ + print("\n" + "=" * 80) + print("STEP 1: EXTRACTING MODEL 1 LOGITS (HF)") + print("=" * 80) + print(f"[INFO] Loading Hugging Face model from: {hf_model_path}") + + # Load model + tokenizer = AutoTokenizer.from_pretrained(hf_model_path) + model = AutoModelForCausalLM.from_pretrained( + hf_model_path, + torch_dtype=torch.float16 if device == "cuda" else torch.float32, + device_map=None, # Don't use device_map="auto" for proper cleanup + ) + + if device == "cuda": + model = model.to("cuda") + else: + model = model.to("cpu") + + model.eval() + + # Add padding token if it doesn't exist + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + if getattr(getattr(model, "config", None), "pad_token_id", None) is None: + model.config.pad_token_id = tokenizer.pad_token_id + + print("[INFO] Hugging Face model loaded successfully") + + if device == "cuda": + try: + debug_print( + f"[INFO] GPU Memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB" + ) + debug_print(f"[INFO] GPU Memory reserved: {torch.cuda.memory_reserved() / 1e9:.2f} GB") + except Exception: + pass + + # Tokenize + print("[INFO] Tokenizing dataset...") + inputs = tokenizer(dataset, return_tensors="pt", truncation=False) + input_ids = inputs["input_ids"] + seq_len = int(input_ids.shape[1]) + debug_print(f"[INFO] Input sequence length: {seq_len}") + + # Store all logits in CPU memory + all_logits = [] + chunk_info = [] + + print("[INFO] Extracting logits ...") + for chunk_count, begin_loc in enumerate(range(0, seq_len, max_context_length), 1): + end_loc = min(begin_loc + max_context_length, seq_len) + + # Extract chunk + input_ids_chunk = input_ids[:, begin_loc:end_loc] + debug_print(f"Chunk shape: {input_ids_chunk.shape}") + + # Move to device + input_ids_chunk = input_ids_chunk.to(model.device) + + # Run model + with torch.no_grad(): + try: + outputs = model(input_ids_chunk) + logits = outputs.logits + debug_print(f"Logits shape: {logits.shape}") + + # Store logits in CPU memory (important!) 
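+                    # Copying to host memory first means the chunk's CUDA tensors can be freed
+                    # right away by the `del` / `torch.cuda.empty_cache()` cleanup below.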
+ logits_cpu = logits.cpu().numpy() + all_logits.append(logits_cpu) + + chunk_info.append( + { + "chunk_id": chunk_count, + "begin_loc": begin_loc, + "end_loc": end_loc, + "shape": logits_cpu.shape, + } + ) + + # Clean up chunk tensors immediately + del outputs, logits, input_ids_chunk + + except Exception as e: + print(f"[ERROR] HF model forward pass failed: {e}") + break + + debug_print(f"[INFO] Extracted {len(all_logits)} chunks from HF model") + + hf_data = { + "logits": all_logits, + "chunk_info": chunk_info, + "model_path": hf_model_path, + "model_type": "hf", + "seq_len": seq_len, + "max_context_length": max_context_length, + "total_chunks": len(all_logits), + "input_ids": input_ids.cpu(), # Move to CPU to free GPU memory + } + + # Cleanup: Delete model and free VRAM + debug_print("\n[INFO] Cleaning up HF model from VRAM...") + + # Move model to CPU first to free GPU memory + if device == "cuda": + model = model.to("cpu") + + # Delete all references + del model + del tokenizer + del input_ids + del inputs + + # Aggressive cleanup + cleanup_vram() + gc.collect() + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.synchronize() + + debug_print("[INFO] HF model cleaned from VRAM") + if torch.cuda.is_available(): + try: + debug_print( + f"[INFO] GPU Memory allocated after cleanup: {torch.cuda.memory_allocated() / 1e9:.2f} GB" + ) + debug_print( + f"[INFO] GPU Memory reserved after cleanup: {torch.cuda.memory_reserved() / 1e9:.2f} GB" + ) + except Exception: + pass + debug_print("=" * 80) + + return hf_data + + +def compute_kl_with_model2(model_path, model1_data, dataset, model2_type, device="cuda"): + """ + Load second model and compute KL divergence against stored model1 logits. + + Args: + model_path (str): Path to second model directory. + model1_data (dict): Dictionary containing model1 logits and metadata. + dataset (str): Text dataset to process. + model2_type (str): Type of model2 ('hf' or 'genai'). + device (str): Device for HF model inference if model2 is HF. + + Returns: + dict: Results containing KL divergence metrics. + """ + print("\n" + "=" * 80) + print(f"STEP 2: LOADING MODEL 2 ({model2_type.upper()}) AND COMPUTING KL DIVERGENCE") + print("=" * 80) + print(f"[INFO] Loading model from: {model_path}") + + # Load model based on type + if model2_type == "genai": + # Import onnxruntime_genai + global og + try: + import onnxruntime_genai as og_module + + og = og_module + debug_print("[INFO] Successfully imported onnxruntime_genai") + except ImportError as e: + raise ImportError( + f"Failed to import onnxruntime_genai: {e}. 
" + f"Make sure the correct onnxruntime-genai package is installed" + ) + + assert og is not None, "onnxruntime_genai module not loaded" + + # Load GenAI model + model = og.Model(model_path) + tokenizer = og.Tokenizer(model) + print("[INFO] GenAI model loaded successfully") + + if torch.cuda.is_available(): + try: + debug_print( + f"[INFO] GPU Memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB" + ) + debug_print( + f"[INFO] GPU Memory reserved: {torch.cuda.memory_reserved() / 1e9:.2f} GB" + ) + except Exception: + pass + + # Tokenize with GenAI tokenizer + print("\n[INFO] Tokenizing dataset...") + input_ids = tokenizer.encode_batch([dataset]) + + if isinstance(input_ids, dict) and "input_ids" in input_ids: + input_ids = input_ids["input_ids"] + if hasattr(input_ids, "as_numpy"): + input_ids = input_ids.as_numpy() + + input_ids = np.array(input_ids) + + if input_ids.ndim == 1: + input_ids = np.expand_dims(input_ids, 0) + + input_ids = torch.tensor(input_ids, dtype=torch.long) + + else: # model2_type == "hf" + # Load HF model + from transformers import AutoModelForCausalLM, AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(model_path) + model = AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype=torch.float16 if device == "cuda" else torch.float32, + device_map=None, # Don't use device_map="auto" for proper cleanup + ) + + if device == "cuda": + model = model.to("cuda") + else: + model = model.to("cpu") + + model.eval() + + # Add padding token if it doesn't exist + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + if getattr(getattr(model, "config", None), "pad_token_id", None) is None: + model.config.pad_token_id = tokenizer.pad_token_id + + print("[INFO] HF model loaded successfully") + + if device == "cuda" and torch.cuda.is_available(): + try: + debug_print( + f"[INFO] GPU Memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB" + ) + debug_print( + f"[INFO] GPU Memory reserved: {torch.cuda.memory_reserved() / 1e9:.2f} GB" + ) + except Exception: + pass + + # Tokenize with HF tokenizer + print("\n[INFO] Tokenizing dataset...") + inputs = tokenizer(dataset, return_tensors="pt", truncation=False) + input_ids = inputs["input_ids"] + + seq_len = int(input_ids.shape[1]) + debug_print(f"[INFO] Input sequence length: {seq_len}") + + # Get config from model1 data + max_context_length = model1_data["max_context_length"] + total_chunks = model1_data["total_chunks"] + + debug_print(f"\n[INFO] Computing KL divergence for {total_chunks} chunks...") + debug_print("=" * 80) + + # Process chunks and compute KL divergence + total_kl_divergence = 0.0 + chunk_results = [] + chunk_count = 0 + + for begin_loc in range(0, seq_len, max_context_length): + chunk_count += 1 + + if chunk_count > total_chunks: + break + + end_loc = min(begin_loc + max_context_length, seq_len) + + # Extract chunk + input_ids_chunk = input_ids[:, begin_loc:end_loc].clone() + debug_print(f"Chunk shape: {input_ids_chunk.shape}") + + # Run model2 based on type + with torch.no_grad(): + try: + if model2_type == "genai": + # Run GenAI model + params = og.GeneratorParams(model) + params.set_search_options( + max_length=int(input_ids_chunk.shape[1]), + do_sample=False, + early_stopping=False, + ) + + generator = og.Generator(model, params) + generator.append_tokens(input_ids_chunk.numpy()) + generator.generate_next_token() + + model2_logits = generator.get_output("logits") + if hasattr(model2_logits, "as_numpy"): + model2_logits = model2_logits.as_numpy() + model2_logits 
= torch.tensor(model2_logits, dtype=torch.float32) + + else: # model2_type == "hf" + # Run HF model + input_ids_chunk_device = input_ids_chunk.to(model.device) + outputs = model(input_ids_chunk_device) + model2_logits = outputs.logits + + debug_print(f"Model2 logits shape: {model2_logits.shape}") + + except Exception as e: + print(f"[ERROR] Model2 forward pass failed: {e}") + break + + # Get corresponding model1 logits from stored data + model1_logits = torch.tensor(model1_data["logits"][chunk_count - 1], dtype=torch.float32) + debug_print(f"Model1 logits shape: {model1_logits.shape}") + + # Ensure logits are compatible (trim to same dimensions) + min_seq = min(model1_logits.shape[1], model2_logits.shape[1]) + min_vocab = min(model1_logits.shape[2], model2_logits.shape[2]) + + model1_logits_trimmed = model1_logits[:, :min_seq, :min_vocab] + model2_logits_trimmed = model2_logits[:, :min_seq, :min_vocab] + + debug_print(f"Trimmed Model1 logits: {model1_logits_trimmed.shape}") + debug_print(f"Trimmed Model2 logits: {model2_logits_trimmed.shape}") + + # Compute log probabilities + model1_log_probs = ( + torch.nn.functional.log_softmax(model1_logits_trimmed, dim=2).cpu().numpy() + ) + model2_log_probs = ( + torch.nn.functional.log_softmax(model2_logits_trimmed, dim=2).cpu().numpy() + ) + + # Squeeze batch dimension + model1_log_probs = np.squeeze(model1_log_probs, axis=0) + model2_log_probs = np.squeeze(model2_log_probs, axis=0) + + debug_print(f"Model1 log probs shape: {model1_log_probs.shape}") + debug_print(f"Model2 log probs shape: {model2_log_probs.shape}") + + # Compute KL divergence for this chunk + chunk_kl = compute_kl_divergence(model1_log_probs, model2_log_probs) + total_kl_divergence += chunk_kl + + debug_print(f"[RESULT] Chunk {chunk_count} KL divergence: {chunk_kl:.6f}") + + chunk_results.append( + { + "chunk_id": chunk_count, + "begin_loc": int(model1_data["chunk_info"][chunk_count - 1]["begin_loc"]), + "end_loc": int(model1_data["chunk_info"][chunk_count - 1]["end_loc"]), + "kl_divergence": float(chunk_kl), + } + ) + + # Cleanup chunk data immediately + del model2_logits, model1_logits, model1_logits_trimmed, model2_logits_trimmed + del model1_log_probs, model2_log_probs, input_ids_chunk + if model2_type == "genai": + del generator, params + else: + del outputs, input_ids_chunk_device + + avg_kl_divergence = total_kl_divergence / chunk_count + + print("\n" + "=" * 80) + print("RESULTS") + print("=" * 80) + debug_print(f"Total chunks processed: {chunk_count}") + debug_print(f"Total KL divergence: {total_kl_divergence:.6f}") + print(f"Average KL divergence: {avg_kl_divergence:.6f}") + print("=" * 80) + + # Cleanup model2 + debug_print(f"\n[INFO] Cleaning up model2 ({model2_type}) from VRAM...") + + # For HF models, move to CPU first + if model2_type == "hf" and device == "cuda": + model = model.to("cpu") + + del model + del tokenizer + del input_ids + + # Aggressive cleanup + cleanup_vram() + gc.collect() + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.synchronize() + + debug_print(f"[INFO] Model2 ({model2_type}) cleaned from VRAM") + if torch.cuda.is_available(): + try: + debug_print( + f"[INFO] GPU Memory allocated after cleanup: {torch.cuda.memory_allocated() / 1e9:.2f} GB" + ) + debug_print( + f"[INFO] GPU Memory reserved after cleanup: {torch.cuda.memory_reserved() / 1e9:.2f} GB" + ) + except Exception: + pass + + results = { + "total_kl_divergence": float(total_kl_divergence), + "average_kl_divergence": float(avg_kl_divergence), + "total_chunks": 
int(chunk_count), + "chunk_results": chunk_results, + } + + return results + + +def main(): + """ + Command-line entry point for sequential KL divergence comparison. + """ + parser = argparse.ArgumentParser( + description="Memory-efficient sequential KL divergence comparison between HF and GenAI models", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Compare HF vs GenAI model + python compute_kl_divergence_hf_vs_genai_sequential.py \\ + --model1 "meta-llama/Llama-3.1-8B-Instruct" --model1_type hf \\ + --model2 "G:\\models\\genai_model" --model2_type genai \\ + --device cuda \\ + --output results.json + + # Compare two GenAI models (same EP) + python compute_kl_divergence_hf_vs_genai_sequential.py \\ + --model1 "G:\\models\\genai_fp16" --model1_type genai \\ + --model2 "G:\\models\\genai_int4" --model2_type genai \\ + --output results.json + + # Compare GenAI vs HF model + python compute_kl_divergence_hf_vs_genai_sequential.py \\ + --model1 "G:\\models\\genai_model" --model1_type genai \\ + --model2 "F:\\shared\\Llama-3.1-8B-Instruct" --model2_type hf \\ + --device cuda \\ + --output results.json + +Note: + - GenAI models automatically use the appropriate execution provider + based on the installed onnxruntime-genai package (cuda, directml, cpu, tensorrt) + - The --device flag only controls HF model inference (applies to any HF model) + - For GenAI vs GenAI comparison, both models must use the same execution provider + +Advantages: + - Only loads one model at a time (minimal VRAM usage) + - No disk I/O overhead (stores logits in RAM) + - Single process execution + - Real-time KL divergence computation + +VRAM Requirements: + - Only requires VRAM for one model at a time (~8GB for 8B models) + - Requires system RAM to store logits (~2-4GB) + """, + ) + + parser.add_argument("--model1", required=True, help="Path to first model directory") + parser.add_argument( + "--model1_type", + required=True, + choices=["hf", "genai"], + help="Type of first model (hf or genai)", + ) + parser.add_argument("--model2", required=True, help="Path to second model directory") + parser.add_argument( + "--model2_type", + required=True, + choices=["hf", "genai"], + help="Type of second model (hf or genai)", + ) + parser.add_argument( + "--device", + default="cuda", + choices=["cuda", "cpu"], + help="Device for HF model inference (default: cuda)", + ) + parser.add_argument("--output", required=False, help="Output JSON file for results (optional)") + parser.add_argument("--debug", action="store_true", help="Enable verbose debug output") + + args = parser.parse_args() + + # Set global debug flag + global DEBUG + DEBUG = args.debug + + # Validate model paths (only check local paths, HF model identifiers will be downloaded) + if args.model1_type == "genai" and not os.path.exists(args.model1): + print(f"[ERROR] Model1 path does not exist: {args.model1}") + return 1 + elif ( + args.model1_type == "hf" and os.path.sep in args.model1 and not os.path.exists(args.model1) + ): + # Only validate if it looks like a local path (contains path separators) + print(f"[ERROR] Model1 path does not exist: {args.model1}") + return 1 + + if args.model2_type == "genai" and not os.path.exists(args.model2): + print(f"[ERROR] Model2 path does not exist: {args.model2}") + return 1 + elif ( + args.model2_type == "hf" and os.path.sep in args.model2 and not os.path.exists(args.model2) + ): + # Only validate if it looks like a local path (contains path separators) + print(f"[ERROR] Model2 path does not exist: 
{args.model2}") + return 1 + + # Validate GenAI vs GenAI comparison uses same EP + if args.model1_type == "genai" and args.model2_type == "genai": + print("[INFO] Comparing two GenAI models (same execution provider)") + + print("=" * 80) + print("KL DIVERGENCE COMPUTATION BETWEEN TWO MODELS") + print("=" * 80) + print(f"Model 1: {args.model1} ({args.model1_type.upper()})") + print(f"Model 2: {args.model2} ({args.model2_type.upper()})") + if args.model1_type == "hf" or args.model2_type == "hf": + print(f"HF Device: {args.device}") + if args.output: + print(f"Output: {args.output}") + print("=" * 80) + + overall_start_time = time.time() + + try: + # Load dataset once + dataset = get_wikitext2() + + # Step 1: Extract Model1 logits and store in memory + model1_start_time = time.time() + if args.model1_type == "hf": + model1_data = extract_hf_logits(args.model1, dataset, args.device) + else: # genai + model1_data = extract_genai_logits(args.model1, dataset) + model1_end_time = time.time() + print( + f"\n[TIMING] Model1 extraction time: {model1_end_time - model1_start_time:.2f} seconds" + ) + + # Step 2: Load Model2 and compute KL divergence + model2_start_time = time.time() + kl_results = compute_kl_with_model2( + args.model2, model1_data, dataset, args.model2_type, args.device + ) + model2_end_time = time.time() + debug_print( + f"\n[TIMING] Model2 computation time: {model2_end_time - model2_start_time:.2f} seconds" + ) + + overall_end_time = time.time() + + # Prepare final results + final_results = { + "models": { + "model1": {"path": str(args.model1), "type": args.model1_type}, + "model2": {"path": str(args.model2), "type": args.model2_type}, + }, + "device": args.device + if (args.model1_type == "hf" or args.model2_type == "hf") + else "N/A", + "total_chunks": kl_results["total_chunks"], + "max_context_length": model1_data["max_context_length"], + "kl_divergence": { + "total": kl_results["total_kl_divergence"], + "average": kl_results["average_kl_divergence"], + }, + "chunk_results": kl_results["chunk_results"], + "timing": { + "model1_extraction_seconds": float(model1_end_time - model1_start_time), + "model2_computation_seconds": float(model2_end_time - model2_start_time), + "total_seconds": float(overall_end_time - overall_start_time), + }, + "computation_timestamp": datetime.now().isoformat(), + } + + # Save results if output file specified + if args.output: + print(f"\n[INFO] Saving results to: {args.output}") + with open(args.output, "w") as f: + json.dump(final_results, f, indent=2) + print("[INFO] Results saved successfully") + + print("\n" + "=" * 80) + print("FINAL SUMMARY") + print("=" * 80) + print(f"Total execution time: {overall_end_time - overall_start_time:.2f} seconds") + print( + f" - Model1 ({args.model1_type}) extraction: {model1_end_time - model1_start_time:.2f} seconds" + ) + print( + f" - Model2 ({args.model2_type}) + KL computation: {model2_end_time - model2_start_time:.2f} seconds" + ) + print(f"\nAverage KL divergence: {kl_results['average_kl_divergence']:.6f}") + print("=" * 80) + + print("\n[SUCCESS] KL divergence computation completed!") + return 0 + + except KeyboardInterrupt: + print("\n[INFO] Computation interrupted by user") + return 1 + except Exception as e: + print(f"\n[ERROR] Computation failed: {e}") + if DEBUG: + import traceback + + traceback.print_exc() + return 1 + finally: + # Final cleanup + cleanup_vram() + + +if __name__ == "__main__": + import sys + + sys.exit(main()) diff --git 
a/examples/windows/accuracy_benchmark/kl_divergence_metrics/requirements.txt b/examples/windows/accuracy_benchmark/kl_divergence_metrics/requirements.txt new file mode 100644 index 000000000..8409b2f8e --- /dev/null +++ b/examples/windows/accuracy_benchmark/kl_divergence_metrics/requirements.txt @@ -0,0 +1,8 @@ +--extra-index-url https://download.pytorch.org/whl/cu129 +accelerate +datasets +numpy +safetensors>=0.4.0 + +torch>=2.0.0 +transformers>=4.30.0 diff --git a/examples/windows/accuracy_benchmark/perplexity_metrics/README.md b/examples/windows/accuracy_benchmark/perplexity_metrics/README.md new file mode 100644 index 000000000..dd2dda5c5 --- /dev/null +++ b/examples/windows/accuracy_benchmark/perplexity_metrics/README.md @@ -0,0 +1,237 @@ +# Perplexity Evaluation Tool + +## Overview + +This tool evaluates the perplexity of ONNX Runtime GenAI models and HuggingFace models using the [WikiText-2](https://huggingface.co/datasets/wikitext) dataset. Perplexity is a standard metric for language models: lower values indicate better predictive performance. + +## Attribution + +This script is originally based on [perplexity_metrics.py](https://github.com/microsoft/onnxruntime-genai/blob/main/tools/python/model_validation/perplexity_metrics.py) from the Microsoft ONNX Runtime GenAI repository. It has been modified to handle: + +- Multiple context lengths +- Configurable chunk sizes +- Enhanced prefill chunking handling +- HuggingFace model evaluation support + +## Scripts + +- **`perplexity_metrics.py`**: Core evaluation logic for computing perplexity. +- **`run_perplexity.py`**: Command-line utility for evaluating one or more models and saving results to CSV. + +## Requirements + +- Python 3.8+ +- CUDA 12.x (if using GPU acceleration) + +- Install dependencies: + + **For CUDA 12.x (recommended for CUDA 12.1-12.9):** + + ```bash + pip install -r requirements.txt + ``` + +- Install ONNX Runtime GenAI (required for ONNX model evaluation): + + ```bash + pip install onnxruntime-genai + ``` + +- [HuggingFace CLI](https://huggingface.co/docs/huggingface_hub/main/en/guides/cli) login is required to access the WikiText-2 dataset: + + ```bash + huggingface-cli login + ``` + +## Supported Models + +### ONNX Runtime GenAI Models + +- Any ONNX Runtime GenAI model exported with a compatible `genai_config.json` and tokenizer. +- Supported architectures include: Gemma, Llama, Mistral, Phi (language + vision), Qwen. +- Supported execution providers: CPU, DirectML, CUDA, NvTensorRtRtx. + +### HuggingFace Models + +- Any HuggingFace causal language model (e.g., `meta-llama/Llama-2-7b-hf`, `gpt2`, `mistralai/Mistral-7B-v0.1`). +- Models are automatically downloaded from the HuggingFace Hub if not cached locally. +- Supports custom data types (float16, bfloat16, float32) for efficient inference. 
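+
+## How Perplexity Is Computed
+
+Both evaluation paths (ONNX Runtime GenAI and HuggingFace) walk a sliding window over the tokenized WikiText-2 text, accumulate the log-probability the model assigns to each ground-truth next token, and exponentiate the negative average. The snippet below is only a simplified illustration of that final step, with made-up numbers; it is not part of the tool's API.
+
+```python
+import numpy as np
+
+# Log-probabilities the model assigned to each ground-truth next token (toy values).
+token_log_probs = np.array([-2.1, -0.4, -1.7, -3.0])
+
+avg_log_prob = token_log_probs.mean()
+perplexity = np.exp(-avg_log_prob)        # lower is better
+print(f"Perplexity: {perplexity:.2f}")    # ~6.05 for these toy values
+```
+
+A model that assigned probability 1.0 to every next token would score a perplexity of 1, the theoretical minimum.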
+ +## How to Run + +### Evaluate ONNX Models + +#### Single Model + +```bash +python run_perplexity.py --models /path/to/model +``` + +#### Multiple Models + +```bash +python run_perplexity.py --models /path/to/model1 /path/to/model2 +``` + +#### Custom Input Sequence Length(s) + +You can specify the input sequence length(s) to evaluate using the `--i` argument: + +```bash +python run_perplexity.py --models /path/to/model --i 1024,2048,4096,8192,12288 +``` + +#### Custom Prefill Chunk Size + +You can specify the prefill chunk size to evaluate using the `--chunk_size` argument: + +```bash +python run_perplexity.py --models /path/to/model --i 1024,2048,4096,8192,12288 --chunk_size=1024 +``` + +### Evaluate HuggingFace Models + +#### Basic HuggingFace Model Evaluation + +```bash +python run_perplexity.py --hf_model meta-llama/Llama-2-7b-hf --i 1024 +``` + +#### With Custom Data Type (Recommended for Performance) + +```bash +python run_perplexity.py --hf_model meta-llama/Llama-2-7b-hf --hf_dtype float16 --i 1024 +``` + +#### With Multiple Input Lengths + +```bash +python run_perplexity.py --hf_model meta-llama/Llama-2-7b-hf --hf_dtype float16 --i 1024,2048,4096 +``` + +#### On CPU (if no GPU available) + +```bash +python run_perplexity.py --hf_model gpt2 --hf_device cpu --i 1024 +``` + +### Evaluate Both ONNX and HuggingFace Models Together + +Compare ONNX and HuggingFace models side-by-side: + +```bash +python run_perplexity.py \ + --models /path/to/onnx_model \ + --hf_model meta-llama/Llama-2-7b-hf \ + --hf_dtype float16 \ + --i 1024 \ + --output comparison_results.csv +``` + +### HuggingFace Model Arguments + +- `--hf_model`: HuggingFace model name or local path (e.g., `meta-llama/Llama-2-7b-hf`) +- `--hf_device`: Device to run on (`cuda`, `cpu`, `cuda:0`, etc.) - default: `cuda` +- `--hf_dtype`: Data type for model weights - options: `float16`, `bfloat16`, `float32`, `fp16`, `bf16`, `fp32` - default: model default (usually float32) + +### Custom Output File + +```bash +python run_perplexity.py --models /path/to/model --output results.csv +``` + +## Expected Output + +Expected scores often fall between 2 and 1000; lower is better. See ranges below. + +### Perplexity Configuration Setting (for ONNX models) + +- If **kv_chunking** is enabled in the model configuration (i.e., `"chunk_size"` is present in the `"search"` section of `genai_config.json`), then: + - `max_input_seq_length` is set to **8192** + - `stride` is set to the value of `chunk_size` +- If **kv_chunking** is not enabled (default): + - `max_input_seq_length` is **1024** + - `stride` is **512** + +### For HuggingFace Models + +- Default `max_length` is **1024** +- Default `stride` is **512** (or `chunk_size` if specified) + +### Console Output + +```text +============================================================ +Evaluating HuggingFace model: meta-llama/Llama-2-7b-hf +============================================================ +[INFO] Loading Wikitext-2 'test' split ... +[TOKENIZER] Tokenizing ... + +[RESULT] Perplexity of meta-llama/Llama-2-7b-hf: 5.47 + +HuggingFace perplexity evaluation completed + +============================================================ +Evaluating perplexity for: /path/to/onnx_model +============================================================ +[INFO] Loading Wikitext-2 'test' split ... +[TOKENIZER] Tokenizing ... 
+ +[RESULT] Perplexity of /path/to/onnx_model: 5.48 + +Perplexity evaluation completed successfully +``` + +### CSV Output + +Generated file contains: + +- Model Path (model directory or HuggingFace model name) +- Model Type (ONNX or HuggingFace) +- Input Length +- Perplexity score +- Status (Success/Failed) +- Error details (if any) + +## Debug Mode + +Set `DEBUG = True` in `perplexity_metrics.py` for detailed logs. + +## Typical Perplexity Ranges + +- Excellent: 2-20 +- Good: 20-40 +- OK: 40-80 +- Poor: 100+ + +## Common Use Cases + +### Compare ONNX vs. HuggingFace Model + +Verify that your ONNX exported model has similar perplexity to the original HuggingFace model: + +```bash +python run_perplexity.py \ + --models /path/to/exported_onnx_model \ + --hf_model meta-llama/Llama-2-7b-hf \ + --hf_dtype float16 \ + --i 1024 \ + --output validation_results.csv +``` + +### Evaluate Small Models (for quick testing) + +```bash +python run_perplexity.py --hf_model gpt2 --hf_dtype float16 --i 1024 +``` + +### Benchmark Multiple Quantization Variants + +```bash +python run_perplexity.py \ + --models /path/to/fp16_model /path/to/int8_model /path/to/int4_model \ + --hf_model original/model-name \ + --hf_dtype float16 \ + --i 2048 \ + --output quantization_comparison.csv +``` diff --git a/examples/windows/accuracy_benchmark/perplexity_metrics/perplexity_metrics.py b/examples/windows/accuracy_benchmark/perplexity_metrics/perplexity_metrics.py new file mode 100644 index 000000000..f63677300 --- /dev/null +++ b/examples/windows/accuracy_benchmark/perplexity_metrics/perplexity_metrics.py @@ -0,0 +1,559 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# SPDX-License-Identifier: MIT +# +# Copyright (c) Microsoft Corporation. All rights reserved. +# +# This file is based on perplexity_metrics.py from the ONNX Runtime GenAI project: +# https://github.com/microsoft/onnxruntime-genai/blob/main/tools/python/model_validation/perplexity_metrics.py +# +# Modifications Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# Modifications made: +# - Added support for multiple context lengths +# - Added configurable chunk sizes +# - Enhanced prefill chunking handling + +import json +import time + +import numpy as np +import onnxruntime_genai as og +import torch +from datasets import load_dataset + +# Global debug flag - set to True for verbose output +DEBUG = False + + +def calculate_perplexity_hf( + model_name_or_path, max_length=1024, stride=512, device="cuda", torch_dtype=None +): + """ + Evaluate perplexity of a HuggingFace model on the WikiText-2 dataset. + + This function computes perplexity using a sliding window approach similar to the + ONNX Runtime GenAI version, but using native HuggingFace transformers. + + Args: + model_name_or_path (str): HuggingFace model name (e.g., 'meta-llama/Llama-2-7b-hf') + or path to a local model directory. + max_length (int, optional): Maximum input sequence length for evaluation. + Defaults to 1024. + stride (int, optional): Stride for sliding window evaluation. + Defaults to 512. + device (str, optional): Device to run the model on ('cuda', 'cpu', etc.). + Defaults to 'cuda'. + torch_dtype: PyTorch dtype for the model. If None, uses default (float32). + Common options: torch.float16, torch.bfloat16, torch.float32. + + Returns: + float: Computed perplexity score. Lower values indicate better model performance. + + Raises: + ImportError: If transformers package is not installed. + """ + try: + from transformers import AutoModelForCausalLM, AutoTokenizer + except ImportError as e: + raise ImportError( + "The 'transformers' package is required for HuggingFace model evaluation. 
" + "Install it with: pip install transformers" + ) from e + + time_start = time.time() + print(f"\n[RUN] === BEGIN calculate_perplexity_hf('{model_name_or_path}') ===") + print(f"[RUN] Loading HuggingFace model from: {model_name_or_path}") + + # Load tokenizer + print("[TOKENIZER] Loading tokenizer ...") + tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) + + # Set pad_token if not already set + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + # Load model + print(f"[MODEL] Loading model on device: {device}") + model_kwargs = {"device_map": device} + if torch_dtype is not None: + model_kwargs["torch_dtype"] = torch_dtype + print(f"[MODEL] Using dtype: {torch_dtype}") + + model = AutoModelForCausalLM.from_pretrained(model_name_or_path, **model_kwargs) + model.eval() + + # Load and prepare the evaluation dataset + dataset = get_wikitext2() + print("[TOKENIZER] Tokenizing ...") + + # Tokenize the entire dataset + encodings = tokenizer(dataset, return_tensors="pt", add_special_tokens=True) + input_ids = encodings.input_ids + + if DEBUG: + print(f"[TOKENIZER] Input shape: {input_ids.shape}, dtype: {input_ids.dtype}") + + seq_len = input_ids.size(1) + print(f"[INFO] Full input length: {seq_len}") + print(f"[INFO] max_length: {max_length}, stride: {stride}") + + max_eval_length = seq_len + + # Initialize accumulators for log probabilities + total_log_probs = 0.0 + total_token_count = 0 + prev_end_loc = 0 + + # Slide a window over the input to compute perplexity in chunks + for chunk_idx, begin_loc in enumerate(range(0, max_eval_length, stride)): + end_loc = min(begin_loc + max_length, seq_len) + trg_len = end_loc - prev_end_loc + + if DEBUG: + print( + f"\n[LOOP] chunk_idx={chunk_idx} [begin={begin_loc} end={end_loc}] trg_len={trg_len}" + ) + + # Extract the current chunk of input tokens (keep on CPU until needed) + input_ids_chunk = input_ids[:, begin_loc:end_loc].to(device) + target_ids = input_ids_chunk.clone() + + # Mask context tokens: only predict for last trg_len tokens in chunk + mask = np.ones(target_ids.shape, dtype=bool) + mask[:, :-trg_len] = False + target_ids_masked = target_ids.clone() + target_ids_masked[~torch.from_numpy(mask)] = -100 # -100 is the ignore index + + if DEBUG: + print(f"[MASK] Mask shape: {mask.shape}") + print(f"[TARGET_IDS_MASKED] Target ids masked: {target_ids_masked}") + + # Run the model forward pass without gradient calculation + with torch.no_grad(): + if DEBUG: + print("[INFER] Running model forward pass ...") + + outputs = model(input_ids_chunk) + logits = outputs.logits + + if DEBUG: + print(f"[LOGITS] Shape: {logits.shape}, dtype: {logits.dtype}") + + # Compute log probabilities over vocabulary for each position + log_probs = torch.nn.functional.log_softmax(logits, dim=2).cpu().numpy() + chunk_seq_len = log_probs.shape[1] + + # Language models predict next token: logits[i] predicts token[i+1] + # So we need logits[:-1] to match with target_ids[1:] + if chunk_seq_len > 1: + # Get log probabilities for all positions except the last + pred_log_probs = log_probs[0, :-1, :] # predictions for positions 0 to max_length-2 + # Get the target token ids for positions 1 to max_length-1 + target_ids_shifted = ( + target_ids_masked[0, 1:].cpu().numpy() + ) # targets at positions 1 to max_length-1 + + if DEBUG: + print(f"[TARGET_IDS_SHIFTED] Target ids shifted shape: {target_ids_shifted.shape}") + print(f"[PRED_LOG_PROBS] Pred log probs shape: {pred_log_probs.shape}") + print(f"chunk_seq_len: {chunk_seq_len}") + + # Only 
include tokens with label != -100 (matching masking) + mask_flat = target_ids_shifted != -100 + valid_indices = np.arange(len(target_ids_shifted))[mask_flat] + valid_targets = target_ids_shifted[mask_flat] + + if DEBUG: + print(f"[VALID_INDICES] Valid indices shape: {valid_indices.shape}") + print(f"[VALID_TARGETS] Valid targets shape: {valid_targets.shape}") + + # Gather the log probabilities for the correct target tokens + valid_log_probs = pred_log_probs[valid_indices, valid_targets] + + if DEBUG: + print(f"[VALID_LOG_PROBS] Valid log probs shape: {valid_log_probs.shape}") + else: + valid_log_probs = np.array([]) + mask_flat = np.array([], dtype=bool) + + # Accumulate log probabilities and token count (same as ONNX) + total_log_probs += float(np.sum(valid_log_probs)) + total_token_count += int(valid_log_probs.size) + + if DEBUG: + print( + f"[LOOP] This chunk: valid tokens={valid_log_probs.size}, sum={np.sum(valid_log_probs)}" + ) + print(f"[TALLY] total_log_probs: {total_log_probs}") + print(f"[TALLY] total_token_count: {total_token_count}") + + # Clear GPU cache to prevent OOM + del ( + outputs, + logits, + log_probs, + pred_log_probs, + input_ids_chunk, + target_ids, + target_ids_masked, + ) + if device == "cuda": + torch.cuda.empty_cache() + + # Update for next chunk + prev_end_loc = end_loc + if end_loc >= max_eval_length: + if DEBUG: + print("[LOOP] Reached evaluation limit.") + break + + # Compute average log probability and perplexity (same as ONNX) + avg_log_prob = total_log_probs / total_token_count + perplexity = np.exp(-avg_log_prob) # Note the negative sign! + + if DEBUG: + print(f"[FINAL] avg_log_prob: {avg_log_prob}") + + print(f"\n[RESULT] Perplexity of {model_name_or_path}: {perplexity}") + print("[RUN] === END calculate_perplexity_hf ===\n") + time_end = time.time() + print(f"[RUN] Time taken: {time_end - time_start:.2f} seconds") + + # Cleanup: Unload model from GPU memory + print("[CLEANUP] Unloading model from GPU...") + del model, tokenizer + if device == "cuda": + torch.cuda.empty_cache() + print("[CLEANUP] Model unloaded") + + return perplexity + + +def get_wikitext2(): + """ + Load and concatenate the WikiText-2 test dataset. + + Returns: + str: Concatenated text from all samples in the WikiText-2 test split, + with samples separated by double newlines. + + Note: + Requires HuggingFace CLI authentication to access the dataset. + """ + # Load the Wikitext-2 test split using HuggingFace datasets + print("\n[INFO] Loading Wikitext-2 'test' split ...") + test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + if DEBUG: + print(f"[DATASET] Number of raw samples: {len(test)}") + for i in range(3): + print(f"[DATASET] Sample[{i}]: {repr(test[i]['text'])[:200]} ...") + # Concatenate all text samples into a single string, separated by double newlines + result = "\n\n".join(text for text in test["text"]) + if DEBUG: + print( + f"[DATASET] Concatenated text preview: {result[:512]!r} ... [total chars: {len(result)}]" + ) + return result + + +def perplexity_eval(model_dir, input_len=1024, chunk_size=None): + """ + Evaluate perplexity of an ONNX Runtime GenAI model on the WikiText-2 dataset. + + This function computes perplexity using a sliding window approach. It supports + both standard evaluation and prefill chunking for longer context lengths. + + Args: + model_dir (str): Path to the ONNX Runtime GenAI model directory. + Must contain genai_config.json and tokenizer files. + input_len (int, optional): Maximum input sequence length for evaluation. 
+ Used as context length when KV chunking is enabled. + Defaults to 1024. + chunk_size (int, optional): Prefill chunk size for prefill chunking. + If provided, overrides the chunk_size in genai_config.json. + When set, enables evaluation with longer context lengths. + Defaults to None. + + Returns: + float: Computed perplexity score. Lower values indicate better model performance. + Typical ranges: 2-20 (excellent), 20-40 (good), 40-80 (ok), 100+ (poor). + + """ + time_start = time.time() + print(f"\n[RUN] === BEGIN perplexity_eval('{model_dir}') ===") + print(f"[RUN] Loading ONNX model from: {model_dir}") + chunking_failed = False + # Load the ONNX model + # Apply chunk_size overlay if provided + config = og.Config(model_dir) + if chunk_size is not None: + search_config = {"chunk_size": int(chunk_size)} + try: + print(f"[CONFIG] Applying chunk_size overlay: {chunk_size}") + config.overlay(json.dumps({"search": search_config})) + print(f"[CONFIG] Successfully applied chunk_size: {chunk_size}") + except Exception as e: + print(f"[WARNING] Failed to apply chunk_size overlay: {e}") + chunking_failed = True + model = og.Model(config) + + if DEBUG: + print("[RUN] Creating tokenizer ...") + # Create the tokenizer for the model + tokenizer = og.Tokenizer(model) + # Load model configuration from JSON file (optional) + model_cfg_json = None + try: + with open(f"{model_dir}/genai_config.json") as file: + model_cfg_json = json.load(file) + if DEBUG: + print( + f"[CONFIG] Model config loaded: {json.dumps(model_cfg_json.get('model', {}), indent=2)}" + ) + except Exception as e: + print(f"[WARNING] Could not read genai_config.json: {e}. Falling back to defaults.") + + max_context_length = 1024 + stride = 512 + kv_chunking_enabled = False + + # Check for chunk_size - prioritize parameter over config file + effective_chunk_size = None + if chunk_size is not None and not chunking_failed: + # Use the provided chunk_size parameter (overlaid) + effective_chunk_size = int(chunk_size) + kv_chunking_enabled = True + if DEBUG: + print(f"[CONFIG] Using provided chunk_size: {effective_chunk_size}") + elif model_cfg_json and "search" in model_cfg_json and "chunk_size" in model_cfg_json["search"]: + # Use chunk_size from existing config file + effective_chunk_size = model_cfg_json["search"]["chunk_size"] + kv_chunking_enabled = True + if DEBUG: + print(f"[CONFIG] Using config file chunk_size: {effective_chunk_size}") + + if DEBUG: + print( + f"[CONFIG] Effective chunk_size: {effective_chunk_size if kv_chunking_enabled else 'disabled'}" + ) + + if kv_chunking_enabled and effective_chunk_size: + if DEBUG: + print(f"[INFO] chunk size: {effective_chunk_size}") + print(f"[INFO] input length: {input_len}") + max_context_length = int(input_len) # Use input_len when chunking is enabled + stride = effective_chunk_size + if DEBUG: + print( + f"[CONFIG] KV chunking enabled with chunk_size: {effective_chunk_size}, input_len: {input_len}" + ) + elif DEBUG: + print(f"[CONFIG] KV chunking disabled, using default stride: {stride}") + + # Set chunk and stride lengths for evaluation + model_context_len = ( + int(model_cfg_json["model"]["context_length"]) + if model_cfg_json + and "model" in model_cfg_json + and "context_length" in model_cfg_json["model"] + else max_context_length + ) + max_length = min(max_context_length, model_context_len) + if DEBUG: + print(f"[INFO] max_length for chunk: {max_length}, stride for sliding window: {stride}") + + # Load and prepare the evaluation dataset + dataset = get_wikitext2() + print("[TOKENIZER] 
Tokenizing ...") + # Tokenize the entire dataset + input_ids = tokenizer.encode_batch([dataset]) + # Handle possible dict output from tokenizer + if isinstance(input_ids, dict) and "input_ids" in input_ids: + input_ids = input_ids["input_ids"] + # Convert to numpy if needed + if hasattr(input_ids, "as_numpy"): + input_ids = input_ids.as_numpy() + if DEBUG: + print("[TOKENIZER] Used as_numpy()") + input_ids = np.array(input_ids) + if DEBUG: + print(f"[TOKENIZER] Numpy array shape: {input_ids.shape}, dtype: {input_ids.dtype}") + # Ensure input_ids is 2D (batch, seq_len) + if input_ids.ndim == 1: + input_ids = np.expand_dims(input_ids, 0) + if DEBUG: + print(f"[SHAPE] Expanded dims, now: {input_ids.shape}") + + # Convert input_ids to torch tensor + input_ids = torch.tensor(input_ids, dtype=torch.long) + if DEBUG: + print(f"[TENSOR] Torch tensor shape: {input_ids.shape}, dtype: {input_ids.dtype}") + + # Determine the sequence length to use + seq_len = int(input_ids.shape[1]) + if DEBUG: + print(f"[INFO] Full input length: {seq_len}") + + # Initialize accumulators for log probabilities and token count + total_log_probs = 0.0 + total_token_count = 0 + prev_end_loc = 0 + if kv_chunking_enabled: + assert stride == effective_chunk_size, ( + f"For chunking case, stride must equal chunk_size. " + f"Got stride={stride}, chunk_size={effective_chunk_size}" + ) + # Slide a window over the input to compute perplexity in chunks + for chunk_idx, begin_loc in enumerate(range(0, seq_len, stride)): + end_loc = min(begin_loc + max_length, seq_len) + trg_len = end_loc - prev_end_loc + if DEBUG: + print( + f"\n[LOOP] chunk_idx={chunk_idx} [begin={begin_loc} end={end_loc}] trg_len={trg_len}" + ) + + # Extract the current chunk of input tokens + input_ids_chunk = input_ids[:, begin_loc:end_loc].clone() + target_ids = input_ids_chunk.clone() + if DEBUG: + print(f"input_ids_chunk.shape: {input_ids_chunk.shape}") + # Mask context tokens: only predict for last trg_len tokens in chunk + mask = np.ones(target_ids.shape, dtype=bool) + mask[:, :-trg_len] = False + target_ids_masked = target_ids.clone() + target_ids_masked[~torch.from_numpy(mask)] = -100 # -100 is the ignore index + if DEBUG: + print(f"[MASK] Mask : {mask}") + print(f"[TARGET_IDS_MASKED] Target ids masked : {target_ids_masked}") + # Set up generator parameters for deterministic generation (no sampling) + params = og.GeneratorParams(model) + params.set_search_options( + max_length=int(input_ids_chunk.shape[1]), do_sample=False, early_stopping=False + ) + # Create generator and append input tokens + generator = og.Generator(model, params) + generator.append_tokens(input_ids_chunk.numpy()) + + # Run the model forward pass without gradient calculation + with torch.no_grad(): + if DEBUG: + print("[INFER] Running model forward pass ...") + try: + generator.generate_next_token() + except Exception as e: + print(f"[INFER] .generate_next_token() failed: {e}") + break # Fatal error + # Get logits output from the model + logits = generator.get_output("logits") + if hasattr(logits, "as_numpy"): + logits = logits.as_numpy() + if DEBUG: + print("[LOGITS] Used as_numpy()") + logits = torch.tensor(logits, dtype=torch.float32) + if DEBUG: + print(f"[LOGITS] Torch tensor shape: {logits.shape}, dtype: {logits.dtype}") + + # Compute log probabilities over vocabulary for each position + log_probs = torch.nn.functional.log_softmax(logits, dim=2).cpu().numpy() + chunk_seq_len = log_probs.shape[1] + # Language models predict next token: logits[i] predicts token[i+1] + # So we need 
logits[:-1] to match with target_ids[1:] + if chunk_seq_len > 1: + # Get log probabilities for all positions except the last + pred_log_probs = log_probs[0, :-1, :] # predictions for positions 0 to max_length-2 + # Get the target token ids for positions 1 to max_length-1 + target_ids_shifted = ( + target_ids_masked[0, 1:].cpu().numpy() + ) # targets at positions 1 to max_length-1 + if DEBUG: + print(f"[TARGET_IDS_SHIFTED] Target ids shifted shape: {target_ids_shifted.shape}") + print(f"[PRED_LOG_PROBS] Pred log probs shape: {pred_log_probs.shape}") + print(f"chunk_seq_len: {chunk_seq_len}") + + # Only include tokens with label != -100 (matching HF masking) + mask_flat = target_ids_shifted != -100 + if kv_chunking_enabled: + trg_len = min(trg_len, stride) + mask_flat = np.ones(trg_len, dtype=bool) + valid_indices = np.arange(0, trg_len - 1) + valid_targets = target_ids_shifted[-trg_len + 1 :] + else: + valid_indices = np.arange(len(target_ids_shifted))[mask_flat] + valid_targets = target_ids_shifted[mask_flat] + if DEBUG: + print(f"[VALID_INDICES] Valid indices shape: {valid_indices.shape}") + print(f"[VALID_TARGETS] Valid targets shape: {valid_targets.shape}") + # Gather the log probabilities for the correct target tokens + valid_log_probs = pred_log_probs[valid_indices, valid_targets] + if DEBUG: + print(f"[VALID_LOG_PROBS] Valid log probs shape: {valid_log_probs.shape}") + else: + valid_log_probs = np.array([]) + mask_flat = np.array([], dtype=bool) + + # Accumulate log probabilities and token count + total_log_probs += float(np.sum(valid_log_probs)) + total_token_count += int(valid_log_probs.size) + + if DEBUG: + print( + f"[LOOP] This chunk: valid tokens={valid_log_probs.size}, sum={np.sum(valid_log_probs)}" + ) + print(f"[TALLY] total_log_probs: {total_log_probs}") + print(f"[TALLY] total_token_count: {total_token_count}") + + # Update for next chunk + prev_end_loc = end_loc + if end_loc == seq_len: + if DEBUG: + print("[LOOP] Reached end of sequence.") + break + + # Compute average log probability and perplexity + avg_log_prob = total_log_probs / total_token_count + perplexity = np.exp(-avg_log_prob) + if DEBUG: + print(f"[FINAL] avg_log_prob: {avg_log_prob}") + print(f"\n[RESULT] Perplexity of {model_dir}: {perplexity}") + print("[RUN] === END perplexity_eval ===\n") + time_end = time.time() + print(f"[RUN] Time taken: {time_end - time_start:.2f} seconds") + return perplexity + + +# Example usage: +# perplexity_eval("/path/to/model_dir") +# +# To enable debug output, set DEBUG = True at the top of this file diff --git a/examples/windows/accuracy_benchmark/perplexity_metrics/requirements.txt b/examples/windows/accuracy_benchmark/perplexity_metrics/requirements.txt new file mode 100644 index 000000000..73bb392b0 --- /dev/null +++ b/examples/windows/accuracy_benchmark/perplexity_metrics/requirements.txt @@ -0,0 +1,12 @@ +# PyTorch with CUDA 12.x support (compatible with CUDA 12.1-12.9) +--extra-index-url https://download.pytorch.org/whl/cu129 +accelerate +datasets +numpy +onnxruntime-genai +pandas +sentencepiece +tokenizers>=0.14.1 + +torch>=2.0.0 +transformers>=4.36 diff --git a/examples/windows/accuracy_benchmark/perplexity_metrics/run_perplexity.py b/examples/windows/accuracy_benchmark/perplexity_metrics/run_perplexity.py new file mode 100644 index 000000000..e8b34b281 --- /dev/null +++ b/examples/windows/accuracy_benchmark/perplexity_metrics/run_perplexity.py @@ -0,0 +1,391 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import sys + +import pandas as pd + +# Ensure this directory is on sys.path for local imports +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +if SCRIPT_DIR not in sys.path: + sys.path.insert(0, SCRIPT_DIR) +from perplexity_metrics import calculate_perplexity_hf, perplexity_eval # noqa: E402 + + +def run_perplexity_on_models( + model_dirs, + output_file="perplexity_results.csv", + i="1024", + chunk_size=None, + hf_model=None, + hf_device="cuda", + hf_dtype=None, +): + """ + Run perplexity evaluation on multiple ONNX Runtime GenAI models and/or a HuggingFace model. + + This function evaluates one or more models at different input sequence lengths, + saves results to a CSV file, and prints a summary report. Each model-length + combination is evaluated independently, with errors handled gracefully. + + Args: + model_dirs (list[str]): List of model directory paths to evaluate. + Each directory must contain a valid ONNX Runtime GenAI model. + output_file (str, optional): Path for the output CSV file containing results. + Defaults to "perplexity_results.csv". + i (str or list, optional): Input sequence lengths to evaluate. Can be: + - String: comma-separated values (e.g., "1024,2048,4096") + - List/tuple: sequence of integers + - Single int: one length to evaluate + Defaults to "1024". + chunk_size (int, optional): Prefill chunk size for KV cache chunking. + Required for input lengths > 1024. + Overrides chunk_size in model config if provided. + Defaults to None. + hf_model (str, optional): HuggingFace model name or path to evaluate. + If provided, will download and evaluate this model. + Defaults to None. + hf_device (str, optional): Device to run HuggingFace model on. + Defaults to "cuda". + hf_dtype (str, optional): Data type for HuggingFace model. + Options: "float16", "bfloat16", "float32". + Defaults to None (uses model default). 
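+
+    Example:
+        A minimal sketch (the model directory below is a hypothetical placeholder):
+
+            results_df = run_perplexity_on_models(
+                ["/path/to/genai_model"],
+                output_file="perplexity_results.csv",
+                i="1024,2048",
+                chunk_size=1024,
+            )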
+ + Returns: + pd.DataFrame: DataFrame containing evaluation results with columns: + - Model Path: Full path to model directory + - Model Type: "ONNX" or "HuggingFace" + - Input Length: Sequence length used for evaluation + - Perplexity: Computed perplexity score (or "N/A" if failed) + - Status: "Success" or "Failed" + - Error: Error message if failed, "None" if successful + + """ + results = [] + + # Parse input lengths + if isinstance(i, str): + i_list = [int(x.strip()) for x in i.split(",") if x.strip()] + elif isinstance(i, (list, tuple)): + i_list = [int(x) for x in i] + else: + i_list = [int(i)] + + # Evaluate HuggingFace model if provided + if hf_model is not None: + print(f"\n{'=' * 60}") + print(f"Evaluating HuggingFace model: {hf_model}") + print(f"{'=' * 60}") + + # Convert dtype string to torch dtype + import torch + + dtype_map = { + "float16": torch.float16, + "bfloat16": torch.bfloat16, + "float32": torch.float32, + "fp16": torch.float16, + "bf16": torch.bfloat16, + "fp32": torch.float32, + } + torch_dtype = dtype_map.get(hf_dtype.lower()) if hf_dtype else torch.float16 + + for input_len in i_list: + try: + print(f" Evaluating with input length: {input_len}") + if torch_dtype: + print(f" Using dtype: {torch_dtype}") + + # Calculate stride (use chunk_size if provided, otherwise use half of input_len) + stride = chunk_size if chunk_size is not None else input_len // 2 + + if chunk_size is not None: + assert stride == chunk_size, ( + f"For chunking case, stride must equal chunk_size. " + f"Got stride={stride}, chunk_size={chunk_size}" + ) + perplexity = calculate_perplexity_hf( + model_name_or_path=hf_model, + max_length=input_len, + stride=stride, + device=hf_device, + torch_dtype=torch_dtype, + ) + + results.append( + { + "Model Path": hf_model, + "Model Type": "HuggingFace", + "Input Length": int(input_len), + "Perplexity": float(perplexity), + "Status": "Success", + "Error": "None", + } + ) + except Exception as e: # noqa: PERF203 + print(f" Error for input length {input_len}: {e!s}") + results.append( + { + "Model Path": hf_model, + "Model Type": "HuggingFace", + "Input Length": int(input_len), + "Perplexity": "N/A", + "Status": "Failed", + "Error": str(e), + } + ) + + print(" HuggingFace perplexity evaluation completed") + + # Unload HuggingFace model from GPU memory before ONNX evaluation + print("[CLEANUP] Unloading HuggingFace model from GPU memory...") + import gc + + import torch + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + gc.collect() + print("[CLEANUP] GPU memory freed") + + # Evaluate ONNX models + for model_dir in model_dirs: + print(f"\n{'=' * 60}") + print(f"Evaluating perplexity for: {model_dir}") + print(f"{'=' * 60}") + + try: + # Check if model directory exists + if not os.path.exists(model_dir): + print(f"Error: Model directory does not exist: {model_dir}") + results.append( + { + "Model Path": model_dir, + "Perplexity": "N/A", + "Status": "Directory not found", + "Error": "Directory does not exist", + } + ) + continue + + # Check if genai_config.json exists + config_path = os.path.join(model_dir, "genai_config.json") + if not os.path.exists(config_path): + print(f"Error: genai_config.json not found in: {model_dir}") + results.append( + { + "Model Path": model_dir, + "Model Type": "ONNX", + "Perplexity": "N/A", + "Status": "Invalid model format", + "Error": "genai_config.json not found", + } + ) + continue + + # For each input length, run perplexity_eval and record results + for input_len in i_list: + try: + print(f" Evaluating with 
input length: {input_len}") + if chunk_size is None: + print( + " Note: input length is ignored unless chunk_size is set or " + "config.search.chunk_size is present." + ) + if chunk_size is not None: + print(f" Using chunk_size: {chunk_size}") + perplexity = perplexity_eval(model_dir, str(input_len), chunk_size) + else: + perplexity = perplexity_eval(model_dir, str(input_len)) + results.append( + { + "Model Path": model_dir, + "Model Type": "ONNX", + "Input Length": int(input_len), + "Perplexity": float(perplexity), + "Status": "Success", + "Error": "None", + } + ) + except Exception as e: # noqa: PERF203 + print(f" Error for input length {input_len}: {e!s}") + results.append( + { + "Model Path": model_dir, + "Model Type": "ONNX", + "Input Length": int(input_len), + "Perplexity": "N/A", + "Status": "Failed", + "Error": str(e), + } + ) + + print(" Perplexity evaluation completed successfully") + + except Exception as e: + print(f"Error during perplexity evaluation: {e!s}") + results.append( + { + "Model Path": model_dir, + "Model Type": "ONNX", + "Perplexity": "N/A", + "Status": "Failed", + "Error": str(e), + } + ) + + # Create results DataFrame and save to CSV + df = pd.DataFrame(results) + df.to_csv(output_file, index=False) + + print(f"\n{'=' * 60}") + print(f"Results saved to: {output_file}") + print(f"{'=' * 60}") + + # Print summary + successful = df[df["Status"] == "Success"] + failed = df[df["Status"] != "Success"] + + print("\nSummary:") + print(f" Successful evaluations: {len(successful)}") + print(f" Failed evaluations: {len(failed)}") + + if len(successful) > 0: + print("\nPerplexity Results:") + for _, row in successful.iterrows(): + print( + f" {os.path.basename(row['Model Path'])} [i={row.get('Input Length', '?')}]: " + f"{row['Perplexity']:.4f}" + if isinstance(row["Perplexity"], (int, float)) + else row["Perplexity"] + ) + + return df + + +def main(): + """ + Command-line entry point for perplexity evaluation. + + Parses command-line arguments and runs perplexity evaluation on specified + ONNX Runtime GenAI models and/or HuggingFace models. Results are saved to a CSV file. 
+
+    Command-line Arguments:
+        --models: One or more ONNX model directory paths (optional)
+        --hf_model: HuggingFace model name or path (optional)
+        --hf_device: Device for HuggingFace model (default: "cuda")
+        --hf_dtype: Data type for HuggingFace model (default: None)
+        --i: Comma-separated input sequence lengths (default: "1024")
+        --output: Output CSV file path (default: "perplexity_results.csv")
+        --chunk_size: Prefill chunk size for KV cache chunking (optional; required for input lengths > 1024)
+
+    Examples:
+        # Evaluate ONNX models
+        $ python run_perplexity.py --models /path/to/model
+        $ python run_perplexity.py --models /path/to/model1 /path/to/model2 \\
+            --i 1024,2048,4096 --chunk_size 1024 --output results.csv
+
+        # Evaluate HuggingFace model
+        $ python run_perplexity.py --hf_model meta-llama/Llama-2-7b-hf --i 1024
+        $ python run_perplexity.py --hf_model meta-llama/Llama-2-7b-hf \\
+            --hf_dtype float16 --hf_device cuda --i 1024,2048
+
+        # Evaluate both ONNX and HuggingFace models
+        $ python run_perplexity.py --models /path/to/onnx_model \\
+            --hf_model meta-llama/Llama-2-7b-hf --i 1024
+    """
+    parser = argparse.ArgumentParser(
+        description="Run perplexity evaluation on ONNX Runtime GenAI and/or HuggingFace models"
+    )
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        default=[],
+        help="List of ONNX model directory paths to evaluate (optional)",
+    )
+    parser.add_argument(
+        "--i",
+        default="1024",
+        help="Comma-separated input sequence lengths to evaluate (e.g. 1024,2048); each value should be >= 1024",
+    )
+    parser.add_argument(
+        "--output",
+        default="perplexity_results.csv",
+        help="Output CSV file name (default: perplexity_results.csv)",
+    )
+    parser.add_argument(
+        "--chunk_size",
+        type=int,
+        default=None,
+        help="Prefill chunk size for KV cache chunking (optional; required for input lengths > 1024)",
+    )
+    parser.add_argument(
+        "--hf_model",
+        type=str,
+        default=None,
+        help="HuggingFace model name or path to evaluate (e.g., 'meta-llama/Llama-2-7b-hf')",
+    )
+    parser.add_argument(
+        "--hf_device",
+        type=str,
+        default="cuda",
+        help="Device to run HuggingFace model on (default: 'cuda')",
+    )
+    parser.add_argument(
+        "--hf_dtype",
+        type=str,
+        default=None,
+        choices=["float16", "bfloat16", "float32", "fp16", "bf16", "fp32"],
+        help="Data type for HuggingFace model (default: None; float16 is used when not specified)",
+    )
+
+    args = parser.parse_args()
+
+    # Validate that at least one model source is provided
+    if not args.models and not args.hf_model:
+        print("Error: You must provide either --models or --hf_model (or both)")
+        parser.print_help()
+        return
+
+    # Validate that all model directories exist
+    valid_models = []
+    for model_dir in args.models:
+        if os.path.exists(model_dir):
+            valid_models.append(model_dir)
+        else:
+            print(f"Warning: Model directory does not exist: {model_dir}")
+
+    # Count total models to evaluate
+    total_models = len(valid_models) + (1 if args.hf_model else 0)
+
+    print(f"Running perplexity evaluation on {total_models} model(s)...")
+    if args.chunk_size is not None:
+        print(f"Using chunk_size: {args.chunk_size}")
+
+    run_perplexity_on_models(
+        valid_models,
+        args.output,
+        args.i,
+        args.chunk_size,
+        args.hf_model,
+        args.hf_device,
+        args.hf_dtype,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/modelopt/onnx/quantization/graph_utils.py b/modelopt/onnx/quantization/graph_utils.py
index ce252bc8f..31ea42764 100755
--- a/modelopt/onnx/quantization/graph_utils.py
+++ b/modelopt/onnx/quantization/graph_utils.py
@@ -866,7 +866,7 @@ def get_layer_info(
     layers_8bit = kwargs.get("layers_8bit")
     gather_block_size = 
kwargs.get("gather_block_size", DEFAULT_GATHER_BLOCK_SIZE) gather_quantize_axis = kwargs.get("gather_quantize_axis", DEFAULT_GATHER_QUANTIZE_AXIS) - if enable_mixed_quant: + if enable_mixed_quant or layers_8bit: layer_info = get_layer_precision_mapping( onnx_model, layers_8bit,