Enable Nemotron nano vlm v1&v2 nvfp4 PTQ workflow #347
Changes from 25 commits
**examples/llm_ptq/example_utils.py**

@@ -39,6 +39,30 @@

```python
SPECULATIVE_MODEL_LIST = ["Eagle", "Medusa"]


def _is_multimodal_config(config):
    """Check if a config indicates a multimodal model (config-only version of is_multimodal_model)."""
    return (
        hasattr(config, "vision_config")  # Standard vision config (e.g., Qwen2.5-VL)
        or getattr(config, "model_type", "") == "phi4mm"  # Phi-4 multimodal
        or hasattr(config, "vision_lora")  # Vision LoRA configurations
        or hasattr(config, "audio_processor")  # Audio processing capabilities
        or (
            hasattr(config, "embd_layer") and hasattr(config.embd_layer, "image_embd_layer")
        )  # Image embedding layers
    )


def is_nemotron_vl_model(model):
    """Check if model is a Nemotron VL model based on config architectures."""
    from modelopt.torch.export.model_utils import is_multimodal_model

    if not is_multimodal_model(model):
        return False

    architectures = getattr(model.config, "architectures", [])
    return any("nemotron" in arch.lower() for arch in architectures)


def build_quant_cfg(
    qformat,
    kv_cache_qformat,
```
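Since `_is_multimodal_config` only duck-types the config object, it can be exercised without downloading any model. A minimal, self-contained sketch follows; the predicate is restated from the diff above, and the two configs are throwaway `SimpleNamespace` stand-ins rather than real Hugging Face configs.

```python
from types import SimpleNamespace


def _is_multimodal_config(config):
    # Restated from the diff above so the demo runs standalone.
    return (
        hasattr(config, "vision_config")
        or getattr(config, "model_type", "") == "phi4mm"
        or hasattr(config, "vision_lora")
        or hasattr(config, "audio_processor")
        or (hasattr(config, "embd_layer") and hasattr(config.embd_layer, "image_embd_layer"))
    )


vl_like = SimpleNamespace(model_type="qwen2_5_vl", vision_config=SimpleNamespace())
text_only = SimpleNamespace(model_type="llama")

print(_is_multimodal_config(vl_like))    # True  -> get_model() will drop the automatic device_map
print(_is_multimodal_config(text_only))  # False -> normal device_map handling
```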
@@ -185,7 +209,21 @@ def get_model(

```python
    if device == "cpu":
        device_map = "cpu"

    # Prepare config kwargs for loading
    config_kwargs = {"trust_remote_code": trust_remote_code} if trust_remote_code else {}

    # Load config once and handle VL model detection
    try:
        hf_config = AutoConfig.from_pretrained(ckpt_path, **config_kwargs)
        if _is_multimodal_config(hf_config):
            print(
                "Detected vision-language model from config. "
                "Disabling automatic device mapping to avoid device_map errors."
            )
            device_map = None
    except Exception as e:
        print(f"Error: Could not load config from {ckpt_path}: {e}")
        raise RuntimeError(f"Failed to load model configuration from {ckpt_path}") from e

    if attn_implementation is not None:
        config_kwargs["attn_implementation"] = attn_implementation
```
@@ -207,11 +245,6 @@ def get_model(

```python
        )
        model = hf_vila.llm
    else:
        hf_config = AutoConfig.from_pretrained(
            ckpt_path,
            **config_kwargs,
        )

        if use_seq_device_map:
            device_map = "sequential"
            # If we use sequential, set max_memory limit to ensure that the model does not occupy the full GPU
```
@@ -282,6 +315,12 @@ def get_model(

```python
        **model_kwargs,
    )
    model.eval()

    # If device_map was disabled (None), manually move model to target device
    if device_map is None and device != "cpu":
```
Collaborator: What if `device == "cpu"`?

Contributor (author): That was handled by HF's `device_map="cpu"` in L210.
```python
        print(f"Moving model to {device} device...")
        model = model.to(device)

    if device == "cuda" and not is_model_on_gpu(model):
        print("Warning: Some parameters are not on a GPU. Calibration can be slow or hit OOM")
```
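Read together with the reviewer exchange above, the device-placement logic in `get_model` now follows three paths. The sketch below is an illustrative condensation, not code from the PR; in particular the `"auto"` value in the final branch is an assumption about the default non-VLM path.

```python
def resolve_device_placement(device: str, is_vlm: bool) -> dict:
    """Summarize the device_map decision described in the diff (illustrative only)."""
    if device == "cpu":
        # CPU runs rely on Hugging Face's device_map="cpu"; no manual .to() afterwards.
        return {"device_map": "cpu", "move_after_load": False}
    if is_vlm:
        # VL models skip automatic device mapping and are moved explicitly after loading.
        return {"device_map": None, "move_after_load": True}
    # Default path: let from_pretrained place the model (assumed to be device_map="auto").
    return {"device_map": "auto", "move_after_load": False}


print(resolve_device_placement("cuda", is_vlm=True))  # {'device_map': None, 'move_after_load': True}
print(resolve_device_placement("cpu", is_vlm=True))   # {'device_map': 'cpu', 'move_after_load': False}
```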
**examples/llm_ptq/hf_ptq.py**

@@ -30,6 +30,7 @@

```python
    get_processor,
    get_tokenizer,
    is_enc_dec,
    is_nemotron_vl_model,
)
from transformers import (
    AutoConfig,
```
@@ -39,6 +40,7 @@

```python
    PreTrainedTokenizerFast,
    WhisperProcessor,
)
from vlm_utils import run_text_only_generation, run_vl_preview_generation

import modelopt.torch.opt as mto
import modelopt.torch.quantization as mtq
```
@@ -48,7 +50,7 @@

```python
    export_tensorrt_llm_checkpoint,
    get_model_type,
)
from modelopt.torch.export.model_utils import is_multimodal_model
from modelopt.torch.export.model_utils import get_language_model_from_vl, is_multimodal_model
from modelopt.torch.quantization.config import need_calibration
from modelopt.torch.quantization.plugins.accelerate import init_quantized_weights
from modelopt.torch.quantization.utils import is_quantized
```
@@ -283,6 +285,9 @@ def main(args):

```python
    full_model = model

    # Detect if this is a Nemotron VL model using architecture-based detection
    is_nemotron_vl = is_nemotron_vl_model(full_model)

    if model_type == "mllama":
        processor = get_processor(
            args.pyt_ckpt_path,
```
@@ -312,15 +317,8 @@ def main(args):

```python
        tokenizer.padding_side = "left"

    # We only quantize the language model for VLMs other than the type supported above.
    if hasattr(model, "language_model"):
        parent_model = model  # llama4 case
        if isinstance(type(model).__dict__.get("language_model"), property):
            assert hasattr(model, "model") and hasattr(model.model, "language_model"), (
                "Expected language_model in model.model, but attribute not found. "
                "This may indicate an unsupported model structure."
            )
            parent_model = model.model  # gemma3, qwen2.5 VL case

    language_model, parent_model = get_language_model_from_vl(model)
    if language_model is not None:
        disabled_quant_cfg = {
            "quant_cfg": {"default": {"enable": False}},
            "algorithm": "max",
```
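The removed inline logic above is what the new `get_language_model_from_vl` helper centralizes. The actual helper lives in `modelopt.torch.export.model_utils` and may differ in detail, so the sketch below only approximates the contract implied by the old code: return the language model together with the module that owns the `language_model` attribute, or `(None, None)` if the layout is not recognized.

```python
def get_language_model_from_vl_sketch(model):
    """Approximation of the removed inline logic (illustrative; not the real helper)."""
    if not hasattr(model, "language_model"):
        return None, None
    if isinstance(type(model).__dict__.get("language_model"), property):
        # gemma3 / qwen2.5-VL style: the property proxies model.model.language_model.
        if hasattr(model, "model") and hasattr(model.model, "language_model"):
            return model.model.language_model, model.model
        return None, None
    # llama4 style: language_model is a plain attribute on the top-level model.
    return model.language_model, model
```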
@@ -331,7 +329,7 @@ def main(args):

```python
            if name != "language_model":
                mtq.quantize(child, disabled_quant_cfg, forward_loop=None)

        model = model.language_model
        model = language_model
        model_type = get_model_type(model)

    if model_type == "phi4mm":
```
@@ -458,34 +456,108 @@ def main(args):

```python
            KV_QUANT_CFG_CHOICES,
        )

        # For Nemotron VL models, disable quantization of vision components
        if is_nemotron_vl:
            print("Disabling quantization for vision components in Nemotron VL model")
            quant_cfg["quant_cfg"]["*vision*"] = {"enable": False}
            quant_cfg["quant_cfg"]["*image*"] = {"enable": False}
            # Also disable radio model components specifically
            quant_cfg["quant_cfg"]["*radio*"] = {"enable": False}
            quant_cfg["quant_cfg"]["*visual*"] = {"enable": False}
```
Contributor comment on lines 459 to 467:
**Bug: mutating quant_cfg when auto_quantize_bits is set (quant_cfg={})**

build_quant_cfg returns {} for auto-quant mode, so writing quant_cfg["quant_cfg"][...] raises KeyError and vision modules aren't disabled. In auto-quant, disable modules via disabled_layers. Apply this diff to cover both code paths:

```diff
-    if is_nemotron_vl:
-        print("Disabling quantization for vision components in Nemotron VL model")
-        quant_cfg["quant_cfg"]["*vision*"] = {"enable": False}
-        quant_cfg["quant_cfg"]["*image*"] = {"enable": False}
-        # Also disable radio model components specifically
-        quant_cfg["quant_cfg"]["*radio*"] = {"enable": False}
-        quant_cfg["quant_cfg"]["*visual*"] = {"enable": False}
+    if is_nemotron_vl:
+        print("Disabling quantization for vision components in Nemotron VL model")
+        nemotron_disabled = {
+            "*vision*": {"enable": False},
+            "*image*": {"enable": False},
+            "*radio*": {"enable": False},
+            "*visual*": {"enable": False},
+        }
+        if not args.auto_quantize_bits:
+            quant_cfg.setdefault("quant_cfg", {}).update(nemotron_disabled)
+        else:
+            # Pass patterns to auto_quant to skip these modules during search
+            extra_disabled_patterns = list(nemotron_disabled.keys())
```

And extend auto_quantize() to accept these patterns:

```diff
 def auto_quantize(
-    model, qformat, auto_quantize_bits, calib_dataloader, calibrate_loop, batch_size=1
+    model, qformat, auto_quantize_bits, calib_dataloader, calibrate_loop, batch_size=1,
+    extra_disabled_layers=None,
 ):
@@
-        disabled_layers=["*lm_head*"],
+        disabled_layers=(["*lm_head*"] + (extra_disabled_layers or [])),
```

Finally, pass extra_disabled_patterns when calling auto_quantize():

```diff
-        model = auto_quantize(
+        model = auto_quantize(
             model,
             args.qformat,
             args.auto_quantize_bits,
             calib_dataloader,
             calibrate_loop,
-            args.batch_size,
+            args.batch_size,
+            extra_disabled_layers=(extra_disabled_patterns if is_nemotron_vl else None),
         )
```

Please run a short auto-quant session on a Nemotron VL to confirm vision blocks remain unquantized.

I'll verify the concerns in this review comment by examining the codebase structure and the specific functions involved.

🏁 Script executed:

```bash
#!/bin/bash
# Find and examine the relevant function definitions and code
echo "=== Checking file structure and examining hf_ptq.py ==="
fd "hf_ptq.py" --type f
echo -e "\n=== Examining lines 460-480 to see context ==="
sed -n '460,480p' examples/llm_ptq/hf_ptq.py
echo -e "\n=== Finding build_quant_cfg function ==="
rg -n "def build_quant_cfg" --type py
echo -e "\n=== Finding auto_quantize function signature ==="
rg -n "def auto_quantize" --type py -A 5
echo -e "\n=== Finding calls to auto_quantize ==="
rg -n "auto_quantize\(" --type py -B 2 -A 5
echo -e "\n=== Searching for args.auto_quantize_bits usage ==="
rg -n "auto_quantize_bits" examples/llm_ptq/hf_ptq.py --type py -B 2 -A 2
```

Length of output: 19066

🏁 Script executed:

```bash
#!/bin/bash
# Examine the key functions and usage patterns
echo "=== Lines 460-480 in hf_ptq.py ==="
sed -n '460,480p' examples/llm_ptq/hf_ptq.py
echo -e "\n=== Finding build_quant_cfg function definition ==="
rg -n "def build_quant_cfg" --type py -A 20
echo -e "\n=== Finding auto_quantize function definition ==="
rg -n "def auto_quantize" --type py -A 10
echo -e "\n=== Finding calls to auto_quantize ==="
rg -n "model = auto_quantize" --type py -B 3 -A 8
echo -e "\n=== Checking quant_cfg initialization and conditional branches ==="
rg -n "if.*auto_quantize_bits" examples/llm_ptq/hf_ptq.py --type py -B 3 -A 8
```

Length of output: 10706

🏁 Script executed:

```bash
#!/bin/bash
# Get complete build_quant_cfg function
echo "=== Full build_quant_cfg function ==="
sed -n '55,100p' examples/llm_ptq/example_utils.py
echo -e "\n=== Check control flow around Nemotron VL code (lines 450-480) ==="
sed -n '450,480p' examples/llm_ptq/hf_ptq.py
echo -e "\n=== Full auto_quantize function signature in hf_ptq.py ==="
sed -n '96,120p' examples/llm_ptq/hf_ptq.py
echo -e "\n=== Check disabled_layers usage in auto_quantize ==="
rg -n "disabled_layers" examples/llm_ptq/hf_ptq.py -B 2 -A 2
```

Length of output: 4324

**Bug confirmed: KeyError when accessing quant_cfg["quant_cfg"] in auto-quantize mode**

The issue is real. When `args.auto_quantize_bits` is set, `build_quant_cfg` returns an empty dict, so `quant_cfg["quant_cfg"][...]` raises KeyError and the vision modules are never disabled. The proposed diff correctly addresses this by:

- using `quant_cfg.setdefault("quant_cfg", {}).update(...)` in the normal quantization path, and
- routing the patterns through `disabled_layers` via a new `extra_disabled_layers` argument in the auto-quantize path.
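The failure mode and the suggested remedy are easy to reproduce in isolation. The snippet below is a self-contained demo with plain dicts standing in for what `build_quant_cfg` returns; no ModelOpt imports are involved.

```python
# In auto-quantize mode, build_quant_cfg effectively returns an empty dict.
quant_cfg = {}

try:
    quant_cfg["quant_cfg"]["*vision*"] = {"enable": False}  # the pattern used in the PR as written
except KeyError as err:
    print(f"KeyError, so the vision modules silently stay quantizable: {err}")

# The setdefault(...).update(...) form suggested above works whether or not the key exists yet.
nemotron_disabled = {p: {"enable": False} for p in ("*vision*", "*image*", "*radio*", "*visual*")}
quant_cfg.setdefault("quant_cfg", {}).update(nemotron_disabled)
print(sorted(quant_cfg["quant_cfg"]))  # ['*image*', '*radio*', '*vision*', '*visual*']
```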
```python
    if not model_is_already_quantized or calibration_only:
        # Only run single sample for preview
        input_ids = next(iter(calib_dataloader))[
            "input_features" if model_type == "whisper" else "input_ids"
        ][0:1]
        try:
            generated_ids_before_ptq = full_model.generate(input_ids, max_new_tokens=100)
        except Exception as e:
            print(
                "Error during model generation. Please check if your transformers version is "
                "compatible with the model."
            )

            # Generate preview before quantization
            if is_nemotron_vl and tokenizer is not None:
                print("Running text-only preview generation for Nemotron VL model...")

                question = tokenizer.decode(input_ids[0], skip_special_tokens=True)
                generation_config = {
                    "max_new_tokens": 100,
                    "do_sample": False,
                    "eos_token_id": tokenizer.eos_token_id,
                }

                # Try text-only generation first, fall back to standard generate
                text_response = run_text_only_generation(
                    full_model, tokenizer, question, generation_config, args.pyt_ckpt_path
                )

                if text_response is not None:
                    generated_ids_before_ptq = text_response
                    print(f"✅ Text-only generation successful: {text_response[:100]}...")
                else:
                    print("Text-only generation failed, falling back to standard generate...")
                    generated_ids_before_ptq = full_model.generate(input_ids, max_new_tokens=100)

                # Run additional VL test with images
                print("Running additional VL test with images...")
                run_vl_preview_generation(
                    full_model, tokenizer, args.pyt_ckpt_path, "before quantization (VL test)"
                )
            print(f"Error details: {e}")
            raise
        else:
            # Standard generation for non-Nemotron VL models
            generated_ids_before_ptq = full_model.generate(input_ids, max_new_tokens=100)
        if model_type == "gptoss" and args.qformat == "nvfp4_mlp_only":
            print("Applying nvfp4 quantization (MoE only) for gpt-oss")

        # quantize the model
        model = quantize_model(model, quant_cfg, args, calib_dataloader, calibration_only)

        # For VL models, update full_model to use the quantized language model
        if is_nemotron_vl:
            _, parent_model = get_language_model_from_vl(full_model)
            if parent_model is not None:
                print("Updating full_model with quantized language_model...")
                parent_model.language_model = model

        if args.verbose:
            mtq.print_quant_summary(model)

        # Run some samples
        torch.cuda.empty_cache()
        generated_ids_after_ptq = None
        if model_type != "llama4":
        if model_type != "llama4" and not is_nemotron_vl:
            # Our fake quantizer may not be fully compatible with torch.compile.
            generated_ids_after_ptq = full_model.generate(input_ids, max_new_tokens=100)
        elif is_nemotron_vl:
            print("Running text-only preview generation for quantized Nemotron VL model...")
            try:
                # Try text-only generation using helper function that supports both v1 and v2
                if tokenizer is None:
                    raise ValueError("Tokenizer is required for Nemotron VL text generation")

                question = tokenizer.decode(input_ids[0], skip_special_tokens=True)
                generation_config = {
                    "max_new_tokens": 100,
                    "do_sample": False,
                    "eos_token_id": tokenizer.eos_token_id,
                }

                # Use helper function that supports both v1 and v2 models
                text_response = run_text_only_generation(
                    full_model, tokenizer, question, generation_config, args.pyt_ckpt_path
                )

                if text_response is not None:
                    generated_ids_after_ptq = text_response  # Store text response
                    print(f"✅ Text-only generation successful: {text_response[:100]}...")
                else:
                    generated_ids_after_ptq = None

            except Exception as e:
                print(f"Text-only generation failed: {e}")
                generated_ids_after_ptq = None

            # Run additional VL test with images
            print("Running additional VL test with images...")
            run_vl_preview_generation(
                full_model, tokenizer, args.pyt_ckpt_path, "after quantization (VL test)"
            )

        else:
            warnings.warn(
                "Llama4 Maverick generation after quantization has a bug. Skipping generation sample."
```
@@ -518,15 +590,25 @@ def output_decode(generated_ids, input_shape):

```python
    if generated_ids_after_ptq is not None:
        print("--------")
        print(f"example test input: {input_decode(input_ids)}")
        print("--------")
        print(
            f"example outputs before ptq: {output_decode(generated_ids_before_ptq, input_ids.shape[1])}"
        )
        print("--------")
        print(
            f"example outputs after ptq: {output_decode(generated_ids_after_ptq, input_ids.shape[1])}"
        )
        if is_nemotron_vl:
            # For Nemotron VL models, generated_ids are text strings from model.chat()
            print("Nemotron VL model text-only generation results:")
            print(f"Text response before quantization: {generated_ids_before_ptq}")
            print("--------")
            print(f"Text response after quantization: {generated_ids_after_ptq}")
            print("--------")
            print("Note: Additional VL tests with images were run separately above")
        else:
            # For regular LLMs, generated_ids are token tensors that need decoding
            print(f"example test input: {input_decode(input_ids)}")
            print("--------")
            print(
                f"example outputs before ptq: {output_decode(generated_ids_before_ptq, input_ids.shape[1])}"
            )
            print("--------")
            print(
                f"example outputs after ptq: {output_decode(generated_ids_after_ptq, input_ids.shape[1])}"
            )
    else:
        warnings.warn("Skipping quantization: model is already quantized.")
```
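Because the preview results are now either plain strings (Nemotron VL text responses) or token tensors (regular LLMs), the printing logic branches on `is_nemotron_vl`. Purely as an illustration of the same idea, the branch could also be keyed on the value's type; the helper name and signature below are invented for the example and are not part of the PR.

```python
def render_preview(generated, tokenizer, prompt_len):
    """Return printable text for either kind of preview result (illustrative only)."""
    if isinstance(generated, str):
        # Nemotron VL path: run_text_only_generation already returned decoded text.
        return generated
    # Regular LLM path: strip the prompt tokens and decode only the continuation.
    return tokenizer.batch_decode(generated[:, prompt_len:], skip_special_tokens=True)[0]
```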
@@ -548,9 +630,12 @@ def output_decode(generated_ids, input_shape):

```python
        # Save original model config and the processor config to the export path for VLMs.
        print(f"Saving original model config to {export_path}")

        AutoConfig.from_pretrained(
            args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code
        ).save_pretrained(export_path)
        config_kwargs = {"trust_remote_code": args.trust_remote_code}
        if args.attn_implementation is not None:
            config_kwargs["attn_implementation"] = args.attn_implementation
        AutoConfig.from_pretrained(args.pyt_ckpt_path, **config_kwargs).save_pretrained(
            export_path
        )

        # Try to save processor config if available
        try:
```
@@ -748,7 +833,7 @@ def output_decode(generated_ids, input_shape):

```python
    parser.add_argument(
        "--attn_implementation",
        help=(
            "Specify the attention implementation to use."
            "Specify the attention implementation to use. "
            "This arg will be passed to the HF model loading if specified."
        ),
        default=None,
```
Review comment: Forward-dated release entry
0.39 (2025-11-07) is in the future (today is 2025-10-23). Please mark this as Unreleased/TBD to avoid confusion until the release is cut.