Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 89 additions & 77 deletions src/llmcompressor/modifiers/utils/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,104 +7,116 @@
strategies like NVFP4.
"""

import logging

import torch
from compressed_tensors.offload import align_modules, update_offload_parameter
from compressed_tensors.quantization import QuantizationStrategy, is_attention_module
from torch.nn import Linear, Module
from torch.nn import Module

__all__ = ["update_fused_layer_weight_global_scales"]

logger = logging.getLogger(__name__)

def update_fused_layer_weight_global_scales(submodule: torch.nn.Module):
"""
When running NVFP4 quantization, update the global scale
such that q,k,v layers are treated as one tensor with the same
global_scale and gate_proj/up_proj layers are treated as one tensor
with the same global scale. This is a requirement currently being set
by vLLM and may be removed in the future OR potentially made
an optional step.
# Fused attention projection groups.
# Each entry is a list of attribute names that should share the same global scale.
# The first matching group is used; order matters.
_ATTENTION_FUSED_GROUPS: list[list[str]] = [
# Already-fused QKV (e.g. GPT-NeoX, Falcon)
["qkv_proj"],
# Standard separate Q/K/V projections (Llama, Mistral, Qwen, etc.)
["q_proj", "k_proj", "v_proj"],
# DeepSeek V2/V3 MLA: compressed Q + compressed KV
["q_a_proj", "kv_a_proj_with_mqa"],
# DeepSeek V2/V3 MLA: decompressed Q + decompressed KV
["q_b_proj", "kv_b_proj"],
Comment on lines +24 to +32
Copy link

Copilot AI Mar 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_ATTENTION_FUSED_GROUPS doesn't account for the DeepSeek variant where the compressed Q projection is named q_proj (repo already notes some DeepSeek models use q_proj instead of q_a_proj). In that case an attention module with q_proj + kv_a_proj_with_mqa will match no group and scales won't be fused. Consider adding an additional group for ["q_proj", "kv_a_proj_with_mqa"] (ordered before the standard [q_proj,k_proj,v_proj] group, since it won't fully match anyway) or otherwise handling this alias explicitly.

Copilot uses AI. Check for mistakes.
]

:param model: model to quantize

def _fuse_global_scales(layers: list[Module]):
    """
    Set the ``weight_global_scale`` parameter of every module in *layers*
    to the single minimum value found across all of them.
    """
    with align_modules(layers):
        # Collect each module's global scale, then take the smallest value
        # so every fused projection shares one (conservative) scale.
        all_scales = [module.weight_global_scale.data for module in layers]
        fused_scale = torch.min(torch.cat(all_scales)).reshape([1])

        for module in layers:
            update_offload_parameter(module, "weight_global_scale", fused_scale)

def _valid_tensor_group_quant(layer_list: list[Linear]):
"""
Return True if all the linear layers in the layer_list are
TENSOR_GROUP quantized.
"""
for layer in layer_list:
scheme = getattr(layer, "quantization_scheme", None)
if scheme is None:
return False
del global_scale

weight_quant_args = scheme.weights

if weight_quant_args is None:
return False
def _valid_tensor_group_quant(layer_list: list[Module]) -> bool:
"""
Return True if all the modules in *layer_list* are
TENSOR_GROUP quantized (i.e. they carry an NVFP4-style global scale).
"""
for layer in layer_list:
scheme = getattr(layer, "quantization_scheme", None)
if scheme is None:
return False

if weight_quant_args.strategy != QuantizationStrategy.TENSOR_GROUP:
return False
return True
weight_quant_args = scheme.weights
if weight_quant_args is None:
return False

if is_attention_module(submodule):
# already fused/treated as one layer
if hasattr(submodule, "qkv_proj"):
return
if weight_quant_args.strategy != QuantizationStrategy.TENSOR_GROUP:
return False
return True

# not traditional attention (TODO: MLA)
if not (
hasattr(submodule, "q_proj")
and hasattr(submodule, "k_proj")
and hasattr(submodule, "v_proj")
):
return

if not _valid_tensor_group_quant(
[submodule.q_proj, submodule.k_proj, submodule.v_proj]
):
return
def update_fused_layer_weight_global_scales(submodule: torch.nn.Module):
    """
    When running NVFP4 quantization, update the global scale
    such that fused projection layers are treated as one tensor with the same
    global_scale. Specifically:

    * **Attention**: q/k/v projections (or MLA-style compressed projections
      like ``q_a_proj``/``kv_a_proj_with_mqa``) share one global scale.
    * **MLP**: gate_proj and up_proj share one global scale.

    This is a requirement currently being set by vLLM and may be removed in
    the future OR potentially made an optional step.

    :param submodule: a single sub-module of the model (attention or MLP block)
    """

    def _is_mlp_module(module: Module) -> bool:
        # Heuristic: the class name mentions "mlp" and the fused pair exists.
        return "mlp" in module.__class__.__name__.lower() and (
            hasattr(module, "gate_proj") and hasattr(module, "up_proj")
        )

    # --- Attention fused groups ---
    if is_attention_module(submodule):
        # Fuse EVERY complete group, not just the first match: DeepSeek MLA
        # modules can expose both the compressed (q_a_proj/kv_a_proj_with_mqa)
        # and decompressed (q_b_proj/kv_b_proj) pairs, and this helper is only
        # invoked once per module during model traversal, so stopping at the
        # first match would leave the second pair with inconsistent scales.
        for group in _ATTENTION_FUSED_GROUPS:
            layers = [
                getattr(submodule, name)
                for name in group
                if hasattr(submodule, name)
            ]
            # Only fuse when ALL names in the group are present
            if len(layers) != len(group):
                continue

            # Single-projection groups (e.g. qkv_proj) are already fused;
            # skip them but keep scanning remaining groups
            if len(layers) <= 1:
                continue

            # A matched group that isn't TENSOR_GROUP quantized is skipped,
            # but later groups on the same module may still qualify
            if not _valid_tensor_group_quant(layers):
                continue

            _fuse_global_scales(layers)
        return

    # --- MLP fused groups ---
    if _is_mlp_module(submodule):
        gate_proj = submodule.gate_proj
        up_proj = submodule.up_proj

        if not _valid_tensor_group_quant([gate_proj, up_proj]):
            return

        _fuse_global_scales([gate_proj, up_proj])
Loading
Loading