Skip to content

Commit 6e7a4f5

Browse files
author
Avishek Goswami
committed
Calibration: move fused_modules to modeling, rescale s when fusing g', add parallel path

- Move fused_modules.py to modeling/ and update imports
- In update_fused_layer_weight_global_scales, rescale weight_scale s' = s*g'/g when applying fused global scale so q unchanged
- Add calibrate_weights(..., parallel=True, max_workers=N) for two-phase parallel weight calibration

Signed-off-by: Avishek Goswami <avishek.goswami@ibm.com>
1 parent 480294d commit 6e7a4f5

File tree

4 files changed

+139
-48
lines changed

4 files changed

+139
-48
lines changed
File renamed without changes.

src/llmcompressor/modifiers/quantization/calibration.py

Lines changed: 112 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
1-
from typing import Any, Iterable, Optional, Set, Tuple
1+
import threading
2+
from concurrent.futures import ThreadPoolExecutor
3+
from typing import Any, Iterable, Optional, Tuple
24

35
import torch
6+
import tqdm
47
from compressed_tensors.quantization import (
58
DynamicType,
69
QuantizationArgs,
@@ -11,6 +14,7 @@
1114
from compressed_tensors.utils import (
1215
align_module_device,
1316
getattr_chain,
17+
match_named_modules,
1418
update_offload_parameter,
1519
)
1620
from loguru import logger
@@ -136,6 +140,39 @@ def update_weight_global_scale(module: Module):
136140
)
137141

138142

143+
def _update_weight_calibration_once(module: Module, update_zp_scale: bool) -> None:
144+
"""
145+
Onload weight once and run both global scale (gparam) and scale/zp (qparams).
146+
Used in sequential DFS to avoid double onload for NVFP4.
147+
"""
148+
if getattr_chain(module, "quantization_scheme.weights", None) is None:
149+
return
150+
need_gparam = (
151+
getattr_chain(module, "quantization_scheme.weights.strategy", None)
152+
== QuantizationStrategy.TENSOR_GROUP
153+
)
154+
need_qparams = update_zp_scale
155+
if not need_gparam and not need_qparams:
156+
return
157+
if (
158+
need_qparams
159+
and getattr(module, "quantization_status", None)
160+
!= QuantizationStatus.CALIBRATION
161+
):
162+
logger.warning(
163+
"Attempting to calibrate weights of a module not in calibration mode"
164+
)
165+
with align_module_device(module):
166+
value = module.weight
167+
call_observer(
168+
module,
169+
base_name="weight",
170+
value=value,
171+
should_calculate_gparam=need_gparam,
172+
should_calculate_qparams=need_qparams,
173+
)
174+
175+
139176
def update_weight_zp_scale(module: Module):
140177
"""
141178
marks a layer as ready for calibration which activates observers
@@ -162,11 +199,13 @@ def calibrate_weights(
162199
model: Module,
163200
*,
164201
named_modules: Optional[Iterable[Tuple[str, Module]]] = None,
165-
targets: Optional[Set[str]] = None,
166-
ignore: Optional[Iterable[str]] = None,
202+
targets: Iterable[str] = (),
203+
ignore: Iterable[str] = (),
167204
update_zp_scale: bool = True,
168205
desc: Optional[str] = "Calibrating weights",
169206
show_progress: bool = True,
207+
parallel: bool = False,
208+
max_workers: Optional[int] = None,
170209
) -> None:
171210
"""
172211
Traverse the model once (DFS) and run weight calibration: global scales for
@@ -175,72 +214,103 @@ def calibrate_weights(
175214
model.modules() for better cache locality and fewer CPU–GPU onloads when
176215
using offloading.
177216
178-
Order of operations per module:
179-
1. Pre-order: update_weight_global_scale for target (quantizable) modules.
217+
Order of operations (default, parallel=False):
218+
1. Pre-order: one weight onload per target module; run both global scale
219+
(gparam) and scale/zp (qparams) via _update_weight_calibration_once.
180220
2. Post-order: update_fused_layer_weight_global_scales for every module
181-
(no-op except for Attention/MLP containers); then update_weight_zp_scale
182-
for target modules if update_zp_scale is True.
221+
(no-op except for Attention/MLP containers). No second onload for targets.
222+
223+
When parallel=True (parallel weight calibration):
224+
1. Phase 1: For each target module, run update_weight_global_scale then
225+
update_weight_zp_scale (if update_zp_scale). Order is independent so
226+
phase 1 can be parallelized (e.g. with max_workers).
227+
2. Phase 2: Traverse model and run update_fused_layer_weight_global_scales
228+
on every module. Fused global scale g' is applied and per-tensor scale
229+
is rescaled s' = s * (g' / g) so that q = x/(s'*g') = x/(s*g) is unchanged.
183230
184231
:param model: Root module to traverse (e.g. state.model).
185232
:param named_modules: Optional list of (name, module) for target modules.
186233
If provided, only these modules get global_scale and zp_scale; enables
187234
DDP by passing this rank's subset (see #2220). If None, targets and
188-
ignore must be provided and match_named_modules(model, targets, ignore)
189-
is used.
235+
ignore are used via match_named_modules(model, targets, ignore)
236+
(default () for both means no name-based filtering).
190237
:param targets: Target module name patterns (used when named_modules is None).
191-
:param ignore: Ignore patterns (used when named_modules is None).
238+
Default () means no name-based filtering when named_modules is None.
239+
:param ignore: Ignore patterns (used when named_modules is None). Default ().
192240
:param update_zp_scale: If True, call update_weight_zp_scale on target
193241
modules in post-order. Set False for modifiers that do zp_scale in
194242
hooks (e.g. GPTQ).
195243
:param desc: Progress bar description; None to disable progress bar.
196244
:param show_progress: If True and desc is not None, show a tqdm progress bar.
245+
:param parallel: If True, use two-phase parallel calibration (phase 1 per-layer,
246+
phase 2 fused global scales with scale rescaling).
247+
:param max_workers: If parallel=True and int, run phase 1 with this many
248+
workers. If None, phase 1 runs sequentially.
197249
"""
198250
if named_modules is None:
199-
if targets is None or ignore is None:
200-
raise ValueError(
201-
"calibrate_weights requires either named_modules or both "
202-
"targets and ignore"
203-
)
204-
from compressed_tensors.utils import match_named_modules
205-
206251
named_modules = list(match_named_modules(model, targets, ignore))
207252
else:
208253
named_modules = list(named_modules)
209254

210-
target_set = {id(m) for _, m in named_modules}
211-
total_targets = len(target_set)
255+
target_set = {m for _, m in named_modules}
256+
target_list = list(target_set)
257+
total_targets = len(target_list)
212258

213-
try:
214-
import tqdm
215-
except ImportError:
216-
tqdm = None
217-
218-
if show_progress and desc is not None and tqdm is not None and total_targets > 0:
259+
if show_progress and desc is not None and total_targets > 0:
219260
pbar = tqdm.tqdm(total=total_targets, desc=desc)
220261
else:
221262
pbar = None
222263

223-
# Stack-based DFS: (module, children_visited)
224-
stack: list[Tuple[Module, bool]] = [(model, False)]
225-
226-
while stack:
227-
module, children_done = stack.pop()
264+
if parallel:
265+
# Phase 1: per-module global scale + scale/zp (order-independent)
266+
pbar_lock = threading.Lock()
228267

229-
if not children_done:
230-
# Pre-order: global scale for target modules (FP4 / TENSOR_GROUP)
231-
if id(module) in target_set:
232-
update_weight_global_scale(module)
233-
stack.append((module, True))
234-
for child in reversed(list(module.children())):
235-
stack.append((child, False))
236-
else:
237-
# Post-order: fused global scales (Attention/MLP), then zp_scale for targets
238-
update_fused_layer_weight_global_scales(module)
239-
if update_zp_scale and id(module) in target_set:
268+
def _phase1_one(module: Module) -> None:
269+
update_weight_global_scale(module)
270+
if update_zp_scale:
240271
update_weight_zp_scale(module)
241-
if pbar is not None:
272+
if pbar is not None:
273+
with pbar_lock:
242274
pbar.update(1)
243275

276+
if max_workers is not None and max_workers > 0:
277+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
278+
list(executor.map(_phase1_one, target_list))
279+
else:
280+
for module in target_list:
281+
_phase1_one(module)
282+
283+
# Phase 2: fused global scales (rescale per-tensor scale s' = s * g' / g)
284+
stack: list[Tuple[Module, bool]] = [(model, False)]
285+
while stack:
286+
module, children_done = stack.pop()
287+
if not children_done:
288+
stack.append((module, True))
289+
for child in reversed(list(module.children())):
290+
stack.append((child, False))
291+
else:
292+
update_fused_layer_weight_global_scales(module)
293+
else:
294+
# Sequential DFS: pre-order one onload for gparam + qparams, post-order fused
295+
seen_pre: set[Module] = set()
296+
seen_post: set[Module] = set()
297+
stack = [(model, False)]
298+
while stack:
299+
module, children_done = stack.pop()
300+
if not children_done:
301+
if module in target_set and module not in seen_pre:
302+
seen_pre.add(module)
303+
_update_weight_calibration_once(module, update_zp_scale)
304+
stack.append((module, True))
305+
for child in reversed(list(module.children())):
306+
stack.append((child, False))
307+
else:
308+
update_fused_layer_weight_global_scales(module)
309+
if update_zp_scale and module in target_set and module not in seen_post:
310+
seen_post.add(module)
311+
if pbar is not None:
312+
pbar.update(1)
313+
244314
if pbar is not None:
245315
pbar.close()
246316

src/llmcompressor/modifiers/utils/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# ruff: noqa
22

33
from .constants import *
4-
from .fused_modules import (
4+
from llmcompressor.modeling.fused_modules import (
55
get_fused_attention_linears,
66
get_fused_mlp_linears,
77
is_fused_attention_module,

src/llmcompressor/modifiers/utils/helpers.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,14 @@
99

1010
import torch
1111
from compressed_tensors.quantization import QuantizationStrategy
12-
from compressed_tensors.utils import align_modules, update_parameter_data
12+
from compressed_tensors.utils import (
13+
align_modules,
14+
update_offload_parameter,
15+
update_parameter_data,
16+
)
1317
from torch.nn import Linear
1418

15-
from llmcompressor.modifiers.utils.fused_modules import (
19+
from llmcompressor.modeling.fused_modules import (
1620
get_fused_attention_linears,
1721
get_fused_mlp_linears,
1822
)
@@ -39,7 +43,12 @@ def update_fused_layer_weight_global_scales(submodule: torch.nn.Module):
3943
When running NVFP4 quantization, update the global scale so that vLLM
4044
fused groups share one global scale: attention (traditional q/k/v or
4145
MLA q_a + kv_a) and MLP (gate/up). Uses the centralized fused module
42-
definitions; see :mod:`llmcompressor.modifiers.utils.fused_modules`.
46+
definitions; see :mod:`llmcompressor.modeling.fused_modules`.
47+
48+
When a linear already has ``weight_scale`` (e.g. after parallel phase-1
49+
calibration), per-tensor scale is rescaled so that q = x/(s'*g') is
50+
unchanged: s' = s * (g' / g), where g' is the fused global scale and g
51+
was the previous per-tensor global scale.
4352
4453
This is a requirement currently set by vLLM and may be removed or
4554
made optional in the future.
@@ -55,7 +64,7 @@ def update_fused_layer_weight_global_scales(submodule: torch.nn.Module):
5564
torch.cat([lin.weight_global_scale.data for lin in linears])
5665
).reshape([1])
5766
for lin in linears:
58-
update_parameter_data(lin, global_scale, "weight_global_scale")
67+
_apply_fused_global_scale(lin, global_scale)
5968
del global_scale
6069

6170
# Fused MLP: gate_proj, up_proj
@@ -66,5 +75,17 @@ def update_fused_layer_weight_global_scales(submodule: torch.nn.Module):
6675
torch.cat([lin.weight_global_scale.data for lin in linears])
6776
).reshape([1])
6877
for lin in linears:
69-
update_parameter_data(lin, global_scale, "weight_global_scale")
78+
_apply_fused_global_scale(lin, global_scale)
7079
del global_scale
80+
81+
82+
def _apply_fused_global_scale(lin: Linear, g_prime: torch.Tensor) -> None:
83+
"""Set weight_global_scale to g'; rescale weight_scale so q = x/(s*g) unchanged."""
84+
old_g = lin.weight_global_scale.data
85+
update_parameter_data(lin, g_prime, "weight_global_scale")
86+
weight_scale = getattr(lin, "weight_scale", None)
87+
if weight_scale is not None:
88+
# s' = s * (g' / g) so that x / s' / g' = x / s / g
89+
ratio = (g_prime / old_g).to(weight_scale.dtype).to(weight_scale.device)
90+
new_scale = weight_scale.data * ratio
91+
update_offload_parameter(lin, "weight_scale", new_scale)

0 commit comments

Comments (0)