Skip to content

Commit b3d2de9

Browse files
author
Avishek Goswami
committed
Calibration: move fused_modules to modeling, rescale s when fusing g', add parallel path
- Move fused_modules.py to modeling/ and update imports. - In update_fused_layer_weight_global_scales, rescale weight_scale s' = s * g' / g when applying the fused global scale, so the quantized value q is unchanged. - Add calibrate_weights(..., parallel=True, max_workers=N) for two-phase parallel weight calibration. Signed-off-by: Avishek Goswami <avishek.goswami@ibm.com>
1 parent 480294d commit b3d2de9

File tree

4 files changed

+163
-64
lines changed

4 files changed

+163
-64
lines changed
File renamed without changes.

src/llmcompressor/modifiers/quantization/calibration.py

Lines changed: 136 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
1-
from typing import Any, Iterable, Optional, Set, Tuple
1+
import threading
2+
from concurrent.futures import ThreadPoolExecutor
3+
from typing import Any, Iterable, Iterator, Optional, Tuple
24

35
import torch
6+
import tqdm
47
from compressed_tensors.quantization import (
58
DynamicType,
69
QuantizationArgs,
@@ -11,6 +14,7 @@
1114
from compressed_tensors.utils import (
1215
align_module_device,
1316
getattr_chain,
17+
match_named_modules,
1418
update_offload_parameter,
1519
)
1620
from loguru import logger
@@ -136,6 +140,52 @@ def update_weight_global_scale(module: Module):
136140
)
137141

138142

143+
def _post_order_modules(model: Module) -> Iterator[Module]:
144+
"""Yield every module in the tree in DFS post-order."""
145+
stack: list[Tuple[Module, bool]] = [(model, False)]
146+
while stack:
147+
module, children_done = stack.pop()
148+
if not children_done:
149+
stack.append((module, True))
150+
for child in reversed(list(module.children())):
151+
stack.append((child, False))
152+
else:
153+
yield module
154+
155+
156+
def _update_weight_calibration_once(module: Module, update_zp_scale: bool) -> None:
157+
"""
158+
Onload weight once and run both global scale (gparam) and scale/zp (qparams).
159+
Used in sequential DFS to avoid double onload for NVFP4.
160+
"""
161+
if getattr_chain(module, "quantization_scheme.weights", None) is None:
162+
return
163+
need_gparam = (
164+
getattr_chain(module, "quantization_scheme.weights.strategy", None)
165+
== QuantizationStrategy.TENSOR_GROUP
166+
)
167+
need_qparams = update_zp_scale
168+
if not need_gparam and not need_qparams:
169+
return
170+
if (
171+
need_qparams
172+
and getattr(module, "quantization_status", None)
173+
!= QuantizationStatus.CALIBRATION
174+
):
175+
logger.warning(
176+
"Attempting to calibrate weights of a module not in calibration mode"
177+
)
178+
with align_module_device(module):
179+
value = module.weight
180+
call_observer(
181+
module,
182+
base_name="weight",
183+
value=value,
184+
should_calculate_gparam=need_gparam,
185+
should_calculate_qparams=need_qparams,
186+
)
187+
188+
139189
def update_weight_zp_scale(module: Module):
140190
"""
141191
marks a layer as ready for calibration which activates observers
@@ -162,84 +212,112 @@ def calibrate_weights(
162212
model: Module,
163213
*,
164214
named_modules: Optional[Iterable[Tuple[str, Module]]] = None,
165-
targets: Optional[Set[str]] = None,
166-
ignore: Optional[Iterable[str]] = None,
215+
targets: Iterable[str] = (),
216+
ignore: Iterable[str] = (),
167217
update_zp_scale: bool = True,
168218
desc: Optional[str] = "Calibrating weights",
169219
show_progress: bool = True,
220+
parallel: bool = False,
221+
max_workers: Optional[int] = None,
170222
) -> None:
171223
"""
172-
Traverse the model once (DFS) and run weight calibration: global scales for
173-
FP4/TENSOR_GROUP, fused layer global scales for Attention/MLP, and weight
174-
scale/zero-point. Replaces separate loops over named_modules and
175-
model.modules() for better cache locality and fewer CPU–GPU onloads when
176-
using offloading.
177-
178-
Order of operations per module:
179-
1. Pre-order: update_weight_global_scale for target (quantizable) modules.
180-
2. Post-order: update_fused_layer_weight_global_scales for every module
181-
(no-op except for Attention/MLP containers); then update_weight_zp_scale
182-
for target modules if update_zp_scale is True.
224+
Run weight calibration: per-tensor global scale (gparam), fused global scales
225+
for Attention/MLP, and scale/zero-point (qparams). Minimizes weight onloads
226+
when using offloading (one onload per target in the default path).
227+
228+
Two modes:
229+
- Sequential (parallel=False): DFS over the model. Pre-order: one onload per
230+
target via _update_weight_calibration_once (gparam + qparams). Post-order:
231+
update_fused_layer_weight_global_scales (no extra onload for targets).
232+
- Parallel (parallel=True): Phase 1 runs gparam + qparams per target
233+
(order-independent, parallelizable). Phase 2 applies fused global scales
234+
and rescales per-tensor scale s' = s * (g' / g).
235+
236+
DDP: Works with distributed setups. Pass named_modules as this rank's
237+
subset so each rank only calibrates its assigned modules (see e.g. #2220).
238+
Activation observer sync across ranks is handled by
239+
QuantizationMixin.sync_activation_observers at layer
240+
boundaries (PR #2391); weight calibration does not all-reduce weight
241+
observer state—each rank calibrates its subset and can broadcast
242+
quantized params afterward (e.g. GPTQ-style) if needed. Fused groups
243+
(q/k/v, gate/up) must be assigned to the same rank so
244+
update_fused_layer_weight_global_scales sees the full group. For
245+
balanced wall time, assign by weight size (e.g. greedy_bin_packing with
246+
item_weight_fn=lambda m: m.weight.numel(); see GPTQ DDP #2333 which uses
247+
hessian shape for the same idea).
248+
249+
Benchmark: See tests/benchmark_calibrate_weights.py for onload count and
250+
single-vs-double-onload timing.
183251
184252
:param model: Root module to traverse (e.g. state.model).
185-
:param named_modules: Optional list of (name, module) for target modules.
186-
If provided, only these modules get global_scale and zp_scale; enables
187-
DDP by passing this rank's subset (see #2220). If None, targets and
188-
ignore must be provided and match_named_modules(model, targets, ignore)
189-
is used.
190-
:param targets: Target module name patterns (used when named_modules is None).
191-
:param ignore: Ignore patterns (used when named_modules is None).
192-
:param update_zp_scale: If True, call update_weight_zp_scale on target
193-
modules in post-order. Set False for modifiers that do zp_scale in
194-
hooks (e.g. GPTQ).
195-
:param desc: Progress bar description; None to disable progress bar.
196-
:param show_progress: If True and desc is not None, show a tqdm progress bar.
253+
:param named_modules: If provided, only these (name, module) pairs are
254+
calibrated; enables DDP by passing this rank's subset. If None, uses
255+
match_named_modules(model, targets, ignore).
256+
:param targets: Name patterns when named_modules is None. Default ().
257+
:param ignore: Ignore patterns when named_modules is None. Default ().
258+
:param update_zp_scale: If True, compute scale/zp for targets. False for
259+
modifiers that do zp in hooks (e.g. GPTQ).
260+
:param desc: Progress bar description; None disables bar.
261+
:param show_progress: If True and desc set, show tqdm bar.
262+
:param parallel: If True, use two-phase parallel calibration.
263+
:param max_workers: If parallel and int, phase 1 uses this many workers.
197264
"""
198265
if named_modules is None:
199-
if targets is None or ignore is None:
200-
raise ValueError(
201-
"calibrate_weights requires either named_modules or both "
202-
"targets and ignore"
203-
)
204-
from compressed_tensors.utils import match_named_modules
205-
206266
named_modules = list(match_named_modules(model, targets, ignore))
207267
else:
208268
named_modules = list(named_modules)
269+
# DDP: target_set = only these get gparam + qparams (this rank's subset).
270+
target_set = {m for _, m in named_modules}
271+
target_list = list(target_set)
272+
total_targets = len(target_list)
209273

210-
target_set = {id(m) for _, m in named_modules}
211-
total_targets = len(target_set)
212-
213-
try:
214-
import tqdm
215-
except ImportError:
216-
tqdm = None
217-
218-
if show_progress and desc is not None and tqdm is not None and total_targets > 0:
274+
if show_progress and desc is not None and total_targets > 0:
219275
pbar = tqdm.tqdm(total=total_targets, desc=desc)
220276
else:
221277
pbar = None
222278

223-
# Stack-based DFS: (module, children_visited)
224-
stack: list[Tuple[Module, bool]] = [(model, False)]
279+
if parallel:
280+
# Phase 1: per-module global scale + scale/zp (order-independent)
281+
pbar_lock = threading.Lock()
225282

226-
while stack:
227-
module, children_done = stack.pop()
283+
def _phase1_one(module: Module) -> None:
284+
update_weight_global_scale(module)
285+
if update_zp_scale:
286+
update_weight_zp_scale(module)
287+
if pbar is not None:
288+
with pbar_lock:
289+
pbar.update(1)
228290

229-
if not children_done:
230-
# Pre-order: global scale for target modules (FP4 / TENSOR_GROUP)
231-
if id(module) in target_set:
232-
update_weight_global_scale(module)
233-
stack.append((module, True))
234-
for child in reversed(list(module.children())):
235-
stack.append((child, False))
291+
if max_workers is not None and max_workers > 0:
292+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
293+
list(executor.map(_phase1_one, target_list))
236294
else:
237-
# Post-order: fused global scales (Attention/MLP), then zp_scale for targets
295+
for module in target_list:
296+
_phase1_one(module)
297+
298+
# Phase 2: fused global scales (rescale per-tensor scale s' = s * g' / g)
299+
for module in _post_order_modules(model):
238300
update_fused_layer_weight_global_scales(module)
239-
if update_zp_scale and id(module) in target_set:
240-
update_weight_zp_scale(module)
241-
if pbar is not None:
242-
pbar.update(1)
301+
else:
302+
# Sequential DFS: pre-order one onload for gparam + qparams, post-order fused
303+
seen_pre: set[Module] = set()
304+
seen_post: set[Module] = set()
305+
stack = [(model, False)]
306+
while stack:
307+
module, children_done = stack.pop()
308+
if not children_done:
309+
if module in target_set and module not in seen_pre:
310+
seen_pre.add(module)
311+
_update_weight_calibration_once(module, update_zp_scale)
312+
stack.append((module, True))
313+
for child in reversed(list(module.children())):
314+
stack.append((child, False))
315+
else:
316+
update_fused_layer_weight_global_scales(module)
317+
if update_zp_scale and module in target_set and module not in seen_post:
318+
seen_post.add(module)
319+
if pbar is not None:
320+
pbar.update(1)
243321

244322
if pbar is not None:
245323
pbar.close()

src/llmcompressor/modifiers/utils/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# ruff: noqa
22

33
from .constants import *
4-
from .fused_modules import (
4+
from llmcompressor.modeling.fused_modules import (
55
get_fused_attention_linears,
66
get_fused_mlp_linears,
77
is_fused_attention_module,

src/llmcompressor/modifiers/utils/helpers.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,14 @@
99

1010
import torch
1111
from compressed_tensors.quantization import QuantizationStrategy
12-
from compressed_tensors.utils import align_modules, update_parameter_data
12+
from compressed_tensors.utils import (
13+
align_modules,
14+
update_offload_parameter,
15+
update_parameter_data,
16+
)
1317
from torch.nn import Linear
1418

15-
from llmcompressor.modifiers.utils.fused_modules import (
19+
from llmcompressor.modeling.fused_modules import (
1620
get_fused_attention_linears,
1721
get_fused_mlp_linears,
1822
)
@@ -39,7 +43,12 @@ def update_fused_layer_weight_global_scales(submodule: torch.nn.Module):
3943
When running NVFP4 quantization, update the global scale so that vLLM
4044
fused groups share one global scale: attention (traditional q/k/v or
4145
MLA q_a + kv_a) and MLP (gate/up). Uses the centralized fused module
42-
definitions; see :mod:`llmcompressor.modifiers.utils.fused_modules`.
46+
definitions; see :mod:`llmcompressor.modeling.fused_modules`.
47+
48+
When a linear already has ``weight_scale`` (e.g. after parallel phase-1
49+
calibration), per-tensor scale is rescaled so that q = x/(s'*g') is
50+
unchanged: s' = s * (g' / g), where g' is the fused global scale and g
51+
was the previous per-tensor global scale.
4352
4453
This is a requirement currently set by vLLM and may be removed or
4554
made optional in the future.
@@ -55,7 +64,7 @@ def update_fused_layer_weight_global_scales(submodule: torch.nn.Module):
5564
torch.cat([lin.weight_global_scale.data for lin in linears])
5665
).reshape([1])
5766
for lin in linears:
58-
update_parameter_data(lin, global_scale, "weight_global_scale")
67+
_apply_fused_global_scale(lin, global_scale)
5968
del global_scale
6069

6170
# Fused MLP: gate_proj, up_proj
@@ -66,5 +75,17 @@ def update_fused_layer_weight_global_scales(submodule: torch.nn.Module):
6675
torch.cat([lin.weight_global_scale.data for lin in linears])
6776
).reshape([1])
6877
for lin in linears:
69-
update_parameter_data(lin, global_scale, "weight_global_scale")
78+
_apply_fused_global_scale(lin, global_scale)
7079
del global_scale
80+
81+
82+
def _apply_fused_global_scale(lin: Linear, g_prime: torch.Tensor) -> None:
83+
"""Set weight_global_scale to g'; rescale weight_scale so q = x/(s*g) unchanged."""
84+
old_g = lin.weight_global_scale.data
85+
update_parameter_data(lin, g_prime, "weight_global_scale")
86+
weight_scale = getattr(lin, "weight_scale", None)
87+
if weight_scale is not None:
88+
# s' = s * (g' / g) so that x / s' / g' = x / s / g
89+
ratio = (g_prime / old_g).to(weight_scale.dtype).to(weight_scale.device)
90+
new_scale = weight_scale.data * ratio
91+
update_offload_parameter(lin, "weight_scale", new_scale)

0 commit comments

Comments
 (0)