Skip to content

Commit 9d33973

Browse files
committed
perf: make MSE observer compatible with torch.compile
compile inner _compute_candidate_error via torch.compile(dynamic=True). early stopping preserved in outer loop. compile flag added as oneshot arg. requires: vllm-project/compressed-tensors#627 related: pytorch/pytorch#177131
1 parent 36c30ee commit 9d33973

File tree

4 files changed

+206
-70
lines changed

4 files changed

+206
-70
lines changed

src/llmcompressor/entrypoints/oneshot.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from llmcompressor.core.session_functions import active_session
2323
from llmcompressor.datasets import get_calibration_dataloader
2424
from llmcompressor.entrypoints.utils import post_process, pre_process
25+
from llmcompressor.observers.compile_config import set_observer_compile
2526
from llmcompressor.modeling.moe_context import moe_calibration_context
2627
from llmcompressor.pipelines import CalibrationPipeline
2728

@@ -299,6 +300,8 @@ def oneshot(
299300
sequential_targets: list[str] | None = None,
300301
sequential_offload_device: str = "cpu",
301302
quantization_aware_calibration: bool = True,
303+
sequential_prefetch: bool = False,
304+
enable_observer_compile: bool = False,
302305
# Miscellaneous arguments
303306
output_dir: str | None = None,
304307
log_dir: str | None = None,
@@ -364,9 +367,10 @@ def oneshot(
364367
:param streaming: True to stream data from a cloud dataset.
365368
:param overwrite_cache: Whether to overwrite the cached preprocessed datasets.
366369
:param preprocessing_num_workers: Number of processes for dataset preprocessing.
367-
:param dataloader_num_workers: Number of worker processes for data loading. Set to 0
368-
to disable multiprocessing. Note: Custom data collators may not work with
369-
multiprocessing. Default is 0.
370+
:param dataloader_num_workers: Number of worker processes for data loading. Default
371+
is 0 (safe for low CPU/GPU memory). Set to 2 or more for faster calibration if
372+
you have sufficient RAM. Custom data collators may not work with
373+
multiprocessing.
370374
:param min_tokens_per_module: Minimum percentage of tokens per
371375
module, relevant for MoE models.
372376
:param moe_calibrate_all_experts: Whether to calibrate all experts during MoE
@@ -388,6 +392,9 @@ def oneshot(
388392
calibration in the sequential pipeline. When True, quantization is applied
389393
during forward pass in calibration. When False, quantization is disabled
390394
during forward pass in calibration. Default is set to True.
395+
:param sequential_prefetch: When using the sequential pipeline, prefetch the
396+
next batch in a background thread to overlap onload with forward. Default
397+
False; set True for faster calibration when GPU memory allows.
391398
392399
# Miscellaneous arguments
393400
:param output_dir: Path to save the output model after calibration.
@@ -400,9 +407,10 @@ def oneshot(
400407

401408
# pass all args directly into Oneshot
402409
local_args = {
403-
k: v for k, v in locals().items() if k not in ("local_args", "kwargs")
410+
k: v for k, v in locals().items() if k not in ("local_args", "kwargs", "enable_observer_compile")
404411
}
405412
one_shot = Oneshot(**local_args, **kwargs)
413+
set_observer_compile(enable_observer_compile)
406414
one_shot()
407415

408416
return one_shot.model
src/llmcompressor/observers/compile_config.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
"""
Global configuration for observer torch.compile support.

The compile flag is set by the oneshot entrypoint and read by observer
instances at call time. This avoids threading the flag through recipe
and modifier layers.
"""

# Module-level flag; defaults to eager (compile disabled).
_enable_observer_compile: bool = False


def set_observer_compile(enabled: bool) -> None:
    """Enable or disable torch.compile acceleration for observers.

    Called by the oneshot entrypoint before calibration begins.

    :param enabled: True to route observer inner computations through
        their torch.compiled wrappers, False for eager execution
    """
    global _enable_observer_compile
    _enable_observer_compile = enabled


def get_observer_compile() -> bool:
    """Return whether observer torch.compile acceleration is enabled."""
    return _enable_observer_compile

src/llmcompressor/observers/mse.py

Lines changed: 143 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,25 @@
1-
from typing import Optional
1+
from typing import Optional, Tuple
22

33
import torch
4+
import torch._dynamo.config
45
from compressed_tensors.quantization import (
56
QuantizationArgs,
67
QuantizationStrategy,
78
)
89
from compressed_tensors.quantization.lifecycle import fake_quantize
910
from compressed_tensors.quantization.utils import calculate_qparams, generate_gparam
10-
from compressed_tensors.utils import patch_attr
1111

1212
from llmcompressor.observers.base import MinMaxTuple, Observer
13+
from llmcompressor.observers.compile_config import get_observer_compile
1314
from llmcompressor.observers.moving_base import MovingAverageObserverBase
1415

1516
__all__ = ["MovingAverageMSEObserver"]
1617

18+
# Allow torch.compile to handle scalar conversions inside
19+
# compressed_tensors' calculate_qparams (float(bit_range)).
20+
# Same approach as GPTQ compile path (commit a4f9ba2e).
21+
torch._dynamo.config.capture_scalar_outputs = True
22+
1723

1824
@Observer.register("memoryless_mse")
1925
class MemorylessMSEObserver(Observer):
@@ -32,7 +38,7 @@ class MemorylessMSEObserver(Observer):
3238
:param module: optional module with attached quantization parameters. This argument
3339
is required to utilize existing qparams such as global_scale or g_idx
3440
:param **observer_kwargs: keyword arguments for observer initialization\n
35-
maxshrink: maximum shrink amount (in grid steps). The number of
41+
maxshrink: maximum shrink amount (in "grid steps"). The number of
3642
search steps is int(maxshrink * grid)\n
3743
patience: number of consecutive search steps without improvement before
3844
early stopping\n
@@ -53,32 +59,39 @@ def __init__(self, *args, **kwargs):
5359
self.grid = observer_kwargs.get("grid", 100.0)
5460
self.norm = observer_kwargs.get("norm", 2.4)
5561

56-
def get_min_max(self, observed: torch.Tensor) -> MinMaxTuple:
57-
# min[min_vals, max_vals](mse_quant_error)
58-
global_scale = self._get_module_param("global_scale")
62+
# Pre-create token_args to avoid patch_attr context manager
63+
# which causes torch.compile graph breaks
64+
self._token_args = self.args.model_copy(
65+
update={"strategy": QuantizationStrategy.TOKEN}
66+
)
67+
68+
def _call_grid_search(
69+
self,
70+
observed: torch.Tensor,
71+
global_scale: Optional[torch.Tensor],
72+
optimize_global_scale: bool,
73+
) -> MinMaxTuple:
5974
return _grid_search_mse(
6075
observed,
6176
self.args,
77+
self._token_args,
6278
self.maxshrink,
6379
self.patience,
6480
self.grid,
6581
self.norm,
6682
global_scale=global_scale,
67-
optimize_global_scale=False,
83+
optimize_global_scale=optimize_global_scale,
84+
enable_compile=get_observer_compile(),
6885
)
6986

87+
def get_min_max(self, observed: torch.Tensor) -> MinMaxTuple:
88+
# min[min_vals, max_vals](mse_quant_error)
89+
global_scale = self._get_module_param("global_scale")
90+
return self._call_grid_search(observed, global_scale, False)
91+
7092
def get_global_min_max(self, observed: torch.Tensor) -> MinMaxTuple:
7193
# min[min_vals, max_vals, global_scale](mse_quant_error)
72-
return _grid_search_mse(
73-
observed,
74-
self.args,
75-
self.maxshrink,
76-
self.patience,
77-
self.grid,
78-
self.norm,
79-
global_scale=None,
80-
optimize_global_scale=True,
81-
)
94+
return self._call_grid_search(observed, None, True)
8295

8396

8497
@Observer.register("mse")
@@ -98,7 +111,7 @@ class MovingAverageMSEObserver(MovingAverageObserverBase):
98111
:param module: optional module with attached quantization parameters. This argument
99112
is required to utilize existing qparams such as global_scale or g_idx
100113
:param **observer_kwargs: keyword arguments for observer initialization\n
101-
maxshrink: maximum shrink amount (in grid steps). The number of
114+
maxshrink: maximum shrink amount (in "grid steps"). The number of
102115
search steps is int(maxshrink * grid)\n
103116
patience: number of consecutive search steps without improvement before
104117
early stopping\n
@@ -119,55 +132,134 @@ def __init__(self, *args, **kwargs):
119132
self.grid = observer_kwargs.get("grid", 100.0)
120133
self.norm = observer_kwargs.get("norm", 2.4)
121134

122-
def get_current_min_max(self, observed: torch.Tensor) -> MinMaxTuple:
123-
# min[min_vals, max_vals](mse_quant_error)
124-
global_scale = self._get_module_param("global_scale")
135+
# Pre-create token_args to avoid patch_attr context manager
136+
# which causes torch.compile graph breaks
137+
self._token_args = self.args.model_copy(
138+
update={"strategy": QuantizationStrategy.TOKEN}
139+
)
140+
141+
def _call_grid_search(
142+
self,
143+
observed: torch.Tensor,
144+
global_scale: Optional[torch.Tensor],
145+
optimize_global_scale: bool,
146+
) -> MinMaxTuple:
125147
return _grid_search_mse(
126148
observed,
127149
self.args,
150+
self._token_args,
128151
self.maxshrink,
129152
self.patience,
130153
self.grid,
131154
self.norm,
132155
global_scale=global_scale,
133-
optimize_global_scale=False,
156+
optimize_global_scale=optimize_global_scale,
157+
enable_compile=get_observer_compile(),
134158
)
135159

160+
def get_current_min_max(self, observed: torch.Tensor) -> MinMaxTuple:
161+
# min[min_vals, max_vals](mse_quant_error)
162+
global_scale = self._get_module_param("global_scale")
163+
return self._call_grid_search(observed, global_scale, False)
164+
136165
def get_current_global_min_max(self, observed: torch.Tensor) -> MinMaxTuple:
137166
# min[min_vals, max_vals, global_scale](mse_quant_error)
138-
return _grid_search_mse(
139-
observed,
140-
self.args,
141-
self.maxshrink,
142-
self.patience,
143-
self.grid,
144-
self.norm,
145-
global_scale=None,
146-
optimize_global_scale=True,
147-
)
167+
return self._call_grid_search(observed, None, True)
168+
169+
170+
def _compute_candidate_error(
    observed: torch.Tensor,
    args: QuantizationArgs,
    token_args: QuantizationArgs,
    min_val: torch.Tensor,
    max_val: torch.Tensor,
    p: float,
    norm: float,
    global_scale: Optional[torch.Tensor],
    optimize_global_scale: bool,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Evaluate the quantization error for one candidate shrink factor.

    Shared helper for the MSE grid search; when compile is enabled via
    oneshot, callers invoke this through its torch.compiled wrapper.

    :param observed: value of shape (num_observations, *qparams_shape, group_size)
    :param args: quantization args used for computing qparams
    :param token_args: quantization args with strategy set to TOKEN, pre-created
        to avoid patch_attr context manager which causes torch.compile graph breaks
    :param min_val: per-channel minimum values
    :param max_val: per-channel maximum values
    :param p: shrink factor (1 - i/grid)
    :param norm: exponent used when computing the error
    :param global_scale: precomputed global scale to use for quantization
    :param optimize_global_scale: If True, recompute global_scale from candidates
    :return: (error, shrinked_min_val, shrinked_max_val)
    """
    # Shrink the observed range by the candidate factor
    cand_min = p * min_val
    cand_max = p * max_val

    # Optionally re-derive the global scale from the shrunk candidates
    if optimize_global_scale:
        global_scale = generate_gparam(cand_min, cand_max)

    scales, zero_points = calculate_qparams(
        min_vals=cand_min,
        max_vals=cand_max,
        quantization_args=args,
        global_scale=global_scale,
    )

    # Pre-created token_args replaces the patch_attr context manager
    # so torch.compile sees no graph break here
    q = fake_quantize(
        observed,
        scales.unsqueeze(-1),
        zero_points.unsqueeze(-1),
        token_args,
        global_scale=global_scale,
    ).to(observed.dtype)

    # Per-channel p-norm error, reduced over observations and group dims
    delta = q - observed
    err = delta.abs_().pow_(norm).sum(dim=(0, -1))
    return err, cand_min, cand_max
225+
226+
227+
# torch.compiled version of the inner error computation. Only this helper
# is compiled; the surrounding grid-search loop remains eager so that
# early stopping (data-dependent control flow) keeps working.
_compute_candidate_error_compiled = torch.compile(_compute_candidate_error, dynamic=True)
148233

149234

150235
def _grid_search_mse(
151236
observed: torch.Tensor,
152237
args: QuantizationArgs,
238+
token_args: QuantizationArgs,
153239
maxshrink: float,
154240
patience: float,
155241
grid: float,
156242
norm: float,
157243
global_scale: Optional[torch.Tensor] = None,
158244
optimize_global_scale: bool = False,
245+
enable_compile: bool = False,
159246
) -> MinMaxTuple:
160247
"""
161248
Perform a 1-D grid search to find per-channel min/max ranges that minimize
162249
mean-squared quantization error.
163250
164-
This routine progressively “shrinks” the absolute min/max ranges of the
165-
observed tensor and evaluates the quantization error at each candidate
166-
range. For each shrink factor ``p = 1 - i/grid`` up to ``maxshrink``.
251+
Progressively shrinks the absolute min/max ranges of the observed tensor
252+
and evaluates the quantization error at each candidate. Early stopping
253+
exits when no improvement is found for ``patience`` consecutive steps.
254+
255+
When enable_compile is True, the inner error computation is executed
256+
through a torch.compiled wrapper for accelerated execution while
257+
preserving early stopping in the outer loop.
167258
168259
:param observed: value of shape (num_observations, *qparams_shape, group_size)
169260
:param args: quantization args used for computing qparams and fake quant
170-
:param maxshrink: maximum shrink amount (in “grid steps”). The number of
261+
:param token_args: quantization args with strategy set to TOKEN
262+
:param maxshrink: maximum shrink amount (in "grid steps"). The number of
171263
search steps is int(maxshrink * grid)
172264
:param patience: number of consecutive search steps without improvement before
173265
early stopping
@@ -178,50 +270,35 @@ def _grid_search_mse(
178270
`optimize_global_scale` is True
179271
:param optimize_global_scale: If True, recompute ``global_scale`` from the
180272
candidate min/max during each step of the search
273+
:param enable_compile: If True, use torch.compiled inner computation
181274
"""
182275
min_val = torch.amin(observed, dim=(0, -1))
183276
max_val = torch.amax(observed, dim=(0, -1))
184277
best_error = torch.full_like(min_val, torch.finfo(min_val.dtype).max)
185278
best_min_val = min_val.clone()
186279
best_max_val = max_val.clone()
187280

188-
# Early stopping params
281+
compute_fn = (
282+
_compute_candidate_error_compiled if enable_compile
283+
else _compute_candidate_error
284+
)
189285
no_improve_count = 0
190286

191287
# @ksayers @HGCharles: investigate searching over separate shrinking factors
192288
for i in range(int(maxshrink * grid)):
193289
p = 1 - i / grid
194-
shrinked_min_val = p * min_val
195-
shrinked_max_val = p * max_val
196-
197-
if optimize_global_scale:
198-
global_scale = generate_gparam(shrinked_min_val, shrinked_max_val)
199-
200-
candidate_scales, candidate_zero_points = calculate_qparams(
201-
min_vals=shrinked_min_val,
202-
max_vals=shrinked_max_val,
203-
quantization_args=args,
204-
global_scale=global_scale,
290+
err, shrinked_min_val, shrinked_max_val = compute_fn(
291+
observed,
292+
args,
293+
token_args,
294+
min_val,
295+
max_val,
296+
p,
297+
norm,
298+
global_scale,
299+
optimize_global_scale,
205300
)
206301

207-
# Note that observed.shape = (num_observations, *qparams_shape, group_size).
208-
# For the purposes of fake quantization, this is equivalent to token quant
209-
with patch_attr(args, "strategy", QuantizationStrategy.TOKEN):
210-
q = fake_quantize(
211-
observed,
212-
candidate_scales.unsqueeze(-1),
213-
candidate_zero_points.unsqueeze(-1),
214-
args,
215-
global_scale=global_scale,
216-
).to(observed.dtype)
217-
# Note that due to forward quantization implementation, token quant,
218-
# unlike tensor_group, requires extra dtype cast
219-
220-
q -= observed
221-
q.abs_()
222-
q.pow_(norm)
223-
err = torch.sum(q, dim=(0, -1))
224-
225302
tmp = err < best_error
226303
if torch.any(tmp):
227304
best_error[tmp] = err[tmp]

0 commit comments

Comments
 (0)