Skip to content

Commit 8b01f3f

Browse files
ZewenShen-Cohere, gemini-code-assist[bot], and dsikka
authored
[AWQ] Fix _accumulate_mean bug, move AWQ activation averaging off CPU, and improve logging (vllm-project#2161)
This PR addresses the following issues:
1. `_accumulate_mean` produces incorrect output on its first run.
2. `cache_smooth_activations_hook` previously performed the averaging computation on the CPU. When both the hidden dimension and the sequence length are large, this makes AWQ calibration CPU-bound; the slowdown is especially severe when multiple AWQ quantization jobs run concurrently.
3. Added more informative logging to the AWQ calibration grid search, including per-mapping JSON logs.

This PR is a subset of vllm-project#2158.

---------

Signed-off-by: ZewenShen-Cohere <zewen.shen@cohere.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Dipika Sikka <dipikasikka1@gmail.com>
1 parent 2c43060 commit 8b01f3f

File tree

1 file changed

+71
-8
lines changed
  • src/llmcompressor/modifiers/awq

1 file changed

+71
-8
lines changed

src/llmcompressor/modifiers/awq/base.py

Lines changed: 71 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,8 @@ class AWQModifier(Modifier, QuantizationMixin):
160160
_smooth_activation_means: dict[str, tuple[torch.FloatTensor, int]] = PrivateAttr(
161161
default_factory=dict
162162
)
163+
# List to store error metrics for each layer
164+
_error_metrics: list[dict] = PrivateAttr(default_factory=list)
163165

164166
def on_initialize(self, state: State, **kwargs) -> bool:
165167
"""
@@ -273,9 +275,12 @@ def on_finalize(self, state: State, **kwargs) -> bool:
273275
if not self.ended_:
274276
self.on_end(state, None)
275277

278+
self._log_error_metrics()
279+
276280
self._parent_args_cache.clear()
277281
self._smooth_activation_means.clear()
278282
self._resolved_mappings.clear()
283+
self._error_metrics.clear()
279284

280285
return True
281286

@@ -386,11 +391,11 @@ def cache_smooth_activations_hook(
386391
args: tuple[torch.Tensor, ...],
387392
_output: torch.Tensor,
388393
):
389-
self._smooth_activation_means[smooth_name] = _accumulate_mean(
390-
# Assume that first argument is the input
391-
args[0].cpu().abs().detach().flatten(0, -2),
394+
act_mean, count = _accumulate_mean(
395+
args[0].abs().detach().flatten(0, -2),
392396
self._smooth_activation_means.get(smooth_name, None),
393397
)
398+
self._smooth_activation_means[smooth_name] = (act_mean.cpu(), count)
394399

395400
return cache_smooth_activations_hook
396401

@@ -555,6 +560,7 @@ def _compute_best_scale(
555560
best_ratio = -1
556561
best_scales = None
557562
best_error = float("inf")
563+
initial_error = None
558564

559565
org_sd = {
560566
k: v.cpu()
@@ -600,11 +606,13 @@ def _compute_best_scale(
600606
],
601607
):
602608
total_iterations = n_grid * len(duo_scalings)
603-
for grid_idx, use_duo_scaling in tqdm(
609+
pbar = tqdm(
604610
product(range(n_grid), duo_scalings),
605611
total=total_iterations,
606-
desc="Grid search",
607-
):
612+
desc=f"Grid search for {mapping.smooth_name}",
613+
leave=False,
614+
)
615+
for grid_idx, use_duo_scaling in pbar:
608616
# create new scales
609617
ratio = grid_idx / n_grid
610618

@@ -668,13 +676,17 @@ def _compute_best_scale(
668676
# compute mean squared error (L2 norm)
669677
loss = self._compute_loss(fp16_outputs, int_w_outputs)
670678

679+
if initial_error is None:
680+
initial_error = loss
681+
671682
history.append(
672683
{"ratio": ratio, "duo_scaling": use_duo_scaling, "error": loss}
673684
)
674685
if loss < best_error:
675686
best_error = loss
676687
best_ratio = ratio
677688
best_scales = scales.clone()
689+
pbar.set_postfix({"best_error": f"{best_error:.3e}"})
678690

679691
mapping.parent.load_state_dict(org_sd, strict=False)
680692

@@ -687,6 +699,25 @@ def _compute_best_scale(
687699
"https://github.com/vllm-project/llm-compressor/issues"
688700
)
689701

702+
err_reduction = best_error / initial_error if initial_error > 0 else 1.0
703+
logger.debug(
704+
f"AWQ grid search for {mapping.smooth_name}: "
705+
f"initial error = {initial_error:.3e}, "
706+
f"best error = {best_error:.3e}, "
707+
f"error reduction rate (best/initial) = {err_reduction * 100:.3f}%"
708+
)
709+
710+
# Store error metrics for this layer
711+
self._error_metrics.append(
712+
{
713+
"layer_name": mapping.smooth_name,
714+
"parent_name": mapping.parent_name,
715+
"initial_error": initial_error,
716+
"best_error": best_error,
717+
"reduction": err_reduction,
718+
}
719+
)
720+
690721
assert (
691722
torch.isnan(best_scales).sum() == 0
692723
), f"Nan found in scales: {best_scales}"
@@ -705,7 +736,7 @@ def _compute_loss(
705736
# Compute the MSE loss for each batch
706737
for fp16_batch, int_w_batch in zip(fp16_outputs, int_w_outputs):
707738
loss += torch.nn.functional.mse_loss(
708-
fp16_batch, int_w_batch.to(fp16_batch.device)
739+
fp16_batch, int_w_batch.to(fp16_batch.device), reduction="sum"
709740
).item()
710741
num_elements += fp16_batch.numel()
711742

@@ -714,6 +745,37 @@ def _compute_loss(
714745

715746
return loss
716747

748+
def _log_error_metrics(self):
749+
"""
750+
Log the error metrics (initial error, best error, reduction).
751+
"""
752+
753+
# Prepare data for saving
754+
metrics_data = {
755+
"quantization_config": {
756+
"duo_scaling": self.duo_scaling,
757+
"n_grid": self.n_grid,
758+
},
759+
"total_layers": len(self._error_metrics),
760+
"metrics": self._error_metrics,
761+
}
762+
763+
# Save to disk
764+
logger.debug(f"AWQ per-mapping error metrics: {metrics_data}")
765+
766+
# Also print summary statistics
767+
reductions = [m["reduction"] for m in self._error_metrics]
768+
avg_reduction = sum(reductions) / len(reductions)
769+
min_reduction = min(reductions)
770+
max_reduction = max(reductions)
771+
sorted_reductions = sorted(reductions)
772+
median_reduction = sorted_reductions[len(sorted_reductions) // 2]
773+
logger.debug(
774+
f"Error reduction statistics: "
775+
f"avg={avg_reduction:.4f}, median={median_reduction:.4f}, "
776+
f"min={min_reduction:.4f}, max={max_reduction:.4f}"
777+
)
778+
717779
def _assert_all_activations_consumed(self):
718780
"""
719781
Confirm all activations have been consumed
@@ -860,9 +922,10 @@ def _accumulate_mean(
860922
sum_added = inp.sum(dim=0)
861923
num_added = inp.size(0)
862924
if prev_mean_and_count is None:
863-
return sum_added, num_added
925+
return sum_added / num_added, num_added
864926

865927
prev_mean, prev_count = prev_mean_and_count
928+
prev_mean = prev_mean.to(inp.device)
866929

867930
prev_sum = prev_mean * prev_count
868931
new_count = prev_count + num_added

0 commit comments

Comments (0)