@@ -470,8 +470,8 @@ def _apply_smoothing(self, model: Module) -> None:
 
             # [STEP 3]: Compute output of module
             # could cache from hook, rather than recomputing here
-            fp16_outputs = self._run_samples(parent_module)
-            if len(fp16_outputs) == 0 or all(f.numel() == 0 for f in fp16_outputs):
+            fp16_output = self._get_flattened_output(parent_module)
+            if fp16_output.numel() == 0:
                 logger.info(
                     f"Skipping smooth_layer {mapping.smooth_name}, no activations "
                     "found to scale. This can occasionally occur in MoE models "
@@ -484,7 +484,7 @@ def _apply_smoothing(self, model: Module) -> None:
 
             # [STEP 4]: Compute loss
             best_scales = self._compute_best_scale(
-                x_mean, w_mean, parent_module, balance_layers, fp16_outputs
+                x_mean, w_mean, parent_module, balance_layers, fp16_output
             )
 
     @torch.no_grad()
@@ -552,7 +552,7 @@ def _compute_best_scale(
         w_mean: torch.Tensor,
         parent_module: torch.nn.Module,
         linears2scale: List[torch.nn.Linear],
-        fp16_outputs: List[torch.Tensor],
+        fp16_output: torch.Tensor,
     ) -> torch.Tensor:
         """
         Compute loss and select best scales
@@ -616,7 +616,7 @@ def _compute_best_scale(
             int_w_outputs = self._run_samples(parent_module)
 
             # compute mean squared error (L2 norm)
-            loss = self._compute_loss(fp16_outputs, int_w_outputs, device)
+            loss = _compute_loss(fp16_output, int_w_output)
 
             history.append(loss)
             if loss < best_error:
@@ -641,34 +641,6 @@ def _compute_best_scale(
 
         return best_scales.detach().cpu()
 
-    @torch.no_grad()
-    def _compute_loss(
-        self,
-        fp16_outputs: List[torch.Tensor],
-        int_w_outputs: List[torch.Tensor],
-        device: torch.device,
-    ) -> torch.Tensor:
-        loss = 0.0
-        num_elements = 0
-
-        # Compute the MSE loss for each batch
-        for fp16_batch, int_w_batch in zip(fp16_outputs, int_w_outputs):
-            batch_loss = (
-                (fp16_batch.to(device) - int_w_batch.to(device))
-                .view(-1)
-                .float()
-                .pow(2)
-                .sum()
-                .item()
-            )
-            loss += batch_loss
-            num_elements += fp16_batch.numel()
-
-        # Normalize the loss by the total number of elements
-        loss /= num_elements
-
-        return loss
-
     def _assert_all_activations_consumed(self):
         """
         Confirm all activations have been consumed
@@ -678,6 +650,17 @@ def _assert_all_activations_consumed(self):
             raise RuntimeError("Some cached activations were not used")
 
 
+@torch.no_grad()
+@torch.compile()
+def _compute_loss(
+    fp16_output: torch.Tensor,
+    int_w_output: torch.Tensor,
+) -> torch.Tensor:
+    """Compute MSE loss for each batch"""
+    return (fp16_output - int_w_output).view(-1).float().pow(2).mean()
+
+
+@torch.compile()
 def _pseudo_quantize_tensor(
     w: torch.Tensor, symmetric: bool = False, bit_width: int = 8, group_size: int = -1
 ):
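Below is a minimal, self-contained sketch of the refactor this diff makes: the removed per-batch loop summed squared errors over a list of output tensors and normalized by the total element count, while the new path compares two pre-flattened tensors with a single `.pow(2).mean()` that `torch.compile` can compile into one fused kernel. The helper `flatten_outputs` is only a stand-in for what `_get_flattened_output` presumably does (concatenating cached per-batch outputs into one tensor); it is an assumption for illustration, not the repository's implementation.

```python
import torch
from typing import List


def flatten_outputs(outputs: List[torch.Tensor]) -> torch.Tensor:
    # Hypothetical stand-in for _get_flattened_output: concatenate per-batch
    # outputs into a single 1-D tensor (empty tensor if nothing was cached).
    if len(outputs) == 0:
        return torch.empty(0)
    return torch.cat([o.reshape(-1) for o in outputs])


@torch.no_grad()
def compute_loss(fp16_output: torch.Tensor, int_w_output: torch.Tensor) -> torch.Tensor:
    # Single-expression MSE over all elements; numerically equivalent to the
    # removed loop's sum of squared errors divided by the element count.
    return (fp16_output - int_w_output).view(-1).float().pow(2).mean()


# Example: two cached "batches" of module outputs and their quantized counterparts
fp16_batches = [torch.randn(2, 8), torch.randn(3, 8)]
int_w_batches = [b + 0.01 * torch.randn_like(b) for b in fp16_batches]
loss = compute_loss(flatten_outputs(fp16_batches), flatten_outputs(int_w_batches))
print(float(loss))
```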