
Commit 7fb58df

[AWQ] Support for Calibration Datasets of varying feature dimension (#1536)
SUMMARY: AWQModifier currently expects all calibration batches to have the same feature dimension, and users report that this causes errors on vision-language datasets. This PR adds support in AWQModifier for calibration dataset batches with varying feature dimensions. Rather than concatenating all module outputs into a single torch tensor, the per-batch outputs are kept as a list and passed through one by one to compute the loss. This removes the need for the chunk-memory configuration and the logic around that calculation. Resolves #1524.

TEST PLAN:
- [x] Re-ran for `"meta-llama/Llama-3.2-3B-Instruct"`; the wikitext PPL of 13.30 is better than the 14.08 achieved previously, because the dataset is slightly different now.
- [x] Also confirmed the user-provided code in #1524 (comment) can be run with the smaller `"Qwen/Qwen2.5-VL-7B-Instruct"` model, up until it tries to access a jpg file on the user's local machine.

Signed-off-by: Brian Dellabetta <[email protected]>
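For illustration of the change described above, a minimal sketch of the failure mode and the batch-wise workaround. The tensor shapes and noise level are made up for this example and are not taken from the PR:

import torch

# Two calibration batches whose sequence lengths differ, as can happen with
# vision-language datasets. Shapes here are illustrative only.
fp16_outputs = [torch.randn(1, 128, 64), torch.randn(1, 97, 64)]
int_w_outputs = [o + 0.01 * torch.randn_like(o) for o in fp16_outputs]

# The previous approach, torch.cat(fp16_outputs, dim=0), raises a RuntimeError
# here because the sequence dimensions (128 vs. 97) do not match.

# The batch-wise approach keeps the list and accumulates squared error and
# element counts per batch, mirroring the new _compute_loss logic.
loss, num_elements = 0.0, 0
for fp16_batch, int_w_batch in zip(fp16_outputs, int_w_outputs):
    loss += (fp16_batch - int_w_batch).float().pow(2).sum().item()
    num_elements += fp16_batch.numel()
loss /= num_elements  # normalized squared error over all calibration elements
print(f"loss: {loss:.6f}")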
1 parent 1648528 commit 7fb58df

File tree (1 file changed, +23 -38 lines):
  • src/llmcompressor/modifiers/awq/base.py
src/llmcompressor/modifiers/awq/base.py

Lines changed: 23 additions & 38 deletions
@@ -113,7 +113,6 @@ class AWQModifier(Modifier, QuantizationMixin):
         requirements but requires more time to move data between cpu and execution
         device. Defaults to None, so cached args are not offloaded. Consider setting
         to torch.device("cpu") if you are encountering OOM errors
-    :param max_chunk_memory: maximum memory to use for each chunk of input activations
     :param duo_scaling: whether to use duo scaling, which uses both input activations
         and weights to determine the scaling factor
     """
@@ -125,7 +124,6 @@ class AWQModifier(Modifier, QuantizationMixin):
     sequential_targets: Union[str, List[str], None] = None
     mappings: Optional[List[AWQMapping]] = None
     offload_device: Optional[torch.device] = None
-    max_chunk_memory: int = 1024 * 1024 * 1024
     duo_scaling: bool = True
 
     # Private vars set during validation
@@ -476,8 +474,8 @@ def _apply_smoothing(self, model: Module) -> None:
             with calibration_forward_context(model), HooksMixin.disable_hooks():
                 # [STEP 3]: Compute output of module
                 # could cache from hook, rather than recomputing here
-                fp16_output = self._run_samples(parent_module)
-                if fp16_output.numel() == 0:
+                fp16_outputs = self._run_samples(parent_module)
+                if len(fp16_outputs) == 0 or all(f.numel() == 0 for f in fp16_outputs):
                     logger.info(
                         f"Skipping smooth_layer {mapping.smooth_name}, no activations "
                         "found to scale. This can occasionally occur in MoE models "
@@ -490,7 +488,7 @@ def _apply_smoothing(self, model: Module) -> None:
 
                 # [STEP 4]: Compute loss
                 best_scales = self._compute_best_scale(
-                    x_mean, w_mean, parent_module, balance_layers, fp16_output
+                    x_mean, w_mean, parent_module, balance_layers, fp16_outputs
                 )
 
     @torch.no_grad()
@@ -543,28 +541,25 @@ def smooth(module):
                 v.batch_intermediates.clear()
         self._assert_all_activations_consumed()
 
-    def _run_samples(self, module: Module) -> torch.Tensor:
+    def _run_samples(self, module: Module) -> List[torch.Tensor]:
         with align_module_device(module):
             outputs = [
                 module(**batch_kwargs)
                 for batch_kwargs in self._parent_args_cache[module]
             ]
-            return torch.cat(
-                [
-                    # If Tuple, assume that first argument is the input
-                    output[0] if isinstance(output, Tuple) else output
-                    for output in outputs
-                ],
-                dim=0,
-            )
+            return [
+                # If Tuple, assume that first argument is the input
+                output[0] if isinstance(output, Tuple) else output
+                for output in outputs
+            ]
 
     def _compute_best_scale(
         self,
         x_mean: torch.Tensor,
         w_mean: torch.Tensor,
         parent_module: torch.nn.Module,
         linears2scale: List[torch.nn.Linear],
-        fp16_output: torch.Tensor,
+        fp16_outputs: List[torch.Tensor],
     ) -> torch.Tensor:
         """
         Compute loss and select best scales
@@ -623,10 +618,10 @@ def _compute_best_scale(
 
             # W * X
             with HooksMixin.disable_hooks():
-                int_w_output = self._run_samples(parent_module)
+                int_w_outputs = self._run_samples(parent_module)
 
             # compute mean squared error (L2 norm)
-            loss = self._compute_loss(fp16_output, int_w_output, device)
+            loss = self._compute_loss(fp16_outputs, int_w_outputs, device)
 
             history.append(loss)
             if loss < best_error:
@@ -648,35 +643,25 @@ def _compute_best_scale(
     @torch.no_grad()
     def _compute_loss(
        self,
-        fp16_output: torch.Tensor,
-        int_w_output: torch.Tensor,
+        fp16_outputs: List[torch.Tensor],
+        int_w_outputs: List[torch.Tensor],
         device: torch.device,
     ) -> torch.Tensor:
         loss = 0.0
-        fp16_output_flat = fp16_output.view(-1)
-        int_w_output_flat = int_w_output.view(-1)
-        num_elements = fp16_output_flat.size(0)
-        element_size_bytes = fp16_output.element_size()
-
-        # Calculate chunk size dynamically based on max_chunk_memory
-        # Divide the max_chunk_memory by twice the element size
-        chunk_size = self.max_chunk_memory // (element_size_bytes * 2)
-        chunk_size = min(chunk_size, num_elements)
-
-        # Split the computation into chunks
-        fp16_chunks = torch.split(fp16_output_flat, chunk_size)
-        int_w_chunks = torch.split(int_w_output_flat, chunk_size)
-
-        # Compute the MSE loss for each chunk
-        for fp16_chunk, int_w_chunk in zip(fp16_chunks, int_w_chunks):
-            chunk_loss = (
-                (fp16_chunk.to(device) - int_w_chunk.to(device))
+        num_elements = 0
+
+        # Compute the MSE loss for each batch
+        for fp16_batch, int_w_batch in zip(fp16_outputs, int_w_outputs):
+            batch_loss = (
+                (fp16_batch.to(device) - int_w_batch.to(device))
+                .view(-1)
                 .float()
                 .pow(2)
                 .sum()
                 .item()
             )
-            loss += chunk_loss
+            loss += batch_loss
+            num_elements += fp16_batch.numel()
 
         # Normalize the loss by the total number of elements
         loss /= num_elements
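As a sanity check on the refactor above, a minimal sketch (toy shapes, not part of the PR's test plan) showing that when batches do share a feature dimension, the batch-wise accumulation matches the normalized loss the old flatten-and-chunk version computed:

import torch

torch.manual_seed(0)
fp16_outputs = [torch.randn(2, 32, 16) for _ in range(3)]
int_w_outputs = [o + 0.05 * torch.randn_like(o) for o in fp16_outputs]

# Old behavior: concatenate, flatten, and normalize the total squared error.
fp16_flat = torch.cat(fp16_outputs, dim=0).view(-1)
int_w_flat = torch.cat(int_w_outputs, dim=0).view(-1)
old_loss = (fp16_flat - int_w_flat).float().pow(2).sum().item() / fp16_flat.numel()

# New behavior: accumulate per batch, then normalize by the total element count.
new_loss, num_elements = 0.0, 0
for fp16_batch, int_w_batch in zip(fp16_outputs, int_w_outputs):
    new_loss += (fp16_batch - int_w_batch).float().pow(2).sum().item()
    num_elements += fp16_batch.numel()
new_loss /= num_elements

# Same loss value (up to float rounding), with no concatenation required.
assert abs(old_loss - new_loss) < 1e-6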
