Skip to content

Commit b593884

Browse files
committed
add compression param; update qdq for batch greater than 1
1 parent 5478b43 commit b593884

File tree

3 files changed

+80
-28
lines changed

3 files changed

+80
-28
lines changed

src/compressed_tensors/compressors/quantized_compressors/nvfp4_quantized.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,25 @@ def compression_param_names(self) -> Tuple[str]:
6060
"weight_zero_point",
6161
"weight_global_scale",
6262
)
63+
64+
def compression_param_info(
    self,
    weight_shape: torch.Size,
    quantization_args: Optional["QuantizationArgs"] = None,
) -> Dict[str, Tuple[torch.Size, torch.dtype]]:
    """
    Creates a dictionary of expected shapes and dtypes for each compression
    parameter used by the compressor.

    :param weight_shape: uncompressed weight shape
    :param quantization_args: quantization parameters for the weight
        (currently unused here — kept for interface parity with other
        compressors; TODO confirm whether packed shape should depend on it)
    :return: dictionary mapping compressed parameter names to shape and dtype
    """
    rows, cols = weight_shape[0], weight_shape[1]
    # NVFP4 packs two FP4 values per uint8; use ceil division so an odd
    # column count still reserves a byte for the final lone nibble
    # (identical to the previous `cols // 2` whenever cols is even).
    packed_cols = (cols + 1) // 2
    return {
        "weight_packed": (torch.Size((rows, packed_cols)), torch.uint8),
    }
81+
6382

6483
def compress_weight(
6584
self,

src/compressed_tensors/quantization/lifecycle/forward.py

Lines changed: 34 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -253,13 +253,14 @@ def _process_quantization(
253253
QuantizationStrategy.GROUP,
254254
QuantizationStrategy.TENSOR_GROUP,
255255
):
256+
"""
256257
n_dims = x.shape
257258
if len(n_dims) > 2:
258259
x = x.squeeze(0)
259-
260+
"""
260261
output_dtype = dtype if dtype is not None else x.dtype
261262
output = torch.zeros_like(x).to(output_dtype)
262-
columns = output.shape[1]
263+
columns = output.shape[-1]
263264

264265
# TODO: make validation step for inputs
265266

@@ -289,14 +290,25 @@ def _process_quantization(
289290
perm = torch.argsort(g_idx)
290291
x = safe_permute(x, perm, dim=1)
291292

292-
x = torch.reshape(
293-
x,
294-
(
295-
x.shape[0],
296-
ceil(x.shape[1] / group_size),
297-
group_size,
298-
),
299-
)
293+
if len(x.shape) > 2:
294+
x = torch.reshape(
295+
x,
296+
(
297+
x.shape[0],
298+
x.shape[1],
299+
ceil(x.shape[-1] / group_size),
300+
group_size,
301+
),
302+
)
303+
else:
304+
x = torch.reshape(
305+
x,
306+
(
307+
x.shape[0],
308+
ceil(x.shape[-1] / group_size),
309+
group_size,
310+
),
311+
)
300312

301313
if do_quantize:
302314
output = _quantize(
@@ -319,18 +331,24 @@ def _process_quantization(
319331
global_scale=global_scale,
320332
)
321333

322-
output = torch.reshape(
323-
output,
324-
(output.shape[0], output.shape[1] * output.shape[2]),
325-
)
334+
if len(x.shape) > 3:
335+
output = torch.reshape(
336+
output,
337+
(output.shape[0], output.shape[1], output.shape[-1] * output.shape[-2]),
338+
)
339+
else:
340+
output = torch.reshape(
341+
output,
342+
(output.shape[0], output.shape[-1] * output.shape[-2]),
343+
)
326344

327345
output = output.to(output_dtype)
328346

329347
if not is_column_order:
330348
output = safe_permute(output, torch.argsort(perm), dim=1)
331349

332-
if len(n_dims) > 2:
333-
output = output.unsqueeze(0)
350+
#if len(n_dims) > 2:
351+
# output = output.unsqueeze(0)
334352

335353
else: # covers channel, token and tensor strategies
336354
if do_quantize:

src/compressed_tensors/quantization/utils/helpers.py

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -167,28 +167,43 @@ def compute_dynamic_scales_and_zp(
167167

168168
keep_dims = True
169169
if args.strategy == QuantizationStrategy.TOKEN:
170-
dim = {1, 2}
170+
dim = {0, 1, 2}
171171
reduce_dims = tuple(idx for idx in range(value.ndim) if idx not in dim)
172172
elif args.strategy == QuantizationStrategy.TENSOR:
173173
reduce_dims = None
174174
elif args.strategy in (
175175
QuantizationStrategy.TENSOR_GROUP,
176176
QuantizationStrategy.GROUP,
177177
):
178+
#if len(value.shape) > 2:
179+
# value = value.squeeze(0)
178180
if len(value.shape) > 2:
179-
value = value.squeeze(0)
181+
dim = {0, 1, 2}
182+
else:
183+
dim = {0, 1}
180184

181-
dim = {0, 1}
182-
reduce_dims = tuple(idx for idx in range(3) if idx not in dim)
185+
reduce_dims = tuple(idx for idx in range(len(value.shape) + 1) if idx not in dim)
183186
keep_dims = False
184-
value = torch.reshape(
185-
value,
186-
(
187-
value.shape[0],
188-
math.ceil(value.shape[1] / args.group_size),
189-
args.group_size,
190-
),
191-
)
187+
188+
if len(value.shape) > 2:
189+
value = torch.reshape(
190+
value,
191+
(
192+
value.shape[0],
193+
value.shape[1],
194+
math.ceil(value.shape[-1] / args.group_size),
195+
args.group_size,
196+
),
197+
)
198+
else:
199+
value = torch.reshape(
200+
value,
201+
(
202+
value.shape[0],
203+
math.ceil(value.shape[-1] / args.group_size),
204+
args.group_size,
205+
),
206+
)
192207
else:
193208
supported_strategies = (
194209
QuantizationStrategy.TOKEN,

0 commit comments

Comments
 (0)