Commit 8e3bfb5

Fix NF4 scale padding (#183)
Co-authored-by: ishan-modi <[email protected]>
1 parent: cafa7f6

File tree: 5 files changed, +18 −8 lines

modelopt/torch/quantization/nn/modules/tensor_quantizer.py

Lines changed: 3 additions & 1 deletion
@@ -575,7 +575,9 @@ def _real_quantize(self, inputs):
         ):
             # NF4 double quantization
             # Return real quantized tensor class and store scales inside the TensorQuantizer
-            outputs, scales = NF4QTensor.quantize(inputs, self._block_sizes[-1])
+            outputs, scales = NF4QTensor.quantize(
+                inputs, self._block_sizes[-1], self._block_sizes["scale_block_sizes"][-1]
+            )
             _scale, _double_scale, _scale_zeros = NF4QTensor.double_quantization(
                 scales,
                 self._block_sizes["scale_block_sizes"][-1],
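
For context, the nested block_sizes layout this call indexes into follows the shape the tests construct (the values below are illustrative, not defaults):

# Illustrative block_sizes config: weights quantized in blocks of 64 along
# the last dim, per-block scales grouped into blocks of 256.
block_sizes = {-1: 64, "scale_bits": 8, "scale_block_sizes": {-1: 256}}

weight_block = block_sizes[-1]                      # 64
scale_block = block_sizes["scale_block_sizes"][-1]  # 256, the new third argument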

modelopt/torch/quantization/qtensor/int4_tensor.py

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ def _get_quant_maxbound(num_bits):
         return 2 ** (num_bits - 1) - 1

     @classmethod
-    def quantize(cls, input: torch.Tensor, block_size: int) -> torch.Tensor:
+    def quantize(cls, input: torch.Tensor, block_size: int) -> tuple:
         """Converting a tensor to a quantized format based on INT4 (AWQ) quantization.

         Args:

modelopt/torch/quantization/qtensor/nf4_tensor.py

Lines changed: 8 additions & 1 deletion
@@ -74,7 +74,9 @@ class NF4QTensor(BaseQuantizedTensor):
     """

     @classmethod
-    def quantize(cls, input: torch.Tensor, block_size: int) -> torch.Tensor:
+    def quantize(
+        cls, input: torch.Tensor, block_size: int, scale_block_size: int | None = None
+    ) -> tuple:
         """Converting a tensor to a quantized format based on NF4 double quantization.

         Args:
@@ -116,6 +118,8 @@ def quantize(cls, input: torch.Tensor, block_size: int) -> torch.Tensor:
         # | byte | byte | byte |
         packed_output_uint8 = quantized_output_uint8[::2] << 4 | quantized_output_uint8[1::2]

+        # pad the scales if needed
+        scales = reduce_block_padding(scales.view(-1), block_sizes={-1: scale_block_size})
         return cls(original_input.shape, original_input.dtype, packed_output_uint8), scales

     @classmethod
@@ -159,6 +163,9 @@ def dequantize(self, dtype: torch.dtype = None, **kwarg):
         double_scale = kwarg["double_scale"]
         scale_zeros = kwarg["scale_zeros"]

+        # unpad the scales if needed
+        scales = scales.view(-1)[: (self._quantized_data.numel() * 2) // block_sizes[-1]]
+
         if cuda_ext and self._quantized_data.is_cuda:
             # with a custom cuda kernel
             scales = _dequantize_scalers(scales, double_scale, scale_zeros, dtype).flatten()
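
A minimal sketch of the pad/unpad round trip these two hunks implement, assuming block_size=64 and scale_block_size=256; _pad_to_multiple is a hypothetical stand-in for modelopt's reduce_block_padding, written out only to illustrate the idea:

import torch

def _pad_to_multiple(scales: torch.Tensor, scale_block_size: int) -> torch.Tensor:
    """Right-pad a flattened scale tensor to a multiple of scale_block_size."""
    remainder = scales.numel() % scale_block_size
    if remainder == 0:
        return scales
    return torch.nn.functional.pad(scales, (0, scale_block_size - remainder))

# Quantize path: a 256x32 weight with block_size=64 yields 128 per-block
# scales, which must be padded up to the 256-wide scale blocks.
scales = torch.rand(128)
padded = _pad_to_multiple(scales, 256)
assert padded.numel() == 256

# Dequantize path: recover the true scale count from the packed data size,
# mirroring `scales.view(-1)[: (self._quantized_data.numel() * 2) // block_sizes[-1]]`.
packed_numel = (256 * 32) // 2          # two 4-bit NF4 codes per uint8 byte
num_scales = (packed_numel * 2) // 64   # elements // block_size
assert padded[:num_scales].numel() == 128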

tests/gpu/torch/quantization/test_qtensor_cuda.py

Lines changed: 6 additions & 3 deletions
@@ -30,7 +30,10 @@ class TestQTensor:
     )
     @pytest.mark.parametrize("device", ["cpu", "cuda"])
     @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float16, torch.bfloat16])
-    def test_qtensor(self, num_bits, block_sizes, device, input_dtype):
+    @pytest.mark.parametrize(
+        ("input_shape", "check_memory"), [((256, 64), True), ((256, 32), False)]
+    )
+    def test_qtensor(self, num_bits, block_sizes, device, input_dtype, input_shape, check_memory):
         nf4_attr_cfg = QuantizerAttributeConfig(
             num_bits=num_bits,
             block_sizes=block_sizes,
@@ -40,7 +43,7 @@ def test_qtensor(self, num_bits, block_sizes, device, input_dtype):

         # Original tensor
         base_mem = torch.cuda.memory_allocated("cuda")
-        x = torch.rand(256, 64).to(device).to(dtype=input_dtype)
+        x = torch.rand(input_shape).to(device).to(dtype=input_dtype)
         x_allocated = torch.cuda.memory_allocated("cuda")
         bf16_mem_usage = x_allocated - base_mem

@@ -51,7 +54,7 @@ def test_qtensor(self, num_bits, block_sizes, device, input_dtype):
         nf4_mem_usage = nf4_x_allocated - base_mem

         # Check the memory saving
-        if bf16_mem_usage > 0:
+        if bf16_mem_usage > 0 and check_memory:
             assert (nf4_mem_usage) / bf16_mem_usage < 0.3

         # De-quantize to origin dtype
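
Back-of-the-envelope arithmetic behind the < 0.3 threshold for the (256, 64) case, a sketch assuming NF4 packs two 4-bit codes per byte; the (256, 32) case presumably skips the check because scale padding inflates the relative footprint of small tensors:

elems = 256 * 64
bf16_bytes = elems * 2         # 2 bytes per bf16 element: 32768
nf4_bytes = elems // 2         # two 4-bit NF4 codes per byte: 8192
print(nf4_bytes / bf16_bytes)  # 0.25 before scales, leaving headroom under 0.3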

tests/gpu/torch/quantization/test_real_quantize_cuda.py

Lines changed: 0 additions & 2 deletions
@@ -47,7 +47,6 @@ def test_real_quantize(model_cls, config):
     config["quant_cfg"]["*weight_quantizer"]["block_sizes"] = {
         -1: 16,
         "scale_bits": 8,
-        "scale_block_sizes": {-1: 16},
     }
     if model_cls is SimpleConv or model_cls is SimpleConvLinear:
         pytest.skip(
@@ -102,7 +101,6 @@ def test_save_restore(model_cls, config):
     config["quant_cfg"]["*weight_quantizer"]["block_sizes"] = {
         -1: 16,
         "scale_bits": 8,
-        "scale_block_sizes": {-1: 16},
     }
     if model_cls is SimpleConv or model_cls is SimpleConvLinear:
         pytest.skip(
