-from typing import Optional, Tuple, Union
+from typing import Optional, Union
 
 import numpy as np
 import tensorrt as trt
@@ -92,90 +92,64 @@ def nvfp4_quantize(
         f"lan added nvfp4_quantize entered: {target=} {source_ir=} {name=} {input_tensor.shape=} {input_tensor.dtype=} {block_size=} {amax=} {num_bits=} {exponent_bits=} {scale_num_bits=} {scale_exponent_bits=}"
     )
     with unset_fake_temporarily():
-        if not isinstance(input_tensor, TRTTensor):
-            input_tensor = get_trt_tensor(ctx, input_tensor, name + "_input")
         if input_tensor.dtype not in (
             trt.float32,
             trt.float16,
             trt.bfloat16,
+            torch.float32,
+            torch.float16,
+            torch.bfloat16,
         ):
             raise ValueError(
                 f"dynamic_block_quantize converter received an input of {input_tensor.dtype} type. Supported types: float32 | float16 | bfloat16"
             )
-        # TODO: ADD PADDING IF
-
-        # calculate global scale (the global per-tensor scaling factor, should only contain 1 element)
-        amax = to_torch(
-            amax, None
-        )  # amax is calculated from input_tensor.abs().amax().float()
-        global_scale = torch.divide(amax, 6)
-        global_scale = get_trt_tensor(ctx, global_scale, name + "_global_scale")
-
-        if ".weight_quantizer" in name:
-            # calculate block scaling factor of weights
-            [n, k] = input_tensor.shape[-2:]
-            assert block_size != 0, "block_size must be non-zero"
-            assert k % block_size == 0, "k must be a multiple of block_size"
-            reshaped_input_tensor = input_tensor.reshape(
-                tuple(input_tensor.shape[:-2]) + (n, k // block_size, block_size)
+        if len(input_tensor.shape) not in (2, 3):
+            raise ValueError(
+                f"dynamic_block_quantize converter received an input of {input_tensor.shape} shape. Supported shapes: 2D or 3D"
             )
-            per_block_amax = reshaped_input_tensor.abs().amax(dim=-1).float()
-            per_block_scale = torch.divide(per_block_amax, 6)
+        axis = len(input_tensor.shape) - 1
 
-            per_block_scale = get_trt_tensor(
-                ctx, per_block_scale, name + "_per_block_scale"
-            )
+        # TODO: ADD PADDING IF NEEDED
+        # TODO: ADD DYNAMIC SHAPE SUPPORT
 
-            # static double quantization is used for weights
-            quantized_data_in_fp4, quantized_block_scale_in_fp8 = (
-                _static_double_quantize(
-                    ctx,
-                    target,
-                    source_ir,
-                    name,
-                    input_tensor,
-                    per_block_scale,
-                    global_scale,
-                )
-            )
-            output = _block_double_dequantize(
+        global_scale = _calculate_global_scale(ctx, name, amax)
+
+        if ".weight_quantizer" in name:
+            block_scale = _calculate_block_scale(
                 ctx,
-                target,
-                source_ir,
                 name,
-                quantized_data_in_fp4,
-                quantized_block_scale_in_fp8,
-                global_scale,
+                input_tensor,
+                block_size,
             )
-        elif ".input_quantizer" in name:
-            # quantize input tensor to fp4, output should be data tensor in fp4 and block scale tensor in fp8
-            quantized_data_in_fp4, quantized_scale_in_fp8 = _dynamic_quantize(
+            input_tensor = get_trt_tensor(ctx, input_tensor, name + "_input")
+            output = _static_double_quantize(
                 ctx,
                 target,
                 source_ir,
                 name,
                 input_tensor,
+                block_scale,
                 global_scale,
             )
-            # Add double DQ node
-            output = _block_double_dequantize(
+        elif ".input_quantizer" in name:
+            # quantize input tensor to fp4, output should be data tensor in fp4 and block scale tensor in fp8
+            output = _dynamic_double_quantize(
                 ctx,
                 target,
                 source_ir,
                 name,
-                quantized_data_in_fp4,
-                quantized_scale_in_fp8,
+                input_tensor,
                 global_scale,
-                input_tensor.dtype,
             )
+
         else:
             raise ValueError(
-                f"dynamic_block_quantize converter received an input of {name} name. Supported names: weight_quantizer | input_quantizer"
+                f"quantizer received an input of {name}. Supported values: weight_quantizer | input_quantizer"
             )
         return output
 
 
-def _dynamic_quantize(
+def _dynamic_double_quantize(
     ctx: ConversionContext,
     target: Target,
     source_ir: Optional[SourceIR],
@@ -186,7 +160,7 @@ def _dynamic_quantize(
     block_size: int = 16,
     output_type: trt.DataType = trt.DataType.FP4,
     scale_type: trt.DataType = trt.DataType.FP8,
-) -> Tuple[TRTTensor, TRTTensor]:
+) -> TRTTensor:
     """
     quantize input tensor to fp4, output should be data tensor in fp4 and block scale tensor in fp8
     Parameters:
@@ -202,20 +176,13 @@ def _dynamic_quantize(
         The axis to quantize. Default is -1 (the last axis).
     block_size : int
         The block size for quantization. Default is 16.
-    data_qtype : trt.DataType
+    output_type : trt.DataType
         The data type for quantized data. Default is FP4.
-    scale_qtype : trt.DataType
+    scale_type : trt.DataType
         The data type for block scale. Default is FP8.
-    Returns:
-        A tuple of two tensors: quantized data tensor in fp4 and quantized scale tensor in fp8.
+
     """
-    if len(input_tensor.shape) not in (2, 3):
-        raise ValueError(
-            f"dynamic_block_quantize converter received an input of {input_tensor.shape} shape. Supported shapes: 2D or 3D"
-        )
-    if axis < 0:
-        axis = len(input_tensor.shape) + axis
-    # Add DYQ node
+    # dynamic quantize input tensor to fp4
     dynamic_quantize_layer = ctx.net.add_dynamic_quantize(
         input_tensor,
         axis,
@@ -229,51 +196,23 @@ def _dynamic_quantize(
     )
     quantized_data_in_fp4 = dynamic_quantize_layer.get_output(0)
     quantized_scale_in_fp8 = dynamic_quantize_layer.get_output(1)
-    return quantized_data_in_fp4, quantized_scale_in_fp8
 
-
-def _block_double_dequantize(
-    ctx: ConversionContext,
-    target: Target,
-    source_ir: Optional[SourceIR],
-    name: str,
-    input_tensor: TRTTensor,
-    scale: TRTTensor,
-    global_scale: TRTTensor,
-    dtype: trt.DataType = trt.DataType.FLOAT,
-) -> TRTTensor:
-    """
-    dequantize input_tensor from fp4 to dtype (default is float32)
-    Parameters:
-        ctx: ConversionContext,
-        target: Target,
-        source_ir: Optional[SourceIR]
-        name: str
-        input_tensor : Tensor (On GPU)
-            The input tensor.
-        scale : Tensor (On GPU)
-            The block scale tensor.
-        global_scale : Tensor (On GPU)
-            The global per-tensor scaling factor. It should contain only 1 element.
-        dtype : trt.DataType | str
-            The data type for dequantized data. Default is float32.
-    Returns:
-        The dequantized tensor.
-    """
-    # dequantize scale from fp8 to dtype (default is float32)
-    dequantize_scale_layer = ctx.net.add_dequantize(scale, global_scale, dtype)
+    # dequantize scale from fp8 to original dtype (default is float32)
+    dequantize_scale_layer = ctx.net.add_dequantize(
+        quantized_scale_in_fp8, global_scale, input_tensor.dtype
+    )
     set_layer_name(
         dequantize_scale_layer, target, name + "_dequantize_scale", source_ir
     )
     dequantized_scale = dequantize_scale_layer.get_output(0)
 
-    # dequantize input_tensor from fp4 to dtype (default is float32)
+    # dequantize quantized_data_in_fp4 from fp4 to original dtype (default is float32)
     dequantize_data_layer = ctx.net.add_dequantize(
-        input_tensor, dequantized_scale, dtype
+        quantized_data_in_fp4, dequantized_scale, input_tensor.dtype
     )
     set_layer_name(dequantize_data_layer, target, name + "_dequantize_data", source_ir)
-    dq_output = dequantize_data_layer.get_output(0)
-    return dq_output
+    dequantized_data = dequantize_data_layer.get_output(0)
+    return dequantized_data
 
 
 def _static_double_quantize(
@@ -282,9 +221,9 @@ def _static_double_quantize(
     source_ir: Optional[SourceIR],
     name: str,
     input_tensor: TRTTensor,
-    per_block_scale: TRTTensor,
+    block_scale: TRTTensor,
     global_scale: TRTTensor,
-) -> Tuple[TRTTensor, TRTTensor]:
+) -> TRTTensor:
     """
     Parameters:
         ctx: ConversionContext,
@@ -293,41 +232,84 @@ def _static_double_quantize(
         name: str,
         input_tensor : Tensor (On GPU)
             The input tensor.
-        per_block_scale : Tensor (On GPU)
+        block_scale : Tensor (On GPU)
             The per-block scaling factor.
         global_scale : Tensor (On GPU)
             The global per-tensor scaling factor. It should contain only 1 element.
     Returns:
         A tuple of two tensors: quantized data tensor in fp4 and quantized block scaling factor tensor in fp8
     """
-
-    block_scale_quantize_layer = ctx.net.add_quantize(per_block_scale, global_scale)
+    # quantize block scale to fp8
+    block_scale_quantize_layer = ctx.net.add_quantize(block_scale, global_scale)
     set_layer_name(
         block_scale_quantize_layer,
         target,
-        name + "_per_block_scale_quantize",
+        name + "_block_scale_quantize",
         source_ir,
     )
     block_scale_quantize_layer.set_output_type(0, trt.DataType.FP8)
     quantized_block_scale_in_fp8 = block_scale_quantize_layer.get_output(0)
 
+    # dequantize block scale from fp8 to original dtype (default is float32)
     dequantize_block_scale_layer = ctx.net.add_dequantize(
         quantized_block_scale_in_fp8,
         global_scale,
-        per_block_scale.dtype,
+        block_scale.dtype,
     )
     set_layer_name(
         dequantize_block_scale_layer,
         target,
         name + "_dequantize_block_scale",
         source_ir,
     )
-    dequantize_block_scale_layer.precision = trt.DataType.FP8
     dequantized_block_scale = dequantize_block_scale_layer.get_output(0)
 
+    # quantize input tensor to fp4
     data_quantize_layer = ctx.net.add_quantize(input_tensor, dequantized_block_scale)
     set_layer_name(data_quantize_layer, target, name + "_data_quantize", source_ir)
     data_quantize_layer.set_output_type(0, trt.DataType.FP4)
     quantized_data_in_fp4 = data_quantize_layer.get_output(0)
 
-    return quantized_data_in_fp4, quantized_block_scale_in_fp8
+    # dequantize input tensor from fp4 to original dtype (default is float32)
+    dequantize_data_layer = ctx.net.add_dequantize(
+        quantized_data_in_fp4,
+        dequantized_block_scale,
+        input_tensor.dtype,
+    )
+    set_layer_name(dequantize_data_layer, target, name + "_dequantize_data", source_ir)
+    dequantized_data = dequantize_data_layer.get_output(0)
+    return dequantized_data
+
+
+def _calculate_global_scale(
+    ctx: ConversionContext,
+    name: str,
+    amax: TRTTensor,
+) -> TRTTensor:
+    # calculate global scale (the global per-tensor scaling factor, should only contain 1 element)
+    amax = to_torch(
+        amax, None
+    )  # amax is calculated from input_tensor.abs().amax().float()
+    global_scale = torch.divide(amax, 6 * 448)
+    global_scale = get_trt_tensor(ctx, global_scale, name + "_global_scale")
+    return global_scale
+
+
+def _calculate_block_scale(
+    ctx: ConversionContext,
+    name: str,
+    input_tensor: TRTTensor,
+    block_size: int,
+) -> TRTTensor:
+
+    [n, k] = input_tensor.shape[-2:]
+    assert block_size != 0, "block_size must be non-zero"
+    assert k % block_size == 0, "k must be a multiple of block_size"
+    reshaped_input_tensor = input_tensor.reshape(
+        tuple(input_tensor.shape[:-2]) + (n, k // block_size, block_size)
+    )
+    block_amax = reshaped_input_tensor.abs().amax(dim=-1).float()
+    block_scale = torch.divide(block_amax, 6)
+
+    block_scale = get_trt_tensor(ctx, block_scale, name + "_block_scale")
+    return block_scale
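
Note on the scale arithmetic in the new helpers: _calculate_global_scale and _calculate_block_scale implement the two levels of NVFP4 double quantization. The change from torch.divide(amax, 6) to torch.divide(amax, 6 * 448) appears to reflect that the per-block scales are themselves stored in FP8: 6 is the largest magnitude representable in FP4 (E2M1) and 448 the largest in FP8 (E4M3), so the global scale must map block scales into the FP8 range rather than only mapping data into the FP4 range. A minimal eager-mode sketch of the same arithmetic, assuming a 2D or 3D torch.Tensor weight (the function name below is hypothetical, for illustration only, not part of the converter):

    import torch

    def reference_nvfp4_scales(weight: torch.Tensor, block_size: int = 16):
        # global per-tensor scale: amax / (fp4_max * fp8_max) = amax / (6 * 448)
        global_scale = weight.abs().amax().float() / (6 * 448)
        # per-block scale over the last axis: block_amax / fp4_max
        n, k = weight.shape[-2:]
        blocks = weight.reshape(*weight.shape[:-2], n, k // block_size, block_size)
        block_scale = blocks.abs().amax(dim=-1).float() / 6
        return global_scale, block_scale

The converter then quantizes the block scales against the global scale to FP8 and the data against the dequantized block scales to FP4, mirroring the quantize/dequantize layer pairs in _static_double_quantize above.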