Commit 1d172ce

Add fp4 support
1 parent 100a7aa commit 1d172ce

5 files changed: +158 -0 lines changed

examples/dynamo/vgg16_ptq.py

Lines changed: 4 additions & 0 deletions

@@ -200,6 +200,8 @@ def calibrate_loop(model):
     quant_cfg = mtq.INT8_DEFAULT_CFG
 elif args.quantize_type == "fp8":
     quant_cfg = mtq.FP8_DEFAULT_CFG
+elif args.quantize_type == "fp4":
+    quant_cfg = mtq.NVFP4_DEFAULT_CFG
 # PTQ with in-place replacement to quantized modules
 mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)
 # model has FP8 qdq nodes at this point

@@ -239,6 +241,8 @@ def calibrate_loop(model):
     enabled_precisions = {torch.int8}
 elif args.quantize_type == "fp8":
     enabled_precisions = {torch.float8_e4m3fn}
+elif args.quantize_type == "fp4":
+    enabled_precisions = {torch.float4_e2m1fn_x2}
 trt_model = torchtrt.dynamo.compile(
     exp_program,
     inputs=[input_tensor],
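
With these two branches in place, the example script's existing PTQ-then-compile flow also covers FP4 (presumably selected via --quantize_type fp4, given the args.quantize_type check). A minimal sketch of that flow, assuming model, calibrate_loop, input_tensor, and the exported exp_program are defined as elsewhere in the script and that the installed PyTorch exposes torch.float4_e2m1fn_x2:

import modelopt.torch.quantization as mtq
import torch
import torch_tensorrt as torchtrt

# Pick the NVFP4 recipe (the new branch above), then calibrate in place
quant_cfg = mtq.NVFP4_DEFAULT_CFG
mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)

# Compile the exported program with FP4 listed as an enabled precision
trt_model = torchtrt.dynamo.compile(
    exp_program,
    inputs=[input_tensor],
    enabled_precisions={torch.float4_e2m1fn_x2},
)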

py/torch_tensorrt/_enums.py

Lines changed: 15 additions & 0 deletions

@@ -76,6 +76,12 @@ class dtype(Enum):

     f8 = auto()
     """8 bit floating-point number, equivalent to ``dtype.fp8`` and ``dtype.float8``
+
+    :meta hide-value:
+    """
+
+    f4 = auto()
+    """4 bit floating-point number, equivalent to ``dtype.fp4`` and ``dtype.float4``

     :meta hide-value:
     """

@@ -90,6 +96,7 @@ class dtype(Enum):

     float8 = f8
     fp8 = f8
+    fp4 = f4

     half = f16
     fp16 = f16

@@ -162,6 +169,8 @@ def _from(
             return dtype.i32
         elif t == torch.float8_e4m3fn:
             return dtype.f8
+        elif t == torch.float4_e2m1fn_x2:
+            return dtype.f4
         elif t == torch.half:
             return dtype.f16
         elif t == torch.float:

@@ -188,6 +197,8 @@ def _from(
             return dtype.i8
         elif t == trt.DataType.FP8:
             return dtype.f8
+        elif t == trt.DataType.FP4:
+            return dtype.fp4
         elif t == trt.DataType.INT32:
             return dtype.i32
         elif t == trt.DataType.INT64:

@@ -357,6 +368,8 @@ def to(
             return torch.long
         elif self == dtype.f8:
             return torch.float8_e4m3fn
+        elif self == dtype.f4:
+            return torch.float4_e2m1fn_x2
         elif self == dtype.f16:
             return torch.half
         elif self == dtype.f32:

@@ -410,6 +423,8 @@ def to(
             return np.int64
         elif self == dtype.f16:
             return np.float16
+        elif self == dtype.f4:
+            return np.float4_e2m1fn_x2
         elif self == dtype.f32:
             return np.float32
         elif self == dtype.f64:
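
With the new enum member and its aliases, the FP4 type should round-trip between the PyTorch, TensorRT, and torch_tensorrt representations. A minimal sketch, assuming a PyTorch build that exposes torch.float4_e2m1fn_x2 and a TensorRT build that exposes trt.DataType.FP4:

import tensorrt as trt
import torch
from torch_tensorrt import dtype

assert dtype._from(torch.float4_e2m1fn_x2) is dtype.f4     # torch dtype -> enum
assert dtype._from(trt.DataType.FP4) is dtype.f4           # TensorRT dtype -> enum (fp4 aliases f4)
assert dtype.f4.to(torch.dtype) is torch.float4_e2m1fn_x2  # enum -> torch dtype

One caveat: the added NumPy branch returns np.float4_e2m1fn_x2, but stock NumPy does not define an FP4 dtype, so that path likely needs a guard or a different target type.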

py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py

Lines changed: 33 additions & 0 deletions

@@ -617,6 +617,39 @@ def aten_ops_quantize_op(
     )


+try:
+    import modelopt.torch.quantization as mtq  # noqa: F401
+
+    assert torch.ops.tensorrt.dynamic_block_quantize_op.default
+except Exception as e:
+    _LOGGER.warning(
+        "Unable to import dynamic block quantize op. Please install the modelopt library (https://github.com/NVIDIA/TensorRT-Model-Optimizer?tab=readme-ov-file#installation) to add support for compiling dynamic block quantized models"
+    )
+else:
+
+    @dynamo_tensorrt_converter(torch.ops.tensorrt.dynamic_block_quantize_op.default)
+    def aten_ops_dynamic_block_quantize_op(
+        ctx: ConversionContext,
+        target: Target,
+        args: Tuple[Argument, ...],
+        kwargs: Dict[str, Argument],
+        name: str,
+    ) -> Union[TRTTensor, Sequence[TRTTensor]]:
+        return impl.quantize.dynamic_block_quantize(
+            ctx,
+            target,
+            SourceIR.ATEN,
+            name,
+            args[0],
+            args[1],
+            args[2],
+            args[3],
+            args[4],
+            args[5],
+            args[6],
+        )
+
+
 @dynamo_tensorrt_converter(torch.ops.aten.squeeze.dim, supports_dynamic_shapes=True)
 @dynamo_tensorrt_converter(torch.ops.aten.squeeze.dims, supports_dynamic_shapes=True)
 def aten_ops_squeeze(
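
The seven positional arguments forwarded here line up with the parameters of impl.quantize.dynamic_block_quantize in the next file; the layout below is inferred from that signature rather than from ModelOpt documentation:

# Inferred argument layout of the registered op (names are hypothetical):
# torch.ops.tensorrt.dynamic_block_quantize_op(
#     input_tensor,        # args[0]
#     block_size,          # args[1]
#     amax,                # args[2]
#     num_bits,            # args[3]
#     exponent_bits,       # args[4]
#     scale_num_bits,      # args[5]
#     scale_exponent_bits, # args[6]
# )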

py/torch_tensorrt/dynamo/conversion/impl/quantize.py

Lines changed: 52 additions & 0 deletions

@@ -67,3 +67,55 @@ def quantize(
     dq_output = dequantize_layer.get_output(0)

     return dq_output
+
+def dynamic_block_quantize(
+    ctx: ConversionContext,
+    target: Target,
+    source_ir: Optional[SourceIR],
+    name: str,
+    input_tensor: TRTTensor,
+    block_size: int,
+    amax: Union[np.ndarray, torch.Tensor],
+    num_bits: int,
+    exponent_bits: int,
+    scale_num_bits: int,
+    scale_exponent_bits: int,
+) -> TRTTensor:
+    """
+    Adds quantize and dequantize ops (QDQ) which quantize to FP4 based
+    on the output_type set and dequantize the result back.
+    """
+    with unset_fake_temporarily():
+        if isinstance(input_tensor, TRTTensor) and input_tensor.dtype not in (
+            trt.float32,
+            trt.float16,
+            trt.bfloat16,
+        ):
+            raise ValueError(
+                f"dynamic_block_quantize converter received an input of {input_tensor.dtype} type. Supported types: float32 | float16 | bfloat16"
+            )
+        if len(input_tensor.shape) not in (2, 3):
+            raise ValueError(
+                f"dynamic_block_quantize converter received an input of {input_tensor.shape} shape. Supported shapes: 2D or 3D"
+            )
+        print(
+            f"input_tensor.shape: {input_tensor.shape} {block_size=} {amax=} {num_bits=} {exponent_bits=} {scale_num_bits=} {scale_exponent_bits=}"
+        )
+        # Derive the per-tensor scale from the calibrated amax
+        max_bound = 6
+        amax = to_torch(amax, None)
+        scale = torch.divide(amax, max_bound)
+        scale = get_trt_tensor(ctx, scale, name + "_scale")
+
+        output_type = trt.DataType.FP4
+        # Add Q node (block size is fixed at 16 here; the incoming block_size argument is not used)
+        dynamic_quantize_layer = ctx.net.add_dynamic_quantize(
+            input_tensor, axis=-1, block_size=16, output_type=output_type
+        )
+        dynamic_quantize_layer.set_output_type(0, output_type)
+        set_layer_name(dynamic_quantize_layer, target, name + "_quantize", source_ir)
+        q_output = dynamic_quantize_layer.get_output(0)
+        # Add DQ node
+        dequantize_layer = ctx.net.add_dequantize(q_output, scale)
+        set_layer_name(dequantize_layer, target, name + "_dequantize", source_ir)
+        dequantize_layer.precision = output_type
+        dq_output = dequantize_layer.get_output(0)
+
+        return dq_output
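
For reference, the scale derived above divides the calibrated amax by 6.0, the largest magnitude representable in FP4 (E2M1); that per-tensor scale is what feeds the dequantize layer. A small standalone sketch with a hypothetical amax value:

import torch

amax = torch.tensor(4.8)       # hypothetical calibrated abs-max of the tensor
max_bound = 6                  # largest magnitude representable in FP4 (E2M1)
scale = torch.divide(amax, max_bound)
print(scale)                   # tensor(0.8000)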

tests/py/dynamo/models/test_models_export.py

Lines changed: 54 additions & 0 deletions

@@ -199,6 +199,60 @@ def test_resnet18_half(ir):
     torch._dynamo.reset()


+@unittest.skipIf(
+    torch.cuda.get_device_capability() < (8, 9),
+    "FP4 quantization requires compute capability 8.9 or later",
+)
+@unittest.skipIf(
+    not importlib.util.find_spec("modelopt"),
+    "ModelOpt is required to run this test",
+)
+@pytest.mark.unit
+def test_base_fp4(ir):
+    import modelopt.torch.quantization as mtq
+    from modelopt.torch.quantization.utils import export_torch_mode
+
+    class SimpleNetwork(torch.nn.Module):
+        def __init__(self):
+            super(SimpleNetwork, self).__init__()
+            self.linear1 = torch.nn.Linear(in_features=10, out_features=5)
+            self.linear2 = torch.nn.Linear(in_features=5, out_features=1)
+
+        def forward(self, x):
+            x = self.linear1(x)
+            x = torch.nn.ReLU()(x)
+            x = self.linear2(x)
+            return x
+
+    def calibrate_loop(model):
+        """Simple calibration function for testing."""
+        model(input_tensor)
+
+    input_tensor = torch.randn(1, 10).cuda()
+    model = SimpleNetwork().eval().cuda()
+
+    quant_cfg = mtq.NVFP4_DEFAULT_CFG
+    mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)
+    # model has FP4 qdq nodes at this point
+    output_pyt = model(input_tensor)
+
+    with torch.no_grad():
+        with export_torch_mode():
+            exp_program = torch.export.export(model, (input_tensor,), strict=False)
+            trt_model = torchtrt.dynamo.compile(
+                exp_program,
+                inputs=[input_tensor],
+                enabled_precisions={torch.float4_e2m1fn_x2},
+                min_block_size=1,
+                debug=True,
+                cache_built_engines=False,
+                reuse_cached_engines=False,
+            )
+            outputs_trt = trt_model(input_tensor)
+            assert torch.allclose(output_pyt, outputs_trt, rtol=1e-3, atol=5e-1)
+
+
 @unittest.skipIf(
     torch.cuda.get_device_capability() < (8, 9),
     "FP8 quantization requires compute capability 8.9 or later",
