17 changes: 17 additions & 0 deletions examples/deepseek/README.md
@@ -43,3 +43,20 @@ We provide a one-step script which will:
```bash
./quantize_fp8_to_nvfp4.sh --amax_path $FP4_QUANT_PATH --fp4_output_path $HF_FP4_PATH --fp8_hf_path $HF_FP8_CKPT --world_size 8
```

#### W4AFP8 for V3 & R1

First, prepare a TensorRT-LLM container image, e.g.:
```bash
docker run --rm -it --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --gpus=all \
nvcr.io/nvidia/tensorrt-llm/release
```
Then run ModelOpt inside the container following the [TRT-LLM example](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/models/core/deepseek_v3/README.md?plain=1).
Note that it is fine to simply use the latest DeepSeek-V3.git rather than commit 1398800, since that commit has a dtype bug in the bias proto.
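A minimal sketch of that step, assuming the TRT-LLM example otherwise pins the DeepSeek-V3 repository to that commit:
```bash
# Clone the current DeepSeek-V3 main branch instead of the pinned commit,
# which carries the dtype bug in the bias proto.
git clone https://github.com/deepseek-ai/DeepSeek-V3.git
```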

#### W4AFP8 for V3.1

The basic procedure is the same as for V3, but note two points:
1. Use `config_v3.1.json`, or add `"scale_fmt": "ue8m0"` to `config_671B.json` (a sketch follows this list). `ue8m0` is essential here because it was used in the training of V3.1.
2. Set `gemm_impl` to `fp8` (the default is `bf16`) to enable the `ue8m0` quant kernel.
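A minimal sketch of the config edit in point 1, assuming `config_671B.json` is the config consumed by the quantization scripts and that `jq` is available in the container:
```bash
# Derive a V3.1 config from the V3 config by adding the ue8m0 scale format.
jq '. + {"scale_fmt": "ue8m0"}' config_671B.json > config_v3.1.json
```
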
8 changes: 4 additions & 4 deletions examples/deepseek/ptq.py
@@ -43,7 +43,7 @@
import os
import sys
from pathlib import Path
from typing import Literal
from typing import Optional, Literal

import torch
import torch.distributed as dist
@@ -79,6 +79,7 @@ def linear(
bias: torch.Tensor | None = None,
act_quantizer: TensorQuantizer | None = None,
weight_quantizer: TensorQuantizer | None = None,
scale_fmt: Optional[str] = None,
) -> torch.Tensor:
if weight.element_size() > 1:
if act_quantizer is not None:
@@ -95,9 +96,7 @@

return F.linear(x, weight, bias)
else:
assert weight_quantizer is None
assert act_quantizer is None
x, scale = act_quant(x, block_size)
x, scale = act_quant(x, block_size, scale_fmt)
y = fp8_gemm(x, scale, weight, weight.scale)
if bias is not None:
y += bias
@@ -174,6 +173,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
self.bias,
act_quantizer=self.input_quantizer,
weight_quantizer=self.weight_quantizer,
scale_fmt=self.scale_fmt,
)
return y
