diff --git a/examples/deepseek/README.md b/examples/deepseek/README.md
index 34097f22..46b08c6b 100644
--- a/examples/deepseek/README.md
+++ b/examples/deepseek/README.md
@@ -43,3 +43,20 @@ We provide a one-step-script which will:
 ```bash
 ./quantize_fp8_to_nvfp4.sh --amax_path $FP4_QUANT_PATH --fp4_output_path $HF_FP4_PATH --fp8_hf_path $HF_FP8_CKPT --world_size 8
 ```
+
+#### W4AFP8 for V3 & R1
+
+First, prepare a TensorRT-LLM container image, for example:
+```bash
+docker run --rm -it --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --gpus=all \
+    nvcr.io/nvidia/tensorrt-llm/release
+```
+Then run ModelOpt inside the container, following the [TensorRT-LLM DeepSeek-V3 example](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/models/core/deepseek_v3/README.md?plain=1).
+Note that you should use the latest DeepSeek-V3 repository, because commit 1398800 has a dtype bug in the bias proto.
+
+#### W4AFP8 for V3.1
+
+The basic procedure is the same as for V3.
+However, note two points:
+1. Use `config_v3.1.json`, or add `"scale_fmt": "ue8m0"` to `config_671B.json`. `ue8m0` is essential because it was used during V3.1 training.
+2. Set `gemm_impl` to `fp8` (the default is `bf16`) to enable the ue8m0 quant kernel.
diff --git a/examples/deepseek/ptq.py b/examples/deepseek/ptq.py
index 06407121..c6fa7361 100644
--- a/examples/deepseek/ptq.py
+++ b/examples/deepseek/ptq.py
@@ -43,7 +43,7 @@
 import os
 import sys
 from pathlib import Path
-from typing import Literal
+from typing import Optional, Literal
 
 import torch
 import torch.distributed as dist
@@ -79,6 +79,7 @@ def linear(
     bias: torch.Tensor | None = None,
     act_quantizer: TensorQuantizer | None = None,
     weight_quantizer: TensorQuantizer | None = None,
+    scale_fmt: Optional[str] = None,
 ) -> torch.Tensor:
     if weight.element_size() > 1:
         if act_quantizer is not None:
@@ -95,9 +96,7 @@
         return F.linear(x, weight, bias)
     else:
-        assert weight_quantizer is None
-        assert act_quantizer is None
-        x, scale = act_quant(x, block_size)
+        x, scale = act_quant(x, block_size, scale_fmt)
         y = fp8_gemm(x, scale, weight, weight.scale)
         if bias is not None:
             y += bias
@@ -174,6 +173,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             self.bias,
             act_quantizer=self.input_quantizer,
             weight_quantizer=self.weight_quantizer,
+            scale_fmt=self.scale_fmt,
         )
         return y
 
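For the V3.1 note above, the sketch below shows one way to add the `"scale_fmt": "ue8m0"` entry to `config_671B.json`. It assumes `jq` is installed and that the config file is in the current directory; the output file name is illustrative.

```bash
# Write a copy of config_671B.json with the ue8m0 scale format enabled
# (paths and the output name are placeholders -- adapt them to your checkout).
jq '. + {"scale_fmt": "ue8m0"}' config_671B.json > config_671B_ue8m0.json
```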
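To illustrate what the `scale_fmt` argument threaded through `ptq.py` changes: UE8M0 is an exponent-only scale format, so per-block activation scales become powers of two instead of arbitrary FP32 values. The following is a minimal, self-contained sketch of that idea, not the project's actual `act_quant` kernel; the helper name and the round-up choice for the exponent are assumptions.

```python
import torch


def act_quant_sketch(
    x: torch.Tensor, block_size: int = 128, scale_fmt: str | None = None
) -> tuple[torch.Tensor, torch.Tensor]:
    """Illustrative block-wise FP8 activation quantization (not the repo's act_quant).

    Assumes the last dimension of `x` is divisible by `block_size`; each block of
    `block_size` values shares one scale. With scale_fmt="ue8m0" the scale is rounded
    up to a power of two, since UE8M0 stores only an 8-bit exponent (no sign/mantissa).
    """
    orig_shape = x.shape
    x = x.reshape(-1, block_size).float()
    amax = x.abs().amax(dim=-1, keepdim=True).clamp_min(1e-4)
    scale = amax / 448.0  # 448 = max normal value of float8_e4m3fn
    if scale_fmt == "ue8m0":
        # Round up so every value in the block still fits into FP8 after scaling.
        scale = torch.exp2(torch.ceil(torch.log2(scale)))
    x_q = (x / scale).clamp(-448.0, 448.0).to(torch.float8_e4m3fn)
    return x_q.reshape(orig_shape), scale
```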