Commit 5ade7b0

[NNBUG: 5701866] Update DS V3.2 PTQ code (NVIDIA#630)
## What does this PR do?

**Type of change:** Bug fix

**Overview:**

1) Update the DS V3.2 repo code reference to the latest version.
2) The new DS V3.2 model now includes fp32 layers; we cast them down to match the checkpoint format during loading, then restore them to fp32 afterwards.
3) Fix the get_quant_config API change.

## Testing

Generate the deepseek-ai/DeepSeek-V3.2 checkpoint.

## Before your PR is "Ready for review"

- **Make sure you read and follow the [Contributor guidelines](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed.
- **Is this change backward compatible?**: Yes/No
- **Did you write any new necessary tests?**: Yes/No
- **Did you add or update any necessary documentation?**: Yes/No
- **Did you update the [Changelog](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CHANGELOG.rst)?**: Yes/No

Signed-off-by: Chenjie Luo <[email protected]>
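For context on change 2 above: a quick way to see which parameters a model keeps in fp32 is to scan `named_parameters()` by dtype. The helper below is a hypothetical inspection snippet, not part of this PR; in DS V3.2 the fp32 parameters include `head.weight` and `attn.indexer.weights_proj.weight` (the two names matched in the diff further down).

```python
import torch
import torch.nn as nn

def list_fp32_params(model: nn.Module) -> list[str]:
    """Return the names of parameters stored in torch.float32."""
    return [
        name for name, param in model.named_parameters()
        if param.dtype == torch.float32
    ]

# Toy example: a bf16 module with one parameter kept in fp32.
toy = nn.Linear(4, 4).to(torch.bfloat16)
toy.bias.data = toy.bias.data.to(torch.float32)
print(list_fp32_params(toy))  # -> ['bias']
```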
Parent commit: d0b0c0f

File tree: 2 files changed (+13, -2 lines)

examples/deepseek/README.md (1 addition, 1 deletion)

```diff
@@ -33,7 +33,7 @@ git clone https://github.com/deepseek-ai/DeepSeek-V3.git && cd DeepSeek-V3 && gi
 huggingface-cli download deepseek-ai/DeepSeek-V3.2-Exp --local-dir $HF_FP8_CKPT

 # clone DeepSeek-V3.2 Github repository for FP8 inference,
-git clone https://github.com/deepseek-ai/DeepSeek-V3.2-Exp.git && cd DeepSeek-V3.2-Exp && git checkout 3b99a53
+git clone https://github.com/deepseek-ai/DeepSeek-V3.2-Exp.git && cd DeepSeek-V3.2-Exp && git checkout 87e509a

 # Install requirements
 pip install git+https://github.com/Dao-AILab/fast-hadamard-transform.git
```

examples/deepseek/ptq.py (12 additions, 1 deletion)

```diff
@@ -257,7 +257,18 @@ def load_deepseek_model(model_config: str, model_path: str, batch_size: int):
     # load model
     checkpoint_path = os.path.join(model_path, f"model{rank}-mp{world_size}.safetensors")
     print(f"Loading {checkpoint_path}")
+
+    # Temporary fix for fp32 params
+    fp32_params = {}
+    for name, param in model.named_parameters():
+        if param.dtype == torch.float32 and (
+            "head.weight" in name or "attn.indexer.weights_proj.weight" in name
+        ):
+            param.data = param.data.to(torch.get_default_dtype())
+            fp32_params[name] = param
     load_model(model, checkpoint_path)
+    for param in fp32_params.values():
+        param.data = param.data.to(torch.float32)
     print(f"Loaded {checkpoint_path}")
     return model
@@ -347,7 +358,7 @@ def state_dict_filter(state_dict):
     # counts = module.activated_expert_counts()
     # f.writelines(f"{name}: {count}\n" for count in counts)

-    quant_config = get_quant_config(model.named_modules())
+    quant_config = get_quant_config(model)

     if enable_fp8_kvcache:
         quant_config["quantization"]["kv_cache_quant_algo"] = KV_CACHE_FP8
```
