Commit 859e861
[main][quantization] Support deepseek w4a8 per-channel quantization (#3011)
### What this PR does / why we need it?

1. Support deepseek w4a8 per-channel quantization.
2. The eager mode supports converting weights to the NZ format.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

#### How to get weights using Modelslim

##### Installation steps

git clone https://gitcode.com/Ascend/msit.git
cd msit/msmodelslim
bash install.sh

##### Generate w4a8 per-channel weights

cd /example/DeepSeek

Command reference: msmodelslim/example/DeepSeek/README.md

- vLLM version: v0.10.2
- vLLM main: vllm-project/vllm@f225ea7

Signed-off-by: Wang Kunpeng <[email protected]>
1 parent e9359bd commit 859e861
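The Modelslim steps from the commit message, collected into one shell sequence for convenience. The commands are quoted verbatim; the actual w4a8 per-channel conversion command is documented in msmodelslim/example/DeepSeek/README.md and is not reproduced here.

```bash
# Install Modelslim (commands as given in the commit message).
git clone https://gitcode.com/Ascend/msit.git
cd msit/msmodelslim
bash install.sh

# Generate w4a8 per-channel weights for DeepSeek.
# "/example/DeepSeek" is quoted as-is from the commit message; in practice it is
# the example directory inside the msmodelslim checkout.
cd /example/DeepSeek
# Run the conversion command from msmodelslim/example/DeepSeek/README.md here.
```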

File tree: 6 files changed (+293, -190 lines)

docs/source/user_guide/feature_guide/quantization.md (3 additions, 2 deletions)

````diff
@@ -108,18 +108,19 @@ Please convert DeepSeek series models using `br_release_MindStudio_8.1.RC2_TR5_2
 
 ### 3. When converting deepseek series models with modelslim, what should you pay attention?
 
-When using the weight generated by modelslim with the `--dynamic` parameter, if torchair graph mode is enabled, please modify the configuration file in the CANN package to prevent incorrect inference results.
+When the mla portion of the weights used `W8A8_DYNAMIC` quantization, if torchair graph mode is enabled, please modify the configuration file in the CANN package to prevent incorrect inference results.
 
 The operation steps are as follows:
 
 1. Search in the CANN package directory used, for example:
 
    find /usr/local/Ascend/ -name fusion_config.json
 
-2. Add `"AddRmsNormDynamicQuantFusionPass":"off",` to the fusion_config.json you find, the location is as follows:
+2. Add `"AddRmsNormDynamicQuantFusionPass":"off",` and `"MultiAddRmsNormDynamicQuantFusionPass":"off",` to the fusion_config.json you find, the location is as follows:
 
 ```bash
 {
   "Switch":{
     "GraphFusion":{
       "AddRmsNormDynamicQuantFusionPass":"off",
+      "MultiAddRmsNormDynamicQuantFusionPass":"off",
 ```
````
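Putting the two steps together, a shell-level sketch of the same edit. The `find` command is taken from the doc; the nesting in the comment mirrors the snippet above, and any sibling switches present in a real fusion_config.json are elided here.

```bash
# Locate the fusion configuration shipped with the installed CANN package.
find /usr/local/Ascend/ -name fusion_config.json

# After editing, the "GraphFusion" section should contain both switches, e.g.:
#   "Switch": {
#     "GraphFusion": {
#       "AddRmsNormDynamicQuantFusionPass": "off",
#       "MultiAddRmsNormDynamicQuantFusionPass": "off",
#       ...
#     }
#   }
```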

tests/e2e/multicard/test_offline_inference_distributed.py (8 additions, 2 deletions)

```diff
@@ -35,6 +35,11 @@
     "vllm-ascend/Qwen3-8B-W8A8", "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"
 ]
 
+DEEPSEEK_W4A8_MODELS = [
+    "vllm-ascend/DeepSeek-V3-W4A8-Pruing",
+    "vllm-ascend/DeepSeek-V3.1-W4A8-puring"
+]
+
 
 def test_models_distributed_QwQ():
     example_prompts = [
@@ -109,14 +114,15 @@ def test_models_distributed_Qwen3_W4A8DYNAMIC():
         vllm_model.generate_greedy(example_prompts, max_tokens)
 
 
+@pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
 @patch.dict(os.environ, {"VLLM_ASCEND_MLA_PA": "1"})
-def test_models_distributed_DeepSeek_W4A8DYNAMIC():
+def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
     prompts = [
         "Hello, my name is",
     ]
     max_tokens = 5
     with VllmRunner(
-            snapshot_download("vllm-ascend/DeepSeek-V3-W4A8-Pruing"),
+            snapshot_download(model),
             dtype="auto",
             tensor_parallel_size=2,
             quantization="ascend",
```
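Beyond the test suite, a hedged sketch of how one of these checkpoints could be served with vllm-ascend. The `--quantization ascend` value, the tensor-parallel size, and the environment variable mirror the test arguments above; everything else is an assumption, and real deployments may need different flags.

```bash
# Assumed invocation mirroring the e2e test above; the model id is the one used
# in the test (download it first, e.g. via ModelScope's snapshot_download, or
# point this at a local path), and flags should be adapted to your environment.
VLLM_ASCEND_MLA_PA=1 vllm serve vllm-ascend/DeepSeek-V3-W4A8-Pruing \
    --quantization ascend \
    --tensor-parallel-size 2 \
    --dtype auto
```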

tests/ut/quantization/test_w4a8_dynamic.py (85 additions, 45 deletions)

```diff
@@ -1,4 +1,3 @@
-import copy
 from unittest.mock import Mock, patch
 
 import torch
@@ -95,19 +94,19 @@ def test_get_dynamic_quant_param(self):
         # old quant version weight
         param_dict = self.quant_method.get_dynamic_quant_param(
             self.experts, self.input_size, self.output_size, torch.bfloat16)
-        self.assertEqual(param_dict["w13_weight_scale"].dtype, torch.bfloat16)
+        self.assertEqual(param_dict["w13_weight_scale"].dtype, torch.float32)
         self.assertEqual(param_dict["w13_weight_scale"].shape,
                          (self.experts, 2 * self.input_size, 1))
         self.assertEqual(param_dict["w13_weight_scale_second"].dtype,
-                         torch.bfloat16)
+                         torch.float32)
         self.assertEqual(param_dict["w13_weight_scale_second"].shape,
                          (self.experts, 2 * self.input_size,
                           self.output_size // self.group_size))
-        self.assertEqual(param_dict["w2_weight_scale"].dtype, torch.bfloat16)
+        self.assertEqual(param_dict["w2_weight_scale"].dtype, torch.float32)
         self.assertEqual(param_dict["w2_weight_scale"].shape,
                          (self.experts, self.output_size, 1))
         self.assertEqual(param_dict["w2_weight_scale_second"].dtype,
-                         torch.bfloat16)
+                         torch.float32)
         self.assertEqual(param_dict["w2_weight_scale_second"].shape,
                          (self.experts, self.output_size,
                           self.input_size // self.group_size))
@@ -119,40 +118,87 @@ def test_get_dynamic_quant_param(self):
         self.assertEqual(
             param_dict["w2_scale_bias"].shape,
             (self.experts, self.output_size, 16 // self.quant_method.tp_size))
+        # per-channel weight
+        self.quant_method.is_per_channel_weight = True
+        param_dict = self.quant_method.get_dynamic_quant_param(
+            self.experts, self.input_size, self.output_size, torch.bfloat16)
+        pergroup_param = [
+            "w13_weight_scale_second", "w13_weight_offset_second",
+            "w2_weight_scale_second", "w2_weight_offset_second"
+        ]
+        is_contains = any(key in param_dict for key in pergroup_param)
+        self.assertFalse(is_contains)
 
-    @patch('torch_npu.npu_quantize')
-    @patch('torch.Tensor.npu')
-    def test_process_weights_after_loading(self, mock_npu, mock_npu_quantize):
-        # old quant version weight
+    def build_layer(self,
+                    is_new_quant_version=True,
+                    is_per_channel_weight=False):
         layer = torch.nn.Module()
-        layer.w13_weight = torch.nn.Parameter(torch.zeros(
-            (self.experts, 2 * self.input_size, self.output_size),
-            dtype=torch.int8),
-                                              requires_grad=False)
-        layer.w2_weight = torch.nn.Parameter(torch.zeros(
-            (self.experts, self.output_size, self.input_size),
-            dtype=torch.int8),
-                                             requires_grad=False)
+        if is_new_quant_version:
+            layer.w13_weight = torch.nn.Parameter(torch.zeros(
+                (self.experts, self.input_size, self.output_size),
+                dtype=torch.int8),
+                                                  requires_grad=False)
+            layer.w2_weight = torch.nn.Parameter(torch.zeros(
+                (self.experts, self.output_size // 2, self.input_size),
+                dtype=torch.int8),
+                                                 requires_grad=False)
+            w13_scale_bias = torch.zeros(
+                (self.experts, 2 * self.input_size, 1), dtype=torch.float32)
+            layer.w13_scale_bias = torch.nn.Parameter(w13_scale_bias,
+                                                      requires_grad=False)
+            w2_scale_bias = torch.zeros((self.experts, self.output_size,
+                                         16 // self.quant_method.tp_size),
+                                        dtype=torch.float32)
+            layer.w2_scale_bias = torch.nn.Parameter(w2_scale_bias,
+                                                     requires_grad=False)
+        else:
+            layer.w13_weight = torch.nn.Parameter(torch.zeros(
+                (self.experts, 2 * self.input_size, self.output_size),
+                dtype=torch.int8),
+                                                  requires_grad=False)
+            layer.w2_weight = torch.nn.Parameter(torch.zeros(
+                (self.experts, self.output_size, self.input_size),
+                dtype=torch.int8),
+                                                 requires_grad=False)
         layer.w13_weight_scale = torch.nn.Parameter(torch.ones(
-            (self.experts, 2 * self.input_size, 1), dtype=torch.bfloat16),
+            (self.experts, 2 * self.input_size, 1), dtype=torch.float32),
                                                     requires_grad=False)
-        layer.w13_weight_scale_second = torch.nn.Parameter(torch.ones(
-            (self.experts, 2 * self.input_size,
-             self.output_size // self.group_size),
-            dtype=torch.bfloat16),
-                                                           requires_grad=False)
         layer.w2_weight_scale = torch.nn.Parameter(torch.ones(
-            (self.experts, self.output_size, 1), dtype=torch.bfloat16),
+            (self.experts, self.output_size, 1), dtype=torch.float32),
                                                    requires_grad=False)
-        layer.w2_weight_scale_second = torch.nn.Parameter(torch.ones(
-            (self.experts, self.output_size,
-             self.input_size // self.group_size),
-            dtype=torch.bfloat16),
-                                                          requires_grad=False)
-        new_layer = copy.deepcopy(layer)
+        if not is_per_channel_weight:
+            layer.w13_weight_scale_second = torch.nn.Parameter(
+                torch.ones((self.experts, 2 * self.input_size,
+                            self.output_size // self.group_size),
+                           dtype=torch.float32),
+                requires_grad=False)
+            layer.w13_weight_offset_second = torch.nn.Parameter(
+                torch.empty_like(layer.w13_weight_scale_second.data),
+                requires_grad=False)
+            layer.w2_weight_scale_second = torch.nn.Parameter(
+                torch.ones((self.experts, self.output_size,
+                            self.input_size // self.group_size),
+                           dtype=torch.float32),
+                requires_grad=False)
+            layer.w2_weight_offset_second = torch.nn.Parameter(
+                torch.empty_like(layer.w2_weight_scale_second.data),
+                requires_grad=False)
+        return layer
 
+    @patch('torch_npu.npu_format_cast')
+    @patch('torch_npu.npu_quantize')
+    @patch('torch.Tensor.npu')
+    def test_process_weights_after_loading(self, mock_npu, mock_npu_quantize,
+                                           mock_npu_format_cast):
         mock_npu.return_value = torch.Tensor()
         mock_npu_quantize.return_value = torch.Tensor()
+
+        def func_by_args(weight, num_format):
+            return weight
+
+        mock_npu_format_cast.side_effect = func_by_args
+        # old quant version weight
+        layer = self.build_layer(is_new_quant_version=False)
         self.quant_method.process_weights_after_loading(layer)
         self.assertTrue(hasattr(layer, "w13_scale_bias"))
         self.assertEqual(layer.w13_scale_bias.data.shape,
@@ -164,23 +210,17 @@ def test_process_weights_after_loading(self, mock_npu, mock_npu_quantize):
         self.assertEqual(layer.w2_scale_bias.data.dtype, torch.float32)
         # new quant version weight
         self.quant_method.new_quant_version = True
-        new_layer.w13_weight.data = torch.zeros(
-            (self.experts, self.input_size, self.output_size),
-            dtype=torch.int8)
-        new_layer.w2_weight.data = torch.zeros(
-            (self.experts, self.output_size // 2, self.input_size),
-            dtype=torch.int8)
-        w13_scale_bias = torch.zeros((self.experts, 2 * self.input_size, 1),
-                                     dtype=torch.float32)
-        new_layer.w13_scale_bias = torch.nn.Parameter(w13_scale_bias,
-                                                      requires_grad=False)
-        w2_scale_bias = torch.zeros(
-            (self.experts, self.output_size, 16 // self.quant_method.tp_size),
-            dtype=torch.float32)
-        new_layer.w2_scale_bias = torch.nn.Parameter(w2_scale_bias,
-                                                     requires_grad=False)
+        new_layer = self.build_layer(is_new_quant_version=True)
         self.quant_method.process_weights_after_loading(new_layer)
         self.assertEqual(new_layer.w13_scale_bias.data.shape,
                          (self.experts, 2 * self.input_size))
         self.assertEqual(new_layer.w2_scale_bias.data.shape,
                          (self.experts, self.output_size))
+        self.assertFalse(hasattr(new_layer, "w13_weight_scale_second"))
+        # per-channel weight
+        self.quant_method.is_per_channel_weight = True
+        per_channel_layer = self.build_layer(is_new_quant_version=True,
+                                             is_per_channel_weight=True)
+        self.quant_method.process_weights_after_loading(per_channel_layer)
+        self.assertEqual(new_layer.w13_scale_bias.data.shape,
+                         (self.experts, 2 * self.input_size))
```
