Skip to content

Commit 6adea79

Browse files
authored
fix nvfp4 weight_scale2 (Tencent#76)
1 parent 23f031e commit 6adea79

File tree

5 files changed

+88
-3
lines changed

5 files changed

+88
-3
lines changed

angelslim/compressor/quant/modules/nvfp4/nvfp4.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
1615
import torch
1716

1817
from .....utils import print_info
@@ -89,7 +88,12 @@ def get_weights_scaling_factor(
8988
return q_per_block_scale
9089

9190
def post_process(self, sub_layer, name):
92-
weight_observer_amax = self.model.weight_scales_dict[name]
91+
# TODO: Fuse observer amax because TRT-LLM requires the qkv,
92+
# gate and up to share the weight_scale2
93+
weight_observer_amax, input_observer_amax = self.model.fuse_observer_amax(
94+
sub_layer, name
95+
)
96+
9397
weight_scale_2 = self.get_weights_scaling_factor_2(weight_observer_amax)
9498
self.model.weight_scales_dict_2[name] = weight_scale_2
9599

@@ -100,6 +104,5 @@ def post_process(self, sub_layer, name):
100104
)
101105
self.model.weight_scales_dict[name] = weight_scale
102106

103-
input_observer_amax = self.model.act_scales_dict[name]
104107
input_scale = self.get_activation_scaling_factor(input_observer_amax)
105108
self.model.act_scales_dict[name] = input_scale

angelslim/compressor/quant/ptq.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,8 @@ def _convert(self):
218218
self.ptq_hook.post_process()
219219

220220
quant_convert_module = self.quant_model.get_quant_convert_module()
221+
if "nvfp4" in self.quant_algo:
222+
self.quant_model.get_observer_values()
221223
# 2. insert qdq module
222224
for name, sub_layer in self.ptq_hook.quant_layers_dict.items():
223225
parent_layer, sub_name = find_parent_layer_and_sub_name(

angelslim/models/base_model.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import copy
1516
import re
1617
from abc import ABCMeta, abstractmethod
1718
from typing import Optional
@@ -170,6 +171,10 @@ def get_nvfp4_qdq_module(self, sub_layer, name):
170171
raise NotImplementedError
171172
return q_linear
172173

174+
def get_observer_values(self):
    """Snapshot the raw observer amax values before they are overwritten.

    ``weight_scales_dict`` and ``act_scales_dict`` initially hold the
    observer amax values collected during calibration; the nvfp4
    post-processing step later replaces those entries with computed
    scaling factors. Deep copies are kept here so the original amax
    values remain available for cross-layer fusion afterwards.
    """
    # NOTE(review): assumes this runs before post_process mutates the
    # scale dicts — confirm the call site ordering in _convert.
    snapshot = copy.deepcopy
    self.input_observer_amax_dict = snapshot(self.act_scales_dict)
    self.weight_observer_amax_dict = snapshot(self.weight_scales_dict)
177+
173178
def get_kvcache_observer_layers_names(self, observe_names):
174179
names = ["self_attn.k_proj", "self_attn.v_proj"]
175180
return [

angelslim/models/llm/qwen.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,3 +94,43 @@ def get_save_func(self):
9494
raise NotImplementedError(
9595
f"deploy_backend {self.deploy_backend} is not supported for saving."
9696
)
97+
98+
def fuse_observer_amax(self, sub_layer, name):
    """Return fused (weight, activation) observer amax values for ``name``.

    TRT-LLM requires the q/k/v projections to share one weight_scale2,
    and likewise gate/up; the fused amax for a layer in such a group is
    the max over the group members' recorded observer amax values.
    Layers outside a fused group simply use their own recorded values.

    Args:
        sub_layer: the quantized layer module (unused; kept for
            interface parity with the post-process caller).
        name: dotted module name, e.g.
            ``"model.layers.0.self_attn.q_proj"``.

    Returns:
        Tuple ``(weight_observer_amax, input_observer_amax)``.

    Raises:
        KeyError: if a member of the fused group is missing from the
            observer dicts (e.g. it was skipped during calibration).
    """
    # Determine which sibling projections must share scaling factors.
    qkv = ("q_proj", "k_proj", "v_proj")
    gate_up = ("gate_proj", "up_proj")
    if any(proj in name for proj in qkv):
        group = qkv
    elif any(proj in name for proj in gate_up):
        group = gate_up
    else:
        # Not part of a fused GEMM: use this layer's own amax values.
        return (
            self.weight_observer_amax_dict[name],
            self.input_observer_amax_dict[name],
        )

    prefix = name.rsplit(".", 1)[0]
    keys = [f"{prefix}.{proj}" for proj in group]
    # max() over the group's amax values, matching the original
    # element-wise comparison semantics for scalar amax entries.
    weight_observer_amax = max(self.weight_observer_amax_dict[k] for k in keys)
    input_observer_amax = max(self.input_observer_amax_dict[k] for k in keys)
    return weight_observer_amax, input_observer_amax
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# Global configuration of pipeline
2+
global:
3+
save_path: ./output
4+
5+
# Simplified Configuration for LLM compression
6+
model:
7+
name: Qwen
8+
model_path: Qwen/Qwen3-235B-A22B
9+
trust_remote_code: true
10+
low_cpu_mem_usage: true
11+
use_cache: false
12+
torch_dtype: auto
13+
device_map: auto
14+
15+
# Compression configuration
16+
compression:
17+
name: PTQ
18+
quantization:
19+
name: nvfp4
20+
bits: 4
21+
quant_method:
22+
weight: "per-block"
23+
activation: "per-block"
24+
group_size: 16
25+
ignore_layers: # Skip quantization for these layers
26+
- "lm_head"
27+
- "model.embed_tokens"
28+
29+
# Dataset for calibration
30+
dataset:
31+
name: TextDataset
32+
data_path: ./dataset/sharegpt_gpt4_qwen/sharegpt_gpt4-qwen3_a22B_output.jsonl
33+
max_seq_length: 4096
34+
num_samples: 256
35+
batch_size: 1

0 commit comments

Comments
 (0)