NVIDIA · cjluo-nv · Jan 16, 2026 · Jan 16, 2026 · Jan 20, 2026 · Jan 21, 2026
@@ -577,14 +577,10 @@ def load_weights_fused_qkv_linear(self, module: Linear,
         if os.environ.get("TRTLLM_LOAD_KV_SCALES", "0") == "1":
             if len(k_scale) != 0:
                 assert len(v_scale) != 0
-                # The calibrated KV scales are amax / (6 * 448), but the requested KV scales are amax / 448,
-                # to avoid overflow when dequantizing NVFP4 in attention kernels.
                 copy_weight(
                     module.kv_scales,
                     torch.tensor(
-                        [1.0, max(k_scale) * 6.0,
-                         max(v_scale) * 6.0],
-                        dtype=torch.float32))
+                        [1.0, max(k_scale), max(v_scale)], dtype=torch.float32))
                 module.inv_kv_scales.data = 1.0 / module.kv_scales
 
     def load_weights_fused_gate_up_linear(self, module: Linear,
@@ -1096,14 +1092,10 @@ def load_weights_fused_qkv_linear(self, module: Linear,
         if os.environ.get("TRTLLM_LOAD_KV_SCALES", "0") == "1":
             if len(k_scale) != 0:
                 assert len(v_scale) != 0
-                # The calibrated KV scales are amax / (6 * 448), but the requested KV scales are amax / 448,
-                # to avoid overflow when dequantizing NVFP4 in attention kernels using FP8 math.
                 copy_weight(
                     module.kv_scales,
                     torch.tensor(
-                        [1.0, max(k_scale) * 6.0,
-                         max(v_scale) * 6.0],
-                        dtype=torch.float32))
+                        [1.0, max(k_scale), max(v_scale)], dtype=torch.float32))
                 module.inv_kv_scales.data = 1.0 / module.kv_scales
 
     def load_weights_fused_gate_up_linear(self, module: Linear,