
Commit a71b1b2

fix:add fabsf() to general kernel when compare max values (#943)
1 parent 13a1c3d commit a71b1b2

File tree: 4 files changed (+26 / -52 lines)

  lightllm-kernel/csrc/quant/per_token_quantize_bf16_fp8.cu
  lightllm-kernel/csrc/quant/per_token_quantize_bf16_int8.cu
  lightllm-kernel/test/quant/fp8_quant_test.py
  lightllm-kernel/test/quant/int8_quant_test.py
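Why the fabsf() matters: per-token quantization derives each row's scale from the largest magnitude in that row. The general kernels previously reduced over the signed values, so a token whose largest-magnitude element was negative got a scale based on its smaller positive maximum, and the negative entries clipped on conversion. The one-line fix wraps the compared value in fabsf(). Below is a minimal host-side sketch of the difference, with made-up values; the FP8_E4M3_MAX constant is the standard e4m3 maximum, not something read from this repository.

    // Sketch only: contrasts the old signed-max reduction with the fixed
    // abs-max reduction on an illustrative row.
    #include <math.h>
    #include <stdio.h>

    int main(void) {
        const float row[4] = {0.1f, -3.0f, 0.5f, -0.2f};
        const float FP8_E4M3_MAX = 448.0f;   // largest finite e4m3 value

        float signed_max = -INFINITY;        // old: max over signed values
        float abs_max = 0.0f;                // new: max over absolute values
        for (int i = 0; i < 4; ++i) {
            signed_max = fmaxf(signed_max, row[i]);      // ends at 0.5
            abs_max    = fmaxf(abs_max, fabsf(row[i]));  // ends at 3.0
        }

        // A scale derived from the signed max cannot cover -3.0 without clipping.
        printf("scale(old)=%g  scale(new)=%g\n",
               signed_max / FP8_E4M3_MAX, abs_max / FP8_E4M3_MAX);
        return 0;
    }

With the old reduction the per-token scale for this row would come from 0.5 instead of 3.0, so the -3.0 entry saturates after quantization; with fabsf() the scale covers the full range of the row.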

lightllm-kernel/csrc/quant/per_token_quantize_bf16_fp8.cu

Lines changed: 11 additions & 24 deletions
@@ -13,7 +13,6 @@ __global__ void device_per_token_quant_bf16_to_fp8_general(
     const bf16_t* __restrict__ input,    // Input tensor in BF16 format
     fp8_e4m3_t* __restrict__ output,     // Output tensor in FP8 format
     fp32_t* __restrict__ scales,         // Output scales for each token
-    const int64_t M,                     // Number of rows in the input tensor
     const int64_t N
 ) {
     const int32_t bid = blockIdx.x;
@@ -38,7 +37,7 @@ __global__ void device_per_token_quant_bf16_to_fp8_general(
         workspace1[i] = local_bf16;
 
         fp32_t tmp = cvt_bf16_f32(local_bf16);
-        local_max = fmaxf(local_max, tmp);
+        local_max = fmaxf(local_max, fabsf(tmp));
     }
 
     // Reduce the maximum value across the block
@@ -71,7 +70,6 @@ __global__ void device_per_token_quant_bf16_to_fp8_vpt(
     const bf16_t* __restrict__ input,    // Input tensor in BF16 format
     fp8_e4m3_t* __restrict__ output,     // Output tensor in FP8 format
     fp32_t* __restrict__ scales,         // Output scales for each token
-    const int64_t M,                     // Number of rows in the input tensor
     const int32_t N
 ) {
     constexpr int32_t VPT = 8;
@@ -147,8 +145,7 @@ template<int32_t TPB, int32_t N>
 __global__ void device_per_token_quant_bf16_to_fp8(
     const bf16_t* __restrict__ input,    // Input tensor in BF16 format
     fp8_e4m3_t* __restrict__ output,     // Output tensor in FP8 format
-    fp32_t* __restrict__ scales,         // Output scales for each token
-    const int64_t M                      // Number of rows in the input tensor
+    fp32_t* __restrict__ scales          // Output scales for each token
 ) {
     constexpr int32_t VPT = 8;
 
@@ -243,71 +240,63 @@ void per_token_quant_bf16_fp8 (
             <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                 PTR<bf16_t>(contiguous_input),
                 PTR<fp8_e4m3_t>(output),
-                PTR<fp32_t>(contiguous_scales),
-                M
+                PTR<fp32_t>(contiguous_scales)
             );
             break;
         case 32:
             device_per_token_quant_bf16_to_fp8<128, 32>
             <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                 PTR<bf16_t>(contiguous_input),
                 PTR<fp8_e4m3_t>(output),
-                PTR<fp32_t>(contiguous_scales),
-                M
+                PTR<fp32_t>(contiguous_scales)
             );
             break;
         case 64:
             device_per_token_quant_bf16_to_fp8<128, 64>
             <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                 PTR<bf16_t>(contiguous_input),
                 PTR<fp8_e4m3_t>(output),
-                PTR<fp32_t>(contiguous_scales),
-                M
+                PTR<fp32_t>(contiguous_scales)
             );
             break;
         case 512:
             device_per_token_quant_bf16_to_fp8<128, 512>
             <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                 PTR<bf16_t>(contiguous_input),
                 PTR<fp8_e4m3_t>(output),
-                PTR<fp32_t>(contiguous_scales),
-                M
+                PTR<fp32_t>(contiguous_scales)
            );
            break;
        case 1024:
            device_per_token_quant_bf16_to_fp8<128, 1024>
            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                PTR<bf16_t>(contiguous_input),
                PTR<fp8_e4m3_t>(output),
-               PTR<fp32_t>(contiguous_scales),
-               M
+               PTR<fp32_t>(contiguous_scales)
            );
            break;
        case 3200:
            device_per_token_quant_bf16_to_fp8<128, 3200>
            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                PTR<bf16_t>(contiguous_input),
                PTR<fp8_e4m3_t>(output),
-               PTR<fp32_t>(contiguous_scales),
-               M
+               PTR<fp32_t>(contiguous_scales)
            );
            break;
        case 4096:
            device_per_token_quant_bf16_to_fp8<128, 4096>
            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                PTR<bf16_t>(contiguous_input),
                PTR<fp8_e4m3_t>(output),
-               PTR<fp32_t>(contiguous_scales),
-               M
+               PTR<fp32_t>(contiguous_scales)
            );
            break;
        case 12800:
            device_per_token_quant_bf16_to_fp8<256, 12800>
            <<<blocks, 256, 0, at::cuda::getCurrentCUDAStream()>>>(
                PTR<bf16_t>(contiguous_input),
                PTR<fp8_e4m3_t>(output),
-               PTR<fp32_t>(contiguous_scales),
-               M
+               PTR<fp32_t>(contiguous_scales)
            );
            break;
        default: {
@@ -319,7 +308,6 @@ void per_token_quant_bf16_fp8 (
                PTR<bf16_t>(contiguous_input),
                PTR<fp8_e4m3_t>(output),
                PTR<fp32_t>(contiguous_scales),
-               M,
                N
            );
        } else {
@@ -328,7 +316,6 @@ void per_token_quant_bf16_fp8 (
                PTR<bf16_t>(contiguous_input),
                PTR<fp8_e4m3_t>(output),
                PTR<fp32_t>(contiguous_scales),
-               M,
                N
            );
        }
@@ -339,4 +326,4 @@ void per_token_quant_bf16_fp8 (
    }
 
 } // namespace ops
-} // namespace lightllm
+} // namespace lightllm
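For readers skimming the hunks above: each kernel in this family assigns one block per token (row). Threads walk the row accumulating a per-thread maximum, the block then reduces those maxima ("Reduce the maximum value across the block"), and the result becomes the per-token scale written to scales. The dropped M parameter fits the same picture: with one block per row, the row index comes from blockIdx.x and the row count from the grid, so the kernels never consult M. The sketch below shows that pattern with the corrected fabsf() in place; it is a simplified stand-in, not the repository's kernel, and the function name, block-size limit, shared-memory reduction, and the absmax/448 scale convention are all illustrative.

    // Simplified sketch of the per-token abs-max pattern (one block per row).
    // Launch with a power-of-two block size <= 256 and grid.x = number of rows.
    #include <cuda_bf16.h>
    #include <math.h>
    #include <stdint.h>

    __global__ void per_token_absmax_scale_sketch(
        const __nv_bfloat16* __restrict__ input,  // [rows, N], row-major
        float* __restrict__ scales,               // [rows], one scale per token
        const int64_t N
    ) {
        const int64_t row = blockIdx.x;           // one block handles one row
        const __nv_bfloat16* x = input + row * N;

        // Per-thread reduction over the row, on absolute values (the fix).
        float local_max = 0.0f;
        for (int64_t i = threadIdx.x; i < N; i += blockDim.x) {
            local_max = fmaxf(local_max, fabsf(__bfloat162float(x[i])));
        }

        // Block-wide max via shared memory (simplified).
        __shared__ float smem[256];
        smem[threadIdx.x] = local_max;
        __syncthreads();
        for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
            if (threadIdx.x < stride) {
                smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + stride]);
            }
            __syncthreads();
        }

        // Map the largest |value| in the row to the FP8 e4m3 maximum (448).
        if (threadIdx.x == 0) {
            scales[row] = smem[0] / 448.0f;
        }
    }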

lightllm-kernel/csrc/quant/per_token_quantize_bf16_int8.cu

Lines changed: 11 additions & 24 deletions
@@ -13,7 +13,6 @@ __global__ void device_per_token_quant_bf16_to_int8_general(
     const bf16_t* __restrict__ input,    // Input tensor in BF16 format
     int8_t* __restrict__ output,         // Output tensor in INT8 format
     fp32_t* __restrict__ scales,         // Output scales for each token
-    const int64_t M,                     // Number of rows in the input tensor
     const int64_t N
 ) {
     const int32_t bid = blockIdx.x;
@@ -38,7 +37,7 @@ __global__ void device_per_token_quant_bf16_to_int8_general(
         workspace1[i] = local_bf16;
 
         fp32_t tmp = cvt_bf16_f32(local_bf16);
-        local_max = fmaxf(local_max, tmp);
+        local_max = fmaxf(local_max, fabsf(tmp));
     }
 
     // Reduce the maximum value across the block
@@ -71,7 +70,6 @@ __global__ void device_per_token_quant_bf16_to_int8_vpt(
     const bf16_t* __restrict__ input,    // Input tensor in BF16 format
     int8_t* __restrict__ output,         // Output tensor in INT8 format
     fp32_t* __restrict__ scales,         // Output scales for each token
-    const int64_t M,                     // Number of rows in the input tensor
     const int32_t N
 ) {
     constexpr int32_t VPT = 8;
@@ -145,8 +143,7 @@ template<int32_t TPB, int32_t N>
 __global__ void device_per_token_quant_bf16_to_int8(
     const bf16_t* __restrict__ input,    // Input tensor in BF16 format
     int8_t* __restrict__ output,         // Output tensor in INT8 format
-    fp32_t* __restrict__ scales,         // Output scales for each token
-    const int64_t M                      // Number of rows in the input tensor
+    fp32_t* __restrict__ scales          // Output scales for each token
 ) {
     constexpr int32_t VPT = 8;
 
@@ -239,71 +236,63 @@ void per_token_quant_bf16_int8 (
             <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                 PTR<bf16_t>(contiguous_input),
                 PTR<int8_t>(output),
-                PTR<fp32_t>(contiguous_scales),
-                M
+                PTR<fp32_t>(contiguous_scales)
             );
             break;
         case 32:
             device_per_token_quant_bf16_to_int8<128, 32>
             <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                 PTR<bf16_t>(contiguous_input),
                 PTR<int8_t>(output),
-                PTR<fp32_t>(contiguous_scales),
-                M
+                PTR<fp32_t>(contiguous_scales)
            );
            break;
        case 64:
            device_per_token_quant_bf16_to_int8<128, 64>
            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                PTR<bf16_t>(contiguous_input),
                PTR<int8_t>(output),
-               PTR<fp32_t>(contiguous_scales),
-               M
+               PTR<fp32_t>(contiguous_scales)
            );
            break;
        case 512:
            device_per_token_quant_bf16_to_int8<128, 512>
            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                PTR<bf16_t>(contiguous_input),
                PTR<int8_t>(output),
-               PTR<fp32_t>(contiguous_scales),
-               M
+               PTR<fp32_t>(contiguous_scales)
            );
            break;
        case 1024:
            device_per_token_quant_bf16_to_int8<128, 1024>
            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                PTR<bf16_t>(contiguous_input),
                PTR<int8_t>(output),
-               PTR<fp32_t>(contiguous_scales),
-               M
+               PTR<fp32_t>(contiguous_scales)
            );
            break;
        case 3200:
            device_per_token_quant_bf16_to_int8<128, 3200>
            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                PTR<bf16_t>(contiguous_input),
                PTR<int8_t>(output),
-               PTR<fp32_t>(contiguous_scales),
-               M
+               PTR<fp32_t>(contiguous_scales)
            );
            break;
        case 4096:
            device_per_token_quant_bf16_to_int8<128, 4096>
            <<<blocks, 128, 0, at::cuda::getCurrentCUDAStream()>>>(
                PTR<bf16_t>(contiguous_input),
                PTR<int8_t>(output),
-               PTR<fp32_t>(contiguous_scales),
-               M
+               PTR<fp32_t>(contiguous_scales)
            );
            break;
        case 12800:
            device_per_token_quant_bf16_to_int8<256, 12800>
            <<<blocks, 256, 0, at::cuda::getCurrentCUDAStream()>>>(
                PTR<bf16_t>(contiguous_input),
                PTR<int8_t>(output),
-               PTR<fp32_t>(contiguous_scales),
-               M
+               PTR<fp32_t>(contiguous_scales)
            );
            break;
        default: {
@@ -315,7 +304,6 @@ void per_token_quant_bf16_int8 (
                PTR<bf16_t>(contiguous_input),
                PTR<int8_t>(output),
                PTR<fp32_t>(contiguous_scales),
-               M,
                N
            );
        } else {
@@ -324,7 +312,6 @@ void per_token_quant_bf16_int8 (
                PTR<bf16_t>(contiguous_input),
                PTR<int8_t>(output),
                PTR<fp32_t>(contiguous_scales),
-               M,
                N
            );
        }
@@ -335,4 +322,4 @@ void per_token_quant_bf16_int8 (
    }
 
 } // namespace ops
-} // namespace lightllm
+} // namespace lightllm
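The INT8 file mirrors the FP8 one: the same fabsf() fix in the general kernel and the same removal of the now-unneeded M parameter; only the consumer of the abs-max differs, a symmetric INT8 scale instead of an FP8 one. Below is a minimal sketch of the quantize step such a scale feeds, under the common symmetric convention (scale = abs_max / 127); the rounding mode and clamp range here are assumptions, not read from the repository.

    // Sketch of symmetric per-token INT8 quantization given an fabsf()-based scale.
    // Assumes scale > 0 (i.e., the row is not all zeros).
    #include <math.h>
    #include <stdint.h>

    __device__ __forceinline__ int8_t quantize_symmetric_int8(float x, float scale) {
        float q = roundf(x / scale);             // scale = row_abs_max / 127
        q = fminf(fmaxf(q, -127.0f), 127.0f);    // clamp to the symmetric int8 range
        return (int8_t)q;
    }

With the pre-fix signed max, a row dominated by negative values would get a scale from its small positive maximum; x / scale would then overshoot -127 and the clamp would saturate, which shows up as a large error against the reference quantizer in the tests below.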

lightllm-kernel/test/quant/fp8_quant_test.py

Lines changed: 2 additions & 2 deletions
@@ -9,7 +9,7 @@ class TestQuantBF16(unittest.TestCase):
     def setUp(self):
         """Set up common test parameters."""
         self.tokens = [1024, 13325]
-        self.hiddenDims = [256, 511, 1023, 1024, 1025, 1032, 3200, 3201, 3208, 12800]
+        self.hiddenDims = [3, 256, 511, 1023, 1024, 1025, 1032, 3200, 3201, 3208, 12800]
         self.device = "cuda"
         self.dtype = torch.bfloat16
 
@@ -20,7 +20,7 @@ def test_accuracy(self):
                 with self.subTest(shape=[token, hiddenDim]):
                     input = torch.rand(size=[token, hiddenDim], device=self.device, dtype=self.dtype) - 0.5
                     y_real, scales_real = ops.scaled_fp8_quant(
-                        input.contiguous().cuda(self.device), scale=None, use_per_token_if_dynamic=True
+                        input.contiguous(), scale=None, use_per_token_if_dynamic=True
                     )
                     y_pred, scales_pred = per_token_quant_bf16_fp8(input)
                     self.assertTrue(

lightllm-kernel/test/quant/int8_quant_test.py

Lines changed: 2 additions & 2 deletions
@@ -9,7 +9,7 @@ class TestQuantBF16(unittest.TestCase):
     def setUp(self):
         """Set up common test parameters."""
         self.tokens = [1024, 13325]
-        self.hiddenDims = [256, 257, 511, 1023, 1024, 1025, 1032, 3200, 3201, 3208, 12800]
+        self.hiddenDims = [3, 256, 257, 511, 1023, 1024, 1025, 1032, 3200, 3201, 3208, 12800]
         self.device = "cuda:2"
         self.dtype = torch.bfloat16
         torch.cuda.set_device(self.device)
@@ -21,7 +21,7 @@ def test_accuracy(self):
                 with self.subTest(shape=[token, hiddenDim]):
                     input = torch.rand(size=[token, hiddenDim], device=self.device, dtype=self.dtype) - 0.5
                     y_real, scales_real, _ = ops.scaled_int8_quant(
-                        input.contiguous().cuda(self.device)
+                        input.contiguous()
                     )
                     y_pred, scales_pred = per_token_quant_bf16_int8(input)
                     self.assertTrue(
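The test changes are small but targeted: hiddenDim 3 is added so the accuracy check also covers a width that matches none of the templated cases in the launchers above and presumably routes to the general kernel named in the commit title. The redundant .cuda(self.device) call is also dropped, since the input tensor is already created on self.device.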
