fix: fix an int32 overflow bug in destindex_copy_kv (#907)

blueswhen · web-flow · commit 058eb80c49ee · 2025-05-21T15:39:43.000+08:00
diff --git a/lightllm/common/basemodel/triton_kernel/destindex_copy_kv.py b/lightllm/common/basemodel/triton_kernel/destindex_copy_kv.py
@@ -6,19 +6,24 @@
 
 @triton.jit
 def _fwd_kernel_destindex_copy_kv(
-    K, Dest_loc,
+    K,
+    Dest_loc,
     Out,
-    stride_k_bs, stride_k_h, stride_k_d,
-    stride_o_bs, stride_o_h, stride_o_d,
+    stride_k_bs,
+    stride_k_h,
+    stride_k_d,
+    stride_o_bs,
+    stride_o_h,
+    stride_o_d,
     head_num,
     BLOCK_DMODEL: tl.constexpr,
-    BLOCK_HEAD: tl.constexpr
+    BLOCK_HEAD: tl.constexpr,
 ):
     cur_index = tl.program_id(0)
     offs_h = tl.arange(0, BLOCK_HEAD)
     offs_d = tl.arange(0, BLOCK_DMODEL)
 
-    dest_index = tl.load(Dest_loc + cur_index)
+    dest_index = tl.load(Dest_loc + cur_index).to(tl.int64)
 
     k_ptrs = K + cur_index * stride_k_bs + stride_k_h * offs_h[:, None] + stride_k_d * offs_d[None, :]
     o_ptrs = Out + dest_index * stride_o_bs + stride_o_h * offs_h[:, None] + stride_o_d * offs_d[None, :]
@@ -39,9 +44,15 @@ def destindex_copy_kv(K, DestLoc, Out):
     num_warps = 1
 
     _fwd_kernel_destindex_copy_kv[grid](
-        K, DestLoc, Out,
-        K.stride(0), K.stride(1), K.stride(2),
-        Out.stride(0), Out.stride(1), Out.stride(2),
+        K,
+        DestLoc,
+        Out,
+        K.stride(0),
+        K.stride(1),
+        K.stride(2),
+        Out.stride(0),
+        Out.stride(1),
+        Out.stride(2),
         head_num,
         BLOCK_DMODEL=head_dim,
         BLOCK_HEAD=BLOCK_HEAD,
@@ -53,23 +64,35 @@ def destindex_copy_kv(K, DestLoc, Out):
 
 @triton.jit
 def _fwd_kernel_destindex_copy_quantize_kv(
-    K, Dest_loc, Out, Out_scale,
-    stride_k_bs, stride_k_h, stride_k_d,
-    stride_o_bs, stride_o_h, stride_o_d,
-    stride_os_bs, stride_os_h, stride_os_d,
+    K,
+    Dest_loc,
+    Out,
+    Out_scale,
+    stride_k_bs,
+    stride_k_h,
+    stride_k_d,
+    stride_o_bs,
+    stride_o_h,
+    stride_o_d,
+    stride_os_bs,
+    stride_os_h,
+    stride_os_d,
     head_num,
     BLOCK_DMODEL: tl.constexpr,
-    BLOCK_HEAD: tl.constexpr
+    BLOCK_HEAD: tl.constexpr,
 ):
     cur_index = tl.program_id(0)
     offs_h = tl.arange(0, BLOCK_HEAD)
     offs_d = tl.arange(0, BLOCK_DMODEL)
 
-    dest_index = tl.load(Dest_loc + cur_index)
-    src_data = tl.load(K + cur_index * stride_k_bs + offs_h[:, None] * stride_k_h + stride_k_d * offs_d[None, :], 
-                       mask=offs_h[:, None] < head_num, other=0.0)
+    dest_index = tl.load(Dest_loc + cur_index).to(tl.int64)
+    src_data = tl.load(
+        K + cur_index * stride_k_bs + offs_h[:, None] * stride_k_h + stride_k_d * offs_d[None, :],
+        mask=offs_h[:, None] < head_num,
+        other=0.0,
+    )
     abs_data = tl.abs(src_data)
-    data_scale = (tl.max(abs_data, axis=1) / 127.).to(Out_scale.dtype.element_ty)[:, None]
+    data_scale = (tl.max(abs_data, axis=1) / 127.0).to(Out_scale.dtype.element_ty)[:, None]
     q_src_data = (src_data / data_scale).to(tl.int8)
     o_ptrs = Out + dest_index * stride_o_bs + stride_o_h * offs_h[:, None] + stride_o_d * offs_d[None, :]
     os_ptrs = Out_scale + dest_index * stride_os_bs + stride_os_h * offs_h[:, None]
@@ -88,10 +111,19 @@ def destindex_copy_quantize_kv(K, DestLoc, Out, Out_scale):
     num_warps = 1
 
     _fwd_kernel_destindex_copy_quantize_kv[grid](
-        K, DestLoc, Out, Out_scale,
-        K.stride(0), K.stride(1), K.stride(2),
-        Out.stride(0), Out.stride(1), Out.stride(2),
-        Out_scale.stride(0), Out_scale.stride(1), Out_scale.stride(2),
+        K,
+        DestLoc,
+        Out,
+        Out_scale,
+        K.stride(0),
+        K.stride(1),
+        K.stride(2),
+        Out.stride(0),
+        Out.stride(1),
+        Out.stride(2),
+        Out_scale.stride(0),
+        Out_scale.stride(1),
+        Out_scale.stride(2),
         head_num,
         BLOCK_DMODEL=head_dim,
         BLOCK_HEAD=BLOCK_HEAD,
@@ -149,6 +181,6 @@ def test2():
     print("cos ", cos(src.flatten().to(torch.float32), (value_dest * scale_dest).flatten().to(torch.float32)))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     test1()
     test2()
diff --git a/test/benchmark_qps.py b/test/benchmark_qps.py
@@ -105,12 +105,6 @@ async def async_post_stream_openai(url, prompt, max_new_tokens, session):
             async for line in response.content:
                 line = line.strip()
                 if line:
-                    line = line.decode("utf-8")[6:]  # remove "data: "
-                    if line == "[DONE]":
-                        continue
-                    data = json.loads(line)
-                    if not data["choices"][0]["text"]:
-                        continue
                     current_time = time.time()
                     elapsed_time = current_time - last_time
                     used_time.append(elapsed_time)
@@ -249,7 +243,17 @@ async def run_continuous_benchmark(
     end_time = [0.0]
     pending_tasks = []
 
-    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(limit=10 * reqs_num)) as session:
+    timeout = aiohttp.ClientTimeout(
+        total=3600,  # total timeout: 1 hour
+        connect=300,  # connection timeout: 5 minutes
+        sock_connect=300,
+        sock_read=3600,
+    )
+
+    async with aiohttp.ClientSession(
+        connector=aiohttp.TCPConnector(limit=10 * reqs_num),
+        timeout=timeout,
+    ) as session:
         sender_task = asyncio.create_task(
             continuous_sender(
                 session,