
Commit d35a2cf

Author: wangzaijun
Commit message: fix all

1 parent 809829d commit d35a2cf

File tree

7 files changed: +239 −20 lines

lightllm/common/basemodel/basemodel.py

Lines changed: 4 additions & 0 deletions

@@ -283,6 +283,10 @@ def _create_inferstate(self, model_input: ModelInput, microbatch_index: int = 0)
                 infer_state.b_ready_cache_len = model_input.b_ready_cache_len
             else:
                 infer_state.b_ready_cache_len = torch.zeros_like(input=infer_state.b_seq_len)
+        else:
+            if enable_diverse_mode_gqa_decode_fast_kernel():
+                infer_state.b_shared_seq_len = model_input.b_shared_seq_len
+                infer_state.b_mark_shared_group = model_input.b_mark_shared_group
 
         infer_state.multimodal_params = model_input.multimodal_params
 
lightllm/common/basemodel/infer_struct.py

Lines changed: 4 additions & 0 deletions

@@ -24,6 +24,10 @@ def __init__(self):
         self.b_req_idx: torch.Tensor = None
         self.b_start_loc: torch.Tensor = None
         self.b_ready_cache_len: torch.Tensor = None  # only for prefill prompt cache used.
+
+        self.b_shared_seq_len: torch.Tensor = None  # only for diverse kv cache used in decode phase.
+        self.b_mark_shared_group: torch.Tensor = None  # only for diverse kv cache used in decode phase.
+
         self.b_seq_len: torch.Tensor = None
         # max_len_in_batch has a different meaning in the prefill and decode phases
         # in the prefill phase it is the maximum input-token length over all reqs (excluding the already cached part)
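The two new fields describe, per request in a decode batch, how much of its KV cache is a prefix shared with neighboring requests. Below is a minimal sketch with illustrative values only; the exact group-mark convention is the one described in generic_pre_process.py further down, and is an assumption here.

import torch

# Hypothetical decode batch of four requests: the first three were forked from the same
# prompt for diverse sampling and share a 512-token prefix; the fourth stands alone.
# Assumed convention: the closing request of each shared-prefix group carries the group
# size, other slots are 0, and singleton groups get a shared length of 0.
b_seq_len = torch.tensor([520, 523, 517, 300], dtype=torch.int32)
b_shared_seq_len = torch.tensor([512, 512, 512, 0], dtype=torch.int32)
b_mark_shared_group = torch.tensor([0, 0, 3, 1], dtype=torch.int32)

# A diverse-aware decode kernel can then attend over the 512 shared tokens once per group
# and only handle the short per-request suffixes separately.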

lightllm/models/llama/layer_infer/transformer_layer_infer.py

Lines changed: 36 additions & 0 deletions

@@ -111,6 +111,14 @@ def _bind_attention(self):
                 LlamaTransformerLayerInfer._context_attention_kernel_ppl_int8kv, self
             )
+        elif "ppl_int8kv_flashdecoding_diverse" in self.mode:
+            self._token_attention_kernel = partial(
+                LlamaTransformerLayerInfer._token_decode_attention_ppl_int8kv_flashdecoding_diverse, self
+            )
+            self._copy_kv_to_mem_cache = partial(LlamaTransformerLayerInfer._copy_kv_to_mem_cache_ppl_int8kv, self)
+            self._context_attention_kernel = partial(
+                LlamaTransformerLayerInfer._context_attention_kernel_ppl_int8kv, self
+            )
         elif "ppl_int8kv_flashdecoding" in self.mode:
             self._token_attention_kernel = partial(
                 LlamaTransformerLayerInfer._token_decode_attention_ppl_int8kv_flashdecoding, self
             )
@@ -784,6 +792,34 @@ def _token_decode_attention_ppl_int8kv_flashdecoding(
             alloc_tensor_func=self.alloc_tensor,
         )
 
+    def _token_decode_attention_ppl_int8kv_flashdecoding_diverse(
+        self, q, infer_state: LlamaInferStateInfo, layer_weight, out=None
+    ):
+        from lightllm.models.llama.triton_kernel.ppl_int8kv_flash_decoding_diverse import (
+            token_decode_attention_flash_decoding,
+        )
+
+        cache_k = infer_state.mem_manager.kv_buffer[self.layer_num_][:, 0 : self.tp_k_head_num_, :]
+        cache_k_scale = infer_state.mem_manager.scale_buffer[self.layer_num_][:, 0 : self.tp_k_head_num_, :]
+        cache_v = infer_state.mem_manager.kv_buffer[self.layer_num_][
+            :, self.tp_k_head_num_ : self.tp_k_head_num_ + self.tp_v_head_num_, :
+        ]
+        cache_v_scale = infer_state.mem_manager.scale_buffer[self.layer_num_][
+            :, self.tp_k_head_num_ : self.tp_k_head_num_ + self.tp_v_head_num_, :
+        ]
+        return token_decode_attention_flash_decoding(
+            q,
+            infer_state,
+            self.tp_q_head_num_,
+            self.head_dim_,
+            cache_k,
+            cache_k_scale,
+            cache_v,
+            cache_v_scale,
+            out=out,
+            alloc_tensor_func=self.alloc_tensor,
+        )
+
     def _token_decode_attention_ppl_int4kv_flashdecoding(
         self, q, infer_state: LlamaInferStateInfo, layer_weight, out=None
     ):
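The new `_token_decode_attention_ppl_int8kv_flashdecoding_diverse` method only builds views into the per-layer int8 KV pool before delegating to the Triton driver. A small standalone sketch of that slicing follows; the shapes are assumptions for illustration (in lightllm they come from the mem_manager), and the one-scale-per-8-values layout is inferred from the "group8" op name rather than stated in the diff.

import torch

# Assumed toy shapes: kv_buffer packs the K heads followed by the V heads along dim 1,
# and scale_buffer holds per-group dequantization scales with the same head layout.
token_num, tp_k_head_num, tp_v_head_num, head_dim = 2048, 8, 8, 128
kv_buffer = torch.empty(token_num, tp_k_head_num + tp_v_head_num, head_dim, dtype=torch.int8)
scale_buffer = torch.empty(token_num, tp_k_head_num + tp_v_head_num, head_dim // 8, dtype=torch.float16)

cache_k = kv_buffer[:, 0:tp_k_head_num, :]
cache_k_scale = scale_buffer[:, 0:tp_k_head_num, :]
cache_v = kv_buffer[:, tp_k_head_num : tp_k_head_num + tp_v_head_num, :]
cache_v_scale = scale_buffer[:, tp_k_head_num : tp_k_head_num + tp_v_head_num, :]

# All four are zero-copy views into the shared pool, so building them per layer and per
# decode step is cheap; the actual dequantization happens inside the attention kernels.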
lightllm/models/llama/triton_kernel/ppl_int8kv_flash_decoding_diverse.py

Lines changed: 63 additions & 19 deletions

@@ -1,11 +1,15 @@
 # int8kv flash decoding attention implementation tailored for diverse mode; it enables more efficient diverse sampling
 import torch
 from lightllm.utils.light_utils import HAS_LIGHTLLM_KERNEL, light_ops
+from lightllm.common.basemodel.infer_struct import InferStateInfo
+from .ppl_int8kv_flash_decoding_diverse_stage1 import flash_decode_stage1
+from .ppl_int8kv_flash_decoding_diverse_stage3 import flash_diverse_decode_stage3
+from lightllm.utils.envs_utils import get_diverse_max_batch_shared_group_size
 
 
 def token_decode_attention_flash_decoding(
     q,
-    infer_state,
+    infer_state: InferStateInfo,
     q_head_num,
     head_dim,
     cache_k,
@@ -14,14 +18,21 @@ def token_decode_attention_flash_decoding(
     cache_v_scale,
     out=None,
     alloc_tensor_func=torch.empty,
+    shared_streams_dict={},
 ):
+    if "stream1" not in shared_streams_dict:
+        shared_streams_dict["stream1"] = torch.cuda.Stream()
+    if "stream2" not in shared_streams_dict:
+        shared_streams_dict["stream2"] = torch.cuda.Stream()
+
+    stream1 = shared_streams_dict["stream1"]
+    stream2 = shared_streams_dict["stream2"]
+
     BLOCK_SEQ = 256
     batch_size = infer_state.batch_size
     max_len_in_batch = infer_state.max_len_in_batch
     calcu_shape1 = (batch_size, q_head_num, head_dim)
 
-    from .flash_decoding_stage2 import flash_decode_stage2
-
     o_tensor = alloc_tensor_func(q.shape, q.dtype, q.device) if out is None else out
 
     mid_o = alloc_tensor_func(
@@ -31,21 +42,54 @@
         [batch_size, q_head_num, max_len_in_batch // BLOCK_SEQ + 2], dtype=q.dtype, device="cuda"
     )
 
-    light_ops.group8_int8kv_flashdecoding_stage1(
-        BLOCK_SEQ,
-        mid_o,
-        mid_o_logexpsum,
-        1.0 / (head_dim ** 0.5),
-        q.view(calcu_shape1),
-        cache_k,
-        cache_k_scale,
-        cache_v,
-        cache_v_scale,
-        infer_state.req_manager.req_to_token_indexs,
-        infer_state.b_req_idx,
-        infer_state.b_seq_len,
-        infer_state.max_len_in_batch,
-    )
+    current_stream = torch.cuda.current_stream()
 
-    flash_decode_stage2(mid_o, mid_o_logexpsum, infer_state.b_seq_len, o_tensor.view(calcu_shape1), BLOCK_SEQ)
+    stream1.wait_stream(current_stream)
+    with torch.cuda.stream(stream1):
+        flash_decode_stage1(
+            q=q.view(calcu_shape1),
+            k=cache_k,
+            k_scale=cache_k_scale,
+            v=cache_v,
+            v_scale=cache_v_scale,
+            Req_to_tokens=infer_state.req_manager.req_to_token_indexs,
+            B_req_idx=infer_state.b_req_idx,
+            b_shared_seq_len=infer_state.b_shared_seq_len,
+            b_mark_shared_group=infer_state.b_mark_shared_group,
+            b_seq_len=infer_state.b_seq_len,
+            max_len_in_batch=infer_state.max_len_in_batch,
+            mid_out=mid_o,
+            mid_out_logsumexp=mid_o_logexpsum,
+            BLOCK_SEQ=BLOCK_SEQ,
+            max_batch_group_size=get_diverse_max_batch_shared_group_size(),
+        )
+    stream2.wait_stream(current_stream)
+    with torch.cuda.stream(stream2):
+        light_ops.group8_int8kv_flashdecoding_stage1(
+            BLOCK_SEQ,
+            mid_o,
+            mid_o_logexpsum,
+            1.0 / (head_dim ** 0.5),
+            q.view(calcu_shape1),
+            cache_k,
+            cache_k_scale,
+            cache_v,
+            cache_v_scale,
+            infer_state.req_manager.req_to_token_indexs,
+            infer_state.b_req_idx,
+            infer_state.b_seq_len,
+            infer_state.max_len_in_batch,
+        )
+
+    current_stream.wait_stream(stream1)
+    current_stream.wait_stream(stream2)
+
+    flash_diverse_decode_stage3(
+        mid_out=mid_o,
+        mid_out_logexpsum=mid_o_logexpsum,
+        B_Seqlen=infer_state.b_seq_len,
+        b_shared_seq_len=infer_state.b_shared_seq_len,
+        O=o_tensor.view(calcu_shape1),
+        block_seq=BLOCK_SEQ,
+    )
     return o_tensor
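The driver overlaps the two stage-1 kernels (the new Triton kernel for the shared-prefix blocks and the existing light_ops kernel) on two side CUDA streams, then joins both before the stage-3 merge. Below is a minimal, self-contained sketch of that pattern; `kernel_a` and `kernel_b` are placeholder callables, not the lightllm API.

import torch

def run_two_kernels_overlapped(kernel_a, kernel_b, _streams={}):
    """Issue two independent kernels on side streams and join on the caller's stream.

    The mutable default argument caches the streams per process, mirroring the
    shared_streams_dict trick in the diff above.
    """
    if "s1" not in _streams:
        _streams["s1"] = torch.cuda.Stream()
        _streams["s2"] = torch.cuda.Stream()
    s1, s2 = _streams["s1"], _streams["s2"]
    current = torch.cuda.current_stream()

    # Each side stream first waits on the current stream so it sees prior work
    # (e.g. the projection that produced q and the mid buffers).
    s1.wait_stream(current)
    with torch.cuda.stream(s1):
        kernel_a()
    s2.wait_stream(current)
    with torch.cuda.stream(s2):
        kernel_b()

    # The caller's stream must wait on both before launching the consumer
    # (stage 3 in the diff), otherwise it could read partial results.
    current.wait_stream(s1)
    current.wait_stream(s2)

This only pays off if the two stage-1 kernels write disjoint slots of mid_o and mid_o_logexpsum, which is presumably how the shared and non-shared blocks are laid out here.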
lightllm/models/llama/triton_kernel/ppl_int8kv_flash_decoding_diverse_stage3.py

Lines changed: 95 additions & 0 deletions

@@ -0,0 +1,95 @@
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def _fwd_kernel_flash_diverse_decode_stage3(
+    B_Seqlen,
+    b_shared_seq_len,
+    Mid_O,  # [batch, head, seq_block_num, head_dim]
+    Mid_O_LogExpSum,  # [batch, head, seq_block_num]
+    O,  # [batch, head, head_dim]
+    stride_mid_ob,
+    stride_mid_oh,
+    stride_mid_os,
+    stride_mid_od,
+    stride_mid_o_eb,
+    stride_mid_o_eh,
+    stride_mid_o_es,
+    stride_obs,
+    stride_oh,
+    stride_od,
+    BLOCK_SEQ: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head = tl.program_id(1)
+
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
+    cur_batch_shared_len = tl.load(b_shared_seq_len + cur_batch)
+
+    shared_block_n = tl.cdiv(cur_batch_shared_len, BLOCK_SEQ)
+    not_shared_block_n = tl.cdiv(cur_batch_seq_len - cur_batch_shared_len, BLOCK_SEQ)
+
+    block_n_size = shared_block_n + not_shared_block_n
+
+    sum_exp = 0.0
+    max_logic = -float("inf")
+    acc = tl.zeros([BLOCK_DMODEL], dtype=tl.float32)
+
+    offs_v = cur_batch * stride_mid_ob + cur_head * stride_mid_oh + offs_d
+    offs_logic = cur_batch * stride_mid_o_eb + cur_head * stride_mid_o_eh
+    for block_seq_n in range(0, block_n_size, 1):
+        tv = tl.load(Mid_O + offs_v + block_seq_n * stride_mid_os)
+        tlogic = tl.load(Mid_O_LogExpSum + offs_logic + block_seq_n)
+        new_max_logic = tl.maximum(tlogic, max_logic)
+
+        old_scale = tl.exp(max_logic - new_max_logic)
+        acc *= old_scale
+        exp_logic = tl.exp(tlogic - new_max_logic)
+        acc += exp_logic * tv
+        sum_exp = sum_exp * old_scale + exp_logic
+        max_logic = new_max_logic
+
+    tl.store(O + cur_batch * stride_obs + cur_head * stride_oh + offs_d, acc / sum_exp)
+    return
+
+
+@torch.no_grad()
+def flash_diverse_decode_stage3(
+    mid_out: torch.Tensor,
+    mid_out_logexpsum: torch.Tensor,
+    B_Seqlen: torch.Tensor,
+    b_shared_seq_len: torch.Tensor,
+    O: torch.Tensor,
+    block_seq: int,
+):
+    Lk = mid_out.shape[-1]
+    assert Lk in {16, 32, 64, 128}
+    batch, head_num = mid_out.shape[0], mid_out.shape[1]
+    grid = (batch, head_num)
+
+    _fwd_kernel_flash_diverse_decode_stage3[grid](
+        B_Seqlen=B_Seqlen,
+        b_shared_seq_len=b_shared_seq_len,
+        Mid_O=mid_out,
+        Mid_O_LogExpSum=mid_out_logexpsum,
+        O=O,
+        stride_mid_ob=mid_out.stride(0),
+        stride_mid_oh=mid_out.stride(1),
+        stride_mid_os=mid_out.stride(2),
+        stride_mid_od=mid_out.stride(3),
+        stride_mid_o_eb=mid_out_logexpsum.stride(0),
+        stride_mid_o_eh=mid_out_logexpsum.stride(1),
+        stride_mid_o_es=mid_out_logexpsum.stride(2),
+        stride_obs=O.stride(0),
+        stride_oh=O.stride(1),
+        stride_od=O.stride(2),
+        BLOCK_SEQ=block_seq,
+        BLOCK_DMODEL=Lk,
+        num_warps=4,
+        num_stages=2,
+    )
+    return
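Per (batch, head) program, the kernel folds the per-block partial outputs together with a running, numerically stable softmax over the block-wise log-sum-exp values, visiting the shared-prefix blocks and the per-request blocks as one contiguous range. A rough PyTorch reference of the same reduction, included here only as a sketch for understanding (not code from the repository):

import math
import torch

def stage3_reference(mid_out, mid_out_logexpsum, b_seq_len, b_shared_seq_len, block_seq):
    # mid_out: [batch, head, block_slots, head_dim]; mid_out_logexpsum: [batch, head, block_slots]
    batch, head_num, _, head_dim = mid_out.shape
    out = torch.empty(batch, head_num, head_dim, dtype=torch.float32, device=mid_out.device)
    for b in range(batch):
        shared = int(b_shared_seq_len[b])
        total = int(b_seq_len[b])
        # Shared-prefix blocks and per-request blocks are counted separately,
        # matching shared_block_n + not_shared_block_n in the kernel.
        n_blocks = math.ceil(shared / block_seq) + math.ceil((total - shared) / block_seq)
        logits = mid_out_logexpsum[b, :, :n_blocks].float()   # [head, n_blocks]
        values = mid_out[b, :, :n_blocks, :].float()          # [head, n_blocks, head_dim]
        # softmax over blocks equals exp(l_i - max) / sum_j exp(l_j - max): the same
        # normalization the kernel maintains incrementally via max_logic / sum_exp.
        weights = torch.softmax(logits, dim=-1)
        out[b] = torch.einsum("hn,hnd->hd", weights, values)
    return out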

lightllm/server/router/model_infer/mode_backend/generic_pre_process.py

Lines changed: 7 additions & 1 deletion

@@ -144,7 +144,6 @@ def prepare_decode_inputs(req_objs: List[InferReq]) -> Tuple[ModelInput, List[In
         # Each non-zero entry of b_mark_shared_group records how many preceding requests form a shared-prefix
         # group together with it. Requests in the same shared-prefix group necessarily carry identical values
         # in the corresponding entries of b_shared_seq_len. Some modes can use these two inputs to speed up kernels.
-        b_shared_seq_len = torch.tensor(b_shared_seq_len, dtype=torch.int32, device="cpu")
         b_mark_shared_group = []
         shared_nodes = [req.shared_kv_node for req in run_reqs]
         _current_group = []
@@ -169,6 +168,13 @@ def prepare_decode_inputs(req_objs: List[InferReq]) -> Tuple[ModelInput, List[In
             _current_group.clear()
 
         assert len(b_mark_shared_group) == len(run_reqs)
+        # If a shared group contains only one request, force its shared length to 0 so the
+        # kernel skips useless work and runs more efficiently.
+        b_shared_seq_len = [
+            0 if group_size == 1 else shared_len
+            for shared_len, group_size in zip(b_shared_seq_len, b_mark_shared_group)
+        ]
+        b_shared_seq_len = torch.tensor(b_shared_seq_len, dtype=torch.int32, device="cpu")
         b_mark_shared_group = torch.tensor(b_mark_shared_group, dtype=torch.int32, device="cpu")
     else:
         b_shared_seq_len = None
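A quick worked example of the singleton fix-up above (illustrative values, assuming the group-mark convention from the comment in this file):

# Three requests share a prefix; the fourth forms a group of size 1.
b_shared_seq_len = [128, 128, 128, 96]
b_mark_shared_group = [0, 0, 3, 1]

b_shared_seq_len = [
    0 if group_size == 1 else shared_len
    for shared_len, group_size in zip(b_shared_seq_len, b_mark_shared_group)
]
print(b_shared_seq_len)  # [128, 128, 128, 0] -- the lone request no longer reports a shared prefix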
Lines changed: 30 additions & 0 deletions

@@ -0,0 +1,30 @@
+import pytest
+import torch
+from lightllm.models.llama.triton_kernel.ppl_int8kv_flash_decoding_diverse_stage3 import flash_diverse_decode_stage3
+
+
+@pytest.mark.parametrize(
+    "batch, head_num, seq_len, shared_seq_len, block_seq, head_dim",
+    [
+        (2, 4, 256, 256, 256, 128),
+        (1, 8, 256 * 2, 256, 256, 128),
+        (3, 2, 256 * 4, 256 * 2, 256, 128),
+    ],
+)
+def test_flash_diverse_decode_stage3(batch, head_num, seq_len, shared_seq_len, block_seq, head_dim):
+    # Initialize inputs
+    mid_out = torch.randn(batch, head_num, seq_len // block_seq + 2, head_dim, dtype=torch.bfloat16, device="cuda")
+    mid_out_logexpsum = torch.randn(batch, head_num, seq_len // block_seq + 2, dtype=torch.bfloat16, device="cuda")
+    B_Seqlen = torch.tensor([seq_len] * batch, dtype=torch.int32, device="cuda")
+    b_shared_seq_len = torch.tensor([shared_seq_len] * batch, dtype=torch.int32, device="cuda")
+    out = torch.zeros(batch, head_num, head_dim, dtype=torch.float32, device="cuda")
+
+    # Call the function
+    flash_diverse_decode_stage3(mid_out, mid_out_logexpsum, B_Seqlen, b_shared_seq_len, out, block_seq)
+
+    true_out = torch.zeros_like(out)
+    from lightllm.models.llama.triton_kernel.flash_decoding_stage2 import flash_decode_stage2
+
+    flash_decode_stage2(mid_out, mid_out_logexpsum, B_Seqlen, true_out, block_seq)
+
+    assert torch.allclose(out, true_out, atol=1e-2)
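The plain flash_decode_stage2 appears to be a valid reference here because every parametrization uses seq_len and shared_seq_len that are multiples of block_seq, so the diverse kernel reduces over cdiv(shared, block_seq) + cdiv(seq_len - shared, block_seq) = cdiv(seq_len, block_seq) mid blocks, the same set stage 2 visits; the two outputs should then agree up to the bfloat16 tolerance.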
