 import triton.language as tl
 
 
-@triton.jit
-def _fwd_kernel_gather_and_scatter(
-    probs_idx,
-    probs_sort,
-    req_to_next_token_ids,
-    req_to_next_token_probs,
-    sampled_index,
-    b_req_idx,
-    probs_idx_stride,
-    probs_sort_stride,
-    req_to_next_token_ids_stride,
-    req_to_next_token_probs_stride,
-):
-    cur_index = tl.program_id(0)
-    cur_req_idx = tl.load(b_req_idx + cur_index)
-    cur_sampled_index = tl.load(sampled_index + cur_index)
-    cur_token_index = tl.load(probs_idx + cur_index * probs_idx_stride + cur_sampled_index)
-    cur_token_probs = tl.load(probs_sort + cur_index * probs_sort_stride + cur_sampled_index)
-    tl.store(req_to_next_token_ids + cur_req_idx * req_to_next_token_ids_stride, cur_token_index)
-    tl.store(req_to_next_token_probs + cur_req_idx * req_to_next_token_probs_stride, tl.log(cur_token_probs))
-    return
-
-
-@torch.no_grad()
-def gather_and_scatter_token_to_cpu(
-    probs_idx: torch.Tensor,
-    probs_sort: torch.Tensor,
-    req_to_next_token_ids: torch.Tensor,
-    req_to_next_token_probs: torch.Tensor,
-    sampled_index: torch.Tensor,
-    b_req_idx: torch.Tensor,
-):
-    """
-    Gather the sampled next_token_id and next_token_probs (GPU tensors) and scatter them
-    into req_to_next_token_ids and req_to_next_token_probs (CPU tensors).
-    Args:
-        probs_idx: (batch_size, vocab_size)
-        probs_sort: (batch_size, vocab_size)
-        req_to_next_token_ids: (max_req_num,)
-        req_to_next_token_probs: (max_req_num,)
-        sampled_index: (batch_size,)
-        b_req_idx: (batch_size,)
-    """
-    assert probs_idx.shape == probs_sort.shape
-    assert sampled_index.shape[0] == b_req_idx.shape[0]
-    batch_size = b_req_idx.shape[0]
-    grid = (batch_size,)
-    num_warps = 1
-
-    _fwd_kernel_gather_and_scatter[grid](
-        probs_idx,
-        probs_sort,
-        req_to_next_token_ids,
-        req_to_next_token_probs,
-        sampled_index,
-        b_req_idx,
-        probs_idx.stride(0),
-        probs_sort.stride(0),
-        req_to_next_token_ids.stride(0),
-        req_to_next_token_probs.stride(0),
-        num_warps=num_warps,
-        num_stages=1,
-    )
-    return
-
-
 @triton.jit
 def _fwd_kernel_scatter(
     next_token_ids,
     req_to_next_token_ids,
     b_req_idx,
     b_mtp_index,
+    b_has_out,
     req_to_next_token_ids_stride,
     req_to_next_token_ids_stride_1,
+    num_size,
+    HAS_OUT_IS_NONE: tl.constexpr,
+    BLOCK: tl.constexpr,
 ):
-    cur_index = tl.program_id(0)
-    cur_req_idx = tl.load(b_req_idx + cur_index)
-    cur_mtp_index = tl.load(b_mtp_index + cur_index)
-    cur_next_token_id = tl.load(next_token_ids + cur_index)
-    tl.store(req_to_next_token_ids + cur_req_idx * req_to_next_token_ids_stride + cur_mtp_index, cur_next_token_id)
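+    # Each program instance now handles a BLOCK-sized slice of the batch;
+    # block_mask guards the tail when batch_size is not a multiple of BLOCK.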
+    block_index = tl.program_id(0)
+    block_range = block_index * BLOCK + tl.arange(0, BLOCK)
+    block_mask = block_range < num_size
+
+    cur_req_idx = tl.load(b_req_idx + block_range, mask=block_mask)
+    cur_mtp_index = tl.load(b_mtp_index + block_range, mask=block_mask)
+    cur_next_token_id = tl.load(next_token_ids + block_range, mask=block_mask)
+
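+    # When b_has_out is provided, only lanes whose has_out flag is set are stored;
+    # with HAS_OUT_IS_NONE, every in-range lane is written unconditionally.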
+    if not HAS_OUT_IS_NONE:
+        cur_has_out = tl.load(b_has_out + block_range, mask=block_mask, other=False)
+        tl.store(
+            req_to_next_token_ids + cur_req_idx * req_to_next_token_ids_stride + cur_mtp_index,
+            cur_next_token_id,
+            mask=cur_has_out & block_mask,
+        )
+    else:
+        tl.store(
+            req_to_next_token_ids + cur_req_idx * req_to_next_token_ids_stride + cur_mtp_index,
+            cur_next_token_id,
+            mask=block_mask,
+        )
+
     return
 
 
@@ -93,6 +48,7 @@ def scatter_token(
     req_to_next_token_ids: torch.Tensor,
     b_req_idx: torch.Tensor,
     b_mtp_index: torch.Tensor,
+    b_has_out: torch.Tensor = None,
 ):
     """
     This function is used to scatter the token_info (GPU tensor) to the req_to_token_info (CPU tensor).
@@ -104,16 +60,22 @@ def scatter_token(
     """
     assert next_token_ids.shape[0] == b_req_idx.shape[0]
     batch_size = b_req_idx.shape[0]
-    grid = (batch_size,)
+    BLOCK = 256
+
+    grid = (triton.cdiv(batch_size, BLOCK),)
     num_warps = 1
 
     _fwd_kernel_scatter[grid](
-        next_token_ids,
-        req_to_next_token_ids,
-        b_req_idx,
-        b_mtp_index,
-        req_to_next_token_ids.stride(0),
-        req_to_next_token_ids.stride(1),
+        next_token_ids=next_token_ids,
+        req_to_next_token_ids=req_to_next_token_ids,
+        b_req_idx=b_req_idx,
+        b_mtp_index=b_mtp_index,
+        b_has_out=b_has_out,
+        req_to_next_token_ids_stride=req_to_next_token_ids.stride(0),
+        req_to_next_token_ids_stride_1=req_to_next_token_ids.stride(1),
+        num_size=batch_size,
+        HAS_OUT_IS_NONE=b_has_out is None,
+        BLOCK=BLOCK,
         num_warps=num_warps,
         num_stages=1,
     )
@@ -128,12 +90,18 @@ def _fwd_kernel_gather(
     output,
     b_req_idx,
     b_mtp_index,
+    num_size,
+    BLOCK: tl.constexpr,
 ):
-    cur_index = tl.program_id(0)
-    cur_req_idx = tl.load(b_req_idx + cur_index)
-    cur_mtp_index = tl.load(b_mtp_index + cur_index)
-    cur_next_token_id = tl.load(req_to_next_token_ids + cur_req_idx * req_to_next_token_ids_stride + cur_mtp_index)
-    tl.store(output + cur_index, cur_next_token_id)
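+    # Blocked gather: each in-range lane reads one request's next token id into the dense output.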
+    block_index = tl.program_id(0)
+    block_range = block_index * BLOCK + tl.arange(0, BLOCK)
+    block_mask = block_range < num_size
+    cur_req_idx = tl.load(b_req_idx + block_range, mask=block_mask)
+    cur_mtp_index = tl.load(b_mtp_index + block_range, mask=block_mask)
+    cur_next_token_id = tl.load(
+        req_to_next_token_ids + cur_req_idx * req_to_next_token_ids_stride + cur_mtp_index, mask=block_mask
+    )
+    tl.store(output + block_range, cur_next_token_id, mask=block_mask)
     return
 
 
@@ -148,72 +116,40 @@ def gather_token(req_to_next_token_ids: torch.Tensor, b_req_idx: torch.Tensor, b
         output: (batch_size,)
     """
     batch_size = b_req_idx.shape[0]
-    output = torch.empty_like(b_req_idx)
-    grid = (batch_size,)
+    output = torch.empty(batch_size, dtype=req_to_next_token_ids.dtype, device="cuda")
+    BLOCK = 256
+    grid = (triton.cdiv(batch_size, BLOCK),)
     num_warps = 1
     _fwd_kernel_gather[grid](
-        req_to_next_token_ids,
-        req_to_next_token_ids.stride(0),
-        req_to_next_token_ids.stride(1),
-        output,
-        b_req_idx,
-        b_mtp_index,
+        req_to_next_token_ids=req_to_next_token_ids,
+        req_to_next_token_ids_stride=req_to_next_token_ids.stride(0),
+        req_to_next_token_ids_stride_1=req_to_next_token_ids.stride(1),
+        output=output,
+        b_req_idx=b_req_idx,
+        b_mtp_index=b_mtp_index,
+        num_size=batch_size,
+        BLOCK=BLOCK,
         num_warps=num_warps,
         num_stages=1,
     )
     return output
 
 
-def _top_p_top_k(probs: torch.Tensor, top_ps: torch.Tensor, top_ks: torch.Tensor):
-    probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
-
-    probs_sum = torch.cumsum(probs_sort, dim=-1)
-    probs_sort[(probs_sum - probs_sort) > top_ps.view(-1, 1)] = 0.0
-
-    probs_sort[torch.arange(0, probs.shape[-1], device="cuda").view(1, -1) >= top_ks.view(-1, 1)] = 0.0
-
-    return probs_sort, probs_idx
-
-
-def test_gather_and_scatter_token_to_cpu():
-    batch_size = 30
-    vocab_size = 60000
-    req_to_next_token_ids = torch.ones((1000,), dtype=torch.int32, pin_memory=True)
-    req_to_next_token_probs = torch.ones((1000,), dtype=torch.float32, pin_memory=True)
-    req_ids = torch.arange(20, 20 + batch_size, dtype=torch.int32).cuda()
-    probs = torch.randn((batch_size, vocab_size)).cuda()
-    top_ps = torch.rand((batch_size,)).cuda()
-    top_ks = torch.ones((batch_size,), dtype=torch.int32).cuda()
-    probs_sort, probs_idx = _top_p_top_k(probs, top_ps, top_ks)
-    sampled_index = torch.multinomial(probs_sort, num_samples=1, replacement=True)
-    batch_next_token_ids = torch.gather(probs_idx, dim=1, index=sampled_index)
-    batch_next_token_probs = torch.gather(probs_sort, dim=1, index=sampled_index)
-
-    gather_and_scatter_token_to_cpu(
-        probs_idx, probs_sort, req_to_next_token_ids, req_to_next_token_probs, sampled_index, req_ids
-    )
-    diff_ids = (req_to_next_token_ids[20 : 20 + batch_size].cuda() - batch_next_token_ids.view(-1)).abs().max()
-    diff_probs = (req_to_next_token_probs[20 : 20 + batch_size].cuda() - batch_next_token_probs.view(-1)).abs().max()
-    assert diff_ids < 1e-6
-    assert diff_probs < 1e-6
-    print("test_gather_and_scatter_token_to_cpu passed")
-
-
 def test_scatter_token_to_cpu():
     batch_size = 30
-    req_to_token_info = torch.zeros((1000,), dtype=torch.float32, pin_memory=True)
+    req_to_token_info = torch.zeros((1000, 1), dtype=torch.float32, pin_memory=True)
     token_info = torch.randn((batch_size,)).cuda()
     req_ids = torch.arange(20, 20 + batch_size, dtype=torch.int32).cuda()
     mtp_index = torch.zeros((batch_size,), dtype=torch.int32).cuda()
     scatter_token(token_info, req_to_token_info, req_ids, mtp_index)
-    diff = (req_to_token_info[20 : 20 + batch_size].cuda() - token_info).abs().max()
+    diff = (req_to_token_info[20 : 20 + batch_size].cuda().view(-1) - token_info).abs().max()
     assert diff < 1e-6
     print("test_scatter_token_to_cpu passed")
 
 
 def test_gather_token():
     batch_size = 30
-    req_to_token_info = torch.zeros((1000,), dtype=torch.int32, pin_memory=True)
+    req_to_token_info = torch.zeros((1000, 1), dtype=torch.float32, pin_memory=True)
     token_info = torch.randn((batch_size,)).cuda()
     req_ids = torch.arange(20, 20 + batch_size, dtype=torch.int32).cuda()
     mtp_index = torch.zeros((batch_size,), dtype=torch.int32).cuda()
@@ -225,6 +161,5 @@ def test_gather_token():
 
 
 if __name__ == "__main__":
-    test_gather_and_scatter_token_to_cpu()
     test_scatter_token_to_cpu()
     test_gather_token()
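
A minimal usage sketch of the new b_has_out path, which the tests above do not
exercise. This is not part of the commit; the request indices and flag values are
illustrative, while shapes and dtypes follow the docstrings and tests:

    import torch
    batch_size = 4
    # Per-request table on pinned CPU memory, one MTP slot per request.
    req_to_next_token_ids = torch.zeros((1000, 1), dtype=torch.int32, pin_memory=True)
    next_token_ids = torch.arange(batch_size, dtype=torch.int32).cuda()
    b_req_idx = torch.tensor([3, 7, 11, 15], dtype=torch.int32).cuda()
    b_mtp_index = torch.zeros((batch_size,), dtype=torch.int32).cuda()
    # Only requests flagged True receive a token this step; request 7 keeps its old value.
    b_has_out = torch.tensor([True, False, True, True]).cuda()
    scatter_token(next_token_ids, req_to_next_token_ids, b_req_idx, b_mtp_index, b_has_out)
    assert req_to_next_token_ids[7, 0] == 0  # masked lane was not written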