import numpy as np
import triton.language as tl


78@triton .jit
89def _fwd_kernel_apply_penalty (
9- Logits , presence_penalty , freqency_penalty , repetition_penalty ,
10- p_token_ids , p_token_counts , p_cumsum_seq_len ,
11- stride_logit_b , stride_logit_s ,
12- BLOCK_P : tl .constexpr
10+ Logits ,
11+ presence_penalty ,
12+ freqency_penalty ,
13+ repetition_penalty ,
14+ p_token_ids ,
15+ p_token_counts ,
16+ p_cumsum_seq_len ,
17+ exponential_decay_length_penalties ,
18+ length_penalty_idx ,
19+ eos_ids ,
20+ mask_eos_reqs ,
21+ stride_logit_b ,
22+ stride_logit_s ,
23+ BLOCK_P : tl .constexpr ,
24+ EOS_ID_NUM : tl .constexpr ,
1325):
1426 cur_batch = tl .program_id (0 )
1527 cur_freqency = tl .load (freqency_penalty + cur_batch )
@@ -18,36 +30,70 @@ def _fwd_kernel_apply_penalty(
1830
1931 cur_batch_start_index = tl .load (p_cumsum_seq_len + cur_batch )
2032 cur_batch_end_index = tl .load (p_cumsum_seq_len + cur_batch + 1 )
33+ for block_start_index in range (cur_batch_start_index , cur_batch_end_index , BLOCK_P ):
34+ cur_batch_id_offset = block_start_index + tl .arange (0 , BLOCK_P )
35+ batch_ids = tl .load (p_token_ids + cur_batch_id_offset , mask = cur_batch_id_offset < cur_batch_end_index , other = 0 )
36+ batch_ids_count = tl .load (
37+ p_token_counts + cur_batch_id_offset , mask = cur_batch_id_offset < cur_batch_end_index , other = 0
38+ )
39+
40+ row_start_ptr = Logits + cur_batch * stride_logit_b
41+ cur_offset = row_start_ptr + batch_ids
42+ cur_logits = tl .load (cur_offset , mask = cur_batch_id_offset < cur_batch_end_index , other = 0.0 )
43+ rep_logits = tl .where (cur_logits > 0 , cur_logits / cur_repetition , cur_logits * cur_repetition )
44+ freq_logits = rep_logits - batch_ids_count * cur_freqency
45+ pre_logits = freq_logits - cur_presence
46+ output_ptr = Logits + cur_batch * stride_logit_b + batch_ids
47+ tl .store (output_ptr , pre_logits , mask = cur_batch_id_offset < cur_batch_end_index )
2148
22- cur_batch_id_offset = cur_batch_start_index + tl .arange (0 , BLOCK_P )
23- batch_ids = tl .load (p_token_ids + cur_batch_id_offset , mask = cur_batch_id_offset < cur_batch_end_index , other = 0 )
24- batch_ids_count = tl .load (p_token_counts + cur_batch_id_offset , mask = cur_batch_id_offset < cur_batch_end_index , other = 0 )
25-
26- row_start_ptr = Logits + cur_batch * stride_logit_b
27- cur_offset = row_start_ptr + batch_ids
28- cur_logits = tl .load (cur_offset , mask = cur_batch_id_offset < cur_batch_end_index , other = 0.0 )
29- rep_logits = tl .where (cur_logits > 0 , cur_logits / cur_repetition , cur_logits * cur_repetition )
30- freq_logits = rep_logits - batch_ids_count * cur_freqency
31- pre_logits = freq_logits - cur_presence
32- output_ptr = Logits + cur_batch * stride_logit_b + batch_ids
33- tl .store (output_ptr , pre_logits , mask = cur_batch_id_offset < cur_batch_end_index )
49+ mask_eos = tl .load (mask_eos_reqs + cur_batch )
50+ exponential_decay_length_penalty = tl .load (exponential_decay_length_penalties + cur_batch )
51+ length_penalty = tl .load (length_penalty_idx + cur_batch )
52+ penalty_scale = tl .exp2 (tl .log2 (exponential_decay_length_penalty ) * length_penalty ) - 1
3453
54+ for eos_index in range (EOS_ID_NUM ):
55+ eos_id = tl .load (eos_ids + eos_index )
56+ cur_eos_logit_ptr = Logits + cur_batch * stride_logit_b + eos_id
57+ cur_eos_logit = tl .load (cur_eos_logit_ptr )
58+ cur_eos_logit = cur_eos_logit + tl .abs (cur_eos_logit ) * penalty_scale
59+ cur_eos_logit = tl .where (mask_eos , - 10000000.0 , cur_eos_logit )
60+ tl .store (cur_eos_logit_ptr , cur_eos_logit )
3561 return
3662
63+
@torch.no_grad()
def apply_penalty(
    Logits,
    presence_penalty,
    freqency_penalty,
    repetition_penalty,
    p_token_ids,
    p_token_counts,
    p_cumsum_seq_len,
    exponential_decay_length_penalties,
    length_penalty_idx,
    eos_ids,
    mask_eos_reqs,
):
    """Launch the penalty kernel over every batch row of Logits (in place).

    Args:
        Logits: (batch, vocab) contiguous float tensor, modified in place.
        presence_penalty / freqency_penalty / repetition_penalty: per-request
            penalty tensors, one value per batch row.
        p_token_ids / p_token_counts: flattened seen-token ids and counts for
            all requests; p_cumsum_seq_len (len batch+1) gives each request's
            slice boundaries.
        exponential_decay_length_penalties / length_penalty_idx: per-request
            EOS decay base and current length index.
        eos_ids: 1-D tensor of EOS token ids shared by the batch.
        mask_eos_reqs: per-request bool/int flags; truthy rows get EOS masked
            to -1e7.
    """
    # Kernel tiles the token history, so the tile size no longer needs to
    # cover the longest history — a fixed 1024 block is sufficient.
    assert Logits.is_contiguous()
    BLOCK_P = 1024
    num_warps = 8
    _fwd_kernel_apply_penalty[(Logits.shape[0],)](
        Logits,
        presence_penalty,
        freqency_penalty,
        repetition_penalty,
        p_token_ids,
        p_token_counts,
        p_cumsum_seq_len,
        exponential_decay_length_penalties,
        length_penalty_idx,
        eos_ids,
        mask_eos_reqs,
        Logits.stride(0),
        Logits.stride(1),
        num_warps=num_warps,
        BLOCK_P=BLOCK_P,
        EOS_ID_NUM=eos_ids.shape[0],
    )
    return