
Commit 171ee4e

sampling_backend support sglang_kernel (#855)
1 parent 7ea2597 commit 171ee4e

File tree

3 files changed: +35 -6 lines changed

  lightllm/server/api_cli.py
  lightllm/server/core/objs/start_args_type.py
  lightllm/server/router/model_infer/mode_backend/generic_post_process.py


lightllm/server/api_cli.py

Lines changed: 8 additions & 0 deletions
@@ -352,4 +352,12 @@ def make_argument_parser() -> argparse.ArgumentParser:
         help="""Path of quantization config. It can be used for mixed quantization.
         Examples can be found in lightllm/common/quantization/configs.""",
     )
+    parser.add_argument(
+        "--sampling_backend",
+        type=str,
+        choices=["triton", "sglang_kernel"],
+        default="triton",
+        help="""Sampling implementation to use. 'triton' uses the torch and triton kernel
+        implementation; 'sglang_kernel' uses the sgl_kernel implementation.""",
+    )
     return parser
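
With this change the backend is chosen at server start. A minimal example launch, assuming the usual lightllm.server.api_server entrypoint and a placeholder model path:

    python -m lightllm.server.api_server \
        --model_dir /path/to/model \
        --sampling_backend sglang_kernel

Omitting the flag (or passing --sampling_backend triton) keeps the previous torch/triton sampling path, since the default is "triton".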

lightllm/server/core/objs/start_args_type.py

Lines changed: 1 addition & 0 deletions
@@ -79,3 +79,4 @@ class StartArgs:
     vit_quant_cfg: Optional[str] = field(default=None)
     enable_flashinfer_prefill: bool = field(default=False)
     enable_flashinfer_decode: bool = field(default=False)
+    sampling_backend: str = field(default="triton", metadata={"choices": ["triton", "sglang_kernel"]})
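
This field is what the inference processes read back at sampling time via get_env_start_args(), as the next file shows. A minimal sketch of that read side; get_env_start_args and the sampling_backend field are the ones used by this commit, while the helper function here is hypothetical:

    from lightllm.utils.envs_utils import get_env_start_args

    def use_sglang_sampling() -> bool:
        # True when the server was started with --sampling_backend sglang_kernel.
        return get_env_start_args().sampling_backend == "sglang_kernel"

Note that the dataclass metadata only mirrors the argparse choices; the value is actually validated when the CLI is parsed.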

lightllm/server/router/model_infer/mode_backend/generic_post_process.py

Lines changed: 26 additions & 6 deletions
@@ -3,6 +3,7 @@
 from lightllm.common.basemodel.triton_kernel.apply_penalty import apply_penalty
 from dataclasses import dataclass
 from lightllm.server.router.model_infer.infer_batch import InferReq, g_infer_context
+from lightllm.utils.envs_utils import get_env_start_args


 def sample(logits, reqs, eos_id: List[int] = [2]):
@@ -41,13 +42,32 @@ def sample(logits, reqs, eos_id: List[int] = [2]):
     logits[mask_eos_reqs, eos_id] = -1000000.0
     logits.div_(temperatures.view((-1, 1)))
     probs = torch.softmax(logits, dim=-1)
-    probs_sort, probs_idx = _top_p_top_k(probs, top_ps, top_ks)
-    sampled_index = torch.multinomial(probs_sort, num_samples=1, replacement=True)

-    batch_next_token_ids = torch.gather(probs_idx, dim=1, index=sampled_index)
-    batch_next_token_probs = torch.gather(probs_sort, dim=1, index=sampled_index)
-
-    return batch_next_token_ids.view(-1), batch_next_token_probs.view(-1)
+    if get_env_start_args().sampling_backend == "triton":
+        probs_sort, probs_idx = _top_p_top_k(probs, top_ps, top_ks)
+        sampled_index = torch.multinomial(probs_sort, num_samples=1, replacement=True)
+
+        batch_next_token_ids = torch.gather(probs_idx, dim=1, index=sampled_index)
+        batch_next_token_probs = torch.gather(probs_sort, dim=1, index=sampled_index)
+
+        return batch_next_token_ids.view(-1), batch_next_token_probs.view(-1)
+
+    elif get_env_start_args().sampling_backend == "sglang_kernel":
+        from sgl_kernel import top_k_top_p_sampling_from_probs
+
+        batch_next_token_ids = top_k_top_p_sampling_from_probs(
+            probs,
+            top_ks,
+            top_ps,
+            filter_apply_order="joint",
+            check_nan=True,
+        )
+        int64_batch_next_token_ids = torch.empty_like(batch_next_token_ids, dtype=torch.int64)
+        int64_batch_next_token_ids[:] = batch_next_token_ids
+        batch_next_token_probs = torch.gather(probs, dim=1, index=int64_batch_next_token_ids.view(-1, 1))
+        return batch_next_token_ids.view(-1), batch_next_token_probs.view(-1)
+    else:
+        assert False, "dead path"


 def _top_p_top_k(probs: torch.Tensor, top_ps: torch.Tensor, top_ks: torch.Tensor):
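
Two implementation notes. On the sglang_kernel path, the extra int64 copy exists because torch.gather requires int64 indices, while the kernel presumably returns the sampled ids in a narrower integer dtype. On the triton path, the code depends on _top_p_top_k, whose body is outside this diff; for reference, below is a rough pure-PyTorch sketch of the same joint top-k/top-p filtering (a hypothetical top_p_top_k_ref, not lightllm's actual helper, and not performance-tuned):

    import torch

    def top_p_top_k_ref(probs: torch.Tensor, top_ps: torch.Tensor, top_ks: torch.Tensor):
        # probs: (batch, vocab); top_ps, top_ks: per-request thresholds of shape (batch,).
        probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
        cumsum = torch.cumsum(probs_sort, dim=-1)
        # Drop tokens whose preceding cumulative mass already exceeds top_p (always keeps the top-1 token).
        top_p_mask = (cumsum - probs_sort) > top_ps.view(-1, 1)
        # Drop tokens ranked at or beyond each request's top_k.
        ranks = torch.arange(probs.shape[-1], device=probs.device).expand_as(probs_sort)
        top_k_mask = ranks >= top_ks.view(-1, 1)
        probs_sort = probs_sort.masked_fill(top_p_mask | top_k_mask, 0.0)
        # Renormalize so torch.multinomial samples from a proper distribution.
        probs_sort = probs_sort / probs_sort.sum(dim=-1, keepdim=True)
        return probs_sort, probs_idx

    # Usage mirrors the triton branch above:
    # sampled_index = torch.multinomial(probs_sort, num_samples=1, replacement=True)
    # next_ids = torch.gather(probs_idx, dim=1, index=sampled_index)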
