
Commit 94ca166

Merge branch 'dp_balancer' into fused_moe_improve
2 parents: 6430851 + ea0ada4

File tree

24 files changed: +484 additions, -203 deletions

docs/CN/source/getting_started/installation.rst

Lines changed: 10 additions & 3 deletions
@@ -23,9 +23,16 @@ Lightllm is an inference framework developed in pure Python, with its kernels implemented in Triton
     $ # Pull the official image
     $ docker pull ghcr.io/modeltc/lightllm:main
     $
-    $ # Run
+    $ # Run the service. Note: the current LightLLM service relies heavily on shared memory; before
+    $ # starting, make sure your Docker settings allocate enough shared memory, otherwise the
+    $ # service may fail to start properly.
+    $ # 1. For text-only services, allocate at least 2GB of shared memory; if you have plenty of RAM, 16GB or more is recommended.
+    $ # 2. For multimodal services, allocate at least 16GB of shared memory, adjusted to your actual workload.
+    $ # If you do not have enough shared memory, try lowering the --running_max_req_size parameter at startup; this reduces
+    $ # the number of concurrent requests but also reduces shared-memory usage. For multimodal services, you can also lower
+    $ # the --cache_capacity parameter to reduce shared-memory usage.
     $ docker run -it --gpus all -p 8080:8080 \
-    $         --shm-size 1g -v your_local_path:/data/ \
+    $         --shm-size 2g -v your_local_path:/data/ \
     $ ghcr.io/modeltc/lightllm:main /bin/bash

 You can also manually build the image from source and run it:
@@ -37,7 +44,7 @@ Lightllm is an inference framework developed in pure Python, with its kernels implemented in Triton
     $
     $ # Run
     $ docker run -it --gpus all -p 8080:8080 \
-    $         --shm-size 1g -v your_local_path:/data/ \
+    $         --shm-size 2g -v your_local_path:/data/ \
     $ <image_name> /bin/bash

 Or you can directly use the script to launch the image and run it with one click:

docs/CN/source/tutorial/api_server_args_zh.rst

Lines changed: 4 additions & 0 deletions
@@ -236,6 +236,10 @@ Attention type selection parameters

 Multi-result output mode

+.. option:: --schedule_time_interval
+
+    Scheduling time interval, default is ``0.03``, in seconds
+

 Output constraint parameters
 -----------

docs/EN/source/getting_started/installation.rst

Lines changed: 15 additions & 4 deletions
@@ -23,9 +23,20 @@ The easiest way to install Lightllm is using the official image. You can directl
     $ # Pull the official image
     $ docker pull ghcr.io/modeltc/lightllm:main
     $
-    $ # Run
+    $ # Run. The current LightLLM service relies heavily on shared memory.
+    $ # Before starting, please make sure that you have allocated enough shared memory
+    $ # in your Docker settings; otherwise, the service may fail to start properly.
+    $ #
+    $ # 1. For text-only services, it is recommended to allocate more than 2GB of shared memory.
+    $ #    If your system has sufficient RAM, allocating 16GB or more is recommended.
+    $ # 2. For multimodal services, it is recommended to allocate 16GB or more of shared memory.
+    $ #    You can adjust this value according to your specific requirements.
+    $ #
+    $ # If you do not have enough shared memory available, you can try lowering
+    $ # the --running_max_req_size parameter when starting the service.
+    $ # This will reduce the number of concurrent requests, but also decrease shared memory usage.
     $ docker run -it --gpus all -p 8080:8080 \
-    $         --shm-size 1g -v your_local_path:/data/ \
+    $         --shm-size 2g -v your_local_path:/data/ \
     $ ghcr.io/modeltc/lightllm:main /bin/bash

 You can also manually build the image from source and run it:
@@ -35,9 +46,9 @@ You can also manually build the image from source and run it:
     $ # Manually build the image
     $ docker build -t <image_name> .
     $
-    $ # Run
+    $ # Run,
     $ docker run -it --gpus all -p 8080:8080 \
-    $         --shm-size 1g -v your_local_path:/data/ \
+    $         --shm-size 2g -v your_local_path:/data/ \
     $ <image_name> /bin/bash

 Or you can directly use the script to launch the image and run it with one click:

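As a quick way to act on the shared-memory guidance above, the allocation can be checked from inside the container before launching the server. The snippet below is an illustration only, not part of this commit; it assumes the container exposes shared memory at the usual /dev/shm mount.

import shutil

REQUIRED_GB = 2  # per the docs: 2GB+ for text-only serving, 16GB+ for multimodal serving

# --shm-size controls the size of the tmpfs mounted at /dev/shm inside the container.
total_gb = shutil.disk_usage("/dev/shm").total / 1024 ** 3
if total_gb < REQUIRED_GB:
    raise SystemExit(
        f"/dev/shm is only {total_gb:.1f} GB; restart the container with a larger --shm-size"
    )
print(f"/dev/shm size: {total_gb:.1f} GB, OK")
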
docs/EN/source/tutorial/api_server_args_zh.rst

Lines changed: 4 additions & 0 deletions
@@ -236,6 +236,10 @@ Scheduling Parameters

 Multi-result output mode

+.. option:: --schedule_time_interval
+
+    Schedule time interval, default is ``0.03``, unit is seconds
+
 Output Constraint Parameters
 ---------------------------


lightllm/common/basemodel/triton_kernel/gen_sampling_params.py

Lines changed: 27 additions & 10 deletions
@@ -121,37 +121,54 @@ def _token_id_counter_update_kernel(
     counter_stride_m,
     counter_stride_n,
     next_token_ids_ptr,
+    mask_ptr,
     batch_size,
+    HAS_MASK: tl.constexpr,
     BLOCK: tl.constexpr,
 ):

     block_start_index = tl.program_id(0) * BLOCK
     offs = block_start_index + tl.arange(0, BLOCK)
-    mask = offs < batch_size
-
-    req_idx = tl.load(b_req_idx_ptr + offs, mask=mask, other=0)
-    token_ids = tl.load(next_token_ids_ptr + offs, mask=mask, other=0)
-
-    tl.atomic_add(
-        req_to_out_token_id_counter_ptr + req_idx * counter_stride_m + token_ids * counter_stride_n, 1, mask=mask
-    )
+    loc_mask = offs < batch_size
+
+    req_idx = tl.load(b_req_idx_ptr + offs, mask=loc_mask, other=0)
+    token_ids = tl.load(next_token_ids_ptr + offs, mask=loc_mask, other=0)
+
+    if HAS_MASK:
+        mask = tl.load(mask_ptr + offs, mask=loc_mask, other=False)
+        tl.atomic_add(
+            req_to_out_token_id_counter_ptr + req_idx * counter_stride_m + token_ids * counter_stride_n,
+            1,
+            mask=loc_mask & mask,
+        )
+    else:
+        tl.atomic_add(
+            req_to_out_token_id_counter_ptr + req_idx * counter_stride_m + token_ids * counter_stride_n,
+            1,
+            mask=loc_mask,
+        )
     return


 @torch.no_grad()
 def update_req_to_token_id_counter(
-    b_req_idx: torch.Tensor, next_token_ids: torch.Tensor, req_to_out_token_id_counter: torch.Tensor
+    b_req_idx: torch.Tensor,
+    next_token_ids: torch.Tensor,
+    req_to_out_token_id_counter: torch.Tensor,
+    mask: torch.Tensor = None,
 ):
     batch_size = b_req_idx.shape[0]
     BLOCK = 256
-
+    has_mask = mask is not None
     _token_id_counter_update_kernel[(triton.cdiv(batch_size, BLOCK),)](
         b_req_idx_ptr=b_req_idx,
         req_to_out_token_id_counter_ptr=req_to_out_token_id_counter,
         counter_stride_m=req_to_out_token_id_counter.stride(0),
         counter_stride_n=req_to_out_token_id_counter.stride(1),
         next_token_ids_ptr=next_token_ids,
+        mask_ptr=mask,
         batch_size=batch_size,
+        HAS_MASK=has_mask,
         BLOCK=BLOCK,
         num_warps=1,
     )

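For readers who want to see what the new HAS_MASK path computes, the kernel's effect can be mirrored with a small PyTorch reference. This is a sketch for illustration, not code from the commit: the shapes and values are made up, and index_put_ with accumulate=True plays the role of the kernel's tl.atomic_add.

import torch


def counter_update_reference(b_req_idx, next_token_ids, counter, mask=None):
    # counter: [max_req_num, vocab_size]; each row is a per-request output-token histogram.
    keep = torch.ones_like(b_req_idx, dtype=torch.bool) if mask is None else mask.bool()
    rows = b_req_idx[keep].long()
    cols = next_token_ids[keep].long()
    # accumulate=True adds 1 per (row, col) pair, matching the atomic add in the kernel.
    counter.index_put_((rows, cols), torch.ones_like(rows, dtype=counter.dtype), accumulate=True)
    return counter


counter = torch.zeros(4, 16, dtype=torch.int32)
b_req_idx = torch.tensor([0, 2, 3], dtype=torch.int32)
next_token_ids = torch.tensor([5, 7, 9], dtype=torch.int32)
mask = torch.tensor([True, True, False])  # the third entry is skipped, as with HAS_MASK
counter_update_reference(b_req_idx, next_token_ids, counter, mask)
assert counter[0, 5].item() == 1 and counter[2, 7].item() == 1 and counter[3, 9].item() == 0
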
lightllm/common/req_manager.py

Lines changed: 23 additions & 17 deletions
@@ -155,36 +155,42 @@ def init_req_sampling_params(self, req):
         else:
             self.req_to_out_token_id_counter[req.req_idx].fill_(0)
             if req.sampling_param.shm_param.input_penalty and req.need_out_token_id_statistics:
-                prompt_ids = torch.from_numpy(req.shm_req.get_prompt_ids()).pin_memory().cuda(non_blocking=True)
+                prompt_ids = torch.from_numpy(req.shm_req.get_prompt_ids_numpy()).pin_memory().cuda(non_blocking=True)
                 token_id_counter(
                     prompt_ids=prompt_ids, out_token_id_counter=self.req_to_out_token_id_counter[req.req_idx]
                 )

         return

+    def update_reqs_out_token_counter_gpu(
+        self, b_req_idx: torch.Tensor, next_token_ids: torch.Tensor, mask: torch.Tensor = None
+    ):
+        if self.penalty_counter_mode not in ["gpu_counter", "pin_mem_counter"]:
+            return
+
+        assert b_req_idx.is_cuda and next_token_ids.is_cuda and b_req_idx.shape[0] == next_token_ids.shape[0]
+
+        update_req_to_token_id_counter(
+            b_req_idx=b_req_idx,
+            next_token_ids=next_token_ids,
+            req_to_out_token_id_counter=self.req_to_out_token_id_counter,
+            mask=mask,
+        )
+        return
+
     def update_reqs_token_counter(
         self, req_objs: List, next_token_ids: List[int], accept_mark: Optional[List[List[bool]]] = None
    ):
         from lightllm.server.router.model_infer.infer_batch import InferReq

         req_objs: List[InferReq] = req_objs

-        if self.penalty_counter_mode == "cpu_counter":
-            for req_obj, next_token_id in zip(req_objs, next_token_ids):
-                if req_obj.need_out_token_id_statistics and req_obj.cur_output_len > 0:
-                    req_obj.out_token_id_count[next_token_id] += 1
-        else:
-            b_req_idx = torch.tensor(
-                [req.req_idx for req in req_objs], dtype=torch.int32, device="cpu", pin_memory=True
-            ).cuda(non_blocking=True)
-            next_token_ids = (
-                torch.tensor(next_token_ids, dtype=torch.int32, device="cpu").pin_memory().cuda(non_blocking=True)
-            )
-            update_req_to_token_id_counter(
-                b_req_idx=b_req_idx,
-                next_token_ids=next_token_ids,
-                req_to_out_token_id_counter=self.req_to_out_token_id_counter,
-            )
+        if self.penalty_counter_mode != "cpu_counter":
+            return
+
+        for req_obj, next_token_id in zip(req_objs, next_token_ids):
+            if req_obj.need_out_token_id_statistics and req_obj.cur_output_len > 0:
+                req_obj.out_token_id_count[next_token_id] += 1
         return

     def gen_cpu_out_token_counter_sampling_params(self, req_objs: List):

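The refactor splits token counting into two mutually exclusive paths: update_reqs_token_counter now only handles the cpu_counter mode, while the new update_reqs_out_token_counter_gpu consumes CUDA tensors directly in the gpu_counter / pin_mem_counter modes. A hypothetical caller (apply_token_counting is not a function from this commit) might dispatch between them roughly like this:

import torch


def apply_token_counting(req_manager, req_objs, next_token_ids_gpu: torch.Tensor):
    # GPU path: reuse the sampled token ids that already live on the device.
    if req_manager.penalty_counter_mode in ["gpu_counter", "pin_mem_counter"]:
        b_req_idx = torch.tensor(
            [r.req_idx for r in req_objs], dtype=torch.int32, pin_memory=True
        ).cuda(non_blocking=True)
        req_manager.update_reqs_out_token_counter_gpu(b_req_idx, next_token_ids_gpu)
    else:
        # CPU path: fall back to the per-request Python counters.
        req_manager.update_reqs_token_counter(req_objs, next_token_ids_gpu.tolist())
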
lightllm/server/api_cli.py

Lines changed: 13 additions & 1 deletion
@@ -112,7 +112,7 @@ def make_argument_parser() -> argparse.ArgumentParser:
         help="tool call parser type",
     )
     parser.add_argument(
-        "--running_max_req_size", type=int, default=1000, help="the max size for forward requests in the same time"
+        "--running_max_req_size", type=int, default=2048, help="the max size for forward requests in the same time"
     )
     parser.add_argument("--nnodes", type=int, default=1, help="the number of nodes")
     parser.add_argument("--node_rank", type=int, default=0, help="the rank of the current node")
@@ -137,6 +137,12 @@ def make_argument_parser() -> argparse.ArgumentParser:
         using the deepseekv2 model, set dp to be equal to the tp parameter. In other cases, please
         do not set it and keep the default value as 1.""",
     )
+    parser.add_argument(
+        "--dp_balancer",
+        type=str,
+        default="round_robin",
+        help="the dp balancer type, default is round_robin",
+    )
     parser.add_argument(
         "--max_req_total_len", type=int, default=16384, help="the max value for req_input_len + req_output_len"
     )
@@ -476,4 +482,10 @@ def make_argument_parser() -> argparse.ArgumentParser:
         default=None,
         help="""Path of the kv quant calibration config. It can be used for llama and qwen model.""",
     )
+    parser.add_argument(
+        "--schedule_time_interval",
+        type=float,
+        default=0.03,
+        help="""The interval of the schedule time, default is 30ms.""",
+    )
     return parser

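A quick sanity check of the new flags, as an illustrative sketch rather than part of the commit; it assumes make_argument_parser can be invoked without any required arguments, so only the new options are supplied.

from lightllm.server.api_cli import make_argument_parser

args = make_argument_parser().parse_args(
    ["--dp_balancer", "round_robin", "--schedule_time_interval", "0.05"]
)
print(args.dp_balancer)             # "round_robin"
print(args.schedule_time_interval)  # 0.05
print(args.running_max_req_size)    # new default: 2048
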
lightllm/server/core/objs/req.py

Lines changed: 3 additions & 0 deletions
@@ -188,6 +188,9 @@ def link_logprobs_shm_array(self):
     def get_prompt_ids(self):
         return self.shm_prompt_ids.arr[: self.input_len].tolist()

+    def get_prompt_ids_numpy(self):
+        return self.shm_prompt_ids.arr[: self.input_len]
+
     def to_router_rpc_obj(self):
         if hasattr(self, "multimodal_params"):
             return (

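The new accessor exists so callers can hand the shared-memory prompt buffer to torch.from_numpy without first materializing a Python list (see the req_manager.py change above). A rough illustration of the difference, with a plain NumPy array standing in for shm_prompt_ids.arr:

import numpy as np
import torch

# Hypothetical stand-in for the shared-memory prompt buffer.
prompt_arr = np.array([10, 11, 12, 13], dtype=np.int64)

# Old path: get_prompt_ids() returns a Python list, which torch must copy element by element.
t_copy = torch.tensor(prompt_arr.tolist())

# New path: get_prompt_ids_numpy() returns the array slice, so torch.from_numpy shares the
# underlying buffer and only the later pin_memory()/cuda() transfer performs a copy.
t_shared = torch.from_numpy(prompt_arr)
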
lightllm/server/httpserver/manager.py

Lines changed: 46 additions & 42 deletions
@@ -645,50 +645,54 @@ async def handle_loop(self):
             except asyncio.TimeoutError:
                 pass

-            for group_req_id_ in list(self.req_id_to_out_inf.keys()):
-                req_status = self.req_id_to_out_inf.get(group_req_id_, None)
-                if req_status is None:
-                    continue
+            try:
+                for group_req_id_ in list(self.req_id_to_out_inf.keys()):
+                    req_status = self.req_id_to_out_inf.get(group_req_id_, None)
+                    if req_status is None:
+                        continue

-                token_list = []
-                for req in req_status.group_req_objs.shm_req_objs:
-                    req_id = req.request_id
-                    read_token_count = 1
-                    if req.out_tokens_queue.is_full():
-                        read_token_count = LIGHTLLM_OUT_TOKEN_QUEUE_SIZE
-
-                    for _ in range(read_token_count):
-                        if not req.out_tokens_queue.is_empty():
-
-                            text, src_index, special, count_output_tokens = req.out_tokens_queue.peek()
-                            req.cumlogprob += float(req.shm_logprobs.arr[src_index])
-                            metadata = {
-                                "id": int(req.shm_prompt_ids.arr[src_index]),
-                                "logprob": float(req.shm_logprobs.arr[src_index]),
-                                "cumlogprob": float(req.cumlogprob) / count_output_tokens,
-                                "special": special,
-                                "count_output_tokens": count_output_tokens,
-                                "prompt_cache_len": req.prompt_cache_len,
-                                "mtp_accepted_token_num": req.mtp_accepted_token_num,
-                            }
-                            if self.args.return_all_prompt_logprobs:
-                                metadata.update(req.get_all_prompt_metadata())
-                            if self.args.use_reward_model:
-                                metadata["score"] = float(req.reward_score)
-
-                            req.out_tokens_queue.pop_no_ret()
-
-                            if req.finish_token_index != src_index:
-                                token_list.append((req_id, text, metadata, FinishStatus()))
+                    token_list = []
+                    for req in req_status.group_req_objs.shm_req_objs:
+                        req_id = req.request_id
+                        read_token_count = 1
+                        if req.out_tokens_queue.is_full():
+                            read_token_count = LIGHTLLM_OUT_TOKEN_QUEUE_SIZE
+
+                        for _ in range(read_token_count):
+                            if not req.out_tokens_queue.is_empty():
+
+                                text, src_index, special, count_output_tokens = req.out_tokens_queue.peek()
+                                req.cumlogprob += float(req.shm_logprobs.arr[src_index])
+                                metadata = {
+                                    "id": int(req.shm_prompt_ids.arr[src_index]),
+                                    "logprob": float(req.shm_logprobs.arr[src_index]),
+                                    "cumlogprob": float(req.cumlogprob) / count_output_tokens,
+                                    "special": special,
+                                    "count_output_tokens": count_output_tokens,
+                                    "prompt_cache_len": req.prompt_cache_len,
+                                    "mtp_accepted_token_num": req.mtp_accepted_token_num,
+                                }
+                                if self.args.return_all_prompt_logprobs:
+                                    metadata.update(req.get_all_prompt_metadata())
+                                if self.args.use_reward_model:
+                                    metadata["score"] = float(req.reward_score)
+
+                                req.out_tokens_queue.pop_no_ret()
+
+                                if req.finish_token_index != src_index:
+                                    token_list.append((req_id, text, metadata, FinishStatus()))
+                                else:
+                                    finish_status = FinishStatus(req.finish_status.status)
+                                    token_list.append((req_id, text, metadata, finish_status))
                             else:
-                                finish_status = FinishStatus(req.finish_status.status)
-                                token_list.append((req_id, text, metadata, finish_status))
-                        else:
-                            break
-
-                async with req_status.lock:
-                    req_status.out_token_info_list.extend(token_list)
-                    req_status.event.set()
+                                break
+
+                    async with req_status.lock:
+                        req_status.out_token_info_list.extend(token_list)
+                        req_status.event.set()
+            except BaseException as e:
+                logger.exception(str(e))
+                raise e

             self.recycle_event.set()
             return

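The only behavioral change here is that the token-draining loop is now wrapped in a try/except that logs the full traceback and re-raises, so a failure in one iteration no longer kills the coroutine silently. A minimal standalone sketch of that pattern (illustrative names only, not code from LightLLM):

import asyncio
import logging

logger = logging.getLogger("drain_loop_sketch")


async def drain_loop(queues):
    while queues:
        await asyncio.sleep(0.02)  # stand-in for waiting on an event with a timeout
        try:
            for name in list(queues.keys()):
                while queues[name]:
                    print(name, queues[name].pop(0))
                del queues[name]
        except BaseException as e:
            # Log with traceback, then propagate so the failure is not swallowed.
            logger.exception(str(e))
            raise e


asyncio.run(drain_loop({"req_0": ["Hello", ", world"]}))
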
lightllm/server/router/batch.py

Lines changed: 9 additions & 0 deletions
@@ -40,6 +40,15 @@ def get_req_list_for_dp(self, dp_index: int):
                 req_list.append(req)
         return req_list

+    def get_all_dp_req_num(self) -> List[int]:
+        if self.dp_size_in_node == 1:
+            return [len(self.reqs)]
+
+        all_dp_req_num = [0 for _ in range(self.dp_size_in_node)]
+        for req in self.reqs:
+            all_dp_req_num[req.sample_params.suggested_dp_index] += 1
+        return all_dp_req_num
+
     def filter_out_finished_req(self, shm_req_manager: ShmReqManager):
         unfinished_req_ids = []
         for req in self.reqs:

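Finally, get_all_dp_req_num gives the router a per-rank view of how many requests each data-parallel rank is currently holding. The commit's actual dp_balancer implementation is not part of this diff view; purely as a hypothetical illustration, such counts could feed a least-loaded choice with round-robin tie-breaking (pick_dp_index is an invented name):

from typing import List


def pick_dp_index(all_dp_req_num: List[int], rr_cursor: int) -> int:
    # Prefer the least-loaded dp rank; rotate among ties so rank 0 is not always chosen.
    min_load = min(all_dp_req_num)
    candidates = [i for i, n in enumerate(all_dp_req_num) if n == min_load]
    return candidates[rr_cursor % len(candidates)]


print(pick_dp_index([3, 1, 1, 4], rr_cursor=0))  # -> 1
print(pick_dp_index([3, 1, 1, 4], rr_cursor=1))  # -> 2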