
Commit 78c1ffd

Author: niushengxiao
Commit message: use list
1 parent: 8078fd0

File tree: 3 files changed (+27, -42 lines)


lightllm/common/basemodel/basemodel.py

Lines changed: 11 additions & 13 deletions
@@ -78,8 +78,11 @@ def __init__(self, kvargs):
         self._init_mem_manager()
         self._init_weights()
-        self.stream1 = torch.cuda.Stream()
-        self.stream2 = torch.cuda.Stream()
+        self.stream_num = 2
+        self.graph = [None] * self.stream_num
+        self.stream = [None] * self.stream_num
+        for i in range(self.stream_num):
+            self.stream[i] = torch.cuda.Stream()
         self._init_kv_move_buffer()
         self._check_mem_size()
         self._init_req_manager()
@@ -205,16 +208,11 @@ def _init_datatype(self):
             raise ValueError(f"Unsupport datatype {self.data_type}!")
 
     def _init_cudagraph(self):
-        self.graph = (
-            None if self.disable_cudagraph else CudaGraph(self.stream1, self.graph_max_batch_size, self.graph_max_len_in_batch)
-        )
-        self.graph2 = (
-            None if self.disable_cudagraph else CudaGraph(self.stream2, self.graph_max_batch_size, self.graph_max_len_in_batch)
-        )
-        if self.graph is not None:
-            self.graph.warmup(self, 0)
-        if self.graph2 is not None:
-            self.graph2.warmup(self, 1)
+        for i in range(self.stream_num):
+            self.graph[i] = (
+                None if self.disable_cudagraph else CudaGraph(self.stream[i], self.graph_max_batch_size, self.graph_max_len_in_batch)
+            )
+            self.graph[i].warmup(self, i)
 
     def _init_custom(self):
         pass
@@ -363,7 +361,7 @@ def _decode(
         copy_kv_index_to_req(self.req_manager.req_to_token_indexs, b_req_idx, b_seq_len, infer_state.mem_index)
 
         infer_state.init_some_extra_state(self, input_ids)
-        graph = self.graph if all_reduce_id == 0 else self.graph2
+        graph = self.graph[all_reduce_id]
         if graph is not None and graph.can_run(batch_size, max_len_in_batch):
             if graph.need_capture(batch_size):
                 infer_state.is_cuda_graph = True
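
For orientation, here is a minimal, self-contained sketch of the list-based bookkeeping this file moves to: one CUDA stream per slot, one optional CUDA-graph wrapper per slot, and lookup by index at decode time. The StreamedModel class, graph_cls, and pick_graph below are illustrative stand-ins, not lightllm's exact API.

import torch

class StreamedModel:
    """Illustrative sketch: parallel lists of streams and graph wrappers."""

    def __init__(self, stream_num=2, disable_cudagraph=False):
        self.stream_num = stream_num
        self.disable_cudagraph = disable_cudagraph
        # One CUDA stream per decode slice.
        self.stream = [torch.cuda.Stream() for _ in range(self.stream_num)]
        # One graph slot per stream; entries stay None when graphs are disabled.
        self.graph = [None] * self.stream_num

    def init_cudagraph(self, graph_cls, max_batch_size, max_len_in_batch):
        if self.disable_cudagraph:
            return
        for i in range(self.stream_num):
            # graph_cls stands in for lightllm's CudaGraph wrapper.
            self.graph[i] = graph_cls(self.stream[i], max_batch_size, max_len_in_batch)
            self.graph[i].warmup(self, i)

    def pick_graph(self, all_reduce_id):
        # Decode selects the graph for its slice by index instead of
        # branching between two named attributes.
        return self.graph[all_reduce_id]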

lightllm/distributed/communication_op.py

Lines changed: 14 additions & 27 deletions
@@ -51,34 +51,22 @@
 
 class CustomCommunicationOp:
     def __init__(self):
-        self.vllm_reduce1 = None
-        self.vllm_reduce2 = None
-        self.custom_gather = None
-        self.custom_gather2 = None
+        self.reduce_num = 2
+        self.vllm_reduce = [None] * self.reduce_num
+        self.custom_gather = [None] * self.reduce_num
         self.device_group = None
 
     @contextmanager
     def lightllm_capture_graph(self, all_reduce_id):
-        if all_reduce_id == 0:
-            if self.vllm_reduce1 is not None:
-                with self.vllm_reduce1.capture():
-                    if self.custom_gather is not None:
-                        with self.custom_gather.capture():
-                            yield
-                    else:
+        if self.vllm_reduce[all_reduce_id] is not None:
+            with self.vllm_reduce[all_reduce_id].capture():
+                if self.custom_gather[all_reduce_id] is not None:
+                    with self.custom_gather[all_reduce_id].capture():
                         yield
-            else:
-                yield
+                else:
+                    yield
         else:
-            if self.vllm_reduce2 is not None:
-                with self.vllm_reduce2.capture():
-                    if self.custom_gather2 is not None:
-                        with self.custom_gather2.capture():
-                            yield
-                    else:
-                        yield
-            else:
-                yield
+            yield
 
     def set_custom_reduce(self):
         ENABLE_VLLM_REDUCE = os.getenv("ENABLE_VLLM_REDUCE", "True").upper() in ["ON", "TRUE", "1"]
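
The refactored lightllm_capture_graph still needs three yield sites to cover the cases where a reducer or gather object is missing. An equivalent, purely illustrative way to express the same selection-by-index logic is contextlib.ExitStack, which enters whichever capture contexts exist for the given id and yields exactly once; as in the code above, this assumes capture() returns a context manager.

from contextlib import ExitStack, contextmanager

@contextmanager
def lightllm_capture_graph(self, all_reduce_id):
    # Sketch only: enter the vllm_reduce capture for this id if it exists,
    # then the custom_gather capture nested inside it, then yield once.
    with ExitStack() as stack:
        if self.vllm_reduce[all_reduce_id] is not None:
            stack.enter_context(self.vllm_reduce[all_reduce_id].capture())
            if self.custom_gather[all_reduce_id] is not None:
                stack.enter_context(self.custom_gather[all_reduce_id].capture())
        yield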
@@ -97,17 +85,16 @@ def set_custom_reduce(self):
             self.device_group = dist.new_group(ranks, backend="nccl")
 
         if ENABLE_VLLM_REDUCE and HAS_VLLM:
-            cpu_group1 = dist.new_group(ranks, backend="gloo")
-            self.vllm_reduce1 = CustomAllreduce(cpu_group1, torch.cuda.current_device())
-            cpu_group2 = dist.new_group(ranks, backend="gloo")
-            self.vllm_reduce2 = CustomAllreduce(cpu_group2, torch.cuda.current_device())
+            cpu_group = [dist.new_group(ranks, backend="gloo")] * self.reduce_num
+            for i in range(self.reduce_num):
+                self.vllm_reduce[i] = CustomAllreduce(cpu_group[i], torch.cuda.current_device())
             logger.info("Enable VLLM ALLReduce.")
 
         def _all_reduce_closure(input_, op=ReduceOp.SUM, group=self.device_group, async_op=False, all_reduce_id=0):
             if op != ReduceOp.SUM or async_op:
                 original_all_reduce(input_, op, group, async_op)
             else:
-                vllm_reduce = self.vllm_reduce1 if all_reduce_id == 0 else self.vllm_reduce2
+                vllm_reduce = self.vllm_reduce[all_reduce_id]
                 if vllm_reduce is not None and vllm_reduce.should_custom_ar(input_):
                     input_.data = vllm_reduce.custom_all_reduce(input_)
                 else:
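
One Python detail worth keeping in mind when reading the new group setup: `[dist.new_group(ranks, backend="gloo")] * self.reduce_num` evaluates dist.new_group once and repeats the same group object, whereas the cpu_group1/cpu_group2 code it replaces created two distinct gloo groups. If each reducer slot is meant to own its own group, a comprehension expresses that; the helper below is a hedged sketch, with reducer_cls standing in for vLLM's CustomAllreduce.

import torch
import torch.distributed as dist

def build_reducers(ranks, reduce_num, reducer_cls):
    # A list comprehension calls dist.new_group once per slot, so every
    # reducer gets its own gloo group; list multiplication would alias a
    # single group object across all slots.
    cpu_groups = [dist.new_group(ranks, backend="gloo") for _ in range(reduce_num)]
    return [reducer_cls(cpu_groups[i], torch.cuda.current_device()) for i in range(reduce_num)]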

lightllm/server/router/model_infer/mode_backend/continues_batch/impl.py

Lines changed: 2 additions & 2 deletions
@@ -108,9 +108,9 @@ def decode(self):
         # logits = self.model.forward(**kwargs)
         if kwargs["batch_size"] > 1:
             kwargs1, kwargs2 = split_kwargs(**kwargs)
-            with torch.cuda.stream(self.model.stream1):
+            with torch.cuda.stream(self.model.stream[0]):
                 logits1 = self.model.forward(**kwargs1)
-            with torch.cuda.stream(self.model.stream2):
+            with torch.cuda.stream(self.model.stream[1]):
                 logits2 = self.model.forward(**kwargs2)
             torch.cuda.synchronize()
             logits = torch.cat([logits1, logits2], dim=0)
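
A compact sketch of the split-decode path for reference, assuming split_kwargs returns two self-contained argument dicts (and, by extension, the all_reduce_id each half should use). The explicit torch.cuda.synchronize() matters because logits1 and logits2 are produced on side streams, while the following torch.cat runs on the default stream.

import torch

def dual_stream_decode(model, kwargs1, kwargs2):
    # Launch each half of the batch on its own stream so their kernels
    # (and replayed CUDA graphs) can overlap.
    with torch.cuda.stream(model.stream[0]):
        logits1 = model.forward(**kwargs1)
    with torch.cuda.stream(model.stream[1]):
        logits2 = model.forward(**kwargs2)
    # Join both streams before using the results on the default stream.
    torch.cuda.synchronize()
    return torch.cat([logits1, logits2], dim=0)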
