Commit 1a52796

fix reduce stuck on h100 with graph (#654)
Co-authored-by: baishihao <[email protected]>
1 parent b03b60e commit 1a52796

File tree

3 files changed (+13, -11 lines)

lightllm/distributed/communication_op.py
lightllm/distributed/custom_all_reduce.py
lightllm/server/router/model_infer/mode_backend/base_backend.py

lightllm/distributed/communication_op.py

Lines changed: 6 additions & 6 deletions
@@ -37,7 +37,6 @@
 vllm_reduce = None
 logger = init_logger(__name__)
 
-
 @contextmanager
 def lightllm_capture_graph():
     if vllm_reduce is not None:
@@ -47,22 +46,22 @@ def lightllm_capture_graph():
         yield
     pass
 
-
 def _all_reduce(input_, op=ReduceOp.SUM, group=None, async_op=False):
-    if op != ReduceOp.SUM or group is not None or async_op or vllm_reduce is None:
+    if op != ReduceOp.SUM or group is not None or async_op:
         original_all_reduce(input_, op, group, async_op)
     else:
         if vllm_reduce is not None:
             can_use = vllm_reduce.should_custom_ar(input_)
             if can_use:
                 input_.data = vllm_reduce.custom_all_reduce(input_)
                 return
-        original_all_reduce(input_, op, group, async_op)
+            original_all_reduce(input_, op, vllm_reduce.device_group, async_op)
+        else:
+            original_all_reduce(input_, op, group, async_op)
 
 
 def set_custom_reduce():
     global vllm_reduce
-
     ENABLE_VLLM_REDUCE = os.getenv("ENABLE_VLLM_REDUCE", "False").upper() in [
         "ON",
         "TRUE",
@@ -71,7 +70,8 @@ def set_custom_reduce():
     if ENABLE_VLLM_REDUCE and HAS_VLLM:
         world_size = dist.get_world_size()
         ranks = list(range(world_size))
+        device_group = torch.distributed.new_group(ranks, backend="nccl")
         cpu_group = torch.distributed.new_group(ranks, backend="gloo")
-        vllm_reduce = CustomAllreduce(cpu_group, torch.cuda.current_device())
+        vllm_reduce = CustomAllreduce(cpu_group, device_group, torch.cuda.current_device())
         logger.info("Enable VLLM ALLReduce.")
         dist.all_reduce = _all_reduce
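
Note on this file: the vllm_reduce check moves into the else branch, and the NCCL fallback inside the monkey-patched all-reduce now runs on the dedicated device_group created in set_custom_reduce instead of the default group. The sketch below restates that dispatch pattern outside the diff; the reducer object and its attributes (should_custom_ar, custom_all_reduce, device_group) mirror the names in the diff, while everything else is illustrative.

# Sketch of the dispatch pattern introduced above (illustrative, not the exact file).
# `reducer` stands in for the module-level vllm_reduce object: it exposes
# should_custom_ar(), custom_all_reduce(), and a dedicated NCCL device_group.
import torch.distributed as dist
from torch.distributed import ReduceOp

original_all_reduce = dist.all_reduce  # keep a handle to the unpatched op


def patched_all_reduce(input_, op=ReduceOp.SUM, group=None, async_op=False, reducer=None):
    # Anything the custom kernel does not cover goes straight to the original op.
    if op != ReduceOp.SUM or group is not None or async_op or reducer is None:
        return original_all_reduce(input_, op, group, async_op)
    if reducer.should_custom_ar(input_):
        # The custom kernel is out-of-place; write the result back in place.
        input_.data = reducer.custom_all_reduce(input_)
        return None
    # Fall back to NCCL, but on the reducer's dedicated device group rather
    # than the default group; this is the substance of the change above.
    return original_all_reduce(input_, op, reducer.device_group, async_op)

As in the diff, set_custom_reduce installs such a function by assigning it to dist.all_reduce after creating both the gloo cpu_group and the NCCL device_group.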

lightllm/distributed/custom_all_reduce.py

Lines changed: 6 additions & 5 deletions
@@ -30,7 +30,7 @@
 from lightllm.utils.log_utils import init_logger
 from vllm.platforms import current_platform
 from vllm.utils import cuda_device_count_stateless
-
+from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
 ops.meta_size()
 custom_ar = True
 
@@ -49,7 +49,7 @@ class CustomAllreduce:
     _SUPPORTED_WORLD_SIZES = [2, 4, 6, 8]
 
     # max_size: max supported allreduce size
-    def __init__(self, group: ProcessGroup, device: Union[int, str, torch.device], max_size=8192 * 1024) -> None:
+    def __init__(self, group: ProcessGroup, device_group: ProcessGroup, device: Union[int, str, torch.device], max_size=8192 * 1024) -> None:
         """
         Args:
             group: the process group to work on. If None, it will use the
@@ -69,7 +69,7 @@ def __init__(self, group: ProcessGroup, device: Union[int, str, torch.device], m
             return
 
         self.group = group
-
+        self.device_group = device_group
         assert dist.get_backend(group) != dist.Backend.NCCL, "CustomAllreduce should be attached to a non-NCCL group."
 
         rank = dist.get_rank(group=self.group)
@@ -226,7 +226,7 @@ def all_reduce(self, inp: torch.Tensor, *, out: torch.Tensor = None, registered:
         buffer.
         """
         if out is None:
-            out = torch.empty_like(inp)
+            out = g_cache_manager.alloc_tensor(inp.shape, inp.dtype, device=inp.device, is_graph_out=False)
         if registered:
             ops.all_reduce(self._ptr, inp, out, 0, 0)
         else:
@@ -244,7 +244,8 @@ def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]:
             else:
                 # If warm up, mimic the allocation pattern since custom
                 # allreduce is out-of-place.
-                return torch.empty_like(input)
+                out = g_cache_manager.alloc_tensor(input.shape, input.dtype, device=input.device, is_graph_out=False)
+                return out
         else:
             # Note: outside of cuda graph context, custom allreduce incurs a
             # cost of cudaMemcpy, which should be small (<=1% of overall
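
Note on this file: CustomAllreduce now takes the extra device_group in its constructor and, for the graph path, allocates output tensors through g_cache_manager.alloc_tensor(...) instead of torch.empty_like(...), so memory handed out during CUDA-graph capture (and during the warm-up pass that mimics capture) comes from lightllm's cache tensor manager. The toy pool below only illustrates why a managed allocator matters under capture; it is an assumption for exposition, not lightllm's g_cache_manager implementation.

# Toy pooled allocator, standing in for g_cache_manager (illustrative only).
# Under CUDA-graph capture the replayed kernels must see stable addresses, so
# outputs are served from tensors the pool already owns instead of fresh
# allocations on every call.
import torch


class TensorPool:
    def __init__(self):
        self._cache = {}

    def alloc_tensor(self, shape, dtype, device="cuda", is_graph_out=False):
        key = (tuple(shape), dtype, str(device), is_graph_out)
        if key not in self._cache:
            self._cache[key] = torch.empty(shape, dtype=dtype, device=device)
        # The same storage is returned for repeated identical requests.
        return self._cache[key]


pool = TensorPool()


def allreduce_output_like(inp: torch.Tensor) -> torch.Tensor:
    # Mirrors the call pattern used in the diff: shape, dtype and device come
    # from the input tensor; is_graph_out=False marks it as an internal buffer.
    return pool.alloc_tensor(inp.shape, inp.dtype, device=inp.device, is_graph_out=False)

Reusing storage this way is safe here because the caller in communication_op.py consumes the result immediately by assigning it to input_.data.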

lightllm/server/router/model_infer/mode_backend/base_backend.py

Lines changed: 1 addition & 0 deletions
@@ -84,6 +84,7 @@ def init_model(self, kvargs):
             init_method=f'tcp://127.0.0.1:{kvargs["nccl_port"]}',
             rank=self.tp_rank,
             world_size=self.world_size,
+            device_id=torch.device(f"cuda:{self.tp_rank}"),
         )
 
         from lightllm.distributed import set_custom_reduce
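
Note on this file: passing device_id to torch.distributed.init_process_group binds each rank's NCCL communicator to a specific CUDA device at initialization time (the keyword exists in recent PyTorch releases). A minimal sketch of the call shape; the port, rank, and world size are placeholders, and only the device_id keyword is taken from the commit.

# Minimal sketch: bind each rank to its GPU when the process group is created.
# Values other than device_id are placeholders for illustration.
import torch
import torch.distributed as dist


def init_distributed(rank: int, world_size: int, port: int = 29500) -> None:
    torch.cuda.set_device(rank)
    dist.init_process_group(
        backend="nccl",
        init_method=f"tcp://127.0.0.1:{port}",
        rank=rank,
        world_size=world_size,
        device_id=torch.device(f"cuda:{rank}"),  # the keyword added in this commit
    )

With the device pinned at init, the extra new_group(ranks, backend="nccl") created in set_custom_reduce has an unambiguous device mapping, which is presumably what keeps the reduce from getting stuck during graph capture on H100.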
