Commit 1f3afb8

[None][feat] Implement send_object for TorchDist. (#10213)
Signed-off-by: Yuxian Qiu <[email protected]>
1 parent ec8a388 commit 1f3afb8

2 files changed, +3 −17 lines

tensorrt_llm/_torch/distributed/communicator.py

Lines changed: 1 addition & 12 deletions
@@ -622,8 +622,7 @@ def recv_object(self, src, tag=0):
 
     @log_op
     def send_object(self, obj, dest, tag=0):
-        raise NotImplementedError(
-            "send_object is not implemented for TorchDist")
+        self.isend_object(obj, dest, tag).wait()
 
     @log_op
     def isend_object(self, obj, dest, tag=0):
@@ -640,16 +639,6 @@ def isend_object(self, obj, dest, tag=0):
         works.append(torch.distributed.isend(input_tensor, dst=dest, tag=tag))
         return MultiHandleWrapper(works)
 
-    @log_op
-    def recv_object_from_isend(self, src, tag):
-        size_tensor = torch.tensor([0], dtype=torch.int32)
-        torch.distributed.recv(size_tensor, src=src, tag=tag)
-        bytes_size = size_tensor.item()
-        recv_tensor = torch.empty(bytes_size, dtype=torch.uint8)
-        torch.distributed.recv(recv_tensor, src=src, tag=tag)
-        return _tensor_to_object(recv_tensor, bytes_size,
-                                 torch.distributed.group.WORLD)
-
     @log_op
     def allreduce(self,
                   obj: int | float | torch.Tensor,
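
Context for the change above: send_object on TorchDist now delegates to isend_object and blocks on the returned handle, so the blocking and non-blocking sends share one wire format, an int32 size tensor followed by a uint8 payload tensor (visible in the removed recv_object_from_isend). Below is a minimal sketch of that size-then-payload protocol in plain torch.distributed; the helper names send_object_sketch and recv_object_sketch are hypothetical, and it assumes an already-initialized process group (e.g., the gloo backend) rather than TensorRT-LLM's actual classes.

    import pickle

    import torch
    import torch.distributed as dist


    def send_object_sketch(obj, dest, tag=0):
        # Pickle the object, then send its byte size followed by the payload,
        # mirroring the int32-size / uint8-payload pair used by isend_object.
        payload = torch.frombuffer(bytearray(pickle.dumps(obj)), dtype=torch.uint8)
        size = torch.tensor([payload.numel()], dtype=torch.int32)
        dist.send(size, dst=dest, tag=tag)
        dist.send(payload, dst=dest, tag=tag)


    def recv_object_sketch(src, tag=0):
        # Receive the size first, allocate a matching buffer, then unpickle.
        size = torch.tensor([0], dtype=torch.int32)
        dist.recv(size, src=src, tag=tag)
        buf = torch.empty(int(size.item()), dtype=torch.uint8)
        dist.recv(buf, src=src, tag=tag)
        return pickle.loads(buf.numpy().tobytes())

Matching tags on both ends keep concurrent transfers from pairing with the wrong message, which is why the executor threads a tag through every call.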

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 2 additions & 5 deletions
@@ -22,7 +22,7 @@
 from tensorrt_llm._torch.pyexecutor.resource_manager import (
     ResourceManagerType, request_context)
 from tensorrt_llm._utils import (customized_gc_thresholds, is_trace_enabled,
-                                 mpi_disabled, nvtx_range, trace_func)
+                                 nvtx_range, trace_func)
 from tensorrt_llm.bindings.executor import (DisServingRequestStats,
                                             FinishReason, InflightBatchingStats,
                                             IterationStats, KvCacheStats,
@@ -229,7 +229,6 @@ def __init__(self,
         self.num_scheduled_requests: int = 0
         self.benchmark_req_queues_size = int(
             os.environ.get("TLLM_BENCHMARK_REQ_QUEUES_SIZE", 0))
-        self._disable_mpi = mpi_disabled()
 
         # list of requests in each PP micro batch
         self.num_micro_batches = self.dist.pp_size
@@ -1094,11 +1093,9 @@ def _executor_loop_pp(self):
                 if previous_batch is not None:
                     sample_state = previous_batch.sample_state
                     if not self.dist.is_last_pp_rank:
-                        recv_object_funct = self.dist.recv_object_from_isend if self._disable_mpi \
-                            else self.dist.recv_object
                         # Receive tokens from previous pp rank (w.r.t model forward direction)
                         with nvtx_range("recv_sample_state"):
-                            sample_state.host = recv_object_funct(
+                            sample_state.host = self.dist.recv_object(
                                 src=self.dist.prev_pp_rank,
                                 tag=tag,
                             )
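
With send_object implemented, the executor no longer needs to branch on mpi_disabled(): recv_object becomes the single receive path on both the MPI and the torch-only backends, and the recv_object_from_isend fallback removed above is redundant. A minimal sketch of the resulting symmetric pattern, with hypothetical names rank, comm, sender_rank, receiver_rank, and payload_obj (the real call site is in _executor_loop_pp):

    # Both backends now pair the same way: the sender blocks in send_object
    # (isend_object(...).wait() under the hood), the receiver in recv_object.
    if rank == sender_rank:
        comm.send_object(payload_obj, dest=receiver_rank, tag=tag)
    else:
        payload_obj = comm.recv_object(src=sender_rank, tag=tag)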
