Skip to content

Commit 1d97a59

Browse files
committed
add rpc test list
Signed-off-by: chunweiy <chunweiy@nvidia.com>
Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
1 parent 481e784 commit 1d97a59

File tree

13 files changed

+175
-142
lines changed

13 files changed

+175
-142
lines changed

tensorrt_llm/executor/ipc.py

Lines changed: 0 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -194,38 +194,6 @@ async def put_async_noblock(self, obj: Any):
194194
logger.error(traceback.format_exc())
195195
raise e
196196

197-
async def put_async_with_timeout(self, obj: Any, timeout: float = 5.0):
198-
"""
199-
Send an object with timeout to detect connection failures.
200-
201-
Args:
202-
obj: The object to send
203-
timeout: Timeout in seconds for the send operation
204-
205-
Raises:
206-
zmq.Again: If send operation times out (peer may be disconnected)
207-
Exception: Other send errors
208-
"""
209-
self.setup_lazily()
210-
try:
211-
if self.use_hmac_encryption:
212-
data = pickle.dumps(obj) # nosec B301
213-
signed_data = self._sign_data(data)
214-
# Use asyncio.wait_for to implement timeout instead of zmq.NOBLOCK
215-
await asyncio.wait_for(self.socket.send(signed_data),
216-
timeout=timeout)
217-
else:
218-
await asyncio.wait_for(self.socket.send_pyobj(obj),
219-
timeout=timeout)
220-
except asyncio.TimeoutError:
221-
# Convert timeout to zmq.Again to maintain compatibility with existing error handling
222-
raise zmq.Again(
223-
"Send operation timed out - peer may be disconnected")
224-
except Exception as e:
225-
logger.error(f"Error sending object: {e}")
226-
logger.error(traceback.format_exc())
227-
raise e
228-
229197
def get(self) -> Any:
230198
self.setup_lazily()
231199
return self._recv_data()
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# A Lightweight RPC
2+
This is a pure-Python, lightweight RPC layer we built to simplify the existing IPC code in the orchestrator component. It provides multiple call modes (sync, async, future, streaming) and supports both IPC and TCP connections.
3+
4+
## Examples
5+
### Create Server and Client
6+
7+
```python
8+
from tensorrt_llm.executor.rpc import RPCServer, RPCClient
9+
10+
# Define your application
11+
class App:
12+
def add(self, a: int, b: int) -> int:
13+
return a + b
14+
15+
async def async_multiply(self, x: int, y: int) -> int:
16+
return x * y
17+
18+
# Create and start server
19+
app = App()
20+
with RPCServer(app) as server:
21+
server.bind("ipc:///tmp/my_rpc") # or "tcp://127.0.0.1:5555"
22+
server.start()
23+
24+
# Create client and make calls
25+
with RPCClient("ipc:///tmp/my_rpc") as client:
26+
result = client.add(5, 3).remote()
27+
print(result) # Output: 8
28+
```
29+
30+
### Different Remote Calls
31+
32+
#### Synchronous Call
33+
```python
34+
# Blocking call that waits for result
35+
result = client.add(10, 20).remote()
36+
# or with timeout
37+
result = client.add(10, 20).remote(timeout=5.0)
38+
```
39+
40+
#### Asynchronous Call
41+
```python
42+
# Async call that returns a coroutine
43+
result = await client.async_multiply(3, 4).remote_async()
44+
```
45+
46+
#### Future-based Call
47+
```python
48+
# Returns a concurrent.futures.Future
49+
future = client.add(1, 2).remote_future()
50+
# Get result later
51+
result = future.result()
52+
```
53+
54+
#### Fire-and-Forget Call
55+
```python
56+
# Send request without waiting for response
57+
client.submit_task(task_id=123).remote(need_response=False)
58+
```
59+
60+
#### Streaming Call
61+
```python
62+
# For async generator methods
63+
async for value in client.stream_data(n=10).remote_streaming():
64+
print(f"Received: {value}")
65+
```
66+
67+
### Error Handling
68+
```python
69+
from tensorrt_llm.executor.rpc import RPCError, RPCTimeout
70+
71+
try:
72+
result = client.risky_operation().remote(timeout=1.0)
73+
except RPCTimeout:
74+
print("Operation timed out")
75+
except RPCError as e:
76+
print(f"RPC Error: {e}")
77+
print(f"Original cause: {e.cause}")
78+
print(f"Traceback: {e.traceback}")
79+
```
80+
81+
### Graceful Shutdown
82+
```python
83+
# Shutdown server from client
84+
client.shutdown_server()
85+
```

tensorrt_llm/executor/rpc/rpc_client.py

Lines changed: 9 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -59,13 +59,13 @@ def remote_future(self,
5959
need_response: bool = True) -> concurrent.futures.Future:
6060
"""Remote call that returns a Future object."""
6161
return self._prepare_and_call(timeout, need_response, "future",
62-
"call_future")
62+
"_call_future")
6363

6464
def remote_streaming(self,
6565
timeout: Optional[float] = None) -> AsyncIterator[Any]:
6666
"""Remote call for streaming results."""
6767
# Streaming always needs a response
68-
return self._prepare_and_call(timeout, True, "async", "call_streaming")
68+
return self._prepare_and_call(timeout, True, "async", "_call_streaming")
6969

7070

7171
class RPCClient:
@@ -365,27 +365,8 @@ def _call_sync(self, method_name, *args, **kwargs):
365365
f"RPC Client _call_sync: Got result for {method_name}: {result}")
366366
return result
367367

368-
def call_async(self, name: str, *args, **kwargs) -> Any:
369-
"""
370-
Call a remote method asynchronously.
371-
372-
Args:
373-
name: Method name to call
374-
*args: Positional arguments
375-
**kwargs: Keyword arguments
376-
377-
Returns:
378-
Coroutine that can be awaited
379-
380-
Example:
381-
result = await client.call_async('remote_method', arg1, arg2, key=value)
382-
"""
383-
if "__rpc_params" not in kwargs:
384-
kwargs["__rpc_params"] = RPCParams(need_response=True)
385-
return self._call_async(name, *args, **kwargs)
386-
387-
def call_future(self, name: str, *args,
388-
**kwargs) -> concurrent.futures.Future:
368+
def _call_future(self, name: str, *args,
369+
**kwargs) -> concurrent.futures.Future:
389370
"""
390371
Call a remote method and return a Future.
391372
@@ -396,12 +377,6 @@ def call_future(self, name: str, *args,
396377
397378
Returns:
398379
A Future object that can be used to retrieve the result
399-
400-
Example:
401-
future = client.call_future('remote_method', arg1, arg2, key=value)
402-
result = future.result() # blocks until complete
403-
# or
404-
future.add_done_callback(lambda f: print(f.result()))
405380
"""
406381

407382
def _async_to_sync():
@@ -412,25 +387,8 @@ def _async_to_sync():
412387

413388
return self._executor.submit(_async_to_sync)
414389

415-
def call_sync(self, name: str, *args, **kwargs) -> Any:
416-
"""
417-
Call a remote method synchronously (blocking).
418-
419-
Args:
420-
name: Method name to call
421-
*args: Positional arguments
422-
**kwargs: Keyword arguments
423-
424-
Returns:
425-
The result of the remote method call
426-
427-
Example:
428-
result = client.call_sync('remote_method', arg1, arg2, key=value)
429-
"""
430-
return self._call_sync(name, *args, **kwargs)
431-
432-
async def call_streaming(self, name: str, *args,
433-
**kwargs) -> AsyncIterator[Any]:
390+
async def _call_streaming(self, name: str, *args,
391+
**kwargs) -> AsyncIterator[Any]:
434392
"""
435393
Call a remote async generator method and get streaming results.
436394
@@ -441,10 +399,6 @@ async def call_streaming(self, name: str, *args,
441399
442400
Yields:
443401
Results from the remote async generator
444-
445-
Example:
446-
async for result in client.call_streaming('streaming_task'):
447-
print(result)
448402
"""
449403
if self._server_stopped:
450404
raise RPCCancelled("Server is shutting down, request cancelled")
@@ -474,7 +428,7 @@ async def call_streaming(self, name: str, *args,
474428

475429
# Read streaming responses
476430
while True:
477-
logger_debug(f"RPC Client call_streaming waiting for response",
431+
logger_debug(f"RPC Client _call_streaming waiting for response",
478432
color="green")
479433
if timeout is None:
480434
response = await queue.get()
@@ -483,14 +437,14 @@ async def call_streaming(self, name: str, *args,
483437
timeout=timeout)
484438

485439
logger_debug(
486-
f"RPC Client call_streaming received [{response.stream_status}] response: {response}",
440+
f"RPC Client _call_streaming received [{response.stream_status}] response: {response}",
487441
color="green")
488442
if response.stream_status == 'start':
489443
# Start of stream
490444
continue
491445
elif response.stream_status == 'data':
492446
logger_debug(
493-
f"RPC Client call_streaming received data: {response.result}",
447+
f"RPC Client _call_streaming received data: {response.result}",
494448
color="green")
495449
yield response.result
496450
elif response.stream_status == 'end':

tensorrt_llm/executor/rpc_proxy.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import json
44
import os
55
import threading
6-
import time
76
from typing import Optional
87

98
from ..llmapi.llm_args import KvCacheConnectorConfig
@@ -71,7 +70,6 @@ def __init__(
7170
self.main_loop = None
7271

7372
self.launch_workers()
74-
time.sleep(1) # wait for the workers to launch
7573

7674
# Invoke model creation on the remote
7775
# TBD: Move model creation to the mpi task, or left in RPC?

tensorrt_llm/executor/rpc_worker.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ def main_task(
230230
color="yellow")
231231
worker.setup_engine()
232232

233-
if mpi_rank() == 0:
233+
else:
234234
logger_debug(f"Worker {mpi_rank()} is creating the RPC service",
235235
color="yellow")
236236
# Step 2: Create the RPC service, it will expose all the APIs of the worker as remote call to the client

tests/integration/test_lists/test-db/l0_a10.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ l0_a10:
4646
- unittest/llmapi/test_serialization.py
4747
- unittest/llmapi/test_utils.py
4848
- unittest/llmapi/test_llm_args.py
49+
# executor
50+
- unittest/executor/test_rpc.py
4951
- condition:
5052
ranges:
5153
system_gpu_count:

tests/integration/test_lists/test-db/l0_a100.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@ l0_a100:
1616
- unittest/llmapi/test_llm_pytorch.py
1717
- unittest/llmapi/test_mpi_session.py # generic tests
1818
- unittest/trt/model_api/test_model_quantization.py
19+
# executor
20+
- unittest/executor/test_base_worker.py
21+
- unittest/executor/test_rpc_proxy.py
22+
- unittest/executor/test_rpc_worker.py
1923
- condition:
2024
ranges:
2125
system_gpu_count:

tests/unittest/executor/test_base_worker.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# isort: off
1313
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/..")
1414
from utils.llm_data import llm_models_root
15+
from utils.util import skip_single_gpu
1516
# isort: on
1617

1718
from tensorrt_llm._torch.pyexecutor.config import update_executor_config
@@ -156,6 +157,8 @@ def create_worker_session(self):
156157
session = MpiPoolSession(n_workers=2)
157158
return session
158159

160+
@pytest.mark.gpu2
161+
@skip_single_gpu
159162
def test_create_executor(self):
160163
futures = self.session.submit(
161164
TestRpcWorkerBaseTP2.create_executor,

tests/unittest/executor/test_rpc.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -238,10 +238,10 @@ def slow_method(self):
238238
with pytest.raises(RPCError) as exc_info:
239239
client.slow_method().remote(timeout=0.5)
240240

241-
error = exc_info.value
242-
# Should be either a timeout error or RPC error indicating timeout
243-
assert "timed out" in str(
244-
error).lower() or "timeout" in str(error).lower()
241+
error = exc_info.value
242+
# Should be either a timeout error or RPC error indicating timeout
243+
assert "timed out" in str(error).lower() or "timeout" in str(
244+
error).lower()
245245

246246
def test_method_not_found_error(self):
247247
"""Test that calling non-existent methods returns proper error."""

tests/unittest/executor/test_rpc_proxy.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
# isort: off
1515
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/..")
1616
from utils.llm_data import llm_models_root
17-
from utils.util import similar
17+
from utils.util import similar, skip_single_gpu
1818
# isort: on
1919

2020
model_path = llm_models_root() / "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
@@ -78,6 +78,8 @@ def test_tp1(self, num_reqs):
7878
assert isinstance(kv_cache_events, list)
7979

8080
@pytest.mark.parametrize("num_reqs", [1, 10])
81+
@skip_single_gpu
82+
@pytest.mark.gpu2
8183
def test_tp2(self, num_reqs):
8284
tokenizer = TransformersTokenizer.from_pretrained(model_path)
8385
prompt = "A B C D"

0 commit comments

Comments
 (0)