Skip to content

Commit 4c12852

Browse files
committed
placement, asyncllm, and basic tests
1 parent 20ad93a commit 4c12852

File tree

15 files changed

+348
-1937
lines changed

15 files changed

+348
-1937
lines changed

examples/llm-api/rl_integration_test.py

Lines changed: 0 additions & 618 deletions
This file was deleted.

examples/llm-api/rl_integration_test_async.py

Lines changed: 0 additions & 647 deletions
This file was deleted.

examples/rl/rl_integration_test.py

Lines changed: 0 additions & 618 deletions
This file was deleted.

tensorrt_llm/_torch/async_llm.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
from typing import Any, Optional
2+
3+
from ..llmapi.llm import LLM
4+
from .virtual_memory import ExecutorMemoryType
5+
6+
7+
class AsyncLLM(LLM):
    """An LLM variant with asynchronous lifecycle control.

    Extends :class:`LLM` with async setup, GPU-memory release/resume, and
    weight-update operations needed for RL / agentic serving scenarios.
    Only the Ray orchestrator is supported.
    """

    def __init__(self, *args, **kwargs):
        # AsyncLLM is only supported with Ray orchestrator now, so force it.
        kwargs["orchestrator_type"] = "ray"
        # Install the default RLHF worker extension unless the caller
        # provided their own.
        kwargs.setdefault('ray_worker_extension_cls',
                          'tensorrt_llm.llmapi.rlhf_utils.WorkerExtension')
        super().__init__(*args, **kwargs)

    async def setup_async(self):
        """Initialize the executor's workers asynchronously."""
        await self._executor.init_workers_async()

    async def release(self, tags: list[str]):
        """Release GPU memory held by the LLM, asynchronously.

        Args:
            tags: List of memory tag strings to release (e.g., ["model", "kv_cache"]).
        """
        await self.collective_rpc("sleep", args=(tags,))

    async def resume(self, tags: list[str]):
        """Re-acquire GPU memory previously released via :meth:`release`.

        Args:
            tags: List of memory tag strings to resume (e.g., ["model", "kv_cache"]).
        """
        await self.collective_rpc("wakeup", args=(tags,))

    async def update_weights(self, weights: dict[str, str]):
        """Push new model weights to all workers asynchronously.

        Args:
            weights: Dictionary mapping device UUIDs to IPC handles for weight tensors.
        """
        await self.collective_rpc("update_weights", args=(weights,))

    async def collective_rpc(
        self,
        method: str,
        args: tuple[Any, ...] = (),
        kwargs: Optional[dict] = None,
        unique_reply_rank: Optional[int] = None,
    ) -> list[Any]:
        """Invoke a method on every GPU worker and await the results.

        Currently only supported when the executor is a RayExecutor.

        Args:
            method (str): The name of the worker method to execute.
            args (tuple[Any, ...]): Positional arguments for the worker method. Defaults to ().
            kwargs (dict, optional): Keyword arguments for the worker method. Defaults to None.
            unique_reply_rank (int, optional): The rank of the worker that will be used to send the reply.

        Returns:
            list[Any]: A list of results from each worker.
        """
        result = await self._executor.collective_rpc_async(
            method, args, kwargs, unique_reply_rank=unique_reply_rank
        )
        return result

tensorrt_llm/_torch/virtual_memory.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,8 @@ class ExecutorMemoryType(StrEnum):
7474
SPEC_RESOURCES = "spec_resource_manager"
7575
INIT_KV_CACHE = "_no_capture_init_kv_cache"
7676
INIT_EXTRA_RESOURCES = "_no_capture_init_extra_resources"
77-
MODEL_EXTRA = "_no_capture_model_extra" # TODO: remove _no_capture after torch fix crash on torch.cuda.empty_cache()
77+
# MODEL_EXTRA = "_no_capture_model_extra" # TODO: remove _no_capture after torch fix crash on torch.cuda.empty_cache()
78+
MODEL_EXTRA = "model_extra"
7879
EXTRA_RESOURCES = "executor_extra"
7980
KV_CACHE = "kv_cache"
8081
MODEL_ENGINE_MAIN = "model"

tensorrt_llm/executor/ray_executor.py

Lines changed: 84 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
import os
21
import asyncio
2+
import os
33
from typing import Any, Dict, List, Optional, Tuple
44

55
try:
@@ -8,8 +8,7 @@
88
e.msg = """Cannot import Ray. Please install 'ray' package to use ray orchestrator"""
99
raise
1010

11-
from ray.util.placement_group import (PlacementGroup,
12-
PlacementGroupSchedulingStrategy,
11+
from ray.util.placement_group import (PlacementGroupSchedulingStrategy,
1312
get_current_placement_group,
1413
placement_group)
1514

@@ -79,15 +78,15 @@ def __init__(self,
7978
self.master_address = ray.util.get_node_ip_address()
8079
self.master_port = get_free_port()
8180

82-
self.worker_kwargs = dict(**worker_kwargs,
83-
postproc_worker_config=postproc_worker_config,
84-
is_llm_executor=is_llm_executor)
85-
if not has_event_loop():
86-
self.init_workers_sync()
81+
self.worker_kwargs = dict(
82+
**worker_kwargs,
83+
postproc_worker_config=postproc_worker_config,
84+
is_llm_executor=is_llm_executor)
8785

8886
self.init_rpc_executor()
8987
worker_kwargs['rpc_addr'] = self.rpc_addr
90-
self.create_workers(RayGPUWorker, worker_kwargs)
88+
if not has_event_loop():
89+
self.init_workers_sync()
9190
self.setup_engine_remote()
9291
self.setup_mainloop(tasks=[self._fetch_responses_loop_async],
9392
thread_name="ray_executor_main_loop")
@@ -99,9 +98,13 @@ def __init__(self,
9998
raise e
10099

101100
def create_workers(self, worker_cls, worker_kwargs):
101+
llm_args = worker_kwargs.get("llm_args")
102+
102103
# When set to be a fraction, it allows Ray to schedule
103104
# multiple actors on a single GPU for colocate use cases.
104-
num_gpus = float(os.getenv("TRTLLM_RAY_PER_WORKER_GPUS", "1.0"))
105+
num_gpus = (llm_args.per_worker_gpu_share if llm_args
106+
and llm_args.per_worker_gpu_share is not None else float(
107+
os.getenv("TRTLLM_RAY_PER_WORKER_GPUS", "1.0")))
105108
logger.debug(f"{num_gpus=} for each worker.")
106109

107110
runtime_env = ray.runtime_env.RuntimeEnv()
@@ -112,42 +115,40 @@ def create_workers(self, worker_cls, worker_kwargs):
112115
"MASTER_PORT": str(self.master_port)
113116
})
114117

115-
self.placement_group, self.bundle_indices = self._get_placement_group(
116-
tp_size=self.tp_size)
118+
placement_groups, self.bundle_indices = self._get_placement_group(
119+
tp_size=self.tp_size, worker_kwargs=worker_kwargs)
117120

118-
self.workers = [
119-
RayWorkerWrapper.options(
121+
if isinstance(placement_groups, list):
122+
self.placement_group = None
123+
else:
124+
self.placement_group = placement_groups
125+
126+
self.workers = []
127+
for rank in range(self.world_size):
128+
pg = placement_groups[rank] if isinstance(
129+
placement_groups, list) else placement_groups
130+
worker = RayWorkerWrapper.options(
120131
num_gpus=num_gpus,
121-
runtime_env=runtime_env, # per-actor env
132+
runtime_env=runtime_env,
122133
scheduling_strategy=PlacementGroupSchedulingStrategy(
123-
placement_group=self.placement_group,
134+
placement_group=pg,
124135
placement_group_bundle_index=self.bundle_indices[rank],
125136
)).remote(worker_cls, worker_kwargs, self.world_size, rank)
126-
for rank in range(self.world_size)
127-
]
137+
self.workers.append(worker)
128138

129139
def init_workers_sync(self):
130140
self.create_workers(RayGPUWorker, self.worker_kwargs)
131141
try:
132-
ray.get([worker.__ray_ready__.remote() for worker in self.workers])
142+
ray.get(self._get_worker_ready_futures())
133143
except ray.exceptions.ActorDiedError as e:
134-
if "The actor died because of an error raised in its creation task" in str(
135-
e):
136-
raise RuntimeError(
137-
"RayGPUWorker died during initialization") from e
138-
raise
144+
raise RuntimeError("RayGPUWorker died during initialization") from e
139145

140146
async def init_workers_async(self):
141147
self.create_workers(RayGPUWorker, self.worker_kwargs)
142148
try:
143-
await asyncio.gather(*[worker.__ray_ready__.remote() for worker in self.workers])
149+
await asyncio.gather(*self._get_worker_ready_futures())
144150
except ray.exceptions.ActorDiedError as e:
145-
if "The actor died because of an error raised in its creation task" in str(
146-
e):
147-
raise RuntimeError(
148-
"RayGPUWorker died during initialization") from e
149-
raise
150-
151+
raise RuntimeError("RayGPUWorker died during initialization") from e
151152

152153
@unwrap_ray_errors()
153154
def call_all_ray_workers(self, func: str, leader_only: bool,
@@ -187,6 +188,20 @@ def collective_rpc(self,
187188
**kwargs))
188189
return refs if non_block else ray.get(refs)
189190

191+
@unwrap_ray_errors()
async def collective_rpc_async(
        self,
        method: str,
        args: tuple = (),
        kwargs: Optional[dict] = None,
        unique_reply_rank: Optional[int] = None) -> list[Any]:
    """Async variant of ``collective_rpc``: fan out the RPC to all workers
    without blocking, then await every per-worker Ray future.

    Args:
        method: Name of the worker method to execute.
        args: Positional arguments forwarded to the worker method.
        kwargs: Keyword arguments forwarded to the worker method.
        unique_reply_rank: If set, only this rank's reply is requested.

    Returns:
        One result per worker, gathered concurrently.
    """
    # non_block=True makes collective_rpc return Ray object refs
    # instead of resolved values, so they can be awaited here.
    pending = self.collective_rpc(method,
                                  args,
                                  kwargs,
                                  non_block=True,
                                  unique_reply_rank=unique_reply_rank)
    return await asyncio.gather(*pending)
204+
190205
def submit(self, request: "GenerationRequest") -> "GenerationResult":
191206
"""
192207
Low-level API to the executor. Return a "future" GenerationResult
@@ -281,15 +296,51 @@ def shutdown(self):
281296
logger.debug("Shutting down Ray cluster")
282297
ray.shutdown()
283298

284-
def _get_placement_group(self,
285-
tp_size: int) -> Tuple[PlacementGroup, List[int]]:
299+
def _get_worker_ready_futures(self):
300+
return [worker.__ray_ready__.remote() for worker in self.workers]
301+
302+
def _get_placement_group(
303+
self,
304+
tp_size: int,
305+
worker_kwargs: Dict = None) -> Tuple[Any, List[int]]:
286306
"""
287307
Either use the existing placement group from driver script (e.g., in the case of RL FW integration),
288308
or create a default PACK placement group where each bundle has tp_size GPUs.
289309
- When tp_size ≤ GPUs per node, keep one TP group per node.
290310
- When tp_size > GPUs per node, allow a TP group span nodes.
291311
- rank 0 must be put on the driver node
312+
313+
Returns:
314+
Tuple of (placement_group(s), bundle_indices)
315+
- placement_group(s) can be a single PlacementGroup or a List[PlacementGroup]
316+
- bundle_indices is always a List[int]
292317
"""
318+
llm_args = worker_kwargs.get("llm_args") if worker_kwargs else None
319+
320+
if llm_args and hasattr(
321+
llm_args,
322+
'placement_groups') and llm_args.placement_groups is not None:
323+
total_workers = sum(
324+
len(indices) for indices in llm_args.placement_bundle_indices)
325+
if total_workers != self.world_size:
326+
raise ValueError(
327+
f"Total bundle indices ({total_workers}) must equal world_size ({self.world_size})"
328+
)
329+
330+
logger.info(
331+
f"Creating {self.world_size} workers with external placement groups"
332+
)
333+
334+
flat_pgs = []
335+
flat_indices = []
336+
for pg, indices in zip(llm_args.placement_groups,
337+
llm_args.placement_bundle_indices):
338+
for idx in indices:
339+
flat_pgs.append(pg)
340+
flat_indices.append(idx)
341+
342+
return flat_pgs, flat_indices
343+
293344
bundle_indices = os.getenv("TRTLLM_RAY_BUNDLE_INDICES", None)
294345

295346
if bundle_indices:

tensorrt_llm/executor/ray_gpu_worker.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import gc
12
import importlib
23
import os
34
from pathlib import Path
@@ -216,6 +217,8 @@ def sleep(self, sleep_tags: List[str]):
216217
torch.cuda.synchronize()
217218
release_with_tag(*tags)
218219
torch.cuda.synchronize()
220+
gc.collect()
221+
torch.cuda.empty_cache()
219222
except Exception as e:
220223
logger.error(f"Encountered an error in sleep: {e}")
221224
raise e

tensorrt_llm/llmapi/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
from ..executor import CompletionOutput, LoRARequest, RequestError
33
from ..sampling_params import GuidedDecodingParams, SamplingParams
44
from .build_cache import BuildCacheConfig
5-
from .llm import LLM, AsyncLLM, RequestOutput
5+
from .llm import LLM, RequestOutput
6+
from .._torch.async_llm import AsyncLLM
67
# yapf: disable
78
from .llm_args import (AttentionDpConfig, AutoDecodingConfig, BatchingType,
89
CacheTransceiverConfig, CalibConfig,

tensorrt_llm/llmapi/llm.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
# TODO[chunweiy]: move the following symbols back to utils scope, and remove the following import
4949
from .utils import (append_docstring, exception_handler, get_device_count,
5050
logger_debug, set_api_status)
51+
from ray.util.placement_group import PlacementGroup, placement_group
5152

5253

5354
class RequestOutput(DetokenizedGenerationResultBase, GenerationResult):
@@ -1149,10 +1150,3 @@ def __init__(self,
11491150
11501151
Parameters:
11511152
""" + TORCH_LLM_DOCSTRING
1152-
1153-
class AsyncLLM(LLM):
1154-
def __init__(self, *args, **kwargs):
1155-
super().__init__(*args, **kwargs)
1156-
1157-
async def async_init_phase(self):
1158-
await self._executor.init_workers_async()

0 commit comments

Comments
 (0)