
Commit 53648bb

shuyixiongdavidmlwjoyang-nv authored and committed
[TRTLLM-8511][feat] Add update_weights and sleep_wakeup support for rl integration (NVIDIA#8302)
Signed-off-by: shuyix <[email protected]>
Co-authored-by: Liwei Ma <[email protected]>
Co-authored-by: Jonas Yang CN <[email protected]>
Signed-off-by: FredricZ-2007 <[email protected]>
1 parent dea246e commit 53648bb

File tree: 23 files changed, +852 -185 lines changed

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 21 additions & 0 deletions

@@ -871,6 +871,13 @@ class WindowBlockManager
         return mIsValidStoreForReuseSequence.at(requestId);
     }
 
+    void resetReuseState()
+    {
+        std::lock_guard<std::mutex> lock(mCachedBlocksRootMutex);
+        mCachedBlocksRoot
+            = std::make_shared<KVCacheBlock>(KVCacheBlock::kCachedBlocksRootId, tensorrt_llm::kernels::KVCacheIndex{0});
+    }
+
 private:
     //! \brief Add single block to beam of sequence and mAllocatedBlocksPerSeq.
     void addBlockToBeam(BlockPtr& block, GenerationRequest& sequence, SizeType32 beamIdx);

@@ -1347,6 +1354,14 @@ class BlockManager
         return mWindowBlockManagers.at(windowSize).isSequenceValidForStoreForReuse(requestId);
     }
 
+    void resetReuseState()
+    {
+        for (auto& [windowSize, manager] : mWindowBlockManagers)
+        {
+            manager.resetReuseState();
+        }
+    }
+
 private:
     [[nodiscard]] WindowBlockManager const& windowManagerByLayer(SizeType32 layerIdx) const
     {

@@ -1533,6 +1548,7 @@ class BaseKVCacheManager
 
     virtual void refreshBlocks() = 0;
     virtual void flushIterationEvents() = 0;
+    virtual void resetReuseState() = 0;
 
     [[nodiscard]] static SizeType32 getSinkBubbleLength(SizeType32 sinkTokenLen, SizeType32 tokensPerBlock);
 

@@ -1913,6 +1929,11 @@ class KVCacheManager : public BaseKVCacheManager
         return mBlockManager.findBlocksInReuseTreeByBlockKey(blockKey, windowSize);
     }
 
+    void resetReuseState() override
+    {
+        mBlockManager.resetReuseState();
+    }
+
     /// @brief Finds the maximum attention window that can be used on a sequence, given some kv-cache block capacity.
     ///
     /// @param inputLength The number of input tokens in the sequence.

cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp

Lines changed: 2 additions & 1 deletion

@@ -482,7 +482,8 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(nb::module_& m)
         .def("flush_iteration_events", &BaseKVCacheManager::flushIterationEvents,
             nb::call_guard<nb::gil_scoped_release>())
         .def("get_last_block_id", &BaseKVCacheManager::getLastBlockId, nb::call_guard<nb::gil_scoped_release>())
-        .def("unpin_blocks_by_id", &BaseKVCacheManager::unpinBlocksById, nb::call_guard<nb::gil_scoped_release>());
+        .def("unpin_blocks_by_id", &BaseKVCacheManager::unpinBlocksById, nb::call_guard<nb::gil_scoped_release>())
+        .def("reset_reuse_state", &BaseKVCacheManager::resetReuseState, nb::call_guard<nb::gil_scoped_release>());
 
     nb::bind_vector<CacheBlockIds>(m, "CacheBlockIds")
         .def("__getstate__", [](CacheBlockIds const& v) { return nb::make_tuple(v); })

cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp

Lines changed: 2 additions & 1 deletion

@@ -486,7 +486,8 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(py::module_& m)
         .def("flush_iteration_events", &BaseKVCacheManager::flushIterationEvents,
             py::call_guard<py::gil_scoped_release>())
         .def("get_last_block_id", &BaseKVCacheManager::getLastBlockId, py::call_guard<py::gil_scoped_release>())
-        .def("unpin_blocks_by_id", &BaseKVCacheManager::unpinBlocksById, py::call_guard<py::gil_scoped_release>());
+        .def("unpin_blocks_by_id", &BaseKVCacheManager::unpinBlocksById, py::call_guard<py::gil_scoped_release>())
+        .def("reset_reuse_state", &BaseKVCacheManager::resetReuseState, py::call_guard<py::gil_scoped_release>());
 
     py::enum_<tbk::CacheType>(m, "CacheType")
         .value("SELF", tbk::CacheType::kSELF)
New file — Lines changed: 77 additions & 0 deletions

@@ -0,0 +1,77 @@
import torch

from tensorrt_llm._ray_utils import control_action_decorator
from tensorrt_llm._torch.utils import get_device_uuid
from tensorrt_llm.logger import logger


class WorkerExtension:
    """Worker extension class for extending TensorRT-LLM Ray workers with custom functionality.

    This class can be injected into tensorrt_llm.LLM() by specifying it via the
    ray_worker_extension_cls parameter in LLMArgs when using orchestrator_type='ray'.
    The extension methods will be available on each Ray worker and can be called via
    the LLM's collective RPC mechanism.

    Examples:
        Creating an LLM with worker extension:

        >>> llm = LLM(
        ...     model=model_dir,
        ...     orchestrator_type="ray",
        ...     ray_worker_extension_cls="rlhf_utils.WorkerExtension",
        ... )

        Calling extension methods via collective RPC:

        >>> llm._collective_rpc("update_weights", args=(ipc_handles,))
    """

    @control_action_decorator
    def update_weights(self, ipc_handles: dict):
        """Update model weights from IPC (Inter-Process Communication) handles.

        This method receives shared memory handles from another process (typically FSDP training),
        reconstructs tensors from these handles, and loads them into the TensorRT-LLM model.
        Uses the control_action_decorator to ensure all active requests are finished before
        updating weights.

        Args:
            ipc_handles: Dictionary mapping device UUIDs to lists of (param_name, tensor_handle) tuples.
                Each tensor_handle is a tuple of (func, args) for reconstructing the tensor.

        Raises:
            ValueError: If the current device's UUID is not found in ipc_handles.
            Exception: Re-raises any exception encountered during weight update.
        """
        try:
            logger.info("Update weights from IPC handles")
            device_uuid = get_device_uuid(self.device_id)

            if device_uuid not in ipc_handles:
                raise ValueError(f"Device UUID {device_uuid} not found in ipc_handles")

            weights = {}
            all_handles = ipc_handles[device_uuid]

            for param_name, tensor_handle in all_handles:
                func, args = tensor_handle
                list_args = list(args)
                list_args[6] = self.device_id  # Set target device
                tensor = func(*list_args)
                weights[param_name] = tensor

            self.engine.model_engine.model.load_weights(weights)
            torch.cuda.synchronize()
            self.engine.reset_prefix_cache()

        except Exception as e:
            logger.error("Encountered an error in update_weights")
            raise e

    def check_weights_updated(self):
        """Check if the weights are updated to 0."""
        weights_updated = True
        for name, p in self.engine.model_engine.model.named_parameters():
            weights_updated = weights_updated and torch.allclose(p, torch.zeros_like(p))
        return weights_updated
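
To make the RPC flow concrete, here is a hedged sketch of the producer side, i.e. how a training process might package its parameters into the ipc_handles dictionary that update_weights expects. Only the dictionary layout (device UUID mapped to a list of (name, (func, args)) tuples) comes from the docstring above; the use of torch.multiprocessing.reductions.reduce_tensor and reusing get_device_uuid on the training side are assumptions for illustration.

    # Hypothetical producer-side sketch (training process). Assumes reduce_tensor is a
    # suitable way to obtain (rebuild_func, args) CUDA IPC handles; not part of this diff.
    import torch
    from torch.multiprocessing.reductions import reduce_tensor

    from tensorrt_llm._torch.utils import get_device_uuid  # same helper used by update_weights


    def build_ipc_handles(model: torch.nn.Module, device_id: int) -> dict:
        """Package model parameters as IPC handles keyed by the GPU's UUID."""
        device_uuid = get_device_uuid(device_id)
        handles = []
        for name, param in model.named_parameters():
            func, args = reduce_tensor(param.detach())  # (rebuild_func, args) tuple
            handles.append((name, (func, args)))
        return {device_uuid: handles}


    # Consumer side: broadcast the handles to every Ray worker via collective RPC,
    # where `llm` is the tensorrt_llm.LLM instance configured with ray_worker_extension_cls.
    # ipc_handles = build_ipc_handles(training_model, device_id=0)
    # llm._collective_rpc("update_weights", args=(ipc_handles,))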

tensorrt_llm/_ray_utils.py

Lines changed: 15 additions & 0 deletions

@@ -12,7 +12,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import functools
 from contextlib import contextmanager
+from typing import Callable
 
 try:
     import ray

@@ -26,3 +28,16 @@ def unwrap_ray_errors():
         yield
     except ray.exceptions.RayTaskError as e:
         raise e.as_instanceof_cause() from e
+
+
+def control_action_decorator(func: Callable) -> Callable:
+    """
+    Decorator that wraps a method to use control_action context manager.
+    """
+
+    @functools.wraps(func)
+    def wrapper(self, *args, **kwargs):
+        with self.engine.control_action():
+            return func(self, *args, **kwargs)
+
+    return wrapper
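
The decorator is intended for methods of a Ray worker extension that must not run concurrently with in-flight requests, as update_weights does above. A minimal hypothetical sketch of another extension method using it; the class, method name and body are illustrative, not part of this commit.

    # Hypothetical sketch: any worker-extension method can reuse the same guard so it
    # only runs inside self.engine.control_action(), i.e. after active requests drain.
    from tensorrt_llm._ray_utils import control_action_decorator


    class MyExtension:

        @control_action_decorator
        def clear_kv_cache(self):
            # Mirrors how update_weights is guarded; reset_prefix_cache is the same
            # engine call used by WorkerExtension above.
            self.engine.reset_prefix_cache()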

tensorrt_llm/_torch/models/modeling_utils.py

Lines changed: 48 additions & 29 deletions

@@ -871,6 +871,8 @@ def load_single_module(name, module):
                 for new_name in params_map[names[-1]]:
                     fw = filter_weights('.'.join(names[:-1] + [new_name]),
                                         weights)
+                    if not fw:
+                        continue
                     if new_name in ['k_proj', 'v_proj']:
                         num_kv_heads_list = [num_kv_heads
                                              ] * len(fw) if isinstance(

@@ -887,23 +889,29 @@ def load_single_module(name, module):
                         }
 
                     module_weights.append(fw)
-                module.load_weights(weights=module_weights)
+                # Note: module_weights may be empty after filtering (e.g., in streaming weight updates)
+                if module_weights:
+                    module.load_weights(weights=module_weights)
+
             else:
                 module_weights = filter_weights(name, weights)
-                if hasattr(module, 'load_weights'):
-                    module.load_weights(weights=[module_weights])
-                else:
-                    for n, p in module._parameters.items():
-                        if p is not None:
+                # Note: module_weights may be empty after filtering (e.g., in streaming weight updates)
+                if module_weights:
+                    if hasattr(module, 'load_weights'):
+                        module.load_weights(weights=[module_weights])
+                    else:
+                        for n, p in module.named_parameters(recurse=False):
                             p.data.copy_(module_weights[n][:])
 
     if os.environ.get("TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL",
                       "True") in ["True", "true", "1", "yes", "y"]:
-        for name, module in tqdm(list(model.named_modules()),
+        for name, module in tqdm(list(
+                model.named_modules(remove_duplicate=False)),
                                  desc="Loading weights"):
            load_single_module(name, module)
     else:
-        all_modules = dict(model.named_modules())
+        # remove_duplicate=False ensures original modules sharing weights with next_layer_layernorm are not skipped
+        all_modules = dict(model.named_modules(remove_duplicate=False))
         serial_load_modules = []
         if preload_weight_modules is not None:
             for module in preload_weight_modules:

@@ -919,10 +927,13 @@ def load_single_module(name, module):
                     del all_modules[module]
             pbar.close()
 
-        pbar = tqdm(list(model.named_modules()),
+        pbar = tqdm(list(model.named_modules(remove_duplicate=False)),
                     desc="Loading weights concurrently")
-        args_list = [(name, module) for name, module in model.named_modules()
-                     if name not in serial_load_modules]
+        args_list = [
+            (name, module)
+            for name, module in model.named_modules(remove_duplicate=False)
+            if name not in serial_load_modules
+        ]
         run_concurrently(load_single_module, args_list, pbar=pbar)

@@ -950,31 +961,36 @@ def load_single_module(name, module):
            if weight_mapper.does_require_special_handling(module_name):
                module_weights = weight_mapper.apply_callbacks(
                    module, module_name, module_names_breakdown, weights)
-                module.load_weights(weights=module_weights)
+                # Note: module_weights may be empty after filtering (e.g., in streaming weight updates)
+                if module_weights:
+                    module.load_weights(weights=module_weights)
            else:
                module_weights = weight_mapper.filter_weights(name, weights)
-                if weight_mapper.is_special_instance_module(module):
-                    weight_mapper.handle_special_instance_module(
-                        module, module_name, module_weights)
-
-                elif hasattr(module, 'load_weights'):
-                    if "linear_attn.conv1d" in name:
-                        module_weights['weight'] = module_weights[
-                            'weight'].squeeze(dim=1)
-                    module.load_weights(weights=[module_weights])
-                else:
-                    for n, p in module._parameters.items():
-                        if p is not None:
+                # Note: module_weights may be empty after filtering (e.g., in streaming weight updates)
+                if module_weights:
+                    if weight_mapper.is_special_instance_module(module):
+                        weight_mapper.handle_special_instance_module(
+                            module, module_name, module_weights)
+                    elif hasattr(module, 'load_weights'):
+                        if module_weights:
+                            if "linear_attn.conv1d" in name:
+                                module_weights['weight'] = module_weights[
+                                    'weight'].squeeze(dim=1)
+                            module.load_weights(weights=[module_weights])
+                    else:
+                        for n, p in module.named_parameters(recurse=False):
                             weight_mapper.handle_manual_copy(
                                 module_name, module_weights, n, p)
 
    if os.environ.get("TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL",
                      "True") in ["True", "true", "1", "yes", "y"]:
-        for name, module in tqdm(list(model.named_modules()),
+        for name, module in tqdm(list(
+                model.named_modules(remove_duplicate=False)),
                                 desc="Loading weights"):
            load_single_module(name, module)
    else:
-        all_modules = dict(model.named_modules())
+        # remove_duplicate=False ensures original modules sharing weights with next_layer_layernorm are not skipped
+        all_modules = dict(model.named_modules(remove_duplicate=False))
        serial_load_modules = []
        if preload_weight_modules is not None:
            for module in preload_weight_modules:

@@ -990,8 +1006,11 @@ def load_single_module(name, module):
                    del all_modules[module]
            pbar.close()
 
-        pbar = tqdm(list(model.named_modules()),
+        pbar = tqdm(list(model.named_modules(remove_duplicate=False)),
                    desc="Loading weights concurrently")
-        args_list = [(name, module) for name, module in model.named_modules()
-                     if name not in serial_load_modules]
+        args_list = [
+            (name, module)
+            for name, module in model.named_modules(remove_duplicate=False)
+            if name not in serial_load_modules
+        ]
        run_concurrently(load_single_module, args_list, pbar=pbar)
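
The repeated switch from model.named_modules() to model.named_modules(remove_duplicate=False) is the load-bearing change here: PyTorch's default traversal yields each module object only once, so when one module is reachable under two names (the next_layer_layernorm aliasing mentioned in the added comments), the name visited second is silently skipped and its weights are never reloaded. A small self-contained sketch of the difference, using an invented toy model:

    import torch.nn as nn

    # Toy model where the same LayerNorm object is registered under two names,
    # mimicking the next_layer_layernorm aliasing referenced in the comments above.
    class Block(nn.Module):
        def __init__(self):
            super().__init__()
            self.ln = nn.LayerNorm(8)

    class Toy(nn.Module):
        def __init__(self):
            super().__init__()
            self.block0 = Block()
            self.block1 = Block()
            # Alias: block0 also references block1's layernorm.
            self.block0.next_layer_layernorm = self.block1.ln

    m = Toy()
    default_names = {name for name, _ in m.named_modules()}
    all_names = {name for name, _ in m.named_modules(remove_duplicate=False)}
    # With the default traversal the original 'block1.ln' is skipped, because the same
    # module object was already visited via the alias 'block0.next_layer_layernorm'.
    print(all_names - default_names)  # {'block1.ln'}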

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 3 additions & 1 deletion

@@ -670,6 +670,7 @@ def create_py_executor_instance(
     peft_cache_config: Optional[PeftCacheConfig] = None,
     scheduler_config: Optional[SchedulerConfig] = None,
     cache_transceiver_config: Optional[CacheTransceiverConfig] = None,
+    virtual_memory_pools: Optional[dict] = None,
 ) -> PyExecutor:
     kv_cache_manager = resources.get(ResourceManagerType.KV_CACHE_MANAGER, None)

@@ -818,7 +819,8 @@ def create_py_executor_instance(
         garbage_collection_gen0_threshold=garbage_collection_gen0_threshold,
         kv_connector_manager=kv_connector_manager,
         max_seq_len=max_seq_len,
-        peft_cache_config=peft_cache_config)
+        peft_cache_config=peft_cache_config,
+        virtual_memory_pools=virtual_memory_pools)
 
 
 def create_torch_sampler_args(mapping: Mapping, *, max_seq_len: int,

tensorrt_llm/_torch/pyexecutor/config.py

Lines changed: 3 additions & 0 deletions

@@ -108,6 +108,9 @@ class PyTorchConfig:
     # If true, ONLY the vision encoder part of the full model is loaded/executed.
     mm_encoder_only: bool = False
 
+    # Enable extra setup to support sleep feature.
+    enable_sleep: bool = False
+
     # If true, adjust PyTorch CUDA memory fraction to correspond to the
     # total GPU memory minus the statically allocated engine memory.
     # If false, set the PyTorch CUDA memory fraction to 1.0.
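
Since enable_sleep is an ordinary PyTorchConfig field with a default, turning it on is just a matter of setting it where the config is built. A minimal sketch, assuming direct construction of the dataclass; how the flag is plumbed through higher-level LLM arguments is not shown in this excerpt.

    # Hypothetical sketch: opting into the extra setup for the sleep/wakeup feature.
    from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig

    pytorch_config = PyTorchConfig(
        enable_sleep=True,  # new field added by this commit; defaults to False
    )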
