
Commit 9ccc61d

rebase to v1.0.0rc6
- add success into GenerationResult
- enable_block_reuse=False
- clean device uuid code
- enable partial load
- align interfaces with ray branch
- stream update weights
- verified ray + trtllm works
- resolve ray conflict
- raise exception in update_weights
- add gate/up bundled update (works)
1 parent a16ba64 commit 9ccc61d

13 files changed: +944, -21 lines


tensorrt_llm/_torch/models/modeling_utils.py

Lines changed: 13 additions & 7 deletions
@@ -804,6 +804,9 @@ def load_single_module(name, module):
                 for new_name in params_map[names[-1]]:
                     fw = filter_weights('.'.join(names[:-1] + [new_name]),
                                         weights)
+                    # tmp fixes to enable partial updates in old path
+                    if not fw:
+                        continue
                     if new_name in ['k_proj', 'v_proj']:
                         num_kv_heads_list = [num_kv_heads
                                              ] * len(fw) if isinstance(
@@ -820,15 +823,18 @@ def load_single_module(name, module):
                         }

                     module_weights.append(fw)
-                module.load_weights(weights=module_weights)
+                if module_weights:
+                    module.load_weights(weights=module_weights)
+
             else:
                 module_weights = filter_weights(name, weights)
-                if hasattr(module, 'load_weights'):
-                    module.load_weights(weights=[module_weights])
-                else:
-                    for n, p in module._parameters.items():
-                        if p is not None:
-                            p.data.copy_(module_weights[n][:])
+                if module_weights:
+                    if hasattr(module, 'load_weights'):
+                        module.load_weights(weights=[module_weights])
+                    else:
+                        for n, p in module._parameters.items():
+                            if p is not None:
+                                p.data.copy_(module_weights[n][:])

    if os.environ.get("TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL",
                      False) in ["True", "true", "1", "yes", "y"]:
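
Note: the two new guards are what make partial updates work on this legacy path. When only a subset of weights is streamed in (for example a bundled gate/up update), filter_weights comes back empty for every module the update does not touch, and the loader now skips those modules instead of calling load_weights with nothing. A minimal sketch of the behaviour being assumed; the helper below is an illustrative stand-in, not the real filter_weights:

# Illustrative only: mimics the prefix-filtering behaviour the guards rely on.
def filter_weights(prefix: str, weights: dict) -> dict:
    return {k[len(prefix) + 1:]: v
            for k, v in weights.items()
            if k.startswith(prefix + '.')}

partial_update = {"model.layers.0.mlp.gate_up_proj.weight": "tensor placeholder"}

fw = filter_weights("model.layers.1.self_attn.q_proj", partial_update)
if not fw:   # the new guard: this module is not part of the partial update
    pass     # skip it instead of calling module.load_weights with an empty list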

tensorrt_llm/_torch/pyexecutor/llm_request.py

Lines changed: 6 additions & 3 deletions
@@ -164,14 +164,16 @@ def __init__(self,
                  return_log_probs: bool = False,
                  return_context_logits: bool = False,
                  return_generation_logits: bool = False,
-                 exclude_last_generation_logits: bool = False):
+                 exclude_last_generation_logits: bool = False,
+                 success: bool = False):
         self._streaming = streaming
         self._context_logits = LogitsStorage(
             prompt_len, use_device_memory) if return_context_logits else None
         self._generation_logits = LogitsStorage(
             max_new_tokens, use_device_memory, exclude_last_generation_logits
         ) if return_generation_logits else None
         self._log_probs = LogProbStorage() if return_log_probs else None
+        self._success = success

     def append_context_logits(self, context_logits: torch.Tensor):
         if self._context_logits:
@@ -247,8 +249,9 @@ def __getattr__(self, item):
         return getattr(result, item)

     def deserialize(self):
-        self._result = tensorrt_llm.bindings.executor.deserialize_result(
-            self._result)
+        if self._result is not None:
+            self._result = tensorrt_llm.bindings.executor.deserialize_result(
+                self._result)


 @dataclass
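
Note: the new success flag and the None check in deserialize() work together. Control operations (weight update, sleep, wakeup) produce responses that carry no serialized C++ executor result, only a Python-side result whose success flag reports the outcome. A short sketch mirroring the constructor calls that py_executor.py (below) uses for such responses; argument meanings are taken from that code rather than a documented API:

# Control-request responses have no executor result to deserialize; only the
# PyResult's success flag matters. Mirrors the calls added in py_executor.py.
py_result = PyResult(0, 0, success=True)
control_result = LlmResult(result=None, py_result=py_result, is_final=True)

control_result.deserialize()   # now a no-op instead of failing on a None result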

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 120 additions & 2 deletions
@@ -15,6 +15,7 @@

 from tensorrt_llm._torch.pyexecutor.resource_manager import ResourceManagerType
 from tensorrt_llm._torch.pyexecutor.seq_slot_manager import SeqSlotManager
+from tensorrt_llm._torch.utils import get_device_uuid
 from tensorrt_llm._utils import (customized_gc_thresholds, global_mpi_rank,
                                  is_trace_enabled, nvtx_range, trace_func)
 from tensorrt_llm.bindings.executor import (DisServingRequestStats,
@@ -35,7 +36,7 @@
 from .guided_decoder import GuidedDecoder
 from .kv_cache_transceiver import KvCacheTransceiver
 from .llm_request import (ExecutorRequest, LlmRequest, LlmRequestState,
-                          LlmResponse)
+                          LlmResponse, LlmResult, executor_request_to_llm_request, PyResult)
 from .model_engine import ModelEngine
 from .sampler import Sampler, SampleState, SampleStateTensors
 from .scheduler import RequestScheduler, ScheduledRequests
@@ -184,6 +185,7 @@ def __init__(self,
         self.num_fetch_requests_cur_rank = 0
         self.num_fetch_requests = 0
         self.shutdown_event = threading.Event()
+        self.request_accumulator: List[RequestQueueItem] = []

         # response used data
         self.response_lock = threading.Lock()
@@ -235,6 +237,8 @@ def __init__(self,
         )
         self.executor_request_queue.set_exclude_last_generation_logits(
             self.disable_overlap_scheduler, self.sampler)
+        self.is_control_request = False
+        self.control_request_id = 0

         self.stats_lock = threading.Lock()
         self.stats = []
@@ -383,12 +387,29 @@ def wait_shutdown(self):

     def enqueue_request(self,
                         request: ExecutorRequest,
-                        query: Optional[List] = None) -> int:
+                        query: Optional[List] = None,
+                        weight_ipc_handles: Optional[dict] = None,
+                        sleep_level: Optional[int] = None,
+                        wakeup_level: Optional[int] = None) -> int:
         """
         Enqueue a new request, query is only used in `StarAttention`.
         """
         req_id = self.executor_request_queue.enqueue_request(request, query)

+        ## if weight_ipc_handles is not None:
+        ##     self.request_queue.put(RequestQueueItem(UPDATE_WEIGHT_REQUEST_ID, None, False, None, weight_ipc_handles))
+        ## elif sleep_level is not None:
+        ##     self.request_queue.put(RequestQueueItem(SLEEP_REQUEST_ID, None, False, None, None, sleep_level))
+        ## elif wakeup_level is not None:
+        ##     self.request_queue.put(RequestQueueItem(WAKEUP_REQUEST_ID, None, False, None, None, None, wakeup_level))
+        ## elif query is not None:
+        ##     self.request_queue.put(RequestQueueItem(req_id, request, query))
+        ## else:
+        ##     self.request_queue.put(RequestQueueItem(req_id, request))
+        ## #self.request_queue.put(RequestQueueItem(req_id, request, False, query, weight_ipc_handles, sleep_level, wakeup_level))
+        ## self.next_req_id += 1
+        ## finally:
+        ##     self.enqueue_lock.release()
         return req_id

     def set_gather_responses(self, gather_all_responses):
@@ -666,6 +687,18 @@ def _executor_loop_pp(self):
                 new_requests = self._fetch_and_activate_new_requests()
                 if self.should_stop_processing:
                     break
+                if self.is_control_request:
+                    self.is_control_request = False
+                    assert len(new_requests) == 1, f"control request should be the only request in the list, but got {len(new_requests)}"
+                    if (new_requests[0].is_update_weight_request()):
+                        self._update_weight(new_requests[0])
+                    elif (new_requests[0].is_sleep_request()):
+                        self._sleep(new_requests[0])
+                    elif (new_requests[0].is_wakeup_request()):
+                        self._wakeup(new_requests[0])
+                    else:
+                        assert False, "Invalid control request"
+                    continue

                 if self.kv_cache_transceiver:
                     self._check_disagg_gen_transfer_status()
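
Note: the loops above call is_update_weight_request(), is_sleep_request() and is_wakeup_request() on the fetched queue item, and the commented-out code in enqueue_request suggests these are keyed off sentinel request IDs (UPDATE_WEIGHT_REQUEST_ID, SLEEP_REQUEST_ID, WAKEUP_REQUEST_ID). Those definitions are not part of this diff; the sketch below is a hypothetical illustration of what the predicates might look like, with the sentinel values and dataclass layout being assumptions:

# Hypothetical sketch of the RequestQueueItem predicates the executor loops rely on.
# Sentinel IDs and field layout are assumptions inferred from the commented-out
# code in enqueue_request; they are not defined in this commit.
from dataclasses import dataclass
from typing import Optional

UPDATE_WEIGHT_REQUEST_ID = -100
SLEEP_REQUEST_ID = -101
WAKEUP_REQUEST_ID = -102

@dataclass
class RequestQueueItem:
    id: int
    request: Optional[object] = None
    weight_ipc_handles: Optional[dict] = None
    sleep_level: Optional[int] = None
    wakeup_level: Optional[int] = None

    def is_update_weight_request(self) -> bool:
        return self.id == UPDATE_WEIGHT_REQUEST_ID

    def is_sleep_request(self) -> bool:
        return self.id == SLEEP_REQUEST_ID

    def is_wakeup_request(self) -> bool:
        return self.id == WAKEUP_REQUEST_ID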
@@ -914,6 +947,18 @@ def _executor_loop(self):
                 scheduled_batch, iter_stats = self._prepare_and_schedule_batch()
                 if scheduled_batch is None:
                     break
+                if self.is_control_request:
+                    self.is_control_request = False
+                    assert len(new_requests) == 1, f"control request should be the only request in the list, but got {len(new_requests)}"
+                    if (new_requests[0].is_update_weight_request()):
+                        self._update_weight(new_requests[0])
+                    elif (new_requests[0].is_sleep_request()):
+                        self._sleep(new_requests[0])
+                    elif (new_requests[0].is_wakeup_request()):
+                        self._wakeup(new_requests[0])
+                    else:
+                        assert False, "Invalid control request"
+                    continue

                 self._pause_requests(scheduled_batch.paused_requests)

@@ -995,6 +1040,67 @@ def _prepare_draft_requests(self):
                 logger.error(f"Encountered an error in decode: {error_msg}")
                 self._handle_errors(error_msg)

+    def update_weights(self, weights):
+        # Load weights into the model
+        self.model_engine.model.load_weights(weights)
+        torch.cuda.synchronize()
+
+        # TODO: reset prefix cache
+
+    def update_weight_from_ipc_handles(self, handles):
+        """
+        Update model weights from IPC handles.
+
+        Args:
+            ipc_handles (dict): Dictionary mapping device UUIDs to parameter IPC handles.
+                {device_uuid: all_handles}
+        """
+        from tensorrt_llm._torch.utils import get_device_uuid
+        device_uuid = get_device_uuid(self.device_id)
+
+        if device_uuid not in handles:
+            raise ValueError(f"Device UUID {device_uuid} not found in ipc_handles")
+
+        try:
+            weights = {}
+            all_handles = handles[device_uuid]
+
+            for param_name, tensor_handle in all_handles:
+                func, args = tensor_handle
+                list_args = list(args)
+                list_args[6] = self.device_id  # Set target device
+                tensor = func(*list_args)
+                weights[param_name] = tensor
+
+            self.update_weights(weights)
+
+        except Exception as e:
+            logger.error(f"failed to update weights from ipc handles: {e}")
+            raise e
+
+    def _sleep(self, sleep_request):
+        self.is_sleep_request = False
+        self._enqueue_responses({sleep_request.id: LlmResponse(request_id=sleep_request.id, result=LlmResult(result=None, py_result=PyResult(0, 0, success=True), is_final=True), client_id=sleep_request.id)})
+
+    def _wakeup(self, wakeup_request):
+        self.is_wakeup_request = False
+        self._enqueue_responses({wakeup_request.id: LlmResponse(request_id=wakeup_request.id, result=LlmResult(result=None, py_result=PyResult(0, 0, success=True), is_final=True), client_id=wakeup_request.id)})
+
+    def _update_weight(self, update_weight_request):
+        self.is_update_weight_request = False
+
+        try:
+            self.update_weight_from_ipc_handles(update_weight_request.weight_ipc_handles)
+            update_weight_response = LlmResponse(request_id=update_weight_request.id, result=LlmResult(result=None, py_result=PyResult(0, 0, success=True), is_final=True), client_id=update_weight_request.id)
+            self._enqueue_responses({update_weight_request.id: update_weight_response})
+        except Exception as e:
+            print(
+                f"Error in update_weights_from_ipc_handles: {e}"
+            )
+            raise e
+            #update_weight_response = LlmResponse(request_id=update_weight_request.id, result=LlmResult(result=None, py_result=PyResult(0, 0, success=False), is_final=True), client_id=update_weight_request.id)
+            #self._enqueue_responses({update_weight_request.id: update_weight_response})
+
     def _executor_loop_overlap(self):
         torch.cuda.set_device(self.device_id)
         # ensure the context is created, otherwise, some MPI calls will fail.
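
Note: update_weight_from_ipc_handles expects, per device UUID, a list of (param_name, (rebuild_func, args)) pairs and overwrites args[6] with the local device index before rebuilding each tensor. That layout matches what torch.multiprocessing.reductions.reduce_tensor produces for CUDA tensors (index 6 of the rebuild args is the device). The producer side is not part of this commit; the sketch below shows one way compatible handles could be built, with the packaging inferred from the consumer code rather than defined here:

# Sketch of a producer for the handles dict consumed by update_weight_from_ipc_handles.
# The {device_uuid: [(name, handle), ...]} packaging is an assumption inferred from
# the consumer code above.
import torch
from torch.multiprocessing.reductions import reduce_tensor

from tensorrt_llm._torch.utils import get_device_uuid

def build_ipc_handles(named_params, device_id: int = 0) -> dict:
    all_handles = []
    for name, param in named_params:
        # reduce_tensor returns (rebuild_func, args) for a CUDA tensor
        handle = reduce_tensor(param.detach())
        all_handles.append((name, handle))
    return {get_device_uuid(device_id): all_handles}

# e.g. handles = build_ipc_handles(trainer_model.named_parameters())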
@@ -1018,6 +1124,18 @@ def _executor_loop_overlap(self):
             scheduled_batch, iter_stats = self._prepare_and_schedule_batch()
             if scheduled_batch is None:
                 break
+            if self.is_control_request:
+                self.is_control_request = False
+                assert len(new_requests) == 1, f"control request should be the only request in the list, but got {len(new_requests)}"
+                if (new_requests[0].is_update_weight_request()):
+                    self._update_weight(new_requests[0])
+                elif (new_requests[0].is_sleep_request()):
+                    self._sleep(new_requests[0])
+                elif (new_requests[0].is_wakeup_request()):
+                    self._wakeup(new_requests[0])
+                else:
+                    assert False, "Invalid control request"
+                continue

             self._pause_requests(scheduled_batch.paused_requests)

tensorrt_llm/_torch/utils.py

Lines changed: 72 additions & 1 deletion
@@ -2,9 +2,10 @@
 import threading
 from dataclasses import dataclass
 from enum import Enum
-from typing import Dict, List
+from typing import Dict, List, Generator

 import torch
+import pynvml

 from tensorrt_llm._utils import TensorWrapper, convert_to_torch_tensor
 from tensorrt_llm.math_utils import ceil_div, pad_up
@@ -261,3 +262,73 @@ def set_piecewise_cuda_graph_flag(enable: bool):
 def get_piecewise_cuda_graph_flag() -> bool:
     global _enable_piecewise_cuda_graph
     return _enable_piecewise_cuda_graph
+
+
+@contextlib.contextmanager
+def nvml_context() -> Generator[None, None, None]:
+    """Context manager for NVML initialization and shutdown.
+
+    Raises:
+        RuntimeError: If NVML initialization fails
+    """
+    try:
+        pynvml.nvmlInit()
+        yield
+    except pynvml.NVMLError as e:
+        raise RuntimeError(f"Failed to initialize NVML: {e}")
+    finally:
+        try:
+            pynvml.nvmlShutdown()
+        except:
+            pass
+
+def device_id_to_physical_device_id(device_id: int) -> int:
+    """Convert a logical device ID to a physical device ID considering CUDA_VISIBLE_DEVICES."""
+    if "CUDA_VISIBLE_DEVICES" in os.environ:
+        device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
+        try:
+            physical_device_id = int(device_ids[device_id])
+            return physical_device_id
+        except ValueError:
+            raise RuntimeError(
+                f"Failed to convert logical device ID {device_id} to physical device ID. Available devices are: {device_ids}."
+            )
+    else:
+        return device_id
+
+def get_device_uuid(device_idx: int) -> str:
+    """Get the UUID of a CUDA device using NVML."""
+    # Convert logical device index to physical device index
+    global_device_idx = device_id_to_physical_device_id(device_idx)
+
+    # Get the device handle and UUID
+    with nvml_context():
+        try:
+            handle = pynvml.nvmlDeviceGetHandleByIndex(global_device_idx)
+            uuid = pynvml.nvmlDeviceGetUUID(handle)
+            # Ensure the UUID is returned as a string, not bytes
+            if isinstance(uuid, bytes):
+                return uuid.decode("utf-8")
+            elif isinstance(uuid, str):
+                return uuid
+            else:
+                raise RuntimeError(
+                    f"Unexpected UUID type: {type(uuid)} for device {device_idx} (global index: {global_device_idx})"
+                )
+        except pynvml.NVMLError as e:
+            raise RuntimeError(
+                f"Failed to get device UUID for device {device_idx} (global index: {global_device_idx}): {e}"
+            )

+def get_free_memory_bytes(device_idx: int) -> float:
+    """Get the free memory of a CUDA device in bytes using NVML."""
+    global_device_idx = device_id_to_physical_device_id(device_idx)
+    with nvml_context():
+        try:
+            handle = pynvml.nvmlDeviceGetHandleByIndex(global_device_idx)
+            return pynvml.nvmlDeviceGetMemoryInfo(handle).free
+        except pynvml.NVMLError as e:
+            raise RuntimeError(
+                f"Failed to get free memory for device {device_idx} (global index: {global_device_idx}): {e}"
+            )
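
Note: these helpers resolve a logical CUDA device (respecting CUDA_VISIBLE_DEVICES) to a stable NVML UUID, which is what the weight-update path uses to key IPC handles per device. A short usage sketch; the printed values are illustrative:

# Usage sketch for the new NVML helpers added above.
from tensorrt_llm._torch.utils import get_device_uuid, get_free_memory_bytes

uuid = get_device_uuid(0)          # e.g. "GPU-8f9f4a2c-...."
free = get_free_memory_bytes(0)    # free bytes on the same physical device
print(f"device 0 -> {uuid}, {free / 1e9:.1f} GB free")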

tensorrt_llm/executor/executor.py

Lines changed: 19 additions & 0 deletions
@@ -204,6 +204,25 @@ def generate(

         return futures

+    def async_update_weights_from_ipc_handles(self, handles: dict):
+        update_weights_request = GenerationRequest([], SamplingParams(end_id=0))
+        update_weights_request.set_weight_ipc_handles(handles)
+        result = self.submit(update_weights_request)
+        return result
+
+    def async_sleep(self, level: int = 1):
+        sleep_request = GenerationRequest([], SamplingParams(end_id=0))
+        sleep_request.set_sleep_level(level)
+        result = self.submit(sleep_request)
+        return result
+
+    def async_wakeup(self):
+        sleep_request = GenerationRequest([], SamplingParams(end_id=0))
+        sleep_request.set_wakeup_level(1)
+        result = self.submit(sleep_request)
+        return result
+
+
     def _get_next_client_id(self):
         # (self._last_client_id + 1) % UINT64_MAX
         self._last_client_id = (self._last_client_id + 1) & ((1 << 64) - 1)
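
Note: the three helpers submit empty GenerationRequests that only carry control metadata, so callers get back the usual result object and can wait on it to confirm the operation completed. A sketch of the intended call pattern, assuming `executor` is the GenerationExecutor instance and reusing the hypothetical build_ipc_handles producer sketched earlier; the blocking .result() wait mirrors how generation results are normally consumed and is an assumption, not a documented contract:

# Sketch of driving the new control helpers from a training loop.
handles = build_ipc_handles(trainer_model.named_parameters())  # hypothetical producer, see sketch above

executor.async_sleep(level=1).result()                          # quiesce the executor
executor.async_update_weights_from_ipc_handles(handles).result()
executor.async_wakeup().result()                                # resume serving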
