davidmlw
diff --git a/‎tests/unittest/llmapi/test_llm_update_weights.py‎
Lines changed: 144 additions & 41 deletions b/‎tests/unittest/llmapi/test_llm_update_weights.py‎
Lines changed: 144 additions & 41 deletions
@@ -16,7 +16,7 @@
 )
 #from torch.distributed.fsdp.api import ShardedStateDictConfig, StateDictType
 from transformers import AutoModelForCausalLM, AutoTokenizer
-
+from torch.distributed.tensor import DTensor
 
 def init_distributed():
     """Initialize distributed training"""
@@ -29,7 +29,7 @@ def init_distributed():
     if "MASTER_PORT" not in os.environ:
         os.environ["MASTER_PORT"] = "29500"
 
-    dist.init_process_group(backend="nccl")
+    dist.init_process_group(backend="cpu:gloo,cuda:nccl")
     world_size = dist.get_world_size()
     rank = dist.get_rank()
     torch.cuda.set_device(rank)
@@ -39,6 +39,18 @@ def exit_distributed():
     """Exit distributed training"""
     if dist.is_initialized():
         dist.destroy_process_group()
+
+def report_device_id() -> str:
+    """Return the UUID of the CUDA device currently selected in torch.
+
+    The lookup itself is delegated to
+    ``tensorrt_llm._torch.utils.get_device_uuid``.
+
+    Returns:
+        str: UUID of the device in the format "GPU-xxxxx"
+    """
+    # Function-scope import keeps tensorrt_llm out of module import time.
+    from tensorrt_llm._torch.utils import get_device_uuid
+    return get_device_uuid(torch.cuda.current_device())
+
 class fsdp_interface:
     def __init__(self, model_dir):
         self.model_dir = model_dir
@@ -96,17 +108,23 @@ def load_fsdp_model(self, model_dir):
         return fsdp_model
 
 
-    def report_device_id(self) -> str:
-        """Report the UUID of the current CUDA device using NVML.
 
-        Returns:
-            str: UUID of the device in the format "GPU-xxxxx"
-        """
-        from tensorrt_llm._torch.utils import get_device_uuid
-        # Get current device index from torch
-        device_idx = torch.cuda.current_device()
-        # Get device UUID using NVML
-        return get_device_uuid(device_idx)
+    def per_tensor_generator(self):
+        """Yield ``(name, tensor)`` pairs from the model's state dict.
+
+        The state dict is also stored on
+        ``self._held_sharded_state_dict_reference`` so it stays referenced
+        while the caller consumes the generator. Despite that attribute's
+        name, the FSDP branch requests ``StateDictType.FULL_STATE_DICT``.
+        """
+        if isinstance(self.model, FSDP):
+            # FSDP model: request the full state dict using a
+            # default-constructed FullStateDictConfig.
+            full_state_ctx = FSDP.state_dict_type(
+                self.model,
+                state_dict_type=StateDictType.FULL_STATE_DICT,
+                state_dict_config=FullStateDictConfig(),
+            )
+            with full_state_ctx:
+                self._held_sharded_state_dict_reference = self.model.state_dict()
+        else:
+            # A non-FSDP model has to be moved to the GPU by hand first.
+            self.model = self.manual_load_to_gpu(self.model)
+            self._held_sharded_state_dict_reference = self.model.state_dict()
+        yield from self._held_sharded_state_dict_reference.items()
 
     @torch.no_grad()
     def prepare_weights_for_ipc(self) -> tuple[list[tuple[str, int]], float]:
@@ -182,7 +200,7 @@ def get_weights_ipc_handles(self, keys: list[str]) -> dict[str, Any]:
         self._held_streamed_param_reference = converted_params
 
         # Get device UUID for IPC
-        device_uuid = self.report_device_id()
+        device_uuid = report_device_id()
         # Create handles for the tensors
         all_handles = []
         for key, p in converted_params.items():
@@ -231,6 +249,25 @@ def prepare_weights_for_ipc_refit(
 
         return grouped_param_keys
 
+class NamedParam:
+    """A parameter bundled with its name and its size."""
+    def __init__(self, name, size, param):
+        # name: parameter key; size: byte count as computed by the caller;
+        # param: the parameter object itself.
+        self.name, self.size, self.param = name, size, param
+
+class GateAndUp:
+    """Holds the gate and up halves of one pair until both are present."""
+    def __init__(self):
+        self.gate = self.up = None
+    def set_gate(self, gate):
+        """Store the gate half."""
+        self.gate = gate
+    def set_up(self, up):
+        """Store the up half."""
+        self.up = up
+    def get_size(self):
+        """Return the combined ``size`` of both halves; both must be set."""
+        gate_size = self.gate.size
+        up_size = self.up.size
+        return gate_size + up_size
+    def is_complete(self):
+        """Return True once neither half is missing."""
+        return not (self.gate is None or self.up is None)
+
 class trtllm_interface:
     def __init__(self, model_dir, tensor_parallel_size):
         self.world_size = dist.get_world_size()
@@ -257,13 +294,104 @@ def load_trtllm_model(self, model_dir, tensor_parallel_size):
         else:
             return None
 
+    def update_weights_from_ipc_handles(self, rank, device_handles):
+        """Gather every rank's handle dict on rank 0 and apply them there.
+
+        Args:
+            rank: Rank of the calling process; only rank 0 receives the
+                gathered objects and calls into ``self.llm``.
+            device_handles: Dict contributed by this rank.
+
+        The call goes through ``dist.gather_object``, so every rank has to
+        take part in it.
+        """
+        is_dst = rank == 0
+        # Only the destination rank supplies a receive buffer.
+        gathered_handles = [None] * dist.get_world_size() if is_dst else None
+        dist.gather_object(
+            obj=device_handles,
+            object_gather_list=gathered_handles,
+            dst=0,
+        )
+        if not is_dst:
+            return
+        # Merge the per-rank dicts; on a duplicate key the later rank wins.
+        all_handles = {}
+        for per_rank in gathered_handles:
+            all_handles.update(per_rank)
+        self.llm.update_weights_from_ipc_handles(all_handles)
+
+    def update_weights_from_tensor_generator(self, tensor_generator,
+                                             batch_budget_bytes=0.7 * (1024**3)):
+        """Push weights to the LLM in size-bounded batches of IPC handles.
+
+        Each tensor is turned into a shareable handle with
+        ``torch.multiprocessing.reductions.reduce_tensor`` and queued. The
+        queue is sent through ``self.update_weights_from_ipc_handles``
+        whenever the next item would not fit in the remaining budget, and
+        once more at the end. Tensors whose name contains ``gate_proj`` or
+        ``up_proj`` are held back until both halves of a pair have arrived,
+        so the two always travel in the same batch.
+
+        Args:
+            tensor_generator: Iterable of ``(name, tensor)`` pairs. ``DTensor``
+                values are materialised with ``full_tensor()`` first.
+            batch_budget_bytes: Byte budget per batch. An item larger than the
+                whole budget is sent in a batch of its own instead of
+                tripping an assertion.
+
+        Raises:
+            AssertionError: If a gate/up half is seen twice, or a half is
+                still unpaired when the generator is exhausted.
+
+        Note:
+            Every flush gathers to rank 0, so all ranks must iterate tensors
+            of the same sizes in the same order to flush in lockstep.
+        """
+        from torch.multiprocessing.reductions import reduce_tensor
+
+        device_uuid = report_device_id()
+        rank = dist.get_rank()
+        cur_available_bytes = batch_budget_bytes
+        # Keeps the queued tensors referenced until their batch is flushed.
+        converted_params = {}
+        cur_handles = []
+        # Incomplete gate/up pairs, keyed by the name with the tag removed.
+        gate_up = {}
+
+        for name, param in tensor_generator:
+            if isinstance(param, DTensor):
+                param = param.full_tensor()
+            # Measured after materialisation: this is the tensor being shared.
+            size_in_bytes = param.element_size() * param.numel()
+            named = NamedParam(name, size_in_bytes, param)
+
+            is_gate = "gate_proj" in name
+            if is_gate or "up_proj" in name:
+                pair_key = name.replace("gate_proj" if is_gate else "up_proj", "")
+                if pair_key not in gate_up:
+                    gate_up[pair_key] = GateAndUp()
+                pair = gate_up[pair_key]
+                if is_gate:
+                    assert pair.gate is None, f"duplicate gate_proj: {name}"
+                    pair.set_gate(named)
+                else:
+                    assert pair.up is None, f"duplicate up_proj: {name}"
+                    pair.set_up(named)
+                if not pair.is_complete():
+                    # Wait for the other half before queueing anything.
+                    continue
+                del gate_up[pair_key]
+                batch_items = (pair.gate, pair.up)
+                size_in_bytes = pair.get_size()
+            else:
+                batch_items = (named,)
+
+            # Flush only when something is queued. With an empty queue the
+            # budget is already full-size, so flushing could not make room.
+            if cur_handles and size_in_bytes > cur_available_bytes:
+                self.update_weights_from_ipc_handles(
+                    rank, {device_uuid: cur_handles})
+                cur_available_bytes = batch_budget_bytes
+                converted_params = {}
+                cur_handles = []
+
+            # May go negative for an oversize item; the next item then
+            # triggers a flush, leaving the oversize item alone in its batch.
+            cur_available_bytes -= size_in_bytes
+            for item in batch_items:
+                converted_params[item.name] = item.param
+                cur_handles.append(
+                    (item.name, reduce_tensor(item.param.detach())))
+
+        assert len(gate_up) == 0, f"unpaired gate/up keys: {sorted(gate_up)}"
+
+        if cur_handles:
+            self.update_weights_from_ipc_handles(
+                rank, {device_uuid: cur_handles})
+
+def get_total_available_bytes(pg: dist.ProcessGroup, message: str = "") -> int:
+    """Return 20% of the smallest free CUDA memory across the ranks in ``pg``.
+
+    Prints the local memory statistics and the group-wide minimum, each
+    prefixed with ``message``.
+
+    Args:
+        pg: Process group over which the minimum is taken. The reduced
+            tensor is created without a device argument, so the group
+            presumably needs a CPU-capable backend (TODO confirm).
+        message: Prefix for the two printed lines.
+
+    Returns:
+        int: Whole number of bytes, truncated from ``min_free * 0.2``, so
+        the value matches the declared return type.
+    """
+    mem_allocated = torch.cuda.memory_allocated()
+    mem_reserved = torch.cuda.memory_reserved()
+    mem_free, mem_total = torch.cuda.mem_get_info()
+    print(f"{message} mem_free: {mem_free:,}, mem_total: {mem_total:,}, mem_allocated: {mem_allocated:,}, mem_reserved: {mem_reserved:,}")
+    # MIN-reduce so every rank agrees on the tightest device.
+    min_free = torch.tensor(mem_free)
+    dist.all_reduce(min_free, op=dist.ReduceOp.MIN, group=pg)
+    mem_free = min_free.item()
+    print(f"{message} gathered_mem_free: {mem_free:,}")
+    return int(mem_free * 0.2)
+
 def cleanup():
     """Cleanup function to destroy process group"""
     if dist.is_initialized():
         print(f"Cleaning up process group on rank {dist.get_rank()}")
         dist.destroy_process_group()
 
-
 def main():
     parser = argparse.ArgumentParser(
         description="LLM models with the PyTorch workflow.")
@@ -306,7 +434,6 @@ def main():
     # For FSDP mode, we would need additional logic to integrate withTensorRT-LLM
     # This is a placeholder for now
     if rank == 0:
-
         outputs = trtllm.llm.generate(prompts, sampling_params)
         for i, output in enumerate(outputs):
             prompt = output.prompt
@@ -321,33 +448,9 @@ def main():
         result = trtllm.llm.wakeup()
         print(f"wakeup result: {result}")
 
-    dict_info, total_available_bytes = fsdp.prepare_weights_for_ipc()
-
-    grouped_param_keys = fsdp.prepare_weights_for_ipc_refit(0.5)
-    total_num_keys = sum(len(k) for k in grouped_param_keys)
-    print(
-        f"[Refit] Split {total_num_keys} keys into {len(grouped_param_keys)} groups"
-    )
-
-    from tensorrt_llm._torch.utils import get_free_memory_bytes
-    for keys in grouped_param_keys:
-        handles = fsdp.get_weights_ipc_handles(keys)
-        #print(f"handles: {handles}")
-
-        # Collect handles from all ranks
-        all_handles = [None for _ in range(world_size)]
-        dist.all_gather_object(all_handles, handles)
-        all_handles = {k: v for d in all_handles for k, v in d.items()}
-        #print(f"all_handles: {all_handles.keys()}")
-
-        device_idx = torch.cuda.current_device()
-        total_available_bytes = get_free_memory_bytes(device_idx)
-        print(f"total_available_bytes: {total_available_bytes}")
-
-        if rank == 0:
-            result = trtllm.llm.update_weights_from_ipc_handles(all_handles)
-            print(f"update weights result: {result}")
+    trtllm.update_weights_from_tensor_generator(fsdp.per_tensor_generator())
 
+    # generate the output again
     if rank == 0:
         outputs = trtllm.llm.generate(prompts, sampling_params)
         for i, output in enumerate(outputs):