
Commit 8614cd3

[None][fix] fix: resolve GPU memory imbalance in concurrent weight loading (#6472)
Signed-off-by: Necofish <[email protected]>
Signed-off-by: Nekofish-L <[email protected]>
Signed-off-by: Jie Li <[email protected]>
Co-authored-by: Jie Li <[email protected]>
1 parent e2891a6 commit 8614cd3

File tree

3 files changed: +22 -3 lines changed


examples/llm-api/quickstart_advanced.py

Lines changed: 6 additions & 0 deletions
@@ -77,6 +77,11 @@ def add_llm_args(parser):
                         choices=["auto", "TorchSampler", "TRTLLMSampler"])
     parser.add_argument('--tp_size', type=int, default=1)
     parser.add_argument('--pp_size', type=int, default=1)
+    parser.add_argument('--orchestrator_type',
+                        type=str,
+                        default=None,
+                        choices=[None, 'rpc', 'ray'],
+                        help='Orchestrator type for multi-GPU execution')
     parser.add_argument('--moe_ep_size', type=int, default=-1)
     parser.add_argument('--moe_tp_size', type=int, default=-1)
     parser.add_argument('--moe_cluster_size', type=int, default=-1)

@@ -288,6 +293,7 @@ def setup_llm(args, **kwargs):
         trust_remote_code=args.trust_remote_code,
         gather_generation_logits=args.return_generation_logits,
         max_beam_width=args.max_beam_width,
+        orchestrator_type=args.orchestrator_type,
         **kwargs)

     use_beam_search = args.max_beam_width > 1
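The quickstart change adds an `--orchestrator_type` flag and forwards it to the `LLM` constructor in `setup_llm`. A minimal sketch of how the new flag parses, using a standalone `argparse` parser rather than the full `add_llm_args` setup:

```python
import argparse

# Standalone sketch of the new flag; quickstart_advanced.py registers it
# inside add_llm_args alongside --tp_size and --pp_size.
parser = argparse.ArgumentParser()
parser.add_argument('--orchestrator_type',
                    type=str,
                    default=None,
                    choices=[None, 'rpc', 'ray'],
                    help='Orchestrator type for multi-GPU execution')

args = parser.parse_args(['--orchestrator_type', 'ray'])
print(args.orchestrator_type)  # 'ray'; stays None when the flag is omitted
```

When the flag is left at its default, `orchestrator_type=None` is passed through to `setup_llm`, so existing invocations keep their previous behavior.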

tensorrt_llm/_torch/models/modeling_utils.py

Lines changed: 7 additions & 2 deletions
@@ -12,6 +12,7 @@
 from torch.utils._pytree import tree_any_only
 from tqdm import tqdm

+from tensorrt_llm._utils import local_mpi_rank
 from tensorrt_llm.lora_manager import HfLoraLoader
 from tensorrt_llm.models.convert_utils import split_matrix_tp

@@ -852,8 +853,10 @@ def _load_weights_impl(model: Union[nn.Module, DecoderModelForCausalLM],
         'qkv_proj': ['q_proj', 'k_proj', 'v_proj'],
         'gate_up_proj': ['gate_proj', 'up_proj']
     }
+    device_id = local_mpi_rank()

     def load_single_module(name, module):
+        torch.cuda.set_device(device_id)
         if len(module._parameters) > 0:
             # skip load weights if module is in skip_modules
             if any(skip_module in name for skip_module in skip_modules):
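The `torch.cuda.set_device(device_id)` call at the top of `load_single_module` matters because the current CUDA device is thread-local: worker threads spawned for concurrent weight loading start on `cuda:0` regardless of which GPU the rank's main thread is pinned to, which is what skewed memory toward GPU 0. A minimal sketch of the pattern (not the TRT-LLM code path), with a hypothetical `load_one` helper:

```python
from concurrent.futures import ThreadPoolExecutor

import torch


def load_one(device_id: int):
    # Without this call every pool thread allocates on cuda:0, because new
    # threads do not inherit the main thread's current CUDA device.
    torch.cuda.set_device(device_id)
    return torch.empty(1024, 1024, device="cuda")  # lands on this thread's current device


if torch.cuda.is_available():
    device_id = 0  # in the patch this comes from local_mpi_rank()
    with ThreadPoolExecutor(max_workers=4) as pool:
        futures = [pool.submit(load_one, device_id) for _ in range(8)]
        tensors = [f.result() for f in futures]
    print(tensors[0].device)  # cuda:<device_id>
```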
@@ -931,7 +934,7 @@ def load_single_module(name, module):
                     p.data.copy_(module_weights[n][:])

     if os.environ.get("TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL",
-                      "True") in ["True", "true", "1", "yes", "y"]:
+                      "False") in ["True", "true", "1", "yes", "y"]:
         for name, module in tqdm(list(
                 model.named_modules(remove_duplicate=False)),
                              desc="Loading weights"):

@@ -977,8 +980,10 @@ def _load_weights_impl_v2(model: Union[nn.Module, DecoderModelForCausalLM],
     if params_map is not None:
         weights = weight_mapper.rename_by_params_map(params_map, weights)
         logger.info(f"Renamed weights with params_map: {params_map}")
+    device_id = local_mpi_rank()

     def load_single_module(name, module):
+        torch.cuda.set_device(device_id)
         if len(module._parameters) > 0:
             if weight_mapper.should_skip_module(name):
                 return

@@ -1034,7 +1039,7 @@ def load_single_module(name, module):
                              allow_partial_loading=allow_partial_loading)

     if os.environ.get("TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL",
-                      "True") in ["True", "true", "1", "yes", "y"]:
+                      "False") in ["True", "true", "1", "yes", "y"]:
         for name, module in tqdm(list(
                 model.named_modules(remove_duplicate=False)),
                              desc="Loading weights"):

tensorrt_llm/_utils.py

Lines changed: 9 additions & 1 deletion
@@ -561,7 +561,15 @@ def mpi_world_size():


 def local_mpi_rank():
-    return local_comm.Get_rank() if ENABLE_MULTI_DEVICE else 0
+    if mpi_disabled():
+        # For Ray/non-MPI: the device was already set during worker init
+        # torch.cuda.current_device() returns the correct local device ID
+        try:
+            return torch.cuda.current_device()
+        except ValueError:
+            return 0
+    return mpi_comm().Get_rank() % torch.cuda.device_count(
+    ) if ENABLE_MULTI_DEVICE else 0


 def local_mpi_size():
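With the `_utils.py` change, the MPI path derives the local device from the global rank modulo the number of visible GPUs instead of a node-local communicator rank, while the Ray/non-MPI path trusts the device already selected during worker init. A hedged illustration of the modulo mapping, using a hypothetical `local_device_from_global_rank` helper and assuming ranks are laid out contiguously per node:

```python
def local_device_from_global_rank(global_rank: int, gpus_per_node: int) -> int:
    # Same arithmetic as mpi_comm().Get_rank() % torch.cuda.device_count()
    return global_rank % gpus_per_node


# Two nodes with 4 GPUs each: ranks 0-3 -> GPUs 0-3 on node 0,
# ranks 4-7 -> GPUs 0-3 on node 1.
for rank in range(8):
    print(rank, "->", local_device_from_global_rank(rank, 4))
```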
