
Commit 0c224e4

nzmora-nvidia and Gal Agam authored
Change the all-reduce strategy to NCCL (#99)
* Change the all-reduce strategy to NCCL

  When the strategy is set to AUTO and world_size > 1 we experience hangs and CUDA memory errors.
  * This is the same issue as https://nvbugspro.nvidia.com/bug/5331013
  * Without this change, test_ad_build_small_multi.py fails (tp==2).
  * This is a temporary change until we understand why this hang is happening.
  * On dllcuster this issue does not manifest.

  Signed-off-by: Neta Zmora <[email protected]>

* Re-enable test_ad_build_small_multi.py
  (tests/unittest/_torch/auto_deploy/unit/multigpu/test_ad_build_small_multi.py)

  Signed-off-by: Neta Zmora <[email protected]>

* Fix the KV-cache memory-size computation (convert free memory from MB to bytes)

  Signed-off-by: Gal Agam <[email protected]>

---------

Signed-off-by: Neta Zmora <[email protected]>
Signed-off-by: Gal Agam <[email protected]>
Co-authored-by: Gal Agam <[email protected]>
1 parent a9e227e · commit 0c224e4

3 files changed (+3, −5 lines)

tensorrt_llm/_torch/auto_deploy/distributed/trtllm.py

Lines changed: 2 additions & 1 deletion
@@ -17,7 +17,8 @@ def trtllm_allreduce(tensor, op, all_reduce_params=None):
     rank, world_size = get_rank_world_size()
     assert op == ReduceOp.SUM, "TRT-LLM all reduce only supports SUM op."
     p_config = Mapping(world_size=world_size, tp_size=world_size, rank=rank)
-    torch_op = AllReduce(mapping=p_config, strategy=AllReduceStrategy.AUTO)
+    # Use Strategy.NCCL until https://nvbugspro.nvidia.com/bug/5331013 is fixed, then change to Strategy.AUTO
+    torch_op = AllReduce(mapping=p_config, strategy=AllReduceStrategy.NCCL)
     return torch_op(tensor, all_reduce_params=all_reduce_params)


 @torch.library.custom_op(
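
For context, this is how the function reads after the change. Only the lines in the hunk are guaranteed by the diff; the names Mapping, AllReduce, AllReduceStrategy, ReduceOp, and get_rank_world_size are provided elsewhere in the module, and their import paths are not shown here.

    # Sketch of the post-change function; surrounding imports/helpers are assumed, not part of the diff.
    def trtllm_allreduce(tensor, op, all_reduce_params=None):
        rank, world_size = get_rank_world_size()
        assert op == ReduceOp.SUM, "TRT-LLM all reduce only supports SUM op."

        # Build a tensor-parallel mapping covering the whole world.
        p_config = Mapping(world_size=world_size, tp_size=world_size, rank=rank)
        # Use Strategy.NCCL until https://nvbugspro.nvidia.com/bug/5331013 is fixed, then change to Strategy.AUTO
        torch_op = AllReduce(mapping=p_config, strategy=AllReduceStrategy.NCCL)
        return torch_op(tensor, all_reduce_params=all_reduce_params)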

tensorrt_llm/_torch/auto_deploy/transformations/library/kvcache.py

Lines changed: 1 addition & 1 deletion
@@ -174,7 +174,7 @@ def _get_mem_info_in_mb():
     memory_for_forward_pass = free_mem_pre - free_mem_post
     ad_logger.info(f"Memory for forward pass (MB): {memory_for_forward_pass}")

-    new_cache_size = free_mem_post * free_mem_ratio + current_cache_size
+    new_cache_size = free_mem_post * 1024 * 1024 * free_mem_ratio + current_cache_size
     new_num_pages = int(new_cache_size // (current_cache_size // current_num_pages))

     # Need to sync all the GPUs
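
This one-line fix is a unit correction: _get_mem_info_in_mb() reports free memory in MB, while the cache size it is added to appears to be tracked in bytes, so the old formula under-sized the new cache by roughly a factor of a million. A worked example with made-up numbers (free_mem_ratio, cache size, and page count below are hypothetical, not taken from the commit):

    free_mem_post = 40_000               # free GPU memory after the forward pass, in MB
    free_mem_ratio = 0.8                 # hypothetical fraction of free memory given to the KV cache
    current_cache_size = 2 * 1024**3     # hypothetical existing cache: 2 GiB, in bytes
    current_num_pages = 1024             # hypothetical -> 2 MiB per page

    # Old (buggy): MB mixed with bytes, so the cache barely grows.
    old_cache_size = free_mem_post * free_mem_ratio + current_cache_size
    old_num_pages = int(old_cache_size // (current_cache_size // current_num_pages))   # -> 1024

    # Fixed: convert MB to bytes before adding.
    new_cache_size = free_mem_post * 1024 * 1024 * free_mem_ratio + current_cache_size
    new_num_pages = int(new_cache_size // (current_cache_size // current_num_pages))   # -> 17024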

tests/unittest/_torch/auto_deploy/unit/multigpu/test_ad_build_small_multi.py

Lines changed: 0 additions & 3 deletions
@@ -19,9 +19,6 @@
     ],
 )
 def test_build_ad(world_size: int, experiment_config: Dict):
-    if world_size > 1:
-        pytest.skip("https://nvbugspro.nvidia.com/bug/5331013")
-
     experiment_config["args"]["world_size"] = world_size
     experiment_config["args"]["runtime"] = "trtllm"  # Default runtime set to trtllm
     experiment_config = ExperimentConfig(**experiment_config)
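
With the unconditional skip removed, the world_size > 1 cases run again wherever enough GPUs are present. As an aside (not part of this commit), a device-count guard is a common way to keep such parametrized tests green on single-GPU machines; a minimal sketch:

    import pytest
    import torch

    def require_gpus(world_size: int) -> None:
        """Skip the calling test when fewer GPUs are available than requested."""
        available = torch.cuda.device_count()
        if available < world_size:
            pytest.skip(f"needs {world_size} GPUs, found {available}")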
