Skip to content

Commit e7cf5b2

Browse files
committed
formatting - integration test passes
1 parent b2534e5 commit e7cf5b2

File tree

6 files changed

+10
-11
lines changed

6 files changed

+10
-11
lines changed

cpp/tensorrt_llm/kernels/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -18,8 +18,8 @@
1818
file(GLOB_RECURSE SRC_CPP *.cpp)
1919
file(GLOB_RECURSE SRC_CU *.cu)
2020

21-
# Explicitly add newly added kernel files to ensure they're included
22-
# (GLOB only runs at configure time, not build time)
21+
# Explicitly add newly added kernel files to ensure they're included (GLOB only
22+
# runs at configure time, not build time)
2323
list(APPEND SRC_CU ${CMAKE_CURRENT_SOURCE_DIR}/helixAllToAll.cu)
2424
list(REMOVE_DUPLICATES SRC_CU)
2525

cpp/tensorrt_llm/kernels/helixAllToAll.cu

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -203,7 +203,8 @@ __device__ __forceinline__ uint64_t* getFifoBasePtr(HelixAllToAllParams const& p
203203
return mappedMemory + fifoOffset;
204204
}
205205

206-
__device__ __forceinline__ HelixFifoInfo* getSenderHelixFifoInfo(HelixAllToAllParams const& params, HelixPairInfo const& pairInfo)
206+
__device__ __forceinline__ HelixFifoInfo* getSenderHelixFifoInfo(
207+
HelixAllToAllParams const& params, HelixPairInfo const& pairInfo)
207208
{
208209
// SenderSideHelixFifoInfo is physically located at sender rank
209210
int mappedMemoryRank = pairInfo.senderRank;

tensorrt_llm/_mnnvl_utils.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -251,9 +251,7 @@ def open_mnnvl_memory(cls, mapping: Mapping, size: int):
251251

252252
for i, remote_handle_data in enumerate(all_handles_data):
253253
rank_ptr = (
254-
cls.current_start_address
255-
+ cls.current_rank_stride * i
256-
+ cls.current_mem_offset
254+
cls.current_start_address + cls.current_rank_stride * i + cls.current_mem_offset
257255
)
258256
if i == comm_rank:
259257
# Local memory mapping

tensorrt_llm/_torch/distributed/ops.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
import torch
88
from torch import nn
99

10+
from tensorrt_llm._mnnvl_utils import HelixCpMnnvlMemory, MnnvlMemory
1011
from tensorrt_llm._torch.distributed.symm_mem_allreduce import \
1112
SymmetricMemoryAllReduce
1213
from tensorrt_llm._utils import mpi_comm, mpi_disabled
@@ -15,7 +16,6 @@
1516
AllReduceStrategy, MoEAllReduceParams)
1617
from tensorrt_llm.logger import logger
1718
from tensorrt_llm.mapping import Mapping
18-
from tensorrt_llm._mnnvl_utils import HelixCpMnnvlMemory, MnnvlMemory
1919
from tensorrt_llm.plugin.plugin import CustomAllReduceHelper
2020

2121
_thread_local = threading.local()
@@ -442,7 +442,6 @@ def alltoall_native(self, field0: torch.Tensor, field1: torch.Tensor):
442442
return field0_out, field1_out
443443

444444

445-
446445
def reducescatter(
447446
input: Union[torch.Tensor, List[torch.Tensor]],
448447
mapping: Mapping,

tensorrt_llm/_torch/modules/attention.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,12 @@
11
import math
22
import os
33
import weakref
4-
from typing import Dict, Optional, Union, cast
4+
from typing import Optional, Union, cast
55

66
import torch
77
from torch import nn
88

99
import tensorrt_llm.quantization.utils.fp8_utils as fp8_utils
10-
from tensorrt_llm._mnnvl_utils import HelixCpMnnvlMemory, MnnvlMemory
1110
from tensorrt_llm._utils import (get_sm_version, is_sm_100f, nvtx_range,
1211
nvtx_range_debug)
1312
from tensorrt_llm.logger import logger

tests/unittest/_torch/modules/test_mla_helix.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -878,7 +878,9 @@ def test_mla_helix_distributed(
878878
for use_nccl in [False, True]:
879879
nccl_mode = "NCCL" if use_nccl else "FIFO"
880880
print(f"\n{'=' * 60}")
881-
print(f"Testing with TRTLLM_USE_NCCL_FOR_HELIX={'1' if use_nccl else '0'} ({nccl_mode} mode)")
881+
print(
882+
f"Testing with TRTLLM_USE_NCCL_FOR_HELIX={'1' if use_nccl else '0'} ({nccl_mode} mode)"
883+
)
882884
print(f"{'=' * 60}\n")
883885
for scenario in all_scenarios[:11]:
884886
timing_steps = 256

0 commit comments

Comments (0)