[None][chore] Support larger topK for NVLinkOneSided AlltoAll. (NVIDIA#9816)

bobboli · web-flow · commit 9d3c675a0bb5 · 2025-12-10T11:10:55.000+08:00
Signed-off-by: Bo Li &lt;22713281+bobboli@users.noreply.github.com&gt;
diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu
@@ -45,6 +45,18 @@ namespace tensorrt_llm::kernels::moe_comm
 #define SWITCH_TOP_K(top_k, TOP_K, ...)                                                                                \
     switch (top_k)                                                                                                     \
     {                                                                                                                  \
+    case 16:                                                                                                           \
+    {                                                                                                                  \
+        constexpr int TOP_K = 16;                                                                                      \
+        __VA_ARGS__;                                                                                                   \
+        break;                                                                                                         \
+    }                                                                                                                  \
+    case 10:                                                                                                           \
+    {                                                                                                                  \
+        constexpr int TOP_K = 10;                                                                                      \
+        __VA_ARGS__;                                                                                                   \
+        break;                                                                                                         \
+    }                                                                                                                  \
     case 8:                                                                                                            \
     {                                                                                                                  \
         constexpr int TOP_K = 8;                                                                                       \
@@ -611,6 +623,90 @@ __device__ void vectorized_combine_impl(
             // Load directly into the per-k accumulator; reduce across k below
             acc[k].load(recv_buffer + base_token + offset);
         }
+        if constexpr (TOP_K == 16)
+        {
+            T* a0 = reinterpret_cast<T*>(&acc[0]);
+            T* a1 = reinterpret_cast<T*>(&acc[1]);
+            T* a2 = reinterpret_cast<T*>(&acc[2]);
+            T* a3 = reinterpret_cast<T*>(&acc[3]);
+            T* a4 = reinterpret_cast<T*>(&acc[4]);
+            T* a5 = reinterpret_cast<T*>(&acc[5]);
+            T* a6 = reinterpret_cast<T*>(&acc[6]);
+            T* a7 = reinterpret_cast<T*>(&acc[7]);
+            T* a8 = reinterpret_cast<T*>(&acc[8]);
+            T* a9 = reinterpret_cast<T*>(&acc[9]);
+            T* a10 = reinterpret_cast<T*>(&acc[10]);
+            T* a11 = reinterpret_cast<T*>(&acc[11]);
+            T* a12 = reinterpret_cast<T*>(&acc[12]);
+            T* a13 = reinterpret_cast<T*>(&acc[13]);
+            T* a14 = reinterpret_cast<T*>(&acc[14]);
+            T* a15 = reinterpret_cast<T*>(&acc[15]);
+#pragma unroll
+            for (int j = 0; j < elems_per_vec; ++j)
+            {
+                a0[j] += a1[j];
+                a2[j] += a3[j];
+                a4[j] += a5[j];
+                a6[j] += a7[j];
+                a8[j] += a9[j];
+                a10[j] += a11[j];
+                a12[j] += a13[j];
+                a14[j] += a15[j];
+            }
+#pragma unroll
+            for (int j = 0; j < elems_per_vec; ++j)
+            {
+                a0[j] += a2[j];
+                a4[j] += a6[j];
+                a8[j] += a10[j];
+                a12[j] += a14[j];
+            }
+#pragma unroll
+            for (int j = 0; j < elems_per_vec; ++j)
+            {
+                a0[j] += a4[j];
+                a8[j] += a12[j];
+            }
+#pragma unroll
+            for (int j = 0; j < elems_per_vec; ++j)
+            {
+                a0[j] += a8[j];
+            }
+        }
+        else if constexpr (TOP_K == 10)
+        {
+            T* a0 = reinterpret_cast<T*>(&acc[0]);
+            T* a1 = reinterpret_cast<T*>(&acc[1]);
+            T* a2 = reinterpret_cast<T*>(&acc[2]);
+            T* a3 = reinterpret_cast<T*>(&acc[3]);
+            T* a4 = reinterpret_cast<T*>(&acc[4]);
+            T* a5 = reinterpret_cast<T*>(&acc[5]);
+            T* a6 = reinterpret_cast<T*>(&acc[6]);
+            T* a7 = reinterpret_cast<T*>(&acc[7]);
+            T* a8 = reinterpret_cast<T*>(&acc[8]);
+            T* a9 = reinterpret_cast<T*>(&acc[9]);
+#pragma unroll
+            for (int j = 0; j < elems_per_vec; ++j)
+            {
+                a0[j] += a1[j];
+                a2[j] += a3[j];
+                a4[j] += a5[j];
+                a6[j] += a7[j];
+                a8[j] += a9[j];
+            }
+#pragma unroll
+            for (int j = 0; j < elems_per_vec; ++j)
+            {
+                a0[j] += a2[j];
+                a4[j] += a6[j];
+            }
+#pragma unroll
+            for (int j = 0; j < elems_per_vec; ++j)
+            {
+                a0[j] += a4[j];
+                a0[j] += a8[j];
+            }
+        }
 
         // Reduce acc[TOP_K] into acc[0]
         if constexpr (TOP_K == 8)
@@ -643,6 +739,28 @@ __device__ void vectorized_combine_impl(
                 a0[j] += a4[j];
             }
         }
+        else if constexpr (TOP_K == 6)
+        {
+            T* a0 = reinterpret_cast<T*>(&acc[0]);
+            T* a1 = reinterpret_cast<T*>(&acc[1]);
+            T* a2 = reinterpret_cast<T*>(&acc[2]);
+            T* a3 = reinterpret_cast<T*>(&acc[3]);
+            T* a4 = reinterpret_cast<T*>(&acc[4]);
+            T* a5 = reinterpret_cast<T*>(&acc[5]);
+#pragma unroll
+            for (int j = 0; j < elems_per_vec; ++j)
+            {
+                a0[j] += a1[j];
+                a2[j] += a3[j];
+                a4[j] += a5[j];
+            }
+#pragma unroll
+            for (int j = 0; j < elems_per_vec; ++j)
+            {
+                a0[j] += a2[j];
+                a0[j] += a4[j];
+            }
+        }
         else if constexpr (TOP_K == 4)
         {
             T* a0 = reinterpret_cast<T*>(&acc[0]);
diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h
@@ -23,10 +23,9 @@ namespace tensorrt_llm::kernels::moe_comm
 {
 
 // Configuration constants
-static constexpr int kMaxExperts = 256; // Maximum number of experts per rank
-static constexpr int kMaxTopK = 8;      // Maximum top-k experts per token
-static constexpr int kMaxPayloads = 8;  // Maximum number of different payload types
-static constexpr int kMaxRanks = 64;    // Maximum supported EP size
+static constexpr int kMaxTopK = 16;    // Maximum top-k experts per token
+static constexpr int kMaxPayloads = 4; // Maximum number of different payload types
+static constexpr int kMaxRanks = 64;   // Maximum supported EP size
 
 // Describes a single payload type to be communicated
 struct PayloadDescriptor