Skip to content

Commit f3322f1

Browse files
committed
add pdl support for more kernels
Signed-off-by: Zhenhuan Chen <zhenhuanc@nvidia.com>
1 parent 2e5850c commit f3322f1

File tree

7 files changed

+100
-18
lines changed

7 files changed

+100
-18
lines changed

cpp/tensorrt_llm/common/envUtils.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@
1616
*/
1717

1818
#pragma once
19+
#include "tensorrt_llm/common/cudaUtils.h"
1920
#include <cstdint>
21+
#include <cuda_runtime.h>
2022
#include <optional>
2123
#include <string>
2224

@@ -55,6 +57,26 @@ int getEnvMmhaKernelBlockSize();
5557
// Whether PDL is enabled.
5658
bool getEnvEnablePDL();
5759

60+
// Launch a kernel through cudaLaunchKernelEx, attaching the programmatic
// stream serialization (PDL) launch attribute when it is enabled via the
// environment (see getEnvEnablePDL()).
//
// name           : human-readable kernel name, used only for debug logging.
// kernelFn       : kernel function pointer to launch.
// grid, block    : launch dimensions.
// dynamicShmSize : dynamic shared memory size in bytes for the launch.
// stream         : CUDA stream to launch on.
// args           : kernel arguments, perfectly forwarded to the launch.
//
// NOTE(review): kernels launched with the PDL attribute enabled are expected
// to call cudaGridDependencySynchronize() before consuming upstream results
// (the callers in this commit guard that with __CUDA_ARCH__ >= 900).
template <typename KernelFn, typename... Args>
inline void launchWithPdlWhenEnabled(char const* name, KernelFn kernelFn, dim3 grid, dim3 block, size_t dynamicShmSize,
    cudaStream_t stream, Args&&... args)
{
    // Read the env switch once; it gates both the log line and the attribute.
    bool const pdlEnabled = tensorrt_llm::common::getEnvEnablePDL();
    if (pdlEnabled)
    {
        // Bug fix: the previous code logged "Enable PDL in %s" unconditionally,
        // even when PDL was disabled, which was misleading when debugging.
        TLLM_LOG_DEBUG("Enable PDL in %s", name);
    }

    // Value-initialize so any config fields not explicitly set below are zero
    // rather than indeterminate.
    cudaLaunchConfig_t kernelConfig{};
    kernelConfig.gridDim = grid;
    kernelConfig.blockDim = block;
    kernelConfig.dynamicSmemBytes = dynamicShmSize;
    kernelConfig.stream = stream;

    cudaLaunchAttribute attrs[1];
    attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
    attrs[0].val.programmaticStreamSerializationAllowed = pdlEnabled;
    kernelConfig.attrs = attrs;
    kernelConfig.numAttrs = 1;

    TLLM_CUDA_CHECK(cudaLaunchKernelEx(&kernelConfig, kernelFn, std::forward<Args>(args)...));
}
79+
5880
bool getEnvUseUCXKvCache();
5981

6082
bool getEnvUseMPIKvCache();

cpp/tensorrt_llm/kernels/fusedMoeCommKernels.cu

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ namespace tensorrt_llm
2727
namespace kernels
2828
{
2929

30+
using tensorrt_llm::common::launchWithPdlWhenEnabled;
31+
3032
// Quantize a contiguous shared-memory buffer containing elements of DType into NVFP4 with per-16-element FP8 scales.
3133
// Output layout (repeated per 16-element group per lane), followed by one global scale float:
3234
// [WARP_SIZE * 8 bytes packed e2m1 values] [WARP_SIZE * 1 byte E4M3 per-group scales] ... [global_scale (4 bytes)]
@@ -1069,6 +1071,9 @@ public:
10691071

10701072
int sendIndex = mPairInfo.channel;
10711073
uint32_t phaseParity = 0;
1074+
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
1075+
cudaGridDependencySynchronize();
1076+
#endif
10721077
for (; sendIndex < tokenCount; sendIndex += mPairInfo.runChannelCount)
10731078
{
10741079
int tokenIndex = sendIndexMapping == nullptr ? sendIndex : sendIndexMapping[sendIndex];
@@ -1140,6 +1145,9 @@ public:
11401145
int recvIndex = mPairInfo.channel;
11411146
uint32_t phaseParity = 0;
11421147
bool needRelease = false;
1148+
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
1149+
cudaGridDependencySynchronize();
1150+
#endif
11431151
for (; recvIndex < tokenCount; recvIndex += mPairInfo.runChannelCount)
11441152
{
11451153
int tokenIndex = recvIndexMapping == nullptr ? recvIndex : recvIndexMapping[recvIndex];
@@ -1459,7 +1467,8 @@ void moeAllToAll(FusedMoeCommKernelParam params, FusedMoeWorkspace workspace, cu
14591467

14601468
dim3 block = FusedMoeCommunicator::getLaunchBlockDim(groupCountPerCta);
14611469
dim3 grid = FusedMoeCommunicator::getLaunchGridDim(params.worldInfo.epInfo.epSize, groupCountPerCta);
1462-
kernelFn<<<grid, block, totalDynamicShmSize, stream>>>(params, workspace, hasBasicFields);
1470+
launchWithPdlWhenEnabled(
1471+
"moeAllToAll", kernelFn, grid, block, totalDynamicShmSize, stream, params, workspace, hasBasicFields);
14631472
TLLM_CUDA_CHECK(cudaGetLastError());
14641473
}
14651474

cpp/tensorrt_llm/kernels/fusedMoeCommKernels.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include <cuda_runtime_api.h>
2121

2222
#include "tensorrt_llm/common/cudaUtils.h"
23+
#include "tensorrt_llm/common/envUtils.h"
2324
#include "tensorrt_llm/kernels/moeCommKernelsCommon.h"
2425

2526
namespace tensorrt_llm

cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceKernels.cu

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <cub/cub.cuh>
2020

2121
#include "tensorrt_llm/common/cudaUtils.h"
22+
#include "tensorrt_llm/common/envUtils.h"
2223
#include "tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceKernels.h"
2324

2425
namespace cg = cooperative_groups;
@@ -28,6 +29,8 @@ namespace tensorrt_llm
2829
namespace kernels
2930
{
3031

32+
using tensorrt_llm::common::launchWithPdlWhenEnabled;
33+
3134
int getOwnerDevice(unsigned long long int stepAndOwner)
3235
{
3336
return static_cast<int>(stepAndOwner & MoeLoadBalanceSingleLayerSignal::kDevice);
@@ -138,6 +141,9 @@ __global__ void zeroExpertTokenCountKernel(MoeLoadBalanceMetaInfo metaInfo, int*
138141
TYPE oldExpertTokenCount = {0};
139142
int* expertTokenCountPtr = expertTokenCount + metaInfo.expertCount * blockIdx.x;
140143
TYPE* typedExpertTokenCountPtr = reinterpret_cast<TYPE*>(expertTokenCountPtr);
144+
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
145+
cudaGridDependencySynchronize();
146+
#endif
141147
typedExpertTokenCountPtr[threadIdx.x] = oldExpertTokenCount;
142148
}
143149

@@ -177,6 +183,9 @@ __global__ void statisticKernel(MoeLoadBalanceMetaInfo metaInfo, int* expertToke
177183
sharedExpertCount[i] = 0;
178184
}
179185
__syncthreads();
186+
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
187+
cudaGridDependencySynchronize();
188+
#endif
180189
for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < totalEltCount; idx += gridDim.x * blockDim.x)
181190
{
182191
int expertId = gatheredRawExpertIds[idx];
@@ -282,11 +291,10 @@ void moeHierarchicalStatisticLocalDevice(MoeLoadBalanceMetaInfo metaInfo, int nu
282291
}
283292
dim3 gridDim(1);
284293
dim3 blockDim(threadCount);
285-
void* args[]
286-
= {&metaInfo, static_cast<void*>(const_cast<int**>(&enabled)), static_cast<void*>(&localExpertTokenCount)};
287294
TLLM_CHECK_WITH_INFO(
288295
threadCount <= 1024, "expertCount=%d is too large and not supported now.", metaInfo.expertCount);
289-
TLLM_CUDA_CHECK(cudaLaunchKernel(kernelFunc, gridDim, blockDim, &args[0], 0, stream));
296+
launchWithPdlWhenEnabled(
297+
"zeroExpertTokenCount", kernelFunc, gridDim, blockDim, 0, stream, metaInfo, enabled, localExpertTokenCount);
290298
}
291299

292300
{
@@ -299,7 +307,7 @@ void moeHierarchicalStatisticLocalDevice(MoeLoadBalanceMetaInfo metaInfo, int nu
299307
blockCount = smCount;
300308
}
301309
int sharedMemorySize = metaInfo.expertCount * sizeof(int);
302-
statisticKernel<<<blockCount, threadCount, sharedMemorySize, stream>>>(
310+
launchWithPdlWhenEnabled("statisticKernel", statisticKernel, blockCount, threadCount, sharedMemorySize, stream,
303311
metaInfo, localExpertTokenCount, totalEltCount, enabled, localRawExpertIds);
304312
}
305313
}
@@ -327,6 +335,10 @@ __global__ void moeComputeRouteNoRedundantKernel(MoeLoadBalanceMetaInfo metaInfo
327335

328336
int blockOffset = blockIdx.x * THREAD_COUNT * ITEM_PER_THREAD;
329337

338+
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
339+
cudaGridDependencySynchronize();
340+
#endif
341+
330342
for (; blockOffset < tokenCount * metaInfo.topK; blockOffset += gridDim.x * THREAD_COUNT * ITEM_PER_THREAD)
331343
{
332344
int tokenIdxBase = blockOffset + threadIdx.x;
@@ -501,6 +513,10 @@ __global__ void moeComputeRouteSortKernel(MoeLoadBalanceMetaInfo metaInfo, MoePl
501513

502514
int expertIds[ITEM_PER_THREAD];
503515

516+
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
517+
cudaGridDependencySynchronize();
518+
#endif
519+
504520
for (int blockOffset = blockIdx.x * THREAD_COUNT * ITEM_PER_THREAD; blockOffset < tokenCount * metaInfo.topK;
505521
blockOffset += gridDim.x * THREAD_COUNT * ITEM_PER_THREAD)
506522
{
@@ -586,14 +602,15 @@ void moeComputeRouteDevice(MoeLoadBalanceMetaInfo metaInfo, MoePlacementInfo pla
586602
int dynamicShmSize = sizeof(int16_t) * metaInfo.epSize * metaInfo.slotCountPerRank;
587603
if (metaInfo.expertCount == metaInfo.epSize * metaInfo.slotCountPerRank)
588604
{
605+
auto* kernelFn = moeComputeRouteNoRedundantKernel<1024, kThreadCount, kEltPerThread>;
589606
// no redundant expert, so we don't need complex routing, but just assign to the correct solt.
590-
moeComputeRouteNoRedundantKernel<1024, kThreadCount, kEltPerThread>
591-
<<<blockCount, kThreadCount, dynamicShmSize, stream>>>(
592-
metaInfo, placementInfo, tokenSelectedExperts, tokenRoutedSlotIds, tokenCount);
607+
launchWithPdlWhenEnabled("moeComputeRouteNoRedundant", kernelFn, blockCount, kThreadCount, dynamicShmSize,
608+
stream, metaInfo, placementInfo, tokenSelectedExperts, tokenRoutedSlotIds, tokenCount);
593609
}
594610
else
595611
{
596-
moeComputeRouteKernel<1024, kThreadCount, kEltPerThread><<<blockCount, kThreadCount, dynamicShmSize, stream>>>(
612+
auto* kernelFn = moeComputeRouteKernel<1024, kThreadCount, kEltPerThread>;
613+
launchWithPdlWhenEnabled("moeComputeRoute", kernelFn, blockCount, kThreadCount, dynamicShmSize, stream,
597614
metaInfo, placementInfo, tokenSelectedExperts, tokenRoutedSlotIds, tokenCount, offsetByEpRank);
598615
}
599616
}

cpp/tensorrt_llm/kernels/moePrepareKernels.cu

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ namespace tensorrt_llm::kernels
3030
namespace moe_prepare
3131
{
3232

33+
using tensorrt_llm::common::launchWithPdlWhenEnabled;
34+
3335
__device__ __forceinline__ void st_release_sys_global(uint64_t volatile* ptr, uint64_t val)
3436
{
3537
asm volatile("st.release.sys.global.u64 [%0], %1;" ::"l"(ptr), "l"(val) : "memory");
@@ -110,6 +112,10 @@ __device__ __forceinline__ void computeCountAndSendStatics(int* experts, int tok
110112
int* localSendIndice = sendIndiceWorkspace + targetRankId * maxTokenCountPerRank;
111113
int* localBackwardIndice = backwardIndiceWorkspace + targetRankId * maxTokenCountPerRank;
112114

115+
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
116+
cudaGridDependencySynchronize();
117+
#endif
118+
113119
for (int i = tileId; i < readRankTokenCount; i += tileCountPerBlock)
114120
{
115121
int expertRankId = laneInTile < topK ? experts[i * topK + laneInTile] / expertCountPerRank : epSize;
@@ -163,6 +169,11 @@ __device__ __forceinline__ void recvCountAndStatics(int* recvIndiceWorkspace, in
163169

164170
CounterCommunicator counter(workspace.getFifoConnInfo(false, rankId, targetRankId, 0, rankCount, 1));
165171
int communicationCount = gatheredExpertStatics == nullptr ? 1 : expertCount + 1;
172+
173+
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
174+
cudaGridDependencySynchronize();
175+
#endif
176+
166177
for (int i = rankTile.thread_rank(); i < communicationCount; i += THREADS_PER_PIPELINE)
167178
{
168179
int recvValue = counter.acquireValue(i);
@@ -218,6 +229,9 @@ __global__ void moveIndiceDevice(int* sendCountsCumsum, int* recvCountsCumsum, i
218229
int count = endIndex - startIndex;
219230
int* localSendIndice = sendIndice + targetRankId * maxTokenCountPerRank;
220231
int* localBackwardIndice = backwardIndice + targetRankId * maxTokenCountPerRank;
232+
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
233+
cudaGridDependencySynchronize();
234+
#endif
221235
for (int localIdx = threadIdx.x; localIdx < count; localIdx += blockDim.x)
222236
{
223237
gatherSendIndice[startIndex + localIdx] = localSendIndice[localIdx];
@@ -230,6 +244,9 @@ __global__ void moveIndiceDevice(int* sendCountsCumsum, int* recvCountsCumsum, i
230244
int startIndex = targetRankId == 0 ? 0 : recvCountsCumsum[targetRankId - 1];
231245
int endIndex = recvCountsCumsum[targetRankId];
232246
int count = endIndex - startIndex;
247+
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
248+
cudaGridDependencySynchronize();
249+
#endif
233250
for (int localIdx = threadIdx.x; localIdx < count; localIdx += blockDim.x)
234251
{
235252
gatherRecvIndice[startIndex + localIdx] = startIndex + localIdx;
@@ -249,6 +266,10 @@ __global__ void computeCumsumDevice(int* sendCountsCumsum, int* recvCountsCumsum
249266
int threadData = tid < rankCount ? inputOutputPtr[tid] : 0;
250267
__syncthreads();
251268

269+
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
270+
cudaGridDependencySynchronize();
271+
#endif
272+
252273
BlockScan(temp_storage).InclusiveSum(threadData, threadData);
253274
if (tid < rankCount)
254275
{
@@ -261,6 +282,9 @@ __global__ void memsetExpertIdsDevice(
261282
{
262283
int maxTokenCount = maxTokenCountPerRank * rankCount;
263284
int totalRecvTokenCount = *(recvCountsCumsum + rankCount - 1);
285+
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
286+
cudaGridDependencySynchronize();
287+
#endif
264288
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i + totalRecvTokenCount * topK < maxTokenCount * topK;
265289
i += gridDim.x * blockDim.x)
266290
{
@@ -300,17 +324,20 @@ void computeCountAndIndice(int* experts, int* sendCounts, int* recvCounts, int*
300324
{
301325
kernelFn = computeCountAndIndiceDevice<2>;
302326
}
303-
kernelFn<<<grid, block, 0, stream>>>(experts, sendCounts, recvCounts, sendIndiceWorkspace, backwardIndiceWorkspace,
304-
recvIndiceWorkspace, expertStatics, gatheredExpertStatics, workspace, tokenCount, maxTokenCountPerRank, topK,
305-
slotCount, expertCount, rankId, rankCount);
327+
328+
launchWithPdlWhenEnabled("computeCountAndIndice", kernelFn, grid, block, 0, stream, experts, sendCounts, recvCounts,
329+
sendIndiceWorkspace, backwardIndiceWorkspace, recvIndiceWorkspace, expertStatics, gatheredExpertStatics,
330+
workspace, tokenCount, maxTokenCountPerRank, topK, slotCount, expertCount, rankId, rankCount);
306331
}
307332

308333
void computeCumsum(int* sendCountsCumsum, int* recvCountsCumsum, int rankId, int rankCount, cudaStream_t stream)
309334
{
310335
int block_size = CUMSUM_THREADS_PER_BLOCK;
311336
dim3 block(block_size);
312337
dim3 grid(2);
313-
computeCumsumDevice<<<grid, block, 0, stream>>>(sendCountsCumsum, recvCountsCumsum, rankId, rankCount);
338+
339+
launchWithPdlWhenEnabled("computeCumsum", computeCumsumDevice, grid, block, 0, stream, sendCountsCumsum,
340+
recvCountsCumsum, rankId, rankCount);
314341
}
315342

316343
void moveIndice(int* sendCountsCumsum, int* recvCountsCumsum, int* sendIndice, int* gatherSendIndice,
@@ -319,17 +346,22 @@ void moveIndice(int* sendCountsCumsum, int* recvCountsCumsum, int* sendIndice, i
319346
{
320347
dim3 block(512);
321348
dim3 grid(rankCount, 2);
322-
moveIndiceDevice<<<grid, block, 0, stream>>>(sendCountsCumsum, recvCountsCumsum, sendIndice, gatherSendIndice,
323-
backwardIndice, gatherBackwardIndice, recvIndice, gatherRecvIndice, maxTokenCountPerRank);
349+
350+
launchWithPdlWhenEnabled("moveIndice", moveIndiceDevice, grid, block, 0, stream, sendCountsCumsum, recvCountsCumsum,
351+
sendIndice, gatherSendIndice, backwardIndice, gatherBackwardIndice, recvIndice, gatherRecvIndice,
352+
maxTokenCountPerRank);
324353
}
325354

326355
void memsetExpertIds(int* expertIds, int* recvCountsCumsum, int maxTokenCountPerRank, int topK, int slotCount,
327356
int rankCount, cudaStream_t stream)
328357
{
329358
int smCount = tensorrt_llm::common::getMultiProcessorCount();
330359
int block_size = 256;
331-
memsetExpertIdsDevice<<<smCount, block_size, 0, stream>>>(
332-
expertIds, recvCountsCumsum, maxTokenCountPerRank, topK, slotCount, rankCount);
360+
dim3 block(block_size);
361+
dim3 grid(smCount);
362+
363+
launchWithPdlWhenEnabled("memsetExpertIds", memsetExpertIdsDevice, grid, block, 0, stream, expertIds,
364+
recvCountsCumsum, maxTokenCountPerRank, topK, slotCount, rankCount);
333365
}
334366

335367
size_t getMoePrepareWorkspaceSize(int epSize)

cpp/tensorrt_llm/kernels/moePrepareKernels.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <map>
2020

2121
#include "tensorrt_llm/common/cudaUtils.h"
22+
#include "tensorrt_llm/common/envUtils.h"
2223

2324
#define DEBUG_PIPELINE 0
2425

cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,11 @@
1818

1919
#include "KernelRunner.h"
2020
#include "tensorrt_llm/common/assert.h"
21-
#include "tensorrt_llm/common/envUtils.h"
2221
#include "trtllmGen_bmm_export/BatchedGemmInterface.h"
2322
#include "trtllmGen_bmm_export/trtllm/gen/DtypeDecl.h"
2423
// DO NOT include cudaUtils.h and logger.h before BatchedGemmInterface.h as it #undef TLLM_LOG_INFO and co.
2524
#include "tensorrt_llm/common/cudaUtils.h"
25+
#include "tensorrt_llm/common/envUtils.h"
2626
#include "tensorrt_llm/common/logger.h"
2727

2828
namespace tensorrt_llm

0 commit comments

Comments
 (0)