@@ -327,6 +327,7 @@ __global__ void routingMainKernel(KernelParams params)
     // note that with invalid values, because sigmoid is < 1 and bias is -1,
     // we must get a negative value, which is smaller than any valid value
     auto scoreBias = float{scoreSigmoid + float{biasVal}};
+
     if (expertSelected)
     {
         smemScoreBias[threadExpert] = scoreBias;
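For reference, the invariant the comment in this hunk relies on is simple arithmetic: the sigmoid output lies in (0, 1), so pairing an invalid entry with a bias of -1 always yields a negative score, which the comment states is below any valid score. A minimal sketch of that sentinel arithmetic (the helper name and the `validBias` parameter are illustrative, not taken from the diff):

```cuda
// Sketch only: sigmoid(x) is in (0, 1); a bias of -1.f marks an invalid entry,
// so the combined score is negative and loses against any valid score.
__device__ float scoreWithSentinelBias(float logit, bool valid, float validBias)
{
    float const scoreSigmoid = 1.f / (1.f + expf(-logit)); // in (0, 1)
    float const biasVal = valid ? validBias : -1.f;        // -1.f == invalid sentinel
    return scoreSigmoid + biasVal;
}
```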
@@ -859,7 +860,6 @@ __global__ void routingIndicesCoopKernel(KernelParams params)
 // inefficient if we have one CTA per token doing a single global atomic.

 template <typename KernelParams>
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
 __global__ void __launch_bounds__(NumThreads) routingIndicesHistogramKernel(KernelParams params)
 {
     // number of experts is bounded by number of threads
@@ -872,12 +872,14 @@ __global__ void __launch_bounds__(NumThreads) routingIndicesHistogramKernel(Kern
     smemExpertCount[threadIdx.x] = 0;
     __syncthreads();

+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
     // Wait on primary grid and trigger secondary kernel.
     if constexpr (KernelParams::UsePdl)
     {
         cudaGridDependencySynchronize();
         cudaTriggerProgrammaticLaunchCompletion();
     }
+#endif

     int32_t const expandedIdxSize = params.mNumTokens * params.mTopK;
     int32_t const localExpertExtent = params.mNumLocalExperts << params.mLocalExpertsStrideLog2;
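The pattern in this hunk, repeated throughout the diff, is to compile the kernel body on every architecture and restrict only the programmatic-dependent-launch (PDL) intrinsics to SM90+, instead of guarding the whole kernel and keeping an assert-only stub. A condensed sketch of the resulting shape (the kernel name is illustrative; `NumThreads`, `KernelParams::UsePdl`, and the intrinsics come from the surrounding code):

```cuda
// Condensed sketch of the guard placement after this change: only the PDL
// intrinsics are SM90-specific, so only they sit behind __CUDA_ARCH__; the
// rest of the kernel builds everywhere.
template <typename KernelParams>
__global__ void __launch_bounds__(NumThreads) exampleHistogramShapedKernel(KernelParams params)
{
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
    if constexpr (KernelParams::UsePdl)
    {
        cudaGridDependencySynchronize();           // wait on the primary grid
        cudaTriggerProgrammaticLaunchCompletion(); // allow the secondary kernel to start
    }
#endif
    // ... architecture-independent histogram work ...
}
```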
@@ -932,17 +934,10 @@ __global__ void __launch_bounds__(NumThreads) routingIndicesHistogramKernel(Kern
     int32_t const localExpertCount = smemExpertCount[threadIdx.x];
     atomicAdd(&params.mPtrExpertCounts[threadIdx.x], localExpertCount);
 }
-#else
-__global__ void routingIndicesHistogramKernel(KernelParams params)
-{
-    assert(false && "routingIndicesHistogramKernel is only supported on SM90+ architectures");
-}
-#endif

 ////////////////////////////////////////////////////////////////////////////////////////////////////

 template <typename KernelParams>
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
 __global__ void __launch_bounds__(NumThreads) routingIndicesOffsetsKernel(KernelParams params)
 {
     // number of experts is bounded by number of threads
@@ -960,11 +955,13 @@ __global__ void __launch_bounds__(NumThreads) routingIndicesOffsetsKernel(Kernel
     int32_t const expandedIdxSize = params.mNumTokens * params.mTopK;
     int32_t const numTiles = (expandedIdxSize + MaxExpandedIdxPerBlock - 1) / (MaxExpandedIdxPerBlock);

+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
     // Wait on primary grid.
     if constexpr (KernelParams::UsePdl)
     {
         cudaGridDependencySynchronize();
     }
+#endif

     // The expert offsets are common to all tiles of all blocks.
     // Load the histogram, scan it and write offsets to shared memory.
@@ -1163,17 +1160,13 @@ __global__ void __launch_bounds__(NumThreads) routingIndicesOffsetsKernel(Kernel
     // Trigger secondary kernel.
     // Note: this does not guarantee the visibility of prior writes unless the consumer executes a
     // dependency sync.
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
     if constexpr (KernelParams::UsePdl)
     {
         cudaTriggerProgrammaticLaunchCompletion();
     }
-}
-#else
-__global__ void routingIndicesOffsetsKernel(KernelParams params)
-{
-    assert(false && "routingIndicesOffsetsKernel is only supported on SM90+ architectures");
-}
 #endif
+}

 ////////////////////////////////////////////////////////////////////////////////////////////////////

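The visibility note in the hunk above is worth spelling out: `cudaTriggerProgrammaticLaunchCompletion()` only lets the dependent grid start early; ordering of the producer's memory writes is established when the consumer calls `cudaGridDependencySynchronize()`. A sketch of the consumer side under that assumption (hypothetical kernel; the guard mirrors the ones introduced in this diff):

```cuda
// Sketch of the consumer-side obligation mentioned in the note above: a kernel
// launched via PDL must execute a dependency sync before reading anything the
// primary grid wrote, e.g. the expert counts/offsets produced here.
template <typename KernelParams>
__global__ void exampleConsumerKernel(KernelParams params)
{
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
    if constexpr (KernelParams::UsePdl)
    {
        cudaGridDependencySynchronize(); // makes the producer's prior writes visible
    }
#endif
    // ... from here on it is safe to read the routing offsets ...
}
```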
@@ -1577,7 +1570,6 @@ __host__ __device__ constexpr void setBits(int32_t& value, int32_t newBits, int
 ////////////////////////////////////////////////////////////////////////////////////////////////////

 template <typename KernelParams>
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
 __global__ void __launch_bounds__(WarpSize) routingIndicesWarpKernel(KernelParams params)
 {
     // types used in this kernel
@@ -1614,11 +1606,13 @@ __global__ void __launch_bounds__(WarpSize) routingIndicesWarpKernel(KernelParam
     }
     __syncwarp();

+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
     // then wait on primary grid
     if constexpr (KernelParams::UsePdl)
     {
         cudaGridDependencySynchronize();
     }
+#endif

     if (params.mPtrScores != nullptr)
     {
@@ -1744,12 +1738,14 @@ __global__ void __launch_bounds__(WarpSize) routingIndicesWarpKernel(KernelParam
         params.mPtrNumNonExitingCtas[0] = numNonExitingCtas;
     }

+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
 #if !defined(PDL_PROFILE) || PDL_PROFILE == 0
     // we can trigger the next kernel at this point
     if constexpr (KernelParams::UsePdl)
     {
         cudaTriggerProgrammaticLaunchCompletion();
     }
+#endif
 #endif

     // at this point, all values for offsets are ready, except the final offsets
@@ -1806,13 +1802,6 @@ __global__ void __launch_bounds__(WarpSize) routingIndicesWarpKernel(KernelParam
         }
     }
 }
-#else
-__global__ void routingIndicesWarpKernel(KernelParams params)
-{
-    assert(false && "routingIndicesWarpKernel is only supported on SM90+ architectures");
-}
-#endif
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////

 template <typename KernelParams>
@@ -2076,7 +2065,6 @@ __global__ void routingIndicesClusterKernel(KernelParams params)

 // this kernel is needed in case we have scores as input for the histogram kernel
 template <typename KernelParams>
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
 __global__ void __launch_bounds__(NumThreadsHist) routingIndicesHistogramScoresKernel(KernelParams params)
 {
     using TypeExpW = typename KernelParams::TypeExpW;
@@ -2094,12 +2082,14 @@ __global__ void __launch_bounds__(NumThreadsHist) routingIndicesHistogramScoresK
     auto block = cg::this_thread_block();
     auto warp = cg::tiled_partition<WarpSize>(block);

+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
     // Wait on primary grid and trigger secondary kernel.
     if constexpr (KernelParams::UsePdl)
     {
         cudaGridDependencySynchronize();
         cudaTriggerProgrammaticLaunchCompletion();
     }
+#endif

     // in this case, each warp represents a token, and we use a grid-stride loop
     // over all warps/tokens
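The comment above describes the mapping used by the score-input path: one warp per token, with a grid-stride loop so a fixed-size grid covers any token count. A minimal sketch of that loop structure (the kernel and loop body are illustrative; `WarpSize`, `NumThreadsHist`, and `params.mNumTokens` come from the surrounding code):

```cuda
// Minimal sketch of the warp-per-token, grid-stride mapping described above.
template <typename KernelParams>
__global__ void __launch_bounds__(NumThreadsHist) exampleWarpPerTokenKernel(KernelParams params)
{
    int32_t const warpsPerBlock = blockDim.x / WarpSize;
    int32_t const globalWarpIdx = blockIdx.x * warpsPerBlock + threadIdx.x / WarpSize;
    int32_t const numWarpsInGrid = gridDim.x * warpsPerBlock;

    // grid-stride loop: this warp handles tokens globalWarpIdx, globalWarpIdx + numWarpsInGrid, ...
    for (int32_t tokenIdx = globalWarpIdx; tokenIdx < params.mNumTokens; tokenIdx += numWarpsInGrid)
    {
        // the lanes of this warp cooperate on token `tokenIdx`, e.g. each lane
        // processes a strided subset of that token's expert scores
    }
}
```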
@@ -2132,12 +2122,6 @@ __global__ void __launch_bounds__(NumThreadsHist) routingIndicesHistogramScoresK
         }
     }
 }
-#else
-__global__ void routingIndicesHistogramScoresKernel(KernelParams params)
-{
-    assert(false && "routingIndicesHistogramScoresKernel is only supported on SM90+ architectures");
-}
-#endif

 // Two-step approach (if number of tokens exceed limits of what cluster / cooperative launch
 // variants can handle): in order to minimize the amount of data to exchange through global memory,
@@ -2148,7 +2132,6 @@ __global__ void routingIndicesHistogramScoresKernel(KernelParams params)
 // Note: the histogram calculation could also be fused with routingMainKernel, but this might be
 // inefficient if we have one CTA per token doing a single global atomic.
 template <typename KernelParams>
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
 __global__ void __launch_bounds__(NumThreadsHist) routingIndicesHistogramKernel(KernelParams params)
 {
     using TypeExpW = typename KernelParams::TypeExpW;
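The two-step approach introduced in the comment above boils down to: pass one builds a per-expert histogram with atomics, pass two turns that histogram into offsets via an exclusive scan, so only the small count/offset arrays travel through global memory rather than the full permutation. A deliberately simplified sketch of that data flow (standalone hypothetical kernels with a single-thread scan, not the tiled implementation in this file):

```cuda
// Step 1: per-expert histogram over the expanded (token, expert) assignments.
__global__ void exampleHistogramStep(int32_t const* expertIdx, int32_t expandedIdxSize, int32_t* expertCounts)
{
    for (int32_t i = blockIdx.x * blockDim.x + threadIdx.x; i < expandedIdxSize; i += gridDim.x * blockDim.x)
    {
        atomicAdd(&expertCounts[expertIdx[i]], 1);
    }
}

// Step 2: exclusive scan of the histogram to get per-expert offsets
// (single thread here purely to show the data flow, not for performance).
__global__ void exampleOffsetsStep(int32_t const* expertCounts, int32_t numExperts, int32_t* expertOffsets)
{
    if (blockIdx.x == 0 && threadIdx.x == 0)
    {
        int32_t acc = 0;
        for (int32_t e = 0; e < numExperts; ++e)
        {
            expertOffsets[e] = acc;
            acc += expertCounts[e];
        }
    }
}
```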
@@ -2166,12 +2149,14 @@ __global__ void __launch_bounds__(NumThreadsHist) routingIndicesHistogramKernel(
     }
     __syncthreads();

+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
     // Wait on primary grid and trigger secondary kernel.
     if constexpr (KernelParams::UsePdl)
     {
         cudaGridDependencySynchronize();
         cudaTriggerProgrammaticLaunchCompletion();
     }
+#endif

     uint32_t const expandedIdxSize = params.mNumTokens * NumTopExperts;
     uint32_t const localExpertExtent = params.mNumLocalExperts << params.mLocalExpertsStrideLog2;
@@ -2234,17 +2219,10 @@ __global__ void __launch_bounds__(NumThreadsHist) routingIndicesHistogramKernel(
         atomicAdd(&params.mPtrExpertCounts[threadIdx.x], localExpertCount);
     }
 }
-#else
-__global__ void routingIndicesHistogramKernel(KernelParams params)
-{
-    assert(false && "routingIndicesHistogramKernel is only supported on SM90+ architectures");
-}
-#endif

 ////////////////////////////////////////////////////////////////////////////////////////////////////

 template <typename KernelParams>
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
 __global__ void __launch_bounds__(NumThreadsHist) routingIndicesOffsetsKernel(KernelParams params)
 {
     using TypeExpW = typename KernelParams::TypeExpW;
@@ -2264,11 +2242,13 @@ __global__ void __launch_bounds__(NumThreadsHist) routingIndicesOffsetsKernel(Ke
     uint32_t const expandedIdxSize = params.mNumTokens * NumTopExperts;
     uint32_t const numTiles = (expandedIdxSize + MaxExpandedIdxPerBlock - 1) / (MaxExpandedIdxPerBlock);

+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
     // Wait on primary grid.
     if constexpr (KernelParams::UsePdl)
     {
         cudaGridDependencySynchronize();
     }
+#endif

     // The expert offsets are common to all tiles of all blocks.
     // Load the histogram, scan it and write offsets to shared memory.
@@ -2484,6 +2464,7 @@ __global__ void __launch_bounds__(NumThreadsHist) routingIndicesOffsetsKernel(Ke
         }
     }

+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
     // Trigger secondary kernel.
     // Note: this does not guarantee the visibility of prior writes unless the consumer executes a
     // dependency sync.
@@ -2493,13 +2474,8 @@ __global__ void __launch_bounds__(NumThreadsHist) routingIndicesOffsetsKernel(Ke
         cudaTriggerProgrammaticLaunchCompletion();
     }
 #endif
-}
-#else
-__global__ void routingIndicesOffsetsKernel(KernelParams params)
-{
-    assert(false && "routingIndicesOffsetsKernel is only supported on SM90+ architectures");
-}
 #endif
+}

 ////////////////////////////////////////////////////////////////////////////////////////////////////

@@ -2599,7 +2575,7 @@ void run(Data const& data, void* stream)

 ////////////////////////////////////////////////////////////////////////////////////////////////////

-namespace routingQwen3
+namespace routingRenormalize
 {

 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -3230,13 +3206,13 @@ __global__ void __launch_bounds__(NumThreadsHist) routingIndicesHistogramScoresK
     auto block = cg::this_thread_block();
     auto warp = cg::tiled_partition<WarpSize>(block);

+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
     // Wait on primary grid.
     if constexpr (KernelParams::UsePdl)
     {
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
         cudaGridDependencySynchronize();
-#endif // if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
     }
+#endif // if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))

     // initialize the mPtrPermutedIdxToTokenIdx
     int32_t globalThreadIdx = globalWarpIdx * WarpSize + laneIdx;
@@ -3261,13 +3237,13 @@ __global__ void __launch_bounds__(NumThreadsHist) routingIndicesHistogramScoresK
         }
     }

+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
     // Trigger secondary kernel.
     if constexpr (KernelParams::UsePdl)
     {
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
         cudaTriggerProgrammaticLaunchCompletion();
-#endif // if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
     }
+#endif // if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))

     // in this case, each warp represents a token, and we use a grid-stride loop
     // over all warps/tokens
@@ -3360,14 +3336,14 @@ __global__ void __launch_bounds__(NumThreadsHist) routingIndicesHistogramKernel(
     }
     __syncthreads();

+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
     // Wait on primary grid and trigger secondary kernel.
     if constexpr (KernelParams::UsePdl)
     {
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
         cudaGridDependencySynchronize();
         cudaTriggerProgrammaticLaunchCompletion();
-#endif // if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
     }
+#endif // if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))

     uint32_t const expandedIdxSize = params.mNumTokens * NumTopExperts;
     uint32_t const localExpertExtent = params.mNumLocalExperts << params.mLocalExpertsStrideLog2;
@@ -3454,13 +3430,13 @@ __global__ void __launch_bounds__(NumThreadsHist) routingIndicesOffsetsKernel(Ke
     uint32_t const expandedIdxSize = params.mNumTokens * NumTopExperts;
     uint32_t const numTiles = (expandedIdxSize + MaxExpandedIdxPerBlock - 1) / (MaxExpandedIdxPerBlock);

+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
     // Wait on primary grid.
     if constexpr (KernelParams::UsePdl)
     {
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
         cudaGridDependencySynchronize();
-#endif // if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
     }
+#endif // if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))

     // The expert offsets are common to all tiles of all blocks.
     // Load the histogram, scan it and write offsets to shared memory.
@@ -3676,17 +3652,17 @@ __global__ void __launch_bounds__(NumThreadsHist) routingIndicesOffsetsKernel(Ke
         }
     }

+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
     // Trigger secondary kernel.
     // Note: this does not guarantee the visibility of prior writes unless the consumer executes a
     // dependency sync.
 #if !defined(PDL_PROFILE) || PDL_PROFILE == 0
     if constexpr (KernelParams::UsePdl)
     {
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
         cudaTriggerProgrammaticLaunchCompletion();
-#endif // if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
     }
 #endif
+#endif // if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
 }

 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -3756,7 +3732,7 @@ void run(Data const& data, void* stream)

 ////////////////////////////////////////////////////////////////////////////////////////////////////

-} // namespace routingQwen3
+} // namespace routingRenormalize

 ////////////////////////////////////////////////////////////////////////////////////////////////////
